def main(keyword_list: str): print("Searching GreenTechMedia......") # keywords = keyword.split() driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver') try: for l in link_list: driver.get(l) process_blocks(keyword_list, driver) for page in result_list: driver.get(page.link) # print(page.link) time.sleep(2) try: content = driver.find_element_by_xpath( '//*[@id="content"]/div/div[1]/article/div/div[1]') page.body = content.find_element_by_tag_name('p').text.strip() # print(page.body) except selenium.common.exceptions.NoSuchElementException: page.body = driver.find_element_by_xpath( '//*[@id="wrapper"]/div[1]/div/div/section/div[2]/div/p' ).text.strip() if len(result_list) != 0: share.write_file(result_list) print("Finished") finally: driver.close()
import time

from selenium import webdriver

import share


def main(keyword_list: list):
    while True:
        print('Searching Wusuobuneng......')
        driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
        driver.get('http://www.wusuobuneng.com/')
        time.sleep(7)
        try:
            button_xpath = ('//*[@id="root"]/div/div[3]/div/div[3]'
                            '/ul/li[1]/div[2]/button')
            # Keep clicking "load more" until the oldest article on the
            # page is older than the cutoff date.
            while True:
                button = driver.find_element_by_xpath(button_xpath)
                button.click()
                time.sleep(5)
                l = driver.find_elements_by_class_name('article-item')
                last_time = l[-1].find_element_by_class_name(
                    'article-item-time-icon').text
                m, d = get_date(last_time)
                if not share.compare_date(None, m, d):
                    break
                time.sleep(2)
            article_list = l
            process_articles(article_list, keyword_list)
            get_all_body(driver)
            if len(result_list) != 0:
                share.write_file(result_list)
            print('Finished')
        except Exception:
            # Any scraping error (e.g. a missing "load more" button):
            # fall through and retry the whole page from scratch.
            pass
        else:
            break
        finally:
            driver.close()
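
# `get_date`, `process_articles`, and `get_all_body` are this module's
# own helpers and are not shown. A minimal sketch of `get_date`,
# assuming the site stamps articles like "08-15 12:30" (the real format
# may differ):

def get_date(time_text: str):
    """Extract (month, day) strings from an article timestamp."""
    date_part = time_text.split()[0]  # e.g. "08-15"
    m, d = date_part.split('-')
    return m, d
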
import time

import selenium.common.exceptions
from selenium import webdriver

import share


def main(keyword_list: list):
    print('Searching Cleantechnica......')
    result_list = []
    driver = None
    try:
        for l in link_list:
            # A fresh driver per listing page avoids stale sessions.
            driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
            driver.get(l)
            time.sleep(.1)
            driver.refresh()
            main_part = driver.find_elements_by_class_name('omc-blog-one')
            for article in main_part:
                # Byline looks like "Published on <date> | by <author>".
                date = article.find_element_by_class_name(
                    'omc-date-time-one').text.split('|')[0][13:-1].split()
                y, m, d = date[-1], date[0], date[1][:-3]
                title = article.find_element_by_tag_name('h2').text
                link = article.find_elements_by_tag_name('a')[1].get_attribute(
                    'href')
                if share.double_check(keyword_list, title) and share.compare_date(y, m, d):
                    result_list.append(share.Page_info(link, title, None))
            driver.close()
        driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
        for page in result_list:
            driver.get(page.link)
            time.sleep(1)
            # The first paragraphs are often bylines or ads; walk down
            # until one looks like real body text.
            for index in (2, 3, 4):
                page.body = driver.find_element_by_xpath(
                    '//*[@id="omc-full-article"]/p[%d]' % index).text
                if len(page.body) > 40:
                    break
        if len(result_list) != 0:
            share.write_file(result_list)
        print('Finished')
    finally:
        try:
            if driver is not None:
                driver.close()
        except selenium.common.exceptions.WebDriverException:
            pass
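
# `link_list` (used here and in the GreenTechMedia scraper) is a
# module-level list of listing-page URLs defined elsewhere; something
# like the following, with hypothetical category URLs:

link_list = [
    'https://cleantechnica.com/category/clean-power/',
    'https://cleantechnica.com/category/clean-transport/',
]
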
def main(keyword_list: str): print("Searching CleanTech......") driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver') try: driver.get("https://www.cleantech.com/category/cleantech-news/") main_xpath = '//*[@id="main"]/section[2]/div' blocks = driver.find_elements_by_tag_name("article") for article in blocks: whole_text = article.text.rstrip().lstrip() sentence_list = whole_text.split('\n') copy_list = list(sentence_list) for item in sentence_list: strip = item.rstrip().lstrip() if strip == '': copy_list.remove(item) final_list = [sentence.rstrip().lstrip() for sentence in copy_list] title = final_list[0].encode('utf8').decode('utf-8', 'strict') date = final_list[1][10:] if process(date): if share.double_check(keyword_list, title): description = final_list[2] post_id = article.get_attribute('id') xpath = generate_xpath(post_id) link_element = driver.find_element_by_xpath(xpath) link = link_element.get_attribute('href') result_list.append( share.Page_info(link, title, description)) else: continue else: continue if len(result_list) != 0: share.write_file(result_list) print("Finished") finally: driver.close()
def main(keyword_list: list): print("Searching Azocleantech......") driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver') driver.get('https://www.azocleantech.com/news-index.aspx') try: main_part = driver.find_element_by_xpath( '//*[@id="ctl00_cphBody_latestNewsItems_posts"]') full_list = main_part.find_elements_by_class_name('row') striped = [a.text.strip() for a in full_list] length = int(len(striped) / 2) for i in range(length): title_and_despt = striped[2 * i].split('\n') title = title_and_despt[0].strip() date = striped[2 * i + 1].split('\n')[-1].strip().split() y, m, d = date[2], date[1], date[0] if share.compare_date(y, m, d) and share.double_check( keyword_list, title): link, body = get_link(i, driver) result_list.append(share.Page_info(link, title, body)) if len(result_list) != 0: share.write_file(result_list) print("Finished") finally: driver.close()
def main(keyword_list): print("Searching NengApp......") driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver') index = 0 finish = False try: while True: url = generate_url(index) driver.get(url) article_list = driver.find_elements_by_class_name('news-item') for article in article_list: info = article.find_element_by_class_name('news-info') if compare_date(info.text) == False: finish = True break content = article.find_element_by_class_name('news-content') title = content.find_element_by_tag_name('a').text if share.double_check(keyword_list, title) and not check_exist(title): link = article.find_element_by_tag_name('a').get_attribute( 'href') result_list.append(share.Page_info(link, title, None)) if finish: break index += 1 for page in result_list: driver.get(page.link) paragraphs = driver.find_elements_by_tag_name('p') for p in paragraphs: if len(p.text) >= 15: page.body = p.text break share.write_file(result_list) print('Finished') finally: driver.close()