# Shared imports for the site scrapers collected below. Each scraper module
# keeps a module-level `result_list` that its block handlers append to.
import time

import selenium
from selenium import webdriver

import share

result_list = []


def add_to_result(block, title, year, month, day, driver, keyword_list):
    if share.double_check(keyword_list, title) and share.compare_date(
            year, month, day) and not check_exist(title):
        b = block.find_element_by_class_name('article-wrapper')
        link = b.find_element_by_tag_name('a').get_attribute('href')
        # body = find_body(link, driver)
        result_list.append(share.Page_info(link, title, None))
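# `check_exist` is called by the handlers above and below but is not defined
# in this section. A minimal sketch, assuming it deduplicates by title against
# the already-collected `result_list` (the `title` attribute on Page_info is
# inferred from the `page.link` / `page.body` access seen elsewhere):
def check_exist(title):
    """Return True if an article with this title has already been collected."""
    return any(page.title == title for page in result_list)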
def list_article(block, keyword_list, driver):
    title = block.find_element_by_tag_name('h2').text
    # Dates render as 'MM.DD.YYYY' in this layout.
    date = block.find_element_by_tag_name('time').text.split('.')
    y, m, d = date[2], date[0], date[1]
    # print(title, date)
    if share.double_check(keyword_list, title) and share.compare_date(
            y, m, d) and not check_exist(title):
        link = block.find_element_by_tag_name('a').get_attribute('href')
        # body = find_body(link, driver)
        result_list.append(share.Page_info(link, title, None))
def feature_box(sublist, keyword_list, driver, count):
    title = sublist[2]
    if share.double_check(keyword_list, title):
        # The two feature boxes differ only in the article index; building the
        # XPath from `count` also avoids leaving `xpath` unbound when count is
        # neither 1 nor 2.
        xpath = ('//*[@id="content"]/div/div[1]/div[2]/div[2]/div'
                 '/article[%d]/div[3]/h2/a' % count)
        link_element = driver.find_element_by_xpath(xpath)
        link = link_element.get_attribute("href")
        # body = find_body(link, driver)
        result_list.append(share.Page_info(link, title, None))
def featured_head(sublist, keyword_list, driver):
    title = sublist[3]
    if share.double_check(keyword_list, title):
        link_element = driver.find_element_by_xpath(
            '//*[@id="content"]/div/div[1]/div[2]/div[1]/article/div/div/h1/a')
        link = link_element.get_attribute("href")
        # Open the article to grab its lead paragraph, then return to the listing.
        driver.get(link)
        body_element = driver.find_element_by_xpath(
            '//*[@id="content"]/div/div[1]/article/div/div[1]/p[1]')
        body = body_element.text.strip()
        driver.back()
        result_list.append(share.Page_info(link, title, body))
def same_h(block, keyword_list, driver):
    title = block.find_element_by_tag_name('h2').text
    date = block.find_element_by_tag_name('time').text
    if len(date.split('.')) == 1:
        # 'Month DD, YYYY' style: drop the comma after the day.
        date = date.split()
        y, m, d = date[2], date[0], date[1][:-1]
    else:
        # 'MM.DD.YYYY' style.
        date = date.split('.')
        y, m, d = date[2], date[0], date[1]
    if share.double_check(keyword_list, title) and share.compare_date(
            y, m, d) and not check_exist(title):
        b = block.find_element_by_tag_name('h2')
        link = b.find_element_by_tag_name('a').get_attribute('href')
        # body = find_body(link, driver)
        result_list.append(share.Page_info(link, title, None))
def squared_label(block, keyword_list, driver):
    text_list = [
        text.strip() for text in block.text.split('\n') if text.strip() != ''
    ]
    title = text_list[1]
    # The last line looks like '... | Month DD, YYYY'; drop the comma after the day.
    date = text_list[-1].split('|')[-1].strip().split()
    y, m, d = date[2], date[0], date[1][:-1]
    if share.double_check(keyword_list, title) and share.compare_date(
            y, m, d) and not check_exist(title):
        b = block.find_element_by_class_name('article-wrapper')
        link = b.find_element_by_tag_name('a').get_attribute('href')
        xpath = '//*[@id="wrapper"]/div[1]/div/div/section/div[2]/div/p'
        driver.get(link)
        para = driver.find_element_by_xpath(xpath)
        body = para.text
        result_list.append(share.Page_info(link, title, body))
        driver.back()
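# The handlers above each cover one block layout on a listing page. How they
# are wired together is not shown in this section; a hypothetical dispatcher
# might look like the sketch below (the CSS class names used to tell layouts
# apart are illustrative, not taken from the site):
def dispatch_blocks(blocks, keyword_list, driver):
    for block in blocks:
        css = block.get_attribute('class') or ''
        if 'squared-label' in css:
            squared_label(block, keyword_list, driver)
        elif 'same-h' in css:
            same_h(block, keyword_list, driver)
        else:
            list_article(block, keyword_list, driver)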
def main(keyword_list: list):
    print('Searching Cleantechnica......')
    result_list = []
    try:
        for l in link_list:
            driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
            # print(l)
            driver.get(l)
            time.sleep(.1)
            driver.refresh()
            main_part = driver.find_elements_by_class_name('omc-blog-one')
            for article in main_part:
                # Strip the label around the date; the slice offsets are
                # specific to this site's 'omc-date-time-one' markup.
                date = article.find_element_by_class_name(
                    'omc-date-time-one').text.split('|')[0][13:-1].split()
                y, m, d = date[-1], date[0], date[1][:-3]
                title = article.find_element_by_tag_name('h2').text
                link = article.find_elements_by_tag_name('a')[1].get_attribute(
                    'href')
                if share.double_check(keyword_list,
                                      title) and share.compare_date(y, m, d):
                    result_list.append(share.Page_info(link, title, None))
            driver.close()
        # Revisit each collected article and take the first paragraph that
        # looks substantial (more than 40 characters) as the body.
        driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
        for page in result_list:
            driver.get(page.link)
            time.sleep(1)
            page.body = driver.find_element_by_xpath(
                '//*[@id="omc-full-article"]/p[2]').text
            if len(page.body) <= 40:
                page.body = driver.find_element_by_xpath(
                    '//*[@id="omc-full-article"]/p[3]').text
            if len(page.body) <= 40:
                page.body = driver.find_element_by_xpath(
                    '//*[@id="omc-full-article"]/p[4]').text
        if len(result_list) != 0:
            share.write_file(result_list)
        print('Finished')
    finally:
        try:
            driver.close()
        except selenium.common.exceptions.WebDriverException:
            pass
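# `link_list` above is defined elsewhere in the module. A plausible sketch,
# assuming it enumerates the first few listing pages (the URL pattern is an
# assumption, not confirmed by this code):
link_list = ['https://cleantechnica.com/page/%d/' % n for n in range(1, 4)]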
def main(keyword_list: list):
    print("Searching CleanTech......")
    driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
    try:
        driver.get("https://www.cleantech.com/category/cleantech-news/")
        blocks = driver.find_elements_by_tag_name("article")
        for article in blocks:
            # Split the block's text into non-empty, stripped lines:
            # title, date line, then the description.
            final_list = [
                line.strip() for line in article.text.split('\n')
                if line.strip() != ''
            ]
            title = final_list[0]
            # Skip the fixed-width label preceding the date.
            date = final_list[1][10:]
            if process(date) and share.double_check(keyword_list, title):
                description = final_list[2]
                post_id = article.get_attribute('id')
                xpath = generate_xpath(post_id)
                link_element = driver.find_element_by_xpath(xpath)
                link = link_element.get_attribute('href')
                result_list.append(share.Page_info(link, title, description))
        if len(result_list) != 0:
            share.write_file(result_list)
        print("Finished")
    finally:
        driver.close()
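# `process` and `generate_xpath` are helpers defined elsewhere in this module.
# Minimal sketches consistent with their call sites; the 'Month DD, YYYY'
# date format and the XPath shape are assumptions:
def process(date: str) -> bool:
    """True if the article's date falls inside the search window."""
    m, d, y = date.replace(',', '').split()
    return share.compare_date(y, m, d)


def generate_xpath(post_id: str) -> str:
    """Build the XPath of the title link inside the article with this id."""
    return '//*[@id="%s"]//h2/a' % post_id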
def main(keyword_list: list):
    print("Searching Azocleantech......")
    driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
    try:
        driver.get('https://www.azocleantech.com/news-index.aspx')
        main_part = driver.find_element_by_xpath(
            '//*[@id="ctl00_cphBody_latestNewsItems_posts"]')
        full_list = main_part.find_elements_by_class_name('row')
        striped = [a.text.strip() for a in full_list]
        # Rows come in pairs: a title/description row followed by a date row.
        for i in range(len(striped) // 2):
            title = striped[2 * i].split('\n')[0].strip()
            # The date line ends in 'DD Month YYYY'.
            date = striped[2 * i + 1].split('\n')[-1].strip().split()
            y, m, d = date[2], date[1], date[0]
            if share.compare_date(y, m, d) and share.double_check(
                    keyword_list, title):
                link, body = get_link(i, driver)
                result_list.append(share.Page_info(link, title, body))
        if len(result_list) != 0:
            share.write_file(result_list)
        print("Finished")
    finally:
        driver.close()
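# `get_link` is defined elsewhere. A sketch consistent with its call site,
# assuming the i-th headline link on the index page leads to the article and
# the first paragraph serves as the body (the selector is an assumption):
def get_link(i, driver):
    """Open the i-th listed article, return (url, lead paragraph), then go back."""
    anchors = driver.find_elements_by_css_selector(
        '#ctl00_cphBody_latestNewsItems_posts .row a')
    link = anchors[i].get_attribute('href')
    driver.get(link)
    body = driver.find_element_by_tag_name('p').text.strip()
    driver.back()
    return link, body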
def main(keyword_list):
    print("Searching NengApp......")
    driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
    index = 0
    finish = False
    try:
        while True:
            url = generate_url(index)
            driver.get(url)
            article_list = driver.find_elements_by_class_name('news-item')
            for article in article_list:
                info = article.find_element_by_class_name('news-info')
                if not compare_date(info.text):
                    # Listings are newest-first: once one article falls
                    # outside the date window, stop paging entirely.
                    finish = True
                    break
                content = article.find_element_by_class_name('news-content')
                title = content.find_element_by_tag_name('a').text
                if share.double_check(keyword_list,
                                      title) and not check_exist(title):
                    link = article.find_element_by_tag_name(
                        'a').get_attribute('href')
                    result_list.append(share.Page_info(link, title, None))
            if finish:
                break
            index += 1
        # Use the first reasonably long paragraph of each article as its body.
        for page in result_list:
            driver.get(page.link)
            paragraphs = driver.find_elements_by_tag_name('p')
            for p in paragraphs:
                if len(p.text) >= 15:
                    page.body = p.text
                    break
        share.write_file(result_list)
        print('Finished')
    finally:
        driver.close()
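# `generate_url` and the module-local `compare_date` are defined elsewhere.
# Sketches consistent with their call sites; the URL pattern and the date
# format inside the 'news-info' element are assumptions:
def generate_url(index: int) -> str:
    """Return the URL of the index-th news listing page."""
    return 'https://www.nengapp.com/news/index_%d.html' % index


def compare_date(info_text: str) -> bool:
    """Extract a 'YYYY-MM-DD' date from the info line and check the window."""
    y, m, d = info_text.split()[-1].split('-')
    return share.compare_date(y, m, d)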
def process_articles(art_list: list, keyword_list: list) -> None:
    for article in art_list:
        title = article.find_element_by_class_name('article-item-title').text
        if share.double_check(keyword_list, title):
            link = article.find_element_by_tag_name('a').get_attribute('href')
            result_list.append(share.Page_info(link, title, None))
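# Every scraper above leans on a common `share` module. A minimal sketch of
# the interface they assume (i.e. what share.py presumably provides); field
# names are inferred from attribute access such as `page.link` and
# `page.body`, and both implementations are illustrative:
class Page_info:
    def __init__(self, link, title, body):
        self.link = link
        self.title = title
        self.body = body


def double_check(keyword_list, title):
    """True if any search keyword occurs in the title, case-insensitively."""
    return any(k.lower() in title.lower() for k in keyword_list)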