def main(keyword_list: list):
    """Search wusuobuneng.com for recent articles matching *keyword_list*.

    Clicks the "load more" button until the oldest visible article falls
    outside the accepted date window, then hands the collected article
    elements to ``process_articles`` / ``get_all_body`` and writes any
    matches via ``share.write_file``.  On a scraping error the whole
    attempt is retried with a fresh browser; the browser is always closed.

    NOTE(review): annotation changed from ``str`` to ``list`` to match the
    sibling ``main(keyword_list: list)`` scrapers in this file.
    """
    while True:
        print('Searching Wusuobuneng......')
        driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
        driver.get('http://www.wusuobuneng.com/')
        timer.sleep(7)  # give the single-page app time to render
        try:
            button_xpath = ('//*[@id="root"]/div/div[3]/div/div[3]'
                            '/ul/li[1]/div[2]/button')
            while True:
                # Keep expanding the list until the last article shown
                # is older than the cutoff date.
                driver.find_element_by_xpath(button_xpath).click()
                timer.sleep(5)
                articles = driver.find_elements_by_class_name('article-item')
                last_time = articles[-1].find_element_by_class_name(
                    'article-item-time-icon').text
                m, d = get_date(last_time)
                if not share.compare_date(None, m, d):
                    break
                timer.sleep(2)
            process_articles(articles, keyword_list)
            get_all_body(driver)
            if result_list:
                share.write_file(result_list)
            print('Finished')
        except Exception:
            # Was a bare ``except: pass`` — that also swallowed
            # KeyboardInterrupt/SystemExit, making the retry loop
            # un-interruptible.  Report the failure, then retry.
            print('Wusuobuneng scrape failed, retrying...')
        else:
            break
        finally:
            driver.close()
def add_to_result(block, title, year, month, day, driver, keyword_list):
    """Record the article in *block* when it matches the keywords, is recent
    enough, and has not been seen before.

    ``driver`` is unused here but kept for interface parity with the other
    block handlers in this file.
    """
    if not share.double_check(keyword_list, title):
        return
    if not share.compare_date(year, month, day):
        return
    if check_exist(title):
        return
    wrapper = block.find_element_by_class_name('article-wrapper')
    href = wrapper.find_element_by_tag_name('a').get_attribute('href')
    result_list.append(share.Page_info(href, title, None))
def list_article(block, keyword_list, driver):
    """Record a list-style article block (date formatted ``M.D.Y``) when it
    matches the keywords, is recent enough, and is not a duplicate."""
    title = block.find_element_by_tag_name('h2').text
    parts = block.find_element_by_tag_name('time').text.split('.')
    year, month, day = parts[2], parts[0], parts[1]
    if not share.double_check(keyword_list, title):
        return
    if not share.compare_date(year, month, day):
        return
    if check_exist(title):
        return
    href = block.find_element_by_tag_name('a').get_attribute('href')
    result_list.append(share.Page_info(href, title, None))
def compare_date(text: str) -> bool:
    '''Return True if date is within 14 days, else False'''
    # The date is always the last whitespace-separated token.
    token = text.split()[-1]
    # Relative-time markers ("yesterday", "minutes ago", ...) are always recent.
    relative_markers = ('昨天', '前天', '分钟', '刚刚', '小时', '今天')
    if any(marker in token for marker in relative_markers):
        return True
    # Otherwise the token is an absolute Y-M-D date; delegate to share.
    parts = token.split('-')
    return share.compare_date(parts[0], parts[1], parts[2])
def same_h(block, keyword_list, driver):
    """Record an article block whose link lives inside its ``<h2>`` heading.

    Handles two date layouts: dotted ``M.D.Y`` and space-separated
    ``M D, Y`` (trailing comma stripped from the day).
    """
    title = block.find_element_by_tag_name('h2').text
    raw_date = block.find_element_by_tag_name('time').text
    if '.' in raw_date:
        parts = raw_date.split('.')
        y, m, d = parts[2], parts[0], parts[1]
    else:
        parts = raw_date.split()
        y, m, d = parts[2], parts[0], parts[1][:-1]
    wanted = (share.double_check(keyword_list, title)
              and share.compare_date(y, m, d)
              and not check_exist(title))
    if wanted:
        heading = block.find_element_by_tag_name('h2')
        href = heading.find_element_by_tag_name('a').get_attribute('href')
        result_list.append(share.Page_info(href, title, None))
def squared_label(block, keyword_list, driver):
    """Record a 'squared label' article block, fetching its first body
    paragraph by navigating *driver* to the article page and back."""
    lines = [
        segment.strip() for segment in block.text.split('\n')
        if segment.strip() != ''
    ]
    title = lines[1]
    date_tokens = lines[-1].split('|')[-1].strip().split()
    y, m, d = date_tokens[2], date_tokens[0], date_tokens[1][:-1]
    wanted = (share.double_check(keyword_list, title)
              and share.compare_date(y, m, d)
              and not check_exist(title))
    if not wanted:
        return
    wrapper = block.find_element_by_class_name('article-wrapper')
    link = wrapper.find_element_by_tag_name('a').get_attribute('href')
    driver.get(link)
    paragraph = driver.find_element_by_xpath(
        '//*[@id="wrapper"]/div[1]/div/div/section/div[2]/div/p')
    result_list.append(share.Page_info(link, title, paragraph.text))
    driver.back()
def main(keyword_list: list):
    """Search Cleantechnica for recent articles matching *keyword_list*.

    Visits every URL in ``link_list`` with a fresh browser, collects
    matching articles, then revisits each match to extract its body text
    and writes the results via ``share.write_file``.  The browser is
    always closed on exit.
    """
    print('Searching Cleantechnica......')
    result_list = []
    # Fix: if the first webdriver.Chrome() raised (or link_list handling
    # failed before assignment), the finally clause hit a NameError on
    # ``driver`` — initialise it so cleanup is always safe.
    driver = None
    try:
        for url in link_list:
            driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
            driver.get(url)
            time.sleep(.1)
            driver.refresh()
            for article in driver.find_elements_by_class_name('omc-blog-one'):
                # Date text looks like "Published on <...> | ..."; the
                # slice strips the fixed prefix and trailing character.
                date = article.find_element_by_class_name(
                    'omc-date-time-one').text.split('|')[0][13:-1].split()
                y, m, d = date[-1], date[0], date[1][:-3]
                title = article.find_element_by_tag_name('h2').text
                link = article.find_elements_by_tag_name('a')[1].get_attribute(
                    'href')
                if share.double_check(keyword_list,
                                      title) and share.compare_date(y, m, d):
                    result_list.append(share.Page_info(link, title, None))
            driver.close()
        driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
        for page in result_list:
            driver.get(page.link)
            time.sleep(1)
            # Try successive paragraphs until one looks like real body
            # text (more than 40 chars); the last candidate is kept
            # regardless, matching the original fallback chain.
            for idx in (2, 3, 4):
                page.body = driver.find_element_by_xpath(
                    '//*[@id="omc-full-article"]/p[%d]' % idx).text
                if len(page.body) > 40:
                    break
        if result_list:
            share.write_file(result_list)
        print('Finished')
    finally:
        if driver is not None:
            try:
                driver.close()
            except selenium.common.exceptions.WebDriverException:
                pass
def main(keyword_list: list):
    """Search Azocleantech's news index for recent articles matching
    *keyword_list* and write any matches via ``share.write_file``."""
    print("Searching Azocleantech......")
    driver = webdriver.Chrome(share.FILE_PATH + '/chromedriver')
    driver.get('https://www.azocleantech.com/news-index.aspx')
    try:
        container = driver.find_element_by_xpath(
            '//*[@id="ctl00_cphBody_latestNewsItems_posts"]')
        rows = [
            row.text.strip()
            for row in container.find_elements_by_class_name('row')
        ]
        # Rows alternate: even index holds title + description,
        # odd index holds the date line.
        for i in range(len(rows) // 2):
            title = rows[2 * i].split('\n')[0].strip()
            date_tokens = rows[2 * i + 1].split('\n')[-1].strip().split()
            y, m, d = date_tokens[2], date_tokens[1], date_tokens[0]
            if share.compare_date(y, m, d) and share.double_check(
                    keyword_list, title):
                link, body = get_link(i, driver)
                result_list.append(share.Page_info(link, title, body))
        if result_list:
            share.write_file(result_list)
        print("Finished")
    finally:
        driver.close()
def process(date: str) -> bool:
    """Return True if the date embedded in *date* is recent enough.

    Expects text beginning like ``"March 5, 2021 ..."`` and delegates the
    age check to ``share.compare_date``.

    Fix: the original truncated at ``date.find('20') + 4``, which matched
    the day-of-month for the 20th (e.g. "March 20, 2021" was cut at
    "March 20, " and parsed garbage) or any earlier "20" in the text.
    Locate the 4-digit year explicitly instead.
    """
    import re
    match = re.search(r'20\d{2}', date)
    # Fall back to the old heuristic only if no 4-digit year is present.
    end = match.end() if match else date.find('20') + 4
    real_date = date[:end].split()
    # real_date is ["<month>", "<day>,", "<year>"]; strip the day's comma.
    y, m, d = real_date[-1], real_date[0], real_date[1][:-1]
    return share.compare_date(y, m, d)