def parse_file(self, url, page_file):
    try:
        self.children = set()
        parser = PageParser(self, url)
        parser.parse_links(page_file)
    finally:
        page_file.close()

def __init__(self, month, year, threshold):
    self.const_one_month_in_days = 30
    self.article_creation_threshold = threshold
    self.threshold_very_active = 100
    self.threshold_active = 5
    self.const_very_active = "Very Active"
    self.const_active = "Active"
    self.const_not_active = "Not Active"
    self.month = month
    self.year = year
    self.const_max_requests = 500
    self.url_userinfo = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=users"
    self.url_usercontb = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=usercontribs&"
    self.url_propages = "https://en.wikipedia.org/w/api.php?action=query&format=json&list=prefixsearch&"
    self.url_contributors = "https://en.wikipedia.org/w/api.php?action=query&format=json&prop=contributors&"
    self.debug = False
    self.list_editors_sorted = []
    self.editors_WIR = {}
    self.dict_editor_article = {}
    self.dict_editor_infoboxes = {}
    self.set_members = set()
    self.parser = PageParser()
    self.tabler = TableGenerator()

def test_follow_succeeded(self):
    driver = self.driver
    button_getter = ButtonGetter(driver)
    page_parser = PageParser(driver)
    follow_buttons = button_getter.get_follow_buttons()
    followed_title = follow_buttons[FOLLOWING_INDEX].get_attribute(
        'data-tumblelog-name')
    follow_buttons[FOLLOWING_INDEX].send_keys(Keys.RETURN)  # click
    self.driver.get(URL + "/following")
    assert followed_title in page_parser.get_following()
    button_getter.get_unfollow_button(followed_title).click()
    time.sleep(1)
    button_getter.get_ok_button().click()
    time.sleep(1)
    self.driver.refresh()
    assert followed_title not in page_parser.get_following()

def get_new_state(driver, old_state):
    parser = PageParser(driver)
    state_elements = []
    live_elements = parser.get_all_elements()
    # first look for the old elements that are still present
    for element in old_state.elements:
        webelement = WebElement(driver, element.locators)
        if webelement.is_present(1):
            webelement.highlight(color="green")
            if webelement in live_elements:
                state_elements.append(element)
                live_elements.remove(webelement)
    # look for any changed elements
    for element in live_elements:
        element.highlight(color="blue")
        new_element_state = element_builder.build_element(driver, element)
        if new_element_state is None:
            logging.error("No locators found for element %s" % element)
        else:
            state_elements.append(new_element_state)
    return State(elements=state_elements,
                 url=driver.current_url,
                 html=driver.html,
                 screenshot=driver.get_screenshot_as_base64())

def __init__(self, base_url):
    self._base_url = base_url
    self._page_parser = PageParser()
    try:
        self._mongo_wrapper = MongoWrapper()
    except DBConnectionError as ex:
        logging.error(f"Couldn't connect to DB: {str(ex)}")

def test_scroll_works(self):
    driver = self.driver
    action_handler = ActionHandler(driver)
    page_parser = PageParser(driver)
    for section_index in range(len(SECTIONS)):
        page_parser.get_dots()[section_index].click()
        time.sleep(3)
        action_handler.assert_active_section(SECTIONS, section_index)

def parse(self, url, url_id):
    print("parsing " + url + "...", file=sys.stderr)
    volume_id = self.get_volume_id(url_id)
    reader = self.open_page(url_id, volume_id)
    if reader:
        try:
            parser = PageParser(self, url)
            parser.parse_links(reader)
        finally:
            reader.close()

def test_url_read(self):
    """
    Tests three scenarios:
        a standard URL
        an invalid URL
        a URL pointing to another document type, e.g. a .jpg image
    :return:
    """
    url1 = 'localhost:8081/page1.html'
    parser = PageParser(url1)
    content1 = parser.url_read()
    self.assertEqual(content1.__contains__('page1_4.html'), True)
    self.assertEqual(content1.__contains__('page1_1.html'), True)

    # invalid url test
    url2 = 'localhost:8081/page7.html'
    parser = PageParser(url2)
    content2 = parser.url_read()
    self.assertEqual(content2, '', "return content should be empty")

    # unsupported url test
    url3 = 'localhost:8081/3/image.jpg'
    parser = PageParser(url3)
    content3 = parser.url_read()
    self.assertEqual(content3, '')
    self.assertLogs(logger='../logs/spider.log', level='error')

def get_urls(cls):
    while len(Test.urls) > 0:
        url = Test.get_url()
        try:
            Test.count += 1
            print(Test.count, url)
            analysis = PageParser(url)
            test = analysis.get_urls()
            Test.urls += test
        except:
            pass

def test_show_community_info(self):
    driver = self.driver
    presence_checker = PresenceChecker(driver)
    page_parser = PageParser(driver)
    assert not presence_checker.is_there_drawer_container()
    assert not presence_checker.is_there_glass_container()
    followers_links = page_parser.get_follower_links()
    followers_links[FOLLOWING_INDEX].send_keys(Keys.RETURN)  # click
    assert presence_checker.is_there_drawer_container()
    assert presence_checker.is_there_glass_container()

def parse(self, url_id, url, volume_id):
    print("parsing " + url + "...", file=sys.stderr)
    reader = self.open_page(url_id, volume_id)
    if reader:
        try:
            parser = PageParser(self, url)
            parser.parse_links(reader)
        finally:
            reader.close()
    self.cur.execute(
        """update field set parsed=localtimestamp where id=%s""",
        (url_id, ))

def dump():
    client = HttpClient()
    torrent_id = get_torrent_id()
    res = get_dump()
    new_records = []
    last_torrent_id = torrent_id
    direction = Direction.UP
    if direction == Direction.UP:
        increment = 1
    else:
        increment = -1
    i = 0
    failed = 0
    while run:
        last_torrent_id = last_torrent_id + increment
        print str(last_torrent_id)
        link = 'http://rutor.is/torrent/' + str(last_torrent_id)
        response = client.get_response(link)
        if not response.has_error:
            parser = PageParser(last_torrent_id, response.response_text)
            valid = parser.is_valid()
            if valid:
                failed = 0
                torrent_info = parser.parse()
                if torrent_info.category == u'Зарубежные фильмы' or torrent_info.category == u'Наши фильмы':
                    res.append(torrent_info)
                    new_records.append(torrent_info)
            else:
                print str(last_torrent_id) + ' is invalid'
                failed = failed + 1
                if failed == 10:
                    print 'end of torrent list reached'
                    last_torrent_id = last_torrent_id - 10 - 1
                    break
        i = i + 1
        time.sleep(4)
    dump = json.dumps(res, cls=MyEncoder, ensure_ascii=False)
    save_dump(dump)
    save_history(last_torrent_id + increment)
    save_to_db(new_records)
    print 'finished'

def test_dismiss_succeeded(self):
    driver = self.driver
    button_getter = ButtonGetter(driver)
    page_parser = PageParser(driver)
    dismiss_buttons = button_getter.get_dismiss_buttons()
    dismiss_titles = page_parser.get_dismiss_titles()
    dismiss_buttons[FOLLOWING_INDEX].click()
    time.sleep(1)
    assert dismiss_titles[
        FOLLOWING_INDEX].text not in page_parser.get_dismiss_titles()

def gen_docs():
    page_list = []
    with open(config.DATA_DIR + 'page_list.txt') as fin:
        for line in fin:
            page_list.append(line.rstrip())
    template_name = config.TEMPLATE_DIR + 'doutula.template'
    template_parser = TemplateParser(template_name)
    page_parser = PageParser(template_parser.xpath_list)
    for page_url in page_list[1104:]:
        info_list = page_parser.parse(page_url)
        if len(info_list) > 0:
            for docinfo in info_list:
                print docinfo
        else:
            print 'page parse fail.'

def test_login_failed_with_wrong_email(self):
    driver = self.driver
    action_handler = ActionHandler(driver)
    page_parser = PageParser(driver)
    assert not page_parser.get_error_message().is_displayed()
    assert "Tumblr" in driver.title
    action_handler.click_login_button()
    action_handler.type_and_confirm_email(EMAIL + EMAIL_WRONG_APPENDIX)
    time.sleep(2)
    assert not page_parser.get_password_input_field().is_displayed()
    assert page_parser.get_error_message().is_displayed()

def get_urls(url):
    global urls, counter
    try:
        # Optional: write each crawled URL to a file, at the cost of extra I/O
        # with open('url_list.txt', 'a') as test:
        #     test.write(url + '\n')
        data.delete(url)
        print(url)
        analysis = PageParser(url)
        for i in analysis.get_urls():
            if data.check(i):
                data.delete(i)
            else:
                data.insert(i)
    except:
        pass

def __init__(self, output_tar, error_tar, save):
    self.save = save
    self.output_tar = output_tar
    self.error_tar = error_tar
    self.parser = PageParser()
    self.lock = threading.Lock()
    self.items = {}

def get_state(driver):
    parser = PageParser(driver)
    locator_elements = []
    elements = parser.get_usual_elements()[:50]
    print "Found %s elements " % len(elements)
    for element in elements:
        new_element = element_builder.build_element(driver, element)
        if new_element is not None:
            locator_elements.append(new_element)
    screenshot = driver.get_screenshot_as_base64()
    state = State(elements=locator_elements,
                  url=driver.current_url,
                  html=driver.html,
                  screenshot=screenshot)
    return state

def crawl(self, url_q):
    """
    The spider's crawl logic: download and parse the URL via the page
    retriever, collect the extracted child URLs, de-duplicate them, and
    add them to the queue.
    :param url_q: (url, depth) tuple; the URL to parse is an absolute address
    :return:
    """
    if not isinstance(url_q, tuple):
        print("Type error")
        return
    if CrawlerThreadPool.interval_links_cnt > \
            ConfReader.instance().get_max_links_count():
        interval = ConfReader.instance().get_crawl_interval()
        if interval == 0:
            interval = 60 * 5  # default every 5 minutes
        logger.info("Thread %s begins to sleep, will continue in %d s"
                    % (threading.currentThread().getName(), interval))
        print("Waiting for %d seconds ..." % interval)
        sleep(interval)
        # reset the counter
        self._lock.acquire()
        CrawlerThreadPool.interval_links_cnt = 0
        self._lock.release()
    else:
        pass
    (url, depth) = url_q
    if depth > ConfReader.instance().get_max_depth():
        print("Depth exceeded. The max depth is {}".format(depth - 1))
        return
    page_parser = PageParser(url)
    links = page_parser.parse()
    new_links = links.difference(CrawlerThreadPool.seen_urls)
    for new_link in new_links:
        self._q.put((new_link, depth + 1))
    # update link statistics
    self._lock.acquire()
    CrawlerThreadPool.total_links += len(new_links)
    CrawlerThreadPool.interval_links_cnt += len(new_links)
    print("Spider has crawled {} links.".format(
        CrawlerThreadPool.total_links))
    CrawlerThreadPool.seen_urls.update(new_links)
    self._lock.release()

def count_words(job_url, search_words):
    job_page = pp.get_page(job_url)
    word_counter = {a: 0 for a in search_words}
    for i in range(len(job_page)):
        for w in search_words:
            if w.lower() == job_page[i:(i + len(w))].lower():
                word_counter[w] += 1
    return word_counter

def test_search_success(self):
    driver = self.driver
    action_handler = ActionHandler(driver)
    page_parser = PageParser(driver)
    assert not page_parser.get_search_results_container().is_displayed()
    action_handler.type_search_query(SEARCH_QEURY)
    time.sleep(1)
    assert page_parser.get_search_results_container().is_displayed()
    action_handler.confirm_search_query()
    assert len(page_parser.get_posts_content()) >= MIN_NUMBER_OF_POSTS

def test_like_succeeded(self):
    driver = self.driver
    post_operator = PostOperator(driver)
    button_getter = ButtonGetter(driver)
    presence_checker = PresenceChecker(driver)
    page_parser = PageParser(driver)
    time.sleep(2)
    posts = post_operator.get_posts()
    old_liked_count = page_parser.get_liked_count(URL)
    like_button = button_getter.get_like_button(
        posts[FIRST_POST_INDEX + FOLLOWING_INDEX])
    post_id = post_operator.get_post_id(
        posts[FIRST_POST_INDEX + FOLLOWING_INDEX])
    like_button.click()
    time.sleep(1)
    assert 'liked' in like_button.get_attribute('class')
    self.driver.get(URL + "/likes")
    assert presence_checker.is_there_post(post_operator, post_id)
    new_liked_count = page_parser.get_liked_count(URL)
    assert new_liked_count == old_liked_count + 1
    time.sleep(2)
    button_getter.get_like_button_by_post_id(post_operator, post_id).click()
    driver.refresh()
    assert button_getter.get_like_button_by_post_id(
        post_operator, post_id) is None
    new_liked_count = page_parser.get_liked_count(URL)
    assert new_liked_count == old_liked_count

def get_info(self):
    parser = PageParser()
    # HTMLParser treats "/>" as the end of a tag, so "<br/>" must be replaced
    # (here with "BRBR") before feeding, otherwise parsing fails
    htmlcontent = self.html
    htmlcontent = re.compile('<br/>').sub('BRBR', htmlcontent)
    parser.feed(htmlcontent)
    finalparseurl = parser.getdata()
    # print finalparseurl
    return finalparseurl

def crawl(self):
    for conf in config_lists:
        for url in conf['urls']:
            resp = Downloader().download(url, conf)
            if resp:
                proxy_list = PageParser().parse(resp, conf)
                print(proxy_list)
                print('Validating proxy availability')
                valid_many(proxy_list, 'spider')

def get_links(search_word):
    start_url = 'https://rabota.by/search/vacancy?area=1002&fromSearchLine=true&st=searchVacancy&text='
    start_page = pp.get_page(start_url + search_word)
    start_soup = BeautifulSoup(start_page, 'lxml')
    vacancies = []
    for link in start_soup.find_all('a', href=True):
        if link['href'][0:18] == 'https://rabota.by/':
            vacancies.append(link['href'])
    return vacancies

def fail_search(search_word):
    search_url = 'https://rabota.by/search/vacancy?area=1002&fromSearchLine=true&st=searchVacancy&text=' + search_word
    search_page = pp.get_page(search_url)
    search_soup = BeautifulSoup(search_page, 'lxml')
    key = search_soup.find_all('h1')
    no_pattern = 'ничего не найдено'
    # str.find() returns -1 (truthy) when the pattern is absent, so test
    # membership instead of relying on the return value's truthiness
    if no_pattern in str(key[0]):
        return True
    else:
        return False

class ActionHandler(ActionHandlerGeneral):
    def __init__(self, driver):
        self.page_parser = PageParser(driver)
        self.button_getter = ButtonGetter(driver)

    def type_and_confirm_language(self, language):
        language_selector = self.page_parser.get_language_selector()
        language_selector.send_keys(language)
        language_selector.send_keys(Keys.RETURN)

    def type_language(self, language):
        language_selector = self.page_parser.get_language_selector()
        language_selector.send_keys(language)

    def confirm_language(self):
        language_selector = self.page_parser.get_language_selector()
        language_selector.send_keys(Keys.RETURN)

    def click_delete_account_button(self):
        self.button_getter.get_delete_account_button().click()

def crawl_prxxy_by_pages(page_urls, queue):
    page_parser = PageParser.PageParser()
    for page_url in page_urls:
        headers = {'User-Agent': random.choice(constants.USER_AGENT)}
        r = requests.get(page_url, headers=headers)
        page_parser.set_html_doc(r.text)
        proxy_dict_list = page_parser.extract_proxy_urls()
        for proxy_dict in proxy_dict_list:
            if utils.check_https_proxy(proxy_dict):
                print('crawled a valid proxy:%s' % utils.format_proxy_dict(proxy_dict))
                queue.put(proxy_dict)

def handle_link_shared(event):
    unfurls = {}
    for link in event.get('links'):
        url = link.get('url')
        origin = requests.get(url)
        p = PageParser()
        p.feed(origin.text)
        p.close()
        if p.content_type == PageParser.IMAGE:
            unfurls[url] = {
                'text': 'image',
                'image_url': p.content
            }
    response = requests.post('https://slack.com/api/chat.unfurl',
                             json={
                                 'token': WEB_API_TOKEN,
                                 'channel': event.get('channel'),
                                 'ts': event.get('message_ts'),
                                 'unfurls': unfurls
                             },
                             headers={
                                 'Content-type': 'application/json;charset=utf-8',
                                 'Authorization': 'Bearer %s' % WEB_API_TOKEN
                             })
    print('unfurl %s' % response.text)
    return 'Done'

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_pages', type=int)
    parser.add_argument('--update_tag_db', '-u',
                        help='Upsert scraped data to database',
                        required=False, action='store_true')
    args = parser.parse_args()
    db = Database()
    if db.is_empty('Tag') or args.update_tag_db:
        tag_parser = PageParser("tag")
        tags = tag_parser.get_pages(Tag, MAX)
        db.update_tag_table(tags)
    print("Getting Question Summaries...")
    summary_parser = PageParser("question_summary")
    summaries = summary_parser.get_pages(QuestionSummary, args.num_pages)
    print("Getting Articles...")
    article_parser = APIParser()
    articles = article_parser.get_responses(Article, summaries)
    # Enrich Question Summary with articles
    for question_summary, articles_list in zip(summaries, articles):
        question_summary.add_articles(articles_list)
    print("Populating DB...")
    db.insert_question_summaries(summaries)

def test_parse(self):
    """
    Tests three scenarios:
        a standard URL
        an invalid URL
        a URL pointing to another document type, e.g. a .jpg image
    :return:
    """
    # parser = Page_parser()
    url1 = 'localhost:8081/page1.html'
    expect_sub_url = 'localhost:8081/1/page1_1.html'
    parser = PageParser(url1)
    links = parser.parse()
    self.assertIn(expect_sub_url, links)

    url2 = 'localhost:8081/page7.html'
    parser = PageParser(url2)
    links = parser.parse()
    self.assertEqual(links, set())

    url3 = 'localhost:8081/3/image.jpg'
    parser = PageParser(url3)
    self.assertEqual(parser.parse(), set())

def create_apartment_body(self, html, url):
    pp = PageParser(html)
    return pp.create_apartment_page(url)

class NewEggCrawlHandler(crawle.Handler):
    ITEM_URL_PREFIX = 'http://www.newegg.com/Product/Product.aspx\?Item='
    CART_URL = 'http://secure.newegg.com/Shopping/ShoppingCart.aspx'
    MAP_URL_PREFIX = 'http://www.newegg.com/Product/MappingPrice.aspx?Item='
    ZIP_COOKIE = ''.join(['NV%5FORDERCOOKIE=#4%7b%22Sites%22%3a%7b%22USA%22',
                          '%3a%7b%22Values%22%3a%7b',
                          '%22NVS%255FCUSTOMER%255FSHIPPING%255FMETHOD1%22',
                          '%3a%22038%22%2c',
                          '%22NVS%255FCUSTOMER%255FZIP%255FCODE%22%3a',
                          '%2293117%22%7d%7d%7d%7d'])
    ITEM = 0
    CART = 1
    MAPPING = 2

    @staticmethod
    def transform_id(id):
        return '%s-%s-%s' % (id[7:9], id[9:12], id[12:])

    def __init__(self, output_tar, error_tar, save):
        self.save = save
        self.output_tar = output_tar
        self.error_tar = error_tar
        self.parser = PageParser()
        self.lock = threading.Lock()
        self.items = {}

    def handle_error(self, rr):
        if not self.save:
            return
        temp_file = StringIO()
        cPickle.dump(rr, temp_file, cPickle.HIGHEST_PROTOCOL)
        temp_file.seek(0)
        info = tarfile.TarInfo('error/%s-%s' % rr.request_url)
        info.size = len(temp_file.buf)
        info.mtime = time.time()
        self.lock.acquire()
        self.error_tar.members = []
        self.error_tar.addfile(info, temp_file)
        self.lock.release()
        temp_file.close()

    def save_page(self, rr):
        if not self.save:
            return
        temp_file = StringIO()
        cPickle.dump(rr, temp_file, cPickle.HIGHEST_PROTOCOL)
        temp_file.seek(0)
        info = tarfile.TarInfo('pages/%s-%s' % rr.request_url)
        info.size = len(temp_file.buf)
        info.mtime = time.time()
        self.lock.acquire()
        self.output_tar.members = []
        self.output_tar.addfile(info, temp_file)
        self.lock.release()
        temp_file.close()

    def pre_process(self, rr):
        if not isinstance(rr.request_url, tuple):
            print 'Something slid by: %s' % rr.response_url
        item_id, r_type = rr.request_url
        if r_type == self.ITEM:
            rr.response_url = ''.join([self.ITEM_URL_PREFIX, item_id])
        elif r_type == self.CART:
            rr.response_url = self.CART_URL
            c_id = ''.join(['NV%5FNEWEGGCOOKIE=#4{"Sites":{"USA":{"Values":{"',
                            self.transform_id(item_id), '":"1"}}}}'])
            rr.request_headers = {'Cookie': ';'.join([self.ZIP_COOKIE, c_id])}
        elif r_type == self.MAPPING:
            rr.response_url = ''.join([self.MAP_URL_PREFIX, item_id])
        else:
            raise Exception('Unknown type')
        rr.redirects = 0

    def process(self, rr, queue):
        if rr.response_status == None:
            try:
                if isinstance(rr.error, socket.error):
                    queue.put(rr.request_url)
                elif isinstance(rr.error, crawle.CrawleRedirectsExceeded):
                    pass
                else:
                    self.handle_error(rr)
            except:
                self.handle_error(rr)
            return
        elif rr.response_status != 200:
            self.handle_error(rr)
            return
        item_id, r_type = rr.request_url
        if r_type == self.ITEM:
            info = self.parser.parse_item_page_info(item_id, rr.response_body)
            if not info:
                return
            if 'deactivated' not in info and 'price' not in info:
                queue.put((item_id, self.CART))
        elif r_type == self.CART:
            info = self.parser.parse_cart_page(item_id, rr.response_body)
            if not info:
                queue.put((item_id, self.MAPPING))
                return
        elif r_type == self.MAPPING:
            info = self.parser.parse_mapping_page(item_id, rr.response_body)
        else:
            raise Exception('Unknown Type')
        self.lock.acquire()
        if r_type == self.ITEM:
            self.items[item_id] = info
        else:
            self.items[item_id].update(info)
        self.lock.release()
        self.save_page(rr)