def start_requests(self):
    start_url = list()
    for i in range(100):
        i = str(i)
        u = self.url + i
        start_url.append(u)
    for url in start_url:
        yield Request(url, callback=self._parse)
def paper_page_parser(search_exp):
    """Build one Request per result page of a paper search."""
    search_exp = urllib.parse.quote(search_exp.encode('utf-8'))
    # Fetch the first result page to read the total hit count.
    page_content = requests.get(
        PAPER_SEARCH_URL.format(search_exp=search_exp, page=1)).text
    # 20 results per page; "+ 2" makes the exclusive upper bound of range()
    # cover the final, possibly partial, page.
    page_count = int(NUM_RE.search(page_content).group(1)) // 20 + 2
    result = []
    for i in range(1, page_count):
        result.append(
            Request(arg=PAPER_SEARCH_URL.format(search_exp=search_exp, page=i),
                    parser=paper_parser))
    return result
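# Worked check of the pagination arithmetic above, assuming NUM_RE captures the
# total hit count and each result page holds 20 entries; the hit counts here
# are made-up illustrations, not real search results.
for hits, expected_pages in [(57, [1, 2, 3]), (20, [1, 2]), (3, [1])]:
    assert list(range(1, hits // 20 + 2)) == expected_pages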
def start_requests(self):
    start_url = list()
    for i in range(180, 200):
        i = str(i)
        u = self.url + i
        start_url.append(u)
    # Remember how many start URLs were generated.
    self.num = len(start_url)
    for url in start_url:
        yield Request(url, self._parse)
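# Minimal sketch of the spider these start_requests variants assume: a class
# with a `url` prefix attribute and a `_parse` callback. The class name, URL,
# and the body of _parse below are illustrative assumptions, not project code.
class ExampleIndexSpider:
    url = "http://example.com/list/"  # hypothetical page-index prefix

    def _parse(self, response):
        # A real spider would extract fields or follow-up links here.
        pass

    def start_requests(self):
        # Same pattern as the snippets above, over a small range.
        for i in range(3):
            yield Request(self.url + str(i), callback=self._parse)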
def __init__(self, url, html=None):
    self._html = html
    self._url = url
    self._content_start_pos = ""
    self._content_end_pos = ""
    self._content_center_pos = ""
    self._paragraphs = ""
    if not html:
        # No HTML supplied: fetch the page ourselves.
        resp = Request(url).get_response()
        self._html = resp.text
    self._text = self.__del_html_tag(self._html, save_useful_tag=True)
def parser_desc(self, text):
    """add description page to queue"""
    find_desc_url = DESC_P.findall(text)
    for desc_url in self.store.record_url(find_desc_url):
        static_desc_url = self.get_static_url(desc_url)
        self.spider.queue.put(Request(static_desc_url))
def parser_detail(self, text):
    """add detail page to queue"""
    find_item_url = ITEM_P.findall(text)
    for item_url in self.store.record_url(find_item_url):
        static_item_url = self.get_static_url(item_url)
        self.spider.queue.put(Request(static_item_url))
def parser_list(self, text):
    """add next list page to queue"""
    find_list_url = LIST_PAGE_P.findall(text)
    for list_url in self.store.record_url(find_list_url):
        static_list_url = self.get_list_url(list_url)
        self.spider.queue.put(Request(static_list_url))
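# The three parsers above depend on pre-compiled URL patterns (DESC_P, ITEM_P,
# LIST_PAGE_P). A minimal sketch of what such patterns might look like; the
# concrete expressions are illustrative assumptions, not the project's real ones.
import re

DESC_P = re.compile(r'href="(/desc/\d+\.html)"')       # description pages
ITEM_P = re.compile(r'href="(/item/\d+\.html)"')       # detail (item) pages
LIST_PAGE_P = re.compile(r'href="(/list_\d+\.html)"')  # next list pages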
def request(**kwargs):
    kwargs.setdefault("proxies", None)
    response = Request(**kwargs).get_response()
    print(response)
    IPython.embed(header="now you can use response")
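# Example invocation of the debug helper above; assumes this project's Request
# accepts a `url` keyword, as the other snippets suggest. The URL is a placeholder.
request(url="http://example.com")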
        release_time = get_release_time_in_paragraph(self._content_start_pos)
        if not release_time:
            release_time = get_release_time_in_paragraph(
                self._content_center_pos)
        return release_time


if __name__ == "__main__":
    urls = [
        "http://news.cctv.com/2020/06/27/ARTIWaUMWOEtQNxyLiVqrH0Q200627.shtml",
        "http://column.caijing.com.cn/20200724/4684426.shtml",
    ]
    for url in urls:
        resp = Request(url).get_response()
        html = resp.text
        article_extractor = ArticleExtractor(url, html)
        content = article_extractor.get_content()
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        print("---------------------------")
        print(url)
        print("title : ", title)
        print("release_time: ", release_time)
        print("author : ", author)
        print("content : ", content)
        print("---------------------------")
    user=config.db_user,
    passwd=config.db_password,
    db=config.db_database,
    charset='utf8')
cursor = conn.cursor()
cursor.execute(
    'select configValue from t_spider_config where configKey=%s',
    (arg_config.get(sys.argv[1]), ))
config_values = [row[0] for row in cursor.fetchall()]

if sys.argv[1] == 'paper':
    spider_paper = Spider('paper')
    for search_exp in config_values:
        reqs = parser.paper_page_parser(search_exp)[:500]
        for req in reqs:
            spider_paper.add_request(req)
    spider_paper.crawl()

if sys.argv[1] == 'news':
    spider_news = Spider('news')
    for seed_url in config_values:
        spider_news.add_request(
            Request(arg=seed_url, parser=parser.news_parser))
    spider_news.crawl()

if sys.argv[1] == 'patent':
    spider_patent = Spider('patent')
    for search_exp in config_values:
        spider_patent.add_request(
            Request(arg=search_exp, parser=parser.patent_parser))
    spider_patent.crawl()
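# The runner above picks its mode from the first command-line argument, so it
# would typically be launched as one of the following (the script name here is
# an assumption):
#
#   python run_spider.py paper
#   python run_spider.py news
#   python run_spider.py patent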