Example #1
    def start_requests(self):
        # Build the first 100 page URLs and yield a request for each.
        start_urls = [self.url + str(i) for i in range(100)]

        for url in start_urls:
            yield Request(url, callback=self._parse)
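This method only runs inside a spider class that provides self.url and a _parse callback. A minimal sketch of such a host class, where the class name, base URL, and callback body are assumptions rather than the project's real code:

class PageSpider(object):
    # Hypothetical host class: the start_requests() above only assumes
    # a `url` attribute and a `_parse` callback on the instance.
    url = "http://example.com/list/page/"

    def _parse(self, response):
        # Placeholder callback for each downloaded page.
        print(response.url)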
Example #2
def paper_page_parser(search_exp):
    """Build one Request per result page for the given search expression."""
    search_exp = urllib2.quote(search_exp.encode('utf-8'))
    # Fetch the first result page to read the total number of hits.
    page_content = requests.get(
        PAPER_SEARCH_URL.format(search_exp=search_exp, page=1)).text
    # 20 hits per page; Python 2 integer division, plus one page for the remainder.
    page_count = int(NUM_RE.search(page_content).group(1)) / 20 + 2
    result = []
    for i in xrange(1, page_count):
        result.append(
            Request(arg=PAPER_SEARCH_URL.format(search_exp=search_exp, page=i),
                    parser=paper_parser))
    return result
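PAPER_SEARCH_URL, NUM_RE, and paper_parser are module-level names this function relies on. A rough sketch of the shapes they need, where the URL template, the regular expression, and the callback are placeholders rather than the project's real values:

import re

# Hypothetical search endpoint and hit-count pattern; only the
# {search_exp}/{page} placeholders and one captured digit group matter.
PAPER_SEARCH_URL = "http://example.com/papers?q={search_exp}&page={page}"
NUM_RE = re.compile(r'found (\d+) results')

def paper_parser(text):
    # Placeholder callback; what it receives depends on the framework.
    pass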
Example #3
    def start_requests(self):
        # Build page URLs 180-199 and yield a request for each.
        start_urls = [self.url + str(i) for i in range(180, 200)]
        self.num = len(start_urls)

        for url in start_urls:
            yield Request(url, callback=self._parse)
Example #4
    def __init__(self, url, html=None):
        self._html = html
        self._url = url

        # Boundaries of the detected content block and its paragraphs,
        # filled in later by the extraction methods.
        self._content_start_pos = ""
        self._content_end_pos = ""
        self._content_center_pos = ""
        self._paragraphs = ""

        # Download the page if the caller did not supply the HTML.
        if not html:
            resp = Request(url).get_response()
            self._html = resp.text

        self._text = self.__del_html_tag(self._html, save_useful_tag=True)
Example #5
    def parser_desc(self, text):
        """add description pages to the queue"""
        find_desc_url = DESC_P.findall(text)
        for desc_url in self.store.record_url(find_desc_url):
            static_desc_url = self.get_static_url(desc_url)
            self.spider.queue.put(Request(static_desc_url))
Example #6
    def parser_detail(self, text):
        """add detail page to queue"""
        find_item_url = ITEM_P.findall(text)
        for item_url in self.store.record_url(find_item_url):
            static_item_url = self.get_static_url(item_url)
            self.spider.queue.put(Request(static_item_url))
Example #7
    def parser_list(self, text):
        """add next list page to queue"""
        find_list_url = LIST_PAGE_P.findall(text)
        for list_url in self.store.record_url(find_list_url):
            static_list_url = self.get_list_url(list_url)
            self.spider.queue.put(Request(static_list_url))
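Examples #5 through #7 all filter the URLs they find through self.store.record_url, which evidently yields only URLs that have not been queued before. A minimal sketch of such a store, assuming a plain in-memory set (the real project may persist seen URLs elsewhere):

class UrlStore(object):
    """Hypothetical deduplicating store: yield only URLs not seen before."""

    def __init__(self):
        self._seen = set()

    def record_url(self, urls):
        for url in urls:
            if url not in self._seen:
                self._seen.add(url)
                yield url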
Example #8
def request(**kwargs):
    """Fetch one URL and drop into an interactive shell to inspect the response."""
    kwargs.setdefault("proxies", None)
    response = Request(**kwargs).get_response()
    print(response)

    IPython.embed(header="now you can use response")
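A likely way to call this debugging helper, assuming the framework's Request accepts a url keyword and a requests-style proxies mapping (the target URL and proxy address are placeholders):

request(url="https://httpbin.org/get")
request(url="https://httpbin.org/get",
        proxies={"https": "http://127.0.0.1:8888"})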
Example #9
    def get_release_time(self):
        # Look for a timestamp near the start of the content block first,
        # then fall back to the middle of it.
        release_time = get_release_time_in_paragraph(self._content_start_pos)
        if not release_time:
            release_time = get_release_time_in_paragraph(
                self._content_center_pos)

        return release_time


if __name__ == "__main__":
    urls = [
        "http://news.cctv.com/2020/06/27/ARTIWaUMWOEtQNxyLiVqrH0Q200627.shtml",
        "http://column.caijing.com.cn/20200724/4684426.shtml",
    ]
    for url in urls:
        resp = Request(url).get_response()
        html = resp.text

        article_extractor = ArticleExtractor(url, html)
        content = article_extractor.get_content()
        title = article_extractor.get_title()
        release_time = article_extractor.get_release_time()
        author = article_extractor.get_author()
        print("---------------------------")
        print(url)
        print("title : ", title)
        print("release_time: ", release_time)
        print("author", author)
        print("content : ", content)
        print("---------------------------")
Example #10
                           user=config.db_user,
                           passwd=config.db_password,
                           db=config.db_database,
                           charset='utf8')
    cursor = conn.cursor()
    # Look up the configured seeds (search expressions or seed URLs)
    # for the spider type given on the command line.
    cursor.execute(
        'select configValue from t_spider_config where configKey=%s',
        (arg_config.get(sys.argv[1]), ))
    config_values = [row[0] for row in cursor.fetchall()]
    if sys.argv[1] == 'paper':
        spider_paper = Spider('paper')
        for search_exp in config_values:
            # Cap each search expression at its first 500 page requests.
            reqs = parser.paper_page_parser(search_exp)[:500]
            for req in reqs:
                spider_paper.add_request(req)
        spider_paper.crawl()

    if sys.argv[1] == 'news':
        spider_news = Spider('news')
        for seed_url in config_values:
            spider_news.add_request(
                Request(arg=seed_url, parser=parser.news_parser))
        spider_news.crawl()

    if sys.argv[1] == 'patent':
        spider_patent = Spider('patent')
        for search_exp in config_values:
            spider_patent.add_request(
                Request(arg=search_exp, parser=parser.patent_parser))
        spider_patent.crawl()
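Example #10 opens mid-call: the lines that create the MySQLdb connection are cut off. A minimal sketch of the preamble it presumably follows, where the import block and the db_host attribute name are assumptions rather than the project's actual code:

import sys

import MySQLdb

import config
import parser  # the project's module holding paper_page_parser, news_parser, ...

# Hypothetical reconstruction of the truncated connection setup.
conn = MySQLdb.connect(host=config.db_host,
                       user=config.db_user,
                       passwd=config.db_password,
                       db=config.db_database,
                       charset='utf8')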