Ejemplo n.º 1
0
 def parse(self, response):
     """Yield article dicts scraped from a company or a market listing page.

     When ``self.mode`` is ``"c"`` the company XPaths are used and the time
     is taken from the last 10 characters of the extracted time text.
     Otherwise the page is treated as a market listing: the first node
     matched by ``self.market_list_xpath`` is only read when the request URL
     contains ``?page=1``, and the second node is always read. Market titles
     get the formatted time appended.

     Only articles accepted by ``is_in_filtered_time`` are yielded. Each
     yielded dict has the keys ``title``, ``time`` and ``link``.
     """
     def text_of(node, query):
         # First match of the XPath, whitespace-trimmed.
         return node.xpath(query).extract_first().strip()

     def market_item(node, time_query, title_query, link_query):
         # Build one market entry, or None when its time is filtered out.
         stamp = get_time(text_of(node, time_query))
         if not is_in_filtered_time(stamp):
             return None
         return {
             "title": text_of(node, title_query) + " " + stamp,
             "time": stamp,
             "link": self.target_root + text_of(node, link_query)}

     if self.mode != "c":
         sections = response.xpath(self.market_list_xpath)
         if "?page=1" in response.request.url:
             # The first section is only scraped on the first page.
             for row in sections[0].xpath("./li"):
                 entry = market_item(row,
                                     self.market_top_time_xpath,
                                     self.market_top_title_xpath,
                                     self.market_top_link_xpath)
                 if entry is not None:
                     yield entry
         for row in sections[1].xpath("./li"):
             entry = market_item(row,
                                 self.market_time_xpath,
                                 self.market_title_xpath,
                                 self.market_link_xpath)
             if entry is not None:
                 yield entry
         return

     # Company mode.
     for row in response.xpath(self.company_list_xpath):
         stamp = get_time(text_of(row, self.company_time_xpath)[-10:])
         if not is_in_filtered_time(stamp):
             continue
         yield {
             "title": text_of(row, self.company_title_xpath),
             "time": stamp,
             "link": self.target_root + text_of(row, self.company_link_xpath)}
Ejemplo n.º 2
0
 def parse(self, response):
     """Yield one item or follow-up ``Request`` per listed article.

     For every node matched by ``self.list_xpath`` a dict with the keys
     ``title``, ``time``, ``init`` and ``link`` is built. Articles rejected
     by ``is_in_filtered_time`` are skipped. When ``self.keyword`` is truthy
     the article page is requested with ``self.examine_article`` as callback
     and the dict plus the keyword passed through ``meta``; otherwise the
     dict itself is yielded.
     """
     for node in response.selector.xpath(self.list_xpath):
         link = node.xpath(self.link_xpath).extract_first().strip()
         title = node.xpath(self.title_xpath).extract_first().strip()
         published = get_time(
             node.xpath(self.time_xpath).extract_first().strip())
         summary = node.xpath(self.init_xpath).extract_first().strip()
         article_detail = {
             "title": title,
             "time": published,
             "init": summary,
             "link": link
         }
         if not is_in_filtered_time(published):
             continue
         if not self.keyword:
             yield article_detail
             continue
         # Keyword filtering happens on the article page itself.
         request_meta = {
             "article_detail": article_detail,
             "keyword": self.keyword
         }
         yield Request(url=link,
                       callback=self.examine_article,
                       meta=request_meta)
Ejemplo n.º 3
0
    def examine_article(response):
        """Yield the article from ``response.meta`` if its body matches every keyword.

        ``response.meta`` is expected to carry ``article_detail`` (the dict to
        yield) and ``keyword`` (an iterable of keywords). A keyword counts as
        found when it occurs, surrounded by single spaces and compared
        case-insensitively, in the first text node of any ``<p>`` inside the
        post-content container. Every keyword must be found; an empty or
        missing keyword list matches everything.

        On a match the article's ``time`` is replaced with the time shown on
        the article page, and the article is yielded only if
        ``is_in_filtered_time`` accepts that time.

        NOTE(review): no ``self`` parameter is declared although the method is
        used as ``self.examine_article``; presumably it is declared static
        outside the visible lines. Confirm.
        """
        article_content_xpath = "//article[@id='detail-content']/div[@class='post-content ']/p"
        article_detail = response.meta.get("article_detail")
        # A missing keyword list is treated like an empty one instead of
        # raising TypeError when iterated.
        keyword_list = response.meta.get("keyword") or []

        # Extract and lowercase each paragraph once instead of once per keyword.
        paragraphs = [
            str(paragraph.xpath(".//text()").extract_first()).lower()
            for paragraph in response.selector.xpath(article_content_xpath)
        ]

        # Every keyword must appear in at least one paragraph. With no
        # paragraphs ``any`` is False, so an empty article body no longer
        # passes the filter by accident. Keywords are lowercased to match the
        # lowercased paragraph text.
        matched = all(
            any((" " + kw.lower() + " ") in text for text in paragraphs)
            for kw in keyword_list)
        if not matched:
            return

        time_xpath = "//div[@class='pull-right mt-5 mr-10']/text()"
        time_nodes = response.selector.xpath(time_xpath)
        formatted_time = get_time("".join(time_nodes.extract()).strip())
        article_detail["time"] = formatted_time
        if is_in_filtered_time(formatted_time):
            yield article_detail
Ejemplo n.º 4
0
 def parse(self, response):
     """Request the detail page of every in-range article in the embedded HTML.

     Namespaces are removed from the response selector, then the text of the
     ``CallbackContent`` element is wrapped in an ``HtmlResponse`` so the
     article list can be queried with the spider's XPaths. For each article
     whose time passes ``is_in_filtered_time`` a ``Request`` to its link is
     yielded with ``self.get_summary`` as callback; the partially filled
     article dict (``init`` left empty) travels in ``meta``.
     """
     response.selector.remove_namespaces()
     embedded_markup = response.selector.xpath(
         "//CallbackContent/text()").extract_first().strip()
     res_html = HtmlResponse(url="my HTML string",
                             body=embedded_markup,
                             encoding='utf-8')
     for node in res_html.xpath(self.list_xpath):
         published = get_time(
             node.xpath(self.time_xpath).extract_first().strip())
         if not is_in_filtered_time(published):
             continue
         link = node.xpath(self.link_xpath).extract_first().strip()
         title = node.xpath(self.title_xpath).extract_first().strip()
         article_detail = {
             "title": title,
             "time": published,
             "init": "",
             "link": link
         }
         yield Request(url=link,
                       callback=self.get_summary,
                       meta={"article_detail": article_detail})
Ejemplo n.º 5
0
 def parse(self, response):
     """Yield one item or follow-up ``Request`` per article in a JSON listing.

     The response text is parsed as JSON and the articles are read from
     ``["members"]["items"]``. Each article becomes a dict with the keys
     ``title``, ``time``, ``init`` and ``link``. Articles rejected by
     ``is_in_filtered_time`` are skipped. When ``self.keyword`` is truthy the
     article URL is requested with ``self.examine_article`` as callback;
     otherwise the dict is yielded directly.
     """
     payload = json.loads(response.text)
     for entry in payload["members"]["items"]:
         link = entry["url"]
         headline = entry["headline"]
         published = get_time(entry["created"])
         summary = entry["summary"]
         article_detail = {
             "title": headline,
             "time": published,
             "init": summary,
             "link": link
         }
         if not is_in_filtered_time(published):
             continue
         if not self.keyword:
             yield article_detail
             continue
         request_meta = {
             "article_detail": article_detail,
             "keyword": self.keyword
         }
         yield Request(url=link,
                       callback=self.examine_article,
                       meta=request_meta)
    def examine_article(response):
        """Yield the article from ``response.meta`` if it is in range and matches the keywords.

        ``response.meta`` is expected to carry ``article_detail`` (the dict to
        yield) and ``keyword`` (an iterable of keywords, possibly empty).

        The article's ``time`` is refreshed from the first ``d/m/yyyy`` date
        found in the author-info block of the page. If no such date is
        present, the time already stored in ``article_detail`` is kept rather
        than raising ``AttributeError``.

        The article is yielded only when ``is_in_filtered_time`` accepts its
        time and, if keywords were given, every keyword occurs (surrounded by
        single spaces, compared case-insensitively) in the first text node of
        at least one ``<p>`` of the news body.

        NOTE(review): no ``self`` parameter is declared although the method is
        used as ``self.examine_article``; presumably it is declared static
        outside the visible lines. Confirm.
        """
        article_detail = response.meta.get("article_detail")
        keyword_list = response.meta.get("keyword")

        article_content_xpath = "//div[@class='news-body-content']/p"

        time_str = "".join(
            response.selector.xpath(
                "//div[@class='news-author-info']//text()").extract()).strip()
        # Raw string: the pattern contains ``\d``, which is an invalid escape
        # sequence in a normal string literal.
        time_regex = r"((0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[0-2])/\d\d\d\d)"
        time_match = re.search(time_regex, time_str)
        if time_match is not None:
            article_detail["time"] = get_time(time_match.group(1))

        if not is_in_filtered_time(article_detail.get("time")):
            return

        if keyword_list:
            # Extract and lowercase each paragraph once, not once per keyword.
            paragraphs = [
                str(paragraph.xpath(".//text()").extract_first()).lower()
                for paragraph in response.selector.xpath(article_content_xpath)
            ]
            # Every keyword must be present in at least one paragraph.
            # Keywords are lowercased to match the lowercased paragraph text.
            matched = all(
                any((" " + kw.lower() + " ") in text for text in paragraphs)
                for kw in keyword_list)
            if not matched:
                return

        yield article_detail
Ejemplo n.º 7
0
 def parse(self, response):
     """Yield in-range announcement dicts from a listing page.

     For every node matched by ``self.list_xpath`` a dict with the keys
     ``title``, ``time`` and ``init`` is built. When the request URL contains
     ``NextPageTinCPNY_CBTCPH`` the title is ``<stock id> - <organisation
     name>`` and ``init`` comes from ``self.title_issuer_xpath``; otherwise
     the title is the stock id alone and ``init`` comes from
     ``self.title_hnx_xpath``. Only dicts accepted by ``is_in_filtered_time``
     are yielded.
     """
     def cell(node, query):
         # First match of the XPath, whitespace-trimmed.
         return node.xpath(query).extract_first().strip()

     for row in response.selector.xpath(self.list_xpath):
         issuer_page = "NextPageTinCPNY_CBTCPH" in response.request.url
         stock_id = cell(row, self.stock_id_xpath)
         if issuer_page:
             title = stock_id + " - " + cell(row, self.org_name_xpath)
             init = cell(row, self.title_issuer_xpath)
         else:
             title = stock_id
             init = cell(row, self.title_hnx_xpath)
         published = get_time(cell(row, self.time_xpath))
         article_detail = {
             "title": title,
             "time": published,
             "init": init
         }
         if not is_in_filtered_time(published):
             continue
         yield article_detail
 def parse(self, response):
     """Yield items or follow-up ``Request`` objects for a listing page.

     On a URL ending in ``p1.htm`` the "top focus" article and the entries of
     the top list whose class is ``block-normal-item`` or
     ``block-normal-item last`` are processed first, with an empty
     ``intro``. The regular article list is processed on every page.

     Each article's time is derived from its link via ``get_time_from_link``
     and articles rejected by ``is_in_filtered_time`` are skipped. When
     ``self.keyword`` is truthy the absolute article URL is requested with
     ``self.examine_article`` as callback; otherwise the article dict
     (``title``, ``time``, ``intro``, ``link``) is yielded.
     """
     def text_of(node, query):
         # First match of the XPath, whitespace-trimmed.
         return node.xpath(query).extract_first().strip()

     def route(detail):
         # Either the dict itself or a Request that defers to the keyword
         # check; detail["link"] already holds the absolute URL.
         if self.keyword:
             return Request(url=detail["link"],
                            callback=self.examine_article,
                            meta={
                                "article_detail": detail,
                                "keyword": self.keyword
                            })
         return detail

     # Get top_focus
     if str(response.request.url).endswith("p1.htm"):
         focus = response.selector.xpath(self.top_focus_xpath)
         focus_link = text_of(focus, self.top_focus_link_xpath)
         focus_detail = {
             "title": text_of(focus, self.top_focus_title_xpath),
             "time": get_time_from_link(focus_link),
             "intro": "",
             "link": self.target_root + focus_link
         }
         if is_in_filtered_time(focus_detail["time"]):
             yield route(focus_detail)

         wanted_classes = ("block-normal-item", "block-normal-item last")
         for entry in response.selector.xpath(self.top_list_xpath):
             entry_class = text_of(entry, self.top_list_item_class_xpath)
             if entry_class not in wanted_classes:
                 continue
             entry_link = text_of(entry, self.top_list_item_link_xpath)
             entry_detail = {
                 "title": text_of(entry, self.top_list_item_title_xpath),
                 "time": get_time_from_link(entry_link),
                 "intro": "",
                 "link": self.target_root + entry_link
             }
             if is_in_filtered_time(entry_detail["time"]):
                 yield route(entry_detail)

     # Get articles
     for node in response.selector.xpath(self.list_xpath):
         node_link = text_of(node, self.article_link_xpath)
         node_detail = {
             "title": text_of(node, self.article_title_xpath),
             "time": get_time_from_link(node_link),
             "intro": text_of(node, self.article_intro_xpath),
             "link": self.target_root + node_link
         }
         if is_in_filtered_time(node_detail["time"]):
             yield route(node_detail)
Ejemplo n.º 9
0
    def parse(self, response):
        """Yield items or follow-up ``Request`` objects for a listing page.

        On a URL ending in ``page-1.html`` the top item is emitted first. It
        is stamped with today's date and is not passed through
        ``is_in_filtered_time``. (A secondary "top list" scrape was removed
        on 2018/11/22 as redundant.)

        Every node matched by ``self.list_xpath`` is then turned into a dict
        with the keys ``title``, ``time``, ``init`` and ``link``. The time is
        the first ``d/m/yyyy`` date found in the item's time text; an item
        without such a date is skipped instead of aborting the whole page
        with ``AttributeError``. Items rejected by ``is_in_filtered_time`` are
        skipped as well.

        When ``self.keyword`` is truthy the article link is requested with
        ``self.examine_article`` as callback; otherwise the dict is yielded.
        """
        def text_of(node, query):
            # First match of the XPath, whitespace-trimmed.
            return node.xpath(query).extract_first().strip()

        def route(detail):
            # Either the dict itself or a Request deferring to the keyword check.
            if self.keyword:
                return Request(url=detail["link"],
                               callback=self.examine_article,
                               meta={
                                   "article_detail": detail,
                                   "keyword": self.keyword
                               })
            return detail

        if str(response.request.url).endswith("page-1.html"):
            # Get top item
            top_item = response.selector.xpath(self.top_item_xpath)
            top_item_link = text_of(top_item, self.top_item_link_xpath)
            yield route({
                "title": text_of(top_item, self.top_item_title_xpath),
                "time": date.today().strftime("%Y/%m/%d"),
                "init": text_of(top_item, self.top_item_init_xpath),
                "link": top_item_link
            })

        # Compiled once per page. Raw string because the pattern contains
        # ``\d``, an invalid escape sequence in a normal string literal.
        time_pattern = re.compile(
            r"((0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[0-2])/\d\d\d\d)")

        # Get article in list
        for article in response.selector.xpath(self.list_xpath):
            article_link = text_of(article, self.list_item_link_xpath)
            raw_time = "".join(
                article.xpath(self.list_item_time_xpath).extract()).strip()
            found = time_pattern.search(raw_time)
            if found is None:
                # No recognisable date: skip this item, keep the rest.
                continue
            article_detail = {
                "title": text_of(article, self.list_item_title_xpath),
                "time": get_time(found.group(1)),
                "init": text_of(article, self.list_item_init_xpath),
                "link": article_link
            }
            if is_in_filtered_time(article_detail["time"]):
                yield route(article_detail)
Ejemplo n.º 10
0
    def parse(self, response):
        """Yield items or follow-up ``Request`` objects for a listing page.

        Two page layouts are handled, chosen by whether the request URL
        contains ``make-it``:

        * Regular pages: on a URL ending in ``?page=1`` the headline matched
          by ``self.headline_xpath`` is processed; then every node of
          ``self.headline_list_xpath`` and ``self.list_xpath`` that has no
          ``id`` attribute.
        * ``make-it`` pages: on a URL ending in ``?page=1`` the headline
          anchor under ``self.list_xpath + "/div/a"`` is processed, with its
          link and time both read from ``@href``; then every node of
          ``self.list_xpath`` whose ``class`` contains ``" card"``.

        For each candidate the link is prefixed with ``self.target_root``.
        Links containing ``/video/`` and articles rejected by
        ``is_in_filtered_time`` are skipped. The article time is characters
        1 to 10 of the extracted time text. When ``self.keyword`` is truthy
        the link is requested with ``self.examine_article`` as callback;
        otherwise the article dict (``title``, ``time``, ``init``, ``link``)
        is yielded.
        """
        def text_of(node, query):
            # First match of the XPath, whitespace-trimmed.
            return node.xpath(query).extract_first().strip()

        def result_for(node, link_query, title_query, time_query):
            # Build the output for one article node: a dict, a Request that
            # defers to the keyword check, or None when the node is skipped.
            link = self.target_root + text_of(node, link_query)
            if "/video/" in link:
                return None
            article_detail = {"title": text_of(node, title_query),
                              "time": text_of(node, time_query)[1:11],
                              "init": text_of(node, self.init_xpath),
                              "link": link}
            if not is_in_filtered_time(article_detail.get("time")):
                return None
            if self.keyword:
                return Request(url=link, callback=self.examine_article,
                               meta={"article_detail": article_detail,
                                     "keyword": self.keyword})
            return article_detail

        request_url = str(response.request.url)
        is_first_page = request_url.endswith("?page=1")

        # Not make-it sub
        if "make-it" not in request_url:
            # Get headline article
            if is_first_page:
                headline = response.selector.xpath(self.headline_xpath)
                # xpath() returns a (possibly empty) selector list, never
                # None, so test for emptiness to avoid calling .strip() on
                # a missing headline.
                if headline:
                    result = result_for(headline, self.link_xpath,
                                        self.title_xpath, self.time_xpath)
                    if result is not None:
                        yield result

            # Get headline articles in list, then regular articles in list;
            # both lists share the same per-node handling.
            for list_query in (self.headline_list_xpath, self.list_xpath):
                for node in response.selector.xpath(list_query):
                    if node.xpath("./@id").extract_first() is not None:
                        continue
                    result = result_for(node, self.link_xpath,
                                        self.title_xpath, self.time_xpath)
                    if result is not None:
                        yield result

        # Make-it sub
        else:
            # Get headline article
            if is_first_page:
                makeit_headline_xpath = self.list_xpath + "/div/a"
                headline = response.selector.xpath(makeit_headline_xpath)
                # Same emptiness check as above.
                if headline:
                    result = result_for(headline, "./@href",
                                        "./div[@class='headline']/text()",
                                        "./@href")
                    if result is not None:
                        yield result

            # Get article in list
            for node in response.selector.xpath(self.list_xpath):
                if " card" not in str(node.xpath("./@class").extract_first()):
                    continue
                result = result_for(node, self.link_xpath,
                                    self.title_xpath, self.time_xpath)
                if result is not None:
                    yield result