Example #1
    def __get_article_items2(self):
        ##### get article genre list
        logging.info('start crawling_delay')
        time.sleep(self.crawling_delay)
        logging.info('end crawling_delay')

        logging.info('start get method')
        page = webs.get(self.journal_url + self.latest_articles_url,
                        headers=hds,
                        timeout=self.timeout)
        logging.info('end get method')

        article_genre_htmls = re.findall(self.pat_article_genre, page.text)
        logging.info('get %s article genres', str(len(article_genre_htmls)))

        ###### get article list
        article_list_buf = []

        for genre_html in article_genre_htmls:
            article_kind = self.check_items_in_article(
                re.findall(self.pat_article_kind, genre_html))
            article_kind = self.format_text(article_kind)

            article_htmls = re.findall(self.pat_article, genre_html)
            logging.info('get %s articles in %s', str(len(article_htmls)),
                         article_kind)

            ##### get article items
            counter = 0
            for html in article_htmls:
                a = article_module.Aritcle()

                # get items in article
                a.title_e = self.check_items_in_article(
                    re.findall(self.pat_title, html))
                a.url = self.check_items_in_article(
                    re.findall(self.pat_url, html))
                a.kind = article_kind
                a.date = self.check_items_in_article(
                    re.findall(self.pat_publish_date, html))

                # format items
                a.authors = self.format_item_of_authors(
                    re.findall(self.pat_authors, html))
                a.title_e = self.format_text(a.title_e)
                a.date = self.format_date(a.date)

                counter += 1
                article_list_buf.append(a)
                logging.info('added an article: %s', str(counter))

        # store the collected articles once all genres have been processed
        self.article_list = tuple(article_list_buf)
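The example above leans on a few small helpers that are not shown. A minimal sketch of what the class's `check_items_in_article` and `format_text` helpers might do, assuming the first regex match wins and whitespace is normalized (both assumptions, not taken from the source):

import re

def check_items_in_article(matches):
    # Assumed behavior: return the first regex match if there is one,
    # otherwise an empty string so later formatting calls still work.
    return matches[0] if matches else ''

def format_text(text):
    # Assumed behavior: collapse runs of whitespace and strip the ends.
    return re.sub(r'\s+', ' ', text).strip()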
Example #2
    def __get_article_items3(self):
        ##### get new articles part
        logging.info('start crawling_delay')
        time.sleep(self.crawling_delay)
        logging.info('end crawling_delay')

        logging.info('start get method')
        page = webs.get(self.journal_url + self.latest_articles_url,
                        headers=hds,
                        timeout=self.timeout)
        logging.info('end get method')

        new_articles_html = re.findall(self.pat_article_part, page.text)[0]

        ###### get articles
        counter = 0
        article_list_buf = []
        article_htmls = re.findall(self.pat_article, new_articles_html)
        logging.info('get %s articles', str(len(article_htmls)))

        # process the articles oldest-first
        for html in reversed(article_htmls):
            a = article_module.Aritcle()

            # get items in article
            a.title_e = self.check_items_in_article(
                re.findall(self.pat_title, html))
            a.url = self.check_items_in_article(re.findall(self.pat_url, html))
            a.kind = self.check_items_in_article(
                re.findall(self.pat_article_kind, html))
            a.date = self.check_items_in_article(
                re.findall(self.pat_publish_date, html))

            # format items
            a.authors = self.format_text(
                self.format_item_of_authors(re.findall(self.pat_authors,
                                                       html)))
            a.title_e = self.format_text(a.title_e)
            a.kind = self.format_text(a.kind)
            a.date = self.format_date(a.date)

            article_list_buf.append(a)
            counter += 1
            logging.info('added an article: %s', str(counter))

            if self.counter_limit != -1:
                if counter == self.counter_limit:
                    break

        self.article_list = tuple(reversed(article_list_buf))
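`format_date` is referenced throughout these examples but not shown. A minimal sketch, assuming the raw date looks like "12 March 2021" and should be normalized to ISO format (the input format string is a guess, not from the source):

from datetime import datetime

def format_date(date_text):
    # Assumed input like "12 March 2021"; returns "2021-03-12".
    return datetime.strptime(date_text.strip(), '%d %B %Y').date().isoformat()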
Example #3
def get_items_in_PNAS(page):
    html = lxml.html.fromstring(page.content)

    article_section = html.xpath(
        "//div[@class='highwire-cite highwire-cite-highwire-article highwire-citation-pnas-list-complete clearfix']"
    )

    article_list_buf = []

    try:
        for a_sec in article_section:
            a = article_module.Aritcle()

            # get item sections
            title_sec = a_sec.xpath(".//span[@class='highwire-cite-title']")

            # get items (only the title is extracted in this example)
            a.title_e = ''.join(title_sec[0].itertext())
            article_list_buf.append(a)

    except IndexError:
        # re-raise without discarding the original traceback
        raise

    return tuple(article_list_buf)
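A usage sketch for the function above, assuming `requests` is used for the HTTP call; the URL and User-Agent are placeholders, not taken from the source:

import requests

# Placeholder URL and headers; the real crawler builds these from its config.
page = requests.get('https://www.pnas.org/content/early/recent',
                    headers={'User-Agent': 'example-crawler'},
                    timeout=30)
articles = get_items_in_PNAS(page)
print(len(articles))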
Example #4
    def get_article_items(self):
        ##### get latest articles
        logging.info('start crawling_delay')
        time.sleep(self.crawling_delay)
        logging.info('end crawling_delay')

        logging.info('start get method')
        #page = webs.get(self.journal_url + self.latest_articles_url, headers=self.hds, timeout=self.timeout)
        page = webs.get(self.journal_url + self.latest_articles_url,
                        timeout=self.timeout)
        logging.info('end get method')

        # log the page source each time for debugging
        with open('log/page_source_{}.binf'.format(self.journal_name),
                  'wb') as f:
            pickle.dump(page, f)

        ###### get article items
        html = lxml.html.fromstring(page.content)

        article_section = html.xpath("//div[@class='toc__item__body']")
        article_list_buf = []
        counter = 0

        logging.info('found %s articles', str(len(article_section)))

        try:
            for a_sec in reversed(article_section):
                a = article_module.Aritcle()

                # get item sections
                title_sec = a_sec.xpath(".//h3[@class='toc__item__title']/a")
                author_sec = a_sec.xpath(
                    ".//ul[@class='toc__item__authors loa rlist--inline']/li/text()"
                )
                kind_sec = a_sec.xpath(
                    ".//div[@class='toc__item__type']/text()")
                date_sec = a_sec.xpath(
                    ".//div[@class='toc__item__date']/text()")

                # get items
                buf_str = lxml.html.tostring(title_sec[0]).decode('utf-8')
                buf_str = text_convert.unescape(buf_str)

                #buf_str = entity_references.change_entity_references_to_utf8_in_text(buf_str)
                title_sec2 = lxml.html.fromstring(buf_str)

                a.title_e = title_sec2.text_content()
                a.url = title_sec[0].values()[0]
                a.authors = ' '.join(author_sec)
                a.kind = kind_sec[0]
                a.date = self.format_date(date_sec[0])

                # add article to article_list
                article_list_buf.append(a)

                # logging
                counter += 1
                logging.info('added an article: %s', str(counter))

                # limitation for getting articles
                if self.counter_limit != -1:
                    if counter == self.counter_limit:
                        break

            self.article_list = tuple(reversed(article_list_buf))

        except IndexError:
            # re-raise without discarding the original traceback
            raise
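The pickled response saved under `log/` can be reloaded later to replay the parsing offline; a minimal sketch, with the journal name as a placeholder:

import pickle

# 'example_journal' stands in for self.journal_name.
with open('log/page_source_example_journal.binf', 'rb') as f:
    saved_page = pickle.load(f)

# The saved object behaves like the original response.
print(saved_page.status_code)
print(saved_page.text[:200])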
Example #5
    def get_article_items(self):
        ##### get latest articles
        logging.info('start crawling_delay')
        time.sleep(self.crawling_delay)
        logging.info('end crawling_delay')

        logging.info('start get method')
        page = webs.get(self.journal_url + self.latest_articles_url,
                        headers=self.hds,
                        timeout=self.timeout)
        logging.info('end get method')

        # log the page source each time for debugging
        with open('log/page_source_{}.binf'.format(self.journal_name),
                  'wb') as f:
            pickle.dump(page, f)

        ###### get article items
        prs = lxml.html.HTMLParser(encoding='utf-8')
        html = lxml.html.fromstring(page.content, parser=prs)

        article_section1 = html.xpath(".//ul[@class='issue-toc item-list']")
        article_section2 = article_section1[0].xpath(".//article")
        article_list_buf = []
        counter = 0

        logging.info('found %s articles', str(len(article_section2)))

        for a_sec in article_section2:
            a = article_module.Aritcle()

            # get item sections
            title_sec = a_sec.xpath("./div/h3/a/div")
            author_sec = a_sec.xpath(
                ".//span[@class='highwire-citation-authors']/span/text()")
            url_sec = a_sec.xpath(
                ".//a[@class='highwire-cite-linked-title']/@href")
            date_sec = a_sec.xpath(".//time/text()")

            # get items
            a.title_e = title_sec[0].text_content()
            a.url = url_sec[0]
            a.authors = ', '.join(author_sec)
            a.date = self.format_date(date_sec[0])

            # get article type
            kind_sec = title_sec[0].xpath(".//ancestor::node()")
            for k_sec in reversed(kind_sec):
                # the text of the first <h2> found among the ancestor nodes is the article type
                sec = k_sec.xpath(".//h2")
                if len(sec) != 0:
                    buf = sec[0].text_content().strip()
                    buf = buf.replace("\n", " ").replace("\t", "")
                    a.kind = buf
                    break

            # add article to article_list
            article_list_buf.append(a)

            # logging
            counter += 1
            logging.info('added an article: %s', str(counter))

            # limitation for getting articles
            if self.counter_limit != -1:
                if counter == self.counter_limit:
                    break

        self.article_list = tuple(article_list_buf)
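The article-type lookup above walks up from the title node to the first enclosing section heading. A self-contained illustration of that idea on a toy document (the HTML below is invented for the example):

import lxml.html

# Toy listing: the section's <h2> plays the role of the article type.
toy_html = ("<div class='issue-section'>"
            "<h2>Research Articles</h2>"
            "<article><div><h3><a href='/x'>"
            "<div>Sample title</div></a></h3></div></article>"
            "</div>")
doc = lxml.html.fromstring(toy_html)

title = doc.xpath(".//article/div/h3/a/div")[0]

# walk the ancestors from the nearest outwards and take the first <h2> found
kind = None
for ancestor in title.iterancestors():
    headings = ancestor.xpath("./h2")
    if headings:
        kind = headings[0].text_content().strip()
        break

print(kind)  # Research Articles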
Example #6
    def get_article_items(self):
        ##### get latest articles
        logging.info('start crawling_delay')
        time.sleep(self.crawling_delay)
        logging.info('end crawling_delay')

        logging.info('start get method')
        page = webs.get(self.journal_url + self.latest_articles_url,
                        headers=self.hds,
                        timeout=self.timeout)
        logging.info('end get method')

        # log the page source each time for debugging
        with open('log/page_source_{}.binf'.format(self.journal_name),
                  'wb') as f:
            pickle.dump(page, f)

        ###### get article items
        prs = lxml.html.HTMLParser(encoding='utf-8')
        html = lxml.html.fromstring(page.content, parser=prs)

        article_section = html.xpath(self.article_sec_path)
        article_list_buf = []
        counter = 0

        logging.info('found %s articles', str(len(article_section)))

        try:
            for i in range(3):  # search articles in three pages
                for a_sec in article_section:
                    a = article_module.Aritcle()

                    # get item sections
                    # do not use text() here: when tags such as <i> appear
                    # inside the title, text() does not return the full title
                    title_sec = a_sec.xpath(self.title_sec_path)
                    url_sec = a_sec.xpath(self.url_sec_path)
                    date_sec = a_sec.xpath(self.date_sec_path)
                    kind_sec = a_sec.xpath(self.kind_sec_path)

                    # get items
                    a.title_e = title_sec[0].text_content().strip()
                    a.url = url_sec[0]
                    a.date = self.format_date(date_sec[0])
                    a.kind = kind_sec[0]

                    # add article to article_list
                    article_list_buf.append(a)

                    # logging
                    counter += 1
                    logging.info('added an article: %s', str(counter))

                    # limitation for getting articles
                    if self.counter_limit != -1:
                        if counter == self.counter_limit:
                            break

                if counter == self.counter_limit:
                    break

                # go to the next page
                next_url = html.xpath(self.next_btn_path)[0]
                time.sleep(self.crawling_delay)
                page = webs.get(self.journal_url + next_url,
                                headers=self.hds,
                                timeout=self.timeout)
                html = lxml.html.fromstring(page.content, parser=prs)
                article_section = html.xpath(self.article_sec_path)

            self.article_list = tuple(reversed(article_list_buf))

        except IndexError:
            # re-raise without discarding the original traceback
            raise
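Example #6 is driven entirely by per-journal XPath settings. A sketch of the kind of configuration a journal class might carry; the concrete expressions below are placeholders modeled on Example #4, not values from the source:

class ExampleJournal:
    # Placeholder XPath settings; each journal defines its own.
    article_sec_path = "//div[@class='toc__item__body']"
    title_sec_path = ".//h3[@class='toc__item__title']/a"
    url_sec_path = ".//h3[@class='toc__item__title']/a/@href"
    date_sec_path = ".//div[@class='toc__item__date']/text()"
    kind_sec_path = ".//div[@class='toc__item__type']/text()"
    next_btn_path = "//a[contains(@class, 'pagination__btn--next')]/@href"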