def __get_article_items2(self):
    ##### get article genre list
    logging.info('start crawling_delay')
    time.sleep(self.crawling_delay)
    logging.info('end crawling_delay')
    logging.info('start get method')
    page = webs.get(self.journal_url + self.latest_articles_url,
                    headers=hds,
                    timeout=self.timeout)
    logging.info('end get method')
    article_genre_htmls = re.findall(self.pat_article_genre, page.text)
    logging.info('found %s article genres', len(article_genre_htmls))
    ###### get article list
    article_list_buf = []
    for genre_html in article_genre_htmls:
        article_kind = self.check_items_in_article(
            re.findall(self.pat_article_kind, genre_html))
        article_kind = self.format_text(article_kind)
        article_htmls = re.findall(self.pat_article, genre_html)
        logging.info('found %s articles in %s', len(article_htmls),
                     article_kind)
        ##### get article items
        counter = 0
        for html in article_htmls:
            a = article_module.Aritcle()
            # get items in article
            a.title_e = self.check_items_in_article(
                re.findall(self.pat_title, html))
            a.url = self.check_items_in_article(
                re.findall(self.pat_url, html))
            a.kind = article_kind
            a.date = self.check_items_in_article(
                re.findall(self.pat_publish_date, html))
            # format items
            a.authors = self.format_item_of_authors(
                re.findall(self.pat_authors, html))
            a.title_e = self.format_text(a.title_e)
            a.date = self.format_date(a.date)
            counter += 1
            article_list_buf.append(a)
            logging.info('added an article: %s', counter)
    self.article_list = tuple(article_list_buf)
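
# --- illustrative sketch (not part of the original module) ---
# check_items_in_article() is called throughout but not defined in this
# file. A minimal sketch of the behaviour the call sites assume: return
# the first re.findall() hit, or an empty string when the pattern matched
# nothing, so the later format_* calls never hit an IndexError. The
# empty-string fallback is an assumption, not taken from the real helper.
def _check_items_in_article_sketch(findall_result):
    if findall_result:
        return findall_result[0]
    return ''

# e.g. _check_items_in_article_sketch(re.findall(r'<h3>(.*?)</h3>', ''))
# returns '' instead of raising IndexError on a non-matching page
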
def __get_article_items3(self):
    ##### get new articles part
    logging.info('start crawling_delay')
    time.sleep(self.crawling_delay)
    logging.info('end crawling_delay')
    logging.info('start get method')
    page = webs.get(self.journal_url + self.latest_articles_url,
                    headers=hds,
                    timeout=self.timeout)
    logging.info('end get method')
    new_articles_html = re.findall(self.pat_article_part, page.text)[0]
    ###### get articles
    counter = 0
    article_list_buf = []
    article_htmls = re.findall(self.pat_article, new_articles_html)
    logging.info('found %s articles', len(article_htmls))
    # take articles oldest first
    for html in reversed(article_htmls):
        a = article_module.Aritcle()
        # get items in article
        a.title_e = self.check_items_in_article(
            re.findall(self.pat_title, html))
        a.url = self.check_items_in_article(re.findall(self.pat_url, html))
        a.kind = self.check_items_in_article(
            re.findall(self.pat_article_kind, html))
        a.date = self.check_items_in_article(
            re.findall(self.pat_publish_date, html))
        # format items
        a.authors = self.format_text(
            self.format_item_of_authors(re.findall(self.pat_authors, html)))
        a.title_e = self.format_text(a.title_e)
        a.kind = self.format_text(a.kind)
        a.date = self.format_date(a.date)
        article_list_buf.append(a)
        counter += 1
        logging.info('added an article: %s', counter)
        # limitation for getting articles
        if self.counter_limit != -1 and counter == self.counter_limit:
            break
    # restore newest-first order for article_list
    self.article_list = tuple(reversed(article_list_buf))
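
# --- illustrative sketch (not part of the original module) ---
# The oldest-first iteration above is easy to misread, so here is the same
# shape in isolation: the page lists articles newest first, reversed()
# walks them oldest first, counter_limit cuts the run short, and the final
# reversed() restores newest-first order for article_list. Plain strings
# stand in for Aritcle objects.
def _take_articles_sketch(newest_first, limit):
    buf = []
    for item in reversed(newest_first):  # oldest first
        buf.append(item)
        if limit != -1 and len(buf) == limit:
            break
    return tuple(reversed(buf))          # newest first again

# _take_articles_sketch(['new', 'mid', 'old'], 2) -> ('mid', 'old')
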
def get_items_in_PNAS(page):
    html = lxml.html.fromstring(page.content)
    article_section = html.xpath(
        "//div[@class='highwire-cite highwire-cite-highwire-article highwire-citation-pnas-list-complete clearfix']"
    )
    try:
        #for a_sec in reversed(article_section):
        for a_sec in article_section:
            a = article_module.Aritcle()
            # get item sections
            title_sec = a_sec.xpath(".//span[@class='highwire-cite-title']")
            # get items
            # itertext() keeps text inside nested tags such as <i>
            a.title_e = ''.join(title_sec[0].itertext())
            # TODO: the remaining items (url, authors, kind, date) are not
            # extracted yet, and the articles are not collected anywhere
    except IndexError:
        # re-raise unchanged; a bare raise keeps the original traceback
        raise
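
# --- illustrative demo (not part of the original module) ---
# Why ''.join(title_sec[0].itertext()) rather than .text or an XPath
# text() step: journal titles often contain inline markup such as <i>,
# and .text stops at the first child tag. Self-contained with lxml:
import lxml.html

_demo = lxml.html.fromstring(
    '<span class="highwire-cite-title">Role of <i>E. coli</i> in X</span>')
# _demo.text                -> 'Role of '          (truncated at <i>)
# ''.join(_demo.itertext()) -> 'Role of E. coli in X'
# _demo.text_content()      -> 'Role of E. coli in X'
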
def get_article_items(self):
    ##### get latest articles
    logging.info('start crawling_delay')
    time.sleep(self.crawling_delay)
    logging.info('end crawling_delay')
    logging.info('start get method')
    #page = webs.get(self.journal_url + self.latest_articles_url, headers=self.hds, timeout=self.timeout)
    page = webs.get(self.journal_url + self.latest_articles_url,
                    timeout=self.timeout)
    logging.info('end get method')
    # log the page source on every run, for debugging
    with open('log/page_source_{}.binf'.format(self.journal_name), 'wb') as f:
        pickle.dump(page, f)
    ###### get article items
    html = lxml.html.fromstring(page.content)
    article_section = html.xpath("//div[@class='toc__item__body']")
    article_list_buf = []
    counter = 0
    logging.info('found %s articles', len(article_section))
    try:
        for a_sec in reversed(article_section):
            a = article_module.Aritcle()
            # get item sections
            title_sec = a_sec.xpath(".//h3[@class='toc__item__title']/a")
            author_sec = a_sec.xpath(
                ".//ul[@class='toc__item__authors loa rlist--inline']/li/text()")
            kind_sec = a_sec.xpath(".//div[@class='toc__item__type']/text()")
            date_sec = a_sec.xpath(".//div[@class='toc__item__date']/text()")
            # get items
            # round-trip through tostring()/fromstring() so entity
            # references in the title are unescaped before text extraction
            buf_str = lxml.html.tostring(title_sec[0]).decode('utf-8')
            buf_str = text_convert.unescape(buf_str)
            #buf_str = entity_references.change_entity_references_to_utf8_in_text(buf_str)
            title_sec2 = lxml.html.fromstring(buf_str)
            a.title_e = title_sec2.text_content()
            # .get('href') instead of .values()[0]: attribute order is not guaranteed
            a.url = title_sec[0].get('href')
            a.authors = ' '.join(author_sec)
            a.kind = kind_sec[0]
            a.date = self.format_date(date_sec[0])
            # add article to article_list
            article_list_buf.append(a)
            # logging
            counter += 1
            logging.info('added an article: %s', counter)
            # limitation for getting articles
            if self.counter_limit != -1 and counter == self.counter_limit:
                break
        self.article_list = tuple(reversed(article_list_buf))
    except IndexError:
        # re-raise unchanged; a bare raise keeps the original traceback
        raise
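
# --- illustrative sketch (not part of the original module) ---
# format_date() is called but not shown, and its input differs per journal
# (here the text of <div class="toc__item__date">). A minimal sketch,
# assuming it normalizes a few common date spellings to ISO 8601; both
# the accepted formats and the output format are assumptions.
from datetime import datetime

def _format_date_sketch(raw):
    for fmt in ('%d %B %Y', '%d %b %Y', '%B %d, %Y', '%Y-%m-%d'):
        try:
            return datetime.strptime(raw.strip(), fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue
    return raw.strip()  # fall back to the raw text rather than fail

# _format_date_sketch('12 March 2021') -> '2021-03-12'
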
def get_article_items(self):
    ##### get latest articles
    logging.info('start crawling_delay')
    time.sleep(self.crawling_delay)
    logging.info('end crawling_delay')
    logging.info('start get method')
    page = webs.get(self.journal_url + self.latest_articles_url,
                    headers=self.hds,
                    timeout=self.timeout)
    logging.info('end get method')
    # log the page source on every run, for debugging
    with open('log/page_source_{}.binf'.format(self.journal_name), 'wb') as f:
        pickle.dump(page, f)
    ###### get article items
    prs = lxml.html.HTMLParser(encoding='utf-8')
    html = lxml.html.fromstring(page.content, parser=prs)
    article_section1 = html.xpath(".//ul[@class='issue-toc item-list']")
    article_section2 = article_section1[0].xpath(".//article")
    article_list_buf = []
    counter = 0
    logging.info('found %s articles', len(article_section2))
    for a_sec in article_section2:
        a = article_module.Aritcle()
        # get item sections
        title_sec = a_sec.xpath("./div/h3/a/div")
        author_sec = a_sec.xpath(
            ".//span[@class='highwire-citation-authors']/span/text()")
        url_sec = a_sec.xpath(".//a[@class='highwire-cite-linked-title']/@href")
        date_sec = a_sec.xpath(".//time/text()")
        # get items
        a.title_e = title_sec[0].text_content()
        a.url = url_sec[0]
        a.authors = ', '.join(author_sec)
        a.date = self.format_date(date_sec[0])
        # get article type: the text of the first h2 found while walking
        # up the ancestor nodes is the article type
        kind_sec = title_sec[0].xpath(".//ancestor::node()")
        for k_sec in reversed(kind_sec):
            sec = k_sec.xpath(".//h2")
            if len(sec) != 0:
                a.kind = sec[0].text_content().strip().replace(
                    "\n", " ").replace("\t", "")
                break
        # add article to article_list
        article_list_buf.append(a)
        # logging
        counter += 1
        logging.info('added an article: %s', counter)
        # limitation for getting articles
        if self.counter_limit != -1 and counter == self.counter_limit:
            break
    self.article_list = tuple(article_list_buf)
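
# --- illustrative demo (not part of the original module) ---
# The ancestor walk above recovers the article type from the nearest
# enclosing heading: reversed() visits ancestors deepest-first, and the
# first one containing an <h2> wins. The markup below is invented to
# show the mechanism; it is not the real page structure.
import lxml.html

_sec = lxml.html.fromstring(
    '<section><h2>Research Articles</h2>'
    '<article><div><h3><a href="/x"><div>Title</div></a></h3></div>'
    '</article></section>')
_title_div = _sec.xpath('.//article/div/h3/a/div')[0]
for _anc in reversed(_title_div.xpath('.//ancestor::node()')):
    _h2 = _anc.xpath('.//h2')
    if _h2:
        _kind = _h2[0].text_content().strip()
        break
# _kind -> 'Research Articles'
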
def get_article_items(self):
    ##### get latest articles
    logging.info('start crawling_delay')
    time.sleep(self.crawling_delay)
    logging.info('end crawling_delay')
    logging.info('start get method')
    page = webs.get(self.journal_url + self.latest_articles_url,
                    headers=self.hds,
                    timeout=self.timeout)
    logging.info('end get method')
    # log the page source on every run, for debugging
    with open('log/page_source_{}.binf'.format(self.journal_name), 'wb') as f:
        pickle.dump(page, f)
    ###### get article items
    prs = lxml.html.HTMLParser(encoding='utf-8')
    html = lxml.html.fromstring(page.content, parser=prs)
    article_section = html.xpath(self.article_sec_path)
    article_list_buf = []
    counter = 0
    logging.info('found %s articles', len(article_section))
    try:
        for i in range(3):  # search articles across three pages
            for a_sec in article_section:
                a = article_module.Aritcle()
                # get item sections
                # do not use text() for the title: text wrapped in tags
                # such as <i> inside the title would be lost
                title_sec = a_sec.xpath(self.title_sec_path)
                url_sec = a_sec.xpath(self.url_sec_path)
                date_sec = a_sec.xpath(self.date_sec_path)
                kind_sec = a_sec.xpath(self.kind_sec_path)
                # get items
                a.title_e = title_sec[0].text_content().strip()
                a.url = url_sec[0]
                a.date = self.format_date(date_sec[0])
                a.kind = kind_sec[0]
                # add article to article_list
                article_list_buf.append(a)
                # logging
                counter += 1
                logging.info('added an article: %s', counter)
                # limitation for getting articles
                if self.counter_limit != -1 and counter == self.counter_limit:
                    break
            if counter == self.counter_limit:
                break
            # go to the next page
            next_url = html.xpath(self.next_btn_path)[0]
            time.sleep(self.crawling_delay)
            page = webs.get(self.journal_url + next_url,
                            headers=self.hds,
                            timeout=self.timeout)
            html = lxml.html.fromstring(page.content, parser=prs)
            article_section = html.xpath(self.article_sec_path)
        self.article_list = tuple(reversed(article_list_buf))
    except IndexError:
        # raised e.g. when the next-page button is missing on the last
        # page; a bare raise keeps the original traceback
        raise
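
# --- illustrative sketch (not part of the original module) ---
# The three-page loop above assumes self.next_btn_path matches on every
# page; when the last page has no next button, html.xpath(...)[0] raises
# IndexError, which propagates to the caller. A guarded variant of the
# same loop shape, with fetching and next-button lookup stubbed out
# (fetch_fn and next_url_of are placeholders, not the real webs.get /
# lxml calls):
def _paginate_sketch(first_page, next_url_of, fetch_fn, max_pages=3):
    pages = []
    page = first_page
    for _ in range(max_pages):
        pages.append(page)
        nxt = next_url_of(page)  # returns None instead of raising
        if nxt is None:
            break                # no next button: stop cleanly
        page = fetch_fn(nxt)
    return pages

# _paginate_sketch('p1', {'p1': 'p2', 'p2': None}.get, lambda u: u)
# -> ['p1', 'p2']
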