def parse_info(self, response):
    """Yield one JamanetworkItem filled from the article page's meta tags.

    The journal name and 2017 impact factor are chosen from the ISSN found
    on the page; for any ISSN other than the two handled below, ``source``
    and ``if_2017`` are left unset on the item.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = xpath('//meta[@property="og:title"]/@content').extract_first()
    item['link'] = response.meta['link']

    # The ISSN is read from the first <small>//<span> text node and passed
    # through tranfrom_issn before being compared.
    raw_issn = xpath('//small//span//text()').get()
    issn = tranfrom_issn(raw_issn)
    item['issn'] = issn
    if issn == "1468-2060":
        item['source'], item['if_2017'] = "Annals of the rheumatic diseases", 12.35
    elif issn == "1468-3288":
        item['source'], item['if_2017'] = "Gut", 17.016

    item['pub_date'] = xpath('//meta[@name="DC.Date"]/@content').get()
    description = xpath('//meta[@name="DC.Description"]/@content').extract()
    item['abstract'] = strip_tag(description)
    item['doi'] = xpath('//meta[@name="citation_doi"]/@content').get()
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem built from the page's ``wkhealth_*`` meta tags.

    ISSN "0021-9355" is labelled "The Journal of bone and joint surgery.
    American volume"; every other value (including a missing ISSN) is
    labelled "Annals of surgery".
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = xpath('//meta[@name="wkhealth_title"]/@content').extract_first()
    item['link'] = response.meta['link']

    issn = xpath('//meta[@name="wkhealth_issn"]/@content').get()
    item['issn'] = issn
    if issn != "0021-9355":
        journal = ("Annals of surgery", 9.203)
    else:
        journal = ("The Journal of bone and joint surgery. American volume", 4.583)
    item['source'], item['if_2017'] = journal

    raw_date = xpath('//meta[@name="wkhealth_date"]/@content').get()
    item['pub_date'] = tranfrom_date(raw_date)

    # The abstract comes from the rendered <p> elements, not a meta tag;
    # fall back to an empty string when strip_tag returns something falsy.
    abstract_paragraphs = xpath('//section[@class="article-abstract"]//div//p').extract()
    item['abstract'] = strip_tag(abstract_paragraphs) or ''

    item['doi'] = xpath('//meta[@name="wkhealth_doi"]/@content').get()
    item['authors'] = xpath('//meta[@name="wkhealth_authors"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for an article page of a Lancet-family journal.

    The journal name (``source``) and 2017 impact factor (``if_2017``) are
    looked up from the ``citation_issn`` meta tag. An ISSN that is not in
    the table yields empty strings for both fields.

    Raises:
        IndexError: if the page has no ``citation_issn`` meta tag.
        KeyError: if ``response.meta`` has no ``'link'`` entry.
    """
    # ISSN -> (source, if_2017). The previous if/elif chain tested
    # ``issn == "A" or "B"``, which is always true because a non-empty
    # string literal is truthy, so every article was labelled "Lancet".
    # An explicit table gives each ISSN its own entry.
    # NOTE(review): the names and impact factors are carried over unchanged
    # from the original branches; confirm them against the journal list.
    journals_by_issn = {
        "0140-6736": ("Lancet", 53.254),
        "1474-547X": ("Lancet", 53.254),
        "2468-2667": ("The Lancet. Public health", 1.441),
        "2213-2600": ("The Lancet. Respiratory medicine", 3.23),
        "2213-2619": ("The Lancet. Respiratory medicine", 3.23),
        "2542-5196": ("The Lancet. Planetary health", ""),
        "2215-0366": ("The Lancet. Child & adolescent health", 15.233),
        "2215-0374": ("The Lancet. Child & adolescent health", 15.233),
        "1474-4465": ("The Lancet. Neurology", 27.138),
        "1474-4422": ("The Lancet. Neurology", 27.138),
        "1470-2045": ("The Lancet. Oncology", 36.418),
        "1474-5488": ("The Lancet. Oncology", 36.418),
        "2213-8587": ("The lancet. Diabetes & endocrinology", 19.313),
        "2213-8595": ("The lancet. Diabetes & endocrinology", 19.313),
    }

    item = JamanetworkItem()
    item['title'] = strip_tag(
        response.xpath('//meta[@property="og:title"]/@content').extract())
    item['link'] = response.meta['link']

    issn = response.xpath('//meta[@name="citation_issn"]/@content').extract()[0]
    item['issn'] = issn
    item['source'], item['if_2017'] = journals_by_issn.get(issn, ("", ""))

    # The date is taken from the first meta tag whose name contains "date".
    item['pub_date'] = tranfrom_date(
        response.xpath('//meta[contains(@name,"date")]/@content').get())

    # Prefer the citation_abstract meta tag, then the page's section
    # paragraphs, and finally a single space as placeholder.
    item['abstract'] = (
        strip_tag(response.xpath('//meta[@name="citation_abstract"]/@content').extract())
        or handler_abstract(strip_tag(
            response.xpath('//div[@class="section-paragraph"]//text()').extract()))
        or " "
    )
    item['doi'] = response.xpath('//meta[@name="citation_doi"]/@content').get()
    item['authors'] = response.xpath(
        '//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "Science immunology" article page.

    Journal name, ISSN and impact factor are fixed constants; the remaining
    fields are read from the page's meta tags, each with a fallback tag.

    Raises:
        KeyError: if ``response.meta`` has no ``'link'`` entry.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = strip_tag(xpath('//meta[@property="og:title"]/@content').extract())
    item['link'] = response.meta['link']
    item['source'] = "Science immunology"

    pub_date = xpath('//meta[@name="DC.Date"]/@content').get()
    if not pub_date:
        published_time = xpath(
            '//meta[@property="article:published_time"]/@content').get()
        # Keep only the first 10 characters of the fallback value. Slicing
        # is skipped when the tag is absent: the original sliced
        # unconditionally and raised TypeError on None.
        pub_date = published_time[:10] if published_time else published_time
    item['pub_date'] = pub_date

    item['abstract'] = (
        strip_tag(xpath('//meta[@name="og:description"]/@content').extract())
        or strip_tag(xpath('//meta[@itemprop="description"]/@content').extract())
    )
    # A single space is the placeholder when neither DOI tag is present.
    item['doi'] = (
        xpath('//meta[@name="citation_doi"]/@content').get()
        or xpath('//meta[@name="news_doi"]/@content').get()
        or ' '
    )
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract() or " "
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item['if_2017'] = 0.001
    item['issn'] = '2470-9468'
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "JAMA internal medicine" article page.

    The publication date prefers ``citation_online_date`` over
    ``citation_publication_date``. The abstract prefers the
    ``citation_abstract`` meta tag over the text scraped from the page body.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = strip_tag(xpath('//meta[@property="og:title"]/@content').extract())
    item['link'] = response.meta['link']
    item['source'] = "JAMA internal medicine"

    online_date = tranfrom_date(
        xpath('//meta[@name="citation_online_date"]/@content').get())
    if online_date:
        item['pub_date'] = online_date
    else:
        item['pub_date'] = tranfrom_date(
            xpath('//meta[@name="citation_publication_date"]/@content').get())

    meta_abstract = xpath('//meta[@name="citation_abstract"]/@content').extract_first()
    # The body-text fallback is always computed, even when the meta tag
    # already provides an abstract.
    body_text = xpath(
        '//div[@class="abstract-content"]//p[@class="para"]//text() | '
        '//div[@class="article-full-text"]//p[@class="para"]//text()'
    ).extract()
    body_abstract = strip_tag(handler_abstract(body_text))
    if meta_abstract:
        item['abstract'] = strip_tag(meta_abstract)
    else:
        item['abstract'] = body_abstract

    item['doi'] = xpath('//meta[@name="citation_doi"]/@content').get()
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item['if_2017'] = 19.989
    item['issn'] = "2168-6114"
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "Diabetes" article page.

    Journal name, ISSN and impact factor are constants; the other fields
    are read from the page's meta tags.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = strip_tag(xpath('//meta[@property="og:title"]/@content').extract())
    item['link'] = response.meta['link']
    item['source'] = "Diabetes"
    item['pub_date'] = xpath('//meta[@name="DC.Date"]/@content').get()

    # Prefer og:description; use citation_abstract only when the first
    # lookup produces something falsy.
    abstract = strip_tag(xpath('//meta[@name="og:description"]//@content').extract())
    if not abstract:
        citation_abstract = xpath('//meta[@name="citation_abstract"]//@content').extract()
        abstract = strip_tag(handler_abstract(citation_abstract))
    item['abstract'] = abstract

    item['doi'] = xpath('//meta[@name="citation_doi"]/@content').get()
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item['if_2017'] = 7.273
    # The electronic ISSN of this journal is 1939-327X.
    item['issn'] = '0012-1797'
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "BMJ : British medical journal" page.

    The ISSN is read from the page; journal name and impact factor are
    constants.

    Raises:
        IndexError: if the page has no ``citation_issn`` meta tag.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = xpath('//meta[@name="citation_title"]/@content').extract_first()
    item['link'] = response.meta['link']

    issn_values = xpath('//meta[@name="citation_issn"]/@content').extract()
    item['issn'] = issn_values[0]

    item['source'] = "BMJ : British medical journal"
    item['if_2017'] = 23.259
    item['pub_date'] = xpath('//meta[@name="article:published_time"]/@content').get()

    # Here strip_tag receives a single string (or None) from .get(), not a
    # list from .extract().
    description = xpath('//meta[@name="DC.Description"]//@content').get()
    item['abstract'] = strip_tag(description)

    item['doi'] = xpath('//meta[@name="citation_doi"]/@content').get()
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "Cancer discovery" article page.

    ISSN, journal name and impact factor are constants; title, date,
    abstract and DOI come from the page's Dublin Core (``DC.*``) meta tags.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = xpath('//meta[@name="DC.Title"]/@content').extract_first()
    item['link'] = response.meta['link']
    item['issn'] = "2159-8290"
    item['source'] = "Cancer discovery"
    item['if_2017'] = 24.373
    item['pub_date'] = xpath('//meta[@name="DC.Date"]/@content').get()

    description = xpath('//meta[@name="DC.Description"]/@content').extract()
    item['abstract'] = strip_tag(description)

    # The DOI is taken from DC.Identifier, not citation_doi.
    item['doi'] = xpath('//meta[@name="DC.Identifier"]/@content').get()
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "Science" article page.

    ISSN, journal name and impact factor are constants; the remaining
    fields are read from the page's meta tags, each with fallback tags.

    Raises:
        KeyError: if ``response.meta`` has no ``'link'`` entry.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = strip_tag(xpath('//meta[@property="og:title"]/@content').extract())
    item['link'] = response.meta['link']
    # The print ISSN of this journal is 0036-8075.
    item["issn"] = "1095-9203"
    item['if_2017'] = 41.058
    item['source'] = "Science"

    pub_date = xpath('//meta[@name="DC.Date"]/@content').get()
    if not pub_date:
        published_time = xpath(
            '//meta[@property="article:published_time"]/@content').get()
        # Keep only the first 10 characters of the fallback value. Slicing
        # is skipped when the tag is absent: the original sliced
        # unconditionally and raised TypeError on None.
        pub_date = published_time[:10] if published_time else published_time
    item['pub_date'] = pub_date

    item['abstract'] = (
        strip_tag(xpath('//meta[@name="og:description"]/@content').extract())
        or strip_tag(xpath('//meta[@itemprop="description"]/@content').extract())
    )
    # Three candidate tags are tried in order; the result is whatever the
    # last lookup returns when none of them yields a value.
    item['doi'] = (
        xpath('//meta[@name="citation_doi"]/@content').get()
        or xpath('//meta[@name="news_doi"]/@content').get()
        or xpath('//meta[@name="DC.Relation"]/@content').get()
    )
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract() or " "
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "Clinical cancer research" article page.

    ISSN, journal name and impact factor are constants; title, date,
    abstract and DOI come from the page's Dublin Core (``DC.*``) meta tags.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = xpath('//meta[@name="DC.Title"]/@content').extract_first()
    item['link'] = response.meta['link']
    # The print ISSN of this journal is 1078-0432.
    item['issn'] = '1557-3265'
    item['source'] = "Clinical cancer research : an official journal of the American Association for Cancer Research"
    item['if_2017'] = 10.199
    item['pub_date'] = xpath('//meta[@name="DC.Date"]/@content').get()

    description = xpath('//meta[@name="DC.Description"]/@content').extract()
    item['abstract'] = strip_tag(description)

    # The DOI is taken from DC.Identifier, not citation_doi.
    item['doi'] = xpath('//meta[@name="DC.Identifier"]/@content').get()
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a "The New England journal of medicine" page.

    Journal name, ISSN and impact factor are constants. The abstract and
    author list are scraped from the rendered page body, not meta tags.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = xpath('//meta[@name="twitter:title"]/@content').get()
    item['link'] = response.meta['link']
    item['source'] = "The New England journal of medicine"

    pub_date = xpath('//meta[@name="evt-dt"]/@content').get()
    if not pub_date:
        pub_date = xpath('//meta[@name="dc.Date"]/@content').get()
    item['pub_date'] = pub_date

    # Try the article body sections first, then the letter paragraphs, and
    # finally fall back to an empty string.
    abstract = strip_tag(
        xpath('//section[@class="o-article-body__section"]//text()').extract())
    if not abstract:
        abstract = strip_tag(
            xpath('//div[@class="m-letter"]//p[@class="f-body"]//text()').extract())
    item['abstract'] = abstract or ''

    item['doi'] = xpath('//meta[@name="evt-doiPage"]/@content').get()
    item['authors'] = xpath(
        '//header//ul[contains(@class,"m-article-header__authors")]//li//text()'
    ).extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item['if_2017'] = 79.258
    # The ISSN is hard-coded; scraping it from the page through
    # tranfrom_issn is currently disabled. The print ISSN is 0028-4793.
    item['issn'] = "1533-4406"
    item["is_pubmed"] = 0
    yield item
def parse_info(self, response):
    """Yield one JamanetworkItem for a Brain or European heart journal page.

    The journal is chosen by whether the substring 'brain' occurs in the
    link stored in ``response.meta['link']``; that choice sets both the
    ``source`` name and the ``if_2017`` value.
    """
    xpath = response.xpath
    item = JamanetworkItem()
    item['title'] = xpath('//meta[@property="og:title"]/@content').extract_first()

    link = response.meta['link']
    item['link'] = link
    is_brain = 'brain' in link
    if is_brain:
        item['source'] = "Brain : a journal of neurology"
    else:
        item['source'] = "European heart journal"

    item['pub_date'] = xpath('//meta[@property="og:updated_time"]/@content').get()

    # The characters "[", "'" and "]" are trimmed from both ends of the
    # text returned by strip_tag. Try the abstract section first, then the
    # widget paragraphs, and use a single space when both are empty.
    abstract = strip_tag(
        xpath('//section[@class="abstract"]//text()').extract()).strip("[']")
    if not abstract:
        abstract = strip_tag(
            xpath('//div[@class="widget-items"]//p//text()').extract()).strip("[']")
    item['abstract'] = abstract or " "

    item['doi'] = xpath('//meta[@name="citation_doi"]/@content').get()
    item['authors'] = xpath('//meta[@name="citation_author"]/@content').extract()
    # NOTE: AffiliationInfo (meta "citation_author_institution") extraction
    # is currently disabled.
    item['if_2017'] = 10.84 if is_brain else 23.425

    footer_text = xpath('//div[@class="journal-footer-colophon"]//li//text()').get()
    item['issn'] = tranfrom_issn(footer_text)
    item["is_pubmed"] = 0
    yield item