def parse_article(self, html, pmid=None, metadata_dir=None):
    ''' Parse an HTML article and store the result on self.article.

    Args:
        html: The article's HTML, either as UTF-8 encoded bytes or as an
            already-decoded unicode string.
        pmid: Optional PubMed ID. When given, it is used as-is, which
            prevents having to scrape it from the article via
            self.extract_pmid().
        metadata_dir: Passed through as the `store` argument of
            scrape.get_pubmed_metadata().

    Returns:
        False if an article with this PMID already exists in the database
        and config.OVERWRITE_EXISTING_ROWS is falsy; otherwise the parsed
        soup. The new database.Article itself is not returned--it is
        assigned to self.article.
    '''
    # Skip rest of processing if this record already exists
    if (pmid is not None and self.database.article_exists(pmid)
            and not config.OVERWRITE_EXISTING_ROWS):
        return False

    # Make sure we're working with unicode. Only byte strings are decoded:
    # calling .decode() on text that is already unicode fails (there is no
    # str.decode on Python 3, and Python 2 does an implicit ASCII encode
    # that breaks on non-ASCII characters).
    if isinstance(html, bytes):
        html = html.decode('utf-8')
    html = self.decode_html_entities(html)
    soup = BeautifulSoup(html)

    doi = self.extract_doi(soup)
    # NOTE(review): presumably extract_pmid() can come back empty-handed;
    # confirm how the calls below behave when pmid is still None.
    pmid = self.extract_pmid(soup) if pmid is None else pmid
    metadata = scrape.get_pubmed_metadata(pmid, store=metadata_dir, save=True)

    # TODO: add Source-specific delimiting of salient text boundaries--e.g., exclude References
    text = soup.get_text()

    # Second existence check: this one also covers a PMID that was scraped
    # from the page rather than passed in, and handles the overwrite case
    # by deleting the old row before the replacement is built.
    if self.database.article_exists(pmid):
        if config.OVERWRITE_EXISTING_ROWS:
            self.database.delete_article(pmid)
        else:
            return False

    self.article = database.Article(text, pmid=pmid, doi=doi, metadata=metadata)
    return soup
def parse_article(self, html, pmid=None, metadata_dir=None):
    ''' Parse an HTML article and store the result on self.article.

    Args:
        html: The article's HTML, either as UTF-8 encoded bytes or as an
            already-decoded unicode string.
        pmid: Optional PubMed ID. When given, it is used as-is, which
            prevents having to scrape it from the article via
            self.extract_pmid().
        metadata_dir: Passed through as the `store` argument of
            scrape.get_pubmed_metadata().

    Returns:
        False if an article with this PMID already exists in the database
        and config.OVERWRITE_EXISTING_ROWS is falsy; otherwise the parsed
        soup. The new database.Article itself is not returned--it is
        assigned to self.article.
    '''
    # Skip rest of processing if this record already exists
    if (pmid is not None and self.database.article_exists(pmid)
            and not config.OVERWRITE_EXISTING_ROWS):
        return False

    # Make sure we're working with unicode. Only byte strings are decoded:
    # calling .decode() on text that is already unicode fails (there is no
    # str.decode on Python 3, and Python 2 does an implicit ASCII encode
    # that breaks on non-ASCII characters).
    if isinstance(html, bytes):
        html = html.decode('utf-8')
    html = self.decode_html_entities(html)
    soup = BeautifulSoup(html)

    doi = self.extract_doi(soup)
    # NOTE(review): presumably extract_pmid() can come back empty-handed;
    # confirm how the calls below behave when pmid is still None.
    pmid = self.extract_pmid(soup) if pmid is None else pmid
    metadata = scrape.get_pubmed_metadata(pmid, store=metadata_dir, save=True)

    # TODO: add Source-specific delimiting of salient text boundaries--e.g., exclude References
    text = soup.get_text()

    # Second existence check: this one also covers a PMID that was scraped
    # from the page rather than passed in, and handles the overwrite case
    # by deleting the old row before the replacement is built.
    if self.database.article_exists(pmid):
        if config.OVERWRITE_EXISTING_ROWS:
            self.database.delete_article(pmid)
        else:
            return False

    self.article = database.Article(text, pmid=pmid, doi=doi, metadata=metadata)
    return soup
def update_metadata_from_pubmed(self, pmid):
    ''' Refresh this object's fields from the PubMed metadata for pmid.

    Looks the record up with scrape.get_pubmed_metadata(pmid), sets
    self.id to int(pmid), copies the record's 'title', 'journal', 'year',
    'abstract' and 'citation' entries onto the same-named attributes, and
    keeps the whole record on self.pubmed_metadata.
    '''
    record = scrape.get_pubmed_metadata(pmid)
    self.id = int(pmid)
    # Attributes are assigned in a fixed order so that a failure part-way
    # through (e.g. a missing key) leaves the same fields already set.
    for field in ('title', 'journal'):
        setattr(self, field, record[field])
    self.pubmed_metadata = record
    for field in ('year', 'abstract', 'citation'):
        setattr(self, field, record[field])