def test_article_by_pmid(self):
    pmid = '4'
    fetch = PubMedFetcher()
    article = fetch.article_by_pmid(pmid)
    assert str(article.pmid) == pmid

    pmid = '25763451'
    fetch = PubMedFetcher()
    article = fetch.article_by_pmid(pmid)
    assert str(article.pmid) == pmid
def processPMID(self, description, document, text):
    """Strip [PMID]/[PMCID] tags out of a description, write the cleaned
    paragraphs into the document, then append an italicized citation line
    for each referenced article."""
    pmid_re = re.compile(r'PMID *(\d+)')
    list_pmid = pmid_re.findall(description)
    description = re.sub(r'\[PMID *\d+\]', '', description)

    pmcid_re = re.compile(r'PMCID *(\d+)')
    list_pmcid = pmcid_re.findall(description)
    description = re.sub(r'\[PMCID *\d+\]', '', description)

    # Paragraph breaks are encoded as a literal backslash-n sequence.
    para = description.split(r'\n')
    for para_str in para:
        p = document.add_paragraph(' ')
        p.add_run(para_str)

    # "Based on your genotyped loci and internationally recognized reference
    # systems such as PubMed, we conclude that <text>."
    std_str = u"我们通过检测您的基因位点,使用PUBMED等国际公认参考系统,我们认为" + text + u"。"
    p = document.add_paragraph(' ')
    p.add_run(std_str)

    fetch = PubMedFetcher()
    for pmid in list_pmid:
        # e.g. http://www.ncbi.nlm.nih.gov/pubmed/26471457
        pm = fetch.article_by_pmid(pmid)
        title = re.sub(r'\.', '', pm.title)
        citation = '. '.join([title, pm.journal])
        p = document.add_paragraph()
        p.add_run(citation).italic = True
    for pmcid in list_pmcid:
        pm = fetch.article_by_pmcid(pmcid)
        title = re.sub(r'\.', '', pm.title)
        citation = '. '.join([title, pm.journal])
        p = document.add_paragraph()
        p.add_run(citation).italic = True
class TestPubMedArticle(unittest.TestCase):

    def setUp(self):
        self.fetch = PubMedFetcher()

    def tearDown(self):
        pass

    def test_random_efetch(self):
        pmid = str(random.randint(22222222, 23333333))
        try:
            article = self.fetch.article_by_pmid(pmid)
            if article is not None:
                assert article.pmid == pmid
                assert article.title is not None
        except InvalidPMID:
            # PMID returned an InvalidPMID response (which is totally OK).
            # Pick another random PMID and run the test again.
            self.test_random_efetch()

    def test_init1(self):
        """Test on the xml returned by eutils"""
        article = PubMedArticle(xml_str1)
        assert str(article.pmid) == '4'

    def test_init2(self):
        """Test on the xml downloaded from medline"""
        article = PubMedArticle(xml_str2)
        assert str(article.pmid) == '23697015'

    def test_to_dict(self):
        article = PubMedArticle(xml_str1)
        self.assertTrue(isinstance(article.to_dict(), dict))
def consultametapub():
    fetch = PubMedFetcher()
    if not request.json:
        abort(400)
    pmid = request.json['id']
    article = fetch.article_by_pmid(pmid)
    return jsonify(output=article.title)
def downloadAbstract(self, keywords, file_name, max_return=1000000):
    fetcher = PubMedFetcher(cachedir=self.cache_dir, api_key=self.api_key)
    pmids = fetcher.pmids_for_query(keywords, retmax=max_return)

    corpus = ET.Element('corpus')
    keywords_item = ET.SubElement(corpus, 'keywords')
    keywords_item.text = keywords

    for pmid in pmids:
        print(pmid)
        doc = fetcher.article_by_pmid(pmid)
        title_str = self.removeHtmlTags(doc.title)
        abstract_str = self.removeHtmlTags(doc.abstract)
        if abstract_str == '':
            continue
        doc_item = ET.SubElement(corpus, 'article')
        doc_item.set('id', pmid)
        title_item = ET.SubElement(doc_item, 'title')
        title_item.text = title_str
        abstract_item = ET.SubElement(doc_item, 'abstract')
        abstract_item.text = abstract_str

    corpus_in_string = ET.tostring(corpus)
    with open(file_name, 'wb') as xml_file:
        xml_file.write(corpus_in_string)
def search(entry):
    fetch = PubMedFetcher()
    # Try the identifiers in order of reliability: PMID, then PMCID, then DOI,
    # then a citation lookup as a last resort.
    try:
        article = fetch.article_by_pmid(entry['pmid'])
    except Exception:
        try:
            article = fetch.article_by_pmcid(entry['pmcid'])
        except Exception:
            try:
                article = fetch.article_by_doi(entry['doi'])
            except Exception:
                try:
                    pmids = fetch.pmids_for_citation(authors=entry['author'],
                                                     journal=entry['journal'],
                                                     year=entry['year'],
                                                     volume=entry['volume'])
                    # pmids2 = fetch.pmids_for_query(entry['title'])
                    article = fetch.article_by_pmid(pmids[0])
                except Exception:
                    return None
    return article
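# A flatter way to express the same fallback chain -- a sketch, not the
# original author's code. The hypothetical search_flat() iterates over
# (key, lookup-function) pairs and returns the first lookup that succeeds;
# the entry keys and fetcher methods are the same ones used above.
def search_flat(entry):
    fetch = PubMedFetcher()
    lookups = [
        ('pmid', fetch.article_by_pmid),
        ('pmcid', fetch.article_by_pmcid),
        ('doi', fetch.article_by_doi),
    ]
    for key, lookup in lookups:
        try:
            return lookup(entry[key])
        except Exception:
            continue
    # Last resort: resolve a PMID from the citation fields.
    try:
        pmids = fetch.pmids_for_citation(authors=entry['author'],
                                         journal=entry['journal'],
                                         year=entry['year'],
                                         volume=entry['volume'])
        return fetch.article_by_pmid(pmids[0])
    except Exception:
        return None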
def crawl_chem_abstract(self, keyword, retmax=300):
    fetch = PubMedFetcher()
    self.progress_bar_value.emit(self.count)
    pmids = fetch.pmids_for_query(keyword, retmax=retmax)
    self.textBrowser_value.emit("Scanning Iteration : " + str(retmax))
    self.textBrowser_value.emit("Expected Running Time : " + str(retmax * 2) + " seconds.")
    self.textBrowser_value.emit("PMID Scan Done!")
    json_dicts = []
    self.textBrowser_value.emit("Crawling Paper Info..")
    for i, pmid in enumerate(pmids):
        try:
            if int(i / len(pmids) * 100) > self.count:
                self.count = int(i / len(pmids) * 100)
                self.progress_bar_value.emit(self.count)
            try:
                article = fetch.article_by_pmid(pmid)
            except Exception:
                self.textBrowser_value.emit("Error reading " + str(pmid))
                continue
            chemical = article.chemicals
            if not chemical:
                continue
            # Guard against a missing abstract before rewriting it.
            abstract = article.abstract
            if not abstract:
                continue
            abstract = abstract.replace(",", "*")
            if "\t" in abstract or "\n" in abstract:
                abstract = abstract.replace("\t", " ").replace("\n", " ")
            title = article.title
            if not title:
                continue
            if "\t" in title or "\n" in title:
                title = title.replace("\t", " ").replace("\n", " ")
            chemical["title"] = title
            chemical["abstract"] = abstract
            json_dicts.append(chemical)
        except Exception:
            continue
    self.textBrowser_value.emit("Progress Done!")
    return json_dicts
def keyword_query(keywords=sys.argv[1], savepath=sys.argv[2],
                  start_date=None, end_date=None, num_of_articles=1000):
    """
    keyword_query takes in a keyword string or list of keywords, and outputs
    a dataframe with article metadata that matches the keyword query.

    **NOTE**: Long queries (~1000+ articles) will take > 5 minutes. Thus, it
    is advisable to add additional keywords and filters to constrain the
    search space.

    :param keywords: A string or a list of keywords to query.
    :param savepath: A string denoting the full path to save the file in.
    :param start_date: A string denoting the start date.
    :param end_date: A string denoting the end date.
    :param num_of_articles: An integer denoting the maximum number of articles.
    :return df: A pandas dataframe of the query.
    """
    fetch = PubMedFetcher()

    # Get PMIDs using the query
    pmids = fetch.pmids_for_query(query=keywords,
                                  since=start_date,
                                  until=end_date,
                                  retmax=num_of_articles)
    print("Number of PMIDs with search query: " + str(len(pmids)))

    # Get abstracts based on keyword search.
    # The query saves to a dictionary, using the PMID as the key.
    abstracts = {}
    for id in pmids:
        article = fetch.article_by_pmid(id)
        abstracts[id] = [article.title, article.abstract, article.journal,
                         article.year, article.authors]

    # Save the dictionary as a dataframe
    df = pd.DataFrame.from_dict(
        abstracts, orient='index',
        columns=['Title', 'Abstract', 'Journal', 'Year', 'Authors'])

    # Save the dataframe
    df.index.name = 'PMID'
    df.to_csv(savepath)
    return df
def pmid_article(ref, user=None):
    article_array = []
    # With a user email we can use PubMedLookup; otherwise fall back to metapub.
    if user:
        if user.email is not None:
            for pmid in Reference.pmid(ref):
                url = "http://www.ncbi.nlm.nih.gov/pubmed/" + str(pmid)
                lookup = PubMedLookup(url, user.email)
                publication = Publication(lookup)
                article_array.append(publication)
            return article_array
    fetch = PubMedFetcher()
    for pmid in Reference.pmid(ref):
        article = fetch.article_by_pmid(pmid)
        article_array.append(article)
    return article_array
def __init__(self, pmid):
    self.pmid = pmid
    fetch = PubMedFetcher(email='*****@*****.**')
    article = fetch.article_by_pmid(pmid)
    self.title = article.title
    self.journal = article.journal
    self.authors = article.authors
    # pm_cited - which papers cited the current paper
    try:
        self.pm_cited = fetch.related_pmids(pmid)['citedin']
    except Exception:
        self.pm_cited = None
    self.h_index = self.get_H_index() + 1
    # self.h_index = 1
    # pm_cite - which papers are cited by the current paper
    self.pm_cite = []
    print("create paper with pmid " + pmid)
def fetch_pubmed(pub_id, id_type="pmid"):
    """Fetches and formats pub data from pubmed"""
    pm = PubMedFetcher()
    if id_type == 'doi':
        try:
            result = pm.article_by_doi(pub_id)
        except (AttributeError, MetaPubError, EutilsNCBIError):
            return None
    elif id_type == "pmid":
        try:
            result = pm.article_by_pmid(pub_id)
        except (AttributeError, InvalidPMID, EutilsNCBIError):
            return None
    elif id_type == "pmc":
        try:
            result = pm.article_by_pmcid('PMC' + str(pub_id))
        except (AttributeError, MetaPubError, EutilsNCBIError):
            return None
    result = result.to_dict()

    # Set link using DOI
    if result.get('doi'):
        result['url'] = "http://dx.doi.org/" + result.get('doi')
    else:
        result['url'] = result.get('url')

    # Provide PDF if possible
    if result.get('pmc'):
        result['pdf_url'] = f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{result['pmc']}/pdf"

    out = {"pub_title": result.get('title'),
           "pub_authors": result.get('authors'),
           "pub_abstract": result.get('abstract'),
           "pub_doi": result.get('doi'),
           "pub_pmid": result.get('pmid'),
           "pub_pmc": pub_id if id_type == 'pmc' else None,
           "pub_url": result.get('url'),
           "pub_pdf_url": result.get('pdf_url') or 'searching',
           "pub_journal": result.get('journal'),
           "pub_date": result['history'].get('pubmed')}
    return out
def get_info_by_PMID(PMID: str) -> Dict:
    '''This function takes a PMID str, requests information about the
    corresponding article via metapub and checks if all necessary
    information has been retrieved.'''
    article_dict = {}
    fetch = PubMedFetcher()
    try:
        article = fetch.article_by_pmid(PMID)
        # Save information in Dict (getattr is safer than eval here)
        for info in dir(article):
            if not info.startswith('_'):
                article_dict[info] = getattr(article, info)
    except MetaPubError:
        pass
    #if contains_minimal_information(article_dict):
    # Add data retrieval info to the dict and return it
    article_dict = add_retrieval_information(article_dict, 'MetaPub', 'PMID', PMID)
    return article_dict
def filter_results(results, words_in_title, limit):
    fetch = PubMedFetcher(email='*****@*****.**')
    filtered_results = []
    counter = 0
    for paper in results:
        pmid = paper.split('/')[-1].split('\n')[0]
        article = fetch.article_by_pmid(pmid)
        # Keep the paper only if every word group contributes at least one
        # match to the title (an AND of ORs).
        include = True
        for words in words_in_title:
            include = False
            for word in words:
                if word.strip().lower() in article.title.lower():
                    include = True
                    break
            if not include:
                break
        if include:
            filtered_results.append(paper)
            counter += 1
            if counter == limit:
                return filtered_results
    return filtered_results
def get_reference_from_pmid_by_metapub(pmid: str) -> dict:
    fetch = PubMedFetcher(cachedir=cache)
    reference = None
    try:
        # Stay under NCBI's ~3 requests/second limit for unkeyed clients.
        time.sleep(0.34)
        article = fetch.article_by_pmid(pmid)
        reference = {'journal': article.journal,
                     'authors': article.authors,
                     'issue': article.issue,
                     'first_page': article.first_page,
                     'last_page': article.last_page,
                     'volume': article.volume,
                     'year': str(article.year),
                     'abstract': replace_characters(article.abstract),
                     'title': replace_characters(article.title),
                     'doi': article.doi,
                     'pmid': article.pmid}
    except Exception:
        print('*** Bad PMID:', pmid)
    return reference
def measure_similarity_abstracts(nlp, pmid):

    def scrape_related_abstracts(pm_id):
        related_ids = scrape_related_ids(pm_id)
        if len(related_ids) > 8:
            related_ids = related_ids[:8]
        abstracts = []
        for related in related_ids:
            link = 'https://pubmed.ncbi.nlm.nih.gov/' + related
            data = requests.get(link).text
            soup = BeautifulSoup(data, 'html.parser')
            abstract_header = soup.find('div', {'id': 'en-abstract'})
            try:
                abstract = str(abstract_header.p.string).strip()
                abstracts.append(abstract)
            except AttributeError:
                # Page had no abstract block; skip it.
                pass
        return abstracts

    fetch = PubMedFetcher()
    exemplary = fetch.article_by_pmid(pmid).abstract
    doc1 = nlp(exemplary)
    scores = []
    for abstract in scrape_related_abstracts(pmid):
        doc2 = nlp(abstract)
        scores.append(doc1.similarity(doc2))
    return mean(scores)
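# The HTML scraping above could likely be replaced with metapub's own
# related-article lookup, which another snippet in this collection already
# uses (fetch.related_pmids(pmid)['citedin']). A minimal sketch, assuming
# the 'pubmed' key holds the general related-articles link group (the
# 'citedin' key is confirmed by the source; 'pubmed' is an assumption):
from metapub import PubMedFetcher

def related_abstracts(pmid, limit=8):
    fetch = PubMedFetcher()
    related = fetch.related_pmids(pmid).get('pubmed', [])[:limit]
    abstracts = []
    for rid in related:
        abstract = fetch.article_by_pmid(rid).abstract
        if abstract:  # skip records with no abstract
            abstracts.append(abstract)
    return abstracts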
def crawl_chem_json(keyword, retmax=1000):
    fetch = PubMedFetcher()
    pmids = fetch.pmids_for_query(keyword, retmax=retmax)
    print("PMID scan Done!")
    json_dicts = []
    print("Crawling Paper Info..")
    for pmid in tqdm(pmids):
        try:
            article = fetch.article_by_pmid(pmid)
        except Exception:
            print("Error reading " + str(pmid))
            continue
        chemical = article.chemicals
        if not chemical:
            continue
        json_dicts.append(chemical)
    print("Process Done!")
    return json_dicts
def crawl_chem_json(self, keyword, retmax=300):
    fetch = PubMedFetcher()
    pmids = fetch.pmids_for_query(keyword, retmax=retmax)
    self.textBrowser_value.emit("Scanning Iteration : " + str(retmax))
    self.textBrowser_value.emit("Expected Running Time : " + str(retmax * 2) + " seconds.")
    self.textBrowser_value.emit("PMID Scan Done!")
    self.progress_bar_value.emit(self.count)
    json_dicts = []
    self.textBrowser_value.emit("Crawling Paper Info..")
    for i, pmid in enumerate(pmids):
        try:
            if int(i / len(pmids) * 100) > self.count:
                self.count = int(i / len(pmids) * 100)
                self.progress_bar_value.emit(self.count)
            try:
                article = fetch.article_by_pmid(pmid)
            except Exception:
                self.textBrowser_value.emit("Error reading " + str(pmid))
                continue
            chemical = article.chemicals
            if not chemical:
                continue
            json_dicts.append(chemical)
        except Exception:
            continue
    self.textBrowser_value.emit("Progress Done!")
    return json_dicts
from __future__ import absolute_import, print_function, unicode_literals

import logging

from metapub import PubMedFetcher

logging.getLogger('eutils').setLevel(logging.DEBUG)
logging.getLogger('metapub').setLevel(logging.DEBUG)

fetch = PubMedFetcher()

pmbook = fetch.article_by_pmid('20301577')
print(pmbook.title)
print(pmbook.abstract)
print(pmbook.year)
def search(source="PubMed", level="basic", db="PubMed", query=None, unlabeled_string=None, affiliation=None, article_identifier=None, all_fields=None, author=None, author_identifier=None, book=None, corporate_author=None, create_date=None, completion_date=None, conflict_of_interest=None, ec_rn_number=None, editor=None, entrez_date=None, filter_citations=None, first_author_name=None, full_author_name=None, full_investigator_name=None, grant_number=None, investigator=None, isbn=None, issue=None, journal=None, language=None, last_author=None, location_id=None, mesh_date=None, mesh_major_topic=None, mesh_subheadings=None, mesh_terms=None, modification_date=None, nlm_unique_id=None, other_term=None, owner=None, pagination=None, personal_name_as_subject=None, pharmacological_action=None, place_of_publication=None, pmid=None, publisher=None, publication_date=None, publication_type=None, retmax=None, retmode=None, secondary_source_id=None, sort=None, subset=None, supplementary_concept=None, text_words=None, title=None, title_abstract=None, transliterated_title=None, uid=None, volume=None, raw=False, exact=False, user=None): if source.lower() in ["pubmed"] and level.lower() == "complex": return eutils_search( db=db, retmode=retmode, retmax=retmax, sort=sort, unlabeled_string=unlabeled_string, affiliation=affiliation, article_identifier=article_identifier, all_fields=all_fields, author=author, author_identifier=author_identifier, book=book, corporate_author=corporate_author, create_date=create_date, completion_date=completion_date, conflict_of_interest=conflict_of_interest, ec_rn_number=ec_rn_number, editor=editor, entrez_date=entrez_date, filter_citations=filter_citations, first_author_name=first_author_name, full_author_name=full_author_name, full_investigator_name=full_investigator_name, grant_number=grant_number, investigator=investigator, isbn=isbn, issue=issue, journal=journal, language=language, last_author=last_author, location_id=location_id, mesh_date=mesh_date, mesh_major_topic=mesh_major_topic, mesh_subheadings=mesh_subheadings, mesh_terms=mesh_terms, modification_date=modification_date, nlm_unique_id=nlm_unique_id, other_term=other_term, owner=owner, pagination=pagination, personal_name_as_subject=personal_name_as_subject, pharmacological_action=pharmacological_action, place_of_publication=place_of_publication, pmid=pmid, publisher=publisher, publication_date=publication_date, publication_type=publication_type, secondary_source_id=secondary_source_id, subset=subset, supplementary_concept=supplementary_concept, text_words=text_words, title=title, title_abstract=title_abstract, transliterated_title=transliterated_title, uid=uid, volume=volume, raw=raw, exact=exact) elif source.lower() in ["pubmed"] and level.lower() == "basic": # Use 'unlabeled_string' or 'query' here. # This function already takes completed # PubMed queries as strings (with # various connectors and constructors). if unlabeled_string: fetch = PubMedFetcher() pubmed_id_list = fetch.pmids_for_query(unlabeled_string) ref_list = [] for pubmed_id in pubmed_id_list: article = fetch.article_by_pmid( pubmed_id) # Need a faster way to get titles... temp_ref = Reference(identifier=str(pubmed_id), identifier_type="PubMed ID", source="PubMed", name=article.title) ref_list.append(temp_ref) return ref_list elif query: # This is where the basic reference # search redirects for now, but it # is relatively slow. 
fetch = PubMedFetcher() pubmed_id_list = fetch.pmids_for_query(query) ref_list = [] for pubmed_id in pubmed_id_list: try: article = fetch.article_by_pmid( pubmed_id) # Need a faster way to get titles... temp_ref = Reference(identifier=str(pubmed_id), identifier_type="PubMed ID", source="PubMed", name=article.title) ref_list.append(temp_ref) except metapub.exceptions.InvalidPMID: print("An invalid PMID error occurred.") temp_ref = Reference(identifier=str(pubmed_id), identifier_type="PubMed ID", source="PubMed") ref_list.append(temp_ref) else: temp_ref = Reference(identifier=str(pubmed_id), identifier_type="PubMed ID", source="PubMed") ref_list.append(temp_ref) return ref_list elif source.lower() in ["google", "google scholar"]: return google_scholar_search(unlabeled_string) elif source.lower() in ["openlibrary"]: return openlibrary_search(unlabeled_string)
for key in file_annotations:
    tool = file_annotations[key]
    if 'identifiers' in tool and ('keywords' not in tool or len(tool['keywords']) == 0):
        identifiers = tool['identifiers']
        for identifier in identifiers:
            doi = None
            try:
                if 'doi' in identifier:
                    doi = identifier.replace('doi:', '')
                    pubmedid = doi2pmid(doi)
                    print('doi: ' + doi + ' --> ' + 'pmid: ' + str(pubmedid))
                    if pubmedid is not None:
                        fetch = PubMedFetcher()
                        article = fetch.article_by_pmid(pubmedid)
                        if article.mesh is not None:
                            keywords = []
                            if 'keywords' in tool:
                                keywords = tool['keywords']
                            for keyword_key in article.mesh:
                                keyword = article.mesh[keyword_key]
                                if keyword['descriptor_name'] not in top_words:
                                    keywords.append(keyword['descriptor_name'])
                            # De-duplicate while preserving order
                            keywords = list(dict.fromkeys(keywords))
                            tool['keywords'] = keywords
                            print(article.mesh)
            except Exception as e:
                print('Error doi --' + str(doi))
    tools[key] = tool
import eutils
from metapub import PubMedFetcher

fetch = PubMedFetcher()

# get the first 1000 pmids matching "breast neoplasm" keyword search
pmids = fetch.pmids_for_query('breast neoplasm', retmax=1000)

# get abstract for each article:
abstracts = {}
for pmid in pmids:
    abstracts[pmid] = fetch.article_by_pmid(pmid).abstract
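# Fetching 1000 articles one by one is slow and repeats work across runs.
# metapub can cache responses on disk via the cachedir argument (the same
# keyword other snippets in this collection use), so re-runs only hit the
# network for new PMIDs. A minimal sketch; the cache path is an arbitrary
# choice, not part of the original example:
from metapub import PubMedFetcher

fetch = PubMedFetcher(cachedir='./.metapub_cache')
abstracts = {}
for pmid in fetch.pmids_for_query('breast neoplasm', retmax=1000):
    article = fetch.article_by_pmid(pmid)
    if article.abstract:  # some records have no abstract
        abstracts[pmid] = article.abstract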
def main(folder_name, query):
    initial_location = os.getcwd()
    working_directory = folder_name
    if os.path.exists(working_directory) is True:
        os.chdir(working_directory)
        # Check if the RelevantPapers directory exists
        if os.path.exists("RelevantPapers") is False:
            os.mkdir("RelevantPapers")
        os.chdir("./RelevantPapers")
    else:
        os.mkdir(working_directory)
        os.chdir(working_directory)
        os.mkdir("RelevantPapers")
        os.chdir("./RelevantPapers")

    # Check if the files from the Blast&Modeller exist
    if query == 'YES':
        if os.path.exists("../Blast&Modeller") is True:
            shutil.copy("../Blast&Modeller/query.fasta", "./query.fasta")
        else:
            print("The query file doesn't exist! Make sure you have run the "
                  "BLAST&Modeller and the query.fasta is ok!")
    else:
        with open("query.fasta", "w+") as f:
            protein_sequence = query
            f.write(">query" + "\n" + protein_sequence)

    # Using the query sequence copied or indicated by the user, perform a
    # BLAST search to identify the UniProt identifier
    try:
        query = SeqIO.read(open("query.fasta"), format="fasta")
        print("Blast search running online... This might take a while.")
        result_handle = NCBIWWW.qblast("blastp", "swissprot", query,
                                       auto_format="XML",
                                       matrix_name="BLOSUM62",
                                       expect=0.0001, word_size="6",
                                       gapcosts="11 1", alignments=10)
        print("Blast successful!")
        with open("blast_result.xml", "w+") as blast_result:
            blast_result.write(result_handle.read())
    except BaseException as ex:
        print("Some error occurred: " + str(ex))
        time.sleep(5)
        quit()

    # Check if the blast was successful
    try:
        blastup = SearchIO.read("blast_result.xml", "blast-xml")
    except BaseException as ex:
        print("Some error occurred during your blast search.\n" + str(ex))
        time.sleep(5)
        quit()

    # Extract the UniProt ID and the protein name
    uniprot_id = blastup[0].id.split("|")[1][0:6]
    if uniprot_id.isalpha():
        uniprot_id = input(
            "It seems there is some problem finding the Uniprot ID of your "
            "protein. If you have it, please enter it. Else, press enter to exit.")
    else:
        print("Uniprot id found (" + str(uniprot_id) + "). Extracting information...")
    handle = urllib.request.urlopen("https://www.uniprot.org/uniprot/" +
                                    str(uniprot_id) + ".xml")
    record = SeqIO.read(handle, "uniprot-xml")
    with open("uniprot_papers.txt", "w+") as papers1:
        for papers in record.annotations["references"]:
            papers1.write(str(papers) + "\n")
    protein = []
    for info in record.annotations:
        if info == "submittedName_fullName":
            protein.append(record.annotations[info])
        elif info == "recommendedName_fullName":
            protein.append(record.annotations[info])
        elif info == "alternativeName_fullName":
            protein.append(record.annotations[info])

    # Use the protein name, term by term, as keywords and add immob*
    keywords = []
    flat_keywords = []
    for names in protein:
        for name in names:
            keywords.append(name.split(" "))
    for list_1 in keywords:
        for a in list_1:
            flat_keywords.append(a)
    keywords = ""
    for words in flat_keywords:
        keywords += (str(words) + " OR ")
    keywords = keywords[0:-3]
    keywords += "AND immob*"

    # Search on PubMed using Entrez from Biopython
    print("Search on PubMed database is going to start with: \"" + keywords +
          "\" as the keywords.")
    Entrez.email = "*****@*****.**"
    handle1 = Entrez.esearch(db="pubmed", sort="relevance", retmax="20",
                             retmode="xml", term=keywords)
    article_list = Entrez.read(handle1)
    article_id_list = article_list["IdList"]
    uniprot_articles = []
    with open("uniprot_papers.txt", "r") as papers1:
        for line in papers1:
            if "pubmed id" in line:
                uniprot_articles.append(line[11:-1])
    # Drop empty entries collected from the papers file
    uniprot_articles = [a for a in uniprot_articles if a != ""]

    # Use the metapub package to retrieve the information and write it to a CSV file
    print("Retrieving the information...")
    with open("relevant_papers.csv", "w", newline="", encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Number", "Article ID", "Title", "Year", "Link", "DOI"])
        fetcher = PubMedFetcher()
        for i in range(len(article_id_list)):
            src = fetcher.article_by_pmid(article_id_list[i])
            number = i + 1
            article_id = article_id_list[i]
            title = src.title
            year = src.year
            link = "https://pubmed.ncbi.nlm.nih.gov/" + article_id_list[i]
            DOI = src.doi
            writer.writerow([number, article_id, title, year, link, DOI])
        for i in range(len(uniprot_articles)):
            up_src = fetcher.article_by_pmid(uniprot_articles[i])
            number = "Uniprot" + str(i + 1)
            article_id = uniprot_articles[i]
            title = up_src.title
            year = up_src.year
            link = "https://pubmed.ncbi.nlm.nih.gov/" + uniprot_articles[i]
            DOI = up_src.doi
            writer.writerow([number, article_id, title, year, link, DOI])

    print("\tFinished running the Reference retrieval module!\n"
          " You can find your result files in " + str(working_directory) +
          " in the RelevantPapers folder.\n The papers are organized in the "
          "csv file named \"relevant_papers.csv\"!")
    os.chdir(initial_location)
    # (snippet begins inside a preceding journal-format loop)
    hostname = urlparse(url).hostname
    write_one_mapping(hostname, jrnl)

# PII based
for jrnl, url in misc_pii.simple_formats_pii.items():
    hostname = urlparse(url).hostname
    write_one_mapping(hostname, jrnl)

# BIOCHEMSOC (VIP format)
for jrnl, value in biochemsoc.biochemsoc_journals.items():
    write_one_mapping(value['host'], jrnl)

# AAAS (VIP format)
# dummy pma for formatting
pma = fetch.article_by_pmid(27095592)
for jrnl, value in aaas.aaas_journals.items():
    hostname = urlparse(aaas.aaas_format.format(ja=value['ja'], a=pma)).hostname
    write_one_mapping(hostname, jrnl)

# One-offs we know about
write_one_mapping('joponline.org', 'J Periodontol')
write_one_mapping('medicinabuenosaires.com', 'Medicina (B Aires)')

fh.write('}\n')

# More complicated reversals...
# JAMA?
def psearch(pmid):
    fetch = PubMedFetcher()
    ret = fetch.article_by_pmid(pmid)
    print(ret.to_dict())
from metapub import PubMedFetcher
import pandas as pd

fetch = PubMedFetcher()

df = pd.read_excel('Journals_PMID.xlsx', dtype=str)
with open('pmlist.txt', 'w') as wrtf:
    for i, column in enumerate(df):
        pmids = df[column].tolist()
        for j, pmid in enumerate(pmids):
            if str(pmid) != 'nan':
                wrtf.write(str(pmid) + '\n')
                print('Journal: ' + str(i) + ' | Abs: ' + str(j))

with open('abs.txt', 'w') as wrtf:
    pmids = [line.rstrip('\n') for line in open('pmlist.txt')]
    print('Totally: ' + str(len(pmids)) + ' papers')
    for j, pmid in enumerate(pmids):
        try:
            download = fetch.article_by_pmid(pmid)
            if download.abstract and download.journal and download.year:
                wrtf.write(download.journal + '-!!-' + str(download.year) +
                           '-##-' + download.abstract + '\n')
                print(' | Abs: ' + str(j) + ' downloaded for: ' + pmid)
        except Exception:
            print('download fail for: ' + ' | Abs: ' + str(j) + ' pmid: ' + pmid)
def main():
    '''
    Collects all .ris citation files from the publications folder and
    generates a Publications.md in the wiki folder containing all
    important information.
    '''
    # Collect .ris files
    ris_files = []
    for ris in glob.glob(os.path.join('publications', '**', '*.ris'), recursive=True):
        ris_files.append(ris)

    # Extract information from ris files and store it in a dictionary
    publications_dict = {}
    all_ris_doi = set()
    for fullpath in ris_files:
        head, ris = os.path.split(fullpath)
        subfolder = os.path.basename(head)
        if subfolder not in publications_dict.keys():
            publications_dict[subfolder] = {}
        with open(fullpath, 'r') as in_file:
            tmp_dict = {'Authors': []}
            doi = None
            for line in in_file:
                l = line.strip()
                if l[:2] in ['A1', 'AU']:
                    tmp_dict['Authors'].append(l.split(' - ')[1])
                elif l[:2] in ['T1', 'TI']:
                    title = l.split(' - ')[1].replace('<em>', '').replace('</em>', '')
                    tmp_dict['Title'] = title
                elif l[:2] in ['Y1', 'DA', 'PY']:
                    year = int(l.split(' - ')[1].split('/')[0])
                    tmp_dict['Year'] = year
                elif l[:2] in ['JO', 'JF', 'T2']:
                    tmp_dict['Journal'] = l.split(' - ')[1]
                elif l[:2] in ['VL']:
                    tmp_dict['Volume'] = l.split(' - ')[1]
                elif l[:2] in ['IS']:
                    tmp_dict['Issue'] = l.split(' - ')[1]
                elif l[:2] in ['UR']:
                    tmp_dict['URL'] = l.split(' - ')[1]
                elif l[:2] in ['N2', 'AB']:
                    tmp_dict['Abstract'] = l.split(' - ')[1]
                elif l[:2] in ['DO', 'M3', 'N1']:
                    doi_line = l.split(' - ')[1].replace('doi:', '')
                    doi = '/'.join(doi_line.split('/')[-2:])
                    tmp_dict['DOI'] = doi
        for k in ['Title', 'Authors', 'Year', 'Journal', 'URL', 'DOI']:
            if k not in tmp_dict.keys():
                print('{0} is required but could not be found for {1}'.format(k, fullpath))
                sys.exit(1)
        for k in ['Volume', 'Issue', 'Abstract']:
            if k not in tmp_dict.keys():
                tmp_dict[k] = ''
        publications_dict[subfolder][doi] = tmp_dict
        publications_dict[subfolder][doi]['Authors'] = '; '.join(tmp_dict['Authors'])
        citation_file = 'https://github.com/halophiles/halowiki//tree/master/publications/{0}/{1}'.format(subfolder, ris)
        publications_dict[subfolder][doi]['Citation'] = citation_file
        all_ris_doi.add(doi)

    # Fetch publications from PubMed and store their info in the same dict
    pm_fetch = PubMedFetcher()
    hfx_pmids = pm_fetch.pmids_for_query('Haloferax volcanii')
    known_problems = [
        '29906440',
        '29888297',
        '29038254',
        '28660233',
        '25954264',
        '24240572',
    ]
    for pmid in hfx_pmids:
        if pmid in known_problems:
            continue
        try:
            article = pm_fetch.article_by_pmid(pmid)
            doi = '/'.join(article.doi.split('/')[-2:])
            tmp_dict = {}
            tmp_dict['Authors'] = '; '.join(article.authors)
            tmp_dict['Title'] = article.title.replace('<em>', '').replace('</em>', '')
            tmp_dict['Year'] = int(article.year)
            tmp_dict['Journal'] = article.journal
            tmp_dict['Volume'] = article.volume
            tmp_dict['Issue'] = article.issue
            tmp_dict['URL'] = article.url
            tmp_dict['Abstract'] = article.abstract.replace('~', '')
            tmp_dict['DOI'] = doi
            tmp_dict['Citation'] = ''
        except Exception:
            print('unsuccessful for {0}'.format(pmid))
            continue
        if doi in all_ris_doi:
            continue
        publications_dict.setdefault('Others', {})[doi] = tmp_dict

    # Write markdown file for wiki based on info in dict
    output_filename = os.path.join('wiki', 'Publications.md')
    total_pubs = 0
    with open(output_filename, 'w', encoding="utf-8") as out_file:
        print('# Publications [ ](# )', file=out_file)
        print('', file=out_file)
        for subheading in sorted(publications_dict.keys()):
            print(' * [{0}](#{1})'.format(
                subheading.replace('_', ' '),
                subheading.replace(' ', '-').lower()
            ), file=out_file)
            print('', file=out_file)
        for subheading in sorted(publications_dict.keys()):
            print('## {0}'.format(subheading.replace('_', ' ')), file=out_file)
            print('', file=out_file)
            pub_list = []
            for pub in publications_dict[subheading].keys():
                try:
                    # First author (Authors was joined with '; ' above)
                    publications_dict[subheading][pub]['Lead Author'] = \
                        publications_dict[subheading][pub]['Authors'].split('; ')[0]
                    pub_list.append(publications_dict[subheading][pub])
                except Exception:
                    print(pub)
                    print(publications_dict[subheading][pub]['Authors'])
            for pub in sorted(
                    pub_list,
                    key=itemgetter('Year', 'Lead Author'),
                    reverse=True,
            ):
                total_pubs += 1
                print('''*{Title}*<br/>
{Authors}<br/>
**{Year}**<br/>
{Journal} {Volume}({Issue})<br/>
{DOI}
<details>
<summary>Abstract and Links</summary>
[Link to Publication]({URL})<br/>
[Citation]({Citation})<br/>
{Abstract}<br/>
</details><br/>

---
'''.format(**pub), file=out_file)
        print('''[Go to top of page](# )<br/>
----''', file=out_file)
    print('Total Number of Publications written to Publications.md:')
    print(total_pubs)
from metapub import PubMedFetcher

fetch = PubMedFetcher()

print("Get paper information by PMID")
article = fetch.article_by_pmid('21931568')
print(article.title)
print(article.journal, article.year, article.volume, article.issue)
print(article.authors)

print('\nGet paper information by PMCID')
article = fetch.article_by_pmcid(2674488)
print(article.title)
print(article.journal, article.year, article.volume, article.issue)
print(article.authors)
import sys
import logging

from metapub import PubMedFetcher
from metapub.exceptions import InvalidPMID
from metapub.convert import pmid2doi

DEBUG = True

####
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("eutils").setLevel(logging.INFO)
####

fetch = PubMedFetcher()

if __name__ == '__main__':
    try:
        filename = sys.argv[1]
    except IndexError:
        print('Supply a filename containing a list of PMIDs as argument to this script.')
        sys.exit()

    pmids = open(filename, 'r').readlines()
    for pmid in [item.strip() for item in pmids if item.strip() != '']:
        try:
            pma = fetch.article_by_pmid(pmid)
            doi = pmid2doi(pmid) or ''
            print(','.join([pmid, doi, pma.title]))
            print('')
        except InvalidPMID:
            print(pmid, ',,INVALID')
def crawl_abstract(keyword, outfile=None, max_iter=1000, has_chem_only=False):
    fetch = PubMedFetcher()
    pmids = fetch.pmids_for_query(keyword, retmax=max_iter)
    print("PMID scan Done!")
    if not outfile:
        outfile = "[Crawling Results]" + keyword + ".tsv"
    with open(outfile, 'w', encoding="utf8") as o_file:
        header = "PMID\tAuthors\tYear\tTitle\tAbstract\tURL\tCitation\tChemicals\n"
        o_file.write(header)
        print("Crawling Paper Info..")
        for pmid in tqdm(pmids):
            article = fetch.article_by_pmid(pmid)
            if not article:
                continue
            # Skip records with missing fields; flatten tabs/newlines so each
            # record stays on one TSV row.
            authors = article.authors_str
            if not authors:
                continue
            if "\t" in authors or "\n" in authors:
                authors = remove_escape(authors)
            year = article.year
            if not year:
                continue
            if "\t" in year or "\n" in year:
                year = remove_escape(year)
            title = article.title
            if not title:
                continue
            if "\t" in title or "\n" in title:
                title = remove_escape(title)
            abstract = article.abstract
            if not abstract:
                continue
            if "\t" in abstract or "\n" in abstract:
                abstract = remove_escape(abstract)
            url = article.url
            if not url:
                continue
            if "\t" in url or "\n" in url:
                url = remove_escape(url)
            citation = article.citation
            if not citation:
                continue
            if "\t" in citation or "\n" in citation:
                citation = remove_escape(citation)
            chemical = article.chemicals
            if not chemical:
                if has_chem_only:
                    continue
                chemical = "None"
            else:
                chemical = str(chemical).replace("\'", "\"")
                if "\t" in chemical or "\n" in chemical:
                    chemical = remove_escape(chemical)
            o_file.write(pmid + "\t")
            o_file.write(authors + "\t")
            o_file.write(year + "\t")
            o_file.write(title + "\t")
            o_file.write(abstract + "\t")
            o_file.write(url + "\t")
            o_file.write(citation + "\t")
            o_file.write(chemical + "\n")
    print("Process Done!")
    print("Result is saved in <" + outfile + ">.")