def fetch_abstract(pmids):
    pmid_str = ",".join(pmids)
    try:
        handle = efetch(db='pubmed', id=pmid_str, retmode='xml')
    except urllib.error.HTTPError:
        # retry once on a transient HTTP error
        handle = efetch(db='pubmed', id=pmid_str, retmode='xml')
    xml_data = read(handle)['PubmedArticle']
    try:
        articles = [rec['MedlineCitation'] for rec in xml_data]
    except KeyError:
        articles = None
    return articles
def author(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)['PubmedArticle'][0]
    data = xml_data['MedlineCitation']['Article']['AuthorList']
    author_list = []
    for n in range(len(data)):
        author = data[n]
        name = author['ForeName'] + ' ' + author['LastName']
        aff_info = author['AffiliationInfo']
        if aff_info:
            aff = aff_info[0]['Affiliation']
        else:
            aff = ''
        author_list.append('#Name ' + name)
        if aff:
            author_list.append('#Affiliation ' + aff)
    author_str = ' '.join(author_list)
    return author_str
def get_terms_abstract(self, pubmed_id):
    handle = efetch(db='pubmed', id=pubmed_id, retmode='text',
                    rettype='abstract', email='*****@*****.**')
    text = handle.read()
    terms = TermsBag()
    # terms.termine_service(text)
    self.view.terms[self.view.pubmed_id.isin([int(pubmed_id)])] = terms.termine_service(text)
    self.terms = terms.to_data_frame()
def fetch_abstract(pmid):
    ##
    ## Return abstract of a given
    ## article using pmid
    ##
    ## => Return None when the pmid can't be retrieved
    ## (can happen when the article is in Chinese)
    ##
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        xml_data = read(handle)
        xml_data = xml_data['PubmedArticle'][0]
    except:
        return None
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except IndexError:
        return None
    except KeyError:
        return None
    except:
        return None
def fetch_abstract(id_list):
    ids = ','.join(id_list)
    handle = efetch(db='pubmed', id=ids, retmode='xml')  ### this step returns an xml handle
    # results = Entrez.read(handle)  # this step parses the xml into a record object
    # return results
    return handle
def pubmed(gi, ids, query):
    """
    Get the pubmed articles listed by *ids
    """
    _ids = ",".join(ids)
    for id in ids:
        handle = efetch(db="pubmed", id=id, retmode='xml', rettype='xml', retmax=MAX_RETURN)
        try:
            #print handle.read()
            results = eread(handle)
            for citation in results:
                #runtime().debug(citation.keys())
                citation = citation['MedlineCitation']
                pmid = citation['PMID']
                article = citation['Article']
                title = article['ArticleTitle']
                journal = article['Journal']['Title']
                try:
                    date = citation['DateCompleted'] if citation.has_key('DateCompleted') else citation['DateCreated']
                    year = date['Year']
                    month = date['Month']
                    day = date['Day']
                    datetime = "%s-%s-%s" % (year, month, day)
                except:
                    datetime = '0000-00-00'
                runtime().debug("Parsed pmid:%s" % id)
                yield Citation(gi, pmid, title, journal, datetime, query)
        except:
            runtime().debug("Failure fetching pmid:%s" % id)
            continue
        finally:
            handle.close()
def fetch_abstract(pmid):
    print(pmid)
    Entrez.email = '*****@*****.**'
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)
    return xml_data
def specialization(author, affiliation):
    # import libraries
    import wikipedia
    import re
    from Bio.Entrez import efetch, read

    author = '"' + author + '"'

    # Find IDs for doctor + affiliation
    ids = []
    results = search('%s[AUTH] AND %s[AFFL]' % (author, affiliation))['IdList']
    for i in results:
        ids.append(i)
    num_paper = len(ids)

    # get abstracts from list of IDs
    query_abstracts = ''
    keywords = []
    query_keywords = ''
    query_title = ''
    for i in ids:
        xml_data = read(efetch(db='pubmed', id=i, retmode='xml'))
        try:
            abstract = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
            query_abstracts = query_abstracts + str(abstract) + ' '
        except:
            print('Paper with ID: ' + i + ' has no abstract')
        # get keywords from IDs
        if xml_data['PubmedArticle'][0]['MedlineCitation']['KeywordList'] != []:
            for x in xml_data['PubmedArticle'][0]['MedlineCitation']['KeywordList'][0]:
                keywords.append(str(re.sub("[^a-zA-Z]", " ", x)))
                query_keywords = query_keywords + x + ' '
        # get paper title from IDs
        try:
            query_title = query_title + ' ' + xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        except:
            print('Paper with ID: ' + i + ' has no title')

    # get first sentence of the wiki page for each keyword
    query_wiki = ''
    for keyword in keywords:
        try:
            page = wikipedia.summary(keyword, sentences=1)
            query_wiki = query_wiki + ' ' + str(re.sub("[^a-zA-Z]", " ", page))
        except:
            print('Disambiguation error for keyword: ' + keyword + ', action: keyword excluded')

    # find specialism
    corpus = query_abstracts + ' ' + query_keywords + ' ' + query_wiki + ' ' + query_title
    specialization = str(spec_search(corpus))
    if num_paper == 0:
        print('no papers found')
        specialization = []
    else:
        print('this doctor is specialized in: ' + specialization)
    return specialization
def query(self, term):
    id_list = self.__term_querier.query_matching_paper_ids(term)
    id_list_string = ','.join(id_list)
    handle = efetch(db=self.__db, id=id_list_string, retmode=self.__retmode)
    papers = self.__entrez.read(handle)
    return papers['PubmedArticle']
def get_abstract(pmid):
    """
    function to collect an abstract from pubmed
    shamelessly taken from:
    https://stackoverflow.com/questions/17409107/obtaining-data-from-pubmed-using-python
    """
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    return handle.read()
def print_xml(pmid):
    """
    from the result of the query (pubmed ids) parse out title, abstract,
    article type, and journal info
    :param pmid: list of PubMed IDs after querying
    :return: list of titles, abstracts, types, and journal names
    """
    handle = efetch(db='pubmed', id=','.join(map(str, pmid)), retmode='xml', rettype='text')
    # try:
    print('entering print_xml')
    doc = handle.read()
    # except http.client.IncompleteRead:
    #     continue
    doc = xmltodict.parse(doc)
    doc = json.loads(json.dumps(doc))
    print('have read the doc')
    d = doc['PubmedArticleSet']["PubmedArticle"]
    titles = []
    types = []
    abstracts = []
    jour_names = []
    for i in d:  # iterate through each article
        # find journal information
        if i["MedlineCitation"]['Article']['Journal']['Title'] is not None:
            jour_name = i["MedlineCitation"]['Article']['Journal']['Title']
            jour_names.append(jour_name)
        else:
            jour_names.append('no journal found')
        # find title information
        t = i["MedlineCitation"]['Article']['ArticleTitle']
        if isinstance(t, str):
            t = i["MedlineCitation"]['Article']['ArticleTitle']
        elif i["MedlineCitation"]['Article']['ArticleTitle'] is None:
            t = "no title"
        else:
            t = i["MedlineCitation"]['Article']['ArticleTitle']['#text']
        titles.append(t)
        if 'Abstract' in i['MedlineCitation']['Article']:
            abstracts.append(i['MedlineCitation']['Article']['Abstract']['AbstractText'])
        else:
            abstracts.append('no abstract')
        # find type of article
        type = i['MedlineCitation']['Article']['PublicationTypeList']['PublicationType']
        if isinstance(type, dict):
            types.append(type['#text'])
            # print(ty['#text'])
        else:
            # print(ty)
            type_stripped = []
            for d in type:
                type_stripped.append(d['#text'])
            type_stripped = ', '.join(type_stripped)
            types.append(type_stripped)
    return titles, abstracts, types, jour_names
def fetch_abstract(self, pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml',
                    email='*****@*****.**', retmax=1000)
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        return None
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except IndexError:
        return None
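# A minimal usage sketch for the fetch_abstract variants above (illustrative only,
# not part of the original snippets). It assumes Biopython is installed and that
# efetch/read are imported from Bio.Entrez as in the snippets; NCBI asks for a
# contact e-mail before fetching. The PMID below is a placeholder.
from Bio import Entrez
from Bio.Entrez import efetch, read

Entrez.email = "you@example.org"       # replace with your own address
abstract = fetch_abstract("123456")    # hypothetical PMID, for illustration
if abstract is not None:
    print(abstract[:200])              # print the first 200 characters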
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = handle.read()
    r = re.compile('<AbstractText>(.*?)</AbstractText>')
    m = r.search(xml_data)
    if m:
        abstract = m.group(1)
        return abstract
    else:
        return ""
def fetch_abstract(pmid):
    # not really being used at all, just a ref. function
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = Entrez.read(handle)
    print(xml_data)
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']
        return abstract
    except IndexError:
        return None
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)
    try:
        article = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        title = article['ArticleTitle']
        return abstract, title
    except (IndexError, KeyError) as _:
        return None
def fetch_abstract(pmid):
    """Pass in an article id."""
    pmid = str(pmid)
    try:
        handle = efetch(db='pubmed', id=pmid, retmode='xml')
        xml_data = read(handle)[0]
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except Exception as e:
        return '{}: {}'.format(e.__class__.__name__, e)
def fetch_abstract(pmid):
    Entrez.email = "*****@*****.**"
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except IndexError:
        return None
def __init__(self, pmid):
    self.pmid = pmid
    Entrez.email = '*****@*****.**'
    inCache = False
    if self.pmid in pmidTermFind.pmid_xmlData_HISTORY:  # If ID in cache
        self.xml_data = pmidTermFind.pmid_xmlData_HISTORY[self.pmid]
        inCache = True
    if inCache == False:
        self.handle = efetch(db='pubmed', id=self.pmid, retmode='xml')
        self.xml_data = Entrez.read(self.handle)[0]
        pmidTermFind.pmid_xmlData_HISTORY[pmid] = self.xml_data
def get_mesh_from_pmid(self, user):
    Entrez.email = user.email
    handle = efetch(db="pubmed", id=str(self.pmid), retmode="xml")
    xml_data = read(handle)[0]
    # Skips articles without MeSH terms
    if u'MeshHeadingList' in xml_data["MedlineCitation"]:
        for mesh in xml_data["MedlineCitation"][u'MeshHeadingList']:
            major = "N"
            qualifiers = mesh[u'QualifierName']
            if len(qualifiers) > 0:
                major = str(qualifiers[0].attributes.items()[0][1])
            descr = mesh[u'DescriptorName']
            name = descr.title()
def fetch_article(pmid):
    """
    Test function => Not working
    """
    handle = efetch(db='pubmed',
                    id=pmid,
                    retmode='xml',
                    )
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return article
    except IndexError:
        return None
def add_pmid_article_to_database(article_id):
    """
    Given a PMID, use external APIs to get the necessary article data
    in order to add the article to our database.
    """
    if len(list(get_article_object(article_id))) == 0:
        pmid = str(article_id)
        handle = efetch("pubmed", id=[pmid], rettype="medline", retmode="text")
        records = list(Medline.parse(handle))
        records = records[0]
        if "TI" not in records:
            return False  # catch bad PMIDs
        article_info = {}
        article_info["title"] = records["TI"]
        article_info["PMID"] = pmid
        article_info["authors"] = ', '.join(records["AU"])
        article_info["abstract"] = records["AB"]
        article_info["DOI"] = getDOI(records["AID"])
        article_info["experiments"] = ""
        article_info["metadata"] = str({"meshHeadings": []})
        article_info["reference"] = None
        identity = ""
        try:
            article_info["experiments"] = {
                "locations": eval(
                    urllib.request.urlopen(
                        "http://neurosynth.org/api/studies/peaks/"
                        + str(pmid) + "/").read().decode())["data"]
            }
            k = article_info["experiments"]["locations"]
            for i in range(len(k)):
                if len(k[i]) == 4:
                    identity = k[0]
                    k[i] = k[i][1:]
                k[i] = ",".join([str(x) for x in (k[i])])
        except BaseException:
            pass
        article_info["id"] = identity
        article_info["experiments"] = [article_info["experiments"]]
        Articles.insert(abstract=article_info["abstract"],
                        authors=article_info["authors"],
                        doi=article_info["DOI"],
                        experiments=article_info["experiments"],
                        pmid=article_info["PMID"],
                        title=article_info["title"]).execute()
        return True
    return False
def date(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)['PubmedArticle'][0]
    data = xml_data['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
    if 'Day' in data:
        day = data['Day']
    else:
        day = ''
    return data['Year'] + ' ' + data['Month'] + ' ' + day
def get_metadata_from_PMID(pmid):
    """This function will take an input PMID and parse the attributes I am
    interested in for the cytoscape plugin...."""
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False
    try:
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCompleted'])
    except:
        print "Date Completed not available??", pmid
    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except:
        print "Other ID not available??", pmid
    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except:
        print "Unable to get mesh headings for", pmid
    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output:
            print xml_data
            #print date_completed, otherID
        for author in article['AuthorList']:
            #author_key = { 'LastName': author['LastName'], 'Initials': author['Initials'] }
            author_key = author['LastName'] + ',' + author['Initials']
            #print author['LastName'], author['Initials'], author, 'MOO'
            if author_key in global_author_list:
                global_author_list[author_key] += 1
                #print "adding author"
            else:
                global_author_list[author_key] = 1
                #print "I ADDED AN AUTHOR..."
        #return abstract
    except IndexError:
        return None
    except:
        print "unable to process", pmid
        print "Unexpected error:", sys.exc_info()[0]
    try:
        abstract = article['Abstract']['AbstractText'][0]
    except:
        print "Unable to get abstract for", pmid
        print "Unexpected error:", sys.exc_info()[0]
def get_mesh(pmid):
    # call PubMed API
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)[0]
    # skip articles without MeSH terms
    if u'MeshHeadingList' in xml_data['MedlineCitation']:
        for mesh in xml_data['MedlineCitation'][u'MeshHeadingList']:
            # grab the qualifier major/minor flag, if any
            major = 'N'
            qualifiers = mesh[u'QualifierName']
            if len(qualifiers) > 0:
                major = str(qualifiers[0].attributes.items()[0][1])
            # grab descriptor name
            descr = mesh[u'DescriptorName']
            name = descr.title()
            yield (name, major)
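# Illustrative use of the get_mesh generator above (a sketch under the same
# assumptions as the snippet: Bio.Entrez imported, Entrez.email set, and a
# placeholder PMID). Each iteration yields a (descriptor name, major-topic flag) pair.
for name, major in get_mesh(123456):
    print("%s (major topic: %s)" % (name, major))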
def get_article_title(pmid):
    """
    Connect to the pubmed database and get the article title of the given pmid.
    Return NA if it fails.
    """
    handle = efetch(
        db='pubmed',
        id=pmid,
        retmode='xml',
    )
    xml_data = read(handle)
    xml_data = xml_data['PubmedArticle'][0]
    try:
        title = xml_data['MedlineCitation']['Article']['ArticleTitle']
    except:
        title = "NA"
    return title
def getDocument(self, pmid):
    # this method will return all associated attributes for an article
    # including Article Title, Publication Date, Authors' Names, Citations......etc.
    # it will return it as a python dictionary suitable for storage in mongodb
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)[0]
    article = dict(
        id=pmid,
        Title=str(xml_data['MedlineCitation']['Article'][u'ArticleTitle']),
        Abstract=str(self.safeAbstract(xml_data['MedlineCitation'], u'Abstract')),
        DateCompleted="{}/{}/{}".format(
            self.safeDateCompleted(xml_data['MedlineCitation'], 'DateCompleted', 'Day'),
            self.safeDateCompleted(xml_data['MedlineCitation'], 'DateCompleted', 'Month'),
            self.safeDateCompleted(xml_data['MedlineCitation'], 'DateCompleted', 'Year'),
        ),
        DateRevised="{}/{}/{}".format(
            self.safeDateCompleted(xml_data['MedlineCitation'], 'DateRevised', 'Day'),
            self.safeDateCompleted(xml_data['MedlineCitation'], 'DateRevised', 'Month'),
            self.safeDateCompleted(xml_data['MedlineCitation'], 'DateRevised', 'Year'),
        ),
    )
    return (xml_data, article)
def fetch_article(pmid):
    """
    Test function => Not working
    """
    handle = efetch(db='pubmed',
                    id=pmid,
                    retmode='xml',
                    )
    informations = read(handle)
    stuff = informations[u'PubmedArticle'][0]
    date = stuff[u'PubmedData']["History"][1]
    month = date[u'Month']
    day = date[u'Day']
    year = date[u'Year']
    print month
    print day
    print year
    return "choucroute"
def fetch_abstract(pmid):
    """
    Return abstract of a given article using pmid

    => Return None when the pmid can't be retrieved
    (can happen when the article is in Chinese)
    """
    handle = efetch(db='pubmed',
                    id=pmid,
                    retmode='xml',
                    )
    xml_data = read(handle)
    xml_data = xml_data['PubmedArticle'][0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except IndexError:
        return None
    except KeyError:
        return None
def get_country_publication_stat(run_folder):
    ##
    ## Get the publication stats per country.
    ## Get the list of pmid retrieved from the
    ## meta folder and connect to the NCBI to fetch
    ## publication information, parse it to get the
    ## country of publication.
    ##
    ## return a dictionary
    ##

    ## init structure
    country_to_count = {}

    ## get list of PMID to process
    meta_file_list = glob.glob(run_folder + "/meta/*.csv")
    for meta_file in meta_file_list:
        meta_file_in_array = meta_file.split("/")
        file_name = meta_file_in_array[-1]
        file_name_in_array = file_name.split(".")
        pmid = file_name_in_array[0]

        ## get country of publication
        try:
            handle = efetch(db='pubmed',
                            id=pmid,
                            retmode='xml',
                            )
            informations = read(handle)
            stuff = informations[u'PubmedArticle'][0]
            country = stuff[u'MedlineCitation'][u'MedlineJournalInfo'][u'Country']
            print country  # to delete
        except:
            country = "NA"

        ## fill dictionary
        if(country not in country_to_count.keys()):
            country_to_count[country] = 1
        else:
            country_to_count[country] += 1

    return country_to_count
def fetch_abstract(pmid):
    '''
    This method was originally written by Karol.
    Colin Hortman added the try/except block to handle articles where no abstract was found.
    http://stackoverflow.com/questions/17409107/obtaining-data-from-pubmed-using-python

    :param pmid: PubMed ID
    :return: Abstract (will return as StringElement but behaves mostly like a string)
    '''
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = Entrez.read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        try:
            abstract = article['Abstract']['AbstractText'][0]
            return abstract
        except KeyError:
            pass
            #print('No Abstract found for PMID: ', pmid)
    except IndexError:
        return None
    return None
def date(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)['PubmedArticle']
    if xml_data:
        xml_list = xml_data[0]
        data = xml_list['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']
        if 'Day' in data:
            day = data['Day']
        else:
            day = ''
        if 'Month' in data:
            month = data['Month'] + ' '
        else:
            month = ''
        if 'Year' in data:
            year = data['Year'] + ' '
        else:
            year = ''
        return year + month + day
    else:
        return ''
def parse_pmc_ids(self, pmcid, retmode='xml'):
    # get xml from pmc
    handle = efetch(db='pmc', id=pmcid, retmode=retmode)
    xml_string = handle.read()
    xml = ET.fromstring(xml_string)
    # check for keywords and MeshTerms
    keys = []
    for art in xml.getchildren():
        # title
        title = ''.join(art.xpath('.//article-meta//article-title/text()'))
        # authors
        auth = zip(
            art.findall('.//*[@contrib-type="author"]/name/given-names'),
            art.findall('.//*[@contrib-type="author"]/name/surname'))
        auth = ';'.join([' '.join([i.text, j.text]) for (i, j) in auth])
        # affiliations
        aff = ';'.join(art.xpath('.//aff/text()'))
        # publication_date
        pub_date = '-'.join(
            art.xpath('.//article-meta/pub-date[@pub-type="epub"]/*/text()')[::-1])
        # ids
        ## pubmed_id
        pubmed_id = ''.join(
            art.xpath('.//article-meta/article-id[@pub-id-type="pmid"]/text()'))
        ## doi
        doi = ''.join(
            art.xpath('.//article-meta/article-id[@pub-id-type="doi"]/text()'))
        ## pmcid
        pmcid = ''.join(
            art.xpath('.//article-meta/article-id[@pub-id-type="pmc"]/text()'))
        # abstract
        abstract = ''.join(art.xpath('.//abstract//*/text()'))
        # fulltext
        full_text = ''.join(art.xpath('.//body//*/text()'))
        # journal
        journal = ''.join(
            art.xpath('//journal-meta/journal-id[@journal-id-type="iso-abbrev"]/text()'))
        # pmcclass
        pmcclass = ''.join(
            art.xpath('.//article-meta/article-categories//subject/text()'))
        # pmc_keywords
        pmc_keywords = art.xpath('.//kwd/text()')
        keys.append((pmcid, title, auth, aff, pub_date, pubmed_id, doi,
                     abstract, full_text, journal, pmcclass, pmc_keywords))
    return keys, xml_string
def add_pmid_article_to_database(article_id):
    """
    Given a PMID, use external APIs to get the necessary article data
    in order to add the article to our database.
    """
    pmid = str(article_id)
    try:
        handle = efetch("pubmed", id=[pmid], rettype="medline", retmode="text")
    except BaseException:
        return False  # Could not access correct pubmed ID
    records = list(Medline.parse(handle))
    records = records[0]
    article_info = {}
    article_info["title"] = records.get("TI")
    article_info["PMID"] = pmid
    article_info["authors"] = ', '.join(records.get("AU", []))
    article_info["abstract"] = records.get("AB")
    article_info["DOI"] = getDOI(records.get("AID", []))
    article_info["experiments"] = []
    article_info["metadata"] = str({"meshHeadings": []})
    article_info["reference"] = None
    identity = ""
    try:
        locations_list = eval(
            urllib.request.urlopen(
                "http://neurosynth.org/api/studies/peaks/" + str(pmid) +
                "/").read().decode())["data"]
        id_map = {}
        greatest_id = 89999
        current_exp = None
        for loc in locations_list:
            current_loc_id = None
            vals = loc
            if len(loc) == 4:
                current_loc_id = loc[0]
                vals = vals[1:]
            # vals is the x, y, z array; current_loc_id is the Neurosynth ID
            if current_loc_id not in id_map:
                greatest_id += 1
                id_map[current_loc_id] = greatest_id
                if current_exp is not None:
                    # Add the current experiment if it's not None
                    article_info["experiments"].append(current_exp)
                current_exp = {
                    "caption": "",
                    "locations": [],
                    "descriptors": [],
                    "contrast": "",
                    "space": "",
                    "effect": ""
                }
            current_exp["locations"].append(",".join([str(v) for v in vals]))
        if current_exp is not None:
            article_info["experiments"].append(current_exp)
    except BaseException:
        pass
    Articles.create(abstract=article_info["abstract"],
                    authors=article_info["authors"],
                    doi=article_info["DOI"],
                    experiments=str(article_info["experiments"]),
                    pmid=article_info["PMID"],
                    title=article_info["title"])
    return True
def fetch_abstract():
    colnames = ['pmids']
    data = pandas.read_csv("pmids.csv", names=colnames)
    pmids = list(data.pmids)
    full = []
    for i in pmids:
        handle = efetch(db='pubmed', id=i, retmode='xml')
        xml_data = handle.read()
        soup = BeautifulSoup(xml_data)
        a_recs = []
        for tag in soup.findAll("pubmedarticle"):
            title = tag.articletitle.text
            journal = tag.findAll("journal")
            try:
                for info in journal:
                    year = info.find("year").text
            except:
                for info in journal:
                    year = info.find("year")
            for a_tag in tag.findAll("author"):
                a_rec = {}
                a_rec['year'] = year
                a_rec['title'] = title
                a_rec['pmid'] = int(tag.pmid.text)
                try:
                    a_rec['lastname'] = str(a_tag.lastname.text)
                except:
                    a_rec['lastname'] = str(a_tag.lastname)
                try:
                    a_rec['forename'] = str(a_tag.forename.text)
                except:
                    a_rec['forename'] = str(a_tag.forename)
                try:
                    a_rec['initials'] = str(a_tag.initials.text)
                except:
                    a_rec['initials'] = str(a_tag.initials)
                try:
                    a_rec['affiliation'] = str(a_tag.affiliation.text)
                except:
                    a_rec['affiliation'] = str(a_tag.affiliation)
                a_recs.append(a_rec)
        #full.append(a_recs)
        full.append(a_recs)

    def convert(input):
        if isinstance(input, dict):
            return {convert(key): convert(value) for key, value in input.iteritems()}
        elif isinstance(input, list):
            return [convert(element) for element in input]
        elif isinstance(input, unicode):
            return input.encode('utf-8')
        else:
            return input

    full = convert(full)
    print "running"
    #article = ET.XML(xml_data)
    #print article.find('AuthorList').findall('Author')
    for entry in full:
        #print entry
        keys = ['year', 'title', 'pmid', 'lastname', 'forename', 'initials', 'affiliation']
        f = open("citations.csv", "ab")
        dict_writer = csv.DictWriter(f, keys)
        dict_writer.writer.writerow(keys)
        dict_writer.writerows(entry)
    return
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    data = handle.read()
    return data
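# Hedged comparison of the two retrieval styles used throughout these snippets:
# retmode='text' with rettype='abstract' (as just above) returns plain text ready
# to print, while retmode='xml' plus Entrez.read() returns a parsed record to index
# into. The e-mail and PMID below are placeholders, not values from the source.
from Bio import Entrez

Entrez.email = "you@example.org"
print(fetch_abstract("123456"))        # plain-text abstract from the function above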
hit_gi = int(aln.hit_id.split("|")[1])
if hit_gi in processed_ids:
    continue
else:
    processed_ids[hit_gi] = None
if "LOW QUALITY" in aln.hit_def:
    print '\t\t"LOW QUALITY" in seq name'
    continue
for hsp in aln.hsps:
    if float(hsp.expect) < EXPECT:
        qlen = len(hsp.query.replace("-", ""))
        coverage = (float(qlen) / float(record.query_letters)) * 100.0
        if coverage > 80.0:
            sleep(5)
            print "\t\tExp: %f start: %i Qend: %i Coverage%%: %f" % (float(hsp.expect), hsp.query_start, hsp.query_end, coverage)
            ef_handle = efetch(db="protein", id=hit_gi, rettype="fasta", retmode="text")
            fasta_txt = ef_handle.read()
            while fasta_txt.find("unavailable") >= 0:
                print "\t\tentrez is failing hard. sleeping. (id:%i)" % hit_gi
                sleep(5)
                ef_handle = efetch(db="protein", id=hit_gi, rettype="fasta", retmode="text")
                fasta_txt = ef_handle.read()
            # edit fasta_txt
            fasta_lines = fasta_txt.splitlines()
            fasta_header = fasta_lines[0]
            fasta_seq = "".join(fasta_lines[1:])
            fasta_header = fasta_header.replace('predicted protein', 'pred.')
            fasta_header = fasta_header.replace('hypothetical protein', 'pred.')
            fasta_header = fasta_header.replace('PREDICTED:', 'pred.')
def collect_NCBI():
    global all_pmids
    global pmid_dict
    if os.path.exists(f'./{rel_name}/{rel_name}_pmid_dict.json'):
        with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'r') as f:
            jd = f.read()
        temp_dict = json.loads(jd)
        pmid_dict.update(temp_dict)
        return pmid_dict
    for idx in tqdm(range(len(all_pmids))):
        pmid = all_pmids[idx]
        # get records for each pmid
        fetch_records_handle1 = efetch(db="pubmed", id=str(pmid),
                                       rettype="medline", retmode="text")
        # parse fetched records
        records1 = Medline.parse(fetch_records_handle1)
        # Need to iterate over records to extract information
        for record1 in records1:
            # try/except check to be sure that NCBI is not returning an empty result
            try:
                # let's get the pmcid if it exists
                id2 = record1['PMC'][3:]
                #print('PMC', id2)
                # get records for pmcid
                fetch_records_handle2 = efetch(db="pubmed", id=str(id2),
                                               rettype="medline", retmode="text")
                # parse records for pmcid
                records2 = Medline.parse(fetch_records_handle2)
                # Need to iterate over records to extract information
                '''
                Collect the following information: authors, authors' affiliations,
                publication date, citations, grants.
                Store all of this information in a dictionary (pmid_dict)
                '''
                for record2 in records2:
                    authors = record2['FAU']
                    affiliations = record2['AD']
                    pub_date = record2['DCOM']
                    citations = get_links_id(pmid)
                    grants = record2['GR']
                    pmid_dict[pmid] = {
                        'pmcid_number': id2,
                        'pmcid': True,
                        'authors': authors,
                        'affiliations': affiliations,
                        'grants': grants,
                        'pub_date': pub_date,
                        'citations': citations
                    }
            except:
                authors = record1['FAU']
                try:
                    affiliations = record1['AD']
                except:
                    affiliations = ''
                try:
                    pub_date = record1['DCOM']
                except:
                    pub_date = ''
                try:
                    citations = get_links_id(pmid)
                except:
                    citations = ''
                try:
                    grants = record1['GR']
                except:
                    grants = ''
                pmid_dict[pmid] = {
                    'pmcid_number': '',
                    'pmcid': False,
                    'authors': authors,
                    'affiliations': affiliations,
                    'grants': grants,
                    'pub_date': pub_date,
                    'citations': citations
                }
    with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'w') as output:
        output.write(json.dumps(pmid_dict))
    return pmid_dict
csvfile.writerow(['Quad', 'Quad_Alpha', 'V1', 'V2', 'V3', 'V4',
                  'L1', 'L2', 'L3', 'L4', 'L5', 'L6'])
for x, y in zip(verts, edge_lengths):
    quad = [TOLC[atoms[z]['res']] for z in x]
    resverts = [atoms[z]['resseq'] for z in x]
    csvfile.writerow([''.join(quad), ''.join(sorted(quad))] + resverts + y)

# removing very irregular tetrahedra
if task_profile or task_threading:
    # Calculate potentials
    orig_sim_pot = simplex_potential([[atoms[x]['res'] for x in y] for y in verts])
    orig_res_pot = residue_potential(len(atoms), verts, orig_sim_pot)

if task_threading:
    if seq_download:
        seqfile = efetch(db="protein", id=seq_id, rettype=seq_format)
    else:
        seqfile = seq_filename
    ids, seq = zip(*[(x.id, x.seq) for x in AlignIO.read(seqfile, seq_format)])
    # Make PDB sequence the first one
    idtemp, seqtemp = [(x, y) for x, y in zip(ids, seq) if PDB_id in x][0]
    ids, seq = zip(*[(x, y) for x, y in zip(ids, seq) if PDB_id not in x])
    ids = [idtemp] + list(ids)
    seq = [seqtemp] + list(seq)
    trimmed_nums, trimmed_seq = zip(*[trim_gaps(x, seq[0], number_sequence(x)) for x in seq])
    dqs = []
    for z, i, d in zip(trimmed_seq[1:], trimmed_nums[1:], ids[1:]):
        res_numbers, residues = thread_sequence(z, trimmed_seq[0],
                                                pdbdata['DBREF']['seqbegin'], i, span)
        mut_sim_pot = simplex_potential([[residues[x] for x in y] for y in verts])
def print_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    print handle.read()
def get_metadata_from_PMID(pmid, output_errors=False, dump_xml=False):
    """This function will take an input PMID and parse the attributes I am
    interested in for the cytoscape plugin...."""
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False
    # output_errors = False
    author_affiliation_list = []
    cur_paper_author_list = []
    try:
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCompleted'])
    except:
        print "Date Completed not available??", pmid
        ## Will try date created
        #date_completed = None
        #date_created = format_ddate( xml_data['MedlineCitation']['DateCreated'] )
        ## I am removing the difference between date completed and created -- it doesn't really matter for my purposes
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCreated'])
        #fp_error.write('Date Completed Not Available:\n'+str(xml_data)+'\n\n')
    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except:
        print "Other ID not available??", pmid
    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except:
        print "Unable to get mesh headings for", pmid
        if output_errors:
            fp_error.write('MESH NOT AVAILABLE:\n' + str(xml_data) + '\n\n')
    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output:
            print xml_data
        for author in article['AuthorList']:
            #author_key = { 'LastName': author['LastName'], 'Initials': author['Initials'] }
            #print author
            if 'LastName' in author:
                author_key = author['LastName'] + ',' + author['Initials']
                #print author, author_key
                cur_paper_author_list.append(author_key)
            elif 'CollectiveName' in author:
                print "FOUND A COLLECTION EXAMPLE", author
            if 'Affiliation' in author:
                author_affil = author['Affiliation']
                author_affiliation_list.append((author, author_affil))
                # print author_affil
                # sys.exit()
    except NameError as e:
        print e
    except IndexError:
        return None
    except:
        print "unable to process article tag", pmid
        print "Unexpected error parsing author string:", sys.exc_info()[0]
        if output_errors:
            fp_error.write('Article NOT AVAILABLE\n' + str(xml_data) + '\n\n')
        print author
        #print xml_data
    try:
        abstract = article['Abstract']['AbstractText'][0]
    except:
        print "Unable to get abstract for", pmid
    if dump_xml:
        print xml_data
        return xml_data
    else:
        return {
            'auth_list': cur_paper_author_list,
            'affiliations': author_affiliation_list,
            'publication_date': date_completed
        }
def evaluate_article(pmid):
    ##
    ## [IN PROGRESS]
    ##
    ## -> Test if the abstract is cool
    ## -> return true or false
    ##
    ## TODO : write doc
    ##

    ##------------------------##
    ## Parameters for filters ##
    ##------------------------##

    ## initialize parameters
    oldest_year_authorized = "NA"
    case_report_only = False
    case_report_check = False
    authorized_languages = []
    valid_article = False
    check_date = True
    check_language = True
    validation_check = {}
    validation_keywords = {}
    exclusion_check = {}
    exclusion_keywords = {}
    exclusion_keywords_found = False

    ## test if the config file exists
    if (os.path.isfile("config.conf")):
        config_data = open("config.conf", "r")
        validation_keywords_cmpt = 0
        exclusion_keywords_cmpt = 0
        for line in config_data:
            line = line.replace("\n", "")
            line_in_array = line.split(";")
            if (line_in_array[0] == "min year"):
                oldest_year_authorized = line_in_array[1]
            elif (line_in_array[0] == "authorized languages"):
                languages_list = line_in_array[1].split(",")
                for elt in languages_list:
                    authorized_languages.append(unicode(elt))
            elif (line_in_array[0] == "validation keywords"):
                validation_keywords_cmpt += 1
                validation_check["keywords_" + str(validation_keywords_cmpt)] = False
                validation_keywords["keywords_" + str(validation_keywords_cmpt)] = []
                keywords_list = line_in_array[1].split(",")
                for elt in keywords_list:
                    if (elt not in validation_keywords["keywords_" + str(validation_keywords_cmpt)]):
                        validation_keywords["keywords_" + str(validation_keywords_cmpt)].append(str(elt))
            ## Retrieve Exclusion list
            elif (line_in_array[0] == "exclusion keywords"):
                exclusion_keywords_found = True
                exclusion_keywords_cmpt += 1
                exclusion_check["exclusion_" + str(exclusion_keywords_cmpt)] = False
                exclusion_keywords["exclusion_" + str(exclusion_keywords_cmpt)] = []
                keywords_list = line_in_array[1].split(",")
                for elt in keywords_list:
                    if (elt not in exclusion_keywords["exclusion_" + str(exclusion_keywords_cmpt)]):
                        exclusion_keywords["exclusion_" + str(exclusion_keywords_cmpt)].append(str(elt))
            ## case report only option
            ## if nothing is set, default is False
            elif (line_in_array[0] == "case report only" and str(line_in_array[1]) == "True"):
                case_report_only = True
        config_data.close()
    ## default configuration
    else:
        oldest_year_authorized = 2008
        authorized_languages = [u'eng']
        validation_check["keywords_1"] = False
        validation_check["keywords_2"] = False
        validation_keywords["keywords_1"] = [
            "algorithm", "machine" "learning", "neural", "network",
            "statistic", "deep", "classification", "model"
        ]
        validation_keywords["keywords_2"] = [
            "Sjogren", "sjogren", "lupus", "autoimmunity", "rhumatoid",
            "arthrisis", "RA", "SjS", "SLE"
        ]
        exclusion_check["exclusion_1"] = False
        exclusion_keywords["exclusion_1"] = []
    if (not exclusion_keywords_found):
        exclusion_check["exclusion_1"] = False
        exclusion_keywords["exclusion_1"] = []

    ##---------------##
    ## The Easy Part ##
    ##---------------##

    ## get meta data on the articles
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        informations = read(handle)
        stuff = informations[u'PubmedArticle'][0]
        ## get date from the history attribute, select
        ## the date of acceptation.
        date = stuff[u'PubmedData']["History"][1]
        month = date[u'Month']
        day = date[u'Day']
        year = date[u'Year']
        ## get the name of the review
        journal_name = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'MedlineJournalInfo'][u'MedlineTA']
        ## get the keywords for the articles
        ## the format is a bit strange, may have to be careful
        ## with this data (mix of strings and unicode elements)
        keywords_list = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'KeywordList']
        ## Get the author's conflict of interest,
        ## because we can.
        try:
            conflict_of_interest = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'CoiStatement']
        except:
            conflict_of_interest = "NA"
        ## Get title of the article
        article_title = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'Article'][u'ArticleTitle']
        ## Get language of the article
        article_language = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'Article'][u'Language'][0]
        ## Get country of publication
        country = stuff[u'MedlineCitation'][u'MedlineJournalInfo'][u'Country']
    except:
        return (False, False, False)

    ##----------------##
    ## The Smart Part ##
    ##----------------##
    ## run further analysis on the abstract using nltk
    ##
    ## WORKING ON EXCLUSION LIST
    ##

    ## fetch the abstract and convert it to
    ## a nltk text object.
    abstract_file_name = "abstract/" + str(pmid) + "_abstract.txt"
    abstract = fetch_abstract(pmid)
    if (abstract):
        save_abstract(abstract, abstract_file_name)
        abstract_text = load_text(abstract_file_name)
        ## Play with tokenization and chunking
        ## Get all the common names in the abstract
        names_found_in_abstract = []
        try:
            tokens = nltk.word_tokenize(abstract.encode('utf8'))
            tagged = nltk.pos_tag(tokens)
            entities = nltk.chunk.ne_chunk(tagged)
        except:
            print "[WARNINGS] => can't perform nlp operation"
            entities = []
        for item in entities:
            try:
                if (item[1] in ["NN", "NNS", "NNP"]):
                    if (item[0] not in names_found_in_abstract):
                        names_found_in_abstract.append(item[0])
            except:
                ## Something went wrong
                choucroute = True
        ## Check validation list
        for item in names_found_in_abstract:
            for key in validation_keywords.keys():
                keywords_validation_list = validation_keywords[key]
                if (item in keywords_validation_list):
                    validation_check[key] = True
        ## Check exclusion list
        for item in names_found_in_abstract:
            for key in exclusion_keywords.keys():
                exclusion_validation_list = exclusion_keywords[key]
                if (item in exclusion_validation_list):
                    exclusion_check[key] = True
        ## Check if it is a case report
        if (case_report_only):
            print "[DEBUG] => Case report only"
            if (article_is_a_case_report(abstract_file_name)):
                case_report_check = True

    ##--------------##
    ## PASS OR FAIL ##
    ##--------------##

    ## General check phase
    easy_check_passed = False
    smart_check_passed = True

    ## Basic check on meta data
    ## - check date
    if (int(year) < int(oldest_year_authorized)):
        check_date = False
    ## - check language
    if (article_language not in authorized_languages):
        check_language = False

    ## Easy Filter
    if (check_date and check_language):
        easy_check_passed = True

    ## Complex filter (inclusion)
    if (False in validation_check.values()):
        smart_check_passed = False

    ## Complex filter (exclusion)
    if (True in exclusion_check.values()):
        smart_check_passed = False

    ## Case report filter
    if (case_report_only and case_report_check):
        print "[DEBUG] => EXCLUDED"
        smart_check_passed = False

    ## Global check
    if (easy_check_passed and smart_check_passed):
        valid_article = True

    ##-------------##
    ## SAVING DATA ##
    ##-------------##

    ## Write and delete files
    if (valid_article):
        ## Save meta data in a text file
        ## for further use
        title_line = u'>Title;' + unicode(article_title) + u"\n"
        date_line = u'>Date;' + unicode(day) + u"/" + unicode(month) + u"/" + unicode(year) + u"\n"
        #date_line = '>Date;'+str(day.encode('utf8'))+"/"+str(month.encode(utf8))+"/"+str(year.encode("utf8"))+"\n"
        journal_line = u">Journal;" + unicode(journal_name) + u"\n"
        country_line = u">Country;" + unicode(country) + u"\n"
        conflict_of_interest_line = u">Conflict;" + unicode(conflict_of_interest) + u"\n"
        meta_data = open("meta/" + str(pmid) + ".csv", "w")
        meta_data.write(title_line.encode('utf8'))
        meta_data.write(date_line.encode('utf8'))
        meta_data.write(journal_line.encode('utf8'))
        meta_data.write(country_line.encode('utf8'))
        meta_data.write(conflict_of_interest_line.encode('utf8'))
        meta_data.close()
    else:
        ## Delete the abstract
        try:
            if (abstract):
                os.remove(abstract_file_name)
        except:
            print "[WARNING] => can't delete " + str(abstract_file_name)

    ##------------------##
    ## RETURN SOMETHING ##
    ##------------------##

    ## return True if the article passes the
    ## evaluation, else False.
    return (valid_article, easy_check_passed, smart_check_passed)
def parse_pubmed_ids(cls, pub_id, retmode='xml'):
    # get xml from pubmed
    handle = efetch(db='pubmed', id=pub_id, retmode=retmode)
    xml_string = handle.read()
    xml = ET.fromstring(xml_string)
    # check for keywords and MeshTerms
    keys = []
    for art in xml.getchildren():
        # authors
        auth = zip(art.findall('.//Author/ForeName'),
                   art.findall('.//Author/LastName'))
        auth = ';'.join([' '.join([i.text, j.text]) for (i, j) in auth])
        # affiliations
        email_re = re.compile('Electronic address.*\.')
        aff = ';'.join(
            np.unique([
                email_re.sub('', aff.text).strip(string.punctuation + ' ')
                for aff in art.findall('.//Affiliation')
            ]))
        # publication date
        pub_date = '-'.join([
            x.text for x in [
                art.find('.//PubMedPubDate[@PubStatus="entrez"]/Year'),
                art.find('.//PubMedPubDate[@PubStatus="entrez"]/Month'),
                art.find('.//PubMedPubDate[@PubStatus="entrez"]/Day')
            ]
        ])
        if pub_date == '':
            pub_date = '1900-01-01'
        pubmed_id = ''.join(
            art.xpath('.//PubmedData/ArticleIdList/ArticleId[@IdType="pubmed"]/text()'))
        doi = ''.join(
            art.xpath('.//PubmedData/ArticleIdList/ArticleId[@IdType="doi"]/text()'))
        pmcid = ''.join(
            art.xpath('.//PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]/text()'))
        title = ''.join(art.xpath('.//ArticleTitle/text()')).strip()
        abstract = ''.join(art.xpath('.//AbstractText/text()'))
        journal = ''.join(art.xpath('.//Journal/ISOAbbreviation/text()'))
        pubmed_class = ','.join(art.xpath('.//PublicationType/text()'))
        keys.append((
            pubmed_id, title, auth, aff, pub_date, abstract, doi, pmcid,
            journal, pubmed_class,
            art.xpath('MedlineCitation/KeywordList/*/text()'),
            art.xpath('MedlineCitation/MeshHeadingList/MeshHeading/*[@MajorTopicYN="Y"]/../DescriptorName/text()'),
            art.xpath('MedlineCitation/MeshHeadingList/MeshHeading/*[@MajorTopicYN="Y"]/../QualifierName/text()')
        ))
    # df = pd.DataFrame(keys, columns=['pubmed_id', 'pub_date', 'keywords', 'mesh_descriptors', 'mesh_qualifiers'])
    # df.pub_date = pd.to_datetime(df.pub_date)
    gc.collect()
    return keys, xml_string
def evaluate_article(pmid):
    ##
    ## [IN PROGRESS]
    ##
    ## -> Test if the abstract is cool
    ## -> return true or false
    ##

    ##------------------------##
    ## Parameters for filters ##
    ##------------------------##
    oldest_year_authorized = 2008
    authorized_languages = [u'eng']
    valid_article = False
    check_date = True
    check_language = True
    validation_check_keywords_1 = False
    validation_check_keywords_2 = False

    ##---------------##
    ## The Easy Part ##
    ##---------------##

    ## get meta data on the articles
    try:
        handle = efetch(db='pubmed',
                        id=pmid,
                        retmode='xml',
                        )
        informations = read(handle)
        stuff = informations[u'PubmedArticle'][0]
        ## get date from the history attribute, select
        ## the date of acceptation.
        date = stuff[u'PubmedData']["History"][1]
        month = date[u'Month']
        day = date[u'Day']
        year = date[u'Year']
        ## get the name of the review
        journal_name = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'MedlineJournalInfo'][u'MedlineTA']
        ## get the keywords for the articles
        ## the format is a bit strange, may have to be careful
        ## with this data (mix of strings and unicode elements)
        keywords_list = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'KeywordList']
        ## Get the author's conflict of interest,
        ## because we can.
        try:
            conflict_of_interest = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'CoiStatement']
        except:
            conflict_of_interest = "NA"
        ## Get title of the article
        article_title = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'Article'][u'ArticleTitle']
        ## Get language of the article
        article_language = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'Article'][u'Language'][0]
    except:
        return (False, False, False)

    ##----------------##
    ## The Smart Part ##
    ##----------------##
    ## run further analysis on the abstract using nltk

    ## fetch the abstract and convert it to
    ## a nltk text object.
    abstract_file_name = "abstract/" + str(pmid) + "_abstract.txt"
    abstract = fetch_abstract(pmid)
    if (abstract):
        save_abstract(abstract, abstract_file_name)
        abstract_text = load_text(abstract_file_name)
        ## Play with tokenization and chunking
        ## Get all the common names in the abstract
        names_found_in_abstract = []
        try:
            tokens = nltk.word_tokenize(abstract.encode('utf8'))
            tagged = nltk.pos_tag(tokens)
            entities = nltk.chunk.ne_chunk(tagged)
        except:
            print "[WARNINGS] => can't perform nlp operation"
            entities = []
        for item in entities:
            try:
                if (item[1] in ["NN", "NNS", "NNP"]):
                    if (item[0] not in names_found_in_abstract):
                        names_found_in_abstract.append(item[0])
            except:
                ## Something went wrong
                choucroute = True
        ## -> Biology keywords check
        ## -> Artificial intelligence keywords check
        IA_keywords = ["algorithm", "machine" "learning", "neural", "network",
                       "statistic", "deep", "classification", "model"]
        Clinical_keywords = ["Sjogren", "sjogren", "lupus", "autoimmunity",
                             "rhumatoid", "arthrisis", "RA", "SjS", "SLE"]
        for item in names_found_in_abstract:
            if (item in IA_keywords):
                validation_check_keywords_1 = True
            if (item in Clinical_keywords):
                validation_check_keywords_2 = True

    ##--------------##
    ## PASS OR FAIL ##
    ##--------------##

    ## General check phase
    easy_check_passed = False
    smart_check_passed = False

    ## Basic check on meta data
    ## - check date
    if (int(year) < int(oldest_year_authorized)):
        check_date = False
    ## - check language
    if (article_language not in authorized_languages):
        check_language = False

    ## Easy Filter
    if (check_date and check_language):
        easy_check_passed = True

    ## Complex filter
    if (validation_check_keywords_1 and validation_check_keywords_2):
        smart_check_passed = True

    ## Global check
    if (easy_check_passed and smart_check_passed):
        valid_article = True

    ##-------------##
    ## SAVING DATA ##
    ##-------------##

    ## Write and delete files
    if (valid_article):
        ## Save meta data in a text file
        ## for further use
        title_line = u'>Title;' + unicode(article_title) + u"\n"
        date_line = u'>Date;' + unicode(day) + u"/" + unicode(month) + u"/" + unicode(year) + u"\n"
        journal_line = u">Journal;" + unicode(journal_name) + u"\n"
        conflict_of_interest_line = u">Conflict;" + unicode(conflict_of_interest) + u"\n"
        meta_data = open("meta/" + str(pmid) + ".csv", "w")
        meta_data.write(title_line.encode('utf8'))
        meta_data.write(date_line.encode('utf8'))
        meta_data.write(journal_line.encode('utf8'))
        meta_data.write(conflict_of_interest_line.encode('utf8'))
        meta_data.close()
    else:
        ## Delete the abstract
        try:
            if (abstract):
                os.remove(abstract_file_name)
        except:
            print "[WARNING] => can't delete " + str(abstract_file_name)

    ##------------------##
    ## RETURN SOMETHING ##
    ##------------------##

    ## return True if the article passes the
    ## evaluation, else False.
    return (valid_article, easy_check_passed, smart_check_passed)