def fetch_abstract(pmid):
    # NOTE: despite its name, this variant returns the full parsed
    # record rather than just the abstract.
    print(pmid)
    Entrez.email = '*****@*****.**'
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)
    return xml_data
def fetch_abstract(pmid):
    ##
    ## Return the abstract of a given article using its pmid.
    ##
    ## => Return None when the abstract can't be retrieved
    ##    (can happen when the article is in Chinese).
    ##
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        xml_data = read(handle)
        xml_data = xml_data['PubmedArticle'][0]
    except Exception:
        return None
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        return None
    except Exception:
        return None
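These snippets assume module-level imports from Biopython and a registered contact e-mail. A minimal usage sketch for fetch_abstract (the e-mail address is a placeholder; the PMID is the Biopython paper, used here only as an example):

from Bio import Entrez
from Bio.Entrez import efetch, read

Entrez.email = "you@example.org"  # placeholder: NCBI requires a real contact address

abstract = fetch_abstract("19304878")  # example PMID
if abstract is not None:
    print(abstract[:200])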
def author(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)['PubmedArticle'][0]
    data = xml_data['MedlineCitation']['Article']['AuthorList']
    author_list = []
    for author in data:
        name = author['ForeName'] + ' ' + author['LastName']
        aff_info = author['AffiliationInfo']
        if aff_info:
            aff = aff_info[0]['Affiliation']
        else:
            aff = ''
        author_list.append('#Name ' + name)
        if aff:
            author_list.append('#Affiliation ' + aff)
    author_str = ' '.join(author_list)
    return author_str
def specialization(author, affiliation):
    # import libraries
    import wikipedia
    import re
    from Bio.Entrez import efetch, read

    author = '"' + author + '"'

    # Find IDs for doctor + affiliation
    ids = []
    results = search('%s[AUTH] AND %s[AFFL]' % (author, affiliation))['IdList']
    for i in results:
        ids.append(i)
    num_paper = len(ids)

    # get abstracts from the list of IDs
    query_abstracts = ''
    keywords = []
    query_keywords = ''
    query_title = ''
    for i in ids:
        xml_data = read(efetch(db='pubmed', id=i, retmode='xml'))
        try:
            abstract = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
            query_abstracts = query_abstracts + str(abstract) + ' '
        except Exception:
            print('Paper with ID: ' + i + ' has no abstract')

        # get keywords from the IDs
        if xml_data['PubmedArticle'][0]['MedlineCitation']['KeywordList'] != []:
            for x in xml_data['PubmedArticle'][0]['MedlineCitation']['KeywordList'][0]:
                keywords.append(str(re.sub("[^a-zA-Z]", " ", x)))
                query_keywords = query_keywords + x + ' '

        # get paper title from the IDs
        try:
            query_title = query_title + ' ' + xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        except Exception:
            print('Paper with ID: ' + i + ' has no title')

    # get the first sentence of each keyword's wiki page
    query_wiki = ''
    for keyword in keywords:
        try:
            page = wikipedia.summary(keyword, sentences=1)
            query_wiki = query_wiki + ' ' + str(re.sub("[^a-zA-Z]", " ", page))
        except Exception:
            print('Disambiguation error for keyword: ' + keyword + ', action: keyword excluded')

    # find specialism
    corpus = query_abstracts + ' ' + query_keywords + ' ' + query_wiki + ' ' + query_title
    specialization = str(spec_search(corpus))
    if num_paper == 0:
        print('no papers found')
        specialization = []
    else:
        print('this doctor is specialized in: ' + specialization)
    return specialization
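specialization depends on two helpers the snippet never defines: search and spec_search. A plausible sketch of the search half, assuming it simply wraps Entrez.esearch and returns the parsed record (the caller only reads its 'IdList' entry); the retmax default is illustrative:

from Bio.Entrez import esearch, read

def search(query, retmax=100):
    # Hypothetical helper: run an Entrez search against PubMed and
    # return the parsed result record (which contains 'IdList').
    handle = esearch(db='pubmed', term=query, retmax=retmax)
    return read(handle)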
def fetch_abstract(self, pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml',
                    email='*****@*****.**', retmax=1000)
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        return None
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        # articles without an Abstract section raise KeyError
        return None
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)
    try:
        article = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        title = article['ArticleTitle']
        return abstract, title
    except (IndexError, KeyError):
        return None
def fetch_abstract(pmid): """Pass in an article id.""" pmid = str(pmid) try: handle = efetch(db='pubmed', id=pmid, retmode='xml') xml_data = read(handle)[0] article = xml_data['MedlineCitation']['Article'] abstract = article['Abstract']['AbstractText'][0] return abstract except Exception as e: return '{}: {}'.format(e.__class__.__name__, e)
def fetch_abstract(pmid): """Pass in an article id.""" pmid = str(pmid) try: handle = efetch(db='pubmed', id=pmid, retmode='xml') xml_data = read(handle)[0] article = xml_data['MedlineCitation']['Article'] abstract = article['Abstract']['AbstractText'][0] return abstract except Exception as e : return '{}: {}'.format(e.__class__.__name__, e)
def search(self, search_term, retmax=2000):
    handle = esearch(db=self.db, term=search_term, retmax=retmax)
    self.search_result = read(handle)
    self.ids = [str(i) for i in self.search_result.get('IdList')]
    print('No. of results: ', len(self.ids))
    if len(self.ids) == retmax:
        print('###! There might be more results available !###')
    return self
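A usage sketch for this method, assuming a small wrapper class that holds the target database name (the PubMedClient name and the query are illustrative):

from Bio import Entrez
from Bio.Entrez import esearch, read

Entrez.email = "you@example.org"  # placeholder

class PubMedClient:
    def __init__(self, db='pubmed'):
        self.db = db

PubMedClient.search = search  # attach the method defined above

client = PubMedClient()
client.search('lupus machine learning', retmax=100)
print(client.ids[:5])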
def fetch_abstract(pmid):
    Entrez.email = "*****@*****.**"
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        # articles without an Abstract section raise KeyError
        return None
def get_mesh_from_pmid(self, user): Entrez.email = user.email handle = efetch(db="pubmed", id=str(self.pmid), retmode="xml") xml_data = read(handle)[0] # Skips articles without MeSH terms if u'MeshHeadingList' in xml_data["MedlineCitation"]: for mesh in xml_data["MedlineCitation"][u'MeshHeadingList']: major = "N" qualifiers = mesh[u'QualifierName'] if len(qualifiers) > 0: major = str(qualifiers[0].attributes.items()[0][1]) descr = mesh[u'DescriptorName'] name = descr.title()
def get_mesh_from_pmid(self, user): Entrez.email = user.email handle = efetch(db = "pubmed", id = str(self.pmid), retmode = "xml") xml_data = read(handle)[0] # Skips articles without MeSH terms if u'MeshHeadingList' in xml_data["MedlineCitation"]: for mesh in xml_data["MedlineCitation"][u'MeshHeadingList']: major = "N" qualifiers = mesh[u'QualifierName'] if len(qualifiers) > 0: major = str(qualifiers[0].attributes.items()[0][1]) descr = mesh[u'DescriptorName'] name = descr.title()
def fetch_article(pmid): """ Test function => Not working """ handle = efetch(db='pubmed', id=pmid, retmode='xml', ) xml_data = read(handle)[0] try: article = xml_data['MedlineCitation']['Article'] abstract = article['Abstract']['AbstractText'][0] return article except IndexError: return None
def fetch_abstract(pmids):
    pmid_str = ",".join(pmids)
    try:
        handle = efetch(db='pubmed', id=pmid_str, retmode='xml')
    except urllib.error.HTTPError:
        # retry once on a transient HTTP error
        handle = efetch(db='pubmed', id=pmid_str, retmode='xml')
    xml_data = read(handle)['PubmedArticle']
    try:
        articles = [rec['MedlineCitation'] for rec in xml_data]
    except KeyError:
        articles = None
    return articles
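Fetching several PMIDs in one efetch call keeps the request count down, which matters for NCBI's rate limits. A usage sketch for this batched variant (the PMIDs are arbitrary examples):

import urllib.error
from Bio import Entrez
from Bio.Entrez import efetch, read

Entrez.email = "you@example.org"  # placeholder

records = fetch_abstract(["19304878", "20003500"])  # example PMIDs
if records:
    for rec in records:
        print(rec['Article']['ArticleTitle'])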
def date(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)['PubmedArticle'][0]
    data = xml_data['MedlineCitation']['Article']['Journal']['JournalIssue'][
        'PubDate']
    if 'Day' in data:
        day = data['Day']
    else:
        day = ''
    return data['Year'] + ' ' + data['Month'] + ' ' + day
def get_mesh(pmid):
    # call PubMed API
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)[0]

    # skip articles without MeSH terms
    if 'MeshHeadingList' in xml_data['MedlineCitation']:
        for mesh in xml_data['MedlineCitation']['MeshHeadingList']:
            # grab the qualifier major/minor flag, if any
            major = 'N'
            qualifiers = mesh['QualifierName']
            if len(qualifiers) > 0:
                # .items() is not indexable in Python 3, hence list()
                major = str(list(qualifiers[0].attributes.items())[0][1])

            # grab descriptor name
            descr = mesh['DescriptorName']
            name = descr.title()

            yield (name, major)
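Because get_mesh is a generator, the (descriptor, major-topic flag) pairs can be consumed lazily, e.g.:

for name, major in get_mesh(19304878):  # example PMID
    print(major, name)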
def get_metadata_from_PMID(pmid):
    """Take an input PMID and parse the attributes of interest
    for the cytoscape plugin."""
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False
    try:
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCompleted'])
    except Exception:
        print("Date Completed not available??", pmid)
    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except Exception:
        print("Other ID not available??", pmid)
    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except Exception:
        print("Unable to get MeSH headings for", pmid)
    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output:
            print(xml_data)
        for author in article['AuthorList']:
            author_key = author['LastName'] + ',' + author['Initials']
            # global_author_list is a module-level dict counting
            # how many papers each author appears on
            if author_key in global_author_list:
                global_author_list[author_key] += 1
            else:
                global_author_list[author_key] = 1
    except IndexError:
        return None
    except Exception:
        print("unable to process", pmid)
        print("Unexpected error:", sys.exc_info()[0])
    try:
        abstract = article['Abstract']['AbstractText'][0]
    except Exception:
        print("Unable to get abstract for", pmid)
        print("Unexpected error:", sys.exc_info()[0])
def get_article_title(pmid):
    """
    Connect to the PubMed database and get the article title
    for the given pmid. Return "NA" on failure.
    """
    handle = efetch(
        db='pubmed',
        id=pmid,
        retmode='xml',
    )
    xml_data = read(handle)
    xml_data = xml_data['PubmedArticle'][0]
    try:
        title = xml_data['MedlineCitation']['Article']['ArticleTitle']
    except Exception:
        title = "NA"
    return title
def getDocument(self, pmid):
    # Return all associated attributes for an article (article title,
    # publication date, authors' names, citations, etc.) as a python
    # dictionary suitable for storage in mongodb.
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)[0]
    citation = xml_data['MedlineCitation']
    article = dict(
        id=pmid,
        Title=str(citation['Article']['ArticleTitle']),
        Abstract=str(self.safeAbstract(citation, 'Abstract')),
        DateCompleted="{}/{}/{}".format(
            self.safeDateCompleted(citation, 'DateCompleted', 'Day'),
            self.safeDateCompleted(citation, 'DateCompleted', 'Month'),
            self.safeDateCompleted(citation, 'DateCompleted', 'Year'),
        ),
        DateRevised="{}/{}/{}".format(
            self.safeDateCompleted(citation, 'DateRevised', 'Day'),
            self.safeDateCompleted(citation, 'DateRevised', 'Month'),
            self.safeDateCompleted(citation, 'DateRevised', 'Year'),
        ),
    )
    return (xml_data, article)
def fetch_article(pmid): """ Test function => Not working """ handle = efetch(db='pubmed', id=pmid, retmode='xml', ) informations = read(handle) stuff = informations[u'PubmedArticle'][0] date = stuff[u'PubmedData']["History"][1] month = date[u'Month'] day = date[u'Day'] year = date[u'Year'] print month print day print year return "choucroute"
def fetch_abstract(pmid): """ Retrun abstract of a given article using pmid => Return None when pmid can't be return (can happen when article is in chinese) """ handle = efetch(db='pubmed', id=pmid, retmode='xml', ) xml_data = read(handle) xml_data = xml_data['PubmedArticle'][0] try: article = xml_data['MedlineCitation']['Article'] abstract = article['Abstract']['AbstractText'][0] return abstract except IndexError: return None except KeyError: return None
def get_country_publication_stat(run_folder):
    ##
    ## Get the publication stats per country.
    ## Get the list of pmid retrieved from the
    ## meta folder, connect to the NCBI to fetch
    ## publication information, and parse it to get the
    ## country of publication.
    ##
    ## Return a dictionary.
    ##

    ## init structure
    country_to_count = {}

    ## get list of PMID to process
    meta_file_list = glob.glob(run_folder + "/meta/*.csv")
    for meta_file in meta_file_list:
        file_name = meta_file.split("/")[-1]
        pmid = file_name.split(".")[0]

        ## get country of publication
        try:
            handle = efetch(db='pubmed',
                            id=pmid,
                            retmode='xml',
                            )
            informations = read(handle)
            stuff = informations['PubmedArticle'][0]
            country = stuff['MedlineCitation']['MedlineJournalInfo']['Country']
        except Exception:
            country = "NA"

        ## fill dictionary
        if country not in country_to_count:
            country_to_count[country] = 1
        else:
            country_to_count[country] += 1

    return country_to_count
def date(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)['PubmedArticle']
    if xml_data:
        xml_list = xml_data[0]
        data = xml_list['MedlineCitation']['Article']['Journal'][
            'JournalIssue']['PubDate']
        if 'Day' in data:
            day = data['Day']
        else:
            day = ''
        if 'Month' in data:
            month = data['Month'] + ' '
        else:
            month = ''
        if 'Year' in data:
            year = data['Year'] + ' '
        else:
            year = ''
        return year + month + day
    else:
        return ''
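A quick usage sketch for this more defensive date variant; the returned string depends on which of Year/Month/Day the record actually carries:

print(date("19304878"))  # e.g. "2009 Jun 8" for a fully dated record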
def fetch_article(pmid): """ Test function => Not working """ handle = efetch( db='pubmed', id=pmid, retmode='xml', ) informations = read(handle) stuff = informations[u'PubmedArticle'][0] date = stuff[u'PubmedData']["History"][1] month = date[u'Month'] day = date[u'Day'] year = date[u'Year'] print month print day print year return "choucroute"
ifsituationlist = []
ifyearlist = []
sumimpactfactor = 0
repeatcount = 0
for ID in IdList:
    allIdLink.append("https://pubmed.ncbi.nlm.nih.gov/" + ID + "/")
    repeatcount += 1
    if repeatcount == 1:
        pass
    else:
        allgenename.append("")
    issn = []
    handle = efetch(db="pubmed", id=ID, retmode="xml")
    xml_data = read(handle)
    articledata = xml_data["PubmedArticle"][0]["MedlineCitation"]["Article"]
    try:
        titledata = articledata["ArticleTitle"]
    except Exception:
        titledata = []
        print("False:titledata")
    strtitledata = str(titledata)
    titlelist.append(strtitledata)
    try:
        yeardata = articledata["Journal"]["JournalIssue"]["PubDate"][
            "MedlineDate"]
    except Exception:
def evaluate_article(pmid):
    ##
    ## [IN PROGRESS]
    ##
    ## -> Test if the abstract is cool
    ## -> return true or false
    ##

    ##------------------------##
    ## Parameters for filters ##
    ##------------------------##
    oldest_year_authorized = 2008
    authorized_languages = ['eng']
    valid_article = False
    check_date = True
    check_language = True
    validation_check_keywords_1 = False
    validation_check_keywords_2 = False

    ##---------------##
    ## The Easy Part ##
    ##---------------##
    ## get meta data on the article
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        informations = read(handle)
        stuff = informations['PubmedArticle'][0]

        ## get date from the history attribute, select
        ## the date of acceptance.
        date = stuff['PubmedData']["History"][1]
        month = date['Month']
        day = date['Day']
        year = date['Year']

        ## get the name of the review
        journal_name = informations['PubmedArticle'][0]['MedlineCitation'][
            'MedlineJournalInfo']['MedlineTA']

        ## get the keywords for the article
        ## the format is a bit strange, may have to be careful
        ## with this data (mix of strings and unicode elements)
        keywords_list = informations['PubmedArticle'][0]['MedlineCitation'][
            'KeywordList']

        ## Get the authors' conflict of interest,
        ## because we can.
        try:
            conflict_of_interest = informations['PubmedArticle'][0][
                'MedlineCitation']['CoiStatement']
        except Exception:
            conflict_of_interest = "NA"

        ## Get title of the article
        article_title = informations['PubmedArticle'][0]['MedlineCitation'][
            'Article']['ArticleTitle']

        ## Get language of the article
        article_language = informations['PubmedArticle'][0][
            'MedlineCitation']['Article']['Language'][0]
    except Exception:
        return (False, False, False)

    ##----------------##
    ## The Smart Part ##
    ##----------------##
    ## run further analysis on the abstract using nltk
    ## fetch the abstract and convert it to
    ## a nltk text object.
abstract_file_name = "abstract/" + str(pmid) + "_abstract.txt" abstract = fetch_abstract(pmid) if (abstract): save_abstract(abstract, abstract_file_name) abstract_text = load_text(abstract_file_name) ## Play with tokenization and chunking ## Get all the commun names in the abstract names_found_in_abstract = [] try: tokens = nltk.word_tokenize(abstract.encode('utf8')) tagged = nltk.pos_tag(tokens) entities = nltk.chunk.ne_chunk(tagged) except: print "[WARNINGS] => can't perform nlp operation" entities = [] for item in entities: try: if (item[1] in ["NN", "NNS", "NNP"]): if (item[0] not in names_found_in_abstract): names_found_in_abstract.append(item[0]) except: ## Somethig went wrong choucroute = True ## -> Biology keywords check ## -> Artificial intelligence keywords check IA_keywords = [ "algorithm", "machine" "learning", "neural", "network", "statistic", "deep", "classification", "model" ] Clinical_keywords = [ "Sjogren", "sjogren", "lupus", "autoimmunity", "rhumatoid", "arthrisis", "RA", "SjS", "SLE" ] for item in names_found_in_abstract: if (item in IA_keywords): validation_check_keywords_1 = True if (item in Clinical_keywords): validation_check_keywords_2 = True ##--------------## ## PASS OR FAIL ## ##--------------## ## General check phase easy_check_passed = False smart_check_passed = False ## Basic check on meta data ## - check date if (int(year) < int(oldest_year_authorized)): check_date = False ## - check language if (article_language not in authorized_languages): check_language = False ## Easy Filter if (check_date and check_language): easy_check_passed = True ## Complex filter if (validation_check_keywords_1 and validation_check_keywords_2): smart_check_passed = True ## Global check if (easy_check_passed and smart_check_passed): valid_article = True ##-------------## ## SAVING DATA ## ##-------------## ## Write and delete files if (valid_article): ## Save meta data in a text file ## for further use title_line = u'>Title;' + unicode(article_title) + u"\n" date_line = u'>Date;' + unicode(day) + u"/" + unicode( month) + u"/" + unicode(year) + u"\n" journal_line = u">Journal;" + unicode(journal_name) + u"\n" conflict_of_interest_line = u">Conflict;" + unicode( conflict_of_interest) + u"\n" meta_data = open("meta/" + str(pmid) + ".csv", "w") meta_data.write(title_line.encode('utf8')) meta_data.write(date_line.encode('utf8')) meta_data.write(journal_line.encode('utf8')) meta_data.write(conflict_of_interest_line.encode('utf8')) meta_data.close() else: ## Delete the abstract try: if (abstract): os.remove(abstract_file_name) except: print "[WARNING] => can't delete " + str(abstract_file_name) ##------------------## ## RETURN SOMETHING ## ##------------------## ## return True if the article pass the ## evaluation, else False. return (valid_article, easy_check_passed, smart_check_passed)
def get_metadata_from_PMID(pmid, output_errors=False, dump_xml=False):
    """Take an input PMID and parse the attributes of interest
    for the cytoscape plugin."""
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False
    author_affiliation_list = []
    cur_paper_author_list = []
    try:
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCompleted'])
    except Exception:
        print("Date Completed not available??", pmid)
        ## Fall back on DateCreated -- the difference between date
        ## completed and date created doesn't really matter here.
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCreated'])
    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except Exception:
        print("Other ID not available??", pmid)
    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except Exception:
        print("Unable to get MeSH headings for", pmid)
        if output_errors:
            # fp_error is a module-level error-log file handle
            fp_error.write('MESH NOT AVAILABLE:\n' + str(xml_data) + '\n\n')
    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output:
            print(xml_data)
        for author in article['AuthorList']:
            if 'LastName' in author:
                author_key = author['LastName'] + ',' + author['Initials']
                cur_paper_author_list.append(author_key)
            elif 'CollectiveName' in author:
                print("FOUND A COLLECTION EXAMPLE", author)
            if 'Affiliation' in author:
                author_affil = author['Affiliation']
                author_affiliation_list.append((author, author_affil))
    except NameError as e:
        print(e)
    except IndexError:
        return None
    except Exception:
        print("unable to process article tag", pmid)
        print("Unexpected error parsing author string:", sys.exc_info()[0])
        if output_errors:
            fp_error.write('Article NOT AVAILABLE\n' + str(xml_data) + '\n\n')
        print(author)
    try:
        abstract = article['Abstract']['AbstractText'][0]
    except Exception:
        print("Unable to get abstract for", pmid)
    if dump_xml:
        print(xml_data)
        return xml_data
    else:
        return {
            'auth_list': cur_paper_author_list,
            'affiliations': author_affiliation_list,
            'publication_date': date_completed,
        }
abstract_file_name = "abstract/"+str(pmid)+"_abstract.txt" abstract = fetch_abstract(pmid) if(abstract): save_abstract(abstract, abstract_file_name) abstract_text = load_text(abstract_file_name) ## Play with tokenization and chunking ## Get all the commun names in the abstract names_found_in_abstract = [] try: tokens = nltk.word_tokenize(abstract.encode('utf8')) tagged = nltk.pos_tag(tokens) entities = nltk.chunk.ne_chunk(tagged) except: print "[WARNINGS] => can't perform nlp operation" entities = [] for item in entities: try: if(item[1] in ["NN", "NNS", "NNP"]): if(item[0] not in names_found_in_abstract): names_found_in_abstract.append(item[0]) except: ## Somethig went wrong choucroute = True ## -> Biology keywords check ## -> Artificial intelligence keywords check IA_keywords = ["algorithm", "machine" "learning", "neural", "network", "statistic", "deep", "classification", "model"] Clinical_keywords = ["Sjogren" ,"sjogren", "lupus", "autoimmunity", "rhumatoid", "arthrisis", "RA", "SjS", "SLE"] for item in names_found_in_abstract: if(item in IA_keywords): validation_check_keywords_1 = True if(item in Clinical_keywords): validation_check_keywords_2 = True ##--------------## ## PASS OR FAIL ## ##--------------## ## General check phase easy_check_passed = False smart_check_passed = False ## Basic check on meta data ## - check date if(int(year) < int(oldest_year_authorized)): check_date = False ## - check language if(article_language not in authorized_languages): check_language = False ## Easy Filter if(check_date and check_language): easy_check_passed = True ## Complex filter if(validation_check_keywords_1 and validation_check_keywords_2): smart_check_passed = True ## Global check if(easy_check_passed and smart_check_passed): valid_article = True ##-------------## ## SAVING DATA ## ##-------------## ## Write and delete files if(valid_article): ## Save meta data in a text file ## for further use title_line = u'>Title;'+unicode(article_title)+u"\n" date_line = u'>Date;'+unicode(day)+u"/"+unicode(month)+u"/"+unicode(year)+u"\n" journal_line = u">Journal;"+unicode(journal_name)+u"\n" conflict_of_interest_line = u">Conflict;"+unicode(conflict_of_interest)+u"\n" meta_data = open("meta/"+str(pmid)+".csv", "w") meta_data.write(title_line.encode('utf8')) meta_data.write(date_line.encode('utf8')) meta_data.write(journal_line.encode('utf8')) meta_data.write(conflict_of_interest_line.encode('utf8')) meta_data.close() else: ## Delete the abstract try: if(abstract): os.remove(abstract_file_name) except: print "[WARNING] => can't delete "+str(abstract_file_name) ##------------------## ## RETURN SOMETHING ## ##------------------## ## return True if the article pass the ## evaluation, else False. return (valid_article, easy_check_passed, smart_check_passed)
def evaluate_article(pmid):
    ##
    ## [IN PROGRESS]
    ##
    ## -> Test if the abstract is cool
    ## -> return true or false
    ##
    ## TODO : write doc
    ##

    ##------------------------##
    ## Parameters for filters ##
    ##------------------------##

    ## initialize parameters
    oldest_year_authorized = "NA"
    case_report_only = False
    case_report_check = False
    authorized_languages = []
    valid_article = False
    check_date = True
    check_language = True
    validation_check = {}
    validation_keywords = {}
    exclusion_check = {}
    exclusion_keywords = {}
    exclusion_keywords_found = False

    ## test if config file exists
    if os.path.isfile("config.conf"):
        config_data = open("config.conf", "r")
        validation_keywords_cmpt = 0
        exclusion_keywords_cmpt = 0
        for line in config_data:
            line = line.replace("\n", "")
            line_in_array = line.split(";")
            if line_in_array[0] == "min year":
                oldest_year_authorized = line_in_array[1]
            elif line_in_array[0] == "authorized languages":
                languages_list = line_in_array[1].split(",")
                for elt in languages_list:
                    authorized_languages.append(elt)
            elif line_in_array[0] == "validation keywords":
                validation_keywords_cmpt += 1
                validation_check["keywords_" + str(validation_keywords_cmpt)] = False
                validation_keywords["keywords_" + str(validation_keywords_cmpt)] = []
                keywords_list = line_in_array[1].split(",")
                for elt in keywords_list:
                    if elt not in validation_keywords["keywords_" + str(validation_keywords_cmpt)]:
                        validation_keywords["keywords_" + str(validation_keywords_cmpt)].append(str(elt))
            ## Retrieve exclusion list
            elif line_in_array[0] == "exclusion keywords":
                exclusion_keywords_found = True
                exclusion_keywords_cmpt += 1
                exclusion_check["exclusion_" + str(exclusion_keywords_cmpt)] = False
                exclusion_keywords["exclusion_" + str(exclusion_keywords_cmpt)] = []
                keywords_list = line_in_array[1].split(",")
                for elt in keywords_list:
                    if elt not in exclusion_keywords["exclusion_" + str(exclusion_keywords_cmpt)]:
                        exclusion_keywords["exclusion_" + str(exclusion_keywords_cmpt)].append(str(elt))
            ## case report only option
            ## if nothing is set, default is False
            elif (line_in_array[0] == "case report only"
                  and str(line_in_array[1]) == "True"):
                case_report_only = True
        config_data.close()

    ## default configuration
    else:
        oldest_year_authorized = 2008
        authorized_languages = ['eng']
        validation_check["keywords_1"] = False
        validation_check["keywords_2"] = False
        validation_keywords["keywords_1"] = [
            "algorithm", "machine", "learning", "neural", "network",
            "statistic", "deep", "classification", "model"
        ]
        validation_keywords["keywords_2"] = [
            "Sjogren", "sjogren", "lupus", "autoimmunity", "rheumatoid",
            "arthritis", "RA", "SjS", "SLE"
        ]
        exclusion_check["exclusion_1"] = False
        exclusion_keywords["exclusion_1"] = []

    if not exclusion_keywords_found:
        exclusion_check["exclusion_1"] = False
        exclusion_keywords["exclusion_1"] = []

    ##---------------##
    ## The Easy Part ##
    ##---------------##
    ## get meta data on the article
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        informations = read(handle)
        stuff = informations['PubmedArticle'][0]

        ## get date from the history attribute, select
        ## the date of acceptance.
        date = stuff['PubmedData']["History"][1]
        month = date['Month']
        day = date['Day']
        year = date['Year']

        ## get the name of the review
        journal_name = informations['PubmedArticle'][0]['MedlineCitation'][
            'MedlineJournalInfo']['MedlineTA']

        ## get the keywords for the article
        ## the format is a bit strange, may have to be careful
        ## with this data (mix of strings and unicode elements)
        keywords_list = informations['PubmedArticle'][0]['MedlineCitation'][
            'KeywordList']

        ## Get the authors' conflict of interest,
        ## because we can.
        try:
            conflict_of_interest = informations['PubmedArticle'][0][
                'MedlineCitation']['CoiStatement']
        except Exception:
            conflict_of_interest = "NA"

        ## Get title of the article
        article_title = informations['PubmedArticle'][0]['MedlineCitation'][
            'Article']['ArticleTitle']

        ## Get language of the article
        article_language = informations['PubmedArticle'][0][
            'MedlineCitation']['Article']['Language'][0]

        ## Get country of publication
        country = stuff['MedlineCitation']['MedlineJournalInfo']['Country']
    except Exception:
        return (False, False, False)

    ##----------------##
    ## The Smart Part ##
    ##----------------##
    ## run further analysis on the abstract using nltk
    ##
    ## WORKING ON EXCLUSION LIST
    ##

    ## fetch the abstract and convert it to
    ## a nltk text object.
    abstract_file_name = "abstract/" + str(pmid) + "_abstract.txt"
    abstract = fetch_abstract(pmid)
    if abstract:
        save_abstract(abstract, abstract_file_name)
        abstract_text = load_text(abstract_file_name)

    ## Play with tokenization and chunking
    ## Get all the common nouns in the abstract
    names_found_in_abstract = []
    try:
        tokens = nltk.word_tokenize(abstract)
        tagged = nltk.pos_tag(tokens)
        entities = nltk.chunk.ne_chunk(tagged)
    except Exception:
        print("[WARNINGS] => can't perform nlp operation")
        entities = []
    for item in entities:
        try:
            if item[1] in ["NN", "NNS", "NNP"]:
                if item[0] not in names_found_in_abstract:
                    names_found_in_abstract.append(item[0])
        except Exception:
            ## Something went wrong
            choucroute = True

    ## Check validation list
    for item in names_found_in_abstract:
        for key in validation_keywords.keys():
            keywords_validation_list = validation_keywords[key]
            if item in keywords_validation_list:
                validation_check[key] = True

    ## Check exclusion list
    for item in names_found_in_abstract:
        for key in exclusion_keywords.keys():
            exclusion_validation_list = exclusion_keywords[key]
            if item in exclusion_validation_list:
                exclusion_check[key] = True

    ## Check if this is a case report
    if case_report_only:
        print("[DEBUG] => Case report only")
        if article_is_a_case_report(abstract_file_name):
            case_report_check = True

    ##--------------##
    ## PASS OR FAIL ##
    ##--------------##
    ## General check phase
    easy_check_passed = False
    smart_check_passed = True

    ## Basic check on meta data
    ## - check date
    if int(year) < int(oldest_year_authorized):
        check_date = False
    ## - check language
    if article_language not in authorized_languages:
        check_language = False

    ## Easy Filter
    if check_date and check_language:
        easy_check_passed = True

    ## Complex filter (inclusion)
    if False in validation_check.values():
        smart_check_passed = False

    ## Complex filter (exclusion)
    if True in exclusion_check.values():
        smart_check_passed = False

    ## Case report filter
    if case_report_only and case_report_check:
        print("[DEBUG] => EXCLUDED")
        smart_check_passed = False

    ## Global check
    if easy_check_passed and smart_check_passed:
        valid_article = True

    ##-------------##
    ## SAVING DATA ##
    ##-------------##
    ## Write and delete files
    if valid_article:
        ## Save meta data in a text file
        ## for further use
        title_line = '>Title;' + str(article_title) + "\n"
        date_line = '>Date;' + str(day) + "/" + str(month) + "/" + str(year) + "\n"
        journal_line = ">Journal;" + str(journal_name) + "\n"
        country_line = ">Country;" + str(country) + "\n"
        conflict_of_interest_line = ">Conflict;" + str(conflict_of_interest) + "\n"
        meta_data = open("meta/" + str(pmid) + ".csv", "w", encoding="utf8")
        meta_data.write(title_line)
        meta_data.write(date_line)
        meta_data.write(journal_line)
        meta_data.write(country_line)
        meta_data.write(conflict_of_interest_line)
        meta_data.close()
    else:
        ## Delete the abstract
        try:
            if abstract:
                os.remove(abstract_file_name)
        except Exception:
            print("[WARNING] => can't delete " + str(abstract_file_name))

    ##------------------##
    ## RETURN SOMETHING ##
    ##------------------##
    ## return True if the article passes the
    ## evaluation, else False.
    return (valid_article, easy_check_passed, smart_check_passed)
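All of these snippets hit the NCBI E-utilities, which enforce rate limits (3 requests per second without an API key, 10 with one). When looping over many PMIDs, as evaluate_article's callers do, it helps to configure Biopython once at startup; a minimal sketch (both values are placeholders):

from Bio import Entrez

Entrez.email = "you@example.org"      # placeholder: required contact address
Entrez.api_key = "0123456789abcdef"   # placeholder: optional NCBI API key
# With these set, Bio.Entrez spaces its requests automatically
# to stay within the applicable rate limit.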