Esempio n. 1
0
def fetch_abstract(pmid):
    """Fetch the PubMed record for ``pmid`` and return it parsed.

    The ``pmid`` is echoed to stdout, ``Entrez.email`` is set, and the
    record is requested from the ``pubmed`` database in XML mode.

    Returns:
        Whatever ``read`` produces from the ``efetch`` handle (the whole
        parsed response, not just the abstract).
    """
    print(pmid)
    Entrez.email = '*****@*****.**'
    response = efetch(db='pubmed', id=pmid, retmode='xml')
    return read(response)
Esempio n. 2
0
def fetch_abstract(pmid):
    """Return the first abstract section of the PubMed article ``pmid``.

    The function is deliberately best-effort: any ordinary error while
    fetching, parsing or navigating the record yields ``None`` (this can
    happen, for instance, when the article is in Chinese and carries no
    abstract).

    Args:
        pmid: PubMed identifier passed to ``efetch`` as ``id``.

    Returns:
        The first element of ``AbstractText``, or ``None`` when it cannot
        be obtained.
    """
    # ``except Exception`` (instead of a bare ``except``) keeps the
    # best-effort contract while letting KeyboardInterrupt / SystemExit
    # propagate.
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        try:
            record = read(handle)['PubmedArticle'][0]
        finally:
            # Release the network handle even when parsing fails.
            handle.close()
        article = record['MedlineCitation']['Article']
        return article['Abstract']['AbstractText'][0]
    except Exception:
        return None
def author(pmid):
    """Return the authors of a PubMed article as one space-joined string.

    Every author contributes ``#Name <ForeName> <LastName>``; when the
    author's first affiliation entry is non-empty it is followed by
    ``#Affiliation <text>``.

    Args:
        pmid: PubMed identifier passed to ``efetch`` as ``id``.

    Returns:
        The ``#Name`` / ``#Affiliation`` pieces joined with single spaces
        (an empty string when the author list is empty).

    Raises:
        IndexError: if the response holds no ``PubmedArticle`` entry.
        KeyError: if the article has no ``AuthorList``.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='xml')

    record = read(handle)['PubmedArticle'][0]
    authors = record['MedlineCitation']['Article']['AuthorList']

    pieces = []
    for entry in authors:
        # ForeName is optional and group authors only carry CollectiveName,
        # so build the name from whatever parts are present instead of
        # indexing both keys unconditionally.
        name_parts = [entry[key] for key in ('ForeName', 'LastName')
                      if key in entry]
        name = ' '.join(name_parts) or entry.get('CollectiveName', '')
        pieces.append('#Name ' + name)

        # Only the first affiliation entry is reported.
        aff_info = entry.get('AffiliationInfo')
        aff = aff_info[0]['Affiliation'] if aff_info else ''
        if aff:
            pieces.append('#Affiliation ' + aff)

    return ' '.join(pieces)
def specialization(author,affiliation):
    """Guess an author's specialization from their PubMed papers.

    PubMed is queried (through ``search``) for
    ``"<author>"[AUTH] AND <affiliation>[AFFL]``. For every hit the
    abstract, the first keyword list and the title are collected; the first
    Wikipedia summary sentence of each keyword is added, and the combined
    text is handed to ``spec_search``.

    Args:
        author: author name; it is wrapped in double quotes for the query.
        affiliation: text used for the ``[AFFL]`` field of the query.

    Returns:
        ``str(spec_search(corpus))`` when at least one paper was found,
        otherwise an empty list.
    """
    # import libraries
    import wikipedia
    import re
    from Bio.Entrez import efetch, read

    quoted_author = '"' + author + '"'

    # Find IDs for doctor + affiliation
    query = '%s[AUTH] AND %s[AFFL]' % (quoted_author, affiliation)
    ids = list(search(query)['IdList'])
    if not ids:
        # Nothing to analyse: return before calling spec_search on a blank
        # corpus.
        print('no papers found')
        return []

    non_letters = re.compile("[^a-zA-Z]")
    abstract_texts = []
    keywords = []
    raw_keywords = []
    titles = []

    for paper_id in ids:
        xml_data = read(efetch(db='pubmed', id=paper_id, retmode='xml'))
        try:
            citation = xml_data['PubmedArticle'][0]['MedlineCitation']
        except (KeyError, IndexError):
            # No regular article in the response: report the missing
            # abstract/title below instead of crashing on the lookups.
            citation = {}
        article = citation.get('Article', {})

        # get abstract (the whole AbstractText list is stringified as-is)
        try:
            abstract_texts.append(str(article['Abstract']['AbstractText']))
        except (KeyError, TypeError):
            print('Paper with ID: ' + paper_id + ' has no abstract')

        # get keywords: only the first keyword list of the paper is used
        keyword_lists = citation.get('KeywordList')
        if keyword_lists:
            for keyword in keyword_lists[0]:
                keywords.append(str(non_letters.sub(" ", keyword)))
                raw_keywords.append(keyword)

        # get paper title
        try:
            titles.append(' ' + article['ArticleTitle'])
        except (KeyError, TypeError):
            print('Paper with ID: ' + paper_id + ' has no title')

    # get the first Wikipedia summary sentence of every keyword
    wiki_sentences = []
    for keyword in keywords:
        try:
            page = wikipedia.summary(keyword, sentences=1)
            wiki_sentences.append(str(non_letters.sub(" ", page)))
        except Exception:
            # Best-effort: any lookup failure just drops the keyword.
            print('Disambiguation error for keyword: '+keyword+', action: keyword excluded')

    # Assemble the corpus with the same spacing as before: abstracts and
    # keywords are each followed by a space, wiki sentences and titles are
    # each preceded by one.
    query_abstracts = ''.join(text + ' ' for text in abstract_texts)
    query_keywords = ''.join(keyword + ' ' for keyword in raw_keywords)
    query_wiki = ''.join(' ' + sentence for sentence in wiki_sentences)
    query_title = ''.join(titles)

    # find specialism
    corpus = query_abstracts + ' ' + query_keywords + ' ' + query_wiki + ' ' + query_title
    result = str(spec_search(corpus))

    print('this doctor is specialized in: '+result)
    return result
Esempio n. 5
0
	def fetch_abstract(self,pmid):
		"""Return the first abstract section of article ``pmid``.

		The record is requested from the ``pubmed`` database in XML mode and
		the first element of the parsed response is inspected.

		Returns ``None`` when the ``Article``/``Abstract`` keys are missing
		or ``AbstractText`` is empty.
		"""
		response = efetch(db='pubmed', id=pmid, retmode='xml',email='*****@*****.**',retmax=1000)
		record = read(response)[0]
		try:
			sections = record['MedlineCitation']['Article']['Abstract']['AbstractText']
			return sections[0]
		except (IndexError, KeyError):
			return None
Esempio n. 6
0
def fetch_abstract(pmid):
    """Return the first abstract section of the PubMed article ``pmid``.

    Args:
        pmid: PubMed identifier passed to ``efetch`` as ``id``.

    Returns:
        The first element of ``AbstractText``, or ``None`` when the article
        has no abstract.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        return article['Abstract']['AbstractText'][0]
    except (IndexError, KeyError):
        # KeyError: the record has no Abstract element at all;
        # IndexError: AbstractText is present but empty.
        return None
Esempio n. 7
0
def fetch_abstract(pmid):
    """Return ``(abstract, title)`` for the PubMed article ``pmid``.

    ``abstract`` is the first element of ``AbstractText`` and ``title`` is
    the ``ArticleTitle`` of the first ``PubmedArticle`` entry.

    Returns ``None`` when any of the required keys or list items is missing.
    """
    parsed = read(efetch(db='pubmed', id=pmid, retmode='xml'))
    try:
        art = parsed['PubmedArticle'][0]['MedlineCitation']['Article']
        first_section = art['Abstract']['AbstractText'][0]
        return first_section, art['ArticleTitle']
    except (IndexError, KeyError):
        return None
Esempio n. 8
0
def fetch_abstract(pmid):
    """Return the first abstract section for an article id.

    The id is converted to ``str`` before the request. Failures are not
    raised: the function returns the text ``"<ExceptionName>: <message>"``
    instead.
    """
    article_id = str(pmid)
    try:
        response = efetch(db='pubmed', id=article_id, retmode='xml')
        citation = read(response)[0]['MedlineCitation']
        return citation['Article']['Abstract']['AbstractText'][0]
    except Exception as err:
        return '{}: {}'.format(type(err).__name__, err)
Esempio n. 9
0
def fetch_abstract(pmid):
    """Look up the first abstract section of the article with id ``pmid``.

    ``pmid`` may be anything ``str`` accepts. When fetching, parsing or
    navigating the record raises, the exception is reported as the string
    ``"<ExceptionName>: <message>"`` rather than propagated.
    """
    pmid = str(pmid)
    try:
        handle = efetch(db='pubmed', id=pmid, retmode='xml')
        record = read(handle)[0]
        sections = record['MedlineCitation']['Article']['Abstract']['AbstractText']
        outcome = sections[0]
    except Exception as exc:
        outcome = '{}: {}'.format(exc.__class__.__name__, exc)
    return outcome
Esempio n. 10
0
    def search(self, search_term, retmax=2000):
        """Run an ``esearch`` query against ``self.db`` and remember the hits.

        The parsed response is stored in ``self.search_result`` and its
        ``IdList`` entries, converted to ``str``, in ``self.ids``. The number
        of results is printed, together with a warning when it equals
        ``retmax``.

        Returns:
            ``self``, so calls can be chained.
        """
        response = esearch(db=self.db, term=search_term, retmax=retmax)
        self.search_result = read(response)
        self.ids = list(map(str, self.search_result.get('IdList')))

        hit_count = len(self.ids)
        print('No. of results: ', hit_count)
        if hit_count == retmax:
            print('###! There might be more results available !###')

        return self
Esempio n. 11
0
def fetch_abstract(pmid):
    """Return the first abstract section of the PubMed article ``pmid``.

    ``Entrez.email`` is set before the request is sent.

    Args:
        pmid: PubMed identifier passed to ``efetch`` as ``id``.

    Returns:
        The first element of ``AbstractText``, or ``None`` when the article
        has no abstract.
    """
    Entrez.email = "*****@*****.**"

    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]

    try:
        article = xml_data['MedlineCitation']['Article']
        return article['Abstract']['AbstractText'][0]
    except (IndexError, KeyError):
        # A record without an Abstract element raises KeyError, an empty
        # AbstractText raises IndexError; both mean "no abstract".
        return None
Esempio n. 12
0
    def get_mesh_from_pmid(self, user):
        """Walk the MeSH headings of the article identified by ``self.pmid``.

        ``Entrez.email`` is set from ``user.email`` before the record is
        fetched. For every heading the title-cased descriptor name and the
        major-topic flag of its first qualifier (``"N"`` when there is no
        qualifier) are computed.

        NOTE(review): in the visible code ``name`` and ``major`` are not
        used after being computed and the method returns ``None``; confirm
        against the full source.
        """
        Entrez.email = user.email
        handle = efetch(db="pubmed", id=str(self.pmid), retmode="xml")
        xml_data = read(handle)[0]

        # Skips articles without MeSH terms
        citation = xml_data["MedlineCitation"]
        if u'MeshHeadingList' in citation:
            for mesh in citation[u'MeshHeadingList']:
                major = "N"
                qualifiers = mesh[u'QualifierName']
                if len(qualifiers) > 0:
                    # Read the flag by name: ``items()[0]`` fails on
                    # Python 3 and depends on attribute order, so it may
                    # return a different attribute than the flag.
                    major = str(qualifiers[0].attributes.get(u'MajorTopicYN', "N"))
                descr = mesh[u'DescriptorName']
                name = descr.title()
Esempio n. 13
0
    def get_mesh_from_pmid(self, user):
        """Inspect the MeSH headings of the article ``self.pmid``.

        Sets ``Entrez.email`` to ``user.email``, fetches the record and, for
        each MeSH heading, derives the title-cased descriptor name plus the
        ``MajorTopicYN`` flag of the first qualifier (defaulting to ``"N"``).

        NOTE(review): the visible code does nothing with ``name`` / ``major``
        and returns ``None``; verify against the complete method.
        """
        Entrez.email = user.email
        handle = efetch(db = "pubmed", id = str(self.pmid), retmode = "xml")
        xml_data = read(handle)[0]

        # Skips articles without MeSH terms
        headings = xml_data["MedlineCitation"].get(u'MeshHeadingList', [])
        for mesh in headings:
            major = "N"
            qualifiers = mesh[u'QualifierName']
            if qualifiers:
                # Look the flag up explicitly instead of taking "the first
                # attribute": dict views are not indexable on Python 3 and
                # attribute order is not a reliable way to pick the flag.
                first_attributes = qualifiers[0].attributes
                major = str(first_attributes.get(u'MajorTopicYN', "N"))
            descr = mesh[u'DescriptorName']
            name = descr.title()
Esempio n. 14
0
def fetch_article(pmid):
    """
    Test function
    => Not working

    Returns the ``Article`` element of the fetched record, but only when its
    ``AbstractText`` has at least one entry; an empty ``AbstractText``
    yields ``None``.
    """
    record = read(efetch(db='pubmed', id=pmid, retmode='xml', ))[0]

    try:
        article = record['MedlineCitation']['Article']
        # Probe only: raises IndexError when there is no abstract text.
        article['Abstract']['AbstractText'][0]
    except IndexError:
        return None

    return article
def fetch_abstract(pmids):
    """Fetch several PubMed records in one request.

    Args:
        pmids: iterable of PMID strings; they are joined with commas and
            sent as a single ``id`` value.

    Returns:
        A list with the ``MedlineCitation`` element of every returned
        ``PubmedArticle``, or ``None`` if one of them lacks that key.
    """
    request = dict(db='pubmed', id=",".join(pmids), retmode='xml')
    try:
        handle = efetch(**request)
    except urllib.error.HTTPError:
        # One retry; a second failure propagates to the caller.
        handle = efetch(**request)

    records = read(handle)['PubmedArticle']

    citations = []
    try:
        for rec in records:
            citations.append(rec['MedlineCitation'])
    except KeyError:
        return None

    return citations
Esempio n. 16
0
def date(pmid):
    """Return the journal-issue publication date of ``pmid`` as text.

    The result is built from the ``Year``, ``Month`` and ``Day`` entries of
    ``PubDate``: year and month are each followed by a space, so a complete
    date reads ``"<Year> <Month> <Day>"`` and a date without a day reads
    ``"<Year> <Month> "``. Entries that are absent are left out instead of
    raising ``KeyError``.

    Args:
        pmid: PubMed identifier passed to ``efetch`` as ``id``.

    Raises:
        IndexError: if the response holds no ``PubmedArticle`` entry.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='xml')

    xml_data = read(handle)['PubmedArticle'][0]
    pub_date = xml_data['MedlineCitation']['Article']['Journal']['JournalIssue'][
        'PubDate']

    # Not every PubDate carries Year/Month/Day, so read each one defensively.
    year = pub_date['Year'] + ' ' if 'Year' in pub_date else ''
    month = pub_date['Month'] + ' ' if 'Month' in pub_date else ''
    day = pub_date['Day'] if 'Day' in pub_date else ''

    return year + month + day
Esempio n. 17
0
def get_mesh(pmid):
    """Yield ``(name, major)`` for every MeSH heading of a PubMed article.

    ``name`` is the title-cased descriptor name; ``major`` is the
    ``MajorTopicYN`` attribute of the heading's first qualifier, or ``'N'``
    when the heading has no qualifier. Articles without MeSH terms yield
    nothing.

    Args:
        pmid: PubMed identifier; converted to ``str`` for the request.
    """
    # call PubMed API
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)[0]
    citation = xml_data['MedlineCitation']
    # skip articles without MeSH terms
    for mesh in citation.get(u'MeshHeadingList', []):
        # grab the qualifier major/minor flag, if any
        major = 'N'
        qualifiers = mesh[u'QualifierName']
        if qualifiers:
            # Fetch the flag by name: indexing ``items()`` breaks on
            # Python 3 and depends on attribute order, so it may return a
            # different attribute than the flag.
            major = str(qualifiers[0].attributes.get(u'MajorTopicYN', 'N'))
        # grab descriptor name
        name = mesh[u'DescriptorName'].title()

        yield (name, major)
Esempio n. 18
0
def get_metadata_from_PMID( pmid ):
    """Parse the attributes of interest for the cytoscape plugin from one PMID.

    The record is fetched from PubMed and several fields are probed
    (``DateCompleted``, ``OtherID``, ``MeshHeadingList``, the first abstract
    section); a message is printed for each one that cannot be read. Every
    author of the article is counted in the module-level
    ``global_author_list`` under the key ``"<LastName>,<Initials>"``.

    Args:
        pmid: PubMed identifier passed to ``efetch`` as ``id``.

    Returns:
        Always ``None``; the useful output is the update of
        ``global_author_list`` and the printed diagnostics.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False

    # The three lookups below are probes: their values are not used further,
    # only the diagnostics matter. Messages are kept exactly as before.
    try:
        date_completed = format_ddate( xml_data['MedlineCitation']['DateCompleted'] )
    except Exception:
        print("Data Completed not available?? %s" % (pmid,))

    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except Exception:
        print("Other ID Not availble?? %s" % (pmid,))

    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except Exception:
        print("Unable to get mesheadings for %s" % (pmid,))

    # Pre-bind so the abstract step below never touches an unbound name when
    # the Article lookup itself fails.
    article = None
    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output:
            print(xml_data)
        for author in article['AuthorList']:
            author_key = author['LastName'] + ',' + author['Initials']
            global_author_list[author_key] = global_author_list.get(author_key, 0) + 1
    except IndexError:
        return None
    except Exception:
        print("unable to process %s" % (pmid,))
        print("Unexpected error: %s" % (sys.exc_info()[0],))

    try:
        abstract = article['Abstract']['AbstractText'][0]
    except Exception:
        print("Unable to get abstract for %s" % (pmid,))
        print("Unexpected error: %s" % (sys.exc_info()[0],))
Esempio n. 19
0
def get_article_title(pmid):
    """
    Connect to the PubMed database and return the article
    title of the given pmid.

    Returns "NA" when the title cannot be read from the record. Errors
    raised while fetching or parsing the record, or by an empty
    ``PubmedArticle`` list, are not caught here.
    """
    handle = efetch(
        db='pubmed',
        id=pmid,
        retmode='xml',
    )
    xml_data = read(handle)
    xml_data = xml_data['PubmedArticle'][0]
    try:
        return xml_data['MedlineCitation']['Article']['ArticleTitle']
    except (KeyError, IndexError, TypeError):
        # Only structural lookup failures mean "no title"; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        return "NA"
Esempio n. 20
0
    def getDocument(self,pmid):
        """Fetch article ``pmid`` and summarise it as a plain dictionary.

        The dictionary holds ``id``, ``Title``, ``Abstract`` (through
        ``self.safeAbstract``) and the ``DateCompleted`` / ``DateRevised``
        dates formatted as ``Day/Month/Year`` (through
        ``self.safeDateCompleted``). Per the original author's note it is
        meant to be suitable for storage in mongodb.

        Returns:
            ``(xml_data, article)``: the parsed record and the dictionary.
        """
        xml_data = read(efetch(db='pubmed', id=str(pmid), retmode='xml'))[0]
        citation = xml_data['MedlineCitation']

        def slash_date(field):
            # Day, Month and Year of one date field, joined with slashes.
            parts = [self.safeDateCompleted(citation, field, part)
                     for part in ('Day', 'Month', 'Year')]
            return "{}/{}/{}".format(*parts)

        article = {
            'id': pmid,
            'Title': str(citation['Article'][u'ArticleTitle']),
            'Abstract': str(self.safeAbstract(citation, u'Abstract')),
            'DateCompleted': slash_date('DateCompleted'),
            'DateRevised': slash_date('DateRevised'),
        }

        return (xml_data,article)
Esempio n. 21
0
def fetch_article(pmid):
    """
    Test function
    => Not working

    Fetches the record for ``pmid``, takes the second entry of the
    ``PubmedData`` ``History`` list and prints its month, day and year (one
    per line). Always returns the placeholder string "choucroute".
    """
    handle = efetch(db='pubmed', id=pmid, retmode='xml', )
    informations = read(handle)

    stuff = informations[u'PubmedArticle'][0]
    date = stuff[u'PubmedData']["History"][1]
    month = date[u'Month']
    day = date[u'Day']
    year = date[u'Year']

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(month)
    print(day)
    print(year)

    return "choucroute"
Esempio n. 22
0
def fetch_abstract(pmid):
    """
    Return the abstract of a given
    article using its pmid

    => Return None when the abstract can't be returned
    (can happen when article is in chinese)
    """
    parsed = read(efetch(db='pubmed', id=pmid, retmode='xml', ))
    record = parsed['PubmedArticle'][0]

    try:
        sections = record['MedlineCitation']['Article']['Abstract']['AbstractText']
        return sections[0]
    except (IndexError, KeyError):
        return None
Esempio n. 23
0
def get_country_publication_stat(run_folder):
    """Count publications per country for the articles kept in a run folder.

    Every ``<run_folder>/meta/*.csv`` file is taken to be named after a
    PMID (the part of the file name before the first dot). The PubMed
    record of each PMID is fetched and the ``Country`` of its
    ``MedlineJournalInfo`` is counted; when the record cannot be fetched or
    parsed the article is counted under ``"NA"``.

    Args:
        run_folder: path of the run folder containing the ``meta`` directory.

    Returns:
        A dictionary mapping country name (or ``"NA"``) to article count.
    """
    # Function-scope import so the block does not depend on a file-level one.
    import os

    country_to_count = {}

    for meta_file in glob.glob(run_folder+"/meta/*.csv"):
        # basename() copes with whatever separator glob returns on this
        # platform, unlike splitting on "/".
        pmid = os.path.basename(meta_file).split(".")[0]

        ## get country publication
        try:
            handle = efetch(db='pubmed', id=pmid, retmode='xml', )
            informations = read(handle)
            stuff = informations[u'PubmedArticle'][0]
            country = stuff[u'MedlineCitation'][u'MedlineJournalInfo'][u'Country']
            print(country) # to delete
        except Exception:
            # Best-effort: any failure is recorded as an unknown country.
            country = "NA"

        country_to_count[country] = country_to_count.get(country, 0) + 1

    return country_to_count
Esempio n. 24
0
def date(pmid):
    """Return the journal-issue publication date of ``pmid`` as text.

    The string is ``Year``, ``Month`` and ``Day`` of ``PubDate`` in that
    order; year and month are each followed by a space when present, and
    absent entries are skipped. An empty string is returned when the
    response contains no ``PubmedArticle`` entry.
    """
    records = read(efetch(db='pubmed', id=pmid, retmode='xml'))['PubmedArticle']
    if not records:
        return ''

    pub_date = records[0]['MedlineCitation']['Article']['Journal'][
        'JournalIssue']['PubDate']

    day = pub_date['Day'] if 'Day' in pub_date else ''
    month = pub_date['Month'] + ' ' if 'Month' in pub_date else ''
    year = pub_date['Year'] + ' ' if 'Year' in pub_date else ''

    return year + month + day
Esempio n. 25
0
def fetch_article(pmid):
    """
    Test function
    => Not working

    Reads the second entry of the ``History`` list in the ``PubmedData`` of
    the fetched record, prints its month, day and year on separate lines
    and returns the placeholder string "choucroute".
    """
    handle = efetch(
        db='pubmed',
        id=pmid,
        retmode='xml',
    )
    informations = read(handle)

    stuff = informations[u'PubmedArticle'][0]
    date = stuff[u'PubmedData']["History"][1]
    # All three fields are looked up before anything is printed, so a
    # missing key still fails without partial output.
    month, day, year = date[u'Month'], date[u'Day'], date[u'Year']

    # print() with a single argument is valid on both Python 2 and 3.
    print(month)
    print(day)
    print(year)

    return "choucroute"
Esempio n. 26
0
    ifsituationlist = []
    ifyearlist = []
    sumimpactfactor = 0
    repeatcount = 0

    for ID in IdList:
        allIdLink.append("https://pubmed.ncbi.nlm.nih.gov/" + ID + "/")
        repeatcount += 1

        if repeatcount == 1:
            pass
        else:
            allgenename.append("")
        issn = []
        handle = efetch(db="pubmed", id=ID, retmode="xml")
        xml_data = read(handle)
        articledata = xml_data["PubmedArticle"][0]["MedlineCitation"][
            "Article"]

        try:
            titledata = articledata["ArticleTitle"]
        except:
            titledata = []
            print("False:titledata")
        strtitledata = str(titledata)
        titlelist.append(strtitledata)

        try:
            yeardata = articledata["Journal"]["JournalIssue"]["PubDate"][
                "MedlineDate"]
        except:
Esempio n. 27
0
def evaluate_article(pmid):
    """Decide whether the PubMed article ``pmid`` passes the filters.

    [IN PROGRESS]

    Easy check: the year of the second ``History`` date must not be older
    than 2008 and the first language of the article must be ``eng``.
    Smart check: among the nouns nltk finds in the abstract there must be
    at least one artificial-intelligence keyword and one clinical keyword.

    Side effects: the abstract (obtained with ``fetch_abstract``) is stored
    with ``save_abstract`` under ``abstract/<pmid>_abstract.txt``. For a
    valid article the title, date, journal and conflict-of-interest
    statement are written to ``meta/<pmid>.csv``; otherwise the stored
    abstract file is removed again.

    NOTE: the save step relies on the Python 2 ``unicode`` builtin.

    Args:
        pmid: PubMed identifier of the article.

    Returns:
        ``(valid_article, easy_check_passed, smart_check_passed)``;
        ``(False, False, False)`` when the metadata cannot be retrieved.
    """
    # Parameters for filters
    oldest_year_authorized = 2008
    authorized_languages = [u'eng']
    noun_tags = ["NN", "NNS", "NNP"]
    # "machine" and "learning" are two keywords: the missing comma used to
    # fuse them into the never-matching "machinelearning".
    IA_keywords = [
        "algorithm", "machine", "learning", "neural", "network", "statistic",
        "deep", "classification", "model"
    ]
    Clinical_keywords = [
        "Sjogren", "sjogren", "lupus", "autoimmunity", "rhumatoid",
        "arthrisis", "RA", "SjS", "SLE"
    ]

    # The Easy Part: get meta data on the article.
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        informations = read(handle)
        stuff = informations[u'PubmedArticle'][0]

        ## get date from the history attribute, select
        ## the date of acceptation.
        date = stuff[u'PubmedData']["History"][1]
        month = date[u'Month']
        day = date[u'Day']
        year = date[u'Year']

        citation = stuff[u'MedlineCitation']

        ## get the name of the review
        journal_name = citation[u'MedlineJournalInfo'][u'MedlineTA']

        ## The keyword list is not used further, but the lookup is kept:
        ## a record without it is rejected, exactly as before.
        keywords_list = citation[u'KeywordList']

        ## Get the author's conflict of interest, when there is one.
        try:
            conflict_of_interest = citation[u'CoiStatement']
        except Exception:
            conflict_of_interest = "NA"

        ## Get title and language of the article
        article_title = citation[u'Article'][u'ArticleTitle']
        article_language = citation[u'Article'][u'Language'][0]

    except Exception:
        # Best-effort: an article whose metadata cannot be read is rejected.
        return (False, False, False)

    # The Smart Part: run further analysis on the abstract using nltk.
    validation_check_keywords_1 = False
    validation_check_keywords_2 = False

    abstract_file_name = "abstract/" + str(pmid) + "_abstract.txt"
    abstract = fetch_abstract(pmid)
    if abstract:
        save_abstract(abstract, abstract_file_name)
        # NOTE(review): the loaded text is not used below; the call is kept
        # in case load_text has effects callers rely on.
        load_text(abstract_file_name)

        ## Tokenize and chunk the abstract.
        try:
            tokens = nltk.word_tokenize(abstract.encode('utf8'))
            tagged = nltk.pos_tag(tokens)
            entities = nltk.chunk.ne_chunk(tagged)
        except Exception:
            print("[WARNINGS] => can't perform nlp operation")
            entities = []

        ## Get all the common nouns found in the abstract.
        names_found_in_abstract = []
        for item in entities:
            try:
                if item[1] in noun_tags and item[0] not in names_found_in_abstract:
                    names_found_in_abstract.append(item[0])
            except (IndexError, KeyError, TypeError):
                # Items that cannot be read as a (word, tag) pair are skipped.
                continue

        ## -> Artificial intelligence keywords check
        ## -> Biology keywords check
        validation_check_keywords_1 = any(
            name in IA_keywords for name in names_found_in_abstract)
        validation_check_keywords_2 = any(
            name in Clinical_keywords for name in names_found_in_abstract)

    # PASS OR FAIL
    check_date = int(year) >= int(oldest_year_authorized)
    check_language = article_language in authorized_languages
    easy_check_passed = check_date and check_language
    smart_check_passed = (validation_check_keywords_1
                          and validation_check_keywords_2)
    valid_article = easy_check_passed and smart_check_passed

    # SAVING DATA
    if valid_article:
        ## Save meta data in a text file for further use
        meta_lines = [
            u'>Title;' + unicode(article_title) + u"\n",
            u'>Date;' + unicode(day) + u"/" + unicode(month) + u"/"
            + unicode(year) + u"\n",
            u">Journal;" + unicode(journal_name) + u"\n",
            u">Conflict;" + unicode(conflict_of_interest) + u"\n",
        ]
        # ``with`` closes the file even when a write fails.
        with open("meta/" + str(pmid) + ".csv", "w") as meta_data:
            for line in meta_lines:
                meta_data.write(line.encode('utf8'))
    elif abstract:
        ## Delete the abstract of a rejected article
        try:
            os.remove(abstract_file_name)
        except OSError:
            print("[WARNING] => can't delete " + str(abstract_file_name))

    return (valid_article, easy_check_passed, smart_check_passed)
def get_metadata_from_PMID( pmid, output_errors=False, dump_xml=False ):
    """Parse the attributes of interest for the cytoscape plugin from one PMID.

    Args:
        pmid: PubMed identifier passed to ``efetch`` as ``id``.
        output_errors: when true, records without MeSH headings or with an
            unreadable article are also written to the module-level
            ``fp_error`` file object.
        dump_xml: when true, the parsed record is printed and returned
            instead of the summary dictionary.

    Returns:
        * the parsed record when ``dump_xml`` is true;
        * otherwise a dictionary with ``auth_list`` (``"LastName,Initials"``
          keys of the authors), ``affiliations`` (``(author, affiliation)``
          pairs) and ``publication_date`` (``DateCompleted``, falling back
          to ``DateCreated``, formatted by ``format_ddate``);
        * ``None`` if an ``IndexError`` occurs while reading the authors.

    Raises:
        Whatever the ``DateCreated`` fallback raises when that field cannot
        be read or formatted either.
    """
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False
    author_affiliation_list = []
    cur_paper_author_list = []

    try:
        date_completed = format_ddate( xml_data['MedlineCitation']['DateCompleted'] )
    except Exception:
        print("Date Completed not available?? %s" % (pmid,))
        ## The difference between date completed and created does not
        ## matter for this purpose, so fall back to the creation date.
        date_completed = format_ddate( xml_data['MedlineCitation']['DateCreated'] )

    # The next two lookups are probes: only their diagnostics are used.
    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except Exception:
        print("Other ID Not availble?? %s" % (pmid,))

    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except Exception:
        print("Unable to get mesheadings for %s" % (pmid,))
        if output_errors:
            fp_error.write('MESH NOT AVAIABLE:\n'+str(xml_data)+'\n\n')

    # Pre-bind both names: the error handler prints ``author`` and the
    # abstract step reads ``article``, and neither may be unbound when the
    # Article lookup itself fails.
    article = None
    author = None
    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output:
            print(xml_data)
        for author in article['AuthorList']:
            if 'LastName' in author:
                author_key = author['LastName'] + ',' + author['Initials']
                cur_paper_author_list.append(author_key)
            elif 'CollectiveName' in author:
                print("FOUND A COLLECTION EXAMPLE %s" % (author,))
            if 'Affiliation' in author:
                author_affil = author['Affiliation']
                author_affiliation_list.append( (author, author_affil) )
    except NameError as e:
        print(e)
    except IndexError:
        return None
    except Exception:
        print("unable to proces article tag %s" % (pmid,))
        print("Unexpected error parsing author string: %s" % (sys.exc_info()[0],))
        if output_errors:
            fp_error.write('Article NOT AVAILABLE\n'+str(xml_data)+'\n\n')
        print(author)

    try:
        abstract = article['Abstract']['AbstractText'][0]
    except Exception:
        print("Unable to get abstract for %s" % (pmid,))

    if dump_xml:
        print(xml_data)
        return xml_data

    return { 'auth_list': cur_paper_author_list, 'affiliations': author_affiliation_list, 'publication_date': date_completed }
Esempio n. 29
0
def evaluate_article(pmid):
    """Evaluate the PubMed article ``pmid`` against the metadata and keyword filters.

    [IN PROGRESS]

    * Easy check: the year of the second ``History`` date is 2008 or later
      and the first language of the article is ``eng``.
    * Smart check: the nouns nltk extracts from the abstract contain at
      least one artificial-intelligence keyword and at least one clinical
      keyword.

    Side effects: the abstract returned by ``fetch_abstract`` is saved with
    ``save_abstract`` as ``abstract/<pmid>_abstract.txt``. When the article
    is valid its title, date, journal and conflict-of-interest statement
    are written to ``meta/<pmid>.csv``; when it is not, the saved abstract
    file is deleted.

    NOTE: the save step uses the Python 2 ``unicode`` builtin.

    Args:
        pmid: PubMed identifier of the article.

    Returns:
        ``(valid_article, easy_check_passed, smart_check_passed)``, or
        ``(False, False, False)`` when the metadata cannot be retrieved.
    """
    ## Parameters for filters
    oldest_year_authorized = 2008
    authorized_languages = [u'eng']
    # A comma was missing between "machine" and "learning", which silently
    # produced the single keyword "machinelearning".
    IA_keywords = ["algorithm", "machine", "learning", "neural", "network", "statistic", "deep", "classification", "model"]
    Clinical_keywords = ["Sjogren", "sjogren", "lupus", "autoimmunity", "rhumatoid", "arthrisis", "RA", "SjS", "SLE"]

    ## The Easy Part: get meta data on the article
    try:
        handle = efetch(db='pubmed', id=pmid, retmode='xml', )
        informations = read(handle)
        stuff = informations[u'PubmedArticle'][0]

        ## get date from the history attribute, select
        ## the date of acceptation.
        date = stuff[u'PubmedData']["History"][1]
        month = date[u'Month']
        day = date[u'Day']
        year = date[u'Year']

        medline = stuff[u'MedlineCitation']

        ## get the name of the review
        journal_name = medline[u'MedlineJournalInfo'][u'MedlineTA']

        ## Unused afterwards, but a record lacking the key keeps being
        ## rejected just as before, so the lookup stays.
        keywords_list = medline[u'KeywordList']

        ## Get the author's conflict of interest, when available
        try:
            conflict_of_interest = medline[u'CoiStatement']
        except Exception:
            conflict_of_interest = "NA"

        ## Get title of the article
        article_title = medline[u'Article'][u'ArticleTitle']

        ## Get language of the article
        article_language = medline[u'Article'][u'Language'][0]

    except Exception:
        # Best-effort: unreadable metadata means the article is rejected.
        return (False, False, False)

    ## The Smart Part: run further analysis on the abstract using nltk
    found_ia_keyword = False
    found_clinical_keyword = False

    abstract_file_name = "abstract/"+str(pmid)+"_abstract.txt"
    abstract = fetch_abstract(pmid)
    if abstract:
        save_abstract(abstract, abstract_file_name)
        # NOTE(review): the return value of load_text is not used here; the
        # call is preserved in case it has effects that matter.
        load_text(abstract_file_name)

        try:
            tokens = nltk.word_tokenize(abstract.encode('utf8'))
            tagged = nltk.pos_tag(tokens)
            entities = nltk.chunk.ne_chunk(tagged)
        except Exception:
            print("[WARNINGS] => can't perform nlp operation")
            entities = []

        ## Collect the distinct nouns of the abstract
        names_found_in_abstract = []
        for item in entities:
            try:
                is_noun = item[1] in ["NN", "NNS", "NNP"]
                if is_noun and item[0] not in names_found_in_abstract:
                    names_found_in_abstract.append(item[0])
            except (IndexError, KeyError, TypeError):
                # Skip items that are not indexable like a (word, tag) pair.
                continue

        for name in names_found_in_abstract:
            if name in IA_keywords:
                found_ia_keyword = True
            if name in Clinical_keywords:
                found_clinical_keyword = True

    ## PASS OR FAIL
    check_date = not (int(year) < int(oldest_year_authorized))
    check_language = article_language in authorized_languages

    easy_check_passed = check_date and check_language
    smart_check_passed = found_ia_keyword and found_clinical_keyword
    valid_article = easy_check_passed and smart_check_passed

    ## SAVING DATA: write the metadata or delete the abstract
    if valid_article:
        title_line = u'>Title;'+unicode(article_title)+u"\n"
        date_line = u'>Date;'+unicode(day)+u"/"+unicode(month)+u"/"+unicode(year)+u"\n"
        journal_line = u">Journal;"+unicode(journal_name)+u"\n"
        conflict_of_interest_line = u">Conflict;"+unicode(conflict_of_interest)+u"\n"
        # The context manager guarantees the file is closed on errors too.
        with open("meta/"+str(pmid)+".csv", "w") as meta_data:
            meta_data.write(title_line.encode('utf8'))
            meta_data.write(date_line.encode('utf8'))
            meta_data.write(journal_line.encode('utf8'))
            meta_data.write(conflict_of_interest_line.encode('utf8'))
    elif abstract:
        try:
            os.remove(abstract_file_name)
        except OSError:
            print("[WARNING] => can't delete "+str(abstract_file_name))

    ## return True values if the article passes the evaluation, else False.
    return (valid_article, easy_check_passed, smart_check_passed)
Esempio n. 30
0
def evaluate_article(pmid):
    ##
    ## Evaluate a PubMed article against the configured filters.
    ##
    ## -> pmid : PubMed identifier of the article to evaluate.
    ##
    ## Filters are read from "config.conf" in the working directory
    ## (one "<option>;<value>" entry per line). When the file does
    ## not exist a built-in default configuration is used.
    ## Recognised options:
    ##   - "min year"             : oldest year accepted
    ##   - "authorized languages" : comma separated language codes
    ##   - "validation keywords"  : comma separated list, one group
    ##                              per line; every group needs a hit
    ##   - "exclusion keywords"   : comma separated list, one group
    ##                              per line; any hit rejects
    ##   - "case report only"     : "True" enables the case report
    ##                              filter
    ##
    ## Side effects:
    ##   - the abstract is handed to save_abstract() with the path
    ##     "abstract/<pmid>_abstract.txt"; that file is removed again
    ##     when the article is rejected
    ##   - "meta/<pmid>.csv" is written when the article is accepted
    ##
    ## => Return the tuple
    ##    (valid_article, easy_check_passed, smart_check_passed),
    ##    or (False, False, False) when the meta data can't be
    ##    retrieved.
    ##

    ##------------------------##
    ## Parameters for filters ##
    ##------------------------##

    ## initialize parameters
    oldest_year_authorized = "NA"
    case_report_only = False
    case_report_check = False
    authorized_languages = []
    validation_check = {}
    validation_keywords = {}
    exclusion_check = {}
    exclusion_keywords = {}
    exclusion_keywords_found = False

    ## test if config file exist
    if os.path.isfile("config.conf"):
        validation_keywords_cmpt = 0
        exclusion_keywords_cmpt = 0

        ## "with" guarantees the file is closed even if parsing fails
        with open("config.conf", "r") as config_data:
            for line in config_data:
                line_in_array = line.replace("\n", "").split(";")

                ## blank lines and entries without a value carry no
                ## usable setting, skip them instead of crashing
                if len(line_in_array) < 2:
                    continue
                option = line_in_array[0]
                value = line_in_array[1]

                if option == "min year":
                    oldest_year_authorized = value

                elif option == "authorized languages":
                    for elt in value.split(","):
                        authorized_languages.append(unicode(elt))

                ## Retrieve validation list, each line is a new group
                elif option == "validation keywords":
                    validation_keywords_cmpt += 1
                    group = "keywords_" + str(validation_keywords_cmpt)
                    validation_check[group] = False
                    validation_keywords[group] = []
                    for elt in value.split(","):
                        if elt not in validation_keywords[group]:
                            validation_keywords[group].append(str(elt))

                ## Retrieve Exclusion list, each line is a new group
                elif option == "exclusion keywords":
                    exclusion_keywords_found = True
                    exclusion_keywords_cmpt += 1
                    group = "exclusion_" + str(exclusion_keywords_cmpt)
                    exclusion_check[group] = False
                    exclusion_keywords[group] = []
                    for elt in value.split(","):
                        if elt not in exclusion_keywords[group]:
                            exclusion_keywords[group].append(str(elt))

                ## case report only option
                ## if nothing is set, default is False
                elif option == "case report only" and str(value) == "True":
                    case_report_only = True

    ## default configuration
    else:
        oldest_year_authorized = 2008
        authorized_languages = [u'eng']
        validation_check["keywords_1"] = False
        validation_check["keywords_2"] = False
        ## "machine" and "learning" are two separate keywords; without
        ## the comma between them python merges the two literals into
        ## the single string "machinelearning"
        validation_keywords["keywords_1"] = [
            "algorithm", "machine", "learning", "neural", "network",
            "statistic", "deep", "classification", "model"
        ]
        validation_keywords["keywords_2"] = [
            "Sjogren", "sjogren", "lupus", "autoimmunity", "rhumatoid",
            "arthrisis", "RA", "SjS", "SLE"
        ]

    ## always provide an (empty) exclusion group when none was configured
    if not exclusion_keywords_found:
        exclusion_check["exclusion_1"] = False
        exclusion_keywords["exclusion_1"] = []

    ##---------------##
    ## The Easy Part ##
    ##---------------##
    ## get meta data on the articles, any failure here (network,
    ## missing field, unknown pmid) rejects the article
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        informations = read(handle)
        stuff = informations[u'PubmedArticle'][0]
        citation = stuff[u'MedlineCitation']

        ## get date from the history attribute; the second entry is
        ## assumed to be the date of acceptation - TODO confirm
        date = stuff[u'PubmedData']["History"][1]
        month = date[u'Month']
        day = date[u'Day']
        year = date[u'Year']

        ## get the name of the review
        journal_name = citation[u'MedlineJournalInfo'][u'MedlineTA']

        ## get the keywords for the articles
        ## (not used by the filters below; the lookup is kept so a
        ## record without this field is still rejected here)
        keywords_list = citation[u'KeywordList']

        ## Get the author's conflict of interest, optional field
        try:
            conflict_of_interest = citation[u'CoiStatement']
        except Exception:
            conflict_of_interest = "NA"

        ## Get title of the article
        article_title = citation[u'Article'][u'ArticleTitle']

        ## Get language of the article
        article_language = citation[u'Article'][u'Language'][0]

        ## Get country of publications
        country = citation[u'MedlineJournalInfo'][u'Country']

    except Exception:
        return (False, False, False)

    ##----------------##
    ## The Smart Part ##
    ##----------------##
    ## run further analysis on the abstract using nltk

    ## fetch the abstract and save it on disk
    abstract_file_name = "abstract/" + str(pmid) + "_abstract.txt"
    abstract = fetch_abstract(pmid)
    if abstract:
        save_abstract(abstract, abstract_file_name)
        ## result is not used below; the call is kept in case
        ## load_text has side effects - TODO confirm
        abstract_text = load_text(abstract_file_name)

        ## Play with tokenization and chunking
        try:
            tokens = nltk.word_tokenize(abstract.encode('utf8'))
            tagged = nltk.pos_tag(tokens)
            entities = nltk.chunk.ne_chunk(tagged)
        except Exception:
            print("[WARNINGS] => can't perform nlp operation")
            entities = []

        ## Get all the common names in the abstract (no duplicates)
        noun_tags = ("NN", "NNS", "NNP")
        names_found_in_abstract = []
        for item in entities:
            try:
                if (item[1] in noun_tags
                        and item[0] not in names_found_in_abstract):
                    names_found_in_abstract.append(item[0])
            except Exception:
                ## best effort: ignore the elements that can't be
                ## read as a (word, tag) pair
                continue

        ## Check validation list
        for key, keywords in validation_keywords.items():
            if any(name in keywords for name in names_found_in_abstract):
                validation_check[key] = True

        ## Check exclusion list
        for key, keywords in exclusion_keywords.items():
            if any(name in keywords for name in names_found_in_abstract):
                exclusion_check[key] = True

        ## Check if is a case report
        if case_report_only:
            print("[DEBUG] => Case report only")
            if article_is_a_case_report(abstract_file_name):
                case_report_check = True

    ##--------------##
    ## PASS OR FAIL ##
    ##--------------##

    ## Basic check on meta data
    ## - check date; a min year that is not a number (e.g. the "NA"
    ##   left when the config file has no "min year" entry) means
    ##   that no date restriction is applied
    try:
        min_year = int(oldest_year_authorized)
    except (TypeError, ValueError):
        min_year = None
    check_date = min_year is None or int(year) >= min_year

    ## - check language
    check_language = article_language in authorized_languages

    ## Easy Filter
    easy_check_passed = check_date and check_language

    ## Complex filter
    smart_check_passed = True

    ## - inclusion: every validation group must have been found
    if False in validation_check.values():
        smart_check_passed = False

    ## - exclusion: no exclusion group may have been found
    if True in exclusion_check.values():
        smart_check_passed = False

    ## - case report filter
    ## NOTE(review): with "case report only" enabled this rejects the
    ## articles detected as case reports; the option name suggests the
    ## opposite, confirm the intended direction before changing it.
    if case_report_only and case_report_check:
        print("[DEBUG] => EXLUDED")
        smart_check_passed = False

    ## Global check
    valid_article = easy_check_passed and smart_check_passed

    ##-------------##
    ## SAVING DATA ##
    ##-------------##
    ## Write and delete files
    if valid_article:

        ## Save meta data in a text file
        ## for further use
        meta_lines = [
            u'>Title;' + unicode(article_title) + u"\n",
            u'>Date;' + unicode(day) + u"/" + unicode(month) + u"/" +
            unicode(year) + u"\n",
            u">Journal;" + unicode(journal_name) + u"\n",
            u">Country;" + unicode(country) + u"\n",
            u">Conflict;" + unicode(conflict_of_interest) + u"\n",
        ]
        ## "with" closes the file even when a write fails
        with open("meta/" + str(pmid) + ".csv", "w") as meta_data:
            for meta_line in meta_lines:
                meta_data.write(meta_line.encode('utf8'))

    elif abstract:
        ## Delete the abstract of a rejected article
        try:
            os.remove(abstract_file_name)
        except OSError:
            print("[WARNING] => can't delete " + str(abstract_file_name))

    ##------------------##
    ## RETURN SOMETHING ##
    ##------------------##
    ## return True if the article pass the
    ## evaluation, else False.
    return (valid_article, easy_check_passed, smart_check_passed)