def fetch_abstract(pmids):
    pmid_str = ",".join(pmids)
    try:
        handle = efetch(db='pubmed', id=pmid_str, retmode='xml')
    except urllib.error.HTTPError:
        # retry once on a transient HTTP error
        handle = efetch(db='pubmed', id=pmid_str, retmode='xml')

    xml_data = read(handle)['PubmedArticle']

    try:
        articles = [rec['MedlineCitation'] for rec in xml_data]
    except KeyError:
        articles = None

    return articles
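
All of the snippets on this page assume Biopython's Entrez helpers are already imported and that a contact email has been registered with NCBI. A minimal setup and call for the function above might look like this (the PMID and email are only illustrations):

from Bio import Entrez
from Bio.Entrez import efetch, read

Entrez.email = 'you@example.org'  # NCBI requires a contact address for E-utilities

citations = fetch_abstract(['17409107'])  # illustrative PMID
if citations:
    print(citations[0]['Article']['ArticleTitle'])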
Example #2
def author(pmid):

    handle = efetch(db='pubmed', id=pmid, retmode='xml')

    xml_data = read(handle)['PubmedArticle'][0]
    data = xml_data['MedlineCitation']['Article']['AuthorList']

    author_list = []

    for author in data:
        name = author['ForeName'] + ' ' + author['LastName']
        aff_info = author['AffiliationInfo']
        aff = aff_info[0]['Affiliation'] if aff_info else ''

        author_list.append('#Name ' + name)
        if aff:
            author_list.append('#Affiliation ' + aff)

    return ' '.join(author_list)
Example #3
 def get_terms_abstract(self, pubmed_id):
     handle = efetch(db='pubmed', id=pubmed_id, retmode='text', rettype='abstract', email='*****@*****.**')
     text = handle.read()
     terms = TermsBag()
     # terms.termine_service(text)
     self.view.terms[self.view.pubmed_id.isin([int(pubmed_id)])] = terms.termine_service(text)
     self.terms = terms.to_data_frame()
Example #4
def fetch_abstract(pmid):
    ##
    ## Return the abstract of a given
    ## article using its pmid
    ##
    ## => Return None when the abstract can't be
    ## retrieved (can happen when the article is in Chinese)
    ##

    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        xml_data = read(handle)
        xml_data = xml_data['PubmedArticle'][0]

    except Exception:
        return None

    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        return None
    except Exception:
        # any other parsing failure also returns None
        return None
Example #5
def fetch_abstract(id_list):
    ids = ','.join(id_list)
    handle = efetch(db='pubmed', id=ids,
                    retmode='xml')  ### this step returns xml object
    #results = Entrez.read(handle) #this step read xml to string
    #return results
    return handle
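
Since this variant returns the raw handle rather than parsed records, the caller is expected to do the parsing itself; a minimal sketch, assuming the Bio.Entrez setup shown earlier and illustrative PMIDs:

handle = fetch_abstract(['17409107', '23271346'])
records = Entrez.read(handle)
print(len(records['PubmedArticle']))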
Example #6
def pubmed(gi, ids, query):
    """
    Get the pubmed articles listed by *ids
    """
    for article_id in ids:
        handle = efetch(db="pubmed", id=article_id, retmode='xml', rettype='xml', retmax=MAX_RETURN)
        try:
            results = eread(handle)
            for citation in results:
                citation = citation['MedlineCitation']
                pmid = citation['PMID']
                article = citation['Article']
                title = article['ArticleTitle']
                journal = article['Journal']['Title']
                try:
                    date = citation['DateCompleted'] if 'DateCompleted' in citation else citation['DateCreated']
                    datetime = "%s-%s-%s" % (date['Year'], date['Month'], date['Day'])
                except Exception:
                    datetime = '0000-00-00'

                runtime().debug("Parsed pmid:%s" % article_id)
                yield Citation(gi, pmid, title, journal, datetime, query)
        except Exception:
            runtime().debug("Failure fetching pmid:%s" % article_id)
            continue
        finally:
            handle.close()
Example #7
def fetch_abstract(pmid):
    print(pmid)
    Entrez.email = '*****@*****.**'
    handle = efetch(db='pubmed', id=pmid, retmode='xml')

    xml_data = read(handle)
    return xml_data
Example #8
def specialization(author,affiliation):
    # import libraries
    import wikipedia
    import re
    from Bio.Entrez import efetch, read
    author = '"'+author+'"'
    
    # Find ID's for doctor + affiliation
    ids = []
    results = search('%s[AUTH] AND %s[AFFL]' % (author,affiliation))['IdList']
    for i in results:
        ids.append(i)    
    num_paper = len(ids)
    
    # get abstracts from list of ID's
    query_abstracts = ''
    keywords = []
    query_keywords = ''
    query_title = '' 
    for i in ids:
        xml_data = read(efetch(db='pubmed', id=i, retmode='xml'))
        try:
            abstract = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText']
            query_abstracts = query_abstracts + str(abstract) + ' '
        except:
            print('Paper with ID: ' + i + ' has no abstract')
            
    # get keywords from IDs
        if xml_data['PubmedArticle'][0]['MedlineCitation']['KeywordList'] != []:
            for x in xml_data['PubmedArticle'][0]['MedlineCitation']['KeywordList'][0] :
                keywords.append(str(re.sub("[^a-zA-Z]", " ", x)))
                query_keywords = query_keywords + x + ' '   
                
    # get paper title from IDs
        try:
            query_title = query_title + ' ' + xml_data['PubmedArticle'][0]['MedlineCitation']['Article']['ArticleTitle']
        except:
            print('Paper with ID: ' + i + ' has no title')
     
    # get wiki pages first sentence of keywords
    query_wiki = ''
    for keyword in keywords:
        try:
            page = wikipedia.summary(keyword,sentences = 1)
            query_wiki = query_wiki + ' ' + str(re.sub("[^a-zA-Z]", " ", page))
        except:
            print('Disambiguation error for keyword: '+keyword+', action: keyword excluded')
        
    
    # find specialism
    corpus = query_abstracts + ' ' + query_keywords + ' ' + query_wiki + ' ' + query_title 
    specialization = str(spec_search(corpus))
    
    if num_paper == 0:
        print('no papers found')
        specialization = []
    else:
        print('this doctor is specialized in: '+specialization)
    return specialization
Example #9
 def query(self, term):
     id_list = self.__term_querier.query_matching_paper_ids(term)
     id_list_string = ','.join(id_list)
     handle = efetch(db=self.__db,
                     id=id_list_string,
                     retmode=self.__retmode)
     papers = self.__entrez.read(handle)
     return papers['PubmedArticle']
Example #10
def get_abstract(pmid):
    """
	function to collect abstract from pubmed

	shameless taken from: https://stackoverflow.com/questions/17409107/obtaining-data-from-pubmed-using-python
	"""
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    return handle.read()
Example #11
def print_xml(pmid):
    """
    from the result of the query (pubmed ids) to parse out title, abstract, article type, and journal info
    :param pmid: list of PubMed IDs after querying
    :return: list of titles,abstracts, types, and journal names
    """
    handle = efetch(db='pubmed',
                    id=','.join(map(str, pmid)),
                    retmode='xml',
                    rettype='text')
    # try:
    print('entering print_xml')
    doc = handle.read()
    # except http.client.IncompleteRead:
    #     continue
    doc = xmltodict.parse(doc)
    doc = json.loads(json.dumps(doc))
    print('have read the doc')
    d = doc['PubmedArticleSet']["PubmedArticle"]
    titles = []
    types = []
    abstracts = []
    jour_names = []
    for i in d:  # iterate through each article
        # find journal information
        if i["MedlineCitation"]['Article']['Journal']['Title'] is not None:
            jour_name = i["MedlineCitation"]['Article']['Journal']['Title']
            jour_names.append(jour_name)
        else:
            jour_names.append('no journal found')
        # find title information
        t = i["MedlineCitation"]['Article']['ArticleTitle']
        if t is None:
            t = "no title"
        elif not isinstance(t, str):
            t = t['#text']
        titles.append(t)
        if 'Abstract' in i['MedlineCitation']['Article']:
            abstracts.append(
                i['MedlineCitation']['Article']['Abstract']['AbstractText'])
        else:
            abstracts.append('no abstract')
        # find type of article
        pub_type = i['MedlineCitation']['Article']['PublicationTypeList'][
            'PublicationType']
        if isinstance(pub_type, dict):
            types.append(pub_type['#text'])
        else:
            type_stripped = [pt['#text'] for pt in pub_type]
            types.append(', '.join(type_stripped))
    return titles, abstracts, types, jour_names
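
A caller would unpack the four parallel lists that print_xml returns; a minimal sketch with illustrative PMIDs:

titles, abstracts, types, jour_names = print_xml([17409107, 23271346])
for title, journal in zip(titles, jour_names):
    print(title, '-', journal)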
Example #12
File: pubmed_api.py  Project: mac389/lovasi
    def fetch_abstract(self, pmid):
        handle = efetch(db='pubmed', id=pmid, retmode='xml', email='*****@*****.**', retmax=1000)
        xml_data = read(handle)[0]
        try:
            article = xml_data['MedlineCitation']['Article']
            abstract = article['Abstract']['AbstractText'][0]
            return abstract
        except (IndexError, KeyError):
            return None
Example #13
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        return None
Example #14
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = handle.read()
    # note: this simple pattern only matches AbstractText elements on a single line
    r = re.compile('<AbstractText>(.*?)</AbstractText>')
    m = r.search(xml_data)
    if m:
        abstract = m.group(1)
        return abstract
    else:
        return ""
Example #15
def fetch_abstract(pmid):  # not really being used at all, just a ref. function
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = Entrez.read(handle)
    print(xml_data)
    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']
        return abstract
    except (IndexError, KeyError):
        return None
Example #16
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)
    try:
        article = xml_data['PubmedArticle'][0]['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        title = article['ArticleTitle']
        return abstract, title
    except (IndexError, KeyError):
        return None
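
Because this version returns either an (abstract, title) tuple or None, a caller should check before unpacking; a small sketch with an illustrative PMID:

result = fetch_abstract('17409107')
if result is not None:
    abstract, title = result
    print(title)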
Example #17
File: bio.py  Project: rhyswat/blewog
def fetch_abstract(pmid):
    """Pass in an article id."""
    pmid = str(pmid)
    try:
        handle = efetch(db='pubmed', id=pmid, retmode='xml')
        xml_data = read(handle)[0]
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except Exception as e:
        return '{}: {}'.format(e.__class__.__name__, e)
Example #19
def fetch_abstract(pmid):

    Entrez.email = "*****@*****.**"

    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]

    try:
        article = xml_data['MedlineCitation']['Article']
        abstract = article['Abstract']['AbstractText'][0]
        return abstract
    except (IndexError, KeyError):
        return None
Example #20
    def __init__(self, pmid):
        self.pmid = pmid
        Entrez.email = '*****@*****.**'

        inCache = False
        if self.pmid in pmidTermFind.pmid_xmlData_HISTORY:  # If ID in cache
            self.xml_data = pmidTermFind.pmid_xmlData_HISTORY[self.pmid]
            inCache = True

        if not inCache:
            self.handle = efetch(db='pubmed', id=self.pmid, retmode='xml')
            self.xml_data = Entrez.read(self.handle)[0]
            pmidTermFind.pmid_xmlData_HISTORY[pmid] = self.xml_data
Example #21
    def get_mesh_from_pmid(self, user):
        Entrez.email = user.email
        handle = efetch(db="pubmed", id=str(self.pmid), retmode="xml")
        xml_data = read(handle)[0]

        # Skips articles without MeSH terms
        if u'MeshHeadingList' in xml_data["MedlineCitation"]:
            for mesh in xml_data["MedlineCitation"][u'MeshHeadingList']:
                major = "N"
                qualifiers = mesh[u'QualifierName']
                if len(qualifiers) > 0:
                    major = str(list(qualifiers[0].attributes.items())[0][1])
                descr = mesh[u'DescriptorName']
                name = descr.title()
Example #22
    def get_mesh_from_pmid(self, user):
        Entrez.email = user.email
        handle = efetch(db="pubmed", id=str(self.pmid), retmode="xml")
        xml_data = read(handle)[0]

        # Skips articles without MeSH terms
        if u'MeshHeadingList' in xml_data["MedlineCitation"]:
            for mesh in xml_data["MedlineCitation"][u'MeshHeadingList']:
                major = "N"
                qualifiers = mesh[u'QualifierName']
                if len(qualifiers) > 0:
                    major = str(list(qualifiers[0].attributes.items())[0][1])
                descr = mesh[u'DescriptorName']
                name = descr.title()
Example #23
def fetch_article(pmid):
	"""
	Test function
	=> Not working
	"""
	handle = efetch(db='pubmed', id=pmid, retmode='xml', )
	xml_data = read(handle)[0]

	try:
		article = xml_data['MedlineCitation']['Article']
		# raises if there is no abstract, so only articles with abstracts are returned
		abstract = article['Abstract']['AbstractText'][0]
		return article

	except (IndexError, KeyError):
		return None
Example #24
def add_pmid_article_to_database(article_id):
    """
    Given a PMID, use external APIs to get the necessary article data
    in order to add the article to our database.
    """
    if len(list(get_article_object(article_id))) == 0:
        pmid = str(article_id)
        handle = efetch("pubmed", id=[pmid], rettype="medline", retmode="text")
        records = list(Medline.parse(handle))
        records = records[0]
        if "TI" not in records:
            return False  # catch bad PMIDs
        article_info = {}
        article_info["title"] = records["TI"]
        article_info["PMID"] = pmid
        article_info["authors"] = ', '.join(records["AU"])
        article_info["abstract"] = records["AB"]
        article_info["DOI"] = getDOI(records["AID"])
        article_info["experiments"] = ""
        article_info["metadata"] = str({"meshHeadings": []})
        article_info["reference"] = None
        identity = ""
        try:
            # note: eval() on a remote response is risky; json.loads would be safer
            article_info["experiments"] = {
                "locations":
                eval(
                    urllib.request.urlopen(
                        "http://neurosynth.org/api/studies/peaks/" +
                        str(pmid) + "/").read().decode())["data"]
            }
            k = article_info["experiments"]["locations"]
            for i in range(len(k)):
                if len(k[i]) == 4:
                    identity = k[i][0]  # first element is the location id
                    k[i] = k[i][1:]
                k[i] = ",".join([str(x) for x in (k[i])])
        except BaseException:
            pass
        article_info["id"] = identity
        article_info["experiments"] = [article_info["experiments"]]
        Articles.insert(abstract=article_info["abstract"],
                        authors=article_info["authors"],
                        doi=article_info["DOI"],
                        experiments=article_info["experiments"],
                        pmid=article_info["PMID"],
                        title=article_info["title"]).execute()
        return True
    return False
Example #25
def date(pmid):

    handle = efetch(db='pubmed', id=pmid, retmode='xml')

    xml_data = read(handle)['PubmedArticle'][0]
    data = xml_data['MedlineCitation']['Article']['Journal']['JournalIssue'][
        'PubDate']

    day = data['Day'] if 'Day' in data else ''

    return data['Year'] + ' ' + data['Month'] + ' ' + day
Example #26
def get_metadata_from_PMID(pmid):
    """This function will take an input PMID and parse the attributes I am interested in for the cytoscape plugin...."""
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False
    try:
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCompleted'])
    except Exception:
        print("Date Completed not available??", pmid)

    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except Exception:
        print("Other ID not available??", pmid)
    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except Exception:
        print("Unable to get MeSH headings for", pmid)

    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output: print(xml_data)
        for author in article['AuthorList']:
            # count how many times each author has been seen so far
            author_key = author['LastName'] + ',' + author['Initials']
            if author_key in global_author_list:
                global_author_list[author_key] += 1
            else:
                global_author_list[author_key] = 1
    except IndexError:
        return None
    except Exception:
        print("unable to process", pmid)
        print("Unexpected error:", sys.exc_info()[0])

    try:
        abstract = article['Abstract']['AbstractText'][0]
    except Exception:
        print("Unable to get abstract for", pmid)
        print("Unexpected error:", sys.exc_info()[0])
Example #27
def get_mesh(pmid):
    # call PubMed API
    handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
    xml_data = read(handle)[0]
    # skip articles without MeSH terms
    if u'MeshHeadingList' in xml_data['MedlineCitation']:
        for mesh in xml_data['MedlineCitation'][u'MeshHeadingList']:
            # grab the qualifier major/minor flag, if any
            major = 'N'
            qualifiers = mesh[u'QualifierName']
            if len(qualifiers) > 0:
                major = str(list(qualifiers[0].attributes.items())[0][1])
            # grab descriptor name
            descr = mesh[u'DescriptorName']
            name = descr.title()

            yield(name, major)
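
get_mesh is a generator, so a caller iterates over the (name, major) pairs it yields; a minimal sketch with an illustrative PMID:

for name, major in get_mesh(23271346):
    print(name, major)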
Example #28
def get_article_title(pmid):
    """
	Connect to pubmed database and get the article
	title of the given pmid.
	Return NA if faild
	"""
    handle = efetch(
        db='pubmed',
        id=pmid,
        retmode='xml',
    )
    xml_data = read(handle)
    xml_data = xml_data['PubmedArticle'][0]
    try:
        title = xml_data['MedlineCitation']['Article']['ArticleTitle']
    except Exception:
        title = "NA"
    return title
Example #29
File: Entrez.py  Project: snouto/CloudApp
    def getDocument(self, pmid):
        # this method will return all associated attributes for an article
        # including Article Title, Publication Date, Authors' Names, Citations... etc.
        # it will return it as a python dictionary suitable for storage in mongodb
        handle = efetch(db='pubmed', id=str(pmid), retmode='xml')
        xml_data = read(handle)[0]

        citation = xml_data['MedlineCitation']
        article = dict(
            id=pmid,
            Title=str(citation['Article'][u'ArticleTitle']),
            Abstract=str(self.safeAbstract(citation, u'Abstract')),
            DateCompleted="{}/{}/{}".format(
                self.safeDateCompleted(citation, 'DateCompleted', 'Day'),
                self.safeDateCompleted(citation, 'DateCompleted', 'Month'),
                self.safeDateCompleted(citation, 'DateCompleted', 'Year')),
            DateRevised="{}/{}/{}".format(
                self.safeDateCompleted(citation, 'DateRevised', 'Day'),
                self.safeDateCompleted(citation, 'DateRevised', 'Month'),
                self.safeDateCompleted(citation, 'DateRevised', 'Year')))

        return (xml_data, article)
Example #30
def fetch_article(pmid):
	"""
	Test function
	=> Not working
	"""
	handle = efetch(db='pubmed', id=pmid, retmode='xml', )
	informations = read(handle)

	stuff = informations[u'PubmedArticle'][0] 
	date = stuff[u'PubmedData']["History"][1]
	month = date[u'Month']
	day = date[u'Day']
	year = date[u'Year']

	print(month)
	print(day)
	print(year)

	return "choucroute"
Example #31
File: trashlib.py  Project: Nurtal/BIBOT
def fetch_abstract(pmid):
	"""
	Retrun abstract of a given
	article using pmid

	=> Return None when pmid can't be return
	(can happen when article is in chinese)
	"""
	handle = efetch(db='pubmed', id=pmid, retmode='xml', )
	xml_data = read(handle)
	xml_data = xml_data['PubmedArticle'][0]
	
	try:
		article = xml_data['MedlineCitation']['Article']
		abstract = article['Abstract']['AbstractText'][0]
		return abstract
	except IndexError:
		return None
	except KeyError:
		return None
Example #32
def get_country_publication_stat(run_folder):
	##
	## Get the publication stats per country.
	## Get the list of pmid retrieved from the
	## meta folder and connect to the NCBI to fetch
	## publication information, parse it to get the
	## country of publication.
	##
	## return a dictionary
	##

	## init structure
	country_to_count = {}

	## get list of PMID to process
	meta_file_list = glob.glob(run_folder+"/meta/*.csv")
	for meta_file in meta_file_list:
		meta_file_in_array = meta_file.split("/")
		file_name = meta_file_in_array[-1]
		file_name_in_array = file_name.split(".")
		pmid = file_name_in_array[0]

		## get country of publication
		try:
			handle = efetch(db='pubmed', id=pmid, retmode='xml', )
			informations = read(handle)
			stuff = informations[u'PubmedArticle'][0]
			country = stuff[u'MedlineCitation'][u'MedlineJournalInfo'][u'Country']
		except Exception:
			country = "NA"

		## fill dictionary
		if country not in country_to_count:
			country_to_count[country] = 1
		else:
			country_to_count[country] += 1

	return country_to_count
Example #33
        def fetch_abstract(pmid):
            '''
            This method was originally written by Karol.  Colin Hortman added the try/except block to handle articles
            for which no abstract was found
            http://stackoverflow.com/questions/17409107/obtaining-data-from-pubmed-using-python

            :param pmid: PubMed ID
            :return: Abstract ( will return as StringElement but behaves mostly like a string )
            '''
            handle = efetch(db='pubmed', id=pmid, retmode='xml')
            xml_data = Entrez.read(handle)[0]
            try:
                article = xml_data['MedlineCitation']['Article']
                try:
                    abstract = article['Abstract']['AbstractText'][0]
                    return abstract
                except KeyError:
                    pass
                    #print('No Abstract found for PMID: ', pmid)
            except IndexError:
                return None
            return None
Example #34
def date(pmid):

    handle = efetch(db='pubmed', id=pmid, retmode='xml')

    xml_data = read(handle)['PubmedArticle']

    if xml_data:

        xml_list = xml_data[0]

        data = xml_list['MedlineCitation']['Article']['Journal'][
            'JournalIssue']['PubDate']

        day = data['Day'] if 'Day' in data else ''
        month = data['Month'] + ' ' if 'Month' in data else ''
        year = data['Year'] + ' ' if 'Year' in data else ''

        return year + month + day

    else:
        return ''
Example #35
File: bibotlite.py  Project: Nurtal/BIBOT
def fetch_article(pmid):
    """
	Test function
	=> Not working
	"""
    handle = efetch(
        db='pubmed',
        id=pmid,
        retmode='xml',
    )
    informations = read(handle)

    stuff = informations[u'PubmedArticle'][0]
    date = stuff[u'PubmedData']["History"][1]
    month = date[u'Month']
    day = date[u'Day']
    year = date[u'Year']

    print(month)
    print(day)
    print(year)

    return "choucroute"
Example #36
    def parse_pmc_ids(self, pmcid, retmode='xml'):

        #get xml from pmc
        handle = efetch(db='pmc', id=pmcid, retmode=retmode)
        xml_string = handle.read()
        xml = ET.fromstring(xml_string)

        #check for keywords and MeshTerms
        keys = []

        for art in xml:  # iterate directly; getchildren() is deprecated

            #title
            title = ''.join(art.xpath('.//article-meta//article-title/text()'))

            #authors
            auth = zip(
                art.findall('.//*[@contrib-type="author"]/name/given-names'),
                art.findall('.//*[@contrib-type="author"]/name/surname'))
            auth = ';'.join([' '.join([i.text, j.text]) for (i, j) in auth])

            #affiliations
            aff = ';'.join(art.xpath('.//aff/text()'))

            #publication_date
            pub_date = '-'.join(
                art.xpath('.//article-meta/pub-date[@pub-type="epub"]/*/text()'
                          )[::-1])

            #ids
            ##pubmed_id
            pubmed_id = ''.join(
                art.xpath(
                    './/article-meta/article-id[@pub-id-type="pmid"]/text()'))
            ##doi
            doi = ''.join(
                art.xpath(
                    './/article-meta/article-id[@pub-id-type="doi"]/text()'))
            ##pmcid
            pmcid = ''.join(
                art.xpath(
                    './/article-meta/article-id[@pub-id-type="pmc"]/text()'))

            #abstract
            abstract = ''.join(art.xpath('.//abstract//*/text()'))

            #fulltext
            full_text = ''.join(art.xpath('.//body//*/text()'))

            #journal
            journal = ''.join(
                art.xpath(
                    '//journal-meta/journal-id[@journal-id-type="iso-abbrev"]/text()'
                ))

            #pmcclass
            pmcclass = ''.join(
                art.xpath(
                    './/article-meta/article-categories//subject/text()'))

            #pmc_keywords
            pmc_keywords = art.xpath('.//kwd/text()')

            keys.append((pmcid, title, auth, aff, pub_date, pubmed_id, doi,
                         abstract, full_text, journal, pmcclass, pmc_keywords))

        return keys, xml_string
Example #37
def add_pmid_article_to_database(article_id):
    """
    Given a PMID, use external APIs to get the necessary article data
    in order to add the article to our database.
    """

    pmid = str(article_id)
    try:
        handle = efetch("pubmed", id=[pmid], rettype="medline", retmode="text")
    except BaseException:
        return False  # Could not access correct pubmed ID

    records = list(Medline.parse(handle))
    records = records[0]
    article_info = {}
    article_info["title"] = records.get("TI")
    article_info["PMID"] = pmid
    article_info["authors"] = ', '.join(records.get("AU", []))
    article_info["abstract"] = records.get("AB")
    article_info["DOI"] = getDOI(records.get("AID", []))
    article_info["experiments"] = []
    article_info["metadata"] = str({"meshHeadings": []})
    article_info["reference"] = None
    identity = ""
    try:
        # note: eval() on a remote response is risky; json.loads would be safer
        locations_list = eval(
            urllib.request.urlopen(
                "http://neurosynth.org/api/studies/peaks/" +
                str(pmid) +
                "/").read().decode())["data"]

        id_map = {}
        greatest_id = 89999
        current_exp = None

        for loc in locations_list:
            current_loc_id = None
            vals = loc
            if len(loc) == 4:
                current_loc_id = loc[0]
                vals = vals[1:]
            # vals is the x, y, z array; current_loc_id is the Neurosynth ID
            if current_loc_id not in id_map:
                greatest_id += 1
                id_map[current_loc_id] = greatest_id
                if current_exp is not None:
                    # add the current experiment if it's not None
                    article_info["experiments"].append(current_exp)
                current_exp = {
                    "caption": "",
                    "locations": [],
                    "descriptors": [],
                    "contrast": "",
                    "space": "",
                    "effect": ""
                }
            current_exp["locations"].append(",".join([str(v) for v in vals]))
        if current_exp is not None:
            article_info["experiments"].append(current_exp)
    except BaseException:
        pass

    Articles.create(abstract=article_info["abstract"],
                    authors=article_info["authors"],
                    doi=article_info["DOI"],
                    experiments=str(article_info["experiments"]),
                    pmid=article_info["PMID"],
                    title=article_info["title"])
    return True
Example #38
File: pull.py  Project: huihuifan/jq1
def fetch_abstract():

    colnames = ['pmids']
    data = pandas.read_csv("pmids.csv", names=colnames)

    pmids = list(data.pmids)

    full = []

    for i in pmids:

            handle = efetch(db='pubmed', id=i, retmode='xml')

            xml_data = handle.read()

            soup = BeautifulSoup(xml_data)

            a_recs = []

            for tag in soup.findAll("pubmedarticle"): 

                title = tag.articletitle.text
                journal = tag.findAll("journal")

                try:
                    for info in journal:
                        year = info.find("year").text
                except:
                    for info in journal:
                        year = info.find("year")

                for a_tag in tag.findAll("author"):
                    a_rec = {}
 
                    a_rec['year'] = year

                    a_rec['title'] = title

                    a_rec['pmid'] = int(tag.pmid.text)
                    
                    try:
                        a_rec['lastname'] = str(a_tag.lastname.text)
                    except:
                        a_rec['lastname'] = str(a_tag.lastname)
                    
                    try:
                        a_rec['forename'] = str(a_tag.forename.text)
                    except:
                        a_rec['forename'] = str(a_tag.forename)
                    
                    try: 
                        a_rec['initials'] = str(a_tag.initials.text)
                    except:
                        a_rec['initials'] = str(a_tag.initials)

                    try:
                        a_rec['affiliation'] = str(a_tag.affiliation.text)
                    except:
                        a_rec['affiliation'] = str(a_tag.affiliation)
                    
                    a_recs.append(a_rec)
            

            #full.append(a_recs)
            full.append(a_recs)

            def convert(input):
                # recursively normalize parsed values (Python 3: decode any bytes)
                if isinstance(input, dict):
                    return {convert(key): convert(value) for key, value in input.items()}
                elif isinstance(input, list):
                    return [convert(element) for element in input]
                elif isinstance(input, bytes):
                    return input.decode('utf-8')
                else:
                    return input

            full = convert(full)

            print("running")

            #article = ET.XML(xml_data)

            #print article.find('AuthorList').findall('Author')

    for entry in full:
        #print entry
        keys = ['year', 'title', 'pmid', 'lastname', 'forename', 'initials', 'affiliation']
        f = open("citations.csv", "ab")
        dict_writer = csv.DictWriter(f, keys)
        dict_writer.writer.writerow(keys)
        dict_writer.writerows(entry)

    return 
Example #39
def fetch_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    data = handle.read()
    return data
Example #40
        hit_gi = int(aln.hit_id.split("|")[1])
        if hit_gi in processed_ids:
            continue
        else:
            processed_ids[hit_gi] = None
        if "LOW QUALITY" in aln.hit_def:
            print '\t\t"LOW QUALITY" in seq name'
            continue
        for hsp in aln.hsps:
            if float(hsp.expect) < EXPECT:
                qlen = len(hsp.query.replace("-",""))
                coverage = (float(qlen)/ float(record.query_letters))*100.0
                if coverage > 80.0:
                    sleep(5)
                    print "\t\tExp: %f start: %i Qend: %i Coverage%%: %f" % (float(hsp.expect), hsp.query_start, hsp.query_end, coverage)
                    ef_handle = efetch(db="protein", id=hit_gi, rettype="fasta", retmode="text")
                    fasta_txt = ef_handle.read()
                    while fasta_txt.find("unavailable") >=0:
                        print "\t\tentrez is failing hard. sleeping. (id:%i)" % hit_gi
                        sleep(5)
                        ef_handle = efetch(db="protein", id=hit_gi, rettype="fasta", retmode="text")
                        fasta_txt = ef_handle.read()

                    # edit fasta_txt
                    fasta_lines = fasta_txt.splitlines()
                    fasta_header = fasta_lines[0]
                    fasta_seq = "".join(fasta_lines[1:])

                    fasta_header = fasta_header.replace('predicted protein', 'pred.')
                    fasta_header = fasta_header.replace('hypothetical protein', 'pred.')
                    fasta_header = fasta_header.replace('PREDICTED:', 'pred.')
Example #41
def collect_NCBI():
    global all_pmids
    global pmid_dict

    if os.path.exists(f'./{rel_name}/{rel_name}_pmid_dict.json'):
        with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'r') as f:
            jd = f.read()
            temp_dict = json.loads(jd)
        pmid_dict.update(temp_dict)
        return pmid_dict

    for idx in tqdm(range(len(all_pmids))):
        pmid = all_pmids[idx]
        # get records for each pmid
        fetch_records_handle1 = efetch(db="pubmed",
                                       id=str(pmid),
                                       rettype="medline",
                                       retmode="text")
        # parse fetched records
        records1 = Medline.parse(fetch_records_handle1)

        # Need to iterate over records to extract information
        for record1 in records1:
            # try except check to be sure that NCBI is not returning empty result
            try:
                # let's get pmcid if exists
                id2 = record1['PMC'][3:]
                #print('PMC',id2)

                # get records for pmcid
                fetch_records_handle2 = efetch(db="pubmed",
                                               id=str(id2),
                                               rettype="medline",
                                               retmode="text")
                # parse records for pmcid
                records2 = Medline.parse(fetch_records_handle2)

                # Need to iterate over records to extract information
                '''
                Collect following information: authors, authors' affiliations, publication date, citations, grants
                Store all these information in an dictionary (pmid_dict)
                '''
                for record2 in records2:
                    authors = record2['FAU']
                    affiliations = record2['AD']
                    pub_date = record2['DCOM']
                    citations = get_links_id(pmid)
                    grants = record2['GR']
                    pmid_dict[pmid] = {
                        'pmcid_number': id2,
                        'pmcid': True,
                        'authors': authors,
                        'affiliations': affiliations,
                        'grants': grants,
                        'pub_date': pub_date,
                        'citations': citations
                    }
            except Exception:
                authors = record1['FAU']
                try:
                    affiliations = record1['AD']
                except KeyError:
                    affiliations = ''
                try:
                    pub_date = record1['DCOM']
                except KeyError:
                    pub_date = ''
                try:
                    citations = get_links_id(pmid)
                except Exception:
                    citations = ''
                try:
                    grants = record1['GR']
                except KeyError:
                    grants = ''
                pmid_dict[pmid] = {
                    'pmcid_number': '',
                    'pmcid': False,
                    'authors': authors,
                    'affiliations': affiliations,
                    'grants': grants,
                    'pub_date': pub_date,
                    'citations': citations
                }

    with open(f'./{rel_name}/{rel_name}_pmid_dict.json', 'w') as output:
        output.write(json.dumps(pmid_dict))

    return pmid_dict
Example #42
            csvfile.writerow(['Quad', 'Quad_Alpha', 'V1', 'V2', 'V3', 'V4', 'L1', 'L2', 'L3', 'L4', 'L5', 'L6'])
            for x,y in zip(verts, edge_lengths):
                quad = [TOLC[atoms[z]['res']] for z in x]
                resverts = [atoms[z]['resseq'] for z in x]
                csvfile.writerow([''.join(quad), ''.join(sorted(quad))]+resverts+y)

    # removing very irregular tetrahedra
        
    if task_profile or task_threading:
        # Calculate potentials
        orig_sim_pot = simplex_potential([[atoms[x]['res'] for x in y] for y in verts]) 
        orig_res_pot = residue_potential(len(atoms), verts, orig_sim_pot)

    if task_threading: 	
        if seq_download:
            seqfile = efetch(db="protein", id=seq_id, rettype=seq_format)
        else:
            seqfile = seq_filename
        ids,seq = zip(*[(x.id, x.seq) for x in AlignIO.read(seqfile,seq_format)])

        # Make PDB sequence the first one
        idtemp,seqtemp = [(x,y) for x,y in zip(ids, seq) if PDB_id in x][0]
        ids,seq = zip(*[(x,y) for x,y in zip(ids, seq) if PDB_id not in x])
        ids = [idtemp] + list(ids)
        seq = [seqtemp] + list(seq)

        trimmed_nums, trimmed_seq = zip(*[trim_gaps(x,seq[0], number_sequence(x)) for x in seq])
        dqs = []
        for z,i,d in zip(trimmed_seq[1:], trimmed_nums[1:], ids[1:]):
            res_numbers,residues = thread_sequence(z, trimmed_seq[0], pdbdata['DBREF']['seqbegin'], i, span)
            mut_sim_pot = simplex_potential([[residues[x] for x in y] for y in verts]) 
Example #43
def print_abstract(pmid):
    handle = efetch(db='pubmed', id=pmid, retmode='text', rettype='abstract')
    print(handle.read())
Example #44
def get_metadata_from_PMID(pmid, output_errors=False, dump_xml=False):
    """This function will take an input PMID and parse the attributes I am interested in for the cytoscape plugin...."""
    handle = efetch(db='pubmed', id=pmid, retmode='xml')
    xml_data = read(handle)[0]
    verbose_output = False
    author_affiliation_list = []
    cur_paper_author_list = []

    try:
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCompleted'])
    except Exception:
        print("Date Completed not available??", pmid)
        # Fall back to the creation date; the difference between date
        # completed and date created doesn't really matter for my purposes.
        date_completed = format_ddate(xml_data['MedlineCitation']['DateCreated'])

    try:
        otherID = xml_data['MedlineCitation']['OtherID']
    except Exception:
        print("Other ID not available??", pmid)

    try:
        MeshHeadings = xml_data['MedlineCitation']['MeshHeadingList']
    except Exception:
        print("Unable to get MeSH headings for", pmid)
        if output_errors: fp_error.write('MESH NOT AVAILABLE:\n' + str(xml_data) + '\n\n')

    try:
        article = xml_data['MedlineCitation']['Article']
        if verbose_output: print(xml_data)
        for author in article['AuthorList']:
            if 'LastName' in author:
                author_key = author['LastName'] + ',' + author['Initials']
                cur_paper_author_list.append(author_key)
            elif 'CollectiveName' in author:
                print("FOUND A COLLECTION EXAMPLE", author)
            if 'Affiliation' in author:
                author_affil = author['Affiliation']
                author_affiliation_list.append((author, author_affil))
    except NameError as e:
        print(e)
    except IndexError:
        return None
    except Exception:
        print("unable to process article tag", pmid)
        print("Unexpected error parsing author string:", sys.exc_info()[0])
        if output_errors: fp_error.write('Article NOT AVAILABLE\n' + str(xml_data) + '\n\n')
        print(author)

    try:
        abstract = article['Abstract']['AbstractText'][0]
    except Exception:
        print("Unable to get abstract for", pmid)

    if dump_xml:
        print(xml_data)
        return xml_data
    else:
        return {'auth_list': cur_paper_author_list, 'affiliations': author_affiliation_list, 'publication_date': date_completed}
Example #45
def evaluate_article(pmid):
    ##
    ## [IN PROGRESS]
    ##
    ## -> Test if the abstract is cool
    ## -> return true or false
    ##
    ## TODO : write doc
    ##

    ##------------------------##
    ## Parameters for filters ##
    ##------------------------##

    ## initialize parameters
    oldest_year_authorized = "NA"
    case_report_only = False
    case_report_check = False
    authorized_languages = []
    valid_article = False
    check_date = True
    check_language = True
    validation_check = {}
    validation_keywords = {}

    exclusion_check = {}
    exclusion_keywords = {}

    exclusion_keywords_found = False

    ## test if config file exist
    if (os.path.isfile("config.conf")):
        config_data = open("config.conf", "r")
        validation_keywords_cmpt = 0
        exclusion_keywords_cmpt = 0
        for line in config_data:
            line = line.replace("\n", "")
            line_in_array = line.split(";")

            if (line_in_array[0] == "min year"):
                oldest_year_authorized = line_in_array[1]
            elif (line_in_array[0] == "authorized languages"):
                languages_list = line_in_array[1].split(",")
                for elt in languages_list:
                    authorized_languages.append(str(elt))
            elif (line_in_array[0] == "validation keywords"):
                validation_keywords_cmpt += 1
                validation_check["keywords_" +
                                 str(validation_keywords_cmpt)] = False
                validation_keywords["keywords_" +
                                    str(validation_keywords_cmpt)] = []
                keywords_list = line_in_array[1].split(",")
                for elt in keywords_list:
                    if (elt not in validation_keywords[
                            "keywords_" + str(validation_keywords_cmpt)]):
                        validation_keywords[
                            "keywords_" +
                            str(validation_keywords_cmpt)].append(str(elt))

            ## Retrieve Exclusion list
            elif (line_in_array[0] == "exclusion keywords"):
                exclusion_keywords_found = True
                exclusion_keywords_cmpt += 1
                exclusion_check["exclusion_" +
                                str(exclusion_keywords_cmpt)] = False
                exclusion_keywords["exclusion_" +
                                   str(exclusion_keywords_cmpt)] = []
                keywords_list = line_in_array[1].split(",")
                for elt in keywords_list:
                    if (elt not in exclusion_keywords[
                            "exclusion_" + str(exclusion_keywords_cmpt)]):
                        exclusion_keywords[
                            "exclusion_" +
                            str(exclusion_keywords_cmpt)].append(str(elt))

            ## case report only option
            ## if nothing is set, default is False
            elif (line_in_array[0] == "case report only"
                  and str(line_in_array[1]) == "True"):
                case_report_only = True

        config_data.close()

    ## default configuration
    else:
        oldest_year_authorized = 2008
        authorized_languages = [u'eng']
        validation_check["keywords_1"] = False
        validation_check["keywords_2"] = False
        validation_keywords["keywords_1"] = [
            "algorithm", "machine"
            "learning", "neural", "network", "statistic", "deep",
            "classification", "model"
        ]
        validation_keywords["keywords_2"] = [
            "Sjogren", "sjogren", "lupus", "autoimmunity", "rhumatoid",
            "arthrisis", "RA", "SjS", "SLE"
        ]
        exclusion_check["exclusion_1"] = False
        exclusion_keywords["exclusion_1"] = []

    if (not exclusion_keywords_found):
        exclusion_check["exclusion_1"] = False
        exclusion_keywords["exclusion_1"] = []

    ##---------------##
    ## The Easy Part ##
    ##---------------##
    ## get meta data on the articles
    try:
        handle = efetch(
            db='pubmed',
            id=pmid,
            retmode='xml',
        )
        informations = read(handle)
        stuff = informations[u'PubmedArticle'][0]

        ## get date from the history attribute, select
        ## the date of acceptation.
        date = stuff[u'PubmedData']["History"][1]
        month = date[u'Month']
        day = date[u'Day']
        year = date[u'Year']

        ## get the name of the review
        journal_name = informations[u'PubmedArticle'][0][u'MedlineCitation'][
            u'MedlineJournalInfo'][u'MedlineTA']

        ## get the keywords for the articles
        ## the format is a bit strange, may have to be careful
        ## with this data (mix of strings and unicode elements)
        keywords_list = informations[u'PubmedArticle'][0][u'MedlineCitation'][
            u'KeywordList']

        ## Get the author's conflict of interest,
        ## because we can.
        try:
            conflict_of_interest = informations[u'PubmedArticle'][0][
                u'MedlineCitation'][u'CoiStatement']
        except:
            conflict_of_interest = "NA"

        ## Get title of the article
        article_title = informations[u'PubmedArticle'][0][u'MedlineCitation'][
            u'Article'][u'ArticleTitle']

        ## Get language of the article
        article_language = informations[u'PubmedArticle'][0][
            u'MedlineCitation'][u'Article'][u'Language'][0]

        ## Get country of publications
        country = stuff[u'MedlineCitation'][u'MedlineJournalInfo'][u'Country']

    except:
        return (False, False, False)

    ##----------------##
    ## The Smart Part ##
    ##----------------##
    ## run further analysis on the abstract using nltk

    ##
    ## WORKING ON EXCLUSION LIST
    ##

    ## fetch the abstract and convert it to
    ## a nltk text object.
    abstract_file_name = "abstract/" + str(pmid) + "_abstract.txt"
    abstract = fetch_abstract(pmid)
    if (abstract):
        save_abstract(abstract, abstract_file_name)
        abstract_text = load_text(abstract_file_name)

        ## Play with tokenization and chunking
        ## Get all the common nouns in the abstract
        names_found_in_abstract = []
        try:
            tokens = nltk.word_tokenize(str(abstract))
            tagged = nltk.pos_tag(tokens)
            entities = nltk.chunk.ne_chunk(tagged)
        except Exception:
            print("[WARNINGS] => can't perform nlp operation")
            entities = []

        for item in entities:
            try:
                if (item[1] in ["NN", "NNS", "NNP"]):
                    if (item[0] not in names_found_in_abstract):
                        names_found_in_abstract.append(item[0])
            except Exception:
                ## Something went wrong
                choucroute = True

        ## Check validation list
        for item in names_found_in_abstract:
            for key in validation_keywords.keys():
                keywords_validation_list = validation_keywords[key]
                if (item in keywords_validation_list):
                    validation_check[key] = True

        ## Check exclusion list
        for item in names_found_in_abstract:
            for key in exclusion_keywords.keys():
                exclusion_validation_list = exclusion_keywords[key]
                if (item in exclusion_validation_list):
                    exclusion_check[key] = True

        ## Check if is a case report
        if (case_report_only):
            print("[DEBUG] => Case report only")
            if (article_is_a_case_report(abstract_file_name)):
                case_report_check = True

    ##--------------##
    ## PASS OR FAIL ##
    ##--------------##
    ## General check phase
    easy_check_passed = False
    smart_check_passed = True

    ## Basic check on meta data
    ## - check date
    if (int(year) < int(oldest_year_authorized)):
        check_date = False

    ## - check language
    if (article_language not in authorized_languages):
        check_language = False

    ## Easy Filter
    if (check_date and check_language):
        easy_check_passed = True

    ## Complex filter (inclusion)
    if (False in validation_check.values()):
        smart_check_passed = False

    ## Complex filter (exclusion)
    if (True in exclusion_check.values()):
        smart_check_passed = False

    ## Case report filter
    if (case_report_only and case_report_check):
        print("[DEBUG] => EXCLUDED")
        smart_check_passed = False

    ## Global check
    if (easy_check_passed and smart_check_passed):
        valid_article = True

    ##-------------##
    ## SAVING DATA ##
    ##-------------##
    ## Write and delete files
    if (valid_article):

        ## Save meta data in a text file
        ## for further use
        title_line = '>Title;' + str(article_title) + "\n"
        date_line = '>Date;' + str(day) + "/" + str(month) + "/" + str(year) + "\n"
        journal_line = ">Journal;" + str(journal_name) + "\n"
        country_line = ">Country;" + str(country) + "\n"
        conflict_of_interest_line = ">Conflict;" + str(conflict_of_interest) + "\n"
        meta_data = open("meta/" + str(pmid) + ".csv", "w", encoding="utf8")
        meta_data.write(title_line)
        meta_data.write(date_line)
        meta_data.write(journal_line)
        meta_data.write(country_line)
        meta_data.write(conflict_of_interest_line)
        meta_data.close()

    else:
        ## Delete the abstract
        try:
            if (abstract):
                os.remove(abstract_file_name)
        except Exception:
            print("[WARNING] => can't delete " + str(abstract_file_name))

    ##------------------##
    ## RETURN SOMETHING ##
    ##------------------##
    ## return True if the article passes the
    ## evaluation, else False.
    return (valid_article, easy_check_passed, smart_check_passed)
Example #46
    def parse_pubmed_ids(cls, pub_id, retmode='xml'):

        #get xml from pubmed
        handle = efetch(db='pubmed', id=pub_id, retmode=retmode)
        xml_string = handle.read()
        xml = ET.fromstring(xml_string)

        #check for keywords and MeshTerms
        keys = []

        for art in xml:  # iterate directly; getchildren() is deprecated

            #authors
            auth = zip(art.findall('.//Author/ForeName'),
                       art.findall('.//Author/LastName'))
            auth = ';'.join([' '.join([i.text, j.text]) for (i, j) in auth])

            #affiliations
            email_re = re.compile(r'Electronic address.*\.')
            aff = ';'.join(
                np.unique([
                    email_re.sub('', aff.text).strip(string.punctuation + ' ')
                    for aff in art.findall('.//Affiliation')
                ]))

            #publication date
            pub_date = '-'.join([
                x.text for x in [
                    art.find('.//PubMedPubDate[@PubStatus="entrez"]/Year'),
                    art.find('.//PubMedPubDate[@PubStatus="entrez"]/Month'),
                    art.find('.//PubMedPubDate[@PubStatus="entrez"]/Day')
                ]
            ])

            if pub_date == '':
                pub_date = '1900-01-01'

            pubmed_id = ''.join(
                art.xpath(
                    './/PubmedData/ArticleIdList/ArticleId[@IdType="pubmed"]/text()'
                ))
            doi = ''.join(
                art.xpath(
                    './/PubmedData/ArticleIdList/ArticleId[@IdType="doi"]/text()'
                ))
            pmcid = ''.join(
                art.xpath(
                    './/PubmedData/ArticleIdList/ArticleId[@IdType="pmc"]/text()'
                ))
            title = ''.join(art.xpath('.//ArticleTitle/text()')).strip()

            abstract = ''.join(art.xpath('.//AbstractText/text()'))
            journal = ''.join(art.xpath('.//Journal/ISOAbbreviation/text()'))
            pubmed_class = ','.join(art.xpath('.//PublicationType/text()'))
            keys.append((
                pubmed_id, title, auth, aff, pub_date, abstract, doi, pmcid,
                journal, pubmed_class,
                art.xpath('MedlineCitation/KeywordList/*/text()'),
                art.xpath(
                    'MedlineCitation/MeshHeadingList/MeshHeading/*[@MajorTopicYN="Y"]/../DescriptorName/text()'
                ),
                art.xpath(
                    'MedlineCitation/MeshHeadingList/MeshHeading/*[@MajorTopicYN="Y"]/../QualifierName/text()'
                )))

#         df = pd.DataFrame(keys, columns=['pubmed_id', 'pub_date', 'keywords', 'mesh_descriptors', 'mesh_qualifiers'])
#         df.pub_date = pd.to_datetime(df.pub_date)

        gc.collect()

        return keys, xml_string
Example #47
def evaluate_article(pmid):
	##
	## [IN PROGRESS]
	##
	## -> Test if the abstract is cool
	## -> return true or false
	##

	##------------------------##
	## Parameters for filters ##
	##------------------------##
	oldest_year_authorized = 2008
	authorized_languages = [u'eng']

	valid_article = False
	check_date = True
	check_language = True
	validation_check_keywords_1 = False
	validation_check_keywords_2 = False



	##---------------##
	## The Easy Part ##
	##---------------##
	## get meta data on the articles
	try:
		handle = efetch(db='pubmed', id=pmid, retmode='xml', )
		informations = read(handle)
		stuff = informations[u'PubmedArticle'][0] 
		
		## get date from the history attribute, select
		## the date of acceptation.
		date = stuff[u'PubmedData']["History"][1]
		month = date[u'Month']
		day = date[u'Day']
		year = date[u'Year']

		## get the name of the review
		journal_name = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'MedlineJournalInfo'][u'MedlineTA']
		
		## get the keywords for the articles
		## the format is a bit strange, may have to be careful
		## with this data (mix of strings and unicode elements)
		keywords_list = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'KeywordList']

		## Get the author's conflict of interest,
		## because we can.
		try:
			conflict_of_interest = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'CoiStatement']
		except:
			conflict_of_interest = "NA"

		## Get title of the article
		article_title = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'Article'][u'ArticleTitle']

		## Get language of the article
		article_language = informations[u'PubmedArticle'][0][u'MedlineCitation'][u'Article'][u'Language'][0]

	except:
		return (False,False,False)

	##----------------##
	## The Smart Part ## 
	##----------------##
	## run further analysis on the abstract using nltk

	## fetch the abstract and convert it to
	## a nltk text object.
	abstract_file_name = "abstract/"+str(pmid)+"_abstract.txt"
	abstract = fetch_abstract(pmid)
	if(abstract):
		save_abstract(abstract, abstract_file_name)
		abstract_text = load_text(abstract_file_name)
		
		## Play with tokenization and chunking
		## Get all the common nouns in the abstract
		names_found_in_abstract = []
		try:
			tokens = nltk.word_tokenize(str(abstract))
			tagged = nltk.pos_tag(tokens)
			entities = nltk.chunk.ne_chunk(tagged)
		except Exception:
			print("[WARNINGS] => can't perform nlp operation")
			entities = []

		for item in entities:
			try:
				if(item[1] in ["NN", "NNS", "NNP"]):
					if(item[0] not in names_found_in_abstract):
						names_found_in_abstract.append(item[0])
			except Exception:
				## Something went wrong
				choucroute = True
				
		## -> Biology keywords check
		## -> Artificial intelligence keywords check
		IA_keywords = ["algorithm", "machine" "learning", "neural", "network", "statistic", "deep", "classification", "model"]
		Clinical_keywords = ["Sjogren" ,"sjogren", "lupus", "autoimmunity", "rhumatoid", "arthrisis", "RA", "SjS", "SLE"]
		for item in names_found_in_abstract:
			if(item in IA_keywords):
				validation_check_keywords_1 = True
			if(item in Clinical_keywords):
				validation_check_keywords_2 = True
		
	##--------------##
	## PASS OR FAIL ##
	##--------------##
	## General check phase
	easy_check_passed = False
	smart_check_passed = False

	## Basic check on meta data
	## - check date
	if(int(year) < int(oldest_year_authorized)):
		check_date = False

	## - check language
	if(article_language not in authorized_languages):
		check_language = False

	## Easy Filter
	if(check_date and check_language):
		easy_check_passed = True

	## Complex filter
	if(validation_check_keywords_1 and validation_check_keywords_2):
		smart_check_passed = True

	## Global check
	if(easy_check_passed and smart_check_passed):
		valid_article = True

	##-------------##
	## SAVING DATA ##
	##-------------##
	## Write and delete files
	if(valid_article):

		## Save meta data in a text file
		## for further use
		title_line = '>Title;' + str(article_title) + "\n"
		date_line = '>Date;' + str(day) + "/" + str(month) + "/" + str(year) + "\n"
		journal_line = ">Journal;" + str(journal_name) + "\n"
		conflict_of_interest_line = ">Conflict;" + str(conflict_of_interest) + "\n"
		meta_data = open("meta/" + str(pmid) + ".csv", "w", encoding="utf8")
		meta_data.write(title_line)
		meta_data.write(date_line)
		meta_data.write(journal_line)
		meta_data.write(conflict_of_interest_line)
		meta_data.close()

	else:
		## Delete the abstract
		try:
			if(abstract):
				os.remove(abstract_file_name)
		except Exception:
			print("[WARNING] => can't delete " + str(abstract_file_name))

	##------------------##
	## RETURN SOMETHING ##
	##------------------##
	## return True if the article passes the
	## evaluation, else False.
	return (valid_article, easy_check_passed, smart_check_passed)