Example #1
0
def ukpmc(ids=None):
    """
    Load the entities UKPMC annotates on publication abstracts into the DB.

    For every PubMed id the UKPMC abstract page is fetched, the annotated
    entity spans (disease, protein, geneOntology, species, chemical) are
    scraped out of the HTML with a regular expression, and the de-duplicated
    result is stored on the matching publication record.

    ids -- comma-separated string of PubMed ids; when empty or None the
           module-level ``demo_pubmeds`` is used instead.

    Returns None. Pages that do not answer with HTTP 200 are skipped, and a
    publication is only saved when at least one entity was found.
    """
    import re
    import requests
    from django.utils.encoding import smart_str

    ids = ids.split(",") if ids else demo_pubmeds

    url = 'http://ukpmc.ac.uk/abstract/MED/'
    # Group 1 is the entity class, group 2 the text of the annotated link.
    entity_re = re.compile(
        r'<span class="(disease|protein|geneOntology|species|chemical)".*?_blank">(.*?)</a></span>')
    col = mongo.getCollection('publication')
    for pubmed_id in ids:
        print("#### proceesing %s" % pubmed_id)
        # NOTE(review): presumably a pymongo collection, whose find_one gives
        # None when no document matches; how Publication treats None is not
        # visible here -- confirm.
        record = col.find_one({'_id': 'publ%s' % pubmed_id})
        pub = Publication(record)
        response = requests.get("%s%s" % (url, pubmed_id))
        if response.status_code != 200:
            continue

        content = smart_str(response.text)
        # Keyed by lower-cased name so repeated mentions collapse into one
        # entry (the last occurrence wins).
        entities = {}
        for match in entity_re.finditer(content):
            group = match.group(1)
            group = 'go' if group == 'geneOntology' else group.lower()
            name = match.group(2).lower()
            entities[name] = {'name': name, 'group': group}

        pub.entities = list(entities.values())
        if pub.entities:
            pub.save()
            print("Saved %d items" % len(entities))
Example #2
0
def load_pubmeds(ids=None):
    """
    Fetch PubMed records from TogoWS and store them as publications.

    For every id the PubMed XML is downloaded and parsed, and a Publication
    is built from the article (title, abstract, language). Each author that
    has a last name is inserted into -- or, when the insert fails, looked up
    in -- the 'people' collection and linked to the publication. Every
    publication that reached the parsing stage is then inserted into the
    'publication' collection.

    ids -- comma-separated string of PubMed ids; when empty or None the
           module-level ``demo_pubmeds`` is used instead.

    Returns the list of Publication objects that were built, whether or not
    their insert succeeded.

    A failure while loading one id, or while inserting one publication, is
    printed with its traceback and does not stop the run.

    Reference sketch of a publication document left by the original author
    (not every field is set by this function):

        pub={    '_id':'',
                'name':'',
                'refs':{ 'pubmed': '' },
                'abstract':'',
                'local': 0,
                'url':'',
                'published': 1,
                'authors':[]
                }
    """
    ids = ids.split(",") if ids else demo_pubmeds

    url = "http://togows.dbcls.jp/entry/pubmed/$ID?format=xml"

    pc = mongo.getCollection('people')
    try:
        # Presumably this unique index is what makes the insert below fail
        # for an already-known person, triggering the find_one fallback.
        pc.create_index([("last", 1), ("middle", 1), ("first", 1)], unique=True)
    except Exception:
        # Best effort: keep loading without the index, but do not hide why.
        print("WARNING: could not create people index: %s" % traceback.format_exc())

    pubs = []
    # The original iterated an undefined name ('pubmeds'), ignoring `ids`.
    for pid in ids:
        try:
            uri = url.replace('$ID', pid)
            print("Loading %s" % uri)
            doc = XML2Dict().fromurl(uri)
            # The original assigned `article` twice; only this second,
            # attribute-style assignment took effect, so it is the one kept.
            article = doc.PubmedArticleSet.PubmedArticle.MedlineCitation.Article

            pub = Publication()
            pub._id = "publ_pubmed%s" % (pid)
            pub.refs = {'pubmed': pid}
            pub.name = article['ArticleTitle']['value'] if article.ArticleTitle else ''
            pub.abstract = ''
            if article.Abstract and article.Abstract.AbstractText:
                # A lone AbstractText is not wrapped in a list.
                texts = article.Abstract.AbstractText
                if not isinstance(texts, list):
                    texts = [texts]
                pub.abstract = "\n\n".join([text['value'] for text in texts])

            pub.language = article['Language']['value'] if article.Language else ''
            # Registered before the authors are processed, so a failure in the
            # author handling below still leaves the publication queued.
            pubs.append(pub)

            pub.authors = []
            authors = article['AuthorList']['Author']
            # Presumably a lone Author is parsed like a lone AbstractText
            # (not wrapped in a list); iterating it directly would walk its
            # keys instead of the author record.
            if not isinstance(authors, list):
                authors = [authors]
            for author in authors:
                people = {'first': author.ForeName.value if author.ForeName and author.ForeName.value else '',
                          'last': author.LastName.value if author.LastName and author.LastName.value else '',
                          'middle': author.Initials.value if author.Initials and author.Initials.value else ''}
                if not people['last']:
                    continue
                people['namekey'] = "%s.%s.%s" % (people['first'].lower(), people['middle'].lower(), people['last'].lower())

                people['_id'] = idtool.generate('peop')
                try:
                    pc.insert(people, safe=True)
                    print("Inserted %s" % people)
                except Exception:
                    # Insert refused: drop the freshly generated _id and look
                    # the person up by the remaining name fields instead.
                    del people['_id']
                    people = pc.find_one(people)
                if people:
                    pc.update({'_id': people['_id']}, {'$addToSet': {'publications': pub._id}}, safe=True)
                    pub.authors.append(people)
        except Exception:
            print("ERROR: %s" % traceback.format_exc())

    pubc = mongo.getCollection('publication')
    for pub in pubs:
        try:
            pubc.insert(pub)
            print("Inserted pub: %s" % pub)
        except Exception:
            print("ERROR %s" % traceback.format_exc())

    log("Done")

    return pubs