def handler(req):
    """Autocomplete handler: prefix-search the 'indices' collection for *term*.

    Query params:
        term   -- required search prefix (case-insensitive).
        filter -- optional model key (entity/network/people etc.) to
                  restrict results to one model type.

    Returns a SmartResponse wrapping a list of
    {'label': ..., 'value': ..., '_id': ...} dicts, capped at ~50 entries,
    or a SmartResponse-wrapped Exception when 'term' is missing.
    """
    import re
    term = req.REQUEST.get("term")
    if not term:
        return SmartResponse(Exception("Missing parameter: term"), req)
    # Escape the user-supplied prefix so regex metacharacters are matched
    # literally (prevents regex injection / pathological patterns).
    query = {'_id': {'$regex': '^%s' % re.escape(term.lower())}}
    # filter by entity/network/people etc
    model_filter = req.REQUEST.get("filter") or None
    ret = []
    seen = set()  # ids already emitted; set gives O(1) membership checks
    for r in mongo.getCollection('indices').find(query):
        if 'names' not in r:
            continue
        for id, name in r['names'].items():
            if id in seen:
                continue
            seen.add(id)
            if not name:
                continue
            name = ("%s" % name)[0:80]  # truncate long names for display
            md = ''
            if id[0:4] in models.prefix_mapping:
                md = models.prefix_mapping[id[0:4]]
            # NOTE(review): if the 4-char prefix is unmapped, md == '' and
            # this lookup assumes '' is a valid key -- confirm.
            pre = models.model_label_mapping[md]
            label = "%s: %s" % (pre, name)
            if model_filter and model_filter != md:
                continue
            ret.append({'label': label, 'value': name, '_id': id})
            if len(ret) > 50:
                # Cap response size. The original 'break' only exited the
                # inner loop, so the outer cursor kept appending results.
                return SmartResponse(ret, req)
    return SmartResponse(ret, req)
def ukpmc(ids=None): """ Given a pubmed id, Load entities into DB from UKPMC """ ids = ids.split(",") if ids else demo_pubmeds url = 'http://ukpmc.ac.uk/abstract/MED/' import requests,re col = mongo.getCollection('publication') for id in ids: print "#### proceesing %s" %id p = col.find_one({'_id':'publ%s'%id}) pub = Publication( p ) u = "%s%s" %(url, id) r = requests.get(u) if r.status_code == 200: from django.utils.encoding import smart_str, smart_unicode content = smart_str(r.text) entities = {} for m in re.finditer(r'<span class="(disease|protein|geneOntology|species|chemical)".*?_blank">(.*?)</a></span>', content): group = m.group(1) group = 'go' if group == 'geneOntology' else group.lower() e = {'name': m.group(2).lower(), 'group': group} entities[e['name']] = e pub.entities = [] for en, item in entities.items(): pub.entities.append(item) if(pub.entities): pub.save() print("Saved %d items" %(len(entities)))
def handler(req):
    """Autocomplete handler: prefix-search the 'indices' collection for *term*.

    Query params:
        term   -- required search prefix (case-insensitive).
        filter -- optional model key (entity/network/people etc.) to
                  restrict results to one model type.

    Returns a SmartResponse wrapping a list of
    {'label': ..., 'value': ..., '_id': ...} dicts, capped at ~50 entries,
    or a SmartResponse-wrapped Exception when 'term' is missing.
    """
    import re
    term = req.REQUEST.get("term")
    if not term:
        return SmartResponse(Exception("Missing parameter: term"), req)
    # Escape the user-supplied prefix so regex metacharacters are matched
    # literally (prevents regex injection / pathological patterns).
    query = {'_id': {'$regex': '^%s' % re.escape(term.lower())}}
    # filter by entity/network/people etc
    model_filter = req.REQUEST.get("filter") or None
    ret = []
    seen = set()  # ids already emitted; set gives O(1) membership checks
    for r in mongo.getCollection('indices').find(query):
        if 'names' not in r:
            continue
        for id, name in r['names'].items():
            if id in seen:
                continue
            seen.add(id)
            if not name:
                continue
            name = ("%s" % name)[0:80]  # truncate long names for display
            md = ''
            if id[0:4] in models.prefix_mapping:
                md = models.prefix_mapping[id[0:4]]
            # NOTE(review): if the 4-char prefix is unmapped, md == '' and
            # this lookup assumes '' is a valid key -- confirm.
            pre = models.model_label_mapping[md]
            label = "%s: %s" % (pre, name)
            if model_filter and model_filter != md:
                continue
            ret.append({'label': label, 'value': name, '_id': id})
            if len(ret) > 50:
                # Cap response size. The original 'break' only exited the
                # inner loop, so the outer cursor kept appending results.
                return SmartResponse(ret, req)
    return SmartResponse(ret, req)
def ukpmc(ids=None): """ Given a pubmed id, Load entities into DB from UKPMC """ ids = ids.split(",") if ids else demo_pubmeds url = 'http://ukpmc.ac.uk/abstract/MED/' import requests, re col = mongo.getCollection('publication') for id in ids: print "#### proceesing %s" % id p = col.find_one({'_id': 'publ%s' % id}) pub = Publication(p) u = "%s%s" % (url, id) r = requests.get(u) if r.status_code == 200: from django.utils.encoding import smart_str, smart_unicode content = smart_str(r.text) entities = {} for m in re.finditer( r'<span class="(disease|protein|geneOntology|species|chemical)".*?_blank">(.*?)</a></span>', content): group = m.group(1) group = 'go' if group == 'geneOntology' else group.lower() e = {'name': m.group(2).lower(), 'group': group} entities[e['name']] = e pub.entities = [] for en, item in entities.items(): pub.entities.append(item) if (pub.entities): pub.save() print("Saved %d items" % (len(entities)))
def index(keywords, obj):
    """Index *obj* under each whitespace-separated keyword.

    keywords -- a string (split on whitespace) or an iterable of keywords.
    obj      -- a model object exposing _id and name.

    For every lowercased keyword, upserts a document in 'indices' keyed by
    the keyword, recording obj.name under names.<obj._id> and adding
    obj._id to the 'ids' set.
    """
    if isinstance(keywords, basestring):
        keywords = keywords.split()
    # hoisted out of the loop: one collection lookup instead of one per keyword
    col = mongo.getCollection('indices')
    for kwd in keywords:
        if not kwd:
            continue
        kwd = kwd.lower()
        updates = {'names.' + obj._id: obj.name}
        col.update({'_id': kwd},
                   {'$set': updates, '$addToSet': {'ids': obj._id}},
                   upsert=True)
def index(keywords, obj):
    """Index *obj* under each whitespace-separated keyword.

    keywords -- a string (split on whitespace) or an iterable of keywords.
    obj      -- a model object exposing _id and name.

    For every lowercased keyword, upserts a document in 'indices' keyed by
    the keyword, recording obj.name under names.<obj._id> and adding
    obj._id to the 'ids' set.
    """
    if isinstance(keywords, basestring):
        keywords = keywords.split()
    # hoisted out of the loop: one collection lookup instead of one per keyword
    col = mongo.getCollection('indices')
    for kwd in keywords:
        if not kwd:
            continue
        kwd = kwd.lower()
        updates = {'names.' + obj._id: obj.name}
        col.update({'_id': kwd},
                   {'$set': updates, '$addToSet': {'ids': obj._id}},
                   upsert=True)
def save(self):
    """Persist this document to its mongo collection.

    Runs validate()/beforeSave() hooks when present, normalizes _id,
    stamps creation/update times, saves with write acknowledgement, then
    runs the afterSave() hook when present.
    """
    # NOTE(review): bound methods are always truthy, so these guards only
    # matter if hooks can be None/absent on some model classes -- confirm
    # against the base model's attribute-access semantics.
    if self.validate: self.validate()
    if self.beforeSave: self.beforeSave()
    col = mongo.getCollection(self._col)
    self._id = self.cleanup_id(self._id)
    logger.debug("Persisting %s: %s" %(self._col, self._id))
    self.create_tm = self.create_tm or getTime()  # set once on first save
    self.update_tm = getTime()                    # refreshed on every save
    self.tm = self.tm or time.time()              # raw epoch, kept from first save
    col.save(self, safe=True)  # safe=True: wait for the write to be acknowledged
    #logger.debug("Done")
    if self.afterSave: self.afterSave()
def save(self):
    """Persist this document to its mongo collection.

    Runs validate()/beforeSave() hooks when present, normalizes _id,
    stamps creation/update times, saves with write acknowledgement, then
    runs the afterSave() hook when present.
    """
    # NOTE(review): bound methods are always truthy, so these guards only
    # matter if hooks can be None/absent on some model classes -- confirm
    # against the base model's attribute-access semantics.
    if self.validate: self.validate()
    if self.beforeSave: self.beforeSave()
    col = mongo.getCollection(self._col)
    self._id = self.cleanup_id(self._id)
    logger.debug("Persisting %s: %s" % (self._col, self._id))
    self.create_tm = self.create_tm or getTime()  # set once on first save
    self.update_tm = getTime()                    # refreshed on every save
    self.tm = self.tm or time.time()              # raw epoch, kept from first save
    col.save(self, safe=True)  # safe=True: wait for the write to be acknowledged
    #logger.debug("Done")
    if self.afterSave: self.afterSave()
def importEdges(filename=None): """ import edges, one pair per line Edge property: - no direction, -> left to right, <- right to left """ filename = filename or r'C:\work\caida\Dropbox\precon\engineering\Contents\GBM_BN-Massaged.csv' # add a network n = Network.findOne({'name': "GBM Predicted Tumor Network"}) or Network() n.name = "GBM Predicted Tumor Network" n.group = "predicted" n.owner = 'precon' n.save() col = mongo.db()['entity'] count = 0 now = time.time() ec = mongo.getCollection('edge') with open(filename) as f: content = f.read() for line in content.split('\n'): doc = {} line = line.strip() pos = line.find("///") if pos > 0: line = line[0:pos].strip() doc['comment'] = line[pos:] items = line.split("->") if (len(items) == 1): items = line.split("<-") if (len(items) == 1): items = line.split(" - ") if (len(items) == 1): error("Ignore invalid line: [%s]" % line) continue count += 1 if (count < 8378): continue tmp = [] tmp.append(items[0].lower().strip()) tmp.append(items[1].lower().strip()) entities = ['', ''] print "!!! %d " % (col.find({'symbol': {'$in': tmp}}).count()) for r in col.find({'symbol': {'$in': tmp}}): if (r['symbol'] == tmp[0]): entities[0] = Entity(r) if (r['symbol'] == tmp[1]): entities[1] = Entity(r) if (len(entities) != 2): raise "Invalid nodes %s, continue" % entities node1 = Node.findOne({'network': n._id, 'entity': entities[0]._id}) if not node1: node1 = Node({'network': n._id}, entities[0]) node1.save() node2 = Node.findOne({'network': n._id, 'entity': entities[1]._id}) if not node2: node2 = Node({'network': n._id}, entities[1]) node2.save() con = Connection() con._id = "conn_%s_%s" % (tmp[0], tmp[1]) con.nodes = [node1._id, node2._id] con.entities = [entities[0]._id, entities[1]._id] con.type = 'predicted' con.network = n._id con.label = "" con.save() print "Saving connection %d %s" % (count, con._id) finish = time.time() print "Imported %d edges, time elpased %d seconds" % (count, finish - now)
def findOne(cls, query):
    """Fetch a single document matching *query* from cls._col.

    Returns the document wrapped in *cls*, or None when nothing matches.
    """
    doc = mongo.getCollection(cls._col).find_one(query)
    return cls(doc) if doc else None
def importmif(): dups = {} basedir = "data/IntAct/psi25/datasets" cats = os.listdir(basedir) networks = [] entities = [] connections = [] nodes = [] for c in cats: if c != 'Parkinsons': continue print "Processing category %s" %c files = os.listdir("%s\\%s" %(basedir,c) ) for filename in files: file = "%s\\%s\\%s" %(basedir, c, filename) if os.path.isdir(file): continue log( "Processing %s" %file) res = Network() res.group = c res.refs = {} res.connections = [] res.entities = [] res.refs['intact'] = filename.replace(".xml", "") parseFile(file, res) if res._id in dups: error("Duplicated id: %s/%s"%(c, file)) continue networks.append(res) dups[res._id ] = 1 if res.entities: entities.extend(res.entities) connections.extend(res.connections) tmp_nodes = [] for con in res.connections: if con.nodes: tmp_nodes.extend(con.nodes) nodes.extend(tmp_nodes) log("Connections: %d Participants %d Interactors: %d" %(len(res.connections), len(tmp_nodes), len(res.entities) )) #interactors.extend(a) #interactions.extend(b) #log("interactors : %d" % len(res.entities)) #log("interactions: %d" % len(res.entities)) #break #log( "Total interactions: %d" % len(interactions)) nc = mongo.getCollection('network') ec = mongo.getCollection('entity') cc = mongo.getCollection('connection') nodec=mongo.getCollection('node') for con in connections: node_ids = [] con.entities = [] for node in con.nodes: ent_id = '' if node.refs and node.refs['entity']: # node.entity is IntAct internal ID intact_id = node.refs['entity'] for item in entities: if item.refs and item.refs['intact'] == intact_id: ent_id = item._id break if not ent_id: error("Unresolved interactorRef for %s" %node) else: node.entity = ent_id node_ids.append(node._id) con.entities.append(ent_id) con.nodes = node_ids for con in connections: cc.insert(con, safe=True) log("Saved connection %s" %con._id) for network in networks: del network['entities'] del network['connections'] nc.insert(network, safe=True) log("Saved network %s" %network._id) 
for node in nodes: if not node.entity: continue nodec.insert(node, safe=True) log("Saved node %s" %node._id) dups = [] for entity in entities: if entity._id in dups: continue ec.insert(entity, safe=True) dups.append(entity._id) log("Saved entity %s" %entity._id) log( "###########################") log( "Total networks: %d" % len(networks)) log( "Total interactors: %d" % len(entities)) log( "Total nodes: %d" %(len(nodes))) log("Done") return networks
def load_pubmeds(ids=None): ids = ids.split(",") if ids else demo_pubmeds url = "http://togows.dbcls.jp/entry/pubmed/$ID?format=xml" """ pub={ '_id':'', 'name':'', 'refs':{ 'pubmed': '' }, 'abstract':'', 'local': 0, 'url':'', 'published': 1, 'authors':[] } """ pc = mongo.getCollection('people') try: pc.create_index([("last", 1), ("middle", 1), ("first", 1)], unique=True) except: pass pubs = [] peoples = [] for pid in pubmeds: try: uri = url.replace('$ID', pid) print "Loading %s" % uri doc = XML2Dict().fromurl(uri) #print doc article = doc['PubmedArticleSet']['PubmedArticle'][ 'MedlineCitation']['Article'] article = doc.PubmedArticleSet.PubmedArticle.MedlineCitation.Article pub = Publication() pub._id = "publ_pubmed%s" % (pid) pub.refs = {'pubmed': pid} pub.name = article['ArticleTitle'][ 'value'] if article.ArticleTitle else '' pub.abstract = '' if article.Abstract and article.Abstract.AbstractText: texts = [ article.Abstract.AbstractText ] if not isinstance(article.Abstract.AbstractText, list) else article.Abstract.AbstractText pub.abstract = "\n\n".join([text['value'] for text in texts]) pub.language = article['Language'][ 'value'] if article.Language else '' pubs.append(pub) pub.authors = [] authors = article['AuthorList']['Author'] for author in authors: people = { 'first': author.ForeName.value if author.ForeName and author.ForeName.value else '', 'last': author.LastName.value if author.LastName and author.LastName.value else '', 'middle': author.Initials.value if author.Initials and author.Initials.value else '' } if not people['last']: continue people['namekey'] = "%s.%s.%s" % (people['first'].lower(), people['middle'].lower(), people['last'].lower()) people['_id'] = idtool.generate('peop') try: pc.insert(people, safe=True) print "Inserted %s" % people except: del people['_id'] people = pc.find_one(people) if people: pc.update({'_id': people['_id']}, {'$addToSet': { 'publications': pub._id }}, safe=True) pub.authors.append(people) #print authors except: 
print "ERROR: %s" % traceback.format_exc() pubc = mongo.getCollection('publication') for pub in pubs: try: pubc.insert(pub) print "Inserted pub: %s" % pub except: print "ERROR %s" % traceback.format_exc() log("Done") return pubs
def importEdges(filename=None): """ import edges, one pair per line Edge property: - no direction, -> left to right, <- right to left """ filename = filename or r'C:\work\caida\Dropbox\precon\engineering\Contents\GBM_BN-Massaged.csv' # add a network n = Network.findOne({'name': "GBM Predicted Tumor Network"}) or Network() n.name = "GBM Predicted Tumor Network" n.group = "predicted" n.owner = 'precon' n.save() col = mongo.db()['entity'] count = 0 now = time.time() ec = mongo.getCollection('edge') with open(filename) as f: content = f.read() for line in content.split('\n'): doc = {} line = line.strip() pos = line.find("///") if pos>0: line = line[0:pos].strip() doc['comment'] = line[pos:] items = line.split("->") if (len(items) == 1): items = line.split("<-") if (len(items) == 1): items = line.split(" - ") if (len(items) == 1): error("Ignore invalid line: [%s]" %line) continue count+=1 if(count<8378): continue tmp =[] tmp.append( items[0].lower().strip()) tmp.append( items[1].lower().strip()) entities = ['',''] print "!!! %d " %( col.find({'symbol': {'$in': tmp } }).count() ) for r in col.find({'symbol': {'$in': tmp } }): if(r['symbol'] == tmp[0]): entities[0] = Entity(r) if(r['symbol'] == tmp[1]): entities[1] = Entity(r) if(len(entities)!=2 ): raise "Invalid nodes %s, continue" % entities node1 = Node.findOne({'network': n._id, 'entity': entities[0]._id}) if not node1: node1 = Node({'network':n._id}, entities[0]) node1.save() node2 = Node.findOne({'network': n._id, 'entity': entities[1]._id}) if not node2: node2 = Node({'network':n._id}, entities[1]) node2.save() con = Connection() con._id = "conn_%s_%s" %(tmp[0], tmp[1]) con.nodes = [node1._id, node2._id] con.entities = [ entities[0]._id, entities[1]._id ] con.type = 'predicted' con.network = n._id con.label = "" con.save() print "Saving connection %d %s" % (count, con._id) finish = time.time() print "Imported %d edges, time elpased %d seconds" %(count, finish - now)
def importmif(): dups = {} basedir = "data/IntAct/psi25/datasets" cats = os.listdir(basedir) networks = [] entities = [] connections = [] nodes = [] for c in cats: if c != 'Parkinsons': continue print "Processing category %s" % c files = os.listdir("%s\\%s" % (basedir, c)) for filename in files: file = "%s\\%s\\%s" % (basedir, c, filename) if os.path.isdir(file): continue log("Processing %s" % file) res = Network() res.group = c res.refs = {} res.connections = [] res.entities = [] res.refs['intact'] = filename.replace(".xml", "") parseFile(file, res) if res._id in dups: error("Duplicated id: %s/%s" % (c, file)) continue networks.append(res) dups[res._id] = 1 if res.entities: entities.extend(res.entities) connections.extend(res.connections) tmp_nodes = [] for con in res.connections: if con.nodes: tmp_nodes.extend(con.nodes) nodes.extend(tmp_nodes) log("Connections: %d Participants %d Interactors: %d" % (len(res.connections), len(tmp_nodes), len(res.entities))) #interactors.extend(a) #interactions.extend(b) #log("interactors : %d" % len(res.entities)) #log("interactions: %d" % len(res.entities)) #break #log( "Total interactions: %d" % len(interactions)) nc = mongo.getCollection('network') ec = mongo.getCollection('entity') cc = mongo.getCollection('connection') nodec = mongo.getCollection('node') for con in connections: node_ids = [] con.entities = [] for node in con.nodes: ent_id = '' if node.refs and node.refs['entity']: # node.entity is IntAct internal ID intact_id = node.refs['entity'] for item in entities: if item.refs and item.refs['intact'] == intact_id: ent_id = item._id break if not ent_id: error("Unresolved interactorRef for %s" % node) else: node.entity = ent_id node_ids.append(node._id) con.entities.append(ent_id) con.nodes = node_ids for con in connections: cc.insert(con, safe=True) log("Saved connection %s" % con._id) for network in networks: del network['entities'] del network['connections'] nc.insert(network, safe=True) log("Saved network %s" % 
network._id) for node in nodes: if not node.entity: continue nodec.insert(node, safe=True) log("Saved node %s" % node._id) dups = [] for entity in entities: if entity._id in dups: continue ec.insert(entity, safe=True) dups.append(entity._id) log("Saved entity %s" % entity._id) log("###########################") log("Total networks: %d" % len(networks)) log("Total interactors: %d" % len(entities)) log("Total nodes: %d" % (len(nodes))) log("Done") return networks
def load_pubmeds(ids=None): ids = ids.split(",") if ids else demo_pubmeds url = "http://togows.dbcls.jp/entry/pubmed/$ID?format=xml" """ pub={ '_id':'', 'name':'', 'refs':{ 'pubmed': '' }, 'abstract':'', 'local': 0, 'url':'', 'published': 1, 'authors':[] } """ pc = mongo.getCollection('people') try: pc.create_index([("last", 1), ("middle",1), ("first",1)], unique=True) except: pass pubs = [] peoples = [] for pid in ids: try: uri = url.replace('$ID', pid) print "Loading %s" %uri doc = XML2Dict().fromurl(uri) #print doc article = doc['PubmedArticleSet']['PubmedArticle']['MedlineCitation']['Article'] article = doc.PubmedArticleSet.PubmedArticle.MedlineCitation.Article pub = Publication() pub._id = "publ_pubmed%s" % (pid) pub.refs= {'pubmed': pid} pub.name= article['ArticleTitle']['value'] if article.ArticleTitle else '' pub.abstract = '' if article.Abstract and article.Abstract.AbstractText: texts = [ article.Abstract.AbstractText ] if not isinstance(article.Abstract.AbstractText, list) else article.Abstract.AbstractText pub.abstract= "\n\n".join([ text['value'] for text in texts ]) pub.language=article['Language']['value'] if article.Language else '' pubs.append(pub) pub.authors=[] authors = article['AuthorList']['Author'] for author in authors: people = {'first': author.ForeName.value if author.ForeName and author.ForeName.value else '', 'last': author.LastName.value if author.LastName and author.LastName.value else '', 'middle': author.Initials.value if author.Initials and author.Initials.value else '' } if not people['last']: continue people['namekey'] = "%s.%s.%s" %(people['first'].lower(), people['middle'].lower(), people['last'].lower()) people['_id'] = idtool.generate('peop') try: pc.insert(people, safe=True) print "Inserted %s" %people except: del people['_id'] people = pc.find_one(people) if people: pc.update({'_id':people['_id']}, {'$addToSet': {'publications':pub._id}}, safe=True) pub.authors.append(people) #print authors except: print "ERROR: %s" 
%traceback.format_exc() pubc = mongo.getCollection('publication') for pub in pubs: try: pubc.insert(pub) print "Inserted pub: %s" %pub except: print "ERROR %s" %traceback.format_exc() log("Done") return pubs
def findOne(cls, query):
    """Return one document matching *query*, wrapped in *cls*, else None."""
    collection = mongo.getCollection(cls._col)
    record = collection.find_one(query)
    if not record:
        return None
    return cls(record)