def testIterTree(self):
    id_order_list = []
    f_list = ['expected', 'input']
    for v in ['1.0', '1.2']:
        for f in f_list:
            inp = pathmap.nexson_obj('merge/merge-{f}.v{v}.json'.format(v=v, f=f))
            id_order = []
            for t_tuple in iter_trees(inp):
                ti = t_tuple[1]
                id_order.append(ti)
            id_order_list.append(id_order)
    for i in range(1, 4):
        self.assertEqual(id_order_list[0], id_order_list[i])
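For orientation, a minimal sketch of the 3-tuple that `iter_trees` yields (the test above only uses the second element, the tree id). Here `nexson` is a placeholder for any study blob loaded as in the snippets on this page, and `iter_trees` is assumed to be imported the same way the surrounding snippets import it:

# minimal sketch, assuming `nexson` is an already-loaded NexSON study dict
for t_tuple in iter_trees(nexson):
    trees_group_id, tree_id, tree = t_tuple  # each item is a 3-tuple
    print tree_id, tree.get('^ot:tag')       # the third element is the tree dict itself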
phy = Phylesystem()
study_dict = defaultdict(int)
tree_dict = defaultdict(int)
out = codecs.getwriter("utf-8")(sys.stdout)
for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    t = nexml.get("^ot:tag")
    if t:
        # print study_id, t
        if isinstance(t, list):
            for tag in t:
                study_dict[tag] += 1
        else:
            study_dict[t] += 1
    for trees_group_id, tree_id, tree in iter_trees(n):
        t = tree.get("^ot:tag")
        if t:
            # print study_id, tree_id, t
            if isinstance(t, list):
                for tag in t:
                    tree_dict[tag] += 1
            else:
                tree_dict[t] += 1
print "\nStudy tag counts:"
for k, v in study_dict.items():
    print k, "\t", v
print "\nTree tag counts:"
for k, v in tree_dict.items():
    print k, "\t", v
path_list.append('uploads')
path_list.append(record['doc'])
size = os.path.getsize(os.path.join(*path_list))
File = db(db.supporting_files.id == id).select()[0]
db.supporting_files[id] = dict(file_size=size)
db.supporting_files[id] = dict(study_id=response.study_id)
# Put annotation-message information to a top-level property, so that
# it can be added to the message collection in the main (nexml-level)
# 'supporting-files-metadata' annotationEvent. This matches our current
# policy for annotations; see
# https://github.com/OpenTreeOfLife/phylesystem-api/wiki/Annotations-in-NexSON#33-storage-and-placement-of-message-objects
tree_ids = [tree_id for (tree_group_id, tree_id, imported_tree) in iter_trees(nex)]
quoted_tree_ids = ["'{s}'".format(s=tree_id) for tree_id in tree_ids]
r['annotationFileInfo'] = {
    u'@filename': read_filename,
    u'@size': size,
    u'sourceForTree': [{'$': tree_id} for tree_id in tree_ids],
    u'@type': read_inp_format,
    u'@url': URL(f='download', args=[File['doc']]),
    u'description': {
        u'$': len(quoted_tree_ids) and "Source data for tree(s) {s}".format(
            s=', '.join(quoted_tree_ids)) or "No trees found in this file."
    }
}
else:
    v_dict = defaultdict(int)

def process_val(v, id_str):
    if v is not None:
        if report_ids:
            v_dict.setdefault(v, []).append(id_str)
        elif summarize_as_set:
            v_dict[v] += 1
        else:
            out.write(u'{i}: {v}\n'.format(i=study_id, v=v))

for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    if check_trees:
        for trees_group_id, tree_id, tree in iter_trees(n):
            id_str = 'study: {s} tree: {t}'.format(s=study_id, t=tree_id)
            process_val(tree.get(study_prop), id_str)
    else:
        process_val(nexml.get(study_prop), study_id)
if report_ids:
    as_list = [(len(v), k, v) for k, v in v_dict.items()]
    as_list.sort(reverse=True)
    for n, k, v in as_list:
        out.write(u'{k}\tseen {n:d} times\t{v}\n'.format(k=k, n=n, v='\t'.join(v)))
elif summarize_as_set:
    as_list = [(v, k) for k, v in v_dict.items()]
    as_list.sort(reverse=True)
    for v, k in as_list:
        out.write(u'"{k}" (seen {v:d} times)\n'.format(k=k, v=v))
bundle_properties = json.load(
    codecs.open(RETURN_ATT_FILEPATH, 'rU', encoding='utf-8'))
try:
    dd = bundle_properties.get('dataDeposit')
    if dd:
        n = nex.get('nex:nexml') or nex['nex']
        add_resource_meta(n, "ot:dataDeposit", dd, NEXSON_VERSION)
except:
    pass
r.update(bundle_properties)
r['numberOfTrees'] = num_trees
r['nexml2json'] = NEXSON_VERSION
read_inp_format = bundle_properties.get('inputFormat', '')
read_filename = bundle_properties.get('filename', '')
for tree_tup in iter_trees(nex):
    tree_group_id, tree_id, imported_tree = tree_tup
    # create (or replace) the file information for this imported tree
    imported_tree[u'^ot:messages'] = {
        u'message': [{
            u'@id': "message{u}".format(u=unique_id),
            u'@code': u'SUPPORTING_FILE_INFO',
            u'@humanMessageType': u'NONE',
            u'@severity': u'INFO',
            u'@wasGeneratedBy': u'opentree.2nexml',
            # TODO: Do we need to add this agent to the main study?
            u'data': {
                u'@movedToPermanentArchive': False,
                u'files': {
                    u'file': [{
                        u'@filename':
path_list = []
path_list.append(request.folder)
path_list.append('uploads')
path_list.append(record['doc'])
size = os.path.getsize(os.path.join(*path_list))
File = db(db.supporting_files.id == id).select()[0]
db.supporting_files[id] = dict(file_size=size)
db.supporting_files[id] = dict(study_id=response.study_id)
# Put annotation-message information to a top-level property, so that
# it can be added to the message collection in the main (nexml-level)
# 'supporting-files-metadata' annotationEvent. This matches our current
# policy for annotations; see
# https://github.com/OpenTreeOfLife/phylesystem-api/wiki/Annotations-in-NexSON#33-storage-and-placement-of-message-objects
tree_ids = [tree_id for (tree_group_id, tree_id, imported_tree) in iter_trees(nex)]
quoted_tree_ids = ["'{s}'".format(s=tree_id) for tree_id in tree_ids]
r['annotationFileInfo'] = {
    u'@filename': read_filename,
    u'@size': size,
    u'sourceForTree': [{'$': tree_id} for tree_id in tree_ids],
    u'@type': read_inp_format,
    u'@url': URL(f='download', args=[File['doc']]),
    u'description': {
        u'$': len(quoted_tree_ids) and "Source data for tree(s) {s}".format(
            s=', '.join(quoted_tree_ids)) or "No trees found in this file."
    }
}
return r

assert (False)

# provide support for CrossRef.org URLs via HTTPS
def add_study(study_id):
    _LOG.debug('adding study {s}'.format(s=study_id))
    # get latest version of nexson
    # location of repo (test vs dev) dependent on peyotl config
    phy = create_phylesystem_obj()
    try:
        studyobj = phy.get_study(study_id)['data']
    except:
        _LOG.debug('did not find study {s} in phylesystem'.format(s=study_id))
        raise HTTPNotFound("Study {s} not found in phylesystem".format(s=study_id))
    nexml = get_nexml_el(studyobj)
    proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
    if proposedTrees is None:
        proposedTrees = []
    # create a new Study object
    new_study = Study(id=study_id)
    DBSession.add(new_study)
    # update with treebase id, if exists
    datadeposit = nexml.get('^ot:dataDeposit')
    if datadeposit:
        url = datadeposit['@href']
        if url:
            pattern = re.compile(u'.+TB2:(.+)$')
            matchobj = re.match(pattern, url)
            if matchobj:
                tb_id = matchobj.group(1)
                new_study.treebase_id = tb_id
    # get curator(s), noting that ot:curators might be a
    # string or a list
    c = nexml.get('^ot:curatorName')
    # create list of curator objects
    curator_list = []
    if isinstance(c, basestring):
        curator_list.append(c)
    else:
        curator_list = c
    for curator in curator_list:
        test_c = DBSession.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            _LOG.debug("curator {c} already exists".format(c=curator))
            # DBSession.add(curator)
            new_study.curators.append(test_c)
        else:
            _LOG.debug("curator {c} does not yet exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))
    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get('^ot:ottId')
        if ottID is not None:
            mapped_otus[oid] = ottID
    # iterate over trees and insert tree data
    ntrees = 0
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        _LOG.debug(' tree : {t}'.format(t=tree_id))
        ntrees += 1
        proposedForSynth = False
        if tree_id in proposedTrees:
            proposedForSynth = True
        treejson = json.dumps(tree)
        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=proposedForSynth,
            data=treejson
        )
        # get otus
        ottIDs = set()  # ott ids for this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            # no @otu property on internal nodes
            if oid is not None:
                ntips += 1
                # ottID = mapped_otus[oid]
                if oid in mapped_otus:
                    ottID = mapped_otus[oid]
                    # _LOG.debug(' mapped ottID: {m}'.format(m=ottID))
                    # check that this exists in the taxonomy
                    # (it might not, if the ID has been deprecated)
                    taxon = DBSession.query(Taxonomy).filter(Taxonomy.id == ottID).first()
                    if taxon:
                        lineage = get_lineage(ottID)
                        _LOG.debug(' lineage of {m} = {l}'.format(m=ottID, l=lineage))
                        for t in lineage:
                            ottIDs.add(t)
        new_tree.ntips = ntips
        for t in ottIDs:
            taxon = DBSession.query(Taxonomy).filter(Taxonomy.id == t).first()
            # _LOG.debug(' adding {t},{n} to tree {tid}'.format(
            #     t=t,
            #     n=taxon.name,
            #     tid=tree_id)
            # )
            new_tree.otus.append(taxon)
        # add the tree
        DBSession.add(new_tree)
    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml['treesById']
    studyjson = json.dumps(nexml)
    new_study.data = studyjson
    new_study.ntrees = ntrees
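The `^ot:dataDeposit` handling above extracts a TreeBASE study id from the deposit URL with the regex `.+TB2:(.+)$`. A small illustration of that match; the URL below is hypothetical and used only to show the captured group:

import re

# hypothetical dataDeposit @href, for illustration only
url = 'http://purl.org/phylo/treebase/phylows/study/TB2:S12345'
pattern = re.compile(u'.+TB2:(.+)$')
matchobj = re.match(pattern, url)
if matchobj:
    print matchobj.group(1)  # -> 'S12345'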
def create_phylesystem_obj():
    # create connection to local phylesystem
    phylesystem_api_wrapper = PhylesystemAPI(get_from='local')
    phylesystem = phylesystem_api_wrapper.phylesystem_obj
    return phylesystem


if __name__ == "__main__":
    counter = 0
    limit = None
    tree_key_set = set()
    study_key_set = set()
    phy = create_phylesystem_obj()
    for study_id, studyobj in phy.iter_study_objs():
        for k in studyobj['nexml'].keys():
            study_key_set.add(k)
        for trees_group_id, tree_id, tree in iter_trees(studyobj):
            for k in tree.keys():
                tree_key_set.add(k)
        counter += 1
        if counter % 100 == 0:
            print("Read {n} studies".format(n=counter))
        if limit and counter > limit:
            break
    print("found {n} study properties".format(n=len(study_key_set)))
    for k in study_key_set:
        print(k)
    print("found {n} tree properties".format(n=len(tree_key_set)))
    for k in tree_key_set:
        print(k)
def addStudy(session, study_id):
    # get latest version of nexson
    print "adding study {s}".format(s=study_id)
    phy = PhylesystemAPI(get_from="local")
    studyobj = phy.get_study(study_id)["data"]
    nexml = get_nexml_el(studyobj)
    year = nexml.get("^ot:studyYear")
    proposedTrees = nexml.get("^ot:candidateTreeForSynthesis")
    if proposedTrees is None:
        proposedTrees = []
    # create a new Study object
    new_study = Study(id=study_id, year=year)
    session.add(new_study)
    # session.commit()
    # get curator(s), noting that ot:curators might be a
    # string or a list
    c = nexml.get("^ot:curatorName")
    print " ot:curatorName: ", c
    # create list of curator objects
    curator_list = []
    if isinstance(c, basestring):
        curator_list.append(c)
    else:
        curator_list = c
    for curator in curator_list:
        test_c = session.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            print "curator {c} already exists".format(c=curator)
            # session.add(curator)
            new_study.curators.append(test_c)
        else:
            print "curator {c} does not exist".format(c=curator)
            new_study.curators.append(Curator(name=curator))
    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get("^ot:ottId")
        if ottID is not None:
            mapped_otus[oid] = ottID
    # iterate over trees and insert tree data
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        print " tree :", tree_id
        proposedForSynth = False
        if tree_id in proposedTrees:
            proposedForSynth = True
        treejson = json.dumps(tree)
        new_tree = Tree(tree_id=tree_id,
                        study_id=study_id,
                        proposed=proposedForSynth,
                        data=treejson)
        # get otus
        ottIDs = set()  # ott ids for this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get("@otu")
            # no @otu property on internal nodes
            if oid is not None:
                ntips += 1
                # ottID = mapped_otus[oid]
                if oid in mapped_otus:
                    ottID = mapped_otus[oid]
                    # check that this exists in the taxonomy
                    # (it might not, if the ID has been deprecated)
                    taxon = session.query(Taxonomy).filter(Taxonomy.id == ottID).first()
                    if taxon:
                        new_tree.otus.append(taxon)
                        ottIDs.add(ottID)
        new_tree.ntips = ntips
        # need to write function for recursive query of Taxonomy table
        # ottIDs = parent_closure(ottIDs,taxonomy)
        # update with treebase id, if exists
        datadeposit = nexml.get("^ot:dataDeposit")
        if datadeposit:
            url = datadeposit["@href"]
            pattern = re.compile(u".+TB2:(.+)$")
            matchobj = re.match(pattern, url)
            if matchobj:
                tb_id = matchobj.group(1)
                new_tree.treebase_id = tb_id
        session.add(new_tree)
    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml["treesById"]
    studyjson = json.dumps(nexml)
    new_study.data = studyjson
    session.commit()
path_list.append(record['doc'])
size = os.path.getsize(os.path.join(*path_list))
File = db(db.supporting_files.id == id).select()[0]
db.supporting_files[id] = dict(file_size=size)
db.supporting_files[id] = dict(study_id=response.study_id)
# Put annotation-message information to a top-level property, so that
# it can be added to the message collection in the main (nexml-level)
# 'supporting-files-metadata' annotationEvent. This matches our current
# policy for annotations; see
# https://github.com/OpenTreeOfLife/phylesystem-api/wiki/Annotations-in-NexSON#33-storage-and-placement-of-message-objects
r['annotationFileInfo'] = {
    u'@filename': read_filename,
    u'@size': size,
    u'sourceForTree': [{'$': tree_id}
                       for (tree_group_id, tree_id, imported_tree) in iter_trees(nex)],
    u'@type': read_inp_format,
    u'@url': URL(f='download', args=[File['doc']]),
    u'description': {
        u'$': "Source data for tree '{u}'".format(u=tree_id)
    }
}
return r

assert (False)


# provide support for CrossRef.org URLs via HTTPS
def search_crossref_proxy():
    search_crossref_url = request.env.web2py_original_uri.split('search_crossref_proxy')[1]
    # prepend the real domain, using HTTP, and return the response
    search_crossref_url = 'http://search.crossref.org/%s' % search_crossref_url
    req = urllib2.Request(url=search_crossref_url)
r = {"data": nex}
bundle_properties = json.load(codecs.open(RETURN_ATT_FILEPATH, "rU", encoding="utf-8"))
try:
    dd = bundle_properties.get("dataDeposit")
    if dd:
        n = nex.get("nex:nexml") or nex["nex"]
        add_resource_meta(n, "ot:dataDeposit", dd, NEXSON_VERSION)
except:
    pass
r.update(bundle_properties)
r["numberOfTrees"] = num_trees
r["nexml2json"] = NEXSON_VERSION
read_inp_format = bundle_properties.get("inputFormat", "")
read_filename = bundle_properties.get("filename", "")
for tree_tup in iter_trees(nex):
    tree_group_id, tree_id, imported_tree = tree_tup
    # create (or replace) the file information for this imported tree
    imported_tree[u"^ot:messages"] = {
        u"message": [
            {
                u"@id": "message{u}".format(u=unique_id),
                u"@code": u"SUPPORTING_FILE_INFO",
                u"@humanMessageType": u"NONE",
                u"@severity": u"INFO",
                u"@wasGeneratedBy": u"opentree.2nexml",
                # TODO: Do we need to add this agent to the main study?
                u"data": {
                    u"@movedToPermanentArchive": False,
                    u"files": {
                        u"file": [
def load_nexsons(connection, cursor, phy, config_obj, nstudies=None):
    counter = 0
    study_properties = set()
    tree_properties = set()
    for study_id, studyobj in phy.iter_study_objs():
        nexml = get_nexml_el(studyobj)
        # print 'STUDY: ', study_id
        study_properties.update(nexml.keys())
        # study data for study table
        STUDYTABLE = config_obj.get('database_tables', 'studytable')
        year = nexml.get('^ot:studyYear')
        proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
        if proposedTrees is None:
            proposedTrees = []
        # must insert study before trees
        sqlstring = ("INSERT INTO {tablename} (id) "
                     "VALUES (%s);"
                     .format(tablename=STUDYTABLE))
        data = (study_id,)
        # print ' SQL: ', cursor.mogrify(sqlstring)
        cursor.execute(sqlstring, data)
        connection.commit()
        # update with treebase id, if exists
        datadeposit = nexml.get('^ot:dataDeposit')
        if datadeposit:
            url = datadeposit['@href']
            pattern = re.compile(u'.+TB2:(.+)$')
            matchobj = re.match(pattern, url)
            if matchobj:
                tb_id = matchobj.group(1)
                sqlstring = ("UPDATE {tablename} "
                             "SET treebase_id=%s "
                             "WHERE id=%s;"
                             .format(tablename=STUDYTABLE))
                data = (tb_id, study_id)
                # print ' SQL: ', cursor.mogrify(sqlstring, data)
                cursor.execute(sqlstring, data)
                connection.commit()
        # get curator(s), noting that ot:curators might be a
        # string or a list
        c = nexml.get('^ot:curatorName')
        # print ' ot:curatorName: ', c
        curators = []
        if isinstance(c, basestring):
            curators.append(c)
        else:
            curators = c
        # remove duplicates
        curators = list(set(curators))
        insert_curators(connection, cursor, config_obj, study_id, curators)
        # iterate over trees and insert tree data
        # note that OTU data done separately as COPY
        # due to size of table (see script <scriptname>)
        TREETABLE = config_obj.get('database_tables', 'treetable')
        ntrees = 0
        try:
            for trees_group_id, tree_id, tree in iter_trees(studyobj):
                # print ' tree :', tree_id
                ntrees += 1
                proposedForSynth = False
                tree_properties.update(tree.keys())
                if tree_id in proposedTrees:
                    proposedForSynth = True
                treejson = json.dumps(tree)
                ntips = 0
                for node_id, node in iter_node(tree):
                    oid = node.get('@otu')
                    # no @otu property on internal nodes
                    if oid is not None:
                        ntips += 1
                sqlstring = ("INSERT INTO {tablename} "
                             "(tree_id,study_id,ntips,proposed,data) "
                             "VALUES (%s,%s,%s,%s,%s);"
                             .format(tablename=TREETABLE))
                data = (tree_id, study_id, ntips, proposedForSynth, treejson)
                # print ' SQL: ', cursor.mogrify(sqlstring, data)
                cursor.execute(sqlstring, data)
                connection.commit()
        except psy.Error as e:
            print e.pgerror
        # now that we have added the tree info, update the study record
        # with the json data (minus the tree info) and ntrees
        del nexml['treesById']
        studyjson = json.dumps(nexml)
        sqlstring = ("UPDATE {tablename} "
                     "SET data=%s,ntrees=%s "
                     "WHERE id=%s;"
                     .format(tablename=STUDYTABLE))
        data = (studyjson, ntrees, study_id)
        cursor.execute(sqlstring, data)
        connection.commit()
        counter += 1
        if counter % 500 == 0:
            print "loaded {n} studies".format(n=counter)
        if nstudies and counter >= nstudies:
            print "finished inserting", nstudies, "studies"
            break
    # load the tree and study properties
    PROPERTYTABLE = config_obj.get('database_tables', 'propertytable')
    load_properties(connection,
                    cursor,
                    PROPERTYTABLE,
                    study_properties,
                    tree_properties)
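A sketch of how `load_nexsons` might be driven. The database name, user, and config path below are placeholders, and it assumes a config file with a [database_tables] section plus the other helpers from this script (`create_phylesystem_obj`, `insert_curators`, `load_properties`) being available; it is not the project's actual driver code:

import psycopg2 as psy
from ConfigParser import SafeConfigParser

# assumed config file naming the study/tree/property tables
config_obj = SafeConfigParser()
config_obj.read('config.ini')
# placeholder connection parameters
connection = psy.connect(dbname='otindex', user='postgres')
cursor = connection.cursor()
phy = create_phylesystem_obj()  # local phylesystem wrapper, as in the earlier snippet
load_nexsons(connection, cursor, phy, config_obj, nstudies=10)
cursor.close()
connection.close()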