Beispiel #1
0
 def testIterTree(self):
     """Check that iter_trees yields the same ID sequence for every merge fixture.

     Loads the 'expected' and 'input' merge fixtures for versions 1.0 and 1.2,
     records the second element of each tuple produced by iter_trees, and
     asserts that every fixture gives the same sequence as the first one.
     """
     fixture_paths = [
         'merge/merge-{f}.v{v}.json'.format(v=version, f=flavor)
         for version in ['1.0', '1.2']
         for flavor in ['expected', 'input']
     ]
     orders = []
     for path in fixture_paths:
         blob = pathmap.nexson_obj(path)
         orders.append([tree_tuple[1] for tree_tuple in iter_trees(blob)])
     # Two versions x two flavors gives four sequences; compare the rest to the first.
     baseline = orders[0]
     for other in orders[1:]:
         self.assertEqual(baseline, other)
phy = Phylesystem()
study_dict = defaultdict(int)
tree_dict = defaultdict(int)
out = codecs.getwriter("utf-8")(sys.stdout)
for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    t = nexml.get("^ot:tag")
    if t:
        # print study_id, t
        if isinstance(t, list):
            for tag in t:
                study_dict[tag] += 1
        else:
            study_dict[t] += 1
    for trees_group_id, tree_id, tree in iter_trees(n):
        t = tree.get("^ot:tag")
        if t:
            # print study_id, tree_id, t
            if isinstance(t, list):
                for tag in t:
                    study_dict[tag] += 1
            else:
                tree_dict[t] += 1
print "\nStudy tag counts:"
for k, v in study_dict.items():
    print k, "\t", v
print "\nTree tag counts:"
for k, v in tree_dict.items():
    print k, "\t", v
Beispiel #3
0
        path_list.append('uploads')
        path_list.append(record['doc'])
        size = os.path.getsize(os.path.join(*path_list))

        File = db(db.supporting_files.id == id).select()[0]
        db.supporting_files[id] = dict(file_size=size)
        db.supporting_files[id] = dict(study_id=response.study_id)

        # Put annotation- message information to a top-level property, so that
        # it can be added to the message collection in the main (nexml-level)
        # 'supporting-files-metadata' annotationEvent. This matches our current
        # policy for annotations; see
        # https://github.com/OpenTreeOfLife/phylesystem-api/wiki/Annotations-in-NexSON#33-storage-and-placement-of-message-objects
        tree_ids = [
            tree_id
            for (tree_group_id, tree_id, imported_tree) in iter_trees(nex)
        ]
        quoted_tree_ids = ["'{s}'".format(s=tree_id) for tree_id in tree_ids]
        r['annotationFileInfo'] = {
            u'@filename': read_filename,
            u'@size': size,
            u'sourceForTree': [{
                '$': tree_id
            } for tree_id in tree_ids],
            u'@type': read_inp_format,
            u'@url': URL(f='download', args=[File['doc']]),
            u'description': {
                u'$':
                len(quoted_tree_ids) and "Source data for tree(s) {s}".format(
                    s=', '.join(quoted_tree_ids))
                or "No trees found in this file."
Beispiel #4
0
else:
    v_dict = defaultdict(int)

def process_val(v, id_str):
    """Record or print one property value, depending on the reporting mode.

    ``None`` values are ignored. The mode comes from the module-level flags:
    with ``report_ids`` the value maps to the list of ``id_str`` that carried
    it; with ``summarize_as_set`` the value's counter in ``v_dict`` is bumped;
    otherwise one line is written to ``out``.
    """
    if v is None:
        return
    if report_ids:
        v_dict.setdefault(v, []).append(id_str)
        return
    if summarize_as_set:
        v_dict[v] += 1
        return
    # NOTE(review): this writes the module-level study_id, not id_str, so a
    # tree-level value is labelled with the study only -- confirm intent.
    out.write(u'{i}: {v}\n'.format(i=study_id, v=v))

# Feed each study's property value (or each tree's, when check_trees is set)
# through process_val.
for study_id, n in phy.iter_study_objs():
    nexml = get_nexml_el(n)
    if not check_trees:
        process_val(nexml.get(study_prop), study_id)
        continue
    for trees_group_id, tree_id, tree in iter_trees(n):
        process_val(
            tree.get(study_prop),
            'study: {s} tree: {t}'.format(s=study_id, t=tree_id),
        )

# Emit the accumulated summary, most frequent values first.
if report_ids:
    ranked = sorted(
        ((len(ids), val, ids) for val, ids in v_dict.items()),
        reverse=True,
    )
    for count, val, ids in ranked:
        out.write(u'{k}\tseen {n:d} times\t{v}\n'.format(k=val, n=count, v='\t'.join(ids)))
elif summarize_as_set:
    ranked = sorted(((count, val) for val, count in v_dict.items()), reverse=True)
    for count, val in ranked:
        out.write(u'"{k}" (seen {v:d} times)\n'.format(k=val, v=count))
Beispiel #5
0
        bundle_properties = json.load(
            codecs.open(RETURN_ATT_FILEPATH, 'rU', encoding='utf-8'))
        try:
            dd = bundle_properties.get('dataDeposit')
            if dd:
                n = nex.get('nex:nexml') or nex['nex']
                add_resource_meta(n, "ot:dataDeposit", dd, NEXSON_VERSION)
        except:
            pass
        r.update(bundle_properties)
        r['numberOfTrees'] = num_trees
        r['nexml2json'] = NEXSON_VERSION
        read_inp_format = bundle_properties.get('inputFormat', '')
        read_filename = bundle_properties.get('filename', '')

        for tree_tup in iter_trees(nex):
            tree_group_id, tree_id, imported_tree = tree_tup
            # create (or replace) the file information for this imported tree
            imported_tree[u'^ot:messages'] = {
                u'message': [{
                    u'@id': "message{u}".format(u=unique_id),
                    u'@code': u'SUPPORTING_FILE_INFO',
                    u'@humanMessageType': u'NONE',
                    u'@severity': u'INFO',
                    u'@wasGeneratedBy': u'opentree.2nexml',
                    # TODO: Do we need to add this agent to the main study?
                    u'data': {
                        u'@movedToPermanentArchive': False,
                        u'files': {
                            u'file': [{
                                u'@filename':
        path_list = []
        path_list.append(request.folder)
        path_list.append('uploads')
        path_list.append(record['doc'])
        size = os.path.getsize(os.path.join(*path_list))
         
        File = db(db.supporting_files.id==id).select()[0]
        db.supporting_files[id] = dict(file_size=size)
        db.supporting_files[id] = dict(study_id=response.study_id)
         
        # Put annotation- message information to a top-level property, so that
        # it can be added to the message collection in the main (nexml-level)
        # 'supporting-files-metadata' annotationEvent. This matches our current
        # policy for annotations; see 
        # https://github.com/OpenTreeOfLife/phylesystem-api/wiki/Annotations-in-NexSON#33-storage-and-placement-of-message-objects
        tree_ids = [tree_id for (tree_group_id, tree_id, imported_tree) in iter_trees(nex)]
        quoted_tree_ids = ["'{s}'".format(s=tree_id) for tree_id in tree_ids]
        r['annotationFileInfo'] = { 
            u'@filename': read_filename,
            u'@size': size,
            u'sourceForTree': [{'$':tree_id} for tree_id in tree_ids],
            u'@type': read_inp_format,
            u'@url': URL(f='download', args=[File['doc']]),
            u'description': {
                u'$': len(quoted_tree_ids) and "Source data for tree(s) {s}".format(s=', '.join(quoted_tree_ids)) or "No trees found in this file."
            }
        }
        return r
    assert (False)

# provide support for CrossRef.org URLs via HTTPS
def add_study(study_id):
    """Load one phylesystem study, with its curators and trees, into DBSession.

    Fetches the study through ``create_phylesystem_obj()``, adds a ``Study``
    row, attaches ``Curator`` rows (reusing an existing row when one with the
    same name is found), adds one ``Tree`` row per tree yielded by
    ``iter_trees``, and finally stores the study-level JSON (with
    ``treesById`` removed) and the tree count on the study. No commit is
    issued here.

    Args:
        study_id: identifier of the study to fetch and store.

    Raises:
        HTTPNotFound: if the study cannot be fetched from phylesystem.
    """
    _LOG.debug('adding study {s}'.format(s=study_id))

    # get latest version of nexson
    # location of repo (test vs dev) dependent on peyotl config
    phy = create_phylesystem_obj()
    try:
        studyobj = phy.get_study(study_id)['data']
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # are not turned into a 404.
        _LOG.debug('did not find study {s} in phylesystem'.format(s=study_id))
        raise HTTPNotFound("Study {s} not found in phylesystem".format(s=study_id))
    nexml = get_nexml_el(studyobj)
    proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id)
    DBSession.add(new_study)

    # update with treebase id, if exists
    datadeposit = nexml.get('^ot:dataDeposit')
    if datadeposit:
        # .get: a deposit entry without '@href' is skipped instead of raising
        url = datadeposit.get('@href')
        if url:
            matchobj = re.match(u'.+TB2:(.+)$', url)
            if matchobj:
                new_study.treebase_id = matchobj.group(1)

    # get curator(s), noting that ot:curators might be absent,
    # a string or a list
    c = nexml.get('^ot:curatorName')
    if c is None:
        # no curator recorded: nothing to attach (iterating None would raise)
        curator_list = []
    elif isinstance(c, basestring):
        curator_list = [c]
    else:
        curator_list = c
    for curator in curator_list:
        test_c = DBSession.query(Curator).filter(Curator.name==curator).first()
        if test_c:
            _LOG.debug("curator {c} already exists".format(c=curator))
            new_study.curators.append(test_c)
        else:
            _LOG.debug("curator {c} does not yet exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get('^ot:ottId')
        if ottID is not None:
            mapped_otus[oid] = ottID

    # iterate over trees and insert tree data
    ntrees = 0
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        _LOG.debug(' tree : {t}'.format(t=tree_id))
        ntrees += 1

        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=(tree_id in proposedTrees),
            data=json.dumps(tree)
            )

        # get otus
        ottIDs = set()     # ott ids for this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            # no @otu property on internal nodes
            if oid is None:
                continue
            ntips += 1
            if oid not in mapped_otus:
                continue
            ottID = mapped_otus[oid]
            # check that this exists in the taxonomy
            # (it might not, if the ID has been deprecated)
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id==ottID
                ).first()
            if taxon:
                lineage = get_lineage(ottID)
                _LOG.debug(' lineage of {m} = {l}'.format(m=ottID,l=lineage))
                ottIDs.update(lineage)
        new_tree.ntips = ntips
        for t in ottIDs:
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id==t
                ).first()
            new_tree.otus.append(taxon)

        # add the tree
        DBSession.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml['treesById']
    new_study.data = json.dumps(nexml)
    new_study.ntrees = ntrees
def create_phylesystem_obj():
    """Return the ``phylesystem_obj`` of a PhylesystemAPI created with ``get_from='local'``."""
    return PhylesystemAPI(get_from='local').phylesystem_obj


if __name__ == "__main__":
    # Survey the local phylesystem: collect every key used on a study's
    # 'nexml' element and on any tree, then print both sets.
    counter = 0
    limit = None  # optional cap on the number of studies read; None reads all
    tree_key_set = set()
    study_key_set = set()
    phy = create_phylesystem_obj()
    for study_id, studyobj in phy.iter_study_objs():
        study_key_set.update(studyobj['nexml'].keys())
        for trees_group_id, tree_id, tree in iter_trees(studyobj):
            tree_key_set.update(tree.keys())
        counter += 1
        if counter % 100 == 0:
            print("Read {n} studies".format(n=counter))
        # counter is already incremented, so >= stops after exactly `limit`
        # studies (a > test would read one study too many).
        if limit and counter >= limit:
            break
    print("found {n} study properties".format(n=len(study_key_set)))
    for k in study_key_set:
        print(k)
    print("found {n} tree properties".format(n=len(tree_key_set)))
    for k in tree_key_set:
        print(k)
def addStudy(session, study_id):
    # get latest version of nexson
    print "adding study {s}".format(s=study_id)
    phy = PhylesystemAPI(get_from="local")
    studyobj = phy.get_study(study_id)["data"]
    nexml = get_nexml_el(studyobj)
    year = nexml.get("^ot:studyYear")
    proposedTrees = nexml.get("^ot:candidateTreeForSynthesis")
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id, year=year)
    session.add(new_study)
    # session.commit()

    # get curator(s), noting that ot:curators might be a
    # string or a list
    c = nexml.get("^ot:curatorName")
    print " ot:curatorName: ", c
    # create list of curator objects
    curator_list = []
    if isinstance(c, basestring):
        curator_list.append(c)
    else:
        curator_list = c
    for curator in curator_list:
        test_c = session.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            print "curator {c} already exists".format(c=curator)
            # session.add(curator)
            new_study.curators.append(test_c)
        else:
            print "curator {c} does no exist".format(c=curator)
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study
    otu_dict = gen_otu_dict(studyobj)
    # iterate over the OTUs in the study, collecting the mapped
    # ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in otu_dict.items():
        ottID = o.get("^ot:ottId")
        if ottID is not None:
            mapped_otus[oid] = ottID

    # iterate over trees and insert tree data
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        print " tree :", tree_id
        proposedForSynth = False
        if tree_id in proposedTrees:
            proposedForSynth = True

        treejson = json.dumps(tree)
        new_tree = Tree(tree_id=tree_id, study_id=study_id, proposed=proposedForSynth, data=treejson)

        # get otus
        ottIDs = set()  # ott ids for this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get("@otu")
            # no @otu property on internal nodes
            if oid is not None:
                ntips += 1
                # ottID = mapped_otus[oid]
                if oid in mapped_otus:
                    ottID = mapped_otus[oid]
                    # check that this exists in the taxonomy
                    # (it might not, if the ID has been deprecated)
                    taxon = session.query(Taxonomy).filter(Taxonomy.id == ottID).first()
                    if taxon:
                        new_tree.otus.append(taxon)
                        ottIDs.add(ottID)
        new_tree.ntips = ntips
        # need to write function for recursive query of Taxonomy table
        # ottIDs = parent_closure(ottIDs,taxonomy)

        # update with treebase id, if exists
        datadeposit = nexml.get("^ot:dataDeposit")
        if datadeposit:
            url = datadeposit["@href"]
            pattern = re.compile(u".+TB2:(.+)$")
            matchobj = re.match(pattern, url)
            if matchobj:
                tb_id = matchobj.group(1)
                new_tree.treebase_id = tb_id
        session.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    del nexml["treesById"]
    studyjson = json.dumps(nexml)
    new_study.data = studyjson
    session.commit()
Beispiel #10
0
        path_list.append(record['doc'])
        size = os.path.getsize(os.path.join(*path_list))
         
        File = db(db.supporting_files.id==id).select()[0]
        db.supporting_files[id] = dict(file_size=size)
        db.supporting_files[id] = dict(study_id=response.study_id)
         
        # Put annotation- message information to a top-level property, so that
        # it can be added to the message collection in the main (nexml-level)
        # 'supporting-files-metadata' annotationEvent. This matches our current
        # policy for annotations; see 
        # https://github.com/OpenTreeOfLife/phylesystem-api/wiki/Annotations-in-NexSON#33-storage-and-placement-of-message-objects
        r['annotationFileInfo'] = { 
            u'@filename': read_filename,
            u'@size': size,
            u'sourceForTree': [{'$':tree_id} for (tree_group_id, tree_id, imported_tree) in iter_trees(nex)],
            u'@type': read_inp_format,
            u'@url': URL(f='download', args=[File['doc']]),
            u'description': {
                u'$': "Source data for tree '{u}'".format(u=tree_id)
            }
        }
        return r
    assert (False)

# provide support for CrossRef.org URLs via HTTPS
def search_crossref_proxy():
    search_crossref_url = request.env.web2py_original_uri.split('search_crossref_proxy')[1]
    # prepend the real domain, using HTTP, and return the response
    search_crossref_url = 'http://search.crossref.org/%s' % search_crossref_url
    req = urllib2.Request(url=search_crossref_url) 
Beispiel #11
0
        r = {"data": nex}
        bundle_properties = json.load(codecs.open(RETURN_ATT_FILEPATH, "rU", encoding="utf-8"))
        try:
            dd = bundle_properties.get("dataDeposit")
            if dd:
                n = nex.get("nex:nexml") or nex["nex"]
                add_resource_meta(n, "ot:dataDeposit", dd, NEXSON_VERSION)
        except:
            pass
        r.update(bundle_properties)
        r["numberOfTrees"] = num_trees
        r["nexml2json"] = NEXSON_VERSION
        read_inp_format = bundle_properties.get("inputFormat", "")
        read_filename = bundle_properties.get("filename", "")

        for tree_tup in iter_trees(nex):
            tree_group_id, tree_id, imported_tree = tree_tup
            # create (or replace) the file information for this imported tree
            imported_tree[u"^ot:messages"] = {
                u"message": [
                    {
                        u"@id": "message{u}".format(u=unique_id),
                        u"@code": u"SUPPORTING_FILE_INFO",
                        u"@humanMessageType": u"NONE",
                        u"@severity": u"INFO",
                        u"@wasGeneratedBy": u"opentree.2nexml",
                        # TODO: Do we need to add this agent to the main study?
                        u"data": {
                            u"@movedToPermanentArchive": False,
                            u"files": {
                                u"file": [
def load_nexsons(connection,cursor,phy,config_obj,nstudies=None):
    counter = 0
    study_properties = set()
    tree_properties = set()
    for study_id, studyobj in phy.iter_study_objs():
        nexml = get_nexml_el(studyobj)
        #print 'STUDY: ',study_id
        study_properties.update(nexml.keys())
        # study data for study table
        STUDYTABLE = config_obj.get('database_tables','studytable')
        year = nexml.get('^ot:studyYear')
        proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
        if proposedTrees is None:
            proposedTrees = []

        # must insert study before trees
        sqlstring = ("INSERT INTO {tablename} (id) "
            "VALUES (%s);"
            .format(tablename=STUDYTABLE)
            )
        data = (study_id,)
        #print '  SQL: ',cursor.mogrify(sqlstring)
        cursor.execute(sqlstring,data)
        connection.commit()

        # update with treebase id, if exists
        datadeposit = nexml.get('^ot:dataDeposit')
        if (datadeposit):
            url = datadeposit['@href']
            pattern = re.compile(u'.+TB2:(.+)$')
            matchobj = re.match(pattern,url)
            if (matchobj):
                tb_id = matchobj.group(1)
                sqlstring = ("UPDATE {tablename} "
                    "SET treebase_id=%s "
                    "WHERE id=%s;"
                    .format(tablename=STUDYTABLE)
                    )
                data = (tb_id,study_id)
                #print '  SQL: ',cursor.mogrify(sqlstring,data)
                cursor.execute(sqlstring,data)
                connection.commit()

        # get curator(s), noting that ot:curators might be a
        # string or a list
        c = nexml.get('^ot:curatorName')
        #print ' ot:curatorName: ',c
        curators=[]
        if (isinstance(c,basestring)):
            curators.append(c)
        else:
            curators=c
        # remove duplicates
        curators = list(set(curators))
        insert_curators(connection,cursor,config_obj,study_id,curators)

        # iterate over trees and insert tree data
        # note that OTU data done separately as COPY
        # due to size of table (see script <scriptname>)
        TREETABLE = config_obj.get('database_tables','treetable')
        ntrees = 0
        try:
            for trees_group_id, tree_id, tree in iter_trees(studyobj):
                #print ' tree :' ,tree_id
                ntrees += 1
                proposedForSynth = False
                tree_properties.update(tree.keys())
                if (tree_id in proposedTrees):
                    proposedForSynth = True
                treejson = json.dumps(tree)
                ntips = 0
                for node_id, node in iter_node(tree):
                    oid = node.get('@otu')
                    # no @otu property on internal nodes
                    if oid is not None:
                        ntips+=1

                sqlstring = ("INSERT INTO {tablename} "
                    "(tree_id,study_id,ntips,proposed,data) "
                    "VALUES (%s,%s,%s,%s,%s);"
                    .format(tablename=TREETABLE)
                    )
                data = (tree_id,study_id,ntips,proposedForSynth,treejson)
                #print '  SQL: ',cursor.mogrify(sqlstring,data)
                cursor.execute(sqlstring,data)
                connection.commit()

        except psy.Error as e:
            print e.pgerror

        # now that we have added the tree info, update the study record
        # with the json data (minus the tree info) and ntrees
        del nexml['treesById']
        studyjson = json.dumps(nexml)
        sqlstring = ("UPDATE {tablename} "
            "SET data=%s,ntrees=%s "
            "WHERE id=%s;"
            .format(tablename=STUDYTABLE)
        )
        data = (studyjson,ntrees,study_id)
        cursor.execute(sqlstring,data)
        connection.commit()

        counter+=1
        if (counter%500 == 0):
            print "loaded {n} studies".format(n=counter)

        if (nstudies and counter>=nstudies):
            print "finished inserting",nstudies,"studies"
            break

    # load the tree and study properties
    PROPERTYTABLE = config_obj.get('database_tables','propertytable')
    load_properties(
        connection,
        cursor,
        PROPERTYTABLE,
        study_properties,
        tree_properties)