Esempio n. 1
0
def addStudy(session, study_id):
    """Load one study from the local phylesystem into the database.

    Fetches the study through ``PhylesystemAPI(get_from="local")``, adds a
    ``Study`` row with its curators, adds one ``Tree`` row per tree in the
    study (linked to the ``Taxonomy`` rows that its tips map to), stores
    the nexml JSON without ``treesById`` on the study, and commits.

    Args:
        session: database session used for every query and add, and for
            the final commit.
        study_id: identifier of the study to fetch from phylesystem.
    """
    # Single-argument print(...) calls produce the same output under
    # Python 2 as the print statement while also parsing under Python 3.
    print("adding study {s}".format(s=study_id))

    # get latest version of nexson
    phy = PhylesystemAPI(get_from="local")
    studyobj = phy.get_study(study_id)["data"]
    nexml = get_nexml_el(studyobj)
    year = nexml.get("^ot:studyYear")
    proposedTrees = nexml.get("^ot:candidateTreeForSynthesis")
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id, year=year)
    session.add(new_study)

    # get curator(s), noting that ot:curatorName might be missing,
    # a single string, or a list
    c = nexml.get("^ot:curatorName")
    print(" ot:curatorName:  %s" % (c,))
    if c is None:
        # Iterating over None would raise TypeError; treat as "no curators".
        curator_list = []
    elif isinstance(c, basestring):
        curator_list = [c]
    else:
        curator_list = c
    for curator in curator_list:
        test_c = session.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            print("curator {c} already exists".format(c=curator))
            new_study.curators.append(test_c)
        else:
            print("curator {c} does no exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study: iterate over the OTUs, collecting the
    # mapped ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in gen_otu_dict(studyobj).items():
        ottID = o.get("^ot:ottId")
        if ottID is not None:
            mapped_otus[oid] = ottID

    # The data deposit is a study-level property, so the treebase id is
    # worked out once here rather than once per tree.  A missing or empty
    # @href is tolerated instead of raising.
    tb_id = None
    datadeposit = nexml.get("^ot:dataDeposit")
    if datadeposit:
        url = datadeposit.get("@href")
        if url:
            matchobj = re.match(u".+TB2:(.+)$", url)
            if matchobj:
                tb_id = matchobj.group(1)

    # iterate over trees and insert tree data
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        print(" tree : %s" % (tree_id,))
        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=(tree_id in proposedTrees),
            data=json.dumps(tree),
        )

        # get otus
        ottIDs = set()  # ott ids already linked to this tree
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get("@otu")
            # no @otu property on internal nodes
            if oid is None:
                continue
            ntips += 1
            ottID = mapped_otus.get(oid)
            # Skip unmapped tips, and taxa this tree is already linked to
            # (several tips may map to the same ott id).
            if ottID is None or ottID in ottIDs:
                continue
            # check that this exists in the taxonomy
            # (it might not, if the ID has been deprecated)
            taxon = session.query(Taxonomy).filter(Taxonomy.id == ottID).first()
            if taxon:
                new_tree.otus.append(taxon)
                ottIDs.add(ottID)
        new_tree.ntips = ntips
        # need to write function for recursive query of Taxonomy table
        # ottIDs = parent_closure(ottIDs,taxonomy)

        # update with treebase id, if exists
        if tb_id is not None:
            new_tree.treebase_id = tb_id
        session.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    nexml.pop("treesById", None)
    new_study.data = json.dumps(nexml)
    session.commit()
def add_study(study_id):
    """Add one phylesystem study, its curators and its trees to ``DBSession``.

    Creates a ``Study`` (with treebase id when the data deposit URL carries
    one), attaches existing or new ``Curator`` objects, and adds one
    ``Tree`` per tree in the study.  Each tree is linked to the taxa
    returned by ``get_lineage`` for every mapped tip whose ott id is found
    in ``Taxonomy``.  Finally the nexml JSON (without ``treesById``) and
    the tree count are stored on the study.  This function does not commit.

    Args:
        study_id: identifier of the study to fetch from phylesystem.

    Raises:
        HTTPNotFound: if the study cannot be retrieved from phylesystem.
    """
    _LOG.debug('adding study {s}'.format(s=study_id))

    # get latest version of nexson
    # location of repo (test vs dev) dependent on peyotl config
    phy = create_phylesystem_obj()
    try:
        studyobj = phy.get_study(study_id)['data']
    except Exception:
        # "except Exception" rather than a bare except, so that
        # KeyboardInterrupt / SystemExit are not reported as a 404.
        _LOG.debug('did not find study {s} in phylesystem'.format(s=study_id))
        raise HTTPNotFound("Study {s} not found in phylesystem".format(s=study_id))
    nexml = get_nexml_el(studyobj)
    proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
    if proposedTrees is None:
        proposedTrees = []

    # create a new Study object
    new_study = Study(id=study_id)
    DBSession.add(new_study)

    # update with treebase id, if exists; a data deposit without an
    # @href is tolerated instead of raising KeyError
    datadeposit = nexml.get('^ot:dataDeposit')
    if datadeposit:
        url = datadeposit.get('@href')
        if url:
            matchobj = re.match(u'.+TB2:(.+)$', url)
            if matchobj:
                new_study.treebase_id = matchobj.group(1)

    # get curator(s), noting that ot:curatorName might be missing,
    # a single string, or a list
    c = nexml.get('^ot:curatorName')
    if c is None:
        # Iterating over None would raise TypeError; treat as "no curators".
        curator_list = []
    elif isinstance(c, basestring):
        curator_list = [c]
    else:
        curator_list = c
    for curator in curator_list:
        test_c = DBSession.query(Curator).filter(Curator.name == curator).first()
        if test_c:
            _LOG.debug("curator {c} already exists".format(c=curator))
            new_study.curators.append(test_c)
        else:
            _LOG.debug("curator {c} does not yet exist".format(c=curator))
            new_study.curators.append(Curator(name=curator))

    # mapped otus in this study: iterate over the OTUs, collecting the
    # mapped ones (oid to ott_id mapping held at the study level)
    mapped_otus = {}
    for oid, o in gen_otu_dict(studyobj).items():
        ottID = o.get('^ot:ottId')
        if ottID is not None:
            mapped_otus[oid] = ottID

    # iterate over trees and insert tree data
    ntrees = 0
    for trees_group_id, tree_id, tree in iter_trees(studyobj):
        _LOG.debug(' tree : {t}'.format(t=tree_id))
        ntrees += 1
        new_tree = Tree(
            tree_id=tree_id,
            study_id=study_id,
            proposed=(tree_id in proposedTrees),
            data=json.dumps(tree),
        )

        # get otus
        ottIDs = set()     # ott ids for this tree (tips plus their lineages)
        ntips = 0
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            # no @otu property on internal nodes
            if oid is None:
                continue
            ntips += 1
            if oid not in mapped_otus:
                continue
            ottID = mapped_otus[oid]
            # check that this exists in the taxonomy
            # (it might not, if the ID has been deprecated)
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id == ottID
            ).first()
            if taxon:
                lineage = get_lineage(ottID)
                _LOG.debug(' lineage of {m} = {l}'.format(m=ottID, l=lineage))
                ottIDs.update(lineage)
        new_tree.ntips = ntips

        for t in ottIDs:
            taxon = DBSession.query(Taxonomy).filter(
                Taxonomy.id == t
            ).first()
            # A lineage id without a Taxonomy row would otherwise put
            # None into the relationship collection.
            if taxon is not None:
                new_tree.otus.append(taxon)

        # add the tree
        DBSession.add(new_tree)

    # now that we have added the tree info, update the study record
    # with the json data (minus the tree info)
    nexml.pop('treesById', None)
    new_study.data = json.dumps(nexml)
    new_study.ntrees = ntrees
Esempio n. 3
0
# Walk every study in the phylesystem and, per OTU, gather the values of
# orig_prop_name and label_prop_name together with the tax_prop_name value
# found on tree nodes that refer to that OTU.
phy = Phylesystem()

out = codecs.getwriter('utf-8')(sys.stdout)
for study_id, n in phy.iter_study_objs():
    otu_dict = gen_otu_dict(n)
    # oid -> [orig_prop_name value, tax_prop_name value from a node, label]
    o_dict = {}
    for oid, o in otu_dict.items():
        try:
            lab = o[label_prop_name]
            orig = o[orig_prop_name]
        except KeyError:
            # OTUs lacking either property are skipped.  Only KeyError is
            # caught, so Ctrl-C and real bugs are no longer swallowed.
            continue
        o_dict[oid] = [orig, None, lab]
    del otu_dict
    for tree in iter_trees(n):
        for node in iter_node(tree):
            oid = node.get('@otu')
            if oid is None:
                continue
            ott = node.get(tax_prop_name)
            if ott is None:
                continue
            try:
                o_dict[oid][1] = ott
            except KeyError:
                # The node points at an OTU that was not collected above.
                e = 'study {f} node {n} refers to otu {o} which is not found.\n'
                m = e.format(f=study_id, n=node.get('@id'), o=oid)
                sys.stderr.write(m)
    for oid, v in o_dict.items():
        t = v[1]
        l = v[2]
        if l and (t != l):
            # NOTE(review): whatever consumes orig / t / l is not visible
            # in this snippet.
            orig = v[0]
# For each study in the phylesystem, build a per-OTU record of the
# orig_prop_name / label_prop_name values and fill in the tax_prop_name
# value carried by tree nodes that reference the OTU.
phy = Phylesystem()

out = codecs.getwriter('utf-8')(sys.stdout)
for study_id, n in phy.iter_study_objs():
    otu_dict = gen_otu_dict(n)
    # oid -> [orig_prop_name value, tax_prop_name value from a node, label]
    o_dict = {}
    for oid, o in otu_dict.items():
        try:
            lab = o[label_prop_name]
            orig = o[orig_prop_name]
        except KeyError:
            # Best effort: an OTU missing either property is left out.
            # Catching only KeyError keeps KeyboardInterrupt and genuine
            # errors visible.
            continue
        o_dict[oid] = [orig, None, lab]
    del otu_dict
    for tree in iter_trees(n):
        for node in iter_node(tree):
            oid = node.get('@otu')
            if oid is None:
                continue
            ott = node.get(tax_prop_name)
            if ott is None:
                continue
            try:
                o_dict[oid][1] = ott
            except KeyError:
                # Report nodes whose OTU id has no record in o_dict.
                e = 'study {f} node {n} refers to otu {o} which is not found.\n'
                m = e.format(f=study_id, n=node.get('@id'), o=oid)
                sys.stderr.write(m)
    for oid, v in o_dict.items():
        t = v[1]
        l = v[2]
        if l and (t != l):
            # NOTE(review): the code that uses orig / t / l is cut off in
            # this snippet.
            orig = v[0]
# Print each study id, then collect per-OTU orig_prop_name and
# label_prop_name values and the tax_prop_name value from the tree nodes
# that reference each OTU.
out = codecs.getwriter('utf-8')(sys.stdout)
for study_id, n in phy.iter_study_objs():
    print(study_id)
    otu_dict = gen_otu_dict(n)
    # oid -> [orig_prop_name value, tax_prop_name value from a node, label]
    o_dict = {}
    for oid, o in otu_dict.items():
        try:
            lab = o[label_prop_name]
            orig = o[orig_prop_name]
        except KeyError:
            # OTUs lacking either property are skipped.  Only KeyError is
            # caught, so Ctrl-C and real bugs are no longer swallowed.
            continue
        o_dict[oid] = [orig, None, lab]
    del otu_dict
    for trees_group_id, tree_id, tree in iter_trees(n):
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            if oid is None:
                continue
            ott = node.get(tax_prop_name)
            if ott is None:
                continue
            try:
                o_dict[oid][1] = ott
            except KeyError:
                # The node points at an OTU that was not collected above.
                e = 'study {f} node {n} refers to otu {o} which is not found.\n'
                m = e.format(f=study_id, n=node.get('@id'), o=oid)
                sys.stderr.write(m)
    for oid, v in o_dict.items():
        t = v[1]
        l = v[2]
        if l and (t != l):
            # NOTE(review): whatever consumes orig / t / l is not visible
            # in this snippet.
            orig = v[0]
Esempio n. 6
0
# For every study: print its id, record the orig_prop_name and
# label_prop_name values of each OTU, then fill in the tax_prop_name value
# found on tree nodes that reference the OTU.
out = codecs.getwriter('utf-8')(sys.stdout)
for study_id, n in phy.iter_study_objs():
    print(study_id)
    otu_dict = gen_otu_dict(n)
    # oid -> [orig_prop_name value, tax_prop_name value from a node, label]
    o_dict = {}
    for oid, o in otu_dict.items():
        try:
            lab = o[label_prop_name]
            orig = o[orig_prop_name]
        except KeyError:
            # Best effort: an OTU missing either property is left out.
            # Catching only KeyError keeps KeyboardInterrupt and genuine
            # errors visible.
            continue
        o_dict[oid] = [orig, None, lab]
    del otu_dict
    for trees_group_id, tree_id, tree in iter_trees(n):
        for node_id, node in iter_node(tree):
            oid = node.get('@otu')
            if oid is None:
                continue
            ott = node.get(tax_prop_name)
            if ott is None:
                continue
            try:
                o_dict[oid][1] = ott
            except KeyError:
                # Report nodes whose OTU id has no record in o_dict.
                e = 'study {f} node {n} refers to otu {o} which is not found.\n'
                m = e.format(f=study_id, n=node.get('@id'), o=oid)
                sys.stderr.write(m)
    for oid, v in o_dict.items():
        t = v[1]
        l = v[2]
        if l and (t != l):
            # NOTE(review): the code that uses orig / t / l is cut off in
            # this snippet.
            orig = v[0]
Esempio n. 7
0
def load_nexsons(connection, cursor, phy, config_obj, nstudies=None):
    """Insert the studies and trees yielded by ``phy`` into the database.

    For each study: insert the study row, set its treebase id when the
    data deposit URL carries one, hand the de-duplicated curator names to
    ``insert_curators``, insert one row per tree (with tip count and
    "proposed for synthesis" flag), then store the nexml JSON without
    ``treesById`` plus the tree count on the study row.  Each statement is
    committed individually.  After the loop, the nexml and tree property
    names seen are passed to ``load_properties``.

    Args:
        connection: database connection; used for commit and rollback.
        cursor: cursor on ``connection`` used to execute the SQL.
        phy: object whose ``iter_study_objs()`` yields
            ``(study_id, studyobj)`` pairs.
        config_obj: configuration object; table names are read from its
            ``database_tables`` section.
        nstudies: optional cap on the number of studies to load; a falsy
            value means no cap.
    """
    counter = 0
    study_properties = set()
    tree_properties = set()

    # Loop invariants: table names and the treebase URL pattern.
    STUDYTABLE = config_obj.get('database_tables', 'studytable')
    TREETABLE = config_obj.get('database_tables', 'treetable')
    treebase_pattern = re.compile(u'.+TB2:(.+)$')

    for study_id, studyobj in phy.iter_study_objs():
        nexml = get_nexml_el(studyobj)
        study_properties.update(nexml.keys())
        proposedTrees = nexml.get('^ot:candidateTreeForSynthesis')
        if proposedTrees is None:
            proposedTrees = []

        # must insert study before trees
        sqlstring = ("INSERT INTO {tablename} (id) "
            "VALUES (%s);"
            .format(tablename=STUDYTABLE)
            )
        data = (study_id,)
        cursor.execute(sqlstring, data)
        connection.commit()

        # update with treebase id, if exists; a data deposit without an
        # @href is tolerated instead of raising
        datadeposit = nexml.get('^ot:dataDeposit')
        url = datadeposit.get('@href') if datadeposit else None
        matchobj = treebase_pattern.match(url) if url else None
        if matchobj:
            tb_id = matchobj.group(1)
            sqlstring = ("UPDATE {tablename} "
                "SET treebase_id=%s "
                "WHERE id=%s;"
                .format(tablename=STUDYTABLE)
                )
            data = (tb_id, study_id)
            cursor.execute(sqlstring, data)
            connection.commit()

        # get curator(s), noting that ot:curatorName might be missing,
        # a single string, or a list
        c = nexml.get('^ot:curatorName')
        if c is None:
            # set(None) would raise TypeError; treat as "no curators".
            curators = []
        elif isinstance(c, basestring):
            curators = [c]
        else:
            # remove duplicates
            curators = list(set(c))
        insert_curators(connection, cursor, config_obj, study_id, curators)

        # iterate over trees and insert tree data
        # note that OTU data done separately as COPY
        # due to size of table (see script <scriptname>)
        ntrees = 0
        try:
            for trees_group_id, tree_id, tree in iter_trees(studyobj):
                ntrees += 1
                tree_properties.update(tree.keys())
                proposedForSynth = tree_id in proposedTrees
                treejson = json.dumps(tree)
                # tips are the nodes carrying an @otu property
                # (no @otu property on internal nodes)
                ntips = 0
                for node_id, node in iter_node(tree):
                    if node.get('@otu') is not None:
                        ntips += 1

                sqlstring = ("INSERT INTO {tablename} "
                    "(tree_id,study_id,ntips,proposed,data) "
                    "VALUES (%s,%s,%s,%s,%s);"
                    .format(tablename=TREETABLE)
                    )
                data = (tree_id, study_id, ntips, proposedForSynth, treejson)
                cursor.execute(sqlstring, data)
                connection.commit()

        except psy.Error as e:
            print(e.pgerror)
            # A failed statement leaves the transaction aborted; without a
            # rollback every later execute on this connection would fail.
            connection.rollback()

        # now that we have added the tree info, update the study record
        # with the json data (minus the tree info) and ntrees
        nexml.pop('treesById', None)
        studyjson = json.dumps(nexml)
        sqlstring = ("UPDATE {tablename} "
            "SET data=%s,ntrees=%s "
            "WHERE id=%s;"
            .format(tablename=STUDYTABLE)
        )
        data = (studyjson, ntrees, study_id)
        cursor.execute(sqlstring, data)
        connection.commit()

        counter += 1
        if counter % 500 == 0:
            print("loaded {n} studies".format(n=counter))

        if nstudies and counter >= nstudies:
            print("finished inserting {n} studies".format(n=nstudies))
            break

    # load the tree and study properties
    PROPERTYTABLE = config_obj.get('database_tables', 'propertytable')
    load_properties(
        connection,
        cursor,
        PROPERTYTABLE,
        study_properties,
        tree_properties)