Example #1
def load_data():
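    # Load new update-log rows from a tab-delimited dump into the Updatelog table,
    # skipping bud_ids already present; commits every 500 inserts and recycles the
    # session every 200,000 rows ("file" is assumed to be a module-level input path).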

    nex_session = get_session()

    bud_id_to_id = dict([(x.bud_id, x.updatelog_id)
                         for x in nex_session.query(Updatelog).all()])

    # Debug-only block: uncomment to list the cached bud_ids and exit before loading.
    # for bud_id in bud_id_to_id:
    #     print(bud_id)
    # return

    i = 0
    j = 0
    f = open(file)
    for line in f:
        pieces = line.strip().split("\t")
        if int(pieces[0]) in bud_id_to_id:
            continue
        insert_update_log(nex_session, pieces)
        i = i + 1
        j = j + 1
        if i == 500:
            nex_session.commit()
            i = 0
        if j == 200000:
            nex_session.close()
            nex_session = get_session()
            j = 0
    f.close()

    # nex_session.rollback()
    nex_session.commit()
Example #2
def update_reference_table(log_file):
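    # Refresh Referencedbentity records from PubMed: pmids are fetched in batches of
    # MAX via get_pubmed_record(), the session is recycled every MAX_4_CONNECTION
    # iterations, and the script sleeps SLEEP_TIME between batches (SRC, MAX,
    # MAX_4_CONNECTION and SLEEP_TIME are assumed module-level settings).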

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")

    pmid_to_reference = dict([
        (x.pmid, x) for x in nex_session.query(Referencedbentity).all()
    ])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    journal_id_to_abbrev = dict([(x.journal_id, x.med_abbr)
                                 for x in nex_session.query(Journal).all()])

    #################################################################

    source_id = source_to_id[SRC]

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")

    print(datetime.now())
    print("Getting Pubmed records...")

    pmids = []
    j = 0
    for pmid in pmid_to_reference:

        if pmid is None or pmid in [26842620, 27823544, 11483584]:
            continue

        j = j + 1
        if j > MAX_4_CONNECTION:
            nex_session.close()
            nex_session = get_session()
            j = 0

        if len(pmids) >= MAX:
            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  journal_id_to_abbrev, source_id)

            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              journal_id_to_abbrev, source_id)

    print("Done")

    fw.close()
    nex_session.commit()
Example #3
def update_data():
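    # Split chemical PhenotypeannotationCond values like "10 mM" into a numeric
    # condition_value plus a condition_unit, skipping rows that already have a unit
    # or cannot be parsed unambiguously; commits every 500 updates.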

    nex_session = get_session()

    all_conds = nex_session.query(PhenotypeannotationCond).filter_by(
        condition_class='chemical').all()

    nex_session.close()
    nex_session = get_session()

    i = 0
    for x in all_conds:
        if x.condition_value is None or x.condition_value == "":
            continue
        if x.condition_unit is not None and x.condition_unit != "":
            continue
        else:
            condition_value = x.condition_value
            if " " not in condition_value:
                condition_value = condition_value.replace("uM", " uM")
                condition_value = condition_value.replace("mM", " mM")
                condition_value = condition_value.replace("g/L", " g/L")
                condition_value = condition_value.replace(
                    "%", " %")  # should we do this??
            values = condition_value.split(' ')
            if len(values) >= 3 or "," in condition_value or len(values) == 1:
                # print "TO_FIX:", str(x.condition_id) + "\t" + str(x.annotation_id) + "\t" + x.condition_name + "\t" + x.condition_value + "\t" + str(x.condition_unit)
                continue
            elif values[1] in [
                    'analog', 'derivative', 'B', 'C', 'X', 'CaCl2', 'Brix',
                    'B1', '2.7'
            ]:
                # print "TO_FIX:", str(x.condition_id) + "\t" + str(x.annotation_id) + "\t" + x.condition_name + "\t" + x.condition_value + "\t" + str(x.condition_unit)
                continue
            else:
                print(values[0], ":", values[1])
                nex_session.query(PhenotypeannotationCond).filter_by(
                    condition_id=x.condition_id).update({
                        "condition_value":
                        values[0],
                        "condition_unit":
                        values[1]
                    })
                i = i + 1
                if i > 500:
                    nex_session.commit()
                    i = 0

    nex_session.commit()
    nex_session.close()
Example #4
def load_go_refs(mapping_file):
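    # Insert a ReferenceAlias row for each line of the mapping file (alias
    # display_name in column 1, SGDID in column 2); src, type and CREATED_BY are
    # assumed to be module-level settings.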

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    sgdid_to_dbentity_id = dict([(x.sgdid, x.dbentity_id)
                                 for x in nex_session.query(Dbentity).all()])

    source_id = source_to_id[src]

    f = open(mapping_file)

    for line in f:
        pieces = line.strip().split("\t")

        print(pieces[0])
        print(pieces[1])

        x = ReferenceAlias(display_name=pieces[0],
                           source_id=source_id,
                           reference_id=sgdid_to_dbentity_id[pieces[1]],
                           alias_type=type,
                           created_by=CREATED_BY)
        nex_session.add(x)

    nex_session.commit()
    nex_session.close()
Example #5
def load_ontology():
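    # Sync the EC ontology: build lookups of existing Ec terms and EcAlia aliases,
    # read the enzyme data file, apply new/changed terms via load_new_data(), and
    # write/send a summary of the updates.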

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    ecid_to_ec = dict([(x.ecid, x) for x in nex_session.query(Ec).all()])

    ec_id_to_alias = {}
    for x in nex_session.query(EcAlia).all():
        aliases = []
        if x.ec_id in ec_id_to_alias:
            aliases = ec_id_to_alias[x.ec_id]
        aliases.append(x.display_name)
        ec_id_to_alias[x.ec_id] = aliases

    ####################################
    fw = open(log_file, "w")

    data = read_data_file(enzyme_file)

    [update_log,
     to_delete_list] = load_new_data(nex_session, data, source_to_id,
                                     ecid_to_ec, ec_id_to_alias, fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()

    fw.close()
Example #6
def load_eco_urls():
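    # Load EcoUrl rows from data/eco_go_urls.txt (tab-delimited; the header line
    # starting with 'eco_id' is skipped), committing after each insert.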

    nex_session = get_session()

    f = open("data/eco_go_urls.txt")
    for line in f:
        if line.startswith('eco_id'):
            continue
        pieces = line.strip().split('\t')
        eco_id = int(pieces[0])
        display_name = pieces[3]
        obj_url = pieces[4]
        url_type = pieces[5]
        source_id = int(pieces[6])

        x = EcoUrl(eco_id=eco_id,
                   display_name=display_name,
                   obj_url=obj_url,
                   url_type=url_type,
                   source_id=source_id,
                   created_by=CREATED_BY)
        nex_session.add(x)
        nex_session.commit()

        print(eco_id, display_name, obj_url, url_type, source_id)

    nex_session.commit()
    nex_session.close()
Example #7
def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    chebiid_to_chebi = dict([(x.chebiid, x)
                             for x in nex_session.query(Chebi).all()])

    chebi_id_to_alias = {}
    for x in nex_session.query(ChebiAlia).all():
        aliases = []
        if x.chebi_id in chebi_id_to_alias:
            aliases = chebi_id_to_alias[x.chebi_id]
        aliases.append((x.display_name, x.alias_type))
        chebi_id_to_alias[x.chebi_id] = aliases

    ####################################
    fw = open(log_file, "w")

    is_3_star_term = {}
    data = read_owl(ontology_file, ontology, is_3_star_term)

    [update_log, to_delete_list,
     term_name_changed] = load_new_data(nex_session, data, source_to_id,
                                        chebiid_to_chebi, chebi_id_to_alias,
                                        is_3_star_term, fw)

    write_summary_and_send_email(fw, update_log, to_delete_list,
                                 term_name_changed)

    nex_session.close()

    fw.close()
Example #8
def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id) for x in nex_session.query(Source).all()])
    obiid_to_obi =  dict([(x.obiid, x) for x in nex_session.query(Obi).all()])
    term_to_ro_id = dict([(x.display_name, x.ro_id) for x in nex_session.query(Ro).all()])
    
    obi_id_to_parent = {}
    for x in nex_session.query(ObiRelation).all():
        parents = []
        if x.child_id in obi_id_to_parent:
            parents = obi_id_to_parent[x.child_id]
        parents.append(x.parent_id)
        obi_id_to_parent[x.child_id] = parents


    ####################################
    fw = open(log_file, "w")
    
    is_sgd_term = {}
    data = read_owl(ontology_file, ontology)
    
    [update_log, to_delete_list] = load_new_data(nex_session, data, 
                                                 source_to_id, 
                                                 obiid_to_obi, 
                                                 term_to_ro_id['is a'],
                                                 obi_id_to_parent,
                                                 fw)
    
    write_summary_and_send_email(fw, update_log, to_delete_list)
    
    nex_session.close()

    fw.close()
Example #9
def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    psimodid_to_psimod = dict([(x.psimodid, x)
                               for x in nex_session.query(Psimod).all()])
    term_to_ro_id = dict([(x.display_name, x.ro_id)
                          for x in nex_session.query(Ro).all()])

    psimod_id_to_parent = {}
    for x in nex_session.query(PsimodRelation).all():
        parents = []
        if x.child_id in psimod_id_to_parent:
            parents = psimod_id_to_parent[x.child_id]
        parents.append(x.parent_id)
        psimod_id_to_parent[x.child_id] = parents

    ####################################
    fw = open(log_file, "w")

    data = read_obo(ontology_file)

    [update_log,
     to_delete_list] = load_new_data(nex_session, data, source_to_id,
                                     psimodid_to_psimod, term_to_ro_id['is a'],
                                     psimod_id_to_parent, fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()

    fw.close()
Example #10
def update_database():
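    # Dry run: for annotations tied to single-channel datasets, compute log2 of the
    # normalized_expression_value (rounded to 2 decimals) where log_ratio_value is
    # missing and print what would be stored; nothing is written to the database.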

    nex_session = get_session()

    dataset_id_to_dataset = dict([(x.dataset_id, x) for x in nex_session.query(Dataset).filter_by(channel_count=1).all()])
    datasetsample_id_to_dataset_id = dict([(x.datasetsample_id, x.dataset_id) for x in nex_session.query(Datasetsample).all()])

    all_expressions = nex_session.query(Expressionannotation).all()

    for x in all_expressions:
        if x.log_ratio_value is not None:
            continue
        dataset_id = datasetsample_id_to_dataset_id.get(x.datasetsample_id)
        if dataset_id is None:
            # print "BAD: The datasetsample_id: ", x.datasetsample_id, " is not in the database."
            continue
        if dataset_id not in dataset_id_to_dataset:
            # print "BAD: The datasetsample_id: ", x.datasetsample_id, " is mapped to a dataset_id=", dataset_id, " that doesn't have channel_count=1"
            continue

        ## update data from here
        value = x.normalized_expression_value
        new_value = 0
        if value > 0:
            new_value = math.log(value, 2)
            ## round down to 2 decimal point
            new_value = float("%.2f" % new_value)

        print(x.annotation_id, value, new_value)

    nex_session.close()
Example #11
def load_summaries(summary_file):
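    # Read pathway summaries (biocyc_id, summary text, pipe-separated PMIDs) from a
    # tab-delimited file and validate them against the database; the actual
    # insert_pathwaysummary/insert_summary_references calls are commented out below.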

    nex_session = get_session()

    biocyc_id_to_dbentity_id = dict([
        (x.biocyc_id, x.dbentity_id)
        for x in nex_session.query(Pathwaydbentity).all()
    ])
    pmid_to_reference_id = dict([
        (x.pmid, x.dbentity_id)
        for x in nex_session.query(Referencedbentity).all()
    ])

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    f = open(summary_file)
    fw = open(log_file, "w")

    for line in f:

        pieces = line.strip().split("\t")

        pathway_name = pieces[0].strip()
        summary_text = pieces[1].strip()
        pmids = pieces[2].strip().replace(" ", "").split("|")

        dbentity_id = biocyc_id_to_dbentity_id.get(pathway_name)

        if dbentity_id is None:
            print "TO CHECK: The biocyc_id:", pathway_name, " is not in the database."
            print line
            continue

        # summary_id = insert_pathwaysummary(nex_session, fw, dbentity_id, summary_text, source_id)

        # if summary_id is None:
        #    print "TO CHECK: Can't insert summary for biocyc_id: ", pathway_name
        #    print line

        #    continue

        reference_id_list = []
        bad = 0
        for pmid in pmids:
            if int(pmid) in pmid_to_reference_id:
                reference_id_list.append(pmid_to_reference_id[int(pmid)])
            else:
                print "TO CHECK: The pmid: ", pmid, " is not in the database."
                bad = 1
        if bad == 1:
            print(line)
            continue

        # insert_summary_references(nex_session, fw, reference_id_list, summary_id, source_id)
        # nex_session.commit()

    f.close()
    fw.close()
Example #12
def update_all_urls(log_file):
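    # Refresh ReferenceUrl entries from PubMed: collect existing URLs per reference,
    # then fetch records in batches of MAX pmids and hand them to
    # update_database_batch() (SRC and MAX are assumed module-level settings).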

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")

    pmid_to_reference = dict([
        (x.pmid, x) for x in nex_session.query(Referencedbentity).all()
    ])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])

    reference_id_to_urls = {}
    for x in nex_session.query(ReferenceUrl).all():
        urls = []
        if x.reference_id in reference_id_to_urls:
            urls = reference_id_to_urls[x.reference_id]
        urls.append((x.url_type, x.obj_url))
        reference_id_to_urls[x.reference_id] = urls

    #################################################################

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")

    print(datetime.now())
    print("Getting Pubmed records...")

    source_id = source_to_id[SRC]

    pmids = []
    for pmid in pmid_to_reference:

        fw.write("Getting data for PMID:" + str(pmid) + "\n")

        if pmid is None or pmid in [26842620, 27823544]:
            continue
        if len(pmids) >= MAX:
            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  reference_id_to_urls, source_id)
            pmids = []
            # time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              reference_id_to_urls, source_id)

    print("Done")

    fw.close()
    nex_session.commit()
Example #13
def load_domains():

    nex_session = get_session()

    fw = open(log_file, "w")

    read_data_and_update_database(nex_session, fw)

    nex_session.close()

    fw.close()
Example #14
def get_data():
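    # Build a tab-delimited Expressionannotation load file by joining the input file
    # against Datasetsample, Locusdbentity and Referencedbentity lookups; duplicate
    # (dbentity_id, datasetsample_id) pairs are skipped (infile/outfile are assumed
    # module-level paths).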

    nex_session = get_session()

    format_name_to_datasetsample_id = dict([
        (x.format_name, x.datasetsample_id)
        for x in nex_session.query(Datasetsample).all()
    ])
    systematic_name_to_dbentity_id = dict([
        (x.systematic_name, x.dbentity_id)
        for x in nex_session.query(Locusdbentity).all()
    ])
    pmid_to_reference_id = dict([
        (x.pmid, x.dbentity_id)
        for x in nex_session.query(Referencedbentity).all()
    ])
    tax = nex_session.query(Taxonomy).filter_by(taxid="TAX:4932").one_or_none()
    taxonomy_id = tax.taxonomy_id
    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    fw = open(outfile, "w")
    fw.write(
        "dbentity_id\tsource_id\ttaxonomy_id\treference_id\tdatasetsample_id\tnormalized_expression_value\tlog_ratio_value\n"
    )

    f = open(infile)
    seen = {}
    for line in f:
        pieces = line.strip().split("\t")
        reference_id = pmid_to_reference_id.get(int(pieces[0]))
        if reference_id is None:
            print "The pmid: ", pieces[0], " is not in the database."
            continue
        dbentity_id = systematic_name_to_dbentity_id.get(pieces[6])
        if dbentity_id is None:
            print "The feature_name: ", pieces[6], " is not in the database."
            continue
        datasetsample_id = format_name_to_datasetsample_id.get(pieces[1])
        if datasetsample_id is None:
            print "The datasetsample format_name: ", pieces[
                1], " is not in the database."
            continue
        key = (dbentity_id, datasetsample_id)
        if key in seen:
            continue
        seen[key] = 1
        fw.write(
            str(dbentity_id) + "\t" + str(source_id) + "\t" +
            str(taxonomy_id) + "\t" + str(reference_id) + "\t" +
            str(datasetsample_id) + "\t" + pieces[4] + "\t" + pieces[5] + "\n")

    f.close()
    fw.close()
Example #15
def load_geo_urls():
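    # Backfill Datasetsample.dbxref_url for samples that have a dbxref_id but no URL
    # yet, committing every 200 updates (geo_root_url is assumed to be a module-level
    # constant).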

    nex_session = get_session()

    all = nex_session.query(Datasetsample).all()

    nex_session.close()

    nex_session = get_session()

    i = 0
    for x in all:
        if x.dbxref_id and x.dbxref_url is None:
            print(x.dbxref_id)
            dbxref_url = geo_root_url + x.dbxref_id    
            nex_session.query(Datasetsample).filter_by(datasetsample_id=x.datasetsample_id).update({"dbxref_url": dbxref_url})
            i = i + 1
            if i == 200:
                nex_session.commit()
                i = 0
    nex_session.commit()
    nex_session.close()
Example #16
def load_data():
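    # Load Expressionannotation rows from the files in files_to_load, skipping
    # (dbentity_id, datasetsample_id) pairs already in the database; commits every
    # 500 inserts and recycles the session every 200,000 rows.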

    nex_session = get_session()

    key_to_id = dict([((x.dbentity_id, x.datasetsample_id), x.annotation_id)
                      for x in nex_session.query(Expressionannotation).all()])

    nex_session.close()

    nex_session = get_session()

    fw = open(log_file, "w")
    i = 0
    j = 0
    for file in files_to_load:
        f = open(file)
        for line in f:
            if line.startswith('dbentity_id'):
                continue
            pieces = line.strip().split("\t")
            if (int(pieces[0]), int(pieces[4])) in key_to_id:
                continue
            insert_expressionannotation(nex_session, fw, pieces)
            i = i + 1
            j = j + 1
            if i == 500:
                nex_session.commit()
                i = 0
            if j == 200000:
                nex_session.close()
                nex_session = get_session()
                j = 0

        f.close()

    fw.close()

    # nex_session.rollback()
    nex_session.commit()
Example #17
def load_data():
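    # Load archived locus-change records from the files in files_to_load into
    # ArchLocuschange via insert_into_database(); the duplicate-key check against
    # key_to_id is currently commented out.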

    nex_session = get_session()

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    key_to_id = {}
    for x in nex_session.query(ArchLocuschange).all():
        old_value = ""
        if x.old_value is not None:
            old_value = x.old_value
        key = (x.dbentity_id, x.change_type, old_value, x.new_value,
               x.date_added_to_database)
        key_to_id[key] = x.archive_id

    fw = open(log_file, "w")

    for file in files_to_load:

        f = open(file)

        for line in f:

            if line.startswith('dbentity_id'):
                continue
            pieces = line.strip().split("\t")
            if len(pieces) < 9:
                print("Unknown line: ", line)
                continue
            date_added_to_database = reformat_date(pieces[5].strip())
            date_standardized = reformat_date(pieces[7].strip())
            date_archived = reformat_date(pieces[8].strip())

            # key = (int(pieces[0].strip()), change_type, pieces[3].strip(), pieces[4].strip(), date_added_to_database)
            # if key in key_to_id:
            #    print "In database: ", key
            #    continue

            insert_into_database(nex_session, fw,
                                 int(pieces[0].strip()), source_id,
                                 int(pieces[1].strip()), pieces[3].strip(),
                                 pieces[4].strip(), date_added_to_database,
                                 pieces[6].strip(), date_archived,
                                 date_standardized)

        f.close()

    fw.close()
Example #18
def update_data():
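    # Backfill LocusAlias.obj_url for aliases that have no URL, using get_url() with
    # the alias type, display name and source; commits every 500 updates.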

    nex_session = get_session()

    fw = open(log_file, "w")

    source_id_to_source = dict([(x.source_id, x.display_name)
                                for x in nex_session.query(Source).all()])

    all_aliases = nex_session.query(LocusAlias).all()

    nex_session.close()
    nex_session = get_session()

    i = 0
    for x in all_aliases:
        if x.obj_url:
            # print "OLD:", x.obj_url
            continue
        else:
            obj_url = get_url(x.alias_type, x.display_name,
                              source_id_to_source[x.source_id])
            if obj_url != "":
                print "OLD:", x.obj_url, "NEW:", obj_url
                nex_session.query(LocusAlias).filter_by(
                    locus_id=x.locus_id,
                    alias_type=x.alias_type,
                    display_name=x.display_name,
                    source_id=x.source_id).update({"obj_url": obj_url})
                i = i + 1
                if i > 500:
                    nex_session.commit()
                    i = 0

    nex_session.commit()
    nex_session.close()
Example #19
def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id) for x in nex_session.query(Source).all()])
    doid_to_disease =  dict([(x.doid, x) for x in nex_session.query(Disease).all()])
    term_to_ro_id = dict([(x.display_name, x.ro_id) for x in nex_session.query(Ro).all()])
    
    disease_id_to_alias = {}
    for x in nex_session.query(DiseaseAlia).all():
        aliases = []
        if x.disease_id in disease_id_to_alias:
            aliases = disease_id_to_alias[x.disease_id]
        aliases.append((x.display_name, x.alias_type))
        disease_id_to_alias[x.disease_id] = aliases

    disease_id_to_parent = {}
    for x in nex_session.query(DiseaseRelation).all():
        parents = []
        if x.child_id in disease_id_to_parent:
            parents = disease_id_to_parent[x.child_id]
        parents.append(x.parent_id)
        disease_id_to_parent[x.child_id] = parents


    ####################################
    fw = open(log_file, "w")
    
    is_sgd_term = {}
    data = read_owl(ontology_file, ontology)
    
    [update_log, to_delete_list] = load_new_data(nex_session, data, 
                                                 source_to_id, 
                                                 doid_to_disease, 
                                                 term_to_ro_id['is a'],
                                                 disease_id_to_alias,
                                                 disease_id_to_parent,
                                                 fw)
    
    write_summary_and_send_email(fw, update_log, to_delete_list)
    
    nex_session.close()

    fw.close()
Example #20
def update_all_relations(log_file):
 
    nex_session = get_session()

    fw = open(log_file,"w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")

    print(datetime.now())
    print("Getting PMID list...")

    pmid_to_reference =  dict([(x.pmid, x) for x in nex_session.query(Referencedbentity).all()])
    source_to_id = dict([(x.display_name, x.source_id) for x in nex_session.query(Source).all()])
    key_to_type = dict([((x.parent_id, x.child_id), x.relation_type) for x in nex_session.query(ReferenceRelation).all()])

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")

    print(datetime.now())
    print("Getting Pubmed records...")

    pmids = []
    for pmid in pmid_to_reference:

        fw.write("Getting data for PMID=" + str(pmid) + "\n")

        if pmid is None or pmid in [26842620, 27823544]:
            continue
        if len(pmids) >= MAX:
            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference, key_to_type, source_to_id)
            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference, key_to_type, source_to_id)

    print("Done")
    fw.close()
    nex_session.commit()
Example #21
def add_display_name():
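    # Parse COMMON-NAME display names out of a BioCyc ocelot dump and update
    # Pathwaydbentity.display_name by biocyc_id; the session is rolled back at the
    # end, so this run makes no permanent change.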

    f = open(ocelot_file)

    biocyc_id = None
    display_name = None
    biocyc_to_display_name = {}
    prev_line = None
    for line in f:
        line = line.strip()
        if len(line) == 0:
            if biocyc_id and display_name:
                biocyc_to_display_name[biocyc_id] = display_name
            biocyc_id = None
            display_name = None
            continue
        if line.startswith('(') and line.endswith(' NIL ('):
            # print line
            biocyc_id = line.replace("(", "").split(" ")[0]
            continue
        if line.startswith("(COMMON-NAME ") and line != "(COMMON-NAME NIL)":
            # print line
            display_name = line.replace('(COMMON-NAME "', '').replace('")', '')
            continue

    f.close()

    nex_session = get_session()

    all_pathways = nex_session.query(Pathwaydbentity).all()

    for x in all_pathways:
        if x.biocyc_id in biocyc_to_display_name:
            nex_session.query(Pathwaydbentity).filter_by(
                biocyc_id=x.biocyc_id).update(
                    {'display_name': biocyc_to_display_name[x.biocyc_id]})
            print(x.biocyc_id + "\t" + biocyc_to_display_name[x.biocyc_id])
        else:
            print("NOT FOUND:", x.biocyc_id)

    nex_session.rollback()
Example #22
def load_data():

    nex_session = get_session()

    i = 0
    for file in files_to_load:
        f = open(file)
        for line in f:
            if line.startswith('dbentity_id'):
                continue
            pieces = line.strip().split("\t")
            insert_expressionannotation(nex_session, pieces)
            i = i + 1
            if i == 500:
                nex_session.commit()
                i = 0

        f.close()

    # nex_session.rollback()
    nex_session.commit()
Example #23
def load_ontology():

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    roid_to_ro = dict([(x.roid, x) for x in nex_session.query(Ro).all()])

    fw = open(log_file, "w")

    data = read_owl(ontology_file, ontology)

    [update_log,
     to_delete_list] = load_new_data(nex_session, data, source_to_id[src],
                                     roid_to_ro, fw)

    write_summary_and_send_email(fw, update_log, to_delete_list)

    nex_session.close()

    fw.close()
Example #24
def load_geo_urls():
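    # Add a GEO DatasetUrl for every Dataset whose dbxref_id starts with 'GSE' and
    # does not already have one (src, type, geo_root_url and CREATED_BY are assumed
    # module-level settings).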

    nex_session = get_session()

    source_to_id = dict([(x.display_name, x.source_id) for x in nex_session.query(Source).all()])
    dataset_id_to_url = dict([(x.dataset_id, x) for x in nex_session.query(DatasetUrl).filter_by(url_type='GEO').all()])
    
    source_id = source_to_id[src]
    for x in nex_session.query(Dataset).all():
        if x.dbxref_id and x.dbxref_id.startswith('GSE') and x.dataset_id not in dataset_id_to_url:
            print(x.dbxref_id)

            y = DatasetUrl(display_name = type,
                           dataset_id = x.dataset_id,
                           source_id = source_id,
                           obj_url = geo_root_url + x.dbxref_id,
                           url_type = type,
                           created_by = CREATED_BY)
            nex_session.add(y)

    nex_session.commit()
    nex_session.close()
Example #25
def get_data():

    nex_session = get_session()

    format_name_to_datasetsample_id = dict([
        (x.format_name, x.datasetsample_id)
        for x in nex_session.query(Datasetsample).all()
    ])
    systematic_name_to_dbentity_id = dict([
        (x.systematic_name, x.dbentity_id)
        for x in nex_session.query(Locusdbentity).all()
    ])

    fw = open(outfile, "w")
    fw.write("dbentity_id\tdatasetsample_id\tlog_ratio_value\n")

    f = open(infile)
    seen = {}
    for line in f:
        pieces = line.strip().split("\t")
        dbentity_id = systematic_name_to_dbentity_id.get(pieces[6])
        if dbentity_id is None:
            print("The feature_name: ", pieces[6], " is not in the database.")
            continue
        datasetsample_id = format_name_to_datasetsample_id.get(pieces[1])
        if datasetsample_id is None:
            print("The datasetsample format_name: ", pieces[1],
                  " is not in the database.")
            continue
        key = (dbentity_id, datasetsample_id)
        if key in seen:
            continue
        seen[key] = 1
        fw.write(
            str(dbentity_id) + "\t" + str(datasetsample_id) + "\t" +
            pieces[5] + "\n")

    f.close()
    fw.close()
Example #26
def update_all_authors(log_file):
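    # Refresh Referenceauthor data from PubMed XML, skipping pmids already listed in
    # ./authors_updated_pmid.lst; works in batches of MAX pmids, recycles the session
    # every MAX_4_CONNECTION iterations and sleeps SLEEP_TIME between batches.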
 
    nex_session = get_session()

    fw = open(log_file,"w")

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting PMID list...\n")
    
    print(datetime.now())
    print("Getting PMID list...")

    pmid_to_reference =  dict([(x.pmid, x) for x in nex_session.query(Referencedbentity).all()])
    source_to_id = dict([(x.display_name, x.source_id) for x in nex_session.query(Source).all()])

    reference_id_to_authors = {}
    for x in nex_session.query(Referenceauthor).order_by(Referenceauthor.reference_id, Referenceauthor.author_order).all():
        authors = []
        if x.reference_id in reference_id_to_authors:
            authors = reference_id_to_authors[x.reference_id]
        authors.append(x.display_name)
        reference_id_to_authors[x.reference_id] = authors

    #################################################################

    f = open('./authors_updated_pmid.lst')
    updated_pmid = {}
    for pmid in f:
        pmid = int(pmid.strip())
        updated_pmid[pmid] = 1
    f.close()

    fw.write(str(datetime.now()) + "\n")
    fw.write("Getting Pubmed records...\n")

    print(datetime.now())
    print("Getting Pubmed records...")

    source_id = source_to_id[SRC]

    j = 0
    pmids = []
    for pmid in pmid_to_reference:

        fw.write("Getting data for PMID:" + str(pmid) + "\n")

        if pmid is None or pmid in [26842620, 27823544, 11483584]:
            continue

        if pmid in updated_pmid:
            continue

        j = j + 1
        if j > MAX_4_CONNECTION:
            nex_session.close()
            nex_session = get_session()
            j = 0

        if len(pmids) >= MAX:
            records = get_pubmed_record_from_xml(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference, 
                                  reference_id_to_authors, source_id)
            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record_from_xml(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference, 
                              reference_id_to_authors, source_id)

    print("Done")

    fw.close()
    nex_session.commit()
Example #27
def load_summaries(summary_type, summary_file, log_file):
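    # Load locus summaries of the given summary_type from summary_file: update the
    # text/html of summaries that already exist, insert new ones, attach their
    # references, and write/send a summary of the changes.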

    nex_session = get_session()

    fw = open(log_file,"w")
    
    fw.write(str(datetime.now()) + "\n")
    fw.write("reading data from summary_file...\n")

    data = read_summary_file(nex_session, fw, summary_type, summary_file, log_file)

    fw.write(str(datetime.now()) + "\n")
    fw.write("retriveing data from database and store the data in dictionary...\n")
    
    key_to_summary = dict([((x.locus_id, x.summary_type, x.summary_order), x) for x in nex_session.query(Locussummary).all()])
    key_to_summaryref = dict([((x.summary_id, x.reference_id, x.reference_order), x) for x in nex_session.query(LocussummaryReference).all()])
    
    source_to_id = dict([(x.display_name, x.source_id) for x in nex_session.query(Source).all()])
    source_id = source_to_id.get('SGD')

    summary_id_to_references = {}
    for x in nex_session.query(LocussummaryReference).all():
        references = []
        if x.summary_id in summary_id_to_references:
            references = summary_id_to_references[x.summary_id]
        references.append(x)
        summary_id_to_references[x.summary_id] = references

    load_summary_holder = { "summary_added": 0,
                            "summary_updated": 0,
                            "summary_reference_added": 0 }

    fw.write(str(datetime.now()) + "\n")
    fw.write("updating the database...\n")

    for x in data:
        key = (x['locus_id'], x['summary_type'], x['summary_order'])
        summary_id = None
        if key in key_to_summary:
            if x['text'] != key_to_summary[key].text.strip():
                fw.write("OLD:" + key_to_summary[key].text + ":\n")
                fw.write("NEW:" + x['text'] + ":\n")
                nex_session.query(Locussummary).filter_by(summary_id=key_to_summary[key].summary_id).update({'text': x['text'], 'html': x['html']})
                nex_session.commit()
                load_summary_holder['summary_updated'] = load_summary_holder['summary_updated'] + 1
            else:
                fw.write("SUMMARY is in DB\n")
            summary_id = key_to_summary[key].summary_id
            update_references(nex_session,
                              fw,
                              load_summary_holder,
                              source_id, 
                              summary_id, 
                              summary_id_to_references.get(summary_id), 
                              x.get('references'))
        else:
            summary_id = insert_summary(nex_session, fw, load_summary_holder, source_id, x)
            if x.get('references'):
                for y in x['references']:
                    insert_summary_reference(nex_session, fw, load_summary_holder, source_id, summary_id, y)

    nex_session.commit()
    nex_session.close()
 
    fw.write(str(datetime.now()) + "\n")
    fw.write("writing summary and sending an email to curators...\n")

    write_summary_and_send_email(load_summary_holder, fw, summary_type)

    fw.close()
Example #28
def update_all_abstracts(log_file):

    nex_session = get_session()

    fw = open(log_file, "w")

    fw.write(str(datetime.now()) + "\n")

    print(datetime.now())

    pmid_to_reference = dict([
        (x.pmid, x) for x in nex_session.query(Referencedbentity).all()
    ])
    source_to_id = dict([(x.display_name, x.source_id)
                         for x in nex_session.query(Source).all()])
    reference_id_to_abstract = dict([
        (x.reference_id, x.text)
        for x in nex_session.query(Referencedocument).filter_by(
            document_type="Abstract").all()
    ])

    #################################################################

    fw.write("Getting Pubmed records...\n")

    print("Getting Pubmed records...")

    source_id = source_to_id[SRC]

    pmids = []
    j = 0
    for pmid in pmid_to_reference:

        fw.write("Getting data for PMID: " + str(pmid) + "\n")

        if pmid is None or pmid in [26842620, 27823544]:
            continue

        j = j + 1
        if j > MAX_4_CONNECTION:
            nex_session.close()
            nex_session = get_session()
            j = 0

        if len(pmids) >= MAX:

            records = get_pubmed_record(','.join(pmids))
            update_database_batch(nex_session, fw, records, pmid_to_reference,
                                  reference_id_to_abstract, source_id)

            pmids = []
            time.sleep(SLEEP_TIME)
        pmids.append(str(pmid))

    if len(pmids) > 0:
        records = get_pubmed_record(','.join(pmids))
        update_database_batch(nex_session, fw, records, pmid_to_reference,
                              reference_id_to_abstract, source_id)

    print("Done")

    fw.close()
    nex_session.commit()
Example #29
def load_data():
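    # Load Datasetsample rows from GEO sample files: map each (GSE, GSM) pair to its
    # dataset (handling multi-assay series via the two mapping files), build a data
    # dict per sample and insert it with insert_datasetsample(); commits once at the end.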

    nex_session = get_session()

    format_name_to_dataset_id_src = dict([(x.format_name, (x.dataset_id, x.source_id)) for x in nex_session.query(Dataset).all()])
    taxid_to_taxonomy_id = dict([(x.taxid, x.taxonomy_id) for x in nex_session.query(Taxonomy).all()])
    format_name_to_datasetsample_id = dict([(x.format_name, x.datasetsample_id) for x in nex_session.query(Datasetsample).all()])

    fw = open(log_file, "w")

    GSE_GSM_to_assay = {}
    f = open(ds_multiassays_to_sample_mapping_file)
    for line in f:
        pieces = line.strip().split("\t")
        GSE = pieces[0].strip()
        assay_name = pieces[1]
        GSM_list = pieces[2].strip().split('|')
        for GSM in GSM_list:
            GSE_GSM_to_assay[(GSE, GSM)] = assay_name
    f.close()

    GSE_assay_to_dataset_format_name = {}
    f = open(ds_with_multiassays_file)
    for line in f:
        pieces = line.strip().split("\t")
        GSE = pieces[0].strip()
        dataset_format_name = pieces[1].strip()
        assay_name = pieces[2].strip()
        GSE_assay_to_dataset_format_name[(GSE, assay_name)] = dataset_format_name
    f.close()
        
    format_name2display_name = {}
    dataset2index = {}
    for file in files_to_load:
        print "Loading data from ", file
        f = open(file)
        for line in f:
            if line.startswith('dataset'):
                continue
            line = line.strip()
            if line:
                pieces = line.replace('"', '').split("\t")
                GSE = pieces[0].strip()
                GSM = pieces[3].strip()
                dataset_format_name = GSE
                if (GSE, GSM) in GSE_GSM_to_assay:
                    assay = GSE_GSM_to_assay[(GSE, GSM)]
                    # print "FOUND assay:", (GSE, GSM), assay
                    if (GSE, assay) in GSE_assay_to_dataset_format_name:
                        dataset_format_name = GSE_assay_to_dataset_format_name[(GSE, assay)]
                    #    print "FOUND dataset format_name:", (GSE, assay), dataset_format_name
                    # else:
                    #    print "NOT FOUND dataset format_name:", (GSE, assay)

                if dataset_format_name not in format_name_to_dataset_id_src:
                    print "The dataset: ", dataset_format_name, " is not in DATASET table."
                    continue
                (dataset_id, source_id) = format_name_to_dataset_id_src[dataset_format_name]
                if len(pieces) < 9 or pieces[8] == '':
                    print "SHORT LINE:", len(pieces), line
                    continue
                display_name = pieces[1]
        
                description = ""
                if pieces[2] != '':                        
                    description = pieces[2]                                                     
                    if len(pieces[2]) > 500:
                        description = display_name
                       
                data = { "source_id": source_id,
                         "dataset_id": dataset_id,
                         "display_name": display_name,
                         "sample_order": int(pieces[8]) }

                if pieces[2] != '':
                    data['description'] = pieces[2]
                    if len(pieces[2]) > 500:
                        data['description'] = display_name
                if pieces[5] != '':
                    data['biosample'] = pieces[5]
                if pieces[7] != '':
                    data['strain_name'] = pieces[7]
                if len(pieces) > 9 and pieces[9]:
                    taxonomy_id = taxid_to_taxonomy_id.get("TAX:"+pieces[9])
                    if taxonomy_id is None:
                        print "The taxid = ", pieces[9], " for: ", dataset_format_name, GSM, " is not in TAXONOMY table."
                    else:
                        data['taxonomy_id'] = taxonomy_id
                if GSM == '':
                    index = dataset2index.get(dataset_format_name, 0) + 1
                    data['format_name'] = dataset_format_name + "_sample_" + str(index)
                    if data['format_name'] in format_name_to_datasetsample_id:
                        print "format_name for Non GSM row: ", data['format_name'], " is used."
                        continue
                    dataset2index[dataset_format_name] = index
                    data['obj_url'] = "/datasetsample/" + data['format_name']
                    insert_datasetsample(nex_session, fw, data)
                else:
                    data['dbxref_type'] = pieces[4]
                    if format_name2display_name.get(GSM):
                        print "The format_name: ", GSM, " has been used for other sample", format_name2display_name.get(GSM)
                        continue
                    format_name2display_name[GSM] = display_name
                    data['format_name'] = dataset_format_name + "_" + GSM
                    if data['format_name'] in format_name_to_datasetsample_id:
                        print "format_name for GSM row: ", data['format_name'], " is used."
                        continue
                    data['obj_url'] = "/datasetsample/" + data['format_name']
                    data['dbxref_id'] = GSM
                    insert_datasetsample(nex_session, fw, data)
        f.close()

    fw.close()

    # nex_session.rollback()
    nex_session.commit()
Example #30
import sys
# (the Python 2 reload(sys)/sys.setdefaultencoding('UTF8') hack is not needed here)
sys.path.insert(0, '../../../src/')
from models import Dataset, Datasetsample, Referencedbentity, DatasetReference
sys.path.insert(0, '../')
from database_session import get_nex_session as get_session

__author__ = 'sweng66'
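# Sanity check: compare the sample_count stored on each Dataset flagged is_in_spell
# with the actual number of Datasetsample rows and print MATCH/MISMATCH per dataset.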

nex_session = get_session()

dataset_id_to_sample_count = dict([(x.dataset_id, x.sample_count) for x in nex_session.query(Dataset).filter_by(is_in_spell='true').all()])

sample_count = {}

for x in nex_session.query(Datasetsample).all():
    if x.dataset_id in sample_count:
        sample_count[x.dataset_id] = sample_count[x.dataset_id] + 1
    else:
        sample_count[x.dataset_id] = 1

for dataset_id in sample_count:
    if dataset_id not in dataset_id_to_sample_count:
        continue
    if sample_count[dataset_id] != dataset_id_to_sample_count[dataset_id]:
        print("MISMATCH: ", dataset_id, sample_count[dataset_id], dataset_id_to_sample_count[dataset_id])
    else:
        print("MATCH:    ", dataset_id, sample_count[dataset_id], dataset_id_to_sample_count[dataset_id])