Esempio n. 1
0
def load_data():
    """Load EC numbers and their URLs from ``file_to_load`` into the database.

    Reads the module-level ``file_to_load`` line by line and writes progress
    to the module-level ``log_file`` (both defined outside this function).
    A line is processed only if, once stripped, it is at least 8 characters
    long and its second character is ``"."``; everything else is skipped.
    The first whitespace-separated token of a processed line is taken as the
    EC number and the remainder as its description.  Each pair is handed to
    ``load_ec`` and ``load_ec_url``; the session is committed once at the end.

    Raises:
        AttributeError: if the 'BRENDA' or 'ExPASy' source lookup returns
            ``None`` (the ``.source_id`` access then fails).
    """
    nex_session = get_nex_session()

    brenda = nex_session.query(Source).filter_by(format_name='BRENDA').one_or_none()
    b_source_id = brenda.source_id
    expasy = nex_session.query(Source).filter_by(format_name='ExPASy').one_or_none()
    e_source_id = expasy.source_id

    # Context managers guarantee both handles are closed even when a loader
    # call raises part-way through the file.
    with open(log_file, "w") as fw, open(file_to_load) as f:
        for line in f:
            line = line.strip()
            if len(line) < 8 or line[1] != ".":
                continue
            # Re-join EC numbers written with a space after a dot
            # ("1. 1. 1. 1" -> "1.1.1.1") before tokenising.
            line = line.replace(". ", ".")
            # Collapse runs of whitespace, then split off the leading token.
            ec, _, desc = ' '.join(line.split()).partition(' ')
            ec_id = load_ec(nex_session, fw, ec, desc, e_source_id)
            load_ec_url(nex_session, fw, ec_id, ec, e_source_id, b_source_id)

    # nex_session.rollback()
    nex_session.commit()
Esempio n. 2
0
def load_data(data_file, log_file):
    """Load regulation annotations from a tab-delimited file into the database.

    Columns read from each data row (0-based): 0 regulator systematic name,
    2 regulator type, 3 target systematic name, 5 strain, 6 direction,
    7 regulation type, 8 "happens during" GO term (optional), 9 ECO codes
    (comma-separated), 10 PMID, 11 annotation type, 12 created_by.
    Lines starting with ``'Regulator'`` are treated as the header and skipped.

    A row is skipped, with a message on stdout, when any referenced entity
    cannot be resolved or when a type/direction value is not in the
    corresponding ``allowable_*`` collection (defined outside this function).
    For every ECO code of a surviving row:

    * a key already seen in this file is reported and skipped;
    * a key already in REGULATIONANNOTATION with the same direction is only
      logged;
    * a key already in the table with a different direction has its direction
      updated and committed;
    * otherwise ``insert_a_row`` is called.

    Args:
        data_file: path of the tab-delimited input file.
        log_file: path of the log file to (over)write.
    """
    nex_session = get_nex_session()

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    # Lookup tables from external identifiers to database primary keys.
    pmid_to_reference_id = {
        x.pmid: x.dbentity_id
        for x in nex_session.query(Referencedbentity).all()
    }
    name_to_locus_id = {
        x.systematic_name: x.dbentity_id
        for x in nex_session.query(Locusdbentity).all()
    }
    taxid_to_taxonomy_id = {
        x.taxid: x.taxonomy_id
        for x in nex_session.query(Taxonomy).all()
    }
    eco_to_id = {x.ecoid: x.eco_id for x in nex_session.query(Eco).all()}
    goid_to_id = {x.goid: x.go_id for x in nex_session.query(Go).all()}

    # Existing annotations, keyed the same way as the rows built below.
    key_to_annotation = {}
    for x in nex_session.query(Regulationannotation).all():
        happens_during = x.happens_during if x.happens_during is not None else ''
        key = (x.target_id, x.regulator_id, x.taxonomy_id, x.reference_id,
               x.eco_id, x.regulator_type, x.regulation_type,
               x.annotation_type, happens_during)
        key_to_annotation[key] = x

    strain_to_taxid = get_strain_taxid_mapping()

    # key -> first input line that produced it, to detect in-file duplicates.
    loaded = {}

    # The original never closed either handle; the context manager closes
    # both, including when a row raises.
    # NOTE(review): there is no final commit here; presumably insert_a_row
    # commits on its own - confirm.
    with open(log_file, "w") as fw, open(data_file) as f:
        for line in f:
            if line.startswith('Regulator'):
                continue
            pieces = line.strip().split("\t")

            # The messages below are single pre-formatted strings so that the
            # output is the same text the Python 2 print statement produced
            # while also being valid Python 3.
            regulator_id = name_to_locus_id.get(pieces[0].strip())
            if regulator_id is None:
                print("The regulator name:  %s  is not in the database." % pieces[0])
                continue

            target_id = name_to_locus_id.get(pieces[3].strip())
            if target_id is None:
                print("The target name:  %s  is not in the database." % pieces[3])
                continue

            strain = pieces[5].strip()
            if strain == 'CEN.PK':
                strain = 'CENPK'
            taxid = strain_to_taxid.get(strain)
            if taxid is None:
                print("The strain name:  %s  is not in the mapping module." % pieces[5])
                continue
            taxonomy_id = taxid_to_taxonomy_id.get(taxid)
            if taxonomy_id is None:
                print("The taxid:  %s  is not in the database." % (taxid,))
                continue

            # '' (not None) marks "no GO term" so the key matches the one
            # built for existing annotations above.
            happens_during = ''
            if pieces[8]:
                goid = pieces[8].strip().split(' ')[0]
                happens_during = goid_to_id.get(goid)
                if happens_during is None:
                    print("Unknown GOID:  %s" % goid)
                    continue

            reference_id = pmid_to_reference_id.get(int(pieces[10]))
            if reference_id is None:
                print("The pmid:  %s  is not in the database" % pieces[10])
                continue

            regulator_type = pieces[2].strip()
            direction = pieces[6].strip()
            regulation_type = pieces[7].strip()
            annotation_type = pieces[11].strip()
            created_by = pieces[12].strip()

            if regulator_type not in allowable_regulator_type:
                print("Unknown regulator_type:  %s" % regulator_type)
                continue
            if regulation_type not in allowable_regulation_type:
                print("Unknown regulation_type:  %s" % regulation_type)
                continue
            if direction and direction not in allowable_regulation_direction:
                print("Unknown regulation_direction:  %s" % direction)
                continue
            if annotation_type not in allowable_annotation_type:
                print("Unknown annotation_type:  %s" % annotation_type)
                # Skip the row like every other failed validation; the
                # original fell through and still loaded the bad value.
                continue

            if regulation_type == 'protein activity' and regulator_type in [
                    'transcription factor', 'chromatin modifier'
            ]:
                print("regulator_type in (transcription factor, chromatin modifier) cannot be used with regulation_type = 'protein activity'. See line below:")
                print(line)
                continue

            if regulator_type == 'protein modifier' and regulation_type == 'regulation of transcription':
                print("regulator_type = 'protein modifier' cannot be used with regulation_type = 'regulation of transcription'. See line below:")
                print(line)
                continue

            for eco_item in pieces[9].strip().split(','):
                eco_id = eco_to_id.get(eco_item.strip().split(' ')[0])
                if eco_id is None:
                    print("The ECO code:  %s  is not in the database." % pieces[9])
                    continue

                key = (target_id, regulator_id, taxonomy_id, reference_id,
                       eco_id, regulator_type, regulation_type,
                       annotation_type, happens_during)

                if key in loaded:
                    print("Same row exists:  %s" % loaded[key])
                    print("Same row exists:  %s" % line)
                    continue
                loaded[key] = line

                if key not in key_to_annotation:
                    insert_a_row(nex_session, fw, source_id, target_id,
                                 regulator_id, eco_id, reference_id,
                                 taxonomy_id, regulator_type, regulation_type,
                                 annotation_type, direction, happens_during,
                                 created_by)
                    continue

                x = key_to_annotation[key]
                # Compare with NULL normalised to '' so an empty direction in
                # the file matches a NULL direction in the database.
                direction_in_db = '' if x.direction is None else x.direction
                if direction_in_db == direction:
                    fw.write("IN database: " + line.strip() + " KEY=" +
                             str(key) + " direction_in_db=" +
                             str(x.direction) + "\n")
                    continue

                # The stored direction differs from the file: update it.
                x.direction = direction
                nex_session.add(x)
                nex_session.commit()
                fw.write("The direction has been updated for key=" + str(key) +
                         "\n")
def load_data():
    """Load reserved gene names from ``file_to_load`` into the database.

    Reads the module-level ``file_to_load`` (tab-delimited) and writes a log
    to the module-level ``log_file``; both are defined outside this function.
    Columns read (0-based): 0 gene name, 1 ORF (systematic name), 2 name
    description, 3 colleague bud_id, 4 reference bud_id, 6 date value passed
    to ``reformat_date``.  A row whose column 1 is ``'ORF'`` is treated as
    the header and skipped.

    For each row whose ORF, colleague and reference all resolve, the function
    updates the gene name and name description on LOCUSDBENTITY, updates the
    display name on DBENTITY, then calls ``add_locus_reference``,
    ``add_colleague_locus`` (only when the colleague/locus pair is not
    already known) and ``add_reservedname``.  Rows that cannot be resolved
    are reported on stdout and skipped.  The session is committed once after
    the whole file has been processed.
    """
    nex_session = get_nex_session()

    bud_id_to_reference_id = {
        x.bud_id: x.dbentity_id
        for x in nex_session.query(Referencedbentity).all()
    }
    name_to_locus_id = {
        x.systematic_name: x.dbentity_id
        for x in nex_session.query(Locusdbentity).all()
    }
    bud_id_to_colleague_id = {
        x.bud_id: x.colleague_id
        for x in nex_session.query(Colleague).all()
    }
    key_to_colleague_locus_id = {
        (x.colleague_id, x.locus_id): x.colleague_locus_id
        for x in nex_session.query(ColleagueLocus).all()
    }

    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
    sgd_source_id = sgd.source_id
    direct = nex_session.query(Source).filter_by(display_name='Direct submission').one_or_none()
    direct_source_id = direct.source_id

    # Context managers close both handles even if a row raises (for example
    # int() on a malformed id), which the trailing close() calls did not.
    with open(log_file, "w") as fw, open(file_to_load) as f:
        for line in f:
            pieces = line.strip().split("\t")
            gene_name = pieces[0].strip()
            name_desc = pieces[2].strip()
            if pieces[1] == 'ORF':
                continue

            # The messages below are single pre-formatted strings so that the
            # output is the same text the Python 2 print statement produced
            # while also being valid Python 3.
            locus_id = name_to_locus_id.get(pieces[1].strip())
            if locus_id is None:
                print("The ORF name:  %s  is not in the database." % pieces[1])
                continue

            colleague_id = bud_id_to_colleague_id.get(int(pieces[3]))
            if colleague_id is None:
                print("The colleague bud_id: %s  is not in the database." % pieces[3])
                continue

            if not pieces[4]:
                print("NO reference_no provided.")
                print(line)
                continue
            reference_bud_id = int(pieces[4])
            if reference_bud_id not in bud_id_to_reference_id:
                print("The reference bud_id: %s  is not in the database." % pieces[4])
                continue
            reference_id = bud_id_to_reference_id[reference_bud_id]

            reserved_date, expired_date = reformat_date(pieces[6])

            print("%s %s %s %s %s %s %s" % (gene_name, locus_id, colleague_id,
                                            reference_id, reserved_date,
                                            expired_date, name_desc))

            # update LOCUSDBENTITY
            nex_session.query(Locusdbentity).filter_by(dbentity_id=locus_id).update({"gene_name": gene_name, "name_description": name_desc})
            fw.write("Update LOCUSDBENTITY row for "+pieces[1]+": gene_name="+gene_name+", name_desc="+name_desc+"\n")

            # update DBENTITY
            nex_session.query(Dbentity).filter_by(dbentity_id=locus_id).update({"display_name": gene_name})
            fw.write("Update DBENTITY row for "+pieces[1]+": display_name="+gene_name+"\n")

            add_locus_reference(nex_session, fw, locus_id, reference_id, sgd_source_id)

            if (colleague_id, locus_id) not in key_to_colleague_locus_id:
                add_colleague_locus(nex_session, fw, locus_id, colleague_id, direct_source_id)

            add_reservedname(nex_session, fw, locus_id, gene_name, reference_id, colleague_id,
                             reserved_date, expired_date, direct_source_id)

    # nex_session.rollback()
    nex_session.commit()
Esempio n. 4
0
def _iter_cell_locus_ids(cell, no_locus, name_to_locus_id):
    """Yield the locus ids a spreadsheet cell refers to.

    Yields a single ``None`` when ``no_locus`` is true (the row is not tied
    to a gene).  Otherwise ``cell`` is split on single spaces and each name
    is looked up in ``name_to_locus_id``; unknown names are reported on
    stdout and skipped.  Being a generator, the messages are emitted lazily,
    interleaved with whatever the caller does per yielded id.
    """
    if no_locus:
        yield None
        return
    for name in cell.strip().split(" "):
        name = name.strip()
        locus_id = name_to_locus_id.get(name)
        if locus_id is None:
            print("The gene name: ", name, " is not in the database.")
            continue
        yield locus_id


def load_references(infile, logfile):
    """Load papers, curation tags and literature topics from a curation file.

    ``infile`` is tab-delimited.  A row whose first column is ``'PMID'``,
    ``'pmid'``, ``'pubmed'`` or empty is stored as the header; its column
    names supply the curation tag / topic names.  For data rows the columns
    read (0-based) are: 0 PMID, 1 created_by, 2 "add to DB only" flag,
    3 "discard" flag, 15 date_created; columns 4, 5, 7, 8, 9, 13, 14 are
    curation-tag columns and columns 6-12 are literature-topic columns
    (the topic for columns 7-9 is forced to "Primary Literature").

    Processing:

    1. ``load_papers`` is called first; the PMID lookup is rebuilt afterwards
       so it reflects whatever ``load_papers`` changed.
    2. Rows flagged "add to DB only" are skipped.  Rows flagged "discard"
       get a REFERENCEDELETED row unless that PMID already has one.
    3. For every non-empty tag/topic cell, a value of ``'1'`` (or, for
       topics, the topic ``'Omics'``) produces one row without a locus; any
       other value is read as a space-separated list of gene names and
       produces one row per resolvable gene.  Keys already in the database
       or already inserted during this run are reported and skipped.

    The session is committed once at the end.

    Args:
        infile: path of the tab-delimited curation file.
        logfile: path of the log file to (over)write.
    """
    nex_session = get_nex_session()

    # Genes can be referenced by either systematic or standard name.
    name_to_locus_id = {}
    for x in nex_session.query(Locusdbentity).all():
        name_to_locus_id[x.systematic_name] = x.dbentity_id
        if x.gene_name:
            name_to_locus_id[x.gene_name] = x.dbentity_id

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    # `taxon` is a module-level value defined outside this function.
    tax = nex_session.query(Taxonomy).filter_by(taxid=taxon).one_or_none()
    taxonomy_id = tax.taxonomy_id

    # Context managers close the log and the input even if a row raises.
    with open(logfile, "w") as fw:
        pmid_to_reference_id = {
            x.pmid: x.dbentity_id
            for x in nex_session.query(Referencedbentity).all()
        }

        load_papers(fw, infile, pmid_to_reference_id)

        # Re-read after load_papers so the lookup is current.
        pmid_to_reference_id = {
            x.pmid: x.dbentity_id
            for x in nex_session.query(Referencedbentity).all()
        }
        pmid_to_refdeleted_id = {
            x.pmid: x.referencedeleted_id
            for x in nex_session.query(Referencedeleted).all()
        }

        # Keys of rows already present; a falsy locus/dbentity id is
        # normalised to None to match the keys built from the file.
        key_in_annotation = set()
        for x in nex_session.query(Literatureannotation).all():
            key_in_annotation.add(
                (x.reference_id, x.dbentity_id or None, x.topic))

        # Record every existing row, including those without a locus.  The
        # original only recorded rows that had a locus, so the duplicate
        # check for locus-less tags could never match an existing row.
        key_in_curation = set()
        for x in nex_session.query(CurationReference).all():
            key_in_curation.add(
                (x.reference_id, x.locus_id or None, x.curation_tag))

        header = []

        with open(infile) as f:
            for line in f:
                line = line.replace("Homology Disease", "Homology/Disease")
                pieces = line.strip().split("\t")

                # NOTE(review): a blank line also matches '' here and would
                # replace the stored header - confirm inputs have none.
                if pieces[0] in ['PMID', 'pmid', 'pubmed', '']:
                    header = pieces
                    continue

                pmid = int(pieces[0])
                created_by = pieces[1]
                date_created = pieces[15]

                if pieces[2] == '1':
                    # add to DB only - reference has been loaded so skip
                    print("Add to DB only: ", pieces[0])
                    continue
                if pieces[3] == '1':
                    print("Discard this paper")
                    if pmid in pmid_to_refdeleted_id:
                        print("The row for PMID: ", pmid,
                              " is in the REFERENCEDELETED table.")
                        continue
                    insert_referencedeleted(nex_session, fw, pmid, created_by,
                                            date_created)
                    continue

                reference_id = pmid_to_reference_id.get(pmid)
                if reference_id is None:
                    print("The pmid: ", pmid, " is not in the database.")
                    continue

                # curation tags
                for i in [4, 5, 7, 8, 9, 13, 14]:
                    if pieces[i] == "":
                        continue
                    curation_tag = header[i].strip()
                    for locus_id in _iter_cell_locus_ids(
                            pieces[i], pieces[i] == '1', name_to_locus_id):
                        key = (reference_id, locus_id, curation_tag)
                        if key in key_in_curation:
                            print("The row for ", key,
                                  " is already in the CURATION_REFERENCE table.")
                            continue
                        insert_curation_reference(nex_session, fw,
                                                  reference_id, locus_id,
                                                  curation_tag, created_by,
                                                  date_created, source_id)
                        key_in_curation.add(key)

                # literature topics
                for i in [6, 7, 8, 9, 10, 11, 12]:
                    if pieces[i] == "":
                        continue
                    topic = header[i].strip()
                    if i in [7, 8, 9]:
                        topic = "Primary Literature"
                    no_locus = pieces[i] == '1' or topic == 'Omics'
                    for locus_id in _iter_cell_locus_ids(
                            pieces[i], no_locus, name_to_locus_id):
                        key = (reference_id, locus_id, topic)
                        if key in key_in_annotation:
                            print("The row for ", key,
                                  " is already in the LITERATUREANNOTATION table.")
                            continue
                        insert_literatureannotation(nex_session, fw,
                                                    reference_id, taxonomy_id,
                                                    locus_id, topic,
                                                    created_by, date_created,
                                                    source_id)
                        key_in_annotation.add(key)

        # nex_session.rollback()
        nex_session.commit()