Ejemplo n.º 1
0
def read_data_and_update_database(nex_session, fw):
    """Read the tab-delimited ``data_file`` and insert protein sequence details.

    Each data row is expected to hold: systematic name, strain, contig
    format name, followed by the detail values.  A row whose first column is
    ``name`` is treated as the header; its columns from index 3 onwards are
    passed along with every subsequent data row.

    A row is skipped (with a message on stdout) when its locus, strain,
    taxonomy, contig or existing protein sequence annotation cannot be
    resolved.  Resolved rows are handed to ``insert_proteinsequence_detail``
    and the session is committed once at the end.

    Args:
        nex_session: database session used for the lookups and the commit.
        fw: handle passed through unchanged to
            ``insert_proteinsequence_detail``.
    """
    taxon_to_taxonomy_id = {
        x.taxid: x.taxonomy_id
        for x in nex_session.query(Taxonomy).all()
    }
    name_to_dbentity_id = {
        x.systematic_name: x.dbentity_id
        for x in nex_session.query(Locusdbentity).all()
    }
    contig_to_contig_id = {
        x.format_name: x.contig_id
        for x in nex_session.query(Contig).all()
    }
    key_to_annotation_id = {
        (x.dbentity_id, x.taxonomy_id, x.contig_id): x.annotation_id
        for x in nex_session.query(Proteinsequenceannotation).all()
    }

    strain_to_taxon_mapping = get_strain_taxid_mapping()

    header = None
    # The context manager closes the file even if an insert raises.
    with open(data_file) as f:
        for line in f:
            pieces = line.strip().split("\t")
            if pieces[0] == 'name':
                header = pieces[3:]
                continue

            name = pieces[0]
            dbentity_id = name_to_dbentity_id.get(name)
            if dbentity_id is None:
                print(name + " is not in the database")
                # Skip the row like every other failed lookup below; without
                # this, unknown or blank rows fell through and crashed.
                continue

            strain = pieces[1]
            taxon = strain_to_taxon_mapping.get(strain)
            if taxon is None:
                print("The strain = " + strain +
                      " is not in the mapping module.")
                continue

            taxonomy_id = taxon_to_taxonomy_id.get(taxon)
            if taxonomy_id is None:
                # str() keeps the message working if the taxid is not a str.
                print("The taxid = " + str(taxon) +
                      " is not in the database.")
                continue

            contig = pieces[2]
            contig_id = contig_to_contig_id.get(contig)
            if contig_id is None:
                print(contig + " is not in the database.")
                continue

            key = (dbentity_id, taxonomy_id, contig_id)
            annotation_id = key_to_annotation_id.get(key)
            if annotation_id is None:
                # The key is a tuple; it must be converted before it can be
                # concatenated with the message text.
                print(str(key) + " is not in the database.")
                continue

            data = pieces[3:]

            insert_proteinsequence_detail(nex_session, fw, annotation_id,
                                          data, header)

    # nex_session.rollback()
    nex_session.commit()
Ejemplo n.º 2
0
def load_phenotypes(infile, logfile):
    """Load phenotype annotations and their conditions from ``infile``.

    ``infile`` is tab-delimited.  Line 1 is a "super header"; line 2 is the
    column header, where entries equal to ``required``/``Required``/empty are
    replaced by the super-header entry of the same column and ``ChEBI ID`` is
    renamed to ``chemical_name``.  Every following line is one annotation.

    Columns in ``[cond_start_index, cond_stop_index)`` hold the conditions as
    consecutive (name, value, unit) triplets; the condition class is the text
    before the first ``_`` of the triplet's first header.  All other columns
    describe the annotation itself.

    For each usable row the annotation is looked up by its full key and
    inserted via ``insert_phenotypeannotation`` when missing; its conditions
    are then inserted via ``insert_phenotypeannotation_cond`` under a group
    id (1 for a new annotation, otherwise last known group id + 1).  Rows
    with unresolved or missing required values are reported on stdout and
    skipped.  The session is committed once at the end.

    Relies on module-level names not visible here: ``degree_file``,
    ``cond_start_index``, ``cond_stop_index`` and ``column_size``.

    Args:
        infile: path of the tab-delimited phenotype file.
        logfile: path of the log file opened for writing and handed to the
            insert helpers.
    """

    def report(*parts):
        # Emits exactly what the Python 2 statement ``print a, b, c`` would
        # (items separated by one space) while being valid on Python 2 and 3.
        print(" ".join([str(p) for p in parts]))

    nex_session = get_session()

    name_to_locus_id = {}
    for x in nex_session.query(Locusdbentity).all():
        name_to_locus_id[x.systematic_name] = x.dbentity_id
        if x.gene_name:
            name_to_locus_id[x.gene_name] = x.dbentity_id

    sgd = nex_session.query(Source).filter_by(format_name='SGD').one_or_none()
    source_id = sgd.source_id

    pmid_to_reference_id = dict([
        (x.pmid, x.dbentity_id)
        for x in nex_session.query(Referencedbentity).all()
    ])

    experiment_to_id = {}
    mutant_to_id = {}
    for x in nex_session.query(Apo).all():
        if x.apo_namespace == 'experiment_type':
            experiment_to_id[x.display_name] = x.apo_id
        if x.apo_namespace == 'mutant_type':
            mutant_to_id[x.display_name] = x.apo_id

    # Highest condition group id already stored for each annotation.
    annotation_id_to_last_group_id = {}
    for x in nex_session.query(PhenotypeannotationCond).all():
        last_group_id = 1
        if x.annotation_id in annotation_id_to_last_group_id:
            last_group_id = annotation_id_to_last_group_id[x.annotation_id]
        if x.group_id > last_group_id:
            last_group_id = x.group_id
        annotation_id_to_last_group_id[x.annotation_id] = last_group_id

    phenotype_to_id = dict([(x.display_name, x.phenotype_id)
                            for x in nex_session.query(Phenotype).all()])
    taxid_to_taxonomy_id = dict([(x.taxid, x.taxonomy_id)
                                 for x in nex_session.query(Taxonomy).all()])
    allele_to_id = dict([(x.display_name, x.allele_id)
                         for x in nex_session.query(Allele).all()])
    reporter_to_id = dict([(x.display_name, x.reporter_id)
                           for x in nex_session.query(Reporter).all()])
    chebiid_to_name = dict([(x.chebiid, x.display_name)
                            for x in nex_session.query(Chebi).all()])

    fw = open(logfile, "w")

    key_to_annotation_id = dict([
        ((x.dbentity_id, x.taxonomy_id, x.reference_id, x.phenotype_id,
          x.experiment_id, x.mutant_id, x.allele_id, x.reporter_id,
          x.strain_name, x.details), x.annotation_id)
        for x in nex_session.query(Phenotypeannotation).all()
    ])

    strain_taxid_mapping = get_strain_taxid_mapping()

    # The unit used for Celsius values is taken from column 27 of the degree
    # file; when the file has several lines the last one wins.
    degree = None
    with open(degree_file) as f0:
        for line in f0:
            field = line.split("\t")
            degree = field[26]

    f = open(infile)

    superheader = []
    header = []
    cond_header = []
    for i, line in enumerate(f, 1):
        pieces = line.strip().split("\t")

        if i == 1:
            superheader = pieces
            continue

        if i == 2:
            for j, x in enumerate(pieces):
                if x in ('required', 'Required', ''):
                    x = superheader[j]
                if x == "ChEBI ID":
                    x = "chemical_name"
                header.append(x)
            cond_header = header[cond_start_index:cond_stop_index]
            continue

        # NOTE(review): this pads short rows to column_size - 1 entries, not
        # column_size; kept as in the original -- confirm the intent.
        if len(pieces) < column_size:
            for _ in range(len(pieces), column_size - 1):
                pieces.append("")

        created_by = None
        dbentity_id = None
        reference_id = None
        taxonomy_id = None
        experiment_id = None
        mutant_id = None
        allele_id = None
        allele_comment = ""
        reporter_id = None
        reporter_comment = ""
        details = ""
        observable = ""
        qualifier = ""
        phenotype_id = None
        strain_name = ""
        bad_row = 0
        conds = pieces[cond_start_index:cond_stop_index]

        for k, x in enumerate(pieces):
            field_name = header[k].strip()
            # Condition columns are handled separately further down.
            if cond_start_index <= k < cond_stop_index:
                continue
            # Compare by value: ``is ""`` only worked through string
            # interning, which is an implementation detail.
            if x == "":
                continue

            ## the rest is for phenotypeannotation table

            if field_name.startswith('curator'):
                created_by = x.strip()

            if field_name == 'feature_name':
                dbentity_id = name_to_locus_id.get(x.strip())
                if dbentity_id is None:
                    report("The feature_name:", x,
                           " is not in the database.")
                    bad_row = 1
                    break

            if field_name == 'PMID':
                reference_id = pmid_to_reference_id.get(int(x.strip()))
                if reference_id is None:
                    report("The PMID: ", x, " is not in the database.")
                    bad_row = 1
                    break

            if field_name == "experiment_type":
                experiment_id = experiment_to_id.get(x.strip().replace(
                    '"', ''))
                if experiment_id is None:
                    report("The experiment_type:", x,
                           " is not in the APO table.")
                    bad_row = 1
                    break

            if field_name == "mutant_type":
                mutant_id = mutant_to_id.get(x.strip())
                if mutant_id is None:
                    report("The mutant_type:", x, " is not in the APO table.")
                    bad_row = 1
                    continue

            if field_name == "observable":
                observable = x.strip()

            if field_name == "qualifier":
                qualifier = x.strip()

            if field_name == "strain_background":
                taxid = strain_taxid_mapping.get(x.strip())
                if taxid is None:
                    report("The strain_background:", x,
                           " is not in the mapping.")
                    bad_row = 1
                    continue
                taxonomy_id = taxid_to_taxonomy_id.get(taxid)
                if taxonomy_id is None:
                    report("The TAXON ID: ", taxid,
                           " is not in the database.")
                    bad_row = 1
                    continue

            if field_name == "strain_name":
                strain_name = x.strip()

            if field_name == "allele_name":
                allele_id = allele_to_id.get(x.strip())
                if allele_id is None:
                    allele_id = insert_allele(nex_session, fw, source_id,
                                              created_by, x.strip())
                    allele_to_id[x.strip()] = allele_id

            if field_name == "allele_description":
                allele_comment = x

            if field_name == "reporter_name":
                reporter_id = reporter_to_id.get(x.strip())
                if reporter_id is None:
                    reporter_id = insert_reporter(nex_session, fw, source_id,
                                                  created_by, x.strip())
                    reporter_to_id[x.strip()] = reporter_id

            if field_name == "reporter_description":
                reporter_comment = x

            if field_name == "details":
                details = x

        if bad_row == 1:
            continue

        # Rows carrying neither a curator nor an observable are skipped
        # without a message.
        if created_by is None and observable == "":
            continue

        if observable != "":
            phenotype = observable
            if qualifier != "":
                phenotype = observable + ": " + qualifier
            phenotype_id = phenotype_to_id.get(phenotype)
            if phenotype_id is None:
                report("The phenotype:", phenotype,
                       " is not in the database.")
                continue
        else:
            report("No observable is provided for line:", line)
            continue

        if dbentity_id is None:
            report("No feature_name is provided for line:", line)
            continue

        if taxonomy_id is None:
            report("No strain_background is provided for line:", line)
            continue

        if reference_id is None:
            report("No PMID is provided for line:", line)
            continue

        if created_by is None:
            report("No curator ID is provided for line:", line)
            continue

        key = (dbentity_id, taxonomy_id, reference_id, phenotype_id,
               experiment_id, mutant_id, allele_id, reporter_id, strain_name,
               details)

        annotation_id = key_to_annotation_id.get(key)

        group_id = 1
        if annotation_id is None:
            annotation_id = insert_phenotypeannotation(
                nex_session, fw, source_id, created_by, dbentity_id,
                taxonomy_id, reference_id, phenotype_id, experiment_id,
                mutant_id, allele_id, allele_comment, reporter_id,
                reporter_comment, strain_name, details)
            key_to_annotation_id[key] = annotation_id
        else:
            # Existing annotation: its conditions go into the next group.
            group_id = annotation_id_to_last_group_id.get(annotation_id)
            if group_id is None:
                group_id = 1
            else:
                group_id = group_id + 1

        ## insert conditions here

        # Walk the condition columns three at a time (name, value, unit).
        # Floor division keeps the triplet count an int on Python 3 as well.
        for m in range(0, (len(cond_header) // 3) * 3, 3):
            cond_name = conds[m]
            cond_value = conds[m + 1]
            cond_unit = conds[m + 2]
            cond_class = cond_header[m].split("_")[0]
            if cond_name == "":
                continue
            if cond_class == "chemical":
                # A chemical cell may list several comma-separated ChEBI
                # numbers, with values and units listed in the same order.
                chemical_names = cond_name.split(',')
                chemical_values = cond_value.split(',')
                chemical_units = cond_unit.split(',')

                report("chemical_names=", chemical_names)
                report("chemical_values=", chemical_values)
                report("chemical_units=", chemical_units)

                for n, chemical_name in enumerate(chemical_names):
                    chebiid = "CHEBI:" + chemical_name
                    cond_name = chebiid_to_name.get(chebiid)
                    cond_value = chemical_values[n]
                    cond_unit = chemical_units[n]

                    report("cond_name=", cond_name)
                    report("cond_value=", cond_value)
                    report("cond_unit=", cond_unit)

                    if cond_name is None:
                        # The original referenced the undefined name
                        # ``chebi`` here, which raised NameError.
                        report("The ChEBI ID", chebiid,
                               " is not in the database.")
                        continue
                    insert_phenotypeannotation_cond(nex_session, fw,
                                                    created_by, annotation_id,
                                                    group_id, cond_class,
                                                    cond_name, cond_value,
                                                    cond_unit)
            else:

                if cond_class in ['temperature', 'treatment'
                                  ] and cond_unit.endswith('C'):
                    # Replace the unit with the value read from degree_file.
                    cond_unit = degree

                insert_phenotypeannotation_cond(nex_session, fw, created_by,
                                                annotation_id, group_id,
                                                cond_class, cond_name,
                                                cond_value, cond_unit)

        annotation_id_to_last_group_id[annotation_id] = group_id

    ##########
    # nex_session.rollback()
    nex_session.commit()

    fw.close()
    f.close()
def create_seqs(strain):
    """Write the sequences lying between adjacent features of ``strain``.

    Genomic DNA sequence annotations of the strain's taxonomy are walked in
    (contig, start, end) order.  Only features whose SO display name is one
    of the gene types listed below, and whose locus status is not
    ``Deleted``/``Merged``, are considered.  For each pair of consecutive
    features on the same contig the residues between them are cut from the
    contig sequence and written as a FASTA record to
    ``dataDir + "not_feature_" + strain + ".fsa"``; one mapping line per
    record is written to ``fw_mapping``.

    For strains other than ``S288C`` the feature-pair order found in
    ``refFile`` decides the orientation: when only the reversed pair occurs
    there, the record is named in that order and the sequence is
    reverse-complemented.  Records for those strains are written sorted by
    defline; ``S288C`` records are written as they are produced.

    Returns early (after a message on stdout) when the strain or its
    taxonomy is unknown, and terminates the process when a contig row is
    missing.

    Args:
        strain: strain name, looked up in the strain-to-taxid mapping.
    """
    import sys  # local import: only needed for the fatal exit below

    nex_session = get_session()
    strain_to_taxid = get_strain_taxid_mapping()
    taxon = strain_to_taxid.get(strain)
    if taxon is None:
        print("The strain=", strain, " is not in the mapping.")
        return

    taxonomy = nex_session.query(Taxonomy).filter_by(taxid=taxon).one_or_none()
    if taxonomy is None:
        print("The taxon ID=", taxon, " is not in the database.")
        return
    taxonomy_id = taxonomy.taxonomy_id

    dbentity_id_to_name = dict([
        (x.dbentity_id, (x.systematic_name, x.dbentity_status))
        for x in nex_session.query(Locusdbentity).all()
    ])
    so_id_to_display_name = dict([(x.so_id, x.display_name)
                                  for x in nex_session.query(So).all()])

    outfile = dataDir + "not_feature_" + strain + ".fsa"

    # (name1, name2) pairs from the reference file.  A set, because it is
    # probed twice for every candidate record in the loop below.
    feature_order = set()
    if strain != 'S288C':
        with open(refFile) as f:
            for line in f:
                if line.startswith(">"):
                    seqID = line.replace(">", "").split(' ')[0]
                    name1, name2, _ref_strain = seqID.split('|')
                    feature_order.add((name1, name2))

    wanted_types = frozenset([
        'ORF', 'ncRNA gene', 'snoRNA gene', 'snRNA gene', 'tRNA gene',
        'rRNA gene', 'telomerase RNA gene'
    ])

    found = set()
    prevRow = None
    prevContigId = None
    contig_id_to_seq = {}
    contig_id_to_display_name = {}
    defline_to_seq = {}

    # NOTE(review): ``fw_mapping`` is not defined in this function;
    # presumably a module-level handle opened elsewhere -- confirm.
    with open(outfile, "w") as fw:
        for x in nex_session.query(Dnasequenceannotation).filter_by(
                dna_type='GENOMIC', taxonomy_id=taxonomy_id).order_by(
                    Dnasequenceannotation.contig_id,
                    Dnasequenceannotation.start_index,
                    Dnasequenceannotation.end_index).all():
            (name, status) = dbentity_id_to_name[x.dbentity_id]
            if status in ['Deleted', 'Merged']:
                continue
            so_type = so_id_to_display_name.get(x.so_id)
            if so_type not in wanted_types:
                continue

            # First feature on a contig: nothing precedes it yet.
            if prevContigId is None or prevContigId != x.contig_id:
                prevRow = (name, x.start_index, x.end_index)
                prevContigId = x.contig_id
                continue

            (prevName, prevStart, prevEnd) = prevRow

            # Feature fully contained in the previous one: ignore it and
            # keep the previous feature as the left neighbour.
            if x.start_index >= prevStart and x.end_index <= prevEnd:
                continue

            start = prevEnd + 1
            end = x.start_index - 1

            # Overlapping or (nearly) adjacent features leave no region.
            if end <= start:
                prevRow = (name, x.start_index, x.end_index)
                prevContigId = x.contig_id
                continue

            if x.contig_id not in contig_id_to_seq:
                contig = nex_session.query(Contig).filter_by(
                    contig_id=x.contig_id).one_or_none()
                if contig is None:
                    print("The contig_id=", x.contig_id,
                          " is not in the database.")
                    # sys.exit() instead of the interactive ``exit`` helper;
                    # the ``with`` block closes the output file on the way.
                    sys.exit()
                contig_id_to_seq[x.contig_id] = contig.residues
                contig_id_to_display_name[x.contig_id] = contig.display_name
            seq = contig_id_to_seq[x.contig_id][start - 1:end]
            seqID = prevName + "|" + name + "|" + strain

            # Only the reversed pair is known to the reference: emit the
            # record in reference orientation.
            if ((prevName, name) not in feature_order
                    and (name, prevName) in feature_order):
                seqID = name + "|" + prevName + "|" + strain
                (start, end) = (end, start)
                seq = reverse_complement(seq)

            if seqID in found:
                print("The seqID is already in the file.", seqID)
                continue
            found.add(seqID)
            defline = ">" + seqID + " " + contig_id_to_display_name[
                x.contig_id] + " " + "from " + str(start) + "-" + str(end)

            fw_mapping.write(seqID + "\t" + str(x.contig_id) + "\t" +
                             str(start) + "\t" + str(end) + "\n")

            if strain == 'S288C':
                defline = defline + ", Genome Release 64-2-1,"
            defline = defline + " between " + seqID.split(
                '|')[0] + " and " + seqID.split('|')[1]
            if strain == 'S288C':
                fw.write(defline + "\n")
                fw.write(seq + "\n")
            else:
                defline_to_seq[defline] = seq

            prevRow = (name, x.start_index, x.end_index)
            prevContigId = x.contig_id

        if strain != 'S288C':
            for defline in sorted(defline_to_seq):
                fw.write(defline + "\n")
                fw.write(defline_to_seq[defline] + "\n")

    fw_mapping.close()
Ejemplo n.º 4
0
def _read_fasta_records(fasta_path):
    """Yield ``(defline, sequence)`` for each complete record in a FASTA file.

    Lines are stripped; a line starting with ``>`` begins a new record and
    all other lines are concatenated into the sequence.  A record is yielded
    only when both its defline and its sequence are non-empty.
    """
    defline = ""
    chunks = []
    with open(fasta_path) as f:
        for line in f:
            line = line.strip()
            if line.startswith('>'):
                seq = "".join(chunks)
                if seq and defline:
                    yield defline, seq
                defline = line
                chunks = []
            else:
                chunks.append(line)
    # Flush the last record under the same guard as the ones before it, so an
    # empty file or an empty trailing record no longer reaches the insert
    # helpers with blank values.
    seq = "".join(chunks)
    if seq and defline:
        yield defline, seq


def load_data():
    """Load DNA, protein and CDS sequences plus locus aliases.

    Reads, in this order:

    * ``genomic_file``, ``coding_file`` and ``kb_file`` (FASTA); every record
      goes to ``insert_dnasequenceannotation`` with a DNA type derived from
      the file name (``CODING`` if it contains "coding", ``GENOMIC`` if it
      contains "genomic", otherwise ``1KB``);
    * ``protein_file`` (FASTA) via ``insert_proteinsequenceannotation``;
    * ``cds_file`` (FASTA) via ``insert_dnasubsequence``;
    * ``gene_file`` (tab-delimited: systematic name, GenBank ID, UniProt ID);
      for each known locus the three ``has_*`` flags are set to ``'1'`` and
      two aliases are inserted via ``insert_locus_alias``.

    ``log_file`` is opened for writing and passed to every insert helper.
    The session is committed once, after all files have been processed.
    """
    nex_session = get_session()

    taxid_to_taxonomy_id = dict([(x.taxid, x.taxonomy_id)
                                 for x in nex_session.query(Taxonomy).all()])
    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
    genBank = nex_session.query(Source).filter_by(
        display_name='GenBank/EMBL/DDBJ').one_or_none()
    uniprot = nex_session.query(Source).filter_by(
        display_name='UniProtKB').one_or_none()

    so_to_so_id = dict([(x.display_name, x.so_id)
                        for x in nex_session.query(So).all()])
    name_to_locus_id = dict([(x.systematic_name, x.dbentity_id)
                             for x in nex_session.query(Locusdbentity).all()])

    source_id = sgd.source_id
    genBank_src_id = genBank.source_id
    uniprot_src_id = uniprot.source_id

    strain_taxid_mapping = get_strain_taxid_mapping()

    # The context manager closes the log even if one of the loaders raises.
    with open(log_file, "w") as fw:

        for seq_file in [genomic_file, coding_file, kb_file]:
            if 'coding' in seq_file:
                dna_type = 'CODING'
            elif 'genomic' in seq_file:
                dna_type = 'GENOMIC'
            else:
                dna_type = '1KB'

            for defline, seq in _read_fasta_records(seq_file):
                insert_dnasequenceannotation(nex_session, fw, source_id,
                                             dna_type, defline, seq,
                                             so_to_so_id,
                                             taxid_to_taxonomy_id,
                                             name_to_locus_id,
                                             strain_taxid_mapping)

        ## protein sequences
        for defline, seq in _read_fasta_records(protein_file):
            insert_proteinsequenceannotation(nex_session, fw, source_id,
                                             defline, seq,
                                             taxid_to_taxonomy_id,
                                             name_to_locus_id,
                                             strain_taxid_mapping)

        ## cds sequences
        for defline, seq in _read_fasta_records(cds_file):
            insert_dnasubsequence(nex_session, fw, source_id, defline, seq,
                                  taxid_to_taxonomy_id, name_to_locus_id,
                                  strain_taxid_mapping, so_to_so_id)

        ## locus_alias + locusdbentity
        with open(gene_file) as f:
            for line in f:
                if line.startswith('systematic_name'):
                    continue
                [name, genBankID, uniprotID] = line.strip().split("\t")
                locus_id = name_to_locus_id.get(name)
                if locus_id is None:
                    print(name + " is not in the database.")
                    continue
                nex_session.query(Locusdbentity).filter_by(
                    dbentity_id=locus_id).update({
                        'has_sequence': '1',
                        'has_protein': '1',
                        'has_sequence_section': '1'
                    })
                insert_locus_alias(
                    nex_session, fw, locus_id, genBankID, genBank_src_id,
                    'DNA accession ID',
                    'https://www.ncbi.nlm.nih.gov/nuccore/' + genBankID)
                insert_locus_alias(
                    nex_session, fw, locus_id, uniprotID, uniprot_src_id,
                    'UniProtKB ID',
                    'http://www.uniprot.org/uniprot/' + uniprotID)

    # nex_session.rollback()
    nex_session.commit()
import os
from os import path
from src.models import Locusdbentity, Dnasubsequence, Dnasequenceannotation, Taxonomy
from scripts.loading.database_session import get_session
from scripts.loading.variant import calculate_variant_data, aligned_sequence_to_snp_sequence, \
     strain_to_id, calculate_block_data
from scripts.loading.util import get_strain_taxid_mapping

# Module-level session and strain lookups.
nex_session = get_session()
strain_to_taxid = get_strain_taxid_mapping()
# Rebinds the imported ``strain_to_id`` function to the value it returns.
strain_to_id = strain_to_id()
taxon = strain_to_taxid['S288C']

# Base directory for the variant data; the trailing slash is part of the
# value, so the joined paths below equal plain string concatenation.
dataDir = 'scripts/loading/variant/data/'

# Alignment inputs and outputs.
dnaDir = path.join(dataDir, 'dna_align/')
proteinDir = path.join(dataDir, 'protein_align/')
dnaSeqAlignFile = path.join(dataDir, 'dna_sequence_alignment.txt')
proteinSeqAlignFile = path.join(dataDir, 'protein_sequence_alignment.txt')

# Variant outputs.
dnaVariantFile = path.join(dataDir, 'dna_variant.txt')
proteinVariantFile = path.join(dataDir, 'protein_variant.txt')


def generate_protein_data(name_to_dbentity_id):

    fw = open(proteinSeqAlignFile, "w")
    fw2 = open(proteinVariantFile, "w")

    fw.write("sequence_name\tdbentity_id\taligned_sequence\n")
    fw2.write(
Ejemplo n.º 6
0
def load_data():
    """Load protein abundance annotations from the tab-delimited ``datafile``.

    Columns used, by index: 0 systematic name, 2 original PMID, 3 data value,
    4 ECO id, 5 EFO id, 6 strain, and optionally 7 ChEBI id, 8 GO id,
    9 time value, 10 time unit, 11 concentration value, 12 concentration
    unit, 13 fold change, 14 median, 15 MAD.  The literal text ``None`` is
    removed from every line before splitting.

    Rows whose locus, ECO id, EFO id, strain, taxonomy, ChEBI id or GO id
    cannot be resolved are reported on stdout and skipped.  Each remaining
    row is passed to ``insert_proteinabundanceannotation``.  The session is
    committed after every batch of more than 500 inserted rows and once more
    at the end, then closed.

    Returns without loading anything when the module-level ``PMID`` is not
    found among the references.
    """
    nex_session = get_session()

    sgd = nex_session.query(Source).filter_by(display_name='SGD').one_or_none()
    source_id = sgd.source_id
    name_to_dbentity_id = dict([
        (x.systematic_name, x.dbentity_id)
        for x in nex_session.query(Locusdbentity).all()
    ])
    pmid_to_reference_id = dict([
        (x.pmid, x.dbentity_id)
        for x in nex_session.query(Referencedbentity).all()
    ])
    ecoid_to_eco_id = dict([(x.ecoid, x.eco_id)
                            for x in nex_session.query(Eco).all()])
    efoid_to_efo_id = dict([(x.efoid, x.efo_id)
                            for x in nex_session.query(Efo).all()])
    chebiid_to_chebi_id = dict([(x.chebiid, x.chebi_id)
                                for x in nex_session.query(Chebi).all()])
    goid_to_go_id = dict([(x.goid, x.go_id)
                          for x in nex_session.query(Go).all()])
    taxid_to_taxonomy_id = dict([(x.taxid, x.taxonomy_id)
                                 for x in nex_session.query(Taxonomy).all()])
    strain_to_taxid_mapping = get_strain_taxid_mapping()
    reference_id = pmid_to_reference_id.get(PMID)
    if reference_id is None:
        print("The PMID:", PMID, " is not in the database.")
        return

    log.info("Start loading:\n")
    log.info(str(datetime.now()) + "\n")

    # Index of the last optional column read below (MAD).
    last_optional_col = 15

    i = 0

    # The original never closed the log handle; both files are now closed
    # even if an insert raises.
    with open(logfile, "w") as fw, open(datafile) as f:
        for line in f:
            # Accept the correctly spelled header as well as the historical
            # misspelling the original checked for.
            if line.startswith(("SYSTEMATIC_NMAE", "SYSTEMATIC_NAME")):
                continue
            pieces = line.strip().replace("None", "").split("\t")
            dbentity_id = name_to_dbentity_id.get(pieces[0])
            if dbentity_id is None:
                print("The ORF name is not in the Locusdbentity table:",
                      pieces[0])
                continue
            # An empty original-PMID cell yields None instead of crashing in
            # int(); None is already what an unknown PMID produces.
            original_reference_id = None
            if pieces[2]:
                original_reference_id = pmid_to_reference_id.get(
                    int(pieces[2]))
            data_value = int(pieces[3])
            eco_id = ecoid_to_eco_id.get(pieces[4])
            if eco_id is None:
                print("The ECOID:", pieces[4], " is not in the database.")
                continue
            efo_id = efoid_to_efo_id.get(pieces[5])
            if efo_id is None:
                print("The EFOID:", pieces[5], " is not in the database.")
                continue
            taxid = strain_to_taxid_mapping.get(pieces[6])
            if taxid is None:
                print("The strain:", pieces[6], " is not in the mapping list.")
                continue
            taxonomy_id = taxid_to_taxonomy_id.get(taxid)
            if taxonomy_id is None:
                print("The TAXID:", taxid, " is not in the database.")
                continue
            chebi_id = None
            go_id = None
            time_value = None
            time_unit = None
            conc_value = None
            conc_unit = None
            fold_change = None
            median = None
            mad = None
            if len(pieces) >= 8:
                # strip() above drops trailing empty columns, so a row may
                # stop anywhere after column 7; pad it so the optional
                # columns can be indexed without IndexError.
                missing = last_optional_col + 1 - len(pieces)
                if missing > 0:
                    pieces.extend([""] * missing)

                if pieces[7]:
                    chebi_id = chebiid_to_chebi_id.get(pieces[7])
                    if chebi_id is None:
                        print("The chebiid:", pieces[7],
                              " is not in the database.")
                        continue
                if pieces[8]:
                    go_id = goid_to_go_id.get(pieces[8])
                    if go_id is None:
                        print("The goid:", pieces[8],
                              " is not in the database.")
                        continue
                if pieces[9]:
                    time_value = int(pieces[9])
                if pieces[10]:
                    # Normalise the unit spelling to hr / d / min.
                    time_unit = pieces[10]
                    if time_unit.startswith('hour'):
                        time_unit = 'hr'
                    if time_unit.startswith('day'):
                        time_unit = 'd'
                    if time_unit.startswith('min'):
                        time_unit = 'min'
                if pieces[11]:
                    conc_value = float(pieces[11])
                    conc_unit = pieces[12]
                if pieces[13]:
                    fold_change = float(pieces[13])
                if pieces[14]:
                    median = int(pieces[14])
                if pieces[15]:
                    mad = int(pieces[15])

            insert_proteinabundanceannotation(
                nex_session, fw, dbentity_id, source_id, taxonomy_id,
                reference_id, original_reference_id, eco_id, efo_id, chebi_id,
                go_id, data_value, fold_change, time_value, time_unit,
                conc_value, conc_unit, median, mad)

            # Commit in batches to keep the pending transaction small.
            i = i + 1
            if i > 500:
                # nex_session.rollback()
                nex_session.commit()
                i = 0

    # nex_session.rollback()
    nex_session.commit()
    nex_session.close()

    log.info("Done loading\n")
    log.info(str(datetime.now()) + "\n")
Ejemplo n.º 7
0
def read_data_and_update_database(nex_session, fw):
    """Insert protein domain annotations found in ``domain_file``.

    Each tab-delimited line supplies: an identifier of the form
    ``<systematic name>_<...>_<strain>`` (column 0), the domain name
    (column 4, spaces replaced by underscores), start and end (columns 6 and
    7) and a ``-``-separated run date (column 10) whose first three parts
    are written back in reverse order.

    Lines whose strain, taxonomy, locus or domain cannot be resolved are
    reported on stdout and skipped.  An annotation is inserted through
    ``insert_annotation`` -- and committed right away -- only when its
    (locus, domain, start, end, taxonomy) key is neither already stored in
    the database nor already inserted during this run.

    Args:
        nex_session: database session used for lookups and commits.
        fw: handle passed through unchanged to ``insert_annotation``.
    """
    interpro = nex_session.query(Source).filter_by(
        format_name='InterPro').one_or_none()

    taxon_to_taxonomy_id = {
        row.taxid: row.taxonomy_id
        for row in nex_session.query(Taxonomy).all()
    }
    name_to_dbentity_id = {
        row.systematic_name: row.dbentity_id
        for row in nex_session.query(Locusdbentity).all()
    }
    format_name_to_id = {
        row.format_name: row.proteindomain_id
        for row in nex_session.query(Proteindomain).all()
    }

    source_id = interpro.source_id

    # Annotations already stored, keyed the same way as new candidates.
    existing = {
        (row.dbentity_id, row.proteindomain_id, row.start_index,
         row.end_index, row.taxonomy_id): row
        for row in nex_session.query(Proteindomainannotation).all()
    }

    domain_fh = open(domain_file)

    strain_to_taxon_mapping = get_strain_taxid_mapping()

    # Keys inserted during this run.
    inserted = set()
    for raw_line in domain_fh:
        columns = raw_line.strip().split("\t")
        id_parts = columns[0].split('_')
        name = id_parts[0]
        strain = id_parts[2]

        taxon = strain_to_taxon_mapping.get(strain)
        if taxon is None:
            print("The strain = " + strain + " is not in the mapping module.")
            continue

        taxonomy_id = taxon_to_taxonomy_id.get(taxon)
        if taxonomy_id is None:
            print("The taxid = " + taxon + " is not in the database.")
            continue

        dbentity_id = name_to_dbentity_id.get(name)
        if dbentity_id is None:
            print("The systematic_name ", name,
                  " is not in the LOCUSDBENTITY table.")
            continue

        domain_name = columns[4].replace(' ', '_')
        proteindomain_id = format_name_to_id.get(domain_name)
        if proteindomain_id is None:
            print("The domain name:", domain_name,
                  " is not in the PROTEINDOMAIN table.")
            continue

        start = int(columns[6])
        end = int(columns[7])
        date_parts = columns[10].split('-')
        run_date = '-'.join([date_parts[2], date_parts[1], date_parts[0]])

        key = (dbentity_id, proteindomain_id, start, end, taxonomy_id)
        if key in existing or key in inserted:
            continue

        insert_annotation(nex_session, fw, dbentity_id, proteindomain_id,
                          source_id, taxonomy_id, start, end, run_date)
        nex_session.commit()
        inserted.add(key)

    domain_fh.close()

    nex_session.commit()