Esempio n. 1
0
def store_proteins_descriptions(pgdb, fastafn, tsvfn, mapfn, header, decoy,
                                fastadelim, genefield):
    """Store protein accessions and their gene annotation in ``pgdb``.

    With a FASTA file, proteins, evidences and sequences are read from it
    and stored; when no mapping file is given, genes and descriptions are
    also taken from the FASTA file. Without a FASTA file, the accessions
    are collected from the PSM table ``tsvfn`` instead.

    When ``mapfn`` is given, a gene map is built with
    ``get_protein_gene_map`` and stored. It is keyed on the part of each
    accession before its first '.', and for decoys the decoy modifier
    string is removed from those keys as well.

    Returns the set of first elements of all stored protein entries.
    """
    if fastafn:
        proteins, sequences, evidences = fastareader.get_proteins_for_db(
            fastafn)
        # Materialize: the entries are iterated again further down.
        proteins = list(proteins)
        pgdb.store_proteins(proteins, evidences, sequences)
        if not mapfn:
            gene_rows = []
            description_rows = []
            for assoc in fastareader.get_proteins_genes(fastafn, fastadelim,
                                                        genefield):
                gene_rows.append((assoc[1], assoc[0]))
                description_rows.append((assoc[0], assoc[3]))
            pgdb.store_descriptions(description_rows)
            pgdb.store_genes(gene_rows)
    else:
        # A dict is used as an insertion-ordered set of accessions.
        seen = {}
        for psm in tsvreader.generate_tsv_psms(tsvfn, header):
            seen.update(
                dict.fromkeys(tsvreader.get_proteins_from_psm(psm), 1))
        proteins = [(accession,) for accession in seen]
        pgdb.store_proteins(proteins)
    if mapfn:
        by_accession = {entry[0].split('.')[0]: entry for entry in proteins}
        if decoy:
            # The decoy modifier is derived from the first accession only.
            decoy_mod = fastareader.get_decoy_mod_string(proteins[0][0])
            by_accession = {
                accession.replace(decoy_mod, ''): entry
                for accession, entry in by_accession.items()
            }
        pgdb.store_gene_and_associated_id(
            get_protein_gene_map(mapfn, by_accession, decoy))
    return {entry[0] for entry in proteins}
Esempio n. 2
0
def add_genes_to_psm_table(psms, pgdb):
    """Yield a copy of every PSM with gene annotation columns added.

    The protein/gene map is fetched once from ``pgdb``. For each PSM, the
    genes, symbols and descriptions looked up for its proteins are each
    joined with ';' and stored under the ``mzidtsvdata`` gene, symbol and
    description headers. The incoming PSM dicts are not modified.
    """
    protein_gene_map = pgdb.get_protein_gene_map()
    for psm in psms:
        annotated = dict(psm.items())
        accessions = tsvreader.get_proteins_from_psm(psm)
        gene_field = ';'.join(get_genes(accessions, protein_gene_map))
        symbols = get_symbols(accessions, protein_gene_map)
        descriptions = get_descriptions(accessions, protein_gene_map)
        annotated.update({
            mzidtsvdata.HEADER_GENE: gene_field,
            mzidtsvdata.HEADER_SYMBOL: ';'.join(symbols),
            mzidtsvdata.HEADER_DESCRIPTION: ';'.join(descriptions),
        })
        yield annotated
Esempio n. 3
0
def add_genes_to_psm_table(psmfn, oldheader, pgdb):
    """Yield the PSMs read from ``psmfn`` with gene annotation columns added.

    PSMs come from ``tsvreader.generate_tsv_psms(psmfn, oldheader)``. Each
    yielded dict is a copy of the PSM with three extra fields: the genes,
    symbols and descriptions found for its proteins in the protein/gene
    map from ``pgdb``, each joined with ';'.
    """
    lookup = pgdb.get_protein_gene_map()
    for row in tsvreader.generate_tsv_psms(psmfn, oldheader):
        result = dict(row.items())
        row_proteins = tsvreader.get_proteins_from_psm(row)
        result[mzidtsvdata.HEADER_GENE] = ';'.join(
            get_genes(row_proteins, lookup))
        row_symbols = get_symbols(row_proteins, lookup)
        row_descriptions = get_descriptions(row_proteins, lookup)
        result[mzidtsvdata.HEADER_SYMBOL] = ';'.join(row_symbols)
        result[mzidtsvdata.HEADER_DESCRIPTION] = ';'.join(row_descriptions)
        yield result
Esempio n. 4
0
def store_proteins_descriptions(pgdb, fastafn, fastamd5, tsvfn, header,
                                fastadelim, genefield):
    """Store proteins in ``pgdb`` from a FASTA file or from a PSM table.

    With a FASTA file, everything returned by
    ``fastareader.get_proteins_for_db`` is handed to ``pgdb.store_fasta``
    together with the file name and ``fastamd5``. Without one, the protein
    accessions found in the PSM table ``tsvfn`` are stored as 1-tuples.

    Returns the set of first elements of the protein entries.
    """
    if fastafn:
        fasta_content = fastareader.get_proteins_for_db(
            fastafn, fastadelim, genefield)
        prots, seqs, desc, evids, ensgs, symbols = fasta_content
        # NOTE(review): prots is iterated again for the return value below,
        # so this presumably relies on it being re-iterable (not a one-shot
        # generator) - confirm against fastareader.
        pgdb.store_fasta(fastafn, fastamd5, prots, evids, seqs, desc, ensgs,
                         symbols)
    else:
        # A dict is used as an insertion-ordered set of accessions.
        seen = {}
        for psm in tsvreader.generate_split_tsv_lines(tsvfn, header):
            for accession in tsvreader.get_proteins_from_psm(psm):
                seen[accession] = 1
        prots = [(accession,) for accession in seen]
        pgdb.store_proteins(prots)
    return {entry[0] for entry in prots}
Esempio n. 5
0
def generate_psms_with_proteingroups(psms, pgdb, specfncol, unroll):
    """Yield each PSM extended with its protein group columns.

    Protein group rows come from ``pgdb.get_all_psms_proteingroups`` and
    are matched to PSMs by position: element 0 of a row is compared with
    the 0-based index of the PSM in ``psms``, and rows are consumed in step
    with the PSMs (presumably the lookup returns them ordered by that row
    number - confirm against the lookup).

    Args:
        psms: iterable of PSM dicts.
        pgdb: lookup object providing the protein group data.
        specfncol: passed to ``tsvreader.get_psm_id`` when ``unroll`` is
            set.
        unroll: when true, the PSM's proteins are fetched via
            ``get_all_proteins_from_unrolled_psm`` instead of being read
            from the PSM itself.

    Yields:
        A dict with the master protein, protein group content and protein
        hit count fields (each ';'-joined), updated with the PSM's own
        fields, so the PSM's values win on any key clash.
    """
    # Sentinel row used once the lookup is exhausted; its row number can
    # never equal a real (non-negative) PSM index.
    exhausted = [-1]
    use_evi = pgdb.check_evidence_tables()
    all_protein_group_content = pgdb.get_all_psms_proteingroups(use_evi)
    # Use a default here: a bare next() on an empty lookup raises
    # StopIteration, which inside a generator surfaces as RuntimeError.
    pg_row = next(all_protein_group_content, exhausted)
    for rownr, psm in enumerate(psms):
        if unroll:
            psm_id = tsvreader.get_psm_id(psm, specfncol)
            lineproteins = get_all_proteins_from_unrolled_psm(psm_id, pgdb)
        else:
            lineproteins = tsvreader.get_proteins_from_psm(psm)
        # Collect this PSM's rows, grouped by their master protein.
        proteins_in_groups = {}
        while pg_row[0] == rownr:
            proteins_in_groups.setdefault(
                pg_row[lookups.MASTER_INDEX], []).append(pg_row)
            pg_row = next(all_protein_group_content, exhausted)
        sorted_pgs = sort_protein_groups(proteins_in_groups, use_evi)
        psm_masters = []
        psm_pg_proteins = []
        for master, group in sorted(sorted_pgs.items()):
            psm_masters.append(master)
            psm_pg_proteins.append(
                [member[lookups.PROTEIN_ACC_INDEX] for member in group])
        outpsm = {
            mzidtsvdata.HEADER_MASTER_PROT:
            ';'.join(psm_masters),
            mzidtsvdata.HEADER_PG_CONTENT:
            ';'.join(','.join(accessions) for accessions in psm_pg_proteins),
            mzidtsvdata.HEADER_PG_AMOUNT_PROTEIN_HITS:
            ';'.join(count_protein_group_hits(lineproteins, psm_pg_proteins))
        }
        outpsm.update(psm)
        yield outpsm
Esempio n. 6
0
def generate_psms_with_proteingroups(fn, oldheader, newheader, pgdb,
                                     unroll=False):
    """Yield each PSM read from ``fn`` extended with protein group columns.

    PSMs come from ``tsvreader.generate_tsv_psms(fn, oldheader)``. Protein
    group rows come from ``pgdb.get_all_psms_proteingroups`` and are
    matched to PSMs by position: element 0 of a row is compared with the
    0-based index of the PSM, and rows are consumed in step with the PSMs
    (presumably the lookup returns them ordered by that row number -
    confirm against the lookup).

    Args:
        fn: PSM table file name.
        oldheader: header passed to the TSV reader.
        newheader: not used in this function; kept for the callers'
            signature.
        pgdb: lookup object providing the protein group data.
        unroll: when true, the PSM's proteins are fetched via
            ``get_all_proteins_from_unrolled_psm`` instead of being read
            from the PSM itself.

    Yields:
        A dict with the master protein, protein group content and protein
        hit count fields (each ';'-joined), updated with the PSM's own
        fields, so the PSM's values win on any key clash.
    """
    # Sentinel row used once the lookup is exhausted; its row number can
    # never equal a real (non-negative) PSM index.
    exhausted = [-1]
    use_evi = pgdb.check_evidence_tables()
    all_protein_group_content = pgdb.get_all_psms_proteingroups(use_evi)
    # Use a default here: a bare next() on an empty lookup raises
    # StopIteration, which inside a generator surfaces as RuntimeError.
    pg_row = next(all_protein_group_content, exhausted)
    for rownr, psm in enumerate(tsvreader.generate_tsv_psms(fn, oldheader)):
        if unroll:
            psm_id = tsvreader.get_psm_id(psm)
            lineproteins = get_all_proteins_from_unrolled_psm(psm_id, pgdb)
        else:
            lineproteins = tsvreader.get_proteins_from_psm(psm)
        # Collect this PSM's rows, grouped by their master protein.
        proteins_in_groups = {}
        while pg_row[0] == rownr:
            proteins_in_groups.setdefault(
                pg_row[lookups.MASTER_INDEX], []).append(pg_row)
            pg_row = next(all_protein_group_content, exhausted)
        sorted_pgs = sorters.sort_protein_groups(proteins_in_groups, use_evi)
        psm_masters = []
        psm_pg_proteins = []
        for master, group in sorted_pgs.items():
            psm_masters.append(master)
            psm_pg_proteins.append([member[lookups.PROTEIN_ACC_INDEX]
                                    for member in group])
        outpsm = {mzidtsvdata.HEADER_MASTER_PROT: ';'.join(psm_masters),
                  mzidtsvdata.HEADER_PG_CONTENT: ';'.join(
                      ','.join(accessions) for accessions in psm_pg_proteins),
                  mzidtsvdata.HEADER_PG_AMOUNT_PROTEIN_HITS: ';'.join(
                      count_protein_group_hits(lineproteins, psm_pg_proteins))
                  }
        outpsm.update(psm)
        yield outpsm
Esempio n. 7
0
def store_proteins_descriptions(pgdb, fastafn, tsvfn, mapfn, header, decoy,
                                fastadelim, genefield):
    """Store protein accessions and their gene annotation in ``pgdb``.

    Protein source:
      * no FASTA file - accessions are collected from the PSM table
        ``tsvfn`` and stored as 1-tuples;
      * FASTA file - proteins, evidences and sequences are read from it
        and stored.

    Gene annotation source:
      * FASTA file and no mapping file - genes and descriptions are read
        with ``fastareader.get_proteins_genes`` and stored;
      * mapping file - a gene map from ``get_protein_gene_map`` is stored.
        It is keyed on the part of each accession before its first '.',
        with the decoy modifier string removed when ``decoy`` is set.

    Returns the set of first elements of all stored protein entries.
    """
    if not fastafn:
        # A dict is used as an insertion-ordered set of accessions.
        unique_accessions = {}
        for psm in tsvreader.generate_tsv_psms(tsvfn, header):
            for accession in tsvreader.get_proteins_from_psm(psm):
                unique_accessions[accession] = 1
        proteins = [(accession, ) for accession in unique_accessions]
        pgdb.store_proteins(proteins)
    else:
        fasta_proteins, sequences, evidences = (
            fastareader.get_proteins_for_db(fastafn))
        # Materialize: the entries are iterated again further down.
        proteins = list(fasta_proteins)
        pgdb.store_proteins(proteins, evidences, sequences)
        if not mapfn:
            associations = list(
                fastareader.get_proteins_genes(fastafn, fastadelim,
                                               genefield))
            descriptions = [(assoc[0], assoc[3]) for assoc in associations]
            genes = [(assoc[1], assoc[0]) for assoc in associations]
            pgdb.store_descriptions(descriptions)
            pgdb.store_genes(genes)
    if mapfn:
        keyed_proteins = {
            entry[0].split('.')[0]: entry
            for entry in proteins
        }
        if decoy:
            # The decoy modifier is derived from the first accession only.
            modifier = fastareader.get_decoy_mod_string(proteins[0][0])
            keyed_proteins = {
                key.replace(modifier, ''): entry
                for key, entry in keyed_proteins.items()
            }
        gene_map = get_protein_gene_map(mapfn, keyed_proteins, decoy)
        pgdb.store_gene_and_associated_id(gene_map)
    return {entry[0] for entry in proteins}