def store_proteins_descriptions(pgdb, fastafn, tsvfn, mapfn, header, decoy,
                                fastadelim, genefield):
    """Store proteins and their gene/description annotations in the lookup.

    Proteins come from the FASTA file when ``fastafn`` is given, otherwise
    they are collected from the PSM table ``tsvfn``. When a FASTA is given
    without a mapping file, gene and description records are read from the
    FASTA and stored. When ``mapfn`` is given, the protein/gene map built
    from it by ``get_protein_gene_map`` is stored instead.

    Args:
        pgdb: lookup object providing ``store_proteins``,
            ``store_descriptions``, ``store_genes`` and
            ``store_gene_and_associated_id``.
        fastafn: FASTA file name, or a falsy value to read the proteins
            from the PSM table.
        tsvfn: PSM table file name; only read when ``fastafn`` is falsy.
        mapfn: mapping file name handed to ``get_protein_gene_map``, or a
            falsy value.
        header: header handed to ``tsvreader.generate_tsv_psms``.
        decoy: when truthy, the string returned by
            ``fastareader.get_decoy_mod_string`` is removed from the map
            keys; the flag is also forwarded to ``get_protein_gene_map``.
        fastadelim: forwarded to ``fastareader.get_proteins_genes``.
        genefield: forwarded to ``fastareader.get_proteins_genes``.

    Returns:
        The set of first elements of the stored protein records.
    """
    if not fastafn:
        # A dict (not a set) keeps the first-seen order of the accessions.
        seen = {}
        for psm in tsvreader.generate_tsv_psms(tsvfn, header):
            for accession in tsvreader.get_proteins_from_psm(psm):
                seen[accession] = 1
        proteins = [(accession,) for accession in seen]
        pgdb.store_proteins(proteins)
    else:
        proteins, sequences, evidences = fastareader.get_proteins_for_db(
            fastafn)
        # Materialise: the records are iterated again further down.
        proteins = list(proteins)
        pgdb.store_proteins(proteins, evidences, sequences)
        if not mapfn:
            associations = fastareader.get_proteins_genes(
                fastafn, fastadelim, genefield)
            genes, descriptions = [], []
            # Single pass, since associations may be a one-shot iterable.
            for assoc in associations:
                genes.append((assoc[1], assoc[0]))
                descriptions.append((assoc[0], assoc[3]))
            pgdb.store_descriptions(descriptions)
            pgdb.store_genes(genes)
    if mapfn:
        # Key on the accession with everything after the first '.' removed.
        proteins_with_versions = {
            protein[0].split('.')[0]: protein for protein in proteins}
        # Without any protein there is nothing to derive the decoy string
        # from (proteins[0][0] would raise IndexError) and nothing to rename.
        if decoy and proteins:
            mod = fastareader.get_decoy_mod_string(proteins[0][0])
            proteins_with_versions = {
                key.replace(mod, ''): value
                for key, value in proteins_with_versions.items()}
        gpmap = get_protein_gene_map(mapfn, proteins_with_versions, decoy)
        pgdb.store_gene_and_associated_id(gpmap)
    return {protein[0] for protein in proteins}
def add_genes_to_psm_table(psms, pgdb):
    """Yield a copy of every PSM with gene, symbol and description fields.

    Args:
        psms: iterable of PSM mappings.
        pgdb: lookup object; its ``get_protein_gene_map()`` result is
            fetched once and handed to ``get_genes``, ``get_symbols`` and
            ``get_descriptions``.

    Yields:
        A new dict per PSM holding the original items plus the
        ``HEADER_GENE``, ``HEADER_SYMBOL`` and ``HEADER_DESCRIPTION``
        entries, each a ';'-joined string.
    """
    gene_lookup = pgdb.get_protein_gene_map()
    for psm in psms:
        # Copy so the incoming PSM is left untouched.
        annotated = dict(psm.items())
        accessions = tsvreader.get_proteins_from_psm(psm)
        gene_field = ';'.join(get_genes(accessions, gene_lookup))
        annotated[mzidtsvdata.HEADER_GENE] = gene_field
        symbol_field = ';'.join(get_symbols(accessions, gene_lookup))
        description_field = ';'.join(
            get_descriptions(accessions, gene_lookup))
        annotated[mzidtsvdata.HEADER_SYMBOL] = symbol_field
        annotated[mzidtsvdata.HEADER_DESCRIPTION] = description_field
        yield annotated
def add_genes_to_psm_table(psmfn, oldheader, pgdb):
    """Yield a copy of every PSM in a table with gene annotation fields.

    Args:
        psmfn: PSM table file name handed to
            ``tsvreader.generate_tsv_psms``.
        oldheader: header handed to ``tsvreader.generate_tsv_psms``.
        pgdb: lookup object; its ``get_protein_gene_map()`` result is
            fetched once and handed to ``get_genes``, ``get_symbols`` and
            ``get_descriptions``.

    Yields:
        A new dict per PSM holding the original items plus the
        ``HEADER_GENE``, ``HEADER_SYMBOL`` and ``HEADER_DESCRIPTION``
        entries, each a ';'-joined string.
    """
    gene_lookup = pgdb.get_protein_gene_map()
    for psm in tsvreader.generate_tsv_psms(psmfn, oldheader):
        # Copy so the PSM produced by the reader is left untouched.
        annotated = dict(psm.items())
        accessions = tsvreader.get_proteins_from_psm(psm)
        gene_field = ';'.join(get_genes(accessions, gene_lookup))
        annotated[mzidtsvdata.HEADER_GENE] = gene_field
        symbol_field = ';'.join(get_symbols(accessions, gene_lookup))
        description_field = ';'.join(
            get_descriptions(accessions, gene_lookup))
        annotated[mzidtsvdata.HEADER_SYMBOL] = symbol_field
        annotated[mzidtsvdata.HEADER_DESCRIPTION] = description_field
        yield annotated
def store_proteins_descriptions(pgdb, fastafn, fastamd5, tsvfn, header,
                                fastadelim, genefield):
    """Store the proteins of a FASTA file, or of a PSM table, in the lookup.

    Args:
        pgdb: lookup object providing ``store_fasta`` and
            ``store_proteins``.
        fastafn: FASTA file name, or a falsy value to collect the proteins
            from the PSM table instead.
        fastamd5: forwarded to ``pgdb.store_fasta`` together with
            ``fastafn``.
        tsvfn: PSM table file name; only read when ``fastafn`` is falsy.
        header: header handed to ``tsvreader.generate_split_tsv_lines``.
        fastadelim: forwarded to ``fastareader.get_proteins_for_db``.
        genefield: forwarded to ``fastareader.get_proteins_for_db``.

    Returns:
        The set of first elements of the protein records.
    """
    if fastafn:
        fasta_records = fastareader.get_proteins_for_db(
            fastafn, fastadelim, genefield)
        prots, seqs, desc, evids, ensgs, symbols = fasta_records
        pgdb.store_fasta(fastafn, fastamd5, prots, evids, seqs, desc, ensgs,
                         symbols)
    else:
        # A dict (not a set) keeps the first-seen order of the accessions.
        seen = {}
        for psm in tsvreader.generate_split_tsv_lines(tsvfn, header):
            for accession in tsvreader.get_proteins_from_psm(psm):
                seen[accession] = 1
        prots = [(accession,) for accession in seen]
        pgdb.store_proteins(prots)
    return {record[0] for record in prots}
def generate_psms_with_proteingroups(psms, pgdb, specfncol, unroll):
    """Yield every PSM extended with its protein group fields.

    Protein group rows are pulled sequentially from
    ``pgdb.get_all_psms_proteingroups`` and attached to the PSM whose
    running row number (starting at 0) equals the first field of the row.
    Because rows are only consumed in sequence, this relies on them
    arriving grouped in PSM order.

    Args:
        psms: iterable of PSM mappings.
        pgdb: lookup object providing ``check_evidence_tables`` and
            ``get_all_psms_proteingroups``; also handed to
            ``get_all_proteins_from_unrolled_psm``.
        specfncol: forwarded to ``tsvreader.get_psm_id`` when ``unroll``
            is truthy.
        unroll: when truthy the proteins of a PSM are fetched through its
            PSM id, otherwise they are read from the PSM itself.

    Yields:
        A dict per PSM with the ``HEADER_MASTER_PROT``,
        ``HEADER_PG_CONTENT`` and ``HEADER_PG_AMOUNT_PROTEIN_HITS`` fields
        plus all items of the PSM. PSM items win on a key collision.
    """
    # Stand-in row used once the lookup runs dry; -1 never equals a rownr.
    exhausted = [-1]
    rownr = 0
    use_evi = pgdb.check_evidence_tables()
    all_protein_group_content = pgdb.get_all_psms_proteingroups(use_evi)
    # next() with a default: an empty lookup must not leak StopIteration
    # out of this generator, where it would surface as a RuntimeError.
    protein = next(all_protein_group_content, exhausted)
    for psm in psms:
        if unroll:
            psm_id = tsvreader.get_psm_id(psm, specfncol)
            lineproteins = get_all_proteins_from_unrolled_psm(psm_id, pgdb)
        else:
            lineproteins = tsvreader.get_proteins_from_psm(psm)
        # Collect the rows of this PSM, keyed by their master protein.
        proteins_in_groups = {}
        while protein[0] == rownr:
            master_key = protein[lookups.MASTER_INDEX]
            proteins_in_groups.setdefault(master_key, []).append(protein)
            protein = next(all_protein_group_content, exhausted)
        sorted_pgs = sort_protein_groups(proteins_in_groups, use_evi)
        psm_masters = []
        psm_pg_proteins = []
        for master, group in sorted(sorted_pgs.items()):
            psm_masters.append(master)
            psm_pg_proteins.append(
                [member[lookups.PROTEIN_ACC_INDEX] for member in group])
        outpsm = {
            mzidtsvdata.HEADER_MASTER_PROT: ';'.join(psm_masters),
            # Groups are ';'-separated, accessions in a group ','-separated.
            mzidtsvdata.HEADER_PG_CONTENT: ';'.join(
                ','.join(accessions) for accessions in psm_pg_proteins),
            mzidtsvdata.HEADER_PG_AMOUNT_PROTEIN_HITS: ';'.join(
                count_protein_group_hits(lineproteins, psm_pg_proteins)),
        }
        outpsm.update(psm)
        rownr += 1
        yield outpsm
def generate_psms_with_proteingroups(fn, oldheader, newheader, pgdb,
                                     unroll=False):
    """Yield every PSM of a table extended with its protein group fields.

    Protein group rows are pulled sequentially from
    ``pgdb.get_all_psms_proteingroups`` and attached to the PSM whose
    running row number (starting at 0) equals the first field of the row.
    Because rows are only consumed in sequence, this relies on them
    arriving grouped in PSM order.

    Args:
        fn: PSM table file name handed to ``tsvreader.generate_tsv_psms``.
        oldheader: header handed to ``tsvreader.generate_tsv_psms``.
        newheader: not used by this function; kept so existing callers
            keep working.
        pgdb: lookup object providing ``check_evidence_tables`` and
            ``get_all_psms_proteingroups``; also handed to
            ``get_all_proteins_from_unrolled_psm``.
        unroll: when truthy the proteins of a PSM are fetched through its
            PSM id, otherwise they are read from the PSM itself.

    Yields:
        A dict per PSM with the ``HEADER_MASTER_PROT``,
        ``HEADER_PG_CONTENT`` and ``HEADER_PG_AMOUNT_PROTEIN_HITS`` fields
        plus all items of the PSM. PSM items win on a key collision.
    """
    # Stand-in row used once the lookup runs dry; -1 never equals a rownr.
    exhausted = [-1]
    rownr = 0
    use_evi = pgdb.check_evidence_tables()
    all_protein_group_content = pgdb.get_all_psms_proteingroups(use_evi)
    # next() with a default: an empty lookup must not leak StopIteration
    # out of this generator, where it would surface as a RuntimeError.
    protein = next(all_protein_group_content, exhausted)
    for psm in tsvreader.generate_tsv_psms(fn, oldheader):
        if unroll:
            psm_id = tsvreader.get_psm_id(psm)
            lineproteins = get_all_proteins_from_unrolled_psm(psm_id, pgdb)
        else:
            lineproteins = tsvreader.get_proteins_from_psm(psm)
        # Collect the rows of this PSM, keyed by their master protein.
        proteins_in_groups = {}
        while protein[0] == rownr:
            master_key = protein[lookups.MASTER_INDEX]
            proteins_in_groups.setdefault(master_key, []).append(protein)
            protein = next(all_protein_group_content, exhausted)
        sorted_pgs = sorters.sort_protein_groups(proteins_in_groups, use_evi)
        psm_masters = []
        psm_pg_proteins = []
        for master, group in sorted_pgs.items():
            psm_masters.append(master)
            psm_pg_proteins.append(
                [member[lookups.PROTEIN_ACC_INDEX] for member in group])
        outpsm = {
            mzidtsvdata.HEADER_MASTER_PROT: ';'.join(psm_masters),
            # Groups are ';'-separated, accessions in a group ','-separated.
            mzidtsvdata.HEADER_PG_CONTENT: ';'.join(
                ','.join(accessions) for accessions in psm_pg_proteins),
            mzidtsvdata.HEADER_PG_AMOUNT_PROTEIN_HITS: ';'.join(
                count_protein_group_hits(lineproteins, psm_pg_proteins)),
        }
        outpsm.update(psm)
        rownr += 1
        yield outpsm
def store_proteins_descriptions(pgdb, fastafn, tsvfn, mapfn, header, decoy,
                                fastadelim, genefield):
    """Store proteins and their gene/description annotations in the lookup.

    Proteins come from the FASTA file when ``fastafn`` is given, otherwise
    they are collected from the PSM table ``tsvfn``. When a FASTA is given
    without a mapping file, gene and description records are read from the
    FASTA and stored. When ``mapfn`` is given, the protein/gene map built
    from it by ``get_protein_gene_map`` is stored instead.

    Args:
        pgdb: lookup object providing ``store_proteins``,
            ``store_descriptions``, ``store_genes`` and
            ``store_gene_and_associated_id``.
        fastafn: FASTA file name, or a falsy value to read the proteins
            from the PSM table.
        tsvfn: PSM table file name; only read when ``fastafn`` is falsy.
        mapfn: mapping file name handed to ``get_protein_gene_map``, or a
            falsy value.
        header: header handed to ``tsvreader.generate_tsv_psms``.
        decoy: when truthy, the string returned by
            ``fastareader.get_decoy_mod_string`` is removed from the map
            keys; the flag is also forwarded to ``get_protein_gene_map``.
        fastadelim: forwarded to ``fastareader.get_proteins_genes``.
        genefield: forwarded to ``fastareader.get_proteins_genes``.

    Returns:
        The set of first elements of the stored protein records.
    """
    if not fastafn:
        # A dict (not a set) keeps the first-seen order of the accessions.
        seen = {}
        for psm in tsvreader.generate_tsv_psms(tsvfn, header):
            for accession in tsvreader.get_proteins_from_psm(psm):
                seen[accession] = 1
        proteins = [(accession,) for accession in seen]
        pgdb.store_proteins(proteins)
    else:
        proteins, sequences, evidences = fastareader.get_proteins_for_db(
            fastafn)
        # Materialise: the records are iterated again further down.
        proteins = list(proteins)
        pgdb.store_proteins(proteins, evidences, sequences)
        if not mapfn:
            associations = fastareader.get_proteins_genes(
                fastafn, fastadelim, genefield)
            genes, descriptions = [], []
            # Single pass, since associations may be a one-shot iterable.
            for assoc in associations:
                genes.append((assoc[1], assoc[0]))
                descriptions.append((assoc[0], assoc[3]))
            pgdb.store_descriptions(descriptions)
            pgdb.store_genes(genes)
    if mapfn:
        # Key on the accession with everything after the first '.' removed.
        proteins_with_versions = {
            protein[0].split('.')[0]: protein for protein in proteins}
        # Without any protein there is nothing to derive the decoy string
        # from (proteins[0][0] would raise IndexError) and nothing to rename.
        if decoy and proteins:
            mod = fastareader.get_decoy_mod_string(proteins[0][0])
            proteins_with_versions = {
                key.replace(mod, ''): value
                for key, value in proteins_with_versions.items()}
        gpmap = get_protein_gene_map(mapfn, proteins_with_versions, decoy)
        pgdb.store_gene_and_associated_id(gpmap)
    return {protein[0] for protein in proteins}