def run(motif_csv, test_fasta, out_html, aa_probs,
        variant='fixed', alpha=1.0, beta=0.01):
    """Score every sequence in test_fasta against a motif PSSM.

    Builds a log-odds PSSM from motif_csv (via get_motif_log_odds),
    writes an HTML report of per-position scores to out_html, and a
    CSV of positive-scoring windows to out_html + '.csv'.

    Args:
        motif_csv: CSV file defining the motif; a '.pssm' copy of the
            log-odds matrix is written next to it.
        test_fasta: FASTA file of sequences to scan.
        out_html: output HTML report path.
        aa_probs: background amino-acid probabilities for the log-odds.
        variant, alpha, beta: passed through to get_motif_log_odds.
    """
    log_odds_pssm = get_motif_log_odds(
        motif_csv, aa_probs, alpha=alpha, beta=beta, variant=variant)
    write_log_odds(
        log_odds_pssm,
        motif_csv.replace('.pssm', '').replace('.csv', '.pssm'))
    # Number of positions in the motif (keys of the PSSM dict)
    n_position = len(log_odds_pssm)
    seqids, fastas = uniprot.read_fasta(test_fasta)

    saved_scores = []
    html = "<body>"
    for seqid, entry in fastas.items():
        test_seq = entry['sequence']
        scores = score_full_sequence(log_odds_pssm, test_seq)
        html += "<h1>%s</h1>" % entry['description']
        html += make_html_for_seq_scores(test_seq, n_position, scores)
        saved_scores.append({
            'seqid': seqid,
            'description': entry['description'],
            'scores': scores,
            'seq': test_seq
        })
    html += "</body>"
    # Use a context manager so the report file is always closed
    with open(out_html, 'w') as f:
        f.write(html)

    titles = ['seqid', 'description', 'position', 'seq', 'score']
    rows = [titles]
    for entry in saved_scores:
        n_seq = len(entry['seq'])
        # NOTE(review): this scans windows 0 .. n_seq - n_position - 1;
        # whether the final window is intentionally excluded depends on
        # score_full_sequence's output length - confirm before changing.
        for i in range(0, n_seq - n_position):
            score = entry['scores'][i]
            pos = i + 1  # 1-based position for the report
            if score > 0:
                # BUG FIX: row order now matches the header row -
                # previously 'score' and 'seq' were swapped.
                row = [
                    entry['seqid'],
                    entry['description'],
                    pos,
                    entry['seq'][i:i + n_position],
                    score,
                ]
                rows.append(row)
    datafile.write_csv(rows, out_html + '.csv')
def count_aa_in_fasta(fasta, counts_txt):
    """Count amino-acid occurrences across all sequences in a FASTA file.

    The result is cached in counts_txt (as repr() of a plain dict); if
    that file already exists it is parsed and returned without reading
    the FASTA at all.

    Args:
        fasta: path to the FASTA file (read via uniprot.read_fasta).
        counts_txt: path of the cache file to read/write.

    Returns:
        dict mapping single-character residue -> occurrence count.
    """
    import ast
    import collections

    if os.path.isfile(counts_txt):
        with open(counts_txt) as f:
            # SECURITY FIX: ast.literal_eval safely parses the repr()
            # written below; eval() would execute arbitrary code if the
            # cache file were tampered with.
            return ast.literal_eval(f.read())

    seqids, fastas = uniprot.read_fasta(fasta)
    counter = collections.Counter()
    for entry in fastas.values():
        counter.update(entry['sequence'])
    # Store as a plain dict so the cache round-trips via literal_eval
    counts = dict(counter)
    with open(counts_txt, 'w') as f:
        f.write(repr(counts))
    return counts
# Example script exercising the uniprot helper module: FASTA reading,
# identifier mapping, metadata download and isoform parsing.
# NOTE(review): requires network access and a local 'example.fasta'.
import os
import uniprot
import pprint
import sys

# Clean up caches from previous runs so the examples re-download
os.system('rm cache*')

# Example 1 - reading a fasta file
seqids, fastas = uniprot.read_fasta('example.fasta')
pprint.pprint(seqids, indent=2)

# Example 2 - map identifiers for RefSeq to Uniprot
seqids = "NP_000508.1 NP_001018081.3".split()
pairs = uniprot.batch_uniprot_id_mapping_pairs('P_REFSEQ_AC', 'ACC', seqids)
pprint.pprint(pairs, indent=2)

# Example 3 - get UniProt metadata (was mislabeled "Example 2")
# pairs are (source_id, uniprot_id); keep the UniProt side
uniprot_seqids = [j for i, j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 4 - parse for isoforms in metadata
# (reads the first metadata chunk written by batch_uniprot_metadata)
text = open('cache/metadata.0.txt').read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 5 - chaining commands to map seqids of mixed types
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
fname = 'KL-A1_pep15.csv' peptides = [] proteins = {} for row in csv.DictReader(open(fname)): seqid = row['Protein'] if seqid not in proteins: proteins[seqid] = [] proteins[seqid].append(row) row['matches'] = [] peptides.append(row) seqids, fasta = uniprot.read_fasta('use.fasta') for seqid, protein in proteins.items(): bare_seqid = clean_seqid(seqid) full_sequence = fasta[bare_seqid]['sequence'] for i_peptide1, peptide1 in enumerate(protein): seq1 = peptide1['Sequence'] i1 = full_sequence.find(seq1) peptide1['Start'] = i1 + 1 peptide1['End'] = i1 + len(seq1) for peptide2 in protein: if peptide1 == peptide2: continue seq2 = peptide2['Sequence'] i2 = full_sequence.find(seq2) match = {
# Demo of the uniprot helper module: read a FASTA file, then batch-map
# RefSeq identifiers to UniProt accessions.
# NOTE(review): needs network access and a local 'example.fasta'.
import os
import uniprot
import pprint

# Wipe outputs and caches left over from earlier runs
os.system("rm *output* *cache*")

# Example 1 - reading a fasta file
fasta_seqids, fasta_entries = uniprot.read_fasta("example.fasta")
pprint.pprint(fasta_seqids, indent=2)

# Example 2 - batch read identifier mappings with
# prespecified identifier types
refseq_ids = ['NP_000508.1', 'NP_001018081.3']
id_pairs = uniprot.batch_uniprot_id_mapping_pairs(
    "P_REFSEQ_AC", "ACC", refseq_ids)
pprint.pprint(id_pairs, indent=2)

# Example 3 - sequential identifier mapping to UniProt
# identifiers using robust but slow method
('%s%d' % (cell_type, i_repeat), color, i_source)) print source_sets protein = {} for exp, color, i_source in source_sets: fname = 'Data_PeptideOverlay/%s_motif.csv' % exp for entry in datafile.read_csv(fname): seqid = uniprot.parse_fasta_header(entry['Accessions'])[0] if seqid not in protein: protein[seqid] = default_protein() protein[seqid]['description'] = entry['Names'] source = protein[seqid]['sources'][i_source] source['color'] = color source['peptides'].append(entry['sequence']) seqids, fasta = uniprot.read_fasta('../db/uniprot_sprot.fasta') for seqid in protein: sequence = fasta[seqid]['sequence'] protein[seqid]['sequence'] = sequence protein[seqid]['length'] = len(sequence) for source in protein[seqid]['sources']: for peptide in source['peptides']: i = sequence.index(peptide) j = i + len(peptide) source['intervals'].append([i, j]) del source['peptides'] print 'write overlay%s.csv' % skip make_csv(protein, 'overlay%s.csv' % skip)
# Demo of the uniprot helper module: FASTA parsing followed by a batch
# RefSeq -> UniProt identifier mapping.
# NOTE(review): needs network access and a local 'example.fasta'.
import os
import uniprot
import pprint

# Clear any outputs and cached downloads from previous runs
os.system('rm *output* *cache*')

# Example 1 - reading a fasta file
seqid_list, fasta_entries = uniprot.read_fasta('example.fasta')
pprint.pprint(seqid_list, indent=2)

# Example 2 - batch read identifier mappings with
# prespecified identifier types
query_ids = 'NP_000508.1 NP_001018081.3'.split()
mapping_pairs = uniprot.batch_uniprot_id_mapping_pairs(
    'P_REFSEQ_AC', 'ACC', query_ids)
pprint.pprint(mapping_pairs, indent=2)

# Example 3 - sequential identifier mapping to UniProt
# identifiers using robust but slow method
# Walkthrough of the uniprot helper module: FASTA reading, identifier
# mapping, metadata retrieval and isoform parsing.
# NOTE(review): requires network access and a local 'example.fasta'.
import os
import uniprot
import pprint
import sys

# Start from a clean slate so the examples re-download everything
os.system('rm cache*')

# Example 1 - reading a fasta file
fasta_ids, fasta_entries = uniprot.read_fasta('example.fasta')
pprint.pprint(fasta_ids, indent=2)

# Example 2 - map identifiers for RefSeq to Uniprot
refseq_ids = "NP_000508.1 NP_001018081.3".split()
id_pairs = uniprot.batch_uniprot_id_mapping_pairs(
    'P_REFSEQ_AC', 'ACC', refseq_ids)
pprint.pprint(id_pairs, indent=2)

# Example 3 - get UniProt metadata (was mislabeled "Example 2")
# each pair is (source_id, uniprot_id); keep the UniProt side
mapped_uniprot_ids = [pair[1] for pair in id_pairs]
metadata = uniprot.batch_uniprot_metadata(mapped_uniprot_ids, 'cache')
pprint.pprint(metadata, indent=2)

# Example 4 - parse for isoforms in metadata
with open('cache/metadata.0.txt') as metadata_file:
    metadata_text = metadata_file.read()
isoform_data = uniprot.parse_isoforms(metadata_text)
pprint.pprint(isoform_data)

# Example 5 - chaining commands to map seqids of mixed types
mixed_ids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()