Ejemplo n.º 1
0
def run(
        motif_csv, test_fasta, out_html, aa_probs,
        variant='fixed',
        alpha=1.0, beta=0.01):
    """
    Scan every sequence in a FASTA file with a log-odds PSSM built from a
    motif-counts CSV, writing an HTML report and a companion CSV.

    Outputs:
      - a .pssm file next to motif_csv (suffix swapped from .csv)
      - out_html: per-sequence score visualization
      - out_html + '.csv': one row per window with a positive score

    Args:
        motif_csv: CSV of motif counts used to build the PSSM.
        test_fasta: FASTA file of sequences to scan.
        out_html: path for the HTML report; CSV goes to out_html + '.csv'.
        aa_probs: background amino-acid probabilities for the log-odds.
        variant: PSSM construction variant, passed to get_motif_log_odds.
        alpha, beta: pseudocount parameters, passed to get_motif_log_odds.
    """
    log_odds_pssm = get_motif_log_odds(
        motif_csv, aa_probs,
        alpha=alpha, beta=beta,
        variant=variant)

    # Persist the PSSM next to the input, swapping .csv for .pssm
    write_log_odds(
        log_odds_pssm,
        motif_csv.replace('.pssm', '').replace('.csv', '.pssm'))

    # Motif width = number of positions in the PSSM
    n_position = len(log_odds_pssm)

    seqids, fastas = uniprot.read_fasta(test_fasta)

    saved_scores = []

    # Build the HTML report, one <h1> section per sequence.
    # Parts are joined at the end instead of repeated `html +=`
    # (string += in a loop is quadratic).
    html_parts = ["<body>"]
    for seqid, entry in fastas.items():
        test_seq = entry['sequence']
        scores = score_full_sequence(log_odds_pssm, test_seq)
        html_parts.append("<h1>%s</h1>" % entry['description'])
        html_parts.append(make_html_for_seq_scores(test_seq, n_position, scores))
        saved_scores.append({
            'seqid': seqid,
            'description': entry['description'],
            'scores': scores,
            'seq': test_seq
        })
    html_parts.append("</body>")
    # Use a context manager so the handle is closed deterministically
    with open(out_html, 'w') as f:
        f.write(''.join(html_parts))

    # CSV of all windows with a positive log-odds score
    titles = ['seqid', 'description', 'position', 'seq', 'score']
    rows = [titles]
    for entry in saved_scores:
        n_seq = len(entry['seq'])
        # NOTE(review): this range skips the last window at
        # i = n_seq - n_position; kept as-is in case score_full_sequence
        # returns fewer entries than windows — confirm against that helper.
        for i in range(0, n_seq - n_position):
            score = entry['scores'][i]
            pos = i + 1
            if score > 0:
                # Bug fix: column order now matches `titles`
                # (the original wrote score before seq, so the header
                # mislabelled the last two columns).
                rows.append([
                    entry['seqid'],
                    entry['description'],
                    pos,
                    entry['seq'][i:i + n_position],
                    score,
                ])
    datafile.write_csv(rows, out_html + '.csv')
Ejemplo n.º 2
0
def count_aa_in_fasta(fasta, counts_txt):
    """
    Tally character (amino-acid) counts over all sequences in a FASTA file.

    The result is cached in counts_txt as a repr'd dict literal; if that
    file already exists it is parsed and returned without re-reading the
    FASTA file.

    Args:
        fasta: path to the FASTA file to count.
        counts_txt: cache file path; written with repr(counts) on a miss.

    Returns:
        dict mapping single-character residue -> occurrence count.
    """
    import ast

    if os.path.isfile(counts_txt):
        with open(counts_txt) as f:
            # Bug fix: ast.literal_eval instead of eval — the cache only
            # ever holds a plain dict literal, and eval() on file contents
            # would execute arbitrary code if the file were tampered with.
            return ast.literal_eval(f.read())

    seqids, fastas = uniprot.read_fasta(fasta)

    counts = {}
    for entry in fastas.values():
        for c in entry['sequence']:
            if c not in counts:
                counts[c] = 0
            counts[c] += 1
    # (dropped the original's unused running total of sequence lengths)

    with open(counts_txt, 'w') as f:
        f.write(repr(counts))

    return counts
Ejemplo n.º 3
0
# Demo script for the `uniprot` helper module: each numbered example
# fetches or parses data and pretty-prints the result. The mapping and
# metadata calls need network access to UniProt.
import os
import uniprot
import pprint
import sys  # NOTE(review): imported but unused in the visible snippet

# Clean up caches so every run re-fetches instead of reusing stale files
os.system('rm cache*')

# Example 1 - reading a fasta file
seqids, fastas = uniprot.read_fasta('example.fasta')
pprint.pprint(seqids, indent=2)

# Example 2 - map identifiers for RefSeq to Uniprot
seqids = "NP_000508.1  NP_001018081.3".split()
pairs = uniprot.batch_uniprot_id_mapping_pairs('P_REFSEQ_AC', 'ACC', seqids)
pprint.pprint(pairs, indent=2)

# Example 3 - get UniProt metadata (was mislabelled "Example 2")
# pairs are (refseq_id, uniprot_id) tuples; keep only the uniprot side
uniprot_seqids = [j for i, j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 4 - parse for isoforms in the metadata cached by the call above
text = open('cache/metadata.0.txt').read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 5 - chaining commands to map seqids of mixed identifier types
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()
uniprot_data = uniprot.get_metadata_with_some_seqid_conversions(
    seqids, 'cache2')
Ejemplo n.º 4
0

fname = 'KL-A1_pep15.csv'

peptides = []

proteins = {}
for row in csv.DictReader(open(fname)):
    seqid = row['Protein']
    if seqid not in proteins:
        proteins[seqid] = []
    proteins[seqid].append(row)
    row['matches'] = []
    peptides.append(row)

seqids, fasta = uniprot.read_fasta('use.fasta')

for seqid, protein in proteins.items():
    bare_seqid = clean_seqid(seqid)
    full_sequence = fasta[bare_seqid]['sequence']
    for i_peptide1, peptide1 in enumerate(protein):
        seq1 = peptide1['Sequence']
        i1 = full_sequence.find(seq1)
        peptide1['Start'] = i1 + 1
        peptide1['End'] = i1 + len(seq1)
        for peptide2 in protein:
            if peptide1 == peptide2:
                continue
            seq2 = peptide2['Sequence']
            i2 = full_sequence.find(seq2)
            match = {
Ejemplo n.º 5
0
# Demo script for the `uniprot` helper module: reads a FASTA file and
# batch-maps RefSeq identifiers to UniProt accessions. The mapping call
# needs network access to UniProt.
import os
import uniprot
import pprint


# Clean up caches and old output so every run starts fresh

os.system("rm *output* *cache*")


# Example 1 - reading a fasta file

seqids, fastas = uniprot.read_fasta("example.fasta")
pprint.pprint(seqids, indent=2)


# Example 2 - batch read identifier mappings with
# prespecified identifier types
# ('P_REFSEQ_AC' = RefSeq protein accession, 'ACC' = UniProt accession)

seqids = """
NP_000508.1  NP_001018081.3
""".split()

pairs = uniprot.batch_uniprot_id_mapping_pairs("P_REFSEQ_AC", "ACC", seqids)

pprint.pprint(pairs, indent=2)


# Example 3 - sequential identifier mapping to UniProt
# identifiers using robust but slow method
Ejemplo n.º 6
0
                ('%s%d' % (cell_type, i_repeat), color, i_source))
    print source_sets

    protein = {}
    for exp, color, i_source in source_sets:
        fname = 'Data_PeptideOverlay/%s_motif.csv' % exp
        for entry in datafile.read_csv(fname):
            seqid = uniprot.parse_fasta_header(entry['Accessions'])[0]
            if seqid not in protein:
                protein[seqid] = default_protein()
                protein[seqid]['description'] = entry['Names']
            source = protein[seqid]['sources'][i_source]
            source['color'] = color
            source['peptides'].append(entry['sequence'])

    seqids, fasta = uniprot.read_fasta('../db/uniprot_sprot.fasta')

    for seqid in protein:
        sequence = fasta[seqid]['sequence']
        protein[seqid]['sequence'] = sequence
        protein[seqid]['length'] = len(sequence)
        for source in protein[seqid]['sources']:
            for peptide in source['peptides']:
                i = sequence.index(peptide)
                j = i + len(peptide)
                source['intervals'].append([i, j])
            del source['peptides']

    print 'write overlay%s.csv' % skip
    make_csv(protein, 'overlay%s.csv' % skip)
Ejemplo n.º 7
0
# Demo script for the `uniprot` helper module: reads a FASTA file and
# batch-maps RefSeq identifiers to UniProt accessions. The mapping call
# needs network access to UniProt.
import os
import uniprot
import pprint


# Clean up caches and old output so every run starts fresh

os.system('rm *output* *cache*')


# Example 1 - reading a fasta file

seqids, fastas = uniprot.read_fasta('example.fasta')
pprint.pprint(seqids, indent=2)


# Example 2 - batch read identifier mappings with
# prespecified identifier types
# ('P_REFSEQ_AC' = RefSeq protein accession, 'ACC' = UniProt accession)

seqids = """
NP_000508.1  NP_001018081.3
""".split()

pairs = uniprot.batch_uniprot_id_mapping_pairs(
  'P_REFSEQ_AC', 'ACC', seqids)

pprint.pprint(pairs, indent=2)


# Example 3 - sequential identifier mapping to UniProt 
# identifiers using robust but slow method
Ejemplo n.º 8
0

fname = 'KL-A1_pep15.csv'

peptides = []

proteins = {}
for row in csv.DictReader(open(fname)):
    seqid = row['Protein']
    if seqid not in proteins:
        proteins[seqid] = []
    proteins[seqid].append(row)
    row['matches'] = []
    peptides.append(row)

seqids, fasta = uniprot.read_fasta('use.fasta')

for seqid, protein in proteins.items():
    bare_seqid = clean_seqid(seqid)
    full_sequence = fasta[bare_seqid]['sequence']
    for i_peptide1, peptide1 in enumerate(protein):
        seq1 = peptide1['Sequence']
        i1 = full_sequence.find(seq1)
        peptide1['Start'] = i1 + 1
        peptide1['End'] = i1 + len(seq1)
        for peptide2 in protein:
            if peptide1 == peptide2: 
                continue
            seq2 = peptide2['Sequence']
            i2 = full_sequence.find(seq2)
            match = {
Ejemplo n.º 9
0
# Demo script for the `uniprot` helper module: each numbered example
# fetches or parses data and pretty-prints the result. The mapping and
# metadata calls need network access to UniProt.
import os
import uniprot
import pprint
import sys  # NOTE(review): imported but unused in the visible snippet

# Clean up caches so every run re-fetches instead of reusing stale files
os.system('rm cache*')

# Example 1 - reading a fasta file
seqids, fastas = uniprot.read_fasta('example.fasta')
pprint.pprint(seqids, indent=2)

# Example 2 - map identifiers for RefSeq to Uniprot
seqids = "NP_000508.1  NP_001018081.3".split()
pairs = uniprot.batch_uniprot_id_mapping_pairs(
  'P_REFSEQ_AC', 'ACC', seqids)
pprint.pprint(pairs, indent=2)

# Example 3 - get UniProt metadata (was mislabelled "Example 2")
# pairs are (refseq_id, uniprot_id) tuples; keep only the uniprot side
uniprot_seqids = [j for i,j in pairs]
uniprot_data = uniprot.batch_uniprot_metadata(
    uniprot_seqids, 'cache')
pprint.pprint(uniprot_data, indent=2)

# Example 4 - parse for isoforms in the metadata cached by the call above
text = open('cache/metadata.0.txt').read()
uniprot_data = uniprot.parse_isoforms(text)
pprint.pprint(uniprot_data)

# Example 5 - chaining commands to map seqids of mixed identifier types
seqids = "EFG_MYCA1 YP_885981.1 ENSG00000196176 Q91ZU6-8".split()