Example #1
 def test_fasta_get_sequence(self):
     f = parser.FastaIterator(self.handle, index=self.index)
     out = f.get_sequence('c1', 5, 30)
     digest = hashlib.sha224(out).hexdigest()
     self.assertEqual(
         'ddb5a96ada0f651bffeb8ef856c76faf610ca669a68be904b0acb8b8', digest,
         "Fasta get_sequence #1 Failure")
     f.fasta_file.close()
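For orientation, here is a minimal sketch of how index-backed random access like get_sequence above can work. The index layout (byte offset of the first sequence character plus a fixed line width per record) and the 0-based, end-exclusive coordinates are assumptions for illustration, not necessarily pythomics' actual format.

def get_subsequence(fasta_path, index, name, start, end):
    # index maps name -> (offset of first sequence byte, bases per line);
    # start/end are 0-based, end-exclusive positions within the sequence
    offset, line_width = index[name]
    first = offset + start + start // line_width  # skip embedded newlines
    last = offset + end + end // line_width
    with open(fasta_path, 'rb') as handle:
        handle.seek(first)
        raw = handle.read(last - first)
    return raw.replace(b'\n', b'').decode('ascii')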
Example #2
 def test_fasta_index_build(self):
     f = parser.FastaIterator(self.handle)
     f.build_fasta_index()
     out = '\n'.join([row.strip() for row in open(self.index, 'rb')])
     digest = hashlib.sha224(out).hexdigest()
     self.assertEqual(
         'e071a4ec04e59d55231dc667e06b81b17d96fad0d40fe2ac883e9fe3', digest,
         "Fasta Index Build Failure")
Example #3
def main():
    args = parser.parse_args()
    file_name = args.fasta
    vcf = args.vcf
    snps = args.no_snps
    dels = args.dels
    ins = args.ins
    homs = args.no_homozygous
    hets = args.heterozygous
    individual = args.individual - 1
    fasta_file = fasta.FastaIterator(file_name)
    vcf_file = gp.VCFIterator(vcf)
    # store our vcf file first
    entries = {}
    to_append = 'chr' if args.append_chromosome else ''
    for info in vcf_file:
        checked = False
        valid_variant = False
        if homs:
            if info.is_homozygous()[individual]:
                if ((snps and not info.has_snp(individual=individual)) and
                    (dels and not info.has_deletion(individual=individual)) and
                    (ins and not info.has_insertion(individual=individual))):
                        checked = True
                        continue
                valid_variant = True
                try:
                    entries['%s%s' % (to_append, info.chrom)][int(info.pos) - 1] = info
                except KeyError:
                    entries['%s%s' % (to_append, info.chrom)] = {int(info.pos) - 1: info}
        if hets:
            if info.is_heterozygous()[individual]:
                if ((not valid_variant and not checked) and
                    (snps and not info.has_snp(individual=individual)) and
                    (dels and not info.has_deletion(individual=individual)) and
                    (ins and not info.has_insertion(individual=individual))):
                        continue
                try:
                    entries['%s%s' % (to_append, info.chrom)][int(info.pos) - 1] = info
                except KeyError:
                    entries['%s%s' % (to_append, info.chrom)] = {int(info.pos) - 1: info}
    with args.out as o:
        for header, sequence in fasta_file:
            d = entries.get(header, None)
            if d:
                bases = sorted(d.keys(), reverse=True)
                sequence = list(sequence)
                #we go from the back of the sequence so we don't have to bother
                #with offsets if we are inserting/deleting bases as well
                for i in bases:
                    var_info = d[i]
                    ref = var_info.ref
                    alt = var_info.get_alt(individual=individual)[0]
                    sequence[i:i+len(ref)] = list(alt)
                sequence = ''.join(sequence)
            o.write('>%s\n%s\n' % (header, sequence))
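The comment in the loop above is the crux: applying variants from the highest coordinate downwards means insertions and deletions never shift the positions of variants still waiting to be applied. A toy demonstration with made-up variants (0-based positions):

sequence = list('ACGTACGTACGT')
variants = {
    2: ('G', 'GA'),   # insertion after the reference G
    7: ('TA', 'T'),   # deletion of one base
    10: ('G', 'C'),   # simple substitution
}
for pos in sorted(variants, reverse=True):
    ref, alt = variants[pos]
    sequence[pos:pos + len(ref)] = list(alt)
print(''.join(sequence))  # ACGATACGTCCT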
Example #4
 def test_fasta_iterator(self):
     out = ""
     f = parser.FastaIterator(self.handle, delimiter='>')
     assert isinstance(f, parser.FastaIterator)
     for header, sequence in f:
         out += ">%s\n%s\n" % (header, sequence)
     digest = hashlib.sha224(out).hexdigest()
     self.assertEqual(
         'a4b6987095e97824cbcb36674f9757c4ccfad161eeb9fd8a993e851a', digest,
         "Fasta Iterator Failure")
Example #5
def main():
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    from_met_keep = args.from_met_keep
    if from_met_keep:
        from_met = True
        no_met = True

    def write_sequence(handle, header, protein_index, protein_sequence):
        header1 = '>%s F:%s%d Orf:%d' % (header, strand, i + 1,
                                         protein_index + 1)
        protein_sequences = [(header1, protein_sequence)]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                return
            header2 = '>%s(%d upstream removed) F:%s%d Orf:%d' % (
                header, pos, strand, i + 1, protein_index + 1)
            protein_sequences.append((header2, protein_sequence[pos:]))
        for protein_header, protein_sequence in protein_sequences:
            if len(protein_sequence) >= orf_min and (
                    no_met or protein_sequence[0] == 'M'):
                handle.write('%s\n%s\n' % (protein_header, protein_sequence))

    with args.out as o:
        for header, sequence in fasta_file:
            for i in xrange(3):
                strand = '+'
                translation = fasta._translate(sequence[i:])
                translation = translation.split('*')
                for protein_index, protein_sequence in enumerate(translation):
                    write_sequence(o, header, protein_index, protein_sequence)
                if negative_strand:
                    strand = '-'
                    translation = fasta._translate(
                        fasta._reverse_complement(sequence)[i:])
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        write_sequence(o, header, protein_index,
                                       protein_sequence)
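The nested loops above implement the usual six-reading-frame scheme: frames +1 to +3 are offsets 0-2 into the forward sequence, frames -1 to -3 the same offsets into the reverse complement. A compact sketch of just the frame generation (translation is left to a codon table such as fasta._translate; the complement map assumes plain ACGTN input):

def reverse_complement(seq):
    pairs = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}
    return ''.join(pairs[base] for base in reversed(seq.upper()))

def six_frames(seq):
    # yields (frame label, nucleotide sequence to translate) pairs
    for i in range(3):
        yield '+%d' % (i + 1), seq[i:]
        yield '-%d' % (i + 1), reverse_complement(seq)[i:]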
Example #6
 def test_protein_digestion(self):
     out = ""
     f = parser.FastaIterator(self.handle, delimiter='>')
     assert isinstance(f, parser.FastaIterator)
     enzyme = digest.Enzyme(enzyme='trypsin')
     assert isinstance(enzyme, digest.Enzyme)
     for __, sequence in f:
         out += sequence
     peptides = ''.join(enzyme.cleave(out, min=7, max=30))
     hash_sum = hashlib.sha224(peptides).hexdigest()
     self.assertEqual(
         '31c6612b85dcea10c26e35826f4e5577b674624725477eb5202b18bb',
         hash_sum, "Protein Digestion With Trypsin Failure")
     enzyme = digest.Enzyme(enzyme='lysc')
     peptides = ''.join(enzyme.cleave(out, min=0, max=9999, unique=True))
     hash_sum = hashlib.sha224(peptides).hexdigest()
     self.assertEqual(
         '2b5e17ce606e9a296095d8b4b9cf75d44ba662d5eb3531e0a187def4',
         hash_sum, "Unique Protein Digestion with Lys-C Failure")
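As a rough illustration of what a cleave(min=..., max=...) call plausibly does for trypsin: cut after K or R except when the next residue is P, then filter by length. The real digest.Enzyme supports many proteases and a unique option; this sketch assumes uppercase input.

import re

def tryptic_cleave(sequence, min_len=7, max_len=30):
    # mark each cleavage site (K/R not followed by P) with a space, then split
    marked = re.sub(r'([KR])(?!P)', r'\1 ', sequence)
    return [p for p in marked.split() if min_len <= len(p) <= max_len]

# tryptic_cleave('MKWVTFISLLLLFSSAYSRGVFRR', min_len=2)
# -> ['MK', 'WVTFISLLLLFSSAYSR', 'GVFR']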
Example #7
def main():
    global protein_sequences
    global fasta_headers
    global il_convert
    peptides_mapped = Value('i', 0)
    args = parser.parse_args()
    cores = args.p
    fasta_file = fasta.FastaIterator(args.fasta)
    peptide_column = args.peptide_col
    try:
        peptide_index = int(peptide_column) - 1
        peptide_column = peptide_index
    except ValueError:
        peptide_index = None
    tsv_file = args.tsv
    il_convert = not args.no_equality
    out_file = args.out
    header_lines = args.header
    delimiter = args.delimiter
    inference = not args.no_inference
    inferred_name = args.inferred_name
    digest_min = args.min
    digest_max = args.max
    normalize = args.normalize
    ibaq = args.ibaq
    ibaq_redundant = not args.non_redundant
    case_sens = args.case_sensitive
    mod_site = args.modification_site
    unique = args.unique_only
    out_position = args.position
    if mod_site:
        case_sens = True
        inference = True
    precursor_columns = args.precursors.split(',') if args.precursors else None
    if ibaq:
        enzyme = digest.Enzyme(enzyme=args.enzyme[0] if isinstance(
            args.enzyme, list) else args.enzyme)
    sys.stderr.write("Reading in Fasta file.\n")
    fasta_headers, protein_sequences = zip(
        *[(header.replace(';', ''), sequence)
          for header, sequence in fasta_file])

    # replace headers with parsed ones
    if args.regex:
        regex = re.compile(args.regex)
        matched = [regex.search(header) for header in fasta_headers]
        protein_sequences = [
            protein_sequences[i] for i, match in enumerate(matched) if match
        ]
        fasta_headers = [' '.join(match.groups()) for match in matched if match]
        sys.stderr.write(
            '{0} header sequences did not match regex {1} and have been discarded.\n'
            .format(len(matched) - len(fasta_headers), args.regex))
    if ibaq:
        ibaq_protein_sequence = {
            header: sequence
            for header, sequence in zip(fasta_headers, protein_sequences)
        }
        cleaved = {}
    protein_sequences = '\n'.join([
        '{}\t{}'.format(header, sequence)
        for header, sequence in zip(fasta_headers, protein_sequences)
    ])
    peptide_history = {}
    mod_grouping = {}  # ordered by protein, site, type
    pep_count = 0
    pep_set = set([])
    mod_col = args.mod_col
    correlator = ColumnFunctions(args)
    motif_search = args.motifs
    if motif_search:
        motif_window = args.motif_window
        motif_unique = args.motif_unique
        if args.motif_out:
            motif_out = open(args.motif_out, 'wb')
        elif args.out:
            motif_out = open('{0}_motif'.format(args.out.name), 'wb')
        else:
            sys.stderr.write(
                "You must provide an output name for motif-out if you are piping to stdout.\n"
            )
            return -1
    mod_col_func = getattr(correlator, args.mod_col_func, correlator.concat)
    ibaq_col_func = getattr(correlator, args.ibaq_function, correlator.concat)
    with tsv_file as f:
        reader = csv.reader(f, delimiter=delimiter)
        for line_num, entry in enumerate(reader):
            if line_num < header_lines:  # we assume the first header line is the one we care about
                if peptide_index is None:
                    for i, v in enumerate(entry):
                        if v.lower() == args.peptide_col.lower():
                            peptide_column = i
                            break
                if mod_col is not None and mod_col.isdigit():
                    mod_col = int(mod_col) - 1
                elif mod_col is not None:
                    for i, v in enumerate(entry):
                        if v.lower() == args.mod_col.lower():
                            mod_col = i
                if not precursor_columns:
                    precursor_columns = [
                        i for i, v in enumerate(entry)
                        if 'precursor' in v.lower()
                    ]
                try:
                    precursor_columns = [int(i) for i in precursor_columns]
                except ValueError:
                    precursor_columns = [
                        entry.index(i) for i in precursor_columns
                    ]
                normalizations = [0 for i in precursor_columns]
            else:
                peptide = entry[peptide_column]
                pep_count += 1
                if not case_sens:
                    peptide = peptide.upper()
                pep_set.add(peptide)
                if peptide not in peptide_history:
                    peptide_history[peptide] = {
                        'intensities':
                        dict([(i, set([]))
                              for i in xrange(len(precursor_columns))])
                        if precursor_columns is not None else {},
                    }
                if precursor_columns:
                    for n_i, e_i in enumerate(precursor_columns):
                        if entry[e_i]:
                            try:
                                intensity = decimal.Decimal(entry[e_i])
                            except decimal.InvalidOperation:
                                intensity = decimal.Decimal(0)
                            peptide_history[peptide]['intensities'][n_i].add(
                                intensity)
                if mod_col is not None:
                    peptide_history[peptide]['mod_col'] = entry[mod_col]
        if ibaq and normalize and precursor_columns:
            for peptide in peptide_history:
                for i, v in peptide_history[peptide]['intensities'].iteritems():
                    normalizations[i] += sum(v)
        else:
            normalizations = [decimal.Decimal(1) for i in normalizations]

    # map our peptides is a multi-cored manner
    pool = Pool(cores)
    # get our matches

    peptides = list(set([i.upper() for i in peptide_history.keys()]))
    # break into groups of 100 (empirically gives fastest mapping)
    subpeptides = [
        peptides[n:n + peptides_per_core]
        for n in xrange(0, len(peptides), peptides_per_core)
    ]
    num_peps = len(peptides)

    progress_finish()

    sys.stderr.write('Mapping Peptides.\n')
    results = pool.map_async(mapper, subpeptides)
    results.wait()
    mapped_peptides = dict(
        (k, v) for d in results.get() for (k, v) in d.items())
    sys.stderr.write('\nPeptides mapped.\n')

    protein_grouping = {}
    peptide_grouping = {}
    stats = {'peptides': pep_count}
    stats['peptides_found'] = len(pep_set)
    proteins_mapped = set([])
    peptide_out = []
    empty_dict = {
        'proteins': '',
        'positions': [],
        'accessions': [],
        'matches': [],
        'unique': True
    }
    for index, (peptide, d) in enumerate(peptide_history.iteritems()):
        try:
            peptide_dict = peptide_grouping[peptide]
        except KeyError:
            peptide_dict = {'intensities': {}}
            peptide_grouping[peptide] = peptide_dict
        if not index % 100:
            progress_update(index, len(peptide_history))
        mapped_info = mapped_peptides.get(peptide.upper(), empty_dict)
        precursor_int = float(
            sum([sum(d['intensities'][i]) for i in d['intensities']]))
        entry = [
            peptide,
            sum([len(d['intensities'][i]) for i in d['intensities']]),
            precursor_int
        ]
        if 'inference' not in peptide_dict:
            peptide_dict['inference'] = {'proteins': ''}
            if inference:
                proteins = mapped_info['proteins']
                accessions = mapped_info['accessions']
                start_positions = mapped_info['positions'] if mod_site else []
                proteins_mapped |= set(proteins)
                if unique:
                    proteins = make_unique(proteins)
                    if len(proteins) > 1:
                        mapped_info['unique'] = False
                matches = ';'.join(proteins)
                peptide_dict['inference']['proteins'] = matches
                if not unique or mapped_info['unique']:
                    entry.append(matches)
                else:
                    entry.append('')
                for protein_index, protein in enumerate(proteins):
                    try:
                        protein_grouping[protein][peptide] = d
                    except KeyError:
                        protein_grouping[protein] = {peptide: d}

                if mod_site:
                    mod_site_additions = []
                    motifs_found = {}
                    find_motif = False
                    if motif_search and (len(proteins) == 1
                                         or not motif_unique):
                        find_motif = True
                    for start_position, protein in zip(start_positions,
                                                       accessions):
                        mod_site_addition = []
                        for j, k in enumerate(peptide):
                            if k.islower():
                                mod_pos = start_position + j
                                mod_key = '%s:%d' % (k, mod_pos)
                                if find_motif:
                                    motif_sequences = [
                                        protein_sequences[i + j -
                                                          motif_window:i + j +
                                                          motif_window + 1]
                                        for i in mapped_info['matches']
                                    ]
                                    # trim each window back to the tab that
                                    # separates a header from its sequence
                                    for motif_sequence in motif_sequences:
                                        motif_pos = motif_window
                                        cut = motif_sequence[:motif_pos].rfind('\t')
                                        if cut != -1:
                                            motif_sequence = motif_sequence[cut + 1:]
                                            motif_pos -= (cut + 1)
                                        cut = motif_sequence[motif_pos + 1:].rfind('\t')
                                        if cut != -1:
                                            motif_sequence = motif_sequence[:motif_pos + cut]
                                        found = motifs_found.get(mod_key, [])
                                        motifs_found[mod_key] = make_unique(found + [motif_sequence])
                                mod_site_addition.append(mod_key)
                                if mod_col or mod_site:
                                    protein_mods = mod_grouping.setdefault(protein, {})
                                    site_info = protein_mods.get(mod_key)
                                    if site_info is None:
                                        protein_mods[mod_key] = {
                                            'values': make_unique([d['mod_col']]) if mod_col else '',
                                            'peptides': make_unique([peptide]),
                                        }
                                    else:
                                        mod_values = site_info['values']
                                        if mod_col:
                                            mod_values.append(d['mod_col'])
                                        site_info['values'] = make_unique(mod_values)
                                        site_info['peptides'] = make_unique(site_info['peptides'] + [peptide])
                        mod_site_additions.append(
                            '%s(%s)' % (protein, ','.join(mod_site_addition)))
                    peptide_dict['inference']['mod_sites'] = ';'.join(
                        mod_site_additions)
                    peptide_dict['inference']['motifs'] = motifs_found
                peptide_dict['inference']['matched_positions'] = ','.join(
                    str(i) for i in start_positions)
        if ibaq:
            ibaqs = []
            intensities = [sum(d['intensities'][i]) for i in d['intensities']]
            try:
                precursor_int = sum([
                    intensities[i] / normalizations[i]
                    for i in xrange(len(normalizations))
                ])
            except decimal.InvalidOperation:
                precursor_int = 0
            entry.append(precursor_int)
            for protein_index in mapped_info['accessions']:
                peptides = cleaved.get(protein_index, None)
                if peptides is None:
                    if ibaq_redundant:
                        peptides = sum([
                            len(
                                enzyme.cleave(
                                    ibaq_protein_sequence[protein_accession],
                                    min=digest_min,
                                    max=digest_max))
                            for protein_accession in mapped_info['accessions']
                        ])
                    else:
                        peptides = len(
                            set([
                                peptide for tryptic_peptides in [
                                    enzyme.cleave(ibaq_protein_sequence[
                                        possible_protein_index],
                                                  min=digest_min,
                                                  max=digest_max)
                                    for possible_protein_index in
                                    mapped_info['indices']
                                ] for peptide in tryptic_peptides
                            ]))
                    cleaved[protein_index] = peptides
                if not peptides:
                    ibaqs.append(0)
                    continue
                # this divides the precursor intensity of the given peptide by the number of theoretically
                #  possible cleaved peptides per protein.
                # If the user is grouping things at a higher level, say the gene level this will output the ibaq
                # per each mapped isoform if that gene has isoforms.
                ibaqs.append(precursor_int /
                             peptides if peptides and precursor_int else 0)
            peptide_dict['inference']['iBAQ'] = ibaq_col_func(
                [int(IBAQ_NORMALIZATION * i) for i in ibaqs]) if ibaqs else 0
            entry.append(peptide_dict['inference']['iBAQ']
                         if not unique or mapped_info['unique'] else '')
        if out_position:
            entry.append(peptide_dict['inference'].get('matched_positions', '')
                         if not unique or mapped_info['unique'] else '')
        if mod_site:
            entry.append(peptide_dict['inference'].get('mod_sites', '')
                         if not unique or mapped_info['unique'] else '')
        if motif_search:
            entry.append(';'.join([
                '{}({})'.format(motif_site, ';'.join(motifs))
                for motif_site, motifs in peptide_dict['inference'].get(
                    'motifs', {}).iteritems()
            ]))
        peptide_out.append(entry)
    progress_finish()
    with args.peptide_out as o:
        writer = csv.writer(o, delimiter=delimiter)
        header = ['Peptide', 'PSMS', 'Total Precursor Area']
        if inference:
            header.append(inferred_name)
        if ibaq:
            if normalize:
                header.append('Normalized Precursor Intensity')
            header.append('iBAQ')
        if out_position:
            header.append('Peptide %s Position' % inferred_name)
        if mod_site:
            header.append('Modification Positions')
        if motif_search:
            header.append('Motif')
        writer.writerow(header)
        for i in peptide_out:
            writer.writerow(i)
    if motif_search:
        with motif_out as o:
            writer = csv.writer(o, delimiter=delimiter)
            header = ['Residue', 'Motif']
            if inference:
                header.append(inferred_name)
            writer.writerow(header)
            for peptide, peptide_dict in peptide_grouping.iteritems():
                for motif_key, motifs in peptide_dict['inference'].get(
                        'motifs', {}).iteritems():
                    writer.writerow([
                        motif_key, ';'.join(motifs),
                        peptide_dict['inference']['proteins']
                    ])
    stats['proteins_mapped'] = len(proteins_mapped)
    if inference:
        with args.protein_out as o:
            writer = csv.writer(o, delimiter=delimiter)
            header = [inferred_name, 'Peptides', 'Total Precursor Area']
            if mod_site:
                header.append('Modification Positions')
            if ibaq:
                if normalize:
                    header.append('Normalized Precursor Intensity')
                header.append('iBAQ')
            writer.writerow(header)
            for protein in protein_grouping:
                entry = [protein]
                intensities = []
                precursor_int = 0
                peptide_psm_count = []
                mods = set([])
                for peptide in protein_grouping[protein]:
                    if mod_site:
                        peptide_dict = peptide_grouping.get(peptide, False)
                        if peptide_dict:
                            mod_proteins = peptide_dict['inference'][
                                'mod_sites']
                            for mod_protein in mod_proteins.split(';'):
                                #mod protein looks like:
                                #WBGene00004829(y:467,k:471);WBGene00019361(m:68);WBGene00019361(m:118);WBGene00019361(m:68);WBGene00020808(m:261);WBGene00020808(m:156)
                                mod_prots = mod_protein.split(';')
                                for mod_prot_ in mod_prots:
                                    mod_prot, mod_prot_sites = mod_prot_.rsplit('(', 1)
                                    if mod_prot == protein:
                                        for mod_prot_site in mod_prot_sites[:-1].split(','):
                                            if mod_prot_site:
                                                mod_aa, mod_prot_site = mod_prot_site.split(':')
                                                mods.add((mod_aa, mod_prot_site))
                    d = protein_grouping[protein][peptide]
                    if not unique or mapped_peptides.get(peptide,
                                                         {}).get('unique'):
                        peptide_psm_count.append((peptide,
                                                  sum([
                                                      len(d['intensities'][i])
                                                      for i in d['intensities']
                                                  ])))
                        intensities += [
                            sum(d['intensities'][i]) for i in d['intensities']
                        ]
                        if ibaq and normalize:
                            try:
                                precursor_int += sum([
                                    intensities[i] / normalizations[i]
                                    for i in xrange(len(normalizations))
                                ])
                            except decimal.InvalidOperation:
                                pass
                entry.append(';'.join(
                    ['%s(%s)' % (i, j) for i, j in peptide_psm_count]))
                entry.append(sum(intensities))
                if mod_site:
                    mods = list(mods)
                    mods.sort(key=lambda x: x[1])
                    entry.append(';'.join(['%s%s' % (i, j) for i, j in mods]))
                if ibaq:
                    if normalize:
                        entry.append(precursor_int)
                    peptides = cleaved.get(protein_index, None)
                    ibaq_value = [
                        int(IBAQ_NORMALIZATION * precursor_int /
                            peptides) if peptides and precursor_int else 0
                    ]
                    entry.append(ibaq_col_func(ibaq_value))
                writer.writerow(entry)
    tsv_file = open(tsv_file.name)
    with tsv_file as f:
        reader = csv.reader(f, delimiter=delimiter)
        mod_stats = {}
        with out_file as o:
            out_writer = csv.writer(o, delimiter=delimiter)
            total_mods = Counter()
            for line_num, entry in enumerate(reader):
                if line_num < header_lines:  #we assume the first header line is the one we care about
                    if inference:
                        entry.append(inferred_name)
                    if out_position:
                        entry.append('Peptide %s Position' % inferred_name)
                    if mod_site:
                        entry.append('Modification Position')
                    if ibaq:
                        entry.append('iBAQ')
                else:
                    peptide = entry[peptide_column]
                    if not case_sens:
                        peptide = peptide.upper()
                    d = peptide_grouping.get(peptide, False)
                    total_mods.update([k for k in peptide if k.islower()])
                    if d:
                        if inference:
                            entry.append(
                                d['inference']['proteins']
                                if not unique or mapped_peptides.
                                get(peptide, {}).get('unique') else '')
                        if out_position:
                            entry.append(
                                d['inference']['matched_positions']
                                if not unique or mapped_peptides.
                                get(peptide, {}).get('unique') else '')
                        if mod_site:
                            mod_proteins = d['inference']['mod_sites']
                            peptide_mods = {}
                            mod_entry = []
                            if not unique or mapped_peptides.get(
                                    peptide, {}).get('unique'):
                                for mod_protein in mod_proteins.split(';'):
                                    #mod protein looks like:
                                    mod_prots = mod_protein.split(';')
                                    for mod_prot_ in mod_prots:
                                        if not mod_prot_:
                                            continue
                                        mod_prot, mod_prot_sites = mod_prot_.rsplit(
                                            '(', 1)
                                        for mod_prot_site in mod_prot_sites[:-1].split(
                                                ','):
                                            if mod_prot_site:
                                                mod_aa, mod_prot_site = mod_prot_site.split(
                                                    ':')
                                                try:
                                                    peptide_mods[mod_prot].add(
                                                        (mod_aa,
                                                         mod_prot_site))
                                                except KeyError:
                                                    peptide_mods[
                                                        mod_prot] = set([
                                                            (mod_aa,
                                                             mod_prot_site)
                                                        ])
                                                try:
                                                    mod_stats[mod_aa].add(
                                                        (mod_prot,
                                                         mod_prot_site))
                                                except KeyError:
                                                    mod_stats[mod_aa] = set([
                                                        (mod_prot,
                                                         mod_prot_site)
                                                    ])
                                for mod_prot, mods in peptide_mods.iteritems():
                                    modl = list(mods)
                                    modl.sort(key=lambda x: x[1])
                                    mod_entry.append(
                                        '%s(%s)' % (mod_prot, ' '.join([
                                            '%s:%s' % (i, j) for i, j in modl
                                        ])))
                            entry.append(';'.join(mod_entry))
                        if ibaq:
                            entry.append(d['inference'].get('iBAQ', 0)
                                         if not unique or mapped_peptides.
                                         get(peptide, {}).get('unique') else 0)
                out_writer.writerow(entry)
        stats['modifications'] = mod_stats
    mod_out = args.mod_out if args.mod_out else open(
        '{}_mods'.format(out_file.name), 'wb')
    with mod_out as o:
        writer = csv.writer(o, delimiter=delimiter)
        header = ['Site', inferred_name, 'Peptide']
        if mod_col:
            header.append(args.mod_col)
        writer.writerow(header)
        #mod_grouping[protein] = {'%s%d'%(k, mod_pos): {'values': set([d['mod_col']]), 'peptides': set([peptide])}}
        for protein, sites_dict in mod_grouping.iteritems():
            for site, site_dict in sites_dict.iteritems():
                entry = [site, protein, ';'.join(site_dict.get('peptides'))]
                if mod_col:
                    entry.append(mod_col_func(site_dict.get('values', [])))
                writer.writerow(entry)
    # write stats
    sys.stderr.write('Peptides Searched: %s\n' % stats['peptides'])
    sys.stderr.write('Unique Peptides Found: %s\n' % stats['peptides_found'])
    sys.stderr.write('%s Mapped to: %s\n' %
                     (inferred_name, stats['proteins_mapped']))
    if stats['modifications']:
        sys.stderr.write('Modifications:\n')
        for site, sites in stats['modifications'].iteritems():
            sys.stderr.write(
                '  %s: %s found with %d potential sites (%d mappings)\n' %
                (site, total_mods[site], len(sites),
                 len(set([i[0] for i in sites]))))
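Stripped of its bookkeeping, the peptide-mapping core of this example is a chunked Pool.map: slice the work list into fixed-size groups (the comment in the code suggests 100 per group is fastest) and merge the per-worker result dictionaries. A self-contained sketch with a trivial stand-in worker:

from multiprocessing import Pool

def mapper(chunk):
    # stand-in worker: the real mapper searches the concatenated protein
    # database; here each peptide just maps to its own length
    return dict((peptide, len(peptide)) for peptide in chunk)

def map_in_chunks(items, processes=4, chunk_size=100):
    chunks = [items[n:n + chunk_size]
              for n in range(0, len(items), chunk_size)]
    pool = Pool(processes)
    try:
        results = pool.map(mapper, chunks)
    finally:
        pool.close()
        pool.join()
    merged = {}
    for partial in results:
        merged.update(partial)
    return merged

# the guard matters on platforms that spawn rather than fork:
# if __name__ == '__main__':
#     print(map_in_chunks(['PEPTIDE', 'SEQUENCE']))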
Example #8
def main():
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    enzyme_pattern = args.enzyme_pattern
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    #if we're splitting a genome
    if genome:
        import re
        regex = re.compile(r'(\*)')
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)
    if enzyme_pattern:
        enzymes = [digest.Enzyme(pattern=enzyme_pattern)]
    elif enzyme_choice:
        enzymes = [
            digest.Enzyme(enzyme=protease) for protease in enzyme_choice
        ]
    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                if genome:
                    slen = len(sequence)
                for i in xrange(digest_frame):
                    strand = '+'
                    translation = fasta._translate(sequence[i:])
                    if genome:
                        position = i + 1
                        translation = [j for j in regex.split(translation)]
                        translation = [
                            ''.join(j)
                            for j in itertools.izip_longest(translation[0::2],
                                                            translation[1::2],
                                                            fillvalue='')
                        ]
                    else:
                        translation = translation.split('*')
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        if genome:
                            enzyme_kwargs = {
                                'min': 0,
                                'max': 999999,
                                'unique': unique_digest
                            }
                        else:
                            enzyme_kwargs = {
                                'min': digest_min,
                                'max': digest_max,
                                'unique': unique_digest
                            }
                        peptides = enzymes[0].cleave(protein_sequence,
                                                     **enzyme_kwargs)
                        for enzyme in enzymes[1:]:
                            peptides = [
                                sub_seq for peptide_sequence in peptides
                                for sub_seq in enzyme.cleave(
                                    peptide_sequence, **enzyme_kwargs)
                            ]
                        for peptide_index, peptide in enumerate(peptides):
                            if genome:
                                if len(peptide) >= digest_min:
                                    if peptide.endswith('*'):
                                        o.write(
                                            '>%s F:%s%d Start:%d End:%d \n%s\n'
                                            % (header, strand, i + 1, position,
                                               position + len(peptide) * 3 - 1,
                                               peptide[:-1]))
                                    else:
                                        o.write(
                                            '>%s F:%s%d Start:%d End:%d \n%s\n'
                                            % (header, strand, i + 1, position,
                                               position + len(peptide) * 3 - 1,
                                               peptide))
                                position += len(peptide) * 3
                            else:
                                o.write('>%s F:%s%d Orf:%d Pep:%d \n%s\n' %
                                        (header, strand, i + 1, protein_index +
                                         1, peptide_index + 1, peptide))
                    if digest_negative:
                        strand = '-'
                        translation = fasta._translate(
                            fasta._reverse_complement(sequence)[i:])
                        if genome:
                            position = slen - i
                            translation = [j for j in regex.split(translation)]
                            translation = [
                                ''.join(j) for j in itertools.izip_longest(
                                    translation[0::2],
                                    translation[1::2],
                                    fillvalue='')
                            ]
                        else:
                            translation = translation.split('*')
                        for protein_index, protein_sequence in enumerate(
                                translation):
                            if genome:
                                enzyme_kwargs = {
                                    'min': 0,
                                    'max': 999999,
                                    'unique': unique_digest
                                }
                            else:
                                enzyme_kwargs = {
                                    'min': digest_min,
                                    'max': digest_max,
                                    'unique': unique_digest
                                }
                            peptides = enzymes[0].cleave(
                                protein_sequence, **enzyme_kwargs)
                            for enzyme in enzymes[1:]:
                                peptides = [
                                    sub_seq for peptide_sequence in peptides
                                    for sub_seq in enzyme.cleave(
                                        peptide_sequence, **enzyme_kwargs)
                                ]
                            for peptide_index, peptide in enumerate(peptides):
                                if genome:
                                    if len(peptide) >= digest_min:
                                        if peptide.endswith('*'):
                                            o.write(
                                                '>%s F:%s%d Start:%d End:%d \n%s\n'
                                                %
                                                (header, strand, i + 1,
                                                 position - len(peptide) * 3 +
                                                 1, position, peptide[:-1]))
                                        else:
                                            o.write(
                                                '>%s F:%s%d Start:%d End:%d \n%s\n'
                                                %
                                                (header, strand, i + 1,
                                                 position - len(peptide) * 3 +
                                                 1, position, peptide))
                                    position -= (len(peptide) * 3)
                                else:
                                    o.write(
                                        '>%s F:%s%d Orf:%d Pep:%d \n%s\n' %
                                        (header, strand, i + 1, protein_index +
                                         1, peptide_index + 1, peptide))
        else:
            for header, sequence in fasta_file:
                enzyme_kwargs = {
                    'min': digest_min,
                    'max': digest_max,
                    'unique': unique_digest
                }
                peptides = enzymes[0].cleave(sequence, **enzyme_kwargs)
                for enzyme in enzymes[1:]:
                    peptides = [
                        sub_seq for peptide_sequence in peptides for sub_seq in
                        enzyme.cleave(peptide_sequence, **enzyme_kwargs)
                    ]
                for peptide_index, peptide in enumerate(peptides):
                    o.write('>%s Pep:%d \n%s\n' %
                            (header, peptide_index + 1, peptide))
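The Start/End arithmetic above can be sanity-checked in isolation: on the forward strand, a peptide beginning at 0-based protein position p within frame offset i covers 1-based nucleotides i + 3p + 1 through i + 3(p + len(peptide)). A small self-check of that formula:

def forward_coords(frame_offset, protein_pos, peptide_len):
    # 1-based, inclusive nucleotide coordinates of a translated peptide
    start = frame_offset + 3 * protein_pos + 1
    end = start + 3 * peptide_len - 1
    return start, end

# the first 2-residue peptide in frame +1 spans nucleotides 1..6,
# matching the position bookkeeping in the genome branch above
assert forward_coords(0, 0, 2) == (1, 6)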
Example #9
def main():
    args = parser.parse_args()
    digest_min = args.min
    digest_max = args.max
    enzymes = args.enzyme
    peptides_found = {}
    retained = {}
    total = 0
    proteinMap = {}
    coverageMap = {}
    aas = sorted(config.RESIDUE_MASSES.keys())
    tlen = 0
    parallel = args.parallel
    for protease_index, protease in enumerate(enzymes):
        if parallel or protease_index == 0:
            fasta_file = fasta.FastaIterator(args.fasta)
        enzyme = digest.Enzyme(enzyme=protease)
        sys.stderr.write('processing %s\n' % protease)
        #if doing in series, this iterator is not reset and will never run
        for header, sequence in fasta_file:
            if protease_index == 0:
                total += 1
                proteinMap[header] = sequence
                tlen += len(sequence)
            for peptide in set(
                    enzyme.cleave(sequence, min=digest_min, max=999999)):
                if len(peptide) > digest_max:
                    #we don't see this one
                    if not parallel:
                        try:
                            retained[header].add(peptide)
                        except KeyError:
                            retained[header] = set([peptide])
                else:
                    #we see this one
                    try:
                        peptides_found[peptide].add(header)
                    except KeyError:
                        peptides_found[peptide] = set([header])
                    try:
                        coverageMap[header].add(peptide)
                    except KeyError:
                        coverageMap[header] = set([peptide])
        if not parallel and protease_index > 0:
            for header in retained:
                sequences = copy.deepcopy(retained[header])
                for sequence in sequences:
                    for peptide in set(
                            enzyme.cleave(sequence, min=digest_min,
                                          max=999999)):
                        if len(peptide) > digest_max:
                            if not parallel:
                                retained[header].add(peptide)
                        else:
                            try:
                                peptides_found[peptide].add(header)
                            except KeyError:
                                peptides_found[peptide] = set([header])
                            try:
                                coverageMap[header].add(peptide)
                            except KeyError:
                                coverageMap[header] = set([peptide])
        sys.stderr.write('%d total peptides after digesting with %s\n' %
                         (len(peptides_found), protease))
        if parallel:
            args.fasta.seek(0)
    unique_proteins = set([])
    for peptide in peptides_found:
        if len(peptides_found[peptide]) == 1:
            unique_proteins |= peptides_found[peptide]
    with args.out as o:
        o.write(
            'Protein\tDetectable Length\tTotal Length\tCoverage%%\tUnique ID\t%s\n'
            % '\t'.join(aas))
        sys.stderr.write(
            '%d proteins found out of %d total proteins in database.\n' %
            (len(coverageMap), total))
        sys.stderr.write(
            '%d of these detectable proteins may be uniquely identified.\n' %
            (len(unique_proteins)))
        #figure out coverage
        covered = {}
        found_proteins = set([])
        inum = 0
        for peptide in peptides_found:
            inum += 1
            if inum % 50000 == 0:
                sys.stderr.write('%d peptides processed\n' % inum)
            for header in peptides_found[peptide]:
                found_proteins.add(header)
                sequence = proteinMap[header]
                found = covered.get(header, set(xrange(len(sequence))))
                sites = [
                    match.start()
                    for match in re.finditer(re.escape(peptide), sequence)
                ]
                for match_position in sites:
                    found -= set(
                        xrange(match_position, match_position + len(peptide)))
                covered[header] = found
        avg_cov = 0
        missed_len = 0
        detected = 0
        for header in coverageMap:
            total_len = len(proteinMap[header])
            found_len = total_len - len(covered.get(header, []))
            perc_cov = float(found_len) / float(total_len)
            o.write('%s\t%d\t%d\t%d\t%s\t' %
                    (header, found_len, total_len, perc_cov * 100.0,
                     str(header in unique_proteins)))
            #what aa's do we miss
            aas_missed = ''.join(proteinMap[header][i]
                                 for i in covered[header])
            missed = [aas_missed.count(j) for j in aas]
            missed_len += sum(missed)
            o.write('%s\n' % '\t'.join([str(i) for i in missed]))
            avg_cov += perc_cov
            if header in found_proteins:
                detected += 1
        sys.stderr.write('average coverage is %0.4f over entire proteome\n' %
                         (float(tlen - missed_len) / float(tlen)))
        sys.stderr.write('average coverage is %0.4f over detected proteins\n' %
                         (avg_cov / detected))
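The coverage bookkeeping above reduces to set arithmetic: start from every residue index, subtract the span of each peptide occurrence, and whatever remains was never detectable. A condensed sketch of the same computation:

import re

def coverage(sequence, peptides):
    uncovered = set(range(len(sequence)))
    for peptide in peptides:
        # re.escape guards against regex metacharacters such as '*'
        for match in re.finditer(re.escape(peptide), sequence):
            uncovered -= set(range(match.start(), match.start() + len(peptide)))
    return float(len(sequence) - len(uncovered)) / len(sequence)

assert coverage('MKAAARTK', ['MK', 'TK']) == 0.5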