def main():
    """Write the translated reading-frame segments of each FASTA record.

    Reads the parsed command-line arguments from the module-level ``parser``:

    * ``args.fasta`` -- input passed to ``fasta.FastaIterator``.
    * ``args.out`` -- writable handle, used as a context manager.
    * ``args.min`` -- minimum length a segment must have to be written.
    * ``args.both_strands`` -- also translate the reverse complement.
    * ``args.no_met_start`` -- do not require a leading ``'M'``.
    * ``args.from_met`` -- additionally emit the segment trimmed to its
      first ``'M'``; segments without any ``'M'`` are skipped entirely.
    * ``args.from_met_keep`` -- implies ``from_met`` and ``no_met_start``.

    Every record is translated at offsets 0, 1 and 2; each translation is
    split on ``'*'`` and the pieces are written with a header of the form
    ``>header F:<strand><frame> Orf:<index>``.
    """
    args = parser.parse_args()
    orf_min = args.min
    fasta_file = fasta.FastaIterator(args.fasta)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    if args.from_met_keep:
        # Keeping the untrimmed segment means it must not be rejected for
        # lacking a leading 'M', and the trimmed variant is wanted as well.
        from_met = True
        no_met = True

    def write_sequence(handle, header, strand, frame, protein_index,
                       protein_sequence):
        """Write one segment (and, if requested, its Met-trimmed form)."""
        candidates = [(
            '>%s F:%s%d Orf:%d' % (
                header, strand, frame + 1, protein_index + 1),
            protein_sequence,
        )]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                # No 'M' anywhere: nothing is written for this segment.
                return
            candidates.append((
                '>%s(%d upstream removed) F:%s%d Orf:%d' % (
                    header, pos, strand, frame + 1, protein_index + 1),
                protein_sequence[pos:],
            ))
        for protein_header, candidate in candidates:
            # startswith() instead of [0]: an empty segment must not raise
            # IndexError when the minimum length is 0.
            if len(candidate) >= orf_min and (
                    no_met or candidate.startswith('M')):
                handle.write('%s\n%s\n' % (protein_header, candidate))

    with args.out as o:
        for header, sequence in fasta_file:
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in range(3):
                for strand, template in templates:
                    # Both strands are split on '*'; the reverse strand was
                    # previously enumerated character by character.
                    translation = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        write_sequence(o, header, strand, i, protein_index,
                                       protein_sequence)
def main():
    """Write Met-initiated translated segments of each FASTA record.

    Uses ``args.fasta`` (input for ``fasta.FastaIterator``), ``args.out``
    (writable handle used as a context manager), ``args.min`` (minimum
    segment length) and ``args.both_strands`` (also translate the reverse
    complement) from the module-level ``parser``.

    Every record is translated at offsets 0, 1 and 2; each translation is
    split on ``'*'`` and every piece that starts with ``'M'`` and is at
    least ``args.min`` long is written as
    ``>header F:<strand><frame> Orf:<index>`` followed by the sequence.
    """
    args = parser.parse_args()
    orf_min = args.min
    fasta_file = fasta.FastaIterator(args.fasta)
    negative_strand = args.both_strands

    with args.out as o:
        for header, sequence in fasta_file:
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in range(3):
                for strand, template in templates:
                    # Both strands are split on '*'; the reverse strand was
                    # previously enumerated character by character.
                    translation = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        # startswith() instead of [0] so an empty segment
                        # cannot raise IndexError when the minimum is 0.
                        if (len(protein_sequence) >= orf_min
                                and protein_sequence.startswith('M')):
                            o.write('>%s F:%s%d Orf:%d\n%s\n' % (
                                header, strand, i + 1, protein_index + 1,
                                protein_sequence))
def main():
    """Translate FASTA records frame by frame and write the segments.

    Arguments come from the module-level ``parser``:

    * ``args.fasta`` -- input passed to ``fasta.FastaIterator``.
    * ``args.out`` -- writable handle, used as a context manager.
    * ``args.min`` -- minimum length a segment must have to be written.
    * ``args.both_strands`` -- also translate the reverse complement.
    * ``args.no_met_start`` -- do not require a leading ``'M'``.
    * ``args.from_met`` -- additionally emit the segment trimmed to its
      first ``'M'``; segments without any ``'M'`` are skipped entirely.
    * ``args.from_met_keep`` -- implies ``from_met`` and ``no_met_start``.

    For offsets 0, 1 and 2 the translation is split on ``'*'`` and each
    piece is written under ``>header F:<strand><frame> Orf:<index>``.
    """
    args = parser.parse_args()
    orf_min = args.min
    fasta_file = fasta.FastaIterator(args.fasta)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    if args.from_met_keep:
        # The "keep" variant wants both the untrimmed and the trimmed
        # segment, so the leading-'M' requirement is dropped as well.
        from_met = True
        no_met = True

    def passes_filters(candidate):
        """Return True when a segment is long enough and starts acceptably."""
        # startswith() is safe on '' (unlike candidate[0]) when orf_min is 0.
        return len(candidate) >= orf_min and (
            no_met or candidate.startswith('M'))

    def write_sequence(handle, header, strand, frame, protein_index,
                       protein_sequence):
        """Write a segment and, when requested, its Met-trimmed form."""
        label = 'F:%s%d Orf:%d' % (strand, frame + 1, protein_index + 1)
        entries = [('>%s %s' % (header, label), protein_sequence)]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                # Without any 'M' the segment is dropped altogether.
                return
            entries.append((
                '>%s(%d upstream removed) %s' % (header, pos, label),
                protein_sequence[pos:],
            ))
        for protein_header, candidate in entries:
            if passes_filters(candidate):
                handle.write('%s\n%s\n' % (protein_header, candidate))

    with args.out as o:
        for header, sequence in fasta_file:
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in range(3):
                for strand, template in templates:
                    # The reverse strand used to skip this split and was
                    # enumerated one character at a time.
                    segments = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(
                            segments):
                        write_sequence(o, header, strand, i, protein_index,
                                       protein_sequence)
def main():
    """Digest FASTA records with one enzyme and write the peptides.

    Arguments come from the module-level ``parser``:

    * ``args.fasta`` / ``args.out`` -- input file and output handle.
    * ``args.enzyme`` -- passed to ``digest.Enzyme(enzyme=...)``.
    * ``args.type`` -- ``'prot'`` or ``'nt'``.
    * ``args.frame`` -- number of frames for nucleotide input; ``6`` means
      three frames on each strand.
    * ``args.min`` / ``args.max`` -- peptide length limits.
    * ``args.genome`` -- forces nucleotide mode and writes ``Start``/``End``
      coordinates instead of ``Orf``/``Pep`` indices.
    * ``args.unique`` -- forwarded to ``Enzyme.cleave``.

    Returns:
        ``1`` after writing a message to stderr when the type/frame
        combination is invalid, otherwise ``None``.
    """
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        # Six frames are handled as three offsets on each of the two strands.
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    # if we're splitting a genome
    if genome:
        import re
        regex = re.compile(r'([\*])')
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)
    enzyme = digest.Enzyme(enzyme=enzyme_choice)

    if genome:
        # Length filtering is applied after cleavage in genome mode so that
        # every peptide is seen and the running coordinate stays correct.
        # Both strands now share one ceiling (they used 99999 and 999999).
        cleave_min, cleave_max = 0, 999999
    else:
        cleave_min, cleave_max = digest_min, digest_max

    def split_translation(translation):
        """Split a translation on '*'; genome mode keeps the '*' suffix."""
        if not genome:
            return translation.split('*')
        # With one capture group, split() yields seg, '*', seg, '*', ..., seg
        # so the separators list is exactly one shorter than the segments.
        pieces = regex.split(translation)
        segments = pieces[0::2]
        separators = pieces[1::2] + ['']
        return [seg + sep for seg, sep in zip(segments, separators)]

    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                slen = len(sequence)
                templates = [('+', sequence)]
                if digest_negative:
                    templates.append(
                        ('-', fasta._reverse_complement(sequence)))
                for i in range(digest_frame):
                    for strand, template in templates:
                        forward = strand == '+'
                        # 1-based coordinate of the next residue to report;
                        # counts up on '+', down from the far end on '-'.
                        position = i + 1 if forward else slen - i
                        translation = split_translation(
                            fasta._translate(template[i:]))
                        for protein_index, protein_sequence in enumerate(
                                translation):
                            # NOTE(review): coordinates assume cleave()
                            # returns every peptide in order; confirm this
                            # still holds when unique=True.
                            peptides = enzyme.cleave(
                                protein_sequence, min=cleave_min,
                                max=cleave_max, unique=unique_digest)
                            for peptide_index, peptide in enumerate(
                                    peptides):
                                if not genome:
                                    o.write(
                                        '>%s F:%s%d Orf:%d Pep:%d \n%s\n' % (
                                            header, strand, i + 1,
                                            protein_index + 1,
                                            peptide_index + 1, peptide))
                                    continue
                                span = len(peptide) * 3
                                if forward:
                                    start = position
                                    end = position + span - 1
                                else:
                                    start = position - span + 1
                                    end = position
                                if len(peptide) >= digest_min:
                                    # The trailing '*' counts toward the
                                    # span but is not written out.
                                    if peptide.endswith('*'):
                                        text = peptide[:-1]
                                    else:
                                        text = peptide
                                    o.write(
                                        '>%s F:%s%d Start:%d End:%d \n%s\n'
                                        % (header, strand, i + 1, start, end,
                                           text))
                                # Advance past every peptide, including the
                                # ones too short to report, so that later
                                # coordinates stay aligned.
                                position += span if forward else -span
        else:
            for header, sequence in fasta_file:
                # unique is forwarded here too, as in the nucleotide path.
                peptides = enzyme.cleave(
                    sequence, min=digest_min, max=digest_max,
                    unique=unique_digest)
                for peptide_index, peptide in enumerate(peptides):
                    o.write('>%s Pep:%d \n%s\n' % (
                        header, peptide_index + 1, peptide))
def main():
    """Digest FASTA records with one or more enzymes and write the peptides.

    Arguments come from the module-level ``parser``:

    * ``args.fasta`` / ``args.out`` -- input file and output handle.
    * ``args.enzyme_pattern`` -- if set, a single
      ``digest.Enzyme(pattern=...)`` is used.
    * ``args.enzyme`` -- otherwise an iterable of enzyme names; each
      becomes a ``digest.Enzyme(enzyme=...)`` and they are applied in order,
      each one cleaving the peptides produced by the previous one.
    * ``args.type`` -- ``'prot'`` or ``'nt'``.
    * ``args.frame`` -- number of frames for nucleotide input; ``6`` means
      three frames on each strand.
    * ``args.min`` / ``args.max`` -- peptide length limits.
    * ``args.genome`` -- forces nucleotide mode and writes ``Start``/``End``
      coordinates instead of ``Orf``/``Pep`` indices.
    * ``args.unique`` -- forwarded to ``Enzyme.cleave``.

    Returns:
        ``1`` after writing a message to stderr when the arguments are
        invalid (bad type/frame combination or no enzyme given), otherwise
        ``None``.
    """
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    enzyme_pattern = args.enzyme_pattern
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        # Six frames are handled as three offsets on each of the two strands.
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    # if we're splitting a genome
    if genome:
        import re
        regex = re.compile(r'([\*])')
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)
    if enzyme_pattern:
        enzymes = [digest.Enzyme(pattern=enzyme_pattern)]
    elif enzyme_choice:
        enzymes = [
            digest.Enzyme(enzyme=protease) for protease in enzyme_choice
        ]
    else:
        # Previously this fell through and failed later with a NameError.
        sys.stderr.write("An enzyme or an enzyme pattern must be specified.\n")
        return 1

    if genome:
        # Length filtering is applied after cleavage in genome mode so that
        # every peptide is seen and the running coordinate stays correct.
        nt_kwargs = {'min': 0, 'max': 999999, 'unique': unique_digest}
    else:
        nt_kwargs = {
            'min': digest_min, 'max': digest_max, 'unique': unique_digest,
        }

    def cleave_with_all(target, enzyme_kwargs):
        """Cleave with the first enzyme, then re-cleave with each other one."""
        # NOTE(review): min/max are applied at every stage, so a peptide
        # rejected by an earlier enzyme is never offered to a later one.
        peptides = enzymes[0].cleave(target, **enzyme_kwargs)
        for enzyme in enzymes[1:]:
            peptides = [
                sub_seq
                for peptide_sequence in peptides
                for sub_seq in enzyme.cleave(
                    peptide_sequence, **enzyme_kwargs)
            ]
        return peptides

    def split_translation(translation):
        """Split a translation on '*'; genome mode keeps the '*' suffix."""
        if not genome:
            return translation.split('*')
        # With one capture group, split() yields seg, '*', seg, '*', ..., seg
        # so the separators list is exactly one shorter than the segments.
        pieces = regex.split(translation)
        segments = pieces[0::2]
        separators = pieces[1::2] + ['']
        return [seg + sep for seg, sep in zip(segments, separators)]

    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                slen = len(sequence)
                templates = [('+', sequence)]
                if digest_negative:
                    templates.append(
                        ('-', fasta._reverse_complement(sequence)))
                for i in range(digest_frame):
                    for strand, template in templates:
                        forward = strand == '+'
                        # 1-based coordinate of the next residue to report;
                        # counts up on '+', down from the far end on '-'.
                        position = i + 1 if forward else slen - i
                        translation = split_translation(
                            fasta._translate(template[i:]))
                        for protein_index, protein_sequence in enumerate(
                                translation):
                            # NOTE(review): coordinates assume every peptide
                            # comes back in order; confirm this still holds
                            # with unique=True or several enzymes.
                            peptides = cleave_with_all(
                                protein_sequence, nt_kwargs)
                            for peptide_index, peptide in enumerate(
                                    peptides):
                                if not genome:
                                    o.write(
                                        '>%s F:%s%d Orf:%d Pep:%d \n%s\n' % (
                                            header, strand, i + 1,
                                            protein_index + 1,
                                            peptide_index + 1, peptide))
                                    continue
                                span = len(peptide) * 3
                                if forward:
                                    start = position
                                    end = position + span - 1
                                else:
                                    start = position - span + 1
                                    end = position
                                if len(peptide) >= digest_min:
                                    # The trailing '*' counts toward the
                                    # span but is not written out.
                                    if peptide.endswith('*'):
                                        text = peptide[:-1]
                                    else:
                                        text = peptide
                                    o.write(
                                        '>%s F:%s%d Start:%d End:%d \n%s\n'
                                        % (header, strand, i + 1, start, end,
                                           text))
                                # Advance past every peptide, including the
                                # ones too short to report, so that later
                                # coordinates stay aligned.
                                position += span if forward else -span
        else:
            prot_kwargs = {
                'min': digest_min, 'max': digest_max, 'unique': unique_digest,
            }
            for header, sequence in fasta_file:
                peptides = cleave_with_all(sequence, prot_kwargs)
                for peptide_index, peptide in enumerate(peptides):
                    o.write('>%s Pep:%d \n%s\n' % (
                        header, peptide_index + 1, peptide))