def main():
    """Translate each FASTA record in three (or six) frames and write its ORFs.

    Each nucleotide sequence is translated at offsets 0, 1 and 2 and, when
    ``args.both_strands`` is set, the same is done on its reverse complement.
    Every translation is split on ``'*'`` and each fragment is handed to
    ``write_sequence``, which applies the length and methionine filters.

    Options read from the parsed arguments:
      * ``min``            -- minimum fragment length to report.
      * ``no_met_start``   -- report fragments even if they do not start
                              with ``'M'``.
      * ``from_met``       -- additionally report the fragment trimmed to its
                              first ``'M'``.
      * ``from_met_keep``  -- turns on both ``from_met`` and ``no_met``.
    """
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    from_met_keep = args.from_met_keep
    if from_met_keep:
        from_met = True
        no_met = True

    def write_sequence(handle, header, protein_index, protein_sequence):
        """Write one fragment (and optionally its Met-trimmed form) to handle.

        ``strand`` and ``i`` are read from the enclosing loops at call time.
        """
        header1 = '>%s F:%s%d Orf:%d' % (
            header, strand, i + 1, protein_index + 1)
        protein_sequences = [(header1, protein_sequence)]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                # No methionine anywhere: nothing is reported for this
                # fragment, not even the untrimmed form.
                return
            header2 = '>%s(%d upstream removed) F:%s%d Orf:%d' % (
                header, pos, strand, i + 1, protein_index + 1)
            protein_sequences.append((header2, protein_sequence[pos:]))
        for protein_header, candidate in protein_sequences:
            # startswith() is safe on the empty fragments produced by
            # consecutive '*' characters; indexing [0] would raise there.
            if len(candidate) >= orf_min and (
                    no_met or candidate.startswith('M')):
                handle.write('%s\n%s\n' % (protein_header, candidate))

    with args.out as o:
        for header, sequence in fasta_file:
            # Build the strand templates once per record rather than
            # reverse-complementing again for every frame.
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in xrange(3):
                for strand, template in templates:
                    # Both strands must be split on '*'; iterating the raw
                    # translation string would yield single residues.
                    translation = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        write_sequence(o, header, protein_index,
                                       protein_sequence)
def main():
    """Write every fragment that starts with 'M' and meets the minimum length.

    Each FASTA record is translated at offsets 0, 1 and 2 and, when
    ``args.both_strands`` is set, also on its reverse complement. Each
    translation is split on ``'*'``; a fragment is written when it is at
    least ``args.min`` residues long and begins with ``'M'``. Output headers
    carry the strand, the 1-based frame and the 1-based fragment index.
    """
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    with args.out as o:
        for header, sequence in fasta_file:
            # Build the strand templates once per record rather than
            # reverse-complementing again for every frame.
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in xrange(3):
                for strand, template in templates:
                    # Both strands must be split on '*'; iterating the raw
                    # translation string would yield single residues.
                    translation = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        # startswith() tolerates the empty fragments that
                        # adjacent '*' characters produce; [0] would raise.
                        if (len(protein_sequence) >= orf_min and
                                protein_sequence.startswith('M')):
                            o.write('>%s F:%s%d Orf:%d\n%s\n' % (
                                header, strand, i + 1, protein_index + 1,
                                protein_sequence))
def main():
    """Translate each FASTA record in three (or six) frames and write its ORFs.

    The sequence is translated at offsets 0, 1 and 2; with
    ``args.both_strands`` the reverse complement is translated as well.
    Translations are split on ``'*'`` and every fragment goes through
    ``write_sequence``, which decides what is emitted:

      * fragments shorter than ``args.min`` are dropped;
      * unless ``args.no_met_start`` is set, a fragment must begin with
        ``'M'``;
      * with ``args.from_met`` the fragment trimmed to its first ``'M'`` is
        reported too, under a header recording how many residues were
        removed;
      * ``args.from_met_keep`` enables both ``from_met`` and ``no_met``.
    """
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    from_met_keep = args.from_met_keep
    if from_met_keep:
        from_met = True
        no_met = True

    def write_sequence(handle, header, protein_index, protein_sequence):
        """Emit one fragment, plus its Met-trimmed variant when requested.

        Reads ``strand`` and ``i`` from the enclosing loops at call time.
        """
        header1 = '>%s F:%s%d Orf:%d' % (
            header, strand, i + 1, protein_index + 1)
        protein_sequences = [(header1, protein_sequence)]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                # Without any methionine the fragment is skipped entirely.
                return
            header2 = '>%s(%d upstream removed) F:%s%d Orf:%d' % (
                header, pos, strand, i + 1, protein_index + 1)
            protein_sequences.append((header2, protein_sequence[pos:]))
        for protein_header, candidate in protein_sequences:
            # Use startswith() so an empty fragment is rejected instead of
            # raising IndexError on candidate[0].
            if len(candidate) >= orf_min and (
                    no_met or candidate.startswith('M')):
                handle.write('%s\n%s\n' % (protein_header, candidate))

    with args.out as o:
        for header, sequence in fasta_file:
            # The reverse complement does not depend on the frame, so it is
            # computed once per record.
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in xrange(3):
                for strand, template in templates:
                    # The minus strand needs the same '*' split as the plus
                    # strand; otherwise enumerate() walks single characters.
                    translation = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        write_sequence(o, header, protein_index,
                                       protein_sequence)
def main():
    """Write the sequence of every GFF feature, optionally with VCF variants.

    For each entry of ``gff.feature_map`` the parts of the feature are
    selected (everything but ``'transcript'`` parts in cufflinks mode,
    otherwise only ``args.feature`` parts when that option is given), their
    sequences are fetched from the FASTA file and concatenated, and the
    result is written as one FASTA record.

    When a VCF file is supplied, variants overlapping each part are
    substituted into the part's sequence before concatenation, and a
    description of every applied variant is appended to the header.
    With ``args.variants_only`` only features that received at least one
    variant are written.
    """
    args = parser.parse_args()
    snps = not args.no_snps
    dels = args.dels
    ins = args.ins
    homs = not args.no_homozygous
    hets = args.heterozygous
    individual = args.individual - 1  # command line is 1-based
    fasta_file = fasta.FastaIterator(args.fasta)
    splice_variants = args.splice_partial
    id_tag = args.group_on
    vars_only = args.variants_only
    chosen_feature = args.feature
    if args.cufflinks:
        gff = gp.GFFReader(args.gff, preset='cufflinks')
    else:
        gff = gp.GFFReader(args.gff,
                           tag_map={'ID': id_tag, 'Parent': 'Parent'})
    vcf = None
    if args.vcf:
        vcf = gp.VCFReader(args.vcf,
                           append_chromosome=args.append_chromosome)

    def lacks_requested_variants(vcf_entry):
        """Return a truthy value when the entry should be skipped.

        NOTE(review): this is truthy only when snps, dels and ins are ALL
        enabled and the entry has none of them; the logic is kept exactly
        as it was -- confirm that an ``or``-style filter was not intended.
        """
        return ((snps and not vcf_entry.has_snp(individual=individual)) and
                (dels and
                 not vcf_entry.has_deletion(individual=individual)) and
                (ins and
                 not vcf_entry.has_insertion(individual=individual)))

    with args.out as o:
        for feature_name, feature in gff.feature_map.iteritems():
            header = feature_name
            if args.cufflinks:
                gff_objects = [(gff_object, gff_object.start)
                               for gff_object in feature.parts()
                               if gff_object.feature_type != 'transcript']
            elif chosen_feature:
                gff_objects = [(gff_object, gff_object.start)
                               for gff_object in feature.parts()
                               if gff_object.feature_type == chosen_feature]
            else:
                gff_objects = [(gff_object, gff_object.start)
                               for gff_object in feature.parts()]
            if not gff_objects:
                continue
            # Bound for every feature so the variants-only test below is
            # valid even when no VCF file was given.
            variant_info = []
            if vcf:
                # for vcf, we want to sort from the end to the start so we
                # can incorporate variants without having to worry about an
                # offset
                gff_objects.sort(key=operator.itemgetter(1), reverse=True)
                seq = []
                for gff_object, _ in gff_objects:
                    tseq = list(fasta_file.get_sequence(
                        gff_object.seqid, gff_object.start, gff_object.end))
                    overlapping_variants = [
                        (int(entry.pos), entry)
                        for entry in vcf.contains(gff_object.seqid,
                                                  gff_object.start,
                                                  gff_object.end)]
                    # sort our variants from end to start as well
                    overlapping_variants.sort(key=operator.itemgetter(0),
                                              reverse=True)
                    to_remove = []
                    for position, vcf_entry in overlapping_variants:
                        valid_variant = False
                        if homs and vcf_entry.is_homozygous()[individual]:
                            if lacks_requested_variants(vcf_entry):
                                continue
                            valid_variant = True
                        if hets and vcf_entry.is_heterozygous()[individual]:
                            # Only re-run the filter when the homozygous
                            # branch has not already accepted the entry.
                            if (not valid_variant and
                                    lacks_requested_variants(vcf_entry)):
                                continue
                            valid_variant = True
                        if not valid_variant:
                            to_remove.append(vcf_entry)
                            continue
                        # Offset of the variant relative to this part.
                        position -= gff_object.start
                        ref = vcf_entry.ref
                        lref = len(ref)
                        if splice_variants and position < 0:
                            # The variant begins before the part: keep only
                            # the portion of the alternate allele that falls
                            # inside the part.
                            alt = max(vcf_entry.get_alt(
                                individual=individual))
                            alt = ''.join(list(alt)[abs(position):])
                            lref += position
                            position = 0
                        elif position > 0:
                            alt = max(vcf_entry.get_alt(
                                individual=individual))
                        else:
                            # NOTE(review): this also skips position == 0,
                            # i.e. a variant on the first base of the part
                            # -- confirm whether ``>= 0`` was intended.
                            continue
                        variant_info.append('%s %s %s->%s' % (
                            vcf_entry.chrom, vcf_entry.pos, ref, alt))
                        tseq[position:position + lref] = list(alt)
                    vcf.remove_variants(to_remove)
                    seq.append(''.join(tseq))
                if variant_info:
                    header += '\t%s' % ';'.join(variant_info)
                # Parts were processed end-to-start; restore genomic order.
                seq.reverse()
                seq = ''.join(seq)
            else:
                gff_objects.sort(key=operator.itemgetter(1))
                seq = ''.join([
                    fasta_file.get_sequence(part.seqid, part.start, part.end)
                    for part, _ in gff_objects])
            # The strand is taken from the last part in the current sort
            # order. This is the object the original code reached through a
            # leaked loop variable, so the result is unchanged, but it no
            # longer depends on Python 2 comprehension scoping.
            if gff_objects[-1][0].strand == '-':
                seq = fasta._reverse_complement(seq)
            if (seq and not vars_only) or (vars_only and variant_info):
                o.write('>%s\n%s\n' % (header, seq))
def main():
    """Digest FASTA sequences with one enzyme and write the peptides.

    Modes, selected from the parsed arguments:
      * ``type == 'prot'``: each record is cleaved directly.
      * ``type == 'nt'``: each record is translated in ``frame`` frames
        (a frame of 6 means three frames on both strands), split on
        ``'*'`` and every fragment is cleaved.
      * ``genome``: forces nucleotide mode, keeps the ``'*'`` attached to
        the fragment it terminates and reports nucleotide Start/End
        coordinates for each peptide instead of Orf/Pep indices.

    Returns 1 after writing a message to stderr when the frame option is
    inconsistent with the digestion type; otherwise returns None.
    """
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    # if we're splitting a genome
    if genome:
        import re
        regex = re.compile(r'([\*])')
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)
    enzyme = digest.Enzyme(enzyme=enzyme_choice)

    def split_translation(translation):
        """Split a translation into fragments at every '*'."""
        if genome:
            # The capturing group keeps each '*' as its own list item;
            # re-join it to the fragment it terminates so peptide lengths
            # still account for the stop codon's three nucleotides.
            pieces = regex.split(translation)
            return [''.join(pair) for pair in itertools.izip_longest(
                pieces[0::2], pieces[1::2], fillvalue='')]
        return translation.split('*')

    def cleave(protein_sequence):
        """Cleave one fragment with the limits that apply to the mode."""
        if genome:
            # Length filtering happens at write time in genome mode so that
            # every peptide still advances the coordinate counter. The same
            # cap is used for both strands.
            return enzyme.cleave(protein_sequence, min=0, max=999999,
                                 unique=unique_digest)
        return enzyme.cleave(protein_sequence, min=digest_min,
                             max=digest_max, unique=unique_digest)

    def strip_stop(peptide):
        """Drop a trailing '*' before the peptide is written."""
        return peptide[:-1] if peptide.endswith('*') else peptide

    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                if genome:
                    slen = len(sequence)
                for i in xrange(digest_frame):
                    strand = '+'
                    translation = split_translation(
                        fasta._translate(sequence[i:]))
                    if genome:
                        position = i + 1
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        for peptide_index, peptide in enumerate(
                                cleave(protein_sequence)):
                            if genome:
                                span = len(peptide) * 3
                                if len(peptide) >= digest_min:
                                    o.write(
                                        '>%s F:%s%d Start:%d End:%d \n%s\n'
                                        % (header, strand, i + 1, position,
                                           position + span - 1,
                                           strip_stop(peptide)))
                                # Advance for every peptide, written or not.
                                position += span
                            else:
                                o.write(
                                    '>%s F:%s%d Orf:%d Pep:%d \n%s\n'
                                    % (header, strand, i + 1,
                                       protein_index + 1,
                                       peptide_index + 1, peptide))
                    if digest_negative:
                        strand = '-'
                        translation = split_translation(fasta._translate(
                            fasta._reverse_complement(sequence)[i:]))
                        if genome:
                            # Coordinates count down from the far end.
                            position = slen - i
                        for protein_index, protein_sequence in enumerate(
                                translation):
                            for peptide_index, peptide in enumerate(
                                    cleave(protein_sequence)):
                                if genome:
                                    span = len(peptide) * 3
                                    if len(peptide) >= digest_min:
                                        o.write(
                                            '>%s F:%s%d Start:%d End:%d '
                                            '\n%s\n'
                                            % (header, strand, i + 1,
                                               position - span + 1,
                                               position,
                                               strip_stop(peptide)))
                                    position -= span
                                else:
                                    o.write(
                                        '>%s F:%s%d Orf:%d Pep:%d \n%s\n'
                                        % (header, strand, i + 1,
                                           protein_index + 1,
                                           peptide_index + 1, peptide))
        else:
            for header, sequence in fasta_file:
                # Honour the uniqueness option here as well, as the
                # nucleotide branch does.
                peptides = enzyme.cleave(sequence, min=digest_min,
                                         max=digest_max,
                                         unique=unique_digest)
                for peptide_index, peptide in enumerate(peptides):
                    o.write('>%s Pep:%d \n%s\n'
                            % (header, peptide_index + 1, peptide))
def main():
    """Digest FASTA sequences with one or more enzymes and write the peptides.

    The cleavage rule comes either from ``args.enzyme_pattern`` (a single
    custom pattern) or from ``args.enzyme`` (a sequence of enzyme names
    applied one after another to the products of the previous one).

    Modes, selected from the parsed arguments:
      * ``type == 'prot'``: each record is digested directly.
      * ``type == 'nt'``: each record is translated in ``frame`` frames
        (a frame of 6 means three frames on both strands), split on
        ``'*'`` and every fragment is digested.
      * ``genome``: forces nucleotide mode, keeps the ``'*'`` attached to
        the fragment it terminates and reports nucleotide Start/End
        coordinates for each peptide instead of Orf/Pep indices.

    Returns 1 after writing a message to stderr when the options are
    inconsistent or no enzyme is given; otherwise returns None.
    """
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    enzyme_pattern = args.enzyme_pattern
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    # if we're splitting a genome
    if genome:
        import re
        regex = re.compile(r'([\*])')
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)
    if enzyme_pattern:
        enzymes = [digest.Enzyme(pattern=enzyme_pattern)]
    elif enzyme_choice:
        enzymes = [digest.Enzyme(enzyme=protease)
                   for protease in enzyme_choice]
    else:
        # Without this guard ``enzymes`` would be unbound further down.
        sys.stderr.write("An enzyme or an enzyme pattern must be given.\n")
        return 1

    # In genome mode the length filter is applied at write time so that
    # every peptide still advances the coordinate counter.
    if genome:
        enzyme_kwargs = {'min': 0, 'max': 999999, 'unique': unique_digest}
    else:
        enzyme_kwargs = {'min': digest_min, 'max': digest_max,
                         'unique': unique_digest}

    def split_translation(translation):
        """Split a translation into fragments at every '*'."""
        if genome:
            # The capturing group keeps each '*' as its own list item;
            # re-join it to the fragment it terminates.
            pieces = regex.split(translation)
            return [''.join(pair) for pair in itertools.izip_longest(
                pieces[0::2], pieces[1::2], fillvalue='')]
        return translation.split('*')

    def sequential_digest(protein_sequence):
        """Cleave with the first enzyme, then re-cleave with each next one.

        NOTE(review): the min/max limits are applied at every stage, so an
        intermediate product outside the limits is presumably discarded
        before later enzymes see it -- confirm this is intended.
        """
        peptides = enzymes[0].cleave(protein_sequence, **enzyme_kwargs)
        for enzyme in enzymes[1:]:
            peptides = [
                sub_seq
                for peptide_sequence in peptides
                for sub_seq in enzyme.cleave(peptide_sequence,
                                             **enzyme_kwargs)
            ]
        return peptides

    def strip_stop(peptide):
        """Drop a trailing '*' before the peptide is written."""
        return peptide[:-1] if peptide.endswith('*') else peptide

    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                if genome:
                    slen = len(sequence)
                for i in xrange(digest_frame):
                    strand = '+'
                    translation = split_translation(
                        fasta._translate(sequence[i:]))
                    if genome:
                        position = i + 1
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        for peptide_index, peptide in enumerate(
                                sequential_digest(protein_sequence)):
                            if genome:
                                span = len(peptide) * 3
                                if len(peptide) >= digest_min:
                                    o.write(
                                        '>%s F:%s%d Start:%d End:%d \n%s\n'
                                        % (header, strand, i + 1, position,
                                           position + span - 1,
                                           strip_stop(peptide)))
                                # Advance for every peptide, written or not.
                                position += span
                            else:
                                o.write(
                                    '>%s F:%s%d Orf:%d Pep:%d \n%s\n'
                                    % (header, strand, i + 1,
                                       protein_index + 1,
                                       peptide_index + 1, peptide))
                    if digest_negative:
                        strand = '-'
                        translation = split_translation(fasta._translate(
                            fasta._reverse_complement(sequence)[i:]))
                        if genome:
                            # Coordinates count down from the far end.
                            position = slen - i
                        for protein_index, protein_sequence in enumerate(
                                translation):
                            for peptide_index, peptide in enumerate(
                                    sequential_digest(protein_sequence)):
                                if genome:
                                    span = len(peptide) * 3
                                    if len(peptide) >= digest_min:
                                        o.write(
                                            '>%s F:%s%d Start:%d End:%d '
                                            '\n%s\n'
                                            % (header, strand, i + 1,
                                               position - span + 1,
                                               position,
                                               strip_stop(peptide)))
                                    position -= span
                                else:
                                    o.write(
                                        '>%s F:%s%d Orf:%d Pep:%d \n%s\n'
                                        % (header, strand, i + 1,
                                           protein_index + 1,
                                           peptide_index + 1, peptide))
        else:
            for header, sequence in fasta_file:
                for peptide_index, peptide in enumerate(
                        sequential_digest(sequence)):
                    o.write('>%s Pep:%d \n%s\n'
                            % (header, peptide_index + 1, peptide))