Ejemplo n.º 1
0
def main():
    """Translate FASTA records frame by frame and write the resulting ORFs.

    Options come from the module-level ``parser``. Every record of
    ``args.fasta`` is translated in frames 1-3 of the given strand and, when
    ``args.both_strands`` is set, of its reverse complement too. Each
    translation is split at ``*`` characters (presumably stop codons) and
    every piece is offered to ``write_sequence``, which writes FASTA entries
    to ``args.out``.

    Filtering applied to each piece:
      * it must be at least ``args.min`` residues long;
      * it must start with ``M`` unless ``args.no_met_start`` is set;
      * with ``args.from_met`` a second candidate, trimmed to the first
        ``M``, is also considered, and pieces without any ``M`` are dropped;
      * ``args.from_met_keep`` turns on both ``from_met`` and ``no_met``.
    """
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    if args.from_met_keep:
        from_met = True
        no_met = True

    def write_sequence(handle, header, strand, frame, protein_index,
                       protein_sequence):
        """Write the ORF (and its Met-trimmed form, if requested).

        ``strand`` and ``frame`` are passed in explicitly rather than read
        from the enclosing loop so the helper does not depend on loop state.
        """
        location = 'F:%s%d Orf:%d' % (strand, frame, protein_index + 1)
        candidates = [('>%s %s' % (header, location), protein_sequence)]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                # No methionine at all: nothing is written for this piece.
                return
            trimmed_header = '>%s(%d upstream removed) %s' % (
                header, pos, location)
            candidates.append((trimmed_header, protein_sequence[pos:]))
        for candidate_header, candidate in candidates:
            # startswith() is safe on an empty piece, where indexing [0]
            # would raise IndexError when the minimum length is 0.
            if len(candidate) >= orf_min and (
                    no_met or candidate.startswith('M')):
                handle.write('%s\n%s\n' % (candidate_header, candidate))

    with args.out as o:
        for header, sequence in fasta_file:
            # The reverse complement does not depend on the frame, so build
            # it once per record instead of once per frame.
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in range(3):
                for strand, template in templates:
                    # Both strands must be split into ORFs; iterating the
                    # unsplit string would yield single characters.
                    translation = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(
                            translation):
                        write_sequence(o, header, strand, i + 1,
                                       protein_index, protein_sequence)
Ejemplo n.º 2
0
def main():
    """Write every Met-initiated ORF of the input FASTA in FASTA format.

    Options come from the module-level ``parser``. Each record of
    ``args.fasta`` is translated in frames 1-3 and, when
    ``args.both_strands`` is set, in the same three frames of the reverse
    complement. Translations are split at ``*`` characters (presumably stop
    codons); pieces that are at least ``args.min`` long and start with ``M``
    are written to ``args.out`` with a header giving strand, frame and
    1-based ORF index.
    """
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    with args.out as o:
        for header, sequence in fasta_file:
            # Build the list of strands once per record; the reverse
            # complement is independent of the reading frame.
            templates = [('+', sequence)]
            if negative_strand:
                templates.append(('-', fasta._reverse_complement(sequence)))
            for i in range(3):
                for strand, template in templates:
                    # Split on '*' for BOTH strands; enumerating the raw
                    # translation string would iterate single residues.
                    orfs = fasta._translate(template[i:]).split('*')
                    for protein_index, protein_sequence in enumerate(orfs):
                        # startswith() tolerates empty pieces (adjacent
                        # '*'), where [0] would raise IndexError.
                        if (len(protein_sequence) >= orf_min
                                and protein_sequence.startswith('M')):
                            o.write('>%s F:%s%d Orf:%d\n%s\n' % (
                                header, strand, i + 1, protein_index + 1,
                                protein_sequence))
Ejemplo n.º 3
0
def main():
    """Find ORFs in a FASTA file and write them, with optional Met handling.

    Options come from the module-level ``parser``:

    * ``args.fasta`` / ``args.out``: input records and output handle.
    * ``args.min``: minimum length of a written sequence.
    * ``args.both_strands``: also translate the reverse complement.
    * ``args.no_met_start``: do not require a leading ``M``.
    * ``args.from_met``: additionally consider each ORF trimmed to its first
      ``M``; ORFs containing no ``M`` are then skipped altogether.
    * ``args.from_met_keep``: enables both ``from_met`` and ``no_met``.

    Each record is translated in three frames per strand; translations are
    split at ``*`` characters (presumably stop codons).
    """
    args = parser.parse_args()
    file_name = args.fasta
    orf_min = args.min
    fasta_file = fasta.FastaIterator(file_name)
    negative_strand = args.both_strands
    no_met = args.no_met_start
    from_met = args.from_met
    from_met_keep = args.from_met_keep
    if from_met_keep:
        from_met = True
        no_met = True

    def passes_filters(protein_sequence):
        """Return True when the sequence is long enough and Met rules hold."""
        if len(protein_sequence) < orf_min:
            return False
        # startswith() instead of [0]: an empty ORF must not raise.
        return bool(no_met or protein_sequence.startswith('M'))

    def write_sequence(handle, header, strand, frame, protein_index,
                       protein_sequence):
        """Write one ORF and, if requested, its Met-trimmed variant."""
        orf_number = protein_index + 1
        entries = [('>%s F:%s%d Orf:%d' % (header, strand, frame, orf_number),
                    protein_sequence)]
        if from_met:
            pos = protein_sequence.find('M')
            if pos == -1:
                # Without any methionine neither form is written.
                return
            entries.append((
                '>%s(%d upstream removed) F:%s%d Orf:%d' % (
                    header, pos, strand, frame, orf_number),
                protein_sequence[pos:]))
        for entry_header, entry_sequence in entries:
            if passes_filters(entry_sequence):
                handle.write('%s\n%s\n' % (entry_header, entry_sequence))

    def write_frame(handle, header, strand, frame, nucleotides):
        """Translate one reading frame and write all of its ORFs."""
        # The split is required on both strands; without it the caller would
        # enumerate single characters of the translation.
        translation = fasta._translate(nucleotides).split('*')
        for protein_index, protein_sequence in enumerate(translation):
            write_sequence(handle, header, strand, frame, protein_index,
                           protein_sequence)

    with args.out as o:
        for header, sequence in fasta_file:
            reverse = None
            if negative_strand:
                reverse = fasta._reverse_complement(sequence)
            for i in range(3):
                write_frame(o, header, '+', i + 1, sequence[i:])
                if reverse is not None:
                    write_frame(o, header, '-', i + 1, reverse[i:])
Ejemplo n.º 4
0
def main():
    """Write the sequence of each GFF feature group as FASTA, with variants.

    Options come from the module-level ``parser``. For every entry of
    ``gff.feature_map`` the parts of the feature are collected (optionally
    restricted to ``args.feature``, or to everything except ``transcript``
    parts with ``args.cufflinks``), their sequences are fetched from
    ``args.fasta`` and concatenated in order of start coordinate. When
    ``args.vcf`` is given, variants overlapping each part are substituted
    into the sequence and listed in the FASTA header. Sequences of features
    on the ``-`` strand are reverse-complemented before being written to
    ``args.out``.

    Variant selection uses ``args.no_snps``, ``args.dels``, ``args.ins``,
    ``args.no_homozygous``, ``args.heterozygous`` and ``args.individual``
    (1-based on the command line, converted to a 0-based index here).
    With ``args.variants_only`` only features that received at least one
    variant are written.
    """
    args = parser.parse_args()
    snps = not args.no_snps
    dels = args.dels
    ins = args.ins
    homs = not args.no_homozygous
    hets = args.heterozygous
    individual = args.individual-1
    fasta_file = fasta.FastaIterator(args.fasta)
    splice_variants = args.splice_partial
    id_tag = args.group_on
    vars_only = args.variants_only
    chosen_feature = args.feature
    if args.cufflinks:
        gff = gp.GFFReader(args.gff, preset='cufflinks')
    else:
        gff = gp.GFFReader(args.gff, tag_map={'ID': id_tag, 'Parent': 'Parent'})
    vcf = None
    if args.vcf:
        vcf = gp.VCFReader(args.vcf, append_chromosome=args.append_chromosome)
    with args.out as o:
        for feature_name, feature in gff.feature_map.iteritems():
            header = feature_name
            if args.cufflinks:
                gff_objects = [(gff_object, gff_object.start) for gff_object in feature.parts()
                               if gff_object.feature_type != 'transcript']
            else:
                if chosen_feature:
                    gff_objects = [(gff_object, gff_object.start) for gff_object in feature.parts()
                                   if gff_object.feature_type == chosen_feature]
                else:
                    gff_objects = [(gff_object, gff_object.start) for gff_object in feature.parts()]
            if not gff_objects:
                continue
            # Always defined, so the variants-only check below cannot hit an
            # unbound name when no VCF file was supplied.
            variant_info = []
            if vcf:
                # For VCF, sort from the end to the start so variants can be
                # incorporated without having to worry about an offset.
                gff_objects.sort(key=operator.itemgetter(1), reverse=True)
                seq = []
                for gff_object, _ in gff_objects:
                    tseq = list(fasta_file.get_sequence(gff_object.seqid, gff_object.start, gff_object.end))
                    overlapping_variants = [(int(entry.pos), entry) for entry in
                                            vcf.contains(gff_object.seqid, gff_object.start, gff_object.end)]
                    # Sort our variants from end to start as well.
                    overlapping_variants.sort(key=operator.itemgetter(0), reverse=True)
                    to_remove = []
                    for position, vcf_entry in overlapping_variants:
                        valid_variant = False
                        # NOTE(review): each filter below skips the entry only
                        # when ALL three variant types are enabled and none is
                        # present; confirm that this is the intended rule.
                        if homs:
                            if vcf_entry.is_homozygous()[individual]:
                                if ((snps and not vcf_entry.has_snp(individual=individual)) and
                                        (dels and not vcf_entry.has_deletion(individual=individual)) and
                                        (ins and not vcf_entry.has_insertion(individual=individual))):
                                    continue
                                valid_variant = True
                        if hets:
                            if vcf_entry.is_heterozygous()[individual]:
                                if (not valid_variant and
                                        (snps and not vcf_entry.has_snp(individual=individual)) and
                                        (dels and not vcf_entry.has_deletion(individual=individual)) and
                                        (ins and not vcf_entry.has_insertion(individual=individual))):
                                    continue
                                valid_variant = True
                        if not valid_variant:
                            to_remove.append(vcf_entry)
                            continue
                        # Make the variant position relative to this part.
                        position -= gff_object.start
                        ref = vcf_entry.ref
                        lref = len(ref)
                        if splice_variants and position < 0:
                            if args.splice_partial:
                                # Variant starts upstream of the part: keep
                                # only the portion that falls inside it.
                                alt = max(vcf_entry.get_alt(individual=individual))
                                alt = ''.join(list(alt)[abs(position):])
                                lref += position
                                position=0
                        elif position > 0:
                            alt = max(vcf_entry.get_alt(individual=individual))
                        else:
                            # NOTE(review): this also skips position == 0,
                            # i.e. a variant whose position equals the part
                            # start; confirm against the coordinate
                            # convention of get_sequence().
                            continue
                        variant_info.append('%s %s %s->%s' % (vcf_entry.chrom, vcf_entry.pos, ref, alt))
                        tseq[position:position+lref] = list(alt)
                    vcf.remove_variants(to_remove)
                    seq.append(''.join(tseq))
                if variant_info:
                    header += '\t%s' % ';'.join(variant_info)
                # Parts were processed end-to-start; restore genomic order.
                seq.reverse()
                seq = ''.join(seq)
            else:
                gff_objects.sort(key=operator.itemgetter(1))
                seq = ''.join([fasta_file.get_sequence(gff_object.seqid, gff_object.start, gff_object.end)
                               for gff_object, _ in gff_objects])
            # Read the strand from the last sorted part explicitly. This is
            # the object the previous code reached through a loop variable
            # that is only still bound under Python 2 comprehension scoping.
            if gff_objects[-1][0].strand == '-':
                seq = fasta._reverse_complement(seq)
            if (seq and not vars_only) or (vars_only and variant_info):
                o.write('>%s\n%s\n' % (header, seq))
Ejemplo n.º 5
0
def main():
    """Digest the sequences of a FASTA file with an enzyme and write peptides.

    Options come from the module-level ``parser``.

    * ``args.type == 'prot'``: every record is cleaved directly.
    * ``args.type == 'nt'``: every record is translated in ``args.frame``
      frames (a frame value of 6 means three frames on each strand), split
      at ``*`` characters and each piece is cleaved.
    * ``args.genome``: forces nucleotide mode; pieces keep their trailing
      ``*`` during cleavage and each peptide is written with nucleotide
      Start/End coordinates instead of ORF/peptide indices. The minimum
      length ``args.min`` is applied after cleavage so that the running
      coordinate also advances over peptides that are not written.

    Returns 1 after writing a message to stderr when the frame option is
    inconsistent with the digestion type; otherwise returns None.
    """
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    # If we're splitting a genome the input is always nucleotide.
    if genome:
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)
    enzyme = digest.Enzyme( enzyme=enzyme_choice )

    # In genome mode no peptide may be filtered out by cleave(), otherwise
    # the running coordinate would drift. The same bounds are now used on
    # both strands (the forward strand previously used a smaller maximum).
    genome_min, genome_max = 0, 999999

    def split_keeping_stops(translation):
        """Split at '*' but leave each '*' attached to the preceding piece."""
        pieces = translation.split('*')
        return [piece + '*' for piece in pieces[:-1]] + pieces[-1:]

    def write_genome_peptides(handle, header, strand, frame, peptides,
                              position):
        """Write coordinate-labelled peptides; return the updated position."""
        for peptide in peptides:
            span = len(peptide) * 3
            if strand == '+':
                start, end = position, position + span - 1
                position += span
            else:
                start, end = position - span + 1, position
                position -= span
            if len(peptide) >= digest_min:
                # The trailing '*' counts towards length and coordinates
                # but is not part of the written sequence.
                residues = peptide[:-1] if peptide.endswith('*') else peptide
                handle.write('>%s F:%s%d Start:%d End:%d \n%s\n' % (
                    header, strand, frame, start, end, residues))
        return position

    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                slen = len(sequence)
                templates = [('+', sequence)]
                if digest_negative:
                    templates.append(('-', fasta._reverse_complement(sequence)))
                for i in range(digest_frame):
                    for strand, template in templates:
                        translation = fasta._translate(template[i:])
                        if genome:
                            # Forward coordinates count up from the frame
                            # offset; reverse ones count down from the end.
                            position = i + 1 if strand == '+' else slen - i
                            for protein_sequence in split_keeping_stops(translation):
                                peptides = enzyme.cleave(
                                    protein_sequence, min=genome_min,
                                    max=genome_max, unique=unique_digest)
                                position = write_genome_peptides(
                                    o, header, strand, i + 1, peptides,
                                    position)
                        else:
                            for protein_index, protein_sequence in enumerate(
                                    translation.split('*')):
                                peptides = enzyme.cleave(
                                    protein_sequence, min=digest_min,
                                    max=digest_max, unique=unique_digest)
                                for peptide_index, peptide in enumerate(peptides):
                                    o.write('>%s F:%s%d Orf:%d Pep:%d \n%s\n' % (
                                        header, strand, i + 1,
                                        protein_index + 1, peptide_index + 1,
                                        peptide))
        else:
            for header, sequence in fasta_file:
                # Honour the unique option here as well; it was previously
                # applied to nucleotide digestions only.
                peptides = enzyme.cleave(sequence, min=digest_min,
                                         max=digest_max, unique=unique_digest)
                for peptide_index, peptide in enumerate(peptides):
                    o.write('>%s Pep:%d \n%s\n' % (header, peptide_index + 1, peptide))
Ejemplo n.º 6
0
def main():
    """Digest FASTA sequences with one or more enzymes and write peptides.

    Options come from the module-level ``parser``. The enzymes are built
    either from ``args.enzyme_pattern`` (a single custom pattern) or from
    each name in ``args.enzyme``; with several enzymes the peptides produced
    by one enzyme are cleaved again by the next.

    * ``args.type == 'prot'``: every record is digested directly.
    * ``args.type == 'nt'``: every record is translated in ``args.frame``
      frames (a frame value of 6 means three frames on each strand), split
      at ``*`` characters and each piece is digested.
    * ``args.genome``: forces nucleotide mode; pieces keep their trailing
      ``*`` during digestion and peptides are written with nucleotide
      Start/End coordinates. ``args.min`` is applied after digestion so the
      running coordinate also advances over peptides that are not written.

    Returns 1 after writing a message to stderr when the options are
    inconsistent or no enzyme was given; otherwise returns None.
    """
    args = parser.parse_args()
    file_name = args.fasta
    enzyme_choice = args.enzyme
    enzyme_pattern = args.enzyme_pattern
    digest_type = args.type
    digest_frame = args.frame
    digest_negative = False
    if digest_frame == 6:
        digest_negative = True
        digest_frame = 3
    digest_min = args.min
    digest_max = args.max
    genome = args.genome
    unique_digest = args.unique
    # If we're splitting a genome the input is always nucleotide.
    if genome:
        digest_type = 'nt'
    if digest_type == 'prot' and digest_frame:
        sys.stderr.write("Protein digestions cannot have a frame.\n")
        return 1
    if digest_type == 'nt' and not digest_frame:
        sys.stderr.write("Nucleotide digestions must specify the frame.\n")
        return 1
    if enzyme_pattern:
        enzymes = [digest.Enzyme(pattern=enzyme_pattern)]
    elif enzyme_choice:
        enzymes = [
            digest.Enzyme(enzyme=protease) for protease in enzyme_choice
        ]
    else:
        # Without this branch ``enzymes`` would be unbound and the first
        # digestion would fail with a NameError.
        sys.stderr.write("An enzyme or an enzyme pattern must be specified.\n")
        return 1
    fasta_file = fasta.FastaIterator(file_name)

    if genome:
        # Nothing may be filtered by cleave() in genome mode, otherwise the
        # running coordinate would drift.
        enzyme_kwargs = {'min': 0, 'max': 999999, 'unique': unique_digest}
    else:
        enzyme_kwargs = {
            'min': digest_min,
            'max': digest_max,
            'unique': unique_digest
        }

    def digest_sequence(protein_sequence, **cleave_kwargs):
        """Cleave with the first enzyme, then re-cleave with each next one."""
        # NOTE(review): the length limits are applied at every stage, so a
        # peptide rejected after one enzyme never reaches the next; confirm
        # this is intended for multi-enzyme digests.
        peptides = enzymes[0].cleave(protein_sequence, **cleave_kwargs)
        for enzyme in enzymes[1:]:
            peptides = [
                sub_seq for peptide_sequence in peptides
                for sub_seq in enzyme.cleave(peptide_sequence,
                                             **cleave_kwargs)
            ]
        return peptides

    def split_keeping_stops(translation):
        """Split at '*' but leave each '*' attached to the preceding piece."""
        pieces = translation.split('*')
        return [piece + '*' for piece in pieces[:-1]] + pieces[-1:]

    def write_genome_peptides(handle, header, strand, frame, peptides,
                              position):
        """Write coordinate-labelled peptides; return the updated position."""
        for peptide in peptides:
            span = len(peptide) * 3
            if strand == '+':
                start, end = position, position + span - 1
                position += span
            else:
                start, end = position - span + 1, position
                position -= span
            if len(peptide) >= digest_min:
                # The trailing '*' counts towards length and coordinates
                # but is not part of the written sequence.
                residues = peptide[:-1] if peptide.endswith('*') else peptide
                handle.write('>%s F:%s%d Start:%d End:%d \n%s\n' % (
                    header, strand, frame, start, end, residues))
        return position

    with args.out as o:
        if digest_type == 'nt':
            for header, sequence in fasta_file:
                slen = len(sequence)
                templates = [('+', sequence)]
                if digest_negative:
                    templates.append(
                        ('-', fasta._reverse_complement(sequence)))
                for i in range(digest_frame):
                    for strand, template in templates:
                        translation = fasta._translate(template[i:])
                        if genome:
                            # Forward coordinates count up from the frame
                            # offset; reverse ones count down from the end.
                            position = i + 1 if strand == '+' else slen - i
                            for protein_sequence in split_keeping_stops(
                                    translation):
                                peptides = digest_sequence(
                                    protein_sequence, **enzyme_kwargs)
                                position = write_genome_peptides(
                                    o, header, strand, i + 1, peptides,
                                    position)
                        else:
                            for protein_index, protein_sequence in enumerate(
                                    translation.split('*')):
                                peptides = digest_sequence(
                                    protein_sequence, **enzyme_kwargs)
                                for peptide_index, peptide in enumerate(
                                        peptides):
                                    o.write(
                                        '>%s F:%s%d Orf:%d Pep:%d \n%s\n' %
                                        (header, strand, i + 1,
                                         protein_index + 1,
                                         peptide_index + 1, peptide))
        else:
            for header, sequence in fasta_file:
                peptides = digest_sequence(sequence, **enzyme_kwargs)
                for peptide_index, peptide in enumerate(peptides):
                    o.write('>%s Pep:%d \n%s\n' %
                            (header, peptide_index + 1, peptide))