def main():
    """Reverse or reverse-complement selected sequences within a multi-FASTA.

    Command-line arguments:
        -f/--fasta_file   input FASTA file (required)
        -i/--id_file      file listing one sequence ID per line (required)
        -a/--action       'reverse' or 'revcomp' (required)
        -o/--output_file  output path; STDOUT is used when omitted

    Every sequence from the FASTA file is written out, whether or not it was
    modified.  Only the sequence ID is written on the '>' line, and residues
    are wrapped at 60 per line.
    """
    parser = argparse.ArgumentParser(
        description=
        'Reverse or reverse-complement selected sequences within a multi-FASTA'
    )

    ## command-line options
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-i',
                        '--id_file',
                        type=str,
                        required=True,
                        help='Path to file with IDs to process')
    parser.add_argument(
        '-a',
        '--action',
        type=str,
        required=True,
        choices=['reverse', 'revcomp'],
        help='What should be done to the sequences in the ID file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.fasta_file)

    ## a set gives constant-time membership tests in the loop below, and the
    ## with-block guarantees the ID file handle is closed
    with open(args.id_file) as id_fh:
        ids = {line.rstrip() for line in id_fh}

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        for seq_id, seq in seqs.items():
            if seq_id in ids:
                if args.action == 'reverse':
                    seq['s'] = seq['s'][::-1]
                elif args.action == 'revcomp':
                    seq['s'] = utils.reverse_complement(seq['s'])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq['s']), 60):
                fout.write(seq['s'][i:i + 60] + "\n")
    finally:
        ## close only a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
# Example no. 2
# 0
def main():
    """Extract regions from a multi-FASTA file using a tab-delimited coordinate file.

    Command-line arguments (all column numbers are 1-based):
        -f/--fasta_file       input FASTA file (required)
        -c/--coords_file      tab-delimited file with coordinates (required)
        -m/--mol_col          column holding the molecule identifier (required)
        -x/--start_coord_col  column holding the start position (required)
        -y/--stop_coord_col   column holding the stop position (required)
        -n/--name_col         optional column holding the name of each fragment
        -o/--output_file      output path; STDOUT is used when omitted

    Lines with fewer than three tab-separated columns are skipped.  Each
    remaining line produces one FASTA record wrapped at 60 residues per line;
    the fragment is reverse-complemented when the converted strand is -1.

    Raises:
        Exception: if a molecule ID from the coordinate file is not present
            in the FASTA file.
    """
    parser = argparse.ArgumentParser( description='Extract regions from a multi-FASTA file')

    ## command-line options
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-c', '--coords_file', type=str, required=True, help='Path to a tab-delimited file with coordinates' )
    parser.add_argument('-m', '--mol_col', type=int, required=True, help='Tabdel file column with molecule identifiers' )
    parser.add_argument('-x', '--start_coord_col', type=int, required=True, help='Tabdel file column with coordinate start positions' )
    parser.add_argument('-y', '--stop_coord_col', type=int, required=True, help='Tabdel file column with coordinate stop positions' )
    parser.add_argument('-n', '--name_col', type=int, required=False, default=None, help='Optional tabdel file column with name for exported fragment' )
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional Path to an output file to be created' )
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.fasta_file)

    ## convert the user's 1-based column numbers into list indexes once
    start_col = args.start_coord_col - 1
    stop_col  = args.stop_coord_col - 1
    mol_col   = args.mol_col - 1
    name_col  = None if args.name_col is None else args.name_col - 1

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        ## the with-block guarantees the coordinate file handle is closed
        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                line = line.rstrip()
                cols = line.split('\t')

                if len(cols) < 3:
                    continue

                ## presumably converts 1-based inclusive coordinates to 0-based
                ## interbase ones and derives the strand from their order --
                ## confirm against utils.humancoords_to_0interbase
                (fmin, fmax, strand) = utils.humancoords_to_0interbase(int(cols[start_col]), int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception("ERROR: molecule ID ({0}) not found in FASTA file".format(mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format( mol_id, fmin, fmax, strand  )
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i : i + 60] + "\n")
    finally:
        ## close only a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
# Example no. 3
# 0
def main():
    """Reverse or reverse-complement selected sequences within a multi-FASTA.

    Command-line arguments:
        -f/--fasta_file   input FASTA file (required)
        -i/--id_file      file listing one sequence ID per line (required)
        -a/--action       "reverse" or "revcomp" (required)
        -o/--output_file  output path; STDOUT is used when omitted

    Every sequence from the FASTA file is written out, whether or not it was
    modified.  Only the sequence ID is written on the ">" line, and residues
    are wrapped at 60 per line.
    """
    parser = argparse.ArgumentParser(
        description="Reverse or reverse-complement selected sequences within a multi-FASTA"
    )

    ## command-line options
    parser.add_argument("-f", "--fasta_file", type=str, required=True, help="Path to an input FASTA file")
    parser.add_argument("-i", "--id_file", type=str, required=True, help="Path to file with IDs to process")
    parser.add_argument(
        "-a",
        "--action",
        type=str,
        required=True,
        choices=["reverse", "revcomp"],
        help="What should be done to the sequences in the ID file",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=False,
        default=None,
        help="Optional Path to an output file to be created",
    )
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.fasta_file)

    ## a set gives constant-time membership tests in the loop below, and the
    ## with-block guarantees the ID file handle is closed
    with open(args.id_file) as id_fh:
        ids = {line.rstrip() for line in id_fh}

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, "wt")

    try:
        for seq_id, seq in seqs.items():
            if seq_id in ids:
                if args.action == "reverse":
                    seq["s"] = seq["s"][::-1]
                elif args.action == "revcomp":
                    seq["s"] = utils.reverse_complement(seq["s"])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq["s"]), 60):
                fout.write(seq["s"][i : i + 60] + "\n")
    finally:
        ## close only a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
# Example no. 4
# 0
    def get_residues(self):
        """Extract, store and return this gene's residues from its molecule.

        The gene must be located on exactly one molecule, and that molecule
        must have residues stored.  The slice [fmin:fmax] of the molecule's
        residues is taken and reverse-complemented when the location's strand
        is -1.  As a side effect, self.residues and self.length are set.

        Returns:
            The residues of this gene.

        Raises:
            Exception: if the gene has no location, is located on more than
                one molecule, or its molecule has no stored residues.
        """
        if len(self.locations) == 0:
            raise Exception("ERROR: gene.get_residues() requested but gene {0} isn't located on anything.".format(self.id))
        elif len(self.locations) > 1:
            raise Exception("ERROR: gene {0} is located on multiple molecules.  Can't automatically extract the residues.".format(self.id))

        loc = self.location()
        mol = loc.on

        # make sure this thing has its residues populated; an unset (None)
        # value is reported with the same clear error as an empty one instead
        # of failing inside len() with a TypeError
        if mol.residues is None or len(mol.residues) == 0:
            raise Exception("ERROR: gene.get_residues() requested but its molecule {0} has no stored residues".format(mol.id))

        self.residues = mol.residues[loc.fmin:loc.fmax]
        # length is taken before any reverse-complement, which should not change it
        self.length = len(self.residues)

        if loc.strand == -1:
            self.residues = utils.reverse_complement(self.residues)

        return self.residues
# Example no. 5
# 0
def main():
    """Extend GFF3 gene models that lack a terminal stop to an in-frame stop codon.

    Command-line arguments:
        -i/--input_file        input GFF3 (required)
        -g/--genome_fasta      FASTA for the molecules, needed unless the
                               sequences are embedded in the GFF
        -o/--output_gff        optional path for the updated GFF3
        -el/--extension_limit  how many bases past the mRNA end to search
                               (default 100)

    For every mRNA whose CDS translation does not end in '*', the assembly
    sequence is scanned one codon at a time away from the CDS end until a
    stop codon (TAG, TAA, TGA) is found, the extension limit is passed, or
    the start of the molecule is reached.  When a stop is found,
    mRNA.extend_stop() is called with the new coordinate.  Progress and
    summary counts are printed to STDOUT; the GFF3 is written only when
    --output_gff was given.
    """
    parser = argparse.ArgumentParser( description='Extends GFF gene models to the first in-frame stop')

    ## command-line options
    parser.add_argument('-i', '--input_file', type=str, required=True, help='Path to the input GFF3' )
    parser.add_argument('-g', '--genome_fasta', type=str, required=False, help='Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF')
    parser.add_argument('-o', '--output_gff', type=str, required=False, help='Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop')
    parser.add_argument('-el', '--extension_limit', type=int, required=False, default=100, help='Optional.  Limits how far an extension will happen looking for an in-frame stop codon')
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id, assembly.length))
        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    # already ends in a stop; nothing to extend
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}".format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # bases left over after the last complete codon of the CDS
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print("\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t".format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos, mRNA_loc.strand), end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                # NOTE(review): this step is applied on BOTH strands, so on the
                # forward strand the first codon examined starts 3 bases past
                # the in-frame CDS end -- confirm whether skipping that codon
                # is intended.
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print("  Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos, CDS_pos - 3), end='')
                        else:
                            print(".{0}({1}-{2})".format(next_codon, CDS_pos - 3, CDS_pos), end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                mRNA.extend_stop(on=assembly, to=(CDS_pos + 3))
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(" Found a stop, extending to: {0} ({1})".format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))


    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional, so only write when a path was given
    # (open(None) would raise a TypeError); the with-block closes the file
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
def _report_orientation(transcript_id, counts):
    """Print the read tallies for one transcript and judge its orientation.

    Returns True when the reverse-complemented read-1 count (1:T) outnumbers
    the forward read-1 count (1:F), which is what an RF library is expected
    to give for a correctly oriented transcript.
    """
    print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(transcript_id, counts['1']['T'], counts['1']['F'], counts['2']['T'], counts['2']['F']))
    return counts['1']['T'] > counts['1']['F']


def main():
    """Check transcript orientation from a SAM file and optionally correct a FASTA.

    Command-line arguments:
        -s/--sam_file    SAM file of reads aligned to the reference (required)
        -fi/--fasta_in   FASTA of the sequences that were aligned against
        -fo/--fasta_out  where orientation-corrected sequences are written;
                         required whenever -fi is passed

    Alignments are tallied per reference name (SAM column 3) by read
    direction (last character of the read name) and by whether FLAG bit 16
    is set.  One tally line is printed per transcript, followed by summary
    totals.  Alignments of the same transcript must be adjacent in the file,
    because a change of reference name closes the running tally.

    Raises:
        Exception: if -fi is passed without -fo.
    """
    parser = argparse.ArgumentParser( description='Put a description of your script here')

    ## command-line options
    parser.add_argument('-s', '--sam_file', type=str, required=True, help='Input SAM file with reads aligned to reference' )
    parser.add_argument('-fi', '--fasta_in', type=str, required=False, help='Path to a FASTA file representing sequences that were aligned against.  If this is passed, you should also pass the -fo argument' )
    parser.add_argument('-fo', '--fasta_out', type=str, required=False, help='If passed along with -fi, the orientation-corrected sequences will be written here.' )
    args = parser.parse_args()
    seqs = dict()

    if args.fasta_in is not None:
        ## validate before doing the (possibly slow) FASTA load
        if args.fasta_out is None:
            raise Exception("ERROR: You must pass a value for -fo if you pass -fi")

        seqs = utils.fasta_dict_from_file(args.fasta_in)

    total_read_mappings = 0
    last_transcript_id = None
    counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0

    transcripts_to_correct = set()

    with open(args.sam_file) as sam_fh:
        for line in sam_fh:
            if line.startswith('@'): continue

            cols = line.split("\t")
            if len(cols) < 5: continue

            ## NOTE(review): read names are expected to end in '1' or '2';
            ## any other final character raises a KeyError below
            read_dir = cols[0][-1]
            transcript_id = cols[2]
            total_read_mappings += 1

            seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

            if transcript_id != last_transcript_id:
                transcript_count += 1

                if last_transcript_id is not None:
                    ## close out the previous transcript
                    if _report_orientation(last_transcript_id, counts):
                        correct_orientation_count += 1
                    else:
                        incorrect_orientation_count += 1
                        transcripts_to_correct.add(last_transcript_id)

                ## reset
                last_transcript_id = transcript_id
                counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }

            ## tally every alignment, including the first one of a transcript
            counts[read_dir][seq_revcomped] += 1

    ## the final transcript has no following line to trigger its evaluation
    if last_transcript_id is not None:
        if _report_orientation(last_transcript_id, counts):
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1
            transcripts_to_correct.add(last_transcript_id)

    if args.fasta_in is not None:
        ## the with-block guarantees the output is flushed and closed
        with open(args.fasta_out, 'w') as out_fh:
            for seq_id in seqs:
                seq = seqs[seq_id]

                if seq_id in transcripts_to_correct:
                    seq['s'] = utils.reverse_complement(seq['s'])

                out_fh.write(">{0} {2}\n{1}\n".format(seq_id, utils.wrapped_fasta(seq['s']), seq['h']))

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(incorrect_orientation_count))
# Example no. 7
# 0
def main():
    """Extend GFF3 gene models that lack a terminal stop to an in-frame stop codon.

    Command-line arguments:
        -i/--input_file        input GFF3 (required)
        -g/--genome_fasta      FASTA for the molecules, needed unless the
                               sequences are embedded in the GFF
        -o/--output_gff        optional path for the updated GFF3
        -el/--extension_limit  how many bases past the mRNA end to search
                               (default 100)

    For every mRNA whose CDS translation does not end in '*', the assembly
    sequence is scanned one codon at a time away from the CDS end until a
    stop codon (TAG, TAA, TGA) is found, the extension limit is passed, or
    the start of the molecule is reached.  When a stop is found,
    mRNA.extend_stop() is called with the new coordinate.  Progress and
    summary counts are printed to STDOUT; the GFF3 is written only when
    --output_gff was given.
    """
    parser = argparse.ArgumentParser(
        description='Extends GFF gene models to the first in-frame stop')

    ## command-line options
    parser.add_argument('-i',
                        '--input_file',
                        type=str,
                        required=True,
                        help='Path to the input GFF3')
    parser.add_argument(
        '-g',
        '--genome_fasta',
        type=str,
        required=False,
        help=
        'Optional.  You must specify this unless the FASTA sequences for the molecules are embedded in the GFF'
    )
    parser.add_argument(
        '-o',
        '--output_gff',
        type=str,
        required=False,
        help=
        'Optional.  Writes an output GFF3 file with CDS (and containing features) extended to nearest stop'
    )
    parser.add_argument(
        '-el',
        '--extension_limit',
        type=int,
        required=False,
        default=100,
        help=
        'Optional.  Limits how far an extension will happen looking for an in-frame stop codon'
    )
    args = parser.parse_args()

    (assemblies, features) = gff.get_gff3_features(args.input_file)

    # deal with the FASTA file if the user passed one
    if args.genome_fasta is not None:
        utils.add_assembly_fasta(assemblies, args.genome_fasta)

    total_mRNAs = 0
    mRNAs_with_terminal_stops = 0
    stop_codons = ['TAG', 'TAA', 'TGA']
    mRNAs_corrected = 0

    for assembly_id in sorted(assemblies):
        assembly = assemblies[assembly_id]
        print("Assembly {0} has length {1}".format(assembly_id,
                                                   assembly.length))
        for gene in sorted(assembly.genes()):
            for mRNA in gene.mRNAs():
                coding_seq = mRNA.get_CDS_residues()
                total_mRNAs += 1
                translation = utils.translate(coding_seq)

                if translation.endswith('*'):
                    # already ends in a stop; nothing to extend
                    mRNAs_with_terminal_stops += 1
                    continue

                print("gene:{1}, mRNA: {0} is missing a stop".format(
                    mRNA.id, gene.id))
                print("\tCDS: {0}".format(coding_seq))
                print("\tcoding sequence ends with {0}, last three a.a.: {1}"
                      .format(coding_seq[-3:], translation[-3:]))
                mRNA_loc = mRNA.location_on(assembly)

                CDSs = sorted(mRNA.CDSs())
                # bases left over after the last complete codon of the CDS
                CDS_frame_overhang = len(coding_seq) % 3
                print("\tCDS frame overhang: {0}".format(CDS_frame_overhang))
                codon_step_size = 3

                if mRNA_loc.strand == 1:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[-1].location_on(
                        assembly).fmax - CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmax + args.extension_limit
                else:
                    # get the in-frame end coordinate of the last CDS position
                    CDS_pos = CDSs[0].location_on(
                        assembly).fmin + CDS_frame_overhang
                    mRNA_limit = mRNA_loc.fmin - args.extension_limit
                    codon_step_size = -3

                print(
                    "\tmRNA:{0}-{1} ({3}), CDS end: {2}.  Extending ... \n\t"
                    .format(mRNA_loc.fmin, mRNA_loc.fmax, CDS_pos,
                            mRNA_loc.strand),
                    end='')

                new_stop_found = False

                # We have to step backwards to start if on the reverse strand
                # NOTE(review): this step is applied on BOTH strands, so on the
                # forward strand the first codon examined starts 3 bases past
                # the in-frame CDS end -- confirm whether skipping that codon
                # is intended.
                CDS_pos += codon_step_size

                while True:
                    if (mRNA_loc.strand == 1 and CDS_pos > mRNA_limit) or (
                            mRNA_loc.strand == -1 and CDS_pos < mRNA_limit):
                        print("  Reached the mRNA limit")
                        break
                    elif CDS_pos < 1:
                        print("  Reached beginning of the molecule")
                        break
                    else:
                        next_codon = assembly.residues[CDS_pos:CDS_pos + 3]

                        if mRNA_loc.strand == -1:
                            next_codon = utils.reverse_complement(next_codon)
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos, CDS_pos - 3),
                                  end='')
                        else:
                            print(".{0}({1}-{2})".format(
                                next_codon, CDS_pos - 3, CDS_pos),
                                  end='')

                        if next_codon in stop_codons:
                            if mRNA_loc.strand == 1:
                                mRNA.extend_stop(on=assembly,
                                                 to=(CDS_pos + 3))
                                print(" Found a stop, extending to: {0} ({1})"
                                      .format(CDS_pos + 3, mRNA_loc.strand))
                            else:
                                mRNA.extend_stop(on=assembly, to=CDS_pos)
                                print(" Found a stop, extending to: {0} ({1})"
                                      .format(CDS_pos, mRNA_loc.strand))

                            new_stop_found = True
                            break

                    CDS_pos += codon_step_size

                if new_stop_found:
                    print("\tCDS_pos: UPDATE: {0}".format(CDS_pos))
                    mRNAs_corrected += 1
                else:
                    print("\tCDS_pos:   SAME: {0}".format(CDS_pos))

    print("\nTotal mRNAs found:{0}".format(total_mRNAs))
    print("mRNAs initially with terminal stops: {0}".format(
        mRNAs_with_terminal_stops))
    print("mRNAs successfully extended: {0}".format(mRNAs_corrected))

    # --output_gff is optional, so only write when a path was given
    # (open(None) would raise a TypeError); the with-block closes the file
    if args.output_gff is not None:
        with open(args.output_gff, 'wt') as ofh:
            gff.print_gff3_from_assemblies(assemblies=assemblies, ofh=ofh)
# Example no. 8
# 0
def main():
    """Extract regions from a multi-FASTA file using a tab-delimited coordinate file.

    Command-line arguments (all column numbers are 1-based):
        -f/--fasta_file       input FASTA file (required)
        -c/--coords_file      tab-delimited file with coordinates (required)
        -m/--mol_col          column holding the molecule identifier (required)
        -x/--start_coord_col  column holding the start position (required)
        -y/--stop_coord_col   column holding the stop position (required)
        -n/--name_col         optional column holding the name of each fragment
        -o/--output_file      output path; STDOUT is used when omitted

    Lines with fewer than three tab-separated columns are skipped.  Each
    remaining line produces one FASTA record wrapped at 60 residues per line;
    the fragment is reverse-complemented when the converted strand is -1.

    Raises:
        Exception: if a molecule ID from the coordinate file is not present
            in the FASTA file.
    """
    parser = argparse.ArgumentParser(
        description='Extract regions from a multi-FASTA file')

    ## command-line options
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-c',
                        '--coords_file',
                        type=str,
                        required=True,
                        help='Path to a tab-delimited file with coordinates')
    parser.add_argument('-m',
                        '--mol_col',
                        type=int,
                        required=True,
                        help='Tabdel file column with molecule identifiers')
    parser.add_argument(
        '-x',
        '--start_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate start positions')
    parser.add_argument(
        '-y',
        '--stop_coord_col',
        type=int,
        required=True,
        help='Tabdel file column with coordinate stop positions')
    parser.add_argument(
        '-n',
        '--name_col',
        type=int,
        required=False,
        default=None,
        help='Optional tabdel file column with name for exported fragment')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    seqs = utils.fasta_dict_from_file(args.fasta_file)

    ## convert the user's 1-based column numbers into list indexes once
    start_col = args.start_coord_col - 1
    stop_col = args.stop_coord_col - 1
    mol_col = args.mol_col - 1
    name_col = None if args.name_col is None else args.name_col - 1

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        ## the with-block guarantees the coordinate file handle is closed
        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                line = line.rstrip()
                cols = line.split('\t')

                if len(cols) < 3:
                    continue

                ## presumably converts 1-based inclusive coordinates to 0-based
                ## interbase ones and derives the strand from their order --
                ## confirm against utils.humancoords_to_0interbase
                (fmin, fmax, strand) = utils.humancoords_to_0interbase(
                    int(cols[start_col]), int(cols[stop_col]))
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception(
                        "ERROR: molecule ID ({0}) not found in FASTA file".
                        format(mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format(
                        mol_id, fmin, fmax, strand)
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = utils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i:i + 60] + "\n")
    finally:
        ## close only a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()