Exemple #1
0
    def get_residues(self):
        """
        Extract this gene's residues from the molecule it is located on.

        The molecule's residues are sliced from the location's fmin to fmax.
        The slice is stored on self.residues (reverse-complemented when the
        location's strand is -1), its length is stored on self.length, and
        the residues are returned.

        Raises an Exception if the gene has no location, has more than one
        location, or if the molecule it is located on has no residues stored.
        """
        if len(self.locations) == 0:
            raise Exception(
                "ERROR: gene.get_residues() requested but gene {0} isn't located on anything."
                .format(self.id))
        elif len(self.locations) > 1:
            raise Exception(
                "ERROR: gene {0} is located on multiple molecules.  Can't automatically extract the residues."
                .format(self.id))

        loc = self.location()
        mol = loc.on

        # make sure this thing has its residues populated.  Residues that were
        # never set (None) are treated like empty ones, so the caller gets the
        # error below rather than a TypeError from len(None).
        if mol.residues is None or len(mol.residues) == 0:
            raise Exception(
                "ERROR: gene.get_residues() requested but its molecule {0} has no stored residues"
                .format(mol.id))

        self.residues = mol.residues[loc.fmin:loc.fmax]
        # length is taken before any reverse-complementing, which does not
        # change the number of residues anyway
        self.length = len(self.residues)

        if loc.strand == -1:
            self.residues = biocodeutils.reverse_complement(self.residues)

        return self.residues
Exemple #2
0
    def get_residues(self):
        """
        Extract this CDS's residues from the molecule it is located on.

        Takes the slice [fmin:fmax] of the molecule's residues, saves it on
        self.residues (reverse-complemented if the location's strand is -1),
        records the slice length on self.length and returns the residues.

        Raises an Exception when the CDS has no location, when it has more
        than one location, or when its molecule has no residues stored.
        """
        if len(self.locations) == 0:
            raise Exception(
                "ERROR: CDS.get_residues() requested but CDS {0} isn't located on anything.".format(self.id)
            )
        elif len(self.locations) > 1:
            raise Exception(
                "ERROR: CDS {0} is located on multiple molecules.  Can't automatically extract the residues.".format(
                    self.id
                )
            )

        loc = self.location()
        mol = loc.on

        # make sure this thing has its residues populated; an unset (None)
        # value is handled here too so that len() is never called on None
        if mol.residues is None or len(mol.residues) == 0:
            raise Exception(
                "ERROR: CDS.get_residues() requested but its molecule {0} has no stored residues".format(mol.id)
            )

        self.residues = mol.residues[loc.fmin : loc.fmax]
        self.length = len(self.residues)

        # reverse-strand features are reported in their own orientation
        if loc.strand == -1:
            self.residues = biocodeutils.reverse_complement(self.residues)

        return self.residues
def main():
    """
    Reverse or reverse-complement selected sequences within a multi-FASTA.

    Every sequence in --fasta_file is written back out, wrapped at 60
    characters per line, to --output_file (or STDOUT when that option is not
    given).  Sequences whose ID appears in --id_file (one ID per line) first
    have --action applied to them; all others are written unchanged.
    """
    parser = argparse.ArgumentParser(
        description=
        'Reverse or reverse-complement selected sequences within a multi-FASTA'
    )

    ## command-line options
    parser.add_argument('-f',
                        '--fasta_file',
                        type=str,
                        required=True,
                        help='Path to an input FASTA file')
    parser.add_argument('-i',
                        '--id_file',
                        type=str,
                        required=True,
                        help='Path to file with IDs to process')
    parser.add_argument(
        '-a',
        '--action',
        type=str,
        required=True,
        choices=['reverse', 'revcomp'],
        help='What should be done to the sequences in the ID file')
    parser.add_argument('-o',
                        '--output_file',
                        type=str,
                        required=False,
                        default=None,
                        help='Optional Path to an output file to be created')
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_file)

        ## the with-block closes the ID file once it has been read, and a set
        ## keeps the per-sequence membership test below cheap
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == 'reverse':
                    seq['s'] = seq['s'][::-1]
                elif args.action == 'revcomp':
                    seq['s'] = biocodeutils.reverse_complement(seq['s'])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq['s']), 60):
                fout.write(seq['s'][i:i + 60] + "\n")
    finally:
        ## only close a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """
    Reverse or reverse-complement selected sequences within a multi-FASTA.

    Loads all sequences from --fasta_file and the list of IDs (one per line)
    from --id_file.  Each sequence is written, 60 characters per line, to
    --output_file or to STDOUT; those whose ID is listed are reversed or
    reverse-complemented first, according to --action.
    """
    parser = argparse.ArgumentParser(
        description="Reverse or reverse-complement selected sequences within a multi-FASTA"
    )

    ## command-line options
    parser.add_argument("-f", "--fasta_file", type=str, required=True, help="Path to an input FASTA file")
    parser.add_argument("-i", "--id_file", type=str, required=True, help="Path to file with IDs to process")
    parser.add_argument(
        "-a",
        "--action",
        type=str,
        required=True,
        choices=["reverse", "revcomp"],
        help="What should be done to the sequences in the ID file",
    )
    parser.add_argument(
        "-o",
        "--output_file",
        type=str,
        required=False,
        default=None,
        help="Optional Path to an output file to be created",
    )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    writing_to_file = args.output_file is not None
    fout = open(args.output_file, "wt") if writing_to_file else sys.stdout

    try:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_file)

        ## read the IDs into a set (fast lookups) and release the file handle
        with open(args.id_file) as id_fh:
            ids = {line.rstrip() for line in id_fh}

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in ids:
                if args.action == "reverse":
                    seq["s"] = seq["s"][::-1]
                elif args.action == "revcomp":
                    seq["s"] = biocodeutils.reverse_complement(seq["s"])

            ## write this sequence, 60bp per line
            fout.write(">{0}\n".format(seq_id))
            for i in range(0, len(seq["s"]), 60):
                fout.write(seq["s"][i : i + 60] + "\n")
    finally:
        ## STDOUT is left open for the rest of the process
        if writing_to_file:
            fout.close()
Exemple #5
0
def main():
    """
    Extract regions from a multi-FASTA file.

    Each row of the tab-delimited --coords_file names a molecule and a
    start/stop coordinate pair; the columns holding them are given as 1-based
    column numbers (-m, -x, -y).  Rows with fewer than three columns are
    skipped.  The coordinates are converted with
    biocodeutils.humancoords_to_0interbase(), the matching slice of the
    molecule is taken, and it is reverse-complemented when the returned strand
    is -1.  Each fragment is written, 60 characters per line, to
    --output_file or STDOUT.

    The fragment ID is taken from --name_col when given, otherwise it is
    built as "<molecule>___<fmin>.<fmax>.<strand>".

    Raises an Exception if a row references a molecule ID that is not in the
    FASTA file.
    """
    parser = argparse.ArgumentParser( description='Extract regions from a multi-FASTA file')

    ## command-line options
    parser.add_argument('-f', '--fasta_file', type=str, required=True, help='Path to an input FASTA file' )
    parser.add_argument('-c', '--coords_file', type=str, required=True, help='Path to a tab-delimited file with coordinates' )
    parser.add_argument('-m', '--mol_col', type=int, required=True, help='Tabdel file column with molecule identifiers' )
    parser.add_argument('-x', '--start_coord_col', type=int, required=True, help='Tabdel file column with coordinate start positions' )
    parser.add_argument('-y', '--stop_coord_col', type=int, required=True, help='Tabdel file column with coordinate stop positions' )
    parser.add_argument('-n', '--name_col', type=int, required=False, default=None, help='Optional tabdel file column with name for exported fragment' )
    parser.add_argument('-o', '--output_file', type=str, required=False, default=None, help='Optional Path to an output file to be created' )
    args = parser.parse_args()

    ## output will either be a file or STDOUT
    fout = sys.stdout
    if args.output_file is not None:
        fout = open(args.output_file, 'wt')

    try:
        seqs = biocodeutils.fasta_dict_from_file( args.fasta_file )

        ## user-supplied column numbers are 1-based; convert once, up front
        start_col = args.start_coord_col - 1
        stop_col  = args.stop_coord_col - 1
        mol_col   = args.mol_col - 1
        name_col  = None if args.name_col is None else args.name_col - 1

        ## the with-block guarantees the coordinates file is closed, even if a
        ## row triggers an error part-way through
        with open(args.coords_file) as coords_fh:
            for line in coords_fh:
                line = line.rstrip()
                cols = line.split('\t')

                if len(cols) < 3:
                    continue

                (fmin, fmax, strand) = biocodeutils.humancoords_to_0interbase( int(cols[start_col]), int(cols[stop_col]) )
                mol_id = cols[mol_col]

                if mol_id not in seqs:
                    raise Exception("ERROR: molecule ID ({0}) not found in FASTA file".format(mol_id))

                seq = seqs[mol_id]['s'][fmin:fmax]

                if name_col is None:
                    seq_id = "{0}___{1}.{2}.{3}".format( mol_id, fmin, fmax, strand  )
                else:
                    seq_id = cols[name_col]

                if strand == -1:
                    seq = biocodeutils.reverse_complement(seq)

                ## write this sequence, 60bp per line
                fout.write(">{0}\n".format(seq_id))
                for i in range(0, len(seq), 60):
                    fout.write(seq[i : i + 60] + "\n")
    finally:
        ## only close a file we opened ourselves, never STDOUT
        if fout is not sys.stdout:
            fout.close()
def main():
    """
    Tally read orientation per transcript from a SAM file and optionally write
    orientation-corrected reference sequences.

    The SAM file is read line by line; header lines (starting with '@') and
    lines with fewer than five columns are skipped.  Alignments are grouped by
    consecutive runs of the same reference name (column 3), so the input is
    expected to be grouped by reference.  For each alignment the last
    character of the read name selects the mate ('1' or '2') and bit 16 of the
    FLAG column says whether the read was reverse-complemented.

    For every transcript a line of counts is printed.  A transcript is
    considered correctly oriented when its mate-1 reverse-complemented count
    exceeds its mate-1 forward count; otherwise it is marked for correction.
    A summary is printed at the end.

    If -fi is given, -fo must be given too: every sequence from the input
    FASTA is written to the output FASTA, reverse-complemented when its ID was
    marked for correction.

    Raises an Exception when -fi is passed without -fo.
    """
    parser = argparse.ArgumentParser( description='Tally read orientation per transcript from a SAM file and optionally write orientation-corrected sequences')

    ## command-line options
    parser.add_argument('-s', '--sam_file', type=str, required=True, help='Input SAM file with reads aligned to reference' )
    parser.add_argument('-fi', '--fasta_in', type=str, required=False, help='Path to a FASTA file representing sequences that were aligned against.  If this is passed, you should also pass the -fo argument' )
    parser.add_argument('-fo', '--fasta_out', type=str, required=False, help='If passed along with -fi, the orientation-corrected sequences will be written here.' )
    args = parser.parse_args()
    seqs = dict()
    out_fh = None

    if args.fasta_in is not None:
        seqs = biocodeutils.fasta_dict_from_file( args.fasta_in )

        if args.fasta_out is not None:
            out_fh = open(args.fasta_out, 'w')
        else:
            raise Exception("ERROR: You must pass a value for -fo if you pass -fi")

    total_read_mappings = 0
    last_transcript_id = None
    counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0

    transcripts_to_correct = dict()

    def finish_transcript(finished_id, tally):
        ## Classify one completed transcript, update the running totals and
        ## print its counts.  Used both when the reference name changes and
        ## once more after the last line of the file.
        nonlocal correct_orientation_count, incorrect_orientation_count

        ## Given an RF library, the 1:T count should outnumber the 1:F one
        if tally['1']['T'] > tally['1']['F']:
            correct_orientation_count += 1
        else:
            incorrect_orientation_count += 1
            transcripts_to_correct[finished_id] = 1

        ## report counts
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(finished_id, tally['1']['T'], tally['1']['F'], tally['2']['T'], tally['2']['F']))

    try:
        with open(args.sam_file) as sam_fh:
            for line in sam_fh:
                if line.startswith('@'): continue

                cols = line.split("\t")
                if len(cols) < 5: continue

                read_dir = cols[0][-1]
                transcript_id = cols[2]
                total_read_mappings += 1

                flag = cols[1]
                if int(flag) & 16:
                    seq_revcomped = 'T'
                else:
                    seq_revcomped = 'F'

                if transcript_id != last_transcript_id:
                    transcript_count += 1

                    if last_transcript_id is not None:
                        finish_transcript(last_transcript_id, counts)

                    ## reset
                    last_transcript_id = transcript_id
                    counts = { '1':{'T':0,'F':0}, '2':{'T':0,'F':0} }

                ## counted for every alignment, including the first one seen
                ## for a transcript
                counts[read_dir][seq_revcomped] += 1

        ## the final transcript has no following line to trigger its report
        if last_transcript_id is not None:
            finish_transcript(last_transcript_id, counts)

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in transcripts_to_correct:
                seq['s'] = biocodeutils.reverse_complement(seq['s'])

            out_fh.write(">{0} {2}\n{1}\n".format(seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))
    finally:
        if out_fh is not None:
            out_fh.close()

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(incorrect_orientation_count))
def main():
    """
    Tally read orientation per transcript from a SAM file and optionally write
    orientation-corrected reference sequences.

    Header lines ('@' prefix) and lines with fewer than five tab-separated
    columns are ignored.  Consecutive alignments that share a reference name
    (column 3) are treated as one transcript, so the SAM file is expected to
    be grouped by reference.  The mate ('1' or '2') is taken from the last
    character of the read name, and bit 16 of the FLAG column indicates a
    reverse-complemented read.

    One line of counts is printed per transcript.  A transcript counts as
    correctly oriented when mate-1 reverse-complemented alignments outnumber
    mate-1 forward alignments; all other transcripts are marked for
    correction.  Totals are printed at the end.

    When -fi is given, -fo is required: all sequences of the input FASTA are
    written to the output FASTA, with the marked ones reverse-complemented.

    Raises an Exception when -fi is passed without -fo.
    """
    parser = argparse.ArgumentParser(
        description=
        'Tally read orientation per transcript from a SAM file and optionally write orientation-corrected sequences'
    )

    ## command-line options
    parser.add_argument('-s',
                        '--sam_file',
                        type=str,
                        required=True,
                        help='Input SAM file with reads aligned to reference')
    parser.add_argument(
        '-fi',
        '--fasta_in',
        type=str,
        required=False,
        help=
        'Path to a FASTA file representing sequences that were aligned against.  If this is passed, you should also pass the -fo argument'
    )
    parser.add_argument(
        '-fo',
        '--fasta_out',
        type=str,
        required=False,
        help=
        'If passed along with -fi, the orientation-corrected sequences will be written here.'
    )
    args = parser.parse_args()
    seqs = dict()
    out_fh = None

    if args.fasta_in is not None:
        seqs = biocodeutils.fasta_dict_from_file(args.fasta_in)

        if args.fasta_out is not None:
            out_fh = open(args.fasta_out, 'w')
        else:
            raise Exception(
                "ERROR: You must pass a value for -fo if you pass -fi")

    total_read_mappings = 0
    last_transcript_id = None
    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}
    transcript_count = 0
    correct_orientation_count = 0
    incorrect_orientation_count = 0

    transcripts_to_correct = dict()

    def report_and_classify(finished_id, tally):
        ## Print the counts for one completed transcript and return True when
        ## its orientation looks correct.
        ## Given an RF library, the 1:T count should outnumber the 1:F one
        print("{0}\t1-T:{1}\t1-F:{2}\t2-T:{3}\t2-F:{4}".format(
            finished_id, tally['1']['T'], tally['1']['F'],
            tally['2']['T'], tally['2']['F']))
        return tally['1']['T'] > tally['1']['F']

    try:
        with open(args.sam_file) as sam_fh:
            for line in sam_fh:
                if line.startswith('@'): continue

                cols = line.split("\t")
                if len(cols) < 5: continue

                read_dir = cols[0][-1]
                transcript_id = cols[2]
                total_read_mappings += 1

                seq_revcomped = 'T' if int(cols[1]) & 16 else 'F'

                if transcript_id != last_transcript_id:
                    transcript_count += 1

                    if last_transcript_id is not None:
                        if report_and_classify(last_transcript_id, counts):
                            correct_orientation_count += 1
                        else:
                            incorrect_orientation_count += 1
                            transcripts_to_correct[last_transcript_id] = 1

                    ## reset
                    last_transcript_id = transcript_id
                    counts = {'1': {'T': 0, 'F': 0}, '2': {'T': 0, 'F': 0}}

                ## every alignment is tallied, the first of a transcript too
                counts[read_dir][seq_revcomped] += 1

        ## nothing follows the last transcript in the file, so it has to be
        ## classified and reported here
        if last_transcript_id is not None:
            if report_and_classify(last_transcript_id, counts):
                correct_orientation_count += 1
            else:
                incorrect_orientation_count += 1
                transcripts_to_correct[last_transcript_id] = 1

        for seq_id in seqs:
            seq = seqs[seq_id]

            if seq_id in transcripts_to_correct:
                seq['s'] = biocodeutils.reverse_complement(seq['s'])

            out_fh.write(">{0} {2}\n{1}\n".format(
                seq_id, biocodeutils.wrapped_fasta(seq['s']), seq['h']))
    finally:
        if out_fh is not None:
            out_fh.close()

    print("Total transcripts: {0}".format(transcript_count))
    print("Total reads mapped: {0}".format(total_read_mappings))
    print("Transcripts in correct orientation: {0}".format(
        correct_orientation_count))
    print("Transcripts in reverse orientation: {0}".format(
        incorrect_orientation_count))