Example #1
0
def segmentWithCpG(infile, with_contig_sizes=False):
    '''Locate every CpG dinucleotide in a fasta file.

    Arguments
    ---------
    infile : file-like
        Fasta input; it is handed to ``FastaIterator.FastaIterator``.
    with_contig_sizes : bool
        If set, the contig extents are returned as well.

    Returns
    -------
    segments : list
        One tuple ``(contig, start, end, 1.0)`` per ``CG`` dinucleotide,
        where ``start`` is the 0-based position of the ``C`` and
        ``end = start + 2``. Matching is case-insensitive.
    contig_sizes : collections.OrderedDict
        Only returned if `with_contig_sizes` is set. Maps each contig
        name to ``(0, sequence_length)`` in input order.
    '''

    ninput, nskipped = 0, 0
    segments, contig_sizes = [], collections.OrderedDict()

    for cur_record in FastaIterator.FastaIterator(infile):
        ninput += 1
        # the contig name is the title up to the first whitespace
        contig = re.sub(r"\s.*", "", cur_record.title)
        contig_sizes[contig] = (0, len(cur_record.sequence))

        # "CG" cannot overlap itself, so consecutive find() calls report
        # exactly the positions a character-by-character scan would.
        sequence = cur_record.sequence.upper()
        start = sequence.find("CG")
        while start >= 0:
            segments.append((contig, start, start + 2, 1.0))
            start = sequence.find("CG", start + 2)

    # report the number of segments found (nothing is ever skipped here)
    noutput = len(segments)
    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    if with_contig_sizes:
        return segments, contig_sizes

    return segments
Example #2
0
def segmentUngapped(infile, gap_char, min_gap_size=0):
    """Yield the ungapped stretches of every contig in a fasta file.

    Arguments
    ---------
    infile : file-like
        Fasta input; it is handed to ``FastaIterator.FastaIterator``.
    gap_char :
        Gap character(s), passed through to ``gapped_regions``.
    min_gap_size : int
        Gaps shorter than this are ignored, i.e. they do not split an
        ungapped stretch.

    Yields
    ------
    tuple
        ``(contig, start, end, 0)`` for each stretch between two
        qualifying gaps, including the stretch before the first gap and
        the one after the last gap.
    """

    for cur_record in FastaIterator.FastaIterator(infile):
        if cur_record is None:
            break

        # the contig name is the title up to the first whitespace
        contig = re.sub(r"\s.*", "", cur_record.title)
        size = len(cur_record.sequence)

        # assumes gapped_regions() reports (start, end) pairs in
        # ascending order -- TODO confirm against its definition
        last_end = 0
        for start, end in gapped_regions(cur_record.sequence, gap_char):
            if end - start < min_gap_size:
                continue

            # Emit the stretch preceding this gap unless it is empty.
            # Testing "start > last_end" (rather than "last_end != 0")
            # keeps the leading stretch of contigs that do not start
            # with a gap.
            if start > last_end:
                yield (contig, last_end, start, 0)
            last_end = end

        # trailing stretch after the last gap (or the whole contig if
        # there was no qualifying gap)
        if last_end < size:
            yield (contig, last_end, size, 0)
Example #3
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the fasta file named by ``options.stdin``, keeps the records
    whose title matches the tRNA naming pattern and is not flagged as
    ``pseudo``, and writes them to ``options.stdout`` with a ``cca``
    tail appended to each sequence.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    # args is not used any further in this function
    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    # This is a temp fix because bedtools getfasta --name seems to have
    # changed the way it names the fasta titles. Parentheses are removed
    # from the title before matching.
    # group 1: name up to the optional "pseudo" tag, group 3: strand char
    title_rx = re.compile(r"(chr\d+.tRNA\d+-\S+-(pseudo)?)::\S+([+|-])")

    # name -> sequence, in input order; a repeated name keeps its first
    # position but takes the last sequence seen
    d = collections.OrderedDict()

    with IOTools.open_file(options.stdin.name) as infile:
        for cur_record in FastaIterator.FastaIterator(infile):
            title = cur_record.title.replace("(", "").replace(")", "")
            m = title_rx.match(title)

            # skip titles that do not follow the naming scheme as well
            # as pseudo tRNAs
            if m is None or m.group(2) == "pseudo":
                continue

            d[m.group(1) + m.group(3)] = cur_record.sequence

    for key, value in d.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (key, value))

    E.stop()
Example #4
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Concatenates the sequences of several fasta files record by record:
    the n-th output record carries the title of the n-th record of the
    first file and the joined (space-stripped) n-th sequences of all
    files.

    Raises
    ------
    ValueError
        If fewer than two filenames are given or if the files do not
        contain the same number of records.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fastas2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    # hand argv on so that an explicitly supplied argument list is honoured
    (options, args) = E.start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    ninput, noutput, nerrors = 0, 0, 0

    infiles = []
    try:
        for filename in args:
            infiles.append(iotools.open_file(filename, "r"))
        iterators = [FastaIterator.FastaIterator(f) for f in infiles]

        while True:
            # Poll *every* file in each round. Stopping at the first
            # exhausted file would hide the case where the first file is
            # shorter than the others.
            records = []
            for iterator in iterators:
                record = next(iterator, None)
                if record is not None:
                    records.append(record)

            if not records:
                break
            ninput += 1

            if len(records) != len(iterators):
                raise ValueError("unequal number of sequences in files")

            noutput += 1

            options.stdout.write(">%s\n%s\n" % (
                records[0].title,
                "".join(r.sequence.replace(" ", "") for r in records)))
    finally:
        for f in infiles:
            f.close()

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
Example #5
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the fasta file named by ``options.stdin``, drops every record
    whose title contains ``pseudo`` and writes the remaining records to
    ``options.stdout`` with a ``cca`` tail appended to each sequence.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    # args is not used any further in this function
    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    # title -> sequence, in input order; a repeated title keeps its
    # first position but takes the last sequence seen
    d = collections.OrderedDict()

    with IOTools.open_file(options.stdin.name) as infile:
        for cur_record in FastaIterator.FastaIterator(infile):
            # Remove any pseudo sequences
            if "pseudo" in cur_record.title:
                continue
            d[cur_record.title] = cur_record.sequence

    for key, value in d.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (key, value))

    E.stop()
Example #6
0
def segmentGaps(infile, gap_char):
    """Yield the gapped regions of every contig in a fasta file.

    Arguments
    ---------
    infile : file-like
        Fasta input; it is handed to ``FastaIterator.FastaIterator``.
    gap_char :
        Gap character(s), passed through to ``gapped_regions``.

    Yields
    ------
    tuple
        ``(contig, start, end, 0)`` for each region reported by
        ``gapped_regions``.
    """

    for cur_record in FastaIterator.FastaIterator(infile):
        if cur_record is None:
            break

        # the contig name is the title up to the first whitespace
        contig = re.sub(r"\s.*", "", cur_record.title)

        for start, end in gapped_regions(cur_record.sequence, gap_char):
            yield (contig, start, end, 0)
Example #7
0
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """Tile every contig with fixed-width windows and compute their G+C content.

    Arguments
    ---------
    infile : file-like
        Fasta input; it is handed to ``FastaIterator.FastaIterator``.
    window_size : int
        Width of each window.
    window_shift : int
        Currently ignored: windows are always shifted by `window_size`.

    Returns
    -------
    list
        Tuples ``(contig, start, start + window_size, gc_fraction)``
        where ``gc_fraction`` is ``GC / (GC + AT)`` within the window.
        Windows in which more than half of the positions are neither
        G, C, A nor T are omitted.
    """

    ninput, nskipped, noutput = 0, 0, 0

    # NOTE(review): the caller-supplied shift is overridden, so windows
    # never overlap. Kept as is because honouring the parameter would
    # change the output for existing callers -- confirm the intent.
    window_shift = window_size
    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    for cur_record in FastaIterator.FastaIterator(infile):
        if cur_record is None:
            break
        # count only records that were actually read
        ninput += 1

        # the contig name is the title up to the first whitespace
        contig = re.sub(r"\s.*", "", cur_record.title)
        seq = cur_record.sequence

        for x in range(0, len(seq), window_shift):
            s = seq[x:x + window_size].upper()
            gc = s.count("G") + s.count("C")
            at = s.count("A") + s.count("T")

            # skip segments containing mostly gaps; positions missing
            # from a short final window count as gaps, too
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            # the reported end is not clipped to the contig length
            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))

        # noutput counts processed contigs, not windows
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
Example #8
0
def main(argv=None):
    """Write a table of sequence properties for a fasta file on stdin.

    One row is written per fasta record: the identifier extracted with
    ``--regex-identifier`` followed by the fields of every requested
    ``--section``. With ``--add-total`` a final ``total`` row holding
    the accumulated properties is appended.

    Raises
    ------
    ValueError
        For an empty sequence, a title the identifier regex does not
        match, an unknown section or an unknown sequence type.
    """

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type=str,
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_argument("-s",
                        "--section",
                        dest="sections",
                        nargs="*",
                        type=str,
                        choices=("length", "sequence", "hid", "na", "aa",
                                 "cpg", "dn", "degeneracy", "gaps", "codons",
                                 "codon-usage", "codon-translator",
                                 "codon-bias"),
                        help="which sections to output ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type=str,
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids .")

    parser.add_argument(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type=str,
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_argument(
        "--split-fasta-identifier",
        dest="split_id",
        action="store_true",
        help="split fasta description line (starting >) and use "
        "only text before first space")

    parser.add_argument(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    args = E.start(parser, argv=argv)

    rx = re.compile(args.regex_identifier)

    reference_codons = []
    if args.filename_weights:
        args.filename_weights = args.filename_weights.split(",")
        for filename in args.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    iotools.ReadMap(iotools.open_file(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        args.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x, a in enumerate(reference_codons):
            for y, b in enumerate(reference_codons):
                if x == y:
                    continue
                # calculate KL distance over all non-stop codons
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                args.stdlog.write(
                    "# tablediff\t%s\t%s\t%f\n" %
                    (args.filename_weights[x], args.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(args.stdin)

    # Section name -> factory for a fresh counter. Lambdas keep the
    # attribute lookup lazy, so a counter class is only touched when
    # its section is actually requested.
    common_counters = {
        "length": lambda: SequenceProperties.SequencePropertiesLength(),
        "sequence": lambda: SequenceProperties.SequencePropertiesSequence(),
        "hid": lambda: SequenceProperties.SequencePropertiesHid(),
    }

    na_counters = dict(common_counters)
    na_counters.update({
        "na": lambda: SequenceProperties.SequencePropertiesNA(),
        "gaps": lambda: SequenceProperties.SequencePropertiesGaps(
            args.gap_chars),
        "cpg": lambda: SequenceProperties.SequencePropertiesCpg(),
        "dn": lambda: SequenceProperties.SequencePropertiesDN(),
        # these sections requires sequence length to be a multiple of 3
        "aa": lambda: SequenceProperties.SequencePropertiesAA(),
        "degeneracy": lambda: SequenceProperties.SequencePropertiesDegeneracy(),
        "codon-bias": lambda: SequenceProperties.SequencePropertiesBias(
            reference_codons),
        "codons": lambda: SequenceProperties.SequencePropertiesCodons(),
        "codon-usage": lambda: SequenceProperties.SequencePropertiesCodonUsage(),
        "codon-translator":
        lambda: SequenceProperties.SequencePropertiesCodonTranslator(),
    })

    aa_counters = dict(common_counters)
    aa_counters["aa"] = lambda: SequenceProperties.SequencePropertiesAminoAcids()

    counter_factories = {"na": na_counters, "aa": aa_counters}

    def getCounter(section):
        """Return a new counter for *section* under the chosen sequence type."""
        if args.seqtype not in counter_factories:
            raise ValueError("unknown sequence type %s" % args.seqtype)
        factories = counter_factories[args.seqtype]
        if section not in factories:
            raise ValueError("unknown section %s" % section)
        return factories[section]()

    # setup totals
    totals = {}
    for section in args.sections:
        totals[section] = getCounter(section)

    args.stdout.write("id")
    for section in args.sections:
        args.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    args.stdout.write("\n")
    args.stdout.flush()

    # NOTE(review): the result of this call is never used; kept in case
    # loading a sequence has side effects elsewhere -- confirm.
    warmup = getCounter("hid")
    warmup.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = cur_record.sequence.replace(" ", "").upper()

        if not sequence:
            raise ValueError("empty sequence %s" % cur_record.title)

        match = rx.search(cur_record.title)
        if match is None:
            # fail with a readable message instead of an AttributeError
            raise ValueError(
                "regular expression '%s' does not match fasta title '%s'" %
                (args.regex_identifier, cur_record.title))
        identifier = match.groups()[0]

        if args.split_id is True:
            args.stdout.write("%s" % identifier.split()[0])
        else:
            args.stdout.write("%s" % identifier)
        args.stdout.flush()

        for section in args.sections:
            counter = getCounter(section)
            counter.loadSequence(sequence, args.seqtype)
            totals[section].addProperties(counter)

            args.stdout.write("\t" + "\t".join(counter.getFields()))

        args.stdout.write("\n")

    if args.add_total:
        args.stdout.write("total")
        for section in args.sections:
            args.stdout.write("\t" + "\t".join(totals[section].getFields()))
        args.stdout.write("\n")

    E.stop()
Example #9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the fasta file named by ``options.stdin`` and writes, for each
    tRNA fragment type and each record, one tab-separated line
    ``chrom, start, end, fragment, score, strand`` to the file named by
    ``options.stdout``. Lines are grouped by fragment type.

    Raises
    ------
    ValueError
        If a fasta title does not follow the expected
        ``clusterN:chr...tRNAn-<group>-(<strand>)`` layout.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    # NOTE(review): this option is parsed but not consulted below; all
    # fragment types are always written -- confirm the intent.
    parser.add_option(
        "--trna-scheme",
        dest="trna_scheme",
        type="choice",
        choices=("tDR-5'", "tRH-DA"),
        help="name of the tRNA scheme to make bed file for[default=%default]")

    parser.set_defaults(trna_scheme=None)

    (options, args) = E.start(parser, argv=argv)

    # args is not used any further in this function
    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    # fragment name, start, end -- in output order
    fragments = (
        ("tRH-5'", "1", "33"),
        ("tRH-DA", "14", "43"),
        ("tRH-DTA", "17", "54"),
        ("tRH-AT", "38", "69"),
        ("tRH-3'", "43", "73"),
        ("tRF-5'", "1", "15"),
        ("tRF-3'", "58", "73"),
        ("tRF-D", "8", "23"),
        ("tRF-DA", "20", "35"),
        ("tRF-A", "27", "42"),
        ("tRF-AT", "33", "53"),
        ("tRF-T", "45", "71"),
    )

    # group 1: cluster, group 2: tRNA group, group 3: strand
    title_rx = re.compile(r"(cluster\d+):chr\S+.tRNA\d+-(\S+)-\((\S+)\)")

    # Parse the fasta file once. Only the derived chrom name and the
    # strand are needed, so there is no reason to re-read the input for
    # every fragment type.
    records = []
    with IOTools.open_file(options.stdin.name) as infile:
        for cur_record in FastaIterator.FastaIterator(infile):
            m = title_rx.match(cur_record.title)
            if m is None:
                raise ValueError(
                    "unexpected fasta title: %s" % cur_record.title)
            records.append((m.group(1) + ":" + m.group(2) + "-", m.group(3)))

    score = "."
    with IOTools.open_file(options.stdout.name, "w") as outfile:
        for trna, start, end in fragments:
            for chrom, strand in records:
                # NOTE(review): looks like leftover debugging output --
                # confirm before removing.
                print(trna)
                outfile.write(("%s\t%s\t%s\t%s\t%s\t%s\n") %
                              (chrom, start, end, trna, score, strand))

    E.stop()
Example #10
0
def _write_simulated_reads(sequence, n_reads, read_id, qual, args, outf1,
                           outf2):
    """Simulate *n_reads* reads from *sequence* and write them as fastq.

    Read names are ``@<read_id>_<i>/1`` (and ``/2`` for the mate, which
    goes to *outf2* when ``args.paired`` is set).
    """
    for i in range(n_reads):
        read = generateRead(entry=sequence,
                            read_length=args.read_length,
                            error_rate=args.phred,
                            paired=args.paired,
                            insert_mean=args.insert_mean,
                            insert_sd=args.insert_sd)

        if args.paired:
            r1, r2 = read
            h1 = "@%s_%i/1" % (read_id, i)
            h2 = "@%s_%i/2" % (read_id, i)
            outf1.write("\n".join((h1, r1, "+", qual)) + "\n")
            outf2.write("\n".join((h2, r2, "+", qual)) + "\n")
        else:
            h = "@%s_%i/1" % (read_id, i)
            outf1.write("\n".join((h, read, "+", qual)) + "\n")


def main(argv=None):
    """Simulate fastq reads from the fasta entries read from stdin.

    parses command line options in sys.argv, unless *argv* is given.

    For every sufficiently long fasta entry a number of reads is drawn
    (directly, or derived from a number of copies) and written to stdout
    (first mates) and ``--output-fastq2`` (second mates, paired mode).
    Optionally a fraction of reads is simulated from a matching pre-mRNA
    fasta file. Per-entry read counts and TPM values are written to
    ``--output-counts`` if given.

    Raises
    ------
    ValueError
        If required companion options are missing or the mRNA and
        pre-mRNA entries do not line up.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--output-quality-format",
                        dest="q_format",
                        type=int,
                        help="sequence quality format, e.g 33 = +33/Sanger")

    parser.add_argument("--output-paired-end",
                        dest="paired",
                        action="store_true",
                        help="generate paired end reads")

    parser.add_argument("--insert-length-mean",
                        dest="insert_mean",
                        type=float,
                        help="mean insert length.")

    parser.add_argument("--insert-length-sd",
                        dest="insert_sd",
                        type=float,
                        help="insert length standard deviation.")

    parser.add_argument(
        "--counts-method",
        dest="counts_method",
        type=str,
        choices=("reads", "copies"),
        help="simulate a ground truth number of reads per entry or "
        "copies per entry.")

    parser.add_argument(
        "--counts-min",
        dest="counts_min",
        type=float,
        help="minimum number of reads/read pairs per fasta entry "
        "or copies per entry.")

    parser.add_argument(
        "--counts-max",
        dest="counts_max",
        type=float,
        help="maximum number of reads/read pairs per fasta entry "
        "or copies per entry.")

    parser.add_argument("--output-read-length",
                        dest="read_length",
                        type=int,
                        help="read length.")

    parser.add_argument("--sequence-error-phred",
                        dest="phred",
                        type=int,
                        help="phred quality score.")

    parser.add_argument("--output-counts",
                        dest="output_counts",
                        type=str,
                        help="name for counts outfile.")

    parser.add_argument("--output-fastq2",
                        dest="fastq2_out",
                        type=str,
                        help="filename for second fastq outfile.")

    parser.add_argument("--premrna-fraction",
                        dest="premrna_fraction",
                        type=float,
                        help="the fraction of reads to simulate from pre-mRNA")

    parser.add_argument("--infile-premrna-fasta",
                        dest="premrna_fasta",
                        type=str,
                        help="filename for pre-mRNA fasta.")

    parser.set_defaults(q_format=33,
                        paired=False,
                        insert_mean=0,
                        insert_sd=1,
                        counts_method="reads",
                        counts_min=1,
                        counts_max=1,
                        read_length=50,
                        fastq2_out=None,
                        output_counts=None,
                        phred=30,
                        premrna_fraction=0,
                        premrna_fasta=None)

    # hand argv on so that an explicitly supplied argument list is honoured
    args = E.start(parser, argv=argv)

    # Validate with exceptions rather than assert, which is stripped
    # under "python -O". Check everything before any file is opened.
    if args.paired and not args.fastq2_out:
        raise ValueError("must specify a second fastq outfile for "
                         "paired end (--output-fastq2)")

    if args.premrna_fraction and not args.premrna_fasta:
        raise ValueError("must specify the location of the "
                         "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    qual = chr(args.q_format + args.phred) * args.read_length

    # length of template covered by one read / read pair
    if args.paired:
        fragment_length = (2 * args.read_length) + args.insert_mean
    else:
        fragment_length = args.read_length

    # set a cut off of twice the read/pair length for short entries
    minimum_entry_length = 2 * fragment_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    outf2 = None
    premrna_file = None
    try:
        if args.paired:
            outf2 = iotools.open_file(args.fastq2_out, "w")

        if args.premrna_fraction:
            premrna_file = iotools.open_file(args.premrna_fasta)
            iterator = FastaIterator.iterate_together(args.stdin,
                                                      premrna_file)
        else:
            iterator = FastaIterator.FastaIterator(args.stdin)

        for f_entry in iterator:

            if args.premrna_fraction:
                entry, pre_entry = f_entry[0], f_entry[1]
                if getTitle(entry) != getTitle(pre_entry):
                    raise ValueError("entry ids do not match: %s != %s" %
                                     (entry.title, pre_entry.title))
            else:
                entry, pre_entry = f_entry, None

            # reject short fasta entries
            if len(entry.sequence) < minimum_entry_length:
                E.info("skipping short transcript: %s length=%i" %
                       (entry.title, len(entry.sequence)))
                c['skipped'] += 1
                continue

            c['not_skipped'] += 1

            reads_per_entry = float(len(entry.sequence)) / fragment_length
            if args.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)

            if args.counts_method == "reads":
                # The count options are parsed as floats but an integer
                # range is needed here.
                # NOTE(review): the "+ 1" is only correct if the upper
                # bound of this randint is exclusive (numpy style). The
                # standard library's random.randint includes it --
                # confirm which module ``random`` refers to.
                n_reads = random.randint(int(args.counts_min),
                                         int(args.counts_max) + 1)

                n_copies = float(n_reads) / reads_per_entry

                if args.premrna_fraction:
                    n_reads_pre = int(round(n_reads * args.premrna_fraction))
                    # was left undefined in this branch although it is
                    # accumulated below
                    n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

            elif args.counts_method == "copies":

                # random float [0-1]
                rand = np.random.random_sample()
                n_copies = (args.counts_min +
                            (rand * (args.counts_max - args.counts_min)))

                n_reads = int(round(n_copies * reads_per_entry, 0))

                # as n_reads must be rounded to int, need to redefine
                # n_copies
                n_copies = float(n_reads) / reads_per_entry

                if args.premrna_fraction:
                    n_copies_pre = n_copies * args.premrna_fraction
                    n_reads_pre = int(
                        round(n_copies_pre * reads_per_pre_entry, 0))
                    # as n_reads_pre must be rounded to int, need to
                    # redefine n_copies_pre
                    n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

            entry_id = getTitle(entry)

            counts[entry_id] = n_reads
            copies[entry_id] = n_copies

            sequence = entry.sequence.upper()
            if "N" in sequence:
                E.warn("fasta entry %s contains unknown bases ('N')" %
                       entry_id)

            _write_simulated_reads(sequence, n_reads, entry_id, qual, args,
                                   args.stdout, outf2)

            if args.premrna_fraction:
                c['pre_counts'] += n_reads_pre
                c['pre_copies'] += n_copies_pre

                _write_simulated_reads(pre_entry.sequence.upper(),
                                       n_reads_pre,
                                       "%s_pre-mRNA" % entry_id, qual, args,
                                       args.stdout, outf2)
    finally:
        if outf2 is not None:
            outf2.close()
        if premrna_file is not None:
            premrna_file.close()

    sum_copies = sum(copies.values())
    sum_counts = sum(counts.values())

    # the counts table is optional: --output-counts defaults to None
    if args.output_counts:
        with iotools.open_file(args.output_counts, "w") as counts_out:

            counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

            for entry_id, count in counts.items():
                # guard against all entries having zero copies
                if sum_copies:
                    tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
                else:
                    tpm = 0.0
                counts_out.write("%s\n" %
                                 "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped" %
           (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" %
           (sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
            sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.stop()
Example #11
0
def main(argv=None):
    """Apply a chain of transformations to every record of a fasta file.

    The methods given with ``--method`` are applied to each sequence in
    the order in which they appear on the command line.  Records can be
    dropped beforehand by identifier pattern, by random sampling or by
    an identifier list, and afterwards by minimum/maximum length.  The
    surviving records are written to ``args.stdout``.

    The first positional (unknown) argument, if any, is taken as the
    input fasta file; otherwise ``-`` is used.

    Note that *argv* is only defaulted here; it is not forwarded to
    ``E.start``.

    Raises
    ------
    ValueError
        On inconsistent input, e.g. sequence lengths not divisible by
        three, mismatching titles in a companion file, a companion file
        that runs out of records, or duplicate ids in ``build-map``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="methods",
        type=str,
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "map-identifier", "nop", "remove-stops", "upper", "lower",
                 "reverse-complement", "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_argument("-p",
                        "--parameters",
                        dest="parameters",
                        type=str,
                        help="parameter stack for methods that require one ")

    parser.add_argument("-x",
                        "--ignore-errors",
                        dest="ignore_errors",
                        action="store_true",
                        help="ignore errors.")

    parser.add_argument("--sample-proportion",
                        dest="sample_proportion",
                        type=float,
                        help="sample proportion.")

    parser.add_argument(
        "--exclude-pattern",
        dest="exclude_pattern",
        type=str,
        help="exclude all sequences with ids matching pattern ")

    parser.add_argument(
        "--include-pattern",
        dest="include_pattern",
        type=str,
        help="include only sequences with ids matching pattern ")

    parser.add_argument("--filter-method",
                        dest="filter_methods",
                        type=str,
                        action="append",
                        help="filtering methods to apply ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="type",
        type=str,
        choices=("aa", "na"),
        help="sequence type (aa or na) . This option determines "
        "which characters to use for masking.")

    parser.add_argument(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type=str,
        help="template for numerical identifier"
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.add_argument(
        "--map-tsv-file",
        dest="map_tsv_file",
        type=str,
        help=
        "input filename with map for identifiers. The first row is a header")

    parser.add_argument("--fold-width",
                        dest="fold_width",
                        type=int,
                        help="fold width for sequence output. 0 is unfolded ")

    parser.set_defaults(methods=[],
                        parameters="",
                        type="na",
                        aa_mask_chars="xX",
                        aa_mask_char="x",
                        na_mask_chars="nN",
                        na_mask_char="n",
                        gap_chars="-.",
                        gap_char="-",
                        template_identifier="ID%06i",
                        ignore_errors=False,
                        exclude_pattern=None,
                        include_pattern=None,
                        sample_proportion=None,
                        filter_methods=[],
                        input_filename_fasta="-",
                        input_filename_map=None,
                        fold_width=80)

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    # The parameter stack is consumed front-to-back by the methods that
    # need an extra file name.
    args.parameters = args.parameters.split(",")

    rx_include, rx_exclude = None, None
    if args.include_pattern:
        rx_include = re.compile(args.include_pattern)
    if args.exclude_pattern:
        rx_exclude = re.compile(args.exclude_pattern)

    # old identifier -> new identifier; filled by "build-map" and read
    # by "apply-map".
    map_seq2nid = {}

    # NOTE(review): --map-tsv-file stores into ``map_tsv_file`` but this
    # check reads ``input_filename_map`` (default None), so it looks like
    # it always raises; the loaded map is also not used by any branch
    # below.  Left unchanged - confirm the intended wiring.
    map_identifier = ("apply-map" in args.methods
                      or "map-identifier" in args.methods)
    if map_identifier:
        if args.input_filename_map is None:
            raise ValueError("for method=map-identifier use --map-tsv-file")
        with iotools.open_file(args.input_filename_map) as infile:
            map_identifier = iotools.read_map(infile, has_header=True)

    if args.type == "na":
        mask_chars = args.na_mask_chars
        mask_char = args.na_mask_char
    else:
        mask_chars = args.aa_mask_chars
        mask_char = args.aa_mask_char

    if "map-codons" in args.methods:
        # NOTE(review): the map-identifier code above uses
        # iotools.read_map; confirm that iotools.ReadMap exists as well.
        map_codon2code = iotools.ReadMap(open(args.parameters[0], "r"))
        del args.parameters[0]

    if "mask-soft" in args.methods:
        f = args.parameters[0]
        del args.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in args.methods or "back-translate" in args.methods:

        # open a second stream to read sequences from
        f = args.parameters[0]
        del args.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "sample" in args.methods:
        if not args.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = args.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_ids = None
    for filter_spec in args.filter_methods:
        if filter_spec.startswith("min-length"):
            filter_min_sequence_length = int(filter_spec.split("=")[1])
        elif filter_spec.startswith("max-length"):
            filter_max_sequence_length = int(filter_spec.split("=")[1])
        elif filter_spec.startswith("id-file"):
            # A set gives constant-time membership tests per record and
            # the file is closed as soon as it has been read.
            with iotools.open_file(filter_spec.split("=")[1]) as id_file:
                filter_ids = {line.rstrip("\n") for line in id_file}

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    iterator = pysam.FastxFile(args.input_filename_fasta)

    c = E.Counter()

    fold_width = args.fold_width

    def fold(s, w):
        '''break s into lines of width w; w <= 0 leaves s unfolded.'''
        if w <= 0:
            return s
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.nseq += 1
        c.input += 1

        sequence = re.sub(" ", "", record.sequence)

        if rx_include and not rx_include.search(record.name):
            c.skipped += 1
            continue

        if rx_exclude and rx_exclude.search(record.name):
            c.skipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_ids is None or record.name in filter_ids):
            c.skipped += 1
            continue

        for method in args.methods:

            # Earlier methods may have changed the sequence, so the
            # length is refreshed before every method.
            seq_len = len(sequence)

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                # length of the sequence without gap characters
                ls = len(re.sub('[%s]' % args.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        record.name, ls)
                    c.errors += 1
                    if args.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3]
                              for x in range(0, seq_len, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" % (
                        record.name, other_record.title))

                other_sequence = re.sub("[ %s]" % args.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % args.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in args.gap_chars:
                        codon = args.gap_char * 3
                    else:
                        codon = other_sequence[x:x + 3]
                        x += 3
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(seq_len, record.name)
                seq = []

                for codon in [sequence[x:x + 3]
                              for x in range(0, seq_len, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                # characters (gaps included) seen since the last
                # complete codon
                pending = []
                # non-gap characters of the codon being assembled
                codon_chars = []
                new_sequence = []

                if method == "mask-stops":
                    replacement = args.na_mask_char
                elif method == "remove-stops":
                    replacement = args.gap_char

                for residue in sequence:

                    if residue not in args.gap_chars:
                        codon_chars.append(residue.upper())

                    pending.append(residue)

                    if len(codon_chars) == 3:
                        # mask all non-gaps
                        if Genomics.IsStopCodon("".join(codon_chars)):
                            new_sequence.extend(
                                p if p in args.gap_chars else replacement
                                for p in pending)
                        else:
                            new_sequence.extend(pending)

                        pending = []
                        codon_chars = []

                # trailing characters that do not complete a codon
                new_sequence.extend(pending)

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)

                # Check lengths of unmasked and soft masked sequences the same
                if seq_len != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (record.name))

                # Check if hard masked seq contains repeat (N), if so replace N
                # with lowercase sequence from unmasked version.  Identical
                # sequences are left untouched.
                if sequence != hm_sequence:
                    sequence = "".join(
                        x.lower() if y == "N" else x.upper()
                        for x, y in zip(sequence, hm_sequence))

            elif method == "map-codons":
                raiseIfNotCodon(seq_len, record.name)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, seq_len, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(seq_len, record.name)
                seq = []

                for codon in [sequence[x:x + 3]
                              for x in range(0, seq_len, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3]
                              for x in range(0, seq_len, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3]
                              for x in range(0, seq_len, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":
                sequence = "".join(
                    s for s in sequence if s not in args.gap_chars)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(seq_len, record.name)

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, seq_len, 3)])

            elif method == "apply-map":
                seq_id = re.match(r"^(\S+)", record.name).groups()[0]
                if seq_id in map_seq2nid:
                    rest = record.name[len(seq_id):]
                    record.name = map_seq2nid[seq_id] + rest

            elif method == "build-map":
                # build a map of identifiers; the new identifier is
                # derived from the position of the record in the file
                seq_id = re.match(r"^(\S+)", record.name).groups()[0]
                new_id = args.template_identifier % c.nseq
                if seq_id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" %
                        seq_id)
                map_seq2nid[seq_id] = new_id
                record.name = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for start in range(0, seq_len, 3):
                    nm = len([ch for ch in seq[start:start + 3]
                              if ch in mask_chars])
                    # partially masked codons are masked completely
                    if 0 < nm < 3:
                        seq[start:start + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences.")

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        %
                        (record.name, len(other_sequence) * 3, len(sequence)))

                seq = list(sequence)
                offset = 0
                for x in other_sequence:
                    if x in args.aa_mask_chars:
                        if x.isupper():
                            seq[offset:offset + 3] = \
                                [args.na_mask_char.upper()] * 3
                        else:
                            seq[offset:offset + 3] = \
                                [args.na_mask_char.lower()] * 3
                    offset += 3

                sequence = "".join(seq)

        seq_len = len(sequence)
        if filter_min_sequence_length is not None and \
           seq_len < filter_min_sequence_length:
            c.skipped += 1
            continue

        if filter_max_sequence_length is not None and \
           seq_len > filter_max_sequence_length:
            c.skipped += 1
            continue

        record.sequence = sequence
        if fold_width >= 0:
            if record.comment:
                args.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                args.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            args.stdout.write(str(record) + "\n")

        c.output += 1

    if "build-map" in args.methods:
        # The next entry on the parameter stack names the map file; an
        # empty or missing entry sends the map to stdout.
        p = args.parameters[0] if args.parameters else ""
        if p:
            outfile = iotools.open_file(p, "w")
        else:
            outfile = args.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in map_seq2nid.items():
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info(c)
    E.stop()
Example #12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Collapses identical sequences of the input fasta file into numbered
    clusters.  One representative record per distinct sequence is
    written to stdout, named after the first title seen with that
    sequence.  Every input record, labelled with its cluster number, is
    written to the file given by ``--info-file-out``.

    The input is read twice from ``options.stdin.name``.

    Raises ValueError if a representative title does not match the
    expected ``chrN.tRNAn-X-Y`` layout.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--info-file-out",
                      dest="info_file",
                      type="str",
                      help="name of the info file name[default=%default]")

    parser.set_defaults(info_file="info_file.fa")

    (options, args) = E.start(parser, argv=argv)

    if not args:
        args.append("-")

    E.info(options.stdin)

    # first pass: remember the title of the first record seen for each
    # distinct sequence, keeping the order of first occurrence
    first_title = collections.OrderedDict()
    with IOTools.open_file(options.stdin.name) as infile:
        for cur_record in FastaIterator.FastaIterator(infile):
            first_title.setdefault(cur_record.sequence, cur_record.title)

    # number the clusters and write one representative per cluster to
    # stdout; the number is used below to label every input record
    rx_title = re.compile(r"(chr\d+).tRNA\d+-(\S+)-(\S+)")
    cluster_dict = dict()
    for n, (key, value) in enumerate(first_title.items(), start=1):
        cluster_dict[key] = n

        m = rx_title.match(value)
        if m is None:
            raise ValueError(
                "title '%s' does not match the expected tRNA pattern" %
                value)

        value = m.group(1) + "-" + m.group(2) + "-" + m.group(3)

        options.stdout.write((">cluster%s:%s\n%s\n") % (n, value, key))

    # second pass: look up the cluster of each record by its sequence;
    # the context managers make sure the info file is flushed and closed
    with IOTools.open_file(options.stdin.name) as infile, \
            IOTools.open_file(options.info_file, "w") as outfile_info:
        for cur_record in FastaIterator.FastaIterator(infile):
            cluster = cluster_dict[cur_record.sequence]
            outfile_info.write(
                (">cluster%s:%s\n%s\n") %
                (cluster, cur_record.title, cur_record.sequence))

    E.stop()
Example #13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Splits every sequence of the input fasta file into runs of
    consecutive upper-case and lower-case letters.  For each lower-case
    run a tab-separated line (title, start, end, title, ".", strand) is
    written to stdout.  The strand is the text between the first "("
    and the following ")" of the record title.

    Coordinates start at 1 and each run ends where the next one starts.
    Characters without case that follow the first character are ignored
    and do not advance the coordinates.  A record whose first character
    has no case produces no output.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if not args:
        args.append("-")

    E.info(options.stdin)

    with IOTools.open_file(options.stdin.name) as fastafile:

        for record in FastaIterator.FastaIterator(fastafile):
            chrom = record.title

            # (length, is_lower) for every run, in sequence order.  The
            # case of the current run is tracked with two flags, so the
            # scan is linear in the sequence length.
            runs = []
            run_length = 0
            run_upper = run_lower = False

            for position, letter in enumerate(record.sequence):
                letter_upper = letter.isupper()
                letter_lower = letter.islower()

                if position == 0:
                    # the first character always opens a run
                    run_length = 1
                    run_upper, run_lower = letter_upper, letter_lower
                elif (run_upper and letter_upper) or \
                        (run_lower and letter_lower):
                    run_length += 1
                elif (run_upper and letter_lower) or \
                        (run_lower and letter_upper):
                    # case switch: close the current run, open a new one
                    runs.append((run_length, run_lower))
                    run_length = 1
                    run_upper, run_lower = letter_upper, letter_lower
                # anything else (no case) is ignored

            # an empty sequence has no run to close
            if run_length:
                runs.append((run_length, run_lower))

            end = 1
            for length, is_lower in runs:
                start = end
                end = start + length

                if is_lower:
                    strand = chrom.split("(")[1].split(")")[0]
                    options.stdout.write(
                        ("%s\t%s\t%s\t%s\t%s\t%s\n") %
                        (chrom, start, end, chrom, ".", strand))

    E.stop()