Example #1
0
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]

    primer_thermodynamics_parameters_path = PARAMS["general_primer_thermodynamics_parameters_path"]
    
    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.open_file(fasta)
    
    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)
    
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title in ids:
            outf = IOTools.open_file(os.path.join(
                "input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.items():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n" % seq)
            outf.write("PRIMER_THERMODYNAMIC_PARAMETERS_PATH=%s\n=\n" % primer_thermodynamics_parameters_path)
            outf.close()
Example #2
0
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in
    sequences within fasta formatted *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []
    while 1:
        try:
            seq = next(it)
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.reverse_complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:

            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions
Example #3
0
def segmentUngapped(infile, gap_char, min_gap_size=0):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub("\s.*", "", cur_record.title)
        size = len(cur_record.sequence)

        last_end = 0
        for start, end in gapped_regions(cur_record.sequence, gap_char):
            if end - start < min_gap_size:
                continue

            if last_end != 0:
                yield (contig, last_end, start, 0)
            last_end = end

        if last_end < size:
            yield (contig, last_end, size, 0)
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(iotools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Example #5
0
def segmentWithCpG(infile, with_contig_sizes=False):
    '''segment a fasta file, output locations of CpG.'''

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)

    segments, contig_sizes = [], collections.OrderedDict()

    for cur_record in iterator:
        ninput += 1
        contig = re.sub("\s.*", "", cur_record.title)
        last = None
        contig_sizes[contig] = (0, len(cur_record.sequence))
        for pos, this in enumerate(cur_record.sequence.upper()):
            if last == "C" and this == "G":
                segments.append((contig, pos - 1, pos + 1, 1.0))
            last = this

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    if with_contig_sizes:
        return segments, contig_sizes

    return segments
Example #6
0
    def maskSequences(self, sequences):
        '''mask a collection of sequences.'''

        with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
            for x, s in enumerate(sequences):
                outf.write(">%i\n%s\n" % (x, s))

        infile = outf.name
        statement = self.mCommand % locals()

        E.debug("statement: %s" % statement)

        s = subprocess.Popen(statement,
                             shell=True,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             close_fds=True)

        (out, err) = s.communicate()

        if s.returncode != 0:
            raise RuntimeError(
                "Error in running %s \n%s\nTemporary directory" %
                (statement, err))
        
        result = [
            x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))]

        os.remove(infile)

        return result
def calculateSequenceComposition(interval_names,
                                 sequence_file,
                                 outfile,
                                 header_line=True):
    '''
    given a set of interval names that are present in a
    fasta file, return CpG content file
    '''
    interval_file = open(interval_names)
    if header_line:
        interval_file.readline()
    sequence_file = open(sequence_file)

    interval_set = set()
    for line in interval_file.readlines():
        interval_set.add(line[:-1])

    temp = P.getTempFile("/ifs/scratch")
    for record in FastaIterator.iterate(sequence_file):
        seq_id = record.title.split(" ")[0]
        if seq_id in interval_set:
            temp.write(">%s\n%s\n" % (record.title, record.sequence))
    temp.close()

    inf = temp.name
    statement = '''
    cat %(inf)s | cgat fasta2table
    -s na -s cpg -s length
    --log=%(outfile)s.log > %(outfile)s'''

    P.run()
Example #8
0
def RenameFastaTitle(fastafile, file2tax, outfile):
    taxonomy = file2tax[fastafile]
    suffix=1
    for fasta in fastaiterator.iterate(iotools.open_file(fastafile)):
        suffix_str = "(" + str(suffix) + ")"
        new_title = taxonomy + suffix_str
        suffix += 1
        outfile.write(">" + new_title + "\n" + fasta.sequence + "\n")
Example #9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)


    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)

   # outfile_info = IOTools.open_file(options.info_file, "w")

    d = collections.OrderedDict()
    cluster_dict = dict()

    # first iterate over the fasta file and generate a dict
    # with the name (title) as the key and the sequence as the value
    # Remove any pseudo sequences
    for cur_record in iterator:


        # This is a temp fix because bedtools getfasta --name seems to have
        # changed the way it names the fasta titles. This may be temp but This
        # will fix this issue for the time being.
        m = re.match("(chr\d+.tRNA\d+-\S+-(pseudo)?)::\S+([+|-])", cur_record.title.replace("(","").replace(")",""))

        if m == None:
            continue
        if m.group(2) == "pseudo":
            pass
        else:
            key = str(m.group(1) +  m.group(3))
            d[key] = cur_record.sequence

    # next iterate of over the dict give the cluster a number
    # this will be used to then map back for the info name

    for key, value in d.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n")%(key, value))

    E.stop()
Example #10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fastas2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    (options, args) = E.start(parser)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    iterators = []
    for a in args:
        iterators.append(FastaIterator.FastaIterator(iotools.open_file(a,
                                                                       "r")))

    ninput, noutput, nerrors = 0, 0, 0

    while 1:

        sequences = []
        ids = []

        for iterator in iterators:
            try:
                cur_record = next(iterator)
            except StopIteration:
                break

            sequences.append(re.sub(" ", "", cur_record.sequence))
            ids.append(cur_record.title)

        if not sequences:
            break
        ninput += 1

        if len(sequences) != len(iterators):
            raise ValueError("unequal number of sequences in files")

        noutput += 1

        options.stdout.write(">%s\n%s\n" % (ids[0], "".join(sequences)))

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
Example #11
0
def specformatter(Infile, Outfile):

    infile = iotools.open_file(Infile)
    fastas = fastaiterator.iterate(infile)
    outfile = open(Outfile, "w")

    for fasta in fastas:
        name = fasta.title.split(";")[6]
        specID = name.split("(")[1]
        specID = specID[:-1]
        genusspecies = name.split("(")[0]
        genus = genusspecies.split("_")[0]
        species = genusspecies.split("_")[1]
        newtitle = " ".join([specID, genus, species])
        outfile.write(">" + newtitle + "\n" + fasta.sequence + "\n")
Example #12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)

    # outfile_info = IOTools.open_file(options.info_file, "w")

    d = collections.OrderedDict()
    cluster_dict = dict()

    # first iterate over the fasta file and generate a dict
    # with the name (title) as the key and the sequence as the value
    # Remove any pseudo sequences
    for cur_record in iterator:

        key = cur_record.title
        if "pseudo" in key:
            pass

        else:
            d[key] = cur_record.sequence

    # next iterate of over the dict give the cluster a number
    # this will be used to then map back for the info name

    for key, value in d.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (key, value))

    E.stop()
Example #13
0
def segmentGaps(infile, gap_char):

    iterator = FastaIterator.FastaIterator(infile)

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub("\s.*", "", cur_record.title)

        for start, end in gapped_regions(cur_record.sequence, gap_char):
            yield (contig, start, end, 0)
Example #14
0
def buildMisprimingLib(infiles, outfile):
    '''
    build fasta file of sequences to check for mispriming
    '''
    fasta, identifiers = infiles
    inf = IOTools.open_file(fasta)
    
    E.info("reading ids for sequences to keep")
    ids = readIdentifiers(identifiers)

    outf = IOTools.open_file(outfile, "w")
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.open_file(fasta)):
        if f.title not in ids:
            outf.write(">%s\n%s\n" % (f.title, f.sequence))
    outf.close()
Example #15
0
def segmentFixedWidthWindows(infile, window_size, window_shift):
    """return a list of fixed contig sizes."""

    ninput, nskipped, noutput = 0, 0, 0

    iterator = FastaIterator.FastaIterator(infile)
    window_shift = window_size
    # at most 50% can be gap
    gap_cutoff = int(window_size // 2)
    segments = []

    while 1:
        ninput += 1
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        contig = re.sub("\s.*", "", cur_record.title)
        seq = cur_record.sequence
        size = len(cur_record.sequence)

        for x in range(0, size, window_shift):
            s = seq[x:x + window_size].upper()
            gc, at = 0, 0
            for c in s:
                if c in "GC":
                    gc += 1
                elif c in "AT":
                    at += 1

            # skip segments containing mostly gaps
            if window_size - (gc + at) > gap_cutoff:
                nskipped += 1
                continue

            segments.append(
                (contig, x, x + window_size, float(gc) / (gc + at)))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped_windows=%i" %
           (ninput, noutput, nskipped))

    return segments
Example #16
0
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
        return

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
    -mod %(meme_model)s
    -nmotifs %(meme_nmotifs)s
    -oc %(tmpdir)s
    -maxsize %(motifs_max_size)s
    %(meme_options)s
       > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
Example #17
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--correct-gap-shift",
                      dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option(
        "-1",
        "--pattern1",
        dest="pattern1",
        type="string",
        help="pattern to extract identifier from in identifiers1. "
        "[%default]")

    parser.add_option(
        "-2",
        "--pattern2",
        dest="pattern2",
        type="string",
        help="pattern to extract identifier from in identifiers2. "
        "[%default]")

    parser.add_option("-o",
                      "--output-section",
                      dest="output",
                      type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    parser.set_defaults(correct_shift=False,
                        pattern1="(\S+)",
                        pattern2="(\S+)",
                        output=[])

    (options, args) = E.start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            raise ImportError(
                "option --correct-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[0], "r"))
    ])
    seqs2 = dict([
        (x.title, x.sequence)
        for x in FastaIterator.iterate(iotools.open_file(args[1], "r"))
    ])

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    found2 = {}

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    write_diff = "diff" in options.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2[k] = 1

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
        else:
            status = "other"

            ndiff += 1

            if s1[1:] == s2[1:]:
                ndiff_first += 1
                status = "first"
            elif s1[:m] == s2[:m]:
                ndiff_prefix += 1
                status = "prefix"
            elif s1[:-1] == s2[:-1]:
                ndiff_last += 1
                status = "last"
            else:
                if len(s1) == len(s2):
                    # get all differences: the first and last residues
                    # can be different for peptide sequences when
                    # comparing my translations with ensembl peptides.
                    differences = []
                    for x in range(1, len(s1) - 1):
                        if s1[x] != s2[x]:
                            differences.append((s1[x], s2[x]))

                    l = len(differences)
                    # check for Selenocysteins
                    if len(
                        [x for x in differences
                         if x[0] == "U" or x[1] == "U"]) == l:
                        ndiff_selenocysteine += 1
                        status = "selenocysteine"

                    # check for masked residues
                    elif len([
                            x for x in differences
                            if x[0] in "NX" or x[1] in "NX"
                    ]) == l:
                        ndiff_masked += 1
                        status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print(
                            "# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i"
                            % (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

            if write_diff:
                options.stdout.write("---- %s ---- %s\n" % (k, status))

            if write_seqdiff:
                options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(list(seqs2.keys())):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i"
        % (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine,
           ndiff_masked, nfixed, ndiff - ndiff_first - ndiff_last -
           ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()
Example #18
0
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-w",
        "--weights-tsv-file",
        dest="filename_weights",
        type=str,
        help="filename with codon frequencies. Multiple filenames "
        "can be separated by comma.")

    parser.add_argument("-s",
                        "--section",
                        dest="sections",
                        nargs="*",
                        type=str,
                        choices=("length", "sequence", "hid", "na", "aa",
                                 "cpg", "dn", "degeneracy", "gaps", "codons",
                                 "codon-usage", "codon-translator",
                                 "codon-bias"),
                        help="which sections to output ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="seqtype",
        type=str,
        choices=("na", "aa"),
        help="type of sequence: na=nucleotides, aa=amino acids .")

    parser.add_argument(
        "-e",
        "--regex-identifier",
        dest="regex_identifier",
        type=str,
        help="regular expression to extract identifier from fasta "
        "description line.")

    parser.add_argument(
        "--split-fasta-identifier",
        dest="split_id",
        action="store_true",
        help="split fasta description line (starting >) and use "
        "only text before first space")

    parser.add_argument(
        "--add-total",
        dest="add_total",
        action="store_true",
        help="add a row with column totals at the end of the table")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (args) = E.start(parser, argv=argv)

    rx = re.compile(args.regex_identifier)

    reference_codons = []
    if args.filename_weights:
        args.filename_weights = args.filename_weights.split(",")
        for filename in args.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    iotools.ReadMap(iotools.open_file(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        args.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in list(a.items()):
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                args.stdlog.write(
                    "# tablediff\t%s\t%s\t%f\n" %
                    (args.filename_weights[x], args.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(args.stdin)

    def getCounter(section):

        if args.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(args.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections requires sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif args.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in args.sections:
        totals[section] = getCounter(section)

    args.stdout.write("id")
    for section in args.sections:
        args.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    args.stdout.write("\n")
    args.stdout.flush()

    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if args.split_id is True:
            args.stdout.write("%s" % id.split()[0])
        else:
            args.stdout.write("%s" % id)
        args.stdout.flush()

        for section in args.sections:
            s = getCounter(section)
            s.loadSequence(sequence, args.seqtype)
            totals[section].addProperties(s)

            args.stdout.write("\t" + "\t".join(s.getFields()))

        args.stdout.write("\n")

    if args.add_total:
        args.stdout.write("total")
        for section in args.sections:
            args.stdout.write("\t" + "\t".join(totals[section].getFields()))
        args.stdout.write("\n")

    E.stop()
Example #19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--trna-scheme",
        dest="trna_scheme",
        type="choice",
        choices=("tDR-5'", "tRH-DA"),
        help="name of the tRNA scheme to make bed file for[default=%default]")

    parser.set_defaults(trna_scheme=None)

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    outfile = IOTools.open_file(options.stdout.name, "w")
    trna_options = [
        "tRH-5'", "tRH-DA", "tRH-DTA", "tRH-AT", "tRH-3'", "tRF-5'", "tRF-3'",
        "tRF-D", "tRF-DA", "tRF-A", "tRF-AT", "tRF-T"
    ]
    for trna in trna_options:
        infile = IOTools.open_file(options.stdin.name)
        iterator = FastaIterator.FastaIterator(infile)

        d = collections.OrderedDict()
        cluster_dict = dict()

        # first iterate over the fasta file

        for cur_record in iterator:

            title = cur_record.title
            m = re.match("(cluster\d+):chr\S+.tRNA\d+-(\S+)-\((\S+)\)", title)

            cluster = m.group(1)
            trna_group = m.group(2)
            strand = m.group(3)

            chrom = cluster + ":" + trna_group + "-"
            score = "."
            print(trna)
            if trna == "tRH-5'":
                start = "1"
                end = "33"
            elif trna == "tRH-DA":
                start = "14"
                end = "43"
            elif trna == "tRH-DTA":
                start = "17"
                end = "54"
            elif trna == "tRH-AT":
                start = "38"
                end = "69"
            elif trna == "tRH-3'":
                start = "43"
                end = "73"
            elif trna == "tRF-5'":
                start = "1"
                end = "15"
            elif trna == "tRF-3'":
                start = "58"
                end = "73"
            elif trna == "tRF-D":
                start = "8"
                end = "23"
            elif trna == "tRF-DA":
                start = "20"
                end = "35"
            elif trna == "tRF-A":
                start = "27"
                end = "42"
            elif trna == "tRF-AT":
                start = "33"
                end = "53"
            elif trna == "tRF-T":
                start = "45"
                end = "71"
            else:
                print("tRNA fragment not implemented")
                break
            outfile.write(("%s\t%s\t%s\t%s\t%s\t%s\n") %
                          (chrom, start, end, trna, score, strand))

    E.stop()
Example #20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--info-file-out",
                      dest="info_file",
                      type="str",
                      help="name of the info file name[default=%default]")

    parser.set_defaults(info_file="info_file.fa")

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)

    outfile_info = IOTools.open_file(options.info_file, "w")

    d = collections.OrderedDict()
    cluster_dict = dict()

    # first iterate over the fasta file and generate a dict
    # with the sequnce as the key and the name as the value
    # only add if the sequence occurs once
    for cur_record in iterator:

        key = cur_record.sequence
        if key in d:
            pass
        else:
            d[key] = cur_record.title
    # next iterate of over the dict give the cluster a number
    # this will be used to then map back for the info name
    n = 0
    for key, value in d.items():
        n += 1
        cluster_dict[key] = n
        # output this to std out
        m = re.match("(chr\d+).tRNA\d+-(\S+)-(\S+)", value)

        value = m.group(1) + "-" + m.group(2) + "-" + m.group(3)

        options.stdout.write((">cluster%s:%s\n%s\n") % (n, value, key))

    # iterate over the infile again, this time use the
    # sequence to pull out the cluster it belongs to

    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)

    for cur_record in iterator:
        cluster = cluster_dict[cur_record.sequence]
        outfile_info.write((">cluster%s:%s\n%s\n") %
                           (cluster, cur_record.title, cur_record.sequence))

    E.stop()
Example #21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $"
    )

    parser.add_option("-f",
                      "--file",
                      dest="input_filename",
                      type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i",
        "--input-pattern",
        dest="input_pattern",
        type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o",
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n",
        "--num-sequences",
        dest="num_sequences",
        type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m",
                      "--map",
                      dest="map_filename",
                      type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s",
                      "--skip-identifiers",
                      dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size",
                      dest="min_size",
                      type="int",
                      help="minimum cluster size.")

    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern="^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    (options, args) = E.start(parser)

    if options.input_filename:
        infile = iotools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = iotools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)

    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print("# parsing error in description line %s" % (seq.title))
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()
Example #22
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k",
                      "--kmer-size",
                      dest="kmer",
                      type="int",
                      help="supply kmer length")

    parser.add_option("-p",
                      "--output-proportion",
                      dest="proportion",
                      action="store_true",
                      help="output proportions - overides the default output")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # do not allow greater than octonucleotide
    assert options.kmer <= 8, "cannot handle kmer of length %i" % options.kmer

    # how we deal with the nucleotides depends on the kmer length
    nucleotides = []
    for nucleotide in ["A", "C", "T", "G"]:
        nucleotides = nucleotides + \
            [x for x in itertools.repeat(nucleotide, options.kmer)]

    E.info("retrieving %imer sequences" % options.kmer)
    # get all kmer sequences to query
    kmers = set()
    for kmer in itertools.permutations(nucleotides, options.kmer):
        kmers.add(kmer)

    E.info("matching %imers in file" % options.kmer)
    # count the number of kmers in each sequence

    result = {}

    # NB assume that non fasta files are caught by FastaIterator
    total_entries = 0
    for fasta in FastaIterator.iterate(options.stdin):
        total_entries += 1
        result[fasta.title] = {}
        for kmer in kmers:
            counts = [
                m.start() for m in re.finditer("".join(kmer), fasta.sequence)
            ]
            result[fasta.title][kmer] = len(counts)

    E.info("writing results")
    # write out the results
    headers = sorted(result.keys())
    rows = set()
    for kmer_counts in list(result.values()):
        for kmer, count in kmer_counts.items():
            rows.add("".join(kmer))

    # write header row
    options.stdout.write("kmer\t" + "\t".join(headers) + "\n")

    # output proportions if required - normalises by
    # sequence length
    E.info("computing total counts")
    totals = {}
    for header in headers:
        totals[header] = sum([result[header][tuple(row)] for row in rows])

    for row in sorted(rows):
        if options.proportion:
            options.stdout.write("\t".join([row] + [
                str(float(result[header][tuple(row)]) / totals[header])
                for header in headers
            ]) + "\n")
        else:
            options.stdout.write("\t".join(
                [row] +
                [str(result[header][tuple(row)])
                 for header in headers]) + "\n")

    E.info("written kmer counts for %i contigs" % total_entries)
    # write footer and output benchmark information.
    E.stop()
Example #23
0
twoA = 0
twoG = 0
twoT = 0
twoC = 0
twoO = 0

thrA = 0
thrG = 0
thrT = 0
thrC = 0
thrO = 0

i = 0

###Iterate over and change titles if they are matched.
for transcript in FastaIterator.iterate(infile):
    if transcript.sequence[0] == "A":
        oneA += 1
    elif transcript.sequence[0] == "G":
        oneG += 1
    elif transcript.sequence[0] == "T":
        oneT += 1
    elif transcript.sequence[0] == "C":
        oneC += 1
    else:
        oneO += 1

    if transcript.sequence[1] == "A":
        twoA += 1
    elif transcript.sequence[1] == "G":
        twoG += 1
Example #24
0
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-c", "--is-cds", dest="is_cds", action="store_true",
                        help="input are cds (nucleotide) sequences ")

    parser.set_defaults(
        is_cds=False,
    )

    (args) = E.start(parser, argv=argv)

    args.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(args.stdin):
        identifier = entry.title

        if args.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []
            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(
                            codon[:x] + na + codon[x + 1:])
                        counts[taa] += 1
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):

            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                args.stdout.write(
                    "%s\n" % "\t".join(
                        ("%010i" % snpid,
                         identifier,
                         str(pos + 1),
                         ref,
                         variant,
                         "%i" % w[variant],
                         "%6.4f" % (w[variant] / t),
                         )))

    E.stop()
Example #25
0
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta 
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    match_profile = 'minFP_good.prf'
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with iotools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
         %(match_executable)s
         %(match_matrix)s
         %(outfile)s.fasta
         %(outfile)s.match
         %(match_profile)s
         -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(iotools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence"
    )

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match("Inspecting sequence ID\s+(\S+)",
                               line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")
            ]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(
                MATCH._make((pid, transfac_id, int(pos), strand,
                             float(core_similarity), float(matrix_similarity),
                             sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = iotools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand", "start", "end",
                          "relative_start", "relative_end", "transfac_id",
                          "core_similarity", "matrix_similarity",
                          "sequence")) + "\n")

    bedf = iotools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(iotools.openFile(outfile +
                                                            ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(
                map(str, (transcript_id, strand, genome_start, genome_end,
                          relative_start, relative_end, match.transfac_id,
                          match.core_similarity, match.matrix_similarity,
                          match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(
                map(str, (contig, genome_start, genome_end, transcript_id,
                          strand, match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with iotools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
Example #26
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--output-quality-format",
                        dest="q_format",
                        type=int,
                        help="sequence quality format, e.g 33 = +33/Sanger")

    parser.add_argument("--output-paired-end",
                        dest="paired",
                        action="store_true",
                        help="generate paired end reads")

    parser.add_argument("--insert-length-mean",
                        dest="insert_mean",
                        type=float,
                        help="mean insert length.")

    parser.add_argument("--insert-length-sd",
                        dest="insert_sd",
                        type=float,
                        help="insert length standard deviation.")

    parser.add_argument(
        "--counts-method",
        dest="counts_method",
        type=str,
        choices=("reads", "copies"),
        help="simulate a ground truth number of reads per entry or"
        "copies per entry.")

    parser.add_argument(
        "--counts-min",
        dest="counts_min",
        type=float,
        help="minimum number of reads/read pairs per fasta entry"
        "or copies per entry.")

    parser.add_argument(
        "--counts-max",
        dest="counts_max",
        type=float,
        help="maximum number of reads/read pairs per fasta entry "
        "or copies per entry.")

    parser.add_argument("--output-read-length",
                        dest="read_length",
                        type=int,
                        help="read length.")

    parser.add_argument("--sequence-error-phred",
                        dest="phred",
                        type=int,
                        help="phred quality score.")

    parser.add_argument("--output-counts",
                        dest="output_counts",
                        type=str,
                        help="name for counts outfile.")

    parser.add_argument("--output-fastq2",
                        dest="fastq2_out",
                        type=str,
                        help="filename for second fastq outfile.")

    parser.add_argument("--premrna-fraction",
                        dest="premrna_fraction",
                        type=float,
                        help="the fraction of reads to simulate from pre-mRNA")

    parser.add_argument("--infile-premrna-fasta",
                        dest="premrna_fasta",
                        type=str,
                        help="filename for pre-mRNA fasta.")

    parser.set_defaults(q_format=33,
                        paired=False,
                        insert_mean=0,
                        insert_sd=1,
                        counts_method="reads",
                        counts_min=1,
                        counts_max=1,
                        read_length=50,
                        fastq2_out=None,
                        output_counts=None,
                        phred=30,
                        premrna_fraction=0,
                        premrna_fasta=None)

    (args) = E.start(parser)

    if args.paired:
        assert args.fastq2_out, ("must specify a second fastq outfile for "
                                 "paired end (--output-fastq2)")
        outf2 = iotools.open_file(args.fastq2_out, "w")

    if args.premrna_fraction:
        assert args.premrna_fasta, ("must specfify the location of the"
                                    "fasta file for the pre-mRNA")

    # the sequence quality string will always be the same so define here
    sequence_quality = chr(args.q_format + args.phred)
    qual = "".join([sequence_quality] * args.read_length)

    if args.premrna_fraction:
        iterator = FastaIterator.iterate_together(
            args.stdin, iotools.open_file(args.premrna_fasta))
    else:
        iterator = FastaIterator.FastaIterator(args.stdin)

    # set a cut off of twice the read/pair length for short entries
    if args.paired:
        minimum_entry_length = (2 *
                                ((args.read_length * 2) + args.insert_mean))
    else:
        minimum_entry_length = 2 * args.read_length

    c = collections.Counter()
    counts = collections.Counter()
    copies = collections.Counter()

    for f_entry in iterator:

        if args.premrna_fraction:

            assert getTitle(f_entry[0]) == getTitle(
                f_entry[1]), ("entry ids do not match: %s != %s" %
                              (f_entry[0].title, f_entry[1].title))
            entry = f_entry[0]
            pre_entry = f_entry[1]

        else:
            entry = f_entry

        # reject short fasta entries
        if len(entry.sequence) < minimum_entry_length:
            E.info("skipping short transcript: %s length=%i" %
                   (entry.title, len(entry.sequence)))
            c['skipped'] += 1
            continue

        else:
            c['not_skipped'] += 1

        if args.paired:
            fragment_length = ((2 * args.read_length) + args.insert_mean)
        else:
            fragment_length = args.read_length

        reads_per_entry = float(len(entry.sequence)) / fragment_length

        if args.counts_method == "reads":
            n_reads = random.randint(args.counts_min, args.counts_max + 1)

            n_copies = float(n_reads) / reads_per_entry

            if args.premrna_fraction:
                n_reads_pre = int(round(n_reads * args.premrna_fraction))

        elif args.counts_method == "copies":

            # random float [0-1]
            rand = np.random.random_sample()
            n_copies = (args.counts_min +
                        (rand * (args.counts_max - args.counts_min)))

            n_reads = int(round(n_copies * reads_per_entry, 0))

            # as n_reads must be rounded to int, need to redefine n_copies
            n_copies = float(n_reads) / reads_per_entry

            if args.premrna_fraction:
                reads_per_pre_entry = (float(len(pre_entry.sequence)) /
                                       fragment_length)
                n_copies_pre = n_copies * args.premrna_fraction
                n_reads_pre = int(round(n_copies_pre * reads_per_pre_entry, 0))
                # as n_reads_pre must be rounded to int, need to
                # redefine n_copies_pre
                n_copies_pre = float(n_reads_pre) / reads_per_pre_entry

        entry_id = getTitle(entry)

        counts[entry_id] = n_reads
        copies[entry_id] = n_copies

        if "N" in entry.sequence.upper():
            E.warn("fasta entry %s contains unknown bases ('N')" % entry_id)

        for i in range(0, n_reads):

            read = generateRead(entry=entry.sequence.upper(),
                                read_length=args.read_length,
                                error_rate=args.phred,
                                paired=args.paired,
                                insert_mean=args.insert_mean,
                                insert_sd=args.insert_sd)

            if args.paired:
                r1, r2 = read
                h1 = "@%s_%i/1" % (entry_id, i)
                h2 = "@%s_%i/2" % (entry_id, i)

                args.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

            else:
                h = "@%s_%i/1" % (entry_id, i)

                args.stdout.write("\n".join((h, read, "+", qual)) + "\n")

        if args.premrna_fraction:
            c['pre_counts'] += n_reads_pre
            c['pre_copies'] += n_copies_pre

            for i in range(0, n_reads_pre):

                read = generateRead(entry=pre_entry.sequence.upper(),
                                    read_length=args.read_length,
                                    error_rate=args.phred,
                                    paired=args.paired,
                                    insert_mean=args.insert_mean,
                                    insert_sd=args.insert_sd)

                if args.paired:
                    r1, r2 = read
                    h1 = "@%s_pre-mRNA_%i/1" % (entry_id, i)
                    h2 = "@%s_pre-mRNA_%i/2" % (entry_id, i)

                    args.stdout.write("\n".join((h1, r1, "+", qual)) + "\n")
                    outf2.write("\n".join((h2, r2, "+", qual)) + "\n")

                else:
                    h = "@%s_pre-mRNA_%i/1" % (entry_id, i)

                    args.stdout.write("\n".join((h, read, "+", qual)) + "\n")

    if args.paired:
        outf2.close()

    with iotools.open_file(args.output_counts, "w") as counts_out:

        counts_out.write("%s\n" % "\t".join(("id", "read_count", "tpm")))

        sum_copies = sum(copies.values())
        sum_counts = sum(counts.values())

        for entry_id, count in counts.items():
            tpm = 1000000 * (float(copies[entry_id]) / sum_copies)
            counts_out.write("%s\n" %
                             "\t".join(map(str, (entry_id, count, tpm))))

    E.info("Reads simulated for %i fasta entries, %i entries skipped" %
           (c['not_skipped'], c['skipped']))

    E.info("Simulated: %i reads (%i mRNA, %i pre-mRNA), "
           "%f transcripts (%f mRNA, %f pre-mRNA)" %
           (sum_counts + c['pre_counts'], sum_counts, c['pre_counts'],
            sum_copies + c['pre_copies'], sum_copies, c['pre_copies']))

    E.stop()
Example #27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--outputdir",
                      dest="outdir",
                      type="string",
                      help="output directory to save plots")

    parser.add_option("-f",
                      "--fasta",
                      dest="fasta_file",
                      type="string",
                      help="fasta file containing tRNA cluster fasta seqs")

    parser.set_defaults(fasta_file=None, outdir=None)

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)

    dict_trna = {}
    for record in FastaIterator.iterate(IOTools.open_file(options.fasta_file)):
        title = record.title.strip("-")
        length = len(record.sequence)
        dict_trna[title] = length

# For each read in bamfile find end position and then plot this using length of tRNA cluster
    samfile = pysam.AlignmentFile(options.stdin.name, "rb")
    refname = ""
    values = []
    n = 0
    for line in samfile:
        if line.reference_name == refname:
            if line.reference_end is None:
                pass
            else:
                end = int(line.reference_end) - int(line.reference_start)
                values.append(end)
        elif line.reference_name != refname:
            n += 1
            if n > 1:

                values = pd.Series(values)
                percent = values.value_counts() / values.count() * 100
                percent = percent.sort_index()
                percent = pd.DataFrame(percent)
                percent.rename(columns={0: 'Percent'}, inplace=True)

                # length of each tRNA from fasta
                length = dict_trna[refname.strip("-")] + 1

                temp_df = pd.DataFrame(0,
                                       index=range(1, length),
                                       columns=['A'])
                temp_df = pd.concat([temp_df, percent], axis=1)
                percent = temp_df.fillna(0)

                refname = options.outdir + refname.strip("-")
                outfile = refname + ".csv"
                outfig = refname + ".eps"

                percent.to_csv(outfile)

                g = sns.factorplot(x=percent.index,
                                   y="Percent",
                                   data=percent,
                                   size=8,
                                   kind="bar",
                                   palette="Blues")
                g.set_xlabels('position from 5\' end')
                g.set_xticklabels(rotation=90)
                g.savefig(outfig, format='eps')

                values = []
                refname = line.reference_name

            else:

                refname = line.reference_name

    E.stop()
Example #28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)
    fastafile = IOTools.open_file(options.stdin.name)

    fasta = FastaIterator.FastaIterator(fastafile)

    for line in fasta:
        chrom = line.title
        total_len = len(line.sequence)

        trna_list = []
        string = None

        n = 0
        for letter in line.sequence:

            n += 1
            if n == 1:
                string = letter
            else:
                if string.isupper() and letter.isupper():
                    string = str(string) + str(letter)
                elif string.isupper() and letter.islower():
                    trna_list.append(string)
                    string = letter
                elif string.islower() and letter.islower():
                    string = str(string) + str(letter)
                elif string.islower() and letter.isupper():
                    trna_list.append(string)
                    string = letter
        trna_list.append(string)

        start = 1
        end = 1
        chrom = line.title
        for sequence in trna_list:
            start = end
            end = start + len(sequence)

            if sequence.islower():
                strand = chrom.split("(")[1].split(")")[0]
                options.stdout.write(("%s\t%s\t%s\t%s\t%s\t%s\n") %
                                     (chrom, start, end, chrom, ".", strand))

    E.stop()
Example #29
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="methods",
        type=str,
        action="append",
        choices=("translate", "translate-to-stop", "truncate-at-stop",
                 "back-translate", "mark-codons", "apply-map", "build-map",
                 "pseudo-codons", "filter", "interleaved-codons", "map-codons",
                 "remove-gaps", "mask-seg", "mask-bias", "mask-codons",
                 "mask-incomplete-codons", "mask-stops", "mask-soft",
                 "map-identifier", "nop", "remove-stops", "upper", "lower",
                 "reverse-complement", "sample", "shuffle"),
        help="method to apply to sequences.")

    parser.add_argument("-p",
                        "--parameters",
                        dest="parameters",
                        type=str,
                        help="parameter stack for methods that require one ")

    parser.add_argument("-x",
                        "--ignore-errors",
                        dest="ignore_errors",
                        action="store_true",
                        help="ignore errors.")

    parser.add_argument("--sample-proportion",
                        dest="sample_proportion",
                        type=float,
                        help="sample proportion.")

    parser.add_argument(
        "--exclude-pattern",
        dest="exclude_pattern",
        type=str,
        help="exclude all sequences with ids matching pattern ")

    parser.add_argument(
        "--include-pattern",
        dest="include_pattern",
        type=str,
        help="include only sequences with ids matching pattern ")

    parser.add_argument("--filter-method",
                        dest="filter_methods",
                        type=str,
                        action="append",
                        help="filtering methods to apply ")

    parser.add_argument(
        "-t",
        "--sequence-type",
        dest="type",
        type=str,
        choices=("aa", "na"),
        help="sequence type (aa or na) . This option determines "
        "which characters to use for masking.")

    parser.add_argument(
        "-l",
        "--template-identifier",
        dest="template_identifier",
        type=str,
        help="template for numerical identifier"
        "for the operation --build-map. A %i is replaced by the position "
        "of the sequence in the file.")

    parser.add_argument(
        "--map-tsv-file",
        dest="map_tsv_file",
        type=str,
        help=
        "input filename with map for identifiers. The first row is a header")

    parser.add_argument("--fold-width",
                        dest="fold_width",
                        type=int,
                        help="fold width for sequence output. 0 is unfolded ")

    parser.set_defaults(methods=[],
                        parameters="",
                        type="na",
                        aa_mask_chars="xX",
                        aa_mask_char="x",
                        na_mask_chars="nN",
                        na_mask_char="n",
                        gap_chars="-.",
                        gap_char="-",
                        template_identifier="ID%06i",
                        ignore_errors=False,
                        exclude_pattern=None,
                        include_pattern=None,
                        sample_proportion=None,
                        filter_methods=[],
                        input_filename_fasta="-",
                        input_filename_map=None,
                        fold_width=80)

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    args.parameters = args.parameters.split(",")

    rx_include, rx_exclude = None, None
    if args.include_pattern:
        rx_include = re.compile(args.include_pattern)
    if args.exclude_pattern:
        rx_exclude = re.compile(args.exclude_pattern)

    iterator = FastaIterator.FastaIterator(args.stdin)

    nseq = 0

    map_seq2nid = {}

    map_identifier = ("apply-map" in args.methods
                      or "map-identifier" in args.methods)
    if map_identifier:
        if args.input_filename_map is None:
            raise ValueError("for method=map-identifier use --map-tsv-file")
        with iotools.open_file(args.input_filename_map) as infile:
            map_identifier = iotools.read_map(infile, has_header=True)

    if args.type == "na":
        mask_chars = args.na_mask_chars
        mask_char = args.na_mask_char
    else:
        mask_chars = args.aa_mask_chars
        mask_char = args.aa_mask_char

    if "map-codons" in args.methods:
        map_codon2code = iotools.ReadMap(open(args.parameters[0], "r"))
        del args.parameters[0]

    if "mask-soft" in args.methods:
        f = args.parameters[0]
        del args.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in args.methods or "back-translate" in args.methods:

        # open a second stream to read sequences from
        f = args.parameters[0]
        del args.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "sample" in args.methods:
        if not args.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = args.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in args.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [
                line[:-1] for line in iotools.open_file(f.split("=")[1])
            ]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by
        3'''

        if l % 3 != 0:
            raise ValueError("length of sequence %s not divisible by 3" %
                             (title))

    iterator = pysam.FastxFile(args.input_filename_fasta)

    c = E.Counter()

    fold_width = args.fold_width

    def fold(s, w):
        return "\n".join([s[x:x + w] for x in range(0, len(s), w)])

    for record in iterator:
        c.nseq += 1
        c.input += 1

        sequence = re.sub(" ", "", record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(record.name):
            c.skipped += 1
            continue

        if rx_exclude and rx_exclude.search(record.name):
            c.skipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or record.name in filter_id_list):
            c.skipped += 1
            continue

        for method in args.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % args.gap_chars, sequence, ""))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        record.name, ls)
                    c.errors += 1
                    if args.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if record.name != other_record.title:
                    raise "sequence titles don't match: %s %s" % (
                        record.name, other_record.title)

                other_sequence = re.sub("[ %s]" % args.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % args.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in args.gap_chars:
                        c = args.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "   ".join(seq)

            elif method == "reverse-complement":
                sequence = sequence.translate(
                    str.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = args.na_mask_char
                elif method == "remove-stops":
                    char = args.gap_char

                for x in sequence:

                    if x not in args.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):

                            for x in c:
                                if x in args.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (record.name))

                # Check if hard masked seq contains repeat (N), if so replace N
                # with lowercase sequence from unmasked version
                if sequence == hm_sequence:
                    pass
                else:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break

                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:

                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)

                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in args.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, record.name)
                seq = []

                sequence = " ".join(
                    [sequence[x:x + 3] for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match("^(\S+)", record.name).groups()[0]
                if id in map_seq2nid:
                    rest = record.name[len(id):]
                    record.name = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match("^(\S+)", record.name).groups()[0]
                new_id = args.template_identifier % nseq
                if id in map_seq2nid:
                    raise "duplicate fasta entries - can't map those: %s" % id
                map_seq2nid[id] = new_id
                record.name = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([x for x in seq[x:x + 3] if x in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if record.name != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (record.name, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)

                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i"
                        %
                        (record.name, len(other_sequence) * 3, len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in args.aa_mask_chars:
                        if x.isupper():
                            seq[c:c + 3] = [args.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [args.na_mask_char.lower()] * 3
                    c += 3

                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            c.skipped += 1

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            c.skipped += 1
            continue

        record.sequence = sequence
        if fold_width >= 0:
            if record.comment:
                args.stdout.write(">{} {}\n{}\n".format(
                    record.name, record.comment,
                    fold(record.sequence, fold_width)))
            else:
                args.stdout.write(">{}\n{}\n".format(
                    record.name, fold(record.sequence, fold_width)))
        else:
            args.stdout.write(str(record) + "\n")

        c.output += 1

    if "build-map" in args.methods:
        p = args.parameters[0]
        if p:
            outfile = iotools.open_file(p, "w")
        else:
            outfile = args.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info(c)
    E.stop()