Esempio n. 1
0
def main(argv=None):
    """Summarise the sequence lengths found in a FASTA file.

    The input file is taken from ``-f/--fasta`` or, when given, from the
    first positional argument.  If ``<input>.fai`` exists, names and lengths
    are read from the index via ``pysam.FastaFile``; otherwise the records
    are iterated with ``pysam.FastxFile``.

    A header line and one tab-separated summary row (``has_index``,
    ``nsequences``, ``total_length``, ``min_length``, ``max_length``,
    ``median_length``, ``mean_length``) are written to ``args.stdout``.
    With ``-o/--output-filename-sequences`` the name and length of every
    sequence are additionally written to that file.

    Arguments
    ---------
    argv : list, optional
        Command line handed on to ``E.start``.

    Raises
    ------
    ValueError
        If no input file name was supplied.
    """

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-f",
                        "--fasta",
                        dest="input_filename_fasta",
                        type=str,
                        help="filename with fasta sequences. ")

    parser.add_argument("-o",
                        "--output-filename-sequences",
                        dest="output_filename_sequences",
                        type=str,
                        help="output per sequence information to filename")

    parser.set_defaults(input_filename_fasta=None, )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    # a positional argument overrides -f/--fasta
    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    fasta = args.input_filename_fasta
    if fasta is None:
        raise ValueError("missing input fasta file")

    if fasta != "-" and os.path.exists(fasta + ".fai"):
        # an index is available: no need to read the sequences themselves
        has_index = 1
        with pysam.FastaFile(fasta) as fastafile:
            sequence_pairs = list(
                zip(fastafile.references, fastafile.lengths))
    else:
        has_index = 0
        with pysam.FastxFile(fasta) as iterator:
            sequence_pairs = [(record.name, len(record.sequence))
                              for record in iterator]

    lengths = numpy.array([x[1] for x in sequence_pairs])

    args.stdout.write("\t".join(("has_index", "nsequences", "total_length",
                                 "min_length", "max_length", "median_length",
                                 "mean_length")) + "\n")

    if len(lengths) > 0:
        summary = (has_index, len(sequence_pairs), lengths.sum(),
                   lengths.min(), lengths.max(), numpy.median(lengths),
                   lengths.mean())
    else:
        # min/max/median/mean are undefined for an empty file
        summary = (has_index, len(sequence_pairs), 0, "", "", "", "")
    args.stdout.write("\t".join(map(str, summary)) + "\n")

    if args.output_filename_sequences:
        with iotools.open_file(args.output_filename_sequences, "w") as outf:
            outf.write("name\tlength\n")
            outf.write(
                "\n".join(["\t".join(map(str, x))
                           for x in sequence_pairs]) + "\n")

    E.stop()
Esempio n. 2
0
def main(argv=sys.argv):
    """Parse the command line, prepare the input stream and call ``run``.

    The parser comes from ``buildParser``.  Input is ``options.stdin``,
    wrapped in a gzip reader when ``options.from_zipped`` is set.
    ``options.header_names`` is converted from a string into a list:
    split on commas if it contains one (empty names become ``'-'``),
    otherwise split on whitespace.
    """

    parser = buildParser()

    (options, args) = E.start(parser, argv=argv,
                              add_database_options=True)

    if options.from_zipped:
        import gzip
        infile = gzip.GzipFile(fileobj=options.stdin, mode='r')

    else:
        infile = options.stdin

    if options.header_names:
        if "," in options.header_names:
            # sqlalchemy.exc.ArgumentError:
            #     Column must be constructed with a non-blank
            #     name or assign a non-blank .name before adding to a Table.
            options.header_names = [
                name if name else '-'
                for name in options.header_names.split(',')]
        else:
            # raw string: "\s" in a normal literal is an invalid escape
            options.header_names = re.split(
                r"\s+", options.header_names.strip())

    run(infile, options)

    E.stop()
Esempio n. 3
0
def main(argv=None):
    """script main.

    Reads ``--Infile`` and ``--Outfile`` from the command line
    (``sys.argv`` unless *argv* is given) and passes both values on to
    ``specformatter``.
    """
    argv = sys.argv if argv is None else argv

    # command line definition
    cli = E.ArgumentParser(description=__doc__)
    cli.add_argument(
        "--Infile", dest="Infile", type=str,
        help="Supply file containing filtered 16S fasta file")
    cli.add_argument(
        "--Outfile", dest="Outfile", type=str,
        help="Supply desired outfile name")

    # E.start adds the common options (-h/--help, ...) and parses
    parsed = E.start(cli, argv=argv)

    source, target = parsed.Infile, parsed.Outfile
    specformatter(source, target)

    # footer and benchmark information
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every file matching ``<directory>/*/*genes*summary*`` is passed to
    ``reformat`` after a header line has been written to ``sys.stdout``.

    Raises
    ------
    ValueError
        If ``-d/--directory`` was not supplied.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d",
        "--directory",
        dest="directory",
        type="string",
        help="supply directory where the input summaries are located")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # os.path.join(None, ...) would otherwise fail with an opaque TypeError
    if options.directory is None:
        raise ValueError("please supply a directory with -d/--directory")

    infiles = glob.glob(os.path.join(options.directory, "*/*genes*summary*"))
    sys.stdout.write("category\tnreads\tpreads\tsample\n")
    for infile in infiles:
        reformat(infile)

    # write footer and output benchmark information.
    E.stop()
Esempio n. 5
0
def main(argv=None):
    """Count how many positions are shared between sets of VCF files.

    Each positional argument is read with
    ``read_vcf_positions_into_dataframe`` and tagged with a column named
    after the file (first group of ``--regex-filename`` applied to the
    file name, or the base name without ``.vcf.gz``).  The per-file tables
    are outer-merged, missing values are filled with 0, and the number of
    positions for every observed combination of file columns is written
    as a tab-separated table to ``options.stdout``.

    Raises
    ------
    ValueError
        If fewer than two files are given, or if ``--regex-filename``
        does not match a file name or has no capture group.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--regex-filename",
        dest="regex_filename",
        type="string",
        help="extract column name from filename via regular expression "
        "[%default]")

    parser.add_option("--filter",
                      dest="filters",
                      type="choice",
                      action="append",
                      choices=("PASS", "SNP"),
                      help="apply filters to VCFs when reading "
                      "[%default]")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dfs = []
    for filename in args:
        if options.regex_filename:
            match = re.search(options.regex_filename, filename)
            if match is None:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        options.regex_filename, filename))
            if not match.groups():
                # the column name is taken from the first capture group
                raise ValueError(
                    "regular expression '{}' requires a capture group".format(
                        options.regex_filename))
            name = match.groups()[0]
        else:
            name = iotools.snip(os.path.basename(filename), ".vcf.gz")

        E.debug("reading data from {}".format(filename))
        df = read_vcf_positions_into_dataframe(filename,
                                               filters=options.filters)
        # presence flag for this file; becomes 0 after the outer merge
        # wherever the position is absent
        df[name] = 1
        dfs.append(df)

    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, how="outer")
    merged_df = merged_df.fillna(0)
    ddf = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = ddf.groupby(by=list(ddf.columns)).size()
    set_counts = set_counts.reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]

    set_counts.to_csv(options.stdout, sep="\t", index=False)
    E.stop()
def main(argv):
    """Tabulate character frequencies of two files side by side.

    The first positional argument is the input, the second the reference.
    Both are read completely and stripped of surrounding whitespace; for
    every character seen in either file a line ``key<TAB>input<TAB>reference``
    is written to ``options.stdout``.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("-o", "--option", dest="option", type="string")
    (options, args) = E.start(parser, argv)

    def _slurp(path):
        # whole file content without leading/trailing whitespace
        with IOTools.open_file(path) as handle:
            return "".join(handle.readlines()).strip()

    data_counts = Counter(_slurp(args[0]))
    ref_counts = Counter(_slurp(args[1]))

    # union of the characters seen in either file
    all_keys = set(data_counts) | set(ref_counts)

    out = options.stdout
    out.write("key\tinput\treference\n")
    for char in sorted(all_keys):
        row = (char, str(data_counts[char]), str(ref_counts[char]))
        out.write("\t".join(row) + "\n")

    E.stop()
Esempio n. 7
0
def main(argv=None):
    """Call ``purge_run_id`` with the run id and database url given on
    the command line.

    ``-n/--dry-run`` is forwarded as the ``dry_run`` keyword.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-r", "--run-id",
                      dest="run_id",
                      type="int",
                      default=None,
                      help="numerical identifier of a run [%default]")

    parser.add_option("-d", "--database-url",
                      dest="database_url",
                      type="string",
                      default="sqlite:///./csvdb",
                      help="database url [%default]")

    parser.add_option("-n", "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      default=False,
                      help="only show statements to be executed [%default]")

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    run_id = options.run_id
    url = options.database_url
    purge_run_id(run_id, url, dry_run=options.dry_run)

    E.stop()
Esempio n. 8
0
def main(argv=None):
    """Write a VCF of randomly sampled reference positions per contig.

    The FASTA file is given as the first positional argument.  For each
    record, positions are sampled without replacement; positions whose
    base is ``N`` are rejected and re-drawn.  Every sampled position is
    written to ``options.stdout`` as a homozygous-reference (``0/0``)
    record whose REF and ALT are both the reference base.

    ``--sample-size`` below 1 is interpreted as a fraction of the contig
    length, otherwise as an absolute number of positions per contig.
    The number of samples is capped at the number of non-``N`` bases so
    that the rejection loop always terminates.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--sample-size", dest="sample_size", type="float",
        help="sample size. If less than 1, take a proportion of the chromosome size. "
        "If 1 or greater, take a fixed number of variants [%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        sample_size=0.001,
        sample_name="NA12878"
    )

    (options, args) = E.start(parser,
                              argv=argv,
                              add_output_options=True)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    # NOTE(review): this hands a file object to pysam.FastxFile - confirm
    # that pysam accepts it, or whether "-" should be passed through.
    if options.input_filename_fasta == "-":
        options.input_filename_fasta = options.stdin

    outf = options.stdout
    outf.write("##fileformat=VCFv4.1\n")
    outf.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
    outf.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n".format(options.sample_name))

    with pysam.FastxFile(options.input_filename_fasta) as inf:
        for record in inf:
            contig = record.name
            sequence = record.sequence
            if options.sample_size < 1.0:
                nsamples = int(len(sequence) * options.sample_size)
            else:
                nsamples = int(options.sample_size)

            # only non-N positions can be accepted; asking for more than
            # exist would make the rejection loop below spin forever
            ncallable = len(sequence) - sequence.count("N")
            if nsamples > ncallable:
                E.warn("contig {}: requested {} samples, but only {} "
                       "non-N positions available".format(
                           contig, nsamples, ncallable))
                nsamples = ncallable

            E.info("generating {} sampled variants for contig {}".format(nsamples, contig))
            sampled_positions = set()
            # random.sample works on a range directly - no need to
            # materialise the list of positions on every iteration
            positions = range(len(sequence))
            while len(sampled_positions) < nsamples:
                raw_positions = random.sample(
                    positions, nsamples - len(sampled_positions))
                filtered_positions = [x for x in raw_positions if sequence[x] != "N"]
                sampled_positions.update(filtered_positions)
                E.debug("sample update: total={}, raw={}, filtered={}".format(
                        len(sampled_positions),
                        len(raw_positions),
                        len(filtered_positions)))

            for position in sorted(sampled_positions):
                base = sequence[position]
                # VCF positions are 1-based
                outf.write("{}\t{}\t.\t{}\t{}\t.\t.\t.\tGT\t0/0\n".format(
                        contig, position + 1, base, base))

    E.stop()
Esempio n. 9
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Expects exactly two positional arguments: a bed file to iterate over
    (``-`` for stdin) and a bed file that is indexed.  For every entry of
    the first file, each overlapping entry of the second file is written
    to ``args.stdout`` together with it - either both entries in full or,
    with ``--output-section=name``, only the first field of each.
    """
    argv = argv or sys.argv

    parser = E.ArgumentParser(description=__doc__)
    parser.add_argument("--version", action='version', version="1.0")
    parser.add_argument("-o", "--output-section", dest="output", type=str,
                        choices=("full", "name"),
                        help="output either ``full`` overlapping entries, only the ``name``s.")
    parser.set_defaults(
        output="full",
    )

    # E.start adds the common options (-h/--help, ...) and parses
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two arguments required")

    query_name, index_name = unknown
    if query_name == "-":
        infile1 = args.stdin
    else:
        infile1 = iotools.open_file(query_name, "r")
    infile2 = iotools.open_file(index_name, "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    outfile = args.stdout

    if args.output == "name":
        outfile.write("name1\tname2\n")

        def render(entry):
            return entry.fields[0]
    else:
        render = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for hit in overlaps:
            # the bed entry stored in the index is the third element
            outfile.write("\t".join((render(bed), render(hit[2]))) + "\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every ``*.pyx`` file next to this script, the generated ``.c`` and
    ``.pyxbldc`` files are removed and the companion script (same name
    without the leading underscore, ``.py`` extension) is run with
    ``--help``.  Missing companion scripts are skipped with a warning.
    """
    argv = argv or sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--test-option",
                      dest="test_option",
                      type="string",
                      help="test option [default=%default].")

    parser.set_defaults(test_option="test")

    # E.start adds the common options (-h/--help, ...) and parses
    (options, args) = E.start(parser, argv=argv)

    here = os.path.dirname(__file__)
    pyx_files = glob.glob(os.path.join(here, "*.pyx"))

    n_seen = 0
    n_skipped = 0
    n_built = 0

    for pyx in pyx_files:
        E.info("rebuilding %s" % pyx)
        n_seen += 1

        stem = os.path.splitext(pyx)[0]
        # drop stale build products; it is fine if they do not exist
        for stale in (stem + ".c", stem + ".pyxbldc"):
            try:
                os.remove(stale)
            except OSError:
                pass

        dirname, basename = os.path.split(stem)
        assert basename.startswith("_")

        scriptname = os.path.join(dirname, basename[1:]) + ".py"
        if os.path.exists(scriptname):
            E.info("compiling %s" % scriptname)
            os.system("%s %s --help > /dev/null" % (sys.executable, scriptname))
            n_built += 1
        else:
            E.warn("script %s does not exist - skipped" % scriptname)
            n_skipped += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (n_seen, n_built, n_skipped))

    # footer and benchmark information
    E.stop()
Esempio n. 11
0
def main(argv=sys.argv):
    """Write one row of counter results per input GFF/GTF file.

    Input files are the positional arguments; without any, ``args.stdin``
    is read.  Each file is iterated with ``GTF.iterator`` wrapped in
    ``counter_gff`` and, with ``--is-gtf``, additionally in
    ``counter_exons``.  The output row holds the track name (file name or
    ``stdin``) followed by the string form of every counter.
    """

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    # argv has to be forwarded, otherwise a caller-supplied command line
    # is silently ignored
    (args, unknown) = E.start(parser,
                              argv=argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 0:
        files = [args.stdin]
    else:
        # the file names are the positional arguments, not the namespace
        files = unknown

    args.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if args.is_gtf:
        args.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    args.stdout.write("\n")

    for f in files:
        from_stdin = f is args.stdin
        if from_stdin:
            infile = f
            args.stdout.write("stdin")
        else:
            infile = iotools.open_file(f)
            args.stdout.write(f)

        try:
            counters = [counter_gff(GTF.iterator(infile))]
            if args.is_gtf:
                # the exon counter consumes the output of the gff counter
                counters.append(counter_exons(counters[0]))

            # exhaust the outermost counter so that all of them see
            # every record
            for _ in counters[-1]:
                pass

            for c in counters:
                args.stdout.write("\t%s" % str(c))
            args.stdout.write("\n")
        finally:
            # never close the stream handed in by E.start
            if not from_stdin:
                infile.close()

    E.stop()
Esempio n. 12
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two fastq files in lockstep (``-a``/``-b`` or exactly two
    positional arguments) and writes each pair as two consecutive fasta
    records to ``args.stdout``.

    Raises
    ------
    PairedReadError
        If one file yields a read while the other does not.
    AssertionError
        If the identifiers do not end in ``/1`` and ``/2`` respectively.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")
    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser,
                              argv=argv,
                              unknowns=True)

    # two positional arguments take precedence over -a/-b
    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    E.info("iterating over fastq files")
    f1_count = 0
    with iotools.open_file(args.fastq1) as fastq1, \
            iotools.open_file(args.fastq2) as fastq2:
        # zip_longest pads the shorter file with None, which exposes
        # files of unequal length
        for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                                  Fastq.iterate(fastq2)):
            if not (f1 and f2):
                raise PairedReadError(
                    "unpaired reads detected. Are files sorted? are "
                    "files of equal length?")

            assert f1.identifier.endswith("/1") and \
                f2.identifier.endswith("/2"), \
                "Reads in file 1 must end with /1 and reads in file 2 with /2"
            args.stdout.write(
                ">%s\n%s\n>%s\n%s\n" %
                (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
Esempio n. 13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Re-opens the file behind ``options.stdin`` as fasta, keeps the records
    whose title matches the tRNA naming pattern and is not flagged
    ``pseudo``, and writes them to ``options.stdout`` with ``cca``
    appended to the sequence.  Records with the same derived name
    overwrite each other; the first occurrence determines the output
    position.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    E.info(options.stdin)

    # This is a temp fix because bedtools getfasta --name seems to have
    # changed the way it names the fasta titles. This may be temp but This
    # will fix this issue for the time being.
    title_pattern = re.compile(
        r"(chr\d+.tRNA\d+-\S+-(pseudo)?)::\S+([+|-])")

    d = collections.OrderedDict()

    # first iterate over the fasta file and generate a dict
    # with the name (title) as the key and the sequence as the value
    # Remove any pseudo sequences
    with IOTools.open_file(options.stdin.name) as infile:
        for cur_record in FastaIterator.FastaIterator(infile):
            title = cur_record.title.replace("(", "").replace(")", "")
            m = title_pattern.match(title)

            if m is None or m.group(2) == "pseudo":
                continue

            # name = matched prefix followed by the strand character
            key = m.group(1) + m.group(3)
            d[key] = cur_record.sequence

    for key, value in d.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (key, value))

    E.stop()
Esempio n. 14
0
def main(argv):
    """Print the content of the file given as first argument, reversed
    character by character."""

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv)

    # context manager so that the handle is closed deterministically
    with open(args[0]) as inf:
        data = inf.read()

    print(data[::-1])

    # every E.start is paired with an E.stop
    E.stop()
Esempio n. 15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads the fasta files given as positional arguments in lockstep and
    writes, for the n-th record of every file, one record to
    ``options.stdout`` whose title is that of the first file and whose
    sequence is the concatenation of all n-th sequences (blanks removed).

    Raises
    ------
    ValueError
        If fewer than two files are given, or if a file runs out of
        records after an earlier file still supplied one.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: fastas2fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    # forward argv - otherwise a caller-supplied command line is ignored
    (options, args) = E.start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    handles = [iotools.open_file(a, "r") for a in args]
    iterators = [FastaIterator.FastaIterator(h) for h in handles]

    ninput, noutput, nerrors = 0, 0, 0

    try:
        while True:

            sequences = []
            ids = []

            for iterator in iterators:
                try:
                    cur_record = next(iterator)
                except StopIteration:
                    break

                sequences.append(cur_record.sequence.replace(" ", ""))
                ids.append(cur_record.title)

            # the first file is exhausted: done
            if not sequences:
                break
            ninput += 1

            if len(sequences) != len(iterators):
                raise ValueError("unequal number of sequences in files")

            noutput += 1

            options.stdout.write(">%s\n%s\n" % (ids[0], "".join(sequences)))
    finally:
        for handle in handles:
            handle.close()

    E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Splits a tab-separated table read from ``args.stdin`` into one file
    per taxonomic level.  The first column is a ``|``-separated taxonomy
    string; the level marker (``d__``, ``k__``, ... ``s__``) found in its
    last element selects the output file
    ``<outdir>/<prefix>_<level>.tsv``.  Lines without a known marker are
    dropped.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-o", "--outdir", dest="outdir", type=str,
                        help="supply output directory")
    parser.add_argument("-p", "--prefix", dest="prefix", type=str,
                        help="supply output file prefix")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    # marker -> level name; the order is the order in which the markers
    # are tested
    levels = (("d__", "domain"),
              ("k__", "kingdom"),
              ("p__", "phylum"),
              ("c__", "class"),
              ("o__", "order"),
              ("f__", "family"),
              ("g__", "genus"),
              ("s__", "species"))

    prefix = os.path.join(args.outdir, args.prefix)
    outfiles = {}
    try:
        for marker, level in levels:
            outfiles[marker] = open(prefix + "_" + level + ".tsv", "w")

        for line in args.stdin:
            # strip only the newline so that a final line without one is
            # not truncated
            data = line.rstrip("\n").split("\t")
            taxon = data[0]
            counts = data[1:]
            leaf = taxon.split("|")[-1]

            for marker, level in levels:
                if marker not in leaf:
                    continue
                if marker == "d__":
                    # NOTE(review): looks like left-over debug output -
                    # kept to preserve what is written to stdout
                    print(taxon, counts)
                outfiles[marker].write("\t".join([taxon] + counts) + "\n")
                break
    finally:
        # make sure every file that was opened is flushed and closed
        for outf in outfiles.values():
            outf.close()

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts tab-separated gene set lines read from ``options.stdin``
    (name, evidence, gene, gene, ...) into one line per gene on
    ``options.stdout``: ontology, gene, name, description, evidence.  The
    first column is used both as name and as description.  With
    ``--filter`` only gene sets listed in ``--filter-list`` (one name per
    line) are kept.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--ontology",
                      dest="ontology",
                      type="string",
                      help="ontology label")

    parser.add_option("-f",
                      "--filter",
                      dest="filter",
                      action="store_true",
                      help="filter out genesets")

    parser.add_option("-l",
                      "--filter-list",
                      dest="filter_list",
                      type="string",
                      help="list of pathways to keep")

    parser.set_defaults(ontology=None, filter=False, filter_list=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # None means "no filtering"
    filter_set = None
    if options.filter:
        assert options.filter_list, "must specify a list of pathways to keep"
        with open(options.filter_list) as inf:
            filter_set = {line.rstrip("\n") for line in inf}

    for line in options.stdin:
        # strip only the newline so that a final line without one is not
        # truncated
        data = line.rstrip("\n").split("\t")
        name, description, evidence = data[0], data[0], data[1]
        if filter_set is not None and name not in filter_set:
            continue
        for gene in data[2:]:
            options.stdout.write("\t".join(
                [options.ontology, gene, name, description, evidence]) + "\n")

    # write footer and output benchmark information.
    E.stop()
Esempio n. 18
0
def main(argv=None):
    """Summarise a delimited table read from ``options.stdin``.

    Without ``-m/--method`` the rows yielded by ``compute_table_summary``
    are written to ``options.stdout``.  ``column-describe`` and
    ``row-describe`` write the stacked ``describe()`` output of the table
    (respectively of its transpose) to the output file opened via
    ``E.open_output_file``.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--delimiter",
                      dest="delimiter",
                      type="string",
                      help="delimiter to separate columns [%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=["row-describe", "column-describe"],
                      help="additional methods to apply [%default]")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.methods:
        options.methods = ["summary"]

    # sep must be passed by keyword: positional use is deprecated in
    # pandas 1.x and rejected by pandas 2
    table = pandas.read_csv(options.stdin, sep=options.delimiter)

    options.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in options.methods:
        # output section name for E.open_output_file
        label = re.sub("-", "_", method)
        if method == "summary":
            for category, count, denominator, info in compute_table_summary(
                    table):
                options.stdout.write("\t".join(
                    map(str, (category, count,
                              iotools.pretty_percent(count, denominator,
                                                     na=""), info))) + "\n")
        elif method in ("column-describe", "row-describe"):
            if method == "column-describe":
                df = table.describe().T.stack()
            else:
                df = table.T.describe().stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

    E.stop()
Esempio n. 19
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Re-opens the file behind ``options.stdin`` and, for every
    whitespace-separated line whose fourth column does not contain
    ``pseudo``, writes a copy to ``options.stdout`` in which columns 2
    and 7 are the original start minus 50, columns 3 and 8 the original
    end plus 50, and the block columns (11 and 12) are adjusted for the
    added flanks.  Presumably the input is BED12 - confirm against the
    caller.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    E.info(options.stdin)

    with IOTools.open_file(options.stdin.name) as infile:
        for line in infile:

            column = line.split()
            if not column:
                # tolerate blank lines instead of failing on column[0]
                continue

            start = str(int(column[1]) - 50)
            end = str(int(column[2]) + 50)

            new_columns = [
                column[0], start, end,
                column[3], column[4], column[5],
                start, end,
                column[8], column[9],
            ]

            if "pseudo" in column[3]:
                continue

            if int(column[9]) == 2:
                # two blocks: each block grows by one flank and the
                # second block start moves accordingly
                c, d = column[10].split(",")
                block = int(column[2]) - int(column[1]) - int(d) + 50
                new_columns += [
                    "{},{}".format(int(c) + 50, int(d) + 50),
                    "0,{}".format(block),
                ]
            else:
                # single block: it grows by both flanks
                new_columns += [str(int(column[10]) + 100), column[11]]

            options.stdout.write('\t'.join(new_columns) + '\n')

    E.stop()
Esempio n. 20
0
def main(argv):
    """Print the content of the file given as first argument with every
    ``o`` replaced by ``a``."""

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--option", dest="option", type="string")

    (options, args) = E.start(parser, argv)

    # context manager so that the handle is closed deterministically
    with open(args[0]) as inf:
        data = inf.read()

    print(re.sub("o", "a", data))

    E.stop()
Esempio n. 21
0
def main(argv=sys.argv):
    """Write name and sequence length of every read in a fastq file.

    The file is given with ``-i/--input-fastq-file`` or as the single
    positional argument.  Only ``--method=length`` is implemented; any
    other method selection (including none) raises
    ``NotImplementedError``.
    """

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-fastq-file",
                      dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m", "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=("length", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (options, args) = E.start(parser, argv)

    # a single positional argument overrides -i
    if len(args) == 1:
        options.input_fastq_file = args[0]

    fastq_name = options.input_fastq_file
    if fastq_name is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    out = options.stdout
    with pysam.FastqFile(fastq_name) as reads:
        for read in reads:
            counter.input += 1
            row = (str(read.name), str(len(read.sequence)))
            out.write("\t".join(row) + "\n")
            counter.output += 1

    E.info(counter)
    E.stop()
Esempio n. 22
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Runs ``bcl2fastq`` with the pass-through arguments, checks that every
    ``*.fastq.gz`` below the output directory has a line count divisible
    by four and, with ``-f/--fastqc``, runs ``fastqc`` on each of them.

    Raises
    ------
    ValueError
        If ``bcl2fastq`` is not on the path, or if a fastq file has a
        line count that is not a multiple of four.
    """
    # local import: only needed for quoting paths in shell commands
    import shlex

    if not argv:
        argv = sys.argv

    parser = E.ArgumentParser()

    parser.add_argument("-p", "--arguments", type=str, dest="arguments",
                        default="",
                        help="Pass options and arguments to the executable. Please surround options in \"\"")

    parser.add_argument("-o", "--output-dir", type=str, dest="output",
                        default=".",
                        help="Output for the fastq files.")

    parser.add_argument("-f", "--fastqc", dest="fastqc",
                        action="store_true",
                        help="After demultiplexing open the fastq files in FastQC.")

    parser.add_argument("-F", "--fastqc-options", type=str, dest="fastqc_options",
                        default="",
                        help="Options for FastQC. Please surround options in \"\"")

    parser.add_argument("-H", "--bcl2fastq-help", dest="bcl2fastq_help",
                        action="store_true",
                        help="Print help for Illumina's bcl2fastq conversion software")

    # forward argv - otherwise a caller-supplied command line is ignored
    (args) = E.start(parser, argv=argv)

    if subprocess.run("which bcl2fastq", shell=True).returncode:
        raise ValueError("bcl2fastq cannot be found")

    if args.bcl2fastq_help:
        subprocess.run("bcl2fastq --help", shell=True)
        E.stop()
        return

    # args.arguments is deliberately handed to the shell verbatim (it is
    # a user-supplied option string); only the path is quoted
    subprocess.run(
        f"bcl2fastq {args.arguments} -o {shlex.quote(args.output)}",
        shell=True)

    fastq_files = glob.glob(f"{args.output}/**/*.fastq.gz", recursive=True)

    for infile in fastq_files:
        # count newlines chunk-wise instead of decoding the whole
        # decompressed file in memory
        nlines = 0
        with gzip.GzipFile(infile, "r") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                nlines += chunk.count(b"\n")
        if nlines % 4 != 0:
            raise ValueError(f"{infile} is either corrupt or incomplete.")

    if args.fastqc:
        for infile in fastq_files:
            subprocess.run(
                f"fastqc {shlex.quote(infile)} {args.fastqc_options}",
                shell=True)

    # write footer and output benchmark information.
    E.stop()
Esempio n. 23
0
def main(argv=sys.argv):
    """Write the name and sequence length of every read in a fastq file.

    The input file is taken from ``-i/--input-fastq-file`` or, when exactly
    one unrecognised argument is left over, from that argument. Output is
    one tab-separated ``name<TAB>length`` line per read on ``args.stdout``.

    Raises:
        ValueError: if no input fastq file was given.
        NotImplementedError: unless the selected methods are exactly
            ``["length"]``.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-i", "--input-fastq-file",
        dest="input_fastq_file", type=str,
        help="input fastq file. ")

    parser.add_argument(
        "-m", "--method",
        dest="methods", action="append", type=str,
        choices=("length", ),
        help="methods to apply ")

    parser.set_defaults(methods=[], input_fastq_file=None)

    args, extra = E.start(parser, argv, unknowns=True)

    # a single left-over argument is interpreted as the input file
    if len(extra) == 1:
        (args.input_fastq_file,) = extra

    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # only the "length" method is implemented
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as fastq:
        for read in fastq:
            counter.input += 1
            args.stdout.write(f"{read.name}\t{len(read.sequence)}\n")
            counter.output += 1

    E.info(counter)
    E.stop()
Esempio n. 24
0
def main(argv=sys.argv):
    """Convert fastq records into line-wrapped fasta records on stdout.

    The input file is taken from ``-i/--input-fastq`` or, when exactly one
    unrecognised argument is left over, from that argument; ``-`` selects
    ``args.stdin``. For every record a header of the form
    ``>test/<n>/1_<len+1> RQ=0.<qv>`` is written, where ``<n>`` is the
    1-based record number and ``<qv>`` the floored mean of the record's
    quality array, followed by the sequence wrapped at ``line_width``
    (default 80) characters.

    NOTE(review): the header layout presumably mimics PacBio read naming
    (the only method choice is "ont2pacbio") - confirm against consumers.

    Raises:
        ValueError: if no input fastq file was given.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i",
                        "--input-fastq",
                        dest="input_fastq_file",
                        type=str,
                        help="input fastq file")

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        choices=["ont2pacbio"],
                        help="methods to apply ")

    parser.set_defaults(
        input_fastq_file=None,
        line_width=80,
        method=None,
    )

    (args, unknown) = E.start(parser,
                              argv,
                              add_output_options=True,
                              unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    # Fail with a clear message instead of passing None on to pysam.
    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    if args.input_fastq_file == "-":
        args.input_fastq_file = args.stdin

    outf = args.stdout
    line_width = args.line_width

    # The context manager ensures the fastq handle is closed even when
    # writing fails part-way through.
    with pysam.FastqFile(args.input_fastq_file) as inf:
        for well_no, record in enumerate(inf, start=1):
            quals = record.get_quality_array()
            seq = record.sequence
            qv = int(math.floor(sum(quals) / len(quals)))
            outf.write(">{}/{}/{}_{} RQ=0.{}\n".format("test", well_no, 1,
                                                       len(seq) + 1, qv))
            for x in range(0, len(seq), line_width):
                outf.write(seq[x:x + line_width] + "\n")

    E.stop()
Esempio n. 25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads fasta records from the file named by ``options.stdin.name``,
    discards every record whose title contains "pseudo" and writes the
    remaining records to ``options.stdout`` with "cca" appended to each
    sequence. Records sharing a title are collapsed: the last sequence
    seen wins, at the position of the first occurrence.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, _args) = E.start(parser, argv=argv)

    E.info(options.stdin)

    # title -> sequence, in order of first appearance
    sequences = collections.OrderedDict()

    # The context manager closes the input once all records are read.
    with IOTools.open_file(options.stdin.name) as infile:
        for cur_record in FastaIterator.FastaIterator(infile):
            title = cur_record.title
            # Remove any pseudo sequences
            if "pseudo" not in title:
                sequences[title] = cur_record.sequence

    for title, sequence in sequences.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (title, sequence))

    E.stop()
Esempio n. 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Copies the input to ``options.stdout`` line by line. Lines starting
    with "#" are always copied. The first other line is remembered as the
    header; every later line identical to it is skipped. Input comes from
    ``options.stdin`` when no positional argument (or only "-") is given,
    otherwise from the files named by the positional arguments.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    read_from_stdin = len(args) == 0 or (len(args) == 1 and args[0] == "-")
    infile = options.stdin if read_from_stdin else fileinput.FileInput(args)

    ninput = nskipped = noutput = 0

    # first non-comment line seen; None until one has been read
    header = None

    for line in infile:
        ninput += 1
        if not line.startswith("#"):
            if not header:
                header = line
            elif line == header:
                # repeated header: drop it
                nskipped += 1
                continue

        options.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
Esempio n. 27
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Copies the input to ``args.stdout`` line by line. Lines starting with
    "#" are always copied. The first other line is remembered as the
    header; every later line identical to it is skipped. Input comes from
    ``args.stdin`` when no positional argument (or only "-") is given,
    otherwise from the files named by the positional arguments.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) == 0 or (len(unknown) == 1 and unknown[0] == "-"):
        infile = args.stdin
    else:
        # The positional filenames are in `unknown`; `args` is the parsed
        # namespace and cannot be used as a list of files.
        infile = fileinput.FileInput(unknown)

    ninput, nskipped, noutput = 0, 0, 0

    # first non-comment line seen; False until one has been read
    header = False

    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            # repeated header: drop it
            nskipped += 1
            continue

        args.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
Esempio n. 28
0
def main(argv=None):
    """Split a fastq file into kept and dropped reads by identifier.

    Parses command line options in sys.argv, unless *argv* is given.

    The file given by ``--to-drop-single`` lists one read identifier per
    line. Every read of ``--fastq1`` whose identifier (its first
    whitespace-separated token) is in that list is written to
    ``--fastq-drop1``; all other reads are written to ``--fastq-out1``.
    A summary of the number of dropped reads is logged.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    # Pass argv through so that an explicitly supplied argument list is used.
    (options, args) = E.start(parser, argv=argv)

    with IOTools.open_file(options.to_remove_singletons) as inf:
        reads_to_remove = {line.strip() for line in inf}

    reads = 0
    dropped_reads = 0

    # Context managers make sure all three handles are closed even if
    # iteration or writing fails.
    with IOTools.open_file(options.fq_out1, 'w') as fastq_out, \
            IOTools.open_file(options.fq_dropped1, 'w') as fastq_host, \
            IOTools.open_file(options.fastq1) as fastq_in:
        # The input path lives on `options`; there is no bare `fastq1`
        # name in scope.
        for read in Fastq.iterate(fastq_in):
            reads += 1
            if read.identifier.split()[0] in reads_to_remove:
                target = fastq_host
                dropped_reads += 1
            else:
                target = fastq_out
            target.write("@%s\n%s\n+\n%s\n" %
                         (read.identifier, read.seq, read.quals))

    # Avoid dividing by zero when the input holds no reads.
    percent_dropped = dropped_reads / float(reads) * 100 if reads else 0.0

    E.info('Dropped %i of %i reads (%f percent)'
           % (dropped_reads, reads, percent_dropped))
Esempio n. 29
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads records from sys.stdin through one of the ``Metaphlan``
    iterators, selected by ``-t/--sequence-type``, and writes them as a
    tab-separated table (header line first) to ``options.stdout``:

    * ``read_map``: seq_id plus the taxonomy columns kingdom to species.
    * ``rel_ab``: taxon_level, taxon and rel_abundance.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-t", "--sequence-type",
        dest="type", type="choice",
        choices=("read_map", "rel_ab"),
        help="type of file to be parsed to a table")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    assert options.type, "must specify infile type"

    if options.type == "read_map":
        options.stdout.write(
            "seq_id\tkingdom\tphylum\tclass\torder\tfamily\tgenus\tspecies\n")
        for entry in Metaphlan.read_map_iterator(sys.stdin):
            columns = [
                entry.seq_id, entry.kingdom, entry.phylum, entry.c_lass,
                entry.order, entry.family, entry.genus, entry.species
            ]
            options.stdout.write("\t".join(columns) + "\n")

    elif options.type == "rel_ab":
        options.stdout.write("taxon_level\ttaxon\trel_abundance\n")
        for entry in Metaphlan.relative_abundance_iterator(sys.stdin):
            columns = [entry.taxon_level, entry.taxon, entry.abundance]
            options.stdout.write("\t".join(columns) + "\n")

    # write footer and output benchmark information.
    E.stop()
Esempio n. 30
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads records from sys.stdin through one of the ``Metaphlan``
    iterators, selected by ``-t/--sequence-type``, and writes them as a
    tab-separated table (header line first) to ``args.stdout``:

    * ``read_map``: seq_id plus the taxonomy columns kingdom to species.
    * ``rel_ab``: taxon_level, taxon and rel_abundance.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-t", "--sequence-type",
        dest="type", type=str,
        choices=("read_map", "rel_ab"),
        help="type of file to be parsed to a table")

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv)

    assert args.type, "must specify infile type"

    if args.type == "read_map":
        args.stdout.write(
            "seq_id\tkingdom\tphylum\tclass\torder\tfamily\tgenus\tspecies\n")
        for entry in Metaphlan.read_map_iterator(sys.stdin):
            columns = [
                entry.seq_id, entry.kingdom, entry.phylum, entry.c_lass,
                entry.order, entry.family, entry.genus, entry.species
            ]
            args.stdout.write("\t".join(columns) + "\n")

    elif args.type == "rel_ab":
        args.stdout.write("taxon_level\ttaxon\trel_abundance\n")
        for entry in Metaphlan.relative_abundance_iterator(sys.stdin):
            columns = [entry.taxon_level, entry.taxon, entry.abundance]
            args.stdout.write("\t".join(columns) + "\n")

    # write footer and output benchmark information.
    E.stop()