def main(argv=None):
    """Report summary statistics for sequences in a fasta file.

    Uses the faidx index (``.fai``) when one exists next to the input
    file, otherwise iterates over the records directly.  Writes one
    summary row to stdout and, optionally, per-sequence lengths to a
    separate file.
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-f", "--fasta", dest="input_filename_fasta",
                        type=str,
                        help="filename with fasta sequences. ")

    parser.add_argument("-o", "--output-filename-sequences",
                        dest="output_filename_sequences", type=str,
                        help="output per sequence information to filename")

    parser.set_defaults(
        input_filename_fasta=None,
    )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    # fix: the unknown-args list was misspelled ("unnowns") and the
    # positional filename was read from the Namespace instead of the
    # unknown-args list.
    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    sequence_pairs = []

    if args.input_filename_fasta != "-" and os.path.exists(
            args.input_filename_fasta + ".fai"):
        # fast path: read names/lengths straight from the faidx index
        has_index = 1
        fastafile = pysam.FastaFile(args.input_filename_fasta)
        sequence_pairs = list(zip(fastafile.references, fastafile.lengths))
    else:
        has_index = 0
        iterator = pysam.FastxFile(args.input_filename_fasta)
        for record in iterator:
            sequence_pairs.append((record.name, len(record.sequence)))

    lengths = numpy.array([x[1] for x in sequence_pairs])

    args.stdout.write("\t".join(("has_index", "nsequences", "total_length",
                                 "min_length", "max_length", "median_length",
                                 "mean_length")) + "\n")

    if len(lengths) > 0:
        args.stdout.write("\t".join(
            map(str, (has_index, len(sequence_pairs), lengths.sum(),
                      lengths.min(), lengths.max(), numpy.median(lengths),
                      lengths.mean()))) + "\n")
    else:
        # empty input: emit counts with blank statistics
        args.stdout.write("\t".join(
            map(str, (has_index, len(sequence_pairs), 0,
                      "", "", "", ""))) + "\n")

    if args.output_filename_sequences:
        with iotools.open_file(args.output_filename_sequences, "w") as outf:
            outf.write("name\tlength\n")
            outf.write("\n".join(
                ["\t".join(map(str, x)) for x in sequence_pairs]) + "\n")

    E.stop()
def main(argv=sys.argv):
    """Load a delimited table from stdin into a database.

    Parses command line options, optionally decompresses gzipped input
    and normalizes user-supplied header names before delegating the
    actual upload to :func:`run`.
    """
    parser = buildParser()

    (options, args) = E.start(parser,
                              argv=argv,
                              add_database_options=True)

    if options.from_zipped:
        import gzip
        infile = gzip.GzipFile(fileobj=options.stdin, mode='r')
    else:
        infile = options.stdin

    if options.header_names:
        if "," in options.header_names:
            # sqlalchemy.exc.ArgumentError:
            # Column must be constructed with a non-blank
            # name or assign a non-blank .name before adding to a Table.
            replace_empty_strings = (
                lambda arg: '-' if len(arg) == 0 else arg)
            options.header_names = \
                [x for x in map(replace_empty_strings,
                                options.header_names.split(','))]
        else:
            # fix: use a raw string so "\s" is interpreted as the regex
            # whitespace class rather than an invalid string escape
            options.header_names = re.split(r"\s+",
                                            options.header_names.strip())

    run(infile, options)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    argv = sys.argv if argv is None else argv

    # build the command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--Infile", dest="Infile", type=str,
        help="Supply file containing filtered 16S fasta file")

    parser.add_argument(
        "--Outfile", dest="Outfile", type=str,
        help="Supply desired outfile name")

    # add common options (-h/--help, ...) and parse command line
    args = E.start(parser, argv=argv)

    # run the reformatting step
    specformatter(args.Infile, args.Outfile)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Globs gene summary files below --directory and reformats each of
    them onto stdout.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d", "--directory", dest="directory", type="string",
        # fix: corrected "aer" -> "are" in the user-facing help text
        help="supply directory where the input summaries are located")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    infiles = glob.glob(
        os.path.join(options.directory, "*/*genes*summary*"))

    sys.stdout.write("category\tnreads\tpreads\tsample\n")
    for infile in infiles:
        reformat(infile)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """Intersect positions of several VCF files and count, for each
    membership pattern, how many positions fall into it.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--regex-filename", dest="regex_filename", type="string",
        help="extract column name from filename via regular expression "
        "[%default]")

    parser.add_option(
        "--filter", dest="filters", type="choice", action="append",
        choices=("PASS", "SNP"),
        help="apply filters to VCFs when reading "
        "[%default]")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dataframes = []
    for fn in args:
        # derive the indicator column name either from a user supplied
        # regular expression or from the file's basename
        if options.regex_filename:
            try:
                column_name = re.search(
                    options.regex_filename, fn).groups()[0]
            except AttributeError:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        options.regex_filename, fn))
        else:
            column_name = iotools.snip(os.path.basename(fn), ".vcf.gz")

        E.debug("reading data from {}".format(fn))
        positions = read_vcf_positions_into_dataframe(
            fn, filters=options.filters)
        positions[column_name] = 1
        dataframes.append(positions)

    ndata = len(dataframes)

    # successively outer-join all position tables; absent positions
    # get a 0 in the corresponding indicator column
    merged_df = dataframes[0]
    for other in dataframes[1:]:
        merged_df = merged_df.merge(other, how="outer")
    merged_df = merged_df.fillna(0)

    # count the number of positions per membership pattern
    indicator_df = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = indicator_df.groupby(
        by=list(indicator_df.columns)).size().reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]

    set_counts.to_csv(options.stdout, sep="\t", index=False)

    E.stop()
def main(argv):
    """Compare per-character frequencies between an input file and a
    reference file, writing a three-column table to stdout.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--option", dest="option", type="string")

    (options, args) = E.start(parser, argv)

    with IOTools.open_file(args[0]) as inf:
        data = "".join(inf.readlines()).strip()

    with IOTools.open_file(args[1]) as inf:
        reference = "".join(inf.readlines()).strip()

    counts_data = Counter(data)
    counts_ref = Counter(reference)

    # union of all characters seen in either file
    all_keys = set(list(counts_data.keys()) + list(counts_ref.keys()))

    options.stdout.write("key\tinput\treference\n")
    for k in sorted(all_keys):
        options.stdout.write(
            "\t".join((k, str(counts_data[k]), str(counts_ref[k]))) + "\n")

    E.stop()
def main(argv=None):
    """Remove all records belonging to one run id from a database,
    optionally as a dry run that only prints the statements.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--run-id", dest="run_id", type="int",
        help="numerical identifier of a run [%default]")

    parser.add_option(
        "-d", "--database-url", dest="database_url", type="string",
        help="database url [%default]")

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="only show statements to be executed [%default]")

    parser.set_defaults(
        run_id=None,
        database_url="sqlite:///./csvdb",
        dry_run=False,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # delegate the actual purge to the library function
    purge_run_id(options.run_id,
                 options.database_url,
                 dry_run=options.dry_run)

    E.stop()
def main(argv=None):
    """Sample positions from a fasta file and write them as homozygous
    reference (0/0) VCF records.

    The sample size is interpreted as a proportion of the contig length
    when < 1, otherwise as an absolute count per contig.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-s", "--sample-size", dest="sample_size", type="float",
        help="sample size. If less than 0, take a proportion of the chromosome size. "
        "If greater than 0, take a fixed number of variants [%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        sample_size=0.001,
        sample_name="NA12878"
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    if options.input_filename_fasta == "-":
        options.input_filename_fasta = options.stdin

    outf = options.stdout
    outf.write("##fileformat=VCFv4.1\n")
    outf.write("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")
    outf.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n".format(options.sample_name))

    with pysam.FastxFile(options.input_filename_fasta) as inf:
        for record in inf:
            contig = record.name
            sequence = record.sequence

            # proportion (< 1) versus absolute count (>= 1)
            if options.sample_size < 1.0:
                nsamples = int(float(len(sequence)) * options.sample_size)
            else:
                nsamples = int(options.sample_size)

            E.info("generating {} sampled variants for contig {}".format(
                nsamples, contig))

            # re-sample until enough non-N positions have been collected.
            # fix: removed the unused local "missing_nsamples".
            # NOTE(review): this loops forever if a contig has fewer
            # non-N positions than nsamples — confirm inputs guarantee
            # this cannot happen.
            sampled_positions = set()
            while len(sampled_positions) < nsamples:
                raw_positions = random.sample(
                    list(range(len(sequence))),
                    nsamples - len(sampled_positions))
                filtered_positions = [
                    x for x in raw_positions if sequence[x] != "N"]
                sampled_positions.update(filtered_positions)
                E.debug("sample update: total={}, raw={}, filtered={}".format(
                    len(sampled_positions), len(raw_positions),
                    len(filtered_positions)))

            sampled_positions = sorted(sampled_positions)
            for position in sampled_positions:
                base = sequence[position]
                outf.write("{}\t{}\t.\t{}\t{}\t.\t.\t.\tGT\t0/0\n".format(
                    contig, position + 1, base, base))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    argv = argv or sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-o", "--output-section", dest="output", type=str,
        choices=("full", "name"),
        help="output either ``full`` overlapping entries, only the ``name``s.")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two arguments required")

    infile1 = args.stdin if unknown[0] == "-" \
        else iotools.open_file(unknown[0], "r")
    infile2 = iotools.open_file(unknown[1], "r")

    # build an interval index over the second bed file
    idx = Bed.readAndIndex(infile2, with_values=True)

    outfile = args.stdout

    if args.output == "name":
        outfile.write("name1\tname2\n")

        def render(bed):
            # only the name column
            return bed.fields[0]
    else:
        render = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue
        for o in overlaps:
            outfile.write("\t".join((render(bed), render(o[2]))) + "\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    argv = argv or sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=("%prog version: $Id: cgat_script_template.py 2871 "
                 "2010-03-03 10:20:44Z andreas $"),
        usage=globals()["__doc__"])

    parser.add_option("-i", "--test-option", dest="test_option",
                      type="string",
                      help="test option [default=%default].")

    parser.set_defaults(test_option="test")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    pyx_files = glob.glob(os.path.join(os.path.dirname(__file__), "*.pyx"))

    ninput, nskipped, noutput = 0, 0, 0

    for pyx_file in pyx_files:
        E.info("rebuilding %s" % pyx_file)
        ninput += 1

        prefix, suffix = os.path.splitext(pyx_file)

        # remove stale compilation products
        for ext in (".c", ".pyxbldc"):
            try:
                os.remove(prefix + ext)
            except OSError:
                pass

        dirname, basename = os.path.split(prefix)
        assert basename.startswith("_")
        scriptname = os.path.join(dirname, basename[1:]) + ".py"

        if not os.path.exists(scriptname):
            E.warn("script %s does not exist - skipped" % scriptname)
            nskipped += 1
            continue

        # invoking the script with --help triggers recompilation
        E.info("compiling %s" % scriptname)
        os.system("%s %s --help > /dev/null" % (sys.executable, scriptname))
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
def main(argv=sys.argv):
    """Count features in gff/gtf formatted input and write one summary
    row per input file (or stdin).
    """
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    (args, unknown) = E.start(parser, add_output_options=True,
                              unknowns=True)

    # fix: the remaining positional arguments are the input files; the
    # previous code assigned the argparse Namespace ("args") here, which
    # is not iterable.
    if len(unknown) == 0:
        files = [args.stdin]
    else:
        files = unknown

    args.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if args.is_gtf:
        args.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    args.stdout.write("\n")

    for f in files:
        if f == args.stdin:
            infile = f
            args.stdout.write("stdin")
        else:
            infile = iotools.open_file(f)
            args.stdout.write(f)

        counters = []
        if args.is_gtf:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))
            counters.append(counter_exons(counters[0]))
        else:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))

        # exhaust the counter chain so all counts are accumulated
        c = counters[-1]
        for x in c:
            pass

        for c in counters:
            args.stdout.write("\t%s" % str(c))
        args.stdout.write("\n")

        # fix: do not close the stream when reading from stdin; the old
        # guard compared against sys.stdin instead of args.stdin
        if infile is not args.stdin:
            infile.close()

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Interleaves two fastq files into a single fasta stream on stdout,
    raising PairedReadError when the files are not properly paired.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-a", "--first-fastq-file", dest="fastq1", type=str,
        help="supply read1 fastq file")

    parser.add_argument(
        "-b", "--second-fastq-file", dest="fastq2", type=str,
        help="supply read2 fastq file")

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if unknown and len(unknown) == 2:
        args.fastq1, args.fastq2 = unknown

    fastq1 = iotools.open_file(args.fastq1)
    fastq2 = iotools.open_file(args.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in zip_longest(Fastq.iterate(fastq1),
                              Fastq.iterate(fastq2)):
        # zip_longest pads the shorter file with None; fix: the old
        # condition was redundant ("not (f1 and f2) or (not f2 and f1)")
        # and the error was pointlessly caught and re-wrapped in itself.
        if f1 is None or f2 is None:
            raise PairedReadError(
                "unpaired reads detected. Are files sorted? are "
                "files of equal length?")
        assert f1.identifier.endswith("/1") and \
            f2.identifier.endswith("/2"), \
            "Reads in file 1 must end with /1 and reads in file 2 with /2"
        args.stdout.write(
            ">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq,
                                    f2.identifier, f2.seq))
        f1_count += 1

    E.info("output: %i pairs" % f1_count)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tRNA fasta records from stdin, drops pseudogene entries and
    writes each remaining sequence back out with a CCA tail appended.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)
    infile = IOTools.open_file(options.stdin.name)
    iterator = FastaIterator.FastaIterator(infile)

    # outfile_info = IOTools.open_file(options.info_file, "w")

    d = collections.OrderedDict()
    cluster_dict = dict()

    # first iterate over the fasta file and generate a dict
    # with the name (title) as the key and the sequence as the value
    # Remove any pseudo sequences
    for cur_record in iterator:
        # This is a temp fix because bedtools getfasta --name seems to have
        # changed the way it names the fasta titles. This may be temp but
        # this will fix this issue for the time being.
        # fix: raw string for the regex, identity comparison with None
        m = re.match(r"(chr\d+.tRNA\d+-\S+-(pseudo)?)::\S+([+|-])",
                     cur_record.title.replace("(", "").replace(")", ""))
        if m is None:
            continue
        if m.group(2) == "pseudo":
            pass
        else:
            key = str(m.group(1) + m.group(3))
            d[key] = cur_record.sequence

    # next iterate of over the dict give the cluster a number
    # this will be used to then map back for the info name
    for key, value in d.items():
        # Add CCA tail
        options.stdout.write((">%s\n%scca\n") % (key, value))
    E.stop()
def main(argv):
    """Read the file given as the first positional argument and print
    its contents reversed character by character.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv)

    # fix: close the input file instead of leaking the handle
    with open(args[0]) as inf:
        data = inf.read()

    print(data[::-1])

    # fix: write the footer/benchmark info like the sibling scripts do
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=("%prog version: $Id: fastas2fasta.py 2782 "
                 "2009-09-10 11:40:29Z andreas $"),
        usage=globals()["__doc__"])

    (options, args) = E.start(parser)

    if len(args) < 2:
        raise ValueError(
            "please supply at least two filenames to concatenate.")

    # one fasta iterator per input file
    iterators = [FastaIterator.FastaIterator(iotools.open_file(fn, "r"))
                 for fn in args]

    ninput, noutput, nerrors = 0, 0, 0

    while True:
        sequences, titles = [], []

        # read one record from every file
        for it in iterators:
            try:
                record = next(it)
            except StopIteration:
                break
            sequences.append(re.sub(" ", "", record.sequence))
            titles.append(record.title)

        if not sequences:
            break

        ninput += 1

        if len(sequences) != len(iterators):
            raise ValueError("unequal number of sequences in files")

        noutput += 1
        # concatenate the records, keeping the title of the first file
        options.stdout.write(">%s\n%s\n" % (titles[0], "".join(sequences)))

    E.info("ninput=%i, noutput=%i, nerrors=%i" %
           (ninput, noutput, nerrors))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Splits a taxonomic abundance table read from stdin into one output
    file per taxonomic level, based on the prefix (d__, k__, ...) of
    the most specific taxon name.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-o", "--outdir", dest="outdir", type=str,
                        help="supply output directory")

    parser.add_argument("-p", "--prefix", dest="prefix", type=str,
                        help="supply output file prefix")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    prefix = os.path.join(args.outdir, args.prefix)

    # one output file per taxonomic level, checked in order of
    # decreasing rank (mirrors the original if/elif cascade).
    # fix: replaced eight near-identical branches with a dispatch
    # table, removed a leftover debug print on the d__ branch, and
    # made sure all output files are closed.
    levels = (("d__", "_domain.tsv"),
              ("k__", "_kingdom.tsv"),
              ("p__", "_phylum.tsv"),
              ("c__", "_class.tsv"),
              ("o__", "_order.tsv"),
              ("f__", "_family.tsv"),
              ("g__", "_genus.tsv"),
              ("s__", "_species.tsv"))
    outfiles = [(tag, open(prefix + suffix, "w"))
                for tag, suffix in levels]

    try:
        for line in args.stdin.readlines():
            data = line[:-1].split("\t")
            taxon, counts = data[0], data[1:]
            taxonomy = taxon.split("|")
            # route the row to the file of its most specific level
            for tag, outf in outfiles:
                if tag in taxonomy[-1]:
                    outf.write("\t".join([taxon] + counts) + "\n")
                    break
    finally:
        for _, outf in outfiles:
            outf.close()

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Converts a gene set file read from stdin into a flat
    ontology/gene/pathway table, optionally keeping only the pathways
    listed in --filter-list.
    """
    # fix: identity comparison with None instead of "=="
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--ontology", dest="ontology", type="string",
                      help="ontology label")

    parser.add_option("-f", "--filter", dest="filter", action="store_true",
                      help="filter out genesets")

    parser.add_option("-l", "--filter-list", dest="filter_list",
                      type="string",
                      help="list of pathways to keep")

    parser.set_defaults(ontology=None,
                        filter=False,
                        filter_list=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.filter:
        assert options.filter_list, "must specify a list of pathways to keep"
        filter_set = set()
        # fix: close the filter list file after reading
        with open(options.filter_list) as filter_file:
            for line in filter_file.readlines():
                filter_set.add(line[:-1])

    inf = options.stdin
    for line in inf.readlines():
        data = line[:-1].split("\t")
        # NOTE(review): name and description are both taken from
        # data[0] — confirm this is intended rather than a typo for
        # data[1].
        name, description, evidence = data[0], data[0], data[1]
        if options.filter:
            if name not in filter_set:
                continue
        genes = data[2:]
        for gene in genes:
            options.stdout.write("\t".join(
                [options.ontology, gene, name, description,
                 evidence]) + "\n")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """Compute summary statistics over a delimited table read from
    stdin and write them to stdout / method-specific output files.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d", "--delimiter", dest="delimiter", type="string",
        help="delimiter to separate columns [%default]")

    parser.add_option(
        "-m", "--method", dest="methods", type="choice", action="append",
        choices=["row-describe", "column-describe"],
        help="additional methods to apply [%default]")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.methods:
        options.methods = ["summary"]

    # fix: pass the delimiter via the "sep" keyword; passing it as the
    # second positional argument of read_csv is deprecated and removed
    # in recent pandas versions.
    table = pandas.read_csv(options.stdin, sep=options.delimiter)

    options.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in options.methods:
        label = re.sub("-", "_", method)
        if method == "summary":
            for category, count, denominator, info in \
                    compute_table_summary(table):
                options.stdout.write("\t".join(
                    map(str, (category, count,
                              iotools.pretty_percent(count, denominator,
                                                     na=""),
                              info))) + "\n")
        elif method == "column-describe":
            df = table.describe().T.stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")
        elif method == "row-describe":
            df = table.T.describe().stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)
    # re-open the file underlying stdin by name
    infile = IOTools.open_file(options.stdin.name)
    lines = infile.readlines()

    for line in lines:
        column = line.split()
        # widen the feature by 50 bases on both sides; columns 7/8
        # (thickStart/thickEnd in bed12) get the same widened interval
        new_columns = [
            column[0],
            str(int(column[1]) - 50),
            str(int(column[2]) + 50), column[3], column[4], column[5],
            str(int(column[1]) - 50),
            str(int(column[2]) + 50), column[8], column[9]
        ]
        # NOTE(review): pseudo features are skipped entirely here —
        # confirm the final write belongs inside this branch.
        if "pseudo" not in column[3]:
            if int(column[9]) == 2:
                # two blocks: shift block starts/sizes by the padding
                [c, d] = column[10].split(",")
                block = int(column[2]) - int(column[1]) - int(d) + 50
                new_10 = ''.join(str(int(c) + 50) + ',' + str(int(d) + 50))
                new_11 = ''.join('0' + ',' + str(block))
                new_columns = new_columns + [new_10, new_11]
            else:
                # single block: grow the block size by the total padding
                new_columns = new_columns + [
                    str(int(column[10]) + 100), column[11]
                ]
            options.stdout.write('\t'.join(new_columns[0:]) + '\n')

    E.stop()
def main(argv):
    """Replace every "o" with "a" in the input file and print the
    result to stdout.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o", "--option", dest="option", type="string")

    (options, args) = E.start(parser, argv)

    content = open(args[0]).read()
    print(re.sub("o", "a", content))

    E.stop()
def main(argv=sys.argv):
    """Write the name and length of every read in a fastq file."""
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-fastq-file", dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m", "--method", dest="methods", action="append",
                      type="choice",
                      choices=("length", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        input_fastq_file=None,
        methods=[],
    )

    (options, args) = E.start(parser, argv)

    # a single positional argument overrides -i
    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(options.input_fastq_file) as fastq_in:
        for record in fastq_in:
            counter.input += 1
            options.stdout.write(
                "\t".join(map(str, (record.name,
                                    len(record.sequence)))) + "\n")
            counter.output += 1

    E.info(counter)
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Wraps Illumina's bcl2fastq converter, sanity-checks the resulting
    fastq files and optionally runs FastQC on them.
    """
    if not argv:
        argv = sys.argv

    parser = E.ArgumentParser()
    parser.add_argument("-p", "--arguments", type=str, dest="arguments",
                        default="",
                        help="Pass options and arguments to the "
                        "executable. Please surround options in \"\"")
    parser.add_argument("-o", "--output-dir", type=str, dest="output",
                        default=".",
                        help="Output for the fastq files.")
    parser.add_argument("-f", "--fastqc", dest="fastqc",
                        action="store_true",
                        help="After demultiplexing open the fastq files "
                        "in FastQC.")
    parser.add_argument("-F", "--fastqc-options", type=str,
                        dest="fastqc_options", default="",
                        help="Options for FastQC. Please surround options "
                        "in \"\"")
    parser.add_argument("-H", "--bcl2fastq-help", dest="bcl2fastq_help",
                        action="store_true",
                        help="Print help for Illumina's bcl2fastq "
                        "conversion software")

    (args) = E.start(parser)

    if subprocess.run("which bcl2fastq", shell=True).returncode:
        raise ValueError("bcl2fastq cannot be found")

    if args.bcl2fastq_help:
        subprocess.run("bcl2fastq --help", shell=True)
        return
    else:
        # NOTE(review): args.arguments is interpolated into a shell
        # command line — callers must ensure it contains no untrusted
        # input (shell injection risk).
        subprocess.run(f"bcl2fastq {args.arguments} -o {args.output}",
                       shell=True)

    # a well-formed fastq file has a line count divisible by four
    for infile in glob.glob(f"{args.output}/**/*.fastq.gz", recursive=True):
        with gzip.GzipFile(f"{infile}", "r") as f:
            if sum(1 for char in f.read().decode('utf-8')
                   if char == "\n") % 4 != 0:
                raise ValueError(f"{infile} is either corrupt or incomplete.")

    if args.fastqc:
        for infile in glob.glob(f"{args.output}/**/*.fastq.gz",
                                recursive=True):
            subprocess.run(f"fastqc {infile} {args.fastqc_options}",
                           shell=True)

    # fix: write the footer/benchmark info like the sibling scripts do
    E.stop()
def main(argv=sys.argv):
    """Write the name and length of every read in a fastq file."""
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i", "--input-fastq-file", dest="input_fastq_file",
                        type=str,
                        help="input fastq file. ")

    parser.add_argument("-m", "--method", dest="methods", action="append",
                        type=str,
                        choices=("length", ),
                        help="methods to apply ")

    parser.set_defaults(
        input_fastq_file=None,
        methods=[],
    )

    (args, unknown) = E.start(parser, argv, unknowns=True)

    # a single positional argument overrides -i
    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as fastq_in:
        for record in fastq_in:
            counter.input += 1
            args.stdout.write(
                "\t".join(map(str, (record.name,
                                    len(record.sequence)))) + "\n")
            counter.output += 1

    E.info(counter)
    E.stop()
def main(argv=sys.argv):
    """Convert ONT fastq records into PacBio-style fasta output."""
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i", "--input-fastq", dest="input_fastq_file",
                        type=str,
                        help="input fastq file")

    parser.add_argument("-m", "--method", dest="method", type=str,
                        choices=["ont2pacbio"],
                        help="methods to apply ")

    parser.set_defaults(
        input_fastq_file=None,
        line_width=80,
        method=None,
    )

    (args, unknown) = E.start(parser, argv, add_output_options=True,
                              unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file == "-":
        args.input_fastq_file = args.stdin

    out = args.stdout
    width = args.line_width
    well_no = 0
    for rec in pysam.FastqFile(args.input_fastq_file):
        well_no += 1
        qualities = rec.get_quality_array()
        seq = rec.sequence
        # integer mean base quality of the read
        mean_q = int(math.floor(sum(qualities) / len(qualities)))
        out.write(">{}/{}/{}_{} RQ=0.{}\n".format(
            "test", well_no, 1, len(seq) + 1, mean_q))
        # wrap the sequence at the configured line width
        for offset in range(0, len(seq), width):
            out.write(seq[offset:offset + width] + "\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv)

    if len(args) == 0:
        args.append("-")

    E.info(options.stdin)
    infile = IOTools.open_file(options.stdin.name)
    fasta_records = FastaIterator.FastaIterator(infile)

    # map title -> sequence, preserving input order and skipping any
    # record whose title marks it as a pseudogene
    sequences = collections.OrderedDict()
    cluster_dict = dict()

    for record in fasta_records:
        title = record.title
        if "pseudo" not in title:
            sequences[title] = record.sequence

    # write each retained sequence with a CCA tail appended
    for title, sequence in sequences.items():
        options.stdout.write((">%s\n%scca\n") % (title, sequence))

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=("%prog version: $Id: cgat_script_template.py 2781 "
                 "2009-09-10 11:33:14Z andreas $"),
        usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # read from stdin unless real filenames were given
    if len(args) == 0 or (len(args) == 1 and args[0] == "-"):
        infile = options.stdin
    else:
        infile = fileinput.FileInput(args)

    ninput, nskipped, noutput = 0, 0, 0
    header = False

    for line in infile:
        ninput += 1
        if not line.startswith("#"):
            if not header:
                # remember the first non-comment line as the header
                header = line
            elif line == header:
                # drop repeated headers from concatenated files
                nskipped += 1
                continue
        options.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Concatenates the input files, dropping repeated header lines.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) == 0 or (len(unknown) == 1 and unknown[0] == "-"):
        infile = args.stdin
    else:
        # fix: iterate over the positional file names ("unknown"); the
        # previous code passed the argparse Namespace to FileInput.
        infile = fileinput.FileInput(unknown)

    # do sth
    ninput, nskipped, noutput = 0, 0, 0
    header = False

    for line in infile:
        ninput += 1
        if line.startswith("#"):
            pass
        elif not header:
            header = line
        elif line == header:
            nskipped += 1
            continue
        args.stdout.write(line)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """Split a fastq file into kept and dropped reads, based on a list
    of read identifiers to remove.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])
    parser.add_option("--fastq1", dest="fastq1")
    parser.add_option("--to-drop-single", dest='to_remove_singletons')
    parser.add_option("--fastq-out1", dest="fq_out1")
    parser.add_option("--fastq-drop1", dest="fq_dropped1")

    (options, args) = E.start(parser)

    reads_to_remove = IOTools.open_file(
        options.to_remove_singletons).readlines()
    reads_to_remove = set([x.strip() for x in reads_to_remove])

    fastq_out = IOTools.open_file(options.fq_out1, 'w')
    fastq_host = IOTools.open_file(options.fq_dropped1, 'w')

    reads = 0
    dropped_reads = 0
    # fix: the input fastq was referenced via the undefined local name
    # "fastq1" (NameError); use the command line option instead.
    for read in Fastq.iterate(IOTools.open_file(options.fastq1)):
        reads += 1
        if read.identifier.split()[0] in reads_to_remove:
            fastq_host.write("@%s\n%s\n+\n%s\n" %
                             (read.identifier, read.seq, read.quals))
            dropped_reads += 1
        else:
            fastq_out.write("@%s\n%s\n+\n%s\n" %
                            (read.identifier, read.seq, read.quals))

    fastq_out.close()
    fastq_host.close()

    try:
        percent_dropped = dropped_reads / float(reads) * 100
    except ZeroDivisionError:
        percent_dropped = 0.0

    E.info('Dropped %i of %i reads (%f percent)'
           % (dropped_reads, reads, percent_dropped))

    # fix: write the footer/benchmark info like the sibling scripts do
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    argv = argv or sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=("%prog version: $Id: script_template.py 2871 "
                 "2010-03-03 10:20:44Z andreas $"),
        usage=globals()["__doc__"])

    parser.add_option("-t", "--sequence-type", dest="type", type="choice",
                      choices=("read_map", "rel_ab"),
                      help="type of file to be parsed to a table")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    assert options.type, "must specify infile type"

    if options.type == "read_map":
        # per-read taxonomic assignments
        options.stdout.write(
            "seq_id\tkingdom\tphylum\tclass\torder\tfamily\t"
            "genus\tspecies\n")
        for entry in Metaphlan.read_map_iterator(sys.stdin):
            fields = [entry.seq_id, entry.kingdom, entry.phylum,
                      entry.c_lass, entry.order, entry.family,
                      entry.genus, entry.species]
            options.stdout.write("\t".join(fields) + "\n")
    elif options.type == "rel_ab":
        # relative abundance per taxonomic level
        options.stdout.write("taxon_level\ttaxon\trel_abundance\n")
        for entry in Metaphlan.relative_abundance_iterator(sys.stdin):
            fields = [entry.taxon_level, entry.taxon, entry.abundance]
            options.stdout.write("\t".join(fields) + "\n")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    argv = argv or sys.argv

    # setup command line parser
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-t", "--sequence-type", dest="type", type=str,
                        choices=("read_map", "rel_ab"),
                        help="type of file to be parsed to a table")

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    assert args.type, "must specify infile type"

    if args.type == "read_map":
        # per-read taxonomic assignments
        args.stdout.write(
            "seq_id\tkingdom\tphylum\tclass\torder\tfamily\t"
            "genus\tspecies\n")
        for entry in Metaphlan.read_map_iterator(sys.stdin):
            fields = [entry.seq_id, entry.kingdom, entry.phylum,
                      entry.c_lass, entry.order, entry.family,
                      entry.genus, entry.species]
            args.stdout.write("\t".join(fields) + "\n")
    elif args.type == "rel_ab":
        # relative abundance per taxonomic level
        args.stdout.write("taxon_level\ttaxon\trel_abundance\n")
        for entry in Metaphlan.relative_abundance_iterator(sys.stdin):
            fields = [entry.taxon_level, entry.taxon, entry.abundance]
            args.stdout.write("\t".join(fields) + "\n")

    # write footer and output benchmark information.
    E.stop()