def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-i", "--input-fastq-file", dest="input_fastq_file", type="string", help="input fastq file. " "[%default]") parser.add_option( "--output-removed-tsv", dest="output_removed_tsv", type="string", help="if given, sequence identifiers of removed sequences will " "be stored in this file [%default]") parser.add_option( "--output-stats-tsv", dest="output_stats_tsv", type="string", help="if given, output statistics will be written to this file. " "[%default]") parser.add_option("--output-removed-fastq", dest="output_removed_fastq", type="string", help="if given, removed fastq records will " "be stored in this file [%default]") parser.add_option("-m", "--method", dest="methods", action="append", type="choice", choices=("filter-N", "filter-identifier", "filter-ONT", "offset-quality", "apply", "change-format", "renumber-reads", "sample", "sort", "trim3", "trim5", "unique", "reverse-complement", "grep"), help="methods to apply [%default]") parser.add_option("--set-prefix", dest="set_prefix", type="string", help="set sequence prefix [%default]") parser.add_option("--input-filter-tsv", dest="input_filter_tsv", type="string", help="list of sequence ides to filter [%default]") parser.add_option("--min-average-quality", dest="min_average_quality", type="float", help="minimum average quality [%default]") parser.add_option("--min-sequence-length", dest="min_sequence_length", type="int", help="minimum sequence length [%default]") parser.add_option("--quality-offset", dest="quality_offset", type="int", help="offset to modify quality values with [%default]") parser.add_option("--target-format", dest="target_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'), help="guess quality score format and set quality scores " "to format [default=%default].") parser.add_option( "--guess-format", dest="guess_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'), help="quality score format to assume if ambiguous [default=%default].") parser.add_option( "--sample-size", dest="sample_size", type="float", help="proportion of reads to sample. " "Provide a proportion of reads to sample, e.g. 0.1 for 10%, " "0.5 for 50%, etc [default=%default].") parser.add_option("--pair-fastq-file", dest="pair", type="string", help="if data is paired, filename with second pair. 
" "Implemented for sampling [default=%default].") parser.add_option( "--map-tsv-file", dest="map_tsv_file", type="string", help="filename with tab-separated identifiers mapping for " "method apply [default=%default].") parser.add_option("--num-bases", dest="nbases", type="int", help="number of bases to trim [default=%default].") parser.add_option( "--seed", dest="seed", type="int", help="seed for random number generator [default=%default].") parser.add_option( "--pattern-identifier", dest="renumber_pattern", type="string", help="rename reads in file by pattern [default=%default]") parser.add_option( "--grep-pattern", dest="grep_pattern", type="string", help="subset to reads matching pattern [default=%default]") parser.set_defaults( input_fastq_file="-", methods=[], change_format=None, guess_format=None, sample_size=0.1, nbases=0, pair=None, apply=None, seed=None, renumber_pattern="read_%010i", grep_pattern=".*", max_percent_N=10.0, set_prefix=None, output_removed_tsv=None, output_removed_fastq=None, output_stats_tsv=None, input_filter_tsv=None, min_average_quality=0, min_sequence_length=0, quality_offset=0, ) (options, args) = E.start(parser, argv, add_output_options=True) if len(args) == 1: options.input_fastq_file = args[0] if len(options.methods) == 0: raise ValueError("no method specified, please use --method") # this script combines two scripts with different functionalities # TODO: to be sanitized if options.methods[0] in [ "apply", "change-format", "renumber-reads", "sample", "sort", "trim3", "trim5", "unique", "reverse-complement", "grep" ]: options.method = options.methods[0] counter = process_cgat(options) else: counter = process_daisy(options) E.info(counter) E.stop()
def main(argv=None): """script main. parses command line args in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.ArgumentParser() parser.add_argument("--version", action='version', version='%(prog)s {version}'.format(version="1.0")) parser.add_argument("-m", "--method", dest="methods", type=str, action="append", choices=("transpose", "normalize-by-max", "normalize-by-value", "multiply-by-value", "percentile", "remove-header", "normalize-by-table", "upper-bound", "lower-bound", "kullback-leibler", "expand", "compress", "fdr", "grep", "randomize-rows"), help="""actions to perform on table.""") parser.add_argument("-s", "--scale", dest="scale", type=float, help="factor to scale matrix by.") parser.add_argument("-f", "--format", dest="format", type=str, help="output number format") parser.add_argument("-p", "--parameters", dest="parameters", type=str, help="Parameters for various functions.") parser.add_argument("-t", "--header-names", dest="has_headers", action="store_true", help="matrix has row/column headers.") parser.add_argument("--transpose", dest="transpose", action="store_true", help="transpose table.") parser.add_argument( "--set-transpose-field", dest="set_transpose_field", type=str, help="set first field (row 1 and col 1) to this value [%default].") parser.add_argument("--transpose-format", dest="transpose_format", type=str, choices=( "default", "separated", ), help="input format of un-transposed table") parser.add_argument( "--expand", dest="expand_table", action="store_true", help="expand table - multi-value cells with be expanded over " "several rows.") parser.add_argument("--no-headers", dest="has_headers", action="store_false", help="matrix has no row/column headers.") parser.add_argument("--columns", dest="columns", type=str, help="columns to use.") parser.add_argument("--file", dest="file", type=str, help="columns to test from table.", metavar="FILE") parser.add_argument("-d", "--delimiter", dest="delimiter", type=str, help="delimiter of columns.", metavar="DELIM") parser.add_argument("-V", "--invert-match", dest="invert_match", action="store_true", help="invert match.") parser.add_argument("--sort-by-rows", dest="sort_rows", type=str, help="output order for rows.") parser.add_argument("-a", "--value", dest="value", type=float, help="value to use for various algorithms.") parser.add_argument( "--group", dest="group_column", type=int, help="group values by column. Supply an integer column ") parser.add_argument("--group-function", dest="group_function", type=str, choices=("min", "max", "sum", "mean", "stats", "cat", "uniq"), help="function to group values by.") parser.add_argument("--join-table", dest="join_column", type=int, help="join rows in a table by columns.") parser.add_argument( "--collapse-table", dest="collapse_table", type=str, help="collapse a table. 
Value determines the missing variable ") parser.add_argument("--join-column-name", dest="join_column_name", type=int, help="use this column as a prefix.") parser.add_argument("--flatten-table", dest="flatten_table", action="store_true", help="flatten a table.") parser.add_argument("--as-column", dest="as_column", action="store_true", help="output table as a single column.") parser.add_argument("--split-fields", dest="split_fields", action="store_true", help="split fields.") parser.add_argument("--separator", dest="separator", type=str, help="separator for multi-valued fields.") parser.add_argument( "--fdr-method", dest="fdr_method", type=str, choices=("BH", "bonferroni", "holm", "hommel", "hochberg", "BY"), help="method to perform multiple testing correction by controlling " "the fdr.") parser.add_argument( "--fdr-add-column", dest="fdr_add_column", type=str, help="add new column instead of replacing existing columns. " "The value of the option will be used as prefix if there are " "multiple columns") # IMS: add option to use a column as the row id in flatten parser.add_argument( "--id-column", dest="id_column", type=str, help="list of column(s) to use as the row id when flattening " "the table. If None, then row number is used.") parser.add_argument( "--variable-name", dest="variable_name", type=str, help="the column header for the 'variable' column when flattening ") parser.add_argument( "--value-name", dest="value_name", type=str, help="the column header for the 'value' column when flattening ") parser.set_defaults( methods=[], scale=1.0, has_headers=True, format=None, value=0.0, parameters="", columns="all", transpose=False, set_transpose_field=None, transpose_format="default", group=False, group_column=0, group_function="mean", missing_value="na", sort_rows=None, flatten_table=False, collapse_table=None, separator=";", expand=False, join_column=None, join_column_name=None, compute_fdr=None, as_column=False, fdr_method="BH", fdr_add_column=None, id_column=None, variable_name="column", value_name="value", file=None, delimiter="\t", invert_match=False, ) (args, unknown) = E.start(parser, unknowns=True) args.parameters = args.parameters.split(",") if args.group_column: args.group = True args.group_column -= 1 ###################################################################### ###################################################################### ###################################################################### # if only to remove header, do this quickly if args.methods == ["remove-header"]: first = True for line in args.stdin: if line[0] == "#": continue if first: first = False continue args.stdout.write(line) elif args.transpose or "transpose" in args.methods: read_and_transpose_table(args.stdin, args) elif args.flatten_table: # IMS: bug fixed to make work. 
Also added options for keying # on a particular and adding custom column headings fields, table = CSV.readTable(args.stdin, with_header=args.has_headers, as_rows=True) args.columns = get_columns(fields, args.columns) if args.id_column: id_columns = [int(x) - 1 for x in args.id_column.split(",")] id_header = "\t".join( [fields[id_column] for id_column in id_columns]) args.columns = [x for x in args.columns if x not in id_columns] else: id_header = "row" args.stdout.write("%s\t%s\t%s\n" % (id_header, args.variable_name, args.value_name)) for x, row in enumerate(table): if args.id_column: row_id = "\t".join( [row[int(x) - 1] for x in args.id_column.split(",")]) else: row_id = str(x) for y in args.columns: args.stdout.write("%s\t%s\t%s\n" % (row_id, fields[y], row[y])) elif args.as_column: fields, table = CSV.readTable(args.stdin, with_header=args.has_headers, as_rows=True) args.columns = get_columns(fields, args.columns) table = list(zip(*table)) args.stdout.write("value\n") for column in args.columns: args.stdout.write("\n".join(table[column]) + "\n") elif args.split_fields: # split comma separated fields fields, table = CSV.readTable(args.stdin, with_header=args.has_headers, as_rows=True) args.stdout.write("%s\n" % ("\t".join(fields))) for row in table: row = [x.split(args.separator) for x in row] for d in itertools.product(*row): args.stdout.write("%s\n" % "\t".join(d)) elif args.group: read_and_group_table(args.stdin, args) elif args.join_column: read_and_join_table(args.stdin, args) elif args.expand_table: read_and_expand_table(args.stdin, args) elif args.collapse_table is not None: read_and_collapse_table(args.stdin, args, args.collapse_table) elif "randomize-rows" in args.methods: read_and_randomize_rows(args.stdin, args) elif "grep" in args.methods: args.columns = [int(x) - 1 for x in args.columns.split(",")] patterns = [] if args.file: infile = iotools.open_file(args.file, "r") for line in infile: if line[0] == "#": continue patterns.append(line[:-1].split(args.delimiter)[0]) else: patterns = args for line in args.stdin: data = line[:-1].split(args.delimiter) found = False for c in args.columns: if data[c] in patterns: found = True break if (not found and args.invert_match) or (found and not args.invert_match): print(line[:-1]) else: ###################################################################### ###################################################################### ###################################################################### # Apply remainder of transformations fields, table = CSV.readTable(args.stdin, with_header=args.has_headers, as_rows=False) # convert columns to list table = [list(x) for x in table] ncols = len(fields) if len(table) == 0: raise ValueError("table is empty") nrows = len(table[0]) E.info("processing table with %i rows and %i columns" % (nrows, ncols)) args.columns = get_columns(fields, args.columns) # convert all values to float for c in args.columns: for r in range(nrows): try: table[c][r] = float(table[c][r]) except ValueError: continue for method in args.methods: if method == "normalize-by-value": value = float(args.parameters[0]) del args.parameters[0] for c in args.columns: table[c] = [x / value for x in table[c]] elif method == "multiply-by-value": value = float(args.parameters[0]) del args.parameters[0] for c in args.columns: table[c] = [x * value for x in table[c]] elif method == "normalize-by-max": for c in args.columns: m = max(table[c]) table[c] = [x / m for x in table[c]] elif method == "kullback-leibler": 
args.stdout.write("category1\tcategory2\tkl1\tkl2\tmean\n") format = args.format if format is None: format = "%f" for x in range(0, len(args.columns) - 1): for y in range(x + 1, len(args.columns)): c1 = args.columns[x] c2 = args.columns[y] e1 = 0 e2 = 0 for z in range(nrows): p = table[c1][z] q = table[c2][z] e1 += p * math.log(p / q) e2 += q * math.log(q / p) args.stdout.write("%s\t%s\t%s\t%s\t%s\n" % (fields[c1], fields[c2], format % e1, format % e2, format % ((e1 + e2) / 2))) E.stop() sys.exit(0) elif method == "rank": for c in args.columns: tt = table[c] t = list(zip(tt, list(range(nrows)))) t.sort() for i, n in zip([x[1] for x in t], list(range(nrows))): tt[i] = n elif method in ("lower-bound", "upper-bound"): boundary = float(args.parameters[0]) del args.parameters[0] new_value = float(args.parameters[0]) del args.parameters[0] if method == "upper-bound": for c in args.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ table[c][r] > boundary: table[c][r] = new_value else: for c in args.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ table[c][r] < boundary: table[c][r] = new_value elif method == "fdr": pvalues = [] for c in args.columns: pvalues.extend(table[c]) assert max(pvalues) <= 1.0, "pvalues > 1 in table: max=%s" % \ str(max(pvalues)) assert min(pvalues) >= 0, "pvalue < 0 in table: min=%s" % \ str(min(pvalues)) # convert to str to avoid test for float downstream qvalues = list( map(str, Stats.adjustPValues(pvalues, method=args.fdr_method))) if args.fdr_add_column is None: x = 0 for c in args.columns: table[c] = qvalues[x:x + nrows] x += nrows else: # add new column headers if len(args.columns) == 1: fields.append(args.fdr_add_column) else: for co in args.columns: fields.append(args.fdr_add_column + fields[c]) x = 0 for c in args.columns: # add a new column table.append(qvalues[x:x + nrows]) x += nrows ncols += len(args.columns) elif method == "normalize-by-table": other_table_name = args.parameters[0] del args.parameters[0] other_fields, other_table = CSV.readTable( iotools.open_file(other_table_name, "r"), with_header=args.has_headers, as_rows=False) # convert all values to float for c in args.columns: for r in range(nrows): try: other_table[c][r] = float(other_table[c][r]) except ValueError: continue # set 0s to 1 in the other matrix for c in args.columns: for r in range(nrows): if isinstance(table[c][r], float) and \ isinstance(other_table[c][r], float) and \ other_table[c][r] != 0: table[c][r] /= other_table[c][r] else: table[c][r] = args.missing_value # convert back if args.format is not None: for c in args.columns: for r in range(nrows): if isinstance(table[c][r], float): table[c][r] = format % table[c][r] args.stdout.write("\t".join(fields) + "\n") if args.sort_rows: old2new = {} for r in range(nrows): old2new[table[0][r]] = r for x in args.sort_rows.split(","): if x not in old2new: continue r = old2new[x] args.stdout.write( "\t".join(map(str, [table[c][r] for c in range(ncols)])) + "\n") else: for r in range(nrows): args.stdout.write( "\t".join(map(str, [table[c][r] for c in range(ncols)])) + "\n") E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--summarise", dest="summarise", type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map", dest="output_map", action="store_true",
                      help="output map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.output_map:
        found = []
        options.stdout.write("Domain\tkingdom\tphylum\tclass\torder\t"
                             "family\tgenus\tspecies\n")
        # only output the mapping file - do not continue to summarise,
        # regardless of the specified options
        for lca in LCA.iterate(options.stdin):
            # if bacteria or archaea the kingdom will be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom

            hierarchy = [
                lca.domain, kingdom, lca.phylum, lca._class,
                lca.order, lca.family, lca.genus, lca.species]
            if hierarchy in found:
                continue
            else:
                found.append(hierarchy)
                options.stdout.write("\t".join(hierarchy) + "\n")
        return

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        total = 0
        nreads_domain = 0
        nreads_kingdom = 0
        nreads_kingdom_plus = 0
        nreads_phylum = 0
        nreads_phylum_plus = 0
        nreads_class = 0
        nreads_class_plus = 0
        nreads_order = 0
        nreads_order_plus = 0
        nreads_family = 0
        nreads_family_plus = 0
        nreads_genus = 0
        nreads_genus_plus = 0
        nreads_species = 0
        nreads_species_plus = 0
        nreads_subspecies = 0
        nreads_subspecies_plus = 0

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                nreads_domain += 1
                level_counts["domain"].add(lca.domain)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom != "NA":
                nreads_kingdom += 1
                level_counts["kingdom"].add(lca.kingdom)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom_plus != "NA":
                nreads_kingdom_plus += 1
                level_counts["kingdom+"].add(lca.kingdom_plus)
            else:
                c.kingdom_plus_unmapped += 1

            if lca.phylum != "NA":
                nreads_phylum += 1
                level_counts["phylum"].add(lca.phylum)
            else:
                c.phylum_unmapped += 1

            if lca.phylum_plus != "NA":
                nreads_phylum_plus += 1
                level_counts["phylum+"].add(lca.phylum_plus)
            else:
                c.phylum_plus_unmapped += 1

            if lca._class != "NA":
                nreads_class += 1
                level_counts["class"].add(lca._class)
            else:
                c.class_unmapped += 1

            if lca._class_plus != "NA":
                nreads_class_plus += 1
                level_counts["class+"].add(lca._class_plus)
            else:
                c.class_plus_unmapped += 1

            if lca.order != "NA":
                nreads_order += 1
                level_counts["order"].add(lca.order)
            else:
                c.order_unmapped += 1

            if lca.order_plus != "NA":
                nreads_order_plus += 1
                level_counts["order+"].add(lca.order_plus)
            else:
                c.order_plus_unmapped += 1

            if lca.family != "NA":
                nreads_family += 1
                level_counts["family"].add(lca.family)
            else:
                c.family_unmapped += 1

            if lca.family_plus != "NA":
                nreads_family_plus += 1
                level_counts["family+"].add(lca.family_plus)
            else:
                c.family_plus_unmapped += 1

            if lca.genus != "NA":
                nreads_genus += 1
                level_counts["genus"].add(lca.genus)
            else:
                c.genus_unmapped += 1

            if lca.genus_plus != "NA":
                nreads_genus_plus += 1
                level_counts["genus+"].add(lca.genus_plus)
            else:
                c.genus_plus_unmapped += 1

            if lca.species != "NA":
                nreads_species += 1
                level_counts["species"].add(lca.species)
            else:
                c.species_unmapped += 1

            if lca.species_plus != "NA":
                nreads_species_plus += 1
level_counts["species+"].add(lca.species_plus) else: c.species_plus_unmapped += 1 # removed subspecies mapping for the time # being # if lca.subspecies != "NA": # nreads_subspecies += 1 # level_counts["subspecies"].add(lca.subspecies) # else: # c.subspecies_unmapped += 1 # if lca.subspecies_plus != "NA": # nreads_subspecies_plus += 1 # level_counts["subspecies+"].add(lca.subspecies_plus) # else: # c.subspecies_plus_unmapped += 1 options.stdout.write("\t".join([ "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+", "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+", "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom", "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass", "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily", "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies", "nseqspecies+" ]) + "\n") options.stdout.write("\t".join( map(str, [ len(level_counts["domain"]), len(level_counts["kingdom"]), len(level_counts["kingdom+"]), len(level_counts["phylum"]), len(level_counts["phylum+"]), len(level_counts["class"]), len(level_counts["class+"]), len(level_counts["order"]), len(level_counts["order+"]), len(level_counts["family"]), len(level_counts["family+"]), len(level_counts["genus"]), len(level_counts["genus+"]), len(level_counts["species"]), len(level_counts["species+"]), nreads_domain, nreads_kingdom, nreads_phylum, nreads_phylum_plus, nreads_class, nreads_class_plus, nreads_order, nreads_order_plus, nreads_family, nreads_family_plus, nreads_genus, nreads_genus_plus, nreads_species, nreads_species_plus ])) + "\n") elif options.summarise == "taxa-counts": unmapped = collections.defaultdict(int) total = 0 taxa_counts = { "domain": collections.defaultdict(int), "kingdom": collections.defaultdict(int), "kingdom+": collections.defaultdict(int), "phylum": collections.defaultdict(int), "phylum+": collections.defaultdict(int), "class": collections.defaultdict(int), "class+": collections.defaultdict(int), "order": collections.defaultdict(int), "order+": collections.defaultdict(int), "family": collections.defaultdict(int), "family+": collections.defaultdict(int), "genus": collections.defaultdict(int), "genus+": collections.defaultdict(int), "species": collections.defaultdict(int), "species+": collections.defaultdict(int) } c = E.Counter() for lca in LCA.iterate(options.stdin): total += 1 if lca.domain != "NA": taxa_counts["domain"][lca.domain] += 1 else: c.kingdom_unmapped += 1 unmapped["domain"] += 1 if lca.kingdom != "NA": taxa_counts["kingdom"][lca.kingdom] += 1 else: c.kingdom_unmapped += 1 unmapped["kingdom"] += 1 if lca.kingdom_plus != "NA": taxa_counts["kingdom+"][lca.kingdom_plus] += 1 else: c.kingdom_plus_unmapped += 1 unmapped["kingdom+"] += 1 if lca.phylum != "NA": taxa_counts["phylum"][lca.phylum] += 1 else: c.phylum_unmapped += 1 unmapped["phylum"] += 1 if lca.phylum_plus != "NA": taxa_counts["phylum+"][lca.phylum_plus] += 1 else: c.phylum_plus_unmapped += 1 unmapped["phylum+"] += 1 if lca._class != "NA": taxa_counts["class"][lca._class] += 1 else: c.class_unmapped += 1 unmapped["class"] += 1 if lca._class_plus != "NA": taxa_counts["class+"][lca._class_plus] += 1 else: c.class_plus_unmapped += 1 unmapped["class+"] += 1 if lca.order != "NA": taxa_counts["order"][lca.order] += 1 else: c.order_unmapped += 1 unmapped["order"] += 1 if lca.order_plus != "NA": taxa_counts["order+"][lca.order_plus] += 1 else: c.order_plus_unmapped += 1 unmapped["order+"] += 1 if lca.family != "NA": taxa_counts["family"][lca.family] += 1 else: c.family_unmapped += 1 
unmapped["family"] += 1 if lca.family_plus != "NA": taxa_counts["family+"][lca.family_plus] += 1 else: c.family_plus_unmapped += 1 unmapped["family+"] += 1 if lca.genus != "NA": taxa_counts["genus"][lca.genus] += 1 else: c.genus_unmapped += 1 unmapped["genus"] += 1 if lca.genus_plus != "NA": taxa_counts["genus+"][lca.genus_plus] += 1 else: c.genus_plus_unmapped += 1 unmapped["genus+"] += 1 if lca.species != "NA": taxa_counts["species"][lca.species] += 1 else: c.species_unmapped += 1 unmapped["species"] += 1 if lca.species_plus != "NA": taxa_counts["species+"][lca.species_plus] += 1 else: c.species_plus_unmapped += 1 unmapped["species+"] += 1 options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n") for level, taxa_count in sorted(taxa_counts.items()): total_level = total - unmapped[level] for taxa, count in sorted(taxa_count.items()): options.stdout.write("\t".join([ level, taxa, str(count), "{:.8}".format(float(count) / total_level), "{:.8}". format(float(count) / (float(total_level) / 1000000)) ]) + "\n") E.info(c) elif options.summarise == "individual": # each read is output with its respective # taxon assignments options.stdout.write("\t".join([ "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+", "class", "class+", "order", "order+", "family", "family+", "genus", "genus+", "species", "species+" ]) + "\n") for lca in LCA.iterate(options.stdin): options.stdout.write("\t".join([ lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus, lca.phylum, lca.phylum_plus, lca._class, lca._class_plus, lca.order, lca.order_plus, lca.family, lca.family_plus, lca.genus, lca.genus_plus, lca.species, lca.species_plus ]) + "\n") # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument( "--input-filename-fasta", dest="input_filename_fasta", type=str, help="filename with reference sequence in fasta format ") parser.add_argument( "--counting-mode", dest="counting_mode", type=str, choices=("all", "pileup_defaults"), help="counting mode. all=all reads/bases. pileup-defaults= " "use default pileup thresholds. Options will be added to " "--mpileup-options. .") parser.add_argument("--mpileup-options", dest="mpileup_options", type=str, help="pileup options to use ") parser.set_defaults( mpileup_options="", counting_mode="all", input_filename_fasta=None, report_step=1000000, ) # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv, add_output_options=True) bamfile = args[0] mpileup_options = args.mpileup_options if args.counting_mode == "all": mpileup_options += " -Q 0 -B -A" read_depth_histogram = collections.defaultdict(int) base_depth_histogram = collections.defaultdict(int) # deletions are marked by something like -2AA at the first # position and a '*' for subsequent positions rx_deletions = re.compile("([-][0-9]+|[*])") report_step = args.report_step npositions = 0 samtools = iotools.which("samtools") statement = ("{samtools} mpileup " "-f {reference_fasta} " "{mpileup_options} " "{bamfile} ".format(samtools=samtools, reference_fasta=args.input_filename_fasta, mpileup_options=mpileup_options, bamfile=os.path.abspath(bamfile))) E.info("running the following statement: {}".format(statement)) cmd_args = shlex.split(statement) proc = subprocess.Popen(cmd_args, shell=False, stderr=subprocess.PIPE, stdout=subprocess.PIPE, cwd=os.path.abspath(os.curdir)) for line in proc.stdout: line = line.decode("utf-8") contig, pos, base, read_depth, info, qualities = line[:-1].split("\t") read_depth = int(read_depth) pos = int(pos) if pos % report_step == 0: E.info("working on {}: {}".format(contig, pos)) ndeletions = len(rx_deletions.findall(info)) base_depth = read_depth - ndeletions read_depth_histogram[read_depth] += 1 base_depth_histogram[base_depth] += 1 for line in proc.stderr: E.warn(line) keys = sorted( set(read_depth_histogram.keys()).union(base_depth_histogram.keys())) args.stdout.write("depth\tread_depth_positions\tbase_depth_positions\n") for key in keys: args.stdout.write("{}\t{}\t{}\n".format(key, read_depth_histogram[key], base_depth_histogram[key])) E.info("positions tested: {}".format(sum(read_depth_histogram.values()))) E.stop()
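# The mpileup parsing above subtracts deletion markers from the read depth
# to obtain the base depth: indel length tags such as "-2AA" and "*"
# placeholders on subsequent positions are counted by the regular
# expression.  A self-contained sketch of that single step:
import re

rx_deletions_example = re.compile("([-][0-9]+|[*])")


def base_depth_from_mpileup(read_depth, read_bases):
    """Return base depth = read depth minus the number of deletion markers."""
    return read_depth - len(rx_deletions_example.findall(read_bases))


# e.g. read depth 10 with bases ".,.,*-2AA..,,." gives 10 - 2 = 8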
    for read in bamfile.fetch(until_eof=True):
        all_total += 1
        if all_total % 1000000 == 0:
            E.debug("Done %s reads; %s used" % (all_total, total))

        # skip unmapped, multi-mapping (NH != 1) and second-in-pair reads
        if read.is_unmapped:
            continue
        if read.has_tag("NH") and read.get_tag("NH") != 1:
            continue
        if read.is_read2:
            continue

        flen = read.template_length
        total += 1
        if abs(flen) <= 1000:
            lengthdist[abs(flen)] += 1.0

    bamfile.close()

    # normalise by the number of reads used
    lengthdist = [f / total for f in lengthdist]
    options.stdout.write("\t".join(map(str, lengthdist)))
    E.info("Used %s reads out of %s to build distribution. Total weight = %s "
           % (total, all_total, sum(lengthdist)))

    E.stop()
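# The block above writes one tab-separated line: a normalised distribution
# over fragment lengths 0..1000.  A short sketch of how a consumer could
# read that line back and compute the expected fragment length (assumes
# exactly the single-line output format produced above):
def mean_fragment_length(dist_line):
    """Expected fragment length from the tab-separated distribution line."""
    weights = [float(x) for x in dist_line.rstrip("\n").split("\t")]
    return sum(length * w for length, w in enumerate(weights))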
def run_workflow(options, args, pipeline=None): """command line control function for a pipeline. This method defines command line options for the pipeline and updates the global configuration dictionary correspondingly. It then provides a command parser to execute particular tasks using the ruffus pipeline control functions. See the generated command line help for usage. To use it, add:: import pipeline as P if __name__ == "__main__": sys.exit(P.main(sys.argv)) to your pipeline script. Arguments --------- pipeline: object pipeline to run. If not given, all ruffus pipelines are run. """ logger = logging.getLogger("cgatcore.pipeline") if args: options.pipeline_action = args[0] if len(args) > 1: options.pipeline_targets.extend(args[1:]) if options.force_run: if options.force_run == "all": forcedtorun_tasks = ruffus.pipeline_get_task_names() else: forcedtorun_tasks = options.pipeline_targets else: forcedtorun_tasks = [] # create local scratch if it does not already exists. Note that # directory itself will be not deleted while its contents should # be cleaned up. if not os.path.exists(get_params()["tmpdir"]): logger.warn( "local temporary directory {} did not exist - created".format( get_params()["tmpdir"])) try: os.makedirs(get_params()["tmpdir"]) except OSError: # file exists pass logger.debug("temporary directory is {}".format(get_params()["tmpdir"])) # set multiprocess to a sensible setting if there is no cluster run_on_cluster = HAS_DRMAA is True and not options.without_cluster if options.multiprocess is None: if not run_on_cluster: options.multiprocess = int( math.ceil(multiprocessing.cpu_count() / 2.0)) else: options.multiprocess = 40 # see inputValidation function in Parameters.py if options.input_validation: input_validation(get_params(), sys.argv[0]) elif options.pipeline_action == "debug": # create the session proxy start_session() method_name = options.pipeline_targets[0] caller = get_caller() method = getattr(caller, method_name) method(*options.pipeline_targets[1:]) elif options.pipeline_action in ("make", "show", "state", "svg", "plot", "dot", "touch", "regenerate"): messenger = None try: with cache_os_functions(): if options.pipeline_action == "make": if not options.without_cluster and not HAS_DRMAA and not get_params( )['testing']: E.critical( "DRMAA API not found so cannot talk to a cluster.") E.critical("Please use --local to run the pipeline" " on this host: {}".format(os.uname()[1])) sys.exit(-1) # get tasks to be done. This essentially replicates # the state information within ruffus. stream = StringIO() ruffus.pipeline_printout( stream, options.pipeline_targets, verbose=5, pipeline=pipeline, checksum_level=options.ruffus_checksums_level) messenger = LoggingFilterProgress(stream.getvalue()) logger.addFilter(messenger) global task if options.without_cluster: # use ThreadPool to avoid taking multiple CPU for pipeline # controller. opts = {"multithread": options.multiprocess} else: # use cooperative multitasking instead of multiprocessing. opts = { "multiprocess": options.multiprocess, "pool_manager": "gevent" } # create the session proxy start_session() logger.info("current directory is {}".format(os.getcwd())) ruffus.pipeline_run( options.pipeline_targets, forcedtorun_tasks=forcedtorun_tasks, logger=logger, verbose=options.loglevel, log_exceptions=options.log_exceptions, exceptions_terminate_immediately=options. 
exceptions_terminate_immediately, checksum_level=options.ruffus_checksums_level, pipeline=pipeline, one_second_per_job=False, **opts) close_session() elif options.pipeline_action == "show": ruffus.pipeline_printout( options.stdout, options.pipeline_targets, forcedtorun_tasks=forcedtorun_tasks, verbose=options.loglevel, pipeline=pipeline, checksum_level=options.ruffus_checksums_level) elif options.pipeline_action == "touch": ruffus.pipeline_run( options.pipeline_targets, touch_files_only=True, verbose=options.loglevel, pipeline=pipeline, checksum_level=options.ruffus_checksums_level) elif options.pipeline_action == "regenerate": ruffus.pipeline_run( options.pipeline_targets, touch_files_only=options.ruffus_checksums_level, pipeline=pipeline, verbose=options.loglevel) elif options.pipeline_action == "svg": ruffus.pipeline_printout_graph( options.stdout.buffer, options.pipeline_format, options.pipeline_targets, forcedtorun_tasks=forcedtorun_tasks, pipeline=pipeline, checksum_level=options.ruffus_checksums_level) elif options.pipeline_action == "state": ruffus.ruffus_return_dag( options.stdout, target_tasks=options.pipeline_targets, forcedtorun_tasks=forcedtorun_tasks, verbose=options.loglevel, pipeline=pipeline, checksum_level=options.ruffus_checksums_level) elif options.pipeline_action == "plot": outf, filename = tempfile.mkstemp() ruffus.pipeline_printout_graph( os.fdopen(outf, "wb"), options.pipeline_format, options.pipeline_targets, pipeline=pipeline, checksum_level=options.ruffus_checksums_level) execute("inkscape %s" % filename) os.unlink(filename) except ruffus.ruffus_exceptions.RethrownJobError as ex: if not options.debug: E.error("%i tasks with errors, please see summary below:" % len(ex.args)) for idx, e in enumerate(ex.args): task, job, error, msg, traceback = e if task is None: # this seems to be errors originating within ruffus # such as a missing dependency # msg then contains a RethrownJobJerror msg = str(msg) else: task = re.sub("__main__.", "", task) job = re.sub(r"\s", "", job) # display only single line messages if len([x for x in msg.split("\n") if x != ""]) > 1: msg = "" E.error("%i: Task=%s Error=%s %s: %s" % (idx, task, error, job, msg)) E.error("full traceback is in %s" % options.pipeline_logfile) logger.error("start of all error messages") logger.error(ex) logger.error("end of all error messages") raise ValueError("pipeline failed with %i errors" % len(ex.args)) from ex else: raise elif options.pipeline_action == "dump": options.stdout.write((json.dumps(get_params())) + "\n") elif options.pipeline_action == "printconfig": E.info("printing out pipeline parameters: ") p = get_params() for k in sorted(get_params()): print(k, "=", p[k]) print_config_files() elif options.pipeline_action == "config": # Level needs to be 2: # 0th level -> cgatflow.py # 1st level -> Control.py # 2nd level -> pipeline_xyz.py f = sys._getframe(2) caller = f.f_globals["__file__"] pipeline_path = os.path.splitext(caller)[0] general_path = os.path.join(os.path.dirname(pipeline_path), "configuration") write_config_files(pipeline_path, general_path) elif options.pipeline_action == "clone": clone_pipeline(options.pipeline_targets[0]) else: raise ValueError("unknown pipeline action %s" % options.pipeline_action) E.stop(logger=get_logger())
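# run_workflow() is normally reached through P.main(), as described in the
# docstring above.  A minimal, hypothetical pipeline script built on that
# pattern; the single @transform task (counting lines of *.txt files) is
# illustrative only and not part of this module.
import sys

from ruffus import transform, suffix
import cgatcore.pipeline as P


@transform("*.txt", suffix(".txt"), ".counts")
def count_lines(infile, outfile):
    statement = "wc -l < %(infile)s > %(outfile)s"
    P.run(statement)


if __name__ == "__main__":
    sys.exit(P.main(sys.argv))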
def main(argv=None): """script main. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("bedgraph", "wiggle", "bigbed", "bigwig", "bed"), help="output format [default=%default]") parser.add_option("-s", "--shift-size", dest="shift", type="int", help="shift reads by a certain amount (ChIP-Seq) " "[%default]") parser.add_option("-e", "--extend", dest="extend", type="int", help="extend reads by a certain amount " "(ChIP-Seq) [%default]") parser.add_option("-p", "--wiggle-span", dest="span", type="int", help="span of a window in wiggle tracks " "[%default]") parser.add_option("-m", "--merge-pairs", dest="merge_pairs", action="store_true", help="merge paired-ended reads into a single " "bed interval [default=%default].") parser.add_option("--scale-base", dest="scale_base", type="float", help="number of reads/pairs to scale bigwig file to. " "The default is to scale to 1M reads " "[default=%default]") parser.add_option("--scale-method", dest="scale_method", type="choice", choices=( "none", "reads", ), help="scale bigwig output. 'reads' will normalize by " "the total number reads in the bam file that are used " "to construct the bigwig file. If --merge-pairs is used " "the number of pairs output will be used for " "normalization. 'none' will not scale the bigwig file" "[default=%default]") parser.add_option("--max-insert-size", dest="max_insert_size", type="int", help="only merge if insert size less that " "# bases. 0 turns of this filter " "[default=%default].") parser.add_option("--min-insert-size", dest="min_insert_size", type="int", help="only merge paired-end reads if they are " "at least # bases apart. " "0 turns of this filter. [default=%default]") parser.set_defaults( samfile=None, output_format="wiggle", shift=0, extend=0, span=1, merge_pairs=None, min_insert_size=0, max_insert_size=0, scale_method='none', scale_base=1000000, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if len(args) >= 1: options.samfile = args[0] if len(args) == 2: options.output_filename_pattern = args[1] if not options.samfile: raise ValueError("please provide a bam file") # Read BAM file using Pysam samfile = pysam.AlignmentFile(options.samfile, "rb") # Create temporary files / folders tmpdir = tempfile.mkdtemp() E.debug("temporary files are in %s" % tmpdir) tmpfile_wig = os.path.join(tmpdir, "wig") tmpfile_sizes = os.path.join(tmpdir, "sizes") # Create dictionary of contig sizes contig_sizes = dict(list(zip(samfile.references, samfile.lengths))) # write contig sizes outfile_size = iotools.open_file(tmpfile_sizes, "w") for contig, size in sorted(contig_sizes.items()): outfile_size.write("%s\t%s\n" % (contig, size)) outfile_size.close() # Shift and extend only available for bigwig format if options.shift or options.extend: if options.output_format != "bigwig": raise ValueError( "shift and extend only available for bigwig output") # Output filename required for bigwig / bigbed computation if options.output_format == "bigwig": if not options.output_filename_pattern: raise ValueError( "please specify an output file for bigwig computation.") # Define executable to use for binary conversion if options.output_format == "bigwig": executable_name = "wigToBigWig" else: raise ValueError("unknown output format `%s`" % options.output_format) # check required executable file is in the path executable = iotools.which(executable_name) if not executable: raise OSError("could not find %s in path." % executable_name) # Open outout file outfile = iotools.open_file(tmpfile_wig, "w") E.info("starting output to %s" % tmpfile_wig) else: outfile = iotools.open_file(tmpfile_wig, "w") E.info("starting output to stdout") # Set up output write functions if options.output_format in ("wiggle", "bigwig"): # wiggle is one-based, so add 1, also step-size is 1, so need # to output all bases if options.span == 1: outf = lambda outfile, contig, start, end, val: \ outfile.write( "".join(["%i\t%i\n" % (x, val) for x in range(start + 1, end + 1)])) else: outf = SpanWriter(options.span) elif options.output_format == "bedgraph": # bed is 0-based, open-closed outf = lambda outfile, contig, start, end, val: \ outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val)) # initialise counters ninput, nskipped, ncontigs = 0, 0, 0 # set output file name output_filename_pattern = options.output_filename_pattern if output_filename_pattern: output_filename = os.path.abspath(output_filename_pattern) # shift and extend or merge pairs. Output temporay bed file if options.shift > 0 or options.extend > 0 or options.merge_pairs: # Workflow 1: convert to bed intervals and use bedtools # genomecov to build a coverage file. 
# Convert to bigwig with UCSC tools bedGraph2BigWig if options.merge_pairs: # merge pairs using bam2bed E.info("merging pairs to temporary file") counter = merge_pairs(samfile, outfile, min_insert_size=options.min_insert_size, max_insert_size=options.max_insert_size, bed_format=3) E.info("merging results: {}".format(counter)) if counter.output == 0: raise ValueError("no pairs output after merging") else: # create bed file with shifted/extended tags shift, extend = options.shift, options.extend shift_extend = shift + extend counter = E.Counter() for contig in samfile.references: E.debug("output for %s" % contig) lcontig = contig_sizes[contig] for read in samfile.fetch(contig): pos = read.pos if read.is_reverse: start = max(0, read.pos + read.alen - shift_extend) else: start = max(0, read.pos + shift) # intervals extending beyond contig are removed if start >= lcontig: continue end = min(lcontig, start + extend) outfile.write("%s\t%i\t%i\n" % (contig, start, end)) counter.output += 1 outfile.close() if options.scale_method == "reads": scale_factor = float(options.scale_base) / counter.output E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" % (options.scale_method, counter.output, scale_factor)) scale = "-scale %f" % scale_factor else: scale = "" # Convert bed file to coverage file (bedgraph) tmpfile_bed = os.path.join(tmpdir, "bed") E.info("computing coverage") # calculate coverage - format is bedgraph statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals() E.run(statement) # Convert bedgraph to bigwig E.info("converting to bigwig") tmpfile_sorted = os.path.join(tmpdir, "sorted") statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;" "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s " "%(output_filename_pattern)s" % locals()) E.run(statement) else: # Workflow 2: use pysam column iterator to build a # wig file. Then convert to bigwig of bedgraph file # with UCSC tools. def column_iter(iterator): start = None end = 0 n = None for t in iterator: if t.pos - end > 1 or n != t.n: if start is not None: yield start, end, n start = t.pos end = t.pos n = t.n end = t.pos yield start, end, n if options.scale_method != "none": raise NotImplementedError( "scaling not implemented for pileup method") # Bedgraph track definition if options.output_format == "bedgraph": outfile.write("track type=bedGraph\n") for contig in samfile.references: # if contig != "chrX": continue E.debug("output for %s" % contig) lcontig = contig_sizes[contig] # Write wiggle header if options.output_format in ("wiggle", "bigwig"): outfile.write("variableStep chrom=%s span=%i\n" % (contig, options.span)) # Generate pileup per contig using pysam and iterate over columns for start, end, val in column_iter(samfile.pileup(contig)): # patch: there was a problem with bam files and reads # overextending at the end. These are usually Ns, but # need to check as otherwise wigToBigWig fails. 
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                if start >= end:
                    continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            outfile.close()
            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(
                    " ".join((executable, tmpfile_wig,
                              tmpfile_sizes, output_filename_pattern)),
                    shell=True)
                if retcode != 0:
                    E.warn("%s terminated with signal: %i" %
                           (executable, -retcode))
                    return -retcode
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1
            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
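# The two writers set up above differ only in coordinate convention:
# wiggle variableStep is 1-based with one line per base, while bedGraph is
# 0-based, half-open.  A small illustration of the same interval (zero-based
# [10, 13) on "chr1" with value 5) in both formats, mirroring the outf
# lambdas defined earlier:
def wiggle_lines(start, end, val):
    return "".join("%i\t%i\n" % (x, val) for x in range(start + 1, end + 1))


def bedgraph_line(contig, start, end, val):
    return "%s\t%i\t%i\t%i\n" % (contig, start, end, val)


# wiggle_lines(10, 13, 5)          -> "11\t5\n12\t5\n13\t5\n"
# bedgraph_line("chr1", 10, 13, 5) -> "chr1\t10\t13\t5\n"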
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-i", "--input-bam", dest="input_bam_file", type="string", help="input bam file") parser.add_option( "-f", "--reference-bam", dest="reference_bam_file", type="string", help="reference BAM file [%default]") parser.add_option( "-q", "--query-name-regex", dest="query_name_regex", type="string", help="regular expression to apply on query name. " "Potentially required to match samtools sort order and should " "evaluate to an integer [%default]") parser.set_defaults( input_bam_file=None, reference_bam_file=None, query_name_regex=None, ) (options, args) = E.start(parser, argv, add_output_options=True) if len(args) == 2: options.input_bam_file = args[0] options.reference_bam_file = args[1] if options.input_bam_file is None: raise ValueError("please supply a BAM file as input") if options.reference_bam_file is None: raise ValueError("please supply a BAM file as reference") # update paths to absolute options.input_bam_file = os.path.abspath(options.input_bam_file) options.reference_bam_file = os.path.abspath(options.reference_bam_file) if not os.path.exists(options.input_bam_file): raise OSError("input bam file {} does not exist".format( options.input_bam_file)) if not os.path.exists(options.reference_bam_file): raise OSError("reference bam file {} does not exist".format( options.reference_bam_file)) bam_in = pysam.AlignmentFile(options.input_bam_file) ref_in = pysam.AlignmentFile(options.reference_bam_file) outf_mapped = E.open_output_file("mapped") outf_mapped.write("\t".join( ["read", "length", "status", "overlap", "comp_contig", "comp_start", "comp_end", "ref_contig", "ref_start", "ref_end", "shared_misaligned", "shared_aligned", "shared_insertion", "shared_deletion", "comp_aligned", "comp_insertion", "comp_deletion", "ref_aligned", "ref_insertion", "ref_deletion"]) + "\n") outf_missing = E.open_output_file("missing") outf_missing.write("\t".join( ["read", "length", "status", "aligned", "insertion", "deletion"]) + "\n") counter = E.Counter() if options.query_name_regex: rx = re.compile(options.query_name_regex) def extract_query(x): return int(rx.search(x).groups()[0]) qname_fn = None if options.query_name_regex: qname_fn = extract_query for reads_cmp, read_ref in group_pairs(iterate_read_pairs( bam_in.fetch(until_eof=True), ref_in.fetch(until_eof=True), qname_fn=qname_fn)): if len(reads_cmp) == 0: counter.missing += 1 pairs_ref = set(read_ref.get_aligned_pairs()) outf_missing.write("\t".join( map(str, ( read_ref.query_name, read_ref.query_length, "missing") + count_pairs(pairs_ref))) + "\n") continue if len(reads_cmp) > 1: # multiple matches counter.multi_mapping += 1 prefix = "multi_" else: counter.unique_mapping += 1 prefix = "unique_" is_mapped = False for read_cmp in reads_cmp: counter.paired += 1 if read_cmp.is_unmapped: counter.unmapped += 1 pairs_ref = set(read_ref.get_aligned_pairs()) outf_missing.write("\t".join( map(str, ( read_ref.query_name, read_ref.query_length, "unmapped") + count_pairs(pairs_ref))) + "\n") continue overlap = max(0, (min(read_cmp.reference_end, read_ref.reference_end) - max(read_cmp.reference_start, read_ref.reference_start))) pairs_cmp = set(read_cmp.get_aligned_pairs()) pairs_ref = set(read_ref.get_aligned_pairs()) shared_cmp = pairs_cmp.intersection(pairs_ref) unique_cmp = pairs_cmp.difference(pairs_ref) missaligned = len([x for x, y in unique_cmp if x is not None and y is not None]) if read_cmp.reference_name != 
read_ref.reference_name or \ overlap == 0: status = "mismapped" else: counter.overlap += 1 status = "mapped" is_mapped = True outf_mapped.write("\t".join( map(str, (read_cmp.query_name, read_cmp.query_length, prefix + status, overlap, read_cmp.reference_name, read_cmp.reference_start, read_cmp.reference_end, read_ref.reference_name, read_ref.reference_start, read_ref.reference_end, missaligned) + count_pairs(shared_cmp) + count_pairs(pairs_cmp) + count_pairs(pairs_ref))) + "\n") else: if is_mapped: status = "mapped" else: status = "mismapped" counter[prefix + status] += 1 with E.open_output_file("summary") as outf: outf.write("category\tcounts\n") outf.write(counter.asTable() + "\n") E.stop()
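# count_pairs() is used above but defined elsewhere.  Judging from the
# aligned/insertion/deletion column headers it fills, a plausible sketch
# (an assumption, not the actual helper): pysam's get_aligned_pairs()
# yields (query_pos, ref_pos) tuples where ref_pos is None within an
# insertion and query_pos is None within a deletion.
def count_pairs_sketch(pairs):
    """Return (aligned, insertion, deletion) counts for a set of
    (query_pos, ref_pos) tuples."""
    aligned = sum(1 for q, r in pairs if q is not None and r is not None)
    insertion = sum(1 for q, r in pairs if r is None)
    deletion = sum(1 for q, r in pairs if q is None)
    return aligned, insertion, deletion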
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--program", dest="program", type="choice", choices=["plink2", "gcta", "plinkdev"], help="program to execute genome-wide analysis") parser.add_option("--input-file-pattern", dest="infile_pattern", type="string", help="file prefix that identifies a group of files") parser.add_option("--input-file-format", dest="file_format", type="choice", choices=[ "plink", "plink_binary", "oxford", "oxford_binary", "vcf", "GRM_binary", "GRM_gz" ], help="format of input files") parser.add_option("--phenotypes-file", dest="pheno_file", type="string", help="text file of additional phenotypes") parser.add_option("--pheno", dest="pheno", type="string", help="either phenotype file column header or number") parser.add_option("--covariates-file", dest="covariate_file", type="string", help="file containing covariates") parser.add_option("--covariate-column", dest="covar_col", type="string", help="column number(s) or header(s) to include in " "association model") parser.add_option("--method", dest="method", type="choice", choices=[ "ld_prune", "summary", "flag_hets", "remove_relations", "check_gender", "IBD" ], help="method to apply to genome-wide data") parser.add_option("--IBD-parameter", dest="ibd_param", type="choice", choices=["norm", "relatives", "full"], help="param " "to pass to IBD calculations") parser.add_option("--principal-components", dest="num_pcs", type="int", help="the number of principal components to output") parser.add_option("--matrix-shape", dest="matrix_shape", type="choice", choices=["triangle", "square", "square0"], help="output matrix shape.", default="triangle") parser.add_option("--matrix-compression", dest="matrix_compress", type="choice", choices=["gz", "bin", "bin4"], help="compression to apply to output matrix file", default="gz") parser.add_option("--matrix-form", dest="matrix_form", type="choice", choices=["distance", "grm"], help="type of relationship matrix to calculate") parser.add_option( "--matrix-metric", dest="matrix_metric", type="choice", choices=["fhat", "cov", "ibc2", "ibc3", "ibs", "genomic", "hamming"], help="value to calculate for diagonal elements of the " "grm. 
Default is fhat for grm and hamming for distance.") parser.add_option( "--matrix-options", dest="matrix_options", type="string", help="modifiers of matrix output, see plink documentation " "for details") parser.add_option("--strand-flip-subset", dest="flip_subset", action="store_true", help="apply strand flipping to a subset of samples") parser.add_option("--flip-scan-type", dest="scan_param", type="choice", choices=["default", "window", "threshold"], help="strand flipping scan to apply to SNPs") parser.add_option("--sort-type", dest="sort_type", type="choice", choices=["none", "natural", "ascii", "file"], help="sort type to input files") parser.add_option("--merge-file-format", dest="merge_format", type="choice", choices=["plink", "binary_plink"], help="format of input files to be merged") parser.add_option( "--merge-mode", dest="merge_mode", type="choice", choices=[ "default", "original_missing", "new_nonmissing", "no_overwrite", "force", "report_all", "report_nonmissing" ], help="merge mode to apply to dealing with merge conflicts") parser.add_option("--duplicates-method", dest="dup_method", type="choice", choices=["same_ref", "id_match", "suppress_first"], help="method for identifying and dealing with duplicate " "variants") parser.add_option("--summary-method", dest="summary_method", type="choice", choices=[ "allele_frequency", "missing_data", "hardy_weinberg", "mendel_errors", "inbreeding", "inbreeding_coef", "gender_checker", "wrights_fst" ], help="summary statistics to calculate") parser.add_option("--summary-parameter", dest="sum_param", type="string", help="optional parameters that can be passed to summary " "statistics methods") parser.add_option( "--genotype-rate", dest="filt_genotype_rate", type="string", help="genotyping rate threshold. SNPs below this threshold " "will be excluded from analysis") parser.add_option("--indiv-missing", dest="filt_missingness", type="string", help="individual missingness rate. Individuals below " "this threshold will be excluded from analysis") parser.add_option("--hardy-weinberg", dest="filt_hwe", type="string", help="hardy-weinberg p-value threshold for SNPs. SNPs " "with a 2df chisquared p-value below this will be " "filtered out") parser.add_option( "--min-allele-frequency", dest="filt_min_allele_frequency", type="string", help="only include SNPs with an allele frequency equal to " "or above this threshold") parser.add_option( "--max-allele-frequency", dest="filt_max_allele_frequency", type="string", help="only include SNPs with an allele frequency equal to " "or below this threshold") parser.add_option( "--mendelian-error", dest="filt_mendelian_error", type="string", help="exclude individuals/trios with mendelian errors that " "exceed this value") parser.add_option("--min-quality-score", dest="filt_min_qaul_score", type="string", help="reset the minimum low bound of quality scores for " "variants in a VCF file. Default is 0") parser.add_option( "--max-quality-score", dest="filt_max_qual_score", type="string", help="reset the maximum upper bound of quality scores for " "a VCCF file. 
Default is Inf") parser.add_option("--allow-no-gender", dest="filt_allow_no_sex", type="string", help="allow individuals with gender missing") parser.add_option("--enforce-gender", dest="filt_enforce_sex", type="string", help="only include individuals with non-missing gender " "information") parser.add_option("--keep-individuals", dest="filt_keep", type="string", help="a file containing individuals IDs to keep, " "one per row") parser.add_option("--remove-individuals", dest="filt_remove", type="string", help="a file of individual IDs to remove, one per row") parser.add_option("--subset-filter", dest="filt_subset_filter", type="choice", choices=[ "cases", "controls", "males", "females", "founders", "nonfounders" ], help="only apply filters to the specific subset of " "individuals supplied") parser.add_option( "--extract-snps", dest="filt_extract", type="string", help="text file of variant IDs to include in the analysis, " "ignoring all others") parser.add_option("--exclude-snps", dest="filt_exclude", type="string", help="a file of variant IDs to exclude from analysis") parser.add_option("--restrict-chromosome", dest="filt_chromosome", type="string", help="restict analysis to either a single chromosome, " "or a comma-separated list of chromosomes") parser.add_option("--exclude-chromosomes", dest="filt_exclude_chromosome", type="string", help="exclude all variants on these " "chromosome(s)") parser.add_option( "--autosome-only", dest="filt_autosome", action="store_true", help="if present only autosomal variants will be analysed") parser.add_option( "--pseudo-autosome", dest="filt_pseudo_autosome", action="store_true", help="include on the pseudo-autosomal region of chromosome X") parser.add_option("--ignore-indels", dest="filt_ignore_indels", action="store_true", help="only include bi-allelic single nucleotide " "variants in analysis") parser.add_option( "--snp-range", dest="filt_snp_bp_range", type="string", help="comma separated list of from, to genome co-ordinates " "within which to include variants for analysis") parser.add_option("--snp-id-range", dest="filt_snp_id_range", type="string", help="comma separate list of IDs from, to within which " "to include variants for analysis.") parser.add_option("--snp-id", dest="filt_specific_snp", type="string", help="include a single snp in the analysis given by " "it's variant ID.") parser.add_option("--exclude-variant", dest="filt_exclude_snp", type="string", help="exclude a single variant from the analysis, " "given by it's variant ID") parser.add_option( "--covariate-filter", dest="filt_covariate_filter", type="string", help="covariate column headers or column numbers on which " "to filter on. Requries --covariate-file") parser.add_option( "--filter-parameter", dest="param", type="string", help="parameter values to be passed to filtering function") parser.add_option("--window-size", dest="window_size", type="string", help="alters the behaviour of the --snp-range and " "--include/exclude snp options. variants within +/- " "half * window_size (kb) are included") parser.add_option( "--range-resolution", dest="filt_range_resolution", type="choice", choices=["bp", "kb", "mb"], help="alters the (from, to) range resolution to either bp, " "kb or mb") parser.add_option( "--output-file-pattern", dest="out_pattern", type="string", help="output file pattern prefix. 
file suffixes are dependent " "on the task executed") parser.add_option("--threads", dest="threads", type="int", help="the number of threads to use for multi-threaded " "processes") parser.add_option("--use-kb", dest="kb", action="store_true", help="if present uses a kb sized window for LD pruning") parser.add_option("--prune-method", dest="prune_method", type="choice", choices=["R2", "VIF"], help="type of LD pruning to " "perform, pair-wise LD or variance inflation factor") parser.add_option("--step-size", dest="step", type="string", help="step size to advance window by") parser.add_option("--threshold", dest="threshold", type="string", help="threshold on which to filter results") parser.add_option("--parallel", dest="parallel", type="int", help="number of jobs to split task into") parser.add_option("--memory", dest="memory", type="string", help="amount of memory to reserve for the task") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) parser.set_defaults(sum_param=None, dup_method="same_ref", matrix_shape="triangle", matrix_options=None, matrix_compress="gz", kb=False, random_seed=random.randint(0, 19999), memory="60G", parallel=None) if not options.infile_pattern: infiles = (argv[-1]).split(",") else: infiles = options.infile_pattern # create a new filegroup object geno_files = gwas.FileGroup(files=infiles, file_format=options.file_format, genotype_format="imputed") if options.pheno_file: geno_files.set_phenotype(pheno_file=options.pheno_file, pheno=options.pheno) else: pass # add FileGroup object to the gwas program object if options.program == "plink2": gwas_object = gwas.Plink2(files=geno_files) gwas_object.program_call(infiles=geno_files, outfile=options.out_pattern) elif options.program == "plinkdev": gwas_object = gwas.PlinkDev(files=geno_files) gwas_object.program_call(infiles=geno_files, outfile=options.out_pattern) elif options.program == "gcta": gwas_object = gwas.GCTA(files=geno_files) gwas_object.program_call(infiles=geno_files, outfile=options.out_pattern) else: pass # collect filtering options from options opt_dict = options.__dict__ filter_keys = [fx for fx in opt_dict.keys() if re.search("filt", fx)] filter_dict = {k: options.__dict__[k] for k in filter_keys if opt_dict[k]} # iteratively add all filters to GWASProgram object for fkey in filter_dict: filt_key = fkey.replace("filt_", "") filter_value = filter_dict[fkey] gwas_object.apply_filters(filter_type=filt_key, filter_value=filter_value) # handle summary statistics if options.method == "ld_prune": gwas_object._qc_methods(ld_prune=options.prune_method, kb=True, window=options.window_size, step=options.step, threshold=options.threshold) elif options.method == "IBD": # use sum param to pass arguments to ibd estiamte # these are norm, full or relatitves gwas_object._qc_methods(ibd=options.ibd_param) elif options.method == "summary": if options.summary_method == "allele_frequency": gwas_object._output_statistics(allele_frequency=options.sum_param) elif options.summary_method == "hardy_weinberg": gwas_object._output_statistics(hardy_weinberg=options.sum_param) elif options.summary_method == "missing_data": gwas_object._output_statistics(missing_data=options.sum_param) elif options.summary_method == "mendel_errors": gwas_object._output_statistics(mendel_errors=options.sum_param) elif options.summary_method == "inbreeding": gwas_object._output_statistics(inbreeding=options.sum_param) elif options.summary_method == "inbreeding_coef": 
gwas_object._output_statistics(inbreeding_coef=options.sum_param) elif options.summary_method == "gender_checker": gwas_object._output_statistics(gender_checker=options.sum_param) elif options.summary_method == "wrights_fst": gwas_object._output_statistics(wrights_fst=options.sum_param) else: pass elif options.method == "remove_relations": gwas_object._run_tasks(remove_relations="cutoff", parameter=options.threshold) elif options.method == "check_gender": gwas_object._run_tasks(check_gender="") else: pass gwas_object.build_statement(infiles=geno_files, outfile=options.out_pattern, threads=options.threads, memory=options.memory, parallel=options.parallel) # write footer and output benchmark information. E.stop()
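# ---------------------------------------------------------------------------
# A minimal, hypothetical sketch of the option-to-filter convention used in
# the main() above: every option whose dest carries a "filt_" prefix is
# treated as a filter, and the prefix is stripped before the value is handed
# to apply_filters().  The helper below is illustrative only and is not
# called by the script.
def _sketch_collect_filters(option_dict):
    """Return {filter_name: value} for every set option named 'filt_*'."""
    return {key[len("filt_"):]: value
            for key, value in option_dict.items()
            if key.startswith("filt_") and value}

# Example: _sketch_collect_filters({"filt_hwe": "1e-6", "filt_keep": None})
# returns {"hwe": "1e-6"}.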
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-r", "--mask-bed-file", "--mask-gff-file", dest="filename_bed", type="string", metavar='GFF', help="gff formatted file with masking locations. The number of " "reads overlapping the intervals in the given file will be " "computed. Note that the computation currently does not take " "into account indels, so it is an approximate count only. " "[%default]") parser.add_option( "-f", "--ignore-masked-reads", dest="ignore_masked_reads", action="store_true", help="as well as counting reads in the file given by --mask-bed-file, " "also remove these reads for duplicate and match statistics. " "[%default]") parser.add_option( "-i", "--num-reads", dest="input_reads", type="int", help="the number of reads - if given, used to provide percentages " "[%default]") parser.add_option( "-d", "--output-details", dest="output_details", action="store_true", help="output per-read details into a separate file. Read names are " "md5/base64 encoded [%default]") parser.add_option("--output-readmap", dest="output_readmap", action="store_true", help="output map between read name and " "md5/base64 encoded short name[%default]") parser.add_option( "--add-alignment-details", dest="add_alignment_details", action="store_true", help= "add alignment details to per-read details. Implies --output-details " "[%default]") parser.add_option( "-q", "--fastq-file", dest="filename_fastq", help="filename with sequences and quality scores. This file is only " "used to collect sequence identifiers. Thus, for paired end data a " "single file is sufficient [%default]") parser.add_option( "--basic-counts", dest="detailed_count", action="store_false", help="perform basic counting and do not compute per read stats. " "This is more memory efficient and faster stats computation, " "but only a summary counts table is output [%default]") parser.set_defaults( filename_bed=None, ignore_masked_reads=False, input_reads=0, force_output=False, filename_fastq=None, detailed_count=True, output_details=False, output_readmap=False, add_alignment_details=False, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if options.filename_bed: bed_mask = GTF.readAndIndex( GTF.iterator(iotools.open_file(options.filename_bed))) else: bed_mask = None if options.add_alignment_details: options.output_details = True is_stdin = True if len(args) > 0: pysam_in = pysam.AlignmentFile(args[0], "rb") if args[0] != "-": is_stdin = False elif options.stdin == sys.stdin: pysam_in = pysam.AlignmentFile("-", "rb") else: pysam_in = pysam.AlignmentFile(options.stdin, "rb") if options.stdin != "-": is_stdin = False if options.output_details: outfile_details = E.open_output_file("details", "w") else: outfile_details = None if options.output_readmap: outfile_readmap = E.open_output_file("readmap", "w") else: outfile_readmap = None if options.filename_fastq and not os.path.exists(options.filename_fastq): raise IOError("file %s does not exist" % options.filename_fastq) (counter, flags_counts, nh_filtered, nh_all, nm_filtered, nm_all, mapq, mapq_all, max_hi, details_df) = \ bam2stats_count(pysam_in, bed_mask=bed_mask, ignore_masked_reads=options.ignore_masked_reads, is_stdin=is_stdin, filename_fastq=options.filename_fastq, outfile_details=outfile_details, add_alignment_details=options.add_alignment_details, outfile_readmap=outfile_readmap, detailed_count=options.detailed_count) if max_hi > 0 and max_hi != max(nh_all.keys()): E.warn("max_hi(%i) is inconsistent with max_nh (%i) " "- counts will be corrected" % (max_hi, max(nh_all.keys()))) outs = options.stdout outs.write("category\tcounts\tpercent\tof\n") def _write(outs, text, numerator, denominator, base): percent = iotools.pretty_percent(numerator, denominator) outs.write('%s\t%i\t%s\t%s\n' % (text, numerator, percent, base)) ############################### ############################### ############################### # Output alignment information ############################### nalignments_unmapped = flags_counts["unmapped"] nalignments_mapped = counter.alignments_input - nalignments_unmapped _write(outs, "alignments_total", counter.alignments_input, counter.alignments_input, "alignments_total") if counter.alignments_input == 0: E.warn("no alignments in BAM file - no further output") E.stop() return _write(outs, "alignments_mapped", nalignments_mapped, counter.alignments_input, 'alignments_total') _write(outs, "alignments_unmapped", nalignments_unmapped, counter.alignments_input, 'alignments_total') if nalignments_mapped == 0: E.warn("no mapped alignments - no further output") E.stop() return for flag, counts in sorted(flags_counts.items()): if flag == "unmapped": continue _write(outs, 'alignments_' + flag, counts, nalignments_mapped, 'alignments_mapped') if options.filename_bed: _write(outs, "alignments_masked", counter.alignments_masked, nalignments_mapped, 'alignments_mapped') _write(outs, "alignments_notmasked", counter.alignments_notmasked, nalignments_mapped, 'alignments_mapped') _write(outs, "alignments_filtered", counter.alignments_filtered, nalignments_mapped, "alignments_mapped") if counter.filtered == nalignments_mapped: normby = "alignments_mapped" else: normby = "alignments_filtered" if counter.filtered > 0: _write(outs, "alignments_duplicates", counter.alignments_duplicates, counter.alignments_filtered, normby) _write(outs, "alignments_unique", counter.alignments_filtered - counter.alignments_duplicates, counter.alignments_filtered, normby) ############################### ############################### ############################### # Output read based information 
############################### # derive the number of mapped reads in file from alignment counts if options.filename_fastq or not is_stdin: nreads_total = counter.total_read _write(outs, "reads_total", counter.total_read, nreads_total, 'reads_total') _write(outs, "reads_unmapped", counter.total_read_is_unmapped, nreads_total, 'reads_total') _write(outs, "reads_mapped", counter.total_read_is_mapped, nreads_total, 'reads_total') _write(outs, "reads_missing", counter.total_read_is_missing, nreads_total, 'reads_total') _write(outs, "reads_mapped_unique", counter.total_read_is_mapped_uniq, counter.total_read_is_mapped, 'reads_mapped') _write(outs, "reads_multimapping", counter.total_read_is_mmap, counter.total_read_is_mapped, 'reads_mapped') _write(outs, "reads_mapped_supplementary", counter.total_read_has_supplementary, counter.total_read_is_mapped, 'reads_mapped') else: E.warn('inferring read counts from alignments and NH tags') nreads_unmapped = flags_counts["unmapped"] nreads_mapped = computeMappedReadsFromAlignments( nalignments_mapped, nh_all, max_hi) nreads_missing = 0 if options.input_reads: nreads_total = options.input_reads # unmapped reads in bam file? if nreads_unmapped: nreads_missing = nreads_total - nreads_unmapped - nreads_mapped else: nreads_unmapped = nreads_total - nreads_mapped elif nreads_unmapped: # if unmapped reads are in bam file, take those nreads_total = nreads_mapped + nreads_unmapped else: # otherwise normalize by mapped reads nreads_unmapped = 0 nreads_total = nreads_mapped outs.write("reads_total\t%i\t%5.2f\treads_total\n" % (nreads_total, 100.0)) outs.write("reads_mapped\t%i\t%5.2f\treads_total\n" % (nreads_mapped, 100.0 * nreads_mapped / nreads_total)) outs.write("reads_unmapped\t%i\t%5.2f\treads_total\n" % (nreads_unmapped, 100.0 * nreads_unmapped / nreads_total)) outs.write("reads_missing\t%i\t%5.2f\treads_total\n" % (nreads_missing, 100.0 * nreads_missing / nreads_total)) if len(nh_all) > 1: outs.write("reads_unique\t%i\t%5.2f\treads_mapped\n" % (nh_all[1], 100.0 * nh_all[1] / nreads_mapped)) pysam_in.close() ############################### ############################### ############################### # Output pair information ############################### if flags_counts["read2"] > 0: if options.filename_fastq: pairs_mapped = counter.total_pair_is_mapped # sanity check assert counter.total_pair_is_mapped == \ (counter.total_pair_is_proper_uniq + counter.total_pair_is_incomplete_uniq + counter.total_pair_is_incomplete_mmap + counter.total_pair_is_proper_duplicate + counter.total_pair_is_proper_mmap + counter.total_pair_not_proper_uniq + counter.total_pair_is_other) outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" % (counter.total_pairs, 100.0 * counter.total_pairs / counter.total_pairs)) outs.write( "pairs_mapped\t%i\t%5.2f\tpairs_total\n" % (pairs_mapped, 100.0 * pairs_mapped / counter.total_pairs)) outs.write("pairs_unmapped\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_unmapped, 100.0 * counter.total_pair_is_unmapped / counter.total_pairs)) outs.write( "pairs_proper_unique\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_proper_uniq, 100.0 * counter.total_pair_is_proper_uniq / counter.total_pairs)) outs.write( "pairs_incomplete_unique\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_incomplete_uniq, 100.0 * counter.total_pair_is_incomplete_uniq / counter.total_pairs)) outs.write( "pairs_incomplete_multimapping\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_incomplete_mmap, 100.0 * counter.total_pair_is_incomplete_mmap / 
counter.total_pairs)) outs.write( "pairs_proper_duplicate\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_proper_duplicate, 100.0 * counter.total_pair_is_proper_duplicate / counter.total_pairs)) outs.write( "pairs_proper_multimapping\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_proper_mmap, 100.0 * counter.total_pair_is_proper_mmap / counter.total_pairs)) outs.write( "pairs_not_proper_unique\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_not_proper_uniq, 100.0 * counter.total_pair_not_proper_uniq / counter.total_pairs)) outs.write("pairs_other\t%i\t%5.2f\tpairs_total\n" % (counter.total_pair_is_other, 100.0 * counter.total_pair_is_other / counter.total_pairs)) nread1_total = counter.total_read1 _write(outs, "read1_total", counter.total_read1, nread1_total, 'read1_total') _write(outs, "read1_unmapped", counter.total_read1_is_unmapped, nread1_total, 'read1_total') _write(outs, "read1_mapped", counter.total_read1_is_mapped, nread1_total, 'read1_total') _write(outs, "read1_mapped_unique", counter.total_read1_is_mapped_uniq, counter.total_read1_is_mapped, 'read1_mapped') _write(outs, "reads_multimapping", counter.total_read1_is_mmap, counter.total_read1_is_mapped, 'read1_mapped') _write(outs, "read1_missing", counter.total_read1_is_missing, counter.total_read1_is_mapped, 'read1_total') nread2_total = counter.total_read2 _write(outs, "read2_total", counter.total_read2, nread2_total, 'read2_total') _write(outs, "read2_unmapped", counter.total_read2_is_unmapped, nread2_total, 'read2_total') _write(outs, "read2_mapped", counter.total_read2_is_mapped, nread2_total, 'read2_total') _write(outs, "read2_mapped_unique", counter.total_read2_is_mapped_uniq, counter.total_read2_is_mapped, 'read2_mapped') _write(outs, "reads_multimapping", counter.total_read2_is_mmap, counter.total_read2_is_mapped, 'read2_mapped') _write(outs, "read2_missing", counter.total_read2_is_missing, counter.total_read2_is_mapped, 'read2_total') else: # approximate counts pairs_total = nreads_total // 2 pairs_mapped = flags_counts["proper_pair"] // 2 _write(outs, "pairs_total", pairs_total, pairs_total, "pairs_total") _write(outs, "pairs_mapped", pairs_mapped, pairs_total, "pairs_total") else: # no paired end data pairs_total = pairs_mapped = 0 outs.write("pairs_total\t%i\t%5.2f\tpairs_total\n" % (pairs_total, 0.0)) outs.write("pairs_mapped\t%i\t%5.2f\tpairs_total\n" % (pairs_mapped, 0.0)) outs.write("error_rate\t%i\t%5.2f\tmatches+insertions\n" % (counter.error_counts, counter.error_rate * 100.0)) outs.write("insertion_rate\t%i\t%5.2f\tmatches+insertions\n" % (counter.insertion_counts, counter.insertion_rate * 100.0)) outs.write("deletion_rate\t%i\t%5.2f\tmatches+deletions\n" % (counter.deletion_counts, counter.deletion_rate * 100.0)) outs.write("mismatch_rate\t%i\t%5.2f\tmatches\n" % (counter.mismatch_counts, counter.mismatch_rate * 100.0)) outs.write("match_rate\t%i\t%5.2f\tmatches+insertions\n" % (counter.match_counts, counter.match_rate * 100.0)) if options.force_output or len(nm_filtered) > 0: outfile = E.open_output_file("nm", "w") outfile.write("NM\talignments\n") if len(nm_filtered) > 0: for x in range(0, max(nm_filtered.keys()) + 1): outfile.write("%i\t%i\n" % (x, nm_filtered[x])) else: outfile.write("0\t%i\n" % (counter.filtered)) outfile.close() if options.force_output or len(nh_all) > 1: outfile = E.open_output_file("nh_all", "w") outfile.write("NH\treads\n") if len(nh_all) > 0: writeNH(outfile, nh_all, max_hi) else: # assume all are unique if NH flag not set outfile.write("1\t%i\n" % (counter.mapped_reads)) 
outfile.close() if options.force_output or len(nh_filtered) > 1: outfile = E.open_output_file("nh", "w") outfile.write("NH\treads\n") if len(nh_filtered) > 0: writeNH(outfile, nh_filtered, max_hi) else: # assume all are unique if NH flag not set outfile.write("1\t%i\n" % (counter.filtered)) outfile.close() if options.force_output or len(mapq_all) > 1: outfile = E.open_output_file("mapq", "w") outfile.write("mapq\tall_reads\tfiltered_reads\n") for x in range(0, max(mapq_all.keys()) + 1): outfile.write("%i\t%i\t%i\n" % (x, mapq_all[x], mapq[x])) outfile.close() if details_df is not None: with E.open_output_file("summaries", "w") as outf: details_df.describe().transpose().to_csv(outf, sep="\t", index_label="metric") bins = numpy.arange(0, 1.01, 0.01) histogram_df = pandas.DataFrame.from_items([ (x, numpy.histogram(details_df[x].dropna(), bins=bins)[0]) for x in details_df.columns ]) histogram_df.index = numpy.arange(0, 1.0, 0.01) row_sums = histogram_df.sum(axis=1) histogram_df = histogram_df[row_sums != 0] with E.open_output_file("histogram", "w") as outf: histogram_df.to_csv(outf, sep="\t", index_label="bin") # write footer and output benchmark information. E.stop()
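# ---------------------------------------------------------------------------
# A minimal sketch of the read-count inference performed above when no fastq
# file is supplied: if every alignment carries an NH tag, a read with NH=h
# contributes h alignments, so dividing the per-NH alignment counts by h
# approximates the number of mapped reads.  This is illustrative only and is
# not the script's computeMappedReadsFromAlignments().
def _sketch_reads_from_nh(nh_counts):
    """nh_counts: dict mapping NH value -> number of alignments with that NH."""
    return int(sum(count / nh for nh, count in nh_counts.items() if nh > 0))

# Example: {1: 1000, 2: 40, 4: 8} -> 1000 + 20 + 2 = 1022 mapped reads.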
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--trna-scheme", dest="trna_scheme", type="choice", choices=("tDR-5'", "tRH-DA"), help="name of the tRNA scheme to make bed file for[default=%default]") parser.set_defaults(trna_scheme=None) (options, args) = E.start(parser, argv=argv) if len(args) == 0: args.append("-") E.info(options.stdin) outfile = IOTools.open_file(options.stdout.name, "w") trna_options = [ "tRH-5'", "tRH-DA", "tRH-DTA", "tRH-AT", "tRH-3'", "tRF-5'", "tRF-3'", "tRF-D", "tRF-DA", "tRF-A", "tRF-AT", "tRF-T" ] for trna in trna_options: infile = IOTools.open_file(options.stdin.name) iterator = FastaIterator.FastaIterator(infile) d = collections.OrderedDict() cluster_dict = dict() # first iterate over the fasta file for cur_record in iterator: title = cur_record.title m = re.match("(cluster\d+):chr\S+.tRNA\d+-(\S+)-\((\S+)\)", title) cluster = m.group(1) trna_group = m.group(2) strand = m.group(3) chrom = cluster + ":" + trna_group + "-" score = "." print(trna) if trna == "tRH-5'": start = "1" end = "33" elif trna == "tRH-DA": start = "14" end = "43" elif trna == "tRH-DTA": start = "17" end = "54" elif trna == "tRH-AT": start = "38" end = "69" elif trna == "tRH-3'": start = "43" end = "73" elif trna == "tRF-5'": start = "1" end = "15" elif trna == "tRF-3'": start = "58" end = "73" elif trna == "tRF-D": start = "8" end = "23" elif trna == "tRF-DA": start = "20" end = "35" elif trna == "tRF-A": start = "27" end = "42" elif trna == "tRF-AT": start = "33" end = "53" elif trna == "tRF-T": start = "45" end = "71" else: print("tRNA fragment not implemented") break outfile.write(("%s\t%s\t%s\t%s\t%s\t%s\n") % (chrom, start, end, trna, score, strand)) E.stop()
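# ---------------------------------------------------------------------------
# The if/elif chain above maps each tRNA fragment scheme to fixed start/end
# positions.  The sketch below expresses the same table as a dictionary
# (coordinates copied from the code above); it is illustrative only and is
# not used by the script.
_TRNA_FRAGMENT_COORDS = {
    "tRH-5'": (1, 33), "tRH-DA": (14, 43), "tRH-DTA": (17, 54),
    "tRH-AT": (38, 69), "tRH-3'": (43, 73), "tRF-5'": (1, 15),
    "tRF-3'": (58, 73), "tRF-D": (8, 23), "tRF-DA": (20, 35),
    "tRF-A": (27, 42), "tRF-AT": (33, 53), "tRF-T": (45, 71),
}

def _sketch_fragment_bed_line(chrom, fragment, strand, score="."):
    """Return one BED-style line for a named tRNA fragment, or None if unknown."""
    coords = _TRNA_FRAGMENT_COORDS.get(fragment)
    if coords is None:
        return None
    start, end = coords
    return "%s\t%i\t%i\t%s\t%s\t%s" % (chrom, start, end, fragment, score, strand)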
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_argument("-g", "--gtf-file", dest="filename_gtf", type=str, help="filename with gene models in gtf format ") parser.add_argument("-m", "--filename-mismapped", dest="filename_mismapped", type=str, help="output bam file for mismapped reads ") parser.add_argument("-j", "--junctions-bed-file", dest="filename_junctions", type=str, help="bam file with reads mapped across junctions ") parser.add_argument("-r", "--filename-regions", dest="filename_regions", type=str, help="filename with regions to remove in bed format ") parser.add_argument("-t", "--transcripts-gtf-file", dest="filename_transcriptome", type=str, help="bam file with reads mapped against transcripts ") parser.add_argument("-p", "--map-tsv-file", dest="filename_map", type=str, help="filename mapping transcript numbers (used by " "--filename-transciptome) to transcript names " "(used by --filename-gtf) ") parser.add_argument("-s", "--filename-stats", dest="filename_stats", type=str, help="filename to output stats to ") parser.add_argument( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) ") parser.add_argument("-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches ") parser.add_argument("-c", "--remove-contigs", dest="remove_contigs", type=str, help="','-separated list of contigs to remove ") parser.add_argument("-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files ") parser.add_argument("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely ") parser.add_argument("--output-sam", dest="output_sam", action="store_true", help="output in sam format ") parser.set_defaults( filename_gtf=None, filename_mismapped=None, filename_junctions=None, filename_transcriptome=None, filename_map=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, output_sam=False, filename_table=None, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv) if len(args) != 1: raise ValueError("please supply one bam file") bamfile_genome = args[0] genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb") if options.remove_contigs: options.remove_contigs = options.remove_contigs.split(",") if options.filename_map: E.info("reading map") id_map = iotools.read_map(iotools.open_file(options.filename_map), has_header=True) id_map = dict([(y, x) for x, y in id_map.items()]) else: id_map = None transcripts = {} if options.filename_gtf: E.info("indexing geneset") mapped, missed = 0, 0 for gtf in GTF.transcript_iterator( GTF.iterator(iotools.open_file(options.filename_gtf))): gtf.sort(key=lambda x: x.start) transcript_id = gtf[0].transcript_id if id_map: try: transcript_id = id_map[transcript_id] mapped += 1 except KeyError: missed += 1 continue transcripts[transcript_id] = gtf E.info("read %i transcripts from geneset (%i mapped, %i missed)" % (len(transcripts), mapped, missed)) regions_to_remove = None if options.filename_regions: E.info("indexing regions") regions_to_remove = IndexedGenome.Simple() for bed in Bed.iterator(iotools.open_file(options.filename_regions)): regions_to_remove.add(bed.contig, bed.start, bed.end) E.info("read %i regions" % len(regions_to_remove)) if options.filename_transcriptome: transcripts_samfile = pysam.AlignmentFile( options.filename_transcriptome, "rb") else: transcripts_samfile = None if options.output_sam: output_samfile = pysam.AlignmentFile("-", "wh", template=genome_samfile) else: output_samfile = pysam.AlignmentFile("-", "wb", template=genome_samfile) if options.filename_mismapped: if not options.force and os.path.exists(options.filename_mismapped): raise IOError("output file %s already exists" % options.filename_mismapped) output_mismapped = pysam.AlignmentFile(options.filename_mismapped, "wb", template=genome_samfile) else: output_mismapped = None if options.filename_junctions: junctions_samfile = pysam.AlignmentFile(options.filename_junctions, "rb") else: junctions_samfile = None c = bams2bam_filter(genome_samfile, output_samfile, output_mismapped, transcripts_samfile, junctions_samfile, transcripts, regions=regions_to_remove, unique=options.unique, remove_contigs=options.remove_contigs, colour_mismatches=options.colour_mismatches, ignore_mismatches=options.ignore_mismatches, ignore_transcripts=transcripts_samfile is None, ignore_junctions=junctions_samfile is None) if options.filename_stats: outf = iotools.open_file(options.filename_stats, "w") outf.write("category\tcounts\n%s\n" % c.asTable()) outf.close() if options.filename_transcriptome: transcripts_samfile.close() genome_samfile.close() output_samfile.close() if output_mismapped: output_mismapped.close() # write footer and output benchmark information. E.stop()
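# ---------------------------------------------------------------------------
# A minimal sketch of the identifier-map handling in the main() above: a
# two-column tab-separated file is read and the mapping inverted, so that
# look-ups go from the second column back to the first (the script uses this
# to translate gtf transcript names to the identifiers in the transcriptome
# BAM).  The file layout assumed here (key<TAB>value, one header line)
# mirrors iotools.read_map(has_header=True) but is a simplification.
def _sketch_read_inverted_map(path, has_header=True):
    inverted = {}
    with open(path) as infile:
        if has_header:
            next(infile, None)
        for line in infile:
            fields = line.rstrip("\n").split("\t")
            if len(fields) >= 2:
                inverted[fields[1]] = fields[0]
    return inverted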
def main(argv=sys.argv): parser = E.ArgumentParser() parser.add_argument("--version", action='version', version='%(prog)s {version}'.format(version="1.0")) parser.add_argument("-t", "--no-titles", dest="input_has_titles", action="store_false", help="no titles in input.") parser.add_argument("--ignore-titles", dest="ignore_titles", action="store_true", help="ignore titles in input") parser.add_argument("-i", "--skip-titles", dest="skip_titles", action="store_true", help="skip output of titles.") parser.add_argument("-m", "--missing-value", dest="missing_value", type=str, help="entry to use for missing values.") parser.add_argument("--header-names", dest="headers", type=str, help="add headers for files as a ,-separated " "list.") parser.add_argument("-c", "--columns", dest="columns", type=str, help="columns to use for joining. Multiple columns " "can be specified as a comma-separated list ") parser.add_argument("-k", "--take", dest="take", type=str, action="append", help="columns to take. If not set, all columns " "except for " "the join columns are taken") parser.add_argument("-g", "--glob", dest="glob", type=str, help="wildcard expression for table names.") parser.add_argument( "-s", "--sort-order", dest="sort", type=str, help="sort by column titles in particular given order: " "alphabetical|numeric|list of columns.") parser.add_argument("-e", "--merge-overlapping", dest="merge", action="store_true", help="simply merge tables without matching up " "rows.") parser.add_argument("-a", "--cat", dest="cat", type=str, help="simply concatenate tables. Adds an " "additional column called X with the filename ") parser.add_argument("--sort-keys", dest="sort_keys", type=str, choices=("numeric", "alphabetic"), help="sort key columns by value.") parser.add_argument("--keep-empty", dest="ignore_empty", action="store_false", help="keep empty tables. The default is " "to ignore them.") parser.add_argument("--ignore-empty", dest="ignore_empty", action="store_true", help="ignore empty tables - this is " "the default.") parser.add_argument("--add-file-prefix", dest="add_file_prefix", action="store_true", help="add file prefix to " "columns headers. Suitable for multi-column" "tables") parser.add_argument("--use-file-prefix", dest="use_file_prefix", action="store_true", help="use file prefix as column headers. " "Suitable for two-column tables ") parser.add_argument("--prefixes", dest="prefixes", type=str, help="list of prefixes to use. " ", separated list of prefixes. 
" "The number of prefixes need to correspond to the " "number of input files") parser.add_argument("--regex-filename", dest="regex_filename", type=str, help="pattern to apply to filename to " "build prefix") parser.add_argument("--regex-start", dest="regex_start", type=str, help="regular expression to start " "collecting table in a file") parser.add_argument("--regex-end", dest="regex_end", type=str, help="regular expression to end collecting " "table in a file") parser.add_argument("--test", dest="test", type=int, help="test combining tables with " "first X rows") parser.set_defaults( input_has_titles=True, skip_titles=False, missing_value="na", headers=None, sort=None, glob=None, columns="1", sort_keys=False, merge=False, ignore_empty=True, regex_start=None, regex_end=None, add_file_prefix=False, use_file_prefix=False, cat=None, take=[], regex_filename="(.*)", prefixes=None, test=0, ) (args, unknown) = E.start(parser, argv=argv, unknowns=True) if args.headers: if "," in args.headers: args.headers = args.headers.split(",") else: args.headers = re.split("\s+", args.headers.strip()) if args.sort and args.sort not in ("numeric", "alphabetic"): if "," in args.sort: args.sort = args.sort.split(",") else: args.sort = re.split("\s+", args.sort) if args.merge: args.columns = [] else: args.columns = [int(x) - 1 for x in args.columns.split(",")] args.filenames = [] if args.glob: args.filenames += glob.glob(args.glob) args.filenames += unknown if len(args.filenames) < 1: raise ValueError("no tables found.") E.info("combining %i tables" % len(args.filenames)) if args.cat: concatenate_tables(args.stdout, args, unknown) else: join_tables(args.stdout, args, unknown) E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--inplace", dest="inplace", action="store_true", help="update option list in place. New options will" "be added to the list given by --options-tsv-file. " "Options will only be added, not removed ") parser.add_argument("--options-tsv-file", dest="tsv_file", type=str, help="existing table with options. Will be updated if " "--in-place is set [default]") parser.set_defaults(inplace=False, tsv_file=None) # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) old_options = None if args.tsv_file: if not os.path.exists(args.tsv_file): raise OSError("filename %s not found, see --options-tsv-file" % args.tsv_file) old_options = pandas.read_csv( iotools.open_file(args.tsv_file), sep="\t", index_col=0, ) old_options = old_options.fillna("") global ORIGINAL_START ORIGINAL_START = E.start all_options = collections.defaultdict(list) for label, expression in EXPRESSIONS: files = glob.glob(expression) files.sort() for f in files: E.debug("processing %s" % f) if os.path.isdir(f): continue if os.path.basename(f) in EXCLUDE: continue collected_options = collectOptionsFromScript(os.path.abspath(f)) for o in collected_options: all_options[o].append(f) # add old options for x in old_options.index: if x not in all_options: all_options[x].append("--") if args.inplace: outfile = iotools.open_file(args.tsv_file, "w") E.info("updating file '%s'" % args.tsv_file) else: outfile = args.stdout outfile.write("option\taction\tcomment\talternative\tfiles\n") for o, v in sorted(all_options.items()): try: action, comment, alternative, ff = old_options.xs(o) except KeyError: action, comment, alternative, ff = "", "", "", "" if comment == "nan": comment = "" if alternative == "nan": alternative = "" outfile.write("\t".join( (list(map(str, (o, action, comment, alternative, ",".join(v)))))) + "\n") if outfile != args.stdout: outfile.close() # write footer and output benchmark information. E.stop()
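# ---------------------------------------------------------------------------
# A minimal sketch of the accumulation pattern used above: each option name
# is mapped to the list of scripts it was collected from, so the final table
# can report where every option occurs.  Illustrative only.
import collections

def _sketch_accumulate_options(per_file_options):
    """per_file_options: iterable of (filename, [option names]) pairs."""
    all_options = collections.defaultdict(list)
    for filename, names in per_file_options:
        for name in names:
            all_options[name].append(filename)
    return all_options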
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument("-o", "--min-overlap", dest="min_overlap", type=int, help="minimum overlap") parser.add_argument( "-w", "--pattern-window", dest="pattern_window", type=str, help="regular expression to extract window coordinates from " "test id ") parser.add_argument("-i", "--invert", dest="invert", action="store_true", help="invert direction of fold change ") parser.set_defaults(min_overlap=10, invert=False, pattern_window="(\S+):(\d+)-(\d+)"), # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv, add_output_options=True) outfiles = iotools.FilePool(args.output_filename_pattern) if args.invert: test_f = lambda l2fold: l2fold < 0 else: test_f = lambda l2fold: l2fold > 0 def read(): rx_window = re.compile(args.pattern_window) # filter any of the DESeq/EdgeR message that end up at the top of the # output file for data in iotools.iterate(args.stdin): contig, start, end = rx_window.match(data.test_id).groups() start, end = list(map(int, (start, end))) yield DATA._make( (data.test_id, contig, start, end, data.treatment_name, float(data.treatment_mean), float(data.treatment_std), data.control_name, float(data.control_mean), float(data.control_std), float(data.pvalue), float(data.qvalue), float(data.l2fold), float(data.fold), int(data.significant), data.status, 0)) def grouper(data, distance=10): last = next(data) entries = [last] while 1: d = next(data) if d is None: break if d.contig == last.contig and d.start < last.start: raise ValueError("error not sorted by start") if ((d.contig != last.contig) or (d.start - last.end > distance) or (d.status != last.status) or (d.significant != last.significant) or (d.l2fold * last.l2fold < 0)): yield entries entries = [] entries.append(d) last = d yield entries counter = E.Counter() args.stdout.write("\t".join(DATA._fields) + "\n") # set of all sample names - used to create empty files samples = set() # need to sort by coordinate all_data = list(read()) all_data.sort(key=lambda x: (x.contig, x.start)) group_id = 0 for group in grouper(iter(all_data), distance=args.min_overlap): group_id += 1 start, end = group[0].start, group[-1].end assert start < end, 'start > end: %s' % str(group) n = float(len(group)) counter.input += n g = group[0] if g.l2fold < 0: l2fold = max([x.l2fold for x in group]) fold = max([x.fold for x in group]) else: l2fold = min([x.l2fold for x in group]) fold = min([x.fold for x in group]) outdata = DATA._make( (str(group_id), g.contig, start, end, g.treatment_name, sum([x.treatment_mean for x in group]) / n, max([x.treatment_std for x in group]), g.control_name, sum([x.control_mean for x in group]) / n, max([x.control_std for x in group]), max([x.pvalue for x in group]), max([x.qvalue for x in group]), l2fold, fold, g.significant, g.status, int(n))) samples.add(g.treatment_name) samples.add(g.control_name) if g.significant: if test_f(g.l2fold): # treatment lower methylation than control outfiles.write( g.treatment_name, "%s\t%i\t%i\t%i\t%f\n" % (g.contig, g.start, g.end, group_id, sum([x.treatment_mean for x in group]) / n)) else: outfiles.write( g.control_name, "%s\t%i\t%i\t%i\t%f\n" % (g.contig, g.start, g.end, group_id, sum([x.control_mean for x in group]) / n)) args.stdout.write("\t".join(map(str, outdata)) + "\n") 
counter.output += 1 # create empty files for sample in samples: outfiles.write(sample, "") outfiles.close() E.info("%s" % counter) # write footer and output benchmark information. E.stop()
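# ---------------------------------------------------------------------------
# A minimal sketch of the merging rule implemented by grouper() above:
# consecutive windows stay in one group while they are on the same contig,
# start within `distance` bases of the previous window's end and keep the
# same sign of the log2 fold change (the real grouper() also checks status
# and significance).  The (contig, start, end, l2fold) tuples used here are a
# simplification of the DATA records above.
def _sketch_merge_windows(windows, distance=10):
    groups, current = [], []
    for contig, start, end, l2fold in windows:
        if current:
            last_contig, _last_start, last_end, last_l2fold = current[-1]
            if (contig != last_contig or start - last_end > distance
                    or l2fold * last_l2fold < 0):
                groups.append(current)
                current = []
        current.append((contig, start, end, l2fold))
    if current:
        groups.append(current)
    return groups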
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-u", "--use", dest="use", type="choice", choices=("pval", "padj"), help="Type of p-value to use for clade size") parser.add_option("-m", "--taxa-map", dest="taxa_map", type="string", help="Taxa mapping file - basically a text tree") parser.add_option("-l", "--highest-level", dest="highest_level", type="string", help="highest taxonomic level to visualise") parser.add_option( "-f", "--filter", dest="filter", action="store_true", help="do you want to filter? will filter based on highest-level") parser.add_option("-k", "--keep", dest="keep", type="string", help="keep all clades below these") parser.add_option( "--additional-labels", dest="additional_labels", type="string", help= "by default just the highest level labels are shown. Here you can add additional labels" ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) # read tree tree = readTree(options.taxa_map, options.highest_level) # filter if neccessary new_tree = {} keep = set() if options.filter: assert options.keep, "must specify which clades to keep" to_keep = options.keep.split(",") for t in to_keep: new_tree[t] = tree[t] else: assert len( list(tree.keys()) ) < 159, "not enough colours to support n = %i clades, please filter" % len( tree) new_tree = tree #get those to keep keep = set() for taxon, rest in new_tree.items(): keep.add(taxon) for r in rest: keep.add(r) # get colours ncols = len(list(new_tree.keys())) colours = getColours(ncols) taxon2colour = {} for i in range(ncols): taxon2colour[list(new_tree.keys())[i]] = colours[i] # add in colours for all clade nodes that is # based on the highest node for h, r in new_tree.items(): for taxon in r: taxon2colour[taxon] = taxon2colour[h] # read diff and output annotations result = collections.defaultdict(list) ps = [] fcs = [] taxa = [] colours = [] shapes = [] sig = [] for line in options.stdin.readlines(): # skip header if "taxa" and "log2FoldChange" in line: continue data = line.strip("\n").split("\t") taxon = data[0] if taxon not in keep: continue # assign colour if taxon in list(taxon2colour.keys()): colour = taxon2colour[taxon] else: colour = "NA" colours.append(colour) # append taxon to list taxa.append(taxon) # use -log10 p-value for clade size if options.use == "pval": column = 5 elif options.use == "padj": column = 6 else: raise ValueError("must use pval or padj, not %s for clade size" % options.use) # catch NA pvalues if data[column] == "NA": data[column] = 1 p = -math.log10(float(data[column])) if p >= 1.3: shapes.append("*") sig.append(taxon) else: shapes.append("o") p = p * 100 result[taxon].append(p) ps.append(p) # fold changes if data[2] == "NA": fc = 0 else: fc = data[2] fcs.append(fc) # output annotations options.stdout.write("%s\t%s\n" % ("clade_separation", "0.9")) for t, s in zip(taxa, shapes): options.stdout.write("%s\t%s\t%s\n" % (t, "clade_marker_shape", s)) for t, c in taxon2colour.items(): options.stdout.write("%s\t%s\t%s\n" % (t, "clade_marker_color", c)) for t, c in taxon2colour.items(): options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_background_color", c)) for t, c in taxon2colour.items(): options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_font_size", 12)) for t, p, f in zip(taxa, ps, fcs): if t in sig and float(f) > 0: 
options.stdout.write("%s\t%s\t%s\n" % (t, "clade_marker_color", "r")) options.stdout.write("%s\t%s\t%f\n" % (t, "clade_marker_size", 200)) options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_background_color", "r")) options.stdout.write("%s\t%s\t%s\t%s\n" % (t, "ring_height", 1, f)) options.stdout.write("%s\t%s\t%s\t%s\n" % (t, "ring_color", 1, "r")) options.stdout.write("%s\t%s\t%s\t%s\n" % (t, "ring_alpha", 1, 0.5)) elif t in sig and float(f) < 0: options.stdout.write("%s\t%s\t%s\n" % (t, "clade_marker_color", "b")) options.stdout.write("%s\t%s\t%f\n" % (t, "clade_marker_size", 200)) options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_background_color", "b")) options.stdout.write("%s\t%s\t%s\t%s\n" % (t, "ring_height", 1, float(f) * -1)) options.stdout.write("%s\t%s\t%s\t%s\n" % (t, "ring_color", 1, "b")) options.stdout.write("%s\t%s\t%s\t%s\n" % (t, "ring_alpha", 1, 0.5)) elif t not in sig: options.stdout.write("%s\t%s\t%f\n" % (t, "clade_marker_size", p)) # only output annotation for highest-level and # additional labels if options.additional_labels: additional_labels = options.additional_labels.split(",") else: additional_labels = [] for t, p in zip(taxa, ps): if t in additional_labels: # if "_" in t: # a = "".join(random.sample(list(string.ascii_lowercase),2)) + ":" + t.split(".")[-1] # else: a = t.split(".")[-1] options.stdout.write("%s\t%s\t%s\n" % (t, "annotation", a)) options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_rotation", 90)) options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_font_size", 8)) elif t in list(tree.keys()): a = t.split(".")[-1] options.stdout.write("%s\t%s\t%s\n" % (t, "annotation", a)) options.stdout.write("%s\t%s\t%s\n" % (t, "annotation_font_size", 12)) # write the tree out outf = open("input_tree.txt", "w") for x, y in new_tree.items(): for taxon in y: outf.write(taxon + "\n") outf.close() # write footer and output benchmark information. E.stop()
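# ---------------------------------------------------------------------------
# A minimal sketch of the clade-size rule used above: p-values are converted
# to -log10 scores, a score of at least 1.3 (i.e. p <= 0.05) marks the clade
# as significant, and the score scaled by 100 is used as a marker size for
# non-significant clades.  Illustrative only.
import math

def _sketch_clade_marker(pvalue):
    """Return (marker_size, is_significant) for a p-value; 'NA' is treated as 1."""
    if pvalue in (None, "NA"):
        pvalue = 1.0
    score = -math.log10(float(pvalue))
    return score * 100.0, score >= 1.3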
def main(argv=None): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", choices=[ "mutation-profile-bar-plot", "depth-profile-line-plot", "manhattan-plot" ], help="methods to apply [%default]") parser.add_option("-t", "--transformation", dest="transformations", type="choice", action="append", choices=["log-depth-ratio"], help="dataframe transformation options [%default]") parser.add_option("-r", "--regex-filename", dest="regex_filename", type="string", help="[%default]") parser.add_option("-f", "--reference-fasta-file", dest="reference_fasta_file", help="reference genomic sequence in fasta format. " "[%default]") parser.add_option("--input-file-format", dest="input_file_format", type="choice", choices=("tsv", "bcftools-query"), help="input file format " "[%default]") parser.add_option( "--plot-options", dest="plot_options", type="string", help="plot options to pass through to the plotter. The string is " "eval'ed, for example: --plot-options='window_size=3, ylabel=\"12\"' " "[%default]") parser.set_defaults( method=None, reference_fasta=None, input_file_format="tsv", plot_options=None, transformations=[], ) (options, args) = E.start(parser, argv=argv, add_output_options=True) filenames = args if len(filenames) == 0: E.info("reading from stdin") filenames = [options.stdin] if options.plot_options is not None: plot_options = eval("dict({})".format(options.plot_options)) else: plot_options = {} for index, filename in enumerate(filenames): E.info("working on {}".format(filename)) try: if options.input_file_format == "bcftools-query": # for bctools query, header starts with "#". dataframe = pandas.read_csv(filename, sep="\t", skip_blank_lines=False, header=0, dtype={"CHROM": str}) # names are of format [1]sample1:DP, extract sample1 dataframe.columns = ([ re.search("\[\d+\]([^:]+)", x).groups()[0] for x in dataframe.columns ]) else: dataframe = pandas.read_csv(filename, sep="\t", dtype={"CHROM": str}) except pandas.io.common.EmptyDataError: E.warn("no data in {}, skipped".format(filename)) continue E.info("read data from {}".format(filename)) if options.regex_filename: section = re.search(options.regex_filename, filename).groups()[0] else: section = "{}".format(index + 1) for method in options.transformations: if method == "log-depth-ratio": dataframe = compute_log_depth_ratio(dataframe) if dataframe.empty: E.warn("dataframe from {} is empty - skipped".format(filename)) continue if options.method == "mutation-profile-bar-plot": plot_mutation_profile_bar_plot(dataframe, section, **plot_options) elif options.method == "depth-profile-line-plot": plot_depth_profile_plot(dataframe, section, **plot_options) elif options.method == "manhattan-plot": plot_manhattan_plot(dataframe, section, filename_fasta=options.reference_fasta_file, **plot_options) E.stop()
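# ---------------------------------------------------------------------------
# A minimal sketch of the bcftools-query header clean-up in the main() above:
# column names such as "[1]sampleA:DP" are reduced to the bare sample name.
# Illustrative only; the sample names are made up.
import re

def _sketch_clean_bcftools_columns(columns):
    return [re.search(r"\[\d+\]([^:]+)", name).groups()[0] for name in columns]

# Example: ["[1]sampleA:DP", "[2]sampleB:DP"] -> ["sampleA", "sampleB"]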
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument("-g", "--genome-file", dest="genome_file", type=str, help="filename with genome (indexed).") parser.add_argument("-w", "--windows-bed-file", dest="filename_windows", type=str, help="gff file with windows to use.") parser.add_argument("-d", "--filename-data", dest="filename_data", type=str, help="gff file with data to use.") parser.add_argument("--is-gtf", dest="is_gtf", action="store_true", help="filename-data is gtf file") parser.add_argument("-f", "--features", dest="features", type=str, action="append", choices=("GC", ), help="features to compute.") parser.add_argument("-c", "--decorator", dest="decorator", type=str, choices=("counts", "gc", "gc3", "mean-length", "median-length", "percent-coverage", "median-score", "mean-score", "stddev-score", "min-score", "max-score"), help="decorators to use.") parser.add_argument("-e", "--skip-empty", dest="skip_empty", action="store_true", help="skip empty windows.") parser.add_argument( "-t", "--transform", dest="transform", type=str, choices=("none", "overlap", "complement", "third_codon"), help="transform to use when mapping overlapping regions onto window.") parser.set_defaults( genome_file=None, filename_windows=None, filename_data=None, features=[], skip_empty=False, decorator="counts", transform="none", is_gtf=False, ) (args) = E.start(parser) # test_transform_third_codon() if not args.filename_windows: raise ValueError("please supply a gff file with window information.") if args.loglevel >= 1: args.stdlog.write("# reading windows...") args.stdlog.flush() windows = GTF.readAsIntervals( GTF.iterator(iotools.open_file(args.filename_windows, "r"))) if args.loglevel >= 1: args.stdlog.write("done\n") args.stdlog.flush() if args.filename_data: if args.loglevel >= 1: args.stdlog.write("# reading data...") args.stdlog.flush() if args.is_gtf: gff_data = GTF.readFromFile( iotools.open_file(args.filename_data, "r")) else: gff_data = GTF.readFromFile( iotools.open_file(args.filename_data, "r")) if args.loglevel >= 1: args.stdlog.write("done\n") args.stdlog.flush() data_ranges = GTF.SortPerContig(gff_data) else: # use windows to compute properties # by supplying no data and asking for the complement = original window gff_data = None data_ranges = None args.transform = "complement" map_contig2size = {} if args.genome_file: fasta = IndexedFasta.IndexedFasta(args.genome_file) map_contig2size = fasta.getContigSizes() else: for contig, values in list(windows.items()): map_contig2size[contig] = max([x[1] for x in values]) fasta = None contigs = list(map_contig2size.keys()) contigs.sort() # proceed contig wise noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0 args.stdout.write("\t".join( map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1", "l1", "n2", "l2", "score", "extra_info"))) + "\n") for contig in contigs: skip = False if contig not in windows: ncontigs_skipped_windows += 1 skip = True if data_ranges and contig not in data_ranges: ncontigs_skipped_data += 1 skip = True if skip: continue noutput_contigs += 1 if data_ranges: annotateWindows( contig, windows[contig], gff_data[data_ranges[contig][0]:data_ranges[contig][1]], fasta, args) else: annotateWindows(contig, windows[contig], [], fasta, args) E.info( "ninput_windows=%i, noutput_contigs=%i, 
ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i" % (len(windows), noutput_contigs, len(contigs), ncontigs_skipped_windows, ncontigs_skipped_data)) E.stop()
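# ---------------------------------------------------------------------------
# A minimal sketch of the fallback used in the main() above when no indexed
# genome is given: each contig's size is approximated by the largest window
# end seen on that contig.  Illustrative only.
def _sketch_contig_sizes(windows):
    """windows: dict mapping contig -> list of (start, end) intervals."""
    return {contig: max(end for _start, end in intervals)
            for contig, intervals in windows.items()}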
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string", help="UCSC genome identifier [default=%default].") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("--extend", dest="extension", type="int", help="extend tags by this number of bases " "[default=%default].") parser.add_option("--shift-size", dest="shift", type="int", help="shift tags by this number of bases " "[default=%default].") parser.add_option("--window-size", dest="window_size", type="int", help="window size to be used in the analysis" "[default=%default].") parser.add_option("--saturation-iterations", dest="saturation_iterations", type="int", help="iterations for saturation analysis " "[default=%default].") parser.add_option("-t", "--toolset", dest="toolset", type="choice", action="append", choices=("saturation", "coverage", "enrichment", "dmr", "rms", "rpm", "all", "convert"), help="actions to perform [default=%default].") parser.add_option("-w", "--bigwig-file", dest="bigwig", action="store_true", help="store wig files as bigwig files - requires a " "genome file [default=%default]") parser.add_option("--treatment", dest="treatment_files", type="string", action="append", help="BAM files for treatment. At least one is required " "[%default]") parser.add_option("--control", dest="control_files", type="string", action="append", help="BAM files for control for differential " "methylation analysis. Optional [%default].") parser.add_option("--input", dest="input_files", type="string", action="append", help="BAM files for input correction. " "Optional [%default].") parser.add_option("--is-not-medip", dest="is_medip", action="store_false", help="data is not MeDIP data and is not expected " "to fit the calibration model. No CpG " "density normalized rms data is computed" "[default=%default].") parser.add_option("--output-rdata", dest="output_rdata", action="store_true", help="in dmr analysis, write R session to file. " "The file name " "is given by --ouptut-filename-pattern [%default].") parser.add_option("--rdata-file", dest="input_rdata", type="string", help="in dmr analysis, read saved R session from " "file. 
This can be used to apply different " "filters [%default]") parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float", help="FDR threshold to apply for selecting DMR " "[default=%default].") parser.add_option("--fdr-method", dest="fdr_method", type="choice", choices=("bonferroni", "BH", "holm", "hochberg", "hommel", "BY", "fdr", "none"), help="FDR method to apply for selecting DMR " "[default=%default].") parser.add_option("--bwa", dest="bwa", action="store_true", help="alignment generated with bwa" "[default=%default].") parser.add_option("--unique", dest="unique", type="float", help="Threshold p-value to determine which read pile\ ups are the result of PCR overamplification" "[default=%default].") parser.add_option("--chroms", dest="chroms", type="str", help="Comma delimited list of chromosomes to include" "[default=%default].") parser.set_defaults( input_format="bam", ucsc_genome="Hsapiens.UCSC.hg19", genome_file=None, extend=0, shift=0, window_size=300, saturation_iterations=10, toolset=[], bigwig=False, treatment_files=[], control_files=[], input_files=[], output_rdata=False, input_rdata=None, is_medip=True, fdr_threshold=0.1, fdr_method="BH", bwa=False, unique=0.001, chroms=None ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if "convert" in options.toolset: results = [] for line in CSV.DictReader(options.stdin, dialect="excel-tab"): if line['edgeR.p.value'] == "NA": continue # assumes only a single treatment/control treatment_name = options.treatment_files[0] control_name = options.control_files[0] status = "OK" try: results.append( Expression.GeneExpressionResult._make(( "%s:%i-%i" % (line['chr'], int(line['start']), int(line['stop'])), treatment_name, float(line['MSets1.rpkm.mean']), 0, control_name, float(line['MSets2.rpkm.mean']), 0, float(line['edgeR.p.value']), float(line['edgeR.adj.p.value']), float(line['edgeR.logFC']), math.pow(2.0, float(line['edgeR.logFC'])), float(line['edgeR.logFC']), # no transform ["0", "1"][float(line['edgeR.adj.p.value']) < options.fdr_threshold], status))) except ValueError as msg: raise ValueError("parsing error %s in line: %s" % (msg, line)) Expression.writeExpressionResults(options.stdout, results) return if len(options.treatment_files) < 1: raise ValueError("please specify a filename with sample data") if options.bigwig and not options.genome_file: raise ValueError("please provide a genome file when outputting bigwig") if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contig_sizes = fasta.getContigSizes() if len(options.toolset) == 0: options.toolset = ["all"] do_all = "all" in options.toolset if options.chroms is None: chrstring = "" else: chroms = options.chroms.split(",") chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms) # load MEDIPS R.library('MEDIPS') genome_file = 'BSgenome.%s' % options.ucsc_genome R.library(genome_file) window_size = options.window_size extend = options.extend shift = options.shift saturation_iterations = options.saturation_iterations uniq = float(options.unique) if options.bwa is True: BWA = "TRUE" else: BWA = "FALSE" if "saturation" in options.toolset or do_all: E.info("saturation analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, bwa = 
%(BWA)s, %(chrstring)s nrit = 1)''' % locals()) R.png(E.get_output_file("%s_saturation.png" % fn)) R('''MEDIPS.plotSaturation(sr)''') R('''dev.off()''') R('''write.table(sr$estimation, file ='%s', sep='\t')''' % E.get_output_file("%s_saturation_estimation.tsv" % fn)) outfile = iotools.open_file( E.get_output_file("%s_saturation.tsv" % fn), "w") outfile.write("category\tvalues\n") outfile.write( "estimated_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxEstCor''')])) outfile.write( "true_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxTruCor''')])) outfile.write( "nreads\t%s\n" % ",".join(["%i" % x for x in R('''sr$numberReads''')])) outfile.close() if "coverage" in options.toolset or do_all: E.info("CpG coverage analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''cr = MEDIPS.seqCoverage( file='%(fn)s', BSgenome='%(genome_file)s', pattern='CG', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''') R('''dev.off()''') R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "hist", t=15)''') R('''dev.off()''') # note: this file is large R('''write.table(cr$cov.res, file=gzfile('%s','w'), sep='\t')''' % E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn)) if 'enrichment' in options.toolset or do_all: E.info("CpG enrichment analysis") outfile = iotools.open_file(E.get_output_file("enrichment.tsv.gz"), "w") slotnames = (("regions.CG", "regions_CG", "%i"), ("regions.C", "regions_C", "%s"), ("regions.G", "regions_G", "%f"), ("regions.relH", "regions_relH", "%i"), ("regions.GoGe", "regions_GoGe", "%i"), ("genome.CG", "genome_CG", "%s"), ("genome.C", "genome_C", "%s"), ("genome.G", "genome_G", "%i"), ("genome.relH", "genome_relH", "%i"), ("enrichment.score.relH", "enrichment_relH", "%s"), ("enrichment.score.GoGe", "enrichment_GoGe", "%s")) outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''ce = MEDIPS.CpGenrich( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) outfile.write("%s" % fn) for slotname, label, pattern in slotnames: value = tuple(R('''ce$%s''' % slotname)) if len(value) == 0: value = "" outfile.write("\t%s" % pattern % value[0]) outfile.write("\n") outfile.close() if options.input_rdata: E.info("reading R session info from '%s'" % options.input_rdata) R('''load('%s')''' % options.input_rdata) else: if "dmr" in options.toolset or "correlation" in options.toolset \ or do_all: # build four sets for x, fn in enumerate(options.treatment_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''treatment_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''treatment_set = c(%s)''' % ",".join(["treatment_R%i" % x for x in range(len(options.treatment_files))])) if options.control_files: for x, fn in enumerate(options.control_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''control_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, 
extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''control_set = c(%s)''' % ",".join(["control_R%i" % x for x in range(len(options.control_files))])) # build coupling vector R('''CS = MEDIPS.couplingVector(pattern="CG", refObj = treatment_set[[1]])''') if "correlation" in options.toolset or do_all: R('''cor.matrix = MEDIPS.correlation( c(treatment_set, control_set))''') R('''write.table(cor.matrix, file='%s', sep="\t")''' % E.get_output_file("correlation")) if "dmr" in options.toolset or do_all: # Data that does not fit the model causes # "Error in 1:max_signal_index : argument of length 0" # The advice is to set MeDIP=FALSE # See: http://comments.gmane.org/ # gmane.science.biology.informatics.conductor/52319 if options.is_medip: medip = "TRUE" else: medip = "FALSE" fdr_method = options.fdr_method E.info("applying test for differential methylation") R('''meth = MEDIPS.meth( MSet1 = treatment_set, MSet2 = control_set, CSet = CS, ISet1 = NULL, ISet2 = NULL, p.adj = "%(fdr_method)s", diff.method = "edgeR", MeDIP = %(medip)s, CNV = F, minRowSum = 1)''' % locals()) # Note: several Gb in size # Output full methylation data table R('''write.table(meth, file=gzfile('%s', 'w'), sep="\t", row.names=F, quote=F)''' % E.get_output_file("data.tsv.gz")) # save R session if options.output_rdata: R('''save.image(file='%s', safe=FALSE)''' % E.get_output_file("session.RData")) # DMR analysis - test for windows and output if "dmr" in options.toolset: E.info("selecting differentially methylated windows") # test windows for differential methylation fdr_threshold = options.fdr_threshold R('''tested = MEDIPS.selectSig(meth, adj=T, ratio=NULL, p.value=%(fdr_threshold)f, bg.counts=NULL, CNV=F)''' % locals()) R('''write.table(tested, file=gzfile('%s', 'w'), sep="\t", quote=F)''' % E.get_output_file("significant_windows.gz")) # select gain and merge adjacent windows try: R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),]; gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''') E.info('gain output: %s, merged: %s' % (str(R('''dim(gain)''')), str(R('''dim(gain_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(gain_merged, file=of, sep="\t", quote=F, row.names=FALSE, col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute gain windows: msg=%s" % msg) # select loss and merge adjacent windows try: R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),]; loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''') E.info('loss output: %s, merged: %s' % (str(R('''dim(loss)''')), str(R('''dim(loss_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(loss_merged, file=of, sep="\t", quote=F, row.names=F, col.names=F); close(of)''' % E.get_output_file("loss.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute loss windows: msg=%s" % msg) # if "rpm" in options.toolset or do_all: # outputfile = E.get_output_file("rpm.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = T, descr = "rpm")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # compress(outputfile) # if "rms" in options.toolset or do_all: # outputfile = E.get_output_file("rms.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = F, descr = "rms")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # 
# compress(outputfile)
# write footer and output benchmark information.
E.stop()
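# Illustrative sketch (not part of the script above): how the --chroms option is
# turned into the ``chr.select`` fragment that gets interpolated into the MEDIPS
# R calls. Standard library only; the function name is hypothetical.
def build_chr_select(chroms_option):
    """Return the R ``chr.select=c(...)`` fragment for a comma-delimited
    chromosome list, or an empty string when no restriction is requested."""
    if chroms_option is None:
        return ""
    chroms = chroms_option.split(",")
    return ' chr.select=c("%s"), ' % '","'.join(chroms)

# Example: build_chr_select("chr19,chr20") -> ' chr.select=c("chr19","chr20"), '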
def main(argv=None): parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument( "-w", "--weights-tsv-file", dest="filename_weights", type=str, help="filename with codon frequencies. Multiple filenames " "can be separated by comma.") parser.add_argument("-s", "--section", dest="sections", nargs="*", type=str, choices=("length", "sequence", "hid", "na", "aa", "cpg", "dn", "degeneracy", "gaps", "codons", "codon-usage", "codon-translator", "codon-bias"), help="which sections to output ") parser.add_argument( "-t", "--sequence-type", dest="seqtype", type=str, choices=("na", "aa"), help="type of sequence: na=nucleotides, aa=amino acids .") parser.add_argument( "-e", "--regex-identifier", dest="regex_identifier", type=str, help="regular expression to extract identifier from fasta " "description line.") parser.add_argument( "--split-fasta-identifier", dest="split_id", action="store_true", help="split fasta description line (starting >) and use " "only text before first space") parser.add_argument( "--add-total", dest="add_total", action="store_true", help="add a row with column totals at the end of the table") parser.set_defaults( filename_weights=None, pseudocounts=1, sections=[], regex_identifier="(.+)", seqtype="na", gap_chars='xXnN', split_id=False, add_total=False, ) (args) = E.start(parser, argv=argv) rx = re.compile(args.regex_identifier) reference_codons = [] if args.filename_weights: args.filename_weights = args.filename_weights.split(",") for filename in args.filename_weights: if filename == "uniform": reference_codons.append(Genomics.GetUniformCodonUsage()) else: reference_codons.append( iotools.ReadMap(iotools.open_file(filename, "r"), has_header=True, map_functions=(str, float))) # print codon table differences args.stdlog.write( "# Difference between supplied codon usage preferences.\n") for x in range(0, len(reference_codons)): for y in range(0, len(reference_codons)): if x == y: continue # calculate KL distance a = reference_codons[x] b = reference_codons[y] d = 0 for codon, p in list(a.items()): if Genomics.IsStopCodon(codon): continue d += b[codon] * math.log(b[codon] / p) args.stdlog.write( "# tablediff\t%s\t%s\t%f\n" % (args.filename_weights[x], args.filename_weights[y], d)) iterator = FastaIterator.FastaIterator(args.stdin) def getCounter(section): if args.seqtype == "na": if section == "length": s = SequenceProperties.SequencePropertiesLength() elif section == "sequence": s = SequenceProperties.SequencePropertiesSequence() elif section == "hid": s = SequenceProperties.SequencePropertiesHid() elif section == "na": s = SequenceProperties.SequencePropertiesNA() elif section == "gaps": s = SequenceProperties.SequencePropertiesGaps(args.gap_chars) elif section == "cpg": s = SequenceProperties.SequencePropertiesCpg() elif section == "dn": s = SequenceProperties.SequencePropertiesDN() # these sections requires sequence length to be a multiple of 3 elif section == "aa": s = SequenceProperties.SequencePropertiesAA() elif section == "degeneracy": s = SequenceProperties.SequencePropertiesDegeneracy() elif section == "codon-bias": s = SequenceProperties.SequencePropertiesBias(reference_codons) elif section == "codons": s = SequenceProperties.SequencePropertiesCodons() elif section == "codon-usage": s = SequenceProperties.SequencePropertiesCodonUsage() elif section == "codon-translator": s = SequenceProperties.SequencePropertiesCodonTranslator() else: raise ValueError("unknown section %s" % section) elif 
args.seqtype == "aa": if section == "length": s = SequenceProperties.SequencePropertiesLength() elif section == "sequence": s = SequenceProperties.SequencePropertiesSequence() elif section == "hid": s = SequenceProperties.SequencePropertiesHid() elif section == "aa": s = SequenceProperties.SequencePropertiesAminoAcids() else: raise ValueError("unknown section %s" % section) return s # setup totals totals = {} for section in args.sections: totals[section] = getCounter(section) args.stdout.write("id") for section in args.sections: args.stdout.write("\t" + "\t".join(totals[section].getHeaders())) args.stdout.write("\n") args.stdout.flush() s = getCounter("hid") s.loadSequence("AAAAAAAAA", "na") for cur_record in iterator: sequence = re.sub(" ", "", cur_record.sequence).upper() if len(sequence) == 0: raise ValueError("empty sequence %s" % cur_record.title) id = rx.search(cur_record.title).groups()[0] if args.split_id is True: args.stdout.write("%s" % id.split()[0]) else: args.stdout.write("%s" % id) args.stdout.flush() for section in args.sections: s = getCounter(section) s.loadSequence(sequence, args.seqtype) totals[section].addProperties(s) args.stdout.write("\t" + "\t".join(s.getFields())) args.stdout.write("\n") if args.add_total: args.stdout.write("total") for section in args.sections: args.stdout.write("\t" + "\t".join(totals[section].getFields())) args.stdout.write("\n") E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--test", dest="test", type="string", help="supply help") parser.add_option("--task", dest="task", type="choice", choices=[ "merge_exclusions", "flag_hets", "find_inbreds", "flag_relations", "discordant_gender" ], help="task to execute on phenotype file(s)") parser.add_option("--gender-check-file", dest="gender_check", type="string", help="output from gender checking " "by Plink, suffix should be .sexcheck") parser.add_option("--relationship-file", dest="relations", type="string", help="output file from IBS " "calculation. Should contain all pairwise " "relationships.") parser.add_option("--inbreeding-coef-file", dest="inbreed_file", type="string", help="file containing either Plink " "or GCTA estimates of F, inbreeding coefficient") parser.add_option("--inbreeding-coefficient", dest="inbred_coeff", type="choice", choices=["Fhat1", "Fhat2", "Fhat3", "F", "ibc"], help="inbreeding coefficient " "to use to identify highly inbred individuals") parser.add_option("--inbred-cutoff", dest="inbred_cutoff", type="float", help="threshold above which individuals are classed " "as inbred.") parser.add_option("--ibs-cutoff", dest="ibs_cutoff", type="float", help="IBS threshold to flag individuals as being " "closely related") parser.add_option("--trimmed-relationships", dest="rel_cutoff", type="string", help="output file from Plink " "--rel-cutoff with trimmed data set of unrelated " "individuals.") parser.add_option( "--heterozygotes-file", dest="hets_file", type="string", help="file from heterozygote analysis containing observed " "homozygosity and F coefficients") parser.add_option("--auxillary-file", dest="aux_file", type="string", help="a file of IIDs and FIDs for individuals that are " "to be removed from analysis, unrelated to QC") parser.add_option("--plotting-path", dest="plot_path", type="string", help="PATH to save any plots to") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) if options.task == "flag_hets": # calculate heterozygosity rates, find and flag # individuals > 3 s.d. away from mean value # rate = (nonissing - homs) / nonmissing # i.e. non-homozygote rate flags = gwas.flagExcessHets(options.hets_file, plot=True, plot_path=options.plot_path) flags.to_csv(options.stdout, index=None, sep="\t") elif options.task == "merge_exclusions": exclusions = gwas.mergeQcExclusions(hets_file=options.hets_file, inbred_file=options.inbreed_file, related_file=options.relations, gender_file=options.gender_check, mask_file=options.aux_file) exclusions.to_csv(options.stdout, index=None, sep="\t") elif options.task == "find_inbreds": inbreds = gwas.flagInbred(inbred_file=options.inbreed_file, inbreeding_coefficient=options.inbred_coeff, ibc_threshold=options.inbred_cutoff, plot=True, plot_path=options.plot_path) inbreds.to_csv(options.stdout, sep="\t", index=None) elif options.task == "flag_relations": # the input file is likely to be huge! 
Ergo, read the file in chunks # calculate any related individuals and store them, store # an array of IBD values for plotting, drop the rest relate = gwas.flagRelated(ibd_file=options.relations, chunk_size=500000, threshold=options.ibs_cutoff, plot=True, plotting_path=options.plot_path) elif options.task == "discordant_gender": sex_discord = gwas.flagGender(gender_file=options.gender_check, plot=True, plot_path=options.plot_path) sex_discord.to_csv(options.stdout, index=None, sep="\t") else: pass # write footer and output benchmark information. E.stop()
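# Illustrative sketch of the heterozygosity-rate flagging described above (the
# real work is delegated to gwas.flagExcessHets). Assumes a list of
# (individual_id, n_nonmissing, n_homozygous) tuples; names are hypothetical.
import statistics

def flag_excess_hets(records, n_sd=3.0):
    """Return ids whose het rate lies more than n_sd standard deviations from
    the mean rate, where rate = (nonmissing - homs) / nonmissing."""
    rates = {iid: (nonmissing - homs) / nonmissing
             for iid, nonmissing, homs in records}
    mean = statistics.mean(rates.values())
    sd = statistics.stdev(rates.values())
    return [iid for iid, rate in rates.items() if abs(rate - mean) > n_sd * sd]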
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) rows = [] labels = {} for label, expr in expressions: nchecked, data = cgat.Style.runPep8(expr) rows.append((label, nchecked, data)) labels.update(dict([(x.code, x.description) for x in data])) # build table # # each row is data set and each column is a Warning/Error type # with some additional columns such as total and n. # build dictionary mapping error codes to columns # consistently across samples map_code2column = dict([(y, x + 3) for x, y in enumerate(labels.keys())]) # build first row containing the column labels results = [['code', 'n', 'total'] + list(labels.keys())] # build array with column totals column_totals = [0] * (len(map_code2column) + 3) for label, nchecked, data in rows: row = [label, nchecked, 0] + [0] * len(map_code2column) column_totals[1] += nchecked for x in data: c = map_code2column[x.code] row[c] = x.count row[2] += int(x.count) column_totals[2] += int(x.count) column_totals[c] += int(x.count) results.append(row) # add column totals column_totals[0] = 'total' results.append(column_totals) # add descriptions as last row results.append([ 'description', 'number of files checked', 'total errors/warnings in set' ] + list(labels.values())) # output transposed table outfile = sys.stdout for row in zip(*results): outfile.write('%s\n' % ('\t'.join(map(str, row)))) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--guess-format", dest="guess_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'), help="The default behaviour of the script is to guess the quality " "format of the input fastq file. The user can specify the " "quality format of the input file using the --guess-format option. " "The script will use this format if the " "sequence qualities are ambiguous.[default=%default].") parser.add_option( "--target-format", dest="target_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'illumina-1.8', 'integer'), help="The script will convert quality scores to the destination " "format unless [default=%default].") parser.set_defaults( target_format=None, guess_format=None, min_quality=10, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) c = E.Counter() if options.target_format: iterator = Fastq.iterate_convert(options.stdin, format=options.target_format, guess=options.guess_format) else: iterator = Fastq.iterate_guess(options.stdin, guess=options.guess_format) options.stdout.write("read\tnfailed\tnN\t%s\n" % ("\t".join(Stats.Summary().getHeaders()))) min_quality = options.min_quality for record in iterator: c.input += 1 quals = record.toPhred() nfailed = len([x for x in quals if x < min_quality]) nns = record.seq.count("N") + record.seq.count(".") options.stdout.write( "%s\t%i\t%i\t%s\n" % (record.identifier, nfailed, nns, str(Stats.Summary(quals)))) c.output += 1 # write footer and output benchmark information. E.info("%s" % str(c)) E.stop()
def main(argv=None): if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic sequence to retrieve " "sequences from.") parser.add_option("-m", "--masker", dest="masker", type="choice", choices=("dust", "dustmasker", "softmask", "none"), help="apply masker to mask output sequences " "[%default].") parser.add_option("--output-mode", dest="output_mode", type="choice", choices=("intervals", "leftright", "segments"), help="what to output. " "'intervals' generates a single sequence for " "each bed interval. 'leftright' generates two " "sequences, one in each direction, for each bed " "interval. 'segments' can be used to output " "sequence from bed12 files so that sequence only covers " "the segements [%default]") parser.add_option("--min-sequence-length", dest="min_length", type="int", help="require a minimum sequence length [%default]") parser.add_option("--max-sequence-length", dest="max_length", type="int", help="require a maximum sequence length [%default]") parser.add_option( "--extend-at", dest="extend_at", type="choice", choices=("none", "3", "5", "both", "3only", "5only"), help="extend at 3', 5' or both or no ends. If 3only or 5only " "are set, only the added sequence is returned [default=%default]") parser.add_option( "--extend-by", dest="extend_by", type="int", help="extend by # bases [default=%default]") parser.add_option( "--use-strand", dest="ignore_strand", action="store_false", help="use strand information and return reverse complement " "on intervals located on the negative strand. " "[default=%default]") parser.set_defaults( genome_file=None, masker=None, output_mode="intervals", min_length=0, max_length=0, extend_at=None, extend_by=100, ignore_strand=True, ) (options, args) = E.start(parser) if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contigs = fasta.getContigSizes() fasta.setConverter(IndexedFasta.getConverter("zero-both-open")) counter = E.Counter() ids, seqs = [], [] E.info("collecting sequences") for bed in Bed.setName(Bed.iterator(options.stdin)): counter.input += 1 lcontig = fasta.getLength(bed.contig) if options.ignore_strand: strand = "+" else: strand = bed.strand if options.output_mode == "segments" and bed.columns == 12: ids.append("%s %s:%i..%i (%s) %s %s" % (bed.name, bed.contig, bed.start, bed.end, strand, bed["blockSizes"], bed["blockStarts"])) seg_seqs = [fasta.getSequence(bed.contig, strand, start, end) for start, end in bed.toIntervals()] seqs.append("".join(seg_seqs)) elif (options.output_mode == "intervals" or options.output_mode == "segments"): ids.append("%s %s:%i..%i (%s)" % (bed.name, bed.contig, bed.start, bed.end, strand)) seqs.append( fasta.getSequence(bed.contig, strand, bed.start, bed.end)) elif options.output_mode == "leftright": l = bed.end - bed.start start, end = max(0, bed.start - l), bed.end - l ids.append("%s_l %s:%i..%i (%s)" % (bed.name, bed.contig, start, end, strand)) seqs.append(fasta.getSequence(bed.contig, strand, start, end)) start, end = bed.start + l, min(lcontig, bed.end + l) ids.append("%s_r %s:%i..%i (%s)" % (bed.name, bed.contig, start, end, strand)) seqs.append(fasta.getSequence(bed.contig, strand, start, end)) E.info("collected %i sequences" % len(seqs)) masked = Masker.maskSequences(seqs, options.masker) options.stdout.write( "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n") E.info("masked %i sequences" % 
len(seqs)) counter.output = len(seqs) E.info("%s" % counter) E.stop()
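# Illustrative sketch of the "leftright" output mode above: for an interval of
# length l, emit one window of length l immediately upstream and one immediately
# downstream, clamped to the contig bounds. Names are hypothetical.
def leftright_windows(start, end, contig_length):
    """Return ((left_start, left_end), (right_start, right_end))."""
    l = end - start
    left = (max(0, start - l), end - l)
    right = (start + l, min(contig_length, end + l))
    return left, right

# Example: leftright_windows(100, 150, 1000) -> ((50, 100), (150, 200))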
def main(argv=sys.argv): parser = E.ArgumentParser(description=__doc__) parser.add_argument("--is-gtf", dest="is_gtf", action="store_true", help="input file is in gtf format") parser.add_argument("--set-name", dest="name", type=str, help="field from the GFF/GTF file to use as the " "name field in the BED file ", choices=("gene_id", "transcript_id", "class", "family", "feature", "source", "repName", "gene_biotype")) parser.add_argument("--track", dest="track", type=str, choices=("feature", "source", None), help="use feature/source field to define BED tracks ") parser.add_argument( "--bed12-from-transcripts", dest="bed12", action="store_true", default=False, help="Process GTF file into Bed12 entries, with blocks as exons" "and thick/thin as coding/non-coding") parser.set_defaults(track=None, name="gene_id", is_gtf=False) (args) = E.start(parser, add_pipe_options=True) ninput, noutput = 0, 0 iterator = GTF.iterator(args.stdin) if args.bed12: iterator = GTF.transcript_iterator(iterator) if args.track: all_input = list(iterator) if args.track == "feature": grouper = lambda x: x.feature elif args.track == "source": grouper = lambda x: x.source all_input.sort(key=grouper) bed = Bed.Bed() for key, vals in itertools.groupby(all_input, grouper): args.stdout.write("track name=%s\n" % key) for gff in vals: ninput += 1 if args.bed12: bed = transcript2bed12(gff) else: bed.fromGTF(gff, name=args.name) args.stdout.write(str(bed) + "\n") noutput += 1 else: bed = Bed.Bed() for gff in iterator: ninput += 1 if args.bed12: bed = transcript2bed12(gff) else: bed.fromGTF(gff, name=args.name) args.stdout.write(str(bed) + "\n") noutput += 1 E.info("ninput=%i, noutput=%i" % (ninput, noutput)) E.stop()
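# Illustrative sketch of the track grouping used above: records are sorted by a
# key and grouped with itertools.groupby so each group becomes one BED track.
# The record objects here are hypothetical stand-ins for GTF entries.
import itertools

def group_into_tracks(records, key):
    """Yield (track_name, list_of_records), grouping by the given key function."""
    for name, group in itertools.groupby(sorted(records, key=key), key):
        yield name, list(group)

# Example:
# list(group_into_tracks(["exon", "CDS", "exon"], key=lambda x: x))
# -> [("CDS", ["CDS"]), ("exon", ["exon", "exon"])]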
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--is-gtf", dest="is_gtf", action="store_true", help="input is gtf instead of gff.") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("-m", "--merge-adjacent", dest="merge", action="store_true", help="merge adjacent intervals with the same attributes." " [default=%default]") parser.add_option("-e", "--feature", dest="feature", type="string", help="filter by a feature, for example 'exon', 'CDS'." " If set to the empty string, all entries are output " "[%default].") parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks", type="string", metavar="gff", help="mask sequences with regions given in gff file " "[%default].") parser.add_option("--remove-masked-regions", dest="remove_masked_regions", action="store_true", help="remove regions instead of masking [%default].") parser.add_option("--min-interval-length", dest="min_length", type="int", help="set minimum length for sequences output " "[%default]") parser.add_option("--max-length", dest="max_length", type="int", help="set maximum length for sequences output " "[%default]") parser.add_option("--extend-at", dest="extend_at", type="choice", choices=("none", "3", "5", "both", "3only", "5only"), help="extend at no end, 3', 5' or both ends. If " "3only or 5only are set, only the added sequence " "is returned [default=%default]") parser.add_option("--header-attributes", dest="header_attr", action="store_true", help="add GFF entry attributes to the FASTA record" " header section") parser.add_option("--extend-by", dest="extend_by", type="int", help="extend by # bases [default=%default]") parser.add_option("--extend-with", dest="extend_with", type="string", help="extend using base [default=%default]") parser.add_option("--masker", dest="masker", type="choice", choices=("dust", "dustmasker", "softmask", "none"), help="apply masker [%default].") parser.add_option("--fold-at", dest="fold_at", type="int", help="fold sequence every n bases[%default].") parser.add_option( "--fasta-name-attribute", dest="naming_attribute", type="string", help="use attribute to name fasta entry. 
Currently only compatable" " with gff format [%default].") parser.set_defaults( is_gtf=False, genome_file=None, merge=False, feature=None, filename_masks=None, remove_masked_regions=False, min_length=0, max_length=0, extend_at=None, extend_by=100, extend_with=None, masker=None, fold_at=None, naming_attribute=False, header_attr=False, ) (options, args) = E.start(parser) if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contigs = fasta.getContigSizes() if options.is_gtf: iterator = GTF.transcript_iterator(GTF.iterator(options.stdin)) else: gffs = GTF.iterator(options.stdin) if options.merge: iterator = GTF.joined_iterator(gffs) else: iterator = GTF.chunk_iterator(gffs) masks = None if options.filename_masks: masks = {} with iotools.open_file(options.filename_masks, "r") as infile: e = GTF.readAsIntervals(GTF.iterator(infile)) # convert intervals to intersectors for contig in list(e.keys()): intersector = quicksect.IntervalTree() for start, end in e[contig]: intersector.add(start, end) masks[contig] = intersector ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0 nskipped_length = 0 nskipped_noexons = 0 feature = options.feature # iterator is a list containing groups (lists) of features. # Each group of features have in common the same transcript ID, in case of # GTF files. for ichunk in iterator: ninput += 1 if feature: chunk = [x for x in ichunk if x.feature == feature] else: chunk = ichunk if len(chunk) == 0: nskipped_noexons += 1 E.info("no features in entry from " "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start, ichunk[0].end, str(ichunk[0]))) continue contig, strand = chunk[0].contig, chunk[0].strand if options.is_gtf: name = chunk[0].transcript_id else: if options.naming_attribute: attr_dict = { x.split("=")[0]: x.split("=")[1] for x in chunk[0].attributes.split(";") } name = attr_dict[options.naming_attribute] else: name = str(chunk[0].attributes) lcontig = contigs[contig] positive = Genomics.IsPositiveStrand(strand) intervals = [(x.start, x.end) for x in chunk] intervals.sort() if masks: if contig in masks: masked_regions = [] for start, end in intervals: masked_regions += [(x.start, x.end) for x in masks[contig].find( quicksect.Interval(start, end))] masked_regions = Intervals.combine(masked_regions) if len(masked_regions): nmasked += 1 if options.remove_masked_regions: intervals = Intervals.truncate(intervals, masked_regions) else: raise NotImplementedError("unimplemented") if len(intervals) == 0: nskipped_masked += 1 if options.loglevel >= 1: options.stdlog.write( "# skipped because fully masked: " "%s: regions=%s masks=%s\n" % (name, str([(x.start, x.end) for x in chunk]), masked_regions)) continue out = intervals if options.extend_at and not options.extend_with: if options.extend_at == "5only": intervals = [(max(0, intervals[0][0] - options.extend_by), intervals[0][0])] elif options.extend_at == "3only": intervals = [(intervals[-1][1], min(lcontig, intervals[-1][1] + options.extend_by))] else: if options.extend_at in ("5", "both"): intervals[0] = (max(0, intervals[0][0] - options.extend_by), intervals[0][1]) if options.extend_at in ("3", "both"): intervals[-1] = (intervals[-1][0], min(lcontig, intervals[-1][1] + options.extend_by)) if not positive: intervals = [(lcontig - x[1], lcontig - x[0]) for x in intervals[::-1]] out.reverse() s = [ fasta.getSequence(contig, strand, start, end) for start, end in intervals ] # IMS: allow for masking of sequences s = Masker.maskSequences(s, options.masker) l = sum([len(x) for x in s]) if (l < 
options.min_length or (options.max_length and l > options.max_length)): nskipped_length += 1 if options.loglevel >= 1: options.stdlog.write("# skipped because length out of bounds " "%s: regions=%s len=%i\n" % (name, str(intervals), l)) continue if options.extend_at and options.extend_with: extension = "".join((options.extend_with, ) * options.extend_by) if options.extend_at in ("5", "both"): s[1] = extension + s[1] if options.extend_at in ("3", "both"): s[-1] = s[-1] + extension if options.fold_at: n = options.fold_at s = "".join(s) seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)]) else: seq = "\n".join(s) if options.header_attr: attributes = " ".join( [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()]) options.stdout.write( ">%s %s:%s:%s feature:%s %s\n%s\n" % (name, contig, strand, ";".join( ["%i-%i" % x for x in out]), chunk[0].feature, attributes, seq)) else: options.stdout.write( ">%s %s:%s:%s\n%s\n" % (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq)) noutput += 1 E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, " "nskipped_masked=%i, nskipped_length=%i" % (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked, nskipped_length)) E.stop()
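# Illustrative sketch of the negative-strand handling above: intervals given in
# forward-strand coordinates are mirrored onto the reverse strand and reversed in
# order, so sequence retrieval proceeds 5' to 3' on that strand.
def flip_intervals(intervals, contig_length):
    """Mirror (start, end) intervals onto the opposite strand of a contig."""
    return [(contig_length - end, contig_length - start)
            for start, end in reversed(intervals)]

# Example: flip_intervals([(10, 20), (30, 40)], 100) -> [(60, 70), (80, 90)]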
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-d", "--outputdir", dest="outdir", type="string", help="output directory to save plots") parser.add_option("-f", "--fasta", dest="fasta_file", type="string", help="fasta file containing tRNA cluster fasta seqs") parser.set_defaults(fasta_file=None, outdir=None) (options, args) = E.start(parser, argv=argv) if len(args) == 0: args.append("-") E.info(options.stdin) dict_trna = {} for record in FastaIterator.iterate(IOTools.open_file(options.fasta_file)): title = record.title.strip("-") length = len(record.sequence) dict_trna[title] = length # For each read in bamfile find end position and then plot this using length of tRNA cluster samfile = pysam.AlignmentFile(options.stdin.name, "rb") refname = "" values = [] n = 0 for line in samfile: if line.reference_name == refname: if line.reference_end is None: pass else: end = int(line.reference_end) - int(line.reference_start) values.append(end) elif line.reference_name != refname: n += 1 if n > 1: values = pd.Series(values) percent = values.value_counts() / values.count() * 100 percent = percent.sort_index() percent = pd.DataFrame(percent) percent.rename(columns={0: 'Percent'}, inplace=True) # length of each tRNA from fasta length = dict_trna[refname.strip("-")] + 1 temp_df = pd.DataFrame(0, index=range(1, length), columns=['A']) temp_df = pd.concat([temp_df, percent], axis=1) percent = temp_df.fillna(0) refname = options.outdir + refname.strip("-") outfile = refname + ".csv" outfig = refname + ".eps" percent.to_csv(outfile) g = sns.factorplot(x=percent.index, y="Percent", data=percent, size=8, kind="bar", palette="Blues") g.set_xlabels('position from 5\' end') g.set_xticklabels(rotation=90) g.savefig(outfig, format='eps') values = [] refname = line.reference_name else: refname = line.reference_name E.stop()
def main(argv=None): if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-s", "--correct-gap-shift", dest="correct_shift", action="store_true", help="correct gap length shifts in alignments. " "Requires alignlib_lite.py [%default]") parser.add_option( "-1", "--pattern1", dest="pattern1", type="string", help="pattern to extract identifier from in identifiers1. " "[%default]") parser.add_option( "-2", "--pattern2", dest="pattern2", type="string", help="pattern to extract identifier from in identifiers2. " "[%default]") parser.add_option("-o", "--output-section", dest="output", type="choice", action="append", choices=("diff", "missed", "seqdiff"), help="what to output [%default]") parser.set_defaults(correct_shift=False, pattern1="(\S+)", pattern2="(\S+)", output=[]) (options, args) = E.start(parser) if len(args) != 2: raise ValueError("two files needed to compare.") if options.correct_shift: try: import alignlib_lite except ImportError: raise ImportError( "option --correct-shift requires alignlib_lite.py_ " "but alignlib not found") seqs1 = dict([ (x.title, x.sequence) for x in FastaIterator.iterate(iotools.open_file(args[0], "r")) ]) seqs2 = dict([ (x.title, x.sequence) for x in FastaIterator.iterate(iotools.open_file(args[1], "r")) ]) if not seqs1: raise ValueError("first file %s is empty." % (args[0])) if not seqs2: raise ValueError("second file %s is empty." % (args[1])) MapIdentifiers(seqs1, options.pattern1) MapIdentifiers(seqs2, options.pattern2) nsame = 0 nmissed1 = 0 nmissed2 = 0 ndiff = 0 ndiff_first = 0 ndiff_last = 0 ndiff_prefix = 0 ndiff_selenocysteine = 0 ndiff_masked = 0 nfixed = 0 found2 = {} write_missed1 = "missed" in options.output write_missed2 = "missed" in options.output write_seqdiff = "seqdiff" in options.output write_diff = "diff" in options.output or write_seqdiff for k in sorted(seqs1): if k not in seqs2: nmissed1 += 1 if write_missed1: options.stdout.write("---- %s ---- %s\n" % (k, "missed1")) continue found2[k] = 1 s1 = seqs1[k].upper() s2 = seqs2[k].upper() m = min(len(s1), len(s2)) if s1 == s2: nsame += 1 else: status = "other" ndiff += 1 if s1[1:] == s2[1:]: ndiff_first += 1 status = "first" elif s1[:m] == s2[:m]: ndiff_prefix += 1 status = "prefix" elif s1[:-1] == s2[:-1]: ndiff_last += 1 status = "last" else: if len(s1) == len(s2): # get all differences: the first and last residues # can be different for peptide sequences when # comparing my translations with ensembl peptides. 
differences = [] for x in range(1, len(s1) - 1): if s1[x] != s2[x]: differences.append((s1[x], s2[x])) l = len(differences) # check for Selenocysteins if len( [x for x in differences if x[0] == "U" or x[1] == "U"]) == l: ndiff_selenocysteine += 1 status = "selenocysteine" # check for masked residues elif len([ x for x in differences if x[0] in "NX" or x[1] in "NX" ]) == l: ndiff_masked += 1 status = "masked" # correct for different gap lengths if options.correct_shift: map_a2b = alignlib_lite.py_makeAlignmentVector() a, b = 0, 0 keep = False x = 0 while x < m and not (a == len(s1) and b == len(s2)): try: if s1[a] != s2[b]: while s1[a] == "N" and s2[b] != "N": a += 1 while s1[a] != "N" and s2[b] == "N": b += 1 if s1[a] != s2[b]: break except IndexError: print( "# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (k, x, a, b, len(s1), len(s2))) break a += 1 b += 1 map_a2b.addPairExplicit(a, b, 0.0) # check if we have reached the end: else: keep = True nfixed += 1 f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b) print("fix\t%s\t%s" % (k, str(f))) if not keep: print("# warning: not fixable: %s" % k) if write_diff: options.stdout.write("---- %s ---- %s\n" % (k, status)) if write_seqdiff: options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k])) for k in sorted(list(seqs2.keys())): if k not in found2: nmissed2 += 1 if write_missed2: options.stdout.write("---- %s ---- %s\n" % (k, "missed2")) options.stdlog.write("""# Legend: """) E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" % (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2)) E.info( "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" % (ndiff, ndiff_first, ndiff_last, ndiff_prefix, ndiff_selenocysteine, ndiff_masked, nfixed, ndiff - ndiff_first - ndiff_last - ndiff_prefix - ndiff_selenocysteine - ndiff_masked - nfixed)) E.stop()
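# Illustrative sketch of the difference classification above, reduced to the
# positional cases; the selenocysteine/masked checks and the gap-shift correction
# are omitted. A simplified stand-in, not the script's full logic.
def classify_difference(s1, s2):
    """Return a coarse status for two upper-cased sequences."""
    if s1 == s2:
        return "same"
    m = min(len(s1), len(s2))
    if s1[1:] == s2[1:]:
        return "first"    # only the first residue differs
    if s1[:m] == s2[:m]:
        return "prefix"   # one sequence is a prefix of the other
    if s1[:-1] == s2[:-1]:
        return "last"     # only the last residue differs
    return "other"

# Example: classify_difference("MKLV", "XKLV") -> "first"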
def main(argv=None): parser = E.OptionParser( version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-s", "--species", dest="species", type="string", help="species to use [default=%default].") parser.add_option( "-i", "--slims", dest="filename_slims", type="string", help="filename with GO SLIM categories " "[default=%default].") parser.add_option( "-g", "--genes-tsv-file", dest="filename_genes", type="string", help="filename with genes to analyse " "[default=%default].") parser.add_option( "-b", "--background-tsv-file", dest="filename_background", type="string", help="filename with background genes to analyse " "[default=%default].") parser.add_option( "-m", "--min-counts", dest="minimum_counts", type="int", help="minimum count - ignore all categories that have " "fewer than # number of genes" " [default=%default].") parser.add_option( "-o", "--sort-order", dest="sort_order", type="choice", choices=("fdr", "pvalue", "ratio"), help="output sort order [default=%default].") parser.add_option( "--ontology", dest="ontology", type="string", action="append", help="go ontologies to analyze. Ontologies are tested " "separately [default=%default].") parser.add_option( "-t", "--threshold", dest="threshold", type="float", help="significance threshold [>1.0 = all ]. If --fdr is set, this " "refers to the fdr, otherwise it is a cutoff for p-values.") parser.add_option( "--filename-dump", dest="filename_dump", type="string", help="dump GO category assignments into a flatfile " "[default=%default].") parser.add_option( "--gene2name-map-tsv-file", dest="filename_gene2name", type="string", help="optional filename mapping gene identifiers to gene names " "[default=%default].") parser.add_option( "--filename-ontology", dest="filename_ontology", type="string", help="filename with ontology in OBO format [default=%default].") parser.add_option( "--filename-input", dest="filename_input", type="string", help="read GO category assignments from a flatfile " "[default=%default].") parser.add_option( "--sample-size", dest="sample", type="int", help="do sampling (with # samples) [default=%default].") parser.add_option( "--filename-output-pattern", "--output-filename-pattern", dest="output_filename_pattern", type="string", help="pattern with output filename pattern " "(should contain: %(go)s and %(section)s ) [default=%default]") parser.add_option( "--fdr", dest="fdr", action="store_true", help="calculate and filter by FDR default=%default].") parser.add_option( "--go2goslim", dest="go2goslim", action="store_true", help="convert go assignments in STDIN to goslim assignments and " "write to STDOUT [default=%default].") parser.add_option( "--gene-pattern", dest="gene_pattern", type="string", help="pattern to transform identifiers to GO gene names " "[default=%default].") parser.add_option( "--filename-map-slims", dest="filename_map_slims", type="string", help="write mapping between GO categories and GOSlims " "[default=%default].") parser.add_option( "--get-genes", dest="get_genes", type="string", help="list all genes in the with a certain GOID [default=%default].") parser.add_option( "--strict", dest="strict", action="store_true", help="require all genes in foreground to be part of background. 
" "If not set, genes in foreground will be added to the background " "[default=%default].") parser.add_option( "-q", "--fdr-method", dest="qvalue_method", type="choice", choices=("empirical", "storey", "BH"), help="method to perform multiple testing correction by controlling " "the fdr [default=%default].") parser.add_option( "--pairwise", dest="compute_pairwise", action="store_true", help="compute pairwise enrichment for multiple gene lists. " "[default=%default].") # parser.add_option( "--fdr-lambda", dest="qvalue_lambda", type="float", # help="fdr computation: lambda [default=%default]." ) # parser.add_option( "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice", # choices = ("smoother", "bootstrap" ), # help="fdr computation: method for estimating pi0 [default=%default]." ) parser.set_defaults(species=None, filename_genes="-", filename_background=None, filename_slims=None, minimum_counts=0, ontology=[], filename_dump=None, sample=0, fdr=False, output_filename_pattern=None, threshold=0.05, filename_map_slims=None, gene_pattern=None, sort_order="ratio", get_genes=None, strict=False, qvalue_method="empirical", pairs_min_observed_counts=3, compute_pairwise=False, filename_gene2name=None ) (options, args) = E.start(parser, add_database_options=True) if options.go2goslim: GO.convertGo2Goslim(options) E.stop() sys.exit(0) if options.fdr and options.sample == 0: E.warn("fdr will be computed without sampling") ############################################################# # dump GO if options.filename_dump: # set default orthologies to GO if not options.ontology: options.ontology = [ "biol_process", "mol_function", "cell_location"] E.info("dumping GO categories to %s" % (options.filename_dump)) dbhandle = database.connect(url=options.database_url) outfile = iotools.open_file(options.filename_dump, "w", create_dir=True) GO.DumpGOFromDatabase(outfile, dbhandle, options) outfile.close() E.stop() sys.exit(0) ############################################################# # read GO categories from file if options.filename_input: E.info("reading association of categories and genes from %s" % (options.filename_input)) infile = iotools.open_file(options.filename_input) gene2gos, go2infos = GO.ReadGene2GOFromFile(infile) infile.close() if options.filename_gene2name: E.info("reading gene identifier to gene name mapping from %s" % options.filename_gene2name) infile = iotools.open_file(options.filename_gene2name) gene2name = iotools.read_map(infile, has_header=True) infile.close() E.info("read %i gene names for %i gene identifiers" % (len(set(gene2name.values())), len(gene2name))) else: # use identity mapping gene2name = dict([(x, x) for x in list(gene2gos.keys())]) ############################################################# # read GO ontology from file if options.filename_ontology: E.info("reading ontology from %s" % (options.filename_ontology)) infile = iotools.open_file(options.filename_ontology) ontology = GO.readOntology(infile) infile.close() def _g(): return collections.defaultdict(GO.GOInfo) go2infos = collections.defaultdict(_g) # substitute go2infos for go in list(ontology.values()): go2infos[go.mNameSpace][go.mId] = GO.GOInfo( go.mId, go_type=go.mNameSpace, description=go.mName) ############################################################# # get foreground gene list input_foreground, genelists = GO.ReadGeneLists( options.filename_genes, gene_pattern=options.gene_pattern) E.info("read %i genes for forground in %i gene lists" % (len(input_foreground), len(genelists))) 
############################################################# # get background if options.filename_background: # nick - bug fix: background is the first tuple element from # ReadGeneLists input_background = GO.ReadGeneLists( options.filename_background, gene_pattern=options.gene_pattern)[0] E.info("read %i genes for background" % len(input_background)) else: input_background = None ############################################################# # sort out which ontologies to test if not options.ontology: if options.filename_input: options.ontology = list(gene2gos.keys()) E.info("found %i ontologies: %s" % (len(options.ontology), options.ontology)) summary = [] summary.append("\t".join(( "genelist", "ontology", "significant", "threshold", "ngenes", "ncategories", "nmaps", "nforegound", "nforeground_mapped", "nbackground", "nbackground_mapped", "nsample_counts", "nbackground_counts", "psample_assignments", "pbackground_assignments", "messages")) + "\n") ############################################################# # get go categories for genes for test_ontology in sorted(options.ontology): # store results for aggregate output of multiple gene lists all_results = [] all_significant_results = [] all_genelists_with_results = [] E.info("working on ontology %s" % test_ontology) ############################################################# # get/read association of GO categories to genes if options.filename_input: gene2go, go2info = gene2gos[test_ontology], go2infos[test_ontology] else: E.info("reading data from database ...") dbhandle.Connect(options) gene2go, go2info = GO.ReadGene2GOFromDatabase( dbhandle, test_ontology, options.database, options.species) E.info("finished") if len(go2info) == 0: E.warn( "could not find information for terms - " "could be mismatch between ontologies") ngenes, ncategories, nmaps, counts_per_category = GO.CountGO(gene2go) E.info("assignments found: %i genes mapped to %i categories " "(%i maps)" % (ngenes, ncategories, nmaps)) if options.minimum_counts > 0: to_remove = set( [x for x, y in counts_per_category.items() if y < options.minimum_counts]) E.info("removing %i categories with less than %i genes" % (len(to_remove), options.minimum_counts)) GO.removeCategories(gene2go, to_remove) ngenes, ncategories, nmaps, counts_per_category = \ GO.CountGO(gene2go) E.info("assignments after filtering: %i genes mapped " "to %i categories (%i maps)" % ( ngenes, ncategories, nmaps)) for genelist_name, foreground in sorted(genelists.items()): msgs = [] E.info("processing %s with %i genes" % (genelist_name, len(foreground))) ################################################################## ################################################################## ################################################################## # build background - reconcile with foreground ################################################################## if input_background is None: background = list(gene2go.keys()) else: background = list(input_background) # nick - bug-fix backgorund included the foreground in a tuple. 
# background is the first tuple element missing = foreground.difference(set(background)) if options.strict: assert len(missing) == 0, \ "%i genes in foreground but not in background: %s" % ( len(missing), str(missing)) else: if len(missing) != 0: E.warn("%i genes in foreground that are not in " "background - added to background of %i" % (len(missing), len(background))) background.extend(missing) E.info("(unfiltered) foreground=%i, background=%i" % (len(foreground), len(background))) # sort foreground and background, important for reproducibility # under random seed foreground = sorted(foreground) background = sorted(background) ############################################################# # sanity checks: # are all of the foreground genes in the dataset # missing = set(genes).difference( set(gene2go.keys()) ) # assert len(missing) == 0, "%i genes in foreground set without GO annotation: %s" % (len(missing), str(missing)) ############################################################# # read GO slims and map GO categories to GO slim categories if options.filename_slims: go_slims = GO.GetGOSlims( iotools.open_file(options.filename_slims, "r")) if options.loglevel >= 1: v = set() for x in list(go_slims.values()): for xx in x: v.add(xx) options.stdlog.write( "# read go slims from %s: go=%i, slim=%i\n" % (options.filename_slims, len(go_slims), len(v))) if options.filename_map_slims: if options.filename_map_slims == "-": outfile = options.stdout else: outfile = iotools.open_file( options.filename_map_slims, "w") outfile.write("GO\tGOSlim\n") for go, go_slim in sorted(list(go_slims.items())): outfile.write("%s\t%s\n" % (go, go_slim)) if outfile != options.stdout: outfile.close() gene2go = GO.MapGO2Slims(gene2go, go_slims, ontology=ontology) if options.loglevel >= 1: ngenes, ncategories, nmaps, counts_per_category = \ GO.CountGO(gene2go) options.stdlog.write( "# after go slim filtering: %i genes mapped to " "%i categories (%i maps)\n" % ( ngenes, ncategories, nmaps)) ############################################################# # Just dump out the gene list if options.get_genes: fg, bg, ng = [], [], [] for gene, vv in list(gene2go.items()): for v in vv: if v.mGOId == options.get_genes: if gene in genes: fg.append(gene) elif gene in background: bg.append(gene) else: ng.append(gene) # skip to next GO class if not (bg or ng): continue options.stdout.write( "# genes in GO category %s\n" % options.get_genes) options.stdout.write("gene\tset\n") for x in sorted(fg): options.stdout.write("%s\t%s\n" % ("fg", x)) for x in sorted(bg): options.stdout.write("%s\t%s\n" % ("bg", x)) for x in sorted(ng): options.stdout.write("%s\t%s\n" % ("ng", x)) E.info("nfg=%i, nbg=%i, nng=%i" % (len(fg), len(bg), len(ng))) E.stop() sys.exit(0) ############################################################# outfile = GO.getFileName(options, go=test_ontology, section='foreground', set=genelist_name) outfile.write("gene_id\n%s\n" % ("\n".join(sorted(foreground)))) if options.output_filename_pattern: outfile.close() outfile = GO.getFileName(options, go=test_ontology, section='background', set=genelist_name) # Jethro bug fix - see section 'build background' for assignment outfile.write("gene_id\n%s\n" % ("\n".join(sorted(background)))) if options.output_filename_pattern: outfile.close() ############################################################# # do the analysis go_results = GO.AnalyseGO(gene2go, foreground, background) if len(go_results.mSampleGenes) == 0: E.warn("%s: no genes with GO categories - analysis aborted" % 
genelist_name) continue pairs = list(go_results.mResults.items()) ############################################################# # calculate fdr for each hypothesis if options.fdr: fdrs, samples, method = GO.computeFDRs(go_results, foreground, background, options, test_ontology, gene2go, go2info) for x, v in enumerate(pairs): v[1].mQValue = fdrs[v[0]][0] else: fdrs, samples, method = {}, {}, None msgs.append("fdr=%s" % method) if options.sort_order == "fdr": pairs.sort(key=lambda x: x[1].mQValue) elif options.sort_order == "ratio": pairs.sort(key=lambda x: x[1].mRatio) elif options.sort_order == "pvalue": pairs.sort(key=lambda x: x[1].mPValue) ############################################################# ############################################################# ############################################################# # output the full result outfile = GO.getFileName(options, go=test_ontology, section='overall', set=genelist_name) GO.outputResults( outfile, pairs, go2info, options, fdrs=fdrs, samples=samples) if options.output_filename_pattern: outfile.close() ############################################################# ############################################################# ############################################################# # filter significant results and output filtered_pairs = GO.selectSignificantResults(pairs, fdrs, options) nselected = len(filtered_pairs) nselected_up = len([x for x in filtered_pairs if x[1].mRatio > 1]) nselected_down = len( [x for x in filtered_pairs if x[1].mRatio < 1]) assert nselected_up + nselected_down == nselected outfile = GO.getFileName(options, go=test_ontology, section='results', set=genelist_name) GO.outputResults(outfile, filtered_pairs, go2info, options, fdrs=fdrs, samples=samples) if options.output_filename_pattern: outfile.close() ############################################################# ############################################################# ############################################################# # save results for multi-gene-list analysis all_results.append(pairs) all_significant_results.append(filtered_pairs) all_genelists_with_results.append(genelist_name) ############################################################# ############################################################# ############################################################# # output parameters ngenes, ncategories, nmaps, counts_per_category = \ GO.CountGO(gene2go) outfile = GO.getFileName(options, go=test_ontology, section='parameters', set=genelist_name) nbackground = len(background) if nbackground == 0: nbackground = len(go_results.mBackgroundGenes) outfile.write( "# input go mappings for gene list '%s' and category '%s'\n" % (genelist_name, test_ontology)) outfile.write("parameter\tvalue\tdescription\n") outfile.write("mapped_genes\t%i\tmapped genes\n" % ngenes) outfile.write( "mapped_categories\t%i\tmapped categories\n" % ncategories) outfile.write("mappings\t%i\tmappings\n" % nmaps) outfile.write("genes_in_fg\t%i\tgenes in foreground\n" % len(foreground)) outfile.write( "genes_in_fg_with_assignment\t%i\tgenes in foreground with GO assignments\n" % (len(go_results.mSampleGenes))) outfile.write( "genes_in_bg\t%i\tinput background\n" % nbackground) outfile.write( "genes_in_bg_with_assignment\t%i\tgenes in background with GO assignments\n" % ( len(go_results.mBackgroundGenes))) outfile.write( "associations_in_fg\t%i\tassociations in sample\n" % go_results.mSampleCountsTotal) outfile.write( "associations_in_bg\t%i\tassociations in 
background\n" % go_results.mBackgroundCountsTotal) outfile.write( "percent_genes_in_fg_with_association\t%s\tpercent genes in sample with GO assignments\n" % ( iotools.pretty_percent(len(go_results.mSampleGenes), len(foreground), "%5.2f"))) outfile.write( "percent_genes_in_bg_with_associations\t%s\tpercent genes background with GO assignments\n" % ( iotools.pretty_percent(len(go_results.mBackgroundGenes), nbackground, "%5.2f"))) outfile.write( "significant\t%i\tsignificant results reported\n" % nselected) outfile.write( "significant_up\t%i\tsignificant up-regulated results reported\n" % nselected_up) outfile.write( "significant_down\t%i\tsignificant up-regulated results reported\n" % nselected_down) outfile.write( "threshold\t%6.4f\tsignificance threshold\n" % options.threshold) if options.output_filename_pattern: outfile.close() summary.append("\t".join(map(str, ( genelist_name, test_ontology, nselected, options.threshold, ngenes, ncategories, nmaps, len(foreground), len(go_results.mSampleGenes), nbackground, len(go_results.mBackgroundGenes), go_results.mSampleCountsTotal, go_results.mBackgroundCountsTotal, iotools.pretty_percent( len(go_results.mSampleGenes), len(foreground), "%5.2f"), iotools.pretty_percent( len(go_results.mBackgroundGenes), nbackground, "%5.2f"), ",".join(msgs)))) + "\n") ############################################################# ############################################################# ############################################################# # output the fg patterns outfile = GO.getFileName(options, go=test_ontology, section='withgenes', set=genelist_name) GO.outputResults(outfile, pairs, go2info, options, fdrs=fdrs, samples=samples, gene2go=gene2go, foreground=foreground, gene2name=gene2name) if options.output_filename_pattern: outfile.close() if len(genelists) > 1: ################################################################### # output various summary files # significant results GO.outputMultipleGeneListResults(all_significant_results, all_genelists_with_results, test_ontology, go2info, options, section='significant') # all results GO.outputMultipleGeneListResults(all_results, all_genelists_with_results, test_ontology, go2info, options, section='all') if options.compute_pairwise: GO.pairwiseGOEnrichment(all_results, all_genelists_with_results, test_ontology, go2info, options) outfile_summary = options.stdout outfile_summary.write("".join(summary)) E.stop()
def main(argv=sys.argv): parser = E.ArgumentParser(description=__doc__) # IMS: new method: extend intervals by set amount parser.add_argument("-m", "--method", dest="methods", type=str, action="append", choices=("merge", "filter-genome", "bins", "block", "sanitize-genome", "shift", "extend", "filter-names", "rename-chr"), help="method to apply") parser.add_argument("--num-bins", dest="num_bins", type=int, help="number of bins into which to merge (used for " "method `bins)") parser.add_argument("--bin-edges", dest="bin_edges", type=str, help="bin_edges for binning method") parser.add_argument( "--binning-method", dest="binning_method", type=str, choices=("equal-bases", "equal-intervals", "equal-range"), help="method used for binning (used for method `bins` if no " "bin_edges is given)") parser.add_argument( "--merge-distance", dest="merge_distance", type=int, help="distance in bases over which to merge that are not " "directly adjacent") parser.add_argument( "--merge-min-intervals", dest="merge_min_intervals", type=int, help="only output merged intervals that are build from at least " "x intervals") parser.add_argument("--merge-by-name", dest="merge_by_name", action="store_true", help="only merge intervals with the same name") parser.add_argument( "--merge-and-resolve-blocks", dest="resolve_blocks", action="store_true", help="When merging bed12 entrys, should blocks be resolved?") parser.add_argument("--merge-stranded", dest="stranded", action="store_true", help="Only merge intervals on the same strand") parser.add_argument( "--remove-inconsistent-names", dest="remove_inconsistent_names", action="store_true", help="when merging, do not output intervals where the names of " "overlapping intervals do not match") parser.add_argument("--offset", dest="offset", type=int, help="offset for shifting intervals") parser.add_argument("-g", "--genome-file", dest="genome_file", type=str, help="filename with genome.") parser.add_argument("-b", "--bam-file", dest="bam_file", type=str, help="bam-formatted filename with genome.") parser.add_argument("--filter-names-file", dest="names", type=str, help="list of names to keep. One per line") parser.add_argument( "--rename-chr-file", dest="rename_chr_file", type=str, help="mapping table between old and new chromosome names." "TAB separated 2-column file.") parser.set_defaults(methods=[], merge_distance=0, binning_method="equal-bases", merge_by_name=False, genome_file=None, rename_chr_file=None, bam_file=None, num_bins=5, merge_min_intervals=1, bin_edges=None, offset=10000, test=None, extend_distance=1000, remove_inconsistent_names=False, resolve_blocks=False) (args) = E.start(parser, add_pipe_options=True) contigs = None chr_map = None # Why provide full indexed genome, when a tsv of contig sizes would do? 
if args.genome_file: genome_fasta = IndexedFasta.IndexedFasta(args.genome_file) contigs = genome_fasta.getContigSizes() if args.bam_file: samfile = pysam.AlignmentFile(args.bam_file) contigs = dict(list(zip(samfile.references, samfile.lengths))) if args.rename_chr_file: chr_map = {} with open(args.rename_chr_file, 'r') as filein: reader = csv.reader(filein, delimiter='\t') for row in reader: if len(row) != 2: raise ValueError( "Mapping table must have exactly two columns") chr_map[row[0]] = row[1] if not len(chr_map.keys()) > 0: raise ValueError("Empty mapping dictionnary") processor = Bed.iterator(args.stdin) for method in args.methods: if method == "filter-genome": if not contigs: raise ValueError("please supply contig sizes") processor = filterGenome(processor, contigs) elif method == "sanitize-genome": if not contigs: raise ValueError("please supply contig sizes") processor = sanitizeGenome(processor, contigs) elif method == "merge": processor = merge( processor, args.merge_distance, by_name=args.merge_by_name, min_intervals=args.merge_min_intervals, remove_inconsistent=args.remove_inconsistent_names, resolve_blocks=args.resolve_blocks, stranded=args.stranded) elif method == "bins": if args.bin_edges: bin_edges = list(map(float, args.bin_edges.split(","))) # IMS: check bin edges are valid if not (len(bin_edges) == args.num_bins + 1): raise ValueError( "Number of bin edge must be one more than " "number of bins") else: bin_edges = None processor, bin_edges = Bed.binIntervals(processor, num_bins=args.num_bins, method=args.binning_method, bin_edges=bin_edges) E.info("# split bed: bin_edges=%s" % (str(bin_edges))) elif method == "block": processor = Bed.blocked_iterator(processor) elif method == "shift": # IMS: test that contig sizes are availible if not contigs: raise ValueError("please supply genome file") processor = shiftIntervals(processor, contigs, offset=args.offset) # IMS: new method: extend intervals by set amount elif method == "extend": if not contigs: raise ValueError("please supply genome file") processor = extendInterval(processor, contigs, args.offset) elif method == "filter-names": if not args.names: raise ValueError("please supply list of names to filter") names = [name.strip() for name in open(args.names)] processor = filterNames(processor, names) elif method == "rename-chr": if not chr_map: raise ValueError("please supply mapping file") processor = renameChromosomes(processor, chr_map) noutput = 0 for bed in processor: args.stdout.write(str(bed) + "\n") noutput += 1 E.info("noutput=%i" % (noutput)) E.stop()
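# Illustrative sketch of the --rename-chr-file parsing above: a two-column,
# tab-separated table mapping old to new chromosome names is read into a dict.
import csv

def read_chromosome_map(path):
    """Return {old_name: new_name} from a TAB-separated two-column file."""
    chr_map = {}
    with open(path, newline="") as filein:
        for row in csv.reader(filein, delimiter="\t"):
            if len(row) != 2:
                raise ValueError("Mapping table must have exactly two columns")
            chr_map[row[0]] = row[1]
    if not chr_map:
        raise ValueError("Empty mapping dictionary")
    return chr_map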