def sort(self, key=None):
    """Sort the bins in this array (in-place).

    Optional argument 'key' is one of:

    - a function that computes a sorting key from a CopyNumArray row
    - a string identifier for an existing data column
    - a list/array/iterable of precomputed keys equal in length to the
      number of rows in this CopyNumArray.

    By default, bins are sorted by chromosomal coordinates.
    """
    if key is None:
        # Sort by chrom, then by start position
        chrom_keys = list(map(core.sorter_chrom, self.data['chromosome']))
        order = numpy.lexsort((self.data['start'], chrom_keys))
    else:
        # Sort by the given key, using a stable sort algorithm
        if isinstance(key, str):
            keys = self.data[key]
        elif callable(key):
            keys = list(map(key, self.data))
        else:
            if not len(key) == len(self):
                raise ValueError("Sort key, as an array, must have the "
                                 "same length as the CopyNumArray to sort "
                                 "(%d vs. %d)." % (len(key), len(self)))
            keys = key
        order = numpy.argsort(keys, kind='mergesort')
    self.data = self.data.take(order)

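# --- Hypothetical usage sketch (not from the original source) ---
# Assuming `cnarr` is a CopyNumArray instance, the three accepted forms of
# `key` look like this; 'gene' is just an example column name.
#
#   cnarr.sort()                                           # chromosome, then start
#   cnarr.sort(key='gene')                                 # an existing column
#   cnarr.sort(key=lambda row: row['end'] - row['start'])  # computed per row
#   cnarr.sort(key=[2, 0, 1])                              # precomputed keys, one per bin
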
def bedcov(bed_fname, bam_fname, min_mapq):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.
    """
    # Count bases in each region; exclude low-MAPQ reads
    if min_mapq > 0:
        bedcov_args = ['-Q', str(min_mapq)]
    else:
        bedcov_args = []
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, *bedcov_args)
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in "
                         "BAM file %r" % (bed_fname, bam_fname))
    # Return an iterable...
    for line in lines:
        try:
            chrom, start_s, end_s, name, basecount_s = line.split('\t')
        except ValueError:
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = map(int, (start_s, end_s, basecount_s.strip()))
        span = end - start
        if span > 0:
            # Algebra from above
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth

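# --- Hypothetical usage sketch (not from the original source) ---
# File names below are placeholders; READ_LEN is a module-level constant
# assumed to be defined elsewhere in this module.
#
#   for chrom, start, end, name, count, depth in bedcov("targets.bed",
#                                                       "sample.bam",
#                                                       min_mapq=30):
#       print("%s:%d-%d %s reads~%.1f depth=%.2f"
#             % (chrom, start, end, name, count, depth))
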
def __eq__(self, other):
    if len(self.data) != len(other.data):
        return False
    return all(x == y for x, y in zip(self.data, other.data))

def read_fasta_index(fasta_fname):
    """Load a FASTA file's index.

    Returns a dict of:
        {seq_id: (length, offset, chars_per_line, bytes_per_line), ...}

    The index file contains, in one row per sequence, tab-separated columns:

    - sequence identifier
    - length
    - offset of the first sequence character in the file
    - number of characters per line
    - number of bytes per line (including the end-of-line character)

    With this information, we can easily compute the byte offset of the i-th
    character of a sequence in a file by looking at its index record. We skip
    to this byte offset in the file and from there, we can read the necessary
    sequence characters.

    See: http://trac.seqan.de/wiki/Tutorial/IndexedFastaIO
    """
    # Build a dict of keys -> offsets
    index = {}
    fai_fname = ensure_fasta_index(fasta_fname)
    with open(fai_fname) as faifile:
        for line in faifile:
            fields = line.rstrip().split('\t')
            seq_id = fields[0]
            assert seq_id not in index, "Duplicate ID: " + seq_id
            index[fields[0]] = tuple(map(int, fields[1:]))
    return index

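# --- Worked example (my own sketch, not part of the original module) ---
# Given one index record, compute the byte offset of the i-th (0-based)
# character of a sequence, as described in the docstring above.
def _fasta_byte_offset(index_record, i):
    """index_record is (length, offset, chars_per_line, bytes_per_line)."""
    length, offset, chars_per_line, bytes_per_line = index_record
    assert 0 <= i < length
    line_no, col = divmod(i, chars_per_line)
    return offset + line_no * bytes_per_line + col

# e.g. for a 60-character-per-line FASTA with Unix newlines (61 bytes per line):
#   _fasta_byte_offset((1000, 6, 60, 61), 120) == 6 + 2 * 61 + 0 == 128
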
def bedcov(bed_fname, bam_fname):
    """Calculate depth of all regions in a BED file via samtools (pysam) bedcov.

    i.e. mean pileup depth across each region.
    """
    # Count bases in each region; exclude 0-MAPQ reads
    try:
        lines = pysam.bedcov(bed_fname, bam_fname, '-Q', '1')
    except pysam.SamtoolsError as exc:
        raise ValueError("Failed processing %r coverages in %r regions. "
                         "PySAM error: %s" % (bam_fname, bed_fname, exc))
    if not lines:
        raise ValueError("BED file %r sequence IDs don't match any in "
                         "BAM file %r" % (bed_fname, bam_fname))
    # Return an iterable...
    for line in lines:
        try:
            chrom, start_s, end_s, name, basecount_s = line.split('\t')
        except ValueError:
            raise RuntimeError("Bad line from bedcov:\n" + line)
        start, end, basecount = map(int, (start_s, end_s, basecount_s.strip()))
        span = end - start
        if span > 0:
            # Algebra from above
            count = basecount / READ_LEN
            mean_depth = basecount / span
        else:
            # User-supplied bins might be oddly constructed
            count = mean_depth = 0
        yield chrom, start, end, name, count, mean_depth

def reference2regions(reference, coord_only=False):
    """Extract iterables of target and antitarget regions from a reference.

    Like loading two BED files with ngfrills.parse_regions.
    """
    cna2rows = (_cna2coords if coord_only else _cna2regions)
    return map(cna2rows, _ref_split_targets(reference))

def get_max_abs_value(sample_data):
    val = 0
    for sample in sample_data:
        for chrom in sample:
            vals = [data[2] for data in sample[chrom]]
            vals = [v for v in vals if v != -1000]  # -1000 marks missing data
            val = max(val, max(map(abs, vals)))
    return val

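# --- Minimal worked example (my own sketch) of the expected `sample_data`
# layout: one dict per sample, keyed by chromosome, holding
# (start, end, value) tuples; the sentinel -1000 marks missing values.
#
#   _example = [
#       {"chr1": [(0, 100, 0.5), (100, 200, -1000)]},
#       {"chr2": [(0, 100, -1.2)]},
#   ]
#   get_max_abs_value(_example)  # -> 1.2
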
def _sniff_xtra(header_line):
    colnames = tuple(map(str.strip, header_line.split('\t')))
    assert colnames[:5] == ('chromosome', 'start', 'end', 'gene', 'log2'), \
        colnames[:5]
    xtra = colnames[5:]
    assert all(x in ('gc', 'rmask', 'spread', 'weight', 'probes')
               for x in xtra)
    return xtra

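# --- Illustrative header (my own example): the five fixed columns followed by
# two recognized optional columns; this call would return ('gc', 'rmask').
#
#   _sniff_xtra("chromosome\tstart\tend\tgene\tlog2\tgc\trmask")
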
def parse_line(line):
    fields = line.rstrip().split('\t')
    coverage, chrom, start, end, gene = fields[1:6]
    outrow = [chrom, int(start), int(end), gene, float(coverage)]
    # Parse extra fields as numbers (common type: float)
    rest = list(map(float, fields[6:]))
    core.assert_equal("Number of extra columns parsed doesn't match "
                      "extra fields given",
                      **{"extra columns": len(rest),
                         "extra fields": len(xtra)})
    return tuple(outrow + rest)

def read_class_color(path):
    # In the case of categorical data, the user should provide a tab-separated
    # table mapping category labels to RGB values, e.g.:
    #   positive    0.9 0.1 0.1
    #   negative    0.1 0.1 0.9
    cols = collections.OrderedDict()
    with open(path) as handle:
        for line in handle:
            toks = line.strip().split('\t')
            cols[toks[0]] = tuple(map(float, toks[1:]))
    return cols

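# --- Hypothetical usage sketch: given a tab-separated mapping file like the
# one described above (path is a placeholder), the returned dict maps each
# label to an (r, g, b) tuple.
#
#   colors = read_class_color("class_colors.tsv")
#   colors["positive"]  # -> (0.9, 0.1, 0.1)
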
def warn_bad_probes(probes):
    """Warn about target probes where coverage is poor.

    Prints a formatted table to stderr.
    """
    bad_probes = probes[mask_bad_probes(probes)]
    fg_index = (bad_probes['gene'] != 'Background')
    fg_bad_probes = bad_probes[fg_index]
    if len(fg_bad_probes) > 0:
        # ENH: print coverage and spread too
        bad_pct = 100 * len(fg_bad_probes) / sum(probes['gene'] != 'Background')
        echo("*WARNING*", len(fg_bad_probes), "targets",
             "(%.4f)" % bad_pct + '%', "failed filters:")
        gene_cols = max(map(len, fg_bad_probes['gene']))
        labels = list(map(row2label, fg_bad_probes))
        chrom_cols = max(map(len, labels))
        last_gene = None
        for label, probe in zip(labels, fg_bad_probes):
            if probe['gene'] == last_gene:
                gene = ' "'
            else:
                gene = probe['gene']
                last_gene = gene
            if 'rmask' in probes:
                print(" %s %s coverage=%.3f spread=%.3f rmask=%.3f"
                      % (gene.ljust(gene_cols), label.ljust(chrom_cols),
                         probe['coverage'], probe['spread'], probe['rmask']))
            else:
                print(" %s %s coverage=%.3f spread=%.3f"
                      % (gene.ljust(gene_cols), label.ljust(chrom_cols),
                         probe['coverage'], probe['spread']))
    # Count the number of BG probes dropped, too (names are all "Background")
    bg_bad_probes = bad_probes[~fg_index]
    if len(bg_bad_probes) > 0:
        bad_pct = 100 * len(bg_bad_probes) / sum(probes['gene'] == 'Background')
        echo("Antitargets:", len(bg_bad_probes),
             "(%.4f)" % bad_pct + '%', "failed filters")

def warn_bad_probes(probes):
    """Warn about target probes where coverage is poor.

    Prints a formatted table to stderr.
    """
    bad_probes = probes[fix.mask_bad_probes(probes)]
    fg_index = (bad_probes['gene'] != 'Background')
    fg_bad_probes = bad_probes[fg_index]
    if len(fg_bad_probes) > 0:
        bad_pct = 100 * len(fg_bad_probes) / sum(probes['gene'] != 'Background')
        logging.info("Targets: %d (%s) bins failed filters:",
                     len(fg_bad_probes), "%.4f" % bad_pct + '%')
        gene_cols = max(map(len, fg_bad_probes['gene']))
        labels = list(map(CNA.row2label, fg_bad_probes))
        chrom_cols = max(map(len, labels))
        last_gene = None
        for label, probe in zip(labels, fg_bad_probes):
            if probe['gene'] == last_gene:
                gene = ' "'
            else:
                gene = probe['gene']
                last_gene = gene
            if 'rmask' in probes:
                logging.info(" %s %s coverage=%.3f spread=%.3f rmask=%.3f",
                             gene.ljust(gene_cols), label.ljust(chrom_cols),
                             probe['log2'], probe['spread'], probe['rmask'])
            else:
                logging.info(" %s %s coverage=%.3f spread=%.3f",
                             gene.ljust(gene_cols), label.ljust(chrom_cols),
                             probe['log2'], probe['spread'])
    # Count the number of BG probes dropped, too (names are all "Background")
    bg_bad_probes = bad_probes[~fg_index]
    if len(bg_bad_probes) > 0:
        bad_pct = 100 * len(bg_bad_probes) / sum(probes['gene'] == 'Background')
        logging.info("Antitargets: %d (%s) bins failed filters",
                     len(bg_bad_probes), "%.4f" % bad_pct + '%')

def merge_rows(rows):
    """Combine equivalent rows of coverage data across multiple samples.

    Check that probe info matches across all samples, then merge the log2
    coverage values.

    Input: a list of individual rows corresponding to the same probes from
    different coverage files.
    Output: a list starting with the single common Probe object, followed by
    the log2 coverage values from each sample, in order.
    """
    probe_infos, coverages = zip(*map(row_to_probe_coverage, rows))
    probe_info = core.check_unique(probe_infos, "probe Name")
    combined_row = [probe_info] + list(coverages)
    return combined_row

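# --- Shape sketch (my own illustration, values hypothetical): for three
# samples covering the same probe, merge_rows returns something like
#   [<Probe chr1:100-300 geneX>, -0.12, 0.05, -0.33]
# i.e. one shared probe record followed by one log2 value per sample.
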
def write(self, outfile=sys.stdout):
    """Write coverage data to a file or handle in tabular format.

    This is similar to BED or BedGraph format, but with extra columns.

    To combine multiple samples in one file and/or convert to another format,
    see the 'export' subcommand.
    """
    colnames = ['chromosome', 'start', 'end', 'gene', 'log2']
    colnames.extend(self._xtra)
    rows = (list(map(str, row)) for row in self.data)
    with ngfrills.safe_write(outfile) as handle:
        header = '\t'.join(colnames) + '\n'
        handle.write(header)
        handle.writelines('\t'.join(row) + '\n' for row in rows)

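# --- Illustrative output (my own example values): tab-separated, one header
# line, then one row per bin; optional columns such as 'gc' or 'weight'
# would follow 'log2' when present.
#
#   chromosome    start   end     gene    log2
#   chr1          100     300     TP53    -0.42
#   chr1          300     500     TP53    -0.38
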
def _get_coords(filename):
    alb = open(filename)
    start_line = None
    end_line = None
    for line in alb:
        if line.startswith("["):
            if not start_line:
                start_line = line  # rstrip not needed
            else:
                end_line = line
    if end_line is None:  # sequence is too short
        return [(0, 0), (0, 0)]
    # returns [(start0, end0), (start1, end1)]
    return list(zip(*map(_alb_line2coords, [start_line, end_line])))

def squash_genes(self, ignore=('-', 'CGH', '.'), squash_background=False,
                 summary_stat=metrics.biweight_location):
    """Combine consecutive bins with the same targeted gene name.

    The `ignore` parameter lists bin names that should not be counted as
    genes in the output.

    Parameter `summary_stat` is a function that summarizes an array of
    coverage values to produce the "squashed" gene's coverage value. By
    default this is the biweight location, but you might want median, mean,
    max, min or something else in some cases.

    Optional columns, if present, are dropped.
    """
    def squash_rows(name, rows):
        """Combine multiple rows (for the same gene) into one row."""
        chrom = core.check_unique(rows['chromosome'], 'chromosome')
        start = rows[0]['start']
        end = rows[-1]['end']
        cvg = summary_stat(rows['coverage'])
        outrow = [chrom, start, end, name, cvg]
        # Handle extra fields
        # ENH - no coverage stat; do weighted average as appropriate
        for xfield in ('gc', 'rmask', 'spread', 'weight'):
            if xfield in self:
                outrow.append(summary_stat(rows[xfield]))
        if 'probes' in self:
            outrow.append(sum(rows['probes']))
        return tuple(outrow)

    outrows = []
    for name, subarr in self.by_gene(ignore):
        if name == 'Background' and not squash_background:
            outrows.extend(map(tuple, subarr))
        else:
            outrows.append(squash_rows(name, subarr))
    return self.to_rows(outrows)

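# --- Hypothetical usage sketch (not from the original source): `cnarr` is a
# CopyNumArray; numpy.median stands in for the default biweight location.
#
#   by_gene = cnarr.squash_genes(summary_stat=numpy.median,
#                                squash_background=False)
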
def do_segmentation(cnarr, method, threshold=None, variants=None,
                    skip_low=False, skip_outliers=10, save_dataframe=False,
                    rlibpath=None, processes=1):
    """Infer copy number segments from the given coverage table."""
    # XXX parallel flasso segfaults in R when run on a single chromosome
    if processes == 1 or method == 'flasso':
        cna = _do_segmentation(cnarr, method, threshold, variants, skip_low,
                               skip_outliers, save_dataframe, rlibpath)
        if save_dataframe:
            cna, rstr = cna
            rstr = _to_str(rstr)
    else:
        with futures.ProcessPoolExecutor(processes) as pool:
            rets = list(pool.map(_ds, ((ca, method, threshold, variants,
                                        skip_low, skip_outliers,
                                        save_dataframe, rlibpath)
                                       for _, ca in cnarr.by_chromosome())))
        if save_dataframe:
            # rets is a list of (CNA, R dataframe string) -- unpack
            rets, r_dframe_strings = zip(*rets)
            # Strip the header line from all but the first dataframe, then combine
            r_dframe_strings = map(_to_str, r_dframe_strings)
            rstr = [next(r_dframe_strings)]
            rstr.extend(r[r.index('\n') + 1:] for r in r_dframe_strings)
            rstr = "".join(rstr)
        cna = cnarr.concat(rets)

    if save_dataframe:
        return cna, rstr
    return cna

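# --- Hypothetical usage sketch (not from the original source): segment a
# coverage table across 4 worker processes; `cnarr` is assumed to be the
# coverage table loaded elsewhere, and 'cbs' is assumed to be a supported
# method name alongside 'flasso'.
#
#   segments = do_segmentation(cnarr, 'cbs', processes=4)
#   segments, r_dataframe = do_segmentation(cnarr, 'cbs', save_dataframe=True)
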
def create_heatmap(filenames, index, colormap, crange, show_chromosome=None):
    """Plot signal for multiple samples as a heatmap.

    Signal can be a continuous or categorical value. In the case of numeric
    values, a continuous color scale will be used to map values to colors.
    In the case of categorical values, the user should provide a mapping of
    values to colors as an input.
    """
    _fig = pyplot.figure(figsize=(12, 8))
    gs = matplotlib.gridspec.GridSpec(2, 1, height_ratios=[30, 1])
    axis = pyplot.subplot(gs[0])
    axis_aux = pyplot.subplot(gs[1])

    # List sample names on the y-axis
    axis.set_yticks([i + 0.5 for i in range(len(filenames))])
    axis.set_yticklabels(list(map(fbase, filenames)))
    axis.invert_yaxis()
    axis.set_ylabel("Samples")
    axis.set_axis_bgcolor('#DDDDDD')

    # Group each file's probes/segments by chromosome
    sample_data = [collections.defaultdict(list) for _f in filenames]

    #-------------------------------------------------------------#
    # Read in the signal value in each sample from the input files
    index = int(index)
    for i, fname in enumerate(filenames):
        f_h = open(fname)
        for line in f_h:
            toks = line.strip().split('\t')
            if toks[0].lower() in ['chrom', 'chromosome']:
                # header line
                continue
            if colormap is None:
                # Numerical data: convert signal to floating-point numbers
                sample_data[i][toks[0]].append(
                    (int(toks[1]), int(toks[2]), ifloat(toks[index])))
            else:
                sample_data[i][toks[0]].append(
                    (int(toks[1]), int(toks[2]), toks[index]))
        f_h.close()

    #-------------------------------------------------------------#
    # Calculate the size (max endpoint value) of each chromosome
    chrom_sizes = {}
    for row in sample_data:
        for chrom, data in iteritems(row):
            max_posn = max(coord[1] for coord in data)
            chrom_sizes[chrom] = max(max_posn, chrom_sizes.get(chrom, 0))
    chrom_sizes = collections.OrderedDict(
        sorted(iteritems(chrom_sizes), key=sorter_chrom_at(0)))

    if colormap is not None:
        cvg2rgb = CVG2RGB(colormap=colormap)
    else:
        if crange is not None:
            vmax = float(crange)
            vmin = (-1) * vmax
        else:
            vmax = get_max_abs_value(sample_data)
            vmin = (-1) * vmax
        # This matplotlib colormap is appropriate for diverging data, e.g.
        # copy number values; it can be substituted by any matplotlib colormap
        my_cmap = matplotlib.cm.seismic
        color_norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
        cvg2rgb = CVG2RGB(rcmap=my_cmap, color_norm=color_norm)

    def plot_rect(y_idx, x_start, x_end, cvg):
        """Draw a rectangle in the given coordinates and color."""
        x_coords = (x_start, x_start, x_end + 1, x_end + 1)
        y_coords = (y_idx, y_idx + 1, y_idx + 1, y_idx)
        if cvg in [-1000.0, 'NA']:
            # Missing data: a shade of dark gray
            rgbcolor = (0.3, 0.3, 0.3)
        else:
            rgbcolor = cvg2rgb.get_color(cvg)
        axis.fill(x_coords, y_coords, color=rgbcolor)

    if show_chromosome:
        # Lay out only the selected chromosome
        chrom_offsets = {show_chromosome: 0.0}
        # Set x-axis to the chromosomal positions (in Mb), title to the chromosome
        axis.set_xlim(0, chrom_sizes[show_chromosome] * MB)
        axis.set_title(show_chromosome)
        axis.set_xlabel("Position (Mb)")
        axis.tick_params(which='both', direction='out')
        axis.get_xaxis().tick_bottom()
        axis.get_yaxis().tick_left()
        # Plot the individual probe/segment coverages
        for i, row in enumerate(sample_data):
            for start, end, cvg in row[show_chromosome]:
                plot_rect(i, start * MB, end * MB, cvg)
    else:
        # Lay out chromosome dividers and x-axis labels
        # (Just enough padding to avoid overlap with the divider line)
        chrom_offsets = plot_x_dividers(axis, chrom_sizes, 1)
        # Plot the individual probe/segment coverages
        for i, row in enumerate(sample_data):
            for chrom, curr_offset in iteritems(chrom_offsets):
                for start, end, cvg in row[chrom]:
                    plot_rect(i, start + curr_offset, end + curr_offset, cvg)
        plot_y_dividers(axis, sample_data, color=(0.4, 0.4, 0.4))

    if colormap is not None:
        # Build a discrete colorbar from the label -> color mapping
        cols = list(cvg2rgb.label2color.items())
        cols = [(index, cols[index][0], cols[index][1])
                for index in range(len(cols))]
        colors = [list(x[2]) for x in cols]
        labels = [x[1] for x in cols]
        cmap = matplotlib.colors.ListedColormap(colors)
        bounds = [col[0] + 0.5 for col in cols]
        bounds = [-0.5] + bounds
        norm = matplotlib.colors.BoundaryNorm(bounds, cmap.N)
        cb2 = matplotlib.colorbar.ColorbarBase(axis_aux, cmap=cmap,
                                               boundaries=bounds, norm=norm,
                                               ticks=bounds,
                                               orientation='horizontal')
        cb2.set_ticks([x + 0.5 for x in bounds])
        cb2.set_ticklabels(labels)
    else:
        cb1 = matplotlib.colorbar.ColorbarBase(axis_aux, cmap=my_cmap,
                                               norm=color_norm,
                                               orientation='horizontal')

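# --- Hypothetical usage sketch (not from the original source): plot the log2
# column (here assumed to be column 4 of each tab-separated input file) for
# three samples, using the default diverging colormap for numeric data.
# File names and the chromosome label are placeholders.
#
#   create_heatmap(["s1.cnr", "s2.cnr", "s3.cnr"], index=4,
#                  colormap=None, crange=None, show_chromosome="chr7")
#   pyplot.savefig("heatmap.png")
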
def train(training_set, results, feature_fns, update_fn=None,
          max_iis_iterations=10000, iis_converge=1.0e-5,
          max_newton_iterations=100, newton_converge=1.0e-10):
    """Train a maximum entropy classifier, returns MaxEntropy object.

    Train a maximum entropy classifier on a training set.
    training_set is a list of observations.  results is a list of the
    class assignments for each observation.  feature_fns is a list of
    the features.  These are callback functions that take an
    observation and class and return a 1 or 0.  update_fn is a
    callback function that is called at each training iteration.  It
    is passed a MaxEntropy object that encapsulates the current state
    of the training.

    The maximum number of iterations and the convergence criterion for IIS
    are given by max_iis_iterations and iis_converge, respectively, while
    max_newton_iterations and newton_converge are the maximum number of
    iterations and the convergence criterion for Newton's method.
    """
    if not training_set:
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Rename variables for convenience.
    xs, ys = training_set, results

    # Get a list of all the classes that need to be trained.
    classes = sorted(set(results))

    # Cache values for all features.
    features = [_eval_feature_fn(fn, training_set, classes)
                for fn in feature_fns]
    # Cache values for f#.
    f_sharp = _calc_f_sharp(len(training_set), len(classes), features)

    # Pre-calculate the empirical expectations of the features.
    e_empirical = _calc_empirical_expects(xs, ys, classes, features)

    # Now train the alpha parameters to weigh each feature.
    alphas = [0.0] * len(features)
    iters = 0
    while iters < max_iis_iterations:
        nalphas = _train_iis(xs, classes, features, f_sharp, alphas,
                             e_empirical, max_newton_iterations,
                             newton_converge)
        # Total absolute change in the alphas this iteration
        # (`reduce` is functools.reduce on Python 3)
        diff = map(lambda x, y: numpy.fabs(x - y), alphas, nalphas)
        diff = reduce(lambda x, y: x + y, diff, 0)
        alphas = nalphas

        me = MaxEntropy()
        me.alphas, me.classes, me.feature_fns = alphas, classes, feature_fns
        if update_fn is not None:
            update_fn(me)

        if diff < iis_converge:  # converged
            break
        iters += 1  # count iterations so max_iis_iterations actually bounds the loop
    else:
        raise RuntimeError("IIS did not converge")

    return me

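# --- Hypothetical usage sketch (not from the original source): a tiny
# training set with one hand-written feature function following the callback
# signature described above, (observation, class) -> 1 or 0. All names and
# values here are illustrative only.
#
#   observations = [["rich", "tall"], ["poor", "short"],
#                   ["rich", "short"], ["poor", "tall"]]
#   labels = ["yes", "no", "yes", "no"]
#
#   def is_rich(obs, klass):
#       return 1 if ("rich" in obs and klass == "yes") else 0
#
#   me = train(observations, labels, [is_rich])
#   print(me.classes, me.alphas)
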