def run(self, verbose=False):
    """
    Read the input file and write it out as numbered pieces in the
    target directory.

    Each piece holds a group of paragraphs; every paragraph is written
    as its lines followed by one blank line.
    @param verbose: True if we want to report our progress
    """
    # the progress bar is measured in bytes of the input file
    if verbose:
        pbar = Progress.Bar(os.path.getsize(self.filename))
        nbytes_current_approx = 0
    with open(self.filename) as fin:
        paragraphs = Util.gen_paragraphs(fin)
        pieces = gen_line_list_lists(paragraphs, self.approx_lines_per_piece)
        for piece_index, line_lists in enumerate(pieces):
            piece_filename = piece_index_to_filename(
                piece_index, self.filename)
            piece_path = os.path.join(self.target_directory, piece_filename)
            with open(piece_path, 'w') as fout:
                for line_list in line_lists:
                    # the paragraph, its newline, and a separating blank line
                    fout.write('\n'.join(line_list) + '\n')
                    fout.write('\n')
            if verbose:
                # approximate because line terminators are not counted
                nbytes_current_approx += sum(
                    len(line) for line_list in line_lists
                    for line in line_list)
                pbar.update(nbytes_current_approx)
    # possibly stop the progress bar
    if verbose:
        pbar.finish()
def __init__(self, verbose=False):
    """
    Try to import every candidate module and collect its test classes.

    Import failures are recorded instead of raised.
    @param verbose: accepted for interface compatibility; not read here
    """
    self.test_classes = []
    self.imported_module_names = []
    self.module_import_errors = []
    self.imports_with_tests = []
    # the names of the modules that we will try to import
    self.all_module_names = get_module_names()
    pbar = Progress.Bar(len(self.all_module_names))
    for nattempted, module_name in enumerate(self.all_module_names, 1):
        try:
            module = __import__(module_name, globals(), locals())
        except ImportError as e:
            self.module_import_errors.append(e)
        else:
            self.imported_module_names.append(module_name)
            found = []
            for candidate in module.__dict__.values():
                try:
                    is_test_case = issubclass(candidate, unittest.TestCase)
                except TypeError:
                    # issubclass rejects module attributes that are not classes
                    continue
                if is_test_case:
                    found.append(candidate)
            if found:
                self.imports_with_tests.append(module_name)
                self.test_classes.extend(found)
        # one tick per attempted module, whether or not it imported
        pbar.update(nattempted)
def run(self, verbose=False):
    """
    Create the chromosome list file and one index file per chromosome.

    This might take a while because every fasta piece is read.
    @param verbose: True if we want to write our progress to stderr
    """
    # map each chromosome string to its (first, last, piece index) rows
    rows_by_chromosome = {}
    piece_filenames = sorted(os.listdir(self.pieces_directory))
    if verbose:
        sys.stderr.write('creating a dictionary from the fasta pieces:\n')
        pbar = Progress.Bar(len(piece_filenames))
    for nfiles_read, piece_filename in enumerate(piece_filenames, 1):
        piece_index = piece_filename_to_index(piece_filename)
        piece_pathname = os.path.join(self.pieces_directory, piece_filename)
        for chrom_string, first_i, last_i in gen_elements(piece_pathname):
            rows_by_chromosome.setdefault(chrom_string, []).append(
                (first_i, last_i, piece_index))
        if verbose:
            pbar.update(nfiles_read)
    # define the list of chromosome strings
    chromosome_strings = sorted(rows_by_chromosome)
    assert len(chromosome_strings) < 1000
    # write the list of valid chromosome strings
    with open(self.chromosome_list_filename, 'w') as fout:
        fout.write('\n'.join(chromosome_strings))
    if verbose:
        sys.stderr.write(
            'wrote ' + str(self.chromosome_list_filename) + '\n')
        sys.stderr.write('writing the index files:\n')
        pbar = Progress.Bar(len(chromosome_strings))
    # for each chromosome string write the sorted rows to its index file
    for nwritten, chromosome_string in enumerate(chromosome_strings, 1):
        index_pathname = os.path.join(
            self.index_directory, chromosome_string + '.index')
        with open(index_pathname, 'w') as fout:
            for row in sorted(rows_by_chromosome[chromosome_string]):
                fout.write('%d\t%d\t%d\n' % row)
        if verbose:
            pbar.update(nwritten)
def process(pathnames, good_coverage, bad_coverage, randomization_rate,
            nseconds, use_pbar):
    """
    Validate each input file and feed its data rows to the oracle.

    @param pathnames: paths to files to process
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: documented upstream as the multi-line string of the resulting
    csv file; NOTE(review): no return statement is present in this span
    """
    # define the three models and the oracle that uses them
    models = [
        ReadCoverage.Homozygous(randomization_rate, good_coverage),
        ReadCoverage.Heterozygous(randomization_rate, good_coverage),
        ReadCoverage.Overcovered(randomization_rate, bad_coverage),
    ]
    cache_size = 100000
    oracle = Oracle(models, cache_size)
    # do some initialization
    out = StringIO()
    start_time = time.time()
    nfiles = len(pathnames)
    # a progress bar is only shown when there are several files
    pbar = None
    if use_pbar and nfiles > 1:
        pbar = Progress.Bar(nfiles)
    chromosome_dict = {}
    termination_reason = 'finished the analysis'
    try:
        for i, pathname in enumerate(pathnames):
            # read the stripped nonempty lines
            with open(pathname) as fin:
                stripped = [raw.strip() for raw in fin.readlines()]
            lines = [line for line in stripped if line]
            # validate the number of lines
            if len(lines) < 2:
                raise ValueError(
                    'there should be at least two lines of input')
            # break the lines of input into rows of elements
            rows = [line_to_row(line) for line in lines]
            # validate the columns of data
            ncolumns_expected = 8
            ncolumns = len(rows[0])
            if ncolumns != ncolumns_expected:
                raise ValueError('expected %d columns of input' %
                                 ncolumns_expected)
            if any(len(row) != ncolumns for row in rows):
                raise ValueError(
                    'each row of input should have the same number of elements as the first row'
                )
            # process the data rows, skipping the header row
            for row in rows[1:]:
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                process_genomic_position(row, chromosome_dict, oracle)
            if pbar:
                pbar.update(i + 1)
    except KeyboardInterrupt as e:
        termination_reason = 'early termination by control-c'
def main(args):
    """
    Split the input file into one output file per chromosome.

    The input is read three times: once to discover the chromosome names,
    once to check types and monotonicity, and once to write the rows.
    @param args: positional and flaglike arguments
    """
    # read the arguments
    input_filename = os.path.abspath(os.path.expanduser(args.infile))
    output_directory = os.path.abspath(os.path.expanduser(args.outdir))
    force = args.force
    # make sure that the output directory exists
    if not os.path.isdir(output_directory):
        if force:
            os.makedirs(output_directory)
    if not os.path.isdir(output_directory):
        msg = 'output directory does not exist: ' + output_directory
        raise Exception(msg)
    # scan the input file for chromosome names
    ch_paths = []
    skimmer = DGRP.ChromoSkimmer()
    with open(input_filename) as fin:
        for chromo_name in skimmer.skim(gen_untyped_rows(fin)):
            output_filename = args.out_prefix + chromo_name + args.out_suffix
            ch_path = os.path.join(output_directory, output_filename)
            ch_paths.append(ch_path)
            # without the force flag, refuse to overwrite existing output
            if not force:
                if os.path.exists(ch_path):
                    raise Exception('output already exists: ' + ch_path)
    chromo_names = skimmer.name_list
    nlines = skimmer.linecount
    # start the progress bar; two of the passes tick once per row
    nticks = 2 * nlines
    pbar = Progress.Bar(nticks)
    # scan the input file for correct types and for monotonicity
    with open(input_filename) as fin:
        for _ in DGRP.check_chromo_monotonicity(gen_typed_rows(fin)):
            pbar.increment()
    # Create the files open for writing.
    # The finally clause closes every file that was opened,
    # even if opening a later file or writing a row fails.
    ch_files = []
    try:
        for p in ch_paths:
            ch_files.append(open(p, 'wt'))
        # write the headers
        if not args.noheader:
            for f in ch_files:
                f.write(g_header + '\n')
        # write the lines
        name_to_file = dict(zip(chromo_names, ch_files))
        with open(input_filename) as fin:
            for row in gen_typed_rows(fin):
                name = row[0]
                row_out = convert_row(row)
                line_out = '\t'.join(str(x) for x in row_out)
                name_to_file[name].write(line_out + '\n')
                pbar.increment()
    finally:
        # close the files
        for f in ch_files:
            f.close()
def main(options):
    """
    Run the analysis on the default edges and print the stripped result.
    @param options: parsed from the command line
    """
    edges = g_default_edges
    # no deadline is imposed when running from the command line
    pbar = Progress.Bar(options.nsamples)
    out = process(edges, options.epsilon, options.nsamples, None, pbar)
    pbar.finish()
    report = out.getvalue().strip()
    print(report)
def process(input_lines, good_coverage, randomization_rate, nstickinesses,
            nseconds, use_pbar):
    """
    Annotate each chromosome at every stickiness level and build a csv.

    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param randomization_rate: the probability of an error per read
    @param nstickinesses: use this many different levels of stickiness
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    """
    # do some initialization
    out = StringIO()
    pbar = None
    start_time = time.time()
    # define the superstates
    cache_size_per_superstate = 100000
    superstates = [
        ReadCoverageGap.Good(randomization_rate, good_coverage,
                             cache_size_per_superstate),
        ReadCoverageGap.Bad(randomization_rate, good_coverage,
                            cache_size_per_superstate),
    ]
    superstate_names = ['good', 'bad']
    # read the chromosome data
    chromosomes = parse(input_lines)
    # write the header line: fixed columns then two columns per stickiness
    header_row = [
        'genetic_line', 'chromosome', 'position', 'A_count', 'C_count',
        'G_count', 'T_count', 'gap_count'
    ]
    header_row.extend(
        '%s_%d' % (name, stickiness)
        for stickiness in range(nstickinesses)
        for name in ('state', 'substate'))
    out.write(','.join(header_row) + '\n')
    # prepare to annotate the chromosomes
    if use_pbar:
        count = 0
        pbar = Progress.Bar(len(chromosomes) * nstickinesses)
    # annotate the chromosomes using the models
    for chromosome in chromosomes:
        for stickiness in range(nstickinesses):
            elapsed = time.time() - start_time
            if nseconds and elapsed > nseconds:
                raise TimeoutError()
            chromosome.annotate_posteriors(stickiness, superstates)
            if pbar:
                count += 1
                pbar.update(count)
        # the rows of a chromosome are written after all of its annotations
        rows = chromosome.get_rows_of_strings(superstates, superstate_names)
        out.write('\n'.join(','.join(row) for row in rows) + '\n')
    if pbar:
        pbar.finish()
    # return the output text
    return out.getvalue().strip()
def process(ntaxa, nseconds, nlengths, nsamples, nj_like,
            branch_length_sampler, use_pbar):
    """
    Accumulate sampling attributes for each sequence length.

    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param nlengths: use this many different sequence lengths
    @param nsamples: stop after this many samples per sequence length
    @param nj_like: True to use a generalized neighbor-joining-like method
    of computing successive distance matrices
    @param branch_length_sampler: this function samples branch lengths
    independently
    @param use_pbar: True iff a progress bar should be used
    @return: documented upstream as a multi-line string of the contents of
    an R table; NOTE(review): no return statement is present in this span
    """
    # define the sequence lengths
    lengths = get_sequence_lengths(nlengths)
    # Initialize the accumulation matrix.
    # The builtin int is used because the np.int alias was removed from numpy.
    accum = np.zeros((nlengths, len(g_headers)), dtype=int)
    for i, sequence_length in enumerate(lengths):
        set_attribute(accum[i], 'sequence.length', sequence_length)
    # Repeatedly analyze samples from each sequence length.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = None
    if use_pbar:
        pbar = Progress.Bar(nsamples)
    try:
        for sample_index in range(nsamples):
            # reset the accumulation matrix for this iteration
            single_iteration_accum = np.zeros((nlengths, len(g_headers)))
            # accumulate attributes of sampling attempts for each sequence length
            for sequence_length_index, sequence_length in enumerate(lengths):
                # keep trying to get an accepted sample
                while True:
                    # check the time
                    if nseconds and time.time() - start_time > nseconds:
                        raise TimeoutError()
                    # get counts of attributes of a sample
                    sample_result = get_sample_results(
                        sequence_length, ntaxa, nj_like,
                        branch_length_sampler)
                    single_iteration_accum[
                        sequence_length_index] += sample_result
                    # if the sample was accepted then we are done looking
                    if get_attribute(sample_result, 'nsamples.accepted'):
                        break
            # Finish the iteration.
            # An in-place += of a float array into an integer array is
            # rejected by the default same_kind casting rule,
            # so the float to int conversion is requested explicitly.
            np.add(accum, single_iteration_accum, out=accum,
                   casting='unsafe')
            if pbar:
                pbar.update(sample_index + 1)
        else:
            # this runs only when the sampling loop was not interrupted
            termination_reason = 'the requested number of samples per sequence length was attained'
    except KeyboardInterrupt as e:
        termination_reason = 'keyboard interrupt'
def main(args):
    """
    Write animation frames of a tree projection while one branch length
    is interpolated from its initial length to the requested final length.
    @param args: parsed command line arguments
    """
    # do some validation
    if args.nframes < 2:
        raise ValueError('nframes should be at least 2')
    # define the requested physical size of the images (in pixels)
    physical_size = (args.physical_width, args.physical_height)
    # get the directed edges and the branch lengths and vertex names
    R, B, N = FtreeIO.newick_to_RBN(args.tree)
    # get the requested undirected edge and its starting length
    edge = get_edge(R, N, args.branch_name)
    initial_length = B[edge]
    # get the undirected tree topology
    T = Ftree.R_to_T(R)
    # order the vertices with the leaves first
    leaves = Ftree.T_to_leaves(T)
    internal = Ftree.T_to_internal_vertices(T)
    vertices = leaves + internal
    nleaves = len(leaves)
    v_to_index = Ftree.invseq(vertices)
    # convert the requested one-based axes to zero-based indices
    x_index = args.x_axis - 1
    y_index = args.y_axis - 1
    max_index = max(x_index, y_index)
    if max_index >= nleaves - 1:
        raise ValueError(
            'projection indices must be smaller than the number of leaves')
    X_prev = None
    # create the animation frames and write them as image files
    pbar = Progress.Bar(args.nframes)
    for frame_index in range(args.nframes):
        t = frame_index / float(args.nframes - 1)
        if args.interpolation == 'sigmoid':
            t = sigmoid(t)
        B[edge] = (1 - t) * initial_length + t * args.final_length
        w, v = Ftree.TB_to_harmonic_extension(T, B, leaves, internal)
        scaling = np.diag(np.reciprocal(np.sqrt(w)))
        X_full = np.dot(v, scaling)
        X = np.vstack([X_full[:, x_index], X_full[:, y_index]]).T
        # keep consecutive frames from being reflected relative to each other
        if X_prev is not None:
            X = reflect_to_match(X, X_prev)
        X_prev = X
        image_string = get_animation_frame(args.image_format, physical_size,
                                           args.scale, v_to_index, T, X, w)
        image_filename = 'frame-%04d.%s' % (frame_index, args.image_format)
        image_pathname = os.path.join(args.output_directory, image_filename)
        with open(image_pathname, 'wb') as fout:
            fout.write(image_string)
        pbar.update(frame_index + 1)
    pbar.finish()
def main(args):
    """
    Render the frames of a 3D tree animation and save them as image files.
    @param args: parsed command line arguments
    """
    # do some validation
    if args.nframes < 2:
        raise ValueError('nframes should be at least 2')
    # define the requested physical size of the images (in pixels)
    physical_size = (args.physical_width, args.physical_height)
    # build the newick tree from the string
    tree = NewickIO.parse(args.tree, FelTree.NewickTree)
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    # Get ordered ids with the leaves first,
    # and get the corresponding distance matrix.
    ordered_ids = get_ordered_ids(tree)
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    index_edges = get_index_edges(tree, ordered_ids)
    # Create the reference points
    # so that the video frames are not reflected arbitrarily.
    reference_points = Euclid.edm_to_points(D).T[:3].T
    # create the animation frames and write them as image files
    pbar = Progress.Bar(args.nframes)
    for frame_index in range(args.nframes):
        progress = frame_index / float(args.nframes - 1)
        if args.interpolation == 'sigmoid':
            progress = sigmoid(progress)
        mass_vector = get_mass_vector(nvertices, nleaves, progress)
        points = get_canonical_3d_mds(D, mass_vector, reference_points)
        # computed as in the original even though the result is not read here
        crossings = get_crossings(index_edges, points)
        # define the frame path name
        image_pathname = os.path.join(
            args.output_directory,
            'frame-%04d.%s' % (frame_index, args.image_format))
        # clear the old figure and render the new figure
        mlab.clf()
        add_yz_plane()
        add_zx_plane()
        add_xy_plane()
        coords = points.T
        X = coords[0]
        Y = coords[1]
        Z = coords[2]
        draw_3d_tree(X, Y, Z, index_edges)
        draw_crossings(X, Y, Z, index_edges)
        mlab.savefig(image_pathname, size=physical_size)
        # update the progress bar
        pbar.update(frame_index + 1)
    pbar.finish()
def process(input_lines, good_coverage, bad_coverage, randomization_rate, T,
            nseconds, use_pbar):
    """
    Annotate each chromosome with likelihoods and posteriors as csv rows.

    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param T: a transition matrix relating the hidden states
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: documented upstream as the multi-line string of the resulting
    csv file; NOTE(review): no return statement is present in this span
    """
    # do some initialization
    out = StringIO()
    pbar = None
    start_time = time.time()
    # define the three models
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # read the chromosome data
    chromosomes = parse(input_lines)
    # write the header line
    out.write(','.join(g_output_header_row) + '\n')
    # prepare to annotate the chromosomes
    if use_pbar:
        pbar = Progress.Bar(len(chromosomes))
    # annotate the chromosomes using the models
    try:
        for i, chromosome in enumerate(chromosomes):
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            chromosome.annotate_likelihoods(models)
            chromosome.annotate_posteriors(T, models)
            rows = chromosome.get_rows_of_strings()
            out.write('\n'.join(','.join(row) for row in rows) + '\n')
            if pbar:
                pbar.update(i + 1)
    except KeyboardInterrupt:
        # stop the progress bar before propagating the interrupt
        if pbar:
            pbar.finish()
        # a bare raise keeps the original traceback
        raise
def main(args):
    """
    Scan the input file twice, writing one output file per chromosome name.

    The first pass checks for errors and discovers the names;
    the second pass writes the lines and updates the progress bar.
    @param args: parsed command line arguments
    """
    # read the arguments
    input_filename = os.path.abspath(os.path.expanduser(args.infile))
    output_directory = os.path.abspath(os.path.expanduser(args.outdir))
    force = args.force
    low, high = args.low, args.high
    errlow, errhigh = args.errlow, args.errhigh
    # make sure that the output directory exists
    if not os.path.isdir(output_directory):
        if force:
            os.makedirs(output_directory)
    if not os.path.isdir(output_directory):
        msg = 'output directory does not exist: ' + output_directory
        raise Exception(msg)
    # create the scanner object which will be used for two passes
    scanner = Scanner(low, high, args.fill, errlow, errhigh)
    # Do the first pass,
    # checking for errors and gathering info about the chromosomes.
    name_to_path = {}
    with open(input_filename) as fin:
        for name in scanner.scan(fin):
            output_filename = args.out_prefix + name + args.out_suffix
            fpath = os.path.join(output_directory, output_filename)
            name_to_path[name] = fpath
            # without the force flag, refuse to overwrite existing output
            if not args.force:
                if os.path.exists(fpath):
                    raise Exception('output file already exists: ' + fpath)
    nticks = scanner.get_npositions()
    pbar = Progress.Bar(nticks)
    # Open the files for writing.
    # The finally clause closes every file that was opened,
    # even if opening a later file or writing a line fails.
    name_to_fout = {}
    try:
        for name, fpath in name_to_path.items():
            name_to_fout[name] = open(fpath, 'wt')
        # Do the second pass,
        # writing the files and updating the progress bar.
        with open(input_filename) as fin:
            for name, line in scanner.gen_named_lines(fin):
                name_to_fout[name].write(line + '\n')
                pbar.increment()
    finally:
        # close the files
        for fout in name_to_fout.values():
            fout.close()
def main(args):
    """
    Write one image file per animation frame, with the yaw and pitch
    of each frame derived from its position in the sequence.
    @param args: parsed command line arguments
    """
    # do some validation
    nframes = args.nframes
    if nframes < 2:
        raise ValueError('nframes should be at least 2')
    # define the requested physical size of the images (in pixels)
    physical_size = (args.physical_width, args.physical_height)
    # create the animation frames and write them as image files
    pbar = Progress.Bar(nframes)
    for frame_index in range(nframes):
        # t runs from 0 at the first frame to 1 at the last frame
        t = frame_index / float(nframes - 1)
        yaw = t_to_yaw(t)
        pitch = t_to_pitch(t)
        image_string = get_animation_frame(
            args.image_format, physical_size, args.scale, args.tree,
            args.eigenvector_index, yaw, pitch)
        image_pathname = os.path.join(
            args.output_directory,
            'frame-%04d.%s' % (frame_index, args.image_format))
        with open(image_pathname, 'wb') as fout:
            fout.write(image_string)
        pbar.update(frame_index + 1)
    pbar.finish()
def main(args):
    """
    Write the frames of a 2D tree animation as image files.
    @param args: parsed command line arguments
    """
    # do some validation
    if args.nframes < 2:
        raise ValueError('nframes should be at least 2')
    # define the requested physical size of the images (in pixels)
    physical_size = (args.physical_width, args.physical_height)
    # build the newick tree from the string
    tree = NewickIO.parse(args.tree, FelTree.NewickTree)
    nvertices = len(list(tree.preorder()))
    nleaves = len(list(tree.gen_tips()))
    # Get ordered ids with the leaves first,
    # and get the corresponding distance matrix.
    ordered_ids = get_ordered_ids(tree)
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    index_edges = get_index_edges(tree, ordered_ids)
    # Create the reference points
    # so that the video frames are not reflected arbitrarily.
    reference_points = Euclid.edm_to_points(D).T[:2].T
    # create the animation frames and write them as image files
    pbar = Progress.Bar(args.nframes)
    for frame_index in range(args.nframes):
        progress = frame_index / float(args.nframes - 1)
        if args.interpolation == 'sigmoid':
            progress = sigmoid(progress)
        mass_vector = get_mass_vector(nvertices, nleaves, progress)
        points = get_canonical_2d_mds(D, mass_vector, reference_points)
        image_string = get_animation_frame(
            args.image_format, physical_size, args.scale, mass_vector,
            index_edges, points)
        image_pathname = os.path.join(
            args.output_directory,
            'frame-%04d.%s' % (frame_index, args.image_format))
        with open(image_pathname, 'wb') as fout:
            fout.write(image_string)
        pbar.update(frame_index + 1)
    pbar.finish()
def _require_no_arguments(command, command_args):
    """Raise MySyntaxError if arguments were given to an argumentless command."""
    if command_args:
        raise MySyntaxError(
            'the %s command does not take any arguments' % command)


def _require_directory(path, description, creating_command=None):
    """
    Raise MyConfigError if the directory does not exist.
    @param path: the directory to look for
    @param description: what the directory holds, used in the message
    @param creating_command: None if the user should create the directory,
    otherwise the name of the command that fills the directory
    """
    if os.path.isdir(path):
        return
    err_lines = [('The directory for the %s was not found: ' % description)
                 + path]
    if creating_command is None:
        err_lines.append(
            'Please create this directory or cd to its parent directory.')
    else:
        err_lines.append(
            'If this directory exists somewhere else, then cd to its parent directory.')
        err_lines.append(
            'If this directory has not been created, then create it and run the %s command.'
            % creating_command)
    raise MyConfigError('\n'.join(err_lines))


def _require_original_fasta(original_fasta_filename):
    """Raise MyConfigError if the fasta file is not in the current directory."""
    pathnames = os.listdir('.')
    if original_fasta_filename not in pathnames:
        err_lines = [
            'The file %s was not found in the current directory.'
            % original_fasta_filename, 'Please download this file from:',
            'http://hgdownload.cse.ucsc.edu/goldenPath/hg18/multiz28way/alignments/'
        ]
        raise MyConfigError('\n'.join(err_lines))


def _parse_location_arguments(command, command_args):
    """Return the (chromosome string, integer position) pair of a find command."""
    if len(command_args) != 2:
        raise MySyntaxError('the %s command takes two arguments' % command)
    chromosome_string, chromosome_position_string = command_args
    try:
        chromosome_position = int(chromosome_position_string)
    except ValueError:
        raise MySyntaxError('the chromosome position should be an integer')
    return chromosome_string, chromosome_position


def _summarize_fasta(original_fasta_filename):
    """Return summary text about the genomic spans of the fasta alignments."""
    # initialize the progress bar, measured in bytes of the fasta file
    nbytes_total = os.path.getsize(original_fasta_filename)
    pbar = Progress.Bar(nbytes_total)
    # initialize the summary
    mod3 = {0: 0, 1: 0, 2: 0}
    length_diff_dict = {}
    # Summarize by reading each alignment from the file.
    # The with statement closes the file even if a header fails to parse.
    approx_nbytes_read = 0
    with open(original_fasta_filename) as fin:
        for lines in Util.gen_paragraphs(fin):
            # process the header line of the paragraph
            p = KGEA.LocationParser(lines[0])
            genomic_length = (p.last_index - p.first_index) + 1
            mod3[genomic_length % 3] += 1
            diff = 3 * p.length - genomic_length
            length_diff_dict[diff] = length_diff_dict.get(diff, 0) + 1
            # update the progress bar
            approx_nbytes_read += sum(len(line) for line in lines)
            pbar.update(approx_nbytes_read)
    # finish the progress bar
    pbar.update(nbytes_total)
    # return the summary
    summary_lines = [
        'genomic span of %d mod 3: %d sequences' % (i, mod3[i])
        for i in range(3)
    ]
    summary_lines.append('histogram of 3*aa_length - genomic span:')
    for key, value in sorted(length_diff_dict.items()):
        summary_lines.append('%d : %d' % (key, value))
    return '\n'.join(summary_lines)


def main(options, args):
    """
    Dispatch one of the commands
    split, index, find-alignment, find-column, or summarize.

    @param options: from optparse
    @param args: from optparse
    @return: a response string
    @raise MySyntaxError: if the command or its arguments are invalid
    @raise MyConfigError: if a required file or directory is missing
    """
    # get this file from the web directory
    # http://hgdownload.cse.ucsc.edu/goldenPath/hg18/multiz28way/alignments/
    original_fasta_filename = 'knownGene.exonAA.fa'
    # the original fasta file is broken into a bunch of pieces and put into this directory
    pieces_directory = 'fasta'
    # the index files that map genomic locations to fasta subfiles are in this directory
    index_directory = 'index'
    # this file keeps a list of valid chromosome names from the original fasta file
    chromosome_filename = 'chromosomes.txt'
    # assert that a command was given with the script
    if not args:
        raise MySyntaxError('no command was given')
    # try to dispatch the command
    command = args[0]
    command_args = args[1:]
    if command == 'split':
        _require_no_arguments(command, command_args)
        _require_directory(pieces_directory, 'split fasta files')
        _require_original_fasta(original_fasta_filename)
        splitter = KGEA.Splitter(original_fasta_filename, pieces_directory)
        splitter.run(verbose=options.verbose)
        return ''
    elif command == 'index':
        _require_no_arguments(command, command_args)
        _require_directory(pieces_directory, 'split fasta files', 'split')
        _require_directory(index_directory, 'index files')
        indexer = KGEA.Indexer(index_directory, chromosome_filename,
                               pieces_directory)
        indexer.run(verbose=options.verbose)
        return ''
    elif command in ('find-alignment', 'find-column'):
        # the arguments are validated before the directories are checked
        chromosome_string, chromosome_position = _parse_location_arguments(
            command, command_args)
        _require_directory(pieces_directory, 'split fasta files', 'split')
        _require_directory(index_directory, 'index files', 'index')
        finder = KGEA.Finder(index_directory, chromosome_filename,
                             pieces_directory)
        if command == 'find-alignment':
            found_lines = finder.get_alignment_lines(
                chromosome_string, chromosome_position,
                verbose=options.verbose)
        else:
            found_lines = finder.get_column_lines(
                chromosome_string, chromosome_position,
                verbose=options.verbose)
        if not found_lines:
            return 'no amino acid was found at this position'
        return '\n'.join(found_lines)
    elif command == 'summarize':
        _require_no_arguments(command, command_args)
        _require_original_fasta(original_fasta_filename)
        return _summarize_fasta(original_fasta_filename)
    else:
        raise MySyntaxError('invalid command: ' + command)
def do_command_line_analysis(options):
    """
    Print an html report to stdout while updating a progress bar.

    Distance matrices are sampled from the tree,
    then each simulation reconstructs trees from them.
    @param options: an object from optparse
    """
    # load the tree, using the default tree if no filename was provided
    tree, tree_remark = get_tree_and_remark(options)
    # initialize the simulation objects
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'),
        Simulation(Clustering.StoneSpectralSignDMS(), 'nj',
                   'spectral sign cut with neighbor joining fallback'),
        Simulation(Clustering.RandomDMS(), 'nj', 'random partitioning')
    ]
    # possibly add the slow simulation
    if options.use_exact:
        sims.append(
            Simulation(Clustering.StoneExactDMS(), 'nj',
                       'exact criterion with neighbor joining fallback'))
    # define the simulation parameters
    reconstruction_count = options.nsamples
    sequence_length_string = options.sequence_length
    if sequence_length_string == 'inf':
        sequence_length = float('inf')
    else:
        sequence_length = int(sequence_length_string)
    # a replacement of None means that such samples are rejected
    inf_replacement = 20.0
    if options.reject_inf:
        inf_replacement = None
    elif options.replace_inf:
        try:
            inf_replacement = float(options.replace_inf)
        except ValueError:
            msg = 'invalid replace_inf value: '
            raise OptionError(msg + str(options.replace_inf))
    zero_replacement = 0
    if options.reject_zero:
        zero_replacement = None
    elif options.replace_zero:
        try:
            zero_replacement = float(options.replace_zero)
        except ValueError:
            msg = 'invalid replace_zero value: '
            raise OptionError(msg + str(options.replace_zero))
    # start the html file
    print('<html><body>')
    # show the simulation parameters
    print('original tree source: %s <br/>' % (tree_remark,))
    print('reconstruction count: %s <br/>' % (reconstruction_count,))
    print('sequence length: %s <br/>' % (sequence_length,))
    # set the simulation parameters for each simulation
    for sim in sims:
        sim.set_original_tree(tree)
        # If there is only one reconstruction per method
        # then show the progress of the tree builder.
        if reconstruction_count == 1:
            sim.set_verbose()
    # define an arbitrary but consistent ordering of the taxa
    ordered_names = [node.name for node in tree.gen_tips()]
    # The name is bound before the try block so that the interrupt handler
    # can tell whether the progress bar was started.
    pbar = None
    try:
        # attempt to simulate a bunch of distance matrices
        if options.verbose:
            print('sampling %s distance matrices...' %
                  (reconstruction_count,))
        # initialize the distance matrix sampler
        sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length)
        sampler.set_inf_replacement(inf_replacement)
        sampler.set_zero_replacement(zero_replacement)
        # Start the progress bar.
        # Sampling is one stage and each simulation is one more stage.
        nstages = 1 + len(sims)
        pbar = Progress.Bar(1.0)
        # sample some distance matrices
        distance_matrices = []
        for result in sampler.gen_samples_or_none():
            # if we got a result then update the distance matrix list
            if result:
                sequence_list, D = result
                distance_matrices.append(D)
            # Update the progressbar regardless of whether or not
            # the proposal was accepted.
            remaining_acceptances = reconstruction_count - len(
                distance_matrices)
            numerator = sampler.get_completed_proposals()
            denominator = numerator + sampler.get_remaining_proposals(
                remaining_acceptances)
            dms_fraction = float(numerator) / float(denominator)
            dms_total = 1.0 / nstages
            pbar.update(dms_fraction * dms_total)
            # if we have enough samples then break the loop
            if not remaining_acceptances:
                break
        # reconstruct trees using various methods
        for i, sim in enumerate(sims):
            if options.verbose:
                print('running "%s"...' % sim.description)
            sim.run(distance_matrices, ordered_names)
            pbar.update(float(i + 2) / float(nstages))
        # stop the progress bar
        pbar.finish()
        # get the simulation data
        table = [('method', 'seconds', 'uniform loss', 'weighted loss')]
        for sim in sims:
            table.append((sim.description, sim.get_running_time(),
                          sim.get_uniform_loss(), sim.get_deep_loss()))
        # convert the row major matrix into an html table
        print(HtmlTable.get_table_string(table))
        # end the html file, closing the tags in the reverse of opening order
        print('</body></html>')
    except KeyboardInterrupt:
        if pbar is None:
            print('interrupted before the progress bar was started')
        else:
            print('interrupted stage %s of %s' % (pbar.progress, pbar.high))
def do_hard_coded_analysis_b(tree, tree_remark): """ Do a hardcoded analysis of tree reconstruction methods. Make R files of ordered reconstruction losses. @param tree: a tree object @param tree_remark: a string that is a comment about the tree """ # define an arbitrary order for the names of the leaves of the tree ordered_names = list(node.name for node in tree.gen_tips()) # use some replicates reconstruction_count = 100 # Make R files for reconstruction results from sequences # of some number of nucleotides in length. sequence_length = 2000 # define the tree reconstruction methods to be used sims = [ Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'), Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign') ] # set tree reconstruction parameters for sim in sims: sim.set_original_tree(tree) # initialize the distance matrix sampler sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names, sequence_length) sampler.set_inf_replacement(20.0) sampler.set_zero_replacement(0.0) # start the progress bar pbar = Progress.Bar(1.0) # sample some distance matrices distance_matrix_start_time = time.time() distance_matrices = [] for result in sampler.gen_samples_or_none(): # if we got a result then update the distance matrix list if result: sequence_list, D = result distance_matrices.append(D) # Update the progressbar regardless of whether or not # the proposal was accepted. 
remaining_acceptances = reconstruction_count - len(distance_matrices) numerator = sampler.get_completed_proposals() denominator = numerator + sampler.get_remaining_proposals( remaining_acceptances) dms_fraction = float(numerator) / float(denominator) dms_total = 1.0 / (1 + len(sims)) pbar.update(dms_fraction * dms_total) # if we have enough samples then break the loop if not remaining_acceptances: break distance_matrix_seconds = time.time() - distance_matrix_start_time # reconstruct trees using various methods reconstruction_seconds = [] for i, sim in enumerate(sims): reconstruction_start_time = time.time() print 'reconstructing', len(distance_matrices), 'trees' print 'using', sim.description sim.run(distance_matrices, ordered_names) pbar.update(float(i + 2) / float(1 + len(sims))) reconstruction_seconds.append(time.time() - reconstruction_start_time) # stop the progress bar pbar.finish() # consider the neighbor joining and the spectral sign results nj_sim, ss_sim = sims # extract the simulation data label_list_pairs = [ ('nj.unweighted', nj_sim.get_normalized_error_counts()), ('ss.unweighted', ss_sim.get_normalized_error_counts()), ('nj.weighted', nj_sim.get_normalized_loss_values()), ('ss.weighted', ss_sim.get_normalized_loss_values()) ] labels, transposed_table = zip(*label_list_pairs) table = zip(*transposed_table) table_string = RUtil.get_table_string(table, labels) # write the table filename = 'out3.table' with open(filename, 'w') as fout: print >> fout, '# tree source:', tree_remark print >> fout, '# number of taxa:', len(ordered_names) print >> fout, '# sampled distance matrices:', len(distance_matrices) print >> fout, '# sampling seconds elapsed:', distance_matrix_seconds print >> fout, '# sites per sequence:', sequence_length for sim, seconds in zip(sims, reconstruction_seconds): msg_a = '# seconds elapsed for tree reconstruction using ' msg_b = sim.description + ': ' + str(seconds) print >> fout, msg_a + msg_b print >> fout, table_string print 'wrote', 
filename
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler, use_pbar): """ @param ntaxa: the number of taxa per tree @param nseconds: stop after this many seconds @param seqlen: use this sequence length @param nsamples: stop after this many samples per sequence length @param branch_length_sampler: this function samples branch lengths independently @param use_pbar: True iff a progress bar should be used @return: a multi-line string of the contents of an R table """ # initialize the global rejection counts nrejected_zero = 0 nrejected_inf = 0 nrejected_fail = 0 naccepted = 0 # Initialize the accumulation matrix. # The rows specify the size of the smaller side of the initial split. # The columns specify the compatibility status of the split. nsmall_sizes = (ntaxa / 2) + 1 accum = np.zeros((nsmall_sizes, 2), dtype=np.int) # Repeatedly analyze samples. # We might have to stop early if we run out of time or if ctrl-c is pressed. # If we have to stop early, then show the results of the progress so far. termination_reason = 'no reason for termination was given' start_time = time.time() pbar = Progress.Bar(nsamples) if use_pbar else None try: for sample_index in range(nsamples): # keep trying to get an accepted sample while True: # check the time if nseconds and time.time() - start_time > nseconds: raise TimeoutError() # first sample a tree and get its set of informative splits tree = TreeSampler.sample_agglomerated_tree(ntaxa) true_splits = tree.get_nontrivial_splits() # sample the branch lengths for branch in tree.get_branches(): branch.length = branch_length_sampler() # Attempt to sample a distance matrix. # If the sample was rejected then note the reason and go back to the drawing board. try: D = sample_distance_matrix(tree, seqlen) except InfiniteDistanceError as e: nrejected_inf += 1 continue except ZeroDistanceError as e: nrejected_zero += 1 continue # Attempt to estimate the primary split of the tree from the distance matrix. 
# If there was a technical failure then note it and go back to the drawing board. # Otherwise note the compatibility and balance of the split. try: eigensplit = BuildTreeTopology.split_using_eigenvector(D) small_size = min(len(side) for side in eigensplit) if eigensplit in true_splits: compatibility = 1 else: compatibility = 0 except BuildTreeTopology.DegenerateSplitException, e: small_size = 0 compatibility = 1 except BuildTreeTopology.InvalidSpectralSplitException, e: nrejected_fail += 1 continue
def process(linesources, good_coverage, randomization_rate, stickiness, nseconds, use_pbar):
    """
    Annotate the chromosomes found in each resequencing file.
    Each line source is read and validated, its data rows are grouped into
    chromosomes by process_genomic_position, and each chromosome is annotated
    using a 'good' and a 'bad' superstate.
    @param linesources: open resequencing files for reading
    @param good_coverage: the expected number of reads at informative positions
    @param randomization_rate: the probability of an error per base call
    @param stickiness: level of stickiness
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    """
    # do some initialization
    start_time = time.time()
    termination_reason = 'finished the analysis'
    # define the superstates, both built from the same rate, coverage and cache size
    cache_size_per_superstate = 100000
    good_state = ReadCoverageGap.Good(randomization_rate, good_coverage, cache_size_per_superstate)
    bad_state = ReadCoverageGap.Bad(randomization_rate, good_coverage, cache_size_per_superstate)
    superstates = [good_state, bad_state]
    superstate_names = ['good', 'bad']
    # prepare to annotate the chromosomes
    chromosomes = []
    pbar = Progress.Bar(len(linesources)) if use_pbar else None
    # annotate the chromosomes using the models
    try:
        for i, linesource in enumerate(linesources):
            # read the lines of text
            lines = Util.get_stripped_lines(linesource.readlines())
            # validate the number of lines
            if len(lines) < 2:
                raise ValueError('there should be at least two lines of input')
            # break the lines of input into rows of elements
            rows = [line_to_row(line) for line in lines]
            # validate the columns of data
            ncolumns_expected = 8
            ncolumns = len(rows[0])
            if ncolumns != ncolumns_expected:
                raise ValueError('expected %d columns of input: %s' % (ncolumns_expected, rows[0]))
            for row in rows:
                if len(row) != ncolumns:
                    raise ValueError(
                        'each row of input should have the same number of elements as the first row'
                    )
            # process the data rows, building a dictionary of chromosomes
            chromosome_dict = {}
            # the first row is skipped (presumably a header -- confirm)
            data_rows = rows[1:]
            for row in data_rows:
                # the time limit is checked once per data row
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                process_genomic_position(row, chromosome_dict)
            # list the chromosomes of this file ordered by their identifiers
            current_chromosomes = [
                chromosome for identifier, chromosome in sorted(chromosome_dict.items())
            ]
            for chromosome in current_chromosomes:
                # do the annotation
                chromosome.annotate_posteriors(stickiness, superstates)
                # delete position specific data
                chromosome.del_position_specific_data()
            # add the chromosomes to the list
            chromosomes.extend(current_chromosomes)
            # update the progress bar
            if pbar:
                pbar.update(i + 1)
    except KeyboardInterrupt, e:
        # ctrl-c is not propagated; only the termination reason is recorded
        termination_reason = 'early termination by control-c'
    # NOTE(review): the text of this function stops here; TimeoutError is not
    # handled and nothing is returned in what is visible, although a return
    # value is documented. Confirm the remainder against the original source.
def process(input_lines, good_coverage, bad_coverage,
        randomization_rate, nseconds, use_pbar):
    """
    Annotate each coverage column of the csv input with model log likelihoods.
    The data lines are handled in chunks of five consecutive rows,
    one chunk per genomic position.
    Every coverage column is followed in the output by three annotation
    columns, one per model, or by dashes when no log likelihoods
    are available for that strain.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the annotated csv file
    @raise ValueError: if the input lines or the header are malformed
    @raise TimeoutError: if the time limit is exceeded
    """
    verbose = False
    # validate the number of lines
    if len(input_lines) < 6:
        raise ValueError('there should be at least six lines of input')
    if len(input_lines) % 5 != 1:
        raise ValueError(
            'the input lines should consist of a header plus a multiple of five data lines'
        )
    # break the lines of input into rows of elements
    input_rows = [line_to_row(line) for line in input_lines]
    # validate the columns of data
    ncolumns = len(input_rows[0])
    if ncolumns < 7:
        raise ValueError('there should be at least seven columns of input')
    if ncolumns % 2 != 1:
        raise ValueError('the number of input columns should be odd')
    for row in input_rows:
        if len(row) != ncolumns:
            raise ValueError(
                'each row of input should have the same number of elements as the first row'
            )
    # define the three models
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # Initialize the output header row.
    # The first five headings are copied, and each coverage heading
    # gains one extra heading per model.
    header_row = input_rows[0]
    output_header_row = header_row[:5]
    for heading in header_row[5:]:
        if heading.endswith('sco'):
            output_header_row.append(heading)
        elif heading.endswith('cov'):
            output_header_row.extend([
                heading,
                heading + '_hom',
                heading + '_het',
                heading + '_ovr'])
        else:
            raise ValueError(
                'each heading after the fifth should end with sco or cov')
    # get the rest of the rows
    data_rows = input_rows[1:]
    # Define the number of genomic positions and the number of strains.
    # Floor division keeps these counts integral for use with range.
    npositions = len(data_rows) // 5
    nstrains = (ncolumns - 5) // 2
    # begin the output
    out = StringIO()
    print >> out, ','.join(output_header_row)
    # initialize some stuff
    start_time = time.time()
    pbar = Progress.Bar(npositions) if use_pbar else None
    try:
        for position in range(npositions):
            # check the time, but only if a time limit was given
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            # get a chunk of five consecutive rows
            first_row_index = position * 5
            position_rows = data_rows[first_row_index:first_row_index + 5]
            # get the corresponding log likelihoods
            log_likelihood_lists = get_log_likelihoods_per_strain(
                    position_rows, models)
            # construct five annotated output lines
            for position_row in position_rows:
                output_row = position_row[:5]
                for i, log_likelihoods in enumerate(log_likelihood_lists):
                    # add the coverage, three annotations, and the score
                    coverage_string = position_row[5 + 2 * i]
                    score_string = position_row[5 + 2 * i + 1]
                    if log_likelihoods:
                        annotations = [str(x) for x in log_likelihoods]
                    else:
                        annotations = ['-', '-', '-']
                    output_row.extend(
                            [coverage_string] + annotations + [score_string])
                print >> out, ','.join(output_row)
            # update the progress bar
            if pbar:
                pbar.update(position + 1)
    except KeyboardInterrupt:
        if pbar:
            pbar.finish()
        # A bare raise keeps the traceback of the original interrupt.
        raise
    # NOTE(review): the text of this function stops here; no return statement
    # is visible although a return value is documented, and the progress bar
    # is finished only on ctrl-c. Confirm the remainder against the original.