Esempio n. 1
0
 def run(self, verbose=False):
     """
     Read the input file and write it as numbered piece files.
     The groups of lines produced from the input are gathered into pieces,
     and each piece is written to its own file in the target directory
     with an empty line after every group.
     @param verbose: True if we want to report our progress
     """
     # when reporting progress the bar is scaled to the size of the input
     pbar = None
     nbytes_current_approx = 0
     if verbose:
         pbar = Progress.Bar(os.path.getsize(self.filename))
     # process the file, possibly updating the progress bar
     with open(self.filename) as fin:
         pieces = gen_line_list_lists(Util.gen_paragraphs(fin),
                                      self.approx_lines_per_piece)
         for piece_index, line_lists in enumerate(pieces):
             piece_filename = piece_index_to_filename(
                 piece_index, self.filename)
             piece_path = os.path.join(self.target_directory,
                                       piece_filename)
             with open(piece_path, 'w') as fout:
                 for line_list in line_lists:
                     # the joined lines, a newline, and then an empty line;
                     # write() is used instead of the print statement
                     # so that this also runs under Python 3
                     fout.write('\n'.join(line_list) + '\n\n')
             if pbar is not None:
                 # only the characters of the lines themselves are counted,
                 # so the running total is an approximation
                 nbytes_current_approx += sum(
                     len(''.join(x)) for x in line_lists)
                 pbar.update(nbytes_current_approx)
     # possibly stop the progress bar
     if pbar is not None:
         pbar.finish()
Esempio n. 2
0
 def __init__(self, verbose=False):
     """
     Try to import every candidate module and collect its test classes.
     Modules that fail to import have their ImportError recorded.
     A progress bar is advanced once per module name.
     @param verbose: accepted, but not consulted by this constructor
     """
     # results accumulated while scanning the modules
     self.test_classes = []
     self.imported_module_names = []
     self.module_import_errors = []
     self.imports_with_tests = []
     # the names of the modules to try
     self.all_module_names = get_module_names()
     pbar = Progress.Bar(len(self.all_module_names))
     for nprocessed, module_name in enumerate(self.all_module_names, 1):
         try:
             module = __import__(module_name, globals(), locals())
         except ImportError as e:
             self.module_import_errors.append(e)
         else:
             self.imported_module_names.append(module_name)
             found = []
             for candidate in module.__dict__.values():
                 try:
                     is_test_case = issubclass(candidate, unittest.TestCase)
                 except TypeError:
                     # issubclass rejects members that are not classes
                     continue
                 if is_test_case:
                     found.append(candidate)
             if found:
                 self.imports_with_tests.append(module_name)
             self.test_classes.extend(found)
         # the bar advances whether or not the import succeeded
         pbar.update(nprocessed)
Esempio n. 3
0
 def run(self, verbose=False):
     """
     Create the chromosome list file and one index file per chromosome.
     Every fasta piece is read to collect (first, last, piece index) rows
     for each chromosome string, so this might take a while.
     @param verbose: True if we want to write our progress to stderr
     """
     # fill a dictionary by reading all of the fasta pieces
     chromosome_string_to_rows = {}
     piece_filenames = sorted(os.listdir(self.pieces_directory))
     pbar = None
     if verbose:
         # write() is used instead of the print statement
         # so that this also runs under Python 3
         sys.stderr.write('creating a dictionary from the fasta pieces:\n')
         pbar = Progress.Bar(len(piece_filenames))
     for nfiles_read, piece_filename in enumerate(piece_filenames, 1):
         piece_index = piece_filename_to_index(piece_filename)
         piece_pathname = os.path.join(self.pieces_directory,
                                       piece_filename)
         for chrom_string, first_i, last_i in gen_elements(piece_pathname):
             row = (first_i, last_i, piece_index)
             chromosome_string_to_rows.setdefault(chrom_string,
                                                  []).append(row)
         if pbar is not None:
             pbar.update(nfiles_read)
     # define the sorted list of chromosome strings
     chromosome_strings = sorted(chromosome_string_to_rows)
     assert len(chromosome_strings) < 1000
     # write the list of valid chromosome strings, without a final newline
     with open(self.chromosome_list_filename, 'w') as fout:
         fout.write('\n'.join(chromosome_strings))
     if verbose:
         sys.stderr.write('wrote %s\n' % (self.chromosome_list_filename,))
         sys.stderr.write('writing the index files:\n')
         pbar = Progress.Bar(len(chromosome_strings))
     # for each chromosome string write the index file with sorted rows
     for nwritten, chromosome_string in enumerate(chromosome_strings, 1):
         rows = chromosome_string_to_rows[chromosome_string]
         index_pathname = os.path.join(self.index_directory,
                                       chromosome_string + '.index')
         with open(index_pathname, 'w') as fout:
             for row in sorted(rows):
                 fout.write('%d\t%d\t%d\n' % row)
         if pbar is not None:
             pbar.update(nwritten)
Esempio n. 4
0
def process(pathnames, good_coverage, bad_coverage, randomization_rate,
            nseconds, use_pbar):
    """
    Process the genomic positions listed in each of the input files.
    @param pathnames: paths to files to process
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    """
    # define the three models
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # define the oracle
    cache_size = 100000
    oracle = Oracle(models, cache_size)
    # do some initialization
    out = StringIO()
    start_time = time.time()
    nfiles = len(pathnames)
    # a progress bar is only used when there is more than one file
    pbar = Progress.Bar(nfiles) if (use_pbar and nfiles > 1) else None
    chromosome_dict = {}
    termination_reason = 'finished the analysis'
    try:
        for i, pathname in enumerate(pathnames):
            # read the stripped lines and release the file handle
            # before the slow per-row processing begins
            with open(pathname) as fin:
                lines = [line.strip() for line in fin]
            lines = [line for line in lines if line]
            # validate the number of lines
            if len(lines) < 2:
                raise ValueError(
                    'there should be at least two lines of input')
            # break the lines of input into rows of elements
            rows = [line_to_row(line) for line in lines]
            # validate the columns of data
            ncolumns_expected = 8
            ncolumns = len(rows[0])
            if ncolumns != ncolumns_expected:
                raise ValueError('expected %d columns of input' %
                                 ncolumns_expected)
            for row in rows:
                if len(row) != ncolumns:
                    raise ValueError(
                        'each row of input should have the same number of elements as the first row'
                    )
            # process the data rows, skipping the first row
            data_rows = rows[1:]
            for row in data_rows:
                # the time limit is checked before every row
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                process_genomic_position(row, chromosome_dict, oracle)
            if pbar:
                pbar.update(i + 1)
    except KeyboardInterrupt:
        # control-c is recorded as the reason instead of propagating
        termination_reason = 'early termination by control-c'
Esempio n. 5
0
def main(args):
    """
    Write the rows of the input file to one output file per chromosome name.
    The input is read three times: once to find the chromosome names,
    once to check the types and the monotonicity, and once to write.
    @param args: positional and flaglike arguments
    """
    # read the arguments
    input_filename = os.path.abspath(os.path.expanduser(args.infile))
    output_directory = os.path.abspath(os.path.expanduser(args.outdir))
    force = args.force
    # make sure that the output directory exists,
    # creating it only when forced
    if force and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
    if not os.path.isdir(output_directory):
        msg = 'output directory does not exist: ' + output_directory
        raise Exception(msg)
    # scan the input file for chromosome names
    ch_paths = []
    skimmer = DGRP.ChromoSkimmer()
    with open(input_filename) as fin:
        for chromo_name in skimmer.skim(gen_untyped_rows(fin)):
            output_filename = args.out_prefix + chromo_name + args.out_suffix
            ch_path = os.path.join(output_directory, output_filename)
            ch_paths.append(ch_path)
            # existing output is only overwritten when forced
            if not force and os.path.exists(ch_path):
                raise Exception('output already exists: ' + ch_path)
    chromo_names = skimmer.name_list
    nlines = skimmer.linecount
    # start the progress bar, with one tick per line for each remaining pass
    nticks = 2 * nlines
    pbar = Progress.Bar(nticks)
    # scan the input file for correct types and for monotonicity
    with open(input_filename) as fin:
        for _ in DGRP.check_chromo_monotonicity(gen_typed_rows(fin)):
            pbar.increment()
    # Open the output files and write them.
    # The handles are closed in the finally clause so that they are
    # released even if opening or writing fails part of the way through.
    ch_files = []
    try:
        for p in ch_paths:
            ch_files.append(open(p, 'wt'))
        # write the headers
        if not args.noheader:
            for f in ch_files:
                f.write(g_header + '\n')
        # write the lines
        name_to_file = dict(zip(chromo_names, ch_files))
        with open(input_filename) as fin:
            for row in gen_typed_rows(fin):
                name = row[0]
                row_out = convert_row(row)
                line_out = '\t'.join(str(x) for x in row_out)
                name_to_file[name].write(line_out + '\n')
                pbar.increment()
    finally:
        # close the files
        for f in ch_files:
            f.close()
Esempio n. 6
0
def main(options):
    """
    Call process with the default edges and print the stripped output.
    @param options: parsed from the command line
    """
    # None is passed in the deadline position
    deadline = None
    pbar = Progress.Bar(options.nsamples)
    out = process(g_default_edges, options.epsilon, options.nsamples,
                  deadline, pbar)
    pbar.finish()
    # The parenthesized single-argument form prints the same text under
    # the Python 2 print statement and the Python 3 print function.
    print(out.getvalue().strip())
Esempio n. 7
0
def process(input_lines, good_coverage, randomization_rate, nstickinesses,
            nseconds, use_pbar):
    """
    Annotate each chromosome at every stickiness level and report csv rows.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param randomization_rate: the probability of an error per read
    @param nstickinesses: use this many different levels of stickiness
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    @raise TimeoutError: if the time limit is exceeded
    """
    # do some initialization
    out = StringIO()
    start_time = time.time()
    # define the superstates, which are both built from the good coverage
    cache_size_per_superstate = 100000
    superstates = [
        ReadCoverageGap.Good(randomization_rate, good_coverage,
                             cache_size_per_superstate),
        ReadCoverageGap.Bad(randomization_rate, good_coverage,
                            cache_size_per_superstate),
    ]
    superstate_names = ['good', 'bad']
    # read the chromosome data
    chromosomes = parse(input_lines)
    # write the header line, with two extra columns per stickiness level
    header_row = [
        'genetic_line', 'chromosome', 'position', 'A_count', 'C_count',
        'G_count', 'T_count', 'gap_count'
    ]
    for stickiness in range(nstickinesses):
        header_row.extend('%s_%d' % (name, stickiness)
                          for name in ('state', 'substate'))
    # write() is used instead of the print statement
    # so that this also runs under Python 3
    out.write(','.join(header_row) + '\n')
    # prepare to annotate the chromosomes
    pbar = None
    count = 0
    if use_pbar:
        pbar = Progress.Bar(len(chromosomes) * nstickinesses)
    # annotate the chromosomes using the models
    for chromosome in chromosomes:
        for stickiness in range(nstickinesses):
            # the time limit is checked before every annotation
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            chromosome.annotate_posteriors(stickiness, superstates)
            if pbar:
                count += 1
                pbar.update(count)
        rows = chromosome.get_rows_of_strings(superstates, superstate_names)
        out.write('\n'.join(','.join(row) for row in rows) + '\n')
    if pbar:
        pbar.finish()
    # return the output text
    return out.getvalue().strip()
Esempio n. 8
0
def process(ntaxa, nseconds, nlengths, nsamples, nj_like,
            branch_length_sampler, use_pbar):
    """
    Accumulate attributes of sampling attempts for each sequence length.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds
    @param nlengths: use this many different sequence lengths
    @param nsamples: stop after this many samples per sequence length
    @param nj_like: True to use a generalized neighbor-joining-like method of computing successive distance matrices
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # define the sequence lengths
    lengths = get_sequence_lengths(nlengths)
    # Initialize the integer accumulation matrix.
    # The builtin int is used because the np.int alias of it
    # has been removed from NumPy.
    accum = np.zeros((nlengths, len(g_headers)), dtype=int)
    for i, sequence_length in enumerate(lengths):
        set_attribute(accum[i], 'sequence.length', sequence_length)
    # Repeatedly analyze samples from each sequence length.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    pbar = None
    if use_pbar:
        pbar = Progress.Bar(nsamples)
    try:
        for sample_index in range(nsamples):
            # reset the accumulation matrix for this iteration
            single_iteration_accum = np.zeros((nlengths, len(g_headers)))
            # accumulate attributes of sampling attempts for each sequence length
            for sequence_length_index, sequence_length in enumerate(lengths):
                # keep trying to get an accepted sample
                while True:
                    # check the time
                    if nseconds and time.time() - start_time > nseconds:
                        raise TimeoutError()
                    # get counts of attributes of a sample
                    sample_result = get_sample_results(sequence_length, ntaxa,
                                                       nj_like,
                                                       branch_length_sampler)
                    single_iteration_accum[
                        sequence_length_index] += sample_result
                    # if the sample was accepted then we are done looking
                    if get_attribute(sample_result, 'nsamples.accepted'):
                        break
            # Finish the iteration.
            # The per-iteration matrix is floating point, and adding it
            # in place to the integer matrix is rejected by the same-kind
            # casting rule, so it is converted explicitly first.
            accum += single_iteration_accum.astype(accum.dtype)
            if pbar:
                pbar.update(sample_index + 1)
        else:
            # the loop ran to completion without being interrupted
            termination_reason = 'the requested number of samples per sequence length was attained'
    except KeyboardInterrupt:
        termination_reason = 'keyboard interrupt'
Esempio n. 9
0
def main(args):
    """
    Write the frames of a tree animation as numbered image files.
    The length of the requested branch is interpolated from its
    initial value to args.final_length over the course of the frames.
    @param args: the parsed command line arguments
    @raise ValueError: if fewer than two frames are requested
    or if a projection index is too large
    """
    # at least two frames are needed to interpolate between the end points
    if args.nframes < 2:
        raise ValueError('nframes should be at least 2')
    # the requested size of the images in pixels
    physical_size = (args.physical_width, args.physical_height)
    # directed edges, branch lengths, and vertex names of the tree
    R, B, N = FtreeIO.newick_to_RBN(args.tree)
    # the undirected edge whose length will be varied
    edge = get_edge(R, N, args.branch_name)
    initial_length = B[edge]
    # the undirected topology with its leaves and internal vertices
    T = Ftree.R_to_T(R)
    leaves = Ftree.T_to_leaves(T)
    internal = Ftree.T_to_internal_vertices(T)
    nleaves = len(leaves)
    v_to_index = Ftree.invseq(leaves + internal)
    # the axes are given one-based on the command line
    x_index = args.x_axis - 1
    y_index = args.y_axis - 1
    if max(x_index, y_index) >= nleaves - 1:
        raise ValueError(
            'projection indices must be smaller than the number of leaves')
    # create the animation frames and write them as image files
    last_frame_index = float(args.nframes - 1)
    previous_X = None
    pbar = Progress.Bar(args.nframes)
    for frame_index in range(args.nframes):
        linear_progress = frame_index / last_frame_index
        t = (sigmoid(linear_progress)
             if args.interpolation == 'sigmoid' else linear_progress)
        # interpolate the branch length for this frame
        B[edge] = (1 - t) * initial_length + t * args.final_length
        w, v = Ftree.TB_to_harmonic_extension(T, B, leaves, internal)
        X_full = np.dot(v, np.diag(np.reciprocal(np.sqrt(w))))
        X = np.vstack([X_full[:, x_index], X_full[:, y_index]]).T
        # match each frame to the one before it
        if previous_X is not None:
            X = reflect_to_match(X, previous_X)
        previous_X = X
        image_string = get_animation_frame(args.image_format, physical_size,
                                           args.scale, v_to_index, T, X, w)
        image_pathname = os.path.join(
            args.output_directory,
            'frame-%04d.%s' % (frame_index, args.image_format))
        with open(image_pathname, 'wb') as fout:
            fout.write(image_string)
        pbar.update(frame_index + 1)
    pbar.finish()
Esempio n. 10
0
def main(args):
    """
    Render the frames of a 3d tree animation and save them as image files.
    The mass vector is recomputed for every frame from the interpolation
    progress, and the points are recomputed from that mass vector.
    @param args: the parsed command line arguments
    @raise ValueError: if fewer than two frames are requested
    """
    # at least two frames are needed to interpolate between the end points
    if args.nframes < 2:
        raise ValueError('nframes should be at least 2')
    # the requested size of the images in pixels
    physical_size = (args.physical_width, args.physical_height)
    # build the newick tree from the string and count its vertices
    tree = NewickIO.parse(args.tree, FelTree.NewickTree)
    nvertices = sum(1 for _ in tree.preorder())
    nleaves = sum(1 for _ in tree.gen_tips())
    # the ids are ordered with the leaves first,
    # and the distance matrix follows that order
    ordered_ids = get_ordered_ids(tree)
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    index_edges = get_index_edges(tree, ordered_ids)
    # the reference points keep the frames from being reflected arbitrarily
    reference_points = Euclid.edm_to_points(D).T[:3].T
    # create the animation frames and write them as image files
    last_frame_index = float(args.nframes - 1)
    pbar = Progress.Bar(args.nframes)
    for frame_index in range(args.nframes):
        linear_progress = frame_index / last_frame_index
        progress = (sigmoid(linear_progress)
                    if args.interpolation == 'sigmoid' else linear_progress)
        mass_vector = get_mass_vector(nvertices, nleaves, progress)
        points = get_canonical_3d_mds(D, mass_vector, reference_points)
        # NOTE(review): crossings is computed but is not passed to
        # draw_crossings below -- confirm that this is intended
        crossings = get_crossings(index_edges, points)
        # define the frame path name
        image_pathname = os.path.join(
            args.output_directory,
            'frame-%04d.%s' % (frame_index, args.image_format))
        # clear the old figure and render the new figure
        mlab.clf()
        add_yz_plane()
        add_zx_plane()
        add_xy_plane()
        coordinates = points.T
        X, Y, Z = coordinates[0], coordinates[1], coordinates[2]
        draw_3d_tree(X, Y, Z, index_edges)
        draw_crossings(X, Y, Z, index_edges)
        mlab.savefig(image_pathname, size=physical_size)
        # update the progress bar
        pbar.update(frame_index + 1)
    pbar.finish()
Esempio n. 11
0
def process(input_lines, good_coverage, bad_coverage, randomization_rate, T,
            nseconds, use_pbar):
    """
    Annotate each chromosome using the three models and report csv rows.
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param T: a transition matrix relating the hidden states
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    """
    # do some initialization
    out = StringIO()
    pbar = None
    start_time = time.time()
    # define the three models
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # read the chromosome data
    chromosomes = parse(input_lines)
    # write the header line;
    # write() is used instead of the print statement
    # so that this also runs under Python 3
    out.write(','.join(g_output_header_row) + '\n')
    # prepare to annotate the chromosomes
    if use_pbar:
        pbar = Progress.Bar(len(chromosomes))
    # annotate the chromosomes using the models
    try:
        for i, chromosome in enumerate(chromosomes):
            # the time limit is checked before every chromosome
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            chromosome.annotate_likelihoods(models)
            chromosome.annotate_posteriors(T, models)
            rows = chromosome.get_rows_of_strings()
            out.write('\n'.join(','.join(row) for row in rows) + '\n')
            if pbar:
                pbar.update(i + 1)
    except KeyboardInterrupt:
        # stop the progress bar before passing the interrupt along
        if pbar:
            pbar.finish()
        # a bare raise keeps the traceback of the original interrupt
        raise
Esempio n. 12
0
def main(args):
    """
    Write the lines of the input file to one output file per name.
    The first pass over the input checks it and finds the names,
    and the second pass writes the named lines to their files.
    @param args: positional and flaglike arguments
    """
    # read the arguments
    input_filename = os.path.abspath(os.path.expanduser(args.infile))
    output_directory = os.path.abspath(os.path.expanduser(args.outdir))
    force = args.force
    low, high = args.low, args.high
    errlow, errhigh = args.errlow, args.errhigh
    # make sure that the output directory exists,
    # creating it only when forced
    if force and not os.path.isdir(output_directory):
        os.makedirs(output_directory)
    if not os.path.isdir(output_directory):
        msg = 'output directory does not exist: ' + output_directory
        raise Exception(msg)
    # create the scanner object which will be used for two passes
    scanner = Scanner(low, high, args.fill, errlow, errhigh)
    # Do the first pass,
    # checking for errors and gathering info about the chromosomes.
    name_to_path = {}
    with open(input_filename) as fin:
        for name in scanner.scan(fin):
            output_filename = args.out_prefix + name + args.out_suffix
            fpath = os.path.join(output_directory, output_filename)
            name_to_path[name] = fpath
            # existing output is only overwritten when forced
            if not force and os.path.exists(fpath):
                raise Exception('output file already exists: ' + fpath)
    nticks = scanner.get_npositions()
    pbar = Progress.Bar(nticks)
    # Open the files and do the second pass,
    # writing the files and updating the progress bar.
    # The handles are closed in the finally clause so that they are
    # released even if opening or writing fails part of the way through.
    name_to_fout = {}
    try:
        for name, fpath in name_to_path.items():
            name_to_fout[name] = open(fpath, 'wt')
        with open(input_filename) as fin:
            for name, line in scanner.gen_named_lines(fin):
                name_to_fout[name].write(line + '\n')
                pbar.increment()
    finally:
        # close the files
        for fout in name_to_fout.values():
            fout.close()
Esempio n. 13
0
def main(args):
    """
    Write the frames of an animation as numbered image files.
    The yaw and pitch passed to the renderer are functions of the
    fraction of the animation that has elapsed at each frame.
    @param args: the parsed command line arguments
    @raise ValueError: if fewer than two frames are requested
    """
    # at least two frames are needed so that the fraction below is defined
    if args.nframes < 2:
        raise ValueError('nframes should be at least 2')
    # the requested size of the images in pixels
    physical_size = (args.physical_width, args.physical_height)
    last_frame_index = float(args.nframes - 1)
    # create the animation frames and write them as image files
    pbar = Progress.Bar(args.nframes)
    for frame_index in range(args.nframes):
        # t runs from zero at the first frame to one at the last frame
        t = frame_index / last_frame_index
        image_string = get_animation_frame(
            args.image_format, physical_size, args.scale, args.tree,
            args.eigenvector_index, t_to_yaw(t), t_to_pitch(t))
        image_pathname = os.path.join(
            args.output_directory,
            'frame-%04d.%s' % (frame_index, args.image_format))
        with open(image_pathname, 'wb') as fout:
            fout.write(image_string)
        pbar.update(frame_index + 1)
    pbar.finish()
Esempio n. 14
0
def main(args):
    """
    Write the frames of a 2d tree animation as numbered image files.
    The mass vector is recomputed for every frame from the interpolation
    progress, and the points are recomputed from that mass vector.
    @param args: the parsed command line arguments
    @raise ValueError: if fewer than two frames are requested
    """
    # at least two frames are needed to interpolate between the end points
    if args.nframes < 2:
        raise ValueError('nframes should be at least 2')
    # the requested size of the images in pixels
    physical_size = (args.physical_width, args.physical_height)
    # build the newick tree from the string and count its vertices
    tree = NewickIO.parse(args.tree, FelTree.NewickTree)
    nvertices = sum(1 for _ in tree.preorder())
    nleaves = sum(1 for _ in tree.gen_tips())
    # the ids are ordered with the leaves first,
    # and the distance matrix follows that order
    ordered_ids = get_ordered_ids(tree)
    D = np.array(tree.get_partial_distance_matrix(ordered_ids))
    index_edges = get_index_edges(tree, ordered_ids)
    # the reference points keep the frames from being reflected arbitrarily
    reference_points = Euclid.edm_to_points(D).T[:2].T
    # create the animation frames and write them as image files
    last_frame_index = float(args.nframes - 1)
    pbar = Progress.Bar(args.nframes)
    for frame_index in range(args.nframes):
        linear_progress = frame_index / last_frame_index
        progress = (sigmoid(linear_progress)
                    if args.interpolation == 'sigmoid' else linear_progress)
        mass_vector = get_mass_vector(nvertices, nleaves, progress)
        points = get_canonical_2d_mds(D, mass_vector, reference_points)
        image_string = get_animation_frame(args.image_format, physical_size,
                                           args.scale, mass_vector,
                                           index_edges, points)
        image_pathname = os.path.join(
            args.output_directory,
            'frame-%04d.%s' % (frame_index, args.image_format))
        with open(image_pathname, 'wb') as fout:
            fout.write(image_string)
        pbar.update(frame_index + 1)
    pbar.finish()
Esempio n. 15
0
def _reject_arguments(command, command_args):
    """Raise MySyntaxError if arguments were given to a command that takes none."""
    if command_args:
        raise MySyntaxError(
            'the %s command does not take any arguments' % command)


def _parse_position_args(command, command_args):
    """Return the chromosome string and the integer position of a find command."""
    if len(command_args) != 2:
        raise MySyntaxError('the %s command takes two arguments' % command)
    chromosome_string, chromosome_position_string = command_args
    try:
        chromosome_position = int(chromosome_position_string)
    except ValueError:
        raise MySyntaxError('the chromosome position should be an integer')
    return chromosome_string, chromosome_position


def _require_pieces_directory(pieces_directory, expect_split_done):
    """
    Raise MyConfigError unless the directory of split fasta files exists.
    @param pieces_directory: the directory of split fasta files
    @param expect_split_done: True to advise running the split command
    """
    if os.path.isdir(pieces_directory):
        return
    err_lines = [
        'The directory for the split fasta files was not found: ' +
        pieces_directory
    ]
    if expect_split_done:
        err_lines.extend([
            'If this directory exists somewhere else, then cd to its parent directory.',
            'If this directory has not been created, then create it and run the split command.'
        ])
    else:
        err_lines.append(
            'Please create this directory or cd to its parent directory.')
    raise MyConfigError('\n'.join(err_lines))


def _require_index_directory(index_directory, expect_index_done):
    """
    Raise MyConfigError unless the directory of index files exists.
    @param index_directory: the directory of index files
    @param expect_index_done: True to advise running the index command
    """
    if os.path.isdir(index_directory):
        return
    err_lines = [
        'The directory for the index files was not found: ' + index_directory
    ]
    if expect_index_done:
        err_lines.extend([
            'If this directory exists somewhere else, then cd to its parent directory.',
            'If this directory has not been created, then create it and run the index command.'
        ])
    else:
        err_lines.append(
            'Please create this directory or cd to its parent directory.')
    raise MyConfigError('\n'.join(err_lines))


def _require_original_fasta(original_fasta_filename):
    """Raise MyConfigError unless the fasta file is listed in the current directory."""
    if original_fasta_filename not in os.listdir('.'):
        err_lines = [
            'The file %s was not found in the current directory.' %
            original_fasta_filename, 'Please download this file from:',
            'http://hgdownload.cse.ucsc.edu/goldenPath/hg18/multiz28way/alignments/'
        ]
        raise MyConfigError('\n'.join(err_lines))


def _summarize_fasta(original_fasta_filename):
    """Return a summary string computed from the header of each alignment in the file."""
    # initialize the progress bar
    nbytes_total = os.path.getsize(original_fasta_filename)
    pbar = Progress.Bar(nbytes_total)
    # initialize the summary
    mod3 = {0: 0, 1: 0, 2: 0}
    length_diff_dict = {}
    # Summarize by reading each alignment from the file.
    # The with statement closes the file even if a header fails to parse.
    approx_nbytes_read = 0
    with open(original_fasta_filename) as fin:
        for lines in Util.gen_paragraphs(fin):
            # only the header line of each alignment is parsed
            p = KGEA.LocationParser(lines[0])
            genomic_length = (p.last_index - p.first_index) + 1
            mod3[genomic_length % 3] += 1
            diff = 3 * p.length - genomic_length
            length_diff_dict[diff] = length_diff_dict.get(diff, 0) + 1
            # update the progress bar
            approx_nbytes_read += sum(len(line) for line in lines)
            pbar.update(approx_nbytes_read)
    # the byte count above is approximate, so move the bar to its total
    pbar.update(nbytes_total)
    # return the summary
    summary_lines = [
        'genomic span of %d mod 3: %d sequences' % (i, mod3[i])
        for i in range(3)
    ]
    summary_lines.append('histogram of 3*aa_length - genomic span:')
    for key, value in sorted(length_diff_dict.items()):
        summary_lines.append('%d : %d' % (key, value))
    return '\n'.join(summary_lines)


def main(options, args):
    """
    Dispatch one of the commands split, index, find-alignment,
    find-column, or summarize.
    @param options: from optparse
    @param args: from optparse
    @return: a response string
    @raise MySyntaxError: if the command or its arguments are invalid
    @raise MyConfigError: if a required file or directory is missing
    """
    # get this file from the web directory
    # http://hgdownload.cse.ucsc.edu/goldenPath/hg18/multiz28way/alignments/
    original_fasta_filename = 'knownGene.exonAA.fa'
    # the original fasta file is broken into a bunch of pieces and put into this directory
    pieces_directory = 'fasta'
    # the index files that map genomic locations to fasta subfiles are in this directory
    index_directory = 'index'
    # this file keeps a list of valid chromosome names from the original fasta file
    chromosome_filename = 'chromosomes.txt'
    # assert that a command was given with the script
    if not args:
        raise MySyntaxError('no command was given')
    # try to dispatch the command
    command = args[0]
    command_args = args[1:]
    if command == 'split':
        _reject_arguments(command, command_args)
        _require_pieces_directory(pieces_directory, False)
        _require_original_fasta(original_fasta_filename)
        splitter = KGEA.Splitter(original_fasta_filename, pieces_directory)
        splitter.run(verbose=options.verbose)
        return ''
    elif command == 'index':
        _reject_arguments(command, command_args)
        _require_pieces_directory(pieces_directory, True)
        _require_index_directory(index_directory, False)
        indexer = KGEA.Indexer(index_directory, chromosome_filename,
                               pieces_directory)
        indexer.run(verbose=options.verbose)
        return ''
    elif command in ('find-alignment', 'find-column'):
        # the two find commands differ only in the finder method that is used
        chromosome_string, chromosome_position = _parse_position_args(
            command, command_args)
        _require_pieces_directory(pieces_directory, True)
        _require_index_directory(index_directory, True)
        finder = KGEA.Finder(index_directory, chromosome_filename,
                             pieces_directory)
        if command == 'find-alignment':
            found_lines = finder.get_alignment_lines(chromosome_string,
                                                     chromosome_position,
                                                     verbose=options.verbose)
        else:
            found_lines = finder.get_column_lines(chromosome_string,
                                                  chromosome_position,
                                                  verbose=options.verbose)
        if not found_lines:
            return 'no amino acid was found at this position'
        return '\n'.join(found_lines)
    elif command == 'summarize':
        _reject_arguments(command, command_args)
        _require_original_fasta(original_fasta_filename)
        return _summarize_fasta(original_fasta_filename)
    else:
        raise MySyntaxError('invalid command: ' + command)
Esempio n. 16
0
def do_command_line_analysis(options):
    """
    Print some stuff to stdout, and show a progress bar on stderr.
    @param options: an object from optparse
    """
    # load the tree, using the default tree if no filename was provided
    tree, tree_remark = get_tree_and_remark(options)
    # initialize the simulation objects
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'),
        Simulation(Clustering.StoneSpectralSignDMS(), 'nj',
                   'spectral sign cut with neighbor joining fallback'),
        Simulation(Clustering.RandomDMS(), 'nj', 'random partitioning')
    ]
    # possibly add the slow simulation
    if options.use_exact:
        sims.append(
            Simulation(Clustering.StoneExactDMS(), 'nj',
                       'exact criterion with neighbor joining fallback'))
    # define the simulation parameters
    reconstruction_count = options.nsamples
    sequence_length_string = options.sequence_length
    if sequence_length_string == 'inf':
        sequence_length = float('inf')
    else:
        sequence_length = int(sequence_length_string)
    inf_replacement = 20.0
    if options.reject_inf:
        inf_replacement = None
    elif options.replace_inf:
        try:
            inf_replacement = float(options.replace_inf)
        except ValueError:
            msg = 'invalid replace_inf value: '
            raise OptionError(msg + str(options.replace_inf))
    zero_replacement = 0
    if options.reject_zero:
        zero_replacement = None
    elif options.replace_zero:
        try:
            zero_replacement = float(options.replace_zero)
        except ValueError:
            msg = 'invalid replace_zero value: '
            raise OptionError(msg + str(options.replace_zero))
    # start the html file
    print '<html><body>'
    # show the simulation parameters
    print 'original tree source:', tree_remark, '<br/>'
    print 'reconstruction count:', reconstruction_count, '<br/>'
    print 'sequence length:', sequence_length, '<br/>'
    # set the simulation parameters for each simulation
    for sim in sims:
        sim.set_original_tree(tree)
        # If there is only one reconstruction per method
        # then show the progress of the tree builder.
        if reconstruction_count == 1:
            sim.set_verbose()
    # define an arbitrary but consistent ordering of the taxa
    ordered_names = [node.name for node in tree.gen_tips()]
    try:
        # attempt to simulate a bunch of distance matrices
        if options.verbose:
            print 'sampling', reconstruction_count, 'distance matrices...'
        # initialize the distance matrix sampler
        sampler = DMSampler.DMSampler(tree, ordered_names, sequence_length)
        sampler.set_inf_replacement(inf_replacement)
        sampler.set_zero_replacement(zero_replacement)
        # start the progress bar
        pbar = Progress.Bar(1.0)
        # sample some distance matrices
        distance_matrices = []
        for result in sampler.gen_samples_or_none():
            # if we got a result then update the distance matrix list
            if result:
                sequence_list, D = result
                distance_matrices.append(D)
            # Update the progressbar regardless of whether or not
            # the proposal was accepted.
            remaining_acceptances = reconstruction_count - len(
                distance_matrices)
            numerator = sampler.get_completed_proposals()
            denominator = numerator + sampler.get_remaining_proposals(
                remaining_acceptances)
            dms_fraction = float(numerator) / float(denominator)
            dms_total = 1.0 / (1 + len(sims))
            pbar.update(dms_fraction * dms_total)
            # if we have enough samples then break the loop
            if not remaining_acceptances:
                break
        # reconstruct trees using various methods
        for i, sim in enumerate(sims):
            if options.verbose:
                print 'running "%s"...' % sim.description
            sim.run(distance_matrices, ordered_names)
            pbar.update(float(i + 2) / float(1 + len(sims)))
        # stop the progress bar
        pbar.finish()
        # get the simulation data
        table = [('method', 'seconds', 'uniform loss', 'weighted loss')]
        for sim in sims:
            table.append((sim.description, sim.get_running_time(),
                          sim.get_uniform_loss(), sim.get_deep_loss()))
        # convert the row major matrix into an html table
        print HtmlTable.get_table_string(table)
        # end the html file
        print '</html></body>'
    except KeyboardInterrupt:
        print 'interrupted stage', pbar.progress, 'of', pbar.high
Esempio n. 17
0
def do_hard_coded_analysis_b(tree, tree_remark):
    """
    Do a hardcoded analysis of tree reconstruction methods.
    Make R files of ordered reconstruction losses.
    @param tree: a tree object
    @param tree_remark: a string that is a comment about the tree
    """
    # define an arbitrary order for the names of the leaves of the tree
    ordered_names = list(node.name for node in tree.gen_tips())
    # use some replicates
    reconstruction_count = 100
    # Make R files for reconstruction results from sequences
    # of some number of nucleotides in length.
    sequence_length = 2000
    # define the tree reconstruction methods to be used
    sims = [
        Simulation(Clustering.NeighborJoiningDMS(), 'nj', 'neighbor joining'),
        Simulation(Clustering.StoneSpectralSignDMS(), 'nj', 'spectral sign')
    ]
    # set tree reconstruction parameters
    for sim in sims:
        sim.set_original_tree(tree)
    # initialize the distance matrix sampler
    sampler = DMSampler.InfiniteAllelesSampler(tree, ordered_names,
                                               sequence_length)
    sampler.set_inf_replacement(20.0)
    sampler.set_zero_replacement(0.0)
    # start the progress bar
    pbar = Progress.Bar(1.0)
    # sample some distance matrices
    distance_matrix_start_time = time.time()
    distance_matrices = []
    for result in sampler.gen_samples_or_none():
        # if we got a result then update the distance matrix list
        if result:
            sequence_list, D = result
            distance_matrices.append(D)
        # Update the progressbar regardless of whether or not
        # the proposal was accepted.
        remaining_acceptances = reconstruction_count - len(distance_matrices)
        numerator = sampler.get_completed_proposals()
        denominator = numerator + sampler.get_remaining_proposals(
            remaining_acceptances)
        dms_fraction = float(numerator) / float(denominator)
        dms_total = 1.0 / (1 + len(sims))
        pbar.update(dms_fraction * dms_total)
        # if we have enough samples then break the loop
        if not remaining_acceptances:
            break
    distance_matrix_seconds = time.time() - distance_matrix_start_time
    # reconstruct trees using various methods
    reconstruction_seconds = []
    for i, sim in enumerate(sims):
        reconstruction_start_time = time.time()
        print 'reconstructing', len(distance_matrices), 'trees'
        print 'using', sim.description
        sim.run(distance_matrices, ordered_names)
        pbar.update(float(i + 2) / float(1 + len(sims)))
        reconstruction_seconds.append(time.time() - reconstruction_start_time)
    # stop the progress bar
    pbar.finish()
    # consider the neighbor joining and the spectral sign results
    nj_sim, ss_sim = sims
    # extract the simulation data
    label_list_pairs = [
        ('nj.unweighted', nj_sim.get_normalized_error_counts()),
        ('ss.unweighted', ss_sim.get_normalized_error_counts()),
        ('nj.weighted', nj_sim.get_normalized_loss_values()),
        ('ss.weighted', ss_sim.get_normalized_loss_values())
    ]
    labels, transposed_table = zip(*label_list_pairs)
    table = zip(*transposed_table)
    table_string = RUtil.get_table_string(table, labels)
    # write the table
    filename = 'out3.table'
    with open(filename, 'w') as fout:
        print >> fout, '# tree source:', tree_remark
        print >> fout, '# number of taxa:', len(ordered_names)
        print >> fout, '# sampled distance matrices:', len(distance_matrices)
        print >> fout, '# sampling seconds elapsed:', distance_matrix_seconds
        print >> fout, '# sites per sequence:', sequence_length
        for sim, seconds in zip(sims, reconstruction_seconds):
            msg_a = '# seconds elapsed for tree reconstruction using '
            msg_b = sim.description + ': ' + str(seconds)
            print >> fout, msg_a + msg_b
        print >> fout, table_string
    print 'wrote', filename
Esempio n. 18
0
def process(ntaxa, nseconds, seqlen, nsamples, branch_length_sampler,
            use_pbar):
    """
    Sample random trees and distance matrices and test eigenvector splits.
    For each requested sample this keeps drawing a tree, its branch lengths
    and a distance matrix until a draw is accepted, and then checks whether
    the split found by BuildTreeTopology.split_using_eigenvector
    is one of the nontrivial splits of the sampled tree.
    NOTE(review): this listing stops inside the sampling loop,
    so the handlers of the outer try block, the updates of the accumulation
    matrix, and the construction of the returned table are not visible here.
    @param ntaxa: the number of taxa per tree
    @param nseconds: stop after this many seconds; a false value means no limit
    @param seqlen: use this sequence length
    @param nsamples: stop after this many samples per sequence length
    @param branch_length_sampler: this function samples branch lengths independently
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the contents of an R table
    """
    # initialize the global rejection counts
    nrejected_zero = 0
    nrejected_inf = 0
    nrejected_fail = 0
    naccepted = 0
    # Initialize the accumulation matrix.
    # The rows specify the size of the smaller side of the initial split.
    # The columns specify the compatibility status of the split.
    # NOTE(review): for an integer ntaxa the row count relies on the
    # flooring behavior of / in Python 2, which is the syntax used by
    # this function -- confirm before running under another version.
    nsmall_sizes = (ntaxa / 2) + 1
    # NOTE(review): np.int is an alias of the builtin int that newer
    # NumPy releases have removed -- confirm the NumPy version in use.
    accum = np.zeros((nsmall_sizes, 2), dtype=np.int)
    # Repeatedly analyze samples.
    # We might have to stop early if we run out of time or if ctrl-c is pressed.
    # If we have to stop early, then show the results of the progress so far.
    termination_reason = 'no reason for termination was given'
    start_time = time.time()
    # the progress bar is None when no progress bar was requested
    pbar = Progress.Bar(nsamples) if use_pbar else None
    try:
        for sample_index in range(nsamples):
            # keep trying to get an accepted sample
            while True:
                # check the time
                # a false nseconds skips the time limit entirely
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                # first sample a tree and get its set of informative splits
                tree = TreeSampler.sample_agglomerated_tree(ntaxa)
                true_splits = tree.get_nontrivial_splits()
                # sample the branch lengths
                for branch in tree.get_branches():
                    branch.length = branch_length_sampler()
                # Attempt to sample a distance matrix.
                # If the sample was rejected then note the reason and go back to the drawing board.
                # The bound exception objects are not used by the handlers.
                try:
                    D = sample_distance_matrix(tree, seqlen)
                except InfiniteDistanceError as e:
                    nrejected_inf += 1
                    continue
                except ZeroDistanceError as e:
                    nrejected_zero += 1
                    continue
                # Attempt to estimate the primary split of the tree from the distance matrix.
                # If there was a technical failure then note it and go back to the drawing board.
                # Otherwise note the compatibility and balance of the split.
                try:
                    eigensplit = BuildTreeTopology.split_using_eigenvector(D)
                    small_size = min(len(side) for side in eigensplit)
                    if eigensplit in true_splits:
                        compatibility = 1
                    else:
                        compatibility = 0
                except BuildTreeTopology.DegenerateSplitException, e:
                    # a degenerate split is recorded with a smaller side
                    # of size zero and is treated as compatible
                    small_size = 0
                    compatibility = 1
                except BuildTreeTopology.InvalidSpectralSplitException, e:
                    # an invalid spectral split is a rejection, so resample
                    nrejected_fail += 1
                    continue
Esempio n. 19
0
def process(linesources, good_coverage, randomization_rate, stickiness,
            nseconds, use_pbar):
    """
    Annotate the chromosomes found in each of the resequencing files.
    Each file is read completely, validated as a header row followed
    by data rows of eight columns, and grouped into chromosomes
    whose posteriors are then annotated using the good and bad superstates.
    @param linesources: open resequencing files for reading
    @param good_coverage: the expected number of reads at informative positions
    @param randomization_rate: the probability of an error per base call
    @param stickiness: level of stickiness
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: the multi-line string of the resulting csv file
    (NOTE(review): no return statement is present in this function
    as it stands -- confirm against the complete source)
    """
    # remember when we started so that the time limit can be enforced
    start_time = time.time()
    termination_reason = 'finished the analysis'
    # every input file must have exactly this many columns
    ncolumns_expected = 8
    # both superstates share the rate, the coverage, and the cache size
    cache_size_per_superstate = 100000
    good_state = ReadCoverageGap.Good(
        randomization_rate, good_coverage, cache_size_per_superstate)
    bad_state = ReadCoverageGap.Bad(
        randomization_rate, good_coverage, cache_size_per_superstate)
    superstate_names = ['good', 'bad']
    superstates = [good_state, bad_state]
    # the annotated chromosomes of all of the files are collected here
    chromosomes = []
    # the progress bar counts finished files
    pbar = None
    if use_pbar:
        pbar = Progress.Bar(len(linesources))
    try:
        for nfinished, linesource in enumerate(linesources, 1):
            # read and validate the rows of this file
            lines = Util.get_stripped_lines(linesource.readlines())
            if len(lines) < 2:
                raise ValueError('there should be at least two lines of input')
            rows = [line_to_row(line) for line in lines]
            first_row = rows[0]
            ncolumns = len(first_row)
            if ncolumns != ncolumns_expected:
                raise ValueError('expected %d columns of input: %s' %
                                 (ncolumns_expected, first_row))
            if any(len(row) != ncolumns for row in rows):
                raise ValueError(
                    'each row of input should have the same number of elements as the first row'
                )
            # group the rows after the first one into chromosomes,
            # giving up if the time limit has been passed
            chromosome_dict = {}
            for row in rows[1:]:
                if nseconds and time.time() - start_time > nseconds:
                    raise TimeoutError()
                process_genomic_position(row, chromosome_dict)
            # take the chromosomes in the sorted order of the dictionary items
            current_chromosomes = [
                pair[1] for pair in sorted(chromosome_dict.items())
            ]
            # annotate each chromosome and then drop its per-position data
            for chromosome in current_chromosomes:
                chromosome.annotate_posteriors(stickiness, superstates)
                chromosome.del_position_specific_data()
            # the chromosomes of a file are added only after
            # all of them have been annotated
            chromosomes += current_chromosomes
            if pbar:
                pbar.update(nfinished)
    except KeyboardInterrupt:
        termination_reason = 'early termination by control-c'
Esempio n. 20
0
def process(input_lines, good_coverage, bad_coverage, randomization_rate,
            nseconds, use_pbar):
    """
    @param input_lines: lines of input of csv data including the header
    @param good_coverage: the expected number of reads at informative positions
    @param bad_coverage: the expected number of reads at uninformative positions
    @param randomization_rate: the probability of an error per read
    @param nseconds: None or impose a time limit of this many seconds
    @param use_pbar: True iff a progress bar should be used
    @return: a multi-line string of the annotated csv file
    """
    verbose = False
    # validate the number of lines
    if len(input_lines) < 6:
        raise ValueError('there should be at least six lines of input')
    if len(input_lines) % 5 != 1:
        raise ValueError(
            'the input lines should consist of a header plus a multiple of five data lines'
        )
    # break the lines of input into rows of elements
    input_rows = [line_to_row(line) for line in input_lines]
    # validate the columns of data
    ncolumns = len(input_rows[0])
    if ncolumns < 7:
        raise ValueError('there should be at least seven columns of input')
    if ncolumns % 2 != 1:
        raise ValueError('the number of input columns should be odd')
    for row in input_rows:
        if len(row) != ncolumns:
            raise ValueError(
                'each row of input should have the same number of elements as the first row'
            )
    # define the three models
    homozygous = ReadCoverage.Homozygous(randomization_rate, good_coverage)
    heterozygous = ReadCoverage.Heterozygous(randomization_rate, good_coverage)
    overcovered = ReadCoverage.Overcovered(randomization_rate, bad_coverage)
    models = [homozygous, heterozygous, overcovered]
    # initialize the output header row
    header_row = input_rows[0]
    output_header_row = header_row[:5]
    for heading in header_row[5:]:
        if heading.endswith('sco'):
            output_header_row.append(heading)
        elif heading.endswith('cov'):
            output_header_row.extend([
                heading, heading + '_hom', heading + '_het', heading + '_ovr'
            ])
        else:
            raise ValueError(
                'each heading after the fifth should end with sco or cov')
    # get the rest of the rows
    data_rows = input_rows[1:]
    # define the number of genomic positions and the number of strains
    npositions = len(data_rows) / 5
    nstrains = (ncolumns - 5) / 2
    # begin the output
    out = StringIO()
    print >> out, ','.join(output_header_row)
    # initialize some stuff
    start_time = time.time()
    pbar = Progress.Bar(npositions) if use_pbar else None
    try:
        for position in range(npositions):
            # check the time
            if nseconds and time.time() - start_time > nseconds:
                raise TimeoutError()
            # get a chunk of five consecutive rows
            position_rows = [data_rows[position * 5 + i] for i in range(5)]
            # get the corresponding log likelihoods
            log_likelihood_lists = get_log_likelihoods_per_strain(
                position_rows, models)
            # construct five annotated output lines
            for position_row in position_rows:
                output_row = position_row[:5]
                for i, log_likelihoods in enumerate(log_likelihood_lists):
                    # add the coverage, three annotations, and the score
                    coverage_string = position_row[5 + 2 * i]
                    score_string = position_row[5 + 2 * i + 1]
                    if log_likelihoods:
                        annotations = [str(x) for x in log_likelihoods]
                    else:
                        annotations = ['-', '-', '-']
                    output_row.extend([coverage_string] + annotations +
                                      [score_string])
                print >> out, ','.join(output_row)
            # update the progress bar
            if pbar:
                pbar.update(position + 1)
    except KeyboardInterrupt, e:
        if pbar:
            pbar.finish()
        raise e