Ejemplo n.º 1
0
def main():
    """Run PCoA on a single distance matrix or on every matrix in a directory.

    The input and output locations are read from the parsed command-line
    options (``opts.input_path`` and ``opts.output_path``).

    * If ``input_path`` is a directory, every entry in it that is neither
      hidden (name starting with '.') nor a sub-directory is opened as a
      distance matrix, and its result is written to
      ``<output_path>/pcoa_<basename>.txt``. The output directory is created
      when it does not exist yet.
    * Otherwise ``input_path`` is opened as one distance matrix and the result
      is written directly to ``output_path``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    input_path = opts.input_path
    output_path = opts.output_path

    if isdir(input_path):
        # Run PCoA on all distance matrices in the input dir
        # Create the output directory if it does not exist
        if not exists(output_path):
            makedirs(output_path)

        # Get all the filenames present in the input directory.
        # listdir() returns bare names, so each one has to be joined with
        # input_path before asking whether it is a directory; testing the bare
        # name would resolve it against the current working directory instead.
        file_names = [fname for fname in listdir(input_path)
                      if not (fname.startswith('.') or
                              isdir(join(input_path, fname)))]

        # Loop through all the input files
        for fname in file_names:
            # Get the path to the input distance matrix
            infile = join(input_path, fname)

            # Run PCoA on the input distance matrix
            with open(infile, 'U') as lines:
                pcoa_scores = pcoa(lines)

            # Store the PCoA results on the output directory
            base_fname, ext = splitext(fname)
            out_file = join(output_path, 'pcoa_%s.txt' % base_fname)
            pcoa_scores.write(out_file)

    else:
        # Run PCoA on the input distance matrix
        with open(input_path, 'U') as f:
            pcoa_scores = pcoa(f)
        # Store the results in the output file
        pcoa_scores.write(output_path)
def compute_procrustes(result_tables, expected_pc_lookup, taxonomy_level=6, num_dimensions=3, random_trials=999):
    """ Compute Procrustes M2 and p-values for a set of results

        This is a generator: nothing is computed until it is iterated.

        result_tables: 2d list of tables to be compared to expected tables,
         where the data in the inner list is:
          [dataset_id, reference_database_id, method_id,
           parameter_combination_id, table_fp]
        expected_pc_lookup: 2d dict of dataset_id, reference_db_id to the file
         path of the expected principal coordinate matrix (each value is
         opened and read as lines)
        taxonomy_level: level passed to get_taxonomy_collapser to collapse the
         actual table before computing distances
        num_dimensions: passed to procrustes_monte_carlo as max_dimensions
        random_trials: passed to procrustes_monte_carlo as trials

        Yields one tuple per entry of result_tables:
         (dataset_id, reference_id, method_id, params, actual_m_squared,
          mc_p_value)

        Raises KeyError if expected_pc_lookup has no entry for a
         (dataset_id, reference_id) pair, and ValueError if reading the actual
         BIOM table raises ValueError.
    """
    ### Start code copied ALMOST* directly from compute_prfs - some re-factoring for re-use is
    ### in order here. *ALMOST refers to changes to parser and variable names since expected
    ### is a pc matrix here.

    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        ## look up the expected pc matrix for this dataset / reference pair
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            raise KeyError("Can't find expected table for (%s, %s)." % (dataset_id, reference_id))

        ## parse the actual table and collapse it at the specified taxonomic level
        try:
            # the context manager closes the handle even if parsing fails
            with open(actual_table_fp, "U") as actual_table_f:
                actual_table = parse_biom_table(actual_table_f)
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s" % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(collapse_by_taxonomy)
        ### End code copied directly from compute_prfs.

        # Next block of code, how do I hate thee? Let me count the ways...
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa return a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file" multiple
        #     times, so we actually *need* those the pcs that get passed in to be
        #     lists of lines
        dm = dist_bray_curtis(asarray(list(actual_table.iterSampleData())))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")
        with open(expected_pc_fp, "U") as expected_pc_f:
            expected_pc = list(expected_pc_f)

        ## run Procrustes analysis with monte carlo simulation
        actual_m_squared, trial_m_squareds, count_better, mc_p_value = procrustes_monte_carlo(
            expected_pc,
            actual_pc,
            trials=random_trials,
            max_dimensions=num_dimensions,
            sample_id_map=None,
            trial_output_dir=None,
        )

        yield (dataset_id, reference_id, method_id, params, actual_m_squared, mc_p_value)
Ejemplo n.º 3
0
def generate_pcoa_cloud_from_point_in_omega(map_headers, map_data, biom_object, metric, 
        sequences, iterations, axes, tree_object=None):
    """Rarefy, ordinate and plot a table repeatedly; return the PCoA plot.

    Input:
    map_headers: mapping file headers, forwarded to the plotting helpers
    map_data: mapping file data, forwarded to the plotting helpers
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of rarefy/PCoA replicates to generate
    axes: number of axes to account for (not referenced inside this function)
    tree_object: tree to perform the beta diversity calculation

    Output:
    Whatever make_pcoa_plot returns (a WebGL string representing the PCoA plot)
    """

    # One list per field returned by parse_coords; entry k belongs to replicate k.
    all_headers, all_values, all_eigenvalues, all_pcts = [], [], [], []

    for _ in range(iterations):
        rarefied_table = get_rare_data(biom_object, sequences)
        distance_matrix = single_object_beta(rarefied_table, metric, tree_object)
        pcoa_text = pcoa(distance_matrix)

        # parse_coords is handed a file-like object, so wrap the PCoA output.
        buf = StringIO()
        buf.write(pcoa_text)
        buf.seek(0)
        headers, values, eigvals, pcts = parse_coords(buf)
        buf.close()

        all_headers.append(headers)
        all_values.append(values)
        all_eigenvalues.append(eigvals)
        all_pcts.append(pcts)

    if iterations == 1:
        # A single replicate is plotted as-is; no low/high coordinates exist.
        coords_headers = all_headers[0]
        coords_data = all_values[0]
        coords_eigenvalues = all_eigenvalues[0]
        coords_pct = all_pcts[0]
        coords_low = coords_high = None
    else:
        # Several replicates are summarised by preprocess_coords_file.
        summary = preprocess_coords_file(
            all_headers, all_values, all_eigenvalues, all_pcts,
            map_headers, map_data, custom_axes=None,
            jackknifing_method='IQR', is_comparison=False)
        (coords_headers, coords_data, coords_eigenvalues, coords_pct,
         coords_low, coords_high, _clones) = summary

    return make_pcoa_plot(
        coords_headers, coords_data, coords_eigenvalues, coords_pct,
        map_headers, map_data, coords_low, coords_high, True)
Ejemplo n.º 4
0
def main():
    """Run PCoA on the path given on the command line.

    * A directory is handed to multiple_file_pcoa together with the output
      path.
    * A regular file is opened as a distance matrix, passed to pcoa, and the
      returned string is written to ``opts.output_path``.
    * Anything else prints an error message and exits with status 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if os.path.isdir(opts.input_path):
        multiple_file_pcoa(opts.input_path, opts.output_path)
    elif os.path.isfile(opts.input_path):
        # Context managers close both handles even if pcoa() or write() raises.
        with open(opts.input_path, 'U') as in_f:
            pcoa_res_string = pcoa(in_f)

        # The output file is only created once PCoA has succeeded.
        with open(opts.output_path, 'w') as out_f:
            out_f.write(pcoa_res_string)
    else:
        print("io error, check input file path")
        exit(1)
Ejemplo n.º 5
0
def main():
    """Run PCoA on the path given on the command line.

    * A directory is handed to multiple_file_pcoa together with the output
      path.
    * A regular file is opened as a distance matrix, passed to pcoa, and the
      returned string is written to ``opts.output_path``.
    * Anything else prints an error message and exits with status 1.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if os.path.isdir(opts.input_path):
        multiple_file_pcoa(opts.input_path, opts.output_path)
    elif os.path.isfile(opts.input_path):
        # Context managers close both handles even if pcoa() or write() raises.
        with open(opts.input_path, 'U') as in_f:
            pcoa_res_string = pcoa(in_f)

        # The output file is only created once PCoA has succeeded.
        with open(opts.output_path, 'w') as out_f:
            out_f.write(pcoa_res_string)
    else:
        print("io error, check input file path")
        exit(1)
Ejemplo n.º 6
0
def generate_pcoa_cloud_from_point_in_omega(mapping_file_tuple, biom_object,
                                            metric, sequences, iterations, axes,
                                            tree_object=None):
    """run the randomisations and get a WebGL PCoA plot string representation

    Input:
    mapping_file_tuple: data and headers tuple for representing the mapping
        file; elements [0] and [1] are passed to mapping_file_to_dict
    biom_object: otu table biom object
    metric: string of the name for the beta diversity metric, i. e. 'unifrac'
    sequences: number of sequences per sample
    iterations: number of iterations to generate the pcoa plot
    axes: number of axes to account for (passed to get_pcoa_ellipsoid_coords)
    tree_object: tree to perform the beta diversity calculation

    Output:
    WebGL string representing the PCoA plot (the return value of
    make_pcoa_plot)

    Raises:
    ValueError if get_pcoa_ellipsoid_coords hands back a string instead of the
    ellipse coordinates.
    """

    # get a list of the SampleIds; list() keeps this a real list even where
    # dict.keys() returns a view
    full_id_list = list(mapping_file_to_dict(mapping_file_tuple[0],
                                             mapping_file_tuple[1]).keys())

    # one PCoA result per rarefaction replicate
    pcoa_list = []
    for _ in range(iterations):
        rare_biom_table = get_rare_data(biom_object, sequences)
        beta_dm = single_object_beta(rare_biom_table, metric, tree_object)
        pcoa_list.append(pcoa(beta_dm))

    # convert the list of pcoa lines into ellipsoid coords
    ellipse_coords_by_sampleId, sampleId_to_coords = get_pcoa_ellipsoid_coords(
        pcoa_list, axes, full_id_list)

    # check the ellipses are created correctly: a string result is treated as
    # a failure
    if isinstance(ellipse_coords_by_sampleId, str):
        raise ValueError('Could not create PCoA plot')

    webgl_string = make_pcoa_plot(ellipse_coords_by_sampleId, mapping_file_tuple,
                                  sampleId_to_coords['variation explained'])
    return webgl_string
Ejemplo n.º 7
0
 def test_pcoa(self):
     """Smoke test: pcoa runs on the distance-matrix fixture without raising."""
     # Only truthiness of the result is checked; formatting is tested elsewhere.
     assert pcoa(self.distmtx_txt)
Ejemplo n.º 8
0
 def test_pcoa(self):
     """Smoke test: pcoa runs on the distance-matrix fixture without raising."""
     # Only truthiness of the result is checked; formatting is tested elsewhere.
     assert pcoa(self.distmtx_txt)