Example #1
def main(file1=None,
         file2=None,
         base_directory=None,
         configuration_file=None,
         config_base_directory=None,
         file_out='input_data.txt.gz',
         row_wise_out=None,
         col_wise_out=None):

    # Load configuration for I/O (or automatically generate if not provided as an input)
    io_config = io.load_io_config(file1_path=file1, file2_path=file2, base_dir=base_directory,
                                  config_path=configuration_file, config_base_dir=config_base_directory)

    # Load and merge datasets
    data = io.load_datasets_from_config(io_config)

    # Get data row-wise and column-wise in json format
    if row_wise_out is not None:
        row_wise_data = preprocess.tabular2json(data, data.index, data.columns, by_col=False)
        io.write_json(row_wise_data, row_wise_out)

    if col_wise_out is not None:
        col_wise_data = preprocess.tabular2json(data, data.index, data.columns, by_col=True)
        io.write_json(col_wise_data, col_wise_out)

    # Write merged data sets to output
    data.to_csv(file_out, sep='\t', index=True, header=True)

    return data
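
The merged table is written as a gzipped, tab-separated file via pandas' to_csv. As a quick sanity check it can be read back with plain pandas; a minimal sketch, simply reusing the default file_out name from above:

import pandas as pd

# pandas infers gzip compression from the .gz suffix
merged = pd.read_csv('input_data.txt.gz', sep='\t', index_col=0)
print(merged.shape)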
Example #2
def main(file1=None,
         file2=None,
         base_directory=None,
         configuration_file=None,
         config_base_directory=None):

    # Load configuration for I/O (or automatically generate if not provided as an input)
    io_config = io.load_io_config(file1_path=file1, file2_path=file2, base_dir=base_directory,
                                  config_path=configuration_file, config_base_dir=config_base_directory)

    # Load and merge datasets
    data = io.load_datasets_from_config(io_config)

    # Filter out low-information columns
    data = preprocess.eliminate_low_information_columns(data, 0.01)

    # Identify, separate, and save numeric and categorical columns. Numeric data will be used for clustering
    # and categorical (attribute) data will be used for enrichment analysis. Note that categorical data is
    # saved as occurrence counts (in json format)
    data_numeric, data_object = preprocess.preprocess_and_split(data, fill_na=True)

    # Save numeric data to file in row-wise and column-wise formats
    data_numeric.to_csv('rows_numeric_data.txt.gz', sep='\t', header=False)  # Row-wise
    data_numeric.T.to_csv('cols_numeric_data.txt.gz', sep='\t', header=False)  # Column-wise

    # Save categorical (attribute) data
    data_object.to_json('cols_attribute_data.json.gz', orient='columns')

    # Save counts of value occurrences (in json format)
    data_object = preprocess.tabular2json(data_object.values, data_object.index, data_object.columns,
                                          by_col=True, pad_rows=False)
    data_object = preprocess.generate_occurrence_counts(data_object)
    io.write_json(data_object, 'cols_attribute_counts.json.gz')
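
preprocess.preprocess_and_split is project-specific; for orientation, a plain-pandas sketch of the same idea (splitting columns by dtype and filling missing numeric values) might look like the following. The column names and fill strategy are illustrative assumptions, not the library's actual behavior.

import pandas as pd

df = pd.DataFrame({'age': [30.0, None, 41.0], 'city': ['Oslo', 'Lima', None]})

# Numeric columns (used for clustering), with missing values filled
data_numeric = df.select_dtypes(include='number')
data_numeric = data_numeric.fillna(data_numeric.mean())

# Categorical / attribute columns (used for enrichment analysis)
data_object = df.select_dtypes(exclude='number')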
Example #3
def main(file,
         sep=None,
         comment=None,
         index_col=None,
         header_row=None,
         rows_out='rows_out.json.gz',
         cols_out='cols_out.json.gz'):

    assert isinstance(file, str)

    # Read data as a data frame
    data = pd.read_csv(file,
                       sep=sep,
                       comment=comment,
                       index_col=index_col,
                       header=header_row)

    num_cols, cat_cols = identify_types(data)
    with open('col_types.json', 'w') as f:
        json.dump({
            'categorical': cat_cols,
            'numerical': num_cols
        },
                  f,
                  sort_keys=True,
                  indent='\t')

    # Drop columns with missing data
    data.dropna(axis=1, inplace=True)

    # Get row and column labels
    row_labels = list(data.index)
    row_labels = [str(x) for x in row_labels]

    col_labels = list(data.columns)
    col_labels = [str(x) for x in col_labels]

    # Convert to list of lists for subsequent processing
    data = data.values

    # Get data row-wise and column-wise in json format
    rowwise_data = preprocess.tabular2json(data,
                                           row_labels,
                                           col_labels,
                                           by_col=False,
                                           pad_rows=False)
    colwise_data = preprocess.tabular2json(data,
                                           row_labels,
                                           col_labels,
                                           by_col=True,
                                           pad_rows=True)

    # Write row-wise json
    io.write_json(rowwise_data, rows_out)

    # Write column-wise json
    io.write_json(colwise_data, cols_out)

    return rowwise_data, colwise_data
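
identify_types is not shown in this example; a dtype-based stand-in (a hypothetical helper, not the project's implementation) could be:

import pandas as pd

def identify_types_by_dtype(df):
    """Return (numerical, categorical) column-name lists based on pandas dtypes."""
    num_cols = [str(c) for c in df.select_dtypes(include='number').columns]
    cat_cols = [str(c) for c in df.columns if str(c) not in num_cols]
    return num_cols, cat_cols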
Example #4
def main(file, out='hist_out.json'):
    assert isinstance(file, str)

    # Load data from .json or .json.gz file
    data = io.load_json(file)

    # Generate "histogram" of occurrence counts as a dictionary
    data_counts = preprocess.generate_occurrence_counts(
        data, to_lower=True, replace_whitespace='-', collapse_singletons=True)

    # Write data to .json or .json.gz file format
    io.write_json(data_counts, out)

    return data_counts
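
generate_occurrence_counts comes from the project's preprocess module; a simplified sketch of the core idea (per-key value counts after lowercasing and whitespace replacement) is shown below. The option semantics, including collapse_singletons, are assumptions here.

from collections import Counter

def occurrence_counts(data, to_lower=True, replace_whitespace='-'):
    """Count value occurrences per key in a dict of lists (illustrative only)."""
    counts = {}
    for key, values in data.items():
        cleaned = []
        for v in values:
            s = str(v)
            if to_lower:
                s = s.lower()
            if replace_whitespace:
                s = s.replace(' ', replace_whitespace)
            cleaned.append(s)
        counts[key] = dict(Counter(cleaned))
    return counts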
Example #5
def main(file, length=100, out='fp_out.json'):
    assert isinstance(file, str)

    # Load data from .json or .json.gz file
    data = io.load_json(file)

    # Calculate fingerprints
    data_fp = make.encode_fp(data, length)

    # Convert numpy arrays to lists for conversion to json
    data_fp = {k: v.tolist() for k, v in data_fp.items()}

    # Write data to .json or .json.gz file format
    io.write_json(data_fp, out)

    return data_fp
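
The .tolist() conversion is needed because numpy arrays are not JSON-serializable; a minimal illustration:

import json
import numpy as np

fp = {'item1': np.array([0, 1, 1, 0])}
# json.dumps(fp) would raise TypeError: ndarray is not JSON serializable
serializable = {k: v.tolist() for k, v in fp.items()}
print(json.dumps(serializable))  # {"item1": [0, 1, 1, 0]}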
Example #6
def main(file,
         method='ward',
         criterion='distance',
         cl_labels_out='cluster_labels.txt.gz',
         cl_members_out='cluster_members.json.gz'):

    assert isinstance(file, str)

    # Load data to be clustered
    data = pd.read_table(file, index_col=0, header=None)

    # Normalize data down columns (i.e. features or attributes)
    data = (data - data.mean(axis=0)) / data.std(axis=0)

    # Use data index as labels
    labels = np.array(data.index)

    # Perform clustering
    linkage_table = shc.linkage(data.values, method=method)

    # Identify members of all clusters of size 2 or greater
    cluster_members = cluster.get_cluster_membership(linkage_table, labels)

    # Write cluster membership to file
    io.write_json(cluster_members, cl_members_out)

    # Generate a list of dendrogram cutoff values
    distances, num_distances = linkage_table[:, 2], len(linkage_table[:, 2])
    cutoff_values = [
        np.mean(distances[i:min(i + 2, num_distances)])
        for i in range(num_distances)
    ]

    # Generate cluster assignments at different cutoff values
    cluster_assignments = [
        fcluster(linkage_table, c, criterion=criterion)
        for c in reversed(cutoff_values)
    ]

    # Write cluster labels to file
    index = list(range(len(cluster_assignments)))
    cluster_assignments = pd.DataFrame(cluster_assignments,
                                       index=index,
                                       columns=labels)
    cluster_assignments.to_csv(cl_labels_out, sep='\t', index=True)

    return cluster_assignments
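
To make the cutoff logic concrete, here is a small self-contained sketch (toy data, not part of the pipeline) that clusters six points and cuts the dendrogram between the last two merge distances:

import numpy as np
from scipy.cluster import hierarchy as shc

rng = np.random.default_rng(0)
points = np.vstack([rng.normal(0, 0.1, (3, 2)), rng.normal(5, 0.1, (3, 2))])

linkage_table = shc.linkage(points, method='ward')
distances = linkage_table[:, 2]

# Cut between the last two merges, which separates the two obvious groups
cutoff = np.mean(distances[-2:])
labels = shc.fcluster(linkage_table, cutoff, criterion='distance')
print(labels)  # e.g. [1 1 1 2 2 2]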
def main(file,
         link_table_in='rows_hier_linkage.txt.gz',
         cl_members_out='cluster_members.json.gz'):

    assert isinstance(file, str)

    # Load the labels from the PCA output file - as strings
    labels = np.genfromtxt(fname=file,
                           delimiter='\t',
                           dtype='str',
                           usecols=range(0, 1))

    # Load hierarchical clustering linkage table
    linkage_table = pd.read_table(link_table_in, index_col=False, header=None)

    # Identify members of all clusters of size 2 or greater
    cluster_members = cluster.get_cluster_membership(linkage_table.values,
                                                     labels)

    # Write cluster membership to file
    io.write_json(cluster_members, cl_members_out)

    return cluster_members
def main(source_file, cl_members_file, counts_out_file):
    assert isinstance(source_file, str)

    # Load original data in json format
    data = io.load_json(source_file)
    data = pd.DataFrame(data)

    # Load cluster members and labels data
    cl_members_data = io.load_json(cl_members_file)

    # Get occurrence counts for all attributes, for the parent/child trio at each branch point in the cluster hierarchy
    all_occurrence_counts = {}
    for i in range(len(cl_members_data) - 1):
        parent_cnts, child1_cnts, child2_cnts = \
            enrichment.get_parent_child_occurrence_counts(str(i), data, cl_members_data)
        all_counts = {
            'parent': parent_cnts,
            'child1': child1_cnts,
            'child2': child2_cnts
        }
        all_occurrence_counts[str(i)] = all_counts

    # Write occurrence count data to file
    io.write_json(all_occurrence_counts, counts_out_file)
# Assumed reconstruction of the truncated head of this script: the data loading,
# column selection, and pairwise Spearman loop below are placeholders inferred from
# the reporting code that follows, not the original source.
import itertools

import pandas as pd
from scipy.stats import spearmanr

data = pd.read_table('input_data.txt.gz', index_col=0)  # placeholder input path

# Numerical columns to test pairwise
numerical_columns = list(data.select_dtypes(include='number').columns)
N = len(numerical_columns)

# Compute Spearman's rho for every pair of numerical columns
pairs = []
tests_done = 0
for col_a, col_b in itertools.combinations(numerical_columns, 2):
    subset = data[[col_a, col_b]].dropna()
    if len(subset) > 2:
        rho, pval = spearmanr(subset[col_a], subset[col_b])
        tests_done += 1
        blob = {
            'A': col_a,
            'B': col_b,
            'N': len(subset),
            'rho': round(float(rho), 3),
            'pval': float(pval)
        }
        pairs.append(blob)

print("#numerical", N, sep="\t")
print("#tests_done", tests_done, sep="\t")
print('variableA', 'variableB', 'N', 'rho', 'pval', sep="\t")

# Apply a Bonferroni correction and keep only pairs significant at alpha = 0.05
significant = []
for pair in pairs:
    corrected = pair['pval'] * tests_done
    if corrected <= 0.05:
        pair['pval'] = format(corrected, '.2e')
        significant.append(pair)
        print(pair['A'],
              pair['B'],
              pair['N'],
              pair['rho'],
              pair['pval'],
              sep="\t")

# Summarize the test results and write them to a gzipped json file
result = {
    'relationship': 'is_correlated_to',
    'test_type': 'Spearman correlation',
    'correction': 'Bonferroni',
    'numerical_columns': N,
    'tests_done': tests_done,
    'tests_passed': len(significant),
    'tests': significant
}
io.write_json(result, 'num_assoc.json.gz')
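
The Bonferroni correction above is done by hand (multiplying each p-value by the number of tests). An equivalent route, if statsmodels is available, is multipletests; a minimal sketch under that assumption, using toy p-values rather than the script's own pairs:

from statsmodels.stats.multitest import multipletests

# Toy p-values; in the script above these would be the uncorrected values
# collected before the in-place Bonferroni loop.
pvals = [0.0004, 0.03, 0.5]
reject, pvals_corrected, _, _ = multipletests(pvals, alpha=0.05, method='bonferroni')
print(reject, pvals_corrected)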