def distance(args):
    """Compute pairwise distances between datasets in a JSON abundance matrix.

    Reads ``args.matrix_file`` (JSON with 'columns' — each bearing an 'id' —
    and a 'data' 2-D array), transposes the data so each row is one dataset,
    and applies the distance metric named by ``args.metric``.

    Returns a dict mapping ``(dataset_id_i, dataset_id_j)`` (i < j) to the
    distance between them.  If ``args.to_output == 'distance'`` the full
    matrix is also printed.
    """
    # Context manager replaces the manual open/close of the original.
    with open(args.matrix_file) as json_data:
        data = json.load(json_data)

    datasets = [col['id'] for col in data['columns']]

    # Transpose so each row of `dm` holds one dataset's values.
    dm = np.transpose(np.array(data['data']))

    # Dispatch table replaces the original 16-branch if/elif chain.
    # Unknown metrics fall back to Bray-Curtis, same as the original default.
    metric_funcs = {
        'bray_curtis': dt.dist_bray_curtis,
        'morisita_horn': dt.dist_morisita_horn,
        'canberra': dt.dist_canberra,
        'chisq': dt.dist_chisq,
        'chord': dt.dist_chord,
        'euclidean': dt.dist_euclidean,
        'gower': dt.dist_gower,
        'hellinger': dt.dist_hellinger,
        'kulczynski': dt.dist_kulczynski,
        'manhattan': dt.dist_manhattan,
        'abund_jaccard': dt.dist_abund_jaccard,
        'binary_jaccard': dt.binary_dist_jaccard,
        'pearson': dt.dist_pearson,
        'soergel': dt.dist_soergel,
        'spearman': dt.dist_spearman_approx,
    }
    distance_matrix = metric_funcs.get(args.metric, dt.dist_bray_curtis)(dm)

    # Keep only the upper triangle (i < n): one entry per unordered pair.
    dist = {}
    for i, row in enumerate(distance_matrix):
        for n, d in enumerate(row):
            if i < n:
                dist[(datasets[i], datasets[n])] = d

    if args.to_output == 'distance':
        print(distance_matrix)
    return dist
def distance(args):
    """Return pairwise distances between the datasets of a JSON matrix file.

    NOTE(review): this file contains two identical ``distance`` definitions;
    at import time the later one shadows the earlier — consider deleting one.

    ``args.matrix_file`` is JSON with 'columns' (each with an 'id') and a
    2-D 'data' array.  The chosen metric (``args.metric``) is applied to the
    transposed data; the result is returned as a dict keyed by dataset-id
    pairs ``(id_i, id_j)`` with i < j.  The raw matrix is printed when
    ``args.to_output == 'distance'``.
    """
    # `with` guarantees the handle is closed even on a parse error.
    with open(args.matrix_file) as fh:
        data = json.load(fh)

    dataset_ids = [column['id'] for column in data['columns']]

    # One row per dataset after transposition.
    observations = np.transpose(np.array(data['data']))

    # Lookup table instead of the long if/elif ladder; Bray-Curtis is the
    # default for unrecognized metric names, matching the original `else`.
    dispatch = {
        'bray_curtis': dt.dist_bray_curtis,
        'morisita_horn': dt.dist_morisita_horn,
        'canberra': dt.dist_canberra,
        'chisq': dt.dist_chisq,
        'chord': dt.dist_chord,
        'euclidean': dt.dist_euclidean,
        'gower': dt.dist_gower,
        'hellinger': dt.dist_hellinger,
        'kulczynski': dt.dist_kulczynski,
        'manhattan': dt.dist_manhattan,
        'abund_jaccard': dt.dist_abund_jaccard,
        'binary_jaccard': dt.binary_dist_jaccard,
        'pearson': dt.dist_pearson,
        'soergel': dt.dist_soergel,
        'spearman': dt.dist_spearman_approx,
    }
    metric_fn = dispatch.get(args.metric, dt.dist_bray_curtis)
    distance_matrix = metric_fn(observations)

    # Only one copy of each symmetric entry is kept (strict upper triangle).
    dist = {}
    for i, row in enumerate(distance_matrix):
        for n, d in enumerate(row):
            if i < n:
                dist[(dataset_ids[i], dataset_ids[n])] = d

    if args.to_output == 'distance':
        print(distance_matrix)
    return dist
def get_dist(metric, mtx):
    """Apply the named distance metric to ``mtx`` and squareform the result.

    metric: one of 'bray_curtis', 'morisita_horn', 'canberra', 'jaccard',
        'kulczynski'; anything else falls back to Bray-Curtis (same default
        as the original if/elif chain).
    mtx: matrix of observations accepted by the ``dt.dist_*`` functions.

    Returns the ``scipy.spatial.distance.squareform`` conversion of the
    metric's output (square matrix <-> condensed vector).
    """
    # Dispatch dict replaces the if/elif ladder; every branch passed
    # strict=False in the original, so it is applied uniformly here.
    metric_funcs = {
        'bray_curtis': dt.dist_bray_curtis,
        'morisita_horn': dt.dist_morisita_horn,
        'canberra': dt.dist_canberra,
        'jaccard': dt.binary_dist_jaccard,
        'kulczynski': dt.dist_kulczynski,
    }
    dtvar = metric_funcs.get(metric, dt.dist_bray_curtis)(mtx, strict=False)
    return distance.squareform(dtvar)
def get_dist(metric, mtx):
    """Compute a distance matrix for ``mtx`` under ``metric``.

    Recognized metrics: 'bray_curtis', 'morisita_horn', 'canberra',
    'jaccard', 'kulczynski'.  Any other value silently falls back to
    Bray-Curtis.  The metric result is passed through
    ``scipy.spatial.distance.squareform`` before being returned.
    """
    # Map metric names to their dt.* implementations; default = Bray-Curtis.
    chosen = {
        'bray_curtis': dt.dist_bray_curtis,
        'morisita_horn': dt.dist_morisita_horn,
        'canberra': dt.dist_canberra,
        'jaccard': dt.binary_dist_jaccard,
        'kulczynski': dt.dist_kulczynski,
    }.get(metric, dt.dist_bray_curtis)
    raw = chosen(mtx, strict=False)
    dist = distance.squareform(raw)
    return dist
def compute_procrustes(result_tables,
                       expected_pc_lookup,
                       taxonomy_level=6,
                       num_dimensions=3,
                       random_trials=999):
    """Compute Procrustes M^2 and Monte Carlo p-values for a set of results.

    result_tables: 2d list of tables to be compared to expected tables,
     where the data in the inner list is:
      [dataset_id, reference_database_id, method_id,
       parameter_combination_id, table_fp]
    expected_pc_lookup: 2d dict of dataset_id, reference_db_id to principal
     coordinate matrices, for the expected result coordinate matrices
    taxonomy_level: level to compute results
    num_dimensions: number of dimensions passed to the Procrustes analysis
    random_trials: number of Monte Carlo trials for the p-value

    Yields one tuple per input table:
    (dataset_id, reference_id, method_id, params, M^2, Monte Carlo p-value).
    """
    ### Start code copied ALMOST* directly from compute_prfs - some
    ### re-factoring for re-use is in order here.  *ALMOST refers to changes
    ### to parser and variable names since expected is a pc matrix here.
    for dataset_id, reference_id, method_id, params, actual_table_fp in result_tables:
        ## parse the expected table (unless taxonomy_level is specified, this
        ## should be collapsed on level 6 taxonomy)
        try:
            expected_pc_fp = expected_pc_lookup[dataset_id][reference_id]
        except KeyError:
            # The call form of raise is valid under both Python 2 and 3;
            # the original "raise KeyError, msg" is Python-2-only syntax.
            raise KeyError("Can't find expected table for (%s, %s)."
                           % (dataset_id, reference_id))

        ## parse the actual table and collapse it at the specified
        ## taxonomic level
        try:
            actual_table = parse_biom_table(open(actual_table_fp, "U"))
        except ValueError:
            raise ValueError("Couldn't parse BIOM table: %s"
                             % actual_table_fp)
        collapse_by_taxonomy = get_taxonomy_collapser(taxonomy_level)
        actual_table = actual_table.collapseObservationsByMetadata(
            collapse_by_taxonomy)
        ### End code copied directly from compute_prfs.

        # Next block of code, how do I hate thee? Let me count the ways...
        # (1) dist_bray_curtis doesn't take a BIOM Table object
        # (2) pcoa takes a qiime-formatted distance matrix as a list of lines
        # (3) pcoa returns a qiime-formatted pc matrix
        # (4) procrustes_monte_carlo needs to pass through the pc "file"
        #     multiple times, so we actually *need* the pcs that get passed
        #     in to be lists of lines
        dm = dist_bray_curtis(
            asarray([v for v in actual_table.iterSampleData()]))
        formatted_dm = format_distance_matrix(actual_table.SampleIds, dm)
        actual_pc = pcoa(formatted_dm.split("\n")).split("\n")
        expected_pc = list(open(expected_pc_fp, "U"))

        ## run Procrustes analysis with monte carlo simulation
        actual_m_squared, trial_m_squareds, count_better, mc_p_value = \
            procrustes_monte_carlo(expected_pc,
                                   actual_pc,
                                   trials=random_trials,
                                   max_dimensions=num_dimensions,
                                   sample_id_map=None,
                                   trial_output_dir=None)

        yield (dataset_id, reference_id, method_id, params,
               actual_m_squared, mc_p_value)
def calculate_distance(args):
    """Compute a full distance matrix, write it as CSV, and plot a dendrogram.

    Reads ``args.in_file`` (JSON; preferentially from ./tmp/), applies the
    metric named by ``args.metric`` to the transposed 'data' array, writes a
    CSV matrix (header + one labeled row per column) to ``args.out_file``,
    prints the nested distance dict as JSON, and saves a single-linkage
    dendrogram image to ``public/tmp_images/<args.prefix>.png``.
    """
    if args.file_format == 'json':
        # Prefer the working copy under ./tmp/; fall back to the raw path.
        try:
            json_data = open('./tmp/' + args.in_file)
        except IOError:
            json_data = open(args.in_file)
        except:
            # NOTE(review): bare except kept from the original — it exits on
            # any other failure; consider narrowing to OSError.
            print("NO FILE FOUND ERROR")
            sys.exit()
        data = json.load(json_data)
        json_data.close()
    else:  # csv file
        # NOTE(review): this branch is a non-functional stub ("this doesn't
        # work now" in the original) — it never populates `data`, so the
        # code below will raise NameError for CSV input.
        with open('./tmp/' + args.in_file, 'rb') as csvfile:
            csv_data = csv.reader(csvfile, delimiter=',', quotechar='"')
            for row in csv_data:
                pass

    datasets = [col['name'] for col in data['columns']]

    # Transpose so rows correspond to the named columns above.
    dm = np.transpose(np.array(data['data']))

    # Dispatch dict replaces the 16-branch if/elif chain; unknown metrics
    # default to Bray-Curtis, matching the original `else`.
    metric_funcs = {
        'bray_curtis': dt.dist_bray_curtis,
        'morisita_horn': dt.dist_morisita_horn,
        'canberra': dt.dist_canberra,
        'chisq': dt.dist_chisq,
        'chord': dt.dist_chord,
        'euclidean': dt.dist_euclidean,
        'gower': dt.dist_gower,
        'hellinger': dt.dist_hellinger,
        'kulczynski': dt.dist_kulczynski,
        'manhattan': dt.dist_manhattan,
        'abund_jaccard': dt.dist_abund_jaccard,
        'binary_jaccard': dt.binary_dist_jaccard,
        'pearson': dt.dist_pearson,
        'soergel': dt.dist_soergel,
        'spearman': dt.dist_spearman_approx,
    }
    dist = metric_funcs.get(args.metric, dt.dist_bray_curtis)(dm)

    # distance_matrix1: nested dict name -> name -> distance.
    # distance_matrix2: flat dict keyed by (name, name) pairs.
    distance_matrix1 = {}
    distance_matrix2 = {}
    # `with` replaces the original manual open/close of the output file.
    with open(args.out_file, 'w') as out_fp:
        out_fp.write(','.join(x['name'] for x in data['columns']) + '\n')
        for row, line in enumerate(data['columns']):
            name = line['name']
            distance_matrix1[name] = {}
            cells = [name]
            for col, d in enumerate(dist[row]):
                cells.append(str(dist[row][col]))
                other = data['columns'][col]['name']
                distance_matrix1[name][other] = dist[row][col]
                distance_matrix2[(name, other)] = dist[row][col]
            out_fp.write(','.join(cells) + '\n')

    print(json.dumps(distance_matrix1))

    # Build a plain 2-D list for scipy's linkage from the nested dict.
    # NOTE(review): iteration order of distance_matrix1 determines row/column
    # order here — stable on Python 3.7+ dicts, but worth confirming intent.
    arr = []
    for ds1 in distance_matrix1:
        print(ds1)
        arr.append([distance_matrix1[ds1][ds2]
                    for ds2 in distance_matrix1[ds1]])

    linkage_matrix = linkage(arr, "single")
    dendrogram(linkage_matrix, color_threshold=1, show_leaf_counts=True)
    image_file = 'public/tmp_images/' + args.prefix + '.png'
    plt.savefig(image_file)
# Single-argument print(...) calls behave identically under Python 2 and 3;
# the original Python-2-only `print "..."` statements are converted here.
print("Removing species with less than two occurrences...")
# Presence/absence copy of sp: 1 where a species is present, the original
# (zero) value where absent.
sp_io = np.where(~(sp > 0), sp, 1)
column_sums = np.sum(sp_io, 0)
to_remove = np.where(column_sums < 2)
sp = np.delete(sp, to_remove, 1)         # drop rare species (columns)
colnames = np.delete(colnames, to_remove)

print("Removing plots with less than two species...")
pl_io = np.where(~(sp > 0), sp, 1)
row_sums = np.sum(pl_io, 1)
to_remove = np.where(row_sums < 2)
sp = np.delete(sp, to_remove, 0)         # drop sparse plots (rows)
rownames = np.delete(rownames, to_remove)

print("Normalizing species coverage data with McCune logarithm...")
sp = log_mccune(sp)

from cogent.cluster.nmds import NMDS, metaNMDS
from cogent.maths.distance_transform import dist_bray_curtis

print("Calculating distance matrix...")
distmtx = dist_bray_curtis(sp)
nmds = NMDS(distmtx, dimension=3)
print(nmds.getPoints())
print(nmds.getStress())