def sample_matrix_to_runs(dist, reps=3):
    '''Expand a per-sample distance matrix so each sample appears `reps` times.

    Every row and column of `dist.data` is repeated `reps` times, and each
    repeated id is suffixed with a 1-based run index ('<id>-1', '<id>-2', ...).
    Returns a new DistanceMatrix; `dist` is not modified.
    '''
    expanded = np.repeat(dist.data, reps, axis=0)
    expanded = np.repeat(expanded, reps, axis=1)
    runs = DistanceMatrix(expanded)
    run_ids = []
    for group in dist.ids:
        for rep in range(1, reps + 1):
            run_ids.append('{}-{}'.format(group, rep))
    runs.ids = run_ids
    return runs
def get_truth(treefile, reps):
    '''Build the ground-truth run-level distance matrix for `treefile`.

    Computes the partition-weighted distance among groups, then expands it
    so each group contributes `reps` replicate runs, with ids suffixed
    '<group>-1' .. '<group>-reps'.
    '''
    dist = partition_weighted_distance(treefile)
    expanded = np.repeat(np.repeat(dist.data, reps, axis=0), reps, axis=1)
    runs = DistanceMatrix(expanded)
    runs.ids = ['{}-{}'.format(group, rep + 1)
                for group in dist.ids
                for rep in range(reps)]
    return runs
def go_distance(args):
    '''Load a JSON abundance table, compute inter-dataset distances, and
    cache them as JSON.

    Reads `args.in_file` (JSON with 'columns' and 'data'), drops zero-sum
    datasets, computes distances via get_dist/get_dist_matrix1, and writes
    a nested dict of distances to <basedir>/tmp/<prefix>_distance.json
    (only if that file does not already exist).

    Returns (dm1, edited_dataset_list) where dm1 is a scikit-bio
    DistanceMatrix, or None when args.create_splits is set.
    Exits the process if the input file cannot be read or parsed.
    '''
    try:
        # `with` guarantees the handle is closed even if json.load raises;
        # the original leaked the file object in that case.
        with open(args.in_file) as json_data:
            data = json.load(json_data)
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: malformed JSON.
        print("NO FILE FOUND ERROR")
        sys.exit()

    datasets = [col['id'] for col in data['columns']]

    z = np.array(data['data'])
    # find zero sum rows (datasets) after transpose
    (dmatrix, bad_rows) = remove_zero_sum_datasets(np.transpose(z))

    # Drop the ids of zero-sum datasets too, keeping order aligned with
    # the rows of dmatrix.
    edited_dataset_list = [line['id']
                           for row, line in enumerate(data['columns'])
                           if row not in bad_rows[0]]

    dist = get_dist(dmatrix, data)
    if args.create_splits:
        # we're done
        return

    dm1 = get_dist_matrix1(dist)
    dm2 = {}  # nested: dm2[row_id][col_id] -> distance (serialized below)
    dm3 = {}  # flat: dm3[(row_id, col_id)] -> distance
    for row, name in enumerate(edited_dataset_list):
        name = str(name)
        dm2[name] = {}
        for col, d in enumerate(dm1[row]):
            # NOTE(review): columns are looked up in the ORIGINAL
            # data['columns'] while rows use the edited list; if any
            # zero-sum dataset was dropped these indices may disagree —
            # confirm get_dist_matrix1 preserves the original column order.
            col_id = str(data['columns'][col]['id'])
            dm2[name][col_id] = dm1[row][col]
            dm3[(name, col_id)] = dm1[row][col]

    out_file_selected = os.path.join(args.basedir, 'tmp',
                                     args.prefix + '_distance.json')
    # Only write the cache file if it is not already present.
    if not os.path.exists(out_file_selected):
        with open(out_file_selected, 'w') as out_fp2:
            out_fp2.write(json.dumps(dm2))

    dm1 = DistanceMatrix(dm1)  # convert to scikit-bio DistanceMatrix (v 0.5.1)
    dm1.ids = edited_dataset_list  # assign row names
    return (dm1, edited_dataset_list)
def test_embad(self):
    '''embad(self.table) should yield the expected 4x4 distance matrix.'''
    exp = DistanceMatrix([[0, 2, 4, 6],
                          [2, 0, 2, 4],
                          [4, 2, 0, 2],
                          [6, 4, 2, 0]])
    exp.ids = ['S1', 'S2', 'S3', 'S4']
    res = embad(self.table)
    # assertEquals is a deprecated alias (removed in Python 3.12);
    # use the canonical assertEqual.
    self.assertEqual(exp, res)
def get_truth(treefile, reps):
    '''Return the expected run-level distances derived from `treefile`.

    The group-level partition-weighted distances are expanded so that each
    group id yields `reps` run ids of the form '<group>-<k>' (k starting
    at 1), with distances copied block-wise.
    '''
    dist = partition_weighted_distance(treefile)
    block = np.repeat(dist.data, reps, axis=1)
    block = np.repeat(block, reps, axis=0)
    runs = DistanceMatrix(block)
    labels = []
    for g in dist.ids:
        labels.extend('{}-{}'.format(g, k) for k in range(1, reps + 1))
    runs.ids = labels
    return runs
def sample_matrix_to_runs(dist, reps=3):
    '''Repeat a distance matrix block-wise so each sample becomes `reps` runs.

    Each id in `dist.ids` is expanded to '<id>-1' .. '<id>-reps' and the
    underlying matrix entries are duplicated accordingly.
    '''
    data = dist.data
    tiled = np.repeat(np.repeat(data, reps, axis=1), reps, axis=0)
    out = DistanceMatrix(tiled)
    out.ids = ['{}-{}'.format(sample, k)
               for sample in dist.ids
               for k in range(1, reps + 1)]
    return out
def go_distance(args):
    '''Load a JSON abundance table, compute inter-dataset distances, and
    write JSON + CSV outputs (overwriting any previous run).

    Reads `args.in_file` (JSON with 'columns' and 'data'), drops zero-sum
    datasets, computes distances via get_dist/get_dist_matrix1, then
    writes <basedir>/<prefix>_distance.json (overwritten each call,
    chmod 0o664 best-effort) and a CSV via write_csv_file.

    Returns (dm1, edited_dataset_list) where dm1 is a scikit-bio
    DistanceMatrix, or None when args.create_splits is set.
    Exits the process if the input file cannot be read or parsed.
    '''
    try:
        # `with` guarantees the handle is closed even if json.load raises;
        # the original leaked the file object in that case.
        with open(args.in_file) as json_data:
            data = json.load(json_data)
    except (OSError, ValueError):
        # OSError: missing/unreadable file; ValueError: malformed JSON.
        print("1-NO FILE FOUND ERROR")
        sys.exit()

    datasets = [col['id'] for col in data['columns']]

    z = np.array(data['data'])
    # find zero sum rows (datasets) after transpose
    (dmatrix, bad_rows) = remove_zero_sum_datasets(np.transpose(z))

    # Drop the ids of zero-sum datasets too, keeping order aligned with
    # the rows of dmatrix.
    edited_dataset_list = [line['id']
                           for row, line in enumerate(data['columns'])
                           if row not in bad_rows[0]]

    dist = get_dist(dmatrix, data)
    if args.create_splits:
        # we're done
        return

    dm1 = get_dist_matrix1(dist)
    dm2 = {}  # nested: dm2[row_id][col_id] -> distance (serialized below)
    dm3 = {}  # flat: dm3[(row_id, col_id)] -> distance
    for row, name in enumerate(edited_dataset_list):
        name = str(name)
        dm2[name] = {}
        for col, d in enumerate(dm1[row]):
            # NOTE(review): columns are looked up in the ORIGINAL
            # data['columns'] while rows use the edited list; if any
            # zero-sum dataset was dropped these indices may disagree —
            # confirm get_dist_matrix1 preserves the original column order.
            col_id = str(data['columns'][col]['id'])
            dm2[name][col_id] = dm1[row][col]
            dm3[(name, col_id)] = dm1[row][col]

    out_file_selected = os.path.join(args.basedir,
                                     args.prefix + '_distance.json')
    print(dm1)
    # must over write each time
    with open(out_file_selected, 'w') as out_fp2:
        out_fp2.write(json.dumps(dm2))
    try:
        os.chmod(out_file_selected, 0o664)
    except OSError:
        # best-effort: keep going if we can't change permissions
        pass

    write_csv_file(dm1, data, args)

    dm1 = DistanceMatrix(dm1)  # convert to scikit-bio DistanceMatrix (v 0.5.1)
    dm1.ids = edited_dataset_list  # assign row names
    return (dm1, edited_dataset_list)