Example 1
0
def sample_matrix_to_runs(dist, reps=3):
    """Expand a distance matrix so each sample appears `reps` times.

    Every entry of ``dist.data`` is tiled into a ``reps x reps`` block,
    and each id is suffixed with a 1-based replicate index (e.g. 'A-1').
    """
    tiled = np.repeat(np.repeat(dist.data, reps, axis=0), reps, axis=1)
    runs = DistanceMatrix(tiled)
    labels = []
    for group in dist.ids:
        for rep in range(reps):
            labels.append('{}-{}'.format(group, rep + 1))
    runs.ids = labels
    return runs
Example 2
0
def get_truth(treefile, reps):
    """Build the ground-truth distance matrix with `reps` replicates per sample."""
    base = partition_weighted_distance(treefile)
    # Tile every distance into a reps x reps block (one per replicate pair).
    expanded = np.repeat(np.repeat(base.data, reps, axis=0), reps, axis=1)
    runs = DistanceMatrix(expanded)
    runs.ids = [
        '{}-{}'.format(sample, rep + 1)
        for sample in base.ids
        for rep in range(reps)
    ]
    return runs
def go_distance(args):
    """Load a JSON data table, drop zero-sum datasets, and build a
    pairwise distance matrix.

    Returns a tuple ``(dm1, edited_dataset_list)`` where ``dm1`` is a
    scikit-bio DistanceMatrix and ``edited_dataset_list`` holds the ids
    of the datasets that survived the zero-sum filter.  Returns ``None``
    early when ``args.create_splits`` is set.  Exits the process if the
    input file cannot be read or parsed.
    """
    try:
        # Context manager guarantees the handle is closed even on error;
        # catch only read/parse failures instead of a bare except.
        with open(args.in_file) as json_data:
            data = json.load(json_data)
    except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
        print("NO FILE FOUND ERROR")
        sys.exit()

    # Transpose so datasets become rows, then drop rows summing to zero.
    (dmatrix, bad_rows) = remove_zero_sum_datasets(
        np.transpose(np.array(data['data'])))

    # Ids of the datasets that survived the zero-sum filter.
    edited_dataset_list = [
        line['id']
        for row, line in enumerate(data['columns'])
        if row not in bad_rows[0]
    ]

    dist = get_dist(dmatrix, data)
    if args.create_splits:
        # Split creation was the only goal; nothing more to compute.
        return

    dm1 = get_dist_matrix1(dist)

    # Nested dict form of the matrix, keyed by dataset id, for JSON export.
    # NOTE(review): column ids come from the unfiltered data['columns'];
    # confirm this is intended when zero-sum rows were removed.
    # (The unused `datasets` list and `dm3` dict from the original were
    # dropped — neither was read anywhere.)
    dm2 = {}
    for row, name in enumerate(edited_dataset_list):
        name = str(name)
        dm2[name] = {}
        for col in range(len(dm1[row])):
            dm2[name][str(data['columns'][col]['id'])] = dm1[row][col]

    out_file_selected = os.path.join(args.basedir, 'tmp',
                                     args.prefix + '_distance.json')

    # Only write once; an existing export is left untouched.
    if not os.path.exists(out_file_selected):
        with open(out_file_selected, 'w') as out_fp2:
            out_fp2.write(json.dumps(dm2))

    dm1 = DistanceMatrix(dm1)  # convert to scikit-bio DistanceMatrix (v 0.5.1)
    dm1.ids = edited_dataset_list  # assign row names
    return (dm1, edited_dataset_list)
 def test_embad(self):
     """embad on the fixture table should reproduce the expected 4x4 matrix."""
     exp = DistanceMatrix([[0, 2, 4, 6], [2, 0, 2, 4], [4, 2, 0, 2],
                           [6, 4, 2, 0]])
     exp.ids = ['S1', 'S2', 'S3', 'S4']
     res = embad(self.table)
     # assertEquals is a deprecated alias (removed in Python 3.12);
     # use the canonical assertEqual.
     self.assertEqual(exp, res)
Example 5
0
def get_truth(treefile, reps):
    """Return the expected distance matrix with `reps` replicates per sample."""
    dist = partition_weighted_distance(treefile)
    # Repeat along both axes so each distance covers a reps x reps block.
    block = dist.data
    for ax in (0, 1):
        block = np.repeat(block, reps, axis=ax)
    runs = DistanceMatrix(block)
    names = []
    for g in dist.ids:
        names.extend('{}-{}'.format(g, i + 1) for i in range(reps))
    runs.ids = names
    return runs
Example 6
0
def sample_matrix_to_runs(dist, reps=3):
    """Repeat a distance matrix so every sample expands into `reps` runs."""
    # First widen columns, then rows; order does not affect the result.
    widened = np.repeat(dist.data, reps, axis=1)
    widened = np.repeat(widened, reps, axis=0)
    runs = DistanceMatrix(widened)
    runs.ids = ['{}-{}'.format(sample_id, rep + 1)
                for sample_id in dist.ids
                for rep in range(reps)]
    return runs
def go_distance(args):
    """Load a JSON data table, drop zero-sum datasets, compute a pairwise
    distance matrix, and write JSON and CSV exports.

    Returns a tuple ``(dm1, edited_dataset_list)`` where ``dm1`` is a
    scikit-bio DistanceMatrix and ``edited_dataset_list`` holds the ids
    of the datasets that survived the zero-sum filter.  Returns ``None``
    early when ``args.create_splits`` is set.  Exits the process if the
    input file cannot be read or parsed.
    """
    try:
        # Context manager closes the handle even on error; catch only
        # read/parse failures instead of a bare except.
        with open(args.in_file) as json_data:
            data = json.load(json_data)
    except (OSError, ValueError):  # ValueError covers json.JSONDecodeError
        print("1-NO FILE FOUND ERROR")
        sys.exit()

    # Transpose so datasets become rows, then drop rows summing to zero.
    (dmatrix, bad_rows) = remove_zero_sum_datasets(
        np.transpose(np.array(data['data'])))

    # Ids of the datasets that survived the zero-sum filter.
    edited_dataset_list = [
        line['id']
        for row, line in enumerate(data['columns'])
        if row not in bad_rows[0]
    ]

    dist = get_dist(dmatrix, data)

    if args.create_splits:
        # Split creation was the only goal; nothing more to compute.
        return

    dm1 = get_dist_matrix1(dist)

    # Nested dict form of the matrix, keyed by dataset id, for JSON export.
    # NOTE(review): column ids come from the unfiltered data['columns'];
    # confirm this is intended when zero-sum rows were removed.
    # (The unused `datasets` list, `dm3` dict, and `out_file_csv` path
    # from the original were dropped — none was read anywhere.)
    dm2 = {}
    for row, name in enumerate(edited_dataset_list):
        name = str(name)
        dm2[name] = {}
        for col in range(len(dm1[row])):
            dm2[name][str(data['columns'][col]['id'])] = dm1[row][col]

    out_file_selected = os.path.join(args.basedir,
                                     args.prefix + '_distance.json')
    print(dm1)

    # Must overwrite each time.
    with open(out_file_selected, 'w') as out_fp2:
        out_fp2.write(json.dumps(dm2))
    try:
        os.chmod(out_file_selected, 0o664)
    except OSError:
        # Best effort: insufficient ownership/permissions is not fatal.
        pass
    write_csv_file(dm1, data, args)

    dm1 = DistanceMatrix(dm1)  # convert to scikit-bio DistanceMatrix (v 0.5.1)
    dm1.ids = edited_dataset_list  # assign row names
    return (dm1, edited_dataset_list)