Example #1
def test_GetResults():
    with open("data_testing/correctOneHotIn.pickle", "rb") as f:
        testInput = pickle.load(f)
    with open("data_testing/correctOneHotOut.pickle", "rb") as f:
        testCorrectOut = pickle.load(f)

    getOneHotOut = ld.get_onehot(testInput, None, num_classes=3, seq_len=20)

#    print(testInput)
#    print(testCorrectOut)

    for x, y in zip(testCorrectOut, getOneHotOut):
        assert np.equal(x, y).all()
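
The test above pins `get_onehot`'s output against pickled fixtures. For orientation, here is a minimal sketch of what a one-hot encoder with this signature might do; the alphabet, padding, and label handling are assumptions, not the actual `ld.get_onehot` implementation:

import numpy as np

def get_onehot_sketch(data, batch_size, num_classes=3, seq_len=20):
    # data: list of (sequence, label) pairs; batch_size=None means "use all"
    alphabet = 'ACGT'
    index = {c: i for i, c in enumerate(alphabet)}
    x = np.zeros((len(data), seq_len, len(alphabet)), dtype=np.float32)
    y = np.zeros((len(data), num_classes), dtype=np.float32)
    for n, (seq, label) in enumerate(data):
        # pad/truncate each sequence to seq_len, one bit per letter
        for t, c in enumerate(seq[:seq_len]):
            if c in index:
                x[n, t, index[c]] = 1.0
        y[n, int(label)] = 1.0
    return x, y
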
Example #2

single_dict = dict()
pair_dict = dict()
# collect two example sequences per class: the first one seen goes into
# single_dict, the second completes the pair
for x, y in data:
    if y in single_dict:
        pair_dict[y] = [single_dict[y], x]
    else:
        single_dict[y] = x
    if len(pair_dict) == num_classes:
        break

chosen_data = []
for i in range(2):
    for y in pair_dict:
        x = pair_dict[y][i]
        # print(len(x))
        chosen_data.append((x, y))

x, y, m = get_onehot(chosen_data,
                     None,
                     is_dna_data=is_dna_data,
                     seq_len=seq_len,
                     mask_len=mask_len if mask else None)
embed = embed_model.predict([x, m] if mask else x)

pos_counts = dict()
correct_counts = dict()
for n in top_n:
    pos_counts[n] = [0] * n
    correct_counts[n] = 0.0

for i in range(num_classes):
    distances = dict()
    ex = embed[i + num_classes]
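
The example is cut off here. A plausible continuation given the bookkeeping set up above (hypothetical, not the source's code) ranks classes by embedding distance and tallies where the true class lands:

    # continuation of the `for i in range(num_classes)` loop above
    for j in range(num_classes):
        distances[j] = np.linalg.norm(ex - embed[j])
    ranked = sorted(distances, key=distances.get)
    rank = ranked.index(i)  # position of the true class in the ranking
    for n in top_n:
        if rank < n:
            correct_counts[n] += 1
            pos_counts[n][rank] += 1
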
Example #3
model_name = 'blstm_dna_conv3_4500'
data_file = '/mnt/data/computervision/dna_train80_val10_test10/test.csv'
#data_file = '/mnt/data/computervision/dna_train80_val10_test10/unknowns.csv'
data_divide = 4
dist_min = 0
dist_max = 20

model_file = '../models/' + model_name + '.h5'
model = load_model(model_file)
av_model = Model(inputs=model.input, outputs=model.get_layer("AV").output)
av_model.summary()

data = load_csv(data_file, divide=data_divide)
print(len(data))
x, y = get_onehot(data,
                  None,
                  is_dna_data=is_dna_data,
                  seq_len=4500 if is_dna_data else 1500)
avs = av_model.predict(x, batch_size=500)

print('done getting avs')
del data, x, y

means = []
with open('../results/' + model_name + '_mean_activations.csv', 'r') as infile:
    r = csv.reader(infile)
    for row in r:
        means.append(np.array(row, dtype=np.float32))

dists = []
with open('../results/' + model_name + '_mav_distances.csv', 'r') as infile:
    r = csv.reader(infile)
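
The snippet is truncated while reading the distance file. For context, a minimal sketch of one way to measure the distance between an activation vector and a class's mean activation vector (Euclidean here; the source may use a different metric):

import numpy as np

def mav_distance(av, mean_av):
    # distance from one activation vector (AV) to a class's mean AV
    return float(np.linalg.norm(av - mean_av))
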
Example #4
num_letters = 4 if is_dna_data else 26


model = model_template(num_classes, num_letters, sequence_length, embed_size=256, mask_length=mask_len if mask else None)

model.load_weights(model_file)
model.summary()

test_data = load_csv(data_dir + '/test.csv', divide=2 if is_dna_data else 1)
print(len(test_data))

crop_count = 0.0
for seq, y in test_data:
	if len(seq) > sequence_length:
		crop_count += 1
print "percent cropped: ", crop_count / len(test_data)	

test_x, test_y, test_m = get_onehot(test_data, None, is_dna_data=is_dna_data, seq_len=sequence_length, num_classes=num_classes, rand_start=random_crop, mask_len=mask_len if mask else None)
if print_acc:
	print "test accuracy: ", model.evaluate([test_x, test_m] if mask else test_x, test_y, batch_size=100)

if save_stats:
	pred = model.predict([test_x, test_m] if mask else test_x, batch_size=100).argmax(axis=-1)
	log = Logger(model_name, num_classes, sequence_length)
	log.confusion_matrix(test_data, pred)
	log.length_stats(test_data, pred)
	log.length_histograms(test_data, pred)
	log.save()


Example #5
results = []

for percent in range(2,22,2):
	# mode 0: substitute, mode 1: 3-aligned cut, mode 2: unaligned cut (hedged sketches of these helpers follow this example)
	row = [percent]
	for mode in range(3):
		test_data = load_csv(data_dir + '/test.csv', divide=2)
		print(len(test_data))
	
		for i in range(len(test_data)):
			(x, y) = test_data[i]
			if mode == 0:
				test_data[i] = (substitute(x, percent), y)
			else:
				test_data[i] = (delete_segment(x, percent, mode == 1), y)
			#if i % 100000 == 99999:
			#	print(i + 1)

		test_x, test_y, test_m = get_onehot(test_data, None, is_dna_data=True, seq_len=sequence_length, num_classes=num_classes, mask_len=mask_len)
	
		acc = model.evaluate([test_x, test_m], test_y, batch_size=100, verbose=1)[1]
		print(percent, mode, acc)

		row.append(acc)
		del test_data, test_x, test_y, test_m
	results.append(row)
	with open('../results/'+model_name+'_mutation_graphs.csv', 'w') as outfile:
		w = csv.writer(outfile)
		for row in results:
			w.writerow(row)
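
Neither mutation helper is shown in the listing. Hedged sketches of what `substitute` and `delete_segment` plausibly do, given the mode comment above (the codon alignment and random-base choice are assumptions):

import random

def substitute(seq, percent):
    # replace roughly `percent` percent of positions with random bases
    seq = list(seq)
    n = int(len(seq) * percent / 100)
    for pos in random.sample(range(len(seq)), n):
        seq[pos] = random.choice('ACGT')
    return ''.join(seq)

def delete_segment(seq, percent, aligned):
    # cut out a contiguous `percent` percent slice, optionally starting
    # on a multiple-of-3 (codon) boundary
    n = int(len(seq) * percent / 100)
    start = random.randint(0, len(seq) - n)
    if aligned:
        start -= start % 3
    return seq[:start] + seq[start + n:]
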
Example #6

        i += 1
    print(i)

N = len(sequence_dict)
print('done loading', N)

for i in range(N):
    print(len(sequence_dict[i]))
    filename = '/mnt/data/computervision/tara/embed64/' + rev_label_dict[i] + '.npy'
    if os.path.exists(filename):
        embed_dict[i] = np.load(filename)
    else:
        x, y, m = get_onehot(sequence_dict[i],
                             None,
                             is_dna_data=is_dna_data,
                             seq_len=seq_len,
                             mask_len=mask_len)
        embed = embed_model.predict([x, m], batch_size=100, verbose=1)
        embed_dict[i] = embed
        del x, y, m
        np.save(filename, embed)
    del sequence_dict[i]
    print('embedded', i, rev_label_dict[i])

    #embed_dict[i] = embed_dict[i][0:1000]

del sequence_dict, model, embed_model
result = []

tree_dict = dict()
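
The snippet ends as tree_dict is initialized. One plausible (hypothetical) continuation builds a nearest-neighbor index per class over the cached embeddings, e.g. with scikit-learn's BallTree:

from sklearn.neighbors import BallTree

# hypothetical: one BallTree per class for fast nearest-neighbor queries
for i in embed_dict:
    tree_dict[i] = BallTree(embed_dict[i])
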
Example #7
# Read the first two columns of the input CSV file into a list of tuples
# called train_data; each row's second-column item becomes the first item
# in its tuple. (A hedged sketch of load_csv follows this example.)
train_data = load_csv(data_dir + '/train.csv')
print(len(train_data))
#val_data = load_csv(data_dir + '/validation.csv', divide=2 if is_dna_data else 1)
#val_x, val_y = get_onehot(val_data, None, num_classes=num_classes, seq_len=sequence_length, is_dna_data=is_dna_data)
#print(len(val_data))

num_episodes = 50000  # 200000
# Each iteration currently takes about 6 secs, so cutting num_episodes
# way down to be able to get to end of process in reasonable time.
num_episodes = 5

for i in range(num_episodes):
    x, y, m = get_onehot(train_data, 100, num_classes=num_classes, seq_len=sequence_length, is_dna_data=is_dna_data, mask_len=mask_len if mask else None)
    print(i)
    print(model.train_on_batch([x, m] if mask else x, y))
    if (i % 10000 == 0) or i == num_episodes - 1:

        # [loss, acc] = model.evaluate(val_x, val_y, batch_size=100)
        # print(loss, acc)
        # logger.record_val_acc(i, acc)

        model.save(save_path)
        print('saved to ' + save_path)
del train_data

#pred = model.predict(val_x, batch_size=100).argmax(axis=-1)
#logger.confusion_matrix(val_data, pred)
#logger.length_plot(val_data, pred)
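
For reference, a hedged sketch of `load_csv` matching the comment at the top of this example (the column order and the meaning of `divide` as a subsampling factor are assumptions, not the library's confirmed behavior):

import csv

def load_csv_sketch(path, divide=1):
    # return (second_column, first_column) tuples, keeping every
    # `divide`-th row of the file
    data = []
    with open(path) as infile:
        for i, row in enumerate(csv.reader(infile)):
            if i % divide == 0:
                data.append((row[1], row[0]))
    return data
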
Example #8
model = load_model(model_file)
av_model = Model(inputs=model.input, outputs=model.get_layer("AV").output)
av_model.summary()

train_data = load_csv(data_dir + '/train.csv')

batch_size = 10000
avs = []
actual = []
lower = 0
while lower < len(train_data):
    print(lower)
    upper = min(lower + batch_size, len(train_data))
    x, y = get_onehot(train_data[lower:upper],
                      None,
                      is_dna_data=is_dna_data,
                      seq_len=seq_len)
    pred = av_model.predict(x, batch_size=500)
    avs.append(pred)
    actual.append(y)
    lower += batch_size

del train_data

sums = np.zeros((num_classes, num_classes), np.float32)
counts = np.zeros((num_classes), np.float32)
class_avs = []
for i in range(num_classes):
    class_avs.append([])

for i in range(len(avs)):
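
The listing cuts off at this loop. A plausible (hypothetical) continuation accumulates each example's activation vector under its true class, then averages into per-class mean activation vectors:

    # continuation of the loop above; actual[i] holds one-hot labels
    for av, y_onehot in zip(avs[i], actual[i]):
        label = int(np.argmax(y_onehot))
        sums[label] += av
        counts[label] += 1
        class_avs[label].append(av)

mean_avs = sums / counts[:, None]  # one mean activation vector per class
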
Example #9
num_amino_acids = 26

model = Sequential()
model.add(Masking(mask_value=0, input_shape=(1500, num_amino_acids)))
model.add(LSTM(50, activation='tanh'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer=Adam(lr=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

data_dir = '/mnt/data/computervision/train80_val10_test10'
train_data = load_csv(data_dir + '/train.csv')
print(len(train_data))
val_data = load_csv(data_dir + '/validation.csv')
val_x, val_y = get_onehot(val_data, None)
print(len(val_data))

logger = Logger('lstm50')

save_path = '../models/lstm50.h5'

num_episodes = 20000
for i in range(num_episodes):
    x, y = get_onehot(train_data, 1000)
    print(i)
    print(model.train_on_batch(x, y))
    if (i % 1000 == 0) or i == num_episodes - 1:

        [loss, acc] = model.evaluate(val_x, val_y, batch_size=1000)
        print(loss, acc)
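
The listing is truncated here. By analogy with the training loop in Example #7, a plausible (hypothetical) tail would record the validation accuracy and checkpoint the model:

        # hypothetical continuation, mirroring the save pattern in Example #7
        logger.record_val_acc(i, acc)
        model.save(save_path)
        print('saved to ' + save_path)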