import collections
import csv

import numpy as np

from DataOperations import FastaIO
# DataSet and binarize_pair are assumed to be defined elsewhere in this project.


def read_similarity_graph(similarity_graph_file, fasta_file):
    # Load the reads and the candidate pairs produced by the LSH step.
    my_dict = FastaIO.read_fasta_file_as_dict(fasta_file)
    with open(similarity_graph_file, 'r') as csvfile:
        reader = list(csv.reader(csvfile, delimiter=','))
    X = []
    Y = []
    for candidate_pair in reader:
        read1 = my_dict[candidate_pair[0]].seq
        read2 = my_dict[candidate_pair[1]].seq
        # Pad short reads with 'A' and truncate long ones to a fixed length of 400.
        if len(read1) < 400:
            read1 += 'A' * (400 - len(read1))
        if len(read2) < 400:
            read2 += 'A' * (400 - len(read2))
        read1 = read1[0:400]
        read2 = read2[0:400]
        pair = binarize_pair(read1, read2)
        X.append(pair)
        Y.append(1)  # every candidate pair is provisionally labeled positive
    all_pairs = np.array(X)
    all_labels = np.array(Y)
    # add a trailing channel dimension for the CNN
    all_pairs = all_pairs.reshape(
        all_pairs.shape[0], all_pairs.shape[1], all_pairs.shape[2], 1)
    test = DataSet(all_pairs, all_labels)
    ds = collections.namedtuple('Datasets', ['test'])
    return ds(test=test)
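# --- Hedged sketch (not in the original source): a plausible binarize_pair. ---
# read_similarity_graph above relies on binarize_pair(read1, read2); the CNN
# scripts later reshape each pair to [8, 400, 1], which suggests the two
# 400-base reads are one-hot encoded over {A, C, G, T} and stacked into an
# 8 x 400 matrix. The helper below is a minimal sketch under that assumption,
# not the project's actual implementation.
import numpy as np


def binarize_pair(read1, read2):
    channels = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    pair = np.zeros((8, 400), dtype=np.float32)
    for j, base in enumerate(str(read1)):
        if base in channels:
            pair[channels[base], j] = 1.0  # rows 0-3: first read
    for j, base in enumerate(str(read2)):
        if base in channels:
            pair[4 + channels[base], j] = 1.0  # rows 4-7: second read
    return pair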
import random
import timeit

import networkx as nx

from DataOperations import FastaIO
import KGrams  # project-local k-mer utilities (import path assumed)


def getMinHashFunctions(reads, k, r, b):
    """Return the MinHash signature matrix and write the LSH candidate graph.
    reads: list of reads
    k: length of each k-mer
    r: number of rows (hash functions) per band
    b: number of bands
    n = b * r is the total number of hash functions.
    """
    n = b * r
    start = timeit.default_timer()
    # a prime larger than the largest 32-bit integer; k-mers are hashed to
    # 32-bit integers, so h(x) = (a*x + c) % p approximates a random permutation
    p = 4294967311
    # get random k-grams representing the rows of the similarity matrix
    # k-grams are hashed to 32-bit integers for convenience.
    # all_k_grams = KGrams.generate_all_k_grams(k, ['A', 'C', 'G', 'T'])
    # random_grams = KGrams.get_random_k_grams(all_k_grams, r)
    read_id_list = []
    # create the k-mer dictionary for the reads
    k_gram_dictionary = {}
    for read in reads:
        read_id_list.append(read.id)
        k_gram_dictionary[read.id] = KGrams.find_k_grams(read.seq, k)
    now = timeit.default_timer()
    print("k-mers representation is ready.", now - start)
    """
    # debug: exact Jaccard similarity of every read pair
    for read_id1 in k_gram_dictionary:
        for read_id2 in k_gram_dictionary:
            if read_id1 != read_id2:
                print(len(set(k_gram_dictionary[read_id1]).intersection(
                    set(k_gram_dictionary[read_id2]))) /
                      len(set(k_gram_dictionary[read_id1]).union(
                          set(k_gram_dictionary[read_id2]))))
    """
    # get hash functions
    hash_functions = createHashFunctions(n, p)
    # Signature dictionary: conceptually keyed by the pair (S, i), where S is
    # the read and i the i-th hash function; signature_dictionary[i][S] = h_i(S).
    signature_dictionary = {}
    for i in range(n):
        signature_dictionary[i] = {}
    """
    # First Sig(S, i) must be initialized to +inf
    for key in k_gram_dictionary:
        for hash_function_index in range(n):
            signature_dictionary[hash_function_index][key] = 9999999999
    """
    # Algorithm description:
    """
    For each chosen k_gram (row of the matrix):
        1) compute h1(row), h2(row), ...
        2) for each string:
            if the k_gram exists in the string:
                sig(S, i) = min(sig(S, i), h_i(row))
    """
    hash_index = -1
    # NOTE: the loop variables were originally (a, b), which shadowed the band
    # count b and forced a `b = n // r` fix-up before writing the output file;
    # renaming the second coefficient to c avoids that bug.
    for a, c in hash_functions:
        hash_index += 1
        # for each hash function compute h_i(S) for all strings
        for key in k_gram_dictionary:  # each string
            minHash = 9999999999  # sentinel larger than any value mod p
            for item in k_gram_dictionary[key]:  # each shingle in the string
                current_number_hash_value = (a * item + c) % p
                if current_number_hash_value < minHash:
                    minHash = current_number_hash_value
            signature_dictionary[hash_index][key] = minHash
    now2 = timeit.default_timer()
    print("minHash signature matrix is ready.", now2 - now)
    LSH_dict = {}
    for read_id in read_id_list:
        LSH_dict[read_id] = []
    band_dictionary = {}
    for read_id in read_id_list:
        integer_list = []  # collects the r signature values of the current band
        hash_function_counter = 0
        band_counter = 0
        for hash_function_index in signature_dictionary:
            integer_list.append(
                signature_dictionary[hash_function_index][read_id])
            hash_function_counter += 1
            if hash_function_counter == r:
                hashedValue = bandHashFunction(integer_list)
                LSH_dict[read_id].append(hashedValue)
                # add the read to the bucket corresponding to this band
                band_counter += 1
                band_dictionary.setdefault((band_counter, hashedValue),
                                           []).append(read_id)
                hash_function_counter = 0
                integer_list = []
    now3 = timeit.default_timer()
    print("LSH array is ready.", now3 - now2)
    # Create the candidate graph: reads that share a bucket in any band
    # become connected (a full clique per bucket).
    G = nx.Graph()
    for read_id in read_id_list:
        G.add_node(read_id)
    for key in band_dictionary:
        if len(band_dictionary[key]) > 1:
            for i in range(1, len(band_dictionary[key])):
                for j in range(i):
                    G.add_edge(band_dictionary[key][j], band_dictionary[key][i])
    now4 = timeit.default_timer()
    print("Graph is created.", now4 - now3)
    print(len(G.edges()))
    FastaIO.write_graph_to_csv(
        'wholeGraphK' + str(k) + 'R' + str(r) + 'B' + str(b) + '.csv', G)
    return LSH_dict
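# --- Hedged sketch (not in the original source): one plausible bandHashFunction. ---
# getMinHashFunctions above calls bandHashFunction(integer_list) to map the r
# signature values of a band to a bucket id. Any deterministic hash of the
# tuple works; Python's built-in tuple hash is a minimal stand-in (for tuples
# of ints it is stable across runs, since hash randomization only affects
# str and bytes).
def bandHashFunction(integer_list):
    return hash(tuple(integer_list))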
def createHashFunctions(n, p):
    """
    :param n: number of hash functions
    :param p: length of the permutation (modulus)
    :return: Each hash function is represented by a pair (a, b),
             meaning h(c) = (a * c + b) % p
    """
    # coefficients are drawn from [1, 999] even when the caller's modulus is larger
    if p > 1000:
        p = 1000
    hash_function_pairs = set()
    while len(hash_function_pairs) < n:
        hash_function_pairs.add(
            (random.randint(1, p - 1), random.randint(1, p - 1)))
    return list(hash_function_pairs)


dataset = FastaIO.read_fasta_file('../files/reads50.fasta')
getMinHashFunctions(dataset, 9, 3, 8)

"""
k = 8
k_gram_dictionary = {}
n = 1
for record in dataset:
    k_gram_dictionary[record.id] = KGrams.find_k_grams(record.seq, k)
    if n >= 100:
        break
    n += 1
createRepresentationMatrix(k, 29, k_gram_dictionary)
"""
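# --- Hedged sketch (not in the original source): a plausible KGrams.find_k_grams. ---
# The scripts above assume KGrams.find_k_grams(seq, k) returns the read's
# k-mers hashed to 32-bit integers (getMinHashFunctions treats each item as an
# integer). A minimal stand-in under that assumption:
import zlib


def find_k_grams(sequence, k):
    sequence = str(sequence)
    grams = set()
    for i in range(len(sequence) - k + 1):
        # crc32 maps each k-mer to an unsigned 32-bit integer
        grams.add(zlib.crc32(sequence[i:i + k].encode()))
    return list(grams)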
import csv

import community
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import networkx
import numpy as np
import scipy.io

from collections import Counter
from DataOperations import FastaIO
from DataOperations import graphOperations

dataset = FastaIO.read_fasta_file('../files/isoseq_flnc1.fasta')
dataset_dict = FastaIO.read_fasta_file_as_dict('../files/isoseq_flnc1.fasta')
id_dict = {}
id_list = []
G = networkx.Graph()
# sanity check: print one known read from the IsoSeq dataset
print(dataset_dict[
    'm150803_002149_42161_c100745121910000001823165807071563_s1_p0/14/1140_57_CCS']
    .seq)
ind = 0
for item in dataset:
    id_dict[item.id] = ind
    ind += 1
    id_list.append(item.id)
    G.add_node(item.id)
print(id_list)
my_dict = []
"""
mat = scipy.io.loadmat('Data50.mat')
t1 = timeit.default_timer()
print(t1 - t0)
# Create the candidate graph from the per-read band values.
G = nx.Graph()
for key in LSH_dict:
    G.add_node(key)
    band_values = LSH_dict[key]
    for i in range(len(band_values)):
        bands_dict[(i + 1, band_values[i])].append(key)
# Instead of connecting every pair in a bucket (quadratic in the bucket size),
# connect each read only to the reads at offsets 1..t within the bucket,
# bounding how many edges a single large bucket can contribute.
number_of_permutations = 10
for key in bands_dict:
    L = len(bands_dict[key])
    t = 1
    while L > t and t < number_of_permutations:
        for i in range(L - t):
            G.add_edge(bands_dict[key][i], bands_dict[key][i + t])
        t += 1
t2 = timeit.default_timer()
print(t2 - t1)
FastaIO.write_graph_to_csv(
    'MG1MK' + str(k_parameter) + 'R' + str(r) + 'B' + str(b) + 'P' +
    str(number_of_permutations) + '.csv', G)
t3 = timeit.default_timer()
print(t3 - t2)
print(len(G.edges()))
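# --- Hedged note (not in the original source): how r and b shape recall. ---
# With r rows per band and b bands, two reads with Jaccard similarity s share
# at least one bucket with probability 1 - (1 - s^r)^b, the standard LSH
# S-curve. The helper below just evaluates that expression, e.g. to compare
# the (r=1, b=10) and (r=3, b=8) settings used in these scripts.
def candidate_probability(s, r, b):
    return 1.0 - (1.0 - s ** r) ** b

# e.g. candidate_probability(0.8, 3, 8) ~ 0.997, candidate_probability(0.2, 3, 8) ~ 0.06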
import csv
import timeit

import community
import matplotlib.pyplot as plt
import networkx

from collections import Counter
from DataOperations import FastaIO
from DataOperations import graphOperations

dataset = FastaIO.read_fasta_file('reads.fasta')
id_dict = {}
id_list = []
G = networkx.Graph()
ind = 0
for item in dataset:
    id_dict[item.id] = ind
    ind += 1
    id_list.append(item.id)
    G.add_node(item.id)
my_dict = []
"""
with open('groundTruth1M.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    cluster_ids = list(reader)
ground_truth_cluster_dict = {}
ground_truth_predicted_dict = {}
import csv
import timeit

import community
import matplotlib.pyplot as plt
import networkx

from collections import Counter
from DataOperations import FastaIO
from DataOperations import graphOperations

print("sal")
dataset = FastaIO.read_fasta_file('unbalanced700K.fasta')
id_dict = {}
id_list = []
G = networkx.Graph()
print("hi")
ind = 0
for item in dataset:
    id_dict[item.id] = ind
    ind += 1
    id_list.append(item.id)
    G.add_node(item.id)
my_dict = []
"""
mat = scipy.io.loadmat('Data50.mat')
cluster_ids = mat['Id']
"""
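# --- Hedged sketch (not in the original source): clustering the candidate graph. ---
# These scripts import the python-louvain package (imported as `community`)
# alongside the LSH graph; its standard use is Louvain modularity optimization
# over the graph built above. A minimal sketch of that step (how the result is
# compared against groundTruth1M.csv is not shown in these excerpts):
partition = community.best_partition(G)  # dict: read id -> community id
print("communities found:", len(set(partition.values())))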
W_fc2 = tf.get_variable("W4", shape=[50, 1])
b_fc2 = tf.get_variable("b4", shape=[1])

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "../model/CNNModel.ckpt")
    print("Model restored.")
    # Check the values of the variables
    data_generator = FastaIO.read_next_batch('MinGraphK15R1B10P10.csv',
                                             '../files/reads400.fasta', 5000)
    label_array = []
    features, batch_num, batch_size = next(data_generator)
    print(features.shape)
    for i in range(batch_num + 1):
        first = timeit.default_timer()
        # NOTE: creating placeholders and reshape ops inside the loop grows
        # the TF1 graph on every iteration; kept as in the original.
        r1 = tf.placeholder(tf.float32, [None, 3200])
        # 3200 = 8 x 400: two one-hot encoded reads stacked row-wise
        features = np.reshape(features, [batch_size, 3200])
        reshaped_data = tf.reshape(r1, [-1, 8, 400, 1])
        h_conv1 = tf.nn.relu(conv2d(reshaped_data, W_conv1) + b_conv1)
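# --- Hedged sketch (not in the original source): the conv2d helper. ---
# Both restore scripts call conv2d(reshaped_data, W_conv1), which is not
# defined in these excerpts; the conventional TF1 definition is:
import tensorflow as tf


def conv2d(x, W):
    # stride 1 in every dimension, zero-padded so the spatial size is kept
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')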
W_fc2 = tf.get_variable("W4", shape=[50, 1])
b_fc2 = tf.get_variable("b4", shape=[1])

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "../model/CNNModel.ckpt")
    print("Model restored.")
    # Check the values of the variables
    t1 = timeit.default_timer()
    data_generator = FastaIO.read_next_batch2('G.csv', 'trimmed.fasta', 5000)
    # data_generator = FastaIO.read_next_batch('G8.csv', 'unbalanced2M.fasta', 5000)
    label_array = []
    features, batch_num, batch_size = next(data_generator)
    t_end = timeit.default_timer()
    print("One hot encode time: ", t_end - t1)
    print(features.shape)
    first = last = 0
    for i in range(batch_num + 1):
        if i % 10 == 0:
            first = timeit.default_timer()
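# --- Hedged sketch (not in the original source): a plausible read_next_batch. ---
# Both restore scripts unpack (features, batch_num, batch_size) from the
# generator, so it appears to stream the candidate-pair CSV in fixed-size
# chunks, one-hot encoding each pair (cf. the binarize_pair sketch above).
# A minimal sketch under that assumption:
import csv

import numpy as np


def read_next_batch(graph_csv, fasta_file, batch_size):
    my_dict = FastaIO.read_fasta_file_as_dict(fasta_file)
    with open(graph_csv, 'r') as f:
        pairs = list(csv.reader(f, delimiter=','))
    batch_num = len(pairs) // batch_size
    for start in range(0, len(pairs), batch_size):
        chunk = pairs[start:start + batch_size]
        features = np.array([binarize_pair(
            str(my_dict[p[0]].seq)[:400].ljust(400, 'A'),
            str(my_dict[p[1]].seq)[:400].ljust(400, 'A')) for p in chunk])
        yield features, batch_num, len(chunk)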
def createHashFunctions(n, p):
    """
    :param n: number of hash functions
    :param p: length of the permutation (modulus)
    :return: Each hash function is represented by a pair (a, b),
             meaning h(c) = (a * c + b) % p
    """
    # coefficients are drawn from [1, 999] even when the caller's modulus is larger
    if p > 1000:
        p = 1000
    hash_function_pairs = set()
    while len(hash_function_pairs) < n:
        hash_function_pairs.add(
            (random.randint(1, p - 1), random.randint(1, p - 1)))
    return list(hash_function_pairs)


t1 = timeit.default_timer()
dataset = FastaIO.read_fasta_file('../files/isoseq_flnc1.fasta')
# this variant of getMinHashFunctions takes a fifth argument
# (number_of_permutations, cf. the banded-graph script above)
getMinHashFunctions(dataset, 15, 1, 10, 10)
t2 = timeit.default_timer()
print(t2 - t1)

"""
k = 8
k_gram_dictionary = {}
n = 1
for record in dataset:
    k_gram_dictionary[record.id] = KGrams.find_k_grams(record.seq, k)
    if n >= 100:
        break