Example #1
import collections
import csv

import numpy as np
from DataOperations import FastaIO

# DataSet and binarize_pair are project-local helpers that are not shown here.
def read_similarity_graph(similarity_graph_file, fasta_file):
    """Read candidate read pairs from a similarity-graph CSV and one-hot
    encode each pair; every pair is labelled 1 (similar)."""
    my_dict = FastaIO.read_fasta_file_as_dict(fasta_file)

    with open(similarity_graph_file, 'r') as csvfile:
        reader = list(csv.reader(csvfile, delimiter=','))

        X = []
        Y = []
        for candidate_pair in reader:
            read1 = my_dict[candidate_pair[0]].seq
            read2 = my_dict[candidate_pair[1]].seq

            # Pad short reads with 'A' and truncate, so both have length 400.
            read1 = (read1 + 'A' * max(0, 400 - len(read1)))[:400]
            read2 = (read2 + 'A' * max(0, 400 - len(read2)))[:400]

            X.append(binarize_pair(read1, read2))
            Y.append(1)  # every candidate pair is labelled as similar

        all_pairs = np.array(X)
        all_labels = np.array(Y)

        # Add a trailing channel axis for the CNN: (N, H, W) -> (N, H, W, 1).
        all_pairs = all_pairs.reshape(all_pairs.shape[0], all_pairs.shape[1],
                                      all_pairs.shape[2], 1)

        test = DataSet(all_pairs, all_labels)
        Datasets = collections.namedtuple('Datasets', ['test'])

        return Datasets(test=test)
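binarize_pair is not shown in any of these examples. Judging from the
[-1, 8, 400, 1] reshape in Examples #8 and #9, a plausible minimal sketch is a
stacked one-hot encoding of the two reads; the layout and defaults below are
assumptions, not confirmed by the source:

import numpy as np

def binarize_pair(read1, read2, length=400, alphabet='ACGT'):
    """One-hot encode a read pair as an (8, length) 0/1 matrix:
    rows 0-3 encode read1, rows 4-7 encode read2 (layout assumed)."""
    matrix = np.zeros((2 * len(alphabet), length), dtype=np.float32)
    for row_offset, read in ((0, read1), (len(alphabet), read2)):
        for col, base in enumerate(str(read)[:length]):
            channel = alphabet.find(base)
            if channel != -1:  # skip ambiguous bases such as 'N'
                matrix[row_offset + channel, col] = 1.0
    return matrix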
Example #2
import timeit

import networkx as nx
from DataOperations import FastaIO

# KGrams and bandHashFunction are project-local helpers; createHashFunctions
# is shown in Example #3.
def getMinHashFunctions(reads, k, r, b):
    """Return a dict mapping each read id to its per-band bucket hashes.
    reads: list of reads
    k: length of each k-mer
    r: number of rows (hash functions) per band
    b: number of bands
    n = b * r is the total number of hash functions
    """
    n = b * r
    start = timeit.default_timer()

    # a prime larger than any unsigned 32-bit integer
    p = 4294967311

    # get random k-grams representing the rows of similarity matrix
    # k-grams are hashed to 32bit integers for more convenience.
    # all_k_grams = KGrams.generate_all_k_grams(k, ['A', 'C', 'G', 'T'])
    # random_grams = KGrams.get_random_k_grams(all_k_grams, r)

    read_id_list = []

    # create k-mer dictionary for reads:
    k_gram_dictionary = {}
    for read in reads:
        read_id_list.append(read.id)
        k_gram_dictionary[read.id] = KGrams.find_k_grams(read.seq, k)

    now = timeit.default_timer()
    print("k-mers representation is ready.", now - start)
    """
    
    for read_id1 in k_gram_dictionary:
        for read_id2 in k_gram_dictionary:
            if read_id1 != read_id2:
                print(len(set(k_gram_dictionary[read_id1]).intersection(set(k_gram_dictionary[read_id2]))) / len(set(k_gram_dictionary[read_id1]).union(set(k_gram_dictionary[read_id2]))))
    """

    # get hash functions
    hash_functions = createHashFunctions(n, p)

    # Create Signature dictionary: the key of each element would be the pair (S, i):
    # S denotes the molecular string(read), i denotes the i-th hash function.
    # The value of the element with key(S, i) would be h_i(S)
    signature_dictionary = {}

    for i in range(n):
        signature_dictionary[i] = {}
    """
    # First Sig(S, i) must be initialized to +inf
    for key in k_gram_dictionary:
        for hash_function_index in range(n):
            signature_dictionary[hash_function_index][key] = 9999999999
    """

    # Algorithm Description:
    """
    For each chosen k_gram(rows of matrix):
        1) compute h1(row), h2(row), ...
        2) for each string:
            if k_gram exists in string:
                sig(s, i) = min(sig(s,i), h_i(row))
    """
    # a_coef / b_coef avoid shadowing this function's band parameter b
    for hash_index, (a_coef, b_coef) in enumerate(hash_functions):
        # for each hash function we compute h_i(S) for all strings
        for key in k_gram_dictionary:  # each string
            signature_dictionary[hash_index][key] = min(
                ((a_coef * item + b_coef) % p
                 for item in k_gram_dictionary[key]),  # each shingle in S
                default=9999999999)

    now2 = timeit.default_timer()
    print("minHash signature matrix is ready.", now2 - now)

    LSH_dict = {}
    for read_id in read_id_list:
        LSH_dict[read_id] = []

    band_dictionary = {}

    for read_id in read_id_list:

        integer_list = []  # collects the r signature values of the current band
        hash_function_counter = 0
        band_counter = 0

        for hash_function_index in range(n):

            integer_list.append(
                signature_dictionary[hash_function_index][read_id])
            hash_function_counter += 1

            if hash_function_counter == r:
                hashedValue = bandHashFunction(integer_list)
                LSH_dict[read_id].append(hashedValue)

                # add the read to the bucket of (band index, band hash)
                band_counter += 1
                band_dictionary.setdefault((band_counter, hashedValue),
                                           []).append(read_id)
                hash_function_counter = 0
                integer_list = []

    now3 = timeit.default_timer()
    print("LSH array is ready.", now3 - now2)

    # Create graph
    G = nx.Graph()
    for read_id in read_id_list:
        G.add_node(read_id)

    # connect every pair of reads that landed in the same bucket
    for key in band_dictionary:
        bucket = band_dictionary[key]
        for i in range(1, len(bucket)):
            for j in range(i):
                G.add_edge(bucket[j], bucket[i])

    now4 = timeit.default_timer()
    print("Graph is created.", now4 - now3)

    print("candidate edges:", len(G.edges()))

    FastaIO.write_graph_to_csv(
        'wholeGraphK' + str(k) + 'R' + str(r) + 'B' + str(b) + '.csv', G)

    return LSH_dict
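How r and b shape the candidate graph: two reads with Jaccard similarity s
collide in one band of r rows with probability s**r, hence in at least one of
the b bands with probability 1 - (1 - s**r)**b. A small self-contained sketch
of this standard LSH S-curve (plain Python, no project dependencies):

def band_collision_probability(s, r, b):
    """Probability that two reads with Jaccard similarity s share at least
    one LSH bucket, given b bands of r rows each."""
    return 1.0 - (1.0 - s ** r) ** b

# With r=3, b=8 (the call in Example #3), the threshold similarity is roughly
# (1/b) ** (1/r) = 0.5:
for s in (0.2, 0.4, 0.5, 0.6, 0.8):
    print(s, round(band_collision_probability(s, 3, 8), 3))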
Example #3
import random

from DataOperations import FastaIO

def createHashFunctions(n, p):
    """Create n random linear hash functions.
    :param n: number of hash functions
    :param p: length of permutation
    :return: Each hash function is represented by a pair (a, b), meaning h(c) = (ac + b) % p
    """
    # cap p so the sampled coefficients stay small; the caller still reduces
    # hash values modulo its own (much larger) prime
    if p > 1000:
        p = 1000

    # use a set so that all n (a, b) coefficient pairs are distinct
    hash_function_pairs = set()
    while len(hash_function_pairs) < n:
        hash_function_pairs.add(
            (random.randint(1, p - 1), random.randint(1, p - 1)))

    return list(hash_function_pairs)
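A quick usage check with toy parameters (the cap on p above only limits the
sampled coefficients; callers still reduce hash values modulo their own prime):

pairs = createHashFunctions(5, 101)
print(pairs)                                   # five distinct (a, b) pairs
print([(a * 42 + b) % 101 for a, b in pairs])  # h_i(42) for each function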


dataset = FastaIO.read_fasta_file('../files/reads50.fasta')

getMinHashFunctions(dataset, 9, 3, 8)
"""
k = 8
k_gram_dictionary = {}
n = 1

for record in dataset:
    k_gram_dictionary[record.id] = KGrams.find_k_grams(record.seq, k)
    if n >= 100:
        break
    n += 1

createRepresentationMatrix(k, 29, k_gram_dictionary)
"""
Example #4
import csv
import networkx
from DataOperations import graphOperations
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import scipy.io
import community
from DataOperations import FastaIO
from collections import Counter

dataset = FastaIO.read_fasta_file('../files/isoseq_flnc1.fasta')
dataset_dict = FastaIO.read_fasta_file_as_dict('../files/isoseq_flnc1.fasta')
id_dict = {}
id_list = []
G = networkx.Graph()
# spot-check: print the sequence of one specific read id
print(dataset_dict[
    'm150803_002149_42161_c100745121910000001823165807071563_s1_p0/14/1140_57_CCS']
      .seq)

# map read ids to integer indices and add every read as a graph node
ind = 0
for item in dataset:
    id_dict[item.id] = ind
    ind += 1
    id_list.append(item.id)
    G.add_node(item.id)

print(id_list)
my_dict = []
"""
mat = scipy.io.loadmat('Data50.mat')
t1 = timeit.default_timer()
print(t1 - t0)

# Create graph
G = nx.Graph()
for key in LSH_dict:
    G.add_node(key)
    band_values = LSH_dict[key]
    for i in range(len(band_values)):
        bands_dict[(i + 1, band_values[i])].append(key)

number_of_permutations = 10

for key in bands_dict:
    L = len(bands_dict[key])
    t = 1
    while L > t and t < number_of_permutations:
        for i in range(L - t):
            G.add_edge(bands_dict[key][i], bands_dict[key][i + t])
        t += 1

t2 = timeit.default_timer()
print(t2 - t1)
FastaIO.write_graph_to_csv(
    'MG1MK' + str(k_parameter) + 'R' + str(r) + 'B' + str(b) + 'P' +
    str(number_of_permutations) + '.csv', G)
t3 = timeit.default_timer()
print(t3 - t2)

print(len(G.edges()))
import csv
import timeit

import networkx
from DataOperations import graphOperations
import matplotlib.pyplot as plt
import community
from DataOperations import FastaIO
from collections import Counter

dataset = FastaIO.read_fasta_file('reads.fasta')
id_dict = {}
id_list = []
G = networkx.Graph()

ind = 0
for item in dataset:
    id_dict[item.id] = ind
    ind += 1
    id_list.append(item.id)
    G.add_node(item.id)

my_dict = []
"""
# load the ground-truth cluster assignments
with open('groundTruth1M.csv', 'r') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    cluster_ids = list(reader)

ground_truth_cluster_dict = {}
ground_truth_predicted_dict = {}
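# Hypothetical sketch (not in the original): fill the ground-truth dictionary,
# assuming each CSV row holds a (read_id, cluster_id) pair.
for row in cluster_ids:
    ground_truth_cluster_dict.setdefault(row[1], []).append(row[0])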
import timeit  # the remaining imports are already at the top of the file

print("sal")
dataset = FastaIO.read_fasta_file('unbalanced700K.fasta')
id_dict = {}
id_list = []
G = networkx.Graph()
print("hi")
ind = 0
for item in dataset:
    id_dict[item.id] = ind
    ind += 1
    id_list.append(item.id)
    G.add_node(item.id)

my_dict = []

"""
mat = scipy.io.loadmat('Data50.mat')
cluster_ids = mat['Id']

"""
Example #8
import timeit

import numpy as np
import tensorflow as tf

from DataOperations import FastaIO

# Fragment of the evaluation script: W_conv1, b_conv1 and the conv2d helper
# are defined earlier in the full file.
W_fc2 = tf.get_variable("W4", shape=[50, 1])
b_fc2 = tf.get_variable("b4", shape=[1])

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "../model/CNNModel.ckpt")
    print("Model restored.")
    # Check the values of the variables

    data_generator = FastaIO.read_next_batch('MinGraphK15R1B10P10.csv',
                                             '../files/reads400.fasta', 5000)

    label_array = []

    features, batch_num, batch_size = next(data_generator)
    print(features.shape)
    # Build the graph once, outside the batch loop; re-creating placeholders
    # on every iteration would keep growing the TF graph.
    r1 = tf.placeholder(tf.float32, [None, 3200])
    reshaped_data = tf.reshape(r1, [-1, 8, 400, 1])
    h_conv1 = tf.nn.relu(conv2d(reshaped_data, W_conv1) + b_conv1)

    for i in range(batch_num + 1):
        first = timeit.default_timer()

        features = np.reshape(features, [batch_size, 3200])
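The conv2d helper is not shown in the listing; a sketch in the style of the
classic TF1 tutorials (the stride and padding choices are assumptions):

def conv2d(x, W):
    """2-D convolution with stride 1 and SAME padding (TF1 API)."""
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')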
Example #9
import timeit

import tensorflow as tf

from DataOperations import FastaIO

# Same evaluation fragment as Example #8, reading a different graph/FASTA
# pair; the earlier layers are defined in the full file.
W_fc2 = tf.get_variable("W4", shape=[50, 1])
b_fc2 = tf.get_variable("b4", shape=[1])

# Add ops to save and restore all the variables.
saver = tf.train.Saver()

# Later, launch the model, use the saver to restore variables from disk, and
# do some work with the model.
with tf.Session() as sess:
    # Restore variables from disk.
    saver.restore(sess, "../model/CNNModel.ckpt")
    print("Model restored.")
    # Check the values of the variables

    t1 = timeit.default_timer()
    data_generator = FastaIO.read_next_batch2('G.csv', 'trimmed.fasta', 5000)

    # data_generator = FastaIO.read_next_batch('G8.csv', 'unbalanced2M.fasta', 5000)

    label_array = []

    features, batch_num, batch_size = next(data_generator)
    t_end = timeit.default_timer()
    print("One hot encode time: ", t_end - t1)

    print(features.shape)
    first = last = 0
    for i in range(batch_num + 1):
        if i % 10 == 0:
            first = timeit.default_timer()
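        # Hypothetical continuation (not in the original): close the timing
        # window opened above and report it once every 10 batches.
        if i % 10 == 9:
            last = timeit.default_timer()
            print("batches", i - 9, "to", i, ":", last - first, "s")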
Example #10
import random
import timeit

from DataOperations import FastaIO

def createHashFunctions(n, p):
    """Create n random linear hash functions.
    :param n: number of hash functions
    :param p: length of permutation
    :return: Each hash function is represented by a pair (a, b), meaning h(c) = (ac + b) % p
    """
    # cap p so the sampled coefficients stay small; the caller still reduces
    # hash values modulo its own (much larger) prime
    if p > 1000:
        p = 1000

    # use a set so that all n (a, b) coefficient pairs are distinct
    hash_function_pairs = set()
    while len(hash_function_pairs) < n:
        hash_function_pairs.add((random.randint(1, p - 1), random.randint(1, p - 1)))

    return list(hash_function_pairs)


t1 = timeit.default_timer()

dataset = FastaIO.read_fasta_file('../files/isoseq_flnc1.fasta')

# this variant of getMinHashFunctions takes a fifth argument, presumably the
# number of permutations (cf. the 'P' suffix in the CSV names above)
getMinHashFunctions(dataset, 15, 1, 10, 10)
t2 = timeit.default_timer()

print(t2 - t1)

"""
k = 8
k_gram_dictionary = {}
n = 1

for record in dataset:
    k_gram_dictionary[record.id] = KGrams.find_k_grams(record.seq, k)
    if n >= 100:
        break