# Standard-library and numpy imports used by the snippets below; the
# project-local helpers (shared_segment_length_genomes, write_log,
# get_sibling_group, ProbabilityData, INF_REPLACE, ZERO_REPLACE, ...)
# are defined elsewhere in this codebase.
from heapq import nlargest
from itertools import combinations
from math import isnan
from random import sample
from time import perf_counter

import numpy as np


def _compare_genome_node(self, node, genome, cache):
    probabilities = []
    length_classifier = self._length_classifier
    for labeled_node in length_classifier._labeled_nodes:
        # Genome comparisons are expensive, so shared lengths are
        # memoized per labeled node.
        if labeled_node in cache:
            shared = cache[labeled_node]
        else:
            shared = shared_segment_length_genomes(genome,
                                                   labeled_node.genome, 0)
            cache[labeled_node] = shared
        if (node, labeled_node) not in length_classifier:
            # No fitted distribution for this pair: fall back to sentinel
            # probabilities depending on whether any sharing was observed.
            if shared == 0:
                prob = INF_REPLACE
            else:
                prob = ZERO_REPLACE
        else:
            prob = length_classifier.get_probability(shared, node,
                                                     labeled_node)
        # Clamp values that would break the log-sum downstream.
        if prob > 1 or isnan(prob):
            prob = INF_REPLACE
        if prob == 0:
            prob = ZERO_REPLACE
        probabilities.append(prob)
    return np.array(probabilities)
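# Hedged sketch of the clamping rule above, pulled out as a pure function
# so it can be tested standalone. INF_REPLACE and ZERO_REPLACE are
# module-level sentinels defined elsewhere in this codebase; the values
# below are illustrative stand-ins, not the real constants.
from math import isnan as _isnan

_INF_REPLACE = 1.0     # hypothetical stand-in
_ZERO_REPLACE = 1e-16  # hypothetical stand-in


def _sanitize_probability_sketch(prob):
    # Keep downstream np.log() calls away from NaN, zero, and > 1 values.
    if prob > 1 or _isnan(prob):
        return _INF_REPLACE
    if prob == 0:
        return _ZERO_REPLACE
    return prob


assert _sanitize_probability_sketch(float("nan")) == _INF_REPLACE
assert _sanitize_probability_sketch(0.0) == _ZERO_REPLACE
assert _sanitize_probability_sketch(0.25) == 0.25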
def simulate_sharing(founders, pair, genome_generator, recombinators,
                     iterations=10000):
    # Repeatedly regenerate the founders' genomes and record how much the
    # given pair shares, yielding an empirical sharing distribution.
    sharing = []
    for _ in range(iterations):
        generate_genomes_ancestors(founders, genome_generator, recombinators)
        shared = shared_segment_length_genomes(pair[0].genome,
                                               pair[1].genome, 0)
        sharing.append(shared)
    return sharing
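# Hedged usage sketch for simulate_sharing: the returned list is an
# empirical distribution of shared-segment lengths, typically summarized
# by its mean and spread. _stub_shared_length stands in for a real
# simulation run, since founders/genome_generator/recombinators objects
# are not constructed here; the 5 Mb mean is illustrative only.
import random


def _stub_shared_length():
    return random.expovariate(1 / 5e6)


_sharing = [_stub_shared_length() for _ in range(10000)]
print(np.mean(_sharing), np.std(_sharing))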
def calculate_to_list(pairs, lengths):
    # Append the shared length of every (node_a, node_b) pair to lengths;
    # written as a helper so it can be used as a thread target below.
    lengths.extend(shared_segment_length_genomes(node_a.genome,
                                                 node_b.genome, 0)
                   for node_a, node_b in pairs)
# Abandoned two-thread variant of the pairwise comparison below.
# start = perf_counter()
# boundary = len(pairs) // 2
# thread_1 = threading.Thread(target=calculate_to_list,
#                             args=(pairs[:boundary], lengths))
# thread_2 = threading.Thread(target=calculate_to_list,
#                             args=(pairs[boundary:], lengths))
# thread_1.start()
# thread_2.start()
# thread_1.join()
# thread_2.join()
# stop = perf_counter()
# print(stop - start)

print("Comparing pairs.")
nodes = population.generations[-1].members
nodes = sample(nodes, 1500)
start = perf_counter()
lengths = [shared_segment_length_genomes(node_a.genome, node_b.genome, 0)
           for node_a, node_b in combinations(nodes, 2)]
stop = perf_counter()
print(stop - start)

# import pdb
# pdb.set_trace()
# shared = [len(np.flatnonzero(np.unpackbits(a.genome._founder_bits &
#                                            b.genome._founder_bits)))
#           for a, b in combinations(nodes, 2)]
# print(np.average(shared))
# print(np.std(shared))
# print(max(lengths))
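# Hedged sketch of the two-thread split attempted in the commented-out
# block above, with a stub in place of calculate_to_list so it runs
# standalone. One plausible reason the threaded version was abandoned:
# if shared_segment_length_genomes is pure Python, CPython's GIL would
# serialize the two threads and erase any speedup.
import threading


def _stub_calculate_to_list(pairs, lengths):
    lengths.extend(a + b for a, b in pairs)  # stand-in for the real work


_pairs = [(i, i + 1) for i in range(10)]
_lengths = []
_boundary = len(_pairs) // 2
_t1 = threading.Thread(target=_stub_calculate_to_list,
                       args=(_pairs[:_boundary], _lengths))
_t2 = threading.Thread(target=_stub_calculate_to_list,
                       args=(_pairs[_boundary:], _lengths))
_t1.start()
_t2.start()
_t1.join()
_t2.join()
assert sorted(_lengths) == sorted(a + b for a, b in _pairs)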
def identify(self, genome, actual_node, ibd_threshold=5000000):
    # Log probability that each candidate node is a match.
    node_probabilities = dict()
    id_map = self._population.id_mapping
    length_classifier = self._length_classifier
    # Shared segment length between the query genome and every labeled node.
    shared_list = []
    for labeled_node_id in length_classifier._labeled_nodes:
        labeled_node = id_map[labeled_node_id]
        s = shared_segment_length_genomes(genome, labeled_node.genome,
                                          ibd_threshold)
        shared_list.append((labeled_node_id, s))
    node_data = dict()
    batch_node_id = []
    batch_labeled_node_id = []
    batch_lengths = []
    batch_cryptic_lengths = []
    # Bind the append method once for performance, as appending to this
    # list is the hottest part of the loop.
    append_cryptic = batch_cryptic_lengths.append
    distributions = length_classifier._distributions
    # Set membership testing is faster than dictionary key
    # membership testing, so we use a set.
    distribution_members = set(distributions.keys())
    nodes = self._to_search(shared_list)
    for node in nodes:
        node_start_i = len(batch_node_id)
        node_id = node._id
        cryptic_start_i = len(batch_cryptic_lengths)
        for labeled_node_id, shared in shared_list:
            if (node_id, labeled_node_id) not in distribution_members:
                # No fitted distribution for this pair: treat the shared
                # length as "cryptic" and smooth it separately.
                append_cryptic(shared)
            else:
                batch_node_id.append(node_id)
                batch_labeled_node_id.append(labeled_node_id)
                batch_lengths.append(shared)
        cryptic_stop_i = len(batch_cryptic_lengths)
        node_stop_i = len(batch_node_id)
        node_data[node] = ProbabilityData(node_start_i, node_stop_i,
                                          cryptic_start_i, cryptic_stop_i)
    calc_prob = length_classifier.get_batch_probability(
        batch_lengths, batch_node_id, batch_labeled_node_id)
    cryptic_prob = length_classifier.get_batch_smoothing(
        batch_cryptic_lengths)
    # index_data = {node._id: tuple(indices)
    #               for node, indices in node_data.items()}
    # siblings = {node._id for node in get_sibling_group(actual_node)}
    # to_dump = {"actual_node_id": actual_node._id,
    #            "calc_prob": calc_prob,
    #            "cryptic_lengths": batch_cryptic_lengths,
    #            "siblings": siblings,
    #            "index_data": index_data}
    # output_filename = "/media/paul/Fast Storage/optimize_data/{}.pickle".format(actual_node._id)
    # with open(output_filename, "wb") as pickle_file:
    #     dump(to_dump, pickle_file)
    for node, prob_data in node_data.items():
        start_i, stop_i, cryptic_start_i, cryptic_stop_i = prob_data
        # if node == actual_node:
        #     import pdb
        #     pdb.set_trace()
        node_calc = calc_prob[start_i:stop_i]
        node_cryptic = cryptic_prob[cryptic_start_i:cryptic_stop_i]
        log_prob = (np.sum(np.log(node_calc)) +
                    np.sum(np.log(node_cryptic)))
        node_probabilities[node] = log_prob
    # potential_node = max(node_probabilities.items(),
    #                      key=lambda x: x[1])[0]
    write_log("identify",
              {"node": actual_node._id,
               "probs": {node._id: prob
                         for node, prob in node_probabilities.items()}})
    potential_nodes = nlargest(8, node_probabilities.items(),
                               key=lambda x: x[1])
    top, top_log_prob = potential_nodes[0]
    sibling_group = get_sibling_group(top)
    # Find the highest-ranked candidate outside the top node's sibling
    # group; fall back to the runner-up if every candidate is a sibling.
    for node, log_prob in potential_nodes[1:]:
        if node in sibling_group:
            continue
        next_node = node
        next_log_prob = log_prob
        break
    else:
        next_node, next_log_prob = potential_nodes[1]
    # The log-ratio between the best and the best non-sibling candidate
    # serves as a confidence score for the identification.
    log_ratio = top_log_prob - next_log_prob
    # log_data = {"actual_node_id": actual_node._id,
    #             "prob_indices": prob_data,
    #             "calc_prob": calc_prob,
    #             "cryptic_prob": cryptic_prob,
    #             "sibling_group": [node._id for node in sibling_group]}
    # write_log("run_data", log_data)
    return (sibling_group, log_ratio)
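# Hedged, self-contained sketch of identify()'s final ranking step: take
# the highest log-probability candidate, exclude its sibling group, and
# report the log-ratio against the best remaining candidate as a
# confidence margin. Toy ids and log-probabilities are illustrative only.
from heapq import nlargest as _nlargest

_log_probs = {"top": -10.0, "top_sibling": -10.5, "other": -25.0}
_sibling_group = {"top", "top_sibling"}

_ranked = _nlargest(8, _log_probs.items(), key=lambda x: x[1])
_top_id, _top_lp = _ranked[0]
for _node_id, _lp in _ranked[1:]:
    if _node_id not in _sibling_group:
        _next_lp = _lp
        break
else:
    _next_lp = _ranked[1][1]
print(_top_id, _top_lp - _next_lp)  # prints: top 15.0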