def cryptic_parameters(id_map, labeled_nodes, related_pairs):
    """
    Fit hurdle gamma parameters to the shared segment lengths of pairs
    of labeled nodes that do not appear in related_pairs.

    id_map is indexed by node id and yields objects with a ``genome``
    attribute. Only the first 1000 labeled nodes of a fixed-seed
    shuffle are paired up. The value returned by fit_hurdle_gamma is
    returned unchanged, after asserting that none of its entries is
    None.
    """
    # TODO: Add argument for this
    relative_rates_dir = join(dirname(__file__),
                              "../data/recombination_rates/")
    recomb_dir = abspath(relative_rates_dir)
    # NOTE(review): the meaning of the literal arguments 0 and 5 is not
    # visible here - confirm against SharedSegmentDetector.
    detector = SharedSegmentDetector(
        0, 5, centimorgan_data_from_directory(recomb_dir))

    # We try to only include the labeled nodes the analyst would have
    # access to. This only works if the deterministic random argument
    # is used when the classifier is evaluated.
    #
    # Sorting first gives the seeded shuffle the same starting order on
    # every call. The global random state is saved beforehand and put
    # back afterwards, so the caller's random stream is left untouched.
    candidates = list(labeled_nodes)
    candidates.sort()
    saved_state = getstate()
    seed(42)
    shuffle(candidates)
    setstate(saved_state)

    candidate_pairs = set(combinations(candidates[:1000], 2))
    known_related = set(related_pairs)
    cryptic_pairs = {pair for pair in candidate_pairs
                     if pair not in known_related}

    # Lazily resolve both ids of a pair before touching either genome.
    node_pairs = ((id_map[first_id], id_map[second_id])
                  for first_id, second_id in cryptic_pairs)
    lengths = [detector.shared_segment_length(first.genome, second.genome)
               for first, second in node_pairs]

    params = fit_hurdle_gamma(np.array(lengths, dtype=np.uint64))
    assert not any(value is None for value in params)
    return params
def distributions_from_directory(directory, id_mapping):
    """
    Calculate distributions from a directory created by
    calculate_shared_to_directory.

    Every file name in directory is parsed as the integer id of a
    labeled node. Each line of a file is expected to hold exactly two
    tab separated fields: the integer id of an unlabeled node and a
    shared length parseable as a float. Lines that do not match this,
    or whose unlabeled id is not in id_mapping, are skipped with a
    warning.

    Returns a dict mapping (unlabeled id, labeled id) tuples to
    HurdleGammaParams. Pairs for which fit_hurdle_gamma returns a shape
    of None are left out.
    """
    distributions = dict()
    for labeled_filename in tqdm(listdir(directory)):
        lengths_by_unlabeled = defaultdict(list)
        labeled = int(labeled_filename)
        with open(join(directory, labeled_filename), "r") as labeled_file:
            for line in labeled_file:
                # If the program crashed, the output can be left in an
                # inconsistent state. A wrong field count and a
                # non-integer id both raise ValueError and are treated
                # the same way: warn and move on to the next line.
                try:
                    unlabeled_id, shared_str = line.split("\t")
                    unlabeled = int(unlabeled_id)
                except ValueError:
                    warn("Malformed line:\n{}".format(line), stacklevel=0)
                    continue
                if unlabeled not in id_mapping:
                    error_string = "No such unlabeled node with id {}."
                    warn(error_string.format(unlabeled_id), stacklevel=0)
                    continue
                try:
                    shared_float = float(shared_str)
                except ValueError:
                    error_string = "Error formatting value as float: {}."
                    warn(error_string.format(shared_str), stacklevel=0)
                    continue
                lengths_by_unlabeled[unlabeled].append(shared_float)

        # One fit per unlabeled node seen in this labeled node's file.
        for unlabeled, shared_lengths in lengths_by_unlabeled.items():
            shape, scale, zero_prob = fit_hurdle_gamma(
                np.array(shared_lengths, dtype=np.float64))
            if shape is None:
                continue
            shape, scale = adjust_shape_scale(shape, scale)
            params = HurdleGammaParams(shape, scale, zero_prob)
            distributions[unlabeled, labeled] = params
    return distributions
# Choose the shared segment detector based on the command line option.
# NOTE(review): args.cm_ibd_threshold is only tested for being positive;
# the detector is built with the literals 0 and 5 rather than the option
# value - confirm this is intended.
if args.cm_ibd_threshold > 0:
    # The recombination rates are looked up in data/recombination_rates
    # under the parent of the directory that contains this file.
    cur_path = realpath(__file__)
    parent = split(split(cur_path)[0])[0]
    rates_dir = join(parent, "data", "recombination_rates")
    print("Loading recombination data for centimorgan cutoff.", flush=True)
    recomb_data = centimorgan_data_from_directory(rates_dir)
    ibd_detector = SharedSegmentDetector(0, 5, recomb_data)
else:
    ibd_detector = SharedSegmentDetector(5000000)

# Cryptic pairs are the pairs of labeled nodes for which the classifier
# holds no distribution.
labeled_nodes = classifier._labeled_nodes
labeled_node_pairs = set(combinations(labeled_nodes, 2))
related_pairs = set(classifier._distributions.keys())
# Filter the pairs that were already built instead of enumerating
# every combination a second time.
cryptic_pairs = set(x for x in labeled_node_pairs
                    if x not in related_pairs)

print("Calculating IBD for pairs.")
lengths = []
id_map = population.id_mapping
for node_a_id, node_b_id in progressbar(cryptic_pairs):
    node_a = id_map[node_a_id]
    node_b = id_map[node_b_id]
    genome_a = node_a.genome
    genome_b = node_b.genome
    length = ibd_detector.shared_segment_length(genome_a, genome_b)
    lengths.append(length)

np_lengths = np.array(lengths, dtype=np.uint64)
print(fit_hurdle_gamma(np_lengths))