Example #1
    def _evaluate_node(self, node):
        raw_identified = self._bayes.identify(node.genome, node,
                                              self._ibd_detector)
        sibling_group, ln_ratio, identified_node = raw_identified
        node_generation = self._population.node_to_generation[node]
        print("Confidence score: {}".format(ln_ratio))
        if node in sibling_group:
            self.generation_error[node_generation]["correct"] += 1
            self.correct += 1
            print("correct")
        else:
            self.generation_error[node_generation]["incorrect"] += 1
            print("incorrect")
            self.incorrect += 1

        write_log(
            "evaluate", {
                "target node": node._id,
                "log ratio": ln_ratio,
                "identified": set(x._id for x in sibling_group),
                "run_number": self._run_number
            })
        stdout.flush()
        return IdentifyResult(node, sibling_group, identified_node, ln_ratio,
                              node in sibling_group, self._run_number)
    def run_evaluation(self, unlabeled, expansion=None):
        # generation_map = population.node_to_generation
        # write_log("labeled_nodes", [node._id for node in labeled_nodes])
        # write_log("target_nodes", [node._id for node in unlabeled])
        if expansion is None:
            print("Attempting to identify {} random nodes.".format(
                len(unlabeled)),
                  flush=True)
        write_log("start time", datetime.now())
        for i, node in enumerate(unlabeled):
            if expansion:
                i = expansion
            print("Iteration: {}, actual node ID: {}".format(i + 1, node._id))
            start_time = perf_counter()
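            # Optionally re-draw the labeled (anchor) set for this target,
            # excluding the target itself, so each evaluation uses a fresh
            # random subset of the original labeled nodes.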
            if not expansion and self._randomize_labeled:
                potential = self._original_labeled.copy()
                if node._id in potential:
                    potential.remove(node._id)
                assert len(potential) > self._randomize_labeled
                new_anchors = sample(potential, self._randomize_labeled)
                self.labeled_nodes = new_anchors
            if self._out_of_genealogy > 0:
                self._reset_out_of_genealogy()
                self._remove_node_from_genealogy(node)
            self.identify_results.append(self._evaluate_node(node))
            end_time = perf_counter()
            print("It took {} seconds".format(end_time - start_time))

        write_log("end time", datetime.now())
        self._run_number += 1
        return self.identify_results
Example #3
    def run_evaluation(self, unlabeled):
        # generation_map = population.node_to_generation
        # write_log("labeled_nodes", [node._id for node in labeled_nodes])
        # write_log("target_nodes", [node._id for node in unlabeled])
        print("Attempting to identify {} random nodes.".format(len(unlabeled)),
              flush=True)
        write_log("start time", datetime.now())
        for i, node in enumerate(unlabeled):
            print("Iteration: {}, actual node ID: {}".format(i + 1, node._id))
            self.identify_results.append(self._evaluate_node(node))

        write_log("end time", datetime.now())
        self._run_number += 1
        return self.identify_results
    def _evaluate_node(self, node):
        identified, ln_ratio = self._bayes.identify(node.genome, node,
                                                    self._ibd_threshold)
        assert len(identified) > 0
        node_generation = self._population.node_to_generation[node]
        if node in identified:
            self.generation_error[node_generation]["correct"] += 1
            self.correct += 1
            print("correct")
        else:
            self.generation_error[node_generation]["incorrect"] += 1
            print("incorrect")
            self.incorrect += 1

        write_log(
            "evaluate", {
                "target node": node._id,
                "log ratio": ln_ratio,
                "identified": set(x._id for x in identified),
                "run_number": self._run_number
            })
        stdout.flush()
        return IdentifyResult(node, identified, ln_ratio, node in identified,
                              self._run_number)
Example #5
    def run_expansion_round(self,
                            identify_candidates,
                            confidence_ratio,
                            expansion_data=None,
                            expansion_filename=None):
        print("Running expansion round.")
        to_evaluate = list(identify_candidates)
        shuffle(to_evaluate)
        added = []
        correct_add_count = 0
        write_log("expansion_confidence_ratio", confidence_ratio)
        for i, node in enumerate(to_evaluate):
            self.run_evaluation([node])
            result = self.identify_results[-1]
            print("Ratio: {}".format(result.ln_ratio))
            if result.ln_ratio > confidence_ratio:
                print("Adding node.")
                added.append(result)
                self._bayes.add_labeled_node_id(result.identified_node._id)
                if result.correct:
                    correct_add_count += 1
                else:
                    result.identified_node.suspected_genome = result.target_node.genome
            if i % 20 == 0:
                self.print_metrics()
                print("Nodes added this round: {}".format(len(added)))
                print("Correct nodes added: {}".format(correct_add_count))
            if expansion_data and expansion_filename and i % 500 == 0:
                remaining = set(node._id for node in to_evaluate[i:])
                expansion_data.extend_round(added, remaining)
                with open(expansion_filename, "wb") as expansion_file:
                    dump(expansion_data, expansion_file)
                write_log("expansion_data_written", {
                    "current_node": node._id,
                    "complete": False
                })
        write_log(
            "expansion_round", {
                "added": len(added),
                "correct_added": correct_add_count,
                "accuracy": self.accuracy
            })
        self.print_metrics()
        print("Added {} nodes this round.".format(len(added)))
        return added
Example #6
    def print_metrics(self):
        total = self.correct + self.incorrect
        print("{} correct, {} incorrect, {} total.".format(
            self.correct, self.incorrect, total))
        stdout.flush()

        write_log("correct", self.correct)
        write_log("incorrect", self.incorrect)
        write_log("total", total)
        percent_accurate = self.accuracy
        std_dev = sqrt(percent_accurate *
                       (1 - percent_accurate) * total) / total
        print("{}±{:0.3} percent accurate.".format(percent_accurate, std_dev))
        for generation, counter in self.generation_error.items():
            gen_correct = counter["correct"]
            gen_incorrect = counter["incorrect"]
            total = gen_correct + gen_incorrect
            format_string = "For generation {}: {} accuracy, {} total."
            print(format_string.format(generation, gen_correct / total, total))
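The std_dev printed by print_metrics is the binomial standard error of the accuracy estimate, sqrt(p * (1 - p) / n), written above as sqrt(p * (1 - p) * total) / total. A standalone sketch of the same calculation with made-up counts:

from math import sqrt

correct, incorrect = 42, 8
total = correct + incorrect
accuracy = correct / total
std_dev = sqrt(accuracy * (1 - accuracy) * total) / total
print("{}±{:0.3} accurate.".format(accuracy, std_dev))
# Equivalent closed form: sqrt(accuracy * (1 - accuracy) / total)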
parser.add_argument("--ibd-threshold", type = int, default = 5000000,
                    help = "IBD segments smaller than this value will "
                    "go undetected")
parser.add_argument("--deterministic_random", "-d", action = "store_true",
                    help = "Seed the random number generator such that the same labeled nodes will be chosen on runs with the same number of nodes.")
parser.add_argument("--deterministic_labeled", "-ds", action = "store_true",
                    help = "Seed the random number generator to ensure labeled node subset is deterministic.")
args = parser.parse_args()

if args.data_logfile:
    change_logfile_name(args.data_logfile)
    start_logging()
else:
    stop_logging()

write_log("args", args)

print("Loading population.", flush = True)
with open(args.population, "rb") as pickle_file:
    population = PopulationUnpickler(pickle_file).load()

print("Loading classifier", flush = True)
with open(args.classifier, "rb") as pickle_file:
    classifier = load(pickle_file)

nodes = set(member for member in population.members
             if member.genome is not None)
# nodes = set(member for member in population.generations[-1].members
#             if member.genome is not None)

if args.subset_labeled:
    def identify(self, genome, actual_node, ibd_threshold=5000000):
        node_probabilities = dict()  # Probability that a node is a match
        id_map = self._population.id_mapping
        length_classifier = self._length_classifier
        shared_list = []
        for labeled_node_id in length_classifier._labeled_nodes:
            labeled_node = id_map[labeled_node_id]
            s = shared_segment_length_genomes(genome, labeled_node.genome,
                                              ibd_threshold)
            shared_list.append((labeled_node_id, s))

        node_data = dict()
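        # node_data maps each candidate node to index ranges into the batch
        # lists below, so its probabilities can be sliced back out after the
        # batched classifier calls.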
        batch_node_id = []
        batch_labeled_node_id = []
        batch_lengths = []
        batch_cryptic_lengths = []
        # This is done for performance reasons, as appending to this
        # list is the hottest part of the loop.
        append_cryptic = batch_cryptic_lengths.append
        distributions = length_classifier._distributions
        # Set membership testing is faster than dictionary key
        # membership testing, so we use a set.
        distribution_members = set(distributions.keys())
        nodes = self._to_search(shared_list)
        for node in nodes:
            node_start_i = len(batch_node_id)
            node_id = node._id
            cryptic_start_i = len(batch_cryptic_lengths)
            for labeled_node_id, shared in shared_list:
                if (node_id, labeled_node_id) not in distribution_members:
                    append_cryptic(shared)
                else:
                    batch_node_id.append(node_id)
                    batch_labeled_node_id.append(labeled_node_id)
                    batch_lengths.append(shared)
            cryptic_stop_i = len(batch_cryptic_lengths)
            node_stop_i = len(batch_node_id)
            node_data[node] = ProbabilityData(node_start_i, node_stop_i,
                                              cryptic_start_i, cryptic_stop_i)

        calc_prob = length_classifier.get_batch_probability(
            batch_lengths, batch_node_id, batch_labeled_node_id)
        cryptic_prob = length_classifier.get_batch_smoothing(
            batch_cryptic_lengths)

        # index_data = {node._id: tuple(indices)
        #               for node, indices in node_data.items()}
        # siblings = {node._id for node in get_sibling_group(actual_node)}
        # to_dump = {"actual_node_id": actual_node._id,
        #            "calc_prob": calc_prob,
        #            "cryptic_lengths": batch_cryptic_lengths,
        #            "siblings": siblings,
        #            "index_data": index_data}
        # output_filename = "/media/paul/Fast Storage/optimize_data/{}.pickle".format(actual_node._id)
        # with open(output_filename, "wb") as pickle_file:
        #     dump(to_dump, pickle_file)
        node_probabilities = dict()
        for node, prob_data in node_data.items():
            start_i, stop_i, cryptic_start_i, cryptic_stop_i = prob_data
            if node == actual_node:
                pass
                # import pdb
                # pdb.set_trace()
            node_calc = calc_prob[start_i:stop_i]
            node_cryptic = cryptic_prob[cryptic_start_i:cryptic_stop_i]
            log_prob = (np.sum(np.log(node_calc)) +
                        np.sum(np.log(node_cryptic)))
            node_probabilities[node] = log_prob
        # potential_node = max(node_probabilities.items(),
        #                      key = lambda x: x[1])[0]
        write_log(
            "identify", {
                "node": actual_node._id,
                "probs":
                {node._id: prob
                 for node, prob in node_probabilities.items()}
            })
        potential_nodes = nlargest(8,
                                   node_probabilities.items(),
                                   key=lambda x: x[1])
        top, top_log_prob = potential_nodes[0]
        sibling_group = get_sibling_group(top)
        for node, log_prob in potential_nodes[1:]:
            if node in sibling_group:
                continue
            next_node = node
            next_log_prob = log_prob
            break
        else:
            next_node, next_log_prob = potential_nodes[1]

        log_ratio = top_log_prob - next_log_prob
        # log_data = {"actual_node_id": actual_node._id,
        #             "prob_indices": prob_data,
        #             "calc_prob": calc_prob,
        #             "cryptic_prob": cryptic_prob
        #             "sibling_group": [node._id for node in sibling_group]}
        # write_log("run_data", log_data)
        return (sibling_group, log_ratio)
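The returned log ratio is the gap between the top candidate and the best-scoring candidate outside its sibling group; the for/else above falls back to the overall runner-up when every other candidate is a sibling. A self-contained toy version of that selection, with made-up node ids and log-probabilities:

from heapq import nlargest

node_probabilities = {"n1": -1.0, "n2": -1.2, "n3": -4.5}  # toy log-probabilities
sibling_group = {"n1", "n2"}  # toy sibling group containing the top node

potential_nodes = nlargest(3, node_probabilities.items(), key=lambda x: x[1])
top, top_log_prob = potential_nodes[0]
for node, log_prob in potential_nodes[1:]:
    if node in sibling_group:
        continue
    next_log_prob = log_prob
    break
else:
    # Every remaining candidate was a sibling of the top node.
    next_log_prob = potential_nodes[1][1]

print(top, top_log_prob - next_log_prob)  # n1 3.5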
    def identify(self, genome, actual_node, segment_detector):
        node_probabilities = dict() # Probability that a node is a match
        id_map = self._population.id_mapping
        length_classifier = self._length_classifier
        # TODO: Eliminate shared_list and use shared_dict everywhere
        shared_list = []
        for labeled_node_id in length_classifier._labeled_nodes:
            labeled_node = id_map[labeled_node_id]
            s = segment_detector.shared_segment_length(genome,
                                                       labeled_node.suspected_genome)
            shared_list.append((labeled_node_id, s))

        shared_dict = dict(shared_list)
        labeled_nodes = set(length_classifier._labeled_nodes)

        labeled_nodes_cryptic, all_lengths = list(zip(*shared_dict.items()))
        # We convert to python floats, as summing is faster.
        # all_cryptic_possibilities = [float(x) for x
        #                              in np.log(length_classifier.get_batch_smoothing(all_lengths))]
        all_cryptic_possibilities = [float(x) for x
                                     in np.log(length_classifier.get_batch_smoothing_gamma(all_lengths))]
        # Maps labeled nodes to the log cryptic value of the IBD detected
        cryptic_lookup = dict(zip(labeled_nodes_cryptic,
                                  all_cryptic_possibilities))

        # if self.cryptic_logging:
        #     unique_lengths = np.sort(np.unique(np.asarray(all_lengths,
        #                                                   dtype = np.uint64)))
        #     cryptic_length_logging = dict()
        #     non_cryptic_probabilties = dict()
        
        node_data = dict()
        batch_node_id = []
        batch_labeled_node_id = []
        batch_lengths = []
        # Keep for logging purposes
        # batch_cryptic_lengths = []
        node_cryptic_log_probs = dict()
        by_unlabeled = length_classifier.group_by_unlabeled
        nodes = self._to_search(shared_list, actual_node.sex)
        if len(nodes) == 0:
            # We have no idea which node it is
            return RawIdentified(set(), float("-inf"), None)
        
        empty = set()
        for node in nodes:
            node_start_i = len(batch_node_id)
            node_id = node._id
            #cryptic_start_i = len(batch_cryptic_lengths)
            cryptic_probability = 0
            node_cryptic_log_probs[node] = 0

            cryptic_nodes = labeled_nodes - by_unlabeled.get(node_id, empty)
            if len(cryptic_nodes) > 0:
                # batch_cryptic_lengths.extend(shared_dict[labeled_node_id]
                #                              for labeled_node_id
                #                              in cryptic_nodes)
                cryptic_probability = sum(cryptic_lookup[labeled_node_id]
                                          for labeled_node_id
                                          in cryptic_nodes)
                node_cryptic_log_probs[node] = cryptic_probability
                # if self.cryptic_logging:
                #     to_log = _get_logging_cryptic_lengths(shared_dict,
                #                                           cryptic_nodes,
                #                                           unique_lengths)
                #     cryptic_length_logging[node._id] = to_log

        
            non_cryptic_nodes = list(labeled_nodes - cryptic_nodes)
            if len(non_cryptic_nodes) > 0:
                batch_node_id.extend([node_id] * len(non_cryptic_nodes))
                batch_labeled_node_id.extend(non_cryptic_nodes)
                batch_lengths.extend(shared_dict[labeled_node_id]
                                     for labeled_node_id in non_cryptic_nodes)
            
            #cryptic_stop_i = len(batch_cryptic_lengths)
            node_stop_i = len(batch_node_id)
            node_data[node] = ProbabilityData(node_start_i, node_stop_i,
                                              -1, -1)
                                              #cryptic_start_i, cryptic_stop_i)

        assert len(node_data) > 0
        if len(batch_lengths) > 0:
            pdf_vals = length_classifier.get_batch_pdf(batch_lengths,
                                                       batch_node_id,
                                                       batch_labeled_node_id)
            calc_prob, zero_replace = pdf_vals
        else:
            calc_prob = []

        node_probabilities = dict()
        for node, prob_data in node_data.items():
            start_i, stop_i, cryptic_start_i, cryptic_stop_i = prob_data
            node_calc = calc_prob[start_i:stop_i]
            # if self.cryptic_logging:
            #     zero_vec = zero_replace[start_i:stop_i]
            #     non_cryptic_probabilties[node._id] = node_calc
            #     non_cryptic_probabilties[node._id][zero_vec] = None #stores as NaN
            log_prob = (np.sum(np.log(node_calc)) +
                        node_cryptic_log_probs[node])
            node_probabilities[node] = log_prob
        assert len(node_probabilities) > 0
        # potential_node = max(node_probabilities.items(),
        #                      key = lambda x: x[1])[0]
        write_log("identify", {"node": actual_node._id,
                               "probs": {node._id: prob
                                         for node, prob
                                         in node_probabilities.items()}})
        # if self.cryptic_logging:
        #     to_log = {"unique lengths": unique_lengths,
        #               "cryptic probability": cryptic_length_logging,
        #               "non cryptic": non_cryptic_probabilties}
        #     write_log("cryptic_logging", to_log)
        potential_nodes = nlargest(8, node_probabilities.items(),
                                   key=lambda x: x[1])
        top, top_log_prob = potential_nodes[0]
        sibling_group = get_suspected_sibling_group(top)
        for node, log_prob in potential_nodes[1:]:
            if node in sibling_group:
                continue
            next_node = node
            next_log_prob = log_prob
            break
        else:
            next_node, next_log_prob = potential_nodes[1]
                
        log_ratio = top_log_prob - next_log_prob
        # log_data = {"actual_node_id": actual_node._id,
        #             "prob_indices": prob_data,
        #             "calc_prob": calc_prob,
        #             "cryptic_prob": cryptic_prob
        #             "sibling_group": [node._id for node in sibling_group]}
        # write_log("run_data", log_data)
        return RawIdentified(get_sibling_group(top), log_ratio, top)
Example #10
        parser.error(
            "A subset of labeled nodes is necessary for expansion rounds when expansion rounds data file does not already exist."
        )
    if expansion_file_exists:
        with open(args.expansion_rounds_data, "rb") as expansion_file:
            expansion_data = load(expansion_file)
    else:
        expansion_data = None

if args.data_logfile:
    change_logfile_name(args.data_logfile)
    start_logging()
else:
    stop_logging()

write_log("args", args)

print("Loading population.", flush=True)
with open(args.population, "rb") as pickle_file:
    population = PopulationUnpickler(pickle_file).load()

print("Loading classifier", flush=True)
with open(args.classifier, "rb") as pickle_file:
    classifier = load(pickle_file)

if args.cm_ibd_threshold > 0:
    cur_path = realpath(__file__)
    parent = split(split(cur_path)[0])[0]
    rates_dir = join(parent, "data", "recombination_rates")
    print("Loading recombination data for centimorgan cutoff.", flush=True)
    recomb_data = centimorgan_data_from_directory(rates_dir)
    def run_expansion_round(self,
                            identify_candidates,
                            confidence_ratio,
                            expansion_data=None,
                            expansion_filename=None,
                            revisit=True):
        print("Running expansion round.")
        if revisit:
            to_evaluate = list(identify_candidates)
        else:
            id_map = self._population.id_mapping
            labeled_genomes = set(id_map[x].suspected_genome
                                  for x in self.labeled_nodes)
            to_evaluate = [
                x for x in identify_candidates
                if x.suspected_genome not in labeled_genomes
            ]
        shuffle(to_evaluate)
        added = []
        correct_add_count = 0
        new_added = 0
        self._bayes.probability_logging = False
        write_log("expansion_confidence_ratio", confidence_ratio)
        for i, node in enumerate(to_evaluate):
            self.run_evaluation([node], expansion=i)
            result = self.identify_results[-1]
            if result.ln_ratio > confidence_ratio:
                print("Adding node.")

                identified_node = result.identified_node
                added.append(result)
                if not expansion_data.identified_before(result.target_node):
                    new_added += 1
                if result.correct:
                    correct_add_count += 1
                prev_added = expansion_data.add_node(result)
                if prev_added != identified_node._id:
                    self._bayes.add_labeled_node_id(identified_node._id)
                    if prev_added is not None:
                        self._bayes.remove_labeled_node_id(prev_added._id)

            if 0 < i and i % 20 == 0:
                self.print_metrics()
                print("Nodes added this round: {}, New nodes added: {}".format(
                    len(added), new_added))
                print("Correct nodes added: {}".format(correct_add_count))
            if expansion_data and expansion_filename and i % 500 == 0:
                remaining = set(node._id for node in to_evaluate[i:])
                expansion_data.remaining = remaining
                with open(expansion_filename, "wb") as expansion_file:
                    dump(expansion_data, expansion_file)
                write_log("expansion_data_written", {
                    "current_node": node._id,
                    "complete": False
                })
        expansion_data.remaining = None
        write_log(
            "expansion_round", {
                "added": len(added),
                "evaluated": len(to_evaluate),
                "correct_added": correct_add_count,
                "accuracy": self.accuracy
            })
        self.print_metrics()
        print("Added {} nodes this round.".format(len(added)))
        if len(added) == 0:
            input("No nodes added. Press enter to continue")
        return added
Example #12
    def identify(self, genome, actual_node, segment_detector):
        id_map = self._population.id_mapping
        length_classifier = self._length_classifier
        # TODO: Eliminate shared_list and use shared_dict everywhere
        shared_list = []
        anchors = set(length_classifier._labeled_nodes) - self.exclude_anchors
        sorted_labeled = sorted(anchors)
        np_sorted_labeled = np.array(sorted_labeled, dtype=np.uint32)
        sorted_shared = []
        for labeled_node_id in sorted_labeled:
            labeled_node = id_map[labeled_node_id]
            s = segment_detector.shared_segment_length(
                genome, labeled_node.suspected_genome)
            shared_list.append((labeled_node_id, s))
            sorted_shared.append(s)

        write_log("positive ibd count", sum(0.0 < x for x in sorted_shared))
        #write_log("shared", sorted_shared)
        shared_dict = dict(shared_list)
        sorted_shared = np.array(sorted_shared, dtype=np.float64)

        labeled_nodes_cryptic, all_lengths = list(zip(*shared_dict.items()))
        np_cryptic = np.log(
            length_classifier.get_batch_smoothing_gamma(sorted_shared))

        node_data = []
        batch_shape = []
        batch_scale = []
        batch_zero_prob = []
        batch_lengths = []
        # Keep for logging purposes
        # batch_cryptic_lengths = []
        nodes = self._to_search(shared_list, actual_node.sex)
        if len(nodes) == 0:
            # We have no idea which node it is
            return RawIdentified(set(), float("-inf"), None)

        for node in nodes:
            node_start_i = len(batch_shape)
            node_id = node._id
            #node_cryptic_log_probs[node] = 0
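            # Fall back to empty distribution arrays below when this node has
            # no fitted per-anchor distributions.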

            if node_id in length_classifier._distributions:
                labeled_ids, shape, scale, zero_prob = length_classifier._distributions[
                    node_id]
            else:
                labeled_ids = np.array([], dtype=np.uint32)
                shape = scale = zero_prob = np.array([], dtype=np.float64)
            calc_data = calculate_probabilities(labeled_ids, shape, scale,
                                                zero_prob, sorted_shared,
                                                np_sorted_labeled, np_cryptic,
                                                node_id)
            cur_lengths, cur_shapes, cur_scales, cur_zero_prob, cur_cryptic = calc_data
            batch_lengths.extend(cur_lengths)
            batch_shape.extend(cur_shapes)
            batch_scale.extend(cur_scales)
            batch_zero_prob.extend(cur_zero_prob)

            node_stop_i = len(batch_shape)
            node_data.append(
                ProbabilityData(node, node_start_i, node_stop_i, cur_cryptic))

        assert len(node_data) > 0
        if len(batch_lengths) > 0:
            pdf_vals = length_classifier.batch_pdf_distributions(
                batch_lengths, batch_shape, batch_scale, batch_zero_prob)
            calc_prob, zero_replace = pdf_vals
        else:
            calc_prob = []

        log_calc_prob_cum = np.cumsum(np.log(calc_prob))
        del calc_prob
        log_calc_prob_cum = np.concatenate(([0.0], log_calc_prob_cum))
        node_probabilities = dict()
        for node, start_i, stop_i, cryptic_prob in node_data:
            log_prob = (log_calc_prob_cum[stop_i] -
                        log_calc_prob_cum[start_i]) + cryptic_prob
            node_probabilities[node] = log_prob
        assert len(node_probabilities) > 0
        if self.probability_logging:
            write_log(
                "identify", {
                    "node": actual_node._id,
                    "probs": {
                        node._id: prob
                        for node, prob in node_probabilities.items()
                    }
                })

        if len(node_probabilities) == 0:
            return RawIdentified(set(), -INF, None)
        # The value 8 is somewhat arbitrary. We are always able to
        # generate our confidence value with the top 8, as sibships
        # tend to be small. This number may need to be larger for
        # populations with large sibships.
        potential_nodes = nlargest(8,
                                   node_probabilities.items(),
                                   key=lambda x: x[1])
        top, top_log_prob = potential_nodes[0]
        sibling_group = get_suspected_sibling_group(top)
        for node, log_prob in potential_nodes[1:]:
            if node in sibling_group:
                continue
            next_node = node
            next_log_prob = log_prob
            break
        else:
            if len(potential_nodes) > 1:
                next_node, next_log_prob = potential_nodes[1]

        if len(potential_nodes) > 1:
            log_ratio = top_log_prob - next_log_prob
        else:
            log_ratio = -INF
        return RawIdentified(get_sibling_group(top), log_ratio, top)
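The cumulative sum of log-probabilities above lets each candidate's per-anchor log-probability sum be read off as the difference of two prefix values instead of summing a slice per node. A small self-contained check of that identity with toy numbers:

import numpy as np

calc_prob = np.array([0.2, 0.5, 0.1, 0.4, 0.3])
log_cum = np.concatenate(([0.0], np.cumsum(np.log(calc_prob))))

# Candidate A owns entries 0..2, candidate B owns entries 3..4.
print(np.isclose(log_cum[3] - log_cum[0], np.log(calc_prob[0:3]).sum()))  # True
print(np.isclose(log_cum[5] - log_cum[3], np.log(calc_prob[3:5]).sum()))  # True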
Example #13
    def identify(self, genome, actual_node, segment_detector):
        node_probabilities = dict()  # Probability that a node is a match
        id_map = self._population.id_mapping
        length_classifier = self._length_classifier
        # TODO: Eliminate shared_list and use shared_dict everywhere
        shared_list = []
        for labeled_node_id in length_classifier._labeled_nodes:
            labeled_node = id_map[labeled_node_id]
            s = segment_detector.shared_segment_length(
                genome, labeled_node.suspected_genome)
            shared_list.append((labeled_node_id, s))

        shared_dict = dict(shared_list)
        labeled_nodes = set(length_classifier._labeled_nodes)

        node_data = dict()
        batch_node_id = []
        batch_labeled_node_id = []
        batch_lengths = []
        batch_cryptic_lengths = []
        by_unlabeled = length_classifier.group_by_unlabeled
        nodes = self._to_search(shared_list)
        if len(nodes) == 0:
            # We have no idea which node it is
            return RawIdentified(set(), float("-inf"), None)

        empty = set()
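        # Record each candidate's index ranges into the batch lists so its
        # per-anchor probabilities can be sliced back out after the batched
        # classifier calls.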
        for node in nodes:
            node_start_i = len(batch_node_id)
            node_id = node._id
            cryptic_start_i = len(batch_cryptic_lengths)

            cryptic_nodes = labeled_nodes - by_unlabeled.get(node_id, empty)
            if len(cryptic_nodes) > 0:
                batch_cryptic_lengths.extend(
                    shared_dict[labeled_node_id]
                    for labeled_node_id in cryptic_nodes)

            non_cryptic_nodes = list(labeled_nodes - cryptic_nodes)
            if len(non_cryptic_nodes) > 0:
                batch_node_id.extend([node_id] * len(non_cryptic_nodes))
                batch_labeled_node_id.extend(non_cryptic_nodes)
                batch_lengths.extend(shared_dict[labeled_node_id]
                                     for labeled_node_id in non_cryptic_nodes)

            cryptic_stop_i = len(batch_cryptic_lengths)
            node_stop_i = len(batch_node_id)
            node_data[node] = ProbabilityData(node_start_i, node_stop_i,
                                              cryptic_start_i, cryptic_stop_i)

        assert len(node_data) > 0
        if len(batch_lengths) > 0:
            calc_prob = length_classifier.get_batch_probability(
                batch_lengths, batch_node_id, batch_labeled_node_id)
        else:
            calc_prob = []
        cryptic_prob = length_classifier.get_batch_smoothing(
            batch_cryptic_lengths)

        # index_data = {node._id: tuple(indices)
        #               for node, indices in node_data.items()}
        # siblings = {node._id for node in get_sibling_group(actual_node)}
        # to_dump = {"actual_node_id": actual_node._id,
        #            "calc_prob": calc_prob,
        #            "cryptic_lengths": batch_cryptic_lengths,
        #            "siblings": siblings,
        #            "index_data": index_data}
        # output_filename = "/media/paul/Fast Storage/optimize_data/{}.pickle".format(actual_node._id)
        # with open(output_filename, "wb") as pickle_file:
        #     dump(to_dump, pickle_file)
        node_probabilities = dict()
        for node, prob_data in node_data.items():
            start_i, stop_i, cryptic_start_i, cryptic_stop_i = prob_data
            if node == actual_node:
                pass
                # import pdb
                # pdb.set_trace()
            node_calc = calc_prob[start_i:stop_i]
            node_cryptic = cryptic_prob[cryptic_start_i:cryptic_stop_i]
            log_prob = (np.sum(np.log(node_calc)) +
                        np.sum(np.log(node_cryptic)))
            node_probabilities[node] = log_prob
        assert len(node_probabilities) > 0
        # potential_node = max(node_probabilities.items(),
        #                      key = lambda x: x[1])[0]
        write_log(
            "identify", {
                "node": actual_node._id,
                "probs":
                {node._id: prob
                 for node, prob in node_probabilities.items()}
            })
        potential_nodes = nlargest(8,
                                   node_probabilities.items(),
                                   key=lambda x: x[1])
        top, top_log_prob = potential_nodes[0]
        sibling_group = get_suspected_sibling_group(top)
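        # Choose the best-scoring node outside the top node's sibling group;
        # the for/else falls back to the runner-up when every remaining
        # candidate is a sibling.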
        for node, log_prob in potential_nodes[1:]:
            if node in sibling_group:
                continue
            next_node = node
            next_log_prob = log_prob
            break
        else:
            next_node, next_log_prob = potential_nodes[1]

        log_ratio = top_log_prob - next_log_prob
        # log_data = {"actual_node_id": actual_node._id,
        #             "prob_indices": prob_data,
        #             "calc_prob": calc_prob,
        #             "cryptic_prob": cryptic_prob
        #             "sibling_group": [node._id for node in sibling_group]}
        # write_log("run_data", log_data)
        return RawIdentified(sibling_group, log_ratio, top)
    help=
    "Search only nodes that are related to labeled nodes for which there is nonzero ibd."
)

args = parser.parse_args()

if args.expansion_rounds > 1 and args.subset_labeled is None:
    parser.error(
        "A subset of labeled nodes is necessary for expansion rounds.")
if args.data_logfile:
    change_logfile_name(args.data_logfile)
    start_logging()
else:
    stop_logging()

write_log("args", args)

IdentifyResult = namedtuple(
    "IdentifyResult",
    ["target_node", "identified_node", "ln_ratio", "correct", "run_number"])


class Evaluation:
    def __init__(self,
                 population,
                 classifier,
                 labeled_nodes=None,
                 ibd_threshold=0,
                 search_related=False):
        self._population = population
        self._classifier = classifier