def test_unique_clusters(self):
    """ Checks that a protocluster shared by two candidate clusters is only
        reported once by Region.get_unique_protoclusters()
    """
    protoclusters = []
    for index, product in enumerate("ABC"):
        protoclusters.append(create_protocluster(index, 10, product=product))

    # the two candidates overlap on the middle protocluster
    interleaved = CandidateCluster.kinds.INTERLEAVED
    first = CandidateCluster(interleaved, protoclusters[:2])
    second = CandidateCluster(interleaved, protoclusters[1:])
    shared = protoclusters[1]
    assert shared in first.protoclusters
    assert shared in second.protoclusters

    region = Region(candidate_clusters=[first, second])
    unique_clusters = region.get_unique_protoclusters()
    # if the protocluster in both candidates is repeated, there'll be an extra
    assert len(unique_clusters) == 3
    assert unique_clusters == protoclusters
def score_against_protoclusters(label: str, region: Region,
                                hits_by_reference: HitsByReference,
                                query_components: Dict[CDSCollection, Components],
                                mode: Mode) -> VariantResults:
    """ Compares each unique protocluster of the query region with each
        protocluster of every reference region that has hits in the query area.

        For every query protocluster and reference region pair, only the
        highest final score is kept. A reference region's total is the sum of
        those best scores over all query protoclusters, and reference regions
        are ranked by that total in descending order.

        Arguments:
            label: the name to attach to the results
            region: the query Region
            hits_by_reference: a dictionary mapping ReferenceRecord to
                               a dictionary mapping reference CDS name to Hit
            query_components: a dictionary mapping the region and each contained
                              protocluster to a Components instance with the relevant data
            mode: the Mode in which to run the analysis

        Returns:
            a VariantResults instance
    """
    # query protocluster number -> reference region -> reference protocluster -> scorer
    scorers: Dict[int, Dict[ReferenceRegion, Dict[ReferenceProtocluster, ReferenceScorer]]] = (
        defaultdict(lambda: defaultdict(dict))
    )
    # query protocluster -> reference region -> best final score seen so far
    best_scores: Dict[Protocluster, Dict[ReferenceRegion, float]] = (
        defaultdict(lambda: defaultdict(float))
    )

    local_hits = filter_by_query_area(region, hits_by_reference)
    for ref_region in local_hits:
        # restrict the hits to the single reference region being processed
        single_region_hits = {ref_region: local_hits[ref_region]}
        for ref_protocluster in ref_region.protoclusters:
            hits = filter_by_reference_protocluster(ref_protocluster, single_region_hits)
            for query in region.get_unique_protoclusters():
                components = query_components[query]
                for scorer in score_query_area(query, hits, components, mode):
                    candidate = scorer.final_score
                    best_by_region = best_scores[query]
                    best_by_region[ref_region] = max(candidate, best_by_region[ref_region])
                    number = query.get_protocluster_number()
                    scorers[number][ref_region][ref_protocluster] = scorer

    # sum the per-protocluster bests into a single total per reference region
    totals: Dict[ReferenceRegion, float] = defaultdict(float)
    for best_by_region in best_scores.values():
        for reference, best in best_by_region.items():
            totals[reference] += best

    ranking = list(totals.items())
    ranking.sort(key=lambda pair: pair[1], reverse=True)
    ranking, scorers, best_hits = apply_limits_to_rankings(ranking, scorers, local_hits)
    return VariantResults(label, ranking, ProtoToProtoScores(scorers), best_hits)
def score_as_protoclusters(label: str, region: Region,
                           hits_by_reference: HitsByReference,
                           query_components: Dict[CDSCollection, Components],
                           mode: Mode) -> VariantResults:
    """ Compares each unique protocluster of the query region with whole
        reference regions.

        Every scorer produced for a query protocluster adds its
        protocluster ranking value to the total of the reference it refers to,
        and references are ranked by that total in descending order.

        Arguments:
            label: the name to attach to the results
            region: the query Region
            hits_by_reference: a dictionary mapping ReferenceRecord to
                               a dictionary mapping reference CDS name to Hit
            query_components: a dictionary mapping the region and each contained
                              protocluster to a Components instance with the relevant data
            mode: the Mode in which to run the analysis

        Returns:
            a VariantResults instance
    """
    local_hits = filter_by_query_area(region, hits_by_reference)

    # reference region -> summed ranking value over all query protoclusters
    totals: Dict[ReferenceRegion, float] = defaultdict(float)
    # query protocluster number -> reference region -> scorer
    scorers: Dict[int, Dict[ReferenceRegion, ReferenceScorer]] = defaultdict(dict)

    for query in region.get_unique_protoclusters():
        components = query_components[query]
        for scorer in score_query_area(query, local_hits, components, mode):
            totals[scorer.reference] += calculate_protocluster_ranking(scorer)
            number = query.get_protocluster_number()
            scorers[number][scorer.reference] = scorer

    ordered = list(totals.items())
    ordered.sort(key=lambda pair: pair[1], reverse=True)
    ordered, scorers, best_hits = apply_limits_to_rankings(ordered, scorers, local_hits)
    return VariantResults(label, ordered, ProtoToRegionScores(scorers), best_hits)