def test_get_match_paths(self):
    """
    Compare the match paths of every CTML boolean fixture with the stored expectations.

    Each JSON file in ``ctml_boolean_cases`` is loaded as a one-element match clause,
    converted to a match tree and expanded with ``get_match_paths``. The result is
    compared with the entry stored under the fixture's base file name in
    ``get_match_paths_expected.json``.

    Expected and actual paths are paired with ``zip``, which stops at the shorter
    sequence, so surplus paths on either side are not inspected.
    """
    self.me.trials = dict()
    for fixture_path in glob.glob('./matchengine/tests/data/ctml_boolean_cases/*.json'):
        with open(fixture_path) as fixture:
            self.me.trials[fixture_path] = [json.load(fixture)]
    with open("./matchengine/tests/data/get_match_paths_expected.json") as expected_file:
        expected_by_name = json.load(expected_file)

    for fixture_path, clause in self.me.trials.items():
        fixture_name = os.path.basename(fixture_path)
        clause_data = MatchClauseData(match_clause=clause,
                                      internal_id='123',
                                      code='456',
                                      coordinating_center='The Death Star',
                                      status='Open to Accrual',
                                      parent_path=ParentPath(()),
                                      match_clause_level=MatchClauseLevel('arm'),
                                      match_clause_additional_attributes={},
                                      is_suspended=True,
                                      protocol_no='12-345')
        tree = create_match_tree(self.me, clause_data)
        actual_paths = list(get_match_paths(tree))

        for expected_path, actual_path in zip(expected_by_name[fixture_name], actual_paths):
            for position, expected_node in enumerate(expected_path["criteria_list"]):
                # Index into the actual list so a missing criteria node raises.
                actual_node = actual_path.criteria_list[position]
                assert expected_node["depth"] == actual_node.depth
                criteria_pairs = zip(expected_node["criteria"], actual_node.criteria)
                for expected_criterion, actual_criterion in criteria_pairs:
                    assert nested_object_hash(expected_criterion) == nested_object_hash(
                        actual_criterion)
 def test_comparable_dict(self):
     """
     Check which differences ``nested_object_hash`` ignores and which it detects.

     Dict key order, the order of items in a nested list and the order in which
     set elements are written must not change the hash; a different key inside
     a nested dict must.
     """
     # Two empty dicts.
     empty_left, empty_right = {}, {}
     assert nested_object_hash(empty_left) == nested_object_hash(empty_right)

     # Same keys and values, written in a different order.
     forward_keys = {"1": "1", "2": "2"}
     reversed_keys = {"2": "2", "1": "1"}
     assert nested_object_hash(forward_keys) == nested_object_hash(reversed_keys)

     # Nested list whose items appear in a different order.
     list_left = {"1": [{}, {2: 3}], "2": "2"}
     list_right = {"2": "2", "1": [{2: 3}, {}]}
     assert nested_object_hash(list_left) == nested_object_hash(list_right)

     # Same as above, with a set nested inside one of the list items.
     set_left = {"1": [{'set': {1, 2, 3}}, {2: 3}], "2": "2"}
     set_right = {"2": "2", "1": [{2: 3}, {'set': {3, 1, 2}}]}
     assert nested_object_hash(set_left) == nested_object_hash(set_right)

     # Only the innermost key differs (5 versus 9), so the hashes must differ.
     with_key_five = {1: {2: [{3: 4, 5: {6, 7}}]}, "4": [9, 8]}
     with_key_nine = {1: {2: [{3: 4, 9: {6, 7}}]}, "4": [9, 8]}
     assert nested_object_hash(with_key_five) != nested_object_hash(with_key_nine)
    def test_create_match_tree(self):
        """
        Compare the match tree built for every CTML boolean fixture with the stored expectations.

        Each JSON file in ``ctml_boolean_cases`` is loaded as a one-element match clause
        and passed to ``create_match_tree``. The expectation stored under the fixture's
        base file name in ``create_match_tree_expected.json`` is then checked key by key:
        ``"nodes"`` is compared per node id by length and ``nested_object_hash``, every
        other key is compared item by item against the attribute of the same name on
        the tree.
        """
        self.me.trials = dict()
        for fixture_path in glob.glob('./matchengine/tests/data/ctml_boolean_cases/*.json'):
            with open(fixture_path) as fixture:
                self.me.trials[fixture_path] = [json.load(fixture)]

        with open('./matchengine/tests/data/create_match_tree_expected.json') as expected_file:
            expected_by_name = json.load(expected_file)

        for fixture_path, clause in self.me.trials.items():
            clause_data = MatchClauseData(match_clause=clause,
                                          internal_id='123',
                                          code='456',
                                          coordinating_center='The Death Star',
                                          status='Open to Accrual',
                                          parent_path=ParentPath(()),
                                          match_clause_level=MatchClauseLevel('arm'),
                                          match_clause_additional_attributes={},
                                          protocol_no='12-345',
                                          is_suspended=True)
            tree = create_match_tree(self.me, clause_data)
            expected = expected_by_name[os.path.basename(fixture_path)]
            assert len(expected["nodes"]) == len(tree.nodes)

            for section, expected_items in expected.items():
                if section != "nodes":
                    # Any other key names a tree attribute holding indexable items.
                    actual_items = getattr(tree, section)
                    for expected_item, actual_item in zip(expected_items, actual_items):
                        for position, expected_part in enumerate(expected_item):
                            assert expected_part == actual_item[position]
                    continue

                # JSON object keys are strings; the tree is indexed by int node id.
                for node_id, expected_attrs in expected_items.items():
                    actual_attrs = tree.nodes[int(node_id)]
                    assert len(expected_attrs) == len(actual_attrs)
                    assert nested_object_hash(expected_attrs) == nested_object_hash(actual_attrs)
Beispiel #4
0
 def hash(self) -> str:
     """
     Return the hash of this object's criteria, computing it on first use.

     The value is ``nested_object_hash`` of ``{"query": [...]}``, where the list
     holds the ``criteria`` attribute of every entry in ``self.criteria_list``.
     The result is stored in ``self._hash`` and reused on later calls.
     """
     if self._hash is not None:
         return self._hash
     criteria_payload = [entry.criteria for entry in self.criteria_list]
     self._hash = nested_object_hash({"query": criteria_payload})
     return self._hash
Beispiel #5
0
 def raw_query_hash(self):
     """
     Return the hash of the raw query, computing it on first use.

     The value is ``nested_object_hash(self.extract_raw_query())`` and is stored
     in ``self._raw_query_hash`` for later calls.

     Raises:
         Exception: if no hash is cached yet and ``self.is_finalized`` is falsy.
     """
     if self._raw_query_hash is not None:
         return self._raw_query_hash
     if not self.is_finalized:
         raise Exception("Query node is not finalized")
     self._raw_query_hash = nested_object_hash(self.extract_raw_query())
     return self._raw_query_hash
Beispiel #6
0
 def hash(self) -> str:
     """
     Return the hash of this object, computing it on first use.

     The value is ``nested_object_hash`` of a dict holding the ``hash()`` of every
     entry in ``self.query_parts`` (key ``"_tmp1"``) and ``self.exclusion``
     (key ``"_tmp2"``). The result is stored in ``self._hash`` and reused.
     """
     if self._hash is not None:
         return self._hash
     part_hashes = [query_part.hash() for query_part in self.query_parts]
     self._hash = nested_object_hash({"_tmp1": part_hashes, '_tmp2': self.exclusion})
     return self._hash
Beispiel #7
0
    def pre_process_trial_matches(self, trial_match: TrialMatch) -> Dict:
        """
        Build the base trial_match document from a TrialMatch.

        Fields are added in this order, later ones overwriting earlier ones with
        the same key:

        1. the output of ``format_trial_match_k_v`` for the cached clinical document,
        2. ``clinical_id`` (the clinical document's ``_id``),
        3. match clause, match reason and query hash fields,
        4. every trial field except a fixed set of excluded keys,
        5. ``match_path`` (the parent path joined with dots),
        6. ``combo_coord`` (hash of query hash, match path and trial identifier),
        7. ``is_disabled`` set to False.

        ``_updated``, ``last_updated`` and ``_id`` are removed before returning.

        :param trial_match: the match to convert
        :return: the new trial match document
        """
        reason = trial_match.match_reason
        clause_data = trial_match.match_clause_data
        clinical_doc = self.cache.docs[reason.clinical_id]

        new_trial_match = dict()
        new_trial_match.update(self.format_trial_match_k_v(clinical_doc))
        new_trial_match['clinical_id'] = clinical_doc['_id']

        required_fields = {
            'match_level': clause_data.match_clause_level,
            'internal_id': clause_data.internal_id,
            'reason_type': reason.reason_name,
            'q_depth': reason.depth,
            'q_width': reason.width,
            'code': clause_data.code,
            'trial_curation_level_status': 'closed' if clause_data.is_suspended else 'open',
            'trial_summary_status': clause_data.status,
            'coordinating_center': clause_data.coordinating_center,
            'show_in_ui': reason.show_in_ui,
            'query_hash': trial_match.match_criterion.hash(),
        }
        new_trial_match.update(required_fields)

        # add trial fields except for extras
        excluded_trial_keys = {'treatment_list', '_summary', 'status', '_elasticsearch', 'match'}
        for key, value in trial_match.trial.items():
            if key not in excluded_trial_keys:
                new_trial_match[key] = value

        new_trial_match['match_path'] = '.'.join(str(item) for item in clause_data.parent_path)

        # Read the values back from the document so that trial fields merged
        # above take part in the hash if they overwrote an earlier key.
        identifier_key = self.match_criteria_transform.trial_identifier
        new_trial_match['combo_coord'] = nested_object_hash({
            'query_hash': new_trial_match['query_hash'],
            'match_path': new_trial_match['match_path'],
            identifier_key: new_trial_match[identifier_key],
        })

        new_trial_match['is_disabled'] = False
        for dropped_key in ("_updated", "last_updated", "_id"):
            new_trial_match.pop(dropped_key, None)
        return new_trial_match
Beispiel #8
0
async def run_query_task(matchengine: MatchEngine, task, worker_id):
    """
    Run one QueryTask and store the resulting trial match documents on the engine.

    The query is executed with ``matchengine.run_query``. Every returned match
    reason is turned into a trial match document (``pre_process_trial_matches``
    followed by ``create_trial_matches``), given a ``sort_order``, a ``hash`` and
    the engine's run id, and appended to
    ``matchengine.matches[protocol_no][sample_id]``.

    Error handling:

    * If the query raises exactly ``AutoReconnect``, ``CursorNotFound`` or
      ``ServerSelectionTimeoutError``, the task is put back on the queue, the
      current queue item is marked done and the function returns.
    * Any other query error stops the event loop; the function then continues
      with an empty result set.
    * An error while building match documents stops the event loop and is re-raised.

    :param matchengine: engine providing the query runner, task queue and match store
    :param task: the query task to run
    :param worker_id: identifier of the calling worker, used in log messages only
    """
    if matchengine.debug:
        log.info((f"Worker: {worker_id}, protocol_no: {task.trial['protocol_no']} got new QueryTask, "
                  f"{matchengine._task_q.qsize()} tasks left in queue"))
    try:
        results: Dict[ClinicalID, List[MatchReason]] = await matchengine.run_query(task.query,
                                                                                   task.clinical_ids)
    except Exception as e:
        results = dict()
        # format_tb returns the traceback as text; print_tb returns None, which
        # previously made this log line read "TRACEBACK: None".
        formatted_tb = ''.join(traceback.format_tb(e.__traceback__))
        log.error(f"ERROR: Worker: {worker_id}, error: {e}")
        log.error(f"TRACEBACK: {formatted_tb}")
        # NOTE(review): exact class match is kept from the original, so subclasses
        # of these errors are treated as fatal. Confirm whether that is intended.
        if e.__class__ in (AutoReconnect, CursorNotFound, ServerSelectionTimeoutError):
            matchengine.task_q.put_nowait(task)
            matchengine.task_q.task_done()
            # Return here: falling through would reach the task_done() at the end
            # of this function and mark the same queue item done a second time.
            return
        matchengine.loop.stop()

    try:
        matchengine.results_transformer(results)
        if not results:
            matchengine.matches.setdefault(task.match_clause_data.protocol_no, dict())
        for sample_results in results.values():
            for result in sample_results:
                matchengine.queue_task_count += 1
                if matchengine.queue_task_count % 1000 == 0 and matchengine.debug:
                    log.info(f"Trial match count: {matchengine.queue_task_count}")
                match_context_data = TrialMatch(task.trial,
                                                task.match_clause_data,
                                                task.match_path,
                                                task.query,
                                                result,
                                                matchengine.starttime)

                # allow user to extend trial_match objects in plugin functions
                # generate required fields on trial match doc before
                # generate sort_order and hash fields after all fields are added
                new_match_proto = matchengine.pre_process_trial_matches(match_context_data)
                match_document = matchengine.create_trial_matches(match_context_data, new_match_proto)
                sort_order = get_sort_order(matchengine.config['trial_match_sorting'], match_document)
                match_document['sort_order'] = sort_order
                # 'hash' and 'is_disabled' are excluded from the document hash.
                to_hash = {key: match_document[key] for key in match_document if key not in {'hash', 'is_disabled'}}
                match_document['hash'] = nested_object_hash(to_hash)
                match_document['_me_id'] = matchengine.run_id.hex

                matchengine.matches.setdefault(task.trial['protocol_no'],
                                               dict()).setdefault(match_document['sample_id'],
                                                                  list()).append(match_document)

    except Exception as e:
        matchengine.loop.stop()
        formatted_tb = ''.join(traceback.format_tb(e.__traceback__))
        log.error(f"ERROR: Worker: {worker_id}, error: {e}")
        log.error(f"TRACEBACK: {formatted_tb}")
        raise

    matchengine.task_q.task_done()
Beispiel #9
0
    def query_node_transform(self, query_node: QueryNode) -> None:
        """
        If a trial curation key/value requires alteration to a separate AND clause in the mongo query, do that here.
        Used to modify a query part dependent on another query part.

        Three cases are handled, based on the keys of the node's raw query:

        * ``STRUCTURAL_VARIANT_COMMENT`` present: the gene query parts are hidden.
          If ``STRUCTURED_SV`` is also present, the comment part is hidden and marked
          ``mcq_invalidating``; otherwise its value is replaced by a case-insensitive
          regex looking for the gene in the free text.
        * otherwise ``STRUCTURED_SV`` present: the structured SV part and the gene
          parts are hidden and a new query part built by ``build_structured_sv_query``
          is added.
        * independently, if any signature status key is present, the
          ``TRUE_HUGO_SYMBOL`` part is hidden.

        :param query_node: the query node to modify in place
        :return: None
        """

        # If a trial curation calls for a structural variant but does NOT have the structured SV data field
        # FUSION_PARTNER_HUGO_SYMBOL, then the extended_attributes query is done using a regex search of the free text
        # STRUCTURAL_VARIANT_COMMENT field on the patient's extended_attributes document.
        whole_query = query_node.extract_raw_query()

        def hide_gene_parts():
            # Stop the gene parts from being rendered into the final query.
            for part_name in ('TRUE_HUGO_SYMBOL', 'FUSION_PARTNER_HUGO_SYMBOL'):
                gene_query_part = query_node.get_query_part_by_key(part_name)
                if gene_query_part is not None:
                    gene_query_part.render = False

        # encode as full search criteria
        if 'STRUCTURAL_VARIANT_COMMENT' in whole_query:
            hide_gene_parts()
            gene = whole_query.get('TRUE_HUGO_SYMBOL')
            sv_part = query_node.get_query_part_by_key(
                'STRUCTURAL_VARIANT_COMMENT')
            if 'STRUCTURED_SV' in whole_query:
                sv_part.mcq_invalidating = True
                sv_part.render = False
            else:
                # NOTE(review): gene is interpolated unescaped, and is None when
                # TRUE_HUGO_SYMBOL is absent (the pattern then matches the text
                # "None"). Left unchanged here; confirm whether that is intended.
                sv_part.set_query_attr(
                    'STRUCTURAL_VARIANT_COMMENT',
                    re.compile(
                        rf"(.*\W{gene}\W.*)|(^{gene}\W.*)|(.*\W{gene}$)",
                        re.IGNORECASE))
        # blank-GENE -> Intergenic
        # GENE-blank -> Intergenic
        # GENE1-GENE1 -> GENE1-GENE1 # Intragenic
        # GENE1-GENE2 -> GENE1-GENE2
        elif 'STRUCTURED_SV' in whole_query:
            sv_info_part = query_node.get_query_part_by_key('STRUCTURED_SV')
            sv_info_part.render = False
            # Read both gene values before the gene parts are hidden.
            left = query_node.get_query_part_value_by_key(
                'TRUE_HUGO_SYMBOL', None)
            right = query_node.get_query_part_value_by_key(
                'FUSION_PARTNER_HUGO_SYMBOL', None)
            hide_gene_parts()
            left_query = build_structured_sv_query(left, right, 'LEFT-RIGHT')
            right_query = build_structured_sv_query(left, right, 'RIGHT-LEFT')
            # Both orientations can produce the same query; only OR them
            # together when they actually differ.
            if nested_object_hash(left_query) != nested_object_hash(right_query):
                new_query = {'$or': [left_query, right_query]}
            else:
                new_query = left_query
            query_node.add_query_part(
                QueryPart(new_query, sv_info_part.negate, True, False))

        # if signature curation is passed, do not query TRUE_HUGO_SYMBOL
        if {
                'UVA_STATUS', 'TABACCO_STATUS', 'POLE_STATUS',
                'TEMOZOLOMIDE_STATUS', 'MMR_STATUS', 'APOBEC_STATUS'
        }.intersection(set(whole_query.keys())):
            gene_part = query_node.get_query_part_by_key('TRUE_HUGO_SYMBOL')
            if gene_part is not None:
                gene_part.render = False
Beispiel #10
0
 def hash(self) -> str:
     """
     Return ``nested_object_hash(self.query)``, computing it on first use.

     The result is stored in ``self._hash`` and reused on later calls.
     """
     if self._hash is not None:
         return self._hash
     self._hash = nested_object_hash(self.query)
     return self._hash