def rerun_filters(filters=None, do_update=True, datapush_id=None):
    """
    Update all filters, or individual filters accepted as an array of ids

    :param filters: Array of filter IDs or None to run all filters
    :param do_update: When finding matches for temporary filters do not
        update db
    :param datapush_id: When all filters are rerun as part of the oncopanel
        datapush, flag new matches as 'new' and not 'pending', add datapush
        ID to matches
    """
    engine_kwargs = dict(
        plugin_dir='./filters_config/plugins',
        protocol_nos=filters,
        match_on_closed=False,
        config='./filters_config/filters_config.json',
        db_name=settings.MONGO_DBNAME,
        match_document_creator_class="DFCIFilterMatchDocumentCreator",
        report_all_clinical_reasons=True,
        trial_match_collection="match",
        chunk_size=5000,
    )
    with MatchEngine(**engine_kwargs) as me:
        me.get_matches_for_all_trials()
        if do_update:
            me.update_all_matches()
        run_id = me.run_id.hex

        fields = {"data_push_id": datapush_id}
        # A datapush id means this run is part of new data ingestion:
        # flag the run's matches as "new" (status 0) instead of pending.
        if datapush_id:
            fields["MATCH_STATUS"] = 0
        database.get_collection("match").update_many(
            {"_me_id": run_id}, {"$set": fields})
    return me.matches, run_id
def align_matches_clinical(a):
    """Embed matched filter docs (key FILTER) and enrolled filter ids
    (key ENROLLED) onto a clinical document ``a``."""
    match_coll = database.get_collection("match")
    filter_coll = database.get_collection("filter")

    # Collect filter ids matched against this clinical record, noting
    # which of those matches are enrolled (MATCH_STATUS == 4).
    matched_filter_ids = set()
    enrolled_filter_ids = set()
    for match in match_coll.find({"CLINICAL_ID": a['_id']}):
        matched_filter_ids.add(match['FILTER_ID'])
        if match['MATCH_STATUS'] == 4:
            enrolled_filter_ids.add(match['FILTER_ID'])

    # Resolve each filter id to its full document and embed on the record.
    a['FILTER'] = [filter_coll.find_one(fid) for fid in matched_filter_ids]
    a['ENROLLED'] = list(enrolled_filter_ids)
def align_matches_clinical(a):
    """
    Embed the filters that have matched a clinical document, plus the ids of
    filters whose matches are enrolled.

    Adds two keys to ``a``:
      * ``FILTER``   - list of filter documents matched to this patient
      * ``ENROLLED`` - list of filter ids with an enrolled match (status 4)

    :param a: clinical document (must contain ``_id``)
    """
    # extract the clinical id.
    clinical_id = a['_id']

    match_db = database.get_collection("match")
    filter_db = database.get_collection("filter")

    # loop through the matches and build the filter-id sets.
    matches = set()
    enrolled = set()
    for match in match_db.find({"CLINICAL_ID": clinical_id}):
        matches.add(match['FILTER_ID'])
        # MATCH_STATUS 4 marks an enrolled match.
        if match['MATCH_STATUS'] == 4:
            enrolled.add(match['FILTER_ID'])

    # resolve filter ids to documents.
    filters = []
    for filter_id in matches:
        filter_doc = filter_db.find_one(filter_id)
        # Skip dangling references: a match may outlive its filter document,
        # and embedding None would break downstream consumers of FILTER.
        if filter_doc is not None:
            filters.append(filter_doc)

    # embed in object.
    a['FILTER'] = filters
    a['ENROLLED'] = list(enrolled)
def insert_users(data, from_file=False):
    """
    Insert user accounts, creating a default one-person team for each new
    user.

    :param data: list of token lists (CSV rows), or a path to a CSV file
        when ``from_file`` is True. Row 0 is assumed to be a header and is
        skipped. Columns used: 1=first_name, 2=last_name, 3=user_name,
        4=email, 7=status ('NO' deactivates an existing user).
    :param from_file: read and parse ``data`` as a CSV file path
    """
    if from_file:
        # Open in text mode: tokens are compared against str values below
        # (tokens[7] == 'NO'), and splitting bytes with a str separator
        # raises TypeError on Python 3.
        with open(data, "r") as fin:
            data = [line.strip().split(",") for line in fin.readlines()]

    # simplify database.
    user_db = database.get_collection("user")
    team_db = database.get_collection("team")

    # skip the header row.
    for tokens in data[1:]:
        user = {
            "first_name": tokens[1],
            "last_name": tokens[2],
            "user_name": tokens[3],
            "email": tokens[4],
            "roles": ["user"]
        }

        # query for existing user.
        result = user_db.find_one({"email": tokens[4]})
        if result is not None:
            # Existing user: a 'NO' status clears user_name (deactivation).
            if tokens[7] == 'NO':
                user_db.update_one({"_id": result['_id']},
                                   {"$set": {"user_name": ""}})
            continue

        # create default team, named like "JDoe".
        # insert_one: Collection.insert was removed in pymongo 4.
        team_id = team_db.insert_one(
            {"name": user['first_name'][0] + user['last_name']}).inserted_id

        # create the user account.
        user['teams'] = [team_id]
        user_db.insert_one(user)
def delete_genomic_by_sample():
    """
    Delete all genomic documents for the SAMPLE_ID given in the request's
    query string.

    :return: JSON Response confirming success
    """
    sample_id = request.args.get("SAMPLE_ID")
    if sample_id is not None:
        database.get_collection('genomic').delete_many(
            {"SAMPLE_ID": sample_id})

    # Serialize the body explicitly: Response() expects a string/bytes
    # body, so the original raw dict would not reach the client as JSON.
    resp = Response(response=json.dumps({"success": True}),
                    status=200,
                    mimetype="application/json")
    return resp
def insert_users(data, from_file=False):
    """
    Create user accounts from CSV-style rows, plus a default team per user.

    :param data: iterable of token lists, or a CSV file path when
        ``from_file`` is True; the first row is treated as a header and
        skipped. Columns used: 1=first_name, 2=last_name, 3=user_name,
        4=email, 7=status ('NO' deactivates an existing user).
    :param from_file: parse ``data`` as a path to a CSV file
    """
    if from_file:
        # Text mode ("r"), not binary: tokens must be str so the
        # tokens[7] == 'NO' comparison works on Python 3, and splitting
        # bytes with a str separator would raise TypeError.
        with open(data, "r") as fin:
            data = [line.strip().split(",") for line in fin.readlines()]

    user_db = database.get_collection("user")
    team_db = database.get_collection("team")

    # first row is a header.
    for tokens in data[1:]:
        user = {
            "first_name": tokens[1],
            "last_name": tokens[2],
            "user_name": tokens[3],
            "email": tokens[4],
            "roles": ["user"]
        }

        existing = user_db.find_one({"email": tokens[4]})
        if existing is not None:
            # 'NO' status clears the user_name of an existing account.
            if tokens[7] == 'NO':
                user_db.update_one({"_id": existing['_id']},
                                   {"$set": {"user_name": ""}})
            continue

        # default single-member team, named like "JDoe".
        # insert_one replaces Collection.insert (removed in pymongo 4).
        team_id = team_db.insert_one(
            {"name": user['first_name'][0] + user['last_name']}).inserted_id
        user['teams'] = [team_id]
        user_db.insert_one(user)
def align_enrolled(resp):
    """Flag each item in resp['_items'] with ENROLLED=True/False based on
    whether its clinical id has any enrolled match (MATCH_STATUS 4)."""

    def _clinical_id(item):
        # CLINICAL_ID may be an embedded document (dict) or a bare id.
        cid = item['CLINICAL_ID']
        return cid['_id'] if isinstance(cid, dict) else cid

    clin_ids = {_clinical_id(item) for item in resp['_items']}

    # Single query for the subset of those clinical ids that are enrolled.
    match_db = database.get_collection("match")
    cursor = match_db.find(
        {"MATCH_STATUS": 4, "CLINICAL_ID": {"$in": list(clin_ids)}},
        {"CLINICAL_ID": 1})
    matched_ids = {m['CLINICAL_ID'] for m in cursor}

    for item in resp['_items']:
        item['ENROLLED'] = _clinical_id(item) in matched_ids
def align_enrolled(resp):
    """Annotate every response item with an ENROLLED flag: True when the
    item's clinical id has at least one match with MATCH_STATUS 4."""
    # Gather the distinct clinical ids present in the response.
    clinical_ids = set()
    for entry in resp['_items']:
        ref = entry['CLINICAL_ID']
        if isinstance(ref, dict):
            clinical_ids.add(ref['_id'])
        else:
            clinical_ids.add(ref)

    # One query fetches the subset of those ids that are enrolled.
    cursor = database.get_collection("match").find(
        {"MATCH_STATUS": 4, "CLINICAL_ID": {"$in": list(clinical_ids)}},
        {"CLINICAL_ID": 1})
    enrolled_ids = set()
    for m in cursor:
        enrolled_ids.add(m['CLINICAL_ID'])

    # Stamp the flag onto each item.
    for entry in resp['_items']:
        ref = entry['CLINICAL_ID']
        key = ref['_id'] if isinstance(ref, dict) else ref
        entry['ENROLLED'] = key in enrolled_ids
def update_match_status(cbio, item):
    """Propagate a filter's status change to its matches: deleted (2) and
    inactivated (0) filters drop their matches; any other status refreshes
    the matches' filter metadata."""
    match_db = database.get_collection('match')
    status = item['status']

    if status == 2:
        # Filter was deleted -> its matches go too.
        logging.info("filter is deleted, deleting associated matches")
        match_db.delete_many({'FILTER_ID': item['_id']})
        return

    if status == 0:
        # Filter was inactivated -> its matches go too.
        logging.info("filter is inactivated, deleting associated matches")
        match_db.delete_many({'FILTER_ID': item['_id']})
        return

    # Any other status: keep the matches, refresh their filter metadata.
    match_db.update_many(
        {'FILTER_ID': item['_id']},
        {"$set": {"FILTER_STATUS": status, "FILTER_NAME": item['label']}})
def align_other_clinical(doc):
    """
    If patient has been sampled multiple times, attach other clinical ids
    referencing those samples under key "RELATED". Remove patient's name
    from all documents.

    :param doc: clinical document (must contain ``_id`` and ``MRN``)
    :return: None; mutates ``doc`` in place
    """
    clinical_db = database.get_collection('clinical')

    # All clinical records sharing this patient's MRN (includes doc itself).
    related = list(clinical_db.find({"MRN": doc['MRN']}))

    others = []
    for clinical in related:
        # Strip patient-identifying name fields; pop() with a default avoids
        # a KeyError on records that lack one of these keys.
        for nm in ["FIRST_NAME", "LAST_NAME", "FIRST_LAST", "LAST_FIRST"]:
            clinical.pop(nm, None)
        # Exclude the record itself.
        if clinical['_id'] == doc['_id']:
            continue
        others.append(clinical)

    # add them to record.
    doc['RELATED'] = others
def update_match_status(cbio, item):
    """Sync a filter's matches with the filter's current status: statuses 2
    (deleted) and 0 (inactivated) remove matches, anything else updates
    their embedded filter fields."""
    matches = database.get_collection('match')
    selector = {'FILTER_ID': item['_id']}

    if item['status'] == 2:
        logging.info("filter is deleted, deleting associated matches")
        matches.delete_many(selector)
    elif item['status'] == 0:
        logging.info("filter is inactivated, deleting associated matches")
        matches.delete_many(selector)
    else:
        # Keep the matches but refresh the denormalized filter fields.
        new_fields = {
            "FILTER_STATUS": item['status'],
            "FILTER_NAME": item['label'],
        }
        matches.update_many(selector, {"$set": new_fields})
def update_filter_post(item, original):
    """
    After filter is updated with new "match" clause, re-find filter matches.
    A status of 3 means the filter was deleted; its matches are disabled
    instead of re-run.

    :param item: updated filter document
    :param original: filter document before the update (unused here)
    """
    if item['status'] == 3:
        # Deleted filter: flag its matches disabled rather than recomputing.
        disable = {
            '$set': {
                'is_disabled': True,
                'FILTER_STATUS': 3,
                '_updated': datetime.datetime.now(),
            }
        }
        database.get_collection('match').update_many(
            {'FILTER_ID': item['_id']}, disable)
    else:
        find_filter_matches([item])
def add_filter_run_id(item, original):
    """
    The Eve API does not allow keys to be prefixed with an underscore.
    It will return a 20X response, but not add to the DB. When rebinning
    matches in the UI, manually add back the _me_id field. Remove _created
    field as eve automatically sets it to 1970 since it is not present.

    :param item: match document being written
    :param original: previous version of the document (unused)
    """
    if '_me_id' in item:
        return

    match = list(database.get_collection('match').find(
        {'_id': item['_id']}, {'_me_id': 1}))
    # pop with a default: '_created' may legitimately be absent and must
    # not raise a KeyError here.
    item.pop('_created', None)
    # Guard against a missing match document (empty result) before
    # re-attaching _me_id.
    if match and '_me_id' in match[0]:
        item['_me_id'] = match[0]['_me_id']
def pre_get_restricted(request, lookup):
    """
    Eve pre-GET hook: require a TEAM_ID in the request's where clause and
    verify every referenced team belongs to the requesting user.

    :param request: incoming Flask request
    :param lookup: eve lookup dict (unmodified)
    """
    # get the requesting user set of teams.
    # identity check (is None), not equality.  TODO REMOVE THIS HACK
    if app.auth is None:
        teams = list(database.get_collection('team').find())
    else:
        teams = set(app.auth.get_request_auth_value()['teams'])

    # parse the query string.
    where_clause = request.args.get("where")
    if where_clause:
        clause = json.loads(where_clause)

        # check if a team_id is set.
        query_teams = False
        if 'TEAM_ID' in clause:
            # TEAM_ID may be a plain id or an operator dict (e.g. {"$in": [...]}).
            if isinstance(clause['TEAM_ID'], dict):
                team_list = next(iter(clause['TEAM_ID'].values()))
            else:
                team_list = [clause['TEAM_ID']]
            for team in team_list:
                if ObjectId(team) not in teams:
                    # emit a 404 because someone is cheating.
                    abort(404)
            # mark it as present.
            query_teams = True

        # TEAM_ID isn't present, complain.
        if not query_teams:
            resp = Response(None, 406)
            abort(406,
                  description='Resource requires TEAM_ID to be specified in where clause',
                  response=resp)
def pre_get_restricted(request, lookup):
    """
    Pre-GET hook enforcing team-scoped access: the where clause must name a
    TEAM_ID, and every referenced team must belong to the requesting user.

    :param request: incoming Flask request
    :param lookup: eve lookup dict (unmodified)
    """
    # Resolve the caller's teams.  TODO REMOVE THIS HACK
    if app.auth is None:
        teams = list(database.get_collection('team').find())
    else:
        teams = set(app.auth.get_request_auth_value()['teams'])

    # parse the query string.
    where_clause = request.args.get("where")
    if where_clause:
        clause = json.loads(where_clause)

        # check if a team_id is set.
        query_teams = False
        if 'TEAM_ID' in clause:
            if isinstance(clause['TEAM_ID'], dict):
                # dict views are not subscriptable on Python 3, so the
                # original `.values()[0]` raised TypeError; take the first
                # value via an iterator instead.
                team_list = next(iter(clause['TEAM_ID'].values()))
            else:
                team_list = [clause['TEAM_ID']]
            for team in team_list:
                if ObjectId(team) not in teams:
                    # emit a 404 because someone is cheating.
                    abort(404)
            # mark it as present.
            query_teams = True

        # TEAM_ID isn't present, complain.
        if not query_teams:
            resp = Response(None, 406)
            abort(406,
                  description='Resource requires TEAM_ID to be specified in where clause',
                  response=resp)
def align_matches_genomic(a):
    """
    Attach filter docs to genomic docs which have been matched successfully
    by filters. E.g. if a filter is seeking EGFR and a genomic document
    represents EGFR and has been positively matched, attach the EGFR filter
    to the EGFR genomic doc.

    :param a: response payload with an '_items' list of genomic documents
    :return: None; mutates the items in place
    """
    items = a['_items']
    # short circuit on an empty page.
    if not items:
        return

    # Resolve the requesting user (or the demo user when auth is disabled).
    if settings.NO_AUTH:
        logging.info("NO AUTH enabled. align_matches_genomic")
        accounts = app.data.driver.db['user']
        user = accounts.find_one({"last_name": "Doe"})
    else:
        user = app.auth.get_request_auth_value()

    # All items on the page share one clinical id.
    clinical_id = items[0]['CLINICAL_ID']
    match_db = database.get_collection('match')
    filter_db = database.get_collection('filter')

    # Map variant id -> list of filter ids that matched it.
    variant_filters = dict()
    for match in match_db.find({"CLINICAL_ID": clinical_id,
                                "is_disabled": False}):
        for variant_id in match['VARIANTS']:
            variant_filters.setdefault(variant_id, []).append(
                match['FILTER_ID'])

    user_teams = set(user['teams'])
    for item in items:
        for filter_id in variant_filters.get(item['_id'], []):
            filter_doc = filter_db.find_one(filter_id)
            if filter_doc is None:
                continue
            # Only active filters (status 1)...
            if filter_doc['status'] != 1:
                continue
            # ...owned by one of the caller's teams are attached.
            if filter_doc['TEAM_ID'] not in user_teams:
                continue
            item.setdefault('FILTER', []).append(filter_doc)

        # merge genetic event with cytoband
        if ('GENETIC_EVENT' in item and 'CYTOBAND' in item
                and item['GENETIC_EVENT'] is not None):
            item['CYTOBAND'] = '%s %s' % (item['CYTOBAND'],
                                          item['GENETIC_EVENT'])
def email_matches():
    """
    Build and queue notification emails for every team that has new filter
    matches. One email is queued per non-silenced team member.

    :return: list of the HTML bodies that were queued
    """
    # get the database links.
    match_db = database.get_collection("match")
    user_db = database.get_collection('user')
    filter_db = database.get_collection('filter')

    logging.info("emailing filter matches - starting email search")

    # get distinct list of team ids
    teams = match_db.find().distinct("TEAM_ID")

    message_list = []
    for teamid in teams:
        # get the counts.
        num_filters, num_matches = _email_counts(teamid, match_db, filter_db)
        # skip teams with no updates.
        if num_matches < 1:
            continue

        # get users in this team
        team_members = list(
            user_db.find({'teams': {'$elemMatch': {'$in': [teamid]}}}))
        for user in team_members:
            # skip users who opted out of notifications.
            if 'silent' in user and user['silent']:
                continue

            recipient_email = user['email']
            match_str = "match" if num_matches == 1 else "matches"

            # create the message.
            cur_date = datetime.date.today().strftime("%B %d, %Y")
            cur_stamp = datetime.datetime.now().strftime(
                "%I:%M%p on %B %d, %Y")

            # generate text
            html = _email_text(user, num_matches, match_str, num_filters,
                               cur_date, cur_stamp)

            db = database.get_db()
            email_item = {
                'email_from': settings.EMAIL_AUTHOR_PROTECTED,
                'email_to': recipient_email,
                'subject': 'New MatchMiner Hits - %s' % cur_date,
                'body': html,
                'cc': [],
                'sent': False,
                'num_failures': 0,
                'errors': []
            }
            # insert_one: Collection.insert was removed in pymongo 4; the
            # return value was unused, so this is a drop-in replacement.
            db['email'].insert_one(email_item)
            message_list.append(html)

    # return the message lists
    return message_list
def insert_matches(cbio, item, from_filter=True, dpi=None):
    """
    Build and bulk-insert match documents for one filter's results.

    :param cbio: CBioEngine holding the current match results (match_iter)
    :param item: the filter document that produced the matches
    :param from_filter: when True, new matches get MATCH_STATUS 1
        ("pending"); otherwise 0 ("new")
    :param dpi: data push id stamped on each match document (or None)
    """
    start_iter = time.time()
    filter_db = database.get_collection('filter')

    # Group genomic ids by (filter id, clinical id) pair; cache filter docs
    # so each filter is fetched at most once.
    pf_pairz = dict()
    filters = dict()
    for silly in cbio.match_iter():
        filter_id = item['_id']
        clinical_id = silly['CLINICAL_ID']
        key = (filter_id, clinical_id)
        if key not in pf_pairz:
            pf_pairz[key] = list()
        if filter_id not in filters:
            filter_obj = filter_db.find_one({'_id': filter_id})
            filters[filter_id] = filter_obj
        pf_pairz[key].append(silly['GENOMIC_ID'])

    # Denormalized filter fields copied onto each match document.
    user_id = item['USER_ID']
    team_id = item['TEAM_ID']
    filter_status = item['status']
    filter_name = item['label']

    # Per-id caches of clinical/genomic documents to avoid repeat lookups.
    clinical_lu = {}
    genomic_lu = {}
    matches = list()
    for key, val in pf_pairz.items():
        clinical_id = key[1]
        # NOTE(review): only the first genomic id of the pair is used to
        # pull document-level fields (hugo symbol, tier) — confirm intended.
        genomic_id = val[0]
        if clinical_id not in clinical_lu:
            clinical_lu[clinical_id] = cbio._c.find_one(clinical_id)
        if genomic_id not in genomic_lu:
            genomic_lu[genomic_id] = cbio._g.find_one(genomic_id)

        # extract clinical information, defaulting missing fields to "".
        clinical_info = [
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME', 'ONCOTREE_BIOPSY_SITE_TYPE',
            'REPORT_DATE', 'VARIANT_CATEGORY', 'MRN', 'ORD_PHYSICIAN_EMAIL'
        ]
        clinical_info_vals = [''] * len(clinical_info)
        for idx, c in enumerate(clinical_info):
            if c in clinical_lu[clinical_id]:
                clinical_info_vals[idx] = clinical_lu[clinical_id][c]
            else:
                clinical_info_vals[idx] = ""

        # extract gene symbol; fall back to the filter's own criteria when
        # the genomic document lacks one.
        true_hugo_symbol = None
        if 'TRUE_HUGO_SYMBOL' in genomic_lu[genomic_id]:
            true_hugo_symbol = genomic_lu[genomic_id]['TRUE_HUGO_SYMBOL']
        if true_hugo_symbol is None:
            filter_ = filters[key[0]]
            if 'genomic_filter' in filter_ and 'TRUE_HUGO_SYMBOL' in filter_[
                    'genomic_filter']:
                true_hugo_symbol = filter_['genomic_filter'][
                    'TRUE_HUGO_SYMBOL']
                if isinstance(true_hugo_symbol, dict):
                    # Old filter encoding nests values under an operator key;
                    # join the first value list into a display string.
                    true_hugo_symbol = ', '.join([
                        str(i) for i in next(iter(true_hugo_symbol.values()))
                    ])
        if true_hugo_symbol is None:
            logging.error("error in filter logic")

        # extract tier information
        tier = None
        if 'TIER' in genomic_lu[genomic_id]:
            tier = genomic_lu[genomic_id]['TIER']

        # MATCH_STATUS: 1 ("pending") for filter-driven runs, else 0 ("new").
        match_status = 0
        if from_filter:
            match_status = 1
        if 'protocol_id' in item:
            protocol_id = item['protocol_id']
        else:
            protocol_id = ""
        email_subject = "(%s) ONCO PANEL RESULTS" % protocol_id
        email_body = email_content(protocol_id, genomic_lu[genomic_id],
                                   clinical_lu[clinical_id])
        # NOTE(review): MMID is bytes on Python 3 (b2a_hex returns bytes) —
        # confirm downstream consumers expect bytes rather than str.
        matches.append({
            'USER_ID': user_id,
            'TEAM_ID': team_id,
            'FILTER_STATUS': filter_status,
            'MATCH_STATUS': match_status,
            'FILTER_ID': key[0],
            'CLINICAL_ID': key[1],
            'VARIANTS': val,
            'PATIENT_MRN': clinical_info_vals[clinical_info.index('MRN')],
            'MMID': binascii.b2a_hex(os.urandom(3)).upper(),
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME':
                clinical_info_vals[clinical_info.index(
                    'ONCOTREE_PRIMARY_DIAGNOSIS_NAME')],
            'ONCOTREE_BIOPSY_SITE_TYPE':
                clinical_info_vals[clinical_info.index(
                    'ONCOTREE_BIOPSY_SITE_TYPE')],
            'TRUE_HUGO_SYMBOL': true_hugo_symbol,
            'VARIANT_CATEGORY':
                clinical_info_vals[clinical_info.index('VARIANT_CATEGORY')],
            'FILTER_NAME': filter_name,
            'REPORT_DATE':
                clinical_info_vals[clinical_info.index('REPORT_DATE')],
            "EMAIL_ADDRESS":
                clinical_info_vals[clinical_info.index(
                    'ORD_PHYSICIAN_EMAIL')],
            "EMAIL_BODY": email_body,
            "EMAIL_SUBJECT": email_subject,
            'data_push_id': dpi,
            'TIER': tier
        })

    ttr_iter = time.time() - start_iter
    start_ins = time.time()
    # single bulk insert for all match documents.
    if len(matches) > 0:
        match_db = database.get_collection("match")
        match_db.insert_many(matches)
    ttr_ins = time.time() - start_ins
    logging.info(
        "match: added %d and it took %.2f to fetch, %.2f to insert" %
        (len(matches), ttr_iter, ttr_ins))
def rerun_filters(dpi=None): """ re-runs all filters against new data. preserves options set on old matches. :return: count of new matches """ # get the database links. match_db = database.get_collection('match') filter_db = database.get_collection('filter') # create the object. cbio = CBioEngine(settings.MONGO_URI, settings.MONGO_DBNAME, data_model.match_schema, muser=settings.MONGO_USERNAME, mpass=settings.MONGO_PASSWORD, collection_clinical=settings.COLLECTION_CLINICAL, collection_genomic=settings.COLLECTION_GENOMIC) query = { 'status': 1, 'temporary': False, 'trial_watch': { '$exists': False } } filters = list(filter_db.find(query)) for filter_ in filters: # lots of logging. logging.info("rerun_filters: filter: %s" % filter_['_id']) # prepare the filters. c, g, txt = prepare_criteria(filter_) # execute the match. cbio.match(c=c, g=g) if cbio.match_df is not None and cbio.genomic_df is not None and cbio.clinical_df is not None: logging.info( "rerun_filters: new matches: match=%d, genomic=%d, clinical=%d" % (len(cbio.match_df), len( cbio.genomic_df), len(cbio.clinical_df))) # get existing matches for this filter. matches = list(match_db.find({'FILTER_ID': ObjectId(filter_['_id'])})) rec_cnt = 0 for m in matches: rec_cnt += len(m['VARIANTS']) logging.info("rerun_filters: exisiting: %d %d" % (len(matches), rec_cnt)) # parse the old matches. clinical_old_id = set() old_lu = {} match_lu = {} for match in matches: # get the clincal id. clinical_id = match['CLINICAL_ID'] # now build tuples of variants. for genomic_id in match['VARIANTS']: # make pair pair = (clinical_id, genomic_id) clinical_old_id.add(pair) # build id lookup. old_lu[pair] = match['_id'] # cache matches. match_lu[pair] = match # parse the new matches. clinical_new_id = set() new_lu = {} i = 0 for match in cbio.match_iter(): # simplify. clinical_id = match['CLINICAL_ID'] genomic_id = match['GENOMIC_ID'] # build set. pair = (clinical_id, genomic_id) clinical_new_id.add(pair) # cache matches. 
match_lu[pair] = match # build lookup. new_lu[pair] = i i += 1 # find the ones which need to be deleted and delete them. to_delete = clinical_old_id - clinical_new_id logging.info("rerun_filters: removing: %d" % len(to_delete)) updated = list() for pair in to_delete: # extract ids match_id = old_lu[pair] match = match_lu[pair] # find the variant. good = list() hit = False for v in match['VARIANTS']: if v != pair[1]: good.append(v) else: hit = True # update it if necessary. if hit: # check if will empty this. if len(good) == 0: # delete it. match_db.delete_one({'_id': match_id}) else: # just update it. match_db.update({"_id": match_id}, {"$set": { "VARIANTS": good }}) # update the local one to make sure we delete all variants match['VARIANTS'] = good # find the intersection and remove them from data frame. remove_frame = clinical_new_id.intersection(clinical_old_id) bad_list = [] for pair in remove_frame: # lookup index. idx = new_lu[pair] bad_list.append(idx) logging.info("rerun_filters: skipping: %d" % len(bad_list)) # remove them. if cbio.match_df is not None and len(cbio.match_df) > 0: cbio.match_df.drop(cbio.match_df.index[bad_list], inplace=True) # insert the counts. count_matches(cbio, filter_) # insert the matches if not temporary. insert_matches(cbio, filter_, from_filter=False, dpi=dpi)
def email_matches():
    """
    Queue one notification email per non-silenced member of every team that
    has at least one new filter match.

    :return: list of generated HTML email bodies
    """
    # get the database links.
    match_db = database.get_collection("match")
    user_db = database.get_collection('user')
    filter_db = database.get_collection('filter')

    logging.info("starting email search")

    # get distinct list of team ids
    teams = match_db.find().distinct("TEAM_ID")

    message_list = []
    for teamid in teams:
        # get the counts; skip teams with no updates.
        num_filters, num_matches = _email_counts(teamid, match_db, filter_db)
        if num_matches < 1:
            continue

        # get users in this team
        team_members = list(
            user_db.find({'teams': {'$elemMatch': {'$in': [teamid]}}}))
        for user in team_members:
            # skip if silenced.
            if 'silent' in user and user['silent']:
                continue

            recipient_email = user['email']
            match_str = "match" if num_matches == 1 else "matches"

            # create the message.
            cur_date = datetime.date.today().strftime("%B %d, %Y")
            cur_stamp = datetime.datetime.now().strftime(
                "%I:%M%p on %B %d, %Y")
            html = _email_text(user, num_matches, match_str, num_filters,
                               cur_date, cur_stamp)

            db = database.get_db()
            email_item = {
                'email_from': settings.EMAIL_AUTHOR_PROTECTED,
                'email_to': recipient_email,
                'subject': 'New MatchMiner Hits - %s' % cur_date,
                'body': html,
                'cc': [],
                'sent': False,
                'num_failures': 0,
                'errors': []
            }
            # Collection.insert was removed in pymongo 4; insert_one is the
            # supported equivalent (return value unused here).
            db['email'].insert_one(email_item)
            message_list.append(html)

    # return the message lists
    return message_list
def insert_matches(cbio, item, from_filter=True, dpi=None):
    """
    Build and bulk-insert match documents for one filter's results.

    :param cbio: CBioEngine holding the current match results (match_iter)
    :param item: the filter document that produced the matches
    :param from_filter: when True, new matches get MATCH_STATUS 1
        ("pending"); otherwise 0 ("new")
    :param dpi: data push id stamped on each match document (or None)
    """
    start_iter = time.time()
    filter_db = database.get_collection('filter')

    # Group genomic ids by (filter id, clinical id) pair; cache filter docs
    # so each filter is fetched at most once.
    pf_pairz = dict()
    filters = dict()
    for silly in cbio.match_iter():
        filter_id = item['_id']
        clinical_id = silly['CLINICAL_ID']
        key = (filter_id, clinical_id)
        if key not in pf_pairz:
            pf_pairz[key] = list()
        if filter_id not in filters:
            filters[filter_id] = filter_db.find_one({'_id': filter_id})
        pf_pairz[key].append(silly['GENOMIC_ID'])

    # Denormalized filter fields copied onto each match document.
    user_id = item['USER_ID']
    team_id = item['TEAM_ID']
    filter_status = item['status']
    filter_name = item['label']

    # Per-id caches of clinical/genomic documents to avoid repeat lookups.
    clinical_lu = {}
    genomic_lu = {}
    matches = list()
    for key, val in pf_pairz.items():
        clinical_id = key[1]
        # NOTE(review): only the first genomic id of the pair is used to
        # pull document-level fields (hugo symbol, tier) — confirm intended.
        genomic_id = val[0]
        if clinical_id not in clinical_lu:
            clinical_lu[clinical_id] = cbio._c.find_one(clinical_id)
        if genomic_id not in genomic_lu:
            genomic_lu[genomic_id] = cbio._g.find_one(genomic_id)

        # extract clinical information, defaulting missing fields to "".
        clinical_info = [
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME', 'ONCOTREE_BIOPSY_SITE_TYPE',
            'REPORT_DATE', 'VARIANT_CATEGORY', 'MRN', 'ORD_PHYSICIAN_EMAIL'
        ]
        clinical_info_vals = [''] * len(clinical_info)
        for idx, c in enumerate(clinical_info):
            if c in clinical_lu[clinical_id]:
                clinical_info_vals[idx] = clinical_lu[clinical_id][c]
            else:
                clinical_info_vals[idx] = ""

        # extract gene symbol; fall back to the filter's own criteria when
        # the genomic document lacks one.
        true_hugo_symbol = None
        if 'TRUE_HUGO_SYMBOL' in genomic_lu[genomic_id]:
            true_hugo_symbol = genomic_lu[genomic_id]['TRUE_HUGO_SYMBOL']
        if true_hugo_symbol is None:
            filter_ = filters[key[0]]
            if 'genomic_filter' in filter_ and \
                    'TRUE_HUGO_SYMBOL' in filter_['genomic_filter']:
                true_hugo_symbol = filter_['genomic_filter']['TRUE_HUGO_SYMBOL']
                if isinstance(true_hugo_symbol, dict):
                    # Old filter encoding nests values under an operator key.
                    # next(iter(...)): dict views are not subscriptable on
                    # Python 3, so `.values()[0]` raised TypeError here.
                    true_hugo_symbol = ', '.join(
                        [str(i) for i in next(iter(true_hugo_symbol.values()))])
        if true_hugo_symbol is None:
            logging.error("error in filter logic")

        # extract tier information
        tier = None
        if 'TIER' in genomic_lu[genomic_id]:
            tier = genomic_lu[genomic_id]['TIER']

        # MATCH_STATUS: 1 ("pending") for filter-driven runs, else 0 ("new").
        match_status = 0
        if from_filter:
            match_status = 1
        if 'protocol_id' in item:
            protocol_id = item['protocol_id']
        else:
            protocol_id = ""
        email_subject = "(%s) ONCO PANEL RESULTS" % protocol_id
        email_body = email_content(protocol_id, genomic_lu[genomic_id],
                                   clinical_lu[clinical_id])
        # NOTE(review): MMID is bytes on Python 3 (b2a_hex returns bytes) —
        # confirm downstream consumers expect bytes rather than str.
        matches.append({
            'USER_ID': user_id,
            'TEAM_ID': team_id,
            'FILTER_STATUS': filter_status,
            'MATCH_STATUS': match_status,
            'FILTER_ID': key[0],
            'CLINICAL_ID': key[1],
            'VARIANTS': val,
            'PATIENT_MRN': clinical_info_vals[clinical_info.index('MRN')],
            'MMID': binascii.b2a_hex(os.urandom(3)).upper(),
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME':
                clinical_info_vals[clinical_info.index(
                    'ONCOTREE_PRIMARY_DIAGNOSIS_NAME')],
            'ONCOTREE_BIOPSY_SITE_TYPE':
                clinical_info_vals[clinical_info.index(
                    'ONCOTREE_BIOPSY_SITE_TYPE')],
            'TRUE_HUGO_SYMBOL': true_hugo_symbol,
            'VARIANT_CATEGORY':
                clinical_info_vals[clinical_info.index('VARIANT_CATEGORY')],
            'FILTER_NAME': filter_name,
            'REPORT_DATE':
                clinical_info_vals[clinical_info.index('REPORT_DATE')],
            "EMAIL_ADDRESS":
                clinical_info_vals[clinical_info.index(
                    'ORD_PHYSICIAN_EMAIL')],
            "EMAIL_BODY": email_body,
            "EMAIL_SUBJECT": email_subject,
            'data_push_id': dpi,
            'TIER': tier
        })

    ttr_iter = time.time() - start_iter
    start_ins = time.time()
    # single bulk insert for all match documents.
    if len(matches) > 0:
        match_db = database.get_collection("match")
        match_db.insert_many(matches)
    ttr_ins = time.time() - start_ins
    logging.info(
        "match: added %d and it took %.2f to fetch, %.2f to insert" %
        (len(matches), ttr_iter, ttr_ins))
def rerun_filters(dpi=None):
    """
    re-runs all filters against new data. preserves options set on old
    matches.

    :param dpi: data push id forwarded to insert_matches
    :return: None (matches are written to the db)
    """
    # get the database links.
    match_db = database.get_collection('match')
    filter_db = database.get_collection('filter')

    # engine used to evaluate filter criteria against clinical/genomic data.
    cbio = CBioEngine(settings.MONGO_URI,
                      settings.MONGO_DBNAME,
                      data_model.match_schema,
                      muser=settings.MONGO_USERNAME,
                      mpass=settings.MONGO_PASSWORD,
                      collection_clinical=settings.COLLECTION_CLINICAL,
                      collection_genomic=settings.COLLECTION_GENOMIC)

    # only active, permanent, non-trial-watch filters are re-run.
    query = {'status': 1, 'temporary': False,
             'trial_watch': {'$exists': False}}
    filters = list(filter_db.find(query))
    for filter_ in filters:
        logging.info("rerun_filters: filter: %s" % filter_['_id'])

        # prepare the criteria and execute the match.
        c, g, txt = prepare_criteria(filter_)
        cbio.match(c=c, g=g)
        if cbio.match_df is not None and cbio.genomic_df is not None \
                and cbio.clinical_df is not None:
            logging.info(
                "rerun_filters: new matches: match=%d, genomic=%d, clinical=%d"
                % (len(cbio.match_df), len(cbio.genomic_df),
                   len(cbio.clinical_df)))

        # get existing matches for this filter.
        matches = list(match_db.find({'FILTER_ID': ObjectId(filter_['_id'])}))
        rec_cnt = 0
        for m in matches:
            rec_cnt += len(m['VARIANTS'])
        logging.info("rerun_filters: exisiting: %d %d" %
                     (len(matches), rec_cnt))

        # index the old matches by (clinical id, genomic id) pair.
        clinical_old_id = set()
        old_lu = {}
        match_lu = {}
        for match in matches:
            clinical_id = match['CLINICAL_ID']
            for genomic_id in match['VARIANTS']:
                pair = (clinical_id, genomic_id)
                clinical_old_id.add(pair)
                # id lookup and match cache for the delete pass below.
                old_lu[pair] = match['_id']
                match_lu[pair] = match

        # index the new matches by pair, remembering each row's position so
        # duplicate rows can be dropped from the data frame by index.
        clinical_new_id = set()
        new_lu = {}
        i = 0
        for match in cbio.match_iter():
            pair = (match['CLINICAL_ID'], match['GENOMIC_ID'])
            clinical_new_id.add(pair)
            match_lu[pair] = match
            new_lu[pair] = i
            i += 1

        # delete pairs that no longer match.
        to_delete = clinical_old_id - clinical_new_id
        logging.info("rerun_filters: removing: %d" % len(to_delete))
        for pair in to_delete:
            match_id = old_lu[pair]
            match = match_lu[pair]

            # drop this variant from the match's VARIANTS list.
            good = list()
            hit = False
            for v in match['VARIANTS']:
                if v != pair[1]:
                    good.append(v)
                else:
                    hit = True

            if hit:
                if len(good) == 0:
                    # no variants left: delete the whole match.
                    match_db.delete_one({'_id': match_id})
                else:
                    # update_one replaces the deprecated Collection.update,
                    # which was removed in pymongo 4; single-doc semantics
                    # are identical for an _id query.
                    match_db.update_one({"_id": match_id},
                                        {"$set": {"VARIANTS": good}})
                # update the local copy so repeated pairs referencing the
                # same match see the shrunk variant list.
                match['VARIANTS'] = good

        # pairs that already exist are dropped from the new-match frame so
        # they are not re-inserted.
        remove_frame = clinical_new_id.intersection(clinical_old_id)
        bad_list = []
        for pair in remove_frame:
            bad_list.append(new_lu[pair])
        logging.info("rerun_filters: skipping: %d" % len(bad_list))
        if cbio.match_df is not None and len(cbio.match_df) > 0:
            cbio.match_df.drop(cbio.match_df.index[bad_list], inplace=True)

        # insert the counts and the remaining (new) matches.
        count_matches(cbio, filter_)
        insert_matches(cbio, filter_, from_filter=False, dpi=dpi)
def transform_filter_to_CTML(items, save=False):
    """
    Transform filters clinical & genomic key objects into CTML.

    This is a placeholder function which is present in order to ensure
    backwards compatibility with UI filter generation. Eventually, this
    function should be removed as CTML ideally would be generated correctly
    in the frontend and saved/sent directly to the filter engine for
    matching.

    :param save: When calling function explicitly (not part of eve built-in
        hook), explicitly update filter in db. Usually eve does this
        automatically
    :param items: List of filters
    """
    for item in items:
        # MMR_STATUS should always be on the genomic filter
        if 'clinical_filter' in item and 'MMR_STATUS' in item['clinical_filter']:
            item['genomic_filter']['MMR_STATUS'] = \
                item['clinical_filter']['MMR_STATUS']
            del item['clinical_filter']['MMR_STATUS']

        item['match'] = []
        or_clauses = []
        # Initialize up front: previously this name was only bound inside
        # the genomic branch, so filters without a genomic_filter raised a
        # NameError when it was referenced below.
        cleaned_or_clauses = []
        genomic_and = {}
        if 'genomic_filter' in item:
            genomic_filter = item['genomic_filter']
            multis = []
            for k, v in genomic_filter.items():
                # Skip wildtypes, null values & empty lists
                if k in ['WILDTYPE'] or v is None or \
                        (isinstance(v, list) and len(v) == 0):
                    continue
                # Old filters used a custom encoding where keys were nested
                # inside an '^in' key. New filters do not do this anymore,
                # but in case any are leftover, catch and remove.
                elif isinstance(v, dict):
                    if '^in' in v:
                        genomic_and[k] = v['^in']
                        genomic_filter[k] = v['^in']
                    operator = ""
                    op_sign = ""
                    if '^lt' in v and v['^lt'] is not None:
                        operator = '^lt'
                        op_sign = '<'
                    if '^gt' in v and v['^gt'] is not None:
                        operator = '^gt'
                        op_sign = '>'
                    if operator != "" and k == 'ALLELE_FRACTION':
                        # Whole-number strings are percentages; scale to 0-1.
                        if isinstance(v[operator], str) and \
                                '.' not in v[operator]:
                            v[operator] = float(v[operator]) / 100
                        genomic_and[k] = op_sign + str(v[operator])
                elif isinstance(v, list) and len(v) == 1:
                    if v[0] is not None and v[0] != "":
                        genomic_and[k] = v[0]
                # multi criteria should be used to create OR criteria later
                elif isinstance(v, list) and len(v) > 1:
                    multis.append(k)
                else:
                    genomic_and[k] = v

            # If a user has selected multiple criteria, generate all possible
            # OR CTML nodes (cartesian product over the multi-valued keys).
            # Filter out error clauses later. A user may select multiple
            # genes, variant categories, or multiples of both categories.
            if len(multis) == 1:
                for val in genomic_filter[multis[0]]:
                    or_clause = copy.deepcopy(genomic_and)
                    or_clause[multis[0]] = val
                    or_clauses.append({"genomic": or_clause})
            elif len(multis) == 2:
                for val in genomic_filter[multis[0]]:
                    for i_val in genomic_filter[multis[1]]:
                        or_clause = copy.deepcopy(genomic_and)
                        or_clause[multis[0]] = val
                        or_clause[multis[1]] = i_val
                        or_clauses.append({"genomic": or_clause})
            elif len(multis) == 3:
                for val in genomic_filter[multis[0]]:
                    for i_val in genomic_filter[multis[1]]:
                        for i_i_val in genomic_filter[multis[2]]:
                            or_clause = copy.deepcopy(genomic_and)
                            or_clause[multis[0]] = val
                            or_clause[multis[1]] = i_val
                            or_clause[multis[2]] = i_i_val
                            or_clauses.append({"genomic": or_clause})

            # remove CNV_CALL's when VARIANT_CATEGORY is MUTATION or SV
            for or_clause in or_clauses:
                or_node = or_clause['genomic']
                if 'VARIANT_CATEGORY' in or_node and 'CNV_CALL' in or_node and \
                        (or_node['VARIANT_CATEGORY'] == 'MUTATION' or
                         or_node['VARIANT_CATEGORY'] == 'SV'):
                    del or_node['CNV_CALL']

            # remove duplicate nodes
            for i in range(len(or_clauses)):
                if or_clauses[i] not in or_clauses[i + 1:]:
                    cleaned_or_clauses.append(or_clauses[i])

        clinical_and = {}
        if 'clinical_filter' in item:
            for (k, v) in item['clinical_filter'].items():
                if v is None:
                    continue
                if k == 'BIRTH_DATE':
                    # TODO remove once filter backfill is complete
                    if 'AGE_NUMERICAL' in item['clinical_filter']:
                        continue
                    operator, integer = transform_date_to_range(v)
                    clinical_and['AGE_NUMERICAL'] = f"{operator}{str(integer)}"
                elif k == 'AGE_NUMERICAL':
                    operator, difference = transform_age_to_CTML(v)
                    clinical_and[k] = f"{operator}{difference}"
                else:
                    clinical_and[k] = v

        and_clause = {"and": [{"clinical": clinical_and}]}
        if genomic_and:
            and_clause['and'].append({"genomic": genomic_and})
        if cleaned_or_clauses:
            and_clause["and"].append({"or": cleaned_or_clauses})
            # The flat genomic AND node is already baked into every OR
            # clause, so drop it — but only when it was actually appended.
            # Unconditionally deleting index 1 removed the OR node itself
            # whenever genomic_and was empty (e.g. only multi-valued keys).
            if genomic_and:
                del and_clause['and'][1]

        item['match'] = [and_clause]
        item['description'] = get_filter_description(item)
        if save:
            database.get_collection("filter").replace_one(
                {"_id": item['_id']}, item)