コード例 #1
0
ファイル: miner.py プロジェクト: pughlab/matchminer-api
def rerun_filters(filters=None, do_update=True, datapush_id=None):
    """
    Re-run all filters, or a subset given as an array of filter IDs.

    :param filters: Array of filter IDs, or None to run every filter
    :param do_update: When finding matches for temporary filters, skip the
        db update
    :param datapush_id: When all filters are re-run as part of the oncopanel
        datapush, flag new matches as 'new' (not 'pending') and stamp the
        datapush ID onto each match
    """
    engine_args = dict(
        plugin_dir='./filters_config/plugins',
        protocol_nos=filters,
        match_on_closed=False,
        config='./filters_config/filters_config.json',
        db_name=settings.MONGO_DBNAME,
        match_document_creator_class="DFCIFilterMatchDocumentCreator",
        report_all_clinical_reasons=True,
        trial_match_collection="match",
        chunk_size=5000)

    with MatchEngine(**engine_args) as me:
        me.get_matches_for_all_trials()
        if do_update:
            me.update_all_matches()

        run_id = me.run_id.hex

        # Matches produced by this run are tagged with the engine run id;
        # stamp the data push id onto them (and reset status on a datapush).
        fields = {"data_push_id": datapush_id}
        if datapush_id:
            # 0 == "new" status, set only during new data ingestion
            fields["MATCH_STATUS"] = 0

        database.get_collection("match").update_many(
            {"_me_id": run_id}, {"$set": fields})
    return me.matches, run_id
コード例 #2
0
def align_matches_clinical(a):
    """
    Attach filter documents and enrollment info to a clinical record.

    Looks up every match for the clinical document, embeds the matched
    filter docs under 'FILTER' and the ids of filters whose match is
    enrolled (MATCH_STATUS == 4) under 'ENROLLED'.
    :param a: clinical document (mutated in place)
    :return: None
    """
    # extract the clinical id.
    clinical_id = a['_id']

    # lookup any matches.
    match_db = database.get_collection("match")
    filter_db = database.get_collection("filter")

    # collect matched filter ids, noting which matches are enrolled.
    matched_filter_ids = set()
    enrolled = set()
    for match in match_db.find({"CLINICAL_ID": clinical_id}):

        # build lookup.
        matched_filter_ids.add(match['FILTER_ID'])

        # MATCH_STATUS 4 marks an enrolled match.
        if match['MATCH_STATUS'] == 4:
            enrolled.add(match['FILTER_ID'])

    # fetch each matched filter document. The loop variable was renamed
    # so the builtin `filter` is no longer shadowed.
    filters = [filter_db.find_one(filter_id)
               for filter_id in matched_filter_ids]

    # embed in object.
    a['FILTER'] = filters
    a['ENROLLED'] = list(enrolled)
コード例 #3
0
ファイル: events.py プロジェクト: dfci/matchminer-api
def align_matches_clinical(a):
    """
    Embed matched filter documents ('FILTER') and enrolled filter ids
    ('ENROLLED') on the given clinical document.

    :param a: clinical document (mutated in place)
    """
    match_db = database.get_collection("match")
    filter_db = database.get_collection("filter")

    # scan matches for this patient, noting every matched filter and
    # which of those matches are enrolled (MATCH_STATUS == 4).
    filter_ids = set()
    enrolled_ids = set()
    for match in match_db.find({"CLINICAL_ID": a['_id']}):
        fid = match['FILTER_ID']
        filter_ids.add(fid)
        if match['MATCH_STATUS'] == 4:
            enrolled_ids.add(fid)

    # fetch each matched filter document, one lookup per filter id.
    filter_docs = [filter_db.find_one(fid) for fid in filter_ids]

    # embed results on the clinical record.
    a['FILTER'] = filter_docs
    a['ENROLLED'] = list(enrolled_ids)
コード例 #4
0
def insert_users(data, from_file=False):
    """
    Insert user accounts (and a default team per user) from CSV-like rows.

    :param data: list of token rows, or a file path when from_file=True.
        Row layout assumed: [_, first_name, last_name, user_name, email,
        _, _, active_flag] — TODO confirm against the data source.
    :param from_file: when True, `data` is a path to a comma-separated file
    """

    # load the data from disk if requested.
    if from_file:
        # open in text mode: lines are split with a str separator below.
        # (The old "rb" mode raised TypeError on Python 3: bytes cannot be
        # split with a str argument.)
        with open(data, "r") as fin:
            lines = fin.readlines()

        # build equivalent token-row list. (A dead per-line `tokens = ...`
        # assignment was removed.)
        data = [line.strip().split(",") for line in lines]

    # simplify database.
    user_db = database.get_collection("user")
    team_db = database.get_collection("team")

    # build equivalent, skipping the header row.
    for tokens in data[1:]:

        # simplify.
        user = {
            "first_name": tokens[1],
            "last_name": tokens[2],
            "user_name": tokens[3],
            "email": tokens[4],
            "roles": ["user"]
        }

        # query for existing user by email.
        result = user_db.find_one({"email": tokens[4]})

        # deal with existing user.
        if result is not None:

            # token 7 == 'NO' clears the user_name (account deactivation).
            if tokens[7] == 'NO':
                user_db.update_one({"_id": result['_id']},
                                   {"$set": {
                                       "user_name": ""
                                   }})

            # skip: never re-insert an existing user.
            continue

        # create default team named first initial + last name.
        team_id = team_db.insert(
            {"name": user['first_name'][0] + user['last_name']})

        # create the user account linked to its team.
        user['teams'] = [team_id]

        # insert the user.
        user_db.insert(user)
コード例 #5
0
ファイル: custom.py プロジェクト: pughlab/matchminer-api
def delete_genomic_by_sample():
    """
    Delete all genomic documents for the SAMPLE_ID given as a query arg.

    :return: a Flask JSON Response {"success": true}
    """
    import json

    sample_id = request.args.get("SAMPLE_ID")

    if sample_id is not None:
        database.get_collection('genomic').delete_many(
            {"SAMPLE_ID": sample_id})

    # encode response. Serialize the payload: Flask's Response expects a
    # str/bytes body, and the previous raw-dict argument did not produce
    # valid JSON output.
    resp = Response(response=json.dumps({"success": True}),
                    status=200,
                    mimetype="application/json")

    return resp
コード例 #6
0
ファイル: utilities.py プロジェクト: dfci/matchminer-api
def insert_users(data, from_file=False):
    """
    Insert user accounts (and a default team per user) from CSV-like rows.

    :param data: list of token rows, or a file path when from_file=True.
        Row layout assumed: [_, first_name, last_name, user_name, email,
        _, _, active_flag] — TODO confirm against the data source.
    :param from_file: when True, `data` is a path to a comma-separated file
    """

    # load the data from disk if requested.
    if from_file:
        # open in text mode: lines are split with a str separator below.
        # (The old "rb" mode raised TypeError on Python 3: bytes cannot be
        # split with a str argument.)
        with open(data, "r") as fin:
            lines = fin.readlines()

        # build equivalent token-row list. (A dead per-line `tokens = ...`
        # assignment was removed.)
        data = [line.strip().split(",") for line in lines]

    # simplify database.
    user_db = database.get_collection("user")
    team_db = database.get_collection("team")

    # build equivalent, skipping the header row.
    for tokens in data[1:]:

        # simplify.
        user = {
            "first_name": tokens[1],
            "last_name": tokens[2],
            "user_name": tokens[3],
            "email": tokens[4],
            "roles": ["user"]
        }

        # query for existing user by email.
        result = user_db.find_one({"email": tokens[4]})

        # deal with existing user.
        if result is not None:

            # token 7 == 'NO' clears the user_name (account deactivation).
            if tokens[7] == 'NO':
                user_db.update_one({"_id": result['_id']}, {"$set": {"user_name": ""}})

            # skip: never re-insert an existing user.
            continue

        # create default team named first initial + last name.
        team_id = team_db.insert({"name": user['first_name'][0] + user['last_name']})

        # create the user account linked to its team.
        user['teams'] = [team_id]

        # insert the user.
        user_db.insert(user)
コード例 #7
0
def align_enrolled(resp):
    """
    Annotate each response item with an 'ENROLLED' boolean: True when the
    patient behind the item has at least one enrolled match
    (MATCH_STATUS == 4), else False.

    :param resp: Eve response payload; items are mutated in place
    """
    def _clinical_id(item):
        # CLINICAL_ID may be an embedded document or a bare id.
        ref = item['CLINICAL_ID']
        return ref['_id'] if isinstance(ref, dict) else ref

    # gather clinical ids for all items.
    clin_ids = {_clinical_id(item) for item in resp['_items']}

    # single query for the enrolled subset of those patients.
    match_db = database.get_collection("match")
    cursor = match_db.find(
        {"MATCH_STATUS": 4, "CLINICAL_ID": {"$in": list(clin_ids)}},
        {"CLINICAL_ID": 1})
    matched_ids = {match['CLINICAL_ID'] for match in cursor}

    # flag each item.
    for item in resp['_items']:
        item['ENROLLED'] = _clinical_id(item) in matched_ids
コード例 #8
0
ファイル: events.py プロジェクト: dfci/matchminer-api
def align_enrolled(resp):
    """
    Set 'ENROLLED' on every response item: True when the patient behind
    the item has an enrolled match (MATCH_STATUS == 4), False otherwise.

    :param resp: Eve response payload; items are mutated in place
    """
    items = resp['_items']

    # CLINICAL_ID can be an embedded document or a plain id; normalize.
    ids = []
    for item in items:
        cid = item['CLINICAL_ID']
        if isinstance(cid, dict):
            cid = cid['_id']
        ids.append(cid)

    # one query for the enrolled subset of those patients.
    match_db = database.get_collection("match")
    enrolled_ids = set()
    query = {"MATCH_STATUS": 4, "CLINICAL_ID": {"$in": list(set(ids))}}
    for match in match_db.find(query, {"CLINICAL_ID": 1}):
        enrolled_ids.add(match['CLINICAL_ID'])

    # annotate items in order.
    for item, cid in zip(items, ids):
        item['ENROLLED'] = cid in enrolled_ids
コード例 #9
0
ファイル: miner.py プロジェクト: dfci/matchminer-api
def update_match_status(cbio, item):
    """
    Propagate a filter's status change onto its matches.

    Deleted (status 2) or inactivated (status 0) filters have their
    matches removed; any other status refreshes the matches' denormalized
    FILTER_STATUS / FILTER_NAME fields.

    :param cbio: engine handle (unused here)
    :param item: the filter document whose status changed
    """
    match_db = database.get_collection('match')
    status = item['status']

    if status == 2:
        # filter deleted: remove its matches.
        logging.info("filter is deleted, deleting associated matches")
        match_db.delete_many({'FILTER_ID': item['_id']})
    elif status == 0:
        # filter inactivated: matches are removed as well.
        logging.info("filter is inactivated, deleting associated matches")
        match_db.delete_many({'FILTER_ID': item['_id']})
    else:
        # active filter: refresh the fields copied onto each match.
        update = {"$set": {"FILTER_STATUS": status,
                           "FILTER_NAME": item['label']}}
        match_db.update_many({'FILTER_ID': item['_id']}, update)
コード例 #10
0
ファイル: clinical.py プロジェクト: pughlab/matchminer-api
def align_other_clinical(doc):
    """
    If patient has been sampled multiple times, attach other clinical ids
    referencing those samples under key "RELATED".

    Remove patient's name from all documents.
    :param doc: clinical document (mutated in place)
    :return: None
    """

    # lookup any matches.
    clinical_db = database.get_collection('clinical')

    # all clinical records sharing this patient's MRN (includes doc itself).
    related = list(clinical_db.find({"MRN": doc['MRN']}))

    # strip names and drop the record itself.
    tmp = []
    for clinical in related:

        # pop() tolerates partial records missing a name field
        # (the old `del` raised KeyError on them).
        for nm in ["FIRST_NAME", "LAST_NAME", "FIRST_LAST", "LAST_FIRST"]:
            clinical.pop(nm, None)

        if clinical['_id'] == doc['_id']:
            continue
        tmp.append(clinical)

    # add them to record.
    doc['RELATED'] = tmp
コード例 #11
0
def update_match_status(cbio, item):
    """
    Sync the matches belonging to a filter with the filter's new status.

    Status 2 (deleted) and status 0 (inactivated) both drop the matches;
    any other status updates the matches' denormalized filter fields.

    :param cbio: engine handle (unused here)
    :param item: the filter document whose status changed
    """
    match_db = database.get_collection('match')
    by_filter = {'FILTER_ID': item['_id']}
    status = item['status']

    if status == 2:
        logging.info("filter is deleted, deleting associated matches")
        match_db.delete_many(by_filter)
        return

    if status == 0:
        logging.info("filter is inactivated, deleting associated matches")
        match_db.delete_many(by_filter)
        return

    # active filter: refresh name/status stored on each match.
    match_db.update_many(by_filter, {
        "$set": {
            "FILTER_STATUS": status,
            "FILTER_NAME": item['label']
        },
    })
コード例 #12
0
ファイル: miner.py プロジェクト: pughlab/matchminer-api
def update_filter_post(item, original):
    """
    After filter is updated with new "match" clause, re-find filter matches.

    Status 3 means the filter was deleted: its matches are disabled
    instead of being re-found.
    :param item: updated filter document
    :param original: filter document prior to the update (unused)
    :return: None
    """
    if item['status'] == 3:
        # filter deleted: disable its matches rather than re-matching.
        disable = {
            '$set': {
                'is_disabled': True,
                'FILTER_STATUS': 3,
                '_updated': datetime.datetime.now()
            }
        }
        database.get_collection('match').update_many(
            {'FILTER_ID': item['_id']}, disable)
    else:
        find_filter_matches([item])
コード例 #13
0
ファイル: match.py プロジェクト: pughlab/matchminer-api
def add_filter_run_id(item, original):
    """
    The Eve API does not allow keys to be prefixed with an underscore. It will
    return a 20X response, but not add to the DB.

    When rebinning matches in the UI, manually add back the _me_id field.

    Remove _created field as eve automatically sets it to 1970 since it is not
    present.
    :param item: match document being updated (mutated in place)
    :param original: prior version of the document (unused)
    :return: None
    """
    if '_me_id' not in item:
        # find_one returns None when the match is missing, instead of the
        # old list(find(...))[0] which raised IndexError.
        match = database.get_collection('match').find_one(
            {'_id': item['_id']}, {'_me_id': 1})

        # drop _created; tolerate its absence (plain pop raised KeyError).
        item.pop('_created', None)

        # restore the engine run id from the stored match, if present.
        if match is not None and '_me_id' in match:
            item['_me_id'] = match['_me_id']
コード例 #14
0
def pre_get_restricted(request, lookup):
    """
    Eve pre-GET hook enforcing team-scoped access: the where clause must
    specify a TEAM_ID, and every requested team must belong to the caller.

    :param request: incoming Flask request
    :param lookup: Eve lookup dict (not modified here)
    """
    # get the requesting user set of teams.
    # identity check: `app.auth == None` replaced with `is None`.
    if app.auth is None:
        # TODO REMOVE THIS HACK
        teams = list(database.get_collection('team').find())
    else:
        teams = set(app.auth.get_request_auth_value()['teams'])

    # parse the query string.
    where_clause = request.args.get("where")
    if where_clause:

        # parse the value.
        clause = json.loads(where_clause)

        # check if a team_id is set.
        query_teams = False
        if 'TEAM_ID' in clause:

            # check if it is legit. The dict form is a mongo clause such
            # as {"$in": [...]}; take its first value.
            if isinstance(clause['TEAM_ID'], dict):
                team_list = next(iter(clause['TEAM_ID'].values()))
            else:
                team_list = [clause['TEAM_ID']]

            for team in team_list:
                if ObjectId(team) not in teams:

                    # emit a 404 because someone is cheating.
                    abort(404)

            # mark it as present.
            query_teams = True

        # TEAM_ID isn't present, complain.
        if not query_teams:

            resp = Response(None, 406)
            abort(406,
                  description=
                  'Resource requires TEAM_ID to be specified in where clause',
                  response=resp)
コード例 #15
0
ファイル: events.py プロジェクト: dfci/matchminer-api
def pre_get_restricted(request, lookup):
    """
    Eve pre-GET hook enforcing team-scoped access: the where clause must
    specify a TEAM_ID, and every requested team must belong to the caller.

    :param request: incoming Flask request
    :param lookup: Eve lookup dict (not modified here)
    """
    # get the requesting user set of teams.
    # identity check: `app.auth == None` replaced with `is None`.
    if app.auth is None:
        # TODO REMOVE THIS HACK
        teams = list(database.get_collection('team').find())
    else:
        teams = set(app.auth.get_request_auth_value()['teams'])

    # parse the query string.
    where_clause = request.args.get("where")
    if where_clause:

        # parse the value.
        clause = json.loads(where_clause)

        # check if a team_id is set.
        query_teams = False
        if 'TEAM_ID' in clause:

            # check if it is legit. dict_values is not subscriptable on
            # Python 3, so take the first value via an iterator — the old
            # `.values()[0]` raised TypeError.
            if isinstance(clause['TEAM_ID'], dict):
                team_list = next(iter(clause['TEAM_ID'].values()))
            else:
                team_list = [clause['TEAM_ID']]

            for team in team_list:
                if ObjectId(team) not in teams:

                    # emit a 404 because someone is cheating.
                    abort(404)

            # mark it as present.
            query_teams = True

        # TEAM_ID isn't present, complain.
        if not query_teams:

            resp = Response(None, 406)
            abort(406, description='Resource requires TEAM_ID to be specified in where clause', response=resp)
コード例 #16
0
ファイル: genomic.py プロジェクト: pughlab/matchminer-api
def align_matches_genomic(a):
    """
    Attach filter docs to genomic docs which have been matched successfully
    by filters.

    E.g. If a filter is seeking EGFR, and a genomic document represents EGFR
    and has been positively matched, attach the EGFR filter to the EGFR
    genomic doc.
    :param a: Eve response payload; a['_items'] holds genomic documents,
        mutated in place
    :return: None
    """

    # short circuit.
    if len(a['_items']) == 0:
        return

    # get the user.
    if settings.NO_AUTH:
        logging.info("NO AUTH enabled. align_matches_genomic")
        # auth bypass: fall back to the fixed "Doe" demo account.
        accounts = app.data.driver.db['user']
        user = accounts.find_one({"last_name": "Doe"})
    else:
        user = app.auth.get_request_auth_value()

    # extract the clinical id.
    # NOTE(review): only the first item's CLINICAL_ID is consulted —
    # assumes all items belong to one patient; confirm with callers.
    clinical_id = a['_items'][0]['CLINICAL_ID']

    match_db = database.get_collection('match')
    filter_db = database.get_collection('filter')

    # map variant (genomic) id -> list of filter ids that matched it,
    # considering only enabled matches.
    variants = dict()
    for match in match_db.find({
            "CLINICAL_ID": clinical_id,
            "is_disabled": False
    }):
        for variant_id in match['VARIANTS']:
            if variant_id not in variants:
                variants[variant_id] = list()

            variants[variant_id].append(match['FILTER_ID'])

    for item in a['_items']:
        if item['_id'] in variants:
            for filter_id in variants[item['_id']]:

                filter_doc = filter_db.find_one(filter_id)
                if filter_doc is None:
                    continue

                # check status: only active filters (status 1) attach.
                if filter_doc['status'] != 1:
                    continue

                # check ownership: the filter's team must be one of the
                # requesting user's teams.
                if filter_doc['TEAM_ID'] not in set(user['teams']):
                    continue

                # embed this in filter.
                if 'FILTER' not in item:
                    item['FILTER'] = list()

                item['FILTER'].append(filter_doc)

        # merge genetic event with cytoband for display.
        if 'GENETIC_EVENT' in item and 'CYTOBAND' in item and item[
                'GENETIC_EVENT'] is not None:
            item['CYTOBAND'] = '%s %s' % (item['CYTOBAND'],
                                          item['GENETIC_EVENT'])
コード例 #17
0
def email_matches():
    """
    Queue notification emails for teams that have new filter matches.

    For every team with at least one new match, an email document is
    inserted into the 'email' collection for each non-silenced member.
    :return: list of generated html message bodies
    """
    match_db = database.get_collection("match")
    user_db = database.get_collection('user')
    filter_db = database.get_collection('filter')

    logging.info("emailing filter matches - starting email search")

    message_list = []

    # one pass per team that owns matches.
    for teamid in match_db.find().distinct("TEAM_ID"):

        # counts of filters and new matches for this team.
        num_filters, num_matches = _email_counts(teamid, match_db, filter_db)

        # nothing new for this team.
        if num_matches < 1:
            continue

        # notify every member of the team.
        members = list(
            user_db.find({'teams': {
                '$elemMatch': {
                    '$in': [teamid]
                }
            }}))
        for user in members:

            # honor the user's "silent" preference.
            if 'silent' in user and user['silent']:
                continue

            recipient_email = user['email']
            match_str = "match" if num_matches == 1 else "matches"

            # timestamps for the message body.
            cur_date = datetime.date.today().strftime("%B %d, %Y")
            cur_stamp = datetime.datetime.now().strftime(
                "%I:%M%p on %B %d, %Y")

            # generate text.
            html = _email_text(user, num_matches, match_str, num_filters,
                               cur_date, cur_stamp)

            # queue the email for the sender daemon.
            db = database.get_db()
            db['email'].insert({
                'email_from': settings.EMAIL_AUTHOR_PROTECTED,
                'email_to': recipient_email,
                'subject': 'New MatchMiner Hits - %s' % cur_date,
                'body': html,
                'cc': [],
                'sent': False,
                'num_failures': 0,
                'errors': []
            })

            message_list.append(html)

    return message_list
コード例 #18
0
def insert_matches(cbio, item, from_filter=True, dpi=None):
    """
    Build and bulk-insert match documents for every (filter, patient) pair
    produced by the engine's match iterator.

    :param cbio: CBioEngine instance exposing match_iter() and the clinical
        (_c) / genomic (_g) collections
    :param item: the filter document that produced the matches
    :param from_filter: when True new matches get MATCH_STATUS 1,
        otherwise 0
    :param dpi: data push id stamped onto each match (or None)
    """
    start_iter = time.time()
    filter_db = database.get_collection('filter')

    # group genomic variant ids by (filter, patient) pair.
    pf_pairz = dict()
    filters = dict()
    for silly in cbio.match_iter():

        filter_id = item['_id']
        clinical_id = silly['CLINICAL_ID']
        key = (filter_id, clinical_id)

        if key not in pf_pairz:
            pf_pairz[key] = list()

        # cache the filter document: one lookup per filter id.
        if filter_id not in filters:
            filters[filter_id] = filter_db.find_one({'_id': filter_id})

        pf_pairz[key].append(silly['GENOMIC_ID'])

    user_id = item['USER_ID']
    team_id = item['TEAM_ID']
    filter_status = item['status']
    filter_name = item['label']
    clinical_lu = {}
    genomic_lu = {}

    matches = list()
    for key, val in pf_pairz.items():

        clinical_id = key[1]
        genomic_id = val[0]

        # cache clinical / genomic documents across pairs.
        if clinical_id not in clinical_lu:
            clinical_lu[clinical_id] = cbio._c.find_one(clinical_id)

        if genomic_id not in genomic_lu:
            genomic_lu[genomic_id] = cbio._g.find_one(genomic_id)

        # extract clinical information (blank when a field is missing).
        clinical_info = [
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME', 'ONCOTREE_BIOPSY_SITE_TYPE',
            'REPORT_DATE', 'VARIANT_CATEGORY', 'MRN', 'ORD_PHYSICIAN_EMAIL'
        ]
        clinical_info_vals = [
            clinical_lu[clinical_id].get(c, "") for c in clinical_info
        ]

        # extract gene symbol; fall back to the filter's genomic clause.
        true_hugo_symbol = genomic_lu[genomic_id].get('TRUE_HUGO_SYMBOL')

        if true_hugo_symbol is None:

            filter_ = filters[key[0]]
            if 'genomic_filter' in filter_ and \
                    'TRUE_HUGO_SYMBOL' in filter_['genomic_filter']:

                true_hugo_symbol = \
                    filter_['genomic_filter']['TRUE_HUGO_SYMBOL']

                # the dict form is a mongo clause e.g. {"$in": [...]};
                # join its first value list for display.
                if isinstance(true_hugo_symbol, dict):
                    true_hugo_symbol = ', '.join([
                        str(i) for i in next(iter(true_hugo_symbol.values()))
                    ])

        if true_hugo_symbol is None:
            logging.error("error in filter logic")

        # extract tier information.
        tier = genomic_lu[genomic_id].get('TIER')

        # matches created via a filter start as 'pending' (1), else 'new' (0).
        match_status = 1 if from_filter else 0

        protocol_id = item.get('protocol_id', "")

        email_subject = "(%s) ONCO PANEL RESULTS" % protocol_id
        email_body = email_content(protocol_id, genomic_lu[genomic_id],
                                   clinical_lu[clinical_id])

        matches.append({
            'USER_ID': user_id,
            'TEAM_ID': team_id,
            'FILTER_STATUS': filter_status,
            'MATCH_STATUS': match_status,
            'FILTER_ID': key[0],
            'CLINICAL_ID': key[1],
            'VARIANTS': val,
            'PATIENT_MRN': clinical_info_vals[clinical_info.index('MRN')],
            # random 6-hex-digit id; decode so MMID is str, not bytes,
            # under Python 3 (b2a_hex returns bytes there).
            'MMID': binascii.b2a_hex(os.urandom(3)).decode().upper(),
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': clinical_info_vals[
                clinical_info.index('ONCOTREE_PRIMARY_DIAGNOSIS_NAME')],
            'ONCOTREE_BIOPSY_SITE_TYPE': clinical_info_vals[
                clinical_info.index('ONCOTREE_BIOPSY_SITE_TYPE')],
            'TRUE_HUGO_SYMBOL': true_hugo_symbol,
            'VARIANT_CATEGORY': clinical_info_vals[
                clinical_info.index('VARIANT_CATEGORY')],
            'FILTER_NAME': filter_name,
            'REPORT_DATE': clinical_info_vals[
                clinical_info.index('REPORT_DATE')],
            "EMAIL_ADDRESS": clinical_info_vals[
                clinical_info.index('ORD_PHYSICIAN_EMAIL')],
            "EMAIL_BODY": email_body,
            "EMAIL_SUBJECT": email_subject,
            'data_push_id': dpi,
            'TIER': tier
        })

    ttr_iter = time.time() - start_iter
    start_ins = time.time()

    # bulk insert the new matches.
    if len(matches) > 0:
        match_db = database.get_collection("match")
        match_db.insert_many(matches)

    ttr_ins = time.time() - start_ins
    logging.info("match: added %d and it took %.2f to fetch, %.2f to insert" %
                 (len(matches), ttr_iter, ttr_ins))
コード例 #19
0
def rerun_filters(dpi=None):
    """ re-runs all filters against new data. preserves options set on
    old matches.

    :param dpi: data push id stamped onto newly inserted matches
    :return: count of new matches
    """

    # get the database links.
    match_db = database.get_collection('match')
    filter_db = database.get_collection('filter')

    # create the object.
    cbio = CBioEngine(settings.MONGO_URI,
                      settings.MONGO_DBNAME,
                      data_model.match_schema,
                      muser=settings.MONGO_USERNAME,
                      mpass=settings.MONGO_PASSWORD,
                      collection_clinical=settings.COLLECTION_CLINICAL,
                      collection_genomic=settings.COLLECTION_GENOMIC)

    # only active (status 1), non-temporary filters without a trial_watch
    # flag are re-run.
    query = {
        'status': 1,
        'temporary': False,
        'trial_watch': {
            '$exists': False
        }
    }
    filters = list(filter_db.find(query))
    for filter_ in filters:

        # lots of logging.
        logging.info("rerun_filters: filter: %s" % filter_['_id'])

        # prepare the filters.
        c, g, txt = prepare_criteria(filter_)

        # execute the match.
        cbio.match(c=c, g=g)

        if cbio.match_df is not None and cbio.genomic_df is not None and cbio.clinical_df is not None:
            logging.info(
                "rerun_filters: new matches: match=%d, genomic=%d, clinical=%d"
                % (len(cbio.match_df), len(
                    cbio.genomic_df), len(cbio.clinical_df)))

        # get existing matches for this filter.
        matches = list(match_db.find({'FILTER_ID': ObjectId(filter_['_id'])}))

        # count genomic variant records across the existing matches.
        rec_cnt = 0
        for m in matches:
            rec_cnt += len(m['VARIANTS'])

        logging.info("rerun_filters: exisiting: %d %d" %
                     (len(matches), rec_cnt))

        # parse the old matches into (clinical_id, genomic_id) pairs.
        clinical_old_id = set()
        old_lu = {}
        match_lu = {}
        for match in matches:

            # get the clincal id.
            clinical_id = match['CLINICAL_ID']

            # now build tuples of variants.
            for genomic_id in match['VARIANTS']:

                # make pair
                pair = (clinical_id, genomic_id)
                clinical_old_id.add(pair)

                # build id lookup.
                old_lu[pair] = match['_id']

                # cache matches.
                match_lu[pair] = match

        # parse the new matches into the same pair form.
        clinical_new_id = set()
        new_lu = {}
        i = 0
        for match in cbio.match_iter():

            # simplify.
            clinical_id = match['CLINICAL_ID']
            genomic_id = match['GENOMIC_ID']

            # build set.
            pair = (clinical_id, genomic_id)
            clinical_new_id.add(pair)

            # cache matches.
            match_lu[pair] = match

            # build lookup.
            new_lu[pair] = i
            i += 1

        # find the ones which need to be deleted and delete them
        # (old pairs that no longer appear in the new results).
        to_delete = clinical_old_id - clinical_new_id
        logging.info("rerun_filters: removing: %d" % len(to_delete))
        # NOTE(review): `updated` is never used below.
        updated = list()
        for pair in to_delete:

            # extract ids
            match_id = old_lu[pair]
            match = match_lu[pair]

            # find the variant: keep every variant except the stale one.
            good = list()
            hit = False
            for v in match['VARIANTS']:
                if v != pair[1]:
                    good.append(v)
                else:
                    hit = True

            # update it if necessary.
            if hit:

                # check if will empty this.
                if len(good) == 0:

                    # delete it.
                    match_db.delete_one({'_id': match_id})
                else:

                    # just update it.
                    # NOTE(review): Collection.update is deprecated in
                    # pymongo 3; update_one would be the modern call.
                    match_db.update({"_id": match_id},
                                    {"$set": {
                                        "VARIANTS": good
                                    }})

                    # update the local one to make sure we delete all variants
                    match['VARIANTS'] = good

        # find the intersection (pairs that already exist) and remove
        # them from the data frame so they are not re-inserted.
        remove_frame = clinical_new_id.intersection(clinical_old_id)
        bad_list = []
        for pair in remove_frame:

            # lookup index.
            idx = new_lu[pair]
            bad_list.append(idx)

        logging.info("rerun_filters: skipping: %d" % len(bad_list))

        # remove them.
        if cbio.match_df is not None and len(cbio.match_df) > 0:
            cbio.match_df.drop(cbio.match_df.index[bad_list], inplace=True)

        # insert the counts.
        count_matches(cbio, filter_)

        # insert the matches if not temporary.
        insert_matches(cbio, filter_, from_filter=False, dpi=dpi)
コード例 #20
0
ファイル: miner.py プロジェクト: dfci/matchminer-api
def email_matches():
    """
    Queue notification emails for teams that have new filter matches.

    For every team with at least one new match, an email document is
    inserted into the 'email' collection for each non-silenced member.
    :return: list of generated html message bodies
    """
    match_db = database.get_collection("match")
    user_db = database.get_collection('user')
    filter_db = database.get_collection('filter')

    logging.info("starting email search")

    message_list = []

    # one pass per team that owns matches.
    for teamid in match_db.find().distinct("TEAM_ID"):

        # counts of filters and new matches for this team.
        num_filters, num_matches = _email_counts(teamid, match_db, filter_db)

        # nothing new for this team.
        if num_matches < 1:
            continue

        # notify every member of the team.
        members = list(user_db.find({'teams': {'$elemMatch': {'$in': [teamid]}}}))
        for user in members:

            # honor the user's "silent" preference.
            if 'silent' in user and user['silent']:
                continue

            recipient_email = user['email']
            match_str = "match" if num_matches == 1 else "matches"

            # timestamps for the message body.
            cur_date = datetime.date.today().strftime("%B %d, %Y")
            cur_stamp = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")

            # generate text.
            html = _email_text(user, num_matches, match_str, num_filters, cur_date, cur_stamp)

            # queue the email for the sender daemon.
            db = database.get_db()
            db['email'].insert({
                'email_from': settings.EMAIL_AUTHOR_PROTECTED,
                'email_to': recipient_email,
                'subject': 'New MatchMiner Hits - %s' % cur_date,
                'body': html,
                'cc': [],
                'sent': False,
                'num_failures': 0,
                'errors': []
            })

            message_list.append(html)

    return message_list
コード例 #21
0
ファイル: miner.py プロジェクト: dfci/matchminer-api
def insert_matches(cbio, item, from_filter=True, dpi=None):
    """Build match documents from a CBioEngine run and bulk-insert them.

    Groups the engine's raw match rows by (filter_id, clinical_id),
    decorates each group with clinical/genomic display fields and
    physician email content, then inserts the documents into the
    'match' collection.

    :param cbio: CBioEngine exposing match_iter() and the _c/_g collections
    :param item: filter document that generated the matches
    :param from_filter: when True, matches start as pending (1), else new (0)
    :param dpi: data push id recorded on every inserted match document
    """

    start_iter = time.time()
    filter_db = database.get_collection('filter')

    # group genomic ids by (filter_id, clinical_id) pair; cache the filter doc.
    pf_pairz = dict()
    filters = dict()
    for row in cbio.match_iter():

        filter_id = item['_id']
        clinical_id = row['CLINICAL_ID']
        key = (filter_id, clinical_id)

        if key not in pf_pairz:
            pf_pairz[key] = list()

        if filter_id not in filters:
            filters[filter_id] = filter_db.find_one({'_id': filter_id})

        pf_pairz[key].append(row['GENOMIC_ID'])

    user_id = item['USER_ID']
    team_id = item['TEAM_ID']
    filter_status = item['status']
    filter_name = item['label']
    clinical_lu = {}
    genomic_lu = {}

    matches = list()
    for key, val in pf_pairz.items():

        clinical_id = key[1]
        # only the first variant is used for the genomic display fields.
        genomic_id = val[0]

        if clinical_id not in clinical_lu:
            clinical_lu[clinical_id] = cbio._c.find_one(clinical_id)

        if genomic_id not in genomic_lu:
            genomic_lu[genomic_id] = cbio._g.find_one(genomic_id)

        # extract clinical information; missing fields default to "".
        clinical_info = [
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME',
            'ONCOTREE_BIOPSY_SITE_TYPE',
            'REPORT_DATE',
            'VARIANT_CATEGORY',
            'MRN',
            'ORD_PHYSICIAN_EMAIL'
        ]
        clinical_info_vals = [clinical_lu[clinical_id].get(c, "")
                              for c in clinical_info]

        # extract gene symbol; fall back to the filter's own criteria.
        true_hugo_symbol = genomic_lu[genomic_id].get('TRUE_HUGO_SYMBOL')

        if true_hugo_symbol is None:

            filter_ = filters[key[0]]
            if 'genomic_filter' in filter_ and 'TRUE_HUGO_SYMBOL' in filter_['genomic_filter']:

                true_hugo_symbol = filter_['genomic_filter']['TRUE_HUGO_SYMBOL']
                if isinstance(true_hugo_symbol, dict):
                    # bugfix: dict views are not indexable in python 3
                    # (.values()[0] raises TypeError); take the first value
                    # (a list of symbols) explicitly.
                    first_val = next(iter(true_hugo_symbol.values()))
                    true_hugo_symbol = ', '.join(str(i) for i in first_val)

        if true_hugo_symbol is None:
            logging.error("error in filter logic")

        # extract tier information (may be absent).
        tier = genomic_lu[genomic_id].get('TIER')

        # matches from user filters start as pending (1), others as new (0).
        match_status = 1 if from_filter else 0

        protocol_id = item.get('protocol_id', "")

        email_subject = "(%s) ONCO PANEL RESULTS" % protocol_id
        email_body = email_content(protocol_id, genomic_lu[genomic_id], clinical_lu[clinical_id])

        matches.append({
            'USER_ID': user_id,
            'TEAM_ID': team_id,
            'FILTER_STATUS': filter_status,
            'MATCH_STATUS': match_status,
            'FILTER_ID': key[0],
            'CLINICAL_ID': key[1],
            'VARIANTS': val,
            'PATIENT_MRN': clinical_info_vals[clinical_info.index('MRN')],
            # bugfix: decode so the id is stored as a hex string rather
            # than BSON binary under python 3 (b2a_hex returns bytes).
            'MMID': binascii.b2a_hex(os.urandom(3)).upper().decode(),
            'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': clinical_info_vals[clinical_info.index('ONCOTREE_PRIMARY_DIAGNOSIS_NAME')],
            'ONCOTREE_BIOPSY_SITE_TYPE': clinical_info_vals[clinical_info.index('ONCOTREE_BIOPSY_SITE_TYPE')],
            'TRUE_HUGO_SYMBOL': true_hugo_symbol,
            # NOTE(review): VARIANT_CATEGORY is read from the clinical
            # document here, though it is usually a genomic field —
            # confirm this is intentional.
            'VARIANT_CATEGORY': clinical_info_vals[clinical_info.index('VARIANT_CATEGORY')],
            'FILTER_NAME': filter_name,
            'REPORT_DATE': clinical_info_vals[clinical_info.index('REPORT_DATE')],
            "EMAIL_ADDRESS": clinical_info_vals[clinical_info.index('ORD_PHYSICIAN_EMAIL')],
            "EMAIL_BODY": email_body,
            "EMAIL_SUBJECT": email_subject,
            'data_push_id': dpi,
            'TIER': tier
        })

    ttr_iter = time.time() - start_iter
    start_ins = time.time()

    if matches:
        match_db = database.get_collection("match")
        match_db.insert_many(matches)

    ttr_ins = time.time() - start_ins
    logging.info("match: added %d and it took %.2f to fetch, %.2f to insert" % (len(matches), ttr_iter, ttr_ins))
コード例 #22
0
ファイル: miner.py プロジェクト: dfci/matchminer-api
def rerun_filters(dpi=None):
    """ re-runs all filters against new data. preserves options set on
    old matches.

    For each active, permanent, non-trial-watch filter: recompute matches,
    delete or trim existing matches whose variants disappeared, skip
    (clinical, genomic) pairs that already exist, then record counts and
    insert only the genuinely new matches.

    :param dpi: data push id stamped onto newly inserted matches
    """

    # get the database links.
    match_db = database.get_collection('match')
    filter_db = database.get_collection('filter')

    # create the matching engine.
    cbio = CBioEngine(settings.MONGO_URI,
                      settings.MONGO_DBNAME,
                      data_model.match_schema,
                      muser=settings.MONGO_USERNAME,
                      mpass=settings.MONGO_PASSWORD,
                      collection_clinical=settings.COLLECTION_CLINICAL,
                      collection_genomic=settings.COLLECTION_GENOMIC)

    # only active (status=1), permanent, non-trial-watch filters.
    query = {'status': 1, 'temporary': False, 'trial_watch': {'$exists': False}}
    filters = list(filter_db.find(query))
    for filter_ in filters:

        # lots of logging.
        logging.info("rerun_filters: filter: %s" % filter_['_id'])

        # prepare the filters.
        c, g, txt = prepare_criteria(filter_)

        # execute the match.
        cbio.match(c=c, g=g)

        if cbio.match_df is not None and cbio.genomic_df is not None and cbio.clinical_df is not None:
            logging.info("rerun_filters: new matches: match=%d, genomic=%d, clinical=%d" % (len(cbio.match_df), len(cbio.genomic_df), len(cbio.clinical_df)))

        # get existing matches for this filter.
        matches = list(match_db.find({'FILTER_ID': ObjectId(filter_['_id'])}))

        rec_cnt = sum(len(m['VARIANTS']) for m in matches)

        logging.info("rerun_filters: existing: %d %d" % (len(matches), rec_cnt))

        # parse the old matches into (clinical, genomic) pairs.
        clinical_old_id = set()
        old_lu = {}     # pair -> match _id
        match_lu = {}   # pair -> match document
        for match in matches:

            # get the clincal id.
            clinical_id = match['CLINICAL_ID']

            # now build tuples of variants.
            for genomic_id in match['VARIANTS']:
                pair = (clinical_id, genomic_id)
                clinical_old_id.add(pair)
                old_lu[pair] = match['_id']
                match_lu[pair] = match

        # parse the new matches.
        clinical_new_id = set()
        new_lu = {}     # pair -> row index in the new match data frame
        for i, match in enumerate(cbio.match_iter()):

            pair = (match['CLINICAL_ID'], match['GENOMIC_ID'])
            clinical_new_id.add(pair)
            match_lu[pair] = match
            new_lu[pair] = i

        # stale pairs: present in old matches but absent from the new run.
        to_delete = clinical_old_id - clinical_new_id
        logging.info("rerun_filters: removing: %d" % len(to_delete))
        for pair in to_delete:

            # extract ids
            match_id = old_lu[pair]
            match = match_lu[pair]

            # drop the stale variant from the match's variant list.
            good = [v for v in match['VARIANTS'] if v != pair[1]]
            hit = len(good) != len(match['VARIANTS'])

            # update it if necessary.
            if hit:

                if len(good) == 0:

                    # no variants left: delete the whole match.
                    match_db.delete_one({'_id': match_id})
                else:

                    # just update it.
                    # bugfix: Collection.update() was deprecated and removed
                    # in pymongo 4; use update_one.
                    match_db.update_one({"_id": match_id}, {"$set": {"VARIANTS": good}})

                    # update the local copy so later pairs see the trimmed list.
                    match['VARIANTS'] = good

        # pairs already present in the db: drop them from the new frame.
        remove_frame = clinical_new_id.intersection(clinical_old_id)
        bad_list = [new_lu[pair] for pair in remove_frame]

        logging.info("rerun_filters: skipping: %d" % len(bad_list))

        # remove them.
        if cbio.match_df is not None and len(cbio.match_df) > 0:
            cbio.match_df.drop(cbio.match_df.index[bad_list], inplace=True)

        # insert the counts.
        count_matches(cbio, filter_)

        # insert the remaining (new) matches.
        insert_matches(cbio, filter_, from_filter=False, dpi=dpi)
コード例 #23
0
ファイル: miner.py プロジェクト: pughlab/matchminer-api
def transform_filter_to_CTML(items, save=False):
    """
    Transform filters clinical & genomic key objects into CTML.

    This is a placeholder function which is present in order to ensure
    backwards compatibility with UI filter generation.

    Eventually, this function should be removed as CTML ideally would be
    generated correctly in the frontend and saved/sent directly to the
    filter engine for matching.

    :param save: When calling function explicitly (not part of eve built-in
        hook), explicitly update filter in db. Usually eve does this
        automatically
    :param items: List of filters (mutated in place)
    :return: None
    """
    import itertools

    for item in items:
        # MMR_STATUS should always be on the genomic filter.
        # bugfix: create the genomic filter if it does not exist yet,
        # instead of raising KeyError.
        if 'clinical_filter' in item and 'MMR_STATUS' in item[
                'clinical_filter']:
            item.setdefault('genomic_filter', {})['MMR_STATUS'] = \
                item['clinical_filter'].pop('MMR_STATUS')

        item['match'] = []
        or_clauses = []
        genomic_and = {}
        if 'genomic_filter' in item:
            genomic_filter = item['genomic_filter']

            multis = []
            for k, v in genomic_filter.items():
                # Skip wildtypes, null values & empty lists
                if k in ['WILDTYPE'] or v is None or (isinstance(v, list)
                                                      and len(v) == 0):
                    continue

                # Old filters used a custom encoding where keys were nested inside an '^in' key.
                # New filters do not do this anymore, but in case any are leftover,
                # catch and remove.
                elif isinstance(v, dict):
                    if '^in' in v:
                        genomic_and[k] = v['^in']
                        genomic_filter[k] = v['^in']

                    operator = ""
                    op_sign = ""
                    if '^lt' in v and v['^lt'] is not None:
                        operator = '^lt'
                        op_sign = '<'

                    if '^gt' in v and v['^gt'] is not None:
                        operator = '^gt'
                        op_sign = '>'

                    if operator != "" and k == 'ALLELE_FRACTION':
                        # whole-number strings arrive as percentages;
                        # normalize to a 0-1 fraction.
                        if isinstance(v[operator],
                                      str) and '.' not in v[operator]:
                            v[operator] = float(v[operator]) / 100
                        genomic_and[k] = op_sign + str(v[operator])

                elif isinstance(v, list) and len(v) == 1:
                    if v[0] is not None and v[0] != "":
                        genomic_and[k] = v[0]

                # multi criteria are expanded into OR criteria below
                elif isinstance(v, list) and len(v) > 1:
                    multis.append(k)
                else:
                    genomic_and[k] = v

            # If a user has selected multiple criteria, generate all possible
            # OR CTML nodes as the cross product of every multi-valued key.
            # (generalized from the previous hard-coded 1/2/3-key cases to
            # any number of multi-valued criteria.)
            if multis:
                value_lists = [genomic_filter[k] for k in multis]
                for combo in itertools.product(*value_lists):
                    or_clause = copy.deepcopy(genomic_and)
                    or_clause.update(zip(multis, combo))
                    or_clauses.append({"genomic": or_clause})

        # remove CNV_CALL's when VARIANT_CATEGORY is MUTATION or SV
        for or_clause in or_clauses:
            or_node = or_clause['genomic']
            if 'VARIANT_CATEGORY' in or_node and 'CNV_CALL' in or_node and \
                    or_node['VARIANT_CATEGORY'] in ('MUTATION', 'SV'):
                del or_node['CNV_CALL']

        # remove duplicate nodes (keeps the last occurrence of each).
        cleaned_or_clauses = []
        for i in range(len(or_clauses)):
            if or_clauses[i] not in or_clauses[i + 1:]:
                cleaned_or_clauses.append(or_clauses[i])

        clinical_and = {}
        if 'clinical_filter' in item:
            for (k, v) in item['clinical_filter'].items():
                if v is not None:
                    if k == 'BIRTH_DATE':
                        # TODO remove once filter backfill is complete
                        if 'AGE_NUMERICAL' in item['clinical_filter']:
                            continue
                        operator, integer = transform_date_to_range(v)
                        clinical_and[
                            'AGE_NUMERICAL'] = f"{operator}{str(integer)}"
                    elif k == 'AGE_NUMERICAL':
                        operator, difference = transform_age_to_CTML(v)
                        clinical_and[k] = f"{operator}{difference}"
                    else:
                        clinical_and[k] = v

        and_clause = {"and": [{"clinical": clinical_and}]}

        if cleaned_or_clauses:
            # genomic AND criteria are already baked into every OR clause,
            # so only the OR node is appended.
            # bugfix: previously the OR node itself was deleted whenever
            # there were no single-valued genomic criteria (the blind
            # `del and_clause['and'][1]` removed it instead of the
            # genomic node).
            and_clause["and"].append({"or": cleaned_or_clauses})
        elif genomic_and:
            and_clause['and'].append({"genomic": genomic_and})

        item['match'] = [and_clause]
        item['description'] = get_filter_description(item)

        if save:
            database.get_collection("filter").replace_one({"_id": item['_id']},
                                                          item)