def match(args):
    """
    Matches all trials in database to patients and, when not running as a
    daemon, exports the results once and returns.

    :param args: Parsed command-line arguments. The attributes read here are:
        - mongo_uri: passed to get_db() and to export_results().
        - daemon: Boolean flag; when true, runs the matchengine once per 24 hours
          in an endless loop and never exports.
        - json_format: when truthy, forces the export format to 'json'.
        - outpath: optional output path. Its final extension selects the export
          format ('json' or 'csv'; anything else falls back to 'csv') and is
          stripped before the path is handed to export_results(). Defaults to
          './results' when not given.
    """
    import os  # local import: only needed to split the extension off the output path

    db = get_db(args.mongo_uri)

    while True:
        me = MatchEngine(db)
        me.find_trial_matches()

        # when set to run as a nightly automated daemon, sleep for a day and match again
        if args.daemon:
            time.sleep(86400)  # sleep for 24 hours
            continue

        # Split only the final extension off the path. Splitting on the first '.'
        # would reduce paths such as './results.csv' or '../out/res.json' to ''.
        if args.outpath:
            outpath, extension = os.path.splitext(args.outpath)
        else:
            outpath, extension = './results', ''

        # choose output file format
        if args.json_format:
            file_format = 'json'
        else:
            file_format = extension[1:]  # drop the leading '.'
            if file_format not in ('json', 'csv'):
                file_format = 'csv'

        # export results
        export_results(args.mongo_uri, file_format, outpath)
        break
def setUp(self):
    """
    Resets the test database and builds the in-memory fixtures (clinical
    documents, genomic documents and demo match results) used by the tests.

    Descriptions of test patients
        1: >18, Adrenal Gland, Female, BRAF F346R Mutation
        2: >18, Melanoma, Female, EGFR L858R Mutation
        3: >18, Melanoma, Female, EGFR F346A Mutation
        4: >18, Melanoma, Female, EGFR F346B Mutation
        5: >18, Melanoma, Female, EGFR F000F Mutation
        6: >0.5 && <18, Melanoma, Male, EGFR SV
        7: >0.5 && <18, Glioblastoma, Male, EGFR CNV Hetero del
        8: >0.5 && <18, Glioblastoma, Male, EGFR CNV Gain
        9: >0.5 && <18, Glioblastoma, Male, EGFR CNV Homo del
        10: <0.5, Glioblastoma, Male, EGFR CNV High amp

    Descriptions of test trials
        00-001.yml: dose: EGFR L858R && >=18/_SOLID_
        00-002.yml: arm: EGFR L858R && >=18/_SOLID_
        00-003.yml: step: EGFR L858R && >=18/_SOLID_
        00-004.yml dose: EGFR L858R && >=18/_SOLID_
        00-005.yml 2 doses: EGFR L858R && >=18/_SOLID_
        00-006.yml exon: !13
    """
    self.db = get_db(None)

    # Start every test from empty collections. Each name needs its own comma:
    # adjacent string literals are concatenated, so "oplog" "response" would
    # silently become the single (non-existent) collection "oplogresponse".
    for res in ["clinical", "dashboard", "filter", "genomic", "hipaa", "match", "normalize", "oplog",
                "response", "statistics", "status", "team", "trial", "trial_match", "user"]:
        self.db.drop_collection(res)

    self.me = MatchEngine(self.db)
    self.trials = {}

    # the first patient has fixed identifiers; the other nine are generated
    self.clinical_id = ObjectId()
    self.mrn = 'TCGA-BH-A1FR'
    self.sample_id = 'TCGA-OR-A5J1'
    self.mrns = [self.mrn] + [self.__random_id() for _ in range(9)]
    self.sample_ids = [self.sample_id] + [self.__random_id() for _ in range(9)]
    self.clinical_ids = [self.clinical_id] + [ObjectId() for _ in range(9)]
    self.static_date = dt.datetime.today()

    # clinical collection
    self.oncotree_diagnoses = ['Adrenal Gland'] + ['Melanoma'] * 5 + ['Glioblastoma'] * 4
    self.genders = ['Female'] * 5 + ['Male'] * 5

    # ages, expressed as birth dates relative to static_date
    adult = self.static_date - dt.timedelta(days=365*19)
    child = self.static_date - dt.timedelta(days=365*10)
    infant = self.static_date - dt.timedelta(days=30*4)
    self.ages = [adult] * 5 + [child] * 4 + [infant]

    self.clinical = [{
        '_id': clinical_id,
        'ONCOTREE_PRIMARY_DIAGNOSIS_NAME': diagnosis,
        'SAMPLE_ID': sample_id,
        'VITAL_STATUS': 'alive',
        'MRN': mrn,
        'REPORT_DATE': self.static_date,
        'BIRTH_DATE': age,
        'GENDER': gender
    } for diagnosis, gender, age, clinical_id, sample_id, mrn in zip(
        self.oncotree_diagnoses,
        self.genders,
        self.ages,
        self.clinical_ids,
        self.sample_ids,
        self.mrns)]

    # genomic collection: one alteration per patient, index-aligned with the clinical lists
    self.genes = ['BRAF'] + ['EGFR'] * 9
    self.protein_changes = ['p.F346R', 'p.L858R', 'p.F346A', 'p.F346B', 'p.F000F', None, None, None, None, None]
    self.variant_categories = ['MUTATION'] * 5 + ['SV', 'CNV', 'CNV', 'CNV', 'CNV']
    self.wildtypes = [False] * 10
    self.cnv_calls = [None, None, None, None, None, None,
                      'Heterozygous deletion', 'Gain', 'Homozygous deletion', 'High level amplification']

    self.genomic = [{
        'TRUE_VARIANT_CLASSIFICATION': 'In_Frame_Del',
        'TRUE_PROTEIN_CHANGE': protein_change,
        'VARIANT_CATEGORY': variant_category,
        'CHROMOSOME': 'chr3',
        'POSITION': 178952085,
        'TRUE_STRAND': '+',
        'WILDTYPE': wildtype,
        'CLINICAL_ID': _id,
        'CNV_CALL': cnv_call,
        'TRUE_HUGO_SYMBOL': gene,
        'SAMPLE_ID': sample_id,
        'TRUE_TRANSCRIPT_EXON': 19
    } for protein_change, variant_category, wildtype, cnv_call, gene, _id, sample_id in zip(
        self.protein_changes,
        self.variant_categories,
        self.wildtypes,
        self.cnv_calls,
        self.genes,
        self.clinical_ids,
        self.sample_ids
    )]

    # test trials
    self.test_trials = ['00-001', '00-002', '00-003']

    # demo match results
    pnos = ['00-001', '00-001', '00-001', '00-002', '00-002', '00-002']
    mlevels = ['arm', 'arm', 'arm', 'dose', 'dose', 'dose']
    iids = ['1', '2', '3', '4', '5', '6']
    galts = ['Alt1', 'Alt2', 'Alt2', 'Alt3', 'Alt3', 'Alt3']
    self.matches = [{
        'mrn': 'SAMPLE1',
        'sample_id': 'SAMPLE1-ID',
        'protocol_no': protocol_no,
        'match_level': match_level,
        'internal_id': internal_id,
        'genomic_alteration': genomic_alteration
    } for protocol_no, match_level, internal_id, genomic_alteration in zip(
        pnos, mlevels, iids, galts
    )]
def load(args):
    """
    Sets up MongoDB for matching

    :param args: clinical: Path to csv file containing clinical data. Required fields are:
        - MRN (Unique patient identifier)
        - SAMPLE_ID (Unique sample identifier)
        - ONCOTREE_PRIMARY_DIAGNOSIS_NAME (Disease diagnosis)
        - BIRTH_DATE (Date of birth in format 'YYYY-MM-DD 00:00:00.000')

        Suggested additional fields:
        - ORD_PHYSICIAN_NAME
        - ORD_PHYSICIAN_EMAIL
        - REPORT_DATE
        - VITAL_STATUS (alive or deceased)
        - FIRST_LAST (Patient's first and last name)
        - GENDER (Male or Female)

    :param args: genomic: Path to csv file containing genomic data. The following fields are used in matching:
        - SAMPLE_ID (Unique sample identifier)
        - TRUE_HUGO_SYMBOL (Gene name)
        - TRUE_PROTEIN_CHANGE (Specific variant)
        - TRUE_VARIANT_CLASSIFICATION (Variant type)
        - VARIANT_CATEGORY (CNV, MUTATION, or SV)
        - TRUE_TRANSCRIPT_EXON (Exon number <integer>)
        - CNV_CALL (Heterozygous deletion, Homozygous deletion, Gain, High Level amplification, or null)
        - WILDTYPE (True or False)

        Suggested additional fields:
        - CHROMOSOME (Chromosome number in format 'chr01')
        - POSITION <integer>
        - TRUE_CDNA_CHANGE
        - REFERENCE_ALLELE
        - CANONICAL_STRAND (- or +)
        - ALLELE_FRACTION <float>
        - TIER <integer>

    :param args: trials: Path to bson trial file.

    Other attributes read from args: mongo_uri, trial_format and patient_format
    (the latter two select the loader from Trial.load_dict / Patient.load_dict).

    Exits the process with status 1 when only one of clinical / genomic is given.
    """
    db = get_db(args.mongo_uri)
    t = Trial(db)
    p = Patient(db)

    # Add trials to mongo
    if args.trials:
        logging.info('Adding trials to mongo...')
        t.load_dict[args.trial_format](args.trials)

    # Add patient data to mongo
    if args.clinical and args.genomic:
        logging.info('Reading data into pandas...')
        is_bson = p.load_dict[args.patient_format](args.clinical, args.genomic)

        # a truthy return value means the loader needs no further processing here
        if not is_bson:
            date_columns = ['BIRTH_DATE', 'REPORT_DATE']

            # reformatting
            for col in date_columns:
                # REPORT_DATE is only a suggested field, so tolerate its absence.
                # A missing (required) BIRTH_DATE still raises KeyError below.
                if col == 'REPORT_DATE' and col not in p.clinical_df.columns:
                    continue

                try:
                    p.clinical_df[col] = p.clinical_df[col].apply(
                        lambda x: str(dt.datetime.strptime(x, '%Y-%m-%d')))
                except ValueError as exc:
                    if col == 'BIRTH_DATE':
                        # single-argument print() emits the same text on Python 2 and 3
                        print('\n'.join([
                            '## WARNING ## Birth dates should be formatted %Y-%m-%d to be properly stored in MongoDB.',
                            '## ## Birth dates may be malformed in the database and will therefore not match',
                            '## ## trial age restrictions properly.',
                            '## ## System error: \n%s' % exc,
                        ]))

            # exon numbers are stored as integers; blanks and nulls are left untouched
            p.genomic_df['TRUE_TRANSCRIPT_EXON'] = p.genomic_df['TRUE_TRANSCRIPT_EXON'].apply(
                lambda x: int(x) if x != '' and pd.notnull(x) else x)

            # Add clinical data to mongo
            logging.info('Adding clinical data to mongo...')
            # list(...) keeps this a real list on Python 3, where .values() is a view
            clinical_json = list(json.loads(p.clinical_df.T.to_json()).values())
            for item in clinical_json:
                for col in date_columns:
                    if col in item:
                        item[col] = dt.datetime.strptime(str(item[col]), '%Y-%m-%d %X')

            db.clinical.insert(clinical_json)

            # Get clinical ids from mongo
            logging.info('Adding clinical ids to genomic data...')
            clinical_dict = {
                doc['SAMPLE_ID']: doc['_id']
                for doc in db.clinical.find({}, {"_id": 1, "SAMPLE_ID": 1})
            }

            # pd -> json
            # NOTE(review): this branches on trial_format although it shapes patient
            # (genomic) data -- possibly patient_format was intended; confirm before changing.
            if args.trial_format == 'pkl':
                genomic_json = json.loads(p.genomic_df.to_json(orient='records'))
            else:
                genomic_json = list(json.loads(p.genomic_df.T.to_json()).values())

            # Map clinical ids to genomic data (None when the sample has no clinical record)
            for item in genomic_json:
                item["CLINICAL_ID"] = clinical_dict.get(item['SAMPLE_ID'])

            # Add genomic data to mongo
            logging.info('Adding genomic data to mongo...')
            db.genomic.insert(genomic_json)

        # Create index
        logging.info('Creating index...')
        db.genomic.create_index([("TRUE_HUGO_SYMBOL", ASCENDING), ("WILDTYPE", ASCENDING)])

    elif args.clinical or args.genomic:
        # exactly one of the two was supplied (both present is handled above)
        logging.error('If loading patient information, please provide both clinical and genomic data.')
        sys.exit(1)