def createMafDatabase(syn, databaseToSynIdMappingDf, testing=False, staging=False):
    """Create a new narrow MAF database table and archive the old one.

    Copies the column schema (and primaryKey annotation) of the current
    'vcf2maf' table into a new, timestamp-named Schema under the 'main'
    project, writes the new table's synId back into the database-to-synId
    mapping table(s), then moves the old MAF table into the archive folder
    and removes download permissions from the GENIE team.

    Args:
        syn: logged-in Synapse client
        databaseToSynIdMappingDf: dataframe mapping Database names to synIds;
            mutated in place (row 0 of 'Id' is overwritten with the new synId)
        testing: store the mapping into the test dbMapping table
        staging: when True, skip updating the production mapping (syn12094210)
    """
    mafDatabaseSynId = process_functions.getDatabaseSynId(
        syn, "vcf2maf", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    mafDatabaseEnt = syn.get(mafDatabaseSynId)
    mafCols = list(syn.getTableColumns(mafDatabaseSynId))
    # New table reuses the old table's columns; time.time() makes the name unique
    schema = synapseclient.Schema(
        name='Narrow MAF %s Database' % time.time(),
        columns=mafCols,
        parent=process_functions.getDatabaseSynId(
            syn, "main", databaseToSynIdMappingDf=databaseToSynIdMappingDf))
    schema.primaryKey = mafDatabaseEnt.primaryKey
    newMafDb = syn.store(schema)
    # Store in the new database synid
    # NOTE(review): assumes the 'vcf2maf' row sits at positional index 0 of the
    # mapping dataframe -- confirm; the chained assignment also relies on
    # pandas writing through to the caller's frame.
    databaseToSynIdMappingDf['Id'][0] = newMafDb.id
    syn.store(
        synapseclient.Table(
            process_functions.getDatabaseSynId(syn, "dbMapping", test=testing),
            databaseToSynIdMappingDf))
    if not staging and not testing:
        # Make sure to store the newly created maf db synid into the staging
        # synapse mapping (production mapping table syn12094210)
        databaseToSynIdMapping = syn.tableQuery(
            "SELECT * FROM syn12094210 where Database = 'vcf2maf'")
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
        databaseToSynIdMappingDf['Id'][0] = newMafDb.id
        syn.store(synapseclient.Table("syn12094210", databaseToSynIdMappingDf))
    # Move and archive old mafdatabase
    mafDatabaseEnt.parentId = "syn7208886"
    mafDatabaseEnt.name = "ARCHIVED " + mafDatabaseEnt.name
    syn.store(mafDatabaseEnt)
    mafDatabaseSynId = newMafDb.id
    # Remove can download permissions from project GENIE team
    syn.setPermissions(mafDatabaseSynId, 3326313, [])
def _validate(self, mutationInCisDf, project_id):
    """Validate a mutations-in-cis filter file.

    Checks that the required headers are present and that every submitted
    variant already exists in this center's mutationsInCis database records.

    Args:
        mutationInCisDf: mutations-in-cis dataframe
        project_id: Synapse project id used to resolve database mappings

    Returns:
        tuple: (total_error, warning)
    """
    total_error = ""
    warning = ""
    db_mapping_df = process_functions.get_synid_database_mappingdf(
        self.syn, project_id)
    mutation_in_cis_synid = process_functions.getDatabaseSynId(
        self.syn,
        "mutationsInCis",
        databaseToSynIdMappingDf=db_mapping_df,
    )
    # Pull down this center's existing records
    existing_records = self.syn.tableQuery(
        f"select * from {mutation_in_cis_synid} where Center = '{self.center}'")
    existing_df = existing_records.asDataFrame()
    required_headers = pd.Series([
        "Flag",
        "Center",
        "Tumor_Sample_Barcode",
        "Hugo_Symbol",
        "HGVSp_Short",
        "Variant_Classification",
        "Chromosome",
        "Start_Position",
        "Reference_Allele",
        "Tumor_Seq_Allele2",
        "t_alt_count_num",
        "t_depth",
    ])
    # Columns that together uniquely identify a variant
    key_columns = [
        "Tumor_Sample_Barcode",
        "HGVSp_Short",
        "Start_Position",
        "Reference_Allele",
        "Tumor_Seq_Allele2",
    ]
    header_present = required_headers.isin(mutationInCisDf.columns)
    if not header_present.all():
        missing = ",".join(required_headers[~header_present])
        total_error += ("Mutations In Cis Filter File: "
                        "Must at least have these headers: %s.\n" % missing)
    else:
        submitted_keys = mutationInCisDf[key_columns].fillna("")
        existing_keys = existing_df[key_columns].fillna("")
        # Collapse each variant's key columns into one comparable string
        existing_keys["primaryAll"] = [
            " ".join(row.astype(str)) for _, row in existing_keys.iterrows()
        ]
        submitted_keys["primaryAll"] = [
            " ".join(row.astype(str)) for _, row in submitted_keys.iterrows()
        ]
        if not submitted_keys.primaryAll.isin(existing_keys.primaryAll).all():
            total_error += ("Mutations In Cis Filter File: "
                            "All variants must come from the original "
                            "mutationInCis_filtered_samples.csv file in "
                            "each institution's staging folder.\n")
    return total_error, warning
def _process(self, cnaDf, test=False):
    """Process a CNA matrix for database upload.

    Normalizes the gene-symbol header to 'Hugo_Symbol', drops any
    ENTREZ_GENE_ID column, remaps symbols against this center's BED table,
    merges rows whose symbols collapse to the same remapped gene, and
    GENIE-ifies the per-sample column headers.

    Args:
        cnaDf: CNA dataframe (first column holds gene symbols, remaining
            columns are per-sample copy-number values); mutated in place
        test: use test databases. Defaults to False.

    Returns:
        Processed CNA dataframe
    """
    checkBy = "TUMOR_SAMPLE_BARCODE"
    # Normalize the gene-symbol column name to 'Hugo_Symbol'
    cnaDf.rename(columns={cnaDf.columns[0]: cnaDf.columns[0].upper()}, inplace=True)
    cnaDf.rename(columns={"HUGO_SYMBOL": "Hugo_Symbol"}, inplace=True)
    columns = [col.upper() for col in cnaDf.columns]
    # Drop ENTREZ_GENE_ID (case-insensitive match) if present
    index = [i for i, col in enumerate(cnaDf.columns) if col.upper() == "ENTREZ_GENE_ID"]
    if len(index) > 0:
        del cnaDf[cnaDf.columns[index][0]]
    #validateSymbol = partial(process_functions.validateSymbol,returnMapping=True)
    #invalidated_genes = self.pool.map(validateSymbol, cna["HUGO_SYMBOL"].drop_duplicates())
    #cna, nonmapped = process_functions.remapGenes(invalidated_genes, cna, "HUGO_SYMBOL",isBedFile=True)
    bedSynId = process_functions.getDatabaseSynId(self.syn, "bed", test=test)
    bed = self.syn.tableQuery(
        "select Hugo_Symbol, ID from %s where CENTER = '%s'" % (bedSynId, self.center))
    bedDf = bed.asDataFrame()
    #originalSymbols = cnaDf['HUGO_SYMBOL'].copy()
    # Remap each symbol against the center's BED table; presumably
    # validateSymbol (defined elsewhere) returns None/NaN for symbols it
    # cannot remap -- confirm against its definition
    cnaDf['Hugo_Symbol'] = cnaDf['Hugo_Symbol'].apply(lambda x: validateSymbol(x, bedDf))
    order = cnaDf.columns
    # unmappable = cnaDf[cnaDf['HUGO_SYMBOL'].isnull()]
    # unmappableSymbols = originalSymbols[cnaDf['HUGO_SYMBOL'].isnull()]
    cnaDf = cnaDf[~cnaDf['Hugo_Symbol'].isnull()]
    #cnaDf = cnaDf.applymap(str)
    # Rows whose remapped symbols collide are merged into a single row per
    # gene via mergeCNAvalues (defined elsewhere)
    duplicatedGenes = pd.DataFrame()
    for i in cnaDf['Hugo_Symbol'][cnaDf['Hugo_Symbol'].duplicated()].unique():
        dups = cnaDf[cnaDf['Hugo_Symbol'] == i]
        newVal = dups[dups.columns[dups.columns != "Hugo_Symbol"]].apply(mergeCNAvalues)
        temp = pd.DataFrame(newVal).transpose()
        temp['Hugo_Symbol'] = i
        # NOTE(review): DataFrame.append was removed in pandas 2.x;
        # pd.concat is the modern equivalent
        duplicatedGenes = duplicatedGenes.append(temp, sort=False)
    # keep=False drops every duplicated row; the merged rows are re-added below
    cnaDf.drop_duplicates('Hugo_Symbol', keep=False, inplace=True)
    cnaDf = cnaDf.append(duplicatedGenes, sort=False)
    cnaDf = cnaDf[order]
    #symbols = cnaDf['HUGO_SYMBOL']
    #del cnaDf['HUGO_SYMBOL']
    # Prefix sample columns (everything except Hugo_Symbol) with the center id
    cnaDf.columns = [
        process_functions.checkGenieId(i, self.center)
        if i != "Hugo_Symbol" else i for i in cnaDf.columns]
    #Transpose matrix
    # cnaDf = cnaDf.transpose()
    # data = cnaDf.apply(lambda row: makeCNARow(row, symbols), axis=1)
    #Transpose matrix
    # del unmappable['HUGO_SYMBOL']
    # unmappable = unmappable.transpose()
    # unmappableData = unmappable.apply(lambda row: makeCNARow(row, unmappableSymbols), axis=1)
    # newCNA = pd.DataFrame()
    # newCNA[checkBy] = newsamples
    # newCNA['CNAData'] = data.values
    # newCNA['CENTER'] = self.center
    # newCNA['unmappedData'] = unmappableData.values
    #newCNA = newCNA[~newCNA['CNAData'].isnull()]
    #remove the 0.0, 1.0 and 2.0
    # os.system("sed 's/[.]0//g' %s > %s" % (newPath + "temp", newPath))
    # os.remove(newPath + "temp")
    return(cnaDf)
def retract(syn, test=False):
    '''
    Main retraction function

    Collects all retracted patients, expands them to their clinical samples,
    merges in explicitly retracted samples, then removes the matching rows
    from the sample and patient clinical database tables via retract_samples.

    params:
        syn: synapse object
        test: use test files or main files. Default is False
    '''
    patientRetract = syn.tableQuery(
        'select * from %s' % process_functions.getDatabaseSynId(
            syn, "patientRetraction", test=test))
    patientRetractIds = patientRetract.asDataFrame()
    # grab all clinical samples that belong to patients in the patient
    # clinical file and append to sample list
    sampleClinical = syn.tableQuery(
        'select * from %s' %
        process_functions.getDatabaseSynId(syn, "sample", test=test))
    sampleClinicalDf = sampleClinical.asDataFrame()
    appendSamples = sampleClinicalDf['SAMPLE_ID'][
        sampleClinicalDf['PATIENT_ID'].isin(patientRetractIds.geniePatientId)]
    sampleRetract = syn.tableQuery(
        'select * from %s' %
        process_functions.getDatabaseSynId(syn, "sampleRetraction", test=test))
    sampleRetractIds = sampleRetract.asDataFrame()
    # NOTE(review): Series.append was removed in pandas 2.x; pd.concat is the
    # modern equivalent
    allRetractedSamples = sampleRetractIds['genieSampleId'].append(
        appendSamples)
    # Only need to retract clinical data, because the rest of the data is
    # filtered by clinical data
    # Sample Clinical Data
    retract_samples(
        syn, process_functions.getDatabaseSynId(syn, "sample", test=test),
        "SAMPLE_ID", allRetractedSamples)
    # Patient Clinical Data
    retract_samples(
        syn, process_functions.getDatabaseSynId(syn, "patient", test=test),
        "PATIENT_ID", patientRetractIds['geniePatientId'])
def _validate(self, fusionDF, noSymbolCheck, testing=False):
    """Validate a fusion file.

    Checks the required headers and, unless noSymbolCheck is set, that every
    Hugo symbol remaps cleanly against this center's BED table.

    Args:
        fusionDF: fusion dataframe; mutated in place (headers uppercased,
            COMMENTS column added when missing)
        noSymbolCheck: skip gene-symbol validation when True
        testing: use test databases. Defaults to False.

    Returns:
        tuple: (total_error, warning)
    """
    total_error = ""
    warning = ""
    # Frame: "in-frame" or "frameshift".
    # Fusion_Status (OPTIONAL): An assessment of the mutation type
    # (i.e., "SOMATIC", "GERMLINE", "UNKNOWN", or empty)
    fusionDF.columns = [col.upper() for col in fusionDF.columns]
    REQUIRED_HEADERS = pd.Series([
        'HUGO_SYMBOL', 'ENTREZ_GENE_ID', 'CENTER', 'TUMOR_SAMPLE_BARCODE',
        'FUSION', 'DNA_SUPPORT', 'RNA_SUPPORT', 'METHOD', 'FRAME'
    ])
    # COMMENTS is optional; fill with NaN so downstream code can rely on it
    if fusionDF.get("COMMENTS") is None:
        fusionDF['COMMENTS'] = float('nan')
    if not all(REQUIRED_HEADERS.isin(fusionDF.columns)):
        total_error += "Your fusion file must at least have these headers: %s.\n" % ",".join(
            REQUIRED_HEADERS[~REQUIRED_HEADERS.isin(fusionDF.columns)])
    if process_functions.checkColExist(
            fusionDF, "HUGO_SYMBOL") and not noSymbolCheck:
        # logger.info("VALIDATING %s GENE SYMBOLS" % os.path.basename(filePath))
        #invalidated_genes = fusionDF["HUGO_SYMBOL"].drop_duplicates().apply(validateSymbol)
        bedSynId = process_functions.getDatabaseSynId(self.syn, "bed",
                                                      test=testing)
        bed = self.syn.tableQuery(
            "select Hugo_Symbol, ID from %s where CENTER = '%s'" %
            (bedSynId, self.center))
        bedDf = bed.asDataFrame()
        #invalidated_genes = self.pool.map(process_functions.validateSymbol, fusionDF["HUGO_SYMBOL"].drop_duplicates())
        # NOTE(review): fusionDF is replaced here by the deduplicated,
        # row-wise remapped frame -- presumably validateSymbol (defined
        # elsewhere) nulls out unmappable symbols; confirm its contract
        fusionDF = fusionDF.drop_duplicates("HUGO_SYMBOL").apply(
            lambda x: validateSymbol(x, bedDf), axis=1)
        if fusionDF["HUGO_SYMBOL"].isnull().any():
            total_error += "Your fusion file should not have any NA/blank Hugo Symbols.\n"
    # if process_functions.checkColExist(fusionDF, "DNA_SUPPORT"):
    #     if not fusionDF.DNA_SUPPORT.isin(["yes","no","unknown"]).all():
    #         total_error += "Your fusion file's DNA_SUPPORT column must be 'yes', 'no', or 'unknown'"
    # if process_functions.checkColExist(fusionDF, "RNA_SUPPORT"):
    #     if not fusionDF.RNA_SUPPORT.isin(["yes","no","unknown"]).all():
    #         total_error += "Your fusion file's RNA_SUPPORT column must be 'yes', 'no', or 'unknown'"
    # if process_functions.checkColExist(fusionDF, "FRAME"):
    #     if not fusionDF.FRAME.isin(["in-frame","frameshift"]).all():
    #         total_error += "Your fusion file's FRAME column must be 'in-frame', or 'frameshift'"
    return (total_error, warning)
def _process(self, fusion, databaseToSynIdMappingDf):
    """Process a fusion dataframe for database upload.

    Uppercases headers, stamps the center, GENIE-ifies sample barcodes,
    remaps Hugo symbols against this center's BED table, and rewrites the
    FUSION text with the remapped symbols.

    Args:
        fusion: fusion dataframe (headers case-insensitive on input)
        databaseToSynIdMappingDf: database-to-synId mapping dataframe

    Returns:
        Processed fusion dataframe
    """
    fusion.columns = [col.upper() for col in fusion.columns]
    fusion["CENTER"] = self.center
    # Prefix every tumor sample barcode with the GENIE center id
    newsamples = [
        process_functions.checkGenieId(i, self.center)
        for i in fusion["TUMOR_SAMPLE_BARCODE"]
    ]
    fusion["TUMOR_SAMPLE_BARCODE"] = newsamples
    fusion["ENTREZ_GENE_ID"] = fusion["ENTREZ_GENE_ID"].fillna(0)
    fusion = fusion.drop_duplicates()
    # Keep the original symbol in ID so remapped genes can be detected below
    fusion["ID"] = fusion["HUGO_SYMBOL"].copy()
    bedSynId = process_functions.getDatabaseSynId(
        self.syn, "bed", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    bed = self.syn.tableQuery(
        "select Hugo_Symbol, ID from %s where CENTER = '%s'" %
        (bedSynId, self.center))
    bedDf = bed.asDataFrame()
    fusion = fusion.apply(lambda x: validateSymbol(x, bedDf), axis=1)
    # Create nonmapped gene dict: original symbol (ID) -> remapped symbol,
    # for rows whose symbol changed and still mapped to something.
    temp = fusion[fusion["HUGO_SYMBOL"] != fusion["ID"]]
    remapped_rows = temp[~temp.HUGO_SYMBOL.isnull()]
    # BUG FIX: drop_duplicates(inplace=True) on a column-sliced frame mutated
    # a potential copy (SettingWithCopy); build the deduplicated frame
    # directly instead.
    symbol_map = remapped_rows[["HUGO_SYMBOL", "ID"]].drop_duplicates()
    symbol_map.index = symbol_map.ID
    del symbol_map["ID"]
    fusion["FUSION"] = fusion["FUSION"].fillna("")
    fusion, nonmapped = remapFusion(symbol_map.to_dict()["HUGO_SYMBOL"],
                                    fusion, "FUSION")
    # Fill in blank hugo symbol columns with original symbol.
    # BUG FIX: use .loc instead of chained indexing so the assignment is
    # guaranteed to land on `fusion` itself (chained assignment is unreliable
    # under pandas copy-on-write).
    null_symbols_idx = fusion["HUGO_SYMBOL"].isnull()
    fusion.loc[null_symbols_idx, "HUGO_SYMBOL"] = fusion["ID"][null_symbols_idx]
    # ENTREZ ids may arrive as floats ("123.0"); normalize to int
    fusion["ENTREZ_GENE_ID"] = [
        int(float(i)) for i in fusion["ENTREZ_GENE_ID"]
    ]
    return fusion
def _validate(self, mutationInCisDf, testing=False):
    """Validate a mutations-in-cis filter file.

    Verifies the required headers are present and that every submitted
    variant already exists in this center's mutationsInCis database records.

    Args:
        mutationInCisDf: mutations-in-cis dataframe
        testing: use test databases. Defaults to False.

    Returns:
        tuple: (total_error, warning)
    """
    total_error = ""
    warning = ""
    mutation_in_cis_synid = process_functions.getDatabaseSynId(
        self.syn, "mutationsInCis", test=testing)
    # Pull down the correct database (this center's existing records)
    existing_records = self.syn.tableQuery(
        "select * from %s where Center = '%s'"
        % (mutation_in_cis_synid, self.center))
    existing_df = existing_records.asDataFrame()
    required_headers = pd.Series([
        'Flag', 'Center', 'Tumor_Sample_Barcode', 'Hugo_Symbol',
        'HGVSp_Short', 'Variant_Classification', 'Chromosome',
        'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2',
        't_alt_count_num', 't_depth'
    ])
    # Columns that together uniquely identify a variant
    key_columns = [
        'Tumor_Sample_Barcode', 'HGVSp_Short', 'Start_Position',
        'Reference_Allele', 'Tumor_Seq_Allele2'
    ]
    missing_mask = ~required_headers.isin(mutationInCisDf.columns)
    if missing_mask.any():
        total_error += "Mutations In Cis Filter File: Must at least have these headers: %s.\n" % ",".join(
            required_headers[missing_mask])
    else:
        submitted_keys = mutationInCisDf[key_columns].fillna("")
        existing_keys = existing_df[key_columns].fillna("")
        # Collapse each variant's key columns into one comparable string
        existing_keys['primaryAll'] = [
            " ".join(row.astype(str)) for _, row in existing_keys.iterrows()
        ]
        submitted_keys['primaryAll'] = [
            " ".join(row.astype(str)) for _, row in submitted_keys.iterrows()
        ]
        if not submitted_keys.primaryAll.isin(existing_keys.primaryAll).all():
            total_error += "Mutations In Cis Filter File: All variants must come from the original mutationInCis_filtered_samples.csv file in each institution's staging folder.\n"
    return (total_error, warning)
def _validate(self, cnvDF, noSymbolCheck, testing=False):
    """Validate a CNA (copy-number) file.

    Checks that the first column is Hugo_Symbol, that all copy-number values
    are in the allowed discrete set, and (unless noSymbolCheck) that the
    remapped gene symbols contain no duplicates.

    Args:
        cnvDF: CNA dataframe; mutated in place (headers uppercased,
            ENTREZ_GENE_ID dropped)
        noSymbolCheck: skip gene-symbol remapping checks when True
        testing: use test databases. Defaults to False.

    Returns:
        tuple: (total_error, warning)
    """
    total_error = ""
    warning = ""
    cnvDF.columns = [col.upper() for col in cnvDF.columns]
    if cnvDF.columns[0] != "HUGO_SYMBOL":
        total_error += "Your cnv file's first column must be Hugo_Symbol\n"
    haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
    if haveColumn:
        # Set symbols aside so the value check only sees numeric columns;
        # guarding everything under haveColumn avoids a NameError on
        # keepSymbols when the column is absent.
        keepSymbols = cnvDF["HUGO_SYMBOL"]
        cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)
        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF['ENTREZ_GENE_ID']
        # Allowed values compared as strings so both int-like and float-like
        # renderings pass. BUG FIX: '-0.5' was missing from this list even
        # though the error message documents -0.5 as allowed.
        allowed_values = [
            '-2.0', '-2', '-1.5', '-1.0', '-1', '-0.5', '0.0', '0', '0.5',
            '1.0', '1', '1.5', '2', '2.0', 'nan']
        if not all(cnvDF.applymap(
                lambda x: str(x) in allowed_values).all()):
            total_error += ("All values must be NA/blank, -2, -1.5, -1, "
                            "-0.5, 0, 0.5, 1, 1.5, or 2.\n")
        else:
            cnvDF['HUGO_SYMBOL'] = keepSymbols
            if not noSymbolCheck:
                bedSynId = process_functions.getDatabaseSynId(
                    self.syn, "bed", test=testing)
                bed = self.syn.tableQuery(
                    "select Hugo_Symbol, ID from %s where CENTER = '%s'" %
                    (bedSynId, self.center))
                bedDf = bed.asDataFrame()
                cnvDF['remapped'] = cnvDF['HUGO_SYMBOL'].apply(
                    lambda x: validateSymbol(x, bedDf))
                cnvDF = cnvDF[~cnvDF['remapped'].isnull()]
                # Do not allow any duplicated genes after symbols have been
                # remapped
                if sum(cnvDF['remapped'].duplicated()) > 0:
                    total_error += (
                        "Your CNA file has duplicated Hugo_Symbols "
                        "(After remapping of genes): %s -> %s.\n" %
                        (",".join(cnvDF['HUGO_SYMBOL'][
                            cnvDF['remapped'].duplicated(keep=False)]),
                         ",".join(cnvDF['remapped'][
                            cnvDF['remapped'].duplicated(keep=False)])))
    return (total_error, warning)
def main():
    """Set up argument parser and returns

    Entry point: logs into Synapse, resolves database mappings (test vs
    production), acquires the 'isProcessing' annotation lock on the center
    mapping entity, runs input_to_database for each requested center, then
    releases the lock and optionally writes out invalid reasons.
    """
    parser = argparse.ArgumentParser(
        description='GENIE center inputs to database')
    parser.add_argument("process", choices=['vcf', 'maf', 'main', 'mafSP'],
                        help='Process vcf, maf or the rest of the files')
    parser.add_argument('--center', help='The centers')
    parser.add_argument("--pemFile", type=str,
                        help="Path to PEM file (genie.pem)")
    parser.add_argument("--deleteOld", action='store_true',
                        help="Delete all old processed and temp files")
    parser.add_argument("--onlyValidate", action='store_true',
                        help="Only validate the files, don't process")
    parser.add_argument("--oncotreeLink", type=str,
                        help="Link to oncotree code")
    parser.add_argument("--createNewMafDatabase", action='store_true',
                        help="Creates a new maf database")
    parser.add_argument("--testing", action='store_true',
                        help="Testing the infrastructure!")
    parser.add_argument("--debug", action='store_true',
                        help="Add debug mode to synapse")
    parser.add_argument("--reference", type=str,
                        help="Path to VCF reference file")
    # DEFAULT PARAMS
    parser.add_argument("--vcf2mafPath", type=str,
                        help="Path to vcf2maf",
                        default="~/vcf2maf-1.6.14")
    parser.add_argument("--vepPath", type=str,
                        help="Path to VEP",
                        default="~/vep")
    parser.add_argument("--vepData", type=str,
                        help="Path to VEP data",
                        default="~/.vep")
    parser.add_argument('--thread', type=int,
                        help="Number of threads to use for validation",
                        default=1)
    args = parser.parse_args()
    syn = process_functions.synLogin(args.pemFile, debug=args.debug)
    # Must specify path to vcf2maf, VEP and VEP data is these types are specified
    if args.process in ['vcf', 'maf', 'mafSP'] and not args.onlyValidate:
        assert os.path.exists(
            args.vcf2mafPath
        ), "Path to vcf2maf (--vcf2mafPath) must be specified if `--process {vcf,maf,mafSP}` is used"
        assert os.path.exists(
            args.vepPath
        ), "Path to VEP (--vepPath) must be specified if `--process {vcf,maf,mafSP}` is used"
        assert os.path.exists(
            args.vepData
        ), "Path to VEP data (--vepData) must be specified if `--process {vcf,maf,mafSP}` is used"
    # Test vs production database mapping tables (hard-coded synIds)
    if args.testing:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn11600968')
    else:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn10967259')
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
    center_mapping_id = process_functions.getDatabaseSynId(
        syn, "centerMapping", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    center_mapping = syn.tableQuery('SELECT * FROM %s' % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()
    if args.center is not None:
        assert args.center in center_mapping_df.center.tolist(
        ), "Must specify one of these centers: %s" % ", ".join(
            center_mapping_df.center)
        centers = [args.center]
    else:
        # Default to every center that has an input folder and is released
        center_mapping_df = center_mapping_df[~center_mapping_df['inputSynId'].
                                              isnull()]
        center_mapping_df = center_mapping_df[center_mapping_df['release'] ==
                                              True]
        centers = center_mapping_df.center
    if args.oncotreeLink is None:
        # Fall back to the oncotree URL stored in the mapping table
        onco_link = databaseToSynIdMappingDf['Id'][
            databaseToSynIdMappingDf['Database'] == 'oncotreeLink'].values[0]
        onco_link_ent = syn.get(onco_link)
        args.oncotreeLink = onco_link_ent.externalURL
    # Check if you can connect to oncotree link, if not then don't run
    # validation / processing
    process_functions.checkUrl(args.oncotreeLink)
    # The 'isProcessing' annotation acts as a lock so two runs cannot
    # process concurrently
    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get('isProcessing', ['True'])[0] == 'True':
        raise Exception(
            "Processing/validation is currently happening. Please change/add the 'isProcessing' annotation on %s to False to enable processing"
            % center_mapping_id)
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)
    # remove this query timeout and see what happens
    # syn.table_query_timeout = 50000
    # Create new maf database, should only happen once if its specified
    if args.createNewMafDatabase:
        createMafDatabase(syn, databaseToSynIdMappingDf, testing=args.testing)
    for center in centers:
        input_to_database(syn, center, args.process, args.testing,
                          args.onlyValidate, args.vcf2mafPath, args.vepPath,
                          args.vepData, databaseToSynIdMappingDf,
                          center_mapping_df,
                          reference=args.reference,
                          delete_old=args.deleteOld,
                          oncotree_link=args.oncotreeLink,
                          thread=args.thread)
    # To ensure that this is the new entity (re-fetch before releasing lock)
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "False"
    center_mapping_ent = syn.store(center_mapping_ent)
    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    # Only write out invalid reasons if the center isnt specified and if
    # only validate
    if args.center is None and args.onlyValidate:
        logging.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write_invalid_reasons(syn, center_mapping_df,
                                                    error_tracker_synid)
def input_to_database(syn, center, process, testing, only_validate,
                      vcf2maf_path, vep_path, vep_data,
                      database_to_synid_mappingdf, center_mapping_df,
                      reference=None, delete_old=False, oncotree_link=None,
                      thread=1):
    """Validate and process one center's input files into the database.

    Sets up per-center logging, stages input/staging folders under the
    Synapse cache, validates the center's files, and (unless only_validate)
    processes them -- BED files first -- while recording start/end times in
    the process tracker table. The log file is stored to Synapse at the end.

    Args:
        syn: logged-in Synapse client
        center: center abbreviation
        process: processing type ('vcf', 'maf', 'main', 'mafSP')
        testing: use test databases / emit testing banner
        only_validate: validate only, skip processing
        vcf2maf_path, vep_path, vep_data: annotation tool paths
        database_to_synid_mappingdf: database-to-synId mapping dataframe
        center_mapping_df: center mapping dataframe
        reference: optional VCF reference path
        delete_old: remove previously downloaded center files first
        oncotree_link: oncotree URL used during validation
        thread: number of validation threads
    """
    if only_validate:
        log_path = os.path.join(process_functions.SCRIPT_DIR,
                                "%s_validation_log.txt" % center)
    else:
        log_path = os.path.join(process_functions.SCRIPT_DIR,
                                "%s_%s_log.txt" % (center, process))
    logFormatter = logging.Formatter(
        "%(asctime)s [%(name)s][%(levelname)s] %(message)s")
    fileHandler = logging.FileHandler(log_path, mode='w')
    fileHandler.setFormatter(logFormatter)
    logger.addHandler(fileHandler)
    if testing:
        logger.info("###########################################")
        logger.info("############NOW IN TESTING MODE############")
        logger.info("###########################################")
    # ----------------------------------------
    # Start input to staging process
    # ----------------------------------------
    #path_to_genie = os.path.realpath(os.path.join(process_functions.SCRIPT_DIR,"../"))
    # Make the synapsecache dir the genie input folder for now
    # The main reason for this is because the .synaspecache dir is mounted by batch
    path_to_genie = os.path.expanduser("~/.synapseCache")
    # Create input and staging folders
    if not os.path.exists(os.path.join(path_to_genie, center, "input")):
        os.makedirs(os.path.join(path_to_genie, center, "input"))
    if not os.path.exists(os.path.join(path_to_genie, center, "staging")):
        os.makedirs(os.path.join(path_to_genie, center, "staging"))
    if delete_old:
        process_functions.rmFiles(os.path.join(path_to_genie, center))
    validFiles = validation(syn, center, process, center_mapping_df,
                            database_to_synid_mappingdf, thread, testing,
                            oncotree_link)
    if len(validFiles) > 0 and not only_validate:
        # Reorganize so BED file are always validated and processed first
        validBED = [
            os.path.basename(i).endswith('.bed') for i in validFiles['path']
        ]
        beds = validFiles[validBED]
        # NOTE(review): DataFrame.append was removed in pandas 2.x;
        # pd.concat is the modern equivalent
        validFiles = beds.append(validFiles)
        validFiles.drop_duplicates(inplace=True)
        # Valid maf, mafsp, vcf and cbs files
        validMAF = [
            i for i in validFiles['path']
            if os.path.basename(i) == "data_mutations_extended_%s.txt" % center
        ]
        validMAFSP = [
            i for i in validFiles['path'] if os.path.basename(i) ==
            "nonGENIE_data_mutations_extended_%s.txt" % center
        ]
        validVCF = [
            i for i in validFiles['path']
            if os.path.basename(i).endswith('.vcf')
        ]
        #validCBS = [i for i in validFiles['path'] if os.path.basename(i).endswith('.cbs')]
        if process == 'mafSP':
            validMAFs = validMAFSP
        else:
            validMAFs = validMAF
        processTrackerSynId = process_functions.getDatabaseSynId(
            syn, "processTracker",
            databaseToSynIdMappingDf=database_to_synid_mappingdf)
        # Add process tracker for time start (epoch milliseconds)
        processTracker = syn.tableQuery(
            "SELECT timeStartProcessing FROM %s where center = '%s' and processingType = '%s'"
            % (processTrackerSynId, center, process))
        processTrackerDf = processTracker.asDataFrame()
        if len(processTrackerDf) == 0:
            new_rows = [[
                center,
                str(int(time.time() * 1000)),
                str(int(time.time() * 1000)), process
            ]]
            table = syn.store(
                synapseclient.Table(processTrackerSynId, new_rows))
        else:
            processTrackerDf['timeStartProcessing'][0] = str(
                int(time.time() * 1000))
            syn.store(
                synapseclient.Table(processTrackerSynId, processTrackerDf))
        processFiles(syn, validFiles, center, path_to_genie, thread,
                     center_mapping_df, oncotree_link,
                     database_to_synid_mappingdf,
                     validVCF=validVCF, validMAFs=validMAFs,
                     vcf2mafPath=vcf2maf_path, veppath=vep_path,
                     vepdata=vep_data, test=testing, processing=process,
                     reference=reference)
        # Should add in this process end tracking before the deletion of samples
        processTracker = syn.tableQuery(
            "SELECT timeEndProcessing FROM %s where center = '%s' and processingType = '%s'"
            % (processTrackerSynId, center, process))
        processTrackerDf = processTracker.asDataFrame()
        processTrackerDf['timeEndProcessing'][0] = str(int(time.time() * 1000))
        syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))
        logger.info("SAMPLE/PATIENT RETRACTION")
        toRetract.retract(syn, testing)
    else:
        messageOut = "%s does not have any valid files" if not only_validate else "ONLY VALIDATION OCCURED FOR %s"
        logger.info(messageOut % center)
    # Store log file
    syn.store(synapseclient.File(log_path, parentId="syn10155804"))
    os.remove(log_path)
    logger.info("ALL PROCESSES COMPLETE")
def validation(syn, center, process, center_mapping_df,
               databaseToSynIdMappingDf, thread, testing, oncotreeLink):
    """Validate all of a center's input files and sync the results.

    Downloads the center's input files, validates each one, marks duplicated
    filenames INVALID (emailing the uploader), updates the error-tracker and
    validation-status Synapse tables, and returns the validated files.

    Args:
        syn: logged-in Synapse client
        center: center abbreviation
        process: processing type (e.g. 'vcf', 'maf', 'main', 'mafSP')
        center_mapping_df: center mapping dataframe
        databaseToSynIdMappingDf: database-to-synId mapping dataframe
        thread: number of validation threads passed to validateFile
        testing: testing-mode flag passed to validateFile
        oncotreeLink: oncotree URL used during validation

    Returns:
        Dataframe of validated files ('id', 'path', 'fileType'), or an
        empty list when the center has uploaded nothing.
    """
    centerInputSynId = center_mapping_df['inputSynId'][
        center_mapping_df['center'] == center][0]
    logger.info("Center: " + center)
    allFiles = getCenterInputFiles(syn, centerInputSynId, center, process)
    allFiles = pd.DataFrame(allFiles, columns=['synId', 'filePaths'])
    # If a center has no files, then return empty list
    if allFiles.empty:
        logger.info("%s has not uploaded any files" % center)
        return ([])
    else:
        # Make sure the vcf validation statuses don't get wiped away
        if process != "vcf":
            addToQuery = "and name not like '%.vcf'"
        else:
            addToQuery = ''
        validationStatus = syn.tableQuery(
            "SELECT * FROM %s where center = '%s' %s" %
            (process_functions.getDatabaseSynId(
                syn, "validationStatus",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf), center,
             addToQuery))
        errorTracker = syn.tableQuery(
            "SELECT * FROM %s where center = '%s' %s" %
            (process_functions.getDatabaseSynId(
                syn, "errorTracker",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf), center,
             addToQuery))
        # VALIDATE FILES
        validationStatusDf = validationStatus.asDataFrame()
        errorTrackerDf = errorTracker.asDataFrame()
        validated = allFiles.apply(
            lambda x: validateFile(syn, validationStatusDf, errorTrackerDf,
                                   center, thread, x, testing, oncotreeLink),
            axis=1)
        # Flatten the per-file (status-rows, error-rows) pairs
        inputValidStatus = []
        invalidErrors = []
        for inputStat, invalErrors in validated:
            inputValidStatus.extend(inputStat)
            if invalErrors is not None:
                invalidErrors.extend(invalErrors)
        inputValidStatus = pd.DataFrame(inputValidStatus, columns=[
            "id", 'path', 'md5', 'status', 'name', 'modifiedOn', 'fileType'
        ])
        logger.info("CHECK FOR DUPLICATED FILES")
        ##### DUPLICATED FILES ######
        # check for duplicated filenames. There should be no duplication,
        # files should be uploaded as new versions and the entire dataset
        # should be uploaded everytime
        # cbs and seg files should not be duplicated. There can only be one
        duplicatedFiles = inputValidStatus[inputValidStatus['name'].duplicated(
            keep=False)]
        cbsSegBool = [
            os.path.basename(i).endswith('.cbs')
            or os.path.basename(i).endswith('.seg')
            for i in inputValidStatus['name']
        ]
        cbsSegFiles = inputValidStatus[cbsSegBool]
        if len(cbsSegFiles) > 1:
            # NOTE(review): DataFrame.append was removed in pandas 2.x;
            # pd.concat is the modern equivalent
            duplicatedFiles = duplicatedFiles.append(cbsSegFiles)
        # nodups = ["data_mutations_extended"]
        # allDuplicatedFiles = []
        # for nodup in nodups:
        #     checkDups = [name for name in inputValidStatus['name'] if name.startswith(nodup)]
        #     if len(checkDups) > 1:
        #         allDuplicatedFiles.extend(checkDups)
        # duplicatedFiles = duplicatedFiles.append(inputValidStatus[inputValidStatus['name'].isin(allDuplicatedFiles)])
        duplicatedFiles.drop_duplicates("id", inplace=True)
        inputValidStatus['status'][inputValidStatus['id'].isin(
            duplicatedFiles['id'])] = "INVALID"
        duplicatedFiles[
            'errors'] = "DUPLICATED FILENAME! FILES SHOULD BE UPLOADED AS NEW VERSIONS AND THE ENTIRE DATASET SHOULD BE UPLOADED EVERYTIME"
        # Send an email if there are any duplicated files
        if not duplicatedFiles.empty:
            incorrectFiles = ", ".join([
                name for synId, name in zip(duplicatedFiles['id'],
                                            duplicatedFiles['name'])
            ])
            # Notify whoever created/last modified the first offending entity
            incorrectEnt = syn.get(duplicatedFiles['id'].iloc[0])
            sendEmail = set([incorrectEnt.modifiedBy, incorrectEnt.createdBy])
            userNames = ", ".join(
                [syn.getUserProfile(user).userName for user in sendEmail])
            syn.sendMessage(
                list(sendEmail), "GENIE Validation Error",
                "Dear %s,\n\nYour files (%s) are duplicated! FILES SHOULD BE UPLOADED AS NEW VERSIONS AND THE ENTIRE DATASET SHOULD BE UPLOADED EVERYTIME"
                % (userNames, incorrectFiles))
        logger.info("THERE ARE %d DUPLICATED FILES" % len(duplicatedFiles))
        ##### DUPLICATED FILES ######
        # Create invalid error synapse table
        logger.info("UPDATE INVALID FILE REASON DATABASE")
        invalidErrors = pd.DataFrame(invalidErrors,
                                     columns=["id", 'errors', 'name'])
        # Remove fixed duplicated files: drop stale duplicate errors for
        # files that are no longer duplicated this run
        dupIds = invalidErrors['id'][
            invalidErrors['errors'] ==
            "DUPLICATED FILENAME! FILES SHOULD BE UPLOADED AS NEW VERSIONS AND THE ENTIRE DATASET SHOULD BE UPLOADED EVERYTIME"]
        removeIds = dupIds[~dupIds.isin(duplicatedFiles['id'])]
        invalidErrors = invalidErrors[~invalidErrors['id'].isin(removeIds)]
        # Append duplicated file errors
        invalidErrors = invalidErrors.append(
            duplicatedFiles[['id', 'errors', 'name']])
        invalidErrors['center'] = center
        invalidIds = inputValidStatus['id'][inputValidStatus['status'] ==
                                            "INVALID"]
        invalidErrors = invalidErrors[invalidErrors['id'].isin(invalidIds)]
        process_functions.updateDatabase(
            syn, errorTracker.asDataFrame(), invalidErrors,
            process_functions.getDatabaseSynId(
                syn, "errorTracker",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf),
            ["id"], toDelete=True)
        # Set paths aside -- the status table does not store them
        paths = inputValidStatus['path']
        filenames = [os.path.basename(name) for name in paths]
        del inputValidStatus['path']
        logger.info("UPDATE VALIDATION STATUS DATABASE")
        inputValidStatus['center'] = center
        # Remove fixed duplicated files
        inputValidStatus = inputValidStatus[~inputValidStatus['id'].
                                            isin(removeIds)]
        process_functions.updateDatabase(
            syn, validationStatus.asDataFrame(), inputValidStatus[[
                "id", 'md5', 'status', 'name', 'center', 'modifiedOn'
            ]],
            process_functions.getDatabaseSynId(
                syn, "validationStatus",
                databaseToSynIdMappingDf=databaseToSynIdMappingDf),
            ["id"], toDelete=True)
        # Restore paths for the returned dataframe
        inputValidStatus['path'] = paths
        validFiles = inputValidStatus[[
            'id', 'path', 'fileType'
        ]][inputValidStatus['status'] == "VALIDATED"]
        return (validFiles)
def _validate(self, assay_info_df, project_id):
    """
    Validates the values of assay information file.

    Checks, in order: SEQ_ASSAY_ID naming/coverage against the sample
    database, GDC read-group enumerations (paired end, library selection,
    library strategy, platform, instrument model), presence of
    target_capture_kit, variant classifications, integer columns
    (read_length, number_of_genes, gene_padding), calling strategy,
    specimen tumor cellularity format, alteration types, preservation
    technique and coverage.  Errors and warnings are accumulated as
    newline-separated strings.

    Args:
        assay_info_df: assay information dataframe
        project_id: Synapse project id used to look up the
            database-to-synid mapping (and from it the sample table)

    Returns:
        tuple: error and warning
    """
    total_error = ""
    warning = ""
    if process_functions.checkColExist(assay_info_df, "SEQ_ASSAY_ID"):
        # Normalize SEQ_ASSAY_IDs: underscores become dashes, uppercase
        all_seq_assays = (assay_info_df.SEQ_ASSAY_ID.replace(
            {
                "_": "-"
            }, regex=True).str.upper().unique())
        # Every assay id must be prefixed with the submitting center's code
        if not all(
                [assay.startswith(self.center) for assay in all_seq_assays]):
            total_error += (
                "Assay_information.yaml: Please make sure all your "
                "SEQ_ASSAY_IDs start with your center abbreviation.\n")
        db_to_syn_map_df = process_functions.get_synid_database_mappingdf(
            self.syn, project_id)
        sample_synid = process_functions.getDatabaseSynId(
            self.syn, "sample", databaseToSynIdMappingDf=db_to_syn_map_df)
        uniq_seq_df = process_functions.get_syntabledf(
            self.syn,
            f"select distinct(SEQ_ASSAY_ID) as seq from {sample_synid} "
            f"where CENTER = '{self.center}'",
        )
        # These are all the SEQ_ASSAY_IDs that are in the clinical database
        # but not in the assay_information file
        missing_seqs = uniq_seq_df["seq"][
            ~uniq_seq_df["seq"].replace({
                "_": "-"
            }, regex=True).str.upper().isin(all_seq_assays)]
        missing_seqs_str = ", ".join(missing_seqs)
        if missing_seqs.to_list():
            total_error += (
                "Assay_information.yaml: You are missing SEQ_ASSAY_IDs: "
                f"{missing_seqs_str}\n")
    else:
        total_error += "Assay_information.yaml: Must have SEQ_ASSAY_ID column.\n"

    # Pull the GDC read_group data dictionary; its "enum" lists define the
    # allowed values for several columns below
    read_group_dict = process_functions.get_gdc_data_dictionary(
        "read_group")
    read_group_headers = read_group_dict["properties"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "is_paired_end",
        [True, False],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "library_selection",
        read_group_headers["library_selection"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "library_strategy",
        read_group_headers["library_strategy"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "platform",
        read_group_headers["platform"]["enum"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    # Allow two extra values beyond the GDC enum.
    # NOTE(review): this mutates the list inside read_group_headers in
    # place — if get_gdc_data_dictionary caches its result, the enum grows
    # on every call; confirm the helper returns a fresh dict.
    instrument_model = read_group_headers["instrument_model"]["enum"]
    instrument_model.extend(["Illumina NovaSeq 6000", None])
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "instrument_model",
        instrument_model,
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    # target_capture_kit = read_group_headers['target_capture_kit']['enum']
    # warn, error = process_functions.check_col_and_values(
    #     assay_info_df,
    #     'target_capture_kit',
    #     target_capture_kit,
    #     filename="Assay_information.yaml",
    #     required=True)
    # warning += warn
    # total_error += error
    # Only the column's presence is validated, not its values
    if not process_functions.checkColExist(assay_info_df,
                                           "target_capture_kit"):
        total_error += ("Assay_information.yaml: "
                        "Must have target_capture_kit column.\n")

    # MAF-style variant classifications; None allows a blank entry
    variant_classes = [
        "Splice_Site",
        "Nonsense_Mutation",
        "Frame_Shift_Del",
        "Frame_Shift_Ins",
        "Nonstop_Mutation",
        "Translation_Start_Site",
        "In_Frame_Ins",
        "In_Frame_Del",
        "Missense_Mutation",
        "Intron",
        "Splice_Region",
        "Silent",
        "RNA",
        "5'UTR",
        "3'UTR",
        "IGR",
        "5'Flank",
        "3'Flank",
        None,
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "variant_classifications",
        variant_classes,
        filename="Assay_information.yaml",
        na_allowed=True,
        sep=";",
    )
    warning += warn
    total_error += error

    # read_length: integer or null allowed
    if process_functions.checkColExist(assay_info_df, "read_length"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["read_length"]
                if i is not None and not pd.isnull(i)
        ]):
            total_error += ("Assay_information.yaml: "
                            "Please double check your read_length. "
                            "It must be an integer or null.\n")
    else:
        total_error += "Assay_information.yaml: " "Must have read_length column.\n"

    # number_of_genes: integer required (nulls not skipped here)
    if process_functions.checkColExist(assay_info_df, "number_of_genes"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["number_of_genes"]
        ]):
            total_error += ("Assay_information.yaml: "
                            "Please double check your number_of_genes. "
                            "It must be an integer.\n")
    else:
        total_error += ("Assay_information.yaml: "
                        "Must have number_of_genes column.\n")

    # gene_padding: integer or blank; a missing column is only a warning
    if process_functions.checkColExist(assay_info_df, "gene_padding"):
        if not all([
                process_functions.checkInt(i)
                for i in assay_info_df["gene_padding"]
                if i is not None and not pd.isnull(i)
        ]):
            total_error += ("Assay_information.yaml: "
                            "Please double check your gene_padding. "
                            "It must be an integer or blank.\n")
    else:
        warning += ("Assay_information.yaml: "
                    "gene_padding is by default 10 if not specified.\n")

    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "calling_strategy",
        ["tumor_only", "tumor_normal", "plasma_normal"],
        filename="Assay_information.yaml",
        required=True,
    )
    warning += warn
    total_error += error

    # specimen_tumor_cellularity must look like ">NN%"
    # NOTE(review): startswith/endswith assume every value is a string;
    # a NaN or numeric entry would raise AttributeError here — confirm
    # upstream parsing always yields strings for this column.
    if process_functions.checkColExist(assay_info_df,
                                       "specimen_tumor_cellularity"):
        if not all([
                i.startswith(">") and i.endswith("%")
                for i in assay_info_df["specimen_tumor_cellularity"]
        ]):
            total_error += (
                "Assay_information.yaml: "
                "Please double check your specimen_tumor_cellularity. "
                "It must in this format >(num)%. ie. >10%\n")
    else:
        total_error += ("Assay_information.yaml: "
                        "Must have specimen_tumor_cellularity column.\n")

    # Semicolon-separated multi-value columns
    alteration_types = [
        "snv",
        "small_indels",
        "gene_level_cna",
        "intragenic_cna",
        "structural_variants",
    ]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "alteration_types",
        alteration_types,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    preservation_technique = ["FFPE", "fresh_frozen", "NA"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "preservation_technique",
        preservation_technique,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    coverage = ["hotspot_regions", "coding_exons", "introns", "promoters"]
    warn, error = process_functions.check_col_and_values(
        assay_info_df,
        "coverage",
        coverage,
        filename="Assay_information.yaml",
        required=True,
        sep=";",
    )
    warning += warn
    total_error += error

    return total_error, warning
def _validate(self, cnvDF, nosymbol_check, project_id):
    """Validate a CNA (copy number alteration) file.

    Checks that the first column is Hugo_Symbol, that all copy-number
    values are one of the allowed discrete levels, and (unless
    nosymbol_check is set) that remapping symbols against the center's
    bed table produces no duplicated genes.

    Args:
        cnvDF: CNA dataframe; first column must be Hugo_Symbol.
        nosymbol_check: If True, skip the Hugo_Symbol remapping check.
        project_id: Synapse project id used to locate the bed table.

    Returns:
        tuple: (total_error, warning) newline-separated strings.
    """
    total_error = ""
    warning = ""
    cnvDF.columns = [col.upper() for col in cnvDF.columns]
    if cnvDF.columns[0] != "HUGO_SYMBOL":
        total_error += "Your cnv file's first column must be Hugo_Symbol\n"
    haveColumn = process_functions.checkColExist(cnvDF, "HUGO_SYMBOL")
    if haveColumn:
        # Set the symbols aside so only the numeric columns are checked
        keepSymbols = cnvDF["HUGO_SYMBOL"]
        cnvDF.drop("HUGO_SYMBOL", axis=1, inplace=True)
        # if sum(cnvDF.apply(lambda x: sum(x.isnull()))) > 0:
        #     total_error += "Your cnv file must not have any empty values\n"
        if process_functions.checkColExist(cnvDF, "ENTREZ_GENE_ID"):
            del cnvDF["ENTREZ_GENE_ID"]
        # cnvDF = cnvDF.fillna('')
        # Values are compared as strings, so both int-like and float-like
        # representations of each allowed level must be listed; "nan"
        # covers NA/blank cells.
        # BUG FIX: "-0.5" was missing even though the error message below
        # documents it as an allowed value, so valid -0.5 entries were
        # rejected.
        allowed_values = [
            "-2.0",
            "-2",
            "-1.5",
            "-1.0",
            "-1",
            "-0.5",
            "0.0",
            "0",
            "0.5",
            "1.0",
            "1",
            "1.5",
            "2",
            "2.0",
            "nan",
        ]
        if not all(cnvDF.applymap(lambda x: str(x) in allowed_values).all()):
            total_error += ("All values must be NA/blank, -2, -1.5, -1, -0.5, "
                            "0, 0.5, 1, 1.5, or 2.\n")
        else:
            # Restore the symbol column for the remapping check below
            cnvDF["HUGO_SYMBOL"] = keepSymbols
    if haveColumn and not nosymbol_check:
        databaseToSynIdMappingDf = (
            process_functions.get_synid_database_mappingdf(
                self.syn, project_id))
        bedSynId = process_functions.getDatabaseSynId(
            self.syn, "bed", databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        # Only symbols present in this center's bed table are considered
        bed = self.syn.tableQuery(
            "select Hugo_Symbol, ID from {} where "
            "CENTER = '{}'".format(bedSynId, self.center))
        bedDf = bed.asDataFrame()
        cnvDF["remapped"] = cnvDF["HUGO_SYMBOL"].apply(
            lambda x: validateSymbol(x, bedDf))
        cnvDF = cnvDF[~cnvDF["remapped"].isnull()]
        # Do not allow any duplicated genes after symbols
        # have been remapped
        if sum(cnvDF["remapped"].duplicated()) > 0:
            duplicated = cnvDF["remapped"].duplicated(keep=False)
            total_error += (
                "Your CNA file has duplicated Hugo_Symbols "
                "(After remapping of genes): {} -> {}.\n".format(
                    ",".join(cnvDF["HUGO_SYMBOL"][duplicated]),
                    ",".join(cnvDF["remapped"][duplicated]),
                ))
    return (total_error, warning)
def main(
    process,
    project_id,
    center=None,
    pemfile=None,
    delete_old=False,
    only_validate=False,
    oncotree_link=None,
    genie_annotation_pkg=None,
    create_new_maf_database=False,
    debug=False,
    format_registry=None,
):
    """Validate and/or process GENIE center input files into the database.

    Args:
        process: Processing type to run.
        project_id: Synapse id of the GENIE project; its 'dbMapping'
            annotation points to the database-to-synid mapping table.
        center: Single center to run; all release centers if None.
        pemfile: Path to the Synapse private key file.
        delete_old: Delete outdated processed files.
        only_validate: Only validate input files, do not process them.
        oncotree_link: Oncotree URL; looked up in the database if None.
        genie_annotation_pkg: Path to the annotation tools package.
        create_new_maf_database: Create and archive a new narrow MAF table.
        debug: Enable Synapse client debug logging.
        format_registry: Packages from which file-format classes are
            collected.

    Raises:
        Exception: If another processing/validation run is in progress
            (the center mapping entity's 'isProcessing' annotation is True).
    """
    syn = process_functions.synLogin(pemfile, debug=debug)
    # Get the Synapse Project where data is stored
    # Should have annotations to find the table lookup
    project = syn.get(project_id)
    database_to_synid_mapping_synid = project.annotations.get("dbMapping", "")
    databaseToSynIdMapping = syn.tableQuery(
        "SELECT * FROM {}".format(database_to_synid_mapping_synid[0])
    )
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

    center_mapping_id = process_functions.getDatabaseSynId(
        syn, "centerMapping", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )
    center_mapping = syn.tableQuery("SELECT * FROM %s" % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()
    if center is not None:
        assert (
            center in center_mapping_df.center.tolist()
        ), "Must specify one of these centers: {}".format(
            ", ".join(center_mapping_df.center)
        )
        centers = [center]
    else:
        # exclude_sites = ['JHU', 'DFCI', 'GRCC', 'VICC', 'NKI', 'MSK',
        #                  'UHN', 'MDA', 'WAKE', 'YALE', 'UCSF', 'CRUK',
        #                  'CHOP', 'VHIO', 'SCI', 'PHS', 'COLU', 'UCHI']
        center_mapping_df = center_mapping_df[~center_mapping_df["inputSynId"].isnull()]
        # release is a bool column
        center_mapping_df = center_mapping_df[center_mapping_df["release"]]
        centers = center_mapping_df.center

    if oncotree_link is None:
        onco_link = databaseToSynIdMappingDf["Id"][
            databaseToSynIdMappingDf["Database"] == "oncotreeLink"
        ].values[0]
        onco_link_ent = syn.get(onco_link)
        oncotree_link = onco_link_ent.externalURL
    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)

    # The 'isProcessing' annotation acts as a lock so only one
    # processing/validation run happens at a time
    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get("isProcessing", ["True"])[0] == "True":
        raise Exception(
            "Processing/validation is currently happening. "
            "Please change/add the 'isProcessing' annotation on {} "
            "to False to enable processing".format(center_mapping_id)
        )
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)

    # remove this query timeout and see what happens
    # syn.table_query_timeout = 50000

    try:
        # Create new maf database, should only happen once if its specified
        if create_new_maf_database:
            today = date.today()
            table_name = f"Narrow MAF Database - {today}"
            # filetype = "vcf2maf"
            # syn7208886 is the GENIE staging project to archive maf table
            new_tables = process_functions.create_new_fileformat_table(
                syn, "vcf2maf", table_name, project_id, "syn7208886"
            )
            # Remove can-download permissions from project GENIE team
            syn.setPermissions(new_tables["newdb_ent"].id, 3326313, [])
            databaseToSynIdMappingDf = new_tables["newdb_mappingdf"]

        # BUG FIX: previously this read `args.format_registry_packages`,
        # but `args` is not defined in this function (NameError); the
        # packages are passed in via the `format_registry` parameter.
        format_registry = config.collect_format_types(format_registry)

        for process_center in centers:
            input_to_database.center_input_to_database(
                syn,
                project_id,
                process_center,
                process,
                only_validate,
                databaseToSynIdMappingDf,
                center_mapping_df,
                delete_old=delete_old,
                oncotree_link=oncotree_link,
                format_registry=format_registry,
                genie_annotation_pkg=genie_annotation_pkg,
            )
    finally:
        # Always release the processing lock, even on failure
        # (matches the try/finally behavior of the legacy main()).
        # To ensure that this is the new entity
        center_mapping_ent = syn.get(center_mapping_id)
        center_mapping_ent.isProcessing = "False"
        center_mapping_ent = syn.store(center_mapping_ent)

    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )
    # Only write out invalid reasons if the center
    # isnt specified and if only validate
    if center is None and only_validate:
        logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write(syn, center_mapping_df, error_tracker_synid)
    logger.info("INPUT TO DATABASE COMPLETE")
def main(process,
         project_config=None,
         center=None,
         pemfile=None,
         delete_old=False,
         only_validate=False,
         oncotree_link=None,
         create_new_maf_database=False,
         testing=False,
         debug=False,
         reference=None,
         vcf2maf_path=None,
         vep_path=None,
         vep_data=None,
         thread=1,
         format_registry=config.PROCESS_FILES):
    """Validate and/or process GENIE center input files (legacy entry point).

    Uses an 'isProcessing' annotation on the center mapping table as a
    lock; the lock is released in the finally block even if processing
    fails.

    Args:
        process: Processing type (e.g. 'vcf', 'maf', 'mafSP', ...).
        project_config: Mapping that provides the
            'database_to_synid_mapping' table synid.
        center: Single center to run; all release centers if None.
        pemfile: Path to the Synapse private key file.
        delete_old: Delete outdated processed files.
        only_validate: Only validate input files, do not process them.
        oncotree_link: Oncotree URL; looked up in the database if None.
        create_new_maf_database: Create and archive a new narrow MAF table.
        testing: Run against the test configuration.
        debug: Enable Synapse client debug logging.
        reference: Reference genome fasta path.
        vcf2maf_path: Path to vcf2maf; required for vcf/maf/mafSP processing.
        vep_path: Path to VEP; required for vcf/maf/mafSP processing.
        vep_data: Path to VEP data; required for vcf/maf/mafSP processing.
        thread: Number of threads for annotation.
        format_registry: Registry of file formats to process.
    """
    syn = process_functions.synLogin(pemfile, debug=debug)
    try:
        # Must specify correct paths to vcf2maf, VEP and VEP data
        # if trying to process vcf, maf and mafSP
        if process in ['vcf', 'maf', 'mafSP'] and not only_validate:
            assert os.path.exists(vcf2maf_path), (
                "Path to vcf2maf (--vcf2mafPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert os.path.exists(vep_path), (
                "Path to VEP (--vepPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert os.path.exists(vep_data), (
                "Path to VEP data (--vepData) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")

        databaseToSynIdMapping = syn.tableQuery(
            'SELECT * FROM {}'.format(
                project_config.get('database_to_synid_mapping')))
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()

        center_mapping_id = process_functions.getDatabaseSynId(
            syn, "centerMapping",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        center_mapping = syn.tableQuery('SELECT * FROM %s' % center_mapping_id)
        center_mapping_df = center_mapping.asDataFrame()

        if center is not None:
            assert center in center_mapping_df.center.tolist(), (
                "Must specify one of these centers: {}".format(
                    ", ".join(center_mapping_df.center)))
            centers = [center]
        else:
            center_mapping_df = \
                center_mapping_df[~center_mapping_df['inputSynId'].isnull()]
            # release is a bool column
            center_mapping_df = center_mapping_df[center_mapping_df['release']]
            centers = center_mapping_df.center

        if oncotree_link is None:
            onco_link = databaseToSynIdMappingDf['Id'][
                databaseToSynIdMappingDf['Database'] ==
                'oncotreeLink'].values[0]
            onco_link_ent = syn.get(onco_link)
            oncotree_link = onco_link_ent.externalURL
        # Check if you can connect to oncotree link,
        # if not then don't run validation / processing
        process_functions.checkUrl(oncotree_link)

        currently_processing = get_processing_status(syn, center_mapping_id)
        if currently_processing:
            logger.error(
                "Processing/validation is currently happening. "
                "Please change/add the 'isProcessing' annotation on {} "
                "to False to enable processing".format(center_mapping_id))
            sys.exit(1)
        else:
            # NOTE(review): `status` is assigned but never used
            status = set_processing_status(syn, center_mapping_id, status=True)

        # remove this query timeout and see what happens
        # syn.table_query_timeout = 50000

        # Create new maf database, should only happen once if its specified
        if create_new_maf_database:
            databaseToSynIdMappingDf = \
                input_to_database.create_and_archive_maf_database(
                    syn, databaseToSynIdMappingDf)

        # NOTE(review): `args` is not defined in this function, so this
        # line raises NameError when reached; presumably it should use
        # the `format_registry` parameter (cf. the other main()), but
        # that parameter defaults to config.PROCESS_FILES, not a package
        # list — confirm the intended fix before changing.
        format_registry = config.collect_format_types(args.format_registry_packages)
        logger.debug("Using {format_registry} file formats.".format(
            format_registry=format_registry))

        for center in centers:
            input_to_database.center_input_to_database(
                syn, center, process, testing, only_validate,
                vcf2maf_path, vep_path, vep_data,
                databaseToSynIdMappingDf, center_mapping_df,
                reference=reference,
                delete_old=delete_old,
                oncotree_link=oncotree_link,
                thread=thread,
                format_registry=format_registry)

        # To ensure that this is the new entity
        center_mapping_ent = syn.get(center_mapping_id)
        center_mapping_ent.isProcessing = "False"
        center_mapping_ent = syn.store(center_mapping_ent)

        error_tracker_synid = process_functions.getDatabaseSynId(
            syn, "errorTracker",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        # Only write out invalid reasons if the center
        # isnt specified and if only validate
        if center is None and only_validate:
            logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
            write_invalid_reasons.write_invalid_reasons(
                syn, center_mapping_df, error_tracker_synid)
    except Exception as e:
        # Re-raised as-is; kept so the finally block documents intent
        raise e
    finally:
        # Always release the processing lock.
        # NOTE(review): if an exception occurs before center_mapping_id
        # is assigned (e.g. the tableQuery above fails), this raises
        # NameError/UnboundLocalError — confirm whether that path matters.
        _ = set_processing_status(syn, center_mapping_id, status=False)