def main():
    """Command-line entry point: parse options, log in to Synapse, retract."""
    cli = argparse.ArgumentParser(description='Sample retraction')
    cli.add_argument("--pemFile", type=str,
                     help="Path to PEM file (genie.pem)")
    cli.add_argument("--test", action='store_true', help="Run test")
    cli.add_argument("--debug", action='store_true',
                     help="Synapse Debug Feature")
    opts = cli.parse_args()
    # Authenticate against Synapse, then run the retraction routine.
    syn = process_functions.synLogin(opts.pemFile, debug=opts.debug)
    retract(syn, opts.test)
def main():
    """Command-line entry point: write invalid reasons for releasing centers."""
    cli = argparse.ArgumentParser(description='Write invalid reasons')
    cli.add_argument("--pemFile", type=str,
                     help="Path to PEM file (genie.pem)")
    cli.add_argument("--debug", action='store_true',
                     help="Synapse Debug Feature")
    opts = cli.parse_args()
    syn = process_functions.synLogin(opts.pemFile, debug=opts.debug)
    # Only centers that have an input folder and are flagged for release.
    center_mapping = syn.tableQuery(
        'SELECT * FROM syn10061452 where inputSynId is not null and release is true'
    )
    center_mappingdf = center_mapping.asDataFrame()
    error_tracker_synid = "syn10153306"
    write_invalid_reasons(syn, center_mappingdf, error_tracker_synid)
def main():
    """Command-line entry point: build dashboard tables for a GENIE release."""
    parser = argparse.ArgumentParser(description="Update dashboard tables")
    parser.add_argument("--release",
                        help="GENIE release number (ie. 5.3-consortium)",
                        default=None)
    parser.add_argument("--pem_file", type=str,
                        help="Path to PEM file (genie.pem)")
    parser.add_argument("--staging", action="store_true",
                        help="Using staging directory files")
    parser.add_argument("--debug", action="store_true",
                        help="Synapse debugging flag")
    parser.add_argument("--public", action="store_true",
                        help="Set true if releasing public release")
    args = parser.parse_args()
    # NOTE(review): other entry points in this codebase call
    # synLogin(args.pemFile, debug=args.debug); here the whole argparse
    # namespace is passed instead — confirm synLogin accepts a Namespace.
    syn = process_functions.synLogin(args)
    if args.staging:
        # Database to Synapse Id mapping Table
        database_mapping_synid = "syn12094210"
    else:
        database_mapping_synid = "syn10967259"
    database_mapping = syn.tableQuery(
        "select * from %s" % database_mapping_synid)
    database_mappingdf = database_mapping.asDataFrame()
    run_dashboard(syn, database_mappingdf, args.release,
                  staging=args.staging, public=args.public)
def main():
    """Set up argument parser and returns

    Command-line entry point for GENIE center input-to-database processing:
    validates (and optionally processes) each center's uploaded files, using
    an 'isProcessing' annotation on the center mapping entity as a mutex.
    """
    parser = argparse.ArgumentParser(
        description='GENIE center inputs to database')
    parser.add_argument("process", choices=['vcf', 'maf', 'main', 'mafSP'],
                        help='Process vcf, maf or the rest of the files')
    parser.add_argument('--center', help='The centers')
    parser.add_argument("--pemFile", type=str,
                        help="Path to PEM file (genie.pem)")
    parser.add_argument("--deleteOld", action='store_true',
                        help="Delete all old processed and temp files")
    parser.add_argument("--onlyValidate", action='store_true',
                        help="Only validate the files, don't process")
    parser.add_argument("--oncotreeLink", type=str,
                        help="Link to oncotree code")
    parser.add_argument("--createNewMafDatabase", action='store_true',
                        help="Creates a new maf database")
    parser.add_argument("--testing", action='store_true',
                        help="Testing the infrastructure!")
    parser.add_argument("--debug", action='store_true',
                        help="Add debug mode to synapse")
    parser.add_argument("--reference", type=str,
                        help="Path to VCF reference file")
    #DEFAULT PARAMS
    parser.add_argument("--vcf2mafPath", type=str, help="Path to vcf2maf",
                        default="~/vcf2maf-1.6.14")
    parser.add_argument("--vepPath", type=str, help="Path to VEP",
                        default="~/vep")
    parser.add_argument("--vepData", type=str, help="Path to VEP data",
                        default="~/.vep")
    parser.add_argument('--thread', type=int,
                        help="Number of threads to use for validation",
                        default=1)
    args = parser.parse_args()
    syn = process_functions.synLogin(args.pemFile, debug=args.debug)
    #Must specify path to vcf2maf, VEP and VEP data is these types are specified
    # NOTE(review): these asserts are stripped under `python -O`; explicit
    # raises would be more robust for input validation.
    if args.process in ['vcf', 'maf', 'mafSP'] and not args.onlyValidate:
        assert os.path.exists(args.vcf2mafPath), "Path to vcf2maf (--vcf2mafPath) must be specified if `--process {vcf,maf,mafSP}` is used"
        assert os.path.exists(args.vepPath), "Path to VEP (--vepPath) must be specified if `--process {vcf,maf,mafSP}` is used"
        assert os.path.exists(args.vepData), "Path to VEP data (--vepData) must be specified if `--process {vcf,maf,mafSP}` is used"
    # Test vs production database-to-synId mapping tables.
    if args.testing:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn11600968')
    else:
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM syn10967259')
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
    center_mapping_id = process_functions.getDatabaseSynId(
        syn, "centerMapping",
        databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    center_mapping = syn.tableQuery('SELECT * FROM %s' % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()
    if args.center is not None:
        assert args.center in center_mapping_df.center.tolist(), \
            "Must specify one of these centers: %s" % ", ".join(
                center_mapping_df.center)
        centers = [args.center]
    else:
        # No center specified: process every releasable center with an
        # input folder.
        center_mapping_df = center_mapping_df[
            ~center_mapping_df['inputSynId'].isnull()]
        center_mapping_df = center_mapping_df[
            center_mapping_df['release'] == True]
        centers = center_mapping_df.center
    if args.oncotreeLink is None:
        # Fall back to the oncotree URL recorded in the mapping table.
        onco_link = databaseToSynIdMappingDf['Id'][
            databaseToSynIdMappingDf['Database'] == 'oncotreeLink'].values[0]
        onco_link_ent = syn.get(onco_link)
        args.oncotreeLink = onco_link_ent.externalURL
    #Check if you can connect to oncotree link, if not then don't run validation / processing
    process_functions.checkUrl(args.oncotreeLink)
    # Poor-man's mutex: the 'isProcessing' annotation guards against two
    # concurrent pipeline runs.
    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get('isProcessing', ['True'])[0] == 'True':
        raise Exception(
            "Processing/validation is currently happening. Please change/add the 'isProcessing' annotation on %s to False to enable processing" % center_mapping_id)
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)
    #remove this query timeout and see what happens
    #syn.table_query_timeout = 50000
    #Create new maf database, should only happen once if its specified
    if args.createNewMafDatabase:
        createMafDatabase(syn, databaseToSynIdMappingDf, testing=args.testing)
    for center in centers:
        input_to_database(syn, center, args.process, args.testing,
                          args.onlyValidate, args.vcf2mafPath, args.vepPath,
                          args.vepData, databaseToSynIdMappingDf,
                          center_mapping_df, reference=args.reference,
                          delete_old=args.deleteOld,
                          oncotree_link=args.oncotreeLink,
                          thread=args.thread)
    # To ensure that this is the new entity
    # NOTE(review): if input_to_database raises, the mutex is never reset —
    # there is no try/finally here (a later revision of this entry point
    # would need one).
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "False"
    center_mapping_ent = syn.store(center_mapping_ent)
    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker",
        databaseToSynIdMappingDf=databaseToSynIdMappingDf)
    #Only write out invalid reasons if the center isnt specified and if only validate
    if args.center is None and args.onlyValidate:
        logging.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write_invalid_reasons(
            syn, center_mapping_df, error_tracker_synid)
def main(args):
    """Run the GENIE consortium-to-public release workflow.

    Steps: parameter checks, process-tracker start, consortium-to-public
    conversion, metadata revision, cBioPortal validation, link-version
    creation, process-tracker end, dashboard update, and data-guide upload.

    Args:
        args: argparse.Namespace with cbioportalPath, test, staging,
            processingDate, pemFile, debug, genieVersion,
            publicReleaseCutOff (per the attribute accesses below).
    """
    cbioValidatorPath = os.path.join(
        args.cbioportalPath, "core/src/main/scripts/importer/validateData.py"
    )
    assert os.path.exists(cbioValidatorPath), "Please specify correct cbioportalPath"
    assert not (
        args.test and args.staging
    ), "You can only specify --test or --staging, not both"
    try:
        processingDate = datetime.datetime.strptime(args.processingDate, "%b-%Y")
    except ValueError:
        raise ValueError(
            "Process date must be in the format "
            "abbreviated_month-YEAR ie. Oct-2017"
        )
    syn = process_functions.synLogin(args.pemFile, debug=args.debug)
    genie_user = os.environ.get("GENIE_USER")
    if args.pemFile is not None:
        genie_pass = process_functions.get_password(args.pemFile)
    else:
        genie_pass = None
    # Get all the possible public releases
    # Get configuration
    if args.test:
        databaseSynIdMappingId = "syn11600968"
        args.genieVersion = "TESTpublic"
    elif args.staging:
        databaseSynIdMappingId = "syn12094210"
    else:
        databaseSynIdMappingId = "syn10967259"
    databaseSynIdMapping = syn.tableQuery(
        "select * from %s" % databaseSynIdMappingId)
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()
    public_synid = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "public"
    ].values[0]
    releaseSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "release"
    ].values[0]
    officialPublic = consortium_to_public.get_public_to_consortium_synid_mapping(
        syn, releaseSynId, test=args.test
    )
    assert (
        args.genieVersion in officialPublic.keys()
    ), "genieVersion must be one of these: {}.".format(
        ", ".join(officialPublic.keys()))
    args.releaseId = officialPublic[args.genieVersion]
    # Record processing start time (skipped for test/staging runs).
    if not args.test and not args.staging:
        processTrackerSynId = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "processTracker"
        ].values[0]
        processTracker = syn.tableQuery(
            "SELECT timeStartProcessing FROM %s where center = 'SAGE' "
            "and processingType = 'public'" % processTrackerSynId
        )
        processTrackerDf = processTracker.asDataFrame()
        # Epoch milliseconds, stored as a string.
        processTrackerDf["timeStartProcessing"].iloc[0] = str(
            int(time.time() * 1000))
        syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))
    caseListEntities, genePanelEntities = consortium_to_public.consortiumToPublic(
        syn,
        processingDate,
        args.genieVersion,
        args.releaseId,
        databaseSynIdMappingDf,
        publicReleaseCutOff=args.publicReleaseCutOff,
    )
    database_to_staging.revise_metadata_files(
        syn, args.staging, public_synid, args.genieVersion
    )
    logger.info("CBIO VALIDATION")
    # Must be exit 0 because the validator sometimes fails,
    # but we still want to capture the output
    command = [
        cbioValidatorPath,
        "-s",
        database_to_staging.GENIE_RELEASE_DIR,
        "-n",
        "; exit 0",
    ]
    cbio_output = subprocess.check_output(" ".join(command), shell=True)
    cbio_decoded_output = cbio_output.decode("utf-8")
    logger.info(cbio_decoded_output)
    # Archive the validator log in Synapse for real (non-test) runs.
    if not args.test and not args.staging:
        log_folder_synid = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "logs"
        ].values[0]
        # Use tempfiles
        cbio_log_file = "cbioValidatorLogsPublic_{}.txt".format(
            args.genieVersion)
        with open(cbio_log_file, "w") as cbioLog:
            cbioLog.write(cbio_decoded_output)
        syn.store(synapseclient.File(cbio_log_file, parentId=log_folder_synid))
        os.remove(cbio_log_file)
    logger.info("REMOVING OLD FILES")
    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    seg_meta_file = "{}/genie_public_meta_cna_hg19_seg.txt".format(
        database_to_staging.GENIE_RELEASE_DIR
    )
    if os.path.exists(seg_meta_file):
        os.unlink(seg_meta_file)
    logger.info("CREATING LINK VERSION")
    folders = database_to_staging.create_link_version(
        syn,
        args.genieVersion,
        caseListEntities,
        genePanelEntities,
        databaseSynIdMappingDf,
        release_type="public",
    )
    # Don't update process tracker is testing or staging
    if not args.test and not args.staging:
        # processTrackerSynId was bound in the start-time branch above,
        # under the same condition.
        processTracker = syn.tableQuery(
            "SELECT timeEndProcessing FROM %s where center = 'SAGE' and "
            "processingType = 'public'" % processTrackerSynId
        )
        processTrackerDf = processTracker.asDataFrame()
        processTrackerDf["timeEndProcessing"].iloc[0] = str(
            int(time.time() * 1000))
        syn.store(synapseclient.Table(processTrackerSynId, processTrackerDf))
    if not args.test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(
            syn, databaseSynIdMappingDf, args.genieVersion,
            staging=args.staging
        )
        generate_dashboard_html(
            args.genieVersion,
            staging=args.staging,
            genie_user=genie_user,
            genie_pass=genie_pass,
        )
        logger.info("DASHBOARD UPDATE COMPLETE")
        logger.info("AUTO GENERATE DATA GUIDE")
    # NOTE(review): original indentation was lost in this file; the data
    # guide section is reconstructed at function level (it runs for every
    # mode) — confirm against version control.
    onco_link = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "oncotreeLink"
    ].values[0]
    onco_link_ent = syn.get(onco_link)
    oncotree_link = onco_link_ent.externalURL
    # Assumes the oncotree URL carries the version as its query value
    # (e.g. ...?version=oncotree_2019_12_01).
    oncotree_version = oncotree_link.split("=")[1]
    data_guide_pdf = generate_data_guide(
        args.genieVersion,
        oncotree_version=oncotree_version,
        database_mapping=databaseSynIdMappingId,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )
    data_guide_ent = synapseclient.File(
        data_guide_pdf, parent=folders["release_folder"]
    )
    syn.store(data_guide_ent)
    logger.info("COMPLETED CONSORTIUM TO PUBLIC")
def main(
    process,
    project_id,
    center=None,
    pemfile=None,
    delete_old=False,
    only_validate=False,
    oncotree_link=None,
    genie_annotation_pkg=None,
    create_new_maf_database=False,
    debug=False,
    format_registry=None,
):
    """Validate and/or process GENIE center input files into the database.

    Args:
        process: Processing type (e.g. 'main', 'mutation').
        project_id: Synapse id of the GENIE project; its 'dbMapping'
            annotation points at the database-to-synid mapping table.
        center: Single center to process; default processes all releasable
            centers.
        pemfile: Path to PEM file for Synapse login.
        delete_old: Delete all old processed and temp files.
        only_validate: Only validate the files, don't process.
        oncotree_link: Link to oncotree code; defaults to the URL recorded
            in the database mapping table.
        genie_annotation_pkg: Path to annotation tools.
        create_new_maf_database: Create a new narrow MAF database.
        debug: Synapse debug flag.
        format_registry: File-format registry package names passed to
            config.collect_format_types.
    """
    syn = process_functions.synLogin(pemfile, debug=debug)
    # Get the Synapse Project where data is stored
    # Should have annotations to find the table lookup
    project = syn.get(project_id)
    database_to_synid_mapping_synid = project.annotations.get("dbMapping", "")
    databaseToSynIdMapping = syn.tableQuery(
        "SELECT * FROM {}".format(database_to_synid_mapping_synid[0])
    )
    databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
    center_mapping_id = process_functions.getDatabaseSynId(
        syn, "centerMapping", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )
    center_mapping = syn.tableQuery("SELECT * FROM %s" % center_mapping_id)
    center_mapping_df = center_mapping.asDataFrame()
    if center is not None:
        assert (
            center in center_mapping_df.center.tolist()
        ), "Must specify one of these centers: {}".format(
            ", ".join(center_mapping_df.center)
        )
        centers = [center]
    else:
        # No center specified: process every center that has an input
        # folder and is flagged for release.
        center_mapping_df = center_mapping_df[
            ~center_mapping_df["inputSynId"].isnull()]
        # release is a bool column
        center_mapping_df = center_mapping_df[center_mapping_df["release"]]
        centers = center_mapping_df.center
    if oncotree_link is None:
        onco_link = databaseToSynIdMappingDf["Id"][
            databaseToSynIdMappingDf["Database"] == "oncotreeLink"
        ].values[0]
        onco_link_ent = syn.get(onco_link)
        oncotree_link = onco_link_ent.externalURL
    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)
    # Poor-man's mutex: the 'isProcessing' annotation guards against two
    # concurrent pipeline runs.
    center_mapping_ent = syn.get(center_mapping_id)
    if center_mapping_ent.get("isProcessing", ["True"])[0] == "True":
        raise Exception(
            "Processing/validation is currently happening. "
            "Please change/add the 'isProcessing' annotation on {} "
            "to False to enable processing".format(center_mapping_id)
        )
    else:
        center_mapping_ent.isProcessing = "True"
        center_mapping_ent = syn.store(center_mapping_ent)
    # remove this query timeout and see what happens
    # syn.table_query_timeout = 50000
    # Create new maf database, should only happen once if its specified
    if create_new_maf_database:
        today = date.today()
        table_name = f"Narrow MAF Database - {today}"
        # filetype = "vcf2maf"
        # syn7208886 is the GENIE staging project to archive maf table
        new_tables = process_functions.create_new_fileformat_table(
            syn, "vcf2maf", table_name, project_id, "syn7208886"
        )
        syn.setPermissions(new_tables["newdb_ent"].id, 3326313, [])
        databaseToSynIdMappingDf = new_tables["newdb_mappingdf"]
    # BUG FIX: this previously read `args.format_registry_packages`, but no
    # `args` exists in this function's scope (NameError at runtime). The
    # package list arrives via the `format_registry` parameter.
    format_registry = config.collect_format_types(format_registry)
    for process_center in centers:
        input_to_database.center_input_to_database(
            syn,
            project_id,
            process_center,
            process,
            only_validate,
            databaseToSynIdMappingDf,
            center_mapping_df,
            delete_old=delete_old,
            oncotree_link=oncotree_link,
            format_registry=format_registry,
            genie_annotation_pkg=genie_annotation_pkg,
        )
    # To ensure that this is the new entity
    center_mapping_ent = syn.get(center_mapping_id)
    center_mapping_ent.isProcessing = "False"
    center_mapping_ent = syn.store(center_mapping_ent)
    error_tracker_synid = process_functions.getDatabaseSynId(
        syn, "errorTracker", databaseToSynIdMappingDf=databaseToSynIdMappingDf
    )
    # Only write out invalid reasons if the center
    # isnt specified and if only validate
    if center is None and only_validate:
        logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
        write_invalid_reasons.write(syn, center_mapping_df, error_tracker_synid)
    logger.info("INPUT TO DATABASE COMPLETE")
def main(genie_version,
         processing_date,
         cbioportal_path,
         oncotree_link=None,
         consortium_release_cutoff=184,
         pemfile=None,
         test=False,
         staging=False,
         debug=False,
         skip_mutationsincis=False):
    '''
    - Does parameter checks
    - Updates process tracking start
    - initiates database to staging
    - create case lists
    - revise meta files
    - run cBioPortal validation
    - create link versions
    - update process tracking end
    - Create dashboard tables and plots

    Args:
        genie_version: GENIE version,
        processing_date: processing date
        cbioportal_path: Path to cbioportal validator
        oncotree_link: Link to oncotree codes
        consortium_release_cutoff: release cut off value in days
        pemfile: Path to private key file
        test: Test flag, uses test databases
        staging: Staging flag, uses staging databases
        debug: Synapse debug flag
        skip_mutationsincis: Skip mutation in cis filter
    '''
    syn = process_functions.synLogin(pemfile, debug=debug)
    genie_user = os.environ.get('GENIE_USER')
    if pemfile is not None:
        genie_pass = process_functions.get_password(pemfile)
    else:
        genie_pass = None
    # Pick database mapping table per run mode (test/staging/production).
    if test:
        databaseSynIdMappingId = 'syn11600968'
        genie_version = "TESTING"
    elif staging:
        skip_mutationsincis = True
        databaseSynIdMappingId = 'syn12094210'
    else:
        databaseSynIdMappingId = 'syn10967259'
    # Database/folder syn id mapping
    databaseSynIdMapping = syn.tableQuery(
        'select * from {}'.format(databaseSynIdMappingId))
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()
    # databaseSynIdMappingDf.index = databaseSynIdMappingDf.Database
    # del databaseSynIdMappingDf['Database']
    # databaseSynIdMappingDf.to_dict()
    if oncotree_link is None:
        oncoLink = databaseSynIdMappingDf['Id'][
            databaseSynIdMappingDf['Database'] == 'oncotreeLink'].values[0]
        oncoLinkEnt = syn.get(oncoLink)
        oncotree_link = oncoLinkEnt.externalURL
    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)
    cbioValidatorPath = os.path.join(
        cbioportal_path, "core/src/main/scripts/importer/validateData.py")
    assert os.path.exists(cbioValidatorPath),\
        "Please specify correct cbioportalPath"
    syn.table_query_timeout = 50000
    consortiumSynId = databaseSynIdMappingDf['Id'][
        databaseSynIdMappingDf['Database'] == 'consortium'].values[0]
    processTrackerSynId = databaseSynIdMappingDf['Id'][
        databaseSynIdMappingDf['Database'] == 'processTracker'].values[0]
    # get syn id of case list folder in consortium release
    # caseListSynId = findCaseListId(syn, consortiumSynId)
    caseListSynId, _ = database_to_staging.search_and_create_folder(
        syn, consortiumSynId, "case_lists")
    if not staging:
        database_to_staging.update_process_trackingdf(
            syn, processTrackerSynId, 'SAGE', 'dbToStage', start=True)
    centerMappingSynId = databaseSynIdMappingDf['Id'][
        databaseSynIdMappingDf['Database'] == 'centerMapping'].values[0]
    # Only release files where release is true
    center_mapping = syn.tableQuery(
        'SELECT * FROM {} where release is true'.format(centerMappingSynId))
    center_mappingdf = center_mapping.asDataFrame()
    # e.g. "Oct-2017"
    processingDate = datetime.datetime.strptime(processing_date, '%b-%Y')
    logger.info("STAGING TO CONSORTIUM")
    genePanelEntities = database_to_staging.stagingToCbio(
        syn, processingDate, genie_version, center_mappingdf,
        databaseSynIdMappingDf, oncotree_url=oncotree_link,
        consortiumReleaseCutOff=consortium_release_cutoff,
        current_release_staging=staging,
        skipMutationsInCis=skip_mutationsincis,
        test=test, genie_user=genie_user, genie_pass=genie_pass)
    # Create case lists files
    logger.info("CREATE CASE LIST FILES")
    # Remove old caselists first
    if not os.path.exists(database_to_staging.CASE_LIST_PATH):
        os.mkdir(database_to_staging.CASE_LIST_PATH)
    caselists = os.listdir(database_to_staging.CASE_LIST_PATH)
    for caselist in caselists:
        os.remove(os.path.join(database_to_staging.CASE_LIST_PATH, caselist))
    clinical_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        'data_clinical_{}.txt'.format(genie_version))
    gene_matrix_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "data_gene_matrix_{}.txt".format(genie_version))
    create_case_lists.main(
        clinical_path, gene_matrix_path,
        database_to_staging.CASE_LIST_PATH, "genie_private")
    caseListFiles = os.listdir(database_to_staging.CASE_LIST_PATH)
    caseListEntities = []
    for casePath in caseListFiles:
        casePath = os.path.join(database_to_staging.CASE_LIST_PATH, casePath)
        caseListEntities.append(database_to_staging.store_file(
            syn, casePath, parent=caseListSynId, genieVersion=genie_version))
    logger.info("REMOVING UNNECESSARY FILES")
    # Keep only current-version, meta, and case_lists files in the release
    # directory.
    genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR)
    for genie_file in genie_files:
        if genie_version not in genie_file and \
                "meta" not in genie_file and "case_lists" not in genie_file:
            os.remove(os.path.join(database_to_staging.GENIE_RELEASE_DIR,
                                   genie_file))
    os.remove(clinical_path)
    logger.info("REVISE METADATA FILES")
    database_to_staging.revise_metadata_files(
        syn, staging, consortiumSynId, genie_version)
    logger.info("CBIO VALIDATION")
    '''
    Must be exit 0 because the validator sometimes fails,
    but we still want to capture the output
    '''
    command = [cbioValidatorPath, '-s',
               database_to_staging.GENIE_RELEASE_DIR, '-n', '; exit 0']
    cbioOutput = subprocess.check_output(" ".join(command), shell=True)
    logger.info(cbioOutput.decode("utf-8"))
    cbio_validator_log = \
        "cbioValidatorLogsConsortium_{}.txt".format(genie_version)
    # Archive the validator log in Synapse for real (non-test) runs.
    if not test and not staging:
        log_folder_synid = databaseSynIdMappingDf['Id'][
            databaseSynIdMappingDf['Database'] == 'logs'].values[0]
        with open(cbio_validator_log, "w") as cbioLog:
            cbioLog.write(cbioOutput.decode("utf-8"))
        syn.store(synapseclient.File(
            cbio_validator_log, parentId=log_folder_synid))
        os.remove(cbio_validator_log)
    logger.info("REMOVING OLD FILES")
    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    private_cna_meta_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "genie_private_meta_cna_hg19_seg.txt")
    if os.path.exists(private_cna_meta_path):
        os.unlink(private_cna_meta_path)
    logger.info("CREATING LINK VERSION")
    database_to_staging.create_link_version(
        syn, genie_version, caseListEntities, genePanelEntities,
        databaseSynIdMappingDf)
    if not staging:
        database_to_staging.update_process_trackingdf(
            syn, processTrackerSynId, 'SAGE', 'dbToStage', start=False)
    logger.info("COMPLETED DATABASE TO STAGING")
    if not test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(
            syn, databaseSynIdMappingDf, genie_version, staging=staging)
        # Render the dashboard markdown via R; credentials are passed only
        # when both user and password are available.
        dashboard_markdown_html_commands = [
            'Rscript',
            os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         'dashboard_markdown_generator.R'),
            genie_version]
        if genie_user is not None and genie_pass is not None:
            dashboard_markdown_html_commands.extend(
                ['--syn_user', genie_user, '--syn_pass', genie_pass])
        if staging:
            dashboard_markdown_html_commands.append('--staging')
        subprocess.check_call(dashboard_markdown_html_commands)
        logger.info("DASHBOARD UPDATE COMPLETE")
def main(
    genie_version,
    processing_date,
    cbioportal_path,
    oncotree_link=None,
    consortium_release_cutoff=184,
    pemfile=None,
    test=False,
    staging=False,
    debug=False,
    skip_mutationsincis=False,
):
    """
    - Does parameter checks
    - Updates process tracking start
    - initiates database to staging
    - create case lists
    - revise meta files
    - run cBioPortal validation
    - create link versions
    - update process tracking end
    - Create dashboard tables and plots

    Args:
        genie_version: GENIE version,
        processing_date: processing date
        cbioportal_path: Path to cbioportal validator
        oncotree_link: Link to oncotree codes
        consortium_release_cutoff: release cut off value in days
        pemfile: Path to private key file
        test: Test flag, uses test databases
        staging: Staging flag, uses staging databases
        debug: Synapse debug flag
        skip_mutationsincis: Skip mutation in cis filter
    """
    syn = process_functions.synLogin(pemfile, debug=debug)
    genie_user = os.environ.get("GENIE_USER")
    if pemfile is not None:
        genie_pass = process_functions.get_password(pemfile)
    else:
        genie_pass = None
    # Pick database mapping table per run mode (test/staging/production).
    if test:
        databaseSynIdMappingId = "syn11600968"
        genie_version = "TESTING"
    elif staging:
        skip_mutationsincis = True
        databaseSynIdMappingId = "syn12094210"
    else:
        databaseSynIdMappingId = "syn10967259"
    # Database/folder syn id mapping
    databaseSynIdMapping = syn.tableQuery(
        "select * from {}".format(databaseSynIdMappingId))
    databaseSynIdMappingDf = databaseSynIdMapping.asDataFrame()
    # databaseSynIdMappingDf.index = databaseSynIdMappingDf.Database
    # del databaseSynIdMappingDf['Database']
    # databaseSynIdMappingDf.to_dict()
    if oncotree_link is None:
        oncoLink = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "oncotreeLink"].values[0]
        oncoLinkEnt = syn.get(oncoLink)
        oncotree_link = oncoLinkEnt.externalURL
    # Check if you can connect to oncotree link,
    # if not then don't run validation / processing
    process_functions.checkUrl(oncotree_link)
    cbioValidatorPath = os.path.join(
        cbioportal_path, "core/src/main/scripts/importer/validateData.py")
    assert os.path.exists(
        cbioValidatorPath), "Please specify correct cbioportalPath"
    syn.table_query_timeout = 50000
    consortiumSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "consortium"].values[0]
    processTrackerSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "processTracker"].values[0]
    # get syn id of case list folder in consortium release
    # caseListSynId = findCaseListId(syn, consortiumSynId)
    caseListSynId, _ = database_to_staging.search_and_create_folder(
        syn, consortiumSynId, "case_lists")
    if not staging:
        database_to_staging.update_process_trackingdf(
            syn, processTrackerSynId, "SAGE", "dbToStage", start=True)
    centerMappingSynId = databaseSynIdMappingDf["Id"][
        databaseSynIdMappingDf["Database"] == "centerMapping"].values[0]
    # Only release files where release is true
    center_mapping = syn.tableQuery(
        "SELECT * FROM {} where release is true".format(centerMappingSynId))
    center_mappingdf = center_mapping.asDataFrame()
    # e.g. "Oct-2017"
    processingDate = datetime.datetime.strptime(processing_date, "%b-%Y")
    logger.info("STAGING TO CONSORTIUM")
    genePanelEntities = database_to_staging.stagingToCbio(
        syn,
        processingDate,
        genie_version,
        center_mappingdf,
        databaseSynIdMappingDf,
        oncotree_url=oncotree_link,
        consortiumReleaseCutOff=consortium_release_cutoff,
        current_release_staging=staging,
        skipMutationsInCis=skip_mutationsincis,
        test=test,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )
    # Create case lists files
    logger.info("CREATE CASE LIST FILES")
    # Remove old caselists first
    if not os.path.exists(database_to_staging.CASE_LIST_PATH):
        os.mkdir(database_to_staging.CASE_LIST_PATH)
    caselists = os.listdir(database_to_staging.CASE_LIST_PATH)
    for caselist in caselists:
        os.remove(os.path.join(database_to_staging.CASE_LIST_PATH, caselist))
    clinical_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "data_clinical_{}.txt".format(genie_version),
    )
    assay_information_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "assay_information_{}.txt".format(genie_version),
    )
    create_case_lists.main(
        clinical_path,
        assay_information_path,
        database_to_staging.CASE_LIST_PATH,
        "genie_private",
    )
    caseListFiles = os.listdir(database_to_staging.CASE_LIST_PATH)
    caseListEntities = []
    for casePath in caseListFiles:
        casePath = os.path.join(database_to_staging.CASE_LIST_PATH, casePath)
        caseListEntities.append(
            database_to_staging.store_file(syn, casePath,
                                           parent=caseListSynId,
                                           genieVersion=genie_version))
    logger.info("REMOVING UNNECESSARY FILES")
    # Keep only current-version, meta, and case_lists files in the release
    # directory.
    genie_files = os.listdir(database_to_staging.GENIE_RELEASE_DIR)
    for genie_file in genie_files:
        if (genie_version not in genie_file and "meta" not in genie_file
                and "case_lists" not in genie_file):
            os.remove(
                os.path.join(database_to_staging.GENIE_RELEASE_DIR,
                             genie_file))
    os.remove(clinical_path)
    logger.info("REVISE METADATA FILES")
    database_to_staging.revise_metadata_files(syn, staging, consortiumSynId,
                                              genie_version)
    logger.info("CBIO VALIDATION")
    # Must be exit 0 because the validator sometimes fails,
    # but we still want to capture the output
    command = [
        cbioValidatorPath,
        "-s",
        database_to_staging.GENIE_RELEASE_DIR,
        "-n",
        "; exit 0",
    ]
    cbioOutput = subprocess.check_output(" ".join(command), shell=True)
    logger.info(cbioOutput.decode("utf-8"))
    cbio_validator_log = f"cbioValidatorLogsConsortium_{genie_version}.txt"
    # Archive the validator log in Synapse for real (non-test) runs.
    if not test and not staging:
        log_folder_synid = databaseSynIdMappingDf["Id"][
            databaseSynIdMappingDf["Database"] == "logs"].values[0]
        with open(cbio_validator_log, "w") as cbio_log:
            cbio_log.write(cbioOutput.decode("utf-8"))
        syn.store(
            synapseclient.File(cbio_validator_log,
                               parentId=log_folder_synid))
        os.remove(cbio_validator_log)
    logger.info("REMOVING OLD FILES")
    process_functions.rmFiles(database_to_staging.CASE_LIST_PATH)
    private_cna_meta_path = os.path.join(
        database_to_staging.GENIE_RELEASE_DIR,
        "genie_private_meta_cna_hg19_seg.txt")
    if os.path.exists(private_cna_meta_path):
        os.unlink(private_cna_meta_path)
    logger.info("CREATING LINK VERSION")
    # Returns release and case list folder
    folders = database_to_staging.create_link_version(syn, genie_version,
                                                      caseListEntities,
                                                      genePanelEntities,
                                                      databaseSynIdMappingDf)
    if not staging:
        database_to_staging.update_process_trackingdf(
            syn, processTrackerSynId, "SAGE", "dbToStage", start=False)
    if not test:
        logger.info("DASHBOARD UPDATE")
        dashboard_table_updater.run_dashboard(syn, databaseSynIdMappingDf,
                                              genie_version, staging=staging)
        generate_dashboard_html(genie_version, staging=staging,
                                genie_user=genie_user, genie_pass=genie_pass)
        logger.info("DASHBOARD UPDATE COMPLETE")
        logger.info("AUTO GENERATE DATA GUIDE")
    # NOTE(review): original indentation was lost in this file; the data
    # guide section is reconstructed at function level (runs for every
    # mode) — confirm against version control.
    # Assumes the oncotree URL carries the version as its query value.
    oncotree_version = oncotree_link.split("=")[1]
    data_guide_pdf = generate_data_guide(
        genie_version,
        oncotree_version=oncotree_version,
        database_mapping=databaseSynIdMappingId,
        genie_user=genie_user,
        genie_pass=genie_pass,
    )
    database_to_staging.store_file(
        syn,
        data_guide_pdf,
        genieVersion=genie_version,
        parent=folders["release_folder"],
    )
    logger.info("COMPLETED DATABASE TO STAGING")
def main(process,
         project_config=None,
         center=None,
         pemfile=None,
         delete_old=False,
         only_validate=False,
         oncotree_link=None,
         create_new_maf_database=False,
         testing=False,
         debug=False,
         reference=None,
         vcf2maf_path=None,
         vep_path=None,
         vep_data=None,
         thread=1,
         format_registry=config.PROCESS_FILES):
    """Validate and/or process GENIE center input files into the database.

    The 'isProcessing' status on the center mapping entity acts as a mutex;
    it is always cleared in the ``finally`` block, even on failure.

    Args:
        process: Processing type ('vcf', 'maf', 'mafSP', or main files).
        project_config: Mapping holding 'database_to_synid_mapping'.
        center: Single center to process; default processes all releasable
            centers.
        pemfile: Path to PEM file for Synapse login.
        delete_old: Delete all old processed and temp files.
        only_validate: Only validate the files, don't process.
        oncotree_link: Link to oncotree code; defaults to the URL recorded
            in the database mapping table.
        create_new_maf_database: Create and archive a new MAF database.
        testing: Testing flag.
        debug: Synapse debug flag.
        reference: Path to VCF reference file.
        vcf2maf_path: Path to vcf2maf (required for vcf/maf/mafSP).
        vep_path: Path to VEP (required for vcf/maf/mafSP).
        vep_data: Path to VEP data (required for vcf/maf/mafSP).
        thread: Number of threads to use for validation.
        format_registry: File-format registry (defaults to
            config.PROCESS_FILES).
    """
    syn = process_functions.synLogin(pemfile, debug=debug)
    # BUG FIX: initialize before the try block — the finally clause below
    # previously raised NameError if an exception occurred before
    # center_mapping_id was assigned.
    center_mapping_id = None
    try:
        # Must specify correct paths to vcf2maf, VEP and VEP data
        # if trying to process vcf, maf and mafSP
        if process in ['vcf', 'maf', 'mafSP'] and not only_validate:
            assert os.path.exists(vcf2maf_path), (
                "Path to vcf2maf (--vcf2mafPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert os.path.exists(vep_path), (
                "Path to VEP (--vepPath) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
            assert os.path.exists(vep_data), (
                "Path to VEP data (--vepData) must be specified "
                "if `--process {vcf,maf,mafSP}` is used")
        databaseToSynIdMapping = syn.tableQuery('SELECT * FROM {}'.format(
            project_config.get('database_to_synid_mapping')))
        databaseToSynIdMappingDf = databaseToSynIdMapping.asDataFrame()
        center_mapping_id = process_functions.getDatabaseSynId(
            syn, "centerMapping",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        center_mapping = syn.tableQuery(
            'SELECT * FROM %s' % center_mapping_id)
        center_mapping_df = center_mapping.asDataFrame()
        if center is not None:
            assert center in center_mapping_df.center.tolist(), (
                "Must specify one of these centers: {}".format(
                    ", ".join(center_mapping_df.center)))
            centers = [center]
        else:
            # No center specified: process every center that has an input
            # folder and is flagged for release.
            center_mapping_df = \
                center_mapping_df[~center_mapping_df['inputSynId'].isnull()]
            # release is a bool column
            center_mapping_df = \
                center_mapping_df[center_mapping_df['release']]
            centers = center_mapping_df.center
        if oncotree_link is None:
            onco_link = databaseToSynIdMappingDf['Id'][
                databaseToSynIdMappingDf['Database'] ==
                'oncotreeLink'].values[0]
            onco_link_ent = syn.get(onco_link)
            oncotree_link = onco_link_ent.externalURL
        # Check if you can connect to oncotree link,
        # if not then don't run validation / processing
        process_functions.checkUrl(oncotree_link)
        currently_processing = get_processing_status(syn, center_mapping_id)
        if currently_processing:
            logger.error(
                "Processing/validation is currently happening. "
                "Please change/add the 'isProcessing' annotation on {} "
                "to False to enable processing".format(center_mapping_id))
            sys.exit(1)
        else:
            set_processing_status(syn, center_mapping_id, status=True)
        # remove this query timeout and see what happens
        # syn.table_query_timeout = 50000
        # Create new maf database, should only happen once if its specified
        if create_new_maf_database:
            databaseToSynIdMappingDf = \
                input_to_database.create_and_archive_maf_database(
                    syn, databaseToSynIdMappingDf)
        # BUG FIX: this previously overwrote format_registry with
        # config.collect_format_types(args.format_registry_packages), but no
        # `args` exists in this scope (NameError at runtime). The registry
        # now comes from the parameter (default: config.PROCESS_FILES).
        logger.debug("Using {format_registry} file formats.".format(
            format_registry=format_registry))
        # BUG FIX: the loop variable was `center`, clobbering the parameter
        # so the `center is None` check after the loop could never be True.
        for process_center in centers:
            input_to_database.center_input_to_database(
                syn, process_center, process, testing, only_validate,
                vcf2maf_path, vep_path, vep_data, databaseToSynIdMappingDf,
                center_mapping_df, reference=reference,
                delete_old=delete_old, oncotree_link=oncotree_link,
                thread=thread, format_registry=format_registry)
        # To ensure that this is the new entity
        center_mapping_ent = syn.get(center_mapping_id)
        center_mapping_ent.isProcessing = "False"
        center_mapping_ent = syn.store(center_mapping_ent)
        error_tracker_synid = process_functions.getDatabaseSynId(
            syn, "errorTracker",
            databaseToSynIdMappingDf=databaseToSynIdMappingDf)
        # Only write out invalid reasons if the center
        # isnt specified and if only validate
        if center is None and only_validate:
            logger.info("WRITING INVALID REASONS TO CENTER STAGING DIRS")
            write_invalid_reasons.write_invalid_reasons(
                syn, center_mapping_df, error_tracker_synid)
    finally:
        # Always release the processing mutex — but only if we got far
        # enough to know which entity holds it.
        if center_mapping_id is not None:
            _ = set_processing_status(syn, center_mapping_id, status=False)