def actionClean(): """ Deletes output folders and files generated from idl """ result = NO_ERROR Logger.printStartActionMessage(ACTION_CLEAN) #Init cleanup logger Cleanup.init() for action in Settings.cleanupOptions['actions']: if action == 'cleanOutput': for target in Settings.cleanupOptions['targets']: for platform in Settings.cleanupOptions['platforms']: for cpu in Settings.cleanupOptions['cpus']: for configuration in Settings.cleanupOptions[ 'configurations']: #Clean up output folders for specific target, platform, cpu and configuration result = Cleanup.run(action, target, platform, cpu, configuration) else: #Perform other cleanup acrions that are not dependent of target ... result = Cleanup.run(action) if result == NO_ERROR: Logger.printEndActionMessage(ACTION_CLEAN) else: Logger.printEndActionMessage('Cleanup failed!', ColoredFormatter.RED) System.stopExecution(result)
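# The snippet above drives Cleanup as a static utility: Cleanup.init() once, then
# Cleanup.run(action, ...) per combination. Below is a minimal sketch of that
# interface; the method names come from the snippet, but the body (removing a
# per-configuration output directory) and the
# 'output/<target>/<platform>/<cpu>/<configuration>' layout are assumptions for
# illustration, not the project's actual implementation.
import os
import shutil

NO_ERROR = 0
ERROR_CLEANUP_FAILED = 1

class Cleanup:
    @classmethod
    def init(cls):
        # Placeholder for setting up the cleanup logger mentioned above.
        pass

    @classmethod
    def run(cls, action, target=None, platform=None, cpu=None, configuration=None):
        try:
            if action == 'cleanOutput':
                path = os.path.join('output', target, platform, cpu, configuration)
                if os.path.isdir(path):
                    shutil.rmtree(path)
            # other, target-independent actions would be dispatched here
            return NO_ERROR
        except OSError:
            return ERROR_CLEANUP_FAILED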
def main(): """ Current board [0] [1] [2] [3] [4] [5] [6] [7] [8] [0] i i o o o o o o [1] o o o o o o o o o [2] o o o o o o o o o [3] o o o o o o o o o [4] o o o o o o o o o [5] o o o o o o o o o [6] o o o o o o o o o [7] o o o o o o o o o <- [8, 7] [8] o o o o o o o o o """ controller_settings = {'id': 'mk1', 'assignments': {}} controller = Controller(controller_settings) def all_on(): controller.allOn("green_bright") def all_off(): controller.allOff() def print_assignments(): controller.printAssignments() controller.addAssignment('all_on_off', [0, 0], all_on, all_off) controller.addAssignment('print_assignments', [1, 0], print_assignments, print_assignments) cleanup_settings = {'object_to_clean': controller} cleanup = Cleanup(cleanup_settings) while True: controller.buttonEvent()
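# Cleanup is constructed once with the controller and never referenced again,
# which suggests its job is an exit hook that turns the hardware off when the
# loop dies. A plausible sketch, assuming atexit is the mechanism and that
# allOff() (used above) is the shutdown call:
import atexit

class Cleanup:
    def __init__(self, settings):
        self.object_to_clean = settings['object_to_clean']
        # Run clean() when the interpreter exits, even on Ctrl-C.
        atexit.register(self.clean)

    def clean(self):
        self.object_to_clean.allOff()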
def start(self):
    # Script startup steps
    logger.info('PolyEngine v1.0')
    config = Config('config.ini')

    project_name = config.check_setting('PolyEngine', 'Name')
    logger.info('Starting project {}', project_name)
    message = config.check_setting('PolyEngine', 'Message')
    logger.info(message)

    # Source directory of project based on config file
    source_directory = config.check_setting('Compile', 'SourceDirectory')

    # Create the temporary code modification workspace
    workspace = Workspace(source_directory)
    workspace.create_workspace()

    # Process the files
    for f in workspace.source_files:
        if f is not None:
            processor = Processing(f)
            processor.process()
    for f in workspace.header_files:
        if f is not None:
            processor = Processing(f)
            processor.process()

    # Initialize the compiler once information has been loaded
    output_file = config.check_setting('Compile', 'Output')
    commands = config.check_setting('Compile', 'Commands')
    compiler_option = config.check_setting('Compile', 'Compiler')
    if compiler_option == 'gcc' or compiler_option == 'g++':
        compiler = Compile(compiler_option, workspace.source_files, commands, output_file)
        compiler.compile()
    else:
        logger.error('Invalid compiler option selected.')
        exit('Invalid compiler.')

    # Cleanup workspace and exit
    print()
    Cleanup.clean_exit(workspace.work_path)
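# Cleanup.clean_exit(workspace.work_path) presumably removes the temporary
# modification workspace and terminates the script. A minimal sketch under that
# assumption:
import shutil
import sys

class Cleanup:
    @staticmethod
    def clean_exit(work_path):
        # Delete the temporary workspace, then exit with success.
        shutil.rmtree(work_path, ignore_errors=True)
        sys.exit(0)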
def plot_data(log_q, plot_power=True, plot_throttle=True, plot_temps=True, dirpath=os.getcwd()):
    """Walks directories, finds all csv data, then sorts into dataframes and plots"""
    # paths holds the filepaths for each type of data
    paths = get_filepaths.get_csv_paths(dirpath)
    # paths becomes a list of dicts, where each element of the list holds a dict
    # with direct links to the full filepath for each type of data
    paths = get_filepaths.filter_data(paths)
    print("plot_data q object {}".format(log_q))
    log_q.put("Running...")
    logging.info("Running...")
    # for each "test" in paths create dataframes and plots
    # (where a test is a particular setup, e.g. 1085_fans_high_heaters_on_5G)
    for test in paths:
        # if test is an empty dictionary, skip that iteration
        if not test:
            continue
        logging.debug("Dataset is at {}".format(test))
        dirpath = os.path.dirname(test["smart"])
        try:
            # Check that thermal data is present
            thermal = ThermalData(test["thermal"], log_q)
            therm_df = thermal.get_therm_df()
            therm_exists = True
        except KeyError:
            # If not present, skip that iteration.
            logging.warning("The data at {} has no thermal data or is mislabeled.".format(dirpath))
            therm_exists = False
            therm_df = None
        # Clean up the SMART data so it matches the expected dataframe
        cleaner = Cleanup(test["smart"])
        # Append Hour to SMART data if necessary, using the thermal data's starting
        # hour. This should *almost* always work.
        # Check that the thermal data actually exists first.
        # NOTE: THIS MAY CAUSE ISSUES WITH POWER VS SMART DATA MATCHING IF THERMAL DATA DOESN'T EXIST
        if therm_exists:
            cleaner.fix_time(thermal.start_hour)
        cleaner.drop_unnamed()
        cleaner.split_brackets()
        cleaner.standardize_drive_temp()
        cleaner.output_csv()
        smart = SmartData(test["smart"], log_q)
        try:
            power = PowerData(test["3p3v"], test["12v"], log_q)
            power.write_power_csv()
            power_df = power.get_total_power()
        except KeyError as e:
            log_q.put("No Power data for {}".format(dirpath))
            logging.info("No Power data for {}".format(dirpath))
            power_df = None
        # Now accounts for missing thermal data
        alldata = CombinedData(dirpath, log_q, smart.get_smart_df(), therm_df, power_df)
        # Set the mode based on SMART data
        if "read" in test["smart"].lower():
            alldata.mode = "Total Read Rate"
        else:
            alldata.mode = "Total Write Rate"
        if plot_throttle:
            alldata.plot_throttle()
        if plot_temps and therm_exists:
            alldata.plot_temps()
        if plot_power:
            alldata.plot_power()
            alldata.power_vs_bw()
        plt.clf()
        comb_data_path = os.path.join(dirpath, "Combined_Data.csv")
        log_q.put("Outputting combined data to {}".format(comb_data_path))
        logging.info("Outputting combined data to {}".format(comb_data_path))
        alldata.output_comb_df(comb_data_path)
    log_q.put("Finished!")
    logging.info("Finished!")
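# The Cleanup object above wraps one SMART csv and normalizes it in place via
# fix_time/drop_unnamed/split_brackets/standardize_drive_temp/output_csv. A
# partial sketch of that shape using pandas; the column names and time format
# are guesses, and split_brackets/standardize_drive_temp are omitted:
import pandas as pd

class Cleanup:
    def __init__(self, csv_path):
        self.csv_path = csv_path
        self.df = pd.read_csv(csv_path)

    def drop_unnamed(self):
        # Drop index columns that pandas reads back as 'Unnamed: N'.
        self.df = self.df.loc[:, ~self.df.columns.str.startswith('Unnamed')]

    def fix_time(self, start_hour):
        # Prefix the thermal log's starting hour so SMART timestamps can be
        # aligned with the thermal data (hypothetical 'Time' column).
        self.df['Time'] = ['%02d:%s' % (start_hour, t) for t in self.df['Time']]

    def output_csv(self):
        # Write the cleaned frame back over the original file.
        self.df.to_csv(self.csv_path, index=False)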
class ImportBooks(object):
    def __init__(self, sourcedir, masterdir, bibfile, orgfile,
                 serial=None, alfdir=None, also_repeated=False, doctype='book'):
        self.sourcedir = os.path.expanduser(sourcedir)
        self.masterdir = os.path.expanduser(masterdir)
        if serial and alfdir:
            self.cleanup = Cleanup(serial, alfdir)
        else:
            self.cleanup = None
        self.bibfile = os.path.expanduser(bibfile)
        self.orgfile = os.path.expanduser(orgfile)
        self.also_repeated = also_repeated
        self.doctype = doctype

    def add_to_bib(self, bibstr, bibid):
        if os.path.exists(self.bibfile):
            bib = codecs.open(self.bibfile, encoding='utf-8').read()
        else:
            bib = ''
        if bibid not in bib:
            with codecs.open(self.bibfile, 'a', encoding='utf-8') as f:
                f.write(u'\n' + bibstr + u'\n')
            return True
        return False

    def clippings_to_org(self, bookfile, meta):
        kc = KindleBook(bookfile, text_path='text', meta=meta)
        kc.print_clippings(self.orgfile, self.doctype)

    def convert(self, book, bu_dir='imported'):
        if not os.path.exists(book):
            book = os.path.join(self.sourcedir, book)
        print book
        original_book = book
        ext = os.path.splitext(book)[1]
        if ext == '.azw':
            if self.cleanup:
                book = self.cleanup.decrypt(book)
            else:
                print("** Won't be able to clean up " + book + ", need a kindle serial")
                return
        bibstr, meta = docid.bibstr(book, self.doctype, add_isbn=True)
        bibid = meta['bibid']
        new = self.add_to_bib(bibstr, bibid)
        newbook = os.path.join(self.masterdir, bibid)
        if new or self.also_repeated:
            if ext in ('.mobi', '.pdf'):
                newbook = newbook + ext
                os.rename(book, newbook)
            else:
                newbook = newbook + '.mobi'
                devnull = codecs.open(os.devnull, 'w', encoding='utf-8')
                if subprocess.call(['ebook-convert', book, newbook],
                                   stdout=devnull, stderr=devnull):
                    print("** Error converting to " + newbook + " (maybe DRMed book?)")
                    return None
        else:
            if not os.path.exists(bu_dir):
                os.mkdir(bu_dir)
            bu = os.path.join(bu_dir, bibid + ext)
            print ' ...moving away', original_book, 'to', bu
            shutil.move(original_book, bu)
        self.clippings_to_org(newbook, meta)
        print ' ->', newbook
        return newbook

    def convert_all(self):
        for book in os.listdir(self.sourcedir):
            ext = os.path.splitext(book)[1]
            if ext in ('.azw', '.epub', '.mobi', '.pdf'):
                self.convert(book)
import_env()

from api.server import ApiServer
from cleanup import Cleanup
from camera.manager import CameraManager
from notifier import Notifier

if __name__ == "__main__":
    try:
        notifier = Notifier()
        notifier.start()

        camera_manager = CameraManager(notifier=notifier)
        camera_manager.start()

        cleanup = Cleanup()
        cleanup.start()

        api_server = ApiServer(camera_manager=camera_manager)
        api_server.start()

        total_threads = threading.active_count()
        while True:
            time.sleep(1)
            if threading.active_count() < total_threads:
                log("[main] Some thread is dead")
                raise KeyboardInterrupt
    except (KeyboardInterrupt, Exception) as e:
        log("[main] Caught exception: {}".format(str(e)))
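# Cleanup here follows the same start() convention as the other services, so it
# is presumably a daemon thread doing periodic housekeeping. A sketch under that
# assumption (the interval and the work done per tick are placeholders):
import threading
import time

class Cleanup(threading.Thread):
    def __init__(self, interval=60):
        threading.Thread.__init__(self)
        self.daemon = True
        self.interval = interval

    def run(self):
        while True:
            # e.g. purge stale recordings/snapshots here
            time.sleep(self.interval)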
class QC_Sample:
    def __init__(self, options):
        self.no_errors = True
        self.cleanup_sample = Cleanup()
        self.options = options
        self.sample_json = json.load(open(options.json))
        # initialize the QC_Run object
        self.qc_run = QC_Run(self.sample_json, options.recalc_3x3_tables)
        self.__QCDirectory = "%s/QC" % self.sample_json['analysis']['software_directory']

    # will find all of the runs in a sample and QC them with each other
    def QC_merge_runs(self):
        # if this is a germline sample, QC all of the normal runs with each other.
        if self.sample_json['sample_type'] == 'germline':
            self.QC_germline()
        # if this is a tumor_normal sample, find the normal and tumor runs, and then QC them with each other.
        elif self.sample_json['sample_type'] == 'tumor_normal':
            self.QC_tumor_normal()
        # print the final status
        if self.no_errors == False or self.qc_run.no_errors == False:
            sys.stderr.write("%s finished with errors. See %s/sge.log for more details" % (
                self.sample_json['sample_name'], self.sample_json['output_folder']))
            self.sample_json['sample_status'] = 'failed'
            write_json(self.sample_json['json_file'], self.sample_json)
            sys.exit(1)
        else:
            print "%s finished with no errors" % (self.sample_json['sample_name'])
        # write the sample json file
        write_json(self.sample_json['json_file'], self.sample_json)
        # make the excel spreadsheet containing the data and copy it back to the proton
        self._make_xlsx()

    # if this is a germline sample, QC all of the normal runs with each other.
    def QC_germline(self):
        # Use the sample_status here to not re-run the QC and to not overwrite run status.
        # The 'sample_status' should be reset to 'pushed' when new runs are pushed.
        #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
        # if the user specified the '--pass_fail' option, then run this part still
        if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
            # QC the normal runs with each other
            self.QC_runs(self.sample_json['runs'])
        # what if there is only one run that passes all of the metrics? It should be
        # marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass.
        # make the merger
        merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
        # Check to see if the normal runs are ready to be merged.
        self.sample_json, merge = merger.check_merge(self.sample_json['runs'])
        if merge != True:
            if 'final_json' in self.sample_json:
                # update the final run status
                merger.update_merged_run_status(self.sample_json['final_json'])
        else:
            # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
            self.sample_json = merger.merge_runs('germline')
            # update the merged run status
            merger.update_merged_run_status(self.sample_json['merged_json'])
            if json.load(open(self.sample_json['merged_json']))['pass_fail_merged_status'] == 'pass':
                # Set the sample_status
                self.sample_json['sample_status'] = 'merged_pass'
                # cleanup the individual run bam files
                self.cleanup_sample.cleanup_runs(self.sample_json['runs'],
                        self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                # Cleanup the merged dir
                self.cleanup_sample.cleanup_runs([self.sample_json['merged_json']],
                        self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
            else:
                self.sample_json['sample_status'] = 'awaiting_more_sequencing'
        # copy the final run's VCF file to the final_dir if it passes the "merged" coverage flag
        if 'final_json' in self.sample_json:
            final_json = json.load(open(self.sample_json['final_json']))
            if final_json['pass_fail_merged_status'] == 'pass':
                final_vcf = glob.glob("%s/*.vcf" % final_json['run_folder'])[0]
                final_project_dir = "/home/ionadmin/jeff/%s_Final_VCFs" % (self.sample_json['project'])
                print "copying %s to %s" % (final_vcf, final_project_dir)
                # check to make sure the final dir exists.
                if not os.path.isdir(final_project_dir):
                    os.mkdir(final_project_dir)
                shutil.copy(final_vcf, "%s/%s.vcf" % (final_project_dir, self.sample_json['sample_name']))
                # now push the sample to s3 storage
                if self.sample_json['project'] == 'Einstein':
                    print "pushing %s to amazon s3 storage" % self.sample_json['sample_name']
                    self.push_sample_to_s3(final_json)

    # if this is a tumor_normal sample, find the normal and tumor runs, and then QC them with each other.
    def QC_tumor_normal(self):
        # Separate the runs into tumor and normal lists
        normal_runs, tumor_runs = self.getTumor_Normal()
        if self.sample_json['analysis']['settings']['type'] == 'all_tumor_normal':
            # Use the sample_status here to not re-run the QC and to not overwrite run status.
            # The 'sample_status' should be reset to 'pushed' when new runs are pushed.
            #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
            # if the user specified the '--pass_fail' option, then run this part still
            if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
                # QC the normal or tumor runs with each other
                self.QC_runs(normal_runs, 'normal_')
                self.QC_runs(tumor_runs, 'tumor_')
                # now QC the tumor and normal runs together.
                self.QC_normal_tumor_runs(normal_runs, tumor_runs)
            # make the merger
            merger = Merger(self.sample_json, self.options.recalc_3x3_tables)
            # Check to see if the normal runs are ready to be merged.
            self.sample_json, merge_normal = merger.check_merge(normal_runs, 'Normal/', 'normal_')
            if merge_normal == True:
                # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
                self.sample_json = merger.merge_runs('normal', 'Normal_', 'normal_')
            # Check to see if the tumor runs are ready to be merged.
            self.sample_json, merge_tumor = merger.check_merge(tumor_runs, 'Tumor/', 'tumor_')
            if merge_tumor == True:
                self.sample_json = merger.merge_runs('tumor', 'Tumor_', 'tumor_')
            # If any runs were merged, QC them. If there is only 1 normal and 1 tumor run, they won't be QC'd again.
            #if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1):
            # now QC the tumor and normal merged bams together if both normal and tumor runs are ready.
            # To only QC all for the actual merged runs (PNET), change the 'final' part to 'merged'.
            # The 'final_normal_json' and 'final_tumor_json' flags are set by merger.py in the function check_merge, line 157
            #if (merge_normal or merge_tumor) and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json):
            if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json:
                self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json,
                        self.sample_json['final_normal_json'], self.sample_json['final_tumor_json'],
                        'normal_', 'tumor_', '_merged')
                self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status(
                        self.sample_json, self.sample_json['final_normal_json'],
                        self.sample_json['final_tumor_json'], qc_json)
                # update the merged run status
                merger.update_merged_run_status(self.sample_json['final_normal_json'], merged_perc_avail_bases)
                merger.update_merged_run_status(self.sample_json['final_tumor_json'], merged_perc_avail_bases)
                # cleanup the individual run bam files
                if merged_perc_avail_bases > .9:
                    final_qc_dir = "%s/all%svs%s" % (self.sample_json['qc_folder'],
                            json.load(open(self.sample_json['final_normal_json']))['run_name'],
                            json.load(open(self.sample_json['final_tumor_json']))['run_name'])
                    # annotate the final somatic variants
                    command = "bash %s/Somatic_Variants/somatic_variants.sh %s %s %s" % (
                            self.sample_json['analysis']['software_directory'], final_qc_dir,
                            self.sample_json['sample_name'],
                            self.sample_json['analysis']['software_directory'])
                    if runCommandLine(command) != 0:
                        sys.stderr.write("ERROR: somatic annotation failed!\n")
                    # Cleanup the PTRIM.bam and chr bam files after all of the QC is done.
                    # are there any other files to clean up?
                    self.cleanup_sample.cleanup_runs(self.sample_json['runs'],
                            self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                    #self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                    # Cleanup after the merging QC is done.
                    self.cleanup_sample.cleanup_runs(
                            [self.sample_json['final_normal_json'], self.sample_json['final_tumor_json']],
                            self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                    # Set the sample_status
                    self.sample_json['sample_status'] = 'merged_pass'
                else:
                    self.sample_json['sample_status'] = 'awaiting_more_sequencing'

    # Separate the runs into tumor and normal lists
    def getTumor_Normal(self):
        normal_runs = []
        tumor_runs = []
        for run in self.sample_json['runs']:
            run_json = json.load(open(run))
            # temp fix for runs that have old JSON files (i.e. SEGA)
            if 'run_type' not in run_json or 'run_num' not in run_json:
                if re.search('N-', run):
                    run_json['run_type'] = 'normal'
                else:
                    run_json['run_type'] = 'tumor'
                run_json['pass_fail_status'] = 'pending'
                run_json['json_type'] = 'run'
                run_json['json_file'] = run
                run_json['run_name'] = run_json['name']
                run_json['run_num'] = run_json['run_name'][-1]
                run_json['sample_name'] = run_json['sample']
                if re.search('-', run):
                    run_json['run_folder'] = '/'.join(run.split('/')[:-1])
                    run_json['sample_folder'] = os.path.abspath('/'.join(run.split('/')[:-1]) + "/../..")
                write_json(run, run_json)
            # temp fix over
            if 'analysis' not in run_json:
                bam = glob.glob("%s/*.bam" % run_json['run_folder'])[0].split('/')[-1]
                run_json['analysis'] = {'files': [bam]}
                write_json(run, run_json)
            if run_json['run_type'] == 'normal':
                normal_runs.append(run)
            elif run_json['run_type'] == 'tumor':
                tumor_runs.append(run)
            else:
                print "ERROR run type is not normal or tumor."
        return normal_runs, tumor_runs

    # QC the normal runs with each other
    def QC_runs(self, runs, pref=''):
        # first run TVC_CV and get the Run info to prepare for QC2Runs
        for run in runs:
            run_json = json.load(open(run))
            # only run these if this run has a status of pending.
            # This way the pass_fail_status can be manually overwritten.
            if run_json['pass_fail_status'] == "pending" or self.options.pass_fail:
                self.qc_run.runTVC_COV(run, pref)
                self.qc_run.getRunInfo(run, pref)
                # Update the run status based on the metrics gathered by QC_getRunInfo.sh
                self.qc_run.update_run_status(run, len(runs))
        # if there is only one run for this sample, then set the status to 'pending_merge'
        # so that the only run will be set as the 'final_json'
        pending_runs, passing_runs = self.qc_run.get_runs_status(runs)
        if len(passing_runs) == 1:
            self.sample_json['sample_status'] = 'pending_merge'
        else:
            for run1 in runs:
                run1_json = json.load(open(run1))
                for run2 in runs:
                    run2_json = json.load(open(run2))
                    # check to see if these two runs should be QC'd together.
                    # Only QC the runs that pass the single run QC metrics.
                    if int(run1_json['run_num']) < int(run2_json['run_num']) and (
                            (run1_json['pass_fail_status'] == 'pass' and
                             run2_json['pass_fail_status'] == 'pass') or self.options.qc_all):
                        self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json, run1, run2, pref, pref)
                        self.sample_json, perc_avail_bases = self.qc_run.update_3x3_runs_status(
                                self.sample_json, run1, run2, qc_json)

    # now QC the tumor and normal runs together.
    def QC_normal_tumor_runs(self, normal_runs, tumor_runs):
        for normal_run in normal_runs:
            for tumor_run in tumor_runs:
                normal_json = json.load(open(normal_run))
                tumor_json = json.load(open(tumor_run))
                # Only QC the runs that pass the single run QC metrics.
                if (normal_json['pass_fail_status'] == 'pass' and
                        tumor_json['pass_fail_status'] == 'pass') or self.options.qc_all:
                    self.sample_json, qc_json = self.qc_run.QC_2Runs(
                            self.sample_json, normal_run, tumor_run, 'normal_', 'tumor_')
                    self.sample_json, perc_avail_bases = self.qc_run.update_3x3_runs_status(
                            self.sample_json, normal_run, tumor_run, qc_json)

    # make the xlsx file to be copied back to the proton
    def _make_xlsx(self):
        xlsx_file = '%s/%s_QC.xlsx' % (self.sample_json['qc_folder'], self.sample_json['sample_name'])
        make_xlsx_command = "python2.7 %s/QC_generateSheets.py " % self.__QCDirectory + \
            "--sample_path %s " % self.sample_json['sample_folder'] + \
            "--sheet_per_sample " + \
            "--out %s " % xlsx_file + \
            "--ex_json %s " % (self.sample_json['json_file'])
        status = runCommandLine(make_xlsx_command)
        if status != 0:
            print "unable to generate the excel file"
        else:
            print "Generated the QC spreadsheet successfully!"
        # it would be really really cool if I could send them an email with the xlsx file!!
        if self.options.email and 'emails' in self.sample_json:
            # TEMP add my email automatically
            if '*****@*****.**' not in self.sample_json['emails']:
                self.sample_json['emails'].append('*****@*****.**')
            for email in self.sample_json['emails']:
                # this command will email the status of the sample, and attach the excel
                # spreadsheet and somatic variants if it is found.
                # TODO add germline project variants as well.
                somatic_variants = "%s/%s_somatic.xlsx" % (self.sample_json['qc_folder'],
                        self.sample_json['sample_name'])
                if os.path.isfile(somatic_variants):
                    email_command = '\tprintf "%s finished with a status of %s. \\n`grep sample_status *.json`\\n" | (cat -; uuencode %s %s; uuencode %s %s) | ssmtp -vvv %s >/dev/null 2>&1\n' % (
                            self.sample_json['sample_name'], "pass", xlsx_file, xlsx_file.split('/')[-1],
                            somatic_variants, somatic_variants.split('/')[-1], email)
                else:
                    email_command = '\tprintf "%s finished with a status of %s. \\n`grep sample_status *.json`\\n" | (cat -; uuencode %s %s) | ssmtp -vvv %s >/dev/null 2>&1\n' % (
                            self.sample_json['sample_name'], "pass", xlsx_file, xlsx_file.split('/')[-1], email)
                runCommandLine(email_command)
        # just send the email for now.
        ## I will copy the .xlsx file to every run of the sample
        #for run in self.sample_json['runs']:
        #    run_json = json.load(open(run))
        #    if 'server_ip' in run_json and 'orig_filepath_plugin_dir' in run_json:
        #        copy_command = "scp %s ionadmin@%s:%s "%(xlsx_file, run_json['server_ip'], run_json['orig_filepath_plugin_dir'])
        #        status = runCommandLine(copy_command)
        #        if status == 0:
        #            print "Copied the QC.xlsx file back to %s successfully! %s"%(run_json['proton'], copy_command)
        #        else:
        #            print "Failed to copy the QC.xlsx file back to %s... %s"%(run_json['proton'], copy_command)
        #        # try to copy the log file back as well.
        #        copy_command = "scp %s/sge.log ionadmin@%s:%s/QC.log "%(self.sample_json['sample_folder'], run_json['server_ip'], run_json['orig_filepath_plugin_dir'])
        #        status = runCommandLine(copy_command)
        #        # try to add the log file to the plugin's log file.
        #        # this didn't work...
        #        #copy_command = "ssh ionadmin@%s:%s/QC.log 'cat %s/QC.log >> %s/drmaa_stdout.txt"%(run_json['server_ip'], run_json['orig_filepath_plugin_dir'], run_json['orig_filepath_plugin_dir'], run_json['orig_filepath_plugin_dir'])
        #        #status = runCommandLine(copy_command)

    # send an email with the specified attachments
    # TODO finish this function
    def _send_email(self, emails, attachments, status):
        for email in emails:
            # this command will email the status of the sample, and attach the excel
            # spreadsheet and somatic variants if it is found.
            email_command = '\tprintf "%s finished with a status of %s. \\n`grep sample_status *.json`\\n" | ssmtp -vvv %s >/dev/null 2>&1\n' % (
                    self.sample_json['sample_name'], status, email)
            runCommandLine(email_command)

    # pushes the final run or merged files to amazon s3 storage.
    def push_sample_to_s3(self, final_json):
        # first get all of the files to push
        final_vcf = glob.glob("%s/*.vcf" % final_json['run_folder'])[0]
        target_vcf = "Einstein/%s/%s.vcf" % (self.sample_json['sample_name'], self.sample_json['sample_name'])
        final_cov = glob.glob("%s/*.amplicon.cov.xls" % (final_json['run_folder']))[0]
        target_cov = "Einstein/%s/%s" % (self.sample_json['sample_name'], final_cov.split('/')[-1])
        final_bam = "%s/%s" % (final_json['run_folder'], final_json['analysis']['files'][0])
        target_bam = "Einstein/%s/%s" % (self.sample_json['sample_name'], final_bam.split('/')[-1])
        final_bai = final_bam + ".bai"
        target_bai = target_bam + ".bai"
        final_json_file = final_json['json_file']
        target_json_file = "Einstein/%s/%s" % (self.sample_json['sample_name'], final_json_file.split('/')[-1])
        # call the push_files script to push each file to s3 storage
        status = runCommandLine("bash /rawdata/scripts/TRI_Dev/push_files_s3.sh " + \
                "%s %s " % (final_vcf, target_vcf) + "%s %s " % (final_cov, target_cov) + \
                "%s %s " % (final_bam, target_bam) + "%s %s " % (final_bai, target_bai) + \
                "%s %s " % (final_json_file, target_json_file))
        if status != 0:
            self.no_errors = False
            print "ERROR: unable to push the sample to s3 storage"

    # if the update_json flag is specified, then update the cutoffs found in the normal json file.
    def update_cutoffs(self):
        # load the json file
        update_json = json.load(open(self.options.update_cutoffs))
        # set the cutoff settings to the example json's cutoff settings
        self.sample_json['analysis']['settings']['cutoffs'] = update_json['analysis']['settings']['cutoffs']
        # write the updated sample's json file.
        write_json(self.options.json, self.sample_json)

    # move the old 3x3 tables to the flag "old_GTs"
    def recalc_3x3_tables(self):
        self.sample_json['sample_status'] = 'pushed'
        # load the output QC json. will be used to check if this combination has already been made.
        qc_json_data = {}
        if os.path.isfile(self.sample_json['results_qc_json']):
            qc_json_data = json.load(open(self.sample_json['results_qc_json']))
        # if the user specified to recalculate the 3x3 tables, do that here.
        if self.options.recalc_3x3_tables and 'QC_comparisons' in qc_json_data:
            # rearrange the old 3x3 tables to calculate the new 3x3 tables using the updated GT cutoffs
            qc_json_data['old_GTs'] = qc_json_data['QC_comparisons']
            del qc_json_data['QC_comparisons']
            write_json(self.sample_json['results_qc_json'], qc_json_data)

    # get the alignment statistics for each run or merged bam file.
    def get_alignment_stats(self):
        # TEMP fix the runs.
        runs = []
        for run in glob.glob("%s/Normal/N-[0-9]/*.json" % self.sample_json['sample_folder']):
            runs.append(run)
        for run in glob.glob("%s/Tumor/T-[0-9]/*.json" % self.sample_json['sample_folder']):
            runs.append(run)
        self.sample_json['runs'] = runs
        write_json(self.sample_json['json_file'], self.sample_json)
        ## now get the alignment statistics
        for run in self.sample_json['runs']:
            print "getting alignment_status for: %s" % run
            Align_Stats(run)
        #if 'merged_normal_json' in self.sample_json:
        #    normal_merged_path = json.load(open(self.sample_json['merged_normal_json']))['run_folder']
        #    if not os.path.isfile('%s/Analysis_Files/ionstats_alignment.json'%json.load(open(self.sample_json['merged_normal_json']))['run_folder']):
        #        # first fix the header of the merged.bam file
        #        normal_merged_bam = "%s/%s"%(normal_merged_path, json.load(open(self.sample_json['merged_normal_json']))['analysis']['files'][0])
        #        print "fixing header for %s"%normal_merged_bam
        #        correct_header_command = "samtools view -H %s > %s/merged.header.sam "%(normal_merged_bam, normal_merged_path)
        #        if runCommandLine(correct_header_command) != 0:
        #            print "ERROR: samtools view -H failed!"
        #            sys.exit(1)
        #        # move the old bam
        #        old_normal_merged_bam = "%s/bad_header.bam"%normal_merged_path
        #        shutil.move(normal_merged_bam, old_normal_merged_bam)
        #        # this command deletes the KS: tag!! not good! I don't know why but some headers are tab delimited, and some are not it seems.
        #        sed_command = 'sed -E "s/SM:[^:]*:/SM:%s\tKS:/" %s/merged.header.sam > %s/merged.headerCorrected.sam'%(self.sample_json['sample_name'], normal_merged_path, normal_merged_path)
        #        if runCommandLine(sed_command) != 0:
        #            print "ERROR: sed command failed!"
        #            sys.exit(1)
        #        # write the new header to merged.bam
        #        reheader_command = "samtools reheader %s/merged.headerCorrected.sam %s > %s "%(normal_merged_path, old_normal_merged_bam, normal_merged_bam)
        #        if runCommandLine(reheader_command) != 0:
        #            print "ERROR: sed command failed!"
        #            sys.exit(1)
        #        # make a new index file
        #        runCommandLine("samtools index %s"%normal_merged_bam)
        #        # remove the old bam
        #        os.remove(old_normal_merged_bam)
        #        os.remove("%s/merged.headerCorrected.sam"%normal_merged_path)
        #        os.remove("%s/merged.header.sam"%normal_merged_path)
        #    # then get the ionstats
        #    Align_Stats(self.sample_json['merged_normal_json'])
        #if 'merged_tumor_json' in self.sample_json:
        #    tumor_merged_path = json.load(open(self.sample_json['merged_tumor_json']))['run_folder']
        #    if not os.path.isfile('%s/Analysis_Files/ionstats_alignment.json'%json.load(open(self.sample_json['merged_tumor_json']))['run_folder']):
        #        # first fix the header of the merged.bam file
        #        tumor_merged_bam = "%s/%s"%(tumor_merged_path, json.load(open(self.sample_json['merged_tumor_json']))['analysis']['files'][0])
        #        print "fixing header for %s"%tumor_merged_bam
        #        correct_header_command = "samtools view -H %s > %s/merged.header.sam "%(tumor_merged_bam, tumor_merged_path)
        #        if runCommandLine(correct_header_command) != 0:
        #            print "ERROR: samtools view -H failed!"
        #            sys.exit(1)
        #        # move the old bam
        #        old_tumor_merged_bam = "%s/bad_header.bam"%tumor_merged_path
        #        shutil.move(tumor_merged_bam, old_tumor_merged_bam)
        #        # this command deletes the KS: tag!! not good! I don't know why but some headers are tab delimited, and some are not it seems.
        #        sed_command = 'sed -E "s/SM:[^:]*:/SM:%s\tKS:/" %s/merged.header.sam > %s/merged.headerCorrected.sam'%(self.sample_json['sample_name'], tumor_merged_path, tumor_merged_path)
        #        if runCommandLine(sed_command) != 0:
        #            print "ERROR: sed command failed!"
        #            sys.exit(1)
        #        # write the new header to merged.bam
        #        reheader_command = "samtools reheader %s/merged.headerCorrected.sam %s > %s "%(tumor_merged_path, old_tumor_merged_bam, tumor_merged_bam)
        #        if runCommandLine(reheader_command) != 0:
        #            print "ERROR: sed command failed!"
        #            sys.exit(1)
        #        # make a new index file
        #        runCommandLine("samtools index %s"%tumor_merged_bam)
        #        # remove the old bam
        #        os.remove(old_tumor_merged_bam)
        #        os.remove("%s/merged.headerCorrected.sam"%tumor_merged_path)
        #        os.remove("%s/merged.header.sam"%tumor_merged_path)
        #    # then get the ionstats
        #    Align_Stats(self.sample_json['merged_tumor_json'])
        # copy the xlsx file here because it didn't get copied for a lot of samples
        #self._make_xlsx()

    # subset out the 718 gene set from the final merged PNET 3x3 tables
    def get_718_subset(self):
        # add the path to the 718 subset:
        self.sample_json['analysis']['settings']['subset_bed'] = '/rawdata/support_files/BED/PNET/AmpliSeqExome_PNET_subset.bed'
        self.sample_json['analysis']['settings']['chromosomes_to_analyze_merged'] = ['all', '718']
        if 'results_qc_json' not in self.sample_json and 'results_QC_json' in self.sample_json:
            self.sample_json['results_qc_json'] = self.sample_json['results_QC_json']
        self.sample_json['emails'] = ['*****@*****.**']
        #self.sample_json['sample_status'] = 'pending_merge'
        write_json(self.sample_json['json_file'], self.sample_json)
        #Normal_Merged1vsTumor_Merged1
        #NMerge1vsTMerged1
        qc_comp_dir = ''
        if os.path.isdir("%s/allNMerged1vsTMerged1" % self.sample_json['qc_folder']):
            qc_comp_dir = "%s/allNMerged1vsTMerged1" % self.sample_json['qc_folder']
            qc_comp = "NMerged1vsTMerged1"
        elif os.path.isdir("%s/allNormal_Merged1vsTumor_Merged1" % self.sample_json['qc_folder']):
            qc_comp_dir = "%s/allNormal_Merged1vsTumor_Merged1" % self.sample_json['qc_folder']
            qc_comp = "Normal_Merged1vsTumor_Merged1"
        if qc_comp_dir != '':
            results_qc_json = json.load(open(self.sample_json['results_qc_json']))
            # fix the name of the folder and the name in the results_qc_json
            normal_merged_name = json.load(open(self.sample_json['merged_normal_json']))['run_name']
            tumor_merged_name = json.load(open(self.sample_json['merged_tumor_json']))['run_name']
            new_qc_comp = "%svs%s" % (normal_merged_name, tumor_merged_name)
            print "moving %s to %s" % (qc_comp_dir, "%s/all%s" % (self.sample_json['qc_folder'], new_qc_comp))
            shutil.move(qc_comp_dir, "%s/all%s" % (self.sample_json['qc_folder'], new_qc_comp))
            results_qc_json = json.load(open(self.sample_json['results_qc_json']))
            new_qc_comp_dict = results_qc_json['QC_comparisons']['all']['normal_tumor'][qc_comp]
            del results_qc_json['QC_comparisons']['all']['normal_tumor'][qc_comp]
            results_qc_json['QC_comparisons']['all']['normal_tumor'][new_qc_comp] = new_qc_comp_dict
            results_qc_json['sample_name'] = self.sample_json['sample_name']
            results_qc_json['sample'] = self.sample_json['sample_name']
            write_json(self.sample_json['results_qc_json'], results_qc_json)
        if 'merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json:
            print "Running QC_2Runs"
            self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json,
                    self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'],
                    'normal_', 'tumor_', '_merged')
            print 'done'
        # done for now
        self._make_xlsx()

    def overlap(self):
        # add the merged_perc_avail_bases
        # fix the name of the folder and the name in the results_qc_json
        if 'merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json:
            results_qc_json = json.load(open(self.sample_json['results_qc_json']))
            normal_merged_json = json.load(open(self.sample_json['merged_normal_json']))
            tumor_merged_json = json.load(open(self.sample_json['merged_tumor_json']))
            qc_comp = "%svs%s" % (normal_merged_json['run_name'], tumor_merged_json['run_name'])
            perc_avail_bases = results_qc_json['QC_comparisons']['all']['normal_tumor'][qc_comp]['perc_avail_bases']
            normal_merged_json['run_data']['merged_perc_avail_bases'] = perc_avail_bases
            tumor_merged_json['run_data']['merged_perc_avail_bases'] = perc_avail_bases
            write_json(self.sample_json['merged_normal_json'], normal_merged_json)
            write_json(self.sample_json['merged_tumor_json'], tumor_merged_json)

    def A_227(self):
        # compare the Normal merged file to all of the other tumor combinations
        # Separate the runs into tumor and normal lists
        #normal_runs, tumor_runs = self.getTumor_Normal()
        #for tumor_run in tumor_runs:
        #    # generate the 3x3 tables for only chr1.
        #    self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json, self.sample_json['merged_normal_json'], tumor_run, 'normal_', 'tumor_', '_merged')
        self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json,
                self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'],
                'normal_', 'tumor_', '_merged')
        # merge tumor runs 1-3, 4-5, and 7-8.
        #tumor_1_2_3 = ["/mnt/Despina/projects/PNET/A_227/Tumor/T-1/A_227_T-1.json", "/mnt/Despina/projects/PNET/A_227/Tumor/T-2/A_227_T-2.json", "/mnt/Despina/projects/PNET/A_227/Tumor/T-3/A_227_T-3.json"]
        #tumor_7_8 = ["/mnt/Despina/projects/PNET/A_227/Tumor/T-7/A_227_T-7.json", "/mnt/Despina/projects/PNET/A_227/Tumor/T-8/A_227_T-8.json"]
        # done for now
        self._make_xlsx()

    def somatic_variants(self):
        # get the somatic variants from samples that pass the final overlapping coverage cutoff
        if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json:
            final_normal_json = json.load(open(self.sample_json['final_normal_json']))
            final_tumor_json = json.load(open(self.sample_json['final_tumor_json']))
            if "pass_fail_merged_status" in final_normal_json and final_normal_json["pass_fail_merged_status"] == 'pass':
                # get the path to the final QC comparison dir
                qc_comp_dir = "%s/QC/all%svs%s" % (self.sample_json['sample_folder'],
                        final_normal_json['run_name'], final_tumor_json['run_name'])
                # get the somatic variants using the somatic_variants.sh script which utilizes Ozlem's scripts.
                #TODO "import" Ozlem's scripts into this pipeline
                command = "bash %s/Somatic_Variants/somatic_variants.sh %s %s %s" % (
                        self.sample_json['analysis']['software_directory'], qc_comp_dir,
                        self.sample_json['sample_name'],
                        self.sample_json['analysis']['software_directory'])
                result = runCommandLine(command)
                if result != 0:
                    self.no_errors = False
        self._make_xlsx()

    def change_stringency(self):
        self.sample_json['analysis']['settings']['normal_tvc_json'] = "/rawdata/support_files/parameter_sets/Parameter_Tests/ampliseq_germline_lowstringency_pgm_parameters_jingwei_edits.json"
        self.sample_json['analysis']['settings']['tumor_tvc_json'] = "/rawdata/support_files/parameter_sets/Parameter_Tests/ch1_somatic_lowstringency_pgm_parameters_jingwei_edits.json"
        # initialize the QC_Run object
        qc_run_diff_settings = QC_Run(self.sample_json, recalc_3x3_tables=False)
        # change the stringency of the somatic analysis to see if we can find more somatic variants
        if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json:
            final_normal_json = json.load(open(self.sample_json['final_normal_json']))
            final_tumor_json = json.load(open(self.sample_json['final_tumor_json']))
            # check to see if the final runs passed the merged cutoff
            if "pass_fail_merged_status" in final_normal_json and final_normal_json["pass_fail_merged_status"] == 'pass':
                for final_json in [final_normal_json, final_tumor_json]:
                    # re-run TVC on the individual bam files
                    vcfs = glob.glob("%s/*.vcf" % final_json['run_folder'])
                    #if len(vcfs) > 0:
                    #    shutil.move(vcfs[0], "%s/Analysis_Files/4.2_TSVC_variants_High_String_Jingwei_edits.vcf"%final_json['run_folder'])
                    qc_run_diff_settings.runTVC_COV(final_json['json_file'], "%s_" % final_json['run_type'])
                # now fix the results_qc_json
                # move the results_qc_json analysis to a different "chromosome"
                #results_qc_json = json.load(open(self.sample_json['results_qc_json']))
                #qc_comp = "%svs%s"%(final_normal_json['run_name'], final_tumor_json['run_name'])
                #old_qc_comp_dict = results_qc_json['QC_comparisons']['all']['normal_tumor'][qc_comp]
                #del results_qc_json['QC_comparisons']['all']['normal_tumor'][qc_comp]
                #results_qc_json['QC_comparisons']['High_String_Jingwei_Edits'] = {'normal_tumor': {qc_comp: old_qc_comp_dict}}
                #write_json(self.sample_json['results_qc_json'], results_qc_json)
                # get the path to the final QC comparison dir
                qc_comp_dir = "%s/QC/all%svs%s" % (self.sample_json['sample_folder'],
                        final_normal_json['run_name'], final_tumor_json['run_name'])
                #high_string_qc_comp_dir = "%s/QC/High_String_Jingwei_Edits_all%svs%s"%(self.sample_json['sample_folder'], final_normal_json['run_name'], final_tumor_json['run_name'])
                #shutil.move(qc_comp_dir, high_string_qc_comp_dir)
                # QC the 2 final runs
                self.sample_json, qc_json = qc_run_diff_settings.QC_2Runs(self.sample_json,
                        self.sample_json['final_normal_json'], self.sample_json['final_tumor_json'],
                        'normal_', 'tumor_', '_merged')
                # get the somatic variants using the somatic_variants.sh script which utilizes Ozlem's scripts.
                #TODO "import" Ozlem's scripts into this pipeline
                command = "bash %s/Somatic_Variants/somatic_variants.sh %s %s" % (
                        self.sample_json['analysis']['software_directory'], qc_comp_dir,
                        self.sample_json['sample_name'])
                result = runCommandLine(command)
                if result != 0:
                    self.no_errors = False
        #self._make_xlsx()

    def samtools_gatk(self):
        # 7 gene panel for PNET analysis
        Seven_Gene_Bed = "/rawdata/support_files/BED/PNET/7PNET_Genes_amplicons.bed"
        # run samtools and GATK on the SEGA dataset
        if 'final_normal_json' in self.sample_json and 'final_tumor_json' in self.sample_json:
            final_normal_json = json.load(open(self.sample_json['final_normal_json']))
            final_tumor_json = json.load(open(self.sample_json['final_tumor_json']))
            for final_json in [final_normal_json, final_tumor_json]:
                # if the analysis has not already been done, do it
                #if not os.path.isfile("%s/Analysis_Files/gatk_filtered_snps.vcf"%(final_json['run_folder'])) or not os.path.isfile("%s/Analysis_Files/7Genes_TSVC_variants.vcf"%(final_json['run_folder'])):
                bam = "%s/%s" % (final_json['run_folder'], final_json['analysis']['files'][0])
                # subset the 7 genes from the VCF file
                #vcf = glob.glob("%s/*.vcf"%(final_json['run_folder']))[0]
                #command = "bedtools intersect -header -a %s -b %s > %s/Analysis_Files/7Genes_TSVC_variants.vcf"%(vcf, Seven_Gene_Bed, final_json['run_folder'])
                #tvc_result = runCommandLine(command)
                # subset the 7 genes from the bam file to be uploaded to IR.
                #command = "samtools view -L /rawdata/support_files/BED/PNET/7PNET_Genes_amplicons.bed %s -b > %s/Analysis_Files/%s_%s_7genes_06032015.bam"%(bam, final_json['run_folder'], self.sample_json['sample_name'], final_json['run_type'])
                #subset_result = runCommandLine(command)
                # run samtools
                #command = "bash /home/ionadmin/TRI_Scripts/Variants/Samtools/run_samtools.sh %s %s %s/Analysis_Files"%(bam, Seven_Gene_Bed, final_json['run_folder'])
                #samtools_result = runCommandLine(command)
                # run GATK
                command = "bash /home/ionadmin/TRI_Scripts/Variants/GATK/run_gatk.sh %s %s %s/Analysis_Files" % (
                        bam, Seven_Gene_Bed, final_json['run_folder'])
                gatk_result = runCommandLine(command)
                # I would push the bam file to IR now, but that would require a password...
                # Lastly, generate the Venn Diagrams. I need to copy the R code here and
                # figure out how to copy the IR TVC results back to here as well.
                #if samtools_result != 0 or gatk_result != 0 or tvc_result != 0 or subset_result != 0:
                #    sys.stderr.write("Error: samtools or gatk failed!\n")
                #    self.no_errors = False
        # done for now
        #self._send_email("*****@*****.**")
        #self._make_xlsx()

    def variant_call(self):
        # re-run TVC to see which variants were filtered out
        #final_normal_dir = json.load(open(self.sample_json['final_normal_json']))['run_folder']
        final_tumor_dir = json.load(open(self.sample_json['final_tumor_json']))['run_folder']
        #runCommandLine("mv %s/4.2_TSVC_variants.vcf %s/Analysis_Folder"%(final_normal_dir, final_normal_dir))
        #runCommandLine("mv %s/4.2_TSVC_variants.vcf %s/Analysis_Folder"%(final_tumor_dir, final_tumor_dir))
        #self.qc_run.runTVC_COV(self.sample_json['final_normal_json'], "normal_")
        self.qc_run.runTVC_COV(self.sample_json['final_tumor_json'], "tumor_")

    def einstein_merged(self):
        # first find the merged dir
        merged_dir = "%s/Merged" % self.sample_json['sample_folder']
        if os.path.isdir(merged_dir):
            # if the merged file has >= 30x coverage, copy the VCF file
            merged_amp = glob.glob("%s/*.amplicon.cov.xls" % merged_dir)[0]
            command = "tail -n +2 %s | awk -v cutoff=30 '{ if ($10 >= cutoff) printf \".\"}' | wc -c" % merged_amp
            amp_cov = runCommandLine(command, get_output=True)
            command = "tail -n +2 %s | wc -l" % merged_amp
            num_amps = runCommandLine(command, get_output=True)
            if (float(amp_cov) / float(num_amps)) >= 0.9:
                # copy the VCF file
                var_file = glob.glob("%s/*.vcf" % merged_dir)[0]
                command = "cp %s /home/ionadmin/jeff/Einstein_passVCF/%s.vcf" % (
                        var_file, self.sample_json['sample_name'])
                runCommandLine(command)
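# Throughout QC_Sample, cleanup_runs(runs, cleanup_settings, no_errors) takes a
# list of run json paths. A minimal sketch of that interface; the deleted file
# patterns (PTRIM.bam and per-chromosome bams, mentioned in the comments above)
# and the settings check are assumptions:
import glob
import json
import os

class Cleanup:
    def cleanup_runs(self, runs, cleanup_settings, no_errors):
        # Skip cleanup if the pipeline hit errors or cleanup is disabled.
        if not no_errors or not cleanup_settings:
            return
        for run in runs:
            run_json = json.load(open(run))
            for pattern in ("PTRIM.bam", "chr*.bam"):
                for f in glob.glob("%s/%s" % (run_json['run_folder'], pattern)):
                    os.remove(f)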
class QC_Sample:
    def __init__(self, options):
        self.__QCDirectory = "/rawdata/legos/scripts/QC"
        self.no_errors = True
        self.cleanup_sample = Cleanup()
        self.options = options
        self.sample_json = json.load(open(options.json))
        self.qc_run = QC_Run(self.sample_json)

    # will find all of the runs in a sample and QC them with each other
    def QC_merge_runs(self):
        # if this is a germline sample, QC all of the normal runs with each other.
        if self.sample_json['sample_type'] == 'germline':
            # Use the sample_status here to not re-run the QC and to not overwrite run status.
            # The 'sample_status' should be reset to 'pushed' when new runs are pushed.
            #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
            # if the user specified the '--pass_fail' option, then run this part still
            if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
                # QC the normal runs with each other
                self.QC_runs(self.sample_json['runs'])
                # write the sample json file
                write_json(self.sample_json['json_file'], self.sample_json)
            # what if there is only one run that passes all of the metrics? It should be
            # marked as the 'final_json' and have the 'pass_fail_merged' flag marked as pass.
            # make the merger
            merger = Merger(self.sample_json['json_file'])
            # Check to see if the normal runs are ready to be merged.
            merge = merger.check_merge(self.sample_json['runs'])
            if merge == True:
                # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
                merger.merge_runs('germline')
                # load the sample json file because merger edited it.
                self.sample_json = json.load(open(self.sample_json['json_file']))
                # update the merged run status
                merger.update_merged_run_status(self.sample_json['merged_json'])
                if json.load(open(self.sample_json['merged_json']))['pass_fail_merged_status'] == 'pass':
                    # Set the sample_status
                    self.sample_json['sample_status'] = 'merged'
                    # cleanup the individual run bam files
                    self.cleanup_sample.cleanup_runs(self.sample_json['runs'],
                            self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                    # Cleanup the merged dir
                    self.cleanup_sample.cleanup_runs([self.sample_json['merged_json']],
                            self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                else:
                    self.sample_json['sample_status'] = 'awaiting_more_sequencing'
        # if this is a tumor_normal sample, find the normal and tumor runs, and then QC them with each other.
        elif self.sample_json['sample_type'] == 'tumor_normal':
            # Separate the runs into tumor and normal lists
            normal_runs, tumor_runs = self.getTumor_Normal()
            if self.sample_json['analysis']['settings']['type'] == 'all_tumor_normal':
                # Use the sample_status here to not re-run the QC and to not overwrite run status.
                # The 'sample_status' should be reset to 'pushed' when new runs are pushed.
                #if self.sample_json['sample_status'] != 'pending_merge' and self.sample_json['sample_status'] != 'pending_3x3_review' and self.sample_json['sample_status'] != 'merged':
                # if the user specified the '--pass_fail' option, then run this part still
                if self.sample_json['sample_status'] == 'pushed' or self.options.pass_fail or self.options.qc_all:
                    # QC the normal or tumor runs with each other
                    self.QC_runs(normal_runs, 'normal_')
                    self.QC_runs(tumor_runs, 'tumor_')
                    # now QC the tumor and normal runs together.
                    self.QC_normal_tumor_runs(normal_runs, tumor_runs)
                # make the excel spreadsheet containing the data and copy it back to the proton
                #self._make_xlsx()
                # write the sample json file
                write_json(self.sample_json['json_file'], self.sample_json)
                # make the merger
                merger = Merger(self.sample_json['json_file'])
                # Check to see if the normal runs are ready to be merged.
                merge_normal = merger.check_merge(normal_runs, 'Normal/', 'normal_')
                if merge_normal == True:
                    # merge the normal and/or tumor runs. Will only merge the passing runs with each other.
                    merger.merge_runs('normal', 'Normal_', 'normal_')
                # Check to see if the tumor runs are ready to be merged.
                merge_tumor = merger.check_merge(tumor_runs, 'Tumor/', 'tumor_')
                if merge_tumor == True:
                    merger.merge_runs('tumor', 'Tumor_', 'tumor_')
                # load the sample json file because merger edited it.
                self.sample_json = json.load(open(self.sample_json['json_file']))
                # If any runs were merged, QC them. If there is only 1 normal and 1 tumor run, they won't be QC'd again.
                #if normal_merge_dir != '' or tumor_merge_dir != '' or (len(normal_passing_bams) == 1 and len(tumor_passing_bams) == 1):
                # only QC all for the actual merged runs for now (PNET).
                # now QC the tumor and normal merged bams together if both normal and tumor runs are ready.
                if (merge_normal or merge_tumor) and ('merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json):
                    self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json,
                            self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'],
                            'normal_', 'tumor_', '_merged')
                    self.sample_json, merged_perc_avail_bases = self.qc_run.update_3x3_runs_status(
                            self.sample_json, self.sample_json['merged_normal_json'],
                            self.sample_json['merged_tumor_json'], qc_json)
                    # update the merged run status
                    merger.update_merged_run_status(self.sample_json['merged_normal_json'], merged_perc_avail_bases)
                    merger.update_merged_run_status(self.sample_json['merged_tumor_json'], merged_perc_avail_bases)
                    # cleanup the individual run bam files
                    if merged_perc_avail_bases > .9:
                        # Cleanup the PTRIM.bam and chr bam files after all of the QC is done.
                        # are there any other files to clean up?
                        self.cleanup_sample.cleanup_runs(self.sample_json['runs'],
                                self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                        #self.cleanup_sample.delete_runs(runs, self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                        # Cleanup after the merging QC is done.
                        self.cleanup_sample.cleanup_runs(
                                [self.sample_json['final_normal_json'], self.sample_json['final_tumor_json']],
                                self.sample_json['analysis']['settings']['cleanup'], self.no_errors)
                        # Set the sample_status
                        self.sample_json['sample_status'] = 'merged_pass'
                    else:
                        self.sample_json['sample_status'] = 'awaiting_more_sequencing'
        # print the final status
        if self.no_errors == False or self.qc_run.no_errors == False:
            sys.stderr.write("%s finished with errors. See %s/sge.log for more details" % (
                    self.sample_json['sample_name'], self.sample_json['output_folder']))
            self.sample_json['sample_status'] = 'failed'
            write_json(self.sample_json['json_file'], self.sample_json)
            sys.exit(1)
        else:
            print "%s finished with no errors!" % (self.sample_json['sample_name'])
        # write the sample json file
        write_json(self.sample_json['json_file'], self.sample_json)
        # make the excel spreadsheet containing the data and copy it back to the proton
        self._make_xlsx()

    # Separate the runs into tumor and normal lists
    def getTumor_Normal(self):
        normal_runs = []
        tumor_runs = []
        for run in self.sample_json['runs']:
            run_json = json.load(open(run))
            # temp fix for runs that have old JSON files (i.e. SEGA)
            if 'run_type' not in run_json or 'run_num' not in run_json:
                if re.search('N-', run):
                    run_json['run_type'] = 'normal'
                else:
                    run_json['run_type'] = 'tumor'
                run_json['pass_fail_status'] = 'pending'
                run_json['json_type'] = 'run'
                run_json['json_file'] = run
                run_json['run_name'] = run_json['name']
                run_json['run_num'] = run_json['run_name'][-1]
                run_json['sample_name'] = run_json['sample']
                if re.search('-', run):
                    run_json['run_folder'] = '/'.join(run.split('/')[:-1])
                    run_json['sample_folder'] = os.path.abspath('/'.join(run.split('/')[:-1]) + "/../..")
                write_json(run, run_json)
            # temp fix over
            if 'analysis' not in run_json:
                bam = glob.glob("%s/*.bam" % run_json['run_folder'])[0].split('/')[-1]
                run_json['analysis'] = {'files': [bam]}
                write_json(run, run_json)
            if run_json['run_type'] == 'normal':
                normal_runs.append(run)
            elif run_json['run_type'] == 'tumor':
                tumor_runs.append(run)
            else:
                print "ERROR run type is not normal or tumor."
        return normal_runs, tumor_runs

    # QC the normal runs with each other
    def QC_runs(self, runs, pref=''):
        # first run TVC_CV and get the Run info to prepare for QC2Runs
        for run in runs:
            run_json = json.load(open(run))
            # only run these if this run has a status of pending.
            # This way the pass_fail_status can be manually overwritten.
            if run_json['pass_fail_status'] == "pending" or self.options.pass_fail:
                self.qc_run.runTVC_COV(run, pref)
                self.qc_run.getRunInfo(run, pref)
                # Update the run status based on the metrics gathered by QC_getRunInfo.sh
                self.qc_run.update_run_status(run, len(runs))
        # if there is only one run for this sample, then set the status to 'pending_merge'
        # so that the only run will be set as the 'final_json'
        pending_runs, passing_runs = self.qc_run.get_runs_status(runs)
        if len(passing_runs) == 1:
            self.sample_json['sample_status'] = 'pending_merge'
        else:
            for run1 in runs:
                run1_json = json.load(open(run1))
                for run2 in runs:
                    run2_json = json.load(open(run2))
                    # check to see if these two runs should be QC'd together.
                    # Only QC the runs that pass the single run QC metrics.
                    if int(run1_json['run_num']) < int(run2_json['run_num']) and (
                            (run1_json['pass_fail_status'] == 'pass' and
                             run2_json['pass_fail_status'] == 'pass') or self.options.qc_all):
                        self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json, run1, run2, pref, pref)
                        self.sample_json, perc_avail_bases = self.qc_run.update_3x3_runs_status(
                                self.sample_json, run1, run2, qc_json)

    # now QC the tumor and normal runs together.
    def QC_normal_tumor_runs(self, normal_runs, tumor_runs):
        for normal_run in normal_runs:
            for tumor_run in tumor_runs:
                normal_json = json.load(open(normal_run))
                tumor_json = json.load(open(tumor_run))
                # Only QC the runs that pass the single run QC metrics.
                if (normal_json['pass_fail_status'] == 'pass' and
                        tumor_json['pass_fail_status'] == 'pass') or self.options.qc_all:
                    self.sample_json, qc_json = self.qc_run.QC_2Runs(
                            self.sample_json, normal_run, tumor_run, 'normal_', 'tumor_')
                    self.sample_json, perc_avail_bases = self.qc_run.update_3x3_runs_status(
                            self.sample_json, normal_run, tumor_run, qc_json)

    # make the xlsx file to be copied back to the proton
    def _make_xlsx(self):
        xlsx_file = '%s/%s_QC.xlsx' % (self.sample_json['qc_folder'], self.sample_json['sample_name'])
        make_xlsx_command = "python2.7 %s/QC_generateSheets.py " % self.__QCDirectory + \
            "--sample_path %s " % self.sample_json['sample_folder'] + \
            "--sheet_per_sample " + \
            "--out %s " % xlsx_file + \
            "--ex_json %s " % (self.sample_json['json_file'])
        status = runCommandLine(make_xlsx_command)
        if status != 0:
            print "unable to generate the excel file"
        else:
            print "Generated the QC spreadsheet successfully!"
        # it would be really really cool if I could send them an email with the xlsx file!!
        if self.options.email and 'emails' in self.sample_json:
            # TEMP add my email automatically
            if '*****@*****.**' not in self.sample_json['emails']:
                self.sample_json['emails'].append('*****@*****.**')
            for email in self.sample_json['emails']:
                # this command will email the status of the sample, and attach the excel spreadsheet
                email_command = '\tprintf "%s finished with a status of %s. \\n`grep sample_status *.json`\\n" | (cat - && uuencode %s %s) | ssmtp -vvv %s >/dev/null 2>&1\n' % (
                        self.sample_json['sample_name'], "pass", xlsx_file, xlsx_file.split('/')[-1], email)
                runCommandLine(email_command)
        # I will copy the .xlsx file to every run of the sample
        for run in self.sample_json['runs']:
            run_json = json.load(open(run))
            if 'server_ip' in run_json and 'orig_filepath_plugin_dir' in run_json:
                copy_command = "scp %s ionadmin@%s:%s " % (xlsx_file, run_json['server_ip'],
                        run_json['orig_filepath_plugin_dir'])
                status = runCommandLine(copy_command)
                if status == 0:
                    print "Copied the QC.xlsx file back to %s successfully! %s" % (run_json['proton'], copy_command)
                else:
                    print "Failed to copy the QC.xlsx file back to %s... %s" % (run_json['proton'], copy_command)
                # try to copy the log file back as well.
                copy_command = "scp %s/sge.log ionadmin@%s:%s/QC.log " % (self.sample_json['sample_folder'],
                        run_json['server_ip'], run_json['orig_filepath_plugin_dir'])
                status = runCommandLine(copy_command)
                # try to add the log file to the plugin's log file.
                # this didn't work...
                #copy_command = "ssh ionadmin@%s:%s/QC.log 'cat %s/QC.log >> %s/drmaa_stdout.txt"%(run_json['server_ip'], run_json['orig_filepath_plugin_dir'], run_json['orig_filepath_plugin_dir'], run_json['orig_filepath_plugin_dir'])
                #status = runCommandLine(copy_command)

    # if the update_json flag is specified, then update the cutoffs found in the normal json file.
    def update_cutoffs(self):
        # load the json file
        update_json = json.load(open(self.options.update_cutoffs))
        # set the cutoff settings to the example json's cutoff settings
        self.sample_json['analysis']['settings']['cutoffs'] = update_json['analysis']['settings']['cutoffs']
        # write the updated sample's json file.
        write_json(self.options.json, self.sample_json)

    # move the old 3x3 tables to the flag "old_GTs"
    def recalc_3x3_tables(self):
        # load the output QC json. will be used to check if this combination has already been made.
        qc_json_data = {}
        if os.path.isfile(self.sample_json['results_qc_json']):
            qc_json_data = json.load(open(self.sample_json['results_qc_json']))
        # if the user specified to recalculate the 3x3 tables, do that here.
        if self.options.recalc_3x3_tables and 'QC_comparisons' in qc_json_data:
            # rearrange the old 3x3 tables to calculate the new 3x3 tables using the updated GT cutoffs
            qc_json_data['old_GTs'] = qc_json_data['QC_comparisons']
            del qc_json_data['QC_comparisons']
            write_json(self.sample_json['results_qc_json'], qc_json_data)

    # get the alignment statistics for each run or merged bam file.
    def get_alignment_stats(self):
        # TEMP fix the runs.
        runs = []
        for run in glob.glob("%s/Normal/N-[0-9]/*.json" % self.sample_json['sample_folder']):
            runs.append(run)
        for run in glob.glob("%s/Tumor/T-[0-9]/*.json" % self.sample_json['sample_folder']):
            runs.append(run)
        self.sample_json['runs'] = runs
        write_json(self.sample_json['json_file'], self.sample_json)
        # now get the alignment statistics
        for run in self.sample_json['runs']:
            Align_Stats(run)
        #if 'merged_normal_json' in self.sample_json:
        #    Align_Stats(self.sample_json['merged_normal_json'])
        #if 'merged_tumor_json' in self.sample_json:
        #    Align_Stats(self.sample_json['merged_tumor_json'])
        # copy the xlsx file here because it didn't get copied for a lot of samples
        self._make_xlsx()

    # subset out the 718 gene set from the final merged PNET 3x3 tables
    def get_718_subset(self):
        # add the path to the 718 subset:
        self.sample_json['analysis']['settings']['subset_bed'] = '/rawdata/support_files/BED/PNET/AmpliSeqExome_PNET_subset.bed'
        self.sample_json['analysis']['settings']['chromosomes_to_analyze_merged'] = ['all', '718']
        if 'results_qc_json' not in self.sample_json and 'results_QC_json' in self.sample_json:
            self.sample_json['results_qc_json'] = self.sample_json['results_QC_json']
        self.sample_json['emails'] = ['*****@*****.**']
        #self.sample_json['sample_status'] = 'pending_merge'
        write_json(self.sample_json['json_file'], self.sample_json)
        #Normal_Merged1vsTumor_Merged1
        #NMerge1vsTMerged1
        qc_comp_dir = ''
        if os.path.isdir("%s/allNMerged1vsTMerged1" % self.sample_json['qc_folder']):
            qc_comp_dir = "%s/allNMerged1vsTMerged1" % self.sample_json['qc_folder']
            qc_comp = "NMerged1vsTMerged1"
        elif os.path.isdir("%s/allNormal_Merged1vsTumor_Merged1" % self.sample_json['qc_folder']):
            qc_comp_dir = "%s/allNormal_Merged1vsTumor_Merged1" % self.sample_json['qc_folder']
            qc_comp = "Normal_Merged1vsTumor_Merged1"
        if qc_comp_dir != '':
            results_qc_json = json.load(open(self.sample_json['results_qc_json']))
            # fix the name of the folder and the name in the results_qc_json
            normal_merged_name = json.load(open(self.sample_json['merged_normal_json']))['run_name']
            tumor_merged_name = json.load(open(self.sample_json['merged_tumor_json']))['run_name']
            new_qc_comp = "%svs%s" % (normal_merged_name, tumor_merged_name)
            print "moving %s to %s" % (qc_comp_dir, "%s/all%s" % (self.sample_json['qc_folder'], new_qc_comp))
            shutil.move(qc_comp_dir, "%s/all%s" % (self.sample_json['qc_folder'], new_qc_comp))
            results_qc_json = json.load(open(self.sample_json['results_qc_json']))
            new_qc_comp_dict = results_qc_json['QC_comparisons']['all']['normal_tumor'][qc_comp]
            del results_qc_json['QC_comparisons']['all']['normal_tumor'][qc_comp]
            results_qc_json['QC_comparisons']['all']['normal_tumor'][new_qc_comp] = new_qc_comp_dict
            results_qc_json['sample_name'] = self.sample_json['sample_name']
            results_qc_json['sample'] = self.sample_json['sample_name']
            write_json(self.sample_json['results_qc_json'], results_qc_json)
        if 'merged_normal_json' in self.sample_json and 'merged_tumor_json' in self.sample_json:
            print "Running QC_2Runs"
            self.sample_json, qc_json = self.qc_run.QC_2Runs(self.sample_json,
                    self.sample_json['merged_normal_json'], self.sample_json['merged_tumor_json'],
                    'normal_', 'tumor_', '_merged')
            print 'done'
        # done for now
        self._make_xlsx()