def setConcatDatabaseDict(self):
    """
        creates concatenated dataframe from all files in a given list of paths to database subdirectories
        structure {subdirectory: concatenated_table_of_all_files_in_database/subdirectory/*}
    """
    # create dataframe from the first file in file_list in a given key (subdirectory) of database_dict
    for subdirectory, file_list in self.database_dict.items():
        self.concat_database_dict[subdirectory] = utils.readInDataframe(file_list[0])
        # strip whitespace from the column headings
        column_list = self.concat_database_dict[subdirectory].columns
        column_list = [column_header.strip() for column_header in column_list]
        self.concat_database_dict[subdirectory].columns = column_list
        # keep appending (rbind) dataframes to the bottom
        for file in file_list[1:]:
            # read in the next file in the list as next_sheet
            next_sheet = utils.readInDataframe(file)
            self.logger.debug('columns of %s are %s' % (file, next_sheet.columns))
            self.concat_database_dict[subdirectory] = self.concat_database_dict[subdirectory].append(next_sheet)
        # reset the index so it is sequential
        self.concat_database_dict[subdirectory].reset_index(inplace=True, drop=True)
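# A minimal sketch, assuming pandas >= 1.0, of the same concatenation using pd.concat
# rather than DataFrame.append (deprecated in pandas 1.4, removed in 2.0). This is an
# illustration, not a drop-in replacement for the method above; the read_func argument
# stands in for utils.readInDataframe so the sketch carries no import-path assumptions.
import pandas as pd


def concatSubdirectoryFiles(file_list, read_func):
    """
        read every file in file_list with read_func (eg utils.readInDataframe) and
        return a single dataframe with whitespace-stripped column headings
    """
    frames = []
    for file in file_list:
        sheet = read_func(file)
        # strip whitespace from column headings, as in setConcatDatabaseDict above
        sheet.columns = [column_header.strip() for column_header in sheet.columns]
        frames.append(sheet)
    # ignore_index=True replaces the final reset_index(inplace=True, drop=True) step
    return pd.concat(frames, ignore_index=True)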
def __init__(self, **kwargs):
    # additional attributes to add to the _attributes in StandardData
    # TODO: possibly change inheritance to a subclass of OrganismData that sets up a class for ANY scheduler
    #  manipulation (ie align_counts, this) that takes email as an optional argument
    self._igv_attributes = ['query_sheet_path', 'igv_output_dir']
    # initialize StandardData with the extended _attributes
    super(IgvObject, self).__init__(self._igv_attributes, **kwargs)
    # initialize list to store bam files that need to be indexed (must be done by batch script)
    self.bam_file_to_index_list = []
    # create logger for IgvObject
    self.logger = utils.createLogger(self.log_file_path, __name__, 'DEBUG')
    try:
        self.query_sheet_path = kwargs['query_sheet_path']
        self.control_sheet_path = kwargs['control_sheet_path']
    except KeyError:
        self.logger.debug('query sheet path and/or control sheet path not passed in constructor')
    else:
        self.sample_df = utils.readInDataframe(self.query_sheet_path)
        self.control_sample_df = self.createControlSampleDict()
    # get gene dictionary with chromosome, gene coordinates, strand
    if self.annotation_file.endswith('gtf'):
        self.annotation_dict = annotation_tools.parseGtf(self.annotation_file)
    elif self.annotation_file.endswith('gff') or self.annotation_file.endswith('gff3'):
        self.annotation_dict = annotation_tools.parseGff3(self.annotation_file)
    else:
        sys.exit("ERROR: The gene annotation format cannot be recognized.")
    # TODO: clean up preceding blocks -- move parseGtf/parseGff3 dispatch to OrganismData
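# A hedged sketch of the consolidation the TODO above suggests: a single helper that
# dispatches on the annotation file extension and raises instead of calling sys.exit.
# parseAnnotationFile is illustrative and not a function in the codebase; it assumes
# the same annotation_tools.parseGtf/parseGff3 parsers used in the constructor above.
def parseAnnotationFile(annotation_file):
    """return the annotation dict (chromosome, gene coordinates, strand) for a gtf/gff/gff3 file"""
    if annotation_file.endswith('gtf'):
        return annotation_tools.parseGtf(annotation_file)
    if annotation_file.endswith('gff') or annotation_file.endswith('gff3'):
        return annotation_tools.parseGff3(annotation_file)
    raise ValueError('ERROR: The gene annotation format cannot be recognized: %s' % annotation_file)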
def main(argv):
    args = parseArgs(argv)
    try:
        if not os.path.isdir(args.count_directory):
            raise NotADirectoryError('ERROR: %s does not exist.' % args.count_directory)
        if not os.path.isfile(args.query_sheet):
            raise FileNotFoundError('ERROR: %s does not exist.' % args.query_sheet)
    except (NotADirectoryError, FileNotFoundError) as error:
        print(error)
    else:
        count_dirpath = args.count_directory
        query_sheet_path = args.query_sheet
        query_df = utils.readInDataframe(query_sheet_path)
        # extract count files from count_dir
        count_dir_file_list = glob.glob(os.path.join(count_dirpath, '*read_count.tsv'))
        # TODO: SOME ERROR CHECKING ON THE FASTQFILENAME?
        # all crypto records will have genotype beginning with CNAG_; use this to extract lists of crypto and yeast samples from the query
        crypto_sample_list = list(query_df[query_df.genotype1.str.startswith('CNAG')].fastqFileName)  # TODO: after metadata organism column added, update this section
        s288c_r64_sample_list = list(query_df[~query_df.genotype1.str.startswith('CNAG')].fastqFileName)
        # split the list of count files based on membership in the sample lists above
        count_files_by_organism_dict = {
            'KN99': [x for x in count_dir_file_list
                     if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in crypto_sample_list],
            'S288C_R64': [x for x in count_dir_file_list
                          if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in s288c_r64_sample_list]
        }
        # create and write out count sheets
        for organism, count_file_list in count_files_by_organism_dict.items():
            if len(count_file_list) > 0:
                od = OrganismData(organism=organism, config_file=args.config_file, interactive=args.interactive)
                count_df = od.createCountSheet(count_file_list)
                output_path = os.path.join(utils.dirPath(utils.dirPath(count_file_list[0])),
                                           '%s_raw_count.csv' % organism)
                print('writing count file to %s' % output_path)
                count_df.to_csv(output_path, index=False)
def checkColumns(self, subdirectory_specs_dict, subdirectory_filepath):
    """
        check column heading names and entries in each row/column for adherence to the specs at:
        https://github.com/BrentLab/database_files/wiki
        :param subdirectory_specs_dict: see constructor. In the case of bioSample, you would pass db.specification_dict['bioSample']
        :param subdirectory_filepath: path to a sheet in a given subdirectory (eg a bioSample .xlsx)
        :return: colname_inconsistencies_dict, a dict in structure {specification_heading: nearest_match_to_heading, ...}
                 row_inconsistencies_dict, a dict in structure {row_index: column_with_inconsistent_entry, ...}
    """
    self.logger.info('path to sheet is %s' % subdirectory_filepath)
    # see :return: statement for structure
    colname_inconsistencies_dict = {}
    row_inconsistencies_dict = {}
    # list to store inappropriately formatted column names
    skip_columns = []
    # read in subdirectory_filepath as dataframe
    subdirectory_df = utils.readInDataframe(subdirectory_filepath)
    # loop over rows in dataframe
    for index, row in subdirectory_df.iterrows():
        # convert row into a dictionary {column: value, ...}
        row_dict = dict(row)
        for column_name, column_entry in row_dict.items():
            column_entry = str(column_entry)
            try:
                column_specs_regex = subdirectory_specs_dict['column_specs_dict'][column_name]
            except KeyError:
                if column_name not in skip_columns:
                    if self.logger:
                        self.logger.info('Column name not found in specs: %s' % column_name)
                        self.logger.info('row for offending column is: %s' % row)
                    nearest_match = difflib.get_close_matches(column_name,
                                                              subdirectory_specs_dict['column_specs_dict'].keys())[0]
                    colname_inconsistencies_dict.setdefault(nearest_match, column_name)
                    print('\tCannot check %s in %s. Either the format of the column is incorrect, or it is not in the specifications_dictionary.\n'
                          '\tThe rest of this column could not be checked. Correct the column name, and re-run.'
                          % (column_name, subdirectory_filepath))
                    skip_columns.append(column_name)
            else:
                if not re.match(column_specs_regex, column_entry):
                    row_inconsistencies_dict.setdefault(str(index), []).append(column_name)

    return colname_inconsistencies_dict, row_inconsistencies_dict
def main(argv):
    """
        main method
        :param argv: cmd line arguments
    """
    # parse cmd line arguments
    args = parseArgs(argv)
    print('...parsing cmd line arguments')
    query_sheet_path = args.query_sheet
    try:
        if not os.path.isfile(query_sheet_path):
            raise FileNotFoundError('DNE: %s' % query_sheet_path)
    except FileNotFoundError:
        print('The query sheet path is not valid. Check and try again.')
    else:
        query_df = utils.readInDataframe(query_sheet_path)

    # store interactive flag
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False

    run_list = list(query_df.runNumber.unique())
    # create paths from /scratch to the run directories
    sd = StandardData(config_file=args.config_file, interactive=interactive_flag)
    run_path_list = [os.path.join(sd.align_count_results, 'run_' + str(x) + '_samples') for x in run_list]
    # check that the paths exist TODO: CHECK CONTENTS OF SUBDIRECTORY FOR COMPLETENESS
    print('...validating paths to run directories')
    validated_run_path_list = validatePaths(sd, run_list, run_path_list)
    # write a lookup file of run directory paths for the sbatch cmd (see https://htcfdocs.readthedocs.io/en/latest/runningjobs/)
    lookup_filename = 'qual_assess_1_lookup_' + str(sd.year_month_day) + '_' + str(utils.hourMinuteSecond()) + '.txt'
    lookup_output_path = os.path.join(sd.job_scripts, lookup_filename)
    print('...writing lookup file for sbatch script to: %s' % lookup_output_path)
    with open(lookup_output_path, 'w') as file:
        file.write('\n'.join(map(str, validated_run_path_list)))
    # write sbatch script to run qual_assess on all runs in the lookup file above
    script = writeSbatchScript(sd, args.user_name, validated_run_path_list, lookup_output_path, query_sheet_path)
    sbatch_filename = 'qual_assess_1_batch_' + str(sd.year_month_day) + '_' + str(utils.hourMinuteSecond()) + '.sbatch'
    qual_assess_job_script_path = os.path.join(sd.job_scripts, sbatch_filename)
    print('...writing sbatch script to: %s' % qual_assess_job_script_path)
    with open(qual_assess_job_script_path, "w") as f:
        f.write(script)
    # submit the sbatch script
    cmd = 'sbatch %s' % qual_assess_job_script_path
    utils.executeSubProcess(cmd)
    print('\nCheck status by cat\'ing the sbatch file above and then cat\'ing the .out file in the sbatch script\n')
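# A hedged sketch of the kind of job-array script writeSbatchScript (defined elsewhere in
# this script) is expected to produce, following the lookup-file pattern in the htcfdocs
# link above: each array task reads one run directory path out of the lookup file. The
# qual_assess_1.py call and the #SBATCH values are illustrative assumptions, not a
# verbatim copy of the real template.
def sketchQualAssessArrayScript(lookup_output_path, num_runs, query_sheet_path):
    """compose an illustrative SLURM job-array script over the run-path lookup file"""
    return ('#!/bin/bash\n'
            '#SBATCH --array=1-%d\n'
            '#SBATCH --mem=12G\n'
            '#SBATCH -o qual_assess_1_%%A_%%a.out\n\n'
            'ml rnaseq_pipeline\n\n'
            '# pull the run directory for this array task out of the lookup file\n'
            'run_path=$(sed -n "${SLURM_ARRAY_TASK_ID}p" %s)\n'
            'qual_assess_1.py -ac "$run_path" -qs %s\n'
            % (num_runs, lookup_output_path, query_sheet_path))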
def main(argv):
    args = parseArgs(argv)
    # parse cmd line arguments and error check paths/values
    print('...parsing cmd line input')
    try:
        if not os.path.isdir(args.align_count_dir):
            raise NotADirectoryError('OutputDirDoesNotExist')
    except NotADirectoryError:
        print('%s does not lead to a valid directory. Check the path and resubmit with working -r' % args.align_count_dir)
    else:
        align_count_path = args.align_count_dir
        output_directory = args.align_count_dir
    try:
        if not os.path.isfile(args.query_sheet_path):
            raise FileNotFoundError('QuerySheetDoesNotExist')
    except FileNotFoundError:
        print('%s does not lead to a valid file. Check and resubmit correct -qs' % args.query_sheet_path)
    except TypeError:
        pass
    else:
        query_sheet_path = args.query_sheet_path

    # get run number, if it exists, for output naming. If DNE, ask the user for a name to insert into <your_input>_quality_summary.csv
    try:
        run_number = utils.getRunNumber(align_count_path)
        # create prefix for the qual_assess output filename
        filename_prefix = 'run_%s' % run_number
    except AttributeError:  # TODO: this will cause a problem if running via batchscript
        filename_prefix = input('No run number detected in input directory name. Enter something to insert in the output file\n'
                                'name: <your_input>_quality_summary.csv: ')

    # store interactive flag
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False

    # read in query sheet
    # TODO: GENERALIZE THIS INTO EITHER STANDARDDATA OR UTILS. RETURN AS DICT. DO THIS AFTER ADDING ORGANISM COLUMN TO METADATA SPECS
    query_df = utils.readInDataframe(query_sheet_path)
    query_fastq_list = list(query_df.fastqFileName)

    # extract bam file names and filter for files in the query sheet (see the filterBySuffix sketch after this function)
    bam_list = utils.extractFiles(align_count_path, '.bam')
    filtered_bam_list = [x for x in bam_list
                         if os.path.basename(x).replace('_sorted_aligned_reads_with_annote.bam', '.fastq.gz') in query_fastq_list]
    # extract novoalign logs and filter for files in the query sheet
    novoalign_logs = utils.extractFiles(align_count_path, 'novoalign.log')
    filtered_novoalign_logs = [x for x in novoalign_logs
                               if os.path.basename(x).replace('_novoalign.log', '.fastq.gz') in query_fastq_list]
    # extract count files and filter for files in the query sheet
    count_list = utils.extractFiles(align_count_path, 'read_count.tsv')
    filtered_count_list = [x for x in count_list
                           if os.path.basename(x).replace('_read_count.tsv', '.fastq.gz') in query_fastq_list]
    # from count_list, create a list of fastq.gz names
    extracted_sample_fastq_list = [os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) for x in count_list]

    if len(filtered_bam_list) != len(filtered_count_list) or len(filtered_bam_list) != len(filtered_novoalign_logs):
        sys.exit('The number of bam_files, count_files and/or log_files does not match. Check file contents')

    # all crypto records will have genotype beginning with CNAG_
    crypto_query_df = query_df[~query_df.genotype1.isna()
                               & query_df.genotype1.str.startswith('CNAG')
                               & query_df.fastqFileName.isin(extracted_sample_fastq_list)]
    yeast_query_df = query_df[(~(query_df.genotype1.isna() | query_df.fastqFileName.isin(crypto_query_df.fastqFileName))
                               & query_df.fastqFileName.isin(extracted_sample_fastq_list))]

    # create list to store qual_assess dataframes
    qual_assess_df_list = []

    if len(crypto_query_df) > 0:
        # if coverage_check is passed in cmd line, include query and coverage_check_flag in constructor (automatically sets some values)
        # TODO: make this a function with arguments to pass so as not to repeat the entire constructor
        print('...compiling KN99 samples information')
        crypto_qa_object = CryptoQualAssessAuditObject(organism='KN99',
                                                       bam_file_list=filtered_bam_list,
                                                       count_file_list=filtered_count_list,
                                                       novoalign_log_list=filtered_novoalign_logs,
                                                       coverage_check_flag=True,
                                                       query_df=crypto_query_df,
                                                       config_file=args.config_file,
                                                       interactive=interactive_flag)
        # add dataframe to list
        try:
            qual_assess_df_list.append(crypto_qa_object.qual_assess_df)
        except AttributeError:
            error_msg = 'There was an error appending the KN99 qual assess dataframe. ' \
                        'Check the paths in the query sheet and align_counts directory'
            crypto_qa_object.logger.debug(error_msg)
            print(error_msg)

    if len(yeast_query_df) > 0:
        yeast_qa_object = S288C_R54QualAssessAuditObject(organism='S288C_R64',
                                                         bam_file_list=filtered_bam_list,
                                                         count_file_list=filtered_count_list,
                                                         novoalign_log_list=filtered_novoalign_logs,
                                                         query_path=args.query_sheet_path,
                                                         config_file=args.config_file,
                                                         interactive=interactive_flag)
        print('...compiling S288C_R64 alignment information')
        # create dataframes storing the relevant alignment and count metadata from the novoalign and htseq logs
        try:
            qual_assess_df_list.append(yeast_qa_object.qual_assess_df)
        except AttributeError:
            error_msg = 'There was an error appending the S288C_R64 qual assess dataframe. ' \
                        'Check the paths in the query sheet and align_counts directory'
            yeast_qa_object.logger.debug(error_msg)
            print(error_msg)

    # combine dataframes, if both organisms are present
    print('...creating quality_assessment sheet for %s' % filename_prefix)
    combined_qual_assess_1_df = pd.concat(qual_assess_df_list)

    # create filename and write out
    quality_assessment_filename = "%s_sequence_quality_summary.csv" % filename_prefix
    output_path = os.path.join(output_directory, quality_assessment_filename)
    print('writing output to %s' % output_path)
    combined_qual_assess_1_df.to_csv(output_path, index=False)
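# A small hedged sketch generalizing the three filtering steps in main() above: each maps
# a suffixed filename (bam, novoalign log, count tsv) back to its fastq.gz basename and
# keeps only files whose sample appears in the query sheet. filterBySuffix is illustrative
# and not part of the codebase.
import os


def filterBySuffix(file_list, suffix, query_fastq_list):
    """keep files whose basename, with suffix swapped for .fastq.gz, is in query_fastq_list"""
    return [x for x in file_list
            if os.path.basename(x).replace(suffix, '.fastq.gz') in query_fastq_list]

# usage, mirroring the list comprehensions above:
#   filtered_bam_list = filterBySuffix(bam_list, '_sorted_aligned_reads_with_annote.bam', query_fastq_list)
#   filtered_novoalign_logs = filterBySuffix(novoalign_logs, '_novoalign.log', query_fastq_list)
#   filtered_count_list = filterBySuffix(count_list, '_read_count.tsv', query_fastq_list)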
def main(argv):
    """
        main method
        :param argv: cmd line arguments
    """
    # parse cmd line arguments
    args = parseArgs(argv)
    query_sheet_path = args.query_sheet
    try:
        if not os.path.isfile(query_sheet_path):
            raise FileNotFoundError('QuerySheetDoesNotExist: %s' % query_sheet_path)
    except FileNotFoundError:
        print('Query sheet path not valid. Check and try again.')
    # store interactive flag
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False

    # instantiate DatabaseObject --> mostly this will be for access to StandardData paths
    db = DatabaseObject(query_sheet_path=query_sheet_path, config_file=args.config_file, interactive=interactive_flag)
    # read in dataframe
    db.query_df = utils.readInDataframe(db.query_sheet_path)
    # add column organism which identifies either KN99 or S288C_R64 depending on whether genotype1 starts with CNAG
    # TODO: this is a point of weakness -- need to keep an eye here
    db.query_df['organism'] = np.where(db.query_df['genotype1'].str.startswith('CNAG'), 'KN99', 'S288C_R64')
    # cast libraryDate to datetime format
    db.query_df['libraryDate'] = pd.to_datetime(db.query_df['libraryDate'])
    # create strandedness column based on libraryDate. May change to prep protocol at some point, but for now this is best
    db.query_df['strandedness'] = np.where(db.query_df['libraryDate'] > '2015-10-25', 'reverse', 'no')
    # add leading zero to runNumber, if necessary -- taken care of in the loop below
    db.query_df['runNumber'] = db.query_df['runNumber'].astype(str)

    # new list to store run_directory values for the dataframe
    run_directory_list = []
    for index, row in db.query_df.iterrows():
        # some early runs have run numbers that start with zero in /lts. The 0s are dropped in the df b/c they are read in as ints.
        # this step adds the zero back and casts the row to str (see the dtype sketch after this function)
        run_num_tmp = int(float(row['runNumber']))  # TODO: super ugly, needs to be fixed. Not sure why this is now getting read in as 4422.0, eg, as of 20200923
        if run_num_tmp in db._run_numbers_with_zeros:  # TODO: probably the best way is to always read run numbers as strings -- requires changing _run_numbers_with_zeros keys to strings, and checking the rest of the codebase that uses this
            run_number = str(db._run_numbers_with_zeros[run_num_tmp])
        else:
            run_number = run_num_tmp
        # create run directory name, eg run_1234_samples
        run_directory = 'run_' + str(run_number) + '_samples'  # SEE TODO above
        # add to list
        run_directory_list.append(run_directory)
        # create fastq filename path
        try:
            fastq_filename = os.path.basename(row['fastqFileName']).rstrip()
        except TypeError:
            sys.exit("%s <-- not a fastqfilename?" % row['fastqFileName'])
        fastq_scratch_path = os.path.join(db.scratch_sequence, run_directory, fastq_filename)
        # move the fastq file to scratch if it is not already there
        if not os.path.exists(fastq_scratch_path):
            fastq_lts_path = os.path.join(db.lts_sequence, run_directory, fastq_filename)
            scratch_run_directory_path = os.path.join(db.scratch_sequence, run_directory)
            utils.mkdirp(scratch_run_directory_path)
            print('...moving %s to %s' % (fastq_lts_path, scratch_run_directory_path))
            rsync_cmd = 'rsync -aHv %s %s' % (fastq_lts_path, scratch_run_directory_path)
            utils.executeSubProcess(rsync_cmd)
        # update fastqFileName in query_df
        db.query_df.loc[index, 'fastqFileName'] = fastq_scratch_path
    # add column runDirectory from run_directory_list
    db.query_df['runDirectory'] = run_directory_list

    # use OrganismData objects to get paths to the novoalign indices and annotation files
    kn99_organism_data = OrganismData(organism='KN99')
    kn99_novoalign_index = kn99_organism_data.novoalign_index
    # this is annotations + nc, t, r RNA with nc, t, r RNA annotations overlapping protein coding ON SAME STRAND removed. rRNA retained
    kn99_annotation_file = kn99_organism_data.annotation_file
    # this is annotations + nc, t, r RNA with nc, t, r RNA annotations overlapping protein coding removed regardless of strand. rRNA retained
    kn99_annotation_file_no_strand = kn99_organism_data.annotation_file_no_strand
    kn99_genome = kn99_organism_data.genome
    s288c_r64_organism_data = OrganismData(organism='S288C_R64')
    s288c_r64_novoalign_index = s288c_r64_organism_data.novoalign_index
    s288c_r64_annotation_file = s288c_r64_organism_data.annotation_file
    s288c_r64_genome = s288c_r64_organism_data.genome

    # filter the query down to the columns nextflow needs
    nextflow_fastqfile_df = db.query_df[['runDirectory', 'fastqFileName', 'organism', 'strandedness']]
    # verify that every fastq file made it to scratch
    for index, row in nextflow_fastqfile_df.iterrows():
        try:
            if not os.path.isfile(row['fastqFileName']):
                raise FileNotFoundError('fastqFileNotFoundInScratch')
        except FileNotFoundError:
            print('file %s was not successfully moved from lts to scratch' % row['fastqFileName'])
    print('\nnextflow fastq file .csv head:\n')
    print(nextflow_fastqfile_df.head())
    print('\n')
    # write out
    fastq_file_list_output_path = os.path.join(db.job_scripts, 'nextflow_fastqfile_list' + '_' + args.name + '.csv')
    print('...writing out to %s' % fastq_file_list_output_path)
    nextflow_fastqfile_df.to_csv(fastq_file_list_output_path, index=False)

    # config_header goes at the top of the config -- includes date created and StandardObject instructions
    config_header = "/*\n" \
                    "* -------------------------------------------------\n" \
                    "* Brentlab nextflow rnaseq_pipeline configuration\n" \
                    "* -------------------------------------------------\n" \
                    "* created with create_nextflow_config.py on %s\n" \
                    "* note: this is for a specific job for a specific user\n" \
                    "* and not intended as a general config file. To re-create\n" \
                    "* this job, you will need to run create_nextflow_config.py\n" \
                    "* with the same query_sheet input\n" \
                    "*/\n\n" % db.year_month_day

    # params section has all relevant path parameters to run the pipeline
    params_section = "// params necessary for the pipeline\n" \
                     "params {\n" \
                     "\tfastq_file_list = \"%s\"\n" \
                     "\tlts_sequence = \"%s\"\n" \
                     "\tscratch_sequence = \"%s\"\n" \
                     "\tlts_align_expr = \"%s\"\n" \
                     "\talign_count_results = \"%s\"\n" \
                     "\tlog_dir = \"%s\"\n" \
                     "\tKN99_novoalign_index = \"%s\"\n" \
                     "\tKN99_annotation_file = \"%s\"\n" \
                     "\tKN99_annotation_file_no_strand = \"%s\"\n" \
                     "\tKN99_genome = \"%s\"\n" \
                     "\tS288C_R64_novoalign_index = \"%s\"\n" \
                     "\tS288C_R64_annotation_file = \"%s\"\n" \
                     "\tS288C_R64_genome = \"%s\"\n" \
                     "}\n\n" % (fastq_file_list_output_path, db.lts_sequence, db.scratch_sequence,
                                db.lts_align_expr, db.align_count_results, db.log_dir,
                                kn99_novoalign_index, kn99_annotation_file, kn99_annotation_file_no_strand,
                                kn99_genome, s288c_r64_novoalign_index, s288c_r64_annotation_file, s288c_r64_genome)

    # write out the nextflow config
    nextflow_config_path = os.path.join(db.job_scripts, args.name + '_nextflow.config')
    print('...writing nextflow job config file to %s' % nextflow_config_path)
    with open(nextflow_config_path, 'w') as nextflow_config_file:
        nextflow_config_file.write(config_header)
        nextflow_config_file.write(params_section)

    # write and submit an sbatch script with named/combined output/err
    sbatch_script_name = args.name + '_nextflow'
    nextflow_sbatch_path = os.path.join(db.job_scripts, sbatch_script_name + '.sbatch')
    # write sbatch script to submit nextflow job
    print('...writing sbatch script to %s' % nextflow_sbatch_path)
    with open(nextflow_sbatch_path, 'w') as nf_sbatch_file:
        nf_sbatch_file.write('#!/bin/bash\n'
                             '#SBATCH --mem=15G\n'
                             '#SBATCH -o %s/%s.out\n'
                             '#SBATCH -J %s\n\n'
                             'ml rnaseq_pipeline\n\n'
                             'nextflow -C %s run $CODEBASE/tools/align_count_pipeline.nf\n'
                             % (db.sbatch_log, sbatch_script_name, sbatch_script_name, nextflow_config_path))
    sbatch_cmd = 'sbatch %s' % nextflow_sbatch_path
    print('\nsubmitting sbatch script with cmd:\n\t%s' % sbatch_cmd)
    utils.executeSubProcess(sbatch_cmd)
    print('\nCheck progress by entering:\n\ttail %s/%s.out' % (db.sbatch_log, sbatch_script_name))
    print('\nTo run this in an interactive session, do the following:\n\t'
          'interactive\n\tnextflow -C %s run $CODEBASE/tools/align_count_pipeline.nf\n' % nextflow_config_path)
    print('If this job fails or is interrupted, you can resume it from where it failed by adding the flag -r '
          'to the nextflow command in the .sbatch file and resubmitting to sbatch')
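# A hedged sketch of the fix the runNumber TODOs above point toward: read the query sheet
# with runNumber forced to a string so leading zeros survive and the int(float(...)) cast
# becomes unnecessary. This assumes the query sheet is a .csv read with pandas.read_csv,
# which is an assumption about what utils.readInDataframe wraps, not a confirmed detail.
import pandas as pd


def readQuerySheetWithStringRunNumbers(query_sheet_path):
    """read the query sheet keeping runNumber as a (possibly zero-padded) string"""
    return pd.read_csv(query_sheet_path, dtype={'runNumber': str})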