def standardizeDatabaseDataframe(rnaseq_metadata_df, **kwargs):
    """
        convert a dataframe containing sample info to a 'standard form' -- upper case column
        headings, and FASTQFILENAME reduced to the sample name by utils.pathBaseName.
        The dataframe is modified in place and the same object is returned.

        :param rnaseq_metadata_df: pandas dataframe of the rnaseq_metadata. Once the headings are
                                   upper-cased it must have a FASTQFILENAME column (if it has rows)
        :param kwargs: arbitrary keyword arguments. accepted so callers can pass a logger; not read here
        :raises AttributeError: if rnaseq_metadata_df is not a dataframe (eg a filepath was passed)
        :returns: the dataframe with column headings cast to uppercase and FASTQFILENAME simplified
    """
    try:
        # convert column headings to upper case
        rnaseq_metadata_df.columns = rnaseq_metadata_df.columns.str.upper()
    except AttributeError as err:
        # fail here with the helpful message instead of printing it and then crashing
        # on the next attribute access with an unrelated one
        raise AttributeError(
            'standardizeDatabaseDataframe takes a dataframe, not a filepath, as an argument'
        ) from err

    def _simple_name(fastq_file_path):
        # pathBaseName is applied twice, exactly as the original row-by-row code did
        return utils.pathBaseName(utils.pathBaseName(fastq_file_path))

    # whole-column replacement: does not depend on index labels being unique.
    # A frame without rows is returned untouched (nothing to rewrite).
    if len(rnaseq_metadata_df.index) > 0:
        rnaseq_metadata_df['FASTQFILENAME'] = rnaseq_metadata_df['FASTQFILENAME'].apply(_simple_name)

    return rnaseq_metadata_df
def createBedFile(self, gene_list, fastq_filename, description, flanking_region=500):
    """
        TODO currently file is _read_count.bed -- git rid of the _read_count
        Write a bed file describing the IGV regions of interest for one sample. One
        tab-separated line is written per gene: chrm, start, stop, '[sample]gene.png', '.', strand

        :param gene_list: list of genes to include in the bed file. each must be a key of self.annotation_dict
        :param fastq_filename: fastq filename (sample identifier). used to name the bedfile and the track
        :param description: either 'perturbed' or 'wildtype', or some other description of the track
                            (accepted but not read by this method)
        :param flanking_region: how far up/down stream from the gene to capture in the snapshot
        :returns: bedfile path (inside self.rnaseq_tmp/self.year_month_day)
    """
    # day specific directory inside rnaseq_tmp
    bed_directory = os.path.join(self.rnaseq_tmp, self.year_month_day)
    utils.mkdirp(bed_directory)

    # the sample is the fastq file basename; the bed file is named after it
    sample = utils.pathBaseName(fastq_filename)
    igv_bed_filepath = os.path.join(bed_directory,
                                    utils.pathBaseName(fastq_filename) + '.bed')

    def _bed_line(gene):
        annotation = self.annotation_dict[gene]
        track_name = '[%s]%s.png' % (sample, gene)
        return '%s\t%s\t%s\t%s\t%s\t%s\n' % (
            annotation['chrm'],
            # the window start is never allowed below 1
            max(annotation['coords'][0] - flanking_region, 1),
            annotation['coords'][1] + flanking_region,
            track_name,
            ".",
            annotation['strand'])

    bed_text = ''.join([_bed_line(gene) for gene in gene_list])

    with open(igv_bed_filepath, 'w') as bed_handle:
        bed_handle.write(bed_text)

    return igv_bed_filepath
def main(argv):
    """
        Command line entry point: parse the arguments, set up the standard directory
        structure, compile the database and write the filtered and/or full database as csv.

        :param argv: command line arguments, handed to parseArgs
        :raises FileNotFoundError: if the database path, query json or output directory is given but does not exist
    """
    args = parseArgs(argv)
    print('...parsing arguments')

    # the interactive flag is optional on the parsed arguments
    interactive_flag = getattr(args, 'interactive', False)

    sd = StandardData(config_file=args.config_file, interactive=interactive_flag)
    sd.standardDirectoryStructure()
    print('queryDB log can be found at: %s' % sd.log_file_path)
    sd.logger.debug('cmd line arguments are: %s' % args)

    # database: fall back to the standard location when none is given
    database_path = args.database
    if database_path is None:
        database_path = sd.database_files
    elif not os.path.exists(database_path):
        raise FileNotFoundError('DatabaseFileDoesNotExist')

    # query json is optional, but must be a file when given
    filter_json_path = args.json
    if filter_json_path is not None:
        if not os.path.isfile(filter_json_path):
            raise FileNotFoundError('QueryJsonDoesNotExist')

    # same for the output directory
    output_directory = args.output_directory
    if output_directory is not None:
        if not os.path.exists(output_directory):
            raise FileNotFoundError('OutputDirectoryDoesNotExist')

    print('...compiling database')
    database_object = DatabaseObject(database_path,
                                     filter_json_path=filter_json_path,
                                     database_files=database_path,
                                     config_file=args.config_file,
                                     interactive=interactive_flag)
    database_object.setDatabaseDataframe()

    # a filtered sheet is only written when a query json is present
    json_path = database_object.filter_json_path
    if json_path is not None:
        print('...filtering database')
        database_object.filterDatabaseDataframe()
        csv_name = utils.pathBaseName(database_object.filter_json_path) + '.csv'
        filtered_output_path = os.path.join(output_directory, csv_name)
        print('printing filtered database to: %s' % filtered_output_path)
        database_object.filtered_database_df.to_csv(filtered_output_path, index=False)

    # -pf: also write the full, unfiltered database
    if args.print_full:
        date_stamp = utils.yearMonthDay()
        full_database_output_path = os.path.join(
            output_directory, 'combined_df_{}.csv'.format(date_stamp))
        print('printing full database to: %s' % full_database_output_path)
        database_object.database_df.to_csv(full_database_output_path, index=False)
def makeIgvSnapshotDict(self, control_sample_flag=True, marker_list=('CNAG_G418', 'CNAG_NAT')):
    """
        TODO: TEST THAT .BAI FILE IS PRESENT
        TODO: RIGHT NOW SPECIFICALLY SET UP FOR 90MINUTEINDUCTION CRYPTO (because of control and marker)
        From the rows of self.sample_df, build self.igv_snapshot_dict with structure
            {sample_name: {'bam': /path/to/bam,
                           'control_sample': ...,      (only if control_sample_flag)
                           'locus_dict': ...,
                           'marker_locus_dict': ...},  (only if marker_list is non-empty)
             ...}
        A missing bam file is logged and printed but does not stop the loop.

        :param control_sample_flag: if True, add self.getControlSample(treatment_timepoint) for each sample
        :param marker_list: marker genes handed to self.createLocusDict. a falsy value skips the marker entry
    """
    # stays empty if anything below raises; replaced by the filled dict at the end
    self.igv_snapshot_dict = {}
    igv_sample_dict = {}

    for _, row in self.sample_df.iterrows():
        # extract relevant info from query row
        sample_name = utils.pathBaseName(row.fastqFileName)
        bam_file = utils.convertFastqFilename(row.fastqFileName, 'bam')
        treatment_timepoint = "%s_%s" % (str(row.treatment), str(row.timePoint))

        # NOTE: this is setup specifically for KN99 -- needs to be generalized
        # some run numbers are written with a leading zero in the directory name
        run_number = row.runNumber
        try:
            run_number = self._run_numbers_with_zeros[run_number]
        except KeyError:
            # no special spelling for this run; use the number as is
            pass

        bamfile_fullpath = os.path.join(self.align_count_results,
                                        'run_%s_samples/align' % run_number,
                                        bam_file)
        if not os.path.exists(bamfile_fullpath):
            error_msg = 'bamfile does not exist at %s' % bamfile_fullpath
            self.logger.critical(error_msg)
            print(error_msg)

        # split on period if this is a double perturbation, eg ['CNAG_00000'] or
        # ['CNAG_05420', 'CNAG_01438']. drop any _over suffix, and replace CNAG with CKF44:
        # CNAG was the h99 designation and continues to be used in metadata, CKF44 is the
        # kn99 prefix in ncbi and is the prefix used in the genome, annotation, etc
        genotype_list = [
            locus.replace('_over', '').replace('CNAG', 'CKF44')
            for locus in row.genotype.split('.')
        ]

        sample_entry = igv_sample_dict.setdefault(sample_name, {})
        sample_entry['bam'] = bamfile_fullpath
        if control_sample_flag:
            sample_entry['control_sample'] = self.getControlSample(treatment_timepoint)
        # createLocusDict is called, not subscripted (a list cannot be used as a subscript key)
        sample_entry['locus_dict'] = self.createLocusDict(genotype_list)
        # marker_dict --> this will be pretty standard, maybe put in attributes somewhere eventually
        if marker_list:
            sample_entry['marker_locus_dict'] = self.createLocusDict(list(marker_list))

    # publish the result; previously the filled dict was discarded
    self.igv_snapshot_dict = igv_sample_dict
def test_pathBaseName(self):
    # directory and both extensions are expected to be stripped from this path
    expected_name = 'sequence_path'
    observed_name = utils.pathBaseName('/path/to/dir/sequence_path.fastq.gz')
    self.assertEqual(expected_name, observed_name)
def subdirectoryReport(self, subdirectory_name, subdir_filepath_list, key_column_only_report=False):
    """
        Append a report on one subdirectory's sheets to self.accuracy_check_output_file.
        For every sheet: the filename is checked with self.checkFileName, and the column
        heading / row inconsistencies returned by self.checkColumns are listed. A sheet
        with nothing to list gets no section at all.

        :param subdirectory_name: key into self.specification_dict; also used in the report text
        :param subdir_filepath_list: paths of the sheets to check
        :param key_column_only_report: if True, a row is only reported when its column heading(s)
                                       overlap the key columns of the sheet's directory
    """
    print('Checking %s column name formatting and entries' % subdirectory_name)
    specs_website = 'https://github.com/BrentLab/database_files/wiki'

    # endings of the two section headers, used below to detect empty sections
    heading_header_end = 'this should be fixed when found).\n'
    row_header_end = 'only key columns are checked:\n'

    with open(self.accuracy_check_output_file, 'a') as report:
        report.write('Checking %s for adherence to specifications found at: %s\n' %
                     (subdirectory_name, specs_website))
        report.write('Last update (likely git pull) to directory: %s\n\n' %
                     self.last_git_change)

        for sheet_path in subdir_filepath_list:
            self.logger.debug('Checking %s:\n' % sheet_path)

            # dictionaries of inconsistencies in column names and in rows
            col_inconsistencies_dict, row_inconsistencies_dict = self.checkColumns(
                self.specification_dict[subdirectory_name], sheet_path)

            # filename format is reported straight away
            filename_ok = self.checkFileName(subdirectory_name, sheet_path)
            if not filename_ok:
                report.write(
                    '\tThe filename %s does not adhere to the specifications. Please correct.\n\n'
                    % os.path.basename(sheet_path))

            # section 1: column headings
            section_lines = [
                ('In sheet %s:\n\tThe items below are column headings in a given sheet that do not match '
                 'the specifications (key and non-key, this should be fixed when found).\n') % sheet_path
            ]
            section_lines.extend([
                '\tThe specification is: %s, the sheet column is: %s\n' % (spec_column, sheet_column)
                for spec_column, sheet_column in col_inconsistencies_dict.items()
            ])

            # section 2: rows
            section_lines.append(
                '\n\tThe items below are numbered by row (eg 1: inductionDelay means a problem in row 1 of inductionDelay). If shortReport, only key columns are checked:\n'
            )
            for row_index, column_heading in row_inconsistencies_dict.items():
                # key columns are looked up by the name of the directory holding the sheet
                key_columns = set(self.key_column_dict[utils.pathBaseName(
                    utils.dirPath(sheet_path))])
                # NOTE(review): set() of a plain string gives its characters -- confirm that
                # checkColumns returns a collection of headings here
                headings_in_row = set(column_heading)
                touches_key_column = not key_columns.isdisjoint(headings_in_row)
                if touches_key_column or not key_column_only_report:
                    section_lines.append(
                        '\tRow %s has an inconsistency in column %s\n' % (row_index, column_heading))

            # no row was listed: drop the row section header again
            if section_lines[-1].endswith(row_header_end):
                section_lines.pop()

            # only the heading header is left: nothing to report for this sheet
            if section_lines[-1].endswith(heading_header_end):
                continue

            section_lines.append('\n\n\n\n')
            report.write(''.join(section_lines))
def perturbedCheck(self):
    """
        Calculate, for every sample in self.query_df, the CDS coverage of the NAT and G418
        markers and -- when perturbation1 is 'deletion' -- of genotype1 (and genotype2 if present).
        Samples without a matching entry in self.bam_file_list are logged and left empty.

        :raises AttributeError: if self.bam_file_list is not set
        :raises ValueError: if genotype1 of a deletion sample does not start with CNAG
        :returns: dataframe with columns FASTQFILENAME, GENOTYPE1_COVERAGE, GENOTYPE2_COVERAGE,
                  NAT_COVERAGE, G418_COVERAGE
    """
    # fail early and clearly; without the list every row below would raise anyway
    if not hasattr(self, 'bam_file_list'):
        raise AttributeError(
            'QualityAssessmentObject does not have attribute bam file list')

    # log which run numbers are present
    self.logger.info('\nThe run numbers in the sheet are: %s' %
                     self.query_df['runNumber'].unique())

    # keep whichever of the genotype/perturbation columns the query sheet actually has
    genotype_columns = list({
        'fastqFileName', 'genotype1', 'perturbation1', 'genotype2', 'perturbation2'
    }.intersection(set(self.query_df.columns)))
    # explicit copy: columns are added below and must not be written onto a slice of query_df
    genotype_df = self.query_df[genotype_columns].copy()

    # reduce fastqFileName to only simple basename
    # (eg /path/to/some_fastq_R1_001.fastq.gz --> some_fastq_R1_001)
    genotype_df['fastqFileName'] = genotype_df['fastqFileName'].apply(
        lambda x: utils.pathBaseName(x))

    # nat and g418 number of bases from organism data
    nat_bases_in_cds = int(self.nat_cds_length)
    g418_bases_in_cds = int(self.g418_cds_length)
    # feature over which to take percentage of reads (CDS in this case)
    feature = 'CDS'

    for new_column in ('genotype1_coverage', 'genotype2_coverage',
                       'overexpression_fow', 'NAT_coverage', 'G418_coverage'):
        genotype_df[new_column] = None

    for index, row in genotype_df.iterrows():
        # simple name is like this: run_673_s_4_withindex_sequence_TGAGGTT
        # (no containing directories, no extention)
        fastq_simple_name = utils.pathBaseName(row['fastqFileName'])

        # first bam file whose path contains the sample name
        matching_bam_files = [
            bam_file for bam_file in self.bam_file_list
            if fastq_simple_name in bam_file
        ]
        if not matching_bam_files:
            self.logger.info('bam file not found for %s' % fastq_simple_name)  # TODO: improve this logging
            continue
        bam_file = matching_bam_files[0]

        # marker coverages
        print('...calculating NAT coverage for %s' % fastq_simple_name)
        genotype_df.loc[index, 'NAT_coverage'] = self.calculatePercentFeatureCoverage(
            feature, 'CNAG_NAT', self.annotation_file, bam_file, nat_bases_in_cds)
        print('...calculating G418 coverage for %s' % fastq_simple_name)
        genotype_df.loc[index, 'G418_coverage'] = self.calculatePercentFeatureCoverage(
            feature, 'CNAG_G418', self.annotation_file, bam_file, g418_bases_in_cds)

        # only deletions get a genotype coverage. Currently only perturbation1 is checked;
        # both are assumed to be deletions if perturbation1 == 'deletion'.
        # .get: perturbation1 is one of the optional columns selected above
        if row.get('perturbation1') != "deletion":
            continue

        try:
            # extract genotype1 TODO: JUST CASE EVERYTHING TO UPPER EARLIER
            genotype = [
                self.extractInfoFromQuerySheet(row['fastqFileName'], 'genotype1'),
                None
            ]
        except ValueError:
            self.logger.info(
                'genotype cannot be extracted with the fastq filename in this row. Note: if there are null entries in the column fastqFileNames, this is the cause. those need to be remedied or removed in order for this to work: %s'
                % row)
            # nothing to compute for this row; never reuse the previous row's genotype
            continue

        try:
            # extract genotype2 or leave it None
            genotype[1] = self.extractInfoFromQuerySheet(row['fastqFileName'], 'genotype2')
        except KeyError:
            self.logger.debug("sample: %s does not have genotype2" % row['fastqFileName'])

        # determine which genome to use -- if CNAG, use KN99
        if not genotype[0].startswith('CNAG'):
            raise ValueError('%sNotRecognizedCryptoGenotype' % genotype[0])

        # replace CNAG with CKF44 (in past version of pipeline, KN99 genes were labelled with
        # H99 names. NCBI required change to CKF. Numbering/order is same)
        genotype[0] = genotype[0].replace('CNAG', 'CKF44')
        # genotype2 counts only if it is a real string entry (not None, 'nan' or a float nan)
        has_genotype2 = isinstance(genotype[1], str) and genotype[1] != 'nan'
        if has_genotype2 and genotype[1].startswith('CNAG'):
            genotype[1] = genotype[1].replace('CNAG', 'CKF44')

        print('...checking coverage of %s in %s' % (genotype, fastq_simple_name))
        genotype_df.loc[index, 'genotype1_coverage'] = self.calculatePercentFeatureCoverage(
            feature, genotype[0], self.annotation_file, bam_file)
        if has_genotype2:
            genotype_df.loc[index, 'genotype2_coverage'] = self.calculatePercentFeatureCoverage(
                feature, genotype[1], self.annotation_file, bam_file)

    # return genotype check
    genotype_df.columns = [column_name.upper() for column_name in genotype_df.columns]
    return genotype_df[[
        'FASTQFILENAME', 'GENOTYPE1_COVERAGE', 'GENOTYPE2_COVERAGE',
        'NAT_COVERAGE', 'G418_COVERAGE'
    ]]
def calculateExonicCoverage(self, bam_file, output_directory):
    """
        calculate coverage of exon regions: the number of exonic bases with depth > 0
        (samtools depth -aa -Q 10 over the exon bed file) divided by self.total_exon_bases.
        deposits a one-row csv (fastqFileName, EXONIC_COVERAGE) in output directory named
        utils.pathBaseName(bam_file)+'_exonic_coverage.csv'. If the exon configuration or
        the bed file is missing, the problem is logged and EXONIC_COVERAGE is left empty.

        :param bam_file: path to bam file
        :param output_directory: path to output directory
    """
    import shlex  # only needed to quote the paths for the shell pipeline below

    print('...Assessing exonic coverage for %s' % bam_file)
    exonic_coverage = None

    # get exon info from config file
    try:
        exon_region_bed_path = os.path.join(self.genome_files, 'KN99',
                                            self.exon_region_bed)
        total_exon_bases = int(self.total_exon_bases)
    except (AttributeError, KeyError):
        config_error_msg = 'No attribute %s. Check OrganismData_config.ini in %s' % (
            'exon_region_bed or total_exon_bases', 'KN99')
        self.logger.critical(config_error_msg)
        print(config_error_msg)
        self.logger.critical(
            'Cannot calculate EXONIC COVERAGE -- total_exon_bases or exon region bed file not found as attribute for organism. Check genome_files/subdirs and each OrganismData_config.ini'
        )
    else:
        if not os.path.isfile(exon_region_bed_path):
            bed_error_msg = 'Exon region bed file does not exist at: %s' % exon_region_bed_path
            self.logger.critical(bed_error_msg)
            print(bed_error_msg)
        else:
            # count exonic bases covered by at least one read.
            # grep -x matches whole lines, so only a depth of exactly 0 is dropped
            # (a plain `grep -v 0` would also drop 10, 20, 100, ...)
            exonic_bases_covered_cmd = \
                'samtools depth -aa -Q 10 -b %s %s | cut -f3 | grep -v -x 0 | wc -l' % (
                    shlex.quote(exon_region_bed_path), shlex.quote(bam_file))
            num_exonic_bases_covered = int(
                subprocess.getoutput(exonic_bases_covered_cmd))
            exonic_coverage = num_exonic_bases_covered / float(total_exon_bases)

    # one row: sample name (no path, no extension) and its coverage
    exonic_df = pd.DataFrame({
        'fastqFileName': [utils.pathBaseName(bam_file)],
        'EXONIC_COVERAGE': [exonic_coverage],
    })

    # write
    output_path = os.path.join(
        output_directory,
        utils.pathBaseName(bam_file) + '_exonic_coverage.csv')
    exonic_df.to_csv(output_path, index=False)
def parseGeneCount(self, htseq_counts_path):
    """
        NOTE: SPECIFICALLY SET UP FOR CRYPTO
        Summarize one htseq-count output file: total counts on CKF44 genes, the htseq
        metadata categories (lines not starting with CNAG/CKF44, eg __no_feature) and the
        log2cpm of the markers and perturbed genes.
        Exits the interpreter (sys.exit) if the sample is not in the query sheet.

        :param htseq_counts_path: a path to a _read_count.tsv file (htseq-counts output)
        :raises KeyError: if the file lacks __not_aligned, __alignment_not_unique or __ambiguous
        :returns: a dictionary holding every metadata category found in the file (leading __
                  dropped, upper-cased), with NOT_ALIGNED, ALIGNMENT_NOT_UNIQUE and AMBIGUOUS
                  renamed to NOT_ALIGNED_TOTAL, FEATURE_ALIGN_NOT_UNIQUE and AMBIGUOUS_FEATURE,
                  plus PROTEIN_CODING_COUNTED, NAT_LOG2CPM, G418_LOG2CPM and, where they apply,
                  GENOTYPE1_LOG2CPM, GENOTYPE2_LOG2CPM, OVEREXPRESSION_FOW
    """
    sample_name = utils.pathBaseName(htseq_counts_path).replace('_read_count', '')

    try:
        genotype = [self.extractInfoFromQuerySheet(sample_name, 'genotype1'), None]
        perturbation = [self.extractInfoFromQuerySheet(sample_name, 'perturbation1'), None]
    except KeyError:
        self.logger.info('Not in query sheet: %s' % htseq_counts_path)
        sys.exit(
            'Count file passed to one of the quality assessment objects was not in the query sheet. These * should be * filtered out in the qual_assess_1 script'
        )

    try:
        # extract genotype2/perturbation2, or leave them None
        genotype[1] = self.extractInfoFromQuerySheet(sample_name, 'genotype2')
        perturbation[1] = self.extractInfoFromQuerySheet(sample_name, 'perturbation2')
    except KeyError:
        self.logger.debug(
            "%s has no genotype2 and/or perturbation2 -- may need to check script if this is expected"
            % sample_name)

    # created unconditionally: samples without genotype2 are parsed like any other
    library_metadata_dict = {}
    crypto_protein_coding_count = 0

    # TODO: error checking on keys
    with open(htseq_counts_path, 'r') as htseq_file:
        htseq_lines = htseq_file.readlines()

    # read bottom-up; setdefault keeps the first value seen for a category in that order
    for line in reversed(htseq_lines):
        if not line.strip():
            # blank line: nothing to count
            continue
        line_strip_split = line.strip().split('\t')
        if line.startswith('CKF44'):
            # second column is the gene count
            crypto_protein_coding_count += int(line_strip_split[1])
        if not line.startswith(('CNAG', 'CKF44')):
            # category of metadata count (eg __alignment_not_unique --> ALIGNMENT_NOT_UNIQUE)
            htseq_count_metadata_category = line_strip_split[0][2:].upper()
            library_metadata_dict.setdefault(htseq_count_metadata_category,
                                             int(line_strip_split[1]))

    # error check gene count
    if crypto_protein_coding_count == 0:
        self.logger.info('no lines start with CKF44 -- check organism: %s' % htseq_counts_path)
        print('No lines starting with CKF44 have gene counts')

    # rename some key/value pairs
    library_metadata_dict['NOT_ALIGNED_TOTAL'] = library_metadata_dict.pop('NOT_ALIGNED')
    library_metadata_dict['FEATURE_ALIGN_NOT_UNIQUE'] = library_metadata_dict.pop(
        'ALIGNMENT_NOT_UNIQUE')
    library_metadata_dict['AMBIGUOUS_FEATURE'] = library_metadata_dict.pop('AMBIGUOUS')
    # add PROTEIN_CODING_COUNTED
    library_metadata_dict['PROTEIN_CODING_COUNTED'] = crypto_protein_coding_count

    # add log2cpm data -- note, this will look in the run_####_samples directory of subdir count
    log2cpm_path = os.path.join(
        utils.dirPath(utils.dirPath(htseq_counts_path)),
        '%s_log2_cpm.csv' % self.organism)
    if not os.path.isfile(log2cpm_path):
        msg = ' Output of log2cpm.R, which requires output of %s_raw_counts.py, ' \
              'must be in run_####_samples directory containing subdir count. ' \
              'This doesn\'t exist in %s' %(self.organism, sample_name)
        print(msg)
        self.logger.critical(msg)

    library_metadata_dict['NAT_LOG2CPM'] = self.extractLog2cpm(
        'CNAG_NAT', sample_name, log2cpm_path)
    library_metadata_dict['G418_LOG2CPM'] = self.extractLog2cpm(
        'CNAG_G418', sample_name, log2cpm_path)

    print("...extracting genotype log2cpm -- TESTING TESTING TESTING")
    if genotype[0] != 'CNAG_00000':
        library_metadata_dict['GENOTYPE1_LOG2CPM'] = self.extractLog2cpm(
            genotype[0].replace("CNAG", "CKF44"), sample_name, log2cpm_path)
    if genotype[1] is not None:
        library_metadata_dict['GENOTYPE2_LOG2CPM'] = self.extractLog2cpm(
            genotype[1].replace("CNAG", "CKF44"), sample_name, log2cpm_path)

    if perturbation[0] == "over":
        # placeholder value.
        # THIS NEEDS TO BE UPDATED WITH NEW MEDIAN_LOG2CPM BY WILDTYPE REPLICATE GROUPS WHEN
        # TREATMENT COLUMNS ARE STABLE AGAIN. The disabled computation was
        # self.foldOverWildtype on genotype1 (without _over, CNAG -> CKF44), with the sample's
        # treatment, temperature, atmosphere and timePoint from the query sheet
        library_metadata_dict['OVEREXPRESSION_FOW'] = 0

    return library_metadata_dict
def quantifyNonCodingRna(self, qual_assess_df):
    """
        Count reads mapping to rRNA and to tRNA/ncRNA for every KN99 sample (genotype1
        starts with CNAG) in qual_assess_df. Samples are skipped (and logged) when the
        genotype cannot be extracted or no existing bam file matches. Without a
        self.query_df no sample is processed.

        :param qual_assess_df: dataframe with a FASTQFILENAME column
        :returns: dataframe with columns FASTQFILENAME, TOTAL_rRNA, UNIQUE_rRNA, UNIQUE_tRNA_ncRNA,
                  one row per processed sample (empty, but with these columns, if there is none)
    """
    output_columns = ['FASTQFILENAME', 'TOTAL_rRNA', 'UNIQUE_rRNA', 'UNIQUE_tRNA_ncRNA']
    # one output row per sample, keyed by sample name
    ncRNA_rows = {}

    # threshold to determine strandedness (month.day.year). note that there is an email from
    # holly to yiming mentioning 10.25.2015. That is the best record we have
    strandedness_date_threshold = pd.to_datetime('10.01.2015', format='%m.%d.%Y')
    kn99_tRNA_ncRNA_annotations = os.path.join(self.genome_files, 'KN99',
                                               'ncRNA_tRNA_no_rRNA.gff')

    if hasattr(self, 'query_df'):
        for _, row in qual_assess_df.iterrows():
            try:
                # extract genotype1
                genotype = [
                    self.extractInfoFromQuerySheet(row['FASTQFILENAME'], 'genotype1'),
                    None
                ]
            except ValueError:
                self.logger.info(
                    'genotype cannot be extracted with the fastq filename in this row. Note: if there are null entries in the column fastqFileNames, this is the cause. those need to be remedied or removed in order for this to work: %s'
                    % row)
                # never fall through with the previous row's genotype
                continue

            try:
                # extract genotype2 or leave it None
                genotype[1] = self.extractInfoFromQuerySheet(row['FASTQFILENAME'], 'genotype2')
            except KeyError:
                self.logger.debug("sample: %s does not have genotype2" % row['FASTQFILENAME'])

            # test if organism is KN99. Proceed if so
            if not genotype[0].startswith('CNAG'):
                continue

            # fastq_filename without any preceeding path or file extension
            fastq_simple_name = utils.pathBaseName(row['FASTQFILENAME'])
            print('...evaluating ncRNA in %s' % fastq_simple_name)

            # first bam file whose path contains the sample name
            matching_bam_files = [
                bam_file for bam_file in self.bam_file_list
                if fastq_simple_name in bam_file
            ]
            if not matching_bam_files:
                self.logger.info('%s not in bam_file_list' % fastq_simple_name)
                continue
            bam_path = matching_bam_files[0]

            if not os.path.isfile(bam_path):
                self.logger.error('bam file not found %s' % bam_path)
                print('bam file not found: %s' % bam_path)
                # nothing can be counted without the file
                continue

            # literal match (the name is not a regex); null fastqFileName entries never match
            is_this_sample = self.query_df['fastqFileName'].str.contains(
                row['FASTQFILENAME'] + '.fastq.gz', regex=False, na=False)
            library_date = list(self.query_df[is_this_sample]['libraryDate'])[0]
            row_date_time = pd.to_datetime(library_date)
            strandedness = 'no' if row_date_time < strandedness_date_threshold else 'reverse'

            total_rRNA, unique_rRNA = self.totalrRNA(
                bam_path, 'CP022322.1:272773-283180', strandedness)
            unique_tRNA_ncRNA = self.totaltRNAncRNA(
                bam_path, kn99_tRNA_ncRNA_annotations, strandedness)

            # the first result for a sample name wins
            ncRNA_rows.setdefault(
                fastq_simple_name,
                [fastq_simple_name, total_rRNA, unique_rRNA, unique_tRNA_ncRNA])

    # explicit columns: an empty result is still a well-formed dataframe
    return pd.DataFrame(list(ncRNA_rows.values()), columns=output_columns)