def standardizeDatabaseDataframe(rnaseq_metadata_df, **kwargs):
    """
        Put a sample-info dataframe into 'standard form', modifying it in place.

        Column headings are upper-cased, then every entry of the FASTQFILENAME
        column is replaced by the result of applying utils.pathBaseName to it twice.
        :param rnaseq_metadata_df: pandas dataframe of the rnaseq_metadata
        :param kwargs: arbitrary keyword arguments (accepted, not used in this function)
        :returns: the same dataframe object that was passed in, after modification
    """
    try:
        # upper-case all column headings
        upper_case_headings = rnaseq_metadata_df.columns.str.upper()
        rnaseq_metadata_df.columns = upper_case_headings
    except AttributeError:
        print('standardizeDatabaseDataframe takes a dataframe, not a filepath, as an argument')

    fastq_column = 'FASTQFILENAME'
    # rewrite the fastq entry one row at a time
    for row_label, _ in rnaseq_metadata_df.iterrows():
        stripped_once = utils.pathBaseName(rnaseq_metadata_df.loc[row_label, fastq_column])
        rnaseq_metadata_df.loc[row_label, fastq_column] = utils.pathBaseName(stripped_once)

    return rnaseq_metadata_df
    def createBedFile(self, gene_list, fastq_filename, description, flanking_region=500):
        """ TODO currently file is _read_count.bed -- get rid of the _read_count
            Write a six column, tab separated bed file with one line per gene, describing the IGV region of interest
            :param gene_list: list of genes to include in the bed file. each must be a key of self.annotation_dict
            :param fastq_filename: fastq filename (sample identifier). its basename names the bedfile and the track
            :param description: either 'perturbed' or 'wildtype', or some other description of the track (not used in the body)
            :param flanking_region: how far up/down stream from the gene to capture in the snapshot
            :returns: bedfile path (in a self.year_month_day subdirectory of self.rnaseq_tmp)
        """
        # day specific directory in rnaseq_tmp
        bedfile_dirpath = os.path.join(self.rnaseq_tmp, self.year_month_day)
        utils.mkdirp(bedfile_dirpath)

        # the sample name (fastq basename) also names the bed file
        sample = utils.pathBaseName(fastq_filename)
        igv_bed_filepath = os.path.join(bedfile_dirpath, sample + '.bed')

        bed_line_template = '%s\t%s\t%s\t%s\t%s\t%s\n'
        bed_lines = []
        for gene in gene_list:
            annotation = self.annotation_dict[gene]
            track_name = '[%s]%s.png' % (sample, gene)
            chromosome = annotation['chrm']
            # start of the window is clamped so that it never drops below 1
            window_start = max(annotation['coords'][0] - flanking_region, 1)
            window_end = annotation['coords'][1] + flanking_region
            bed_lines.append(bed_line_template % (chromosome, window_start, window_end,
                                                  track_name, ".", annotation['strand']))

        with open(igv_bed_filepath, 'w') as bed_handle:
            bed_handle.write(''.join(bed_lines))

        return igv_bed_filepath
Beispiel #3
0
def main(argv):
    """
        Compile the database and optionally write a filtered and/or the full table to csv.
        :param argv: command line arguments, handed to parseArgs
    """
    args = parseArgs(argv)
    print('...parsing arguments')
    # interactive is optional on the parsed namespace; default to False
    interactive_flag = getattr(args, 'interactive', False)

    sd = StandardData(config_file=args.config_file,
                      interactive=interactive_flag)
    sd.standardDirectoryStructure()
    print('queryDB log can be found at: %s' % sd.log_file_path)
    sd.logger.debug('cmd line arguments are: %s' % args)

    # validate cmd line paths
    database_path = args.database
    if database_path is None:
        # fall back on the database files known to StandardData
        database_path = sd.database_files
    elif not os.path.exists(database_path):
        raise FileNotFoundError('DatabaseFileDoesNotExist')

    filter_json_path = args.json
    json_is_missing = filter_json_path is not None and not os.path.isfile(filter_json_path)
    if json_is_missing:
        raise FileNotFoundError('QueryJsonDoesNotExist')

    output_directory = args.output_directory
    output_dir_is_missing = output_directory is not None and not os.path.exists(output_directory)
    if output_dir_is_missing:
        raise FileNotFoundError('OutputDirectoryDoesNotExist')

    print('...compiling database')
    database_object = DatabaseObject(database_path,
                                     filter_json_path=filter_json_path,
                                     database_files=database_path,
                                     config_file=args.config_file,
                                     interactive=interactive_flag)
    database_object.setDatabaseDataframe()

    # with a json filter, write the filtered table named after the json file
    if database_object.filter_json_path is not None:
        print('...filtering database')
        database_object.filterDatabaseDataframe()
        json_basename = utils.pathBaseName(database_object.filter_json_path)
        filtered_output_path = os.path.join(output_directory, json_basename + '.csv')
        print('printing filtered database to: %s' % filtered_output_path)
        database_object.filtered_database_df.to_csv(filtered_output_path, index=False)

    # with the print_full option, write the whole table, stamped with today's date
    if args.print_full:
        date_stamp = utils.yearMonthDay()
        full_database_output_path = os.path.join(output_directory,
                                                 'combined_df_{}.csv'.format(date_stamp))
        print('printing full database to: %s' % full_database_output_path)
        database_object.database_df.to_csv(full_database_output_path, index=False)
    def makeIgvSnapshotDict(self, control_sample_flag=True, marker_list=('CNAG_G418', 'CNAG_NAT')):
        """ TODO: TEST THAT .BAI FILE IS PRESENT
        TODO: RIGHT NOW SPECIFICALLY SET UP FOR 90MINUTEINDUCTION CRYPTO (because of control and marker)
            Reset self.igv_snapshot_dict to an empty dict and, for each row of self.sample_df, build an entry
                {sample_name: {'control_sample': ..., 'locus_dict': ..., 'marker_locus_dict': ...}}
            :param control_sample_flag: if True, add 'control_sample' (from self.getControlSample) to each entry
            :param marker_list: marker loci handed to self.createLocusDict. A falsy value (None, empty) skips the
                                'marker_locus_dict' entry. The default is a tuple so that it cannot be mutated between calls
        """
        setattr(self, 'igv_snapshot_dict', {})
        igv_sample_dict = {}

        for _, row in self.sample_df.iterrows():
            # extract relevant info from query row
            sample_name = utils.pathBaseName(row.fastqFileName)
            bam_file = utils.convertFastqFilename(row.fastqFileName, 'bam')
            # NOTE: this is setup specifically for KN99 -- needs to be generalized
            treatment_timepoint = "%s_%s" % (str(row.treatment), str(row.timePoint))

            # use the zero padded form of the run number when one is recorded
            run_number = row.runNumber
            try:
                run_number = self._run_numbers_with_zeros[run_number]
            except KeyError:
                pass

            bamfile_fullpath = os.path.join(self.align_count_results, 'run_%s_samples/align' % run_number, bam_file)
            # a missing bam file is reported, but does not stop the loop
            if not os.path.exists(bamfile_fullpath):
                error_msg = 'bamfile does not exist at %s' % bamfile_fullpath
                self.logger.critical(error_msg)
                print(error_msg)

            # split on period if this is a double perturbation. Regardless of whether a '.' is present,
            # genotype will be cast to a list eg ['CNAG_00000'] or ['CNAG_05420', 'CNAG_01438']
            # _over is removed, and CNAG (h99 designation, still used in metadata) is replaced with CKF44
            # (kn99 prefix in ncbi, used in the genome, annotation, etc)
            genotype_list = [locus.replace('_over', '').replace('CNAG', 'CKF44')
                             for locus in row.genotype.split('.')]

            # add sample to igv_sample_dict
            sample_entry = igv_sample_dict.setdefault(sample_name, {})
            if control_sample_flag:
                sample_entry['control_sample'] = self.getControlSample(treatment_timepoint)
            # createLocusDict is called here; the previous code subscripted it with a list, which raises TypeError
            sample_entry['locus_dict'] = self.createLocusDict(genotype_list)
            # add marker_dict --> this will be pretty standard, maybe put in attributes somewhere eventually
            if marker_list:
                sample_entry['marker_locus_dict'] = self.createLocusDict(list(marker_list))
        # NOTE(review): igv_sample_dict and bamfile_fullpath are never stored on self or returned in the code
        # visible here -- confirm whether the remainder of this method is missing
Beispiel #5
0
 def test_pathBaseName(self):
     """pathBaseName of a gzipped fastq path is the file name without directories or extensions."""
     fastq_path = '/path/to/dir/sequence_path.fastq.gz'
     expected_basename = 'sequence_path'
     observed_basename = utils.pathBaseName(fastq_path)
     self.assertEqual(expected_basename, observed_basename)
    def subdirectoryReport(self,
                           subdirectory_name,
                           subdir_filepath_list,
                           key_column_only_report=False):
        """
            Append a report on the sheets of one subdirectory to self.accuracy_check_output_file.
            For each sheet the filename is checked with self.checkFileName, and the column heading and row
            inconsistencies returned by self.checkColumns are listed. A sheet with nothing to list is left
            out of the report (apart from a possible filename warning).
            :param subdirectory_name: name of the subdirectory. key of self.specification_dict
            :param subdir_filepath_list: paths of the sheets to check
            :param key_column_only_report: if True, a row inconsistency is only listed when its column heading(s)
                                           overlap the key columns recorded for the sheet's directory
        """
        print('Checking %s column name formatting and entries' % subdirectory_name)
        specs_website = 'https://github.com/BrentLab/database_files/wiki'
        with open(self.accuracy_check_output_file, 'a') as report:
            report.write('Checking %s for adherence to specifications found at: %s\n'
                         % (subdirectory_name, specs_website))
            report.write('Last update (likely git pull) to directory: %s\n\n'
                         % self.last_git_change)

            for sheet_path in subdir_filepath_list:
                self.logger.debug('Checking %s:\n' % sheet_path)
                # dictionaries of inconsistencies in column names and in rows
                column_problems, row_problems = self.checkColumns(
                    self.specification_dict[subdirectory_name], sheet_path)

                # filename warnings are written straight away
                filename_is_valid = self.checkFileName(subdirectory_name, sheet_path)
                if not filename_is_valid:
                    report.write(
                        '\tThe filename %s does not adhere to the specifications. Please correct.\n\n'
                        % os.path.basename(sheet_path))

                # section 1: column headings
                report_lines = [('In sheet %s:\n\tThe items below are column headings in a given sheet that do not match '
                                 'the specifications (key and non-key, this should be fixed when found).\n') % sheet_path]
                report_lines.extend(
                    '\tThe specification is: %s, the sheet column is: %s\n' % (spec_column, sheet_column)
                    for spec_column, sheet_column in column_problems.items())

                # section 2: rows
                report_lines.append(
                    '\n\tThe items below are numbered by row (eg 1: inductionDelay means a problem in row 1 of inductionDelay). If shortReport, only key columns are checked:\n'
                )
                for row_index, column_heading in row_problems.items():
                    # key columns are looked up by the name of the directory holding the sheet
                    sheet_directory_name = utils.pathBaseName(utils.dirPath(sheet_path))
                    key_columns = set(self.key_column_dict[sheet_directory_name])
                    heading_set = set(column_heading)
                    overlaps_key_columns = not key_columns.isdisjoint(heading_set)
                    if overlaps_key_columns or not key_column_only_report:
                        report_lines.append('\tRow %s has an inconsistency in column %s\n'
                                            % (row_index, column_heading))

                # a row section header with nothing below it is dropped
                if report_lines[-1].endswith('only key columns are checked:\n'):
                    report_lines.pop(-1)
                # if only the column heading header is left there is nothing to report for this sheet
                if not report_lines[-1].endswith('this should be fixed when found).\n'):
                    report_lines.append('\n\n\n\n')
                    report.write(''.join(report_lines))
Beispiel #7
0
    def perturbedCheck(self):
        """
           calculate CDS coverage of the marker genes (CNAG_NAT, CNAG_G418) for every sample in query_df with a
           matching bam file and, for samples whose perturbation1 is 'deletion', coverage of genotype1 and
           (if present) genotype2
           Samples without a matching entry in self.bam_file_list, or whose genotype cannot be extracted, are
           logged and skipped; their coverage columns remain None
           :returns: dataframe with columns FASTQFILENAME, GENOTYPE1_COVERAGE, GENOTYPE2_COVERAGE, NAT_COVERAGE,
                     G418_COVERAGE
           :raises ValueError: if genotype1 of a deletion sample does not start with CNAG
        """
        # warn if bam_file_list is not set
        if not hasattr(self, 'bam_file_list'):
            print(
                'QualityAssessmentObject does not have attribute bam file list'
            )

        # log which run numbers are present
        self.logger.info('\nThe run numbers in the sheet are: %s' %
                         self.query_df['runNumber'].unique())

        # keep whichever of the genotype/perturbation columns are present in query_df
        genotype_columns = list({
            'fastqFileName', 'genotype1', 'perturbation1', 'genotype2',
            'perturbation2'
        }.intersection(set(self.query_df.columns)))
        # explicit copy: columns are added below, and that must not happen on a slice of query_df
        genotype_df = self.query_df[genotype_columns].copy()
        # reduce fastqFileName to only simple basename (eg /path/to/some_fastq_R1_001.fastq.gz --> some_fastq_R1_001
        genotype_df['fastqFileName'] = genotype_df['fastqFileName'].apply(
            utils.pathBaseName)

        # extract nat and g418 num bases from organism data
        nat_bases_in_cds = int(self.nat_cds_length)
        g418_bases_in_cds = int(self.g418_cds_length)

        # set feature over which to take percentage of reads (CDS in this case)
        feature = 'CDS'

        # create the result columns
        for coverage_column in ('genotype1_coverage', 'genotype2_coverage',
                                'overexpression_fow', 'NAT_coverage',
                                'G418_coverage'):
            genotype_df[coverage_column] = None

        for index, row in genotype_df.iterrows():
            # simple name is like this: run_673_s_4_withindex_sequence_TGAGGTT (no containing directories, no extention)
            fastq_simple_name = utils.pathBaseName(row['fastqFileName'])
            # first bam file whose path contains the simple name
            matching_bam_files = [
                bam_path for bam_path in self.bam_file_list
                if fastq_simple_name in bam_path
            ]
            if not matching_bam_files:
                self.logger.info(
                    'bam file not found for %s' %
                    fastq_simple_name)  # TODO: improve this logging
                continue
            bam_file = matching_bam_files[0]

            # calculate marker coverages
            print('...calculating NAT coverage for %s' % fastq_simple_name)
            genotype_df.loc[
                index, 'NAT_coverage'] = self.calculatePercentFeatureCoverage(
                    feature, 'CNAG_NAT', self.annotation_file, bam_file,
                    nat_bases_in_cds)
            print('...calculating G418 coverage for %s' % fastq_simple_name)
            genotype_df.loc[
                index, 'G418_coverage'] = self.calculatePercentFeatureCoverage(
                    feature, 'CNAG_G418', self.annotation_file, bam_file,
                    g418_bases_in_cds)

            # if deletion, calculate coverage. assumes both are deletions if perturbation1 == 'deletion'
            if row['perturbation1'] != "deletion":
                continue

            try:
                # extract genotype1 TODO: JUST CASE EVERYTHING TO UPPER EARLIER
                genotype = [
                    self.extractInfoFromQuerySheet(row['fastqFileName'],
                                                   'genotype1'), None
                ]
            except ValueError:
                self.logger.info(
                    'genotype cannot be extracted with the fastq filename in this row. Note: if there are null entries in the column fastqFileNames, this is the cause. those need to be remedied or removed in order for this to work: %s'
                    % row)
                # without a genotype there is nothing to check. skip the row rather than
                # carry on with an undefined genotype, or the genotype of the previous row
                continue
            try:
                # extract genotype2 or leave it as None
                genotype[1] = self.extractInfoFromQuerySheet(
                    row['fastqFileName'], 'genotype2')
            except KeyError:
                self.logger.debug("sample: %s does not have genotype2" %
                                  row['fastqFileName'])

            # determine which genome to use -- if CNAG, use KN99
            if not genotype[0].startswith('CNAG'):
                raise ValueError('%sNotRecognizedCryptoGenotype' %
                                 genotype[0])
            # replace CNAG with CKF44 (in past version of pipeline, KN99 genes were labelled with H99 names. NCBI required change to CKF. Numbering/order is same -- just need to switch CNAG to CKF44)
            genotype[0] = genotype[0].replace('CNAG', 'CKF44')
            # a second genotype only counts if it is a real string (not None, 'nan' or a float NaN)
            has_second_genotype = isinstance(genotype[1], str) and genotype[1] != 'nan'
            if has_second_genotype and genotype[1].startswith('CNAG'):
                genotype[1] = genotype[1].replace('CNAG', 'CKF44')

            print('...checking coverage of %s in %s' %
                  (genotype, fastq_simple_name))
            genotype_df.loc[
                index,
                'genotype1_coverage'] = self.calculatePercentFeatureCoverage(
                    feature, genotype[0], self.annotation_file, bam_file)
            # do the same for genotype2 if it exists
            if has_second_genotype:
                genotype_df.loc[
                    index,
                    'genotype2_coverage'] = self.calculatePercentFeatureCoverage(
                        feature, genotype[1], self.annotation_file,
                        bam_file)

        # return genotype check with upper case column headings
        genotype_df.columns = [
            column_name.upper() for column_name in genotype_df.columns
        ]
        return genotype_df[[
            'FASTQFILENAME', 'GENOTYPE1_COVERAGE', 'GENOTYPE2_COVERAGE',
            'NAT_COVERAGE', 'G418_COVERAGE'
        ]]
Beispiel #8
0
    def calculateExonicCoverage(self, bam_file, output_directory):
        """
            calculate coverage of exon regions: the number of positions in self.exon_region_bed with depth > 0
            (samtools depth -aa -Q 10) divided by self.total_exon_bases.
            deposits file in output directory named utils.pathBaseName(bam_file)+'_exonic_coverage.csv' with
            columns fastqFileName and EXONIC_COVERAGE
            :param bam_file: path to bam file
            :param output_directory: path to output directory
        """
        # create df with two columns -- fastqFileName and EXONIC_COVERAGE
        exonic_df = pd.DataFrame()
        exonic_df['fastqFileName'] = [(bam_file)]
        exonic_df['EXONIC_COVERAGE'] = None

        try:
            for index, row in exonic_df.iterrows():
                bam_file = row['fastqFileName']
                print('...Assessing exonic coverage for %s' % bam_file)
                error_msg = 'No attribute %s. Check OrganismData_config.ini in %s'
                # get exon info from config file. a missing attribute is logged here; the first use of
                # the unset local below then raises NameError, which is handled by the outer except
                try:
                    exon_region_bed_path = os.path.join(
                        self.genome_files, 'KN99', self.exon_region_bed)
                    total_exon_bases = int(self.total_exon_bases)
                except (KeyError, AttributeError):
                    kn99_error_msg = error_msg % (
                        'exon_region_bed or total_exon_bases', 'KN99')
                    self.logger.critical(kn99_error_msg)
                    print(kn99_error_msg)

                # error check exon_region_bed_path
                if not os.path.isfile(exon_region_bed_path):
                    exonic_region_bed_path_error_msg = 'Exon region bed file does not exist at: %s' % exon_region_bed_path
                    self.logger.critical(exonic_region_bed_path_error_msg)
                    print(exonic_region_bed_path_error_msg)

                # extract exonic bases covered by at least one read. grep -x matches whole lines, so only a
                # depth of exactly 0 is dropped (plain 'grep -v 0' also drops 10, 20, 105, ...)
                exonic_bases_covered_cmd = 'samtools depth -aa -Q 10 -b %s %s | cut -f3 | grep -vx 0 | wc -l' % (
                    exon_region_bed_path, bam_file)
                num_exonic_bases_covered = int(
                    subprocess.getoutput(exonic_bases_covered_cmd))

                # add to the df
                exonic_df.loc[
                    index,
                    'EXONIC_COVERAGE'] = num_exonic_bases_covered / float(
                        total_exon_bases)

        except NameError:
            self.logger.critical(
                'Cannot calculate EXONIC COVERAGE -- total_exon_bases or exon region bed file not found as attribute for organism. Check genome_files/subdirs and each OrganismData_config.ini'
            )

        # the sheet is written even if the coverage could not be calculated (EXONIC_COVERAGE is then empty)
        exonic_df['fastqFileName'] = exonic_df['fastqFileName'].apply(
            utils.pathBaseName)
        output_path = os.path.join(
            output_directory,
            utils.pathBaseName(bam_file) + '_exonic_coverage.csv')
        exonic_df.to_csv(output_path, index=False)
Beispiel #9
0
    def parseGeneCount(self, htseq_counts_path):
        """
            NOTE: SPECIFICALLY SET UP FOR CRYPTO
            sum the counts of lines starting with CKF44 and collect the counts of every line that starts with
            neither CNAG nor CKF44 (the htseq-count summary lines, eg __alignment_not_unique), then add marker
            and genotype log2cpm values
            A sample without genotype2/perturbation2 in the query sheet is processed with those set to None
            :param htseq_counts_path: a path to a  _read_count.tsv file (htseq-counts output)
            :returns: a dictionary with, among others, the keys FEATURE_ALIGN_NOT_UNIQUE, AMBIGUOUS_FEATURE,
                      NOT_ALIGNED_TOTAL, PROTEIN_CODING_COUNTED, NAT_LOG2CPM, G418_LOG2CPM
            :raises KeyError: if the count file has no __not_aligned, __alignment_not_unique or __ambiguous line
        """
        sample_name = utils.pathBaseName(htseq_counts_path).replace(
            '_read_count', '')
        try:
            genotype = [
                self.extractInfoFromQuerySheet(sample_name, 'genotype1'), None
            ]
            perturbation = [
                self.extractInfoFromQuerySheet(sample_name, 'perturbation1'),
                None
            ]
        except KeyError:
            self.logger.info('Not in query sheet: %s' % htseq_counts_path)
            sys.exit(
                'Count file passed to one of the quality assessment objects was not in the query sheet. These * should be * filtered out in the qual_assess_1 script'
            )
        try:
            # extract genotype2/perturbation2 or leave them as None
            genotype[1] = self.extractInfoFromQuerySheet(
                sample_name, 'genotype2')
            perturbation[1] = self.extractInfoFromQuerySheet(
                sample_name, 'perturbation2')
        except KeyError:
            # not fatal: parsing continues with the None placeholders (previously nothing was parsed
            # and None was returned in this case)
            self.logger.debug(
                "%s has no genotype2 and/or perturbation2 -- may need to check script if this is expected"
                % sample_name)

        # TODO: error checking on keys
        with open(htseq_counts_path, 'r') as htseq_file:
            htseq_lines = htseq_file.readlines()

        library_metadata_dict = {}
        crypto_protein_coding_count = 0
        # read from the bottom of the file up. for a repeated category the entry nearest the end of the file is kept
        for line in reversed(htseq_lines):
            line_strip_split = line.strip().split('\t')
            if line.startswith('CKF44'):
                # the second column is the gene count
                crypto_protein_coding_count += int(line_strip_split[1])
            if not line.startswith(('CNAG', 'CKF44')):
                # category of metadata count, without the leading __ (eg __alignment_not_unique --> ALIGNMENT_NOT_UNIQUE)
                htseq_count_metadata_category = line_strip_split[0][2:].upper()
                library_metadata_dict.setdefault(
                    htseq_count_metadata_category, int(line_strip_split[1]))

        # error check gene count
        if crypto_protein_coding_count == 0:
            self.logger.info(
                'no lines start with CKF44 -- check organism: %s' %
                htseq_counts_path)
            print('No lines starting with CKF44 have gene counts')

        # rename some key/value pairs
        library_metadata_dict[
            'NOT_ALIGNED_TOTAL'] = library_metadata_dict.pop('NOT_ALIGNED')
        library_metadata_dict[
            'FEATURE_ALIGN_NOT_UNIQUE'] = library_metadata_dict.pop(
                'ALIGNMENT_NOT_UNIQUE')
        library_metadata_dict[
            'AMBIGUOUS_FEATURE'] = library_metadata_dict.pop('AMBIGUOUS')

        # add PROTEIN_CODING_COUNTED
        library_metadata_dict[
            'PROTEIN_CODING_COUNTED'] = crypto_protein_coding_count

        # add log2cpm data -- the sheet is expected two directory levels above the count file
        log2cpm_path = os.path.join(
            utils.dirPath(utils.dirPath(htseq_counts_path)),
            '%s_log2_cpm.csv' % self.organism)
        if not os.path.isfile(log2cpm_path):
            msg = ' Output of log2cpm.R, which requires output of %s_raw_counts.py, ' \
                  'must be in run_####_samples directory containing subdir count. ' \
                  'This doesn\'t exist in %s' %(self.organism, sample_name)
            print(msg)
            self.logger.critical(msg)

        library_metadata_dict['NAT_LOG2CPM'] = self.extractLog2cpm(
            'CNAG_NAT', sample_name, log2cpm_path)
        library_metadata_dict['G418_LOG2CPM'] = self.extractLog2cpm(
            'CNAG_G418', sample_name, log2cpm_path)
        print("...extracting genotype log2cpm -- TESTING TESTING TESTING")
        if genotype[0] != 'CNAG_00000':
            library_metadata_dict[
                'GENOTYPE1_LOG2CPM'] = self.extractLog2cpm(
                    genotype[0].replace("CNAG", "CKF44"), sample_name,
                    log2cpm_path)
        if genotype[1] is not None:
            library_metadata_dict[
                'GENOTYPE2_LOG2CPM'] = self.extractLog2cpm(
                    genotype[1].replace("CNAG", "CKF44"), sample_name,
                    log2cpm_path)
        if perturbation[0] == "over":
            # placeholder value.
            # THIS NEEDS TO BE UPDATED WITH NEW MEDIAN_LOG2CPM BY WILDTYPE REPLICATE GROUPS WHEN TREATMENT COLUMNS ARE STABLE AGAIN
            # TODO: the intended call is self.foldOverWildtype(perturbed_gene, sample_name, log2cpm_path,
            #       [treatment, temperature, atmosphere], timePoint), where perturbed_gene is genotype[0] without
            #       '_over' and with CNAG replaced by CKF44. The lookups that only fed that call were removed
            #       (one of them called .replace on the genotype list, which raises AttributeError)
            library_metadata_dict['OVEREXPRESSION_FOW'] = 0

        return library_metadata_dict
Beispiel #10
0
    def quantifyNonCodingRna(self, qual_assess_df):
        """
            count reads mapping to rRNA and to tRNA/ncRNA for each sample in qual_assess_df whose genotype1
            starts with CNAG
            Samples are skipped (and logged) if the genotype cannot be extracted, if no entry of
            self.bam_file_list contains the sample name, or if the bam file does not exist. If self has no
            query_df, or no sample could be evaluated, an empty dataframe with the same columns is returned
            :param qual_assess_df: dataframe with a FASTQFILENAME column
            :returns: dataframe with columns FASTQFILENAME, TOTAL_rRNA, UNIQUE_rRNA, UNIQUE_tRNA_ncRNA
        """
        output_columns = [
            'FASTQFILENAME', 'TOTAL_rRNA', 'UNIQUE_rRNA', 'UNIQUE_tRNA_ncRNA'
        ]
        num_reads_to_ncRNA_dict = {}
        # set threshold to determine strandedness. note that there is an email from holly to yiming mentioning 10.25.2015. That is the best record we have, if this message remains
        strandedness_date_threshold = pd.to_datetime('10.01.2015')
        kn99_tRNA_ncRNA_annotations = os.path.join(self.genome_files, 'KN99',
                                                   'ncRNA_tRNA_no_rRNA.gff')
        if hasattr(self, 'query_df'):
            for index, row in qual_assess_df.iterrows():
                try:
                    # extract genotype1
                    genotype = [
                        self.extractInfoFromQuerySheet(row['FASTQFILENAME'],
                                                       'genotype1'), None
                    ]
                except ValueError:
                    self.logger.info(
                        'genotype cannot be extracted with the fastq filename in this row. Note: if there are null entries in the column fastqFileNames, this is the cause. those need to be remedied or removed in order for this to work: %s'
                        % row)
                    # skip the row rather than carry on with an undefined genotype, or the one of the previous row
                    continue
                try:
                    # extract genotype2 or leave it as None
                    genotype[1] = self.extractInfoFromQuerySheet(
                        row['FASTQFILENAME'], 'genotype2')
                except KeyError:
                    self.logger.debug("sample: %s does not have genotype2" %
                                      row['FASTQFILENAME'])
                # test if organism is KN99. Proceed if so
                if not genotype[0].startswith('CNAG'):
                    continue

                # extract fastq_filename without any preceeding path or file extension
                fastq_simple_name = utils.pathBaseName(row['FASTQFILENAME'])
                print('...evaluating ncRNA in %s' % fastq_simple_name)
                # first entry of bam_file_list (inherited) which contains the simple name
                matching_bam_files = [
                    bam_file for bam_file in self.bam_file_list
                    if fastq_simple_name in bam_file
                ]
                if not matching_bam_files:
                    self.logger.info('%s not in bam_file_list' %
                                     fastq_simple_name)
                    continue
                bam_path = matching_bam_files[0]
                if not os.path.isfile(bam_path):
                    self.logger.error('bam file not found %s' % bam_path)
                    print('bam file not found: %s' % bam_path)
                    # nothing can be counted without the bam file
                    continue

                # literal (non regex) match, so that the '.' of the extension only matches a '.'
                is_sample_row = self.query_df['fastqFileName'].str.contains(
                    row['FASTQFILENAME'] + '.fastq.gz', regex=False)
                library_date = list(
                    self.query_df[is_sample_row]['libraryDate'])[0]
                row_date_time = pd.to_datetime(library_date)
                strandedness = 'no' if row_date_time < strandedness_date_threshold else 'reverse'
                total_rRNA, unique_rRNA = self.totalrRNA(
                    bam_path, 'CP022322.1:272773-283180', strandedness)
                unique_tRNA_ncRNA = self.totaltRNAncRNA(
                    bam_path, kn99_tRNA_ncRNA_annotations, strandedness)
                num_reads_to_ncRNA_dict.setdefault(
                    fastq_simple_name, {
                        'total_rRNA': total_rRNA,
                        'unique_rRNA': unique_rRNA,
                        'total_tRNA_ncRNA': unique_tRNA_ncRNA
                    })

        # from_dict on an empty dict yields no data columns, so renaming to four headers would raise
        # ValueError. return an empty frame with the expected headers instead
        if not num_reads_to_ncRNA_dict:
            return pd.DataFrame(columns=output_columns)

        # create dataframe from num_reads_to_ncRNA_dict. the sample name (index) becomes the first column
        ncRNA_df = pd.DataFrame.from_dict(num_reads_to_ncRNA_dict,
                                          orient='index').reset_index()
        # format column headers
        ncRNA_df.columns = output_columns

        return ncRNA_df