def main(argv):
    """
        Entry point: collect the htseq *read_count.tsv files in
        args.count_directory, split them by organism using the query sheet,
        and write one <organism>_raw_count.csv per organism.

        :param argv: command line arguments, passed through to parseArgs
    """
    args = parseArgs(argv)

    # validate input paths up front; on failure print the specific error and
    # return (the original caught FileNotFoundError while raising
    # NotADirectoryError -- siblings under OSError -- so the handler never ran,
    # and execution would have continued into a NameError on count_dirpath)
    try:
        if not os.path.isdir(args.count_directory):
            raise NotADirectoryError('ERROR: %s does not exist.' %
                                     args.count_directory)
        if not os.path.isfile(args.query_sheet):
            # a missing file is a FileNotFoundError; also report the query
            # sheet path, not the count directory
            raise FileNotFoundError('ERROR: %s does not exist.' %
                                    args.query_sheet)
    except (NotADirectoryError, FileNotFoundError) as err:
        print(err)
        return

    count_dirpath = args.count_directory
    query_sheet_path = args.query_sheet
    query_df = utils.readInDataframe(query_sheet_path)

    # extract count files from count_dir
    count_dir_file_list = glob.glob(
        os.path.join(count_dirpath, '*read_count.tsv'))

    # TODO: SOME ERROR CHECKING ON THE FASTQFILENAME?
    # all crypto records will have genotype beginning with CNAG_; use this to
    # split the query sheet into crypto and yeast sample lists
    crypto_sample_list = list(
        query_df[query_df.genotype1.str.startswith('CNAG')].fastqFileName
    )  # TODO: after metadata organism column added, update this section
    s288c_r64_sample_list = list(
        query_df[~query_df.genotype1.str.startswith('CNAG')].fastqFileName)

    # split the count files by organism: count file <base>_read_count.tsv
    # corresponds to fastq <base>.fastq.gz in the query sheet
    count_files_by_organism_dict = {
        'KN99': [
            x for x in count_dir_file_list
            if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in
            crypto_sample_list
        ],
        'S288C_R64': [
            x for x in count_dir_file_list
            if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in
            s288c_r64_sample_list
        ]
    }

    # create and write out a count sheet for each organism with >= 1 sample
    for organism, count_file_list in count_files_by_organism_dict.items():
        if len(count_file_list) > 0:
            od = OrganismData(organism=organism,
                              config_file=args.config_file,
                              interactive=args.interactive)
            count_df = od.createCountSheet(count_file_list)
            # output lands two directory levels above the count files
            output_path = os.path.join(
                utils.dirPath(utils.dirPath(count_file_list[0])),
                '%s_raw_count.csv' % organism)
            print('writing count file to %s' % output_path)
            count_df.to_csv(output_path, index=False)
 def createOrganismDataLogger(self):
     """
         Create and attach the logger for this OrganismData instance.

         :raises: NotADirectoryError if the directory containing
             self.log_file_path does not exist
     """
     # guard clause: refuse to create a logger whose log directory is missing
     log_dirpath = utils.dirPath(self.log_file_path)
     if not os.path.isdir(log_dirpath):
         raise NotADirectoryError('LogDirectoryDoesNotExist')
     self.logger = utils.createStandardObjectChildLogger(self, __name__)
    def subdirectoryReport(self,
                           subdirectory_name,
                           subdir_filepath_list,
                           key_column_only_report=False):
        """
            Append a formatting/accuracy report for one subdirectory's sheets
            to self.accuracy_check_output_file.

            For each sheet, checks the filename format (self.checkFileName),
            and the column headings and row entries against
            self.specification_dict[subdirectory_name] (self.checkColumns),
            writing any inconsistencies found.

            :param subdirectory_name: name of the subdirectory; key into
                self.specification_dict
            :param subdir_filepath_list: list of sheet filepaths in the
                subdirectory to check
            :param key_column_only_report: if True, only report row
                inconsistencies whose column set intersects the key columns in
                self.key_column_dict (column-heading problems are always
                reported)
        """
        print('Checking %s column name formatting and entries' %
              subdirectory_name)
        specs_website = 'https://github.com/BrentLab/database_files/wiki'
        # open in append mode -- this report accumulates across subdirectories
        with open(self.accuracy_check_output_file, 'a') as subdirectory_report:
            subdirectory_report.write(
                'Checking %s for adherence to specifications found at: %s\n' %
                (subdirectory_name, specs_website))
            subdirectory_report.write(
                'Last update (likely git pull) to directory: %s\n\n' %
                self.last_git_change)

            for subdirectory_filepath in subdir_filepath_list:
                self.logger.debug('Checking %s:\n' % subdirectory_filepath)
                # extract dictionaries of inconsistencies in column names and rows
                col_inconsistencies_dict, row_inconsistencies_dict = self.checkColumns(
                    self.specification_dict[subdirectory_name],
                    subdirectory_filepath)
                # check the format of the filename
                if not self.checkFileName(subdirectory_name,
                                          subdirectory_filepath):
                    subdirectory_report.write(
                        '\tThe filename %s does not adhere to the specifications. Please correct.\n\n'
                        % os.path.basename(subdirectory_filepath))
                # check column headings; lines_to_write is buffered so the
                # whole section can be dropped below if nothing was found
                lines_to_write = ['In sheet %s:\n\tThe items below are column headings in a given sheet that do not match ' \
                                 'the specifications (key and non-key, this should be fixed when found).\n' %subdirectory_filepath]
                for spec_column, sheet_column in col_inconsistencies_dict.items(
                ):
                    lines_to_write.append(
                        '\tThe specification is: %s, the sheet column is: %s\n'
                        % (spec_column, sheet_column))
                lines_to_write.append(
                    '\n\tThe items below are numbered by row (eg 1: inductionDelay means a problem in row 1 of inductionDelay). If shortReport, only key columns are checked:\n'
                )
                for row_index, column_heading in row_inconsistencies_dict.items(
                ):
                    # if short_report flag == True, only write out if the column_heading is a key column
                    subdir_key_set = set(
                        self.key_column_dict[utils.pathBaseName(
                            utils.dirPath(subdirectory_filepath))])
                    current_column_heading_set = set(column_heading)
                    # determine if column heading is in key column set: the
                    # set difference shrinks iff they intersect
                    key_set_diff_length = len(subdir_key_set -
                                              current_column_heading_set)
                    if not key_column_only_report or (len(subdir_key_set) !=
                                                      key_set_diff_length):
                        lines_to_write.append(
                            '\tRow %s has an inconsistency in column %s\n' %
                            (row_index, column_heading))
                # if no columns found to have inconsistencies, remove the header line for this section from the lines_to_write list
                # (detected by the last buffered line still being the section header)
                if lines_to_write[-1].endswith(
                        'only key columns are checked:\n'):
                    lines_to_write.pop(-1)
                # if no column headings are found to be inconsistent, don't write at all. otherwise, write out the lines
                if not lines_to_write[-1].endswith(
                        'this should be fixed when found).\n'):
                    lines_to_write.append('\n\n\n\n')
                    subdirectory_report.write(''.join(lines_to_write))
# Beispiel #4
# 0
# (NOTE: the two lines above are an extraction artifact -- an "Example #4"
# marker and vote count from a code-sharing page -- commented out so the
# file parses as Python)
    def parseGeneCount(self, htseq_counts_path):
        """
            NOTE: SPECIFICALLY SET UP FOR CRYPTO
            count the gene counts that mapped either to genes (see COUNT_VARS at top of script for other features)
            :param htseq_counts_path: a path to a  _read_count.tsv file (htseq-counts output)
            :returns: a dictionary with the keys FEATURE_ALIGN_NOT_UNIQUE, TOO_LOW_AQUAL, AMBIGUOUS_FEATURE, NO_FEATURE, NOT_ALIGNED_TOTAL,
                plus PROTEIN_CODING_COUNTED and the *_LOG2CPM entries; returns
                None when genotype2/perturbation2 are absent from the query
                sheet (see NOTE in the except branch below)
        """
        # sample name is the count-file basename without the _read_count suffix
        sample_name = utils.pathBaseName(htseq_counts_path).replace(
            '_read_count', '')
        try:
            # [genotype1, genotype2] / [perturbation1, perturbation2]; the
            # second slot is filled in below if present in the query sheet
            genotype = [
                self.extractInfoFromQuerySheet(sample_name, 'genotype1'), None
            ]
            perturbation = [
                self.extractInfoFromQuerySheet(sample_name, 'perturbation1'),
                None
            ]
        except KeyError:
            self.logger.info('Not in query sheet: %s' % htseq_counts_path)
            sys.exit(
                'Count file passed to one of the quality assessment objects was not in the query sheet. These * should be * filtered out in the qual_assess_1 script'
            )
        try:
            # extract genotype2 or set it to None
            genotype[1] = self.extractInfoFromQuerySheet(
                sample_name, 'genotype2')
            perturbation[1] = self.extractInfoFromQuerySheet(
                sample_name, 'perturbation2')
        except KeyError:
            # NOTE(review): when genotype2/perturbation2 are missing the else
            # block is skipped and the method returns None -- confirm this is
            # intended for single-genotype samples
            self.logger.debug(
                "%s has no genotype2 and/or perturbation2 -- may need to check script if this is expected"
                % sample_name)
        else:
            library_metadata_dict = {}
            # TODO: error checking on keys
            crypto_protein_coding_count = 0
            # read the file bottom-up: htseq-count writes its summary counter
            # lines (eg __no_feature, __ambiguous) at the end of the file.
            # A for-loop over reversed(...) replaces the original manual
            # next()/StopIteration loop, which raised uncaught StopIteration
            # on an empty file; the with-block guarantees the handle closes.
            with open(htseq_counts_path, 'r') as htseq_file:
                for line in reversed(htseq_file.readlines()):
                    line_strip_split = line.strip().split('\t')
                    if line.startswith('CKF44'):
                        # second column is the gene count; add it to the
                        # protein coding total
                        crypto_protein_coding_count += int(line_strip_split[1])
                    if not (line.startswith('CNAG')
                            or line.startswith('CKF44')):
                        # extract the metadata count category, dropping the
                        # leading __ (eg __alignment_not_unique -->
                        # ALIGNMENT_NOT_UNIQUE)
                        htseq_count_metadata_category = line_strip_split[0][
                            2:].upper()
                        # enter into the metadata dict (first occurrence wins)
                        library_metadata_dict.setdefault(
                            htseq_count_metadata_category,
                            int(line_strip_split[1]))

            # error check gene count (the original raised a ValueError and
            # immediately caught it; a plain conditional is equivalent).
            # BUGFIX: log the path, not the file object repr.
            if crypto_protein_coding_count == 0:
                self.logger.info(
                    'no lines start with CKF44 -- check organism: %s' %
                    htseq_counts_path)
                print('No lines starting with CKF44 have gene counts')

            # rename some key/value pairs to the names expected downstream
            library_metadata_dict[
                'NOT_ALIGNED_TOTAL'] = library_metadata_dict.pop('NOT_ALIGNED')
            library_metadata_dict[
                'FEATURE_ALIGN_NOT_UNIQUE'] = library_metadata_dict.pop(
                    'ALIGNMENT_NOT_UNIQUE')
            library_metadata_dict[
                'AMBIGUOUS_FEATURE'] = library_metadata_dict.pop('AMBIGUOUS')

            # add PROTEIN_CODING_COUNTED
            library_metadata_dict[
                'PROTEIN_CODING_COUNTED'] = crypto_protein_coding_count
            # add log2cpm data -- note, this will look in the run_####_samples directory of subdir count
            log2cpm_path = os.path.join(
                utils.dirPath(utils.dirPath(htseq_counts_path)),
                '%s_log2_cpm.csv' % self.organism)
            # warn (but continue, as the original did) if the log2cpm file is
            # missing; the raise/except pair in the original was a no-op
            if not os.path.isfile(log2cpm_path):
                msg = ' Output of log2cpm.R, which requires output of %s_raw_counts.py, ' \
                      'must be in run_####_samples directory containing subdir count. ' \
                      'This doesn\'t exist in %s' %(self.organism, sample_name)
                print(msg)
                self.logger.critical(msg)

            library_metadata_dict['NAT_LOG2CPM'] = self.extractLog2cpm(
                'CNAG_NAT', sample_name, log2cpm_path)
            library_metadata_dict['G418_LOG2CPM'] = self.extractLog2cpm(
                'CNAG_G418', sample_name, log2cpm_path)
            print("...extracting genotype log2cpm -- TESTING TESTING TESTING")
            # presumably CNAG_00000 denotes wildtype -- no genotype1 log2cpm
            # in that case (TODO confirm)
            if genotype[0] != 'CNAG_00000':
                library_metadata_dict[
                    'GENOTYPE1_LOG2CPM'] = self.extractLog2cpm(
                        genotype[0].replace("CNAG", "CKF44"), sample_name,
                        log2cpm_path)
            if genotype[1] is not None:
                library_metadata_dict[
                    'GENOTYPE2_LOG2CPM'] = self.extractLog2cpm(
                        genotype[1].replace("CNAG", "CKF44"), sample_name,
                        log2cpm_path)
            if perturbation[0] == "over":
                sample_medium = self.extractInfoFromQuerySheet(
                    sample_name, 'treatment')
                sample_temperature = self.extractInfoFromQuerySheet(
                    sample_name, 'temperature')
                sample_atmosphere = self.extractInfoFromQuerySheet(
                    sample_name, 'atmosphere')
                sample_timepoint = self.extractInfoFromQuerySheet(
                    sample_name, 'timePoint')
                # BUGFIX: genotype is a list -- the original called .replace()
                # on the list itself, which always raised AttributeError here
                perturbed_gene = genotype[0].replace('_over', '').replace(
                    'CNAG', 'CKF44')
                # THIS NEEDS TO BE UPDATED WITH NEW MEDIAN_LOG2CPM BY WILDTYPE REPLICATE GROUPS WHEN TREATMENT COLUMNS ARE STABLE AGAIN
                library_metadata_dict[
                    'OVEREXPRESSION_FOW'] = 0  #self.foldOverWildtype(perturbed_gene, sample_name, log2cpm_path, [sample_medium, sample_temperature, sample_atmosphere], sample_timepoint)

            return library_metadata_dict