Code Example #1
    def setConcatDatabaseDict(self):
        """
            Create a concatenated dataframe from all files in each database subdirectory.
            Resulting structure: {subdirectory: concatenated_table_of_all_files_in_database/subdirectory/*}
        """
        # create dataframe from the first file in file_list for each key (subdirectory) of database_dict
        for subdirectory, file_list in self.database_dict.items():
            self.concat_database_dict[subdirectory] = utils.readInDataframe(
                file_list[0])
            # strip whitespace from the column headers
            column_list = self.concat_database_dict[subdirectory].columns
            column_list = [
                column_header.strip() for column_header in column_list
            ]
            self.concat_database_dict[subdirectory].columns = column_list
            # keep appending (rbind) dataframes to the bottom
            for file in file_list[1:]:
                # read in the next file in the list as next_sheet
                next_sheet = utils.readInDataframe(file)
                self.logger.debug('columns of %s are %s' %
                                  (file, next_sheet.columns))
                # DataFrame.append is removed in pandas >= 2.0; pd.concat is the supported equivalent
                # (assumes pandas is imported as pd at module level)
                self.concat_database_dict[subdirectory] = pd.concat(
                    [self.concat_database_dict[subdirectory], next_sheet])
            # reset index so it is sequential
            self.concat_database_dict[subdirectory].reset_index(inplace=True,
                                                                drop=True)
Code Example #2
    def __init__(self, **kwargs):
        # additional attributes to add to the _attributes in StandardData
        # TODO: possibly change inheritance to a subclass of OrganismData that sets up a class for ANY scheduler manipulation (ie align_counts, this) that takes email as an optional argument
        self._igv_attributes = ['query_sheet_path', 'igv_output_dir']
        # initialize Standard data with the extended _attributes
        super(IgvObject, self).__init__(self._igv_attributes, **kwargs)
        # initialize list to store bamfiles that need to be indexed (must be done by batch script)
        self.bam_file_to_index_list = []
        # create logger for IgvObject
        self.logger = utils.createLogger(self.log_file_path, __name__, 'DEBUG')
        try:
            self.query_sheet_path = kwargs['query_sheet_path']
            self.control_sheet_path = kwargs['control_sheet_path']
        except KeyError:
            self.logger.debug('query sheet path and/or control sheet path not passed in constructor')
        else:
            self.sample_df = utils.readInDataframe(self.query_sheet_path)
            self.control_sample_df = self.createControlSampleDict()

        # get gene dictionary with chromosome, gene coordinates, strand
        if self.annotation_file.endswith('gtf'):
            self.annotation_dict = annotation_tools.parseGtf(self.annotation_file)
        elif self.annotation_file.endswith('gff') or self.annotation_file.endswith('gff3'):
            self.annotation_dict = annotation_tools.parseGff3(self.annotation_file)
        else:
            sys.exit("ERROR: The gene annotation format cannot be recognized.")  # TODO: clean up preceeding blocks -- move parseGFF to OrganismData
Code Example #3
def main(argv):

    args = parseArgs(argv)

    try:
        if not os.path.isdir(args.count_directory):
            raise NotADirectoryError('ERROR: %s does not exist.' %
                                     args.count_directory)
        if not os.path.isfile(args.query_sheet):
            raise FileNotFoundError('ERROR: %s does not exist.' %
                                    args.query_sheet)
    except (NotADirectoryError, FileNotFoundError) as path_error:
        print(path_error)
    else:
        count_dirpath = args.count_directory
        query_sheet_path = args.query_sheet
        query_df = utils.readInDataframe(query_sheet_path)

    # extract count files from count_dir
    count_dir_file_list = glob.glob(
        os.path.join(count_dirpath, '*read_count.tsv'))

    # TODO: SOME ERROR CHECKING ON THE FASTQFILENAME?
    # all crypto records have a genotype beginning with CNAG_; use this to extract the lists of crypto and yeast samples from the query
    crypto_sample_list = list(
        query_df[query_df.genotype1.str.startswith('CNAG')].fastqFileName
    )  #TODO: after metadata organism column added, update this section
    s288c_r64_sample_list = list(
        query_df[~query_df.genotype1.str.startswith('CNAG')].fastqFileName)

    # split list of count files based on membership in dataframes above
    count_files_by_organism_dict = {
        'KN99': [
            x for x in count_dir_file_list
            if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in
            crypto_sample_list
        ],
        'S288C_R64': [
            x for x in count_dir_file_list
            if os.path.basename(x.replace('_read_count.tsv', '.fastq.gz')) in
            s288c_r64_sample_list
        ]
    }

    # create and write out count sheets
    for organism, count_file_list in count_files_by_organism_dict.items():
        if len(count_file_list) > 0:
            od = OrganismData(organism=organism,
                              config_file=args.config_file,
                              interactive=args.interactive)
            count_df = od.createCountSheet(count_file_list)
            output_path = os.path.join(
                utils.dirPath(utils.dirPath(count_file_list[0])),
                '%s_raw_count.csv' % organism)
            print('writing count file to %s' % output_path)
            count_df.to_csv(output_path, index=False)
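Both organism lists above depend on mapping a count file path back to the fastqFileName recorded in the query sheet. A minimal sketch of that mapping as a helper; the name countFileToFastqName is illustrative:

import os

def countFileToFastqName(count_file_path):
    """Map <sample>_read_count.tsv back to the <sample>.fastq.gz name used in the query sheet."""
    return os.path.basename(count_file_path).replace('_read_count.tsv', '.fastq.gz')

# eg: [x for x in count_dir_file_list if countFileToFastqName(x) in crypto_sample_list]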
Code Example #4
    def checkColumns(self, subdirectory_specs_dict, subdirectory_filepath):
        """
            check column heading names and entries in each row/column for adherence to the specs at:
            https://github.com/BrentLab/database_files/wiki
            :param subdirectory_specs_dict: see constructor. In the case of bioSample, you would pass db.specification_dict['bioSample']
            :param subdirectory_filepath: path to a sheet in a given subdirectory (eg a bioSample .xlsx)
            :return: colname_inconsistencies_dict, a dict in structure {specification_heading: nearest_match_to_heading, ...}
                     row_inconsistencies_dict, a dict in structure {row_index: column_with_inconsistent_entry, ...}
        """
        self.logger.info('path to sheet is %s' % subdirectory_filepath)
        # see :return: statement for structure
        colname_inconsistencies_dict = {}
        row_inconsistencies_dict = {}
        # list to store column names not found in the specs, so each one is only reported once
        skip_columns = []

        # read in subdirectory_filepath as dataframe
        subdirectory_df = utils.readInDataframe(subdirectory_filepath)
        # loop over rows in dataframe
        for index, row in subdirectory_df.iterrows():
            # convert row into a dictionary {column: value, ...}
            row_dict = dict(row)
            for column_name, column_entry in row_dict.items():
                column_entry = str(column_entry)
                try:
                    column_specs_regex = subdirectory_specs_dict[
                        'column_specs_dict'][column_name]
                except KeyError:
                    if column_name not in skip_columns:
                        if self.logger:
                            self.logger.info(
                                'Column name not found in specs: %s' %
                                column_name)
                            self.logger.info(
                                'row for offending column is: %s' % row)
                        nearest_match_list = difflib.get_close_matches(
                            column_name,
                            subdirectory_specs_dict['column_specs_dict'].keys())
                        # get_close_matches may return an empty list; fall back to the offending name itself
                        nearest_match = nearest_match_list[
                            0] if nearest_match_list else column_name
                        colname_inconsistencies_dict.setdefault(
                            nearest_match, column_name)
                        print(
                            '\tCannot check %s in %s. Either the format of the column is incorrect, or it is not in the specifications_dictionary.\n'
                            '\tThe rest of this column could not be checked. Correct the column name, and re-run.'
                            % (column_name, subdirectory_filepath))
                        skip_columns.append(column_name)
                else:
                    if not re.match(column_specs_regex, column_entry):
                        row_inconsistencies_dict.setdefault(
                            str(index), []).append(column_name)

        return colname_inconsistencies_dict, row_inconsistencies_dict
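A minimal usage sketch of checkColumns, with a hypothetical DatabaseObject-like instance db and an illustrative sheet path, showing how the two returned dicts might be reported:

# db is a hypothetical instance exposing specification_dict and the checkColumns method above
colname_issues, row_issues = db.checkColumns(
    db.specification_dict['bioSample'],
    '/path/to/database_files/bioSample/bioSample_sheet.xlsx')
# {specification_heading: nearest_match_to_heading, ...}
for spec_heading, found_heading in colname_issues.items():
    print('expected column %s, found %s' % (spec_heading, found_heading))
# {row_index: [columns_with_inconsistent_entries], ...}
for row_index, bad_columns in row_issues.items():
    print('row %s has malformed entries in: %s' % (row_index, ', '.join(bad_columns)))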
Code Example #5
def main(argv):
    """ main method
    :param argv: cmd line arguments
    """
    # parse cmd line arguments
    args = parseArgs(argv)
    print('...parsing cmd line arguments')
    query_sheet_path = args.query_sheet
    try:
        if not os.path.isfile(query_sheet_path):
            raise FileNotFoundError('DNE: %s' %query_sheet_path)
    except FileNotFoundError:
        print('The query sheet path is not valid. Check and try again')
    else:
        query_df = utils.readInDataframe(query_sheet_path)

    # store interactive flag
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False


    run_list = list(query_df.runNumber.unique())

    # create paths from /scratch to the run directory
    sd = StandardData(config_file=args.config_file, interactive=interactive_flag)
    run_path_list = [os.path.join(sd.align_count_results, 'run_'+str(x)+'_samples') for x in run_list]

    # check that paths exist TODO: CHECK CONTENTS OF SUBDIRECTORY FOR COMPLETENESS
    print('...validating paths to run directories')
    validated_run_path_list = validatePaths(sd, run_list, run_path_list)

    # write lookup file of run number paths for the sbatch cmd (see https://htcfdocs.readthedocs.io/en/latest/runningjobs/)
    lookup_filename = 'qual_assess_1_lookup_' + str(sd.year_month_day) + '_' + str(utils.hourMinuteSecond()) + '.txt'
    lookup_output_path = os.path.join(sd.job_scripts, lookup_filename)
    print('...writing lookup file for sbatch script to: %s' %lookup_output_path)
    with open(lookup_output_path, 'w') as file:
        file.write('\n'.join(map(str, validated_run_path_list)))

    # write sbatch script to run qual_assess on all runs in lookup file above
    script = writeSbatchScript(sd, args.user_name, validated_run_path_list, lookup_output_path, query_sheet_path)
    sbatch_filename = 'qual_assess_1_batch_' + str(sd.year_month_day) + '_' + str(utils.hourMinuteSecond()) + '.sbatch'
    qual_assess_job_script_path = os.path.join(sd.job_scripts, sbatch_filename)
    print('...writing sbatch script to: %s' %qual_assess_job_script_path)
    with open(qual_assess_job_script_path, "w") as f:
        f.write(script)
    cmd = 'sbatch %s' %qual_assess_job_script_path
    utils.executeSubProcess(cmd)
    print('\nCheck status by cat\'ing the sbatch file above and then cat\'ing the .out file in the sbatch script\n')
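validatePaths is called above but not shown. A minimal sketch of what such a check might do, under the assumption that it simply drops run paths that do not exist; this is illustrative, not the pipeline's actual implementation:

import os

def validatePaths(sd, run_list, run_path_list):
    """Return only the run directory paths that exist; report any that are missing."""
    # sd (StandardData) kept for signature compatibility; unused in this sketch
    validated_run_path_list = []
    for run_number, run_path in zip(run_list, run_path_list):
        if os.path.isdir(run_path):
            validated_run_path_list.append(run_path)
        else:
            print('run %s not found at %s -- skipping' % (run_number, run_path))
    return validated_run_path_list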
Code Example #6
def main(argv):
    args = parseArgs(argv)
    # parse cmd line arguments and error check paths/values
    print('...parsing cmd line input')
    try:
        if not os.path.isdir(args.align_count_dir):
            raise NotADirectoryError('OutputDirDoesNotExist')
    except NotADirectoryError:
        print(
            '%s does not lead to a valid directory. Check the path and resubmit with working -r'
            % args.align_count_dir)
    else:
        align_count_path = args.align_count_dir
        output_directory = args.align_count_dir
    try:
        if not os.path.isfile(args.query_sheet_path):
            raise FileNotFoundError('QuerySheetDoesNotExist')
    except FileNotFoundError:
        print(
            '%s does not lead to a valid file. Check and resubmit correct -qs'
            % args.query_sheet_path)
    except TypeError:
        pass
    else:
        query_sheet_path = args.query_sheet_path

    # get run number, if it exists, for output naming. If it does not exist, ask the user for a prefix to use in the output file name
    try:
        run_number = utils.getRunNumber(align_count_path)
        # create name for qual_assess
        filename_prefix = 'run_%s' % run_number
    except AttributeError:  # TODO: this will cause a problem if running via batchscript
        filename_prefix = input(
            'No run number detected in input directory name. Enter something to insert in the output directory\n'
            'name: <your_input>_quality_summary.csv: ')
    # store interactive flag
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False

    # read in query sheet # TODO: GENERALIZE THIS INTO EITHER STANDARDDATA OR UTILS. RETURN AS DICT. DO THIS AFTER ADDING ORGANISM COLUMN TO METADATA SPECS
    query_df = utils.readInDataframe(query_sheet_path)
    query_fastq_list = list(query_df.fastqFileName)

    # extract bam file names
    bam_list = utils.extractFiles(align_count_path, '.bam')
    # filter bam_list for files in the query sheet
    filtered_bam_list = [
        x for x in bam_list
        if os.path.basename(x).replace('_sorted_aligned_reads_with_annote.bam',
                                       '.fastq.gz') in query_fastq_list
    ]
    # extract novoalign logs
    novoalign_logs = utils.extractFiles(align_count_path, 'novoalign.log')
    filtered_novoalign_logs = [
        x for x in novoalign_logs if os.path.basename(x).replace(
            '_novoalign.log', '.fastq.gz') in query_fastq_list
    ]
    # extract count file list
    count_list = utils.extractFiles(align_count_path, 'read_count.tsv')
    filtered_count_list = [
        x for x in count_list if os.path.basename(x).replace(
            '_read_count.tsv', '.fastq.gz') in query_fastq_list
    ]
    # convert count_list to a list of fastq.gz names
    extracted_sample_fastq_list = [
        os.path.basename(x.replace('_read_count.tsv', '.fastq.gz'))
        for x in count_list
    ]
    if len(filtered_bam_list) != len(filtered_count_list) or len(
            filtered_bam_list) != len(filtered_novoalign_logs):
        sys.exit(
            'The number of bam_files, count_files and/or log_files does not match. Check file contents'
        )

    # all crypto records will have genotype beginning with CNAG_
    crypto_query_df = query_df[
        ~query_df.genotype1.isna() & query_df.genotype1.str.startswith('CNAG')
        & query_df.fastqFileName.isin(extracted_sample_fastq_list)]
    yeast_query_df = query_df[(
        ~(query_df.genotype1.isna()
          | query_df.fastqFileName.isin(crypto_query_df.fastqFileName))
        & query_df.fastqFileName.isin(extracted_sample_fastq_list))]

    # create list to store qual_assess dataframes
    qual_assess_df_list = []

    if len(crypto_query_df) > 0:
        # if coverage_check is passed on the cmd line, include query_df and coverage_check_flag in the constructor (this automatically sets some values)  # TODO: make this a function with arguments to pass so as not to repeat the entire constructor
        print('...compiling KN99 samples information')
        crypto_qa_object = CryptoQualAssessAuditObject(
            organism='KN99',
            bam_file_list=filtered_bam_list,
            count_file_list=filtered_count_list,
            novoalign_log_list=filtered_novoalign_logs,
            coverage_check_flag=True,
            query_df=crypto_query_df,
            config_file=args.config_file,
            interactive=interactive_flag)

        # add dataframe to list
        try:
            qual_assess_df_list.append(crypto_qa_object.qual_assess_df)
        except AttributeError:
            error_msg = 'There was an error appending the KN99 qual assess dataframe. Check the paths in the query sheet and align_counts directory'
            crypto_qa_object.logger.debug(error_msg)
            print(error_msg)

    if len(yeast_query_df) > 0:
        yeast_qa_object = S288C_R64QualAssessAuditObject(
            organism='S288C_R64',
            bam_file_list=filtered_bam_list,
            count_file_list=filtered_count_list,
            novoalign_log_list=filtered_novoalign_logs,
            query_path=args.query_sheet_path,
            config_file=args.config_file,
            interactive=interactive_flag)
        print('...compiling S288C_R64 alignment information')
        # create dataframes storing the relevant alignment and count metadata from the novoalign and htseq logs
        try:
            qual_assess_df_list.append(yeast_qa_object.qual_assess_df)
        except AttributeError:
            error_msg = 'There was an error appending the S288C_R64 qual assess dataframe. Check the paths in the query sheet and align_counts directory'
            yeast_qa_object.logger.debug(error_msg)
            print(error_msg)

    # combine dataframes, if both organisms present
    print('...creating quality_assessment sheet for %s' % filename_prefix)
    combined_qual_assess_1_df = pd.concat(qual_assess_df_list)

    # create filename
    quality_assessment_filename = "%s_sequence_quality_summary.csv" % filename_prefix
    output_path = os.path.join(output_directory, quality_assessment_filename)
    print('writing output to %s' % output_path)
    combined_qual_assess_1_df.to_csv(output_path, index=False)
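The bam, novoalign log, and count filtering blocks above repeat the same suffix-to-fastq mapping. A minimal sketch of a shared helper under that assumption; the name filterFilesByQuery is illustrative:

import os

def filterFilesByQuery(file_list, suffix, query_fastq_list):
    """Keep files whose basename, with suffix swapped for .fastq.gz, appears in the query sheet."""
    return [
        file_path for file_path in file_list
        if os.path.basename(file_path).replace(suffix, '.fastq.gz') in query_fastq_list
    ]

# eg: filtered_bam_list = filterFilesByQuery(bam_list, '_sorted_aligned_reads_with_annote.bam', query_fastq_list)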
Code Example #7
def main(argv):
    """ main method
    :param argv: cmd line arguments
    """
    # parse cmd line arguments
    args = parseArgs(argv)
    query_sheet_path = args.query_sheet
    try:
        if not os.path.isfile(query_sheet_path):
            raise FileNotFoundError
    except FileNotFoundError:
        print('Query sheet path not valid. Check and try again.')
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False

    # instantiate DatabaseObject --> mostly this will be for access to StandardData paths
    db = DatabaseObject(query_sheet_path=query_sheet_path, config_file=args.config_file, interactive=interactive_flag)
    # read in dataframe
    db.query_df = utils.readInDataframe(db.query_sheet_path)
    # add column organism which identifies either KN99 or S288C_R64 depending on whether genotype1 starts with CNAG
    # TODO: this is a point of weakness -- need to keep an eye on it here
    db.query_df['organism'] = np.where(db.query_df['genotype1'].str.startswith('CNAG'), 'KN99', 'S288C_R64')
    # cast libraryDate to datetime format
    db.query_df['libraryDate'] = pd.to_datetime(db.query_df['libraryDate'])
    # create strandedness column based on libraryDate. May change to prep protocol at some point, but for now this is best
    db.query_df['strandedness'] = np.where(db.query_df['libraryDate'] > '2015-10-25', 'reverse', 'no')
    # add leading zero to runNumber, if necessary -- taken care of in the loop below
    db.query_df['runNumber'] = db.query_df['runNumber'].astype(str)
    # new list to store run_directory values to add to the dataframe
    run_directory_list = []
    for index, row in db.query_df.iterrows():
        # some early runs have run numbers that start with zero in /lts. 0s are dropped in df b/c they are read in as ints
        # this step adds the zero and casts the row to str
        run_num_tmp = int(float(row['runNumber']))  # TODO: super ugly, needs to be fixed. Not sure why this is now getting read in as, eg, 4422.0 as of 20200923
        if run_num_tmp in db._run_numbers_with_zeros:  # TODO: probably the best way is to always read run numbers as strings -- requires changing _run_numbers_with_zeros keys to strings, and checking the rest of the codebase that uses this
            run_number = str(db._run_numbers_with_zeros[run_num_tmp])
        else:
            run_number = run_num_tmp
        # create run directory name, eg run_1234_samples
        run_directory = 'run_' + str(run_number) + '_samples'  # SEE TODO above
        # add to list
        run_directory_list.append(run_directory)
        # create fastqfilename path
        try:
            fastq_filename = os.path.basename(row['fastqFileName']).rstrip()
        except TypeError:
            sys.exit("%s <-- not a fastqfilename?" %row['fastqFileName'])
        fastq_scratch_path = os.path.join(db.scratch_sequence, run_directory, fastq_filename)
        # move fastq file to scratch if it is not already there
        if not os.path.exists(fastq_scratch_path):
            fastq_lts_path = os.path.join(db.lts_sequence, run_directory, fastq_filename)
            scratch_run_directory_path = os.path.join(db.scratch_sequence, run_directory)
            utils.mkdirp(scratch_run_directory_path)
            print('...moving %s to %s' %(fastq_lts_path, scratch_run_directory_path))
            rsync_cmd = 'rsync -aHv %s %s' %(fastq_lts_path, scratch_run_directory_path)
            utils.executeSubProcess(rsync_cmd)
        # update fastqFileName in query_df
        db.query_df.loc[index, 'fastqFileName'] = fastq_scratch_path
    # add column runDirectory from run_directory_list
    db.query_df['runDirectory'] = run_directory_list

    # use OrganismDataObject to get paths to novoalign_index and annotation files
    kn99_organism_data = OrganismData(organism='KN99')
    kn99_novoalign_index = kn99_organism_data.novoalign_index
    # this is annotations + nc, t, r RNA with nc,t,r RNA annotations overlapping with protein coding ON SAME STRAND removed. rRNA retained
    kn99_annotation_file = kn99_organism_data.annotation_file
    # this is annotations + nc, t, r RNA with nc,t,r RNA annotations overlapping protein coding removed regardless of strand. rRNA retained
    kn99_annotation_file_no_strand = kn99_organism_data.annotation_file_no_strand
    kn99_genome = kn99_organism_data.genome
    s288c_r64_organism_data = OrganismData(organism='S288C_R64')
    s288c_r64_novoalign_index = s288c_r64_organism_data.novoalign_index
    s288c_r64_annotation_file = s288c_r64_organism_data.annotation_file
    s288c_r64_genome = s288c_r64_organism_data.genome

    # filter the query down to the columns needed for the nextflow fastq file list
    nextflow_fastqfile_df = db.query_df[['runDirectory', 'fastqFileName', 'organism', 'strandedness']]
    for index, row in nextflow_fastqfile_df.iterrows():
        try:
            if not os.path.isfile(row['fastqFileName']):
                raise FileNotFoundError('fastqFileNotFoundInScratch')
        except FileNotFoundError:
            print('file %s was not successfully moved from lts to scratch' %row['fastqFileName'])
    print('\nnextflow fastq file .csv head:\n')
    print(nextflow_fastqfile_df.head())
    print('\n')
    # write out
    fastq_file_list_output_path = os.path.join(db.job_scripts,
                                               'nextflow_fastqfile_list' + '_' + args.name + '.csv')
    print('...writing out to %s' % fastq_file_list_output_path)
    nextflow_fastqfile_df.to_csv(fastq_file_list_output_path, index=False)

    # config_header goes at the top of the config -- includes date created and StandardObject instructions
    config_header = "/*\n" \
                    "* -------------------------------------------------\n" \
                    "*  Brentlab nextflow rnaseq_pipeline configuration\n" \
                    "* -------------------------------------------------\n" \
                    "* created with create_nextflow_config.py on %s\n" \
                    "* note: this is for a specific job for a specific user\n" \
                    "* and not intended as a general config file. To re-create\n" \
                    "* this job, you will need to run create_nextflow_config.py\n" \
                    "* with the same query_sheet input\n" \
                    "*/\n\n" % db.year_month_day

    # params section has all relevant path parameters to run the pipeline
    params_section = "// params necessary for the pipeline\n" \
                     "params {\n" \
                     "\tfastq_file_list = \"%s\"\n" \
                     "\tlts_sequence = \"%s\"\n" \
                     "\tscratch_sequence = \"%s\"\n" \
                     "\tlts_align_expr = \"%s\"\n" \
                     "\talign_count_results = \"%s\"\n" \
                     "\tlog_dir = \"%s\"\n" \
                     "\tKN99_novoalign_index = \"%s\"\n" \
                     "\tKN99_annotation_file = \"%s\"\n" \
                     "\tKN99_annotation_file_no_strand = \"%s\"\n" \
                     "\tKN99_genome = \"%s\"\n" \
                     "\tS288C_R64_novoalign_index = \"%s\"\n" \
                     "\tS288C_R64_annotation_file = \"%s\"\n" \
                     "\tS288C_R64_genome = \"%s\"\n" \
                     "}\n\n" % (fastq_file_list_output_path, db.lts_sequence, db.scratch_sequence,
                                db.lts_align_expr, db.align_count_results, db.log_dir, kn99_novoalign_index,
                                kn99_annotation_file, kn99_annotation_file_no_strand, kn99_genome, s288c_r64_novoalign_index, s288c_r64_annotation_file,
                                s288c_r64_genome)

    # write out the nextflow config, then write and submit the sbatch script with named/combined output/err

    nextflow_config_path = os.path.join(db.job_scripts, args.name + '_nextflow.config')
    print('...writing nextflow job config file to %s' % nextflow_config_path)
    with open(nextflow_config_path, 'w') as nextflow_config_file:
        nextflow_config_file.write(config_header)
        nextflow_config_file.write(params_section)

    sbatch_script_name = args.name + '_nextflow'
    nextflow_sbatch_path = os.path.join(db.job_scripts, sbatch_script_name + '.sbatch')
    # write sbatch script to submit nextflow job
    print('...writing sbatch script to %s' %nextflow_sbatch_path)
    with open(nextflow_sbatch_path, 'w') as nf_sbatch_file:
        nf_sbatch_file.write('#!/bin/bash\n'
                             '#SBATCH --mem=15G\n'
                             '#SBATCH -o %s/%s.out\n'
                             '#SBATCH -J %s\n\n'
                             'ml rnaseq_pipeline\n\n'
                             'nextflow -C %s run $CODEBASE/tools/align_count_pipeline.nf\n'
                             %(db.sbatch_log, sbatch_script_name, sbatch_script_name, nextflow_config_path))

    sbatch_cmd = 'sbatch %s' %nextflow_sbatch_path
    print('\nsubmitting sbatch script with cmd:\n\t%s' %sbatch_cmd)
    utils.executeSubProcess(sbatch_cmd)

    print('\nCheck progress by entering:\n\ttail %s/%s.out' %(db.sbatch_log, sbatch_script_name))
    print('\nTo run this in an interactive session, do the following:\n\t'
          'interactive\n\tnextflow -C %s run $CODEBASE/tools/align_count_pipeline.nf\n' % nextflow_config_path)
    print('If this job fails or is interrupted, you can resume it from where it failed by adding the flag -r to the nextflow command in the .sbatch file and resubmitting to sbatch')
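The TODOs in the run-number loop suggest always handling run numbers as strings. A minimal sketch of that idea, assuming db._run_numbers_with_zeros maps int run numbers to their zero-padded forms; the helper name normalizeRunNumber is illustrative:

def normalizeRunNumber(raw_run_number, run_numbers_with_zeros):
    """Return the run number as a string, restoring a leading zero when the run is known to have one."""
    run_number_int = int(float(raw_run_number))  # tolerates values read in as eg 4422.0
    return str(run_numbers_with_zeros.get(run_number_int, run_number_int))

# eg: run_directory = 'run_' + normalizeRunNumber(row['runNumber'], db._run_numbers_with_zeros) + '_samples'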