def moveFiles(file_list, dest_dir, query_len):
    """
        copy the files in file_list to the destination dir via rsync and check the
        number of files moved against the number of rows in the query
        :param file_list: list of files to be moved
        :param dest_dir: the destination of the move
        :param query_len: the number of rows in the query (used to check the file count)
    """
    count = 0
    for file in file_list:
        # throw error/exit if file isn't found in src_dir
        if not os.path.isfile(file):
            print(
                '%s cannot be found and therefore cannot be moved. '
                'Please check %s for the directory with the run number in the filename'
                % (file, COUNT_LTS))
            sys.exit(1)

        dest_full_path = os.path.join(dest_dir, os.path.basename(file))
        print('...copying {} to {}'.format(os.path.basename(file),
                                           dest_full_path))
        cmd = 'rsync -aHv {} {}'.format(file, dest_full_path)
        utils.executeSubProcess(cmd)
        count = count + 1

    if count != 3 * query_len:
        print(
            '\nThe number of files moved is {}. The number of rows in the query is {}.\n'
            'If moving count, novoalign log and bam files (the default), the number of files\n'
            'should be three times the number of rows. If it is not, check the query, and {},\n'
            'and try again'.format(count, query_len, COUNT_LTS))
    else:
        print("Your data has been copied to your output directory!")
def main(argv):
    """

    :param argv: cmd line arguments
    """
    print(
        '\nWARNING: IF YOU MOVING THE OUTPUT OF ALIGN_COUNTS TO LTS_ALIGN_EXPR, PLEASE USE SCRIPT MOVE_ALIGNMENT_COUNT_FILES.PY\n')
    print('IMPORTANT: read the print statements carefully. Details are important in this one.')
    args = parseArgs(argv)
    print('\nDo you want to copy a directory or individual file(s)? Enter \'d\' or \'f\': ')
    response = input()
    if response not in ['d', 'f']:
        print(
            '\nlast chance: only \'d\' or \'f\' are recognized.\nDo you want to copy a directory or individual file(s)? Enter \'d\' or \'f\': ')
        response = input()
        if response not in ['d', 'f']:
            sys.exit('only \'d\' or \'f\' are recognized. Try again from the beginning.')
    if response == 'd':
        source = utils.removeForwardSlash(args.source)
    elif response == 'f':
        print(
            '\nIf this is a single file, enter \'s\'. Otherwise, hit enter and the script will assume\n'
            'you wish to move the contents of a directory and take care of the forward slash formatting: ')
        response = input()
        if response == 's':
            source = args.source
        else:
            source = utils.addForwardSlash(args.source)

    if not os.path.isdir(args.destination):
        print(
            '\nThe directory you wish to copy the files to does not exist. If you wish to create it, enter \'y\'. Otherwise, the script will exit.\n')
        response = input()
        if response == 'y':
            utils.mkdirp(args.destination)
            destination = utils.addForwardSlash(args.destination)
        else:
            sys.exit('Script exiting -- correct the filepath and try again if you wish.')
    else:
        destination = utils.addForwardSlash(args.destination)

    cmd = 'rsync -aHv %s %s' % (source, destination)
    print('\nexecuting %s\n' % cmd)
    utils.executeSubProcess(cmd)
    print('\nRsync Complete!')
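# --- hypothetical sketch, not part of the original source ---
# The forward-slash handling above matters because of rsync semantics:
# 'rsync -aHv src dest' copies the src directory itself into dest, while
# 'rsync -aHv src/ dest' copies only its contents. Minimal versions of the
# assumed utils helpers:
def addForwardSlash(path):
    """ensure a trailing slash so rsync copies the directory contents"""
    return path if path.endswith('/') else path + '/'

def removeForwardSlash(path):
    """strip any trailing slash so rsync copies the directory itself"""
    return path.rstrip('/')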
def main(argv):
    """ main method
    :param argv: cmd line arguments
    """
    # parse cmd line arguments
    args = parseArgs(argv)
    print('...parsing cmd line arguments')
    query_sheet_path = args.query_sheet
    if not os.path.isfile(query_sheet_path):
        sys.exit('The query sheet path is not valid. Check and try again: %s' % query_sheet_path)
    query_df = utils.readInDataframe(query_sheet_path)

    # store interactive flag
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False


    run_list = list(query_df.runNumber.unique())

    # create paths from /scratch to the run directory
    sd = StandardData(config_file=args.config_file, interactive=interactive_flag)
    run_path_list = [os.path.join(sd.align_count_results, 'run_'+str(x)+'_samples') for x in run_list]

    # check that paths exist TODO: CHECK CONTENTS OF SUBDIRECTORY FOR COMPLETENESS
    print('...validating paths to run directories')
    validated_run_path_list = validatePaths(sd, run_list, run_path_list)

    # write lookup file of run number paths for the sbatch cmd (see https://htcfdocs.readthedocs.io/en/latest/runningjobs/)
    lookup_filename = 'qual_assess_1_lookup_' + str(sd.year_month_day) + '_' + str(utils.hourMinuteSecond()) + '.txt'
    lookup_output_path = os.path.join(sd.job_scripts, lookup_filename)
    print('...writing lookup file for sbatch script to: %s' %lookup_output_path)
    with open(lookup_output_path, 'w') as file:
        file.write('\n'.join(map(str, validated_run_path_list)))

    # write sbatch script to run qual_assess on all runs in lookup file above
    script = writeSbatchScript(sd, args.user_name, validated_run_path_list, lookup_output_path, query_sheet_path)
    sbatch_filename = 'qual_assess_1_batch_' + str(sd.year_month_day) + '_' + utils.hourMinuteSecond() + '.sbatch'
    qual_assess_job_script_path = os.path.join(sd.job_scripts, sbatch_filename)
    print('...writing sbatch script to: %s' %qual_assess_job_script_path)
    with open(qual_assess_job_script_path, "w") as f:
        f.write(script)
    cmd = 'sbatch %s' %qual_assess_job_script_path
    utils.executeSubProcess(cmd)
    print('\nCheck status by cat\'ing the sbatch file above and then cat\'ing the .out file in the sbatch script\n')
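# --- hypothetical sketch, not part of the original source ---
# writeSbatchScript is not shown in this listing. Following the lookup-file
# pattern in the htcfdocs link above, the generated script plausibly runs one
# SLURM array task per line of the lookup file; the qual_assess invocation
# itself is omitted because its CLI is not shown here:
def writeSbatchScript(sd, user_name, run_path_list, lookup_output_path, query_sheet_path):
    """return a SLURM array script, one task per run directory in the lookup file"""
    return ('#!/bin/bash\n'
            '#SBATCH --array=1-%d\n'
            '#SBATCH -o %s/qual_assess_1_%%A_%%a.out\n\n'
            'ml rnaseq_pipeline\n'
            'run_path=$(sed -n "${SLURM_ARRAY_TASK_ID}p" %s)\n'
            '# ...invoke qual_assess on "$run_path" against %s here...\n'
            % (len(run_path_list), sd.sbatch_log, lookup_output_path, query_sheet_path))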
    def report(self, key_columns_only=False):
        """
            Generate a report on the database. The intent is a full report on the entire
            database, but any subset of the subdirectories in database_files may be reported on.
            :param key_columns_only: only check the actual filename and key columns for adherence to specs
        """
        # if only checking key columns, switch to the keyColumn-tagged output filename first
        if key_columns_only:
            self.accuracy_check_output_file = self.accuracyCheckFilename(
                'keyColumn')

        # remove an old sheet from the same day, if one exists
        if os.path.isfile(self.accuracy_check_output_file):
            remove_cmd = 'rm %s' % self.accuracy_check_output_file
            utils.executeSubProcess(remove_cmd)

        for subdirectory_name, subdirectory_path_list in self.database_dict.items():
            self.subdirectoryReport(subdirectory_name, subdirectory_path_list,
                                    key_columns_only)
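    # --- hypothetical sketch, not part of the original source ---
    # accuracyCheckFilename is not shown in this listing. From its usage above it
    # is assumed to build a dated report path, with an optional tag such as
    # 'keyColumn'; the filename pattern and output directory below are guesses:
    def accuracyCheckFilename(self, tag=''):
        """return a dated accuracy-check output path, optionally tagged"""
        filename = 'accuracy_check_%s%s.csv' % (tag + '_' if tag else '', self.year_month_day)
        return os.path.join(self.reports, filename)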
def main(argv):
    """ move count, novoalign log and bam files from /lts to an experiment directory
    :param argv: cmd line arguments
    """
    # store suffixes of the files we wish to move
    count_suffix = '_read_count.tsv'
    novoalign_log_suffix = '_novoalign.log'
    sorted_alignment_suffix = '_sorted_aligned_reads.bam'

    # parse cmd line arguments
    args = parseArgs(argv)
    try:
        if not args.query_sheet.endswith('.csv'):
            raise ValueError('NotCsv')
    except ValueError:
        sys.exit(
            '%s does not end with a .csv. Are you sure it is a .csv? -qs takes the .csv output of queryDB.py. Check and resubmit.'
            % args.query_sheet)
    if args.leading_zero_rn:
        leading_zero_list = args.leading_zero_rn
    else:
        leading_zero_list = ''

    # read in database_df (The path to the result of a query against the metadata base using queryDB)
    database_df = pd.read_csv(args.query_sheet)

    # create a directory for the experiment
    destination_directory = os.path.join(args.output_directory,
                                         args.experiment_name)
    cmd = "mkdir -p {}".format(destination_directory)
    utils.executeSubProcess(cmd)

    # get list of count files
    count_file_list = filepathList(database_df, count_suffix,
                                   leading_zero_list)
    # get list of novoalign logs
    novoalign_log_list = filepathList(database_df, novoalign_log_suffix,
                                      leading_zero_list)
    # get list of sorted alignment files
    sorted_alignment_list = filepathList(database_df, sorted_alignment_suffix,
                                         leading_zero_list)
    # concat the lists together
    file_list = count_file_list + novoalign_log_list + sorted_alignment_list

    # move the files from /lts to the output directory (generally the user's scratch)
    moveFiles(file_list, destination_directory, len(database_df))
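# --- hypothetical sketch, not part of the original source ---
# filepathList is not shown in this listing. From its usage, it plausibly builds
# one path per query row under COUNT_LTS, restoring leading zeros to early run
# numbers; the directory layout, column names and sample-name derivation below
# are assumptions:
def filepathList(database_df, suffix, leading_zero_list):
    """return a list of paths, one per query row, ending in the given suffix"""
    path_list = []
    for _, row in database_df.iterrows():
        run_number = str(row['runNumber'])
        if run_number in leading_zero_list:
            run_number = '0' + run_number
        sample_name = os.path.basename(str(row['fastqFileName'])).split('.')[0]
        path_list.append(os.path.join(COUNT_LTS, 'run_%s' % run_number,
                                      sample_name + suffix))
    return path_list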
    def setGenomeFiles(self):
        """
            set the genome_files path and download the files if they do not already exist.
            If the config_file has genome_files = https://..., the zip file will be downloaded from that path.
            TODO: error checking if the config_file https path doesn't work
        """
        # if genome_files is set in config file
        if hasattr(self, 'genome_files'):
            # if the config_file has an entry genome_files = 'https://...' (link to the hosted genome files in /lts -- it is important that there be a single source for genome_files)
            if self.genome_files.startswith('https'):
                # and the file genome_files DNE in user_rnaseq_pipeline_directory, download from path
                if not os.path.isdir(
                        os.path.join(self.user_rnaseq_pipeline_directory,
                                     'genome_files')):
                    zipped_genome_files_path = os.path.join(
                        self.user_rnaseq_pipeline_directory,
                        'genome_files.zip')
                    download_genome_files_cmd = 'wget -O %s %s' % (
                        zipped_genome_files_path, self.genome_files)
                    utils.executeSubProcess(download_genome_files_cmd)
                    unzip_genome_files_cmd = 'unzip %s -d %s && rm %s' % (
                        zipped_genome_files_path,
                        self.user_rnaseq_pipeline_directory,
                        zipped_genome_files_path)
                    utils.executeSubProcess(unzip_genome_files_cmd)

        # set path of self.genome_files to subdir of user_rnaseq_pipeline directory
        setattr(
            self, 'genome_files',
            os.path.join(self.user_rnaseq_pipeline_directory, 'genome_files'))

        # if genome_files DNE and the interactive flag is False (not in an interactive session on htcf), unzip from /lts
        if not (self.interactive or os.path.exists(self.genome_files)):
            genome_files_full_path = os.path.join(self.lts_rnaseq_data,
                                                  self.pipeline_version,
                                                  'genome_files.zip')
            cmd = 'unzip {} -d {}'.format(genome_files_full_path,
                                          self.user_rnaseq_pipeline_directory)
            utils.executeSubProcess(cmd)
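    # --- hypothetical sketch, not part of the original source ---
    # Re: the TODO in the docstring above -- one minimal integrity check on the
    # downloaded archive before unzipping (the method name is an assumption):
    def _verifyGenomeFilesZip(self, zipped_genome_files_path):
        """exit with a message if the downloaded zip is missing or corrupt"""
        import zipfile
        if not (os.path.isfile(zipped_genome_files_path)
                and zipfile.is_zipfile(zipped_genome_files_path)):
            sys.exit('genome_files download appears to have failed or is corrupt: %s'
                     % zipped_genome_files_path)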
def main(argv):
    """ main method
    :param argv: cmd line arguments
    """
    # parse cmd line arguments
    args = parseArgs(argv)
    query_sheet_path = args.query_sheet
    if not os.path.isfile(query_sheet_path):
        sys.exit('Query sheet path not valid. Check and try again: %s' % query_sheet_path)
    try:
        interactive_flag = args.interactive
    except AttributeError:
        interactive_flag = False

    # instantiate DatabaseObject --> mostly this will be for access to StandardData paths
    db = DatabaseObject(query_sheet_path=query_sheet_path, config_file=args.config_file, interactive=interactive_flag)
    # read in dataframe
    db.query_df = utils.readInDataframe(db.query_sheet_path)
    # add column organism which identifies either KN99 or S288C_R64 depending on whether genotype1 starts with CNAG
    # TODO: this is point of weakness -- need to keep an eye here
    db.query_df['organism'] = np.where(db.query_df['genotype1'].str.startswith('CNAG'), 'KN99', 'S288C_R64')
    # cast libraryDate to datetime format
    db.query_df['libraryDate'] = pd.to_datetime(db.query_df['libraryDate'])
    # create strandedness column based on libraryDate. May change to prep protocol at some point, but for now this is best
    db.query_df['strandedness'] = np.where(db.query_df['libraryDate'] > '2015-10-25', 'reverse', 'no')
    # add leading zero to runNumber, if necessary -- take care of in loop
    db.query_df['runNumber'] = db.query_df['runNumber'].astype(str)
    # list to collect run_directory values, later added to the dataframe
    run_directory_list = []
    for index, row in db.query_df.iterrows():
        # some early runs have run numbers that start with zero in /lts. 0s are dropped in df b/c they are read in as ints
        # this step adds the zero and casts the row to str
        run_num_tmp = int(float(row['runNumber']))  # TODO: super ugly, needs to be fixed. Not sure why this is now getting read in as 4422.0, eg as of 20200923
        if run_num_tmp in db._run_numbers_with_zeros: # TODO: Probably the best way to is to always read runnumbers as strings -- requires changing _run_num_with_zeros keys to strings, and checking the rest of the codebase that uses this
            run_number = str(db._run_numbers_with_zeros[run_num_tmp])
        else:
            run_number = run_num_tmp
        # create run directory name, eg run_1234_samples
        run_directory = 'run_' + str(run_number) + '_samples'  # SEE TODO above
        # add to list
        run_directory_list.append(run_directory)
        # create fastqfilename path
        try:
            fastq_filename = os.path.basename(row['fastqFileName']).rstrip()
        except TypeError:
            sys.exit("%s <-- not a fastqfilename?" %row['fastqFileName'])
        fastq_scratch_path = os.path.join(db.scratch_sequence, run_directory, fastq_filename)
        # move the fastq file to scratch if it is not already there
        if not os.path.exists(fastq_scratch_path):
            fastq_lts_path = os.path.join(db.lts_sequence, run_directory, fastq_filename)
            scratch_run_directory_path = os.path.join(db.scratch_sequence, run_directory)
            utils.mkdirp(scratch_run_directory_path)
            print('...moving %s to %s' %(fastq_lts_path, scratch_run_directory_path))
            rsync_cmd = 'rsync -aHv %s %s' %(fastq_lts_path, scratch_run_directory_path)
            utils.executeSubProcess(rsync_cmd)
        # update fastqFileName in query_df
        db.query_df.loc[index, 'fastqFileName'] = fastq_scratch_path
    # add column runDirectory from run_directory_list
    db.query_df['runDirectory'] = run_directory_list

    # use OrganismDataObject to get paths to novoalign_index and annotation files
    kn99_organism_data = OrganismData(organism='KN99')
    kn99_novoalign_index = kn99_organism_data.novoalign_index
    # this is annotations + nc, t, r RNA with nc,t,r RNA annotations overlapping with protein coding ON SAME STRAND removed. rRNA retained
    kn99_annotation_file = kn99_organism_data.annotation_file
    # this is annotations + nc, t, r RNA with nc,t,r RNA annotations overlapping protein coding removed regardless of strand. rRNA retained
    kn99_annotation_file_no_strand = kn99_organism_data.annotation_file_no_strand
    kn99_genome = kn99_organism_data.genome
    s288c_r64_organism_data = OrganismData(organism='S288C_R64')
    s288c_r64_novoalign_index = s288c_r64_organism_data.novoalign_index
    s288c_r64_annotation_file = s288c_r64_organism_data.annotation_file
    s288c_r64_genome = s288c_r64_organism_data.genome

    # filter
    nextflow_fastqfile_df = db.query_df[['runDirectory', 'fastqFileName', 'organism', 'strandedness']]
    for index, row in nextflow_fastqfile_df.iterrows():
        try:
            if not os.path.isfile(row['fastqFileName']):
                raise FileNotFoundError('fastqFileNotFoundInScratch')
        except FileNotFoundError:
            print('file %s was not successfully moved from lts to scratch' %row['fastqFileName'])
    print('\nnextflow fastq file .csv head:\n')
    print(nextflow_fastqfile_df.head())
    print('\n')
    # write out
    fastq_file_list_output_path = os.path.join(db.job_scripts,
                                               'nextflow_fastqfile_list' + '_' + args.name + '.csv')
    print('...writing out to %s' % fastq_file_list_output_path)
    nextflow_fastqfile_df.to_csv(fastq_file_list_output_path, index=False)

    # config_header goes at the top of the config -- includes date created and StandardObject instructions
    config_header = "/*\n" \
                    "* -------------------------------------------------\n" \
                    "*  Brentlab nextflow rnaseq_pipeline configuration\n" \
                    "* -------------------------------------------------\n" \
                    "* created with create_nextflow_config.py on %s\n" \
                    "* note: this is for a specific job for a specific user\n" \
                    "* and not intended as a general config file. To re-create\n" \
                    "* this job, you will need to run create_nextflow_config.py\n" \
                    "* with the same query_sheet input\n" \
                    "*/\n\n" % db.year_month_day

    # params section has all relevant path parameters to run the pipeline
    params_section = "// params necessary for the pipeline\n" \
                     "params {\n" \
                     "\tfastq_file_list = \"%s\"\n" \
                     "\tlts_sequence = \"%s\"\n" \
                     "\tscratch_sequence = \"%s\"\n" \
                     "\tlts_align_expr = \"%s\"\n" \
                     "\talign_count_results = \"%s\"\n" \
                     "\tlog_dir = \"%s\"\n" \
                     "\tKN99_novoalign_index = \"%s\"\n" \
                     "\tKN99_annotation_file = \"%s\"\n" \
                     "\tKN99_annotation_file_no_strand = \"%s\"\n" \
                     "\tKN99_genome = \"%s\"\n" \
                     "\tS288C_R64_novoalign_index = \"%s\"\n" \
                     "\tS288C_R64_annotation_file = \"%s\"\n" \
                     "\tS288C_R64_genome = \"%s\"\n" \
                     "}\n\n" % (fastq_file_list_output_path, db.lts_sequence, db.scratch_sequence,
                                db.lts_align_expr, db.align_count_results, db.log_dir, kn99_novoalign_index,
                                kn99_annotation_file, kn99_annotation_file_no_strand, kn99_genome, s288c_r64_novoalign_index, s288c_r64_annotation_file,
                                s288c_r64_genome)

    # write out the nextflow config, then write and submit the sbatch script with named/combined output/err

    nextflow_config_path = os.path.join(db.job_scripts, args.name + '_nextflow.config')
    print('...writing nextflow job config file to %s' % nextflow_config_path)
    with open(nextflow_config_path, 'w') as nextflow_config_file:
        nextflow_config_file.write(config_header)
        nextflow_config_file.write(params_section)

    sbatch_script_name = args.name + '_nextflow'
    nextflow_sbatch_path = os.path.join(db.job_scripts, sbatch_script_name + '.sbatch')
    # write sbatch script to submit nextflow job
    print('...writing sbatch script to %s' %nextflow_sbatch_path)
    with open(nextflow_sbatch_path, 'w') as nf_sbatch_file:
        nf_sbatch_file.write('#!/bin/bash\n'
                             '#SBATCH --mem=15G\n'
                             '#SBATCH -o %s/%s.out\n'
                             '#SBATCH -J %s\n\n'
                             'ml rnaseq_pipeline\n\n'
                             'nextflow -C %s run $CODEBASE/tools/align_count_pipeline.nf\n'
                             %(db.sbatch_log, sbatch_script_name, sbatch_script_name, nextflow_config_path))

    sbatch_cmd = 'sbatch %s' %nextflow_sbatch_path
    print('\nsubmitting sbatch script with cmd:\n\t%s' %sbatch_cmd)
    utils.executeSubProcess(sbatch_cmd)

    print('\nCheck progress by entering:\n\ttail %s/%s.out' %(db.sbatch_log, sbatch_script_name))
    print('\nTo run this in an interactive session, do the following:\n\t'
          'interactive\n\tnextflow -C %s run $CODEBASE/tools/align_count_pipeline.nf\n' % nextflow_config_path)
    print('If this job fails or is interrupted, you can resume it from where it failed by adding the flag -r to the nextflow command in the .sbatch file and resubmitting to sbatch')
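# --- hypothetical sketch, not part of the original source ---
# two more small utils helpers this listing assumes; minimal versions (the real
# helpers may do more, e.g. dtype handling or zero-padding):
import datetime
import pandas as pd

def readInDataframe(path):
    """thin wrapper around pd.read_csv"""
    return pd.read_csv(path)

def hourMinuteSecond():
    """timestamp string used to de-duplicate job script filenames"""
    return datetime.datetime.now().strftime('%H%M%S')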
    def standardDirectoryStructure(self):
        """
            checks for and creates if necessary the expected directory structure in /scratch/mblab/$USER/rnaseq_pipeline
        """
        # offer method to set user_scratch in config file
        try:
            if not os.path.isdir(self.user_scratch):
                raise NotADirectoryError('UserScratchDirectoryNotPresent')
        except AttributeError:
            # set attribute user_scratch (this is where rnaseq_pipeline and all subordinate folders/files will be)
            user_scratch = os.path.join(self.mblab_scratch, self._user)
            setattr(self, 'user_scratch', user_scratch)
        except NotADirectoryError:
            utils.mkdirp(self.user_scratch)

        # if it does not already exist, create user_rnaseq_pipeline in user_scratch and set attribute
        setattr(self, 'user_rnaseq_pipeline_directory',
                '{}/rnaseq_pipeline'.format(self.user_scratch))
        utils.mkdirp(self.user_rnaseq_pipeline_directory)

        # create necessary subdirectories in rnaseq_pipeline
        process_directories = [
            'reports', 'align_count_results', 'query', 'sbatch_log',
            'log/%s' % self.year_month_day, 'job_scripts', 'rnaseq_tmp',
            'experiments', 'scratch_sequence'
        ]  # TODO: MAKE SBATCH_LOG LIKE LOG WITH YEAR_MONTH_DAY SUBDIR
        for directory in process_directories:
            # store path
            path = os.path.join(self.user_rnaseq_pipeline_directory, directory)
            # this will only create the path if it dne
            utils.mkdirp(path)
            # set attr to directory (the names in process_directories) unless log, which is treated specially
            if directory == 'log/%s' % self.year_month_day:
                # distinguish the log directory ($USER/rnaseq_pipeline/log)
                self.log_dir = os.path.join(
                    self.user_rnaseq_pipeline_directory,
                    'log/%s' % self.year_month_day)
                utils.mkdirp(self.log_dir)
                # from the daily log file ($USER/rnaseq_pipeline/log/<year-month-day>)
                self.log_file_path = os.path.join(
                    self.log_dir, '%s.log' % self.year_month_day)
                self.createStandardDataLogger()
            else:
                setattr(self, directory, path)

        try:
            database_files_path = os.path.join(
                self.user_rnaseq_pipeline_directory, 'database_files')
            if not os.path.isdir(database_files_path):
                raise NotADirectoryError('DatabaseFilesNotFound: %s' %
                                         database_files_path)
        except NotADirectoryError:
            cmd = 'git clone https://github.com/BrentLab/database_files.git %s' % database_files_path
            utils.executeSubProcess(cmd)
        finally:
            setattr(self, 'database_files', database_files_path)

        if self.interactive:
            print(
                'Remember you will not be able to access lts_align_expr or lts_sequence in an interactive session on htcf'
            )
        else:
            # check for directories to be soft linked from /lts/mblab/Crypto/rnaseq_pipeline (self.lts_rnaseq_data)
            lts_dirs_to_softlink = ['lts_align_expr', 'lts_sequence']
            try:
                utils.softLinkAndSetAttr(self, lts_dirs_to_softlink,
                                         self.lts_rnaseq_data,
                                         self.user_rnaseq_pipeline_directory)
            except FileNotFoundError:
                print(
                    'WARNING: The sources of %s do not exist and are not accessible. In the future, it is better to include the flag\n'
                    'interactive=True in the constructor of a StandardData object when you are in an interactive session.'
                    % lts_dirs_to_softlink)
                setattr(
                    self, 'lts_align_expr',
                    os.path.join(self.user_rnaseq_pipeline_directory,
                                 'lts_align_expr'))
                setattr(
                    self, 'lts_sequence',
                    os.path.join(self.user_rnaseq_pipeline_directory,
                                 'lts_sequence'))
            # TODO: priority figure out how to do this without pulling from /lts. put link to genome_files.zip in config maybe

        # unzip genome files from /lts/mblab/Crypto/rnaseq_data/1.0/genome_files to self.user_rnaseq_pipeline_directory
        self.setGenomeFiles()
        # check that all files present in the OrganismDataConfig.ini file in the subdirectories of genome_files exist
        try:
            self.checkGenomeFiles()
        except (NotADirectoryError, FileNotFoundError):
            print(
                'Genome Files are incomplete. Delete genome_files completely and re-run StandardDataObject or child '
                'to re-download genome_files.\nNote: this cannot be done from an interactive session on HTCF.'
            )
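    # --- hypothetical sketch, not part of the original source ---
    # checkGenomeFiles (called above) is not shown in this listing. It is assumed
    # to walk the organism subdirectories of genome_files and confirm the files
    # named in each OrganismDataConfig.ini exist; the ini parsing is elided:
    def checkGenomeFiles(self):
        """raise if genome_files or any OrganismDataConfig.ini is missing"""
        if not os.path.isdir(self.genome_files):
            raise NotADirectoryError('GenomeFilesDirectoryNotFound')
        for organism_dir in os.listdir(self.genome_files):
            organism_path = os.path.join(self.genome_files, organism_dir)
            if not os.path.isdir(organism_path):
                continue
            config_path = os.path.join(organism_path, 'OrganismDataConfig.ini')
            if not os.path.isfile(config_path):
                raise FileNotFoundError('OrganismConfigNotFound: %s' % config_path)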
def main(argv):
    """ align and count reads for a run of fastq files
    :param argv: cmd line arguments
    """
    # parse command line input and store as more descriptive variables
    print('...parsing input')
    args = parse_args(argv)
    if not os.path.isdir(args.fastq_path):
        sys.exit(
            'The path %s to the raw fastq_files does not exist. Correct and re-submit.\n'
            'Remember this directory cannot be in long term storage' % args.fastq_path)
    # in event a run_####_samples is not passed, ask user for a replacement for run_number
    try:
        run_number = utils.getRunNumber(args.fastq_path)
    except AttributeError:
        run_number = input(
            'No run number found. Enter a number, word or phrase to be appended to run_ that will be used to create a\n'
            'subdirectory in output: ')
    print('...creating OrganismDataObject')
    od = OrganismData(organism=args.organism,
                      fastq_path=args.fastq_path,
                      strandness=args.strandness,
                      email=args.user_email,
                      run_number=run_number)
    # check directory structure and set organism data (see OrganismData.setOrganismData())
    od.setOrganismData()
    # create logger for this script if od logger is set
    if os.path.isfile(od.log_file_path):
        logger = utils.createLogger(od.log_file_path, 'align_count.py', 'INFO')
    else:
        logger = utils.createStdOutLogger(name='align_count_logger')

    # add attribute output_dir
    od.output_dir = os.path.join(args.output_directory,
                                 'run_{}'.format(od.run_number))
    # store align_only flag from cmd line
    align_only = args.align_only

    print('...extracting list of fastq files to process')
    fastq_list_file = '%s/run_%s_fastq_list.txt' % (od.job_scripts,
                                                    od.run_number)
    logger.info('The fastq list file path is %s' % fastq_list_file)
    print('The fastq list file path is %s' % fastq_list_file)
    # extract all files with the extensions in the list from od.fastq_path
    fastq_file_list = utils.getFileListFromDirectory(
        od.fastq_path, ["fastq.gz", "fastq", "fq.gz", "fq"])
    # store length of list
    num_fastqs = len(fastq_file_list)
    # write list to file
    with open(fastq_list_file, 'w') as file:
        for fastq_basename in fastq_file_list:
            file.write('%s\n' % fastq_basename)
    if not os.path.isfile(fastq_list_file):
        sys.exit("list of fastq files at %s does not exist" % fastq_list_file)
    else:
        print('list of fastq files may be found at %s' % fastq_list_file)

    print('...writing sbatch job_script')
    # create path for sbatch job_script
    sbatch_job_script_path = '%s/run_%s_mblab_rnaseq.sbatch' % (od.job_scripts,
                                                                od.run_number)
    logger.info('sbatch job script path is %s' % sbatch_job_script_path)
    # create a slurm submission script and write to ./job_scripts
    SbatchWriter.writeAlignCountJobScript(sbatch_job_script_path,
                                          od.output_dir, fastq_list_file,
                                          num_fastqs, od.novoalign_index,
                                          od.annotation_file, od.feature_type,
                                          od.strandness, align_only)
    if not os.path.isfile(sbatch_job_script_path):
        sys.exit('sbatch job_script does not exist at path %s' %
                 sbatch_job_script_path)
    else:
        print('sbatch script may be found at %s' % sbatch_job_script_path)

    # submit sbatch job
    print('...submitting sbatch job')
    if od.email is None:
        cmd = "sbatch %s" % sbatch_job_script_path
        utils.executeSubProcess(cmd)
    else:
        cmd = "sbatch --mail-type=END,FAIL --mail-user=%s %s" % (
            od.email, sbatch_job_script_path)
        utils.executeSubProcess(cmd)

    print('\nannotation and pipeline information recorded in {}/{}_pipeline_info'.
          format(od.output_dir, od.organism))
    pipeline_info_subdir_path = os.path.join(
        od.output_dir, "{}_pipeline_info".format(od.organism))
    utils.mkdirp(pipeline_info_subdir_path)

    # write version info from the module .lua file (see the .lua whatis statements)
    pipeline_info_txt_file_path = os.path.join(pipeline_info_subdir_path,
                                               'pipeline_info.txt')
    cmd_pipeline_info = "module whatis rnaseq_pipeline 2> {}".format(
        pipeline_info_txt_file_path)
    utils.executeSubProcess(cmd_pipeline_info)
    # include the date processed in pipeline_info_subdir_path/pipeline_into.txt
    with open(pipeline_info_txt_file_path, "a+") as file:
        file.write("\n")
        current_datetime = od.year_month_day + '_' + utils.hourMinuteSecond()
        file.write('Date processed: %s' % current_datetime)
        file.write("\n")

    # include the head of the gff/gtf, also
    cmd_annotation_info = "head {} >> {}".format(od.annotation_file,
                                                 pipeline_info_txt_file_path)
    utils.executeSubProcess(cmd_annotation_info)
    # include copy of job script
    cmd_cp_job_script_to_pipeline_info = 'rsync -aHv %s %s' % (
        sbatch_job_script_path, pipeline_info_subdir_path)
    utils.executeSubProcess(cmd_cp_job_script_to_pipeline_info)
    # include copy of list of fastq files
    cmd_cp_fastq_file_list_to_pipeline_info = 'rsync -aHv %s %s' % (
        fastq_list_file, pipeline_info_subdir_path)
    utils.executeSubProcess(cmd_cp_fastq_file_list_to_pipeline_info)
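
# --- hypothetical sketch, not part of the original source ---
# getFileListFromDirectory (used above) is assumed to collect the names of files
# under a directory that end with one of the given extensions; whether it returns
# basenames or full paths is not shown (the loop variable above suggests basenames):
import os

def getFileListFromDirectory(dir_path, extension_list):
    """return basenames of files under dir_path ending in one of extension_list"""
    file_list = []
    for _, _, files in os.walk(dir_path):
        for filename in files:
            if any(filename.endswith(extension) for extension in extension_list):
                file_list.append(filename)
    return file_list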