def test_reformatting(out_test_dir): out_test_dir = os.path.expanduser(out_test_dir) test_dir = out_test_dir + '/tmp' if (os.path.isdir(out_test_dir) ): # Warn if output directory already exists log.disable(log.NOTSET) # flip logging back on log.warning( 'Your output directory already exists. Your output directory may already contain output from Fasta-O-Matic in %(out_test_dir)s.' % locals()) log.disable(log.ERROR) # disable most log output again else: assert general.mk_out_sub_directory( out_test_dir ), 'Failed to create output directory. Check that your output directory path.' assert general.mk_out_sub_directory( test_dir ), 'Failed to create output sub-directory for Unit testing. Check that your output directory exists and can be written to.' assert (test.test_all(test_dir) ), 'Failed to reformat when all three steps were called' assert test.test_newline( test_dir ), 'Failed to reformat when only newline and header reformatting was used' assert test.test_wrapping( test_dir ), 'Failed to reformat when only wrapping and header reformatting was used' assert test.test_unique( test_dir ), 'Failed to reformat or die when only testing uniqueness validation and/or reformatting for headers' os.rmdir(test_dir)
def test_reformatting(out_test_dir): out_test_dir = os.path.expanduser(out_test_dir) test_dir = out_test_dir + '/tmp' if (os.path.isdir(out_test_dir)): # Warn if output directory already exists log.disable(log.NOTSET) # flip logging back on log.warning('Your output directory already exists. Your output directory may already contain output from Fasta-O-Matic in %(out_test_dir)s.' % locals()) log.disable(log.ERROR) # disable most log output again else: assert general.mk_out_sub_directory(out_test_dir), 'Failed to create output directory. Check that your output directory path.' assert general.mk_out_sub_directory(test_dir), 'Failed to create output sub-directory for Unit testing. Check that your output directory exists and can be written to.' assert (test.test_all(test_dir)), 'Failed to reformat when all three steps were called' assert test.test_newline(test_dir), 'Failed to reformat when only newline and header reformatting was used' assert test.test_wrapping(test_dir), 'Failed to reformat when only wrapping and header reformatting was used' assert test.test_unique(test_dir), 'Failed to reformat or die when only testing uniqueness validation and/or reformatting for headers' os.rmdir(test_dir)
def main(): ''' Run full script as opposed to individual script functions. ''' ###################################################################### ############ Get commandline arguments ############ ###################################################################### parser = argparse.ArgumentParser( description='DESCRIPTION: Summarize counts of all four DNA bases. \ Command-line options that may be omitted \ (i.e. are NOT required) are shown in \ square brackets.') parser.add_argument('-v', '--verbose', action='store_true', dest='verbose', help='Runs reporting status updates', default=True) parser.add_argument('-q', '--quiet', action='store_false', dest='verbose', help='Does not report status updates') parser.add_argument('-c', '--colorized', help='Colorizes log reports. Use only if printing \ output to screen.',action='store_true',dest='colorized') parser.add_argument('-r', '--read_list', dest='read_list', help='This is the the full path (path and filename) of \ the user provided list of read files. The file should \ be tab separated with the first read file, then the \ second read file (see example_read_list_PE.tab). If a \ sample has multiple fastq files for R1 and R2 separate \ these with commas (see example_read_list_PE_multi.tab).\ For single end reads each line should be a path \ to a fastq file. For single end reads each line should \ be a path to a fastq file (see example_read_list_SE.tab\ )', required=True) parser.add_argument('-p', '--project', dest='project', help='The project id. This will be used to name output \ (default=project).', default='project', required=False) parser.add_argument('-a', '--adapter', dest='adapter', help='The adapter fasta file. This will be used to \ clean reads',default='/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa', required=False) parser.add_argument('-s', '--single_end', action='store_true', dest='single', help='If your reads are single end use this flag. \ Without it the script assumes reads are paired end. \ Also skip the second column (the reverse fastq files) \ when making your read list', required=False, default=False) parser.add_argument('-x', '--convert_header', action='store_true', dest='convert_header', help='If the illumina headers \ do not end in /1 or /2 use this parameter to indicat \ that headers need to be converted. Check your headers \ by typing "head FASTA_FULL_PATH" and read more about \ illumina headers at \ http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.', default=False, required=False) parser.add_argument('-m', '--min_read_length', dest='min_read_length', help='The minimum read length in bp. (Default = 90).', required=False, default=90) parser.add_argument('-o', '--out', dest='out', help='Output directory (Default=$HOME)', required=False, default='~') parser.add_argument('-d', '--dna', dest='sequence', help='DNA sequence to \ summarize', default='TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT', required=False) args = parser.parse_args() if args.colorized: import Colorer if args.verbose: doc() log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG) log.info('Output is verbose. Run with -q, --quiet flag to suppress full output.') else: log.basicConfig(format='%(levelname)s: %(message)s') ###################################################################### ############ Call custom functions with arguments ########### ###################################################################### # Get list of read FASTQ files ####################################### print(args.read_list, args.single, args.min_read_length) (forwards,reverses) = trimmomatic_template.parse_file(args.read_list, args.single) ####################################### # Sanity check read FASTQ files ####################################### index = 0 for fastq in forwards: f_opened_file=general.open_file(forwards[index]) f_opened_file.close() forwards[index] = general.convert_to_full(forwards[index]) if not args.single: r_opened_file=general.open_file(reverses[index]) r_opened_file.close() reverses[index] = general.convert_to_full(reverses[index]) index += 1 ####################################### # Make output directory ####################################### (out_path,out_basename,out_ext)=general.parse_filename(args.out) out_dir=out_path + '/' + out_basename general.path_check(out_dir) # Sanity check directory out_dir= out_dir + '/' + args.project # final out directory is 'project_id' general.mk_out_sub_directory(out_dir) general.mk_out_sub_directory(out_dir + '/scripts') general.mk_out_sub_directory(out_dir + '/qsubs') ####################################### # Write trimmomatic script ####################################### convert=' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > ' qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh') qsub_script.write('#!/bin/bash\n') index=0 args.adapter = fasta_o_matic.run_steps(args.adapter,['wrap', 'new_line','header_whitespace']) for fastq in forwards: (f_path,f_basename,f_ext)=general.parse_filename(forwards[index]) qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 '+ out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh\n' ) if not args.single: (r_path,r_basename,r_ext)=general.parse_filename(reverses[index]) trim_script = general.open_write_file(out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh') trim_script.write('#!/bin/bash\n') # Convert headers if args.convert_header: trim_script.write('# Convert headers:\n') new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq' trim_script.write('cat ' + forwards[index] + convert + new_forward_fastq + '\n') forwards[index] = new_forward_fastq if not args.single: new_reverse_fastq = out_dir + '/' + r_basename + '_h.fastq' trim_script.write('cat ' + reverses[index] + convert + new_reverse_fastq + '\n') reverses[index] = new_reverse_fastq # Trim sequences trim_script.write('# Clean reads:\n') if not args.single: trim_script.write(trimmomatic_template.trim_template( forwards[index], reverses[index], args.adapter, out_dir)) else: trim_script.write(trimmomatic_template.trim_template_single(forwards[index])) # Section in progress... (Remember to point to a SE adapter fasta file # by default) trim_script.close() index += 1 qsub_script.close()
def main(): ''' Run full script as opposed to individual script functions. ''' ###################################################################### ############ Get commandline arguments ############ ###################################################################### parser = argparse.ArgumentParser( description='DESCRIPTION: Summarize counts of all four DNA bases. \ Command-line options that may be omitted \ (i.e. are NOT required) are shown in \ square brackets.') parser.add_argument('-v', '--verbose', action='store_true', dest='verbose', help='Runs reporting status updates', default=True) parser.add_argument('-q', '--quiet', action='store_false', dest='verbose', help='Does not report status updates') parser.add_argument('-c', '--colorized', help='Colorizes log reports. Use only if printing \ output to screen.', action='store_true', dest='colorized') parser.add_argument( '-r', '--read_list', dest='read_list', help='This is the the full path (path and filename) of \ the user provided list of read files. The file should \ be tab separated with the first read file, then the \ second read file (see example_read_list_PE.tab). If a \ sample has multiple fastq files for R1 and R2 separate \ these with commas (see example_read_list_PE_multi.tab).\ For single end reads each line should be a path \ to a fastq file. For single end reads each line should \ be a path to a fastq file (see example_read_list_SE.tab\ )', required=True) parser.add_argument( '-p', '--project', dest='project', help='The project id. This will be used to name output \ (default=project).', default='project', required=False) parser.add_argument( '-a', '--adapter', dest='adapter', help='The adapter fasta file. This will be used to \ clean reads', default= '/homes/bioinfo_software/Trimmomatic-0.33/adapters/TruSeq3-PE-2.fa', required=False) parser.add_argument('-s', '--single_end', action='store_true', dest='single', help='If your reads are single end use this flag. \ Without it the script assumes reads are paired end. \ Also skip the second column (the reverse fastq files) \ when making your read list', required=False, default=False) parser.add_argument('-x', '--convert_header', action='store_true', dest='convert_header', help='If the illumina headers \ do not end in /1 or /2 use this parameter to indicat \ that headers need to be converted. Check your headers \ by typing "head FASTA_FULL_PATH" and read more about \ illumina headers at \ http://en.wikipedia.org/wiki/Fastq#Illumina_sequence_identifiers.', default=False, required=False) parser.add_argument('-m', '--min_read_length', dest='min_read_length', help='The minimum read length in bp. (Default = 90).', required=False, default=90) parser.add_argument('-o', '--out', dest='out', help='Output directory (Default=$HOME)', required=False, default='~') parser.add_argument( '-d', '--dna', dest='sequence', help='DNA sequence to \ summarize', default= 'TATGAAGGGCGATGAATGCTATCTGTCCTGTAGAATTATAGAATCGACTACGTTGGGGAACTAATGGACCAGACAACTCGCTTTGACTGACGTAGACGGCGTGTTGT', required=False) args = parser.parse_args() if args.colorized: import Colorer if args.verbose: doc() log.basicConfig(format='%(levelname)s: %(message)s', level=log.DEBUG) log.info( 'Output is verbose. Run with -q, --quiet flag to suppress full output.' ) else: log.basicConfig(format='%(levelname)s: %(message)s') ###################################################################### ############ Call custom functions with arguments ########### ###################################################################### # Get list of read FASTQ files ####################################### print(args.read_list, args.single, args.min_read_length) (forwards, reverses) = trimmomatic_template.parse_file(args.read_list, args.single) ####################################### # Sanity check read FASTQ files ####################################### index = 0 for fastq in forwards: f_opened_file = general.open_file(forwards[index]) f_opened_file.close() forwards[index] = general.convert_to_full(forwards[index]) if not args.single: r_opened_file = general.open_file(reverses[index]) r_opened_file.close() reverses[index] = general.convert_to_full(reverses[index]) index += 1 ####################################### # Make output directory ####################################### (out_path, out_basename, out_ext) = general.parse_filename(args.out) out_dir = out_path + '/' + out_basename general.path_check(out_dir) # Sanity check directory out_dir = out_dir + '/' + args.project # final out directory is 'project_id' general.mk_out_sub_directory(out_dir) general.mk_out_sub_directory(out_dir + '/scripts') general.mk_out_sub_directory(out_dir + '/qsubs') ####################################### # Write trimmomatic script ####################################### convert = ' | awk \'{if (NR % 4 == 1) {split($1, arr, \":\"); printf \"%s_%s:%s:%s:%s:%s#0/%s\\n\", arr[1], arr[3], arr[4], arr[5], arr[6], arr[7], substr($2, 1, 1), $0} else if (NR % 4 == 3){print \"+\"} else {print $0} }\' > ' qsub_script = general.open_write_file(out_dir + '/qsubs/qsub_trimmomatic.sh') qsub_script.write('#!/bin/bash\n') index = 0 args.adapter = fasta_o_matic.run_steps( args.adapter, ['wrap', 'new_line', 'header_whitespace']) for fastq in forwards: (f_path, f_basename, f_ext) = general.parse_filename(forwards[index]) qsub_script.write('qsub -l mem=4G,h_rt=6:00:00 -pe single 16 ' + out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh\n') if not args.single: (r_path, r_basename, r_ext) = general.parse_filename(reverses[index]) trim_script = general.open_write_file(out_dir + '/scripts/run_trimmomatic_' + f_basename + '.sh') trim_script.write('#!/bin/bash\n') # Convert headers if args.convert_header: trim_script.write('# Convert headers:\n') new_forward_fastq = out_dir + '/' + f_basename + '_h.fastq' trim_script.write('cat ' + forwards[index] + convert + new_forward_fastq + '\n') forwards[index] = new_forward_fastq if not args.single: new_reverse_fastq = out_dir + '/' + r_basename + '_h.fastq' trim_script.write('cat ' + reverses[index] + convert + new_reverse_fastq + '\n') reverses[index] = new_reverse_fastq # Trim sequences trim_script.write('# Clean reads:\n') if not args.single: trim_script.write( trimmomatic_template.trim_template(forwards[index], reverses[index], args.adapter, out_dir)) else: trim_script.write( trimmomatic_template.trim_template_single(forwards[index])) # Section in progress... (Remember to point to a SE adapter fasta file # by default) trim_script.close() index += 1 qsub_script.close()