def test_get_resource_filename(self):
    """WHEN get_resource_filename is invoked with a relative file path to the root folder, THEN it returns the absolute path for that file."""
    relative_path = "scripts/fastq-fasta-line-validation.awk"
    absolute_path = command.get_resource_filename(relative_path)
    # The result should be rooted somewhere above the scripts/ folder.
    self.assertRegex(absolute_path, r"^.+/scripts/fastq-fasta-line-validation.awk")
def run(self):
    """Validate and truncate the local fastq/fasta input file(s), then write a summary JSON.

    Flow:
      1. For each input (1 or 2 files), verify .gz inputs with ``gzip -t``,
         then stream the content through a shell pipeline that truncates
         over-long lines, caps the total line count, and line-validates with
         an awk script. Plain (non-gz) inputs go through the same pipeline so
         both paths behave identically.
      2. Run a quick per-read check (falling back to a full check + truncate)
         and cap each file at ``truncate_fragments_to`` fragments.
      3. Write ``self.summary_dict`` (read-length distribution buckets) to the
         summary output file.

    On any failure the error message is written to the summary file, the
    summary is uploaded to S3, and the exception is re-raised.

    Raises:
        InvalidFileFormatError: if an input is not valid fastq/fasta/gzip,
            or paired inputs differ by more than 1000 reads.
    """
    # Setup
    input_files = self.input_files_local[0][0:2]
    num_inputs = len(input_files)
    assert num_inputs in [1, 2], 'Invalid number of input files'
    output_files = self.output_files_local()[1:3]
    summary_file = self.output_files_local()[0]
    max_fragments = self.additional_attributes["truncate_fragments_to"]
    file_ext = self.additional_attributes.get("file_ext")
    assert file_ext in ['fastq', 'fasta'], 'Invalid file extension'
    is_fastq = file_ext == 'fastq'
    try:
        for i in range(num_inputs):
            input_file = input_files[i]
            base_name, ext = os.path.splitext(input_file)
            num_lines = self.calc_max_num_lines(is_fastq, max_fragments)

            def _run_validation(script, output_file, error_message):
                # Shared line-validation pipeline: truncate each line to
                # max_line_length+1 chars (so the awk validator can flag
                # over-long lines), cap the line count, then awk-validate.
                # Raises InvalidFileFormatError (chained to the root cause)
                # if any stage fails.
                try:
                    command.execute(
                        command_patterns.ShellScriptCommand(
                            script=script,
                            named_args={
                                "input_file": input_file,
                                "awk_script_file": command.get_resource_filename(
                                    "scripts/fastq-fasta-line-validation.awk"
                                ),
                                "max_line_length": vc.MAX_LINE_LENGTH,
                                "num_lines": num_lines,
                                "output_file": output_file
                            }))
                except Exception as e:
                    raise InvalidFileFormatError(error_message) from e

            # NOTE(review): "$[...]" in the scripts below is deprecated bash
            # arithmetic syntax; kept byte-for-byte to preserve behavior —
            # consider migrating to "$((...))" in a follow-up.
            if ext == '.gz':
                # Decompressed output replaces the .gz path in input_files.
                input_files[i] = base_name
                try:
                    # Confirm this is a valid gzip archive before decompressing.
                    command.execute(
                        command_patterns.SingleCommand(
                            cmd="gzip",
                            args=["-t", input_file]))
                except Exception as e:
                    raise InvalidFileFormatError(
                        "Invalid fastq/fasta/gzip file") from e
                _run_validation(
                    r'''gzip -dc "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                    base_name,
                    "Invalid fastq/fasta/gzip file")
            else:
                # Validate and truncate the plain input file to keep behavior
                # consistent with gz input files.
                tmp_file = base_name + ".tmp"
                _run_validation(
                    r'''cat "${input_file}" | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";''',
                    tmp_file,
                    "Invalid fastq/fasta file")
                input_files[i] = tmp_file

        # Distribution of read lengths in the files, filled in by the
        # quick/full checks below.
        self.summary_dict = {
            vc.BUCKET_TOO_SHORT: 0,
            vc.BUCKET_NORMAL: 0,
            vc.BUCKET_LONG: 0,
            vc.BUCKET_TOO_LONG: 0
        }
        quick_check_passed = \
            self.quick_check_file(input_files[0], is_fastq) and \
            (num_inputs == 1 or self.quick_check_file(input_files[1], is_fastq))

        all_fragments = []
        for infile, outfile in zip(input_files, output_files):
            if quick_check_passed:
                num_fragments = self.truncate_file(
                    infile, outfile, is_fastq, max_fragments)
            else:
                num_fragments = self._full_check_and_truncate_file(
                    infile, outfile, is_fastq, max_fragments, num_inputs)
            all_fragments.append(num_fragments)

        # Paired inputs must contain (nearly) the same number of reads.
        if len(all_fragments) == 2 and abs(all_fragments[1] - all_fragments[0]) > 1000:
            raise InvalidFileFormatError(
                "Paired input files need to contain the same number of reads"
            )

        with open(summary_file, 'w') as summary_f:
            json.dump(self.summary_dict, summary_f)
    except Exception as e:
        # Record the failure in the summary file and upload it before
        # propagating, so downstream consumers can see why validation failed.
        with open(summary_file, 'w') as summary_f:
            json.dump({'Validation error': str(e)}, summary_f)
        s3_path = self.s3_path(summary_file)
        s3.upload_with_retries(summary_file, s3_path)
        raise  # bare raise preserves the original traceback