Exemple #1
0
    def test_get_resource_filename(self):
        '''GIVEN a file path relative to the root folder, WHEN it is passed to get_resource_filename, THEN the result contains that path behind a non-empty parent prefix (expected to be the absolute path of the file).'''
        relative_path = "scripts/fastq-fasta-line-validation.awk"
        # The leading "^.+/" requires at least one character before the
        # relative path, i.e. the helper must have prepended a directory.
        expected_pattern = r"^.+/scripts/fastq-fasta-line-validation.awk"

        resolved_path = command.get_resource_filename(relative_path)

        self.assertRegex(resolved_path, expected_pattern)
    def run(self):
        '''Validate and truncate the input fastq/fasta files and write a summary file.

        Up to two input files are taken from ``self.input_files_local[0]``.
        Each one is first passed through a shell pipeline that (for ``.gz``
        inputs) decompresses it, cuts over-long lines, keeps at most the
        number of lines returned by ``calc_max_num_lines`` and runs the
        line-validation awk script. The resulting files are then checked and
        truncated into the step's second and third output files, and
        ``self.summary_dict`` is dumped as JSON into the first output file.

        Reads ``truncate_fragments_to`` and ``file_ext`` from
        ``self.additional_attributes``.

        Raises:
            AssertionError: if the number of input files is not 1 or 2, or
                ``file_ext`` is neither ``'fastq'`` nor ``'fasta'``.
            InvalidFileFormatError: if an input file fails the gzip test or
                the validation pipeline, or if paired files differ by more
                than 1000 fragments.

        Any exception raised after setup is recorded in the summary file as
        ``{'Validation error': <message>}``, the summary file is uploaded to
        S3, and the exception is re-raised.
        '''
        # Setup
        input_files = self.input_files_local[0][0:2]
        num_inputs = len(input_files)
        assert num_inputs in [1, 2], 'Invalid number of input files'
        local_outputs = self.output_files_local()
        summary_file = local_outputs[0]
        output_files = local_outputs[1:3]
        max_fragments = self.additional_attributes["truncate_fragments_to"]

        file_ext = self.additional_attributes.get("file_ext")
        assert file_ext in ['fastq', 'fasta'], 'Invalid file extension'

        is_fastq = file_ext == 'fastq'

        # Both pipelines share everything after the command that produces the
        # raw text stream, so only the reader differs between them.
        pipeline_tail = r''' | cut -c -"$[max_line_length+1]" | head -n "${num_lines}" | awk -f "${awk_script_file}" -v max_line_length="${max_line_length}" > "${output_file}";'''
        gzip_script = r'''gzip -dc "${input_file}"''' + pipeline_tail
        plain_script = r'''cat "${input_file}"''' + pipeline_tail

        def _run_validation_pipeline(script, source, destination, num_lines):
            # Run one of the shell pipelines above from `source` into `destination`.
            command.execute(
                command_patterns.ShellScriptCommand(
                    script=script,
                    named_args={
                        "input_file": source,
                        "awk_script_file": command.get_resource_filename(
                            "scripts/fastq-fasta-line-validation.awk"),
                        "max_line_length": vc.MAX_LINE_LENGTH,
                        "num_lines": num_lines,
                        "output_file": destination
                    }))

        try:
            # Identical for every input file, so compute it once up front.
            num_lines = self.calc_max_num_lines(is_fastq, max_fragments)

            for i, input_file in enumerate(input_files):
                base_name, extension = os.path.splitext(input_file)

                # unzip if .gz file
                if extension == '.gz':
                    input_files[i] = base_name
                    try:
                        # test if a valid gzip file
                        command.execute(
                            command_patterns.SingleCommand(
                                cmd="gzip", args=["-t", input_file]))
                        # then decompress it
                        _run_validation_pipeline(gzip_script, input_file,
                                                 base_name, num_lines)
                    except Exception as err:
                        # Chain the cause so the underlying failure stays
                        # visible in the traceback.
                        raise InvalidFileFormatError(
                            "Invalid fastq/fasta/gzip file") from err
                else:
                    # Validate and truncate the input file to keep behavior consistent with gz input files
                    tmp_file = base_name + ".tmp"
                    try:
                        _run_validation_pipeline(plain_script, input_file,
                                                 tmp_file, num_lines)
                    except Exception as err:
                        raise InvalidFileFormatError(
                            "Invalid fastq/fasta file") from err
                    input_files[i] = tmp_file

            # keep a dictionary of the distribution of read lengths in the files
            self.summary_dict = {
                vc.BUCKET_TOO_SHORT: 0,
                vc.BUCKET_NORMAL: 0,
                vc.BUCKET_LONG: 0,
                vc.BUCKET_TOO_LONG: 0
            }

            quick_check_passed = \
                self.quick_check_file(input_files[0], is_fastq) and \
                (num_inputs == 1 or self.quick_check_file(input_files[1], is_fastq))

            all_fragments = []

            for infile, outfile in zip(input_files, output_files):
                if quick_check_passed:
                    num_fragments = self.truncate_file(infile, outfile,
                                                       is_fastq, max_fragments)
                else:
                    num_fragments = self._full_check_and_truncate_file(
                        infile, outfile, is_fastq, max_fragments, num_inputs)
                all_fragments.append(num_fragments)

            # Paired files may differ by up to 1000 fragments before being rejected.
            if len(all_fragments) == 2 and abs(all_fragments[1] -
                                               all_fragments[0]) > 1000:
                raise InvalidFileFormatError(
                    "Paired input files need to contain the same number of reads"
                )

            with open(summary_file, 'w') as summary_f:
                json.dump(self.summary_dict, summary_f)

        except Exception as e:
            # Record the failure in the summary file and upload it before
            # propagating the error.
            with open(summary_file, 'w') as summary_f:
                json.dump({'Validation error': str(e)}, summary_f)
            s3_path = self.s3_path(summary_file)
            s3.upload_with_retries(summary_file, s3_path)
            raise