Example #1
0
def get_study_state(study_id='.'):
    """Describe the current state of a study.

    Each known workflow id is listed under ``'exe'`` when all of the
    workflow's ``inputs`` are among the study's existing data types, and
    under ``'nexe'`` otherwise. Every existing data file is recorded by
    filepath together with its md5 hex digest.

    Parameters
    ----------
    study_id : str, optional
        Identifier handed to ``get_existing_data_types`` and
        ``get_data_filepath``. Defaults to ``'.'``.

    Returns
    -------
    dict
        ``{'study-id': ..., 'workflow': {'exe': [...], 'nexe': [...]},
        'data': {filepath: md5 hex digest}}``
    """
    available_types = get_existing_data_types(study_id)

    executable = []
    not_executable = []
    for wf_id in workflows:
        inputs_present = workflows[wf_id].inputs.issubset(available_types)
        bucket = executable if inputs_present else not_executable
        bucket.append(wf_id)

    checksums = {}
    for data_type in available_types:
        path = get_data_filepath(data_type, study_id)
        with open(path, 'rb') as handle:
            # open question from the original author: sha256 instead?
            checksums[path] = safe_md5(handle).hexdigest()

    return {
        'study-id': study_id,
        'workflow': {
            'exe': executable,
            'nexe': not_executable
        },
        'data': checksums
    }
Example #2
0
    def test_safe_md5(self):
        """safe_md5 of an in-memory file yields the expected hex digest"""
        with BytesIO(b'foo bar baz') as handle:
            digest = safe_md5(handle).hexdigest()

        self.assertEqual(digest, 'ab07acbb1e496801937adfa772424bf7')
Example #3
0
    def test_safe_md5(self):
        """Hashing a small byte stream gives the known md5 digest"""
        expected_digest = 'ab07acbb1e496801937adfa772424bf7'
        stream = BytesIO(b'foo bar baz')

        observed_digest = safe_md5(stream).hexdigest()
        stream.close()

        self.assertEqual(observed_digest, expected_digest)
Example #4
0
    def test_safe_md5(self):
        """Check safe_md5 against a precomputed digest"""
        payload = b'foo bar baz'
        expected = 'ab07acbb1e496801937adfa772424bf7'

        with BytesIO(payload) as buf:
            observed = safe_md5(buf).hexdigest()

        self.assertEqual(observed, expected)
Example #5
0
    def test_safe_md5(self):
        """The md5 reported for a byte buffer matches the expected value"""
        buf = BytesIO(b'foo bar baz')
        hasher = safe_md5(buf)
        buf.close()

        self.assertEqual(hasher.hexdigest(),
                         'ab07acbb1e496801937adfa772424bf7')
Example #6
0
    def generate_run_xml(self):
        """Generates the run XML file

        Builds a ``RUN_SET`` element with one ``RUN`` child per sample in
        ``self.samples_prep``, visited in sorted order. Each run points at
        its experiment by accession when
        ``self._ebi_experiment_accessions`` holds one for the sample and by
        alias otherwise, and describes the sample's file from
        ``self.sample_demux_fps`` together with its MD5 checksum.

        Returns
        -------
        ET.Element
            Object with run XML values
        """
        run_set = ET.Element(
            'RUN_SET', {
                'xmlns:xsi': self.xmlns_xsi,
                "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "run"
            })
        # only the sample names are needed; the prep values are ignored
        for sample_name, _ in sorted(viewitems(self.samples_prep)):
            accession = self._ebi_experiment_accessions[sample_name]
            if accession:
                experiment_ref_dict = {'accession': accession}
            else:
                experiment_alias = self._get_experiment_alias(sample_name)
                experiment_ref_dict = {'refname': experiment_alias}

            # We only submit fastq
            file_type = 'fastq'
            file_path = self.sample_demux_fps[sample_name]

            # hash the raw bytes: text mode may translate newlines or decode
            # the content, which would not describe the file on disk
            with open(file_path, 'rb') as fp:
                md5 = safe_md5(fp).hexdigest()

            run = ET.SubElement(
                run_set, 'RUN', {
                    'alias': self._get_run_alias(sample_name),
                    'center_name': qiita_config.ebi_center_name
                })
            ET.SubElement(run, 'EXPERIMENT_REF', experiment_ref_dict)
            data_block = ET.SubElement(run, 'DATA_BLOCK')
            files = ET.SubElement(data_block, 'FILES')
            ET.SubElement(
                files, 'FILE', {
                    'filename': join(self.ebi_dir, basename(file_path)),
                    'filetype': file_type,
                    'quality_scoring_system': 'phred',
                    'checksum_method': 'MD5',
                    'checksum': md5
                })

        return run_set
Example #7
0
    def generate_run_xml(self):
        """Generates the run XML file

        Builds a ``RUN_SET`` element with one ``RUN`` child per sample in
        ``self.samples_prep``, visited in sorted order. Each run points at
        its experiment by accession when
        ``self._ebi_experiment_accessions`` holds one for the sample and by
        alias otherwise, and describes the sample's file from
        ``self.sample_demux_fps`` together with its MD5 checksum.

        Returns
        -------
        ET.Element
            Object with run XML values
        """
        run_set = ET.Element('RUN_SET', {
            'xmlns:xsi': self.xmlns_xsi,
            "xsi:noNamespaceSchemaLocation": self.xsi_noNSL % "run"})
        # only the sample names are needed; the prep values are ignored
        for sample_name, _ in sorted(viewitems(self.samples_prep)):
            accession = self._ebi_experiment_accessions[sample_name]
            if accession:
                experiment_ref_dict = {'accession': accession}
            else:
                experiment_alias = self._get_experiment_alias(sample_name)
                experiment_ref_dict = {'refname': experiment_alias}

            # We only submit fastq
            file_type = 'fastq'
            file_path = self.sample_demux_fps[sample_name]

            # hash the raw bytes: text mode may translate newlines or decode
            # the content, which would not describe the file on disk
            with open(file_path, 'rb') as fp:
                md5 = safe_md5(fp).hexdigest()

            run = ET.SubElement(run_set, 'RUN', {
                'alias': self._get_run_alias(sample_name),
                'center_name': qiita_config.ebi_center_name}
            )
            ET.SubElement(run, 'EXPERIMENT_REF', experiment_ref_dict)
            data_block = ET.SubElement(run, 'DATA_BLOCK')
            files = ET.SubElement(data_block, 'FILES')
            ET.SubElement(files, 'FILE', {
                'filename': join(self.ebi_dir, basename(file_path)),
                'filetype': file_type,
                'quality_scoring_system': 'phred',
                'checksum_method': 'MD5',
                'checksum': md5}
            )

        return run_set
Example #8
0
    def generate_run_xml(self):
        """Generates the run XML file

        Builds a ``RUN_SET`` element with one ``RUN`` child per entry of
        ``self.samples``, visited in sorted order. Each run references its
        experiment by alias and describes the file named by the sample's
        ``['prep']['file_path']`` together with its MD5 checksum.

        Returns
        -------
        xml.etree.Element
            The root element of the generated ``ElementTree``
        """
        run_set = ET.Element(
            'RUN_SET', {
                "xmlns:xsi":
                "http://www.w3.org/2001/XMLSchema-instance",
                "xsi:noNamespaceSchemaLocation":
                "ftp://ftp.sra.ebi.ac.uk/meta/xsd"
                "/sra_1_3/SRA.run.xsd"
            })
        for sample_name, sample_info in sorted(viewitems(self.samples)):

            experiment_alias = self._get_experiment_alias(sample_name)

            file_type = sample_info['prep']['file_type']
            file_path = sample_info['prep']['file_path']

            # hash the raw bytes: text mode may translate newlines or decode
            # the content, which would not describe the file on disk
            with open(file_path, 'rb') as fp:
                md5 = safe_md5(fp).hexdigest()

            run = ET.SubElement(
                run_set, 'RUN', {
                    'alias': self._get_run_alias(basename(file_path)),
                    'center_name': qiita_config.ebi_center_name
                })
            ET.SubElement(run, 'EXPERIMENT_REF', {'refname': experiment_alias})
            data_block = ET.SubElement(run, 'DATA_BLOCK')
            files = ET.SubElement(data_block, 'FILES')
            ET.SubElement(
                files, 'FILE', {
                    'filename': join(self.ebi_dir, basename(file_path)),
                    'filetype': file_type,
                    'quality_scoring_system': 'phred',
                    'checksum_method': 'MD5',
                    'checksum': md5
                })

        return run_set
Example #9
0
    def _add_file_subelement(self, add_file, file_type, sample_name,
                             is_forward):
        """generate_run_xml helper: describe one read file of a sample

        The file path is the sample's entry in ``self.sample_demux_fps``
        plus the forward or reverse read suffix. The file's md5 and the
        other file attributes are collected in a dict that is passed to
        `add_file`.

        Parameters
        ----------
        add_file : callable
            Called once with the dict of file attributes
        file_type : str
            Stored under ``'filetype'``
        sample_name : str
            Key into ``self.sample_demux_fps``
        is_forward : bool
            Selects ``self.FWD_READ_SUFFIX`` when true, otherwise
            ``self.REV_READ_SUFFIX``
        """
        suffix = self.FWD_READ_SUFFIX if is_forward else self.REV_READ_SUFFIX
        read_fp = self.sample_demux_fps[sample_name] + suffix

        with open(read_fp, 'rb') as handle:
            checksum = safe_md5(handle).hexdigest()

        add_file({
            'filetype': file_type,
            'quality_scoring_system': 'phred',
            'checksum_method': 'MD5',
            'checksum': checksum,
            'filename': join(self.ebi_dir, basename(read_fp)),
        })
Example #10
0
    def generate_run_xml(self):
        """Generates the run XML file

        Builds a ``RUN_SET`` element with one ``RUN`` child per entry of
        ``self.samples``, visited in sorted order. Each run references its
        experiment by alias and describes the file named by the sample's
        ``['prep']['file_path']`` together with its MD5 checksum.

        Returns
        -------
        xml.etree.Element
            The root element of the generated ``ElementTree``
        """
        run_set = ET.Element('RUN_SET', {
            "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
            "xsi:noNamespaceSchemaLocation": "ftp://ftp.sra.ebi.ac.uk/meta/xsd"
                                             "/sra_1_3/SRA.run.xsd"})
        for sample_name, sample_info in sorted(viewitems(self.samples)):

            experiment_alias = self._get_experiment_alias(sample_name)

            file_type = sample_info['prep']['file_type']
            file_path = sample_info['prep']['file_path']

            # hash the raw bytes: text mode may translate newlines or decode
            # the content, which would not describe the file on disk
            with open(file_path, 'rb') as fp:
                md5 = safe_md5(fp).hexdigest()

            run = ET.SubElement(run_set, 'RUN', {
                'alias': self._get_run_alias(basename(file_path)),
                'center_name': qiita_config.ebi_center_name}
            )
            ET.SubElement(run, 'EXPERIMENT_REF', {
                'refname': experiment_alias}
            )
            data_block = ET.SubElement(run, 'DATA_BLOCK')
            files = ET.SubElement(data_block, 'FILES')
            ET.SubElement(files, 'FILE', {
                'filename': join(self.ebi_dir, basename(file_path)),
                'filetype': file_type,
                'quality_scoring_system': 'phred',
                'checksum_method': 'MD5',
                'checksum': md5}
            )

        return run_set
Example #11
0
File: ebi.py Project: mcmk3/qiita
    def _add_file_subelement(self, add_file, file_type, sample_name,
                             is_forward):
        """generate_run_xml helper that registers a single read file

        Computes the md5 of the sample's forward or reverse read file and
        passes a dict with the file's attributes to `add_file`.

        Parameters
        ----------
        add_file : callable
            Receives the dict of file attributes
        file_type : str
            Value stored under ``'filetype'``
        sample_name : str
            Key used to look up the path prefix in
            ``self.sample_demux_fps``
        is_forward : bool
            When false the reverse read suffix is appended to the path
            prefix, otherwise the forward read suffix
        """
        if not is_forward:
            read_suffix = self.REV_READ_SUFFIX
        else:
            read_suffix = self.FWD_READ_SUFFIX
        path_prefix = self.sample_demux_fps[sample_name]
        read_path = path_prefix + read_suffix

        with open(read_path, 'rb') as read_file:
            hasher = safe_md5(read_file)
            digest = hasher.hexdigest()

        details = {}
        details['filetype'] = file_type
        details['quality_scoring_system'] = 'phred'
        details['checksum_method'] = 'MD5'
        details['checksum'] = digest
        details['filename'] = join(self.ebi_dir, basename(read_path))

        add_file(details)
Example #12
0
def get_study_state(study_id='.'):
    """Report runnable workflows and data file checksums for a study.

    Parameters
    ----------
    study_id : str, optional
        Passed to ``get_existing_data_types`` and ``get_data_filepath``.
        Defaults to ``'.'``.

    Returns
    -------
    dict
        ``'study-id'`` holds `study_id`; ``'workflow'`` holds the workflow
        ids split into ``'exe'`` (all inputs are existing data types) and
        ``'nexe'`` (the rest); ``'data'`` maps each existing data filepath
        to its md5 hex digest.
    """
    present = get_existing_data_types(study_id)

    runnable, blocked = [], []
    for wf_id in workflows:
        if not workflows[wf_id].inputs.issubset(present):
            blocked.append(wf_id)
        else:
            runnable.append(wf_id)

    digests = {}
    for data_type in present:
        fp = get_data_filepath(data_type, study_id)
        with open(fp, 'rb') as fh:
            # the original author asks whether sha256 should be used here
            hasher = safe_md5(fh)
            digests[fp] = hasher.hexdigest()

    return {'study-id': study_id,
            'workflow': {'exe': runnable, 'nexe': blocked},
            'data': digests}
def main():
    """Run the fastq processors over every input read file.

    Parses the command line described by ``script_info``, validates the
    option combinations, and then feeds each sequence read file (with its
    barcode read file and mapping file when the run is barcoded) to the
    matching ``process_fastq_single_end_read_file*`` generator. Every
    yielded record is appended to ``seqs.fna`` in the output directory;
    ``seqs.qual`` and ``seqs.fastq`` are also written when requested, and
    ``split_library_log.txt`` and ``histograms.txt`` are always created.
    The sequence outputs are written under ``.incomplete`` names and are
    renamed only after all inputs have been processed.

    Invalid option combinations are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a single mapping file is reused for every sequence read file
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(
            set([
                len(sequence_read_fps),
                len(barcode_read_fps),
                len(mapping_fps)
            ])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')

        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:

        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')

        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:

        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    def file_md5(fp):
        # hash the raw bytes and release the handle as soon as we are done
        with open(fp, 'rb') as f:
            return safe_md5(f).hexdigest()

    # keyword arguments shared by both fastq processors for every input
    processor_kwargs = dict(
        store_unassigned=retain_unassigned_reads,
        max_bad_run_length=max_bad_run_length,
        phred_quality_threshold=phred_quality_threshold,
        min_per_read_length_fraction=min_per_read_length_fraction,
        rev_comp=rev_comp,
        seq_max_N=seq_max_N,
        filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
        log_f=log_f,
        histogram_f=histogram_f,
        phred_offset=phred_offset)

    for i, sequence_read_fp in enumerate(sequence_read_fps):
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            # only barcode_to_sample_id is kept, and it is used as a plain
            # dict below, so the handle is closed once check_map returns
            # NOTE(review): assumes check_map reads the file eagerly
            with open(mapping_fp, 'U') as mapping_f:
                _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                    mapping_f,
                    disable_primer_check=True,
                    has_barcodes=barcode_read_fp is not None)
        else:
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {
                str(DNA(k).rc()): v
                for k, v in barcode_to_sample_id.iteritems()
            }

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, file_md5(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, file_md5(sequence_read_fp)))

        # read handles opened for this input; closed in the finally below
        # NOTE(review): assumes gzip_open returns an object with close()
        input_handles = []
        try:
            if sequence_read_fp.endswith('.gz'):
                sequence_read_f = gzip_open(sequence_read_fp)
            else:
                sequence_read_f = open(sequence_read_fp, 'U')
            input_handles.append(sequence_read_f)

            # stays at start_seq_id when the generator yields nothing
            seq_id = start_seq_id

            if barcode_read_fp is not None:
                log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                            (barcode_read_fp, file_md5(barcode_read_fp)))

                if barcode_read_fp.endswith('.gz'):
                    barcode_read_f = gzip_open(barcode_read_fp)
                else:
                    barcode_read_f = open(barcode_read_fp, 'U')
                input_handles.append(barcode_read_f)

                seq_generator = process_fastq_single_end_read_file(
                    sequence_read_f,
                    barcode_read_f,
                    barcode_to_sample_id,
                    rev_comp_barcode=rev_comp_barcode,
                    start_seq_id=start_seq_id,
                    barcode_correction_fn=barcode_correction_fn,
                    max_barcode_errors=max_barcode_errors,
                    **processor_kwargs)
            else:
                seq_generator = process_fastq_single_end_read_file_no_barcode(
                    sequence_read_f,
                    sample_ids[i],
                    start_seq_id=start_seq_id,
                    **processor_kwargs)

            for fasta_header, sequence, quality, seq_id in seq_generator:
                output_f.write('>%s\n%s\n' % (fasta_header, sequence))
                qual_writer(fasta_header, quality)
                fastq_writer(fasta_header, sequence, quality)
        finally:
            for handle in input_handles:
                handle.close()

        # the next input continues numbering after the last id seen
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)
def main():
    """Run the fastq processors over every input read file.

    Parses the command line described by ``script_info``, validates the
    option combinations, and then feeds each sequence read file (with its
    barcode read file and mapping file when the run is barcoded) to the
    matching ``process_fastq_single_end_read_file*`` generator. Every
    yielded record is appended to ``seqs.fna`` in the output directory;
    ``seqs.qual`` and ``seqs.fastq`` are also written when requested, and
    ``split_library_log.txt`` and ``histograms.txt`` are always created.
    The sequence outputs are written under ``.incomplete`` names and are
    renamed only after all inputs have been processed.

    Invalid option combinations are reported through
    ``option_parser.error``.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    read_arguments_from_file = opts.read_arguments_from_file

    # these arguments can optionally be read from a file, reasoning is to
    # allow arguments that would span over hundreds of samples and would be
    # prohibitive to execute as a command line call
    if read_arguments_from_file:
        # sample_ids is the only one of these arguments that's returned as a
        # string, the rest of them are lists
        if opts.sample_ids:
            opts.sample_ids = ','.join(parse_items(opts.sample_ids))
        if opts.sequence_read_fps:
            opts.sequence_read_fps = parse_items(opts.sequence_read_fps[0])
        if opts.barcode_read_fps:
            opts.barcode_read_fps = parse_items(opts.barcode_read_fps[0])
        if opts.mapping_fps:
            opts.mapping_fps = parse_items(opts.mapping_fps[0])

    sequence_read_fps = opts.sequence_read_fps
    barcode_read_fps = opts.barcode_read_fps
    sample_ids = None
    if opts.sample_ids is not None:
        sample_ids = opts.sample_ids.split(',')
    mapping_fps = opts.mapping_fps
    phred_quality_threshold = opts.phred_quality_threshold
    retain_unassigned_reads = opts.retain_unassigned_reads
    min_per_read_length_fraction = opts.min_per_read_length_fraction
    max_bad_run_length = opts.max_bad_run_length
    rev_comp = opts.rev_comp
    rev_comp_barcode = opts.rev_comp_barcode
    rev_comp_mapping_barcodes = opts.rev_comp_mapping_barcodes
    seq_max_N = opts.sequence_max_n
    start_seq_id = opts.start_seq_id
    # NEED TO FIX THIS FUNCTIONALITY - CURRENTLY READING THE WRONG FIELD
    # opts.filter_bad_illumina_qual_digit
    filter_bad_illumina_qual_digit = False
    store_qual_scores = opts.store_qual_scores
    store_demultiplexed_fastq = opts.store_demultiplexed_fastq
    barcode_type = opts.barcode_type
    max_barcode_errors = opts.max_barcode_errors

    # if this is not a demultiplexed run,
    if barcode_type == 'not-barcoded':
        if sample_ids is None:
            option_parser.error(
                "If not providing barcode reads (because "
                "your data is not multiplexed), must provide --sample_ids.")
        if len(sample_ids) != len(sequence_read_fps):
            option_parser.error(
                "If providing --sample_ids (because "
                "your data is not multiplexed), must provide the same number "
                "of sample ids as sequence read filepaths.")
        barcode_read_fps = [None] * len(sequence_read_fps)
        mapping_fps = [None] * len(sequence_read_fps)
    elif barcode_read_fps is None:
        option_parser.error("Must provide --barcode_read_fps if "
                            "--barcode_type is not 'not-barcoded'")
    elif mapping_fps is None:
        option_parser.error("Must provide --mapping_fps if "
                            "--barcode_type is not 'not-barcoded'")

    phred_offset = opts.phred_offset
    if phred_offset is not None:
        try:
            phred_offset = int(phred_offset)
        except ValueError:
            # shouldn't be able to get here...
            option_parser.error(
                "If --phred_offset is provided, it must be a valid integer.")

    if opts.last_bad_quality_char is not None:
        option_parser.error(
            '--last_bad_quality_char is no longer supported. '
            'Use -q instead (see option help text by passing -h)')

    if not (0 < min_per_read_length_fraction <= 1):
        option_parser.error('--min_per_read_length_fraction must be greater '
                            'than 0 and less than or equal to 1. You passed '
                            '%1.5f.' % min_per_read_length_fraction)

    barcode_correction_fn = BARCODE_DECODER_LOOKUP.get(barcode_type, None)

    # a single mapping file is reused for every sequence read file
    if len(mapping_fps) == 1 and len(sequence_read_fps) > 1:
        mapping_fps = mapping_fps * len(sequence_read_fps)

    if len(set([len(sequence_read_fps), len(barcode_read_fps),
                len(mapping_fps)])) > 1:
        option_parser.error("Same number of sequence, barcode, and mapping "
                            "files must be provided.")

    output_dir = opts.output_dir
    create_dir(output_dir)

    output_fp_temp = '%s/seqs.fna.incomplete' % output_dir
    output_fp = '%s/seqs.fna' % output_dir
    output_f = open(output_fp_temp, 'w')
    qual_fp_temp = '%s/qual.fna.incomplete' % output_dir
    qual_fp = '%s/seqs.qual' % output_dir
    output_fastq_fp_temp = '%s/seqs.fastq.incomplete' % output_dir
    output_fastq_fp = '%s/seqs.fastq' % output_dir

    if store_qual_scores:
        qual_f = open(qual_fp_temp, 'w')
        # define a qual writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def qual_writer(h, q):
            qual_f.write('>%s\n%s\n' % (h, q))
    else:
        def qual_writer(h, q):
            pass

    if store_demultiplexed_fastq:
        output_fastq_f = open(output_fastq_fp_temp, 'w')
        # define a fastq writer whether we're storing
        # qual strings or not so we don't have to check
        # every time through the for loop below

        def fastq_writer(h, s, q):
            output_fastq_f.write(format_fastq_record(h, s, q))
    else:
        def fastq_writer(h, s, q):
            pass

    log_fp = '%s/split_library_log.txt' % output_dir
    log_f = open(log_fp, 'w')
    histogram_fp = '%s/histograms.txt' % output_dir
    histogram_f = open(histogram_fp, 'w')

    def file_md5(fp):
        # hash the raw bytes and release the handle as soon as we are done
        with open(fp, 'rb') as f:
            return safe_md5(f).hexdigest()

    # keyword arguments shared by both fastq processors for every input
    processor_kwargs = dict(
        store_unassigned=retain_unassigned_reads,
        max_bad_run_length=max_bad_run_length,
        phred_quality_threshold=phred_quality_threshold,
        min_per_read_length_fraction=min_per_read_length_fraction,
        rev_comp=rev_comp, seq_max_N=seq_max_N,
        filter_bad_illumina_qual_digit=filter_bad_illumina_qual_digit,
        log_f=log_f, histogram_f=histogram_f,
        phred_offset=phred_offset)

    for i, sequence_read_fp in enumerate(sequence_read_fps):
        barcode_read_fp = barcode_read_fps[i]
        mapping_fp = mapping_fps[i]
        if mapping_fp is not None:
            # only barcode_to_sample_id is kept, and it is used as a plain
            # dict below, so the handle is closed once check_map returns
            # NOTE(review): assumes check_map reads the file eagerly
            with open(mapping_fp, 'U') as mapping_f:
                _, _, barcode_to_sample_id, _, _, _, _ = check_map(
                    mapping_f,
                    disable_primer_check=True,
                    has_barcodes=barcode_read_fp is not None)
        else:
            barcode_to_sample_id = {}

        if rev_comp_mapping_barcodes:
            barcode_to_sample_id = {str(DNA(k).rc()): v for k, v in
                                    barcode_to_sample_id.iteritems()}

        if barcode_type == 'golay_12':
            invalid_golay_barcodes = get_invalid_golay_barcodes(
                barcode_to_sample_id.keys())
            if len(invalid_golay_barcodes) > 0:
                option_parser.error(
                    "Some or all barcodes are not valid golay "
                    "codes. Do they need to be reverse complemented? If these "
                    "are not golay barcodes pass --barcode_type 12 to disable "
                    "barcode error correction, or pass --barcode_type # if "
                    "the barcodes are not 12 base pairs, where # is the size "
                    "of the barcodes. Invalid codes:\n\t%s" %
                    ' '.join(invalid_golay_barcodes))

        log_f.write("Input file paths\n")
        if mapping_fp is not None:
            log_f.write('Mapping filepath: %s (md5: %s)\n' %
                        (mapping_fp, file_md5(mapping_fp)))
        log_f.write('Sequence read filepath: %s (md5: %s)\n' %
                    (sequence_read_fp, file_md5(sequence_read_fp)))

        # read handles opened for this input; closed in the finally below
        # NOTE(review): assumes gzip_open returns an object with close()
        input_handles = []
        try:
            if sequence_read_fp.endswith('.gz'):
                sequence_read_f = gzip_open(sequence_read_fp)
            else:
                sequence_read_f = open(sequence_read_fp, 'U')
            input_handles.append(sequence_read_f)

            # stays at start_seq_id when the generator yields nothing
            seq_id = start_seq_id

            if barcode_read_fp is not None:
                log_f.write('Barcode read filepath: %s (md5: %s)\n\n' %
                            (barcode_read_fp, file_md5(barcode_read_fp)))

                if barcode_read_fp.endswith('.gz'):
                    barcode_read_f = gzip_open(barcode_read_fp)
                else:
                    barcode_read_f = open(barcode_read_fp, 'U')
                input_handles.append(barcode_read_f)

                seq_generator = process_fastq_single_end_read_file(
                    sequence_read_f, barcode_read_f, barcode_to_sample_id,
                    rev_comp_barcode=rev_comp_barcode,
                    start_seq_id=start_seq_id,
                    barcode_correction_fn=barcode_correction_fn,
                    max_barcode_errors=max_barcode_errors,
                    **processor_kwargs)
            else:
                seq_generator = process_fastq_single_end_read_file_no_barcode(
                    sequence_read_f, sample_ids[i],
                    start_seq_id=start_seq_id,
                    **processor_kwargs)

            for fasta_header, sequence, quality, seq_id in seq_generator:
                output_f.write('>%s\n%s\n' % (fasta_header, sequence))
                qual_writer(fasta_header, quality)
                fastq_writer(fasta_header, sequence, quality)
        finally:
            for handle in input_handles:
                handle.close()

        # the next input continues numbering after the last id seen
        start_seq_id = seq_id + 1
        log_f.write('\n---\n\n')

    log_f.close()
    histogram_f.close()

    output_f.close()
    rename(output_fp_temp, output_fp)

    # process the optional output files, as necessary
    if store_qual_scores:
        qual_f.close()
        rename(qual_fp_temp, qual_fp)

    if store_demultiplexed_fastq:
        output_fastq_f.close()
        rename(output_fastq_fp_temp, output_fastq_fp)