Code example #1
def find_bowtie2():
    """Locate the bowtie2 and bowtie2-build wrappers.

    Falls back to version-suffixed executable names when the plain names
    are not found.
    """
    try:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION, BOWTIE_BUILD_PATH, logger)
    except RuntimeError:
        # plain executable names not found; retry with a version suffix
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH + '-' + BOWTIE_VERSION)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION,
                                     BOWTIE_BUILD_PATH + '-' + BOWTIE_VERSION,
                                     logger)
    return bowtie2, bowtie2_build
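The pattern above tries the plain executable names first, then falls back to version-suffixed names. A minimal standalone sketch of the same lookup, using a hypothetical find_versioned_tool helper (not part of MiCall):

import shutil


def find_versioned_tool(name, version):
    """Return the first executable found: plain name, then name-version."""
    for candidate in (name, '{}-{}'.format(name, version)):
        path = shutil.which(candidate)
        if path is not None:
            return path
    raise RuntimeError('{} {} not found on PATH'.format(name, version))


# Illustrative call; '2.2.8' is a placeholder version string.
# find_versioned_tool('bowtie2', '2.2.8') -> '/usr/bin/bowtie2-2.2.8'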
Code example #2
File: prelim_map.py Project: pastvir/MiCall
def prelim_map(fastq1,
               fastq2,
               prelim_csv,
               nthreads=BOWTIE_THREADS,
               rdgopen=READ_GAP_OPEN,
               rfgopen=REF_GAP_OPEN,
               stderr=sys.stderr,
               gzip=False,
               work_path='',
               excluded_seeds=None):
    """ Run the preliminary mapping step.

    @param fastq1: the file name for the forward reads in FASTQ format
    @param fastq2: the file name for the reverse reads in FASTQ format
    @param prelim_csv: an open file object for the output file - all the reads
        mapped to references in CSV version of the SAM format
    @param nthreads: the number of threads to use.
    @param rdgopen: a penalty for opening a gap in the read sequence.
    @param rfgopen: a penalty for opening a gap in the reference sequence.
    @param stderr: where to write the standard error output from bowtie2 calls.
    @param gzip: True if FASTQ files are in gzip format
    @param work_path:  optional path to store working files
    @param excluded_seeds: a list of seed names to exclude from mapping
    """
    try:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION,
                                     BOWTIE_BUILD_PATH,
                                     logger)
    except RuntimeError:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH + '-' + BOWTIE_VERSION)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION,
                                     BOWTIE_BUILD_PATH + '-' + BOWTIE_VERSION,
                                     logger)

    # check that the inputs exist
    fastq1 = check_fastq(fastq1, gzip)
    fastq2 = check_fastq(fastq2, gzip)

    # generate initial reference files
    projects = project_config.ProjectConfig.loadDefault()
    ref_path = os.path.join(work_path, 'micall.fasta')
    all_excluded_seeds = {project_config.G2P_SEED_NAME}
    if excluded_seeds:
        all_excluded_seeds.update(excluded_seeds)
    with open(ref_path, 'w') as ref:
        projects.writeSeedFasta(ref, all_excluded_seeds)
    reffile_template = os.path.join(work_path, 'reference')
    bowtie2_build.build(ref_path, reffile_template)

    fieldnames = ['qname',
                  'flag',
                  'rname',
                  'pos',
                  'mapq',
                  'cigar',
                  'rnext',
                  'pnext',
                  'tlen',
                  'seq',
                  'qual']
    writer = csv.writer(prelim_csv, lineterminator=os.linesep)
    writer.writerow(fieldnames)

    # do preliminary mapping
    read_gap_open_penalty = rdgopen
    ref_gap_open_penalty = rfgopen

    # stream output from bowtie2
    bowtie_args = ['--wrapper', 'micall-0',
                   '--quiet',
                   '-x', reffile_template,
                   '-1', fastq1,
                   '-2', fastq2,
                   '--rdg', "{},{}".format(read_gap_open_penalty,
                                           READ_GAP_EXTEND),
                   '--rfg', "{},{}".format(ref_gap_open_penalty,
                                           REF_GAP_EXTEND),
                   '--no-hd',  # no header lines (start with @)
                   '-X', '1200',
                   '-p', str(nthreads)]

    for line in bowtie2.yield_output(bowtie_args, stderr=stderr):
        writer.writerow(line.split('\t')[:11])  # discard optional items
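prelim_map expects FASTQ file names plus an open file object for the CSV output. A hypothetical driver (all file names below are placeholders):

# Sketch of an invocation; sample_R1.fastq and sample_R2.fastq are
# placeholder inputs and scratch/ is an assumed working directory.
with open('prelim.csv', 'w') as prelim_csv:
    prelim_map('sample_R1.fastq',
               'sample_R2.fastq',
               prelim_csv,
               work_path='scratch')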
Code example #3
File: remap.py Project: tarah28/MiCall
def remap(fastq1,
          fastq2,
          prelim_csv,
          remap_csv,
          remap_counts_csv,
          remap_conseq_csv,
          unmapped1,
          unmapped2,
          work_path='',
          nthreads=BOWTIE_THREADS,
          callback=None,
          count_threshold=10,
          rdgopen=READ_GAP_OPEN,
          rfgopen=REF_GAP_OPEN,
          stderr=sys.stderr,
          gzip=False):
    """
    Iteratively re-map reads from raw paired FASTQ files to a reference sequence set that
    is being updated as the consensus of the reads that were mapped to the last set.
    @param fastq1: input R1 FASTQ
    @param fastq2: input R2 FASTQ
    @param prelim_csv: input CSV output from prelim_map()
    @param remap_csv:  output CSV, contents of bowtie2 SAM output
    @param remap_counts_csv:  output CSV, counts of reads mapped to regions
    @param remap_conseq_csv:  output CSV, sample- and region-specific consensus sequences
                                generated while remapping reads
    @param unmapped1:  output FASTQ containing R1 reads that did not map to any region
    @param unmapped2:  output FASTQ containing R2 reads that did not map to any region
    @param work_path:  optional path to store working files
    @param nthreads:  optional setting to modify the number of threads used by bowtie2
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param count_threshold:  minimum number of reads that map to a region for it to be remapped
    @param rdgopen: read gap open penalty
    @param rfgopen: reference gap open penalty
    @param stderr: an open file object to receive stderr from the bowtie2 calls
    @param gzip: True if the FASTQ files are gzipped
    """

    reffile = os.path.join(work_path, 'temp.fasta')
    samfile = os.path.join(work_path, 'temp.sam')

    try:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION, BOWTIE_BUILD_PATH, logger)
    except RuntimeError:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH + '-' + BOWTIE_VERSION)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION,
                                     BOWTIE_BUILD_PATH + '-' + BOWTIE_VERSION,
                                     logger)

    # check that the inputs exist
    if not os.path.exists(fastq1):
        logger.error('No FASTQ found at %s', fastq1)
        sys.exit(1)

    if not os.path.exists(fastq2):
        logger.error('No FASTQ found at %s', fastq2)
        sys.exit(1)

    # append .gz extension if necessary
    if gzip:
        if not fastq1.endswith('.gz'):
            try:
                os.symlink(fastq1, fastq1 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq1 += '.gz'

        if not fastq2.endswith('.gz'):
            try:
                os.symlink(fastq2, fastq2 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq2 += '.gz'

    worker_pool = (multiprocessing.Pool(processes=nthreads)
                   if nthreads > 1 else None)

    # retrieve reference sequences used for preliminary mapping
    projects = project_config.ProjectConfig.loadDefault()
    seeds = {}
    for seed, vals in projects.config['regions'].iteritems():
        seqs = vals['reference']
        seeds[seed] = ''.join(seqs)
    conseqs = dict(seeds)  # copy

    # record the raw read count
    raw_count = line_counter.count(
        fastq1, gzip=gzip) / 2  # 4 lines per record in FASTQ, paired

    remap_counts_writer = csv.DictWriter(
        remap_counts_csv,
        'type count filtered_count seed_dist other_dist other_seed'.split(),
        lineterminator=os.linesep)
    remap_counts_writer.writeheader()
    remap_counts_writer.writerow(dict(type='raw', count=raw_count))

    # convert preliminary CSV to SAM, count reads
    if callback:
        callback(message='... processing preliminary map',
                 progress=0,
                 max_progress=raw_count)

    with open(samfile, 'w') as f:
        # write SAM header
        f.write('@HD\tVN:1.0\tSO:unsorted\n')
        for rname, refseq in conseqs.iteritems():
            f.write('@SQ\tSN:%s\tLN:%d\n' % (rname, len(refseq)))
        f.write('@PG\tID:bowtie2\tPN:bowtie2\tVN:2.2.3\tCL:""\n')

        # iterate through prelim CSV and record counts, transfer rows to SAM
        refgroups = {}  # { group_name: (refname, count) }
        reader = csv.DictReader(prelim_csv)
        row_count = 0
        for refname, group in itertools.groupby(reader, itemgetter('rname')):
            count = 0
            filtered_count = 0
            for row in group:
                if callback and row_count % 1000 == 0:
                    callback(progress=row_count)

                count += 1
                row_count += 1

                # write SAM row; fieldnames is the list of the 11 standard
                # SAM columns, defined elsewhere in remap.py
                f.write('\t'.join([row[field] for field in fieldnames]) + '\n')

                if is_unmapped_read(row['flag']):
                    continue
                if is_short_read(row, max_primer_length=50):
                    # exclude short reads
                    continue

                filtered_count += 1
            if callback:
                callback(progress=raw_count)

            # report preliminary counts to file
            remap_counts_writer.writerow(
                dict(type='prelim %s' % refname,
                     count=count,
                     filtered_count=filtered_count))
            if refname == '*':
                continue
            refgroup = projects.getSeedGroup(refname)
            seed_count_threshold = 1 if refname == 'HIV1B-env-seed' else count_threshold
            _best_ref, best_count = refgroups.get(
                refgroup, (None, seed_count_threshold - 1))
            if filtered_count > best_count:
                refgroups[refgroup] = (refname, filtered_count)

    seed_counts = {
        best_ref: best_count
        for best_ref, best_count in refgroups.itervalues()
    }
    # regenerate consensus sequences based on preliminary map
    conseqs = build_conseqs(samfile, seeds=seeds, worker_pool=worker_pool)

    # exclude references with low counts (post filtering)
    new_conseqs = {}
    map_counts = {}
    for rname, conseq in conseqs.iteritems():
        count = seed_counts.get(rname, None)
        if count is not None:
            # transfer filtered counts to map counts for remap loop
            map_counts[rname] = count
            new_conseqs[rname] = conseq
    conseqs = new_conseqs

    # start remapping loop
    n_remaps = 0
    new_counts = Counter()
    unmapped_count = raw_count
    while conseqs:
        if callback:
            callback(message='... remap iteration %d' % n_remaps, progress=0)

        # reset unmapped files with each iteration
        unmapped1.seek(0)
        unmapped1.truncate()
        unmapped2.seek(0)
        unmapped2.truncate()

        unmapped_count = map_to_reference(fastq1, fastq2, conseqs, reffile,
                                          samfile, unmapped1, unmapped2,
                                          bowtie2, bowtie2_build, raw_count,
                                          rdgopen, rfgopen, nthreads,
                                          new_counts, stderr, callback)

        old_seed_names = set(conseqs.iterkeys())
        # regenerate consensus sequences
        distance_report = {}
        conseqs = build_conseqs(
            samfile,
            seeds=seeds,
            is_filtered=True,
            worker_pool=worker_pool,
            filter_coverage=count_threshold / 2,  # pairs
            distance_report=distance_report)
        new_seed_names = set(conseqs.iterkeys())
        n_remaps += 1
        write_remap_counts(remap_counts_writer,
                           new_counts,
                           title='remap-{}'.format(n_remaps),
                           distance_report=distance_report)

        if new_seed_names == old_seed_names:
            # stopping criterion 1 - none of the regions gained reads
            if all((count <= map_counts[refname])
                   for refname, count in new_counts.iteritems()):
                break

            # stopping criterion 2 - a sufficient fraction of raw data has been mapped
            mapping_efficiency = sum(new_counts.values()) / float(raw_count)
            if mapping_efficiency > MIN_MAPPING_EFFICIENCY:
                break

            if n_remaps >= MAX_REMAPS:
                break

        # deep copy of mapping counts
        map_counts = dict(new_counts)

    # finished iterative phase
    if worker_pool is not None:
        worker_pool.close()

    # generate SAM CSV output
    remap_writer = csv.DictWriter(remap_csv,
                                  fieldnames,
                                  lineterminator=os.linesep)
    remap_writer.writeheader()
    if new_counts:
        splitter = MixedReferenceSplitter()
        split_counts = Counter()
        # At least one read was mapped, so samfile has relevant data
        with open(samfile, 'rU') as f:
            for fields in splitter.split(f):
                remap_writer.writerow(dict(zip(fieldnames, fields)))
        for rname, (split_file1, split_file2) in splitter.splits.iteritems():
            refseqs = {rname: conseqs[rname]}
            unmapped_count += map_to_reference(
                split_file1.name, split_file2.name, refseqs, reffile, samfile,
                unmapped1, unmapped2, bowtie2, bowtie2_build, raw_count,
                rdgopen, rfgopen, nthreads, split_counts, stderr, callback)
            new_counts.update(split_counts)
            with open(samfile, 'rU') as f:
                for fields in splitter.walk(f):
                    remap_writer.writerow(dict(zip(fieldnames, fields)))

    # write consensus sequences and counts
    # record consensus sequences for later use
    remap_conseq_csv.write('region,sequence\n')
    for refname in new_counts.iterkeys():
        # NOTE this is the consensus sequence to which the reads were mapped, NOT the
        # current consensus!
        conseq = conseqs.get(refname) or projects.getReference(refname)
        remap_conseq_csv.write('%s,%s\n' % (refname, conseq))
    write_remap_counts(remap_counts_writer, new_counts, title='remap-final')

    # report number of unmapped reads
    remap_counts_writer.writerow(dict(type='unmapped', count=unmapped_count))
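The remap loop above only tests its exit conditions once the seed set is stable between iterations. A standalone restatement of that logic, with assumed values for the MIN_MAPPING_EFFICIENCY and MAX_REMAPS constants:

MIN_MAPPING_EFFICIENCY = 0.95  # assumed threshold, not MiCall's actual value
MAX_REMAPS = 3                 # assumed iteration cap, not MiCall's actual value


def should_stop(old_names, new_names, map_counts, new_counts,
                raw_count, n_remaps):
    """Mirror the loop's stopping tests from the example above."""
    if new_names != old_names:
        return False  # seed set changed, so keep iterating
    # criterion 1 - none of the regions gained reads
    if all(count <= map_counts.get(ref, 0)
           for ref, count in new_counts.items()):
        return True
    # criterion 2 - a sufficient fraction of raw data has been mapped
    if sum(new_counts.values()) / float(raw_count) > MIN_MAPPING_EFFICIENCY:
        return True
    # criterion 3 - iteration cap reached
    return n_remaps >= MAX_REMAPS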
Code example #4
def remap(fastq1,
          fastq2,
          prelim_csv,
          remap_csv,
          remap_counts_csv,
          remap_conseq_csv,
          unmapped1,
          unmapped2,
          work_path='',
          callback=None,
          count_threshold=10,
          rdgopen=READ_GAP_OPEN,
          rfgopen=REF_GAP_OPEN,
          stderr=sys.stderr,
          gzip=False,
          debug_file_prefix=None):
    """
    Iteratively re-map reads from raw paired FASTQ files to a reference sequence set that
    is being updated as the consensus of the reads that were mapped to the last set.
    @param fastq1: input R1 FASTQ
    @param fastq2: input R2 FASTQ
    @param prelim_csv: input CSV output from prelim_map()
    @param remap_csv:  output CSV, contents of bowtie2 SAM output
    @param remap_counts_csv:  output CSV, counts of reads mapped to regions
    @param remap_conseq_csv:  output CSV, sample- and region-specific consensus sequences
                                generated while remapping reads
    @param unmapped1:  output FASTQ containing R1 reads that did not map to any region
    @param unmapped2:  output FASTQ containing R2 reads that did not map to any region
    @param work_path:  optional path to store working files
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param count_threshold:  minimum number of reads that map to a region for it to be remapped
    @param rdgopen: read gap open penalty
    @param rfgopen: reference gap open penalty
    @param stderr: an open file object to receive stderr from the bowtie2 calls
    @param gzip: True if the FASTQ files are gzipped
    @param debug_file_prefix: the prefix for the file path to write debug files.
        If not None, this will be used to write a copy of the reference FASTA
        files and the output SAM files.
    """

    reffile = os.path.join(work_path, 'temp.fasta')
    samfile = os.path.join(work_path, 'temp.sam')

    try:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION, BOWTIE_BUILD_PATH, logger)
    except RuntimeError:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH + '-' + BOWTIE_VERSION)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION,
                                     BOWTIE_BUILD_PATH + '-' + BOWTIE_VERSION,
                                     logger)
    # check that the inputs exist
    fastq1 = check_fastq(fastq1, gzip)
    fastq2 = check_fastq(fastq2, gzip)

    # retrieve reference sequences used for preliminary mapping
    projects = project_config.ProjectConfig.loadDefault()
    seeds = projects.getAllReferences()

    # record the raw read count
    raw_count = line_counter.count(
        fastq1, gzip=gzip) // 2  # 4 lines per record in FASTQ, paired

    remap_counts_writer = csv.DictWriter(
        remap_counts_csv,
        'type count filtered_count seed_dist other_dist other_seed'.split(),
        lineterminator=os.linesep)
    remap_counts_writer.writeheader()
    remap_counts_writer.writerow(dict(type='raw', count=raw_count))

    # convert preliminary CSV to SAM, count reads
    with open(samfile, 'w') as f:
        # transfer filtered counts to map counts for remap loop
        map_counts = convert_prelim(prelim_csv, f, remap_counts_writer,
                                    count_threshold, projects)

    # regenerate consensus sequences based on preliminary map
    prelim_conseqs = build_conseqs(samfile, seeds=seeds)

    # exclude references with low counts (post filtering)
    conseqs = {
        rname: prelim_conseqs[rname]
        for rname in map_counts if rname in prelim_conseqs
    }

    # start remapping loop
    n_remaps = 0
    new_counts = Counter()
    unmapped_count = raw_count
    while conseqs:
        # reset unmapped files with each iteration
        unmapped1.seek(0)
        unmapped1.truncate()
        unmapped2.seek(0)
        unmapped2.truncate()

        if debug_file_prefix is None:
            next_debug_prefix = None
        else:
            next_debug_prefix = '{}_remap{}'.format(debug_file_prefix,
                                                    n_remaps + 1)
        unmapped_count = map_to_reference(fastq1,
                                          fastq2,
                                          conseqs,
                                          reffile,
                                          samfile,
                                          unmapped1,
                                          unmapped2,
                                          bowtie2,
                                          bowtie2_build,
                                          raw_count,
                                          rdgopen,
                                          rfgopen,
                                          new_counts,
                                          stderr,
                                          callback,
                                          debug_file_prefix=next_debug_prefix)

        old_seed_names = set(conseqs.keys())
        # regenerate consensus sequences
        distance_report = {}
        conseqs = build_conseqs(
            samfile,
            seeds=conseqs,
            is_filtered=True,
            filter_coverage=count_threshold // 2,  # pairs
            distance_report=distance_report,
            original_seeds=seeds)
        new_seed_names = set(conseqs.keys())
        n_remaps += 1
        write_remap_counts(remap_counts_writer,
                           new_counts,
                           title='remap-{}'.format(n_remaps),
                           distance_report=distance_report)

        if new_seed_names == old_seed_names:
            # stopping criterion 1 - none of the regions gained reads
            if all((count <= map_counts[refname])
                   for refname, count in new_counts.items()):
                break

            # stopping criterion 2 - a sufficient fraction of raw data has been mapped
            mapping_efficiency = sum(new_counts.values()) / float(raw_count)
            if mapping_efficiency > MIN_MAPPING_EFFICIENCY:
                break

            if n_remaps >= MAX_REMAPS:
                break

        # deep copy of mapping counts
        map_counts = dict(new_counts)

    # finished iterative phase
    # generate SAM CSV output
    remap_writer = csv.DictWriter(remap_csv,
                                  SAM_FIELDS,
                                  lineterminator=os.linesep)
    remap_writer.writeheader()
    if new_counts:
        splitter = MixedReferenceSplitter(work_path)
        split_counts = Counter()
        # At least one read was mapped, so samfile has relevant data
        with open(samfile) as f:
            for fields in splitter.split(f):
                remap_writer.writerow(dict(zip(SAM_FIELDS, fields)))
        for rname, (split_file1, split_file2) in splitter.splits.items():
            refseqs = {rname: conseqs[rname]}
            unmapped_count += map_to_reference(
                split_file1.name, split_file2.name, refseqs, reffile, samfile,
                unmapped1, unmapped2, bowtie2, bowtie2_build, raw_count,
                rdgopen, rfgopen, split_counts, stderr, callback)
            new_counts.update(split_counts)
            with open(samfile) as f:
                for fields in splitter.walk(f):
                    remap_writer.writerow(dict(zip(SAM_FIELDS, fields)))

    # write consensus sequences and counts
    # record consensus sequences for later use
    remap_conseq_csv.write('region,sequence\n')
    for refname in new_counts.keys():
        # NOTE this is the consensus sequence to which the reads were mapped, NOT the
        # current consensus!
        conseq = conseqs.get(refname) or projects.getReference(refname)
        remap_conseq_csv.write('%s,%s\n' % (refname, conseq))
    write_remap_counts(remap_counts_writer, new_counts, title='remap-final')

    # report number of unmapped reads
    remap_counts_writer.writerow(dict(type='unmapped', count=unmapped_count))
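Examples #2 and #4 call a check_fastq helper where examples #3 and #5 inline the same steps. A sketch consistent with that inline logic (the real MiCall helper may differ):

import logging
import os
import sys

logger = logging.getLogger(__name__)


def check_fastq(filename, gzipped=False):
    """Exit if the FASTQ is missing; ensure a .gz suffix when gzipped."""
    if not os.path.exists(filename):
        logger.error('No FASTQ found at %s', filename)
        sys.exit(1)
    if gzipped and not filename.endswith('.gz'):
        # bowtie2 infers gzip input from the file suffix
        try:
            os.symlink(filename, filename + '.gz')
        except OSError:
            pass  # symbolic link already exists
        filename += '.gz'
    return filename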
Code example #5
File: prelim_map.py Project: tarah28/MiCall
def prelim_map(fastq1,
               fastq2,
               prelim_csv,
               nthreads=BOWTIE_THREADS,
               callback=None,
               rdgopen=READ_GAP_OPEN,
               rfgopen=REF_GAP_OPEN,
               stderr=sys.stderr,
               gzip=False,
               work_path=''):
    """ Run the preliminary mapping step.

    @param fastq1: the file name for the forward reads in FASTQ format
    @param fastq2: the file name for the reverse reads in FASTQ format
    @param prelim_csv: an open file object for the output file - all the reads
        mapped to references in CSV version of the SAM format
    @param nthreads: the number of threads to use.
    @param callback: a function to report progress with three optional
        parameters - callback(message, progress, max_progress)
    @param rdgopen: a penalty for opening a gap in the read sequence.
    @param rfgopen: a penalty for opening a gap in the reference sequence.
    @param stderr: where to write the standard error output from bowtie2 calls.
    @param work_path:  optional path to store working files
    """
    try:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION, BOWTIE_BUILD_PATH, logger)
    except RuntimeError:
        bowtie2 = Bowtie2(BOWTIE_VERSION, BOWTIE_PATH + '-' + BOWTIE_VERSION)
        bowtie2_build = Bowtie2Build(BOWTIE_VERSION,
                                     BOWTIE_BUILD_PATH + '-' + BOWTIE_VERSION,
                                     logger)

    # check that the inputs exist
    if not os.path.exists(fastq1):
        logger.error('No FASTQ found at %s', fastq1)
        sys.exit(1)

    if not os.path.exists(fastq2):
        logger.error('No FASTQ found at %s', fastq2)
        sys.exit(1)

    # append .gz extension if necessary
    if gzip:
        if not fastq1.endswith('.gz'):
            try:
                os.symlink(fastq1, fastq1 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq1 += '.gz'

        if not fastq2.endswith('.gz'):
            try:
                os.symlink(fastq2, fastq2 + '.gz')
            except OSError:
                # symbolic link already exists
                pass
            fastq2 += '.gz'

    if callback:
        # four lines per read, two files
        total_reads = line_counter.count(fastq1, gzip=gzip) / 2
        callback(message='... preliminary mapping',
                 progress=0,
                 max_progress=total_reads)

    # generate initial reference files
    projects = project_config.ProjectConfig.loadDefault()
    ref_path = os.path.join(work_path, 'micall.fasta')
    with open(ref_path, 'w') as ref:
        projects.writeSeedFasta(ref)
    reffile_template = os.path.join(work_path, 'reference')
    bowtie2_build.build(ref_path, reffile_template)

    # do preliminary mapping
    output = {}
    read_gap_open_penalty = rdgopen
    ref_gap_open_penalty = rfgopen

    # stream output from bowtie2
    bowtie_args = ['--wrapper', 'micall-0',
                   '--quiet',
                   '-x', reffile_template,
                   '-1', fastq1,
                   '-2', fastq2,
                   '--rdg', "{},{}".format(read_gap_open_penalty,
                                           READ_GAP_EXTEND),
                   '--rfg', "{},{}".format(ref_gap_open_penalty,
                                           REF_GAP_EXTEND),
                   '--no-hd',  # no header lines (start with @)
                   '-X', '1200',
                   '-p', str(nthreads)]

    for i, line in enumerate(bowtie2.yield_output(bowtie_args, stderr=stderr)):
        if callback and i % 1000 == 0:
            callback(progress=i)
        fields = line.split('\t')
        refname = fields[2]  # read was mapped to this reference
        output.setdefault(refname, []).append(fields[:11])  # discard optional items

    fieldnames = [
        'qname', 'flag', 'rname', 'pos', 'mapq', 'cigar', 'rnext', 'pnext',
        'tlen', 'seq', 'qual'
    ]
    writer = csv.DictWriter(prelim_csv, fieldnames, lineterminator=os.linesep)
    writer.writeheader()

    # lines grouped by refname
    for refname, lines in output.iteritems():
        for line in lines:
            writer.writerow(dict(zip(fieldnames, line)))

    if callback:
        # Track progress for second half
        callback(progress=total_reads)
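Examples #3 to #5 document the same callback protocol: callback(message, progress, max_progress), with every parameter optional. A minimal conforming implementation, for illustration only:

def report_progress(message=None, progress=None, max_progress=None):
    """Print a status line; all arguments are optional per the protocol."""
    if message is not None:
        print(message)
    if progress is not None:
        if max_progress:
            print('{} of {} reads'.format(progress, max_progress))
        else:
            print('{} reads'.format(progress))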