Example no. 1
0
def write_output(stats_file_path, image_index, base_name, fastq_image_aligner, path_info, all_tile_data, make_pdfs, um_per_pixel):
    """Persist the results of an image alignment, unless a better one already exists.

    Writes the serialized alignment stats, the corrected (row, column) location of
    every read, and (optionally) diagnostic PDFs visualizing the alignment.

    Parameters:
        stats_file_path: path of the alignment-stats file for this image.
        image_index: index of the image within its field of view.
        base_name: base name of the HDF5 file / output subdirectory.
        fastq_image_aligner: FastqImageAligner holding the newly-computed alignment.
        path_info: PathInfo describing the output directory layout.
        all_tile_data: tile read-name data for ALL reads (not just alignment reads).
        make_pdfs: whether to save diagnostic PDF plots.
        um_per_pixel: microns per pixel of the imaging system.

    Returns:
        True if the new alignment was saved, False if it was discarded because an
        existing alignment scored at least as well.
    """
    all_read_rcs_filepath = os.path.join(path_info.results_directory, base_name, '{}_all_read_rcs.txt'.format(image_index))

    # if we've already aligned this channel with a different strategy, the current alignment may or may not be better
    # here we load some data so we can make that comparison
    existing_score = load_existing_score(stats_file_path)

    new_stats = fastq_image_aligner.alignment_stats
    # BUGFIX: the old condition (existing_score > 0) skipped whenever ANY prior
    # alignment existed, even one worse than the new alignment — contradicting
    # the comparison described in the comment above. Only skip when the existing
    # alignment is at least as good as the new one.
    if existing_score >= new_stats.score:
        log.debug("Alignment already exists for %s/%s, skipping. Score difference: %d." % (base_name, image_index, (new_stats.score - existing_score)))
        return False

    # save information about how to align the images
    log.info("Saving alignment with score of %s\t\t%s" % (new_stats.score, base_name))
    with open(stats_file_path, 'w') as f:
        f.write(new_stats.serialized)

    # save the corrected location of each read; a fresh aligner is used to
    # propagate the fitted transform from the alignment reads to all reads
    all_fastq_image_aligner = fastqimagealigner.FastqImageAligner(um_per_pixel)
    all_fastq_image_aligner.all_reads_fic_from_aligned_fic(fastq_image_aligner, all_tile_data)
    with open(all_read_rcs_filepath, 'w') as f:
        for line in all_fastq_image_aligner.read_names_rcs:
            f.write(line)

    # save some diagnostic PDFs that give a nice visualization of the alignment
    if make_pdfs:
        ax = plotting.plot_all_hits(fastq_image_aligner)
        ax.figure.savefig(os.path.join(path_info.figure_directory, base_name, '{}_all_hits.pdf'.format(image_index)))
        plt.close()
        ax = plotting.plot_hit_hists(fastq_image_aligner)
        ax.figure.savefig(os.path.join(path_info.figure_directory, base_name, '{}_hit_hists.pdf'.format(image_index)))
        plt.close()
    # explicitly drop the (large) aligners so memory is reclaimed promptly in
    # long-running multiprocess workers
    del all_fastq_image_aligner
    del fastq_image_aligner
    return True
Example no. 2
0
def run_data_channel(cluster_strategy, h5_filenames, channel_name, path_info, alignment_tile_data, all_tile_data, metadata, clargs, process_limit):
    """Align every image of one data (protein) channel across all HDF5 files.

    Uses the already-completed alignment-channel (phiX) results as the starting
    point, fanning the per-image work out over a multiprocessing pool. A fresh
    pool is created (and closed) for each HDF5 file.

    Parameters:
        cluster_strategy: clustering strategy forwarded to process_data_image.
        h5_filenames: HDF5 files to process.
        channel_name: name of the data channel being aligned.
        path_info: PathInfo describing the output directory layout.
        alignment_tile_data: read names used for alignment, loaded into the aligner.
        all_tile_data: read-name data for all reads, forwarded to the workers.
        metadata: experiment metadata dict; 'microns_per_pixel' and
            'alignment_channel' are read here.
        clargs: parsed command-line arguments (microns_per_pixel, make_pdfs,
            min_hits are read here).
        process_limit: if > 0, caps the number of worker processes.
    """
    image_count = count_images(h5_filenames, channel_name)
    num_processes, chunksize = calculate_process_count(image_count)
    # process_limit <= 0 means "no user-imposed cap"
    if process_limit > 0:
        num_processes = min(process_limit, num_processes)
    log.debug("Aligning data images with %d cores with chunksize %d" % (num_processes, chunksize))

    log.debug("Loading reads into FASTQ Image Aligner.")
    fastq_image_aligner = fastqimagealigner.FastqImageAligner(metadata['microns_per_pixel'])
    fastq_image_aligner.load_reads(alignment_tile_data)
    log.debug("Reads loaded.")
    # Pre-bind everything except the final argument (the per-image aligned
    # stats file), which the pool supplies from load_aligned_stats_files().
    second_processor = functools.partial(process_data_image, cluster_strategy, path_info, all_tile_data,
                                         clargs.microns_per_pixel, clargs.make_pdfs,
                                         channel_name, fastq_image_aligner, clargs.min_hits)
    for h5_filename in h5_filenames:
        # One pool per file: workers are torn down between files so their
        # memory is returned to the OS before the next file starts.
        pool = multiprocessing.Pool(num_processes)
        log.debug("Doing second channel alignment of all images with %d cores" % num_processes)
        # map_async(...).get(sys.maxint) instead of a blocking map(): the huge
        # timeout makes the main process interruptible with Ctrl-C (a Python 2
        # idiom — sys.maxint does not exist in Python 3).
        pool.map_async(second_processor,
                       load_aligned_stats_files([h5_filename], metadata['alignment_channel'], path_info),
                       chunksize=chunksize).get(sys.maxint)
        pool.close()
        pool.join()
        gc.collect()

    log.debug("Done aligning!")
Example no. 3
0
def main(clargs):
    """Top-level alignment pipeline entry point.

    Preprocesses raw images (once), aligns the phiX fiducial channel, then
    precision-aligns each protein channel using the phiX alignment as a
    starting point. Progress is checkpointed in a cache so interrupted runs
    can resume without repeating completed stages.
    """
    metadata = initialize.load_metadata(clargs.image_directory)
    cache = initialize.load_cache(clargs.image_directory)
    # raw-image preprocessing happens exactly once; the cache records completion
    if not cache['preprocessed']:
        preprocess(clargs.image_directory, cache)

    h5_filenames = load_filenames(clargs.image_directory)
    if len(h5_filenames) == 0:
        error.fail(
            "There were no HDF5 files to process. You must have deleted or moved them after preprocessing them."
        )

    path_info = PathInfo(clargs.image_directory, metadata['mapped_reads'],
                         metadata['perfect_target_name'],
                         metadata['alternate_fiducial_reads'],
                         metadata['alternate_perfect_target_reads_filename'],
                         metadata['alternate_good_target_reads_filename'])
    # Ensure we have the directories where output will be written
    align.make_output_directories(h5_filenames, path_info)

    log.debug("Loading tile data.")
    # chip.load() returns a chip class; it is immediately instantiated with the
    # port orientation
    sequencing_chip = chip.load(metadata['chip_type'])(
        metadata['ports_on_right'])

    alignment_tile_data = align.load_read_names(
        path_info.aligning_read_names_filepath)
    perfect_tile_data = align.load_read_names(path_info.perfect_read_names)
    on_target_tile_data = align.load_read_names(path_info.on_target_read_names)
    all_tile_data = align.load_read_names(path_info.all_read_names_filepath)
    log.debug("Tile data loaded.")

    # We use one process per concentration. We could theoretically speed this up since our machine
    # has significantly more cores than the typical number of concentration points, but since it
    # usually finds a result in the first image or two, it's not going to deliver any practical benefits
    log.debug("Loading FastQImageAligner")
    fia = fastqimagealigner.FastqImageAligner(clargs.microns_per_pixel)
    fia.load_reads(alignment_tile_data)
    log.debug("Loaded %s points" %
              sum([len(v) for v in alignment_tile_data.values()]))
    log.debug("FastQImageAligner loaded.")

    # End-tile discovery is expensive, so the result is cached across runs
    if 'end_tiles' not in cache:
        end_tiles = align.get_end_tiles(cluster_strategies,
                                        clargs.rotation_adjustment,
                                        h5_filenames,
                                        metadata['alignment_channel'],
                                        clargs.snr, metadata, sequencing_chip,
                                        fia)
        cache['end_tiles'] = end_tiles
        initialize.save_cache(clargs.image_directory, cache)
    else:
        log.debug("End tiles already calculated.")
        end_tiles = cache['end_tiles']
    gc.collect()

    if not cache['phix_aligned']:
        for cluster_strategy in cluster_strategies:
            align.run(cluster_strategy, clargs.rotation_adjustment,
                      h5_filenames, path_info, clargs.snr, clargs.min_hits,
                      fia, end_tiles, metadata['alignment_channel'],
                      all_tile_data, metadata, clargs.make_pdfs,
                      sequencing_chip, clargs.process_limit)
            # the checkpoint is saved after every strategy so a crash mid-loop
            # does not repeat completed work
            cache['phix_aligned'] = True
            initialize.save_cache(clargs.image_directory, cache)
        # NOTE(review): this "else" binds to the "for" loop above (for/else
        # semantics), so "Phix already aligned." is logged every time the loop
        # finishes without a break — it looks like it was meant to be the else
        # branch of "if not cache['phix_aligned']:". Confirm intent before
        # relying on this log line.
        else:
            log.debug("Phix already aligned.")

    if clargs.fiducial_only:
        # the user doesn't want us to align the protein channels
        exit(0)

    gc.collect()
    # every imaged channel except the fiducial/phiX channel is a protein channel
    protein_channels = [
        channel
        for channel in projectinfo.load_channels(clargs.image_directory)
        if channel != metadata['alignment_channel']
    ]
    if protein_channels:
        log.debug("Protein channels found: %s" % ", ".join(protein_channels))
    else:
        # protein is in phix channel, hopefully?
        log.warn(
            "No protein channels detected. Assuming protein is in phiX channel: %s"
            % [metadata['alignment_channel']])
        protein_channels = [metadata['alignment_channel']]

    for channel_name in protein_channels:
        # Attempt to precision align protein channels using the phix channel alignment as a starting point.
        # Not all experiments have "on target" or "perfect target" reads - that only applies to CRISPR systems
        # (at the time of this writing anyway)
        for cluster_strategy in cluster_strategies:
            gc.collect()
            if on_target_tile_data:
                channel_combo = channel_name + "_on_target"
                combo_align(cluster_strategy, h5_filenames, channel_combo,
                            channel_name, path_info, on_target_tile_data,
                            all_tile_data, metadata, cache, clargs)
            gc.collect()
            if perfect_tile_data:
                channel_combo = channel_name + "_perfect_target"
                combo_align(cluster_strategy, h5_filenames, channel_combo,
                            channel_name, path_info, perfect_tile_data,
                            all_tile_data, metadata, cache, clargs)
            gc.collect()