class MappableRegionsPipeline(object):
    def __init__(self, config, aligner=None):
        """Store the configuration, build the genome and pick the aligner.

        When no ``aligner`` is supplied, the ``aligner`` attribute of the
        genome object is used instead; either way one must be available.
        """
        self.config = config
        self.genome = Genome(self.config)
        if aligner is None:
            # Fall back to the aligner attached to the genome.
            assert self.genome.aligner is not None
            aligner = self.genome.aligner
        self.aligner = aligner
        assert self.aligner is not None

    def mappable_regions_check(self, chroms, mappable_regions_df):

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        try:
            for chrom in chroms:
                seq_record = self.genome.load_chrom(chrom)
                for i in range(len(seq_record) - read_length + 1):
                    seq = seq_record.seq[i:i + read_length]
                    out_record = SeqRecord(seq,
                                           id="{}.{}".format(chrom, i + 1),
                                           description="generated_read")
                    # if 'N' in seq:
                    #     print('skipping: ', out_record)
                    #     continue
                    yield out_record
        finally:
            pass

    def generate_mappable_regions(self,
                                  chroms,
                                  read_length,
                                  outfile=None,
                                  aligner_options=None):
        """Pipe generated reads through the aligner and write its output.

        Reads produced by ``generate_reads`` are handed, together with the
        aligner's stdin, to an ``InputGeneratorThread``; the aligner's
        stdout is handed to an ``AlignerOutputProcessingThread`` along with
        a callback that writes ``str(line)`` plus a newline to ``outfile``.
        The method returns after ``'out_done'`` is received on the control
        queue and both threads have been joined.

        :param chroms: chromosomes to generate reads from.
        :param read_length: length of every generated read.
        :param outfile: writable text stream; defaults to ``sys.stdout``.
        :param aligner_options: extra options forwarded to
            ``build_mappable_regions_command``; defaults to an empty list.
        """
        if outfile is None:
            outfile = sys.stdout
        if aligner_options is None:
            # A fresh list per call: a literal list default would be shared
            # by every invocation.
            aligner_options = []

        reads_generator = self.generate_reads(chroms, read_length)

        def aligner_output_process_function(line):
            outfile.write(str(line))
            outfile.write("\n")

        aligner_command = self.aligner.build_mappable_regions_command(
            options=aligner_options)
        print('aligner command:', ' '.join(aligner_command))

        with Popen(aligner_command, stdout=PIPE, stdin=PIPE) as proc:

            control_queue = queue.Queue()
            input_thread = InputGeneratorThread(control_queue, proc.stdin,
                                                reads_generator)
            output_thread = AlignerOutputProcessingThread(
                control_queue, proc.stdout, aligner_output_process_function)

            input_thread.start()
            output_thread.start()

            # get() blocks without a timeout, so queue.Empty cannot be
            # raised here; the loop simply waits for the next message.
            while True:
                msg = control_queue.get()
                if msg == 'out_done':
                    print("output done")
                    break
                if msg == 'in_done':
                    print('input done')
            input_thread.join()
            output_thread.join()

    def mappable_regions_chrom_filename(self, chrom):
        mname = "{}_{}".format(chrom,
                               self.config.mappable_regions.mappable_file)
        filename = os.path.join(self.config.mappable_regions.mappable_dir,
                                mname)
        return filename

    def mappable_regions_filename(self):
        mname = self.config.mappable_regions.mappable_file
        filename = os.path.join(self.config.mappable_regions.mappable_dir,
                                mname)
        return filename

    def run_once(self, chrom):
        """Generate the mappable regions file of a single chromosome.

        The output goes to the path given by
        ``mappable_regions_chrom_filename(chrom)``, which is also the
        return value.  The read length is taken from
        ``config.mappable_regions.mappable_read_length`` -- the value
        ``run`` reports when it starts -- instead of a hard-coded 50.
        """
        outfilename = self.mappable_regions_chrom_filename(chrom)
        read_length = self.config.mappable_regions.mappable_read_length
        with open(outfilename, "w") as outfile:
            self.generate_mappable_regions([chrom],
                                           read_length=read_length,
                                           outfile=outfile)
        return outfilename

    def concatenate_all_chroms(self):
        dst = self.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    "destination mappable regions file already exists"
                    "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.genome.version.CHROMS:
                    src = self.mappable_regions_chrom_filename(chrom)
                    print(
                        colored("appending {} to {}".format(src, dst),
                                "green"))
                    with open(src, 'rb') as src:
                        if not self.config.dry_run:
                            shutil.copyfileobj(src, output, 1024 * 1024 * 10)

    def run(self, dask_client):
        """Generate mappable regions for every chromosome and merge them.

        The method announces the job, refuses to overwrite an existing
        output unless ``config.force`` is set, checks that the first genome
        index file of the aligner exists, and stops there on a dry run.
        Otherwise ``run_once`` is mapped over ``genome.version.CHROMS``
        through ``dask_client``, the state of each future is printed, and
        the per-chromosome files are concatenated.

        :raises ValueError: when the output file exists without force, or
            when the genome index file is missing.
        """
        outfilename = self.mappable_regions_filename()
        regions_config = self.config.mappable_regions
        start_message = (
            "going to generate mappable regions with length {} "
            "from genome {} into {}").format(
                regions_config.mappable_read_length,
                self.config.genome.genome_dir, outfilename)
        print(colored(start_message, "green"))

        if os.path.exists(outfilename) and not self.config.force:
            exists_message = (
                "output file {} already exists; "
                "use --force to overwrite").format(outfilename)
            print(colored(exists_message, "red"))
            raise ValueError("output file already exists")

        index_filenames = self.aligner.genome_index_filenames
        if not os.path.exists(index_filenames[0]):
            missing_message = "genome index file {} not found".format(
                index_filenames)
            print(colored(missing_message, "red"))
            raise ValueError("genome index file not found")

        if self.config.dry_run:
            return

        os.makedirs(self.config.mappable_regions.mappable_dir, exist_ok=True)

        assert dask_client

        futures = dask_client.map(self.run_once, self.genome.version.CHROMS)
        distributed.wait(futures)

        # Report the outcome of every per-chromosome task.
        for future in futures:
            print("fut done:", future.done())
            print("fut exception:", future.exception())
            print("fut traceback:", future.traceback())
            print("fut result:", future.result())

        self.concatenate_all_chroms()
# Example no. 2
# 0
class BinsPipeline(object):
    def __init__(self, config):
        """Keep ``config`` and build a ``Genome`` from it."""
        self.config = config
        genome = Genome(self.config)
        self.genome = genome

    def calc_bins_gc_content(self, chroms, bins_df):

        result = []
        for chrom in chroms:
            chrom_df = bins_df[bins_df['bin.chrom'] == chrom]
            gc_df = chrom_df.copy()
            gc_df.reset_index(inplace=True, drop=True)

            gc_series = pd.Series(index=gc_df.index)
            chrom_seq = self.genome.load_chrom(chrom)

            for index, row in gc_df.iterrows():
                start = row['bin.start']
                end = row['bin.end']
                seq = chrom_seq.seq[start:end]
                counts = [seq.count(x) for x in ['G', 'C', 'A', 'T']]
                total_counts = sum(counts)
                if total_counts == 0:
                    gc = 0.0
                else:
                    gc = float(sum(counts[0:2])) / sum(counts)
                gc_series.iloc[index] = gc

            gc_df['gc.content'] = gc_series
            result.append(gc_df)
        assert len(result) > 0
        if len(result) == 1:
            return result[0]
        df = pd.concat(result)
        return df

    def bins_boundaries_generator(self, chroms, mappable_regions_df):
        """Yield bin objects built from the mappable regions of ``chroms``.

        Chromosome sizes and per-chromosome bin parameters come from
        ``self.genome``.  For each chromosome the regions (rows whose
        ``chrom`` matches) are visited in ``start_pos``/``end_pos`` order
        and fed to the current bin; whenever ``check_extend`` reports that
        a region does not fit, the bin is split with ``split_extend`` and
        the finished part is yielded.

        ``mappable_regions_df`` must be a frame with ``chrom``,
        ``start_pos`` and ``end_pos`` columns; ``None`` is not handled
        here.

        NOTE(review): the bin still being filled when the regions of a
        chromosome run out is dropped without being yielded -- presumably
        the bin parameters guarantee the last bin is emitted inside the
        loop; confirm against ``MappableBin``/``BinParams``.
        """
        chrom_sizes = self.genome.chrom_sizes()
        chrom_bins = self.genome.calc_chrom_bins()

        # if mappable_regions_df is None:
        #     mappable_regions_df = self.load_mappable_regions()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])

            params = BinParams.build(chrom_size=chrom_sizes[chrom],
                                     chrom_bin=chrom_bins[chrom])
            mappable_bin = None
            current_excess = 0
            # Countdown of bins still to be emitted for this chromosome.
            bins_count = params.bins_count

            for row in chrom_df.to_dict(orient="records"):
                if mappable_bin is None:
                    # The first bin of a chromosome is created lazily, on
                    # the first region, and starts at position 0.
                    mappable_bin = MappableBin.from_start(params, start_pos=0)
                    current_excess = mappable_bin.adapt_excess(current_excess)
                if not mappable_bin.check_extend(row):
                    # The region does not fit: close the current bin and
                    # continue with the bin returned by the split.
                    next_bin = mappable_bin.split_extend(row)

                    bins_count -= 1
                    if bins_count == 0:
                        # Last bin of the chromosome: stretch it to the
                        # chromosome size.
                        mappable_bin.end_pos = chrom_sizes[chrom].size
                    yield mappable_bin
                    if next_bin.is_overfill():
                        # The new bin is itself overfull: split it, emit
                        # every piece but the last and keep filling that
                        # last piece.
                        current_excess, mappable_bins = \
                            next_bin.overfill_split(current_excess)

                        assert len(mappable_bins) > 1
                        for mb in mappable_bins[:-1]:
                            bins_count -= 1
                            yield mb
                        mappable_bin = mappable_bins[-1]
                    else:
                        mappable_bin = next_bin
                        current_excess = \
                            mappable_bin.adapt_excess(current_excess)
                # print("mappable_bin:", row, mappable_bin)
            # Reset so the next chromosome starts with a fresh first bin.
            mappable_bin = None

    def calc_bins_boundaries(self, chroms=None, regions_df=None):
        if chroms is None:
            chroms = self.genome.version.CHROMS
        bin_rows = []
        for mbin in self.bins_boundaries_generator(chroms, regions_df):
            # print("mbin:", mbin)
            bin_rows.append(
                (mbin.chrom, mbin.start_pos, mbin.start_abspos, mbin.end_pos,
                 mbin.end_pos - mbin.start_pos, mbin.bin_size))

        df = pd.DataFrame.from_records(bin_rows,
                                       columns=[
                                           'bin.chrom', 'bin.start',
                                           'bin.start.abspos', 'bin.end',
                                           'bin.length', 'mappable.positions'
                                       ])
        df.sort_values(by=['bin.start.abspos'], inplace=True)
        return df

    def load_mappable_regions(self, chrom=None):
        filename = self.config.mappable_regions_filename(chrom=chrom)

        df = pd.read_csv(self.config.mappable_regions_filename(),
                         names=['chrom', 'start_pos', 'end_pos'],
                         sep='\t')
        df = df.sort_values(by=['chrom', 'start_pos', 'end_pos'])
        assert len(df) > 0

        return df

    def run_once(self, chrom):
        """Compute and save the bins of one chromosome.

        The chromosome's mappable regions are loaded, bin boundaries and
        GC content are calculated, and the result is written as a
        tab-separated file to ``config.bins_boundaries_filename(chrom)``,
        whose path is returned.
        """
        start_message = f"started calculating bins for chromosome {chrom}"
        print(colored(start_message, "green"))

        regions_df = self.load_mappable_regions(chrom=chrom)
        boundaries_df = self.calc_bins_boundaries([chrom], regions_df)
        gc_bins_df = self.calc_bins_gc_content([chrom], boundaries_df)

        outfilename = self.config.bins_boundaries_filename(chrom)
        save_message = (f"saving bins for chromosome {chrom} "
                        f"into {outfilename}")
        print(colored(save_message, "green"))
        gc_bins_df.to_csv(outfilename, sep='\t', index=False)
        return outfilename

    def concatenate_all_chroms(self):
        outfilename = self.config.bins_boundaries_filename()
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "destination bins boundaries file already exists"
                    "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if self.config.dry_run:
            return

        dataframes = []
        for chrom in self.genome.version.CHROMS:
            srcfilename = self.config.bins_boundaries_filename(chrom)
            df = pd.read_csv(srcfilename, sep='\t')
            dataframes.append(df)
        outdf = pd.concat(dataframes, ignore_index=True)
        outdf.sort_values(by=['bin.start.abspos', 'bin.start', 'bin.end'],
                          inplace=True)

        outdf.to_csv(outfilename, sep='\t', index=False)

    def run(self, dask_client):
        """Compute the bins of every chromosome and merge them.

        The method announces the job, refuses to overwrite an existing
        output unless ``config.force`` is set and stops there on a dry
        run.  Otherwise the output directory is created, ``run_once`` is
        mapped over ``genome.version.CHROMS`` through ``dask_client``, the
        produced files are reported and the per-chromosome results are
        concatenated.

        :raises ValueError: when the output file exists and
            ``config.force`` is not set.
        """
        outfilename = self.config.bins_boundaries_filename()

        print(
            colored(
                "going to compute bin boundaries from mappable regions: {} "
                "into bins boundaries file {}".format(
                    self.config.mappable_regions_filename(), outfilename),
                "green"))
        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename), "red"))
            raise ValueError("output file already exists")

        if self.config.dry_run:
            return

        # The directory is created only for a real run, so a dry run
        # leaves the file system untouched.  A bare file name has an empty
        # dirname, which os.makedirs would reject.
        outdir = os.path.dirname(outfilename)
        if outdir:
            os.makedirs(outdir, exist_ok=True)

        assert self.genome.chrom_sizes() is not None

        delayed_tasks = dask_client.map(self.run_once,
                                        self.genome.version.CHROMS)
        print(len(delayed_tasks), delayed_tasks)
        print(dask_client.scheduler_info())

        distributed.wait(delayed_tasks)
        for task in delayed_tasks:
            outfile = task.result()
            print(outfile, os.path.exists(outfile))

        self.concatenate_all_chroms()
# Example no. 3
# 0
class MappableRegionsPipeline(object):
    def __init__(self, config):
        """Keep ``config`` and build a ``Genome`` from it as ``self.hg``."""
        self.config = config
        genome = Genome(self.config)
        self.hg = genome

    def mappable_regions_check(self, chroms, mappable_regions_df):
        # if mappable_regions_df is None:
        #     mappable_regions_df = self.load_mappable_regions()

        for chrom in chroms:
            chrom_df = mappable_regions_df[mappable_regions_df.chrom == chrom]
            chrom_df = chrom_df.sort_values(
                by=['chrom', 'start_pos', 'end_pos'])
            start_pos_count = len(chrom_df.start_pos.unique())
            if start_pos_count < len(chrom_df):
                LOG.error(
                    "chrom {} has duplicate mappable regions".format(chrom))

    def generate_reads(self, chroms, read_length):
        try:
            for chrom in chroms:
                seq_record = self.hg.load_chrom(chrom)
                for i in range(len(seq_record) - read_length + 1):
                    out_record = SeqRecord(seq_record.seq[i:i + read_length],
                                           id="{}.{}".format(chrom, i + 1),
                                           description="generated_read")
                    yield out_record
        finally:
            pass

    async def async_start_bowtie(self, bowtie_opts=""):
        """Start a ``bowtie`` subprocess with piped stdin and stdout.

        The command line is ``bowtie -S -t -v 0 -m 1``, followed by any
        extra options, then ``-f <genome index> -``.

        :param bowtie_opts: whitespace-separated extra options; an empty
            value adds nothing.
        :return: the process object created by
            ``asyncio.create_subprocess_exec``.
        """
        genomeindex = self.config.genome_index_filename()
        # split() without an argument discards the empty strings that
        # repeated, leading or trailing blanks would otherwise turn into
        # empty command-line arguments.
        extra_opts = bowtie_opts.split() if bowtie_opts else []
        command = [
            'bowtie',
            '-S',
            '-t',
            '-v',
            '0',
            '-m',
            '1',
            *extra_opts,
            '-f',
            genomeindex,
            '-',
        ]
        print(
            colored("going to execute bowtie: {}".format(" ".join(command)),
                    "green"))
        proc = await asyncio.create_subprocess_exec(
            *command,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
        )
        return proc

    @staticmethod
    async def async_write_fasta(outfile, rec):
        """Write ``rec`` as a FASTA string to ``outfile`` and drain it."""
        fasta_text = Genome.to_fasta_string(rec)
        outfile.write(fasta_text)
        await outfile.drain()

    async def async_write_reads_generator(self, out, reads_generator):
        for rec in reads_generator:
            await self.async_write_fasta(out, rec)
        out.close()

    async def async_mappings_generator(self, reads_generator, bowtie):
        writer = asyncio.Task(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))

        while True:
            line = await bowtie.stdout.readline()
            if not line:
                break
            yield line.decode()

        await bowtie.wait()
        await writer

    async def async_generate_mappings(self, chroms, read_length, outfile=None):
        """Write the raw bowtie output for the generated reads.

        A bowtie process is started with the default options, reads of
        ``read_length`` from ``chroms`` are streamed through it and every
        output line is written to ``outfile`` (``sys.stdout`` when
        omitted).
        """
        sink = sys.stdout if outfile is None else outfile

        bowtie = await self.async_start_bowtie()
        reads = self.generate_reads(chroms, read_length)
        mapping_lines = self.async_mappings_generator(reads, bowtie)
        async for mapping_line in mapping_lines:
            sink.write(mapping_line)

    async def async_generate_mappable_regions(self,
                                              chroms,
                                              read_length,
                                              outfile=None,
                                              bowtie_opts=""):
        """Write the mappable regions found for the generated reads.

        A bowtie process is started with ``bowtie_opts``, and reads of
        ``read_length`` from ``chroms`` are written to its stdin by a
        concurrent task.  Its stdout is turned into regions by
        ``async_mappable_regions_generator`` and each region is written
        as ``str(region)`` plus a newline to ``outfile`` (``sys.stdout``
        when omitted).  Finally the process and the writer task are
        awaited.
        """
        bowtie = await self.async_start_bowtie(bowtie_opts=bowtie_opts)
        reads_generator = self.generate_reads(chroms, read_length)
        # ensure_future() schedules the coroutine on the running loop;
        # constructing asyncio.Task objects by hand is discouraged.
        writer = asyncio.ensure_future(
            self.async_write_reads_generator(bowtie.stdin, reads_generator))
        if outfile is None:
            outfile = sys.stdout
        async for mapping in self.async_mappable_regions_generator(
                bowtie.stdout):
            outfile.write(str(mapping))
            outfile.write('\n')
        await bowtie.wait()
        await writer

    async def async_mappable_regions_generator(self, infile):
        """Yield regions built from consecutive flag-0 SAM lines.

        Lines are read from ``infile`` until an empty read; lines starting
        with ``@`` are skipped.  A mapping whose flag is 0 opens a region
        or extends the open one while the chromosome stays the same; a
        mapping with any other flag, or a flag-0 mapping on another
        chromosome, emits the open region.  A region still open at the end
        of the input is emitted as well.
        """
        region = None
        state = MappableState.OUT

        while True:
            raw_line = await infile.readline()
            if not raw_line:
                break
            text = raw_line.decode()
            if text[0] == '@':
                # comment
                continue

            mapping = Mapping.parse_sam(text)
            flag_is_zero = mapping.flag == 0

            if state == MappableState.OUT:
                if flag_is_zero:
                    region = MappableRegion(mapping)
                    state = MappableState.IN
                continue

            # From here on a region is open.
            if not flag_is_zero:
                yield region
                state = MappableState.OUT
                continue

            if mapping.chrom == region.chrom:
                region.extend(mapping.start)
                continue

            # Chromosome changed: close the open region and start anew.
            yield region
            region = MappableRegion(mapping)

        if state == MappableState.IN:
            yield region

    def run_once(self, chrom):
        """Generate the mappable regions file of a single chromosome.

        The regions are written to ``config.mappable_regions_filename(chrom)``
        by running ``async_generate_mappable_regions`` to completion on the
        current event loop, with the read length and bowtie options taken
        from ``config.mappable_regions``.
        """
        loop = asyncio.get_event_loop()

        outfilename = self.config.mappable_regions_filename(chrom)
        with open(outfilename, "w") as outfile:
            regions_job = self.async_generate_mappable_regions(
                [chrom],
                self.config.mappable_regions.length,
                outfile=outfile,
                bowtie_opts=self.config.mappable_regions.bowtie_opts)
            loop.run_until_complete(regions_job)

    def concatenate_all_chroms(self):
        dst = self.config.mappable_regions_filename()
        if os.path.exists(dst) and not self.config.force:
            print(
                colored(
                    "destination mappable regions file already exists"
                    "use --force to overwrite", "red"))
            raise ValueError("destination file exists... use --force")

        if not self.config.dry_run:
            with open(dst, 'wb') as output:
                for chrom in self.hg.version.CHROMS:
                    src = self.config.mappable_regions_filename(chrom)
                    print(
                        colored("appending {} to {}".format(src, dst),
                                "green"))
                    with open(src, 'rb') as src:
                        if not self.config.dry_run:
                            shutil.copyfileobj(src, output, 1024 * 1024 * 10)

    def run(self):
        """Generate mappable regions for every chromosome and merge them.

        The method announces the job, refuses to overwrite an existing
        output unless ``config.force`` is set, checks that the genome
        index file exists and stops there on a dry run.  Otherwise the
        work directory is created, ``run_once`` is mapped over
        ``hg.version.CHROMS`` in a pool of ``config.parallel`` processes,
        and the per-chromosome files are concatenated.

        :raises ValueError: when the output file exists without force, or
            when the genome index file is missing.
        """
        outfilename = self.config.mappable_regions_filename()
        print(
            colored(
                "going to generate mappable regions with length {} "
                "from genome {} into {}".format(
                    self.config.mappable_regions.length,
                    self.config.genome.work_dir, outfilename), "green"))

        if os.path.exists(outfilename) and not self.config.force:
            print(
                colored(
                    "output file {} already exists; "
                    "use --force to overwrite".format(outfilename), "red"))
            raise ValueError("output file already exists")

        if not self.config.genome_index_filename_exists():
            print(
                colored(
                    "genome index file {} not found".format(
                        self.config.genome_index_filename()), "red"))
            raise ValueError("genome index file not found")

        if self.config.dry_run:
            return

        # exist_ok avoids the race between an existence test and creation.
        os.makedirs(self.config.mappable_regions.work_dir, exist_ok=True)

        # The context manager releases the worker processes once the
        # blocking map() call has returned.
        with multiprocessing.Pool(processes=self.config.parallel) as pool:
            pool.map(self.run_once, self.hg.version.CHROMS)

        self.concatenate_all_chroms()