Example #1
import logging

from pysam import FastaFile

logger = logging.getLogger(__name__)


class refcache:
    def __init__(self, fasta_file, cacheSize=5000000):
        self.fasta_file = fasta_file
        self.FA = FastaFile(fasta_file)
        self.chroms = self.FA.references
        self._get_offsets()
        self.chrom_qualities = {
            chrom: self.detect_quality(chrom)
            for chrom in self.chroms
        }
        self.chrom_lens = {
            c: self.FA.get_reference_length(c)
            for c in self.chroms
        }
        self.cacheSize = cacheSize
        self.start = {c: 0 for c in self.chroms}
        self.end = {c: min(cacheSize, self.chrom_lens[c]) for c in self.chroms}
        self.chrom_caches = {
            c: self.FA.fetch(c, 0, self.end[c])
            for c in self.chroms
        }

    def __del__(self):
        self.FA.close()

    def _get_offsets(self):
        self.chrom_offsets = {}
        fai = '%s.fai' % (self.fasta_file)
        with open(fai, 'r') as FAI:
            for split_line in map(lambda x: x.rstrip('\n').split('\t'), FAI):
                self.chrom_offsets[split_line[0]] = int(split_line[2])

    def detect_quality(self, chrom):
        fasta_name = '>%s' % (chrom)
        with open(self.fasta_file, 'r') as FA:
            FA.seek(max(0, self.chrom_offsets[chrom] - 200))
            for line in filter(lambda x: x[0] == '>', FA):
                split_line = line.rstrip('\n').split(' ')
                if split_line[0] == fasta_name:
                    return _split2quality(split_line)

    def fetch(self, chrom, pos, pos2):
        assert (pos2 <= self.chrom_lens[chrom])
        if pos2 - pos + 1 >= self.cacheSize:
            logger.debug(
                "Region was too large for refcache, you should consider increasing the cache size to %i"
                % ((pos2 - pos + 1) * 10))
            return self.FA.fetch(chrom, pos, pos2)
        if pos < self.start[chrom] or pos2 > self.end[chrom]:
            self.start[chrom] = pos
            self.end[chrom] = min(pos + self.cacheSize, self.chrom_lens[chrom])
            self.chrom_caches[chrom] = self.FA.fetch(chrom, self.start[chrom],
                                                     self.end[chrom])
        assert (pos >= self.start[chrom])
        sI = pos - self.start[chrom]
        eI = pos2 - self.start[chrom]
        return self.chrom_caches[chrom][sI:eI]
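A minimal usage sketch, assuming a hypothetical indexed FASTA at reference.fa (with its reference.fa.fai alongside); repeated small fetches inside the cached window are served from the in-memory slice instead of going back to the index:

cache = refcache('reference.fa')
chrom = cache.chroms[0]
seq_a = cache.fetch(chrom, 0, 25)   # fills the cache for this window
seq_b = cache.fetch(chrom, 10, 35)  # served from the same cached slice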
Example #2
class FastaHandler:
    """
    Handles fasta files using pyfaidx API
    """
    def __init__(self, reference_file_path):
        """
        create fasta file object given file path to a fasta reference file
        :param reference_file_path: full path to a fasta reference file
        """

        self.fasta_file_path = reference_file_path

        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except (IOError, ValueError) as e:
            raise IOError("FASTA FILE READ ERROR") from e

    def get_sequence(self, chromosome_name, start, stop):
        """
        Return the sequence of a query region
        :param chromosome_name: Chromosome name
        :param start: Region start
        :param stop: Region end
        :return: Sequence of the region
        """
        return self.fasta.fetch(reference=chromosome_name, start=start,
                                end=stop).upper()

    def get_chr_sequence_length(self, chromosome_name):
        """
        Get sequence length of a chromosome. This is used for selecting windows of parallel processing.
        :param chromosome_name: Chromosome name
        :return: Length of the chromosome reference sequence
        """
        return self.fasta.get_reference_length(chromosome_name)

    def get_contig_names(self):
        return self.fasta.references

    def get_ref_of_region(self, contig, site):
        """
        Return a string containing the reference sequence of a site
        :param contig: Contig [ex chr3]
        :param site: Site region string [ex :100000-200000]
        :return: Tuple of (sequence, error flag) where the flag is 1 on failure
        """
        ret_val = ""
        error_val = 0
        try:
            ret_val = self.fasta.fetch(region=contig + site).upper()
        except (IOError, ValueError):
            print("ERROR IN REF FETCH: ", contig, site)
            error_val = 1
        return ret_val, error_val

    def close(self):
        self.fasta.close()
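A short usage sketch, assuming a hypothetical indexed reference at reference.fa; coordinates are half-open, as in pysam:

handler = FastaHandler('reference.fa')
for contig in handler.get_contig_names():
    length = handler.get_chr_sequence_length(contig)
    prefix = handler.get_sequence(contig, 0, min(100, length))
    print(contig, length, prefix[:10])
handler.close()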
Example #3
from intake.source.base import DataSource, Schema
from pysam import FastaFile


class IndexedFasta(DataSource):
    name = "indexed_bedfile"
    version = "0.1.0"
    container = "python"
    partition_access = True
    description = "A bgzipped and indexed fasta file"

    def __init__(self, urlpath, metadata=None):
        self._urlpath = urlpath
        self._dataset = None
        self._dtype = None
        self._chroms = None
        super().__init__(metadata=metadata)

    def _open_dataset(self):
        self._dataset = FastaFile(self._urlpath)

    def _get_schema(self):
        if self._dataset is None:
            self._open_dataset()
        self._chroms = list(self._dataset.references)
        chrom_lengths = [{
            "chrom": t[0],
            "length": t[1]
        } for t in zip(self._dataset.references, self._dataset.lengths)]
        return Schema(
            datashape=None,
            dtype=None,
            shape=None,
            npartitions=len(self._chroms),
            extra_metadata={"chroms": chrom_lengths},
        )

    def _get_partition(self, i):
        chrom = self._chroms[i]
        return [{"seqid": chrom, "seq": self._dataset.fetch(chrom)}]

    def read_chunked(self):
        self._load_metadata()
        for i in range(self.npartitions):
            yield self._get_partition(i)

    def to_dask(self):
        import dask
        from dask import bag as db

        self._load_metadata()
        return db.from_delayed([
            dask.delayed(self._get_partition)(i)
            for i in range(self.npartitions)
        ])

    def _close(self):
        # close any files, sockets, etc
        if self._dataset is not None:
            self._dataset.close()
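A brief usage sketch, assuming intake is installed and that the hypothetical sequences.fa.gz is bgzipped and indexed; each partition yields one record per contig:

src = IndexedFasta('sequences.fa.gz')
for partition in src.read_chunked():
    for record in partition:
        print(record['seqid'], len(record['seq']))
src.close()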
Example #4
def _chrom_sizes(fasta_file):
    """Get the chromosome sizes for a fasta file
    """
    from collections import OrderedDict
    from pysam import FastaFile
    fa = FastaFile(fasta_file)
    chrom_lens = OrderedDict([(name, l) for name, l in zip(fa.references, fa.lengths)])
    if len(chrom_lens) == 0:
        raise ValueError(f"no chromosomes found in fasta file: {fasta_file}. "
                         "Make sure the file path is correct and that the fasta index "
                         "file {fasta_file}.fai is up to date")
    fa.close()
    return chrom_lens
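For example, with a hypothetical indexed reference at ref.fa, the total genome size is the sum of the returned lengths:

sizes = _chrom_sizes('ref.fa')  # pysam uses (or builds) ref.fa.fai
genome_length = sum(sizes.values())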
Example #5
    def close(self):
        if self._fh:
            self._fh.close()
            self._fh = None
            subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
            os.rename(self._basepath + ".gz", self.filename)

            # open file with FastaFile to create indexes, then make all read-only
            _fh = FastaFile(self.filename)
            _fh.close()
            os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

            logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))
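The same finalization steps as a standalone sketch; finalize_fasta is a hypothetical helper, and it assumes bgzip is on the PATH and that basepath holds the uncompressed FASTA:

import os
import stat
import subprocess

from pysam import FastaFile


def finalize_fasta(basepath, filename, bgzip_exe='bgzip'):
    # bgzip-compress in place; bgzip writes basepath + '.gz'
    subprocess.check_call([bgzip_exe, '--force', basepath])
    os.rename(basepath + '.gz', filename)

    # opening the bgzipped file with FastaFile builds the .fai and .gzi indexes
    FastaFile(filename).close()

    # make the FASTA and its indexes read-only
    read_only = stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH
    for path in (filename, filename + '.fai', filename + '.gzi'):
        os.chmod(path, read_only)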
Example #6
def get_contig_list_from_fasta(fasta_path, with_length=False):
    """Obtain list of contigs froma  fasta file,
        all alternative contigs are pooled into the string MISC_ALT_CONTIGS_SCMO

    Args:
        fasta_path (str or pysam.FastaFile) : Path or handle to fasta file

        with_length(bool): return list of lengths

    Returns:
        contig_list (list ) : List of contigs + ['MISC_ALT_CONTIGS_SCMO'] if any alt contig is present in the fasta file
        """

    contig_list = []
    has_alt = False
    if with_length:
        lens = []

    if isinstance(fasta_path, str):
        fa = FastaFile(fasta_path)
    elif isinstance(fasta_path, FastaFile):
        fa = fasta_path
    else:
        raise TypeError('Supply pysam.FastaFile or str')

    for reference, length in zip(fa.references, fa.lengths):
        if is_main_chromosome(reference):
            contig_list.append(reference)
            if with_length:
                lens.append(length)
        else:
            has_alt = True

    # Close the handle if we just opened one
    if isinstance(fasta_path, str):
        fa.close()

    if has_alt:
        contig_list.append('MISC_ALT_CONTIGS_SCMO')
        if with_length:
            lens.append(None)

    if with_length:
        return contig_list, lens

    return contig_list
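A usage sketch, assuming a hypothetical indexed reference ref.fa and the is_main_chromosome helper from the surrounding codebase:

contigs, lengths = get_contig_list_from_fasta('ref.fa', with_length=True)
for contig, length in zip(contigs, lengths):
    print(contig, length)  # MISC_ALT_CONTIGS_SCMO, if present, has length None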
Example #7
def generate_header(reference_fa: str, tag: str) -> VariantHeader:
    """
    Generates the header for the minimal VCF.

    :param reference_fa: Path to reference fasta file.
    :param tag: The filter tag to use.
    """
    header = VariantHeader()
    header.filters.add(tag, None, None, "Failed dToxoG")

    fasta = FastaFile(reference_fa)
    try:
        for contig in fasta.references:
            header.contigs.add(contig,
                               length=fasta.get_reference_length(contig))
    finally:
        fasta.close()

    return header
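A minimal usage sketch; ref.fa and the tag name dtoxog are hypothetical, and the resulting header can seed a pysam.VariantFile for writing:

from pysam import VariantFile

header = generate_header('ref.fa', 'dtoxog')
with VariantFile('minimal.vcf', 'w', header=header) as vcf_out:
    pass  # write filtered records here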
Example #8
from Bio.Seq import Seq
from pysam import FastaFile


class SequenceExtractor(object):
    """Extracting sequences from a FASTA file by interval objects."""

    def __init__(self, path):
        self._fasta = FastaFile(path)

    def get_sequence(self, gi):
        seqs = []
        for x, y in gi.blocks:
            seqs.append(self._fasta.fetch(gi.chrom, x, y))
        seq = Seq("".join(seqs))
        if gi.reverse:
            seq = seq.reverse_complement()
        return seq

    def close(self):
        if not self._fasta.closed:
            self._fasta.close()

    def __del__(self):
        self.close()
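A usage sketch with a hypothetical interval object: gi only needs .blocks (a list of (start, end) pairs), .chrom, and .reverse, and ref.fa is a hypothetical indexed FASTA:

from types import SimpleNamespace

gi = SimpleNamespace(chrom='chr1', blocks=[(100, 200), (300, 400)], reverse=True)
extractor = SequenceExtractor('ref.fa')
seq = extractor.get_sequence(gi)  # Bio.Seq.Seq, reverse-complemented here
extractor.close()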
Example #9
    def export_bw(self,
                  regions,
                  output_prefix,
                  fasta_file=None,
                  contrib_method='grad',
                  pred_summaries=['profile/wn', 'counts/pre-act'],
                  batch_size=512,
                  scale_contribution=False,
                  chromosomes=None):
        """Export predictions and model contributions to big-wig files

        Args:
          regions: list of genomic regions
          output_prefix: output file prefix
          fasta_file: reference fasta file; defaults to self.fasta_file
          contrib_method: contribution scoring method (e.g. 'grad')
          pred_summaries: which operations to use for the profile gradients
          batch_size: prediction batch size
          scale_contribution: if True, multiply the contribution scores by the predicted count value
          chromosomes: a list of chromosome names comprising the genome
        """
        from pysam import FastaFile
        logger.info("Get model predictions and contribution scores")
        out = self.predict_regions(regions,
                                   contrib_method=contrib_method,
                                   pred_summaries=pred_summaries,
                                   fasta_file=fasta_file,
                                   batch_size=batch_size)

        logger.info("Setup bigWigs for writing")
        # Get the genome lengths
        if fasta_file is None:
            fasta_file = self.fasta_file

        fa = FastaFile(fasta_file)
        if chromosomes is None:
            genome = OrderedDict([(c, l) for c, l in zip(fa.references, fa.lengths)])
        else:
            genome = OrderedDict([(c, l) for c, l in zip(fa.references, fa.lengths) if c in chromosomes])
        fa.close()

        output_feats = ['preds.pos', 'preds.neg', 'contrib.profile', 'contrib.counts']

        # make sure the regions are in the right order
        first_chr = list(np.unique(np.array([interval.chrom for interval in regions])))
        last_chr = [c for c, l in genome.items() if c not in first_chr]
        genome = [(c, genome[c]) for c in first_chr + last_chr]

        # open bigWigs for writing
        bws = {}
        for task in self.tasks:
            bws[task] = {}
            for feat in output_feats:
                delim = "." if not output_prefix.endswith("/") else ""
                bw_preds_pos = pyBigWig.open(f"{output_prefix}{delim}{task}.{feat}.bw", "w")
                bw_preds_pos.addHeader(genome)
                bws[task][feat] = bw_preds_pos

        def add_entry(bw, arr, interval, start_idx=0):
            """Macro for adding an entry to the bigwig file

            Args:
              bw: pyBigWig file handle
              arr: 1-dimensional numpy array
              interval: genomic interval pybedtools.Interval
              start_idx: how many starting values in the array to skip
            """
            assert arr.ndim == 1
            assert start_idx < len(arr)

            if interval.stop - interval.start != len(arr):
                logger.warning(f"interval.stop - interval.start ({interval.stop - interval.start})!= len(arr) ({len(arr)})")
                logger.warning(f"Skipping the entry: {interval}")
                return
            bw.addEntries(interval.chrom, interval.start + start_idx,
                          values=arr[start_idx:],
                          span=1, step=1)

        def to_1d_contrib(hyp_contrib, seq):
            # mask the hyp_contrib + add them up
            return (hyp_contrib * seq).sum(axis=-1)

        # interval logic to handle overlapping intervals
        #   assumption: all intervals are sorted w.r.t the start coordinate
        #   strategy: don't write values at the same position twice (skip those)
        #
        # graphical representation:
        # ...     ]    - prev_stop
        #      [     ]   - new interval 1
        #         [  ]   - added chunk from interval 1
        #   [  ]         - new interval 2 - skip
        #          [   ] - new interval 3, fully add

        logger.info("Writing to bigWigs")
        prev_stop = None   # Keep track of what the previous interval already covered
        prev_chrom = None
        for i in tqdm(range(len(out))):
            interval = out[i]['interval']

            if prev_chrom != interval.chrom:
                # Encountered a new chromosome
                prev_stop = 0  # Restart the end-counter
                prev_chrom = interval.chrom

            if prev_stop >= interval.stop:
                # Nothing new to add to that range
                continue
            start_idx = max(prev_stop - interval.start, 0)

            for tid, task in enumerate(self.tasks):
                # Write predictions
                preds = out[i]['pred'][task]
                add_entry(bws[task]['preds.pos'], preds[:, 0],
                          interval, start_idx)
                add_entry(bws[task]['preds.neg'], preds[:, 1],
                          interval, start_idx)

                # Get the contribution scores
                seq = out[i]['seq']
                hyp_contrib = out[i]['contrib_score']

                if scale_contribution:
                    si_profile = preds.sum()  # Total number of counts in the region
                    si_counts = preds.sum()
                else:
                    si_profile = 1
                    si_counts = 1

                # profile contributions are only meaningful for one-hot
                # sequences; skip intervals that are not strictly one-hot
                if seq.astype(bool).sum(axis=-1).max() != 1:
                    continue

                add_entry(bws[task]['contrib.profile'],
                          to_1d_contrib(hyp_contrib[f'{task}/profile'], seq) * si_profile,
                          interval, start_idx)
                add_entry(bws[task]['contrib.counts'],
                          to_1d_contrib(hyp_contrib[f'{task}/count'], seq) * si_counts,
                          interval, start_idx)

            prev_stop = max(interval.stop, prev_stop)

        logger.info("Done writing. Closing bigWigs")
        # Close all the big-wig files
        for task in self.tasks:
            for feat in output_feats:
                bws[task][feat].close()
        logger.info(f"Done! Output files stored as: {output_prefix}{delim}*")
Example #10
def write_final_vcf(int_duplication_candidates, inversion_candidates,
                    tandem_duplication_candidates, deletion_candidates,
                    novel_insertion_candidates, breakend_candidates, version,
                    contig_names, contig_lengths, types_to_output, options):
    vcf_output = open(options.working_dir + '/variants.vcf', 'w')

    # Write header lines
    print("##fileformat=VCFv4.2", file=vcf_output)
    print("##fileDate={0}".format(time.strftime("%Y-%m-%d|%I:%M:%S%p|%Z|%z")),
          file=vcf_output)
    print("##source=SVIM-v{0}".format(version), file=vcf_output)
    for contig_name, contig_length in zip(contig_names, contig_lengths):
        print("##contig=<ID={0},length={1}>".format(contig_name,
                                                    contig_length),
              file=vcf_output)
    if "DEL" in types_to_output:
        print("##ALT=<ID=DEL,Description=\"Deletion\">", file=vcf_output)
    if "INV" in types_to_output:
        print("##ALT=<ID=INV,Description=\"Inversion\">", file=vcf_output)
    if (not options.tandem_duplications_as_insertions and "DUP:TANDEM" in types_to_output) or \
       (not options.interspersed_duplications_as_insertions and "DUP:INT" in types_to_output):
        print("##ALT=<ID=DUP,Description=\"Duplication\">", file=vcf_output)
    if not options.tandem_duplications_as_insertions and "DUP:TANDEM" in types_to_output:
        print("##ALT=<ID=DUP:TANDEM,Description=\"Tandem Duplication\">",
              file=vcf_output)
    if not options.interspersed_duplications_as_insertions and "DUP:INT" in types_to_output:
        print("##ALT=<ID=DUP:INT,Description=\"Interspersed Duplication\">",
              file=vcf_output)
    if "INS" in types_to_output:
        print("##ALT=<ID=INS,Description=\"Insertion\">", file=vcf_output)
    if "BND" in types_to_output:
        print("##ALT=<ID=BND,Description=\"Breakend\">", file=vcf_output)
    print(
        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
        file=vcf_output)
    print(
        "##INFO=<ID=CUTPASTE,Number=0,Type=Flag,Description=\"Genomic origin of interspersed duplication seems to be deleted\">",
        file=vcf_output)
    print(
        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
        file=vcf_output)
    print(
        "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
        file=vcf_output)
    print(
        "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting this variant\">",
        file=vcf_output)
    print(
        "##INFO=<ID=STD_SPAN,Number=1,Type=Float,Description=\"Standard deviation in span of merged SV signatures\">",
        file=vcf_output)
    print(
        "##INFO=<ID=STD_POS,Number=1,Type=Float,Description=\"Standard deviation in position of merged SV signatures\">",
        file=vcf_output)
    print(
        "##INFO=<ID=STD_POS1,Number=1,Type=Float,Description=\"Standard deviation of breakend 1 position\">",
        file=vcf_output)
    print(
        "##INFO=<ID=STD_POS2,Number=1,Type=Float,Description=\"Standard deviation of breakend 2 position\">",
        file=vcf_output)
    if options.insertion_sequences:
        print(
            "##INFO=<ID=SEQS,Number=.,Type=String,Description=\"Insertion sequences from all supporting reads\">",
            file=vcf_output)
    if options.read_names:
        print(
            "##INFO=<ID=READS,Number=.,Type=String,Description=\"Names of all supporting reads\">",
            file=vcf_output)
    if options.zmws:
        print(
            "##INFO=<ID=ZMWS,Number=1,Type=Integer,Description=\"Number of supporting ZMWs (PacBio only)\">",
            file=vcf_output)
    print(
        "##FILTER=<ID=hom_ref,Description=\"Genotype is homozygous reference\">",
        file=vcf_output)
    print(
        "##FILTER=<ID=not_fully_covered,Description=\"Tandem duplication is not fully covered by a single read\">",
        file=vcf_output)
    print("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
          file=vcf_output)
    print("##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">",
          file=vcf_output)
    print(
        "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Read depth for each allele\">",
        file=vcf_output)
    if not options.tandem_duplications_as_insertions and "DUP:TANDEM" in types_to_output:
        print(
            "##FORMAT=<ID=CN,Number=1,Type=Integer,Description=\"Copy number of tandem duplication (e.g. 2 for one additional copy)\">",
            file=vcf_output)
    print("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t" +
          options.sample,
          file=vcf_output)

    # Open reference genome sequence file
    sequence_alleles = options.sequence_alleles
    if sequence_alleles:
        try:
            reference = FastaFile(options.genome)
        except ValueError:
            logging.warning(
                "The given reference genome is missing an index file ({path}.fai). Sequence alleles cannot be retrieved."
                .format(path=options.genome))
            sequence_alleles = False
        except IOError:
            logging.warning(
                "The given reference genome is missing ({path}). Sequence alleles cannot be retrieved."
                .format(path=options.genome))
            sequence_alleles = False
    else:
        reference = None

    # Prepare VCF entries depending on command-line parameters
    vcf_entries = []
    if "DEL" in types_to_output:
        for candidate in deletion_candidates:
            vcf_entries.append(
                (candidate.get_source(),
                 candidate.get_vcf_entry(sequence_alleles, reference,
                                         options.read_names,
                                         options.zmws), "DEL"))
    if "INV" in types_to_output:
        for candidate in inversion_candidates:
            vcf_entries.append(
                (candidate.get_source(),
                 candidate.get_vcf_entry(sequence_alleles, reference,
                                         options.read_names,
                                         options.zmws), "INV"))
    if "INS" in types_to_output:
        for candidate in novel_insertion_candidates:
            vcf_entries.append(
                (candidate.get_destination(),
                 candidate.get_vcf_entry(sequence_alleles, reference,
                                         options.insertion_sequences,
                                         options.read_names,
                                         options.zmws), "INS"))
    if options.tandem_duplications_as_insertions:
        if "INS" in types_to_output:
            for candidate in tandem_duplication_candidates:
                vcf_entries.append(
                    (candidate.get_destination(),
                     candidate.get_vcf_entry_as_ins(options.read_names,
                                                    options.zmws), "INS"))
    else:
        if "DUP:TANDEM" in types_to_output:
            for candidate in tandem_duplication_candidates:
                vcf_entries.append(
                    (candidate.get_source(),
                     candidate.get_vcf_entry_as_dup(options.read_names,
                                                    options.zmws),
                     "DUP_TANDEM"))
    if options.interspersed_duplications_as_insertions:
        if "INS" in types_to_output:
            for candidate in int_duplication_candidates:
                vcf_entries.append(
                    (candidate.get_destination(),
                     candidate.get_vcf_entry_as_ins(options.read_names,
                                                    options.zmws), "INS"))
    else:
        if "DUP:INT" in types_to_output:
            for candidate in int_duplication_candidates:
                vcf_entries.append(
                    (candidate.get_source(),
                     candidate.get_vcf_entry_as_dup(options.read_names,
                                                    options.zmws), "DUP_INT"))
    if "BND" in types_to_output:
        for candidate in breakend_candidates:
            vcf_entries.append(
                ((candidate.get_source()[0], candidate.get_source()[1],
                  candidate.get_source()[1] + 1),
                 candidate.get_vcf_entry(options.read_names,
                                         options.zmws), "BND"))
            vcf_entries.append(
                ((candidate.get_destination()[0],
                  candidate.get_destination()[1],
                  candidate.get_destination()[1] + 1),
                 candidate.get_vcf_entry_reverse(options.read_names,
                                                 options.zmws), "BND"))

    if sequence_alleles:
        reference.close()

    # Sort and write entries to VCF
    svtype_counter = defaultdict(int)
    for source, entry, svtype in sorted_nicely(vcf_entries):
        variant_id = "svim.{svtype}.{number}".format(
            svtype=svtype, number=svtype_counter[svtype] + 1)
        entry_with_id = entry.replace("PLACEHOLDERFORID", variant_id, 1)
        svtype_counter[svtype] += 1
        print(entry_with_id, file=vcf_output)

    vcf_output.close()