Example no. 1
 def scaffold_dict_init(self, file, introns):
     sequences_object = FastaFile(file)
     scaffold_seqs = {}  # avoid shadowing the builtin dict
     for intron in introns:
         if intron.scaffold_id not in scaffold_seqs:
             scaffold_seqs[intron.scaffold_id] = sequences_object.fetch(intron.scaffold_id)
     return scaffold_seqs
Example no. 2
def extract_fasta_to_file(fasta,
                          output_dir,
                          mode='2D_transpose_bcolz',
                          overwrite=False):
    assert mode in _array_writer

    makedirs(output_dir, exist_ok=overwrite)
    fasta_file = FastaFile(fasta)
    file_shapes = {}
    for chrom, size in zip(fasta_file.references, fasta_file.lengths):
        data = np.zeros((size, NUM_SEQ_CHARS), dtype=np.float32)
        seq = fasta_file.fetch(chrom)
        one_hot_encode_sequence(seq, data)
        shape = data.shape
        shape_transpose = shape[::-1]
        file_shapes[chrom] = shape_transpose
        _array_writer[mode](data, os.path.join(
            output_dir,
            chrom))  # the metadata records the transposed shape

    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(
            {
                'file_shapes': file_shapes,
                'type': 'array_{}'.format(mode),
                'source': fasta
            }, fp)
Example no. 3
def split_variants_to_files(vcf_file, genome_file, bi_file, multi_file):
    vcf = VariantFile(vcf_file)
    genome = FastaFile(genome_file)
    vcf.header.add_line('##INFO=<ID=multi,Number=0,Type=Flag,'
                        'Description="Variant with multiple alleles">')
    vcf.header.add_line('##INFO=<ID=duplicated,Number=0,Type=Flag,'
                        'Description="Duplicated in position">')
    vcf.header.formats.add('GT', 1, 'String', "Genotype")
    vcf.header.add_sample("Genotype")

    with open(bi_file, 'wt') as outbi:
        with open(multi_file, 'wt') as outmu:

            outbi.write(str(vcf.header))
            outmu.write(str(vcf.header))

            for multi_alleles, duplicated, record in iter_wanted_variants(
                    vcf, genome):
                if duplicated:
                    continue
                record = record_to_string(record) + ['GT', '0/1']
                record = '\t'.join(record)
                if multi_alleles:
                    outmu.write(record + '\n')
                else:
                    outbi.write(record + '\n')
Example no. 4
def split_variants(vcf_file, genome_file):
    vcf = VariantFile(vcf_file)
    genome = FastaFile(genome_file)
    vcf.header.add_line('##INFO=<ID=multi,Number=0,Type=Flag,'
                        'Description="Variant with multiple alleles">')
    vcf.header.add_line('##INFO=<ID=duplicated,Number=0,Type=Flag,'
                        'Description="Duplicated in position">')
    vcf.header.formats.add('GT', 1, 'String', "Genotype")
    vcf.header.add_sample("Genotype")

    print(vcf.header, end='')

    for multi_alleles, duplicated, record in iter_wanted_variants(vcf, genome):
        record = record_to_string(record) + ['GT', '0/1']
    if multi_alleles:
        # prepend a separator only when the field already has content
        record[6] += ";multi" if record[6] else "multi"

    if duplicated:
        record[6] += ";duplicated" if record[6] else "duplicated"

        record = '\t'.join(record)

        print(record)
Example no. 5
 def __init__(self,
              ref_fa_path=None,
              vcf_path=None,
              idx_path=None,
              batch_size=32,
              bin_size=100,
              tie='r'):
     '''
     :param str ref_fa_path: Path to indexed reference fasta
     :param str vcf_path: Path to indexed vcf
     :param str idx_path: Path to bed-file which will contain the names and locations of compatible variants
     :param int batch_size: Batch size
     :param int bin_size: Length of the DNA-sequences (centered on the start position of the variant)
     :param str tie: 'l' or 'r'; which side the window is biased toward when bin_size is even
     '''
     # check for the index before opening; pysam would otherwise try to build it
     assert os.path.isfile(ref_fa_path + '.fai'), \
         'Error: no index found for Fasta-file: {}'.format(ref_fa_path)
     self.vcf = VariantFile(vcf_path)
     self.ref = FastaFile(ref_fa_path)
     self.idx_path = idx_path
     self.batch_size = batch_size
     self.bin_size = bin_size
     assert tie in ['l', 'r']
     self.tie = tie
     if not bin_size % 2:
         self.offset = 0 if tie == 'r' else 1
     else:
         self.offset = 0
     self.n_variants = self._initialize_index()
     self._verify_refmatch()
Example no. 6
def initWorker(localWindowSize, fastaFile, k, N, M):
    # Runs once in each worker process: pysam handles cannot be pickled,
    # so each worker opens its own FastaFile and stores it in module globals.
    global FA, windowSize, kSize, useN, method
    windowSize = localWindowSize
    FA = FastaFile(fastaFile)
    kSize = k
    useN = N
    method = M
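This is the standard multiprocessing initializer pattern: pysam handles cannot be shared across processes, so each worker opens its own. A minimal usage sketch (the pool size, file path, and process_window helper are hypothetical):

from multiprocessing import Pool

def process_window(window):
    # FA is the per-worker global FastaFile set up by initWorker
    return FA.fetch(*window)

if __name__ == '__main__':
    with Pool(processes=4, initializer=initWorker,
              initargs=(500, 'ref.fa', 21, False, 'default')) as pool:
        fragments = pool.map(process_window, [('chr1', 0, 500)])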
Example no. 7
def _chrom_names(fasta_file):
    """Get the list of chromosome names from a fasta file
    """
    from pysam import FastaFile
    with FastaFile(fasta_file) as fa:
        chroms = list(fa.references)
    return chroms
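A minimal usage sketch, assuming an indexed reference at ref.fa (hypothetical path):

chroms = _chrom_names('ref.fa')
print(chroms)  # e.g. ['chr1', 'chr2', ...]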
Example no. 8
def shotgun_library(fasta_file, mu, sigma, direction=(1, -1)):
    """Generate random fragment sequences from an input fasta file

    :param fasta_file: path to the input fasta file.
    :param mu: mean fragment length.
    :param sigma: stdv of fragment length.
    :param direction: tuple representing direction of output sequences with
        respect to the input sequence.

    :yields: sequence fragments.

    .. note:: Could be made more efficient using buffers for random samples
        and handling cases separately.
    """
    fasta = FastaFile(fasta_file)
    seq_lens = [fasta.get_reference_length(x) for x in fasta.references]
    total_len = sum(seq_lens)
    seq_probs = [x / total_len for x in seq_lens]
    # FastaFile.fetch is slow, so read everything into memory up front
    refs = fasta.references
    fasta = {k: fasta.fetch(k) for k in refs}

    def random_buffer(probs, size=10000):
        while True:
            buf = []
            for x, n in zip(range(len(probs)),
                            np.random.multinomial(size, probs)):
                buf.extend([x] * n)
            np.random.shuffle(buf)
            for x in buf:
                yield x

    seq_chooser = random_buffer(seq_probs)

    # parameters for lognormal
    mean = np.log(mu / np.sqrt(1 + sigma**2 / mu**2))
    stdv = np.sqrt(np.log(1 + sigma**2 / mu**2))

    while True:
        # choose a seq based on length
        seq_i = next(seq_chooser)
        seq = fasta[refs[seq_i]]
        seq_len = seq_lens[seq_i]

        start = np.random.randint(0, seq_len)
        frag_length = int(np.random.lognormal(mean, stdv))
        move = np.random.choice(direction)
        end = max(0, start + move * frag_length)
        start, end = sorted([start, end])

        if end - start < 2:
            # Expand a bit to ensure we grab at least one base.
            start = max(0, start - 1)
            end += 1

        frag_seq = seq[start:end]
        if move == -1:
            frag_seq = reverse_complement(frag_seq)
        yield frag_seq, refs[seq_i], start, end, '+' if move == 1 else '-'
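Because shotgun_library is an infinite generator, the caller must bound it explicitly. A minimal usage sketch, assuming an indexed ref.fa (hypothetical path):

from itertools import islice

library = shotgun_library('ref.fa', mu=3000, sigma=1000)
for frag_seq, ref_name, start, end, strand in islice(library, 5):
    print(ref_name, start, end, strand, len(frag_seq))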
Example no. 9
def main(args):
    sample_name = extract_sample_name(args.input_path)
    with open(args.input_path) as cnv_input, FastaFile(args.genome_ref) as genome_ref,\
            open(args.output_path, 'w') as vcf_output:
        is_full_chrom_name = genome_ref.references[0].startswith('chr')
        cnv_reader = csv.DictReader(cnv_input, delimiter='\t')
        vcf_output.write('\n'.join(get_vcf_headers(sample_name, genome_ref)) + '\n')
        for cnv_line in cnv_reader:
            vcf_line = get_vcf_line(cnv_line, genome_ref, is_full_chrom_name)
            vcf_output.write(vcf_line + '\n')
Example no. 10
    def _extract(self, intervals, out, **kwargs):
        fasta = FastaFile(self._datafile)

        for index, interval in enumerate(intervals):
            seq = fasta.fetch(str(interval.chrom), interval.start,
                              interval.stop)

            out[index, :, :, 0] = one_hot_encode_sequence(seq)

        return out
Example no. 11
def vcf2chain(input_file, fasta_file, strain, output_file, vcf_keep=False, passed=False, quality=False, diploid=False):
    """
    Convert a VCF file into a chain file for the given strain.

    :param input_file: path to the input VCF file
    :param fasta_file: path to the reference fasta file
    :param strain: name of the sample/strain column to use
    :param output_file: path to the output chain file
    :param vcf_keep: if True, write discarded VCF records to a side file
    :param passed: if True, only use records that pass the filter
    :param quality: if True, apply the quality filter
    :param diploid: if True, treat the genome as diploid
    :return:
    """
    start = time.time()

    input_file = g2g_fu.check_file(input_file)
    fasta_file = g2g_fu.check_file(fasta_file)

    if not strain:
        raise G2GValueError("No strain was specified.")

    output_file = g2g_fu.check_file(output_file, 'w')
    output_file_dir = os.path.dirname(output_file)

    LOG.info("VCF FILE: {0}".format(input_file))
    LOG.info("FASTA FILE: {0}".format(fasta_file))
    LOG.info("CHAIN FILE: {0}".format(output_file))

    vcf_discard_file = None

    if vcf_keep:
        vcf_discard_file = "{0}.errors.vcf".format(os.path.basename(input_file))
        vcf_discard_file = os.path.join(output_file_dir, vcf_discard_file)
        LOG.info("VCF DISCARD FILE: {0}".format(vcf_discard_file))

    LOG.info("STRAIN: {0}".format(strain))
    LOG.info("PASS FILTER ON: {0}".format(str(passed)))
    LOG.info("QUALITY FILTER ON: {0}".format(str(quality)))
    LOG.info("DIPLOID: {0}".format(str(diploid)))

    if not isinstance(fasta_file, FastaFile):
        fasta_file = FastaFile(fasta_file)

    tb = TabixFile(input_file)
    sample_index = None

    for h in tb.header:
        if h[:6] == '#CHROM':
            try:
                elems = h.split('\t')
                samples = elems[9:]
                samples = dict(zip(samples, range(len(samples))))
                sample_index = samples[strain]
            except KeyError:
                raise G2GVCFError("Unknown strain '{0}', valid strains are: {1}".format(strain, ", ".join(samples)))
Example no. 12
def get_chrom_sizes(fasta_file, chromosomes=None):
    """Get chromosome sizes from a fasta file
    """
    from pysam import FastaFile
    fa = FastaFile(fasta_file)
    if chromosomes is None:
        genome = [(c, l) for c, l in zip(fa.references, fa.lengths)]
    else:
        genome = [(c, l) for c, l in zip(fa.references, fa.lengths)
                  if c in chromosomes]
    return genome
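A minimal usage sketch (path and contig names hypothetical):

genome = get_chrom_sizes('ref.fa', chromosomes={'chr1', 'chr2'})
for contig, length in genome:
    print(contig, length)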
Example no. 13
def Get_fusionGene_seq(GenesU,GenesV,geneCoordniates_dict,reference_fa=''):
    if not (os.path.exists(reference_fa) or os.path.exists(os.curdir+'/'+reference_fa)):
        print('Error: reference fasta file not found')
        exit(1)
    genome_name=os.path.basename(reference_fa)
    if not os.path.exists(genome_name):
        os.system('ln -s %s %s'%(reference_fa,genome_name))  # symlink the reference locally
    os.system('samtools faidx %s'%genome_name)  # index the local copy for random access
    genome=FastaFile(genome_name)


    fusiongenes_ref_U=open('fusion_total_index/fusiongenes_ref_U.fa','w')
    for gene in GenesU:
        chr, strand, start, end=None,None,None,None
        try:
            chr, strand, start, end, gene = geneCoordniates_dict[gene]
        except KeyError as e:
            print('%s: input gene name not found in the GTF; check gene names' % e)
            exit(1)
        if strand is not None:
            if strand == '1':
                seq = genome.fetch(reference=chr, start=int(start), end=int(end))
            else:
                # minus strand: reverse complement the fetched sequence
                seq_plus = genome.fetch(reference=chr, start=int(start), end=int(end))
                trantab = str.maketrans('ACGTacgtNn', 'TGCAtgcaNn')
                seq = seq_plus.translate(trantab)
                seq = seq[::-1]
            fusiongenes_ref_U.write('>%s\n' % gene)
            # .{1,60} keeps the final partial line; .{60} would silently drop it
            for line in re.findall(r'.{1,60}', seq):
                fusiongenes_ref_U.write('%s\n' % line)
    fusiongenes_ref_U.close()


    fusiongenes_ref_V=open('fusion_total_index/fusiongenes_ref_V.fa','w')
    for gene in GenesV:
        chr, strand, start, end = None, None, None, None
        try:
            chr, strand, start, end, gene = geneCoordniates_dict[gene]
        except KeyError as e:
            print('%s: input gene name not found in the GTF; check gene names' % e)
            exit(1)
        if strand is not None:
            if strand == '1':
                seq = genome.fetch(reference=chr, start=int(start), end=int(end))
            else:
                # minus strand: reverse complement the fetched sequence
                seq_plus = genome.fetch(reference=chr, start=int(start), end=int(end))
                trantab = str.maketrans('ACGTacgtNn', 'TGCAtgcaNn')
                seq = seq_plus.translate(trantab)
                seq = seq[::-1]
            fusiongenes_ref_V.write('>%s\n' % gene)
            # .{1,60} keeps the final partial line; .{60} would silently drop it
            for line in re.findall(r'.{1,60}', seq):
                fusiongenes_ref_V.write('%s\n' % line)
    fusiongenes_ref_V.close()
    return 0
Example no. 14
    def __init__(self, reference_file_path):
        """
        create fasta file object given file path to a fasta reference file
        :param reference_file_path: full path to a fasta reference file
        """

        self.fasta_file_path = reference_file_path

        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except Exception:
            raise IOError("FASTA FILE READ ERROR")
Example no. 15
def _chrom_sizes(fasta_file):
    """Get the chromosome sizes for a fasta file
    """
    from pysam import FastaFile
    fa = FastaFile(fasta_file)
    chrom_lens = OrderedDict([(name, l) for name, l in zip(fa.references, fa.lengths)])
    if len(chrom_lens) == 0:
        raise ValueError(f"no chromosomes found in fasta file: {fasta_file}. "
                         "Make sure the file path is correct and that the fasta index "
                         "file {fasta_file}.fai is up to date")
    fa.close()
    return chrom_lens
Example no. 16
def main():
    min_region_size = 1000
    genome = FastaFile("GRCh38.genome.fa")
    for region, label in iter_peaks_and_labels(sys.argv[1]):
        # create a new region exactly min_region_size basepairs long centered on 
        # region  
        expanded_start = region[1] + (region[2] - region[1]) // 2 - min_region_size // 2
        expanded_stop = expanded_start + min_region_size
        region = (region[0], expanded_start, expanded_stop)
        print(region, label)
        print(genome.fetch(*region))
    return
Example no. 17
def generate_homopolymer_plots(bed_file, fasta_file, bam_file):
    # open the alignment and assembly handles once, outside the per-record loop
    samfile = pysam.AlignmentFile(bam_file, "rb")
    assembly_fasta_file = FastaFile(fasta_file)

    bed_file_records = open(bed_file, 'r')
    for line in bed_file_records:
        contig, start_pos, end_pos = line.rstrip().split('\t')
        start_pos = int(start_pos)
        end_pos = int(end_pos)
        if start_pos < 1000:
            continue
        if end_pos - start_pos > 50:
            continue

        reference_sequence = assembly_fasta_file.fetch(reference=contig, start=start_pos, end=start_pos + 200)

        reference_homopolymer_index_start = 1
        reference_homopolymer_index_end = 1
        homopolymer_base = reference_sequence[reference_homopolymer_index_start]
        # print(homopolymer_base)
        while reference_homopolymer_index_end < len(reference_sequence) and reference_sequence[reference_homopolymer_index_end] == homopolymer_base:
            reference_homopolymer_index_end += 1

        # print(reference_sequence[reference_homopolymer_index_start:reference_homopolymer_index_end])
        reference_homopolymer_length = reference_homopolymer_index_end - reference_homopolymer_index_start

        all_reads = samfile.fetch(contig, start_pos - 1, end_pos)

        read_homopolymers = []
        for read in all_reads:
            aligned_pairs = read.get_aligned_pairs()

            start_index = 0
            for index, position in aligned_pairs:
                if index is None:
                    continue
                if position == start_pos:
                    start_index = index + 1
                    break
            if read.query_sequence is None:
                continue
            if start_index == len(read.query_sequence):
                continue
            homopolymer_base = read.query_sequence[start_index]
            # print(homopolymer_base)
            end_index = start_index
            while end_index < len(read.query_sequence) and read.query_sequence[end_index] == homopolymer_base:
                end_index += 1
            read_homopolymer_length = end_index - start_index
            read_homopolymers.append(read_homopolymer_length)

        print(contig + "\t" + str(start_pos) + "\t" + str(end_pos) + "\t" + str(reference_homopolymer_length) + "\t" + str(','.join([str(x) for x in read_homopolymers])))
Example no. 18
    def close(self):
        if self._fh:
            self._fh.close()
            self._fh = None
            subprocess.check_call([self._bgzip_exe, "--force", self._basepath])
            os.rename(self._basepath + ".gz", self.filename)

            # open file with FastaFile to create indexes, then make all read-only
            _fh = FastaFile(self.filename)
            _fh.close()
            os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

            logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))
Example no. 19
 def __init__(self, datafile, use_strand=False, **kwargs):
     """Fasta file extractor

     NOTE: The extractor is not thread-safe.
     If you wish to use it with multiprocessing,
     create a new extractor object in each process.

     Args:
       datafile (str): path to the fasta file
       use_strand (bool): if True, the extracted sequence
         is reverse complemented in case interval.strand == "-"
     """
     super(FastaExtractor, self).__init__(datafile, **kwargs)
     self.use_strand = use_strand
     self.fasta = FastaFile(self._datafile)
Example no. 20
def countContexts(fastaFilePath, whiteListBed=None, blackListBed=None):
    debug(f"Starting to count contexts of nucleotides in {fastaFilePath}")

    triNucCounts = defaultdict(int)
    diNucCounts = defaultdict(int)
    # open the fastaFile
    with FastaFile(fastaFilePath) as fastaFile:

        # if we do not have a whitelist to start out, we make one from the fasta, which includes
        # everything
        if whiteListBed is None:
            wlObj = from_dict(
                {
                    "Chromosome": fastaFile.references,
                    "Start": [1] * fastaFile.nreferences,
                    "End": fastaFile.lengths,
                }
            )
        else:
            # we cast this to string, because pyranges wants string and we use the Path type
            wlObj = read_bed(str(whiteListBed))
            wlObj = wlObj.merge()

        # if we have a blacklist, we subtract that from the whitelist, otherwise we leave
        # it as it is
        if blackListBed is not None:
            # we cast this to string, because pyranges wants string and we use the Path type
            blObj = read_bed(str(blackListBed))
            blObj = blObj.merge()
            wlObj = wlObj.subtract(blObj)
            # no need to merge again here, as we only have fewer ranges than before

        # while we could use the get_fasta function from pyranges, it needs another
        # dependency (pyfaidx) and is slower (from my preliminary testing)
        # iterate over all chromosomes and each of the ranges
        for chr, df in wlObj:
            # iterrows has to return the index, even though we don't use it
            for idx, region in df.iterrows():
                seq = fastaFile.fetch(
                    reference=chr, start=region["Start"], end=region["End"]
                )

                # the range stops two short of the end so every trinucleotide
                # slice is complete; the final dinucleotide goes uncounted
                for i in range(len(seq) - 2):
                    diNucCounts[seq[i : i + 2]] += 1
                    triNucCounts[seq[i : i + 3]] += 1
            debug(f"context frequency analysis complete for chromosome {chr}")

    return (diNucCounts, triNucCounts)
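A minimal usage sketch (path hypothetical); both returned defaultdicts are keyed by the raw substrings:

di_counts, tri_counts = countContexts('ref.fa')
top_tri = max(tri_counts, key=tri_counts.get)
print(top_tri, tri_counts[top_tri])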
Example no. 21
def data_generator_pysam(my_args, name, start, stop, is_bulk):
    fasta_file = FastaFile(my_args.fasta)
    ref = fasta_file.fetch(name, start, stop)

    my_arg = {
        'fastafile': fasta_file,
        'stepper': 'samtools',
        'adjust_capq_threshold': 50,
        'contig': name,
        'start': start,
        'stop': stop,
        'min_mapping_quality': 0 if is_bulk else 20,
        'min_base_quality': 13,
    }

    if is_bulk:
        bam_file = AlignmentFile(my_args.bulk, 'rb')
    else:
        bam_file = AlignmentFile(my_args.bam, 'rb')

    read_bases_list = []
    for pileup_column in bam_file.pileup(**my_arg):
        pos = pileup_column.reference_pos

        if pos >= stop:
            break
        if pos < start:
            continue

        read_bases_list = pileup_column.get_query_sequences(mark_matches=True,
                                                            mark_ends=True,
                                                            add_indels=True)

        read_bases = ''.join(read_bases_list).upper()
        n = pileup_column.get_num_aligned()
        if n == 0:
            read_bases = '*'
            base_q = '*'
            map_q = '*'
        else:
            base_q = ''.join([chr(int(i) + PHREDSCORE) \
                for i in pileup_column.get_query_qualities()])
            map_q = ''.join([chr(int(i) + PHREDSCORE) \
                for i in pileup_column.get_mapping_qualities()])

        yield [name, pos, ref[pos - start], str(n), read_bases, base_q, map_q]

    yield None
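The generator expects an argparse-like object with fasta, bam and bulk attributes, and signals exhaustion with a final None. A minimal sketch with a hypothetical namespace and file paths:

from types import SimpleNamespace

args = SimpleNamespace(fasta='ref.fa', bam='tumor.bam', bulk='normal.bam')
for row in data_generator_pysam(args, 'chr1', 10000, 10100, is_bulk=False):
    if row is None:
        break
    name, pos, ref_base, depth, read_bases, base_q, map_q = row
    print(name, pos, ref_base, depth)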
Example no. 22
def main():
    min_region_size = 1000
    genome = FastaFile("./genome/GRCh38.genome.fa")
    train_path = "./train_data/"
    list_dir = os.listdir(train_path)
    for filename in list_dir:
        for region, label in iter_peaks_and_labels(train_path + filename):
            # create a new region exactly min_region_size basepairs long centered on
            # region
            expanded_start = region[1] + (region[2] -
                                          region[1]) // 2 - min_region_size // 2
            expanded_stop = expanded_start + min_region_size
            region = (region[0], expanded_start, expanded_stop)
            # print(region, label)
            print(genome.fetch(*region), label)
    return
Example no. 23
def method2(basefl):
    fa = FLAGS.input + ".feature.fa"
    loader = FastaFile(fa)
    fl1 = FLAGS.input + ".feature.tsv"
    output = open("%s/20bp.fa" % (basefl), "w")
    for i in open(fl1, "r"):
        ele = i.rstrip().split()
        ids, pos = ele[0].split("|")[:-1]
        pos = int(pos)
        try:
            seq = loader.fetch(ids, pos - 30, pos + 30)
            output.write(">%s|%s\n%s\n" % (ids, pos, seq))
        except Exception:
            print("fetch failed for ids %s at position %s" % (ids, pos))
    output.close()
    align_hisat2()
Example no. 24
def extract_fasta_to_file(fasta, output_dir, overwrite):
    """
    Returns compressed version of fasta file for a quickly accessible memory map
    Args:
        fasta: fasta file to be converted
        output_dir: output directory for memory map location
        overwrite: boolean - whether to overwrite current memory map
    """

    # at most two passes: reuse existing metadata if possible, else rebuild and retry
    for i in [0, 1]:
        if overwrite:
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            fasta_file = FastaFile(fasta)
            file_shapes = {}
            for chrom, size in zip(fasta_file.references, fasta_file.lengths):
                seq = fasta_file.fetch(chrom)
                data = one_hot_encode_sequence(seq)
                file_shapes[chrom] = data.shape
                bcolz.carray(data,
                             rootdir=os.path.join(output_dir, chrom),
                             cparams=_blosc_params,
                             mode='w').flush()
            mode = '2D_transpose_bcolz'
            metadata = {
                'file_shapes': file_shapes,
                'type': 'array_{}'.format(mode),
                'extractor': 'CompressedFastaExtractor',
                'source': fasta
            }
            with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
                json.dump(metadata, fp)
                overwrite = False
        else:
            try:
                with open(os.path.join(output_dir, 'metadata.json'),
                          'r') as fp:
                    metadata = json.load(fp)
                break
            except IOError as e:
                print("I/O error({0}): {1} for {2}".format(
                    e.errno, e.strerror, output_dir))
                print(
                    "There is a problem with opening the metadata. Recreating the mmap files and overwriting..."
                )
                overwrite = True
    return metadata
Example no. 25
def get_contig_list_from_fasta(fasta_path, with_length=False):
    """Obtain list of contigs from a fasta file;
        all alternative contigs are pooled into the string MISC_ALT_CONTIGS_SCMO

    Args:
        fasta_path (str or pysam.FastaFile) : Path or handle to fasta file

        with_length(bool): return list of lengths

    Returns:
        contig_list (list): List of contigs + ['MISC_ALT_CONTIGS_SCMO'] if any alt contig is present in the fasta file
        """

    contig_list = []
    has_alt = False
    if with_length:
        lens = []

    if isinstance(fasta_path, str):
        fa = FastaFile(fasta_path)
    elif isinstance(fasta_path, FastaFile):
        fa = fasta_path
    else:
        raise TypeError('Supply pysam.FastaFile or str')

    for reference, length in zip(fa.references, fa.lengths):
        if is_main_chromosome(reference):
            contig_list.append(reference)
            if with_length:
                lens.append(length)
        else:
            has_alt = True

    # Close handle if we just opened one
    if isinstance(fasta_path, str):
        fa.close()

    if has_alt:
        contig_list.append('MISC_ALT_CONTIGS_SCMO')
        if with_length:
            lens.append(None)

    if with_length:
        return contig_list, lens

    return contig_list
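A minimal usage sketch (path hypothetical):

contigs, lengths = get_contig_list_from_fasta('ref.fa', with_length=True)
for contig, length in zip(contigs, lengths):
    print(contig, length)  # length is None for 'MISC_ALT_CONTIGS_SCMO'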
Example no. 26
def extract_fasta_to_npy(fasta, output_dir):
    fasta_file = FastaFile(fasta)
    file_shapes = {}
    for chrom, size in zip(fasta_file.references, fasta_file.lengths):
        data = np.empty((NUM_SEQ_CHARS, size), dtype=np.float32)
        seq = fasta_file.fetch(chrom)
        one_hot_encode_sequence(seq, data)
        np.save('{}.npy'.format(os.path.join(output_dir, chrom)), data)
        file_shapes[chrom] = data.shape

    with open(os.path.join(output_dir, 'metadata.json'), 'w') as fp:
        json.dump(
            {
                'file_shapes': file_shapes,
                'type': 'array',
                'source': fasta
            }, fp)
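The saved arrays can be memory-mapped back without touching the fasta again; a minimal round-trip sketch (output directory and contig name hypothetical):

import json
import numpy as np

with open('out_dir/metadata.json') as fp:
    metadata = json.load(fp)
data = np.load('out_dir/chr1.npy', mmap_mode='r')
assert tuple(metadata['file_shapes']['chr1']) == data.shape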
Example no. 27
    def __init__(self, reference_file_path):
        """
        create fasta file object given file path to a fasta reference file
        :param reference_file_path: full path to a fasta reference file
        """

        self.fasta_file_path = reference_file_path
        assert os.path.exists(
            reference_file_path), "Reference path does not exist: {}".format(
                reference_file_path)
        try:
            self.fasta = FastaFile(self.fasta_file_path)
        except Exception as e:
            print(e)
            raise IOError(
                "Fasta File Read Error: Try indexing reference with 'samtools faidx {}'"
                .format(reference_file_path))
Example no. 28
 def __init__(self,
              fasta_file,
              noTEMD=False,
              h5_file='tmp_vote.h5',
              stranded=False):
     self.fasta_file = fasta_file
     self.noTEMD = noTEMD
     self.stranded = stranded
     with FastaFile(fasta_file) as FA:
         self.chrom_dict = {
             c: FA.get_reference_length(c)
             for c in FA.references
         }
     self.cur_chrom = ''
     self.h5_file = h5_file
     self.H5 = h5py.File(h5_file, 'a')
     self._genome_init()
Example no. 29
def extract_seq(interval, variant, fasta_file, one_hot=False):
    """
    Note: in case the variant is an indel, the anchorpoint at the beginning is used

    Args:
      interval: pybedtools.Interval where to extract the sequence from
      variant: Variant class with attributes: chr, pos, ref, alt
      fasta_file: file path or pysam.FastaFile instance
      one_hot: if True, one-hot-encode the output sequence

    Returns:
      sequence
    """
    if isinstance(fasta_file, str):
        from pysam import FastaFile
        fasta_file = FastaFile(fasta_file)
    if variant is not None and variant.pos - 1 >= interval.start and variant.pos <= interval.stop:
        inside = True
        lendiff = len(variant.alt) - len(variant.ref)
    else:
        inside = False
        lendiff = 0
    seq = fasta_file.fetch(str(interval.chrom), interval.start,
                           interval.stop - lendiff)

    if not inside:
        out = seq
    else:
        # now, mutate the sequence
        pos = variant.pos - interval.start - 1
        expect_ref = seq[pos:(pos + len(variant.ref))]
        if expect_ref != variant.ref:
            raise ValueError(
                f"Expected reference: {expect_ref}, observed reference: {variant.ref}"
            )
        # Anchor at the beginning
        out = seq[:pos] + variant.alt + seq[(pos + len(variant.ref)):]
    # sequence length has to be correct at the end
    assert len(out) == interval.stop - interval.start
    if one_hot:
        out = encodeDNA([out.upper()])[0]
    return out
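A minimal usage sketch with hypothetical stand-ins carrying only the attributes extract_seq reads; with a real reference, variant.ref must match the base at that position:

from types import SimpleNamespace

interval = SimpleNamespace(chrom='chr1', start=999, stop=1099)     # 0-based, half-open
variant = SimpleNamespace(chr='chr1', pos=1050, ref='A', alt='G')  # 1-based position
seq = extract_seq(interval, variant, 'ref.fa')  # 'ref.fa' is a hypothetical path
assert len(seq) == interval.stop - interval.start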
Example no. 30
def generate_header(reference_fa: str, tag: str) -> VariantHeader:
    """
    Generates the header for the minimal VCF.

    :param reference_fa: Path to reference fasta file.
    :param tag: The filter tag to use.
    """
    header = VariantHeader()
    header.filters.add(tag, None, None, "Failed dToxoG")

    fasta = FastaFile(reference_fa)
    try:
        for contig in fasta.references:
            header.contigs.add(contig,
                               length=fasta.get_reference_length(contig))
    finally:
        fasta.close()

    return header
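A minimal usage sketch (paths and tag hypothetical), writing a header-only VCF with the generated header:

from pysam import VariantFile

header = generate_header('ref.fa', tag='dtoxog_fail')
with VariantFile('out.vcf', 'w', header=header) as vcf_out:
    pass  # records would be added here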