コード例 #1
0
ファイル: train_model.py プロジェクト: quinlan-lab/kmertools
def model_region_singletons(data_container, vcf_path, fasta_path, kmer_size,
                            region):
    """Accumulate reference k-mer counts and singleton-SNV tallies for a region.

    The region's reference sequence is fetched with ``kmer_mid_idx`` extra
    bases on each side (complemented when ``ek.is_dash(region.strand)`` is
    true) and handed to ``ek.kmer_search``. Every VCF record in the region
    accepted by ``ek.is_singleton_snv`` then adds 1 to a per-k-mer array of
    four counters, indexed by the ALT base in A, C, G, T order.

    Args:
        data_container: Object exposing ``get()`` / ``set()``; the value it
            holds must provide ``add_kmer_counts`` and ``add_transition``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: Length of the sequence context taken around each variant.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.

    Returns:
        None. Results are merged into ``data_container``. If the region cannot
        be fetched from the FASTA, a message is printed to stderr and the
        container is left untouched.
    """
    t0 = time.time()
    reference = Fasta(fasta_path)
    records = VCF(vcf_path)
    # upstream_offset is subtracted from POS to get the slice start;
    # center is both the flank length and the index of the variant base
    # inside the extracted context.
    upstream_offset = int(kmer_size / 2 + 1)
    center = int(upstream_offset - 1)
    try:
        on_minus = region.strand is not None and ek.is_dash(region.strand)
        padded = reference.get_seq(region.chrom, region.start - center,
                                   region.stop + center)
        if on_minus:
            padded = padded.complement
        sequence = padded.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return
    # nprocs=1 due to short region
    region_ref_counts = ek.kmer_search(sequence, kmer_size)
    query = '%s:%s-%s' % (region.chrom, region.start, region.stop)
    # One four-slot counter array per sequence context, created on first use.
    tallies = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    base_slot = {base: slot for slot, base in enumerate('ACGT')}
    for record in records(query):
        if not ek.is_singleton_snv(record):
            continue
        wrapped = Variant(variant=record, fields=['vep'])
        chrom_seq = reference[str(wrapped.CHROM)]
        # Slice bounds place the variant base at index `center`; the check
        # below compares that base with the VCF REF allele.
        context = chrom_seq[(wrapped.POS - upstream_offset):(wrapped.POS +
                                                             center)].seq
        if str(context[center]).upper() != str(record.REF).upper():
            print(
                'WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                (context[center], record.REF),
                file=sys.stderr,
                flush=True)
        if ek.complete_sequence(context):
            tallies[context.upper()][base_slot[wrapped.ALT[0]]] += 1
    # Read-modify-write of the shared accumulator.
    accumulator = data_container.get()
    accumulator.add_kmer_counts(region_ref_counts)
    accumulator.add_transition(tallies)
    data_container.set(accumulator)
    print('Finished region %s in %s' % (str(region), str(time.time() - t0)),
          flush=True)
    return
コード例 #2
0
def process_chrom_bin(region, kmer_size, vcf_path, fasta_path, AF=False):
    """Compute a per-k-mer 'freq' mapping for one chromosome bin.

    Fetches ``region`` from the FASTA (no flanking padding), obtains k-mer
    counts, GC content and N count from ``ek.kmer_search``, and tallies the
    VCF records accepted by ``ek.is_singleton_snv`` by sequence context and
    ALT base.

    Args:
        region: Object with ``chrom``, ``start`` and ``stop``.
        kmer_size: Length of the sequence context taken around each variant.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        AF: When true, each variant contributes ``variant.INFO.get('AF')``
            to a float tally instead of 1 to an integer tally.

    Returns:
        ``(region, dict)`` built from the 'freq' column (including the
        'GC_content' and 'N_count' entries) when both tallies and reference
        counts are non-empty; ``(region, None)`` when either is empty; bare
        ``None`` when the region cannot be fetched from the FASTA.
    """
    t0 = time.time()
    reference = Fasta(fasta_path)
    records = VCF(vcf_path)
    upstream_offset = int(kmer_size / 2 + 1)
    center = int(upstream_offset - 1)
    try:
        sequence = reference.get_seq(region.chrom, region.start, region.stop).seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return
    # nprocs=1 due to short region
    region_ref_counts, gc_content, n_count = ek.kmer_search(
        sequence, kmer_size, count_gc=True, count_n=True)
    query = '%s:%s-%s' % (region.chrom, region.start, region.stop)
    # Float slots when summing allele frequencies, unsigned ints when counting.
    slot_type = 'd' if AF else 'L'
    tallies = defaultdict(lambda: array.array(slot_type, [0, 0, 0, 0]))
    base_slot = {base: slot for slot, base in enumerate('ACGT')}
    for record in records(query):
        if not ek.is_singleton_snv(record):
            continue
        wrapped = Variant(variant=record)
        chrom_seq = reference[str(wrapped.CHROM)]
        # Slice bounds place the variant base at index `center`; the check
        # below compares that base with the VCF REF allele.
        context = chrom_seq[(wrapped.POS - upstream_offset):(wrapped.POS + center)].seq
        if str(context[center]).upper() != str(record.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' % (context[center], record.REF),
                  file=sys.stderr, flush=True)
        if ek.complete_sequence(context):
            if AF:
                tallies[context.upper()][base_slot[wrapped.ALT[0]]] += record.INFO.get('AF')
            else:
                tallies[context.upper()][base_slot[wrapped.ALT[0]]] += 1
    # Nothing to tabulate: report the region with no frequency mapping.
    if not tallies or len(region_ref_counts.keys()) == 0:
        print('Finished region %s in %s' % (str(region), str(time.time() - t0)), flush=True)
        return region, None
    trans_df = pd.DataFrame.from_dict(tallies, orient='index')
    trans_df.sort_index(inplace=True)
    counts_df = pd.DataFrame.from_dict(region_ref_counts, orient='index')
    counts_df.sort_index(inplace=True)
    trans_df['counts'] = counts_df[0]
    # NOTE(review): DataFrame.apply defaults to axis=0 (one call per column);
    # the helper's name suggests a per-row computation - confirm the intended
    # axis against row_multinomial's definition.
    trans_df['freq'] = trans_df.apply(row_multinomial)
    # Region-level statistics are stored as two extra rows of the 'freq' column.
    trans_df.loc['GC_content', 'freq'] = gc_content
    trans_df.loc['N_count', 'freq'] = n_count
    print('Finished region %s in %s' % (str(region), str(time.time() - t0)), flush=True)
    return region, trans_df['freq'].to_dict()
コード例 #3
0
ファイル: train_model.py プロジェクト: quinlan-lab/kmertools
def model_region_nonsingletons(data_container, vcf_path, fasta_path, kmer_size,
                               region, AC_cutoff):
    """Accumulate reference k-mer counts and AC/AN tallies for one region.

    The region's reference sequence is fetched with ``kmer_mid_idx`` extra
    bases on each side (complemented when ``ek.is_dash(region.strand)`` is
    true) and handed to ``ek.kmer_search``. Every VCF record in the region
    accepted by ``ek.is_quality_snv`` adds its ``AC`` and ``AN`` to two
    per-k-mer arrays of four counters, indexed by the ALT base in
    A, C, G, T order.

    Args:
        data_container: Object exposing ``get()`` / ``set()``; the value it
            holds must provide ``add_kmer_counts``, ``add_transition`` and
            ``add_transition2``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: Context length; anything ``int()`` cannot convert, or a
            value below 1, terminates the process with exit status 1.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Forwarded to ``ek.is_quality_snv`` after conversion to
            ``int``; an unconvertible value is replaced by ``None`` with a
            warning on stderr.

    Returns:
        None. Results are merged into ``data_container``. If the region cannot
        be fetched from the FASTA, a message is printed to stderr and the
        container is left untouched.

    Raises:
        SystemExit: If ``kmer_size`` is not a positive integer.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except (TypeError, ValueError):
            # int() raises TypeError for e.g. lists, ValueError for bad text.
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)
    try:
        kmer_size = int(kmer_size)
        if kmer_size < 1:
            raise ValueError
    except (TypeError, ValueError):
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use and is absent when the site module is not loaded.
        sys.exit(1)
    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    # start_idx_offset is subtracted from POS to get the slice start;
    # kmer_mid_idx is both the flank length and the index of the variant base
    # inside the extracted context.
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)
    try:
        minus_strand = region.strand is not None and ek.is_dash(region.strand)
        padded = fasta.get_seq(region.chrom, region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if minus_strand:
            padded = padded.complement
        sequence = padded.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return
    region_ref_counts = ek.kmer_search(
        sequence, kmer_size)  # nprocs=1 due to short region
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(
        region.stop)
    ac_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    an_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    # Slot of each ALT base inside a four-counter array.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    for variant in vcf(r_string):
        if not ek.is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds place the variant base at index kmer_mid_idx; the
        # check below compares that base with the VCF REF allele.
        adj_seq = fasta[str(
            new_var.CHROM)][(new_var.POS -
                             start_idx_offset):(new_var.POS +
                                                kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print(
                'WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                (adj_seq[kmer_mid_idx], variant.REF),
                file=sys.stderr,
                flush=True)
        if ek.complete_sequence(adj_seq):
            kmer = adj_seq.upper()
            alt_slot = nuc_idx[new_var.ALT[0]]
            ac_transitions[kmer][alt_slot] += new_var.AC
            an_transitions[kmer][alt_slot] += new_var.AN
    # Read-modify-write of the shared accumulator.
    temp = data_container.get()
    temp.add_kmer_counts(region_ref_counts)
    temp.add_transition(ac_transitions)
    temp.add_transition2(an_transitions)
    data_container.set(temp)
    print('Finished region %s in %s' % (str(region), str(time.time() - start)),
          flush=True)
    return
コード例 #4
0
def process_bed_region(region, kmer_size, vcf_path, fasta_path, AF=False, delim=','):
    """Print one delimited row of per-k-mer frequencies for a BED region.

    The region's reference sequence is fetched with ``kmer_mid_idx`` extra
    bases on each side (complemented when ``ek.is_dash(region.strand)`` is
    true) and handed to ``ek.kmer_search`` for k-mer counts, GC content and
    N count. VCF records from ``region.vcf_str()`` that pass
    ``ek.is_singleton_snv`` are tallied by sequence context and ALT base.
    For each k-mer, 'freq' is the tally summed over the four ALT bases
    divided by the reference count of that k-mer.

    The printed row is ``region.str_name()`` followed by one value per key of
    ``ek.generate_kmers(kmer_size)`` and then the 'GC_content' and 'N_count'
    values. Keys with no entry print as ``0``. When there are no tallies or
    no reference counts, every value is ``0`` and the row has the same number
    of fields as a populated row.

    Args:
        region: Object with ``chrom``, ``start``, ``stop``, ``strand``,
            ``vcf_str()`` and ``str_name()``.
        kmer_size: Length of the sequence context taken around each variant.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        AF: When true, each variant contributes ``variant.INFO.get('AF')``
            to a float tally instead of 1 to an integer tally.
        delim: Field separator of the printed row.

    Returns:
        None. The row is written to stdout; if the region cannot be fetched
        from the FASTA, a message goes to stderr and no row is printed.
    """
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    # start_idx_offset is subtracted from POS to get the slice start;
    # kmer_mid_idx is both the flank length and the index of the variant base
    # inside the extracted context.
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)
    try:
        minus_strand = region.strand is not None and ek.is_dash(region.strand)
        padded = fasta.get_seq(region.chrom, region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if minus_strand:
            padded = padded.complement
        sequence = padded.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region), file=sys.stderr)
        return
    region_ref_counts, gc_content, n_count = ek.kmer_search(sequence, kmer_size, count_gc=True,
                                                            count_n=True)  # nprocs=1 due to short region
    # Float slots when summing allele frequencies, unsigned ints when counting.
    typecode = 'd' if AF else 'L'
    transitions = defaultdict(lambda: array.array(typecode, [0, 0, 0, 0]))
    # Slot of each ALT base inside a four-counter array.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    for variant in vcf(region.vcf_str()):
        if not ek.is_singleton_snv(variant):
            continue
        new_var = Variant(variant=variant)
        # Slice bounds place the variant base at index kmer_mid_idx; the
        # check below compares that base with the VCF REF allele.
        adj_seq = fasta[str(new_var.CHROM)][(new_var.POS - start_idx_offset):(new_var.POS + kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print('WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' % (adj_seq[kmer_mid_idx], variant.REF),
                  file=sys.stderr, flush=True)
        if ek.complete_sequence(adj_seq):
            if AF:
                transitions[adj_seq.upper()][nuc_idx[new_var.ALT[0]]] += variant.INFO.get('AF')
            else:
                transitions[adj_seq.upper()][nuc_idx[new_var.ALT[0]]] += 1

    # A single column list drives both the populated and the all-zero row, so
    # every printed row has the same number of fields.
    columns = list(ek.generate_kmers(kmer_size))
    columns.append('GC_content')
    columns.append('N_count')

    if len(transitions.keys()) > 0 and len(region_ref_counts.keys()) > 0:
        bin_trans = pd.DataFrame.from_dict(transitions, orient='index')
        bin_trans.sort_index(inplace=True)
        bin_trans['tot'] = bin_trans.sum(axis=1)
        bin_kcounts = pd.DataFrame.from_dict(region_ref_counts, orient='index')
        bin_kcounts.sort_index(inplace=True)
        bin_kcounts.columns = ['counts']
        # Outer join keeps k-mers seen only in the reference or only at variants.
        kmer_freq = pd.concat([bin_trans.loc[:, 'tot'], bin_kcounts], join='outer', axis=1, sort=True)
        kmer_freq.fillna(0, inplace=True)
        kmer_freq['freq'] = kmer_freq['tot'] / kmer_freq['counts']
        # Region-level statistics are stored as two extra rows of 'freq'.
        kmer_freq.loc['GC_content', 'freq'] = gc_content
        kmer_freq.loc['N_count', 'freq'] = n_count
        freq = kmer_freq['freq']
        fields = []
        for key in columns:
            try:
                fields.append(str(freq.loc[key]))
            except KeyError:
                # k-mer absent from both the region and its variants.
                fields.append('0')
    else:
        fields = ['0'] * len(columns)
    print(delim.join([region.str_name()] + fields), flush=True)
コード例 #5
0
def model_region(datacontainer, vcf_path, fasta_path, kmer_size, region,
                 AC_cutoff):
    """Accumulate k-mer counts and singleton/AC/AN/AF tallies for one region.

    The region's reference sequence is fetched with ``kmer_mid_idx`` extra
    bases on each side (complemented when ``is_dash(region.strand)`` is true)
    and handed to ``kmer_search``. Every VCF record in the region accepted by
    ``is_quality_snv`` adds its INFO ``AC``, ``AN`` and ``AF`` values to
    per-k-mer arrays of four slots indexed by the ALT base in A, C, G, T
    order; records whose ``AC`` equals 1 additionally increment a singleton
    tally.

    Args:
        datacontainer: Object exposing ``get()`` / ``set()``; the value it
            holds must provide ``add_kmer_counts`` and
            ``add_transition(tally, label)``.
        vcf_path: Path passed to ``VCF``.
        fasta_path: Path passed to ``Fasta``.
        kmer_size: Context length; anything ``int()`` cannot convert, or a
            value below 1, terminates the process with exit status 1.
        region: Object with ``chrom``, ``start``, ``stop`` and ``strand``.
        AC_cutoff: Forwarded to ``is_quality_snv`` after conversion to
            ``int``; an unconvertible value is replaced by ``None`` with a
            warning on stderr.

    Returns:
        None. Results are merged into ``datacontainer`` under the labels
        'singleton', 'AC', 'AN' and 'AF'. If the region cannot be fetched
        from the FASTA, a message is printed to stderr and the container is
        left untouched.

    Raises:
        SystemExit: If ``kmer_size`` is not a positive integer.
    """
    if AC_cutoff is not None:
        try:
            AC_cutoff = int(AC_cutoff)
        except (TypeError, ValueError):
            # int() raises TypeError for e.g. lists, ValueError for bad text.
            AC_cutoff = None
            print(
                'AC cutoff must be a positive integer. Ignoring user value and using SNVs with any AC.',
                file=sys.stderr,
                flush=True)
    try:
        kmer_size = int(kmer_size)
        if kmer_size < 1:
            raise ValueError
    except (TypeError, ValueError):
        print('kmer_size must be a positive integer. Please check arguments.',
              file=sys.stderr,
              flush=True)
        # sys.exit rather than the site-provided exit(), which is meant for
        # interactive use and is absent when the site module is not loaded.
        sys.exit(1)
    start = time.time()
    fasta = Fasta(fasta_path)
    vcf = VCF(vcf_path)
    # start_idx_offset is subtracted from POS to get the slice start;
    # kmer_mid_idx is both the flank length and the index of the variant base
    # inside the extracted context.
    start_idx_offset = int(kmer_size / 2 + 1)
    kmer_mid_idx = int(start_idx_offset - 1)
    try:
        minus_strand = region.strand is not None and is_dash(region.strand)
        padded = fasta.get_seq(region.chrom, region.start - kmer_mid_idx,
                               region.stop + kmer_mid_idx)
        if minus_strand:
            padded = padded.complement
        sequence = padded.seq.upper()
    except (KeyError, FetchError):
        print('Region %s not found in fasta, continuing...' % str(region),
              file=sys.stderr,
              flush=True)
        return
    region_ref_counts = kmer_search(sequence,
                                    kmer_size)  # nprocs=1 due to short region
    r_string = str(region.chrom) + ':' + str(region.start) + '-' + str(
        region.stop)
    singleton_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    ac_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    an_transitions = defaultdict(lambda: array.array('L', [0, 0, 0, 0]))
    af_transitions = defaultdict(lambda: array.array('d', [0, 0, 0, 0]))
    # Slot of each ALT base inside a four-element tally array.
    nuc_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    for variant in vcf(r_string):
        if not is_quality_snv(variant, AC_cutoff=AC_cutoff):
            continue
        # Slice bounds place the variant base at index kmer_mid_idx; the
        # check below compares that base with the VCF REF allele.
        adj_seq = fasta[str(
            variant.CHROM)][(variant.POS -
                             start_idx_offset):(variant.POS +
                                                kmer_mid_idx)].seq
        if str(adj_seq[kmer_mid_idx]).upper() != str(variant.REF).upper():
            print(
                'WARNING: Reference mismatch\tFasta REF: %s\tVCF REF: %s' %
                (adj_seq[kmer_mid_idx], variant.REF),
                file=sys.stderr,
                flush=True)
        if complete_sequence(adj_seq):
            kmer = adj_seq.upper()
            alt_slot = nuc_idx[variant.ALT[0]]
            allele_count = variant.INFO.get('AC')
            ac_transitions[kmer][alt_slot] += allele_count
            an_transitions[kmer][alt_slot] += variant.INFO.get('AN')
            af_transitions[kmer][alt_slot] += variant.INFO.get('AF')
            if allele_count == 1:
                singleton_transitions[kmer][alt_slot] += 1
    data = {
        'singleton': singleton_transitions,
        'AC': ac_transitions,
        'AN': an_transitions,
        'AF': af_transitions
    }
    # Read-modify-write of the shared accumulator.
    temp = datacontainer.get()
    temp.add_kmer_counts(region_ref_counts)
    for k, v in data.items():
        temp.add_transition(v, k)

    datacontainer.set(temp)
    print('Finished region %s in %s' % (str(region), str(time.time() - start)),
          flush=True)
    return