Ejemplo n.º 1
0
def kmer_sample_db(data,
                   kim_file,
                   contains_sample_kmer_file,
                   lock,
                   truths=None):
    """Convert kmer/sample lines into ``sample<TAB>kmer-id`` rows and write them.

    Each line of ``data`` is split on tabs. The first field is the kmer; every
    following field is split on commas and only its first component (the
    sample) is used. One output row is produced per (kmer, sample) pair, and
    the rows belonging to one input line are joined with newlines into a
    single list entry.

    Args:
        data: Iterable of tab-separated strings.
        kim_file: Path handed to ``load_pickle``. The loaded object is indexed
            by kmer (presumably a kmer -> id mapping; confirm against the
            code that creates it).
        contains_sample_kmer_file: Output destination passed to
            ``write_list`` / ``write_files``.
        lock: Lock guarding the output file. It must support the
            context-manager protocol (``threading`` and ``multiprocessing``
            locks do).
        truths: Unused; kept so existing callers keep working.

    Raises:
        KeyError: If a kmer that has at least one sample is missing from the
            loaded mapping (assuming the mapping is a dict).
    """
    kim = load_pickle(kim_file)
    kmer_sample_chunk = []
    for line in data:
        kmer, *sample_fields = line.split('\t')

        kmer_sample_lines = []
        for sample_field in sample_fields:
            sample = sample_field.split(',')
            # TODO: add some perturbation of sample[1] to the end to get
            # different truth values for different num CNVs
            kmer_sample_lines.append(f'{sample[0]}\t{kim[kmer]}')
        kmer_sample_chunk.append('\n'.join(kmer_sample_lines))

        # write every 500k to limit memory usage
        if len(kmer_sample_chunk) >= 500000:
            # ``with`` guarantees the lock is released even if the write
            # raises; a bare acquire()/release() pair would leave it held
            # and block every other worker.
            with lock:
                write_list(kmer_sample_chunk, contains_sample_kmer_file)
            kmer_sample_chunk = []
    # Flush whatever is left; write_files is given the lock itself
    # (presumably it handles locking internally -- confirm).
    write_files(lock, (kmer_sample_chunk, contains_sample_kmer_file))
Ejemplo n.º 2
0
def process(data, lock, pim, kim_file, fsa_file, scored_kmers_file):
    """Resolve index triples and write them out in two formats.

    Every line of ``data`` is split on whitespace. The first field is
    converted to ``int`` and looked up in the object loaded from
    ``kim_file``; the second is converted to ``int`` and looked up in
    ``pim``; the third is passed through unchanged.

    Two lists are then handed to ``write_files`` together with ``lock``:

    * tab-joined triples, destined for ``scored_kmers_file``;
    * ``>index`` header lines followed by the first element of each triple
      (a FASTA-like layout), destined for ``fsa_file``. The index is the
      position of the triple within this call's ``data``.

    Args:
        data: Iterable of whitespace-separated strings with at least three
            fields each.
        lock: Lock object forwarded to ``write_files``.
        pim: Container indexed by the integer value of the second field.
        kim_file: Path handed to ``load_pickle``; the result is indexed by
            the integer value of the first field.
        fsa_file: Destination for the ``>index`` records.
        scored_kmers_file: Destination for the tab-joined triples.
    """
    kim = load_pickle(kim_file)

    resolved = []
    for raw in data:
        fields = raw.split()
        first = kim[int(fields[0])]
        second = pim[int(fields[1])]
        resolved.append((first, second, fields[2]))

    fasta_records = []
    for position, triple in enumerate(resolved):
        fasta_records.append(f'>{position}\n{triple[0]}')
    scored_rows = list(map('\t'.join, resolved))

    write_files(lock, (scored_rows, scored_kmers_file),
                (fasta_records, fsa_file))
Ejemplo n.º 3
0
def kmer_pheno_db(data,
                  kim_file,
                  value_kmer_pheno_file,
                  truth_kmer_pheno_file,
                  baseline_kmer_pheno_file,
                  lock,
                  truths=None,
                  baseline=None):
    """Write ``kmer-id<TAB>pheno`` rows, plus optional truth/baseline subsets.

    Each line of ``data`` is split on tabs: the first field is the kmer and
    every following field is a phenotype. For every (kmer, pheno) pair a row
    is added to the value output. When ``truths`` is truthy and
    ``kmer_in_truths(kmer, truths, pheno)`` is truthy, the same row is added
    to the truth output. When ``baseline`` is truthy, the result of
    ``kmer_in_truths(kmer, baseline, pheno)`` decides the baseline output:
    ``True`` adds the plain row, a value strictly between 0 and 1 adds the
    row with that score appended as a third column, anything else adds
    nothing.

    Rows are flushed through ``write_files`` whenever the value chunk reaches
    500000 entries, and once more at the end.

    Args:
        data: Iterable of tab-separated strings.
        kim_file: Path handed to ``load_pickle``; the result is indexed by
            kmer (presumably a kmer -> id mapping; confirm).
        value_kmer_pheno_file: Destination for all rows.
        truth_kmer_pheno_file: Destination for rows matching ``truths``.
        baseline_kmer_pheno_file: Destination for rows matching ``baseline``.
        lock: Lock object forwarded to ``write_files``.
        truths: Optional truth data passed to ``kmer_in_truths``.
        baseline: Optional baseline data passed to ``kmer_in_truths``.
    """
    kim = load_pickle(kim_file)
    kmer_pheno_chunk = []
    # ``None`` (rather than an empty list) marks an output that is disabled;
    # it is passed through to write_files unchanged, as before.
    truths_chunk = [] if truths else None
    baseline_chunk = [] if baseline else None
    for line in data:
        linelist = line.split('\t')
        kmer = linelist[0]
        for pheno in linelist[1:]:
            row = f'{kim[kmer]}\t{pheno}'
            kmer_pheno_chunk.append(row)
            if truths and kmer_in_truths(kmer, truths, pheno):
                truths_chunk.append(row)
            if baseline:
                score = kmer_in_truths(kmer, baseline, pheno)
                if score is True:
                    baseline_chunk.append(row)
                elif score is not False and 0.0 < score < 1.0:
                    baseline_chunk.append(f'{row}\t{score}')
        # write every 500k to limit memory usage
        if len(kmer_pheno_chunk) >= 500000:
            write_files(lock, (kmer_pheno_chunk, value_kmer_pheno_file),
                        (truths_chunk, truth_kmer_pheno_file),
                        (baseline_chunk, baseline_kmer_pheno_file))
            kmer_pheno_chunk = []
            if truths_chunk is not None:
                truths_chunk = []
            # The baseline chunk must be reset too; otherwise rows already
            # flushed are written again on every later flush.
            if baseline_chunk is not None:
                baseline_chunk = []
    write_files(lock, (kmer_pheno_chunk, value_kmer_pheno_file),
                (truths_chunk, truth_kmer_pheno_file),
                (baseline_chunk, baseline_kmer_pheno_file))
Ejemplo n.º 4
0
def filter_kmers(data, thresh, dfdisp, dfnodisp, kmer_sample_file,
                 kmer_pheno_file, lock, samples_thresh=5):
    """Keep kmers that pass the frequency test for at least one phenotype.

    ``data`` is consumed destructively: entries are popped from the end until
    the list is empty, so the caller's list is empty when this returns.

    For every entry, ``dfdisp`` and ``dfnodisp`` are indexed with the first
    element of each sample item (``line[1:]``) and the results are summed.
    A position passes when ``disp + nodisp >= samples_thresh`` and
    ``disp / (disp + nodisp + .01) > thresh``. Entries with no passing
    position are dropped. For the others, two lines are queued:

    * the kmer followed by the passing positions, tab-separated, for
      ``kmer_pheno_file``;
    * ``format_tuple`` applied to every element of the entry (including the
      kmer itself), tab-separated, for ``kmer_sample_file``.

    Queued lines are flushed through ``write_files`` every 500000 kept kmers
    and once more at the end.

    Args:
        data: Mutable list of entries; ``entry[0]`` is the kmer (a string) and
            each later element is indexable, with element ``[0]`` used as the
            sample key.
        thresh: Minimum ratio ``disp / (disp + nodisp + .01)``, exclusive.
        dfdisp: Container indexed by sample key (presumably per-phenotype
            counts of samples displaying the phenotype; confirm).
        dfnodisp: Same layout as ``dfdisp`` for samples not displaying it.
        kmer_sample_file: Destination for the formatted sample lines.
        kmer_pheno_file: Destination for the kmer/phenotype lines.
        lock: Lock object forwarded to ``write_files``.
        samples_thresh: Minimum number of samples (``disp + nodisp``) a
            position needs before the ratio test is applied. Defaults to 5,
            the value that used to be hard-coded.
    """
    printd('Filtering kmers...')
    kmer_samples = []
    kmer_phenos = []
    while data:
        line = data.pop()
        kmer = line[0]
        # collect resistant/vulnerable frequencies for each antibiotic for
        # this unitig
        disp = sum(dfdisp[sample_id[0]] for sample_id in line[1:])
        nodisp = sum(dfnodisp[sample_id[0]] for sample_id in line[1:])
        total = disp + nodisp

        # 1 test per antibiotic; unitig needs to pass only 1 to avoid
        # getting filtered out. The .01 keeps the division defined when a
        # position has no samples at all.
        passing = np.where((total >= samples_thresh)
                           & (disp / (total + .01) > thresh))[0]
        if passing.size == 0:
            continue
        kmer_phenos.append('\t'.join([kmer, *map(str, passing)]))
        kmer_samples.append('\t'.join(map(format_tuple, line)))

        # write every 500K kmers to keep memory consumption under control
        if len(kmer_phenos) >= 500000:
            write_files(lock, (kmer_samples, kmer_sample_file),
                        (kmer_phenos, kmer_pheno_file))
            kmer_samples = []
            kmer_phenos = []
    write_files(lock, (kmer_samples, kmer_sample_file),
                (kmer_phenos, kmer_pheno_file))
    printd('Finished filtering kmers.')