Example #1
def create_pheno_int_map(phenos, pim_file):
    printd('Creating pheno int map...')
    with open(phenos, 'r') as f:
        line = f.readline()  # only need first line which contains headers
    pim = {}
    for i, p in enumerate(line.split()[1:]):  # skip id column
        pim[i] = p
        pim[p] = i
    with open(pim_file, 'wb') as f:
        pickle.dump(pim, f)
    printd('Successfully created pheno int map.')
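
Usage sketch (not part of the original listing): reloading the pickled bidirectional map. The file name is hypothetical, and printd is a project-specific helper not shown in these examples, so a plausible stand-in is included as an assumption.

import pickle

# assumed stand-in for the project's printd debug-print helper
def printd(*args, **kwargs):
    print(*args, **kwargs)

# reload the map written by create_pheno_int_map:
# column index -> phenotype name and phenotype name -> column index
with open('pheno_int_map.pkl', 'rb') as f:  # hypothetical pim_file path
    pim = pickle.load(f)

first_pheno = pim[0]          # name of the first phenotype column
assert pim[first_pheno] == 0  # reverse lookup returns the column index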
Example #2
def create_kmer_int_map(kmers, kim_file):
    printd('Creating kmer int map...')
    kim = {}
    with open(kmers, 'r') as f:
        lines = f.readlines()
    for i, line in enumerate(lines):
        u = line.split('\t')[0]
        kim[u] = i
        kim[i] = u
    with open(kim_file, 'wb') as f:
        pickle.dump(kim, f)
    printd('Successfully created kmer int map.')
Example #3
def sample_kmers(data, n, seed=None):
    printd('Sampling kmers...')
    if seed is None:
        # draw a fresh seed per call; a randint() default argument would be
        # evaluated only once, at function definition time
        seed = randint(1, 100000)
    sample_matrix = np.zeros((n, n))
    rng = Random(seed)
    num_kmers = int(len(data) * 0.05)
    sampled = rng.sample(data, num_kmers)

    for line in sampled:
        samplelist = line[1:]
        for i, s1 in enumerate(samplelist):
            for s2 in samplelist[i:]:
                sample_matrix[s1[0]][s2[0]] += 1
                sample_matrix[s2[0]][s1[0]] += 1
    printd('Finished sampling kmers.')
    return num_kmers, sample_matrix
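
A toy call, purely for illustration: each entry mirrors the (kmer, (sample_id, count), ...) layout built later in create_kmer_sample_map, with invented k-mers and counts; numpy, random.Random/randint and printd are assumed to be in scope as in the original project.

# 40 entries so the 5% draw selects 2 kmers; sample ids are 0, 1 and 2
toy_data = [('ACGTACGT', (0, 2), (1, 1)), ('TTGCAAGC', (1, 3), (2, 1))] * 20
num_kmers, sample_matrix = sample_kmers(toy_data, n=3, seed=42)
print(num_kmers)      # 2
print(sample_matrix)  # symmetric matrix of co-occurrence counts between samples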
Example #4
def main():
    # load params 
    params = get_params()
    project = params['project']
    k = params['k']

    # define file paths
    samples_file = join(project, 'data', 'raw', params['sample'])
    outfile = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    catted_samples = join(project, 'data', 'preprocessed', 'samples.fa')

    # check if output file exists; if so, do nothing.
    if file_exists(outfile):
        exit(0)

    # create catted samples file if it does not exist.
    if not file_exists(catted_samples):
        cat_samples(samples_file, catted_samples)

    # multiprocessing queue for transferring data to the main thread
    q = Manager().Queue()

    # invoke process(...) on catted_samples files with kwargs, for each thread
    process_file(process, catted_samples, q=q, k=k)
    
    # consolidate all threads' counters into single counter holding all kmers
    counter = Counter()
    while not q.empty():
        counter.update(q.get())
    # fold each k-mer's count into its complement so that only one member of
    # every complementary pair ends up with a nonzero count
    for kmer in counter.keys():
        comp = complement(kmer)
        if comp in counter:
            comp_count = counter[comp]
            counter[comp] = 0
            counter[kmer] += comp_count
    counter = +counter  # unary plus drops entries whose count is now <= 0
    printd('Finished consolidating counters.')

    # write counter to file
    write_dict(counter, outfile, sep='\t')
    
    # remove catted samples file
    if file_exists(catted_samples):
        remove(catted_samples)
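
main() depends on a complement helper that is not shown in the listing. A minimal sketch under the assumption that it returns the reverse complement of a DNA k-mer; the original project's implementation may differ.

# assumed behaviour: reverse complement of a DNA k-mer
_COMP = str.maketrans('ACGT', 'TGCA')

def complement(kmer):
    return kmer.translate(_COMP)[::-1]

# e.g. complement('AAGT') == 'ACTT'; main() uses it to fold each k-mer's
# count into its partner so only one strand of the pair keeps a nonzero count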
Example #5
def consolidate(data, k):
    printd('Consolidating chunk...')
    prev_line = data.pop()
    prev_unitig = prev_line[0]
    unitigs = []
    while data:
        line = data.pop()
        this_unitig = line[0]

        # kmers are sequential and the same set of samples contain both kmers
        if prev_unitig[0:k - 1] == this_unitig[-(k - 1):] \
                and len(line) == len(prev_line) \
                and set(line[1:]) == set(prev_line[1:]):
            this_unitig = this_unitig[0] + prev_unitig
            line = (this_unitig, *line[1:])
        else:
            unitigs.append(prev_line)
        prev_line = line
        prev_unitig = this_unitig
    # the last surviving line never gets appended inside the loop, so keep it here
    unitigs.append(prev_line)
    printd('Finished consolidating chunk.')
    return unitigs
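
A toy walk-through of the merge rule (not in the original): two invented 4-mers (k=4) that overlap by k-1 = 3 bases and are carried by the same sample collapse into one unitig. Note that data is consumed from the end, so the later entry becomes prev_line first.

chunk = [('ACGT', (0, 1)), ('CGTA', (0, 1))]
# 'CGTA'[:3] == 'ACGT'[-3:] == 'CGT' and both lines list only sample 0,
# so the pair merges into the single unitig 'ACGTA'
print(consolidate(chunk, 4))  # [('ACGTA', (0, 1))]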
Example #6
def filter_kmers(data, thresh, dfdisp, dfnodisp, kmer_sample_file,
                 kmer_pheno_file, lock):
    printd('Filtering kmers...')
    nphenos = dfdisp.shape[1]
    kmer_samples = []
    kmer_phenos = []
    while data:
        line = data.pop()
        kmer = line[0]
        # collect resistant/vulnerable frequencies for each antibiotic for
        # this unitig
        disp = sum(dfdisp[sample_id[0]] for sample_id in line[1:])
        nodisp = sum(dfnodisp[sample_id[0]] for sample_id in line[1:])

        # 1 test per antibiotic; unitig needs to pass only 1 to avoid
        # getting filtered out
        samples_thresh = 5
        a = np.where((disp + nodisp >= samples_thresh) \
                    & (disp / (disp + nodisp + .01) > thresh))[0]
        if a.size == 0:
            continue
        kmer_pheno_chunk = [kmer]
        for pheno in a:
            kmer_pheno_chunk.append(str(pheno))
        kmer_phenos.append('\t'.join(kmer_pheno_chunk))

        kmer_samples.append('\t'.join(map(format_tuple, line)))
        # write every 500K kmers to keep memory consumption under control
        if len(kmer_phenos) >= 500000:
            write_files(lock, (kmer_samples, kmer_sample_file),
                        (kmer_phenos, kmer_pheno_file))
            kmer_samples = []
            kmer_phenos = []
    write_files(lock, (kmer_samples, kmer_sample_file),
                (kmer_phenos, kmer_pheno_file))
    printd('Finished filtering kmers.')
    return
Example #7
def create_kmer_sample_map(data, raw, q, k, upper, lower, thresh, dfdisp,
                           dfnodisp, sim, n, lock, kmer_sample_file,
                           kmer_pheno_file):
    printd('Creating kmer sample map...')
    # get all kmers in chunk and complement them
    kmers = {}
    for line in data:
        kmer, count = line.split('\t')
        if kmer_frequency_fails(count, upper, lower):
            continue
        kmers[kmer] = Counter()

    # map all kmers in chunk to samples containing them
    for count, (raw_id, seq) in enumerate(raw.items()):
        sample_id = sim.get(raw_id, None)
        if sample_id is None:
            continue
        for c_id, contig in enumerate(seq):
            contig_len = len(contig)
            if contig_len >= k:  # ensure this contig is long enough to sample
                for i in range(contig_len - k + 1):
                    kmer = contig[i:i + k]
                    kmerlist = kmers.get(kmer, None)
                    if kmerlist is not None:
                        kmerlist[sample_id] += 1
                    else:
                        complist = kmers.get(complement(kmer), None)
                        if complist is not None:
                            complist[sample_id] += 1
    kmers = [(key, *v.items()) for key, v in kmers.items()]
    printd('Finished creating kmer sample map.')
    num_kmers, sample_matrix = sample_kmers(kmers, n)
    printd('Putting data in queue')
    q.put((num_kmers, sample_matrix))
    printd('Finished putting data in queue')
    if kmer_sample_file is None and kmer_pheno_file is None:
        return

    # consolidate() will clear kmers list as it builds unitigs list
    # with net 0 memory gain
    kmers = consolidate(kmers, k)
    # filter_unitigs() will clear unitigs list as it builds new unitigs list
    # with net 0 memory gain
    filter_kmers(kmers, thresh, dfdisp, dfnodisp, kmer_sample_file,
                 kmer_pheno_file, lock)
    return
Example #8
def consolidate(name, unitigs, outdir):
    unitigs = list(unitigs.iloc[:, 0])
    # tag each unitig with its reciprocal rank (1 for the top-ranked kmer)
    unitigs = [(u, [1/(i+1)]) for i, u in enumerate(unitigs)]
    printd('Original num kmers:', len(unitigs))
    for k in range(30, 20, -1):
        unitigs = consolidate_model(unitigs, k, p=False)
        printd(f'K: {k}, num kmers: {len(unitigs)}')
    printd('Final num kmers:', len(unitigs))
    # score each merged unitig by the harmonic mean of its members' original
    # ranks, then sort so the best-ranked unitigs come first
    unitigs = [(unitig, 1 / (sum(ranks) / len(ranks))) for unitig, ranks in unitigs]
    unitigs = sorted(unitigs, key=lambda x: x[1])
    unitigs = [u[0] for u in unitigs]
    write_list(unitigs, join(outdir, name))
Example #9
def create_sample_int_map(samples, phenos, sim_file):
    printd('Creating sample int map...')
    sim = {}
    with open(samples, 'r') as f:
        lines = f.readlines()
    phenosdf = pd.read_csv(phenos, sep='\t', index_col=0)
    phenosdf.dropna(how='all', inplace=True)
    droppedsamples = []
    i = 0
    for line in lines[1:]:  # ignore header
        name = line.split('\t')[0]
        if name in phenosdf.index:
            sim[name] = i
            sim[i] = name
            i += 1
        else:
            droppedsamples.append(name)
    with open(sim_file, 'wb') as f:
        pickle.dump(sim, f)
    if len(droppedsamples) > 0:
        printd(('Ignoring samples not present in both'
                f' sample and pheno files: {droppedsamples}'))
    printd('Successfully created sample int map.')
Example #10
def similar_sample(sample_matrix, num_kmers, similarities_tsv, hist_orig_file,
                   hist_sim_scaled_file, hist_dissim_scaled_file,
                   similarities_file, dissimilarities_file):
    if not file_exists(similarities_tsv):
        # scale similarities matrix by the mean num sampled kmers each sample
        # shares with itself. Then, normalize to [0,1]. Then remove the diagonal
        # and the lower triangle of the array (since it is symmetric about the
        # major diagonal), and finally round values to 4 decimal places.
        mean_shared_w_self = sample_matrix.diagonal().mean()
        sample_matrix /= mean_shared_w_self
        sample_matrix += 0.001  # ensure all values are nonzero
        sample_matrix *= 1.0 / sample_matrix.max()
        np.fill_diagonal(sample_matrix, np.nan)
        sample_matrix = np.triu(sample_matrix)
        sample_matrix = np.round(sample_matrix, 4)  # np.round returns a new array

        df = pd.DataFrame(sample_matrix)

        # dump to tsv file for ease of restoring, and because tsv file of similarities
        # is a common input to other mGWAS programs
        df.to_csv(similarities_tsv, sep='\t')

    else:
        df = pd.read_csv(similarities_tsv, sep='\t', index_col=0)

    # create similarity histogram and save it
    plt.hist(df.values, facecolor='green')
    plt.savefig(hist_orig_file, dpi=150)
    plt.clf()
    df = df.stack()
    df = df.reset_index()
    df = df[df[0] > 0]  # remove the lower half of the triangle
    # set thresholds: keep only the top 10% most similar pairs and the
    # bottom 10% most dissimilar pairs
    highthresh = 0.9
    lowthresh = 0.1
    # find the numeric cutoffs corresponding to those quantiles
    highcutoff = df[0].quantile(highthresh)
    lowcutoff = df[0].quantile(lowthresh)
    # cut out everything in the middle; only keep the very similar and very dissimilar pairs
    simdf = df[df[0] >= highcutoff].copy(deep=True)
    dissimdf = df[df[0] <= lowcutoff].copy(deep=True)
    dissimdf[0] = 1 - dissimdf[0]
    dfs = (simdf, dissimdf)
    # determine new min, max, range
    files = ((hist_sim_scaled_file, similarities_file),
             (hist_dissim_scaled_file, dissimilarities_file))
    for i, (pngfile, outfile) in enumerate(files):
        df = dfs[i]
        min_ = df[0].min()
        max_ = df[0].max()
        range_ = max_ - min_
        # shift df left by the min so the new min is 0
        df[0] -= min_
        # squeeze the data into [0, 0.5]; both branches currently use the same
        # scale factor and intercept, so similarities and dissimilarities are
        # each mapped onto [0.5, 1]
        if i == 0:  # high (similar pairs)
            scale_factor = 2
            intercept = 0.5
        else:  # low (dissimilar pairs)
            scale_factor = 2
            intercept = 0.5
        df[0] /= range_ * scale_factor
        # shift the data into [0.5, 1]
        df[0] += intercept
        # create similarity histogram and save it
        try:
            plt.hist(df[0], bins=50, facecolor='green')
            plt.savefig(pngfile, dpi=150)
            plt.clf()
        except ValueError as e:
            printd(f'Unable to generate histogram of scaled data: {e}')

        # write to csv
        df.to_csv(outfile, sep='\t', index=False, header=False)
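
A tiny numeric sketch (invented values) of the quantile split and rescaling applied above, shown for the "similar" branch only; only pandas is assumed.

import pandas as pd

# invented pairwise similarities, already normalized to (0, 1]
vals = pd.Series([0.05, 0.12, 0.2, 0.4, 0.55, 0.6, 0.8, 0.82, 0.9, 0.92, 0.95])
highcutoff = vals.quantile(0.9)         # keep only the most similar pairs
sim = vals[vals >= highcutoff].copy()   # here: 0.92 and 0.95
rng = sim.max() - sim.min()
sim = (sim - sim.min()) / (rng * 2) + 0.5  # map onto [0.5, 1], as in the loop above
print(sim.tolist())  # [0.5, 1.0]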