def kmer_sample_db(data, kim_file, contains_sample_kmer_file, lock, truths=None):
    """Convert kmer->sample map lines into `sample_id\\tkmer_int` observation lines.

    Each input line is `kmer\\tsampleA,meta\\tsampleB,meta...`; for every sample
    entry one output line `sample_id\\t<kmer int id>` is appended to
    `contains_sample_kmer_file`. Writes are flushed in chunks to bound memory.

    Args:
        data: iterable of tab-separated kmer->sample map lines.
        kim_file: pickle path for the kmer -> int id map.
        contains_sample_kmer_file: output observation file path.
        lock: multiprocessing lock guarding file writes.
        truths: accepted for interface parity with kmer_pheno_db; unused here.
    """
    kim = load_pickle(kim_file)
    kmer_sample_chunk = []
    for line in data:
        linelist = line.split('\t')
        kmer = linelist[0]
        kmer_sample_lines = []
        for sample_ in linelist[1:]:
            sample = sample_.split(',')
            kmer_sample_lines.append(f'{sample[0]}\t{kim[kmer]}')
            # add some perturbation of sample[1] to the end to get different
            # truth values for different num CNVs
        kmer_sample_chunk.append('\n'.join(kmer_sample_lines))
        # write every 500k to limit memory usage
        if len(kmer_sample_chunk) >= 500000:
            # BUG FIX: the manual acquire()/release() pair leaked the lock if
            # write_list raised; the context manager always releases it.
            with lock:
                write_list(kmer_sample_chunk, contains_sample_kmer_file)
            kmer_sample_chunk = []
    write_files(lock, (kmer_sample_chunk, contains_sample_kmer_file))
def main():
    """Post-process scored kmers: map int ids back to strings, emit fasta + tsv."""
    # get params
    params = get_params()
    project = params['project']
    # define file paths
    INPUT_FILE = join(project, 'data', 'postprocessed', 'KMERPHENO.txt')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    fsa_file = join(project, 'data', 'postprocessed', 'scored_kmers.fsa')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    scored_kmers_file = join(project, 'data', 'postprocessed', 'scored_kmers.txt')
    outdir = join(project, 'data', 'postprocessed')
    # skip outputs that already exist by passing None for their path
    if file_exists(fsa_file):
        fsa_file = None
    if file_exists(scored_kmers_file):
        scored_kmers_file = None
    if fsa_file or scored_kmers_file:
        lock = Manager().Lock()
        pim = load_pickle(pim_file)
        # BUG FIX: the original passed undefined names `uim_file` and
        # `scored_unitigs_file` (leftovers from a unitig variant), which raised
        # NameError. `process` expects kim_file / scored_kmers_file, which are
        # exactly what this function defines.
        process_file(process, INPUT_FILE, lock=lock, pim=pim, kim_file=kim_file,
                     fsa_file=fsa_file, scored_kmers_file=scored_kmers_file)
    # NOTE(review): if scored_kmers_file already existed it is None here —
    # confirm separate_phenos treats None as "re-use existing file" or skip.
    separate_phenos(scored_kmers_file, outdir, params['separate-phenos'],
                    params['no-consolidate'])
def main():
    """Load the precomputed truth/unitig cross product and convert its sample
    ids to phenotype values."""
    # Earlier pipeline steps that produced the pickle, kept for provenance:
    # truths = get_truth_kmer_seqs_that_occur_in_data()
    # usm = read_unitig_sample_map_into_dict()
    # crossed = cross_truth_seqs_with_usm(usm, truths)
    # dump_pickle(crossed, 'crossed_truth.pkl')
    crossed_truths = load_pickle('crossed_target.pkl')
    convert_sample_ids_to_pheno_values(crossed_truths)
def process(data, lock, pim, kim_file, fsa_file, scored_kmers_file):
    """Translate int-coded `kmer pheno score` lines back to strings and write
    both a tsv of scored kmers and a fasta file of the kmer sequences."""
    kim = load_pickle(kim_file)
    rows = []
    for line in data:
        fields = line.split()
        # columns: kmer int id, pheno int id, score (kept as text)
        rows.append((kim[int(fields[0])], pim[int(fields[1])], fields[2]))
    fasta_records = []
    tsv_lines = []
    for idx, row in enumerate(rows):
        fasta_records.append(f'>{idx}\n{row[0]}')
        tsv_lines.append('\t'.join(row))
    write_files(lock, (tsv_lines, scored_kmers_file), (fasta_records, fsa_file))
def kmer_pheno_db(data, kim_file, value_kmer_pheno_file, truth_kmer_pheno_file,
                  baseline_kmer_pheno_file, lock, truths=None, baseline=None):
    """Emit `kmer_int\\tpheno` target lines, plus optional truth and baseline
    observation lines, from kmer->pheno map lines.

    Args:
        data: iterable of tab-separated `kmer\\tpheno1\\tpheno2...` lines.
        kim_file: pickle path for the kmer -> int id map.
        value_kmer_pheno_file: output path for all kmer/pheno target lines.
        truth_kmer_pheno_file: output path for truth-matched lines (or None).
        baseline_kmer_pheno_file: output path for baseline lines (or None).
        lock: multiprocessing lock guarding file writes.
        truths: dict of truth kmers checked via kmer_in_truths (or None).
        baseline: dict of baseline kmers; kmer_in_truths may return a bool or a
            fractional score for these.
    """
    kim = load_pickle(kim_file)
    kmer_pheno_chunk = []
    truths_chunk = [] if truths else None
    baseline_chunk = [] if baseline else None
    for line in data:
        linelist = line.split('\t')
        kmer = linelist[0]
        for pheno in linelist[1:]:
            kmer_pheno_chunk.append(f'{kim[kmer]}\t{pheno}')
            if truths and kmer_in_truths(kmer, truths, pheno):
                truths_chunk.append(f'{kim[kmer]}\t{pheno}')
            if baseline:
                score = kmer_in_truths(kmer, baseline, pheno)
                if score is True:
                    baseline_chunk.append(f'{kim[kmer]}\t{pheno}')
                elif score is not False and 0.0 < score < 1.0:
                    # fractional score: carry it through as a third column
                    baseline_chunk.append(f'{kim[kmer]}\t{pheno}\t{score}')
        # flush every 500k to limit memory usage
        if len(kmer_pheno_chunk) >= 500000:
            write_files(lock, (kmer_pheno_chunk, value_kmer_pheno_file),
                        (truths_chunk, truth_kmer_pheno_file),
                        (baseline_chunk, baseline_kmer_pheno_file))
            kmer_pheno_chunk = []
            if truths_chunk is not None:
                truths_chunk = []
            # BUG FIX: baseline_chunk was never cleared after a flush, so its
            # accumulated lines were re-written on every subsequent flush and
            # again at the end, producing duplicate baseline observations.
            if baseline_chunk is not None:
                baseline_chunk = []
    write_files(lock, (kmer_pheno_chunk, value_kmer_pheno_file),
                (truths_chunk, truth_kmer_pheno_file),
                (baseline_chunk, baseline_kmer_pheno_file))
def main():
    """Preprocess input sequences into kmer/sample and kmer/pheno map files,
    build the sample-similarity observations, and create the kmer int map."""
    # get params
    params = get_params()
    project = params['project']
    # define file paths
    unique_kmers_file = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    samples_file = join(project, 'data', 'raw', params['sample'])
    similarities_tsv = join(project, 'data', 'preprocessed', 'sample_similarities.tsv')
    hist_orig_file = join(project, 'data', 'preprocessed', 'hist_orig.png')
    hist_sim_scaled_file = join(project, 'data', 'preprocessed', 'hist_sim_scaled.png')
    hist_dissim_scaled_file = join(project, 'data', 'preprocessed', 'hist_dissim_scaled.png')
    similar_sample_file = join(project, 'data', 'preprocessed', 'similarSample_obs.txt')
    dissimilar_sample_file = join(project, 'data', 'preprocessed', 'dissimilarSample_obs.txt')
    kmer_sample_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    uim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    # create and load sample and pheno int maps
    if not file_exists(sim_file):
        int_maps.create_sample_int_map(samples_file, phenos_file, sim_file)
    if not file_exists(pim_file):
        int_maps.create_pheno_int_map(phenos_file, pim_file)
    sim = load_pickle(sim_file)
    # only do processing if output files do not exist
    if (not file_exists(kmer_sample_file) or not file_exists(kmer_pheno_file)
            or ((not file_exists(similar_sample_file)
                 or not file_exists(dissimilar_sample_file))
                and not file_exists(similarities_tsv))):
        # dfs holding samples that display vs not display pheno
        dfdisp, dfnodisp = create_disp_nodisp_dfs(phenos_file, sim)
        # read in all sequences in input into python object
        seqs = parse_input(samples_file)
        # number of samples
        # NOTE(review): halving len(sim) suggests the sample int map holds both
        # name->id and id->name entries — confirm against int_maps.
        n_samples = int(len(sim) / 2)
        # upper and lower bounds for frequency of samples to filter kmers by
        upper = int(params['maxkf'] * n_samples)
        lower = int(params['minkf'] * n_samples)
        # multiprocessing queue for transferring data to the main thread
        m = Manager()
        q = m.Queue()
        # multiprocessing lock for locking file before writing to it
        lock = m.Lock()
        # kmers file name reference for subprocesses to write to
        kmer_sample_file_ref = kmer_sample_file  # because the int map uses it
        # a None path tells the workers to skip writing that output
        if file_exists(kmer_sample_file):
            kmer_sample_file_ref = None
        if file_exists(kmer_pheno_file):
            kmer_pheno_file = None
        kwargs = dict(raw=seqs, k=params['k'], thresh=params['correlation-thresh'],
                      upper=upper, lower=lower, dfdisp=dfdisp, dfnodisp=dfnodisp,
                      sim=sim, n=n_samples, kmer_sample_file=kmer_sample_file_ref,
                      kmer_pheno_file=kmer_pheno_file)
        process_file(create_kmer_sample_map, unique_kmers_file, q=q, lock=lock,
                     **kwargs)
        sample_matrix = np.zeros((n_samples, n_samples))
        num_kmers = 0
        # write all chunks to output files sequentially
        while not q.empty():
            q_num_kmers, q_sample_matrix = q.get()
            num_kmers += q_num_kmers
            sample_matrix += q_sample_matrix
        # create sample similarity file if the similarities tsv does not exist
        if not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file):
            similar_sample(sample_matrix, num_kmers, similarities_tsv,
                           hist_orig_file, hist_sim_scaled_file,
                           hist_dissim_scaled_file, similar_sample_file,
                           dissimilar_sample_file)
    # fall back to the precomputed similarities tsv when the kmer processing
    # above was skipped but the observation files are still missing
    if ((not file_exists(similar_sample_file)
         or not file_exists(dissimilar_sample_file))
            and file_exists(similarities_tsv)):
        similar_sample(None, None, similarities_tsv, hist_orig_file,
                       hist_sim_scaled_file, hist_dissim_scaled_file,
                       similar_sample_file, dissimilar_sample_file)
    # create kmer int map
    if not file_exists(uim_file):
        int_maps.create_kmer_int_map(kmer_sample_file, uim_file)
#!/usr/bin/env python3
"""Ad-hoc script utilities for crossing truth unitigs with the unitig->sample map."""
import pandas as pd
import numpy as np
from utility import load_pickle, dump_pickle
import random

base = 'ecoli/data/preprocessed/'
#truths_file = base + 'truth_unitig_pheno.txt'
unitig_int_map = load_pickle(base + 'unitig_int_map.pkl')
unitig_sample_map_file = base + 'unitig_sample_map.txt'
phenos_df = pd.read_csv('ecoli/data/raw/phenos.tsv', sep='\t', index_col=0)
sim = load_pickle(base + 'sample_int_map.pkl')
pim = load_pickle(base + 'pheno_int_map.pkl')


def get_truth_kmer_seqs_that_occur_in_data():
    """Read the truth file and return a list of (unitig int id, pheno) tuples.

    NOTE(review): `truths_file` is referenced here but its assignment above is
    commented out, so this raises NameError as written — restore the
    assignment before calling.
    """
    with open(truths_file, 'r') as f:
        truthslines = f.readlines()
    #with open(unique_kmers_file, 'r') as f:
    #    kmerslines = f.readlines()
    #kmerslines = set(k.split('\t')[0] for k in kmerslines)
    truthslines = [l for l in truthslines if l != '\n']
    truthslines = [l.split('\t') for l in truthslines]
    truthslines = [(unitig_int_map[int(k)], p) for k, p, _ in truthslines]
    return truthslines


def read_unitig_sample_map_into_dict():
    """Parse the unitig->sample map file into {unitig: [sample fields]}."""
    with open(unitig_sample_map_file, 'r') as f:
        lines = f.readlines()
    lines = {l.split('\t')[0]: l.split('\t')[1:] for l in lines}
    # BUG FIX: the dict was built but never returned, so callers got None.
    return lines
def main():
    """Build the PSL input observation/target files from the kmer maps,
    optionally incorporating truth and baseline data."""
    # get params
    params = get_params()
    project = params['project']
    # define data paths
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    kmer_sample_map_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_map_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    contains_sample_kmer_file = join(project, 'data', 'preprocessed', 'contains_obs.txt')
    value_sample_pheno_file = join(project, 'data', 'preprocessed', 'samplePheno_obs.txt')
    value_kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmerPheno_target.txt')
    similar_pheno_pheno_file = join(project, 'data', 'preprocessed', 'similarPheno_obs.txt')
    sim = load_pickle(sim_file)
    pim = load_pickle(pim_file)
    # incorporate truth data
    if params.get('truth'):
        truths_infile = join(project, 'data', 'raw', params['truth'])
        truths_dict = create_truths_dict(truths_infile, pim)
        truth_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                     'kmerPheno_truth.txt')
    else:
        truths_dict = None
        truth_kmer_pheno_file = None
    # incorporate baseline data (same file format as truths)
    if params.get('baseline'):
        baseline_infile = join(project, 'data', 'raw', params['baseline'])
        baseline_dict = create_truths_dict(baseline_infile, pim)
        baseline_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                        'baseline_obs.txt')
    else:
        baseline_dict = None
        baseline_kmer_pheno_file = None
    # create smaller psl input files that can be efficiently done w 1 thread
    if not file_exists(value_sample_pheno_file):
        sample_pheno(phenos_file, sim, pim, value_sample_pheno_file)
    if not file_exists(similar_pheno_pheno_file):
        similar_pheno(phenos_file, pim, similar_pheno_pheno_file)
    # when truth/baseline are not requested, treat their outputs as "already
    # done" so they do not trigger reprocessing below
    contains_exists = file_exists(contains_sample_kmer_file)
    value_exists = file_exists(value_kmer_pheno_file)
    truths_exists = file_exists(truth_kmer_pheno_file) if params.get(
        'truth') else True
    baseline_exists = file_exists(baseline_kmer_pheno_file) if params.get(
        'baseline') else True
    lock = Manager().Lock()
    if not contains_exists:
        process_file(kmer_sample_db, kmer_sample_map_file, kim_file=kim_file,
                     lock=lock, truths=truths_dict,
                     contains_sample_kmer_file=contains_sample_kmer_file)
    if not value_exists or not truths_exists or not baseline_exists:
        # a None path tells kmer_pheno_db to skip writing that output
        if value_exists:
            value_kmer_pheno_file = None
        if truths_exists:
            truth_kmer_pheno_file = None
        if baseline_exists:
            baseline_kmer_pheno_file = None
        process_file(kmer_pheno_db, kmer_pheno_map_file, kim_file=kim_file,
                     value_kmer_pheno_file=value_kmer_pheno_file,
                     truth_kmer_pheno_file=truth_kmer_pheno_file,
                     lock=lock, truths=truths_dict, baseline=baseline_dict,
                     baseline_kmer_pheno_file=baseline_kmer_pheno_file)