def main():
    # get params
    params = get_params()
    project = params['project']
    # define file paths
    INPUT_FILE = join(project, 'data', 'postprocessed', 'KMERPHENO.txt')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    fsa_file = join(project, 'data', 'postprocessed', 'scored_kmers.fsa')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    scored_kmers_file = join(project, 'data', 'postprocessed', 'scored_kmers.txt')
    outdir = join(project, 'data', 'postprocessed')
    # pass None for any output file that already exists so it is not rewritten
    if file_exists(fsa_file):
        fsa_file = None
    if file_exists(scored_kmers_file):
        scored_kmers_file = None
    if fsa_file or scored_kmers_file:
        lock = Manager().Lock()
        pim = load_pickle(pim_file)
        # note: the original passed undefined names (uim_file,
        # scored_unitigs_file), apparently left over from a unitig-based
        # variant; the kmer-based names defined above are used instead
        process_file(process, INPUT_FILE, lock=lock, pim=pim,
                     kim_file=kim_file, fsa_file=fsa_file,
                     scored_kmers_file=scored_kmers_file)
    # scored_kmers_file may be None here if the file already existed
    separate_phenos(scored_kmers_file, outdir, params['separate-phenos'],
                    params['no-consolidate'])
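# process_file is called throughout this section but never defined in it.
# A minimal sketch of one plausible shape, assuming it splits the input file
# into chunks and fans them out to a multiprocessing pool, forwarding all
# keyword arguments to the worker function; the Manager() locks and queues
# passed above are proxy objects, so they survive the trip across process
# boundaries. The chunk size is illustrative, not taken from the source.
from functools import partial
from multiprocessing import Pool

def process_file(func, infile, **kwargs):
    with open(infile) as f:
        lines = f.readlines()
    chunk_size = 10000  # assumed; tune to input size and core count
    chunks = [lines[i:i + chunk_size] for i in range(0, len(lines), chunk_size)]
    with Pool() as pool:
        # each worker receives one chunk plus the forwarded kwargs
        pool.map(partial(func, **kwargs), chunks)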
def pid_status(self):
    allrows = []
    with self.glock:
        if util.file_exists(self.pidfile):
            with open(self.pidfile, 'r') as fd:
                allrows = fd.readlines()
    return allrows
def update_excel_sheet(set_abbr: str, get_complete_set_info=False):
    my_sets = get_sets()
    my_set = find_set_in_sets(set_abbr, my_sets)
    excel_file_path = 'C:\\Users\\User\\OneDrive\\Pokemon TCG Cards Owned.xlsx'
    alternate_excel_file_path = 'C:\\Users\\jmath\\OneDrive\\Pokemon TCG Cards Owned.xlsx'
    # fall back to the alternate path if the primary workbook is missing
    if not file_exists(excel_file_path) and file_exists(alternate_excel_file_path):
        excel_file_path = alternate_excel_file_path
    # remove the previous backup, if any, then back up the workbook for the user
    remove_backup(excel_file_path)
    shutil.copy2(excel_file_path, backup_excel_sheet_filename(excel_file_path))
    my_pokemon_sheet = PokemonSetSheet.create(my_set, excel_file_path)
    my_pokemon_sheet.save()
    update_missing_pokemon_metadata(my_pokemon_sheet)
    if get_complete_set_info:
        insert_complete_set_data(my_pokemon_sheet)
    my_pokemon_sheet.save()
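# backup_excel_sheet_filename is not defined in this section. A minimal
# sketch, assuming it derives a sibling backup path from the workbook path
# (the '.backup' infix is hypothetical):
import os

def backup_excel_sheet_filename(excel_file_path):
    root, ext = os.path.splitext(excel_file_path)
    return root + '.backup' + ext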
def main():
    # load params
    params = get_params()
    project = params['project']
    k = params['k']
    # define file paths
    samples_file = join(project, 'data', 'raw', params['sample'])
    outfile = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    catted_samples = join(project, 'data', 'preprocessed', 'samples.fa')
    # if the output file already exists, there is nothing to do
    if file_exists(outfile):
        exit(0)
    # create the concatenated samples file if it does not exist
    if not file_exists(catted_samples):
        cat_samples(samples_file, catted_samples)
    # multiprocessing queue for transferring data back to the main thread
    q = Manager().Queue()
    # invoke process(...) on the catted samples file with kwargs, per worker
    process_file(process, catted_samples, q=q, k=k)
    # consolidate all workers' counters into a single counter of all kmers
    counter = Counter()
    while not q.empty():
        counter.update(q.get())
    # fold each kmer's complement count into the kmer itself; the guard
    # ensures no new keys are added while iterating
    for kmer in counter.keys():
        comp = complement(kmer)
        if comp in counter:
            comp_count = counter[comp]
            counter[comp] = 0
            counter[kmer] += comp_count
    counter = +counter  # drop the zeroed-out complement entries
    printd('Finished consolidating counters.')
    # write counter to file
    write_dict(counter, outfile, sep='\t')
    # remove the concatenated samples file
    if file_exists(catted_samples):
        remove(catted_samples)
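# complement and write_dict are used above but not defined in this section.
# Minimal sketches: complement is assumed to return the reverse complement
# of a DNA kmer (the usual canonical-kmer convention), and write_dict is
# assumed to be a thin wrapper over plain text I/O.
_COMPLEMENT = str.maketrans('ACGT', 'TGCA')

def complement(kmer):
    return kmer.translate(_COMPLEMENT)[::-1]

def write_dict(d, outfile, sep='\t'):
    with open(outfile, 'w') as f:
        for key, value in d.items():
            f.write(f'{key}{sep}{value}\n')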
def start_workers(self):
    with self.glock:
        if util.file_exists(self.pidfile):
            util.error("pidfile already exists: " + self.pidfile)
            sys.exit(-1)
        for pname, worker in self.workers.items():
            util.info("{}: worker starting...".format(pname))
            worker.start()
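# self.workers is assumed to be a dict mapping worker names to process
# objects that expose start(). A hypothetical construction, for
# illustration only:
from multiprocessing import Process

def make_workers(worker_funcs):
    # worker_funcs: dict mapping a worker name to the callable it runs
    return {name: Process(target=func, name=name)
            for name, func in worker_funcs.items()}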
def set_logger(logger, log_path, log_level):
    import os, sys
    import utility as util
    import evntlog as elog

    log_level = log_level.strip().upper()
    if log_level not in ['TRACE', 'DEBUG', 'INFO', 'WARN', 'ERROR']:
        util.error("log level(=%r) not one of: TRACE | DEBUG | INFO | WARN | ERROR" % log_level)
        sys.exit(-1)

    # set logger
    logpath = util.source_abspath(log_path)
    if not os.path.isdir(logpath):
        util.error("log path not found: %r" % logpath)
        sys.exit(-1)

    logging_config = logger['logging_config']
    if not util.file_exists(logging_config):
        util.error("logging.config file not found: %s" % logging_config)
        sys.exit(-1)

    # init main logger
    try:
        logger_file = os.path.join(logpath, logger['file'])
        loggingConfigDict = elog.init_logger(logger_name=logger['name'],
                                             logging_config=logging_config,
                                             logger_file=logger_file,
                                             logger_level=log_level)
        elog.force("logging config : %s", logging_config)
        elog.force("logger file : %s", logger_file)
        elog.force("logger level : %s", log_level)
        elog.force("logger name : %s", logger['name'])
        if not loggingConfigDict:
            util.error("logging config file error: %s" % logging_config)
            sys.exit(-1)
        return loggingConfigDict
    except Exception as ex:
        util.error("error init logger: %r" % ex)
        sys.exit(-1)
def notify_stop(self, exitCode=0):
    try:
        self.lock()
        if not util.file_exists(self.stopfile):
            os.mknod(self.stopfile)
        else:
            util.warn("stopfile already exists: " + self.stopfile)
    except:
        util.except_print("notify_stop")
    finally:
        try:
            self.unlock()
        except:
            util.except_print("unlock")
    if exitCode is not None:
        sys.exit(exitCode)
def set_logger(logger, log_path, log_level):
    import os, sys
    import utility as util
    import evntlog as elog

    # set logger
    logpath = util.source_abspath(log_path)
    if not os.path.isdir(logpath):
        elog.error("log path not found: %r", logpath)
        sys.exit(1)

    logging_config = logger['logging_config']
    if not util.file_exists(logging_config):
        elog.error("logging.config file not found: %s", logging_config)
        sys.exit(1)

    # init main logger
    try:
        logger_file = os.path.join(logpath, logger['file'])
        configDict = elog.init_logger(logger_name=logger['name'],
                                      logging_config=logging_config,
                                      logger_file=logger_file,
                                      logger_level=log_level)
        elog.force("logging config: %s", logging_config)
        elog.force("logger file: %s", logger_file)
        elog.force("logger level: %s", log_level)
        elog.force("logger name: %s", logger['name'])
        return configDict
    except Exception as ex:
        elog.error("error init logger: %r", ex)
        sys.exit(1)
def main():
    # get params
    params = get_params()
    project = params['project']
    # define file paths
    unique_kmers_file = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    samples_file = join(project, 'data', 'raw', params['sample'])
    similarities_tsv = join(project, 'data', 'preprocessed', 'sample_similarities.tsv')
    hist_orig_file = join(project, 'data', 'preprocessed', 'hist_orig.png')
    hist_sim_scaled_file = join(project, 'data', 'preprocessed', 'hist_sim_scaled.png')
    hist_dissim_scaled_file = join(project, 'data', 'preprocessed', 'hist_dissim_scaled.png')
    similar_sample_file = join(project, 'data', 'preprocessed', 'similarSample_obs.txt')
    dissimilar_sample_file = join(project, 'data', 'preprocessed', 'dissimilarSample_obs.txt')
    kmer_sample_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    uim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    # create and load sample and pheno int maps
    if not file_exists(sim_file):
        int_maps.create_sample_int_map(samples_file, phenos_file, sim_file)
    if not file_exists(pim_file):
        int_maps.create_pheno_int_map(phenos_file, pim_file)
    sim = load_pickle(sim_file)
    # only do the heavy processing if the output files do not exist
    if (not file_exists(kmer_sample_file) or not file_exists(kmer_pheno_file)
            or ((not file_exists(similar_sample_file)
                 or not file_exists(dissimilar_sample_file))
                and not file_exists(similarities_tsv))):
        # dfs holding samples that display vs do not display the pheno
        dfdisp, dfnodisp = create_disp_nodisp_dfs(phenos_file, sim)
        # read all input sequences into a python object
        seqs = parse_input(samples_file)
        # number of samples (the int map stores each sample under two keys)
        n_samples = int(len(sim) / 2)
        # upper and lower bounds on sample frequency used to filter kmers
        upper = int(params['maxkf'] * n_samples)
        lower = int(params['minkf'] * n_samples)
        # multiprocessing queue for transferring data to the main thread
        m = Manager()
        q = m.Queue()
        # multiprocessing lock for locking the output file before writing
        lock = m.Lock()
        # keep a separate file-name reference for the workers to write to,
        # because kmer_sample_file itself is still needed below to build
        # the kmer int map
        kmer_sample_file_ref = kmer_sample_file
        if file_exists(kmer_sample_file):
            kmer_sample_file_ref = None
        if file_exists(kmer_pheno_file):
            kmer_pheno_file = None
        kwargs = dict(raw=seqs, k=params['k'],
                      thresh=params['correlation-thresh'],
                      upper=upper, lower=lower,
                      dfdisp=dfdisp, dfnodisp=dfnodisp,
                      sim=sim, n=n_samples,
                      kmer_sample_file=kmer_sample_file_ref,
                      kmer_pheno_file=kmer_pheno_file)
        process_file(create_kmer_sample_map, unique_kmers_file,
                     q=q, lock=lock, **kwargs)
        sample_matrix = np.zeros((n_samples, n_samples))
        num_kmers = 0
        # merge all chunks' results sequentially
        while not q.empty():
            q_num_kmers, q_sample_matrix = q.get()
            num_kmers += q_num_kmers
            sample_matrix += q_sample_matrix
        # create the sample similarity files if the similarities tsv does not exist
        if not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file):
            similar_sample(sample_matrix, num_kmers, similarities_tsv,
                           hist_orig_file, hist_sim_scaled_file,
                           hist_dissim_scaled_file, similar_sample_file,
                           dissimilar_sample_file)
    # otherwise, restore the similarity files from the similarities tsv
    if ((not file_exists(similar_sample_file)
         or not file_exists(dissimilar_sample_file))
            and file_exists(similarities_tsv)):
        similar_sample(None, None, similarities_tsv, hist_orig_file,
                       hist_sim_scaled_file, hist_dissim_scaled_file,
                       similar_sample_file, dissimilar_sample_file)
    # create kmer int map
    if not file_exists(uim_file):
        int_maps.create_kmer_int_map(kmer_sample_file, uim_file)
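# load_pickle appears throughout these scripts but is not defined in this
# section. A minimal sketch, assuming it is a thin wrapper over pickle:
import pickle

def load_pickle(path):
    with open(path, 'rb') as f:
        return pickle.load(f)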
def is_stop(self):
    return util.file_exists(self.stopfile)
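# util.file_exists is not shown here; it is presumably a thin wrapper over
# os.path. A minimal sketch:
import os.path

def file_exists(path):
    return os.path.isfile(path)

# notify_stop and is_stop together implement a simple file-based stop
# signal: one process creates the stopfile, and workers poll for it, e.g.
#
#     while not daemon.is_stop():
#         do_work()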
def main():
    # get params
    params = get_params()
    project = params['project']
    # define data paths
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    kmer_sample_map_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_map_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    contains_sample_kmer_file = join(project, 'data', 'preprocessed', 'contains_obs.txt')
    value_sample_pheno_file = join(project, 'data', 'preprocessed', 'samplePheno_obs.txt')
    value_kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmerPheno_target.txt')
    similar_pheno_pheno_file = join(project, 'data', 'preprocessed', 'similarPheno_obs.txt')

    sim = load_pickle(sim_file)
    pim = load_pickle(pim_file)

    # incorporate truth data
    if params.get('truth'):
        truths_infile = join(project, 'data', 'raw', params['truth'])
        truths_dict = create_truths_dict(truths_infile, pim)
        truth_kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmerPheno_truth.txt')
    else:
        truths_dict = None
        truth_kmer_pheno_file = None

    # incorporate baseline data
    if params.get('baseline'):
        baseline_infile = join(project, 'data', 'raw', params['baseline'])
        baseline_dict = create_truths_dict(baseline_infile, pim)
        baseline_kmer_pheno_file = join(project, 'data', 'preprocessed', 'baseline_obs.txt')
    else:
        baseline_dict = None
        baseline_kmer_pheno_file = None

    # create the smaller psl input files, which a single thread can handle efficiently
    if not file_exists(value_sample_pheno_file):
        sample_pheno(phenos_file, sim, pim, value_sample_pheno_file)
    if not file_exists(similar_pheno_pheno_file):
        similar_pheno(phenos_file, pim, similar_pheno_pheno_file)

    contains_exists = file_exists(contains_sample_kmer_file)
    value_exists = file_exists(value_kmer_pheno_file)
    truths_exists = file_exists(truth_kmer_pheno_file) if params.get('truth') else True
    baseline_exists = file_exists(baseline_kmer_pheno_file) if params.get('baseline') else True

    lock = Manager().Lock()
    if not contains_exists:
        process_file(kmer_sample_db, kmer_sample_map_file, kim_file=kim_file,
                     lock=lock, truths=truths_dict,
                     contains_sample_kmer_file=contains_sample_kmer_file)
    if not value_exists or not truths_exists or not baseline_exists:
        # pass None for any output that already exists so it is not rewritten
        if value_exists:
            value_kmer_pheno_file = None
        if truths_exists:
            truth_kmer_pheno_file = None
        if baseline_exists:
            baseline_kmer_pheno_file = None
        process_file(kmer_pheno_db, kmer_pheno_map_file, kim_file=kim_file,
                     value_kmer_pheno_file=value_kmer_pheno_file,
                     truth_kmer_pheno_file=truth_kmer_pheno_file,
                     lock=lock, truths=truths_dict, baseline=baseline_dict,
                     baseline_kmer_pheno_file=baseline_kmer_pheno_file)
def similar_sample(sample_matrix, num_kmers, similarities_tsv, hist_orig_file,
                   hist_sim_scaled_file, hist_dissim_scaled_file,
                   similarities_file, dissimilarities_file):
    if not file_exists(similarities_tsv):
        # scale the similarity matrix by the mean number of sampled kmers each
        # sample shares with itself, then normalize to [0,1]. Remove the
        # diagonal and the lower triangle (the matrix is symmetric about the
        # major diagonal), and round values to 4 decimal places.
        mean_shared_w_self = sample_matrix.diagonal().mean()
        sample_matrix /= mean_shared_w_self
        sample_matrix += 0.001  # ensure all values are nonzero
        sample_matrix *= 1.0 / sample_matrix.max()
        np.fill_diagonal(sample_matrix, np.nan)
        sample_matrix = np.triu(sample_matrix)
        sample_matrix = np.round(sample_matrix, 4)  # np.round is not in place
        df = pd.DataFrame(sample_matrix)
        # dump to tsv for ease of restoring, and because a tsv of similarities
        # is a common input to other mGWAS programs
        df.to_csv(similarities_tsv, sep='\t')
    else:
        df = pd.read_csv(similarities_tsv, sep='\t', index_col=0)

    # create the histogram of raw similarities and save it
    plt.hist(df.values, facecolor='green')
    plt.savefig(hist_orig_file, dpi=150)
    plt.clf()

    df = df.stack()
    df = df.reset_index()
    df = df[df[0] > 0]  # drop the zeroed-out lower triangle

    # set quantile thresholds; e.g. highthresh = 0.9 drops the lowest 90%
    # and keeps the highest 10%
    highthresh = 0.9
    lowthresh = 0.1
    # numeric cutoffs corresponding to the quantiles
    highcutoff = df[0].quantile(highthresh)
    lowcutoff = df[0].quantile(lowthresh)
    # drop everything in the middle; keep only the very similar and the very
    # dissimilar pairs
    simdf = df[df[0] >= highcutoff].copy(deep=True)
    dissimdf = df[df[0] <= lowcutoff].copy(deep=True)
    dissimdf[0] = 1 - dissimdf[0]  # flip so higher means more dissimilar
    dfs = (simdf, dissimdf)

    files = ((hist_sim_scaled_file, similarities_file),
             (hist_dissim_scaled_file, dissimilarities_file))
    for i, (pngfile, outfile) in enumerate(files):
        df = dfs[i]
        # determine the new min, max, and range
        min_ = df[0].min()
        max_ = df[0].max()
        range_ = max_ - min_
        # shift left by the min so the new min is 0
        df[0] -= min_
        # both branches currently use the same scaling: values are mapped to
        # [0, 0.5] and then shifted to [0.5, 1]
        if i == 0:  # high (similar)
            scale_factor = 2
            intercept = 0.5
        else:  # low (dissimilar)
            scale_factor = 2
            intercept = 0.5
        df[0] /= range_ * scale_factor
        df[0] += intercept
        # create the histogram of the scaled data and save it
        try:
            plt.hist(df[0], bins=50, facecolor='green')
            plt.savefig(pngfile, dpi=150)
            plt.clf()
        except ValueError:
            printd('Unable to generate histogram of scaled data')
        # write to tsv
        df.to_csv(outfile, sep='\t', index=False, header=False)
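# The rescaling loop above maps each retained value from [min, max] to
# [0.5, 1]. A worked example with hypothetical top-decile similarities:
import numpy as np

vals = np.array([0.90, 0.95, 1.00])  # illustrative values, not real data
min_, max_ = vals.min(), vals.max()
range_ = max_ - min_            # 0.10
vals = vals - min_              # [0.00, 0.05, 0.10]
vals = vals / (range_ * 2)      # [0.00, 0.25, 0.50]
vals = vals + 0.5               # [0.50, 0.75, 1.00]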