Example #1
def main():
    # get params
    params = get_params()
    project = params['project']

    # define file paths
    INPUT_FILE = join(project, 'data', 'postprocessed', 'KMERPHENO.txt')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    fsa_file = join(project, 'data', 'postprocessed', 'scored_kmers.fsa')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    scored_kmers_file = join(project, 'data', 'postprocessed',
                             'scored_kmers.txt')
    outdir = join(project, 'data', 'postprocessed')

    # skip output files that already exist by nulling out their paths
    if file_exists(fsa_file):
        fsa_file = None
    if file_exists(scored_kmers_file):
        scored_kmers_file = None
    if fsa_file or scored_kmers_file:
        lock = Manager().Lock()
        pim = load_pickle(pim_file)

        process_file(process,
                     INPUT_FILE,
                     lock=lock,
                     pim=pim,
                     kim_file=kim_file,
                     fsa_file=fsa_file,
                     scored_kmers_file=scored_kmers_file)
    separate_phenos(scored_kmers_file, outdir, params['separate-phenos'],
                    params['no-consolidate'])
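
Every example on this page leans on a project-specific file_exists helper that is never shown. A minimal sketch of what it presumably does, assuming it simply wraps os.path.isfile (the None check mirrors how these examples null out paths to skip work):

import os

# Hypothetical sketch of the file_exists helper assumed throughout;
# the real project helper may do more than an isfile check.
def file_exists(path):
    return path is not None and os.path.isfile(path)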
Example #2
    def pid_status(self):
        allrows = []
        with self.glock:
            if util.file_exists(self.pidfile):
                with open(self.pidfile, 'r') as fd:
                    allrows = fd.readlines()
        return allrows
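
For context, a hypothetical writer for the pidfile that pid_status reads back; the real class is not shown, so the name below is illustrative:

import os

# Hypothetical counterpart to pid_status: record the current process id
# so it can be read back later.
def write_pidfile(pidfile):
    with open(pidfile, 'w') as fd:
        fd.write(str(os.getpid()) + '\n')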
Example #3
def update_excel_sheet(set_abbr: str, get_complete_set_info: bool = False):
    my_sets = get_sets()
    my_set = find_set_in_sets(set_abbr, my_sets)
    excel_file_path = 'C:\\Users\\User\\OneDrive\\Pokemon TCG Cards Owned.xlsx'
    alternate_excel_file_path = 'C:\\Users\\jmath\\OneDrive\\Pokemon TCG Cards Owned.xlsx'
    if not file_exists(excel_file_path) and file_exists(
            alternate_excel_file_path):
        excel_file_path = alternate_excel_file_path
    # Remove the previous back-up, if one exists
    remove_backup(excel_file_path)
    # Make a back-up copy for the user
    shutil.copy2(excel_file_path, backup_excel_sheet_filename(excel_file_path))
    my_pokemon_sheet = PokemonSetSheet.create(my_set, excel_file_path)
    my_pokemon_sheet.save()
    update_missing_pokemon_metadata(my_pokemon_sheet)
    if get_complete_set_info:
        insert_complete_set_data(my_pokemon_sheet)
    my_pokemon_sheet.save()
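
backup_excel_sheet_filename is not shown either; a plausible minimal sketch that derives a sibling backup path (the naming scheme is an assumption):

import os

# Hypothetical: 'Cards Owned.xlsx' -> 'Cards Owned.bak.xlsx'; the real
# helper's naming scheme is not shown.
def backup_excel_sheet_filename(path):
    root, ext = os.path.splitext(path)
    return root + '.bak' + ext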
Example #4
def main():
    # load params 
    params = get_params()
    project = params['project']
    k = params['k']

    # define file paths
    samples_file = join(project, 'data', 'raw', params['sample'])
    outfile = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    catted_samples = join(project, 'data', 'preprocessed', 'samples.fa')

    # check if output file exists; if so, do nothing.
    if file_exists(outfile):
        exit(0)

    # create catted samples file if it does not exist.
    if not file_exists(catted_samples):
        cat_samples(samples_file, catted_samples)

    # multiprocessing queue for transferring data to the main thread
    q = Manager().Queue()

    # invoke process(...) on the catted samples file in each worker
    # thread, passing kwargs through
    process_file(process, catted_samples, q=q, k=k)
    
    # consolidate all threads' counters into single counter holding all kmers
    counter = Counter()
    while not q.empty():
        counter.update(q.get())
    for kmer in counter.keys():
        comp = complement(kmer)
        if comp in counter:
            # merge each kmer/complement pair so that only one of the two
            # entries ends up holding the combined count
            comp_count = counter[comp]
            counter[comp] = 0
            counter[kmer] += comp_count
    counter = +counter  # unary plus drops the zeroed-out entries
    printd('Finished consolidating counters.')

    # write counter to file
    write_dict(counter, outfile, sep='\t')
    
    # remove catted samples file
    if file_exists(catted_samples):
        remove(catted_samples)
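
complement is another unshown helper. In k-mer counting it is usually the reverse complement, so a k-mer and its complement name the same double-stranded sequence; a minimal sketch under that assumption:

# Hypothetical sketch of complement as the DNA reverse complement; the
# real helper may differ.
_COMP = str.maketrans('ACGT', 'TGCA')

def complement(kmer):
    return kmer.translate(_COMP)[::-1]

# e.g. complement('AAGT') == 'ACTT'; treating the pair as one kmer is
# why the loop above folds their counts together.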
Example #5
    def start_workers(self):
        with self.glock:
            if util.file_exists(self.pidfile):
                util.error("pidfile already exists: " + self.pidfile)
                sys.exit(-1)

        for pname, worker in self.workers.items():
            util.info("{}: worker starting...".format(pname))
            worker.start()
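
A hypothetical sketch of the workers mapping this method expects: worker names mapped to unstarted multiprocessing.Process objects (all names illustrative):

from multiprocessing import Process

def work(pname):
    print(pname, 'running')

# self.workers is assumed to look roughly like this
workers = {'w1': Process(target=work, args=('w1',)),
           'w2': Process(target=work, args=('w2',))}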
Example #6
def set_logger(logger, log_path, log_level):
    import os, sys
    import utility as util
    import evntlog as elog

    log_level = log_level.strip().upper()

    if log_level not in ['TRACE', 'DEBUG', 'INFO', 'WARN', 'ERROR']:
        util.error(
            "log level(=%r) not one of: TRACE | DEBUG | INFO | WARN | ERROR" %
            log_level)
        sys.exit(-1)

    # set logger:
    logpath = util.source_abspath(log_path)
    if not os.path.isdir(logpath):
        util.error("log path not found: %r" % logpath)
        sys.exit(-1)

    logging_config = logger['logging_config']
    if not util.file_exists(logging_config):
        util.error("logging.config file not found: %s" % logging_config)
        sys.exit(-1)

    # init logger: main
    try:
        logger_file = os.path.join(logpath, logger['file'])

        loggingConfigDict = elog.init_logger(logger_name=logger['name'],
                                             logging_config=logging_config,
                                             logger_file=logger_file,
                                             logger_level=log_level)

        elog.force("logging config : %s", logging_config)
        elog.force("logger file    : %s", logger_file)
        elog.force("logger level   : %s", log_level)
        elog.force("logger name    : %s", logger['name'])

        if not loggingConfigDict:
            util.error("logging config file error: %s" % logging_config)
            sys.exit(-1)

        return loggingConfigDict
    except Exception as ex:
        util.error("error init logger: %r" % ex)
        sys.exit(-1)
Example #7
    def notify_stop(self, exitCode=0):
        try:
            self.lock()

            if not util.file_exists(self.stopfile):
                os.mknod(self.stopfile)
            else:
                util.warn("stopfile already exists: " + self.stopfile)
        except Exception:
            util.except_print("notify_stop")
        finally:
            try:
                self.unlock()
            except Exception:
                util.except_print("unlock")

        if exitCode is not None:
            sys.exit(exitCode)
Example #8
def set_logger(logger, log_path, log_level):
    import os, sys
    import utility as util
    import evntlog as elog

    # set logger:
    logpath = util.source_abspath(log_path)
    if not os.path.isdir(logpath):
        elog.error("log path not found: %r", logpath)
        sys.exit(1)

    logging_config = logger['logging_config']
    if not util.file_exists(logging_config):
        elog.error("logging.config file not found: %s", logging_config)
        sys.exit(1)

    # init logger: main
    try:
        logger_file = os.path.join(logpath, logger['file'])

        configDict = elog.init_logger(logger_name=logger['name'],
                                      logging_config=logging_config,
                                      logger_file=logger_file,
                                      logger_level=log_level)

        elog.force("logging config: %s", logging_config)
        elog.force("logger file: %s", logger_file)
        elog.force("logger level: %s", log_level)
        elog.force("logger name: %s", logger['name'])

        return configDict
    except Exception as ex:
        elog.error("error init logger: %r", ex)
        sys.exit(1)
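
evntlog's init_logger is project-specific and not shown. A rough stdlib stand-in, assuming the logging config file is JSON in logging.config.dictConfig format (every name here is an assumption, not the real evntlog API):

import json
import logging
import logging.config

# Hypothetical stand-in for elog.init_logger; the real module may differ.
def init_logger(logger_name, logging_config, logger_file, logger_level):
    with open(logging_config) as f:
        config = json.load(f)  # assumed dictConfig-format JSON
    logging.config.dictConfig(config)
    logger = logging.getLogger(logger_name)
    logger.setLevel(logger_level)
    # the real module presumably attaches a file handler for logger_file
    return config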
Example #9
def main():
    # get params
    params = get_params()
    project = params['project']

    # define file paths
    unique_kmers_file = join(project, 'data', 'preprocessed', 'unique_kmers.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    samples_file = join(project, 'data', 'raw', params['sample'])
    similarities_tsv = join(project, 'data', 'preprocessed', 'sample_similarities.tsv')
    hist_orig_file = join(project, 'data', 'preprocessed', 'hist_orig.png')
    hist_sim_scaled_file = join(project, 'data', 'preprocessed', 'hist_sim_scaled.png')
    hist_dissim_scaled_file = join(project, 'data', 'preprocessed', 'hist_dissim_scaled.png')
    similar_sample_file = join(project, 'data', 'preprocessed', 'similarSample_obs.txt')
    dissimilar_sample_file = join(project, 'data', 'preprocessed', 'dissimilarSample_obs.txt')
    kmer_sample_file = join(project, 'data', 'preprocessed', 'kmer_sample_map.txt')
    kmer_pheno_file = join(project, 'data', 'preprocessed', 'kmer_pheno_map.txt')
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl') 
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    uim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')

    # create and load sample and pheno int maps
    if not file_exists(sim_file):
        int_maps.create_sample_int_map(samples_file, phenos_file, sim_file)
    if not file_exists(pim_file):
        int_maps.create_pheno_int_map(phenos_file, pim_file)
    sim = load_pickle(sim_file)
    
    # only do processing if output files do not exist
    if (not file_exists(kmer_sample_file) or not file_exists(kmer_pheno_file) 
            or ((not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file))
            and not file_exists(similarities_tsv))):
        # dfs holding samples that display vs not display pheno
        dfdisp, dfnodisp = create_disp_nodisp_dfs(phenos_file, sim)
        # read in all sequences in input into python object
        seqs = parse_input(samples_file)
        # number of samples
        n_samples = int(len(sim) / 2)
        # upper and lower bounds for frequency of samples to filter kmers by
        upper = int(params['maxkf'] * n_samples)
        lower = int(params['minkf'] * n_samples)
        # multiprocessing queue for transferring data to the main thread
        m = Manager()
        q = m.Queue()
        # multiprocessing lock for locking file before writing to it
        lock = m.Lock()
        # keep the original kmer_sample_file path in a separate reference;
        # the kmer int map created at the end still needs it even when the
        # file already exists and writing is skipped
        kmer_sample_file_ref = kmer_sample_file
        if file_exists(kmer_sample_file):
            kmer_sample_file_ref = None
        if file_exists(kmer_pheno_file):
            kmer_pheno_file = None
        
        kwargs = dict(raw=seqs, k=params['k'],
                      thresh=params['correlation-thresh'],
                      upper=upper, lower=lower, dfdisp=dfdisp,
                      dfnodisp=dfnodisp, sim=sim, n=n_samples,
                      kmer_sample_file=kmer_sample_file_ref,
                      kmer_pheno_file=kmer_pheno_file)

        process_file(create_kmer_sample_map, unique_kmers_file, q=q, lock=lock, **kwargs)
       
        sample_matrix = np.zeros((n_samples, n_samples))
        num_kmers = 0
        # merge all worker chunks' partial results from the queue
        while not q.empty():
            q_num_kmers, q_sample_matrix = q.get()
            num_kmers += q_num_kmers
            sample_matrix += q_sample_matrix
        
        # create sample similarity file if the similarities tsv does not exist
        if not file_exists(similar_sample_file) or not file_exists(dissimilar_sample_file):
            similar_sample(sample_matrix, num_kmers, similarities_tsv,
                hist_orig_file, hist_sim_scaled_file, hist_dissim_scaled_file,
                similar_sample_file, dissimilar_sample_file)
    if ((not file_exists(similar_sample_file)
            or not file_exists(dissimilar_sample_file))
            and file_exists(similarities_tsv)):
        similar_sample(None, None, similarities_tsv, hist_orig_file,
            hist_sim_scaled_file, hist_dissim_scaled_file,
            similar_sample_file, dissimilar_sample_file)
    # create kmer int map
    if not file_exists(uim_file):
        int_maps.create_kmer_int_map(kmer_sample_file, uim_file)
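
The Manager-queue pattern above (each worker pushes a partial result, the parent drains and sums) in a self-contained sketch; process_file is project code, so plain multiprocessing stands in for it here:

import numpy as np
from multiprocessing import Manager, Process

# Stand-in worker for the project's create_kmer_sample_map: push a
# (num_kmers, partial_matrix) tuple onto the shared queue.
def worker(q, n):
    q.put((1, np.eye(n)))

if __name__ == '__main__':
    n = 3
    q = Manager().Queue()
    procs = [Process(target=worker, args=(q, n)) for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    num_kmers, sample_matrix = 0, np.zeros((n, n))
    while not q.empty():
        q_num, q_mat = q.get()
        num_kmers += q_num
        sample_matrix += q_mat
    print(num_kmers)  # 4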
Example #10
    def is_stop(self):
        return util.file_exists(self.stopfile)
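
Examples #7 and #10 form a simple stop-file handshake: notify_stop drops a marker file and is_stop polls for it. A self-contained sketch of the same protocol (the path and loop are illustrative, not the project's code):

import os
import time

STOPFILE = '/tmp/example.stop'  # illustrative path

def is_stop():
    return os.path.isfile(STOPFILE)

def run_until_stopped(work, poll_seconds=0.1):
    # run work() repeatedly until another process creates the stopfile
    while not is_stop():
        work()
        time.sleep(poll_seconds)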
Example #11
def main():
    # get params
    params = get_params()
    project = params['project']

    # define data paths
    sim_file = join(project, 'data', 'preprocessed', 'sample_int_map.pkl')
    pim_file = join(project, 'data', 'preprocessed', 'pheno_int_map.pkl')
    kim_file = join(project, 'data', 'preprocessed', 'kmer_int_map.pkl')
    kmer_sample_map_file = join(project, 'data', 'preprocessed',
                                'kmer_sample_map.txt')
    kmer_pheno_map_file = join(project, 'data', 'preprocessed',
                               'kmer_pheno_map.txt')
    phenos_file = join(project, 'data', 'raw', params['pheno'])
    contains_sample_kmer_file = join(project, 'data', 'preprocessed',
                                     'contains_obs.txt')
    value_sample_pheno_file = join(project, 'data', 'preprocessed',
                                   'samplePheno_obs.txt')
    value_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                 'kmerPheno_target.txt')
    similar_pheno_pheno_file = join(project, 'data', 'preprocessed',
                                    'similarPheno_obs.txt')

    sim = load_pickle(sim_file)
    pim = load_pickle(pim_file)

    # incorporate truth data
    if params.get('truth'):
        truths_infile = join(project, 'data', 'raw', params['truth'])
        truths_dict = create_truths_dict(truths_infile, pim)
        truth_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                     'kmerPheno_truth.txt')
    else:
        truths_dict = None
        truth_kmer_pheno_file = None

    # incorporate baseline data
    if params.get('baseline'):
        baseline_infile = join(project, 'data', 'raw', params['baseline'])
        baseline_dict = create_truths_dict(baseline_infile, pim)
        baseline_kmer_pheno_file = join(project, 'data', 'preprocessed',
                                        'baseline_obs.txt')
    else:
        baseline_dict = None
        baseline_kmer_pheno_file = None

    # create smaller psl input files that can be generated efficiently
    # with a single thread
    if not file_exists(value_sample_pheno_file):
        sample_pheno(phenos_file, sim, pim, value_sample_pheno_file)
    if not file_exists(similar_pheno_pheno_file):
        similar_pheno(phenos_file, pim, similar_pheno_pheno_file)

    contains_exists = file_exists(contains_sample_kmer_file)
    value_exists = file_exists(value_kmer_pheno_file)
    truths_exists = (file_exists(truth_kmer_pheno_file)
                     if params.get('truth') else True)
    baseline_exists = (file_exists(baseline_kmer_pheno_file)
                       if params.get('baseline') else True)

    lock = Manager().Lock()

    if not contains_exists:
        process_file(kmer_sample_db,
                     kmer_sample_map_file,
                     kim_file=kim_file,
                     lock=lock,
                     truths=truths_dict,
                     contains_sample_kmer_file=contains_sample_kmer_file)

    if not value_exists or not truths_exists or not baseline_exists:
        if value_exists:
            value_kmer_pheno_file = None
        if truths_exists:
            truth_kmer_pheno_file = None
        if baseline_exists:
            baseline_kmer_pheno_file = None
        process_file(kmer_pheno_db,
                     kmer_pheno_map_file,
                     kim_file=kim_file,
                     value_kmer_pheno_file=value_kmer_pheno_file,
                     truth_kmer_pheno_file=truth_kmer_pheno_file,
                     lock=lock,
                     truths=truths_dict,
                     baseline=baseline_dict,
                     baseline_kmer_pheno_file=baseline_kmer_pheno_file)
Example #12
def similar_sample(sample_matrix, num_kmers, similarities_tsv, hist_orig_file,
                   hist_sim_scaled_file, hist_dissim_scaled_file,
                   similarities_file, dissimilarities_file):
    if not file_exists(similarities_tsv):
        # scale similarities matrix by the mean num sampled kmers each sample
        # shares with itself. Then, normalize to [0,1]. Then remove the diagonal
        # and the lower triangle of the array (since it is symmetric about the
        # major diagonal), and finally round values to 4 decimal places.
        mean_shared_w_self = sample_matrix.diagonal().mean()
        sample_matrix /= mean_shared_w_self
        sample_matrix += 0.001  # ensure all values are nonzero
        sample_matrix *= 1.0 / sample_matrix.max()
        np.fill_diagonal(sample_matrix, np.nan)
        sample_matrix = np.triu(sample_matrix)
        sample_matrix = np.round(sample_matrix, 4)

        df = pd.DataFrame(sample_matrix)

        # dump to a tsv file so it can be restored later, and because a
        # similarities tsv is a common input to other mGWAS programs
        df.to_csv(similarities_tsv, sep='\t')

    else:
        df = pd.read_csv(similarities_tsv, sep='\t', index_col=0)

    # create similarity histogram and save it
    plt.hist(df.values, facecolor='green')
    plt.savefig(hist_orig_file, dpi=150)
    plt.clf()
    df = df.stack()
    df = df.reset_index()
    df = df[df[0] > 0]  # drop the zeroed lower-triangle entries left by np.triu
    # set thresholds: keep the top 10% as "similar" and the bottom 10%
    # as "dissimilar"
    highthresh = 0.9
    lowthresh = 0.1
    # numeric cutoffs: 90% of the data fall below highcutoff and 10%
    # fall below lowcutoff
    highcutoff = df[0].quantile(highthresh)
    lowcutoff = df[0].quantile(lowthresh)
    # drop everything in the middle; keep only the very similar and the
    # very dissimilar pairs
    simdf = df[df[0] >= highcutoff].copy(deep=True)
    dissimdf = df[df[0] <= lowcutoff].copy(deep=True)
    dissimdf[0] = 1 - dissimdf[0]
    dfs = (simdf, dissimdf)
    files = ((hist_sim_scaled_file, similarities_file),
             (hist_dissim_scaled_file, dissimilarities_file))
    for i, (pngfile, outfile) in enumerate(files):
        df = dfs[i]
        # determine the new min, max, and range
        min_ = df[0].min()
        max_ = df[0].max()
        range_ = max_ - min_
        # shift left by the min so the new min is 0
        df[0] -= min_
        # rescale [0, range_] to [0, 0.5], then shift to [0.5, 1]
        df[0] /= range_ * 2
        df[0] += 0.5
        # create similarity histogram and save it
        try:
            plt.hist(df[0], bins=50, facecolor='green')
            plt.savefig(pngfile, dpi=150)
            plt.clf()
        except ValueError as e:
            printd('Unable to generate histogram of scaled data: %s' % e)

        # write to csv
        df.to_csv(outfile, sep='\t', index=False, header=False)
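
A tiny worked example of the final rescaling: after subtracting min_, values in [0, range_] map linearly into [0.5, 1] (the input values below are made up):

import numpy as np

vals = np.array([0.2, 0.5, 0.9])
min_, max_ = vals.min(), vals.max()
range_ = max_ - min_          # 0.7
vals -= min_                  # [0.0, 0.3, 0.7]
vals /= range_ * 2            # [0.0, ~0.214, 0.5]
vals += 0.5                   # [0.5, ~0.714, 1.0]
print(vals)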