def __init__(
    self,
    output_dir="output",
    resources_dir="resources",
    parallelize=False,
    processes=os.cpu_count(),
):
    """
    Create a ``Lineage`` object.

    Parameters
    ----------
    output_dir : str
        name / path of output directory
    resources_dir : str
        name / path of resources directory
    parallelize : bool
        utilize multiprocessing to speedup calculations
    processes : int
        processes to launch if multiprocessing

    Notes
    -----
    The ``processes`` default is evaluated once, when this module is
    imported (standard Python default-argument semantics).
    """
    # Resolve the output location up front so later writes use an absolute path.
    self._output_dir = os.path.abspath(output_dir)
    # Helpers for optional multiprocessing and for locating resource files.
    self._parallelizer = Parallelizer(parallelize=parallelize, processes=processes)
    self._resources = Resources(resources_dir=resources_dir)
if __name__ == "__main__": logger.info("start") # get filenames from openSNP data dump filenames = r.get_opensnp_datadump_filenames() # draw a sample from the observations random.seed(1) SAMPLE_SIZE = len(filenames) # SAMPLE_SIZE = 10 samples = random.sample(range(len(filenames)), SAMPLE_SIZE) # setup tasks for parallelizing / execution on multiple cores p = Parallelizer(parallelize=True) tasks = [{"file": filenames[i]} for i in samples] # results are a list of lists rows = p(get_xy_chrom_snp_ratios, tasks) # remove None results rows = [row for row in rows if row] df = pd.DataFrame( rows, columns=[ "file", "source", "build", "build_detected",
def main():
    """Parse the openSNP datadump with `lineage` and summarize the results.

    Loads every genotype file in the datadump in parallel, writes a CSV
    summary of the files that parsed successfully, logs parse statistics,
    and — when ``EXTRACT_FILES`` is set — copies problem files into
    per-message directories under ``OUTPUT_DIR`` for debugging.
    """
    logger.info("start")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # drop non-genotype files (readmes and phenotype exports)
    filenames = [
        filename
        for filename in filenames
        if "readme" not in filename and "phenotype" not in filename
    ]

    # draw a reproducible sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # setup tasks for parallelizing / execution on multiple cores
    p = Parallelizer(parallelize=True)
    tasks = [{"file": filenames[i]} for i in samples]

    # run tasks; results is a list of dicts
    results = p(load_file, tasks)

    # keep results from `load_file` where `count` was non-zero; failures
    # carry a "msg" key instead
    rows = [item for item in results if "msg" not in item]

    df = pd.DataFrame(
        rows,
        columns=["file", "source", "build", "build_detected", "chromosomes", "count"],
    )

    save_df_as_csv(df, OUTPUT_DIR, "parse-opensnp-files.csv")

    # log parsing statistics; guard the ratios so an empty datadump or an
    # all-failed parse doesn't raise ZeroDivisionError
    file_count = len(filenames)
    logger.info(f"{file_count} files in the openSNP datadump")
    if file_count:
        logger.info(f"{(len(df) / file_count):.2%} of openSNP datadump files parsed")
    if len(df):
        logger.info(
            f"build detected in {len(df.loc[df.build_detected]) / len(df):.2%} of files parsed"
        )

    # extract files from the datadump where `load_file` returned a message
    if EXTRACT_FILES:
        # group files with same message (e.g., {"some message": ["file1", "file2"], ...})
        d = {}
        for result in results:
            if "msg" in result:
                d.setdefault(result["msg"], []).append(result["file"])

        # add messages / file filters as necessary...
        d["build not detected"] = list(df.loc[~df.build_detected].file.values)

        # extract files that have messages for debugging
        for msg, files in d.items():
            if not files:
                continue

            # create a directory for each message (prefix indicates number of files)
            path = os.path.join(OUTPUT_DIR, f"{len(files):04}_{clean_str(msg)}")
            create_dir(path)

            # save each file with message into created directory
            for filename in files:
                with atomic_write(os.path.join(path, filename), mode="wb") as f:
                    f.write(r.load_opensnp_datadump_file(filename))

    logger.info("stop")
def main():
    """Predict ancestry for openSNP datadump files.

    Builds reference data from 1000 Genomes AIsnp genotypes, trains k-NN
    classifiers for super-population and population, then processes the
    datadump files in parallel and saves the per-file predictions (plus the
    reduced-dimensionality components) to ``opensnp_ancestry.csv``.
    """
    logging.info("start analysis")

    # get filenames from openSNP data dump
    filenames = r.get_opensnp_datadump_filenames()

    # draw a reproducible sample from the observations
    random.seed(1)
    SAMPLE_SIZE = len(filenames)
    # SAMPLE_SIZE = 10
    samples = random.sample(range(len(filenames)), SAMPLE_SIZE)

    # get the 1000 genomes samples
    dfsamples = get_1kg_samples(
        f"{DATA_DIR}/integrated_call_samples_v3.20130502.ALL.panel"
    )
    # (typo fix: was "retreived")
    logging.info("retrieved the 1kg samples")

    # select the AIsnp panel configured via `aisnp_SET`
    aisnps_1kg = (
        vcf2df(f"{DATA_DIR}/kidd.55aisnp.1kg.vcf", dfsamples)
        if aisnp_SET == "kidd et al. 55 aisnps"
        else vcf2df(f"{DATA_DIR}/Seldin.128aisnp.1kg.vcf", dfsamples)
    )
    logging.info("made the AIsnp DataFrame")

    # Encode 1kg data
    X_encoded, encoder = encode_genotypes(aisnps_1kg)
    logging.info("encoded the genotypes")

    # perform dimensionality reduction on the 1kg set
    X_reduced, reducer = dimensionality_reduction(
        X_encoded, algorithm=DIMENSIONALITY_REDUCTION_ALGORITHM
    )
    logging.info("Reduced the dimensionality of the genotypes")

    # k-NN classifiers for super-population and population prediction;
    # n_jobs=1 presumably to avoid nested parallelism since the fitted
    # models are shipped to worker processes below — TODO confirm
    knn_super_pop = KNeighborsClassifier(n_neighbors=9, weights="distance", n_jobs=1)
    knn_pop = KNeighborsClassifier(n_neighbors=9, weights="distance", n_jobs=1)

    # fit the knn before adding the user sample
    logging.info("Fitting the superpopulation model")
    knn_super_pop.fit(X_reduced, dfsamples["super population"])
    logging.info("Done!")
    logging.info("Fitting the population model")
    knn_pop.fit(X_reduced, dfsamples["population"])
    logging.info("Done!")

    # setup tasks for parallelizing / execution on multiple cores; each task
    # carries the fitted models and transformers so workers can predict
    p = Parallelizer(parallelize=True)
    tasks = [
        {
            "file": filenames[i],
            "aisnps_1kg": aisnps_1kg,
            "X_encoded": X_encoded,
            "encoder": encoder,
            "reducer": reducer,
            "knn_super_pop": knn_super_pop,
            "knn_pop": knn_pop,
        }
        for i in samples
    ]

    # run tasks; results is a list of dicts
    results = p(process_file, tasks)

    # get rows for dataframe summarizing results (drop falsy / failed results)
    rows = [row for row in results if row]

    df = pd.DataFrame(
        rows,
        columns=[
            "file",
            "source",
            "build",
            "build_detected",
            "chromosomes_summary",
            "snp_count",
            "AFR",
            "AMR",
            "EAS",
            "EUR",
            "SAS",
            "ACB",
            "ASW",
            "BEB",
            "CDX",
            "CEU",
            "CHB",
            "CHS",
            "CLM",
            "ESN",
            "FIN",
            "GBR",
            "GIH",
            "GWD",
            "IBS",
            "ITU",
            "JPT",
            "KHV",
            "LWK",
            "MSL",
            "MXL",
            "PEL",
            "PJL",
            "PUR",
            "STU",
            "TSI",
            "YRI",
            "component1",
            "component2",
            "component3",
        ],
    )

    save_df_as_csv(df, OUTPUT_DIR, "opensnp_ancestry.csv")
    logging.info("analysis done!")