def run(args):
    """Convert a genotype folder to parquet and process each expression phenotype.

    Steps, in order:
      1. Make sure the folders needed for ``args.output_prefix`` exist.
      2. Load the SNP annotation as a keyed data source
         ("varID" -> "rsid_dbSNP150"), skipping entries per
         ``KeyedDataSource.skip_na``.
      3. Load the genotype from ``args.input_genotype_folder`` and save both
         the variants and their metadata as parquet files.
      4. Call ``process_phenotype`` once per entry returned by
         ``Utilities.file_logic`` for the phenotype folder.

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``output_prefix``, ``snp_annotation_file``,
            ``input_genotype_folder``, ``input_genotype_file_pattern``,
            ``input_phenotype_folder`` and
            ``input_phenotype_expression_pattern``.
    """
    started = timer()
    prefix = args.output_prefix
    Utilities.ensure_requisite_folders(prefix)

    logging.info("Loading SNP annotation")
    snp_key = KeyedDataSource.load_data(
        args.snp_annotation_file,
        "varID",
        "rsid_dbSNP150",
        should_skip=KeyedDataSource.skip_na,
    )

    logging.info("Loading Genotype")
    loaded = ModelTraining.load_genotype_folder(
        args.input_genotype_folder,
        args.input_genotype_file_pattern,
        snp_key,
    )
    genotype, individual_ids = loaded

    logging.info("Saving Genotype")
    Parquet.save_variants(prefix + ".variants.parquet", genotype, individual_ids)
    Parquet.save_metadata(prefix + ".variants_metadata.parquet", genotype)

    logging.info("Processing Expression Phenotype")
    phenotype_files = Utilities.file_logic(
        args.input_phenotype_folder,
        args.input_phenotype_expression_pattern,
    )
    for entry in phenotype_files.itertuples():
        name, path = entry.name, entry.path
        logging.info("Phenotype: %s", name)
        process_phenotype(path, name, args.output_prefix)

    elapsed = timer() - started
    logging.info("Finished in %s", str(elapsed))
def generate_multi_backend(args, variant_key):
    """Write one variants parquet per loaded genotype chunk plus a combined metadata parquet.

    The genotype file is consumed through
    ``ModelTraining.load_genotype_file_by_chromosome``; every yielded chunk
    is saved to ``<output_prefix>.chr<CHR>.variants.parquet``, where ``CHR``
    is the chromosome of the chunk's first metadata row. The metadata of all
    chunks is concatenated and saved to
    ``<output_prefix>.variants_metadata.parquet``.

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``impute_to_mean``, ``input_genotype_file``,
            ``simplify_individual_id`` and ``output_prefix`` (``get_filter``
            receives the whole namespace as well).
        variant_key: variant lookup forwarded to ``get_filter`` and to the
            genotype loader.
    """
    logging.info("Processing Genotype")

    if args.impute_to_mean:
        dosage_conversion = GenotypeUtilities.impute_to_mean_conversion
    else:
        dosage_conversion = None
    dosage_filter = get_filter(args, variant_key)

    chunks = ModelTraining.load_genotype_file_by_chromosome(
        args.input_genotype_file,
        variant_key,
        dosage_conversion,
        dosage_filter,
    )

    collected = []
    for genotype, individual_ids in chunks:
        if args.simplify_individual_id:
            logging.info("simplifying individual id")
            # Keep only the part of each id that precedes the first "_".
            individual_ids = [sample.split("_")[0] for sample in individual_ids]

        chunk_metadata = genotype.get_variants_metadata()
        collected.append(chunk_metadata)

        # The first row's chromosome names the output file for this chunk.
        chromosome = chunk_metadata.chromosome.values[0]
        logging.log(9, "Processing {}".format(chromosome))

        suffix = ".chr{}".format(chromosome) + ".variants.parquet"
        Parquet.save_variants(args.output_prefix + suffix, genotype, individual_ids)

    logging.info("Saving metadata")
    combined = pandas.concat(collected)
    Parquet._save_metadata(
        args.output_prefix + ".variants_metadata.parquet",
        combined,
    )
def generate_single_backend(args, variant_key):
    """Load the genotype file in a single call and save it as two parquet files.

    Writes ``<output_prefix>.variants.parquet`` with the variants and
    ``<output_prefix>.variants_metadata.parquet`` with the variant metadata.

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``impute_to_mean``, ``input_genotype_file`` and ``output_prefix``
            (``get_filter`` receives the whole namespace as well).
        variant_key: variant lookup forwarded to ``get_filter`` and to the
            genotype loader.
    """
    logging.info("Loading Genotype")
    dosage_conversion = (
        GenotypeUtilities.impute_to_mean_conversion if args.impute_to_mean else None
    )
    dosage_filter = get_filter(args, variant_key)
    genotype, individual_ids = ModelTraining.load_genotype_file(
        args.input_genotype_file, variant_key, dosage_conversion, dosage_filter
    )

    logging.info("Saving Genotype")
    path_variant = args.output_prefix + ".variants.parquet"
    Parquet.save_variants(path_variant, genotype, individual_ids)

    # This step writes the variant metadata; the message previously repeated
    # "Saving Genotype", which made the two steps indistinguishable in logs.
    logging.info("Saving metadata")
    path_metadata_variant = args.output_prefix + ".variants_metadata.parquet"
    Parquet.save_metadata(path_metadata_variant, genotype)