def run(args):
    """Convert a genotype folder to parquet (variants + metadata), then process expression phenotypes.

    Side effects: writes ``<output_prefix>.variants.parquet`` and
    ``<output_prefix>.variants_metadata.parquet``, and invokes
    ``process_phenotype`` once per matched expression file.
    """
    started = timer()
    Utilities.ensure_requisite_folders(args.output_prefix)

    logging.info("Loading SNP annotation")
    # Map varID -> rsid_dbSNP150, dropping NA entries.
    snp_key = KeyedDataSource.load_data(
        args.snp_annotation_file, "varID", "rsid_dbSNP150",
        should_skip=KeyedDataSource.skip_na)

    logging.info("Loading Genotype")
    genotype, individual_ids = ModelTraining.load_genotype_folder(
        args.input_genotype_folder, args.input_genotype_file_pattern, snp_key)

    logging.info("Saving Genotype")
    Parquet.save_variants(
        args.output_prefix + ".variants.parquet", genotype, individual_ids)
    Parquet.save_metadata(
        args.output_prefix + ".variants_metadata.parquet", genotype)

    logging.info("Processing Expression Phenotype")
    expression_logic = Utilities.file_logic(
        args.input_phenotype_folder, args.input_phenotype_expression_pattern)
    for entry in expression_logic.itertuples():
        logging.info("Phenotype: %s", entry.name)
        process_phenotype(entry.path, entry.name, args.output_prefix)

    logging.info("Finished in %s", str(timer() - started))
def get_gene_variant_list(model_db_folder, pattern):
    """Collect the de-duplicated (gene, varID, rsid) rows from every model db in a folder.

    Args:
        model_db_folder: folder containing sqlite prediction-model databases.
        pattern: filename pattern passed to ``Utilities.file_logic`` to select dbs.

    Returns:
        pandas.DataFrame with columns ``gene``, ``varID``, ``rsid``;
        empty (but with those columns) when no db matches the pattern.
    """
    logic = Utilities.file_logic(model_db_folder, pattern)
    frames = []
    for i, entry in enumerate(logic.itertuples()):
        logging.log(9, "Opening db %d/%d: %s ", i + 1, logic.shape[0], entry.name)
        # NOTE: sqlite3's context manager only wraps a transaction — it does
        # NOT close the connection — so close explicitly to avoid leaking a
        # file handle per database in the folder.
        connection = sqlite3.connect(entry.path)
        try:
            w = pandas.read_sql("select * from weights;", connection)[["gene", "varID", "rsid"]]
        finally:
            connection.close()
        frames.append(w)
    if not frames:
        # pandas.concat([]) raises ValueError; return an empty frame with the
        # expected schema instead of crashing on an empty folder.
        return pandas.DataFrame(columns=["gene", "varID", "rsid"])
    return pandas.concat(frames).drop_duplicates()
def run(args):
    """Parse DAP-G(-style) fine-mapping output files into six tabular gzip files.

    For every input file matched by ``args.input_pattern`` under
    ``args.input_folder``, each line is dispatched against a series of
    precompiled regexes (``model_re``, ``model_expected_size_re``,
    ``lognc_re``, ``variant_re``, ``cluster_re`` — defined elsewhere in this
    module) and the parsed fields are appended to the corresponding output:

    - ``<prefix>.models.txt.gz``               per-model posterior rows
    - ``<prefix>.models_variants.txt.gz``      model -> variant membership
    - ``<prefix>.model_summary.txt.gz``        one summary row per gene
    - ``<prefix>.variants_pip.txt.gz``         per-variant PIP rows
    - ``<prefix>.clusters.txt.gz``             per-cluster rows
    - ``<prefix>.cluster_correlations.txt.gz`` cluster-cluster correlations

    NOTE(review): the parse_* helpers are not visible here; the exact line
    formats they expect should be confirmed against the DAP-G output spec.
    """
    logging.info("Processing...")
    Utilities.ensure_requisite_folders(args.output_prefix)
    # DataFrame of (name, path) for each input file matching the pattern.
    spec = Utilities.file_logic(args.input_folder, args.input_pattern)
    # All six writers stay open for the whole scan; headers written up front.
    # gzip.open in "w" mode is binary, hence the .encode() on every write.
    with gzip.open(args.output_prefix + ".models.txt.gz", mode="w") as models:
        models.write("gene\tmodel\tn\tpp\tps\n".encode())
        with gzip.open(args.output_prefix + ".models_variants.txt.gz", mode="w") as model_variants:
            model_variants.write("gene\tmodel\tvariant\n".encode())
            with gzip.open(args.output_prefix + ".model_summary.txt.gz", mode="w") as model_summary:
                model_summary.write("gene\tpes\tpes_se\tlog_nc\tlog10_nc\n".encode())
                with gzip.open(args.output_prefix + ".variants_pip.txt.gz", mode="w") as variant_pip:
                    variant_pip.write("gene\trank\tvariant_id\tpip\tlog10_abf\tcluster_id\n".encode())
                    with gzip.open(args.output_prefix + ".clusters.txt.gz", mode="w") as clusters:
                        clusters.write("gene\tcluster\tn_snps\tpip\taverage_r2\n".encode())
                        with gzip.open(args.output_prefix + ".cluster_correlations.txt.gz", mode="w") as cluster_correlations:
                            cluster_correlations.write("gene\tid1\tid2\tvalue\n".encode())
                            for i, t in enumerate(spec.itertuples()):
                                logging.log(9, "Processing %s", t.name)
                                # Dedup set of (id1, id2) correlation pairs,
                                # reset per input file.
                                written = set()
                                with open(t.path) as dap:
                                    # Summary fields accumulated across lines:
                                    # (p, pse) arrive on the expected-size line,
                                    # (lognc, log10nc) on the log-nc line, which
                                    # also triggers writing the summary row —
                                    # order within the file matters here.
                                    p, pse, lognc, log10nc = None, None, None, None
                                    for l in dap:
                                        # Each regex claims the line exclusively
                                        # (continue on first match).
                                        s = model_re.search(l)
                                        if s:
                                            ml = parse_model_line(t.name, s)
                                            models.write(ml.encode())
                                            # A model line may also enumerate
                                            # member variants (may be falsy).
                                            vl = parse_model_line_for_variant(t.name, s)
                                            if vl:
                                                for vl_ in vl:
                                                    model_variants.write(vl_.encode())
                                            continue
                                        s = model_expected_size_re.search(l)
                                        if s:
                                            p, pse = parse_expected_size(s)
                                            continue
                                        s = lognc_re.search(l)
                                        if s:
                                            lognc, log10nc = parse_log_10_nc(s)
                                            # Summary row uses p/pse captured
                                            # from the earlier expected-size
                                            # line (None if it never appeared).
                                            model_summary.write("{}\t{}\t{}\t{}\t{}\n".format(t.name, p, pse, lognc, log10nc).encode())
                                            continue
                                        s = variant_re.search(l)
                                        if s:
                                            rank, id, pip, log10_abvf, cluster_id = parse_variant_line(s)
                                            variant_pip.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(t.name, rank, id, pip, log10_abvf, cluster_id).encode())
                                            continue
                                        s = cluster_re.search(l)
                                        if s:
                                            id, n, pip, r2 = parse_cluster_line(s)
                                            clusters.write("{}\t{}\t{}\t{}\t{}\n".format(t.name, id, n, pip, r2).encode())
                                            # The cluster line also carries a
                                            # whitespace-separated correlation
                                            # vector against clusters 1..len;
                                            # emit each unordered pair once.
                                            _id1 = int(id)
                                            comps = s.group("correlation").strip().split()
                                            for _id2 in range(1, len(comps) + 1):
                                                if (_id1, _id2) in written or (_id2, _id1) in written:
                                                    continue
                                                comp = comps[_id2 - 1]
                                                cluster_correlations.write("{}\t{}\t{}\t{}\n".format(t.name, _id1, _id2, comp).encode())
                                                written.add((_id1, _id2))
    logging.info("Finished")