def run(args):
    """Regress every data column on the covariates and save the residuals.

    Reads two parquet tables, ``args.covariate`` and ``args.data``. Both must
    contain an ``individual`` column, which is used as the join key. Every
    column after the first one is treated as a covariate (respectively, as a
    variable to residualize), so the first column is presumably
    ``individual`` itself -- TODO confirm against the file producers.

    For each variable, an ordinary least squares model
    ``y ~ cov_1 + cov_2 + ...`` is fitted on the rows shared by both tables
    (inner join on ``individual``) and the residuals are kept. The output
    table has ``individual`` first, followed by one residual column per
    variable in the original column order, and is written to ``args.output``
    through ``Parquet.save_variable``.

    Rows excluded from a fit by the formula machinery (presumably rows with
    missing values -- confirm against patsy's NA handling) are NaN in that
    variable's residual column.

    Args:
        args: object exposing ``covariate``, ``data`` and ``output`` paths.
    """
    logging.info("Starting")
    Utilities.ensure_requisite_folders(args.output)

    logging.info("Read covariate")
    covariate = pq.read_table(args.covariate).to_pandas()
    logging.info("Read data")
    data = pq.read_table(args.data).to_pandas()

    logging.info("Processing")
    covariate_names = covariate.columns.values[1:]
    variables = list(data.columns.values[1:])
    # The formula does not depend on the variable being fitted; build it once.
    formula = "y ~ {}".format(" + ".join(covariate_names))

    # Individuals in the row order produced by the per-variable merge below.
    # The merged rows depend only on the join key, so a key-only merge yields
    # the same rows in the same order. Residuals are labelled with these ids
    # rather than with the raw `data.individual`, which has a different
    # length/order whenever the inner join drops or duplicates rows.
    merged_individuals = data[["individual"]].merge(
        covariate[["individual"]], on="individual",
        how="inner").individual.values
    if len(merged_individuals) != len(data):
        logging.warning(
            "Data has %i rows but %i rows remain after merging with covariates",
            len(data), len(merged_individuals))

    residuals = {}
    for i, column in enumerate(variables):
        logging.log(9, "%i/%i:%s", i, len(variables), column)
        d = data[["individual", column]].rename(columns={column: "y"})
        d = d.merge(covariate, on="individual", how="inner")
        d = d.drop("individual", axis=1)
        y, X = dmatrices(formula, data=d, return_type="dataframe")
        # `resid` keeps the positional index of the merged frame `d`.
        residuals[column] = sm.OLS(y, X).fit().resid

    # An explicit positional index keeps every merged row, even one that was
    # excluded from all fits, so the id column always lines up.
    results = pandas.DataFrame(residuals,
                               index=range(len(merged_individuals)),
                               columns=variables)
    results.insert(0, "individual", merged_individuals)
    Parquet.save_variable(args.output, results)
    logging.info("Finished")
Example #2
0
def run(args):
    """Load a variable file and store it again through the parquet writer.

    Makes sure the folders required for ``args.parquet_output`` exist, loads
    ``args.variable_file`` with ``ModelTraining.load_variable_file`` and hands
    the result to ``Parquet.save_variable``. The elapsed time is logged.

    Args:
        args: object exposing ``variable_file`` and ``parquet_output``.
    """
    began = timer()
    Utilities.ensure_requisite_folders(args.parquet_output)

    logging.info("Loading variable")
    loaded = ModelTraining.load_variable_file(args.variable_file)

    logging.info("Saving")
    Parquet.save_variable(args.parquet_output, loaded)

    elapsed = timer() - began
    logging.info("Finished in %s", str(elapsed))
Example #3
0
def process_phenotype(path, name, output_prefix):
    """Load the variable file at ``path`` and save it as a parquet file.

    The destination is ``<output_prefix>.expression.<name>.parquet``.

    Args:
        path: location of the variable file to load.
        name: label inserted into the output file name.
        output_prefix: leading part of the output file name.
    """
    phenotype = ModelTraining.load_variable_file(path)
    # The destination is assembled only after the load, as before.
    destination = "".join((output_prefix, ".expression.", name, ".parquet"))
    Parquet.save_variable(destination, phenotype)