Example #1
0
    def run(self):
        """Build betas for every GWAS file matching the configured pattern.

        Loads the weight model when ``self.args.weight_db_path`` is set,
        creates ``self.output_folder`` if it is missing, and then calls
        ``self.buildBetas`` once per matching file name.

        Raises:
            Exceptions.ReportableException: when no file in
                ``self.gwas_folder`` matches ``self.gwas_regexp``.
        """
        weight_db_logic = None
        if self.args.weight_db_path:
            logging.info("Loading weight model")
            weight_db_logic = WeightDBUtilities.WeightDBEntryLogic(
                self.weight_db_path)

        names = Utilities.contentsWithRegexpFromFolder(
            self.gwas_folder, self.gwas_regexp)

        # The output folder is created before the "no files" check, so it
        # exists even when the run aborts below.
        if not os.path.exists(self.output_folder):
            os.makedirs(self.output_folder)

        if len(names) == 0:
            message = "No GWAS files found on %s with pattern %s" % (
                self.gwas_folder,
                self.gwas_regexp.pattern,
            )
            raise Exceptions.ReportableException(message)

        for gwas_name in names:
            try:
                self.buildBetas(weight_db_logic, gwas_name)
            except Exceptions.BadFilename as bad_name:
                # The folder may hold unrelated files; they are skipped
                # rather than treated as fatal.
                logging.info("Wrong file name: %s, skipping", bad_name.msg)
Example #2
0
    def processPrediXcanFiles(self):
        """Filter every PrediXcan dosage file into the requested output format.

        Loads the full and the selected sample lists and the SNP whitelist,
        then processes each ``dosage.txt.gz`` file found in
        ``self.dosage_folder``.

        Raises:
            Exceptions.InvalidOutputFormat: when ``self.output_format`` is
                neither ``Formats.IMPUTE`` nor ``Formats.PrediXcan``.
        """
        logging.info("Loading people")
        all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
        selected_people = Person.Person.loadPeople(self.samples_output)
        selected_people_by_id = {p.id: p for p in selected_people}
        logging.info("%d total people, %d selected", len(all_people),
                     len(selected_people_by_id))

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
            self.snp_list)
        # Used as a membership lookup; only the keys matter.
        snp_dict = {k: True for k in snp_data_set.data}
        # Single-argument call form prints the same on Python 2 and Python 3.
        print(len(snp_dict))

        contents = Utilities.contentsWithPatternsFromFolder(
            self.dosage_folder, ["dosage.txt.gz"])
        for content_name in contents:
            input_path = os.path.join(self.dosage_folder, content_name)
            fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(
                input_path, self.output_folder, content_name, all_people,
                selected_people_by_id, snp_dict)
            # One exclusive chain: previously the second test was a separate
            # `if`, so a successful IMPUTE build fell through to the `else`
            # and raised InvalidOutputFormat.
            if self.output_format == Formats.IMPUTE:
                fileBuilder.buildIMPUTE()
            elif self.output_format == Formats.PrediXcan:
                fileBuilder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
Example #3
0
def load_samples(args):
    """Return the sample identifiers selected by the command-line arguments.

    Sources are tried in this order: ``text_sample_ids``, ``vcf_genotypes``,
    ``bgen_genotypes``, ``generate_sample_ids``. The text and generated
    sources yield a DataFrame with ``FID`` and ``IID`` columns; the VCF and
    BGEN sources return whatever their ``get_samples`` helper returns.

    Raises:
        Exceptions.InvalidArguments: when no source produced any samples.
    """
    samples = None
    text_spec = args.text_sample_ids

    if text_spec:
        if len(text_spec) == 1:
            # Plain two-column file without a header row.
            samples = pandas.read_table(text_spec[0],
                                        header=None,
                                        names=["FID", "IID"])
        elif text_spec[1] == "UKB":
            # Space-separated sample file; rows whose `sex` is "D" are dropped.
            ukb = pandas.read_table(text_spec[0], sep=" ")
            ukb = ukb[ukb.sex != "D"].reset_index(drop=True)
            renames = {"ID_1": "FID", "ID_2": "IID"}
            samples = ukb[["ID_1", "ID_2"]].rename(columns=renames)
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        samples = CYVCF2Genotype.get_samples(args.vcf_genotypes[0])
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        samples = BGENGenotype.get_samples(args.bgen_genotypes[0])
    elif args.generate_sample_ids:
        # Synthetic ids: family id and individual id are the same string.
        generated = ["ID_{}".format(i)
                     for i in range(args.generate_sample_ids)]
        pairs = [(sample_id, sample_id) for sample_id in generated]
        samples = pandas.DataFrame(data=pairs, columns=["FID", "IID"])

    if samples is None:
        raise Exceptions.InvalidArguments("Unsupported samples argument")
    return samples
def readGWAS(args):
    """Parse every GWAS file in ``args.gwas_folder`` into one DataFrame.

    Files are selected with ``args.gwas_file_pattern`` when it is given and
    processed in sorted name order.

    Returns:
        The concatenation of the per-file results of ``build_betas``.

    Raises:
        Exceptions.ReportableException: when no file matches.
    """
    start = timer()
    validate(args)
    regexp = re.compile(args.gwas_file_pattern) if args.gwas_file_pattern else None
    names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder, regexp)
    names.sort()  # cosmetic, because different filesystems/OS yield folders in different order

    if len(names) == 0:
        msg = "No GWAS files found on %s with pattern %s" % (args.gwas_folder, args.gwas_file_pattern,)
        raise Exceptions.ReportableException(msg)

    # Call form works on both Python 2 and 3; the statement form is a
    # syntax error on Python 3.
    print("INFO: Reading GWAS data")
    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    # No prediction model is loaded here; build_betas receives None.
    model = None

    # Concatenate once instead of re-copying the growing frame on every
    # iteration (and without seeding the result with an empty DataFrame).
    frames = [build_betas(args, model, gwas_format, name) for name in names]
    r = pandas.concat(frames)

    end = timer()
    logging.info("Successfully parsed input gwas in %s seconds"%(str(end-start)))
    print("Successfully parsed input gwas in %s seconds"%(str(end-start)))
    return r
Example #5
0
 def buildDosages(self):
     """Dispatch dosage processing according to ``self.input_format``.

     Raises:
         Exceptions.InvalidInputFormat: for any format other than
             ``Formats.IMPUTE`` or ``Formats.PrediXcan``.
     """
     if self.input_format == Formats.IMPUTE:
         self.processIMPUTEFiles()
         return
     if self.input_format == Formats.PrediXcan:
         self.processPrediXcanFiles()
         return
     raise Exceptions.InvalidInputFormat(self.input_format)
Example #6
0
def run(args):
    """Process GWAS input into betas, writing files or returning a DataFrame.

    Input is either every matching file in ``args.gwas_folder`` (sorted by
    name) or the single ``args.gwas_file``.

    When ``args.output_folder`` is set, each input yields one gzip-compressed,
    tab-separated file in that folder; existing outputs are skipped and
    nothing is returned. Otherwise the per-file results are concatenated and
    returned.

    Raises:
        Exceptions.ReportableException: when ``args.gwas_folder`` contains no
            matching file.
    """
    start = timer()
    validate(args)

    if args.gwas_folder:
        regexp = re.compile(
            args.gwas_file_pattern) if args.gwas_file_pattern else None
        names = Utilities.contentsWithRegexpFromFolder(args.gwas_folder,
                                                       regexp)
        # cosmetic, because different filesystems/OS yield folders in different order
        names.sort()

        if len(names) == 0:
            msg = "No GWAS files found on %s with pattern %s" % (
                args.gwas_folder,
                args.gwas_file_pattern,
            )
            raise Exceptions.ReportableException(msg)
    else:
        names = [args.gwas_file]

    gwas_format = GWASUtilities.gwas_format_from_args(args)
    GWAS.validate_format_basic(gwas_format)
    GWAS.validate_format_for_strict(gwas_format)
    model = PredictionModel.load_model(
        args.model_db_path,
        args.model_db_snp_key) if args.model_db_path else None

    if args.output_folder:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)

        for name in names:
            # Only the file name goes under the output folder: `name` may be
            # a full path (--gwas_file), and joining an absolute path would
            # discard the output folder entirely.
            output_path = os.path.join(args.output_folder,
                                       os.path.basename(name))
            # Test the suffix, not a substring, so ".gz" elsewhere in the
            # path does not suppress the extension.
            if not output_path.endswith(".gz"):
                output_path += ".gz"
            if os.path.exists(output_path):
                logging.info(
                    "%s already exists, delete it if you want it to be done again",
                    output_path)
                continue

            b = build_betas(args, model, gwas_format, name)
            # The output always carries a .gz name, so always gzip it;
            # previously plain text was written under a .gz name whenever
            # the input name lacked ".gz".
            b.to_csv(output_path, sep="\t", index=False, compression="gzip")
        end = timer()
        logging.info("Successfully ran GWAS input processing in %s seconds" %
                     (str(end - start)))
    else:
        # Concatenate once rather than re-copying the growing frame per file.
        frames = [build_betas(args, model, gwas_format, name)
                  for name in names]
        r = pandas.concat(frames)
        end = timer()
        logging.info("Successfully parsed input gwas in %s seconds" %
                     (str(end - start)))

        return r
Example #7
0
def _prepare_phenotype(context):
    """Load the phenotype into ``context`` and settle the regression mode.

    Sets ``context.pheno`` and ``context.mode``. When both a covariates file
    and covariate names are given, the mode is forced to
    ``MTPMode.K_LINEAR`` and the phenotype is replaced by its residuals.

    Raises:
        Exceptions.InvalidArguments: in logistic mode, when a phenotype value
            cannot be converted to float, or when the values are not limited
            to 0, 1 and NaN.
    """
    logging.info("Accquiring phenotype")
    context.pheno = _pheno_from_file_and_column(context.args.input_phenos_file, context.args.input_phenos_column)
    if context.args.mode == MTPMode.K_LOGISTIC:
        # Only the conversion is guarded. The former bare `except:` also
        # caught the "not binomial" error raised below and replaced it with
        # the misleading "could not parse" message.
        try:
            v = {str(float(x)) for x in context.pheno}
        except (TypeError, ValueError):
            raise Exceptions.InvalidArguments("Logistic regression: could not parse phenotype")
        if not v.issubset({'0.0', '1.0', 'nan'}):
            raise Exceptions.InvalidArguments("Logistic regression was asked but phenotype is not binomial")

    context.mode = context.args.mode
    if context.args.covariates_file and context.args.covariates:
        # Residuals are fitted with the linear mode, whatever was requested.
        context.mode = MTPMode.K_LINEAR
        logging.info("Acquiring covariates")
        context.covariates = _get_covariates(context.args)
        logging.info("Replacing phenotype with residuals")
        context.pheno = _get_residual(context.pheno, context.covariates)
Example #8
0
def dosage_generator(args, variant_mapping=None, weights=None):
    """Build the genotype line source selected by the command-line arguments.

    Args:
        args: parsed arguments; exactly which attributes are read is visible
            in the body (genotype inputs, liftover, palindromic handling).
        variant_mapping: optional mapping passed through to the genotype
            reader. When it is a non-empty dict its keys become the variant
            whitelist.
        weights: object exposing ``rsid``; supplies the whitelist when
            ``variant_mapping`` is not a non-empty dict.

    Returns:
        Whatever the selected genotype reader returns, optionally wrapped by
        ``Genotype.force_mapped_metadata``.

    Raises:
        Exceptions.InvalidArguments: when no genotype input is given.
    """
    if args.liftover:
        logging.info("Acquiring liftover conversion")
        liftover_chain = pyliftover.LiftOver(args.liftover)
        liftover_conversion = lambda chr, pos: Genomics.lift(
            liftover_chain, chr, pos, args.zero_based_positions)
    else:
        liftover_conversion = None

    # isinstance (rather than an exact type comparison) so dict subclasses
    # are also treated as a mapping whose keys define the whitelist.
    if variant_mapping and isinstance(variant_mapping, dict):
        logging.info("Setting whitelist from mapping keys")
        whitelist = set(variant_mapping.keys())
    else:
        logging.info("Setting whitelist from available models")
        whitelist = set(weights.rsid)

    d = None
    if args.text_genotypes:
        from metax.genotype import DosageGenotype
        d = DosageGenotype.dosage_files_geno_lines(
            args.text_genotypes,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)
    elif args.bgen_genotypes:
        from metax.genotype import BGENGenotype
        # NOTE(review): unlike the other readers, no liftover_conversion is
        # passed here -- confirm that BGEN input is not meant to support it.
        d = BGENGenotype.bgen_files_geno_lines(
            args.bgen_genotypes,
            variant_mapping=variant_mapping,
            force_colon=args.force_colon,
            use_rsid=args.bgen_use_rsid,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic)
    elif args.vcf_genotypes:
        from metax.genotype import CYVCF2Genotype
        d = CYVCF2Genotype.vcf_files_geno_lines(
            args.vcf_genotypes,
            mode=args.vcf_mode,
            variant_mapping=variant_mapping,
            whitelist=whitelist,
            skip_palindromic=args.skip_palindromic,
            liftover_conversion=liftover_conversion)

    if d is None:
        raise Exceptions.InvalidArguments("unsupported genotype input")
    if args.force_mapped_metadata:
        d = Genotype.force_mapped_metadata(d, args.force_mapped_metadata)
    return d
Example #9
0
def prepare_prediction(args, extra, samples):
    """Create the repository that will collect prediction results.

    ``args.prediction_output[0]`` is handed to the repository. A second
    element selects the repository type; only ``"HDF5"`` is accepted, and
    without one the basic repository is used.

    Raises:
        Exceptions.InvalidArguments: when a second element other than
            ``"HDF5"`` is given.
    """
    logging.info("Preparing prediction")
    output_spec = args.prediction_output

    if len(output_spec) < 2:
        from metax.predixcan.Utilities import BasicPredictionRepository
        return BasicPredictionRepository(samples, extra, output_spec[0])

    if output_spec[1] != "HDF5":
        raise Exceptions.InvalidArguments("Unsupported output specification")

    from metax.predixcan.Utilities import HDF5PredictionRepository
    return HDF5PredictionRepository(samples, extra, output_spec[0])
Example #10
0
def expression_from_args(args, prediction_results = None):
    """Pick the expression source: in-memory results, HDF5 file or text file.

    Truthy ``prediction_results`` take precedence over the file arguments.

    Raises:
        Exceptions.ReportableException: when ``prediction_results`` is not a
            ``BasicPredictionRepository``.
        RuntimeError: when no source is available.
    """
    if prediction_results:
        if not isinstance(prediction_results, BasicPredictionRepository):
            raise Exceptions.ReportableException("Invalid prediction results")
        logging.info("Preparing PrediXcan context from data")
        return Expression.ExpressionFromData(prediction_results.genes)

    if args.hdf5_expression_file:
        logging.info("Preparing PrediXcan HDF5 context")
        return HDF5Expression.Expression(args.hdf5_expression_file)

    if args.expression_file:
        logging.info("Preparing PrediXcan context")
        return PlainTextExpression.Expression(args.expression_file)

    raise RuntimeError("Could not build context from arguments")
Example #11
0
 def buildPeople(self):
     """Write the filtered samples file unless it already exists.

     Raises:
         Exceptions.InvalidInputFormat: when ``self.input_format`` is neither
             ``Formats.IMPUTE`` nor ``Formats.PrediXcan``.
     """
     if os.path.exists(self.samples_output):
         # An existing file is kept as-is; nothing is rebuilt.
         logging.info(
             "%s already exists, delete it if you want it figured out again",
             self.samples_output)
         return

     if self.input_format == Formats.IMPUTE:
         Person.Person.buildFilteredSamples(
             self.samples_input, self.samples_output,
             self.population_group_filters, self.individual_filters)
         return

     if self.input_format == Formats.PrediXcan:
         # This format is read tab-delimited and with skip_header disabled.
         Person.Person.buildFilteredSamples(
             self.samples_input,
             self.samples_output,
             group_filters=self.population_group_filters,
             individual_filters=self.individual_filters,
             row_delimiter="\t",
             skip_header=False)
         return

     raise Exceptions.InvalidInputFormat(self.input_format)
Example #12
0
def get_variant_mapping(args, weights):
    """Build the variant mapping requested on the command line.

    Args:
        args: parsed arguments providing ``variant_mapping`` (empty, or
            three values passed to ``KeyedDataSource.load_data``) and
            ``on_the_fly_mapping`` (empty, or ``["METADATA", <format>]``).
        weights: object exposing ``rsid``; used to restrict the mapping.

    Returns:
        ``None`` when no mapping was requested, the loaded mapping, or a
        callable ``(chromosome, position, ref_allele, alt_allele)`` when
        on-the-fly mapping is requested.

    Raises:
        Exceptions.InvalidArguments: when ``variant_mapping`` does not have
            exactly three values.
        RuntimeError: when the on-the-fly mode is not ``"METADATA"``.
    """
    mapping = None

    if args.variant_mapping:
        if len(args.variant_mapping) != 3:
            raise Exceptions.InvalidArguments(
                "Unsupported variant mapping argument")
        logging.info("Acquiring variant mapping")
        mapping = KeyedDataSource.load_data(args.variant_mapping[0],
                                            args.variant_mapping[1],
                                            args.variant_mapping[2],
                                            value_white_list=set(
                                                weights.rsid))

    if args.on_the_fly_mapping:
        logging.info("Acquiring on-the-fly mapping")
        if args.on_the_fly_mapping[0] != "METADATA":
            raise RuntimeError("Unsupported on_the_fly argument")

        if mapping:
            # Keep the loaded mapping under its own name: `mapping` is
            # rebound to the closure below, which must not refer to itself.
            _mapping = mapping
            mapping = lambda chromosome, position, ref_allele, alt_allele: Genomics.map_on_the_fly(
                _mapping, args.on_the_fly_mapping[1], chromosome, position,
                ref_allele, alt_allele)
        else:
            # Bound here so the closure always finds it. It used to be set
            # only when `variant_mapping` was empty, so a loaded-but-empty
            # mapping produced a callable that raised NameError when called.
            checklist = set(weights.rsid)
            mapping = lambda chromosome, position, ref_allele, alt_allele: Genomics.coordinate_format(
                checklist, args.on_the_fly_mapping[1], chromosome,
                position, ref_allele, alt_allele)
    return mapping
Example #13
0
    def processIMPUTEFiles(self):
        """Filter every IMPUTE file set in the dosage folder.

        For each name returned by ``Utilities.hapNamesFromFolder`` a builder
        is configured with the people and SNP selections and asked to emit
        the requested output format.

        Raises:
            Exceptions.InvalidOutputFormat: when ``self.output_format`` is
                neither ``Formats.IMPUTE`` nor ``Formats.PrediXcan``.
        """
        logging.info("Loading people")
        names = Utilities.hapNamesFromFolder(self.dosage_folder)
        all_people = Person.Person.loadPeople(self.samples_input)

        selected_people = Person.Person.loadPeople(self.samples_output,
                                                   delim=" ")
        selected_people_by_id = {
            person.id: person for person in selected_people
        }

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
            self.snp_list)
        # Used as a membership lookup; every value is simply True.
        snp_dict = dict.fromkeys(snp_data_set.data, True)

        for name in names:
            builder = ThousandGenomesUtilities.IMPUTEFilteredDosageFileBuilder()
            builder.base_path = self.dosage_folder
            builder.name = name
            builder.output_pattern = os.path.join(self.output_folder, name)
            builder.snp_dict = snp_dict
            builder.all_people = all_people
            builder.selected_people_by_id = selected_people_by_id

            if self.output_format == Formats.IMPUTE:
                builder.buildIMPUTE()
            elif self.output_format == Formats.PrediXcan:
                # The chromosome is taken from the first regex group
                # matched in the file name.
                match = self.chromosome_in_name_regex.search(name)
                exitIf(match is None, Exceptions.InvalidInputFormat,
                       "No files found in '%s' that match the pattern, '%s'"
                       % (self.dosage_folder,
                          self.chromosome_in_name_regex.pattern))
                builder.chromosome_name = match.group(1)
                builder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
Example #14
0
def validate(args):
    """Require exactly one of ``args.gwas_file`` and ``args.gwas_folder``.

    Raises:
        Exceptions.InvalidArguments: when both or neither are provided.
    """
    has_file = bool(args.gwas_file)
    has_folder = bool(args.gwas_folder)
    # Equal flags mean "both given" or "none given"; either is invalid.
    if has_file == has_folder:
        raise Exceptions.InvalidArguments(
            "Provide either (--gwas_file) or (--gwas_folder [--gwas_file_pattern])"
        )
Example #15
0
 def update(self, gene, dosage, weight):
     """Unsupported operation: always raises ``Exceptions.NotImplemented``."""
     message = "gene_repository is not implemented"
     raise Exceptions.NotImplemented(message)
Example #16
0
 def store_prediction(self):
     """Unsupported operation: always raises ``Exceptions.NotImplemented``."""
     message = "gene_repository is not implemented"
     raise Exceptions.NotImplemented(message)
Example #17
0
 def summary(self):
     """Unsupported operation: always raises ``Exceptions.NotImplemented``."""
     message = "gene_repository is not implemented"
     raise Exceptions.NotImplemented(message)
Example #18
0
def _check_args(args):
    """Reject arguments whose mode is not listed in ``MTPMode.K_MODES``.

    Raises:
        Exceptions.InvalidArguments: when ``args.mode`` is not a known mode.
    """
    if args.mode in MTPMode.K_MODES:
        return
    raise Exceptions.InvalidArguments("Invalid mode")
Example #19
0
def validate(args):
    """Require ``args.gwas_folder`` to be provided.

    Raises:
        Exceptions.InvalidArguments: when ``args.gwas_folder`` is empty or
            missing a value.
    """
    if args.gwas_folder:
        return
    raise Exceptions.InvalidArguments(
        "You need to provide an input folder containing GWAS files")