Example #1
0
    def processPrediXcanFiles(self):
        """Run the filtered-file builder over every dosage file in the folder.

        Loads the people listed in ``self.samples_input`` and
        ``self.samples_output`` and the SNP list in ``self.snp_list``, then,
        for each entry of ``self.dosage_folder`` matching ``"dosage.txt.gz"``,
        creates a ``PrediXcanFormatFilteredFilesProcess`` and calls the build
        method that corresponds to ``self.output_format``.

        Raises:
            Exceptions.InvalidOutputFormat: if ``self.output_format`` is
                neither ``Formats.IMPUTE`` nor ``Formats.PrediXcan``. The
                check happens per file, so nothing is raised when no file
                matches.
        """
        logging.info("Loading people")
        # NOTE(review): '\t' is presumably the column delimiter (the IMPUTE
        # variant passes ``delim=" "``); the meaning of ``False`` is not
        # visible from here -- confirm against Person.loadPeople.
        all_people = Person.Person.loadPeople(self.samples_input, '\t', False)
        selected_people = Person.Person.loadPeople(self.samples_output)
        selected_people_by_id = {p.id: p for p in selected_people}
        logging.info("%d total people, %d selected", len(all_people),
                     len(selected_people_by_id))

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
            self.snp_list)
        # Dict used purely as a membership lookup; the values are never read
        # in this method.
        snp_dict = {k: True for k in snp_data_set.data}
        logging.info("%d snps loaded", len(snp_dict))

        contents = Utilities.contentsWithPatternsFromFolder(
            self.dosage_folder, ["dosage.txt.gz"])
        for content_name in contents:
            input_path = os.path.join(self.dosage_folder, content_name)
            fileBuilder = PrediXcanFormatUtilities.PrediXcanFormatFilteredFilesProcess(
                input_path, self.output_folder, content_name, all_people,
                selected_people_by_id, snp_dict)
            # One if/elif/else chain: with two separate ``if`` statements the
            # ``else`` would also fire after a successful IMPUTE build.
            if self.output_format == Formats.IMPUTE:
                fileBuilder.buildIMPUTE()
            elif self.output_format == Formats.PrediXcan:
                fileBuilder.buildPrediXcan()
            else:
                raise Exceptions.InvalidOutputFormat(self.output_format)
Example #2
0
    def processIMPUTEFiles(self):
        """Run the IMPUTE filtered-dosage builder once per hap name.

        The names come from ``Utilities.hapNamesFromFolder`` applied to
        ``self.dosage_folder``. Each builder is configured with the loaded
        people and SNP lookup and then asked to emit ``self.output_format``.
        For PrediXcan output the chromosome name is taken from group 1 of
        ``self.chromosome_in_name_regex`` searched against the name.

        Raises:
            Exceptions.InvalidOutputFormat: if ``self.output_format`` is
                neither ``Formats.IMPUTE`` nor ``Formats.PrediXcan``.
        """
        logging.info("Loading people")
        names = Utilities.hapNamesFromFolder(self.dosage_folder)
        all_people = Person.Person.loadPeople(self.samples_input)

        selected_people_by_id = dict(
            (person.id, person)
            for person in Person.Person.loadPeople(self.samples_output, delim=" "))

        logging.info("Loading snps")
        snp_data_set = DataSet.DataSetFileUtilities.loadFromCompressedFile(
            self.snp_list)
        # Membership lookup keyed by the entries of the SNP data set.
        snp_dict = dict.fromkeys(snp_data_set.data, True)

        for name in names:
            builder = ThousandGenomesUtilities.IMPUTEFilteredDosageFileBuilder()
            builder.base_path = self.dosage_folder
            builder.name = name
            builder.output_pattern = os.path.join(self.output_folder, name)
            builder.snp_dict = snp_dict
            builder.all_people = all_people
            builder.selected_people_by_id = selected_people_by_id

            if self.output_format == Formats.IMPUTE:
                builder.buildIMPUTE()
                continue

            if self.output_format == Formats.PrediXcan:
                match = self.chromosome_in_name_regex.search(name)
                # exitIf is handed the failure condition, the exception type
                # and the message to use when the regex finds nothing.
                exitIf(match is None, Exceptions.InvalidInputFormat,
                       "No files found in '%s' that match the pattern, '%s'"
                       % (self.dosage_folder,
                          self.chromosome_in_name_regex.pattern))
                builder.chromosome_name = match.group(1)
                builder.buildPrediXcan()
                continue

            raise Exceptions.InvalidOutputFormat(self.output_format)