Example #1
    def load_family_details(self, pheno_covar):
        """Load family data updating the pheno_covar with  family ids found.

        :param pheno_covar: Phenotype/covariate object
        :return: None
        """
        file = open(self.fam_details)
        # The first two lines (header and format) precede the per-subject records
        header = file.readline()
        format = file.readline()
        self.file_index = 0

        mask_components = []  # 1s indicate an individual is to be masked out
        for line in file:
            words = line.strip().split()
            indid = ":".join(words[0:2])
            if DataParser.valid_indid(indid):
                mask_components.append(0)
                sex = int(words[5])
                pheno = float(words[6])
                pheno_covar.add_subject(indid, sex, pheno)
            else:
                mask_components.append(1)
        mask_components = numpy.array(mask_components)
        self.ind_mask = numpy.zeros(len(mask_components) * 2,
                                    dtype=numpy.int8).reshape(-1, 2)
        self.ind_mask[0:, 0] = mask_components
        self.ind_mask[0:, 1] = mask_components
        self.ind_count = self.ind_mask.shape[0]
        pheno_covar.freeze_subjects()
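The loop in Example #1 is easier to follow in isolation: each .fam-style record becomes a "FID:IID" key plus sex and phenotype values, and a 0/1 flag per subject is duplicated into two columns so the mask lines up with paired allele data. The sketch below is not library code; the records and the keep() filter are invented stand-ins for the real file and DataParser.valid_indid(), and the tuple list stands in for PhenoCovar.add_subject().

# Invented .fam-style records and a made-up exclusion rule, for illustration only
import numpy

records = [
    "F1 I1 0 0 1 1.25",      # FID IID PAT MAT SEX PHENO
    "F1 I2 0 0 2 0.80",
    "F2 I1 0 0 1 -9",
]

def keep(indid):
    # Stand-in for DataParser.valid_indid(); pretend family F2 is excluded
    return not indid.startswith("F2")

subjects = []
mask_components = []
for line in records:
    words = line.strip().split()
    indid = ":".join(words[0:2])
    if keep(indid):
        mask_components.append(0)
        subjects.append((indid, int(words[5]), float(words[6])))
    else:
        mask_components.append(1)

# Duplicate the flags so the mask lines up with two allele columns per subject
mask = numpy.array(mask_components)
ind_mask = numpy.zeros(len(mask) * 2, dtype=numpy.int8).reshape(-1, 2)
ind_mask[:, 0] = mask
ind_mask[:, 1] = mask
print(subjects)       # [('F1:I1', 1, 1.25), ('F1:I2', 2, 0.8)]
print(ind_mask)       # [[0 0] [0 0] [1 1]]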
Example #2
    def load_fam(self, pheno_covar=None):
        """Load contents from the .fam file, updating the pheno_covar with \
            family ids found.

        :param pheno_covar: Phenotype/covariate object
        :return: None
        """
        logging.info("Loading file: %s" % (self.fam_file))
        pheno_col = 5
        if not DataParser.has_sex:
            pheno_col -= 1
        if not DataParser.has_parents:
            pheno_col -= 2
        if not DataParser.has_fid:
            pheno_col -= 1

        sex_col = pheno_col - 1
        mask_components = []
        for line in open(self.fam_file):
            words = line.strip().split()
            if len(words) > 1:
                indid = ":".join(words[0:2])
                if DataParser.valid_indid(indid):
                    mask_components.append(0)

                    sex = None
                    pheno = None
                    if DataParser.has_sex:
                        sex = int(words[sex_col])
                    if DataParser.has_pheno:
                        pheno = float(words[pheno_col])
                    if pheno_covar is not None:
                        pheno_covar.add_subject(indid, sex, pheno)
                    if len(words) > 0:
                        self.families.append(words)
                else:
                    mask_components.append(1)
        mask_components = numpy.array(mask_components)
        # mask_components already holds the 0/1 flags, so use it directly
        self.ind_mask = mask_components
        self.ind_count = self.ind_mask.shape[0]
        if pheno_covar is not None:
            pheno_covar.freeze_subjects()
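The only subtle part of load_fam() (and of load_tfam() in the next example) is the column arithmetic: the phenotype index starts at 5 for the full FID IID PAT MAT SEX PHENO layout and shifts left for every optional column that is switched off. A minimal sketch of that arithmetic, with a hypothetical helper standing in for the DataParser class-level flags:

# Hypothetical helper mirroring the pheno_col bookkeeping in load_fam()/load_tfam()
def pheno_column(has_fid=True, has_parents=True, has_sex=True):
    col = 5                      # full layout: FID IID PAT MAT SEX PHENO
    if not has_sex:
        col -= 1
    if not has_parents:
        col -= 2
    if not has_fid:
        col -= 1
    return col

assert pheno_column() == 5                                          # FID IID PAT MAT SEX PHENO
assert pheno_column(has_parents=False) == 3                         # FID IID SEX PHENO
assert pheno_column(has_fid=False, has_parents=False, has_sex=False) == 1   # IID PHENO
# The sex code, when present, always sits immediately before the phenotype,
# which is why the snippets compute sex_col = pheno_col - 1.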
Example #3
    def load_tfam(self, pheno_covar):
        """Load the pedigree portion of the data and sort out exclusions"""

        pheno_col = 5
        if not DataParser.has_sex:
            pheno_col -= 1
        if not DataParser.has_parents:
            pheno_col -= 2
        if not DataParser.has_fid:
            pheno_col -= 1

        sex_col = pheno_col - 1
        mask_components = []
        for line in open(self.tfam_file):
            words = line.strip().split()
            if len(words) > 1:
                indid = ":".join(words[0:2])
                if DataParser.valid_indid(indid):
                    mask_components.append(0)

                    sex = None
                    pheno = None
                    if DataParser.has_sex:
                        sex = int(words[sex_col])
                    if DataParser.has_pheno:
                        pheno = float(words[pheno_col])
                    if pheno_covar is not None:
                        pheno_covar.add_subject(indid, sex, pheno)
                    if len(words) > 0:
                        self.families.append(words)
                else:
                    mask_components.append(1)
        mask_components = numpy.array(mask_components)
        self.ind_mask = numpy.zeros(len(mask_components) * 2,
                                    dtype=numpy.int8).reshape(-1, 2)
        self.ind_mask[0:, 0] = mask_components
        self.ind_mask[0:, 1] = mask_components
        self.ind_count = self.ind_mask.shape[0]

        if pheno_covar is not None:
            pheno_covar.freeze_subjects()
        self.load_genotypes()
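Example #3 stores two mask entries per subject, which matches the transposed layout where every individual contributes a pair of allele columns per line; a mask shaped this way can be applied to allele pairs with numpy's masked arrays, much as Example #5 applies its SNP mask. A small illustration with invented data (none of these values come from the library):

# One SNP's allele calls for three subjects; the middle subject is masked out
import numpy

alleles = numpy.array(["A", "A", "C", "C", "A", "C"]).reshape(-1, 2)
ind_mask = numpy.array([[0, 0], [1, 1], [0, 0]], dtype=numpy.int8)

# compressed() keeps only unmasked entries; reshape restores the allele pairs
kept = numpy.ma.MaskedArray(alleles, ind_mask).compressed().reshape(-1, 2)
print(kept)      # [['A' 'A'] ['A' 'C']] -- the masked subject is gone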
Example #4
    def load_family_details(self, pheno_covar):
        """Load contents from the .fam file, updating the pheno_covar with \
            family ids found.

        :param pheno_covar: Phenotype/covariate object
        :return: None
        """
        self.file_index = 0
        # 1s indicate an individual is to be masked out
        mask_components = []

        file = self.family_details
        if DataParser.compressed_pedigree:
            data, serr = sys_call('gunzip -c %s | wc -l' % (file))
            self.line_count = int(data[0].strip().split(" ")[0])
            iddata, serr = sys_call('gunzip -c %s | cut -f 1' % (file))
        else:
            data, serr = sys_call('wc -l %s' % (file))
            self.line_count = int(data[0].strip().split(" ")[0])
            iddata, serr = sys_call('cat %s | cut -f 1' % (file))

        ids_observed = set()
        for line in iddata:
            indid = line.strip().split()[0]
            indid = ":".join(indid.split("->"))

            ExitIf("Duplicate ID found in dose file: %s" % (indid),
                   indid in ids_observed)
            ids_observed.add(indid)

            if DataParser.valid_indid(indid):
                mask_components.append(0)
                pheno_covar.add_subject(indid, PhenoCovar.missing_encoding,
                                        PhenoCovar.missing_encoding)
            else:
                mask_components.append(1)

        self.ind_mask = numpy.array(mask_components) == 1
        self.ind_count = self.ind_mask.shape[0]
        pheno_covar.freeze_subjects()
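Example #4 mostly deals with subject IDs: MaCH-style "FAM->IND" labels are rewritten to "FAM:IND" and a set guards against duplicates (the real code aborts via ExitIf). The fragment below reproduces just that bookkeeping on invented id_lines; the shell pipeline (gunzip -c ... | cut -f 1) that produces them in the original is not shown.

# Invented ID lines standing in for the first column of the family details file
id_lines = ["F1->I1 extra fields", "F1->I2 extra fields", "F2->I1 extra fields"]

ids_observed = set()
mask_components = []
for line in id_lines:
    # Rewrite MaCH's FAM->IND separator to the FAM:IND form used elsewhere
    indid = ":".join(line.strip().split()[0].split("->"))
    if indid in ids_observed:
        raise SystemExit("Duplicate ID found in dose file: %s" % indid)
    ids_observed.add(indid)
    mask_components.append(0)        # 0 = keep; 1 would mean filtered out

print(sorted(ids_observed))          # ['F1:I1', 'F1:I2', 'F2:I1']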
Example #5
    def load_genotypes(self, pheno_covar):
        """Load all data into memory and propagate valid individuals to \
        pheno_covar.

        :param pheno_covar: Phenotype/covariate object is updated with \
            subject information
        :return: None
        """

        first_genotype = 6
        pheno_col = 5
        if not DataParser.has_sex:
            first_genotype -= 1
            pheno_col -= 1
        if not DataParser.has_parents:
            first_genotype -= 2
            pheno_col -= 2
        if not DataParser.has_pheno:
            first_genotype -= 1
        if not DataParser.has_fid:
            first_genotype -= 1
            pheno_col -= 1
        if DataParser.has_liability:
            first_genotype += 1

        sex_col = pheno_col - 1
        individual_mask = []
        self.individual_mask = []
        dropped_individuals = []

        # number of missing SNPs we can tolerate before dropping an individual
        max_missing_for_individual = numpy.sum(
            self.snp_mask[:, 0] == 0) * DataParser.ind_miss_tol

        if DataParser.compressed_pedigree:
            ind_count, err = sys_call("gzip -cd %s | wc -l" %
                                      ("%s.gz" % (self.datasource)))
        else:
            ind_count, err = sys_call("wc -l %s" % (self.datasource))
        ind_count = int(ind_count[0].split()[0]) + 1

        snp_count = numpy.sum(self.snp_mask[:, 0] == 0)

        allelic_data = numpy.empty((ind_count, snp_count, 2), dtype='S1')

        valid_allele_count = 0
        if DataParser.compressed_pedigree:
            input_file = gzip.open("%s.gz" % self.datasource, 'rb')
        else:
            input_file = open(self.datasource)

        for line in input_file:
            line = line.strip()
            if len(line) > 0:
                raw_data = line.strip().split()
                alleles = numpy.ma.MaskedArray(
                    numpy.array(raw_data[first_genotype:]).reshape(-1, 2),
                    self.snp_mask).compressed().reshape(-1, 2)

                # Convert the alleles into genotypes
                indid = ":".join(raw_data[0:2])
                if not DataParser.has_fid:
                    indid = raw_data[0]

                # Ignore any subjects that are to be excluded and remove those
                # that have too much missingness
                if DataParser.valid_indid(indid):
                    missing = numpy.sum(
                        alleles[:, 0] == DataParser.missing_representation)

                    if missing > max_missing_for_individual:
                        individual_mask += [1, 1]
                        self.individual_mask.append(1)
                        dropped_individuals.append(indid)
                    else:
                        sex = None
                        phenotype = None
                        if DataParser.has_pheno:
                            phenotype = float(raw_data[pheno_col])
                        if DataParser.has_sex:
                            sex = int(raw_data[sex_col])
                        if pheno_covar is not None:
                            pheno_covar.add_subject(indid, sex, phenotype)
                        individual_mask += [0, 0]
                        self.individual_mask.append(0)
                        allelic_data[valid_allele_count] = alleles
                        valid_allele_count += 1

                else:
                    individual_mask += [1, 1]
                    self.individual_mask.append(1)
        self.ind_count = valid_allele_count
        allelic_data = allelic_data[0:valid_allele_count]
        self.genotypes = numpy.empty((snp_count, valid_allele_count))
        max_missing_individuals = DataParser.snp_miss_tol * ind_count
        dropped_loci = []
        valid_snps = 0
        valid_markers = []
        valid_rsids = []
        valid_maf = []
        valid_allele_list = []
        allele_count2s = []

        for i in xrange(0, snp_count):
            snp_geno = allelic_data[:, i]
            alleles = list(
                set(numpy.unique(snp_geno)) -
                set([DataParser.missing_representation]))

            if len(alleles) > 2:
                raise TooManyAlleles(chr=self.markers[i][0],
                                     rsid=self.rsids[i],
                                     alleles=alleles)

            allele_count1 = numpy.sum(snp_geno == alleles[0])
            allele_count2 = 0
            maf = 0

            if len(alleles) > 1:
                allele_count2 = numpy.sum(snp_geno == alleles[1])
                real_allele_count2 = allele_count2

                if allele_count2 > allele_count1:
                    sorted_alleles = [alleles[1], alleles[0]]
                    alleles = sorted_alleles
                    allele_count = allele_count1
                    allele_count1 = allele_count2
                    allele_count2 = allele_count
                maf = allele_count2 / float(allele_count1 + allele_count2)
                allele_count2s.append(allele_count2)
                major_allele = alleles[0]
                minor_allele = alleles[1]

                genotype_data = numpy.sum(snp_geno == alleles[1], axis=1)
                genotype_data[
                    snp_geno[:, 0] == DataParser.missing_representation] = \
                    DataParser.missing_storage
            else:
                major_allele = alleles[0]
                minor_allele = '?'
                # Monomorphic locus: define genotype_data so the missingness
                # check below never reads a stale or undefined array (the
                # maf == 0 test drops this SNP regardless)
                genotype_data = numpy.zeros(snp_geno.shape[0], dtype=int)

            missing = numpy.sum(genotype_data == DataParser.missing_storage)
            if maf == 0 or maf < DataParser.min_maf or \
                    maf > DataParser.max_maf or \
                    missing > max_missing_individuals:
                locus_details = self.markers[i]
                DataParser.boundary.dropped_snps[locus_details[0]].add(
                    locus_details[1])
                dropped_loci.append("%s:%s" %
                                    (locus_details[0], locus_details[1]))
                self.invalid_loci.append(i)
            else:
                self.genotypes[valid_snps, :] = genotype_data
                valid_snps += 1
                valid_markers.append(list(self.markers[i]))
                valid_rsids.append(self.rsids[i])
                valid_allele_list.append([major_allele, minor_allele])
                valid_maf.append(maf)

        self.markers = valid_markers
        self.alleles = valid_allele_list
        self.rsids = valid_rsids
        self.locus_count = valid_snps
        self.genotypes = self.genotypes[0:self.locus_count, :]
        self.allele_count2s = allele_count2s
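The heart of Example #5 is the per-locus arithmetic: allele pairs are collapsed to counts of the minor allele (0, 1, or 2), missing calls get a sentinel value, and the minor allele frequency is the minor-allele share of all called alleles. The sketch below reduces that to a single invented locus; the '0' missing marker and the -1 sentinel are placeholders for DataParser.missing_representation and DataParser.missing_storage.

# One invented SNP across four subjects; the last subject has a missing call
import numpy

snp_geno = numpy.array([["A", "A"],
                        ["A", "A"],
                        ["A", "C"],
                        ["0", "0"]])
missing_representation = "0"          # placeholder for the DataParser setting
missing_storage = -1                  # placeholder for the DataParser setting

# Alleles present at the locus, major allele first (ordered by count)
alleles = sorted(set(numpy.unique(snp_geno)) - set([missing_representation]),
                 key=lambda a: -numpy.sum(snp_geno == a))
allele_count1 = numpy.sum(snp_geno == alleles[0])     # major allele copies
allele_count2 = numpy.sum(snp_geno == alleles[1])     # minor allele copies
maf = allele_count2 / float(allele_count1 + allele_count2)

# Genotype = number of minor-allele copies; missing calls get the sentinel
genotype_data = numpy.sum(snp_geno == alleles[1], axis=1)
genotype_data[snp_geno[:, 0] == missing_representation] = missing_storage

print(maf)              # 0.1666...  (1 'C' out of 6 called alleles)
print(genotype_data)    # [ 0  0  1 -1]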