def test_double_iter(self):
    out = Major_reader(self.sample_major, self.pheno_file)
    reader = out.read('V1', 100)
    geno, pheno = next(reader)
    n_geno = geno.shape[0]
    n_pheno = len(pheno)
    self.assertEqual(n_geno, n_pheno)
def __init__(self, plink_file: str, pheno_file, pheno_name: str,
             batch_size: int, ldblock_file: str = None, shuffle=True):
    """
    Primary data generator for keras

    :param plink_file: path of a plink file in sample major format
    :param pheno_file: path of the pheno file
    :param pheno_name: name of the phenotype
    :param batch_size: size of the mini batches
    :param ldblock_file: path of the ld block file (bed) (optional)
    :param shuffle: bool if the data should be shuffled
    """
    Major_reader.__init__(self, plink_file, pheno_file, ldblock_file)
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.pheno_name = pheno_name
    self.indexes = np.arange(0, self.n, dtype=int)
    self.on_epoch_end()
    self.dims = None
    if ldblock_file is not None:
        self.block_sequence = self._generate_ld_split_sequence()
        # check dims
        tx, ty = self.__getitem__(0)
        self.dims = [k.shape[1] for k in tx]
    else:
        self.block_sequence = None
        self.dims = self.p
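# Usage sketch (assumption, not part of the original code): the enclosing class
# is a Keras Sequence-style generator, referred to as DataGenerator here purely
# for illustration; the file paths and phenotype name are placeholders.
gen = DataGenerator(plink_file='train_sample_major',
                    pheno_file='pheno.txt',
                    pheno_name='V1',
                    batch_size=64,
                    ldblock_file=None,
                    shuffle=True)
x_batch, y_batch = gen[0]  # one mini-batch via __getitem__, as Keras would request it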
def test_pheno_reader(self):
    pheno = pd.read_table(self.pheno_file)
    batch_size = 100
    out = Major_reader(self.sample_major, self.pheno_file)
    reader = out._iter_pheno('V1', batch_size)
    batch = next(reader)
    compare = batch == pheno.V1.values[:batch_size]
    lg.debug(compare[0:10])
    self.assertEqual(batch_size, np.sum(compare))
def test_one_iter_geno(self):
    out = Major_reader(self.sample_major, self.pheno_file)
    iterat = out._one_iter_geno()
    mat = np.load(self.sample_major_numpy)
    compare = list()
    maxiter = 20
    for i, geno in enumerate(iterat):
        compare.append(np.mean(mat[0][i] == geno.flatten()))
        if i >= maxiter:
            break
    self.assertEqual(np.sum(compare), maxiter + 1)
def test_one_iter_pheno(self):
    out = Major_reader(self.sample_major, self.pheno_file)
    iterat = out._one_iter_pheno('V1')
    pheno = pd.read_table(self.pheno_file)
    maxiter = 20
    compare = list()
    for i, ph in enumerate(iterat):
        compare.append(np.mean(pheno.V1[i] == ph))
        if i >= maxiter:
            break
    self.assertEqual(np.sum(compare), maxiter + 1)
def test_binary_genotype(self):
    bits = '00011011'
    expected_genotypes = [0, 1, 9, 2]
    a = bitarray(bits)
    input_bytes = a.tobytes()
    lg.debug('Used bytes: %s', input_bytes)
    out = Major_reader(self.sample_major, self.pheno_file)
    genotypes = out._bgeno(input_bytes)
    lg.debug('Outputted genotypes: %s', genotypes)
    comparison = [genotypes[i] == expected_genotypes[i] for i in range(4)]
    lg.debug('Comparison result: %s', comparison)
    comparison = sum(comparison)
    self.assertEqual(comparison, 4)
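# Minimal decoding sketch (assumption, not the actual _bgeno implementation):
# each byte packs four genotypes as 2-bit pairs. Reading the pairs
# most-significant-bit first with the mapping 00 -> 0, 01 -> 1, 10 -> 9
# (missing), 11 -> 2 reproduces the expected values of this test
# (b'\x1b' -> [0, 1, 9, 2]); the real reader may order or map the pairs
# differently (PLINK .bed files, for instance, store pairs least-significant
# bit first).
def decode_byte_sketch(byte_value: int) -> list:
    mapping = {0b00: 0, 0b01: 1, 0b10: 9, 0b11: 2}
    return [mapping[(byte_value >> shift) & 0b11] for shift in (6, 4, 2, 0)]

assert decode_byte_sketch(0x1B) == [0, 1, 9, 2]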
def test_geno_read(self):
    gold_data = np.load(self.sample_major_numpy)
    lg.debug('index gold: %s', gold_data[1][0:10])
    gold_data = gold_data[0]
    n_gold, p_gold = gold_data.shape
    lg.debug('Number of samples: %s Number of SNPs %s in gold', n_gold, p_gold)
    out = Major_reader(self.sample_major, self.pheno_file)
    reader = out._iter_geno(n_gold)
    genotype_matrix = next(reader)
    n, p = genotype_matrix.shape
    self.assertEqual(n_gold, n)
    self.assertEqual(p_gold, p)
    lg.debug('Gold: %s', gold_data[0, 0:10])
    lg.debug('Sample-Major: %s', genotype_matrix[0, 0:10])
    sub_i = genotype_matrix[0] == gold_data[0]
    self.assertEqual(np.sum(sub_i), p)
class Predict(object):

    def __init__(self, train_path: str, dev_path: str, pheno: str,
                 batch_size: int, pheno_dev: str = None):
        super(Predict, self).__init__()
        self._plink_train_path = train_path
        self._plink_dev_path = dev_path
        self.train = Major_reader(train_path, pheno)
        assert (self.train.n / batch_size).is_integer()
        if pheno_dev is None:
            self.dev = Major_reader(dev_path, pheno)
        else:
            self.dev = Major_reader(dev_path, pheno_dev)
        assert (self.dev.n / batch_size).is_integer()
        self.batch_size = batch_size
        self.results = None
        self.num_dev_iter = int(self.dev.n / batch_size)
        lg.info('Using %s for training and %s for devop. '
                'Mini-batch size for both is set to %s',
                self.train.n, self.dev.n, batch_size)

    def fit(self, pheno: str, penal: str, lamb: float, l_rate: float,
            epochs: int = 201, logging_freq: int = 100, type: str = 'c'):
        assert pheno in self.train.pheno_names
        assert pheno in self.dev.pheno_names
        train_reader = self.train.read(pheno, self.batch_size)
        dev_reader = self.dev.read(pheno, self.batch_size)
        lg.debug('Finished setting up the iterators')
        model = pytorch_linear(train_reader, dev_reader, self.train.p,
                               self.train.n, self.num_dev_iter,
                               self.batch_size, type)
        lg.debug('Set up linear model')
        self.results = model.run(penal, lamb, epochs, l_rate, logging_freq)
        lg.debug('Model finished')
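# Usage sketch (assumption, not from the original source): fitting a penalized
# linear model on a training/dev split. The paths, penalty name, and
# hyper-parameter values below are placeholders; note that batch_size must
# divide the sample size of both datasets (see the asserts in __init__).
predictor = Predict(train_path='train_sample_major',
                    dev_path='dev_sample_major',
                    pheno='pheno.txt',
                    batch_size=100)
predictor.fit(pheno='V1', penal='l1', lamb=0.1, l_rate=0.001,
              epochs=201, logging_freq=100, type='c')
print(predictor.results)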
def test_binary_genotype_overflow(self):
    expected_genotypes = [0, 1, 0, 2, 2, 2]
    a = bitarray('00011011' '11110000', endian='big')
    # ceiling division: number of bytes needed, four 2-bit genotypes per byte
    size = -(-len(expected_genotypes) // 4)
    over_flow = size * 4 - len(expected_genotypes)
    to_remove = [len(expected_genotypes) + k for k in range(over_flow)]
    input_bytes = a.tobytes()
    lg.debug('Used bytes: %s', input_bytes)
    out = Major_reader(self.sample_major, self.pheno_file)
    out._to_remove = to_remove
    lg.debug('Removing the following: %s', to_remove)
    genotypes = out._binary_genotype(input_bytes)
    self.assertEqual(len(genotypes), len(expected_genotypes))
    lg.debug('Outputted genotypes: %s', genotypes)
    comparison = [genotypes[i] == expected_genotypes[i] for i in range(6)]
    lg.debug('Comparison result: %s', comparison)
    comparison = sum(comparison)
    self.assertEqual(comparison, 6)
def test_one_iter(self):
    out = Major_reader(self.sample_major, self.pheno_file)
    iterat = out.one_iter('V1')
    mat = np.load(self.sample_major_numpy)
    pheno = pd.read_table(self.pheno_file)
    maxiter = 20
    compare = list()
    for i, value in enumerate(iterat):
        geno, ph = value
        compare.append(np.mean(pheno.V1[i] == ph))
        compare.append(np.mean(mat[0][i] == geno.flatten()))
        if i >= maxiter:
            break
    self.assertEqual(np.sum(compare), (maxiter + 1) * 2)
    # test with missingness
    r = np.random.choice(range(maxiter), 1)
    lg.debug('Replacing position %s with nan', r[0])
    pheno.V1.iloc[r] = np.nan
    path_to_missing_file = '.pheno_with_missing.csv'
    pheno.to_csv(path_to_missing_file, index=False, sep='\t')
    out = Major_reader(self.sample_major, path_to_missing_file)
    iterat = out.one_iter('V1')
    compare = list()
    for i, value in enumerate(iterat):
        geno, ph = value
        if np.isnan(ph):
            lg.debug('ph %s', ph)
        compare.append(np.mean(1 == np.mean(mat[0][i] == geno.flatten())))
        if i >= maxiter:
            break
    expected = r[0]
    self.assertEqual(expected, np.sum(compare))
def test_continuous_geno_read(self):
    fam = pd.read_table(self.pheno_file)
    batch_size = 100
    nn = fam.shape[0]
    out = Major_reader(self.sample_major, self.pheno_file)
    reader = out._iter_geno(100)
    first = next(reader)
    to_end = nn // batch_size - 1
    overlap = nn - nn // batch_size
    lg.debug('estimated overlap is %s', overlap)
    lg.debug('steps to end: %s', to_end)
    for i in range(to_end):
        lg.debug(i)
        batch = next(reader)
    batch = next(reader)
    lg.debug('shape of first is %s', first.shape)
    lg.debug('shape of last is %s', batch.shape)
    expected_overlap = batch_size * (nn // batch_size + 1) - nn
    compare = batch[4] == first[0]
    lg.debug(compare)
    self.assertEqual(out.p, np.sum(compare))
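# Worked example (hypothetical numbers, not taken from the test data): with
# nn = 504 samples and batch_size = 100, reading to the end and once more
# takes nn // batch_size + 1 = 6 batches, so the final batch wraps around and
# re-reads expected_overlap = 100 * 6 - 504 = 96 samples from the start of the
# file; the first re-read sample then sits at index 100 - 96 = 4 of that batch.
# The hard-coded batch[4] == first[0] comparison above suggests the real fam
# file satisfies nn % batch_size == 4, though that is an inference.
nn, batch_size = 504, 100
expected_overlap = batch_size * (nn // batch_size + 1) - nn
assert expected_overlap == 96
assert batch_size - expected_overlap == 4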
def test_shuffle(self):
    out = Major_reader(self.sample_major, self.pheno_file)
    mat = np.load(self.sample_major_numpy)[0]
    n, p = mat.shape
    pheno = pd.read_table(self.pheno_file)
    pheno = pheno.V1.values
    maxiter = 20
    compare = list()
    ids = np.arange(0, n, dtype=int)
    np.random.shuffle(ids)
    geno_iter = out._one_iter_geno(ids)
    pheno_iter = out._one_iter_pheno('V1', ids)
    for i, g, ph in zip(ids, geno_iter, pheno_iter):
        geno_comparison = np.equal(g, mat[i, :])
        pheno_comparison = np.equal(ph, pheno[i])
        lg.debug('Index: %s: Geno: %s Pheno: %s', i,
                 geno_comparison.all(), pheno_comparison.all())
        if geno_comparison.all() and pheno_comparison.all():
            compare.append(True)
        else:
            compare.append(False)
    self.assertEqual(np.sum(compare), len(ids))
def test_check_magic_number(self):
    with self.assertRaises(ValueError):
        out = Major_reader(self.plink_file, self.pheno_file)
    out = Major_reader(self.sample_major, self.pheno_file)
    self.assertTrue(out._is_sample_major)