Example #1
0
    def _add_datasets(self, group, j, track_times):
        """Populate *group* with one of each dataset kind (table, array,
        carray, earray, vlarray), all created with the given *track_times*
        flag and names suffixed by *j*."""
        # Build and fill a table of Record rows.
        tbl = self.h5file.create_table(group,
                                       f'table{j}',
                                       Record,
                                       title=self.title,
                                       filters=None,
                                       track_times=track_times)
        row = tbl.row
        for i in range(self.nrows):
            row['var1'] = '%04d' % (self.nrows - i)
            row['var2'] = i
            row['var3'] = i * 2
            row.append()  # commit this Record into the table buffer
        tbl.flush()  # make the rows visible to the reads below

        # Collect column values used to seed the array datasets.
        var1List = [r['var1'] for r in tbl.iterrows()]
        var3List = [r['var3'] for r in tbl.iterrows()]

        # Plain (fixed-size) array.
        self.h5file.create_array(group,
                                 f'array{j}',
                                 var1List,
                                 f"col {j}",
                                 track_times=track_times)

        # Chunked array.
        self.h5file.create_carray(group,
                                  name=f'carray{j}',
                                  obj=var3List,
                                  title=f"col {j + 2}",
                                  track_times=track_times)

        # Extendable array, filled via append().
        earr = self.h5file.create_earray(group,
                                         f'earray{j}',
                                         StringAtom(itemsize=4), (0, ),
                                         f"col {j + 4}",
                                         track_times=track_times)
        earr.append(var1List)

        # Variable-length array, filled via append().
        varr = self.h5file.create_vlarray(group,
                                          f'vlarray{j}',
                                          Int16Atom(),
                                          f"col {j + 6}",
                                          track_times=track_times)
        varr.append(var3List)
Example #2
0
    def setup(self, node, block_size, blob_name, index_name):
        """Create the blob storage earray and its lookup table under *node*,
        doing nothing if the blob array already exists."""
        if hasattr(node, blob_name):
            return

        # Extendable array of fixed-size string blocks holding raw blob data.
        self.file_.create_earray(node,
                                 blob_name,
                                 StringAtom(itemsize=block_size), (0, ),
                                 filters=filters)

        # Lookup table mapping an integer index to a (start, size) slice
        # of the blob array.
        description = {
            "index": Int64Col(pos=0),
            "start": UInt32Col(pos=1),
            "size": UInt32Col(pos=2),
        }

        # every colums which appears in a where method call should/must be indexed !
        # this is not only for performance but for correct lookup as well (I had strange bugs
        # else)
        string_index = self.file_.create_table(node,
                                               index_name,
                                               description,
                                               filters=None)
        string_index.cols.index.create_index()
 def test_from_kind_04(self):
     """Atom.from_kind('string') with itemsize/dflt matches StringAtom."""
     built = Atom.from_kind('string', itemsize=5, dflt=b'hello')
     expected = StringAtom(itemsize=5, shape=(), dflt=b'hello')
     self.assertEqual(built, expected)
     self.assertEqual(str(built), str(expected))
 def test_from_dtype_03(self):
     """A unicode dtype warns but still converts to the equivalent StringAtom."""
     expected = StringAtom(itemsize=5, shape=(), dflt=b'hello')
     with self.assertWarns(Warning):
         built = Atom.from_dtype(numpy.dtype('U5'), dflt=b'hello')
     self.assertEqual(built, expected)
     self.assertEqual(str(built), str(expected))
 def test_init_parameters_03(self):
     """copy() must reject unknown keyword arguments."""
     atom = StringAtom(itemsize=12)
     with self.assertRaises(TypeError):
         atom.copy(foobar=42)
 def test_init_parameters_02(self):
     """copy() honours overridden itemsize and shape."""
     source = StringAtom(itemsize=12)
     clone = source.copy(itemsize=100, shape=(2, 2))
     expected = StringAtom(itemsize=100, shape=(2, 2), dflt=b'')
     self.assertEqual(clone, expected)
 def test_init_parameters_01(self):
     """A bare copy() yields an equal but distinct atom object."""
     original = StringAtom(itemsize=12)
     duplicate = original.copy()
     self.assertEqual(original, duplicate)
     self.assertEqual(str(original), str(duplicate))
     self.assertFalse(original is duplicate)
Example #8
0
 def test_init_parameters_02(self):
     """copy() honours overridden itemsize and shape."""
     base = StringAtom(itemsize=12)
     copied = base.copy(itemsize=100, shape=(2, 2))
     self.assertEqual(copied,
                      StringAtom(itemsize=100, shape=(2, 2), dflt=b''))
Example #9
0
 def test_init_parameters_01(self):
     """A bare copy() yields an equal but distinct atom object."""
     base = StringAtom(itemsize=12)
     copied = base.copy()
     self.assertEqual(base, copied)
     self.assertEqual(str(base), str(copied))
     self.assertFalse(base is copied)
Example #10
0
def combine_sample_genotypes(path_x,
                             path_y,
                             output_path,
                             contig,
                             samples_x=None,
                             samples_y=None,
                             chunk_size=65536):
    """Merge the genotype calls of two HDF5 call sets into one output file.

    Sites that are not biallelic in *path_x* are dropped; after filtering,
    both inputs must carry identical POS arrays for *contig*.

    Parameters
    ----------
    path_x, path_y : str
        Input HDF5 files (scikit-allel layout: /<contig>/calldata, variants).
    output_path : str
        Destination HDF5 file; must not already exist.
    contig : str
        Chromosome/contig group name present in both inputs.
    samples_x, samples_y : list of str, optional
        Subset (and ordering) of sample names to keep from each input.
    chunk_size : int, optional
        Number of sites copied per append. Previously this was an undefined
        free name; it is now an explicit, backward-compatible parameter.

    Raises
    ------
    FileExistsError
        If *output_path* already exists.
    ValueError
        If the filtered positions of the two inputs disagree.
    """
    if isfile(output_path):
        raise FileExistsError("out path already exists")

    h5file = openFile(output_path, mode="w")

    # Open inputs read-only: this function never writes to them (the old
    # code used h5py's default mode, which can open read-write).
    fh_a = h5py.File(path_x, mode="r")
    fh_b = h5py.File(path_y, mode="r")

    try:
        # load genotypes
        ga = allel.GenotypeCArray.from_hdf5(
            fh_a[contig]["calldata"]["genotype"])
        gb = allel.GenotypeCArray.from_hdf5(
            fh_b[contig]["calldata"]["genotype"])

        # Biallelic mask is judged on the first call set only.
        alleles = ga.count_alleles()
        biallelic = np.array(alleles.max_allele() < 2)

        # load positions
        pos_a = fh_a[contig]["variants"]["POS"][:]
        pos_b = fh_b[contig]["variants"]["POS"][:]

        # filter out non-biallelic sites:
        ga = ga.compress(biallelic, axis=0)
        pos = np.compress(biallelic, pos_a, axis=0)
        ref = np.compress(biallelic, fh_a[contig]["variants"]["REF"][:],
                          axis=0)
        alt = np.compress(biallelic, fh_a[contig]["variants"]["ALT"][:],
                          axis=0)

        # Was a bare assert, which is stripped under `python -O`; raise an
        # explicit error for this input-validation check instead.
        if not np.array_equal(pos, pos_b):
            raise ValueError(
                "variant positions differ between the two inputs")

        # samples
        samplesa = fh_a[contig]["samples"][:]
        samplesb = fh_b[contig]["samples"][:]

        if samples_y:
            names = [s.decode() for s in samplesb]
            idx = [names.index(s) for s in samples_y]
            gb = gb.take(idx, axis=1)
            samplesb = samples_y

        if samples_x:
            names = [s.decode() for s in samplesa]
            idx = [names.index(s) for s in samples_x]
            ga = ga.take(idx, axis=1)
            samplesa = samples_x

        root = h5file.root

        # Create the groups
        chrom = h5file.create_group(root, contig)
        grp_calldata = h5file.create_group(chrom, "calldata")
        grp_variants = h5file.create_group(chrom, "variants")

        # create objects
        filters = Filters(complevel=1, complib='zlib')
        sample_names = np.concatenate([samplesa, samplesb]).astype("|S10")
        h5file.create_array(chrom, 'samples', sample_names)

        number_sites = ga.shape[0]

        position = h5file.create_earray(grp_variants,
                                        name='POS',
                                        atom=IntAtom(itemsize=4),
                                        expectedrows=number_sites,
                                        shape=(0, ),
                                        filters=filters)

        reference = h5file.create_earray(grp_variants,
                                         name='REF',
                                         atom=StringAtom(itemsize=1),
                                         expectedrows=number_sites,
                                         shape=(0, ),
                                         filters=filters)

        alternate = h5file.create_earray(grp_variants,
                                         name='ALT',
                                         atom=StringAtom(itemsize=1),
                                         expectedrows=number_sites,
                                         shape=(0, 3),
                                         filters=filters)

        genotypes = h5file.create_earray(grp_calldata,
                                         name='genotype',
                                         atom=IntAtom(itemsize=1),
                                         expectedrows=number_sites,
                                         shape=(0, sample_names.size, 2),
                                         filters=filters)

        # Chunk boundaries. The previous scheme (np.arange + clobbering the
        # last start with number_sites) silently wrote NOTHING whenever
        # number_sites <= chunk_size; appending the end point instead covers
        # every site, including the zero-site and single-chunk cases.
        edges = list(range(0, number_sites, chunk_size)) + [number_sites]

        for start, stop in zip(edges[:-1], edges[1:]):
            # Samples of the two call sets are concatenated along axis 1.
            gt = np.hstack([ga[start:stop], gb[start:stop]])
            genotypes.append(gt)

            position.append(pos[start:stop])
            reference.append(ref[start:stop])
            alternate.append(alt[start:stop])
    finally:
        # Close everything even on error; the old code leaked fh_a/fh_b and
        # left h5file open on any exception.
        fh_a.close()
        fh_b.close()
        h5file.close()