Beispiel #1
0
def extend_samples(samp_hdf_fname, tdict_pkl_fname, reverse_tdict_pkl_fname,
                   ext_hdf_fname, max_samp=None):
    """
    Create an extended copy of an HDF5 samples file.

    The "vocab" dataset of the original file is copied unchanged. Every
    matrix stored under "samples/<lemma>/<pos>" is read as a sparse matrix,
    multiplied by the extension matrix returned by make_extension_matrix,
    and the product is written under the same path in the new file.

    Both files are closed even when an error occurs while processing.

    Parameters:
        samp_hdf_fname: name of the existing samples file (opened read-only).
        tdict_pkl_fname, reverse_tdict_pkl_fname: passed through to
            make_extension_matrix (presumably pickled translation
            dictionaries -- confirm against that function).
        ext_hdf_fname: name of the output file; it is opened in "w" mode,
            so an existing file of that name is overwritten.
        max_samp: stop after this many lemma/pos matrices have been
            written. None (the default) means no limit. The counter is
            compared for equality after each write, so a value below 1
            never matches and also results in no limit.
    """
    log.info("opening original samples file " + samp_hdf_fname)
    samp_hdfile = h5py.File(samp_hdf_fname, "r")

    try:
        ext_mat = make_extension_matrix(samp_hdfile, tdict_pkl_fname,
                                        reverse_tdict_pkl_fname)

        log.info("creating extended samples file " + ext_hdf_fname)
        ext_hdfile = h5py.File(ext_hdf_fname, "w")

        try:
            # Created up front so the group exists even if there are no
            # samples to copy.
            ext_samples = ext_hdfile.create_group("samples")

            log.info("copying vocabulary ({0} terms)".format(
                len(samp_hdfile["vocab"])))
            ext_hdfile.create_dataset("vocab", data=samp_hdfile["vocab"])
            i = 0

            for lemma, lemma_group in samp_hdfile["samples"].iteritems():
                for pos, pos_group in lemma_group.iteritems():
                    log.info(u"{0}: creating extended samples for {1}/{2}".format(i, lemma,pos))
                    samp_mat = coo_matrix_from_hdf5(pos_group).tocsr()
                    mat = (samp_mat * ext_mat).tocoo()
                    # Intermediate "<lemma>" groups are created on demand.
                    group = ext_samples.create_group(
                        u"{0}/{1}".format(lemma, pos))
                    coo_matrix_to_hdf5(mat, group, data_dtype="i1",
                                       compression="gzip")

                    i += 1
                    if i == max_samp:
                        log.info("reached maximum number of samples")
                        break
                # The inner break only leaves the pos loop; leave the lemma
                # loop as well once the limit has been reached.
                if i == max_samp:
                    break
        finally:
            log.info("closing " + ext_hdf_fname)
            ext_hdfile.close()
    finally:
        log.info("closing " + samp_hdf_fname)
        samp_hdfile.close()
Beispiel #2
0
def make_new_samples(sample_hdfile, filtered_hdfile, columns_selector):
    """
    Copy all sample matrices into filtered_hdfile, reduced in both axes.

    For every "samples/<lemma>/<pos>" group in sample_hdfile the stored
    sparse matrix is restricted to the columns picked by columns_selector
    (the columns of the filtered vocabulary), rows left without any entry
    are dropped, and the result is written to the same path under a newly
    created "samples" group in filtered_hdfile.
    """
    source_root = sample_hdfile["samples"]
    target_root = filtered_hdfile.create_group("samples")

    for lemma, lemma_group in source_root.iteritems():
        for pos, pos_group in lemma_group.iteritems():
            lempos = u"/".join((lemma, pos))
            log.info("adding filtered samples for " + lempos)

            # Column slicing is done on the CSC form: keep only the
            # selected columns and discard all others.
            column_reduced = coo_matrix_from_hdf5(pos_group).tocsc()[:, columns_selector]

            # In LIL form, `rows` holds one list of column indices per row;
            # the rows whose list is non-empty are the ones to keep.
            as_lil = column_reduced.tolil()
            kept_rows = as_lil.rows.nonzero()[0]

            # Row slicing is done on the CSR form, then converted back to
            # COO for storage.
            row_reduced = as_lil.tocsr()[kept_rows]
            result = row_reduced.tocoo()

            coo_matrix_to_hdf5(result, target_root.create_group(lempos),
                               data_dtype="=i1", compression="gzip")