def __init__(self, name, config):
    self.config = config
    self.name = name
    self.max_len = int(config[self.name + '_max_len'])
    self.dim_emb = int(config[self.name + '_emb_dim'])
    self.fuelfile = config['dsdir'] + '_' + name + '.h5py'
    self.nn_model = config[self.name + '_nn']
    self.emb_file = self.config['dsdir'] + '_' + self.name + '_embeddings.h5py'
    self.tune_tune = str_to_bool(self.config[self.name + '_tune_embedding'])
    self.load_emb = str_to_bool(self.config[self.name + '_load_embedding'])
    if not self.load_emb:
        # Randomly initialized embeddings only make sense if they are tuned.
        assert self.tune_tune
    # Recurrent encoders (lstm/rnn/gru) are treated as sequence models downstream.
    self.rec = ('lstm' in self.nn_model or 'rnn' in self.nn_model
                or 'gru' in self.nn_model)
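# str_to_bool is the repo's config-parsing helper (it lives in cmn and is also
# imported directly); its definition is not shown in this section. A minimal
# sketch of its assumed behavior (the exact accepted spellings are an
# assumption, not the repo's code):
def str_to_bool(s):
    """Parse config strings such as 'True', 'false', or '1' into a bool."""
    return s.strip().lower() in ('true', '1', 'yes')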
def main(args):
    print 'loading config file', args[1]
    config = cmn.loadConfig(args[1])
    dsdir = config['dsdir']
    # First generate the name datasets, based on the number of names for each set.
    if not os.path.exists(os.path.join(dsdir, 'train.txt')):
        generate_name_dataset(config)
    trainfile = dsdir + '/train.txt'
    devfile = dsdir + '/dev.txt'
    testfile = dsdir + '/test.txt'
    targetTypesFile = config['typefile']
    vectorFile = config['ent_vectors']
    vectorFile_words = config['word_vectors'] if 'word_vectors' in config else vectorFile
    subword_vectorFile = config['fasttext_vecfile'] if 'fasttext_vecfile' in config else None
    ent2tfidf_features_path = config['ent2tfidf_features_path'] if 'ent2tfidf_features_path' in config else None
    # the_features = config['features'].split(' ')  # i.e., letters entvec words tc
    ngrams = [int(n) for n in config['ngrams_n'].split()] if 'ngrams_n' in config else []
    ngrams_vecfiles = {ngram: config['ngrams' + str(ngram) + '_vecfile'] for ngram in ngrams}
    letter_vecfile = config['letters_vecfile'] if 'letters_vecfile' in config else None
    hs_ngram_path = config['hsngrampath'] if 'hsngrampath' in config else None
    hs_ngram_versions = config['hsngram_vecs'].split() if hs_ngram_path else None
    use_lowercase = str_to_bool(config['use_lowercase']) if 'use_lowercase' in config else False
    print "uselower: ", use_lowercase
    upto = -1
    (t2idx, _) = cmn.loadtypes(targetTypesFile)
    trnMentions = load_ent_ds(trainfile)
    devMentions = load_ent_ds(devfile)
    tstMentions = load_ent_ds(testfile)
    logger.info("#train : %d #dev : %d #test : %d",
                len(trnMentions), len(devMentions), len(tstMentions))
    # Build every feature dataset only once; if the targets file already
    # exists, only (re)build the description features.
    if not os.path.exists(os.path.join(dsdir, '_targets.h5py')):
        build_targets_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir)
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                        vectorFile, upto=-1)
        build_letters_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                         letter_vecfile, max_len_name=40)
        build_typecosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                            vectorFile, upto=-1)
        if hs_ngram_path:
            build_hsNgram_ds(config, trnMentions, devMentions, tstMentions, t2idx,
                             dsdir, hs_ngram_path, hs_ngram_versions,
                             vectorsize=300, upto=-1)
        for ng in ngrams:
            build_ngram_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                           ngrams_vecfiles[ng], ng, upto=-1)
        build_type_patterns(trnMentions, t2idx, dsdir, vectorFile)
        save_typevecmatrix(t2idx, dsdir, vectorFile)
        # build_type_words_cosine_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir, vectorFile, upto=-1)
        build_subwords_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                          subword_vectorFile, use_lowercase=use_lowercase, upto=-1)
        build_words_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                       vectorFile_words, use_lowercase=use_lowercase, upto=-1)
        build_entvec_ds(trnMentions, devMentions, tstMentions, t2idx, dsdir,
                        vectorFile, upto=-1)
        build_desc_features_ds(trnMentions, devMentions, tstMentions,
                               ent2tfidf_features_path, t2idx, dsdir,
                               vectorFile_words, use_lowercase=True, upto=-1)
    else:
        build_desc_features_ds(trnMentions, devMentions, tstMentions,
                               ent2tfidf_features_path, t2idx, dsdir,
                               vectorFile_words, use_lowercase=True, upto=-1)
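# The entry point is assumed to follow the usual pattern below (an assumption;
# it is not shown in this section). main() reads a flat key/value config via
# cmn.loadConfig; the keys it accesses are dsdir, typefile, ent_vectors, and
# optionally word_vectors, fasttext_vecfile, ent2tfidf_features_path, ngrams_n,
# ngrams<N>_vecfile, letters_vecfile, hsngrampath, hsngram_vecs, use_lowercase.
if __name__ == '__main__':
    import sys
    main(sys.argv)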
def build_model_new(fea2obj, num_targets, config, kl_weight, entropy_weight,
                    deterministic=False, test=False):
    hidden_size = config['hidden_units'].split()
    use_highway = str_to_bool(config['use_highway']) if 'use_highway' in config else False
    use_gaus = str_to_bool(config['use_gaus']) if 'use_gaus' in config else False
    use_rec = str_to_bool(config['use_rec']) if 'use_rec' in config else True
    n_latent_z = int(config['n_latent']) if 'use_gaus' in config else 0  # latent size; only used when use_gaus is set
    use_noise = str_to_bool(config['use_noise']) if 'use_noise' in config else False
    use_vae = str_to_bool(config['use_vae']) if 'use_vae' in config else False
    hu_decoder = int(config['hu_decoder']) if 'hu_decoder' in config else hidden_size
    logger.info('use_gaus: %s, use_rec: %s, use_noise: %s, use_vae: %s, hidden_size: %s, '
                'n_latent_z: %d, hu_decoder: %s, hu_encoder: %s',
                use_gaus, use_rec, use_noise, use_vae, hidden_size,
                n_latent_z, hu_decoder, hidden_size)
    init_with_type = str_to_bool(config['init_with_type']) if 'init_with_type' in config else False
    y = T.matrix('targets', dtype='int32')
    drop_prob = float(config['dropout']) if 'dropout' in config else 0

    # Build the feature vector with one model, e.g., with cnn or mean or lstm.
    feature_vec, feature_vec_len = build_feature_vec(fea2obj, config)

    # Dropout (non-inverted): mask activations at train time, scale them by the
    # keep probability at test time.
    if drop_prob > 0:
        mask = T.cast(srng.binomial(n=1, p=1 - drop_prob, size=feature_vec.shape), 'float32')
        if test:
            feature_vec *= (1 - drop_prob)
        else:
            feature_vec *= mask

    # Highway network: z = t * g(x) + (1 - t) * x with a Logistic transform gate t.
    if use_highway:
        g_mlp = MLP(activations=[Rectifier()],
                    dims=[feature_vec_len, feature_vec_len], name='g_mlp')
        t_mlp = MLP(activations=[Logistic()],
                    dims=[feature_vec_len, feature_vec_len], name='t_mlp')
        initialize([g_mlp, t_mlp])
        t = t_mlp.apply(feature_vec)
        z = t * g_mlp.apply(feature_vec) + (1. - t) * feature_vec
        feature_vec = z

    # MLP(s)
    logger.info('feature vec length = %s and hidden layer units = %s',
                feature_vec_len, ' '.join(hidden_size))
    if len(hidden_size) > 1:
        # Two MLP layers on the feature vector.
        mlp = MLP(activations=[Rectifier(), Rectifier()],
                  dims=[feature_vec_len, int(hidden_size[0]), int(hidden_size[1])],
                  name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = int(hidden_size[1])
    else:
        hidden_size = int(hidden_size[0])
        mlp = MLP(activations=[Rectifier()],
                  dims=[feature_vec_len, hidden_size], name='joint_mlp')
        initialize([mlp])
        before_out = mlp.apply(feature_vec)
        last_hidden_size = hidden_size

    # Compute the initial guess for y_hat.
    hidden_to_output = Linear(name='hidden_to_output',
                              input_dim=last_hidden_size, output_dim=num_targets)
    typemfile = None
    if init_with_type:
        typemfile = config['dsdir'] + '/_typematrix.npy'
        # typemfile = config['dsdir'] + '/_typeCooccurrMatrix.npy'
    initialize_lasthid(hidden_to_output, typemfile)
    # initialize([hidden_to_output])

    y_hat_init = Logistic().apply(hidden_to_output.apply(before_out))
    y_hat_init.name = 'y_hat_init'
    y_hat_init = debug_print(y_hat_init, 'yhat_init', False)
    logpy_xz_init = cross_entropy_loss(y_hat_init, y)
    logpy_xz = logpy_xz_init
    y_hat_recog = y_hat_init
    y_hat = y_hat_init
    KLD = 0

    if use_gaus:
        if use_vae:
            logger.info('using VAE')
            vae_conditional = str_to_bool(config['vae_cond'])
            if vae_conditional:
                y_hat, logpy_xz, KLD, y_hat_recog = build_vae_conditoinal(
                    kl_weight, entropy_weight, y_hat_init, feature_vec,
                    feature_vec_len, config, y, test=test,
                    deterministic=deterministic, num_targets=num_targets,
                    n_latent_z=n_latent_z, hidden_size=hidden_size,
                    hu_decoder=hu_decoder)
            else:
                y_hat, logpy_xz, KLD = build_vae_basic(
                    kl_weight, feature_vec, feature_vec_len, config, y,
                    test=test, deterministic=deterministic,
                    num_targets=num_targets, n_latent_z=n_latent_z,
                    hidden_size=hidden_size, hu_decoder=hu_decoder)
                y_hat_recog = y_hat
        else:
            if use_rec:
                logger.info('Not using VAE... but using recursion')
                # Condition the prior on the initial prediction and average the
                # refined prediction with the initial one.
                prior_in = T.concatenate([feature_vec, y_hat_init], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in, n_input=feature_vec_len + num_targets,
                    hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior,
                                  deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z + feature_vec_len,
                                             hu_decoder=hu_decoder,
                                             n_out=num_targets, y=y)
                y_hat = (y_hat + y_hat_init) / 2.
                logpy_xz = (logpy_xz + logpy_xz_init) / 2.
            else:
                prior_in = T.concatenate([feature_vec], axis=1)
                mu_prior, log_sigma_prior = prior_network(
                    x=prior_in, n_input=feature_vec_len,
                    hu_encoder=hidden_size, n_latent=n_latent_z)
                z_prior = sampler(mu_prior, log_sigma_prior,
                                  deterministic=deterministic, use_noise=use_noise)
                zl = [T.concatenate([z_prior, feature_vec], axis=1)]
                y_hat, logpy_xz = generation(zl, n_latent=n_latent_z + feature_vec_len,
                                             hu_decoder=hu_decoder,
                                             n_out=num_targets, y=y)
            y_hat_recog = y_hat

    y_hat = debug_print(y_hat, 'y_hat', False)
    # p@1: fraction of examples whose top-scoring type is among the gold types.
    pat1 = T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)])
    max_type = debug_print(T.argmax(y_hat_recog, axis=1), 'max_type', False)
    pat1_recog = T.mean(y[T.arange(y.shape[0]), max_type])
    mean_cross = T.mean(logpy_xz)
    mean_kld = T.mean(KLD)
    cost = mean_kld + mean_cross
    cost.name = 'cost'
    mean_kld.name = 'kld'
    mean_cross.name = 'cross_entropy_loss'
    pat1.name = 'p@1'
    pat1_recog.name = 'p@1_recog'
    misclassify_rate = MultiMisclassificationRate().apply(y, T.ge(y_hat, 0.5))
    misclassify_rate.name = 'error_rate'
    return cost, pat1, y_hat, mean_kld, mean_cross, pat1_recog, misclassify_rate
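# prior_network, sampler, and generation are defined elsewhere in the repo. A
# minimal sketch of what sampler is assumed to do (the standard VAE
# reparameterization trick, z = mu + sigma * eps) using the module's srng;
# the name sampler_sketch and the exact noise handling are assumptions:
def sampler_sketch(mu, log_sigma, deterministic=False, use_noise=True):
    if deterministic or not use_noise:
        return mu  # use the mean, e.g., at test time
    eps = T.cast(srng.normal(size=mu.shape), 'float32')
    return mu + T.exp(0.5 * log_sigma) * eps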
# (tail of the input-matrix building function; its beginning is not shown)
    (in_tst_matrix, _, _) = make_input_matrix(etest2rawFea, etest2types, fea2id,
                                              'test', npdir=npdir, t2idx=t2idx)
    (in_dev_matrix, _, _) = make_input_matrix(edev2rawFea, edev2types, fea2id,
                                              'dev', npdir=npdir, t2idx=t2idx)
    return in_trn_matrix, in_tst_matrix, in_dev_matrix


print 'loading config file', sys.argv[1]
config = cmn.loadConfig(sys.argv[1])
hdf5_file = config['fuelfile']
trainfile = config['Etrain']
devfile = config['Edev']
testfile = config['Etest']
targetTypesFile = config['typefile']
max_len_name = int(config['max_name_length'])
vectorFile = config['ent_vectors']
if 'typecosine' in config:
    usetypecosine = cmn.str_to_bool(config['typecosine'])
brownMappingFile = config['brownclusters']
maxngram = int(config['maxngram'])
featuresToUse = config['features'].split(' ')
npdir = config['npdir']
if not os.path.exists(npdir):
    os.makedirs(npdir)
upto = -1
(t2idx, idx2t) = cmn.loadtypes(targetTypesFile)
numtargets = len(t2idx)

# Load entity-to-type datasets.
(etrain2types, etrain2names, _) = cmn.load_entname_ds(trainfile, t2idx)
print "number of train examples:" + str(len(etrain2names))
(etest2types, etest2names, _) = cmn.load_entname_ds(testfile, t2idx)
print "number of test examples:" + str(len(etest2names))
(edev2types, edev2names, _) = cmn.load_entname_ds(devfile, t2idx)
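# The target side of these datasets is assumed to be a multi-hot type matrix
# built from the entity-to-types maps and t2idx, matching the int32 'targets'
# matrix that build_model_new consumes. A hypothetical sketch (to_target_matrix
# is illustrative, not a repo function):
import numpy as np

def to_target_matrix(ent2types, ents, t2idx):
    y = np.zeros((len(ents), len(t2idx)), dtype='int32')
    for i, ent in enumerate(ents):
        for t in ent2types[ent]:
            y[i, t2idx[t]] = 1  # mark every gold type of this entity
    return y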
parser = get_argument_parser()
args = parser.parse_args()
config = loadConfig(args.config)
brownMappingFile = config['brownclusters']
trainfile = config['Etrain']
devfile = config['Edev']
testfile = config['Etest']
batch_size = int(config['batchsize'])
targetTypesFile = config['typefile']
learning_rate = float(config['lrate'])
networkfile = config['net']
num_of_hidden_units = int(config['hidden_units'])
n_epochs = int(config['nepochs'])
maxngram = int(config['maxngram'])
MLP = str_to_bool(config['mlp'])  # boolean flag from the config
featuresToUse = config['features'].split(' ')
npdir = config['npdir']
if not os.path.exists(npdir):
    os.makedirs(npdir)
(t2idx, idx2t) = loadtypes(targetTypesFile)
numtype = len(t2idx)
(etrain2types, etrain2names, _) = load_entname_ds(trainfile, t2idx, use_ix=True)
print "number of train examples:" + str(len(etrain2names))
(etest2types, etest2names, _) = load_entname_ds(testfile, t2idx, use_ix=True)
print "number of test examples:" + str(len(etest2names))
(edev2types, edev2names, _) = load_entname_ds(devfile, t2idx, use_ix=True)
print "number of dev examples:" + str(len(edev2names))
if args.loaddata:
    (in_trn_matrix, target_trn_matrix, trnents) = load_input_matrix('train')
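# build_model_new reports p@1 as
#   T.mean(y[T.arange(y.shape[0]), T.argmax(y_hat, axis=1)]),
# i.e., the fraction of examples whose top-scoring type is among the gold
# types. The same metric in plain NumPy (illustrative, not repo code):
import numpy as np

def precision_at_1(y, y_hat):
    # y: (n, num_types) multi-hot gold labels; y_hat: (n, num_types) scores
    return np.mean(y[np.arange(y.shape[0]), np.argmax(y_hat, axis=1)])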