import argparse

import tensorflow as tf

# Helper functions (embeddings_and_vocab, char_vocab, create_classifier,
# create_trainer) are assumed to be defined elsewhere in this module.


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_neg_samples', type=int, default=10)
    parser.add_argument('--num_candidates', type=int, default=10)
    parser.add_argument('--num_cells', type=int, nargs='+', default=[128, 128])
    parser.add_argument('--num_hidden_layers', type=int, default=2)
    parser.add_argument('--num_epochs', type=int, default=300)
    parser.add_argument('--batch_size', type=int, default=10)
    parser.add_argument('--embedding_file', type=str,
                        default='eacl_data/ennl.mono.dim=50.bin')
    parser.add_argument('--bwesg_embedding_file', type=str,
                        default='eacl_data/ennl.bwesg.dim=50.window=100.bin')
    parser.add_argument('--training_data', type=str,
                        default='eacl_data/lex.filtered.train80-20.txt')
    parser.add_argument('--test_data', type=str,
                        default='eacl_data/lex.filtered.test80-20.txt')
    args = parser.parse_args()

    num_neg_samples = args.num_neg_samples
    num_candidates = args.num_candidates
    num_cells = args.num_cells
    num_hidden_layers = args.num_hidden_layers
    num_epochs = args.num_epochs
    training_data = args.training_data
    test_data = args.test_data
    batch_size = args.batch_size

    # Monolingual embeddings and vocabularies for the source and target languages.
    vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(args.embedding_file)
    # Bilingual (BWESG) embeddings over the same vocabularies.
    _, _, _, multi_Embs_S, multi_Embs_T, _ = embeddings_and_vocab(args.bwesg_embedding_file)
    # Character vocabularies for the char-level LSTM inputs.
    char_vocab_S, char_vocab_T = char_vocab(vocab_S, vocab_T)

    with tf.Session() as session:
        classifier = create_classifier(
            session, vocab_S, vocab_T, Embs_S, Embs_T, char_vocab_S,
            char_vocab_T, num_cells, num_hidden_layers, num_neg_samples,
            num_candidates)
        trainer = create_trainer(classifier, batch_size, num_epochs,
                                 training_data, test_data)
        trainer.train()
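
# A minimal sketch of what embeddings_and_vocab() might look like; the real
# helper is not part of this excerpt. It assumes the .bin files are
# word2vec-format binaries whose tokens carry a language prefix such as
# 'en_' / 'nl_' -- the prefixes, the gensim loader, and the two placeholder
# return slots are all assumptions, not confirmed properties of the repo.
import numpy as np
from gensim.models import KeyedVectors


def embeddings_and_vocab_sketch(path, src_prefix='en_', trg_prefix='nl_'):
    kv = KeyedVectors.load_word2vec_format(path, binary=True)  # gensim 3.x API
    vocab_S, rows_S, vocab_T, rows_T = [], [], [], []
    for word in kv.index2word:
        if word.startswith(src_prefix):
            vocab_S.append(word[len(src_prefix):])
            rows_S.append(kv[word])
        elif word.startswith(trg_prefix):
            vocab_T.append(word[len(trg_prefix):])
            rows_T.append(kv[word])
    # The real function returns six values; the third and sixth slots are not
    # recoverable from this excerpt, so the vector dimensionality stands in.
    dim = kv.vector_size
    return vocab_S, vocab_T, dim, np.array(rows_S), np.array(rows_T), dim
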
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_neg_samples', type=int, default=10)
    # parser.add_argument('--num_candidates', type=int, default=10)
    parser.add_argument('--num_cells', type=int, nargs='+', default=[128, 128])
    parser.add_argument('--num_hidden_layers', type=int, default=2)
    parser.add_argument('--num_epochs', type=int, default=300)
    parser.add_argument('--batch_size', type=int, default=10)
    parser.add_argument('--embedding_file', type=str,
                        default='eacl_data/ennl.mono.dim=50.bin')
    # parser.add_argument('--bwesg_embedding_file', type=str,
    #                     default='eacl_data/ennl.bwesg.dim=50.window=100.bin')
    parser.add_argument('--training_data', type=str,
                        default='eacl_data/lex.filtered.train80-20.txt')
    parser.add_argument('--test_data', type=str,
                        default='eacl_data/lex.filtered.test80-20.txt')
    parser.add_argument('--out_dir', type=str, default='logs/charLSTM_Embs')
    # parser.add_argument('--editdistance_file', type=str, default='edit_distance.npy')
    parser.add_argument('--candidates_file', type=str)
    args = parser.parse_args()

    num_neg_samples = args.num_neg_samples
    # num_candidates = args.num_candidates
    num_cells = args.num_cells
    num_hidden_layers = args.num_hidden_layers
    num_epochs = args.num_epochs
    training_data = args.training_data
    test_data = args.test_data
    batch_size = args.batch_size
    LOG_DIR = args.out_dir
    # EDIT_DISTANCE_FILE = args.editdistance_file
    candidates_file = args.candidates_file

    vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(args.embedding_file)
    # _, _, _, multi_Embs_S, multi_Embs_T, _ = embeddings_and_vocab(args.bwesg_embedding_file)
    char_vocab_S, char_vocab_T = char_vocab(vocab_S, vocab_T)

    # Allocate GPU memory on demand instead of reserving it all up front.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        classifier = create_classifier(
            session, vocab_S, vocab_T, Embs_S, Embs_T, char_vocab_S,
            char_vocab_T, num_cells, num_hidden_layers, num_neg_samples,
            candidates_file)
        trainer = create_trainer(classifier, batch_size, num_epochs,
                                 training_data, test_data, LOG_DIR)
        trainer.train()
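
# A minimal sketch of what char_vocab() could be doing; the real helper lives
# elsewhere in the repo. The assumption here is that it simply collects the
# set of characters seen in each language's word list, which is what the
# char-level LSTM inputs would need.
def char_vocab_sketch(vocab_S, vocab_T):
    char_vocab_S = sorted({c for w in vocab_S for c in w})
    char_vocab_T = sorted({c for w in vocab_T for c in w})
    return char_vocab_S, char_vocab_T
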
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_candidates', type=int, default=10)
    parser.add_argument('--threads', type=int, default=4)
    parser.add_argument('--bwesg_embedding_file', type=str,
                        default='eacl_data/ennl.bwesg.dim=50.window=100.bin')
    parser.add_argument('--output', type=str, default='logs/charLSTM_Embs')
    parser.add_argument('--editdistance_file', type=str, default='edit_distance.npy')
    args = parser.parse_args()

    num_candidates = args.num_candidates
    output = args.output
    EDIT_DISTANCE_FILE = args.editdistance_file

    # Skip the expensive candidate generation if the outputs already exist.
    if os.path.exists(output) and os.path.exists(EDIT_DISTANCE_FILE):
        print('{} exists!'.format(output))
        return

    # Candidates are generated from the shared bilingual (BWESG) space.
    vocab_S, vocab_T, _, multi_Embs_S, multi_Embs_T, _ = embeddings_and_vocab(
        args.bwesg_embedding_file)
    candidates = create_candidates(vocab_S, vocab_T, multi_Embs_S, multi_Embs_T,
                                   num_candidates, EDIT_DISTANCE_FILE, args.threads)

    print('Saving candidates to: {}'.format(output))
    with open(output, 'wb') as output_file:  # avoid shadowing the path variable
        pickle.dump(candidates, output_file)
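
# A hypothetical sketch of the core of create_candidates(); the real function
# (defined elsewhere, and also given an edit-distance matrix and a thread
# count) is not shown in this excerpt. The assumption: for each source word,
# keep the num_candidates nearest target words by cosine similarity in the
# shared BWESG space.
import numpy as np


def nearest_target_candidates(multi_Embs_S, multi_Embs_T, vocab_T, num_candidates):
    # Row-normalize so that a dot product equals cosine similarity.
    S = multi_Embs_S / np.linalg.norm(multi_Embs_S, axis=1, keepdims=True)
    T = multi_Embs_T / np.linalg.norm(multi_Embs_T, axis=1, keepdims=True)
    sims = S.dot(T.T)                                       # (|V_S|, |V_T|)
    top_k = np.argsort(-sims, axis=1)[:, :num_candidates]   # most similar first
    return [[vocab_T[j] for j in row] for row in top_k]
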
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--num_neg_samples', type=int, default=10)
    parser.add_argument('--num_cells', type=int, nargs='+', default=[128, 128])
    parser.add_argument('--num_hidden_layers', type=int, default=2)
    parser.add_argument('--num_epochs', type=int, default=300)
    parser.add_argument('--batch_size', type=int, default=10)
    parser.add_argument('--embedding_file', type=str,
                        default='eacl_data/ennl.mono.dim=50.bin')
    parser.add_argument('--training_data', type=str,
                        default='eacl_data/lex.filtered.train80-20.txt')
    parser.add_argument('--unlabeled_data', type=str,
                        default='eacl_data/lex.filtered.train80-20.tune.txt')
    parser.add_argument('--test_data', type=str,
                        default='eacl_data/lex.filtered.test80-20.txt')
    parser.add_argument('--out_dir', type=str, default='logs/charLSTM_Embs')
    parser.add_argument('--test_candidates_file', type=str)
    parser.add_argument('--unlabeled_candidates_file', type=str)
    parser.add_argument('--walker_weight', type=float, default=1.0)
    parser.add_argument('--visit_weight', type=float, default=1.0)
    parser.add_argument('--logit_weight', type=float, default=1.0)
    parser.add_argument('--learning_rate', type=float, default=0.001)
    parser.add_argument('--add_negative_unlabeled', type=int, default=0)
    parser.add_argument('--reversed_unlabeled_candidates', type=int, default=0)
    args = parser.parse_args()

    num_neg_samples = args.num_neg_samples
    num_cells = args.num_cells
    num_hidden_layers = args.num_hidden_layers
    num_epochs = args.num_epochs
    training_data = args.training_data
    unlabeled_data = args.unlabeled_data
    test_data = args.test_data
    batch_size = args.batch_size
    LOG_DIR = args.out_dir
    test_candidates_file = args.test_candidates_file
    unlabeled_candidates_file = args.unlabeled_candidates_file
    walker_weight = args.walker_weight
    visit_weight = args.visit_weight
    logit_weight = args.logit_weight
    learning_rate = args.learning_rate
    # The integer flags become booleans: 1 adds negative unlabeled examples,
    # and 0 (the default) keeps unlabeled candidates in source-to-target order.
    add_negative_unlabeled = args.add_negative_unlabeled == 1
    unlabeled_source2target = args.reversed_unlabeled_candidates == 0

    vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(
        args.embedding_file)
    char_vocab_S, char_vocab_T = char_vocab(vocab_S, vocab_T)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as session:
        classifier = create_classifier(
            session, vocab_S, vocab_T, Embs_S, Embs_T, char_vocab_S,
            char_vocab_T, num_cells, num_hidden_layers, num_neg_samples,
            batch_size, test_candidates_file, unlabeled_candidates_file,
            walker_weight=walker_weight, visit_weight=visit_weight,
            logit_weight=logit_weight, lr=learning_rate,
            add_negative_unlabeled=add_negative_unlabeled,
            unlabeled_source2target=unlabeled_source2target)
        trainer = create_trainer(classifier, batch_size, num_epochs,
                                 training_data, unlabeled_data, test_data,
                                 LOG_DIR)
        trainer.train()
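
# Hypothetical illustration of how the pickled candidate files produced by the
# generation script above could be read back; the actual loading happens
# inside create_classifier, which is not shown in this excerpt. The function
# name is invented for illustration.
import pickle


def load_candidates_sketch(candidates_path):
    with open(candidates_path, 'rb') as f:
        return pickle.load(f)

# Assumed pipeline: generate candidates once with the script above, then pass
# the pickled files here via --test_candidates_file and
# --unlabeled_candidates_file.
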
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--embedding_file',
        type=str,
        default='/Users/yoshinarifujinuma/work/cross_lingual_embed/eacl_data/ennl.mono.dim=50.bin')
    parser.add_argument(
        '--bwesg_embedding_file',
        type=str,
        default='/Users/yoshinarifujinuma/work/cross_lingual_embed/eacl_data/ennl.bwesg.dim=50.window=100.bin')
    parser.add_argument(
        '--training_data',
        type=str,
        default='/Users/yoshinarifujinuma/work/cross_lingual_embed/eacl_data/lex.filtered.train80-20.txt')
    args = parser.parse_args()

    vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(
        args.embedding_file)
    word2id_source = {w: i for i, w in enumerate(vocab_S)}
    word2id_target = {w: i for i, w in enumerate(vocab_T)}

    # Reading the bilingual training examples (tab-separated word pairs) and
    # collecting the corresponding embedding rows in parallel order.
    source_vecs = []
    target_vecs = []
    for line in open(args.training_data):
        source, target = line.strip().split("\t")
        source_id = word2id_source[source]
        target_id = word2id_target[target]
        source_vecs.append(Embs_S[source_id])
        target_vecs.append(Embs_T[target_id])
    source_vecs = np.array(source_vecs)
    target_vecs = np.array(target_vecs)
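
# A hedged sketch of one possible use of the aligned matrices built above (the
# original fragment stops before showing what happens next): fit a linear
# "translation matrix" W by least squares so that source_vecs @ W approximates
# target_vecs. The function name and this step are assumptions for
# illustration, not code recovered from the script.
import numpy as np


def fit_translation_matrix_sketch(source_vecs, target_vecs):
    W, _, _, _ = np.linalg.lstsq(source_vecs, target_vecs, rcond=None)
    return W  # shape: (source_dim, target_dim)
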