Ejemplo n.º 1
0
def main():
  """Parse command-line options, build the classifier and run training.

  Loads the vocabularies and embeddings from ``--embedding_file`` via
  ``embeddings_and_vocab``, derives the character vocabularies with
  ``char_vocab``, then creates a classifier and a trainer inside a
  ``tf.Session`` and calls ``trainer.train()``.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--num_neg_samples', type=int, default=10)
  parser.add_argument('--num_candidates', type=int, default=10)
  parser.add_argument('--num_cells', type=int, nargs='+', default=[128, 128])
  parser.add_argument('--num_hidden_layers', type=int, default=2)
  parser.add_argument('--num_epochs', type=int, default=300)
  parser.add_argument('--batch_size', type=int, default=10)
  parser.add_argument('--embedding_file', type=str, default='eacl_data/ennl.mono.dim=50.bin')
  # Still accepted so existing command lines keep parsing, but the file is no
  # longer loaded: the embeddings read from it were never used below.
  parser.add_argument('--bwesg_embedding_file', type=str, default='eacl_data/ennl.bwesg.dim=50.window=100.bin')
  parser.add_argument('--training_data', type=str, default='eacl_data/lex.filtered.train80-20.txt')
  parser.add_argument('--test_data', type=str, default='eacl_data/lex.filtered.test80-20.txt')
  args = parser.parse_args()

  # Only the vocabularies and the two embedding matrices are needed; the
  # remaining values returned by embeddings_and_vocab are discarded.
  vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(args.embedding_file)
  char_vocab_S, char_vocab_T = char_vocab(vocab_S, vocab_T)

  with tf.Session() as session:
    classifier = create_classifier(
      session,
      vocab_S, vocab_T, Embs_S, Embs_T, char_vocab_S, char_vocab_T,
      args.num_cells, args.num_hidden_layers,
      args.num_neg_samples, args.num_candidates)
    trainer = create_trainer(
      classifier, args.batch_size, args.num_epochs,
      args.training_data, args.test_data)
    trainer.train()
Ejemplo n.º 2
0
def main():
  """Train the classifier using a candidates file supplied on the command line.

  Parses the options, loads vocabularies and embeddings from
  ``--embedding_file``, builds the character vocabularies, and then creates
  the classifier and trainer inside a ``tf.Session`` whose GPU memory
  allocation is allowed to grow. Finishes by calling ``trainer.train()``.
  """
  cli = argparse.ArgumentParser()
  cli.add_argument('--num_neg_samples', type=int, default=10)
  cli.add_argument('--num_cells', type=int, nargs='+', default=[128, 128])
  cli.add_argument('--num_hidden_layers', type=int, default=2)
  cli.add_argument('--num_epochs', type=int, default=300)
  cli.add_argument('--batch_size', type=int, default=10)
  cli.add_argument('--embedding_file', type=str, default='eacl_data/ennl.mono.dim=50.bin')
  cli.add_argument('--training_data', type=str, default='eacl_data/lex.filtered.train80-20.txt')
  cli.add_argument('--test_data', type=str, default='eacl_data/lex.filtered.test80-20.txt')
  cli.add_argument('--out_dir', type=str, default='logs/charLSTM_Embs')
  cli.add_argument('--candidates_file', type=str)
  opts = cli.parse_args()

  # Keep only the vocabularies and embedding matrices from the loader.
  vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(opts.embedding_file)
  char_vocab_S, char_vocab_T = char_vocab(vocab_S, vocab_T)

  session_config = tf.ConfigProto()
  session_config.gpu_options.allow_growth = True

  with tf.Session(config=session_config) as session:
    classifier = create_classifier(
      session,
      vocab_S, vocab_T, Embs_S, Embs_T, char_vocab_S, char_vocab_T,
      opts.num_cells, opts.num_hidden_layers,
      opts.num_neg_samples, opts.candidates_file)
    trainer = create_trainer(
      classifier,
      opts.batch_size,
      opts.num_epochs,
      opts.training_data,
      opts.test_data,
      opts.out_dir)
    trainer.train()
Ejemplo n.º 3
0
def main():
  """Build translation candidates and pickle them to ``--output``.

  Returns early, after printing a notice, when both the output path and the
  edit-distance file already exist. Otherwise loads vocabularies and
  embeddings from ``--bwesg_embedding_file``, calls ``create_candidates``
  and writes the result to the output path with ``pickle.dump``.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--num_candidates', type=int, default=10)
  parser.add_argument('--threads', type=int, default=4)
  parser.add_argument('--bwesg_embedding_file', type=str, default='eacl_data/ennl.bwesg.dim=50.window=100.bin')
  parser.add_argument('--output', type=str, default='logs/charLSTM_Embs')
  parser.add_argument('--editdistance_file', type=str, default='edit_distance.npy')
  args = parser.parse_args()

  num_candidates = args.num_candidates
  output = args.output
  EDIT_DISTANCE_FILE = args.editdistance_file
  if os.path.exists(output) and os.path.exists(EDIT_DISTANCE_FILE):
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('{} exists!'.format(output))
    return

  vocab_S, vocab_T, _, multi_Embs_S, multi_Embs_T, _ = embeddings_and_vocab(args.bwesg_embedding_file)
  candidates = create_candidates(vocab_S, vocab_T, multi_Embs_S, multi_Embs_T, num_candidates, EDIT_DISTANCE_FILE, args.threads)

  print('Saving candidates to: {}'.format(output))
  # Use a distinct name for the file handle so the path in `output` is not
  # rebound to a file object.
  with open(output, 'wb') as output_file:
    pickle.dump(candidates, output_file)
Ejemplo n.º 4
0
def main():
    """Train the classifier with labeled, unlabeled and test data.

    Parses the command-line options, loads vocabularies and embeddings from
    ``--embedding_file``, builds the character vocabularies, then creates the
    classifier and trainer inside a ``tf.Session`` whose GPU memory
    allocation is allowed to grow, and calls ``trainer.train()``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--num_neg_samples', type=int, default=10)
    cli.add_argument('--num_cells', type=int, nargs='+', default=[128, 128])
    cli.add_argument('--num_hidden_layers', type=int, default=2)
    cli.add_argument('--num_epochs', type=int, default=300)
    cli.add_argument('--batch_size', type=int, default=10)
    cli.add_argument(
        '--embedding_file', type=str,
        default='eacl_data/ennl.mono.dim=50.bin')
    cli.add_argument(
        '--training_data', type=str,
        default='eacl_data/lex.filtered.train80-20.txt')
    cli.add_argument(
        '--unlabeled_data', type=str,
        default='eacl_data/lex.filtered.train80-20.tune.txt')
    cli.add_argument(
        '--test_data', type=str,
        default='eacl_data/lex.filtered.test80-20.txt')
    cli.add_argument('--out_dir', type=str, default='logs/charLSTM_Embs')
    cli.add_argument('--test_candidates_file', type=str)
    cli.add_argument('--unlabeled_candidates_file', type=str)
    cli.add_argument('--walker_weight', type=float, default=1.0)
    cli.add_argument('--visit_weight', type=float, default=1.0)
    cli.add_argument('--logit_weight', type=float, default=1.0)
    cli.add_argument('--learning_rate', type=float, default=0.001)
    cli.add_argument('--add_negative_unlabeled', type=int, default=0)
    cli.add_argument('--reversed_unlabeled_candidates', type=int, default=0)
    opts = cli.parse_args()

    # The two integer switches are turned into booleans: the first is on only
    # for the value 1, the second is on only for the value 0.
    use_negative_unlabeled = opts.add_negative_unlabeled == 1
    source_to_target = opts.reversed_unlabeled_candidates == 0

    vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(
        opts.embedding_file)
    char_vocab_S, char_vocab_T = char_vocab(vocab_S, vocab_T)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    with tf.Session(config=session_config) as session:
        classifier = create_classifier(
            session,
            vocab_S, vocab_T,
            Embs_S, Embs_T,
            char_vocab_S, char_vocab_T,
            opts.num_cells,
            opts.num_hidden_layers,
            opts.num_neg_samples,
            opts.batch_size,
            opts.test_candidates_file,
            opts.unlabeled_candidates_file,
            walker_weight=opts.walker_weight,
            visit_weight=opts.visit_weight,
            logit_weight=opts.logit_weight,
            lr=opts.learning_rate,
            add_negative_unlabeled=use_negative_unlabeled,
            unlabeled_source2target=source_to_target)
        trainer = create_trainer(
            classifier, opts.batch_size, opts.num_epochs,
            opts.training_data, opts.unlabeled_data, opts.test_data,
            opts.out_dir)
        trainer.train()
Ejemplo n.º 5
0
    '/Users/yoshinarifujinuma/work/cross_lingual_embed/eacl_data/ennl.mono.dim=50.bin'
)
# NOTE(review): the defaults below are absolute paths on one developer's
# machine; pass the options explicitly when running elsewhere.
parser.add_argument(
    '--bwesg_embedding_file',
    type=str,
    default=
    '/Users/yoshinarifujinuma/work/cross_lingual_embed/eacl_data/ennl.bwesg.dim=50.window=100.bin'
)
parser.add_argument(
    '--training_data',
    type=str,
    default=
    '/Users/yoshinarifujinuma/work/cross_lingual_embed/eacl_data/lex.filtered.train80-20.txt'
)
args = parser.parse_args()
vocab_S, vocab_T, _, Embs_S, Embs_T, _ = embeddings_and_vocab(
    args.embedding_file)
# Map each word to its position in the vocabulary, which is used below as the
# row index into the matching embedding matrix.
word2id_source = {w: i for i, w in enumerate(vocab_S)}
word2id_target = {w: i for i, w in enumerate(vocab_T)}

# Reading the bilingual training examples: one tab-separated
# "source<TAB>target" pair per line. A word missing from the vocabulary
# raises KeyError.
source_vecs = []
target_vecs = []
# The context manager closes the training file once every pair has been read.
with open(args.training_data) as training_file:
    for line in training_file:
        source, target = line.strip().split("\t")
        source_id = word2id_source[source]
        target_id = word2id_target[target]
        source_vecs.append(Embs_S[source_id])
        target_vecs.append(Embs_T[target_id])
source_vecs = np.array(source_vecs)
target_vecs = np.array(target_vecs)