Esempio n. 1
0
def load_params(model, from_file):
    """Restore the model's translation table from a NumPy file.

    Overwrites ``model.p_f_given_e`` with the array stored at *from_file*
    (presumably one previously written by ``save_params`` — verify format).
    """
    log_info("Loading parameters from %s" % from_file)
    model.p_f_given_e = np.load(from_file)
Esempio n. 2
0
def save_params(model, to_file):
    """Write the model's translation table to *to_file* in ``.npy`` format.

    The file must be opened in binary mode: ``np.save`` writes raw bytes,
    and a text-mode handle ("w+") raises a TypeError on Python 3.
    """
    log_info("Saving parameters to %s" % to_file)
    with open(to_file, "wb") as f:
        np.save(f, model.p_f_given_e)
Esempio n. 3
0
# Paths to the validation/test corpora, the vocabulary files, and the
# gold-standard alignment files (the .wa.nonullalign suffix suggests
# NAACL word-alignment format with NULL alignments stripped — confirm).
english_validation_file_path = "data/validation/dev.e"
french_testing_file_path = "data/testing/test/test.f"
english_testing_file_path = "data/testing/test/test.e"
french_vocab_path = "data/vocabulary/french.txt"
english_vocab_path = "data/vocabulary/english.txt"
validation_golden = 'data/validation/dev.wa.nonullalign'
testing_golden = 'data/testing/answers/test.wa.nonullalign'

# Load the vocabularies for English and French.
# NOTE(review): french_file_path / english_file_path, min_count and
# max_vocab_size are defined earlier in the file (not visible here).
vocab_french = Vocabulary(french_file_path, vocab_file_path=french_vocab_path, min_count=min_count, \
        max_size=max_vocab_size)
vocab_english = Vocabulary(english_file_path, vocab_file_path=english_vocab_path, min_count=min_count, \
        max_size=max_vocab_size)

# Set up the model.
log_info("Setting up the model, French vocabulary size = %d, English vocabulary size = %d." % \
        (len(vocab_french), len(vocab_english)))
model = IBM1(french_vocab_size=len(vocab_french),
             english_vocab_size=len(vocab_english))
log_info("Model has been set up.")

# Tokenize the French and English sentences.
# Each corpus is converted to id sequences using the two vocabularies;
# training, validation, and test corpora are tokenized the same way.
parallel_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_file_path, english_file_path=english_file_path)
parallel_validation_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_validation_file_path, english_file_path=english_validation_file_path)
parallel_testing_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_testing_file_path, english_file_path=english_testing_file_path)

# Calculate the validation AER and log likelihood for the initial parameters.
validation_aer = evaluate_model(model,
                                validation_golden,
Esempio n. 4
0
# Paths to the training, validation, and vocabulary data. A reduced
# training set is selected when small_dataset is set (quick experiments).
# NOTE: the original code assigned french_validation_file_path twice
# with the same value; the duplicate has been removed.
french_file_path = "data/training/small/hansards.36.2.f" if small_dataset else "data/training/hansards.36.2.f"
french_validation_file_path = "data/validation/dev.f"
english_file_path = "data/training/small/hansards.36.2.e" if small_dataset else "data/training/hansards.36.2.e"
english_validation_file_path = "data/validation/dev.e"
french_vocab_path = "data/vocabulary/french.txt"
english_vocab_path = "data/vocabulary/english.txt"

# Load the vocabularies for English and French.
# NOTE(review): min_count and max_vocab_size come from earlier in the
# file (not visible in this chunk).
vocab_french = Vocabulary(french_file_path, vocab_file_path=french_vocab_path, min_count=min_count, \
        max_size=max_vocab_size)
vocab_english = Vocabulary(english_file_path, vocab_file_path=english_vocab_path, min_count=min_count, \
        max_size=max_vocab_size)

# Set up the model. alpha is the Dirichlet prior concentration of the
# variational model — presumably parsed from the command line; confirm.
log_info("Setting up the model, French vocabulary size = %d, English vocabulary size = %d, alpha=%f." % \
        (len(vocab_french), len(vocab_english), alpha))
model = VariationalIBM1(french_vocab_size=len(vocab_french), english_vocab_size=len(vocab_english), alpha=alpha)
log_info("Model has been set up.")

# Tokenize the French and English sentences into id sequences.
parallel_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_file_path, english_file_path=english_file_path)
parallel_validation_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_validation_file_path, english_file_path=english_validation_file_path)

# Calculate the validation AER and log likelihood for the initial parameters.
predictions = []
for french_sentence, english_sentence in parallel_validation_corpus:
    alignments = model.align(french_sentence, english_sentence)

    # Remove null alignments from predictions
Esempio n. 5
0
# Paths to validation/test corpora, vocabularies, and gold alignment
# files (.wa.nonullalign — presumably NAACL alignment format without
# NULL alignments; confirm against the evaluation code).
english_validation_file_path = "data/validation/dev.e"
french_vocab_path = "data/vocabulary/french.txt"
english_vocab_path = "data/vocabulary/english.txt"
validation_golden = 'data/validation/dev.wa.nonullalign'
testing_golden = 'data/testing/answers/test.wa.nonullalign'
french_testing_file_path = "data/testing/test/test.f"
english_testing_file_path = "data/testing/test/test.e"

# Load the vocabularies for English and French.
# NOTE(review): french_file_path / english_file_path, min_count and
# max_vocab_size are defined earlier in the file (not visible here).
vocab_french = Vocabulary(french_file_path, vocab_file_path=french_vocab_path, min_count=min_count,
                          max_size=max_vocab_size)
vocab_english = Vocabulary(english_file_path, vocab_file_path=english_vocab_path, min_count=min_count,
                           max_size=max_vocab_size)

# Set up the model. max_jump bounds the alignment jump distance of the
# IBM2 distortion model; args.init selects the initialization scheme
# (e.g. "ibm1", see the branch below this block).
log_info("Setting up the model, French vocabulary size = %d, English vocabulary size = %d, max_jump = %d." % \
        (len(vocab_french), len(vocab_english), max_jump))
model = IBM2(french_vocab_size=len(vocab_french), english_vocab_size=len(vocab_english), max_jump=max_jump, \
        init=args.init)
log_info("Model has been set up.")

# Tokenize the French and English sentences into id sequences for the
# training, validation, and test corpora.
log_info("Loading parallel corpus from %s and %s" % (french_file_path, english_file_path))
parallel_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_file_path, english_file_path=english_file_path)
parallel_validation_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_validation_file_path, english_file_path=english_validation_file_path)
parallel_testing_corpus = tokenize_corpora_to_ids(vocab_french, vocab_english, \
        french_file_path=french_testing_file_path, english_file_path=english_testing_file_path)

# Load IBM1 parameters
if args.init == "ibm1":