Example #1
0
def get_source_data(language):
    """Load the POS-tagged corpus for *language* and its word vectors.

    Reads 'ud2_data/pos/<language>.pos', integer-encodes the sentences
    using the module-level pos_tags / unk_word settings, and returns
    (full_words, text_sentences, sentences, sentences_pos, word_vec).
    """
    corpus_path = 'ud2_data/pos/%s.pos' % language
    text_sentences = load_and_save.read_sentences_from_file(corpus_path)
    # integer_sentences_yuan also yields frequent-word and tag lists,
    # which this caller does not need.
    (sentences, full_words, _freq_words,
     sentences_pos, _pos) = load_and_save.integer_sentences_yuan(
        text_sentences, pos=pos_tags, max_words=1000, unk_word=unk_word)
    word_vec = get_word_vec(language, full_words)
    return full_words, text_sentences, sentences, sentences_pos, word_vec
Example #2
0
def get_source_data(language):
    """Fetch training data for one language.

    Loads raw sentences from the language's .pos file, converts them to
    integer ids (honouring the module-level pos_tags and unk_word), and
    looks up word vectors for the resulting vocabulary.
    """
    corpus_file = ''.join(['ud2_data/pos/', language, '.pos'])
    text_sentences = load_and_save.read_sentences_from_file(corpus_file)
    converted = load_and_save.integer_sentences_yuan(
        text_sentences, pos=pos_tags, max_words=1000, unk_word=unk_word)
    # Unpack, discarding the frequent-word and tag lists.
    sentences, full_words, _, sentences_pos, _ = converted
    word_vec = get_word_vec(language, full_words)
    return full_words, text_sentences, sentences, sentences_pos, word_vec
Example #3
0
# Log the run configuration (these names are defined earlier in the file).
print 'regularization coeff', base_reg_coeff
print 'shared_reg_coeff', shared_reg_coeff
print 'source language', source
print 'target language', target
print 'pair_str', pair_str
print 'use_indicator', use_indicator
print 'use_misc', use_misc

# Model hyper-parameters / vocabulary settings.
word_vec_dim = 20
pos_tags = universal_pos_tags
unk_word = '<UNK>'
start_token = pos_tags.index('START')

# Load and integer-encode the target-language training corpus,
# capping the vocabulary at 1000 words (rarer words map to unk_word).
language = target
text_sentences = load_and_save.read_sentences_from_file('ud2_data/pos/'+language+'.pos')
sentences, full_words, freq_words, sentences_pos, pos = load_and_save.integer_sentences_yuan(text_sentences, pos=universal_pos_tags, max_words=1000, unk_word=unk_word)
# -1 sentinel when the unknown token is absent from the vocabulary.
unk_id = full_words.index(unk_word) if unk_word in full_words else -1
target_word_vec = get_word_vec(language, full_words)
test_text_sentences, test_sentences, test_sentences_pos = load_and_save.read_test_sentences_from_file('ud2_data/pos/' + language + '.test.pos', full_words, unk_id, pos_tags)
if use_misc:
    # NOTE(review): misc_feature_map / n_misc_feat only exist when
    # use_misc is true — later code must guard on use_misc before use.
    misc_feature_map = get_emission_features(full_words)
    n_misc_feat = misc_feature_map.shape[1]

# Layout of the flat feature-weight vector:
#   [0, n_pos*n_pos)                  transition features
#   [trans_feat_end, embedding_feat_end)  per-tag embedding features
#   projection matrix parameters start at embedding_feat_end.
n_pos = len(pos_tags)
n_words = len(full_words)
trans_feat_start = 0
trans_feat_end = n_pos * n_pos
embedding_feat_start = trans_feat_end
embedding_feat_end = trans_feat_end + n_pos * word_vec_dim
projection_matrix_start = embedding_feat_end
Example #4
0
    print '%s words have no vector' % num_no_vect
    return word_vect_matrix


# Run configuration: language pair and the bilingual word-pair file.
source_language = 'spanish'
target_language = 'english07'
pair_filename = 'word_pairs/es-en.pair'
reverse_pair = False

# Load pre-trained word vectors for both languages; the two embedding
# spaces must have the same dimensionality.
target_vectors, target_vect_size = word_vect_loader.load('pos_data/'+target_language+'.train.sent.vec')
source_vectors, source_vect_size = word_vect_loader.load('pos_data/'+source_language+'.train.sent.vec')
assert source_vect_size == target_vect_size
vect_size = source_vect_size

print 'loading'
# Source language: integer-encode the training corpus (vocab cap 10000),
# then encode its test set with the same vocabulary.
source_text_sentences = load_and_save.read_sentences_from_file('pos_data/conll-'+source_language+'.pos')
source_sentences, source_words, source_sentences_pos, _ = load_and_save.integer_sentences(source_text_sentences, pos=universal_pos_tags, max_words=10000)
source_test_sentences = load_and_save.read_sentences_from_file('pos_data/conll-'+source_language+'-test.pos')
source_test_sentences, _, source_test_sentences_pos, _ = load_and_save.integer_sentences(source_test_sentences, pos=universal_pos_tags, words=source_words)

# Target language: only the test corpus is loaded here.
target_text_sentences = load_and_save.read_sentences_from_file('pos_data/conll-'+target_language+'-test.pos')
target_sentences, target_words, target_sentences_pos, _ = load_and_save.integer_sentences(target_text_sentences, pos=universal_pos_tags, max_words=10000)

# Stack per-word vectors into matrices aligned with the vocabularies.
source_vector_matrix = make_vector_matrix(source_words, source_vectors, vect_size)
target_vector_matrix = make_vector_matrix(target_words, target_vectors, vect_size)

print 'finding rotation'
# Read translation pairs; NOTE(review): pair_file is never closed in the
# visible code — confirm it is closed (or use a with-block) further down.
translation_pairs = []
pair_file = open(pair_filename)
for line in pair_file:
    split = line.split()
Example #5
0
# Log the run configuration (these names are defined earlier in the file).
print 'regularization coeff', base_reg_coeff
print 'shared_reg_coeff', shared_reg_coeff
print 'source language', source
print 'target language', target
print 'pair_str', pair_str
print 'use_indicator', use_indicator
print 'use_misc', use_misc

# Model hyper-parameters / vocabulary settings.
word_vec_dim = 20
pos_tags = universal_pos_tags
unk_word = '<UNK>'
start_token = pos_tags.index('START')

# Load and integer-encode the target-language training corpus,
# capping the vocabulary at 1000 words (rarer words map to unk_word).
language = target
text_sentences = load_and_save.read_sentences_from_file('ud2_data/pos/' +
                                                        language + '.pos')
sentences, full_words, freq_words, sentences_pos, pos = load_and_save.integer_sentences_yuan(
    text_sentences, pos=universal_pos_tags, max_words=1000, unk_word=unk_word)
# -1 sentinel when the unknown token is absent from the vocabulary.
unk_id = full_words.index(unk_word) if unk_word in full_words else -1
target_word_vec = get_word_vec(language, full_words)
test_text_sentences, test_sentences, test_sentences_pos = load_and_save.read_test_sentences_from_file(
    'ud2_data/pos/' + language + '.test.pos', full_words, unk_id, pos_tags)
if use_misc:
    # NOTE(review): misc_feature_map / n_misc_feat only exist when
    # use_misc is true — later code must guard on use_misc before use.
    misc_feature_map = get_emission_features(full_words)
    n_misc_feat = misc_feature_map.shape[1]

# Layout of the flat feature-weight vector: transition features occupy
# [0, n_pos*n_pos); embedding features start where they end.
n_pos = len(pos_tags)
n_words = len(full_words)
trans_feat_start = 0
trans_feat_end = n_pos * n_pos
embedding_feat_start = trans_feat_end