def test_process_sentence_russian(self):
     """Russian text processed with ``lemma=True`` yields the expected list.

     The expected tokens are all lowercase and contain no entry for the
     full stop that ends the input sentence.
     """
     generator = gensent.SentenceGenerator(lemma=True, language='russian')
     russian_text = "Три девицы под окном Пряли поздно вечерком."
     tokens = generator._process_sentence(russian_text)
     expected = ['три', 'девица', 'под', 'окно', 'прясть', 'поздно', 'вечерок']
     self.assertEqual(tokens, expected)
 def test_dutch(self):
     """Dutch text processed with ``lemma=True`` yields the expected list.

     In the expected output "Ik" is lowercased and "ga" appears as "gaan".
     """
     generator = gensent.SentenceGenerator(lemma=True, language='dutch')
     dutch_text = "Ik ga naar buiten toe"
     tokens = generator._process_sentence(dutch_text)
     expected = ['ik', 'gaan', 'naar', 'buiten', 'toe']
     self.assertEqual(tokens, expected)
 def test_process_numbers(self):
     """A decimal number in the input is expected to come back as ``NUM``."""
     generator = gensent.SentenceGenerator()
     tokens = generator._process_sentence("Pi is 3.14159")
     # The placeholder is read from the generator instance, not hard-coded.
     expected = ['pi', 'is', generator.NUM]
     self.assertEqual(tokens, expected)
 def test_process_EU_money(self):
     """A euro amount ("€5.60") is expected to come back as a single ``NUM``."""
     generator = gensent.SentenceGenerator()
     tokens = generator._process_sentence("Breakfast cost me €5.60")
     # The placeholder is read from the generator instance, not hard-coded.
     expected = ['breakfast', 'cost', 'me', generator.NUM]
     self.assertEqual(tokens, expected)
 def test_process_sentence(self):
     """The first fixture sentence is processed into the expected tokens.

     The hyphenated 'sam-i-am' is expected to stay a single token.
     """
     generator = gensent.SentenceGenerator()
     first_sentence = self.sentence_list[0]
     tokens = generator._process_sentence(first_sentence)
     expected = ['i', 'am', 'sam', 'sam-i-am']
     self.assertEqual(tokens, expected)
 def test_two_passes(self):
     """Iterating the sentence generator twice must give equal results."""
     generator = gensent.SentenceGenerator()
     generator.read_sentence_list(self.sentence_list)
     # Each list() call runs one complete iteration over the generator.
     first_pass = list(generator)
     second_pass = list(generator)
     self.assertEqual(first_pass, second_pass)
 def test_generator_unprepared(self):
     """Make sure an unprepared sentence generator throws an error.

     No sentences are loaded into the generator before ``_gen_sentences``
     is used, and the test expects that use to raise.
     """
     sentences = gensent.SentenceGenerator()
     # The call has to happen inside the assertRaises context. Passing
     # ``sentences._gen_sentences()`` as the callable argument would hand
     # assertRaises the call's *result*; if that result is a generator
     # object, the "not callable" TypeError would satisfy the assertion
     # without the method body ever running.
     with self.assertRaises(Exception):
         # list() drains the result so that a lazily evaluated generator
         # body actually executes and can raise.
         list(sentences._gen_sentences())
# Example #8
# 0
        action='store_true',
        default=False,
        help='lemmatize the sentences before training word vectors')
    args = parser.parse_args()

    return args


# Read the command-line options once; everything below takes its paths and
# flags from `args`.
args = parse_args()

print('Working on Dutch...')
start_time = time.time()

# Sentence source: the 'nl' subdirectory of the data directory.
nl_direc = os.path.join(args.data_dir, 'nl')
nl_sents = gensent.SentenceGenerator(
    language='dutch',
    lemma=args.lemma,
    cstlemma_dir=args.cstlemma_dir,
)
nl_sents.read_directory(nl_direc)

# Train word2vec on the sentences; keyword arguments come from
# w2vconfig.gensim_config.
nl_model = gensim.models.Word2Vec(nl_sents, **w2vconfig.gensim_config)
nl_vectors = nl_model.wv
print('Dutch word tokens: {}'.format(nl_sents.word_token_count))
print('Dutch vocab size: {}'.format(len(nl_vectors.vocab)))

# The output file name records whether the --lemma option was on.
nl_vectors_fp = os.path.join(
    args.vectors_dir,
    'nl_vectors_lemma.txt' if args.lemma else 'nl_vectors_nolemma.txt',
)
nl_vectors.save_word2vec_format(nl_vectors_fp, binary=False)

elapsed_time = time.time() - start_time
print('Elapsed time:', elapsed_time)