Example #1
0
 def test_sliding_window(self):
     """The third sliding-window item has the expected current/context pair."""
     corpus = FileCorpus(path_text_file)
     # Advance to index 2 (the third window) and keep it for the assertion.
     for idx, window in enumerate(corpus.get_sliding_window_iterator()):
         if idx == 2:
             break
     assert window == {'current': 'long', 'context': ['family', 'dashwood', 'settled', 'sussex']}
Example #2
0
 def test_file_corpus(self):
     """Token iterator yields the expected total count and leading words."""
     corpus = FileCorpus(path_text_file)
     token_iter = corpus.get_token_iterator(verbose=1)
     n_tokens, prefix_words = count_words_and_collect_prefix(token_iter)
     print("!!!!!total words", n_tokens)
     assert n_tokens == TEST_TEXT_LEN
     assert '|'.join(prefix_words) == TEST_FIRST_10_WORDS
Example #3
0
def create_from_file(path, min_frequency=0):
    """Collects vocabulary from a corpus by a given file path.

    Args:
        path: Path to the source text file.
        min_frequency: Minimum token frequency required for a token to be
            kept in the vocabulary.

    Returns:
        The vocabulary object produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` does not point to an existing file.
    """
    if not os.path.isfile(path):
        raise RuntimeError("source file does not exist")
    # Renamed from `iter` to avoid shadowing the builtin of the same name.
    token_iter = FileCorpus(path).get_token_iterator()
    return _create_from_iterator(token_iter, min_frequency)
Example #4
0
def create_from_file(path, min_frequency=0, language='eng'):
    """Collects vocabulary from a corpus by a given file path.

    Args:
        path: Path to the source text file.
        min_frequency: Minimum token frequency required for a token to be
            kept in the vocabulary.
        language: Language code passed through to ``FileCorpus``.

    Returns:
        The vocabulary object produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` does not point to an existing file.
    """
    if not os.path.isfile(path):
        raise RuntimeError("source file does not exist")
    # Tokenizer with no stopwords: keep every token for frequency counting.
    tokenizer = Tokenizer(stopwords=[])
    # Renamed from `iter` to avoid shadowing the builtin of the same name.
    token_iter = FileCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    return _create_from_iterator(token_iter, min_frequency)
Example #5
0
def create_from_path(path, min_frequency=0, language='eng'):
    """Collects vocabulary from a corpus by a given directory path.

    Accepts either a single file or a directory; a directory is read via
    ``DirCorpus``, a file via ``FileCorpus``.

    Args:
        path: Path to a source text file or a directory of source files.
        min_frequency: Minimum token frequency required for a token to be
            kept in the vocabulary.
        language: Language code passed through to the corpus.

    Returns:
        The vocabulary object produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` is neither an existing file nor a
            directory.
    """
    # TODO: add option for stopwords
    tokenizer = Tokenizer(stopwords=[])
    # Flattened `else: if` into `elif`; `token_iter` avoids shadowing the
    # builtin `iter`; tokenizer is passed by keyword in both branches for
    # consistency.
    if os.path.isfile(path):
        token_iter = FileCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    elif os.path.isdir(path):
        token_iter = DirCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    else:
        raise RuntimeError("source path can not be read")
    return _create_from_iterator(token_iter, min_frequency)
Example #6
0
 def test_sentence(self):
     """The first sentence yielded by the sentence iterator matches the fixture."""
     corpus = FileCorpus(path_text_file)
     # Only the first yielded sentence is checked; stop immediately after.
     for first_sentence in corpus.get_sentence_iterator(verbose=True):
         assert first_sentence == ['family', 'dashwood', 'long', 'settled', 'sussex']
         break
Example #7
0
 def test_file_corpus(self):
     corpus = FileCorpus(path_text_file)
     tokens_iter = corpus.get_token_iterator(verbose=1)
     total_words, words = count_words_and_collect_prefix(tokens_iter)
     print("!!!!!total words", total_words)