Example #1
0
 def test_sliding_window(self):
     """The third sliding-window item has the expected current/context pair."""
     corpus = FileCorpus(path_text_file)
     # Advance to index 2 (the third window) and keep it for the assertion.
     for idx, window in enumerate(corpus.get_sliding_window_iterator()):
         if idx == 2:
             break
     assert window == {'current': 'long', 'context': ['family', 'dashwood', 'settled', 'sussex']}
Example #2
0
 def test_file_corpus(self):
     """Token iterator yields the expected total count and leading words."""
     corpus = FileCorpus(path_text_file)
     token_iter = corpus.get_token_iterator(verbose=1)
     n_tokens, prefix_words = count_words_and_collect_prefix(token_iter)
     print("!!!!!total words", n_tokens)
     assert n_tokens == TEST_TEXT_LEN
     assert '|'.join(prefix_words) == TEST_FIRST_10_WORDS
Example #3
0
def create_from_file(path, min_frequency=0):
    """Collects vocabulary from a corpus by a given file path.

    Args:
        path: Path to the source text file.
        min_frequency: Minimum token frequency required for a token to be
            kept in the vocabulary.

    Returns:
        The vocabulary object produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` does not point to an existing file.
    """
    if not os.path.isfile(path):
        raise RuntimeError("source file does not exist")
    # Renamed from `iter` to avoid shadowing the builtin of the same name.
    token_iter = FileCorpus(path).get_token_iterator()
    return _create_from_iterator(token_iter, min_frequency)
Example #4
0
def create_from_file(path, min_frequency=0, language='eng'):
    """Collects vocabulary from a corpus by a given file path.

    Args:
        path: Path to the source text file.
        min_frequency: Minimum token frequency required for a token to be
            kept in the vocabulary.
        language: Language code passed through to ``FileCorpus``.

    Returns:
        The vocabulary object produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` does not point to an existing file.
    """
    if not os.path.isfile(path):
        raise RuntimeError("source file does not exist")
    # Tokenizer with no stopwords: keep every token for frequency counting.
    tokenizer = Tokenizer(stopwords=[])
    # Renamed from `iter` to avoid shadowing the builtin of the same name.
    token_iter = FileCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    return _create_from_iterator(token_iter, min_frequency)
Example #5
0
def create_from_path(path, min_frequency=0, language='eng'):
    """Collects vocabulary from a corpus by a given directory path.

    Accepts either a single file or a directory; a directory is read via
    ``DirCorpus``, a file via ``FileCorpus``.

    Args:
        path: Path to a source text file or a directory of source files.
        min_frequency: Minimum token frequency required for a token to be
            kept in the vocabulary.
        language: Language code passed through to the corpus.

    Returns:
        The vocabulary object produced by ``_create_from_iterator``.

    Raises:
        RuntimeError: If ``path`` is neither an existing file nor a
            directory.
    """
    # TODO: add option for stopwords
    tokenizer = Tokenizer(stopwords=[])
    # Flattened `else: if` into `elif`; `token_iter` avoids shadowing the
    # builtin `iter`; tokenizer is passed by keyword in both branches for
    # consistency.
    if os.path.isfile(path):
        token_iter = FileCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    elif os.path.isdir(path):
        token_iter = DirCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    else:
        raise RuntimeError("source path can not be read")
    return _create_from_iterator(token_iter, min_frequency)
Example #6
0
 def test_sentence(self):
     """The first sentence yielded by the sentence iterator matches the fixture."""
     corpus = FileCorpus(path_text_file)
     # Only the first yielded sentence is checked; stop immediately after.
     for first_sentence in corpus.get_sentence_iterator(verbose=True):
         assert first_sentence == ['family', 'dashwood', 'long', 'settled', 'sussex']
         break
Example #7
0
 def test_file_corpus(self):
     corpus = FileCorpus(path_text_file)
     tokens_iter = corpus.get_token_iterator(verbose=1)
     total_words, words = count_words_and_collect_prefix(tokens_iter)
     print("!!!!!total words", total_words)