Ejemplo n.º 1
0
def create_ngram_tokens_from_dir(path, min_gram, max_gram, min_frequency=0):
    """Collect ngram tokens from a corpus located at *path*.

    Args:
        path: directory containing the corpus files.
        min_gram: minimal ngram length.
        max_gram: maximal ngram length.
        min_frequency: ngrams occurring fewer times than this are dropped.

    Returns:
        Vocabulary populated with ngram tokens sorted by descending frequency,
        with creation metadata attached.

    Raises:
        RuntimeError: if *path* is not an existing directory.
    """
    t_start = time.time()
    if not os.path.isdir(path):
        raise RuntimeError("source directory does not exist")
    dic_freqs = {}
    corpus = DirCorpus(path).get_token_iterator()
    for word in corpus:
        ngram_tokensList = get_ngram_tokensList_from_word(word, min_gram, max_gram)
        for nts in ngram_tokensList:
            for nt in nts:
                # dict.get with a default replaces the manual in/else branch
                dic_freqs[nt] = dic_freqs.get(nt, 0) + 1
    v = Vocabulary()
    v.lst_frequencies = []
    for i, word in enumerate(sorted(dic_freqs, key=dic_freqs.get, reverse=True)):
        frequency = dic_freqs[word]
        if frequency < min_frequency:
            # Frequencies are sorted descending, so everything that follows
            # is also below the threshold.
            break
        v.lst_frequencies.append(frequency)
        v.lst_words.append(word)
        v.dic_words_ids[word] = i
    # FIX: cnt_words was read below without ever being set in this function
    # (the sibling create_from_annotated_dir sets it the same way).
    v.cnt_words = len(v.lst_words)
    v.metadata["min_frequency"] = min_frequency
    v.metadata["min_gram"] = min_gram
    v.metadata["max_gram"] = max_gram
    v.metadata["cnt_words"] = v.cnt_words
    t_end = time.time()
    v.metadata["execution_time"] = t_end - t_start
    v.metadata["timestamp"] = datetime.datetime.now().isoformat()
    v.metadata["source"] = corpus.metadata
    return v
Ejemplo n.º 2
0
def create_from_dir(path, min_frequency=0):
    """Collect a vocabulary from the corpus in directory *path*.

    Args:
        path: corpus directory.
        min_frequency: words occurring fewer times than this are dropped.

    Returns:
        Vocabulary built from the directory's token stream.

    Raises:
        RuntimeError: if *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        raise RuntimeError("source directory does not exist")
    # Renamed from `iter`, which shadowed the builtin of the same name.
    token_iter = DirCorpus(path).get_token_iterator()
    return _create_from_iterator(token_iter, min_frequency)
Ejemplo n.º 3
0
def create_from_dir(path, min_frequency=0, language='eng'):
    """Collect a vocabulary from the corpus in directory *path*.

    Args:
        path: corpus directory.
        min_frequency: words occurring fewer times than this are dropped.
        language: corpus language code passed to DirCorpus.

    Returns:
        Vocabulary built from the directory's token stream.

    Raises:
        RuntimeError: if *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        raise RuntimeError("source directory does not exist")
    # TODO: add option for stopwords
    tokenizer = Tokenizer(stopwords=[])
    # Renamed from `iter`, which shadowed the builtin of the same name.
    token_iter = DirCorpus(path, language).get_token_iterator(tokenizer)
    return _create_from_iterator(token_iter, min_frequency)
Ejemplo n.º 4
0
def create_from_path(path, min_frequency=0, language='eng'):
    """Collect a vocabulary from a corpus file or directory at *path*.

    Args:
        path: a corpus file or a directory of corpus files.
        min_frequency: words occurring fewer times than this are dropped.
        language: corpus language code passed to the corpus reader.

    Returns:
        Vocabulary built from the corpus token stream.

    Raises:
        RuntimeError: if *path* is neither a readable file nor a directory.
    """
    # TODO: add option for stopwords
    tokenizer = Tokenizer(stopwords=[])
    # Flattened the nested `else: if:` into `elif`; renamed `iter`, which
    # shadowed the builtin of the same name.
    if os.path.isfile(path):
        token_iter = FileCorpus(path, language).get_token_iterator(tokenizer=tokenizer)
    elif os.path.isdir(path):
        token_iter = DirCorpus(path, language).get_token_iterator(tokenizer)
    else:
        raise RuntimeError("source path can not be read")
    return _create_from_iterator(token_iter, min_frequency)
Ejemplo n.º 5
0
def create_from_annotated_dir(
        path,
        min_frequency=0,
        representation='word'):  # TODO: faster creation of vocab
    """Collect a vocabulary from an annotated corpus under *path*.

    Each annotated token is expanded into words according to
    *representation*; word frequencies are counted and the resulting
    vocabulary is ordered by descending frequency, cut off below
    *min_frequency*.

    Raises:
        RuntimeError: if *path* is not an existing directory.
    """
    t_start = time.time()
    if not os.path.isdir(path):
        raise RuntimeError("source directory does not exist")
    frequencies = {}
    annotated_tokens = DirCorpus(path).get_token_iterator(
        tokenizer=ANNOTATED_TEXT_TOKENIZER)
    for annotated_token in annotated_tokens:
        for word in get_words_from_annotated_token(annotated_token,
                                                   representation):
            frequencies[word] = frequencies.get(word, 0) + 1
    # TODO: does it really differs from _create_from_iterator? maybe merge?
    vocab = Vocabulary()
    vocab.lst_frequencies = []
    ranked_words = sorted(frequencies, key=frequencies.get, reverse=True)
    for word_id, word in enumerate(ranked_words):
        freq = frequencies[word]
        if freq < min_frequency:
            # Descending order: every remaining word is below the cutoff.
            break
        vocab.lst_frequencies.append(freq)
        vocab.lst_words.append(word)
        vocab.dic_words_ids[word] = word_id
    vocab.cnt_words = len(vocab.lst_words)
    vocab.metadata["min_frequency"] = min_frequency
    vocab.metadata["cnt_words"] = vocab.cnt_words
    vocab.metadata["execution_time"] = time.time() - t_start
    vocab.metadata["timestamp"] = datetime.datetime.now().isoformat()
    vocab.metadata["context_representation"] = representation
    vocab.metadata["source"] = annotated_tokens.metadata
    return vocab
Ejemplo n.º 6
0
 def test_dir_corpus(self):
     """A directory corpus yields the expected word count and prefix."""
     token_stream = DirCorpus(path_text).get_token_iterator()
     n_words, prefix_words = count_words_and_collect_prefix(token_stream)
     assert n_words == TEST_TEXT_LEN
     assert '|'.join(prefix_words) == TEST_FIRST_10_WORDS
Ejemplo n.º 7
0
 def test_dir_corpus(self):
     corpus = DirCorpus(path_text)
     tokens_iter = corpus.get_token_iterator()
     total_words, words = count_words_and_collect_prefix(tokens_iter)