Example #1
def test_tokenized_corpus_interface():

    assert issubclass(TokenizedCorpus, ITokenizedCorpus)

    source = readers.TextTokenizer(source=["a b c", "e f g"])
    instance = TokenizedCorpus(source)
    assert isinstance(instance, ITokenizedCorpus)
Example #2
def to_co_occurrence_matrix(
    corpus_or_reader: Union[ICorpusReader, TokenizedCorpus], vocabulary: Mapping[str, int] = None
) -> scipy.sparse.spmatrix:
    """Computes a term-term co-ocurrence matrix for documents in corpus/reader.

    Parameters
    ----------
    corpus_or_reader : Union[ICorpusReader,TokenizedCorpus]
        Sequence of tokenized documents

    Returns
    -------
    pd.DataFrame
        Upper diagonal of term-term frequency matrix (TTM). Note that diagonal (wi, wi) is not returned
    """

    if not isinstance(corpus_or_reader, ITokenizedCorpus):
        corpus_or_reader = TokenizedCorpus(reader=corpus_or_reader)

    vocabulary = vocabulary or corpus_or_reader.token2id
    dtm_corpus: VectorizedCorpus = CorpusVectorizer().fit_transform(
        corpus_or_reader, already_tokenized=True, vocabulary=vocabulary
    )
    term_term_matrix = dtm_corpus.co_occurrence_matrix()
    return term_term_matrix
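A minimal usage sketch for the function above, reusing the in-memory reader pattern from Example #1; the two sample sentences are invented for illustration:

source = readers.TextTokenizer(source=["a b c", "b c d"])
corpus = TokenizedCorpus(source)
ttm = to_co_occurrence_matrix(corpus)
# Sparse upper-triangular term-term counts over the four-token vocabulary
print(ttm.todense())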
Example #3
    def test_to_dataframe_of_term_matrix_gives_expected_result(self):

        # Arrange
        reader = PandasCorpusReader(self.create_test_dataframe())
        corpus = TokenizedCorpus(
            reader,
            # Keep tokens essentially as-is; only numerals are dropped:
            transform_opts=TokensTransformOpts(
                only_any_alphanumeric=False,
                to_lower=False,
                remove_accents=False,
                min_len=1,
                max_len=None,
                keep_numerals=False,
            ),
        )

        term_term_matrix = CorpusVectorizer().fit_transform(corpus, already_tokenized=True).co_occurrence_matrix()

        # Act
        id2w = corpus.id2token.get
        co_occurrences = term_term_matrix_to_co_occurrences(term_term_matrix, threshold_count=1, ignore_ids=set())
        co_occurrences['w1'] = co_occurrences.w1_id.apply(id2w)
        co_occurrences['w2'] = co_occurrences.w2_id.apply(id2w)

        # Assert
        assert 2 == int(co_occurrences[((co_occurrences.w1 == 'A') & (co_occurrences.w2 == 'B'))].value)
        assert 0 == len(co_occurrences[((co_occurrences.w1 == 'C') & (co_occurrences.w2 == 'F'))])
Example #4
    def create_simple_test_corpus(self, transform_opts: TokensTransformOpts):
        data = [
            # Swedish: "This is a sentence with 14 tokens, 3 digits and 2 symbols."
            (2000, 'Detta är en mening med 14 token, 3 siffror och 2 symboler.'),
            # Swedish: "Is it, in this sentence, a sentence?"
            (2000, 'Är det i denna mening en mening?'),
        ]
        df = pd.DataFrame(data, columns=['year', 'txt'])
        reader = PandasCorpusReader(df)
        corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
        return corpus
Example #5
def very_simple_corpus(data: List[Tuple[str, List[str]]]) -> TokenizedCorpus:

    reader = tng.CorpusReader(
        source=tng.InMemorySource(data),
        reader_opts=TextReaderOpts(filename_fields="year:_:1"),
        transformer=None,  # already tokenized
    )
    corpus = TokenizedCorpus(reader=reader)
    return corpus
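A quick usage sketch for the helper above; the filenames are made up but match the `year:_:1` filename-field spec (split on `_`, take part 1 as the year):

corpus = very_simple_corpus(
    [
        ('rand_1991_A.txt', ['b', 'd', 'a', 'c', 'e']),
        ('rand_1992_B.txt', ['a', 'b', 'c']),
    ]
)
for filename, tokens in corpus:
    print(filename, tokens)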
Example #6
def create_corpus():
    reader = create_reader()
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=False,
    )
    corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
    return corpus
Example #7
    def test_processed_corpus_token_stream(self):
        df = self.create_test_dataframe()
        reader = PandasCorpusReader(df)
        corpus = TokenizedCorpus(reader, transform_opts=TokensTransformOpts())
        result = [x for x in corpus]
        expected = [
            ('document_0.txt', ['A', 'B', 'C']),
            ('document_1.txt', ['B', 'C', 'D']),
            ('document_2.txt', ['C', 'B']),
            ('document_3.txt', ['A', 'B', 'F']),
            ('document_4.txt', ['E', 'B']),
            ('document_5.txt', ['F', 'E', 'E']),
        ]
        self.assertEqual(expected, result)
Example #8
def text_corpus() -> TokenizedCorpus:
    filename_fields = dict(year=r".{5}(\d{4})_.*", serial_no=r".{9}_(\d+).*")
    reader = create_tokens_reader(filename_fields=filename_fields,
                                  fix_whitespaces=True,
                                  fix_hyphenation=True)
    transform_opts = TokensTransformOpts(
        only_any_alphanumeric=True,
        to_lower=True,
        remove_accents=False,
        min_len=2,
        max_len=None,
        keep_numerals=False,
    )
    corpus = TokenizedCorpus(reader, transform_opts=transform_opts)
    return corpus
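The two `filename_fields` regexes above pull the year and serial number out of each filename; a quick illustration with a hypothetical filename of the expected shape (five leading characters, a four-digit year, then a serial number):

import re

name = 'tran_2019_01_test.txt'  # hypothetical filename
print(re.match(r'.{5}(\d{4})_.*', name).group(1))  # '2019' -> year
print(re.match(r'.{9}_(\d+).*', name).group(1))    # '01'   -> serial_no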
Example #9
    def test_fit_transform_gives_document_term_matrix(self):
        # Arrange
        reader = PandasCorpusReader(self.create_test_dataframe())
        corpus = TokenizedCorpus(
            reader,
            transform_opts=TokensTransformOpts(
                only_any_alphanumeric=False,
                to_lower=False,
                remove_accents=False,
                min_len=1,
                max_len=None,
                keep_numerals=False,
            ),
        )
        v_corpus = CorpusVectorizer().fit_transform(corpus)

        term_term_matrix = v_corpus.co_occurrence_matrix()
        token2id = v_corpus.token2id

        assert 2 == term_term_matrix.todense()[token2id['A'], token2id['B']]
        assert 0 == term_term_matrix.todense()[token2id['C'], token2id['F']]
Example #10
def test_fit_transform_when_given_a_vocabulary_returns_same_vocabulary():

    corpus = TokenizedCorpus(
        reader=create_reader(),
        transform_opts=TokensTransformOpts(to_lower=True, min_len=10),
    )

    vocabulary = CorpusVectorizer().fit_transform(
        corpus, already_tokenized=True).token2id

    assert corpus.token2id == vocabulary

    # Reverse the token ids, assuming a six-token vocabulary (0<->5, 1<->4, 2<->3)
    expected_vocabulary_reversed = {
        k: abs(v - 5)
        for k, v in corpus.token2id.items()
    }

    vocabulary = (CorpusVectorizer().fit_transform(
        corpus,
        already_tokenized=True,
        vocabulary=expected_vocabulary_reversed).token2id)

    assert expected_vocabulary_reversed == vocabulary
Example #11
    def create_corpus(self):
        df = self.create_test_dataframe()
        reader = PandasCorpusReader(df)
        corpus = TokenizedCorpus(reader, transform_opts=TokensTransformOpts())
        return corpus
Example #12
def compute(
    name: str = None,
    corpus_folder: str = None,
    corpus_source: str = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    filename_field: str = None,
    minimum_probability: float = 0.001,
    n_tokens: int = 200,
    store_corpus: bool = False,
    compressed: bool = True,
):

    if engine not in SUPPORTED_ENGINES:
        raise ValueError(f"Engine {engine} not supported or deprecated")

    if corpus_source is None and corpus_folder is None:
        raise ValueError("either corpus_source or corpus_folder must be specified")

    if len(filename_field or []) == 0:
        raise ValueError("at least one corpus filename field must be specified")

    if corpus_folder is None:
        corpus_folder, _ = os.path.split(os.path.abspath(corpus_source))

    target_folder = os.path.join(corpus_folder, name)

    os.makedirs(target_folder, exist_ok=True)

    reader_opts = TextReaderOpts(
        filename_pattern="*.txt",
        filename_filter=None,
        filename_fields=filename_field,
    )

    transform_opts = TextTransformOpts(fix_whitespaces=False,
                                       fix_hyphenation=True)

    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=transform_opts,
        reader_opts=reader_opts,
    )

    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader,
                                              transform_opts=None)

    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    inferred_model.topic_model.save(
        os.path.join(target_folder, 'gensim.model.gz'))

    inferred_model.store(target_folder, store_compressed=compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        minimum_probability=minimum_probability,
        n_tokens=n_tokens,
    )

    inferred_topics.store(target_folder)
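A hedged invocation sketch for the runner above; the path and the filename-field spec are placeholders, and the `engine_args` keys follow common gensim LDA parameters rather than anything stated here:

compute(
    name='my_model',                      # results are written to <corpus_folder>/my_model
    corpus_source='/path/to/corpus.zip',  # hypothetical path
    filename_field=['year:_:1'],          # hypothetical filename-field spec
    engine='gensim_lda-multicore',
    engine_args=dict(num_topics=50, passes=1),  # assumed gensim-style arguments
)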
Example #13
def compute(
    *,
    target_name: str = None,
    corpus_source: str = None,
    target_folder: str = None,
    reader_opts: TextReaderOpts = None,
    text_transform_opts: TextTransformOpts = None,
    transform_opts: TokensTransformOpts = None,
    engine: str = "gensim_lda-multicore",
    engine_args: dict = None,
    store_corpus: bool = False,
    store_compressed: bool = True,
    n_tokens: int = 200,
    minimum_probability: float = 0.001,
):
    """ runner """

    tokens_reader = TextTokenizer(
        source=corpus_source,
        transform_opts=text_transform_opts,
        reader_opts=reader_opts,
    )

    corpus: TokenizedCorpus = TokenizedCorpus(reader=tokens_reader,
                                              transform_opts=transform_opts)

    train_corpus: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=corpus,
        document_index=corpus.document_index,
        token2id=corpus.token2id,
        corpus_options=dict(
            reader_opts=reader_opts.props,
            transform_opts=transform_opts.props,
        ),
    )

    inferred_model: tm.InferredModel = tm.train_model(
        train_corpus=train_corpus,
        method=engine,
        engine_args=engine_args,
    )

    inferred_model.topic_model.save(jj(target_folder, 'gensim.model.gz'))

    inferred_model.store(target_folder, store_compressed=store_compressed)

    if store_corpus:
        train_corpus.store(target_folder)

    inferred_topics: tm.InferredTopicsData = tm.predict_topics(
        inferred_model.topic_model,
        corpus=train_corpus.corpus,
        id2token=train_corpus.id2token,
        document_index=train_corpus.document_index,
        n_tokens=n_tokens,
        minimum_probability=minimum_probability,
    )

    inferred_topics.store(target_folder)

    return dict(folder=target_folder, tag=target_name)
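A corresponding call sketch for this keyword-only variant; every value below is an illustrative assumption, with the option classes taken from the signature:

result = compute(
    target_name='my_model',
    corpus_source='/path/to/corpus.zip',  # hypothetical path
    target_folder='/path/to/output',      # hypothetical path
    reader_opts=TextReaderOpts(filename_pattern='*.txt', filename_fields=['year:_:1']),
    text_transform_opts=TextTransformOpts(fix_whitespaces=False, fix_hyphenation=True),
    transform_opts=TokensTransformOpts(to_lower=True, min_len=2),
    engine_args=dict(num_topics=50),      # assumed gensim-style argument
)
print(result)  # dict(folder='/path/to/output', tag='my_model')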