コード例 #1
0
ファイル: tasks.py プロジェクト: humlab/penelope
    def process_stream(self) -> Iterable[DocumentPayload]:
        """Ingest the upstream co-occurrence payloads and yield one bundled result.

        Each upstream payload whose content is not ``None`` is fed to the
        normal corpus builder and, when ``self.context_opts.concept`` is
        truthy, to a concept corpus builder as well. The resulting corpora,
        the token vocabulary, the document index, the stored compute options
        and the pair-vocabulary mapping are packed into a ``Bundle`` (compressed
        if ``self.compress`` is truthy), which is yielded as the content of a
        single ``DocumentPayload``.

        Raises:
            CoOccurrenceError: If ``self.document_index`` is ``None``. Since
                this is a generator, the error surfaces on first iteration.
        """
        if self.document_index is None:
            raise CoOccurrenceError("expected document index found no such thing")

        token2id: Token2Id = self.pipeline.payload.token2id
        pair2id: Token2Id = Token2Id()

        normal_builder = CoOccurrenceCorpusBuilder(
            VectorizeType.Normal, self.document_index, pair2id, token2id
        )

        # A concept builder is only created when a concept is configured.
        concept_builder = None
        if self.context_opts.concept:
            concept_builder = CoOccurrenceCorpusBuilder(
                VectorizeType.Concept, self.document_index, pair2id, token2id
            )

        upstream = self.prior.outstream(desc="Ingest", total=len(self.document_index))
        for item in upstream:
            # Payloads without content carry nothing to ingest.
            if item.content is None:
                continue
            coo_payload: CoOccurrencePayload = item.content
            normal_builder.ingest_pairs(coo_payload).add(payload=coo_payload)
            if concept_builder:
                concept_builder.add(payload=coo_payload)

        pair2id.close()

        # Translation between id-pair (single vocab IDs) and pair-pid (pair
        # vocab IDs). The copy is taken before translate_id_pair_to_token is
        # invoked on pair2id below.
        token_ids_2_pair_id: Mapping[Tuple[int, int], int] = dict(pair2id.data)

        self.translate_id_pair_to_token(pair2id, token2id)

        # The concept corpus is resolved before the normal corpus, as in the
        # original statement order.
        if concept_builder:
            concept_corpus = concept_builder.corpus.remember(
                window_counts=self.get_window_counts(concept_builder)
            )
        else:
            concept_corpus = None

        corpus = normal_builder.corpus.remember(
            window_counts=self.get_window_counts(normal_builder)
        )

        bundle = Bundle(
            corpus=corpus,
            token2id=token2id,
            document_index=self.document_index,
            concept_corpus=concept_corpus,
            compute_options=self.pipeline.payload.stored_opts(),
            vocabs_mapping=token_ids_2_pair_id,
        )

        if self.compress:
            bundle.compress()

        yield DocumentPayload(content=bundle)
コード例 #2
0
def test_keyness_transform_corpus2(tag: str, keyness_source: KeynessMetricSource, keyness: KeynessMetric):
    """Check that a keyness transform of the test bundle for `tag` yields a corpus.

    The bundle is loaded from ``./tests/test_data/{tag}`` and transformed with
    a yearly pivot using the given keyness source and metric; the test only
    asserts that the result is not ``None``.
    """
    bundle: Bundle = Bundle.load(folder=f'./tests/test_data/{tag}', tag=tag, compute_frame=False)

    corpus: VectorizedCorpus = bundle.keyness_transform(
        opts=ComputeKeynessOpts(
            period_pivot="year",
            keyness_source=keyness_source,
            keyness=keyness,
            tf_threshold=10,
            pivot_column_name='time_period',
            normalize=False,
            fill_gaps=False,
        )
    )

    assert corpus is not None
コード例 #3
0
def test_zero_out_by_tf_threshold():
    """Check that `zero_out_by_tf_threshold` zeroes term frequencies below the threshold.

    The term frequencies of the 'ABCDEFG_7DOCS_CONCEPT' test bundle are first
    compared with the known sums; after zeroing out with threshold 10, every
    sum below 10 is expected to be 0 and all others unchanged.
    """
    tag: str = 'ABCDEFG_7DOCS_CONCEPT'
    bundle: Bundle = Bundle.load(folder=f'./tests/test_data/{tag}', tag=tag, compute_frame=False)
    corpus: VectorizedCorpus = bundle.corpus

    tf_before = [28, 12, 9, 11, 39, 34, 7, 8, 15, 16, 10, 34, 8, 28, 14, 19, 28, 23, 23, 9, 16, 9, 16, 4, 16, 17, 4]
    assert (corpus.term_frequency == tf_before).all()

    tf_threshold: int = 10
    # Expected result: values strictly below the threshold become zero.
    tf_after = [0 if tf < tf_threshold else tf for tf in tf_before]

    corpus.zero_out_by_tf_threshold(tf_threshold)

    assert (corpus.term_frequency == tf_after).all()
コード例 #4
0
ファイル: tabular_gui_test.py プロジェクト: humlab/penelope
def test_table_gui_debug_setup(tag: str, keyness: KeynessMetric):
    """Smoke-test configuring a `TabularCoOccurrenceGUI` and updating its corpus.

    The bundle for `tag` is loaded from ``./tests/test_data/{tag}``, the GUI
    settings are assigned between `stop_observe()` and `start_observe()`, and
    `update_corpus()` is then called.
    """
    bundle: Bundle = Bundle.load(folder=f'./tests/test_data/{tag}', tag=tag, compute_frame=False)
    assert bundle is not None

    gui: TabularCoOccurrenceGUI = TabularCoOccurrenceGUI(bundle=bundle)

    gui.stop_observe()
    # Settings are applied in this exact order.
    settings = (
        ("pivot", "year"),
        ("keyness_source", KeynessMetricSource.Full),
        ("keyness", keyness),
        ("token_filter", ""),
        ("global_threshold", 1),
        ("concepts", set()),
        ("largest", 10),
    )
    for attribute, value in settings:
        setattr(gui, attribute, value)
    gui.start_observe()

    gui.update_corpus()
コード例 #5
0
def load_bundle(folder: str, tag: str):
    """Load the bundle identified by `folder` and `tag`, passing ``compute_frame=False``."""
    return Bundle.load(to_filename(folder=folder, tag=tag), compute_frame=False)
コード例 #6
0
ファイル: utils.py プロジェクト: humlab/penelope
def create_bundle(tag: str = 'DUMMY') -> Bundle:
    """Load the test bundle for `tag` from ``./tests/test_data/{tag}`` with ``compute_frame=False``."""
    path = to_filename(folder=f'./tests/test_data/{tag}', tag=tag)
    return Bundle.load(path, compute_frame=False)
コード例 #7
0
def create_bundle() -> Bundle:
    """Load the 'VENUS' test bundle from ``./tests/test_data/VENUS`` with ``compute_frame=False``."""
    path = to_filename(folder='./tests/test_data/VENUS', tag='VENUS')
    return Bundle.load(path, compute_frame=False)
コード例 #8
0
ファイル: tabular_gui_test.py プロジェクト: humlab/penelope
def bundle():
    """Load the 'SSI' test bundle from ``./tests/test_data/SSI`` with ``compute_frame=False``."""
    path = to_filename(folder='./tests/test_data/SSI', tag='SSI')
    return Bundle.load(path, compute_frame=False)