def process_stream(self) -> Iterable[DocumentPayload]:
    """Ingest the prior task's payloads into co-occurrence corpora and yield one bundle.

    Every non-empty payload content from ``self.prior.outstream`` is fed to a
    ``VectorizeType.Normal`` builder and, when ``self.context_opts.concept`` is
    truthy, also to a ``VectorizeType.Concept`` builder. Both builders share the
    same pair vocabulary (``pair2id``).

    Yields:
        A single ``DocumentPayload`` whose content is the assembled ``Bundle``
        (compressed first when ``self.compress`` is truthy).

    Raises:
        CoOccurrenceError: if ``self.document_index`` is ``None``.
    """
    if self.document_index is None:
        raise CoOccurrenceError("expected document index found no such thing")

    token2id: Token2Id = self.pipeline.payload.token2id
    pair2id: Token2Id = Token2Id()

    normal_builder: CoOccurrenceCorpusBuilder = CoOccurrenceCorpusBuilder(
        VectorizeType.Normal, self.document_index, pair2id, token2id
    )

    # The concept builder is only created when a concept is configured.
    concept_builder: CoOccurrenceCorpusBuilder = None
    if self.context_opts.concept:
        concept_builder = CoOccurrenceCorpusBuilder(
            VectorizeType.Concept, self.document_index, pair2id, token2id
        )

    upstream = self.prior.outstream(desc="Ingest", total=len(self.document_index))
    for item in upstream:
        # Payloads without content are skipped.
        if item.content is None:
            continue
        coo_payload: CoOccurrencePayload = item.content

        # Only the normal builder ingests pairs; both builders add the payload.
        normal_builder.ingest_pairs(coo_payload).add(payload=coo_payload)
        if concept_builder:
            concept_builder.add(payload=coo_payload)

    pair2id.close()

    # Translation between id-pair (single vocab IDs) and pair-pid (pair vocab IDs).
    # The snapshot is taken before translate_id_pair_to_token is called
    # (presumably because that call alters pair2id -- confirm).
    token_ids_2_pair_id: Mapping[Tuple[int, int], int] = dict(pair2id.data)

    self.translate_id_pair_to_token(pair2id, token2id)

    # The concept corpus is finalized before the normal corpus.
    concept_corpus: VectorizedCorpus = None
    if concept_builder:
        concept_corpus = concept_builder.corpus.remember(
            window_counts=self.get_window_counts(concept_builder)
        )

    corpus: VectorizedCorpus = normal_builder.corpus.remember(
        window_counts=self.get_window_counts(normal_builder)
    )

    result: Bundle = Bundle(
        corpus=corpus,
        token2id=token2id,
        document_index=self.document_index,
        concept_corpus=concept_corpus,
        compute_options=self.pipeline.payload.stored_opts(),
        vocabs_mapping=token_ids_2_pair_id,
    )

    if self.compress:
        result.compress()

    yield DocumentPayload(content=result)
def test_keyness_transform_corpus2(
    tag: str, keyness_source: KeynessMetricSource, keyness: KeynessMetric
):
    """Smoke test: a keyness transform of the stored bundle for ``tag`` returns a corpus."""
    stored: Bundle = Bundle.load(
        folder=f'./tests/test_data/{tag}', tag=tag, compute_frame=False
    )

    transform_opts: ComputeKeynessOpts = ComputeKeynessOpts(
        period_pivot="year",
        keyness_source=keyness_source,
        keyness=keyness,
        tf_threshold=10,
        pivot_column_name='time_period',
        normalize=False,
        fill_gaps=False,
    )

    transformed: VectorizedCorpus = stored.keyness_transform(opts=transform_opts)

    assert transformed is not None
def test_zero_out_by_tf_threshold():
    """Check that zero_out_by_tf_threshold zeroes term frequencies below the threshold."""
    tag: str = 'ABCDEFG_7DOCS_CONCEPT'
    tf_threshold: int = 10
    expected_sums = [
        28, 12, 9, 11, 39, 34, 7, 8, 15,
        16, 10, 34, 8, 28, 14, 19, 28, 23,
        23, 9, 16, 9, 16, 4, 16, 17, 4,
    ]

    stored: Bundle = Bundle.load(
        folder=f'./tests/test_data/{tag}', tag=tag, compute_frame=False
    )
    corpus: VectorizedCorpus = stored.corpus

    # Term frequencies of the loaded corpus match the expected sums.
    assert (corpus.term_frequency == expected_sums).all()

    # Expected result: every sum strictly below the threshold becomes zero.
    expected_after = [
        0 if total < tf_threshold else total for total in expected_sums
    ]

    corpus.zero_out_by_tf_threshold(tf_threshold)

    assert (corpus.term_frequency == expected_after).all()
def test_table_gui_debug_setup(tag: str, keyness: KeynessMetric):
    """Configure a TabularCoOccurrenceGUI on a stored bundle and run update_corpus."""
    stored: Bundle = Bundle.load(
        folder=f'./tests/test_data/{tag}', tag=tag, compute_frame=False
    )
    assert stored is not None

    gui: TabularCoOccurrenceGUI = TabularCoOccurrenceGUI(bundle=stored)

    # Settings are assigned between stop_observe() and start_observe().
    gui.stop_observe()
    gui.pivot = "year"
    gui.keyness_source = KeynessMetricSource.Full
    gui.keyness = keyness
    gui.token_filter = ""
    gui.global_threshold = 1
    gui.concepts = set()
    gui.largest = 10
    gui.start_observe()

    gui.update_corpus()
def load_bundle(folder: str, tag: str):
    """Load the bundle addressed by ``folder`` and ``tag``, passing ``compute_frame=False``."""
    return Bundle.load(to_filename(folder=folder, tag=tag), compute_frame=False)
def create_bundle(tag: str = 'DUMMY') -> Bundle:
    """Load the test bundle stored in ``./tests/test_data/<tag>``, passing ``compute_frame=False``."""
    path = to_filename(folder=f'./tests/test_data/{tag}', tag=tag)
    return Bundle.load(path, compute_frame=False)
def create_bundle() -> Bundle:
    """Load the ``VENUS`` test bundle, passing ``compute_frame=False``."""
    path = to_filename(folder='./tests/test_data/VENUS', tag='VENUS')
    return Bundle.load(path, compute_frame=False)
def bundle():
    """Load the ``SSI`` test bundle, passing ``compute_frame=False``."""
    path = to_filename(folder='./tests/test_data/SSI', tag='SSI')
    loaded: Bundle = Bundle.load(path, compute_frame=False)
    return loaded