def _index_maxqda_coded_news_csv_dataset(self) -> None:
    """Index a MAXQDA-coded news CSV dataset, then its code annotations.

    Loads all documents from the source CSV, creates a fresh index, bulk
    indexes the documents with a progress bar, and finally delegates to
    `_index_maxqda_coded_news_csv_code` for the per-code files.
    """
    source = self._settings.source_maxqda_coded_news_csv
    assert source

    # Key by the stringified CSV row index so that coded documents can be
    # joined back to their source rows by `_index_maxqda_coded_news_csv_code`.
    news_csv_document_dicts = {
        str(document_dict["index"]): document_dict
        for document_dict in load_document_dicts_from_news_csv(
            source.file, lang=source.lang, progress_bar=False)
    }

    new_index(self._settings.index, TokenizedMaxqdaCodedNewsCsvDocument)
    add_documents_to_index(
        self._settings.index,
        TokenizedMaxqdaCodedNewsCsvDocument,
        tqdm(
            news_csv_document_dicts.values(),
            desc=source.file.name,
            total=len(news_csv_document_dicts),
            dynamic_ncols=True,
        ),
        max_retries=self._max_retries,
        num_procs=self._num_procs,
    )

    self._index_maxqda_coded_news_csv_code(
        source.codes, news_csv_document_dicts)
def _index_news_csv_dataset(self) -> None:
    """Create a fresh index and fill it from a plain news CSV source."""
    source = self._settings.source_news_csv
    assert source

    index_name = self._settings.index
    new_index(index_name, TokenizedNewsCsvDocument)

    document_dicts = load_document_dicts_from_news_csv(
        source.file, lang=source.lang)
    add_documents_to_index(
        index_name,
        TokenizedNewsCsvDocument,
        document_dicts,
        max_retries=self._max_retries,
        num_procs=self._num_procs,
    )
def _index_nasty_dataset(self) -> None:
    """Index NASTY batch results, resumable across interrupted runs.

    A companion index (settings index + `_INDEXED_SUFFIX`) records which
    data files were already fully ingested, so re-running skips them.
    A byte-based progress bar tracks overall progress over all data files.
    """
    source = self._settings.source_nasty
    assert source

    if not Index(self._settings.index).exists():
        new_index(self._settings.index,
                  TokenizedNastyBatchResultsTwitterDocument)

    indexed_index = self._settings.index + _INDEXED_SUFFIX
    if not Index(indexed_index).exists():
        new_index(indexed_index, IndexedFilesDocument)

    batch_results = BatchResults(source.batch_results_dir)

    # Stat each data file exactly once and cache the sizes; the original
    # re-statted every file for each progress-bar update.
    data_files_and_sizes = []
    for batch_entry in batch_results:
        data_file = source.batch_results_dir / batch_entry.data_file_name
        data_files_and_sizes.append((data_file, data_file.stat().st_size))

    with tqdm(
        desc=self._settings.name,
        total=sum(size for _, size in data_files_and_sizes),
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        dynamic_ncols=True,
        position=1,
    ) as progress_bar:
        for data_file, data_file_size in data_files_and_sizes:
            # Skip files recorded as fully indexed by a previous run.
            if (IndexedFilesDocument.search(index=indexed_index).query(
                    "term", file_name=data_file.name).execute()):
                _LOGGER.debug(
                    "Data file '{}' is already indexed, skipping.",
                    data_file.name,
                )
                progress_bar.update(data_file_size)
                continue

            add_documents_to_index(
                self._settings.index,
                TokenizedNastyBatchResultsTwitterDocument,
                load_document_dicts_from_nasty_batch_results(data_file),
                max_retries=self._max_retries,
                num_procs=self._num_procs,
            )
            # Only mark the file as done after indexing succeeded, so a
            # crash mid-file causes it to be retried on the next run.
            IndexedFilesDocument(file_name=data_file.name).save(
                index=indexed_index)
            progress_bar.update(data_file_size)
def _index_maxqda_coded_nasty_code(
        self, codes: Sequence[DatasetSourceMaxqdaCodeSection],
        lang: str) -> None:
    """Index MAXQDA-coded NASTY CSVs for each code section, recursively."""
    for code in codes:
        if code.file:
            document_dicts = load_document_dicts_from_maxqda_coded_nasty_csv(
                code.file, code.code_identifier, lang)
            add_documents_to_index(
                self._settings.index,
                TokenizedMaxqdaCodedNastyDocument,
                document_dicts,
                max_retries=self._max_retries,
                num_procs=self._num_procs,
            )
        if code.codes:
            # Descend into nested code sections.
            self._index_maxqda_coded_nasty_code(code.codes, lang)
def _index_maxqda_coded_news_csv_code(
    self,
    codes: Sequence[DatasetSourceMaxqdaCodeSection],
    news_csv_document_dicts: Mapping[str, Mapping[str, object]],
) -> None:
    """Index MAXQDA-coded news CSVs for each code section, recursively.

    `news_csv_document_dicts` maps stringified row indices to the already
    loaded source documents, used to resolve coded rows back to documents.
    """
    for code in codes:
        if code.file:
            document_dicts = load_document_dicts_from_maxqda_coded_news_csv(
                code.file,
                code.code_identifier,
                news_csv_document_dicts,
            )
            add_documents_to_index(
                self._settings.index,
                TokenizedMaxqdaCodedNewsCsvDocument,
                document_dicts,
                max_retries=self._max_retries,
                num_procs=self._num_procs,
            )
        if code.codes:
            # Descend into nested code sections.
            self._index_maxqda_coded_news_csv_code(
                code.codes, news_csv_document_dicts)