def test_run_url(self):
    """Importing from the same URL with zero, one, or many trailing
    slashes must yield identical corpora (ignoring the path column).

    BUG FIX: the original asserted ``len(corpus1) > 0`` after building
    corpus2 and ``len(corpus2) > 0`` after building corpus3 (copy-paste
    error), so the freshly built corpora were never length-checked.
    """
    path = "http://file.biolab.si/text-semantics/data" \
           "/predlogi-vladi-sample/"
    importer = ImportDocuments(path, True)
    corpus1, _ = importer.run()
    self.assertGreater(len(corpus1), 0)

    # Mask out column 1 of the metas before comparing; that column is
    # excluded from the equality checks below because it differs
    # between runs with different URL spellings.
    mask = np.ones_like(corpus1.metas, dtype=bool)
    mask[:, 1] = False

    path = "http://file.biolab.si/text-semantics/data" \
           "/predlogi-vladi-sample////"
    importer = ImportDocuments(path, True)
    corpus2, _ = importer.run()
    self.assertGreater(len(corpus2), 0)
    self.assertEqual(corpus1.metas[mask].tolist(),
                     corpus2.metas[mask].tolist())

    path = "http://file.biolab.si/text-semantics/data" \
           "/predlogi-vladi-sample"
    importer = ImportDocuments(path, True)
    corpus3, _ = importer.run()
    self.assertGreater(len(corpus3), 0)
    self.assertEqual(corpus1.metas[mask].tolist(),
                     corpus3.metas[mask].tolist())
def test_run_url_special_characters(self):
    """A remote directory whose name contains special characters
    imports successfully and reports no per-document errors."""
    url = "http://file.biolab.si/text-semantics/data/" \
          "elektrotehniski-vestnik-clanki/"
    corpus, errors, _, _, _, _ = ImportDocuments(url, True).run()
    self.assertGreater(len(corpus), 0)
    self.assertEqual(0, len(errors))
def test_read_meta_data_url(self):
    """Reading metadata files from a remote directory returns a
    pandas DataFrame and an empty error list.

    NOTE(review): a method with this same name also appears later in
    the file — if both live in one TestCase the later definition
    shadows this one; confirm and rename if so.
    """
    url = "http://file.biolab.si/text-semantics/data/semeval/"
    doc_importer = ImportDocuments(url, True)
    _, meta_paths = doc_importer._retrieve_paths()
    progress = doc_importer._shared_callback(len(meta_paths))
    frame, errors = doc_importer._read_meta_data(meta_paths, progress)
    self.assertIsInstance(frame, pd.DataFrame)
    self.assertEqual(len(errors), 0)
def test_conllu_reader(self):
    """A local directory of .conllu files yields a 5-document corpus
    whose lemma, POS and NER annotation lists match it in length."""
    conllu_dir = os.path.join(
        os.path.dirname(__file__), "../widgets/tests/data/conllu")
    corpus, errors, lemma, pos, ner, _ = ImportDocuments(conllu_dir).run()
    self.assertEqual(len(corpus), 5)
    for annotations in (lemma, pos, ner):
        self.assertEqual(len(corpus), len(annotations))
def test_merge_metadata_url(self):
    """Merging remote metadata into a corpus appends the meta-table
    columns; with no metadata only the default columns remain."""
    url = "http://file.biolab.si/text-semantics/data/semeval/"
    importer = ImportDocuments(url, True)
    texts, _ = importer._read_text_data()
    metas, _ = importer._read_meta_data()

    # With metadata present the merged corpus carries extra columns.
    importer._text_data = texts[:4]  # 'C-1', 'C-14', 'C-17', 'C-18'
    importer._meta_data = metas[:50]
    merged = importer._create_corpus()
    merged = importer._add_metadata(merged)
    self.assertGreater(len(merged), 0)
    expected = [
        "name", "path", "content", "Content", "Text file", "Keywords"
    ]
    self.assertEqual([v.name for v in merged.domain.metas], expected)

    # Without metadata only the default meta columns are present.
    importer._text_data = texts[:4]  # 'C-1', 'C-14', 'C-17', 'C-18'
    importer._meta_data = None
    plain = importer._create_corpus()
    plain = importer._add_metadata(plain)
    self.assertGreater(len(plain), 0)
    self.assertEqual([v.name for v in plain.domain.metas],
                     ["name", "path", "content"])
def start(self):
    """
    Start/execute the text indexing operation.

    Clears previous errors/warnings and resets progress; if a task is
    already running it is cancelled first, then a new ImportDocuments
    scan of ``self.currentPath`` is submitted to the executor with a
    progress callback and a finished-watcher wired up.

    BUG FIX: the TimeoutError log message previously read "did not
    stop in in a timely manner" (duplicated word).
    """
    self.error()
    self.Warning.clear()
    self.progress_widget.setValue(0)
    self.__invalidated = False
    if self.currentPath is None:
        return

    if self.__state == State.Processing:
        assert self.__pendingTask is not None
        log.info("Starting a new task while one is in progress. "
                 "Cancel the existing task (dir:'{}')".format(
                     self.__pendingTask.startdir))
        self.cancel()

    startdir = self.currentPath
    self.__setRuntimeState(State.Processing)
    report_progress = methodinvoke(self, "__onReportProgress", (object,))
    task = ImportDocuments(startdir, report_progress=report_progress)

    # collect the task state in one convenient place
    self.__pendingTask = taskstate = namespace(
        task=task,
        startdir=startdir,
        future=None,
        watcher=None,
        cancelled=False,
        cancel=None,
    )

    def cancel():
        # Cancel the task and disconnect
        if taskstate.future.cancel():
            pass
        else:
            # Future already running: ask the task to stop cooperatively
            # and wait briefly for it to acknowledge.
            taskstate.task.cancelled = True
            taskstate.cancelled = True
            try:
                taskstate.future.result(timeout=0)
            except UserInterruptError:
                pass
            except TimeoutError:
                log.info("The task did not stop in a timely manner")
        taskstate.watcher.finished.disconnect(self.__onRunFinished)

    taskstate.cancel = cancel

    def run_text_scan_task_interupt():
        try:
            return task.run()
        except UserInterruptError:
            # Suppress interrupt errors, so they are not logged
            return

    taskstate.future = self.__executor.submit(run_text_scan_task_interupt)
    taskstate.watcher = FutureWatcher(taskstate.future)
    taskstate.watcher.finished.connect(self.__onRunFinished)
def test_scan_url_csv(self):
    """Scanning a remote directory with a csv include pattern finds
    at least one matching file."""
    url = "http://file.biolab.si/text-semantics/data/"
    found = ImportDocuments(url, True).scan_url(
        url, include_patterns=["*.csv"])
    self.assertGreater(len(found), 0)
def test_scan_url(self):
    """Scanning a remote directory without filters returns a
    non-empty list of paths."""
    url = "http://file.biolab.si/text-semantics/data/semeval/"
    found = ImportDocuments(url, True).scan_url(url)
    self.assertGreater(len(found), 0)
def test_url_errors(self, _):
    """When downloads fail (patched by the mock argument, whose
    decorator is outside this view), run() returns no corpus and a
    non-empty error list."""
    url = ("http://file.biolab.si/text-semantics/data/"
           "elektrotehniski-vestnik-clanki/")
    corpus, errors, _, _, _, _ = ImportDocuments(url, True).run()
    self.assertIsNone(corpus)
    self.assertGreater(len(errors), 0)
def test_read_meta_data_url(self):
    """Reading metadata via the argument-less overload returns a
    pandas DataFrame and no errors.

    NOTE(review): this duplicates the name of an earlier test method —
    if both live in the same TestCase this definition shadows the
    earlier one; confirm and rename if so.
    """
    url = "http://file.biolab.si/text-semantics/data/semeval/"
    frame, errors = ImportDocuments(url, True)._read_meta_data()
    self.assertIsInstance(frame, pd.DataFrame)
    self.assertEqual(len(errors), 0)