Example #1
 def test_run_url_special_characters(self):
     path = "http://file.biolab.si/text-semantics/data/" \
            "elektrotehniski-vestnik-clanki/"
     importer = ImportDocuments(path, True)
     corpus, errors, _, _, _, _ = importer.run()
     self.assertGreater(len(corpus), 0)
     self.assertEqual(0, len(errors))
Example #2
 def test_read_meta_data_url(self):
     path = "http://file.biolab.si/text-semantics/data/semeval/"
     importer = ImportDocuments(path, True)
     _, meta_paths = importer._retrieve_paths()
     callback = importer._shared_callback(len(meta_paths))
     data1, err = importer._read_meta_data(meta_paths, callback)
     self.assertIsInstance(data1, pd.DataFrame)
     self.assertEqual(len(err), 0)
Example #3
 def test_conllu_reader(self):
     path = os.path.join(os.path.dirname(__file__),
                         "../widgets/tests/data/conllu")
     importer = ImportDocuments(path)
     corpus, errors, lemma, pos, ner, _ = importer.run()
     self.assertEqual(len(corpus), 5)
     self.assertEqual(len(corpus), len(lemma))
     self.assertEqual(len(corpus), len(pos))
     self.assertEqual(len(corpus), len(ner))
Example #4
    def test_run_url(self):
        path = "http://file.biolab.si/text-semantics/data" \
               "/predlogi-vladi-sample/"
        importer = ImportDocuments(path, True)
        corpus1, _ = importer.run()
        self.assertGreater(len(corpus1), 0)

        mask = np.ones_like(corpus1.metas, dtype=bool)
        mask[:, 1] = False

        path = "http://file.biolab.si/text-semantics/data" \
               "/predlogi-vladi-sample////"
        importer = ImportDocuments(path, True)
        corpus2, _ = importer.run()
        self.assertGreater(len(corpus2), 0)
        self.assertEqual(corpus1.metas[mask].tolist(),
                         corpus2.metas[mask].tolist())

        path = "http://file.biolab.si/text-semantics/data" \
               "/predlogi-vladi-sample"
        importer = ImportDocuments(path, True)
        corpus3, _ = importer.run()
        self.assertGreater(len(corpus3), 0)
        self.assertEqual(corpus1.metas[mask].tolist(),
                         corpus3.metas[mask].tolist())
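Example #4 hinges on the importer treating the same folder URL as equal regardless of trailing slashes, so the metadata columns (apart from the volatile one masked out above) match across all three runs. A minimal sketch of that kind of trailing-slash normalization, assuming a simple rstrip-based approach rather than the importer's actual implementation:

def normalize_folder_url(url):
    # Collapse any run of trailing slashes so ".../sample", ".../sample/"
    # and ".../sample////" all refer to the same folder listing.
    return url.rstrip("/") + "/"


assert normalize_folder_url(
    "http://file.biolab.si/text-semantics/data/predlogi-vladi-sample////"
) == normalize_folder_url(
    "http://file.biolab.si/text-semantics/data/predlogi-vladi-sample"
)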
Example #5
    def start(self):
        """
        Start/execute the text indexing operation
        """
        self.error()
        self.Warning.clear()
        self.progress_widget.setValue(0)

        self.__invalidated = False
        if self.currentPath is None:
            return

        if self.__state == State.Processing:
            assert self.__pendingTask is not None
            log.info("Starting a new task while one is in progress. "
                     "Cancel the existing task (dir:'{}')".format(
                         self.__pendingTask.startdir))
            self.cancel()

        startdir = self.currentPath

        self.__setRuntimeState(State.Processing)

        report_progress = methodinvoke(self, "__onReportProgress", (object, ))

        task = ImportDocuments(startdir, report_progress=report_progress)

        # collect the task state in one convenient place
        self.__pendingTask = taskstate = namespace(
            task=task,
            startdir=startdir,
            future=None,
            watcher=None,
            cancelled=False,
            cancel=None,
        )

        def cancel():
            # Cancel the task and disconnect
            if not taskstate.future.cancel():
                taskstate.task.cancelled = True
                taskstate.cancelled = True
                try:
                    taskstate.future.result(timeout=0)
                except UserInterruptError:
                    pass
                except TimeoutError:
                    log.info("The task did not stop in in a timely manner")
            taskstate.watcher.finished.disconnect(self.__onRunFinished)

        taskstate.cancel = cancel

        def run_text_scan_task_interrupt():
            try:
                return task.run()
            except UserInterruptError:
                # Suppress interrupt errors, so they are not logged
                return

        taskstate.future = self.__executor.submit(run_text_scan_task_interrupt)
        taskstate.watcher = FutureWatcher(taskstate.future)
        taskstate.watcher.finished.connect(self.__onRunFinished)
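Example #5 shows the widget-side orchestration: any running scan is cancelled before a new ImportDocuments task is submitted to an executor, the task state is collected in one namespace, and a cancel closure falls back to a cooperative flag when the future can no longer be cancelled outright. The sketch below reproduces that submit/cancel pattern with the standard library only; ScanTask, submit_scan and guarded_run are illustrative stand-ins, not the widget's actual classes (which use Orange's FutureWatcher and methodinvoke for Qt signalling).

from concurrent.futures import ThreadPoolExecutor, TimeoutError
from types import SimpleNamespace
import time


class UserInterruptError(Exception):
    """Raised by a task that notices it has been cancelled."""


class ScanTask:
    def __init__(self, startdir):
        self.startdir = startdir
        self.cancelled = False

    def run(self):
        for _ in range(10):            # stand-in for scanning files
            if self.cancelled:         # cooperative cancellation point
                raise UserInterruptError()
            time.sleep(0.01)
        return "corpus for " + self.startdir


def submit_scan(executor, startdir):
    task = ScanTask(startdir)
    state = SimpleNamespace(task=task, future=None, cancelled=False, cancel=None)

    def guarded_run():
        # Suppress interrupt errors so they are not logged, as in the widget.
        try:
            return task.run()
        except UserInterruptError:
            return None

    def cancel():
        # Not started yet -> future.cancel() succeeds; already running ->
        # ask the task to stop cooperatively and give it a zero-timeout check.
        if not state.future.cancel():
            state.task.cancelled = True
            state.cancelled = True
            try:
                state.future.result(timeout=0)
            except (UserInterruptError, TimeoutError):
                pass

    state.future = executor.submit(guarded_run)
    state.cancel = cancel
    return state


executor = ThreadPoolExecutor(max_workers=1)
pending = submit_scan(executor, "/old/dir")
pending.cancel()                        # a new request cancels the running scan
pending = submit_scan(executor, "/new/dir")
print(pending.future.result())          # -> "corpus for /new/dir"
executor.shutdown()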
Example #6
 def test_scan_url_csv(self):
     path = "http://file.biolab.si/text-semantics/data/"
     importer = ImportDocuments(path, True)
     paths = importer.scan_url(path, include_patterns=["*.csv"])
     self.assertGreater(len(paths), 0)
Example #7
 def test_scan_url(self):
     path = "http://file.biolab.si/text-semantics/data/semeval/"
     importer = ImportDocuments(path, True)
     paths = importer.scan_url(path)
     self.assertGreater(len(paths), 0)
Example #8
 def test_url_errors(self, _):
     path = "http://file.biolab.si/text-semantics/data/elektrotehniski-vestnik-clanki/"
     importer = ImportDocuments(path, True)
     corpus, errors, _, _, _, _ = importer.run()
     self.assertIsNone(corpus)
     self.assertGreater(len(errors), 0)
Example #9
    def test_merge_metadata_url(self):
        path = "http://file.biolab.si/text-semantics/data/semeval/"
        importer = ImportDocuments(path, True)
        file_paths, meta_paths = importer._retrieve_paths()
        callback = importer._shared_callback(len(file_paths) + len(meta_paths))
        text_data, _, _, _, _, _ = importer._read_text_data(file_paths, callback)
        meta_data, _ = importer._read_meta_data(meta_paths, callback)

        importer._text_data = text_data[:4]  # 'C-1', 'C-14', 'C-17', 'C-18'
        importer._meta_data = meta_data[:50]
        corpus = importer._create_corpus()
        corpus = importer._add_metadata(corpus)
        self.assertGreater(len(corpus), 0)
        columns = ["name", "path", "content", "Content",
                   "Text file", "Keywords"]
        self.assertEqual([v.name for v in corpus.domain.metas], columns)

        importer._text_data = text_data[:4]  # 'C-1', 'C-14', 'C-17', 'C-18'
        importer._meta_data = None
        corpus = importer._create_corpus()
        corpus = importer._add_metadata(corpus)
        self.assertGreater(len(corpus), 0)
        columns = ["name", "path", "content"]
        self.assertEqual([v.name for v in corpus.domain.metas], columns)
Example #10
 def test_read_meta_data_url(self):
     path = "http://file.biolab.si/text-semantics/data/semeval/"
     importer = ImportDocuments(path, True)
     data1, err = importer._read_meta_data()
     self.assertIsInstance(data1, pd.DataFrame)
     self.assertEqual(len(err), 0)
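Read together, the examples suggest the following minimal usage. This is a sketch, not canonical API documentation: the module path orangecontrib.text.import_documents and the six-value return signature are assumptions that follow Examples #1 and #3 and may differ between versions of the add-on.

from orangecontrib.text.import_documents import ImportDocuments

# Local directory of documents (the path is a placeholder).
importer = ImportDocuments("path/to/documents")
corpus, errors, lemma, pos, ner, _ = importer.run()
if corpus is not None:
    print(len(corpus), "documents imported,", len(errors), "errors")

# Remote folder: pass the URL and True, as the URL-based tests above do.
importer = ImportDocuments(
    "http://file.biolab.si/text-semantics/data/semeval/", True)
corpus, errors, *_ = importer.run()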