def get_extraction_runner():
   """Create an ExtractionRunner with logging enabled and its runnables registered.

   Logging is pointed at '~/logs/results' and '~/logs/runnables', then each
   runnable class below is added in the order listed.

   Returns:
      The configured ExtractionRunner instance.
   """
   pipeline = ExtractionRunner()
   pipeline.enable_logging('~/logs/results', '~/logs/runnables')

   # Registration order is preserved exactly; later runnables presumably
   # depend on earlier ones (confirm against the runnable definitions).
   stages = (
      pdfbox.PDFBoxPlainTextExtractor,
      filters.AcademicPaperFilter,
      grobid.GrobidHeaderTEIExtractor,
      tei.TEItoHeaderExtractor,
      parscit.ParsCitCitationExtractor,
      figures.PDFFiguresExtractor,
      algorithms.AlgorithmsExtractor,
   )
   for stage in stages:
      pipeline.add_runnable(stage)

   return pipeline
Example #2
0
   def test_disable_logs_works(self):
      """Logging that is enabled and then disabled must leave no '*.log' files."""
      results_prefix = os.path.join(self.results_dir, 'results')
      runnables_prefix = os.path.join(self.results_dir, 'runnables')

      extraction = ExtractionRunner()
      extraction.enable_logging(results_prefix, runnables_prefix)
      extraction.disable_logging()
      extraction.add_runnable(SelfLogExtractor)
      extraction.run('abc', output_dir=self.results_dir, run_name='RUN!')

      # Results log is checked first, then the runnables log.
      for prefix in (results_prefix, runnables_prefix):
         matches = glob.glob(prefix + "*.log")
         self.assertFalse(matches)
Example #3
0
def get_extraction_runner():
    """Create an ExtractionRunner with logging enabled and its runnables registered.

    Logging is pointed at '~/logs/results' and '~/logs/runnables', then each
    runnable class below is added in the order listed.

    Returns:
        The configured ExtractionRunner instance.
    """
    pipeline = ExtractionRunner()
    pipeline.enable_logging('~/logs/results', '~/logs/runnables')

    # Registration order is preserved exactly; later runnables presumably
    # depend on earlier ones (confirm against the runnable definitions).
    stages = (
        pdfbox.PDFBoxPlainTextExtractor,
        filters.AcademicPaperFilter,
        grobid.GrobidHeaderTEIExtractor,
        tei.TEItoHeaderExtractor,
        parscit.ParsCitCitationExtractor,
        figures.PDFFiguresExtractor,
        algorithms.AlgorithmsExtractor,
    )
    for stage in stages:
        pipeline.add_runnable(stage)

    return pipeline
Example #4
0
    def test_disable_logs_works(self):
        """Logging that is enabled and then disabled must leave no '*.log' files."""
        results_prefix = os.path.join(self.results_dir, 'results')
        runnables_prefix = os.path.join(self.results_dir, 'runnables')

        extraction = ExtractionRunner()
        extraction.enable_logging(results_prefix, runnables_prefix)
        extraction.disable_logging()
        extraction.add_runnable(SelfLogExtractor)
        extraction.run('abc', output_dir=self.results_dir, run_name='RUN!')

        # Results log is checked first, then the runnables log.
        for prefix in (results_prefix, runnables_prefix):
            matches = glob.glob(prefix + "*.log")
            self.assertFalse(matches)
def get_extraction_runner():
   """Create an ExtractionRunner that uses the Grobid TEI runnables plus the paper filter.

   Logging is pointed at '~/logs/results' and '~/logs/runnables'. The three
   "Option 1" runnables are registered first, followed by
   filters.AcademicPaperFilter.

   Returns:
      The configured ExtractionRunner instance.
   """
   pipeline = ExtractionRunner()
   pipeline.enable_logging('~/logs/results', '~/logs/runnables')

   # Option 1: Grobid TEI extraction followed by the TEI converters.
   # Option 2 (alternative, not enabled): register only
   # pdfbox.PDFBoxPlainTextExtractor in place of these three.
   option_one = (
      grobid.GrobidTEIExtractor,
      extractors.TEItoPlainTextExtractor,
      extractors.TEItoHeaderExtractor,
   )
   for stage in option_one:
      pipeline.add_runnable(stage)

   # The filter is registered after whichever option is chosen above.
   pipeline.add_runnable(filters.AcademicPaperFilter)

   return pipeline
Example #6
0
   def test_logs_work(self):
      """Check that a runner with logging enabled writes both log files.

      Runs SelfLogExtractor on the input 'abc' under the run name 'RUN!' and
      then verifies that:
        * a results log matching '<results_dir>/results*.log' exists and
          contains '[SUCCESS]' and the run name;
        * a runnables log matching '<results_dir>/runnables*.log' exists and
          contains the input data, the runnable's name and the run name.
      """
      runner = ExtractionRunner()
      results_log_path = os.path.join(self.results_dir, 'results')
      runnables_log_path = os.path.join(self.results_dir, 'runnables')

      runner.enable_logging(results_log_path, runnables_log_path)
      runner.add_runnable(SelfLogExtractor)
      runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

      results_logs = glob.glob(results_log_path + "*.log")
      # Fail with a readable message instead of an IndexError when no log exists.
      self.assertTrue(results_logs, 'no results log file was written')
      # Context manager closes the handle even if an assertion below fails.
      with open(results_logs[0], 'r') as log_file:
         log_data = log_file.read()
      self.assertIn('[SUCCESS]', log_data)
      self.assertIn('RUN!', log_data)

      runnables_logs = glob.glob(runnables_log_path + "*.log")
      self.assertTrue(runnables_logs, 'no runnables log file was written')
      with open(runnables_logs[0], 'r') as log_file:
         log_data = log_file.read()
      self.assertIn('abc', log_data)
      self.assertIn('SelfLogExtractor', log_data)
      self.assertIn('RUN!', log_data)
Example #7
0
    def test_logs_work(self):
        """Check that a runner with logging enabled writes both log files.

        Runs SelfLogExtractor on the input 'abc' under the run name 'RUN!' and
        then verifies that:
          * a results log matching '<results_dir>/results*.log' exists and
            contains '[SUCCESS]' and the run name;
          * a runnables log matching '<results_dir>/runnables*.log' exists and
            contains the input data, the runnable's name and the run name.
        """
        runner = ExtractionRunner()
        results_log_path = os.path.join(self.results_dir, 'results')
        runnables_log_path = os.path.join(self.results_dir, 'runnables')

        runner.enable_logging(results_log_path, runnables_log_path)
        runner.add_runnable(SelfLogExtractor)
        runner.run('abc', output_dir=self.results_dir, run_name='RUN!')

        results_logs = glob.glob(results_log_path + "*.log")
        # Fail with a readable message instead of an IndexError when no log exists.
        self.assertTrue(results_logs, 'no results log file was written')
        # Context manager closes the handle even if an assertion below fails.
        with open(results_logs[0], 'r') as log_file:
            log_data = log_file.read()
        self.assertIn('[SUCCESS]', log_data)
        self.assertIn('RUN!', log_data)

        runnables_logs = glob.glob(runnables_log_path + "*.log")
        self.assertTrue(runnables_logs, 'no runnables log file was written')
        with open(runnables_logs[0], 'r') as log_file:
            log_data = log_file.read()
        self.assertIn('abc', log_data)
        self.assertIn('SelfLogExtractor', log_data)
        self.assertIn('RUN!', log_data)