def _init(self, subscriber):
        """Reset the data root, configure *subscriber* and feed it three dummy docs."""
        data_root = 'local_data/related'
        config = {helper.DATA_ROOT: data_root, 'worker_id': 0}

        # Always start with an empty data directory.
        if os.path.exists(data_root):
            shutil.rmtree(data_root)

        os.makedirs(data_root)

        subscriber.setup(config)

        # (path, docid, [(type, entity_id), ...]) — docs share e1/e2, differ on e3/e4.
        doc_specs = [
            ('dummy1.txt', 'mock',
             [('mock1', 'e1'), ('mock2', 'e2'), ('mock3', 'e3')]),
            ('dummy2.txt', None,
             [('mock1', 'e1'), ('mock2', 'e2'), ('mock3', 'e4')]),
            ('dummy3.txt', None,
             [('mock1', 'e1')]),
        ]

        for path, docid, entity_list in doc_specs:
            doc = document.get_document(path)
            if docid:
                doc.docid = docid
            for offset, (entity_type, entity_id) in enumerate(entity_list):
                doc.entities.add(offset, entity_type, entity_id)
            subscriber.consume(doc, None)

        subscriber.stop()
Exemple #2
0
def upload():
    """Receive an uploaded file and hand it to the gransk pipeline."""
    uploaded = request.files.get('file')

    # Sanitize the client-supplied filename before creating the document.
    safe_name = secure_filename(uploaded.filename)
    doc = document.get_document(safe_name,
                                parent=document.get_document('root'))
    doc.tag = 'upload'

    _globals['gransk'].add_file(doc, file_object=uploaded)

    return Response('ok')
Exemple #3
0
    def init(self, config, queue, worker_id, injector):
        """
    Initialize worker and read paths from queue, stopping when queue is empty.

    :param config: Configuration object.
    :param queue: Multiprocessing Queue object.
    :param worker_id: Value identifying this worker.
    :param injector: Object from which to fetch dependencies.
    :type config: ``dict``
    :type queue: ``multiprocessing.Queue``
    :type worker_id: ``int``
    :type injector: ``gransk.core.injector.Injector``
    """
        logger = logging.getLogger('worker')

        config[helper.WORKER_ID] = worker_id
        config[helper.INJECTOR] = injector

        pipe = pipeline.build_pipeline(config)

        mod = gransk.api.Subscriber(pipe)
        mod.setup(config)

        # Drain the queue; a 1s timeout with no item is treated as "producer done".
        while True:
            try:
                path = queue.get(timeout=1)
            except Empty:
                logger.info('[normal stop] worker %d', worker_id)
                break

            try:
                doc = document.get_document(
                    path, parent=document.get_document('root'))
                mod.consume(doc)
            except KeyboardInterrupt:
                # Ctrl-C aborts this worker without crashing; pipe.stop() still runs.
                logger.info('[aborting] worker %d', worker_id)
                break

        pipe.stop()

        # Write per-consumer timing statistics for this worker as semicolon CSV.
        with open(
                os.path.join(config[helper.DATA_ROOT],
                             'time-%s.csv' % worker_id), 'w') as out:
            out.write('%s;%s;%s;%s\n' %
                      ('consumer', 'total', 'consume_count', 'avg'))
            for consumer, (total, consume_count,
                           avg) in pipe.get_time_report():
                out.write('%s;%.2f;%.2f;%.2f\n' %
                          (consumer, total, consume_count, avg))
Exemple #4
0
    def test_config(self):
        """Each entity regex in config.yml must match its own test string(s)."""
        # safe_load: config.yml is plain data; yaml.load without a Loader is
        # deprecated and can construct arbitrary Python objects.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp.read())

        _find_entities = find_entities.Subscriber(
            test_helper.get_mock_pipeline([]))
        _find_entities.setup(config)

        doc = document.get_document('dummy')

        for entity_type, pattern_conf in config.get(helper.ENTITIES,
                                                    {}).items():
            # Normalize a single test string to a list.
            if not isinstance(pattern_conf['test'], list):
                pattern_conf['test'] = [pattern_conf['test']]

            for test in pattern_conf['test']:
                doc.text = 'dum dum {} dum'.format(test)
                _find_entities.consume(doc, None)
                entities = doc.entities.get_all()

                self.assertEqual(1,
                                 len(entities),
                                 msg='regex for %s found nothing' %
                                 entity_type)
                self.assertEqual(entity_type, entities[0][1]['type'])
                self.assertEqual(test, entities[0][1]['value'])
    def _callback(self, entry, path, data_stream, doc):
        """Produce a child document for one entry found inside a disk image.

        Entries at or above ``self.max_size`` are reported on the
        OVERSIZED_FILE topic instead of being read. Read failures are recorded
        on the parent document's metadata rather than raised.
        """
        stat = entry.GetStat()

        # Child inherits the parent's tag; timestamps and size come from the entry.
        newdoc = document.get_document(path, parent=doc)
        newdoc.tag = doc.tag
        newdoc.meta['mtime'] = stat.mtime
        newdoc.meta['atime'] = stat.atime
        newdoc.meta['ctime'] = stat.ctime
        newdoc.meta['size'] = stat.size

        if stat.size < self.max_size:
            file_object = None
            try:
                file_object = entry.GetFileObject(data_stream_name=data_stream)
                self.produce(helper.PROCESS_FILE, newdoc, file_object)
                doc.children += 1
            except IOError as err:
                LOGGER.debug(u'could not read path "%s": %s' % (path, err))
                doc.meta['diskimage_read_error'] = six.text_type(err)
                # Early return; finally below still closes the file object.
                return None
            except Exception as err:
                # Best effort: record unexpected errors but keep scanning the image.
                doc.meta['diskimage_other_read_error'] = six.text_type(err)
            finally:
                if file_object:
                    file_object.close()
        else:
            self.produce(helper.OVERSIZED_FILE, newdoc, None)
    def test_simple(self):
        """Strings extracted from a dummy E01 image match the known content."""
        subscriber = ewf_strings.Subscriber(None)
        subscriber.setup({'min_string_length': 12})

        doc = document.get_document('mock')

        image_path = 'gransk/plugins/unpackers/tests/test_data/dummy.E01'
        with open(image_path, 'rb') as inp:
            subscriber.consume(doc, inp)

        expected = u''.join([
            u"IDUMMY      FAT12",
            u"Non-system disk",
            u"Press any key to reboot",
            u"DUMMY      (",
            u"~1      TRA\"",
            u"FILE-B  TXT",
            u".          2",
            u"Mac OS X",
            u"This resource fork intentionally left blank",
            u".          2",
            u"FSEVEN~1",
            u"000000~1",
            u"000000~2",
            u"D3E90FC1-F0EF-427D-B874-2BECB6BEA409",
            u".          0",
            u"FILE-A  TXT",
            u"Hi, I'm file A.",
            u"And I'm file B.",
        ])

        actual = doc.text

        self.assertNotEqual(None, actual)
        # Whitespace placement is irrelevant; compare with all whitespace removed.
        self.assertEqual(re.sub(r'\s', u'', expected),
                         re.sub(r'\s', u'', actual))
Exemple #7
0
 def _produce_child_doc(self, doc, text, offset):
   """Emit a child document holding *text* found at *offset* within *doc*."""
   # Zero-pad the offset to the width of the parent's size so names sort nicely.
   width = max(len('%s' % doc.meta['size']), 1)
   name_template = '%%s.%%0%dd.child' % width
   child = document.get_document(name_template % (doc.path, offset), parent=doc)
   child.tag = doc.tag
   child.text = text
   doc.children += 1
   self.produce(helper.RUN_PIPELINE, child, child.text)
Exemple #8
0
    def test_simple(self):
        """Consumed pictures are copied into <data_root>/pictures with a docid prefix."""
        mock_pipeline = test_helper.get_mock_pipeline([])

        data_root = os.path.join('local_data', 'unittests')

        # Always start with a clean data directory.
        if os.path.exists(data_root):
            shutil.rmtree(data_root)

        _copy_picture = copy_picture.Subscriber(mock_pipeline)
        _copy_picture.setup({
            helper.DATA_ROOT: data_root,
            'workers': 1,
            'tag': 'default',
        })

        doc = document.get_document('mock.jpg')
        doc.meta['type'] = 'picture'

        with open(self.get_test_file('gransk-logo.png'), 'rb') as inp:
            _copy_picture.consume(doc, inp)

        expected = '6913571e-mock.jpg'

        # Build the path from data_root instead of hard-coding '/', keeping
        # the assertion consistent with how data_root itself is constructed.
        actual = os.listdir(os.path.join(data_root, 'pictures'))

        self.assertEqual([expected], actual)
        self.assertEqual(expected, doc.meta['picture'])
    def test_scanned_pdf(self):
        """Configured OCR languages must be forwarded to Tika via request headers."""
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])
        mock_injector = test_helper.MockInjector()

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected_headers = {
            'Content-Disposition': 'attachment; filename=scanned.pdf.tiff',
            'Content-type': 'image/tiff',
            'X-Tika-OCRLanguage': 'eng+rus'
        }

        # safe_load: config.yml is plain data; bare yaml.load is deprecated/unsafe.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp.read())

        config[helper.DATA_ROOT] = 'local_data'
        config[helper.WORKER_ID] = 1
        config[helper.OCR_LANGUAGES] = 'eng+rus'
        config[helper.INJECTOR] = mock_injector
        extractor.setup(config)

        path = self.get_test_file('scanned.pdf')

        doc = document.get_document(path)
        doc.meta['Content-Type'] = 'application/pdf'

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual_headers = mock_injector.http_connection.request_headers

        self.assertEqual(expected_headers, actual_headers)
    def test_simple(self):
        """Tika extraction stores the decoded response text on the document."""
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected = (
            b'This is an unstructured document containing the \nidentifier '
            b'"193.34.2.1" (ip address), stored as a PDF document.')

        # safe_load: config.yml is plain data; bare yaml.load is deprecated/unsafe.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp.read())

        config[helper.DATA_ROOT] = 'local_data'
        config[helper.WORKER_ID] = 1
        config[helper.INJECTOR] = test_helper.MockInjector(
            response_text=expected)
        extractor.setup(config)

        path = self.get_test_file('document.pdf')

        doc = document.get_document(path)
        doc.meta['Content-Type'] = 'application/pdf'

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual = doc.text

        self.assertEqual(expected.decode('utf-8'), actual)
Exemple #11
0
    def test_simple(self):
        """A registered magic header routes the payload to its consumer intact."""
        pipe = test_helper.get_mock_pipeline([])

        # Single mock consumer (the original duplicated this assignment).
        mock_mod = test_helper.MockSubscriber()

        # Two magic headers both point at the same mock consumer.
        pipe.register_magic(b'\xFF\xEE\xDD', ('mock', mock_mod.consume))
        pipe.register_magic(b'\x00\x00\x00', ('mock', mock_mod.consume))

        _magic = magic.Subscriber(pipe)
        _magic.setup(None)

        doc = document.get_document('mock')

        content = b'\xFF\xEE\xDDMOCKMOCKMOCK'

        _magic.consume(doc, BytesIO(content))

        # assertEqual: assertEquals is a deprecated alias.
        self.assertEqual(True, doc.magic_hit)
        self.assertEqual(1, len(mock_mod.produced))

        # The consumer must receive the full stream, including the magic header.
        expected = content
        actual = mock_mod.produced[0][1].read()

        self.assertEqual(expected, actual)
Exemple #12
0
  def test_simple(self):
    """Extracted strings from the dummy E01 image equal the known content."""
    subscriber = ewf_strings.Subscriber(None)
    subscriber.setup({'min_string_length': 12})

    doc = document.get_document('mock')

    image_path = 'gransk/plugins/unpackers/tests/test_data/dummy.E01'
    with open(image_path, 'rb') as inp:
      subscriber.consume(doc, inp)

    fragments = [
        'IDUMMY      FAT12',
        'Non-system disk',
        'Press any key to reboot',
        'DUMMY      (',
        '~1      TRA"',
        'FILE-B  TXT',
        '.          2',
        'Mac OS X',
        'This resource fork intentionally left blank',
        '.          2',
        'FSEVEN~1',
        '000000~1',
        '000000~2',
        'D3E90FC1-F0EF-427D-B874-2BECB6BEA409',
        '.          0',
        'FILE-A  TXT',
        "Hi, I'm file A.",
        "And I'm file B.",
    ]
    expected = ''.join(fragments)

    actual = doc.text

    self.assertNotEqual(None, actual)
    # Whitespace placement is irrelevant; strip it all before comparing.
    self.assertEqual(re.sub(r'\s', '', expected), re.sub(r'\s', '', actual))
Exemple #13
0
    def test_simple(self):
        """The mocked Tika response ends up as the document's text."""
        mock_pipeline = test_helper.get_mock_pipeline(
            [helper.DOCUMENT, helper.TEXT])

        extractor = tika_extractor.Subscriber(mock_pipeline)

        expected = (
            u'This is an unstructured document containing the \nidentifier '
            u'"193.34.2.1" (ip address), stored as a PDF document.'
        ).encode('utf-8')

        # safe_load: config.yml is plain data; bare yaml.load is deprecated/unsafe.
        with open('config.yml') as inp:
            config = yaml.safe_load(inp.read())
            config[helper.INJECTOR] = test_helper.MockInjector(
                response_text=expected)
            extractor.setup(config)

        path = self.get_test_file('document.pdf')

        doc = document.get_document(path)

        with open(doc.path, 'rb') as file_object:
            extractor.consume(doc, file_object)

        actual = doc.text.encode('utf-8')

        self.assertEqual(expected, actual)
Exemple #14
0
    def consume(self, doc, payload):
        """
    Writes payload to disk and unpacks the archive using 7zip. Then adds all
    unpacked files to the pipeline.

    :param doc: Document object.
    :param payload: File pointer belonging to document.
    :type doc: ``gransk.core.document.Document``
    :type payload: ``file``
    """
        # Children inherit the document's tag, falling back to the configured one.
        tag = self.config[helper.TAG]
        if doc.tag:
            tag = doc.tag

        filename = os.path.basename(doc.path)

        # Prefix with the docid to avoid collisions between identical filenames.
        unique_filename = '%s-%s' % (doc.docid[0:8], filename)
        unpack_to = os.path.join(self.config[helper.DATA_ROOT], 'archives',
                                 unique_filename)

        if not os.path.exists(unpack_to):
            os.makedirs(unpack_to)

        tmp_path = os.path.join(
            self.tmp_root, '%s-%s.%s' % (self.wid, doc.docid[0:8], doc.ext))

        if not os.path.exists(self.tmp_root):
            os.makedirs(self.tmp_root)

        # Copy the payload to a temporary file so 7zip can read it from disk;
        # the stream position is restored for any later consumers.
        with open(tmp_path, 'wb') as out:
            payload.seek(0)
            out.write(payload.read())
            payload.seek(0)

        cmd = self._get_cmd(tmp_path, unpack_to, doc.meta['Content-Type'])

        proc = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        out, err = proc.communicate()

        # Record extraction errors on the document instead of raising.
        if err:
            doc.meta['archive_error'] = err.decode('utf-8')

        # Feed every unpacked file back into the pipeline as a child document.
        for folder, _, filenames in os.walk(unpack_to):
            for filename in filenames:
                path = os.path.join(folder, filename)
                new_doc = document.get_document(path, parent=doc)
                new_doc.tag = tag

                with open(path, "rb") as file_object:
                    self.produce(helper.EXTRACT_META, new_doc, file_object)
                    self.produce(helper.PROCESS_FILE, new_doc, file_object)
                    doc.children += 1

        # Clean up both the temporary payload copy and the unpacked tree.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

        shutil.rmtree(unpack_to)
Exemple #15
0
 def test_bug_2(self):
     """Names embedded in setup() metadata are still detected (regression)."""
     sample = """ os setup( name='recgonizer', author='Petter Christian Bjelland', version='0.3',"""
     config = {'code_root': '.', 'name_model': 'utils/names.gz'}
     subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
     subscriber.setup(config)
     doc = document.get_document('dummy')
     doc.text = sample
     subscriber.consume(doc, None)
     # Exactly two name entities are expected in the sample.
     self.assertEqual(2, len(doc.entities.get_all()))
Exemple #16
0
  def test_no_match(self):
    """A non-picture payload must not yield image dimension metadata."""
    subscriber = picture_meta.Subscriber(None)
    subscriber.setup({})

    doc = document.get_document('mock')

    with open(self.get_test_file('document.pdf'), 'rb') as file_object:
      subscriber.consume(doc, file_object)

    # Neither dimension should have been recorded for a PDF payload.
    self.assertEqual(None, doc.meta.get('img_width'))
    self.assertEqual(None, doc.meta.get('img_height'))
  def test_get_network(self):
    """Entity network lookups return nodes and links for one and two hops."""
    mock_pipeline = test_helper.get_mock_pipeline([helper.FINISH_DOCUMENT])

    entities = related_entities.Subscriber(mock_pipeline)
    documents = related_documents.Subscriber(mock_pipeline)
    network = entity_network.Subscriber(mock_pipeline)

    data_root = 'local_data/network'
    config = {helper.DATA_ROOT: data_root, 'worker_id': 1}

    # Always start with an empty data directory.
    if os.path.exists(data_root):
      shutil.rmtree(data_root)

    os.makedirs(data_root)

    for subscriber in (entities, documents, network):
      subscriber.setup(config)

    # Two documents with overlapping entities: e2 connects e1 to e4.
    doc_specs = (
        ('dummy1.txt', [(0, u'e1'), (1, u'e2'), (2, u'e3')]),
        ('dummy2.txt', [(1, u'e2'), (2, u'e4')]),
    )

    for name, entity_spec in doc_specs:
      doc = document.get_document(name)
      for offset, entity_id in entity_spec:
        doc.entities.add(offset, 'mock', entity_id)
      mock_pipeline.produce(helper.FINISH_DOCUMENT, doc, None)

    one_hop = network.get_for('e1', hops=1)
    two_hop = network.get_for('e1', hops=2)

    for result in (one_hop, two_hop):
      self.assertNotEqual(0, len(result['nodes']))
      self.assertNotEqual(0, len(result['links']))
Exemple #18
0
 def test_simple(self):
     """The name model detects 'Tom Martin' at its character offset."""
     config = {'code_root': '.', 'name_model': 'utils/names.gz'}
     subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
     subscriber.setup(config)
     doc = document.get_document('dummy')
     doc.text = 'Dette  er Tom Martin.'
     subscriber.consume(doc, None)
     expected_entity = (10, {
         'entity_id': 'tom_martin',
         'type': 'per',
         'value': 'Tom Martin'
     })
     self.assertEqual([expected_entity], doc.entities.get_all())
Exemple #19
0
  def test_size_not_overriden(self):
    """An explicitly set document size must survive text processing."""
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])

    _process = process.Subscriber(mock_pipeline)
    _process.setup({})

    doc = document.get_document('mock')
    doc.set_size(100)
    doc.text = 'dcba'

    _process.consume(doc, None)

    # assertEqual: assertEquals is a deprecated alias.
    self.assertEqual(100, doc.meta['size'])
    def test_simple(self):
        """Disk image entries are produced as child documents with their paths."""
        doc = document.get_document(test_helper.get_test_path('dummy.E01'))
        doc.docid = '4321'

        # E01 images are binary; open in 'rb' so reading does not attempt
        # text decoding (which fails on Python 3).
        with open(doc.path, 'rb') as inp:
            self.detector.consume(doc, inp)

        actual = [doc.path for doc, _ in self.mock_pipe.consumer.produced]

        expected = [
            '/DUMMY       (Volume Label Entry)', '/test/file-a.txt',
            '/file-b.txt'
        ]

        # assertEqual: assertEquals is a deprecated alias.
        self.assertEqual(expected, actual)
Exemple #21
0
  def _run_test(self, filename):
    """Run type detection on *filename* with dummy content; return the doc."""
    mock_pipeline = test_helper.get_mock_pipeline([
        helper.DOCUMENT, helper.PICTURE,
        helper.ARCHIVE, helper.DISKIMAGE])

    detector = detect_type.Subscriber(mock_pipeline)

    # safe_load: config.yml is plain data; bare yaml.load is deprecated/unsafe.
    with open('config.yml') as inp:
      detector.setup(yaml.safe_load(inp.read()))

    doc = document.get_document(filename)

    detector.consume(doc, StringIO('dummy'))

    return doc
    def test_simple(self):
        """Only files whose extension is configured are copied to disk."""
        mock_pipeline = test_helper.get_mock_pipeline([])

        data_root = os.path.join('local_data', 'unittests')

        # Wipe leftovers from earlier runs.
        if os.path.exists(data_root):
            shutil.rmtree(data_root)

        subscriber = copy_file.Subscriber(mock_pipeline)
        config = {
            helper.DATA_ROOT: data_root,
            'workers': 1,
            'tag': 'default',
            helper.COPY_EXT: ['xyz']
        }
        subscriber.setup(config)

        # The .xyz file should be copied; the .doc file should be ignored.
        subscriber.consume(document.get_document('mock.xyz'), BytesIO(b'juba.'))
        subscriber.consume(document.get_document('ignore.doc'), BytesIO(b'mock'))

        actual = os.listdir(os.path.join(data_root, 'files', 'xyz'))

        self.assertEqual(['39bbf948-mock.xyz'], actual)
Exemple #23
0
  def test_simple(self):
    """Processing text updates status, records size and produces output."""
    mock_pipeline = test_helper.get_mock_pipeline(
        [helper.PROCESS_TEXT, helper.ANALYZE, helper.FINISH_DOCUMENT])

    _process = process.Subscriber(mock_pipeline)
    _process.setup({})

    doc = document.get_document('mock')
    doc.status = 'untouched'
    doc.text = 'abcd'

    _process.consume(doc, None)

    self.assertNotEqual('untouched', doc.status)
    # assertEqual: assertEquals is a deprecated alias.
    self.assertEqual(4, doc.meta['size'])
    self.assertNotEqual(0, len(mock_pipeline.consumer.produced))
Exemple #24
0
    def test_bug(self):
        text = """MT-2009-12-015-W001 – SIMULATED WARRANT
Computers assigned to Jo Smith from November 13, 2009 to December 12, 2009.
"""
        config = {'code_root': '.', 'name_model': 'utils/names.gz'}
        find_names = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
        find_names.setup(config)
        doc = document.get_document('dummy')
        doc.text = text
        find_names.consume(doc, None)
        expected = [(62, {
            'entity_id': 'jo_smith',
            'type': 'per',
            'value': 'Jo Smith'
        })]
        self.assertEqual(expected, doc.entities.get_all())
    def test_base(self):
        """Strings splitting produces one child document per max_lines chunk."""
        pipe = test_helper.get_mock_pipeline([helper.RUN_PIPELINE])
        _strings = strings.Subscriber(pipe)
        _strings.setup({'min_string_length': 4, 'max_lines': 2})

        doc = document.get_document('mock')
        doc.set_size(12345)

        _strings.consume(doc, StringIO('AAAA\x00BBBB\x00CCCC'))

        # Two child documents produced.
        # assertEqual: assertEquals is a deprecated alias.
        self.assertEqual(2, len(pipe.consumer.produced))

        expected = 'mock.00000.child'
        actual = pipe.consumer.produced[0][0].path

        self.assertEqual(expected, actual)
 def test_simple(self):
   """A mocked NER backend yields an entity preserving the original casing."""
   injector = test_helper.MockInjector(ner_entities=[(10, u'Hans Petter')])
   config = {'code_root': '.', helper.INJECTOR: injector}
   subscriber = _find_names.Subscriber(test_helper.get_mock_pipeline([]))
   subscriber.setup(config)
   doc = document.get_document('dummy')
   doc.text = u'Dette  er Hans Petter.'
   subscriber.consume(doc, None)
   expected_entity = (10, {
       'entity_id': u'Hans_Petter',
       'type': u'per',
       'value': u'Hans Petter'
   })
   self.assertEqual([expected_entity], doc.entities.get_all())
    def test_simple(self):
        """A Content-Type of image/jpeg classifies the document as a picture."""
        subscriber = file_meta.Subscriber(test_helper.get_mock_pipeline([]))
        response = json.dumps({u'Content-Type': u'image/jpeg'}).encode('utf-8')
        subscriber.setup({
            'code_root': '.',
            'host': 'mock',
            helper.INJECTOR: test_helper.MockInjector(response)
        })

        doc = document.get_document('mock.txt')

        subscriber.consume(doc, StringIO(u'mock'))

        # The mocked metadata response should drive the doctype.
        self.assertEqual(u'picture', doc.doctype)
    def test_simple(self):
        """The stored text path is recorded in the document metadata."""
        mock_pipeline = test_helper.get_mock_pipeline([])

        data_root = os.path.join('local_data', 'unittests')

        # Always start with a clean data directory.
        if os.path.exists(data_root):
            shutil.rmtree(data_root)

        _store_text = store_text.Subscriber(mock_pipeline)
        _store_text.setup({helper.DATA_ROOT: data_root, 'workers': 1})

        doc = document.get_document('mock')
        doc.text = 'mock-mock-mock'

        _store_text.consume(doc, None)

        expected = 'local_data/unittests/text/17404a59-mock'
        actual = doc.meta['text_file']

        # assertEqual: assertEquals is a deprecated alias.
        self.assertEqual(expected, actual)
    def test_encrypted(self):
        """A password-protected archive still yields exactly one produced item."""
        mock_pipe = test_helper.get_mock_pipeline(
            [helper.PROCESS_FILE, helper.TEXT])

        subscriber = unpack_archive.Subscriber(mock_pipe)
        subscriber.setup({
            helper.DATA_ROOT: 'local_data',
            helper.TAG: 'test',
            helper.WORKER_ID: 0
        })

        archive_path = test_helper.get_test_path('password-protected.zip')
        doc = document.get_document(archive_path)
        doc.docid = '4321'

        with open(doc.path, 'rb') as inp:
            subscriber.consume(doc, inp)

        self.assertEqual(1, len(mock_pipe.consumer.produced))
Exemple #30
0
    def _init(self):
        """Index a document carrying one entity; return the recorded bulk calls."""
        mock_pipeline = test_helper.get_mock_pipeline([])

        injector = test_helper.MockInjector('{}')

        subscriber = index_text.Subscriber(mock_pipeline)
        config = {
            'tag': 'default',
            'context_size': 14,
            helper.INJECTOR: injector
        }
        subscriber.setup(config)

        doc = document.get_document('mock.txt')
        doc.text = 'abcd mock-value efgh'
        doc.entities.add(5, 'mock-type', 'mock-value')

        subscriber.consume(doc, None)
        subscriber.stop()

        # The mock elastic helper records every bulk call it received.
        return injector.elastic_helper._bulk