Example 1
def test_no_normed_cross_db(destination, index='test'):

    if scrapi.settings.CANONICAL_PROCESSOR == destination:
        return

    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    # Get the test documents into the canonical processor, but don't process normalized
    canonical_processor = get_processor(scrapi.settings.CANONICAL_PROCESSOR)
    canonical_processor.process_raw(RAW)
    # canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical processor's raw doc is there, and the destination's is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc.raw
    assert not canonical_doc.normalized

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index)

    # Check that the document did not make it to the destination, and is still in the canonical
    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    scrapi.processing.elasticsearch.es = real_es
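
A note on the pattern above: these tests save the real Elasticsearch client, swap in a mock.MagicMock(), and restore the real client by hand at the end. The same isolation can also be written with pytest's monkeypatch fixture, which undoes the patch automatically when the test finishes. A minimal sketch, assuming only the module-level scrapi.processing.elasticsearch.es attribute seen in the examples above (the test body is a placeholder, not code from the original project):

import mock

import scrapi.processing.elasticsearch


def test_with_mocked_es(monkeypatch):
    # Swap the module-level client for a mock; monkeypatch restores the
    # original attribute after the test, so no manual real_es bookkeeping is needed.
    fake_es = mock.MagicMock()
    monkeypatch.setattr(scrapi.processing.elasticsearch, 'es', fake_es)

    # ... exercise code that talks to Elasticsearch here ...

    # The mock records any calls made against it, e.g.:
    # assert fake_es.index.called
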
Example 2
def test_rename():
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    old_source = NORMALIZED['shareProperties']['source']

    assert(queryset[0].source == utils.RECORD['shareProperties']['source'])
    assert(queryset[0].source == old_source)

    new_record = copy.deepcopy(utils.RECORD)

    new_record['shareProperties']['source'] = 'wwe_news'

    test_harvester.short_name = 'wwe_news'

    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = DocumentModel.objects(docID=RAW['docID'], source='wwe_news')
    assert(queryset[0].source == 'wwe_news')
    assert(len(queryset) == 1)
    scrapi.processing.elasticsearch.es = real_es
Example 3
def test_renormalize(processor_name, monkeypatch):
    # Set up
    # real_es = scrapi.processing.elasticsearch.es
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es

    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    # Process raw and normalized with fake docs
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    # Check to see those docs were processed
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    # Create a new document to be renormalized
    new_raw = copy.deepcopy(RAW)
    new_raw.attributes['docID'] = 'get_the_tables'
    new_raw.attributes['doc'] = new_raw.attributes['doc'].encode('utf-8')

    # This is basically like running the improved harvester, right?
    processor.create(new_raw.attributes)

    tasks.migrate(renormalize, sources=[RAW['source']], dry=False)

    queryset = processor.get(docID='get_the_tables', source=RAW['source'])
    assert queryset
    scrapi.processing.elasticsearch.es = real_es
    processor.delete(docID='get_the_tables', source=RAW['source'])
Example 4
def test_rename():
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    old_source = NORMALIZED['shareProperties']['source']

    assert queryset[0].source == utils.RECORD['shareProperties']['source']
    assert queryset[0].source == old_source

    new_record = copy.deepcopy(utils.RECORD)

    new_record['shareProperties']['source'] = 'wwe_news'

    test_harvester.short_name = 'wwe_news'

    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = DocumentModel.objects(docID=RAW['docID'], source='wwe_news')
    assert queryset[0].source == 'wwe_news'
    assert len(queryset) == 1
    scrapi.processing.elasticsearch.es = real_es
Example 5
def test_rename(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(source=RAW['source'], docID=RAW['docID'])

    old_source = NORMALIZED['shareProperties']['source']

    assert queryset.normalized.attributes['shareProperties'][
        'source'] == utils.RECORD['shareProperties']['source']
    assert queryset.normalized.attributes['shareProperties'][
        'source'] == old_source

    new_record = copy.deepcopy(utils.RECORD)
    new_record['shareProperties']['source'] = 'wwe_news'
    test_harvester.short_name = 'wwe_news'
    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = processor.get(source='wwe_news', docID=RAW['docID'])

    assert queryset.normalized.attributes['shareProperties'][
        'source'] == 'wwe_news'

    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
    test_harvester.short_name = RAW['source']
    registry['test'] = test_harvester
    del registry['wwe_news']
Example 6
def test_rename(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(source=RAW['source'], docID=RAW['docID'])

    old_source = NORMALIZED['shareProperties']['source']

    assert queryset.normalized.attributes['shareProperties']['source'] == utils.RECORD['shareProperties']['source']
    assert queryset.normalized.attributes['shareProperties']['source'] == old_source

    new_record = copy.deepcopy(utils.RECORD)
    new_record['shareProperties']['source'] = 'wwe_news'
    test_harvester.short_name = 'wwe_news'
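    # Register the test harvester under the new source name 'wwe_news'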
    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = processor.get(source='wwe_news', docID=RAW['docID'])

    assert queryset.normalized.attributes['shareProperties']['source'] == 'wwe_news'

    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
    test_harvester.short_name = RAW['source']
    registry['test'] = test_harvester
    del registry['wwe_news']
Example 7
def test_cross_db(canonical, destination, monkeypatch, index='test'):

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    if destination != 'elasticsearch':
        real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
        scrapi.processing.elasticsearch.es = mock.MagicMock()
    else:
        monkeypatch.setattr('scrapi.settings.ELASTIC_INDEX', 'test')

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the doc is in the canonical processor, and not in the destination
    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    source=RAW['source'])
        assert not destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    index=index,
                                                    source=RAW['source'])
        assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db,
                  target_db=destination,
                  dry=False,
                  sources=['test'],
                  index=index)

    # Check that the document made it to the destination, and is still in the canonical
    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    source=RAW['source'])
        assert destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'],
                                                    index=index,
                                                    source=RAW['source'])
        assert destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        scrapi.processing.elasticsearch.es = real_es
Example 8
def test_cross_db_with_versions(canonical,
                                destination,
                                monkeypatch,
                                index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW,
                                           NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical processor's versions are there, and the destination's are not
    canonical_versions = list(
        canonical_processor.get_versions(docID=RAW['docID'],
                                         source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'],
                                                source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db,
                  target_db=destination,
                  dry=False,
                  sources=['test'],
                  index=index,
                  versions=True)

    # Check that the document made it to the destination, and is still in the canonical
    destination_versions = list(
        destination_processor.get_versions(docID=RAW['docID'],
                                           source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc
Example 9
def test_renormalize():
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert(len(queryset) == 1)

    tasks.migrate(renormalize, source=RAW['source'])
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert(len(queryset) == 1)
    scrapi.processing.elasticsearch.es = real_es
Example 10
def test_renormalize():
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1

    tasks.migrate(renormalize, source=RAW['source'])
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
    scrapi.processing.elasticsearch.es = real_es
Example 11
def test_delete():
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert (len(queryset) == 1)

    tasks.migrate(delete, sources=[RAW['source']], dry=False)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert (len(queryset) == 0)
    scrapi.processing.elasticsearch.es = real_es
Example 12
def test_cross_db(canonical, destination, monkeypatch, index='test'):

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

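    # Mock Elasticsearch only when it is not the destination; otherwise point the test at the 'test' index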
    if destination != 'elasticsearch':
        real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
        scrapi.processing.elasticsearch.es = mock.MagicMock()
    else:
        monkeypatch.setattr('scrapi.settings.ELASTIC_INDEX', 'test')

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the doc is in the canonical processor, and not in the destination
    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
        assert not destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'], index=index, source=RAW['source'])
        assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index)

    # Check that the document made it to the destination, and is still in the canonical
    if destination != 'elasticsearch':
        destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
        assert destination_doc
    else:
        destination_doc = destination_processor.get(docID=RAW['docID'], index=index, source=RAW['source'])
        assert destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc

    if destination != 'elasticsearch':
        scrapi.processing.elasticsearch.es = real_es
Example 13
def test_delete(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

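    # Make the processor under test the canonical processor for this run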
    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    print('Canonical Processor is {}'.format(scrapi.settings.CANONICAL_PROCESSOR))
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    tasks.migrate(delete, sources=[RAW['source']], dry=False)
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not queryset
    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
Example 14
def test_delete(processor_name, monkeypatch):
    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', processor_name)

    print('Canonical Processor is {}'.format(
        scrapi.settings.CANONICAL_PROCESSOR))
    processor = get_processor(processor_name)
    processor.process_raw(RAW)
    processor.process_normalized(RAW, NORMALIZED)

    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert queryset

    tasks.migrate(delete, sources=[RAW['source']], dry=False)
    queryset = processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not queryset
    scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es = real_es
Example 15
def test_no_normed_cross_db(destination, index='test'):

    if scrapi.settings.CANONICAL_PROCESSOR == destination:
        return

    real_es = scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()

    # Get the test documents into the canonical processor, but don't process normalized
    canonical_processor = get_processor(scrapi.settings.CANONICAL_PROCESSOR)
    canonical_processor.process_raw(RAW)
    # canonical_processor.process_normalized(RAW, NORMALIZED)

    destination_processor = get_processor(destination)

    # Check that the canonical processor's raw doc is there, and the destination's is not
    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc.raw
    assert not canonical_doc.normalized

    destination_doc = destination_processor.get(docID=RAW['docID'],
                                                source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db,
                  target_db=destination,
                  dry=False,
                  sources=['test'],
                  index=index)

    # Check that the document did not make it to the destination, and is still in the canonical
    destination_doc = destination_processor.get(docID=RAW['docID'],
                                                source=RAW['source'])
    assert not destination_doc.normalized

    canonical_doc = canonical_processor.get(docID=RAW['docID'],
                                            source=RAW['source'])
    assert canonical_doc

    scrapi.processing.elasticsearch.es = real_es
Example 16
def test_cross_db_with_versions(canonical, destination, monkeypatch, index='test'):
    new_title = 'How to be really good at Zoo Tycoon: The Definitive Guide'

    if canonical == destination:
        return

    monkeypatch.setattr('scrapi.settings.CANONICAL_PROCESSOR', canonical)

    # Get the test documents into the canonical processor
    canonical_processor = get_processor(canonical)
    canonical_processor.process_raw(RAW)
    canonical_processor.process_normalized(RAW, NORMALIZED)

    # Get a version in there too
    new_normalized = copy.deepcopy(NORMALIZED.attributes)
    new_normalized['title'] = new_title
    canonical_processor.process_normalized(RAW, NormalizedDocument(new_normalized))

    destination_processor = get_processor(destination)

    # Check that the canonical processor's versions are there, and the destination's are not
    canonical_versions = list(canonical_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(canonical_versions) == 3
    assert canonical_versions[1].normalized['title'] == NORMALIZED['title']
    assert canonical_versions[2].normalized['title'] == new_title

    destination_doc = destination_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert not destination_doc

    # Migrate from the canonical to the destination
    tasks.migrate(cross_db, target_db=destination, dry=False, sources=['test'], index=index, versions=True)

    # Check that the document made it to the destination, and is still in the canonical
    destination_versions = list(destination_processor.get_versions(docID=RAW['docID'], source=RAW['source']))
    assert len(destination_versions) == 3
    assert destination_versions[1].normalized['title'] == NORMALIZED['title']
    assert destination_versions[2].normalized['title'] == new_title

    canonical_doc = canonical_processor.get(docID=RAW['docID'], source=RAW['source'])
    assert canonical_doc
Example 17
        elif isinstance(kwargs[key], list):
            kwargs[key].append(val)
        else:
            kwargs[key] = [kwargs[key], val]

    kwargs['dry'] = dry
    kwargs['async'] = async
    kwargs['group_size'] = group_size
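    # Turn the comma-separated sources string into a list of stripped source names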
    kwargs['sources'] = list(map(lambda x: x.strip(), sources.split(',')))

    if kwargs['sources'] == ['']:
        kwargs.pop('sources')

    migrate_func = migrations.__dict__[migration]

    migrate(migrate_func, **kwargs)


@task
def reset_search():
    ''' Restarts Elasticsearch '''
    run("curl -XPOST 'http://localhost:9200/_shutdown'")
    if platform.linux_distribution()[0] == 'Ubuntu':
        run("sudo service elasticsearch restart")
    elif platform.system() == 'Darwin':  # Mac OSX
        run('elasticsearch')


@task
def elasticsearch():
    '''Start a local elasticsearch server