Example #1
def test_migrate_v2():
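    # Save a record through the old model, confirm it is not yet visible in the
    # new source-partitioned DocumentModel, run the migration task, and then
    # confirm exactly one migrated row exists.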
    DocumentModelOld.create(**RAW.attributes).save()
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 0
    tasks.migrate_to_source_partition(dry=False)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
def test_migrate_v2():
    DocumentModelOld.create(**RAW.attributes).save()
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 0
    tasks.migrate_to_source_partition(dry=False)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
def test_rename():
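    # Swap the real Elasticsearch client for a mock so only Cassandra is touched,
    # store a record, register a harvester under the new short name 'wwe_news',
    # run the rename migration, and verify the document now lives under the new
    # source before restoring the real client.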
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    old_source = NORMALIZED['shareProperties']['source']

    assert queryset[0].source == utils.RECORD['shareProperties']['source']
    assert queryset[0].source == old_source

    new_record = copy.deepcopy(utils.RECORD)

    new_record['shareProperties']['source'] = 'wwe_news'

    test_harvester.short_name = 'wwe_news'

    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = DocumentModel.objects(docID=RAW['docID'], source='wwe_news')
    assert queryset[0].source == 'wwe_news'
    assert len(queryset) == 1
    scrapi.processing.elasticsearch.es = real_es
Example #4
def test_rename():
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    old_source = NORMALIZED['shareProperties']['source']

    assert queryset[0].source == utils.RECORD['shareProperties']['source']
    assert queryset[0].source == old_source

    new_record = copy.deepcopy(utils.RECORD)

    new_record['shareProperties']['source'] = 'wwe_news'

    test_harvester.short_name = 'wwe_news'

    registry['wwe_news'] = test_harvester

    tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)

    queryset = DocumentModel.objects(docID=RAW['docID'], source='wwe_news')
    assert queryset[0].source == 'wwe_news'
    assert len(queryset) == 1
    scrapi.processing.elasticsearch.es = real_es
Example #5
def test_migrate_v2():
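    # Coerce the raw document body to UTF-8 bytes before saving it with the old
    # model; if it cannot be encoded (e.g. it is already bytes), fall back to
    # its string representation.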
    try:
        RAW['doc'] = RAW['doc'].encode('utf-8')
    except AttributeError:
        RAW['doc'] = str(RAW['doc'])
    DocumentModelOld.create(**RAW.attributes).save()
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 0
    tasks.migrate_to_source_partition(dry=False)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
def test_renormalize():
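    # Store a record, then re-run normalization for its source via the
    # renormalize migration; the document count should be unchanged.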
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1

    tasks.migrate(renormalize, source=RAW['source'])
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
    scrapi.processing.elasticsearch.es = real_es
Example #7
def test_renormalize():
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1

    tasks.migrate(renormalize, source=RAW['source'])
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
    scrapi.processing.elasticsearch.es = real_es
Example #8
def test_delete():
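    # Store a record, then run the delete migration for its source and verify
    # the document is gone from the source-partitioned model.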
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1

    tasks.migrate(delete, sources=[RAW['source']], dry=False)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 0
    scrapi.processing.elasticsearch.es = real_es
Example #9
def pytest_runtest_setup(item):
    TIMEOUT = 20

    marker = item.get_marker('cassandra')
    if marker is not None:
        from scrapi.processing.cassandra import DocumentModel
        if not database.setup():
            pytest.skip('No connection to Cassandra')

        start = time.time()
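        # Poll Cassandra with a trivial query until it answers or TIMEOUT
        # seconds elapse; any error other than NoHostAvailable means the
        # cluster is reachable enough for the tests to proceed.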
        while True:
            try:
                DocumentModel.all().limit(1).get()
                break
            except NoHostAvailable as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
            except Exception:
                break

    marker = item.get_marker('elasticsearch')
    if marker is not None:
        if not use_es:
            pytest.skip('No connection to Elasticsearch')
        con.indices.create(index='test', body={}, ignore=400)

        # This is done to let the test index finish being created before connecting to search
        start = time.time()
        while True:
            try:
                scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es.search(
                    index='test')
                break
            except TransportError as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
Example #10
def pytest_runtest_setup(item):
    TIMEOUT = 20

    marker = item.get_marker('cassandra')
    if marker is not None:
        from scrapi.processing.cassandra import DocumentModel
        if not database.setup():
            pytest.skip('No connection to Cassandra')

        start = time.time()
        while True:
            try:
                DocumentModel.all().limit(1).get()
                break
            except NoHostAvailable as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
            except Exception:
                break


    marker = item.get_marker('elasticsearch')
    if marker is not None:
        if not use_es:
            pytest.skip('No connection to Elasticsearch')
        con.indices.create(index='test', body={}, ignore=400)

        # This is done to let the test index finish being created before connecting to search
        start = time.time()
        while True:
            try:
                scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es.search(index='test')
                break
            except TransportError as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
def test_versions():
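    # Normalize the same document twice with a changed title: the live row
    # should carry the new title while the previous title is preserved in a
    # linked VersionModel row. Re-processing an identical record should not
    # add another version.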
    test_db.process_normalized(RAW, NORMALIZED)
    queryset = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])

    assert len(queryset) == 1

    old_title = NORMALIZED["title"]

    NORMALIZED["title"] = "some new title"
    test_db.process_normalized(RAW, NORMALIZED)
    doc = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])[0]
    assert doc.title == "some new title"
    assert len(doc.versions) == 1

    version = VersionModel.objects(key=doc.versions[-1])[0]

    assert version.title == old_title

    test_db.process_normalized(RAW, NORMALIZED)
    doc = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])[0]
    assert doc.title == "some new title"
    assert len(doc.versions) == 1
def test_versions():
    test_db.process_normalized(RAW, NORMALIZED)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])

    assert len(queryset) == 1

    old_title = NORMALIZED['title']

    NORMALIZED['title'] = 'some new title'
    test_db.process_normalized(RAW, NORMALIZED)
    doc = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])[0]
    assert doc.title == 'some new title'
    assert len(doc.versions) == 1

    version = VersionModel.objects(key=doc.versions[-1])[0]

    assert version.title == old_title

    test_db.process_normalized(RAW, NORMALIZED)
    doc = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])[0]
    assert doc.title == 'some new title'
    assert len(doc.versions) == 1
def test_process_normalized():
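    # Normalizing a record should store its title on the DocumentModel row.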
    test_db.process_normalized(RAW, NORMALIZED)
    queryset = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])

    assert queryset[0].title == utils.RECORD["title"]
def test_process_raw():
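    # Processing a raw record should create exactly one DocumentModel row for
    # the document's ID and source.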
    test_db.process_raw(RAW)
    queryset = DocumentModel.objects(docID="someID", source=RAW["source"])
    assert len(queryset) == 1
def test_process_normalized():
    test_db.process_normalized(RAW, NORMALIZED)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])

    assert queryset[0].title == utils.RECORD['title']
def test_process_raw():
    test_db.process_raw(RAW)
    queryset = DocumentModel.objects(docID='someID', source=RAW['source'])
    assert len(queryset) == 1