def test_migrate_v2():
    """An old-model document appears in the new source partition only after migration."""
    DocumentModelOld.create(**RAW.attributes).save()

    # Not yet migrated: the new partition must be empty for this doc.
    before = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(before) == 0

    tasks.migrate_to_source_partition(dry=False)

    # Migration copied exactly one document into the new partition.
    after = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(after) == 1
def test_migrate_v2():
    """An old-model document appears in the new source partition only after migration.

    Fix: drop the call-style ``assert(...)`` parentheses — ``assert`` is a
    statement, and the paren form invites the always-true tuple-assert bug.
    """
    DocumentModelOld.create(**RAW.attributes).save()
    # Before migration the document must be absent from the new partition.
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 0
    tasks.migrate_to_source_partition(dry=False)
    # After migration exactly one copy exists in the new partition.
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
def test_rename():
    """tasks.migrate(rename) moves a document from its original source to the target.

    Fixes: drop call-style ``assert(...)`` parens; remove the dead
    ``new_record`` deep copy (created and mutated but never used); restore the
    real Elasticsearch handle in a ``finally`` so a failing assertion does not
    leave the module-level ``es`` mocked for later tests.
    """
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    try:
        test_cass.process_raw(RAW)
        test_cass.process_normalized(RAW, NORMALIZED)
        queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
        old_source = NORMALIZED['shareProperties']['source']
        # Sanity: the stored source matches both the fixture record and NORMALIZED.
        assert queryset[0].source == utils.RECORD['shareProperties']['source']
        assert queryset[0].source == old_source
        # Register a harvester under the new name so migrate can resolve the target.
        test_harvester.short_name = 'wwe_news'
        registry['wwe_news'] = test_harvester
        tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)
        # Exactly one document now lives under the renamed source.
        queryset = DocumentModel.objects(docID=RAW['docID'], source='wwe_news')
        assert queryset[0].source == 'wwe_news'
        assert len(queryset) == 1
    finally:
        # Always restore the real ES handle so later tests are unaffected.
        scrapi.processing.elasticsearch.es = real_es
def test_rename():
    """tasks.migrate(rename) moves a document from its original source to the target.

    Fixes: remove the dead ``new_record`` deep copy (built and mutated but
    never used); restore the real Elasticsearch handle in a ``finally`` so a
    failing assertion cannot leave the module-level ``es`` mocked for later
    tests.
    """
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    try:
        test_cass.process_raw(RAW)
        test_cass.process_normalized(RAW, NORMALIZED)
        queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
        old_source = NORMALIZED['shareProperties']['source']
        # Sanity: the stored source matches both the fixture record and NORMALIZED.
        assert queryset[0].source == utils.RECORD['shareProperties']['source']
        assert queryset[0].source == old_source
        # Register a harvester under the new name so migrate can resolve the target.
        test_harvester.short_name = 'wwe_news'
        registry['wwe_news'] = test_harvester
        tasks.migrate(rename, sources=[old_source], target='wwe_news', dry=False)
        # Exactly one document now lives under the renamed source.
        queryset = DocumentModel.objects(docID=RAW['docID'], source='wwe_news')
        assert queryset[0].source == 'wwe_news'
        assert len(queryset) == 1
    finally:
        # Always restore the real ES handle so later tests are unaffected.
        scrapi.processing.elasticsearch.es = real_es
def test_migrate_v2():
    """Migration moves a bytes-coerced old-model document into the new partition."""
    # Coerce the raw payload to bytes; fall back to str() for non-encodable docs
    # (e.g. when it is already bytes and has no .encode).
    try:
        RAW['doc'] = RAW['doc'].encode('utf-8')
    except AttributeError:
        RAW['doc'] = str(RAW['doc'])

    DocumentModelOld.create(**RAW.attributes).save()

    def lookup():
        """Fetch this document's rows from the new source partition."""
        return DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])

    assert len(lookup()) == 0
    tasks.migrate_to_source_partition(dry=False)
    assert len(lookup()) == 1
def test_renormalize():
    """Renormalizing a source leaves exactly one copy of the document.

    Fixes: drop call-style ``assert(...)`` parens; restore the real
    Elasticsearch handle in a ``finally`` so a failing assertion cannot leave
    the module-level ``es`` mocked for later tests.
    """
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    try:
        test_cass.process_raw(RAW)
        test_cass.process_normalized(RAW, NORMALIZED)
        queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
        assert len(queryset) == 1
        tasks.migrate(renormalize, source=RAW['source'])
        # Renormalization must not duplicate or drop the document.
        queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
        assert len(queryset) == 1
    finally:
        # Always restore the real ES handle so later tests are unaffected.
        scrapi.processing.elasticsearch.es = real_es
def test_renormalize():
    """Document count is unchanged by renormalizing its source."""
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    test_cass.process_raw(RAW)
    test_cass.process_normalized(RAW, NORMALIZED)

    def doc_count():
        """Number of stored copies of this document."""
        return len(DocumentModel.objects(docID=RAW['docID'], source=RAW['source']))

    assert doc_count() == 1
    tasks.migrate(renormalize, source=RAW['source'])
    # Renormalization must neither duplicate nor drop the document.
    assert doc_count() == 1
    scrapi.processing.elasticsearch.es = real_es
def test_delete():
    """tasks.migrate(delete) removes every document for the given source.

    Fixes: drop the parenthesized ``assert (...)`` form for the idiomatic
    statement; restore the real Elasticsearch handle in a ``finally`` so a
    failing assertion cannot leave the module-level ``es`` mocked for later
    tests.
    """
    real_es = scrapi.processing.elasticsearch.es
    scrapi.processing.elasticsearch.es = mock.MagicMock()
    try:
        test_cass.process_raw(RAW)
        test_cass.process_normalized(RAW, NORMALIZED)
        queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
        assert len(queryset) == 1
        tasks.migrate(delete, sources=[RAW['source']], dry=False)
        # Deletion must leave no documents for the source.
        queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
        assert len(queryset) == 0
    finally:
        # Always restore the real ES handle so later tests are unaffected.
        scrapi.processing.elasticsearch.es = real_es
def pytest_runtest_setup(item):
    """Per-test setup hook: skip or wait for backing services by marker.

    Tests marked ``cassandra`` are skipped when the database is unreachable,
    otherwise we poll until a query succeeds or TIMEOUT elapses. Tests marked
    ``elasticsearch`` are skipped when ES is unavailable, otherwise we create
    a 'test' index and poll until it is searchable.
    """
    # Maximum seconds to wait for either backend to become responsive.
    TIMEOUT = 20
    marker = item.get_marker('cassandra')
    if marker is not None:
        from scrapi.processing.cassandra import DocumentModel
        if not database.setup():
            pytest.skip('No connection to Cassandra')
        start = time.time()
        # Poll until a trivial query succeeds (cluster reachable) or we time out.
        while True:
            try:
                DocumentModel.all().limit(1).get()
                break
            except NoHostAvailable as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
            except Exception:
                # Any other error (e.g. no rows yet) means the connection itself
                # works, which is all this setup cares about — proceed.
                break
    marker = item.get_marker('elasticsearch')
    if marker is not None:
        if not use_es:
            pytest.skip('No connection to Elasticsearch')
        # ignore=400 tolerates "index already exists" from a previous test run.
        con.indices.create(index='test', body={}, ignore=400)
        # This is done to let the test index finish being created before connecting to search
        start = time.time()
        while True:
            try:
                scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es.search(
                    index='test')
                break
            except TransportError as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
def pytest_runtest_setup(item):
    """Per-test setup hook: gate marked tests on their backing services.

    ``cassandra``-marked tests: skip when the database cannot be set up,
    otherwise poll until a query succeeds or TIMEOUT seconds pass.
    ``elasticsearch``-marked tests: skip when ES is unavailable, otherwise
    create the 'test' index and poll until searches against it succeed.
    """
    # Upper bound, in seconds, on waiting for each backend.
    TIMEOUT = 20
    marker = item.get_marker('cassandra')
    if marker is not None:
        from scrapi.processing.cassandra import DocumentModel
        if not database.setup():
            pytest.skip('No connection to Cassandra')
        start = time.time()
        # Retry the probe query until the cluster answers or the timeout hits.
        while True:
            try:
                DocumentModel.all().limit(1).get()
                break
            except NoHostAvailable as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
            except Exception:
                # Non-connectivity errors (e.g. empty table) still prove the
                # connection is up, so setup is done.
                break
    marker = item.get_marker('elasticsearch')
    if marker is not None:
        if not use_es:
            pytest.skip('No connection to Elasticsearch')
        # ignore=400 makes index creation idempotent across test runs.
        con.indices.create(index='test', body={}, ignore=400)
        # This is done to let the test index finish being created before connecting to search
        start = time.time()
        while True:
            try:
                scrapi.processing.elasticsearch.ElasticsearchProcessor.manager.es.search(index='test')
                break
            except TransportError as e:
                now = time.time()
                if (now - start) > TIMEOUT:
                    raise e
                continue
def test_versions():
    """Changing the title archives one prior version; reprocessing an identical record adds none."""
    test_db.process_normalized(RAW, NORMALIZED)
    matches = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])
    assert len(matches) == 1

    previous_title = NORMALIZED["title"]
    NORMALIZED["title"] = "some new title"
    test_db.process_normalized(RAW, NORMALIZED)

    doc = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])[0]
    assert doc.title == "some new title"
    assert len(doc.versions) == 1

    # The archived version preserves the pre-update title.
    archived = VersionModel.objects(key=doc.versions[-1])[0]
    assert archived.title == previous_title

    # Reprocessing an unchanged record must not create another version.
    test_db.process_normalized(RAW, NORMALIZED)
    doc = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])[0]
    assert doc.title == "some new title"
    assert len(doc.versions) == 1
def test_versions():
    """Changing the title archives one prior version; an identical reprocess adds none.

    Fix: drop the parenthesized ``assert (...)`` form for the idiomatic
    statement — the paren form invites the always-true tuple-assert bug.
    """
    test_db.process_normalized(RAW, NORMALIZED)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert len(queryset) == 1
    old_title = NORMALIZED['title']
    NORMALIZED['title'] = 'some new title'
    test_db.process_normalized(RAW, NORMALIZED)
    doc = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])[0]
    assert doc.title == 'some new title'
    assert len(doc.versions) == 1
    # The archived version preserves the pre-update title.
    version = VersionModel.objects(key=doc.versions[-1])[0]
    assert version.title == old_title
    # Reprocessing an unchanged record must not create another version.
    test_db.process_normalized(RAW, NORMALIZED)
    doc = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])[0]
    assert doc.title == 'some new title'
    assert len(doc.versions) == 1
def test_process_normalized():
    """A processed normalized document keeps the fixture record's title."""
    test_db.process_normalized(RAW, NORMALIZED)
    stored = DocumentModel.objects(docID=RAW["docID"], source=RAW["source"])[0]
    assert stored.title == utils.RECORD["title"]
def test_process_raw():
    """process_raw stores exactly one document under the expected docID."""
    test_db.process_raw(RAW)
    # NOTE(review): 'someID' is presumably the docID the fixture RAW carries —
    # confirm against the RAW fixture definition.
    stored = DocumentModel.objects(docID="someID", source=RAW["source"])
    assert len(stored) == 1
def test_process_normalized():
    """A processed normalized document keeps the fixture record's title.

    Fix: drop the call-style ``assert(...)`` parens — ``assert`` is a
    statement, and the paren form invites the always-true tuple-assert bug.
    """
    test_db.process_normalized(RAW, NORMALIZED)
    queryset = DocumentModel.objects(docID=RAW['docID'], source=RAW['source'])
    assert queryset[0].title == utils.RECORD['title']
def test_process_raw():
    """process_raw stores exactly one document under the expected docID.

    Fix: drop the call-style ``assert(...)`` parens — ``assert`` is a
    statement, and the paren form invites the always-true tuple-assert bug.
    """
    test_db.process_raw(RAW)
    queryset = DocumentModel.objects(docID='someID', source=RAW['source'])
    assert len(queryset) == 1