def rename(docs, target=None, **kwargs):
    """Re-process every document in ``docs`` under the source name ``target``.

    For each document a ``RawDocument`` is built from the document's fields
    with ``source`` replaced by ``target``.  Unless a truthy ``dry`` keyword
    is given, that copy is passed through ``tasks.process_raw`` and
    ``tasks.process_normalized``, and the entry filed under the old source
    name is then deleted from the ``settings.ELASTIC_INDEX`` and
    ``share_v1`` indices.

    Args:
        docs: iterable of documents exposing ``doc``, ``docID``, ``source``,
            ``filetype``, ``timestamps`` and ``versions`` attributes.
        target: the new source name; must be truthy.
        **kwargs: only ``dry`` is read.  When truthy, the ``RawDocument`` is
            still constructed for every document but nothing is processed
            or deleted.

    Raises:
        AssertionError: if ``target`` is falsy, or if a document's current
            source already equals ``target``.
    """
    # Raised explicitly rather than through ``assert``: assert statements are
    # removed under ``python -O``, and without these guards a same-name rename
    # would process a document and then delete the entry it just wrote.
    if not target:
        raise AssertionError("To run this migration you need a target.")

    dry = kwargs.get('dry')
    for doc in docs:
        # Checked before any work is done for this document.
        if doc.source == target:
            raise AssertionError(
                "Can't rename {} to {}, names are the same.".format(doc.source, target))

        raw = RawDocument({
            'doc': doc.doc,
            'docID': doc.docID,
            'source': target,
            'filetype': doc.filetype,
            'timestamps': doc.timestamps,
            'versions': doc.versions
        })

        if dry:
            continue

        tasks.process_raw(raw)
        tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
        logger.info('Processed document from {} with id {}'.format(doc.source, raw['docID']))

        # ignore=[404]: an entry that is already absent must not abort the run.
        es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.source, id=raw['docID'], ignore=[404])
        es.delete(index='share_v1', doc_type=doc.source, id=raw['docID'], ignore=[404])
        # NOTE(review): logged only in the non-dry branch so a dry run never
        # reports a deletion it did not perform -- confirm this matches the
        # intended dry-run output.
        logger.info('Deleted document from {} with id {}'.format(doc.source, raw['docID']))
def rename(docs, target=None, **kwargs):
    """Re-process every document in ``docs`` under the source name ``target``.

    For each document the attributes of ``doc.raw`` are deep-copied, the
    copy's ``source`` is set to ``target`` and a ``RawDocument`` is built
    from it with ``validate=False``.  Unless a truthy ``dry`` keyword is
    given, that document is passed through ``tasks.process_raw`` and
    ``tasks.process_normalized``, and the entry filed under the old source
    name is deleted from the ``settings.ELASTIC_INDEX`` and ``share_v1``
    indices through the ``'elasticsearch'`` processor's client.

    Args:
        docs: iterable of documents whose ``raw.attributes`` mapping holds
            at least a ``source`` key.
        target: the new source name; must be truthy.
        **kwargs: only ``dry`` is read.  When truthy, the ``RawDocument`` is
            still constructed for every document but nothing is processed
            or deleted.

    Raises:
        AssertionError: if ``target`` is falsy, or if a document's current
            source already equals ``target``.
    """
    # Raised explicitly rather than through ``assert``: assert statements are
    # removed under ``python -O``, and without these guards a same-name rename
    # would process a document and then delete the entry it just wrote.
    if not target:
        raise AssertionError("To run this migration you need a target.")

    dry = kwargs.get('dry')
    for doc in docs:
        # Read the old name once so the check, the log lines and the deletes
        # all use the same value.
        source = doc.raw.attributes['source']
        if source == target:
            raise AssertionError(
                "Can't rename {} to {}, names are the same.".format(source, target))

        new_doc = copy.deepcopy(doc.raw.attributes)
        new_doc['source'] = target
        raw = RawDocument(new_doc, validate=False)

        if dry:
            continue

        tasks.process_raw(raw)
        tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
        logger.info('Processed document from {} with id {}'.format(source, raw['docID']))

        es_processor = get_processor('elasticsearch')
        for index in (settings.ELASTIC_INDEX, 'share_v1'):
            # ignore=[404]: an entry that is already absent must not abort the run.
            es_processor.manager.es.delete(
                index=index, doc_type=source, id=raw['docID'], ignore=[404])
        # NOTE(review): logged only in the non-dry branch so a dry run never
        # reports a rename it did not perform -- confirm this matches the
        # intended dry-run output.
        logger.info('Renamed document from {} to {} with id {}'.format(
            source, target, raw['docID']))
def rename(source, target, dry=True):
    """Re-process every document of ``source`` under the name ``target``.

    Each document yielded by ``documents(source)`` is rebuilt as a
    ``RawDocument`` whose ``source`` is ``target``.  When ``dry`` is false
    the copy is passed through ``process_raw`` and ``process_normalized``
    and, if that succeeded, the entry filed under ``source`` is deleted from
    the ``settings.ELASTIC_INDEX`` and ``share_v1`` indices.  A failure
    while building or processing one document is logged and collected, and
    the loop moves on to the next document.

    Args:
        source: current source name.
        target: new source name.
        dry: when true (the default) documents are only rebuilt; nothing is
            processed or deleted.

    Raises:
        AssertionError: if ``source`` equals ``target``.
    """
    # Raised explicitly rather than through ``assert``: assert statements are
    # removed under ``python -O``, and a same-name rename would delete the
    # entries it has just written.
    if source == target:
        raise AssertionError(
            "Can't rename {} to {}, names are the same".format(source, target))

    count = 0
    exceptions = []

    for doc in documents(source):
        count += 1
        try:
            raw = RawDocument({
                'doc': doc.doc,
                'docID': doc.docID,
                'source': target,
                'filetype': doc.filetype,
                'timestamps': doc.timestamps,
                'versions': doc.versions
            })
            if not dry:
                process_raw(raw)
                process_normalized(normalize(raw, raw['source']), raw)
                logger.info('Processed document from {} with id {}'.format(source, raw['docID']))
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)
        else:
            # The old entry is removed only after the copy was processed
            # without error.
            if not dry:
                es.delete(index=settings.ELASTIC_INDEX, doc_type=source, id=raw['docID'], ignore=[404])
                es.delete(index='share_v1', doc_type=source, id=raw['docID'], ignore=[404])
                logger.info('Deleted document from {} with id {}'.format(source, raw['docID']))

    if dry:
        logger.info('Dry run complete')

    # Summary of the collected failures.  The loop variable is ``ex``; the
    # earlier code logged ``e``, which is not the current item (and is
    # unbound after the ``except`` block on Python 3).  ``error`` is used
    # because no exception is being handled here; the traceback was already
    # logged where the failure happened.
    for ex in exceptions:
        logger.error(ex)
    logger.info('{} documents processed, with {} exceptions'.format(count, len(exceptions)))
def test_process_norm_calls(raw_doc, monkeypatch):
    """``tasks.process_normalized`` delegates to the processing module.

    ``scrapi.tasks.processing.process_normalized`` is replaced by a mock;
    after calling the task with ``raw_doc`` for both arguments, the mock
    must have been called exactly once with those two documents and an
    empty dict.
    """
    processing_stub = mock.Mock()
    monkeypatch.setattr(
        'scrapi.tasks.processing.process_normalized', processing_stub)

    tasks.process_normalized(raw_doc, raw_doc)

    processing_stub.assert_called_once_with(raw_doc, raw_doc, {})
def test_process_norm_calls(raw_doc, monkeypatch):
    """Check the arguments ``tasks.process_normalized`` hands to processing.

    The target ``scrapi.tasks.processing.process_normalized`` is swapped
    for a mock, the task is invoked with ``raw_doc`` twice, and the mock is
    expected to record a single call of ``(raw_doc, raw_doc, {})``.
    """
    patched_target = "scrapi.tasks.processing.process_normalized"
    recorder = mock.Mock()
    monkeypatch.setattr(patched_target, recorder)

    tasks.process_normalized(raw_doc, raw_doc)

    recorder.assert_called_once_with(raw_doc, raw_doc, {})
def renormalize(doc, **kwargs):
    """Rebuild a ``RawDocument`` from ``doc`` and normalize it again.

    The raw document is assembled from the ``doc``, ``docID``, ``source``,
    ``filetype``, ``timestamps`` and ``versions`` attributes of ``doc``.  It
    is always constructed; it is passed through ``tasks.normalize`` and
    ``tasks.process_normalized`` only when no truthy ``dry`` keyword is
    given.
    """
    # Same keys, in the same order, as the attributes they are copied from.
    field_names = ('doc', 'docID', 'source', 'filetype', 'timestamps', 'versions')
    raw = RawDocument({name: getattr(doc, name) for name in field_names})

    if kwargs.get('dry'):
        return

    normalized = tasks.normalize(raw, raw['source'])
    tasks.process_normalized(normalized, raw)
def rename(docs, target=None, **kwargs):
    """Move every document in ``docs`` to the source name ``target``.

    Per document: the ``doc.raw.attributes`` mapping is deep-copied, its
    ``source`` is overwritten with ``target`` and a ``RawDocument`` is
    created from the copy with ``validate=False``.  Without a truthy ``dry``
    keyword the new document then goes through ``tasks.process_raw`` and
    ``tasks.process_normalized``, after which the entry stored under the
    old source name is deleted from the ``settings.ELASTIC_INDEX`` and
    ``share_v1`` indices using the ``'elasticsearch'`` processor's client.

    Args:
        docs: iterable of documents whose ``raw.attributes`` mapping holds
            at least a ``source`` key.
        target: the new source name; must be truthy.
        **kwargs: only ``dry`` is read.  When truthy, each ``RawDocument``
            is still built but nothing is processed or deleted.

    Raises:
        AssertionError: if ``target`` is falsy, or if a document's current
            source already equals ``target``.
    """
    # Explicit raises instead of ``assert``: assertions are stripped under
    # ``python -O``, and these guards are what prevents a same-name rename
    # from deleting the entry it has just written.
    if not target:
        raise AssertionError("To run this migration you need a target.")

    dry = kwargs.get('dry')
    for doc in docs:
        # One lookup of the old name, shared by the check, the log lines and
        # the deletes.
        old_source = doc.raw.attributes['source']
        if old_source == target:
            raise AssertionError(
                "Can't rename {} to {}, names are the same.".format(old_source, target))

        new_doc = copy.deepcopy(doc.raw.attributes)
        new_doc['source'] = target
        raw = RawDocument(new_doc, validate=False)

        if dry:
            continue

        tasks.process_raw(raw)
        tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
        logger.info('Processed document from {} with id {}'.format(old_source, raw['docID']))

        es_processor = get_processor('elasticsearch')
        for index in (settings.ELASTIC_INDEX, 'share_v1'):
            # ignore=[404]: an entry that is already absent must not abort the run.
            es_processor.manager.es.delete(
                index=index, doc_type=old_source, id=raw['docID'], ignore=[404])
        # NOTE(review): logged only in the non-dry branch so a dry run never
        # reports a rename it did not perform -- confirm this matches the
        # intended dry-run output.
        logger.info('Renamed document from {} to {} with id {}'.format(
            old_source, target, raw['docID']))
def renormalize(sources=()):
    """Normalize again every document yielded by ``documents(*sources)``.

    Each document is rebuilt as a ``RawDocument`` from its ``doc``,
    ``docID``, ``source``, ``filetype``, ``timestamps`` and ``versions``
    attributes and passed through ``normalize`` and ``process_normalized``.
    A failure on one document is logged and collected, and the loop moves on
    to the next document.  A summary is logged at the end.

    Args:
        sources: sequence unpacked into the ``documents`` call; empty by
            default.
    """
    count = 0
    exceptions = []

    for doc in documents(*sources):
        count += 1
        try:
            raw = RawDocument({
                'doc': doc.doc,
                'docID': doc.docID,
                'source': doc.source,
                'filetype': doc.filetype,
                'timestamps': doc.timestamps,
                'versions': doc.versions
            })
            process_normalized(normalize(raw, raw['source']), raw)
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)

    # Summary of the collected failures.  The loop variable is ``ex``; the
    # earlier code logged ``e``, which is not the current item (and is
    # unbound after the ``except`` block on Python 3).  ``error`` is used
    # because no exception is being handled here; the traceback was already
    # logged where the failure happened.
    for ex in exceptions:
        logger.error(ex)
    logger.info('{} documents processed, with {} exceptions'.format(count, len(exceptions)))
def rename(doc, target=None, **kwargs):
    """Re-process a single document under the source name ``target``.

    A ``RawDocument`` is built from the fields of ``doc`` with ``source``
    replaced by ``target``.  Unless a truthy ``dry`` keyword is given, that
    copy is passed through ``tasks.process_raw`` and
    ``tasks.process_normalized``, and the entry filed under the old source
    name is then deleted from the ``settings.ELASTIC_INDEX`` and
    ``share_v1`` indices.

    Args:
        doc: document exposing ``doc``, ``docID``, ``source``, ``filetype``,
            ``timestamps`` and ``versions`` attributes.
        target: the new source name; must be truthy.
        **kwargs: only ``dry`` is read.  When truthy, the ``RawDocument`` is
            still constructed but nothing is processed or deleted.

    Raises:
        AssertionError: if ``target`` is falsy, or if ``doc.source`` already
            equals ``target``.
    """
    # Raised explicitly rather than through ``assert``: assert statements are
    # removed under ``python -O``, and without these guards a same-name rename
    # would process the document and then delete the entry it just wrote.
    if not target:
        raise AssertionError("To run this migration you need a target.")
    if doc.source == target:
        raise AssertionError(
            "Can't rename {} to {}, names are the same.".format(doc.source, target))

    raw = RawDocument({
        'doc': doc.doc,
        'docID': doc.docID,
        'source': target,
        'filetype': doc.filetype,
        'timestamps': doc.timestamps,
        'versions': doc.versions
    })

    if kwargs.get('dry'):
        return

    tasks.process_raw(raw)
    tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
    logger.info('Processed document from {} with id {}'.format(doc.source, raw['docID']))

    # ignore=[404]: an entry that is already absent must not abort the run.
    es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.source, id=raw['docID'], ignore=[404])
    es.delete(index='share_v1', doc_type=doc.source, id=raw['docID'], ignore=[404])
    # NOTE(review): logged only in the non-dry branch so a dry run never
    # reports a deletion it did not perform -- confirm this matches the
    # intended dry-run output.
    logger.info('Deleted document from {} with id {}'.format(doc.source, raw['docID']))
def main():
    """Normalize and process every raw document from ``document_generator``.

    Each document is normalized with its own ``source`` entry and the result
    is passed to ``process_normalized``.  Any exception raised for a
    document is logged and the loop continues with the next one.
    """
    for raw in document_generator():
        try:
            normalized = normalize(raw, raw['source'])
            process_normalized(normalized, raw)
        except Exception as err:
            logger.exception(err)
def renormalize(docs, *args, **kwargs):
    """Normalize again the raw document attached to each item of ``docs``.

    ``docs`` is always iterated to the end.  For every item, unless a truthy
    ``dry`` keyword was given, ``item.raw`` is normalized with its own
    ``source`` entry and passed to ``tasks.process_normalized``.  Extra
    positional arguments are accepted and ignored.
    """
    dry_run = kwargs.get('dry')
    for document in docs:
        if dry_run:
            continue
        normalized = tasks.normalize(document.raw, document.raw['source'])
        tasks.process_normalized(normalized, document.raw)
def renormalize(docs, *args, **kwargs):
    """Run normalization once more over the ``raw`` document of every item.

    Every item of ``docs`` is visited.  When the ``dry`` keyword is absent
    or falsy, the item's ``raw`` document is normalized using its ``source``
    entry and the result is passed to ``tasks.process_normalized`` together
    with that raw document.  Positional extras are accepted but unused.
    """
    should_process = not kwargs.get('dry')
    for entry in docs:
        if should_process:
            renormalized = tasks.normalize(entry.raw, entry.raw['source'])
            tasks.process_normalized(renormalized, entry.raw)