Ejemplo n.º 1
0
def rename(docs, target=None, **kwargs):
    """Re-process each document in ``docs`` under the source name ``target``.

    For every document a deep copy of its raw attributes is made, the
    ``source`` field of the copy is replaced by ``target`` and the copy is
    wrapped in an unvalidated ``RawDocument``.  Unless ``dry`` is passed as a
    truthy keyword argument, the copy is processed and normalized, and the
    entries stored under the old source are deleted from both
    ``settings.ELASTIC_INDEX`` and ``'share_v1'``.

    The final "Renamed" log line is emitted in dry mode as well.

    :param docs: iterable of documents exposing a ``raw`` attribute
    :param target: new source name; must be truthy
    :param kwargs: only ``dry`` is read
    :raises AssertionError: if ``target`` is missing or equals a document's
        current source
    """
    assert target, "To run this migration you need a target."

    for doc in docs:
        # Deep copy so that changing the source never touches doc.raw.
        attributes = copy.deepcopy(doc.raw.attributes)
        attributes['source'] = target
        raw = RawDocument(attributes, validate=False)

        old_source = doc.raw.attributes['source']
        assert old_source != target, (
            "Can't rename {} to {}, names are the same.".format(
                doc.raw['source'], target))

        if not kwargs.get('dry'):
            tasks.process_raw(raw)
            normalized = tasks.normalize(raw, raw['source'])
            tasks.process_normalized(normalized, raw)
            logger.info('Processed document from {} with id {}'.format(
                old_source, raw['docID']))

            # Drop the entry stored under the old source from both indices;
            # 404 responses are ignored.
            es = get_processor('elasticsearch').manager.es
            for index in (settings.ELASTIC_INDEX, 'share_v1'):
                es.delete(
                    index=index,
                    doc_type=old_source,
                    id=raw['docID'],
                    ignore=[404])

        logger.info('Renamed document from {} to {} with id {}'.format(
            old_source, target, raw['docID']))
Ejemplo n.º 2
0
def rename(docs, target=None, **kwargs):
    """Re-process each document in ``docs`` under the source name ``target``.

    A new ``RawDocument`` carrying the document's fields and the new source
    is built for every document.  Unless ``dry`` is passed as a truthy
    keyword argument, it is processed and normalized, and the entries stored
    under the old source are deleted from ``settings.ELASTIC_INDEX`` and
    ``'share_v1'``.

    The final "Deleted" log line is emitted in dry mode as well.

    :param docs: iterable of documents exposing ``doc``, ``docID``,
        ``source``, ``filetype``, ``timestamps`` and ``versions``
    :param target: new source name; must be truthy
    :param kwargs: only ``dry`` is read
    :raises AssertionError: if ``target`` is missing or equals a document's
        current source
    """
    assert target, "To run this migration you need a target."
    for doc in docs:
        raw = RawDocument(dict(
            doc=doc.doc,
            docID=doc.docID,
            source=target,
            filetype=doc.filetype,
            timestamps=doc.timestamps,
            versions=doc.versions,
        ))

        old_source = doc.source
        assert old_source != target, (
            "Can't rename {} to {}, names are the same.".format(
                doc.source, target))

        if not kwargs.get('dry'):
            tasks.process_raw(raw)
            normalized = tasks.normalize(raw, raw['source'])
            tasks.process_normalized(normalized, raw)
            logger.info('Processed document from {} with id {}'.format(
                old_source, raw['docID']))

            # Remove the old-source entry from both indices; 404s are ignored.
            for index in (settings.ELASTIC_INDEX, 'share_v1'):
                es.delete(index=index,
                          doc_type=old_source,
                          id=raw['docID'],
                          ignore=[404])

        logger.info('Deleted document from {} with id {}'.format(
            old_source, raw['docID']))
Ejemplo n.º 3
0
def rename(source, target, dry=True):
    """Re-process every document of ``source`` under the name ``target``.

    Each document returned by ``documents(source)`` is rebuilt as a
    ``RawDocument`` whose source is ``target``.  When ``dry`` is false the
    new document is processed and normalized and, if that succeeded, the
    entries stored under ``source`` are deleted from
    ``settings.ELASTIC_INDEX`` and ``'share_v1'``.  The "Processed" and
    "Deleted" log lines are emitted in dry mode as well.

    Failures on a single document are logged and collected; they do not stop
    the run.  A summary with the document and exception counts is logged at
    the end.

    :param source: current source name
    :param target: new source name; must differ from ``source``
    :param dry: when true (the default) nothing is processed or deleted
    :raises AssertionError: if ``source`` equals ``target``
    """
    assert source != target, "Can't rename {} to {}, names are the same".format(source, target)
    count = 0
    exceptions = []

    for doc in documents(source):
        count += 1
        try:
            raw = RawDocument({
                'doc': doc.doc,
                'docID': doc.docID,
                'source': target,
                'filetype': doc.filetype,
                'timestamps': doc.timestamps,
                'versions': doc.versions
            })
            if not dry:
                process_raw(raw)
                process_normalized(normalize(raw, raw['source']), raw)
            logger.info('Processed document from {} with id {}'.format(source, raw['docID']))
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)
        else:
            # Only remove the old entries once the new document went through.
            if not dry:
                es.delete(index=settings.ELASTIC_INDEX, doc_type=source, id=raw['docID'], ignore=[404])
                es.delete(index='share_v1', doc_type=source, id=raw['docID'], ignore=[404])
            logger.info('Deleted document from {} with id {}'.format(source, raw['docID']))
    if dry:
        logger.info('Dry run complete')

    # Log each collected exception. The loop variable must be used here: the
    # name bound by ``except ... as e`` is unbound after the handler on
    # Python 3 (and on Python 2 would only ever refer to the last failure).
    for ex in exceptions:
        logger.exception(ex)
    logger.info('{} documents processed, with {} exceptions'.format(count, len(exceptions)))
Ejemplo n.º 4
0
def process_one(harvester_name, harvester, raw_path):
    """Rebuild the documents for one stored raw file and queue its processing.

    The ``/``-separated ``raw_path`` supplies two values: its second-to-last
    segment is parsed as the harvest date and its third-to-last segment is
    base64-decoded into the document id.  The normalized document is loaded
    from a ``normalized.json`` file next to the raw file; if that fails for
    any reason, the raw document is normalized again instead.

    :param harvester_name: used as the document source and for normalizing
    :param harvester: mapping that provides ``'fileFormat'``
    :param raw_path: path of the raw file inside ``store``
    """
    path_parts = raw_path.split('/')

    harvest_date = parser.parse(path_parts[-2])
    timestamp = harvest_date.isoformat()

    raw_file = store.get_as_string(raw_path)

    raw_doc = RawDocument(dict(
        doc=raw_file,
        timestamps=dict(harvestFinished=timestamp),
        docID=b64decode(path_parts[-3]).decode('utf-8'),
        source=harvester_name,
        filetype=harvester['fileFormat'],
    ))

    try:
        # Same directory as the raw file, with the file name swapped out.
        normalized_path = '/'.join(path_parts[:-1] + ['normalized.json'])
        with open(normalized_path, 'r') as f:
            normalized = NormalizedDocument(json.load(f))
    except Exception:
        # Best effort: fall back to normalizing the raw document.
        normalized = normalize(raw_doc, harvester_name)

    # The three steps are chained with ``|`` and dispatched asynchronously.
    pipeline = (
        process_to_cassandra.si(raw_doc, normalized) |
        process_to_elasticsearch.si(raw_doc, normalized) |
        move_to_backup.si(raw_path)
    )
    pipeline.apply_async()
Ejemplo n.º 5
0
def renormalize(doc, **kwargs):
    """Normalize ``doc`` again and process the result.

    A ``RawDocument`` is always built from the document's fields; the
    normalize-and-process step is skipped when ``dry`` is passed as a truthy
    keyword argument.

    :param doc: document exposing ``doc``, ``docID``, ``source``,
        ``filetype``, ``timestamps`` and ``versions``
    :param kwargs: only ``dry`` is read
    """
    raw = RawDocument(dict(
        doc=doc.doc,
        docID=doc.docID,
        source=doc.source,
        filetype=doc.filetype,
        timestamps=doc.timestamps,
        versions=doc.versions,
    ))
    if kwargs.get('dry'):
        return

    normalized = tasks.normalize(raw, raw['source'])
    tasks.process_normalized(normalized, raw)
Ejemplo n.º 6
0
def rename(docs, target=None, **kwargs):
    """Move every document in ``docs`` to the source name ``target``.

    Each document's raw attributes are deep-copied, the copy's ``source`` is
    set to ``target`` and an unvalidated ``RawDocument`` is built from it.
    Unless a truthy ``dry`` keyword argument is given, that document is
    processed and normalized, after which the entries filed under the
    previous source are deleted from ``settings.ELASTIC_INDEX`` and
    ``'share_v1'``.

    The closing "Renamed" message is logged for dry runs too.

    :param docs: iterable of documents exposing a ``raw`` attribute
    :param target: new source name; must be truthy
    :param kwargs: only ``dry`` is read
    :raises AssertionError: if ``target`` is missing or equals a document's
        current source
    """
    assert target, "To run this migration you need a target."

    for doc in docs:
        # The copy keeps doc.raw.attributes itself unmodified.
        renamed_attrs = copy.deepcopy(doc.raw.attributes)
        renamed_attrs['source'] = target
        raw = RawDocument(renamed_attrs, validate=False)

        previous = doc.raw.attributes['source']
        assert previous != target, (
            "Can't rename {} to {}, names are the same.".format(doc.raw['source'], target))

        if not kwargs.get('dry'):
            tasks.process_raw(raw)
            normalized = tasks.normalize(raw, raw['source'])
            tasks.process_normalized(normalized, raw)
            logger.info('Processed document from {} with id {}'.format(previous, raw['docID']))

            # Delete the entry filed under the previous source; 404s are ignored.
            client = get_processor('elasticsearch').manager.es
            for index_name in (settings.ELASTIC_INDEX, 'share_v1'):
                client.delete(index=index_name, doc_type=previous, id=raw['docID'], ignore=[404])

        logger.info('Renamed document from {} to {} with id {}'.format(previous, target, raw['docID']))
Ejemplo n.º 7
0
def renormalize(sources=()):
    """Normalize and process again every document of the given sources.

    Each document returned by ``documents(*sources)`` is rebuilt as a
    ``RawDocument``, normalized and processed.  A failure on one document is
    logged and collected without stopping the run; a summary with the
    document and exception counts is logged at the end.

    :param sources: source names, unpacked into the ``documents`` call
    """
    count = 0
    exceptions = []
    for doc in documents(*sources):
        count += 1
        try:
            raw = RawDocument({
                'doc': doc.doc,
                'docID': doc.docID,
                'source': doc.source,
                'filetype': doc.filetype,
                'timestamps': doc.timestamps,
                'versions': doc.versions
            })
            process_normalized(normalize(raw, raw['source']), raw)
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)

    # Log each collected exception. The loop variable must be used here: the
    # name bound by ``except ... as e`` is unbound after the handler on
    # Python 3 (and on Python 2 would only ever refer to the last failure).
    for ex in exceptions:
        logger.exception(ex)
    logger.info('{} documents processed, with {} exceptions'.format(count, len(exceptions)))
Ejemplo n.º 8
0
def rename(doc, target=None, **kwargs):
    """Re-process a single document under the source name ``target``.

    A ``RawDocument`` carrying the document's fields and the new source is
    built.  Unless ``dry`` is passed as a truthy keyword argument, it is
    processed and normalized, and the entries stored under the old source
    are deleted from ``settings.ELASTIC_INDEX`` and ``'share_v1'``.

    The final "Deleted" log line is emitted in dry mode as well.

    :param doc: document exposing ``doc``, ``docID``, ``source``,
        ``filetype``, ``timestamps`` and ``versions``
    :param target: new source name; must be truthy
    :param kwargs: only ``dry`` is read
    :raises AssertionError: if ``target`` is missing or equals the
        document's current source
    """
    assert target, "To run this migration you need a target."

    raw = RawDocument(dict(
        doc=doc.doc,
        docID=doc.docID,
        source=target,
        filetype=doc.filetype,
        timestamps=doc.timestamps,
        versions=doc.versions,
    ))

    old_source = doc.source
    assert old_source != target, (
        "Can't rename {} to {}, names are the same.".format(doc.source, target))

    if not kwargs.get('dry'):
        tasks.process_raw(raw)
        normalized = tasks.normalize(raw, raw['source'])
        tasks.process_normalized(normalized, raw)
        logger.info('Processed document from {} with id {}'.format(old_source, raw['docID']))

        # Remove the old-source entry from both indices; 404s are ignored.
        for index in (settings.ELASTIC_INDEX, 'share_v1'):
            es.delete(index=index, doc_type=old_source, id=raw['docID'], ignore=[404])

    logger.info('Deleted document from {} with id {}'.format(old_source, raw['docID']))
Ejemplo n.º 9
0
def main():
    """Normalize and process every raw document from ``document_generator``.

    A failure on one document is logged with its traceback and the loop
    moves on to the next document.
    """
    for raw_doc in document_generator():
        try:
            normalized = normalize(raw_doc, raw_doc['source'])
            process_normalized(normalized, raw_doc)
        except Exception as err:
            logger.exception(err)
Ejemplo n.º 10
0
def renormalize(docs, *args, **kwargs):
    """Normalize and process again the raw form of every document in ``docs``.

    ``docs`` is always iterated to the end; when ``dry`` is passed as a
    truthy keyword argument each document is skipped without any work.

    :param docs: iterable of documents exposing a ``raw`` attribute
    :param args: accepted and ignored
    :param kwargs: only ``dry`` is read
    """
    for doc in docs:
        if kwargs.get('dry'):
            continue
        normalized = tasks.normalize(doc.raw, doc.raw['source'])
        tasks.process_normalized(normalized, doc.raw)
Ejemplo n.º 11
0
def renormalize(docs, *args, **kwargs):
    """Re-run normalization and processing for each document's raw form.

    Every element of ``docs`` is visited; a truthy ``dry`` keyword argument
    turns each visit into a no-op.

    :param docs: iterable of documents exposing a ``raw`` attribute
    :param args: accepted and ignored
    :param kwargs: only ``dry`` is read
    """
    for document in docs:
        if kwargs.get('dry'):
            continue
        renormalized = tasks.normalize(document.raw, document.raw['source'])
        tasks.process_normalized(renormalized, document.raw)