Example #1
def delete_post_from_solr(post_id):
    logger.info("deleting post with id %d" % post_id)
    try:
        solr = Solr(settings.SOLR_FORUM_URL)
        solr.delete_by_id(post_id)
        solr.commit()
    except SolrException as e:
        logger.error('could not delete post with id %s (%s).' % (post_id, e))
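Example #1 deletes a single document by id and commits right away. The settings.SOLR_FORUM_URL reference suggests a Django project; a minimal sketch of one plausible call site is a post_delete signal handler that keeps the index in line with the database (the Post model and its import path are hypothetical):

from django.db.models.signals import post_delete
from django.dispatch import receiver

from forum.models import Post  # hypothetical model


@receiver(post_delete, sender=Post)
def remove_post_from_index(sender, instance, **kwargs):
    # reuse the helper above so index deletes track database deletes
    delete_post_from_solr(instance.id)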
Example #2
def add_posts_to_solr(posts):
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL, auto_commit=False)

    logger.info("creating XML")
    documents = map(convert_to_solr_document, posts)
    logger.info("posting to Solr")
    solr.add(documents)

    solr.commit()
    logger.info("optimizing solr index")
    #solr.optimize()
    logger.info("done")
Example #3
def send_posts_to_solr(posts):
    logger.info("adding forum posts to solr index")
    logger.info("creating XML")
    documents = [convert_to_solr_document(p) for p in posts]

    try:
        logger.info("posting to Solr")
        solr = Solr(settings.SOLR_FORUM_URL)

        solr.add(documents)

        solr.commit()
    except SolrException as e:
        logger.error("failed to add posts to solr index, reason: %s" % str(e))
    logger.info("done")
Example #4
def main(config):
    cfg = cliconfig(config)
    session = SessionFactory(cfg['database']['url']).create()
    server = Solr(str(cfg['solr']['url']),
        http_user=cfg['solr'].get('username'),
        http_pass=cfg['solr'].get('password'))

    documents = []
    # use isnot(None): a plain "is not None" is evaluated by Python when the
    # expression is built, so it would not filter any rows
    q = session.query(Address).filter(Address.prefecture.isnot(None))
    q = q.order_by(Address.zipcode)
    # ifilter/imap are the Python 2 itertools names; this drops falsy results
    for r in ifilter(lambda r: r, imap(transform, q)):
        documents.append(r)
        if len(documents) >= COMMIT_UNIT:
            server.add_many(documents)
            documents = []
    if len(documents) > 0:
        server.add_many(documents)
    server.commit()
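The loop above flushes every COMMIT_UNIT documents so client-side memory stays bounded. The same idea as a standalone generator (a hypothetical helper, not part of the source):

def batched(iterable, size):
    '''Yield lists of at most `size` items from `iterable`.'''
    batch = []
    for item in iterable:
        batch.append(item)
        if len(batch) >= size:
            yield batch
            batch = []
    if batch:
        yield batch

# usage sketch:
#     for chunk in batched(documents, COMMIT_UNIT):
#         server.add_many(chunk)
#     server.commit()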
Example #5
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(), collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except (KeyError, ValueError) as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(
            collection_key,
            updated_docs,
            num_added,
            report))
    return updated_docs, report
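The pipeline here is validate, map, check media, then push, with a defaultdict(int) tallying skips per offending field so the harvesting report can summarize failures. The tallying pattern in miniature (field names hypothetical):

from collections import defaultdict

report = defaultdict(int)
for missing_field in ('title', 'title', 'image_url'):
    report[missing_field] += 1
print(dict(report))  # {'title': 2, 'image_url': 1}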
Example #6
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except (KeyError, ValueError) as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
Example #7
def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for
    each document, essentially dumping the db to solr
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()  # flush so startup output appears immediately
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()  # flush so startup output appears immediately
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(
        changes['last_seq'])  # get new last_since for changes feed
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # need to get the solr doc for this couch
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except KeyError as e:
                print(e.message)
                continue
            except ValueError as e:
                print(e.message)
                continue
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e.message)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
        n_up += 1
        if n_up % 1000 == 0:
            elapsed_time = datetime.datetime.now() - start_time
            print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
Example #8
def fix_phonenumbers(docs):
    '''
    Yields Solr atomic updates that consolidate phone numbers into a single
    phonenumbers field. NOTE: the opening of this function was truncated in
    the original listing; the signature and the first branch below are a
    reconstruction (an assumption) based on the query in __main__.
    '''
    for d in docs:
        u = {'id': d['id']}
        if 'ner_phone_number_t_md' in d:
            u['phonenumbers'] = {'set': d['ner_phone_number_t_md']}
            u['ner_phone_number_t_md'] = {'set': None}
        elif 'ner_phone_number_ts_md' in d:
            u['phonenumbers'] = {'set': d['ner_phone_number_ts_md']}
            u['ner_phone_number_ts_md'] = {'set': None}
        else:
            print("Error: Skipped")
            continue
        yield u


def read_stream(filename):
    '''
    Reads json line stream
    :param filename: path to json line
    :return: doc stream
    '''
    with open(filename) as inf:
        for l in inf:
            yield json.loads(l)

if __name__ == '__main__':
    url = "http://127.0.0.1:8983/solr/imagecatdev"
    solr = Solr(url)
    docs = solr.query_iterator(
        "ner_phone_number_t_md:* OR ner_phone_number_ts_md:*",
        rows=1000,
        fl='id,ner_phone_number_t_md,ner_phone_number_ts_md',
        sort="indexedAt asc")

    updates = fix_phonenumbers(docs)
    count, success = solr.post_iterator(updates, False, buffer_size=1000)
    solr.commit()
    print(success)
    print(count)
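The {'set': value} wrappers make these Solr atomic updates: 'set' replaces a field's value, and {'set': None} removes the field entirely, so each yielded update rewrites one document in place. For example (id and values hypothetical):

update = {
    'id': 'doc-42',
    'phonenumbers': {'set': ['+1-555-0100']},
    'ner_phone_number_ts_md': {'set': None},  # drop the old field
}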
