def sk_kmeans(core):  # , kval=3
    """Cluster every document in the given Solr core with k-means.

    Runs k-means for k in [2, 10), scoring each clustering with the
    silhouette coefficient, and returns the per-k scores as a string.
    """
    solr = Solr("http://localhost:8983/solr/" + core)
    points = [Vector(doc['id'], doc)
              for doc in solr.query_iterator(query="*:*", start=0)]
    frame = pd.DataFrame(point.features for point in points).fillna(0)
    scores = {}
    for num_clusters in range(2, 10):
        model = KMeans(
            n_clusters=num_clusters,
            init='k-means++',
            max_iter=300,  # k-means convergence
            n_init=10,     # find global minima
            n_jobs=-2,     # parallelize
        )
        labels = model.fit_predict(frame)
        scores[num_clusters] = silhouette_score(frame, labels)
    return str(scores)
def get_all_sound_ids_from_solr(limit=False):
    """Page through the Solr index and return the ids of all indexed sounds.

    :param limit: maximum number of ids to fetch; falsy means "no limit".
    :return: list of sound ids, in 'created asc' order.
    """
    logger.info("getting all sound ids from solr.")
    if not limit:
        limit = 99999999999999
    solr = Solr(settings.SOLR_URL)
    solr_ids = []
    solr_count = None  # total hits, unknown until the first page arrives
    PAGE_SIZE = 2000
    current_page = 1
    # Check `solr_count is None` first: the original compared
    # `len(solr_ids) < solr_count` against None (Py3 TypeError) and used
    # `== None`. Short-circuiting fixes both without changing behavior.
    while (solr_count is None or len(solr_ids) < solr_count) \
            and len(solr_ids) < limit:
        response = SolrResponseInterpreter(
            solr.select(
                unicode(
                    search_prepare_query(
                        '', '',
                        search_prepare_sort('created asc',
                                            SEARCH_SORT_OPTIONS_WEB),
                        current_page, PAGE_SIZE, include_facets=False))))
        solr_ids += [element['id'] for element in response.docs]
        solr_count = response.num_found
        current_page += 1
    # The original wrapped everything in `except Exception, e: raise
    # Exception(e)`, which destroyed the exception type and traceback and
    # never returned the collected ids -- both defects fixed here.
    return solr_ids
def add_sounds_to_solr(sounds):
    """Convert the given sounds to Solr documents and post them as one batch."""
    logger.info("adding multiple sounds to solr index")
    solr = Solr(settings.SOLR_URL)
    logger.info("creating XML")
    documents = [convert_to_solr_document(sound) for sound in sounds]
    logger.info("posting to Solr")
    solr.add(documents)
def add_sound_to_solr(sound):
    """Index a single sound in Solr; failures are logged, never raised."""
    logger.info("adding single sound to solr index")
    try:
        Solr(settings.SOLR_URL).add([convert_to_solr_document(sound)])
    # `except X as e` instead of the Python-2-only `except X, e`,
    # matching the other handlers in this codebase.
    except SolrException as e:
        logger.error("failed to add sound %d to solr index, reason: %s" % (sound.id, str(e)))
def add_post_to_solr(post):
    """Index a single forum post in Solr; failures are logged, never raised."""
    logger.info("adding single forum post to solr index")
    try:
        Solr(settings.SOLR_FORUM_URL).add([convert_to_solr_document(post)])
    # `except X as e` instead of the Python-2-only `except X, e`,
    # matching the other handlers in this codebase.
    except SolrException as e:
        logger.error("failed to add forum post %d to solr index, reason: %s" % (post.id, str(e)))
def index(self, docs, solr_url):
    """Post an iterable of documents to the Solr core at *solr_url* and
    report the outcome on stdout.
    """
    solr = Solr(solr_url)
    # NOTE(review): the sibling index()/ingest helpers in this file unpack
    # post_iterator() as (count, success) / (count, res); the
    # (success, count) order here looks swapped -- confirm against this
    # Solr wrapper's post_iterator return contract.
    success, count = solr.post_iterator(docs)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Indexing failed, check solr logs")
def _solr_search(query_dict):
    """Run *query_dict* against Solr and return the matching Products
    together with the total hit count.
    """
    solr = Solr()
    solr.query(query_dict)
    matched_ids = [doc['id'] for doc in solr.docs()]
    product_list = Product.objects.filter(id__in=matched_ids)
    return product_list, solr.numFound()
def delete_post_from_solr(post_id):
    """Remove the forum post with *post_id* from the Solr index and commit.

    Errors are logged rather than raised.
    """
    logger.info("deleting post with id %d" % post_id)
    try:
        forum_index = Solr(settings.SOLR_FORUM_URL)
        forum_index.delete_by_id(post_id)
        forum_index.commit()
    except SolrException as e:
        logger.error('could not delete post with id %s (%s).' % (post_id, e))
def clean(solr_address):
    """Delete every document (up to 50000) from the Solr core at *solr_address*."""
    s = Solr(solr_address)
    resp = s.select('*:*', fl='id', rows='50000')
    ids = [result.get('id') for result in resp.results]
    # print(...) instead of the Python-2-only `print ids` statement; the
    # parenthesized single-argument form behaves identically on Py2 and Py3.
    print(ids)
    s.delete_many(ids, commit=True)
def check_if_sound_exists_in_solr(sound):
    """Return True when the given sound's id is already in the Solr index."""
    solr = Solr(settings.SOLR_URL)
    query = search_prepare_query(
        '', 'id:%i' % sound.id,
        search_prepare_sort('created asc', SEARCH_SORT_OPTIONS_WEB), 1, 1)
    response = SolrResponseInterpreter(solr.select(unicode(query)))
    return response.num_found > 0
def index_docs(docs, solr_url, corpus, buffer_size=2000):
    """Flatten *docs* into record dicts, tag each with *corpus*, and stream
    them to the Solr core at *solr_url* in batches of *buffer_size*.
    """
    from solr import Solr
    solr = Solr(solr_url)

    def tag_with_corpus(rec):
        # mutates the record in place, then yields it onward
        rec['corpus'] = corpus
        return rec

    segments = (seg for doc in docs for seg in doc.to_rec_dicts())
    solr.post_iterator(map(tag_with_corpus, segments), buffer_size=buffer_size)
def add_posts_to_solr(posts):
    """Convert forum posts to Solr documents and post them as one batch."""
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL)
    logger.info("creating XML")
    documents = [convert_to_solr_document(post) for post in posts]
    logger.info("posting to Solr")
    solr.add(documents)
    logger.info("optimizing solr index")
    #solr.optimize()
    logger.info("done")
def index(self, solr_url, in_file):
    '''
    Reads annotations at the specified path and indexes them to solr
    @param solr_url Target Solr URL to index
    @param in_file CSV file having text file and annotation file paths
    '''
    records = self.read_records(in_file)
    count, success = Solr(solr_url).post_iterator(records)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Failed. Check solr logs")
def solrIngest(URL, dataset=None, inputDir=None, accept=None):
    """Ingest documents into the Solr core at *URL*.

    Documents come lazily from *dataset* when given, otherwise from
    *inputDir* (filtered by *accept*); with neither, nothing is posted.
    """
    solr = Solr(URL)
    if dataset:
        docs = lazyDataset(dataset)
    elif inputDir:
        docs = lazySolr(inputDir, accept)
    else:
        docs = []
    count, res = solr.post_iterator(docs, commit=True, buffer_size=100)
    print(("Res : %s; count=%d" % (res, count)))
def send_posts_to_solr(posts):
    """Serialize *posts* into Solr documents, add them to the forum index,
    and commit. Solr errors are logged, not raised.
    """
    logger.info("adding forum posts to solr index")
    logger.info("creating XML")
    solr_docs = list(map(convert_to_solr_document, posts))
    try:
        logger.info("posting to Solr")
        forum_solr = Solr(settings.SOLR_FORUM_URL)
        forum_solr.add(solr_docs)
        forum_solr.commit()
    except SolrException as e:
        logger.error("failed to add posts to solr index, reason: %s" % str(e))
    logger.info("done")
def main(collection_key):
    """Map every CouchDB doc in *collection_key* to a Solr doc and push it.

    Docs missing required fields are reported on stdout and skipped.
    """
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        try:
            # return value was never used; called for its fill-in side effect,
            # as in the sibling sync function
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        # `as e` instead of the Python-2-only `except KeyError, e`,
        # matching the other handlers in this codebase
        except KeyError as e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
        # unused dt_start/dt_end timing locals from the original were dropped
def main():
    """CLI entry point: read a JSON-lines dump, map each record through the
    selected schema mapper, and index the resulting docs into Solr.
    """
    # Step : Parse CLI args
    parser = ArgumentParser(
        description="This tool can read JSON line dump and index to solr.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # ArgumentParser(version=...) was removed in Python 3 argparse
    # (TypeError); the supported spelling is a version action.
    parser.add_argument("--version", action="version", version="1.0")
    parser.add_argument("-i", "--in", help="Path to Input JSON line file.",
                        required=True)
    parser.add_argument("-s", "--solr-url", help="URL of Solr core.",
                        default="http://localhost:8983/solr/docsdev")
    parser.add_argument("-sc", "--schema",
                        help="Schema Mapping to be used. Options:\n%s"
                        % schema_map.keys(),
                        default='journal')
    args = vars(parser.parse_args())
    if args['schema'] not in schema_map:
        print("Error: %s schema is unknown. Known options: %s"
              % (args['schema'], schema_map.keys()))
        sys.exit(1)
    schema_mapper = schema_map[args['schema']]
    # map to schema; materialized as a list because len(docs) is needed
    # below (a Python 3 map object has no len())
    docs = [schema_mapper(doc) for doc in read_jsonlines(args['in'])]

    def merge_lists(groups):
        # each mapped record is a group of docs; flatten them lazily
        for group in groups:
            for doc in group:
                yield doc

    docs_solr = merge_lists(docs)
    # send to solr
    solr = Solr(args['solr_url'])
    index(solr, docs_solr, len(docs))
def loadAllObjectPids(db):
    """Fetch all object PIDs from Solr and record each in the md5s_remote
    table; duplicate PIDs are logged and skipped.
    """
    solr = Solr()
    solr.loadConfig('solr.cfg', 'prod')
    queryParams = {
        'q': '*:*',
        'fl': 'PID',
    }
    # Test small sample
    # queryParams = {
    #     'q': 'RELS_EXT_isConstituentOf_uri_s:info\:fedora\/mtholyoke\:25060',
    #     'fl': 'PID',
    # }
    response = solr.query(queryParams)
    allpids = [row['PID'] for row in response]
    for pid in allpids:
        try:
            # parameterized placeholder instead of "%s" string interpolation:
            # the original broke (or worse, injected SQL) on any PID
            # containing a quote character
            db.cursor.execute(
                "INSERT INTO md5s_remote VALUES (?, NULL, 0)", (pid,))
        except sqlite3.IntegrityError as e:
            logging.warning(str(e) + ' ' + pid)
    db.connection.commit()
def post_to_solr(self):
    """Serialize this product's fields into a flat dict and POST it to Solr.

    Optional fields (category, sub_category, expiry) become None when unset.
    """
    print >> sys.stderr, "POSTing product ID %s to Solr ..." % self.id
    # NOTE: the `x and y or None` form is kept deliberately -- it maps a
    # falsy attribute value to None as well, unlike a plain conditional.
    payload = {
        'id': "%s" % self.id,
        'product': self.name,
        'reference': self.reference,
        'provider': self.provider.name,
        'origin': self.origin,
        'price': str(self.price),
        'packaging': self.packaging,
        'offer_nb': self.offer_nb,
        'nomenclature': self.nomenclature,
        'category': self.category and self.category.name or None,
        'sub_category': self.sub_category and self.sub_category.name or None,
        'last_change': self.last_change.strftime("%d/%m/%Y"),
        'expiry': self.expiry and self.expiry.strftime("%d/%m/%Y") or None,
    }
    Solr().post(payload)
def open_spider(self, spider):
    """Scrapy pipeline hook: resolve the Solr collection for *spider* and
    open a connection to it (SolrCloud or plain Solr), then set up a
    per-spider cache buffer and lock.
    """
    solr_collection_name = self.solr_collection_map.get(spider.name)
    if solr_collection_name:
        self.solr_collection_name = solr_collection_name
    if not self.solr_collection_name:
        # no per-spider mapping and no default configured -> abort the crawl
        spider.log("No collection associated with " + spider.name + "!",
                   level=log.CRITICAL)
        raise CloseSpider
    if self.solr_cloud_mode:
        from solrcloudpy import SolrConnection
        self.solr_connection = SolrConnection(
            server=self.solr_servers,
            detect_live_nodes=self.solr_detect_live_nodes,
            user=self.solr_user,
            password=self.solr_password,
            timeout=self.solr_timeout,
            webappdir=self.solr_web_app)
        self.solr_collection = self.solr_connection[
            self.solr_collection_name]
    else:
        from solr import Solr
        from urlparse import urljoin  # Python-2-only module; py3: urllib.parse
        # chain urljoin over (server, webapp, collection) to build e.g.
        # http://host/solr/collection from the first configured server
        collection_url = reduce(urljoin,
                                (self.solr_servers[0],
                                 self.solr_web_app,
                                 self.solr_collection_name))
        if isinstance(collection_url, unicode):
            # the Solr client presumably wants a byte string URL -- confirm
            collection_url = collection_url.encode("UTF-8")
        self.solr_collection = Solr(url=collection_url,
                                    http_user=self.solr_user,
                                    http_pass=self.solr_password,
                                    timeout=self.solr_timeout)
    if self.solr_cache_max_len > 0:
        max_len = self.solr_cache_max_len * 2
    else:
        max_len = 2
    self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len)
    self.locks[spider.name] = Lock()
def search(request):
    """Product search view: run q/fq against Solr and render the results
    page; with no query, render the empty search page.
    """
    query = request.GET.get("q", None)
    facet_query = request.GET.get("fq", '')
    if not query:
        return render(request, 'product/search.html', {})
    solr = Solr()
    solr.query({'q': query, 'fq': facet_query, 'fl': '*'})
    suggestion = solr.suggestion()
    context = {
        'numFound': solr.numFound(),
        'query': query,
        # display just the value part of "field:value"; None when no fq
        'facet_query': facet_query and facet_query.split(':')[1] or None,
        'facets': solr.facet_fields(),
        'suggestion': suggestion,
        'solr_docs': solr.docs(),
    }
    return render(request, 'product/search.html', context)
def sync_couch_collection_to_solr(collection_key):
    """Re-sync one CouchDB collection into Solr.

    Deletes the collection's existing Solr docs, then maps and pushes each
    couch doc, skipping (and counting in *report*) docs that fail
    validation. Returns (updated_docs, report).
    """
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch need string keys
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)  # skip counts keyed by the offending field
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        # validators raise with custom .dict_key/.message attributes
        # identifying the bad field
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    # single commit after the whole batch
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
def delete_sound_from_solr(sound):
    """Remove *sound* from the Solr index; failures are logged, never raised."""
    logger.info("deleting sound with id %d" % sound.id)
    try:
        Solr(settings.SOLR_URL).delete_by_id(sound.id)
    # `as e` instead of the Python-2-only `except Exception, e`,
    # matching the other handlers in this codebase.
    except Exception as e:
        logger.error('could not delete sound with id %s (%s).' % (sound.id, e))
def _jaccard_node(doc, metric):
    """Build one d3-style leaf node for *doc* from its jaccard score."""
    name = doc['id'].split('/')[-1]
    return {
        "metadata": json.dumps(doc),
        "name": name,
        "path": os.environ["IMAGE_MOUNT"] + name.split('.')[0] + ".jpg",
        "score": doc["jaccard_{0}_abs".format(metric)],
    }


def jaccard(core, metric, threshold=0.01):
    """Group docs from the given Solr core into clusters of near-equal
    jaccard similarity scores.

    Docs are walked in the order returned by the metric computation; a doc
    whose score drops more than *threshold* below the previous doc's starts
    a new cluster. Returns the cluster tree as a JSON string.

    :param metric: "meta" (metadata jaccard) or "value" (value jaccard).
    """
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    if metric == "meta":
        docs = computeJaccardMeta(solrURL, solrInstance)
    elif metric == "value":
        docs = computeJaccardValue(solrInstance)
    # hoist the repeatedly re-formatted score key; node construction was
    # duplicated verbatim in the original -- both now go through one helper
    score_key = "jaccard_{0}_abs".format(metric)
    clusters = [{"name": "cluster0",
                 "children": [_jaccard_node(docs[0], metric)]}]
    prior = docs[0][score_key]
    for doc in docs[1:]:
        node = _jaccard_node(doc, metric)
        if prior - doc[score_key] <= threshold:
            clusters[-1]["children"].append(node)
        else:
            # len(clusters) equals the next cluster index before appending
            clusters.append({"name": "cluster" + str(len(clusters)),
                             "children": [node]})
        prior = doc[score_key]
    return json.dumps({"name": "clusters", "children": clusters})
# NOTE(review): this chunk begins mid-function -- the statements below are
# the tail of a generator (presumably generate_solr_updates, called in the
# __main__ block) whose `def` line and earlier body are outside this view;
# `row` and `delta` come from that missing part. Nesting reconstructed from
# the flattened source -- confirm against the original file.
        objects = []
        scores = []
        for obj, confd in row.items():
            # an entry's key may hold several comma-separated object labels;
            # each gets the same confidence score
            for o in obj.split(","):
                objects.append(o.strip())
                scores.append(confd)
        # Solr atomic-update syntax: {'set': ...} replaces the field value
        delta['objects'] = {'set' : objects}
        delta['confidence'] = {'set': scores}
        yield delta


if __name__ == '__main__':
    # Get the CSV file from classifier-local.py
    if len(sys.argv) != 2:
        print("required args:\n <CSV_file>")
        sys.exit(1)
    infile = sys.argv[1]
    min_confidence = 0.30
    print("Reading from %s, Min confidence=%f" % (infile, min_confidence))
    solr_url = "http://localhost:8983/solr/imagecatdev"
    solr = Solr(solr_url)
    updates = generate_solr_updates(infile, min_confidence=min_confidence)
    count, res = solr.post_iterator(updates, commit=True, buffer_size=1000)
    print("Res : %s; count=%d" %(res, count))
    '''
    from pprint import pprint
    for u in updates:
        pprint(u)
    '''
# NOTE(review): this chunk begins mid-function -- the statements below are
# the tail of a token-grouping routine (its def/signature is outside this
# view); `groups`, `tok`, and `spanning` are defined in the missing part.
# Nesting reconstructed from the flattened source -- confirm against the
# original file.
            groups.append([])
            groups[-1].append(tok)
            spanning = True
        else:
            spanning = False
    return groups


def catch_names(path, finder):
    # Scan each line of the UTF-8 file at *path* for name groups and run
    # the finder's beam over every group found.
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            groups = finder.scan_names(line)
            for group in groups:
                finder.beam(group)


if __name__ == '__main__':
    # hard-coded argv overrides the real command line -- presumably left in
    # for local testing; remove to accept actual CLI args
    sys.argv = [
        '', '-in', '../data/set1.source.tok',
        '-solr', 'http://localhost:8983/solr/name'
    ]
    p = ArgumentParser()
    p.add_argument("-in", required=True, help="Input File.")
    p.add_argument("-solr", required=True,
                   help="Solr URL. Eg:http://localhost:8983/solr/name")
    args = vars(p.parse_args())
    solr = Solr(args['solr'])
    finder = NameFinder(solr, stem_func=stem)
    catch_names(args['in'], finder)
def add_item(solr_address, doc):
    """Add *doc* to the Solr core at *solr_address*, committing immediately."""
    s = Solr(solr_address)
    # the original bound the response to an unused `resp` local; dropped
    s.add(doc, commit=True)
def main(url_couchdb=None, dbname=None, url_solr=None, all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes. Setting the
    "since" to 0 will result in getting a _changes record for each
    document, essentially dumping the db to solr
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()  # put pd
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        # resume from the sequence number cached in S3
        since = s3_seq_cache.last_seq
    if all_docs:
        # since='0' makes the feed replay every document (full dump)
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()  # put pd
    # NOTE(review): db is fetched a second time here, identical to the call
    # above -- looks redundant, confirm before removing
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(
        changes['last_seq'])  # get new last_since for changes feed
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            # design documents are couch-internal; never index them
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # need to get the solr doc for this couch
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                # zero or multiple matches: report instead of deleting
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except KeyError as e:
                print(e.message)
                continue
            except ValueError as e:
                print(e.message)
                continue
            # outer try catches TypeErrors from either mapping or pushing
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e.message)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
            n_up += 1
            if n_up % 1000 == 0:
                # periodic progress report
                elapsed_time = datetime.datetime.now() - start_time
                print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        # persist the new high-water mark only for incremental runs
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
def delete_post_from_solr(post):
    """Remove forum post *post* from the Solr index; failures are logged,
    never raised."""
    logger.info("deleting post with id %d" % post.id)
    try:
        Solr(settings.SOLR_FORUM_URL).delete_by_id(post.id)
    # `as e` instead of the Python-2-only `except Exception, e`,
    # matching the other handlers in this codebase.
    except Exception as e:
        logger.error('could not delete post with id %s (%s).' % (post.id, e))
def __init__(self):
    """Build the core URL from host/port/collection settings and open a
    Solr connection with a 10s timeout."""
    url_template = "http://{}:{}/solr/{}"
    self.url = url_template.format(settings.HOST, settings.PORT,
                                   settings.COLLECTION)
    self.s = Solr(self.url, timeout=10)