def sk_kmeans(core):  # , kval=3
    """Cluster every document in the given Solr core with k-means.

    Runs k-means for k in [2, 10), scoring each clustering with the
    silhouette coefficient, and returns the per-k scores as a string.
    """
    solr = Solr("http://localhost:8983/solr/" + core)
    points = [Vector(doc['id'], doc)
              for doc in solr.query_iterator(query="*:*", start=0)]
    frame = pd.DataFrame(point.features for point in points).fillna(0)
    scores = {}
    for num_clusters in range(2, 10):
        model = KMeans(
            n_clusters=num_clusters,
            init='k-means++',
            max_iter=300,  # k-means convergence
            n_init=10,     # find global minima
            n_jobs=-2,     # parallelize
        )
        labels = model.fit_predict(frame)
        scores[num_clusters] = silhouette_score(frame, labels)
    return str(scores)
def get_all_sound_ids_from_solr(limit=False):
    """Page through the Solr index and return the ids of all indexed sounds.

    :param limit: maximum number of ids to fetch; falsy means "no limit".
    :return: list of sound ids, in 'created asc' order.
    """
    logger.info("getting all sound ids from solr.")
    if not limit:
        limit = 99999999999999
    solr = Solr(settings.SOLR_URL)
    solr_ids = []
    solr_count = None  # total hits, unknown until the first page arrives
    PAGE_SIZE = 2000
    current_page = 1
    # Check `solr_count is None` first: the original compared
    # `len(solr_ids) < solr_count` against None (Py3 TypeError) and used
    # `== None`. Short-circuiting fixes both without changing behavior.
    while (solr_count is None or len(solr_ids) < solr_count) \
            and len(solr_ids) < limit:
        response = SolrResponseInterpreter(
            solr.select(
                unicode(
                    search_prepare_query(
                        '', '',
                        search_prepare_sort('created asc',
                                            SEARCH_SORT_OPTIONS_WEB),
                        current_page, PAGE_SIZE, include_facets=False))))
        solr_ids += [element['id'] for element in response.docs]
        solr_count = response.num_found
        current_page += 1
    # The original wrapped everything in `except Exception, e: raise
    # Exception(e)`, which destroyed the exception type and traceback and
    # never returned the collected ids -- both defects fixed here.
    return solr_ids
def add_sounds_to_solr(sounds):
    """Convert the given sounds to Solr documents and post them as one batch."""
    logger.info("adding multiple sounds to solr index")
    solr = Solr(settings.SOLR_URL)
    logger.info("creating XML")
    documents = [convert_to_solr_document(sound) for sound in sounds]
    logger.info("posting to Solr")
    solr.add(documents)
def add_sound_to_solr(sound):
    """Index a single sound in Solr; failures are logged, never raised."""
    logger.info("adding single sound to solr index")
    try:
        Solr(settings.SOLR_URL).add([convert_to_solr_document(sound)])
    # `except X as e` instead of the Python-2-only `except X, e`,
    # matching the other handlers in this codebase.
    except SolrException as e:
        logger.error("failed to add sound %d to solr index, reason: %s" % (sound.id, str(e)))
def add_post_to_solr(post):
    """Index a single forum post in Solr; failures are logged, never raised."""
    logger.info("adding single forum post to solr index")
    try:
        Solr(settings.SOLR_FORUM_URL).add([convert_to_solr_document(post)])
    # `except X as e` instead of the Python-2-only `except X, e`,
    # matching the other handlers in this codebase.
    except SolrException as e:
        logger.error("failed to add forum post %d to solr index, reason: %s" % (post.id, str(e)))
def index(self, docs, solr_url):
    """Post an iterable of documents to the Solr core at *solr_url* and
    report the outcome on stdout.
    """
    solr = Solr(solr_url)
    # NOTE(review): the sibling index()/ingest helpers in this file unpack
    # post_iterator() as (count, success) / (count, res); the
    # (success, count) order here looks swapped -- confirm against this
    # Solr wrapper's post_iterator return contract.
    success, count = solr.post_iterator(docs)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Indexing failed, check solr logs")
def _solr_search(query_dict):
    """Run *query_dict* against Solr and return the matching Products
    together with the total hit count.
    """
    solr = Solr()
    solr.query(query_dict)
    matched_ids = [doc['id'] for doc in solr.docs()]
    product_list = Product.objects.filter(id__in=matched_ids)
    return product_list, solr.numFound()
def delete_post_from_solr(post_id):
    """Remove the forum post with *post_id* from the Solr index and commit.

    Errors are logged rather than raised.
    """
    logger.info("deleting post with id %d" % post_id)
    try:
        forum_index = Solr(settings.SOLR_FORUM_URL)
        forum_index.delete_by_id(post_id)
        forum_index.commit()
    except SolrException as e:
        logger.error('could not delete post with id %s (%s).' % (post_id, e))
def clean(solr_address):
    """Delete every document (up to 50000) from the Solr core at *solr_address*."""
    s = Solr(solr_address)
    resp = s.select('*:*', fl='id', rows='50000')
    ids = [result.get('id') for result in resp.results]
    # print(...) instead of the Python-2-only `print ids` statement; the
    # parenthesized single-argument form behaves identically on Py2 and Py3.
    print(ids)
    s.delete_many(ids, commit=True)
def check_if_sound_exists_in_solr(sound):
    """Return True when the given sound's id is already in the Solr index."""
    solr = Solr(settings.SOLR_URL)
    query = search_prepare_query(
        '', 'id:%i' % sound.id,
        search_prepare_sort('created asc', SEARCH_SORT_OPTIONS_WEB), 1, 1)
    response = SolrResponseInterpreter(solr.select(unicode(query)))
    return response.num_found > 0
def index_docs(docs, solr_url, corpus, buffer_size=2000):
    """Flatten *docs* into record dicts, tag each with *corpus*, and stream
    them to the Solr core at *solr_url* in batches of *buffer_size*.
    """
    from solr import Solr
    solr = Solr(solr_url)

    def tag_with_corpus(rec):
        # mutates the record in place, then yields it onward
        rec['corpus'] = corpus
        return rec

    segments = (seg for doc in docs for seg in doc.to_rec_dicts())
    solr.post_iterator(map(tag_with_corpus, segments), buffer_size=buffer_size)
def add_posts_to_solr(posts):
    """Convert forum posts to Solr documents and post them as one batch."""
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL)
    logger.info("creating XML")
    documents = [convert_to_solr_document(post) for post in posts]
    logger.info("posting to Solr")
    solr.add(documents)
    logger.info("optimizing solr index")
    #solr.optimize()
    logger.info("done")
def index(self, solr_url, in_file):
    '''
    Reads annotations at the specified path and indexes them to solr
    @param solr_url Target Solr URL to index
    @param in_file CSV file having text file and annotation file paths
    '''
    records = self.read_records(in_file)
    count, success = Solr(solr_url).post_iterator(records)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Failed. Check solr logs")
def solrIngest(URL, dataset=None, inputDir=None, accept=None):
    """Ingest documents into the Solr core at *URL*.

    Documents come lazily from *dataset* when given, otherwise from
    *inputDir* (filtered by *accept*); with neither, nothing is posted.
    """
    solr = Solr(URL)
    if dataset:
        docs = lazyDataset(dataset)
    elif inputDir:
        docs = lazySolr(inputDir, accept)
    else:
        docs = []
    count, res = solr.post_iterator(docs, commit=True, buffer_size=100)
    print(("Res : %s; count=%d" % (res, count)))
def send_posts_to_solr(posts):
    """Serialize *posts* into Solr documents, add them to the forum index,
    and commit. Solr errors are logged, not raised.
    """
    logger.info("adding forum posts to solr index")
    logger.info("creating XML")
    solr_docs = list(map(convert_to_solr_document, posts))
    try:
        logger.info("posting to Solr")
        forum_solr = Solr(settings.SOLR_FORUM_URL)
        forum_solr.add(solr_docs)
        forum_solr.commit()
    except SolrException as e:
        logger.error("failed to add posts to solr index, reason: %s" % str(e))
    logger.info("done")
def main(collection_key):
    """Map every CouchDB doc in *collection_key* to a Solr doc and push it.

    Docs missing required fields are reported on stdout and skipped.
    """
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        try:
            # return value was never used; called for its fill-in side effect,
            # as in the sibling sync function
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        # `as e` instead of the Python-2-only `except KeyError, e`,
        # matching the other handlers in this codebase
        except KeyError as e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
        # unused dt_start/dt_end timing locals from the original were dropped
def main():
    """CLI entry point: read a JSON-lines dump, map each record through the
    selected schema mapper, and index the resulting docs into Solr.
    """
    # Step : Parse CLI args
    parser = ArgumentParser(
        description="This tool can read JSON line dump and index to solr.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # ArgumentParser(version=...) was removed in Python 3 argparse
    # (TypeError); the supported spelling is a version action.
    parser.add_argument("--version", action="version", version="1.0")
    parser.add_argument("-i", "--in", help="Path to Input JSON line file.",
                        required=True)
    parser.add_argument("-s", "--solr-url", help="URL of Solr core.",
                        default="http://localhost:8983/solr/docsdev")
    parser.add_argument("-sc", "--schema",
                        help="Schema Mapping to be used. Options:\n%s"
                        % schema_map.keys(),
                        default='journal')
    args = vars(parser.parse_args())
    if args['schema'] not in schema_map:
        print("Error: %s schema is unknown. Known options: %s"
              % (args['schema'], schema_map.keys()))
        sys.exit(1)
    schema_mapper = schema_map[args['schema']]
    # map to schema; materialized as a list because len(docs) is needed
    # below (a Python 3 map object has no len())
    docs = [schema_mapper(doc) for doc in read_jsonlines(args['in'])]

    def merge_lists(groups):
        # each mapped record is a group of docs; flatten them lazily
        for group in groups:
            for doc in group:
                yield doc

    docs_solr = merge_lists(docs)
    # send to solr
    solr = Solr(args['solr_url'])
    index(solr, docs_solr, len(docs))
def loadAllObjectPids(db):
    """Fetch all object PIDs from Solr and record each in the md5s_remote
    table; duplicate PIDs are logged and skipped.
    """
    solr = Solr()
    solr.loadConfig('solr.cfg', 'prod')
    queryParams = {
        'q': '*:*',
        'fl': 'PID',
    }
    # Test small sample
    # queryParams = {
    #     'q': 'RELS_EXT_isConstituentOf_uri_s:info\:fedora\/mtholyoke\:25060',
    #     'fl': 'PID',
    # }
    response = solr.query(queryParams)
    allpids = [row['PID'] for row in response]
    for pid in allpids:
        try:
            # parameterized placeholder instead of "%s" string interpolation:
            # the original broke (or worse, injected SQL) on any PID
            # containing a quote character
            db.cursor.execute(
                "INSERT INTO md5s_remote VALUES (?, NULL, 0)", (pid,))
        except sqlite3.IntegrityError as e:
            logging.warning(str(e) + ' ' + pid)
    db.connection.commit()
def post_to_solr(self):
    """Serialize this product's fields into a flat dict and POST it to Solr.

    Optional fields (category, sub_category, expiry) become None when unset.
    """
    print >> sys.stderr, "POSTing product ID %s to Solr ..." % self.id
    # NOTE: the `x and y or None` form is kept deliberately -- it maps a
    # falsy attribute value to None as well, unlike a plain conditional.
    payload = {
        'id': "%s" % self.id,
        'product': self.name,
        'reference': self.reference,
        'provider': self.provider.name,
        'origin': self.origin,
        'price': str(self.price),
        'packaging': self.packaging,
        'offer_nb': self.offer_nb,
        'nomenclature': self.nomenclature,
        'category': self.category and self.category.name or None,
        'sub_category': self.sub_category and self.sub_category.name or None,
        'last_change': self.last_change.strftime("%d/%m/%Y"),
        'expiry': self.expiry and self.expiry.strftime("%d/%m/%Y") or None,
    }
    Solr().post(payload)
def open_spider(self, spider):
    """Scrapy pipeline hook: resolve the Solr collection for *spider* and
    open a connection to it (SolrCloud or plain Solr), then set up a
    per-spider cache buffer and lock.
    """
    solr_collection_name = self.solr_collection_map.get(spider.name)
    if solr_collection_name:
        self.solr_collection_name = solr_collection_name
    if not self.solr_collection_name:
        # no per-spider mapping and no default configured -> abort the crawl
        spider.log("No collection associated with " + spider.name + "!",
                   level=log.CRITICAL)
        raise CloseSpider
    if self.solr_cloud_mode:
        from solrcloudpy import SolrConnection
        self.solr_connection = SolrConnection(
            server=self.solr_servers,
            detect_live_nodes=self.solr_detect_live_nodes,
            user=self.solr_user,
            password=self.solr_password,
            timeout=self.solr_timeout,
            webappdir=self.solr_web_app)
        self.solr_collection = self.solr_connection[
            self.solr_collection_name]
    else:
        from solr import Solr
        from urlparse import urljoin  # Python-2-only module; py3: urllib.parse
        # chain urljoin over (server, webapp, collection) to build e.g.
        # http://host/solr/collection from the first configured server
        collection_url = reduce(urljoin,
                                (self.solr_servers[0],
                                 self.solr_web_app,
                                 self.solr_collection_name))
        if isinstance(collection_url, unicode):
            # the Solr client presumably wants a byte string URL -- confirm
            collection_url = collection_url.encode("UTF-8")
        self.solr_collection = Solr(url=collection_url,
                                    http_user=self.solr_user,
                                    http_pass=self.solr_password,
                                    timeout=self.solr_timeout)
    if self.solr_cache_max_len > 0:
        max_len = self.solr_cache_max_len * 2
    else:
        max_len = 2
    self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len)
    self.locks[spider.name] = Lock()
def search(request):
    """Product search view: run q/fq against Solr and render the results
    page; with no query, render the empty search page.
    """
    query = request.GET.get("q", None)
    facet_query = request.GET.get("fq", '')
    if not query:
        return render(request, 'product/search.html', {})
    solr = Solr()
    solr.query({'q': query, 'fq': facet_query, 'fl': '*'})
    suggestion = solr.suggestion()
    context = {
        'numFound': solr.numFound(),
        'query': query,
        # display just the value part of "field:value"; None when no fq
        'facet_query': facet_query and facet_query.split(':')[1] or None,
        'facets': solr.facet_fields(),
        'suggestion': suggestion,
        'solr_docs': solr.docs(),
    }
    return render(request, 'product/search.html', context)
def sync_couch_collection_to_solr(collection_key):
    """Re-sync one CouchDB collection into Solr.

    Deletes the collection's existing Solr docs, then maps and pushes each
    couch doc, skipping (and counting in *report*) docs that fail
    validation. Returns (updated_docs, report).
    """
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch need string keys
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)  # skip counts keyed by the offending field
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        # validators raise with custom .dict_key/.message attributes
        # identifying the bad field
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    # single commit after the whole batch
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
def delete_sound_from_solr(sound):
    """Remove *sound* from the Solr index; failures are logged, never raised."""
    logger.info("deleting sound with id %d" % sound.id)
    try:
        Solr(settings.SOLR_URL).delete_by_id(sound.id)
    # `as e` instead of the Python-2-only `except Exception, e`,
    # matching the other handlers in this codebase.
    except Exception as e:
        logger.error('could not delete sound with id %s (%s).' % (sound.id, e))
def _jaccard_node(doc, metric):
    """Build one d3-style leaf node for *doc* from its jaccard score."""
    name = doc['id'].split('/')[-1]
    return {
        "metadata": json.dumps(doc),
        "name": name,
        "path": os.environ["IMAGE_MOUNT"] + name.split('.')[0] + ".jpg",
        "score": doc["jaccard_{0}_abs".format(metric)],
    }


def jaccard(core, metric, threshold=0.01):
    """Group docs from the given Solr core into clusters of near-equal
    jaccard similarity scores.

    Docs are walked in the order returned by the metric computation; a doc
    whose score drops more than *threshold* below the previous doc's starts
    a new cluster. Returns the cluster tree as a JSON string.

    :param metric: "meta" (metadata jaccard) or "value" (value jaccard).
    """
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    if metric == "meta":
        docs = computeJaccardMeta(solrURL, solrInstance)
    elif metric == "value":
        docs = computeJaccardValue(solrInstance)
    # hoist the repeatedly re-formatted score key; node construction was
    # duplicated verbatim in the original -- both now go through one helper
    score_key = "jaccard_{0}_abs".format(metric)
    clusters = [{"name": "cluster0",
                 "children": [_jaccard_node(docs[0], metric)]}]
    prior = docs[0][score_key]
    for doc in docs[1:]:
        node = _jaccard_node(doc, metric)
        if prior - doc[score_key] <= threshold:
            clusters[-1]["children"].append(node)
        else:
            # len(clusters) equals the next cluster index before appending
            clusters.append({"name": "cluster" + str(len(clusters)),
                             "children": [node]})
        prior = doc[score_key]
    return json.dumps({"name": "clusters", "children": clusters})
# NOTE(review): this chunk begins mid-function -- the statements below are
# the tail of a generator (presumably generate_solr_updates, called in the
# __main__ block) whose `def` line and earlier body are outside this view;
# `row` and `delta` come from that missing part. Nesting reconstructed from
# the flattened source -- confirm against the original file.
        objects = []
        scores = []
        for obj, confd in row.items():
            # an entry's key may hold several comma-separated object labels;
            # each gets the same confidence score
            for o in obj.split(","):
                objects.append(o.strip())
                scores.append(confd)
        # Solr atomic-update syntax: {'set': ...} replaces the field value
        delta['objects'] = {'set' : objects}
        delta['confidence'] = {'set': scores}
        yield delta


if __name__ == '__main__':
    # Get the CSV file from classifier-local.py
    if len(sys.argv) != 2:
        print("required args:\n <CSV_file>")
        sys.exit(1)
    infile = sys.argv[1]
    min_confidence = 0.30
    print("Reading from %s, Min confidence=%f" % (infile, min_confidence))
    solr_url = "http://localhost:8983/solr/imagecatdev"
    solr = Solr(solr_url)
    updates = generate_solr_updates(infile, min_confidence=min_confidence)
    count, res = solr.post_iterator(updates, commit=True, buffer_size=1000)
    print("Res : %s; count=%d" %(res, count))
    '''
    from pprint import pprint
    for u in updates:
        pprint(u)
    '''
# NOTE(review): this chunk begins mid-function -- the statements below are
# the tail of a token-grouping routine (its def/signature is outside this
# view); `groups`, `tok`, and `spanning` are defined in the missing part.
# Nesting reconstructed from the flattened source -- confirm against the
# original file.
            groups.append([])
            groups[-1].append(tok)
            spanning = True
        else:
            spanning = False
    return groups


def catch_names(path, finder):
    # Scan each line of the UTF-8 file at *path* for name groups and run
    # the finder's beam over every group found.
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            groups = finder.scan_names(line)
            for group in groups:
                finder.beam(group)


if __name__ == '__main__':
    # hard-coded argv overrides the real command line -- presumably left in
    # for local testing; remove to accept actual CLI args
    sys.argv = [
        '', '-in', '../data/set1.source.tok',
        '-solr', 'http://localhost:8983/solr/name'
    ]
    p = ArgumentParser()
    p.add_argument("-in", required=True, help="Input File.")
    p.add_argument("-solr", required=True,
                   help="Solr URL. Eg:http://localhost:8983/solr/name")
    args = vars(p.parse_args())
    solr = Solr(args['solr'])
    finder = NameFinder(solr, stem_func=stem)
    catch_names(args['in'], finder)
def add_item(solr_address, doc):
    """Add *doc* to the Solr core at *solr_address*, committing immediately."""
    s = Solr(solr_address)
    # the original bound the response to an unused `resp` local; dropped
    s.add(doc, commit=True)
def main(url_couchdb=None, dbname=None, url_solr=None, all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new
    changes to docs. The _changes feed will only have the *last* event on
    a document and does not retain intermediate changes. Setting the
    "since" to 0 will result in getting a _changes record for each
    document, essentially dumping the db to solr
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()  # put pd
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        # resume from the sequence number cached in S3
        since = s3_seq_cache.last_seq
    if all_docs:
        # since='0' makes the feed replay every document (full dump)
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()  # put pd
    # NOTE(review): db is fetched a second time here, identical to the call
    # above -- looks redundant, confirm before removing
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(
        changes['last_seq'])  # get new last_since for changes feed
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            # design documents are couch-internal; never index them
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # need to get the solr doc for this couch
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                # zero or multiple matches: report instead of deleting
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except KeyError as e:
                print(e.message)
                continue
            except ValueError as e:
                print(e.message)
                continue
            # outer try catches TypeErrors from either mapping or pushing
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e.message)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
            n_up += 1
            if n_up % 1000 == 0:
                # periodic progress report
                elapsed_time = datetime.datetime.now() - start_time
                print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        # persist the new high-water mark only for incremental runs
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
def delete_post_from_solr(post):
    """Remove forum post *post* from the Solr index; failures are logged,
    never raised."""
    logger.info("deleting post with id %d" % post.id)
    try:
        Solr(settings.SOLR_FORUM_URL).delete_by_id(post.id)
    # `as e` instead of the Python-2-only `except Exception, e`,
    # matching the other handlers in this codebase.
    except Exception as e:
        logger.error('could not delete post with id %s (%s).' % (post.id, e))
def __init__(self):
    """Build the core URL from host/port/collection settings and open a
    Solr connection with a 10s timeout."""
    url_template = "http://{}:{}/solr/{}"
    self.url = url_template.format(settings.HOST, settings.PORT,
                                   settings.COLLECTION)
    self.s = Solr(self.url, timeout=10)