def sk_kmeans(core):  # , kval=3
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    list_of_points = []
    docs = solrInstance.query_iterator(query="*:*", start=0)
    for doc in docs:
        list_of_points.append(Vector(doc['id'], doc))
    list_of_Dicts = (point.features for point in list_of_points)
    df = pd.DataFrame(list_of_Dicts)
    df = df.fillna(0)
    silhouettes = {}
    for k in range(2, 10):
        kmeans = KMeans(n_clusters=k,
                        init='k-means++',
                        max_iter=300,  # k-means convergence
                        n_init=10,     # find global minima
                        n_jobs=-2,     # parallelize
                        )
        labels = kmeans.fit_predict(df)
        silhouettes[k] = silhouette_score(df, labels)
    return str(silhouettes)
def add_sounds_to_solr(sounds):
    logger.info("adding multiple sounds to solr index")
    solr = Solr(settings.SOLR_URL)
    logger.info("creating XML")
    documents = map(convert_to_solr_document, sounds)
    logger.info("posting to Solr")
    solr.add(documents)
def _solr_search(query_dict):
    solr = Solr()
    solr.query(query_dict)
    product_list = Product.objects.filter(
        id__in=[doc['id'] for doc in solr.docs()])
    return product_list, solr.numFound()
def index(self, docs, solr_url):
    solr = Solr(solr_url)
    success, count = solr.post_iterator(docs)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Indexing failed, check solr logs")
def get_all_sound_ids_from_solr(limit=False):
    logger.info("getting all sound ids from solr.")
    if not limit:
        limit = 99999999999999
    solr = Solr(settings.SOLR_URL)
    solr_ids = []
    solr_count = None
    PAGE_SIZE = 2000
    current_page = 1
    try:
        while (len(solr_ids) < solr_count or solr_count == None) and len(solr_ids) < limit:
            # print "Getting page %i" % current_page
            response = SolrResponseInterpreter(
                solr.select(
                    unicode(
                        search_prepare_query(
                            '', '',
                            search_prepare_sort('created asc', SEARCH_SORT_OPTIONS_WEB),
                            current_page,
                            PAGE_SIZE,
                            include_facets=False))))
            solr_ids += [element['id'] for element in response.docs]
            solr_count = response.num_found
            current_page += 1
    except Exception, e:
        raise Exception(e)
def clean(solr_address):
    s = Solr(solr_address)
    resp = s.select('*:*', fl='id', rows='50000')
    ids = []
    for result in resp.results:
        ids.append(result.get('id'))
    print ids
    s.delete_many(ids, commit=True)
def check_if_sound_exists_in_solr(sound):
    solr = Solr(settings.SOLR_URL)
    response = SolrResponseInterpreter(
        solr.select(
            unicode(
                search_prepare_query(
                    '',
                    'id:%i' % sound.id,
                    search_prepare_sort('created asc', SEARCH_SORT_OPTIONS_WEB),
                    1, 1))))
    return response.num_found > 0
def index_docs(docs, solr_url, corpus, buffer_size=2000):
    from solr import Solr
    solr = Solr(solr_url)
    docs = (seg for doc in docs for seg in doc.to_rec_dicts())

    def set_corpus(doc):
        doc['corpus'] = corpus
        return doc

    docs = map(set_corpus, docs)
    solr.post_iterator(docs, buffer_size=buffer_size)
def add_posts_to_solr(posts):
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL)
    logger.info("creating XML")
    documents = map(convert_to_solr_document, posts)
    logger.info("posting to Solr")
    solr.add(documents)
    logger.info("optimizing solr index")
    # solr.optimize()
    logger.info("done")
def solrIngest(URL, dataset=None, inputDir=None, accept=None):
    solr = Solr(URL)
    documents = []
    if dataset:
        documents = lazyDataset(dataset)
    elif inputDir:
        documents = lazySolr(inputDir, accept)
    count, res = solr.post_iterator(documents, commit=True, buffer_size=100)
    print("Res : %s; count=%d" % (res, count))
def index(self, solr_url, in_file):
    '''
    Reads annotations at the specified path and indexes them to solr
    @param solr_url Target Solr URL to index
    @param in_file CSV file having text file and annotation file paths
    '''
    solr = Solr(solr_url)
    recs = self.read_records(in_file)
    count, success = solr.post_iterator(recs)
    if success:
        print("Indexed %d docs" % count)
    else:
        print("Error: Failed. Check solr logs")
def add_posts_to_solr(posts):
    logger.info("adding multiple forum posts to solr index")
    solr = Solr(settings.SOLR_FORUM_URL)
    logger.info("creating XML")
    documents = map(convert_to_solr_document, posts)
    logger.info("posting to Solr")
    solr.add(documents)
    logger.info("optimizing solr index")
    solr.optimize()
    logger.info("done")
def add_post_to_solr(post):
    logger.info("adding single forum post to solr index")
    try:
        Solr(settings.SOLR_FORUM_URL).add([convert_to_solr_document(post)])
    except SolrException, e:
        logger.error("failed to add forum post %d to solr index, reason: %s" % (post.id, str(e)))
def add_sound_to_solr(sound):
    logger.info("adding single sound to solr index")
    try:
        Solr(settings.SOLR_URL).add([convert_to_solr_document(sound)])
    except SolrException, e:
        logger.error("failed to add sound %d to solr index, reason: %s" % (sound.id, str(e)))
def open_spider(self, spider):
    solr_collection_name = self.solr_collection_map.get(spider.name)
    if solr_collection_name:
        self.solr_collection_name = solr_collection_name
    if not self.solr_collection_name:
        spider.log("No collection associated with " + spider.name + "!",
                   level=log.CRITICAL)
        raise CloseSpider
    if self.solr_cloud_mode:
        from solrcloudpy import SolrConnection
        self.solr_connection = SolrConnection(server=self.solr_servers,
                                              detect_live_nodes=self.solr_detect_live_nodes,
                                              user=self.solr_user,
                                              password=self.solr_password,
                                              timeout=self.solr_timeout,
                                              webappdir=self.solr_web_app)
        self.solr_collection = self.solr_connection[self.solr_collection_name]
    else:
        from solr import Solr
        from urlparse import urljoin
        collection_url = reduce(urljoin, (self.solr_servers[0],
                                          self.solr_web_app,
                                          self.solr_collection_name))
        if isinstance(collection_url, unicode):
            collection_url = collection_url.encode("UTF-8")
        self.solr_collection = Solr(url=collection_url,
                                    http_user=self.solr_user,
                                    http_pass=self.solr_password,
                                    timeout=self.solr_timeout)
    if self.solr_cache_max_len > 0:
        max_len = self.solr_cache_max_len * 2
    else:
        max_len = 2
    self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len)
    self.locks[spider.name] = Lock()
def index():
    s = Solr('http://localhost:8983/solr/jobs')
    db = get_db()
    for job in db.jobs.find():
        job["id"] = str(job.pop("_id"))
        job["updated_at"] = date.today()
        try:
            company = job["company"]
            location = job["location"]
            geo = db.company_coordinates.find(_get_query(company, location))[0]
            job["geo_location"] = "%f,%f" % (geo["lat"], geo["lng"])
            print dict(job)
            s.add(dict(job), commit=True)
        except Exception as e:
            print e
            pass
def post_to_solr(self):
    print >> sys.stderr, "POSTing product ID %s to Solr ..." % self.id
    data = {
        'id': "%s" % self.id,
        'product': self.name,
        'reference': self.reference,
        'provider': self.provider.name,
        'origin': self.origin,
        'price': str(self.price),
        'packaging': self.packaging,
        'offer_nb': self.offer_nb,
        'nomenclature': self.nomenclature,
        'category': self.category and self.category.name or None,
        'sub_category': self.sub_category and self.sub_category.name or None,
        'last_change': self.last_change.strftime("%d/%m/%Y"),
        'expiry': self.expiry and self.expiry.strftime("%d/%m/%Y") or None
    }
    solr = Solr()
    solr.post(data)
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(
        couchdb_obj=get_couchdb(),
        collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
def loadAllObjectPids(db):
    solr = Solr()
    solr.loadConfig('solr.cfg', 'prod')
    queryParams = {
        'q': '*:*',
        'fl': 'PID',
    }
    # Test small sample
    # queryParams = {
    #     'q': 'RELS_EXT_isConstituentOf_uri_s:info\:fedora\/mtholyoke\:25060',
    #     'fl': 'PID',
    # }
    response = solr.query(queryParams)
    allpids = list(map(lambda x: x['PID'], response))
    for pid in allpids:
        try:
            db.cursor.execute("INSERT INTO md5s_remote VALUES ('%s', NULL, 0)" % pid)
        except sqlite3.IntegrityError as e:
            logging.warning(str(e) + ' ' + pid)
    db.connection.commit()
def delete_post_from_solr(post_id):
    logger.info("deleting post with id %d" % post_id)
    try:
        solr = Solr(settings.SOLR_FORUM_URL)
        solr.delete_by_id(post_id)
        solr.commit()
    except SolrException as e:
        logger.error('could not delete post with id %s (%s).' % (post_id, e))
def sync_couch_collection_to_solr(collection_key):
    # This works from inside an environment with default URLs for couch & solr
    delete_solr_collection(collection_key)
    URL_SOLR = os.environ.get('URL_SOLR', None)
    collection_key = str(collection_key)  # Couch needs string keys
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    updated_docs = []
    num_added = 0
    report = defaultdict(int)
    for r in v:
        try:
            fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        except ValueError as e:
            report[e.dict_key] += 1
            print(e.message, file=sys.stderr)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        # TODO: here is where to check if existing and compare collection vals
        try:
            check_nuxeo_media(solr_doc)
        except ValueError as e:
            print(e.message, file=sys.stderr)
            report[e.dict_key] += 1
            continue
        updated_docs.append(solr_doc)
        num_added += push_doc_to_solr(solr_doc, solr_db=solr_db)
    solr_db.commit()
    publish_to_harvesting(
        'Synced collection {} to solr'.format(collection_key),
        harvesting_report(collection_key, updated_docs, num_added, report))
    return updated_docs, report
def main(collection_key):
    v = CouchDBCollectionFilter(couchdb_obj=get_couchdb(),
                                collection_key=collection_key)
    solr_db = Solr(URL_SOLR)
    results = []
    for r in v:
        dt_start = dt_end = datetime.datetime.now()
        try:
            doc = fill_in_title(r.doc)
            has_required_fields(r.doc)
        except KeyError, e:
            print(e.message)
            continue
        solr_doc = map_couch_to_solr_doc(r.doc)
        results.append(solr_doc)
        solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
        dt_end = datetime.datetime.now()
def main(config):
    cfg = cliconfig(config)
    session = SessionFactory(cfg['database']['url']).create()
    server = Solr(str(cfg['solr']['url']),
                  http_user=cfg['solr'].get('username'),
                  http_pass=cfg['solr'].get('password'))
    documents = []
    q = session.query(Address).filter(Address.prefecture.isnot(None))
    q = q.order_by(Address.zipcode)
    for r in ifilter(lambda r: r, imap(transform, q)):
        documents.append(r)
        if len(documents) >= COMMIT_UNIT:
            server.add_many(documents)
            documents = []
    if len(documents) > 0:
        server.add_many(documents)
    server.commit()
def search(request):
    query = request.GET.get("q", None)
    facet_query = request.GET.get("fq", '')
    if query:
        solr = Solr()
        solr.query({'q': query, 'fq': facet_query, 'fl': '*'})
        suggestion = solr.suggestion()
        return render(
            request, 'product/search.html', {
                'numFound': solr.numFound(),
                'query': query,
                'facet_query': facet_query and facet_query.split(':')[1] or None,
                'facets': solr.facet_fields(),
                'suggestion': suggestion,
                'solr_docs': solr.docs()
            })
    else:
        return render(request, 'product/search.html', {})
def main():
    # Step : Parse CLI args
    parser = ArgumentParser(
        description="This tool can read JSON line dump and index to solr.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        version="1.0")
    parser.add_argument("-i", "--in", help="Path to Input JSON line file.", required=True)
    parser.add_argument("-s", "--solr-url", help="URL of Solr core.",
                        default="http://localhost:8983/solr/docsdev")
    parser.add_argument("-sc", "--schema",
                        help="Schema Mapping to be used. Options:\n%s" % schema_map.keys(),
                        default='journal')
    args = vars(parser.parse_args())
    if args['schema'] not in schema_map:
        print("Error: %s schema is unknown. Known options: %s" % (args['schema'], schema_map.keys()))
        sys.exit(1)
    schema_mapper = schema_map[args['schema']]
    docs = read_jsonlines(args['in'])
    # map to schema
    docs = map(schema_mapper, docs)

    def merge_lists(docs):
        for docgroup in docs:
            for doc in docgroup:
                yield doc

    docs_solr = merge_lists(docs)
    # send to solr
    solr = Solr(args['solr_url'])
    index(solr, docs_solr, len(docs))
def send_posts_to_solr(posts):
    logger.info("adding forum posts to solr index")
    logger.info("creating XML")
    documents = [convert_to_solr_document(p) for p in posts]
    try:
        logger.info("posting to Solr")
        solr = Solr(settings.SOLR_FORUM_URL)
        solr.add(documents)
        solr.commit()
    except SolrException as e:
        logger.error("failed to add posts to solr index, reason: %s" % str(e))
    logger.info("done")
def process(self, duplicate_removal=False):
    print 'processing...'
    session = self.Session()
    flag = True
    sleepcount = 0
    while flag == True:
        flag = True
        # sleep sleepcount interval time
        mysleep(sleepcount)
        if (sleepcount == 0):
            sleepcount = 1
        else:
            sleepcount = sleepcount + 1
        # endif
        # print '[WeixiaoSim : ' + get_current_time_str() + '] - new loop to see if we have newly found events...'
        for instance in session.query(Events).filter(Events.status == '0').filter(
                Events.city == 'beijing').order_by(Events.date, Events.time).limit(10000):
            # flag = True
            sleepcount = 0
            # print 'Processing event - ' + instance.title.encode('utf-8') + ' ' + instance.place.encode('utf-8')
            source = instance.source
            title = instance.title
            desc = instance.desc
            date = instance.date
            category = instance.category
            time = instance.time
            place = instance.place
            fee = instance.fee
            feelist = instance.feelist
            imageurl = instance.image
            originurl = instance.link
            # print place.encode('utf-8')
            # print date
            # print time
            if is_num(fee) == False:
                instance.status = '2'
                print 'Note: fee (' + fee + ') is strange. So we will skip it...'
                session.commit()
                continue
            # endif
            # determine if the event is obsolete one
            year, month, day = get_date_detail(date)
            if validate_time(time) == False:
                instance.status = '2'
                print 'Note: time (' + time + ') is strange. So we will skip it...'
                session.commit()
                continue
            else:
                hour, minute = get_time_detail(time)
            event_time = build_datetime(year, month, day, hour, minute)
            current_china_datetime = datetime.now(pytz.timezone('Asia/Shanghai'))
            if (event_time.isoformat(' ') <= current_china_datetime.isoformat(' ')):
                print '[WeixiaoSim : ' + get_current_time_str() + 'processing: ' + title + ' with date ' + date + ' ' + time + ' is obsolete. So we will skip it...'
                instance.status = '1'
                session.commit()
                continue
            # endif
            loc_details = getDetailedInfo(place)
            if loc_details['status'] == 1:
                # raw_input("Press Enter to continue...")
                # FIXME - put this strange address into TBD_address table
                print '[WeixiaoSim : ' + get_current_time_str() + 'processing: ' + title + ' with place (' + place + ') is strange. So we will skip it...'
                instance.status = '3'
                session.commit()
                continue
            # endif
            # print loc_details['formatted_address'].encode('utf-8')
            # print loc_details['province'].encode('utf-8')
            # print loc_details['city'].encode('utf-8')
            # print loc_details['areaname'].encode('utf-8')
            # print loc_details['areacode'].encode('utf-8')
            # print loc_details['longitude']
            # print loc_details['latitude']
            # wrap info into potentialItem
            potentialItem = {}
            potentialItem['source'] = to_unicode_or_bust(source)
            potentialItem['title'] = to_unicode_or_bust(title)
            potentialItem['desc'] = to_unicode_or_bust(desc)
            potentialItem['category'] = to_unicode_or_bust(category)
            potentialItem['date'] = to_unicode_or_bust(date)
            potentialItem['time'] = to_unicode_or_bust(time)
            potentialItem['place'] = to_unicode_or_bust(place)
            potentialItem['fee'] = to_unicode_or_bust(fee)
            potentialItem['feelist'] = to_unicode_or_bust(feelist)
            potentialItem['imageurl'] = to_unicode_or_bust(imageurl)
            potentialItem['originurl'] = to_unicode_or_bust(originurl)
            potentialItem['formatted_address'] = to_unicode_or_bust(loc_details['formatted_address'])
            potentialItem['province'] = to_unicode_or_bust(loc_details['province'])
            potentialItem['city'] = to_unicode_or_bust(loc_details['city'])
            potentialItem['areaname'] = to_unicode_or_bust(loc_details['areaname'])
            potentialItem['areacode'] = to_unicode_or_bust(loc_details['areacode'])
            potentialItem['longitude'] = to_unicode_or_bust(loc_details['longitude'])
            potentialItem['latitude'] = to_unicode_or_bust(loc_details['latitude'])
            # get all similar items (Q1) from search engine with criteria (query inputs)
            searchengine = Solr()
            # same - city, areacode, date, time
            q_areacode = 'areacode:' + loc_details['areacode']
            q_eventdate = 'eventdate:"' + date + '"'
            q_eventtime = 'eventtime:"' + time + '"'
            query = {}
            query['q'] = q_areacode.encode('utf-8') + ' AND ' + q_eventdate.encode('utf-8') + ' AND ' + q_eventtime.encode('utf-8')
            # almost - keywords from title and description
            # FIXME, now we do not provide this feature
            # this flag used to turn on and turn off the duplicate removal feature
            # duplicate_removal = False;
            instance.status = '1'
            if (duplicate_removal == True):
                # if len(Q1) == 0, regard this item as new item
                Q1 = searchengine.process(query)
                if (len(Q1) == 0):
                    # put this item to lele repository
                    self.addToLeleRepository(potentialItem)
                else:
                    # if not, create WeixiaoTask to determine if it is a new item or not
                    self.createWeixiaoSimTask(potentialItem, Q1)
                    # raw_input("Press Enter to continue...")
                # end if
            else:
                print 'no duplicate removal feature ...'
                try:
                    self.addToLeleRepository(potentialItem)
                except:
                    print 'exception happening'
                    instance.status = '4'
                # end try-except
            # endif
            # label this item as analyzed in the table of db - lelespider
            session.commit()
def main(url_couchdb=None,
         dbname=None,
         url_solr=None,
         all_docs=False,
         since=None):
    '''Use the _changes feed with a "since" parameter to only catch new changes
    to docs. The _changes feed will only have the *last* event on a document and
    does not retain intermediate changes.
    Setting the "since" to 0 will result in getting a _changes record for each
    document, essentially dumping the db to solr
    '''
    print('Solr update PID: {}'.format(os.getpid()))
    dt_start = datetime.datetime.now()
    print('Start time:{}'.format(dt_start))
    sys.stdout.flush()  # put pd
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    s3_seq_cache = CouchdbLastSeq_S3()
    if not since:
        since = s3_seq_cache.last_seq
    if all_docs:
        since = '0'
    print('Attempt to connect to {0} - db:{1}'.format(url_couchdb, dbname))
    print('Getting changes since:{}'.format(since))
    sys.stdout.flush()  # put pd
    db = get_couchdb(url=url_couchdb, dbname=dbname)
    changes = db.changes(since=since)
    previous_since = since
    last_since = int(changes['last_seq'])  # get new last_since for changes feed
    results = changes['results']
    n_up = n_design = n_delete = 0
    solr_db = Solr(url_solr)
    start_time = datetime.datetime.now()
    for row in results:
        cur_id = row['id']
        if '_design' in cur_id:
            n_design += 1
            print("Skip {0}".format(cur_id))
            continue
        if row.get('deleted', False):
            # need to get the solr doc for this couch
            resp = solr_db.select(q=''.join(('harvest_id_s:"', cur_id, '"')))
            if resp.numFound == 1:
                sdoc = resp.results[0]
                print('====DELETING: {0} -- {1}'.format(cur_id, sdoc['id']))
                solr_db.delete(id=sdoc['id'])
                n_delete += 1
            else:
                print("-----DELETION of {} - FOUND {} docs".format(
                    cur_id, resp.numFound))
        else:
            doc = db.get(cur_id)
            try:
                doc = fill_in_title(doc)
                has_required_fields(doc)
            except KeyError as e:
                print(e.message)
                continue
            except ValueError as e:
                print(e.message)
                continue
            try:
                try:
                    solr_doc = map_couch_to_solr_doc(doc)
                except OldCollectionException:
                    print('---- ERROR: OLD COLLECTION FOR:{}'.format(cur_id))
                    continue
                try:
                    check_nuxeo_media(solr_doc)
                except ValueError as e:
                    print(e.message)
                    continue
                solr_doc = push_doc_to_solr(solr_doc, solr_db=solr_db)
            except TypeError as e:
                print('TypeError for {0} : {1}'.format(cur_id, e))
                continue
            n_up += 1
            if n_up % 1000 == 0:
                elapsed_time = datetime.datetime.now() - start_time
                print("Updated {} so far in {}".format(n_up, elapsed_time))
    solr_db.commit()
    if not all_docs:
        s3_seq_cache.last_seq = last_since
    print("UPDATED {0} DOCUMENTS. DELETED:{1}".format(n_up, n_delete))
    print("PREVIOUS SINCE:{0}".format(previous_since))
    print("LAST SINCE:{0}".format(last_since))
    run_time = datetime.datetime.now() - dt_start
    print("RUN TIME:{}".format(run_time))
def __init__(self):
    self.url = "http://{}:{}/solr/{}".format(settings.HOST, settings.PORT, settings.COLLECTION)
    self.s = Solr(self.url, timeout=10)
o.write("\n") count += 1 return count def read_stream(filename): ''' Reads json line stream :param filename: path to json line :return: doc stream ''' with open(filename) as inf: for l in inf: yield json.loads(l) if __name__ == '__main__': url = "http://imagecat.dyndns.org:8983/solr/imagecatdev" filename = "docs.docs.jsonl" solr = Solr(url) docs = solr.query_iterator("lastModified:[1960-01-01T00:00:00Z TO 2005-12-31T00:00:00Z]", rows=1000, fl='id') count = store_stream(docs, filename) print("Wrote %d docs to %s" % (count, filename)) docs = read_stream(filename) updates = remove_last_modified(docs) count, success = solr.post_iterator(updates, False) print(success) print(count)
def delete_sound_from_solr(sound):
    logger.info("deleting sound with id %d" % sound.id)
    try:
        Solr(settings.SOLR_URL).delete_by_id(sound.id)
    except Exception, e:
        logger.error('could not delete sound with id %s (%s).' % (sound.id, e))
def add_item(solr_address, doc):
    s = Solr(solr_address)
    resp = s.add(doc, commit=True)
def delete_post_from_solr(post):
    logger.info("deleting post with id %d" % post.id)
    try:
        Solr(settings.SOLR_FORUM_URL).delete_by_id(post.id)
    except Exception, e:
        logger.error('could not delete post with id %s (%s).' % (post.id, e))
def jaccard(core, metric, threshold=0.01):
    solrURL = "http://localhost:8983/solr/" + core
    solrInstance = Solr(solrURL)
    if metric == "meta":
        docs = computeJaccardMeta(solrURL, solrInstance)
    elif metric == "value":
        docs = computeJaccardValue(solrInstance)
    json_data = {"name": "clusters", "children": []}
    prior_node = {
        "metadata": json.dumps(docs[0]),
        "name": docs[0]['id'].split('/')[-1],
        "path": os.environ["IMAGE_MOUNT"] + docs[0]['id'].split('/')[-1].split('.')[0] + ".jpg",
        "score": docs[0]["jaccard_{0}_abs".format(metric)]
    }
    prior = docs[0]["jaccard_{0}_abs".format(metric)]
    cluster0 = {"name": "cluster0", "children": [prior_node]}
    clusters = [cluster0]
    clusterCount = 0
    for i in range(1, len(docs)):
        node = {
            "metadata": json.dumps(docs[i]),
            "name": docs[i]['id'].split('/')[-1],
            "path": os.environ["IMAGE_MOUNT"] + docs[i]['id'].split('/')[-1].split('.')[0] + ".jpg",
            "score": docs[i]["jaccard_{0}_abs".format(metric)]
        }
        diff = prior - docs[i]["jaccard_{0}_abs".format(metric)]
        if diff <= threshold:
            clusters[clusterCount]["children"].append(node)
        else:
            clusterCount += 1
            newCluster = {"name": "cluster" + str(clusterCount), "children": [node]}
            clusters.append(newCluster)
        prior = docs[i]["jaccard_{0}_abs".format(metric)]
    json_data["children"] = clusters
    return json.dumps(json_data)
        objects = []
        scores = []
        for obj, confd in row.items():
            for o in obj.split(","):
                objects.append(o.strip())
                scores.append(confd)
        delta['objects'] = {'set': objects}
        delta['confidence'] = {'set': scores}
        yield delta


if __name__ == '__main__':
    # Get the CSV file from classifier-local.py
    if len(sys.argv) != 2:
        print("required args:\n <CSV_file>")
        sys.exit(1)
    infile = sys.argv[1]
    min_confidence = 0.30
    print("Reading from %s, Min confidence=%f" % (infile, min_confidence))
    solr_url = "http://localhost:8983/solr/imagecatdev"
    solr = Solr(solr_url)
    updates = generate_solr_updates(infile, min_confidence=min_confidence)
    count, res = solr.post_iterator(updates, commit=True, buffer_size=1000)
    print("Res : %s; count=%d" % (res, count))
    '''
    from pprint import pprint
    for u in updates:
        pprint(u)
    '''
class SolrPipeline(object):
    """
    Scrapy pipeline that stores items in Solr. SOLR_SERVERS must be configured in
    the Scrapy project's settings.py to point at the Solr server(s) to connect to,
    and at least one of SOLR_COLLECTION_MAP and SOLR_COLLECTION_DEFAULT must be set:
    the former maps spider names to Solr collection names, the latter names the
    default collection to store into.

    The following settings are optional:
    SOLR_CLOUD_MODE: whether to run in SolrCloud mode, defaults to False
    SOLR_WEB_APP: the web app Solr runs under, defaults to "solr"
    SOLR_USER: Solr cluster user
    SOLR_PASSWORD: password of the Solr cluster user
    SOLR_DETECT_LIVE_NODES: whether to auto-detect the active nodes of the Solr
        cluster, defaults to False
    SOLR_TIMEOUT: Solr timeout, defaults to 10 seconds
    SOLR_CACHE_MAX_SIZE_PER_SPIDER: size in bytes of the Solr bulk-submit cache
        each spider may use, defaults to 10 * 1024 * 1024 (10 MB)
    DEFAULT_CACHE_MAX_SIZE_PER_SPIDER: used as the value of
        SOLR_CACHE_MAX_SIZE_PER_SPIDER when that setting is absent
    SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER: maximum number of elements in each
        spider's Solr bulk-submit cache, defaults to 100
    DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER: used as the value of
        SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER when that setting is absent

    This component assumes that every field of an item object has a corresponding
    definition in the schema.xml of the target collection.
    """

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.solr_connection = None
        self.solr_collection = None
        solr_cloud_mode = crawler.settings.get("SOLR_CLOUD_MODE")
        self.solr_cloud_mode = solr_cloud_mode if solr_cloud_mode else False
        solr_servers = crawler.settings.get("SOLR_SERVERS")
        if not solr_servers:
            log.msg("No field SOLR_SERVERS in settings.py!", level=log.CRITICAL)
            raise NotConfigured
        elif not isinstance(solr_servers, list):
            log.msg("Field SOLR_SERVERS in settings.py must be a list of URL(s) of solr server(s)!",
                    level=log.CRITICAL)
            raise NotConfigured
        elif not self.solr_cloud_mode and len(solr_servers) > 1:
            log.msg("Can't specify multi URL(s) when SOLR_CLOUD_MODE is False!", level=log.CRITICAL)
            raise NotConfigured
        self.solr_servers = solr_servers
        solr_web_app = crawler.settings.get("SOLR_WEB_APP")
        solr_web_app = solr_web_app if solr_web_app else "solr"
        self.solr_web_app = solr_web_app if solr_web_app.endswith('/') else solr_web_app + '/'
        solr_collection_default = crawler.settings.get('SOLR_COLLECTION_DEFAULT')
        solr_collection_map = crawler.settings.get('SOLR_COLLECTION_MAP')
        if not solr_collection_default and not solr_collection_map:
            log.msg(
                "You must at least set one of the two fields SOLR_COLLECTION_MAP "
                "and SOLR_COLLECTION_DEFAULT in settings.py!",
                level=log.CRITICAL)
            raise NotConfigured
        if solr_collection_map and not isinstance(solr_collection_map, dict):
            log.msg("Field SOLR_COLLECTION_MAP in settings.py must be a dict!", level=log.CRITICAL)
            raise NotConfigured
        self.solr_collection_name = solr_collection_default
        self.solr_collection_map = solr_collection_map if solr_collection_map else {}
        self.solr_user = crawler.settings.get("SOLR_USER")
        self.solr_password = crawler.settings.get("SOLR_PASSWORD")
        solr_detect_live_nodes = crawler.settings.get("SOLR_DETECT_LIVE_NODES")
        self.solr_detect_live_nodes = solr_detect_live_nodes if solr_detect_live_nodes else False
        solr_timeout = crawler.settings.get("SOLR_TIMEOUT")
        self.solr_timeout = solr_timeout if solr_timeout is not None else 10
        solr_cache_max_size = crawler.settings.get("SOLR_CACHE_MAX_SIZE_PER_SPIDER")
        if solr_cache_max_size is None:
            solr_cache_max_size = crawler.settings.get("DEFAULT_CACHE_MAX_SIZE_PER_SPIDER")
        self.solr_cache_max_size = solr_cache_max_size if solr_cache_max_size is not None else 10 * 1024 * 1024
        solr_cache_max_len = crawler.settings.get("SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER")
        if solr_cache_max_len is None:
            solr_cache_max_len = crawler.settings.get("DEFAULT_CACHE_MAX_ELEMENTS_PER_SPIDER")
        self.solr_cache_max_len = solr_cache_max_len if solr_cache_max_len is not None else 100
        self.crawler = crawler
        self.cache_buffer = {}
        self.locks = {}

    def open_spider(self, spider):
        solr_collection_name = self.solr_collection_map.get(spider.name)
        if solr_collection_name:
            self.solr_collection_name = solr_collection_name
        if not self.solr_collection_name:
            spider.log("No collection associated with " + spider.name + "!", level=log.CRITICAL)
            raise CloseSpider
        if self.solr_cloud_mode:
            from solrcloudpy import SolrConnection
            self.solr_connection = SolrConnection(server=self.solr_servers,
                                                  detect_live_nodes=self.solr_detect_live_nodes,
                                                  user=self.solr_user,
                                                  password=self.solr_password,
                                                  timeout=self.solr_timeout,
                                                  webappdir=self.solr_web_app)
            self.solr_collection = self.solr_connection[self.solr_collection_name]
        else:
            from solr import Solr
            from urlparse import urljoin
            collection_url = reduce(urljoin, (self.solr_servers[0],
                                              self.solr_web_app,
                                              self.solr_collection_name))
            if isinstance(collection_url, unicode):
                collection_url = collection_url.encode("UTF-8")
            self.solr_collection = Solr(url=collection_url,
                                        http_user=self.solr_user,
                                        http_pass=self.solr_password,
                                        timeout=self.solr_timeout)
        if self.solr_cache_max_len > 0:
            max_len = self.solr_cache_max_len * 2
        else:
            max_len = 2
        self.cache_buffer[spider.name] = SpiderCache(maxlen=max_len)
        self.locks[spider.name] = Lock()

    def close_spider(self, spider):
        try:
            self.index_item(None, spider, True)
        finally:
            self.cache_buffer[spider.name].clear()

    @check_spider_pipeline
    def process_item(self, item, spider):
        self.index_item(item, spider, False)
        return item

    def index_item(self, item, spider, close_spider):
        lock = self.locks[spider.name]
        lock.acquire()
        cache_queue = self.cache_buffer[spider.name]
        if not close_spider:
            cache_queue.append(dict(item))
        cache_len = len(cache_queue)
        cache_size = sizeof(cache_queue)
        try:
            if (close_spider and cache_len > 0) or cache_len >= self.solr_cache_max_len \
                    or (cache_len > 0 and cache_size >= self.solr_cache_max_size):
                if self.solr_cloud_mode:
                    self.solr_collection.add(list(cache_queue))
                else:
                    self.solr_collection.add_many(list(cache_queue))
                # self.solr_collection.commit()
                spider.log(
                    "{cache_len} items of size {cache_size} byte(s) indexed in solr".format(
                        cache_len=cache_len, cache_size=cache_size),
                    level=log.INFO)
                cache_queue.clear()
        except Exception, e:
            trace_info = traceback.format_exc()
            spider.log(
                "Failed to index item(s): {message}\n{trace_info}".format(
                    message=e.message, trace_info=trace_info),
                level=log.ERROR)
        finally:
            lock.release()
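# The docstring above lists the settings this pipeline reads from the Scrapy
# project's settings.py. Below is a minimal sketch of such a configuration;
# every value (server URL, collection names, spider name, module path) is a
# hypothetical placeholder, not taken from any real project.

# settings.py
ITEM_PIPELINES = {"myproject.pipelines.SolrPipeline": 300}

SOLR_SERVERS = ["http://localhost:8983/"]                        # list of Solr server URL(s)
SOLR_WEB_APP = "solr"                                            # web app Solr runs under
SOLR_COLLECTION_MAP = {"example_spider": "example_collection"}   # spider name -> collection
SOLR_COLLECTION_DEFAULT = "default_collection"                   # fallback collection
SOLR_CLOUD_MODE = False                                          # single-server mode
SOLR_TIMEOUT = 10                                                # seconds
SOLR_CACHE_MAX_ELEMENTS_PER_SPIDER = 100                         # flush after 100 buffered items
SOLR_CACHE_MAX_SIZE_PER_SPIDER = 10 * 1024 * 1024                # ...or after ~10 MB of items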
            groups.append([])
            groups[-1].append(tok)
            spanning = True
        else:
            spanning = False
    return groups


def catch_names(path, finder):
    with codecs.open(path, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            groups = finder.scan_names(line)
            for group in groups:
                finder.beam(group)


if __name__ == '__main__':
    sys.argv = ['', '-in', '../data/set1.source.tok',
                '-solr', 'http://localhost:8983/solr/name']
    p = ArgumentParser()
    p.add_argument("-in", required=True, help="Input File.")
    p.add_argument("-solr", required=True,
                   help="Solr URL. Eg:http://localhost:8983/solr/name")
    args = vars(p.parse_args())
    solr = Solr(args['solr'])
    finder = NameFinder(solr, stem_func=stem)
    catch_names(args['in'], finder)
            u['phonenumbers'] = {'set': d['ner_phone_number_ts_md']}
            u['ner_phone_number_ts_md'] = {'set': None}
        else:
            print("Error: Skipped")
            continue
        yield u


def read_stream(filename):
    '''
    Reads json line stream
    :param filename: path to json line
    :return: doc stream
    '''
    with open(filename) as inf:
        for l in inf:
            yield json.loads(l)


if __name__ == '__main__':
    url = "http://127.0.0.1:8983/solr/imagecatdev"
    solr = Solr(url)
    docs = solr.query_iterator("ner_phone_number_t_md:* OR ner_phone_number_ts_md:*",
                               rows=1000,
                               fl='id,ner_phone_number_t_md,ner_phone_number_ts_md',
                               sort="indexedAt asc")
    updates = fix_phonenumbers(docs)
    count, success = solr.post_iterator(updates, False, buffer_size=1000)
    solr.commit()
    print(success)
    print(count)