def simple_query(page):
    """Run the 'scripties' search for the given 1-based *page*.

    Returns a dict with the Solr response, paging info, facet values and
    the filter parameters currently present on the request.
    """
    query = buildQuery()
    print(query)
    solr = SolrClient(current_app.config['SOLR'])
    # Count-only query first: determines how many result pages exist.
    count = solr.query('scripties', {'q': query, 'rows': '0'}).get_num_found()
    pages = math.ceil(count / 10)
    start = (page - 1) * 10
    # Second query fetches the requested page together with facet counts.
    res = solr.query(
        'scripties', {
            'q': query,
            'rows': '10',
            'start': start,
            'fl': 'id,titel,auteur,jaar',
            'facet': True,
            'facet.field': ['jaar', 'type', 'faculteit'],
        })
    facets = res.get_facets()
    return {
        'result': res,
        'pages': pages,
        'page': page,
        'f_jaar': facets['jaar'],
        'f_type': facets['type'],
        'f_faculteit': collect(facets['faculteit']),
        'f': request.args.get('faculteit'),
        'j': request.args.get('jaar'),
        't': request.args.get('type'),
    }
def setUpClass(self):
    """Create the shared client, reset the test collection schema, and
    index 50 random documents."""
    self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                           devel=True,
                           auth=test_config['SOLR_CREDENTIALS'])
    self.rand_docs = RandomTestData()
    self.docs = self.rand_docs.get_docs(50)
    self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
    # Schema setup is deliberately best-effort (fields may already be
    # present/absent), but FIX: use `except Exception` instead of a bare
    # `except` so KeyboardInterrupt/SystemExit still propagate.
    for field in test_config['collections']['copy_fields']:
        try:
            self.solr.collections.delete_copy_field(
                test_config['SOLR_COLLECTION'], field)
        except Exception:
            pass
    for field in test_config['collections']['fields']:
        try:
            self.solr.collections.create_field(
                test_config['SOLR_COLLECTION'], field)
        except Exception:
            pass
    # Index some data
    self.solr.index_json(test_config['SOLR_COLLECTION'],
                         json.dumps(self.docs))
    self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
def update_ml_tag(solr: SolrClient, tweets_core_name, tags_core_name, docs,
                  feat_vectorizer, ml_model, selected_features,
                  hate_indicative_features, scaling_option, sysout, logger):
    """Classify each doc's tweet text with the ML model, then write the
    predicted tag and risk score back into the tweets core."""
    tweets = []
    for d in docs:
        text = d['status_text']
        if "rt @" in text.lower():
            # Skip past the "rt @" retweet marker before classifying.
            start = text.lower().index("rt @") + 4
            # BUG FIX: the original used text[start] (a single character)
            # instead of the slice text[start:], so every retweet was
            # classified on one letter of its text.
            text = text[start:].strip()
        tweets.append(text)
    # ml classify, also compute risk scores
    logger.info("begin ml classification for tweets={}, time={}".format(
        len(tweets), datetime.datetime.now()))
    tags, risk_scores = ml_tag(tweets, feat_vectorizer, ml_model,
                               selected_features, hate_indicative_features,
                               scaling_option, sysout, logger, solr,
                               tags_core_name)
    logger.info("ml classification done. updating solr index...{}".format(
        datetime.datetime.now()))
    count = 0
    for idx, tag in enumerate(tags):
        if tag == 0:
            # tag 0 == hate; count for the summary print below
            count += 1
        d = docs[idx]
        d['ml_tag'] = str(tag)
        d['tweet_risk'] = risk_scores[idx]
    print(count)
    solr.index(tweets_core_name, docs)
    code = iu.commit(tweets_core_name)
def calculate(term, all_candidates, solr_core_url):
    """Compute the C-Value termhood score for *term*.

    Returns a ``(term, c_value)`` tuple; longer candidate terms that
    contain *term* discount its score.
    """
    client = SolrClient(solr_core_url)
    longer_terms = CValueRanker.get_longer_terms(term, all_candidates)
    freq_map, _ = client.totaltermfreq(FIELD_CONTENT, {term})
    term_freq = next(iter(freq_map.values()))
    word_count = len(TermUtil.normalise(term).split(' '))
    log2a = math.log(word_count, 2)
    if not longer_terms:
        # No containing terms: plain frequency weighted by log2(|words|).
        return (term, log2a * term_freq)
    # Discount by the average frequency of the longer containing terms.
    p_ta = len(longer_terms)
    sum_fb = TermRanker.sum_ttf_candidates(client, longer_terms)
    return (term, log2a * (term_freq - (1 / p_ta) * sum_fb))
def update_pmi_scores(existing_tags: dict, existing_tag_pairs: dict,
                      solr: SolrClient, core_name, batch_commit):
    """Compute a PMI-style score for every tag pair and (re)index the pair
    documents into *core_name*, committing in batches of *batch_commit*."""
    pending = 0
    batch = []
    for tag_pair, data in existing_tag_pairs.items():
        pending += 1
        # Flush a full batch before handling the next pair.
        if pending > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            pending = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))
        co_freq = data[util.tag_index_field_frequency]
        pair_tags = tag_pair.split(" ")
        t1_freq = existing_tags[pair_tags[0]][util.tag_index_field_frequency]
        t2_freq = existing_tags[pair_tags[1]][util.tag_index_field_frequency]
        # Guard against log(0); denominator is padded by a small constant.
        pmi = 0 if co_freq == 0 else numpy.emath.log(
            co_freq / (t1_freq * t2_freq + util.score_denominator_min))
        data[util.tag_index_field_pmi] = pmi
        data[util.tag_index_field_text] = tag_pair
        data[util.tag_index_field_type] = 1
        batch.append(data)
    # Index whatever remains after the loop.
    solr.index(core_name, batch)
    code = util.commit(core_name)
class TwitterSearch():
    """Searches Twitter for keywords and indexes matching tweets into Solr."""
    __solr = None
    __core = None
    __api = None

    def __init__(self, oauth):
        super().__init__()
        self.__solr = SolrClient(iu.solr_url)
        self.__core = iu.solr_core_tweets
        self.__api = tweepy.API(oauth)

    def index(self, keywords):
        """Fetch up to 500 recent English tweets per keyword and index each
        one into the tweets core, committing once at the end."""
        for keyword in keywords:
            count = 0
            for status in tweepy.Cursor(self.__api.search,
                                        q=keyword,
                                        tweet_mode="extended",
                                        lang="en").items(500):
                count += 1
                # created_at_time
                str_created_at = status.created_at
                # BUG FIX: the original called str_created_at.utcnow();
                # utcnow() is a classmethod that returns the *current* time,
                # so every tweet was stamped with "now" instead of its real
                # creation time. Format the tweet's own timestamp.
                str_solr_time = str_created_at.strftime(SOLR_TIME_PATTERN)
                docs = [{
                    'id': status.id,
                    'created_at': str_solr_time,
                    'status_text': status.full_text
                }]
                self.__solr.index(self.__core, docs)
            print(str(count) + "," + keyword)
        code = iu.commit(iu.solr_core_tweets)
def update_solr(self, task=None):
    """Index one task (default: self.task) into the 'listmanager' collection
    and commit via a raw update POST."""
    solr = SolrClient(SOLR_URI + '/solr/')
    collection = 'listmanager'
    if not task:
        task = self.task
    doc = {
        'id': task.id,
        'title': task.title,
        'note': task.note if task.note else '',
        # keyword names are more reliable than the comma-separated tag column
        'tag': [k.name for k in task.keywords],
        'completed': task.completed != None,
        'star': task.star,  # not yet reflected in the schema
        'context': task.context.title,
        'folder': task.folder.title,
    }
    json_docs = json.dumps([doc])
    response = solr.index_json(collection, json_docs)
    # solr.commit(collection, waitSearcher=False) did not appear to work,
    # so commit through a plain POST to the update handler instead.
    url = SOLR_URI + '/solr/' + collection + '/update'
    r = requests.post(url, data={"commit": "true"})
    root = ET.fromstring(r.text)
    # Solr reports status 0 on success.
    if root[0][0].text == '0':
        print(self.colorize("solr update successful", 'yellow'))
    else:
        print(self.colorize("there was a problem with the solr update", 'yellow'))
def suggest():
    """Return autocomplete suggestions for the 'query' request argument as JSON."""
    query_key = request.args.get('query')
    client = SolrClient('http://localhost:8983/solr')
    res = client.query('myexample', {'q': query_key}, 'suggest')
    suggestions = res.data['suggest']['suggest'][query_key]['suggestions']
    return json.dumps(suggestions)
def test_solr_to_solr_with_date(self):
    """Reindex with a date field; verify the sort param and doc parity."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config['SOLR_SERVER'][0],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    reindexer = Reindexer(source=solr,
                          source_coll='source_coll',
                          dest=solr,
                          dest_coll='dest_coll',
                          date_field='index_date')
    reindexer.reindex()
    # The sort clause may land in either of two logged actions.
    try:
        self.assertTrue(solr.transport._action_log[1]['params']['params']
                        ['sort'] == 'index_date asc, id desc')
    except KeyError:
        self.assertTrue(solr.transport._action_log[2]['params']['params']
                        ['sort'] == 'index_date asc, id desc')
    # BUG FIX: list.sort() returns None, so the original compared
    # None == None and always passed. Compare sorted() copies instead.
    self.assertEqual(
        sorted(solr.query(self.colls[0], {
            'q': '*:*',
            'rows': 10000000
        }).docs,
               key=lambda x: x['id']),
        sorted(solr.query(self.colls[1], {
            'q': '*:*',
            'rows': 10000000
        }).docs,
               key=lambda x: x['id']),
    )
def search(query_dict):
    """Build a Solr query from field->value pairs and return matching docs.

    Empty values are skipped; when no usable criteria remain the query
    falls back to matching everything ('*:*').
    """
    # instantiate solr connection
    solr = SolrClient('http://localhost:8983/solr')
    # Join each non-empty field:value criterion with AND (idiomatic join
    # replaces the original manual item_count/concatenation bookkeeping).
    criteria = [
        '{}:{}'.format(field, value)
        for field, value in query_dict.items() if value
    ]
    # FIX: the original produced an empty query string when the dict was
    # non-empty but every value was blank; fall back to *:* there too.
    query_string = ' AND '.join(criteria) if criteria else '*:*'
    res = solr.query('lyrics', {
        'q': query_string,
    })
    return res.data['response']['docs']
def update_tagrisk_scores(existing_tags: dict, solr: SolrClient, core_name,
                          batch_commit):
    """Compute a risk score for every tag and (re)index the tag documents
    into *core_name*, committing in batches of *batch_commit*."""
    # FIX: numpy.math was an undocumented alias of the stdlib math module
    # and was removed in NumPy 2.0; math.sqrt is byte-for-byte equivalent.
    import math
    count = 0
    batch = []
    for tag, data in existing_tags.items():
        count += 1
        # Flush a full batch before handling the next tag.
        if count > batch_commit:
            solr.index(core_name, batch)
            code = util.commit(core_name)
            count = 0
            batch = []
            logger.info("\t done batch size={}".format(batch_commit))
        freq = data[util.tag_index_field_frequency]
        freqh = data[util.tag_index_field_frequencyh]
        if freqh == 0:
            riskscore = 0
        else:
            # sqrt of the hate-frequency ratio; denominator padded to
            # avoid division by zero.
            riskscore = math.sqrt(freqh / (freq + util.score_denominator_min))
        data[util.tag_index_field_risk_score] = riskscore
        data[util.tag_index_field_text] = tag
        data[util.tag_index_field_type] = 0
        batch.append(data)
    # commit the remaining partial batch
    solr.index(core_name, batch)
    code = util.commit(core_name)
def test_solr_to_solr_reindex_and_resume_reverse(self):
    """Reindex roughly half the source collection, then resume() to finish."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config["SOLR_SERVER"][0],
                      auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(source=solr,
                          source_coll="source_coll",
                          dest=solr,
                          dest_coll="dest_coll",
                          date_field="date")
    # Precondition: all data in source, destination empty.
    self.assertEqual(
        len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
        50000)
    self.assertEqual(
        len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        0)
    # Pick an approximate midpoint of the indexed date range.
    half_span = (self._end_date - self._start_date).days / 2
    midpoint = datetime.datetime.now() - datetime.timedelta(days=half_span)
    # First pass: restrict the fq to the newer half of the data.
    reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")])
    sleep(10)
    # The first pass should have copied a partial-but-meaningful subset.
    dest_count = len(
        solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
    s_count = len(
        solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
    self.assertTrue(s_count > dest_count > s_count * 0.20)
    reindexer.resume()
    sleep(10)
    # After resuming, both collections must hold the same number of docs.
    self.assertEqual(
        len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
        len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
    )
def detail_query(key):
    """Fetch the detail fields of a single thesis by its Solr id."""
    solr = SolrClient(current_app.config['SOLR'])
    params = {
        'q': 'id:{}'.format(key),
        'fl': 'titel,auteur,jaar,supervisor,type,faculteit,opleiding,taal',
    }
    return solr.query('scripties', params)
def index_data():
    """Serialize the stock documents and index + commit them into 'stocks'."""
    client = SolrClient('http://localhost:8983/solr')
    client.index_json('stocks', json.dumps(get_data()))
    client.commit('stocks')
def index_json():
    """Index one sample document into the 'test' collection and commit."""
    client = SolrClient('http://localhost:8983/solr')
    sample_docs = [{'id': '8', 'field8': 'value8'}]
    client.index_json('test', json.dumps(sample_docs))
    client.commit('test')
def setUpClass(self):
    """Create the shared client and the source/dest collection pair."""
    logging.debug("Starting to run Reindexer Tests")
    self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                           devel=True,
                           auth=test_config['SOLR_CREDENTIALS'])
    # [source, destination] collections used by the reindexer tests.
    self.colls = [
        test_config['SOLR_REINDEXER_COLLECTION_S'],
        test_config['SOLR_REINDEXER_COLLECTION_D'],
    ]
    self.rand_docs = RandomTestData()
def get_solr():
    """Run a fixed facet query against 'scripties' and return the hit count."""
    solr = SolrClient(current_app.config['SOLR'])
    params = {
        'q': 'titel:muslim',
        'facet': True,
        'facet.field': 'taal',
    }
    return solr.query('scripties', params).get_results_count()

# def get_post(id, check_author=True):
#     post = get_db().execute(
#         'SELECT p.id, title, body, created, author_id, username'
#         ' FROM post p JOIN user u ON p.author_id = u.id'
#         ' WHERE p.id = ?',
#         (id,)
#     ).fetchone()
#     if post is None:
#         abort(404, "Post id {0} doesn't exist.".format(id))
#     if check_author and post['author_id'] != g.user['id']:
#         abort(403)
#     return post

# @bp.route('/<int:id>/update', methods=('GET', 'POST'))
# @login_required
# def update(id):
#     post = get_post(id)
#     if request.method == 'POST':
#         title = request.form['title']
#         body = request.form['body']
#         error = None
#         if not title:
#             error = 'Title is required.'
#         if error is not None:
#             flash(error)
#         else:
#             db = get_db()
#             db.execute(
#                 'UPDATE post SET title = ?, body = ?'
#                 ' WHERE id = ?',
#                 (title, body, id)
#             )
#             db.commit()
#             return redirect(url_for('blog.index'))
#     return render_template('blog/update.html', post=post)

# @bp.route('/<int:id>/delete', methods=('POST',))
# @login_required
# def delete(id):
#     get_post(id)
#     db = get_db()
#     db.execute('DELETE FROM post WHERE id = ?', (id,))
#     db.commit()
#     return redirect(url_for('blog.index'))
def update_solr():
    """Sync tasks modified since the last recorded Solr sync into the
    'listmanager' collection, posting in batches of 100, then advance the
    stored sync timestamp.

    Returns ``(log_text, i)`` where *i* is the enumerate index of the last
    task processed in the final batch (-1 when no tasks were processed).
    """
    def now():
        # 'YYYY-MM-DD HH:MM:SS' — ISO timestamp with microseconds stripped.
        return datetime.now().isoformat(' ').split('.')[0]

    solr = SolrClient(SOLR_URI + '/solr/')
    collection = 'listmanager'
    solr_sync = remote_session.query(Sync).get('solr')
    last_solr_sync = solr_sync.timestamp
    # Log lines are prepended, so the newest entry ends up first.
    log = f"{now()}: last Solr sync = {last_solr_sync.isoformat(' ').split('.')[0]}\n"
    tasks = remote_session.query(Task).filter(Task.modified > last_solr_sync)
    log = f"{now()}: number of tasks modified since "\
        f"last sync = {str(tasks.count())}\n" + log
    # NOTE(review): `max` shadows the builtin here; it is the task count
    # rounded to the nearest hundred plus headroom so range() covers all rows.
    max = round(tasks.count(), -2) + 200
    i = -1
    s = 0
    # Page through the query result [s:n) in slices of 100 tasks.
    for n in range(100, max, 100):
        documents = []
        for i, task in enumerate(tasks[s:n]):
            document = {}
            document['id'] = task.id
            document['title'] = task.title
            document['note'] = task.note if task.note else ''
            document['tag'] = [t for t in task.tag.split(',')
                               ] if task.tag else []
            document['completed'] = task.completed != None
            document[
                'star'] = task.star  # haven't used this yet and schema doesn't currently reflect it
            # context and folder titles are stored even though they are not
            # currently searched on
            document['context'] = task.context.title
            document['folder'] = task.folder.title
            documents.append(document)
        json_docs = json.dumps(documents)
        response = solr.index_json(collection, json_docs)
        # solr.commit(collection, waitSearcher=False) doesn't actually seem
        # to work; commit through a raw POST to the update handler instead.
        url = SOLR_URI + '/solr/' + collection + '/update'
        r = requests.post(url, data={"commit": "true"})
        s = n
    # Stamp the new sync time slightly in the future to avoid immediately
    # re-syncing tasks touched during this run.
    solr_sync.timestamp = datetime.now() + timedelta(seconds=2)
    remote_session.commit()
    log = f"{now()}: new Solr sync = "\
        f"{solr_sync.timestamp.isoformat(' ').split('.')[0]}\n" + log
    return log, i
def test_index_bad_data(self):
    """Indexing a malformed doc raises SolrError, leaves the item queued,
    and releases the queue lock."""
    index = IndexQ(test_config['indexqbase'], 'testq')
    solr = SolrClient(test_config['SOLR_SERVER'],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    if index._is_locked():
        index._unlock()
    self.assertEqual(index.get_all_as_list(), [])
    solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
    todo_file = index.add({'date': 'asd'}, finalize=True)
    self.assertEqual(index.get_all_as_list()[0], todo_file)
    with self.assertRaises(SolrError):
        index.index(solr, test_config['SOLR_COLLECTION'])
    # The bad item must remain queued and the lock must be released.
    self.assertEqual(index.get_all_as_list()[0], todo_file)
    self.assertFalse(index._is_locked())
def read_all():
    """Print every document currently stored in the 'test' collection."""
    client = SolrClient('http://localhost:8983/solr')
    res = client.query('test', {'q': '*:*'})
    payload = json.loads(res.get_json())
    for doc in payload['response']['docs']:
        print(doc)
def process_one(record_list, coll):
    """Parse each raw record into an HtmlEntity and add it to core *coll*,
    then flush and log the last record's id."""
    url_solr = 'http://xxx.xxx.xxx.xx:xxxx/solr/%s/'% coll
    client = SolrClient(url_solr)
    for record in record_list:
        parser = ParserHtml()
        entity = parser.parseHtml(record, HtmlEntity())
        client.addDoc(entity)
    # Flush all queued docs to Solr in one go.
    client.addDocs()
    last_id = record_list[-1]['_id']
    _LOGGER_.info(last_id + ' is Done!')
    del record_list
def build(self):
    """Export all documents from Postgres into a temporary JSON file and
    import that file into the 'infoportal' Solr collection."""
    try:
        CLIENT = SolrClient(SEARCH_ENGINE.get('URL'))
    except SolrError:
        print(
            "Solr не запущен, попробуйте выполнить команду: solr start -e cloud"
        )
        # FIX: the original fell through and later hit a NameError on
        # CLIENT; bail out the same way as the missing-DB-connection case.
        return {}
    # http://lucene.apache.org/solr/guide/8_2/requestdispatcher-in-solrconfig.html
    # script = """
    # curl - H
    # 'Content-type:application/json' - d
    # '{"set-property":
    # {"requestDispatcher.requestParsers.enableRemoteStreaming": true}, "set-property":{"requestDispatcher.requestParsers.enableStreamBody": true}}'
    # http://localhost:8983/api/collections/infoportal/config
    # """
    # rc = call(script, shell=True)
    # print(f"Выполнение скрипта: {rc}")
    print("СТАТУС КЛАСТЕРА")
    print(f'CLIENT.collections={CLIENT.collections.clusterstatus()}')
    print('ЭКСПОРТ ДОКУМЕНТОВ postres')
    conn = create_connection()
    if conn is None:
        return {}
    try:
        with conn:
            cur = conn.cursor()
            cur.execute("SELECT * FROM documents_documents")
            self.rows = cur.fetchall()
            # "wb" truncates the file, so the separate truncate-open from
            # the original is unnecessary.
            with open(TMP_FILENAME, "wb") as f:
                f.write(bytes("[", encoding='utf-8'))
                # FIX: the original wrote a comma after *every* document,
                # leaving a trailing comma before ']' (invalid JSON).
                # Write separators between documents instead.
                for idx, row in enumerate(self.rows):
                    if idx:
                        f.write(bytes(",", encoding='utf-8'))
                    document = SolrDocument(row)
                    f.write(bytes(document.toJSON(), encoding='utf-8'))
                f.write(bytes("]", encoding='utf-8'))
    except Exception as ex:
        print(f"{ex}")
    print('ИМПОРТ ДОКУМЕНТОВ В Solr')
    CLIENT.local_index('infoportal', TMP_FILENAME)
def test_solr_to_solr_resume_checkonly(self):
    """A resume(check=True) run must not copy any documents."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config['SOLR_SERVER'][0],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    reindexer = Reindexer(source=solr,
                          source_coll='source_coll',
                          dest=solr,
                          dest_coll='dest_coll',
                          date_field='date')
    everything = {'q': '*:*', 'rows': 10000000}
    # Precondition: source fully populated, destination empty.
    self.assertEqual(len(solr.query(self.colls[0], everything).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], everything).docs), 0)
    reindexer.resume(check=True)
    # The check-only resume must leave both collections untouched.
    self.assertEqual(len(solr.query(self.colls[0], everything).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], everything).docs), 0)
def setUpClass(self):
    """Create the shared client, random docs, and best-effort (re)create
    the test collection's schema fields."""
    self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                           devel=True,
                           auth=test_config['SOLR_CREDENTIALS'])
    self.rand_docs = RandomTestData()
    self.docs = self.rand_docs.get_docs(50)
    # Schema setup is deliberately best-effort (fields may already be
    # present/absent), but FIX: use `except Exception` instead of a bare
    # `except` so KeyboardInterrupt/SystemExit still propagate.
    for field in test_config['collections']['copy_fields']:
        try:
            self.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'],
                                               field)
        except Exception:
            pass
    for field in test_config['collections']['fields']:
        try:
            self.solr.schema.create_field(test_config['SOLR_COLLECTION'],
                                          field)
        except Exception:
            pass
def test_solr_to_solr_reindexer_per_shard(self):
    """Per-shard reindexing should copy every document to the destination."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config["SOLR_SERVER"][0],
                      auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(source=solr,
                          source_coll="source_coll",
                          dest=solr,
                          dest_coll="dest_coll",
                          per_shard=True,
                          date_field="date")
    everything = {"q": "*:*", "rows": 10000000}
    # Precondition: all data lives in the source collection only.
    self.assertEqual(len(solr.query(self.colls[0], everything).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], everything).docs), 0)
    reindexer.reindex()
    # sloppy check over here, will improve later: compare counts only
    self.assertEqual(len(solr.query(self.colls[0], everything).docs),
                     len(solr.query(self.colls[1], everything).docs))
def test_solr_to_solr_with_date(self):
    """Reindex with a date field; verify the sort param and doc parity."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(
        source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
    )
    reindexer.reindex()
    # The sort clause may land in either of two logged actions.
    try:
        self.assertTrue(solr.transport._action_log[1]["params"]["params"]["sort"] == "index_date asc, id desc")
    except KeyError:
        self.assertTrue(solr.transport._action_log[2]["params"]["params"]["sort"] == "index_date asc, id desc")
    # BUG FIX: list.sort() returns None, so the original asserted
    # None == None and always passed. Compare sorted() copies instead.
    self.assertEqual(
        sorted(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
        sorted(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs, key=lambda x: x["id"]),
    )
def test_solr_to_solr_reindex_and_resume_reverse(self):
    '''
    Only reindexes half of the collection on the first time. Then goes
    back and does a resume to make sure it works.
    '''
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config['SOLR_SERVER'][0],
                      auth=test_config['SOLR_CREDENTIALS'])
    reindexer = Reindexer(source=solr,
                          source_coll='source_coll',
                          dest=solr,
                          dest_coll='dest_coll',
                          date_field='date')
    # Precondition: make sure only the source collection has data.
    self.assertEqual(
        len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 50000)
    self.assertEqual(
        len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs), 0)
    # This computes an approximate midpoint date of the indexed range.
    midpoint = (datetime.datetime.now() - datetime.timedelta(days=(
        (self._end_date - self._start_date).days / 2)))
    # Reindex approximately half of the data by restricting the fq to
    # documents newer than the midpoint.
    reindexer.reindex(
        fq=['date:[{} TO *]'.format(midpoint.isoformat() + 'Z')])
    sleep(10)  # give Solr time to commit before counting
    # Make sure at least 20% (but not all) of the data was copied.
    dest_count = len(
        solr.query(self.colls[1], {
            'q': '*:*',
            'rows': 10000000
        }).docs)
    s_count = len(
        solr.query(self.colls[0], {
            'q': '*:*',
            'rows': 10000000
        }).docs)
    self.assertTrue(s_count > dest_count > s_count * .20)
    reindexer.resume()
    sleep(10)
    # After resuming, document counts must match between collections.
    self.assertEqual(
        len(
            solr.query(self.colls[0], {
                'q': '*:*',
                'rows': 10000000
            }).docs),
        len(
            solr.query(self.colls[1], {
                'q': '*:*',
                'rows': 10000000
            }).docs))
def get_latest_update(url, collection, query):
    """Return the newest system_mtime in *collection* as a datetime.

    Returns None when the query yields no single result.
    """
    solr = SolrClient(url)
    # Ask for just the single most recently modified document.
    res = solr.query(collection, {
        'q': query,
        'rows': 1,
        'sort': 'system_mtime desc'
    })
    pp.pprint(res.get_results_count())
    dttm = None
    if res.get_results_count() == 1:
        pp.pprint(res.docs[0]['system_mtime'])
        raw_date = res.docs[0]['system_mtime']
        dttm = datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%SZ")
        pp.pprint(dttm)
    return dttm
def get_tweets_by_time(timespan, solr:SolrClient, core_name="tweets"): rows=100 #100 results per page stop=False start=0 facet_counts=None q='created_at:' + timespan+' AND ml_tag:0' while not stop: res = solr.query(core_name, { 'q':q, #remember we only show tweets tagged as hate (0) 'facet.field':'entities_hashtag', #count results per facet (NOTE, not every tweet will have a hashtag, but this is ok 'facet':"on", #switch on facet search 'facet.mincount':"1", #show facets that have at least 1 result 'rows':rows, 'fl':'*', #return all fields from the index (when available 'start':start, #start from 'sort':'tweet_risk desc'}) #sort by risk_score descending start+=rows #resetting start will turn to next page. for specific page number, you need to work out the 'start' by pagenum*rows print("total number found={}".format(res.num_found)) if start>res.num_found: stop=True #assign facet results to another var. facet counts is for the whole dataset, not just this page if facet_counts is None: facet_counts=res.data['facet_counts']['facet_fields']['entities_hashtag'] #now go through every page, every result for d in res.docs: #res.docs only contain documents on the CURRENT page print("https://twitter.com/"+d['user_screen_name']+"/"+d['id']) if 'coordinates' in d.keys(): print(d['coordinates']) #finally print facet counts print(facet_counts)
def update(solr: SolrClient, tweet_core_name, tag_core_name, timespan, rows,
           feat_vectorizer, ml_model, selected_features,
           hate_indicative_features, scaling_option, sysout, logger):
    """Page through all tweets created within *timespan* and re-tag each
    batch with the pretrained ML model."""
    start = 0
    done = False
    while not done:
        logger.warn("Processing from {} for a batch of {}".format(start, rows))
        print("Processing from {} for a batch of {}".format(start, rows))
        res = solr.query(
            tweet_core_name, {
                'q': 'created_at:' + timespan,
                'rows': rows,
                'fl': '*',
                'start': start,
                'sort': 'id asc'
            })
        start += rows
        # Stop once the window has moved past the last result.
        if start > res.num_found:
            done = True
        # Apply the pretrained ML model to tag this page and update Solr.
        update_ml_tag(solr, tweet_core_name, tag_core_name, res.docs,
                      feat_vectorizer, ml_model, selected_features,
                      hate_indicative_features, scaling_option, sysout,
                      logger)
def setUpClass(self):
    """Create a client plus a uniquely named collection for this test run."""
    self.solr = SolrClient(test_config['SOLR_SERVER'][0],
                           devel=True,
                           auth=test_config['SOLR_CREDENTIALS'])
    self.rand_docs = RandomTestData()
    self.docs = self.rand_docs.get_docs(50)
    # Random suffix keeps reruns from colliding with stale collections.
    self.coll = test_config['SOLR_COLLECTION'] + str(random.random() * 100)
    self.temp_dir = test_config['temp_data']
    res, con_info = self.solr.collections.api(
        'create', {
            'name': self.coll,
            'numShards': 1,
            'replicationFactor': 1,
            'collection.configName': 'basic_configs'
        })
    self.zk = self.solr.get_zk()
def do_it():
    """Top-level driver: load configuration and saved state, connect to Solr
    and S3, then process every matching ArchivesSpace repository."""
    global omd
    global s3
    global ss  # the pickle-backed saved state
    global tmpdir
    global ctr
    global solr
    global repo_ctr
    omd = get_details()
    main_log.debug("temp: {} url: {} s3 yaml:{} ".format(
        omd.get('tmpdir'), omd.get('pdfurl'), omd.get('s3_yaml')))
    instance = omd.get('instance')
    main_log.info("Instance: " + instance)
    main_log.info("retrieving saved state, if any, at {}".format(
        omd.get("savedstate")))
    ss = savestate(omd.get("savedstate"))
    # NOTE(review): `all` is not defined in this block; unless a module-level
    # flag shadows it, this is the builtin function, which is always truthy,
    # so the saved state would be cleared on every run — confirm intent.
    if all:
        ss.clear()
    solr = SolrClient(omd.get('solr_url'))
    tmpdir = omd.get('tmpdir')
    try:
        s3 = S3(configpath=omd.get('s3_yaml'))
    except Exception as e:
        # Re-raised as-is; S3 connectivity is required to proceed.
        raise e
    aspace = ASpace()
    for repo in aspace.repositories:
        # NOTE(review): `repo_code` is presumably a module-level filter —
        # process all repositories when unset, otherwise only the match.
        if all or repo_code is None or repo.repo_code == repo_code:
            process_repository(repo)
            repo_ctr += 1
    ss.save()  # last time for good luck!
def test_solr_to_solr_resume_checkonly(self):
    """A resume(check=True) run must leave both collections unchanged."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config["SOLR_SERVER"][0],
                      devel=True,
                      auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(source=solr,
                          source_coll="source_coll",
                          dest=solr,
                          dest_coll="dest_coll",
                          date_field="date")
    everything = {"q": "*:*", "rows": 10000000}
    # Precondition: source full, destination empty.
    self.assertEqual(len(solr.query(self.colls[0], everything).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], everything).docs), 0)
    reindexer.resume(check=True)
    # Nothing may have been copied by the check-only run.
    self.assertEqual(len(solr.query(self.colls[0], everything).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], everything).docs), 0)
def get_tags_by_pmi(target_tag, solr:SolrClient, core_name="tags"):
    """Print the tags co-occurring with *target_tag*, ordered by PMI desc.

    Pair documents have type=1 (single tags are type=0); each pair's text
    holds both tags, so the one that is not the target is reported.
    """
    #http://localhost:8983/solr/tags/select?indent=on&q=tag_text:banmuslims%20AND%20type:1&wt=json
    page_size = 100  # results per page
    offset = 0
    q = 'tag_text:' + target_tag+' AND type:1'
    while True:
        res = solr.query(core_name, {
            'q': q,
            'rows': page_size,
            'fl': '*',  # return all available fields
            'start': offset,
            'sort': 'pmi desc'})  # strongest associations first
        # Advancing the offset by one page moves to the next page of results.
        offset += page_size
        print("total number found={}".format(res.num_found))
        for d in res.docs:  # docs on the CURRENT page only
            pair = d['tag_text'].split(" ")
            # The pair contains the target itself; report the other tag.
            other = pair[1] if pair[0] == target_tag else pair[0]
            print(other + ", pmi=" + d['pmi'])
        if offset > res.num_found:
            break
def test_get_date_range_query(self):
    '''
    Checks the date_range_query generation function. Since it's pretty
    simple, running all the tests as one
    '''
    solr = SolrClient(test_config['SOLR_SERVER'][0],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    reindexer = Reindexer(source=solr,
                          source_coll='source_coll',
                          dest=solr,
                          dest_coll='dest_coll',
                          date_field='index_date')
    # Default: facet over the reindexer's own date field with +1DAY gaps.
    self.assertEqual(
        reindexer._get_date_range_query('2015-11-10', '2015-12-11'),
        {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
         'facet.range': 'index_date', 'facet.range.start': '2015-11-10',
         'q': '*:*', 'facet.range.include': 'all',
         'facet.range.gap': '+1DAY'}
    )
    # Explicit date_field overrides the instance's configured field.
    self.assertEqual(
        reindexer._get_date_range_query('2015-11-10', '2015-12-11',
                                        date_field='date123'),
        {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
         'facet.range': 'date123', 'facet.range.start': '2015-11-10',
         'q': '*:*', 'facet.range.include': 'all',
         'facet.range.gap': '+1DAY'}
    )
    # timespan changes the unit of the facet range gap.
    self.assertEqual(
        reindexer._get_date_range_query('2015-11-10', '2015-12-11',
                                        date_field='date123',
                                        timespan='MONTH'),
        {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
         'facet.range': 'date123', 'facet.range.start': '2015-11-10',
         'q': '*:*', 'facet.range.include': 'all',
         'facet.range.gap': '+1MONTH'}
    )
    # timespan alone still uses the instance's default date field.
    self.assertEqual(
        reindexer._get_date_range_query('2015-11-10', '2015-12-11',
                                        timespan='MONTH'),
        {'rows': 0, 'facet.range.end': '2015-12-11', 'facet': 'true',
         'facet.range': 'index_date', 'facet.range.start': '2015-11-10',
         'q': '*:*', 'facet.range.include': 'all',
         'facet.range.gap': '+1MONTH'}
    )
def test_index_dynamic_collections_basic_1(self):
    """Docs added to one IndexQ file are routed per-document to the
    collection named by the router callable (here: the doc's 'type')."""
    index = IndexQ(test_config['indexqbase'], 'testq')
    solr = SolrClient(test_config['SOLR_SERVER'],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    if index._is_locked():
        index._unlock()
    self.assertEqual(index.get_all_as_list(), [])
    # Set up mock for indexing: it records {collection: json_docs}
    # instead of actually sending anything to Solr.
    temp = {}

    def mock(temp, coll, docs):
        temp[coll] = docs
        return True

    todo_file = index.add([
        {
            'type': '1',
            'data': '1'
        },
        {
            'type': '1',
            'data': '2'
        },
        {
            'type': '1',
            'data': '3'
        },
        {
            'type': '2',
            'data': '4'
        },
        {
            'type': '3',
            'data': '5'
        },
    ], finalize=True)
    # _wrap_dynamic routes each doc in todo_file to the collection chosen
    # by the second (router) callable.
    runner_wrap = index._wrap_dynamic(partial(mock, temp),
                                      lambda x: x['type'], todo_file)
    self.assertTrue(runner_wrap)
    # Each 'type' bucket must receive exactly its own documents.
    self.assertEqual(json.loads(temp['3']), [{"data": "5", "type": "3"}])
    self.assertEqual(json.loads(temp['2']), [{'type': '2', 'data': '4'}])
    self.assertEqual(
        sorted(json.loads(temp['1']), key=lambda x: x['data']),
        sorted([{
            'type': '1',
            'data': '1'
        }, {
            'type': '1',
            'data': '2'
        }, {
            'type': '1',
            'data': '3'
        }], key=lambda x: x['data']))
    self.assertFalse(
        index.get_all_as_list())  # Make sure item is completed
def get_solr_connection():
    """Build a Solr client from project settings; returns (client, collection)."""
    # TODO: error handling on config not present?
    config = settings.SOLR_CONNECTIONS['default']
    # NOTE: may want to extend SolrClient to set a default collection
    return SolrClient(config['URL']), config['COLLECTION']
def test_index(self):
    """Every queued doc should be searchable after IndexQ.index()."""
    index = IndexQ(test_config['indexqbase'], 'testq')
    solr = SolrClient(test_config['SOLR_SERVER'],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
    # Queue every doc as its own finalized file, then index the queue.
    files = [index.add(doc, finalize=True) for doc in self.docs]
    index.index(solr, test_config['SOLR_COLLECTION'])
    solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
    for doc in self.docs:
        res = solr.query(test_config['SOLR_COLLECTION'],
                         {'q': 'id:{}'.format(doc['id'])})
        self.assertTrue(res.get_results_count() == 1)
    solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
def test_complete_compress_basic_re_indexing(self):
    """Docs indexed from a compressed IndexQ can be re-indexed from done."""
    log = logging.getLogger()
    solr = SolrClient(test_config['SOLR_SERVER'],
                      devel=True,
                      auth=test_config['SOLR_CREDENTIALS'])
    index = IndexQ(test_config['indexqbase'], 'testq',
                   size=1, log=log, compress=True)
    solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
    for item in self.docs[1:10]:
        index.add(item, finalize=True)
    index.index(solr, test_config['SOLR_COLLECTION'])
    # Items are now in the done directory. Move them back to todo and
    # re-index to prove the compressed JSON was encoded properly.
    files = index.get_all_as_list('_done_dir')
    for f in index.get_all_as_list('_done_dir'):
        shutil.move(f, index._todo_dir)
    index.index(solr, test_config['SOLR_COLLECTION'])
    self.assertEqual(files, index.get_all_as_list('_done_dir'))
def test_solr_to_solr_resume_basic(self):
    """A resume() from scratch should copy every doc to the destination."""
    self._index_docs(50000, self.colls[0])
    solr = SolrClient(test_config["SOLR_SERVER"][0],
                      auth=test_config["SOLR_CREDENTIALS"])
    reindexer = Reindexer(source=solr,
                          source_coll="source_coll",
                          dest=solr,
                          dest_coll="dest_coll",
                          date_field="date")
    everything = {"q": "*:*", "rows": 10000000}
    # Precondition: only the source collection has data.
    self.assertEqual(len(solr.query(self.colls[0], everything).docs), 50000)
    self.assertEqual(len(solr.query(self.colls[1], everything).docs), 0)
    reindexer.resume()
    sleep(10)
    # Counts must match once the resume completes.
    self.assertEqual(len(solr.query(self.colls[0], everything).docs),
                     len(solr.query(self.colls[1], everything).docs))
def computeScores1(type, query, output_file):
    """Fetch up to 1000 docs from the index named by query['index'] and
    write pairwise cosine-similarity rows to *output_file* as CSV.

    NOTE(review): this block appears to target Python 2 — it opens the CSV
    in binary mode for csv.writer and calls str.decode(), both of which
    fail on Python 3. Left unchanged; port deliberately if Python 3 is
    required.
    """
    solr = SolrClient('http://localhost:8983/solr')
    res = solr.query(query['index'], {
        'q': '*:*',
        'wt': 'json',
        'indent': True,
        'rows': 1000,
    })
    docs = res.data['response']['docs']
    with open(output_file, "wb") as outF:
        a = csv.writer(outF, delimiter=',', quotechar='"',
                       quoting=csv.QUOTE_ALL)
        a.writerow(["type", "x-coordinate", "y-coordinate",
                    "Similarity_score"])
        # Normalise every field value (first element) to an ASCII string,
        # skipping the id/_version_ bookkeeping fields.
        for doc in docs:
            for key in doc:
                if key in ["id", "_version_"]:
                    continue
                try:
                    doc[key] = doc[key][0].encode("ascii", "ignore")
                except:
                    doc[key] = str(doc[key][0]).decode(
                        "unicode_escape").encode("ascii", "ignore")
        # Compare every unordered pair of documents.
        doc_tuples = itertools.combinations(docs, 2)
        for raw1, raw2 in doc_tuples:
            doc1 = raw1.copy()
            doc2 = raw2.copy()
            # Columns 2/3 are the two document names (field casing varies).
            if "Name" in doc1:
                row_cosine_distance = [type, doc1["Name"], doc2["Name"]]
            else:
                row_cosine_distance = [type, doc1["name"], doc2["name"]]
            # NOTE(review): v1/v2 are labelled with row entries [0] (the
            # type) and [1] (doc1's name) — [1] and [2] look more likely
            # intended; confirm against Vector's constructor.
            v1 = Vector(row_cosine_distance[0], doc1)
            v2 = Vector(row_cosine_distance[1], doc2)
            row_cosine_distance.append(v1.cosTheta(v2))
            a.writerow(row_cosine_distance)
def get(self):
    """Handle GET: look up the 'term' argument in the 'stocks' core and write
    a JSON array of name/symbol/sector/open for each matching document."""
    search_term = self.get_argument('term')
    client = SolrClient('http://localhost:8983/solr')
    raw = client.query('stocks', {
        'q': search_term
    })
    payload = json.loads(raw.get_json())
    summary = [
        {
            'name': doc['name'],
            'symbol': doc['symbol'],
            'sector': doc['sector'],
            'open': doc['open'],
        }
        for doc in payload['response']['docs']
    ]
    self.write(json.dumps(summary))
def setUpClass(self):
    """Create a throwaway collection (randomized name so runs don't collide)
    and grab a Zookeeper handle for the tests."""
    self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True,
                           auth=test_config['SOLR_CREDENTIALS'])
    self.rand_docs = RandomTestData()
    self.docs = self.rand_docs.get_docs(50)
    self.coll = test_config['SOLR_COLLECTION'] + str(random.random() * 100)
    self.temp_dir = test_config['temp_data']
    response, connection_info = self.solr.collections.api('create', {
        'name': self.coll,
        'numShards': 1,
        'replicationFactor': 1,
        'collection.configName': 'basic_configs',
    })
    # Give Solr a moment to finish collection creation before touching ZK.
    sleep(2)
    self.zk = self.solr.get_zk()
class ZKTest(unittest.TestCase):
    #High level zk tests
    # Exercises the Zookeeper helper exposed via SolrClient.get_zk() against a
    # live SolrCloud: config download/upload/copy. Requires a running cluster.

    @classmethod
    def setUpClass(self):
        # Build a uniquely-named collection so parallel runs don't collide.
        self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        self.coll = test_config['SOLR_COLLECTION']+str(random.random()*100)
        self.temp_dir = test_config['temp_data']
        res, con_info = self.solr.collections.api('create', {
            'name': self.coll,
            'numShards': 1,
            'replicationFactor': 1,
            'collection.configName': 'basic_configs'
        })
        # Let the collection finish coming up before tests use ZK.
        sleep(2)
        self.zk = self.solr.get_zk()

    @classmethod
    def tearDownClass(self):
        # Drop the temporary collection created in setUpClass.
        res, con_info = self.solr.collections.api('delete', {'name':self.coll})

    def test_zk_get_collection_config_bad_collection(self):
        # Downloading configs for a nonexistent collection must raise.
        with self.assertRaises(ZookeeperError):
            self.zk.download_collection_configs('asdasdasd', self.temp_dir + os.sep + self.coll)

    def test_zk_copy_config(self):
        # Copy an existing config set and verify the new ZK node exists.
        a = self.zk.copy_config('basic_configs', 'new_config')
        self.assertTrue(self.zk.kz.get('/configs/new_config'))
        self.zk.kz.delete('/configs/new_config', recursive=True)

    def test_download_collection_configs(self):
        # really bad test, need to rework later
        a = self.zk.download_collection_configs('basic_configs', self.temp_dir+'/configs')
        self.assertTrue(os.path.isdir(self.temp_dir+'/configs'))

    def test_upload_collection_configs(self):
        # Relies on test_download_collection_configs having populated the dir.
        a = self.zk.upload_collection_configs('test1', self.temp_dir+'/configs/basic_configs')
        self.zk.kz.delete('/configs/test1', recursive=True)
def setUpClass(self):
    """Create the shared client, reset the test collection's schema
    (best-effort) and index 50 random documents for the tests to query.

    Fix: the bare ``except:`` clauses also swallowed ``SystemExit`` and
    ``KeyboardInterrupt``; narrowed to ``except Exception`` while keeping the
    deliberate best-effort behavior (fields may already exist / be missing).
    """
    self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True,
                           auth=test_config['SOLR_CREDENTIALS'])
    self.rand_docs = RandomTestData()
    self.docs = self.rand_docs.get_docs(50)
    self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'], '*')
    for field in test_config['collections']['copy_fields']:
        try:
            self.solr.collections.delete_copy_field(test_config['SOLR_COLLECTION'], field)
        except Exception:
            pass  # copy field may not exist yet — ignore
    for field in test_config['collections']['fields']:
        try:
            self.solr.collections.create_field(test_config['SOLR_COLLECTION'], field)
        except Exception:
            pass  # field may already exist from a previous run — ignore
    #Index Some data
    self.solr.index_json(test_config['SOLR_COLLECTION'], json.dumps(self.docs))
    self.solr.commit(test_config['SOLR_COLLECTION'], openSearcher=True)
Note: We do use album, album_art, uri, title, artist We don't use duration or position and we don't use metadata, which doesn't fully match the metadata needed to play songs, which we get through wire shark The upload format for SolrClient (python3 solr client program) is jsonifying a list of dictionaries: [{'id':'After_The_Gold_Rush_Birds', 'artist':'Neil Young', 'title':'Birds', 'album':'After the Gold Rush', 'uri':'x-sonos-http:amz%3atr%3a44ce93d2-4105-416a-a905-51fe0f38ed9a.mp4?sid=26&flags=8224&sn=2'...}{... ''' from SolrClient import SolrClient import sys import json import requests from config import ec_uri solr = SolrClient(ec_uri+':8983/solr') collection = 'sonos_companion' file_name = input("What file do you want to use for uploading track information to solr?") with open(file_name,'r') as f: z = f.read() full_items = json.loads(z) documents = [] for item in full_items: document = {} # We create a unique id but concatenating the album and the song title id_ = item['album'] + ' ' + item['title'] id_ = id_.replace(' ', '_') document['id'] = id_
#!/usr/bin/env python from __future__ import division import json import os from SolrClient import SolrClient import sys from tika import detector solr = SolrClient('http://localhost:8983/solr') walk_n = sum(len(files) for root, dirs, files in os.walk(sys.argv[1])) walk_i = 0 ratios = {} for root, dirs, files in os.walk(sys.argv[1]): for file in files: path = root + '/' + file file_size = os.stat(path).st_size if file_size == 0: continue mime = detector.from_file(path) sum, n = ratios.get(mime, (0, 0)) ratios[mime] = sum + len(json.dumps(solr.query('collection1', {'q': 'id:' + file}).data['response']['docs'])) / file_size, n + 1 walk_i += 1 print str(walk_i * 100 // walk_n) + '%\r', with open('size-diversity.json', 'w') as f: json.dump({mime: sum / n for mime, (sum, n) in ratios.iteritems()}, f)
def test_access_without_auth(self):
    """A client built without credentials must fail to query a secured Solr."""
    if not test_config['SOLR_CREDENTIALS'][0]:
        # No credentials configured for this run — nothing to verify.
        return
    unauthenticated = SolrClient(test_config['SOLR_SERVER'], devel=True)
    with self.assertRaises(ConnectionError) as cm:
        unauthenticated.query('SolrClient_unittest', {'q': 'not_gonna_happen'})
def __init__(self):
    # One client per Solr endpoint: the remote (authenticated) server and the
    # local index. NOTE(review): the "******" literals look like redacted
    # credentials — confirm they are injected from config in the real source.
    self._logger = logging.getLogger(__name__)
    self.remote_solr_client = SolrClient(remote_solr_server, username="******", password="******")
    self.local_solr_client = SolrClient(local_solr_server)
class ClientTestIndexing(unittest.TestCase):
    #High Level Client Tests
    # Integration tests for SolrClient indexing paths (index_json, stream_file,
    # local_index, paging_query) against a live Solr collection.

    @classmethod
    def setUpClass(self):
        # Shared client + random document generator; schema setup is
        # best-effort since fields may already exist from earlier runs.
        self.solr = SolrClient(test_config['SOLR_SERVER'][0], devel=True, auth=test_config['SOLR_CREDENTIALS'])
        self.rand_docs = RandomTestData()
        self.docs = self.rand_docs.get_docs(50)
        for field in test_config['collections']['copy_fields']:
            try:
                self.solr.schema.delete_copy_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass
        for field in test_config['collections']['fields']:
            try:
                self.solr.schema.create_field(test_config['SOLR_COLLECTION'],field)
            except:
                pass

    def setUp(self):
        # Every test starts from an empty, committed collection.
        self.delete_docs()
        self.commit()

    def delete_docs(self):
        # Remove all docs from the test collection and make it visible.
        self.solr.delete_doc_by_id(test_config['SOLR_COLLECTION'],'*')
        self.commit()

    def commit(self):
        # Hard commit with openSearcher, then wait for the searcher to warm.
        self.solr.commit(test_config['SOLR_COLLECTION'],openSearcher=True)
        sleep(5)

    @unittest.skip("Skipping for now")
    def test_access_without_auth(self):
        if not test_config['SOLR_CREDENTIALS'][0]:
            return
        solr = SolrClient(test_config['SOLR_SERVER'],devel=True)
        with self.assertRaises(ConnectionError) as cm:
            solr.query('SolrClient_unittest',{'q':'not_gonna_happen'})

    def test_indexing_json(self):
        # Index via index_json and verify each doc is individually queryable.
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        self.delete_docs()
        self.commit()

    def test_indexing_conn_log(self):
        # Same as test_indexing_json but also dumps the transport action log.
        self.docs = self.rand_docs.get_docs(53)
        self.solr.index_json(test_config['SOLR_COLLECTION'],json.dumps(self.docs))
        self.commit()
        sleep(5)
        for doc in self.docs:
            logging.debug("Checking {}".format(doc['id']))
            self.assertEqual(self.solr.query(test_config['SOLR_COLLECTION'],{'q':'id:{}'.format(doc['id'])}).get_num_found(),1)
        logging.info(self.solr.transport._action_log)
        self.delete_docs()
        self.commit()

    # NOTE(review): two methods below share the name test_index_json_file; the
    # second (skipped) definition shadows this one, so this stream_file variant
    # never runs. Probably one of them should be renamed.
    def test_index_json_file(self):
        # Index by streaming a JSON file from disk.
        self.docs = self.rand_docs.get_docs(55)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_stream_file_gzip_file(self):
        # Index by streaming a gzip-compressed JSON file.
        self.docs = self.rand_docs.get_docs(60)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    @unittest.skip("Don't test remote indexing in travis")
    def test_index_json_file(self):
        # local_index variant — skipped in CI; shadows the earlier method of
        # the same name (see NOTE above).
        self.docs = self.rand_docs.get_docs(61)
        with open('temp_file.json','w') as f:
            json.dump(self.docs,f)
        r = self.solr.local_index(test_config['SOLR_COLLECTION'],'temp_file.json')
        self.commit()
        r = self.solr.query(test_config['SOLR_COLLECTION'],{'q':'*:*'})
        self.assertEqual(r.get_num_found(),len(self.docs))
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_rows(self):
        # 1000 docs paged 50 at a time: every page full, all ids recovered.
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}, rows=50):
            self.assertTrue(len(res.docs) == 50)
            docs.extend(res.docs)
            queries +=1
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
        )
        self.assertTrue(1000/50 == queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query(self):
        # Default rows: everything comes back in a single page/query.
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'],{'q':'*:*'}):
            self.assertTrue(len(res.docs) == 1000)
            docs.extend(res.docs)
            queries +=1
        self.assertTrue(queries == 1)
        self.assertEqual(
            [x['id'] for x in sorted(docs, key= lambda x: x['id'])],
            [x['id'] for x in sorted(self.docs, key= lambda x: x['id'])]
        )
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass

    def test_paging_query_with_max(self):
        # max_start caps pagination: 50-doc pages up to start 502 -> 11 queries.
        self.docs = self.rand_docs.get_docs(1000)
        with gzip.open('temp_file.json.gz','wb') as f:
            f.write(json.dumps(self.docs).encode('utf-8'))
        r = self.solr.stream_file(test_config['SOLR_COLLECTION'],'temp_file.json.gz')
        self.commit()
        queries = 0
        docs = []
        for res in self.solr.paging_query(test_config['SOLR_COLLECTION'], {'q':'*:*'}, rows = 50, max_start = 502):
            self.assertTrue(len(res.docs) == 50)
            queries +=1
            docs.extend(res.docs)
        ids = [x['id'] for x in docs]
        for item in docs:
            self.assertTrue(item['id'] in ids)
        self.assertEqual(11, queries)
        self.delete_docs()
        self.commit()
        try:
            os.remove('temp_file.json.gz')
            os.remove('temp_file.json')
        except:
            pass
def setUpClass(self):
    """Shared fixtures for the reindexer tests: one Solr client, the random
    document generator, and the source/destination collection names."""
    logging.debug("Starting to run Reindexer Tests")
    self.rand_docs = RandomTestData()
    self.solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True,
                           auth=test_config["SOLR_CREDENTIALS"])
    self.colls = [
        test_config["SOLR_REINDEXER_COLLECTION_S"],
        test_config["SOLR_REINDEXER_COLLECTION_D"],
    ]
class Integrator(object):
    """ Provide integration with KNow Knowledge Portal as scheduled batch job """

    def __init__(self):
        # Two clients: the remote portal Solr (authenticated) and the local
        # index that receives the extracted attachment documents.
        # NOTE(review): "******" looks like redacted credentials — confirm.
        self._logger = logging.getLogger(__name__)
        self.remote_solr_client = SolrClient(remote_solr_server, username="******", password="******")
        self.local_solr_client = SolrClient(local_solr_server)

    def batch_processing_product_issue_attachments(self):
        """
        query remote Solr server to retrieve all the attachment ids
        and index every attachment batch-by-batch into the local Solr.
        :return:
        """
        self._logger.info("starting to retrieving attachement urls and batch indexing textual attachments ...")
        # solrClient=SolrClient(remote_solr_server)
        batch_num = 10
        # First query only to learn the total count; then page through it.
        response = self.remote_solr_client.load_documents_by_custom_query('attachment_ids_txt:*', start=0, rows=batch_num)
        total_num = response['numFound']
        self._logger.info("total number of document with attachments: [%s]", total_num)
        # if total_num > batch_num :
        for start_index in range(0, total_num, batch_num):
            response = self.remote_solr_client.load_documents_by_custom_query('attachment_ids_txt:*', start=start_index, rows=batch_num)
            docs = response['docs']
            try:
                self.batch_indexing_documents(docs)
                self._logger.info("batch indexing documents. progress [%s]", start_index)
            except IntegrationException as error:
                # Abort the whole batch job on the first failed batch.
                self._logger.error("error batch processing while indexing!")
                raise
        self._logger.info("complete batch processing of documents. Documents has been indexed completely.")

    def batch_indexing_documents(self, docs):
        """
        batch process a number of attachments associated with product issue
        :param docs: dictionary, Solr document objects
        :raises IntegrationException: when the local Solr update fails
        :return:
        """
        self._logger.info("batch processing and indexing [%s] product issues ..." % len(docs))
        # NOTE(review): docs_to_index is never appended to or used — dead code?
        docs_to_index = []
        for doc in docs:
            prod_issue_doc_id = doc['id']
            attachment_ids = doc['attachment_ids_txt'] if 'attachment_ids_txt' in doc else ''
            # domain specific metadata
            prod_issue = doc['product_issue_details#productIssue_s'] if 'product_issue_details#productIssue_s' in doc else ''
            product = doc['product_issue_details#product_s'] if 'product_issue_details#product_s' in doc else ''
            prod_issue_location = doc['product_issue_details#location_s'] if 'product_issue_details#location_s' in doc else ''
            prod_issue_owner = doc['product_issue_details#owner_s'] if 'product_issue_details#owner_s' in doc else ''
            location_type = doc['location#type_s'] if 'location#type_s' in doc else ''
            location_local_name = doc['location#localName_s'] if 'location#localName_s' in doc else ''
            # Metadata forwarded as Tika "literal." fields on the local index.
            metadata_dict = {"literal.product_issue_details#productIssue_s": prod_issue,
                             "literal.product_issue_details#product_s": product,
                             "literal.product_issue_details#location_s": prod_issue_location,
                             "literal.location#type_s": location_type,
                             "literal.product_issue_details#owner_s": prod_issue_owner,
                             "literal.location#localName_s": location_local_name,
                             "literal.prod_issue_doc_id_s": prod_issue_doc_id}
            for attachment_id in attachment_ids:
                attachment_url = self.request_attachment_url_by_id(attachment_id)
                # Skip unreachable attachments and images (no text to index).
                if not is_url_accessible(attachment_url):
                    self._logger.warn("The attachment [%s] is not accessible.", attachment_url)
                    continue
                if is_image(attachment_url):
                    self._logger.warn("The attachment [%s] is image. Skip for indexing", attachment_url)
                    continue
                # The attachment URL doubles as the local Solr document id.
                existing_doc = self.local_solr_client.load_document_by_id(attachment_url)
                try:
                    if existing_doc is None:
                        self._logger.debug("current doc is not exist. Indexing now...")
                        self.local_solr_client.update_document_by_url(attachment_url, metadata=metadata_dict)
                        self._logger.debug("new doc is indexed.")
                    else:
                        # if current doc is existed
                        # update existing doc with possible new metadata
                        self._logger.debug("current doc is exist. update existing index now...")
                        existing_doc.update(metadata_dict)
                        self.local_solr_client.update_document_by_url(attachment_url, metadata=existing_doc)
                        self._logger.debug("updating of existing doc is complete.")
                except SolrError as solrError:
                    error_message = str(solrError)
                    self._logger.error(error_message)
                    if 'Conflict' in str(solrError):
                        error_message = "Conflict! Another process is running."
                    raise IntegrationException(error_message)
        # config Solr for improved indexing speed
        # self.solr_client.commit_all()

    @staticmethod
    def request_attachment_url_by_id(attachment_id):
        """
        request attachment url by attachement id
        :param attachment_id:
        :return: string, attachment url
        """
        _headers = {"Authorization": attachment_retrieval_api_auth_token}
        attachment_retrieval_get_api = attachment_retrieval_api
        r = requests.get(attachment_retrieval_get_api + "/" + str(attachment_id), headers=_headers)
        if r.status_code == 200:
            response = json.loads(r.text, encoding="utf-8")
            attachment_url = response["url"]
        else:
            raise Exception(r.reason)
        return attachment_url
class ReindexerTests(unittest.TestCase):
    # Methos to create the schema in the collections
    # Integration tests for the Reindexer helper against two live collections
    # (source and destination). Heavily order/timing dependent: left verbatim.

    def create_fields(self):
        # Best-effort schema field creation on both collections.
        for coll in self.colls:
            logging.debug("Creating fields for {}".format(coll))
            for field in test_config["collections"]["fields"]:
                try:
                    self.solr.schema.create_field(coll, field)
                except ValueError:
                    # Filed already exists probably
                    pass

    def create_copy_fields(self):
        # Best-effort copy-field creation on both collections.
        for coll in self.colls:
            logging.debug("Creating copy fields for {}".format(coll))
            for field in test_config["collections"]["copy_fields"]:
                try:
                    self.solr.schema.create_copy_field(coll, field)
                except ValueError:
                    # Filed already exists probably
                    pass

    def setUp(self):
        # Empty and commit both collections before each test.
        [self.solr.delete_doc_by_id(coll, "*") for coll in self.colls]
        [self.solr.commit(coll, openSearcher=True) for coll in self.colls]

    def _index_docs(self, numDocs, coll):
        """
        Generates and indexes in random data while maintaining counts of items
        in various date ranges.
        These counts in self.date_counts are used later to validate some
        reindexing methods.

        Brace yourself or have a drink.....
        """
        self.docs = self.rand_docs.get_docs(numDocs)
        sdate = datetime.datetime.now() - datetime.timedelta(days=180)
        edate = datetime.datetime.now() + datetime.timedelta(days=30)
        self._start_date = sdate
        self._end_date = edate
        import random
        # Assign random times to documents that are generated. This is used to
        # spread out the documents over multiple time ranges
        hours = (edate - sdate).days * 24
        hour_range = [x for x in range(int(hours))]
        self.date_counts = {}
        # Save the newest and oldest timestamps as well as assign them to
        # first and second doc
        self.docs[0]["date"] = sdate.isoformat() + "Z"
        self.date_counts[sdate.date().isoformat()] = 1
        self.docs[1]["date"] = edate.isoformat() + "Z"
        self.date_counts[edate.date().isoformat()] = 1
        for doc in self.docs[2:]:
            # Make a new date and store a count of it so I can compare later
            new_date = sdate + datetime.timedelta(hours=random.choice(hour_range))
            new_date_s = new_date.date().isoformat()
            if new_date_s in self.date_counts:
                self.date_counts[new_date_s] += 1
            else:
                self.date_counts[new_date_s] = 1
            doc["date"] = new_date.isoformat() + "Z"
        self.solr.index_json(coll, json.dumps(self.docs))
        self.solr.commit(coll, openSearcher=True)
        time.sleep(10)

    def get_all_json_from_indexq(self, index):
        # Load every queued file (gzip or plain JSON) and concatenate the docs.
        files = index.get_all_as_list()
        out = []
        for efile in files:
            if efile.endswith(".gz"):
                f = gzip.open(efile, "rt", encoding="utf-8")
            else:
                f = open(efile)
            f_data = json.load(f)
            f.close()
            out.extend(f_data)
        return out

    @classmethod
    def setUpClass(self):
        logging.debug("Starting to run Reindexer Tests")
        self.solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        self.colls = [test_config["SOLR_REINDEXER_COLLECTION_S"], test_config["SOLR_REINDEXER_COLLECTION_D"]]
        self.rand_docs = RandomTestData()

    def test_solr_to_indexq(self):
        """
        Will export documents from Solr and put them into an IndexQ.
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        self._index_docs(5000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        from_solr = self.solr.query("source_coll", {"q": "*:*", "rows": 5000}).docs
        from_solr = reindexer._trim_fields(from_solr)
        self.assertEqual(sorted(from_files, key=lambda x: x["id"]), sorted(from_solr, key=lambda x: x["id"]))

    def test_ignore_fields(self):
        """
        Will export documents from Solr and put them into an IndexQ.
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        for field in ["_version_", "product_name_exact"]:
            self.assertTrue(field in reindexer._ignore_fields)

    def test_ignore_fields_disable(self):
        """
        Checks to make sure ignore_fields override works
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index, ignore_fields=False)
        self.assertEqual(reindexer._ignore_fields, False)

    def test_ignore_fields_override(self):
        """
        Checks to make sure ignore_fields override works
        """
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        reindexer = Reindexer(
            source=self.solr, source_coll="source_coll", dest=index, ignore_fields=["_text_", "_any_other_field"]
        )
        self.assertEqual(reindexer._ignore_fields, ["_text_", "_any_other_field"])

    def test_get_copy_fields(self):
        """
        Tests the method to get copy fields from Solr.
        """
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter")
        self.assertEqual(
            reindexer._get_copy_fields(),
            [field["dest"] for field in self.solr.schema.get_schema_copyfields(self.colls[0])],
        )

    def test_query_gen(self):
        """
        Tests the method to get copy fields from Solr.
        """
        reindexer = Reindexer(source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter")
        self.assertEqual(
            reindexer._get_query("cursor"),
            {"cursorMark": "cursor", "rows": reindexer._rows, "q": "*:*", "sort": "id desc"},
        )

    def test_query_gen_pershard(self):
        """
        Tests the method to get copy fields from Solr.
        """
        reindexer = Reindexer(
            source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter", per_shard=True
        )
        self.assertEqual(
            reindexer._get_query("cursor"),
            {"cursorMark": "cursor", "rows": reindexer._rows, "q": "*:*", "sort": "id desc", "distrib": "false"},
        )

    def test_query_gen_date(self):
        """
        Tests the method to get copy fields from Solr.
        """
        reindexer = Reindexer(
            source=self.solr, source_coll=self.colls[0], dest=self.solr, dest_coll="doesntmatter", date_field="ddddd"
        )
        # NOTE(review): the expected dict literal repeats the "sort" key; the
        # first "id desc" entry is silently discarded by Python. Harmless but
        # probably unintended.
        self.assertEqual(
            reindexer._get_query("cursor"),
            {
                "cursorMark": "cursor",
                "rows": reindexer._rows,
                "q": "*:*",
                "sort": "id desc",
                "sort": "ddddd asc, id desc",
            },
        )

    def test_remove_copy_fields_from_data(self):
        index = IndexQ(test_config["indexqbase"], "test_reindexer", size=0)
        for dir in ["_todo_dir", "_done_dir"]:
            [os.remove(x) for x in index.get_all_as_list(dir=dir)]
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=index)
        reindexer.reindex()
        from_files = self.get_all_json_from_indexq(index)
        excluded_fields = reindexer._ignore_fields
        for doc in from_files:
            for field in excluded_fields:
                if field in doc:
                    print(doc)
        # self.assertTrue(field not in doc)

    def test_solr_to_solr(self):
        self._index_docs(50000, self.colls[0])
        reindexer = Reindexer(source=self.solr, source_coll="source_coll", dest=self.solr, dest_coll="dest_coll")
        reindexer.reindex()
        # NOTE(review): list.sort() returns None, so this compares None == None
        # and always passes regardless of the documents. Likely intended
        # sorted(...) on both sides.
        self.assertEquals(
            self.solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
            self.solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
        )

    def test_solr_to_solr_with_date(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        reindexer.reindex()
        # The sort param may land in the first or second logged action
        # depending on how the transport batched requests.
        try:
            self.assertTrue(solr.transport._action_log[1]["params"]["params"]["sort"] == "index_date asc, id desc")
        except KeyError:
            self.assertTrue(solr.transport._action_log[2]["params"]["params"]["sort"] == "index_date asc, id desc")
        # NOTE(review): same .sort()-returns-None pattern as test_solr_to_solr.
        self.assertEqual(
            solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
            solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs.sort(key=lambda x: x["id"]),
        )

    def test_get_edge_date(self):
        """
        Checks to make sure _get_edge_date returns correct start and end dates.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        solr_end_date_string = reindexer._get_edge_date("date", "desc")
        solr_start_date_string = reindexer._get_edge_date("date", "asc")
        # NOTE(review): assertTrue with two args does not compare them — the
        # second argument is just the failure message. Likely meant assertEqual.
        self.assertTrue(
            self._start_date.date(), datetime.datetime.strptime(solr_start_date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
        )
        self.assertTrue(
            self._end_date.date(), datetime.datetime.strptime(solr_end_date_string, "%Y-%m-%dT%H:%M:%S.%fZ")
        )

    def test_get_date_range_query(self):
        """
        Checks the date_range_query generation function. Since it's pretty
        simple, running all the tests as one
        """
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="index_date"
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "index_date",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1DAY",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", date_field="date123"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "date123",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1DAY",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", date_field="date123", timespan="MONTH"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "date123",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1MONTH",
            },
        )
        self.assertEqual(
            reindexer._get_date_range_query("2015-11-10", "2015-12-11", timespan="MONTH"),
            {
                "rows": 0,
                "facet.range.end": "2015-12-11",
                "facet": "true",
                "facet.range": "index_date",
                "facet.range.start": "2015-11-10",
                "q": "*:*",
                "facet.range.include": "all",
                "facet.range.gap": "+1MONTH",
            },
        )

    def test_get_date_facet_counts(self):
        """
        Checks the date_range_query generation function. Makes sure the date
        ranges returned matches what got indexed.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts(
            "DAY", "date", start_date=self._start_date.date().isoformat()
        )
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, "%Y-%m-%dT%H:%M:%SZ").date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_without_start_date(self):
        """
        Checks the date_range_query generation function. Since it's pretty
        simple, running all the tests as one
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one
        source_facet, dest_facet = reindexer._get_date_facet_counts("DAY", "date")
        for dt_range in source_facet:
            dt = datetime.datetime.strptime(dt_range, "%Y-%m-%dT%H:%M:%SZ").date().isoformat()
            if source_facet[dt_range] != self.date_counts[dt]:
                logging.info("{} - {} - {}".format(dt, source_facet[dt_range], self.date_counts[dt]))
            self.assertEqual(source_facet[dt_range], self.date_counts[dt])

    def test_get_date_facet_counts_not_day(self):
        """
        Checks the date_range_query generation function. Since it's pretty
        simple, running all the tests as one
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Testing this one: only DAY granularity is supported.
        with self.assertRaises(ValueError):
            source_facet, dest_facet = reindexer._get_date_facet_counts("MONTH", "date")

    ## These tests are focused on methods related to resuming re-indexing
    def test_solr_to_solr_resume_checkonly(self):
        """
        Checks the date_range_query generation function. Since it's pretty
        simple, running all the tests as one
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], devel=True, auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.resume(check=True)
        # Makes sure nothing got indexed
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)

    def test_solr_to_solr_resume_basic(self):
        """
        Checks the date_range_query generation function. Since it's pretty
        simple, running all the tests as one
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has datae
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.resume()
        sleep(10)
        # Make sure countc match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindex_and_resume(self):
        """
        Only reindexes half of the collection on the first time. Then goes
        back and does a resume to make sure it works.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has datae
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        # This gets somehwat of a mid point date in the range.
        midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
        # Reindex approximately half of the data by restricting FQ
        reindexer.reindex(fq=["date:[* TO {}]".format(midpoint.isoformat() + "Z")])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * 0.20)
        reindexer.resume()
        sleep(10)
        # Make sure countc match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindex_and_resume_reverse(self):
        """
        Only reindexes half of the collection on the first time. Then goes
        back and does a resume to make sure it works.
        """
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        # This gets somehwat of a mid point date in the range.
        midpoint = datetime.datetime.now() - datetime.timedelta(days=((self._end_date - self._start_date).days / 2))
        # Reindex approximately half of the data by restricting FQ
        reindexer.reindex(fq=["date:[{} TO *]".format(midpoint.isoformat() + "Z")])
        sleep(10)
        # Make sure we have at least 20% of the data.
        dest_count = len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs)
        s_count = len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs)
        self.assertTrue(s_count > dest_count > s_count * 0.20)
        reindexer.resume()
        sleep(10)
        # Make sure countc match up after reindex
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )

    def test_solr_to_solr_reindexer_per_shard(self):
        self._index_docs(50000, self.colls[0])
        solr = SolrClient(test_config["SOLR_SERVER"][0], auth=test_config["SOLR_CREDENTIALS"])
        reindexer = Reindexer(
            source=solr, source_coll="source_coll", dest=solr, dest_coll="dest_coll", per_shard=True, date_field="date"
        )
        # Make sure only source has data
        self.assertEqual(len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs), 50000)
        self.assertEqual(len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs), 0)
        reindexer.reindex()
        # sloppy check over here, will improve later
        self.assertEqual(
            len(solr.query(self.colls[0], {"q": "*:*", "rows": 10000000}).docs),
            len(solr.query(self.colls[1], {"q": "*:*", "rows": 10000000}).docs),
        )
''' Create a playlist manually by entering songs one at a time and searching solr for the particular song There is also create_playlist_from_queue.py that has you put the songs on the queue (from a playlist or whatever) and creates a playlist from the queue ''' from SolrClient import SolrClient from config import ec_uri solr = SolrClient(ec_uri+':8983/solr') collection = 'sonos_companion' track_title = input("\nwhat is the title of the track that you are looking for? ") s = 'title:' + ' AND title:'.join(track_title.split()) result = solr.query(collection, {'q':s, 'rows':10, 'fl':['score', 'id', 'uri', 'title', 'artist', 'album'], 'sort':'score desc'}) tracks = result.docs count = result.get_results_count() if count==0: print("Didn't find any tracks\n") elif count==1: track = tracks[0] try: print('id: ' + track['id']) print('artist: ' + track['artist']) print('album: ' + track['album']) print('song: ' + track['title']) print('uri: ' + track['uri']) except Exception as e: print(e) print('------------------------------------------------------------------------------------------------') else:
documents = [] n=1 for track in queue: title = track.title uri = track.uri id_ = album + ' ' + title id_ = id_.replace(' ', '_') id_ = id_.lower() document = {"id":id_, "title":title, "uri":uri, "album":album, "artist":artist, "track":n} print(repr(document).encode('cp1252', errors='replace')) for k in document: print(str(k+':'+str(document[k])).encode('cp1252', errors='ignore')) documents.append(document) n+=1 solr = SolrClient(solr_uri+'/solr') collection = 'sonos_companion' response = solr.index_json(collection, json.dumps(documents)) print(response) # Since solr.commit didn't seem to work, substituted the below, which works url = solr_uri+"/solr/"+collection+"/update" r = requests.post(url, data={"commit":"true"}) print(r.text) resp = input("Do you want to continue? (y or n) ") if resp not in ('y', 'yes'): cont = False ######################################################################
# Startup/setup section of a Sonos-controller script: wires up the SoCo
# library, a Solr client, and the AWS resources (S3 for the current location
# flag, SQS for incoming Echo commands).
import time
from time import sleep
import random
import json
import argparse
import sys
import datetime

# NOTE(review): `os` is used below but not imported in this visible chunk —
# presumably imported earlier in the file; confirm.
# Prepend the sibling SoCo checkout so `import soco` picks up the local copy.
home = os.path.split(os.getcwd())[0]
sys.path = [os.path.join(home, 'SoCo')] + sys.path
import soco
from soco import config
import boto3
import config as c
from SolrClient import SolrClient

solr = SolrClient(c.ec_uri+':8983/solr')
collection = 'sonos_companion'

parser = argparse.ArgumentParser(description='Command line options ...')
parser.add_argument('--player', '-p', default='all', help="This is the name of the player you want to control or all")
args = parser.parse_args()

# The current deployment location is stored as a small S3 object.
# NOTE(review): `object` shadows the builtin; left as-is since code beyond
# this chunk may reference the name.
s3 = boto3.resource('s3')
object = s3.Object('sonos-scrobble','location')
location = object.get()['Body'].read()
print("The current location is {}".format(location))

# Pick the SQS queue matching the location flag (body is raw bytes, hence b'ct').
sqs = boto3.resource('sqs', region_name='us-east-1')
queue_name = 'echo_sonos_ct' if location==b'ct' else 'echo_sonos'
sqs_queue = sqs.get_queue_by_name(QueueName=queue_name)
# Ad-hoc script: query the 'tableAbstract' core for docs whose memBody
# contains "blood" (with a range facet) and print each doc id as an int.
from solr import *
import pysolr
#conn = solr.solr("http://solr.example.net/solr")
#conn = solr.Solr("http://solr.example.net/solr")
#solr.SearchHandler(conn,"/select")
#conn.query()
import sklearn
from SolrClient import SolrClient

# NOTE(review): `solr` rebinds the name imported via `from solr import *`.
solr=SolrClient('http://192.168.1.100:8983/solr/')
result=solr.query('tableAbstract',{'q':'memBody:blood','facet':True,'facet.range.start':0,'facet.range.end':1000000})
for x in result.docs:
    #print(x['id'])
    # ids are stored as numeric strings (possibly "123.0"), hence float->int.
    print(int(float(x['id'])))
    #print(x['id'])
print (result.get_num_found())