def __init__(self, core_name=None, wt='json'):
    self.core_name = core_name
    if core_name:
        self.__solr = pysolr.Solr(self.__solr_url + core_name)
    if wt:
        self.wt = wt
import re

import pysolr
from sqlalchemy.sql import func

import config
import models

if hasattr(config, "solr") and config.solr == "lib_prod":
    blake_object_solr = pysolr.Solr(
        'http://webapp.lib.unc.edu:8200/solr/blake/blake_object')
    blake_copy_solr = pysolr.Solr(
        'http://webapp.lib.unc.edu:8200/solr/blake/blake_copy')
    blake_work_solr = pysolr.Solr(
        'http://webapp.lib.unc.edu:8200/solr/blake/blake_work')
elif hasattr(config, "solr") and config.solr == "lib_dev":
    blake_object_solr = pysolr.Solr(
        'http://london.libint.unc.edu:8983/solr/blake_object')
    blake_copy_solr = pysolr.Solr(
        'http://london.libint.unc.edu:8983/solr/blake_copy')
    blake_work_solr = pysolr.Solr(
        'http://london.libint.unc.edu:8983/solr/blake_work')
elif hasattr(config, "solr") and config.solr == "local":
    blake_object_solr = pysolr.Solr('http://localhost:8983/solr/blake_object')
    blake_copy_solr = pysolr.Solr('http://localhost:8983/solr/blake_copy')
    blake_work_solr = pysolr.Solr('http://localhost:8983/solr/blake_work')
else:
    blake_object_solr = pysolr.Solr(
        'http://ctools-dev.its.unc.edu/solr/blake-object')
    blake_copy_solr = pysolr.Solr(
        'http://ctools-dev.its.unc.edu/solr/blake-copy')
    blake_work_solr = pysolr.Solr(
        'http://ctools-dev.its.unc.edu/solr/blake-work')
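For reference, a minimal sketch of the config module the branches above test against; the attribute values are the ones this block checks for, but the file layout itself is an assumption:

# Hypothetical config.py stub (assumption: the real module also defines
# db_connection_string and other settings used by models).
solr = "lib_dev"  # one of "lib_prod", "lib_dev", "local"; unset falls back to the ctools-dev cores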
def query_network_fast(self, uuid, search_string, max_edges):
    niceCx = NiceCXNetwork()
    #uuid = '7246d8cf-c644-11e6-b48c-0660b7976219'
    search_terms_dict = {k: 1 for k in search_string.split(',')}
    edge_keepers = set([])
    node_keepers = set([])
    solr = pysolr.Solr(solr_url + uuid + '/', timeout=10)
    try:
        results = solr.search(search_string, rows=10000)
        #search_terms_array = [int(n['id']) for n in results.docs]
        search_terms_array = {int(n['id']): 1 for n in results.docs}
        search_terms_set = set([int(n['id']) for n in results.docs])
        if not search_terms_array:
            return {'message': 'No nodes found'}
        print('starting nodes 1')

        #===================
        # METADATA
        #===================
        available_aspects = []
        for ae in (o for o in self.stream_aspect(uuid, 'metaData')):
            available_aspects.append(ae.get(CX_CONSTANTS.METADATA_NAME))
            mde = MetaDataElement(json_obj=ae)
            niceCx.add_metadata(mde)

        opaque_aspects = set(available_aspects).difference(known_aspects_min)
        print(opaque_aspects)

        #===================
        # EDGES
        #===================
        edge_count = 0
        added_edges = 0
        edges = []
        edge_found = False
        edge_id = -1
        edge_s = None
        edge_t = None
        edge_i = None
        e_count = 0
        start_time = time.time()
        parser = self.stream_aspect_raw(uuid, 'edges')
        for prefix, event, value in parser:
            # process event, prefix and value
            if event == 'end_map':
                e_count += 1
                if e_count % 5000 == 0:
                    print(e_count)
                if edge_found:
                    edge_keepers.add(edge_id)
                    node_keepers.update([edge_s, edge_t])
                    edges.append(EdgeElement(id=edge_id, edge_source=edge_s,
                                             edge_target=edge_t,
                                             edge_interaction=edge_i))
                edge_id = -1
                edge_s = None
                edge_t = None
                edge_i = None
                edge_found = False
            if prefix == 'item.s':
                if value in search_terms_set:
                    edge_found = True
                edge_s = value
            elif prefix == 'item.t':
                if value in search_terms_set:
                    edge_found = True
                edge_t = value
            elif prefix == 'item.i':
                if edge_found:
                    edge_i = value
            elif prefix == 'item.@id':
                edge_id = value

        print('Response time (Edge search): ' + str(time.time() - start_time))
        print(node_keepers)
        print(edge_keepers)
        '''
        if 'edges' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'edges')):
                if ae.get(CX_CONSTANTS.EDGE_SOURCE_NODE_ID) in search_terms_set or ae.get(CX_CONSTANTS.EDGE_TARGET_NODE_ID) in search_terms_set:
                    edge_keepers.add(ae.get(CX_CONSTANTS.ID))
                    node_keepers.update([ae.get(CX_CONSTANTS.EDGE_SOURCE_NODE_ID), ae.get(CX_CONSTANTS.EDGE_TARGET_NODE_ID)])
                if edge_count % 5000 == 0:
                    print(edge_count)
                if added_edges > max_edges:
                    raise StopIteration('Max edges reached')
                edge_count += 1
        else:
            raise Exception('Network does not contain any nodes. Cannot query')
        '''
    except SolrError as se:
        if '404' in str(se):
            ndex2.get_logger('SOLR').warning('Network not found ' + self.uuid +
                                             ' on ' + solr_url + ' server.')
            raise Exception("Network not found (SOLR)")
        else:
            ndex2.get_logger('SOLR').warning('Network error ' + self.uuid +
                                             ' on ' + solr_url + ' server. ' + str(se))
            raise Exception(str(se))
    except StopIteration as si:
        ndex2.get_logger('QUERY').warning("Found more than max edges. Raising exception")
        raise StopIteration(str(si))
accounts = [
    path.replace(target_base_dir, '') for path in glob(target_base_dir + '*')
]
for account in accounts:
    logs = glob(target_base_dir + account + '/*')
    for log in logs:
        try:
            # MOD - 2018/03/20 - Fujinet - SF6.7.0 development request No.1 - START
            #fp = open(log)
            #fcntl.flock(fp.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB)
            if os.path.getsize(log) <= 0:
                continue
            else:
                fp = open(log)
                fcntl.flock(fp.fileno(), fcntl.LOCK_SH | fcntl.LOCK_NB)
            # MOD - 2018/03/20 - Fujinet - SF6.7.0 development request No.1 - END
        except IOError as e:
            print e
            continue
        with fp:
            json_list = json.loads(fp.read())
            fcntl.flock(fp.fileno(), fcntl.LOCK_UN)
        try:
            s = solr.Solr(solr_base_URL + account, timeout=60)
            result = s.add([json_list])
        except solr.SolrError as e:
            print e
            continue
        os.remove(log)
def delete_user(user_id):
    """Celery task to remove a user from search."""
    solr = pysolr.Solr(settings.SOLR_URL, always_commit=True)
    solr.delete(q="type:users AND atlas_id:%s" % user_id)
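A minimal usage sketch: calling the task removes every document whose atlas_id matches, and because the client is created with always_commit=True the deletion is committed on the same request (the user id below is a placeholder):

# Hypothetical invocation of the task defined above.
delete_user(42)  # issues solr.delete(q="type:users AND atlas_id:42") and commits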
def searcher(self):
    self.solr = pysolr.Solr(self.options.url)
# -*- coding: utf-8 -*-
"""
Created on Fri May 14 10:23:00 2021

@author: cbadenes
"""
import worker.annotator as workers
import pysolr
import multiprocessing as mp
import time

if __name__ == '__main__':
    print("annotating documents..")
    solr = pysolr.Solr('http://librairy.linkeddata.es/data/mesinesp',
                       always_commit=True, timeout=50)

    print("Number of processors: ", mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())

    print("reading from solr..")
    counter = 0
    completed = False
    window_size = 50
    cursor = "*"
    t = time.time()
    while not completed:
        old_counter = counter
        try:
def update_by_query(self, query, field=None, value=None, data=None,
                    queryparameters=None):
    import pysolr

    # avoid a mutable default argument
    if data is None:
        data = {}

    count = 0
    solr = pysolr.Solr(self.solr + self.core)

    #
    # extend query: do not return documents that are tagged
    #
    query_marked_before = ''
    if field:
        query_marked_before = field + ':"' + solr_mask(value) + '"'
    # else extract field and value from data to build query of yet tagged docs to exclude
    for fieldname in data:
        if isinstance(data[fieldname], list):
            for value in data[fieldname]:
                if query_marked_before:
                    query_marked_before += " AND "
                query_marked_before += fieldname + ':"' + solr_mask(value) + '"'
        else:
            value = data[fieldname]
            if query_marked_before:
                query_marked_before += " AND "
            query_marked_before += fieldname + ':"' + solr_mask(value) + '"'

    solrparameters = {
        'fl': 'id',
        'defType': 'edismax',
        'rows': 10000000,
    }

    # add custom Solr parameters (if the same parameter, overwrite the above defaults)
    if queryparameters:
        solrparameters.update(queryparameters)

    if query_marked_before:
        # don't extend the query itself; use a filter query for more performance (cache) on aliases
        solrparameters["fq"] = 'NOT (' + query_marked_before + ')'

    if self.verbose:
        print("Solr query:")
        print(query)
        print("Solr parameters:")
        print(solrparameters)

    results = solr.search(query, **solrparameters)

    for result in results:
        docid = result['id']
        if self.verbose:
            print("Tagging {}".format(docid))
        self.tag(docid=docid, field=field, value=value, data=data)
        count += 1

    return count
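A hedged usage sketch of the method above; `tagger` stands in for whatever object carries the `solr`, `core`, `verbose`, and `tag` members, none of which are shown here:

# Hypothetical caller: tag every document matching the query with one
# field/value pair, skipping documents that already carry that tag.
count = tagger.update_by_query('author_ss:"Smith"', field='tag_ss', value='reviewed')
print("{} documents tagged".format(count))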
def search(self, query, project_id, version_id, top_n=50, return_json=False,
           query_string=None, query_field=None):
    """
    This function takes a Lucene query, which can be created from the Lucene
    query parser class, and performs a search on the index. It then returns
    the top n results according to the ranking score.

    Inputs
    ------
    query : Lucene Query
        A query created by the QueryGenerator class present in query_generator.py
    top_n : Int
        The number of top results we want our search to return
    return_json : Bool
        If true, the search results are jsonified and returned; else the
        search results are returned in the Lucene Document format
    query_string : String
        The string entered by the user
    query_field : String
        The name of the field against which the reranker must be run
    """
    # Field names do not contain spaces
    query_field = query_field.replace(" ", "_")
    proj_exists = self.ensure_collection_exists(project_id, version_id)
    if proj_exists:
        index_url = self.solr_server_link + "/solr/" + proj_exists
    if not proj_exists:
        return 400

    if self.use_rm3 and index_url:
        new_url = index_url + '/anserini'
        response = self.session.get(new_url, data={"q": query})
        data = response.json()
        docs = data['docs']['docs']
        for idx, x in enumerate(docs):
            for key in x:
                x[key] = [x[key]]
            x['score'] = x['score'][0]
            x['id'] = str(idx)
        search_results_list = [x for x in docs]
        """
        the response contains these keys
        dict_keys(
            [
                'question', 'answer', 'answer_formatted',
                'question_variation_0', 'question_variation_1',
                'question_variation_2', 'score', 'id'
            ]
        )
        """
    else:
        client = pysolr.Solr(index_url, always_commit=True)
        search_results = client.search(query, fl='*,score', rows=top_n)
        search_results_list = [x for x in search_results]
        if search_results.raw_response['response']['numFound'] > 0:
            max_score = search_results.raw_response['response']['docs'][0]['score']
            if max_score < 3:
                return "Not present"
            """
            the response contains these keys
            dict_keys(
                [
                    'answer', 'answer_formatted', 'disease_1', 'disease_2',
                    'question_variation_0', 'question_variation_1',
                    'question_variation_2', 'question',
                    'subject_1_immunization', 'vaccine_1',
                    'who_is_writing_this', 'id', 'score', '_version_'
                ]
            )
            """
        else:
            search_results_list = []

    # print("reranking")
    # TODO: Add support for reranking multiple fields
    if self.rerank_endpoint is not None and query_string and query_field:
        ids = {}
        text = []
        # pdb.set_trace()
        for document in search_results_list:
            ids[document['question'][0]] = document['id']
            text.append([document['id'], document['question'][0]])
        scoreDocs = self.reranker.rerank(query_string, text)
        # case where gpu is rate limited
        if not scoreDocs:
            return_docs = [[x, x['score']] for x in search_results_list]
        else:
            return_docs = []
            for x in scoreDocs:
                for y in search_results_list:
                    if x[1] == y['question'][0]:
                        return_docs.append([y, x[0]])
                        break
    else:
        # TODO: set up so that score is correct
        # return document as well as score
        return_docs = [[x, x['score']] for x in search_results_list]

    if self.debug:
        scoreDocs = []
        if query_field.endswith("*"):
            # mapper from text to doc
            fields = [
                "question_variation_1",
                "question_variation_0",
                "answer",
                "answer_formatted"
            ]
        else:
            # only show qa
            fields = [
                "answer",
                "answer_formatted"
            ]
        for doc in return_docs:
            text = doc[0][query_field.replace('*', "")][0]
            for field in fields:
                text += " ||| " + doc[0][field][0]
            scoreDocs.append([doc[1], text])
    return scoreDocs
urls = (
    '/', 'SimpleIndexSearchPage',
    '/searchSimpleIndex', 'SearchSimpleIndex',
)

CATEGORY = {
    'b': 'Business',
    'e': 'Entertainment',
    't': 'Science and Technology',
    'm': 'Health'
}

render = web.template.render('templates/', base='layout')

SOLR_SIMPLEINDEX = pysolr.Solr('http://localhost:8983/solr/simpleindex')


def get_web_input(web_input):
    draw = web_input['draw']
    query = web_input['search[value]']
    offset = web_input['start']
    count = web_input['length']
    return draw, query, offset, count


def search(query, offset, count, draw, solr_endpoint):
    """
    This function is responsible for hitting the solr endpoint and
    returning the results back.
    """
def connect_to_solr(core, port=8983):
    solr_instance = pysolr.Solr(
        'http://localhost:{}/solr/{}'.format(port, core),
        always_commit=True)
    return solr_instance
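Example usage with a placeholder core name; since the client is built with always_commit=True, each add is committed immediately:

solr = connect_to_solr('articles')  # -> http://localhost:8983/solr/articles
solr.add([{'id': 'doc-1', 'title_t': 'hello'}])  # field names are placeholders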
def intent_request(session, request):
    intent = request['intent']['name']
    print "intent_request: {}".format(intent)
    if intent == "CreateItem":
        title = request['intent']['slots']['mytitle']['value']
        data = {'title': title, 'note': "This was created via Amazon Echo"}
        output_speech = "You want to add: {}. What is the context?".format(title)
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': False
            },
            'sessionAttributes': {
                'title': title
            }
        }
        return response
    elif intent == "SetContextOrFolder":
        attributes = session['attributes']
        if attributes.get('context'):
            folder = request['intent']['slots']['mycontextorfolder'].get(
                'value', "No Folder")
            d = {
                'new_task': True,
                'title': attributes.get('title', 'No title'),
                'context': attributes['context'],
                'folder': folder
            }
            output_speech = "The item will be placed in folder {}. Do you want it to be starred?".format(folder)
        else:
            context = request['intent']['slots']['mycontextorfolder'].get(
                'value', "No Context")
            d = {
                'title': attributes.get('title', 'No title'),
                'context': context
            }
            output_speech = "The item will be placed in context {}. What folder do you want to place it in?".format(context)
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': False
            },
            'sessionAttributes': d
        }
        return response
    elif intent == "AMAZON.YesIntent":
        # ? should use dict.copy()
        attributes = session['attributes']
        if attributes.get('new_task'):
            title = attributes['title']
            context_title = attributes['context']
            folder_title = attributes['folder']
            #data = {'title': title, 'context': context, 'folder': folder, 'star': True}
            #r = requests.post(c.ec_uri + ":5000/incoming_from_echo", json=data)
            task = Task(priority=3, title=title, star=True)
            folder = remote_session.query(Folder).filter_by(
                title=folder_title.lower()).first()
            if folder:
                task.folder = folder
            else:
                folder_title = 'No Folder'
            context = remote_session.query(Context).filter_by(
                title=context_title.lower()).first()
            if context:
                task.context = context
            else:
                context_title = 'No Context'
            task.startdate = datetime.today().date()
            task.note = "Created from Echo on {}".format(task.startdate)
            remote_session.add(task)
            remote_session.commit()
            r = requests.get(c.ec_uri + ':5000/sync')
            print r.text
            output_speech = "The new item {} will be created for context {} and folder {} and it will be starred".format(
                title, context_title, folder_title)
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
            return response
        else:
            which_task = attributes.get('which_task')
            if which_task:
                task = remote_session.query(Task).get(which_task)
                output_speech = "I will read the note: " + task.note
                response = {
                    'response': {
                        'outputSpeech': {
                            'type': 'PlainText',
                            'text': output_speech
                        },
                        'shouldEndSession': True
                    }
                }
    elif intent == "AMAZON.NoIntent":
        attributes = session['attributes']
        if attributes.get('new_task'):
            title = attributes['title']
            context_title = attributes['context']
            folder_title = attributes['folder']
            #data = {'title': title, 'context': context, 'folder': folder, 'star': True}
            #r = requests.post(c.ec_uri + ":5000/incoming_from_echo", json=data)
            task = Task(priority=3, title=title, star=False)
            folder = remote_session.query(Folder).filter_by(
                title=folder_title.lower()).first()
            if folder:
                task.folder = folder
            else:
                folder_title = 'No Folder'
            context = remote_session.query(Context).filter_by(
                title=context_title.lower()).first()
            if context:
                task.context = context
            else:
                context_title = 'No Context'
            task.startdate = datetime.today().date()
            task.note = "Created from Echo on {}".format(task.startdate)
            remote_session.add(task)
            remote_session.commit()
            r = requests.get(c.ec_uri + ':5000/sync')
            print r.text
            output_speech = "The new item {} will be created for context {} and folder {} and it will not be starred".format(
                title, context_title, folder_title)
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
            return response
    elif intent == "RetrieveFolderItems":
        folder_title = request['intent']['slots']['myfolder'].get('value', '')
        folder_title = folder_title.lower()
        q = remote_session.query(Task).join(Folder).filter(
            and_(Folder.title == folder_title, Task.completed == None,
                 Task.deleted == False,
                 datetime.now() - Task.modified < timedelta(days=30)))
        count = q.count()
        tasks = q.limit(10).all()
        if count:
            output_speech = "The total number of tasks is {}. ".format(count)
            now = datetime.now()
            for n, task in enumerate(tasks, start=1):
                output_speech += "{}, {}. Created {} days ago.{}".format(
                    n, task.title, (now - task.created).days,
                    ' It is starred. ' if task.star else ' ')
        else:
            output_speech = "I did not find anything."
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response
    elif intent == "RetrieveContextItems":
        context_title = request['intent']['slots']['mycontext'].get('value', '')
        context_title = context_title.lower()
        count = remote_session.query(Task).join(Context).filter(
            and_(Context.title == context_title, Task.completed == None,
                 Task.deleted == False,
                 datetime.now() - Task.modified < timedelta(days=30))).count()
        tasks = remote_session.query(Task).join(Context).filter(
            and_(Context.title == context_title, Task.completed == None,
                 Task.deleted == False,
                 datetime.now() - Task.modified < timedelta(days=30))).limit(10).all()
        if count:
            output_speech = "The total number of tasks is {}. ".format(count)
            now = datetime.now()
            for n, task in enumerate(tasks, start=1):
                output_speech += "{}, {}. Created {} days ago. It is {} starred. ".format(
                    n, task.title, (now - task.created).days,
                    '' if task.star else 'not')
        else:
            output_speech = 'I did not find anything.'
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response
    elif intent == 'RetrieveSpokenItems':
        #### this should become retrieve text2speech items #########
        tasks = remote_session.query(Task).join(TaskKeyword, Keyword).filter(
            and_(Task.completed == None, Task.deleted == False,
                 Task.id == TaskKeyword.task_id,
                 TaskKeyword.keyword_id == Keyword.id,
                 Keyword.name == 'text2speech')).all()
        #tasks = remote_session.query(Task).filter(Task.star == True).all()
        #tasks = remote_session.query(Task).join(Context).filter(and_(Context.title == 'work', Task.star == True, Task.completed == None, datetime.now() - Task.modified < timedelta(days=30))).all()
        if tasks:
            output_speech = ''
            for n, task in enumerate(tasks, start=1):
                output_speech += '{}, the title: {}, the note: {} '.format(
                    n, task.title, task.note)
        else:
            output_speech = 'I did not find anything.'
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response
    elif intent == "RetrieveSpecificItems":
        solr = pysolr.Solr(c.ec_uri + ':8983/solr/listmanager/', timeout=10)
        queryterm = request['intent']['slots']['queryterm']['value']
        queryterm = queryterm.replace(' ', '')
        # Initially thought the queryterm could be queryterms and hence the below
        #s = 'title:' + ' AND title:'.join(queryterm.split())
        #s = s + ' note:' + ' AND note:'.join(queryterm.split())
        s = 'title:{} note:{} tag:{}'.format(*(3 * (queryterm,)))
        print s
        #fq = ['star:true', 'completed:false']
        fq = ['completed:false']
        result = solr.search(s, fq=fq)  # rows=1 or **{'rows': 1}
        if len(result):
            output_speech = ''
            task_ids = {}
            for n, task in enumerate(result.docs, start=1):
                output_speech += '{}, {}, {}. '.format(n, task['id'], task['title'])
                task_ids[n] = task['id']
            output_speech += 'Which one do you want read?'
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': False
                },
                'sessionAttributes': {
                    'task_ids': task_ids
                }
            }
        else:
            output_speech = 'I did not find anything, sorry.'
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
        return response
    elif intent == 'GetTaskNumber':
        tasknumber = request['intent']['slots']['tasknumber']['value']
        print "tasknumber =", tasknumber
        print "type(tasknumber) =", type(tasknumber)  # "1"
        attributes = session['attributes']
        task_ids = attributes.get('task_ids')  # {"1": 2938}
        print "task_ids =", task_ids
        id_ = task_ids.get(tasknumber)
        if id_:
            print "id_ =", id_
            task = remote_session.query(Task).get(id_)
            output_speech = "I will read the note: " + task.note
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
            return response
        else:
            output_speech = 'I did not find anything, sorry.'
            response = {
                'response': {
                    'outputSpeech': {
                        'type': 'PlainText',
                        'text': output_speech
                    },
                    'shouldEndSession': True
                }
            }
    elif intent == 'RetrieveStarredItems':
        context = request['intent']['slots']['mycontext']['value']
        tasks = remote_session.query(Task).join(Context).filter(
            and_(Context.title == context, Task.star == True,
                 Task.completed == None,
                 datetime.now() - Task.modified < timedelta(days=30))).all()
        output_speech = ''
        for n, task in enumerate(tasks, start=1):
            output_speech += '{}, {}. '.format(n, task.title)
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': True
            }
        }
        return response
    else:
        output_speech = "I couldn't tell which type of intent request that was. Try again."
        response = {
            'response': {
                'outputSpeech': {
                    'type': 'PlainText',
                    'text': output_speech
                },
                'shouldEndSession': False
            }
        }
        return response
def str_rep(in_str):
    in_str = in_str.replace("\n", '')
    out_str = re.sub(r'\s+', ' ', in_str)
    out_str = re.sub(r"[.]+", '', out_str)
    return out_str


def extract_content(solr_data):
    soup = BeautifulSoup(solr_data, "html5lib")
    out = [str_rep(x) for x in soup.stripped_strings]
    return " ".join(out)


solr = pysolr.Solr(
    'http://localhost:8983/solr/gettingstarted', timeout=10
)
solr.delete(q='*:*')

for key, val in build_dict(DST_PATH).items():
    with open(val, 'rb') as fh:
        try:
            data = solr.extract(fh, extractOnly=True)
        except pysolr.SolrError as e:
            print("Damaged file %s" % val)
            continue
        metadata = data['metadata']
        content = extract_content(data['contents'])
        index = [{
            'id': key,
            '_text_': content
def main():
    from sqlalchemy.orm import sessionmaker

    if hasattr(config, "solr") and config.solr == "lib_prod":
        blake_object_solr = pysolr.Solr(
            'http://webapp.lib.unc.edu:8200/solr/blake/blake_object')
        blake_copy_solr = pysolr.Solr(
            'http://webapp.lib.unc.edu:8200/solr/blake/blake_copy')
        blake_work_solr = pysolr.Solr(
            'http://webapp.lib.unc.edu:8200/solr/blake/blake_work')
    elif hasattr(config, "solr") and config.solr == "lib_dev":
        blake_object_solr = pysolr.Solr(
            'http://london.libint.unc.edu:8983/solr/blake_object')
        blake_copy_solr = pysolr.Solr(
            'http://london.libint.unc.edu:8983/solr/blake_copy')
        blake_work_solr = pysolr.Solr(
            'http://london.libint.unc.edu:8983/solr/blake_work')
    else:
        blake_object_solr = pysolr.Solr(
            'http://localhost:8983/solr/blake_object')
        blake_copy_solr = pysolr.Solr('http://localhost:8983/solr/blake_copy')
        blake_work_solr = pysolr.Solr('http://localhost:8983/solr/blake_work')

    engine = models.db.create_engine(config.db_connection_string)
    session = sessionmaker(bind=engine)()

    objects = session.query(models.BlakeObject).all()
    blake_object_solr.delete(q='*:*')
    for blake_object in objects:
        try:
            if blake_object.supplemental is None:
                obj = {
                    "id": blake_object.object_id,
                    "title": blake_object.title,
                    "bentley_id": blake_object.bentley_id,
                    "dbi": blake_object.dbi,
                    "desc_id": blake_object.desc_id,
                    "copy_id": blake_object.copy_bad_id,
                    "characteristics": blake_object.characteristics,
                    "components": json.dumps(blake_object.components),
                    "illustration_description": json.dumps(blake_object.illustration_description),
                    "text": json.dumps(blake_object.text),
                    "copy_title": blake_object.copy.title,
                    "copy_institution": blake_object.copy.institution,
                    # FIXME: properly convert unicode rather than stripping characters
                    "notes": json.dumps([
                        unicodedata.normalize('NFKD', note["note"]).encode(
                            'ascii', 'ignore') for note in blake_object.notes
                    ])
                }
                print obj["id"]
                if blake_object.copy.work:
                    obj["work_title"] = blake_object.copy.work.title
                    obj["work_id"] = blake_object.copy.work.bad_id
                    obj["composition_date"] = blake_object.copy.composition_date
                    obj["print_date"] = blake_object.copy.print_date
                    obj["medium"] = blake_object.copy.work.medium
                blake_object_solr.add([obj])
        except pysolr.SolrError as err:
            print err
    blake_object_solr.optimize()

    copies = session.query(models.BlakeCopy).all()
    blake_copy_solr.delete(q='*:*')
    for blake_copy in copies:
        copy_ = {
            "id": blake_copy.copy_id,
            "bad_id": blake_copy.bad_id,
            "source": blake_copy.source,
            "title": blake_copy.title,
            "institution": blake_copy.institution,
            "header": blake_copy.header,
            "composition_date": blake_copy.composition_date,
            "print_date": blake_copy.print_date,
            "effective_copy_id": blake_copy.effective_copy_id
        }
        if blake_copy.work:
            copy_["medium"] = blake_copy.work.medium
            copy_["work_id"] = blake_copy.work.bad_id
        blake_copy_solr.add([copy_])
    blake_copy_solr.optimize()

    works = session.query(models.BlakeWork).all()
    blake_work_solr.delete(q='*:*')
    for blake_work in works:
        blake_work_solr.add([{
            "id": blake_work.work_id,
            "bad_id": blake_work.bad_id,
            "title": blake_work.title,
            "medium": blake_work.medium,
            "info": blake_work.info,
            "image": blake_work.image,
            "composition_date": blake_work.composition_date,
            "composition_date_string": blake_work.composition_date_string
        }])
    blake_work_solr.optimize()
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.searcher = pysolr.Solr(settings.SOLR_URL)
import pysolr
import json
import time

#sudo su - solr -c "/opt/solr/bin/solr create -c TKTDNhom10 -n data_driven_schema_configs"
#sudo su - solr -c "/opt/solr/bin/solr delete -c TKTDNhom10 -deleteConfig data_driven_schema_configs"
solr = pysolr.Solr("http://localhost:8983/solr/TestConfig/", timeout=1000)

# import spacy
# nlp = spacy.load("vi_spacy_model")

#/home/luuthanh/Desktop/test_config
# build the index
# with open("/home/luuthanh/Desktop/TKTD/backend/data.json", "r") as file:
#     solr.add(json.load(file))

# a = nlp('"Quốc tế"')
results = solr.search(
    'title_decription_content: ("WikiLeaks") && date: [ * TO NOW ]', **{
        'start': 0,
        'rows': 5,
        'fl': '* score',
        'fq': ['topic: ( * )', 'author: ( * )'],
        'hl': 'true',
        'hl.method': 'original',
        'hl.simple.pre': '<mark>',
        'hl.simple.post': '</mark>',
        'hl.highlightMultiTerm': 'true',
        # 'hl.usePhraseHighlighter': 'false',
        'hl.fragsize': 100,
        'defType': 'edismax',
def indexer(self):
    self.solr_doclist = []
    self.conn = pysolr.Solr(self.options.url)
    self.conn.delete(q="*:*")
    self.conn.commit()
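A standalone sketch of the same clear-then-reindex pattern under explicit assumptions (local Solr, a core named 'demo', placeholder documents):

import pysolr

conn = pysolr.Solr('http://localhost:8983/solr/demo')
conn.delete(q='*:*')                           # empty the core
conn.add([{'id': 'doc-1'}, {'id': 'doc-2'}])   # re-add fresh documents
conn.commit()                                  # make the delete and the adds visible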
def run():
    ping('metadata_database', verbose=True)
    ping('temporal_index_database', verbose=True)
    ping('spatial_index_database', verbose=True)
    ping('thematic_index_solr', verbose=True)

    engine1 = create_engine(conf_metadata_processor.db_connection)
    with engine1.connect() as conn:
        test = conn.execute("""SELECT table_name
                               FROM information_schema.tables
                               WHERE table_schema='public' AND table_type='BASE TABLE';""").fetchall()
        if len(test) < 2:
            raise Exception("The tables have not been created yet.")
        test = conn.execute(
            "SELECT trigger_name FROM information_schema.triggers;").fetchall()
        if len(test) < 2:
            raise Exception("The triggers have not been created yet.")
    del engine1

    engine2 = create_engine(conf_temporal_indexing.db_connection)
    with engine2.connect() as conn:
        test = conn.execute("""SELECT table_name
                               FROM information_schema.tables
                               WHERE table_schema='public' AND table_type='BASE TABLE';""").fetchall()
        if len(test) < 1:
            raise Exception("The tables have not been created yet.")
        test = conn.execute("""SELECT routine_name
                               FROM information_schema.routines
                               WHERE routine_type='FUNCTION' AND specific_schema='public';""").fetchall()
        if len(test) < 4:
            raise Exception("The stored procedures have not been created yet.")
    del engine2

    driver = GraphDatabase.driver(
        conf_spatial_indexing.db_connection["bolt_uri"],
        auth=basic_auth(conf_spatial_indexing.db_connection["user"],
                        conf_spatial_indexing.db_connection["password"]),
        encrypted=False)
    with driver.session() as session:
        test = session.run("match (p:Place) return count(p);").value()
        if test[0] < 5596:
            raise Exception("The graph of places has not been fully created.")
    del driver

    solr_resource = pysolr.Solr(conf_thematic_indexing.resource_solr_core_uri,
                                always_commit=True)
    solr_resource.ping()
    solr_dataset = pysolr.Solr(conf_thematic_indexing.dataset_solr_core_uri,
                               always_commit=True)
    solr_dataset.ping()
    del solr_dataset
    del solr_resource
def add_to_search_index(data_dict_id, in_bulk=False):
    log = logging.getLogger('ckan')
    od_search_solr_url = config.get(SEARCH_INTEGRATION_URL_OPTION, "")
    od_search_enabled = config.get(SEARCH_INTEGRATION_ENABLED_OPTION, False)
    od_search_od_url_en = config.get(
        SEARCH_INTEGRATION_OD_URL_EN_OPTION,
        "https://open.canada.ca/data/en/dataset/")
    od_search_od_url_fr = config.get(
        SEARCH_INTEGRATION_OD_URL_FR_OPTION,
        "https://ouvert.canada.ca/data/fr/dataset/")

    # Retrieve the full record - it has additional information, including the
    # organization title and metadata modified date, that is not available in
    # the regular data dict
    portal = LocalCKAN()
    data_dict = portal.action.package_show(id=data_dict_id)

    if not od_search_enabled:
        return

    try:
        subject_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_subject')['choices'])
        type_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_resource_related_type')['choices'])
        collection_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_collection')['choices'])
        juristiction_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_jurisdiction')['choices'])
        resource_type_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_resource_type')['choices'])
        frequency_codes = scheming_choices_label_by_value(
            scheming_get_preset('canada_frequency')['choices'])

        org_title = data_dict['organization']['title'].split('|')
        owner_org_title_en = org_title[0].strip()
        owner_org_title_fr = org_title[1].strip()

        subjects_en = []
        subjects_fr = []
        subjects = json.loads(data_dict['subject']) if \
            isinstance(data_dict['subject'], str) else data_dict['subject']
        for s in subjects:
            subjects_en.append(subject_codes['en'][s].replace(",", ""))
            subjects_fr.append(subject_codes['fr'][s].replace(",", ""))

        resource_type_en = []
        resource_type_fr = []
        resource_fmt = []
        resource_title_en = []
        resource_title_fr = []
        for r in data_dict['resources']:
            resource_type_en.append(
                resource_type_codes['en'][r['resource_type']]
                if r['resource_type'] in resource_type_codes['en'] else '')
            resource_type_fr.append(
                resource_type_codes['fr'][r['resource_type']]
                if r['resource_type'] in resource_type_codes['fr'] else '')
            resource_fmt.append(r['format'])
            resource_name = json.loads(r['name_translated']) if \
                isinstance(r['name_translated'], str) else r['name_translated']
            if 'en' in resource_name:
                resource_title_en.append(resource_name['en'])
            elif 'fr-t-en' in resource_name:
                resource_title_en.append(resource_name['fr-t-en'])
            if 'fr' in resource_name:
                resource_title_fr.append(resource_name['fr'].strip())
            elif 'en-t-fr' in resource_name:
                resource_title_fr.append(resource_name['en-t-fr'].strip())

        notes_translated = json.loads(data_dict['notes_translated']) if \
            isinstance(data_dict['notes_translated'], str) else data_dict['notes_translated']
        title_translated = json.loads(data_dict['title_translated']) if \
            isinstance(data_dict['title_translated'], str) else data_dict['title_translated']

        od_obj = {
            'portal_type_en_s': type_codes['en'][data_dict['type']],
            'portal_type_fr_s': type_codes['fr'][data_dict['type']],
            'collection_type_en_s': collection_codes['en'][data_dict['collection']],
            'collection_type_fr_s': collection_codes['fr'][data_dict['collection']],
            'jurisdiction_en_s': juristiction_codes['en'][data_dict['jurisdiction']],
            'jurisdiction_fr_s': juristiction_codes['fr'][data_dict['jurisdiction']],
            'owner_org_title_en_s': owner_org_title_en,
            'owner_org_title_fr_s': owner_org_title_fr,
            'subject_en_s': subjects_en,
            'subject_fr_s': subjects_fr,
            'resource_type_en_s': list(set(resource_type_en)),
            'resource_type_fr_s': list(set(resource_type_fr)),
            'update_cycle_en_s': frequency_codes['en'][data_dict['frequency']],
            'update_cycle_fr_s': frequency_codes['fr'][data_dict['frequency']],
            'id_name_s': data_dict['name'],
            'id': data_dict['name'],
            'owner_org_s': data_dict['owner_org'],
            'author_txt': data_dict['author'] if 'author' in data_dict else '',
            'description_txt_en': notes_translated['en'] if 'en' in notes_translated else '',
            'description_txt_fr': notes_translated['fr'] if 'fr' in notes_translated else '',
            'description_xlt_txt_fr': notes_translated['fr-t-en'] if 'fr-t-en' in notes_translated else '',
            'description_xlt_txt_en': notes_translated['en-t-fr'] if 'en-t-fr' in notes_translated else '',
            'title_en_s': title_translated['en'] if 'en' in title_translated else '',
            'title_fr_s': title_translated['fr'] if 'fr' in title_translated else '',
            'title_xlt_fr_s': title_translated['fr-t-en'] if 'fr-t-en' in title_translated else '',
            'title_xlt_en_s': title_translated['en-t-fr'] if 'en-t-fr' in title_translated else '',
            'resource_format_s': list(set(resource_fmt)),
            'resource_title_en_s': resource_title_en,
            'resource_title_fr_s': resource_title_fr,
            'last_modified_tdt': parser.parse(
                data_dict['metadata_modified']).replace(
                    microsecond=0).isoformat() + 'Z',
            'ogp_link_en_s': '{0}{1}'.format(od_search_od_url_en, data_dict['name']),
            'ogp_link_fr_s': '{0}{1}'.format(od_search_od_url_fr, data_dict['name']),
        }

        keywords = json.loads(data_dict['keywords']) if \
            isinstance(data_dict['keywords'], str) else data_dict['keywords']
        if 'en' in keywords:
            od_obj['keywords_en_s'] = keywords['en']
        elif 'fr-t-en' in keywords:
            od_obj['keywords_en_s'] = keywords['fr-t-en']
        if 'fr' in keywords:
            od_obj['keywords_fr_s'] = keywords['fr']
        elif 'en-t-fr' in keywords:
            od_obj['keywords_fr_s'] = keywords['en-t-fr']

        solr = pysolr.Solr(od_search_solr_url)
        if in_bulk:
            solr.add([od_obj])
        else:
            solr.delete(id=od_obj['id'])
            solr.add([od_obj])
            solr.commit()
    except Exception as x:
        log.error("Exception: {} {}".format(x, x.args))
import pysolr
import nltk
import numpy as np

solr = pysolr.Solr('http://localhost:8983/solr/glove', timeout=10)


def getVectorFromSolr(word):
    result = solr.search("id:" + word)
    return result.raw_response['response']['docs'][0]['value'][0].split(" ")


def getVector(word):
    word = word.lower().strip()
    wList = nltk.word_tokenize(word)
    count = 0
    res = np.zeros(100, dtype=float)
    for w in wList:
        temp = np.array(getVectorFromSolr(w), dtype=float)
        res = np.add(res, temp)
        count += 1
    res = np.divide(res, count)
    return res


def getCosineSimBetVectors(vec1, vec2):
    sim = max(
        0,
        np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))
    return sim
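Example usage of the helpers above, assuming the 'glove' core contains vectors for the queried tokens (the word choices are placeholders):

v1 = getVector("king")
v2 = getVector("queen")
print(getCosineSimBetVectors(v1, v2))  # cosine similarity, clamped to be non-negative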
# -*- coding: utf-8 -*-
"""
Created on Sun Dec 3 16:42:38 2017

@author: apurva
"""
import pysolr
import os
import sys

solr = pysolr.Solr('http://localhost:8983/solr/task4/', timeout=10000)

import nltk

path = sys.argv[1]

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from rake_nltk import Rake

lemmatizer = WordNetLemmatizer()
st = LancasterStemmer()
r = Rake()

documents = []
docId = 1
for filename in os.listdir(path):
    f = open(path + '/' + filename)
    raw = f.read()
    sentence = sent_tokenize(raw)
    sentId = 1
    for sen in sentence:
        words = word_tokenize(sen)
import pysolr
import json
import sys

# Argument 1 the json line file to index
file_to_index = sys.argv[1]
# Argument 2 core to connect
core_name = sys.argv[2]

solr = pysolr.Solr('http://localhost:8983/solr/%s/' % core_name)

size_buffer = 1000
count = 0
with open(file_to_index) as fil:
    for line in fil:
        json_obj = json.loads(line)
        json_obj['id'] = json_obj['url']
        solr.add([json_obj], commit=(count % size_buffer == 0))
        print("\r%d" % count, end='')
        count += 1
solr.commit()
print("\r%d" % count)
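A hypothetical invocation of the script above; the script and file names are placeholders, and each input line must be a JSON object with a 'url' key, which becomes the Solr id:

#   python index_jsonl.py crawl_output.jsonl web_pages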
            'overview': tmdbMovie['overview'],
            'tagline': tmdbMovie['tagline'],
            'cast_nomv': " ".join(
                [castMember['name'] for castMember in tmdbMovie['cast']]),
            'directors': [director['name'] for director in tmdbMovie['directors']],
            'cast': [castMember['name'] for castMember in tmdbMovie['cast']],
            'genres': [genre['name'] for genre in tmdbMovie['genres']],
            'release_date': releaseDate,
            'vote_average': tmdbMovie['vote_average'] if 'vote_average' in tmdbMovie else None,
            'vote_count': int(tmdbMovie['vote_count']) if 'vote_count' in tmdbMovie else None,
        }
    except KeyError as k:
        # Ignore any movies missing these attributes
        print(k)
        continue


if __name__ == "__main__":
    solr = pysolr.Solr('http://localhost:8983/solr/tmdb', timeout=100)
    solr.add(indexableMovies())
    solr.commit()
    {
        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
    },
    {
        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
    },
]

# Solr URL connection and access
SOLAR_CONFIGURATION = {"URL": "https://52.152.191.13:8983/solr"}

solr_product = pysolr.Solr(
    'https://52.152.191.13:8983/solr/product_information/',
    timeout=10,
    verify=False)
solr_notification_status = pysolr.Solr(
    'https://52.152.191.13:8983/solr/sap_notification_status/',
    timeout=10,
    verify=False)
solr_unstructure_data = pysolr.Solr(
    'https://52.152.191.13:8983/solr/unstructure_processed_data/',
    timeout=10,
    verify=False)
solr_document_variant = pysolr.Solr(
    'https://52.152.191.13:8983/solr/sap_document_variant/',
    timeout=10,
    verify=False)

# Internationalization
# https://docs.djangoproject.com/en/2.1/topics/i18n/
def solr_load_batch(batch):
    """Process batch."""
    solr = pysolr.Solr(settings.SOLR_URL, always_commit=True)
    solr.add(list(map(build_doc, batch)))
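A hedged sketch of how batches might be produced for solr_load_batch; the batch size and the iterable of model objects are assumptions, and build_doc is the mapper referenced above:

def solr_load_all(queryset, batch_size=500):
    # Feed solr_load_batch fixed-size chunks so each Solr request stays small.
    batch = []
    for obj in queryset.iterator():
        batch.append(obj)
        if len(batch) >= batch_size:
            solr_load_batch(batch)
            batch = []
    if batch:  # flush the final partial chunk
        solr_load_batch(batch)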
def make(self, conn_name, *args, **kwargs):
    url = settings.HAYSTACK_CONNECTIONS[conn_name]['URL']
    return pysolr.Solr(url, **kwargs)
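For context, the settings shape this factory reads, with placeholder names mirroring Haystack's configuration convention; extra kwargs pass straight through to pysolr.Solr:

HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'haystack.backends.solr_backend.SolrEngine',
        'URL': 'http://localhost:8983/solr/default_core',  # placeholder core
    },
}
# client = factory.make('default', timeout=30)  # `factory` is hypothetical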
def query_network(self, uuid, search_string, max_edges):
    myConst = CX_CONSTANTS
    niceCx = NiceCXNetwork()
    #uuid = '7246d8cf-c644-11e6-b48c-0660b7976219'
    search_terms_dict = {k: 1 for k in search_string.split(',')}
    solr = pysolr.Solr(solr_url + uuid + '/', timeout=10)
    try:
        results = solr.search(search_string, rows=10000)
        #search_terms_array = [int(n['id']) for n in results.docs]
        search_terms_array = {int(n['id']): 1 for n in results.docs}
        if not search_terms_array:
            return {'message': 'No nodes found'}
        print('starting nodes 1')

        #===================
        # METADATA
        #===================
        available_aspects = []
        for ae in (o for o in self.stream_aspect(uuid, 'metaData')):
            available_aspects.append(ae.get(CX_CONSTANTS.METADATA_NAME))
            mde = MetaDataElement(json_obj=ae)
            niceCx.add_metadata(mde)

        #available_aspects = ['edges', 'nodes']  # TODO - remove this
        opaque_aspects = set(available_aspects).difference(known_aspects_min)
        print(opaque_aspects)

        #===================
        # NODES
        #===================
        if 'nodes' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'nodes')):
                if search_terms_array.get(ae.get(CX_CONSTANTS.ID)):
                    add_this_node = NodeElement(cx_fragment=ae)
                    niceCx.create_node(add_this_node)
        else:
            raise Exception('Network does not contain any nodes. Cannot query')

        print('starting edges 1')

        #===================
        # EDGES
        #===================
        edge_count = 0
        added_edges = 0
        start_time = time.time()
        if 'edges' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'edges')):
                if niceCx.nodes.get(ae.get(CX_CONSTANTS.EDGE_SOURCE_NODE_ID_OR_SUBNETWORK)) is not None \
                        or niceCx.nodes.get(ae.get(CX_CONSTANTS.EDGE_TARGET_NODE_ID)) is not None:
                    add_this_edge = EdgeElement(cx_fragment=ae)
                    niceCx.create_edge(add_this_edge)
                    added_edges += 1
                if edge_count % 5000 == 0:
                    print(edge_count)
                #if edge_count > 30000:
                #    break
                if added_edges > max_edges:
                    raise StopIteration('Max edges reached')
                edge_count += 1
        else:
            raise Exception('Network does not contain any nodes. Cannot query')

        print('Response time (Edge search): ' + str(time.time() - start_time))
        print('starting nodes 2')

        #===================
        # NODES
        #===================
        for ae in (o for o in self.stream_aspect(uuid, 'nodes')):
            if niceCx.get_missing_nodes().get(ae.get(CX_CONSTANTS.ID)):
                add_this_node = NodeElement(cx_fragment=ae)
                niceCx.create_node(add_this_node)

        #====================
        # NETWORK ATTRIBUTES
        #====================
        if 'networkAttributes' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'networkAttributes')):
                add_this_network_attribute = NetworkAttributesElement(cx_fragment=ae)
                niceCx.add_network_attribute(add_this_network_attribute)

        #===================
        # NODE ATTRIBUTES
        #===================
        if 'nodeAttributes' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'nodeAttributes')):
                if niceCx.nodes.get(ae.get(CX_CONSTANTS.PROPERTY_OF)):
                    add_this_node_att = NodeAttributesElement(json_obj=ae)
                    niceCx.add_node_attribute(add_this_node_att)

        #===================
        # EDGE ATTRIBUTES
        #===================
        if 'edgeAttributes' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'edgeAttributes')):
                if niceCx.edges.get(ae.get(CX_CONSTANTS.PROPERTY_OF)):
                    add_this_edge_att = EdgeAttributesElement(json_obj=ae)
                    niceCx.add_edge_attribute(add_this_edge_att)

        #===================
        # NODE CITATIONS
        #===================
        if 'nodeCitations' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'nodeCitations')):
                for e_po in ae.get(CX_CONSTANTS.PROPERTY_OF):
                    if niceCx.get_nodes().get(e_po) is not None:
                        niceCx.add_node_citations_from_cx(ae)

        #===================
        # EDGE CITATIONS
        #===================
        ec_count = 0
        if 'edgeCitations' in available_aspects:
            for ae in (o for o in self.stream_aspect(uuid, 'edgeCitations')):
                for e_po in ae.get(CX_CONSTANTS.PROPERTY_OF):
                    if niceCx.get_edges().get(e_po) is not None:
                        niceCx.add_edge_citations_from_cx(ae)
                ec_count += 1
                if ec_count % 500 == 0:
                    print(ec_count)

        #===================
        # CITATIONS
        #===================
        if 'citations' in available_aspects:
            #======================================================
            # FILTER CITATIONS IF THERE ARE EDGE OR NODE CITATIONS
            # OTHERWISE ADD THEM ALL (NO-FILTER) -- TODO
            #======================================================
            for ae in (o for o in self.stream_aspect(uuid, 'citations')):
                add_this_citation = CitationElement(cx_fragment=ae)
                niceCx.add_citation(add_this_citation)

        #===================
        # OPAQUE ASPECTS
        #===================
        for oa in opaque_aspects:
            objects = self.stream_aspect(uuid, oa)
            obj_items = (o for o in objects)
            for oa_item in obj_items:
                aspect_element = AspectElement(oa_item, oa)
                niceCx.add_opaque_aspect(aspect_element)

    except SolrError as se:
        if '404' in str(se):
            ndex2.get_logger('SOLR').warning('Network not found ' + self.uuid +
                                             ' on ' + solr_url + ' server.')
            raise Exception("Network not found (SOLR)")
        else:
            ndex2.get_logger('SOLR').warning('Network error ' + self.uuid +
                                             ' on ' + solr_url + ' server. ' + str(se))
            raise Exception(str(se))
    except StopIteration as si:
        ndex2.get_logger('QUERY').warning("Found more than max edges. Raising exception")
        raise StopIteration(str(si))

    #nice_cx_json = niceCx.to_cx()
    return niceCx
import pysolr
import requests
import os
import sqlite3
import psycopg2
import psycopg2.extras
from gensim.parsing import preprocessing

# GLOBALS
conn = psycopg2.connect(
    "dbname=MAG19 user=mag password=1maG$ host=shetland.informatik.uni-freiburg.de"
)
cur = conn.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
solr = pysolr.Solr('http://localhost:8983/solr/arxiv_cs_metadata',
                   always_commit=True)


def search_solr_parse_json(query, collection, search_field):
    """ Searches the arxiv_cs_metadata collection on arxiv_identifier
    (search_field) using the resp. arxiv id as the query, parses the json
    result and returns it as a list of dictionaries where each dictionary
    corresponds to a record.
    ARGUMENTS: query, string: each arxiv id
               collection: the Solr collection name (=arxiv_cs_metadata)
               search_field: the Solr field which is queried (=arxiv_identifier)
    RETURNS: docs, list of dicts: the documents (records) returned by Solr
             AFTER getting the JSON response and parsing it."""
    solr_url = 'http://localhost:8983/solr/' + collection + '/select'
    url_params = {'q': query, 'rows': 1, 'df': search_field}
    solr_response = requests.get(solr_url, params=url_params)
    #print bibdir
    print '===================================='
    solrinstance.add([bibdir], commit=False)
    # Issue with loading into sh obsids.sh and all: won't we duplicate records
    # if we handle overlaps separately? Should we do it just once, or check
    # what has already been loaded to protect against this BUG?


if __name__ == "__main__":
    if len(sys.argv) == 2:
        execfile("./default.conf")
    elif len(sys.argv) == 3:
        execfile(sys.argv[2])
    else:
        print "Usage: python rdf2solr3.py biblistfile [conffile]"
        sys.exit(-1)
    c = adsrdf.ADSConnection(SESAME, REPOSITORY)
    print "cccccccccccccccccccccccccccccccccccc", c
    #researchpapers = [unquote(e.split('#')[1]) for e in c.getDataByType('cito:ResearchPaper')]
    #h = HTMLParser.HTMLParser()
    researchpapers = [ele.strip() for ele in open(sys.argv[1]).readlines()]
    print researchpapers
    #researchpapers = ['2000A&A...359..489C', '2000ApJ...534L..47G', '2000ApJ...536L..27W', '2000ApJ...540L..69S', '2000ApJ...541...49H']
    solr = pysolr.Solr(SOLR)
    #solr = None
    #researchpapers = ['2000ApJ...534L..47G', '2009ApJ...692.1143K']
    for ele in researchpapers:
        print "Indexing: ", ele
        putIntoSolr(solr, ele)
        print "-------------"
    solr.commit()
# If on Python 2.X
from __future__ import print_function

import pysolr
from django.conf import settings

from iati.models import Budget
from solr.budget.indexing import BudgetIndexing
from solr.tasks import BaseTaskIndexing

solr = pysolr.Solr(
    '{url}/{core}'.format(
        url=settings.SOLR.get('url'),
        core=settings.SOLR.get('cores').get('budget')),
    always_commit=True,
    timeout=180)


class BudgetTaskIndexing(BaseTaskIndexing):
    indexing = BudgetIndexing
    model = Budget
    solr = solr

    def run_from_activity(self, activity):
        for budget in activity.budget_set.all():
            self.instance = budget
            self.run()
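A brief usage sketch, assuming `activity` is an iati.models.Activity instance with related budgets and that BaseTaskIndexing can be instantiated without arguments (both assumptions):

# Index every budget attached to one activity through the shared client above.
# BudgetTaskIndexing().run_from_activity(activity)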