def submitDocs(docs, name):
    """Push ``docs`` to both Edfu Solr cores and print a summary line.

    The documents are added and committed to the local core first and then
    to the public core, in that order.

    :param docs: Documents handed unchanged to ``add_many``.
    :param name: Label included in the printed summary.
    """
    core_urls = (
        'http://localhost:8080/solr/edfu',
        'http://vlib.sub.uni-goettingen.de/solr/edfu',
    )
    for core_url in core_urls:
        index = solr.Solr(core_url)
        index.add_many(docs)
        index.commit()

    # One parenthesised argument prints the same text under Python 2.
    summary = str(len(docs)) + ' ' + name + u' Dokumente indexiert'
    print(summary)
def _make_solr_inst(self) -> "solr.Solr":
    """Build a ``solr.Solr`` client from this instance's connection settings.

    Reads ``self.solr_conn_addr``, ``self.solr_persistent_connection`` and
    ``self.solr_timeout``.

    :return: A newly constructed ``solr.Solr`` object.
    """
    address = self.solr_conn_addr
    connection_options = {
        "persistent": self.solr_persistent_connection,
        "timeout": self.solr_timeout,
        # "debug": True,  # This makes things pretty verbose
    }
    return solr.Solr(address, **connection_options)
def __init__(self, url_harvest, query, **query_params):
    """Open a Solr client for ``url_harvest`` and run the first ``query``.

    :param url_harvest: URL passed to the parent initializer and to
        ``solr.Solr``.
    :param query: Query passed to the parent initializer and to ``select``.
    :param query_params: Accepted but not used by this method.
    """
    super(SolrFetcher, self).__init__(url_harvest, query)
    self.solr = solr.Solr(url_harvest)  # , debug=True)
    self.query = query

    # Keep the first response and its hit count; iteration starts at 0.
    first_response = self.solr.select(self.query)
    self.resp = first_response
    self.numFound = first_response.numFound
    self.index = 0
def get(self, request, *args, **kwargs):
    """Return Solr suggester results for the ``q`` GET parameter.

    GET parameters read from ``request.GET``:

    * ``q`` -- text to get suggestions for (required).
    * ``dictionary`` -- name of the suggester to use (required).
    * ``manuscript`` -- optional, defaults to ``'*'``. It is not sent to
      Solr yet (see the TODO below), so a request without it must not fail.

    :return: An empty ``Response`` when ``q`` or ``dictionary`` is missing,
        otherwise a ``Response`` wrapping the suggester entry for ``q`` with
        its ``'suggestions'`` passed through ``_get_filtered_results``.
    """
    params = request.GET
    if 'q' not in params or 'dictionary' not in params:
        return Response()

    q = u"{0}".format(params['q'])
    dictionary = u"{0}".format(params['dictionary'])  # The suggester to use
    # Can be '*' when searching through all manuscripts.  Previously read
    # with params['manuscript'], which raised when the parameter was absent
    # even though the value is not used.
    manuscript = u"{0}".format(params.get('manuscript', u'*'))  # noqa: F841

    connection = solr.Solr(settings.SOLR_SERVER)
    search_handler = solr.SearchHandler(connection, "/suggest")

    # TODO fix solr so that the suggesters work with a context field (cfq)
    # search_results = search_handler(q=q, suggest_dictionary=dictionary,
    #                                 suggest_cfq=manuscript)
    search_results = search_handler(q=q, suggest_dictionary=dictionary)
    results = search_results.suggest[dictionary][q]

    # Remove duplicates from the suggestions and limit the return number
    # (done by _get_filtered_results).
    results['suggestions'] = self._get_filtered_results(
        results['suggestions'])
    return Response(results)
def _delete_solr_records(self, solr_base_url, core=None, query=DEFAULT_QUERY):
    """Delete every record matching ``query`` from a Solr index.

    :param solr_base_url: Base URL of the Solr server.
    :param core: Optional core name; when given it is appended to
        ``solr_base_url`` after a ``/``.
    :param query: Query passed to ``delete_query``.
    """
    solr_url = solr_base_url if core is None else solr_base_url + "/" + core
    solr_server = solr.Solr(solr_url)
    try:
        solr_server.delete_query(query)
    finally:
        # Release the connection even when the delete request fails.
        solr_server.close()
def __init__(self, request):
    """Create the Solr client and prepare the query for ``request``.

    Builds a ``solr.Solr`` client for ``settings.SOLR_SERVER``, resets the
    query state, then calls ``_parse_request()`` followed by ``_prep_q()``.

    :param request: Request object stored on ``self.request``.
    """
    self.request = request
    self.server = solr.Solr(settings.SOLR_SERVER)

    # Empty state, presumably filled in by the two calls below.
    self.parsed_request, self.solr_params = {}, {}
    self.prepared_query = ''

    self._parse_request()
    self._prep_q()
def __init__(self, type_str, uuid, solr_conn_addr, type_field, uuid_field,
             vector_field, timestamp_field, timeout=10,
             persistent_connection=False, commit_on_set=True):
    """
    Create a descriptor element whose data is stored in a Solr index.

    :param type_str: Type of descriptor. This is usually the name of the
        content descriptor that generated this vector.
    :type type_str: str

    :param uuid: Unique ID reference of the descriptor.
    :type uuid: collections.Hashable

    :param solr_conn_addr: HTTP(S) address of the Solr index to use.
    :type solr_conn_addr: str

    :param type_field: Solr index field holding the descriptor type string.
    :type type_field: str

    :param uuid_field: Solr index field holding the descriptor UUID string.
    :type uuid_field: str

    :param vector_field: Solr index field holding the descriptor vector of
        floats.
    :type vector_field: str

    :param timestamp_field: Solr index field holding floating-point UNIX
        timestamps.
    :type timestamp_field: str

    :param timeout: Timeout value handed to the ``solr.Solr`` connection
        (presumably seconds -- confirm against the client library).
    :type timeout: int

    :param persistent_connection: Maintain a connection between Solr index
        interactions.
    :type persistent_connection: bool

    :param commit_on_set: Immediately commit changes when a vector is set.
    :type commit_on_set: bool

    """
    super(SolrDescriptorElement, self).__init__(type_str, uuid)

    # Solr field names used by this element.
    self.type_field = type_field
    self.uuid_field = uuid_field
    self.vector_field = vector_field
    self.timestamp_field = timestamp_field

    self.commit_on_set = commit_on_set

    connection_options = dict(
        persistent=persistent_connection,
        timeout=timeout,
        # debug=True,  # This makes things pretty verbose
    )
    self.solr = solr.Solr(solr_conn_addr, **connection_options)
def _commit_solr(self, solr_base_url):
    """Commit pending changes on every core listed in ``CORES``.

    :param solr_base_url: Base URL of the Solr server; each core name is
        appended after a ``/``.
    """
    for core in CORES:
        solr_url = solr_base_url + "/" + core
        logging.info("Committing to Solr index: %s", solr_url)
        solr_server = solr.Solr(solr_url)
        try:
            solr_server.commit()
        finally:
            # Release the connection even when the commit fails.
            solr_server.close()
def _optimize_solr(self, solr_base_url):
    """Run an optimize request on every core listed in ``CORES``.

    :param solr_base_url: Base URL of the Solr server; each core name is
        appended after a ``/``.
    """
    for core in CORES:
        solr_url = solr_base_url + "/" + core
        logging.info("Optimizing Solr index: %s", solr_url)
        solr_server = solr.Solr(solr_url)
        try:
            solr_server.optimize()
        finally:
            # Release the connection even when the optimize request fails.
            solr_server.close()
def __init__(self, request, additional_query_params=None):
    """Create the Solr client and prepare the query for ``request``.

    Builds a ``solr.Solr`` client for ``settings.SOLR_SERVER``, stores the
    inputs, resets the query state, then calls ``_parse_request()`` followed
    by ``_prepare_query()``.

    :param request: Request object stored on ``self.request``.
    :param additional_query_params: Optional extra parameters stored on
        ``self.additional_query_params``; not interpreted here.
    """
    self.server = solr.Solr(settings.SOLR_SERVER)
    # self.query_dict = query_dict
    self.request, self.additional_query_params = (
        request, additional_query_params)

    # Empty state, presumably filled in by the two calls below.
    self.parsed_request, self.solr_params = {}, {}
    self.prepared_query = u""

    self._parse_request()
    self._prepare_query()
def __init__(self, solr_conn_addr: str, set_uuid: str,
             set_uuid_field: str, d_uid_field: str, descriptor_field: str,
             timestamp_field: str, solr_params: Dict[str, Any] = None,
             commit_on_add: bool = True, max_boolean_clauses: int = 1024,
             pickle_protocol: int = -1):
    """
    Construct a descriptor set pointing to a Solr instance.

    :param solr_conn_addr: HTTP(S) address for the Solr set to use.
    :param set_uuid: Unique ID for the descriptor set to use within the
        configured Solr set.
    :param set_uuid_field: Solr set field to store/locate set UUID value.
    :param d_uid_field: Solr set field to store/locate descriptor UUID
        values.
    :param descriptor_field: Solr set field to store the code-associated
        descriptor object.
    :param timestamp_field: Solr set field to store floating-point UNIX
        timestamps.
    :param solr_params: Dictionary of additional keyword parameters passed
        to the ``solr.Solr`` constructor. ``None`` (the default) means no
        extra parameters. NOTE(review): the previous text pointed at the
        ``pysolr`` documentation; confirm which client library ``solr``
        refers to.
    :param commit_on_add: Immediately commit changes when one or many
        descriptors are added.
    :param max_boolean_clauses: Solr instance's configured
        maxBooleanClauses configuration property (found in solrconfig.xml).
        This is needed so batch queries can be chunked without breaking the
        server. This may also be less than the Solr instance's set value.
    :param pickle_protocol: Pickling protocol to use. -1 by default
        (latest version, probably binary).

    """
    super(SolrDescriptorSet, self).__init__()

    self.set_uuid = set_uuid
    self.set_uuid_field = set_uuid_field
    self.d_uid_field = d_uid_field
    self.descriptor_field = descriptor_field
    self.timestamp_field = timestamp_field
    self.commit_on_add = commit_on_add
    self.max_boolean_clauses = int(max_boolean_clauses)
    assert self.max_boolean_clauses >= 2, "Need more clauses"
    self.pickle_protocol = pickle_protocol

    # The default of None cannot be expanded with ``**``; treat it as
    # "no extra parameters" instead of raising TypeError.
    if solr_params is None:
        solr_params = {}
    self.solr_params = solr_params
    self.solr = solr.Solr(solr_conn_addr, **solr_params)
def _check_record(self, solr_base_url, core, record_id):
    '''Check whether a record with the given id exists in a core.

    :param solr_base_url: Base URL of the Solr server.
    :param core: Core name, appended to ``solr_base_url`` after a ``/``.
    :param record_id: Value interpolated into the ``id:<record_id>`` query.
    :return: ``True`` when the query reports at least one hit, else
        ``False``.
    '''
    solr_url = solr_base_url + "/" + core
    solr_server = solr.Solr(solr_url)
    try:
        response = solr_server.select("id:%s" % record_id)
    finally:
        # Release the connection even when the query fails.
        solr_server.close()
    return response.numFound > 0
def collectionView(request, collection_id):
    """Render the search-results page for a single registry collection.

    For GET requests, reads ``q``, ``rows``, ``start``, ``view_format`` and
    one filter list per entry of ``FACET_TYPES`` from ``request.GET``,
    fetches the collection's JSON from the registry API, restricts the Solr
    query to that collection's name and renders
    ``public_interface/collectionResults.html``.

    Any other request method renders ``public_interface/searchResults.html``.

    :param request: Incoming HTTP request.
    :param collection_id: Registry collection id, concatenated into the
        registry API URL.
    """
    if request.method == 'GET':
        params = request.GET
        q = (reduce(concat_query, params.getlist('q'))
             if 'q' in params else '*:*')
        rows = params.get('rows', '16')
        start = params.get('start', '0')
        view_format = params.get('view_format', 'thumbnails')

        collection_url = ('https://registry.cdlib.org/api/v1/collection/'
                          + collection_id + '/?format=json')
        collection_api = urllib2.urlopen(collection_url)
        try:
            collection_json = collection_api.read()
        finally:
            # Close the HTTP response instead of leaving it to the GC.
            collection_api.close()
        collection_details = json.loads(collection_json)

        filters = {facet_type[0]: params.getlist(facet_type[0])
                   for facet_type in FACET_TYPES}
        filters['collection_name'] = [collection_details['name']]

        fq = solrize_filters(filters)

        # Perform the search through the shared SOLR handle.  The original
        # also built an unused solr.Solr connection to a hard-coded IP here.
        solr_response = SOLR.select(
            q=q,
            rows=rows,
            start=start,
            fq=fq,
            facet='true',
            facet_field=[facet_type[0] for facet_type in FACET_TYPES]
        )

        for item in solr_response.results:
            if 'reference_image_md5' in item:
                item['reference_image_http'] = md5_to_http_url(
                    item['reference_image_md5'])

        facets = {}
        for facet_type in FACET_TYPES:
            facets[facet_type[0]] = process_facets(
                solr_response.facet_counts['facet_fields'][facet_type[0]],
                filters[facet_type[0]]
            )

        # rows=0 is a legal "count only" request; avoid dividing by zero.
        rows_per_page = int(rows)
        if rows_per_page > 0:
            pages = int(math.ceil(
                float(solr_response.numFound) / rows_per_page))
        else:
            pages = 0

        return render(request, 'public_interface/collectionResults.html', {
            'q': q,
            'filters': filters,
            'rows': rows,
            'start': start,
            'search_results': solr_response.results,
            'facets': facets,
            'FACET_TYPES': FACET_TYPES,
            'numFound': solr_response.numFound,
            'pages': pages,
            'view_format': view_format,
            'collection': collection_details
        })

    return render(request, 'public_interface/searchResults.html',
                  {'yay': 'yamy'})
def __setstate__(self, state):
    """Restore attributes and rebuild the Solr client from ``state``.

    :param state: Mapping with the keys ``type_label``, ``uuid``,
        ``type_field``, ``uuid_field``, ``vector_field``,
        ``timestamp_field``, ``commit_on_set``, ``solr_url``,
        ``solr_persistent`` and ``solr_timeout``.
    """
    self._type_label = state['type_label']
    self._uuid = state['uuid']

    # These attributes share their name with the state key.
    same_named = ('type_field', 'uuid_field', 'vector_field',
                  'timestamp_field', 'commit_on_set')
    for attr_name in same_named:
        setattr(self, attr_name, state[attr_name])

    # A new client is constructed from the stored connection settings.
    url = state['solr_url']
    self.solr = solr.Solr(
        url,
        persistent=state['solr_persistent'],
        timeout=state['solr_timeout'],
        # debug=True  # see above
    )
def __setstate__(self, state):
    """Restore attributes and rebuild the Solr client from ``state``.

    :param state: Mapping with the keys ``uuid``, ``commit_on_add``,
        ``max_boolean_clauses``, ``field_uuid``, ``field_code``,
        ``field_descr_uuid``, ``field_descr_obj``, ``field_timestamp``,
        ``solr_url``, ``solr_persistent`` and ``solr_timeout``.
    """
    # (attribute name, state key) pairs, restored in this order.
    attr_to_state_key = (
        ('uuid', 'uuid'),
        ('commit_on_add', 'commit_on_add'),
        ('max_boolean_clauses', 'max_boolean_clauses'),
        ('idx_uuid_field', 'field_uuid'),
        ('code_field', 'field_code'),
        ('d_uid_field', 'field_descr_uuid'),
        ('descriptor_field', 'field_descr_obj'),
        ('timestamp_field', 'field_timestamp'),
    )
    for attr_name, state_key in attr_to_state_key:
        setattr(self, attr_name, state[state_key])

    # A new client is constructed from the stored connection settings.
    url = state['solr_url']
    self.solr = solr.Solr(url,
                          persistent=state['solr_persistent'],
                          timeout=state['solr_timeout'])
def _ensure_collection_list(doc_couch, section):
    """Wrap ``doc_couch[section]['collection']`` in a list if it is not one.

    Returns the (possibly new) list.  Raises ``KeyError`` when ``section``
    or its ``'collection'`` entry is missing.
    """
    collection = doc_couch[section]['collection']
    if not isinstance(collection, list):
        collection = [collection]
        doc_couch[section]['collection'] = collection
    return collection


def _normalize_subjects(doc_couch):
    """Rewrite ``sourceResource.subject`` as a list of dicts.

    Non-dict entries become ``{'name': entry}``.  A document without a
    subject is left untouched.  Raises ``KeyError`` when ``sourceResource``
    is missing.
    """
    source_resource = doc_couch['sourceResource']
    subject = source_resource.get('subject', None)
    if subject is None:
        # Previously this produced the bogus entry [{'name': None}].
        return
    if not isinstance(subject, list):
        subject = [subject]
    source_resource['subject'] = [
        sub if isinstance(sub, dict) else {'name': sub} for sub in subject
    ]


def main(url_solr=URL_SOLR, url_couchdb=None, couchdb_db=None):
    """Normalize every CouchDB document, save it, and push it to Solr.

    For each non-design document: the ``collection`` entries of
    ``originalRecord`` and ``sourceResource`` are made lists, subjects are
    normalized to a list of dicts, the document is saved back to CouchDB and
    then mapped and pushed to Solr.  One Solr commit is issued at the end.

    :param url_solr: URL of the Solr index.
    :param url_couchdb: CouchDB URL passed to ``get_couchdb``.
    :param couchdb_db: CouchDB database name passed to ``get_couchdb``.
    """
    solr_db = solr.Solr(url_solr)
    db = get_couchdb(url=url_couchdb, dbname=couchdb_db)
    v = couchdb_pager(db, include_docs='true')
    # update or create new solr doc for each couchdb doc
    for r in v:
        doc_couch = r.doc
        if '_design' in doc_couch['_id']:
            continue

        # Each print now shows the field its label names; the originals
        # printed sourceResource values under the wrong labels.
        try:
            print("orgRec.Collection: {}".format(
                _ensure_collection_list(doc_couch, 'originalRecord')))
        except KeyError:
            pass
        try:
            print("srcRes.Collection: {}".format(
                _ensure_collection_list(doc_couch, 'sourceResource')))
        except KeyError:
            pass
        try:
            _normalize_subjects(doc_couch)
        except KeyError:
            pass

        db.save(doc_couch)
        try:
            push_doc_to_solr(map_couch_to_solr_doc(doc_couch),
                             solr_db=solr_db)
        except TypeError as err:
            # Still best-effort, but no longer silent.
            print("SKIPPED {}: {}".format(doc_couch['_id'], err))
        else:
            print("PUSHED {} to solr".format(doc_couch['_id']))
    solr_db.commit()
def __init__(self, limit=100000):
    """Load data through ``self._load()`` and build a cKDTree over it.

    Records the elapsed time in ``self.loadtime`` and, while holding the
    module-level ``lock``, sets the module-level ``ready`` flag to ``True``.

    :param limit: Stored on ``self.limit``; not interpreted here.
    """
    global lock, ready
    started = time()

    self.limit = limit
    self.api = solr.Solr('http://localhost:8983/solr/samos')
    self.wktreg = re.compile(r'[-+]?\d*\.\d+|\d+')
    self.data = None

    # self.data is still None here; presumably _load() populates it, since
    # it is indexed right below -- verify against _load().
    loaded = self._load()
    self.size, self.total = loaded

    # python3 version of cKDTree does periodic quick sort
    # which severely hurts performance on sorted data
    # -- solr returns semi sorted data usually
    # compact_nodes barely affects performance
    # , compact_nodes=False)
    points = self.data['loc'][:self.total]
    self.tree = cKDTree(points, balanced_tree=False)

    self.loadtime = time() - started
    with lock:
        ready = True
def solr_image_paths(solr_addr, begin_time, end_time, username, password,
                     batch_size):
    """Yield the ``id`` of each matching image document, minus 5 characters.

    Matches documents with ``mainType:image`` whose ``indexedAt`` lies in
    ``[begin_time TO end_time]``.  A zero-row query fetches the hit count,
    then results are read in pages of ``batch_size``.

    :param solr_addr: Address of the Solr index.
    :param begin_time: Lower bound interpolated into the range query.
    :param end_time: Upper bound interpolated into the range query.
    :param username: HTTP user for the connection.
    :param password: HTTP password for the connection.
    :param batch_size: Number of rows requested per page.
    :return: Generator of ``doc['id'][5:]`` values.
    """
    log = logging.getLogger(__name__)
    conn = solr.Solr(solr_addr, http_user=username, http_pass=password)

    # Query for number of matching documents
    q = 'mainType:image AND indexedAt:[%s TO %s]' % (begin_time, end_time)
    num_results = conn.select(q, fields=['id'], rows=0).numFound
    log.debug("Found: %d", num_results)

    # Number of pages, rounding up for a partial last page.
    loops, leftover = divmod(num_results, batch_size)
    if leftover > 0:
        loops += 1
    log.debug("Making %d iterations", loops)

    offset = 0
    for _ in xrange(loops):
        page = conn.select(q, fields=['id'], rows=batch_size, start=offset)
        for doc in page.results:
            yield doc['id'][5:]
        offset += batch_size
def solr_connection(core):
    """
    Creates a :class:`solr:solr.Solr` connection for the core ``core``.

    The core's ping handler is requested first, so an unreachable core
    fails here rather than on first use.

    :param str core: Name of the core, appended to the configured Solr URI.
    :raises urllib2.URLError: if a ping to the core's ping handler doesn't
        succeed
    :rtype: :class:`solr:solr.Solr`
    """
    solr_uri = config.CFG.get("solr", "uri")
    core_uri = solr_uri + "/" + core
    ping_uri = core_uri + "/admin/ping"

    logger.info("Setting up a connection to %s", solr_uri)
    logger.debug("Pinging %s", ping_uri)
    # Only reachability matters; close the response so it is not left open.
    urllib2.urlopen(ping_uri).close()

    logger.debug("Connection to the Solr core at %s", core_uri)
    return solr.Solr(core_uri)
def delete_entry(self, file_id):
    """Delete the Solr entry for ``file_id`` and record the outcome.

    Called by run_remove_index_file().

    :param file_id: Id passed to the Solr delete call and used as the
        ``inscription_id`` of the status update.
    """
    log.debug('file_id, `{}`'.format(file_id))
    s = solr.Solr(self.SOLR_URL)
    try:
        response = s.delete(file_id)
        s.commit()
    finally:
        # Release the connection even when delete/commit fails.
        s.close()
    log.debug(
        'deletion-post complete; response, ```{}```'.format(response))

    # The status detail must be a string; anything else is repr()'d.
    status_str: str = (
        response if isinstance(response, str) else repr(response))
    process_status_updater.update_single_status(
        inscription_id=file_id, status='deletion-processed',
        status_detail=status_str)
def Main():
    """Index every row of the ``t_talents`` table into Solr.

    Rows are read from the SQLite database at ``d:/temp/test.db``; each row
    is added with ``commit=True``.  The number of indexed rows is printed at
    the end.  Both connections are closed even when indexing fails.
    """
    s = solr.Solr("http://localhost:8080/solr")
    try:
        con = sqlite3.connect("d:/temp/test.db")
        try:
            cursor = con.cursor()
            # cursor.execute("select * from t_talents limit 6,3")
            cursor.execute("select * from t_talents")

            count = 0
            # Iterate the cursor directly rather than loading all rows.
            for row in cursor:
                count += 1
                docn = {
                    'id': row[0],
                    'talent_name': row[1],
                    'talent_title': row[2],
                    'talent_locality': row[4],
                    'talent_industry': row[5],
                    'talent_curposition': row[6],
                    'talent_profile': row[9],
                }
                s.add(docn, commit=True)
            print('done:' + str(count))
        finally:
            con.close()
    finally:
        s.close()
def collectionsExplore(request):
    """Render the collections overview page.

    Takes the ``collection`` facet values from Solr (facet limit 10), loads
    each collection's JSON from its URL, fetches a few display items for it
    and renders ``public_interface/collections-explore.html``.

    :param request: Incoming HTTP request.
    """
    # Compiled once; the original recompiled it for every collection.
    collection_url_pattern = re.compile(
        'https://registry.cdlib.org/api/v1/collection/([0-9]+)[/]?')

    # All queries go through the shared SOLR handle.  The original also
    # built an unused solr.Solr connection to a hard-coded IP here.
    collections_solr_query = SOLR.select(
        q='*:*', rows=0, start=0, facet='true',
        facet_field=['collection'], facet_limit='10')
    solr_collections = (
        collections_solr_query.facet_counts['facet_fields']['collection'])

    collections = []
    for collection_url in solr_collections:
        collection_api = urllib2.urlopen(collection_url + "?format=json")
        try:
            collection_json = collection_api.read()
        finally:
            # Close the HTTP response instead of leaving it to the GC.
            collection_api.close()
        collection_details = json.loads(collection_json)

        # One item fewer when a description is present.
        rows = '4' if collection_details['description'] != '' else '5'

        display_items = SOLR.select(
            q='*:*',
            fields='reference_image_md5, title, id',
            rows=rows,
            start=0,
            fq=['collection: \"' + collection_url + '\"']
        )

        for item in display_items:
            if 'reference_image_md5' in item:
                item['reference_image_http'] = md5_to_http_url(
                    item['reference_image_md5'])

        # NOTE(review): match() returns None for a URL outside the registry
        # pattern and .group(1) below then raises AttributeError -- decide
        # whether such collections should be skipped.
        collection_id = collection_url_pattern.match(collection_url)

        collections.append({
            'name': collection_details['name'],
            'description': collection_details['description'],
            'slug': collection_details['slug'],
            'collection_id': collection_id.group(1),
            'display_items': display_items.results
        })

    return render(request, 'public_interface/collections-explore.html',
                  {'collections': collections})
return row.value if __name__ == "__main__": parser = argparse.ArgumentParser( description='Make csv report of indexed collections') parser.add_argument('auth_token', help='Authentication token') parser.add_argument('--solr_url', help='Solr index url') parser.add_argument('--couchdb_url', help='CouchDB url') args = parser.parse_args() solr_url = args.solr_url if args.solr_url else SOLR_URL print "SOLR_URL:{}".format(solr_url) SOLR = solr.SearchHandler( solr.Solr( solr_url, post_headers={ 'X-Authentication-Token': args.auth_token, }, ), "/query") if args.couchdb_url: cdb = get_couchdb(url_couchdb=couchdb_url, dbname='ucldc') else: cdb = get_couchdb(dbname='ucldc') collections = get_indexed_collection_list(SOLR) date_to_minute = datetime.datetime.now().strftime('%Y%m%d-%H%M') fname = 'indexed_collections-{}.csv'.format(date_to_minute) with open(fname, 'wb') as csvfile: csvwriter = UnicodeWriter(csvfile) csvwriter.writerow( ('Collection Name', 'Collection URL', 'Number in index', 'Number in couchdb', 'Number in OAC', 'Couch missing in solr', 'OAC missing in couch', 'Repository Name', 'Repository URL',
from __future__ import print_function import solr import sys import re con = solr.Solr('http://localhost:8983/solr/wikipedia_core') mlt = solr.SearchHandler(con, "/mlt") THRESHOLD = 0.9 def process(line): # replace some known utf-8 chars with ascii line = re.sub("\xe2\x80\x99", "x", line) # U+2019 (right single quotation mark) line = re.sub("\xe2\x80\x93", "-", line) # U+2013 (EN-DASH) # remove the rest of the non-ascii chars line = re.sub(r'[^\x00-\x7F]+', ' ', line) r = mlt(mlt_fl='page_text', mlt_mindf=1, mlt_mintf=1, stream_body=line) return ",".join([ d['page_name'] for d in r.results if d['score'] / r.maxScore >= THRESHOLD ]).encode('utf-8') def main(): try: for line in sys.stdin: print(process(line.strip())) except:
#! /usr/bin/env python import sys import os import iso8601 import json import solr import urllib import urllib2 from utils import solr_escape, read_blob, get_accounts_for_blob, get_filters import logging logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) solrMessages = solr.Solr('http://localhost:8983/solr/messages') ROWS = 100 INBOX_LABEL = "inbox" def run_filters(account_hash, message_id, query_label_pairs): labels = set() skip_inbox = False for query, label, _skip_inbox in query_label_pairs: caged_query = '+id:%s +(%s) +account:%s -labels:"%s"' % ( message_id, query, account_hash, solr_escape(label))
def search(keyword, value):
    """Run the query ``keyword:value`` and print the response and each hit.

    :param keyword: Field name placed before the colon.
    :param value: Value placed after the colon.
    """
    s = solr.Solr("http://localhost:8983/solr/minor")
    try:
        response = s.select(keyword + ':' + value)
    finally:
        # The original never closed the connection.
        s.close()

    # Single-argument print(...) works the same on Python 2 and 3, unlike
    # the former ``print i`` statement.
    print(response)
    for hit in response.results:
        print(hit)
def __init__(self):
    """Read the field mapping and create the Solr client from ``settings``.

    ``self.mapping`` holds the items of ``settings['SOLR_MAPPING']`` and
    ``self.solr`` is a client for ``settings['SOLR_URL']``.
    """
    field_mapping = settings['SOLR_MAPPING']
    self.mapping = field_mapping.items()

    solr_url = settings['SOLR_URL']
    self.solr = solr.Solr(solr_url)
def connection(self):
    """Return the Solr connection for this object's core, creating it once.

    If ``self._connection`` is already set (truthy) it is returned as is.
    Otherwise a connection to ``settings.SOLR_HOST + '/' + self.core`` is
    created and stored on ``self._connection``.  The original returned a
    new connection on every call without storing it.
    """
    if not self._connection:
        self._connection = solr.Solr(settings.SOLR_HOST + '/' + self.core)
    return self._connection
# You should have received a copy of the GNU General Public License # along with Invenio; if not, write to the Free Software Foundation, Inc., # 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. """ Solr utilities. """ import itertools from invenio.config import CFG_SOLR_URL from intbitset import intbitset from invenio.ext.logging import register_exception if CFG_SOLR_URL: import solr conn = solr.Solr(CFG_SOLR_URL) SOLR_CONNECTION = solr.SolrConnection(CFG_SOLR_URL) # pylint: disable=E1101 SOLR_MLT_CONNECTION = solr.SearchHandler(conn, "/mlt") BOOLEAN_EQUIVALENTS = {"+": "AND", "|": "OR", "-": "NOT"} def get_collection_filter(hitset, cutoff_amount): # The last n hitset records are considered to be newest and therfore most relevant start_index = len(hitset) - cutoff_amount if start_index < 0: start_index = 0 it = itertools.islice(hitset, start_index, None) ids = ' '.join([str(recid) for recid in it]) if ids:
# coding:utf-8
"""Small demo script: add one document to a local Solr and search for it."""
import solr
import sys

# Python 2 only: reload(sys) makes setdefaultencoding available again so the
# process-wide default encoding can be switched to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
print(sys.getdefaultencoding())

# create a connection to a solr server
conn = solr.Solr('http://localhost:8080/solr', timeout=1000)

# add a document to the index, echoing each field first
tdoc = {"id": 3, "title": "Lucene in Action"}
for k in tdoc:
    print("dict[%s] = %s" % (k, tdoc[k]))
conn.add(tdoc)
conn.commit()

# do a search and print every hit
response = conn.select('Lucene')
for hit in response.results:
    print(hit)