def delete(self, id):
    '''
    @see: IArticleSearchProvider.delete()

    Deletes the entry identified by ``id`` from the Solr endpoint obtained by
    appending 'article' to ``self.solr_server_url`` and commits right away.

    @param id: identifier of the entry to remove; converted with str() before
        being handed to Solr.
    '''
    endpoint = 'http://%s%s' % (self.solr_server_url, 'article')
    solr = SolrInterface(endpoint)
    solr.delete(str(id))
    solr.commit()
def update(self, metaInfo, metaData):
    '''
    @see: ISearchProvider.update()

    Builds a single Solr document out of the meta info / meta data pair, adds
    it to the endpoint named after ``metaData.Type`` and commits.

    @param metaInfo: object providing Id, Language and, optionally, the
        attributes named like the Solr schema fields.
    @param metaData: object providing Id, Type and, optionally, the attributes
        named like the Solr schema fields.
    '''
    solr = SolrInterface('http://%s%s' % (self.solr_server_url, metaData.Type))

    document = {
        "MetaInfoId": metaInfo.Id,
        "MetaDataId": metaData.Id,
        "languageId": metaInfo.Language,
    }

    # custom processing on some fields
    # NOTE(review): the key is spelled 'CreationData_Year' (not
    # 'CreationDate_Year'); kept as is, confirm against the Solr schema.
    dateField = 'CreationDate'
    if hasattr(metaInfo, dateField) and getattr(metaInfo, dateField):
        document['CreationData_Year'] = getattr(metaInfo, dateField).year

    # Every schema field is filled from the first source that has a truthy
    # attribute of that name; the meta info takes precedence over the meta data.
    for field in solr.schema.fields:
        for source in (metaInfo, metaData):
            if hasattr(source, field) and getattr(source, field):
                document[field] = getattr(source, field)
                break

    solr.add(document)
    solr.commit()
def delete(self, idMetaInfo, metaType):
    '''
    @see: ISearchProvider.delete()

    Deletes the entry identified by ``idMetaInfo`` from the Solr endpoint
    obtained by appending ``metaType`` to ``self.solr_server_url`` and commits
    right away.

    @param idMetaInfo: identifier of the entry to remove; converted with str()
        before being handed to Solr.
    @param metaType: suffix appended to the server URL to select the endpoint.
    '''
    endpoint = 'http://%s%s' % (self.solr_server_url, metaType)
    solr = SolrInterface(endpoint)
    solr.delete(str(idMetaInfo))
    solr.commit()
def make_connection():
    """Return a SolrInterface for the configured Solr URL.

    The URL is read from the 'adhocracy.solr.url' setting (defaulting to a
    local Solr), stripped of surrounding whitespace and given a trailing
    slash when it lacks one. A fresh Http object is used as the connection.
    """
    base_url = config.get('adhocracy.solr.url',
                          'http://localhost:8983/solr/').strip()
    if not base_url.endswith('/'):
        base_url += '/'
    return SolrInterface(base_url, http_connection=Http())
def run(self, solr_id):
    """ Delete one record on Solr and commit the deletion.

    The Solr location is taken from ``self.backend_record.location`` and
    encoded as UTF-8 before the interface is created.

    :param solr_id: identifier of the record to delete
    :return: translated confirmation message containing ``solr_id``
    """
    location = self.backend_record.location.encode('utf-8')
    solr = SolrInterface(location)  # TODO auth
    solr.delete(solr_id)
    solr.commit()
    return _('Record %s deleted on Solr') % solr_id
def get_sunburnt_connection():
    """Return a SolrInterface opened with mode='r' for the configured Solr URL.

    The URL is read from the pylons 'adhocracy.solr.url' setting (defaulting
    to a local Solr), stripped of surrounding whitespace and given a trailing
    slash when it lacks one.
    """
    # Imported here, as in the original, rather than at module level.
    from pylons import config

    base_url = config.get('adhocracy.solr.url',
                          'http://localhost:8983/solr/').strip()
    if not base_url.endswith('/'):
        base_url += '/'
    return SolrInterface(base_url, http_connection=Http(), mode='r')
def make_connection():
    """Return a SolrInterface for the configured Solr URL.

    The URL comes from the 'adhocracy.solr.url' setting (defaulting to a
    local Solr), is stripped of surrounding whitespace and gets a trailing
    slash when it lacks one. When 'adhocracy.force_no_http_proxy' is set,
    the Http object is created with ``proxy_info=None``.
    """
    base_url = config.get_string('adhocracy.solr.url',
                                 'http://localhost:8983/solr/').strip()
    if not base_url.endswith('/'):
        base_url += '/'

    http_options = {}
    if config.get_bool('adhocracy.force_no_http_proxy'):
        http_options['proxy_info'] = None

    return SolrInterface(base_url, http_connection=Http(**http_options))
def add(self, url, doc):
    """Queue ``doc`` for the Solr endpoint at ``url``.

    Each pool entry is a three-item list: the SolrInterface for the URL, a
    counter of queued documents and the list of documents not yet sent.
    The pending list is sent whenever the counter reaches a multiple of
    NUMBER_OF_DOCS_PER_ADD; once the counter exceeds
    NUMBER_OF_DOCS_PER_COMMIT a commit is issued and the counter restarts.

    :param url: URL of the Solr endpoint, also the key in the pool
    :param doc: document to export; its 'id' is used in the returned message
    :return: translated confirmation message containing ``doc['id']``
    """
    # NOTE(review): layout rebuilt from a flattened source; the commit check
    # is assumed to sit at the same level as the flush check, not inside it.
    entry = self.__solr_pool.get(url)
    if not entry:
        entry = [SolrInterface(url), 0, []]
        self.__solr_pool[url] = entry

    solr = entry[0]
    entry[2].append(doc)
    entry[1] += 1

    if entry[1] % NUMBER_OF_DOCS_PER_ADD == 0:
        # NOTE: Solr itself will also auto-commit after some time
        solr.add(entry[2])
        entry[2] = []

    if entry[1] > NUMBER_OF_DOCS_PER_COMMIT:
        solr.commit()
        entry[1] = 0

    return _('Record exported with ID %s on SolR.') % doc['id']
def _ingest_speaker_metadata(si, query_chamber, metadata_chamber,
                             page_size=10000):
    """Page through the speeches of one chamber and push their metadata.

    For every speech id returned by Solr, get_speaker_metadata() is asked for
    a partial document; the non-empty ones of a page are passed to
    update_solr2() and followed by commit_solr().

    :param si: SolrInterface used for the queries
    :param query_chamber: value of the ``chamber`` field to query for
    :param metadata_chamber: chamber name handed to get_speaker_metadata()
    :param page_size: number of speeches fetched per Solr request
    """
    # Single-argument print(...) gives the same output whether it is parsed
    # as a Python 2 statement or as a Python 3 function call.
    numFound = si.query(chamber=query_chamber).paginate(
        rows=0, start=0).execute().result.numFound
    print("-----------------------")
    print("Number of Speeches about Topic X in " + query_chamber + " "
          + str(numFound))

    # Integer ceiling division. The previous math.ceil(numFound/10000)
    # floored first under Python 2, so the last partial page was skipped
    # (and nothing at all was processed below one full page).
    page_count = (numFound + page_size - 1) // page_size

    for page in range(page_count):
        current_speeches = si.query(chamber=query_chamber).field_limit(
            ["id"]).paginate(
                rows=page_size, start=page_size * page).execute().result.docs

        json_documents = []
        for speech in current_speeches:
            partial_document = get_speaker_metadata(
                id=speech['id'], chamber=metadata_chamber)
            if partial_document:
                print(speech['id'] + " queued to be ingested")
                json_documents.append(partial_document)

        # Any non-empty batch is sent; the former "> 1" test silently dropped
        # a page that produced exactly one document.
        if json_documents:
            body = update_solr2(json_documents)[1]
            print(len(json_documents))
            print(body)
            print(commit_solr())


def main():
    """Ingest speaker metadata for the senate and then the house speeches."""
    solr_url = "http://politicalframing.com:8983/solr/collection1"
    h = httplib2.Http(cache="/var/tmp/solr_cache")
    si = SolrInterface(url=solr_url, http_connection=h)

    # The chamber is queried in lower case but passed capitalised to
    # get_speaker_metadata(), exactly as before.
    _ingest_speaker_metadata(si, 'senate', 'Senate')
    _ingest_speaker_metadata(si, 'house', 'House')
def processQuery(self, session, scheme, qa=None, qi=None, qd=None):
    '''
    Creates the solr query based on received REST queries.

    Each provided query object is folded into the solr query through
    buildSolrQuery, which also collects clauses into a shared list; those
    clauses are OR-ed together and applied as one extra restriction. The
    result is passed through buildShards together with the types to search,
    which default to every type known to the query indexer unless qa
    carries its own type values.

    @param session: not used by this method.
    @param scheme: not used by this method.
    @param qa: QMetaDataInfo|None
    @param qi: query object or None
    @param qd: query object or None
    @return: the value returned by buildShards for the assembled query.
    '''
    si = SolrInterface('http://%sother' % self.solr_server_url)

    typesByMetaData = self.queryIndexer.typesByMetaData
    types = [typesByMetaData[key] for key in typesByMetaData.keys()]

    solrQuery = None
    orClauses = []

    if qa is not None:
        assert isinstance(qa, QMetaDataInfo), 'Invalid query %s' % qa
        solrQuery = buildSolrQuery(si, solrQuery, qa, orClauses)
        if QMetaDataInfo.type in qa:
            types = qa.type.values

    for extra in (qi, qd):
        if extra is not None:
            solrQuery = buildSolrQuery(si, solrQuery, extra, orClauses)

    if orClauses:
        combined = None
        for clause in orClauses:
            # Truthiness (not "is None") decides whether to start over, as in
            # the original accumulation.
            combined = (combined | clause) if combined else clause
        target = si if solrQuery is None else solrQuery
        solrQuery = target.query(combined)

    if solrQuery is None:
        solrQuery = si.query()

    return buildShards(solrQuery, self.solr_server_url, types)
# getCollectionsFromSolr20140919 revised to cover several cases and to report progress # VT CS4984, Computational Linguistics, by Xuan Zhang, Tarek Kanan, Edward Fox import os from sunburnt import SolrInterface si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr") # This is where you put the event name eventQuery = "Brazil_NightClub_Fire" # Commented out lines support the special handling when there are spaces in the event name. # eventQuery = "Connecticut School Shooting" # This is where you put the downloaded files #root = 'D:\Test\EventCollections\SmallCollections' # Or, for a Mac, use something like root = '/Users/mzamani/Documents/CS4984/Unit2/Brazil_NightClub_Fire' # Create and execute a Solr query words = eventQuery.split() query = si.query(event=words[0]) for w in words[1:]: query = query.query(event=w) response = query.execute() # Or, for the case of spaces in the name: # response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute() tot = response.result.numFound #print response.result.numFound print tot, "documents found in collection [", eventQuery, "]\n"
# Dumps the `content` field of every Solr document matching an event into
# numbered text files (1.txt, 2.txt, ...) in the current directory.
#
# Usage: <script> EVENT
import sys

from sunburnt import SolrInterface

if len(sys.argv) < 2:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("usage: %s EVENT" % sys.argv[0])

si = SolrInterface("http://nick.dlib.vt.edu:8080/solr")
eventQuery = sys.argv[1]

# First request: only used to learn how many documents match.
response = si.query(event=eventQuery).execute()
tot = response.result.numFound

# Second request: fetch the content of all matching documents in one page.
response = si.query(event=eventQuery).field_limit(
    ["content"]).paginate(0, tot).execute()
print(response.result.numFound)

for i, res in enumerate(response, 1):
    # `with` closes the file even when the write (or the encoding) fails.
    with open(str(i) + ".txt", "w") as f:
        f.write(res['content'].encode("utf-8"))

si.commit()
def get_solr(url=None):
    """ Return a `SolrInterface` instance.

    :param url: Solr URL to connect to; when None, the `kotti_solr.solr_url`
        setting is used instead.
    """
    solr_url = get_settings()['kotti_solr.solr_url'] if url is None else url
    return SolrInterface(solr_url)
This repo has a YAML file we might be able to use https://github.com/unitedstates/congress-legislators It seems the original parser has been improved (or maybe just migrated) https://github.com/unitedstates/congressional-record/blob/master/congressionalrecord/fdsys/cr_parser.py """ import httplib2 from sunburnt import SolrInterface from dateutil import parser from datetime import datetime solr_url = "http://politicalframing.com:8983/solr" # "http://localhost:8983/solr/" h = httplib2.Http(cache="/var/tmp/solr_cache") si = SolrInterface(url=solr_url, http_connection=h) def get_speeches(rows, start, dabool, **kwargs): query = {} neg_query = {} if kwargs.get('speech_id'): query['id'] = kwargs['speech_id'] if kwargs.get('phrase'): query['speaking'] = kwargs['phrase'] if kwargs.get('congress'): query['congress'] = kwargs['congress'] kwargs['start_date'] = parser.parse( kwargs['start_date']) if kwargs.get('start_date') else datetime( 1994, 1, 1) kwargs['end_date'] = parser.parse( kwargs['end_date']) if kwargs.get('end_date') else datetime.now()
# Speakers that are excluded from every query below.
_EXCLUDED_SPEAKERS = (
    "recorder",
    "the presiding officer",
    "the vice president",
    "the speaker pro tempore",
    "the acting president pro tempore",
)

# Report line shared by all counts; %s is the collection or chamber name.
_COUNT_MESSAGE = (
    "Number of Speeches in %s without a Speaker Party that isn't recorder, "
    "the presiding officer, vice president, or the speaker pro tempore, "
    "or the acting president pro tempore ")

_CHAMBERS = ('Senate', 'House', 'Extensions')


def _speeches_without_party(si, **criteria):
    """Return the query for speeches matching ``criteria`` that have no
    speaker_party and whose speaker_raw is none of _EXCLUDED_SPEAKERS."""
    query = si.query(**criteria).exclude(speaker_party="*")
    for speaker in _EXCLUDED_SPEAKERS:
        query = query.exclude(speaker_raw=speaker)
    return query


def _count_speeches_without_party(si, **criteria):
    """Return numFound for _speeches_without_party() without fetching rows."""
    return _speeches_without_party(si, **criteria).sort_by(
        "speaker_raw").paginate(rows=0, start=0).execute().result.numFound


def _ingest_missing_parties(si, chamber, page_size=100000):
    """Look up speaker metadata for one chamber's speeches and push it.

    Every speech is passed to get_speaker_metadata(); the non-empty results
    of a page are handed to update_solr2() and followed by commit_solr().

    :param si: SolrInterface used for the queries
    :param chamber: value of the ``chamber`` field, also passed on to
        get_speaker_metadata()
    :param page_size: number of speeches fetched per Solr request
    """
    # Single-argument print(...) gives the same output whether it is parsed
    # as a Python 2 statement or as a Python 3 function call.
    numFound = _count_speeches_without_party(si, chamber=chamber)
    print("-----------------------")
    print((_COUNT_MESSAGE % chamber) + str(numFound))

    # Integer ceiling division. The previous math.ceil(numFound / 100000)
    # floored first under Python 2, so the last partial page was skipped
    # (and nothing at all was processed below 100000 matches).
    page_count = (numFound + page_size - 1) // page_size

    # NOTE(review): the query filters on a missing speaker_party while the
    # loop commits updates between pages; if those updates set speaker_party,
    # later offsets may skip speeches. Only matters for more than one page;
    # confirm what update_solr2 writes.
    for page in range(page_count):
        current_speeches = _speeches_without_party(
            si, chamber=chamber).field_limit(
                ["id", "speaker_raw", "congress", "date"]).sort_by(
                    "speaker_raw").paginate(
                        rows=page_size,
                        start=page_size * page).execute().result.docs

        json_documents = []
        for speech in current_speeches:
            partial_document = get_speaker_metadata(
                id=speech['id'],
                date=speech['date'],
                congress=speech['congress'],
                speaker=speech['speaker_raw'],
                chamber=chamber)
            print(speech['id'])
            if partial_document:
                print(speech['speaker_raw'] + " queued to be ingested")
                json_documents.append(partial_document)

        # Any non-empty batch is sent; the former "> 1" test silently dropped
        # a page that produced exactly one document.
        if json_documents:
            body = update_solr2(json_documents)[1]
            print(len(json_documents))
            print(body)
            print(commit_solr())


def main():
    """Report how many speeches lack a speaker party, then ingest metadata.

    Prints the overall count and the count per chamber (plus their sum), then
    processes the Senate, House and Extensions speeches in that order.
    """
    solr_url = "http://politicalframing.com:8983/solr"
    h = httplib2.Http(cache="/var/tmp/solr_cache")
    si = SolrInterface(url=solr_url, http_connection=h)

    totalNumFound = _count_speeches_without_party(si, **{"*": "*"})
    print((_COUNT_MESSAGE % "Solr") + str(totalNumFound))

    chamber_total = 0
    for chamber in _CHAMBERS:
        chamber_count = _count_speeches_without_party(si, chamber=chamber)
        print((_COUNT_MESSAGE % chamber) + str(chamber_count))
        chamber_total += chamber_count
    print("Sum: " + str(chamber_total))

    print("-----------------------")
    print("-----------------------")

    for chamber in _CHAMBERS:
        _ingest_missing_parties(si, chamber)