def processQuery(self, session, scheme, qa=None, qi=None, qd=None):
    '''
    Creates the solr query based on received REST queries.

    @param session: request session (unused here, kept for the REST signature)
    @param scheme: request scheme (unused here, kept for the REST signature)
    @param qa: optional QMetaDataInfo query; may also narrow the shard types
    @param qi: optional query object folded into the same Solr query
    @param qd: optional query object folded into the same Solr query
    @return: the sharded sunburnt query object
    '''
    # NOTE(review): '%sother' produces e.g. 'http://<server>other' -- looks
    # like a core/collection suffix appended to the base URL; confirm intended.
    si = SolrInterface('http://%sother' % self.solr_server_url)
    # Default shard types: every type known to the indexer (may be narrowed
    # below when qa carries an explicit type filter).
    types = [self.queryIndexer.typesByMetaData[key] for key in self.queryIndexer.typesByMetaData.keys()]
    solrQuery = None
    # buildSolrQuery appends OR-able clauses here as a side effect.
    orClauses = []
    if qa is not None:
        assert isinstance(qa, QMetaDataInfo), 'Invalid query %s' % qa
        solrQuery = buildSolrQuery(si, solrQuery, qa, orClauses)
        if QMetaDataInfo.type in qa:
            types = qa.type.values
    if qi is not None:
        solrQuery = buildSolrQuery(si, solrQuery, qi, orClauses)
    if qd is not None:
        solrQuery = buildSolrQuery(si, solrQuery, qd, orClauses)
    if orClauses:
        # Fold the collected clauses into a single OR expression.
        extend = None
        for clause in orClauses:
            if extend:
                extend = extend | clause
            else:
                extend = clause
        if solrQuery is None:
            solrQuery = si.query(extend)
        else:
            solrQuery = solrQuery.query(extend)
    # No filters at all: fall back to a match-everything query.
    if solrQuery is None:
        solrQuery = si.query()
    solrQuery = buildShards(solrQuery, self.solr_server_url, types)
    return solrQuery
def _do_search(self, sort_by=None):
    """Execute the full-text search and return the transformed result dicts.

    Builds an OR query over self.searchable terms, applies the realm /
    author / trac filters when present, sorts, paginates, requests
    highlighting, then injects boost-query (bq) options by hand before
    dispatching the raw search.
    """
    si = SolrInterface(self.solr_endpoint)
    # OR together one Q per searchable term.
    searchquery = si.Q(*[si.Q(s) for s in self.searchable])
    query = si.query(searchquery).field_limit(score=True)
    # Each filter is optional; only apply the ones that produced a clause.
    realm_query = self._build_realm_filter(si)
    if realm_query:
        query = query.filter(realm_query)
    author_query = self._build_author_filter(si)
    if author_query:
        query = query.filter(author_query)
    trac_query = self._build_trac_filter(si)
    if trac_query:
        query = query.filter(trac_query)
    for field in sort_by or []:
        query = query.sort_by(field)
    query = query.paginate(start=self.page_start, rows=self.page_size)\
        .highlight('oneline', **{'simple.pre':'<span class="highlight">',
                                 'snippets': 3,
                                 'fragsize': 600,
                                 'simple.post':'</span>'})
    # Boosting -- super hacky, but sunburnt does not support the 'bq'
    # (boost query) parameter, so we patch it into the raw options and
    # dispatch the search ourselves instead of calling query.execute().
    options = query.options()
    options['bq'] = ['realm:ticket^999','status:new^100', 'status:assigned^100', 'status:reopened^999', 'status:reviewing^100', 'status:accepted^100','(*:* -xxx)^999']
    result = query.interface.search(**options)
    return query.transform_result(result, dict)
def main(): solr_url = "http://politicalframing.com:8983/solr/collection1" h = httplib2.Http(cache="/var/tmp/solr_cache") si = SolrInterface(url = solr_url, http_connection = h) # chamber = 'Senate' # print commit_solr() numFound = si.query(chamber='senate').paginate(rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches about Topic X in senate " + str(numFound) for i in range(0, int(math.ceil(numFound/10000))): current_speeches = si.query(chamber='senate').field_limit(["id"]).paginate(rows=10000, start=10000*i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata(id=speech['id'], chamber='Senate') if partial_document: print speech['id'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr() numFound = si.query(chamber='house').paginate(rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches about Topic X in house " + str(numFound) for i in range(0, int(math.ceil(numFound/10000))): current_speeches = si.query(chamber='house').field_limit(["id"]).paginate(rows=10000, start=10000*i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata(id=speech['id'], chamber='House') if partial_document: print speech['id'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr()
def _retrieve_address_from_history(original_source):
    """Look up a previously geocoded source in the Solr geodata history.

    Returns a ('lat,lng', determined_source) tuple, or (None, None) when the
    id has never been recorded.
    """
    solr = SolrInterface(endpoints.solr + '/dealschrome/geodata')
    hits = solr.query(id=original_source).execute()
    if not len(hits):
        return (None, None)
    first = hits[0]
    lat = str(first['latlng'][0])
    lng = str(first['latlng'][1])
    return (lat + ',' + lng, first['determined_source'])
def processQuery(self, session, scheme, qa=None, qi=None, qd=None):
    '''
    Builds the combined Solr query for the received REST sub-queries and
    attaches the shard routing for the applicable meta-data types.
    '''
    si = SolrInterface('http://%sother' % self.solr_server_url)
    # Start with every indexed type; qa may narrow this set below.
    types = [self.queryIndexer.typesByMetaData[key] for key in self.queryIndexer.typesByMetaData.keys()]
    query = None
    # buildSolrQuery collects OR-able clauses into this list as a side effect.
    pending_or = []

    if qa is not None:
        assert isinstance(qa, QMetaDataInfo), 'Invalid query %s' % qa
        query = buildSolrQuery(si, query, qa, pending_or)
        if QMetaDataInfo.type in qa:
            types = qa.type.values

    for sub in (qi, qd):
        if sub is not None:
            query = buildSolrQuery(si, query, sub, pending_or)

    if pending_or:
        # Fold the pending clauses into one OR expression.
        combined = None
        for clause in pending_or:
            combined = (combined | clause) if combined else clause
        query = si.query(combined) if query is None else query.query(combined)

    if query is None:
        # Nothing filtered anything: match everything.
        query = si.query()

    return buildShards(query, self.solr_server_url, types)
si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr") # This is where you put the event name eventQuery = "Brazil_NightClub_Fire" # Commented out lines support the special handling when there are spaces in the event name. # eventQuery = "Connecticut School Shooting" # This is where you put the downloaded files #root = 'D:\Test\EventCollections\SmallCollections' # Or, for a Mac, use something like root = '/Users/mzamani/Documents/CS4984/Unit2/Brazil_NightClub_Fire' # Create and execute a Solr query words = eventQuery.split() query = si.query(event=words[0]) for w in words[1:]: query = query.query(event=w) response = query.execute() # Or, for the case of spaces in the name: # response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute() tot = response.result.numFound #print response.result.numFound print tot, "documents found in collection [", eventQuery, "]\n" print "Retrieving documents...\n" response = si.query(event=eventQuery).paginate(0, tot).execute() # Or, for the case of spaces in the name: # response = si.query(event="Connecticut").query(event="School").query(event="shooting").paginate(0,tot).execute()
def search_tiles(request, collection_id=None, version=None, variant=None):
    """Django view: run a phrase search over tiles, with optional collection /
    education-level / class filters, paginate the results and record query,
    request and Solr metrics along the way."""
    results = None
    solr_metric = None
    subset_query = None
    query = None
    ed_class = None
    ed_level = None
    if request.GET:
        #if 'level' in request.GET:
        ed_level = request.GET.get('level')
        #if 'class' in request.GET:
        ed_class = request.GET.get('class')
        if 'q' in request.GET:
            query = remove_quotation_marks(request.GET.get('q'))
    if query:
        # Scope the search to one collection version/variant when given.
        subset = ''
        if collection_id:
            subset = str(collection_id) + '/' + str(version) + '/' + str(variant).split('-')[0]
        page = request.GET.get('p')
        # NOTE(review): time.clock() is deprecated (removed in Python 3.8);
        # time.perf_counter() is the modern equivalent.
        start_time = time.clock()
        setting = solr_switcher()
        # Prefix match on collectionid, unless both facet filters are set.
        if subset and not (ed_level and ed_class):
            subset_query = "%s*" % subset
        # Metrics: user query, client request, and the Solr round-trip,
        # chained together via md5 hashes.
        query_metric = QueryMetric(request)
        register_user_query(query_metric)
        request_metric = RequestMetric(request, query_hash=query_metric.get_md5())
        register_client_metric(request_metric)
        solr_metric = SolrMetric(request_hash=request_metric.get_md5(),
                                 query_hash=query_metric.get_md5())
        try:
            solr_interface = SolrInterface(setting.SOLR_MAIN_URL)
            solr_interface.conn.request_handler_name('search')
            # Quote the query to search it as a phrase.
            results = solr_interface.query(solr_interface.Q('"' + query + '"'))
            if subset_query:
                results = results.filter(collectionid=subset_query)
            if ed_level:
                results = results.filter(collection_school_type_code=ed_level)
            if ed_class:
                results = results.filter(collection_ep_class=ed_class)
            results = results.filter(published=True)
        except SolrError:
            # NOTE(review): this stores the exception *class*, not the caught
            # instance -- confirm whether 'except SolrError as e' was intended.
            solr_metric.solr_error = SolrError
        if results is not None:
            pages = Paginator(results, PAGINATION_ROWS)
            try:
                results = pages.page(page)
            except PageNotAnInteger:
                results = pages.page(1)
            except EmptyPage:
                results = pages.page(pages.num_pages)
            # Decorate the page object with extra attributes for the template.
            results.num_pages = pages.num_pages
            # NOTE(review): pages._count is a private Paginator attribute;
            # pages.count is the public equivalent.
            results.total_count = pages._count
            results.processing_time = time.clock() - start_time
            if solr_metric:
                if results.number < pages.num_pages:
                    solr_metric.next_page = results.number + 1
                solr_metric.num_pages = pages.num_pages
                if results.number > 1:
                    solr_metric.prev_page = results.number - 1
                solr_metric.page = results.number
                solr_metric.total_count = pages._count
                solr_metric.processing_time = results.processing_time
            # get post and pre pages (a window of PAGES_OFFSET around current)
            max_count = pages.num_pages if pages.num_pages < results.number + PAGES_OFFSET else results.number + PAGES_OFFSET
            results.post_pages = range(results.number + 1, max_count + 1)
            results.pre_pages = [x for x in range(results.number - PAGES_OFFSET, results.number) if x > 0]
            if solr_metric:
                solr_metric.request_time = time.clock() - start_time
                register_solr_metric(solr_metric)
    else:
        results = None
    return render(request, 'search_tiles.html',
                  {'results': results, 'solr_metric': solr_metric,
                   'query': query, 'collection_id': collection_id,
                   'variant': variant, 'version': version,
                   'level': ed_level, 'class': ed_class,
                   'chosen_education_level': ed_level,
                   'chosen_level': ed_class})
from sunburnt import SolrInterface import sys si = SolrInterface("http://nick.dlib.vt.edu:8080/solr") eventQuery = sys.argv[1] response = si.query( event=eventQuery).execute() tot = response.result.numFound response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute() docs = {} print response.result.numFound i = 1 for res in response: f = open(str(i) + ".txt","w") f.write(res['content'].encode("utf-8")) f.close() i+=1 si.commit()
def dataset_import_data(dataset_id): """ Import a dataset into Solr. """ from redd.models import Dataset log = logging.getLogger('redd.tasks.dataset_import_data') log.info('Beginning import, dataset_id: %i' % dataset_id) dataset = Dataset.objects.get(id=dataset_id) solr = SolrInterface(settings.SOLR_ENDPOINT) #solr_fields = [] #for h, t in dataset.schema: # if t == 'NoneType': # solr_fields.append(None) # else: # solr_fields.append('%s_%s' % (h, t.__name__)) reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r')) reader.next() add_buffer = [] normal_type_exceptions = [] for i, row in enumerate(reader, start=1): data = {} typing="""for t, header, field, value in izip(normal_types, headers, solr_fields, row): try: value = normalize_column_type([value], normal_type=t)[1][0] except InvalidValueForTypeException: # Convert exception to row-specific error normal_type_exceptions.append(InferredNormalFalsifiedException(i, header, value, t)) continue # No reason to send null fields to Solr (also sunburnt doesn't like them) if value == None: continue if t in [unicode, bool, int, float]: if value == None: continue data[field] = value elif t == datetime: data[field] = value.isoformat() elif t == date: pass elif t == time: pass else: # Note: if NoneType should never fall through to here raise TypeError('Unexpected normal type: %s' % t.__name__)""" # If we've had a normal type exception, don't bother do the rest of this if not normal_type_exceptions: data = { 'id': uuid4(), 'dataset_id': dataset.id, 'row': i, 'full_text': '\n'.join(row), 'csv_data': json.dumps(row) } add_buffer.append(data) if i % SOLR_ADD_BUFFER_SIZE == 0: solr.add(add_buffer) add_buffer = [] if add_buffer: solr.add(add_buffer) add_buffer = [] if not normal_type_exceptions: solr.commit() else: # Rollback pending changes solr.delete(queries=solr.query(dataset_id=dataset.id)) for e in normal_type_exceptions: print e log.info('Finished import, dataset_id: %i' % dataset_id)
class SolrBackend(Component):
    """Apache Solr implementation of the Bloodhound search backend."""

    implements(ISearchBackend)

    UNIQUE_ID = "unique_id"

    # Fields for which Solr highlighting snippets are requested.
    HIGHLIGHTABLE_FIELDS = {"unique_id": True, "id": True, "type": True,
                            "product": True, "milestone": True,
                            "author": True, "component": True,
                            "status": True, "resolution": True,
                            "keywords": True, "summary": True,
                            "content": True, "changes": True, "owner": True,
                            "repository": True, "revision": True,
                            "message": True, "name": True}

    server_url = Option(
        BHSEARCH_CONFIG_SECTION,
        'solr_server_url',
        doc="""Url of the server running Solr instance.""",
        doc_domain='bhsearch')

    def __init__(self):
        self.solr_interface = SolrInterface(str(self.server_url))

    def add_doc(self, doc, operation_context=None):
        """Normalize, uniquely identify and index one document, committing
        immediately."""
        self._reformat_doc(doc)
        doc[self.UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                     doc["type"], doc["id"])
        self.solr_interface.add(doc)
        self.solr_interface.commit()

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        """Remove the document identified by (product, type, id).

        BUG FIX: 'self' was missing from the signature even though the body
        references self, so any bound call mis-bound the instance to
        'product' and then raised NameError on 'self'.
        """
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.solr_interface.delete(unique_id)

    def optimize(self):
        """Trigger a Solr index optimize."""
        self.solr_interface.optimize()

    def query(self, query, query_string, sort=None, fields=None, filter=None,
              facets=None, pagenum=1, pagelen=20, highlight=False,
              highlight_fields=None, context=None):
        """Run a paginated, faceted, highlighted search.

        Returns (QueryResult, more-like-this dict, hexdigest dict).
        """
        if not query_string:
            # NOTE(review): Solr's match-all query is "*:*"; "*.*" looks like
            # a typo but is preserved pending confirmation against callers.
            query_string = "*.*"
        final_query_chain = self._create_query_chain(query, query_string)
        solr_query = self.solr_interface.query(final_query_chain)
        faceted_solr_query = solr_query.facet_by(facets)
        highlighted_solr_query = faceted_solr_query.highlight(
            self.HIGHLIGHTABLE_FIELDS)

        # BUG FIX: the offset was '0 if pagenum == 1 else pagelen * pagenum',
        # which skipped a full page of results for every pagenum > 1. Use the
        # same (pagenum - 1) * pagelen formula this class already applies in
        # _prepare_query_result_attributes().
        start = (pagenum - 1) * pagelen
        paginated_solr_query = highlighted_solr_query.paginate(
            start=start, rows=pagelen)
        results = paginated_solr_query.execute()

        mlt, hexdigests = self.query_more_like_this(
            paginated_solr_query, fields="type", mindf=1, mintf=1)
        query_result = self._create_query_result(
            highlighted_solr_query, results, fields, pagenum, pagelen)
        return query_result, mlt, hexdigests

    def query_more_like_this(self, query_chain, **kwargs):
        """Return ({doc: [similar docs]}, {doc: md5 hexdigest})."""
        mlt_results = query_chain.mlt(**kwargs).execute().more_like_these
        mlt_dict = {}
        hexdigests = {}
        for doc, results in mlt_results.iteritems():
            hexdigest = hashlib.md5(doc).hexdigest()
            hexdigests[doc] = hexdigest
            for mlt_doc in results.docs:
                if doc not in mlt_dict:
                    mlt_dict[doc] = [self._process_doc(mlt_doc)]
                else:
                    mlt_dict[doc].append(self._process_doc(mlt_doc))
        return mlt_dict, hexdigests

    def _process_doc(self, doc):
        """Decorate a raw Solr doc with the UI 'href' and 'title' entries."""
        ui_doc = dict(doc)
        if doc.get('product'):
            # Resolve the link relative to the product environment when set.
            env = ProductEnvironment(self.env, doc['product'])
            product_href = ProductEnvironment.resolve_href(env, self.env)
            ui_doc["href"] = product_href(doc['type'], doc['id'])
        else:
            ui_doc["href"] = self.env.href(doc['type'], doc['id'])
        ui_doc['title'] = str(doc['type'] + ": " + doc['_stored_name']).title()
        return ui_doc

    def _create_query_result(self, query, results, fields, pagenum, pagelen):
        """Package raw Solr results into a QueryResult with paging info."""
        total_num, total_page_count, page_num, offset = \
            self._prepare_query_result_attributes(query, results, pagenum,
                                                  pagelen)
        query_results = QueryResult()
        query_results.hits = total_num
        query_results.total_page_count = total_page_count
        query_results.page_number = page_num
        query_results.offset = offset

        docs = []
        highlighting = []
        for retrieved_record in results:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)
            result_highlights = dict(retrieved_record['solr_highlights'])
            highlighting.append(result_highlights)
        query_results.docs = docs
        query_results.highlighting = highlighting
        return query_results

    def _create_query_chain(self, query, query_string):
        """OR together a per-field boosted sub-query for every unique token
        in query_string."""
        matches = re.findall(re.compile(r'([\w\*]+)'), query_string)
        tokens = set([match for match in matches])
        final_query_chain = None
        for token in tokens:
            token_query_chain = self._search_fields_for_token(token)
            if final_query_chain is None:
                final_query_chain = token_query_chain
            else:
                final_query_chain |= token_query_chain
        return final_query_chain

    def _process_record(self, fields, retrieved_record):
        """Project a record onto the requested fields and normalize values."""
        result_doc = dict()
        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value
        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)
        return result_doc

    def _from_whoosh_format(self, value):
        # Solr hands back tz-naive datetimes; tag them as UTC on the way out.
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_query_result_attributes(self, query, results, pagenum,
                                         pagelen):
        """Compute (total hits, page count, clamped page number, offset)."""
        results_total_num = query.execute().result.numFound
        # BUG FIX: float division. Under Python 2, int / int floors first,
        # which made ceil() a no-op and undercounted the page total whenever
        # the last page was partial.
        total_page_count = int(ceil(float(results_total_num) / pagelen))
        pagenum = min(total_page_count, pagenum)
        offset = (pagenum - 1) * pagelen
        if (offset + pagelen) > results_total_num:
            pagelen = results_total_num - offset
        return results_total_num, total_page_count, pagenum, offset

    def is_index_outdated(self):
        return False

    def recreate_index(self):
        return True

    @contextmanager
    def start_operation(self):
        # Solr needs no batching context; yield a no-op scope.
        yield

    def _search_fields_for_token(self, token):
        """OR the token over every boosted field except the internal ones."""
        q_chain = None
        field_boosts = DefaultQueryParser(self.env).field_boosts
        for field, boost in field_boosts.iteritems():
            if field != 'query_suggestion_basket' and field != 'relations':
                field_token_dict = {field: token}
                if q_chain is None:
                    q_chain = self.solr_interface.Q(**field_token_dict)**boost
                else:
                    q_chain |= self.solr_interface.Q(**field_token_dict)**boost
        return q_chain

    def _reformat_doc(self, doc):
        """Drop null/empty fields and normalize values in place before
        indexing."""
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        """Convert an aware datetime to naive UTC (the form Solr stores)."""
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)
import os
from sunburnt import SolrInterface

si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr")

# This is where you put the event name
# eventQuery = "Typhoon Haiyan"
eventQuery = "Connecticut School Shooting"

# This is where you put the downloaded files
# NOTE(review): non-raw Windows path; '\T', '\E', '\S' happen not to be
# escape sequences today, but a raw string (r'...') would be safer.
root = 'D:\Test\EventCollections\SmallCollections'

#response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute()
# Count the documents whose text matches the phrase.
response = si.query(text="west africa").execute()
#response = si.query(event=eventQuery).execute()
tot = response.result.numFound
print tot

# NOTE(review): 'tot' comes from the text query above, but this fetch pages
# over the event query -- confirm the intended pairing.
#response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute()
response = si.query(event=eventQuery).paginate(0,tot).execute()

docs = {}
print response.result.numFound
i = 1
import os from sunburnt import SolrInterface si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr") # This is where you put the event name # eventQuery = "Typhoon Haiyan" eventQuery = "Connecticut School Shooting" # This is where you put the downloaded files root = 'D:\Test\EventCollections\SmallCollections' #response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute() response = si.query(text="west africa").execute() #response = si.query(event=eventQuery).execute() tot = response.result.numFound print tot #response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute() response = si.query(event=eventQuery).paginate(0, tot).execute() docs = {} print response.result.numFound i = 1
def main(): solr_url = "http://politicalframing.com:8983/solr" h = httplib2.Http(cache="/var/tmp/solr_cache") si = SolrInterface(url=solr_url, http_connection=h) totalNumFound = si.query(**{ "*": "*" }).exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore").sort_by( "speaker_raw").paginate( rows=0, start=0).execute().result.numFound print "Number of Speeches in Solr without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str( totalNumFound) senateNumFound = si.query(chamber='Senate').exclude( speaker_party="*").exclude(speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).sort_by("speaker_raw").paginate( rows=0, start=0).execute().result.numFound print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str( senateNumFound) houseNumFound = si.query(chamber='House').exclude( speaker_party="*").exclude(speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).sort_by("speaker_raw").paginate( rows=0, start=0).execute().result.numFound print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str( houseNumFound) extensionsNumFound = si.query(chamber='Extensions').exclude( 
speaker_party="*").exclude(speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).sort_by("speaker_raw").paginate( rows=0, start=0).execute().result.numFound print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str( extensionsNumFound) print "Sum: " + str(senateNumFound + houseNumFound + extensionsNumFound) print "-----------------------" print "-----------------------" numFound = si.query(chamber='Senate').exclude(speaker_party="*").exclude( speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).sort_by("speaker_raw").paginate( rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str( numFound) for i in range(0, int(math.ceil(numFound / 100000))): current_speeches = si.query(chamber='Senate').exclude( speaker_party="*").exclude(speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).field_limit( ["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate( rows=100000, start=100000 * i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata( id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='Senate') 
print speech['id'] if partial_document: print speech['speaker_raw'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr() numFound = si.query(chamber='House').exclude(speaker_party="*").exclude( speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).sort_by("speaker_raw").paginate( rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str( numFound) for i in range(0, int(math.ceil(numFound / 100000))): current_speeches = si.query(chamber='House').exclude( speaker_party="*").exclude(speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).field_limit( ["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate( rows=100000, start=100000 * i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata( id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='House') print speech['id'] if partial_document: print speech['speaker_raw'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr() numFound = si.query(chamber='Extensions').exclude( speaker_party="*").exclude(speaker_raw="recorder").exclude( 
speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).sort_by("speaker_raw").paginate( rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str( numFound) for i in range(0, int(math.ceil(numFound / 100000))): current_speeches = si.query(chamber='Extensions').exclude( speaker_party="*").exclude(speaker_raw="recorder").exclude( speaker_raw="the presiding officer").exclude( speaker_raw="the vice president").exclude( speaker_raw="the speaker pro tempore").exclude( speaker_raw="the acting president pro tempore" ).field_limit( ["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate( rows=100000, start=100000 * i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata( id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='Extensions') print speech['id'] if partial_document: print speech['speaker_raw'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr()
def main(): solr_url = "http://politicalframing.com:8983/solr" h = httplib2.Http(cache="/var/tmp/solr_cache") si = SolrInterface(url = solr_url, http_connection = h) totalNumFound = si.query(**{"*":"*"}).exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound print "Number of Speeches in Solr without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(totalNumFound) senateNumFound = si.query(chamber='Senate').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(senateNumFound) houseNumFound = si.query(chamber='House').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(houseNumFound) extensionsNumFound = 
si.query(chamber='Extensions').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(extensionsNumFound) print "Sum: " + str(senateNumFound + houseNumFound + extensionsNumFound) print "-----------------------" print "-----------------------" numFound = si.query(chamber='Senate').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches in Senate without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(numFound) for i in range(0, int(math.ceil(numFound/100000))): current_speeches = si.query(chamber='Senate').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").field_limit(["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate(rows=100000, start=100000*i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata(id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], 
chamber='Senate') print speech['id'] if partial_document: print speech['speaker_raw'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr() numFound = si.query(chamber='House').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches in House without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(numFound) for i in range(0, int(math.ceil(numFound/100000))): current_speeches = si.query(chamber='House').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").field_limit(["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate(rows=100000, start=100000*i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata(id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='House') print speech['id'] if partial_document: print speech['speaker_raw'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr() numFound = si.query(chamber='Extensions').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the 
presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").sort_by("speaker_raw").paginate(rows=0, start=0).execute().result.numFound print "-----------------------" print "Number of Speeches in Extensions without a Speaker Party that isn't recorder, the presiding officer, vice president, or the speaker pro tempore, or the acting president pro tempore " + str(numFound) for i in range(0, int(math.ceil(numFound/100000))): current_speeches = si.query(chamber='Extensions').exclude(speaker_party="*").exclude(speaker_raw="recorder").exclude(speaker_raw="the presiding officer").exclude(speaker_raw="the vice president").exclude(speaker_raw="the speaker pro tempore").exclude(speaker_raw="the acting president pro tempore").field_limit(["id", "speaker_raw", "congress", "date"]).sort_by("speaker_raw").paginate(rows=100000, start=100000*i).execute().result.docs json_documents = [] for j, speech in enumerate(current_speeches): partial_document = get_speaker_metadata(id=speech['id'], date=speech['date'], congress=speech['congress'], speaker=speech['speaker_raw'], chamber='Extensions') print speech['id'] if partial_document: print speech['speaker_raw'] + " queued to be ingested" json_documents.append(partial_document) if len(json_documents) > 1: json_doc_list_string, body = update_solr2(json_documents) print len(json_documents) print body print commit_solr()
import os from sunburnt import SolrInterface si = SolrInterface("http://jingluo.dlib.vt.edu:8080/solr") # This is where you put the event name # eventQuery = "Typhoon Haiyan" eventQuery = "Connecticut School Shooting" # This is where you put the downloaded files root = 'CollectionSmall' response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute() #response = si.query(event=eventQuery).execute() tot = response.result.numFound print tot #response = si.query(event=eventQuery).field_limit(["content"]).paginate(0,tot).execute() response = si.query(event=eventQuery).paginate(0,tot).execute() docs = {} print response.result.numFound i = 1 directory = root + "/"
eventQuery = "Texas_Fertilizer_Plant_Explosion" #these are the query lists for Team A eventQueryList = ["Texas_Fertilizer_Plant_Explosion", "Rain_at_Islip"] # Commented out lines support the special handling when there are spaces in the event name. # eventQuery = "Connecticut School Shooting" # This is where you put the downloaded files #root = 'D:\Test\EventCollections\SmallCollections' # Or, for a Mac, use something like #someone needs to change this part root = '../Unit3/output' # Create and execute a Solr query words = eventQuery.split(); query = si.query(event=words[0]) for w in words[1:]: query = query.query(event = w) response = query.execute() # Or, for the case of spaces in the name: # response = si.query(event="Connecticut").query(event="School").query(event="shooting").execute() tot = response.result.numFound #print response.result.numFound print tot, "documents found in collection [", eventQuery, "]\n" print "Retrieving documents...\n" response = si.query(event=eventQuery).paginate(0,tot).execute() # Or, for the case of spaces in the name: # response = si.query(event="Connecticut").query(event="School").query(event="shooting").paginate(0,tot).execute()
# Scrapy CrawlSpider that crawls matchendirect.fr for finished football
# matches and indexes score/goal details into Solr via a sunburnt
# SolrInterface (self.si). parse_score() merges freshly scraped details with
# the already-indexed document for the same URL before re-adding it.
# NOTE(review): the Solr URL's credentials were redacted ("*****:*****") and
# the redaction appears to have swallowed source text — the `def` line of the
# listing-page callback (the code between __init__ and the xpath
# '//table[@class="tableau"][1]') is missing, and `league`/`docs` are used
# there without a visible definition. Restore this class from VCS before
# making behavioral changes.
# NOTE(review): `dict(doc.items() + dict(score).items())` near the end is
# Python 2 only (dict.items() returns a list there).
class ScoreSpider(CrawlSpider): name = "score" allowed_domains = ["matchendirect.fr"] start_urls = ["http://www.matchendirect.fr/hier/"] rules = [ Rule( SgmlLinkExtractor(allow=(r"/live-score/[a-z0-9\-]+\.html$", r"/foot-score/[a-z0-9\-]+\.html$")), "parse_score", ) ] # init solr instance def __init__(self, *args, **kwargs): super(ScoreSpider, self).__init__(*args, **kwargs) self.si = SolrInterface("http://*****:*****@class="tableau"][1]') rows = table.xpath("tr") for row in rows: # if match has has started & is finished scoring = row.xpath('td[@class="lm4"]/a[not(span)]/text()').extract() isPlaying = row.xpath('td[@class="lm2_1"]').extract() if scoring and not isPlaying: score = ScoreItem() score["id"] = "http://www.matchendirect.fr" + row.xpath('td[@class="lm4"]/a/@href').extract().pop() score["host"] = row.xpath('td[@class="lm3"]/a/text()').extract().pop() score["visitor"] = row.xpath('td[@class="lm5"]/a/text()').extract().pop() scoringArr = scoring.pop().split(" - ") score["scorehost"] = int(scoringArr[0]) score["scorevisitor"] = int(scoringArr[1]) if score["scorehost"] > score["scorevisitor"]: score["winner"] = score["host"] elif score["scorehost"] < score["scorevisitor"]: score["winner"] = score["visitor"] leagueArr = league.xpath("a[1]/text()").extract().pop().split(" : ") score["country"] = leagueArr[0] score["league"] = leagueArr[1] docs.append(dict(score)) # index crawled games self.si.add(docs) self.si.commit() # called on followed urls # get game details (goal scorer & time) def parse_score(self, response): sel = Selector(response) # if match has started & is finished scorehost = sel.xpath('//div[@id="match_score"]/div[@class="col2"]/text()').extract().pop().strip() scorevisitor = sel.xpath('//div[@id="match_score"]/div[@class="col3"]/text()').extract().pop().strip() isPlaying = sel.xpath('//div[@id="match_entete_2"]/img').extract() if scorehost and scorevisitor and not isPlaying: score = ScoreItem() # get already indexed data solr_doc = 
self.si.query(id=response.url).execute() if list(solr_doc): doc = solr_doc[0] else: doc = {} score["id"] = response.url # get goals table = sel.xpath('//table[@class="tableau match_evenement"]') rows = table.xpath("tr") score["goalscorershost"], score["goalscorersvisitor"], score["goaltimeshost"], score["goaltimesvisitor"] = ( [], [], [], [], ) score["penaltytimeshost"], score["penaltytimesvisitor"], score["ogtimeshost"], score["ogtimesvisitor"] = ( [], [], [], [], ) for row in rows: tdgoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement1"]]') tdpenaltyhost = row.xpath('td[@class="c1" and span[@class="ico_evenement2"]]') tdowngoalhost = row.xpath('td[@class="c1" and span[@class="ico_evenement7"]]') tdgoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement1"]]') tdpenaltyvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement2"]]') tdowngoalvisitor = row.xpath('td[@class="c3" and span[@class="ico_evenement7"]]') tdgoalhost = tdgoalhost or tdpenaltyhost or tdowngoalhost tdgoalvisitor = tdgoalvisitor or tdpenaltyvisitor or tdowngoalvisitor if tdgoalhost: time = tdgoalhost.xpath('following-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'") if tdpenaltyhost: score["penaltytimeshost"].append(time) elif tdowngoalhost: score["ogtimeshost"].append(time) score["goaltimeshost"].append(time) score["goalscorershost"].append(tdgoalhost.xpath("a/text()").extract().pop()) elif tdgoalvisitor: time = ( tdgoalvisitor.xpath('preceding-sibling::td[@class="c2"][1]/text()').extract().pop().rstrip("'") ) if tdpenaltyvisitor: score["penaltytimesvisitor"].append(time) elif tdowngoalvisitor: score["ogtimesvisitor"].append(time) score["goaltimesvisitor"].append(time) score["goalscorersvisitor"].append(tdgoalvisitor.xpath("a/text()").extract().pop()) # get time, refree & stadium matchinfos = sel.xpath('//table[@id="match_entete_1"]/tr/td[@class="info"]/text()').extract() matchinfos.pop() matchinfos = [x.lstrip("\n\t\r") for x in 
matchinfos] if u"Arbitre : - " in matchinfos: matchinfos.remove(u"Arbitre : - ") date = format_date(matchinfos[0]) time = matchinfos[1].split(" ")[-1].replace("h", ":") + ":00" score["date"] = "%sT%sZ" % (date, time) if len(matchinfos) >= 3: score["stadium"] = matchinfos[2] if len(matchinfos) == 4: score["referee"] = matchinfos[3].split(" : ")[1] # index all datas doc = dict(doc.items() + dict(score).items()) self.si.add(doc) self.si.commit()