def delete(self, id):
    '''
    @see: IArticleSearchProvider.delete()
    '''
    si = SolrInterface('http://%s%s' % (self.solr_server_url, 'article'))
    si.delete(str(id))
    si.commit()
def delete(self, idMetaInfo, metaType):
    '''
    @see: ISearchProvider.delete()
    '''
    si = SolrInterface('http://%s%s' % (self.solr_server_url, metaType))
    si.delete(str(idMetaInfo))
    si.commit()
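# The two delete() implementations above differ only in which core name is
# appended to the server URL. A minimal sketch of the shared pattern,
# assuming a sunburnt-style SolrInterface and a solr_server_url that already
# ends in the core path prefix (e.g. 'localhost:8983/solr/'); the helper
# name is hypothetical:
def delete_from_core(solr_server_url, core, doc_id):
    si = SolrInterface('http://%s%s' % (solr_server_url, core))
    si.delete(str(doc_id))  # delete by unique id
    si.commit()             # make the deletion visible to searchers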
def get_test_solr():
    settings.SOLR_ENDPOINT = 'http://localhost:8983/solr/data_test'
    solr = SolrInterface(settings.SOLR_ENDPOINT)
    solr.delete(queries='*:*', commit=True)
    return solr
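# A usage sketch for the test helper above: every test starts from an empty
# core, so document counts are deterministic. The field names here are
# illustrative, not part of the helper:
def test_add_and_count():
    solr = get_test_solr()
    solr.add({'id': '1', 'full_text': 'hello'})
    solr.commit()
    response = solr.query(full_text='hello').execute()
    assert response.result.numFound == 1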
def run(self, solr_id):
    """ Run the synchronization: delete the record on Solr.

    :param solr_id: identifier of the record to delete
    """
    si = SolrInterface(self.backend_record.location.encode('utf-8'))  # TODO auth
    si.delete(solr_id)
    return _('Record %s deleted on Solr') % solr_id
def run(self, solr_id):
    """ Run the synchronization: delete the record on Solr.

    :param solr_id: identifier of the record to delete
    """
    si = SolrInterface(self.backend_record.location.encode('utf-8'))  # TODO auth
    si.delete(solr_id)
    si.commit()
    return _('Record %s deleted on Solr') % solr_id
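# This variant is identical to the previous one except for the explicit
# si.commit(); without it the deletion stays pending until something else
# commits on that core. A hedged sketch of checking that a delete is
# actually visible (the endpoint and record id are hypothetical):
si = SolrInterface('http://localhost:8983/solr')
si.delete('record-42')
si.commit()
assert si.query(id='record-42').execute().result.numFound == 0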
def dataset_purge_data(dataset_id):
    """
    Purge a dataset from Solr.
    """
    log = logging.getLogger('redd.tasks.dataset_purge_data')
    log.info('Beginning purge, dataset_id: %i' % dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)
    solr.delete(queries='dataset_id: %i' % dataset_id, commit=True)

    log.info('Finished purge, dataset_id: %i' % dataset_id)
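# dataset_purge_data() deletes by query rather than by unique id. The two
# delete styles used across these snippets, side by side (the endpoint and
# dataset id are assumptions):
solr = SolrInterface('http://localhost:8983/solr/data_test')
solr.delete('some-document-id')                     # by unique id
solr.delete(queries='dataset_id: 42', commit=True)  # by raw query string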
def dataset_import_data(dataset_id):
    """
    Import a dataset into Solr.
    """
    from redd.models import Dataset

    log = logging.getLogger('redd.tasks.dataset_import_data')
    log.info('Beginning import, dataset_id: %i' % dataset_id)

    dataset = Dataset.objects.get(id=dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)

    #solr_fields = []
    #
    #for h, t in dataset.schema:
    #    if t == 'NoneType':
    #        solr_fields.append(None)
    #    else:
    #        solr_fields.append('%s_%s' % (h, t.__name__))

    reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r'))
    reader.next()  # skip the header row

    add_buffer = []
    normal_type_exceptions = []

    for i, row in enumerate(reader, start=1):
        data = {}  # only populated by the disabled typing logic below

        # Per-column type normalization is currently disabled:
        #for t, header, field, value in izip(normal_types, headers, solr_fields, row):
        #    try:
        #        value = normalize_column_type([value], normal_type=t)[1][0]
        #    except InvalidValueForTypeException:
        #        # Convert exception to row-specific error
        #        normal_type_exceptions.append(
        #            InferredNormalFalsifiedException(i, header, value, t))
        #        continue
        #
        #    # No reason to send null fields to Solr
        #    # (also sunburnt doesn't like them)
        #    if value is None:
        #        continue
        #
        #    if t in [unicode, bool, int, float]:
        #        data[field] = value
        #    elif t == datetime:
        #        data[field] = value.isoformat()
        #    elif t in (date, time):
        #        pass
        #    else:
        #        # NoneType should never fall through to here
        #        raise TypeError('Unexpected normal type: %s' % t.__name__)

        # If we've had a normal type exception, don't bother with the rest
        if not normal_type_exceptions:
            data = {
                'id': uuid4(),
                'dataset_id': dataset.id,
                'row': i,
                'full_text': '\n'.join(row),
                'csv_data': json.dumps(row)
            }

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(add_buffer)
                add_buffer = []

    if add_buffer:
        solr.add(add_buffer)
        add_buffer = []

    if not normal_type_exceptions:
        solr.commit()
    else:
        # Roll back pending changes
        solr.delete(queries=solr.query(dataset_id=dataset.id))

        for e in normal_type_exceptions:
            print e

    log.info('Finished import, dataset_id: %i' % dataset_id)
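# dataset_import_data() flushes its buffer every SOLR_ADD_BUFFER_SIZE rows so
# each HTTP request carries a batch instead of a single document. The same
# pattern in isolation, assuming the imports used above (uuid4, json); the
# helper name and buffer_size default are assumptions:
def add_in_batches(solr, rows, buffer_size=500):
    buf = []
    for i, row in enumerate(rows, start=1):
        buf.append({'id': str(uuid4()), 'row': i, 'csv_data': json.dumps(row)})
        if i % buffer_size == 0:
            solr.add(buf)   # flush a full batch
            buf = []
    if buf:
        solr.add(buf)       # flush the remainder
    solr.commit()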
class SolrPipeline(object):
    search_engine = endpoints.solr + '/dealschrome/search-engine'
    archive = endpoints.solr + '/dealschrome/archive'

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.si_eng = SolrInterface(self.search_engine)
        self.si_eng.init_schema()
        self.si_arc = SolrInterface(self.archive)
        self.si_arc.init_schema()
        self.old_deals = {}

    def spider_opened(self, spider):
        source = spider.allowed_domains[0]
        old_temp = self.get_old_deals(source)
        self.old_deals[spider] = {i['id']: i for i in old_temp}
        spider.old_deals = dict(self.old_deals[spider])

    def spider_closed(self, spider):
        source = spider.allowed_domains[0]
        old_deals = self.old_deals.pop(spider)

        if spider.crawled_items.items():
            for k, v in spider.crawled_items.items():
                if v['url'] in old_deals:
                    field_created = old_deals[v['url']]['created']
                    del old_deals[v['url']]
                else:
                    field_created = int(time())

                data = {
                    'id': v['url'],
                    'title': v['title'],
                    'dealsource': source,
                    'price': str(v['price']),
                    'worth': str(v['worth']),
                    'discount': str(v['discount']),
                    'bought': str(v['bought']),
                    'imgsrc': v['imgsrc'],
                    'category': v['category'],
                    'created': field_created,
                    'expiry': str(v['expiry']),
                    'merchant': v['merchant'],
                    'address': v['address'],
                    'description': v['description'],
                }

                # Only add location when it exists
                if v['location']:
                    data['location'] = v['location']

                # Workaround for a bug: 'category' is occasionally a
                # multi-valued list; keep only the first value.
                if len(data['category']) > 1 and not isinstance(data['category'], types.StringTypes):
                    data['category'] = data['category'][0]

                self.si_eng.add(data)
                self.si_arc.add(data)

            self.si_eng.commit()
            self.si_arc.commit()

        # Deals left in old_deals were not re-crawled: remove them from the
        # search engine (the archive keeps them).
        pending_delete = list(old_deals.itervalues())
        if pending_delete:
            self.si_eng.delete(pending_delete)
            self.si_eng.commit()
            self.si_arc.commit()

    def get_old_deals(self, source):
        old_deals = self.si_eng\
            .query(dealsource_raw=source)\
            .field_limit(['id', 'created', 'category_raw'], score=False)\
            .paginate(rows=900)\
            .execute()
        return old_deals
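# spider_closed() is effectively a mark-and-sweep: deals seen again keep their
# original 'created' timestamp and are dropped from old_deals; whatever
# remains was not re-crawled and is deleted from the search core. The sweep
# in isolation (the function and its arguments are hypothetical):
def sweep_stale(si_eng, old_deals, crawled_urls):
    for url in crawled_urls:
        old_deals.pop(url, None)      # seen again: keep it
    stale = list(old_deals.values())  # never re-crawled: stale
    if stale:
        si_eng.delete(stale)          # sunburnt accepts a list of documents
        si_eng.commit()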
class SolrBackend(Component):
    implements(ISearchBackend)

    UNIQUE_ID = "unique_id"

    HIGHLIGHTABLE_FIELDS = {
        "unique_id": True, "id": True, "type": True, "product": True,
        "milestone": True, "author": True, "component": True,
        "status": True, "resolution": True, "keywords": True,
        "summary": True, "content": True, "changes": True, "owner": True,
        "repository": True, "revision": True, "message": True, "name": True,
    }

    server_url = Option(
        BHSEARCH_CONFIG_SECTION,
        'solr_server_url',
        doc="""Url of the server running Solr instance.""",
        doc_domain='bhsearch')

    def __init__(self):
        self.solr_interface = SolrInterface(str(self.server_url))

    def add_doc(self, doc, operation_context=None):
        self._reformat_doc(doc)
        doc[self.UNIQUE_ID] = self._create_unique_id(
            doc.get("product", ''), doc["type"], doc["id"])
        self.solr_interface.add(doc)
        self.solr_interface.commit()

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.solr_interface.delete(unique_id)
        self.solr_interface.commit()

    def optimize(self):
        self.solr_interface.optimize()

    def query(self, query, query_string, sort=None, fields=None,
              filter=None, facets=None, pagenum=1, pagelen=20,
              highlight=False, highlight_fields=None, context=None):
        if not query_string:
            # Match all documents.
            query_string = "*:*"

        final_query_chain = self._create_query_chain(query, query_string)
        solr_query = self.solr_interface.query(final_query_chain)
        faceted_solr_query = solr_query.facet_by(facets)
        highlighted_solr_query = faceted_solr_query.highlight(
            self.HIGHLIGHTABLE_FIELDS)

        start = (pagenum - 1) * pagelen
        paginated_solr_query = highlighted_solr_query.paginate(
            start=start, rows=pagelen)
        results = paginated_solr_query.execute()

        mlt, hexdigests = self.query_more_like_this(
            paginated_solr_query, fields="type", mindf=1, mintf=1)

        query_result = self._create_query_result(
            highlighted_solr_query, results, fields, pagenum, pagelen)
        return query_result, mlt, hexdigests

    def query_more_like_this(self, query_chain, **kwargs):
        mlt_results = query_chain.mlt(**kwargs).execute().more_like_these
        mlt_dict = {}
        hexdigests = {}
        for doc, results in mlt_results.iteritems():
            hexdigest = hashlib.md5(doc).hexdigest()
            hexdigests[doc] = hexdigest
            for mlt_doc in results.docs:
                mlt_dict.setdefault(doc, []).append(self._process_doc(mlt_doc))
        return mlt_dict, hexdigests

    def _process_doc(self, doc):
        ui_doc = dict(doc)

        if doc.get('product'):
            env = ProductEnvironment(self.env, doc['product'])
            product_href = ProductEnvironment.resolve_href(env, self.env)
            ui_doc["href"] = product_href(doc['type'], doc['id'])
        else:
            ui_doc["href"] = self.env.href(doc['type'], doc['id'])

        ui_doc['title'] = str(doc['type'] + ": " + doc['_stored_name']).title()
        return ui_doc

    def _create_query_result(self, query, results, fields, pagenum, pagelen):
        total_num, total_page_count, page_num, offset = \
            self._prepare_query_result_attributes(query, results,
                                                  pagenum, pagelen)

        query_results = QueryResult()
        query_results.hits = total_num
        query_results.total_page_count = total_page_count
        query_results.page_number = page_num
        query_results.offset = offset

        docs = []
        highlighting = []
        for retrieved_record in results:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)
            result_highlights = dict(retrieved_record['solr_highlights'])
            highlighting.append(result_highlights)

        query_results.docs = docs
        query_results.highlighting = highlighting
        return query_results

    def _create_query_chain(self, query, query_string):
        matches = re.findall(r'([\w\*]+)', query_string)
        tokens = set(matches)

        final_query_chain = None
        for token in tokens:
            token_query_chain = self._search_fields_for_token(token)
            if final_query_chain is None:
                final_query_chain = token_query_chain
            else:
                final_query_chain |= token_query_chain
        return final_query_chain

    def _process_record(self, fields, retrieved_record):
        result_doc = dict()
        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)
        return result_doc

    def _from_whoosh_format(self, value):
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_query_result_attributes(self, query, results,
                                         pagenum, pagelen):
        results_total_num = query.execute().result.numFound
        total_page_count = int(ceil(float(results_total_num) / pagelen))
        pagenum = min(total_page_count, pagenum)

        offset = (pagenum - 1) * pagelen
        if (offset + pagelen) > results_total_num:
            pagelen = results_total_num - offset
        return results_total_num, total_page_count, pagenum, offset

    def is_index_outdated(self):
        return False

    def recreate_index(self):
        return True

    @contextmanager
    def start_operation(self):
        yield

    def _search_fields_for_token(self, token):
        q_chain = None
        field_boosts = DefaultQueryParser(self.env).field_boosts
        for field, boost in field_boosts.iteritems():
            if field != 'query_suggestion_basket' and field != 'relations':
                field_token_dict = {field: token}
                if q_chain is None:
                    q_chain = self.solr_interface.Q(**field_token_dict) ** boost
                else:
                    q_chain |= self.solr_interface.Q(**field_token_dict) ** boost
        return q_chain

    def _reformat_doc(self, doc):
        # doc.items() returns a list in Python 2, so deleting keys while
        # iterating is safe here.
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)
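# _search_fields_for_token() builds one boosted Q per field and ORs them into
# a single chain. The same chaining with sunburnt Q objects, standalone (the
# endpoint, field names, and boosts are illustrative):
si = SolrInterface('http://localhost:8983/solr')
q = si.Q(summary='crash') ** 2.0 | si.Q(content='crash') ** 1.0
results = si.query(q).paginate(start=0, rows=20).execute()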