Example #1
 def delete(self, id):
     '''
     @see: IArticleSearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, 'article'))
     si.delete(str(id))
     si.commit()
Example #2
 def delete(self, idMetaInfo, metaType):
     '''
     @see: ISearchProvider.delete()
     '''
     si = SolrInterface('http://%s%s' % (self.solr_server_url, metaType))
     si.delete(str(idMetaInfo))
     si.commit()
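Both snippets above build the core URL by concatenation, so they assume self.solr_server_url already ends with the path separator (for example 'localhost:8983/solr/'). A minimal sketch of the same delete-and-commit flow, assuming the sunburnt client these snippets appear to use and a made-up local core URL and id:

from sunburnt import SolrInterface

si = SolrInterface('http://localhost:8983/solr/article')  # assumed local core URL
si.delete(str(42))  # delete by unique id, as in the methods above
si.commit()         # make the deletion visible to searches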
Example #3
def get_test_solr():
    settings.SOLR_ENDPOINT = 'http://localhost:8983/solr/data_test'

    solr = SolrInterface(settings.SOLR_ENDPOINT) 
    solr.delete(queries='*:*', commit=True)

    return solr
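A test case would typically call this helper during set-up so every run starts from an empty test core. A sketch of such a test, with an assumed test-class name:

import unittest

class DatasetSearchTest(unittest.TestCase):
    def setUp(self):
        # start from an empty core; get_test_solr() wipes it via delete(queries='*:*', commit=True)
        self.solr = get_test_solr()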
Example #4
 def run(self, solr_id):
     """ Run the synchronization, delete the record on SolR
     :param solr_id: identifier of the record to delete
     """
     si = SolrInterface(self.backend_record.location.encode('utf-8')) #TODO auth
     si.delete(solr_id)
     return _('Record %s deleted on Solr') % solr_id
Example #5
 def run(self, solr_id):
     """ Run the synchronization, delete the record on Solr
     :param solr_id: identifier of the record to delete
     """
     si = SolrInterface(self.backend_record.location.encode('utf-8')) #TODO auth
     si.delete(solr_id)
     si.commit()
     return _('Record %s deleted on Solr') % solr_id
Example #6
def dataset_purge_data(dataset_id):
    """
    Purge a dataset from Solr.
    """
    log = logging.getLogger('redd.tasks.dataset_purge_data')
    log.info('Beginning purge, dataset_id: %i' % dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)
    solr.delete(queries='dataset_id: %i' % dataset_id, commit=True)

    log.info('Finished purge, dataset_id: %i' % dataset_id)
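Passing commit=True issues the delete-by-query and the commit in one call; the same purge can also be written as two explicit steps, as the earlier examples do (sketch, reusing the names from the task above):

solr = SolrInterface(settings.SOLR_ENDPOINT)
solr.delete(queries='dataset_id: %i' % dataset_id)  # delete-by-query only
solr.commit()                                       # then publish the change explicitly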
Example #7
def dataset_import_data(dataset_id):
    """
    Import a dataset into Solr.
    """
    from redd.models import Dataset

    log = logging.getLogger('redd.tasks.dataset_import_data')
    log.info('Beginning import, dataset_id: %i' % dataset_id)

    dataset = Dataset.objects.get(id=dataset_id)

    solr = SolrInterface(settings.SOLR_ENDPOINT)
    #solr_fields = []

    #for h, t in dataset.schema:
    #    if t == 'NoneType':
    #        solr_fields.append(None)
    #    else:
    #        solr_fields.append('%s_%s' % (h, t.__name__))
        
    reader = CSVKitReader(open(dataset.data_upload.get_path(), 'r'))
    reader.next()  # skip the header row

    add_buffer = []
    normal_type_exceptions = []

    for i, row in enumerate(reader, start=1):
        data = {}

        typing="""for t, header, field, value in izip(normal_types, headers, solr_fields, row):
         try:
                value = normalize_column_type([value], normal_type=t)[1][0]
            except InvalidValueForTypeException:
                # Convert exception to row-specific error
                normal_type_exceptions.append(InferredNormalFalsifiedException(i, header, value, t))
                continue

            # No reason to send null fields to Solr (also sunburnt doesn't like them) 
            if value == None:
                continue

            if t in [unicode, bool, int, float]:
                if value == None:
                    continue

                data[field] = value
            elif t == datetime:
                data[field] = value.isoformat()
            elif t == date:
                pass
            elif t == time:
                pass
            else:
                # Note: if NoneType should never fall through to here 
                raise TypeError('Unexpected normal type: %s' % t.__name__)"""

        # If we've had a normal type exception, don't bother doing the rest of this
        if not normal_type_exceptions:
            data = {
                'id': uuid4(),
                'dataset_id': dataset.id,
                'row': i,
                'full_text': '\n'.join(row),
                'csv_data': json.dumps(row)
            }

            add_buffer.append(data)

            if i % SOLR_ADD_BUFFER_SIZE == 0:
                solr.add(add_buffer)
                add_buffer = []

    if add_buffer:
        solr.add(add_buffer)
        add_buffer = []
    
    if not normal_type_exceptions:
        solr.commit()
    else:
        # Rollback pending changes
        solr.delete(queries=solr.query(dataset_id=dataset.id))
        
        for e in normal_type_exceptions:
            print e 

    log.info('Finished import, dataset_id: %i' % dataset_id)
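The loop above batches documents and flushes them to Solr every SOLR_ADD_BUFFER_SIZE rows, with one final flush for the remainder and a single commit at the end. The same pattern in isolation (a sketch; make_doc is a hypothetical per-row document builder):

add_buffer = []

for i, row in enumerate(rows, start=1):
    add_buffer.append(make_doc(row))  # hypothetical: build one Solr document per row
    if i % SOLR_ADD_BUFFER_SIZE == 0:
        solr.add(add_buffer)          # flush a full batch to Solr
        add_buffer = []

if add_buffer:
    solr.add(add_buffer)              # flush the remainder
solr.commit()                         # publish the whole import at once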
Example #8
class SolrPipeline(object):

    search_engine = endpoints.solr + '/dealschrome/search-engine'
    archive = endpoints.solr + '/dealschrome/archive'

    def __init__(self):
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)
        self.si_eng = SolrInterface(self.search_engine)
        self.si_eng.init_schema()
        self.si_arc = SolrInterface(self.archive)
        self.si_arc.init_schema()
        self.old_deals = {}

    def spider_opened(self, spider):
        source = spider.allowed_domains[0]
        old_temp = self.get_old_deals(source)
        self.old_deals[spider] = {i['id']:i for i in old_temp}
        spider.old_deals = dict(self.old_deals[spider])

    def spider_closed(self, spider):
        source = spider.allowed_domains[0]
        old_deals = self.old_deals.pop(spider)
        
        if spider.crawled_items:
            for k, v in spider.crawled_items.items():
                if v['url'] in old_deals:
                    field_created = old_deals[v['url']]['created']
                    del old_deals[v['url']]
                else:
                    field_created = int(time())
                    
                data = {
                    'id' : v['url'],
                    'title' : v['title'],
                    'dealsource' : source,
                    'price' : str(v['price']),
                    'worth' : str(v['worth']),
                    'discount' : str(v['discount']),
                    'bought' : str(v['bought']),
                    'imgsrc' : v['imgsrc'],
                    'category' : v['category'],
                    'created' : field_created,
                    'expiry' : str(v['expiry']),
                    'merchant' : v['merchant'],
                    'address' : v['address'],
                    'description': v['description'],
                }
                if v['location']:
                    # only add location when location exists
                    data['location'] = v['location']
                
                # BUG workaround: 'category' sometimes arrives multi-valued, so keep only the first value
                if len(data['category']) > 1 and not isinstance(data['category'], types.StringTypes):
                    data['category'] = data['category'][0]
                            
                self.si_eng.add(data)
                self.si_arc.add(data)
            
            self.si_eng.commit()
            self.si_arc.commit()
            
            pending_delete = [doc for doc in old_deals.itervalues()]
            if pending_delete:
                self.si_eng.delete(pending_delete)
            
            self.si_eng.commit()
            self.si_arc.commit()
    
    def get_old_deals(self, source):
        old_deals = self.si_eng\
            .query(dealsource_raw=source)\
            .field_limit(['id','created','category_raw'],score=False)\
            .paginate(rows=900)\
            .execute()
        return old_deals        
        
class SolrBackend(Component):
    implements(ISearchBackend)

    UNIQUE_ID = "unique_id"

    HIGHLIGHTABLE_FIELDS = {
        "unique_id" : True,
        "id" : True,
        "type" : True,
        "product" : True,
        "milestone" : True,
        "author" : True,
        "component" : True,
        "status" : True,
        "resolution" : True,
        "keywords" : True,
        "summary" : True,
        "content" : True,
        "changes" : True,
        "owner" : True,
        "repository" : True,
        "revision" : True,
        "message" : True,
        "name" : True
        }

    server_url = Option(
            BHSEARCH_CONFIG_SECTION,
            'solr_server_url',
            doc="""Url of the server running Solr instance.""",
            doc_domain='bhsearch')

    def __init__(self):
        self.solr_interface = SolrInterface(str(self.server_url))

    def add_doc(self, doc, operation_context=None):
        self._reformat_doc(doc)
        doc[self.UNIQUE_ID] = self._create_unique_id(doc.get("product", ''),
                                                doc["type"], doc["id"])
        self.solr_interface.add(doc)
        self.solr_interface.commit()

    def delete_doc(self, product, doc_type, doc_id, operation_context=None):
        unique_id = self._create_unique_id(product, doc_type, doc_id)
        self.solr_interface.delete(unique_id)

    def optimize(self):
        self.solr_interface.optimize()

    def query(
            self, query, query_string, sort = None, fields = None,
            filter = None, facets = None, pagenum = 1, pagelen = 20,
            highlight = False, highlight_fields = None, context = None):

        if not query_string:
            query_string = "*.*"

        final_query_chain = self._create_query_chain(query, query_string)
        solr_query = self.solr_interface.query(final_query_chain)
        faceted_solr_query = solr_query.facet_by(facets)
        highlighted_solr_query = faceted_solr_query.highlight(
                                    self.HIGHLIGHTABLE_FIELDS)

        start = pagelen * (pagenum - 1)
        paginated_solr_query = highlighted_solr_query.paginate(
                            start=start, rows=pagelen)
        results = paginated_solr_query.execute()

        mlt, hexdigests = self.query_more_like_this(paginated_solr_query,
                                                    fields="type", mindf=1,
                                                    mintf=1)

        query_result = self._create_query_result(highlighted_solr_query,
                                                 results, fields, pagenum,
                                                 pagelen)
        return query_result, mlt, hexdigests

    def query_more_like_this(self, query_chain, **kwargs):
        mlt_results = query_chain.mlt(**kwargs).execute().more_like_these
        mlt_dict = {}
        hexdigests = {}

        for doc, results in mlt_results.iteritems():
            hexdigest = hashlib.md5(doc).hexdigest()
            hexdigests[doc] = hexdigest

            for mlt_doc in results.docs:
                if doc not in mlt_dict:
                    mlt_dict[doc] = [self._process_doc(mlt_doc)]
                else:
                    mlt_dict[doc].append(self._process_doc(mlt_doc))

        return mlt_dict, hexdigests

    def _process_doc(self, doc):
        ui_doc = dict(doc)

        if doc.get('product'):
            env = ProductEnvironment(self.env, doc['product'])
            product_href = ProductEnvironment.resolve_href(env, self.env)
            ui_doc["href"] = product_href(doc['type'], doc['id'])
        else:
            ui_doc["href"] = self.env.href(doc['type'], doc['id'])

        ui_doc['title'] = str(doc['type'] + ": " + doc['_stored_name']).title()

        return ui_doc

    def _create_query_result(
                        self, query, results, fields, pagenum, pagelen):
        total_num, total_page_count, page_num, offset = \
                    self._prepare_query_result_attributes(query, results,
                                                          pagenum, pagelen)

        query_results = QueryResult()
        query_results.hits = total_num
        query_results.total_page_count = total_page_count
        query_results.page_number = page_num
        query_results.offset = offset

        docs = []
        highlighting = []

        for retrieved_record in results:
            result_doc = self._process_record(fields, retrieved_record)
            docs.append(result_doc)

            result_highlights = dict(retrieved_record['solr_highlights'])

            highlighting.append(result_highlights)

        query_results.docs = docs
        query_results.highlighting = highlighting

        return query_results

    def _create_query_chain(self, query, query_string):
        matches = re.findall(re.compile(r'([\w\*]+)'), query_string)
        tokens = set([match for match in matches])

        final_query_chain = None
        for token in tokens:
            token_query_chain = self._search_fields_for_token(token)
            if final_query_chain is None:
                final_query_chain = token_query_chain
            else:
                final_query_chain |= token_query_chain

        return final_query_chain

    def _process_record(self, fields, retrieved_record):
        result_doc = dict()
        if fields:
            for field in fields:
                if field in retrieved_record:
                    result_doc[field] = retrieved_record[field]
        else:
            for key, value in retrieved_record.items():
                result_doc[key] = value

        for key, value in result_doc.iteritems():
            result_doc[key] = self._from_whoosh_format(value)

        return result_doc

    def _from_whoosh_format(self, value):
        if isinstance(value, datetime):
            value = utc.localize(value)
        return value

    def _prepare_query_result_attributes(
                                    self, query, results, pagenum, pagelen):
        results_total_num = query.execute().result.numFound
        total_page_count = int(ceil(float(results_total_num) / pagelen))
        pagenum = min(total_page_count, pagenum)

        offset = (pagenum-1) * pagelen
        if (offset+pagelen) > results_total_num:
            pagelen = results_total_num - offset

        return results_total_num, total_page_count, pagenum, offset

    def is_index_outdated(self):
        return False

    def recreate_index(self):
        return True

    @contextmanager
    def start_operation(self):
        yield

    def _search_fields_for_token(self, token):
        q_chain = None
        field_boosts = DefaultQueryParser(self.env).field_boosts

        for field, boost in field_boosts.iteritems():
            if field != 'query_suggestion_basket' and field != 'relations':
                field_token_dict = {field: token}
                if q_chain is None:
                    q_chain = self.solr_interface.Q(**field_token_dict)**boost
                else:
                    q_chain |= self.solr_interface.Q(**field_token_dict)**boost

        return q_chain

    def _reformat_doc(self, doc):
        for key, value in doc.items():
            if key is None:
                del doc[None]
            elif value is None:
                del doc[key]
            elif isinstance(value, basestring) and value == "":
                del doc[key]
            else:
                doc[key] = self._to_whoosh_format(value)

    def _to_whoosh_format(self, value):
        if isinstance(value, basestring):
            value = unicode(value)
        elif isinstance(value, datetime):
            value = self._convert_date_to_tz_naive_utc(value)
        return value

    def _convert_date_to_tz_naive_utc(self, value):
        if value.tzinfo:
            utc_time = value.astimezone(utc)
            value = utc_time.replace(tzinfo=None)
        return value

    def _create_unique_id(self, product, doc_type, doc_id):
        if product:
            return u"%s:%s:%s" % (product, doc_type, doc_id)
        else:
            return u"%s:%s" % (doc_type, doc_id)