def _select(self, params):
    """Run a query against solr's /select handler.

    Returns the raw HTTPResponse so the caller can stream/parse it.
    """
    # urlencode can't handle raw unicode, so coerce the query string first
    params['q'] = unicode_safe(params['q'])
    url = '%s/select/?%s' % (self.path, urlencode(params))
    connection = HTTPConnection(self.host, self.port)
    connection.request('GET', url)
    return connection.getresponse()
def fetch_article(article, titlefield, linkfield, niceify=False):
    """Extract a validated (title, link) pair from a feed entry.

    article    -- parsed feed entry; anything that isn't a dict is rejected
    titlefield -- key under which the entry's title is stored
    linkfield  -- key under which the entry's link is stored
    niceify    -- optional argument passed through to niceify_title()

    Returns dict(title=..., link=...) on success, or None if the entry
    is malformed, missing either field, or fails validation.
    """
    # Guard clauses instead of nesting; test keys directly on the dict
    # rather than materializing article.keys() into a list first.
    if not isinstance(article, dict):
        return None
    if titlefield not in article or linkfield not in article:
        return None
    title = unicode_safe(validate_title(article[titlefield]))
    link = validate_link(article[linkfield])
    if niceify:
        title = niceify_title(title, niceify)
        # single-arg parenthesized print works identically on py2 and py3
        print("Niced title: %s" % title)
    if title and link:
        return dict(title=title, link=link)
    return None
def _from_python(value): """ Converts python values to a form suitable for insertion into the xml we send to solr. """ if isinstance(value, datetime): value = value.strftime('%Y-%m-%dT%H:%M:%S.000Z') elif isinstance(value, date): value = value.strftime('%Y-%m-%dT00:00:00.000Z') elif isinstance(value, bool): if value: value = 'true' else: value = 'false' else: value = unicode_safe(value) return value
def changed(types=None, since=None, commit=True, optimize=False):
    """
    Run by `cron` (through `paster run`) on a schedule to update all
    Things that have been created or have changed since the last run.
    Things add themselves to a `thing_changes` table, which we read,
    find the Things, tokenise, and re-submit them to Solr
    """
    # (the pointless `global indexed_types` was dropped: this function
    # only reads the module-level name, never rebinds it)
    set_emptying_cache()
    start_t = datetime.now()
    if not types:
        types = indexed_types
    if not since:
        since = get_last_run()
    all_changed = []
    for cls in types:
        # get_changed rows =:= [(Fullname, Date) | ...]; only fullnames needed
        fullnames = set(x[0] for x in thing_changes.get_changed(cls, min_date=since))
        things = cls._by_fullname(fullnames, data=True, return_dict=False)
        # note: anything marked as spam or deleted is not updated in
        # the search database. Since these are filtered out in the UI,
        # that's probably fine.
        things = [t for t in things if not t._spam and not t._deleted]
        if things:
            tokenized = tokenize_things(things)
            # single-arg parenthesized print is py2/py3 compatible
            print("Found %d %ss starting with %s"
                  % (len(tokenized), cls.__name__,
                     unicode_safe(tokenized[0]['contents'])))
            all_changed += tokenized
        else:
            print("No changed %ss detected" % (cls.__name__,))
    with SolrConnection(commit=commit, optimize=optimize) as s:
        s.add(all_changed)
    save_last_run(start_t)
Thing: (Field('fullname', '_fullname'), Field('date', '_date', is_date=True, reverse=True), Field('lang'), Field('ups', '_ups', is_number=True, reverse=True), Field('downs', '_downs', is_number=True, reverse=True), Field('spam', '_spam'), Field('deleted', '_deleted'), Field('hot', lambda t: t._hot * 1000, is_number=True, reverse=True), Field('controversy', '_controversy', is_number=True, reverse=True), Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)), Subreddit: ( Field('contents', lambda s: ' '.join([ unicode_safe(s.name), unicode_safe(s.title), unicode_safe(s.description), unicode_safe(s.firsttext) ]), tokenize=True), Field('boost', '_downs'), #Field('title'), #Field('firsttext'), #Field('description'), #Field('over_18'), #Field('sr_type','type'), ), Link: ( Field('contents', 'title', tokenize=True), Field(
def str_to_python(self, value):
    """Decode an 'str' field from solr's xml response into a python string."""
    decoded = unicode_safe(value)
    return decoded
# discussion of multi-language search. The 'boost' field is a # solr-magic field that ends up being an attribute on the <doc> # message (rather than a field), and is used to do an index-time boost # (this magic is done in pysolr.dor_to_elemtree) search_fields={Thing: (Field('fullname', '_fullname'), Field('date', '_date', is_date = True, reverse=True), Field('lang'), Field('ups', '_ups', is_number=True, reverse=True), Field('downs', '_downs', is_number=True, reverse=True), Field('spam','_spam'), Field('deleted','_deleted'), Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True), Field('controversy', '_controversy', is_number=True, reverse=True), Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)), Subreddit: (Field('contents', lambda s: ' '.join([unicode_safe(s.name), unicode_safe(s.title), unicode_safe(s.description), unicode_safe(s.firsttext)]), tokenize = True), Field('boost', '_downs'), #Field('title'), #Field('firsttext'), #Field('description'), #Field('over_18'), #Field('sr_type','type'), ), Link: (Field('contents','title', tokenize = True), Field('boost', lambda t: int(t._hot*1000), # yes, it's a copy of 'hot' is_number=True, reverse=True),
return ("<ThingField: (%s,%s,%s,%s)>" % (self.name,self.cls,self.id_attr,self.lu_attr_name)) search_fields={Thing: (Field('fullname', '_fullname'), Field('date', '_date', is_date = True, reverse=True), Field('lang'), Field('ups', '_ups', is_number=True, reverse=True), Field('downs', '_downs', is_number=True, reverse=True), Field('spam','_spam'), Field('deleted','_deleted'), Field('hot', lambda t: t._hot*1000, is_number=True, reverse=True), Field('controversy', '_controversy', is_number=True, reverse=True), Field('points', lambda t: (t._ups - t._downs), is_number=True, reverse=True)), Subreddit: (Field('contents', lambda s: ' '.join([unicode_safe(s.name), unicode_safe(s.title), unicode_safe(s.description), unicode_safe(s.firsttext)]), tokenize = True), Field('boost', '_downs'), #Field('title'), #Field('firsttext'), #Field('description'), #Field('over_18'), #Field('sr_type','type'), ), Link: (Field('contents','title', tokenize = True), Field('boost', lambda t: int(t._hot*1000), # yes, it's a copy of 'hot' is_number=True, reverse=True),