def iter_facet_query_types(self, association):
    """Get an iterator over query types and their associated facets.

    Only facets associated with the query types in the specified manner
    are returned; `association` must be one of
    IndexerConnection.FacetQueryType_Preferred or
    IndexerConnection.FacetQueryType_Never.

    The iterator returns 2-tuples, in which the first item is the query
    type and the second item is the associated set of facets.

    The return values are suitable for the dict() builtin, for example:

    >>> conn = IndexerConnection('db')
    >>> conn.add_field_action('foo', FieldActions.FACET)
    >>> conn.add_field_action('bar', FieldActions.FACET)
    >>> conn.add_field_action('baz', FieldActions.FACET)
    >>> conn.set_facet_for_query_type('type1', 'foo', conn.FacetQueryType_Preferred)
    >>> conn.set_facet_for_query_type('type1', 'bar', conn.FacetQueryType_Never)
    >>> conn.set_facet_for_query_type('type1', 'baz', conn.FacetQueryType_Never)
    >>> conn.set_facet_for_query_type('type2', 'bar', conn.FacetQueryType_Preferred)
    >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Preferred))
    {'type1': set(['foo']), 'type2': set(['bar'])}
    >>> dict(conn.iter_facet_query_types(conn.FacetQueryType_Never))
    {'type1': set(['bar', 'baz'])}

    Raises IndexerError if the connection has been closed, or if the
    xapian release in use lacks facet support.
    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Facets need a sufficiently recent xapian; refuse early otherwise.
    if 'facets' in _checkxapian.missing_features:
        raise errors.IndexerError(
            "Facets unsupported with this release of xapian")
    # Iterate the in-memory query-type -> {facet: association} table.
    return FacetQueryTypeIter(self._facet_query_table, association)
def get_document(self, id):
    """Get the document with the specified unique ID.

    Raises a KeyError if there is no such document.  Otherwise, it
    returns a ProcessedDocument.
    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Unique IDs are indexed as terms with the 'Q' prefix; the posting
    # list for that term yields the matching document(s).
    postlist = self._index.postlist('Q' + id)
    try:
        plitem = postlist.next()
    except StopIteration:
        # Unique ID not found
        raise KeyError('Unique ID %r not found' % id)
    try:
        # A second posting would mean the "unique" ID is duplicated -
        # that indicates index corruption, so fail loudly.
        postlist.next()
        raise errors.IndexerError("Multiple documents " #pragma: no cover
                                   "found with same unique ID")
    except StopIteration:
        # Only one instance of the unique ID found, as it should be.
        pass
    result = ProcessedDocument(self._field_mappings)
    result.id = id
    result._doc = self._index.get_document(plitem.docid)
    return result
def replace(self, document):
    """Replace a document in the search engine index.

    The supplied document must have an id set; otherwise an IndexerError
    is raised.  If no document with that id exists in the database yet,
    this behaves exactly like add().
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")

    # An UnprocessedDocument has no `_doc` attribute; process it first.
    if not hasattr(document, '_doc'):
        document = self.process(document)

    doc_id = document.id
    if doc_id is None:
        raise errors.IndexerError(
            "No document ID set for document supplied to replace().")

    prepared = document.prepare()
    self._index.replace_document('Q' + doc_id, prepared)

    # Track the approximate buffered memory and flush once it exceeds
    # the configured limit (see set_max_mem_use()).
    if self._max_mem is not None:
        self._mem_buffered += self._get_bytes_used_by_doc_terms(prepared)
        if self._mem_buffered > self._max_mem:
            self.flush()
def set_metadata(self, key, value):
    """Set an item of metadata stored in the connection.

    The value supplied will be returned by subsequent calls to
    get_metadata() which use the same key.

    Keys with a leading underscore are reserved for internal use - you
    should not use such keys unless you really know what you are doing.

    This will store the value supplied in the database.  It will not be
    visible to readers (ie, search connections) until after the next
    flush.

    The key is limited to about 200 characters (the same length as a
    term is limited to).  The value can be several megabytes in size.

    To remove an item of metadata, simply call this with a `value`
    parameter containing an empty string.
    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Metadata support is missing from older xapian releases; refuse
    # loudly rather than silently dropping the write.
    if not hasattr(self._index, 'set_metadata'):
        raise errors.IndexerError(
            "Version of xapian in use does not support metadata")
    # `log` wraps the xapian call (presumably for debug tracing -
    # see its definition elsewhere in this module).
    log(self._index.set_metadata, key, value)
def set_facet_for_query_type(self, query_type, facet, association):
    """Set the association between a query type and a facet.

    `association` must be IndexerConnection.FacetQueryType_Preferred,
    IndexerConnection.FacetQueryType_Never, or None; a value of None
    removes any previously set association.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    if query_type is None:
        raise errors.IndexerError(
            "Cannot set query type information for None")
    # Refuse fields which are not declared as facets.
    self._assert_facet(facet)

    table = self._facet_query_table
    entry = table.setdefault(query_type, {})
    if association is None:
        # Drop any stored association for this facet (no-op if absent).
        entry.pop(facet, None)
    else:
        entry[facet] = association
    # Never keep an empty per-query-type mapping around.
    if not entry:
        del table[query_type]
    self._config_modified = True
def get_metadata(self, key):
    """Get an item of metadata stored in the connection.

    This returns a value stored by a previous call to set_metadata.

    If the value is not found, this will return the empty string.
    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Metadata support is missing from older xapian releases.
    if not hasattr(self._index, 'get_metadata'):
        raise errors.IndexerError(
            "Version of xapian in use does not support metadata")
    # `log` wraps the xapian call (presumably for debug tracing) and
    # returns its result.
    return log(self._index.get_metadata, key)
def get_subfacets(self, facet):
    """Return a list of the facets which have `facet` as their parent.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # The hierarchy maps subfacet -> parent; collect the children.
    children = []
    for child, parent in self._facet_hierarchy.iteritems():
        if parent == facet:
            children.append(child)
    return children
def iter_synonyms(self, prefix=""):
    """Get an iterator over the synonyms.

     - `prefix`: if specified, only synonym keys with this prefix will
       be returned.

    The iterator returns 2-tuples, in which the first item is the key
    (ie, a 2-tuple holding the term or terms which will be synonym
    expanded, followed by the fieldname specified (or None if no
    fieldname)), and the second item is a tuple of strings holding the
    synonyms for the first item.

    These return values are suitable for the dict() builtin, so you can
    write things like:

    >>> conn = IndexerConnection('foo')
    >>> conn.add_synonym('foo', 'bar')
    >>> conn.add_synonym('foo bar', 'baz')
    >>> conn.add_synonym('foo bar', 'foo baz')
    >>> dict(conn.iter_synonyms())
    {('foo', None): ('bar',), ('foo bar', None): ('baz', 'foo baz')}

    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # SynonymIter needs the field mappings to translate stored keys back
    # into (terms, fieldname) tuples.
    return SynonymIter(self._index, self._field_mappings, prefix)
def add_synonym(self, original, synonym, field=None,
                original_field=None, synonym_field=None):
    """Add a synonym to the index.

    - `original` is the word or words which will be synonym expanded in
      searches (if multiple words are specified, each word should be
      separated by a single space).
    - `synonym` is a synonym for `original`.
    - `field` is the field which the synonym is specific to.  If no
      field is specified, the synonym will be used for searches which
      are not specific to any particular field.
    - `original_field` and `synonym_field` default to `field` when not
      given, and allow the two sides to use different fields.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Fall back to the shared `field` for either side not supplied
    # explicitly.
    src_field = original_field if original_field is not None else field
    dst_field = synonym_field if synonym_field is not None else field
    # FIXME - this only works for exact fields which have no upper case
    # characters, or single words
    self._index.add_synonym(
        self._make_synonym_key(original, src_field),
        self._make_synonym_key(synonym, dst_field))
def process(self, document):
    """Process an UnprocessedDocument with the settings in this database.

    The resulting ProcessedDocument is returned.

    Note that this processing will be automatically performed if an
    UnprocessedDocument is supplied to the add() or replace() methods of
    IndexerConnection.  This method is exposed to allow the processing
    to be performed separately, which may be desirable if you wish to
    manually modify the processed document before adding it to the
    database, or if you want to split processing of documents from
    adding documents to the database for performance reasons.
    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    result = ProcessedDocument(self._field_mappings)
    result.id = document.id
    # Shared context handed to every field action for this document.
    context = ActionContext(self._index)
    for field in document.fields:
        try:
            actions = self._field_actions[field.name]
        except KeyError:
            # If no actions are defined, just ignore the field.
            continue
        # Apply all configured actions for this field to the result.
        actions.perform(result, field.value, context)
    return result
def get_fields_with_actions(self):
    """Return the names of all fields which have actions defined."""
    index = self._index
    if index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    return self._field_actions.keys()
def _assert_facet(self, facet):
    """Raise an IndexerError unless `facet` is declared as a facet field.

    Also raises KeyError when `facet` has no field actions at all.
    """
    declared = self._field_actions[facet]._actions
    # Membership test replaces the original explicit equality scan.
    if FieldActions.FACET not in declared:
        raise errors.IndexerError("Field %r is not indexed as a facet"
                                  % facet)
def remove_subfacet(self, subfacet):
    """Remove any existing facet hierarchy relationship for a subfacet.

    Does nothing (and marks nothing modified) if the subfacet has no
    parent recorded.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # EAFP: attempt the removal; an absent key means nothing to do.
    try:
        del self._facet_hierarchy[subfacet]
    except KeyError:
        return
    self._config_modified = True
def iterids(self):
    """Get an iterator which returns all the ids in the database.

    The unique_ids are currently returned in binary lexicographical
    sort order, but this should not be relied on.
    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Unique IDs are stored as terms with the 'Q' prefix;
    # PrefixedTermIter presumably filters allterms() down to those and
    # yields the IDs - see its definition elsewhere in this module.
    return PrefixedTermIter('Q', self._index.allterms())
def get_doccount(self):
    """Count the number of documents in the database.

    This count will include documents which have been added or removed
    but not yet flushed().
    """
    index = self._index
    if index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    return index.get_doccount()
def add(self, document):
    """Add a new document to the search engine index.

    If the document has a id set, and the id already exists in the
    database, an exception will be raised.  Use the replace() method
    instead if you wish to overwrite documents.

    Returns the id of the newly added document (making up a new unique
    ID if no id was set).

    The supplied document may be an instance of UnprocessedDocument, or
    an instance of ProcessedDocument.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    if not hasattr(document, '_doc'):
        # It's not a processed document.
        document = self.process(document)

    # Ensure that we have a id
    orig_id = document.id
    if orig_id is None:
        # Make up a new unique ID and temporarily assign it so that
        # prepare() below indexes it; the caller's None is restored at
        # the end.
        id = self._allocate_id()
        document.id = id
    else:
        id = orig_id
        if self._index.term_exists('Q' + id):
            raise errors.IndexerError(
                "Document ID of document supplied to add() is not unique.")

    # Add the document.
    xapdoc = document.prepare()
    self._index.add_document(xapdoc)

    # Approximate memory accounting: flush once the buffered terms
    # exceed the configured limit (see set_max_mem_use()).
    if self._max_mem is not None:
        self._mem_buffered += self._get_bytes_used_by_doc_terms(xapdoc)
        if self._mem_buffered > self._max_mem:
            self.flush()

    # Identity check: `id` is the very same object as `orig_id` unless
    # we allocated a fresh one above; restore the caller's original
    # (None) value in that case.
    if id is not orig_id:
        document.id = orig_id
    return id
def delete(self, id):
    """Delete a document from the search engine index.

    If the id does not already exist in the database, this method will
    have no effect (and will not report an error).
    """
    index = self._index
    if index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Documents are addressed by their 'Q'-prefixed unique-ID term.
    index.delete_document('Q' + id)
def set_max_mem_use(self, max_mem=None, max_mem_proportion=None):
    """Set the maximum memory to use.

    This call allows the amount of memory to use to buffer changes to be
    set.  This will affect the speed of indexing, but should not result
    in other changes to the indexing.

    Note: this is an approximate measure - the actual amount of memory
    used may exceed the specified amount.  Also, note that future
    versions of xapian are likely to implement this differently, so this
    setting may be entirely ignored.

    The absolute amount of memory to use (in bytes) may be set by
    setting max_mem.  Alternatively, the proportion of the available
    memory may be set by setting max_mem_proportion (this should be a
    value between 0 and 1).

    Setting too low a value will result in excessive flushing, and very
    slow indexing.  Setting too high a value will result in excessive
    buffering, leading to swapping, and very slow indexing.

    A reasonable default for max_mem_proportion for a system which is
    dedicated to indexing is probably 0.5: if other tasks are also being
    performed on the system, the value should be lowered.

    Calling with neither parameter removes the limit entirely.

    Raises IndexerError if the connection is closed or if both
    parameters are supplied at once.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    if max_mem is not None and max_mem_proportion is not None:
        raise errors.IndexerError("Only one of max_mem and "
                                  "max_mem_proportion may be specified")
    if max_mem_proportion is not None:
        physmem = memutils.get_physical_memory()
        if physmem is not None:
            max_mem = int(physmem * max_mem_proportion)
        # NOTE(review): if the physical memory size cannot be
        # determined, max_mem stays None and buffering becomes
        # unlimited - confirm this silent fallback is intended.
    # A single assignment covers every case: an explicit max_mem, a
    # proportion converted above, or None meaning "no limit".  (The
    # previous version also assigned None in a separate branch first,
    # which was a dead store - always overwritten here.)
    self._max_mem = max_mem
def clear_synonyms(self, original, field=None):
    """Remove all synonyms for a word (or phrase).

    - `field` is the field which this synonym is specific to.  If no
      field is specified, the synonym will be used for searches which
      are not specific to any particular field.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # The synonym key encodes both the word(s) and the field.
    self._index.clear_synonyms(self._make_synonym_key(original, field))
def add_term(self, field, term, wdfinc=1, positions=None):
    """Add a term to the document.

    Terms are the main unit of information used for performing searches.

    - `field` is the field to add the term to.
    - `term` is the term to add.
    - `wdfinc` is the value to increase the within-document-frequency
      measure for the term by.
    - `positions` is the positional information to add for the term:
      None (no positions), a single integer, or a sequence of integers.
      Note that the wdf is increased by `wdfinc` once in total, not once
      per position.
    """
    prefix = self._fieldmappings.get_prefix(field)
    # Match the queryparser's capitalisation check, independent of the
    # locale: a term starting with an ASCII capital gets a ':' appended
    # to the prefix.
    if len(term) > 0 and 'A' <= term[0] <= 'Z':
        prefix = prefix + ':'

    # Xapian restricts terms to about 248 characters (zero bytes are
    # encoded in two bytes, so a 125-character term could already be too
    # long), and only reports the problem when commit() is called.  As a
    # simple workaround, reject terms over 220 characters here so most
    # occurrences of the error are caught early.  A future improvement
    # could hash over-long terms instead (mirrored at search time).
    full_term = prefix + term
    if len(full_term) > 220:
        raise errors.IndexerError("Field %r is too long: maximum length "
                                  "220 - was %d (%r)" %
                                  (field, len(full_term), full_term))

    if positions is None:
        self._doc.add_term(full_term, wdfinc)
    elif isinstance(positions, int):
        self._doc.add_posting(full_term, positions, wdfinc)
    else:
        # Sequence of positions: bump the wdf once, then record every
        # position with a zero wdf increment.
        self._doc.add_term(full_term, wdfinc)
        for position in positions:
            self._doc.add_posting(full_term, position, 0)
def flush(self):
    """Apply recent changes to the database.

    If an exception occurs, any changes since the last call to flush()
    may be lost.
    """
    index = self._index
    if index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Persist any modified field/facet configuration alongside the data.
    if self._config_modified:
        self._store_config()
    index.flush()
    # Everything buffered has now been written out.
    self._mem_buffered = 0
def add_subfacet(self, subfacet, facet):
    """Record `subfacet` as a child of `facet` in the facet hierarchy.

    Any existing relationship for that subfacet is replaced.

    Raises a KeyError if either facet or subfacet is not a field, and an
    IndexerError if either facet or subfacet is not a facet field.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Both ends of the relationship must be declared facet fields.
    self._assert_facet(facet)
    self._assert_facet(subfacet)
    self._facet_hierarchy[subfacet] = facet
    self._config_modified = True
def clear_field_actions(self, fieldname):
    """Clear all actions for the specified field.

    This does not report an error if there are already no actions for
    the specified field.

    Note that this change to the configuration will not be preserved on
    disk until the next call to flush().
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    actions_map = self._field_actions
    if fieldname in actions_map:
        del actions_map[fieldname]
        # Only mark the config dirty when something actually changed.
        self._config_modified = True
def iter_subfacets(self):
    """Get an iterator over the facet hierarchy.

    The iterator returns 2-tuples, in which the first item is the
    subfacet and the second item is its parent facet.

    The return values are suitable for the dict() builtin, for example:

    >>> conn = IndexerConnection('db')
    >>> conn.add_field_action('foo', FieldActions.FACET)
    >>> conn.add_field_action('bar', FieldActions.FACET)
    >>> conn.add_field_action('baz', FieldActions.FACET)
    >>> conn.add_subfacet('foo', 'bar')
    >>> conn.add_subfacet('baz', 'bar')
    >>> dict(conn.iter_subfacets())
    {'foo': 'bar', 'baz': 'bar'}

    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Facet support requires a sufficiently recent xapian.
    if 'facets' in _checkxapian.missing_features:
        raise errors.IndexerError(
            "Facets unsupported with this release of xapian")
    # The hierarchy maps subfacet -> parent facet.
    return self._facet_hierarchy.iteritems()
def remove_synonym(self, original, synonym, field=None):
    """Remove a synonym from the index.

    - `original` is the word or words which will be synonym expanded in
      searches (if multiple words are specified, each word should be
      separated by a single space).
    - `synonym` is a synonym for `original`.
    - `field` is the field which this synonym is specific to.  If no
      field is specified, the synonym will be used for searches which
      are not specific to any particular field.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    # Lowercase the synonym before removal, matching the key-building
    # convention (see the FIXME note in add_synonym).
    self._index.remove_synonym(self._make_synonym_key(original, field),
                               synonym.lower())
def add_field_action(self, fieldname, fieldtype, **kwargs):
    """Add an action to be performed on a field.

    Note that this change to the configuration will not be preserved on
    disk until the next call to flush().
    """
    # Guard: the connection must still be open.
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    if fieldname in self._field_actions:
        actions = self._field_actions[fieldname]
    else:
        # First action for this field: create its FieldActions holder.
        actions = FieldActions(fieldname)
        self._field_actions[fieldname] = actions
    # Validation of `fieldtype` and `kwargs` is performed inside add().
    actions.add(self._field_mappings, fieldtype, **kwargs)
    # Remember to persist the new configuration at the next flush().
    self._config_modified = True
def get_facets_for_query_type(self, query_type, association):
    """Get the set of facets associated with a query type.

    Only those facets associated with the query type in the specified
    manner are returned; `association` must be one of
    IndexerConnection.FacetQueryType_Preferred or
    IndexerConnection.FacetQueryType_Never.

    If the query type has no facets associated with it, None is
    returned.
    """
    if self._index is None:
        raise errors.IndexerError("IndexerConnection has been closed")
    try:
        associations = self._facet_query_table[query_type]
    except KeyError:
        # Unknown query type: no facet information at all.
        return None
    matching = set()
    for name, assoc in associations.iteritems():
        if assoc == association:
            matching.add(name)
    return matching
def add(self, field_mappings, action, **kwargs):
    """Add an action to perform on a field.

    `action` must be one of the FieldActions constants; `kwargs` carries
    the action's parameters and is validated against the parameter names
    registered for the action in `_action_info`.

    Raises IndexerError for unsupported or unknown actions, unknown
    parameter names, or conflicting combinations of actions.
    """
    # Some actions are unavailable with older xapian releases.
    if action in self._unsupported_actions:
        raise errors.IndexerError(
            "Action unsupported with this release of xapian")
    if action not in (FieldActions.STORE_CONTENT,
                      FieldActions.INDEX_EXACT,
                      FieldActions.INDEX_FREETEXT,
                      FieldActions.SORTABLE,
                      FieldActions.COLLAPSE,
                      FieldActions.TAG,
                      FieldActions.FACET,
                      ):
        raise errors.IndexerError("Unknown field action: %r" % action)

    # Per-action metadata: info[0] is the action's name (used in error
    # messages), info[1] the allowed parameter names, info[3] the
    # prefix/slot requirements (used further below).
    info = self._action_info[action]

    # Check parameter names
    for key in kwargs.keys():
        if key not in info[1]:
            raise errors.IndexerError(
                "Unknown parameter name for action %r: %r" %
                (info[0], key))

    # Fields cannot be indexed both with "EXACT" and "FREETEXT": whilst we
    # could implement this, the query parser wouldn't know what to do with
    # searches.
    if action == FieldActions.INDEX_EXACT:
        if FieldActions.INDEX_FREETEXT in self._actions:
            raise errors.IndexerError(
                "Field %r is already marked for indexing "
                "as free text: cannot mark for indexing "
                "as exact text as well" % self._fieldname)
    if action == FieldActions.INDEX_FREETEXT:
        if FieldActions.INDEX_EXACT in self._actions:
            raise errors.IndexerError(
                "Field %r is already marked for indexing "
                "as exact text: cannot mark for indexing "
                "as free text as well" % self._fieldname)

    # Fields cannot be indexed as more than one type for "SORTABLE": to
    # implement this, we'd need to use a different prefix for each sortable
    # type, but even then the search end wouldn't know what to sort on when
    # searching.  Also, if they're indexed as "COLLAPSE", the value must be
    # stored in the right format for the type "SORTABLE".
    if action == FieldActions.SORTABLE or action == FieldActions.COLLAPSE:
        if action == FieldActions.COLLAPSE:
            # A plain COLLAPSE carries no sort type of its own.
            sorttype = None
        else:
            try:
                sorttype = kwargs['type']
            except KeyError:
                # Default sort type when the caller supplies none.
                sorttype = 'string'
            kwargs['type'] = sorttype
        # SORTABLE and COLLAPSE are stored as one combined action.
        action = FieldActions.SORT_AND_COLLAPSE

        try:
            oldsortactions = self._actions[FieldActions.SORT_AND_COLLAPSE]
        except KeyError:
            oldsortactions = ()

        if len(oldsortactions) > 0:
            # NOTE(review): only the last old action's type is examined
            # by this loop - confirm at most one sort action is stored.
            for oldsortaction in oldsortactions:
                oldsorttype = oldsortaction['type']

            if sorttype == oldsorttype or oldsorttype is None:
                # Use new type
                self._actions[action] = []
            elif sorttype is None:
                # Use old type
                return
            else:
                raise errors.IndexerError("Field %r is already marked for "
                                           "sorting, with a different "
                                           "sort type" % self._fieldname)

    # Allocate the term prefix and/or value slot(s) the action needs.
    if 'prefix' in info[3]:
        field_mappings.add_prefix(self._fieldname)
    if 'slot' in info[3]:
        purposes = info[3]['slot']
        if isinstance(purposes, basestring):
            field_mappings.add_slot(self._fieldname, purposes)
        else:
            # Several purposes may share one slot: reuse the first slot
            # number already allocated for any of them, if present.
            slotnum = None
            for purpose in purposes:
                slotnum = field_mappings.get_slot(self._fieldname, purpose)
                if slotnum is not None:
                    break
            for purpose in purposes:
                field_mappings.add_slot(self._fieldname, purpose,
                                        slotnum=slotnum)

    # Make an entry for the action
    if action not in self._actions:
        self._actions[action] = []

    # Check for repetitions of actions
    for old_action in self._actions[action]:
        if old_action == kwargs:
            return

    # Append the action to the list of actions
    self._actions[action].append(kwargs)