def vector_as(self, docnum, fieldnum, astype):
    """Yield ``(text, value)`` pairs for one document's term vector.

    Every ``(text, data)`` entry produced by ``self.vector(docnum, fieldnum)``
    is re-emitted with ``data`` passed through the interpreter that the
    field's vector format returns for ``astype``.

    This is a generator, so the configuration checks below only run once
    iteration starts.

    :param docnum: document number whose vector is read.
    :param fieldnum: field number whose vector is read.
    :param astype: name of the interpretation to apply to the raw data.
    :raises FieldConfigurationError: if the field has no vector format, or
        its vector format does not support ``astype``.
    """
    vformat = self.vector_format(fieldnum)

    # Guard clauses: reject fields that cannot produce the requested view.
    if vformat is None:
        fieldname = self.schema.number_to_name(fieldnum)
        raise FieldConfigurationError("Field %r is not vectored" % fieldname)

    if not vformat.supports(astype):
        fieldname = self.schema.number_to_name(fieldnum)
        raise FieldConfigurationError(
            "Field %r does not support %r" % (fieldname, astype))

    decode = vformat.interpreter(astype)
    for term_text, raw in self.vector(docnum, fieldnum):
        yield (term_text, decode(raw))
def postings_as(self, fieldnum, text, astype, exclude_docs=None):
    """Yield interpreted posting data for each document containing a term.

    Each ``(docnum, data)`` pair produced by ``self.postings(...)`` is
    re-emitted as ``(docnum, value)``, where ``value`` is ``data`` passed
    through the interpreter the field's format returns for ``astype``.

    This is a generator, so the configuration check below only runs once
    iteration starts.

    :param fieldnum: number of the field the term belongs to.
    :param text: the term text.
    :param astype: how to interpret the posting data, for example
        "positions". The field's format must support the interpretation.
    :param exclude_docs: a set of document numbers to ignore; passed
        through unchanged to ``self.postings``. This is used by queries to
        skip documents that have already been eliminated from
        consideration.
    :raises FieldConfigurationError: if the field's format does not support
        ``astype``.
    """
    format = self.schema.field_by_number(fieldnum).format
    if not format.supports(astype):
        # ``fieldnum`` is a field *number*, so it has to be translated with
        # number_to_name() to report the field's name in the message.
        raise FieldConfigurationError(
            "Field %r format does not support %r"
            % (self.schema.number_to_name(fieldnum), astype))

    interp = format.interpreter(astype)
    for docnum, data in self.postings(fieldnum, text,
                                      exclude_docs=exclude_docs):
        yield (docnum, interp(data))
def doc_field_length(self, docnum, fieldid):
    """Look up the stored length of a field in a document.

    ``fieldid`` is first normalised with ``self.schema.to_number``. The
    resulting field number selects a position in
    ``self._fieldnum_to_scorable_pos``, and the value returned is whatever
    ``self.doclengths.at(docnum, position)`` yields.

    :param docnum: document number to look up.
    :param fieldid: field identifier accepted by ``schema.to_number``.
    :raises FieldConfigurationError: if the field number is not listed in
        ``self._scorable_fields``.
    """
    fieldnum = self.schema.to_number(fieldid)

    if fieldnum in self._scorable_fields:
        column = self._fieldnum_to_scorable_pos[fieldnum]
        return self.doclengths.at(docnum, column)

    raise FieldConfigurationError(
        "Field %r does not store lengths" % fieldnum)
def doc_field_length(self, docnum, fieldid):
    """Return the number of terms in the given field of the given document.

    Some scoring algorithms use this value. ``fieldid`` is normalised with
    ``self.schema.to_number``; the field number is then mapped to a position
    through ``self._fieldnum_to_pos`` and the result is read with
    ``self.doclength_table.get(docnum, position)``.

    :param docnum: document number to look up.
    :param fieldid: field identifier accepted by ``schema.to_number``.
    :raises FieldConfigurationError: if the field number is not listed in
        ``self._scorable_fields``.
    """
    fieldnum = self.schema.to_number(fieldid)

    if fieldnum in self._scorable_fields:
        column = self._fieldnum_to_pos[fieldnum]
        return self.doclength_table.get(docnum, column)

    raise FieldConfigurationError(
        "Field %r does not store lengths" % fieldnum)
def add(self, name, fieldtype, glob=False):
    """Add a field, and every subfield it declares, to this schema.

    The names of all subfields reported by ``fieldtype.subfields()`` are
    recorded in ``self._subfields[name]``. Each subfield is then stored in
    ``self._fields`` or, when ``glob`` is true, in ``self._dyn_fields``
    together with a compiled pattern built from ``name``.

    :param name: the field name. When ``glob`` is true it is treated as a
        shell-style pattern and translated with ``fnmatch.translate``.
    :param fieldtype: a ``FieldType`` instance, or a class, which is called
        with no arguments to create the instance.
    :param glob: if true, register the field as a dynamic (pattern) field
        instead of a regular one.
    :raises FieldConfigurationError: if the class cannot be instantiated,
        the object is not a ``FieldType``, a field name starts with ``_``
        or contains a space, or the name is already in the schema.
    """
    # If the user passed a type rather than an instantiated field object,
    # instantiate it automatically. isinstance() is used instead of an
    # exact ``type(x) is type`` comparison so that classes created by a
    # custom metaclass (e.g. abstract base classes) are recognised too.
    if isinstance(fieldtype, type):
        try:
            fieldtype = fieldtype()
        except Exception:
            e = sys.exc_info()[1]
            raise FieldConfigurationError("Error: %s instantiating field "
                                          "%r: %r" % (e, name, fieldtype))

    if not isinstance(fieldtype, FieldType):
        raise FieldConfigurationError("%r is not a FieldType object"
                                      % fieldtype)

    self._subfields[name] = sublist = []
    for suffix, subfield in fieldtype.subfields():
        # An empty suffix means the subfield is stored under ``name`` itself.
        fname = name + "." + suffix if suffix else name
        sublist.append(fname)

        # Check field name
        if fname.startswith("_"):
            raise FieldConfigurationError("Names cannot start with _")
        elif " " in fname:
            raise FieldConfigurationError("Names cannot contain spaces")
        elif fname in self._fields or (glob and fname in self._dyn_fields):
            raise FieldConfigurationError("%r already in schema" % fname)

        # Add the field
        if isinstance(subfield, DICT):
            # DICT subfields are registered through a recursive call under
            # the dotted name; note that ``glob`` is not forwarded.
            self.add(fname, subfield)
            continue

        if glob:
            # The pattern is compiled from the name passed in, not from
            # the per-subfield name.
            expr = re.compile(fnmatch.translate(name))
            self._dyn_fields[fname] = (expr, subfield)
        else:
            subfield.on_add(self, fname)
            self._fields[fname] = subfield
def vector_as(self, docnum, fieldnum, astype):
    """Yield interpreted ``(text, data)`` tuples for a term vector.

    The tuples represent the term vector of the given document and field.
    The raw data of each entry is converted with the callable returned by
    the vector format's ``interpreter(astype)``. For example, if the format
    supports a "positions" interpretation, ``vector_as(x, y, "positions")``
    yields a positions vector.

    This is a generator, so the configuration checks only run once
    iteration starts.

    :param docnum: document number whose vector is read.
    :param fieldnum: field number whose vector is read.
    :param astype: name of the interpretation to apply.
    :raises FieldConfigurationError: if the field is not vectored, or its
        vector format does not support ``astype``.
    """
    fmt = self.vector_format(fieldnum)

    usable = fmt is not None and fmt.supports(astype)
    if not usable:
        # Both failure modes report the field by name.
        fieldname = self.schema.number_to_name(fieldnum)
        if fmt is None:
            raise FieldConfigurationError(
                "Field %r is not vectored" % fieldname)
        raise FieldConfigurationError(
            "Field %r does not support %r" % (fieldname, astype))

    convert = fmt.interpreter(astype)
    for word, payload in self.vector(docnum, fieldnum):
        yield (word, convert(payload))