Esempio n. 1
0
def test_value_stats():
    """Simple test of being able to get value statistics.

    Ten documents are added, each with a sortable-serialised number in
    value slot 1; slots 0 and 2 are never written.  The frequency and the
    lower/upper bounds reported for the three slots are then checked.

    The database is closed and its directory removed even when one of the
    checks fails, so a failing run does not leave files behind.

    """
    dbpath = 'db_test_value_stats'
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OVERWRITE)

    try:
        vals = (6, 9, 4.5, 4.4, 4.6, 2, 1, 4, 3, 0)
        for val in vals:
            doc = xapian.Document()
            doc.add_value(1, xapian.sortable_serialise(val))
            db.add_document(doc)

        # Slot 0: unused.
        expect(db.get_value_freq(0), 0)
        expect(db.get_value_lower_bound(0), "")
        expect(db.get_value_upper_bound(0), "")
        # Slot 1: one value per document, ranging from 0 to 9.
        expect(db.get_value_freq(1), 10)
        expect(db.get_value_lower_bound(1), xapian.sortable_serialise(0))
        expect(db.get_value_upper_bound(1), xapian.sortable_serialise(9))
        # Slot 2: unused.
        expect(db.get_value_freq(2), 0)
        expect(db.get_value_lower_bound(2), "")
        expect(db.get_value_upper_bound(2), "")
    finally:
        db.close()
        shutil.rmtree(dbpath)
Esempio n. 2
0
    def range_query(self, fieldname, value1, value2):
        """Construct a xapian.Query object representing a value range.

        `fieldname` is the field to search.
        `value1` and `value2` define the range, inclusively.

        The values must be of the same type (int, float or datetime).
        Numbers are used as they are; datetimes are first converted to
        timestamps with time.mktime().  Both ends are then encoded with
        xapian.sortable_serialise().

        Raises SearchError when the two values have different types or when
        `fieldname` is not in the fieldmap.  Returns None when the values
        are of any other type.

        """
        if type(value1) is not type(value2):
            raise SearchError('cannot mix types in a query range')

        try:
            # Only the value number is needed here; the unpacking still
            # requires the fieldmap entry to have exactly three items.
            _prefix, valnum, _isfilter = self._fieldmap[fieldname]
        except KeyError:
            raise SearchError('fieldname %s not in fieldmap' % fieldname)

        if isinstance(value1, (int, float)):
            low, high = value1, value2
        elif isinstance(value1, datetime):
            # FIXME - helper terms?  No prefixed date terms are generated
            # yet; the range is matched on the value slot alone.
            low = time.mktime(value1.timetuple())
            high = time.mktime(value2.timetuple())
        else:
            return None

        return xapian.Query(xapian.Query.OP_VALUE_RANGE, valnum,
                            xapian.sortable_serialise(low),
                            xapian.sortable_serialise(high))
Esempio n. 3
0
    def indexDeb822(self, document, pkg):
        """
        Update the document with the information from this data source.

        This is alternative to index, and it is used when indexing with package
        data taken from a custom Packages file.

        The "Installed-Size" and "Size" fields are stored as sortable values
        in the slots self.val_inst_size and self.val_pkg_size; a slot set to
        -1 is skipped.  Nothing is stored when either field is missing or is
        not an integer.

        document  is the document to update
        pkg       is the Deb822 object for this package
        """
        try:
            instSize = int(pkg["Installed-Size"])
            pkgSize = int(pkg["Size"])
        except (KeyError, ValueError, TypeError):
            # Field absent or not numeric: there is nothing to index.
            return

        for slot, size in ((self.val_inst_size, instSize),
                           (self.val_pkg_size, pkgSize)):
            if slot == -1:
                continue
            try:
                document.add_value(slot, xapian.sortable_serialise(size))
            except (OverflowError, SystemError):
                # Best effort: a size that cannot be serialised is left out.
                pass
Esempio n. 4
0
    def index_document(self, document, properties):
        """Store the sortable values of `properties` and index the rest.

        The timestamp (required) and the title are always stored.  The
        filesize and creation_time values are stored only when present; a
        value that cannot be converted to a number is logged at debug level
        and skipped.  A copy of `properties` is then handed to
        _index_known() and _index_unknown().
        """
        timestamp = float(properties['timestamp'])
        document.add_value(_VALUE_TIMESTAMP,
                           xapian.sortable_serialise(timestamp))

        title = properties.get('title', '').strip()
        document.add_value(_VALUE_TITLE, title)

        if 'filesize' in properties:
            try:
                filesize = int(properties['filesize'])
                document.add_value(_VALUE_FILESIZE,
                                   xapian.sortable_serialise(filesize))
            except (ValueError, TypeError):
                logging.debug('Invalid value for filesize property: %s',
                              properties['filesize'])

        if 'creation_time' in properties:
            try:
                creation_time = float(properties['creation_time'])
                document.add_value(_VALUE_CREATION_TIME,
                                   xapian.sortable_serialise(creation_time))
            except (ValueError, TypeError):
                logging.debug('Invalid value for creation_time property: %s',
                              properties['creation_time'])

        self.set_document(document)

        # The helpers receive their own dict, not the caller's object.
        remaining = dict(properties)
        self._index_known(document, remaining)
        self._index_unknown(document, remaining)
Esempio n. 5
0
def test_value_stats():
    """Simple test of being able to get value statistics.

    A database opened with xapian.chert_open() is filled with ten
    documents whose value slot 1 holds a sortable-serialised number; slots
    0 and 2 stay empty.  The reported frequency and bounds of each slot
    are then compared with the expected ones.

    Closing the database and deleting its directory happen in a `finally`
    clause, so they also run when a comparison fails.

    """
    dbpath = "db_test_value_stats"
    db = xapian.chert_open(dbpath, xapian.DB_CREATE_OR_OVERWRITE)

    try:
        vals = (6, 9, 4.5, 4.4, 4.6, 2, 1, 4, 3, 0)
        for number in vals:
            doc = xapian.Document()
            doc.add_value(1, xapian.sortable_serialise(number))
            db.add_document(doc)

        # Empty slot before the used one.
        expect(db.get_value_freq(0), 0)
        expect(db.get_value_lower_bound(0), "")
        expect(db.get_value_upper_bound(0), "")
        # The used slot: ten values between 0 and 9.
        expect(db.get_value_freq(1), 10)
        expect(db.get_value_lower_bound(1), xapian.sortable_serialise(0))
        expect(db.get_value_upper_bound(1), xapian.sortable_serialise(9))
        # Empty slot after the used one.
        expect(db.get_value_freq(2), 0)
        expect(db.get_value_lower_bound(2), "")
        expect(db.get_value_upper_bound(2), "")
    finally:
        db.close()
        shutil.rmtree(dbpath)
Esempio n. 6
0
    def index(self, document, pkg):
        """
        Update the document with the information from this data source.

        The installed size and the download size of the package's candidate
        version are stored as sortable values in the slots
        self.val_inst_size and self.val_pkg_size; a slot set to -1 is
        skipped.  Nothing is stored when the package has no candidate
        version or when its sizes cannot be read.

        document  is the document to update
        pkg       is the python-apt Package object for this package
        """
        ver = pkg.candidate
        if ver is None:
            return
        try:
            instSize = ver.installed_size
            pkgSize = ver.size
        except Exception:
            # Best effort: any failure to read the sizes means there is
            # nothing to index.  KeyboardInterrupt and SystemExit are not
            # Exception subclasses, so they still propagate.
            return

        for slot, size in ((self.val_inst_size, instSize),
                           (self.val_pkg_size, pkgSize)):
            if slot == -1:
                continue
            try:
                document.add_value(slot, xapian.sortable_serialise(size))
            except (OverflowError, SystemError):
                # Best effort: a size that cannot be serialised is left out.
                pass
 def __call__(self, doc):
     """Return the sortable-serialised dampened rating for `doc`.

     The application is built from the app name and package name that
     self.db reports for `doc`.  When the review loader has no statistics
     for it, the serialisation of 0 is returned instead.
     """
     appname = self.db.get_appname(doc)
     pkgname = self.db.get_pkgname(doc)
     review_stats = self.review_loader.get_review_stats(
         Application(appname, pkgname))
     import xapian
     rating = review_stats.dampened_rating if review_stats else 0
     return xapian.sortable_serialise(rating)
    def index_document(self, document, properties):
        """Add the value slots for `properties` to `document` and index it.

        'timestamp' must be present and numeric.  'title' defaults to the
        empty string and is stored stripped.  'filesize' (as int) and
        'creation_time' (as float) are optional: when one of them cannot be
        converted, a debug message is logged and that value is left out.
        Afterwards the document is set as current and a dict copy of
        `properties` is passed to _index_known() and _index_unknown().
        """
        document.add_value(
            _VALUE_TIMESTAMP,
            xapian.sortable_serialise(float(properties['timestamp'])))
        document.add_value(_VALUE_TITLE, properties.get('title', '').strip())

        # (property name, value slot, converter, message logged on failure)
        optional_numbers = (
            ('filesize', _VALUE_FILESIZE, int,
             'Invalid value for filesize property: %s'),
            ('creation_time', _VALUE_CREATION_TIME, float,
             'Invalid value for creation_time property: %s'),
        )
        for key, slot, convert, complaint in optional_numbers:
            if key not in properties:
                continue
            try:
                document.add_value(
                    slot, xapian.sortable_serialise(convert(properties[key])))
            except (ValueError, TypeError):
                logging.debug(complaint, properties[key])

        self.set_document(document)

        as_dict = dict(properties)
        self._index_known(document, as_dict)
        self._index_unknown(document, as_dict)
Esempio n. 9
0
 def __call__(self, begin, end):
     """
     Construct a tuple for value range processing.

     `begin` -- a string in the format '<field_name>:[low_range]'
                If 'low_range' is omitted, assume the smallest possible value.
     `end` -- a string in the format '[high_range|*]'.  If '*', assume
              the highest possible value.

     Both ends may be open at the same time ('<field_name>:' with '*').

     Return a tuple of three strings: (column, low, high).  For 'long' and
     'float' fields, low and high are sortable-serialised.  Returns None
     when no field in the schema is named `field_name`.
     """
     colon = begin.find(':')
     field_name = begin[:colon]
     begin = begin[colon + 1:]
     for field_dict in self.sb.schema:
         if field_dict['field_name'] != field_name:
             continue

         field_type = field_dict['type']
         is_number = field_type in ('long', 'float')
         is_date = field_type in ('date', 'datetime')

         if not begin:
             if field_type == 'text':
                 begin = u'a' # TODO: A better way of getting a min text value?
             elif is_number:
                 begin = float('-inf')
             elif is_date:
                 begin = u'00010101000000'
         # Checked independently of `begin`: with an `elif` here, a range
         # open at both ends kept '*' as its upper bound, and float('*')
         # then raised ValueError for numeric fields.
         if end == '*':
             if field_type == 'text':
                 end = u'z' * 100 # TODO: A better way of getting a max text value?
             elif is_number:
                 end = float('inf')
             elif is_date:
                 end = u'99990101000000'

         if is_number:
             begin = xapian.sortable_serialise(float(begin))
             end = xapian.sortable_serialise(float(end))
         return field_dict['column'], str(begin), str(end)
Esempio n. 10
0
 def __call__(self, doc):
     """Compute the sort key of `doc` from its review statistics.

     Looks up the review statistics of the application identified by the
     app name and package name of `doc`, and returns the sortable
     serialisation of its dampened rating, or of 0 when there are none.
     """
     application = Application(self.db.get_appname(doc),
                               self.db.get_pkgname(doc))
     found = self.review_loader.get_review_stats(application)
     import xapian
     if not found:
         return xapian.sortable_serialise(0)
     return xapian.sortable_serialise(found.dampened_rating)
Esempio n. 11
0
def get_msg_terms(db=None, msg=None):
    """Collect the document data, terms and values used to index `msg`.

    `msg` supplies `content`, `author`, `date` and `places`; `db` is only
    used to look up the first place of the message (`db.place.getnode`).

    Returns a dict with three keys:
      "doc_data"   -- the message content, followed by the place address
                      when the message has a place
      "doc_terms"  -- list of terms: the stemmed non-stopwords of the
                      content (and of the place address), the author term,
                      the date term, the place terms and the lat/lng
                      range terms
      "doc_values" -- list of {"field": ..., "value": ...} dicts holding
                      the sortable-serialised coordinates and date
    """
    word_pattern = r'\b[a-zA-ZäöåüÄÖÅÜÉÈÁÀéèáà]{3,35}\b'
    stemmer = xapian.Stem("finnish")

    def stemmed_words(text):
        # Stem every 3 to 35 letter word of `text` that is not a stopword.
        words = (match.group(0)
                 for match in re.finditer(word_pattern, to_lower_case(text)))
        return [stemmer(word) for word in words if not is_stopword(word)]

    # This is pretty important: what data to be shown from the thing?
    # Maybe should be parsed into json already? Or serialise a hash somehow?
    doc_data = msg.content
    doc_values = []

    doc_terms = stemmed_words(msg.content)
    doc_terms.append("_commented-by_" + msg.author)

    if msg.date:
        # The first 7 characters of the date's string form.
        doc_terms.append("_c_" + str(msg.date)[:7])

    if msg.places:
        # Only the first place of the message is indexed.
        place = db.place.getnode(msg.places[0])
        doc_terms.extend(get_place_terms(place=place))
        doc_terms.extend(stemmed_words(place.address))

        doc_data += "  " + place.address

        doc_terms.extend("_glatrange_" + term
                         for term in get_latlng_range(place.lat))
        doc_terms.extend("_glngrange_" + term
                         for term in get_latlng_range(place.lng))

        doc_values.append({"field": XAPIAN_X_COORD_FIELD,
                           "value": xapian.sortable_serialise(float(place.lat))})
        doc_values.append({"field": XAPIAN_Y_COORD_FIELD,
                           "value": xapian.sortable_serialise(float(place.lng))})
    if msg.date:
        doc_values.append({"field": XAPIAN_CREATED_FIELD,
                           "value": xapian.sortable_serialise(float(msg.date.serialise()))})

    return {"doc_data": doc_data,
            "doc_terms": doc_terms,
            "doc_values": doc_values}
Esempio n. 12
0
def index(datapath, dbpath):
    """Index the records parsed from `datapath` into the database `dbpath`.

    For each record the name, description and motto are indexed with the
    prefixes 'S', 'XD' and 'XM' and again without prefix.  Value slots 1-3
    receive the admission year, the admission date string and the
    population; slot 4 receives "lat,lon" when both coordinates are set.
    The whole record is stored as JSON document data, and documents are
    keyed by "Q" + order so that re-indexing replaces them.
    """
    # Create or open the database we're going to be writing to.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One stemming TermGenerator is reused for every document.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for record in parse_states(datapath):
        # 'record' maps field names to values; pick out what we index.
        name = record.get('name', u'')
        description = record.get('description', u'')
        motto = record.get('motto', u'')
        admitted = record.get('admitted', None)
        population = record.get('population', None)
        order = record.get('order', u'')

        # Fresh document, attached to the term generator.
        document = xapian.Document()
        indexer.set_document(document)

        # Prefixed terms, one prefix per field.
        for text, prefix in ((name, 'S'), (description, 'XD'), (motto, 'XM')):
            indexer.index_text(text, 1, prefix)

        # Unprefixed terms for general search, with a position gap
        # between consecutive fields.
        indexer.index_text(name)
        for text in (description, motto):
            indexer.increase_termpos()
            indexer.index_text(text)

        # Document values.
        if admitted is not None:
            admitted_year = int(admitted[:4])
            document.add_value(1, xapian.sortable_serialise(admitted_year))
            document.add_value(2, admitted)  # YYYYMMDD
        if population is not None:
            headcount = int(population)
            document.add_value(3, xapian.sortable_serialise(headcount))
### Start of example code.
        midlat = record['midlat']
        midlon = record['midlon']
        if midlat and midlon:
            coordinates = (float(midlat), float(midlon))
            document.add_value(4, "%f,%f" % coordinates)


### End of example code.

        # Store all the fields for display purposes.
        document.set_data(json.dumps(record))

        # We use the order to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        unique_term = u"Q" + order
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
def index(datapath, dbpath):
    """Index the records parsed from `datapath` into the database `dbpath`.

    For each record the name, description and motto are indexed with the
    prefixes 'S', 'XD' and 'XM' and again without prefix.  Value slots 1-3
    receive the admission year, the admission date string and the
    population; slot 4 receives "lat,lon" when both coordinates are set.
    The whole record is stored as JSON document data, and documents are
    keyed by "Q" + order so that re-indexing replaces them.

    The population and the coordinates are converted with int() and
    float() before use, so they may be given either as numbers or as
    numeric strings.
    """
    # Create or open the database we're going to be writing to.
    db = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # Set up a TermGenerator that we'll use in indexing.
    termgenerator = xapian.TermGenerator()
    termgenerator.set_stemmer(xapian.Stem("en"))

    for fields in parse_states(datapath):
        # 'fields' is a dictionary mapping from field name to value.
        # Pick out the fields we're going to index.
        name = fields.get('name', u'')
        description = fields.get('description', u'')
        motto = fields.get('motto', u'')
        admitted = fields.get('admitted', None)
        population = fields.get('population', None)
        order = fields.get('order', u'')

        # We make a document and tell the term generator to use this.
        doc = xapian.Document()
        termgenerator.set_document(doc)

        # index each field with a suitable prefix
        termgenerator.index_text(name, 1, 'S')
        termgenerator.index_text(description, 1, 'XD')
        termgenerator.index_text(motto, 1, 'XM')

        # Index fields without prefixes for general search.
        termgenerator.index_text(name)
        termgenerator.increase_termpos()
        termgenerator.index_text(description)
        termgenerator.increase_termpos()
        termgenerator.index_text(motto)

        # Add document values.
        if admitted is not None:
            doc.add_value(1, xapian.sortable_serialise(int(admitted[:4])))
            doc.add_value(2, admitted) # YYYYMMDD
        if population is not None:
            # sortable_serialise() needs a number, not a numeric string.
            doc.add_value(3, xapian.sortable_serialise(int(population)))
### Start of example code.
        midlat = fields['midlat']
        midlon = fields['midlon']
        if midlat and midlon:
            # "%f" raises TypeError for strings, so convert first.
            doc.add_value(4, "%f,%f" % (float(midlat), float(midlon)))
### End of example code.

        # Store all the fields for display purposes.
        doc.set_data(json.dumps(fields))

        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        idterm = u"Q" + order
        doc.add_boolean_term(idterm)
        db.replace_document(idterm, doc)
Esempio n. 14
0
def create_index(filename,databasePath):
  print "begin read",filename
  if not os.path.exists(databasePath):
    os.makedirs(databasePath)
  database = xapian.WritableDatabase(databasePath, xapian.DB_CREATE_OR_OPEN)
  stemmer=xapian.Stem('english')
  rex=re.compile(r'[0-9]+|[a-zA-Z]+|[\x80-\xff3]{3}')
  lines=open(filename).readlines()
  processed=0
  len_file=len(lines)
  print filename,"read end"
  time_begin=time.time()
  for line in lines:
    try:
      line=line.encode('utf-8')
    except:
      continue
    line_items=line.split('\t')
    document = xapian.Document()
    freq_sortable=xapian.sortable_serialise(float(line_items[3]))
    click_sortable=xapian.sortable_serialise(float(line_items[4]))
    document.add_value(FREQ,freq_sortable)
    document.add_value(CLICK,click_sortable)
    document.add_value(DATE,line_items[1])
    document.set_data(line_items[0])
    terms=rex.findall(line_items[0])
    for term in terms:
      if len(term) > MAX_TERM_LENGTH:
        document.add_term(stemmer(term[:MAX_TERM_LENGTH]))
      else:
        document.add_term(stemmer(term))
    database.add_document(document)
    processed+=1
    del line
    del line_items
    del document
    del freq_sortable
    del click_sortable
    del terms

    if processed%100000==0:
      end=time.time()
      speed=100000/float(end-time_begin)
      print "="*40
      print filename
      print "speed:\t",speed
      print "percent:\t%s %%" %(100.0*(processed/float(len_file)))
      print "time remain:\t %s hours" %( (len_file-processed)/(speed*3600))
      time_begin=time.time()
  
  gc.collect()
  os.system("rm -rf %s" % filename)
  print filename,"end"
Esempio n. 15
0
def make_value(s, term):
    """Parse the string `s` and return its sortable serialisation.

    How `s` is parsed depends on `term`:
      'mtime'  -- a time string understood by time.strptime() in its
                  default format, converted to a timestamp
      'rating' -- a float
      'year' and anything else -- an integer
    """
    if term == 'mtime':
        number = time.mktime(time.strptime(s))
    elif term == 'rating':
        number = float(s)
    else:
        # 'year' is handled like any other integer term.
        number = int(s)
    return xapian.sortable_serialise(number)
Esempio n. 16
0
def index(datapath, dbpath):
    """Index the CSV records of `datapath` into the database `dbpath`.

    The TITLE and DESCRIPTION fields of every record are indexed with the
    prefixes 'S' and 'XD' and again without prefix, and the whole record
    is stored as JSON document data.  Value slot 0 receives the largest
    number found in MEASUREMENTS and slot 1 the first number found in
    DATE_MADE, both sortable-serialised.  Documents are keyed by
    "Q" + id_NUMBER, so re-indexing replaces them.
    """
    # Create or open the database we're going to be writing to.
    database = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # One stemming TermGenerator is reused for every document.
    indexer = xapian.TermGenerator()
    indexer.set_stemmer(xapian.Stem("en"))

    for record in parse_csv_file(datapath):
        # 'record' maps field names to values; pick out what we index.
        description = record.get('DESCRIPTION', u'')
        title = record.get('TITLE', u'')
        identifier = record.get('id_NUMBER', u'')

        # Fresh document, attached to the term generator.
        document = xapian.Document()
        indexer.set_document(document)

        # Prefixed terms.
        indexer.index_text(title, 1, 'S')
        indexer.index_text(description, 1, 'XD')

        # Unprefixed terms for general search.
        indexer.index_text(title)
        indexer.increase_termpos()
        indexer.index_text(description)

        # Keep the whole record for display purposes.
        document.set_data(json.dumps(record, encoding='utf8'))

        ### Start of example code.
        # Slot 0: the largest measurement, when there is one.
        measurements = record.get('MEASUREMENTS', u'')
        if measurements != u'':
            sizes = numbers_from_string(measurements)
            if len(sizes) > 0:
                largest = max(sizes)
                document.add_value(0, xapian.sortable_serialise(largest))

        # Slot 1: the first number of the production date.
        years = numbers_from_string(record.get('DATE_MADE', u''))
        if len(years) > 0:
            first_year = years[0]
            document.add_value(1, xapian.sortable_serialise(first_year))


### End of example code.

        # We use the identifier to ensure each object ends up in the
        # database only once no matter how many times we run the
        # indexer.
        unique_term = u"Q" + identifier
        document.add_boolean_term(unique_term)
        database.replace_document(unique_term, document)
Esempio n. 17
0
def index(datapath, dbpath):
    """Build or update the database `dbpath` from the CSV file `datapath`.

    Each parsed record becomes one document: TITLE and DESCRIPTION are
    indexed as text (prefixed with 'S' / 'XD' and unprefixed), the record
    is saved as JSON document data, the maximum of the numbers in
    MEASUREMENTS goes to value slot 0 and the first number in DATE_MADE to
    value slot 1.  The boolean term "Q" + id_NUMBER identifies the
    document, so an existing one with the same term is replaced.
    """
    # Create or open the database we're going to be writing to.
    xdb = xapian.WritableDatabase(dbpath, xapian.DB_CREATE_OR_OPEN)

    # The term generator stems English text.
    generator = xapian.TermGenerator()
    generator.set_stemmer(xapian.Stem("en"))

    for row in parse_csv_file(datapath):
        # 'row' is a dictionary mapping from field name to value.
        description = row.get('DESCRIPTION', u'')
        title = row.get('TITLE', u'')
        identifier = row.get('id_NUMBER', u'')

        # New document for this row.
        xdoc = xapian.Document()
        generator.set_document(xdoc)

        # Index each field with its prefix first ...
        for text, prefix in ((title, 'S'), (description, 'XD')):
            generator.index_text(text, 1, prefix)

        # ... then without prefix, for general search.
        generator.index_text(title)
        generator.increase_termpos()
        generator.index_text(description)

        # Store all the fields for display purposes.
        serialised_row = json.dumps(row, encoding='utf8')
        xdoc.set_data(serialised_row)

### Start of example code.
        # parse the two values we need
        measurements = row.get('MEASUREMENTS', u'')
        if measurements != u'':
            found = numbers_from_string(measurements)
            if len(found) > 0:
                xdoc.add_value(0, xapian.sortable_serialise(max(found)))

        date_made = row.get('DATE_MADE', u'')
        found_years = numbers_from_string(date_made)
        if len(found_years) > 0:
            xdoc.add_value(1, xapian.sortable_serialise(found_years[0]))
### End of example code.

        # The identifier term guarantees that each object ends up in
        # the database only once, however often the indexer is run.
        id_term = u"Q" + identifier
        xdoc.add_boolean_term(id_term)
        xdb.replace_document(id_term, xdoc)
Esempio n. 18
0
def _encode_simple_value(field_cls, value):
    """Encode `value` for storage according to its field class.

    Integer and Decimal fields are sortable-serialised; every other field
    class encodes the value itself, after datetimes have been moved to
    UTC.
    """
    # FIXME: this doesn't work with the big integers.
    if issubclass(field_cls, Integer):
        return sortable_serialise(value)

    # FIXME: decimal -> float conversion, so precision is lost.
    if issubclass(field_cls, Decimal):
        return sortable_serialise(float(value))

    # Datetimes are normalized to UTC so that searching works.
    needs_utc = type(value) is datetime
    to_encode = value.astimezone(fixed_offset(0)) if needs_utc else value

    # A common field or a new field.
    return field_cls.encode(to_encode)
Esempio n. 19
0
File: jot.py Progetto: ttaylordev/z
def dict2doc(y):
    """Build a xapian.Document from the bookmark dictionary `y`.

    `y` must contain 'url' and 'tags'; 'title', 'notes', 'created' and
    'archived' are optional.  The tag list of `y` is modified in place:
    'archived' is appended when an archive exists for the URL, and
    duplicate tags are removed while keeping their order.

    Returns the new document.
    """
    doc = xapian.Document()
    indexer.set_document(doc)

    url = y['url']
    full_id = urlid(url)
    short_id = full_id[:8]
    doc.add_boolean_term(P['id'] + full_id)
    # The id and the short id are also added as unprefixed/stemmed terms
    # to make it easier to select bookmarks from search results.
    for plain_term in (full_id, short_id, 'Z' + full_id, 'Z' + short_id):
        doc.add_boolean_term(plain_term)

    doc.add_value(VALUE_URL, url)

    # One site term per suffix of the host name.
    host = urlparse(url).hostname
    if host:
        labels = host.split('.')
        for start, _ in enumerate(labels):
            doc.add_boolean_term(P['site'] + '.'.join(labels[start:]))

    archive_path = get_archive_path(full_id)
    if archive_path:
        y['tags'].append('archived')

    # Drop duplicate tags, preserving order.
    unique_tags = list(OrderedDict.fromkeys(y['tags']))
    y['tags'] = unique_tags
    doc.add_value(VALUE_TAGS, u'\x1f'.join(unique_tags))
    for tag in unique_tags:
        doc.add_boolean_term(P['tag'] + tag)

    if 'title' in y:
        doc.add_value(VALUE_TITLE, y['title'])
        index_text(y['title'], 'title')

    if 'notes' in y:
        doc.set_data(y['notes'])
        index_text(y['notes'], 'notes')

    # A missing creation time defaults to the current time.
    created_at = y.get('created', arrow.utcnow())
    doc.add_value(VALUE_CREATED,
                  xapian.sortable_serialise(created_at.timestamp))

    if archive_path:
        archived_at = y.get('archived', arrow.utcnow())
        doc.add_value(VALUE_ARCHIVED,
                      xapian.sortable_serialise(archived_at.timestamp))
        index_archive(doc, archive_path)

    return doc
Esempio n. 20
0
def _encode_simple_value(field_cls, value):
    """Return the stored form of `value` for a field of class `field_cls`.

    Values of Integer and Decimal fields go through sortable_serialise();
    values of all other fields are encoded by the field class, datetimes
    being converted to UTC first.
    """
    if not issubclass(field_cls, (Integer, Decimal)):
        # A common field or a new field.  Datetimes are normalized to
        # UTC, so searching works.
        if type(value) is datetime:
            value = value.astimezone(fixed_offset(0))
        return field_cls.encode(value)

    if issubclass(field_cls, Integer):
        # FIXME this doesn't work with the big integers
        return sortable_serialise(value)

    # FIXME: We convert decimal->float so we lose precision
    return sortable_serialise(float(value))
Esempio n. 21
0
def normalize_range(begin, end):
    """Convert the range parameters of a query to their string form.

    A float bound is encoded with xapian.sortable_serialise(); any other
    bound is converted with str().  A bound of None is returned unchanged.

    Returns the tuple (begin, end).
    """
    def as_text(bound):
        # None means "no bound" and is passed through untouched.
        if bound is None:
            return None
        if isinstance(bound, float):
            return xapian.sortable_serialise(float(bound))
        return str(bound)

    return as_text(begin), as_text(end)
Esempio n. 22
0
def _act_weight(fieldname, doc, field, context, type=None):
    """Perform the WEIGHT action.

    The value of `field` is converted to a float, sortable-serialised and
    added to `doc` under `fieldname`, with 'weight' as the third argument
    of add_value().  `context` and `type` are not used.

    """
    serialised = xapian.sortable_serialise(float(field.value))
    doc.add_value(fieldname, serialised, 'weight')
    def add_product(self, product, database_path=None):
        """Adds product to repository.
        product - Product to be added to database
        database_path - Path of the database where product is added. Default: None
        When repository has been created with many database paths then database_path must
        be defined.

        The title and description of the product are indexed as text, its
        attributes are stored as JSON document data, and its price is
        stored as a sortable number in value slot 0.  The document is
        identified by the term "Q" + product.url, so adding a product with
        the same URL again replaces the earlier document."""
        if len(self._databases) > 1:
            assert database_path is not None, \
                "With many databases you must identify the database where product is added"

        # Set up a TermGenerator that we'll use in indexing.
        termgenerator = xapian.TermGenerator()
        termgenerator.set_stemmer(self._create_stem())

        # We make a document and tell the term generator to use this.
        doc = xapian.Document()
        termgenerator.set_document(doc)
        termgenerator.index_text(unicode(product.title))
        termgenerator.index_text(unicode(product.description))
        doc.set_data(unicode(json.dumps(product.__dict__)))
        doc.add_value(0, xapian.sortable_serialise(float(product.price)))

        idterm = "Q" + product.url
        doc.add_boolean_term(idterm)

        # Default database unless a (non-empty) path selects another one.
        db = self._db
        if database_path:
            db = self._databases[database_path]

        db.replace_document(idterm, doc)
Esempio n. 24
0
    def index(self, document, fname, entry):
        """Index the .desktop file `fname`, parsed as `entry`, into `document`.

        Adds the marker term "XD", the file name term "XDF<fname>", the
        name, generic name and comment of the entry (read with
        Locale.langs temporarily set to self.xdglangs), one "XDT<category>"
        term per category and, when self.val_popcon is not -1, the popcon
        rank as a sortable value (-1 when it cannot be parsed).
        """
        # Marks that the package contains .desktop files.
        document.add_term("XD")

        # Name of the .desktop file, with prefix XDF.
        document.add_term("XDF" + fname)

        # Keywords are retrieved in this indexer's language; the previous
        # language list is always restored afterwards.
        self.indexer.set_document(document)
        saved_langs = Locale.langs
        try:
            Locale.langs = self.xdglangs
            for getter in (entry.getName, entry.getGenericName,
                           entry.getComment):
                self.indexer.index_text_without_positions(getter())
        finally:
            Locale.langs = saved_langs

        # .desktop categories, with prefix XDT.
        for category in entry.getCategories():
            document.add_term("XDT" + category)

        # "app-popcon" value with the popcon rank.
        try:
            rank = int(entry.get("X-AppInstall-Popcon"))
        except ValueError as err:
            if self.progress:
                self.progress.verbose("%s: parsing X-AppInstall-Popcon: %s" %
                                      (fname, str(err)))
            rank = -1
        if self.val_popcon == -1:
            return
        document.add_value(self.val_popcon, xapian.sortable_serialise(rank))
Esempio n. 25
0
        def _add_value(doc, slotnum, value):
            """Store `value` in value slot `slotnum` (converted to int) of `doc`.

            Floats are stored sortable-serialised; every other value is
            stored as its str() form.
            """
            if not isinstance(value, float):
                doc.add_value(int(slotnum), str(value))
                return

            serialised = xapian.sortable_serialise(float(value))
            doc.add_value(int(slotnum), serialised)
Esempio n. 26
0
def _marshal_value(value):
    """
    Private utility method that prepares a Python value for a Xapian value slot.

    Integers (int or long) are sortable-serialised; any other value is
    returned unchanged.
    """
    if not isinstance(value, (int, long)):
        return value
    return xapian.sortable_serialise(value)
Esempio n. 27
0
    def add(self, msg):
        """Index the mail message `msg` and add it to the database.

        The sender address and name are indexed under PREFIX_FROM, the
        subject under PREFIX_SUBJECT, and the body followed by the sender
        and the subject under PREFIX_BODY.  The message date (as a sortable
        number) and the decoded strong id are stored in the SLOT_DATE and
        SLOT_STRONG_ID value slots.

        Returns the result of db.add_document(), or None when no body
        could be extracted from the message.
        """
        sender = msg.from_addr() + ' '
        if msg.from_name():
            sender += msg.from_name()

        with msg.open() as fp:
            body = self._get_body(email.message_from_file(fp))

        if body is None:
            return

        # The body text also carries the sender and the subject.
        body += u' ' + sender
        body += u' ' + (msg.subject() or u'')

        doc = xapian.Document()
        self.term_gen.set_document(doc)

        timestamp = mua.py.unix_dt(msg.date())
        doc.add_value(SLOT_DATE, xapian.sortable_serialise(timestamp))
        doc.add_value(SLOT_STRONG_ID, mua.py.unb64(msg.strong_id()))

        self.term_gen.index_text(sender, 1, PREFIX_FROM)
        self.term_gen.index_text(body, 1, PREFIX_BODY)
        self.term_gen.index_text(msg.subject() or u'', 1, PREFIX_SUBJECT)
        return self.db.add_document(doc)
Esempio n. 28
0
    def convert(self, field_value):
        """
        Generates the index value (for sorting) of `field_value`, based on
        its content type.

        Returns None for None.  Otherwise numbers are sortable-serialised,
        booleans become 't' or 'f', datetimes become a
        year/month/day/hour/minute/second digit string, and anything else
        is kept as it is; the result is passed through smart_str().
        """
        if field_value is None:
            return None

        content_type = self._get_content_type(field_value)

        if self._is_float_or_interger(content_type):
            value = xapian.sortable_serialise(field_value)
        elif isinstance(content_type, (models.BooleanField, bool)):
            # Boolean fields are stored as 't' or 'f'
            value = 't' if field_value else 'f'
        elif isinstance(content_type, (models.DateTimeField, datetime.datetime)):
            # DateTime fields are stored as digits only (better sorting);
            # the year is not zero-padded.
            moment = field_value
            parts = (moment.year, moment.month, moment.day,
                     moment.hour, moment.minute, moment.second)
            value = '%d%02d%02d%02d%02d%02d' % parts
        else:
            value = field_value

        return smart_str(value)
Esempio n. 29
0
 def _marshal_value(self, value):
     """
     Private method that converts Python values to a string for Xapian values.

     datetimes become 'YYYYMMDDhhmmss', with six microsecond digits
     appended when the microsecond part is non-zero; dates become
     'YYYYMMDD000000'; booleans become 't' or 'f'; numbers are
     sortable-serialised; anything else goes through force_unicode().
     """
     # datetime is tested before date, and bool before the numbers,
     # because each of them is also an instance of the type after it.
     if isinstance(value, datetime.datetime):
         stamp = u'%04d%02d%02d%02d%02d%02d' % (
             value.year, value.month, value.day,
             value.hour, value.minute, value.second
         )
         if value.microsecond:
             stamp += u'%06d' % value.microsecond
         return stamp

     if isinstance(value, datetime.date):
         return u'%04d%02d%02d000000' % (value.year, value.month, value.day)

     if isinstance(value, bool):
         return u't' if value else u'f'

     if isinstance(value, (int, long, float)):
         return xapian.sortable_serialise(value)

     return force_unicode(value)
Esempio n. 30
0
def _encode_simple_value(field_cls, value):
    """Encode *value* for storage according to its field class.

    ``Integer`` subclasses are overloaded to use ``sortable_serialise``;
    every other field class (common or newly defined) encodes the value
    through its own ``encode`` method.
    """
    if not issubclass(field_cls, Integer):
        return field_cls.encode(value)
    # XXX warning: this doesn't work with the big integers!
    return sortable_serialise(value)
Esempio n. 31
0
def index(contacts, database, prefixes):
    """Index every contact into a Xapian database and record the prefixes seen.

    contacts  is handed to ``config()``, which yields ``(person, data)``
              pairs; ``data`` is an iterable of ``(prefix, content)`` pairs
    database  is the path of the Xapian database (created if missing)
    prefixes  is the path of the JSON file that receives the list of all
              prefixes encountered

    For each pair, the first character of the prefix selects the handling:
    a character found in ``digits[:5]`` stores ``content`` as a sortable
    serialised integer in the value slot of that number, one found in
    ``digits[5:]`` stores ``content`` unchanged in that slot, and anything
    else is indexed as text, both under ``'X' + prefix`` and unprefixed.

    ``termgenerator`` and ``digits`` are module-level names defined outside
    this function.
    """
    c = config(contacts)

    db = xapian.WritableDatabase(database, xapian.DB_CREATE_OR_OPEN)

    seen_prefixes = set()
    for person, data in c:
        doc = xapian.Document()
        termgenerator.set_document(doc)

        termgenerator.index_text(person, 1, u'id')
        for prefix, content in data:
            lead = prefix[0]
            if lead in digits[:5]:
                doc.add_value(int(lead), xapian.sortable_serialise(int(content)))
            elif lead in digits[5:]:
                doc.add_value(int(lead), content)
            else:
                termgenerator.index_text(content, 1, u'X' + prefix)
                termgenerator.index_text(content)
                termgenerator.increase_termpos()
            seen_prefixes.add(prefix)

        # 'Q' + person is the unique identifying term, so re-indexing the
        # same person replaces the earlier document.
        doc.add_boolean_term(u'Q' + person)
        doc.set_data(person)
        db.replace_document(u'Q' + person, doc)

    # json.dump() writes text, so the file must be opened in text mode
    # (a binary-mode file raises TypeError on Python 3).
    with open(prefixes, 'w') as fp:
        json.dump(list(seen_prefixes), fp)
Esempio n. 32
0
def _marshal_value(value):
    """
    Private helper converting a Python value for storage as a Xapian value.

    Integers (``int`` / ``long``) are run through
    ``xapian.sortable_serialise``; every other value is returned unchanged.
    """
    if not isinstance(value, (int, long)):
        return value
    return xapian.sortable_serialise(value)
Esempio n. 33
0
 def __call__(self, doc):
     """Return the sort key for *doc*.

     The key is a sortable string representing the distance from
     Washington, DC to the middle of this state, whose coordinates are read
     from value slot 4 as a comma-separated pair of numbers.
     """
     washington = (38.012, -77.037)
     raw_coords = doc.get_value(4)
     coords = map(float, raw_coords.split(","))
     distance = support.distance_between_coords(coords, washington)
     return xapian.sortable_serialise(distance)
Esempio n. 34
0
def _encode_simple_value(field_cls, value):
    """Return the stored representation of *value* for ``field_cls``.

    The ``Integer`` type is overloaded: its values are encoded with
    ``sortable_serialise`` instead of the field's own ``encode``.
    """
    is_integer_field = issubclass(field_cls, Integer)
    if is_integer_field:
        # XXX warning: this doesn't work with the big integers!
        return sortable_serialise(value)
    # A common field or a new field: let the field class do the encoding.
    encoded = field_cls.encode(value)
    return encoded
Esempio n. 35
0
 def __call__(self, doc):
     """Compute the sortable key for *doc*.

     Value slot 4 holds the middle of the state as ``"<number>,<number>"``;
     the key is the sortable-serialised distance between that point and
     Washington, DC.
     """
     washington = (38.012, -77.037)
     pieces = doc.get_value(4).split(',')
     coords = map(float, pieces)
     distance = support.distance_between_coords(coords, washington)
     return xapian.sortable_serialise(distance)
Esempio n. 36
0
    def index(self, document, pkg):
        """
        Update the document with the information from this data source.

        document  is the document to update
        pkg       is the python-apt Package object for this package

        The installed size and the package size are stored as sortable
        serialised values in the slots ``self.val_inst_size`` and
        ``self.val_pkg_size``; a slot number of -1 means that value is not
        stored.  Packages whose sizes cannot be read are left untouched.
        """
        try:
            instSize = pkg.installedSize
            pkgSize = pkg.packageSize
        except Exception:
            # Best effort: skip packages that cannot report their sizes, but
            # let KeyboardInterrupt / SystemExit propagate.
            return

        if self.val_inst_size != -1:
            document.add_value(self.val_inst_size, xapian.sortable_serialise(instSize))
        if self.val_pkg_size != -1:
            document.add_value(self.val_pkg_size, xapian.sortable_serialise(pkgSize))
Esempio n. 37
0
def update(db, cache, datadir=None):
    """Refresh *db* from the app-install data and the apt lists.

    db       database to update; also receives the "popcon_max_desktop"
             metadata entry
    cache    passed through to both update helpers
    datadir  app-install desktop directory; a false value selects
             ``softwarecenter.paths.APP_INSTALL_DESKTOP_PATH``

    ``popcon_max`` is a module-level name read after both helpers have run
    (presumably maintained by them -- confirm against the helpers).
    """
    if not datadir:
        datadir = softwarecenter.paths.APP_INSTALL_DESKTOP_PATH
    update_from_app_install_data(db, cache, datadir)
    update_from_var_lib_apt_lists(db, cache)
    # add db global meta-data
    # Pass the argument to the logger instead of pre-formatting, so the
    # message is only built when debug logging is enabled.
    LOG.debug("adding popcon_max_desktop '%s'", popcon_max)
    db.set_metadata("popcon_max_desktop", xapian.sortable_serialise(float(popcon_max)))
Esempio n. 38
0
def encode_sortable_date(d):
    """Serialise a date string into a sortable Xapian value.

    *d* is parsed with the module-level ``ISO_8601`` format and converted to
    epoch seconds with ``time.mktime``.  The seconds are negated before
    serialising, so later dates produce smaller sort keys.  Input that
    cannot be parsed or converted is serialised as 0.
    """
    try:
        t = time.strptime(d, ISO_8601)
        n = -int(time.mktime(t))
    except (TypeError, ValueError, OverflowError):
        # Not a string, not matching the format, or outside mktime's range.
        n = 0

    return xapian.sortable_serialise(n)
Esempio n. 39
0
def _marshal_value(value, prefunc=None):
    """
    Private utility method that converts Python values to a string for Xapian values.
    prefunc 对值做预处理
    """
    if value is None:
        return 0

    if prefunc:
        value = prefunc(value)
    if isinstance(value, (int, long, float)):
        value = xapian.sortable_serialise(value)
    elif isinstance(value, bool):
        value = 1 if value else 0
        value = xapian.sortable_serialise(value)
    value = str(value).lower()
    return value
Esempio n. 40
0
def _marshal_value(value, pre_func=None):
    """
    Private helper that serialises a Python value for a Xapian value slot.

    When ``pre_func`` is given it is applied to the value first.  The
    result is assumed to be an int, long or float and is passed to
    ``xapian.sortable_serialise``.
    """
    prepared = pre_func(value) if pre_func else value
    return xapian.sortable_serialise(prepared)
Esempio n. 41
0
def _marshal_value(value, pre_func=None):
    """
    Serialise a numeric Python value into its sortable Xapian string form.

    ``pre_func``, if supplied, pre-processes the value.  No type check is
    made: the value is assumed to be an int, long or float.
    """
    if not pre_func:
        return xapian.sortable_serialise(value)
    return xapian.sortable_serialise(pre_func(value))
Esempio n. 42
0
def update(db, cache, datadir=None):
    """Refresh *db* from the app-install data and the apt lists.

    A false ``datadir`` selects
    ``softwarecenter.paths.APP_INSTALL_DESKTOP_PATH``.  After both helpers
    have run, the module-level ``popcon_max`` is stored, sortable
    serialised as a float, under the "popcon_max_desktop" metadata key.
    """
    source_dir = datadir or softwarecenter.paths.APP_INSTALL_DESKTOP_PATH
    update_from_app_install_data(db, cache, source_dir)
    update_from_var_lib_apt_lists(db, cache)
    # add db global meta-data
    LOG.debug("adding popcon_max_desktop %r", popcon_max)
    serialised_max = xapian.sortable_serialise(float(popcon_max))
    db.set_metadata("popcon_max_desktop", serialised_max)
Esempio n. 43
0
    def index(self, document, pkg):
        """
        Update the document with the information from this data source.

        document  is the document to update
        pkg       is the python-apt Package object for this package

        The time already recorded for the package name is reused; a package
        seen for the first time is recorded with ``self.now``.  The time is
        stored as a sortable serialised value in slot ``self.value``.
        """
        name = pkg.name
        cataloged = self._package_cataloged_time.get(name, self.now)
        # Written back unconditionally so first-seen packages get recorded.
        self._package_cataloged_time[name] = cataloged
        document.add_value(self.value, xapian.sortable_serialise(cataloged))
Esempio n. 44
0
    def index(self, document, pkg):
        """
        Update the document with the information from this data source.

        document  is the document to update
        pkg       is the python-apt Package object for this package

        Stores the package's cataloged time (``self.now`` when the package
        has no recorded time yet) in value slot ``self.value`` and remembers
        it under the package name.
        """
        times = self._package_cataloged_time
        pkg_name = pkg.name
        first_seen = times.get(pkg_name, self.now)
        times[pkg_name] = first_seen
        serialised = xapian.sortable_serialise(first_seen)
        document.add_value(self.value, serialised)
Esempio n. 45
0
 def __call__(self, doc):
     """Return the sort key for *doc*.

     Value slot 4 is decoded as UTF-8 and read as two comma-separated
     numbers (the middle of this state).  The key is a sortable string
     representing the distance from Washington, DC to that point.
     """
     washington = (38.012, -77.037)
     text = doc.get_value(4).decode('utf8')
     first, second = map(float, text.split(','))
     distance = support.distance_between_coords((first, second), washington)
     return xapian.sortable_serialise(distance)
Esempio n. 46
0
def _encode_simple_value(field_cls, value):
    """Encode *value* for storage according to its field class.

    ``Integer`` subclasses use ``sortable_serialise``.  For every other
    field class the value is encoded by ``field_cls.encode``, after values
    whose type is exactly ``datetime`` have been converted to UTC.
    """
    if issubclass(field_cls, Integer):
        # FIXME this doesn't work with the big integers
        return sortable_serialise(value)

    # Datetimes are normalized to UTC so that searching works.  The check is
    # on the exact type, so datetime subclasses are passed through as-is.
    if type(value) is datetime:
        normalized = value.astimezone(fixed_offset(0))
    else:
        normalized = value

    # A common field or a new field
    return field_cls.encode(normalized)
Esempio n. 47
0
def serialise_value(value):
    """
    Convert a Python value into the list of serialised Xapian values.

    The list has a single element for every type except ``LatLongCoord``.
    Types are tested in this order: datetime, date, time, bool, float,
    int/long, ``LatLongCoord``, any object with a ``serialise`` method, any
    other truthy value (normalised text), and finally falsy values, which
    serialise to the empty string.

    """
    # datetime precedes date (subclass), bool precedes int (subclass).
    if isinstance(value, datetime.datetime):
        fields = (value.year, value.month, value.day,
                  value.hour, value.minute, value.second)
        if value.microsecond:
            return ["%04d%02d%02d%02d%02d%02d%06d" % (fields + (value.microsecond,))]
        return ["%04d%02d%02d%02d%02d%02d" % fields]

    if isinstance(value, datetime.date):
        return ["%04d%02d%02d000000" % (value.year, value.month, value.day)]

    if isinstance(value, datetime.time):
        clock = (value.hour, value.minute, value.second)
        if value.microsecond:
            return ["%02d%02d%02d%06d" % (clock + (value.microsecond,))]
        return ["%02d%02d%02d" % clock]

    if isinstance(value, bool):
        return ["t" if value else "f"]

    if isinstance(value, float):
        return [sortable_serialise(value)]

    if isinstance(value, (int, long)):
        return ["%012d" % value]

    if isinstance(value, LatLongCoord):
        serialised = value.serialise()
        # NOTE(review): the entries following the serialised coordinate are
        # the integers 5, 4, 3, 2, 1, not strings -- confirm that callers
        # expect this.
        return [serialised] + [5 - i for i in range(5)]

    if hasattr(value, "serialise"):
        return [value.serialise()]

    if value:
        return [normalize("%s" % value)]

    return [""]
Esempio n. 48
0
 def _set_year(self, year):
     # FIXME: what to do if year is not an int?
     try:
         year = int(year)
     except ValueError:
         pass
     prefix = self.db._find_prefix('year')
     for term in self._term_iter(prefix):
         self._remove_term(prefix, year)
     self._add_term(prefix, year)
     facet = self.db._find_facet('year')
     self.xapian_doc.add_value(facet, xapian.sortable_serialise(year))
Esempio n. 49
0
def test_matchspy():
    """Exercise match spies: lifetime, clearing and the counts they collect.

    """
    db = setup_database()
    query = xapian.Query(xapian.Query.OP_OR, "was", "it")
    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    def set_matchspy_deref(enq):
        """Register a matchspy on `enq` and then drop the local reference,
        to check that the spy doesn't get deleted too soon.
        """
        spy = xapian.ValueCountMatchSpy(0)
        enq.add_matchspy(spy)
        del spy

    set_matchspy_deref(enquire)
    mset = enquire.get_mset(0, 10)
    expect(len(mset), 5)

    counter = xapian.ValueCountMatchSpy(0)
    enquire.add_matchspy(counter)
    # Regression test for clear_matchspies() - used to always raise an
    # exception due to a copy and paste error in its definition.
    enquire.clear_matchspies()
    mset = enquire.get_mset(0, 10)
    # The spy was cleared before the match ran, so it has seen nothing.
    expect(list(counter.values()), [])

    enquire.add_matchspy(counter)
    mset = enquire.get_mset(0, 10)
    expect(counter.get_total(), 5)

    by_value = [(item.term, item.termfreq) for item in list(counter.values())]
    expect(by_value, [
        (xapian.sortable_serialise(1.5), 1),
        (xapian.sortable_serialise(2), 2),
    ])

    by_frequency = [(item.term, item.termfreq)
                    for item in counter.top_values(10)]
    expect(by_frequency, [
        (xapian.sortable_serialise(2), 2),
        (xapian.sortable_serialise(1.5), 1),
    ])
Esempio n. 50
0
def setup_database():
    """Set up and return an inmemory database with 5 documents.

    The first document is built from its own Document object.  Documents
    two to five are all added from one second Document object that keeps
    being modified between the add_document() calls, so each of them also
    carries what was set for the previous ones.

    """
    db = xapian.inmemory_open()

    # Document 1: a separate object with the terms "is", "it" and "cold".
    doc = xapian.Document()
    doc.set_data("is it cold?")
    doc.add_term("is")
    doc.add_posting("it", 1)
    doc.add_posting("cold", 2)
    db.add_document(doc)

    # Document 2: a fresh object; it is reused for every document below.
    doc = xapian.Document()
    doc.set_data("was it warm?")
    doc.add_posting("was", 1)
    doc.add_posting("it", 2)
    doc.add_posting("warm", 3)
    db.add_document(doc)
    # Document 3: same object, new data, extra term and a value in slot 0.
    doc.set_data("was it warm? two")
    doc.add_term("two", 2)
    doc.add_value(0, xapian.sortable_serialise(2))
    db.add_document(doc)
    # Document 4: same object again; slot 0 is set to 1.5 this time
    # (presumably replacing the value set for document 3).
    doc.set_data("was it warm? three")
    doc.add_term("three", 3)
    doc.add_value(0, xapian.sortable_serialise(1.5))
    db.add_document(doc)
    # Document 5: same object; values are set in slots 0, 5 and 9.
    doc.set_data("was it warm? four it")
    doc.add_term("four", 4)
    doc.add_term("it", 6)
    doc.add_posting("it", 7)
    doc.add_value(5, 'five')
    doc.add_value(9, 'nine')
    doc.add_value(0, xapian.sortable_serialise(2))
    db.add_document(doc)

    # Sanity check: all five add_document() calls produced a document.
    expect(db.get_doccount(), 5)
    return db
Esempio n. 51
0
        def mq(v):
            """Build the xapian.Query that matches the single value `v`.

            Unicode strings are encoded to UTF-8 first.  A string becomes a
            term query on ``prefix`` + value, with a ':' separator inserted
            when the value starts with an upper-case letter (an empty
            string raises IndexError).  An int or float becomes a value
            range query on slot ``valnum`` whose two bounds are equal.  A
            datetime becomes the AND of a ``prefix`` + YYYYMMDD term query
            and an equal-bounds value range query on the timestamp.

            ``prefix`` and ``valnum`` come from the enclosing scope.

            Raises SearchError for any other type.
            """
            if isinstance(v, unicode):
                v = v.encode('utf-8', 'ignore')

            if isinstance(v, str):
                return xapian.Query('%s%s%s' %
                                    (prefix, ':' if v[0].isupper() else '', v))
            elif isinstance(v, (int, float)):
                strv = xapian.sortable_serialise(v)
                return xapian.Query(xapian.Query.OP_VALUE_RANGE, valnum, strv,
                                    strv)
            elif isinstance(v, datetime):
                term = '%s%04d%02d%02d' % (prefix, v.year, v.month, v.day)
                strv = xapian.sortable_serialise(time.mktime(v.timetuple()))
                return xapian.Query(
                    xapian.Query.OP_AND, xapian.Query(term),
                    xapian.Query(xapian.Query.OP_VALUE_RANGE, valnum, strv,
                                 strv))
            else:
                # Call form of raise: valid on both Python 2 and Python 3.
                raise SearchError('unexpected type (%s) for value %s' % (
                    type(v), v))
Esempio n. 52
0
class Indexer:
    """Index the contents of .desktop entries for one language."""

    def __init__(self, lang, val_popcon, progress=None):
        """
        lang        locale name used to select the stemmer and the entry
                    translations; None falls back to "en"
        val_popcon  value slot for the popcon rank, or -1 to not store it
        progress    optional reporter whose verbose() method receives
                    parsing warnings
        """
        self.val_popcon = val_popcon
        self.progress = progress
        if lang is None:
            lang = "en"
        self.lang = lang
        self.xlang = lang.split("_")[0]
        self.xdglangs = Locale.expand_languages(lang)
        self.indexer = xapian.TermGenerator()
        # Get a stemmer for this language, if available
        try:
            self.stemmer = xapian.Stem(self.xlang)
            self.indexer.set_stemmer(self.stemmer)
        except xapian.InvalidArgumentError:
            pass

    def index(self, document, fname, entry):
        """
        Add the information from one .desktop entry to the document.

        document  is the document to update
        fname     is the name of the .desktop file
        entry     is the parsed .desktop entry
        """
        # Index a single term "XD", marking that the package contains .desktop
        # files
        document.add_term("XD")

        # Index the name of the .desktop file, with prefix XDF
        document.add_term("XDF" + fname)

        # Index keywords retrieved in this indexer's language
        self.indexer.set_document(document)
        # Locale.langs is swapped for the duration of the lookups and always
        # restored afterwards.
        oldlangs = Locale.langs
        try:
            Locale.langs = self.xdglangs
            self.indexer.index_text_without_positions(entry.getName())
            self.indexer.index_text_without_positions(entry.getGenericName())
            self.indexer.index_text_without_positions(entry.getComment())
        finally:
            Locale.langs = oldlangs

        # Index .desktop categories, with prefix XDT
        for cat in entry.getCategories():
            document.add_term("XDT"+cat)

        # Add an "app-popcon" value with popcon rank
        try:
            popcon = int(entry.get("X-AppInstall-Popcon"))
        except (TypeError, ValueError) as e:
            # Missing (None) or non-numeric rank: report it and fall back
            # to -1.
            if self.progress:
                self.progress.verbose("%s: parsing X-AppInstall-Popcon: %s" % (fname, str(e)))
            popcon = -1
        if self.val_popcon != -1:
            document.add_value(self.val_popcon, xapian.sortable_serialise(popcon))
Esempio n. 53
0
def test_value_iter():
    """Check iteration over the values stored in a document.

    """
    db = setup_database()
    doc = db.get_document(5)

    items = list(doc.values())
    expect(len(items), 3)

    # (slot number, stored value) expected at each position, in order.
    expected = (
        (0, xapian.sortable_serialise(2)),
        (5, 'five'),
        (9, 'nine'),
    )
    for position, (num, value) in enumerate(expected):
        expect(items[position].num, num)
        expect(items[position].value, value)
Esempio n. 54
0
def serialise_value(value):
    """
    Serialise a Python value into a list of strings for Xapian values.

    Every type yields a one-element list except ``LatLongCoord``.  The
    type tests run in this order: datetime, date, time, bool, float,
    int/long, ``LatLongCoord``, objects offering ``serialise()``, other
    truthy values (normalised text); falsy values give the empty string.

    """
    # Subclass-sensitive ordering: datetime before date, bool before int.
    if isinstance(value, datetime.datetime):
        if value.microsecond:
            stamp = '%04d%02d%02d%02d%02d%02d%06d' % (
                value.year, value.month, value.day,
                value.hour, value.minute, value.second,
                value.microsecond)
        else:
            stamp = '%04d%02d%02d%02d%02d%02d' % (
                value.year, value.month, value.day,
                value.hour, value.minute, value.second)
        return [stamp]

    if isinstance(value, datetime.date):
        return ['%04d%02d%02d000000' % (value.year, value.month, value.day)]

    if isinstance(value, datetime.time):
        if value.microsecond:
            clock = '%02d%02d%02d%06d' % (
                value.hour, value.minute, value.second, value.microsecond)
        else:
            clock = '%02d%02d%02d' % (value.hour, value.minute, value.second)
        return [clock]

    if isinstance(value, bool):
        return ['t' if value else 'f']

    if isinstance(value, float):
        return [sortable_serialise(value)]

    if isinstance(value, (int, long)):
        return ['%012d' % value]

    if isinstance(value, LatLongCoord):
        result = [value.serialise()]
        # NOTE(review): what follows the serialised coordinate are the
        # integers 5, 4, 3, 2, 1, not strings -- confirm that callers
        # expect this.
        result.extend(5 - i for i in range(5))
        return result

    if hasattr(value, 'serialise'):
        return [value.serialise()]

    if value:
        return [normalize("%s" % value)]

    return ['']
Esempio n. 55
0
def index(keyword_iter):
    """Index every record produced by ``keyword_iter()`` into SEARCH_DB.

    Each record is ``(id, cid, rank, kw)``.  The id goes to value slot 0,
    the sortable serialised rank to slot 1 and the cid to slot 2.  ``kw``
    yields ``(word, value)`` pairs that are added as terms, skipping words
    that are empty, start with '>' or are 254 or more characters long.
    The document is stored under the term ``'>' + id``, replacing any
    document already carrying that term.  The database is flushed once at
    the end.
    """
    for doc_id, cid, rank, kw in keyword_iter():
        doc = xapian.Document()
        doc.add_value(0, doc_id)
        doc.add_value(1, xapian.sortable_serialise(rank))
        doc.add_value(2, cid)

        for word, value in kw:
            # '>' is the marker of the id term added below, so such words
            # are presumably skipped to avoid clashing with it.
            if word and not word.startswith('>') and len(word) < 254:
                doc.add_term(word, value)

        key = '>%s' % doc_id
        doc.add_term(key)
        SEARCH_DB.replace_document(key, doc)

    flush_db()
Esempio n. 56
0
File: models.py Progetto: lamby/nm2
 def index(self, entries):
     """Index changelog entries into ``self.xdb``.

     Each entry is ``(tag, date, changedby, changelog)``.  The document is
     stored under the term ``"XP" + tag`` with the entry timestamp in value
     slot 0, and ``self.max_ts`` is raised to the newest timestamp seen.
     """
     for tag, date, changedby, changelog in entries:
         xid = "XP" + tag
         document = xapian.Document()
         document.set_data(changelog + "\n" + " -- " + changedby + "  " +
                           date)
         # Ignore timezones for our purposes: dealing with timezones in
         # python means dealing with one of the most demented pieces of code
         # people have ever conceived, or otherwise it means introducing
         # piles of external dependencies that maybe do the job. We can get
         # away without timezones, it is a lucky thing and we take advantage
         # of such strokes of luck.
         ts = 0
         matched = self.re_ts.match(date)
         if matched:
             parsed = email.utils.parsedate_tz(matched.group(1))
             if parsed is not None:
                 # Only the first nine fields are used; the timezone offset
                 # in parsed[9] is deliberately not applied.
                 ts = time.mktime(parsed[:9])
         document.add_value(0, xapian.sortable_serialise(ts))
         document.add_term(xid)

         # The first changelog line is skipped; the changedby line is
         # tokenised along with the remaining ones.
         lines = changelog.split("\n")[1:] + [changedby]
         pos = 0
         for line in lines:
             for tok in self.tokenise(line):
                 tok = tok.strip(".-")
                 # Skip empty, overlong (see ircd (2.10.04+-1)) and purely
                 # numeric tokens.
                 if not tok or len(tok) > 100 or tok.isdigit():
                     continue
                 document.add_posting(tok, pos)
                 pos += 1
         self.xdb.replace_document(xid, document)
         if self.max_ts is None or ts > self.max_ts:
             self.max_ts = ts
Esempio n. 57
0
    def store(self, guid, properties, new, pre_cb=None, post_cb=None, *args):
        """Build the Xapian document for one object and store it.

        guid        identifier of the object; also the value of the
                    property whose slot is 0
        properties  mapping from property name to value
        new         not used in this method
        pre_cb      optional callback run before indexing, called with
                    (guid, properties, *args)
        post_cb     optional callback run after the document is stored,
                    called with the same arguments

        The database is opened on first use.  The document replaces any
        document carrying the same GUID term.
        """
        if self._db is None:
            self._do_open()

        if pre_cb is not None:
            pre_cb(guid, properties, *args)

        _logger.debug('Index %r object: %r', self.metadata.name, properties)

        document = xapian.Document()
        term_generator = xapian.TermGenerator()
        term_generator.set_document(document)

        for name, prop in self._props.items():
            value = guid if prop.slot == 0 else properties[name]

            if prop.slot is not None:
                if prop.typecast in (int, float, bool):
                    slot_value = xapian.sortable_serialise(value)
                else:
                    if prop.localized:
                        # The localized text replaces the value for the
                        # term indexing below as well.
                        value = env.gettext(value, self._lang) or ''
                    slot_value = prop.to_string(value)[0]
                document.add_value(prop.slot, slot_value)

            if not (prop.prefix or prop.full_text):
                continue

            for text in prop.to_string(value):
                if prop.prefix:
                    term = _term(prop.prefix, text)
                    if prop.boolean:
                        document.add_boolean_term(term)
                    else:
                        document.add_term(term)
                if prop.full_text:
                    term_generator.index_text(text, 1, prop.prefix or '')
                term_generator.increase_termpos()

        self._db.replace_document(_term(env.GUID_PREFIX, guid), document)
        self._pending_updates += 1

        if post_cb is not None:
            post_cb(guid, properties, *args)

        self._check_for_commit()
Esempio n. 58
0
def _marshal_value(value):
    """
    Private utility method that converts Python values to a string for Xapian values.
    """
    if isinstance(value, datetime.datetime):
        value = _marshal_datetime(value)
    elif isinstance(value, datetime.date):
        value = _marshal_date(value)
    elif isinstance(value, bool):
        if value:
            value = u't'
        else:
            value = u'f'
    elif isinstance(value, float):
        value = xapian.sortable_serialise(value)
    elif isinstance(value, (int, long)):
        value = u'%012d' % value
    else:
        value = force_unicode(value).lower()
    return value