def __buildDoc(self, article):
    if article.getTitle() is None:
        return None
    doc = xappy.UnprocessedDocument()
    doc.fields.append(xappy.Field("title", article.getTitle()))
    if article.getAbstract() is not None:
        doc.fields.append(xappy.Field("text", article.getAbstract()))
    # 'INDEX_EXACT' terms have a maximum length of 220, but the prefix "XA"
    # is added to each term in the document, so the effective maximum is 218.
    for chemical in [chemical for chemical in article.getChemicals()
                     if len(chemical) < 219]:
        doc.fields.append(xappy.Field("chemical_exact", chemical))
    for keyword in article.getKeywords():
        doc.fields.append(xappy.Field("keyword", keyword))
    for mesh in article.getMeSH():
        doc.fields.append(xappy.Field("mesh", mesh))
    doc.id = str(article.getPMID())
    return doc
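# A minimal sketch (not from the original source) of the indexer setup this
# builder appears to assume: 'chemical_exact' registered with INDEX_EXACT
# (hence the 218-character term limit noted above) and the remaining fields
# as free text. The database path is illustrative.
import xappy

conn = xappy.IndexerConnection('medline.db')  # hypothetical path
conn.add_field_action('title', xappy.FieldActions.INDEX_FREETEXT, language='en')
conn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT, language='en')
conn.add_field_action('chemical_exact', xappy.FieldActions.INDEX_EXACT)
conn.add_field_action('keyword', xappy.FieldActions.INDEX_FREETEXT)
conn.add_field_action('mesh', xappy.FieldActions.INDEX_FREETEXT)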
def document(self, connection, retry=False):
    """Return a xapian index document from the context.

    We can introspect the connection to discover relevant fields available.
    """
    doc = xappy.UnprocessedDocument()
    if interfaces.ENABLE_LOGGING:
        log.debug("Indexing Document %r" % self.context)
    # object type
    doc.fields.append(
        xappy.Field("object_type", self.context.__class__.__name__))
    # object kind
    doc.fields.append(
        xappy.Field("object_kind", domain.object_hierarchy_type(self.context)))
    try:
        # TODO: loop through all available languages and index the translations
        self.index(doc)
    except (exceptions.OperationalError, exceptions.InvalidRequestError):
        # detach the dbapi connection from the pool, close it,
        # and retry the index operation (once)
        log.error("Indexing Connection Hosed, Discarding")
        db_connection = metadata.bind.contextual_connect()
        db_connection.begin().rollback()
        db_connection.detach()
        db_connection.close()
        if not retry:
            return self.document(connection, retry=True)
        raise
    return doc
def build_index(self, remove_old=True):
    if remove_old:
        remove_directory(self.search_db_dir)
    self.__xappy = xappy.IndexerConnection(self.search_db_dir)
    self.__xappy.add_field_action("module_uid",
                                  xappy.FieldActions.STORE_CONTENT)
    self.__xappy.add_field_action("keyword_term",
                                  xappy.FieldActions.INDEX_FREETEXT,
                                  nopos=True)
    for module_keyword in self.__keywords:
        for keyword in module_keyword[2]:
            module_doc = xappy.UnprocessedDocument()
            module_doc.fields.append(xappy.Field("module_uid", keyword[0]))
            terms = list(split_word(keyword[1], True))
            module_doc.fields.append(
                xappy.Field("keyword_term", ' '.join(terms)))
            self.__xappy.add(module_doc)
    self.__xappy.close()
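# Search-side sketch (illustrative, not from the original source): because
# 'module_uid' is stored with STORE_CONTENT, each hit on the free-text
# 'keyword_term' field carries the module uid back in its data dict.
import xappy

sconn = xappy.SearchConnection(search_db_dir)  # same directory as above
results = sconn.search(sconn.query_field('keyword_term', 'histogram'), 0, 10)
for result in results:
    print result.data['module_uid'][0]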
def document(self, connection, retry=False):
    """Return a xapian index document from the context.

    We can introspect the connection to discover relevant fields available.
    """
    doc = xappy.UnprocessedDocument()
    if interfaces.ENABLE_LOGGING:
        log.debug("Indexing Document %r" % self.context)
    # object type
    doc.fields.append(
        xappy.Field("object_type", self.context.__class__.__name__))
    # object kind
    doc.fields.append(
        xappy.Field("object_kind", domain.object_hierarchy_type(self.context)))
    # object language
    doc.fields.append(xappy.Field("language", self.context.language))
    doc.fields.append(
        xappy.Field("status", getattr(self.context, "status", "")))
    doc.fields.append(
        xappy.Field("owner", str(getattr(self.context, "owner_id", ""))))
    try:
        status_date = getattr(self.context, "status_date")
        if status_date:
            status_date = date_value(status_date)
            doc.fields.append(xappy.Field("status_date", status_date))
    except Exception:
        pass
    title = ""
    try:
        title = bungeni.ui.search.ISearchResult(self.context).title
    except Exception:
        pass
    doc.fields.append(xappy.Field("title", title))
    try:
        # TODO: loop through all available languages and index the translations
        self.index(doc)
    except (exceptions.OperationalError, exceptions.InvalidRequestError):
        # detach the dbapi connection from the pool, close it,
        # and retry the index operation (once)
        log.error("Indexing Connection Hosed, Discarding")
        db_connection = metadata.bind.contextual_connect()
        db_connection.begin().rollback()
        db_connection.detach()
        db_connection.close()
        if not retry:
            return self.document(connection, retry=True)
        raise
    return doc
def add_to_index(data):
    doc = xappy.UnprocessedDocument()
    doc.id = data.id
    for k, v in data.items():
        doc.fields.append(xappy.Field(k, v))
    doc = indexer.process(doc)
    doc.data = data
    indexer.replace(doc)
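# A minimal sketch (assumed, not part of the original source) of the
# module-level `indexer` this helper relies on; the path and field names are
# illustrative, and the real field actions depend on what `data` carries.
import xappy

indexer = xappy.IndexerConnection('search.db')  # hypothetical path
indexer.add_field_action('title', xappy.FieldActions.INDEX_FREETEXT,
                         language='en')
indexer.add_field_action('title', xappy.FieldActions.STORE_CONTENT)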
def index_document(self, conn, d):
    if hasattr(self.bench, "process_document_xappy"):
        self.bench.process_document_xappy(d)
    doc = xappy.UnprocessedDocument()
    for key, values in d:
        if not isinstance(values, list):
            values = [values]
        for value in values:
            doc.fields.append(xappy.Field(key, value))
    conn.add(doc)
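# Usage sketch (illustrative, not from the original source): `d` must iterate
# as (key, value-or-list-of-values) pairs, so multi-valued fields fan out into
# one xappy.Field per value.
d = [('title', 'A Tale of Two Cities'),
     ('tag', ['fiction', 'classic'])]
# self.index_document(conn, d)  # indexes one 'title' field and two 'tag' fields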
def perform(self):
    updoc = xappy.UnprocessedDocument()
    for k, v in self.doc.iteritems():
        if isinstance(v, list):
            for v2 in v:
                updoc.append(k, v2)
        else:
            updoc.append(k, v)
    if self.docid is not None:
        updoc.id = self.docid
        self.db_writer.iconn.replace(updoc)
    else:
        self.db_writer.iconn.add(updoc)
def _process_file(self, file_name, conn, collection_name, filter_settings):
    """Extract text from a file, make a xapian document and add it to the
    database. Return True if completed successfully, False otherwise.
    """
    get_remote_log().debug("Indexing collection %s: processing file: %s" %
                           (collection_name, file_name))
    unused, ext = os.path.splitext(file_name)
    ext = ext.lower()
    if self.stale(file_name, conn):
        filter = self._find_filter(filter_settings[ext[1:]])
        if filter:
            get_remote_log().debug("Filtering file %s using filter %s" %
                                   (file_name, filter))
            fixed_fields = (
                ("filename", file_name),
                ("nametext", os.path.basename(file_name)),
                ("filepathtext", file_name),
                ("filetype", os.path.splitext(file_name)[1][1:]),
                ("collection", collection_name),
                ("mtime", str(os.path.getmtime(file_name))),
                ("size", str(os.path.getsize(file_name))),
            )
            for field, value in fixed_fields:
                assert field in dbspec.internal_fields()
            try:
                filtered_blocks = itertools.ifilter(self._accept_block,
                                                    filter(file_name))
                fields = itertools.starmap(
                    xappy.Field,
                    itertools.chain(fixed_fields, filtered_blocks))
                doc = xappy.UnprocessedDocument(fields=fields)
                doc.id = file_name
                # FIXME - if this raises an error, it's probably more serious
                # (e.g. database corruption) than a filter raising an error.
                conn.replace(doc)
                get_remote_log().debug(
                    "Added (or replaced) doc %s to collection %s with text "
                    "from source file %s" %
                    (doc.id, collection_name, file_name))
                return True
            except Exception, e:
                get_remote_log().error(
                    "Filtering file: %s with filter: %s exception %s(%s), "
                    "skipping" %
                    (file_name, filter, type(e).__name__, str(e)))
                return False
        else:
            get_remote_log().warn(
                "Filter for %s is not valid, not filtering file: %s" %
                (ext, file_name))
            return False
def load_pols(iconn):
    pols = schema.Politician.select()
    for p in pols:
        doc = xappy.UnprocessedDocument()
        for format in nameformats.split('\n'):
            text = replacetable(format,
                                dict(first=p.firstname,
                                     middle=p.middlename,
                                     last=p.lastname,
                                     nickname=p.nickname))
            index(doc, 'name', text)
        index(doc, 'name', p.id.replace('_', ' '))
        index(doc, 'id', p.id)
        iconn.add(doc)
def update_timestamp(self, timestamp):
    doc = self.get_timestamp_doc()
    if doc:
        doc._doc.set_data(str(timestamp))
        self.iconn.replace(doc)
        self.iconn.flush()
    else:
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('key', '_last_run_'))
        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(str(timestamp))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)
        self.iconn.flush()
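# A possible shape (assumed, not from the original source) for the matching
# get_timestamp_doc() helper: look the sentinel document up by its 'key'
# field. This presumes 'key' is registered with INDEX_EXACT and that a search
# result (a processed document) can be handed back to iconn.replace().
def get_timestamp_doc(self):
    sconn = xappy.SearchConnection(self.dbpath)  # hypothetical path attribute
    results = sconn.search(sconn.query_field('key', '_last_run_'), 0, 1)
    for result in results:
        return result
    return None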
def MakeIndex():
    connection = xappy.IndexerConnection('kis/lib/data')
    cursor = connections['default'].cursor()
    cursor.execute("SELECT rec_id,name FROM t_show_store_eisup_list;")
    data = cursor.fetchall()
    for item in data:
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('kod', item[0].encode('utf-8')))
        doc.fields.append(xappy.Field('name', item[1].encode('utf-8')))
        connection.add(doc)
    connection.flush()
    connection.close()
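# Sketch (assumed, not in the original): xappy only acts on fields that have
# registered field actions, so for 'kod' and 'name' to be searchable and
# retrievable, calls like these would precede the loop in MakeIndex().
connection.add_field_action('kod', xappy.FieldActions.INDEX_EXACT)
connection.add_field_action('kod', xappy.FieldActions.STORE_CONTENT)
connection.add_field_action('name', xappy.FieldActions.INDEX_FREETEXT)
connection.add_field_action('name', xappy.FieldActions.STORE_CONTENT)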
def index(items, doc_type, create=False):
    indexer = IndexerContext(settings.XAPIAN_DB)
    if create:
        with indexer as conn:
            create_index(conn)
    preprocess_text = lambda t: normalize_text(t).lower()
    with indexer as conn:
        n = 0
        for n, (key, data) in enumerate(items, 1):
            doc = xappy.UnprocessedDocument(key)
            doc.append('type', doc_type)
            for field in TEXT_FIELDS:
                val = data.get(field, '')
                if val:
                    doc.append(field, preprocess_text(val))
            for field in EXACT_FIELDS:
                val = data.get(field, '')
                if field == 'date' and val:
                    val = val.partition(' ')[0]
                    if not val.count('-') == 2:
                        val = None
                if val:
                    doc.append(field, val)
            for field, kwargs in SORTABLE_FIELDS:
                val = data.get(field)
                if not val:
                    continue
                doc.append(field, val, **kwargs)
            for field in FACET_FIELDS:
                val = data.get(field)
                if not val:
                    continue
                doc.append(field, val)
            for field in COLLAPSE_FIELDS:
                val = data.get(field)
                if not val:
                    continue
                doc.append(field, val)
            conn.add(doc)
    return n
def add_to_search_index(self, mission, id, chunk, weight, timestamp):
    """Take some text and a set of speakers (also text) and add a document
    to the search index, with the id stuffed in the document data.
    """
    lines = chunk['lines']
    doc = xappy.UnprocessedDocument()
    doc.fields.append(xappy.Field("mission", mission))
    doc.fields.append(xappy.Field("weight", weight))
    doc.fields.append(xappy.Field("transcript", self.transcript_name))
    for line in lines:
        text = re.sub(
            r"\[\w+:([^]]+)\|([^]]+)\]",
            lambda m: m.group(2),
            line['text'],
        )
        text = re.sub(
            r"\[\w+:([^]]+)\]",
            lambda m: m.group(1),
            text,
        )
        # also strip tags from text, because they're lame lame lame
        text = strip_tags(text)
        doc.fields.append(xappy.Field("text", text))
        # grab the character to get some more text to index under speaker
        ch = self.characters.get(line['speaker'], None)
        if ch:
            ch2 = ch.current_shift(timestamp)
            doc.fields.append(
                xappy.Field("speaker_identifier", ch2.identifier))
            doc.fields.append(xappy.Field("speaker", ch2.short_name))
            doc.fields.append(xappy.Field("speaker", ch.short_name))
        else:
            doc.fields.append(
                xappy.Field("speaker_identifier", line['speaker']))
            doc.fields.append(xappy.Field("speaker", line['speaker']))
    doc.id = id
    try:
        search_db.replace(search_db.process(doc))
    except xappy.errors.IndexerError:
        print "umm, error"
        print id, lines
        raise
def _get_document(self, connection, doc_id, mtime, mode):
    do_index = False
    if mode == 'update':
        try:
            doc = connection.get_document(doc_id)
            docmtime = long(doc.data['mtime'][0])
        except KeyError:
            do_index = True
        else:
            do_index = mtime > docmtime
    elif mode == 'add':
        do_index = True
    else:
        raise ValueError("mode must be 'update' or 'add'")
    if do_index:
        document = xappy.UnprocessedDocument()
        document.id = doc_id
    else:
        document = None
    return document
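# Illustrative caller (assumed, not from the original source): in 'update'
# mode a document comes back only when the stored mtime is older than the
# file's, so 'mtime' must be stored (STORE_CONTENT) on every document.
mtime = os.path.getmtime(path)  # 'path' is hypothetical
doc = self._get_document(connection, doc_id, mtime, 'update')
if doc is not None:
    doc.fields.append(xappy.Field('mtime', str(mtime)))
    connection.replace(doc)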
def _factory(db, doc):
    ixdoc = xappy.UnprocessedDocument()
    ixdoc.id = doc['_id']
    for D in i:
        for data in D['data']:
            data, num_items = expand(data, doc)
            for n in xrange(num_items):
                if 'factory' in D:
                    out = import_func(D['factory'])(doc)
                    if isinstance(out, ListType):
                        for index_text in out:
                            print 'INDEX_TEXT', index_text
                            ixdoc.fields.append(
                                xappy.Field(D['name'], index_text))
                    else:
                        index_text = out
                        print 'INDEX_TEXT', index_text
                        ixdoc.fields.append(
                            xappy.Field(D['name'], index_text))
                else:
                    index_text = (data % {'n': n}) % api.dotted(doc)
                    print 'INDEX_TEXT', index_text
                    ixdoc.fields.append(xappy.Field(D['name'], index_text))
    return ixdoc
def index_pkgs(self):
    yum_pkgs = self.index_yum_pkgs()
    pkg_count = 0

    for pkg in yum_pkgs.values():
        pkg_count += 1
        doc = xappy.UnprocessedDocument()
        filtered_name = filter_search_string(pkg['name'])
        filtered_summary = filter_search_string(pkg['summary'])
        filtered_description = filter_search_string(pkg['description'])

        if pkg['name'] != filtered_name:
            print("%d: indexing %s as %s" %
                  (pkg_count, pkg['name'], filtered_name))
        else:
            print("%d: indexing %s" % (pkg_count, pkg['name']))

        doc.fields.append(
            xappy.Field('exact_name', 'EX__' + filtered_name + '__EX',
                        weight=10.0))

        name_parts = filtered_name.split('_')
        for i in range(20):
            if len(name_parts) > 1:
                for part in name_parts:
                    doc.fields.append(xappy.Field('name', part, weight=1.0))
            doc.fields.append(xappy.Field('name', filtered_name, weight=10.0))

        for i in range(4):
            doc.fields.append(
                xappy.Field('summary', filtered_summary, weight=1.0))

        doc.fields.append(
            xappy.Field('description', filtered_description, weight=0.2))

        self.index_files(doc, pkg)
        self.index_tags(doc, pkg)

        for sub_pkg in pkg['sub_pkgs']:
            pkg_count += 1
            filtered_sub_pkg_name = filter_search_string(sub_pkg['name'])
            if filtered_sub_pkg_name != sub_pkg['name']:
                print("%d: indexing subpkg %s as %s" %
                      (pkg_count, sub_pkg['name'], filtered_sub_pkg_name))
            else:
                print("%d: indexing subpkg %s" %
                      (pkg_count, sub_pkg['name']))

            doc.fields.append(
                xappy.Field('subpackages', filtered_sub_pkg_name, weight=1.0))
            doc.fields.append(
                xappy.Field('exact_name',
                            'EX__' + filtered_sub_pkg_name + '__EX',
                            weight=10.0))

            self.index_files(doc, sub_pkg)
            self.index_tags(doc, sub_pkg)
            if sub_pkg['icon'] != self.default_icon \
                    and pkg['icon'] == self.default_icon:
                pkg['icon'] = sub_pkg['icon']

            # remove anything we don't want to store
            del sub_pkg['pkg']

        # @@: Right now we're only indexing the first part of the
        # provides/requires, and not boolean comparison or version
        #for requires in pkg.requires:
        #    print requires[0]
        #    doc.fields.append(xappy.Field('requires', requires[0]))
        #for provides in pkg.provides:
        #    doc.fields.append(xappy.Field('provides', provides[0]))

        # remove anything we don't want to store and then store data in
        # json format
        del pkg['pkg']
        del pkg['src_pkg']

        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(json.dumps(pkg))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)

    self.icon_cache.close()
    return pkg_count
def __init__(self, resource):
    self.resource = resource
    self.doc = xappy.UnprocessedDocument()
    log.debug("indexing %s - %s", self.type, self.resource)
def _create_document(self, package):
    doc = xappy.UnprocessedDocument()
    filtered_name = filter_search_string(package['name'])
    filtered_summary = filter_search_string(package['summary'])
    filtered_description = filter_search_string(package['description'])

    doc.fields.append(
        xappy.Field('exact_name', 'EX__' + filtered_name + '__EX',
                    weight=10.0))

    name_parts = filtered_name.split('_')
    for i in range(20):
        if len(name_parts) > 1:
            for part in name_parts:
                doc.fields.append(xappy.Field('name', part, weight=1.0))
        doc.fields.append(xappy.Field('name', filtered_name, weight=10.0))

    for i in range(4):
        doc.fields.append(
            xappy.Field('summary', filtered_summary, weight=1.0))

    doc.fields.append(
        xappy.Field('description', filtered_description, weight=0.2))

    self.index_files_of_interest(doc, package)
    self.index_tags(doc, package)

    for sub_package in package['sub_pkgs']:
        filtered_sub_package_name = filter_search_string(sub_package['name'])
        log.info("    indexing subpackage %s" % sub_package['name'])

        doc.fields.append(
            xappy.Field('subpackages', filtered_sub_package_name,
                        weight=1.0))
        doc.fields.append(
            xappy.Field('exact_name',
                        'EX__' + filtered_sub_package_name + '__EX',
                        weight=10.0))

        self.index_files_of_interest(doc, sub_package)

        # fedora-tagger does not provide special tags for sub-packages...
        #self.index_tags(doc, sub_package)

        # Set special sub-package icon if appstream has one
        sub_package['icon'] = self.icon_cache.get(sub_package['name'],
                                                  self.default_icon)

        # If the parent has a dull icon, give it ours!
        if sub_package['icon'] != self.default_icon \
                and package['icon'] == self.default_icon:
            package['icon'] = sub_package['icon']

        # remove anything we don't want to store
        del sub_package['package']

    # @@: Right now we're only indexing the first part of the
    # provides/requires, and not boolean comparison or version
    #for requires in package.requires:
    #    print requires[0]
    #    doc.fields.append(xappy.Field('requires', requires[0]))
    #for provides in package.provides:
    #    doc.fields.append(xappy.Field('provides', provides[0]))

    # remove anything we don't want to store and then store data in
    # json format
    del package['package']

    return doc
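# Search-side sketch (illustrative, not from the original source): the
# 'EX__<name>__EX' convention built above turns 'exact_name' into a
# high-precision lookup when the query term is wrapped the same way.
import xappy

sconn = xappy.SearchConnection('xapian/search')  # hypothetical path
term = 'EX__' + filter_search_string('firefox') + '__EX'
results = sconn.search(sconn.query_field('exact_name', term), 0, 10)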
# get all documents in the currently selected journal directory
files = os.listdir(recent_filePath)
# store current indexing ID (used in ids.txt)
recent_xapianPath = str(counter)
# open a new file connection to create a Xapian index
conn = xappy.IndexerConnection(os.path.join(xapianPath, recent_xapianPath))
# create field to store the full texts
conn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT, language='en')
if not use_psql:
    # create a data field to store the full text in it, e.g. while iterating
    # over search results
    conn.add_field_action('text', xappy.FieldActions.STORE_CONTENT)

# iterate over all documents in the journal directory
for file_name in files:
    doc = xappy.UnprocessedDocument()
    f = open(os.path.join(recent_filePath, file_name), "r")
    text = f.read()
    f.close()
    doc.fields.append(xappy.Field("text", text))
    try:
        file_name = "'" + file_name + "'"
        pmcid = str(get_PMC(file_name))
        doc.id = pmcid
        if use_psql:
            insert_ID_and_text(pmcid, text)
    except:
        # possibly duplicates (from files_without_pmc.txt) - the less command
        # shows the same content, although with different formatting of line
        # breaks, e.g.:
        #   less Biosci_Rep_2012_Dec_1_32\(6\)_549-557.txt
        # contains a PMC ID, but the following file does not:
        #   less Biosci_Rep_2012_Dec_1_32\(Pt_6\)_549-557.txt
        pass
def main():
    tornado.options.parse_command_line()
    from apps.main.models import User
    from apps.questions.models import Question, Genre
    from mongokit import Connection
    con = Connection()
    con.register([Question, Genre, User])
    db = con.gkc

    if options.reindex_all:
        since = datetime.datetime(1979, 12, 13)
    else:
        since = options.since
        if not since:
            since = default_since
        try:
            since = datetime.datetime.strptime(since, '%Y-%m-%d %H-%M-%S')
        except ValueError:
            since = datetime.datetime.strptime(since, '%Y-%m-%d')
    if options.verbose:
        print 'since', since

    genres = {}
    authors = {}
    count = 0
    search = {'modify_date': {'$gt': since}}
    if not db.Question.find(search).count():
        if options.verbose:
            print "0 questions"
        if not options.test:
            return

    youngest = since
    indexer = xappy.IndexerConnection(settings.XAPIAN_LOCATION)
    if not indexer.get_fields_with_actions() or options.update_fields:
        indexer.add_field_action('question',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 weight=2, language='en', spell=True,
                                 stop=stopwords)
        indexer.add_field_action('answer',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('accept',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('alternatives',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('author', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('genre', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('comment',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=False,
                                 search_by_default=False, stop=stopwords)
        indexer.add_field_action('date', xappy.FieldActions.SORTABLE,
                                 type="date")
        indexer.add_field_action('state', xappy.FieldActions.SORTABLE)
        indexer.add_field_action('question', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('answer', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('genre', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('state', xappy.FieldActions.STORE_CONTENT)

    t0 = time.time()
    for question in db.Question.collection.find(search):
        if question['modify_date'] > youngest:
            youngest = question['modify_date']
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('state', question['state']))
        doc.fields.append(xappy.Field('question', question['text']))
        doc.fields.append(xappy.Field('answer', question['answer']))
        if question['genre'].id in genres:
            genre = genres[question['genre'].id]
        else:
            genre = db.Genre.one({'_id': question['genre'].id})
            genre = genre.name
            genres[question['genre'].id] = genre
        doc.fields.append(xappy.Field('genre', genre))
        if question['author'].id in authors:
            author = authors[question['author'].id]
        else:
            author = db.User.one({'_id': question['author'].id})
            author = author.username
            authors[question['author'].id] = author
        doc.fields.append(xappy.Field('author', author))
        doc.fields.append(xappy.Field('comment', question['comment']))
        doc.fields.append(
            xappy.Field('accept', '\n'.join(question['accept'])))
        doc.fields.append(
            xappy.Field('alternatives', '\n'.join(question['alternatives'])))
        doc.id = str(question['_id'])
        pdoc = indexer.process(doc)
        indexer.replace(pdoc)
        count += 1
        #if count and not count % 100:
        #    indexer.flush()

    # add a second to avoid milliseconds causing the same doc to be indexed
    # over and over
    youngest += datetime.timedelta(seconds=1)
    open(since_filename, 'w').write(youngest.strftime('%Y-%m-%d %H-%M-%S\n'))
    indexer.flush()
    t1 = time.time()
    indexer.close()
    if options.verbose:
        print round(t1 - t0, 3), "seconds to index", count, "questions"

    # test
    if options.test:
        print settings.XAPIAN_LOCATION
        searcher = xappy.SearchConnection(settings.XAPIAN_LOCATION)
        text = 'FRAMCEs capitalls'
        text = "Capitol STATE"
        print searcher.spell_correct(text)
        query = searcher.query_field('question', text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
            #result.data['state']

        text = 'london'
        query = searcher.query_field('answer', text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
def init_db(self, *args):
    """Loop through all packages and get the latest builds for koji tags
    listed in distmappings.
    """
    self.new_timestamp = time.time() - 60
    print "Calculating timestamp minus 1 minute to account for any skew between the servers (%s)" % \
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.new_timestamp))

    print "Initializing Index"
    package_list = self.koji_client.listPackages()
    i = 0
    for pkg in package_list:
        i += 1
        pkg_name = pkg['package_name']
        print "%d: Processing package %s" % (i, pkg_name)
        name_len = len(pkg_name)
        doc = xappy.UnprocessedDocument()
        filtered_name = filter_search_string(pkg_name)
        doc.fields.append(xappy.Field('key', filtered_name))
        latest_builds = {'name': pkg_name}
        for t in tags:
            tag = t['tag']
            if t['name'] in latest_builds:
                # short circuit optimization
                continue
            builds = self.koji_client.getLatestBuilds(tag, package=pkg_name)
            if builds:
                build = None
                for b in builds:
                    # only get builds which completed
                    if b['state'] == koji.BUILD_STATES['COMPLETE']:
                        build = b
                        break
                if build:
                    data = {'version': build['version'],
                            'release': build['release'],
                            'build_id': build['build_id']}
                    if build.get('epoch', None) is not None:
                        data['epoch'] = str(build['epoch'])
                        version_display = "%s:%s.%s" % (
                            data['epoch'], data['version'], data['release'])
                    else:
                        version_display = "%s.%s" % (data['version'],
                                                     data['release'])
                    latest_builds[t['name']] = data
                    print "    %s: %s" % (t['name'], version_display)
        if len(latest_builds) < 2:
            # don't process doc if there is no real data;
            # most likely this is an outdated package
            continue
        processed_doc = self.iconn.process(doc, False)
        processed_doc._doc.set_data(json.dumps(latest_builds))
        # preempt xappy's processing of data
        processed_doc._data = None
        self.iconn.add(processed_doc)

    print "Finished updating timestamp"
    self.update_timestamp(self.new_timestamp)
def update_db(self, timestamp=None):
    """Ask koji for any changes after we last ran the mapper. If a timestamp
    is provided in ISO format ('YYYY-MM-DD HH:MI:SS'), use that instead.
    """
    try:
        timestamp = float(timestamp)
    except (ValueError, TypeError):
        pass
    if not timestamp:
        timestamp = self.get_current_timestamp()
        try:
            timestamp = float(timestamp)
        except (ValueError, TypeError):
            pass
    if not timestamp:
        print "Error: you need to specify a time to update from in ISO format ('YYYY-MM-DD HH:MI:SS') or run init"
        exit(-1)

    self.new_timestamp = time.time() - 60
    print "Calculating timestamp minus 1 minute to account for any skew between the servers (%s)" % \
        time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(self.new_timestamp))

    opts = {'completedAfter': timestamp,
            'method': 'tagBuild',
            'decode': True}

    if isinstance(timestamp, float):
        display_timestamp = time.strftime('%Y-%m-%d %H:%M:%S',
                                          time.localtime(timestamp))
    else:
        display_timestamp = timestamp
    print "Getting Task List since %s" % display_timestamp
    task_list = self.koji_client.listTasks(opts=opts)

    print "Updating Index"
    for task in task_list:
        parent_id = task['parent']
        if parent_id:
            builds = self.koji_client.listBuilds(taskID=parent_id)
            if len(builds) < 1:
                continue
            build = builds[0]
            pkg_tags = self.koji_client.listTags(build['build_id'])
            dist_name = None
            for t in pkg_tags:
                dist_name = tags_to_name_map.get(t['name'], None)
                if dist_name:
                    break
            if not dist_name:
                continue

            pkg_doc = None
            if build['name'] in self.found_packages:
                pkg_doc = self.found_packages[build['name']]
            else:
                results = self.search(build['name'])
                if results:
                    pkg_doc = results[0]

            build_epoch = build.get('epoch', None)
            if build_epoch is not None:
                build_epoch = str(build_epoch)

            if not pkg_doc:
                # TODO create new document
                print "ran into new package %s" % build['name']
                self.new_packages[build['name']] = True
                doc = xappy.UnprocessedDocument()
                filtered_name = filter_search_string(build['name'])
                doc.fields.append(xappy.Field('key', filtered_name))
                latest_builds = {'name': build['name']}
                data = {}
                if build_epoch is not None:
                    data['epoch'] = build_epoch
                data['version'] = build['version']
                data['release'] = build['release']
                data['build_id'] = build['build_id']
                latest_builds[dist_name] = data
                processed_doc = self.iconn.process(doc, False)
                processed_doc._doc.set_data(json.dumps(latest_builds))
                # preempt xappy's processing of data
                processed_doc._data = None
                self.iconn.add(processed_doc)
                self.sconn_needs_reload = True
                self.iconn.flush()
            else:
                latest_builds = json.loads(pkg_doc._doc.get_data())
                data = latest_builds.get(dist_name, {'version': '0',
                                                     'release': '0',
                                                     'build_id': 0})
                data_epoch = None
                do_update = False
                if 'release' not in data:
                    # do the update because we have old data
                    do_update = True
                else:
                    data_epoch = data.get('epoch', None)
                    if data_epoch is not None:
                        data_epoch = str(data_epoch)
                    if rpm.labelCompare(
                            (build_epoch, build['version'], build['release']),
                            (data_epoch, data['version'],
                             data['release'])) == 1:
                        do_update = True

                if do_update:
                    self.updated_packages[build['name']] = True
                    build_vr = ''
                    if build_epoch is not None:
                        build_vr = "%s:%s.%s" % (build_epoch,
                                                 build['version'],
                                                 build['release'])
                    else:
                        build_vr = "%s.%s" % (build['version'],
                                              build['release'])
                    data_vr = ''
                    if data_epoch is not None:
                        data_vr = "%s:%s.%s" % (data_epoch, data['version'],
                                                data.get('release', ''))
                    else:
                        data_vr = "%s.%s" % (data['version'],
                                             data.get('release', ''))
                    print "Updating package %s in dist %s to version %s (from %s)" % (
                        build['name'], dist_name, build_vr, data_vr)

                    if build_epoch is not None:
                        data['epoch'] = build_epoch
                    data['version'] = build['version']
                    data['release'] = build['release']
                    data['build_id'] = build['build_id']
                    latest_builds[dist_name] = data
                    pkg_doc._doc.set_data(json.dumps(latest_builds))
                    # preempt xappy's processing of data
                    pkg_doc._data = None
                    self.iconn.replace(pkg_doc)
                    self.sconn_needs_reload = True
                    self.found_packages[build['name']] = pkg_doc
                    self.iconn.flush()

    updated_count = len(self.updated_packages)
    new_count = len(self.new_packages)
    print "Updated: %d packages" % updated_count
    print "  Added: %d packages" % new_count
    print "========================="
    print "  Total: %s" % (updated_count + new_count)
    self.update_timestamp(self.new_timestamp)