def build_index(self, remove_old=True):
    """Build the keyword search index for the registered modules.

    When remove_old is true, the existing index directory is deleted
    first so the index is rebuilt from scratch.  Each keyword entry
    becomes one document: the module uid is stored for retrieval and
    the keyword terms are freetext-indexed (positions disabled).
    """
    if remove_old:
        remove_directory(self.search_db_dir)
    conn = xappy.IndexerConnection(self.search_db_dir)
    self.__xappy = conn
    conn.add_field_action("module_uid", xappy.FieldActions.STORE_CONTENT)
    conn.add_field_action("keyword_term", xappy.FieldActions.INDEX_FREETEXT,
                          nopos=True)
    for entry in self.__keywords:
        for item in entry[2]:
            doc = xappy.UnprocessedDocument()
            doc.fields.append(xappy.Field("module_uid", item[0]))
            # split_word tokenizes the raw keyword text into index terms
            doc.fields.append(
                xappy.Field("keyword_term", ' '.join(split_word(item[1], True))))
            conn.add(doc)
    conn.close()
def indexer_connection(index_path=None):
    """Return a xappy IndexerConnection with the catalogue field schema applied.

    Falls back to the configure()-provided path when index_path is not given.
    """
    if not index_path:
        index_path = configure()
    indexer = xappy.IndexerConnection(index_path)

    # indexes
    indexer.add_field_action('searchable_text',
                             xappy.FieldActions.INDEX_FREETEXT, nopos=True)
    indexer.add_field_action('author', xappy.FieldActions.INDEX_EXACT)
    #indexer.add_field_action('keywords', xappy.FieldActions.FACET)
    for exact_field in ('type', 'alpha', 'language', 'genre'):
        indexer.add_field_action(exact_field, xappy.FieldActions.INDEX_EXACT)
    indexer.add_field_action('sortable_title', xappy.FieldActions.SORTABLE)
    indexer.add_field_action('hidden', xappy.FieldActions.INDEX_EXACT)
    #indexer.add_field_action('modified', xappy.FieldActions.SORTABLE, type='data')

    # metadata stored for retrieval from search results
    for stored_field in ('title', 'alpha', 'language', 'genre', 'type',
                         'searchable_text'):
        indexer.add_field_action(stored_field, xappy.FieldActions.STORE_CONTENT)
    #indexer.add_field_action('description', xappy.FieldActions.STORE_CONTENT)
    #indexer.add_field_action('author', xappy.FieldActions.STORE_CONTENT)
    return indexer
def maybe_make_db(self):
    """Create the xappy database at self.dbpath() if it does not exist yet.

    Registers the field schema (title/content/description/keyword, each
    freetext-indexed and stored) on the freshly created database.
    NOTE(review): reconstruction of flattened source assumes the schema
    setup runs only when the directory is first created — confirm against
    the original formatting.
    """
    dbpath = self.dbpath()
    if not os.path.exists(dbpath):
        os.makedirs(dbpath)
        conn = xappy.IndexerConnection(dbpath)
        # project-defined internal fields share the same stopwords/language
        add_internal_field_actions(conn, self.stopwords, self.language)
        # common options for every freetext field: stopword removal,
        # spelling-correction data, language-aware stemming
        free_text_options = {
            'stop': self.stopwords,
            'spell': True,
            'language': self.language
        }
        conn.add_field_action('title', xappy.FieldActions.INDEX_FREETEXT,
                              **free_text_options)
        conn.add_field_action('title', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action('content', xappy.FieldActions.INDEX_FREETEXT,
                              **free_text_options)
        conn.add_field_action('content', xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action("description", xappy.FieldActions.INDEX_FREETEXT,
                              **free_text_options)
        conn.add_field_action("description", xappy.FieldActions.STORE_CONTENT)
        conn.add_field_action("keyword", xappy.FieldActions.INDEX_FREETEXT,
                              **free_text_options)
        conn.add_field_action("keyword", xappy.FieldActions.STORE_CONTENT)
        conn.close()
def index(): """Index entire database.""" indexer = xappy.IndexerConnection(config.search_db) indexer.add_field_action("name", xappy.FieldActions.INDEX_FREETEXT, spell=True) indexer.add_field_action("id", xappy.FieldActions.INDEX_EXACT) indexer.add_field_action("type", xappy.FieldActions.INDEX_EXACT) def add_to_index(data): doc = xappy.UnprocessedDocument() doc.id = data.id for k, v in data.items(): doc.fields.append(xappy.Field(k, v)) doc = indexer.process(doc) doc.data = data indexer.replace(doc) import db def add_table(table, id_prefix="", type=""): for d in db.getdb().select(table): d.id = id_prefix + d.id d.type = type if table == "constituency": d.id = d.state + "/" + d.id add_to_index(d) add_table("party", "party/", "party") add_table("state", type="state") add_table("constituency", type="constituency") add_table("candidate", "candidate/", type="candidate") indexer.flush() indexer.close()
def CreateIndex():
    """Create the search schema: exact-match 'kod' plus Russian freetext 'name'."""
    connection = xappy.IndexerConnection('kis/lib/data')
    for field, action, extra in (
            ('kod', xappy.FieldActions.INDEX_EXACT, {}),
            ('name', xappy.FieldActions.INDEX_FREETEXT, {'language': 'ru'})):
        connection.add_field_action(field, action, **extra)
    connection.close()
def initdb():
    """Open the indexer at DBPATH, clear stale state via trash(), register fields.

    Returns the ready-to-use IndexerConnection.
    """
    conn = xappy.IndexerConnection(DBPATH)
    trash(conn)
    conn.add_field_action('name', xappy.FieldActions.INDEX_FREETEXT, spell=True)
    # 'id' is both searchable and stored for retrieval
    for action in (xappy.FieldActions.INDEX_FREETEXT,
                   xappy.FieldActions.STORE_CONTENT):
        conn.add_field_action('id', action)
    return conn
def MakeIndex():
    """Index every (rec_id, name) row of t_show_store_eisup_list into xappy."""
    connection = xappy.IndexerConnection('kis/lib/data')
    cursor = connections['default'].cursor()
    cursor.execute("SELECT rec_id,name FROM t_show_store_eisup_list;")
    for rec_id, name in cursor.fetchall():
        doc = xappy.UnprocessedDocument()
        # values come back as unicode; the index stores UTF-8 bytes
        doc.fields.append(xappy.Field('kod', rec_id.encode('utf-8')))
        doc.fields.append(xappy.Field('name', name.encode('utf-8')))
        connection.add(doc)
    connection.flush()
    connection.close()
def xappy_indexer_connection(self, path):
    """Open an IndexerConnection at *path* with the mail-message field schema.

    Body storage is optional (self.options.storebody); cc/bcc are
    searchable but never stored.
    """
    conn = xappy.IndexerConnection(path)
    conn.add_field_action('body', xappy.FieldActions.INDEX_FREETEXT,
                          language='en')
    if self.options.storebody:
        conn.add_field_action('body', xappy.FieldActions.STORE_CONTENT)
    # exact-match headers, each also stored for display
    for header in ('date', 'frm', 'to'):
        conn.add_field_action(header, xappy.FieldActions.INDEX_EXACT)
        conn.add_field_action(header, xappy.FieldActions.STORE_CONTENT)
    conn.add_field_action('subject', xappy.FieldActions.INDEX_FREETEXT,
                          language='en')
    conn.add_field_action('subject', xappy.FieldActions.STORE_CONTENT)
    conn.add_field_action('cc', xappy.FieldActions.INDEX_EXACT)
    conn.add_field_action('bcc', xappy.FieldActions.INDEX_EXACT)
    return conn
def __init__(self, base_uri, db_path):
    """Create a database writer for the specified path.
    """
    BaseDbWriter.__init__(self, base_uri, db_path)
    self.queue = Queue.Queue(1000)

    # Older Queue implementations predate task_done()/join(); patch in
    # do-nothing stand-ins so callers may invoke them unconditionally.
    def _noop(*args):
        pass
    for method_name in ('task_done', 'join'):
        if not hasattr(self.queue, method_name):
            setattr(self.queue, method_name, _noop)
    self.iconn = xappy.IndexerConnection(self.db_path)
def buildIndexWithArticles(self, articles):
    """Index *articles* into the Xapian database at self.__xapianPath.

    title/text/keyword/mesh are freetext-indexed (title weighted 5x for
    ranked matching), chemical_exact is exact-match, and all five fields
    are stored for retrieval.  Articles that fail to convert or to index
    are skipped; an in-place progress counter is written to stdout.
    """
    conn = xappy.IndexerConnection(self.__xapianPath)
    #add priority to title field in case of ranked matching (weight=5)- index all fields and store data
    conn.add_field_action('title', xappy.FieldActions.INDEX_FREETEXT,
                          weight=5, language='en')
    conn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT,
                          language='en')
    conn.add_field_action('chemical_exact', xappy.FieldActions.INDEX_EXACT)
    conn.add_field_action('keyword', xappy.FieldActions.INDEX_FREETEXT,
                          language='en')
    conn.add_field_action('mesh', xappy.FieldActions.INDEX_FREETEXT,
                          language='en')
    conn.add_field_action('text', xappy.FieldActions.STORE_CONTENT)
    conn.add_field_action('title', xappy.FieldActions.STORE_CONTENT)
    conn.add_field_action('chemical_exact', xappy.FieldActions.STORE_CONTENT)
    conn.add_field_action('keyword', xappy.FieldActions.STORE_CONTENT)
    conn.add_field_action('mesh', xappy.FieldActions.STORE_CONTENT)
    for article in articles:
        doc = self.__buildDoc(article)
        if doc is None:  # was "== None"; identity test is the correct idiom
            continue
        try:
            #process doc to pdoc explicitly - not needed here
            #pdoc = conn.process(doc)
            conn.add(doc)
        except Exception:
            # Was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit; narrowed so only real errors skip the article.
            continue
        PubMedXapian.__indexCount += 1
        # erase the previous progress message before printing the new one
        nbs = len(PubMedXapian.__indexMsg)
        PubMedXapian.__indexMsg = "article %s indexed" % (str(
            PubMedXapian.__indexCount))
        sys.stdout.write('\b' * nbs + PubMedXapian.__indexMsg)
    conn.flush()
    conn.close()
def create_index(self): """ Create a new index, and set up its field structure """ indexer = xappy.IndexerConnection(self.dbpath) indexer.add_field_action('exact_name', xappy.FieldActions.INDEX_FREETEXT) indexer.add_field_action('name', xappy.FieldActions.INDEX_FREETEXT, language='en', spell=True) indexer.add_field_action('summary', xappy.FieldActions.INDEX_FREETEXT, language='en') indexer.add_field_action('description', xappy.FieldActions.INDEX_FREETEXT, language='en') indexer.add_field_action('subpackages', xappy.FieldActions.INDEX_FREETEXT, language='en', spell=True) indexer.add_field_action('category_tags', xappy.FieldActions.INDEX_FREETEXT, language='en', spell=True) indexer.add_field_action('cmd', xappy.FieldActions.INDEX_FREETEXT, spell=True) # FieldActions.TAG not currently supported in F15 xapian (1.2.7) #indexer.add_field_action('tags', xappy.FieldActions.TAG) indexer.add_field_action('tag', xappy.FieldActions.INDEX_FREETEXT, spell=True) #indexer.add_field_action('requires', xappy.FieldActions.INDEX_EXACT) #indexer.add_field_action('provides', xappy.FieldActions.INDEX_EXACT) self.indexer = indexer
import sys import re import redis import xappy import time try: import json except ImportError: import simplejson as json from django.utils.html import strip_tags from backend.parser import TranscriptParser, MetaParser from backend.api import Act, KeyScene, Character, Glossary, LogLine from backend.util import seconds_to_timestamp search_db = xappy.IndexerConnection( os.path.join(os.path.dirname(__file__), '..', 'xappydb'), ) def mission_time_to_timestamp(mission_time): """Takes a mission time string (XX:XX:XX:XX) and converts it to a number of seconds""" d, h, m, s = map(int, mission_time.split(':')) timestamp = d * 86400 + h * 3600 + m * 60 + s if mission_time[0] == "-": return timestamp * -1 else: return timestamp class TranscriptIndexer(object): """
store_dir = path.join(store_dir, 'parts', part_target) if path.exists(store_dir): assert path.isdir(store_dir) else: os.mkdir(store_dir) return store_dir # create storage directory if not present store_dir = setupStorageDirectory() # search connection hub searcher = search.IndexSearch(store_dir) # async indexer indexer = xappy.IndexerConnection(store_dir) # if synchronous debugging, setup the index connection if iindex.DEBUG_SYNC: iindex.DEBUG_SYNC_IDX = indexer if interfaces.DEBUG: queue.QueueProcessor.POLL_TIMEOUT=3 else: queue.QueueProcessor.POLL_TIMEOUT=5 if interfaces.DEBUG: searcher.hub.auto_refresh_delta = 5 else: searcher.hub.auto_refresh_delta = 10
def do_indexing(self, col, filter_settings):
    """Perform an indexing pass.

    Index the database for col using filters given by filter_settings.

    The filename is used as the document id, and this is used to remove
    documents in the database that no longer have an associated file.

    continue_check will be called before each file of the collection is
    about to be processed. If it returns False then indexing will stop
    and do_indexing will return False. If do_indexing attempts to index
    all the files then it will return True.
    """
    conn = None
    try:
        name = col.name
        root_logger = logging.getLogger()
        get_remote_log().info(
            "Indexing collection: %s with filter settings: %s" %
            (name, filter_settings))
        dbname = col.dbpath()
        # This will error if the directory containing the databases has
        # disappeared, but that's probably a good thing - the document
        # collection is supposed to know where its database is - if it's
        # asking for indexing of a non-existent database, then it's the
        # collection's problem not the indexer's.
        # FIXME - we should really test for the error though, so we can
        # give a better error message.
        conn = xappy.IndexerConnection(dbname)
        conn.set_max_mem_use(max_mem_proportion=0.1)
        # Every doc id currently in the index starts as "not found"; ids
        # still False after the walk belong to deleted files and are
        # removed below.
        docs_found = dict((id, False) for id in conn.iterids())
        error_count = file_count = 0
        for f in col.files():
            if self.disk_space_short(dbname):
                # raise an exception rather than return False - we
                # don't want to keep trying to index in this
                # situation.
                raise DiskSpaceShortage
            if not self._process_file(f, conn, name, filter_settings):
                error_count += 1
            file_count += 1
            docs_found[f] = True
            if not self.continue_check(file_count, error_count):
                get_remote_log().debug(
                    "Prematurely terminating indexing, stop flag is true")
                return False
        # purge documents whose backing file no longer exists
        for id, found in docs_found.iteritems():
            if not found:
                get_remote_log().debug("Removing %s from %s" % (id, name))
                conn.delete(id)
        get_remote_log().info("Indexing of %s finished" % name)
        conn.close()
        get_remote_log().debug("Changes to %s flushed" % name)
        return True
    except xappy.XapianDatabaseLockError, e:
        # another process holds the database; skip this pass quietly
        get_remote_log().error(
            "Attempt to index locked database: %s, ignoring" % dbname)
os.chdir(directory) recent_filePath = os.getcwd() # get all subdirectories nested_directories = os.listdir(recent_filePath) # check whether there are subdirectories if os.path.isdir(nested_directories[0]): print directory, nested_directories # change into subdirectory recent_filePath = os.path.join(recent_filePath, nested_directories[0]) os.chdir(recent_filePath) # get all documents in the currently selected journal directory files = os.listdir(recent_filePath) # store current indexing ID (used in ids.txt) recent_xapianPath = str(counter) # open a new file connection to create a Xapian index conn = xappy.IndexerConnection(os.path.join(xapianPath, recent_xapianPath)) # create field to store the full texts conn.add_field_action('text', xappy.FieldActions.INDEX_FREETEXT, language='en') if not use_psql: # create a data field to store the full text in it, e.g. while iterating over search results conn.add_field_action('text', xappy.FieldActions.STORE_CONTENT) # iterate over all journal directories for file_name in files: doc = xappy.UnprocessedDocument() f = open(os.path.join(recent_filePath, file_name), "r") text = f.read() f.close() doc.fields.append(xappy.Field("text", text)) try:
def main():
    """Incrementally (re)index questions modified since the last run.

    Reads questions from MongoDB (gkc database), indexes them into the
    xappy database at settings.XAPIAN_LOCATION, records the modify_date
    of the youngest question seen in since_filename, and optionally runs
    a couple of smoke-test searches (--test).
    """
    tornado.options.parse_command_line()
    from apps.main.models import User
    from apps.questions.models import Question, Genre
    from mongokit import Connection
    con = Connection()
    con.register([Question, Genre, User])
    db = con.gkc
    # Work out the cut-off date: everything, or the recorded/passed "since"
    if options.reindex_all:
        since = datetime.datetime(1979, 12, 13)
    else:
        since = options.since
        if not since:
            since = default_since
        try:
            since = datetime.datetime.strptime(since, '%Y-%m-%d %H-%M-%S')
        except ValueError:
            # fall back to a date-only value
            since = datetime.datetime.strptime(since, '%Y-%m-%d')
    if options.verbose:
        print 'since', since
    # caches so each genre/author document is fetched from Mongo only once
    genres = {}
    authors = {}
    count = 0
    search = {'modify_date': {'$gt': since}}
    if not db.Question.find(search).count():
        if options.verbose:
            print "0 questions"
        if not options.test:
            return
    youngest = since
    indexer = xappy.IndexerConnection(settings.XAPIAN_LOCATION)
    # only (re)declare the schema on a fresh index or when explicitly asked
    if not indexer.get_fields_with_actions() or options.update_fields:
        indexer.add_field_action('question',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 weight=2, language='en', spell=True,
                                 stop=stopwords)
        indexer.add_field_action(
            'answer',
            xappy.FieldActions.INDEX_FREETEXT,
            language='en',
            spell=True,
        )
        indexer.add_field_action('accept',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('alternatives',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=True)
        indexer.add_field_action('author', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('genre', xappy.FieldActions.INDEX_EXACT)
        indexer.add_field_action('comment',
                                 xappy.FieldActions.INDEX_FREETEXT,
                                 language='en', spell=False,
                                 search_by_default=False, stop=stopwords)
        indexer.add_field_action('date', xappy.FieldActions.SORTABLE,
                                 type="date")
        indexer.add_field_action('state', xappy.FieldActions.SORTABLE)
        indexer.add_field_action('question', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('answer', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('genre', xappy.FieldActions.STORE_CONTENT)
        indexer.add_field_action('state', xappy.FieldActions.STORE_CONTENT)
    t0 = time.time()
    for question in db.Question.collection.find(search):
        # track the most recent modify_date so the next run resumes there
        if question['modify_date'] > youngest:
            youngest = question['modify_date']
        doc = xappy.UnprocessedDocument()
        doc.fields.append(xappy.Field('state', question['state']))
        doc.fields.append(xappy.Field('question', question['text']))
        doc.fields.append(xappy.Field('answer', question['answer']))
        if question['genre'].id in genres:
            genre = genres[question['genre'].id]
        else:
            genre = db.Genre.one({'_id': question['genre'].id})
            genre = genre.name
            genres[question['genre'].id] = genre
        doc.fields.append(xappy.Field('genre', genre))
        if question['author'].id in authors:
            author = authors[question['author'].id]
        else:
            author = db.User.one({'_id': question['author'].id})
            author = author.username
            authors[question['author'].id] = author
        doc.fields.append(xappy.Field('author', author))
        doc.fields.append(xappy.Field('comment', question['comment']))
        doc.fields.append(xappy.Field('accept',
                                      '\n'.join(question['accept'])))
        doc.fields.append(
            xappy.Field('alternatives',
                        '\n'.join(question['alternatives'])))
        # replace() keyed on the Mongo _id makes re-indexing idempotent
        doc.id = str(question['_id'])
        pdoc = indexer.process(doc)
        indexer.replace(pdoc)
        count += 1
        #if count and not count % 100:
        #    indexer.flush()
    # add a second to avoid milliseconds causing the same doc to be index
    # over and over
    youngest += datetime.timedelta(seconds=1)
    open(since_filename, 'w').write(youngest.strftime('%Y-%m-%d %H-%M-%S\n'))
    indexer.flush()
    t1 = time.time()
    indexer.close()
    if options.verbose:
        print round(t1 - t0, 3), "seconds to index", count, "questions"
    # test
    if options.test:
        print settings.XAPIAN_LOCATION
        searcher = xappy.SearchConnection(settings.XAPIAN_LOCATION)
        text = 'FRAMCEs capitalls'
        text = "Capitol STATE"
        print searcher.spell_correct(text)
        query = searcher.query_field('question', text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
        #result.data['state']
        text = 'london'
        query = searcher.query_field('answer', text,
                                     default_op=searcher.OP_OR)
        results = searcher.search(query, 0, 10)
        print results.matches_estimated
        #print results.estimate_is_exact
        for result in results:
            print result.rank, result.id
            print repr(result.summarise('question')), result.data['state'][0]
def create_db(self, db_path):
    """Create a xappy database at db_path.

    Opening an IndexerConnection creates the on-disk database; it is
    closed immediately since only creation is wanted here.
    """
    connection = xappy.IndexerConnection(db_path)
    connection.close()
def create_index(self):
    """Open indexer and searcher connections on self.dbpath and register
    the exact-match 'key' field.
    """
    path = self.dbpath
    indexer = xappy.IndexerConnection(path)
    searcher = xappy.SearchConnection(path)
    self.iconn = indexer
    self.sconn = searcher
    # keys are filtered package names or "_last_run_"
    indexer.add_field_action('key', xappy.FieldActions.INDEX_EXACT)
def __enter__(self):
    """Open the indexer connection, cap its memory use, and hand the
    connection to the with-block.
    """
    connection = xappy.IndexerConnection(self.dbpath)
    connection.set_max_mem_use(MAX_MEM)
    self.conn = connection
    return connection