def build_index():
    print("build index")
    client = MongoClient('localhost', 27017)
    collection = client['bdhackathon']['Japan_Travel']
    # update_document() below requires at least one unique field in the
    # schema, so article_id is marked unique here
    schema = Schema(
        article_title=TEXT(stored=True, analyzer=analyzer),
        article_id=ID(unique=True, stored=True),
        author=TEXT(stored=True),
        # content=TEXT(stored=True, analyzer=analyzer)
    )
    # initialize the Whoosh index directory
    if not os.path.exists("index"):
        os.mkdir("index")
        create_in("index", schema)
    ix = open_dir("index")
    writer = ix.writer()
    articles = collection.find()
    for article in articles:
        writer.update_document(
            article_title=article["article_title"],
            article_id=article["article_id"],
            author=article["author"]["account"],
            # content=article["content"]
        )
    writer.commit()
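# A minimal sketch (not part of the original source) of how the index built
# by build_index() above could be queried. Only the "index" directory and the
# article_title/author fields come from build_index(); the rest is standard
# Whoosh usage.
def search_articles(text):
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    ix = open_dir("index")
    with ix.searcher() as searcher:
        query = QueryParser("article_title", ix.schema).parse(text)
        # each hit exposes the stored fields like a dict
        return [(hit["article_title"], hit["author"]) for hit in searcher.search(query)]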
def create_whoosh(self):
    print "creating_whoosh: "
    # initlock = lockfile.ThreadSafeFile(WHOOSH_FOLDER, '_init')
    thistime = datetime.datetime.now()
    dateformat = '%d-%m-%Y %H:%M:%S'
    create_index_flag = False
    # try:
    #     initlock.acquire(timeout=2)
    # except lockfile.LockTimeout:
    #     print "Lock timeout when trying to create whoosh index schema. Continuing without index creation"
    #     return
    # except lockfile.AlreadyLocked:
    #     print "Already locked. Continuing without index creation"
    #     return
    try:
        last_creation = datetime.datetime.strptime(initlock.read(), dateformat)  # deserialize
        print "Last index creation: %s" % datetime.datetime.strftime(last_creation, dateformat)
        if (thistime - last_creation).total_seconds() > 4 * 60 * 60:  # 4 hours
            create_index_flag = True
            print "Index older than 4 hours - will recreate"
        else:
            print "Index is fresh - will not recreate"
    except:
        create_index_flag = True  # do the creation anyway, maybe initial condition
    if create_index_flag:
        ix = create_in(WHOOSH_FOLDER, schema)
        print "Creating search index"
        writer = ix.writer()
        for t in self.collector:
            # print "index: Adding term %s" % t[0]
            writer.add_document(term=u"%s" % t[0], url=u"%s" % t[1], description=u"%s" % t[2])
        writer.commit()
        # we can now free the collector
        self.collector = None
def __load__(region=None):
    """Load/create the indexes.

    :param region: which index to load; None loads all indexes,
        while e.g. 'news' or 'blog' loads only the matching index
    :return: whether loading succeeded
    """
    # load a single index
    if region:
        if region in Indexer.__index__:
            return True
        else:
            if region not in index_dir:
                return False
            if not os.path.exists(index_dir[region]):
                os.makedirs(index_dir[region])
                Indexer.__index__[region] = index.create_in(index_dir[region], schema, indexname=region)
            else:
                Indexer.__index__[region] = index.open_dir(index_dir[region], indexname=region)
            return True
    else:
        # load all indexes
        for reg in index_dir.keys():
            if reg in Indexer.__index__:
                continue  # this index is already loaded
            if not os.path.exists(index_dir[reg]):
                os.mkdir(index_dir[reg])
                Indexer.__index__[reg] = index.create_in(index_dir[reg], schema, indexname=reg)
            else:
                Indexer.__index__[reg] = index.open_dir(index_dir[reg], indexname=reg)
        return True
def open_index(self, schema):
    """
    Opens an index. Returns the writer.
    """
    if not os.path.exists(self.index_path):
        os.mkdir(self.index_path)
        index.create_in(self.index_path, schema)
    self._index = index.open_dir(self.index_path)
    return self._index.writer()
def build_indexes(self):
    if os.path.exists(self.index_location):
        log.debug('removing previous index')
        rmtree(self.index_location)
    if not os.path.exists(self.index_location):
        os.mkdir(self.index_location)
    chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME)
    chgset_idx_writer = chgset_idx.writer()
    file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
    file_idx_writer = file_idx.writer()
    log.debug('BUILDING INDEX FOR EXTENSIONS %s '
              'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
    for repo_name, repo in self.repo_paths.items():
        # skip indexing if there aren't any revisions
        if len(repo) < 1:
            continue
        self.index_files(file_idx_writer, repo_name, repo)
        self.index_changesets(chgset_idx_writer, repo_name, repo)
    log.debug('>> COMMITTING CHANGES <<')
    file_idx_writer.commit(merge=True)
    chgset_idx_writer.commit(merge=True)
    log.debug('>>> FINISHED BUILDING INDEX <<<')
def setup(self):
    """
    Defers loading until needed.
    """
    new_index = False
    # Make sure the index is there.
    if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
        os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
        new_index = True
    if not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
        raise IOError(
            "The path to your Whoosh index '%s' is not writable for the current user/group."
            % settings.HAYSTACK_WHOOSH_PATH
        )
    self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
    self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)
    if new_index is True:
        self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    self.setup_complete = True
def run(self):
    # open index
    self.buffer = deque(maxlen=BUFFERLINES)
    if not exists(self.indexdir):
        makedirs(self.indexdir)
        self.ix = create_in(self.indexdir, SCHEMA)
    else:
        if exists_in(self.indexdir):
            self.ix = open_dir(self.indexdir)
        else:
            self.ix = create_in(self.indexdir, SCHEMA)
    self.qp = QueryParser("content", self.ix.schema)
    self.searcher = self.ix.searcher()
    index_p = self.index_p
    while True:
        try:
            # check index_p
            try:
                type, data = index_p.recv()
            except EOFError:
                break
            try:
                if type == QUERY:
                    self._processSearch(data)
                elif type == LOG:
                    self._processLog(data)
                elif type == RENAME:
                    self._processRename(data)
                else:
                    prnt("Unexpected data in logindexsearch.")
            except:
                print_exc()
                prnt("EXCEPTION in logindexsearch process.")
        except KeyboardInterrupt:
            break
    self._dumpBuffer(self.buffer)
    self.searcher.close()
    self.ix.close()
def setup(self):
    """
    Defers loading until needed.
    """
    new_index = False
    # Make sure the index is there.
    if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
        os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
        new_index = True
    self.storage = store.FileStorage(settings.HAYSTACK_WHOOSH_PATH)
    self.content_field_name, fields = self.site.build_unified_schema()
    self.schema = self.build_schema(fields)
    self.parser = QueryParser(self.content_field_name, schema=self.schema)
    if new_index is True:
        self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    else:
        try:
            self.index = index.Index(self.storage, schema=self.schema)
        except index.EmptyIndexError:
            self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    self.setup_complete = True
def build_search(cls):
    analyzer = cls.analyzer
    schema = Schema(
        nid=ID(unique=True, stored=True),
        slug=ID(unique=True, stored=True),
        title=TEXT(stored=True, analyzer=analyzer),
        tag=KEYWORD(stored=True, lowercase=True, commas=True, scorable=True),
        description=TEXT(stored=True, analyzer=analyzer),
        content=TEXT(stored=True, analyzer=analyzer)
    )
    folder = cls.tmp_dir
    if not os.path.exists(folder):
        os.mkdir(folder)
        create_in(folder, schema)
    ix = open_dir(folder)
    writer = ix.writer()
    for article in Article.find({'status': Article.ACCEPTED}):
        writer.update_document(
            nid=str(article._id),
            slug=article.slug,
            title=article.title,
            tag=','.join(article.tag),
            description=article.description,
            content=article.content
        )
    writer.commit()
    cls.searcher = ix.searcher()
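# A hedged sketch (assumed, not from the original class) of paging through
# the article index built by build_search() above. cls.searcher and the
# schema fields come from that code; MultifieldParser and search_page() are
# standard Whoosh. The page size of 10 is an arbitrary choice.
def search_articles_page(cls, text, pagenum=1):
    from whoosh.qparser import MultifieldParser

    parser = MultifieldParser(["title", "content"], schema=cls.searcher.schema)
    # search_page returns one page of hits plus total-count metadata
    results = cls.searcher.search_page(parser.parse(text), pagenum, pagelen=10)
    return [(hit["slug"], hit["title"]) for hit in results]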
def clear_index(self):
    """
    Clear the index: Whoosh's ``create_in`` creates a new, empty index
    in the directory even if an index already exists there.
    """
    if os.path.exists("indexes"):
        index.create_in("indexes", self.schema)
def main(script, command='', index='', field='', *query):
    """Store, clear or search data in whoosh indices.

    Can also be used to create vectors needed for task 3.
    'command' is either build|store|clean|search|vector
    'index' is either atc|icd|therapy|case

    Usage: python3 index.py <command> [index] [field] [query]
    """
    # Store all objects in index
    if command == 'build':
        populate_all()
        empty = get_empty_indices()
        for cls in empty:
            store_objects_in_index(cls)
        return

    classes = [ATC, ICD, PatientCase, Therapy]
    if index:
        classes = [i for i in classes if i._NAME == index]
        if not classes:
            print("Unknown index %s, valids: atc|icd|case|therapy" % index)
            sys.exit(2)

    # Store objects in index, will create duplicates if run several times
    if command == 'store':
        populate_all()
        for cls in classes:
            store_objects_in_index(cls)

    # Empty index
    elif command in ('clean', 'clear'):
        for cls in classes:
            create_or_open_index(cls)
            create_in(INDEX_DIR, SCHEMA_MAP[cls._NAME], cls._NAME)
            print("Emptied %s index" % cls.__name__)

    # Create vectors
    elif command.startswith('vector'):
        populate_all()
        create_vectors()

    # Search in whoosh index
    elif command == 'search':
        mapping = {'icd': ('short', 'label'), 'atc': ('code', 'title'),
                   'therapy': ('code', 'title'), 'case': ('code',)}
        query = ''.join(query)  # Flatten query
        cls, = classes  # Can only search on one index at a time
        print_result(extract(mapping[cls._NAME], search(cls, field, query)))

    # Unknown command
    else:
        print("Unknown command '%s'" % command)
        print("Usage: python3 index.py <command> [index] [field] [query]")
        print("Command is either build|store|clean|search|vector")
        sys.exit(2)
    sys.exit(None)
def get_index(self):
    ip = self.indexpath
    if not self.indexpath.startswith('/'):
        ip = path.join(self.env.path, ip)
    if not path.exists(ip):
        os.mkdir(ip)
    if not index.exists_in(ip):
        index.create_in(ip, self.SCHEMA)
    return index.open_dir(ip)
def setup(self):
    import os
    if not os.path.exists(self.location):
        os.mkdir(self.location)
        self.ix = index.create_in(self.location, self.schema)
    elif index.exists_in(self.location):
        self.ix = index.open_dir(self.location, schema=self.schema)
    else:
        self.ix = index.create_in(self.location, self.schema)
def create(self, path):
    """
    Create the index directory if it hasn't already been created.
    """
    if not os.path.exists(path):
        os.mkdir(path)
        # create the index in the directory we just made
        create_in(path, self.schema)
        return True
    return False
def test_detects_that_index_needs_upgrade(self):
    wrong_schema = Schema(content=TEXT())
    index.create_in(self.index_dir, schema=wrong_schema)
    whoosh_backend = WhooshBackend(self.env)
    self.assertEqual(whoosh_backend.is_index_outdated(), True)
    whoosh_backend.recreate_index()
    self.assertEqual(whoosh_backend.is_index_outdated(), False)
def run(args):
    indexdir = "urla.index"
    schema = Schema(file=ID(stored=True),
                    line=NUMERIC(stored=True),
                    network=ID(stored=True),
                    channel=TEXT(stored=True),
                    when=DATETIME(stored=True),
                    speaker=ID(),
                    to=ID(),
                    content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    os.mkdir(indexdir)
    create_in(indexdir, schema)
def create_index():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           id=fields.ID(stored=True),
                           authors=fields.TEXT(stored=True),
                           wiki=fields.TEXT)
    if not os.path.exists("bibishare/search/index"):
        os.mkdir("bibishare/search/index")
        create_in("bibishare/search/index", schema)
def create_index(self):
    """If the index directory does not exist, this method creates the
    directory, creates the index and saves the schema file."""
    if not os.path.exists(self.indexdir):
        os.mkdir(self.indexdir)
        self.message("Created index at \"" + self.indexdir + "\".")
        create_in(self.indexdir, self.schema)
        with open(self.indexdir + "/" + SCHEMA_PATH, 'w') as output:
            output.write(open(self.schema_path).read())
        self.message("Saved schema to " + self.indexdir + "/" + SCHEMA_PATH + ".")
def get_index():
    """
    Return an index, creating it first if none exists yet.
    """
    if not exists_in("index", indexname="contents"):
        if not os.path.exists("index"):
            os.mkdir("index")
        create_in("index", indexname="contents", schema=get_schema())
    return open_dir("index", indexname="contents")
def create_index(reg):
    woosh_env = reg.getUtility(IWhooshEnvironment)
    try:
        docschema = reg.getUtility(IDocumentSchema)
    except ComponentLookupError:
        raise AttributeError('')
    if index.exists_in(woosh_env['index_dir']):
        print >> sys.stderr, 'Index already exists; you need to delete it before building it again'
        exit(-1)
    index.create_in(woosh_env['index_dir'], docschema.Schema)
def get(cls, instance):
    """
    Get the index object for a python object
    """
    index_path = cls.get_path(instance)
    class_name = Introspection.get_class_name(instance)
    instance = cls.get_instance_from_class(index_path, instance)
    schema = cls.get_schema(instance)
    if not os.path.exists(index_path):
        os.makedirs(index_path)
    if not whoosh_index.exists_in(index_path, indexname=class_name):
        whoosh_index.create_in(index_path, schema, indexname=class_name)
    return cls._get_index(index_path, class_name, schema=schema)
def handle_noargs(self, *args, **options):
    print 'Clearing current index...'
    if not os.path.exists(settings.WHOOSH_INDEX):
        os.mkdir(settings.WHOOSH_INDEX)
    index.create_in(settings.WHOOSH_INDEX, DOCUMENT_WHOOSH_SCHEMA)
    print 'Indexing documents...'
    for document in Document.objects.all():
        document.save()  # A save will trigger a re-index of the document
        sys.stdout.write('.')
        sys.stdout.flush()
    print '\nAll done.'
def index(self, value):
    if not path.exists(value):
        log.info("creating whoosh database directory")
        mkdir(value)
    if not exists_in(value):
        log.info("Whoosh DB does not exist. Creating it.")
        create_in(value, self.schema)
    else:
        log.info("Whoosh DB exists. Using existing Whoosh DB.")
    ix = open_dir(value)
    self._index = ix
def delete_all(cls, instance):
    """Delete all objects of the given type from the index."""
    index_path = Index.get_path(instance)
    instance = Index.get_instance_from_class(index_path, instance)
    if instance is not None:
        class_name = Introspection.get_class_name(instance)
        logger.debug('Deleting all {} from {}'.format(class_name, index_path))
        if os.path.isdir(index_path):
            schema = Index.get_schema(instance)
            # create_in over an existing index replaces it with an empty one
            whoosh_index.create_in(index_path, schema, indexname=class_name)
            return True
def clear_index(self):
    """
    Clear the index: Whoosh's ``create_in`` creates a new, empty index
    in the directory even if an index already exists there.
    """
    if os.path.exists("indexes"):
        index.create_in("indexes", self.schema)
    if os.path.exists("doctypes"):
        with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
            defaultFile.write("{}")
def __init__(self, doc_base):
    self.__doc_base = doc_base
    self.__index_folder = os.path.join(self.__doc_base, '.indices')
    if not os.path.exists(self.__index_folder):
        print ' Create index directory', self.__index_folder
        os.mkdir(self.__index_folder)
    if not exists_in(self.__index_folder):
        create_in(self.__index_folder, INDEX_SCHEMA)
    self.__index_write = open_dir(self.__index_folder).writer()
def __init__(self, order=0, whoosh_query_index_dir="", unique=True):
    super(WhooshQueryLogger, self).__init__(order)
    self.description = "Adds queries to a Whoosh index"
    self.unique = unique
    print "About to create Whoosh query logger"
    self.whooshIndexDir = whoosh_query_index_dir
    schema = Schema(title=ID(unique=True, stored=True),
                    content=TEXT(stored=True),
                    ncontent=NGRAM(stored=True),
                    issued=DATETIME(stored=True))
    if not exists_in(self.whooshIndexDir):
        print "Creating a Whoosh Index."
        create_in(self.whooshIndexDir, schema)
    self.queryIndex = open_dir(self.whooshIndexDir)
    print "The current number of queries held in the index is: " + str(self.queryIndex.doc_count())
    print "Done creating Whoosh query log index"
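# A small sketch (assumed, not part of the original logger) of how the
# unique=True title field above can be used: update_document() replaces any
# existing document with the same title, so repeated queries are not
# duplicated when self.unique is set. `timestamp` should be a datetime for
# the DATETIME field.
def log_query(self, query_text, timestamp):
    writer = self.queryIndex.writer()
    if self.unique:
        # replaces the previous entry for this exact query string
        writer.update_document(title=query_text, content=query_text, issued=timestamp)
    else:
        writer.add_document(title=query_text, content=query_text, issued=timestamp)
    writer.commit()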
def __init__(self):
    """
    Instantiate the whoosh schema and writer and create/open the index.
    """
    self.users_collection = pymongo.Connection().fullteck.users
    # self.webpages_collection = pymongo.Connection().fullteck.webpages_col
    self.indexdir = "index"
    self.indexname = "users"
    self.schema = self.get_schema()
    if not os.path.exists(self.indexdir):
        os.mkdir(self.indexdir)
        create_in(self.indexdir, self.schema, indexname=self.indexname)
    # create an index obj and buffered writer
    self.ix = open_dir(self.indexdir, indexname=self.indexname)
def __init__(self, db):
    """
    Instantiate the whoosh schema and writer and create/open the index.
    """
    self.bookmarks_collection = db.bookmarks_col
    self.webpages_collection = db.webpages_col
    self.indexdir = "index"
    self.indexname = "bookmarks"
    self.schema = self.get_schema()
    if not os.path.exists(self.indexdir):
        os.mkdir(self.indexdir)
        create_in(self.indexdir, self.schema, indexname=self.indexname)
    # create an index obj and buffered writer
    self.ix = open_dir(self.indexdir, indexname=self.indexname)
def get_ix(self):
    """Creates the schema and returns the index"""
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True, unique=True),
                    content=TEXT(analyzer=SpaceSeparatedTokenizer() | LowercaseFilter()),
                    date=STORED)
    # create if it does not exist
    if not os.path.exists(".indexdir"):
        os.mkdir(".indexdir")
        create_in(".indexdir", schema)
    ix = open_dir(".indexdir")
    return ix
def __init__(self):
    self.indexDir = "./indexfile"
    if not os.path.exists(self.indexDir):
        os.mkdir(self.indexDir)
    self.schema = Schema(url=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                         title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                         content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                         anchors=KEYWORD(stored=True, commas=True),
                         pageRank=NUMERIC(int, 32, sortable=True, stored=True))
    self.exists = index.exists_in(self.indexDir, indexname="nkai")
    if self.exists:
        self.index = index.open_dir(self.indexDir, indexname="nkai")
    else:
        self.index = index.create_in(self.indexDir, schema=self.schema, indexname="nkai")
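# A hedged sketch (not from the original class) of querying the index above
# ranked by the sortable pageRank field; only self.index and the schema come
# from __init__(), the sortedby/reverse arguments are standard Whoosh.
def search_by_pagerank(self, text, limit=10):
    from whoosh.qparser import QueryParser

    with self.index.searcher() as searcher:
        query = QueryParser("content", self.index.schema).parse(text)
        # sortedby uses the sortable NUMERIC column; reverse=True puts the
        # highest pageRank first instead of relevance ordering
        results = searcher.search(query, limit=limit, sortedby="pageRank", reverse=True)
        return [(hit["url"], hit["title"]) for hit in results]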
def get_index(api, recreate=False, must_exist=False):
    index_dir = api.ftsindex
    if index_dir.exists():
        if recreate:
            rmtree(index_dir)  # pragma: no cover
        elif must_exist:
            raise ValueError('No whoosh index found at {0}.'.format(index_dir))
    if not index_dir.exists():
        index_dir.mkdir()
        schema = Schema(id=ID(stored=True),
                        provider=KEYWORD(stored=True),
                        authoryear=TEXT(stored=True),
                        title=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                        author=TEXT(stored=True),
                        year=TEXT(stored=True),
                        doctype=TEXT(stored=True),
                        lgcode=TEXT(stored=True),
                        body=TEXT(),
                        tags=KEYWORD)
        return index.create_in(index_dir.as_posix(), schema)
    return index.open_dir(index_dir.as_posix())
def build_index(self):
    analyzer = ChineseAnalyzer()
    # create the index schema
    schema = Schema(novelID=ID(stored=True),
                    novelName=TEXT(stored=True, analyzer=analyzer),
                    novelUrl=ID(stored=True),
                    novelAuthor=TEXT(stored=True, analyzer=analyzer),
                    novelIntroduction=TEXT(stored=True, analyzer=analyzer),
                    novelUpdateTime=TEXT(stored=True),
                    novelUpdateUrl=ID(stored=True),
                    novelUpdateName=TEXT(stored=True))
    # create/open index
    dir_path = 'novel_index'
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        ix = create_in(dir_path, schema)
    else:
        ix = open_dir(dir_path)
    # build index
    writer = ix.writer()
    rows = self.page.find()
    indexed_amount = 0
    for row in rows:
        indexed_amount += 1
        writer.add_document(
            novelID=str(row['_id']),
            novelName=row['novel_name'],
            novelUrl=row['novel_url'],
            novelAuthor=row['novel_author'],
            novelIntroduction=row['novel_introduction'],
            novelUpdateTime=row['novel_update_last_time'],
            novelUpdateUrl=row['novel_update_last_url'],
            novelUpdateName=row['novel_update_last_name'],
        )
    writer.commit()
    print(indexed_amount)
def index(self):
    schema = Schema(path=ID(stored=True),
                    title=TEXT(stored=True),
                    year=TEXT(stored=True),
                    rated=TEXT(stored=True),
                    director=TEXT(stored=True),
                    actors=TEXT(stored=True),
                    plot=TEXT(stored=True),
                    imdb=TEXT(stored=True),
                    poster=TEXT(stored=True),
                    url=TEXT(stored=True))
    indexer = create_in("index", schema)
    writer = indexer.writer()
    # Open with encoding='utf-8' to avoid a UnicodeDecodeError
    # ("'charmap' codec can't decode byte 0x81 ...") on some platforms
    with open('moviedata.csv', encoding='utf-8') as csv_file:
        # Load the csv file
        csv_reader = csv.reader(csv_file, delimiter=',')
        skip = 0
        for row in csv_reader:
            # Skip first row
            if skip == 0:
                skip = 1
                continue
            # For each row in csv, add it as a document with appropriate headers to the index.
            writer.add_document(title=u"%s" % (row[1]),
                                year=u"%s" % (row[2]),
                                rated=u"%s" % (row[3]),
                                director=u"%s" % (row[4]),
                                actors=u"%s" % (row[5]),
                                plot=u"%s" % (row[6]),
                                imdb=u"%s" % (row[7]),
                                poster=u"%s" % (row[8]),
                                url=u"%s" % (row[9]))
    # Commit updates to index
    writer.commit()
    self.indexer = indexer
def create_index():
    # We create the index schema.
    schema = Schema(id=ID(stored=True), title=TEXT, content=TEXT)
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    ix = index.create_in(index_dir, schema)
    # The writer() method of the Index object returns an IndexWriter object that lets us add documents to the index.
    writer = ix.writer()
    # Add documents to the index.
    for doc in docs:
        # Add document
        writer.add_document(id=doc['id'].decode(),
                            title=doc['title'].decode(),
                            content=doc['content'].decode())
    # Calling commit() on the IndexWriter saves the added documents to the index.
    writer.commit()
    ix.close()
def queryWithStemming(self, string, content):
    if string in content:
        return True
    # keep only alphabetic tokens; removing items from a list while
    # iterating over it skips elements, so build a new list instead
    words = [word for word in segment(string) if word.isalpha()]
    string1 = " ".join(words)
    # print string1
    words = [word for word in segment(string + "s") if word.isalpha()]
    string2 = " ".join(words)
    # print string2
    self.init_index()
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT(analyzer=StemmingAnalyzer()))
    ix = create_in("index", schema)
    writer = ix.writer()
    # content = "The good pid file function of directory good can not work"
    writer.add_document(title=u"First document", path=u"/a", content=unicode(content))
    writer.commit()
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(string1)
        results = searcher.search(query)
        res1 = len(results) > 0
        query = QueryParser("content", ix.schema).parse(string2)
        results = searcher.search(query)
        res2 = len(results) > 0
    res = res1 | res2
    # print res, res1, res2
    return res
def gen_whoosh_database(if_rand=True):
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    type=TEXT(stored=True),
                    link=ID(unique=True, stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    whoosh_db = 'database/whoosh'
    if not os.path.exists(whoosh_db):
        os.makedirs(whoosh_db)
        ix = create_in(whoosh_db, schema)
    else:
        ix = open_dir(whoosh_db)
    writer = ix.writer()
    do_for_app(writer, rand=if_rand)
    do_for_post(writer, rand=if_rand)
    print('-' * 10)
    writer.commit()
def createSearchableData(data_file):
    '''
    Schema definition: video id, video title, description
    '''
    stem_analyzer = StemmingAnalyzer()
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True),
                    description=TEXT(analyzer=stem_analyzer, stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    # Create the index and a writer to add documents as per the schema
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    with open(data_file) as f:
        youtube_array = json.load(f)
    for youtube_item in tqdm(youtube_array):
        youtube_id = youtube_item['id']
        youtube_title = youtube_item['title']
        youtube_description = youtube_item['description']
        writer.add_document(id=youtube_id,
                            title=youtube_title,
                            description=youtube_description)
    writer.commit()
def write_index_file(index_dir, tablename):
    analyzer = ChineseAnalyzer(minsize=1)  # minsize=1 allows single-character tokens
    schema = Schema(seq_no=NUMERIC(stored=True),
                    source=TEXT(stored=True),
                    title=TEXT(stored=True),
                    segwords=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    datasets = fetch_segwords(tablename)
    for dataset in datasets:
        get_seq_no = int(dataset["seq_no"])
        get_source = dataset["ad_title_source"]
        get_title = dataset["ad_title"].replace('\n', '')
        get_segwords = dataset["ad_title_segwords"].replace('\n', '')
        writer.add_document(seq_no=get_seq_no,
                            source=get_source,
                            title=get_title,
                            segwords=get_segwords)
    writer.commit()
    loginfo = 'Inverted index for %s has been created.' % tablename
    gl.write_log(logpath, 'info', loginfo)
def main():
    # use jieba's Chinese analyzer
    analyzer = ChineseAnalyzer()
    # create the schema; stored=True means the field can be returned with results
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    path=ID(stored=False),
                    content=TEXT(stored=True, analyzer=analyzer),
                    id=TEXT(stored=True))
    # read the yaml config
    config = os.path.abspath(os.path.dirname(__file__))[:-5] + 'config.yaml'
    with open(config) as f:
        c = yaml.load(f.read())
    indexdir = c['index']
    # store the schema information under the indexdir directory
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    idx = create_in(indexdir, schema)
    # add the documents to be indexed according to the schema
    # note: strings must be unicode
    return idx
def build_whoosh_database():
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    type=TEXT(stored=True),
                    link=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(whoosh_database, schema)
    writer = ix.writer()
    mpost = MPost()
    recs = mpost.query_all()
    for rec in recs:
        text2 = html2text.html2text(tornado.escape.xhtml_unescape(rec.cnt_html))
        print(text2)
        writer.add_document(
            title=rec.title,
            type='<span style="color:blue;">[文档]</span>',
            link='/post/{0}.html'.format(rec.uid),
            content=text2,
        )
    writer.commit()
def index_create():
    # The key change: replace the original
    # RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)") with the jieba
    # Chinese tokenizer instead of the regex-based analyzer.
    analyzer = ChineseAnalyzer()
    # list all fields of the index
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in('indexer', schema)
    # add all documents to the index
    writer = ix.writer()
    for root, dirs, files in os.walk('data2/'):
        for file in files:
            path2 = os.path.join(root, file)
            with open(path2, 'r') as f:
                content2 = f.read()
            title2 = content2.split('\n')[0]
            writer.add_document(title=title2,
                                path='auto.sohu.com/' + path2[6:].replace('|', '/') + '.shtml',
                                content=content2)
    writer.commit()
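# A minimal sketch (assumption, not in the original) showing how the stored
# content field above enables highlighted snippets; hit.highlights() is
# standard Whoosh and works here because content is stored=True. Only the
# 'indexer' directory name comes from index_create().
def search_with_snippets(text):
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    ix = open_dir('indexer')
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(text)
        for hit in searcher.search(query):
            print(hit["title"])
            # prints an excerpt of the stored content with matches marked up
            print(hit.highlights("content"))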
def cargar():
    categorias = Schema(name=ID(stored=True), content=TEXT)
    titulos = Schema(name=ID(stored=True), content=KEYWORD)
    enlaces = Schema(name=ID(stored=True), content=TEXT)
    fechas = Schema(name=ID(stored=True), content=DATETIME)
    descripciones = Schema(name=ID(stored=True), content=KEYWORD)
    schemas = [categorias, titulos, enlaces, fechas, descripciones]
    paths = []
    # `schema` and `lista` are assumed to be defined at module level
    ix = index.create_in("indexdir3", schema)
    writer = ix.writer()
    path = "myRoko.txt"
    with open(path, "r") as f:
        content = f.read()
    writer.add_document(name=path, content=content)
    writer.commit()
    if not os.path.exists("index"):
        os.mkdir("index")
        ix = create_in("index", schema)
    ix = open_dir("index")
    writer = ix.writer()
    for i in range(len(lista[0])):
        writer.add_document(category=lista[0][i],
                            title=lista[1][i],
                            link=lista[2][i],
                            date=lista[3][i],
                            description=lista[4][i])
    writer.commit()
    searcher = ix.searcher()
    # with ix.searcher() as searcher:
def index():
    if not os.path.exists(dirindextemas):
        os.mkdir(dirindextemas)
    ix = create_in(dirindextemas, schema=get_schema())
    writer = ix.writer()
    # Extract the data using BeautifulSoup
    # Missing
    l = extractXML()
    i = 0
    # Todo: Extract the attributes
    # for item in l:
    #     writer.add_document(titulo=titulo, antetitulo=antetitulo, link=link, description=description, fecha=fecha)
    #     i += 1
    writer.add_document(titulo="test", description="test", categoria="test cag",
                        fecha=datetime.datetime.now())
    messagebox.showinfo("Temas indexados", "Se han indexado " + str(i) + " temas")
    writer.commit()
def full_index(index_dir, entries):
    idx = index.create_in(index_dir, SCHEMA)
    writer = idx.writer()
    datas = []
    for ent in entries:
        # read the whole file at once
        with open(ent, mode='r') as f:
            all_of_it = f.read()
        all_of_it = all_of_it.replace('\n', ' ')
        data = {'title': ent.split('/')[1],
                'path': ent,
                'content': all_of_it}
        datas.append(data)
    # TODO: get data
    for data in datas:
        writer.add_document(**data)
    writer.commit()
    return idx
def create_whoosh_index(video_list, index_name):
    # Schema definition:
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True),
                    description=TEXT(analyzer=StemmingAnalyzer(), stored=True))
    # create a folder to store the index
    if not os.path.exists(index_name):
        os.mkdir(index_name)
    index = create_in(index_name, schema)
    writer = index.writer()
    for video_item in video_list:
        video_id = video_item['id']
        video_title = video_item['snippet']['title']
        video_description = video_item['snippet']['description']
        writer.add_document(id=video_id, title=video_title, description=video_description)
    writer.commit()
def __init__(self):
    global RUN
    print('Initial searcher')
    self.schema = Schema(url=ID(stored=True, unique=True),
                         store_path=TEXT(stored=True),
                         timestamp=TEXT(stored=True),
                         content=TEXT(stored=False, analyzer=ChineseAnalyzer()))
    try:
        print("Open index dir")
        self.my_idx = windex.open_dir(cfg.storage_dir)
    except:
        print("Index dir not found, create new index dir")
        self.my_idx = windex.create_in(cfg.storage_dir, self.schema)
    self.searcher = self.my_idx.searcher()
    self.content_reader = helper.content_reader()
    print("All documents: %s" % self.searcher.doc_count_all())
    print("Valid documents: %s" % self.searcher.doc_count())
    print("Init bayes model, may take some time...")
    self.bayesData = bayes.get_instance()
    RUN = True
    self.ticker = Ticker()
    self.ticker.tick = False
    threading.Thread(target=self.ticker.run).start()
    self.update_lock = threading.Lock()
    self.read_lock = threading.RLock()
    self.aborted_update = False
    self.is_update_in_progress = False
    self.update_progress = 0
def build_index(dir):
    # load the preprocessed documents
    df = doc_preprocess(dir)
    # apply the jieba Chinese analyzer to tokenize the documents
    analyzer = ChineseAnalyzer()
    # create schema; stored=True means the field can be returned to the user
    schema = Schema(idx=ID(stored=True),
                    title=TEXT(stored=True, analyzer=analyzer),
                    keyword=KEYWORD(analyzer=analyzer),
                    content=TEXT(stored=False, analyzer=analyzer))
    # store the schema information in 'indexdir'
    indexdir = 'indexdir/'
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    ix = create_in(indexdir, schema)
    # build the index based on the schema
    writer = ix.writer()
    for idx, val in df.iterrows():
        writer.add_document(idx=str(idx),
                            title=str(val[0]),
                            keyword=val[5],
                            content=str(val[6]))
    writer.commit()
def handle(self, *args, **kwargs):
    """
    Creates the index iterating over all the pages of the site
    """
    schema = Schema(pk=NUMERIC(unique=True, stored=True),
                    title=TEXT,
                    summary=TEXT,
                    tags=KEYWORD(commas=True, scorable=True),
                    pub_date=DATETIME(sortable=True))
    if not os.path.exists(settings.INDEX):
        os.mkdir(settings.INDEX)
    ix = create_in(settings.INDEX, schema)
    writer = ix.writer()
    objects = Page.objects.all()
    for object in objects:
        tags = map(lambda x: x.title, object.tags.all())
        writer.add_document(title=object.title,
                            summary=object.summary,
                            tags=",".join(tags),
                            pk=object.pk,
                            pub_date=object.pub_date)
    writer.commit()
def create_news_index(dir_index, news):
    if not os.path.exists(dir_index):
        os.mkdir(dir_index)
    ind = create_in(dir_index, schema=get_news_schema())
    writer = ind.writer()
    for story in news:
        category = story[0]
        title = story[1]
        link = story[2]
        description = story[3]
        date = story[4]
        writer.add_document(category=str(category),
                            title=str(title),
                            link=str(link),
                            description=str(description),
                            date=date)
    writer.commit()
    messagebox.showinfo(
        "Success", "Index created correctly, " + str(len(news)) + " news items saved")
def __init__(self, index_path, language):
    from whoosh import index as whoosh_index
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import qparser
    from whoosh.highlight import UppercaseFormatter
    from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
    from whoosh.lang import has_stemmer, has_stopwords
    import os

    if not has_stemmer(language) or not has_stopwords(language):
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()
    else:
        analyzer = LanguageAnalyzer(language)

    self.schema = Schema(path=ID(unique=True, stored=True),
                         body=TEXT(analyzer=analyzer))
    self.formatter = UppercaseFormatter()
    self.index_path = index_path

    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as e:
            sys.exit("Error creating Whoosh index: %s" % e)

    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as e:
            sys.exit("Error opening whoosh index: {0}".format(e))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)

    self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema)
    self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
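# A hedged usage sketch (not part of the original class): with the
# FuzzyTermPlugin registered above, the parsed query language accepts
# "word~" / "word~2" for edit-distance matching. Only self.query_parser and
# self.search_index come from __init__().
def fuzzy_search(self, text):
    with self.search_index.searcher() as searcher:
        # e.g. "whosh~2" would match "whoosh" within two edits
        query = self.query_parser.parse(text)
        return [hit["path"] for hit in searcher.search(query)]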
def createIndexRsr(path, tblRsr):
    schema = Schema(fname=TEXT(stored=True), lname=TEXT(stored=True),
                    id=TEXT(stored=True), mstid=TEXT(stored=True),
                    science=TEXT(stored=True), scienceCode=TEXT(stored=True),
                    field=TEXT(stored=True), subfield=TEXT(stored=True),
                    keyws_en=TEXT(stored=True), keyws_sl=TEXT(stored=True),
                    content=TEXT)
    index = create_in(path + "whooshindex/rsr", schema)
    writer = index.writer()
    for rsr in tblRsr.all():
        content = ""
        s = u""
        s_code = u""
        f = u""
        sub = u""
        keyws_en = u""
        keyws_sl = u""
        if rsr.has_key('science'):
            s = rsr['science']['#text']
            s_code = rsr['science']['@code']
            content += " " + rsr['science']['#text']
        if rsr.has_key('field'):
            f = rsr['field']['#text']
            content += " " + rsr['field']['#text']
        if rsr.has_key('subfield'):
            sub = rsr['subfield']['#text']
            content += " " + rsr['subfield']['#text']
        if rsr.has_key('keyws_en'):
            keyws_en = rsr['keyws_en']['@keyws']
            content += " " + rsr['keyws_en']['@keyws']
        if rsr.has_key('keyws_sl'):
            keyws_sl = rsr['keyws_sl']['@keyws']
            content += " " + rsr['keyws_sl']['@keyws']
        if content != "":
            print rsr["@id"] + ": " + content
        writer.add_document(lname=rsr['fname'], fname=rsr['lname'],
                            id=rsr['@id'], mstid=rsr['@mstid'],
                            science=s, scienceCode=s_code, field=f, subfield=sub,
                            keyws_en=keyws_en, keyws_sl=keyws_sl, content=content)
    writer.commit()
    return index
def gen_whoosh_database(kind_arr=None, post_type=None):
    '''
    :param kind_arr: defines the `type` values other than Post, Page, Wiki
    :param post_type: defines the templates for the different kinds
    :return:
    '''
    if kind_arr is None:
        kind_arr = []
    if post_type is None:
        post_type = {}
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    catid=TEXT(stored=True),
                    type=TEXT(stored=True),
                    link=ID(unique=True, stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    whoosh_db = 'database/whoosh'
    if not os.path.exists(whoosh_db):
        os.makedirs(whoosh_db)
        create_idx = create_in(whoosh_db, schema)
    else:
        create_idx = open_dir(whoosh_db)
    writer = create_idx.writer()
    # do_for_app2(writer, rand=if_rand)
    for switch in [True, False]:
        do_for_post(writer, rand=switch, doc_type=post_type['1'])
        do_for_wiki(writer, rand=switch, doc_type=post_type['1'])
        do_for_page(writer, rand=switch, doc_type=post_type['1'])
        for kind in kind_arr:
            do_for_app(writer, rand=switch, kind=kind, doc_type=post_type)
    writer.commit()
def __init__(self, th):
    BaseSearchEngine.__init__(self, th, False)
    self.__ix_writer = None
    ix_dir = os.path.join(th.prefixes[0], 'index', "ix_" + str(whoosh_ix_ver))
    if not os.path.isdir(ix_dir):
        os.makedirs(ix_dir)
    # try to load a pre-existing index
    try:
        self.indexer = open_dir(ix_dir)
    except (EmptyIndexError, IndexVersionError):
        # create a new one
        try:
            shutil.rmtree(ix_dir, True)
            os.makedirs(ix_dir)
        except OSError:
            pass
        schema = Schema(
            kitab=ID(stored=True),
            vrr=ID(stored=True, unique=False),  # version release
            nodeIdNum=ID(stored=True, unique=False),
            title=TEXT(stored=True, field_boost=1.5, analyzer=analyzer),
            content=TEXT(stored=False, analyzer=analyzer),
            # content=TEXT(stored=False, analyzer=analyzer, vector=Frequency(analyzer=analyzer)),  # with term vector
            tags=IDLIST(stored=False))
        self.indexer = create_in(ix_dir, schema)
    # self.__ix_qparser = ThMultifieldParser(self.th, ("title", "content",), schema=self.indexer.schema)
    self.__ix_qparser = MultifieldSQParser((
        "title",
        "content",
    ), self.indexer.schema)
    self.__ix_qparser.add_plugin(
        FieldAliasPlugin({
            u"kitab": (u"كتاب",),
            u"title": (u"عنوان",),
            u"tags": (u"وسوم",)
        }))
    # self.__ix_pre = whoosh.query.Prefix
    self.__ix_searcher = self.indexer.searcher()
def create_index(postid, condition, file_dict, schema_name, num_pro):
    index_path = 'proteomicsdb/index_dir'
    # make Whoosh index in the directory from scratch if it does not yet exist
    if not os.listdir(index_path):
        ix = index.create_in(index_path, schema_name)
    # else if there's already an index, just open it and add to it
    else:
        ix = index.open_dir(index_path)
    # add elements to the index
    writer = ix.writer()
    for i in range(num_pro):
        writer.add_document(post_id=postid,
                            condition=condition,
                            accession=list(file_dict.values())[0][i],
                            description=list(file_dict.values())[1][i],
                            gene=list(file_dict.values())[2][i],
                            fdr=list(file_dict.values())[3][i],
                            species=list(file_dict.values())[4][i],
                            mw=list(file_dict.values())[5][i],
                            peptides=list(file_dict.values())[6][i],
                            psm=list(file_dict.values())[7][i],
                            uniq_peptides=list(file_dict.values())[8][i],
                            abun_t1=list(file_dict.values())[9][i],
                            abun_t2=list(file_dict.values())[10][i],
                            abun_t3=list(file_dict.values())[11][i],
                            abun_t4=list(file_dict.values())[12][i],
                            abun_t5=list(file_dict.values())[13][i],
                            q_value=list(file_dict.values())[14][i],
                            pep=list(file_dict.values())[15][i])
    # "merge=False" means when adding multiple files' worth of entries,
    # keep each file's entries in a separate segment
    writer.commit(merge=False)
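# A hedged alternative (an assumption, not from the original) for the writer
# step above when several requests may index at once: whoosh.writing's
# AsyncWriter waits for the index lock in a background thread instead of
# raising LockError when another writer already holds it.
def get_writer(ix):
    from whoosh.writing import AsyncWriter
    # drop-in replacement for ix.writer(); add_document()/commit() as usual
    return AsyncWriter(ix)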
def create_index():
    # Create the schema for this index, which denotes the types of each field, and next try to build the index itself
    # using this schema. Note that this schema treats the URL as the unique identifier for documents in the index,
    # and scores documents based on the title and content alone
    index_dir = ".index"

    # Try to create the index directory
    os.mkdir(index_dir)

    # Build a new index in this directory
    index = create_in(index_dir, index_schema)

    # Get a writer for the index
    index_writer = index.writer()

    # Add the main pages to the index
    for main_page in ['about_me', 'research', 'resume']:
        insert_document(index_writer, main_page, 'http://www.jontedesco.net/' + main_page, main_page)

    # Add the blog entries
    blog_posts = list(Post.objects.all())
    for blog_post in blog_posts:
        insert_document(index_writer, blog_post.title, 'http://www.jontedesco.net/blog/' + blog_post.name, blog_post.name)

    # Add the projects
    projects = list(Project.objects.all())
    for project in projects:
        insert_document(index_writer, project.title, 'http://www.jontedesco.net/projects/' + project.name, project.name)

    # Commit all the changes, so that every change is flushed to disk, and we can safely query the index
    index_writer.commit()

    return index, index_schema
def dblpindextest():
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    author=TEXT, content=TEXT)
    cwd = os.path.dirname(os.path.realpath(__file__))
    indexDirPath = os.path.join(cwd, os.path.pardir, "testindexdir")
    dataDirPath = os.path.join(cwd, os.path.pardir, "dblpdata", "dblp-ref")
    if not os.path.exists(indexDirPath):
        os.mkdir(indexDirPath)
    ix = create_in(indexDirPath, schema)
    writer = ix.writer()
    linelens = []
    times = []
    filepath = os.path.join(dataDirPath, "dblp-ref-3.json")
    with open(filepath, "r", encoding='utf-8') as f:
        for line in f:
            jsonline = json.loads(line)
            try:
                t1 = time.time()
                writer.add_document(title=jsonline['title'],
                                    path=jsonline['id'],
                                    content=jsonline['abstract'])
                t2 = time.time()
                if t2 - t1 > 0:
                    times.append(t2 - t1)
                    linelens.append(len(jsonline['abstract']))
                print(len(line), t2 - t1)
            except Exception as e:
                print(str(e))
    plotlens = []
    plottimes = []
    for i in range(0, 50):
        x = randbelow(1000)
        plotlens.append(linelens[x])
        plottimes.append(times[x])
    plt.scatter(plotlens, plottimes)
    plt.show()
def index_search(s):
    """
    Index and then search based on the query string.
    :param s: the cleaned query string, e.g.:
        đăng kí nguyện vọng 1 như thế nào ?#dạ cho em hỏi, em muốn đăng kí nguyện vọng 1 thì như thế nào ạ?
    :return: the list of documents found
    """
    results_search = []
    results_search.append(s)
    client = MongoClient('mongodb://localhost:27017/')  # connect to MongoDB
    db = client.TuyenSinhDB  # connect to the database
    collection = db.WordSegmentation  # connect to the database's collection
    select_table = collection.find({}, {"_id": 0})  # read data
    # build the index
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
    ix = create_in(
        "/home/bichmi/Desktop/KhoaLuanCrawl/KhoaLuan/selenium_TuyenSinh/Data_index",
        schema)
    writer = ix.writer()
    for item in select_table:
        content = item['questions'] + ' ' + item['answers'] + ' ' + item['dates']
        writer.add_document(title=content, path=u"/a", content='')
    writer.commit()
    # search with the query string, parsing it against the title field
    with ix.searcher() as searcher:
        query = QueryParser("title", ix.schema).parse(s)
        results = searcher.search(query)
        # import pdb
        # pdb.set_trace()
        if len(results) <= 0:
            print("Không có kết quả phù hợp với câu hỏi!")
            return 0
        else:
            for hit in results:
                results_search.append(hit['title'])
    return results_search
def create_gutenberg_index_rdf(bz2_rdf_filename, indexdir):
    """Build whoosh index from parsed RDF.

    DB contents are no longer identical to RDF output. Plus the index now
    stores selected db row ids. DEPRECATED"""
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # don't buffer stdout
    print "WARNING: direct use of rdf content may not accurately reflect database contents"
    schema = get_schema()
    whoosh_index = create_in(indexdir, schema)
    writer = whoosh_index.writer()
    for count, record in enumerate(
            gutenberg_rdf_parser.parse_rdf_bz2(bz2_rdf_filename,
                                               GutenbergIndexFilter().filter)):
        # Only index fields from description records. File records can be ignored.
        if record['record_type'] == 'DESCRIPTION':
            if count % 5000 == 0:
                print count,
            subset = {k: record[k] for k in schema.names() if k in record}
            writer.add_document(**subset)
    print "committing...",
    writer.commit()
    print "DONE"
def __init__(self):
    self.scope = 20
    self.terms = set()
    self.index_path = "index"
    self.common_terms = set()
    self.schema = Schema(title=TEXT(stored=True),
                         path=TEXT(stored=True),
                         page=NUMERIC(stored=True),
                         content=TEXT(stored=True))
    self.ix = None
    self.index_files = False
    if not os.path.exists(self.index_path):
        os.mkdir(self.index_path)
        self.ix = create_in(self.index_path, self.schema)
        self.index_files = True
    else:
        self.ix = open_dir(self.index_path)
    self.writer = self.ix.writer()
    self.read()
    self.writer.commit()
    self.searcher = self.ix.searcher()
    self.corrector = ListCorrector(sorted(list(self.common_terms)))
    self.parser = QueryParser("content", self.ix.schema)
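# A small sketch (assumed, not from the original class) of using the
# ListCorrector built above for "did you mean" suggestions; Corrector.suggest()
# is part of whoosh.spelling.
def suggest_terms(self, word, limit=3):
    # returns up to `limit` known common terms within a small edit distance
    return self.corrector.suggest(word, limit=limit)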