Example 1
    def test_resetsearchindexes_command_existing_dir_other_indexes(
            self, getdefaultlocale_mock):
        self.options["interactive"] = False

        os.mkdir(self.new_index_dir)
        index.create_in(self.new_index_dir, fields.Schema(content=fields.TEXT),
                        'other_index')
        self.assertTrue(os.path.exists(self.new_index_dir))

        with self.settings(WIRECLOUD_INDEX_DIR=self.new_index_dir):
            try:
                call_command('resetsearchindexes', **self.options)
            except SystemExit:
                raise CommandError('')

        self.options['stdout'].seek(0)
        self.assertEqual(self.options['stdout'].read(), '')
        self.options['stderr'].seek(0)
        self.assertEqual(self.options['stderr'].read(), '')
        self.assertTrue(os.path.exists(self.new_index_dir))
        self.assertTrue(
            index.exists_in(self.new_index_dir, indexname='other_index'))
        for search_index in get_available_search_engines():
            self.assertTrue(
                index.exists_in(self.new_index_dir,
                                indexname=search_index.indexname))
Example 2
    def __init__(self,
                 indexname=IDX_NAME,
                 index_location=None,
                 repo_location=None,
                 sa=None,
                 repo_list=None,
                 repo_update_list=None):
        self.indexname = indexname

        self.index_location = index_location
        if not index_location:
            raise Exception('You have to provide index location')

        self.repo_location = repo_location
        if not repo_location:
            raise Exception('You have to provide repositories location')

        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

        #filter repo list
        if repo_list:
            #Fix non-ascii repo names to unicode
            repo_list = map(safe_unicode, repo_list)
            self.filtered_repo_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_list:
                    self.filtered_repo_paths[repo_name] = repo

            self.repo_paths = self.filtered_repo_paths

        #filter update repo list
        self.filtered_repo_update_paths = {}
        if repo_update_list:
            self.filtered_repo_update_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_update_list:
                    self.filtered_repo_update_paths[repo_name] = repo
            self.repo_paths = self.filtered_repo_update_paths

        self.initial = True
        if not os.path.isdir(self.index_location):
            os.makedirs(self.index_location)
            log.info('Cannot run incremental index since it does not '
                     'yet exist; running full build')
        elif not exists_in(self.index_location, IDX_NAME):
            log.info('Running full index build as the file content '
                     'index does not exist')
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
            log.info('Running full index build as the changeset '
                     'index does not exist')
        else:
            self.initial = False
Example 3
    def test_resetsearchindexes_command_individual_index(self, getdefaultlocale_mock):
        self.options['indexes'] = 'user'

        with self.settings(WIRECLOUD_INDEX_DIR=self.new_index_dir):
            call_command('resetsearchindexes', **self.options)

        self.options['stdout'].seek(0)
        self.options['stderr'].seek(0)

        for search_index in get_available_search_engines():
            if search_index.indexname != 'user':
                self.assertFalse(index.exists_in(self.new_index_dir, indexname=search_index.indexname))

        self.assertTrue(index.exists_in(self.new_index_dir, indexname='user'))
Example 4
    def __init__(self, indexname=IDX_NAME, index_location=None,
                 repo_location=None, sa=None, repo_list=None,
                 repo_update_list=None):
        self.indexname = indexname

        self.index_location = index_location
        if not index_location:
            raise Exception('You have to provide index location')

        self.repo_location = repo_location
        if not repo_location:
            raise Exception('You have to provide repositories location')

        self.repo_paths = ScmModel(sa).repo_scan(self.repo_location)

        #filter repo list
        if repo_list:
            #Fix non-ascii repo names to unicode
            repo_list = map(safe_unicode, repo_list)
            self.filtered_repo_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_list:
                    self.filtered_repo_paths[repo_name] = repo

            self.repo_paths = self.filtered_repo_paths

        #filter update repo list
        self.filtered_repo_update_paths = {}
        if repo_update_list:
            self.filtered_repo_update_paths = {}
            for repo_name, repo in self.repo_paths.items():
                if repo_name in repo_update_list:
                    self.filtered_repo_update_paths[repo_name] = repo
            self.repo_paths = self.filtered_repo_update_paths

        self.initial = True
        if not os.path.isdir(self.index_location):
            os.makedirs(self.index_location)
            log.info('Cannot run incremental index since it does not '
                     'yet exist; running full build')
        elif not exists_in(self.index_location, IDX_NAME):
            log.info('Running full index build as the file content '
                     'index does not exist')
        elif not exists_in(self.index_location, CHGSET_IDX_NAME):
            log.info('Running full index build as the changeset '
                     'index does not exist')
        else:
            self.initial = False
Example 5
def get_indices():
    if not os.path.exists(INDEX_DIR):
        os.mkdir(INDEX_DIR)
    if index.exists_in(INDEX_DIR):
        return index.open_dir(INDEX_DIR)
    else:
        return full_index()
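
A minimal, self-contained sketch of the open-or-create idiom that Example 5 (and many of the examples below) rely on. The directory name and the one-field schema are placeholders chosen only for illustration; the whoosh calls themselves (exists_in, open_dir, create_in) are the real API.

import os

from whoosh import fields, index

def open_or_create(index_dir, schema):
    # exists_in() only reports whether index files are present in the
    # directory; it does not create the directory, and create_in() expects
    # it to exist, so make sure it is there first.
    os.makedirs(index_dir, exist_ok=True)
    if index.exists_in(index_dir):
        return index.open_dir(index_dir)
    return index.create_in(index_dir, schema)

ix = open_or_create("example_index_dir", fields.Schema(content=fields.TEXT(stored=True)))
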
Example 6
def search_files(index_dir, content):
    """
	search file content in index 
	if not hit: return False
	if hit: return results
	"""
    index_exist = index.exists_in(index_dir)
    if not index_exist:
        print ("index not exist")
        return False
    ix = index.open_dir(index_dir)
    content = unicode(content)
    with ix.searcher() as searcher:
        parser = QueryParser("content", ix.schema)
        query = parser.parse(content)
        # whoosh.searching.Results
        results = searcher.search(query)
        print (type(results))
        l = len(results)
        print (l)
        for h in results:
            # whoosh.searching.Hit
            print (type(h))
            print (h)
        return results
Example 7
 def run(self):
     # open index
     self.buffer = deque(maxlen=BUFFERLINES)
     if not exists(self.indexdir):
         makedirs(self.indexdir)
         self.ix = create_in(self.indexdir, SCHEMA)
     else:
         if exists_in(self.indexdir): self.ix = open_dir(self.indexdir)
         else: self.ix = create_in(self.indexdir, SCHEMA)
     self.qp = QueryParser("content", self.ix.schema)
     self.searcher = self.ix.searcher()
     index_p = self.index_p
     while True:
         try:
             # check index_p
             try:
                 type, data = index_p.recv()
             except EOFError:
                 break
             try:
                 if type == QUERY: self._processSearch(data)
                 elif type == LOG: self._processLog(data)
                 elif type == RENAME: self._processRename(data)
                 else:
                     prnt("Unexpected data in logindexsearch.")
             except:
                 print_exc()
                 prnt("EXCEPTION in logindexsearch process.")
         except KeyboardInterrupt:
             break
     self._dumpBuffer(self.buffer)
     self.searcher.close()
     self.ix.close()
Example 8
def get_index(index_dir, schema=doc_schema):
    lib.ensure_dir(index_dir)
    if index.exists_in(index_dir):
        ix = index.open_dir(index_dir)
    else:
        ix = index.create_in(index_dir, schema)
    return ix
Example 9
def get_whoosh_index(force_create=False):
  from whoosh.index import create_in, exists_in, open_dir
  from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
  from whoosh.analysis import CharsetFilter, StemmingAnalyzer, NgramWordAnalyzer
  from whoosh.support.charset import accent_map

  analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
  ngramAnalyzer = NgramWordAnalyzer( minsize=2, maxsize=4)

  schema = Schema(
    title     = TEXT(analyzer=analyzer, spelling=True, stored=True, field_boost=3.0), 
    abstract  = TEXT(analyzer=analyzer, stored=True, field_boost=2.0), 
    path      = ID(unique=True, stored=True), 
    authors   = TEXT(analyzer=analyzer, sortable=True, field_boost=1.5), 
    content   = TEXT(analyzer=analyzer, stored=True), 
    tags      = KEYWORD(sortable=True, commas=True, field_boost=1.5, lowercase=True), 
    status    = KEYWORD,
    classname = KEYWORD,
    typeahead = TEXT(spelling=True, stored=True, phrase=False)
  )
    
  if not os.path.exists(settings.WHOOSH_ROOT):
    os.mkdir(settings.WHOOSH_ROOT)
  
  if not exists_in(settings.WHOOSH_ROOT) or force_create:
    index = create_in(settings.WHOOSH_ROOT, schema)
  else:
    index = open_dir(settings.WHOOSH_ROOT)
  return index
Example 10
def app():
    # indexdir = "indexdir"
    indexdir = r'D:\files\whoosh_code_data\whoosh_base\index_files'
    storage = FileStorage(indexdir)

    fname = storage.list()  # ['dinosaur.db_loh0qsax01wwdijy.seg', 'dinosaur.db_WRITELOCK', 'mmorpg.db_1mwe4pojwea459cm.seg', 'mmorpg.db_WRITELOCK',
    # print(fname)
    indices = []  # [FileIndex(FileStorage('indexdir'), 'dinosaur.db'), FileIndex(FileStorage('indexdir'), 'mmorpg.db'), FileIndex(FileStorage('indexdir'), 'superfamicom.db')]
    n = 0
    for f in fname:
        if not f.endswith(".seg"):
            continue
        print(f)
        ind = f.split('_')[0]
        # print(ind)

        if exists_in(indexdir, indexname=ind):
            indices.append(open_dir(indexdir, ind))
        n += 1
        if n == 1:
            break

    # print(indices[0])
    # indices = [indices[0]]
    search(indices)
Example 11
    def __init__(self):
        """
        Init Instance
        """
        super(Indexer, self).__init__()

        # Indexer configuration - index dir and schema setup
        self.baseindexpath = join(os.environ['AIL_HOME'],
                                  self.process.config.get("Indexer", "path"))
        self.indexRegister_path = join(
            os.environ['AIL_HOME'],
            self.process.config.get("Indexer", "register"))
        self.indexertype = self.process.config.get("Indexer", "type")
        self.INDEX_SIZE_THRESHOLD = self.process.config.getint(
            "Indexer", "index_max_size")

        self.indexname = None
        self.schema = None
        self.ix = None

        if self.indexertype == "whoosh":
            self.schema = Schema(title=TEXT(stored=True),
                                 path=ID(stored=True, unique=True),
                                 content=TEXT)
            if not os.path.exists(self.baseindexpath):
                os.mkdir(self.baseindexpath)

            # create the index register if not present
            time_now = int(time.time())
            if not os.path.isfile(
                    self.indexRegister_path):  # indexes are not organised
                self.redis_logger.debug("Indexes are not organized")
                self.redis_logger.debug(
                    "moving all files in folder 'old_index' ")
                # move all files to old_index folder
                self.move_index_into_old_index_folder()
                self.redis_logger.debug("Creating new index")
                # create all_index.txt
                with open(self.indexRegister_path, 'w') as f:
                    f.write(str(time_now))
                # create dir
                os.mkdir(join(self.baseindexpath, str(time_now)))

            with open(self.indexRegister_path, "r") as f:
                allIndex = f.read()
                allIndex = allIndex.split()  # format [time1\ntime2]
                allIndex.sort()

                try:
                    self.indexname = allIndex[-1].strip('\n\r')
                except IndexError as e:
                    self.indexname = time_now

                self.indexpath = join(self.baseindexpath, str(self.indexname))
                if not exists_in(self.indexpath):
                    self.ix = create_in(self.indexpath, self.schema)
                else:
                    self.ix = open_dir(self.indexpath)

            self.last_refresh = time_now
Example 12
    def open_index(self, index_folder, create_new=False):
        self.index_folder = index_folder
        if create_new:
            if os.path.exists(index_folder):
                shutil.rmtree(index_folder)
                print "deleted index folder: " + index_folder

        if not os.path.exists(index_folder):
            os.mkdir(index_folder)

        exists = index.exists_in(index_folder)
        stemming_analyzer = StemmingAnalyzer()

        schema = Schema(
            path=ID(stored=True, unique=True)
            , filename=TEXT(stored=True, field_boost=100.0)
            , tags=KEYWORD(stored=True, scorable=True, field_boost=80.0)
            , headlines=KEYWORD(stored=True, scorable=True, field_boost=60.0)
            , doubleemphasiswords=KEYWORD(stored=True, scorable=True, field_boost=40.0)
            , emphasiswords=KEYWORD(stored=True, scorable=True, field_boost=20.0)
            , content=TEXT(stored=True, analyzer=stemming_analyzer)
            , time=STORED
        )
        if not exists:
            self.ix = index.create_in(index_folder, schema)
        else:
            self.ix = index.open_dir(index_folder)
Example 13
def main(index_base_path,vendor_code,index_type,data_file_type,data_file_path,data_file_list):
	if cidx.create_whoosh_idx(os.path.join(index_base_path,vendor_code),index_type):
		print "sucess index creation at -->: ",os.path.join(index_base_path,vendor_code,index_type)
	else:
		print "failed index creation at -->: ", os.path.join(index_base_path, vendor_code, index_type)
		quit()


	if index.exists_in(os.path.join(index_base_path,vendor_code,index_type)):
		ix = index.open_dir(os.path.join(index_base_path,vendor_code,index_type))

		for file in data_file_list:
			print "indexing file : ", file
			idx_writer = ix.writer()
			data_reader = dfi.DataFileIterator(data_file_type,data_file_path, file)
			for iRecord in data_reader:
				idx_writer.add_document(isin=unicode(iRecord.get('ID_ISIN', None), "utf-8"),
										sedol=unicode(iRecord.get('ID_SEDOL1', None), "utf-8"),
										cusip=unicode(iRecord.get('ID_CUSIP', None), "utf-8"),
										country_issue_iso=unicode(iRecord.get('CNTRY_ISSUE_ISO', None), "utf-8"),
										corp_ticker=unicode(iRecord.get('EQY_PRIM_SECURITY_TICKER', None), "utf-8"),
										exch_code=unicode(iRecord.get('EXCH_CODE', None), "utf-8"),
										currency=unicode(iRecord.get('CRNCY', None), "utf-8"),
										raw_data=iRecord)
			idx_writer.commit()

	else:
		print "failed to open index at -->: ", os.path.join(index_base_path, vendor_code, index_type)
		quit()
	quit()
Example 14
    def __init__(self, index_dir: Path, from_scratch: bool = False):
        index_name = 'index'
        if not Path(index_dir).exists():
            Path(index_dir).mkdir()

        def _clear():
            import shutil
            shutil.rmtree(index_dir)
            index_dir.mkdir()
            self.ix = index.create_in(index_dir, IndexMsg.schema, index_name)

        if from_scratch:
            _clear()

        self.ix = index.open_dir(index_dir, index_name) \
            if index.exists_in(index_dir, index_name) \
            else index.create_in(index_dir, IndexMsg.schema, index_name)

        assert repr(self.ix.schema.names) == repr(IndexMsg.schema.names), \
            f"Incompatible schema in your index '{index_dir}'\n" \
            f"\tExpected: {IndexMsg.schema}\n" \
            f"\tOn disk:  {self.ix.schema}"

        self._clear = _clear  # use a closure to avoid introducing too many members
        self.query_parser = QueryParser('content', IndexMsg.schema)
        self.highlighter = highlight.Highlighter()
Example 15
 def _init_index(self, reset=False):
     index_path = os.path.join(jupyter_data_dir(), 'index')
     
     # clear out old index if requested
     if reset:
         shutil.rmtree(index_path, True)
     
     # make sure there's a path to store the index data
     if not os.path.exists(index_path):
         os.makedirs(index_path)
         
     if not exists_in(index_path):
         # create an index with the current schema
         schema = Schema(basename=TEXT(stored=True, field_boost=5.0), 
                         dirname=ID(stored=True),
                         path=ID(stored=True, unique=True), 
                         content=TEXT(stored=False), 
                         time=STORED)
         self.ix = create_in(index_path, schema)
     else:
         # open the existing index
         self.ix = open_dir(index_path)
         
     # build a query parser based on the current schema
     self.query_parser = MultifieldParser(["content", "basename", "dirname"], self.ix.schema)
Example 16
def main():
    try:
        os.mkdir(index_dir)
    except OSError:
        print '%s already exists' % index_dir
    if exists_in(index_dir):
        choise = raw_input(
            'Previous Index Found\nOptions:\n1.Create new Index\n2.Incremental Indexing\nEnter your option:'
        )
        if choise == '1':
            index_my_docs(index_dir, True)
        elif choise == '2':
            index_my_docs(index_dir)
            ch = raw_input('Do you want to optimize the index?(y/n):')
            if ch == 'y':
                print 'Optimizing.Please wait...'
                optimize_index()
                print 'Optimizing Completed'
        else:
            print 'Wrong Option.Exiting....'
            sys.exit(0)
    else:
        print 'No previous index found. Creating new....'
        index_my_docs(index_dir, True)
    print 'Indexing Completed!'
Example 17
    def indexloc(self):
        from sidr.orm import db
        import json
        from whoosh.index import create_in, open_dir, exists_in
        from whoosh import fields, qparser, query
        schema = fields.Schema(gid=fields.TEXT(stored=True),
                               country_code=fields.ID(stored=True),
                               names=fields.NGRAMWORDS(stored=True,
                                                       minsize=3,
                                                       maxsize=15))
        if not exists_in("indexer", indexname="adms"):
            ix = create_in("indexer", schema, indexname="adms")
        ix = open_dir("indexer", indexname="adms")
        writer = ix.writer()
        """
        with ix.searcher() as s:
            qp = qparser.QueryParser("names", schema=ix.schema)
            q = qp.parse(u"Westonia")
            # results = s.search(q, limit=20, filter=query.Term("country_code", "AU"))
            results = s.documents()
            # results = searcher.search('hey', terms=True)
            # qp = qparser.QueryParser("content", ix.schema)
            # results = searcher.search(user_q)
            for res in results:
                print(repr(res))
        """

        rows = db.engine.execute('SELECT * FROM geoname')
        for row in rows:
            writer.add_document(
                gid=str(row['id']),
                country_code=row['country_code'],
                names="%s , %s , %s" %
                (row['name'], row['asciiname'], row['name_alternate']))
        writer.commit()
Example 18
def get_or_create_index(path, schema, src):
    """Get or create an Index."""
    index = open_dir(path) if exists_in(path) else create_in(path, schema)
    indexed_titles = set(field['title'] for field in gen_indexed_fields(index))
    documents = set(gen_documents(src))
    update_index(index.writer(), indexed_titles, documents)
    return index
Example 19
    def __init__(self, modref):
        ''' inits the plugin
        '''
        self.modref = modref
        super().__init__(modref.message_handler, self)
        self.providers = set()
        self.movies = {}
        self.lock = Lock()

        self.runFlag = True
        # init the search engine
        self.whoosh_schema = Schema(source=KEYWORD(stored=True),
                                    provider=KEYWORD(stored=True),
                                    title=TEXT(stored=True),
                                    category=TEXT(stored=True),
                                    uri=ID(stored=True, unique=True),
                                    url=STORED,
                                    mime=STORED,
                                    duration=STORED,
                                    source_type=STORED,
                                    description=STORED,
                                    timestamp=DATETIME(stored=True))
        self.index_dir = DirectoryMapper.abspath(self.get_plugin_id(),
                                                 'runtime', 'indexdir', True)
        if not os.path.exists(self.index_dir):
            os.mkdir(self.index_dir)
        if index.exists_in(self.index_dir):
            self.whoosh_ix = index.open_dir(self.index_dir)
        else:
            self.reset_index()  # creates a new index
Example 20
    def get(self,
            name='__indexdir',
            dump=config.DUMP_FOLDER,
            destructive=False):
        index_path = config.ROOT.joinpath(name)
        path = str(index_path)

        if destructive and index_path.exists():
            shutil.rmtree(path)
            while index_path.exists():
                pass

        if destructive or (not index_path.exists()
                           or not index.exists_in(path)):
            try:
                index_path.mkdir()
                self.index = index.create_in(path, WikiSchema())
                logging.info('Index newly created, adding documents')
                self.build(directory=dump)
            except (FileExistsError, FileNotFoundError) as e:
                logger.error('Index already exist or parent not found')
                sys.exit(0)
        self.index = index.open_dir(path)
        print(' * Bootstrap index reader')
        self.reader = self.index.reader()

        return self
Example 21
def index_search(group, sheet_name, wiki_key):
    sha = permissions_sha(sheet_name, wiki_key, group)
    dir = os.path.join(app.config['SPREADSHEET_FOLDER'], sheet_name, "indices", sha)

    if(index.exists_in(dir)):
        print("Index already exists for " + sheet_name + " / " + wiki_key + " / " + group + " (or comparable)")
        ix = index.open_dir(dir)
        indices[sha] = ix
        return

    try:
        os.mkdir(dir)
    except FileExistsError:
        pass

    print("Reindexing for " + sheet_name + " / " + group)
    schema = Schema(key=ID(stored=True, unique=True), content=TEXT)
    ix = create_in(dir, schema)
    writer = ix.writer()
    for o in cull_invalid_objects(group, sheet_name, wiki_key):
        writer.add_document(
                key=o[sheet_config[sheet_name]["key_column"]],
                content=" ".join([str(c) for c in cull_invalid_columns(o, permissions[sheet_name][wiki_key][group]["columns"]).values()])
                )
    writer.commit()

    indices[sha] = ix

    return ""
Example 22
def incremental_index(doc_dir):
    """ Update index based on document last update time """
    if not index.exists_in(get_index_dir()):
        clean_index(doc_dir)
        return

    ix = index.open_dir(get_index_dir())

    indexed_paths = set()   # The set of all paths in the index
    to_index = set()        # The set of all paths we need to re-index
    writer = ix.writer()

    with ix.searcher() as searcher:
        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_path = fields['path']
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed --> Delete index
                writer.delete_by_term('path', indexed_path)
            else:
                # Check if this file was changed since it was indexed
                indexed_time = fields['time']
                modify_time = os.path.getmtime(indexed_path)
                if modify_time > indexed_time:
                    # The file has changed, delete it and add it to the list of files to reindex
                    writer.delete_by_term('path', indexed_path)
                    to_index.add(indexed_path)

    for filename in get_document_names(doc_dir):
        path = os.path.join(doc_dir, filename)
        if path in to_index or path not in indexed_paths:
            add_doc(writer, filename)
    writer.commit(optimize=True)
Example 23
 def _get_index(index_path, schema):
     if index.exists_in(index_path):
             return index.open_dir(index_path)
     else:
         if not os.path.exists(index_path):
             os.mkdir(index_path)
         return index.create_in(index_path, schema)
Example 24
    def createIndex(self, file, directory):
        if os.path.exists(directory) and not exists_in(directory):
            print('Directory already exists and does not contain any index, deleting and creating new index...\n')
            shutil.rmtree(directory)
            os.mkdir(directory)

        if not os.path.exists(directory):
            os.mkdir(directory)

        if exists_in(directory):
            print('overwriting current index...\n')

        self.directory = directory
        self.ix = create_in(directory, self.schema)
        self.writer = self.ix.writer()
        self.writeToIndex(file)
Example 25
def get_index(config):
    """
    Return the current index object if there is one.
    If not, attempt to open the index in wsearch.indexdir.
    If there isn't one in the dir, create one. If there is
    no dir, create the dir.
    """
    index_dir = config.get('wsearch.indexdir',
            SEARCH_DEFAULTS['wsearch.indexdir'])
    if not os.path.isabs(index_dir):
        index_dir = os.path.join(config.get('root_dir', ''), index_dir)

    if exists_in(index_dir):
        # For now don't trap exceptions, as we don't know what they
        # will be and so we want them to raise destructively.
        index = open_dir(index_dir)
    else:
        try:
            os.mkdir(index_dir)
        except OSError:
            pass
        schema = config.get('wsearch.schema',
                SEARCH_DEFAULTS['wsearch.schema'])
        index = create_in(index_dir, Schema(**schema))
    return index
Example 26
 def create_or_open_index(self):
     if index.exists_in(self.index_dir):
         self.ix = index.open_dir(self.index_dir)
     else:
         if not os.path.exists(self.index_dir):
             os.mkdir(self.index_dir)
         self.ix = create_in(self.index_dir, self.schema)
Example 27
 def ix(self, name):
     schema = getattr(self, '%s_schema' % name)
     if not exists_in(self.index_path, indexname=name):
         return create_in(self.index_path, schema, indexname=name)
     ix = open_dir(self.index_path, indexname=name)
     update_schema(ix, schema)
     return ix
Example 28
def createIndex():
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    if not index.exists_in(index_dir):
        schema = Schema(title=TEXT(stored=True), body=TEXT(stored=True), link=TEXT(stored=True))
        ix = create_in(index_dir, schema)
    else:
        ix = index.open_dir(index_dir)

    writer = ix.writer()
    for feed in source_dn_all.entries:
        description = feed.summary.split("<img")
        title = feed.title.encode('utf-8')
        if checkIfDocExists(title, 'DN') is False:
            with open('dn_news.txt', 'a') as news_file:
                news_file.write(feed.title.encode('utf-8')+' \n')
            writer.add_document(title=feed.title, body=description[0], link=feed['feedburner_origlink'])

    for feed in source_jn_all.entries:
        description = feed.summary.split("<img")
        title = feed.title.encode('utf-8')
        if checkIfDocExists(title, 'JN') is False:
            with open('jn_news.txt', 'a') as news_file:
                news_file.write(feed.title.encode('utf-8')+' \n')
            writer.add_document(title=feed.title, body=description[0], link=feed['feedburner_origlink'])
    writer.commit()
Example 29
def get_index(index, indexname="ARTIFACTS", schema=None):
    """Open or create a whoosh index.

    Opens a whoosh index with the specified name and schema.
    If there is no index with the specified name, a new index is created.

    Parameters
    ----------
    index : str
        The path of the index storage directory.
    indexname : str, optional
        The name of the index within the storage directory.
    schema : whoosh.fields.Schema
        The schema to use for the index.

    Returns
    -------
    libcflib.index.NestedIndex
        A whoosh index with the specified name and schema.
    """
    storage = FileStorage(index)
    if not os.path.exists(index):
        os.mkdir(index)
    if exists_in(index, indexname):
        return NestedIndex(storage, schema=schema, indexname=indexname)
    else:
        return NestedIndex.create(storage, schema, indexname)
Example 30
    def open_index(self, index_folder, create_new=False):
        self.index_folder = index_folder
        if create_new:
            if os.path.exists(index_folder):
                shutil.rmtree(index_folder)
                print "deleted index folder: " + index_folder

        if not os.path.exists(index_folder):
            os.mkdir(index_folder)

        exists = index.exists_in(index_folder)
        stemming_analyzer = StemmingAnalyzer()

        schema = Schema(path=ID(stored=True, unique=True),
                        filename=TEXT(stored=True, field_boost=100.0),
                        tags=KEYWORD(stored=True,
                                     scorable=True,
                                     field_boost=80.0),
                        headlines=KEYWORD(stored=True,
                                          scorable=True,
                                          field_boost=60.0),
                        doubleemphasiswords=KEYWORD(stored=True,
                                                    scorable=True,
                                                    field_boost=40.0),
                        emphasiswords=KEYWORD(stored=True,
                                              scorable=True,
                                              field_boost=20.0),
                        content=TEXT(stored=True, analyzer=stemming_analyzer),
                        time=STORED)
        if not exists:
            self.ix = index.create_in(index_folder, schema)
        else:
            self.ix = index.open_dir(index_folder)
Example 31
	def run(self):
		# open index
		self.buffer = deque(maxlen=BUFFERLINES)
		if not exists(self.indexdir):
			makedirs(self.indexdir)
			self.ix = create_in(self.indexdir, SCHEMA)
		else:
			if exists_in(self.indexdir): self.ix = open_dir(self.indexdir)
			else: self.ix = create_in(self.indexdir, SCHEMA)
		self.qp = QueryParser("content", self.ix.schema)
		self.searcher = self.ix.searcher()
		index_p = self.index_p
		while True:
			try:
				# check index_p
				try:
					type, data = index_p.recv()
				except EOFError: break
				try:
					if type == QUERY: self._processSearch(data)
					elif type == LOG: self._processLog(data)
					elif type == RENAME: self._processRename(data)
					else:
						prnt("Unexpected data in logindexsearch.")
				except:
					print_exc()
					prnt("EXCEPTION in logindexsearch process.")
			except KeyboardInterrupt:
				break
		self._dumpBuffer(self.buffer)
		self.searcher.close()
		self.ix.close()	
Example 32
 def addTermNarrower(self, tagSubjectList, termNarrower):
     if not index.exists_in(utils.indexerDir(), utils.indexName):
         self.createNewIndex()
     for tagSubject in tagSubjectList:
         self.__writer.add_document(tagSubject=unicode(tagSubject),
                                    termNarrower=unicode(termNarrower))
     self.addToWordList(termNarrower)
Example 33
def search_documents(filter):
    results = None

    # Check for existing index
    dir_path = os.path.join(DATA_DIR, 'index')

    if not os.path.exists(dir_path) or not Index.exists_in(dir_path):
        return None

    index = Index.open_dir(dir_path)

    if filter.startswith('tags:'):
        fields = ['tags']
        filter = filter[5:]
    else:
        fields = ['path', 'content']

    parser = MultifieldParser(fields, schema=index.schema)
    search_query = parser.parse(unicode(filter))

    # Try documents search
    try:
        searcher = index.searcher(closereader=False)

        return searcher.search(search_query,
            collapse=[sorting.FieldFacet('path'), sorting.FieldFacet('content')],
            collapse_order=sorting.FieldFacet('revision', reverse=True),
            sortedby=[sorting.FieldFacet('path'), sorting.FieldFacet('date', reverse=True)]
        )
    finally:
        searcher.close()

    return results
Example 34
def createIndex():
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)

    if not index.exists_in(index_dir):
        schema = Schema(title=TEXT(stored=True),
                        body=TEXT(stored=True),
                        link=TEXT(stored=True))
        ix = create_in(index_dir, schema)
    else:
        ix = index.open_dir(index_dir)

    writer = ix.writer()
    for feed in source_dn_all.entries:
        description = feed.summary.split("<img")
        title = feed.title.encode('utf-8')
        if checkIfDocExists(title, 'DN') is False:
            with open('dn_news.txt', 'a') as news_file:
                news_file.write(feed.title.encode('utf-8') + ' \n')
            writer.add_document(title=feed.title,
                                body=description[0],
                                link=feed['feedburner_origlink'])

    for feed in source_jn_all.entries:
        description = feed.summary.split("<img")
        title = feed.title.encode('utf-8')
        if checkIfDocExists(title, 'JN') is False:
            with open('jn_news.txt', 'a') as news_file:
                news_file.write(feed.title.encode('utf-8') + ' \n')
            writer.add_document(title=feed.title,
                                body=description[0],
                                link=feed['feedburner_origlink'])
    writer.commit()
Example 35
    def _init_index(self, reset=False):
        index_path = os.path.join(jupyter_data_dir(), "index")

        # clear out old index if requested
        if reset:
            shutil.rmtree(index_path, True)

        # make sure there's a path to store the index data
        if not os.path.exists(index_path):
            os.makedirs(index_path)

        if not exists_in(index_path):
            # create an index with the current schema
            analyzer = ChineseAnalyzer()
            schema = Schema(
                basename=TEXT(stored=True, field_boost=5.0, analyzer=analyzer),
                dirname=ID(stored=True, analyzer=analyzer),
                path=ID(stored=True, unique=True, analyzer=analyzer),
                content=TEXT(stored=False, analyzer=analyzer),
                time=STORED,
            )
            self.ix = create_in(index_path, schema)
        else:
            # open the existing index
            self.ix = open_dir(index_path)

        # build a query parser based on the current schema
        self.query_parser = MultifieldParser(["content", "basename", "dirname"], self.ix.schema)
Example 36
    def __init__(self):
        self.directory = os.path.join(edocuments.root_folder, '.index')
        self.dirty = False
        schema = Schema(**{
            PATH: ID(stored=True, unique=True),
            CONTENT: TEXT(stored=True),
            DATE: STORED,
            DIRECTORY: STORED,
            MD5: TEXT(stored=True),
        })
        self.parser_path = QueryParser("path_id", schema)
        self.parser_content = QueryParser("content", schema)

        if not exists_in(self.directory):
            os.makedirs(self.directory)
            self.index = create_in(self.directory, schema)
        else:
            self.index = open_dir(self.directory)
            if 'path' in self.index.schema.names():
                with self.index.writer() as writer:
                    writer.remove_field('path')
            if 'directory' not in self.index.schema.names():
                with self.index.writer() as writer:
                    writer.add_field('directory', STORED)
            if 'md5' not in self.index.schema.names():
                with self.index.writer() as writer:
                    writer.add_field('md5', TEXT(stored=True))
            print(
                'Field length:\npath: %i\ncontent: %i\nmd5: %i' % (
                    self.index.field_length("path_id"),
                    self.index.field_length("content"),
                    self.index.field_length("md5"),
                )
            )
Example 37
 def __init__(self, db_path):
     ensuredir(db_path)
     if index.exists_in(db_path):
         self.index = index.open_dir(db_path)
     else:
         self.index = index.create_in(db_path, schema=self.schema)
     self.qparser = QueryParser('text', self.schema)
Example 38
 def __init__(self, db_path):
     ensuredir(db_path)
     if index.exists_in(db_path):
         self.index = index.open_dir(db_path)
     else:
         self.index = index.create_in(db_path, schema=self.schema)
     self.qparser = QueryParser('text', self.schema)
Example 39
 def init(self):
     ix_path = os.path.join(self.path, self.name)
     if whoosh_index.exists_in(ix_path):
         return whoosh_index.open_dir(ix_path)
     if not os.path.exists(ix_path):
         os.makedirs(ix_path)
     return whoosh_index.create_in(ix_path, self.schema)
Example 40
    def __init__(self, pickle_path='index', index_name='telegram_searcher', from_scratch=False):
        analyzer = ChineseAnalyzer()
        schema = Schema(
            content=TEXT(stored=True, analyzer=analyzer),
            url=ID(stored=True, unique=True),
            chat_id=STORED(),
            post_time=DATETIME(stored=True),
        )

        if not Path(pickle_path).exists():
            Path(pickle_path).mkdir()

        def _clear():
            pattern = re.compile(f'^_?{index_name}.*')
            for file in Path(pickle_path).iterdir():
                if pattern.match(file.name):
                    os.remove(str(file))
            self.ix = create_in(pickle_path, schema, index_name)

        if from_scratch:
            _clear()

        self.ix = open_dir(pickle_path, index_name) \
            if exists_in(pickle_path, index_name) \
            else create_in(pickle_path, schema, index_name)

        self._clear = _clear  # use a closure to avoid introducing too many members
        self.query_parser = QueryParser('content', schema)
        self.highlighter = highlight.Highlighter()
Example 41
def index_exists(dirname=INDEXDIR, indexname=INDEXNAME):
    """
    index_exists([dirname="index", indexname="MAIN"])
    
    Verifica se o índice :attr:`indexname` existe no diretório :attr:`dirname`.
    
    .. code-block:: python
    
        from storyline.engine.index import index_exists
        
        # Exemplo em que existe o diretório index com índice MAIN.
        >>> index_exists() 
        True
        >>> index_exists("index")
        True
        >>> index_exists("index", "indexname")
        False
        
    :param dirname: Nome do diretório do índice.
    :type dirname: str
    :param indexname: Nome do índice.
    :tyoe indexname: str
    :returns: True ou False.
    """
    return index.exists_in(dirname, indexname.upper())
Example 42
    def index(self, locales, init=False, **options):
        """Create index records for all dimensions in the cube"""
        # FIXME: this works only for one locale - specified in browser

        if init:
            self.initialize()

        if not index.exists_in(self.path):
            raise Exception("Index is not initialized in '%s'" % self.path)

        ix = index.open_dir(self.path)

        self.writer = ix.writer()
        # for dimension in self.cube.dimensions:
        options = options or {}
        cube = self.browser.cube

        for locale_tag, locale in enumerate(locales):
            for dim_tag, dimension in enumerate(cube.dimensions):
                self.index_dimension(dimension,
                                     dim_tag,
                                     locale=locale,
                                     locale_tag=locale_tag,
                                     **options)
        self.writer.commit()
Example 43
    def __init__(self,
                 in_folder='testing_index',
                 bool_only=False,
                 from_file=None,
                 with_dict=None,
                 with_index=None,
                 custom_schema=None):
        ''' 
        implement optional stuff
        '''
        self.indexdir = in_folder
        '''if custom_schema:
            self._schema = custom_schema'''

        self.bool_only = bool_only

        self._schema = Schema(keystring=TEXT(stored=not self.bool_only),
                              valuestring=TEXT(stored=not self.bool_only))

        if not index.exists_in(self.indexdir):
            #logging.info("index does not exist in indexdir, will create")
            if not os.path.exists(self.indexdir):
                #logging.info("indexdir does not exist, will create")
                os.mkdir(self.indexdir)
                #logging.info("created indexdir")
            self.ix = index.create_in(self.indexdir, self._schema)
            #logging.info("created index in indexdir")
        else:
            self.ix = index.open_dir(self.indexdir)
            #logging.info("found and opened existing index in indexdir")

        if os.path.exists(self.indexdir + '/ixIsTruthy'):
            ixinfofile = open(self.indexdir + '/ixIsTruthy', 'rb')
            ixinfo = ixinfofile.readline()
            if ixinfo == '1':
                ixinfo = True
            elif ixinfo == '0':
                ixinfo = False
            if ixinfo != self.bool_only:
                raise Exception(
                    'cannot open existing index in a different bool_only mode. change it or add to argument and set as true'
                )
            ixinfofile.close()
        else:
            ixinfofile = open(self.indexdir + '/ixIsTruthy', 'wb')
            if self.bool_only:
                ixinfofile.write('1')
            else:
                ixinfofile.write('0')
            ixinfofile.close()

        self.writer = self.ix.writer()

        if with_dict:
            for key in with_dict:
                self.writer.add_document(keystring=unicode(key),
                                         valuestring=unicode(with_dict[key]))

        self.writer.commit()
Example 44
def get_location_index():
    if(index.exists_in(whoosh_index_path, indexname="location_index")):
        loc_ix = index.open_dir(whoosh_index_path, indexname="location_index")
    else:
        loc_ix = index.create_in(whoosh_index_path, schema=LocationSchema(), 
            indexname="location_index")
        fill_location_index(loc_ix)
    return loc_ix
Example 45
 def _get_index(self):
     index_directory = "%s/%s" % (current_app.config.get("WHOOSH_BASE"), self.__class__.__name__)
     if not Path(index_directory).exists():
         Path(index_directory).mkdir()
     model_index = None
     if not index.exists_in(index_directory):
         return create_in(index_directory, self._get_schema())
     return index.open_dir(index_directory)
Example 46
def get_or_create_index(path, schema, src):
    """Get or create an Index."""
    index = open_dir(path) if exists_in(path) else create_in(path, schema)
    indexed_titles = set(field['title'] for field in gen_indexed_fields(index))
    corpus = Corpus(src)
    documents = set(corpus.gen_documents())
    update_index(index.writer(), indexed_titles, documents)
    return index
Example 47
def get_category_index():
    if(index.exists_in(whoosh_index_path, indexname="category_index")):
        cat_ix = index.open_dir(whoosh_index_path, indexname="category_index")
    else:
        cat_ix = index.create_in(whoosh_index_path, schema=CategorySchema(), 
            indexname="category_index")
        fill_category_index(cat_ix)
    return cat_ix
Example 48
def get_restaurant_index():
    if(index.exists_in(whoosh_index_path, indexname="restaurant_index")):
        rest_ix = index.open_dir(whoosh_index_path, indexname="restaurant_index")
    else:
        rest_ix = index.create_in(whoosh_index_path, schema=RestaurantSchema(), 
            indexname="restaurant_index")
        fill_restaurant_index(rest_ix)
    return rest_ix
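
Examples 44, 47 and 48 above keep several named indexes in a single directory. A compact sketch of that pattern; the directory, index name and schema below are placeholders standing in for whoosh_index_path and the LocationSchema/CategorySchema/RestaurantSchema classes used in those examples.

import os

from whoosh import index
from whoosh.fields import Schema, ID, TEXT

def get_named_index(index_dir, name, schema):
    # Several indexes can share one directory; exists_in(), open_dir() and
    # create_in() all accept an indexname argument to select the right one.
    os.makedirs(index_dir, exist_ok=True)
    if index.exists_in(index_dir, indexname=name):
        return index.open_dir(index_dir, indexname=name)
    return index.create_in(index_dir, schema, indexname=name)

location_ix = get_named_index("example_whoosh_dir", "location_index",
                              Schema(key=ID(stored=True, unique=True), content=TEXT))
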
Example 49
 def get_index(self):
     ip = self.indexpath
     if not self.indexpath.startswith('/'):
         ip = path.join(self.env.path, ip)
     if not path.exists(ip):
         os.mkdir(ip)
     if not index.exists_in(ip):
         index.create_in(ip, self.SCHEMA)
     return index.open_dir(ip)
Example 50
 def __init__(self, repos_path, index_path):
     self.repo = Repo(repos_path)
     self.index_path = index_path
     self.git_index = self.repo.open_index()
     if not exists_in(self.index_path):
         schema = Schema(path=ID(unique=True, stored=True), itime=STORED, content=TEXT)
         self.ix = create_in(self.index_path, schema)
     else:
         self.ix = open_dir(self.index_path)
Example 51
 def __init__(self, path="./urllist"):
     if not index.exists_in(path):
         schema = Schema(title=TEXT(stored=True), aya=KEYWORD(stored=True), url=ID(stored=True), cache=ID(stored=True))
         makedirs(path)
         ix = create_in(path, schema)
     else:
         ix = open_dir(path)
     
     self.index = ix
Example 52
def incremental_index(index_dir, root_dir):
    """
	Only re-index the documents that have changed
	index_dir: dir to save index infos
	root_dir: dir of all files to be indexed
	"""
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    index_exist = index.exists_in(index_dir)
    if not index_exist:
        print ("index not exist, create it")
        ix = index.create_in(index_dir, schema=get_schema())

    ix = index.open_dir(index_dir)
    # all paths in the index
    indexed_paths = set()
    # all paths we need to re-index
    to_reindex_paths = set()

    with ix.searcher() as searcher:
        writer = ix.writer()

        # Loop over the stored fields in the index
        for fields in searcher.all_stored_fields():
            indexed_path = fields["path"]
            indexed_paths.add(indexed_path)

            if not os.path.exists(indexed_path):
                # This file was deleted since it was indexed
                # So delete from the index
                writer.delete_by_term("path", indexed_path)
            else:
                # Check if this file was changed since it was indexed
                indexed_time = fields["time"]
                mtime = os.path.getmtime(indexed_path)
                if mtime > indexed_time:
                    # This file has changed since it was indexed
                    # So delete from the index
                    writer.delete_by_term("path", indexed_path)
                    # And add it to the list of files to reindex
                    to_reindex_paths.add(indexed_path)

        # Loop over the files in the filesystem
        for filepath in list_all_files(root_dir):
            if filepath not in indexed_paths:
                # This is a new file, so index it
                add_file_to_index(writer, filepath)
                print ("{0} is a new file".format(filepath))
            elif filepath in to_reindex_paths:
                # This file has changed, so reindex it
                add_file_to_index(writer, filepath)
                print ("{0} is a changed file".format(filepath))
            else:
                # This file has not changed since it was indexed
                print ("{0} not changed".format(filepath))
                pass
        writer.commit()
Example 53
 def setup(self):
     import os
     if not os.path.exists(self.location):
         os.mkdir(self.location)
         self.ix = index.create_in(self.location, self.schema)
     elif index.exists_in(self.location):
         self.ix = index.open_dir(self.location, schema=self.schema)
     else:
         self.ix = index.create_in(self.location, self.schema)
Example 54
 def index(self):
     if self._index is None:
         if not os.path.isdir(self.location):
             os.makedirs(self.location)
         if exists_in(self.location):
             self._index = open_dir(self.location)
         else:
             self._index = create_in(self.location, self._get_schema())
     return self._index
Example 55
    def init_app(self, app):
        """Initialize module and checks if the index exists"""

        self.app = app
        if not 'WHOOSH_INDEX_PATH' in self.app.config:
            raise exc.InitializationError("You must set the WHOOSH_INDEX_PATH option in the configuration")
        self.index_dir = self.app.config["WHOOSH_INDEX_PATH"]
        if not exists_in(self.index_dir):
            self.setup_index()
Example 56
 def get_index(cls):
     idxdir = cls.get_index_dir()
     if index.exists_in(idxdir):
         idx = index.open_dir(idxdir)
     else:
         if not os.path.exists(idxdir):
             os.makedirs(idxdir)
         idx = index.create_in(idxdir, cls.schema)
     return idx
Example 57
 def get_index_writer(self, clear=False):
   if clear:
     ix = self.create_index()
   else:
     if index.exists_in(self._index_dir):
       ix = index.open_dir(self._index_dir)
     else:
       ix = self.create_index()
   return ix.writer()
Example 58
 def __init__(self):
     self.index_dir = self.index_dir_setting
     if not os.path.isabs(self.index_dir):
         self.index_dir = os.path.join(get_global_env(self.env).path,
                                       self.index_dir)
     if index.exists_in(self.index_dir):
         self.index = index.open_dir(self.index_dir)
     else:
         self.index = None
Example 59
def init():
    # Setting my schema ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}

    if not os.path.exists(index_path):
        os.mkdir(index_path)

    indexes = {}
    for ixname, schema in schemas.items():
        """
        This part could be improved, since it only indexes when no index exists yet.
        It does not take into account whether the indexed files have been modified
        or deleted, as explained here:
            @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)

            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix

    # Main routine
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = str(raw_input("Introduzca una palabra del asunto o cuerpo (p.e. contrato): "))
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(unicode(input_user))

            results = searcher.search(myquery)
            print "=================================================="
            for result in results:
                # read_file(result.get("path"))

                print ("Remitente: " + findNameBySender(indexes, result.get("sender_email")))
                print ("Asunto: " + result.get("subject"))
                print "=================================================="