def build_index():
    print("build index")
    client = MongoClient('localhost', 27017)
    collection = client['bdhackathon']['Japan_Travel']
    # update_document() below requires at least one unique field in the
    # schema, so article_id is marked unique here
    schema = Schema(
        article_title=TEXT(stored=True, analyzer=analyzer),
        article_id=ID(unique=True, stored=True),
        author=TEXT(stored=True),
        # content=TEXT(stored=True, analyzer=analyzer)
    )
    # initialize the Whoosh index directory
    if not os.path.exists("index"):
        os.mkdir("index")
        create_in("index", schema)
    ix = open_dir("index")
    writer = ix.writer()
    articles = collection.find()
    for article in articles:
        writer.update_document(
            article_title=article["article_title"],
            article_id=article["article_id"],
            author=article["author"]["account"],
            # content=article["content"]
        )
    writer.commit()
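# A minimal sketch (not part of the original source) of how the index built
# by build_index() above could be queried. Only the "index" directory and the
# article_title/author fields come from build_index(); the rest is standard
# Whoosh usage.
def search_articles(text):
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    ix = open_dir("index")
    with ix.searcher() as searcher:
        query = QueryParser("article_title", ix.schema).parse(text)
        # each hit exposes the stored fields like a dict
        return [(hit["article_title"], hit["author"]) for hit in searcher.search(query)]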
def create_whoosh(self):
    print "creating_whoosh: "
    # initlock = lockfile.ThreadSafeFile(WHOOSH_FOLDER, '_init')
    thistime = datetime.datetime.now()
    dateformat = '%d-%m-%Y %H:%M:%S'
    create_index_flag = False
    # try:
    #     initlock.acquire(timeout=2)
    # except lockfile.LockTimeout:
    #     print "Lock timeout when trying to create whoosh index schema. Continuing without index creation"
    #     return
    # except lockfile.AlreadyLocked:
    #     print "Already locked. Continuing without index creation"
    #     return
    try:
        last_creation = datetime.datetime.strptime(initlock.read(), dateformat)  # deserialize
        print "Last index creation: %s" % datetime.datetime.strftime(last_creation, dateformat)
        if (thistime - last_creation).total_seconds() > 4 * 60 * 60:  # 4 hours
            create_index_flag = True
            print "Index older than 4 hours - will recreate"
        else:
            print "Index is fresh - will not recreate"
    except:
        create_index_flag = True  # do the creation anyway, maybe initial condition
    if create_index_flag:
        ix = create_in(WHOOSH_FOLDER, schema)
        print "Creating search index"
        writer = ix.writer()
        for t in self.collector:
            # print "index: Adding term %s" % t[0]
            writer.add_document(term=u"%s" % t[0], url=u"%s" % t[1], description=u"%s" % t[2])
        writer.commit()
        # we can now free the collector
        self.collector = None
def __load__(region=None):
    """Load/create the indexes.

    :param region: which index to load; None loads all indexes,
        while e.g. 'news' or 'blog' loads only the matching index
    :return: whether loading succeeded
    """
    # load a single index
    if region:
        if region in Indexer.__index__:
            return True
        else:
            if region not in index_dir:
                return False
            if not os.path.exists(index_dir[region]):
                os.makedirs(index_dir[region])
                Indexer.__index__[region] = index.create_in(index_dir[region], schema, indexname=region)
            else:
                Indexer.__index__[region] = index.open_dir(index_dir[region], indexname=region)
            return True
    else:
        # load all indexes
        for reg in index_dir.keys():
            if reg in Indexer.__index__:
                continue  # this index is already loaded
            if not os.path.exists(index_dir[reg]):
                os.mkdir(index_dir[reg])
                Indexer.__index__[reg] = index.create_in(index_dir[reg], schema, indexname=reg)
            else:
                Indexer.__index__[reg] = index.open_dir(index_dir[reg], indexname=reg)
        return True
def open_index(self, schema):
    """
    Opens an index. Returns the writer.
    """
    if not os.path.exists(self.index_path):
        os.mkdir(self.index_path)
        index.create_in(self.index_path, schema)
    self._index = index.open_dir(self.index_path)
    return self._index.writer()
def build_indexes(self):
    if os.path.exists(self.index_location):
        log.debug('removing previous index')
        rmtree(self.index_location)
    if not os.path.exists(self.index_location):
        os.mkdir(self.index_location)
    chgset_idx = create_in(self.index_location, CHGSETS_SCHEMA, indexname=CHGSET_IDX_NAME)
    chgset_idx_writer = chgset_idx.writer()
    file_idx = create_in(self.index_location, SCHEMA, indexname=IDX_NAME)
    file_idx_writer = file_idx.writer()
    log.debug('BUILDING INDEX FOR EXTENSIONS %s '
              'AND REPOS %s' % (INDEX_EXTENSIONS, self.repo_paths.keys()))
    for repo_name, repo in self.repo_paths.items():
        # skip indexing if there aren't any revisions
        if len(repo) < 1:
            continue
        self.index_files(file_idx_writer, repo_name, repo)
        self.index_changesets(chgset_idx_writer, repo_name, repo)
    log.debug('>> COMMITTING CHANGES <<')
    file_idx_writer.commit(merge=True)
    chgset_idx_writer.commit(merge=True)
    log.debug('>>> FINISHED BUILDING INDEX <<<')
def setup(self):
    """
    Defers loading until needed.
    """
    new_index = False
    # Make sure the index is there.
    if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
        os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
        new_index = True
    if not os.access(settings.HAYSTACK_WHOOSH_PATH, os.W_OK):
        raise IOError(
            "The path to your Whoosh index '%s' is not writable for the current user/group."
            % settings.HAYSTACK_WHOOSH_PATH
        )
    self.storage = FileStorage(settings.HAYSTACK_WHOOSH_PATH)
    self.content_field_name, self.schema = self.build_schema(self.site.all_searchfields())
    self.parser = QueryParser(self.content_field_name, schema=self.schema)
    if new_index is True:
        self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    else:
        try:
            self.index = self.storage.open_index(schema=self.schema)
        except index.EmptyIndexError:
            self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    self.setup_complete = True
def run(self):
    # open index
    self.buffer = deque(maxlen=BUFFERLINES)
    if not exists(self.indexdir):
        makedirs(self.indexdir)
        self.ix = create_in(self.indexdir, SCHEMA)
    else:
        if exists_in(self.indexdir):
            self.ix = open_dir(self.indexdir)
        else:
            self.ix = create_in(self.indexdir, SCHEMA)
    self.qp = QueryParser("content", self.ix.schema)
    self.searcher = self.ix.searcher()
    index_p = self.index_p
    while True:
        try:
            # check index_p
            try:
                type, data = index_p.recv()
            except EOFError:
                break
            try:
                if type == QUERY:
                    self._processSearch(data)
                elif type == LOG:
                    self._processLog(data)
                elif type == RENAME:
                    self._processRename(data)
                else:
                    prnt("Unexpected data in logindexsearch.")
            except:
                print_exc()
                prnt("EXCEPTION in logindexsearch process.")
        except KeyboardInterrupt:
            break
    self._dumpBuffer(self.buffer)
    self.searcher.close()
    self.ix.close()
def setup(self):
    """
    Defers loading until needed.
    """
    new_index = False
    # Make sure the index is there.
    if not os.path.exists(settings.HAYSTACK_WHOOSH_PATH):
        os.makedirs(settings.HAYSTACK_WHOOSH_PATH)
        new_index = True
    self.storage = store.FileStorage(settings.HAYSTACK_WHOOSH_PATH)
    self.content_field_name, fields = self.site.build_unified_schema()
    self.schema = self.build_schema(fields)
    self.parser = QueryParser(self.content_field_name, schema=self.schema)
    if new_index is True:
        self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    else:
        try:
            self.index = index.Index(self.storage, schema=self.schema)
        except index.EmptyIndexError:
            self.index = index.create_in(settings.HAYSTACK_WHOOSH_PATH, self.schema)
    self.setup_complete = True
def build_search(cls):
    analyzer = cls.analyzer
    schema = Schema(
        nid=ID(unique=True, stored=True),
        slug=ID(unique=True, stored=True),
        title=TEXT(stored=True, analyzer=analyzer),
        tag=KEYWORD(stored=True, lowercase=True, commas=True, scorable=True),
        description=TEXT(stored=True, analyzer=analyzer),
        content=TEXT(stored=True, analyzer=analyzer)
    )
    folder = cls.tmp_dir
    if not os.path.exists(folder):
        os.mkdir(folder)
        create_in(folder, schema)
    ix = open_dir(folder)
    writer = ix.writer()
    for article in Article.find({'status': Article.ACCEPTED}):
        writer.update_document(
            nid=str(article._id),
            slug=article.slug,
            title=article.title,
            tag=','.join(article.tag),
            description=article.description,
            content=article.content
        )
    writer.commit()
    cls.searcher = ix.searcher()
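# A hedged sketch (assumed, not from the original class) of paging through
# the article index built by build_search() above. cls.searcher and the
# schema fields come from that code; MultifieldParser and search_page() are
# standard Whoosh. The page size of 10 is an arbitrary choice.
def search_articles_page(cls, text, pagenum=1):
    from whoosh.qparser import MultifieldParser

    parser = MultifieldParser(["title", "content"], schema=cls.searcher.schema)
    # search_page returns one page of hits plus total-count metadata
    results = cls.searcher.search_page(parser.parse(text), pagenum, pagelen=10)
    return [(hit["slug"], hit["title"]) for hit in results]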
def clear_index(self):
    """
    Clear the index: Whoosh's ``create_in`` creates a new, empty index
    in the directory even if an index already exists there.
    """
    if os.path.exists("indexes"):
        index.create_in("indexes", self.schema)
def main(script, command='', index='', field='', *query):
    """Store, clear or search data in whoosh indices.

    Can also be used to create vectors needed for task 3.
    'command' is either build|store|clean|search|vector
    'index' is either atc|icd|therapy|case

    Usage: python3 index.py <command> [index] [field] [query]
    """
    # Store all objects in index
    if command == 'build':
        populate_all()
        empty = get_empty_indices()
        for cls in empty:
            store_objects_in_index(cls)
        return

    classes = [ATC, ICD, PatientCase, Therapy]
    if index:
        classes = [i for i in classes if i._NAME == index]
        if not classes:
            print("Unknown index %s, valids: atc|icd|case|therapy" % index)
            sys.exit(2)

    # Store objects in index, will create duplicates if run several times
    if command == 'store':
        populate_all()
        for cls in classes:
            store_objects_in_index(cls)

    # Empty index
    elif command in ('clean', 'clear'):
        for cls in classes:
            create_or_open_index(cls)
            create_in(INDEX_DIR, SCHEMA_MAP[cls._NAME], cls._NAME)
            print("Emptied %s index" % cls.__name__)

    # Create vectors
    elif command.startswith('vector'):
        populate_all()
        create_vectors()

    # Search in whoosh index
    elif command == 'search':
        mapping = {'icd': ('short', 'label'), 'atc': ('code', 'title'),
                   'therapy': ('code', 'title'), 'case': ('code',)}
        query = ''.join(query)  # Flatten query
        cls, = classes  # Can only search on one index at a time
        print_result(extract(mapping[cls._NAME], search(cls, field, query)))

    # Unknown command
    else:
        print("Unknown command '%s'" % command)
        print("Usage: python3 index.py <command> [index] [field] [query]")
        print("Command is either build|store|clean|search|vector")
        sys.exit(2)
    sys.exit(None)
def get_index(self):
    ip = self.indexpath
    if not self.indexpath.startswith('/'):
        ip = path.join(self.env.path, ip)
    if not path.exists(ip):
        os.mkdir(ip)
    if not index.exists_in(ip):
        index.create_in(ip, self.SCHEMA)
    return index.open_dir(ip)
def setup(self):
    import os
    if not os.path.exists(self.location):
        os.mkdir(self.location)
        self.ix = index.create_in(self.location, self.schema)
    elif index.exists_in(self.location):
        self.ix = index.open_dir(self.location, schema=self.schema)
    else:
        self.ix = index.create_in(self.location, self.schema)
def create(self, path):
    """
    Create the index directory if it hasn't already been created.
    """
    if not os.path.exists(path):
        os.mkdir(path)
        # create the index in the directory we just made
        create_in(path, self.schema)
        return True
    return False
def test_detects_that_index_needs_upgrade(self):
    wrong_schema = Schema(content=TEXT())
    index.create_in(self.index_dir, schema=wrong_schema)
    whoosh_backend = WhooshBackend(self.env)
    self.assertEqual(whoosh_backend.is_index_outdated(), True)
    whoosh_backend.recreate_index()
    self.assertEqual(whoosh_backend.is_index_outdated(), False)
def run(args):
    indexdir = "urla.index"
    schema = Schema(file=ID(stored=True),
                    line=NUMERIC(stored=True),
                    network=ID(stored=True),
                    channel=TEXT(stored=True),
                    when=DATETIME(stored=True),
                    speaker=ID(),
                    to=ID(),
                    content=TEXT(stored=True, analyzer=StemmingAnalyzer()))
    os.mkdir(indexdir)
    create_in(indexdir, schema)
def create_index():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           id=fields.ID(stored=True),
                           authors=fields.TEXT(stored=True),
                           wiki=fields.TEXT)
    if not os.path.exists("bibishare/search/index"):
        os.mkdir("bibishare/search/index")
        create_in("bibishare/search/index", schema)
def create_index(self):
    """If the index directory does not exist, this method creates the
    directory, creates the index and saves the schema file."""
    if not os.path.exists(self.indexdir):
        os.mkdir(self.indexdir)
        self.message("Created index at \"" + self.indexdir + "\".")
        create_in(self.indexdir, self.schema)
        with open(self.indexdir + "/" + SCHEMA_PATH, 'w') as output:
            output.write(open(self.schema_path).read())
        self.message("Saved schema to " + self.indexdir + "/" + SCHEMA_PATH + ".")
def get_index():
    """
    Return an index, creating it first if none exists yet.
    """
    if not exists_in("index", indexname="contents"):
        if not os.path.exists("index"):
            os.mkdir("index")
        create_in("index", indexname="contents", schema=get_schema())
    return open_dir("index", indexname="contents")
def create_index(reg):
    woosh_env = reg.getUtility(IWhooshEnvironment)
    try:
        docschema = reg.getUtility(IDocumentSchema)
    except ComponentLookupError:
        raise AttributeError('')
    if index.exists_in(woosh_env['index_dir']):
        print >> sys.stderr, 'Index already exists; you need to delete it before building it again'
        exit(-1)
    index.create_in(woosh_env['index_dir'], docschema.Schema)
def get(cls, instance):
    """
    Get the index object for a python object
    """
    index_path = cls.get_path(instance)
    class_name = Introspection.get_class_name(instance)
    instance = cls.get_instance_from_class(index_path, instance)
    schema = cls.get_schema(instance)
    if not os.path.exists(index_path):
        os.makedirs(index_path)
    if not whoosh_index.exists_in(index_path, indexname=class_name):
        whoosh_index.create_in(index_path, schema, indexname=class_name)
    return cls._get_index(index_path, class_name, schema=schema)
def handle_noargs(self, *args, **options):
    print 'Clearing current index...'
    if not os.path.exists(settings.WHOOSH_INDEX):
        os.mkdir(settings.WHOOSH_INDEX)
    index.create_in(settings.WHOOSH_INDEX, DOCUMENT_WHOOSH_SCHEMA)
    print 'Indexing documents...'
    for document in Document.objects.all():
        document.save()  # A save will trigger a re-index of the document
        sys.stdout.write('.')
        sys.stdout.flush()
    print '\nAll done.'
def index(self, value):
    if not path.exists(value):
        log.info("creating whoosh database directory")
        mkdir(value)
    if not exists_in(value):
        log.info("Whoosh DB does not exist. Creating it.")
        create_in(value, self.schema)
    else:
        log.info("Whoosh DB exists. Using existing Whoosh DB.")
    ix = open_dir(value)
    self._index = ix
def delete_all(cls, instance):
    """Delete all objects of the given type from the index."""
    index_path = Index.get_path(instance)
    instance = Index.get_instance_from_class(index_path, instance)
    if instance is not None:
        class_name = Introspection.get_class_name(instance)
        logger.debug('Deleting all {} from {}'.format(class_name, index_path))
        if os.path.isdir(index_path):
            schema = Index.get_schema(instance)
            # create_in over an existing index replaces it with an empty one
            whoosh_index.create_in(index_path, schema, indexname=class_name)
            return True
def clear_index(self):
    """
    Clear the index: Whoosh's ``create_in`` creates a new, empty index
    in the directory even if an index already exists there.
    """
    if os.path.exists("indexes"):
        index.create_in("indexes", self.schema)
    if os.path.exists("doctypes"):
        with open('doctypes/doctypes_schema.json', 'w') as defaultFile:
            defaultFile.write("{}")
def __init__(self, doc_base):
    self.__doc_base = doc_base
    self.__index_folder = os.path.join(self.__doc_base, '.indices')
    if not os.path.exists(self.__index_folder):
        print ' Create index directory', self.__index_folder
        os.mkdir(self.__index_folder)
    if not exists_in(self.__index_folder):
        create_in(self.__index_folder, INDEX_SCHEMA)
    self.__index_write = open_dir(self.__index_folder).writer()
def __init__(self, order=0, whoosh_query_index_dir="", unique=True):
    super(WhooshQueryLogger, self).__init__(order)
    self.description = "Adds queries to a Whoosh index"
    self.unique = unique
    print "About to create Whoosh query logger"
    self.whooshIndexDir = whoosh_query_index_dir
    schema = Schema(title=ID(unique=True, stored=True),
                    content=TEXT(stored=True),
                    ncontent=NGRAM(stored=True),
                    issued=DATETIME(stored=True))
    if not exists_in(self.whooshIndexDir):
        print "Creating a Whoosh Index."
        create_in(self.whooshIndexDir, schema)
    self.queryIndex = open_dir(self.whooshIndexDir)
    print "The current number of queries held in the index is: " + str(self.queryIndex.doc_count())
    print "Done creating Whoosh query log index"
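# A small sketch (assumed, not part of the original logger) of how the
# unique=True title field above can be used: update_document() replaces any
# existing document with the same title, so repeated queries are not
# duplicated when self.unique is set. `timestamp` should be a datetime for
# the DATETIME field.
def log_query(self, query_text, timestamp):
    writer = self.queryIndex.writer()
    if self.unique:
        # replaces the previous entry for this exact query string
        writer.update_document(title=query_text, content=query_text, issued=timestamp)
    else:
        writer.add_document(title=query_text, content=query_text, issued=timestamp)
    writer.commit()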
def __init__(self):
    """
    Instantiate the whoosh schema and writer and create/open the index.
    """
    self.users_collection = pymongo.Connection().fullteck.users
    # self.webpages_collection = pymongo.Connection().fullteck.webpages_col
    self.indexdir = "index"
    self.indexname = "users"
    self.schema = self.get_schema()
    if not os.path.exists(self.indexdir):
        os.mkdir(self.indexdir)
        create_in(self.indexdir, self.schema, indexname=self.indexname)
    # create an index obj and buffered writer
    self.ix = open_dir(self.indexdir, indexname=self.indexname)
def __init__(self, db):
    """
    Instantiate the whoosh schema and writer and create/open the index.
    """
    self.bookmarks_collection = db.bookmarks_col
    self.webpages_collection = db.webpages_col
    self.indexdir = "index"
    self.indexname = "bookmarks"
    self.schema = self.get_schema()
    if not os.path.exists(self.indexdir):
        os.mkdir(self.indexdir)
        create_in(self.indexdir, self.schema, indexname=self.indexname)
    # create an index obj and buffered writer
    self.ix = open_dir(self.indexdir, indexname=self.indexname)
def get_ix(self):
    """Creates the schema and returns the index"""
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True, unique=True),
                    content=TEXT(analyzer=SpaceSeparatedTokenizer() | LowercaseFilter()),
                    date=STORED)
    # create if it does not exist
    if not os.path.exists(".indexdir"):
        os.mkdir(".indexdir")
        create_in(".indexdir", schema)
    ix = open_dir(".indexdir")
    return ix
def __init__(self):
    self.indexDir = "./indexfile"
    if not os.path.exists(self.indexDir):
        os.mkdir(self.indexDir)
    self.schema = Schema(url=TEXT(stored=True, analyzer=StemmingAnalyzer()),
                         title=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                         content=TEXT(stored=True, analyzer=ChineseAnalyzer()),
                         anchors=KEYWORD(stored=True, commas=True),
                         pageRank=NUMERIC(int, 32, sortable=True, stored=True))
    self.exists = index.exists_in(self.indexDir, indexname="nkai")
    if self.exists:
        self.index = index.open_dir(self.indexDir, indexname="nkai")
    else:
        self.index = index.create_in(self.indexDir, schema=self.schema, indexname="nkai")
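# A hedged sketch (not from the original class) of querying the index above
# ranked by the sortable pageRank field; only self.index and the schema come
# from __init__(), the sortedby/reverse arguments are standard Whoosh.
def search_by_pagerank(self, text, limit=10):
    from whoosh.qparser import QueryParser

    with self.index.searcher() as searcher:
        query = QueryParser("content", self.index.schema).parse(text)
        # sortedby uses the sortable NUMERIC column; reverse=True puts the
        # highest pageRank first instead of relevance ordering
        results = searcher.search(query, limit=limit, sortedby="pageRank", reverse=True)
        return [(hit["url"], hit["title"]) for hit in results]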
def get_index(api, recreate=False, must_exist=False):
    index_dir = api.ftsindex
    if index_dir.exists():
        if recreate:
            rmtree(index_dir)  # pragma: no cover
        elif must_exist:
            raise ValueError('No whoosh index found at {0}.'.format(index_dir))
    if not index_dir.exists():
        index_dir.mkdir()
        schema = Schema(id=ID(stored=True),
                        provider=KEYWORD(stored=True),
                        authoryear=TEXT(stored=True),
                        title=TEXT(analyzer=StemmingAnalyzer(), stored=True),
                        author=TEXT(stored=True),
                        year=TEXT(stored=True),
                        doctype=TEXT(stored=True),
                        lgcode=TEXT(stored=True),
                        body=TEXT(),
                        tags=KEYWORD)
        return index.create_in(index_dir.as_posix(), schema)
    return index.open_dir(index_dir.as_posix())
def build_index(self):
    analyzer = ChineseAnalyzer()
    # create the index schema
    schema = Schema(novelID=ID(stored=True),
                    novelName=TEXT(stored=True, analyzer=analyzer),
                    novelUrl=ID(stored=True),
                    novelAuthor=TEXT(stored=True, analyzer=analyzer),
                    novelIntroduction=TEXT(stored=True, analyzer=analyzer),
                    novelUpdateTime=TEXT(stored=True),
                    novelUpdateUrl=ID(stored=True),
                    novelUpdateName=TEXT(stored=True))
    # create/open index
    dir_path = 'novel_index'
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
        ix = create_in(dir_path, schema)
    else:
        ix = open_dir(dir_path)
    # build index
    writer = ix.writer()
    rows = self.page.find()
    indexed_amount = 0
    for row in rows:
        indexed_amount += 1
        writer.add_document(
            novelID=str(row['_id']),
            novelName=row['novel_name'],
            novelUrl=row['novel_url'],
            novelAuthor=row['novel_author'],
            novelIntroduction=row['novel_introduction'],
            novelUpdateTime=row['novel_update_last_time'],
            novelUpdateUrl=row['novel_update_last_url'],
            novelUpdateName=row['novel_update_last_name'],
        )
    writer.commit()
    print(indexed_amount)
def index(self):
    schema = Schema(path=ID(stored=True),
                    title=TEXT(stored=True),
                    year=TEXT(stored=True),
                    rated=TEXT(stored=True),
                    director=TEXT(stored=True),
                    actors=TEXT(stored=True),
                    plot=TEXT(stored=True),
                    imdb=TEXT(stored=True),
                    poster=TEXT(stored=True),
                    url=TEXT(stored=True))
    indexer = create_in("index", schema)
    writer = indexer.writer()
    # Open with encoding='utf-8' to avoid a UnicodeDecodeError
    # ("'charmap' codec can't decode byte 0x81 ...") on some platforms
    with open('moviedata.csv', encoding='utf-8') as csv_file:
        # Load the csv file
        csv_reader = csv.reader(csv_file, delimiter=',')
        skip = 0
        for row in csv_reader:
            # Skip first row
            if skip == 0:
                skip = 1
                continue
            # For each row in csv, add it as a document with appropriate headers to the index.
            writer.add_document(title=u"%s" % (row[1]),
                                year=u"%s" % (row[2]),
                                rated=u"%s" % (row[3]),
                                director=u"%s" % (row[4]),
                                actors=u"%s" % (row[5]),
                                plot=u"%s" % (row[6]),
                                imdb=u"%s" % (row[7]),
                                poster=u"%s" % (row[8]),
                                url=u"%s" % (row[9]))
    # Commit updates to index
    writer.commit()
    self.indexer = indexer
def create_index():
    # We create the index schema.
    schema = Schema(id=ID(stored=True), title=TEXT, content=TEXT)
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    ix = index.create_in(index_dir, schema)
    # The writer() method of the Index object returns an IndexWriter object that lets us add documents to the index.
    writer = ix.writer()
    # Add documents to the index.
    for doc in docs:
        # Add document
        writer.add_document(id=doc['id'].decode(),
                            title=doc['title'].decode(),
                            content=doc['content'].decode())
    # Calling commit() on the IndexWriter saves the added documents to the index.
    writer.commit()
    ix.close()
def queryWithStemming(self, string, content):
    if string in content:
        return True
    # keep only alphabetic tokens; removing items from a list while
    # iterating over it skips elements, so build a new list instead
    words = [word for word in segment(string) if word.isalpha()]
    string1 = " ".join(words)
    # print string1
    words = [word for word in segment(string + "s") if word.isalpha()]
    string2 = " ".join(words)
    # print string2
    self.init_index()
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT(analyzer=StemmingAnalyzer()))
    ix = create_in("index", schema)
    writer = ix.writer()
    # content = "The good pid file function of directory good can not work"
    writer.add_document(title=u"First document", path=u"/a", content=unicode(content))
    writer.commit()
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(string1)
        results = searcher.search(query)
        res1 = len(results) > 0
        query = QueryParser("content", ix.schema).parse(string2)
        results = searcher.search(query)
        res2 = len(results) > 0
    res = res1 | res2
    # print res, res1, res2
    return res
def gen_whoosh_database(if_rand=True):
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    type=TEXT(stored=True),
                    link=ID(unique=True, stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    whoosh_db = 'database/whoosh'
    if not os.path.exists(whoosh_db):
        os.makedirs(whoosh_db)
        ix = create_in(whoosh_db, schema)
    else:
        ix = open_dir(whoosh_db)
    writer = ix.writer()
    do_for_app(writer, rand=if_rand)
    do_for_post(writer, rand=if_rand)
    print('-' * 10)
    writer.commit()
def createSearchableData(data_file):
    '''
    Schema definition: video id, video title, description
    '''
    stem_analyzer = StemmingAnalyzer()
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True),
                    description=TEXT(analyzer=stem_analyzer, stored=True))
    if not os.path.exists("indexdir"):
        os.mkdir("indexdir")
    # Create the index and a writer to add documents as per the schema
    ix = create_in("indexdir", schema)
    writer = ix.writer()
    with open(data_file) as f:
        youtube_array = json.load(f)
    for youtube_item in tqdm(youtube_array):
        youtube_id = youtube_item['id']
        youtube_title = youtube_item['title']
        youtube_description = youtube_item['description']
        writer.add_document(id=youtube_id,
                            title=youtube_title,
                            description=youtube_description)
    writer.commit()
def write_index_file(index_dir, tablename):
    analyzer = ChineseAnalyzer(minsize=1)  # minsize=1 allows single-character tokens
    schema = Schema(seq_no=NUMERIC(stored=True),
                    source=TEXT(stored=True),
                    title=TEXT(stored=True),
                    segwords=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(index_dir, schema)
    writer = ix.writer()
    datasets = fetch_segwords(tablename)
    for dataset in datasets:
        get_seq_no = int(dataset["seq_no"])
        get_source = dataset["ad_title_source"]
        get_title = dataset["ad_title"].replace('\n', '')
        get_segwords = dataset["ad_title_segwords"].replace('\n', '')
        writer.add_document(seq_no=get_seq_no,
                            source=get_source,
                            title=get_title,
                            segwords=get_segwords)
    writer.commit()
    loginfo = 'Inverted index for %s has been created.' % tablename
    gl.write_log(logpath, 'info', loginfo)
def main():
    # use jieba's Chinese analyzer
    analyzer = ChineseAnalyzer()
    # create the schema; stored=True means the field can be returned with results
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    path=ID(stored=False),
                    content=TEXT(stored=True, analyzer=analyzer),
                    id=TEXT(stored=True))
    # read the yaml config
    config = os.path.abspath(os.path.dirname(__file__))[:-5] + 'config.yaml'
    with open(config) as f:
        c = yaml.load(f.read())
    indexdir = c['index']
    # store the schema information under the indexdir directory
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    idx = create_in(indexdir, schema)
    # add the documents to be indexed according to the schema
    # note: strings must be unicode
    return idx
def build_whoosh_database():
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    type=TEXT(stored=True),
                    link=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in(whoosh_database, schema)
    writer = ix.writer()
    mpost = MPost()
    recs = mpost.query_all()
    for rec in recs:
        text2 = html2text.html2text(tornado.escape.xhtml_unescape(rec.cnt_html))
        print(text2)
        writer.add_document(
            title=rec.title,
            type='<span style="color:blue;">[文档]</span>',
            link='/post/{0}.html'.format(rec.uid),
            content=text2,
        )
    writer.commit()
def index_create():
    # The key change: replace the original
    # RegexAnalyzer(ur"([\u4e00-\u9fa5])|(\w+(\.?\w+)*)") with the jieba
    # Chinese tokenizer instead of the regex-based analyzer.
    analyzer = ChineseAnalyzer()
    # list all fields of the index
    schema = Schema(title=TEXT(stored=True),
                    path=ID(stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    ix = create_in('indexer', schema)
    # add all documents to the index
    writer = ix.writer()
    for root, dirs, files in os.walk('data2/'):
        for file in files:
            path2 = os.path.join(root, file)
            with open(path2, 'r') as f:
                content2 = f.read()
            title2 = content2.split('\n')[0]
            writer.add_document(title=title2,
                                path='auto.sohu.com/' + path2[6:].replace('|', '/') + '.shtml',
                                content=content2)
    writer.commit()
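# A minimal sketch (assumption, not in the original) showing how the stored
# content field above enables highlighted snippets; hit.highlights() is
# standard Whoosh and works here because content is stored=True. Only the
# 'indexer' directory name comes from index_create().
def search_with_snippets(text):
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    ix = open_dir('indexer')
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(text)
        for hit in searcher.search(query):
            print(hit["title"])
            # prints an excerpt of the stored content with matches marked up
            print(hit.highlights("content"))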
def cargar():
    categorias = Schema(name=ID(stored=True), content=TEXT)
    titulos = Schema(name=ID(stored=True), content=KEYWORD)
    enlaces = Schema(name=ID(stored=True), content=TEXT)
    fechas = Schema(name=ID(stored=True), content=DATETIME)
    descripciones = Schema(name=ID(stored=True), content=KEYWORD)
    schemas = [categorias, titulos, enlaces, fechas, descripciones]
    paths = []
    # `schema` and `lista` are assumed to be defined at module level
    ix = index.create_in("indexdir3", schema)
    writer = ix.writer()
    path = "myRoko.txt"
    with open(path, "r") as f:
        content = f.read()
    writer.add_document(name=path, content=content)
    writer.commit()
    if not os.path.exists("index"):
        os.mkdir("index")
        ix = create_in("index", schema)
    ix = open_dir("index")
    writer = ix.writer()
    for i in range(len(lista[0])):
        writer.add_document(category=lista[0][i],
                            title=lista[1][i],
                            link=lista[2][i],
                            date=lista[3][i],
                            description=lista[4][i])
    writer.commit()
    searcher = ix.searcher()
    # with ix.searcher() as searcher:
def index():
    if not os.path.exists(dirindextemas):
        os.mkdir(dirindextemas)
    ix = create_in(dirindextemas, schema=get_schema())
    writer = ix.writer()
    # Extract the data using BeautifulSoup
    # Missing
    l = extractXML()
    i = 0
    # Todo: Extract the attributes
    # for item in l:
    #     writer.add_document(titulo=titulo, antetitulo=antetitulo, link=link, description=description, fecha=fecha)
    #     i += 1
    writer.add_document(titulo="test", description="test", categoria="test cag",
                        fecha=datetime.datetime.now())
    messagebox.showinfo("Temas indexados", "Se han indexado " + str(i) + " temas")
    writer.commit()
def full_index(index_dir, entries):
    idx = index.create_in(index_dir, SCHEMA)
    writer = idx.writer()
    datas = []
    for ent in entries:
        # read the whole file at once
        with open(ent, mode='r') as f:
            all_of_it = f.read()
        all_of_it = all_of_it.replace('\n', ' ')
        data = {'title': ent.split('/')[1],
                'path': ent,
                'content': all_of_it}
        datas.append(data)
    # TODO: get data
    for data in datas:
        writer.add_document(**data)
    writer.commit()
    return idx
def create_whoosh_index(video_list, index_name):
    # Schema definition:
    schema = Schema(id=ID(stored=True),
                    title=TEXT(stored=True),
                    description=TEXT(analyzer=StemmingAnalyzer(), stored=True))
    # create a folder to store the index
    if not os.path.exists(index_name):
        os.mkdir(index_name)
    index = create_in(index_name, schema)
    writer = index.writer()
    for video_item in video_list:
        video_id = video_item['id']
        video_title = video_item['snippet']['title']
        video_description = video_item['snippet']['description']
        writer.add_document(id=video_id, title=video_title, description=video_description)
    writer.commit()
def __init__(self):
    global RUN
    print('Initial searcher')
    self.schema = Schema(url=ID(stored=True, unique=True),
                         store_path=TEXT(stored=True),
                         timestamp=TEXT(stored=True),
                         content=TEXT(stored=False, analyzer=ChineseAnalyzer()))
    try:
        print("Open index dir")
        self.my_idx = windex.open_dir(cfg.storage_dir)
    except:
        print("Index dir not found, create new index dir")
        self.my_idx = windex.create_in(cfg.storage_dir, self.schema)
    self.searcher = self.my_idx.searcher()
    self.content_reader = helper.content_reader()
    print("All documents: %s" % self.searcher.doc_count_all())
    print("Valid documents: %s" % self.searcher.doc_count())
    print("Init bayes model, may take some time...")
    self.bayesData = bayes.get_instance()
    RUN = True
    self.ticker = Ticker()
    self.ticker.tick = False
    threading.Thread(target=self.ticker.run).start()
    self.update_lock = threading.Lock()
    self.read_lock = threading.RLock()
    self.aborted_update = False
    self.is_update_in_progress = False
    self.update_progress = 0
def build_index(dir):
    # load the preprocessed documents
    df = doc_preprocess(dir)
    # apply the jieba Chinese analyzer to tokenize the documents
    analyzer = ChineseAnalyzer()
    # create schema; stored=True means the field can be returned to the user
    schema = Schema(idx=ID(stored=True),
                    title=TEXT(stored=True, analyzer=analyzer),
                    keyword=KEYWORD(analyzer=analyzer),
                    content=TEXT(stored=False, analyzer=analyzer))
    # store the schema information in 'indexdir'
    indexdir = 'indexdir/'
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)
    ix = create_in(indexdir, schema)
    # build the index based on the schema
    writer = ix.writer()
    for idx, val in df.iterrows():
        writer.add_document(idx=str(idx),
                            title=str(val[0]),
                            keyword=val[5],
                            content=str(val[6]))
    writer.commit()
def handle(self, *args, **kwargs):
    """
    Creates the index iterating over all the pages of the site
    """
    schema = Schema(pk=NUMERIC(unique=True, stored=True),
                    title=TEXT,
                    summary=TEXT,
                    tags=KEYWORD(commas=True, scorable=True),
                    pub_date=DATETIME(sortable=True))
    if not os.path.exists(settings.INDEX):
        os.mkdir(settings.INDEX)
    ix = create_in(settings.INDEX, schema)
    writer = ix.writer()
    objects = Page.objects.all()
    for object in objects:
        tags = map(lambda x: x.title, object.tags.all())
        writer.add_document(title=object.title,
                            summary=object.summary,
                            tags=",".join(tags),
                            pk=object.pk,
                            pub_date=object.pub_date)
    writer.commit()
def create_news_index(dir_index, news):
    if not os.path.exists(dir_index):
        os.mkdir(dir_index)
    ind = create_in(dir_index, schema=get_news_schema())
    writer = ind.writer()
    for story in news:
        category = story[0]
        title = story[1]
        link = story[2]
        description = story[3]
        date = story[4]
        writer.add_document(category=str(category),
                            title=str(title),
                            link=str(link),
                            description=str(description),
                            date=date)
    writer.commit()
    messagebox.showinfo(
        "Success", "Index created correctly, " + str(len(news)) + " news items saved")
def __init__(self, index_path, language):
    from whoosh import index as whoosh_index
    from whoosh.fields import Schema, TEXT, ID
    from whoosh import qparser
    from whoosh.highlight import UppercaseFormatter
    from whoosh.analysis import SimpleAnalyzer, LanguageAnalyzer
    from whoosh.lang import has_stemmer, has_stopwords
    import os

    if not has_stemmer(language) or not has_stopwords(language):
        # TODO Display a warning?
        analyzer = SimpleAnalyzer()
    else:
        analyzer = LanguageAnalyzer(language)

    self.schema = Schema(path=ID(unique=True, stored=True),
                         body=TEXT(analyzer=analyzer))
    self.formatter = UppercaseFormatter()
    self.index_path = index_path

    if not os.path.exists(index_path):
        try:
            os.mkdir(index_path)
        except OSError as e:
            sys.exit("Error creating Whoosh index: %s" % e)

    if whoosh_index.exists_in(index_path):
        try:
            self.search_index = whoosh_index.open_dir(index_path)
        except whoosh_index.IndexError as e:
            sys.exit("Error opening whoosh index: {0}".format(e))
    else:
        self.search_index = whoosh_index.create_in(index_path, self.schema)

    self.query_parser = qparser.MultifieldParser(["body", "path"], schema=self.schema)
    self.query_parser.add_plugin(qparser.FuzzyTermPlugin())
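# A hedged usage sketch (not part of the original class): with the
# FuzzyTermPlugin registered above, the parsed query language accepts
# "word~" / "word~2" for edit-distance matching. Only self.query_parser and
# self.search_index come from __init__().
def fuzzy_search(self, text):
    with self.search_index.searcher() as searcher:
        # e.g. "whosh~2" would match "whoosh" within two edits
        query = self.query_parser.parse(text)
        return [hit["path"] for hit in searcher.search(query)]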
def createIndexRsr(path, tblRsr):
    schema = Schema(fname=TEXT(stored=True), lname=TEXT(stored=True),
                    id=TEXT(stored=True), mstid=TEXT(stored=True),
                    science=TEXT(stored=True), scienceCode=TEXT(stored=True),
                    field=TEXT(stored=True), subfield=TEXT(stored=True),
                    keyws_en=TEXT(stored=True), keyws_sl=TEXT(stored=True),
                    content=TEXT)
    index = create_in(path + "whooshindex/rsr", schema)
    writer = index.writer()
    for rsr in tblRsr.all():
        content = ""
        s = u""
        s_code = u""
        f = u""
        sub = u""
        keyws_en = u""
        keyws_sl = u""
        if rsr.has_key('science'):
            s = rsr['science']['#text']
            s_code = rsr['science']['@code']
            content += " " + rsr['science']['#text']
        if rsr.has_key('field'):
            f = rsr['field']['#text']
            content += " " + rsr['field']['#text']
        if rsr.has_key('subfield'):
            sub = rsr['subfield']['#text']
            content += " " + rsr['subfield']['#text']
        if rsr.has_key('keyws_en'):
            keyws_en = rsr['keyws_en']['@keyws']
            content += " " + rsr['keyws_en']['@keyws']
        if rsr.has_key('keyws_sl'):
            keyws_sl = rsr['keyws_sl']['@keyws']
            content += " " + rsr['keyws_sl']['@keyws']
        if content != "":
            print rsr["@id"] + ": " + content
        writer.add_document(lname=rsr['fname'], fname=rsr['lname'],
                            id=rsr['@id'], mstid=rsr['@mstid'],
                            science=s, scienceCode=s_code, field=f, subfield=sub,
                            keyws_en=keyws_en, keyws_sl=keyws_sl, content=content)
    writer.commit()
    return index
def gen_whoosh_database(kind_arr=None, post_type=None):
    '''
    :param kind_arr: defines the `type` values other than Post, Page, Wiki
    :param post_type: defines the templates for the different kinds
    :return:
    '''
    if kind_arr is None:
        kind_arr = []
    if post_type is None:
        post_type = {}
    analyzer = ChineseAnalyzer()
    schema = Schema(title=TEXT(stored=True, analyzer=analyzer),
                    catid=TEXT(stored=True),
                    type=TEXT(stored=True),
                    link=ID(unique=True, stored=True),
                    content=TEXT(stored=True, analyzer=analyzer))
    whoosh_db = 'database/whoosh'
    if not os.path.exists(whoosh_db):
        os.makedirs(whoosh_db)
        create_idx = create_in(whoosh_db, schema)
    else:
        create_idx = open_dir(whoosh_db)
    writer = create_idx.writer()
    # do_for_app2(writer, rand=if_rand)
    for switch in [True, False]:
        do_for_post(writer, rand=switch, doc_type=post_type['1'])
        do_for_wiki(writer, rand=switch, doc_type=post_type['1'])
        do_for_page(writer, rand=switch, doc_type=post_type['1'])
        for kind in kind_arr:
            do_for_app(writer, rand=switch, kind=kind, doc_type=post_type)
    writer.commit()
def __init__(self, th):
    BaseSearchEngine.__init__(self, th, False)
    self.__ix_writer = None
    ix_dir = os.path.join(th.prefixes[0], 'index', "ix_" + str(whoosh_ix_ver))
    if not os.path.isdir(ix_dir):
        os.makedirs(ix_dir)
    # try to load a pre-existing index
    try:
        self.indexer = open_dir(ix_dir)
    except (EmptyIndexError, IndexVersionError):
        # create a new one
        try:
            shutil.rmtree(ix_dir, True)
            os.makedirs(ix_dir)
        except OSError:
            pass
        schema = Schema(
            kitab=ID(stored=True),
            vrr=ID(stored=True, unique=False),  # version release
            nodeIdNum=ID(stored=True, unique=False),
            title=TEXT(stored=True, field_boost=1.5, analyzer=analyzer),
            content=TEXT(stored=False, analyzer=analyzer),
            # content=TEXT(stored=False, analyzer=analyzer, vector=Frequency(analyzer=analyzer)),  # with term vector
            tags=IDLIST(stored=False))
        self.indexer = create_in(ix_dir, schema)
    # self.__ix_qparser = ThMultifieldParser(self.th, ("title", "content",), schema=self.indexer.schema)
    self.__ix_qparser = MultifieldSQParser((
        "title",
        "content",
    ), self.indexer.schema)
    self.__ix_qparser.add_plugin(
        FieldAliasPlugin({
            u"kitab": (u"كتاب",),
            u"title": (u"عنوان",),
            u"tags": (u"وسوم",)
        }))
    # self.__ix_pre = whoosh.query.Prefix
    self.__ix_searcher = self.indexer.searcher()
def create_index(postid, condition, file_dict, schema_name, num_pro):
    index_path = 'proteomicsdb/index_dir'
    # make Whoosh index in the directory from scratch if it does not yet exist
    if not os.listdir(index_path):
        ix = index.create_in(index_path, schema_name)
    # else if there's already an index, just open it and add to it
    else:
        ix = index.open_dir(index_path)
    # add elements to the index
    writer = ix.writer()
    for i in range(num_pro):
        writer.add_document(post_id=postid,
                            condition=condition,
                            accession=list(file_dict.values())[0][i],
                            description=list(file_dict.values())[1][i],
                            gene=list(file_dict.values())[2][i],
                            fdr=list(file_dict.values())[3][i],
                            species=list(file_dict.values())[4][i],
                            mw=list(file_dict.values())[5][i],
                            peptides=list(file_dict.values())[6][i],
                            psm=list(file_dict.values())[7][i],
                            uniq_peptides=list(file_dict.values())[8][i],
                            abun_t1=list(file_dict.values())[9][i],
                            abun_t2=list(file_dict.values())[10][i],
                            abun_t3=list(file_dict.values())[11][i],
                            abun_t4=list(file_dict.values())[12][i],
                            abun_t5=list(file_dict.values())[13][i],
                            q_value=list(file_dict.values())[14][i],
                            pep=list(file_dict.values())[15][i])
    # "merge=False" means when adding multiple files' worth of entries,
    # keep each file's entries in a separate segment
    writer.commit(merge=False)
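# A hedged alternative (an assumption, not from the original) for the writer
# step above when several requests may index at once: whoosh.writing's
# AsyncWriter waits for the index lock in a background thread instead of
# raising LockError when another writer already holds it.
def get_writer(ix):
    from whoosh.writing import AsyncWriter
    # drop-in replacement for ix.writer(); add_document()/commit() as usual
    return AsyncWriter(ix)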
def create_index():
    # Create the schema for this index, which denotes the types of each field, and next try to build the index itself
    # using this schema. Note that this schema treats the URL as the unique identifier for documents in the index,
    # and scores documents based on the title and content alone
    index_dir = ".index"

    # Try to create the index directory
    os.mkdir(index_dir)

    # Build a new index in this directory
    index = create_in(index_dir, index_schema)

    # Get a writer for the index
    index_writer = index.writer()

    # Add the main pages to the index
    for main_page in ['about_me', 'research', 'resume']:
        insert_document(index_writer, main_page, 'http://www.jontedesco.net/' + main_page, main_page)

    # Add the blog entries
    blog_posts = list(Post.objects.all())
    for blog_post in blog_posts:
        insert_document(index_writer, blog_post.title, 'http://www.jontedesco.net/blog/' + blog_post.name, blog_post.name)

    # Add the projects
    projects = list(Project.objects.all())
    for project in projects:
        insert_document(index_writer, project.title, 'http://www.jontedesco.net/projects/' + project.name, project.name)

    # Commit all the changes, so that every change is flushed to disk, and we can safely query the index
    index_writer.commit()

    return index, index_schema
def dblpindextest():
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True),
                    author=TEXT, content=TEXT)
    cwd = os.path.dirname(os.path.realpath(__file__))
    indexDirPath = os.path.join(cwd, os.path.pardir, "testindexdir")
    dataDirPath = os.path.join(cwd, os.path.pardir, "dblpdata", "dblp-ref")
    if not os.path.exists(indexDirPath):
        os.mkdir(indexDirPath)
    ix = create_in(indexDirPath, schema)
    writer = ix.writer()
    linelens = []
    times = []
    filepath = os.path.join(dataDirPath, "dblp-ref-3.json")
    with open(filepath, "r", encoding='utf-8') as f:
        for line in f:
            jsonline = json.loads(line)
            try:
                t1 = time.time()
                writer.add_document(title=jsonline['title'],
                                    path=jsonline['id'],
                                    content=jsonline['abstract'])
                t2 = time.time()
                if t2 - t1 > 0:
                    times.append(t2 - t1)
                    linelens.append(len(jsonline['abstract']))
                print(len(line), t2 - t1)
            except Exception as e:
                print(str(e))
    plotlens = []
    plottimes = []
    for i in range(0, 50):
        x = randbelow(1000)
        plotlens.append(linelens[x])
        plottimes.append(times[x])
    plt.scatter(plotlens, plottimes)
    plt.show()
def index_search(s):
    """
    Index and then search based on the query string.
    :param s: the cleaned query string, e.g.:
        đăng kí nguyện vọng 1 như thế nào ?#dạ cho em hỏi, em muốn đăng kí nguyện vọng 1 thì như thế nào ạ?
    :return: the list of documents found
    """
    results_search = []
    results_search.append(s)
    client = MongoClient('mongodb://localhost:27017/')  # connect to MongoDB
    db = client.TuyenSinhDB  # connect to the database
    collection = db.WordSegmentation  # connect to the database's collection
    select_table = collection.find({}, {"_id": 0})  # read data
    # build the index
    schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
    ix = create_in(
        "/home/bichmi/Desktop/KhoaLuanCrawl/KhoaLuan/selenium_TuyenSinh/Data_index",
        schema)
    writer = ix.writer()
    for item in select_table:
        content = item['questions'] + ' ' + item['answers'] + ' ' + item['dates']
        writer.add_document(title=content, path=u"/a", content='')
    writer.commit()
    # search with the query string, parsing it against the title field
    with ix.searcher() as searcher:
        query = QueryParser("title", ix.schema).parse(s)
        results = searcher.search(query)
        # import pdb
        # pdb.set_trace()
        if len(results) <= 0:
            print("Không có kết quả phù hợp với câu hỏi!")
            return 0
        else:
            for hit in results:
                results_search.append(hit['title'])
    return results_search
def create_gutenberg_index_rdf(bz2_rdf_filename, indexdir):
    """Build whoosh index from parsed RDF.

    DB contents are no longer identical to RDF output. Plus the index now
    stores selected db row ids. DEPRECATED"""
    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)  # don't buffer stdout
    print "WARNING: direct use of rdf content may not accurately reflect database contents"
    schema = get_schema()
    whoosh_index = create_in(indexdir, schema)
    writer = whoosh_index.writer()
    for count, record in enumerate(
            gutenberg_rdf_parser.parse_rdf_bz2(bz2_rdf_filename,
                                               GutenbergIndexFilter().filter)):
        # Only index fields from description records. File records can be ignored.
        if record['record_type'] == 'DESCRIPTION':
            if count % 5000 == 0:
                print count,
            subset = {k: record[k] for k in schema.names() if k in record}
            writer.add_document(**subset)
    print "committing...",
    writer.commit()
    print "DONE"
def __init__(self):
    self.scope = 20
    self.terms = set()
    self.index_path = "index"
    self.common_terms = set()
    self.schema = Schema(title=TEXT(stored=True),
                         path=TEXT(stored=True),
                         page=NUMERIC(stored=True),
                         content=TEXT(stored=True))
    self.ix = None
    self.index_files = False
    if not os.path.exists(self.index_path):
        os.mkdir(self.index_path)
        self.ix = create_in(self.index_path, self.schema)
        self.index_files = True
    else:
        self.ix = open_dir(self.index_path)
    self.writer = self.ix.writer()
    self.read()
    self.writer.commit()
    self.searcher = self.ix.searcher()
    self.corrector = ListCorrector(sorted(list(self.common_terms)))
    self.parser = QueryParser("content", self.ix.schema)
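# A small sketch (assumed, not from the original class) of using the
# ListCorrector built above for "did you mean" suggestions; Corrector.suggest()
# is part of whoosh.spelling.
def suggest_terms(self, word, limit=3):
    # returns up to `limit` known common terms within a small edit distance
    return self.corrector.suggest(word, limit=limit)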