Code example #1
File: ndgc.py Project: abhishekgoyal1994/InfoZeal
def searchModule(qq):
	global results_bm25, results_tfidf, results_tf, results, idx, time_bm25, time_tfidf, time_tf
	results_bm25 = results_tfidf = results_tf = results = []
	time_bm25 = time_tfidf = time_tf = None
	if flag == 1:
		if doStem!="false" and doStop!="false":
			idx = open_dir("Indexed/Index_stsw")
		elif doStem!="false" and doStop=="false":
			idx = open_dir("Indexed/Index_st")
		elif doStem=="false" and doStop!="false":
			idx = open_dir("Indexed/Index_sw")
		else:
			idx = open_dir("Indexed/Index") 
		qp = qparser.QueryParser("content", schema=idx.schema)
	q = qp.parse(unicode(qq))  # by default, the parser treats the words as if they were connected by AND
		s = idx.searcher()
		results 		= s.search(q)
		results.fragmenter.surround = 50
		start = time.time()
		results_bm25 	= idx.searcher().search(q)
		time_bm25 = (time.time()-start)
		results_bm25.fragmenter.surround = 50

		start = time.time()
		results_tfidf 	= idx.searcher(weighting=scoring.TF_IDF()).search(q)
		time_tfidf = (time.time()-start)
		results_tfidf.fragmenter.surround = 50

		start = time.time()
		results_tf 	= idx.searcher(weighting=scoring.Frequency()).search(q)
		time_tf = (time.time()-start)
		results_tf.fragmenter.surround = 50
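
Note: the snippet above opens a new searcher for each weighting and never closes any of them. A minimal sketch of the same three-way scorer comparison with context-managed searchers, so each one is released; the index path and query string here are placeholders, not values from this project:

from whoosh import index, qparser, scoring

idx = index.open_dir("Indexed/Index")
qp = qparser.QueryParser("content", schema=idx.schema)
q = qp.parse(u"example query")  # placeholder query
for name, weighting in [("bm25", scoring.BM25F()),
                        ("tfidf", scoring.TF_IDF()),
                        ("tf", scoring.Frequency())]:
    # the searcher is closed automatically when the block exits
    with idx.searcher(weighting=weighting) as s:
        print(name, len(s.search(q)))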
Code example #2
 def open_index(self, idx_dir):
     global index_error_given
     if os.path.exists(idx_dir):
         try:
             return open_dir(idx_dir)
         except ValueError:
             if not index_error_given:
                 index_error_given = True
                 g.es_print('bigdash.py: exception in whoosh.open_dir')
                 g.es_print('please remove this directory:', g.os_path_normpath(idx_dir))
             return None
             # Doesn't work: open_dir apparently leaves resources open,
             # so shutil.rmtree(idx_dir) fails.
                 # g.es_print('re-creating', repr(idx_dir))
                 # try:
                     # import shutil
                     # shutil.rmtree(idx_dir)
                     # os.mkdir(idx_dir)
                     # self.create()
                     # return open_dir(idx_dir)
                 # except Exception as why:
                     # g.es_print(why)
                     # return None
     else:
         try:
             os.mkdir(idx_dir)
             self.create()
             return open_dir(idx_dir)
         except Exception:
             g.es_exception()
             return None
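
A common way to avoid this exception handling is to probe with whoosh.index.exists_in before opening. A minimal sketch of that open-or-create pattern; note that exists_in only checks for a table-of-contents file, so a corrupted index (the ValueError case above) can still raise:

import os
from whoosh import index

def open_or_create(idx_dir, schema):
    # create the directory on first use, then open or build the index
    if not os.path.exists(idx_dir):
        os.mkdir(idx_dir)
    if index.exists_in(idx_dir):
        return index.open_dir(idx_dir)
    return index.create_in(idx_dir, schema)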
Code example #3
File: indexer.py Project: DMGbupt/wechat-crawler
 def __load__(region=None):
     """Load or build the index.
     :param region: index scope; None loads every index, "news"/"blog" loads only that one
     :return: whether loading succeeded
     """
     # load the requested index
     if region:
         if region in Indexer.__index__:
             return True
         else:
             if region not in index_dir:
                 return False
             if not os.path.exists(index_dir[region]):
                 os.makedirs(index_dir[region])
                 Indexer.__index__[region] = index.create_in(index_dir[region], schema, indexname=region)
             else:
                 Indexer.__index__[region] = index.open_dir(index_dir[region], indexname=region)
             return True
     else:  # load all indexes
         for reg in index_dir.keys():
             if reg in Indexer.__index__:
                 continue
             if not os.path.exists(index_dir[reg]):
                 os.mkdir(index_dir[reg])
                 Indexer.__index__[reg] = index.create_in(index_dir[reg], schema, indexname=reg)
             else:
                 Indexer.__index__[reg] = index.open_dir(index_dir[reg], indexname=reg)
         return True
Code example #4
File: practica.py Project: garridev/practicas-aii
def init():
    # Setting my schema ...
    schema_email = Schema(
        path=TEXT(stored=True),
        sender_email=TEXT(stored=True),
        recipient_emails=TEXT,
        date=DATETIME,
        subject=TEXT(stored=True),
        body=TEXT,
    )
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book}

    if not os.path.exists(index_path):
        os.mkdir(index_path)

    indexes = {}
    for ixname, schema in schemas.items():
        """
        Esta parte es mejorable, ya que sólo indexa si no existe indice. 
        No tiene en cuenta si los archivos indexados se han modificado o si 
        se han eliminado como se explica aquí:
            @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        """
        exists = index.exists_in(index_path, indexname=ixname)
        if not exists:
            ix = index.create_in(index_path, schema, indexname=ixname)

            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if ixname == "index_emails":
                files = read_dir()
                index_emails(files, writer)
            elif ixname == "index_book":
                index_book(writer)
        else:
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix

    # Main routine
    while True:
        ix = indexes.get("index_emails")
        with ix.searcher() as searcher:
            input_user = str(raw_input("Introduzca una palabra del asunto o cuerpo (p.e. contrato): "))
            mparser = MultifieldParser(["subject", "body"], schema=ix.schema)
            myquery = mparser.parse(unicode(input_user))

            results = searcher.search(myquery)
            print "=================================================="
            for result in results:
                # read_file(result.get("path"))

                print ("Remitente: " + findNameBySender(indexes, result.get("sender_email")))
                print ("Asunto: " + result.get("subject"))
                print "=================================================="
Code example #5
File: search.py Project: nano13/nvcli
 def getIndex(self):
     module_dir = self.indexdir +"/"+ self.sword_module_name
     if os.path.exists(module_dir):
         ix = index.open_dir(module_dir)
     else:
         indexer = Indexer(self.sword_module_name, self.Reader)
         indexer.buildIndex()
         
         ix = index.open_dir(module_dir)
     
     return ix
Code example #6
File: merge_test.py Project: garridev/practicas-aii
def init():
    # Setting my schema ...
    schema_email = Schema(path=TEXT(stored=True), sender_email=TEXT(stored=True), recipient_emails=TEXT(stored=True), date=DATETIME, subject=TEXT(stored=True), body=TEXT)
    schema_book = Schema(email=TEXT(stored=True), name=TEXT(stored=True))
    schemas = {"index_emails": schema_email, "index_book": schema_book }

    if not os.path.exists(index_path):
        os.mkdir(index_path)

    indexes = {}
    for ixname, schema in schemas.items():
        '''
        This part could be improved: it only indexes when no index exists yet.
        It does not account for indexed files being modified or deleted, as
        explained here:
            @url http://pythonhosted.org/Whoosh/indexing.html#incremental-indexing
        '''
        exists = index.exists_in(index_path, indexname=ixname)
        if(not exists):
            ix = index.create_in(index_path, schema, indexname=ixname)
        
            # Indexing ...
            ix = index.open_dir(index_path, indexname=ixname)
            writer = ix.writer()
            if(ixname == "index_emails"):
                files = read_dir()
                index_emails(files, writer)
            elif(ixname == "index_book"):
                index_book(writer)
        else:   
            ix = index.open_dir(index_path, indexname=ixname)
        indexes[ixname] = ix
    
    # Main routine
    while(True):
        options = ['a', 'b', 'c']
        input_user = str(raw_input("Elija el apartado que desea ejecutar (A, B o C): "))
        input_user_lower = string.lower(input_user)
        if input_user_lower in options:
            def apartadoA():
                input_user = raw_input("Introduzca una palabra del asunto o cuerpo (p.e. contrato): ")
                findMailByWordInSubjectOrBody(indexes, input_user)
            def apartadoB():
                input_user = raw_input("Buscar emails posteriores a la fecha (con formato YYYYMMDD): ")
                findMailbyDate(indexes, input_user)
            def apartadoC():
                input_user = raw_input("Introduzca palabras spam (p.e. 'Contrato Gracias compraventa'): ")
                findMailBySpamWords(indexes, input_user)
            options = { 'a': apartadoA, 'b': apartadoB, 'c': apartadoC } 
            options[input_user_lower]()
        else:
            print "Opción no válida."
Code example #7
File: indexing.py Project: pombredanne/moin2
    def update(self, tmp=False):
        """
        Make sure index reflects current backend state, add missing stuff, remove outdated stuff.

        This is intended to be used:
        * after a full rebuild that was done at tmp location
        * after wiki is made read-only or taken offline
        * after the index was moved to the normal index location

        Reason: new revisions that were created after the rebuild started might be missing in new index.

        :returns: index changed (bool)
        """
        index_dir = self.index_dir_tmp if tmp else self.index_dir
        index_all = open_dir(index_dir, indexname=ALL_REVS)
        try:
            # NOTE: self.backend iterator gives (mountpoint, revid) tuples, which is NOT
            # the same as (name, revid), thus we do the set operations just on the revids.
            # first update ALL_REVS index:
            revids_mountpoints = dict((revid, mountpoint) for mountpoint, revid in self.backend)
            backend_revids = set(revids_mountpoints)
            with index_all.searcher() as searcher:
                ix_revids_names = dict((doc[REVID], doc[NAME]) for doc in searcher.all_stored_fields())
            revids_mountpoints.update(ix_revids_names) # this is needed for stuff that was deleted from storage
            ix_revids = set(ix_revids_names)
            add_revids = backend_revids - ix_revids
            del_revids = ix_revids - backend_revids
            changed = add_revids or del_revids
            add_revids = [(revids_mountpoints[revid], revid) for revid in add_revids]
            del_revids = [(revids_mountpoints[revid], revid) for revid in del_revids]
            self._modify_index(index_all, self.schemas[ALL_REVS], self.wikiname, add_revids, 'add')
            self._modify_index(index_all, self.schemas[ALL_REVS], self.wikiname, del_revids, 'delete')

            backend_latest_names_revids = set(self._find_latest_names_revids(index_all))
        finally:
            index_all.close()
        index_latest = open_dir(index_dir, indexname=LATEST_REVS)
        try:
            # now update LATEST_REVS index:
            with index_latest.searcher() as searcher:
                ix_revids = set(doc[REVID] for doc in searcher.all_stored_fields())
            backend_latest_revids = set(revid for name, revid in backend_latest_names_revids)
            upd_revids = backend_latest_revids - ix_revids
            upd_revids = [(revids_mountpoints[revid], revid) for revid in upd_revids]
            self._modify_index(index_latest, self.schemas[LATEST_REVS], self.wikiname, upd_revids, 'update')
            self._modify_index(index_latest, self.schemas[LATEST_REVS], self.wikiname, del_revids, 'delete')
        finally:
            index_latest.close()
        return changed
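
A toy illustration of the set arithmetic this method relies on, with made-up revids:

backend_revids = {"r1", "r2", "r3"}      # revids currently in the backend
ix_revids = {"r2", "r3", "r4"}           # revids currently in the index
add_revids = backend_revids - ix_revids  # {"r1"}: missing, gets added
del_revids = ix_revids - backend_revids  # {"r4"}: stale, gets deleted
changed = bool(add_revids or del_revids)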
Code example #8
File: index.py Project: huyx/contentmanagement
    def _init_index(self, reset=False):
        index_path = os.path.join(jupyter_data_dir(), "index")

        # clear out old index if requested
        if reset:
            shutil.rmtree(index_path, True)

        # make sure there's a path to store the index data
        if not os.path.exists(index_path):
            os.makedirs(index_path)

        if not exists_in(index_path):
            # create an index with the current schema
            analyzer = ChineseAnalyzer()
            schema = Schema(
                basename=TEXT(stored=True, field_boost=5.0, analyzer=analyzer),
                dirname=ID(stored=True, analyzer=analyzer),
                path=ID(stored=True, unique=True, analyzer=analyzer),
                content=TEXT(stored=False, analyzer=analyzer),
                time=STORED,
            )
            self.ix = create_in(index_path, schema)
        else:
            # open the existing index
            self.ix = open_dir(index_path)

        # build a query parser based on the current schema
        self.query_parser = MultifieldParser(["content", "basename", "dirname"], self.ix.schema)
Code example #9
File: views.py Project: msillence/Internal-Tools
	def get(self):
	
		wikiResults = None
		jobResults = None
		projectResults = None	
	
		if 'searchScope' in request.args and 'searchTerm' in request.args:	
			
			searchTerm = request.args.get('searchTerm')	
			searchScope = request.args.get('searchScope')	
			index = open_dir('app/search/index')
			parser = QueryParser("content", schema=index.schema)
				
			with index.searcher() as searcher:
			
				if searchScope in ['everything', 'wiki']:
					wikiResults = [{'title':result['title'], 'url':'http://jhcwiki.jhc.co.uk/wiki/index.php/' + result['title'].replace(' ', '_')} for result in searcher.search(parser.parse(searchTerm), limit=200) if result['type'] == 'WIKI']
				
				if searchScope in ['everything', 'jobs']:
					jobResults = [{'title':result['title'], 'url':''} for result in searcher.search(parser.parse(searchTerm), limit=200) if result['type'] == 'JOB']

				if searchScope in ['everything', 'projects']:	
					projectResults = [{'title':result['title'], 'url':url_for('projects.projectDetail', projectCode = result['title'].split('-')[0].strip())} for result in searcher.search(parser.parse(searchTerm), limit=200) if result['type'] == 'PROJECT']
		else:
			searchTerm = ''	
			searchScope = 'everything'
			
		return render_template('search/search.html', wikiResults=wikiResults, jobResults=jobResults , projectResults=projectResults, searchTerm=searchTerm, searchScope=searchScope, title="Search")	
Code example #10
File: search.py Project: freesci/biostar-test
def add_batch(posts):
    ix = index.open_dir(settings.WHOOSH_INDEX)
    wr = ix.writer(limitmb=50)
    for post in posts:
        update(post, handler=wr)
    wr.commit()
    ix.close()
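
If add_batch can run while another process holds the index lock, whoosh.writing.AsyncWriter is one option: instead of raising a lock error it retries the write on a background thread. A hedged variant of the function above, reusing the same settings and update helper:

from whoosh import index
from whoosh.writing import AsyncWriter

def add_batch_async(posts):
    ix = index.open_dir(settings.WHOOSH_INDEX)
    wr = AsyncWriter(ix)  # falls back to a delayed background write if locked
    for post in posts:
        update(post, handler=wr)
    wr.commit()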
Code example #11
File: search_engine.py Project: jartieda/wstore
    def update_index(self, offering):
        """
        Update the document of a concrete offering in the search index
        """

        if not os.path.exists(self._index_path) or os.listdir(self._index_path) == []:
            raise Exception('The index does not exist')

        index = open_dir(self._index_path)

        index_writer = index.writer()
        text = self._aggregate_text(offering)
        purchasers_text = self._aggregate_purchasers(offering)

        in_date = None
        if offering.state == 'uploaded':
            in_date = offering.creation_date
        else:
            in_date = offering.publication_date

        # Get the document
        index_writer.update_document(
            id=unicode(offering.pk),
            owner=unicode(offering.owner_organization.pk),
            content=unicode(text),
            name=unicode(offering.name),
            popularity=Decimal(offering.rating),
            date=in_date,
            state=unicode(offering.state),
            purchaser=purchasers_text
        )

        index_writer.commit()
Code example #12
File: bquery.py Project: bgshin/irqa
def batch_query(querypath, indexpath):
    ix = index.open_dir(indexpath)

    with open(querypath, 'r') as fh:
        with open('output.txt', 'w') as out_file:
            rawdata = fh.read()
            document = get_section2(rawdata, "<top>", "</top>")
            for d in document:
                topicnum = get_section(d, "<num> Number:", "<title>")[0].strip(" ").strip("\n")
                title = get_section(d, "<title>", "<desc>")[0].strip(" ").strip("\n")
                desc = get_section(d, "<desc>", "<narr>")[0].replace("Description:","").strip(" ")
                narr = get_section(d, "<narr>", "</top>")[0].replace("Narrative:","").strip(" ")

                print topicnum, title, desc, narr

                with ix.searcher() as searcher:
                    parser = qparser.QueryParser("content", schema=ix.schema,
                                 group=qparser.OrGroup)

                    query = parser.parse(desc+" "+title)

                    # query = QueryParser("content", ix.schema)
                    results = searcher.search(query, limit=1000)
                    print results[0]
                    print results[1]
                    # return

                    for i in range(1000):
                        out_file.write("%s\tQ0\t%s\t%s\t%s\t jack\n" % (topicnum, results[i].values()[1], results[i].rank+1, results[i].score))
Code example #13
	def run(self):
		# open index
		self.buffer = deque(maxlen=BUFFERLINES)
		if not exists(self.indexdir):
			makedirs(self.indexdir)
			self.ix = create_in(self.indexdir, SCHEMA)
		else:
			if exists_in(self.indexdir): self.ix = open_dir(self.indexdir)
			else: self.ix = create_in(self.indexdir, SCHEMA)
		self.qp = QueryParser("content", self.ix.schema)
		self.searcher = self.ix.searcher()
		index_p = self.index_p
		while True:
			try:
				# check index_p
				try:
					type, data = index_p.recv()
				except EOFError: break
				try:
					if type == QUERY: self._processSearch(data)
					elif type == LOG: self._processLog(data)
					elif type == RENAME: self._processRename(data)
					else:
						prnt("Unexpected data in logindexsearch.")
				except:
					print_exc()
					prnt("EXCEPTION in logindexsearch process.")
			except KeyboardInterrupt:
				break
		self._dumpBuffer(self.buffer)
		self.searcher.close()
		self.ix.close()	
Code example #14
File: search.py Project: UKTradeInvestment/navigator
def perform_category_query(query_words):
    """
    Perform a query using the supplied words on the product category index
    """

    indexdir = settings.WHOOSH_INDEX_DIR
    ix = open_dir(indexdir)
    query = QueryParser("sub_category", ix.schema).parse(query_words)

    categories = {}
    suggestion = None

    with ix.searcher() as searcher:
        results = searcher.search(query)

        for result in results:
            if result['category'] not in categories:
                categories[result['category']] = [result['sub_category']]
            else:
                categories[result['category']] += [result['sub_category']]

        corrected = searcher.correct_query(query, query_words)
        if hasattr(corrected.query, 'text') and corrected.query.text != query_words:
            suggestion = corrected.string

    return categories, suggestion
Code example #15
File: bquery.py Project: bgshin/irqa
def query(indexpath):
    ix = index.open_dir(indexpath)

    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse("test")
        results = searcher.search(query)
        print results[0]
Code example #16
File: leofts.py Project: Armagedoom/leo-editor
 def __init__(self, idx_dir):
     self.idx_dir = idx_dir
     if not os.path.exists(idx_dir):
         os.mkdir(idx_dir)
         self.create()
     else:
         self.ix = open_dir(idx_dir)
Code example #17
File: bench.py Project: MapofLife/MOL
    def indexer(self, create=True):
        schema = self.bench.spec.whoosh_schema()
        path = os.path.join(self.options.dir, "%s_whoosh" % self.options.indexname)

        if not os.path.exists(path):
            os.mkdir(path)
        if create:
            ix = index.create_in(path, schema)
        else:
            ix = index.open_dir(path)

        poolclass = None
        if self.options.pool:
            poolclass = find_object(self.options.pool)

        kwargs = dict(limitmb=int(self.options.limitmb), poolclass=poolclass,
                      dir=self.options.tempdir, procs=int(self.options.procs),
                      batchsize=int(self.options.batch))

        if self.options.expw:
            from whoosh.filedb.multiproc import MultiSegmentWriter
            self.writer = MultiSegmentWriter(ix, **kwargs)
        else:
            self.writer = ix.writer(**kwargs)

        self._procdoc = None
        if hasattr(self.bench.spec, "process_document_whoosh"):
            self._procdoc = self.bench.spec.process_document_whoosh
Code example #18
File: search.py Project: droodle/eureka-opensource
    def __init__(self, **kwargs):
        super(WhooshEngine, self).__init__()

        analyzer = (
            StemmingAnalyzer()
            | CharsetFilter(accent_map)
            | NgramFilter(minsize=4, maxsize=10)
        )
        self.schema = Schema(
            id=ID(stored=True),
            title=TEXT(stored=True, field_boost=5.0, analyzer=analyzer),
            firstname=TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
            lastname=TEXT(stored=True, field_boost=2.0, analyzer=analyzer),
            type=ID(stored=True),
            description=TEXT(stored=True, analyzer=analyzer),
            creators=TEXT(stored=False, analyzer=analyzer),
            tags=TEXT(stored=False, analyzer=analyzer),
            business_unit=TEXT(stored=False, analyzer=analyzer),
            position=TEXT(stored=False, analyzer=analyzer),
            competencies=TEXT(stored=False, analyzer=analyzer),
            text=TEXT(stored=True, analyzer=analyzer))

        self.dir = kwargs['dir']
        if not os.path.exists(self.dir):
            os.makedirs(self.dir)
        try:
            self._index = open_dir(self.dir)
        except EmptyIndexError:
            self._index = create_in(self.dir, self.schema)
Code example #19
File: views.py Project: paulopontesm/METI_EADW-libra
def home(request):
    template = 'main_content.html'
    ix = index.open_dir(settings.WHOOSH_INDEX)
    hits = []        
    newsfound = []
    query = request.GET.get('q', None)
    if query is not None and query != u"":
        # Whoosh don't understands '+' or '-' but we can replace
        # them with 'AND' and 'NOT'.
        query = query.replace('+', ' AND ').replace(' -', ' NOT ')
        parser = QueryParser("content", schema=ix.schema, group=OrGroup)
        try:
            qry = parser.parse(query)
        except:
            # don't show the user weird errors only because we don't
            # understand the query.
            # parser.parse("") would return None
            qry = None
        if qry is not None:
            searcher = ix.searcher()
            hits = searcher.search(qry)
        for h in hits:
            if News.objects.filter(pk=int(h["id"])).exists():
                newsfound.append(News.objects.get(pk=int(h["id"])))
    return render(request, template,
                              {'query': query, 'hits': newsfound})
Code example #20
File: mikiedit.py Project: albfan/mikidown
    def __init__(self, parent=None):
        super(MikiEdit, self).__init__(parent)
        self.parent = parent
        self.settings = parent.settings
        self.setFontPointSize(12)
        self.setVisible(False)
        self.ix = open_dir(self.settings.indexdir)

        # Spell checker support
        try:
            import enchant
            enchant.Dict()
            self.speller = enchant.Dict()
        except ImportError:
            print("Spell checking unavailable. Need to install pyenchant.")
            self.speller = None
        except enchant.errors.DictNotFoundError:
            print("Spell checking unavailable. Need to install dictionary (e.g. aspell-en).")
            self.speller = None

        self.imageFilter = ""
        self.documentFilter = ""
        for ext in self.settings.attachmentImage:
            self.imageFilter += " *" + ext
        for ext in self.settings.attachmentDocument:
            self.documentFilter += " *" + ext
        self.imageFilter = "Image (" + self.imageFilter.strip() + ")"
        self.documentFilter = "Document (" + self.documentFilter.strip() + ")"

        self.downloadAs = ""
        self.networkManager = QNetworkAccessManager()
        self.networkManager.finished.connect(self.downloadFinished)
Code example #21
File: search.py Project: dgquintas/my-code-samples
 def _get_index(index_path, schema):
     if index.exists_in(index_path):
             return index.open_dir(index_path)
     else:
         if not os.path.exists(index_path):
             os.mkdir(index_path)
         return index.create_in(index_path, schema)
Code example #22
    def search(self, ix=None):

        if ix is None:
            ix = open_dir(self.dir_name)

        self.searcher = ix.searcher()
        fields = []
        qs = ''

        # We use parenthesis to prevent operators like OR used in source
        # to affect target
        if self.source is not None and len(self.source) > 0:
            qs += u' source:({0})'.format(self.source)
            fields.append("source")

        if self.target is not None and len(self.target) > 0:
            qs += u' target:({0})'.format(self.target)
            fields.append("target")

        if self.project is not None and self.project != 'tots' and len(self.project) > 0:
            if self.project == 'softcatala':
                qs += u' softcatala:true'
                fields.append("softcatala")
            else:
                if ',' in self.project:
                    projects = self.project.split(',')
                    val = ''.join(["'{0}',".format(project) for project in projects])
                    val = val[:-1].replace(',', ' OR ')
                    qs += u' project:({0})'.format(val)
                else:
                    qs += u' project:(\'{0}\')'.format(self.project)

                fields.append("project")

        self.query = MultifieldParser(fields, ix.schema).parse(qs)
Code example #23
File: search.py Project: abdeldayemalimadany/Margea
def search_toc(words, index_path):
    words = _strip_diacritics(words)
    " ".join(words.split())  # remove multiple spaces
    ix = open_dir(index_path, indexname="toc-index")
    parser = qparser.QueryParser("title", ix.schema, group=qparser.AndGroup)
    words = suffix_query(words)  # prefix and suffix
    query = parser.parse(words)
    final_result = []
    with ix.searcher() as searcher:
        results = searcher.search(query)  # int(page_number), pagelen=10
        total_count = len(results)
        for hit in results:
            title = hit["title"]
            parent_title = hit["parent_title"]
            full_title = shorten(title, 10)
            if len(parent_title) > 0:
                full_title = shorten(parent_title, 10) + " : " + shorten(title, 10)

            final_result.append(
                {
                    "title": full_title,
                    "serial": hit["serial"].encode("utf-8"),  #  "None" if case if not leaf
                    "topicid": hit["topicid"].encode("utf-8"),
                }
            )

    return json.dumps({"count": total_count, "result": final_result}, ensure_ascii=False)
Code example #24
File: utils.py Project: MechanisM/djoosh
def search_index(search_model, query, fields=[], limit=None):
    ix = index.open_dir(search_model.get_path())
    fields = fields or search_model.fields
    hits = []
    query = smart_unicode(query)
    
    limit = limit or getattr(settings, 'DJOOSH_SEARCH_LIMIT', 100)
    
    if query and fields:
        query = query.replace('+', ' AND ').replace('|', ' OR ')
        parser = qparser.MultifieldParser(fields, schema=ix.schema)
        try:
            qry = parser.parse(query)
        except Exception:
            qry = None

        if qry:
            searcher = ix.searcher()
            try:
                hits = searcher.search(qry, limit=limit)
            except Exception:
                hits = []
    
    ix.close()
    return hits
Code example #25
 def __init__(self, db_path):
     ensuredir(db_path)
     if index.exists_in(db_path):
         self.index = index.open_dir(db_path)
     else:
         self.index = index.create_in(db_path, schema=self.schema)
     self.qparser = QueryParser('text', self.schema)
Code example #26
File: utils.py Project: MechanisM/djoosh
def update_index(search_model, obj=None, created=True):
    ixpath = search_model.get_path()
    ix = index.open_dir(ixpath)
    
    if obj:
        objects = [obj]
    else:
        objects = search_model.model.objects.all()
    
    writer = ix.writer()
    
    for obj in objects:
        fields = {}
        for field in search_model.fields:
            fields[field] = smart_unicode(getattr(obj, field, ''))
        if created:
            try:
                writer.update_document(**fields)
            except:
                pass
        else:
            try:
                writer.add_document(**fields)
            except:
                pass
            
    writer.commit()
    ix.close()
Code example #27
File: index.py Project: hmark/viddle
    def init(self):
        """Index and writer object initialization.
		"""

        target = os.path.dirname(__file__) + "/../data/indexcopy"
        self.index = open_dir(target)
        self.writer = self.index.writer()
Code example #28
File: index.py Project: sbrunner/edocuments
    def __init__(self):
        self.directory = os.path.join(edocuments.root_folder, '.index')
        self.dirty = False
        schema = Schema(**{
            PATH: ID(stored=True, unique=True),
            CONTENT: TEXT(stored=True),
            DATE: STORED,
            DIRECTORY: STORED,
            MD5: TEXT(stored=True),
        })
        self.parser_path = QueryParser("path_id", schema)
        self.parser_content = QueryParser("content", schema)

        if not exists_in(self.directory):
            os.makedirs(self.directory)
            self.index = create_in(self.directory, schema)
        else:
            self.index = open_dir(self.directory)
            if 'path' in self.index.schema.names():
                with self.index.writer() as writer:
                    writer.remove_field('path')
            if 'directory' not in self.index.schema.names():
                with self.index.writer() as writer:
                    writer.add_field('directory', STORED)
            if 'md5' not in self.index.schema.names():
                with self.index.writer() as writer:
                    writer.add_field('md5', TEXT(stored=True))
            print(
                'Field length:\npath: %i\ncontent: %i\nmd5: %i' % (
                    self.index.field_length("path_id"),
                    self.index.field_length("content"),
                    self.index.field_length("md5"),
                )
            )
Code example #29
File: custom.py Project: sptramer/azure-cli
def _get_index(cli_ctx):
    from whoosh import index

    # create index if it does not exist already
    if not os.path.exists(INDEX_PATH):
        _create_index(cli_ctx)
    return index.open_dir(INDEX_PATH)
Code example #30
File: search.py Project: abhinaykumar/hasjob
def delete_from_index(oblist):
    ix = index.open_dir(app.config['SEARCH_INDEX_PATH'])
    writer = ix.writer()
    for item in oblist:
        mapping = item.search_mapping()
        writer.delete_by_term('idref', mapping['idref'])
    writer.commit()
Code example #31
    return Schema(related_vars=TEXT(stored=True),
                  name=NGRAMWORDS(stored=True, minsize=3, maxsize=12, at='start', queryor=True),
                  description=TEXT(stored=True),
                  section=TEXT(stored=True),
                  section_title=TEXT(stored=True),
                  related_attrs=TEXT(stored=True),
                  params=TEXT(stored=True))

if __name__ == '__main__':
    print("Building index...")
    if not os.path.exists(VAR_INDEX_DIR):
        os.mkdir(VAR_INDEX_DIR)
        ix = index.create_in(VAR_INDEX_DIR, get_schema())
        print("Creating variables index...")

    ix = index.open_dir(VAR_INDEX_DIR)
    writer = ix.writer()

    all_vars = [
        [u'adult_obesity,diabetes', u'obesity', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None],
        [u'adult_obesity,diabetes', u'diabetes', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare',  u'geo', None],
        [u'adult_obesity,diabetes', u'healthcare', u'Obesity Prevalence,Diabetes Prevalence', u'conditions_diseases', u'Healthcare', u'geo', None],
        [u'motor_vehicle_crash_deaths', u'car crashes', u'Motor Vehicle Crash Deaths', u'risky', u'Crime', u'geo', None],
        [u'motor_vehicle_crash_deaths', u'accidents', u'Motor Vehicle Crash Deaths', u'risky', u'Crime', u'geo', None],

        [u'adult_smoking', u'smokers', u'Adult Smoking Prevalence', u'risky', u'Healthcare', u'geo', None],
        [u'adult_smoking', u'cigarettes', u'Adult Smoking Prevalence', u'risky', u'Healthcare', u'geo', None],

        # [u'infant_mortality', u'infant mortality', u'Infant mortality', u'health', u'geo'],
        # [u'teen_births', u'teen births', u'Teen births', u'health', u'geo'],
        [u'mean_commute_minutes', u'commuters', u'Average Travel Time', u'commute_time', u'Transportation', u'geo', None],
Code example #32
import whoosh.index as index
from whoosh import columns, fields, index, sorting
from whoosh.qparser import QueryParser

# ix = index.open_dir("./")
# facet = sorting.FieldFacet("id", reverse=True)
# searcher = ix.searcher()
#
# searchwords = "新西兰"
# qp = QueryParser("gtitle", schema=ix.schema)
# q = qp.parse(searchwords)
# results = searcher.search(q, sortedby=facet)
# for each in results:
#     print(each)

from whoosh.qparser import QueryParser
from whoosh.index import open_dir
from whoosh.sorting import FieldFacet

new_list = []
index = open_dir("./index/", indexname='goods')  # 读取建立好的索引
with index.searcher() as searcher:
    parser = QueryParser("gtitle", index.schema)  # 要搜索的项目,比如“phone_name
    myquery = parser.parse("鸭蛋")
    facet = FieldFacet("id", reverse=True)  # sort the results by this field, descending
    results = searcher.search(
        myquery, limit=None, sortedby=facet)  # limit caps the result count (default 10); see the official docs linked at the top of the blog post
    for result1 in results:
        print(dict(result1))
        new_list.append(dict(result1))
Code example #33
# coding=utf-8
from whoosh.index import open_dir
from whoosh.qparser import MultifieldParser

idx_dir = 'lagou_idx'
ix = open_dir(idx_dir)
searcher = ix.searcher()

parser = MultifieldParser(["name", "desc"], schema=ix.schema)

# Query mixing free text, OR, and a field-qualified term.
k = u'搜索 OR Python city:上海'
q = parser.parse(k)

results = searcher.search_page(q, 1, pagelen=5)

print(u'{0} results found for keyword {1}, {2} returned: '.format(
    len(results), k, results.scored_length()))
for hit in results[:50]:
    print(hit['id'])
    print(hit['name'])
    # print(hit['city'])
    print(hit['com_name'])
    print('************')
Code example #34
#!/usr/bin/env python

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir("index_dir")
with ix.searcher() as searcher:
    text = input("Dime:")
    while len(text) > 0:
        query = QueryParser("content", ix.schema).parse(text)
        results = searcher.search(query)
        for r in results:

            print (r)
#        print (dir(results))
#        print (results.docs)
        text = input("Dime más:")
Code example #35
def search_tweets(search_term, limit=5, restrict_to_user=None):
    index = open_dir("indexdir")
    if restrict_to_user is not None:
        restrict_to_user = QueryParser('user',
                                       index.schema).parse(restrict_to_user)
    return _do_search(index, search_term, limit, restrict_to_user)
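
The _do_search helper is not shown in this example. A hypothetical sketch of what it might look like, passing the parsed user query through Whoosh's filter argument; the 'text' field name is an assumption:

from whoosh.qparser import QueryParser

def _do_search(index, search_term, limit, filter_query=None):
    with index.searcher() as searcher:
        q = QueryParser('text', index.schema).parse(search_term)
        # filter restricts hits to documents that also match filter_query;
        # materialize the hits before the searcher closes
        return [dict(hit) for hit in searcher.search(q, limit=limit, filter=filter_query)]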
Code example #36
File: task2.py Project: ashishpatel26/codelibrary
query = ["t1"]  # query is given as a list of terms


class MyBM25Scorer(MyBaseScorer):
    def __init__(self, index_dir):
        MyBaseScorer.__init__(self, index_dir)

    def score_term(self, field, t, ftq, qlen, ftd, doclen):
        # TODO
        return 0


if __name__ == "__main__":

    # Self scorer
    print "Our ranking:"
    scorer = MyBM25Scorer(index_dir)
    scorer.score_all(query, field)
    scorer.close()

    # Whoosh scorer
    print "Whoosh ranking:"
    ix = index.open_dir(index_dir)
    with ix.searcher(weighting=scoring.BM25F()) as searcher:
        qp = qparser.QueryParser(field, schema=ix.schema)
        q = qp.parse(" ".join(query))  # we contatenate query terms
        results = searcher.search(q)
        for r in results:
            print r['id'], str(r.score)[0:6]
    ix.close()
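
The score_term TODO above would typically compute the classic Okapi BM25 term score. A hedged sketch of that formula as a standalone helper; it assumes access to the term's document frequency, the collection's document count, and the average document length, none of which appear in the snippet:

import math

def bm25_term_score(ftd, doclen, df, num_docs, avgdl, k1=1.2, b=0.75):
    # idf with the usual +0.5 smoothing; tf saturates via k1 and is
    # length-normalized via b
    idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
    tf = (ftd * (k1 + 1)) / (ftd + k1 * (1 - b + b * doclen / avgdl))
    return idf * tf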
Code example #37
	def index(self):
		self.indexer = open_dir("index")
		self.review_indexer = open_dir("reviews_index")
Code example #38
File: ir_server.py Project: kifish/SMP-MCC2020
import jieba
import random

from whoosh.query import Term
from whoosh.qparser import QueryParser, MultifieldParser
from whoosh.index import open_dir

import requests
from flask import Flask, jsonify, request

from config import CONFIG

app = Flask(__name__)

# 加载索引文件
IDX = open_dir(dirname=CONFIG.get('IR_DIR'))


def search(context='你喜欢看什么电影?',
           topic='电影',
           method='sampling',
           limit=10,
           data=None,
           rtype='test'):
    """检索函数
    :param context: 对话历史
    :param topic: 对话主题
    :param method: 对话对构造方法
    :param limit: 返回结果个数
    """
    with IDX.searcher() as searcher:
Code example #39
indexpath = Configuration.getIndexdir()

from whoosh.index import create_in, exists_in, open_dir
from whoosh.fields import Schema, TEXT, ID

schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True, unique=True),
                content=TEXT)

if not os.path.exists(indexpath):
    os.mkdir(indexpath)

if not exists_in(indexpath):
    ix = create_in(indexpath, schema)
else:
    ix = open_dir(indexpath)


def dumpallcveid(entry=None):
    cveid = []
    if entry is None:
        for x in collection.find({}).sort('_id', 1):
            cveid.append(x['id'])
    else:
        for x in collection.find({}).sort("Modified", -1).limit(int(entry)):
            cveid.append(x['id'])
    return cveid


def getcve(cveid=None):
    if cveid is None:
Code example #40
def ChineseAnalyzer():
    return ChineseTokenizer()


analyzer = ChineseAnalyzer()

schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))

if not os.path.exists("index"):
    os.mkdir("index")

ix = index.create_in("index", schema)
ix = index.open_dir("index")

writer = ix.writer()

for pid in xrange(len(pid_p_r)):
    writer.add_document(title=str(pid).decode("utf-8"),
                        path=u"/" + str(pid).decode("utf-8"),
                        content=pid_p_r[pid][0].decode("utf-8"))

writer.commit()


def find(text):
    og = qparser.OrGroup.factory(0.9)
    parser = qparser.QueryParser("content", schema, group=og)
    with ix.searcher() as searcher:
Code example #41

def SearchConcept(Concept, Parser):
    s = myindex.searcher()
    Found = []
    for V in Variants(Concept):
        q = Parser.parse(V)
        Found += [
            str(x)[:-3].split('wiki')[-1].split("'")[0]
            for x in list(s.search(q, limit=None))
        ]
    return Found


Concepts = readtxt('DutchDataUMLS/Concepts_UMLS.txt')[1:153862]
myindex = index.open_dir("WikiIndex")
qp = QueryParser("content", schema=myindex.schema)
FoundConcepts = []
c = 1

OutFile = open('FoundConcepts.txt', 'a')

for C in Concepts:
    print(str(c) + " of " + str(len(Concepts)) + " processed.\n")
    Found = SearchConcept(C[1], qp)
    if Found != []:
        FoundConcepts.append((C[0], C[1], Found))
        OutFile.write(C[0] + '\t' + C[1] + '\t' + str(Found) + '\n')
    c += 1

OutFile.close()
Code example #42
File: txt2.py Project: xiaofengzi2019/test2
writer = ix.writer()  
for fn in traverseFile("shegongku_db"):
    with open(fn, 'r', encoding='utf-8') as f:
        print(fn, "...")
        lines=0
        while True:
            line1 = f.readline()
            if line1:
                writer.add_document(full_line=line1)
                lines += 1
            else:
                break
        print(fn, lines, "added")
writer.commit()  
print("index finished")
# the code above builds the index

index1 = open_dir("shegongku_idx", indexname='allin1line')
parser1 = QueryParser("full_line", index1.schema)
while True:
    with index1.searcher() as searcher:  
        print("pls input what u want to search:")
        key = input()
        myquery = parser1.parse(key)
        resultss = searcher.search(myquery, limit=2000)
        #print(type(resultss))
        for result1 in resultss:  
            d1=dict(result1)['full_line']
            print(d1)

Code example #43
            lastTitle = child
            textCur = []

        addDocument(f, section, lastTitle, textCur)

    writer.commit()


#---------------------------------------------------------------------------------------------

if __name__ == '__main__':
    BuildHelpIndex()

    from whoosh.qparser import QueryParser
    ix = open_dir(indexDir, readonly=True)

    with ix.searcher() as searcher, open('search.html', 'w') as f:
        query = QueryParser('content', ix.schema).parse(u'fastest lap')
        results = searcher.search(query, limit=20)
        f.write(
            '<table><tr><th></th><th align="left">Section</th><th align="left">Match</th></tr>\n'
        )
        for i, hit in enumerate(results):
            f.write(
                '<tr><td align="left">%d.</td><td><a href="%s">%s</a></td><td>%s</td></tr>\n'
                % ((i + 1), hit['path'], hit['section'],
                   hit.highlights('content')))
        f.write('</table>\n')

    ix.close()
Code example #44
    def __init__(self):
        self.ix = index.open_dir("indexdir")
Code example #45
        text = '\n'.join(chunk for chunk in chunks if chunk)
        text = text.replace('\n', ' ').lower()
        hash_text = hashlib.md5(text.encode())
        return (text, hash_text.hexdigest(), ' ', str(soup.title.string),
                html['url'])
    if len(keys) == 3:
        text = html['body']
        hash_text = hashlib.md5(text.encode())
        return (text, hash_text.hexdigest(), html['tags'], ' ', html['url'])
    if len(keys) == 4:
        return (html['body'], html['hash'], html['tags'], ' ', html['url'])
    return (html['body'], html['hash'], html['tags'], html['title'],
            html['url'])


ix = index.open_dir("mesh_index")
# qp = QueryParser("body", schema=ix.schema)
# query = "Cell"
# q = qp.parse(query)
searcher = ix.searcher().documents()
writer = ix.writer()
i = 0
for doc in searcher:
    text, hash, tags, title, url = parse_html(doc)
    writer.update_document(url=url,
                           body=text,
                           tags=tags,
                           hash=hash,
                           title=title)
    print(str(i + 1) + ' completed.')
    i += 1
Code example #46
        url=u"https://media.giphy.com/media/czwo5mMtaknhC/giphy.gif",
        tags=u"done finally free react")
    writer.add_document(
        url=u"https://media.giphy.com/media/qnOBmH70CGSVa/giphy.gif",
        tags=u"clap welldone goodjob applaud")
    writer.commit()


def search_query(ix, q):
    with ix.searcher() as searcher:
        query = QueryParser("tags", ix.schema).parse(q)
        results = searcher.search(query)
        file_path = [(r["url"], i) for i, r in enumerate(results)]
        #print(file_path)
        if len(file_path) > 1:
            file_path = [random.choice(file_path)]
        #return [(r["path"],i) for i,r in enumerate(results)]
        return (file_path)


if __name__ == '__main__':
    s = make_index()
    ix = open_dir("index")
    add_docs(ix)
    result = search_query(ix, 'react')
    print(result)
    #print(ix.schema)
    #results = ix.searcher().search(Every('tags'))
    #for result in results:
    #    print (result['url'])
Code example #47
def get_item_count(dirs):
    ix = index.open_dir(os.path.join(baseindexpath, dirs))
    return ix.doc_count_all()
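
For reference, doc_count_all() also counts documents that are deleted but not yet purged from the segments; a variant that excludes them would use doc_count():

def get_live_item_count(dirs):
    ix = index.open_dir(os.path.join(baseindexpath, dirs))
    return ix.doc_count()  # excludes deleted-but-unpurged documents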
Code example #48
# -*- coding: UTF-8 -*-
import sys
import os
sys.path.append("../")
from whoosh.index import create_in, open_dir
from whoosh.fields import *
from whoosh.qparser import QueryParser

from jieba.analyse import ChineseAnalyzer

analyzer = ChineseAnalyzer()

schema = Schema(title=TEXT(stored=True),
                path=ID(stored=True),
                content=TEXT(stored=True, analyzer=analyzer))
if not os.path.exists("tmp"):
    os.mkdir("tmp")
ix = open_dir("tmp")

searcher = ix.searcher()
parser = QueryParser("content", schema=ix.schema)

for keyword in (u"水果小姐", u"你", u"first", u"中文", u"交换机", u"交换", u"少林", u"乔峰"):
    print "result of ", keyword
    q = parser.parse(keyword)
    results = searcher.search(q)
    for hit in results:
        print hit.highlights("content")
    print "=" * 10
Code example #49
def phone_search(request):
    if request.method == "POST":
        print(request.body)
        data = json.loads(request.body)
        sessionId = data.get("sessionId")
        print(sessionId)
        decision_arr = securer(sessionId, request.headers["user_agent"])
        if decision_arr[0] == False:
            return JsonResponse(decision_arr[1])
        else:
            phone_name = data.get("phone_name")
            if phone_name == None:
                return JsonResponse({"error": "no phone_name"})
            ix = index.open_dir("index")
            qp = QueryParser("phone_name", schema=ix.schema)
            q = qp.parse(phone_name)
            retrieved_phones_ = []
            with ix.searcher() as s:
                retrieved_phones = s.search(q)
                if len(retrieved_phones) != 0:
                    for retrieved_phone in retrieved_phones:
                        retrieved_phones_.append(retrieved_phone["phone_name"])

            retrieved_phones__ = []
            q = qp.parse(' '.join(phone_name.split(" ")[1:]))
            with ix.searcher() as s:
                retrieved_phones = s.search(q)
                if len(retrieved_phones) != 0:
                    for retrieved_phone in retrieved_phones:
                        retrieved_phones__.append(
                            retrieved_phone["phone_name"])

            retrieved_phones = []
            total_phones = retrieved_phones__ + retrieved_phones_

            print(retrieved_phones_)
            print(retrieved_phones__)

            for phone_ in total_phones:
                if phone_ not in retrieved_phones:
                    retrieved_phones.append(phone_)
            print(len(retrieved_phones))

            # special case for iphone X
            if phone_name == "Apple iPhone X":
                for phone_ in retrieved_phones:
                    if "8" in phone_.split(" "):
                        retrieved_phones.pop(retrieved_phones.index(phone_))
                    elif "7" or "iPhone\xa07" or "iPhone\xa07" in phone_.split(
                            " "):
                        retrieved_phones.pop(retrieved_phones.index(phone_))
                    elif "6" in phone_.split(" "):
                        retrieved_phones.pop(retrieved_phones.index(phone_))

                retrieved_phones = retrieved_phones[:3]

            phones_json = [{
                "image":
                phone_name.lower().replace(" ", "_") + ".jpg"
            }, [], {}]
            for i, phone in enumerate(retrieved_phones):
                phones = ScrapedSmartphone.objects.filter(data__name=phone)
                for search_phone in phones:
                    not_there = True
                    for phone_json in phones_json[1]:
                        if phone_json["url"] == search_phone.data["url"]:

                            not_there = False
                    if not_there == True:
                        phones_json[1].append(search_phone.data)
                        try:
                            vendor_index = vendor.index(
                                search_phone.data["vendor"])
                            phones_json[1][-1][
                                "brandLogoUrl"] = "http://localhost:8001/brand_images/" + vendor_images[
                                    vendor_index]
                        except ValueError:
                            phones_json[1][-1][
                                "brandLogoUrl"] = "http://localhost:8001/brand_images/daraz.png"
                        if len(phones_json[1][-1]["name"]) > 32:
                            phones_json[1][-1]["name"] = phones_json[1][-1][
                                "name"][:33] + "..."
                        phones_json[1][-1]["price"] = int(
                            float(phones_json[1][-1]["price"]))
            original = OriginalSmartphone.objects.filter(
                data__DeviceName=phone_name)
            print(original)
            if (len(original) > 0):
                original = original[0]
            else:
                return JsonResponse({"error": "invalid phone name"})
            specs = original.data

            specs_to_send = {}
            try:
                specs_to_send["camera"] = specs["triple"]
            except KeyError:
                try:
                    specs_to_send["camera"] = specs["dual_"]
                except KeyError:
                    try:
                        specs_to_send["camera"] = specs["single"]
                    except KeyError:
                        specs_to_send["camera"] = "Photo/Video"

            specs_to_send["ram"] = specs["internal"]
            try:
                specs_to_send["processor"] = specs["cpu"]
            except KeyError:
                try:
                    specs_to_send["processor"] = specs["chipset"]
                except:
                    specs_to_send["processor"] = "No Info"

            specs_to_send["battery"] = specs["battery_c"]
            phones_json[2] = specs_to_send

            phones_json[1] = sorted(phones_json[1], key=itemgetter('price'))
            for specific_phone in phones_json[1]:
                specific_phone["price"] = locale.currency(
                    specific_phone["price"], grouping=True)[1:-3]

            return JsonResponse(phones_json, safe=False)
    else:
        return JsonResponse({"error": "method not allowed"})
Code example #50
File: indexer.py Project: chenhuayz/recipe_manager
 def searcher(self):
     """ Returns a searcher for this index. """
     self._index = index.open_dir(self.index_path)
     return self._index.searcher()
Code example #51
if __name__ == "__main__":
    args = parse_args()
    with open(args.data_src_set, "r", encoding="utf-8") as data_src_r:
        with open(args.data_tgt_set, "r", encoding="utf-8") as data_tgt_r:
            with open(args.test_set, "r", encoding="utf-8") as test_r:
                with open(args.test_set + ".retrive.top-{}".format(args.top_K),
                          "w",
                          encoding="utf-8") as test_w:
                    schema = Schema(source_tok=TEXT(stored=True),
                                    target_tok=TEXT(stored=True),
                                    source=TEXT(stored=True),
                                    target=TEXT(stored=True))
                    if os.path.exists(args.indexdir):
                        print("Loading index dir {}".format(args.indexdir))
                        ix = open_dir(args.indexdir)
                    else:
                        print("reading source data file")
                        data_src_lines = data_src_r.readlines()
                        print("reading target data file")
                        data_tgt_lines = data_tgt_r.readlines()
                        os.mkdir(args.indexdir)
                        print("Crating index dir {}".format(args.indexdir))
                        ix = create_in(args.indexdir, schema)
                        writer = ix.writer()
                        print("Index Build Start")
                        for data_src_line, data_tgt_line in tqdm.tqdm(
                                zip(data_src_lines, data_tgt_lines)):
                            data_src_line_tok = data_src_line.strip().replace(
                                "@@ ", "")
                            data_tgt_line_tok = data_tgt_line.strip().replace(
Code example #52
    def update_file_index(self):
        log.debug(
            (u'STARTING INCREMENTAL INDEXING UPDATE FOR EXTENSIONS %s '
             'AND REPOS %s') % (INDEX_EXTENSIONS, self.repo_paths.keys()))

        idx = open_dir(self.index_location, indexname=self.indexname)
        # The set of all paths in the index
        indexed_paths = set()
        # The set of all paths we need to re-index
        to_index = set()

        writer = idx.writer()
        writer_is_dirty = False
        try:
            with idx.reader() as reader:

                # Loop over the stored fields in the index
                for fields in reader.all_stored_fields():
                    indexed_path = fields['path']
                    indexed_repo_path = fields['repository']
                    indexed_paths.add(indexed_path)

                    if not indexed_repo_path in self.filtered_repo_update_paths:
                        continue

                    repo = self.repo_paths[indexed_repo_path]

                    try:
                        node = self.get_node(repo, indexed_path)
                        # Check if this file was changed since it was indexed
                        indexed_time = fields['modtime']
                        mtime = self.get_node_mtime(node)
                        if mtime > indexed_time:
                            # The file has changed, delete it and add it to
                            # the list of files to reindex
                            log.debug(
                                'adding to reindex list %s mtime: %s vs %s' %
                                (indexed_path, mtime, indexed_time))
                            writer.delete_by_term('fileid', indexed_path)
                            writer_is_dirty = True

                            to_index.add(indexed_path)
                    except (ChangesetError, NodeDoesNotExistError):
                        # This file was deleted since it was indexed
                        log.debug('removing from index %s' % indexed_path)
                        writer.delete_by_term('path', indexed_path)
                        writer_is_dirty = True

            # Loop over the files in the filesystem
            # Assume we have a function that gathers the filenames of the
            # documents to be indexed
            ri_cnt_total = 0  # indexed
            riwc_cnt_total = 0  # indexed with content
            for repo_name, repo in self.repo_paths.items():
                # skip indexing if there aren't any revisions
                if len(repo) < 1:
                    continue
                ri_cnt = 0  # indexed
                riwc_cnt = 0  # indexed with content
                for path in self.get_paths(repo):
                    path = safe_unicode(path)
                    if path in to_index or path not in indexed_paths:

                        # This is either a file that's changed, or a new file
                        # that wasn't indexed before. So index it!
                        i, iwc = self.add_doc(writer, path, repo, repo_name)
                        writer_is_dirty = True
                        log.debug('re indexing %s' % path)
                        ri_cnt += i
                        ri_cnt_total += 1
                        riwc_cnt += iwc
                        riwc_cnt_total += iwc
                log.debug('added %s files %s with content for repo %s' %
                          (ri_cnt + riwc_cnt, riwc_cnt, repo.path))
            log.debug('indexed %s files in total and %s with content' %
                      (ri_cnt_total, riwc_cnt_total))
        finally:
            if writer_is_dirty:
                log.debug('>> COMMITTING CHANGES TO FILE INDEX <<')
                writer.commit(merge=True)
                log.debug('>>> FINISHED REBUILDING FILE INDEX <<<')
            else:
                log.debug('>> NOTHING TO COMMIT TO FILE INDEX <<')
                writer.cancel()
Code example #53
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

runPath = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(runPath, ".."))

from lib.Config import Configuration
from lib.DatabaseLayer import DatabaseLayer

indexpath = Configuration.getIndexdir()

#basepath = os.path.join(os.sep, *os.path.dirname(os.path.realpath(__file__)).rsplit('/')[:-1])
#print (os.path.split(os.path.join(basepath,indexpath)))
schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)

ix = index.open_dir(indexpath)

argParser = argparse.ArgumentParser(
    description='Full text search for cve-search')
argParser.add_argument('-q',
                       action='append',
                       help='query to lookup (one or more)')
argParser.add_argument('-o',
                       action='store_true',
                       help='OR the query terms together (default is AND)')
argParser.add_argument('-t',
                       action='store_true',
                       help='output the title of the matching CVE(s)')
argParser.add_argument('-f',
                       action='store_true',
                       help='output matching CVE(s) in JSON')
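# --- Hedged completion sketch (not part of the original snippet): one
# plausible way the flags above could drive the lookup. OrGroup for -o and
# the title/path output choice are assumptions, not the tool's actual code.
from whoosh.qparser import AndGroup, OrGroup

args = argParser.parse_args()
qp = QueryParser("content", schema=ix.schema,
                 group=OrGroup if args.o else AndGroup)  # AND is the default
with ix.searcher() as searcher:
    for raw in (args.q or []):
        for hit in searcher.search(qp.parse(raw)):
            print(hit["title"] if args.t else hit["path"])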
コード例 #54
0
from whoosh.index import open_dir  # required import, omitted in the original

def Load(indexdir):
    # Open and return an existing Whoosh index stored in indexdir.
    ix = open_dir(indexdir)
    return ix
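# Usage sketch for Load() (the directory name is illustrative):
#   ix = Load("indexdir")
#   print(ix.doc_count())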
コード例 #55
0
from whoosh import index
from whoosh.qparser import QueryParser
# OriginalSmartphone and ScrapedSmartphone are this project's Django models;
# the import path below is an assumption.
from .models import OriginalSmartphone, ScrapedSmartphone


def available_checker(request):
    scraped = 0
    original = 0
    all_original_phones = OriginalSmartphone.objects.all()
    ix = index.open_dir("index")
    qp = QueryParser("phone_name", schema=ix.schema)
    for original_phone in all_original_phones:
        q = qp.parse(original_phone.data["DeviceName"])
        another_trial = False
        with ix.searcher() as s:
            results = s.search(q)
            if len(results) != 0:
                original_phone.available = True
                original_phone.save()
                original += 1

                if original_phone.data["DeviceName"] == "vivo Y65":
                    for i in range(30):
                        print("")
                    print("In First")
                    print(len(results))
                    for result in results:
                        print(result["phone_name"])
                    for i in range(30):
                        print("")

                for result in results:
                    scraped_phones = ScrapedSmartphone.objects.filter(
                        data__name=result["phone_name"])
                    for scraped_phone in scraped_phones:
                        scraped_phone.available = True
                        scraped_phone.belongs_to = original_phone
                        scraped_phone.save()
                        scraped += 1
            else:
                another_trial = True
        if another_trial:
            keyword = original_phone.data["DeviceName"].split(" ")[1:]
            q = qp.parse(" ".join(keyword))
            with ix.searcher() as s:
                results = s.search(q)
                if len(results) != 0:
                    original_phone.available = True
                    original_phone.save()
                    original += 1

                    if original_phone.data["DeviceName"] == "vivo Y65":
                        for i in range(30):
                            print("")
                        print("In Second")
                        for result in results:
                            print(result["phone_name"])
                        for i in range(30):
                            print("")

                    for result in results:
                        scraped_phones = ScrapedSmartphone.objects.filter(
                            data__name=result["phone_name"])
                        for scraped_phone in scraped_phones:
                            scraped_phone.available = True
                            scraped_phone.belongs_to = original_phone
                            scraped_phone.save()
                            scraped += 1

        i = list(all_original_phones).index(original_phone)
        percentage = i * 100 / len(all_original_phones)
        print(percentage)
        print("Original ", original)
        print("Scraped ", scraped)
コード例 #56
0
# Python 2 code (print statements and comma-style except below).
import re
import multiprocessing

from whoosh import query
from whoosh.analysis import RegexTokenizer, StopFilter
from whoosh.index import open_dir
from whoosh.query import spans


def query_thread(queue, database, g_minus_d, e1_type, e2_type, index):
    idx = open_dir(index)
    regex_tokenize = re.compile(r'\w+|-|<[A-Z]+>[^<]+</[A-Z]+>', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    stopper = StopFilter()
    count = 0

    with idx.searcher() as searcher:
        while True:
            r = queue.get_nowait()
            count += 1
            if count % 25000 == 0:
                print multiprocessing.current_process(), count, queue.qsize()

            if len(database[(r.ent1, r.ent2)]) == 0:
                # if its not in the database calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                terms = list()
                for token in stopper(
                        tokenizer((r.between.decode("utf8")), renumber=True)):
                    terms.append(query.Term("sentence", token.text))

                #print terms
                t1 = query.Term("sentence", entity1)
                t3 = query.Term("sentence", entity2)

                query_terms = list()
                query_terms.append(t1)
                for t in terms:
                    query_terms.append(t)
                query_terms.append(t3)

                q1 = spans.SpanNear2(query_terms, slop=2, ordered=True)
                q2 = spans.SpanNear2([t1, t3], slop=8, ordered=True)
                entities_r = searcher.search(q1)
                entities = searcher.search(q2)
                """
                print query_terms, len(entities_r)
                print [t1, t3], len(entities)
                print "\n"
                """

                #print entity1, '\t', r.between, '\t', entity2, len(entities_r), len(entities)

                try:
                    assert not len(entities_r) > len(entities)
                except AssertionError, e:
                    print e
                    print r.sentence
                    print r.ent1
                    print r.ent2
                    print query_terms
                    print[t1, t3]

                if len(entities) > 0:
                    pmi = float(len(entities_r)) / float(len(entities))
                    if pmi >= 0.5:
                        #print entity1, '\t', r.between, '\t', entity2, pmi
                        g_minus_d.append(r)

            # Check queue emptiness at the loop level (the original checked it
            # only inside the PMI branch, so get_nowait() above could raise
            # Queue.Empty once the queue drained on the other branch).
            if queue.empty():
                break
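# --- A minimal, self-contained sketch of the SpanNear2 proximity queries used
# above; the throwaway index, schema, and sample sentence are illustrative
# assumptions, not data from the original project.
import tempfile

from whoosh import index
from whoosh.fields import Schema, TEXT
from whoosh.query import Term

tmpdir = tempfile.mkdtemp()
demo_ix = index.create_in(tmpdir, Schema(sentence=TEXT(stored=True)))
w = demo_ix.writer()
w.add_document(sentence=u"acme corp was founded in new york")
w.commit()

with demo_ix.searcher() as s:
    # Match "acme" followed by "york" with at most 8 positions between them.
    near = spans.SpanNear2([Term("sentence", u"acme"),
                            Term("sentence", u"york")],
                           slop=8, ordered=True)
    print(len(s.search(near)))  # 1: both terms co-occur within the slop window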
コード例 #57
0
ファイル: search1.py プロジェクト: DickJ/data-science
import datetime
import os

import pandas as pd

from whoosh.fields import Schema, ID, TEXT, DATETIME
from whoosh.index import open_dir
from whoosh.index import create_in
from whoosh.query import Term, And, Or
from whoosh.qparser import QueryParser

my_schema = Schema(id=ID(unique=True, stored=True),
                   lang=TEXT(),
                   screenname=TEXT(),
                   tweettext=TEXT(),
                   hashtags=TEXT(),
                   datetime=DATETIME())

if not os.path.exists("tweets_index"):
    os.mkdir("tweets_index")
    index = create_in("tweets_index", my_schema)
index = open_dir("tweets_index")
writer = index.writer()

df = pd.read_csv('tweets/tweets.csv',
                 header=None,
                 names=[
                     'id', 'language', 'screenname', 'tweettext', 'hashtags',
                     'timestamp'
                 ])
for row in df.iterrows():
    try:
        dt = datetime.datetime.fromtimestamp(int(row[1].timestamp.rstrip("L")))
    except Exception:  # malformed or missing timestamp
        dt = None
    try:
        tt = unicode(row[1].tweettext, errors="ignore")
コード例 #58
0
ファイル: index.py プロジェクト: MurgoloMichele/GAvI
 def openIndex(self, dir):
     self.ix = index.open_dir(dir)
コード例 #59
0
#! /usr/bin/env python
# -*- coding:utf-8 -*-

import json

import whoosh.index as index
from whoosh import sorting
from whoosh.qparser import QueryParser, MultifieldParser

from config import indexdir_path

ix = index.open_dir(indexdir_path)
facet = sorting.FieldFacet("comment_num", reverse=True)
searcher = ix.searcher()

#对索引进行命名然后指定索引文件打开
# ix = index.create_in("indexdir", schema=schema, indexname="usages")
# ix = index.open_dir("indexdir", indexname="usages")

#测试索引文件是否存在
# exists = index.exists_in("indexdir")
# usages_exists = index.exists_in("indexdir", indexname="usages")

#删除文档
#删除文档通过field
# ix.delete_by_term('path', u'/a/b/c')
#删除文档通过query
#delete_by_query(query)
#
# ix.commit()
#更新索引  需要设置字段的唯一性 (设置惟一的字段必须被索引)
# schema = Schema (path=ID (unique=True),content=TEXT)
#writer.add_document(path=u"/a",content=u"the first document")
コード例 #60
0
from whoosh.index import open_dir  # required import, omitted in the original

def search_combined(search_term, limit=3):
    # "indexcomb" is this project's combined index directory; _do_search is a
    # project-internal helper (a hypothetical sketch follows below).
    return _do_search(open_dir("indexcomb"), search_term, limit)