Example #1
 def GET(self):
     user_data = web.input()
     message = user_data.keyword
     lis = []
     if len(message) > 10:
         # Long keywords ending in an image extension are treated as image URLs.
         if message[-3:] in ('png', 'jpg'):
             urlretrieve(message, 'target.jpg')
             lis1 = shit.LSH('target.jpg')
             vm_env.attachCurrentThread()
             STORE_DIR = 'index'
             directory = SimpleFSDirectory(File(STORE_DIR))
             searcher = IndexSearcher(DirectoryReader.open(directory))
             analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
             for i in range(len(lis1)):
                 lis.append(run(searcher, analyzer, lis1[i])[0])
     else:
         a = func(user_data.keyword)
         STORE_DIR = 'index'
         vm_env.attachCurrentThread()
         directory = SimpleFSDirectory(File(STORE_DIR))
         searcher = IndexSearcher(DirectoryReader.open(directory))
         analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
         lis = run(searcher, analyzer, a)
     f = login
     return render.movies(f, lis)
Example #2
 def GET(self, name):
     STORE_DIR_GOOD = "index_good"
     STORE_DIR_BAD = "index_bad"
     vm_env.attachCurrentThread()
     directory_good = SimpleFSDirectory(File(STORE_DIR_GOOD))
     searcher_good = IndexSearcher(DirectoryReader.open(directory_good))
     directory_bad = SimpleFSDirectory(File(STORE_DIR_BAD))
     searcher_bad = IndexSearcher(DirectoryReader.open(directory_bad))
     analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     user_data = web.input(name=None)
     command = yourInput(user_data)
     if user_data.brand == '':
         user_data.brand = '1'
     res = Run_Score(searcher_good, searcher_bad, analyzer, name,
                     user_data.brand)
     comments = []
     for i in range(len(res)):
         if len(res[i]) == 9:
             t = res[i][8]
         else:
             t = ''
         for j in range(len(t)):
             s = t[j]
             s = s.encode("utf8")
             if len(s) >= 50:
                 comments.append(s)
     return render.comments(comments)
Example #3
 def __init__(self,
              index_path,
              field,
              similarity="boolean",
              use_relevance_feedback=False,
              feedback_index_path=None):
     self.reader = DirectoryReader.open(
         FSDirectory.open(Paths.get(index_path)))
     self.searcher = IndexSearcher(self.reader)
     if use_relevance_feedback and feedback_index_path is not None:
         self.feedback_reader = DirectoryReader.open(
             FSDirectory.open(Paths.get(feedback_index_path)))
         self.feedback_searcher = IndexSearcher(self.feedback_reader)
     self.similarity = similarity
     self.stopwords = stop_words()
     if similarity == "boolean":
         self.searcher.setSimilarity(BooleanSimilarity())
     elif similarity == "tf":
         self.searcher.setSimilarity(TFSimilarity())
     elif similarity == "tfidf":
         self.searcher.setSimilarity(ClassicSimilarity())
     elif similarity == "BM25":
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     else:
         print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     analyzer = StandardAnalyzer()
     print(self.searcher.getSimilarity())
     self.parser = QueryParser(field, analyzer)
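The constructor above only wires up the reader, searcher, similarity, and query parser. A minimal search method built on those same attributes might look like the sketch below; the method name search and the top_k parameter are assumptions, not part of the original class.

 def search(self, query_text, top_k=10):
     # Sketch only: parse with the field/analyzer configured in __init__, then run the query.
     query = self.parser.parse(query_text)
     hits = self.searcher.search(query, top_k).scoreDocs
     # Return (docid, score) pairs; stored fields can be fetched via self.searcher.doc(hit.doc).
     return [(hit.doc, hit.score) for hit in hits]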
Example #4
 def open_reader(self):
     """Open IndexReader."""
     if self.reader is None:
         if self.use_ram:
             print "reading from ram directory ..."
             self.reader = DirectoryReader.open(self.ram_dir)
         else:
             self.reader = DirectoryReader.open(self.dir)
Example #6
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()
Example #7
 def __init__(self, fs_directory):
     directory = SimpleFSDirectory(Paths.get(fs_directory))
     self.index_reader = DirectoryReader.open(directory)
     # Reuse the reader opened above rather than opening the directory a second time.
     self.searcher = IndexSearcher(self.index_reader)
     self.analyzer = StandardAnalyzer()
     self.query = None
     self.lucene_dictionary = LuceneDictionary(self.index_reader,
                                               'contents')
     self.formatter = SimpleHTMLFormatter()
     self.hits = None
Example #8
 def GET(self, name):
     STORE_DIR_GOOD = "index_good"
     STORE_DIR_BAD = "index_bad"
     vm_env.attachCurrentThread()
     directory_good = SimpleFSDirectory(File(STORE_DIR_GOOD))
     searcher_good = IndexSearcher(DirectoryReader.open(directory_good))
     directory_bad = SimpleFSDirectory(File(STORE_DIR_BAD))
     searcher_bad = IndexSearcher(DirectoryReader.open(directory_bad))
     analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     user_data = web.input(name=None)
     command = yourInput(user_data.shop)
     res = Run_GoodRate(searcher_good, searcher_bad, analyzer, command,
                        user_data.brand)
     res.append(command)
     return render.SearchResult(res)
Example #9
    def query(self, data):
        if self.fil.exists():
            searcher = IndexSearcher(DirectoryReader.open(self.d))
            query = QueryParser(Version.LUCENE_30, "id",
                                self.analyzer).parse(data['query'])
            hits = searcher.search(query, 100000)

            results = {}

            results['totalHits'] = hits.totalHits
            results['hits'] = {}

            for hit in hits.scoreDocs:
                record = {}
                doc = searcher.doc(hit.doc)
                fields = doc.getFields()
                record['score'] = hit.score
                for field in fields:
                    if field.name() != "id":
                        record[field.name()] = field.stringValue()
                results['hits'][doc.get('id')] = record

            searcher.getIndexReader().close()
            return results
Example #10
File: idx.py Project: mkind/crawler
    def search(self, field, text):
        """
        search text within indexed data

        input:
            field   fieldname of the value that will be indexed
            text    text to search

        output:
            hits    return a list of hits

        """
        results = []
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field, self.analyser)
        query = parser.parse(text)

        # search
        hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
        for hit in hits:
            doc = idx_searcher.doc(hit.doc)
            score = hit.score
            title = doc.get(field)
            url = doc.get("url")
            results.append((score, url, title))

        return results
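For context, a call to this method might look like the following sketch; the construction of the surrounding object and the "title" field are assumptions, since only this one method of idx.py is shown.

# Hypothetical caller; Idx() stands in for however the indexing class in idx.py is constructed.
idx = Idx()
for score, url, title in idx.search("title", "crawler"):
    print("%.3f  %s  %s" % (score, url, title))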
Example #11
def search_img(output):
    STORE_DIR = "WebPageIndex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    results = []
    results.append([])
    for num in output:
        imgnum = str(num)
        query = QueryParser(Version.LUCENE_CURRENT, "imgnum",
                            analyzer).parse(imgnum)
        scoreDocs = searcher.search(query, 1).scoreDocs
        for scoreDoc in scoreDocs:
            result = []
            doc = searcher.doc(scoreDoc.doc)
            result.append(doc.get("album"))
            result.append(doc.get("subalbum"))
            result.append(doc.get("singer"))
            result.append(doc.get("url"))
            result.append(doc.get("reviews"))
            result.append(doc.get("imgurl"))
            result.append(doc.get("imgnum"))
            results.append(result)
    del searcher
    return results
Example #12
 def __init__(self, indexDir):
     self.directory = SimpleFSDirectory(Paths.get(indexDir))
     self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
     self.nameQueryParser = QueryParser('name', StandardAnalyzer())
     self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
     self.idQueryParser = QueryParser('id', StandardAnalyzer())
     self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)
Example #13
    def retrieve_sents(self):

        indexDir = self.indexDir
        query = self.query

        sent_ind_list = []
        # template = CustomTemplate(format)
        fsDir = SimpleFSDirectory(Paths.get(indexDir))
        # print indexDir
        searcher = IndexSearcher(DirectoryReader.open(fsDir))

        analyzer = StandardAnalyzer()
        parser = QueryParser("contents", analyzer)
        parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(query)
        # print query
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        # print query
        if self.stats:
            print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
                len(scoreDocs), duration, query)

        for scoreDoc in scoreDocs:
            # print scoreDoc.doc
            # doc = searcher.doc(scoreDoc.doc)
            sent_ind_list.append(scoreDoc.doc)

        return sent_ind_list
Example #14
def SearchImgCommand(command):
    initvm.vm_env.attachCurrentThread()

    STORE_DIR = "jdindex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    contentCommand = ' '.join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(contentCommand)
    scoreDocs = searcher.search(query, 50).scoreDocs

    Already = []
    finalDocs = []
    for i, scoreDoc in enumerate(scoreDocs):
        doc = searcher.doc(scoreDoc.doc)
        itemurl = doc.get("itemurl")
        if itemurl not in Already:
            oneDoc = {}
            oneDoc['imgurl'] = doc.get("imgurl")
            oneDoc['title'] = doc.get("title").strip('\n')
            oneDoc['itemurl'] = itemurl
            oneDoc['score'] = scoreDoc.score
            finalDocs.append(oneDoc)
            Already.append(itemurl)

    return finalDocs
Example #15
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=["subreddit", "author", "text"]):
    lucene.initVM()

    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)

    analyser = StandardAnalyzer()

    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)

    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        relevantids = [docIDs[i-1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]

        print("\n\n")

        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)

    ireader.close()
    lindex.close()
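A quick usage sketch for search_index above; the index path and query text are placeholders, and the project's own print_results helper is assumed to be importable.

if __name__ == "__main__":
    # Plain search over a hypothetical index directory, without query expansion.
    search_index("reddit_index", "machine learning", top=5, qe=False)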
Example #16
    def GET(self):
        command = web.input().command.encode('utf-8')
        initvm.vm_env.attachCurrentThread()

        STORE_DIR = "jdindex"
        directory = SimpleFSDirectory(File(STORE_DIR))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        query = QueryParser(Version.LUCENE_CURRENT, "contents",
                            analyzer).parse(command)
        scoreDocs = searcher.search(query, 20).scoreDocs

        finalDocs = []
        for i, scoreDoc in enumerate(scoreDocs):
            doc = searcher.doc(scoreDoc.doc)
            title = doc.get("title").strip('\n')
            if title not in finalDocs:
                finalDocs.append(title)

        web.header('content-type', 'text/json')
        data = {}
        data['q'] = command
        data['p'] = 'false'
        data['s'] = finalDocs
        return 'fn(' + json.dumps(data) + ');'
Example #17
    def get_most_similar(self, sentence, do_log=False):
        # print('query string is',string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs

        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
Example #18
def running(command):
    command = unicode(command)
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SimpleAnalyzer(Version.LUCENE_CURRENT)
    return run(searcher, analyzer, command)
Example #19
 def __init__(self, db_path):
     directory = SimpleFSDirectory(File(db_path))
     reader = DirectoryReader.open(directory)
     self.searcher = IndexSearcher(reader)
     self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     logger.info("Loaded DB from %s with %d documents: ",
                 db_path, reader.numDocs())
Example #20
def run(command):
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 10).scoreDocs
    #print "%s total matching documents." % len(scoreDocs)
    res = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        tmp = []
        tmp.append([doc.get('name1'), doc.get('name2')])
        tmp.append(doc.get("homepage"))
        tmp.append(doc.get("intro"))
        tmp.append(doc.get('logo'))
        a = doc.get('goods')
        a = a.split('\n')
        for i in a:
            tmp.append(i)
        res.append(tmp)

    return command, res
Example #21
def buscar(indexDir, args, options=None):
    #lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    fsDir = SimpleFSDirectory(File(indexDir))
    #print fsDir

    # Build a searcher on the index directory supplied by the caller
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    #print analyzer

    # QueryParser that searches the "keywords" field by default;
    # it holds the constraints of the search
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    #print parser

    parser.setDefaultOperator(QueryParser.Operator.AND)

    #print args
    # Join the given parameters so every term becomes required
    command = ' +'.join(args)
    #print command

    query = parser.parse(command)
    print query

    # Return a JArray with the query results
    return searcher.search(query, 200).scoreDocs
Example #22
 def __init__(self, db_path):
     directory = SimpleFSDirectory(File(db_path))
     reader = DirectoryReader.open(directory)
     self.searcher = IndexSearcher(reader)
     self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
     logger.info("Loaded DB from %s with %d documents: ", db_path,
                 reader.numDocs())
Example #23
    def __init__(self, searchDir):

        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
        self.directory = FSDirectory.open(Paths.get(searchDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
Example #24
    def __init__(self, root, storedir, isindexing=False, isBM25=True):

        if not os.path.exists(storedir):
            os.mkdir(storedir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexer(root, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())
Example #25
	def search(self):
		''' Searches the given query in the index '''

		lucene.initVM(vmargs=['-Djava.awt.headless=true'])
		# print 'lucene', lucene.VERSION
		# base_dir = os.path.dirname(os.path.abspath('.'))
		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		

		while True:
			print
			print "Hit enter with no input to quit."
			command = raw_input("Query:")
			if command == '':
				return

			print
			print "Searching for:", command

			query = QueryParser(Version.LUCENE_CURRENT, "title",
								analyzer).parse(command)
			scoreDocs = searcher.search(query, 50).scoreDocs
			print "%s total matching documents." % len(scoreDocs)

			for scoreDoc in scoreDocs:
				doc = searcher.doc(scoreDoc.doc)
				# print 'path:', doc.get("path"), 'name:', doc.get("name")
				print doc
Example #26
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        if lucene_vm_flag == False:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searchers = []
        self.searchers.append(IndexSearcher(self.reader))
        if similarity == 'BM25':
            (self.searchers[0]).setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used == True:
            seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
            index_name = self.index_dir.split(seperate_char)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache']
Example #27
	def search(self, input_query=None, max_answers=10):
		''' Searches the given query in the index '''
		if input_query is None:
			return None

		base_dir = '.'
		directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
		searcher = IndexSearcher(DirectoryReader.open(directory))
		analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
		
		
		# query = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(input_query)
		parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, (self._posts_fields + self._answer_fields), analyzer)
		query = MultiFieldQueryParser.parse(parser, input_query)

		scoreDocs = searcher.search(query, max_answers).scoreDocs
		print "%s total matching documents." % len(scoreDocs)

		docs = []
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			doc_dict = dict((field.name(), field.stringValue()) for field in doc.getFields())
			docs.append(doc_dict)
			# print doc
		return docs
Example #28
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        self.index_path=index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader=DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # uncomment one of these lines to change the type of parser, query and weight used
        if use_default_similarity:
            self.query_parser=QueryParser
        else:
            self.query_parser=FieldAgnosticQueryParser

        if use_default_similarity:
            similarity=DefaultSimilarity()
            self.useExplainQuery=False
        else:
            similarity=FieldAgnosticSimilarity()
            self.useExplainQuery=True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
##        similarity.useCoord=False

        self.searcher.setSimilarity(similarity)
        self.method=method # never used?
        self.logger=logger
Example #29
def searcher(directory, analyzer, queries_file):
    lines = queries_file.readlines()
    length = len(lines)
    a_query = ''
    query_counter = 0
    log = open("log.txt", "a")
    for line_number in range(length):
        if lines[line_number].startswith("<num>"):
            query_id = lines[line_number][14:].strip()
        elif lines[line_number].startswith("<desc>"):
            a_query = lines[line_number + 1].strip()
            a_query = stop_words(a_query)
        if a_query != '':
            # searching the index
            reader = DirectoryReader.open(directory)
            searcher = IndexSearcher(reader)
            # parse the query
            parser = QueryParser("DocParagraph", analyzer)
            query = parser.parse(a_query)
            # return 50 queries are required by the assignment
            hits = searcher.search(query, 50).scoreDocs
            # rank counter 1 through 50
            rank_counter = 1
            for hit in hits:
                result = searcher.doc(hit.doc)
                # write search result to log text file
                to_log = str(query_id) + " " + "Q" + str(
                    query_counter) + " " + str(result.get(
                        "DocID")) + " " + str(rank_counter) + " " + str(
                            hit.score) + " " + "Alex's" + "\n"
                log.write(to_log)
                rank_counter += 1
            query_counter += 1
            a_query = ''
    log.close()
Example #30
File: idx.py Project: mkind/crawler
    def search(self, field, text):
        """
        search text within indexed data

        input:
            field   fieldname of the value that will be indexed
            text    text to search

        output:
            hits    return a list of hits

        """
        results = []
        idx_reader = DirectoryReader.open(self.directory)
        idx_searcher = IndexSearcher(idx_reader)

        # parse query
        parser = AnalyzingQueryParser(Version.LUCENE_CURRENT, field,
                                      self.analyser)
        query = parser.parse(text)

        # search
        hits = idx_searcher.search(query, 1000).scoreDocs.tolist()
        for hit in hits:
            doc = idx_searcher.doc(hit.doc)
            score = hit.score
            title = doc.get(field)
            url = doc.get("url")
            results.append((score, url, title))

        return results
Example #31
def search():

	lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    
	args = []
	if request.method == 'POST':
		if request.form['ies']:
			args.append('+ies:'+request.form['ies'])
		if request.form['area']:
			args.append('+area:'+request.form['area'])
		if request.form['professor']:
			args.append('+professor:'+request.form['professor'])
		if request.form['conceito']:
			#args.append('m:'+request.form['conceito']+'d:'+request.form['conceito']+'f:'+request.form['conceito'])
			args.append('m:'+request.form['conceito'])
			args.append('d:'+request.form['conceito'])
			args.append('f:'+request.form['conceito'])

	table = []
	if(len(args) > 0): 
		scoreDocs = mansearch.buscar('indexer/',args)
		fsDir = SimpleFSDirectory(File(indexDir))
		searcher = IndexSearcher(DirectoryReader.open(fsDir))
		for scoreDoc in scoreDocs:
			doc = searcher.doc(scoreDoc.doc)
			table.append(dict((field.name(), field.stringValue()) for field in doc.getFields()))
	return render_template('busca.html',table = table)
	
	pass
Example #32
def run_music(ID):
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, "id", analyzer).parse(ID)
    scoreDocs = searcher.search(query, 1).scoreDocs

    try:
        scoreDoc = scoreDocs[0]
    except:
        return None
    doc = searcher.doc(scoreDoc.doc)

    item = []
    item.append(doc.get("song_title").encode('utf-8'))
    item.append(doc.get('song_url'))
    item.append(doc.get("singer").encode('utf-8'))
    item.append(doc.get("album").encode('utf-8'))
    item.append(doc.get("album_pic"))
    item.append(doc.get("album_genre").encode('utf-8'))
    item.append(doc.get("lyrics").encode('utf-8'))

    sim_str = doc.get("similar").encode('utf-8')
    sim_list = sim_str.split('+')
    for i in range(3):
        sim_list[i] = sim_list[i].split('*')
    item.append(sim_list)

    del searcher

    return item
Example #33
	def __init__ (self):
		
		self.mDocumentDirectory = "/home/hnguyen/Projects/CLIFinder/operations.sub"
		self.mIndexDirectory = "/home/hnguyen/Projects/CLIFinder/cli.index"

		self.mIndexReader = None
		if os.path.isdir(self.mIndexDirectory) and self.mIndexReader == None:
			directory = SimpleFSDirectory(File(self.mIndexDirectory))
			self.mIndexReader = DirectoryReader.open(directory)

		
		############################### IndexingEngine Settings ######################################
		self.mSimilarity = DecreaseLengthNormSimilarity()
		self.mOpenMode = IndexWriterConfig.OpenMode.CREATE
		##############################################################################################
				
		self.mIsDebug = False

		if self.mIsDebug:
		############################### Setting up loggers ###########################################
			self.mIndexingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/indexing.log"
			self.mSearchingLogPath = "/home/hnguyen/Projects/CLIFinder/logs/searching.log"
	
			self.mIndexingLogger = LoggingEngine(self.mIndexingLogPath, "IndexingLogger", Queue.Queue())
			self.mSearchingLogger = LoggingEngine(self.mSearchingLogPath, "SearchingLogger", Queue.Queue())

			self.mIndexingLogger.start()
			self.mSearchingLogger.start()
			atexit.register(self.clear)
Example #34
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        self.index_path=index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader=DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # uncomment one of these lines to change the type of parser, query and weight used
        if use_default_similarity:
            self.query_parser=QueryParser
        else:
            self.query_parser=FieldAgnosticQueryParser

        if use_default_similarity:
            similarity=DefaultSimilarity()
            self.useExplainQuery=False
        else:
            similarity=FieldAgnosticSimilarity()
            self.useExplainQuery=True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
##        similarity.useCoord=False

        self.searcher.setSimilarity(similarity)
        self.method=method # never used?
        self.logger=logger
Example #35
def main():
    resultados = []
    indice_vacio = False
    if len(os.listdir("./lucene/index")) == 0:
        indice_vacio = True
    else:
        consulta = request.args.get("consulta", None)
        if consulta is not None:
            directory = SimpleFSDirectory(Paths.get("./lucene/index"))
            searcher = IndexSearcher(DirectoryReader.open(directory))
            analyzer = SpanishAnalyzer()
            query = QueryParser("texto", analyzer).parse(consulta)
            scoreDocs = searcher.search(query, 10).scoreDocs

            for sd in scoreDocs:
                doc = searcher.doc(sd.doc)
                resultados.append({
                    "url": direccion_base + doc.get("pdf"),
                    "titulo": doc.get("titulo")
                })

    return render_template("main.html",
                           lucene=lucene.VERSION,
                           indice_vacio=indice_vacio,
                           resultados=resultados)
Example #36
def retriever(file_dir):
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
Example #37
    def __init__(self, path, analyzer, topn=DEF_TOPN):

        self.path = path
        self._analyzer = analyzer
        self.topn = topn
        self._store = SimpleFSDirectory(Paths.get(os.path.abspath(self.path)))
        self._searcher = IndexSearcher(DirectoryReader.open(self._store))
Example #38
def func(command):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    # ------------ #
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    searcher = IndexSearcher(DirectoryReader.open(directory))
    # ------------ #
    p = get_d_dimensional_vector(command)
    vp = get_vp(p)
    query = QueryParser(Version.LUCENE_CURRENT, "Vector", analyzer).parse(vp)
    scoreDocs = searcher.search(query, 200).scoreDocs

    dict1 = {}
    result = ""
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        rank = 0.6 * float(doc.get("Likes")) + 0.4 * float(doc.get("Views"))
        ch = doc.get('Page_num') + ' '
        ch += 'data/' + doc.get('Page_num') + '.jpg' + ' '
        ch += doc.get('Page_link') + ' '
        ch += doc.get('Views') + ' '
        ch += doc.get('Likes') + ' '
        tmp_alt = doc.get('Img_alt')
        tmp_alt = '_'.join(tmp_alt.split())
        ch += tmp_alt
        dict1[ch] = rank
    res_list = sorted(dict1.items(), key = lambda item:item[1], reverse = True)
    for i in res_list:
        result += i[0]
        result += ' '
    del searcher
    del analyzer
    return result
Example #39
 def __init__(self, path=INDEX_DIR):
     # Initialize Lucene: set up the analyzer, reader, searcher, and tokenizer
     lucene.initVM()
     self.indir = SimpleFSDirectory(Paths.get(path))
     self.analyzer = SmartChineseAnalyzer()
     self.reader = DirectoryReader.open(self.indir)
     self.searcher = IndexSearcher(self.reader)
Example #40
def retrieve(command):
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except ValueError:
        print "JVM running."

    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    # to convert to AND query
    command = re.sub(r' ', r' +', command)
    command = "+" + command

    print "Searching for:", command
    query = QueryParser("contents", analyzer).parse(command)
    print query
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    retrieved_docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name")))

    del searcher
    return retrieved_docs
Example #41
def build_corpus(n=0):
    sbcs = texeval_corpus.test_subcorpora
    sbc = sbcs[n]
    # Hack for parallelizing queries, uses one index per domain.
    directory = FSDirectory.open(File(wiki_index+'-'+sbc))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    build_corpus_from_terms_with_wiki(sbc, searcher, analyzer)
Example #42
 def _maybeReopen(self):
     if len(self._latestModifications) > 10000:
         newReader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
         if newReader is not None:
             self._reader.close()
             self._reader = newReader
             self._searcher = IndexSearcher(self._reader)
             self._latestModifications.clear()
Example #43
 def __init__(self,base_dir, index_dir,index_file,queryDict):
     self.baseDir = base_dir
     self.indexFile = os.path.join(index_dir,index_file)
     lucene.initVM(vmargs=['-Djava.awt.headless=true']) # uncomment when run Retrieve separately
     directory = SimpleFSDirectory(File(self.indexFile))
     searcher = IndexSearcher(DirectoryReader.open(directory))
     self.BM25(searcher,queryDict)
     del searcher
Example #44
def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1=2.0, b=0.75 (Lucene's defaults are k1=1.2, b=0.75)
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher, analyzer
Example #45
 def search_docs(self, value, field="general_info"):
     MAX_RESULTS = 1000
     searcher = IndexSearcher(DirectoryReader.open(self.store))
     query = QueryParser(Version.LUCENE_CURRENT, field,
                         self.analyzer).parse(value)
     topDocs = searcher.search(query, MAX_RESULTS)
     
     return [searcher.doc(hit.doc) for hit in topDocs.scoreDocs]
Example #46
    def perform_search(self, searchterm, results_per_page, page):
        # if there is a field in the searchterm
        """if ":" in searchterm:
            # processing a query
            parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)

            query = parser.parse(searchterm)

        else:
            query = BooleanQuery()
            query_title = TermQuery(Term("title", searchterm))
            query_description = TermQuery(Term("description", searchterm))
            query_content = TermQuery(Term("content", searchterm))

            #  BooleanClause.Occur.MUST for AND queries
            query.add(query_title, BooleanClause.Occur.SHOULD)
            query.add(query_description, BooleanClause.Occur.SHOULD)
            query.add(query_content, BooleanClause.Occur.SHOULD)"""

        # create QueryParser for each field to be searched
        parser_title = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer)
        parser_description = QueryParser(Version.LUCENE_CURRENT, "description", self.analyzer)
        parser_content = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)

        # put fields together
        query = BooleanQuery()
        query.add(parser_title.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_description.parse(searchterm), BooleanClause.Occur.SHOULD)
        query.add(parser_content.parse(searchterm), BooleanClause.Occur.SHOULD)

        # conducting search
        searcher = IndexSearcher(DirectoryReader.open(self.store))

        start = datetime.now()
        hits = searcher.search(query, results_per_page + (results_per_page * page))
        score_docs = hits.scoreDocs
        count_results = hits.totalHits
        duration = datetime.now() - start

        # results to return
        results = []
        count = 0

        for scoreDoc in score_docs:

            # skip offset
            if count < results_per_page * page:
                count += 1
                continue
            count += 1


            doc = searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue()) for field in doc.getFields())
            results.append(table)

        return results, duration, count_results
Example #47
 def deleteRec(self, pid):
     config = IndexWriterConfig(self.analyzer)
     config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
     writer = IndexWriter(self.indexDir, config)
     writer.deleteDocuments(Term('uid', pid))
     writer.commit()
     writer.close()
     self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     return
Example #48
 def __init__(self, store_dir):
     initVM()
     directory = SimpleFSDirectory(File(store_dir))
     self.searcher = IndexSearcher(DirectoryReader.open(directory))
     print 'loaded index: %s' % store_dir
     self.analyzer = {}
     self.analyzer['StandardAnalyzer'] = StandardAnalyzer(Version.LUCENE_CURRENT)
     self.analyzer['SimpleAnalyzer'] = SimpleAnalyzer(Version.LUCENE_CURRENT)
     self.analyzer['ChineseAnalyzer'] = ChineseAnalyzer(Version.LUCENE_CURRENT)
Example #49
    def __init__(self, index_dir):
        #lucene.initVM(vmargs=['-Djava.awt.headless=true', '-Xmx4g'])

        self.index_dir = index_dir
        self.directory = SimpleFSDirectory(File(self.index_dir))
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
Example #50
def init_lucene_search():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    print 'Index ', INDEX_DIR
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))  # current dir
    directory = SimpleFSDirectory(File(INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT, SmartChineseAnalyzer.getDefaultStopSet())

    return searcher, analyzer
Example #51
def is_article_indexed(art_id, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, 'art_id', analyzer).parse(str(art_id))

    docs = searcher.search(query, 1).scoreDocs

    return len(docs) > 0
Example #52
    def __init__(self):
        vm_env = lucene.getVMEnv()
        vm_env.attachCurrentThread()

        indexDir = SimpleFSDirectory(File(app.config['INDEX_PATH']))
        self.searcher = IndexSearcher(DirectoryReader.open(indexDir))

        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

        self.parser = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer)
Example #53
  def __init__(self, indexPath):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION

    #initialize the index
    self.INDEX_DIR = indexPath  #"Clue_Index"
    self.results = None
    self.searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory(File(self.INDEX_DIR))))

    self.searcher.setSimilarity(BM25Similarity())
Example #54
 def query(self, txt, ant=10):
     """Searches for a person or family by id, name, place, or date"""
     q = QueryParser("text", self.analyzer).parse(txt.replace('/', '\/').lower())
     if not self.searcher:
         self.searcher = IndexSearcher(DirectoryReader.open(self.indexDir))
     scoreDocs = self.searcher.search(q, ant).scoreDocs
     hits = []
     for scoreDoc in scoreDocs:
         doc = self.searcher.doc(scoreDoc.doc)
         hits.append([doc.get("uid"), scoreDoc.score])
     return hits
Example #55
    def __init__(self, **kwargs):
        """ Initialize a new instance of the Searcher

        :param count: The number of counts to return from a query
        :param output: The output directory of the underlying index
        """
        self.count = kwargs.get("count", 100)
        self.output = kwargs.get("root", "index")
        self.store = SimpleFSDirectory(File(self.output))
        self.analyzer = StandardAnalyzer(Version.LUCENE_30)
        self.searcher = IndexSearcher(DirectoryReader.open(self.store))
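A usage sketch for the constructor documented above. Note that although the docstring calls the second parameter output, the code reads the keyword root; the class name Searcher is taken from the docstring, and the directory name is just the documented default.

searcher = Searcher(count=50, root="index")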
Example #56
 def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
     self._settings = settings
     self._similarity = settings.similarity
     self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
     self._reader = DirectoryReader.open(indexDirectory)
     self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
     self._readerSettingsWrapper = ReaderSettingsWrapper()
     self._readerSettingsWrapper.get = lambda: {"similarity": self.searcher.getSimilarity().toString(), "numberOfConcurrentTasks": self._numberOfConcurrentTasks}
     self._readerSettingsWrapper.set = self._setReadSettings
     self._searcher = None
     self._executor = None
     self._reopenSearcher = True
Example #57
def search(term, n_docs=10, index='index'):
    store = SimpleFSDirectory(File(index))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = QueryParser(Version.LUCENE_CURRENT, 'art_body', analyzer).parse(term)

    # str(query.getClass().toString()) == "class org.apache.lucene.search.TermQuery"

    score_docs = searcher.search(query, n_docs).scoreDocs

    return [(score_doc.score, unicode(searcher.doc(score_doc.doc).get('art_body'))) for score_doc in score_docs]
Example #58
 def reopen(self):
     reader = DirectoryReader.openIfChanged(self._reader)
     if reader is None:
         return
     self._reader.close()
     self._reader = reader
     self._reopenSearcher = True
     taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
     if taxoReader is None:
         return
     self.taxoReader.close()
     self.taxoReader = taxoReader
Example #59
 def _getSearcher(self, identifier=None):
     modifications = len(self._latestModifications)
     if modifications == 0:
         return self._searcher
     if identifier and str(identifier) not in self._latestModifications and modifications < _MAX_MODIFICATIONS:
         return self._searcher
     newreader = DirectoryReader.openIfChanged(self._reader, self._writer, True)
     if newreader:
         self._reader.close()
         self._reader = newreader
         self._searcher = IndexSearcher(newreader)
     self._latestModifications.clear()
     return self._searcher
Example #60
	def __init__(self):
		if luceneImport:
			self.lucene = True
		else:
			self.lucene = False

		#Lucene connection
		lucene.initVM()
		indexDir = "texts/index"
		directory = MMapDirectory(File(indexDir))
		reader = DirectoryReader.open(directory)
		self.analyzer = StandardAnalyzer(Version.LUCENE_30)
		self.searcher = IndexSearcher(reader)