def retriever(file_dir):
    """Retrieve the nearest training example for every test AST query.

    For each line of test/test.ast.src, searches the Lucene index built over
    the training code and writes the best-matching training source line to
    test/test.ref.src.0 and its summary to output/ast.out. Exits with -1 if
    any query produces no hit.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)
    with open(file_dir + "/train/train.spl.src", 'r') as fso, \
            open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir + "/test/test.ast.src") as ft, \
            open(file_dir + "/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir + "/output/ast.out", 'w') as fws:
        # Strip non-word characters and Lucene boolean keywords so the
        # query parser does not treat them as operators.
        queries = [
            re.sub("[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                # BUGFIX: the stored "id" is a plain integer string —
                # parse it with int() instead of eval(), which executes
                # arbitrary index content.
                _id = int(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
def func1(command):
    """Search the "contents" field of the text index for *command*.

    The query is word-segmented with jieba before parsing. Returns up to
    50 hits as dicts with keys 'title', 'url' and "sentence"; an empty
    command yields an empty list.
    """
    global vm_env
    STORE_DIR = "index"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    if command == '':
        return []
    command = " ".join(jieba.cut(command))
    query = QueryParser(Version.LUCENE_CURRENT, "contents",
                        analyzer).parse(command)
    result = []
    for scoreDoc in searcher.search(query, 50).scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        result.append({
            'title': doc.get("title"),
            'url': doc.get("url"),
            "sentence": doc.get("sentence"),
        })
    del searcher
    return result
def func2(command):
    """Search the recipe index ("index1") by main ingredient.

    Parses *command* against the "zhuliao" field, collects up to 9 hits and
    returns them as tuples (name, collect_num, zhuliao-list, zuofa-steps,
    img_url, url) sorted by collect_num descending. Returns None for an
    empty command.
    """
    STORE_DIR = "index1"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = []
    if command == '':
        return
    query = QueryParser(Version.LUCENE_CURRENT, "zhuliao",
                        analyzer).parse(command)
    scoreDocs = searcher.search(query, 9).scoreDocs
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        try:
            res.append([
                doc.get("name"),
                doc.get("collect_num"),
                doc.get("zhuliao").split(' '),
                doc.get("zuofa").split('\n'),
                doc.get("img_url"),
                doc.get("url")
            ])
        # BUGFIX: was a bare `except: pass`, which hid every error.
        # doc.get() returns None for a missing field, so only the
        # resulting .split() AttributeError should skip the record.
        except AttributeError:
            pass
    res1 = []
    for i in res:
        i[1] = int(i[1])
        res1.append(tuple(i))
    # NOTE: dropped the redundant Python-2-only `cmp=None` argument;
    # key/reverse sorting behaviour is unchanged.
    return sorted(res1, key=lambda x: x[1], reverse=True)
def search_img(output):
    """Look up the WebPageIndex record for every image number in *output*.

    Returns a list whose first element is an empty list (placeholder kept
    for callers), followed by one [album, subalbum, singer, url, reviews,
    imgurl, imgnum] row per matched image number.
    """
    STORE_DIR = "WebPageIndex"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    results = [[]]
    fields = ("album", "subalbum", "singer", "url",
              "reviews", "imgurl", "imgnum")
    for num in output:
        query = QueryParser(Version.LUCENE_CURRENT, "imgnum",
                            analyzer).parse(str(num))
        for scoreDoc in searcher.search(query, 1).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            results.append([doc.get(name) for name in fields])
    del searcher
    return results
def run_img(command): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() STORE_DIR = "index2" directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) querys = BooleanQuery() query_content = QueryParser(Version.LUCENE_CURRENT, "urlcontent", analyzer).parse(command) query_title = QueryParser(Version.LUCENE_CURRENT, "title", analyzer).parse(command) querys.add(query_content, BooleanClause.Occur.SHOULD) querys.add(query_title, BooleanClause.Occur.SHOULD) scoreDocs = searcher.search(querys, 50).scoreDocs if len(scoreDocs) == 0: print "WARNING: No result" result = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print doc.get("title") data = {} data['title'] = doc.get('title') data['url'] = doc.get('url') data['imgurl'] = doc.get('imgurl') result.append(data) return result
def run(command): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() STORE_DIR = "index1" directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(analysis(command)) HighlightFormatter = SimpleHTMLFormatter() highlighter = Highlighter(HighlightFormatter, QueryScorer(query)) scoreDocs = searcher.search(query, 500).scoreDocs print "%s total matching documents." % len(scoreDocs) result = [] for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) print 'path:', doc.get("path"), 'name:', doc.get( "name"), 'url:', doc.get("url"), 'title:', doc.get("title") text = doc.get('contents') highLightText = highlighter.getBestFragment(analyzer, "contents", text) if highLightText != None: highLightText = ''.join(highLightText.split(' ')) data = {} data['url'] = doc.get("url") data['title'] = doc.get('title') data['highlight'] = highLightText result.append(data) return result
def getWriter(self, directory=None, analyzer=None, open_mode=None,
              similarity=None, maxBufferedDocs=None, mergePolicy=None):
    """Build an IndexWriter for tests, defaulting every unset option.

    Defaults: a 10k-token-limited WhitespaceAnalyzer, OpenMode.CREATE,
    and the fixture's own directory. Similarity, maxBufferedDocs and
    mergePolicy are applied only when explicitly given.
    """
    if analyzer is None:
        analyzer = LimitTokenCountAnalyzer(
            WhitespaceAnalyzer(self.TEST_VERSION), 10000)
    config = self.getConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE
                       if open_mode is None else open_mode)
    if similarity is not None:
        config.setSimilarity(similarity)
    if maxBufferedDocs is not None:
        config.setMaxBufferedDocs(maxBufferedDocs)
    if mergePolicy is not None:
        config.setMergePolicy(mergePolicy)
    target = self.directory if directory is None else directory
    return IndexWriter(target, config)
def index (cls, indexDir, taxoDir): """Create an index, and adds to it sample documents and facets. indexDir Directory in which the index should be created. taxoDir Directory in which the taxonomy index should be created. """ # create and open an index writer from org.apache.lucene.util import Version config = IndexWriterConfig(Version.LUCENE_42, WhitespaceAnalyzer(Version.LUCENE_42)) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE) iw = IndexWriter(indexDir, config) # create and open a taxonomy writer taxo = DirectoryTaxonomyWriter(taxoDir, IndexWriterConfig.OpenMode.CREATE) # FacetFields is a utility class for adding facet fields to a document: facet_fields = FacetFields(taxo) # loop over sample documents nDocsAdded = 0 nFacetsAdded = 0 for docNum in range(len(docTexts)): # obtain the sample facets for current document facets = categories[docNum] facetList = [CategoryPath(f) for f in facets] # NOTE: setCategoryPaths() requires an Iterable, so need to convert the # Python list in order to to pass a proper argument to setCategoryPaths. # We use java.util.Arrays (via JCC) to create a Java List: facetList = Arrays.asList(facetList) # NOTE: we could use lucene.collections here as well in order to convert our # Python list to a Java based list using the JavaList class (JavaList implements # java.util.List around a Python list instance it wraps): # from lucene.collections import JavaList # facetList = JavaList(facetList) # create a plain Lucene document and add some regular Lucene fields to it doc = Document() doc.add(TextField(TITLE, docTitles[docNum], Field.Store.YES)) doc.add(TextField(TEXT, docTexts[docNum], Field.Store.NO)) # use the FacetFields utility class for adding facet fields (i.e. the categories) # to the document (and, as required, to the taxonomy index) facet_fields.addFields(doc, facetList) # finally add the document to the index iw.addDocument(doc) nDocsAdded +=1 nFacetsAdded += facetList.size() # end for # commit changes. 
# we commit changes to the taxonomy index prior to committing them to the search index. # this is important, so that all facets referred to by documents in the search index # will indeed exist in the taxonomy index. taxo.commit() iw.commit() # close the taxonomy index and the index - all modifications are # now safely in the provided directories: indexDir and taxoDir. taxo.close() iw.close() print "Indexed %d documents with overall %d facets." % (nDocsAdded,nFacetsAdded)
def testDelete(self, fieldName, searchString):
    """Delete every document whose *fieldName* term equals *searchString*."""
    base = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    analyzer = LimitTokenCountAnalyzer(base, 1048576)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, config)
    writer.deleteDocuments(Term(fieldName, searchString))
    writer.close()
def createIndexWriter(indexDir):
    """Open *indexDir* (creating it if necessary) and return a CREATE-mode
    IndexWriter over it using a WhitespaceAnalyzer."""
    # BUGFIX: the original `if not exists: os.mkdir` was racy and failed
    # for nested paths; makedirs with exist_ok handles both.
    os.makedirs(indexDir, exist_ok=True)
    directory = FSDirectory.open(Paths.get(indexDir))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(directory, config)
def createAnalyzer(self):
    """Instantiate the analyzer described by self._analyzer['type'].

    Raises Exception for any unrecognised type string.
    """
    analyzer_type = self._analyzer['type']
    if analyzer_type == "MerescoStandardAnalyzer":
        return MerescoStandardAnalyzer()
    if analyzer_type == "MerescoDutchStemmingAnalyzer":
        return MerescoDutchStemmingAnalyzer(self._analyzer['stemmingFields'])
    if analyzer_type == "WhitespaceAnalyzer":
        return WhitespaceAnalyzer()
    raise Exception("No support for type " + str(self._analyzer))
def search_dianping(province, kind, query):
    """Find the highest-ranked food/foodshop entry for a province.

    Builds a per-field MUST query from `kind:query province:province`,
    searches the "index" store and returns a dict describing the
    top-ranked shop. Returns None when *kind* is unsupported or *query*
    is empty.
    """
    STORE_DIR = "index"
    vm_env.attachCurrentThread()
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)

    if kind not in ['food', 'foodshop']:
        return None
    if query == '':
        return None

    command = '%s:%s province:%s' % (kind, query, province)
    command = unicode(command, 'utf8', 'ignore')
    command_dict = parseCommand(command)

    querys = BooleanQuery()
    for field_name, value in command_dict.iteritems():
        sub_query = QueryParser(Version.LUCENE_CURRENT, field_name,
                                analyzer).parse(value)
        querys.add(sub_query, BooleanClause.Occur.MUST)
    scoreDocs = searcher.search(querys, 50).scoreDocs

    # First pass: find the maximum rank among the hits.
    max_rank = 0
    best_shop = ''
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank > max_rank:
            max_rank = cur_rank
            best_shop = cur_shop

    # Second pass: copy the stored fields of the top-ranked hit(s); when
    # several hits tie, the last one wins, exactly as before.
    result = {}
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        cur_shop = doc.get("foodshop").split()[-1]
        cur_rank = float(doc.get('rank'))
        if cur_rank == max_rank:
            result['name'] = cur_shop.encode('utf8', 'ignore')
            result['rank'] = doc.get('rank').encode('utf8', 'ignore')
            result['food'] = doc.get('food').encode('utf8', 'ignore')
            result['location'] = doc.get('location').encode('utf8', 'ignore')
            result['tel'] = doc.get('tel').encode('utf8', 'ignore')
            for score_key in ('environment_score', 'flavour_score',
                              'service_score', 'price_level'):
                result[score_key] = doc.get(score_key).encode(
                    'utf8', 'ignore')
    del searcher
    return result
def main(): STORE_DIR = "index" lucene.initVM(vmargs=['-Djava.awt.headless=true']) print 'lucene', lucene.VERSION #base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) run(searcher, analyzer) del searcher
def getWriter(store, analyzer=None, create=False):
    """Return an IndexWriter over *store*.

    analyzer defaults to a WhitespaceAnalyzer and is always wrapped in a
    LimitTokenCountAnalyzer capped at 10,000,000 tokens. When *create*
    is true the index is opened in CREATE mode, truncating any existing
    index.
    """
    if analyzer is None:
        analyzer = WhitespaceAnalyzer()
    analyzer = LimitTokenCountAnalyzer(analyzer, 10000000)
    config = IndexWriterConfig(analyzer)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    # BUGFIX: removed the leftover debug `print(store, config)` that
    # polluted stdout on every call.
    writer = IndexWriter(store, config)
    return writer
def func_img(command):
    """Attach to the JVM, search the "index" store for *command* via
    work() and return its (match_count, result) pair."""
    lucene.getVMEnv().attachCurrentThread()
    STORE_DIR = "index"
    store = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    match_count, result = work(searcher, analyzer, command)
    del searcher
    return match_count, result
def getLucene(path):
    """Open *path* as an FSDirectory index sorted by the numeric stamp
    field and return a (writer, reader, searcher) triple over it."""
    directory = FSDirectory.open(Paths.get(path))
    config = IndexWriterConfig(WhitespaceAnalyzer())
    stamp_sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG))
    config.setIndexSort(stamp_sort)
    writer = IndexWriter(directory, config)
    reader = writer.getReader()
    return writer, reader, IndexSearcher(reader)
def getWriter(self, store, analyzer=None, create=False):
    """Return an IndexWriter over *store*.

    analyzer defaults to a WhitespaceAnalyzer and is wrapped in a
    10,000-token LimitTokenCountAnalyzer. *create* switches the writer
    to CREATE mode (truncates an existing index).
    """
    base = analyzer if analyzer is not None \
        else WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    limited = LimitTokenCountAnalyzer(base, 10000)
    config = IndexWriterConfig(Version.LUCENE_CURRENT, limited)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, config)
def main(title, judge):
    """Attach to the JVM, open STORE_DIR and delegate the search for
    (*title*, *judge*) to run(); returns run()'s result."""
    lucene.getVMEnv().attachCurrentThread()
    store = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(store))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    res = run(searcher, analyzer, title, judge)
    del searcher
    return res
def search_img(image):
    """Read *image* from disk and search the "index" store for it via
    run(); returns run()'s result."""
    img = cv2.imread(image)
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    result = run(searcher, analyzer, img)
    # BUGFIX: the original `del searcher` came after `return` and was
    # unreachable dead code; release the searcher before returning.
    del searcher
    return result
def __init__(self, analyser=None, file="prolog/wn_s.pl"):
    """Build a WordNet synonym map from a prolog synonym file.

    analyser: analyzer passed to WordnetSynonymParser. Defaults to a new
        WhitespaceAnalyzer created per call — the original default
        `analyser=WhitespaceAnalyzer()` was evaluated once at import time
        and shared across all instances (mutable-default pitfall).
    file: path to the WordNet prolog file (wn_s.pl).
    """
    if analyser is None:
        analyser = WhitespaceAnalyzer()
    self.parser = WordnetSynonymParser(True, True, analyser)
    # BUGFIX: close the prolog file after reading; the original
    # `open(file).read()` leaked the file handle.
    with open(file, 'r') as fh:
        pl_file = StringReader(fh.read())
    # Parse the prolog file with the WordnetSynonymParser, then build
    # the synonym map.
    self.parser.parse(pl_file)
    self.map = self.parser.build()
def init_search(search_content, vm_env): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() lucene.initVM() STORE_DIR = "zhihuindex" print 'lucene', lucene.VERSION directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) result_list = run(searcher, analyzer, search_content) del searcher return result_list
def __init__(self, path):
    """Create (or append to) a whitespace-analyzed index at *path* and
    index the documents via self.IndexDocs()."""
    if not Path(path).is_dir():
        os.mkdir(path)
    store_dir = SimpleFSDirectory(Paths.get(path))
    analyzer = WhitespaceAnalyzer(Version.LATEST)
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(store_dir, config)
    self.IndexDocs(writer)
def Search_text(command): STORE_DIR = "index" # lucene.initVM(vmargs=['-Djava.awt.headless=true']) vm_env.attachCurrentThread() print 'lucene', lucene.VERSION #base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) res = run(searcher, analyzer, command) del searcher return res
def getWriter(self, store, analyzer=None, create=False):
    """Return an IndexWriter over *store*; the analyzer (default
    WhitespaceAnalyzer) is capped at 10,000 tokens. *create* switches
    to CREATE mode."""
    if analyzer is None:
        analyzer = WhitespaceAnalyzer()
    limited = LimitTokenCountAnalyzer(analyzer, 10000)
    config = IndexWriterConfig(limited)
    if create:
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    return IndexWriter(store, config)
def text_search(command, cpage, meth): global vm_env, searcher, analyzer vm_env.attachCurrentThread() print 'lucene', lucene.VERSION directory = SimpleFSDirectory(File(STORE_TEXT_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) text, maxnum = runstext(command, cpage, meth) del searcher return text, maxnum
def testNull(self):
    """WhitespaceAnalyzer must split on whitespace only, leaving
    punctuation and letter case untouched."""
    a = WhitespaceAnalyzer()
    cases = [
        ("foo bar FOO BAR", ["foo", "bar", "FOO", "BAR"]),
        ("foo bar . FOO <> BAR",
         ["foo", "bar", ".", "FOO", "<>", "BAR"]),
        ("foo.bar.FOO.BAR", ["foo.bar.FOO.BAR"]),
        ("U.S.A.", ["U.S.A."]),
        ("C++", ["C++"]),
        ("B2B", ["B2B"]),
        ("2B", ["2B"]),
        ("\"QUOTED\" word", ["\"QUOTED\"", "word"]),
    ]
    for text, expected in cases:
        self._assertAnalyzesTo(a, text, expected)
def init_search(search_content, vm_env): vm_env = lucene.getVMEnv() vm_env.attachCurrentThread() lucene.initVM() STORE_DIR = "index" print 'lucene', lucene.VERSION #base_dir = os.path.dirname(os.path.abspath(sys.argv[0])) directory = SimpleFSDirectory(File(STORE_DIR)) searcher = IndexSearcher(DirectoryReader.open(directory)) analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) result_s = run(searcher, analyzer, search_content) del searcher print(result_s) return result_s
def testAdd(self, goodname, salenum, price, shopname, url, picturename, comment, historyprice): analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT) analyzer = LimitTokenCountAnalyzer(analyzer, 1048576) config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND) writer = IndexWriter(self.dir, config) # True,建立新索引,False,建立增量索引 noIndexedString = FieldType() noIndexedString.setTokenized(False) noIndexedString.setIndexed(False) noIndexedString.setStored(True) try: print "adding", goodname goodname_s = unicode(goodname, 'utf8') seg_list_good = jieba.cut(goodname_s, cut_all=False) goodname_s = " ".join(seg_list_good) # 默认模式 shopname_s = unicode(shopname, 'utf8') seg_list_shop = jieba.cut(shopname_s, cut_all=False) shopname_s = " ".join(seg_list_shop) # 默认模式 shopnameField = Field("shopName", shopname, noIndexedString) shopnameField_s = TextField("shopName_s", shopname_s, Field.Store.NO) goodnameField = Field("goodName", goodname, noIndexedString) goodnameField_s = TextField("goodName_s", goodname_s, Field.Store.NO) salenumField = IntField("saleNum", salenum, Field.Store.YES) priceField = DoubleField("price", price, Field.Store.YES) urlField = Field("url", url, noIndexedString) pictureField = StringField("pictureName", picturename, Field.Store.YES) commentField = Field("comments", comment, noIndexedString) historyPriceField = Field("historyPrice", historyprice, noIndexedString) doc = Document() doc.add(shopnameField) doc.add(shopnameField_s) doc.add(goodnameField) doc.add(goodnameField_s) doc.add(salenumField) doc.add(priceField) doc.add(urlField) doc.add(pictureField) doc.add(commentField) doc.add(historyPriceField) writer.addDocument(doc) except Exception, e: print "Failed in indexDocs:", e
def search(self, query, field, maxReturnLimit):
    """Parse *query* against *field* with a WhitespaceAnalyzer and return
    the stored fields (id, pos, hallmarks, text) of up to
    *maxReturnLimit* hits."""
    analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
    parsed = QueryParser(Version.LUCENE_CURRENT, field,
                         analyzer).parse(query)
    hits = self.searcher.search(parsed, maxReturnLimit)
    result = []
    for hit in hits.scoreDocs:
        doc = self.searcher.doc(hit.doc)
        result.append({
            "id": doc.get("id"),
            "pos": doc.get("pos"),
            # hallmarks is stored whitespace-joined; split back to a list
            "hallmarks": doc.get("hallmarks").split(),
            "text": doc.get("text"),
        })
    return result
def user_profile(searcher, author):
    """Builds an user profile {comments, links, subreddits}"""
    analyzer = WhitespaceAnalyzer()
    query = QueryParser("author", analyzer).parse('"%s"' % author)
    profile = {'comments': set(), 'links': set(), 'subreddits': set()}
    for sd in searcher.search(query, N_DOCS).scoreDocs:
        doc = searcher.doc(sd.doc)
        profile['comments'].add(doc.get('name'))
        profile['links'].add(doc.get('link_id'))
        profile['subreddits'].add(doc.get('subreddit'))
    return profile