def __init__(self, root, storeDir, doIndex=False):
    """Open a Lucene index under `storeDir`, optionally rebuilding it from `root` first.

    When doIndex is True the index is recreated from scratch
    (OpenMode.CREATE overwrites any existing index).
    """
    self.analyzer = StandardAnalyzer()
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    if doIndex:
        store = SimpleFSDirectory(Paths.get(storeDir))
        # Cap the number of tokens indexed per field.
        capped_analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
        writer_config = IndexWriterConfig(capped_analyzer)
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, writer_config)
        self.indexDocs(root, writer)
        # Ticker runs on a background thread while the commit is in flight
        # (presumably a progress indicator — defined elsewhere in this file).
        ticker = Ticker()
        print("commit index")
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print("done")
    directory = SimpleFSDirectory(Paths.get(storeDir))
    self.searcher = IndexSearcher(DirectoryReader.open(directory))
def __init__(self, root, storedir, isindexing=False, isBM25=True):
    """Open a Lucene index at `storedir`, optionally (re)building it first.

    Parameters
    ----------
    root : source location handed to self.indexer() for document ingestion
    storedir : on-disk index directory (created when missing)
    isindexing : when True, rebuild the index (OpenMode.CREATE overwrites)
    isBM25 : when True, use BM25 similarity both at write and search time
    """
    if not os.path.exists(storedir):
        os.mkdir(storedir)
    # Cap the number of tokens indexed per field.
    self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)
    if isindexing:
        store = SimpleFSDirectory(Paths.get(storedir))
        config = IndexWriterConfig(self.analyzer)
        # TODO BM25 parameter tuning
        if isBM25:
            config.setSimilarity(BM25Similarity())
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)
        self.indexer(root, writer)
        # Ticker runs on a background thread while the commit is in flight
        # (presumably a progress indicator — defined elsewhere in this file).
        ticker = Ticker()
        print('commit index')
        threading.Thread(target=ticker.run).start()
        writer.commit()
        writer.close()
        ticker.tick = False
        print('done')
    search_dir = SimpleFSDirectory(Paths.get(storedir))
    self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
    if isBM25:
        # Searcher similarity must match the similarity used at index time.
        self.searcher.setSimilarity(BM25Similarity())
def main():
    """Deploy the newest AttackTreeDesigner .jmdac archive into the Modelio
    project and verify the module actually loaded.

    Returns 1 on failure (module not found after install); otherwise saves
    the core session.  Relies on script-level globals: session, coreSession,
    findModule, outputError.
    """
    root = session.getModel().getModelRoots().get(1)
    gproject = GProject.getProject(root)
    # Deploy Module (JMDAC File)
    module_jmdac_path = "/module_jmdac_archives"
    moduleArchivePattern = os.path.join(module_jmdac_path, "AttackTreeDesigner_*.jmdac")
    moduleArchives = glob.glob(moduleArchivePattern)
    assert len(moduleArchives) > 0, "No jmdac archive has been found !"
    # Reverse-sorted so the lexicographically newest archive comes first.
    moduleArchives.sort(reverse=True)
    print("deploying ", Paths.get(moduleArchives[0]))
    Modelio.getInstance().getModuleService().installModule(
        gproject, Paths.get(moduleArchives[0]))
    # test if module deployed correctly
    attackTreeDesignerModule = findModule("AttackTreeDesigner")
    if attackTreeDesignerModule is None:
        print("Tested module: not found. ABORT! <br/>")
        outputError("/errors_output/deploy-module.err",
                    "AttackTreeDesigner module not found")
        return 1
    else:
        print("Module AttackTreeDesigner found")
    coreSession.save(None)
def __init__(self, index_path, field, similarity="boolean", use_relevance_feedback=False, feedback_index_path=None):
    """Open a searcher over `index_path` with a selectable similarity.

    `similarity` is one of "boolean", "tf", "tfidf", "BM25"; any other
    value falls back to BM25(1.2, 0.2) with a warning.  An optional second
    index can be opened for relevance feedback.
    """
    self.reader = DirectoryReader.open(FSDirectory.open(Paths.get(index_path)))
    self.searcher = IndexSearcher(self.reader)
    if use_relevance_feedback and feedback_index_path is not None:
        self.feedback_reader = DirectoryReader.open(
            FSDirectory.open(Paths.get(feedback_index_path)))
        self.feedback_searcher = IndexSearcher(self.feedback_reader)
    self.similarity = similarity
    self.stopwords = stop_words()
    if similarity == "boolean":
        sim = BooleanSimilarity()
    elif similarity == "tf":
        sim = TFSimilarity()
    elif similarity == "tfidf":
        sim = ClassicSimilarity()
    else:
        if similarity != "BM25":
            print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
        sim = BM25Similarity(1.2, 0.2)
    self.searcher.setSimilarity(sim)
    analyzer = StandardAnalyzer()
    print(self.searcher.getSimilarity())
    self.parser = QueryParser(field, analyzer)
def __init__(self):
    """Boot the JVM, create/copy the Lucene index(es) if needed, open searchers.

    Python 2 code.  Two indexes may be maintained: the main one at
    prm.index_folder and (when docs_path differs from docs_path_term) a
    second term index.  Either can be mirrored to a local folder first.
    """
    self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                             vmargs=['-Djava.awt.headless=true'])
    self.vocab = None
    BooleanQuery.setMaxClauseCount(2048)
    if not os.path.exists(prm.index_folder):
        print 'Creating index at', prm.index_folder
        # Terms are only added when the main corpus doubles as the term corpus.
        if prm.docs_path == prm.docs_path_term:
            add_terms = True
        else:
            add_terms = False
        self.create_index(prm.index_folder, prm.docs_path, add_terms)
    if prm.local_index_folder:
        print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
        if os.path.exists(prm.local_index_folder):
            print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
        else:
            shutil.copytree(prm.index_folder, prm.local_index_folder)
        self.index_folder = prm.local_index_folder
    else:
        self.index_folder = prm.index_folder
    # NOTE(review): the searcher opens prm.index_folder, not
    # self.index_folder — the local copy made above is never searched.
    # Confirm whether this is intentional.
    fsDir = MMapDirectory(Paths.get(prm.index_folder))
    self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
    if prm.docs_path != prm.docs_path_term:
        if not os.path.exists(prm.index_folder_term):
            print 'Creating index at', prm.index_folder_term
            self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)
        if prm.local_index_folder_term:
            print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
            if os.path.exists(prm.local_index_folder_term):
                print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
            self.index_folder_term = prm.local_index_folder_term
        else:
            self.index_folder_term = prm.index_folder_term
        # NOTE(review): same pattern — opens prm.index_folder_term, not
        # self.index_folder_term.
        fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
        self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))
    self.analyzer = StandardAnalyzer()
    self.pool = ThreadPool(processes=prm.n_threads)
    self.cache = {}
    print 'Loading Title-ID mapping...'
    self.title_id_map, self.id_title_map = self.get_title_id_map()
def doInBackground(self):
    """SwingWorker body: download the tools list and tool jars, then unpack them.

    Progress is reported as integer codes via super__setProgress():
    0 start, 2 downloading list, 3 list download failed (abort),
    5 downloading jars, 6 jar download failed (abort), 7 extracting, 8 done.
    """
    # Initialize progress property.
    progress = 0
    self.super__setProgress(progress)
    # "\n download tools list"
    progress = 2
    self.super__setProgress(progress)
    self.delete_file(self.tmpToolsListFile)
    if not self.download_file(self.app.toolsListUrl, self.tmpToolsListFile):
        # " I cannot download the tools list."
        progress = 3
        self.super__setProgress(progress)
        return
    toolsRefs = read_tools_list(self.tmpToolsListFile)
    # Download tools data as jar files
    progress = 5
    self.super__setProgress(progress)
    self.jarDir = File.separator.join([self.app.SCRIPTDIR, "tools", "jar"])
    if not File(self.jarDir).exists():
        File(self.jarDir).mkdir()
    else:
        # delete old files
        for jarFileName in File(self.jarDir).list():
            File(File.separator.join([self.jarDir, jarFileName])).delete()
    # download new files
    for toolRef in toolsRefs:
        jarFileName = "%s.jar" % toolRef
        jarUrl = "%s/%s" % (self.app.jarBaseUrl, jarFileName)
        jarFilePath = File.separator.join([self.jarDir, jarFileName])
        answer = self.download_file(jarUrl, jarFilePath)
        if not answer:
            # " I cannot download the tools file"
            progress = 6
            self.super__setProgress(progress)
            return
    # Extract tools data from jar files
    self.toolsDir = File.separator.join([self.app.SCRIPTDIR, "tools", "data"])
    progress = 7
    self.super__setProgress(progress)
    self.extract_tools_data_from_jar_files()
    # Replace the persistent tools list with the freshly downloaded copy,
    # then remove the temporary file.
    self.delete_file(self.toolsListFile)
    Files.copy(Paths.get(self.tmpToolsListFile), Paths.get(self.toolsListFile))
    self.delete_file(self.tmpToolsListFile)
    progress = 8
    self.super__setProgress(progress)
def doInBackground(self):
    """SwingWorker body: download the tools list and tool jars, then unpack them.

    NOTE(review): this method is token-for-token identical to another
    doInBackground in this file — consider deduplicating.

    Progress codes via super__setProgress(): 0 start, 2 downloading list,
    3 list download failed (abort), 5 downloading jars, 6 jar download
    failed (abort), 7 extracting, 8 done.
    """
    # Initialize progress property.
    progress = 0
    self.super__setProgress(progress)
    # "\n download tools list"
    progress = 2
    self.super__setProgress(progress)
    self.delete_file(self.tmpToolsListFile)
    if not self.download_file(self.app.toolsListUrl, self.tmpToolsListFile):
        # " I cannot download the tools list."
        progress = 3
        self.super__setProgress(progress)
        return
    toolsRefs = read_tools_list(self.tmpToolsListFile)
    # Download tools data as jar files
    progress = 5
    self.super__setProgress(progress)
    self.jarDir = File.separator.join([self.app.SCRIPTDIR, "tools", "jar"])
    if not File(self.jarDir).exists():
        File(self.jarDir).mkdir()
    else:
        # delete old files
        for jarFileName in File(self.jarDir).list():
            File(File.separator.join([self.jarDir, jarFileName])).delete()
    # download new files
    for toolRef in toolsRefs:
        jarFileName = "%s.jar" % toolRef
        jarUrl = "%s/%s" % (self.app.jarBaseUrl, jarFileName)
        jarFilePath = File.separator.join([self.jarDir, jarFileName])
        answer = self.download_file(jarUrl, jarFilePath)
        if not answer:
            # " I cannot download the tools file"
            progress = 6
            self.super__setProgress(progress)
            return
    # Extract tools data from jar files
    self.toolsDir = File.separator.join([self.app.SCRIPTDIR, "tools", "data"])
    progress = 7
    self.super__setProgress(progress)
    self.extract_tools_data_from_jar_files()
    # Replace the persistent tools list with the freshly downloaded copy,
    # then remove the temporary file.
    self.delete_file(self.toolsListFile)
    Files.copy(Paths.get(self.tmpToolsListFile), Paths.get(self.toolsListFile))
    self.delete_file(self.tmpToolsListFile)
    progress = 8
    self.super__setProgress(progress)
def __init__(self, directory):
    """Open FS-backed search and taxonomy directories beneath `directory`."""
    self.directory = directory
    # The search index and the taxonomy index live in separate subfolders.
    index_path = os.path.join(self.directory, INDEX_DIR)
    taxonomy_path = os.path.join(self.directory, TAXONOMY_DIR)
    self.indexDir = FSDirectory.open(Paths.get(index_path))
    self.taxoDir = FSDirectory.open(Paths.get(taxonomy_path))
    # Facet configuration: "Categories" is both hierarchical and multi-valued.
    self.facets_config = FacetsConfig()
    self.facets_config.setHierarchical("Categories", True)
    self.facets_config.setMultiValued("Categories", True)
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=None):
    """Search a Lucene index and print results, optionally running one round
    of Rocchio query expansion from interactive relevance judgements.

    Parameters
    ----------
    indexfile : path of the Lucene index directory
    querytext : query string parsed against `default_field`
    top : number of hits to retrieve
    qe : when True and there are hits, prompt for relevant result numbers
         and re-search with an expanded query
    display_fields : stored fields printed per hit; defaults to
         ["subreddit", "author", "text"].  (Fixed: previously a mutable
         list default argument.)
    """
    if display_fields is None:
        display_fields = ["subreddit", "author", "text"]
    lucene.initVM()
    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)
    analyser = StandardAnalyzer()
    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)
    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        # User enters 1-based result numbers; convert to Lucene doc ids.
        relevantids = [docIDs[i-1] for i in [int(x) for x in input().split()]]
        nonrelevantids = [id for id in docIDs if id not in relevantids]
        print("\n\n")
        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)
    ireader.close()
    lindex.close()
def find_all_text_occurrences(objects: list) -> (dict, DirectoryReader):
    """Map each (possibly multi-word) object string to the set of Lucene doc
    ids whose 'contents' field contains every one of its tokens.

    Returns (lookup dict, open DirectoryReader); the caller is responsible
    for closing the reader.
    """
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    # Raw string fixes the invalid '\G', '\B', '\w' escape sequences the
    # original non-raw literal relied on (deprecated since Python 3.6).
    index_path = Paths.get(r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT")
    directory = FSDirectory.open(index_path)
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())
    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )
    # Renamed loop variable: the original shadowed the builtin `object`.
    for obj in objects:
        tokens = obj.split(' ')
        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            doc_sets.append(set(topdoc.doc for topdoc in topdocs.scoreDocs))
        # Documents must match every token of the object.
        docs_lookup[obj] = set.intersection(*doc_sets)
    return docs_lookup, reader
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False, is_bigram_cache_used=False, mongoObj=None):
    """Open a memory-mapped Lucene index, optionally with MongoDB bigram caches.

    `lucene_vm_flag` signals that the caller already started the JVM;
    `mongoObj` supplies the database holding per-index cache collections.
    """
    # Start the JVM only if the caller has not done so already.
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    self.searcher = IndexSearcher(self.reader)
    self.dict_term_freq = {}
    if similarity == 'BM25':
        self.searcher.setSimilarity(BM25Similarity())
    # Optional MongoDB-backed caches of bigram term/collection frequencies,
    # keyed by the index directory's basename.
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        path_sep = '/' if '/' in self.index_dir else '\\'
        index_name = self.index_dir.split(path_sep)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
def __init__(self, path, analyzer, topn=DEF_TOPN):
    """Wrap an IndexSearcher over the index stored at `path`.

    `topn` is the default number of results (DEF_TOPN module constant).
    """
    self.path = path
    self._analyzer = analyzer
    self.topn = topn
    absolute = os.path.abspath(self.path)
    self._store = SimpleFSDirectory(Paths.get(absolute))
    self._searcher = IndexSearcher(DirectoryReader.open(self._store))
def __init__(self, searchDir):
    """Open reader/searcher over `searchDir` with the project's English analyzer."""
    # Same stop-word set the indexer used, so queries analyze consistently.
    self.analyzer = MyPythonEnglishAnalyzer(stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
    self.directory = FSDirectory.open(Paths.get(searchDir))
    self.reader = DirectoryReader.open(self.directory)
    self.searcher = IndexSearcher(self.reader)
def create_index_for_wiki_sentence(filename, path, firstTime=False):
    """Build (or just open) the Lucene index of wiki sentences.

    NOTE(review): the `filename` parameter is immediately overwritten with
    '_wiki_sentence', so it is effectively unused — confirm intent.

    Documents are only written when firstTime is True; otherwise the
    existing index directory is simply returned.
    """
    logging.info('Start create wiki_sentence!')
    wiki_dict = get_wiki_data(path)
    logging.info('Start creating index!')
    filename = '_wiki_sentence'
    analyzer = analysis.standard.StandardAnalyzer()
    # # Store the index in memory:
    base_dir = HOMEPATH
    INDEX_DIR = "IndexFiles" + filename + ".index"
    storeDir = os.path.join(base_dir, INDEX_DIR)
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = SimpleFSDirectory(Paths.get(storeDir))
    if firstTime:
        config = index.IndexWriterConfig(analyzer)
        iwriter = index.IndexWriter(directory, config)
        for cnt, key in enumerate(wiki_dict.keys()):
            if cnt % 1000 == 0:
                logging.info(
                    'I have preprocessed {} index in creating index by document!'
                    .format(str(cnt)))
            # Each key is (original title, preprocessed title, doc id);
            # the value is the sentence text.
            org_title = key[0]
            preprocessed_title = key[1]
            doc_id = key[2]
            sentence = wiki_dict[key]
            doc = create_document_by_document_sentence(org_title,
                                                       preprocessed_title,
                                                       doc_id, sentence)
            iwriter.addDocument(doc)
        iwriter.close()
    logging.info('Finish creating index wiki_sentence!')
    return directory
def main():
    """Back up the *.py source files into the index folder, then build the index.

    Does nothing when LUCENE_INDEX_DIR already exists.
    """
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except Exception:
        # initVM raises when a JVM is already attached to this process.
        print('JavaVM already running')
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)
    # write data to index
    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            # %cd% expands to the current directory inside the Windows shell.
            cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%', LUCENE_INDEX_DIR)
        else:
            # BUG FIX: the POSIX branch previously used a Windows '\'
            # separator ('%s\code_files') and never created the target
            # directory, so the copy always failed.
            cmd = 'mkdir -p %s/code_files && cp -f *.py %s/code_files' % (
                LUCENE_INDEX_DIR, LUCENE_INDEX_DIR)
        os.system(cmd)
        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def retriever(file_dir):
    """For every test AST query, retrieve the single closest training example
    from the Lucene index and write its source/summary to the output files.

    Exits the process with -1 if any query returns no hit.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)
    with open(file_dir + "/train/train.spl.src", 'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        # Strip query-syntax operators and non-word characters so raw code
        # cannot be misparsed by the Lucene query parser.
        # FIX: raw string — '\W'/'\s' were invalid escapes in a plain literal.
        queries = [
            re.sub(r"[\W\s]+|AND|NOT|OR", ' ', line.strip())
            for line in ft.readlines()
        ]
        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False
            for hit in hits:
                doc = searcher.doc(hit.doc)
                # NOTE(review): eval() on a stored field — trusted local
                # index, but int(doc.get("id")) would be safer if ids are
                # plain integers.
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                print(query)
                print(hits)
                exit(-1)
def getDoc(self, file):
    """Parse an HTML file under FILE_DIR into a Lucene Document with
    'contents', 'filename' and 'filepath' fields.

    Returns None when the file cannot be read or parsed.
    """
    try:
        full_path = os.getcwd() + FILE_DIR + '/' + file
        # FIX: the original leaked the file handle; `with` closes it.
        with open(full_path, "r") as f:
            try:
                c = []
                s = BeautifulSoup(f, 'html.parser')
                text = s.findAll(text=True)
                c = filter(tag_vis, text)
                try:
                    c = ' '.join(c)
                except Exception as e:
                    # Fall back to bytes join when the parts are bytes.
                    c = b' '.join(c)
            except Exception as e:
                print(str(e))
                return
        content = TextField("contents", c, Field.Store.YES)
        fileName = str(Paths.get(file)).split('/')[-1]
        fileName = fileName[:fileName.find(".")]
        filename = TextField("filename", fileName, Field.Store.YES)
        path = TextField("filepath", str(os.getcwd() + FILE_DIR + '/' + file), Field.Store.NO)
        doc = Document()
        doc.add(content)
        doc.add(filename)
        doc.add(path)
        return doc
    except Exception as e:
        # FIX: was type(Exception).__name__, which always printed 'type'
        # instead of the actual exception class name.
        print(type(e).__name__)
        print(str(e))
        return
def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
    """Constructor of Indexer.

    Parameters
    ----------
    index_dir : string
        The location of lucene index
    mode : string
        The mode when opening lucene index. Available values are:
        'create', open new index and overwriting over index,
        'append', open existed index and append.
        'create_or_append', if `index_dir` exists, 'append', else 'create'
    date_format : string
        We save datetime field as string, `date_format` specify how to
        format datetime into string.

    Raises
    ------
    ValueError
        If `mode` is not one of the three supported values.
    """
    self.store = FSDirectory.open(Paths.get(index_dir))
    self.analyzer = StandardAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.mode = mode
    self.date_format = date_format
    if mode == 'create_or_append':
        self.config.setOpenMode(
            IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    elif mode == 'create':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    elif mode == 'append':
        self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
    else:
        # FIX: was ValueError('Invalid mode %s', mode) — the format string
        # was passed as two arguments and never interpolated.
        raise ValueError('Invalid mode %s' % mode)
    self.writer = IndexWriter(self.store, self.config)
def __init__(self, LUCENE_INDEX_DIR, similarity='BM25', lucene_vm_flag=False, is_bigram_cache_used=False, mongoObj=None):
    """Open a memory-mapped Lucene index (as a list of searchers) plus
    optional MongoDB bigram/mapping-probability caches.

    Depends on the module-level LIST_F to decide which mapping-probability
    collection to use.  `lucene_vm_flag` signals the JVM is already running.
    """
    if lucene_vm_flag == False:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
    self.index_dir = LUCENE_INDEX_DIR
    self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    self.analyzer = SimpleAnalyzer()
    self.config = IndexWriterConfig(self.analyzer)
    self.reader = DirectoryReader.open(self.index_mm)
    # Kept as a list — additional searchers may be appended elsewhere.
    self.searchers = []
    self.searchers.append(IndexSearcher(self.reader))
    if similarity == 'BM25':
        (self.searchers[0]).setSimilarity(BM25Similarity())
    # Optional MongoDB caches keyed by the index directory's basename.
    self.is_bigram_cache_used = is_bigram_cache_used
    if is_bigram_cache_used == True:
        seperate_char = '/' if self.index_dir.find('/') > -1 else '\\'
        index_name = self.index_dir.split(seperate_char)[-1]
        self.index_name = index_name
        self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
        self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
        # Wikipedia-augmented mapping cache when wikipedia fields are in use.
        if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
            self.conn_mapping_prob_cache = mongoObj.db[
                index_name + '_mapping_prob_cache_with_wikipedia']
        else:
            self.conn_mapping_prob_cache = mongoObj.db[
                index_name + '_mapping_prob_cache']
def retrieve_sents(self):
    """Search self.indexDir for self.query (OR semantics over 'contents');
    return the Lucene doc ids of the top 50 matches.

    Python 2 code (print >> syntax in the stats branch).
    """
    indexDir = self.indexDir
    query = self.query
    sent_ind_list = []
    # template = CustomTemplate(format)
    fsDir = SimpleFSDirectory(Paths.get(indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    # Any query term may match (OR), unlike the AND parsers elsewhere.
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(query)
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start
    if self.stats:
        print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
            len(scoreDocs), duration, query)
    for scoreDoc in scoreDocs:
        # Only the internal doc id is collected, not the stored document.
        sent_ind_list.append(scoreDoc.doc)
    return sent_ind_list
def main():
    """Build the TREC v15 Wikipedia stemmed index unless it already exists,
    backing up the *.py source files into the index folder first."""
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except:
        # initVM raises when a JVM is already attached to this process.
        print('JavaVM already running')
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0) # experimental setting !!
    # write data to index
    if not is_index_Exist:
        #if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            # %cd% expands to the current directory in the Windows shell.
            os.system('robocopy %s %s\code_files *.py' %
                      (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir %s/code_files' % (LUCENE_INDEX_DIR))
            os.system('cp *.py %s/code_files' % (LUCENE_INDEX_DIR))
        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
def __init__(self, path=INDEX_DIR):
    """Initialise the JVM plus the Lucene analyzer, reader and searcher.

    Uses SmartChineseAnalyzer for Chinese word segmentation.
    """
    lucene.initVM()
    self.indir = SimpleFSDirectory(Paths.get(path))
    self.analyzer = SmartChineseAnalyzer()
    self.reader = DirectoryReader.open(self.indir)
    self.searcher = IndexSearcher(self.reader)
def main():
    """Flask view: run the 'consulta' query parameter against the local
    Lucene index and render the top-10 hits."""
    resultados = []
    indice_vacio = len(os.listdir("./lucene/index")) == 0
    if not indice_vacio:
        consulta = request.args.get("consulta", None)
        if consulta is not None:
            directory = SimpleFSDirectory(Paths.get("./lucene/index"))
            searcher = IndexSearcher(DirectoryReader.open(directory))
            analyzer = SpanishAnalyzer()
            query = QueryParser("texto", analyzer).parse(consulta)
            for sd in searcher.search(query, 10).scoreDocs:
                doc = searcher.doc(sd.doc)
                resultados.append({
                    "url": direccion_base + doc.get("pdf"),
                    "titulo": doc.get("titulo")
                })
    return render_template("main.html",
                           lucene=lucene.VERSION,
                           indice_vacio=indice_vacio,
                           resultados=resultados)
def retrieve(command):
    """Run an all-terms-required search for `command`; return matched file paths.

    Python 2 code.  Each whitespace-separated term is prefixed with '+'
    (Lucene 'required' operator) to turn the default OR query into AND.
    """
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except ValueError:
        print "JVM running."
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    # to convert to AND query: prefix every term with '+'
    command = re.sub(r' ', r' +', command)
    command = "+" + command
    print "Searching for:", command
    query = QueryParser("contents", analyzer).parse(command)
    print query
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)
    retrieved_docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        # Full path of the matched document = stored dir path + file name.
        retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name")))
    del searcher
    return retrieved_docs
def __init__(self, indexDir):
    """Open a searcher over `indexDir` plus AND-default parsers for the
    'name' and 'id' fields."""
    self.directory = SimpleFSDirectory(Paths.get(indexDir))
    self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
    # Both parsers require every query term to match (AND semantics).
    for attr, field in (("nameQueryParser", "name"), ("idQueryParser", "id")):
        parser = QueryParser(field, StandardAnalyzer())
        parser.setDefaultOperator(QueryParser.Operator.AND)
        setattr(self, attr, parser)
def create_index(self, index_folder):
    """Create a new Lucene index at `index_folder` and add every document
    from self.doc_db, then force-merge to one segment.

    Fails if `index_folder` already exists (os.mkdir raises).
    """
    os.mkdir(index_folder)
    # t1: stored, docs-only postings (no freqs/positions).
    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)
    # t2: stored, full postings with positions (enables phrase queries).
    self.t2 = FieldType()
    self.t2.setStored(True)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
    # t3: stored only, not indexed.
    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)
    fsDir = MMapDirectory(Paths.get(index_folder))
    # Custom analyzer/similarity defined elsewhere in this project.
    writerConfig = IndexWriterConfig(
        MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
    writerConfig.setSimilarity(MyTFIDFSimilarity())
    writerConfig.setRAMBufferSizeMB(16384.0)  # 14g
    self.writer = IndexWriter(fsDir, writerConfig)
    logger.info(f"{self.writer.numDocs()} docs in index")
    logger.info("Indexing documents...")
    doc_ids = self.doc_db.get_doc_ids()
    for doc_id in tqdm(doc_ids, total=len(doc_ids)):
        text = self.doc_db.get_doc_text(doc_id)
        tokens = self.doc_db.get_doc_tokens(doc_id)
        self.add_doc(doc_id, text, tokens)
    logger.info(f"Indexed {self.writer.numDocs()} docs.")
    self.writer.forceMerge(1)  # to increase search performance
    self.writer.close()
def __init__(self, index_store_path):
    """Create an IndexWriter that appends to (or creates) the index at
    `index_store_path`."""
    analyzer = StandardAnalyzer()
    writer_config = IndexWriterConfig(analyzer)
    # Reuse an existing index when present; create it otherwise.
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    storage = NIOFSDirectory(Paths.get(index_store_path))
    self.writer = IndexWriter(storage, writer_config)
def build_index(document_path, dir_path):
    """Index lines of the form '<music_path> <tag,tag,...>' from
    `document_path` into a fresh Lucene index at `dir_path`."""
    lucene.initVM()
    config = IndexWriterConfig(StandardAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(SimpleFSDirectory(Paths.get(dir_path)), config)
    # 'content' is tokenised and stored with docs+freqs; 'url' is stored verbatim.
    content_type = FieldType()
    content_type.setStored(True)
    content_type.setTokenized(True)
    content_type.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    url_type = FieldType()
    url_type.setStored(True)
    url_type.setTokenized(False)
    with open(document_path) as input_file:
        for line in input_file:
            segs = line.strip().split(" ")
            music_path, music_tags = segs[0], segs[1].split(",")
            document = Document()
            document.add(Field("content", " ".join(music_tags), content_type))
            document.add(Field("url", music_path, url_type))
            index_writer.addDocument(document)
    index_writer.close()
def __init__(self, indexDir):
    """Create a fresh (overwriting) IndexWriter on `indexDir`."""
    self._dir = SimpleFSDirectory(Paths.get(indexDir))
    writer_config = IndexWriterConfig(StandardAnalyzer())
    # CREATE wipes any existing index at this location.
    writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    self._writer = IndexWriter(self._dir, writer_config)
def create_index(self, index_folder, docs_path, add_terms=False):
    """Create a Lucene index at `index_folder` from the HDF5 corpus at
    `docs_path` (Python 2 code).

    Fails if `index_folder` already exists (os.mkdir raises).
    """
    os.mkdir(index_folder)
    # t1: stored, docs-only postings.
    self.t1 = FieldType()
    self.t1.setStored(True)
    self.t1.setIndexOptions(IndexOptions.DOCS)
    # t2: not stored, docs+freqs postings.
    self.t2 = FieldType()
    self.t2.setStored(False)
    self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
    # t3: stored only, not indexed.
    self.t3 = FieldType()
    self.t3.setStored(True)
    self.t3.setIndexOptions(IndexOptions.NONE)
    fsDir = MMapDirectory(Paths.get(index_folder))
    writerConfig = IndexWriterConfig(StandardAnalyzer())
    self.writer = IndexWriter(fsDir, writerConfig)
    print "%d docs in index" % self.writer.numDocs()
    print "Indexing documents..."
    doc_id = 0
    # Deferred import: corpus_hdf5 is only needed when building the index.
    import corpus_hdf5
    corpus = corpus_hdf5.CorpusHDF5(docs_path)
    for txt in corpus.get_text_iter():
        title = corpus.get_article_title(doc_id)
        self.add_doc(doc_id, title, txt, add_terms)
        if doc_id % 1000 == 0:
            print 'indexing doc', doc_id
        doc_id += 1
    print "Index of %d docs..." % self.writer.numDocs()
    self.writer.close()
def openStore(self, base_path="D:/IR Dataset/", index_name="projectIndexFiles.index"):
    """Open the SimpleFSDirectory holding the project index.

    Generalized: the previously hard-coded dataset location and index
    folder name are now parameters whose defaults reproduce the original
    behaviour, so existing callers are unaffected.

    Returns the opened SimpleFSDirectory.
    """
    base_dir = os.path.dirname(os.path.abspath(base_path))
    storeDir = os.path.join(base_dir, index_name)
    store = SimpleFSDirectory(Paths.get(storeDir))
    return store
def __init__(self, store_dir):
    """Open (creating if needed) the on-disk index directory.

    The searcher is left as None — built lazily elsewhere.
    Python 2 code (octal literal 0777).
    """
    self.store_dir = store_dir
    if not os.path.exists(store_dir):
        # 0777: world-writable directory — presumably deliberate for a
        # shared index; TODO confirm.
        os.mkdir(store_dir, 0777)
    self.store = SimpleFSDirectory(Paths.get(store_dir))
    self.searcher = None
    # Cap the number of tokens analyzed per field.
    self.analyzer = StandardAnalyzer()
    self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
def __init__(self, file_name):
    """Parse `file_name` as XML via the Java DOM API and normalise the document."""
    # Resolve to an absolute path before handing it to java.io.File.
    self.file_name = Paths.get(file_name).toAbsolutePath().toString()
    self.file = File(self.file_name)
    self.db_factory = DocumentBuilderFactory.newInstance()
    self.db_builder = self.db_factory.newDocumentBuilder()
    self.doc = self.db_builder.parse(self.file)
    self.doc.getDocumentElement().normalize()
def _getLucene(self, path):
    """Open the index at `path`; return (writer, near-real-time reader, searcher).

    IndexWriterConfig(None): no default analyzer is configured —
    presumably fields are pre-analysed by callers; TODO confirm.
    """
    store = FSDirectory.open(Paths.get(path))
    cfg = IndexWriterConfig(None)
    cfg.setRAMBufferSizeMB(256.0)  # faster
    cfg.setUseCompoundFile(False)  # faster, for Lucene 4.4 and later
    writer = IndexWriter(store, cfg)
    reader = writer.getReader()
    return writer, reader, IndexSearcher(reader)
def extract_file_from_jar(config_file):
    """Copy `config_file` from the classpath into a temporary file.

    Returns the temp file's absolute path, or None when the resource
    cannot be found on the classpath.
    """
    import os  # needed for os.close on the raw file descriptor
    file_url = LoaderUtil.getResourceBySelfClassLoader(config_file)
    if not file_url:
        return None
    # FIX: tempfile.mkstemp() returns an OS-level file descriptor (an int),
    # not a file object — the original `tmp_file.close()` raised
    # AttributeError.  Close the fd with os.close().
    fd, tmp_abs_path = tempfile.mkstemp()
    os.close(fd)
    Files.copy(file_url.openStream(), Paths.get(tmp_abs_path),
               StandardCopyOption.REPLACE_EXISTING)
    return tmp_abs_path
def __init__(self, store_dir):
    """Open (creating if needed) the index directory and an IndexWriter on it.

    Python 2 code (octal literal 0777).
    """
    self.store_dir = store_dir
    if not os.path.exists(store_dir):
        # 0777: world-writable directory — presumably deliberate for a
        # shared index; TODO confirm.
        os.mkdir(store_dir, 0777)
    self.store = SimpleFSDirectory(Paths.get(store_dir))
    # Cap the number of tokens analyzed per field.
    self.analyzer = StandardAnalyzer()
    self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
    self.config = IndexWriterConfig(self.analyzer)
    self.writer = IndexWriter(self.store, self.config)
def getLucene(path):
    """Open a whitespace-analyzed index at `path`, sorted by the numeric
    stamp field; return (writer, near-real-time reader, searcher)."""
    cfg = IndexWriterConfig(WhitespaceAnalyzer())
    # Keep segments sorted by the stamp so range scans are ordered on disk.
    cfg.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(FSDirectory.open(Paths.get(path)), cfg)
    reader = writer.getReader()
    return writer, reader, IndexSearcher(reader)
def readFileToStructure(self, path, structure):
    """Populate `structure` from the file at `path`; returns the same structure.

    Sets a PDB header titled with the file name, chains read from the
    model, and crystallographic info hard-coded to space group "P 1"
    with a cubic Bravais lattice.
    """
    header = PdbHeader()
    header.setTitle(Paths.get(path).getFileName().toString())
    structure.setPDBHeader(header)
    model = ReadFile.getModelFromFile(self, path)
    structure.setChains(Lists.newArrayList(model))
    info = PdbCryst()
    info.setSpaceGroup(SpaceGroup(0, 1, 1, "P 1", "P 1", BravaisL.CUBIC))
    # Unit cell derived from the space group's example cell.
    info.setCrystalCell(ReadFile.getBox(
        self, info.getSpaceGroup().getBravLattice().getExampleUnitCell()))
    header.setCrystallographicInfo(info)
    return structure
def download_file(self, url, filePath):
    """Downloads a file form url and save it as filePath

    Python 2 / Jython code (old except syntax).  Returns True on success,
    False when the host is unknown or the socket fails.
    """
    try:
        print "\ndownloading"
        print url
        print filePath
        # java.net stream copied straight to disk via java.nio Files.
        inputStream = URI.create(url).toURL().openStream()
        Files.copy(inputStream, Paths.get(filePath))
        return True
    except (UnknownHostException, SocketException), e:
        print e
        print "I cannot download:\n%s" % url
        return False
def __init__(self, dbName, dropDB=False):
    """Attach the current thread to the shared JVM and open the per-database
    Lucene index directory.

    attachCurrentThread(name, asDaemon): before a thread created in Python
    or elsewhere but not in the Java VM can be used with the Java VM, this
    method needs to be invoked.  The two arguments it takes are optional
    and self-explanatory.

    `dbName` has the form '<user>_<db>'; the index lives under
    ./files/<user>/<db>/LuceneIndex.  When dropDB is True, any existing
    index directory is removed first.
    """
    #self.initObject = lucene.initVM() #default 2048? #vmargs=['-Djava.awt.headless=true']
    #self.initObject.attachCurrentThread('LuceneDB', True)
    # luceneVM is a module-level handle to the already-initialised JVM.
    luceneVM.attachCurrentThread('LuceneDB')
    self.analyzer = StandardAnalyzer()  # split on whitespace, no truncation or stemming
    self.indexDir = None
    self.searcher = None
    (user, db) = dbName.split('_', 1)
    directory = "./files/" + user + '/' + db + '/LuceneIndex'
    if dropDB:
        shutil.rmtree(directory)
    self.indexDir = SimpleFSDirectory(Paths.get(directory))  # creates directory if not exists
def loadResource(self, u):
    """Open resource `u` from the system classpath as a java InputStream."""
    sysloader = self.java.lang.ClassLoader.getSystemClassLoader()
    return sysloader.getResourceAsStream(u)

# Jython/IDV bootstrap script: imports live here (mid-file) because this
# chunk is executed as a script inside the IDV scripting environment.
import java.nio.file.Files as Files
import java.nio.file.Paths as Paths
import java.lang.System as System
import java.util.List
from java.awt import *
import ucar.unidata.idv.DefaultIdv as DefaultIdv
import ucar.unidata.idv.ui.ImageGenerator as ImageGenerator

# Start a (headless) IDV instance and its ISL image-generation interpreter.
idv = DefaultIdv([])
islInterpreter = ImageGenerator(idv)

# need to load a few resources from the classpath
my_files = ["ucar/unidata/idv/resources/python/shell.py",
            "ucar/unidata/idv/resources/python/isl.py"]
cpl = resourceLoader()
tmpfile = System.getProperty("java.io.tmpdir") + "/idv.py"
for f in my_files:
    # Copy each classpath resource to a temp file, execute it (Jython 2
    # execfile), then delete the temp file.
    inpstr = cpl.loadResource(f)
    path = Paths.get(tmpfile)
    Files.copy(inpstr, path)
    execfile(tmpfile)
    Files.delete(path)
# NOTE(review): the four statements below are the tail of an indexing helper
# whose `def` line lies before this chunk; indentation is reconstructed.
doc.add(Field("synopsis", synopsis.strip(), TextField.TYPE_STORED))
# 'keywords' aggregates all searchable text but is not stored.
doc.add(Field("keywords", ' '.join((command, name, synopsis, description)), TextField.TYPE_NOT_STORED))
doc.add(Field("filename", os.path.abspath(path), StringField.TYPE_STORED))
writer.addDocument(doc)

if __name__ == '__main__':
    # Python 2 script: index every man page directory found on MANPATH.
    if len(sys.argv) != 2:
        print "Usage: python manindex.py <index dir>"
    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(Paths.get(sys.argv[1]))
        analyzer = StandardAnalyzer()
        # Cap fields at 10000 tokens so huge man pages don't bloat the index.
        analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
        config = IndexWriterConfig(analyzer)
        writer = IndexWriter(directory, config)
        manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep)
        for dir in manpath:
            print "Crawling", dir
            for name in os.listdir(dir):
                path = os.path.join(dir, name)
                if os.path.isdir(path):
                    indexDirectory(path)
        writer.commit()
        writer.close()
def realPathName(path):
    """Resolve `path` relative to the netshell root; returns a java.nio.file.Path."""
    return Paths.get(sys.netshell_root.toString(), path)
# Command-line search script (Python 2).  `options`, `args` and the defaults
# for `format`/`indexDir` are defined earlier in the file, outside this chunk.
stats = False
for o, a in options:
    if o == "--format":
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    # Use '#'-prefixed placeholders instead of string.Template's default '$'.
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))
analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
# Every query term must match (AND semantics).
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" % (len(scoreDocs), duration, query)
for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    # NOTE(review): this statement continues past the end of this chunk.
    table = dict((field.name(), field.stringValue())
def create( name = "Launcher", bundle = [], platforms=["mac", "win"], outdir="dist.platforms", ignorelibs=["*video*"] ):
    """Creates a launcher for the given platform"""
    # Builds per-platform launcher bundles under <sketch root>/<outdir> by
    # unpacking a packaged launcher zip for each requested platform.
    import jycessing.Runner as Runner
    import jycessing.launcher.StandaloneSketch as StandaloneSketch
    import sys

    # Check if we should bail out - we're not running from a standalone sketch
    if not isinstance(Runner.sketch, StandaloneSketch):
        print >>sys.stderr, "Don't use launcher.create() from processing - use the export button instead!"
        return

    # Check if we are already deployed. In that case,
    # don't do anything
    if "--internal" in sys.argv:
        return

    # Our own imports
    import jycessing.launcher.LaunchHelper as LaunchHelper
    import java.lang.System as System
    import java.nio.file.Paths as Paths
    import os, shutil, zipfile, inspect, stat, glob, errno

    main = System.getProperty("python.main")
    mainroot = System.getProperty("python.main.root")

    # NOTE(review): rebinds the `outdir` parameter to an absolute path.
    outdir = mainroot + "/" + outdir

    # Clean the outdir ... (best effort: ignore a missing directory)
    try:
        shutil.rmtree(outdir)
    except:
        pass

    def copyeverything(src, dst):
        """The Machine That Copies EVERYTHING. 
        https://www.youtube.com/watch?v=ibEdgQJEdTA """
        # Recursive copy that falls back to a flat file copy for non-dirs.
        import shutil, errno
        try:
            shutil.copytree(src, dst)
        except OSError as exc:
            if exc.errno == errno.ENOTDIR:
                shutil.copy(src, dst)
            else:
                raise

    def copyjars(root):
        """Copy jars & co"""
        # Copies the sketch's main jar plus its library dirs into root/.
        sketch = Runner.sketch
        _mainjar = sketch.getMainJarFile()
        mainjar, mainjarname = _mainjar.getAbsolutePath(), _mainjar.getName()
        shutil.copyfile(mainjar, root + "/" + mainjarname)
        libraries = sketch.getLibraryDirectories()
        for lib in libraries:
            # Skip library patterns the caller excluded (default: "*video*").
            shutil.copytree(lib.getPath(), root + "/libraries", ignore=shutil.ignore_patterns(*ignorelibs))

    def copydata(runtimedir):
        """Copy the main script and the given data"""
        # Create runtime directory
        try:
            os.mkdir(runtimedir)
        except:
            pass
        # Copy bundled files
        for data in bundle:
            for f in list(glob.iglob(mainroot + "/" + data)):
                copyeverything(f, runtimedir + "/" + f.replace(mainroot, ""))
        # Eventually copy the main file
        shutil.copyfile(main, runtimedir + "/sketch.py")

    # ... and recreate it
    os.mkdir(outdir)

    for platform in platforms:
        pdir = outdir + "/" + platform
        tmpfile = pdir + ".zip"
        os.mkdir(pdir)
        # Copy archive
        LaunchHelper.copyResourceTo("launcher." + platform + ".zip", Paths.get(tmpfile))
        # Unzip
        z = zipfile.ZipFile(tmpfile, "r")
        z.extractall(pdir)
        z.close()
        # Try to remove the platform file we created
        try:
            os.remove(tmpfile)
        except Exception, e:
            print("Could not remove %s we used for creating the launcher. Please report." % tmpfile, e)
def openStore(self):
    """Open the configured store directory as a memory-mapped directory."""
    store_path = Paths.get(self.STORE_DIR)
    return MMapDirectory(store_path)
def openStore(self):
    """Open the configured store directory as a plain filesystem directory."""
    store_path = Paths.get(self.STORE_DIR)
    return SimpleFSDirectory(store_path)
def convert(input_svg_path, rotation_x, rotation_y):
    """Convert an SVG file into a Pocket Code compatible PNG.

    The SVG is first rewritten into a temporary "_modified" copy, transcoded
    to PNG with Batik's PNGTranscoder, then post-processed by _translation()
    using the rotation centre (rotation_x, rotation_y).

    Returns the output PNG path; raises common.ScratchtobatError if any step
    of the conversion pipeline fails.
    """
    assert isinstance(input_svg_path, (str, unicode))
    assert os.path.splitext(input_svg_path)[1] == ".svg"
    input_file_name = os.path.splitext(input_svg_path)[0]
    output_png_path = "{}_rotX_{}_rotY_{}.png".format(input_file_name, rotation_x, rotation_y)
    _log.info(" converting '%s' to Pocket Code compatible png '%s'", input_svg_path, output_png_path)
    output_svg_path = input_svg_path.replace(".svg", "_modified.svg")
    output_svg_URI = Paths.get(output_svg_path).toUri().toURL().toString()
    if os.path.exists(output_png_path):
        _log.error(" '%s' already exists", output_png_path)
        #assert False # "Still a Duplicate?"
        # remove temporary files
        if os.path.exists(output_svg_path):
            os.remove(output_svg_path)
        return output_png_path # avoid duplicate conversions!
    png_ostream = None
    error = None
    try:
        _parse_and_rewrite_svg_file(input_svg_path, output_svg_path)
        input_svg_image = TranscoderInput(output_svg_URI)
        output_png_image = TranscoderOutput(FileOutputStream(output_png_path))
        _log.info(" converting '%s' to Pocket Code compatible png '%s'", input_svg_path, output_png_path)
        png_converter = PNGTranscoder()
        png_converter.transcode(input_svg_image, output_png_image)
        assert os.path.exists(output_png_path)
        final_image = _translation(output_png_path, rotation_x, rotation_y)
        if final_image is None:
            raise RuntimeError("...")
        from javax.imageio import ImageIO
        from java.io import File
        ImageIO.write(final_image, "PNG", File(output_png_path))
        return output_png_path
    except BaseException as err:
        import traceback
        import sys
        exc_info = sys.exc_info()
        _log.error(err)
        _log.error(traceback.format_exc())
        _log.error(exc_info)
        # Remember the wrapped error; it is re-raised after cleanup below.
        error = common.ScratchtobatError("SVG to PNG conversion call failed for: %s" % input_svg_path)
    finally:
        # free resources
        # NOTE(review): png_ostream is never assigned in this function, so
        # this cleanup branch is currently dead code -- kept defensively.
        if png_ostream is not None:
            png_ostream.flush()
            png_ostream.close()
        # remove temporary files
        if os.path.exists(output_svg_path):
            os.remove(output_svg_path)
    if error is not None:
        raise error
def readJsonFile (self, filePath = None):
    """Read the JSON file at *filePath* and return the deserialized object."""
    json_path = Paths.get(filePath)
    raw_bytes = Files.readAllBytes(json_path)
    return self.deserFromJson(raw_bytes)
def getReader(path):
    """Open a Lucene DirectoryReader over the index stored at *path*."""
    index_dir = FSDirectory.open(Paths.get(path))
    return DirectoryReader.open(index_dir)