Beispiel #1
0
    def __init__(self, root, storeDir, doIndex=False):
        """Create the analyzer and searcher; optionally (re)build the index.

        root     -- document tree handed to self.indexDocs when indexing
        storeDir -- filesystem directory holding the Lucene index
        doIndex  -- when True, rebuild the index before opening the searcher
        """
        self.analyzer = StandardAnalyzer()

        # the index directory must exist before Lucene touches it
        if not os.path.exists(storeDir):
            os.mkdir(storeDir)

        if doIndex:
            index_store = SimpleFSDirectory(Paths.get(storeDir))

            # cap tokens per field so huge documents cannot exhaust memory
            capped_analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
            writer_config = IndexWriterConfig(capped_analyzer)
            writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            index_writer = IndexWriter(index_store, writer_config)

            self.indexDocs(root, index_writer)
            progress_ticker = Ticker()
            print("commit index")
            threading.Thread(target=progress_ticker.run).start()
            index_writer.commit()
            index_writer.close()
            progress_ticker.tick = False
            print("done")

        search_store = SimpleFSDirectory(Paths.get(storeDir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_store))
Beispiel #2
0
    def __init__(self, root, storedir, isindexing=False, isBM25=True):
        """Set up an analyzer and searcher over *storedir*; optionally reindex.

        isindexing -- rebuild the index from *root* before searching
        isBM25     -- use BM25 scoring for both indexing and searching
        """
        # index directory must exist up front
        if not os.path.exists(storedir):
            os.mkdir(storedir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            index_store = SimpleFSDirectory(Paths.get(storedir))
            writer_config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                writer_config.setSimilarity(BM25Similarity())
            writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            index_writer = IndexWriter(index_store, writer_config)

            self.indexer(root, index_writer)
            progress_ticker = Ticker()
            print('commit index')
            threading.Thread(target=progress_ticker.run).start()
            index_writer.commit()
            index_writer.close()
            progress_ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        # keep query-time scoring consistent with how the index was written
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())
Beispiel #3
0
def main():
    """Deploy the AttackTreeDesigner module into the current Modelio project.

    Finds the newest AttackTreeDesigner_*.jmdac archive, installs it,
    verifies the module is present, and saves the session.  Returns 1 on
    failure (module missing after install), otherwise None.
    """
    # NOTE(review): assumes model root at index 1 is the project root — confirm
    root = session.getModel().getModelRoots().get(1)
    gproject = GProject.getProject(root)

    # Deploy Module (JMDAC File)
    module_jmdac_path = "/module_jmdac_archives"
    moduleArchivePattern = os.path.join(module_jmdac_path,
                                        "AttackTreeDesigner_*.jmdac")
    moduleArchives = glob.glob(moduleArchivePattern)
    assert len(moduleArchives) > 0, "No jmdac archive has been found !"
    # reverse lexicographic sort so index 0 is the newest-named archive
    moduleArchives.sort(reverse=True)
    print("deploying ", Paths.get(moduleArchives[0]))
    Modelio.getInstance().getModuleService().installModule(
        gproject, Paths.get(moduleArchives[0]))

    # test if module deployed correctly
    attackTreeDesignerModule = findModule("AttackTreeDesigner")
    if attackTreeDesignerModule is None:
        print("Tested module: not found. ABORT! <br/>")
        outputError("/errors_output/deploy-module.err",
                    "AttackTreeDesigner module not found")
        return 1
    else:
        print("Module AttackTreeDesigner found")

    coreSession.save(None)
Beispiel #4
0
 def __init__(self,
              index_path,
              field,
              similarity="boolean",
              use_relevance_feedback=False,
              feedback_index_path=None):
     """Open a searcher over *index_path*, scoring queries on *field*.

     similarity selects the ranking model: "boolean", "tf", "tfidf" or
     "BM25"; any other value falls back to BM25(1.2, 0.2).
     """
     self.reader = DirectoryReader.open(
         FSDirectory.open(Paths.get(index_path)))
     self.searcher = IndexSearcher(self.reader)
     # optional second index used for relevance feedback
     if use_relevance_feedback and feedback_index_path is not None:
         self.feedback_reader = DirectoryReader.open(
             FSDirectory.open(Paths.get(feedback_index_path)))
         self.feedback_searcher = IndexSearcher(self.feedback_reader)
     self.similarity = similarity
     self.stopwords = stop_words()
     # dispatch table for the zero-argument similarity classes
     simple_models = {
         "boolean": BooleanSimilarity,
         "tf": TFSimilarity,
         "tfidf": ClassicSimilarity,
     }
     if similarity in simple_models:
         self.searcher.setSimilarity(simple_models[similarity]())
     elif similarity == "BM25":
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     else:
         print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
         self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
     analyzer = StandardAnalyzer()
     print(self.searcher.getSimilarity())
     self.parser = QueryParser(field, analyzer)
Beispiel #5
0
    def __init__(self):
        """Boot the JVM, build/copy Lucene indices as needed, open searchers.

        All paths and switches come from the module-level `prm` config
        object.  Two indices may be opened: the main one and, when the term
        corpus differs from the main corpus, a separate term-vector index.
        (Python 2 print syntax.)
        """
        # 28 GB heap; headless so no GUI toolkit is initialised
        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            # store terms in the main index only when it doubles as the
            # term corpus; otherwise a dedicated term index is built below
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        # optionally copy the index to fast local storage before opening it
        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        # NOTE(review): opens prm.index_folder, not self.index_folder — the
        # local copy made above is never actually used; likely a bug
        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        # second index holding term vectors: same build/copy/open sequence
        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            # NOTE(review): same issue — ignores self.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()
Beispiel #6
0
    def doInBackground(self):
        """Swing worker body: refresh the tool catalogue from the network.

        Downloads the tools list, then one jar per tool, extracts their
        data, and finally replaces the cached tools-list file.  Progress
        codes reported via super__setProgress: 0 start, 2 fetching list,
        3 list download failed (abort), 5 fetching jars, 6 jar download
        failed (abort), 7 extracting, 8 done.
        """
        #Initialize progress property.
        progress = 0
        self.super__setProgress(progress)

        # "\n download tools list"
        progress = 2
        self.super__setProgress(progress)
        self.delete_file(self.tmpToolsListFile)

        if not self.download_file(self.app.toolsListUrl,
                                  self.tmpToolsListFile):
            # " I cannot download the tools list."
            progress = 3
            self.super__setProgress(progress)
            return

        toolsRefs = read_tools_list(self.tmpToolsListFile)

        #Download tools data as jar files
        progress = 5
        self.super__setProgress(progress)
        self.jarDir = File.separator.join([self.app.SCRIPTDIR, "tools", "jar"])
        if not File(self.jarDir).exists():
            File(self.jarDir).mkdir()
        else:
            #delete old files
            for jarFileName in File(self.jarDir).list():
                File(File.separator.join([self.jarDir, jarFileName])).delete()
        #download new files
        for toolRef in toolsRefs:
            jarFileName = "%s.jar" % toolRef
            jarUrl = "%s/%s" % (self.app.jarBaseUrl, jarFileName)
            jarFilePath = File.separator.join([self.jarDir, jarFileName])
            answer = self.download_file(jarUrl, jarFilePath)
            if not answer:
                # " I cannot download the tools file"
                progress = 6
                self.super__setProgress(progress)
                return

        #Extract tools data from jar files
        self.toolsDir = File.separator.join(
            [self.app.SCRIPTDIR, "tools", "data"])
        progress = 7
        self.super__setProgress(progress)
        self.extract_tools_data_from_jar_files()

        #Remove temporary file, replacing the cached list with the new one
        self.delete_file(self.toolsListFile)
        Files.copy(Paths.get(self.tmpToolsListFile),
                   Paths.get(self.toolsListFile))
        self.delete_file(self.tmpToolsListFile)

        progress = 8
        self.super__setProgress(progress)
Beispiel #7
0
    def doInBackground(self):
        """Swing worker body: refresh the tool catalogue from the network.

        NOTE(review): near-duplicate of another doInBackground in this
        file, differing only in line wrapping — candidates for merging.
        Progress codes via super__setProgress: 0 start, 2 fetching list,
        3 list download failed (abort), 5 fetching jars, 6 jar download
        failed (abort), 7 extracting, 8 done.
        """
        #Initialize progress property.
        progress = 0
        self.super__setProgress(progress)

        # "\n download tools list"
        progress = 2
        self.super__setProgress(progress)
        self.delete_file(self.tmpToolsListFile)

        if not self.download_file(self.app.toolsListUrl, self.tmpToolsListFile):
            # " I cannot download the tools list."
            progress = 3
            self.super__setProgress(progress)
            return

        toolsRefs = read_tools_list(self.tmpToolsListFile)

        #Download tools data as jar files
        progress = 5
        self.super__setProgress(progress)
        self.jarDir = File.separator.join([self.app.SCRIPTDIR, "tools", "jar"])
        if not File(self.jarDir).exists():
            File(self.jarDir).mkdir()
        else:
            #delete old files
            for jarFileName in File(self.jarDir).list():
                File(File.separator.join([self.jarDir, jarFileName])).delete()
        #download new files
        for toolRef in toolsRefs:
            jarFileName = "%s.jar" % toolRef
            jarUrl = "%s/%s" % (self.app.jarBaseUrl, jarFileName)
            jarFilePath = File.separator.join([self.jarDir, jarFileName])
            answer = self.download_file(jarUrl, jarFilePath)
            if not answer:
                # " I cannot download the tools file"
                progress = 6
                self.super__setProgress(progress)
                return

        #Extract tools data from jar files
        self.toolsDir = File.separator.join([self.app.SCRIPTDIR, "tools", "data"])
        progress = 7
        self.super__setProgress(progress)
        self.extract_tools_data_from_jar_files()

        #Remove temporary file, replacing the cached list with the new one
        self.delete_file(self.toolsListFile)
        Files.copy(Paths.get(self.tmpToolsListFile), Paths.get(self.toolsListFile))
        self.delete_file(self.tmpToolsListFile)

        progress = 8
        self.super__setProgress(progress)
Beispiel #8
0
 def __init__(self, directory):
     """Open FS-backed index and taxonomy directories under *directory*
     and configure hierarchical, multi-valued "Categories" facets."""
     self.directory = directory
     # disc-backed directories (a RAMDirectory would serve for in-memory use)
     index_path = os.path.join(self.directory, INDEX_DIR)
     taxonomy_path = os.path.join(self.directory, TAXONOMY_DIR)
     self.indexDir = FSDirectory.open(Paths.get(index_path))
     self.taxoDir = FSDirectory.open(Paths.get(taxonomy_path))
     # facet configuration
     self.facets_config = FacetsConfig()
     self.facets_config.setHierarchical("Categories", True)
     self.facets_config.setMultiValued("Categories", True)
Beispiel #9
0
def search_index(indexfile, querytext, top=10, qe=False, default_field="text", display_fields=None):
    """Search a Lucene index and print the top hits.

    indexfile      -- path to the index directory
    querytext      -- query string parsed against *default_field*
    top            -- number of hits to retrieve
    qe             -- when True, interactively ask which results were
                      relevant and rerun a Rocchio-expanded query
    display_fields -- stored fields to print per hit
                      (default: subreddit, author, text)
    """
    # fix: a mutable list default is shared across calls; use None sentinel
    if display_fields is None:
        display_fields = ["subreddit", "author", "text"]
    lucene.initVM()

    lindex = SimpleFSDirectory(Paths.get(indexfile))
    ireader = DirectoryReader.open(lindex)
    isearcher = IndexSearcher(ireader)

    analyser = StandardAnalyzer()

    parser = QueryParser(default_field, analyser)
    query = parser.parse(querytext)

    hits = isearcher.search(query, top).scoreDocs
    docIDs = [hit.doc for hit in hits]
    print_results(isearcher, hits, display_fields)
    if len(hits) == 0:
        print("No hits!")
    elif qe:
        print("\n")
        print("Which documents were relevant to your search need? (Enter spaced list of result numbers [1-{}], e.g. 2 4 5)".format(top))
        # user enters 1-based result numbers; map back to Lucene doc ids
        relevantids = [docIDs[i-1] for i in [int(x) for x in input().split()]]
        # renamed loop variable: 'id' shadowed the builtin
        nonrelevantids = [doc_id for doc_id in docIDs if doc_id not in relevantids]

        print("\n\n")

        qequerytext = queryexpansion.rocchio(ireader, querytext, relevantids, nonrelevantids)
        print("Expanded search query: '{}'\n".format(qequerytext))
        qequery = parser.parse(qequerytext)
        qehits = isearcher.search(qequery, top).scoreDocs
        print_results(isearcher, qehits, display_fields)

    ireader.close()
    lindex.close()
Beispiel #10
0
def find_all_text_occurrences(
        objects: list,
        index_path: str = r"D:\GitHubD\BREDS\wiki_text_index\WIKI_TEXT"
) -> (dict, DirectoryReader):
    """Map each object string to the set of doc ids containing all its tokens.

    objects    -- list of (possibly multi-word) surface strings
    index_path -- Lucene index location; parameterised, default kept for
                  backward compatibility (now a raw string — the original
                  non-raw literal only worked because its invalid escapes
                  survived verbatim)

    Returns (lookup dict, open DirectoryReader); the caller must close
    the reader.
    """
    docs_lookup = dict()
    # noinspection PyUnresolvedReferences
    lucene.initVM(initialheap='32m', maxheap='4G')
    index_file = Paths.get(index_path)
    # renamed: 'dir' shadowed the builtin
    directory = FSDirectory.open(index_file)
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    parser = QueryParser('contents', StandardAnalyzer())

    logging.warning(
        'FOR MULTI-WORD OBJECTS, ALL DOCUMENTS WITH BOTH TERMS SEPARATELY WILL BE RETRIEVED'
    )

    # renamed: 'object' shadowed the builtin
    for obj in objects:
        tokens = obj.split(' ')

        doc_sets = []
        for token in tokens:
            q = parser.parse(f'"{token}"')
            # TODO maybe use minimum score
            topdocs = searcher.search(q, 99999999)
            results = set([topdoc.doc for topdoc in topdocs.scoreDocs])
            doc_sets.append(results)
        # keep only documents containing every token of the object
        docs_lookup[obj] = set.intersection(*doc_sets)

    return docs_lookup, reader
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        """Open a searcher over LUCENE_INDEX_DIR, optionally with BM25
        scoring and MongoDB-backed bigram tf/cf caches.

        lucene_vm_flag       -- True when the JVM is already initialised
        is_bigram_cache_used -- attach mongo cache collections when True
        mongoObj             -- object exposing a `db` mapping of collections
        """
        # idiom fix: was `lucene_vm_flag == False` (PEP 8: never == False)
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        #self.analyzer = StandardAnalyzer()
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searcher = IndexSearcher(self.reader)
        self.dict_term_freq = {}
        if similarity == 'BM25':
            self.searcher.setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            # last path component names the index's mongo cache collections
            separator = '/' if self.index_dir.find('/') > -1 else '\\'
            index_name = self.index_dir.split(separator)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
Beispiel #12
0
    def __init__(self, path, analyzer, topn=DEF_TOPN):
        """Open an IndexSearcher over the index stored at *path*.

        topn is the default number of hits retrieved per query.
        """
        self.path = path
        self._analyzer = analyzer
        self.topn = topn
        index_path = os.path.abspath(self.path)
        self._store = SimpleFSDirectory(Paths.get(index_path))
        self._searcher = IndexSearcher(DirectoryReader.open(self._store))
Beispiel #13
0
    def __init__(self, searchDir):
        """Open a reader/searcher pair over the index in *searchDir*,
        analysing with the project's English analyzer and the indexer's
        stop-word set."""
        stop_set = Indexer.ENGLISH_STOP_WORDS_SET
        self.analyzer = MyPythonEnglishAnalyzer(stopwords=stop_set)
        index_dir = FSDirectory.open(Paths.get(searchDir))
        self.directory = index_dir
        self.reader = DirectoryReader.open(index_dir)
        self.searcher = IndexSearcher(self.reader)
def create_index_for_wiki_sentence(filename, path, firstTime=False):
    """Build (on first run) and return the Lucene directory for the
    wiki-sentence index.

    path      -- location of the raw wiki data fed to get_wiki_data
    firstTime -- when True, (re)write the index; otherwise just open it
    filename  -- NOTE(review): effectively ignored — it is overwritten
                 with '_wiki_sentence' below, so the index name is fixed;
                 confirm before relying on this parameter
    """
    logging.info('Start create wiki_sentence!')
    wiki_dict = get_wiki_data(path)

    logging.info('Start creating index!')
    filename = '_wiki_sentence'
    analyzer = analysis.standard.StandardAnalyzer()

    # # Store the index in memory:
    base_dir = HOMEPATH
    INDEX_DIR = "IndexFiles" + filename + ".index"
    storeDir = os.path.join(base_dir, INDEX_DIR)
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = SimpleFSDirectory(Paths.get(storeDir))
    if firstTime:
        config = index.IndexWriterConfig(analyzer)
        iwriter = index.IndexWriter(directory, config)
        for cnt, key in enumerate(wiki_dict.keys()):
            if cnt % 1000 == 0:
                logging.info(
                    'I have preprocessed {} index in creating index by document!'
                    .format(str(cnt)))
            # key is a (original title, preprocessed title, doc id) triple
            org_title = key[0]
            preprocessed_title = key[1]
            doc_id = key[2]
            sentence = wiki_dict[key]
            doc = create_document_by_document_sentence(org_title,
                                                       preprocessed_title,
                                                       doc_id, sentence)
            iwriter.addDocument(doc)
        iwriter.close()
    logging.info('Finish creating index wiki_sentence!')
    return directory
def main():
    """Create (or reuse) the Lucene index, backing up *.py sources into it.

    Skips indexing entirely when LUCENE_INDEX_DIR already exists.
    """
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except ValueError:
        # initVM raises ValueError when a JVM is already running; the old
        # bare `except:` also swallowed SystemExit/KeyboardInterrupt
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = SimpleAnalyzer()
    config = IndexWriterConfig(analyzer)
    config = config.setRAMBufferSizeMB(1024.0)
    # write data to index

    if not is_index_Exist:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            cmd = 'robocopy %s %s\\code_files *.py' % (r'%cd%',
                                                       LUCENE_INDEX_DIR)
        else:
            # bug fix: the old Unix branch used a backslash path
            # ('%s\code_files') and never created the target directory
            cmd = 'mkdir -p %s/code_files && cp -f *.py %s/code_files' % (
                LUCENE_INDEX_DIR, LUCENE_INDEX_DIR)
        os.system(cmd)

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
Beispiel #16
0
def retriever(file_dir):
    """For every test AST query, retrieve the most similar training sample.

    Reads train sources/summaries, searches the Lucene index under
    file_dir/lucene_index with each (sanitised) test query, and writes the
    best-matching source and summary lines to the reference/output files.
    Exits with -1 if any query returns no hit.
    """
    analyzer = WhitespaceAnalyzer()
    reader = DirectoryReader.open(
        SimpleFSDirectory(Paths.get(file_dir + "/lucene_index/")))
    searcher = IndexSearcher(reader)
    queryParser = QueryParser("code", analyzer)
    BooleanQuery.setMaxClauseCount(Integer.MAX_VALUE)

    # fix: raw string (the old literal relied on invalid escapes \W \s
    # surviving verbatim) and pattern compiled once outside the loop
    cleanup = re.compile(r"[\W\s]+|AND|NOT|OR")

    with open(file_dir + "/train/train.spl.src",
              'r') as fso, open(file_dir + "/train/train.txt.tgt", 'r') as fsu:
        sources = [line.strip() for line in fso.readlines()]
        summaries = [line.strip() for line in fsu.readlines()]
    with open(file_dir+"/test/test.ast.src") as ft, open(file_dir+"/test/test.ref.src.0", 'w') as fwo, \
            open(file_dir+"/output/ast.out", 'w') as fws:
        queries = [
            cleanup.sub(' ', line.strip())
            for line in ft.readlines()
        ]

        for i, line in enumerate(queries):
            print("query %d" % i)
            query = queryParser.parse(QueryParser.escape(line))
            hits = searcher.search(query, 1).scoreDocs
            flag = False

            for hit in hits:
                doc = searcher.doc(hit.doc)
                # stored field holds the integer row index of the train pair
                _id = eval(doc.get("id"))
                flag = True
                fwo.write(sources[_id] + '\n')
                fws.write(summaries[_id] + '\n')
            if not flag:
                # no hit at all: dump diagnostics and abort
                print(query)
                print(hits)
                exit(-1)
Beispiel #17
0
	def getDoc(self, file):
		"""Parse the HTML file *file* (under cwd+FILE_DIR) into a Lucene
		Document with stored 'contents' and 'filename' fields and an
		unstored 'filepath' field.  Returns None on any error.
		"""
		try:
			# fix: use a context manager so the file handle is always closed
			with open(os.getcwd()+FILE_DIR+'/'+file, "r") as f:
				try:
					c = []
					s = BeautifulSoup(f, 'html.parser')
					text = s.findAll(text=True)
					c = filter(tag_vis, text)
					# joined as str normally; bytes fallback for raw content
					try:
						c = ' '.join(c)
					except Exception as e:
						c = b' '.join(c)
				except Exception as e:
					print(str(e))
					return
			content = TextField("contents", c, Field.Store.YES)
			fileName = str(Paths.get(file)).split('/')[-1]
			fileName = fileName[:fileName.find(".")]
			filename = TextField("filename",
							 fileName,
							 Field.Store.YES)
			path = TextField("filepath",
						 str(os.getcwd()+FILE_DIR+'/'+file),
						 Field.Store.NO)
			doc = Document()
			doc.add(content)
			doc.add(filename)
			doc.add(path)
			return doc
		except Exception as e:
			# bug fix: was type(Exception).__name__, which always prints 'type'
			print(type(e).__name__)
			print(str(e))
			return
Beispiel #18
0
    def __init__(self, index_dir, mode, date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Indexer.

        Parameters
        ----------
        index_dir : string
            The location of lucene index
        mode : string
            The mode when opening lucene index. Available values are:
                'create', open new index and overwriting over index,
                'append', open existed index and append.
                'create_or_append', if `index_dir` exists, 'append',
                else 'create'
        date_format : string
            We save datetime field as string, `date_format` specify how to
            format datetime into string.

        Raises
        ------
        ValueError
            If `mode` is not one of the three supported values.
        """
        # self.store = FSDirectory.open(File(index_dir))
        self.store = FSDirectory.open(Paths.get(index_dir))
        # self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.analyzer = StandardAnalyzer()
        # self.config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        self.config = IndexWriterConfig(self.analyzer)
        self.mode = mode
        self.date_format = date_format
        if mode == 'create_or_append':
            self.config.setOpenMode(
                IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        elif mode == 'create':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        elif mode == 'append':
            self.config.setOpenMode(IndexWriterConfig.OpenMode.APPEND)
        else:
            # bug fix: the message was passed as a second ValueError argument
            # (logging-style) and was never interpolated
            raise ValueError('Invalid mode %s' % mode)
        self.writer = IndexWriter(self.store, self.config)
    def __init__(self,
                 LUCENE_INDEX_DIR,
                 similarity='BM25',
                 lucene_vm_flag=False,
                 is_bigram_cache_used=False,
                 mongoObj=None):
        """Open searcher(s) over LUCENE_INDEX_DIR with optional BM25 scoring
        and MongoDB-backed bigram / mapping-probability caches.

        lucene_vm_flag       -- True when the JVM is already initialised
        is_bigram_cache_used -- attach mongo cache collections when True
        mongoObj             -- object exposing a `db` mapping of collections
        """
        # idiom fix: was `lucene_vm_flag == False` (PEP 8: never == False)
        if not lucene_vm_flag:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        self.lucene_vm_init = True
        self.index_dir = LUCENE_INDEX_DIR
        self.index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
        self.analyzer = SimpleAnalyzer()
        self.config = IndexWriterConfig(self.analyzer)
        self.reader = DirectoryReader.open(self.index_mm)
        self.searchers = []
        self.searchers.append(IndexSearcher(self.reader))
        if similarity == 'BM25':
            self.searchers[0].setSimilarity(BM25Similarity())

        # load bigram cache
        self.is_bigram_cache_used = is_bigram_cache_used
        if is_bigram_cache_used:
            # last path component names the index's mongo cache collections
            separator = '/' if self.index_dir.find('/') > -1 else '\\'
            index_name = self.index_dir.split(separator)[-1]
            self.index_name = index_name
            self.conn_bigram_tf_cache = mongoObj.db[index_name + '_tf_cache']
            self.conn_bigram_cf_cache = mongoObj.db[index_name + '_cf_cache']
            # wikipedia-augmented mapping cache when those fields are in use
            if 'stemmed_wikipedia' in LIST_F or 'wikipedia' in LIST_F:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache_with_wikipedia']
            else:
                self.conn_mapping_prob_cache = mongoObj.db[
                    index_name + '_mapping_prob_cache']
    def retrieve_sents(self):
        """Run self.query against the index at self.indexDir and return the
        Lucene doc ids of up to 50 matching sentences.

        When self.stats is truthy, prints hit count and elapsed time to
        stderr (Python 2 print syntax).
        """
        indexDir = self.indexDir
        query = self.query

        sent_ind_list = []
        # template = CustomTemplate(format)
        fsDir = SimpleFSDirectory(Paths.get(indexDir))
        # print indexDir
        searcher = IndexSearcher(DirectoryReader.open(fsDir))

        analyzer = StandardAnalyzer()
        parser = QueryParser("contents", analyzer)
        # OR semantics: a sentence matches if any query term matches
        parser.setDefaultOperator(QueryParser.Operator.OR)
        query = parser.parse(query)
        # print query
        start = datetime.now()
        scoreDocs = searcher.search(query, 50).scoreDocs
        duration = datetime.now() - start
        # print query
        if self.stats:
            print >> sys.stderr, "Found %d sentences (in %s) that matched query '%s':" % (
                len(scoreDocs), duration, query)

        for scoreDoc in scoreDocs:
            # print scoreDoc.doc
            # doc = searcher.doc(scoreDoc.doc)
            sent_ind_list.append(scoreDoc.doc)

        return sent_ind_list
Beispiel #21
0
def main():
    """Build the TREC wikipedia index if it does not already exist.

    Also copies the current *.py files into the index folder as a code
    backup before indexing.
    """
    LUCENE_INDEX_DIR = 'mmapDirectory/trec_v15_wikipedia_stemmed_v2'
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True
    except ValueError:
        # fix: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; initVM signals an already-running JVM this way
        print('JavaVM already running')

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    #config=config.setRAMBufferSizeMB(1024.0)  # experimental setting !!
    # write data to index

    if not is_index_Exist:
        #if True:
        print('begin backup code files')
        system_flag = platform.system()
        if system_flag == 'Windows':
            # escaped backslash: the old '\c' only worked as an invalid escape
            os.system('robocopy %s %s\\code_files *.py' %
                      (r'%cd%', LUCENE_INDEX_DIR))
        else:
            os.system('mkdir %s/code_files' % (LUCENE_INDEX_DIR))
            os.system('cp *.py %s/code_files' % (LUCENE_INDEX_DIR))

        w = IndexWriter(index_mm, config)
        makeIndex(w)
        w.close()
    else:
        print('index already exists, stop indexing')
Beispiel #22
0
 def __init__(self, path=INDEX_DIR):
     """Start the JVM and open analyzer, reader and searcher over the
     index at *path*.  (Original comment translated from Chinese:
     initialise lucene; set up analyzer, reader, searcher and tokenizer.)
     """
     lucene.initVM()
     store = SimpleFSDirectory(Paths.get(path))
     self.indir = store
     self.analyzer = SmartChineseAnalyzer()
     self.reader = DirectoryReader.open(store)
     self.searcher = IndexSearcher(self.reader)
Beispiel #23
0
def main():
    """Render the search page: run the 'consulta' request parameter against
    the Lucene index and collect a (url, titulo) dict per hit."""
    resultados = []
    indice_vacio = len(os.listdir("./lucene/index")) == 0
    if not indice_vacio:
        consulta = request.args.get("consulta", None)
        if consulta is not None:
            index_store = SimpleFSDirectory(Paths.get("./lucene/index"))
            searcher = IndexSearcher(DirectoryReader.open(index_store))
            analyzer = SpanishAnalyzer()
            query = QueryParser("texto", analyzer).parse(consulta)
            # top 10 hits only
            for hit in searcher.search(query, 10).scoreDocs:
                documento = searcher.doc(hit.doc)
                resultados.append({
                    "url": direccion_base + documento.get("pdf"),
                    "titulo": documento.get("titulo")
                })

    return render_template("main.html",
                           lucene=lucene.VERSION,
                           indice_vacio=indice_vacio,
                           resultados=resultados)
Beispiel #24
0
def retrieve(command):
    """Search the index for *command* as an all-terms (AND) query.

    Returns the full paths (joined 'path' + 'name' stored fields) of up
    to 500 matching documents.  (Python 2 print syntax.)
    """
    try:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    except ValueError:
        print "JVM running."

    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(Paths.get(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()

    # to convert to AND query: prefix every term with Lucene's '+' operator
    command = re.sub(r' ', r' +', command)
    command = "+" + command

    print "Searching for:", command
    query = QueryParser("contents", analyzer).parse(command)
    print query
    scoreDocs = searcher.search(query, 500).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    retrieved_docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        retrieved_docs.append(os.path.join(doc.get("path"), doc.get("name")))

    del searcher
    return retrieved_docs
 def __init__(self, indexDir):
     """Open a searcher over *indexDir* and build AND-default query
     parsers for the 'name' and 'id' fields."""
     self.directory = SimpleFSDirectory(Paths.get(indexDir))
     reader = DirectoryReader.open(self.directory)
     self.searcher = IndexSearcher(reader)
     # one parser per searchable field, both requiring all terms to match
     for field_name in ('name', 'id'):
         parser = QueryParser(field_name, StandardAnalyzer())
         parser.setDefaultOperator(QueryParser.Operator.AND)
         setattr(self, field_name + 'QueryParser', parser)
Beispiel #26
0
    def create_index(self, index_folder):
        """Build a new index in *index_folder* from every doc in self.doc_db.

        Field types: t1 stored + DOCS, t2 stored + docs/freqs/positions,
        t3 stored but unindexed.  Uses the custom analyzer/similarity and
        force-merges to one segment for faster searching.
        """
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        store = MMapDirectory(Paths.get(index_folder))
        stopword_set = CharArraySet(collections.JavaSet(utils.STOPWORDS), True)
        writer_config = IndexWriterConfig(MySimpleAnalyzer(stopword_set))
        writer_config.setSimilarity(MyTFIDFSimilarity())
        writer_config.setRAMBufferSizeMB(16384.0)  # 14g
        self.writer = IndexWriter(store, writer_config)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()
Beispiel #27
0
    def __init__(self, index_store_path):
        """Open (or create) an IndexWriter on *index_store_path* in
        CREATE_OR_APPEND mode with a standard analyzer."""
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        index_store = NIOFSDirectory(Paths.get(index_store_path))
        self.writer = IndexWriter(index_store, writer_config)
def build_index(document_path, dir_path):
    """Index a tag file into a new Lucene index at *dir_path*.

    Each line of *document_path* is "<music_path> <tag1,tag2,...>"; tags
    go into a tokenized, searchable "content" field and the path into an
    untokenized, stored "url" field.

    Args:
        document_path: path to the whitespace-separated tag file.
        dir_path: directory that will hold the (re)created index.
    """
    lucene.initVM()
    index_dir = SimpleFSDirectory(Paths.get(dir_path))
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    index_writer = IndexWriter(index_dir, config)

    # "content": stored, tokenized, indexed with docs + term frequencies.
    t1 = FieldType()
    t1.setStored(True)
    t1.setTokenized(True)
    t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

    # "url": stored verbatim, not tokenized.
    t2 = FieldType()
    t2.setStored(True)
    t2.setTokenized(False)

    try:
        with open(document_path) as input_file:
            for line in input_file:
                segs = line.strip().split(" ")
                if len(segs) < 2:
                    # Skip blank/malformed lines instead of raising IndexError.
                    continue
                music_path, music_tags = segs[0], segs[1].split(",")

                document = Document()
                document.add(Field("content", " ".join(music_tags), t1))
                document.add(Field("url", music_path, t2))
                index_writer.addDocument(document)
    finally:
        # Always release the index write lock, even if parsing fails mid-file.
        index_writer.close()
Beispiel #29
0
	def __init__(self, indexDir):
		"""Open a fresh index writer at *indexDir* (CREATE wipes any old index)."""
		self._dir = SimpleFSDirectory(Paths.get(indexDir))
		cfg = IndexWriterConfig(StandardAnalyzer())
		cfg.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
		self._writer = IndexWriter(self._dir, cfg)
Beispiel #30
0
    def create_index(self, index_folder, docs_path, add_terms=False):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()
Beispiel #31
0
 def openStore(self):
     """Return a SimpleFSDirectory over the project's fixed index folder."""
     index_name = "projectIndexFiles.index"
     # Resolve the index folder next to the dataset root directory.
     dataset_root = os.path.dirname(os.path.abspath("D:/IR Dataset/"))
     return SimpleFSDirectory(Paths.get(os.path.join(dataset_root, index_name)))
Beispiel #32
0
 def __init__(self, store_dir):
     self.store_dir = store_dir
     if not os.path.exists(store_dir):
         os.mkdir(store_dir, 0777)
     self.store = SimpleFSDirectory(Paths.get(store_dir))
     self.searcher = None
     self.analyzer = StandardAnalyzer()
     self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
Beispiel #33
0
 def __init__(self, file_name):
     """Parse the XML document at *file_name* into a normalized DOM tree."""
     self.file_name = Paths.get(file_name).toAbsolutePath().toString()
     self.file = File(self.file_name)
     # Standard JAXP pipeline: factory -> builder -> parsed Document.
     self.db_factory = DocumentBuilderFactory.newInstance()
     self.db_builder = self.db_factory.newDocumentBuilder()
     self.doc = self.db_builder.parse(self.file)
     self.doc.getDocumentElement().normalize()
 def _getLucene(self, path):
     """Open writer/reader/searcher over the index at *path*.

     The reader is obtained from the writer itself, so it also sees
     documents that have not been committed yet.
     """
     # NOTE(review): analyzer is deliberately None — presumably the
     # callers add pre-analyzed fields; confirm before changing.
     config = IndexWriterConfig(None)
     config.setRAMBufferSizeMB(256.0) # faster
     config.setUseCompoundFile(False) # faster, for Lucene 4.4 and later
     writer = IndexWriter(FSDirectory.open(Paths.get(path)), config)
     reader = writer.getReader()
     return writer, reader, IndexSearcher(reader)
def extract_file_from_jar(config_file):
    """Copy *config_file* from the classpath into a temporary file.

    Returns the temp file's absolute path, or None when the resource
    cannot be found on the classpath.
    """
    import os
    file_url = LoaderUtil.getResourceBySelfClassLoader(config_file)
    if not file_url:
        return None
    # BUG FIX: tempfile.mkstemp() returns an OS-level file descriptor
    # (an int), not a file object — the original called .close() on it,
    # which raised AttributeError. Close the fd via os.close().
    tmp_fd, tmp_abs_path = tempfile.mkstemp()
    os.close(tmp_fd)
    Files.copy(file_url.openStream(), Paths.get(tmp_abs_path), StandardCopyOption.REPLACE_EXISTING)
    return tmp_abs_path
Beispiel #36
0
 def __init__(self, store_dir):
     self.store_dir = store_dir
     if not os.path.exists(store_dir):
         os.mkdir(store_dir, 0777)
     self.store = SimpleFSDirectory(Paths.get(store_dir))
     self.analyzer = StandardAnalyzer()
     self.analyzer = LimitTokenCountAnalyzer(self.analyzer, 1048576)
     self.config = IndexWriterConfig(self.analyzer)
     self.writer = IndexWriter(self.store, self.config)
Beispiel #37
0
def getLucene(path):
    """Open a whitespace-analyzed index at *path*, sorted by the stamp field.

    Returns (writer, reader, searcher); the reader comes straight from
    the writer, so it also sees not-yet-committed documents.
    """
    config = IndexWriterConfig(WhitespaceAnalyzer())
    # Segments are kept sorted on the numeric stamp field.
    config.setIndexSort(Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG)))
    writer = IndexWriter(FSDirectory.open(Paths.get(path)), config)
    reader = writer.getReader()
    return writer, reader, IndexSearcher(reader)
Beispiel #38
0
 def readFileToStructure(self, path, structure):
     """Populate *structure* from the file at *path*: header, chains, cryst info."""
     header = PdbHeader()
     header.setTitle(Paths.get(path).getFileName().toString())
     structure.setPDBHeader(header)
     structure.setChains(Lists.newArrayList(ReadFile.getModelFromFile(self, path)))
     # Space group "P 1" (cubic Bravais lattice) with an example unit cell.
     crystal = PdbCryst()
     crystal.setSpaceGroup(SpaceGroup(0, 1, 1, "P 1", "P 1", BravaisL.CUBIC))
     crystal.setCrystalCell(ReadFile.getBox(self, crystal.getSpaceGroup().getBravLattice().getExampleUnitCell()))
     header.setCrystallographicInfo(crystal)
     return structure
Beispiel #39
0
 def download_file(self, url, filePath):
     """Downloads a file form url and save it as filePath
     """
     try:
         print "\ndownloading"
         print url
         print filePath
         inputStream = URI.create(url).toURL().openStream()
         Files.copy(inputStream, Paths.get(filePath))
         return True
     except (UnknownHostException, SocketException), e:
         print e
         print "I cannot download:\n%s" % url
         return False
Beispiel #40
0
 def __init__(self, dbName, dropDB=False):
     """Attach this thread to the Lucene VM and locate the per-user index dir.

     dbName is "<user>_<db>"; the index lives under
     ./files/<user>/<db>/LuceneIndex. With dropDB=True any existing
     index directory is deleted first.
     """
     # Threads not created inside the Java VM must attach before use.
     luceneVM.attachCurrentThread('LuceneDB')
     self.analyzer = StandardAnalyzer()
     self.indexDir = None
     self.searcher = None
     user, db = dbName.split('_', 1)
     index_path = "./files/" + user + '/' + db + '/LuceneIndex'
     if dropDB:
         shutil.rmtree(index_path)
     self.indexDir = SimpleFSDirectory(Paths.get(index_path))  # creates directory if not exists
Beispiel #41
0
    def loadResource(self, u):
        """Open classpath resource *u* as an InputStream via the system loader."""
        return self.java.lang.ClassLoader.getSystemClassLoader().getResourceAsStream(u)

import java.nio.file.Files as Files
import java.nio.file.Paths as Paths
import java.lang.System as System
import java.util.List
from java.awt import *
import ucar.unidata.idv.DefaultIdv as DefaultIdv
import ucar.unidata.idv.ui.ImageGenerator as ImageGenerator

# Start an IDV instance and an ISL image generator bound to it.
idv = DefaultIdv([])
islInterpreter = ImageGenerator(idv)

# need to load a few resources from the classpath

my_files = ["ucar/unidata/idv/resources/python/shell.py",
           "ucar/unidata/idv/resources/python/isl.py"]

cpl = resourceLoader()
tmpfile = System.getProperty("java.io.tmpdir") + "/idv.py"

# Copy each classpath resource to a temp file, exec it, then delete it.
for f in my_files:
    inpstr = cpl.loadResource(f)
    path = Paths.get(tmpfile)
    Files.copy(inpstr, path)
    execfile(tmpfile)
    Files.delete(path)
Beispiel #42
0
    doc.add(Field("synopsis", synopsis.strip(), TextField.TYPE_STORED))
    doc.add(Field("keywords", ' '.join((command, name, synopsis, description)),
                  TextField.TYPE_NOT_STORED))
    doc.add(Field("filename", os.path.abspath(path), StringField.TYPE_STORED))

    writer.addDocument(doc)


if __name__ == '__main__':

    if len(sys.argv) != 2:
        print "Usage: python manindex.py <index dir>"

    else:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        directory = SimpleFSDirectory(Paths.get(sys.argv[1]))
        analyzer = StandardAnalyzer()
        # Only the first 10k tokens of each field are analyzed/indexed.
        analyzer = LimitTokenCountAnalyzer(analyzer, 10000)
        config = IndexWriterConfig(analyzer)
        writer = IndexWriter(directory, config)

        # Walk every directory on MANPATH and index the man pages found.
        manpath = os.environ.get('MANPATH', '/usr/share/man').split(os.pathsep)
        for dir in manpath:
            print "Crawling", dir
            for name in os.listdir(dir):
                path = os.path.join(dir, name)
                if os.path.isdir(path):
                    indexDirectory(path)
        writer.commit()
        writer.close()
Beispiel #43
0
def realPathName(path):
    """Resolve *path* against the netshell root directory."""
    root = sys.netshell_root.toString()
    return Paths.get(root, path)
Beispiel #44
0
stats = False
# Pull settings out of the already-parsed command-line options.
for o, a in options:
    if o == "--format":
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    # Template placeholders use '#' instead of the default '$'.
    delimiter = '#'

template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

# All query terms must match (AND) in the "keywords" field.
analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))
start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start
if stats:
    print >>sys.stderr, "Found %d document(s) (in %s) that matched query '%s':" %(len(scoreDocs), duration, query)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
Beispiel #45
0
    def create(
            name = "Launcher",
            bundle = [],
            platforms=["mac", "win"], 
            outdir="dist.platforms", 
            ignorelibs=["*video*"]
        ):
        """Creates a launcher for the given platform.

        bundle: glob patterns (relative to the sketch root) of extra files
        to ship alongside the sketch; ignorelibs: library-name patterns to
        exclude from the copied libraries.
        NOTE(review): bundle/platforms/ignorelibs are mutable default
        arguments — they are only read here, but callers must not mutate
        them in place.
        """

        import jycessing.Runner as Runner
        import jycessing.launcher.StandaloneSketch as StandaloneSketch
        import sys
        # Check if we should bail out - we're not running from a standalone sketch
        if not isinstance(Runner.sketch, StandaloneSketch):
            print >>sys.stderr, "Don't use launcher.create() from processing - use the export button instead!"
            return

        # Check if we are already deployed. In that case, 
        # don't do anything
        if "--internal" in sys.argv: return

        # Our own imports 
        import jycessing.launcher.LaunchHelper as LaunchHelper
        
        import java.lang.System as System
        import java.nio.file.Paths as Paths
        import os, shutil, zipfile, inspect, stat, glob, errno

        # Sketch entry point and root directory provided by the runtime.
        main = System.getProperty("python.main")
        mainroot = System.getProperty("python.main.root")

        outdir = mainroot + "/" + outdir

        # Clean the outdir ... (bare except: the dir may simply not exist yet)
        try: shutil.rmtree(outdir) 
        except: pass


        def copyeverything(src, dst):
            """The Machine That Copies EVERYTHING.
            https://www.youtube.com/watch?v=ibEdgQJEdTA
            """
            import shutil, errno
        
            # copytree() raises ENOTDIR for plain files; fall back to copy().
            try:
                shutil.copytree(src, dst)
            except OSError as exc:
                if exc.errno == errno.ENOTDIR:
                    shutil.copy(src, dst)
                else: raise

        def copyjars(root):
            """Copy jars & co"""
            sketch = Runner.sketch
            _mainjar = sketch.getMainJarFile()
            mainjar, mainjarname = _mainjar.getAbsolutePath(), _mainjar.getName()
            shutil.copyfile(mainjar, root + "/" + mainjarname)
            
            libraries = sketch.getLibraryDirectories()
            for lib in libraries:
                shutil.copytree(lib.getPath(), root + "/libraries", ignore=shutil.ignore_patterns(*ignorelibs))


        def copydata(runtimedir):
            """Copy the main script and the given data"""
            # Create runtime directory 

            try: os.mkdir(runtimedir)
            except: pass

            # Copy bundled files
            for data in bundle:
                for f in list(glob.iglob(mainroot + "/" + data)):
                    copyeverything(f, runtimedir + "/" + f.replace(mainroot, ""))


            # Eventually copy the main file
            shutil.copyfile(main, runtimedir + "/sketch.py")


        # ... and recreate it
        os.mkdir(outdir)
        # One platform per subfolder: unpack the platform launcher template.
        for platform in platforms: 

            pdir = outdir + "/" + platform
            tmpfile = pdir + ".zip"

            os.mkdir(pdir)

            # Copy archive
            LaunchHelper.copyResourceTo("launcher." + platform + ".zip", Paths.get(tmpfile))
            
            # Unzip
            z = zipfile.ZipFile(tmpfile, "r")
            z.extractall(pdir)
            z.close()

            # Try to remove the platform file we created
            try:
                os.remove(tmpfile)
            except Exception, e:
                print("Could not remove %s we used for creating the launcher. Please report." % tmpfile, e)
Beispiel #46
0
    def openStore(self):
        """Return a memory-mapped Lucene directory over the configured store path."""
        store_path = Paths.get(self.STORE_DIR)
        return MMapDirectory(store_path)
Beispiel #47
0
    def openStore(self):
        """Return a SimpleFSDirectory over the configured store path."""
        store_path = Paths.get(self.STORE_DIR)
        return SimpleFSDirectory(store_path)
Beispiel #48
0
def convert(input_svg_path, rotation_x, rotation_y):
    """Convert an SVG into a Pocket Code compatible, translated PNG.

    Returns the output PNG path (reusing an existing PNG if present).
    Raises ScratchtobatError when the SVG -> PNG transcoding fails.
    """
    assert isinstance(input_svg_path, (str, unicode))
    assert os.path.splitext(input_svg_path)[1] == ".svg"

    input_file_name = os.path.splitext(input_svg_path)[0]
    output_png_path = "{}_rotX_{}_rotY_{}.png".format(input_file_name, rotation_x, rotation_y)
    _log.info("      converting '%s' to Pocket Code compatible png '%s'", input_svg_path, output_png_path)

    output_svg_path = input_svg_path.replace(".svg", "_modified.svg")
    output_svg_URI = Paths.get(output_svg_path).toUri().toURL().toString()

    if os.path.exists(output_png_path):
        _log.error("      '%s' already exists", output_png_path)
        #assert False # "Still a Duplicate?"
        # remove temporary files
        if os.path.exists(output_svg_path):
            os.remove(output_svg_path)
        return output_png_path # avoid duplicate conversions!

    png_ostream = None
    error = None
    try:
        _parse_and_rewrite_svg_file(input_svg_path, output_svg_path)

        input_svg_image = TranscoderInput(output_svg_URI)

        # BUG FIX: keep a reference to the output stream so the finally
        # block can actually flush/close it — previously png_ostream was
        # never assigned, so the FileOutputStream leaked on every call.
        png_ostream = FileOutputStream(output_png_path)
        output_png_image = TranscoderOutput(png_ostream)

        _log.info("      converting '%s' to Pocket Code compatible png '%s'",
                  input_svg_path, output_png_path)
        png_converter = PNGTranscoder()
        png_converter.transcode(input_svg_image, output_png_image)
        assert os.path.exists(output_png_path)

        final_image = _translation(output_png_path, rotation_x, rotation_y)

        if final_image is None:
            raise RuntimeError("...")

        from javax.imageio import ImageIO
        from java.io import File
        ImageIO.write(final_image, "PNG", File(output_png_path))
        return output_png_path
    except BaseException as err:
        import traceback
        import sys
        exc_info = sys.exc_info()
        _log.error(err)
        _log.error(traceback.format_exc())
        _log.error(exc_info)
        error = common.ScratchtobatError("SVG to PNG conversion call failed for: %s" % input_svg_path)
    finally:
        # free resources
        if png_ostream is not None:
            png_ostream.flush()
            png_ostream.close()
        # remove temporary files
        if os.path.exists(output_svg_path):
            os.remove(output_svg_path)

    if error is not None:
        raise error
Beispiel #49
0
 def readJsonFile (self, filePath = None):
     """Read *filePath* as raw bytes and deserialize it from JSON."""
     raw_json = Files.readAllBytes(Paths.get(filePath))
     return self.deserFromJson(raw_json)
Beispiel #50
0
def getReader(path):
    """Open a DirectoryReader over the on-disk Lucene index at *path*."""
    index_dir = FSDirectory.open(Paths.get(path))
    return DirectoryReader.open(index_dir)