def process_wget_output(lines):
    """Parse wget output lines, persist remote-url -> local-path mappings,
    and index every downloaded HTML file.
    """
    indexer = index.Index()
    logger.info('Worker processing lines')

    pairs = []
    for line in lines:
        parsed = parse(line)
        logger.info('Worker input {}'.format(line))
        logger.info('Parsed {}'.format(parsed))
        if parsed is not None:
            pairs.append(parsed)

    # Record every (url -> relative local path) mapping in the retrieve cache.
    with SqliteDict(RETRIEVE_CACHE_PATH) as url_map:
        for remote_url, local_path in pairs:
            relative_path = local_path.replace(WGET_DOWNLOADS + "/", "")
            url_map[path_utils.strip_scheme(remote_url)] = relative_path
        url_map.commit()

    # Index the HTML files among the downloads.
    for remote_url, local_path in pairs:
        if is_html_file(local_path):
            relative_path = local_path.replace(WGET_DOWNLOADS + "/", "")
            logger.info('Found non-none path in wget output {} {}'.format(
                remote_url, relative_path))
            indexer.index_html(remote_url, relative_path)

    logger.info('Worker EOF reached')
def reloadIndex(self):
    """Reload the ``index`` module if its source changed, rebuilding
    ``self.index`` when a reload happened or no index exists yet.

    Returns the ``index`` module object itself.
    """
    reloaded = reload(index, "index.py")
    if reloaded or not self.index:
        self.index = index.Index("index")
    # NOTE(review): this returns the module, not self.index — confirm
    # that callers really expect the module here.
    return index
def __init__(self):
    """ We keep a record of all magic handlers and instantiate them all.

    Builds the shared class-level indexer exactly once: every regex and
    literal rule of every registered magic handler is added under a
    unique integer id so an index hit can be mapped back to its handler
    class (index_map) and the originating rule (rule_map).
    """
    if not MagicResolver.indexer:
        MagicResolver.indexer = index.Index()
        for cls in Registry.MAGIC_HANDLERS.classes:
            # Instantiate the handler class and remember the instance.
            cls = cls()
            MagicResolver.magic_handlers.append(cls)
            # Regex rules are indexed as extended (regex) words.
            for rule in cls.regex_rules:
                MagicResolver.indexer.add_word(rule[0], MagicResolver.count,
                                               index.WORD_EXTENDED)
                MagicResolver.index_map[MagicResolver.count] = cls
                MagicResolver.rule_map[MagicResolver.count] = rule
                MagicResolver.count += 1
            # Literal rules are indexed as plain (english) words.
            for rule in cls.literal_rules:
                MagicResolver.indexer.add_word(rule[0], MagicResolver.count,
                                               index.WORD_ENGLISH)
                MagicResolver.index_map[MagicResolver.count] = cls
                MagicResolver.rule_map[MagicResolver.count] = rule
                MagicResolver.count += 1
        pyflaglog.log(
            pyflaglog.DEBUG,
            "Loaded %s signatures into Magic engine" % MagicResolver.count)
def __init__(self, screen):
    """Create the board grid, place the four opening stones, load the
    cell images and initialise the helper state used by the game logic.
    """
    # NOTE(review): `i / Config.CELL_NUM` relies on Python 2 integer
    # division for the row coordinate — confirm before porting to py3.
    self.board = [
        Cell(i % Config.CELL_NUM, i / Config.CELL_NUM)
        for i in range(Config.CELL_NUM**2)
    ]
    # Standard Othello/Reversi opening position.
    self.at(3, 3).state = Cell.WHITE
    self.at(3, 4).state = Cell.BLACK
    self.at(4, 3).state = Cell.BLACK
    self.at(4, 4).state = Cell.WHITE
    self.empty_img = pygame.image.load('empty.png').convert()
    self.black_img = pygame.image.load('black.png').convert()
    self.white_img = pygame.image.load('white.png').convert()
    self.empty_rect = self.empty_img.get_rect()
    # NOTE(review): black_rect/white_rect are taken from empty_img —
    # presumably all images share one size; verify this is intended.
    self.black_rect = self.empty_img.get_rect()
    self.white_rect = self.empty_img.get_rect()
    self.screen = screen
    self.__index = index.Index()
    self.__dummyCell = Cell(-1, -1)  # returned by at() for out-of-range cells
    self.__prevStates = None
    self.__referenceContainer = self.__initReferenceContainer()  # size is 64
    # List of empty cells; used by placeableCells().
    self.__emptyCells = [cell for cell in self.board]
    self.modifyEmptyCells(3, 3)
    self.modifyEmptyCells(3, 4)
    self.modifyEmptyCells(4, 3)
    self.modifyEmptyCells(4, 4)
def main():
    """ Do not modify. Run and evaluate all methods. """
    queries, relevances, docs = read_data()
    n_hits = 10
    idx = index.Index(docs)
    # Two baseline scorers plus a BM25 grid over (k, b).
    bm25_grid = [(1, .5), (1, 1), (2, .5), (2, 1)]
    scorers = [score.Cosine(), score.RSV()] + \
              [score.BM25(k=k, b=b) for k, b in bm25_grid]
    evaluators = [
        evaluate.Precision(),
        evaluate.Recall(),
        evaluate.F1(),
        evaluate.MAP(),
    ]
    all_results = run_all(queries, relevances, docs, idx, scorers,
                          evaluators, n_hits)
    write_results(all_results, 'Results.md')
def parse_folder(path):
    """Parses all .bib files in given folder.

    Returns a tuple (parsed_items, search_index) containing all items found.

    Raises:
        Exception: if *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        raise Exception("Path to folder expected")

    # Avoid shadowing the outer `path` parameter in the filter lambda.
    files = utils.search_in_folder(path, lambda name: name.endswith(".bib"))
    parsed_items = []
    # Parse files in parallel; the with-block guarantees the pool is shut
    # down even when a worker raises (the original leaked workers then).
    with concurrent.futures.ProcessPoolExecutor(
            max_workers=multiprocessing.cpu_count()) as executor:
        futures = [
            executor.submit(BibParser()._parse_file,
                            os.path.join(path, filename))
            for filename in files
        ]
        for future in futures:
            parsed_items += future.result()

    # sorted() already returns a list; the extra list() was redundant.
    parsed_items = sorted(
        parsed_items,
        key=BibItem.key_to_key_func(const.DEFAULT_ORDER_BY))
    item_index = search_index.Index(parsed_items)
    fin_ctx = FinalizingContext(item_index)
    for item in parsed_items:
        item.finalize_item_set(fin_ctx)
    item_index.update(parsed_items)
    return (parsed_items, item_index)
def test01timing_tests(self):
    """ Tests timing of indexing

    For increasing word counts, loads randomly-chosen dictionary words
    into a fresh index, then scans the test file in 1MB chunks counting
    hits, printing the elapsed time of each phase.
    """
    for count in [10, 100, 1000, 10000, 100000]:
        # Words shorter than 3 bytes (incl. newline) are skipped.
        words = [
            line.strip() for line in open(self.word_file) if len(line) >= 3
        ]
        idx = index.Index()
        t = time.time()
        print "Loading %s words: " % count,
        ## We want to load words from the dictionary in random
        ## order so we dont have a bias due to the fact the
        ## dictionary is sorted.
        print "Randomizing dictionary words"
        for line_count in range(0, count):
            if len(words) == 0:
                break
            i = random.randint(0, len(words) - 1)
            idx.add_word(words.pop(i), 1, index.WORD_LITERAL)
        new_t = time.time()
        print "Done - %s seconds (%s lines)" % (new_t - t, line_count)
        # Scan the test file in 1MB chunks, counting every match.
        fd = open(self.test_file)
        count = 0
        while 1:
            data = fd.read(1024 * 1024)
            if len(data) == 0:
                break
            for offset, matches in idx.index_buffer(data, unique=False):
                for id, length in matches:
                    count += 1
        print "Indexed file in %s seconds (%s hits)" % (time.time() - new_t,
                                                        count)
def build_index():
    """Build an index mapping each word of a character's Unicode name to
    that character, for all code points from 32 upward.
    """
    idx = index.Index()
    for codepoint in range(32, sys.maxunicode + 1):
        ch = chr(codepoint)
        # Characters without a name yield "" and contribute nothing.
        for token in unicodedata.name(ch, "").split():
            idx.add(token, ch)
    return idx
def run(driver_path=None):
    """Interactive menu loop dispatching to the selected information view.

    Loops until the user picks option 5 (then exits after showing the
    realtime view), option 6 (hard exit), or presses Ctrl-C.
    """
    while True:
        try:
            print_menu()
            choice = input('Which information do you want to know : ')
            if choice == '1':
                index.Index(driver_path).print_index()
            elif choice == '2':
                stockgraph.run()
            elif choice == '3':
                rate.Rate(driver_path).print_rate()
            elif choice == '4':
                rate.Rate(driver_path).calculate_rate()
            elif choice == '5':
                realtimePrice.run_mystock()
                break
            elif choice == '6':
                import sys
                sys.exit()
            else:
                print('invalid number!! write valid number')
        except KeyboardInterrupt:
            print('시스템이 강제로 종류 되었습니다.\n')
            break
def test_OverlapSearchEngine(self):
    """An overlap search for three terms should match only document 0."""
    cacm_path = os.path.abspath(
        os.path.join(testdir, "resources/test_cacm.all"))
    stopwords_path = os.path.abspath(
        os.path.join(testdir, "resources/common_words"))
    engine = SearchEngine.OverlapSearchEngine(
        Index.Index(cacm_path, stopwords_path))
    self.assertEqual(engine.search("department matrix programming"), [0])
def home():
    """Render the task list; on POST, first create the submitted task."""
    if request.method == 'POST':
        form = request.form
        index.Index().home(
            new_task=form.get('task'),
            time1=form.get('time1'),
            time2=form.get('time2'),
        )
    # Always show all tasks ordered by their start time.
    tasks = database.Task.query.order_by(database.Task.time1).all()
    return render_template('index.html', tasks=tasks)
def __init__(self, opts):
    """Configure the text engine from an options mapping.

    Recognised keys: "dir" (index directory, default "index"),
    "startIndexing" (default False) and "maxResults" (default 150).
    """
    self.log = SophLogger("textEngine.log")
    self.options = opts
    index_dir = opts.get("dir", "index")
    start_indexing = opts.get("startIndexing", False)
    self.dir = index_dir
    self.start = start_indexing
    self.maxResults = int(opts.get("maxResults", 150))
    # Open (and optionally start populating) the backing index.
    self.index = index.Index(index_dir, start=start_indexing)
    self.qp = question.DumbQuestionParser()
def build_idx(self, dictionary):
    """Build an indexer from an {id: word} mapping, registering every
    word as an english word under its id.
    """
    idx = index.Index()
    for word_id, word in dictionary.items():
        idx.add_word(word, word_id, index.WORD_ENGLISH)
    return idx
def test_BooleanSearchEngine(self):
    """A parsed boolean request should match only document 0."""
    parsed_request = SearchEngine.BooleanRequestParser().parse(
        "AND(department,OR(NOT(program),matrix))")
    cacm_path = os.path.abspath(
        os.path.join(testdir, "resources/test_cacm.all"))
    stopwords_path = os.path.abspath(
        os.path.join(testdir, "resources/common_words"))
    engine = SearchEngine.BooleanSearchEngine(
        Index.Index(cacm_path, stopwords_path))
    self.assertEqual(engine.search(parsed_request), [0])
def test_splitCACMFile(self):
    """Splitting the test corpus should yield the known first document."""
    idx = Index.Index()
    cacm_path = os.path.abspath(
        os.path.join(testdir, "resources/test_cacm.all"))
    with open(cacm_path, "r") as cacm_file:
        documents = idx.splitCACMFile(cacm_file)
    self.assertEqual(documents[0], self.test_document)
def index(self):
    """Return the backend index of this object, creating it lazily on
    first access (opening an existing on-storage index if one exists).
    """
    import index
    if not self._index:
        # Only create a brand-new index when none exists in storage yet.
        must_create = not index.exists(self.storage,
                                       indexname=self.indexname)
        self._index = index.Index(self.storage,
                                  create=must_create,
                                  schema=self._schema(),
                                  indexname=self.indexname)
    return self._index
def executeQuery(self):
    """Evaluate the boolean query stored in self.nomeArqQuery against the
    index and return the matching base entries, sorted by document id.

    Tokens that are not operators are replaced by their posting lists;
    then NOT (!), AND (&) and OR (|) are reduced in that precedence
    order until a single document-id list remains.
    """
    query = open(self.nomeArqQuery).read().lower()
    query = util.tokenize(query)
    indObj = index.Index(self.nomeArqBase, self.nomeArqIndice)
    ind = indObj.loadIndex()
    indArqs = self.base.keys()
    # Replace each non-operator token by its list of document ids.
    for i, v in enumerate(query):
        if v not in self.OPERATORS:
            query[i] = [int(oc.doc) for oc in ind[v]]
    # NOT
    # NOTE(review): the list is mutated while enumerate() walks it, which
    # skips elements after a deletion — presumably the outer while/flag
    # retry loop exists to compensate; verify on nested operators.
    while True:
        flag = False
        for i, v in enumerate(query):
            if v == "!":
                flag = True
                del query[i]  # remove operator
                query[i] = conditions.Condition.notCondition(query[i], indArqs)
        if not flag:
            break
    # AND
    while True:
        flag = False
        for i, v in enumerate(query):
            if v == "&":
                flag = True
                del query[i]  # remove operator
                # execute intersection
                query[i - 1] = conditions.Condition.andCondition(query[i-1], query[i])
                del query[i]  # remove one of the lists
        if not flag:
            break
    # OR
    while True:
        flag = False
        for i, v in enumerate(query):
            if (v == "|"):
                flag = True
                del query[i]  # remove operator
                # execute union
                query[i - 1] = conditions.Condition.orCondition(query[i-1], query[i])
                del query[i]  # remove one of the lists
        if not flag:
            break
    query[0].sort()
    return [self.base[q] for q in query[0]]
def test_createIndexFromCACMFile(self):
    """Document 0 of the test corpus should index to the expected stems."""
    cacm_path = os.path.abspath(
        os.path.join(testdir, "resources/test_cacm.all"))
    stopwords_path = os.path.abspath(
        os.path.join(testdir, "resources/common_words"))
    idx = Index.Index(cacm_path, stopwords_path)
    expected = {
        'depart': 1,
        'program': 1,
        'scheme': 1,
        'matrix': 1,
        'techniqu': 1,
    }
    self.assertEqual(idx.getIndexWithDocid(0), expected)
def test_search_items(self):
    """ Tests if parsed items can be searched by a bunch of parameters """
    items = bib_parser.BibParser().parse_string(TEST_ITEMS)
    item_index = index.Index(items)
    for item in items:
        item.process_crossrefs(item_index)
    item_index.update(items)

    author_search = search.search_for_iterable("author", "Петров")
    self.assertEqual(len(list(filter(author_search, items))), 1)

    # Each (year_from, year_to) pair must match exactly one item:
    # exact match, partial intersection, inner and outer containment.
    for year_from, year_to in [(1825, 1825), (1500, 1600),
                               (1499, 1501), (1400, 1600)]:
        year_search = search.and_([
            search.search_for("year_from", year_from),
            search.search_for("year_to", year_to),
        ])
        self.assertEqual(len(list(filter(year_search, items))), 1)

    self.assertEqual(len(list(item_index["keywords"]["grumbling"])), 1)

    both_keywords = (item_index["keywords"]["cinquecento"] &
                     item_index["keywords"]["historical dance"])
    self.assertEqual(len(list(both_keywords)), 1)
def test_persistIndex(self):
    """persistIndex should write the expected JSON serialisation to disk."""
    idx = Index.Index(
        os.path.abspath(os.path.join(testdir, "resources/test_cacm.all")),
        os.path.abspath(os.path.join(testdir, "resources/common_words")))
    idx.persistIndex("test/resources/index.txt")
    with open("test/resources/index.txt", "r") as persisted:
        contents = persisted.read()
    self.assertEqual(
        contents,
        """[{"program": 1, "scheme": 1, "depart": 1, "techniqu": 1, "matrix": 1}, {"program": 1, "engin": 1, "glossari": 1, "of": 1, "comput": 2, "terminolog": 1}]"""
    )
def test_inverted_index_search(self):
    """Both the direct and the inverted keyword key must be searchable,
    and the inverted key must select exactly item id_2.
    """
    items = bib_parser.BibParser()._parse_string(TEST_ITEMS)
    item_index = index.Index(items)
    direct_key = "cinquecento"
    inverted_key = const.INVERTED_INDEX_KEY_PREFIX + direct_key
    keyword_index = item_index["keywords"]
    self.assertIn(direct_key, keyword_index)
    self.assertIn(inverted_key, keyword_index)
    matches = keyword_index[inverted_key]
    self.assertEqual(len(matches), 1)
    self.assertEqual(utils.first(matches).id(), "id_2")
def test04uniqueIndexing(self):
    """ Test unique indexing mode

    A regex that could match at many offsets of the buffer must report
    only one hit when the index and the scan both run in unique mode.
    """
    idx = index.Index(unique=True)
    idx.add_word("\d{2,5}", 1, index.WORD_EXTENDED)
    data = "1234567890" * 3
    results = []
    for offset, matches in idx.index_buffer(data, unique=True):
        for id, length in matches:
            print "Found hit %s" % data[offset:offset + length]
            results.append(offset)
    ## We should only find a single hit since we are in unique
    ## mode
    self.assertEqual(len(results), 1)
def on_botonIniciar_clicked(self, widget):
    """Validate the login form; on success store the session globals,
    hide the login window and open the application's main window.
    """
    # Look the user up by username/password.
    usuario = users.buscarLogin(self.textoUsuario.get_text(),
                                self.textoClave.get_text())
    if (usuario != None):
        # Valid credentials: remember who is logged in.
        globalDef.glb_usuario = usuario.getId()
        globalDef.glb_usrNombre = usuario.getUsername()
        mostrar = mensajes.aviso(
            self.winLogin,
            mensajes.LOGIN_TRUE + '\n' + usuario.getName())
        # Login succeeded — hide the login window.
        self.winLogin.hide()
        inicio = index.Index()  # application's main window
    else:
        mostrar = mensajes.error(self.winLogin, mensajes.LOGIN_FALSE)
def test_parse_string(self):
    """Tests that a string is successfully parsed by BibParser."""
    items = bib_parser.BibParser()._parse_string(TEST_ITEMS)
    item_index = index.Index(items)

    # Inverted-index keys start with "!" and are excluded here.
    languages = {
        langid for langid in item_index["langid"].keys()
        if not langid.startswith("!")
    }
    keywords = set(item_index["keywords"].keys())
    self.assertEqual(len(items), 2)
    self.assertEqual(languages, EXPECTED_LANGUAGES)
    self.assertEqual(keywords, EXPECTED_KEYWORDS)

    item1 = next(iter(item_index["id"]["id_1"]))
    # Curly braces must be stripped from titles during parsing.
    self.assertNotIn('{', item1.title())
    self.assertNotIn('}', item1.title())
def __init__(self, pakfire, name, description):
    """Create a repository bound to *pakfire*, register it with the
    solver and set up an empty in-memory package index.
    """
    self.pakfire = pakfire
    self.name = name
    self.description = description

    # Corresponding repository object inside the solver, at this
    # repository's priority.
    self.solver_repo = satsolver.Repo(self.pool, self.name)
    self.solver_repo.set_priority(self.priority)

    # Some repositories may have a cache.
    self.cache = None

    log.debug("Initialized new repository: %s" % self)

    # Create an index (in memory).
    self.index = index.Index(self.pakfire, self)

    # Marks if this repository has been opened.
    self.opened = False
def test02UCS16Indexing(self):
    """ Test unicode indexing - simple words

    For each encoding, the dictionary words are added in that encoding
    and every hit found in the equally-encoded buffer must decode back
    to the original word (case-insensitively).
    """
    dictionary = { 5: u"hello", 10:u"world" }
    ## These are the encodings which will be tested:
    encodings = ["utf-16_le", "utf-16_be", "rot-13", "ms-pst"]
    line = u"Hello cruel world, hello..."
    print
    for encoding in encodings:
        print "Testing encoding %s" % encoding
        idx = index.Index()
        # Add each dictionary word in the encoding under test.
        for k,v in dictionary.items():
            idx.add_word(v.encode(encoding), k, index.WORD_LITERAL)
        data = line.encode(encoding)
        for offset, matches in idx.index_buffer(data):
            for id , length in matches:
                word = dictionary[id]
                matched = data[offset:offset+length]
                print "word: %s" % word, "matched: %r" % matched
                # The raw hit must round-trip to the dictionary word.
                self.assertEqual(word.lower(),
                                 matched.decode(encoding).lower())
def reindex(): global INDEX, INDEX_VERSION pyflaglog.log(pyflaglog.DEBUG, "Index manager: Building index trie") start_time = time.time() dbh = DB.DBO() INDEX_VERSION = Indexing.get_dict_version() dbh.execute("select word,id,type,class from dictionary") INDEX = index.Index() for row in dbh: ## Classes starting with _ are private classes and want to ## return all hits. if row['class'].startswith("_"): id = row['id'] + 2**30 else: id = row['id'] t = row['type'] ## Literal and extended are encoded using latin if t == 'literal': INDEX.add_word(row['word'].decode("latin").encode("latin"), id, index.WORD_LITERAL) elif t == 'regex': if type(row['word']) == str: word = row['word'].decode('latin') else: word = row['word'] INDEX.add_word(word.encode("latin"), id, index.WORD_EXTENDED) elif t == 'word': try: word = row['word'].decode("UTF-8").lower() for e in config.INDEX_ENCODINGS.split(","): w = word.encode(e) if len(w) >= 3: INDEX.add_word(w, id, index.WORD_ENGLISH) except UnicodeDecodeError, error: pyflaglog.log( pyflaglog.ERROR, "Unable to encode in encoding %e: %s" % (e, error))
def test_parse_string(self):
    """Tests that a string is successfully parsed by BibParser."""
    items = bib_parser.BibParser().parse_string(TEST_ITEMS)
    item_index = index.Index(items)
    for item in items:
        item.process_crossrefs(item_index)
    item_index.update(items)

    languages = set(item_index["langid"].keys())
    keywords = set(item_index["keywords"].keys())
    self.assertEqual(len(items), 2)
    self.assertEqual(languages, EXPECTED_LANGUAGES)
    self.assertEqual(keywords, EXPECTED_KEYWORDS)

    item1 = next(iter(item_index["id"]["id_1"]))
    # Curly braces must be stripped from titles during parsing.
    self.assertNotIn('{', item1.title())
    self.assertNotIn('}', item1.title())
    # The annotation must already be rendered as an HTML link.
    self.assertEqual(
        item1.annotation(),
        '<a href="http://example.com/description">http://example.com/description</a>'
    )
def mainprogram(self):
    """Open the Index window for the selected class and close this one."""
    selected_class = self.comboBox.currentText()
    self.index = index.Index(selected_class)
    self.close()
    self.index.show()
def main():
    """Build the index, then run the Tk GUI main loop."""
    idx = index.Index()
    root = tk.Tk()
    app = gui(root, idx)
    root.mainloop()