def vectorial_search(querystring, collection_index, weight_type):
    """
    Vector-space search of `querystring` in `collection_index` using weights
    of type `weight_type`.
    Returns the results whose similarity exceeds the threshold, ordered by
    decreasing similarity.
    """
    search_results = []  # Search results

    # Index the query and build its vector
    query_doc = QueryDocument(querystring)
    query_index = Index([query_doc])

    # Compute the query vector with respect to the collection index
    query_vector = query_index.get_document_vector(query_doc.id, weight_type, collection_index)

    # Compute the similarity between the query and every document in the collection
    for doc_id in collection_index.documents_ids:
        doc_vector = collection_index.get_document_vector(doc_id, weight_type)
        similarity = cosinus_similarity(query_vector, doc_vector)
        search_result = SearchResult(doc_id, similarity)
        search_results.append(search_result)

    # Sort the results by decreasing similarity
    search_results = sorted(search_results, key=lambda result: -result.similarity)

    # Keep only the results with a similarity of at least 15%.
    # Several minimum thresholds were tried; 15% filtered the results best
    # on the dataset's reference queries.
    return [result for result in search_results if result.similarity > 0.15]
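# Editor's note: the function above relies on a `cosinus_similarity` helper that is not
# shown here. The sketch below is a minimal, hypothetical version, assuming the document
# vectors are sparse {term: weight} dicts; the project's real helper may differ.
import math

def cosinus_similarity(u, v):
    """Cosine similarity between two sparse vectors given as {term: weight} dicts."""
    dot = sum(weight * v.get(term, 0.0) for term, weight in u.items())
    norm_u = math.sqrt(sum(weight * weight for weight in u.values()))
    norm_v = math.sqrt(sum(weight * weight for weight in v.values()))
    if norm_u == 0.0 or norm_v == 0.0:
        return 0.0
    return dot / (norm_u * norm_v)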
def parseCreateTableString(self, createTableString):
    createTablePattern = re.compile('CREATE TABLE `(?P<name>[a-z_]+)` \((?P<columns>.*?)\) ENGINE=(?P<engine>[a-z]+) (AUTO_INCREMENT=(?P<autoincrement>\d+) )?DEFAULT CHARSET=(?P<charset>[a-z\d]+)', re.IGNORECASE | re.DOTALL)
    matches = createTablePattern.match(createTableString)
    if matches is None:
        print "Error:\n" + createTableString
        return
    columns = matches.group('columns').strip().split("\n")
    for index, column in enumerate(columns):
        column = column.strip()
        column = column.strip(',')
        primaryKeyMatch = re.match("^(PRIMARY KEY \((?P<columns>.*)\))", column)
        uniqueKeyMatch = re.match("^(UNIQUE KEY `(?P<key_name>.*?)` \((?P<columns>.*)\))", column)
        keyMatch = re.match("^(KEY `(?P<key_name>.*?)` \((?P<columns>.*)\))", column)
        if primaryKeyMatch is not None:
            indexColumns = self.columnStringsToObjects(Index.parseColumnNamesFromString(primaryKeyMatch.group('columns')))
            self.indexes.append(Index('PRIMARY', 'PRIMARY', indexColumns))
        elif uniqueKeyMatch is not None:
            indexColumns = self.columnStringsToObjects(Index.parseColumnNamesFromString(uniqueKeyMatch.group('columns')))
            self.indexes.append(Index('UNIQUE', uniqueKeyMatch.group('key_name'), indexColumns))
        elif keyMatch is not None:
            indexColumns = self.columnStringsToObjects(Index.parseColumnNamesFromString(keyMatch.group('columns')))
            self.indexes.append(Index('KEY', keyMatch.group('key_name'), indexColumns))
        else:
            self.columns.append(Column.fromString(column))
    self.name = matches.group('name')
    self.autoincrement = matches.group('autoincrement')
def __init__(self, data=None, index=None, name=None, series=None): ''' One-dimensional array with axis labels (including time series). :param data: (*array_like*) One-dimensional array data. :param index: (*list*) Data index list. Values must be unique and hashable, same length as data. :param name: (*string*) Series name. ''' if series is None: if isinstance(data, (list, tuple)): data = minum.array(data) if index is None: index = range(0, len(data)) else: if len(data) != len(index): raise ValueError('Wrong length of index!') if isinstance(index, (MIArray, DimArray)): index = index.tolist() if isinstance(index, Index): self._index = index else: self._index = Index.factory(index) self._data = data self._series = MISeries(data.array, self._index._index, name) else: self._series = series self._data = MIArray(self._series.getData()) self._index = Index.factory(index=self._series.getIndex())
def build_index(url, depth):
    index = Index()
    crawler = CustomCrawler(url, depth, index)
    crawler.crawl_all_links()
    index.status()
    print("Pages visited: %d" % len(crawler.visited))
    return index
def get_gradient(im, index, border_thickness_steps):
    """
    Computes the radial gradient, taking the thickness of the cell edges into account.
    @param im: image for which the gradient will be computed
    @param index: indices of pixels sorted by polar coordinates (alpha, radius)
    @param border_thickness_steps: number of steps over which to compute the gradient -
        depends on the cell border thickness
    @return: gradient matrix for the cell
    """
    # index of the axis along which the maximum gradient is taken
    max_gradient_along_axis = 2

    # dimensions of the (sub)image for which the gradient will be computed
    radius_lengths, angles = index.shape[0], index.shape[1]

    # matrix init:
    # a separate gradient is computed for every border-thickness step,
    # and the maximum gradient value over all steps is returned at the end
    border_thickness_steps = int(border_thickness_steps)
    gradients_for_steps = np.zeros((radius_lengths, angles, border_thickness_steps), dtype=np.float64)

    # for every step implied by the cell border thickness
    # (the smallest step has size 1, the largest has size ${border_thickness_steps}):
    for border_thickness_step in range(1, int(border_thickness_steps) + 1):
        # find the start and end indices of the input matrix slice for which the gradient is computed
        matrix_end = radius_lengths - border_thickness_step
        matrix_start = border_thickness_step

        # find the start and end of the pixel index slice for which the gradient is computed
        starting_index = index[:matrix_end, :]
        ending_index = index[matrix_start:, :]

        # find the slice of the result matrix where the computed gradient will be stored
        intersect_start = int(math.ceil(border_thickness_step / 2.0))
        intersect_end = int(intersect_start + matrix_end)

        # compute the current gradient for the selected (sub)image
        try:
            current_step_gradient = im[Index.to_numpy(ending_index)] - im[Index.to_numpy(starting_index)]
        except Exception:
            print border_thickness_step
            print radius_lengths
            print matrix_start
            print matrix_end
            print ending_index
            print starting_index
            raise
        current_step_gradient /= np.sqrt(border_thickness_step)

        # save the gradient to the slice of the results matrix determined above
        gradients_for_steps[intersect_start:intersect_end, :, border_thickness_step - 1] = current_step_gradient

    return gradients_for_steps.max(axis=max_gradient_along_axis)
def test1(self): con = make_dbcon() ds = DataStore(con) col = ds.collection("users") i = Index(con, col, 'email') self.assertEqual(i.name(), 'email')
def init(self, *file): self.rules = [] self.if_index = Index().init() self.then_index = Index().init() if file: self.load_rules(file[0]) return self
def initialize(facts, kbase): known = Index().init() for fact in facts: known.store(fact, (fact, 'initial')) # fact, proof known.store(['true'], (['true'], 'atomic')) # if true then... for rule in kbase.rules: rule['trigger'] = 0 return known
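# Editor's note: `initialize` and the Kbase/Index helpers never spell out the shape of a
# rule or fact. The literals below are a hypothetical illustration inferred from how the
# code accesses them (rule['rule'], rule['if'], rule['then'], rule['trigger']); the actual
# rule syntax of the system may differ.
example_fact = ['parent', 'homer', 'bart']            # facts are token lists
example_rule = {
    'rule': 'R1',                                     # id checked by Kbase.remove_rule
    'if':   [['parent', '?x', '?y']],                 # antecedents, indexed in if_index
    'then': [['has-child', '?x']],                    # consequents, indexed in then_index
    'trigger': 0,                                     # reset by initialize() before a run
}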
def __init__(self, config, lang=None): """ Constructor method """ self.config = config self.lang = lang self.subscribers = Subscribers(config) self.index = Index(self.config)
def setup(html_file=None, corpus_file=CORPUS_CSV, index_file=INDEX):
    # Preprocessing the html
    createCSV(src_file=html_file)
    # Create index
    idxf2 = Index(corpus=corpus_file)
    if os.path.exists(index_file):
        print("Detected %s exists, removing ..." % index_file)
        os.remove(index_file)
    idxf2.save(index_file)
def test_passing_stopwords_should_remove_these_words_from_token_list(self): index = Index(stopwords=['yes', 'no', ',', '.', '!']) index.add_document('coffee', 'Yes, sir! No, Joyce.') self.assertEquals( index._index, { 'sir': set(['coffee']), 'joyce': set(['coffee']) }, )
def setUp(self): root_path = os.path.dirname(os.path.realpath(__file__)) + "/../" try: os.unlink(root_path + "data/{{cookiecutter.project_slug}}.json") os.unlink(root_path + "data/{{cookiecutter.project_slug}}.json.backup") except Exception: pass self.index = Index(connection=Mockup())
def test_asdf(self): index = Index() index[PurePath('a')] = '3' self.assertEqual(1, len(index)) self.assertIn(PurePath('a'), index) self.assertEqual('3', index[PurePath('a')]) del index[PurePath('a')] self.assertEqual(0, len(index)) self.assertFalse(index.keys())
def store(k, terms, shlf_path): main_shlv = shelve.open(shlf_path, 'c') for w in terms: if w in main_shlv: index = main_shlv[w] else: index = Index() index.add_doc(int(k)) main_shlv[w] = index main_shlv.close()
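# Editor's note: a hypothetical read-side counterpart to `store`, shown only to make the
# shelve layout explicit (one pickled Index object per term). The name `lookup` and the
# read-only 'r' flag are assumptions, not part of the original code.
def lookup(term, shlf_path):
    shlv = shelve.open(shlf_path, 'r')
    try:
        return shlv.get(term)  # the stored Index for `term`, or None if unseen
    finally:
        shlv.close()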
class Repository: def __init__(self): self.trees = {} self.commits = {} self.index = Index() self.branches = {} self.head = None def pack(self): return { 'index': self.index.pack(), 'trees': self.trees, 'commits': self.commits, 'branches': self.branches, 'head': self.head, } def dump(self, filename): dump_object(filename, self.pack()) def save(self, filename): save_object(filename, self.pack()) def __str__(self): return json.dumps(self.pack(), ensure_ascii=False) def branch(self, name, ref): self.branches[name] = ref def write_tree(self): (tree_id, tree) = self.index.write_tree() self.trees = dict(self.trees, **tree) return tree_id def write_commit(self, message, author, tree_id, previous=[]): commit = { 'tree_id': tree_id, 'previous': previous, 'message': message, 'date': datetime.now().__str__(), 'author': author } id = sha1(commit) self.commits[id] = commit return id def diff_branches(self, visitor, branch_a, branch_b): self.diff_commits(visitor, self.branches[branch_a], self.branches[branch_b]) def diff_commits(self, visitor, ref_commit_a, ref_commit_b): commit_a = self.commits[ref_commit_a] commit_b = self.commits[ref_commit_b] return diff_trees(visitor, commit_a['tree_id'], commit_b['tree_id'], self.trees)
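# Editor's note: Repository.write_commit relies on a `sha1` helper that is not shown
# (along with dump_object, save_object and diff_trees). A minimal sketch of sha1 under
# the assumption that objects are hashed via their canonical JSON serialization; the
# real helper may hash differently.
import hashlib
import json

def sha1(obj):
    payload = json.dumps(obj, sort_keys=True, ensure_ascii=False).encode('utf-8')
    return hashlib.sha1(payload).hexdigest()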
class Repository(object): ''' The git repository ''' GIT_DIR = '.git' INIT_DIR = [ 'branches', 'hooks', 'info', 'objects', 'objects/info', 'objects/pack', 'refs', 'refs/heads', 'refs/tags', ] INIT_FILE = [ ['HEAD', 'ref: refs/heads/master'], ['description', 'Unnamed repository'], ['info/exclude', ''], ] def __init__(self, workspace): self.workspace = workspace self.index = Index(os.path.join(workspace, '.git', 'index')) self.config = Config(workspace) self.head_path = self._get_head_path() self.head_tree = None if os.path.exists(self.head_path): self.head_tree = read_file(self.head_path).strip() def _get_head_path(self): branch_name = read_file(os.path.join(self.workspace, '.git', 'HEAD')).strip('\n').rsplit( '/', 1)[-1] return os.path.join(self.workspace, '.git', 'refs', 'heads', branch_name) def stage(self, files): try: for file in files: content = read_file(file) blob = Blob(self.workspace, content) if not os.path.exists(blob.path): write_object_to_file(blob.path, blob.content) stat = os.stat(os.path.join(self.workspace, file)) self.index.add_entry(file, ctime=stat.st_ctime, mtime=stat.st_mtime, dev=stat.st_dev, ino=stat.st_ino, mode=cal_mode(stat.st_mode), \ uid=stat.st_uid, gid=stat.st_gid, size=stat.st_size,sha1=blob.sha1, flags=0) self.index.write_to_file() except Exception, e: print 'stage file %s error: %s' % (file, e)
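# Editor's note: the Blob class used by `stage` above is not shown. For reference, real
# git derives a blob's id from a "blob <size>\0" header followed by the raw bytes, and
# zlib-compresses that same payload when writing the loose object under .git/objects;
# whether this codebase follows the exact same convention is an assumption.
import hashlib
import zlib

def git_blob_object(content):
    """Return (sha1 hex id, compressed payload) for raw `content` bytes, git-style."""
    header = b'blob %d\x00' % len(content)
    sha1 = hashlib.sha1(header + content).hexdigest()
    return sha1, zlib.compress(header + content)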
def test_should_store_tokens_lowercase(self): index = Index() index.add_document('doc', 'This IS mY firsT DoCuMeNt') expected_tokens = set(['this', 'is', 'my', 'first', 'document']) expected_index = {'this': set(['doc']), 'is': set(['doc']), 'my': set(['doc']), 'first': set(['doc']), 'document': set(['doc']),} self.assertEquals(index.tokens(), expected_tokens) self.assertEquals(dict(index._index), expected_index)
def test_pending_task_could_be_snoozed(self):
    today = datetime.datetime.today()
    today_10_min_future = today + datetime.timedelta(minutes=10)
    self.addTask(6, "task starting in 10 min", "PENDING", today_10_min_future.strftime('%Y-%m-%d %H:%M'), "NONE")
    overdue, startingsoon = self.index.listNotificationsPendingTasks()
    self.assertEqual(len(startingsoon), 1)
    self.index.snooze(6)
    index = Index()
    overdue, startingsoon = index.listNotificationsPendingTasks()
    self.assertEqual(len(startingsoon), 0)
def load_index(index_path, train_path, reconstruct=Setting.RERUN): print "load or construct index..." if not reconstruct and os.path.exists(index_path): index = load_data(index_path) else: index = Index() index.train_path = train_path index.construct_index() dump_data(index, index_path) print "done!" return index
def test_mark_task_done(self):
    now = datetime.datetime.now()
    newTask = self.index.addTask("new task", now, "NONE")
    taskWithDate, taskWithoutDates = self.index.listAll()
    self.assertEqual(len(taskWithDate), 3)
    self.assertEqual(len(taskWithoutDates), 1)
    updatedTask = self.index.markTaskComplete(newTask.id)
    self.assertEqual(updatedTask.status, "DONE")
    index = Index()
    deletedTask = index.findTaskById(updatedTask.id)
    self.assertIsNone(deletedTask)
def __init__(self): self.database = "hista" self.table = "bucket" self.primary_key = "id" self.meta = DBMeta() self.index = Index(self.database, self.table, self.primary_key) self.db_file = "hista.db" r = self._is_table_exists() if not r: print("table does not exist, create it now.") self.create_table()
class Repository(object): ''' The git repository ''' GIT_DIR = '.git' INIT_DIR = [ 'branches', 'hooks', 'info', 'objects', 'objects/info', 'objects/pack', 'refs', 'refs/heads', 'refs/tags', ] INIT_FILE = [ ['HEAD', 'ref: refs/heads/master'], ['description', 'Unnamed repository'], ['info/exclude', ''], ] def __init__(self, workspace): self.workspace = workspace self.index = Index(os.path.join(workspace, '.git', 'index')) self.config = Config(workspace) self.head_path = self._get_head_path() self.head_tree = None if os.path.exists(self.head_path): self.head_tree = read_file(self.head_path).strip() def _get_head_path(self): branch_name = read_file(os.path.join(self.workspace, '.git', 'HEAD')).strip('\n').rsplit('/', 1)[-1] return os.path.join(self.workspace, '.git', 'refs', 'heads', branch_name) def stage(self, files): try: for file in files: content = read_file(file) blob = Blob(self.workspace, content) if not os.path.exists(blob.path): write_object_to_file(blob.path, blob.content) stat = os.stat(os.path.join(self.workspace, file)) self.index.add_entry(file, ctime=stat.st_ctime, mtime=stat.st_mtime, dev=stat.st_dev, ino=stat.st_ino, mode=cal_mode(stat.st_mode), \ uid=stat.st_uid, gid=stat.st_gid, size=stat.st_size,sha1=blob.sha1, flags=0) self.index.write_to_file() except Exception, e: print 'stage file %s error: %s' % (file, e)
def __init__(self, data=None, index=None, columns=None, dataframe=None): if dataframe is None: if not data is None: if isinstance(data, dict): columns = data.keys() dlist = [] n = 1 for v in data.values(): if isinstance(v, (list, tuple)): n = len(v) v = np.array(v) elif isinstance(v, np.NDArray): n = len(v) dlist.append(v) for i in range(len(dlist)): d = dlist[i] if not isinstance(d, np.NDArray): d = [d] * n d = np.array(d) dlist[i] = d data = dlist if isinstance(data, np.NDArray): n = len(data) data = data._array else: dlist = [] n = len(data[0]) for dd in data: dlist.append(dd._array) data = dlist if index is None: index = range(0, n) else: if n != len(index): raise ValueError('Wrong length of index!') if isinstance(index, np.NDArray): index = index.tolist() if isinstance(index, Index): self._index = index else: self._index = Index.factory(index) if data is None: self._dataframe = MIDataFrame(self._index._index) else: self._dataframe = MIDataFrame(data, self._index._index, columns) else: self._dataframe = dataframe self._index = Index.factory(index=self._dataframe.getIndex())
def __init__(self, config, lang=None): """ Constructor method @param config: configuration @param lang: language """ self.config = config self.lang = lang self.subscribers = Subscribers(config) self.index = Index(self.config)
def call_index():
    global par_orb, par_color
    path = z.get()
    if path == '':
        tkMessageBox.showinfo('ERROR', 'Please provide a folder path!')
    else:
        if par_orb == 1:
            di = Data_index(path)
            di.insert_data()
        elif par_color == 1:
            i = Index(path)
            i.main_fun()
def __init__(self, data=None, index=None, columns=None, dataframe=None): if dataframe is None: if not data is None: if isinstance(data, dict): columns = data.keys() dlist = [] n = 1 for v in data.values(): if isinstance(v, (list, tuple)): n = len(v) v = minum.array(v) elif isinstance(v, MIArray): n = len(v) dlist.append(v) for i in range(len(dlist)): d = dlist[i] if not isinstance(d, MIArray): d = [d] * n d = minum.array(d) dlist[i] = d data = dlist if isinstance(data, MIArray): n = len(data) data = data.array else: dlist = [] n = len(data[0]) for dd in data: dlist.append(dd.array) data = dlist if index is None: index = range(0, n) else: if n != len(index): raise ValueError('Wrong length of index!') if isinstance(index, (MIArray, DimArray)): index = index.tolist() if isinstance(index, Index): self._index = index else: self._index = Index.factory(index) if data is None: self._dataframe = MIDataFrame(self._index._index) else: self._dataframe = MIDataFrame(data, self._index._index, columns) else: self._dataframe = dataframe self._index = Index.factory(index=self._dataframe.getIndex())
def main(filepath, column): indexer = Index(NGRAM) f = codecs.open(filepath, "r", "utf-8") lines = f.readlines() for line in lines: print line elems = line.split("\t") indexer.append(''.join(elems[column-1])) f.close() indexer.dump("data/") return
def test_index_in_memory(self): index = Index() self.assertEqual(0, len(index)) self.assertFalse(index.keys()) index[PurePath('a')] = '1' self.assertEqual(1, len(index)) self.assertIn(PurePath('a'), index) self.assertEqual('1', index[PurePath('a')]) del index[PurePath('a')] self.assertEqual(0, len(index)) self.assertFalse(index.keys())
def test_should_store_tokens_lowercase(self): index = Index() index.add_document('doc', 'This IS mY firsT DoCuMeNt') expected_tokens = set(['this', 'is', 'my', 'first', 'document']) expected_index = { 'this': set(['doc']), 'is': set(['doc']), 'my': set(['doc']), 'first': set(['doc']), 'document': set(['doc']), } self.assertEquals(index.tokens(), expected_tokens) self.assertEquals(dict(index._index), expected_index)
class Repository(object): """ The git repository """ GIT_DIR = ".git" INIT_DIR = [ "branches", "hooks", "info", "objects", "objects/info", "objects/pack", "refs", "refs/heads", "refs/tags", ] INIT_FILE = [["HEAD", "ref: refs/heads/master"], ["description", "Unnamed repository"], ["info/exclude", ""]] def __init__(self, workspace): self.workspace = workspace self.index = Index(os.path.join(workspace, ".git", "index")) self.config = Config(workspace) def stage(self, files): try: for file in files: content = read_file(file) blob = Blob(self.workspace, content) if not os.path.exists(blob.path): write_object_to_file(blob.path, blob.content) stat = os.stat(os.path.join(self.workspace, file)) self.index.add_entry( file, ctime=stat.st_ctime, mtime=stat.st_mtime, dev=stat.st_dev, ino=stat.st_ino, mode=cal_mode(stat.st_mode), uid=stat.st_uid, gid=stat.st_gid, size=stat.st_size, sha1=blob.sha1, flags=0, ) self.index.write_to_file() except Exception, e: print "stage file %s error: %s" % (file, e)
def download_and_register_illustrations(self, illustrations): image_registration_arguments = [] for illustration in illustrations: image_url = illustration.meta_single_page.get('original_image_url', illustration.image_urls.large) tags = [tag['name'] for tag in illustration.tags] url_basename = os.path.basename(image_url) extension = os.path.splitext(url_basename)[1] name = 'pixiv_{}{}'.format(str(illustration.id), extension) self.app_api.download(url=image_url, name=name) image_registration_arguments.append((name, tags)) Index.get_or_create_instance().register_new_illustration_list(image_registration_arguments)
def test_multipart(self): index = Index() index[PurePath('a/b')] = '2' self.assertEqual(1, len(index)) self.assertIn(PurePath('a/b'), index) self.assertEqual([PurePath('a/b')], list(index)) self.assertEqual('2', index[PurePath('a/b')]) self.assertEqual(1, len(index[PurePath('a')])) self.assertIn(PurePath('b'), index[PurePath('a')]) self.assertEqual('2', index[PurePath('a')][PurePath('b')]) del index[PurePath('a/b')] self.assertEqual(0, len(index)) self.assertFalse(index.keys())
def main() -> None: index = Index(BIGBOOK_DIR, SHOW_INDEX_FILE) templates = TemplateLookup( [TEMPLATE_DIR / 'website', TEMPLATE_DIR / 'website' / 'Jinja'], strict_undefined=True) # Create website directories, if necessary. for dirname in ('Text', 'Images', 'Media', 'Fonts', 'Styles'): (WEBSITE_DIR / dirname).mkdir(exist_ok=True, parents=True) # Copy in the styling files from the web template. for dirname in ('Fonts', 'Styles', 'Images'): for file in (TEMPLATE_DIR / 'common' / dirname).glob('*'): copyfile(src=file, dst=WEBSITE_DIR / dirname / file.name) for file in (TEMPLATE_DIR / 'website' / dirname).glob('*'): copyfile(src=file, dst=WEBSITE_DIR / dirname / file.name) # Copy in the Big Book's media files, if necessary. for dirname in ('Images', 'Media'): for file in (BIGBOOK_DIR / dirname).glob('*'): dst = WEBSITE_DIR / dirname / file.name if not dst.exists(): copyfile(src=file, dst=dst) # Expand the 'index.html' file template. file = TEMPLATE_DIR / 'website' / 'index.html' template = templates.get_template(file.name) html_file = WEBSITE_DIR / file.name html_file.write_text(template.render(index=index)) # Expand the index pages' templates. for file in (TEMPLATE_DIR / 'website' / 'Text').glob('index-*.html'): template = templates.get_template(file.name) html_file = WEBSITE_DIR / 'Text' / file.name html_file.write_text(template.render(index=index)) # Expand the pages for the Big Book, using the page.html template. for article in index.articles(): # content = xhtml.content(article.file, heading=True) # I'm going to be a barbarian instead and use a regex on HTML. # It's acceptably accurate on these Big Book files, and much faster. html = article.file.read_text() content = re.search(r'<body[^>]*>(.+)</body>', html, re.DOTALL)[1] content = content.replace('.xhtml"', '.html"') template = templates.get_template('page.html') destination = WEBSITE_DIR / 'Text' / (article.id + '.html') destination.write_text( template.render(content=content, article=article))
def test_subindex(self): index = Index() subindex = Index() subindex[PurePath('c')] = '4' index[PurePath('d')] = subindex self.assertEqual(1, len(index)) self.assertIn(PurePath('d/c'), index) self.assertEqual('4', index[PurePath('d/c')]) self.assertEqual(1, len(index[PurePath('d')])) self.assertIn(PurePath('c'), index[PurePath('d')]) self.assertEqual('4', index[PurePath('d')][PurePath('c')]) del index[PurePath('d')] self.assertEqual(0, len(index)) self.assertFalse(index.keys())
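# Editor's note: the path-keyed tests above pin down the observable behavior of this
# `Index`: PurePath keys, nested sub-indexes for multi-part paths, and len/iter/contains/
# del over full paths. The class below is a minimal sketch that satisfies those
# assertions; it is not necessarily the project's actual implementation.
from pathlib import PurePath

class Index:
    def __init__(self):
        self._entries = {}

    @staticmethod
    def _split(path):
        parts = PurePath(path).parts
        rest = PurePath(*parts[1:]) if len(parts) > 1 else None
        return parts[0], rest

    def __setitem__(self, path, value):
        head, rest = self._split(path)
        if rest is None:
            self._entries[head] = value
        else:
            self._entries.setdefault(head, Index())[rest] = value

    def __getitem__(self, path):
        head, rest = self._split(path)
        value = self._entries[head]
        return value if rest is None else value[rest]

    def __delitem__(self, path):
        head, rest = self._split(path)
        if rest is None:
            del self._entries[head]
        else:
            child = self._entries[head]
            del child[rest]
            if not len(child):          # prune empty sub-indexes
                del self._entries[head]

    def __contains__(self, path):
        try:
            self[path]
            return True
        except KeyError:
            return False

    def __len__(self):
        return len(self._entries)

    def __iter__(self):
        # yield full (possibly nested) paths, e.g. PurePath('a/b')
        for name, value in self._entries.items():
            if isinstance(value, Index):
                for sub in value:
                    yield PurePath(name) / sub
            else:
                yield PurePath(name)

    def keys(self):
        return list(self)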
def __init__(self, collections):
    if not os.path.exists('saved_library'):  # TODO: use the name of the .avc file instead
        print("making new save data")
        self.library = Library()  # for now
        self.library.init_from_xml("test.avc")
    else:
        self.load_library('saved_library', True)
    # self.savedata = open('saved_library','wb')
    self.index = Index(self.library.get_clip_names())
    self.tag_index = Index(self.library.get_tags())
    self.col_index = Index(collections)
def test_calling_method_load_should_retrieve_object_from_pickle_file(self): fp = NamedTemporaryFile(delete=False) fp.close() self.filename = fp.name index = Index() index.add_document('coffee', 'I liked it') index.add_document('water', 'I need it') index.dump(self.filename) retrieved_index = Index.load(self.filename) self.assertEquals(len(retrieved_index), 2) self.assertEquals(set(retrieved_index._index.keys()), set(['i', 'liked', 'need', 'it']))
def build_index():
    """
    Builds an index from a given folder.

    Normalizes the documents, tokenizes them, and creates the index.
    This function is called only when the user has provided a wrong index file,
    or has not provided one at all.
    """
    processors.append(NormalizerProcessor())

    # Fetches every document from the input folder
    print('[FETCHING]\tReading text files from \'{0}\'...'.format(folder))
    documents = Document.fetch(folder, True)

    # Normalizes every loaded document
    print('[PROCESSING]\tNormalizing words from every document...')
    tokenize_all(documents)

    # Creates the index by mapping every word
    # to all the documents that reference it
    print('[INDEXING]\tBuilding index from words...\n')
    posting_list = Posting.index(tokenized_documents)
    index = Index.build(posting_list)
    return index
def test_put_schema__time_empty_should_fail(self): # arrange SUT = {"time": ""} # act with self.assertRaises(er.MultipleInvalid): Index.PUT_SCHEMA(SUT)
def plot1(self, dat): with Index(dat,['canton']) as df_: df = df_.loc[self.abbreviation] df = df.reset_index() dates = df['date'].drop_duplicates() #dates_complete = complete_index(dates) dates_complete = dates df = df.set_index('date') released = df.loc[:,'ncumul_released'].values released[np.isnan(released)] = 0 deceased = df.loc[:,'ncumul_deceased'].values deceased[np.isnan(deceased)] = 0 df.loc[:,'current_hosp'] = df.loc[:,'current_hosp'].values + released + deceased df.loc[:,'current_vent'] = df.loc[:,'current_vent'].values + released + deceased df.loc[:,'current_icu'] = df.loc[:,'current_icu'].values + deceased df = (df.rolling('7D').mean()).diff() #df = df.mean().diff() fig = plt.figure() ax = fig.subplots() ax.plot(df['ncumul_conf'],label='Neue Positive Testergebnisse') ax.plot(df['current_hosp'],label='Neue Hospitalisierungen') #ax.plot(df['current_icu'],label='Intensivstation') #ax.plot(df['ncumul_deceased'],label='Beatmet') ax.plot(df['ncumul_deceased'],label='Verstorben') plt.yscale('log') plt.grid(which='both') plt.legend() ax.set_title(self.canton + ' (Durchschnittswerte der letzten 7 Tage)') ax.set_ylabel('Durchschnittswert pro Tag') fig.autofmt_xdate() return ( fig )
def main(): pp = pprint.PrettyPrinter() data_dir = 'data/easyCLEF08' txt_file = os.path.join(data_dir, 'easyCLEF08_text.txt') query_file = os.path.join(data_dir, 'easyCLEF08_query.txt') relevants_file = os.path.join(data_dir, 'easyCLEF08_gt.txt') index = Index(directory=data_dir, txt_file=txt_file, create_index=False) okapi = Okapi(index, k1=1.80, b=0.65) diversity = DiversityClustering(DBSCANClustering, index) q = QueryParser(relevants_file) q.q.initFile(query_file) # while True: for i in range(1): query = q.nextQuery() if query is None: break if len(query.relevants) == 0: print('No relevants docs') continue docs_scores = okapi.getRanking(query.getText()) ordered_pred = diversity.order_pred(query, docs_scores)
def test_put_schema_with_empty_pin_code_should_pass(self): # arrange SUT = { "id": 1, "enable": True, "pdpContext": { "static": True, "id": 1, "retryTimeout": 1200, "primary": { "apn": "internet", "type": "ipv4v6" }, "secondary": { "type": "ipv4v6" } }, "pinCode": u"", "keepalive": { "enable": True, "targetHost": "8.8.8.8", "intervalSec": 60, "reboot": { "enable": False, "cycles": 1 } } } # act data = Index.PUT_SCHEMA(SUT) # assert self.assertEqual(SUT, data)
def __init__(self, comp=False, conf=False):
    '''
    Initiate the diagnosis finder.

    Parameters:
        comp - Recompiles the data files if set to True
        conf - Supply a Conf object
    '''
    if conf:
        self.conf = conf
    else:
        self.conf = Conf()
    self.compiler = Compile(self.conf)
    if comp:
        self.compiler.compile()
    self.index = Index(self.conf)
def optimize_language_parameters(measures=['AveragePrecision']):
    index = Index()
    models, names = [], []
    lissage_values = np.arange(0.1, 1.0, 0.1)
    for l in lissage_values:
        models.append(LanguageModel(index, l))
        names.append('LanguageModel_' + str(l))
    scores = compare_models(names, models, measures)
    ap = [scores[model_name]['AveragePrecision']['mean'] for model_name in names]
    ap_std = [scores[model_name]['AveragePrecision']['eval_std'] for model_name in names]

    fig = plt.figure(figsize=(10, 8))
    plt.plot(lissage_values, ap)
    plt.title('Language model : average precision depending on smoothing values')
    plt.xlabel('Smoothing (lissage)')
    plt.ylabel('Average Precision')
    plt.savefig('plot/Language_Model_ap.png')

    fig = plt.figure(figsize=(10, 8))
    plt.plot(lissage_values, ap_std)
    plt.title('Language model : average precision std depending on smoothing values')
    plt.xlabel('Smoothing (lissage)')
    plt.ylabel('Average Precision Std')
    plt.savefig('plot/Language_Model_ap_std.png')

    best_model = names[np.argmax(ap)]
    print('best_model', best_model)
def main(): # Must have at least one application app = WebApplication() # Generate index.html from index.enaml with open('index.html', 'wb') as f: f.write(Index().render())
def __init__(self, n_spiders=1, n_document_processors=1, seeds=None, indexable_content_types=None, do_not_crawl=[]): """ n_spiders -- The number of spider processes to use n_document_processors -- The number of document processors to use seeds -- A list of initial URLs to crawl indexable_content_types -- A list of MIME content-types for which files should be indexed do_not_crawl -- A list of regular expressions for which matching domain names will not be crawled """ if not indexable_content_types: indexable_content_types = ['text/html'] if not seeds: seeds = [] self.frontier = Manager().Queue() self.document_store = Manager().Queue() self.visited_cache_path = 'visited_cache' self.visited_cache = None self.index = Index() self.n_spiders = n_spiders self.n_document_processors = n_document_processors self.indexable_content_types = indexable_content_types self.spiders = [] self.document_processors = [] self.seed_urls = seeds self.do_not_crawl = do_not_crawl for seed_url in seeds: self.frontier.put(seed_url) self.status = 'STOPPED'
def make_indexes(number_of_columns, prim_key_column, table):
    """
    Make a list of empty indexes for the table on setup. The primary key column
    index must always exist, so only that index is instantiated to begin with.
    :param number_of_columns: int  # the number of columns in the table
    :param prim_key_column: int    # the position of the primary key column
    :param table: table object     # the table for which these indexes are being created
    :return: []                    # a list of None values with an Index at the primary key column position
    """
    indexes = [None] * number_of_columns
    index = Index()
    index.create_index(table=table, column_number=prim_key_column)
    indexes[prim_key_column] = index
    return indexes
def setUp(self):
    root_path = os.path.dirname(os.path.realpath(__file__)) + "/../"
    try:
        os.unlink(root_path + "data/ntp.json")
        os.unlink(root_path + "data/ntp.json.backup")
    except Exception:
        pass
    self.index = Index(connection=Mockup())
def __init__(self, workspace): self.workspace = workspace self.index = Index(os.path.join(workspace, '.git', 'index')) self.config = Config(workspace) self.head_path = self._get_head_path() self.head_tree = None if os.path.exists(self.head_path): self.head_tree = read_file(self.head_path).strip()
def setUp(self): self.root_path = os.path.abspath(os.path.dirname(__file__) + "/../") self.jsons = glob.glob(os.path.join(self.root_path, "data/*.json")) self.backups = glob.glob(os.path.join(self.root_path, "data/*.backup")) for file in self.jsons + self.backups: os.unlink(file) self.index = Index(connection=Mockup())
class Kbase:
    def init(self, *file):
        self.rules = []
        self.if_index = Index().init()
        self.then_index = Index().init()
        if file:
            self.load_rules(file[0])
        return self

    def remove_rule(self, id):
        for i in range(len(self.rules)):
            if self.rules[i]["rule"] == id:
                # unindex the rule's clauses before deleting the rule itself
                for if1 in self.rules[i]["if"]:
                    self.if_index.delete(if1)
                for then in self.rules[i]["then"]:
                    self.then_index.delete(then)
                del self.rules[i]
                return

    def add_rule(self, rule):
        self.index_rule(rule)
        self.rules.append(rule)

    def index_rule(self, rule):
        for if1 in rule["if"]:
            self.if_index.store(if1, rule)      # fwd: fact/if index tree
        for then in rule["then"]:
            self.then_index.store(then, rule)   # bkwd: goal/then index tree

    def match_if(self, fact):
        return self.if_index.search(fact)

    def match_then(self, goal, dict):
        return self.then_index.search(goal, dict)

    def load_rules(self, name):
        try:
            file = open(strip(name), "r")
            contents = file.read()                  # 'rule. rule.'
            rules = splitfields(contents, ".")      # ['rule','rule','']
            del rules[len(rules) - 1]               # ['rule','rule']
            for rule in rules:
                self.add_rule(internal_rule(rule))  # [{rule},{rule}]
            file.close()
        except IOError, cause:
            print "file error:", cause
def __init__(self, tweets): # send tweets into index self.index = Index(tweets) # get documents from index self.documents = self.index.documents #print self.documents self.dVectors = [None]*len(self.documents) ''' we want a slightly easier way to deal with the documents, so we're translating them from what we had before (document objects) to dictionaries of words -> tfidf values so we don't have to recompute tfidf values again and again and again... ''' for d in sorted(self.documents.keys()): doc = self.documents[d] vecs = {} for posting in doc.getPostingsList().values(): vecs[posting.getTerm()] = self.getTFIDF(posting) ''' so now we have vectors of all words in the document... but maybe we just want the top x ''' #vecs = self.reduceDimensionality(vecs) self.dVectors[d] = vecs ''' just for testing ''' #self.dVectors = createDummyData() #self.documents = None print self.cosineScore(self.dVectors[1],self.dVectors[1]) ''' run kmeans for k = 2 ''' statsFor2 = self.runKMeans(2) ''' run kmeans for k = 4 ''' statsFor4 = self.runKMeans(4) ''' run kmeans for k = 6 ''' statsFor6 = self.runKMeans(6) ''' run kmeans for k = 8 ''' statsFor8 = self.runKMeans(8) ''' start printing stats ''' print "-------------------------------------------------------------------" distanceFromOrigin = 0 for document in self.dVectors: if self.distanceMetric == 'euclidean': distanceFromOrigin += self.distanceBetween({},document) elif self.distanceMetric == 'cosine': distanceFromOrigin += self.cosineScore({}, document) else: print 'you have not defined a distance metric' print 'STATISTICS REPORT FOR K = 1, WHERE THE CENTER IS THE ORIGIN:' print " RSS:", distanceFromOrigin self.printStats(statsFor2) self.printStats(statsFor4) self.printStats(statsFor6) self.printStats(statsFor8)
def index_doc_set(self, doc_set_name): path = os.path.join(self._root_directory, doc_set_name) if os.path.isdir(path): idx = Index.open_or_create(self._index_directory) doc_set = DocSet(self._index_directory) doc_set.add_doc_set(doc_set_name) idx.add_field(_DOC_SET_FIELD, TEXT(stored=True)) idxr = _DocSetIndexer(idx, doc_set_name) idxr.index_directory(path)
def get_gradient(im, index, border_thickness_steps):
    """
    Computes the radial gradient, taking the thickness of the cell edges into account.
    @param im: image for which the gradient will be computed
    @param index: indices of pixels sorted by polar coordinates (alpha, radius)
    @param border_thickness_steps: number of steps over which to compute the gradient - depends on cell border thickness
    @return: gradient matrix for cell
    """
    # index of axis used to find the max gradient
    max_gradient_along_axis = 2
    # dimensions of the (sub)image for which the gradient will be computed
    radius_lengths, angles = index.shape[:2]
    # matrix init
    # a separate gradient is computed for every border-thickness step,
    # and the maximum gradient values over all steps are returned at the end
    border_thickness_steps = int(border_thickness_steps)
    gradients_for_steps = np.zeros((radius_lengths, angles, border_thickness_steps), dtype=np.float64)
    # for every step of thickness:
    for border_thickness_step in range(1, int(border_thickness_steps) + 1):
        # find start and end indices of the input matrix for which the gradient will be computed
        matrix_end = radius_lengths - border_thickness_step
        matrix_start = border_thickness_step
        # find start and end indices of the pixels for which the gradient will be computed
        starting_index = index[:matrix_end, :]
        ending_index = index[matrix_start:, :]
        # find the interval in the result matrix where the computed gradient will go
        intersect_start = int(math.ceil(border_thickness_step / 2.0))
        intersect_end = int(intersect_start + matrix_end)
        # compute the current gradient for the selected (sub)image
        current_step_gradient = im[Index.to_numpy(ending_index)] - im[Index.to_numpy(starting_index)]
        current_step_gradient /= np.sqrt(border_thickness_step)
        # save the gradient to the previously determined place in the results matrix
        gradients_for_steps[intersect_start:intersect_end, :, border_thickness_step - 1] = current_step_gradient
    return gradients_for_steps.max(axis=max_gradient_along_axis)
def test_index_file(): dir = './test_index' idx = Index.create(dir) writer = idx.get_writer() writer.add_document( path = u'/foo/bar', title = u'Foo: The History', last_modified = 34343423423, text = u'Not much to say here') writer.commit() assert idx.doc_count() == 1
def createIndex(self, name, col): """ Create an index on the table. Arguments: name -- name of the index, used to identify it col -- name of column to be used as key for the index """ index = Index.create(name, col) self.indexes.append(index) for row in self.rows: index.insert(row)