def __init__(self, config=cfg, cache=True):
    # Build the train/val/test splits, or restore them from the on-disk cache.
    # (Use the `config` argument consistently instead of the module-level `cfg`.)
    if not cache or not os.path.isfile(config.data_cache):
        self.train, self.val = self.train_val_split(utils.load_csv(config.train_csv), 0.9)
        self.test = utils.load_csv(config.test_csv, shuffle=False)
        utils.save_cache([self.train, self.val, self.test], config.data_cache)
    else:
        self.train, self.val, self.test = utils.load_cache(config.data_cache)
def get_axioms(cat: str) -> list:
    """Return all axioms created by the Cat2Ax approach."""
    global __CATEGORY_AXIOMS__
    if '__CATEGORY_AXIOMS__' not in globals():
        __CATEGORY_AXIOMS__ = defaultdict(list, utils.load_cache('cat2ax_axioms'))
        if not __CATEGORY_AXIOMS__:
            raise ValueError('CATEGORY/CAT2AX: Axioms not initialised. Run axiom extraction before using them!')
    return __CATEGORY_AXIOMS__[cat]
def _setup_hypernyms():
    """Initialisation of hypernyms that are extracted from Wikipedia categories using Cat2Ax axioms."""
    if utils.load_cache('wikitaxonomy_hypernyms') is not None:
        return  # only compute hypernyms if they do not exist already

    ccg = category.get_conceptual_category_graph()

    # initialise cat2ax axioms
    cat2ax_axioms = cat_axioms.extract_category_axioms(ccg)
    utils.update_cache('cat2ax_axioms', cat2ax_axioms)

    # initialise wikitaxonomy hypernyms
    wikitaxonomy_hypernyms = hypernymy_util.compute_hypernyms(ccg)
    utils.update_cache('wikitaxonomy_hypernyms', wikitaxonomy_hypernyms)
def __init__(self, load_atlas=False, load_split=None, use_estimated_3DBB=False, estimated_3DBB_path=None):
    self.dataset = load_cache(BOXCARS_DATASET)
    self.use_estimated_3DBB = use_estimated_3DBB

    self.atlas = None
    self.split = None
    self.split_name = None
    self.estimated_3DBB = None

    self.X = {}
    self.Y = {}
    for part in ("train", "validation", "test"):
        self.X[part] = None
        self.Y[part] = None  # for labels as array of 0-1 flags

    if load_atlas:
        self.load_atlas()
    if load_split is not None:
        self.load_classification_split(load_split)
    if self.use_estimated_3DBB:
        self.estimated_3DBB = load_cache(estimated_3DBB_path)
def is_hypernym(hyper_word: str, hypo_word: str) -> bool:
    """Return True if `hyper_word` and `hypo_word` are synonyms or if the former is a hypernym of the latter."""
    global __WIKITAXONOMY_HYPERNYMS__
    if '__WIKITAXONOMY_HYPERNYMS__' not in globals():
        __WIKITAXONOMY_HYPERNYMS__ = utils.load_cache('wikitaxonomy_hypernyms')
        if not __WIKITAXONOMY_HYPERNYMS__:
            raise ValueError('wikitaxonomy_hypernyms not initialised. Run hypernym extraction once to create the necessary cache!')
    if is_synonym(hyper_word, hypo_word):
        return True
    return hyper_word.lower() in __WIKITAXONOMY_HYPERNYMS__[hypo_word.lower()]
def read(self, path):
    cache = utils.load_cache("facilities", self.config)

    if cache is None:
        self.progress = tqdm(desc="Loading Facilities ...")
        utils.make_xml_parser(self, utils.open_gzip(path))
        cache = self.process()
        utils.save_cache("facilities", cache, self.config)
    else:
        print("Loaded facilities from cache.")

    return cache
def read(self, path, facility_id_to_index):
    cache = None
    if self.config["use_population_cache"]:
        cache = utils.load_cache("population", self.config)

    if cache is None:
        self.progress = tqdm(desc="Loading Population ...")
        utils.make_xml_parser(self, utils.open_gzip(path))
        cache = self.process(facility_id_to_index)
        utils.save_cache("population", cache, self.config)
    else:
        print("Loaded population from cache.")

    return cache
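# The two readers above share the same load-or-compute pattern. Below is a
# minimal, self-contained sketch of that pattern; the pickle-file layout and
# the `parse_facilities` callback are illustrative assumptions, not the
# project's actual utils.load_cache/save_cache implementation.
import os
import pickle


def load_or_compute(cache_path, compute, use_cache=True):
    """Return the object cached at `cache_path`, or compute, cache, and return it."""
    if use_cache and os.path.isfile(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    result = compute()
    with open(cache_path, "wb") as f:
        pickle.dump(result, f)
    return result


# Usage (hypothetical helper and path):
#     facilities = load_or_compute("facilities.pkl", lambda: parse_facilities(path))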
def compute_hypernyms(category_graph) -> dict:
    """Retrieves all hypernym relationships from the three sources (Wiki corpus, WebIsALOD, Category axioms)."""
    hypernyms = defaultdict(set)

    # collect hypernyms from axiom matches between Wikipedia categories
    cat_headlemmas = category_graph.get_node_LHS()
    axiom_hypernyms = defaultdict(lambda: defaultdict(int))
    for parent, child in category_graph.get_axiom_edges():
        for cl in cat_headlemmas[child]:
            for pl in cat_headlemmas[parent]:
                axiom_hypernyms[cl.lower()][pl.lower()] += 1

    # load remaining hypernyms
    wiki_hypernyms = utils.load_cache('wikipedia_hypernyms')
    webisalod_data = pickle.load(bz2.open(utils.get_data_file('files.dbpedia.webisalod_hypernyms'), mode='rb'))
    webisalod_hypernyms = defaultdict(dict)
    for parent, child, conf in webisalod_data:
        webisalod_hypernyms[child][parent] = conf

    # merge hypernyms: axiom evidence counts twice, wiki and WebIsALOD evidence once;
    # a hypernym is kept only if its combined score exceeds 1
    candidates = set(axiom_hypernyms) | set(wiki_hypernyms) | set(webisalod_hypernyms)
    for candidate in candidates:
        hyper_count = defaultdict(int)
        if candidate in axiom_hypernyms:
            for word, count in axiom_hypernyms[candidate].items():
                if count >= THRESHOLD_AXIOM:
                    hyper_count[word] += 2
        if candidate in wiki_hypernyms:
            for word, count in wiki_hypernyms[candidate].items():
                if count >= THRESHOLD_WIKI:
                    hyper_count[word] += 1
        if candidate in webisalod_hypernyms:
            for word, conf in webisalod_hypernyms[candidate].items():
                if conf >= THRESHOLD_WEBISALOD:
                    hyper_count[word] += 1
        hypernyms[candidate] = {word for word, count in hyper_count.items() if count > 1}
    return hypernyms
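# A toy walk-through of the merge rule in compute_hypernyms above: axiom
# evidence counts twice, Wikipedia and WebIsALOD evidence once, and a hypernym
# survives only with a combined score above 1. The thresholds and counts below
# are illustrative assumptions, not values from the source.
from collections import defaultdict

THRESHOLD_AXIOM, THRESHOLD_WIKI, THRESHOLD_WEBISALOD = 2, 3, 0.4
axiom_hypernyms = {'city': {'settlement': 5}}              # passes axiom threshold -> +2
wiki_hypernyms = {'city': {'settlement': 4, 'place': 1}}   # only 'settlement' passes -> +1
webisalod_hypernyms = {'city': {'place': 0.9}}             # 'place' passes -> +1

hyper_count = defaultdict(int)
for word, count in axiom_hypernyms['city'].items():
    if count >= THRESHOLD_AXIOM:
        hyper_count[word] += 2
for word, count in wiki_hypernyms['city'].items():
    if count >= THRESHOLD_WIKI:
        hyper_count[word] += 1
for word, conf in webisalod_hypernyms['city'].items():
    if conf >= THRESHOLD_WEBISALOD:
        hyper_count[word] += 1

# 'settlement' scores 2 + 1 = 3 and survives; 'place' scores only 1 and is dropped.
print({word for word, score in hyper_count.items() if score > 1})  # {'settlement'}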
def extract_wiki_corpus_resources():
    """Crawl the Wikipedia corpus for Hearst patterns to retrieve hypernyms and type lexicalisations."""
    if utils.load_cache('wikipedia_type_lexicalisations') is not None:
        return  # only compute hypernyms and type lexicalisations if they do not exist already

    utils.get_logger().info('WIKIPEDIA/NIF: Computing wikipedia hypernyms and type lexicalisations..')
    total_hypernyms = defaultdict(lambda: defaultdict(int))
    total_type_lexicalisations = defaultdict(lambda: defaultdict(int))

    # initialize some caches to reduce the setup time of the individual processes
    dbp_store.get_types('')
    dbp_store.get_inverse_lexicalisations('')
    spacy_util.get_hearst_pairs('')

    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        for hypernyms, type_lexicalisations in pool.imap_unordered(_compute_counts_for_resource, tqdm(_retrieve_plaintexts()), chunksize=1000):
            for (sub, obj), count in hypernyms.items():
                total_hypernyms[sub][obj] += count
            for (sub, obj), count in type_lexicalisations.items():
                total_type_lexicalisations[sub][obj] += count

    wikipedia_hypernyms = {word: dict(hypernym_counts) for word, hypernym_counts in total_hypernyms.items()}
    utils.update_cache('wikipedia_hypernyms', wikipedia_hypernyms)

    type_lexicalisations = {word: dict(type_counts) for word, type_counts in total_type_lexicalisations.items() if word not in STOP_WORDS}
    utils.update_cache('wikipedia_type_lexicalisations', type_lexicalisations)
def get_movies(cached=False):
    if cached:
        print("Returning cached data")
        return load_cache(MOVIES_CACHE)

    options = Options()
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')

    chrome_bin_path = os.environ.get('GOOGLE_CHROME_BIN', None)
    chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', None)
    if not chrome_bin_path or not chromedriver_path:
        print('Chrome problem. Check if Chrome and chromedriver are installed and environment variables are set.')
        return []
    options.binary_location = chrome_bin_path
    # options.set_headless(headless=True)

    url = create_multikino_url()
    print(f"Getting {url} ...")
    browser = webdriver.Chrome(executable_path=chromedriver_path, options=options)
    browser.get(url)
    html = browser.page_source
    save_to_file("multikino.html", html)
    print(f"browser: {browser}")
    browser.quit()

    movies = []
    print("Parsing...")
    soup = BeautifulSoup(html, "html.parser")
    for movie in soup.find_all(class_='filmlist__info'):
        title = movie.select(".filmlist__title > span")[0].get_text()
        # default rating/votes so a missing rating does not raise NameError below
        rating = None
        votes = None
        try:
            rating = movie.find(attrs={"rv-show": "film.rank_value"}).select("span")[0].get_text()
            votes = movie.find(attrs={"rv-show": "film.rank_votes"}).select("span")[0].get_text()
        except AttributeError:
            print(f"No rating for {title}")
        except Exception as e:
            print(f"Something really bad happened: {e}")
        description = movie.select(".filmlist__synopsis > p")[0].get_text()
        genres = list(map(lambda item: item.get_text(), movie.find_all("a", class_="film-details__item")))
        genres = ', '.join(genres) or "-"
        if any(keyword in title for keyword in FILTER_KEYWORDS):
            continue
        movie = Movie(title=title, votes=votes, description=description, genres=genres)
        movie.rating.mul = rating
        movies.append(movie)

    hash_movies = {movie.title: movie for movie in movies}
    print('Total movies found (+7 days from now): {}'.format(len(movies)))

    loop = asyncio.new_event_loop()
    print("Filmweb api call...")
    loop.run_until_complete(get_all_filmweb_api_data(hash_movies))
    print("IMDB api call...")
    loop.run_until_complete(get_all_imdb_api_data(hash_movies))

    movies = sort_movies_descending(movies)
    print("Saving cache...")
    save_cache(movies, MOVIES_CACHE)
    print("OK")
    return movies
def get_type_lexicalisations(lemma: str) -> dict:
    """Return the type lexicalisation scores for a lemma (i.e. the probabilities of types given `lemma`)."""
    global __TYPE_LEXICALISATIONS__
    if '__TYPE_LEXICALISATIONS__' not in globals():
        __TYPE_LEXICALISATIONS__ = defaultdict(dict, utils.load_cache('wikipedia_type_lexicalisations'))
    return __TYPE_LEXICALISATIONS__[lemma.lower()]
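# get_axioms, is_hypernym, and get_type_lexicalisations above all lazily load a
# cache into a module-level global on first use. Below is a minimal sketch of
# that pattern as a reusable decorator; the `load_cache` stub and its example
# contents are illustrative assumptions, not the project's utils module.
import functools
from collections import defaultdict


def load_cache(name):
    """Stub standing in for utils.load_cache used in the snippets above."""
    return {'example category': ['example axiom']}


def lazy_cache(cache_name, default_factory=dict):
    """Load `cache_name` once, keep it in a closure, and pass it to the wrapped function."""
    def decorator(func):
        state = {}

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if 'data' not in state:
                loaded = load_cache(cache_name)
                if not loaded:
                    raise ValueError(f'{cache_name} not initialised. Run the extraction step first!')
                state['data'] = defaultdict(default_factory, loaded)
            return func(state['data'], *args, **kwargs)
        return wrapper
    return decorator


@lazy_cache('cat2ax_axioms', default_factory=list)
def get_axioms_example(axioms, cat):
    return axioms[cat]


print(get_axioms_example('example category'))  # ['example axiom']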
help="Distance metric in objective function.") args = parser.parse_args() print('\n', " Call with Arguments ".center(50, "="), sep='') for item in args.__dict__: print("{:18}".format(item), "->\t", args.__dict__[item]) return args if __name__ == "__main__": args = parse_args() # Data # Genes x # Cells if args.raw is not None: data_dict = load_data(args.raw, args.t, args.groups, args.groups_col, args.batches, args.batches_col) else: data_dict = load_cache(args.cache) inmf = iNMF(data_dict, args.k, args.lam, args.gam, args.penalty, args.metric) print(inmf) tic = time.time() try: for i in range(100000): obj_val = inmf.cal_objective() inmf.cvg.update_ma(obj_val) if i == 0 or (i + 1) % 100 == 0: print("Iteration: {}\tObjective Value: {}".format(i + 1, obj_val)) if inmf.cvg.is_converge(): print("Convergence Criterion Reached at Iteration: {}".format(i + 1)) break inmf.update_par()
def build_detector(detector_model_dir, detector_model_names, save_model_name, save_model_dir,
                   model_path, MODEL, det_model, data, data_format, is_det_joint, model_idx,
                   gpu_count=1):
    det_dict = {}
    det_set = {}
    det_idx_set = {}
    dropout_rate_set = {}
    det_gpu_idx = {}

    for val in detector_model_names:
        if val == '':
            continue
        cur_det_name, cur_p, cur_det_type, cur_dropout_rate, cur_model_id = val.split('/')
        cur_model_id = int(cur_model_id)
        cur_det_path = os.path.join(detector_model_dir, cur_det_name)
        cur_detector = {"p": cur_p, "type": cur_det_type, "dropout_rate": cur_dropout_rate}
        det_dict[cur_det_name] = cur_detector

        if type(det_model) is list:
            cur_det_model = det_model[cur_model_id]
            cur_model_path = os.path.join(save_model_dir, save_model_name[cur_model_id])
            cur_det_idx = model_idx[cur_model_id]
        else:
            cur_det_model = det_model
            cur_model_path = model_path
            cur_det_idx = model_idx
        default_det_idx = cur_det_idx

        with tf.device('/gpu:' + str(cur_model_id % gpu_count)):
            # build detector
            print("# build detector: ", cur_det_name)
            print("type:", cur_det_type)
            print("p:", cur_p)
            print("drop_rate:", cur_dropout_rate)
            if cur_det_type == 'AED':
                cur_detector = AEDetector(cur_det_path, p=int(cur_p))
                cur_det_idx = load_model_idx(cur_det_path)
            elif cur_det_type == "DBD":
                id_reformer = IdReformer()
                print("# build reformer", cur_det_name)
                cur_reformer_t = SimpleReformer(cur_det_path)
                classifier = Classifier(cur_model_path, MODEL, data_format=data_format, model=cur_det_model)
                cur_detector = DBDetector(reconstructor=id_reformer, prober=cur_reformer_t,
                                          classifier=classifier, T=int(cur_p))
                cur_det_idx = load_model_idx(cur_det_path)

            if cur_det_idx is None:
                cur_det_idx = default_det_idx
            det_idx_set[cur_det_name] = cur_det_idx['validate']
            dropout_rate_set[cur_det_name] = float(cur_dropout_rate)
            det_set[cur_det_name] = cur_detector
            det_gpu_idx[cur_det_name] = cur_model_id % gpu_count

    # compute thrs
    thrs_set = {}
    det_info = {
        "model": save_model_name,
        "model_dir": save_model_dir,
        "det": det_dict,
        "det_dir": detector_model_dir,
        "joint_thrs": is_det_joint,
    }
    cache_path = os.path.join(detector_model_dir, "cache")

    if is_det_joint:
        marks_set = []
        num = 0
        cache = load_cache(det_info, cache_path)
        if cache is None:
            cache_data = {}
            for cur_det_name, cur_det in det_set.items():
                validation_data = data.train_data_orig[det_idx_set[cur_det_name]]
                num = int(len(validation_data) * dropout_rate_set[cur_det_name])
                marks = cur_det.mark(validation_data, data_format=data_format)
                marks_set.append(marks)
                marks = np.sort(marks)
                cache_data[cur_det_name] = marks[-num]
                print("compute thrs for model #", cur_det_name, "#:", marks[-num])
            marks_set = np.transpose(marks_set)
            marks_max = np.max(marks_set, axis=1)
            marks_max = np.sort(marks_max)
            max_thrs = marks_max[-num]
            cache_data['thrs'] = max_thrs
            if len(det_set) > 0:
                hash_id = save_cache(det_info, cache_data, cache_path)
                print("save cache:", hash_id)
        else:
            print("hit cache:", cache['hash_id'])
            cache_data = cache['data']
            for cur_det_name, cur_det in det_set.items():
                print("compute thrs for model #", cur_det_name, "#:", cache_data[cur_det_name])
            max_thrs = cache_data['thrs']
        for cur_det_name, cur_det in det_set.items():
            thrs_set[cur_det_name] = max_thrs
        print("use joint thrs:", max_thrs)
    else:
        cache = load_cache(det_info, cache_path)
        if cache is None:
            cache_data = {}
            for cur_det_name, cur_det in det_set.items():
                validation_data = data.train_data_orig[det_idx_set[cur_det_name]]
                num = int(len(validation_data) * dropout_rate_set[cur_det_name])
                marks = cur_det.mark(validation_data, data_format=data_format)
                marks = np.sort(marks)
                thrs_set[cur_det_name] = marks[-num]
                cache_data[cur_det_name] = marks[-num]
                print("compute thrs for model #", cur_det_name, "#:", marks[-num])
            if len(det_set) > 0:
                hash_id = save_cache(det_info, cache_data, cache_path)
                print("save cache:", hash_id)
        else:
            print("hit cache:", cache['hash_id'])
            cache_data = cache['data']
            for cur_det_name, cur_det in det_set.items():
                thrs_set[cur_det_name] = cache_data[cur_det_name]
                print("compute thrs for model #", cur_det_name, "#:", cache_data[cur_det_name])

    return det_set, thrs_set, det_gpu_idx
args = parser.parse_args()

# model
model = {"fn": KNN, "params": {"n_neighbors": 3}}

if args.dataset is None:
    args.dataset = sorted([d for d in os.listdir(args.data_dir) if d[0] != "."])

for dataset in args.dataset:
    for mv_type in args.mv_type:
        print("Running", dataset, mv_type)

        # build data
        cache_dir = os.path.join(args.cache_dir, dataset, mv_type)
        data, info = utils.load_cache(cache_dir)
        data = preprocess(data)
        data["X_val"] = data["X_val"][:args.val_size]
        data["y_val"] = data["y_val"][:args.val_size]
        print("Preprocess Finished")
        info["val_size"] = len(data["X_val"])

        result_classic = run_classic_clean(data, model)
        result_bc = run_boost_clean(data, model)

        save_path = utils.makedir([args.save_dir, dataset, mv_type, "_" + str(args.val_size)], "baseline.csv")
        utils.dicts_to_csv([info, result_classic, result_bc], save_path)
def load_atlas(self):
    self.atlas = load_cache(BOXCARS_ATLAS)
def load_classification_split(self, split_name):
    self.split = load_cache(BOXCARS_CLASSIFICATION_SPLITS)[split_name]
    self.split_name = split_name
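# Usage sketch for the BoxCars dataset class whose constructor, load_atlas, and
# load_classification_split appear above. The class name `BoxCarsDataset`, the
# split name "hard", and the structure of `dataset.split` are assumptions for
# illustration; only the BOXCARS_* cache constants come from the snippets.
#
#     dataset = BoxCarsDataset(load_atlas=True, load_split="hard")
#     images = dataset.atlas                  # filled by load_atlas()
#     train_split = dataset.split["train"]    # filled by load_classification_split()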
def _main(flag_draw, flag_preview, flag_asis, flag_si, address):
    """ ADDRESS - freeform address to get forecast for """
    address = ' '.join(address)  # ewww...

    load_cache(get_location)
    location = get_location(address)
    if not location:
        return 1
    save_cache(get_location)

    if flag_asis:
        nice_address = address
    else:
        nice_address = get_nice_address(location)

    weather = get_weather(location, flag_si)
    if weather is None or "currently" not in weather:
        return 1

    image_black = Image.new('1', (EPD_HEIGHT, EPD_WIDTH), 255)  # 298*126
    image_red = Image.new('1', image_black.size, 255)

    # estimate size of and draw forecast address
    address_text, address_size = get_text_fit(image_black, nice_address,
                                              image_black.size[0] - 4,
                                              CONFIG["font_address"],
                                              CONFIG["font_address_size_min"],
                                              CONFIG["font_address_size"])
    draw_centered_text(image_red, address_text, 0, CONFIG["font_address"], address_size)
    max_address_height = get_font_height(image_black, CONFIG["font_address"], CONFIG["font_address_size"])

    # estimate sizes of today/tomorrow forecasts
    (d0w, d0h) = draw_icon_temp(image_black, weather["daily"]["data"][0], (0, 0),
                                CONFIG["font_forecast_size"], daily=True, draw_it=False, si_units=flag_si)
    (d1w, d1h) = draw_icon_temp(image_black, weather["daily"]["data"][1], (0, 0),
                                CONFIG["font_forecast_size"], daily=True, draw_it=False, si_units=flag_si)

    # position forecasts nicely
    d_gap = (image_black.size[0] - d0w - d1w) / 3
    d0x = d_gap
    d0y = image_black.size[1] - d0h - 2
    d1x = d_gap + d0w + d_gap
    d1y = d0y

    # actually draw forecasts
    draw_icon_temp(image_black, weather["daily"]["data"][0], (d0x, d0y),
                   CONFIG["font_forecast_size"], daily=True, si_units=flag_si)
    draw_icon_temp(image_black, weather["daily"]["data"][1], (d1x, d1y),
                   CONFIG["font_forecast_size"], daily=True, si_units=flag_si)

    (cw, ch) = draw_icon_temp(image_black, weather["currently"], (0, 0),
                              CONFIG["font_main_size"], daily=False, draw_it=False, si_units=flag_si)
    draw_icon_temp(image_black, weather["currently"],
                   ((image_black.size[0] - cw) / 2, int(max_address_height * 0.9)),
                   CONFIG["font_main_size"], daily=False, si_units=flag_si)

    if flag_preview:
        imgcat(gen_preview(image_black, image_red))
    if flag_draw:
        draw_epaper_horizontal(image_black, image_red)
    return 0
def __getitem__(self, index):
    start = index * self.batch_size
    end = min(start + self.batch_size, len(self.ix))
    a = self.y[self.iy[start:end], :]
    b = self.x[self.ix[start:end], :]
    if self.verbose > 0:
        self.progress.update()
        if self.progress.n >= len(self):
            self.progress.close()
    return [a, b]

def __len__(self):
    return (len(self.ix) + self.batch_size - 1) // self.batch_size


if __name__ == '__main__':
    from utils import load_cache, group_label, shuffle_idxs, score_reshape

    train, y_, _, _ = load_cache('../../')
    score = np.random.random_sample(size=(len(train), len(train)))
    id2samples = group_label(y_)
    train_idx, _ = shuffle_idxs(train)

    from model import build_model
    model, branch_model, head_model = build_model(64e-5, 0)

    inp = FeatureGen(train, train_idx)
    feats = branch_model.predict(inp[0])

    import ipdb; ipdb.set_trace()

    scoreGen = ScoreGen(feats)
    score = head_model.predict(scoreGen[0])
    res = score_reshape(score, feats)
    print(score.shape)
            pos[1],
        ))
        for isle, pos in zip(islands, bounds)
    ]
    for r in results:
        print("Completed Caching: ", r.get())


if __name__ == "__main__":
    '''
    islands = ["kauai", "molokai", "big_east", "big_west", "maui", "niihau"]
    data = load_cache(islands, cache_dir="tmp")
    '''
    data = load_cache(["oahu"], cache_dir="tmp")
    shape, X, Y = prep_data(data)

    x, x_test, y, y_test = train_test_split(X, Y, test_size=0.4)
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
    save_test(x_test, y_test)

    model = load_model("model.h5")
    model.summary()

    data_gen = daily_generator(x_train, y_train)
    val_gen = daily_generator(x_val, y_val)

    model.fit_generator(generator=data_gen,