Example #1
 def __init__(self, config=cfg, cache=True):
     if not cache or not os.path.isfile(cfg.data_cache):
         self.train, self.val = self.train_val_split(
             utils.load_csv(cfg.train_csv), 0.9)
         self.test = utils.load_csv(cfg.test_csv, shuffle=False)
         utils.save_cache([self.train, self.val, self.test], cfg.data_cache)
     else:
         self.train, self.val, self.test = utils.load_cache(cfg.data_cache)
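All of these snippets revolve around a pair of load_cache/save_cache helpers whose implementation is not shown and whose signatures vary from project to project (a file path in Example #1, a cache name plus config in Examples #6 and #7, a key object plus cache directory in Example #13). As a rough orientation only, a minimal pickle-backed sketch of the path-based variant used above might look like this (everything below is an assumption, not any project's actual helper):

import os
import pickle

def save_cache(data, cache_path):
    # hypothetical helper: serialise `data` to `cache_path`
    os.makedirs(os.path.dirname(cache_path) or ".", exist_ok=True)
    with open(cache_path, "wb") as f:
        pickle.dump(data, f)

def load_cache(cache_path):
    # hypothetical helper: return the cached object, or None if no cache file exists yet
    if not os.path.isfile(cache_path):
        return None
    with open(cache_path, "rb") as f:
        return pickle.load(f)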
Example #2
def get_axioms(cat: str) -> list:
    """Return all axioms created by the Cat2Ax approach."""
    global __CATEGORY_AXIOMS__
    if '__CATEGORY_AXIOMS__' not in globals():
        __CATEGORY_AXIOMS__ = defaultdict(list, utils.load_cache('cat2ax_axioms'))
        if not __CATEGORY_AXIOMS__:
            raise ValueError('CATEGORY/CAT2AX: Axioms not initialised. Run axiom extraction before using them!')

    return __CATEGORY_AXIOMS__[cat]
Example #3
def _setup_hypernyms():
    """Initialisation of hypernyms that are extracted from Wikipedia categories using Cat2Ax axioms."""
    if utils.load_cache('wikitaxonomy_hypernyms') is not None:
        return  # only compute hypernyms if they do not already exist
    ccg = category.get_conceptual_category_graph()
    # initialise cat2ax axioms
    cat2ax_axioms = cat_axioms.extract_category_axioms(ccg)
    utils.update_cache('cat2ax_axioms', cat2ax_axioms)
    # initialise wikitaxonomy hypernyms
    wikitaxonomy_hypernyms = hypernymy_util.compute_hypernyms(ccg)
    utils.update_cache('wikitaxonomy_hypernyms', wikitaxonomy_hypernyms)
Example #4
 def __init__(self, load_atlas = False, load_split = None, use_estimated_3DBB = False, estimated_3DBB_path = None):
     self.dataset = load_cache(BOXCARS_DATASET)
     self.use_estimated_3DBB = use_estimated_3DBB
     
     self.atlas = None
     self.split = None
     self.split_name = None
     self.estimated_3DBB = None
     self.X = {}
     self.Y = {}
     for part in ("train", "validation", "test"):
         self.X[part] = None
         self.Y[part] = None # for labels as array of 0-1 flags
         
     if load_atlas:
         self.load_atlas()
     if load_split is not None:
         self.load_classification_split(load_split)
     if self.use_estimated_3DBB:
         self.estimated_3DBB = load_cache(estimated_3DBB_path)
Example #5
def is_hypernym(hyper_word: str, hypo_word: str) -> bool:
    """Returns True, if `hyper_word` and `hypo_word` are synonyms or if the former is a hypernym of the latter."""
    global __WIKITAXONOMY_HYPERNYMS__
    if '__WIKITAXONOMY_HYPERNYMS__' not in globals():
        __WIKITAXONOMY_HYPERNYMS__ = utils.load_cache('wikitaxonomy_hypernyms')
        if not __WIKITAXONOMY_HYPERNYMS__:
            raise ValueError(
                'wikitaxonomy_hypernyms not initialised. Run hypernym extraction once to create the necessary cache!'
            )

    if is_synonym(hyper_word, hypo_word):
        return True
    return hyper_word.lower() in __WIKITAXONOMY_HYPERNYMS__[hypo_word.lower()]
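The cache loaded here is expected to map lower-cased words to collections of hypernym words, i.e. the structure that compute_hypernyms in Example #8 produces. A toy illustration of the membership test in the final line (the words are invented):

__WIKITAXONOMY_HYPERNYMS__ = {"poodle": {"dog", "animal"}}

"dog" in __WIKITAXONOMY_HYPERNYMS__["poodle"]    # True  -> "dog" is a hypernym of "poodle"
"sofa" in __WIKITAXONOMY_HYPERNYMS__["poodle"]   # False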
Example #6
    def read(self, path):
        cache = utils.load_cache("facilities", self.config)

        if cache is None:
            self.progress = tqdm(desc="Loading Facilities ...")
            utils.make_xml_parser(self, utils.open_gzip(path))

            cache = self.process()
            utils.save_cache("facilities", cache, self.config)
        else:
            print("Loaded faciltiies from cache.")

        return cache
Example #7
    def read(self, path, facility_id_to_index):
        cache = None

        if self.config["use_population_cache"]:
            cache = utils.load_cache("population", self.config)

        if cache is None:
            self.progress = tqdm(desc = "Loading Population ...")
            utils.make_xml_parser(self, utils.open_gzip(path))

            cache = self.process(facility_id_to_index)
            utils.save_cache("population", cache, self.config)
        else:
            print("Loaded population from cache.")

        return cache
Example #8
def compute_hypernyms(category_graph) -> dict:
    """Retrieves all hypernym relationships from the three sources (Wiki corpus, WebIsALOD, Category axioms)."""
    hypernyms = defaultdict(set)

    # collect hypernyms from axiom matches between Wikipedia categories
    cat_headlemmas = category_graph.get_node_LHS()
    axiom_hypernyms = defaultdict(lambda: defaultdict(int))
    for parent, child in category_graph.get_axiom_edges():
        for cl in cat_headlemmas[child]:
            for pl in cat_headlemmas[parent]:
                axiom_hypernyms[cl.lower()][pl.lower()] += 1

    # load remaining hypernyms
    wiki_hypernyms = utils.load_cache('wikipedia_hypernyms')
    webisalod_data = pickle.load(
        bz2.open(utils.get_data_file('files.dbpedia.webisalod_hypernyms'),
                 mode='rb'))
    webisalod_hypernyms = defaultdict(dict)
    for parent, child, conf in webisalod_data:
        webisalod_hypernyms[child][parent] = conf

    # merge hypernyms
    candidates = set(axiom_hypernyms) | set(wiki_hypernyms) | set(
        webisalod_hypernyms)
    for candidate in candidates:
        hyper_count = defaultdict(int)
        if candidate in axiom_hypernyms:
            for word, count in axiom_hypernyms[candidate].items():
                if count >= THRESHOLD_AXIOM:
                    hyper_count[word] += 2
        if candidate in wiki_hypernyms:
            for word, count in wiki_hypernyms[candidate].items():
                if count >= THRESHOLD_WIKI:
                    hyper_count[word] += 1
        if candidate in webisalod_hypernyms:
            for word, conf in webisalod_hypernyms[candidate].items():
                if conf >= THRESHOLD_WEBISALOD:
                    hyper_count[word] += 1
        hypernyms[candidate] = {
            word
            for word, count in hyper_count.items() if count > 1
        }

    return hypernyms
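The merge at the end is a simple vote: an axiom-based hypernym that clears THRESHOLD_AXIOM contributes 2, Wiki-corpus and WebIsALOD hypernyms contribute 1 each, and only words whose combined score exceeds 1 are kept. A toy walk-through for one candidate, with invented words and illustrative threshold values:

THRESHOLD_AXIOM, THRESHOLD_WIKI, THRESHOLD_WEBISALOD = 2, 10, 0.4   # illustrative values only

axiom_hits     = {"dog": 3}                # 3 >= THRESHOLD_AXIOM         -> "dog" gets +2
wiki_hits      = {"dog": 4, "animal": 12}  # only 12 >= THRESHOLD_WIKI    -> "animal" gets +1
webisalod_hits = {"animal": 0.6}           # 0.6 >= THRESHOLD_WEBISALOD   -> "animal" gets +1

# combined scores: dog = 2, animal = 2; both exceed 1, so both survive as hypernyms of the candidate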
Example #9
def extract_wiki_corpus_resources():
    """Crawl the Wikipedia corpus for hearst patterns to retrieve hypernyms and type lexicalisations."""
    if utils.load_cache('wikipedia_type_lexicalisations') is not None:
        return  # only compute hypernyms and type lexicalisations if they do not already exist

    utils.get_logger().info(
        'WIKIPEDIA/NIF: Computing wikipedia hypernyms and type lexicalisations..'
    )
    total_hypernyms = defaultdict(lambda: defaultdict(int))
    total_type_lexicalisations = defaultdict(lambda: defaultdict(int))

    # initialize some caches to reduce the setup time of the individual processes
    dbp_store.get_types('')
    dbp_store.get_inverse_lexicalisations('')
    spacy_util.get_hearst_pairs('')

    with mp.Pool(processes=utils.get_config('max_cpus')) as pool:
        for hypernyms, type_lexicalisations in pool.imap_unordered(
                _compute_counts_for_resource,
                tqdm(_retrieve_plaintexts()),
                chunksize=1000):
            for (sub, obj), count in hypernyms.items():
                total_hypernyms[sub][obj] += count
            for (sub, obj), count in type_lexicalisations.items():
                total_type_lexicalisations[sub][obj] += count

    wikipedia_hypernyms = {
        word: dict(hypernym_counts)
        for word, hypernym_counts in total_hypernyms.items()
    }
    utils.update_cache('wikipedia_hypernyms', wikipedia_hypernyms)

    type_lexicalisations = {
        word: dict(type_counts)
        for word, type_counts in total_type_lexicalisations.items()
        if word not in STOP_WORDS
    }
    utils.update_cache('wikipedia_type_lexicalisations', type_lexicalisations)
Example #10
def get_movies(cached=False):
    if cached:
        print("Returning cached data")
        return load_cache(MOVIES_CACHE)

    options = Options()
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    chrome_bin_path = os.environ.get('GOOGLE_CHROME_BIN', None)
    chromedriver_path = os.environ.get('CHROMEDRIVER_PATH', None)
    if not chrome_bin_path or not chromedriver_path:
        print(
            'Chrome problem. Check that Chrome and chromedriver are installed and the environment variables are set.'
        )
        return []

    options.binary_location = chrome_bin_path
    # options.set_headless(headless=True)

    url = create_multikino_url()
    print(f"Getting {url} ...")
    browser = webdriver.Chrome(executable_path=chromedriver_path,
                               options=options)
    browser.get(url)
    html = browser.page_source
    save_to_file("multikino.html", html)
    print(f"browser: {browser}")
    browser.quit()

    movies = []

    print("Parsing...")
    soup = BeautifulSoup(html, "html.parser")
    for movie in soup.find_all(class_='filmlist__info'):
        title = movie.select(".filmlist__title > span")[0].get_text()
        rating, votes = None, None  # defaults so a failed lookup below does not leave them undefined
        try:
            rating = movie.find(attrs={
                "rv-show": "film.rank_value"
            }).select("span")[0].get_text()
            votes = movie.find(attrs={
                "rv-show": "film.rank_votes"
            }).select("span")[0].get_text()
        except AttributeError:
            print(f"No rating for {title}")
        except Exception as e:
            print(f"Something really bad happened: {e}")

        description = movie.select(".filmlist__synopsis > p")[0].get_text()
        genres = list(
            map(lambda item: item.get_text(),
                movie.find_all("a", class_="film-details__item")))
        genres = ', '.join(genres) or "-"

        if any(keyword in title for keyword in FILTER_KEYWORDS):
            continue

        movie = Movie(title=title,
                      votes=votes,
                      description=description,
                      genres=genres)
        movie.rating.mul = rating
        movies.append(movie)

    hash_movies = {movie.title: movie for movie in movies}

    print('Total movies found (+7 days from now): {}'.format(len(movies)))

    loop = asyncio.new_event_loop()
    print("Filmweb api call...")
    loop.run_until_complete(get_all_filmweb_api_data(hash_movies))
    print("IMDB api call...")
    loop.run_until_complete(get_all_imdb_api_data(hash_movies))

    movies = sort_movies_descending(movies)
    print("Saving cache...")
    save_cache(movies, MOVIES_CACHE)
    print("OK")
    return movies
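A side note on the browser setup in this example: webdriver.Chrome(executable_path=...) is the Selenium 3 calling convention and was removed in Selenium 4, where the driver path is wrapped in a Service object instead. A hedged equivalent for Selenium 4, reusing the same options and chromedriver_path as above:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4: the driver path moves into a Service; the Options object is unchanged
browser = webdriver.Chrome(service=Service(executable_path=chromedriver_path), options=options)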
Example #11
def get_type_lexicalisations(lemma: str) -> dict:
    """Return the type lexicalisation score for a set of lemmas (i.e. the probabilities of types given `lemmas`)."""
    global __TYPE_LEXICALISATIONS__
    if '__TYPE_LEXICALISATIONS__' not in globals():
        __TYPE_LEXICALISATIONS__ = defaultdict(dict, utils.load_cache('wikipedia_type_lexicalisations'))
    return __TYPE_LEXICALISATIONS__[lemma.lower()]
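Wrapping the loaded cache in defaultdict(dict) means a lemma without any recorded type lexicalisations yields an empty dict rather than raising KeyError. A small self-contained illustration (the lemma and type names are made up):

from collections import defaultdict

type_lex = defaultdict(dict, {"city": {"dbo:City": 0.8}})
type_lex["city"]     # {'dbo:City': 0.8}
type_lex["quux"]     # {} -- no KeyError; the missing key is silently added with an empty dict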
Example #12
                        help="Distance metric in objective function.")
    args = parser.parse_args()

    print('\n', " Call with Arguments ".center(50, "="), sep='')
    for item in args.__dict__:
        print("{:18}".format(item), "->\t", args.__dict__[item])
    return args


if __name__ == "__main__":
    args = parse_args()
    # Data # Genes x # Cells
    if args.raw is not None:
        data_dict = load_data(args.raw, args.t, args.groups, args.groups_col, args.batches, args.batches_col)
    else:
        data_dict = load_cache(args.cache)

    inmf = iNMF(data_dict, args.k, args.lam, args.gam, args.penalty, args.metric)
    print(inmf)

    tic = time.time()
    try:
        for i in range(100000):
            obj_val = inmf.cal_objective()
            inmf.cvg.update_ma(obj_val)
            if i == 0 or (i + 1) % 100 == 0:
                print("Iteration: {}\tObjective Value: {}".format(i + 1, obj_val))
            if inmf.cvg.is_converge():
                print("Convergence Criterion Reached at Iteration: {}".format(i + 1))
                break
            inmf.update_par()
Example #13
def build_detector(detector_model_dir,
                   detector_model_names,
                   save_model_name,
                   save_model_dir,
                   model_path,
                   MODEL,
                   det_model,
                   data,
                   data_format,
                   is_det_joint,
                   model_idx,
                   gpu_count=1):
    det_dict = {}
    det_set = {}
    det_idx_set = {}
    dropout_rate_set = {}
    det_gpu_idx = {}

    for val in detector_model_names:
        if val == '':
            continue

        cur_det_name, cur_p, cur_det_type, cur_dropout_rate, cur_model_id = val.split(
            '/')
        cur_model_id = int(cur_model_id)
        cur_det_path = os.path.join(detector_model_dir, cur_det_name)
        cur_detector = {
            "p": cur_p,
            "type": cur_det_type,
            "dropout_rate": cur_dropout_rate
        }
        det_dict[cur_det_name] = cur_detector

        if type(det_model) is list:
            cur_det_model = det_model[cur_model_id]
            cur_model_path = os.path.join(save_model_dir,
                                          save_model_name[cur_model_id])
            cur_det_idx = model_idx[cur_model_id]
        else:
            cur_det_model = det_model
            cur_model_path = model_path
            cur_det_idx = model_idx
        default_det_idx = cur_det_idx

        with tf.device('/gpu:' + str(cur_model_id % gpu_count)):
            # build detector
            print("# build detector: ", cur_det_name)
            print("type:", cur_det_type)
            print("p:", cur_p)
            print("drop_rate:", cur_dropout_rate)

            if cur_det_type == 'AED':
                cur_detector = AEDetector(cur_det_path, p=int(cur_p))
                cur_det_idx = load_model_idx(cur_det_path)
            elif cur_det_type == "DBD":
                id_reformer = IdReformer()
                print("# build reformer", cur_det_name)
                cur_reformer_t = SimpleReformer(cur_det_path)
                classifier = Classifier(cur_model_path,
                                        MODEL,
                                        data_format=data_format,
                                        model=cur_det_model)
                cur_detector = DBDetector(reconstructor=id_reformer,
                                          prober=cur_reformer_t,
                                          classifier=classifier,
                                          T=int(cur_p))
                cur_det_idx = load_model_idx(cur_det_path)

        if cur_det_idx is None:
            cur_det_idx = default_det_idx

        det_idx_set[cur_det_name] = cur_det_idx['validate']

        dropout_rate_set[cur_det_name] = float(cur_dropout_rate)
        det_set[cur_det_name] = cur_detector
        det_gpu_idx[cur_det_name] = cur_model_id % gpu_count

    # compute thrs
    thrs_set = {}
    det_info = {
        "model": save_model_name,
        "model_dir": save_model_dir,
        "det": det_dict,
        "det_dir": detector_model_dir,
        "joint_thrs": is_det_joint
    }

    cache_path = os.path.join(detector_model_dir, "cache")

    if is_det_joint:
        marks_set = []
        num = 0
        cache = load_cache(det_info, cache_path)
        if cache is None:
            cache_data = {}
            for cur_det_name, cur_det in det_set.items():
                validation_data = data.train_data_orig[
                    det_idx_set[cur_det_name]]
                num = int(
                    len(validation_data) * dropout_rate_set[cur_det_name])
                marks = cur_det.mark(validation_data, data_format=data_format)
                marks_set.append(marks)

                marks = np.sort(marks)
                cache_data[cur_det_name] = marks[-num]
                print("compute thrs for model #", cur_det_name, "#:",
                      marks[-num])

            marks_set = np.transpose(marks_set)
            marks_max = np.max(marks_set, axis=1)
            marks_max = np.sort(marks_max)
            max_thrs = marks_max[-num]

            cache_data['thrs'] = max_thrs
            if len(det_set) > 0:
                hash_id = save_cache(det_info, cache_data, cache_path)
                print("save cache:", hash_id)
        else:
            print("hit cache:", cache['hash_id'])
            cache_data = cache['data']
            for cur_det_name, cur_det in det_set.items():
                print("compute thrs for model #", cur_det_name, "#:",
                      cache_data[cur_det_name])
            max_thrs = cache_data['thrs']

        for cur_det_name, cur_det in det_set.items():
            thrs_set[cur_det_name] = max_thrs

        print("use joint thrs:", max_thrs)
    else:
        cache = load_cache(det_info, cache_path)
        if cache is None:
            cache_data = {}
            for cur_det_name, cur_det in det_set.items():
                validation_data = data.train_data_orig[
                    det_idx_set[cur_det_name]]
                num = int(
                    len(validation_data) * dropout_rate_set[cur_det_name])
                marks = cur_det.mark(validation_data, data_format=data_format)
                marks = np.sort(marks)

                thrs_set[cur_det_name] = marks[-num]
                cache_data[cur_det_name] = marks[-num]
                print("compute thrs for model #", cur_det_name, "#:",
                      marks[-num])

            if len(det_set) > 0:
                hash_id = save_cache(det_info, cache_data, cache_path)
                print("save cache:", hash_id)
        else:
            print("hit cache:", cache['hash_id'])
            cache_data = cache['data']
            for cur_det_name, cur_det in det_set.items():
                thrs_set[cur_det_name] = cache_data[cur_det_name]
                print("compute thrs for model #", cur_det_name, "#:",
                      cache_data[cur_det_name])

    return det_set, thrs_set, det_gpu_idx
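In both branches of Example #13 the per-detector threshold is taken from the tail of the sorted detector marks: with N validation samples and a dropout rate r, num = int(N * r) and the threshold is marks[-num], roughly the (1 - r) quantile of the marks. A small numeric illustration with made-up values:

import numpy as np

marks = np.sort(np.array([0.1, 0.5, 0.2, 0.9, 0.4, 0.3, 0.8, 0.7, 0.6, 1.0]))
dropout_rate = 0.2                      # allow the top 20% of marks to exceed the threshold
num = int(len(marks) * dropout_rate)    # num = 2
threshold = marks[-num]                 # 0.9 -- the value stored in thrs_set for this detector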
Example #14
args = parser.parse_args()

# model
model = {"fn": KNN, "params": {"n_neighbors": 3}}

if args.dataset is None:
    args.dataset = sorted(
        [d for d in os.listdir(args.data_dir) if d[0] != "."])

for dataset in args.dataset:
    for mv_type in args.mv_type:
        print("Running", dataset, mv_type)

        # build data
        cache_dir = os.path.join(args.cache_dir, dataset, mv_type)
        data, info = utils.load_cache(cache_dir)
        data = preprocess(data)

        data["X_val"] = data["X_val"][:args.val_size]
        data["y_val"] = data["y_val"][:args.val_size]

        print("Preprocess Finished")

        info["val_size"] = len(data["X_val"])

        result_classic = run_classic_clean(data, model)
        result_bc = run_boost_clean(data, model)
        save_path = utils.makedir(
            [args.save_dir, dataset, mv_type, "_" + str(args.val_size)],
            "baseline.csv")
        utils.dicts_to_csv([info, result_classic, result_bc], save_path)
Example #15
 def load_atlas(self):
     self.atlas = load_cache(BOXCARS_ATLAS)
Example #16
 def load_classification_split(self, split_name):
     self.split = load_cache(BOXCARS_CLASSIFICATION_SPLITS)[split_name]
     self.split_name = split_name
Example #17
def _main(flag_draw, flag_preview, flag_asis, flag_si, address):
    """
    ADDRESS - freeform address to get forecast for
    """

    address = ' '.join(address)  # ewww...
    load_cache(get_location)
    location = get_location(address)
    if not location:
        return 1
    save_cache(get_location)

    if flag_asis:
        nice_address = address
    else:
        nice_address = get_nice_address(location)

    weather = get_weather(location, flag_si)
    if weather is None or "currently" not in weather:
        return 1

    image_black = Image.new('1', (EPD_HEIGHT, EPD_WIDTH), 255)  # 298*126
    image_red = Image.new('1', image_black.size, 255)

    # estimate size of and draw forecast address
    address_text, address_size = get_text_fit(image_black, nice_address,
                                              image_black.size[0] - 4,
                                              CONFIG["font_address"],
                                              CONFIG["font_address_size_min"],
                                              CONFIG["font_address_size"])
    draw_centered_text(image_red, address_text, 0, CONFIG["font_address"],
                       address_size)
    max_address_height = get_font_height(image_black, CONFIG["font_address"],
                                         CONFIG["font_address_size"])

    # estimate sizes of today/tomorrow forecasts
    (d0w, d0h) = draw_icon_temp(image_black,
                                weather["daily"]["data"][0], (0, 0),
                                CONFIG["font_forecast_size"],
                                daily=True,
                                draw_it=False,
                                si_units=flag_si)
    (d1w, d1h) = draw_icon_temp(image_black,
                                weather["daily"]["data"][1], (0, 0),
                                CONFIG["font_forecast_size"],
                                daily=True,
                                draw_it=False,
                                si_units=flag_si)

    # position forecasts nicely
    d_gap = (image_black.size[0] - d0w - d1w) / 3
    d0x = d_gap
    d0y = image_black.size[1] - d0h - 2
    d1x = d_gap + d0w + d_gap
    d1y = d0y

    # actually draw forecasts
    draw_icon_temp(image_black,
                   weather["daily"]["data"][0], (d0x, d0y),
                   CONFIG["font_forecast_size"],
                   daily=True,
                   si_units=flag_si)
    draw_icon_temp(image_black,
                   weather["daily"]["data"][1], (d1x, d1y),
                   CONFIG["font_forecast_size"],
                   daily=True,
                   si_units=flag_si)

    (cw, ch) = draw_icon_temp(image_black,
                              weather["currently"], (0, 0),
                              CONFIG["font_main_size"],
                              daily=False,
                              draw_it=False,
                              si_units=flag_si)
    draw_icon_temp(
        image_black,
        weather["currently"],
        ((image_black.size[0] - cw) / 2, int(max_address_height * 0.9)),
        CONFIG["font_main_size"],
        daily=False,
        si_units=flag_si)

    if flag_preview:
        imgcat(gen_preview(image_black, image_red))
    if flag_draw:
        draw_epaper_horizontal(image_black, image_red)

    return 0
Example #18
    def __getitem__(self, index):
        start = index*self.batch_size
        end   = min(start + self.batch_size, len(self.ix))
        a     = self.y[self.iy[start:end],:]
        b     = self.x[self.ix[start:end],:]
        if self.verbose > 0: 
            self.progress.update()
            if self.progress.n >= len(self): self.progress.close()
        return [a,b]
    def __len__(self):
        return (len(self.ix) + self.batch_size - 1)//self.batch_size
    
if __name__ == '__main__':
    from utils import load_cache, group_label, shuffle_idxs, score_reshape
    
    train, y_, _, _ = load_cache('../../')
    score = np.random.random_sample(size=(len(train), len(train)))
    id2samples = group_label(y_)
    train_idx, _ = shuffle_idxs(train)
    
    
    from model import build_model
    model, branch_model, head_model = build_model(64e-5,0)
    
    inp = FeatureGen(train, train_idx)
    feats = branch_model.predict(inp[0])
    import ipdb; ipdb.set_trace()
    scoreGen = ScoreGen(feats)
    score = head_model.predict(scoreGen[0])
    res = score_reshape(score, feats)
    print(score.shape)
Example #19
                                 pos[1],
                             )) for isle, pos in zip(islands, bounds)
        ]

        for r in results:
            print("Completed Cacheing: ", r.get())


if __name__ == "__main__":
    '''
    islands = ["kauai", "molokai", "big_east", "big_west", "maui", "niihau"]

    data = load_cache(islands, cache_dir="tmp")
    '''

    data = load_cache(["oahu"], cache_dir="tmp")

    shape, X, Y = prep_data(data)

    x, x_test, y, y_test = train_test_split(X, Y, test_size=0.4)

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)

    save_test(x_test, y_test)

    model = load_model("model.h5")
    model.summary()

    data_gen = daily_generator(x_train, y_train)
    val_gen = daily_generator(x_val, y_val)
    model.fit_generator(generator=data_gen,