def preprocess(train: pd.DataFrame, test: pd.DataFrame,
               verbose: bool = True) -> "tuple[pd.DataFrame, pd.DataFrame]":
    settings = load_json()
    train = _quantile(train, settings)
    train, test = _minmax(train, test, settings)
    train, test = _process_tokens(train, test, settings)
    return train, test
def split(df: pd.DataFrame, seed: int = 1156, test_size: float = 0.1):
    settings = load_json()
    column = settings["column"][0]
    data = df[[column["target"]] + column["text"] + column["numerical"]]
    return train_test_split(data, test_size=test_size, random_state=seed)
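# Usage sketch (hypothetical, not part of the module above): read a raw frame,
# split it with split(), then run the shared preprocessing over both halves.
# "reviews.csv" is an assumed input file; the column layout comes from the
# settings JSON that load_json() reads.
raw = pd.read_csv("reviews.csv")
train_df, test_df = split(raw, seed=1156, test_size=0.1)
train_df, test_df = preprocess(train_df, test_df, verbose=True)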
def __init__(self, name):
    self.width = None
    self.height = None
    self.grid = {}
    self.census = None
    self.running_census = None
    self.conf = Config(data.load_json(name + ".cfg"))
    self.name = name
    self.caught_fire = defaultdict(lambda: True)
    self.load(name)
def get_embbeding_layer(target_embbeding: str, vocabulary, verbose=True):
    settings = load_json()["embbeding"]
    embedding_dim = _get_embbeding_dim(settings, target_embbeding)
    embedding_path = path(target_embbeding)
    word_index = _get_word_index(vocabulary, verbose)
    embeddings_index = _get_embbedings_index(embedding_path, verbose)
    num_tokens = len(vocabulary) + 2  # +2 for padding and out-of-vocabulary tokens
    embbeding_matrix = _get_matrix(num_tokens, embedding_dim, word_index,
                                   embeddings_index, verbose)
    layer = Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=initializers.Constant(embbeding_matrix),
        trainable=False,  # keep the pretrained vectors frozen
    )
    return layer, embbeding_matrix
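# Usage sketch (hypothetical): wire the frozen pretrained layer into a small
# Keras classifier. "glove.6B.100d" is a placeholder embedding name and
# "vocab" an assumed token list; the real names live in the "embbeding"
# section of the settings JSON.
from tensorflow import keras
from tensorflow.keras import layers

embedding_layer, matrix = get_embbeding_layer("glove.6B.100d", vocab)
inputs = keras.Input(shape=(None,), dtype="int64")
x = embedding_layer(inputs)              # (batch, seq_len, dim), weights frozen
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)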
def __init__(self, name, seq, pos_x, pos_y, *groups):
    super(Anim, self).__init__(*groups)
    self.conf = conf = data.load_json(name)
    self.sheet = data.load_image(conf["image"])
    self.frame_w, self.frame_h = conf["frame"]
    self.offset_x, self.offset_y = conf["offset"]
    self.rect = pygame.Rect(
        pos_x + self.offset_x,
        pos_y + self.offset_y,
        self.frame_w,
        self.frame_h,
    )
    self.delay = conf.get("delay", 0)
    self.time = self.delay  # force first update
    self.seq = None
    self.set_seq(seq)
    self.update()
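# Usage sketch (hypothetical): build an Anim from an assumed "explosion"
# config (a JSON with "image", "frame", "offset", and an optional "delay",
# as read above) and step it once per frame from the game loop.
sprites = pygame.sprite.Group()
boom = Anim("explosion", "default", 64, 32, sprites)
# ...inside the game loop:
sprites.update()         # advances the animation via Anim.update()
sprites.draw(screen)     # "screen" is an assumed display surface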
def __init__(self, name):
    self.name = name
    self.conf = data.load_json(name)
    self.surf = data.load_image(self.conf["image"])
    self.tiles = defaultdict(list)
    self.dummy = pygame.Surface((GRID_W, GRID_H))
    self.dummy.fill((255, 0, 0))

    # Cut the named tiles out of the sheet.
    for name, coords in self.conf["tiles"].items():
        for ix, iy in coords:
            x = ix * GRID_W
            y = iy * GRID_H
            tile = pygame.Surface((GRID_W, GRID_H), SRCALPHA)
            tile.blit(self.surf, (0, 0), (x, y, GRID_W, GRID_H))
            self.tiles[name].append(tile)

    if pygame.version.vernum >= (2, 0, 0):
        # pygame_sdl2 has trouble with the auto-generated tiles below, so we skip them
        return

    gen = defaultdict(list)

    # Straights: the extra variants are horizontal/vertical flips of the source tile.
    for name in ("road-hh", "road-vv"):
        for tile in self.tiles[name]:
            for flip_x, flip_y in ((True, False), (True, True), (False, True)):
                gen[name].append(pygame.transform.flip(tile, flip_x, flip_y))

    # Turns: each 90-degree CCW rotation maps one corner onto the next in the cycle.
    turns = ("road-tl", "road-bl", "road-br", "road-tr")
    for i, name in enumerate(turns):
        for tile in self.tiles[name]:
            for step in range(1, 4):
                tile = pygame.transform.rotate(tile, 90)
                gen[turns[(i + step) % 4]].append(tile)

    # Dead-ends: only the bottom stub is on the sheet; rotate it into the rest.
    ends = ("road-bb", "road-rr", "road-tt", "road-ll")
    for tile in self.tiles["road-bb"]:
        for step in range(1, 4):
            tile = pygame.transform.rotate(tile, 90)
            gen[ends[step]].append(tile)

    # Tees: the sheet provides "road--t" and "road-|l"; rotate them into the rest.
    tees = ("road--t", "road-|l", "road--b", "road-|r")
    for i, name in enumerate(tees[:2]):
        for tile in self.tiles[name]:
            for step in range(1, 4):
                tile = pygame.transform.rotate(tile, 90)
                gen[tees[(i + step) % 4]].append(tile)

    for name, tiles in gen.items():
        self.tiles[name].extend(tiles)
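# Usage sketch (hypothetical): after construction every road orientation is
# available in self.tiles, whether cut from the sheet or auto-generated.
# "Tileset" is an assumed class name; only __init__ is shown above.
tileset = Tileset("roads")
corner = tileset.tiles["road-tr"][0]     # first variant of the top-right turn
screen.blit(corner, (3 * GRID_W, 2 * GRID_H))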
from queneau import WordAssembler
from data import load_json

assembler = WordAssembler(load_json("dinosaurs.json"))

dinos = []
for _ in range(2):
    dino = assembler.assemble_word()
    # Use "an" before names starting with A/E/I/O.
    if dino[0] in 'AEIO':
        dino = "an " + dino
    else:
        dino = "a " + dino
    dinos.append(dino)
print("Look! Behind that ridge! It's %s fighting %s!" % tuple(dinos))
def get_hero_power():
    # Build the cached file on first use, then read it back.
    if not os.path.exists('hero_power.json'):
        create_all_normalized_power()
    hero_power = data.load_json('hero_power')
    return hero_power
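# Usage sketch: the first call builds hero_power.json via
# create_all_normalized_power(); later calls just read the cached file.
# "Axe" is a hypothetical key.
power = get_hero_power()
print(power.get("Axe"))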
def load_json(self, pagename):
    json_data = data.load_json(pagename)
    if json_data is None:
        return
    for item in json_data:
        self._add_content(item)
def main(arguments):
    ''' Main logic: parse args for tests to run and which models to evaluate '''
    log.basicConfig(format='%(asctime)s: %(message)s',
                    datefmt='%m/%d %I:%M:%S %p', level=log.INFO)

    args = handle_arguments(arguments)
    if args.seed >= 0:
        log.info('Seeding random number generators with {}'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
    maybe_make_dir(args.exp_dir)
    if args.log_file:
        log.getLogger().addHandler(log.FileHandler(args.log_file))
    log.info("Parsed args: \n%s", args)

    all_tests = sorted(
        [
            entry[:-len(TEST_EXT)]
            for entry in os.listdir(args.data_dir)
            if not entry.startswith('.') and entry.endswith(TEST_EXT)
        ],
        key=test_sort_key
    )
    log.debug('Tests found:')
    for test in all_tests:
        log.debug('\t{}'.format(test))

    tests = split_comma_and_check(args.tests, all_tests, "test") if args.tests is not None else all_tests
    log.info('Tests selected:')
    for test in tests:
        log.info('\t{}'.format(test))

    models = split_comma_and_check(args.models, MODEL_NAMES, "model") if args.models is not None else MODEL_NAMES
    log.info('Models selected:')
    for model in models:
        log.info('\t{}'.format(model))

    results = []
    for model_name in models:
        # Different models have different interfaces for things, but generally want to:
        # - if saved vectors aren't there:
        #   - load the model
        #   - load the test data
        #   - encode the vectors
        #   - dump the files into some storage
        # - else load the saved vectors
        log.info('Running tests for model {}'.format(model_name))
        if model_name == ModelName.BOW.value:
            model_options = ''
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
        elif model_name == ModelName.INFERSENT.value:
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
            if args.infersent_dir is None:
                raise Exception('infersent_dir must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.GENSEN.value:
            if args.glove_h5_path is None:
                raise Exception('glove_h5_path must be specified for {} model'.format(model_name))
            if args.gensen_dir is None:
                raise Exception('gensen_dir must be specified for {} model'.format(model_name))
            gensen_version_list = split_comma_and_check(args.gensen_version, GENSEN_VERSIONS, "gensen_prefix")
            if len(gensen_version_list) > 2:
                raise ValueError('gensen_version can only have one or two elements')
            model_options = 'version=' + args.gensen_version
        elif model_name == ModelName.GUSE.value:
            model_options = ''
        elif model_name == ModelName.COVE.value:
            if args.cove_encs is None:
                raise Exception('cove_encs must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.ELMO.value:
            model_options = 'time_combine={};layer_combine={}'.format(
                args.time_combine_method, args.layer_combine_method)
        elif model_name == ModelName.BERT.value:
            model_options = 'version=' + args.bert_version
        elif model_name == ModelName.OPENAI.value:
            if args.openai_encs is None:
                raise Exception('openai_encs must be specified for {} model'.format(model_name))
            model_options = ''
        else:
            raise ValueError("Model %s not found!" % model_name)

        model = None
        for test in tests:
            log.info('Running test {} for model {}'.format(test, model_name))
            enc_file = os.path.join(args.exp_dir, "%s.%s.h5" % (
                "%s;%s" % (model_name, model_options) if model_options else model_name,
                test))
            if not args.ignore_cached_encs and os.path.isfile(enc_file):
                log.info("Loading encodings from %s", enc_file)
                encs = load_encodings(enc_file)
                encs_targ1 = encs['targ1']
                encs_targ2 = encs['targ2']
                encs_attr1 = encs['attr1']
                encs_attr2 = encs['attr2']
            else:
                # load the test data
                encs = load_json(os.path.join(args.data_dir, "%s%s" % (test, TEST_EXT)))

                # load the model and do model-specific encoding procedure
                log.info('Computing sentence encodings')
                if model_name == ModelName.BOW.value:
                    encs_targ1 = bow.encode(encs["targ1"]["examples"], args.glove_path)
                    encs_targ2 = bow.encode(encs["targ2"]["examples"], args.glove_path)
                    encs_attr1 = bow.encode(encs["attr1"]["examples"], args.glove_path)
                    encs_attr2 = bow.encode(encs["attr2"]["examples"], args.glove_path)
                elif model_name == ModelName.INFERSENT.value:
                    if model is None:
                        model = infersent.load_infersent(args.infersent_dir, args.glove_path,
                                                         train_data='all', use_cpu=args.use_cpu)
                    model.build_vocab(
                        [
                            example
                            for k in ('targ1', 'targ2', 'attr1', 'attr2')
                            for example in encs[k]['examples']
                        ],
                        tokenize=True)
                    log.info("Encoding sentences for test %s with model %s...", test, model_name)
                    encs_targ1 = infersent.encode(model, encs["targ1"]["examples"])
                    encs_targ2 = infersent.encode(model, encs["targ2"]["examples"])
                    encs_attr1 = infersent.encode(model, encs["attr1"]["examples"])
                    encs_attr2 = infersent.encode(model, encs["attr2"]["examples"])
                elif model_name == ModelName.GENSEN.value:
                    if model is None:
                        gensen_1 = gensen.GenSenSingle(
                            model_folder=args.gensen_dir,
                            filename_prefix=gensen_version_list[0],
                            pretrained_emb=args.glove_h5_path,
                            cuda=not args.use_cpu)
                        model = gensen_1
                        if len(gensen_version_list) == 2:
                            gensen_2 = gensen.GenSenSingle(
                                model_folder=args.gensen_dir,
                                filename_prefix=gensen_version_list[1],
                                pretrained_emb=args.glove_h5_path,
                                cuda=not args.use_cpu)
                            model = gensen.GenSen(gensen_1, gensen_2)

                    vocab = gensen.build_vocab([
                        s
                        for set_name in ('targ1', 'targ2', 'attr1', 'attr2')
                        for s in encs[set_name]["examples"]
                    ])
                    model.vocab_expansion(vocab)

                    encs_targ1 = gensen.encode(model, encs["targ1"]["examples"])
                    encs_targ2 = gensen.encode(model, encs["targ2"]["examples"])
                    encs_attr1 = gensen.encode(model, encs["attr1"]["examples"])
                    encs_attr2 = gensen.encode(model, encs["attr2"]["examples"])
                elif model_name == ModelName.GUSE.value:
                    model = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
                    if args.use_cpu:
                        kwargs = dict(device_count={'GPU': 0})
                    else:
                        kwargs = dict()
                    config = tf.ConfigProto(**kwargs)
                    config.gpu_options.per_process_gpu_memory_fraction = 0.5  # use at most 50% of GPU memory
                    config.gpu_options.allow_growth = True  # allocate dynamically
                    with tf.Session(config=config) as session:
                        session.run([tf.global_variables_initializer(), tf.tables_initializer()])

                        def guse_encode(sents):
                            encs_node = model(sents)
                            encs = session.run(encs_node)
                            encs_d = {sents[j]: enc for j, enc in enumerate(np.array(encs).tolist())}
                            return encs_d

                        encs_targ1 = guse_encode(encs["targ1"]["examples"])
                        encs_targ2 = guse_encode(encs["targ2"]["examples"])
                        encs_attr1 = guse_encode(encs["attr1"]["examples"])
                        encs_attr2 = guse_encode(encs["attr2"]["examples"])
                elif model_name == ModelName.COVE.value:
                    load_encs_from = os.path.join(args.cove_encs, "%s.encs" % test)
                    encs = load_jiant_encodings(load_encs_from, n_header=1)
                elif model_name == ModelName.ELMO.value:
                    kwargs = dict(time_combine_method=args.time_combine_method,
                                  layer_combine_method=args.layer_combine_method)
                    encs_targ1 = elmo.encode(encs["targ1"]["examples"], **kwargs)
                    encs_targ2 = elmo.encode(encs["targ2"]["examples"], **kwargs)
                    encs_attr1 = elmo.encode(encs["attr1"]["examples"], **kwargs)
                    encs_attr2 = elmo.encode(encs["attr2"]["examples"], **kwargs)
                elif model_name == ModelName.BERT.value:
                    model, tokenizer = bert.load_model(args.bert_version)
                    encs_targ1 = bert.encode(model, tokenizer, encs["targ1"]["examples"])
                    encs_targ2 = bert.encode(model, tokenizer, encs["targ2"]["examples"])
                    encs_attr1 = bert.encode(model, tokenizer, encs["attr1"]["examples"])
                    encs_attr2 = bert.encode(model, tokenizer, encs["attr2"]["examples"])
                elif model_name == ModelName.OPENAI.value:
                    load_encs_from = os.path.join(args.openai_encs, "%s.encs" % test)
                    #encs = load_jiant_encodings(load_encs_from, n_header=1, is_openai=True)
                    encs = load_encodings(load_encs_from)
                    encs_targ1 = encs["targ1"]["encs"]
                    encs_targ2 = encs["targ2"]["encs"]
                    encs_attr1 = encs["attr1"]["encs"]
                    encs_attr2 = encs["attr2"]["encs"]
                else:
                    raise ValueError("Model %s not found!" % model_name)

                encs["targ1"]["encs"] = encs_targ1
                encs["targ2"]["encs"] = encs_targ2
                encs["attr1"]["encs"] = encs_attr1
                encs["attr2"]["encs"] = encs_attr2

                log.info("\tDone!")
                if not args.dont_cache_encs:
                    log.info("Saving encodings to %s", enc_file)
                    save_encodings(encs, enc_file)

            enc = [e for e in encs["targ1"]['encs'].values()][0]
            d_rep = enc.size if isinstance(enc, np.ndarray) else len(enc)

            # run the test on the encodings
            log.info("Running SEAT...")
            log.info("Representation dimension: {}".format(d_rep))
            esize, pval = weat.run_test(encs, n_samples=args.n_samples,
                                        parametric=args.parametric)
            results.append(dict(
                model=model_name,
                options=model_options,
                test=test,
                p_value=pval,
                effect_size=esize,
                num_targ1=len(encs['targ1']['encs']),
                num_targ2=len(encs['targ2']['encs']),
                num_attr1=len(encs['attr1']['encs']),
                num_attr2=len(encs['attr2']['encs'])))

        log.info("Model: %s", model_name)
        log.info('Options: {}'.format(model_options))
        for r in results:
            log.info("\tTest {test}:\tp-val: {p_value:.9f}\tesize: {effect_size:.2f}".format(**r))

    if args.results_path is not None:
        log.info('Writing results to {}'.format(args.results_path))
        with open(args.results_path, 'w') as f:
            writer = DictWriter(f, fieldnames=results[0].keys(), delimiter='\t')
            writer.writeheader()
            for r in results:
                writer.writerow(r)
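# Assumed entry point, following the usual pattern for scripts like this
# (requires "import sys" alongside the module's other imports):
if __name__ == '__main__':
    main(sys.argv[1:])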