Beispiel #1
0
def preprocess(train: pd.DataFrame,
               test: pd.DataFrame,
               verbose: bool = True) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Run the configured preprocessing steps on the train and test frames.

    Settings are read with ``load_json()``.  ``_quantile`` is applied to the
    training frame only, while ``_minmax`` and ``_process_tokens`` receive
    both frames.

    Args:
        train: Training data.
        test: Test data.
        verbose: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        The processed ``(train, test)`` pair.
    """
    settings = load_json()

    # Only the training frame goes through the quantile step.
    train = _quantile(train, settings)
    train, test = _minmax(train, test, settings)
    train, test = _process_tokens(train, test, settings)

    return train, test
Beispiel #2
0
def split(df: pd.DataFrame,
          seed: int = 1156,
          test_size: float = 0.1,
          split: bool = True):
    """Select the configured columns of *df* and split them into two parts.

    The column names come from the first entry of the ``"column"`` setting
    returned by ``load_json()``: its ``"target"`` name followed by the
    ``"text"`` and ``"numerical"`` names (both are concatenated to a list,
    so they are presumably lists of column names -- confirm against the
    settings file).

    Args:
        df: Source frame.
        seed: Passed to ``train_test_split`` as ``random_state``.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        split: Not used by this function.

    Returns:
        Whatever ``train_test_split`` returns for the selected columns.
    """
    column_cfg = load_json()["column"][0]
    wanted = [column_cfg["target"]] + column_cfg["text"] + column_cfg["numerical"]
    selected = df[wanted]

    return train_test_split(selected, test_size=test_size, random_state=seed)
Beispiel #3
0
    def __init__(self, name):
        """Initialise empty state, read ``<name>.cfg`` and load *name*.

        Dimensions and census fields start as ``None`` and the grid starts
        empty; ``self.load(name)`` is called last, once the configuration
        and the ``caught_fire`` map are in place.
        """
        self.width = self.height = None
        self.grid = dict()
        self.census = self.running_census = None

        raw_conf = data.load_json(name + ".cfg")
        self.conf = Config(raw_conf)
        self.name = name

        # Keys that were never set read as True.
        self.caught_fire = defaultdict(lambda: True)

        self.load(name)
Beispiel #4
0
def get_embbeding_layer(target_embbeding: str, vocabulary, verbose=True):
    """Build a frozen ``Embedding`` layer for *vocabulary*.

    The embedding dimension is looked up in the ``"embbeding"`` settings for
    *target_embbeding*, the vectors are read from ``path(target_embbeding)``
    and combined with the vocabulary's word index into a matrix that is used
    as a constant initializer.

    Args:
        target_embbeding: Name of the embedding to use.
        vocabulary: Sized collection of vocabulary entries.
        verbose: Forwarded to the helper functions.

    Returns:
        A ``(layer, matrix)`` tuple: the non-trainable ``Embedding`` layer
        and the matrix it was initialised with.
    """
    embbeding_settings = load_json()["embbeding"]
    dim = _get_embbeding_dim(embbeding_settings, target_embbeding)
    vectors_path = path(target_embbeding)

    word_index = _get_word_index(vocabulary, verbose)
    vectors = _get_embbedings_index(vectors_path, verbose)

    # Two rows more than the vocabulary size (presumably for padding and
    # out-of-vocabulary tokens -- confirm against the vectorizer).
    row_count = len(vocabulary) + 2
    matrix = _get_matrix(row_count, dim, word_index, vectors, verbose)

    layer = Embedding(
        row_count,
        dim,
        embeddings_initializer=initializers.Constant(matrix),
        trainable=False,
    )
    return layer, matrix
    
    
Beispiel #5
0
    def __init__(self, name, seq, pos_x, pos_y, *groups):
        """Create an animation from the JSON description *name*.

        The description supplies the sprite sheet (``"image"``), the frame
        size (``"frame"``), the drawing offset (``"offset"``) and an optional
        ``"delay"`` (default 0).  The rect is placed at the given position
        shifted by the offset, sequence *seq* is selected and one update is
        run immediately.
        """
        super(Anim, self).__init__(*groups)
        conf = data.load_json(name)
        self.conf = conf

        self.sheet = data.load_image(conf["image"])

        frame_w, frame_h = conf["frame"]
        self.frame_w = frame_w
        self.frame_h = frame_h

        offset_x, offset_y = conf["offset"]
        self.offset_x = offset_x
        self.offset_y = offset_y

        left = pos_x + offset_x
        top = pos_y + offset_y
        self.rect = pygame.Rect(left, top, frame_w, frame_h)

        delay = conf.get("delay", 0)
        self.delay = delay
        # Start with the timer equal to the delay to force the first update.
        self.time = delay

        self.seq = None
        self.set_seq(seq)
        self.update()
Beispiel #6
0
    def __init__(self, name, seq, pos_x, pos_y, *groups):
        """Build an animated sprite described by the JSON file *name*.

        Reads ``"image"``, ``"frame"``, ``"offset"`` and the optional
        ``"delay"`` (default 0) from the description, positions the rect at
        ``(pos_x, pos_y)`` plus the offset, selects sequence *seq* and runs
        one update right away.
        """
        super(Anim, self).__init__(*groups)
        description = data.load_json(name)
        self.conf = description

        self.sheet = data.load_image(description["image"])
        (self.frame_w, self.frame_h) = description["frame"]
        (self.offset_x, self.offset_y) = description["offset"]

        origin_x = pos_x + self.offset_x
        origin_y = pos_y + self.offset_y
        self.rect = pygame.Rect(origin_x, origin_y, self.frame_w, self.frame_h)

        self.delay = description.get("delay", 0)
        # Timer starts at the delay value, which forces the first update.
        self.time = self.delay

        self.seq = None
        self.set_seq(seq)
        self.update()
Beispiel #7
0
    def __init__(self, name):
        """Load the tile set *name* and derive extra road tile variants.

        The JSON description gives the sheet image and, per tile name, a
        list of grid coordinates; each coordinate is cut out of the sheet as
        a ``GRID_W`` x ``GRID_H`` surface.  Unless the pygame version is
        2.0.0 or newer, additional road tiles are then generated by
        repeatedly flipping or rotating existing ones and appended to
        ``self.tiles``.
        """
        self.name = name
        self.conf = data.load_json(name)
        self.surf = data.load_image(self.conf["image"])

        self.tiles = defaultdict(list)
        self.dummy = pygame.Surface((GRID_W, GRID_H))
        self.dummy.fill((255, 0, 0))

        # Cut every listed grid cell out of the sheet.
        for tile_name, cells in self.conf["tiles"].items():
            for ix, iy in cells:
                area = (ix * GRID_W, iy * GRID_H, GRID_W, GRID_H)
                tile = pygame.Surface((GRID_W, GRID_H), SRCALPHA)
                tile.blit(self.surf, (0, 0), area)
                self.tiles[tile_name].append(tile)

        if pygame.version.vernum >= (2, 0, 0):
            # pygame_sdl2 has trouble with auto-generate stuff below, so we skip it
            return

        gen = defaultdict(list)

        # Straits: each flip is applied to the result of the previous one
        # and every intermediate result is kept under the same name.
        flip_steps = ((True, False), (False, True), (True, False))
        for straight in ("road-hh", "road-vv"):
            for tile in self.tiles[straight]:
                for flip_x, flip_y in flip_steps:
                    tile = pygame.transform.flip(tile, flip_x, flip_y)
                    gen[straight].append(tile)

        # Each source tile is rotated by 90 degrees three times in a row;
        # the result of every rotation is stored under the next target name.
        rotation_plan = (
            # turns
            ("road-tl", ("road-bl", "road-br", "road-tr")),
            ("road-tr", ("road-tl", "road-bl", "road-br")),
            ("road-br", ("road-tr", "road-tl", "road-bl")),
            ("road-bl", ("road-br", "road-tr", "road-tl")),
            # dead-ends
            ("road-bb", ("road-rr", "road-tt", "road-ll")),
            # tees
            ("road--t", ("road-|l", "road--b", "road-|r")),
            ("road-|l", ("road--b", "road-|r", "road--t")),
        )
        for source, targets in rotation_plan:
            for tile in self.tiles[source]:
                for target in targets:
                    tile = pygame.transform.rotate(tile, 90)
                    gen[target].append(tile)

        # Generated variants go after the tiles taken from the sheet.
        for tile_name, variants in gen.items():
            self.tiles[tile_name].extend(variants)
Beispiel #8
0
import json
from queneau import WordAssembler
from data import load_json
import textwrap
# Word generator built from the contents of dinosaurs.json.
assembler = WordAssembler(load_json("dinosaurs.json"))

# Generate two dinosaur names, each prefixed with an indefinite article.
dinos = []
for _ in range(2):
    dino = assembler.assemble_word()
    # Only names starting with A, E, I or O get "an".
    # NOTE(review): 'U' is not in the list -- presumably deliberate; confirm.
    article = "an" if dino[0] in 'AEIO' else "a"
    dino = article + " " + dino
    dinos.append(dino)

# Parenthesised single-argument form prints the same text on Python 2 and 3.
print("Look! Behind that ridge! It's %s fighting %s!" % tuple(dinos))
Beispiel #9
0
def get_hero_power():
    """Return the data loaded by ``data.load_json('hero_power')``.

    If ``hero_power.json`` does not exist in the current directory,
    ``create_all_normalized_power()`` is called first.
    """
    already_built = os.path.exists('hero_power.json')
    if not already_built:
        create_all_normalized_power()
    return data.load_json('hero_power')
Beispiel #10
0
 def load_json(self, pagename):
     """Add every item of the JSON page *pagename* via ``_add_content``.

     Does nothing when ``data.load_json`` returns ``None`` for the page.
     """
     entries = data.load_json(pagename)
     if entries is not None:
         for entry in entries:
             self._add_content(entry)
Beispiel #11
0
def main(arguments):
    ''' Main logic: parse args for tests to run and which models to evaluate.

    For every selected model and test, sentence encodings are loaded from a
    cached file in ``args.exp_dir`` when one exists (unless
    ``args.ignore_cached_encs`` is set); otherwise they are computed with the
    model-specific encoder and cached unless ``args.dont_cache_encs`` is set.
    ``weat.run_test`` is then run on the encodings and its effect size and
    p-value are collected.  If ``args.results_path`` is given, all collected
    results are written there as a tab-separated table.

    Args:
        arguments: raw arguments, passed unchanged to ``handle_arguments``.

    Raises:
        Exception: if an option required by a selected model is missing.
        ValueError: if a model name is unknown or ``gensen_version`` names
            more than two versions.
    '''
    log.basicConfig(format='%(asctime)s: %(message)s', datefmt='%m/%d %I:%M:%S %p', level=log.INFO)

    # The four sentence sets read from every test, in processing order.
    set_names = ('targ1', 'targ2', 'attr1', 'attr2')
    # Column order of the results table; matches the keys of each result.
    result_fields = ('model', 'options', 'test', 'p_value', 'effect_size',
                     'num_targ1', 'num_targ2', 'num_attr1', 'num_attr2')

    args = handle_arguments(arguments)
    if args.seed >= 0:
        log.info('Seeding random number generators with {}'.format(args.seed))
        random.seed(args.seed)
        np.random.seed(args.seed)
    maybe_make_dir(args.exp_dir)
    if args.log_file:
        log.getLogger().addHandler(log.FileHandler(args.log_file))
    log.info("Parsed args: \n%s", args)

    all_tests = sorted(
        [
            entry[:-len(TEST_EXT)]
            for entry in os.listdir(args.data_dir)
            if not entry.startswith('.') and entry.endswith(TEST_EXT)
        ],
        key=test_sort_key
    )
    log.debug('Tests found:')
    for test in all_tests:
        log.debug('\t{}'.format(test))

    tests = split_comma_and_check(args.tests, all_tests, "test") if args.tests is not None else all_tests
    log.info('Tests selected:')
    for test in tests:
        log.info('\t{}'.format(test))

    models = split_comma_and_check(args.models, MODEL_NAMES, "model") if args.models is not None else MODEL_NAMES
    log.info('Models selected:')
    for model in models:
        log.info('\t{}'.format(model))

    results = []
    for model_name in models:
        # Different models have different interfaces for things, but generally want to:
        # - if saved vectors aren't there:
        #    - load the model
        #    - load the test data
        #    - encode the vectors
        #    - dump the files into some storage
        # - else load the saved vectors
        log.info('Running tests for model {}'.format(model_name))

        # Check the model's required options up front and build the option
        # string that becomes part of the cache file name.
        if model_name == ModelName.BOW.value:
            model_options = ''
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
        elif model_name == ModelName.INFERSENT.value:
            if args.glove_path is None:
                raise Exception('glove_path must be specified for {} model'.format(model_name))
            if args.infersent_dir is None:
                raise Exception('infersent_dir must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.GENSEN.value:
            if args.glove_h5_path is None:
                raise Exception('glove_h5_path must be specified for {} model'.format(model_name))
            if args.gensen_dir is None:
                raise Exception('gensen_dir must be specified for {} model'.format(model_name))
            gensen_version_list = split_comma_and_check(args.gensen_version, GENSEN_VERSIONS, "gensen_prefix")
            if len(gensen_version_list) > 2:
                raise ValueError('gensen_version can only have one or two elements')
            model_options = 'version=' + args.gensen_version
        elif model_name == ModelName.GUSE.value:
            model_options = ''
        elif model_name == ModelName.COVE.value:
            if args.cove_encs is None:
                raise Exception('cove_encs must be specified for {} model'.format(model_name))
            model_options = ''
        elif model_name == ModelName.ELMO.value:
            model_options = 'time_combine={};layer_combine={}'.format(
                args.time_combine_method, args.layer_combine_method)
        elif model_name == ModelName.BERT.value:
            model_options = 'version=' + args.bert_version
        elif model_name == ModelName.OPENAI.value:
            if args.openai_encs is None:
                raise Exception('openai_encs must be specified for {} model'.format(model_name))
            model_options = ''
        else:
            raise ValueError("Model %s not found!" % model_name)

        # Loaded lazily by the first test that needs it.
        model = None
        # Index of this model's first entry in `results`, so the summary
        # below reports only this model's tests.
        first_result = len(results)

        for test in tests:
            log.info('Running test {} for model {}'.format(test, model_name))
            enc_file = os.path.join(args.exp_dir, "%s.%s.h5" % (
                "%s;%s" % (model_name, model_options) if model_options else model_name,
                test))
            if not args.ignore_cached_encs and os.path.isfile(enc_file):
                log.info("Loading encodings from %s", enc_file)
                encs = load_encodings(enc_file)
            else:
                # load the test data
                encs = load_json(os.path.join(args.data_dir, "%s%s" % (test, TEST_EXT)))

                # load the model and do model-specific encoding procedure;
                # every branch fills `set_encs` with one entry per set name
                log.info('Computing sentence encodings')
                if model_name == ModelName.BOW.value:
                    set_encs = {}
                    for k in set_names:
                        set_encs[k] = bow.encode(encs[k]["examples"], args.glove_path)

                elif model_name == ModelName.INFERSENT.value:
                    if model is None:
                        model = infersent.load_infersent(args.infersent_dir, args.glove_path, train_data='all',
                                                         use_cpu=args.use_cpu)
                    model.build_vocab(
                        [
                            example
                            for k in set_names
                            for example in encs[k]['examples']
                        ],
                        tokenize=True)
                    log.info("Encoding sentences for test %s with model %s...", test, model_name)
                    set_encs = {}
                    for k in set_names:
                        set_encs[k] = infersent.encode(model, encs[k]["examples"])

                elif model_name == ModelName.GENSEN.value:
                    if model is None:
                        gensen_1 = gensen.GenSenSingle(
                            model_folder=args.gensen_dir,
                            filename_prefix=gensen_version_list[0],
                            pretrained_emb=args.glove_h5_path,
                            cuda=not args.use_cpu)
                        model = gensen_1

                        if len(gensen_version_list) == 2:
                            gensen_2 = gensen.GenSenSingle(
                                model_folder=args.gensen_dir,
                                filename_prefix=gensen_version_list[1],
                                pretrained_emb=args.glove_h5_path,
                                cuda=not args.use_cpu)
                            model = gensen.GenSen(gensen_1, gensen_2)

                    vocab = gensen.build_vocab([
                        s
                        for set_name in set_names
                        for s in encs[set_name]["examples"]
                    ])

                    model.vocab_expansion(vocab)

                    set_encs = {}
                    for k in set_names:
                        set_encs[k] = gensen.encode(model, encs[k]["examples"])

                elif model_name == ModelName.GUSE.value:
                    model = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
                    if args.use_cpu:
                        kwargs = dict(device_count={'GPU': 0})
                    else:
                        kwargs = dict()
                    config = tf.ConfigProto(**kwargs)
                    config.gpu_options.per_process_gpu_memory_fraction = 0.5  # maximum alloc gpu50% of MEM
                    config.gpu_options.allow_growth = True  # allocate dynamically
                    with tf.Session(config=config) as session:
                        session.run([tf.global_variables_initializer(), tf.tables_initializer()])

                        def guse_encode(sents):
                            # Map every sentence to its encoding as a plain list.
                            encs_node = model(sents)
                            sent_encs = session.run(encs_node)
                            encs_d = {sents[j]: enc for j, enc in enumerate(np.array(sent_encs).tolist())}
                            return encs_d

                        set_encs = {}
                        for k in set_names:
                            set_encs[k] = guse_encode(encs[k]["examples"])

                elif model_name == ModelName.COVE.value:
                    load_encs_from = os.path.join(args.cove_encs, "%s.encs" % test)
                    encs = load_jiant_encodings(load_encs_from, n_header=1)
                    # The precomputed file must already carry an "encs" entry
                    # per set (the code below reads it).  Pick them up here so
                    # the assignment after this chain never sees unset or
                    # stale values from a previous test.
                    set_encs = {k: encs[k]["encs"] for k in set_names}

                elif model_name == ModelName.ELMO.value:
                    kwargs = dict(time_combine_method=args.time_combine_method,
                                  layer_combine_method=args.layer_combine_method)
                    set_encs = {}
                    for k in set_names:
                        set_encs[k] = elmo.encode(encs[k]["examples"], **kwargs)

                elif model_name == ModelName.BERT.value:
                    # NOTE(review): the model is reloaded for every test;
                    # it could probably be cached like InferSent/GenSen.
                    model, tokenizer = bert.load_model(args.bert_version)
                    set_encs = {}
                    for k in set_names:
                        set_encs[k] = bert.encode(model, tokenizer, encs[k]["examples"])

                elif model_name == ModelName.OPENAI.value:
                    load_encs_from = os.path.join(args.openai_encs, "%s.encs" % test)
                    #encs = load_jiant_encodings(load_encs_from, n_header=1, is_openai=True)
                    encs = load_encodings(load_encs_from)
                    set_encs = {k: encs[k]["encs"] for k in set_names}

                else:
                    raise ValueError("Model %s not found!" % model_name)

                for k in set_names:
                    encs[k]["encs"] = set_encs[k]

                log.info("\tDone!")
                if not args.dont_cache_encs:
                    log.info("Saving encodings to %s", enc_file)
                    save_encodings(encs, enc_file)

            # Use any one encoding to report the representation dimension.
            enc = [e for e in encs["targ1"]['encs'].values()][0]
            d_rep = enc.size if isinstance(enc, np.ndarray) else len(enc)

            # run the test on the encodings
            log.info("Running SEAT...")
            log.info("Representation dimension: {}".format(d_rep))
            esize, pval = weat.run_test(encs, n_samples=args.n_samples, parametric=args.parametric)
            results.append(dict(
                model=model_name,
                options=model_options,
                test=test,
                p_value=pval,
                effect_size=esize,
                num_targ1=len(encs['targ1']['encs']),
                num_targ2=len(encs['targ2']['encs']),
                num_attr1=len(encs['attr1']['encs']),
                num_attr2=len(encs['attr2']['encs'])))

        log.info("Model: %s", model_name)
        log.info('Options: {}'.format(model_options))
        # Only this model's results; earlier models were already reported.
        for r in results[first_result:]:
            log.info("\tTest {test}:\tp-val: {p_value:.9f}\tesize: {effect_size:.2f}".format(**r))

    if args.results_path is not None:
        log.info('Writing results to {}'.format(args.results_path))
        # newline='' lets the csv writer control line endings itself.
        with open(args.results_path, 'w', newline='') as f:
            # Fixed field names keep the header stable and still work when
            # no result was produced (only the header is written then).
            writer = DictWriter(f, fieldnames=result_fields, delimiter='\t')
            writer.writeheader()
            writer.writerows(results)