Example #1
def evaluate_polysemy(tokenizer: SubwordTextEncoder, transformer: Transformer):
    dataset_url: str = "https://box.hu-berlin.de/f/ce30d723beef4feba4c3/?dl=1"
    dataset_path: str = tf.keras.utils.get_file("pars_test.txt", dataset_url)
    # dataset_path: str = "../data/pars_test.txt"
    lines: List[str] = open(dataset_path).read().split("\n")
    examples: List[Example] = [Example(x) for x in lines if x]
    predictions: List[int] = []
    sims: List[float] = []
    for ex in examples:
        token1: str = ex.context1.content[
            ex.context1.token_range_start:ex.context1.token_range_end]
        tensors1: List[Tensor] = transformer.get_embeddings_for_token(
            ex.context1.content, tokenizer, token1)
        token2: str = ex.context2.content[
            ex.context2.token_range_start:ex.context2.token_range_end]
        tensors2: List[Tensor] = transformer.get_embeddings_for_token(
            ex.context2.content, tokenizer, token2)
        cos_sim: float = 1 - cosine(tensors1[0], tensors2[0])
        sims.append(cos_sim)
        predictions.append(1 if cos_sim > 0.4 else 0)
    print([x.label for x in examples])
    print(predictions)
    print(sims)
    correct_indices: List[int] = [
        i for i in range(len(predictions))
        if predictions[i] == examples[i].label
    ]
    print(correct_indices)
    print(f"Accuracy: {len(correct_indices) / len(examples) * 100}%")
Example #2
def test_case_34():
    with open('training/' + os.listdir('training/')[34]) as f:
        raw_task = json.load(f)
    base_entity_finder = EntityFinder(
        lambda grid: find_components(grid, directions=ALL_DIRECTIONS))
    task = tuplefy_task(raw_task)
    inp = task['train'][0]['input']
    out = task['train'][0]['output']
    entities = base_entity_finder(inp)

    color_8 = Property(lambda x: frozenset({8}),
                       np.log(10) - 1,
                       name=f'color {8}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    color_0 = Property(lambda x: frozenset({0}),
                       np.log(10) - 1,
                       name=f'color {0}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1,
                          requires_entity=True)
    select_8 = Selector.make_property_selector(take_color, color_8, True)
    select_not_8 = Selector.make_property_selector(take_color, color_8, False)
    select_not_0 = Selector.make_property_selector(take_color, color_0, False)
    select_not_0.nll = np.log(2)
    select_not_0_nor_8 = Selector.intersect(select_not_0, select_not_8)

    selected_entities = select_not_0_nor_8.select(entities)

    collision = Relation(
        lambda entity1, entity2: next(
            iter(collision_directions(entity1, entity2, adjustment=1)))
        if len(collision_directions(entity1, entity2)) == 1 else None,
        nll=1 + np.log(2),
        name='the unique collision vector to',
        output_types=frozenset({'vector'}))
    collision_with_8 = Property.from_relation_selector(collision, select_8,
                                                       base_entity_finder)
    move_into_8 = Transformer(
        lambda entities, grid: move(entities,
                                    vector_property=collision_with_8,
                                    copy=True,
                                    extend_grid=False),
        nll=collision_with_8.nll + np.log(2),
        name=f"{'copy' if True else 'move'} them by ({collision_with_8})")
    new_entities, new_grid = move_into_8.transform(selected_entities, inp)
    assert new_grid == out
    my_entity_finder = base_entity_finder.compose(select_not_0_nor_8)
    my_predictor = Predictor(my_entity_finder, move_into_8)
    for case in task['train'] + task['test']:
        assert my_predictor.predict(case['input']) == case['output']

    my_predictor_2 = Predictor(base_entity_finder, move_into_8)
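The selector algebra used above (select_not_0_nor_8 as the intersection of two negated color selectors) boils down to predicate conjunction. A tiny sketch of that idea with hypothetical stand-in predicates, not the repo's Selector API:

def intersect(pred1, pred2):
    # Conjunction of two entity predicates, as Selector.intersect does conceptually.
    return lambda colors: pred1(colors) and pred2(colors)

not_0 = lambda colors: 0 not in colors
not_8 = lambda colors: 8 not in colors
not_0_nor_8 = intersect(not_0, not_8)
print(not_0_nor_8({2, 3}), not_0_nor_8({0, 2}), not_0_nor_8({8}))  # True False False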
Example #3
def test_replace_color():
    with open('training/' + os.listdir('training/')[9]) as f:
        raw_task = json.load(f)
    task = tuplefy_task(raw_task)
    input_grid = task['train'][0]['input']
    output_grid = task['train'][0]['output']
    entities = base_entity_finder(input_grid)
    # for selected_entity in entities:
    #     selected_entity.display()
    size_prop = Property(lambda x: x.entity.num_points(),
                         nll=np.log(2),
                         name='the number of points',
                         output_types=frozenset({'quantity'}),
                         entity_finder=base_entity_finder,
                         requires_entity=True)
    minimum = OrdinalProperty(lambda x, n=0: nth_ordered(x, 0, use_max=False),
                              nll=0,
                              name=f'take the {ordinal(0 + 1)} largest',
                              input_types=frozenset({
                                  'x_length', 'y_length', 'x_coordinate',
                                  'y_coordinate', 'quantity'
                              }))
    smallest_size = Property.from_entity_prop_and_ordinal(size_prop, minimum)
    color_4 = Property(lambda x: frozenset({4}),
                       np.log(10) - 2,
                       name=f'color {4}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1,
                          requires_entity=True)

    select_smallest = Selector.make_property_selector(size_prop,
                                                      smallest_size,
                                                      the_same=True)
    selected_entities = select_smallest.select(entities)
    assert len(selected_entities) == 1

    assert selected_entities[0].positions == {(6, 7): 5, (7, 7): 5, (8, 7): 5}

    # smallest_color = take_color.add_selector(select_smallest, ORDINAL_PROPERTIES[0])

    recolor_yellow = Transformer(
        lambda entities, grid, source_color_prop=take_color,
        target_color_prop=color_4: replace_color(
            entities,
            source_color_prop=source_color_prop,
            target_color_prop=target_color_prop),
        nll=take_color.nll + color_4.nll + np.log(2),
        name=f'recolor ({take_color}) with ({color_4})')
    _, prediction_grid = recolor_yellow.transform(selected_entities)

    assert base_entity_finder.grid_distance(
        prediction_grid, output_grid) < base_entity_finder.grid_distance(
            input_grid, output_grid)
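The final assertion only requires the prediction to be strictly closer to the output than the input is; grid_distance itself is not shown in this listing. A plausible stand-in (an assumption, not the repo's definition) is a cell-wise mismatch count:

import numpy as np

def grid_distance(a, b):
    # Hypothetical: number of differing cells, with a crude penalty for shape mismatch.
    a, b = np.array(a), np.array(b)
    if a.shape != b.shape:
        return max(a.size, b.size)
    return int((a != b).sum())

print(grid_distance(((1, 2), (3, 4)), ((1, 2), (3, 5))))  # 1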
Example #4
def test_is_rectangle():
    with open('training/' + os.listdir('training/')[28]) as f:
        raw_case = json.load(f)
    case = tuplefy_task(raw_case)
    rectangle = Property(lambda x: x.entity.is_a_rectangle(),
                         0,
                         name='is a rectangle',
                         output_types=frozenset({'bool'}),
                         entity_finder=base_entity_finder)
    color_2 = Property(lambda x: frozenset({2}),
                       np.log(10) - 1,
                       name=f'color {2}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1)
    is_true = Property(lambda x: True,
                       0,
                       name='True',
                       output_types=frozenset({'bool'}),
                       entity_finder=base_entity_finder)
    select_2 = Selector.make_property_selector(take_color, color_2, True)
    first_case = case['train'][0]['input']
    entities = select_2.select(base_entity_finder(first_case))
    for entity in entities:
        assert rectangle(entity, first_case)

    select_rectangle = Selector.make_property_selector(rectangle, is_true,
                                                       True)
    rect_entities = select_rectangle.select(base_entity_finder(first_case))
    assert len(rect_entities) == 1
    # for entity in rect_entities:
    #     entity.display()
    crop = Transformer(lambda entities, grid, offsets=(1, -1, 1, -1):
                       crop_entities(entities, grid, offsets=offsets),
                       nll=np.log(2) + sum(
                           (abs(offset)
                            for offset in (1, -1, 1, -1))) * np.log(2),
                       name='crop them')
    _, output = crop.transform(rect_entities)
    assert output == case['train'][0]['output']
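is_a_rectangle is used as a boolean Property above. One natural reading, offered here as an illustrative guess rather than the repo's actual implementation, is that an entity is a rectangle exactly when its cells fill their bounding box:

def is_a_rectangle(positions):
    # positions: iterable of (row, col) cells belonging to the entity.
    rows = [r for r, _ in positions]
    cols = [c for _, c in positions]
    height = max(rows) - min(rows) + 1
    width = max(cols) - min(cols) + 1
    return len(set(positions)) == height * width

print(is_a_rectangle([(0, 0), (0, 1), (1, 0), (1, 1)]))  # True
print(is_a_rectangle([(0, 0), (0, 1), (1, 0)]))          # False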
Example #5
def test_sequential():
    with open('training/' + os.listdir('training/')[56]) as f:
        raw_task = json.load(f)
    task = tuplefy_task(raw_task)
    input_grid = task['train'][0]['input']
    output_grid = task['train'][0]['output']
    color_0 = Property(lambda x: frozenset({0}),
                       np.log(10) - 1,
                       name=f'color {0}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1)
    select_not_0 = Selector.make_property_selector(take_color, color_0, False)
    x_length = Property(
        lambda x: x.entity.max_coord(axis=1) - x.entity.min_coord(axis=1) + 1,
        np.log(2),
        name='the x length',
        output_types=frozenset({'x_length'}),
        entity_finder=base_entity_finder)
    zero = Property(lambda x: 0,
                    1,
                    name='0',
                    output_types=frozenset({'y_length'}),
                    entity_finder=base_entity_finder)
    x_length_vect = Property.xy_length_to_vector(zero, x_length)
    copy_move_x_length = Transformer(
        lambda entities, grid: move(entities, grid, x_length_vect, copy=True),
        name=f'copy them by ({x_length_vect})')
    my_entity_finder = base_entity_finder.compose(select_not_0)
    cropper = Transformer(crop_entities, nll=np.log(2), name='crop them')
    single_predictor = Predictor(my_entity_finder,
                                 copy_move_x_length,
                                 parallel=False)
    predictor_1 = Predictor(my_entity_finder, copy_move_x_length)
    predictor_2 = Predictor(my_entity_finder, cropper)
    sequential_predictor = Predictor([my_entity_finder, my_entity_finder],
                                     [copy_move_x_length, cropper],
                                     parallel=False)
    composed_predictor = predictor_1.compose(predictor_2, parallel=False)
    train_input = task['train'][0]['input']
    train_output = task['train'][0]['output']
    print(composed_predictor)
    assert sequential_predictor.predict(train_input) == train_output
    assert composed_predictor.predict(train_input) == train_output
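The test checks that the sequential Predictor and predictor_1.compose(predictor_2, parallel=False) agree. Sequential composition itself is just function chaining; a minimal sketch with plain callables in place of Predictor objects:

def compose(predict1, predict2):
    # Feed the first predictor's output grid into the second,
    # as the sequential (parallel=False) Predictor does conceptually.
    return lambda grid: predict2(predict1(grid))

double = lambda g: tuple(tuple(2 * v for v in row) for row in g)
add_one = lambda g: tuple(tuple(v + 1 for v in row) for row in g)
print(compose(double, add_one)(((1, 2),)))  # ((3, 5),)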
Example #6
def test_transformers_predictors():
    with open('training/' + os.listdir('training/')[7]) as f:
        raw_case7 = json.load(f)
    case7 = tuplefy_task(raw_case7)
    inp = case7['train'][0]['input']
    out = case7['train'][0]['output']
    base_entity_finder = EntityFinder(find_components)
    entities = base_entity_finder(inp)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1)
    color_2 = Property(lambda x, i=2: frozenset({2}),
                       np.log(10) - 2,
                       name=f'color {2}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    color_8 = Property(lambda x, i=8: frozenset({8}),
                       np.log(10) - 2,
                       name=f'color {8}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    select_8 = Selector.make_property_selector(take_color, color_8)
    select_2 = Selector.make_property_selector(take_color, color_2)
    max_ord = OrdinalProperty(lambda x: nth_ordered(x, 0, use_max=True),
                              nll=0,
                              name=f'take the {1} largest')
    find_collision_vect_to_8 = Property.from_relation_selector(
        collision_relation,
        select_8,
        entity_finder=base_entity_finder,
        ordinal_property=max_ord)
    my_transformer = Transformer(
        lambda entities, grid: move(entities,
                                    vector_property=find_collision_vect_to_8),
        name=f'move them by ({find_collision_vect_to_8})',
        nll=1 + np.log(2))

    assert my_transformer.transform(select_2.select(entities))[1] == out

    select_2_finder = base_entity_finder.compose(select_2)
    my_predictor = Predictor(select_2_finder, my_transformer)
    assert my_predictor.predict(inp) == out
Example #7
def test_case_30():
    with open('training/' + os.listdir('training/')[30]) as f:
        raw_task = json.load(f)
    base_entity_finder = EntityFinder(
        lambda grid: find_components(grid, directions=ALL_DIRECTIONS))
    task = tuplefy_task(raw_task)
    inp = task['train'][0]['input']
    output = task['train'][0]['output']
    entities = base_entity_finder(inp)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1,
                          requires_entity=True)
    color_0 = Property(lambda x: frozenset({0}),
                       np.log(10) - 1,
                       name=f'color {0}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    select_not_0 = Selector.make_property_selector(take_color,
                                                   color_0,
                                                   the_same=False)
    crop_transform = Transformer(crop_entities,
                                 nll=np.log(2),
                                 name='crop them')
    _, trivial_transformed_grid = crop_transform.transform(entities)
    assert trivial_transformed_grid == inp

    selected_entities = select_not_0.select(entities)
    _, transformed_grid = crop_transform.transform(selected_entities)
    assert transformed_grid == ((0, 2, 2, 2), (0, 0, 2, 0), (2, 2, 2, 0),
                                (2, 0, 2, 0))

    my_predictor = Predictor(base_entity_finder.compose(select_not_0),
                             crop_transform)

    for case in task['train']:
        assert my_predictor.predict(case['input']) == case['output']

    test_case = task['test'][0]
    print(my_predictor)
    assert my_predictor.predict(test_case['input']) == test_case['output']
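With the non-zero entities selected, crop_entities amounts to cropping the grid to the bounding box of its non-background cells, which is why the transformed grid above keeps interior zeros. A self-contained sketch of that operation, assuming background color 0:

import numpy as np

def crop_to_content(grid, background=0):
    arr = np.array(grid)
    coords = np.argwhere(arr != background)
    (r0, c0), (r1, c1) = coords.min(axis=0), coords.max(axis=0)
    return tuple(tuple(int(v) for v in row) for row in arr[r0:r1 + 1, c0:c1 + 1])

print(crop_to_content(((0, 0, 0), (0, 2, 2), (0, 0, 2))))  # ((2, 2), (0, 2))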
Example #8
def test_replace_color_entity_frame():
    with open('training/' + os.listdir('training/')[80]) as f:
        raw_case = json.load(f)
    case = tuplefy_task(raw_case)

    color_0 = Property(lambda x: frozenset({0}),
                       np.log(10) - 1,
                       name=f'color {0}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    color_1 = Property(lambda x: frozenset({1}),
                       np.log(10) - 1,
                       name=f'color {1}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    color_8 = Property(lambda x: frozenset({8}),
                       np.log(10) - 1,
                       name=f'color {8}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1)
    select_8 = Selector.make_property_selector(take_color, color_8, True)

    select_not_0 = Selector.make_property_selector(take_color, color_0, False)

    color_frame_blue = Transformer(
        lambda entities, grid, offsets=(0, 0, 0, 0):
        replace_colors_in_entities_frame(entities,
                                         grid=None,
                                         offsets=offsets,
                                         source_color_prop=color_0,
                                         target_color_prop=color_1),
        name=
        f'replace ({color_0}) with ({color_1}) in a box around them with offsets {(0, 0, 0, 0)}'
    )
    first_case = case['train'][0]['input']
    entity_finder = base_entity_finder.compose(select_8)
    my_predictor = Predictor(entity_finder, color_frame_blue)
    # print(my_predictor.predict(first_case))
    assert my_predictor.predict(first_case) == case['train'][0]['output']
    assert my_predictor.predict(
        case['test'][0]['input']) == case['test'][0]['output']

    entity_finder_2 = base_entity_finder.compose(select_not_0)
    my_predictor_2 = Predictor(entity_finder_2, color_frame_blue)
    assert my_predictor_2.predict(first_case) == case['train'][0]['output']
    assert my_predictor_2.predict(
        case['test'][0]['input']) == case['test'][0]['output']
    print(my_predictor_2)
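replace_colors_in_entities_frame recolors cells inside a box drawn around each selected entity. A rough sketch of the frame-replacement step on a NumPy grid; the exact offset convention is not shown in this listing, so the sign handling here is a guess:

import numpy as np

def replace_in_frame(grid, positions, source, target, offsets=(0, 0, 0, 0)):
    # positions: the entity's (row, col) cells; offsets widen the box (assumed).
    arr = np.array(grid)
    rows = [r for r, _ in positions]
    cols = [c for _, c in positions]
    top, bottom = min(rows) - offsets[0], max(rows) + offsets[1]
    left, right = min(cols) - offsets[2], max(cols) + offsets[3]
    box = arr[top:bottom + 1, left:right + 1]
    box[box == source] = target  # in-place view update
    return tuple(tuple(int(v) for v in row) for row in arr)

print(replace_in_frame(((0, 0, 0), (0, 8, 0), (0, 0, 0)), [(1, 1)], 0, 1,
                       offsets=(1, 1, 1, 1)))
# ((1, 1, 1), (1, 8, 1), (1, 1, 1))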
Example #9
def test_composite_selections():
    with open('training/' + os.listdir('training/')[205]) as f:
        raw_cases = json.load(f)
    cases = tuplefy_task(raw_cases)
    color_0 = Property(lambda x: frozenset({0}),
                       np.log(2),
                       name=f'color {0}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    color_5 = Property(lambda x: frozenset({5}),
                       np.log(10) - 1,
                       name=f'color {5}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1)
    select_not_0 = Selector.make_property_selector(take_color, color_0, False)
    select_not_5 = Selector.make_property_selector(take_color, color_5, False)
    select_not_0_nor_5 = select_not_0.intersect(select_not_5)
    entity_finder = base_entity_finder.compose(select_not_0_nor_5, True)
    select_5 = Selector.make_property_selector(take_color, color_5)
    center_y = Property(lambda x: x.entity.center(axis=0),
                        nll=np.log(2),
                        name='the center y coordinate',
                        output_types=frozenset({'y_coordinate'}),
                        entity_finder=base_entity_finder,
                        requires_entity=True)
    center_x = Property(lambda x: x.entity.center(axis=1),
                        nll=np.log(2),
                        name='the center x coordinate',
                        output_types=frozenset({'x_coordinate'}),
                        entity_finder=base_entity_finder,
                        requires_entity=True)
    center_5y = center_y.add_selector(select_5)
    length_5y = Property.create_distance_property(center_5y, center_y)
    center_5x = center_x.add_selector(select_5)
    length_5x = Property.create_distance_property(center_5x, center_x)
    vect_prop = Property.xy_length_to_vector(length_5y, length_5x)
    move_to_5 = Transformer(
        lambda entities, grid, copy=True: move(
            entities, vector_property=vect_prop, copy=copy, extend_grid=False),
        nll=vect_prop.nll + np.log(2),
        name=f"{'copy' if True else 'move'} them by ({vect_prop})")
    my_predictor = Predictor(entity_finder, move_to_5)

    for case in cases['train']:
        assert my_predictor.predict(case['input']) == case['output']
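length_5y and length_5x measure the offset from each entity's center to the color-5 entity's center, and xy_length_to_vector bundles them into the move vector. The underlying geometry in isolation, with toy cell sets rather than the repo's Entity class:

import numpy as np

def center(cells):
    return np.array(list(cells), dtype=float).mean(axis=0)

mover = {(0, 0), (0, 2)}   # center (0, 1)
anchor = {(4, 1)}          # center (4, 1)
print(center(anchor) - center(mover))  # [4. 0.] -> move four rows down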
Example #10
def do_work(input_queue: multiprocessing.Queue,
            output_queue: multiprocessing.Queue):
    from classes import Transformer
    from transformer_tensorflow import NUMBER_OF_LAYERS
    from transformer_tensorflow import MODEL_DIMENSIONS
    from transformer_tensorflow import NUMBER_OF_HEADS
    from transformer_tensorflow import FEED_FORWARD_DIMENSIONS
    from transformer_tensorflow import DROPOUT_RATE
    from tensorflow.python.training.tracking.util import Checkpoint
    import tensorflow as tf
    from tensorflow_datasets.core.features.text import SubwordTextEncoder
    from classes import ClusterItem
    from tensorflow.python.training.checkpoint_management import CheckpointManager
    tokenizer_path: str = "tokenizer.subwords"
    tokenizer_prefix: str = tokenizer_path.split(".")[0]
    tokenizer: SubwordTextEncoder
    tokenizer = SubwordTextEncoder.load_from_file(tokenizer_prefix)
    input_vocabulary_size = target_vocabulary_size = tokenizer.vocab_size + 2
    local_transformer: Transformer = Transformer(
        NUMBER_OF_LAYERS,
        MODEL_DIMENSIONS,
        NUMBER_OF_HEADS,
        FEED_FORWARD_DIMENSIONS,
        input_vocabulary_size,
        target_vocabulary_size,
        pe_input=input_vocabulary_size,
        pe_target=target_vocabulary_size,
        rate=DROPOUT_RATE)
    ckpt: Checkpoint = tf.train.Checkpoint(
        transformer=local_transformer,
        optimizer=local_transformer.optimizer)
    # checkpoint_path is assumed to be defined at module level.
    ckpt_manager: CheckpointManager = tf.train.CheckpointManager(
        ckpt, checkpoint_path, max_to_keep=5)
    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')
    while True:
        sent: TokenList = input_queue.get(True)
        if sent is None:
            break
        lemma_dict: Dict[str, List[ClusterItem]] = process_sentence(
            sent, local_transformer, tokenizer)
        output_queue.put(lemma_dict)
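do_work consumes items from input_queue until it sees a None sentinel. The same protocol in a runnable miniature, with a hypothetical square_worker standing in for process_sentence:

import multiprocessing

def square_worker(input_queue, output_queue):
    # Same sentinel protocol as do_work above: a None item shuts the worker down.
    while True:
        item = input_queue.get(True)
        if item is None:
            break
        output_queue.put(item * item)

if __name__ == '__main__':
    in_q, out_q = multiprocessing.Queue(), multiprocessing.Queue()
    worker = multiprocessing.Process(target=square_worker, args=(in_q, out_q))
    worker.start()
    for n in range(3):
        in_q.put(n)
    in_q.put(None)  # sentinel
    print([out_q.get() for _ in range(3)])  # [0, 1, 4]
    worker.join()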
Example #11
def find_word_senses(tokenizer: SubwordTextEncoder, transformer: Transformer,
                     dataset_path: str) -> None:
    word_forms_set: Set[str] = {
        "pars", "partis", "parti", "partem", "parte", "partes", "partium",
        "partibus"
    }
    examples: List[str] = open(dataset_path).read().split("\n")
    examples = [y for x in examples for y in x.split("\t")]
    example_token_sets: List[Set[str]] = [set(x.split()) for x in examples]
    deletion_indices: List[int] = [
        i for i in range(len(examples))
        if not len(word_forms_set.intersection(example_token_sets[i]))
    ]
    examples = [
        examples[i] for i in range(len(examples)) if i not in deletion_indices
    ]
    examples_set: Set[str] = set(examples)
    relevant_tensors: List[Tensor] = []
    for example in tqdm(examples_set):
        target_token: str = next(x for x in example.split()
                                 if x in word_forms_set)
        tensors: List[Tensor] = transformer.get_embeddings_for_token(
            example, tokenizer, target_token)
        relevant_tensors.append(tensors[0])
    sims: np.ndarray = np.zeros((len(relevant_tensors), len(relevant_tensors)))
    for i in range(len(relevant_tensors)):
        for j in range(len(relevant_tensors) - 1):
            if i == j:
                continue
            cos_sim: float = 1 - cosine(relevant_tensors[i],
                                        relevant_tensors[j])
            sims[i, j] = sims[j, i] = round(cos_sim, 2)
    examples = [x[:20] for x in examples]
    sims_with_ex: List[Tuple[float, str, str]] = []
    for i in range(len(sims)):
        for j in range(len(sims[i])):
            sims_with_ex.append((sims[i, j], examples[i], examples[j]))
    sims_with_ex = [x for x in sims_with_ex if x[0]]
    sims_with_ex.sort(key=lambda x: x[0], reverse=True)
    sims_with_ex = sims_with_ex[:5] + sims_with_ex[-5:]
    for swe in sims_with_ex:
        print(swe)
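The nested loop fills a symmetric matrix of pairwise cosine similarities. The same matrix can be produced in one shot by unit-normalizing the rows and taking a Gram matrix; a small sketch:

import numpy as np

def cosine_sim_matrix(vectors):
    v = np.array(vectors, dtype=float)
    v /= np.linalg.norm(v, axis=1, keepdims=True)  # unit-normalize each row
    return v @ v.T                                 # entry (i, j) is cos(v_i, v_j)

print(np.round(cosine_sim_matrix([[1, 0], [1, 1], [0, 1]]), 2))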
Example #12
def evaluate_polysemy_old(tokenizer: SubwordTextEncoder,
                          transformer: Transformer):
    sentences: List[str] = [
        "et percussa est tertia pars solis et tertia pars lunae et tertia pars stellarum ut obscuraretur tertia pars eorum et diei non luceret pars tertia et nox similiter",
        "nam et pars quedam fluminis Nili ibi currit",
        "Ac saepe in eum locum ventum est tanto in omnes partes diviso equitatu ut modo visum ab se Ambiorigem in fuga circumspicerent captivi nec plane etiam abisse ex conspectu contenderent ut spe consequendi inlata atque infinito labore suscepto qui se summam a Caesare gratiam inituros putarent paene naturam studio vincerent semper que paulum ad summam felicitatem defuisse videretur atque ille latebris aut saltibus se eriperet et noctu occultatus alias regiones partes que peteret non maiore equitum praesidio quam quattuor quibus solis vitam suam committere audebat",
        "numquam ante arbitror te epistulam meam legisse nisi mea manu scriptam",
        "ante diem xii Kal Decembr Milo ante mediam noctem cum magna manu in campum venit",
        "numquam enim a Pomponia nostra certior sum factus esse cui dare litteras possem",
        "quod fere plerisque accidit ut praesidio litterarum diligentiam in perdiscendo ac memoriam remittant",
        "nam statim fidem publicam postulavit", "habete fidem Dei",
        "Fundamentum autem est iustitiae fides id est dictorum conventorum que constantia et veritas",
        "sol ", "merces "
    ]
    tokens: List[str] = [
        "pars", "pars", "partes", "manu", "manu", "litteras", "litterarum",
        "fidem", "fidem", "fides", "sol", "merces"
    ]
    # for tok in tokens:
    #     print(f"{tok}: {most_similar(transformer, tok)}")
    print_tokens: List[str] = [
        "pars solis", "pars fluminis", "equitatus in omnes partes divisus",
        "manu scriptus", "magna manus", "litteras dare alicui",
        "praesidium litterarum", "fides publica", "fides dei",
        "fides iustitiae", "sol", "merces"
    ]
    sims: np.ndarray = np.zeros((len(tokens), len(tokens)))
    cross_validation_k: int = 5
    for k in range(cross_validation_k):
        relevant_tensors: List[Tensor] = []
        for i in range(len(sentences)):
            tensors: List[Tensor] = transformer.get_embeddings_for_token(
                sentences[i], tokenizer, tokens[i])
            relevant_tensors.append(tensors[0])
            for j in range(len(relevant_tensors) - 1):
                cos_sim: float = 1 - cosine(relevant_tensors[-1],
                                            relevant_tensors[j])
                sims[i, j] = sims[j, i] = round(
                    (sims[i, j] + cos_sim) / 2, 2) if sims[i, j] else cos_sim
    plot_similarities(print_tokens, sims)
Example #13
def test_sense_embeddings():
    sentence: str = "cum enim hoc rectum et gloriosum putarem ex annuo sumptu qui mihi decretus esset me C Coelio quaestori relinquere annuum referre in aerarium ad HS †cIↃ† ingemuit nostra cohors omne illud putans distribui sibi oportere ut ego amicior invenirer Phrygum et Cilicum aerariis quam nostro"
    token: str = "HS"
    from classes import Transformer
    from transformer_tensorflow import NUMBER_OF_LAYERS
    from transformer_tensorflow import MODEL_DIMENSIONS
    from transformer_tensorflow import NUMBER_OF_HEADS
    from transformer_tensorflow import FEED_FORWARD_DIMENSIONS
    from transformer_tensorflow import DROPOUT_RATE
    from tensorflow.python.training.tracking.util import Checkpoint
    import tensorflow as tf
    from tensorflow_datasets.core.features.text import SubwordTextEncoder
    from tensorflow.python.training.checkpoint_management import CheckpointManager
    tokenizer_path: str = "tokenizer.subwords"
    tokenizer_prefix: str = tokenizer_path.split(".")[0]
    tokenizer: SubwordTextEncoder
    tokenizer = SubwordTextEncoder.load_from_file(tokenizer_prefix)
    input_vocabulary_size = target_vocabulary_size = tokenizer.vocab_size + 2
    local_transformer: Transformer = Transformer(
        NUMBER_OF_LAYERS,
        MODEL_DIMENSIONS,
        NUMBER_OF_HEADS,
        FEED_FORWARD_DIMENSIONS,
        input_vocabulary_size,
        target_vocabulary_size,
        pe_input=input_vocabulary_size,
        pe_target=target_vocabulary_size,
        rate=DROPOUT_RATE)
    ckpt: Checkpoint = tf.train.Checkpoint(
        transformer=local_transformer, optimizer=local_transformer.optimizer)
    ckpt_manager: CheckpointManager = tf.train.CheckpointManager(
        ckpt, checkpoint_path, max_to_keep=5)
    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')
    tensors: list = local_transformer.get_embeddings_for_token(
        sentence, tokenizer, token)  # List[Tensor]
    a = 0  # no-op line, handy as a debugger breakpoint for inspecting tensors
Example #14
    if any(({entry
             for row in case['input'] for entry in row
             } == {entry
                   for row in case['output'] for entry in row}
            for case in task['train'])):
        new_colors = False
    else:
        new_colors = True

    # 36
    composite_transformers = itertools.chain(
        composite_transformers,
        (Transformer(lambda entities, grid, offsets=offsets, source_color_prop=
                     source_color_prop, target_color_prop=target_color_prop:
                     replace_colors_in_entities_frame(
                         entities,
                         grid,
                         offsets=offsets,
                         source_color_prop=source_color_prop,
                         target_color_prop=target_color_prop),
                     nll=np.log(2) + source_color_prop.nll +
                     target_color_prop.nll + sum(
                         (abs(offset) for offset in offsets)) * np.log(2) -
                     new_colors * NEW_COLOR_BONUS,
                     name=f'replace ({source_color_prop}) '
                     f'with ({target_color_prop}) '
                     f'in a box around them with offsets {offsets}')
         for source_color_prop, target_color_prop in combine_sorted_queues(
             (Property.of_type['color'], Property.of_type['color']),
             max_nll=max_nll - np.log(2) + new_colors * NEW_COLOR_BONUS)
         for offsets in [(0, 0, 0, 0), (1, -1, 1, -1)]))
    # 37
    composite_transformers = itertools.chain(
        composite_transformers,
        (Transformer(
Example #15
def test_place_shape():
    with open('training/' + os.listdir('training/')[94]) as f:
        raw_task = json.load(f)
    task = tuplefy_task(raw_task)
    input_grid = task['train'][0]['input']
    output_grid = task['train'][0]['output']
    entities = base_entity_finder(input_grid)
    appearing_shapes = Counter()

    for grid in task['train']:
        output_entities = base_entity_finder(grid['output'])
        appearing_shapes += Entity.shapes(output_entities)
    desired_shape = frozenset({((0.0, 1.0), 1), ((1.0, 0.0), 1),
                               ((-1.0, 1.0), 1), ((1.0, 1.0), 1),
                               ((1.0, -1.0), 1), ((0.0, -1.0), 1),
                               ((-1.0, -1.0), 1), ((-1.0, 0.0), 1)})
    assert desired_shape in appearing_shapes
    color_5 = Property(lambda x: frozenset({5}),
                       np.log(10) - 1,
                       name=f'color {5}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1)
    center_0 = Property(lambda x: x.entity.center(axis=0),
                        nll=1 + np.log(2),
                        name='the center y coordinate',
                        output_types=frozenset({'y_coordinate'}),
                        entity_finder=base_entity_finder)
    center_1 = Property(lambda x: x.entity.center(axis=1),
                        nll=1 + np.log(2),
                        name='the center x coordinate',
                        output_types=frozenset({'x_coordinate'}),
                        entity_finder=base_entity_finder)
    center = Property.create_point_property(center_0, center_1)
    desired_shape_prop = Property(lambda x: desired_shape,
                                  np.log(10) - 1,
                                  name=f'shape {desired_shape}',
                                  output_types=frozenset({'shape'}),
                                  is_constant=True,
                                  entity_finder=base_entity_finder)
    # shape_entity_prop = Property(lambda x: x.entity.shape(), 1, name=f'the shape',
    #                              output_types=frozenset({'shape'}),
    #                              entity_finder=base_entity_finder)
    place_desired_shape = Transformer(
        lambda entities, grid: place_shape(
            entities, point_prop=center, shape_prop=desired_shape_prop),
        nll=center.nll + desired_shape_prop.nll + np.log(2),
        name=f'place ({desired_shape_prop}) at position ({center})')
    select_5 = Selector.make_property_selector(take_color, color_5)
    find_entities_5 = base_entity_finder.compose(select_5)
    my_predictor = Predictor(find_entities_5, place_desired_shape)
    assert my_predictor.predict(input_grid) == output_grid

    with open('training/' + os.listdir('training/')[14]) as f:
        raw_task14 = json.load(f)
    task14 = tuplefy_task(raw_task14)
    input_grid14 = task14['train'][0]['input']
    output_grid14 = task14['train'][0]['output']
    color_1 = Property(lambda x: frozenset({1}),
                       np.log(10) - 1,
                       name=f'color {1}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    select_1 = Selector.make_property_selector(take_color, color_1)
    # print(input_grid14)
    diamond = frozenset({((1.0, 0.0), 7), ((-1.0, 0.0), 7), ((0.0, 1.0), 7),
                         ((0.0, -1.0), 7)})
    diamond_prop = Property(lambda x: diamond,
                            np.log(10) - 1,
                            name=f'shape {diamond}',
                            output_types=frozenset({'shape'}),
                            is_constant=True,
                            entity_finder=base_entity_finder)
    place_diamond = Transformer(
        lambda entities, grid: place_shape(entities, grid, center,
                                           diamond_prop),
        name=f'place ({diamond_prop}) at position ({center})')
    diamond_predictor = Predictor(base_entity_finder.compose(select_1),
                                  place_diamond)
    print(diamond_predictor)
    for case in task14['train']:
        # print(case['input'])
        prediction = diamond_predictor.predict(case['input'])
        assert (base_entity_finder.grid_distance(case['output'], prediction) +
                base_entity_finder.grid_distance(prediction, case['input']) <=
                base_entity_finder.grid_distance(case['output'],
                                                 case['input']))
Example #16
def create_predictor_queue(task,
                           max_nll,
                           base_entity_finder,
                           allow_selector_pairs=False):
    for i, example in enumerate(task['train']):
        if len(base_entity_finder(example['input'])) == 0:
            return []
    start_time = time.perf_counter()
    selector_list = list(
        selector_iterator(task,
                          base_entity_finder,
                          max_nll=max_nll - SELECTOR_MAX_NLL_CORRECTION))
    selector_list.sort()
    print(f"selecting time = {time.perf_counter() - start_time}")

    if MAKE_PROPERTY_LIST:
        Property.property_list.sort()
        print(f"len(Property.property_list) = {len(Property.property_list)}")

    print(f'built selector list (1), length={len(selector_list)}')

    if allow_selector_pairs:
        for selector1, selector2 in itertools.combinations(selector_list, 2):
            if combine_pair_selector_nll(
                    selector1,
                    selector2) < max_nll - SELECTOR_MAX_NLL_CORRECTION:
                new_selector = selector1.intersect(selector2)
                if new_selector.validate_and_register(
                        task, base_entity_finder,
                        max_nll - SELECTOR_MAX_NLL_CORRECTION):
                    selector_list.append(new_selector)
        if time.perf_counter() - start_time > MAX_SMALL_TIME:
            print('Out of time')
            return []

    selector_list.sort()
    print(f'built selector list (2), length={len(selector_list)}')
    # print('Time after selectors created = ', time.perf_counter() - start_time)
    # Create distance properties out of coordinate properties
    Property.of_type['x_coordinate'].sort()
    Property.of_type['y_coordinate'].sort()
    # LENGTH PROPERTIES
    x_length_props = (prop1.create_distance_property(
        prop2, register=False) for prop1, prop2 in combine_sorted_queues((
            Property.of_type['x_coordinate'],
            Property.of_type['x_coordinate']), max_nll - np.log(2))
                      if prop1.count != prop2.count and (
                          not prop1.is_constant or not prop2.is_constant))
    y_length_props = (prop1.create_distance_property(
        prop2, register=False) for prop1, prop2 in combine_sorted_queues((
            Property.of_type['y_coordinate'],
            Property.of_type['y_coordinate']), max_nll - np.log(2))
                      if prop1.count != prop2.count and (
                          not prop1.is_constant or not prop2.is_constant))

    length_props = sorted(list(itertools.chain(x_length_props,
                                               y_length_props)))
    for length_prop in length_props:
        length_prop.validate_and_register(
            task,
            extra_validation=lambda output_signature: all(
                (value.is_integer() for value in output_signature)))

    if time.perf_counter() - start_time > MAX_SMALL_TIME:
        print('Out of time')
        return []

    Property.of_type['x_length'].sort()
    Property.of_type['y_length'].sort()

    # Constructing point properties
    point_props = [
        Property.create_point_property(prop1, prop2, register=False)
        for prop1, prop2 in combine_sorted_queues((
            Property.of_type['y_coordinate'],
            Property.of_type['x_coordinate']), max_nll - 2 - POINT_PROP_COST)
    ]
    for point_prop in point_props:
        point_prop.validate_and_register(task)
    Property.of_type['point'].sort()

    if time.perf_counter() - start_time > MAX_SMALL_TIME:
        print('Out of time')
        return []

    # Constructing vector properties

    # Create vectors from single lengths
    for axis, name in enumerate(['y_length', 'x_length']):
        for length in Property.of_type[name]:
            vect_prop = Property.length_to_vector(length, axis, register=False)
            vect_prop.validate_and_register(task)

    # Create vectors from pairs of points
    for source_pt, target_pt in combine_sorted_queues(
        (Property.of_type['point'], Property.of_type['point']),
            max_nll - np.log(2)):
        vect_prop = Property.points_to_vector(source_pt,
                                              target_pt,
                                              register=False)
        vect_prop.validate_and_register(
            task,
            extra_validation=lambda output_signature: all(
                (value[i].is_integer() for value in output_signature
                 for i in range(2))))
        if time.perf_counter() - start_time > MAX_SMALL_TIME:
            print('Out of time')
            return []

    penalize_dim_change = all(
        len(case['input']) == len(case['output'])
        and len(case['input'][0]) == len(case['output'][0])
        for case in task['train'])

    transformers = (
        # 34
        Transformer(
            lambda entities, grid, vector_prop=vector_prop, copy=copy: move(
                entities,
                vector_property=vector_prop,
                copy=copy,
                extend_grid=not penalize_dim_change),
            nll=vector_prop.nll + np.log(2),
            name=f"{'copy' if copy else 'move'} them by ({vector_prop})")
        for vector_prop in Property.of_type['vector']
        for copy in [True, False] if vector_prop.nll + np.log(2) <= max_nll)
    if time.perf_counter() - start_time > MAX_SMALL_TIME:
        print('Out of time')
        return []
    Property.of_type['color'].sort()
    # 35
    composite_transformers = (
        Transformer(lambda entities, grid, offsets=offsets:
                    crop_entities(entities, grid, offsets=offsets),
                    nll=np.log(2) +
                    sum(abs(offset) for offset in offsets) * np.log(2) +
                    penalize_dim_change * DIM_CHANGE_PENALTY,
                    name=f'crop them with offset {offsets}')
        for offsets in itertools.product([-1, 0, 1], repeat=4)
        if np.log(2) + sum(abs(offset) for offset in offsets) * np.log(2) +
        penalize_dim_change * DIM_CHANGE_PENALTY < max_nll)
    if any(({entry
             for row in case['input'] for entry in row
             } == {entry
                   for row in case['output'] for entry in row}
            for case in task['train'])):
        new_colors = False
    else:
        new_colors = True
Example #17
def do_deep_learning():
    checkpoint_path: str = "./checkpoints/train"
    train_examples: Dataset = tf.data.Dataset.from_generator(
        generate_train_examples, (tf.string, tf.string),
        (tf.TensorShape([]), tf.TensorShape([]))).take(500)
    tokenizer_path: str = "tokenizer.subwords"
    tokenizer_prefix: str = tokenizer_path.split(".")[0]
    tokenizer: SubwordTextEncoder
    try:
        tokenizer = SubwordTextEncoder.load_from_file(tokenizer_prefix)
    except NotFoundError:
        tokenizer: SubwordTextEncoder = tfds.features.text.SubwordTextEncoder.build_from_corpus(
            (la1.numpy() + la2.numpy()[:-1] for la1, la2 in train_examples),
            target_vocab_size=2**13)
    tokenizer.save_to_file(tokenizer_prefix)
    train_dataset: Dataset = train_examples.map(tf_encode)
    train_dataset = train_dataset.filter(filter_max_length)
    # cache the dataset to memory to get a speedup while reading from it.
    train_dataset = train_dataset.cache()
    train_dataset = train_dataset.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
    train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
    val_examples: Dataset = tf.data.Dataset.from_generator(
        generate_val_examples, (tf.string, tf.string),
        (tf.TensorShape([]), tf.TensorShape([]))).take(5000)
    val_dataset = val_examples.map(tf_encode)
    val_dataset = val_dataset.filter(filter_max_length).padded_batch(
        BATCH_SIZE)
    input_vocabulary_size = target_vocabulary_size = tokenizer.vocab_size + 2
    # TODO: USE VALIDATION DATASET DURING TRAINING!
    np.set_printoptions(suppress=True)
    transformer: Transformer = Transformer(NUMBER_OF_LAYERS,
                                           MODEL_DIMENSIONS,
                                           NUMBER_OF_HEADS,
                                           FEED_FORWARD_DIMENSIONS,
                                           input_vocabulary_size,
                                           target_vocabulary_size,
                                           pe_input=input_vocabulary_size,
                                           pe_target=target_vocabulary_size,
                                           rate=DROPOUT_RATE)
    ckpt: Checkpoint = tf.train.Checkpoint(transformer=transformer,
                                           optimizer=transformer.optimizer)
    ckpt_manager: CheckpointManager = tf.train.CheckpointManager(
        ckpt, checkpoint_path, max_to_keep=5)
    # if a checkpoint exists, restore the latest checkpoint.
    if ckpt_manager.latest_checkpoint:
        ckpt.restore(ckpt_manager.latest_checkpoint)
        print('Latest checkpoint restored!!')
    evaluate_polysemy(tokenizer, transformer)
    # evaluate_word_order()
    # evaluate_lexis()
    data_dir: str = "../data"
    proiel_pickle_path: str = os.path.join(data_dir, "proiel_conllu.pickle")
    cache_path: str = os.path.join(data_dir, "sense_embeddings.json")
    # build_sense_embeddings(proiel_pickle_path, tokenizer, cache_path)
    # train_model(transformer.train_loss, transformer.train_accuracy, train_dataset, ckpt_manager)
    evaluate_polysemy(tokenizer, transformer)
    predict_next_sentence("Gallia est omnis divisa in partes tres.", tokenizer,
                          transformer)
    predict_next_sentence(
        "Arma virumque cano Troiae qui primus ab oris Italiam fato profugus Laviniaque venit litora.",
        tokenizer, transformer)
    predict_next_sentence(
        "Omnis homines qui sese student praestare ceteris animalibus summa ope niti decet ne vitam silentio transeant veluti pecora quae natura prona atque ventri oboedientia finxit.",
        tokenizer, transformer)
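The input pipeline above follows the usual tf.data recipe: cache the encoded examples, shuffle, batch, prefetch. A toy version with an in-memory dataset standing in for generate_train_examples (plain batch instead of padded_batch, since the toy elements are scalars):

import tensorflow as tf

dataset = tf.data.Dataset.from_tensor_slices(([1, 2, 3, 4], [2, 4, 6, 8]))
dataset = dataset.cache()                                  # keep examples in memory
dataset = dataset.shuffle(buffer_size=4).batch(2)          # shuffle, then batch
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)  # overlap producer/consumer
for batch in dataset:
    print(batch)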
Example #18
def test_reflect_about_line():
    with open('training/' + os.listdir('training/')[86]) as f:
        raw_task = json.load(f)
    task = tuplefy_task(raw_task)
    inp = task['train'][0]['input']
    out = task['train'][0]['output']
    vert_center_line = Property(
        lambda x: (float(np.array(x.grid).shape[1] - 1) / 2., 1.),
        np.log(4),
        name='the vertical center line',
        output_types=frozenset({'line'}),
        entity_finder=base_entity_finder)
    entities = base_entity_finder(inp)
    new_entities, new_grid = reflect_about_line(entities, inp,
                                                vert_center_line)
    # original = ((2, 2, 1),
    #             (2, 1, 2),
    #             (2, 8, 1))
    assert new_grid == ((1, 2, 2), (2, 1, 2), (1, 8, 2))
    horiz_center_line = Property(
        lambda x: (float(np.array(x.grid).shape[0] - 1) / 2., 0.),
        np.log(4),
        name='the horizontal center line',
        output_types=frozenset({'line'}),
        entity_finder=base_entity_finder)
    new_entities, new_grid = reflect_about_line(entities, inp,
                                                horiz_center_line)
    assert new_grid == ((2, 8, 1), (2, 1, 2), (2, 2, 1))
    back_diagonal_center_line = Property(lambda x: (0., -0.5),
                                         np.log(4),
                                         name='the back diagonal center line',
                                         output_types=frozenset({'line'}),
                                         entity_finder=base_entity_finder)
    new_entities, new_grid = reflect_about_line(entities, inp,
                                                back_diagonal_center_line)
    assert new_grid == ((2, 2, 2), (2, 1, 8), (1, 2, 1))
    forward_diagonal_center_line = Property(
        lambda x: (float(np.array(x.grid).shape[1] - 1.) / 2., 0.5),
        np.log(4),
        name='the forward diagonal center line',
        output_types=frozenset({'line'}),
        entity_finder=base_entity_finder)
    new_entities, new_grid = reflect_about_line(entities, inp,
                                                forward_diagonal_center_line)
    assert new_grid == ((1, 2, 1),
                        (8, 1, 2),
                        (2, 2, 2))
    new_entities, new_grid = reflect_about_line(entities, inp,
                                                vert_center_line)
    new_entities, new_grid = reflect_about_line(new_entities, new_grid,
                                                horiz_center_line)
    assert new_grid == out

    new_entities, new_grid = rotate_via_reflects(entities, inp,
                                                 vert_center_line,
                                                 horiz_center_line)
    assert len(new_entities) == 3
    assert new_grid == out
    transformer = Transformer(
        lambda entities, grid: rotate_via_reflects(
            entities, grid, vert_center_line, horiz_center_line),
        nll=vert_center_line.nll + horiz_center_line.nll + np.log(2),
        name=f'reflect about ({vert_center_line}) then ({horiz_center_line})')
    entities = base_entity_finder(inp)
    # new_entities, new_grid = transformer.transform(entities, inp)
    my_predictor = Predictor(base_entity_finder, transformer, parallel=False)
    assert my_predictor.predict(inp) == out

    grid_center = Property(lambda x:
                           (float(np.array(x.grid).shape[0] - 1) / 2.,
                            float(np.array(x.grid).shape[1] - 1) / 2.),
                           0,
                           name='the center point of the grid',
                           output_types=frozenset({'point'}),
                           entity_finder=base_entity_finder)
    new_entities, new_grid = rotate_about_point(entities,
                                                inp,
                                                grid_center,
                                                quarter_steps=2)
    assert new_grid == out
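The last assertions rely on the fact that reflecting about the vertical center line and then the horizontal one is the same as rotating by two quarter steps about the grid center. That identity is easy to check directly with NumPy:

import numpy as np

grid = np.array([[2, 2, 1],
                 [2, 1, 2],
                 [2, 8, 1]])
# flip left-right, then up-down, equals a 180-degree rotation:
assert np.array_equal(np.flipud(np.fliplr(grid)), np.rot90(grid, 2))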
Example #19
def test_case_29():
    with open('training/' + os.listdir('training/')[29]) as f:
        raw_task = json.load(f)
    base_entity_finder = EntityFinder(
        lambda grid: find_components(grid, directions=ALL_DIRECTIONS))
    trivial_selector = Selector(lambda entity, grid: True, name='')
    task = tuplefy_task(raw_task)
    inp = task['train'][0]['input']
    out = task['train'][0]['output']
    # print(task['train'][0]['input'])
    take_color = Property(lambda x: x.entity.colors(),
                          name='the colors',
                          output_types=frozenset({'color'}),
                          entity_finder=base_entity_finder,
                          nll=1,
                          requires_entity=True)
    # color_2 = Property(lambda x, i=2: frozenset({2}), np.log(10) - 2, name=f'color {2}',
    #                    output_types=frozenset({'color'}))
    color_1 = Property(lambda x: frozenset({1}),
                       np.log(10) - 1,
                       name=f'color {1}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    color_0 = Property(lambda x: frozenset({0}),
                       np.log(10) - 1,
                       name=f'color {0}',
                       output_types=frozenset({'color'}),
                       entity_finder=base_entity_finder)
    select_1 = Selector.make_property_selector(take_color, color_1)
    property_0 = Property(lambda x, i=0: i,
                          nll=1,
                          name=f'{0}',
                          output_types=frozenset({
                              'x_coordinate', 'y_coordinate', 'x_length',
                              'y_length', 'quantity'
                          }),
                          entity_finder=base_entity_finder)
    select_not_0 = Selector.make_property_selector(take_color,
                                                   color_0,
                                                   the_same=False)
    largest_y = Property(lambda x: x.entity.max_coord(axis=0),
                         1 + np.log(4),
                         name='the largest y coordinate',
                         output_types=frozenset({'y_coordinate'}),
                         entity_finder=base_entity_finder,
                         requires_entity=True)
    max_y_of_blue = largest_y.add_selector(select_1)
    distance_to_max_y_of_blue = Property.create_distance_property(
        max_y_of_blue, largest_y)
    vector_to_max_y_of_blue = Property.xy_length_to_vector(
        distance_to_max_y_of_blue, property_0)
    move_transform = Transformer(
        lambda entities, grid, vector_prop=vector_to_max_y_of_blue: move(
            entities, vector_property=vector_prop),
        nll=vector_to_max_y_of_blue.nll + np.log(2),
        name=f'move them by ({vector_to_max_y_of_blue})')
    my_predictor = Predictor(base_entity_finder.compose(trivial_selector),
                             move_transform)  # .compose(select_not_0)
    # display_case(my_predictor.predict(inp))
    # display_case(out)
    assert my_predictor.predict(inp) == out

    test_input = task['test'][0]['input']
    test_output = task['test'][0]['output']
    test_entities = base_entity_finder(test_input)
    assert len(test_entities) == 4

    selected_finder = base_entity_finder.compose(select_not_0)
    # selected_finder(test_input)
    assert len(selected_finder(test_input)) == 3

    assert my_predictor.predict(test_input) == test_output
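For reference, the elementary operation behind move_transform is translating each cell of an entity by an integer (dy, dx) vector. A minimal stand-in (the repo's move() also handles grids, copying, and grid extension):

def move_points(positions, vector):
    # positions maps (row, col) -> color; vector is an integer (dy, dx).
    dy, dx = vector
    return {(r + dy, c + dx): color for (r, c), color in positions.items()}

print(move_points({(6, 7): 5, (7, 7): 5}, (1, 0)))  # {(7, 7): 5, (8, 7): 5}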