Example #1
def get_spos_to_index():
    # structured_path = "/Users/ra-mit/data/fabric/dbpedia/triples_structured/all.csv"
    structured_path = "/data/smalldatasets/wiki/all.csv"
    # unstructured_path = "/Users/ra-mit/data/fabric/dbpedia/triples_unstructured/"
    unstructured_path = "/data/smalldatasets/wiki/triples_unstructured/"
    spos = []
    df = pd.read_csv(structured_path, encoding='latin1')
    ss = list(df.iloc[:, 0])
    ps = df.iloc[:, 1]
    os = df.iloc[:, 2]
    for s, p, o in zip(ss, ps, os):
        spos.append((s, p, o))
    print("Total structured spos: " + str(len(spos)))

    uns_spos, loc_dic = prepare_sqa_data.get_spo_from_uns(
        path=unstructured_path)

    print("Total unstructured spos: " + str(len(uns_spos)))

    spos += uns_spos
    return spos
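These snippets are excerpted from larger modules, so their module-level imports are not shown. A preamble along the following lines would cover the names used across the examples; the commented project-specific import paths and the fb/wiki flags are assumptions and may differ from the original code base.

import pickle
import time

import numpy as np
import pandas as pd

import keras
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Lambda
from keras.optimizers import SGD

import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

from utils import prepare_sqa_data

# Project-specific modules (import paths are guesses; adjust to the actual package layout):
# from somepackage.vectorizers import IndexVectorizer, FlatIndexVectorizer
# import somepackage.text_processor as tp   # provides CustomVectorizer
# import process_fb

# Module-level switches referenced by some of the scripts (example values):
# fb = False
# wiki = True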
Example #2
def main():

    loc_dic = dict()  # to store locations

    uns_spos, doc_dic = prepare_sqa_data.get_spo_from_uns(loc_dic=loc_dic)

    spos = uns_spos

    true_pairs = []
    S = []
    P = []
    O = []
    # positive pairs
    for s, p, o in spos:
        true_pairs.append((s, p, 0))
        true_pairs.append((s, o, 0))
        true_pairs.append((p, o, 0))
        S.append(s)
        P.append(p)
        O.append(o)
    # negative pairs
    random_permutation = np.random.permutation(len(S))
    S = np.asarray(S)
    S = S[random_permutation]
    random_permutation = np.random.permutation(len(O))
    O = np.asarray(O)
    O = O[random_permutation]

    # Keep the positive pairs in a set so shuffled pairs that collide with them can be skipped.
    pos = set()
    for e1, e2, label in true_pairs:
        pos.add(e1 + e2)

    false_pairs = []
    for s, p, o in zip(list(S), P, list(O)):
        if s + p in pos or s + o in pos or p + o in pos:
            continue
        false_pairs.append((s, p, 1))
        false_pairs.append((s, o, 1))
        false_pairs.append((p, o, 1))

    all_data = true_pairs + false_pairs

    vocab = dict()

    sparsity_code_size = 48

    idx_vectorizer = IndexVectorizer(vocab_index=vocab, sparsity_code_size=sparsity_code_size, tokenizer_sep=" ")
    vectorizer = tp.CustomVectorizer(idx_vectorizer)

    # vectorization happens here
    X1 = []
    X2 = []
    Y = []
    for e1, e2, label in all_data:
        ve1 = vectorizer.get_vector_for_tuple(e1)
        ve1 = ve1.toarray()[0]
        ve2 = vectorizer.get_vector_for_tuple(e2)
        ve2 = ve2.toarray()[0]
        X1.append(ve1)
        X2.append(ve2)
        Y.append(label)

    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    Y = np.asarray(Y)

    vocab, inv_vocab = vectorizer.get_vocab_dictionaries()

    print('-')
    print('Vocab size:', str(len(vocab)), 'unique words')
    print('Code input bin of size: ' + str(X1.shape[1]))
    print('X1 shape:', X1.shape)
    print('X2 shape:', X2.shape)
    print('Y shape:', Y.shape)
    print('-')
    print('Compiling...')

    input_dim = X1.shape[1]

    from architectures import fabric_binary as bae
    model = bae.declare_model(input_dim, 256)
    model = bae.compile_model(model)

    output_dim = input_dim + 32

    # NOTE: assumption - train the autoencoder to map each pair's first element onto its second.
    model.fit(X1, X2, epochs=250, batch_size=16, shuffle=True)

    o_path = "/data/eval/qatask/location/"

    bae.save_model_to_path(model, o_path, log="location")

    with open(o_path + "tf_dictionary.pkl", "wb") as f:
        pickle.dump(vocab, f)
Example #3
def main():

    i_path = "/data/eval/fb/true_pairs.pkl"

    from utils import prepare_sqa_data

    # Get pairs from scratch or from serialized file
    if i_path is None:
        spos = prepare_sqa_data.get_spo_from_rel(filter_stopwords=True)
        uns_spos, loc_dic = prepare_sqa_data.get_spo_from_uns()
        spos = spos + uns_spos
        pos_samples = []
        # positive pairs
        for s, p, o in spos:
            pos_samples.append(s + " " + p)
            pos_samples.append(s + " " + o)
            pos_samples.append(p + " " + o)
    else:
        print("Loading data from: " + str(i_path))
        with open(i_path, "rb") as f:
            true_pairs = pickle.load(f)
        pos_samples = []
        seen = set()
        for e1, e2, label in true_pairs:
            if e1 + e2 in seen:
                continue
            pos_samples.append(e1 + " " + e2)
            seen.add(e1 + e2)

    all_data = pos_samples
    print("Pos samples available: " + str(len(all_data)))

    if i_path is not None:
        with open("/data/eval/fb/tf_dictionary.pkl", "rb") as f:
            vocab = pickle.load(f)
    else:
        vocab = dict()

    print("Initial vocab lenght: " + str(len(vocab)))

    if not fb:
        sparsity_code_size = 48
        idx_vectorizer = IndexVectorizer(vocab_index=vocab,
                                         sparsity_code_size=sparsity_code_size,
                                         tokenizer_sep=" ")
        vectorizer = tp.CustomVectorizer(idx_vectorizer)
    else:
        sparsity_code_size = 12
        idx_vectorizer = FlatIndexVectorizer(
            vocab_index=vocab, sparsity_code_size=sparsity_code_size)
        vectorizer = tp.CustomVectorizer(idx_vectorizer)

    # vectorization happens here
    X = []
    #all_data = all_data[:1000]  # test
    for el in all_data:
        ve = vectorizer.get_vector_for_tuple(el)
        ve = ve.toarray()[0]
        X.append(ve)

    X = np.asarray(X)

    vocab, inv_vocab = vectorizer.get_vocab_dictionaries()

    # def model1():
    input_dim = sparsity_code_size * 32

    from architectures import fabric_binary as bae
    model = bae.declare_model(input_dim, 256)
    model = bae.compile_model(model)

    st = time.time()

    model.fit(X, X, epochs=15, batch_size=128, shuffle=True)

    et = time.time()
    print("Total time: " + str(et - st))

    #o_path = "/data/eval/qatask/ad3/"
    o_path = "/data/eval/fbad/"

    bae.save_model_to_path(model, o_path, log="ad")

    with open(o_path + "tf_dictionary.pkl", "wb") as f:
        pickle.dump(vocab, f)
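The architectures.fabric_binary module is not shown in these examples. Purely to illustrate the fit(X, X) reconstruction setup above, here is a hypothetical sketch of what declare_model and compile_model could look like; the project's actual implementation may differ.

# Hypothetical stand-in for architectures.fabric_binary (not the project's implementation).
from keras.models import Sequential
from keras.layers import Dense

def declare_model(input_dim, embedding_dim):
    # Dense autoencoder: compress the sparse binary code, then reconstruct it.
    model = Sequential()
    model.add(Dense(embedding_dim, input_shape=(input_dim,), activation='relu'))
    model.add(Dense(input_dim, activation='sigmoid'))
    return model

def compile_model(model):
    # Binary codes in, binary codes out, so binary cross-entropy is a natural reconstruction loss.
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model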
Example #4
def main():
    o_path = "/data/eval/fbpair/"
    all_data, true_pairs = process_fb.extract_data_pairs()

    if wiki:
        o_path = "/data/eval/wikipair/"
        # structured_path = "/Users/ra-mit/data/fabric/dbpedia/triples_structured/all.csv"
        structured_path = "/data/smalldatasets/wiki/all.csv"
        # unstructured_path = "/Users/ra-mit/data/fabric/dbpedia/triples_unstructured/"
        unstructured_path = "/data/smalldatasets/wiki/triples_unstructured/"
        spos = []
        df = pd.read_csv(structured_path, encoding='latin1')
        ss = list(df.iloc[:, 0])
        ps = df.iloc[:, 1]
        os = df.iloc[:, 2]
        for s, p, o in zip(ss, ps, os):
            spos.append((s, p, o))
        print("Total structured spos: " + str(len(spos)))

        from utils import prepare_sqa_data
        uns_spos, loc_dic = prepare_sqa_data.get_spo_from_uns(path=unstructured_path)

        print("Total unstructured spos: " + str(len(uns_spos)))

        spos += uns_spos

        all_data = []
        all_right = []
        all_pred = []
        all_left = []
        for s, p, o in spos:
            all_left.append(s)
            all_pred.append(p)
            all_right.append(o)

            all_data.append((s + " " + p, o, 0))
            all_data.append((s, p + " " + o, 0))

        print("True pairs: " + str(len(all_data)))

        pos = set()
        for e1, e2, label in all_data:
            pos.add(e1 + e2)

        # negative pairs
        random_permutation = np.random.permutation(len(all_left))
        all_left = np.asarray(all_left)
        all_left = all_left[random_permutation]
        random_permutation = np.random.permutation(len(all_right))
        all_right = np.asarray(all_right)
        all_right = all_right[random_permutation]

        false_pairs = []
        seen = set()
        for s, p, o in zip(all_left, all_pred, all_right):
            if s + " " + p + o in pos or s + p + " " + o in pos:
                continue

            if s + " " + p + o not in seen:
                false_pairs.append((s + " " + p, o, 1))

            if s + p + " " + o not in seen:
                false_pairs.append((s, p + " " + o, 1))

            seen.add((s + " " + p + o))
            seen.add((s + p + " " + o))

        print("False pairs: " + str(len(false_pairs)))

        all_data += false_pairs

        print("All pairs: " + str(len(all_data)))

    random_permutation = np.random.permutation(len(all_data))
    all_data = np.asarray(all_data)
    all_data = all_data[random_permutation]
    # with open(o_path + "true_pairs.pkl", "wb") as f:
    #     pickle.dump(true_pairs, f)
    # all_data = all_data[:2000]  # test
    # total = 0
    # for s, p, label in all_data:
    #    total += label
    # print("total: " + str(total/len(all_data)))

    #all_data = all_data[:2000]  # chunk all_data for evaluator

    vocab = dict()

    sparsity_code_size = 8
    idx_vectorizer = FlatIndexVectorizer(vocab_index=vocab, sparsity_code_size=sparsity_code_size)
    if wiki:
        sparsity_code_size = 48
        idx_vectorizer = IndexVectorizer(vocab_index=vocab, sparsity_code_size=sparsity_code_size, tokenizer_sep=" ")
    vectorizer = tp.CustomVectorizer(idx_vectorizer)

    st = time.time()
    print("start vectorizing...")
    # vectorization happens here
    X1 = []
    X2 = []
    Y = []
    for e1, e2, label in all_data:
        ve1 = vectorizer.get_vector_for_tuple(e1)
        ve1 = ve1.toarray()[0]
        ve2 = vectorizer.get_vector_for_tuple(e2)
        ve2 = ve2.toarray()[0]
        X1.append(ve1)
        X2.append(ve2)
        Y.append(label)

    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    Y = np.asarray(Y)

    et = time.time()
    print("finish vectorizing...")
    print("took: " + str(et - st))

    vocab, inv_vocab = vectorizer.get_vocab_dictionaries()

    print("vocab size: " + str(len(vocab)))

    # def model1():
    input_dim = sparsity_code_size * 32

    # declare network
    i1 = Input(shape=(input_dim,), name="i1")
    i2 = Input(shape=(input_dim,), name="i2")

    base = Sequential()
    base.add(Dense(1024, input_shape=(input_dim,), activation='relu'))
    # base.add(Dense(2056, input_shape=(input_dim,), activation='relu'))
    # base.add(Dense(512, input_shape=(input_dim,), activation='relu'))
    # base.add(Dense(2056, activation='relu'))
    # base.add(Dense(768, activation='relu'))
    base.add(Dense(512, activation='relu'))
    # base.add(Dense(1024, activation='relu'))
    base.add(Dense(256, activation='relu'))
    base.add(Dense(128, activation='relu'))
    # base.add(Dense(64, activation='relu'))

    emb_1 = base(i1)
    emb_2 = base(i2)

    def euclidean_distance(vects):
        x, y = vects
        return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

    def eucl_dist_output_shape(shapes):
        shape1, shape2 = shapes
        return shape1[0], 1

    def contrastive_loss(y_true, y_pred):
        margin = 1
        # Y=0 means similar and Y=1 means dissimilar. Think of it as distance
        return K.mean((1 - y_true) * K.square(y_pred) + y_true * K.square(K.maximum(margin - y_pred, 0)))

    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([emb_1, emb_2])

    fullmodel = Model(inputs=[i1, i2], outputs=distance)

    opt = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)

    callbacks = []
    callback_best_model = keras.callbacks.ModelCheckpoint(o_path + "epoch-{epoch}.h5",
                                                          monitor='val_loss',
                                                          save_best_only=False)
    # early stopping callback
    callback_early_stop = keras.callbacks.EarlyStopping(monitor='acc', patience=2)
    callbacks.append(callback_best_model)
    callbacks.append(callback_early_stop)

    fullmodel.compile(optimizer=opt, loss=contrastive_loss, metrics=['accuracy'])

    fullmodel.summary()

    def size(model):  # Compute number of params in a model (the actual number of floats)
        return sum([np.prod(K.get_value(w).shape) for w in model.trainable_weights])

    print("trainable params: " + str(size(fullmodel)))

    fullmodel.fit([X1, X2], Y, epochs=125, shuffle=True, batch_size=80, callbacks=callbacks)

    encoder = Model(inputs=i1, outputs=emb_1)

    fullmodel.save(o_path + "/sim.h5")
    encoder.save(o_path + "/sim_encoder.h5")

    with open(o_path + "tf_dictionary.pkl", "wb") as f:
        pickle.dump(vocab, f)
Example #5
def main():
    #o_path = "/Users/ra-mit/development/fabric/uns/sim/"
    o_path = "/data/eval/wiki/"

    from utils import prepare_sqa_data
    #data = prepare_sqa_data.get_sqa(filter_stopwords=True)

    if not fb and not wiki:

        spos = prepare_sqa_data.get_spo_from_rel(filter_stopwords=True)

        uns_spos, loc_dic = prepare_sqa_data.get_spo_from_uns()

        spos = spos + uns_spos

    if wiki:
        # structured_path = "/Users/ra-mit/data/fabric/dbpedia/triples_structured/all.csv"
        structured_path = "/data/smalldatasets/wiki/all.csv"
        # unstructured_path = "/Users/ra-mit/data/fabric/dbpedia/triples_unstructured/"
        unstructured_path = "/data/smalldatasets/wiki/triples_unstructured/"
        spos = []
        df = pd.read_csv(structured_path, encoding='latin1')
        ss = list(df.iloc[:, 0])
        ps = df.iloc[:, 1]
        os = df.iloc[:, 2]
        for s, p, o in zip(ss, ps, os):
            spos.append((s, p, o))
        print("Total structured spos: " + str(len(spos)))

        # uns_files = csv_access.list_files_in_directory(unstructured_path)
        uns_spos, loc_dic = prepare_sqa_data.get_spo_from_uns(path=unstructured_path)
        # uns_spos = []
        # for f in uns_files:
        #     df = pd.read_csv(f, encoding='latin1')
        #     ss = list(df.iloc[:, 0])
        #     ps = df.iloc[:, 1]
        #     os = df.iloc[:, 2]
        #     for s, p, o in zip(ss, ps, os):
        #         uns_spos.append((s, p, o))

        print("Total unstructured spos: " + str(len(uns_spos)))

        spos += uns_spos
        print("Total: " + str(len(spos)))

    true_pairs = []
    S = []
    P = []
    O = []
    # positive pairs
    for s, p, o in spos:
        s = str(s)
        p = str(p)
        o = str(o)
        true_pairs.append((s, p, 0))
        true_pairs.append((s, o, 0))
        true_pairs.append((p, o, 0))
        S.append(s)
        P.append(p)
        O.append(o)

    if not fb and not wiki:
        with open(o_path + "true_pairs.pkl", "wb") as f:
            pickle.dump(true_pairs, f)

    print("True pairs: " + str(len(true_pairs)))

    # set to avoid negative samples that collide with positive ones
    pos = set()
    for e1, e2, label in true_pairs:
        pos.add(e1 + e2)

    print("Unique true pairs: " + str(len(pos)))

    # negative pairs
    random_permutation = np.random.permutation(len(S))
    S = np.asarray(S)
    S = S[random_permutation]
    random_permutation = np.random.permutation(len(O))
    O = np.asarray(O)
    O = O[random_permutation]

    false_pairs = []
    for s, p, o in zip(list(S), P, list(O)):
        if s + p in pos or s + o in pos or p + o in pos:
            continue  # this is probably colliding with pos, so we do not include
        false_pairs.append((s, p, 1))
        false_pairs.append((s, o, 1))
        false_pairs.append((p, o, 1))

    print("Negative pairs 1: " + str(len(false_pairs)))

    random_permutation = np.random.permutation(len(S))
    S = np.asarray(S)
    S = S[random_permutation]
    random_permutation = np.random.permutation(len(O))
    O = np.asarray(O)
    O = O[random_permutation]

    false_pairs2 = []
    for s, p, o in zip(list(S), P, list(O)):
        if s + p in pos or s + o in pos or p + o in pos:
            continue  # this is probably colliding with pos, so we do not include
        false_pairs2.append((s, p, 1))
        false_pairs2.append((s, o, 1))
        false_pairs2.append((p, o, 1))

    print("Negative pairs 2: " + str(len(false_pairs2)))

    all_data = true_pairs + false_pairs + false_pairs2

    sparsity_code_size = 48

    if fb:
        sparsity_code_size = 4 # 1 word per clause
        o_path = "/data/eval/fb/"
        all_data, true_pairs = process_fb.extract_data()
        
        # start counting vals
        #_test = all_data[:2000]  # test
        #total = 0
        #for s, p, label in _test:
        #    total += label
        #print("total: " + str(total/len(all_data)))
        # end counting vals
      
        random_permutation = np.random.permutation(len(all_data))
        all_data = np.asarray(all_data)
        all_data = all_data[random_permutation]
        with open(o_path + "true_pairs.pkl", "wb") as f:
            pickle.dump(true_pairs, f)
        #all_data = all_data[:2000]  # test
        #total = 0
        #for s, p, label in all_data:
        #    total += label
        #print("total: " + str(total/len(all_data)))

    if wiki:
        sparsity_code_size = 48
        o_path = "/data/eval/wiki/"
        random_permutation = np.random.permutation(len(all_data))
        all_data = np.asarray(all_data)
        all_data = all_data[random_permutation]
        with open(o_path + "true_pairs.pkl", "wb") as f:
            pickle.dump(true_pairs, f)

    vocab = dict()

    if not fb:
        idx_vectorizer = IndexVectorizer(vocab_index=vocab, sparsity_code_size=sparsity_code_size, tokenizer_sep=" ")
    else:
        idx_vectorizer = FlatIndexVectorizer(vocab_index=vocab, sparsity_code_size=sparsity_code_size)
    vectorizer = tp.CustomVectorizer(idx_vectorizer)

    st = time.time()
    print("start vectorizing...")
    # vectorization happens here
    X1 = []
    X2 = []
    Y = []
    for e1, e2, label in all_data:
        ve1 = vectorizer.get_vector_for_tuple(e1)
        ve1 = ve1.toarray()[0]
        ve2 = vectorizer.get_vector_for_tuple(e2)
        ve2 = ve2.toarray()[0]
        X1.append(ve1)
        X2.append(ve2)
        Y.append(label)

    X1 = np.asarray(X1)
    X2 = np.asarray(X2)
    Y = np.asarray(Y)

    et = time.time()
    print("finish vectorizing...")
    print("took: " + str(et-st))

    vocab, inv_vocab = vectorizer.get_vocab_dictionaries()

    print("vocab size: " + str(len(vocab)))

    # def model1():
    input_dim = sparsity_code_size * 32

    # declare network
    i1 = Input(shape=(input_dim,), name="i1")
    i2 = Input(shape=(input_dim,), name="i2")

    base = Sequential()
    base.add(Dense(1024, input_shape=(input_dim,), activation='relu'))
    #base.add(Dense(2056, input_shape=(input_dim,), activation='relu'))
    #base.add(Dense(512, input_shape=(input_dim,), activation='relu'))
    #base.add(Dense(2056, activation='relu'))
    #base.add(Dense(768, activation='relu'))
    base.add(Dense(512, activation='relu'))
    #base.add(Dense(1024, activation='relu'))
    base.add(Dense(256, activation='relu'))
    base.add(Dense(128, activation='relu'))
    #base.add(Dense(64, activation='relu'))

    emb_1 = base(i1)
    emb_2 = base(i2)

    def euclidean_distance(vects):
        x, y = vects
        return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))

    def eucl_dist_output_shape(shapes):
        shape1, shape2 = shapes
        return shape1[0], 1

    def contrastive_loss(y_true, y_pred):
        margin = 1
        # Y=0 means similar and Y=1 means dissimilar. Think of it as distance
        return K.mean((1 - y_true) * K.square(y_pred) + y_true * K.square(K.maximum(margin - y_pred, 0)))

    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([emb_1, emb_2])

    fullmodel = Model(inputs=[i1, i2], outputs=distance)

    opt = SGD(lr=0.05, decay=1e-6, momentum=0.9, nesterov=True)

    callbacks = []
    callback_best_model = keras.callbacks.ModelCheckpoint(o_path + "epoch-{epoch}.h5",
                                                          monitor='val_loss',
                                                          save_best_only=False)
    callbacks.append(callback_best_model)

    fullmodel.compile(optimizer=opt, loss=contrastive_loss, metrics=['accuracy'])

    fullmodel.summary()

    def size(model):  # Compute number of params in a model (the actual number of floats)
        return sum([np.prod(K.get_value(w).shape) for w in model.trainable_weights])

    print("trainable params: " + str(size(fullmodel)))

    fullmodel.fit([X1, X2], Y, epochs=300, shuffle=True, batch_size=80, callbacks=callbacks)

    encoder = Model(inputs=i1, outputs=emb_1)

    fullmodel.save(o_path + "/sim.h5")
    encoder.save(o_path + "/sim_encoder.h5")

    with open(o_path + "tf_dictionary.pkl", "wb") as f:
        pickle.dump(vocab, f)
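Once training finishes, the encoder saved as sim_encoder.h5 and the pickled vocabulary can be reloaded to embed new strings and compare them by Euclidean distance. A minimal sketch, assuming the same IndexVectorizer and tp.CustomVectorizer classes used above are importable, and using placeholder strings:

import pickle
import numpy as np
from keras.models import load_model

o_path = "/data/eval/wiki/"

encoder = load_model(o_path + "sim_encoder.h5")  # plain Dense stack, no custom objects needed
with open(o_path + "tf_dictionary.pkl", "rb") as f:
    vocab = pickle.load(f)

# Same vectorization setup as during training (sparsity_code_size = 48 for the wiki data).
idx_vectorizer = IndexVectorizer(vocab_index=vocab, sparsity_code_size=48, tokenizer_sep=" ")
vectorizer = tp.CustomVectorizer(idx_vectorizer)

def embed(text):
    v = vectorizer.get_vector_for_tuple(text).toarray()
    return encoder.predict(v)[0]

# Pairs labeled 0 (similar) during training should end up close under Euclidean distance.
print(np.linalg.norm(embed("barack obama") - embed("obama")))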
Example #6
def collocs(text):
    bigrams = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_documents(
        [nltk.word_tokenize(" ".join(text))])
    finder.apply_freq_filter(2)
    topk = finder.nbest(bigrams.pmi, 15)
    for tk in topk:
        print(tk)
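
# A hypothetical usage example (assumes the NLTK 'punkt' tokenizer data is available):
#
#     docs = ["new york is a large city",
#             "many people visit new york every year",
#             "new york hosts the united nations"]
#     collocs(docs)  # prints the top-PMI bigrams, e.g. ('new', 'york')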


if __name__ == "__main__":
    print("nlp utils")

    from utils.prepare_sqa_data import get_spo_from_uns
    spos, loc_dic = get_spo_from_uns(
        path="/Users/ra-mit/data/fabric/dbpedia/test")

    # path = "/Users/ra-mit/data/fabric/academic/preprocessed/barbara10.txt"

    # with open(path, "r") as f:
    #     text = f.readlines()
    #
    # all_text = " ".join(text)
    # sents = sentence_segmentation(all_text)
    # for s in sents:
    #     s = [(w, pos) for (w, pos) in s if w not in ['.', ',', '-', ';', ':', '``']]
    #     s = filter(" ".join([w for w, pos in s]))
    #     print(s)
    #
    # exit()