Example #1
    def save(self, *args, **kwargs):
        # Normalize all address fields before saving the record.
        self.state = normalize_text(self.state)
        self.city = normalize_text(self.city)
        self.neighborhood = normalize_text(self.neighborhood)
        self.location = normalize_text(self.location)
        self.zipcode = normalize_text(self.zipcode)
        super(Address, self).save(*args, **kwargs)
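Every example in this listing calls a normalize_text helper that is not shown on this page. A minimal sketch of what such a helper could look like is below; trimming, whitespace collapsing and uppercasing are assumptions for illustration, not the actual behavior of the project's normalize_text.

import re

def normalize_text(value):
    # Hypothetical sketch only: trim, collapse internal whitespace, uppercase.
    # The real normalize_text used by these examples may behave differently.
    if not value:
        return value
    return re.sub(r"\s+", " ", str(value)).strip().upper()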
Example #2
    def parse(klass, row, carrier, areacode, phone_type):
        # Build Person, Address and Phone instances from a single CSV row.
        row = list(row)                                 # Make row mutable

        person = Person()
        person.name = normalize_text(row[2])

        address = Address()
        address.location = normalize_text(row[3])
        address.neighborhood = normalize_text(row[4])
        address.city = normalize_text(row[5])
        address.state = normalize_text(row[6])

        phone = Phone()
        phone.carrier = carrier
        phone.areacode = areacode
        phone.type = phone_type

        try:

            # document - try CPF
            person.document = validate_cpf(row[8][-11:])
            person.nature = Person.NATURE_CHOICES_PHYSICAL[0]

        except CPFValidationError:

            # document - try CNPJ
            person.document = validate_cnpj(row[8][-14:])
            person.nature = Person.NATURE_CHOICES_LEGAL[0]

        address.zipcode = validate_zipcode(row[7])

        phone.number = validate_phone_number(row[1])

        return klass(row, person, address, phone)
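A hedged usage sketch for the parse method above: it assumes parse is a classmethod on a record-style class (called Record here purely for illustration) and that rows come from csv.reader with the column layout the example indexes (phone number in column 1, name in 2, address fields in 3-6, zipcode in 7, document in 8). None of these names are the project's actual API.

import csv

def load_records(path, carrier, areacode, phone_type):
    # Hypothetical driver: feed every CSV row to Record.parse (illustrative name).
    records = []
    with open(path, newline="", encoding="utf8") as fh:
        for row in csv.reader(fh):
            records.append(Record.parse(row, carrier, areacode, phone_type))
    return records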
Example #3
def create_plot_record(embeddings, plot, movie_id):
    # Build one list of word vectors per non-empty plot sentence.
    p = []
    sent_lens = []

    plot_size = 0
    for pl in plot:
        words = util.normalize_text(pl)
        if (len(words) > 0) and (words[0] != ''):
            p_sent = []
            word_count = 0
            plot_size += 1

            for j, word in enumerate(words):
                if (j < data_conf.P_MAX_WORD_PER_SENT_COUNT) and (plot_size < data_conf.P_MAX_SENT_COUNT):
                    p_sent.append(util.get_word_vector(embeddings, word, data_conf.EMBEDDING_SIZE))
                    word_count += 1
            sent_lens.append(word_count)
            p.append(p_sent)
    return p
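A minimal sketch of how create_plot_record might be driven, assuming embeddings is whatever util.load_embeddings returns and plot is the list of plot sentences for one movie (both assumptions mirror how Example #5 calls it; the movie id is a placeholder).

# Hypothetical call, mirroring the usage in Example #5 below.
plot_sentences = ["A young farm boy joins a rebellion.",
                  "He trains with an old mentor."]
p_word_vectors = create_plot_record(embeddings, plot_sentences, "tt0000000")
print(len(p_word_vectors))  # number of non-empty sentences kept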
Example #4
def run_creation(model_type, attack, model_folder, examples_folder,
                 instances_to_attack):
    print("store created examples in %s" % examples_folder)
    if model_type == "lstm":
        import movieqa.run_lstm as runner
    else:
        import movieqa.run_cnn as runner

    runner.data_conf.TRAIN_DIR = model_folder

    load = False
    check_sents = []
    check_found = []
    check_num = 0
    corr_probs = []
    if not tf.gfile.Exists(examples_folder):
        tf.gfile.MakeDirs(examples_folder)
    else:
        checkpoints = glob.glob(examples_folder + "/[!accuracies]*")
        checkpoints = sorted(checkpoints, reverse=True)
        latest = checkpoints[0]
        # Resume numbering from the most recently written checkpoint file.
        splitted = latest.split(".txt")[0]
        check_num = int(splitted[-1]) + 1

        check = open(latest, encoding="utf8")
        for line in check:
            parts = line.replace('\n', '').split("\t")
            check_words = parts[0].split(" ")
            check_sents.append(check_words)
            last_prob = float(parts[1])
            found = parts[2]
            b_found = (found == 'True')
            corr_probs.append(last_prob)
            check_found.append(b_found)

        load = True

    emb_dir = runner.data_conf.EMBEDDING_DIR

    vectors, vocab = util.load_embeddings(emb_dir)
    rev_vocab = dict(zip(vocab.values(), vocab.keys()))
    # print(rev_vocab)
    filename = "adversarial_addAny/common_english.txt"
    # length of the distractor sentence
    d = 10
    # pool size of common words to sample from for each word in the distractor sentence
    poolsize = 10
    common_words = {}
    fin = open(filename, encoding="utf8")
    for line in fin:
        word = line.replace('\n', '')
        # print(word)
        if word in rev_vocab:
            common_words[word] = rev_vocab[word]
        else:
            print(
                'ERROR: word "%s" not in vocab. Run add_common_words_to_vocab.py first.'
                % word)
            exit(1)

    with open(instances_to_attack + '/val.pickle', 'rb') as handle:
        qa = pickle.load(handle)

    w_s = []
    w_choices = []
    w_found = []

    q_inds = []
    pools = []
    with open(examples_folder + "/" + str(0 + check_num) + ".txt",
              "a") as file:
        for k, question in enumerate(qa):
            # load question indices
            q_words = util.normalize_text(question.question)
            q_ind = []
            for word in q_words:
                q_ind.append(rev_vocab[word])

            a_words = []
            for i, answer in enumerate(question.answers):
                if not i == int(question.correct_index):
                    words = util.normalize_text(answer)
                    a_words.extend(words)
            w = []
            w_choice = []
            rand_sent = ""
            for i in range(0, d):
                if load:
                    c_word = check_sents[k][i]
                    w_index = rev_vocab[c_word]
                    rand_sent += (c_word + " ")

                else:
                    w_index = random.choice(list(common_words.values()))
                    rand_sent += (vocab[w_index] + " ")
                    w_found.append(False)
                w.append(w_index)
                w_choice.append(i)

            if load:
                found = check_found[k]
                w_found.append(found)
                # file.write(rand_sent+"\t"+str(corr_probs[k])+"\t"+str(found)+"\n")
            else:
                found = False
                w_found.append(found)
                file.write(rand_sent + "\t" + "1.0" + "\t" + str(found) + "\n")

            shuffle(w_choice)
            w_choices.append(w_choice)

            w_s.append(w)
            d_pools = []
            for j, dj in enumerate(w):
                pool = []
                random_common_words = np.random.choice(list(
                    common_words.values()),
                                                       poolsize,
                                                       replace=False)
                print("Adding common words")
                pool.extend(random_common_words)
                if attack == 'addQ' or attack == "addQA":
                    print("Adding question words")
                    for word in q_words:
                        pool.append(rev_vocab[word])
                if attack == "addA" or attack == "addQA":
                    print("Adding answer words")
                    for word in a_words:
                        pool.append(rev_vocab[word])

                shuffle(pool)
                d_pools.append(pool)
            pools.append(d_pools)

    filepath = instances_to_attack + "/*.tfrecords"
    filenames = glob.glob(filepath)

    global_step = tf.contrib.framework.get_or_create_global_step()
    dataset = tf.contrib.data.TFRecordDataset(filenames)
    dataset = dataset.map(runner.get_single_sample)
    dataset = dataset.repeat(poolsize * d)
    batch_size = 1

    dataset = dataset.padded_batch(batch_size,
                                   padded_shapes=([None], [5, None], [None],
                                                  (), [None, None], ()))

    iterator = dataset.make_one_shot_iterator()

    next_q, next_a, next_l, next_plot_ids, next_plots, next_q_types = iterator.get_next(
    )
    add_sent = tf.placeholder(tf.int64, shape=[None])
    # sent_exp = tf.expand_dims(add_sent,0)
    m_p = tf.py_func(add_plot_sentence, [next_plots, add_sent], [tf.int64])[0]
    # m_p = next_plots
    # m_p = tf.concat([next_plots,sent_exp],axis=0)

    logits, atts, sent_atts, _ = runner.predict_batch([next_q, next_a, m_p],
                                                      training=False)

    probabs = model.compute_probabilities(logits=logits)
    accuracy_example = tf.reduce_mean(
        model.compute_accuracies(logits=logits, labels=next_l, dim=1))

    to_restore = tf.contrib.slim.get_variables_to_restore(
        exclude=["embeddings"])
    saver = tf.train.Saver(to_restore)

    p_counts = 0
    last_p = ''
    p_id = 0
    f_counter = 0
    with tf.Session() as sess:
        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())
        sess.run(init_op)
        ckpt = tf.train.get_checkpoint_state(runner.data_conf.TRAIN_DIR)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found')
        _ = sess.run(runner.set_embeddings_op,
                     feed_dict={runner.place: vectors})
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        if not load:
            accs = np.ones(shape=(len(qa)))
        else:
            accs = corr_probs
        for w_counter in range(0, d):
            words = np.zeros(shape=(len(qa)), dtype=np.int64)
            # select next word to optimize greedily
            next_inds = []
            for k, question in enumerate(qa):
                next_word = w_choices[k].pop()
                next_inds.append(next_word)
                words[k] = w_s[k][next_word]

            # go through whole pool for every question
            next_ind = 0
            for pool_counter in range(0, poolsize):
                total_acc = 0.0
                info = ""

                for k, question in enumerate(qa):
                    w_copy = [x for x in w_s[k]]
                    print("==============")
                    next_ind = next_inds[k]
                    pool = pools[k][next_ind]

                    pool_ind = pool.pop()

                    print("setting " + str(w_s[k][next_ind]) + " to " +
                          str(pool_ind))
                    w_copy[next_ind] = pool_ind
                    info = "wordcounter: " + str(
                        w_counter) + " - poolcounter: " + str(
                            pool_counter) + " - question: " + str(k)
                    print(info)
                    acc_val, probs_val, gs_val, q_type_val, q_val, atts_val, sent_atts_val, labels_val, p_val, a_val, p_id_val = sess.run(
                        [
                            accuracy_example, probabs, global_step,
                            next_q_types, next_q, atts, sent_atts, next_l, m_p,
                            next_a, next_plot_ids
                        ],
                        feed_dict={add_sent: w_copy})
                    sent = ""
                    for word in w_copy:
                        sent += (" " + vocab[word])
                    print(sent + " - acc: " + str(acc_val))
                    corr = np.argmax(labels_val[0])
                    pred_val = probs_val[0][corr]
                    if (pred_val < accs[k]):
                        word_s = vocab[words[k]]
                        pool_s = vocab[pool_ind]
                        print(pool_s + " (" + str(pred_val) + ") < " + word_s +
                              " (" + str(accs[k]) + ")")
                        words[k] = pool_ind
                        accs[k] = pred_val
                        if acc_val == 0:
                            print("setting" + str(k) + " to true with acc" +
                                  str(acc_val) + " and pred " + str(pred_val))
                            w_found[k] = True
                            f_counter += 1

                    filename = ''
                    q_s = ''
                    for index in q_val[0]:
                        word = (vocab[index])
                        q_s += (word + ' ')
                        filename += (word + '_')
                    predicted_probabilities = probs_val[0]
                    labels = labels_val[0]

                    p_id = 'test'
                    path = runner.data_conf.EVAL_DIR + "/plots/" + p_id + "/" + filename
                    if (p_counts < 20):
                        for i, a_att in enumerate(atts_val[0]):
                            # a_att = np.max(a_att, 1)
                            qa_s = q_s + "? (acc: " + str(acc_val) + ")\n "
                            for index in a_val[0][i]:
                                qa_s += (vocab[index] + ' ')
                            lv = " (label: " + str(int(
                                labels[i])) + " - prediction: " + (str(
                                    "%.2f" %
                                    (predicted_probabilities[i] * 100))) + "%)"
                            qa_s += lv

                            a_sents = []
                            y_labels = []

                            for j, att in enumerate(a_att):
                                a_s = []
                                y_labels.append(
                                    str("%.2f" %
                                        (sent_atts_val[0][i][j] * 100)) + "%")
                                for index in p_val[0][j]:
                                    a_s.append(vocab[index])
                                a_sents.append(a_s)
                            util.plot_attention(np.array(a_att),
                                                np.array(a_sents), qa_s,
                                                y_labels, path, filename)
                        last_p = p_id
                        p_counts += 1
                    total_acc += acc_val
                    print(total_acc / (k + 1))
                with open(examples_folder + "/accuracies.txt", "a") as file:
                    file.write(info + " - " + str(total_acc / (len(qa))) +
                               "\n")

            with open(
                    examples_folder + "/" + str(w_counter + check_num + 1) +
                    ".txt", "a") as file:
                for k, question in enumerate(qa):
                    w_s[k][next_ind] = words[k]
                    sent = ""
                    for word in w_s[k]:
                        sent += (vocab[word] + " ")
                    file.write(sent + "\t" + str(accs[k]) + "\t" +
                               str(w_found[k]) + "\n")
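A hedged sketch of how run_creation might be invoked. All paths are placeholders, and the attack values ('addQ', 'addA', 'addQA', or anything else for the plain common-words variant) are inferred from the branches above.

# Hypothetical invocation; every path below is a placeholder.
run_creation(model_type="lstm",
             attack="addQA",
             model_folder="train/lstm",
             examples_folder="adversarial_examples/lstm_addQA",
             instances_to_attack="data/attack_instances")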
Example #5
def create_movieqa_data(qa_json_file,
                        name,
                        outfolder,
                        embeddings,
                        qa_ids=None):
    valid_count = 0

    movie.cfg.QA_JSON = qa_json_file
    print("Preprocessing qa file and creating records for %s" %
          movie.cfg.QA_JSON)

    mqa = movie.DataLoader()
    story, qa = mqa.get_story_qa_data(name, 'split_plot')
    set_path = outfolder + "/" + name + ".tfrecords"
    writer = tf.python_io.TFRecordWriter(set_path)

    # filter questions by ids
    if qa_ids:
        qa = filter_qa(qa, qa_ids)
        print("Selected %d questions based on %d provided ids" %
              (len(qa), len(qa_ids)))
        with open(os.path.join(outfolder, 'val.pickle'), 'wb') as handle:
            pickle.dump(qa, handle)

    for k, question in enumerate(qa):
        q = []
        ans = []
        l = np.zeros(shape=[5], dtype=float)

        ex = tf.train.SequenceExample()
        words = util.normalize_text(question.question)

        # lowercase now
        words = [word.lower() for word in words]

        movie_id = question.imdb_key
        question_size = len(words)

        if name != "test":
            l[question.correct_index] = 1.0
        if words[0] in util.question_types:
            question_type = util.question_types[words[0]]
        else:
            question_type = -1
        ex.context.feature["question_type"].int64_list.value.append(
            question_type)

        for i, word in enumerate(words):
            if i < data_conf.Q_MAX_WORD_PER_SENT_COUNT:
                w_vec = (util.get_word_vector(embeddings, word,
                                              data_conf.EMBEDDING_SIZE))
                if not w_vec:
                    w_vec = (util.get_word_vector(embeddings, word,
                                                  data_conf.EMBEDDING_SIZE))
                q.append(w_vec)

        if movie_id not in plot_dict:
            plot = story.get(movie_id)
            p_word_ids = create_plot_record(embeddings, plot, movie_id)
            plot_dict[movie_id] = p_word_ids
        else:
            p_word_ids = plot_dict[movie_id]

        for i, answer in enumerate(question.answers):
            a = []
            words = util.normalize_text(answer)

            for j, word in enumerate(words):
                if j < data_conf.Q_MAX_WORD_PER_SENT_COUNT:
                    w_vec = (util.get_word_vector(embeddings, word,
                                                  data_conf.EMBEDDING_SIZE))
                    if not w_vec:
                        w_vec = (util.get_word_vector(
                            embeddings, word, data_conf.EMBEDDING_SIZE))
                    a.append(w_vec)
            ans.append(a)

        q_type_feature = tf.train.Feature(int64_list=tf.train.Int64List(
            value=[question_type]))

        q_size_feature = tf.train.Feature(int64_list=tf.train.Int64List(
            value=[question_size]))

        movie_id_feature = tf.train.Feature(bytes_list=tf.train.BytesList(
            value=[str.encode(movie_id)]))

        label_list_feature = [
            tf.train.Feature(float_list=tf.train.FloatList(value=[label]))
            for label in l
        ]

        answer_list_feature = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=aw))
            for aw in ans
        ]

        plot_list_feature = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=pl))
            for pl in p_word_ids
        ]

        question_list_feature = [
            tf.train.Feature(int64_list=tf.train.Int64List(value=q))
        ]

        feature_list = {
            "labels": tf.train.FeatureList(feature=label_list_feature),
            "answers": tf.train.FeatureList(feature=answer_list_feature),
            "question": tf.train.FeatureList(feature=question_list_feature),
            "plot": tf.train.FeatureList(feature=plot_list_feature),
        }

        context = tf.train.Features(
            feature={
                "question_type": q_type_feature,
                "question_size": q_size_feature,
                "movie_id": movie_id_feature
            })

        feature_lists = tf.train.FeatureLists(feature_list=feature_list)
        example_sequence = tf.train.SequenceExample(
            feature_lists=feature_lists, context=context)
        serialized = example_sequence.SerializeToString()
        writer.write(serialized)
        valid_count += 1

    print(name + ' set completed - files written to ' + set_path)
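A hedged sketch of a call to create_movieqa_data, assuming the embeddings argument is the vector table returned by util.load_embeddings (as Example #4 loads it) and that qa_ids is an optional list of question ids to keep, as the filter branch suggests. File paths and question ids are placeholders.

# Hypothetical invocation; paths and question ids are placeholders.
vectors, vocab = util.load_embeddings(data_conf.EMBEDDING_DIR)
create_movieqa_data("data/qa.json", "val", "records",
                    embeddings=vectors,
                    qa_ids=["val:0001", "val:0002"])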