Example 1
def plot_wvecs(filename):
    wv = load_pickle(filename)
    word_to_index = load_pickle("../data/word_to_index.pkl")
    index_to_word = invert_dict(word_to_index)
    counts = load_pickle("../data/counts.pkl")
    reduced_word_to_index = {}

    # Keep only words that occur more than 1800 times.
    for k, v in counts.items():
        if v > 1800:
            reduced_word_to_index[k] = v

    indices = [word_to_index[k] for k in reduced_word_to_index]
    words = [index_to_word[i].replace("edu.stanford.nlp.sempre.","") for i in indices]
    word_vecs = np.array([wv[i] for i in indices])

    # Project onto the top two principal components (PCA via SVD of the covariance).
    temp = word_vecs - np.mean(word_vecs, axis=0)
    covariance = 1.0 / len(word_vecs) * temp.T.dot(temp)
    U, S, V = np.linalg.svd(covariance)
    coord = temp.dot(U[:, 0:2])

    for i in range(len(words)):
        plt.text(coord[i,0], coord[i,1], words[i], bbox=dict(facecolor='green', alpha=0.1))
    
    plt.xlim((np.min(coord[:,0]), np.max(coord[:,0]) + 2))
    plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))
    plt.show()
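The function projects frequent-word vectors onto their top two principal components and renders each word as a text label. A minimal usage sketch, assuming load_pickle wraps pickle.load and invert_dict simply swaps keys and values; both helpers and the vector filename are assumptions, not shown in the source:

import pickle

import matplotlib.pyplot as plt
import numpy as np


def load_pickle(path):
    # Hypothetical helper; the snippet only shows it being called.
    with open(path, "rb") as f:
        return pickle.load(f)


def invert_dict(d):
    # Hypothetical helper: build the index -> word mapping.
    return {v: k for k, v in d.items()}


# Hypothetical call; the actual vector filename is not shown in the snippet.
plot_wvecs("../data/word_vectors.pkl")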
Example 2
def make_templates(args, templates_emb, w2emb, w2i):
    """
    Gets embedded + unembedded templates if they don't exist.
    """

    if os.path.exists("templates.pkl"):
        templates = load_pickle("templates.pkl")
        templates_emb_pad = load_pickle("templates_emb.pkl")
    else:
        # Flatten templates
        templates_emb = [y for x in templates_emb for y in x]
        # Cut templates to maximum length
        templates_emb = [temp[-args.max_length:] for temp in templates_emb]
        # Pad embeddings
        templates_emb_pad = [
            np.pad(temp1, ((0, args.max_length - len(temp1)), (0, 0)),
                   "constant",
                   constant_values=(len(w2i))) for temp1 in templates_emb
        ]
        # Convert embedded templates to word templates
        templates = [convert_to_words(sent, w2emb) for sent in templates_emb]

        save_pickle("templates.pkl", templates)
        save_pickle("templates_emb.pkl", templates_emb_pad)
    return templates, templates_emb_pad
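The padding call is the subtle step: each embedded template is a (length, emb_dim) array, and np.pad appends rows until every template is exactly args.max_length rows long, using len(w2i) as the fill value. A standalone sketch of that pattern, with toy shapes and a made-up fill value:

import numpy as np

max_length = 5
template = np.arange(6, dtype=float).reshape(3, 2)  # (length=3, emb_dim=2)

# Append rows along the first axis until the template has max_length rows;
# the fill value stands in for the out-of-vocabulary index len(w2i).
padded = np.pad(template,
                ((0, max_length - len(template)), (0, 0)),
                "constant",
                constant_values=3)

print(padded.shape)  # (5, 2)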
Example 3
 def __init__(self):
     '''
     Pre-load the files needed to compute sentence vectors: word2id, id2embed and id2weight.
     '''
     self.word_id = load_json(config.word_id_path)
     self.id_emb = load_pickle(config.id_emb_path)
     self.id_weight = load_pickle(config.id_weight_path)
     self.tokenizer = clean_seg
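Only the constructor is shown; presumably the rest of the class combines these lookups into a sentence vector, e.g. a weighted average of word embeddings. A hedged sketch of that combination as a free function (names and the weighting scheme are assumptions, not the project's code):

import numpy as np

def sentence_vector(tokens, word_id, id_emb, id_weight):
    # Hypothetical: weighted average of the embeddings of known tokens,
    # using the precomputed per-id weights.
    ids = [word_id[w] for w in tokens if w in word_id]
    if not ids:
        return None
    vecs = np.array([id_emb[i] for i in ids])
    weights = np.array([id_weight[i] for i in ids])
    return (weights[:, None] * vecs).sum(axis=0) / weights.sum()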
Example 4
def get_data(filename, data, embeddings, w2i, gensim_model, args):
    """
    Retrieves all data. Load it from a Pickle file if it exists, and create it
    otherwise.
    """

    global num_words

    if os.path.exists(filename):
        all_examples = data_utils.load_pickle(filename)
    else:
        all_examples = []

        # Only the first 10 dialogues are used when building from scratch.
        for example in tqdm(data[:10]):
            resources = []
            embedded_resources = []

            data_utils.get_resources(example["documents"]["comments"],
                                     resources, embedded_resources)
            data_utils.get_resources(example["documents"]["fact_table"],
                                     resources, embedded_resources)
            data_utils.get_resources(example["documents"]["plot"], resources,
                                     embedded_resources)
            data_utils.get_resources(example["documents"]["review"], resources,
                                     embedded_resources)

            chat = example["chat"]

            # Slide over the chat, using the three preceding utterances as
            # context for each candidate response.
            for i in range(3, len(chat) - 1):
                last_utterances = chat[i - 3:i]
                response = chat[i + 1]

                if len(response) > 0:
                    exp = []
                    embedded_utterances = [
                        data_utils.embed_sentence(utterance)
                        for utterance in last_utterances
                    ]
                    context, embedded_context = \
                        data_utils.get_context(last_utterances)

                    # Retrieval: score every resource against the context
                    # with Word Mover's Distance.
                    similarities = retrieve(context, resources, gensim_model)

                    padd_resource = embedded_resources[np.argmax(
                        similarities)][-args.max_length:]
                    padd_resource = np.pad(
                        padd_resource,
                        ((0, args.max_length - len(padd_resource)), (0, 0)),
                        "constant",
                        constant_values=(num_words))

                    exp.append(padd_resource)
                    exp.append(data_utils.clean_sentence(chat[i + 1]))
                    all_examples.append(tuple(exp))
        save_data(filename, all_examples)
    return all_examples
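retrieve() itself is not shown; the comment says it ranks resources against the context with Word Mover's Distance. A hedged sketch of what it could look like with gensim's wmdistance, which returns a distance and is therefore negated so that the np.argmax above picks the most relevant resource; the token handling is an assumption:

def retrieve(context, resources, gensim_model):
    # Hypothetical sketch: score every resource by negative Word Mover's
    # Distance to the context, so a higher score means more relevant.
    context_tokens = context.split() if isinstance(context, str) else context
    return [-gensim_model.wmdistance(context_tokens, resource)
            for resource in resources]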
Example 5
 def load(self, dataset="train"):
     path = self.get_path(dataset)
     return du.load_pickle(path)
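Every example above leans on a load_pickle/save_pickle pair that is never shown. A minimal sketch of what such helpers usually look like; the (path, obj) argument order for save_pickle mirrors the calls in the other snippets, but these are assumptions rather than the projects' actual utilities:

import pickle


def load_pickle(path):
    """Deserialize an object from a pickle file."""
    with open(path, "rb") as f:
        return pickle.load(f)


def save_pickle(path, obj):
    """Serialize an object to a pickle file."""
    with open(path, "wb") as f:
        pickle.dump(obj, f)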
Example 6
def split_files(args):
    assert os.path.isfile(
        args.label_file), 'label file not found: --label_file [{}]'.format(
            args.label_file)
    dirname, filename = os.path.split(args.label_file)
    data = load_pickle(args.label_file)

    # Split into labeled train, validation, and unlabeled pools.
    train_idx, leftover_idx, _, leftover_label = train_test_split(
        list(range(len(data['label']))),
        data['label'],
        train_size=args.labeled_data_size,
        stratify=data['label'])
    if len(leftover_idx) > args.valid_data_size:
        valid_idx, unlabel_idx, _, _ = train_test_split(
            leftover_idx,
            leftover_label,
            train_size=args.valid_data_size,
            stratify=leftover_label)
    else:
        valid_idx = leftover_idx
        unlabel_idx = []

    train_data = {key: np.array(item)[train_idx].tolist()
                  for key, item in data.items()}
    valid_data = {key: np.array(item)[valid_idx].tolist()
                  for key, item in data.items()}
    unlabel_data = {key: np.array(item)[unlabel_idx].tolist()
                    for key, item in data.items()}

    if args.unlabel_file is not None and os.path.isfile(args.unlabel_file):
        additional_data = load_pickle(args.unlabel_file)
        for key in unlabel_data.keys():
            unlabel_data[key] += additional_data[key]

    if args.train_file is None:
        args.train_file = TRAIN_NAME.format(args.labeled_data_size,
                                            args.valid_data_size)
    train_path = os.path.join(args.output_dir, args.train_file)
    save_pickle(train_path, train_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.train_file))
    except OSError:
        pass

    if args.valid_file is None:
        args.valid_file = VALID_NAME.format(args.labeled_data_size,
                                            args.valid_data_size)
    valid_path = os.path.join(args.output_dir, args.valid_file)
    save_pickle(valid_path, valid_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.valid_file))
    except OSError:
        pass

    if args.augment_file is None:
        args.augment_file = AUGMENT_NAME.format(args.labeled_data_size,
                                                args.valid_data_size)
    augment_path = os.path.join(args.output_dir, args.augment_file)
    save_pickle(augment_path, unlabel_data)
    try:
        os.remove(os.path.join(args.output_dir, "cache_" + args.augment_file))
    except OSError:
        pass

    args.train_file = train_path
    args.valid_file = valid_path
    args.augment_file = augment_path
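The heart of split_files is a two-stage stratified split with scikit-learn's train_test_split: first carve out the labeled training set, then split the leftover into validation and unlabeled pools. A self-contained illustration of that pattern with toy labels and sizes:

from sklearn.model_selection import train_test_split

labels = [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
indices = list(range(len(labels)))

# Stage 1: pick the labeled training indices while preserving class ratios.
train_idx, leftover_idx, _, leftover_label = train_test_split(
    indices, labels, train_size=6, stratify=labels)

# Stage 2: split the leftover into validation and "unlabeled" pools.
valid_idx, unlabel_idx, _, _ = train_test_split(
    leftover_idx, leftover_label, train_size=3, stratify=leftover_label)

print(sorted(train_idx), sorted(valid_idx), sorted(unlabel_idx))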
Example 7
def train(args):
    print("Load data...")
    data_train = load_data(args.folder + "/train_data.json")
    data_test = load_data(args.folder + "/dev_data.json")
    embeddings = load_pickle(args.embeddings)
    w2i = load_pickle(args.w2i)
    w2emb = load_pickle(args.w2emb)
    templates_emb = get_templates("../../data/templates.pkl")

    print("Do the templates...")
    templates, templates_emb = make_templates(args, templates_emb, w2emb, w2i)

    print("Now load the model...")
    emb_size = len(embeddings[0])
    model = SaliencyPrediction(emb_size * args.max_length, device).to(device)
    loss_func = nn.BCELoss()
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    rouge = Rouge()

    print("Read in train data...")
    resources = []
    embedded_resources = []
    for example in tqdm(data_train):
        get_resources(example["documents"]["comments"], resources,
                      embedded_resources, embeddings, w2i)
        num_comments = len(resources)
        get_resources(example["documents"]["fact_table"], resources,
                      embedded_resources, embeddings, w2i)
        num_facts = len(resources) - num_comments
        get_resources(example["documents"]["plot"], resources,
                      embedded_resources, embeddings, w2i)
        num_plots = len(resources) - num_comments - num_facts
        get_resources(example["documents"]["review"], resources,
                      embedded_resources, embeddings, w2i)
        num_reviews = len(resources) - num_comments - num_facts - num_plots

    print("Read in test data...")
    resources_test = []
    embedded_resources_test = []
    for example in tqdm(data_test):
        get_resources(example["documents"]["comments"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_comments = len(resources_test)
        get_resources(example["documents"]["fact_table"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_facts = len(resources_test) - num_comments
        get_resources(example["documents"]["plot"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_plots = len(resources_test) - num_comments - num_facts
        get_resources(example["documents"]["review"], resources_test,
                      embedded_resources_test, embeddings, w2i)
        num_reviews = len(resources_test) - num_comments - num_facts - num_plots

    print("Now learn.....")
    total_resources = len(embedded_resources)
    all_temps = torch.Tensor(templates_emb)

    for epoch in range(5):
        print("Epoch: " + str(epoch))
        avg_loss = 0
        for i, resource in tqdm(enumerate(embedded_resources)):
            sent = " ".join(resources[i])
            if sent == "" or sent == "eod":
                continue
            optimizer.zero_grad()

            padd_resource = resource[-args.max_length:]
            padd_resource = np.pad(padd_resource,
                                   ((0, args.max_length - len(padd_resource)),
                                    (0, 0)),
                                   "constant",
                                   constant_values=(len(w2i)))

            actual_scores = []
            all_res = torch.Tensor(padd_resource).unsqueeze(0).repeat(20, 1, 1)
            size_inp = all_res.size()
            for j, template in enumerate(templates_emb):
                try:
                    actual_score = rouge.get_scores(templates[j], " ".join(
                        resources[i]))[0]["rouge-1"]["f"]
                except Exception:
                    actual_score = 0
                actual_scores.append(actual_score)
            x1 = all_res.reshape(size_inp[0],
                                 size_inp[1] * size_inp[2]).to(device)
            x2 = all_temps.reshape(size_inp[0],
                                   size_inp[1] * size_inp[2]).to(device)
            actual_scores = torch.Tensor(actual_scores).unsqueeze(1).to(device)
            scores = model(x1, x2)
            loss = loss_func(scores, actual_scores)
            avg_loss += loss.item()
            loss.backward()
            optimizer.step()
        print("For this epoch, we found avg_loss: " +
              str(avg_loss / total_resources))
        torch.save(model,
                   "../../models/rewrite/saliency_" + str(epoch) + ".pt")

        model.eval()

        with torch.no_grad():
            total_loss = 0
            amount_res = len(resources_test)
            for i, resource in tqdm(enumerate(embedded_resources_test)):
                sent = " ".join(resources_test[i])
                if sent == "" or sent == "eod":
                    continue
                padd_resource = resource[-args.max_length:]
                padd_resource = np.pad(
                    padd_resource,
                    ((0, args.max_length - len(padd_resource)), (0, 0)),
                    "constant",
                    constant_values=(len(w2i)))
                actual_scores = []
                all_res = torch.Tensor(padd_resource).unsqueeze(0).repeat(
                    20, 1, 1)
                size_inp = all_res.size()
                for j, template in enumerate(templates_emb):
                    try:
                        actual_score = rouge.get_scores(
                            templates[j],
                            " ".join(resources_test[i]))[0]["rouge-1"]["f"]
                    except Exception:
                        actual_score = 0
                    actual_scores.append(actual_score)
                x1 = all_res.reshape(size_inp[0],
                                     size_inp[1] * size_inp[2]).to(device)
                x2 = all_temps.reshape(size_inp[0],
                                       size_inp[1] * size_inp[2]).to(device)
                actual_scores = torch.Tensor(actual_scores).unsqueeze(1).to(
                    device)

                scores = model(x1, x2)
                loss = loss_func(scores, actual_scores)
                total_loss += loss.item()
            print("Average loss is: " + str(total_loss / amount_res))
        model.train()
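The training signal in both loops is the ROUGE-1 F-score between a template and the current resource, computed with the rouge package. In isolation the call looks like this (the two toy sentences are invented):

from rouge import Rouge

rouge = Rouge()
hypothesis = "the movie was surprisingly good"
reference = "i thought the movie was good"

# get_scores returns one dict per pair; the loops above keep only
# the ROUGE-1 F-score as the regression target for BCELoss.
score = rouge.get_scores(hypothesis, reference)[0]["rouge-1"]["f"]
print(score)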