Code Example #1
import codecs
from collections import Counter

# W2V_DIR and load_obj come from the project's settings module (imported the
# same way in the later examples); is_Chinese is assumed to be a helper
# defined elsewhere in the project.
from settings import W2V_DIR, load_obj

# sequence lengths are collected at module level so they stay available to the caller
seq_lens = []


def deal_data(path):
    word2id = load_obj(W2V_DIR, "word2id_dict.pkl")
    # w2v_matrix = np.load(join(W2V_DIR, "word2vex_matrix.npy"))
    with codecs.open(path, "r", "utf-8") as f:
        data = {"distribution": [], "label": [], "token": []}
        for i, line in enumerate(f):
            data["token"].append([])
            line_list = line.strip().split()
            _sum = int(line_list[1].split(':')[1])
            # scale directly here
            _labels = [int(v.split(':')[1]) / _sum for v in line_list[2:10]]
            data["label"].append([_labels.index(max(_labels))])
            data["distribution"].append(_labels)

            result = Counter(line_list[10:])
            for w in line_list[10:]:
                # if it's not Chinese, ignore it
                if not is_Chinese(w):
                    continue
                '''
                if result[w] < 2:
                    continue
                '''
                try:
                    data["token"][i].append(word2id[w])
                # if a word is not in pre-trained embedding, ignore it
                except KeyError:
                    continue
            # record the final token-sequence length once per line
            seq_lens.append(len(data["token"][i]))
    return data
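
A minimal usage sketch for deal_data, assuming a corpus file in the format the parser expects (field index 1 carries the total vote count, indices 2 through 9 the eight per-emotion counts, and the remaining fields the tokenized text); the file name below is a placeholder:

if __name__ == '__main__':
    # "train_raw.txt" is a placeholder path for illustration only
    data = deal_data("train_raw.txt")
    # data["token"]: word-id sequences, data["label"]: index of the dominant
    # emotion per line, data["distribution"]: normalized 8-way emotion votes
    print(len(data["token"]), data["label"][0], data["distribution"][0])
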
Code Example #2
    def __init__(self, input_dir, output_dir):
        # Assign the model's weights; the values come from the parameter-tuning file.
        # Some values have many digits due to floating-point precision.
        self.num = 8
        self.a = 0.0001
        self.b = 1
        self.c = 1.5
        self.d = 1.5
        self.e = 1.5

        # Load the matrices

        # probability of a single pinyin mapping to a Chinese character
        self.pinyin2hanzi_matrix = settings.load_obj(
            settings.PINYIN_HANZI_DIR, "new_pinyin2hanzi_freq.dat")

        # probability of a character-to-character transition in continuous text
        self.two_gram_continuous_matrix = settings.load_obj(
            settings.COUNT_DIR, "new_2_gram_continuous_freq_matrix.dat")

        # probability of a character-to-character transition within 2-gram words
        self.two_gram_matrix = settings.load_obj(settings.COUNT_DIR,
                                                 "new_2_gram_freq_matrix.dat")

        # probability that the first two characters of a 3-gram word transition to the third
        self.three_gram_matrix = settings.load_obj(
            settings.COUNT_DIR, "new_3_gram_freq_matrix.dat")

        # probability that the first three characters of a 4-gram word transition to the fourth
        self.four_gram_matrix = settings.load_obj(
            settings.COUNT_DIR, "new_4_gram_freq_matrix.dat")

        # input file directory
        self.__input_dir = input_dir
        # output file directory
        self.__output_dir = output_dir

        # whole-sentence probability for each candidate, initialized to 1
        self.prob = {}
        for i in range(self.num):
            self.prob[i] = 1
        # selected character sequence for each candidate, initialized to empty
        self.seq = {}
        for i in range(self.num):
            self.seq[i] = ''
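
settings.load_obj (and the store_obj used in a later example) is not shown in these excerpts; from the way it is called with a directory and a file name, it is presumably a thin pickle wrapper along these lines. This is a sketch of the assumed helper, not the project's actual implementation:

import pickle
from os.path import join


def load_obj(directory, name):
    # read back a pickled object (frequency matrix, word2id dict, dataset, ...)
    with open(join(directory, name), "rb") as f:
        return pickle.load(f)


def store_obj(directory, name, obj):
    # persist an object so load_obj can restore it later
    with open(join(directory, name), "wb") as f:
        pickle.dump(obj, f)
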
Code Example #3
import torch
import torch.utils.data as Data
import numpy as np
from scipy.stats import pearsonr
import os
from os.path import join
from settings import W2V_DIR, DATA_DIR, load_obj, EMB_DIM, FILTER_NUM, FILTER_SIZES, DROPOUT, TARGET_SIZE, BATCH_SIZE, CNN_HISTORY_PATH
from sklearn.metrics import f1_score


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    setup_seed(1)
    device = torch.device("cpu")
    train_data = load_obj(DATA_DIR, "train_CNN.pkl")
    test_data = load_obj(DATA_DIR, "test_CNN.pkl")

    y_train = torch.tensor(np.array(train_data["label"]), dtype=torch.long)
    X_train = torch.tensor(np.array(train_data["token"]), dtype=torch.long)
    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_data["token"]), dtype=torch.long)
    test_distr = test_data["distribution"]

    sub_train_dataset = Data.TensorDataset(X_train, y_train)
    sub_train_loader = Data.DataLoader(
        dataset=sub_train_dataset,  # torch TensorDataset format
        batch_size=BATCH_SIZE,  # mini batch size
        shuffle=True,  # whether to shuffle the data (shuffling is preferable)
        drop_last=True,
    )
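
The excerpt stops before the training loop. As a sketch of how the DataLoader above would be consumed, the snippet below uses a deliberately tiny stand-in classifier; the real project trains a CNN built from EMB_DIM, FILTER_NUM, FILTER_SIZES, DROPOUT and TARGET_SIZE, and the vocabulary size here is an invented placeholder:

    import torch.nn as nn

    class ToyClassifier(nn.Module):
        # stand-in model: embedding lookup + mean pooling + linear layer
        def __init__(self, vocab_size, emb_dim, target_size):
            super().__init__()
            self.emb = nn.Embedding(vocab_size, emb_dim)
            self.fc = nn.Linear(emb_dim, target_size)

        def forward(self, x):                        # x: (batch, seq_len) word ids
            return self.fc(self.emb(x).mean(dim=1))  # (batch, target_size) logits

    model = ToyClassifier(vocab_size=50000, emb_dim=EMB_DIM, target_size=TARGET_SIZE)
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()
    for epoch in range(10):                          # arbitrary epoch count for the sketch
        for batch_x, batch_y in sub_train_loader:
            optimizer.zero_grad()
            loss = criterion(model(batch_x), batch_y.squeeze(1))
            loss.backward()
            optimizer.step()
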
Code Example #4
import torch
import torch.utils.data as Data
import numpy as np
from scipy.stats import pearsonr
from os.path import join
from settings import W2V_DIR, DATA_DIR, CHECKPOINT_DIR, load_obj
from sklearn.metrics import f1_score


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    setup_seed(1)
    device = torch.device("cpu")
    test_data = load_obj(DATA_DIR, "test_MLP.pkl")
    test_input = load_obj(DATA_DIR, "average_test_MLP.pkl")
    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_input), dtype=torch.float)
    test_distr = test_data["distribution"]

    sub_test_dataset = Data.TensorDataset(X_test, y_test)
    sub_test_loader = Data.DataLoader(
        dataset=sub_test_dataset,  # torch TensorDataset format
        batch_size=len(sub_test_dataset),  # all test data
    )

    weights = np.load(join(W2V_DIR, "word2vex_matrix.npy"))
    model = torch.load(join(CHECKPOINT_DIR, "MLP_model.ckpt"),
                       map_location='cpu')
    print(model)
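
The excerpt ends right after the model is loaded; given the f1_score import and the single full-size test batch above, the evaluation step plausibly looks something like this (a sketch, not the original code):

    model.eval()
    with torch.no_grad():
        for batch_x, batch_y in sub_test_loader:   # one batch with all test data
            preds = torch.argmax(model(batch_x), dim=1)
            # macro-averaged F1 over the emotion classes
            print(f1_score(batch_y.squeeze(1).numpy(), preds.numpy(),
                           average="macro"))
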
Code Example #5
import torch
import numpy as np
import os
from os.path import join
from settings import emotions, emo2id, DATA_DIR, TRAIN_DATA_NAME, BALANCED_TRAIN_DATA_NAME, CHECKPOINT_DIR, \
    store_obj, load_obj, EMB_DIM, FILTER_NUM, FILTER_SIZES, DROPOUT, TARGET_SIZE, BATCH_SIZE, CNN_HISTORY_PATH

if __name__ == '__main__':
    train_data = load_obj(DATA_DIR, TRAIN_DATA_NAME)
    # bucket the training samples by emotion label
    samples = {
        "感动": [],
        "同情": [],
        "无聊": [],
        "愤怒": [],
        "搞笑": [],
        "难过": [],
        "新奇": [],
        "温馨": []
    }
    for i, emo in enumerate(train_data["label"]):
        samples[emotions[int(emo[0])]].append(train_data["token"][i])

    # find the size of the largest class
    max_num = 0
    for (k, v) in samples.items():
        if max_num < len(v):
            max_num = len(v)
    print(max_num)

    # oversample: each class is replicated to roughly match the largest class
    for (k, v) in samples.items():
        copy_num = max_num // len(v)
        v_copy = v.copy()
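
The excerpt is cut off inside the loop; given the store_obj and BALANCED_TRAIN_DATA_NAME imports, the remainder presumably replicates each class's samples and writes out a balanced training set. A hypothetical continuation along those lines, not the project's actual code:

        # hypothetical continuation: repeat this class's samples until it
        # roughly matches the largest class
        for _ in range(copy_num - 1):
            v.extend(v_copy)

    # flatten the balanced buckets back into token/label lists and persist them
    balanced = {"token": [], "label": []}
    for k, v in samples.items():
        balanced["token"].extend(v)
        balanced["label"].extend([[emo2id[k]] for _ in v])
    store_obj(DATA_DIR, BALANCED_TRAIN_DATA_NAME, balanced)
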
Code Example #6
import torch
import torch.utils.data as Data
import numpy as np
from scipy.stats import pearsonr
from os.path import join
from settings import W2V_DIR, DATA_DIR, CHECKPOINT_DIR, load_obj
from sklearn.metrics import f1_score


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    setup_seed(1)
    device = torch.device("cpu")
    test_data = load_obj(DATA_DIR, "test_RNN.pkl")

    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_data["token"]), dtype=torch.long)
    test_distr = test_data["distribution"]

    sub_test_dataset = Data.TensorDataset(X_test, y_test)
    sub_test_loader = Data.DataLoader(
        dataset=sub_test_dataset,  # torch TensorDataset format
        batch_size=len(sub_test_dataset),  # all test data
    )

    weights = np.load(join(W2V_DIR, "word2vex_matrix.npy"))

    model = torch.load(join(CHECKPOINT_DIR, "LSTM_model.ckpt"),
                       map_location='cpu')
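
The pearsonr import and test_distr suggest the script goes on to compare the model's predicted emotion distribution with the annotated one; a sketch of that comparison, assuming the loaded model outputs one logit per emotion class:

    model.eval()
    with torch.no_grad():
        for batch_x, batch_y in sub_test_loader:   # one batch with all test data
            probs = torch.softmax(model(batch_x), dim=1).numpy()
            # mean Pearson correlation between predicted and annotated
            # 8-way emotion distributions
            corr = np.mean([pearsonr(p, d)[0]
                            for p, d in zip(probs, test_distr)])
            print("mean pearson r:", corr)
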
Code Example #7
import torch
import torch.utils.data as Data
import numpy as np
import os
from os.path import join
from settings import W2V_DIR, DATA_DIR, CHECKPOINT_DIR, load_obj, EMB_DIM, HIDDEN_DIM, DROPOUT, TARGET_SIZE, BATCH_SIZE, RNN_HISTORY_PATH
from sklearn.metrics import f1_score


def setup_seed(seed):
    torch.manual_seed(seed)
    np.random.seed(seed)


if __name__ == '__main__':
    # set the random seed
    setup_seed(1)
    device = torch.device("cpu")
    balanced_train_data = load_obj(DATA_DIR, "train_RNN.pkl")
    test_data = load_obj(DATA_DIR, "test_RNN.pkl")

    y_train = torch.tensor(np.array(balanced_train_data["label"]),
                           dtype=torch.long)
    X_train = torch.tensor(np.array(balanced_train_data["token"]),
                           dtype=torch.long)
    y_test = torch.tensor(np.array(test_data["label"]), dtype=torch.long)
    X_test = torch.tensor(np.array(test_data["token"]), dtype=torch.long)
    test_distr = test_data["distribution"]

    sub_train_dataset = Data.TensorDataset(X_train, y_train)
    sub_train_loader = Data.DataLoader(
        dataset=sub_train_dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,