Example #1
    def __init__(self, train_path, test_path, learning_rate=0.001):
        self.train_set = loader.TextDataset(train_path)
        self.test_set = loader.TextDataset(test_path, word_vocab=self.train_set.word2idx,
                                           pos_vocab=self.train_set.pos2idx)

        self.labels = {
            'country': 0, 'team': 1,
            'starring': 2, 'director': 3,
            'child': 4, 'successor': 5
        }

        args = {
            'pos_embed_dim': 256,
            'word_embed_dim': 256,
            'words_num': len(self.train_set.word2idx),
            'pos_num': len(self.train_set.pos2idx),
            'kernel_num': 32,
            'kernel_sizes': [2, 3, 4, 5],
            'dropout': 0.2,
            'static': False
        }
        batch_size = 4
        self.cnn_model = model.CNN_Text(args)
        self.learningRate = learning_rate
        self.train_loader = torch.utils.data.DataLoader(self.train_set, batch_size=batch_size,
                                                        shuffle=True, num_workers=4)
        self.test_loader = torch.utils.data.DataLoader(self.test_set, batch_size=1,
                                                       shuffle=False, num_workers=4)
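
The fragment stops before any training method. A hedged sketch of how this trainer might be driven; `fit` is hypothetical (not in the original class), and it assumes the loader yields (word_ids, pos_ids, label) batches and that CNN_Text returns class logits:

    def fit(self, epochs):
        # hypothetical: simple training loop with cross-entropy loss
        optimizer = torch.optim.Adam(self.cnn_model.parameters(), lr=self.learningRate)
        criterion = torch.nn.CrossEntropyLoss()
        self.cnn_model.train()
        for epoch in range(epochs):
            for words, pos, labels in self.train_loader:
                optimizer.zero_grad()
                loss = criterion(self.cnn_model(words, pos), labels)
                loss.backward()
                optimizer.step()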
Example #2
def text_cnn_train(args, train_path):
    # load data
    print("\nLoading data...")
    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    #train_iter, dev_iter = mr(text_field, label_field, device=-1, repeat=False)
    train_iter, dev_iter = msw_text(text_field,
                                    label_field,
                                    train_path,
                                    device=-1,
                                    repeat=False)

    # batch = next(iter(train_iter))
    # print(type(batch))
    # print(batch.text)

    # train_iter, dev_iter, test_iter = sst(text_field, label_field, device=-1, repeat=False)

    # update args and print
    args.embed_num = len(text_field.vocab)  # accessing .vocab gives the word set built for the field (author's note: "a guess for now")
    args.class_num = len(label_field.vocab) - 1
    args.cuda = torch.cuda.is_available()  # simplified from (not False) and ...
    kerns = '3,4,5'
    args.kernel_sizes = [int(k) for k in kerns.split(',')]
    re_train_path = train_path.split('/')[1][:-4]  # second path segment minus a 4-char extension such as '.txt'
    save_path = os.path.join(args.save_dir, re_train_path)

    print("\nParameters:")
    for attr, value in sorted(args.__dict__.items()):
        print("\t{}={}".format(attr.upper(), value))

    # train or predict
    if args.baye:
        print('Setting up the optimization')
        myBopt = BayesianOptimization(f=baye,
                                      domain=domain,
                                      initial_design_numdata=5)
        print('Starting optimization')
        myBopt.run_optimization(max_iter=10)
        print('Optimization result:', myBopt.x_opt)
        print('Parameters from the best hyperparameter result:', args.lr, args.dropout)

    else:
        try:
            cnn = model.CNN_Text(args)
            if args.snapshot is not None:
                print('\nLoading model from {}...'.format(args.snapshot))
                cnn.load_state_dict(torch.load(args.snapshot))

            if args.cuda:
                torch.cuda.set_device(args.device)
                cnn = cnn.cuda()

            train.train(train_iter, dev_iter, cnn, args, save_path)
        except KeyboardInterrupt:
            print('\n' + '-' * 89)
            print('Exiting from training early')
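
`domain` and the objective `baye` are referenced above but not defined in this snippet. A minimal sketch of what they might look like, following GPyOpt's conventions (the bounds, the retraining inside `baye`, and `train.train`'s return value are all assumptions):

domain = [
    {'name': 'lr', 'type': 'continuous', 'domain': (1e-4, 1e-1)},
    {'name': 'dropout', 'type': 'continuous', 'domain': (0.1, 0.7)},
]

def baye(x):
    # GPyOpt passes candidate points as a 2-D array, one row per point
    args.lr, args.dropout = float(x[:, 0]), float(x[:, 1])
    cnn = model.CNN_Text(args)
    dev_error = train.train(train_iter, dev_iter, cnn, args, save_path)  # assumed to return the dev error
    return dev_error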
Example #3
def main():
    text_field = data.Field(lower=True)
    label_field = data.Field(sequential=False)
    logging.critical('starting to load data')
    train_iter, dev_iter, total_steps = vulgar(text_field,
                                               label_field,
                                               args,
                                               device=-1,
                                               repeat=False)
    if args.load_vec:
        if args.load_vec == 'hi':
            args.load_vec = 'model/hi_1105_ml_100.w2v'

        logging.critical('start loading word2vec')
        embeddings_file = args.load_vec
        vectors = vocab.Vectors(embeddings_file)
        text_field.vocab.set_vectors(vectors.stoi, vectors.vectors,
                                     vectors.dim)
        embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(text_field.vocab.vectors))
        args.embed_dim = vectors.dim
        embedding.weight.requires_grad = True
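        # note: nn.Embedding.from_pretrained freezes its weights by default,
        # hence the explicit flag above; from_pretrained(..., freeze=False)
        # would achieve the same thing in one step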
        # logging.critical(embedding.weight.requires_grad)
    else:
        # update args and print
        args.embed_num = len(text_field.vocab)
        embedding = nn.Embedding(args.embed_num, args.embed_dim)

    args.class_num = len(label_field.vocab) - 1  # the -1 removes the <unk> token
    args.cuda = (not args.no_cuda) and torch.cuda.is_available()
    del args.no_cuda
    args.kernel_sizes = [int(k)
                         for k in args.kernel_sizes.split(',')]  # hyphens in CLI flags become underscores in args
    args.save_dir = os.path.join(
        args.save_dir,
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    logging.critical('Parameters:')
    for attr, value in sorted(args.__dict__.items()):
        logging.critical("\t{}={}".format(attr.upper(), value))
    # model
    cnn = model.CNN_Text(args, embedding)
    if args.snapshot is not None:
        logging.critical('\nLoading model from {}...'.format(args.snapshot))
        cnn.load_state_dict(torch.load(args.snapshot))
    if args.cuda:
        torch.cuda.set_device(args.device)
        cnn = cnn.cuda()

    try:
        train.train(train_iter, dev_iter, cnn, args, total_steps)
    except KeyboardInterrupt:
        print('\n' + '-' * 89)
        print('Exiting from training early')
Example #4
def main():
    word_id = create_vocab(args.training_data_path, args.vocab_path, True)
    #label_id = create_vocab(args.training_data_path, args.vocab_tag_path)
    args.class_num = 3
    #train, test = load_data(args.training_data_path, word_id, label_id)
    train1, test1 = load_data1(args.training_data_path, word_id)
    #train1, test1 = load_data_bert(args.training_data_path, word_id)
    TrainX, TrainY = zip(*train1)
    testX, testY = zip(*test1)
    cnn = model.CNN_Text(args).cuda()
    criterion = torch.nn.CrossEntropyLoss()
    #optimizer = torch.optim.SGD(cnn.parameters(), lr=0.001, momentum=0.9)
    opt_Adam = torch.optim.Adam(cnn.parameters(), lr=args.lr, betas=(0.9, 0.99))

    for epoch in range(1, args.epoches):
        print("epoch", epoch)
        batch_iter = batch_helper(TrainX, TrainY, args.batch_size)
        for trainx, trainy in batch_iter:
            #print("trainy length", len(trainy))  # batch size
            input_data = torch.autograd.Variable(torch.LongTensor(trainx)).cuda()
            output_labels = torch.autograd.Variable(torch.LongTensor(trainy)).cuda()
            output_labels = output_labels.squeeze()
            #print("vocab_size", args.vocab_size)
            cnn_outputs = cnn(input_data)
            # the original saved the state dict here on every batch; saving once per epoch (below) is enough
            opt_Adam.zero_grad()
            loss = criterion(cnn_outputs, output_labels)
            loss.backward()
            opt_Adam.step()
            # for param_tensor in cnn.state_dict():
            #     print(param_tensor, "\t", cnn.state_dict()[param_tensor].size())
            # for var_name in opt_Adam.state_dict():
            #     print(var_name, "\t", opt_Adam.state_dict()[var_name])
            eva(cnn_outputs, output_labels, args.batch_size)
        torch.save(cnn.state_dict(), args.parameters_path)
        run_val(testX, testY, cnn)
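
`batch_helper`, `eva`, and `run_val` come from elsewhere in this project. A minimal sketch of what `batch_helper` plausibly does (its behavior is an assumption; only full batches are yielded, since `eva` receives the fixed batch size):

def batch_helper(xs, ys, batch_size):
    # yield successive (inputs, labels) mini-batches of equal length
    for i in range(0, len(xs) - batch_size + 1, batch_size):
        yield xs[i:i + batch_size], ys[i:i + batch_size]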
Example #5
        args.optimizer = args.char_optimizer
        if args.two_ch:
            V_bd = len(bound_field.vocab)
        else:
            V_bd = 1
        print("\nParameters:")
        for attr, value in sorted(args.__dict__.items()):
            print("  {}={}".format(attr.upper(), value))

        if args.snapshot is None and args.num_experts == 0:
            char_cnn = model.CNN_Text(class_num=args.class_num,
                                      kernel_num=args.char_kernel_num,
                                      kernel_sizes=args.char_kernel_sizes,
                                      embed_num=len(text_field.vocab),
                                      embed2_num=V_bd,
                                      embed_dim=args.char_embed_dim,
                                      dropout=args.char_dropout,
                                      conv_init='uniform',
                                      fc_init='normal',
                                      static=False,
                                      two_ch=args.two_ch,
                                      vectors=None)
        elif args.snapshot is None and args.num_experts > 0:
            char_cnn = [
                model.CNN_Text(class_num=args.class_num,
                               kernel_num=args.char_kernel_num,
                               kernel_sizes=args.char_kernel_sizes,
                               embed_num=len(text_field.vocab),
                               embed2_num=V_bd,
                               embed_dim=args.char_embed_dim,
                               dropout=args.char_dropout,
                               conv_init='uniform',
                               fc_init='normal',
                               static=False,
                               two_ch=args.two_ch,
                               vectors=None)
                for _ in range(args.num_experts)
            ]
Example #6
#print(torch.sum(torch.sum(users_vec,1)!=0))
args.embed_num = len(text_field.vocab)
args.class_num = len(label_field.vocab) - 1
args.embed_num_users = len(user_field.vocab)
args.cuda = (not args.no_cuda) and torch.cuda.is_available()
del args.no_cuda
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

# print("\nParameters:")
# for attr, value in sorted(args.__dict__.items()):
#     print("\t{}={}".format(attr.upper(), value))

# model
args.embed_dim_users = 100
cnn = model.CNN_Text(args.embed_num_users, args.embed_num, args.embed_dim_users,
                     args.embed_dim, args.class_num, args.kernel_num, args.kernel_sizes,
                     args.conv_layer, args.dropout, args.pretrained_embed_words,
                     args.pretrained_embed_users, words_vec, users_vec)

if args.snapshot is not None:
    print('\nLoading model from {}...'.format(args.snapshot))
    cnn.load_state_dict(torch.load(args.snapshot))

if args.cuda:
    torch.cuda.set_device(args.device)
    cnn = cnn.cuda()

# train or predict
if args.predict is not None:
    label = train.predict(args.predict, cnn, text_field, label_field, args.cuda)
    print('\n[Text]  {}\n[Label] {}\n'.format(args.predict, label))
Example #7
        emb_num = len(text_field.vocab)
        class_num = len(label_field.vocab) - 1
        emb_num_u = len(user_field.vocab)
        cuda = (not args.no_cuda) and torch.cuda.is_available()
        ker_siz = [int(k) for k in kernel_sizes.split(',')]
        sav_dir = os.path.join(
            save_dir,
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

        # print("\nParameters:")
        # for attr, value in sorted(args.__dict__.items()):
        #     print("\t{}={}".format(attr.upper(), value))

        # model
        cnn = model.CNN_Text(emb_num_u, emb_num, embed_dim_users, embed_dim,
                             class_num, kernel_num, ker_siz, conv_layer,
                             dropout, args.pretrained_embed_words,
                             args.pretrained_embed_users, words_vec, users_vec)

        if args.snapshot is not None:
            print('\nLoading model from {}...'.format(args.snapshot))
            cnn.load_state_dict(torch.load(args.snapshot))

        if cuda:
            torch.cuda.set_device(args.device)
            cnn = cnn.cuda()

        # train or predict
        if args.predict is not None:
            label = train.predict(args.predict, cnn, text_field, label_field,
                                  user_field, args.cuda)
            print('\n[Text]  {}\n[Label] {}\n'.format(args.predict, label))
Example #8
def main(path=None):
    
    if path is None:
        args = read_arguments()
    else:
        args = import_arguments()

    # MOVED
    # Update some arguments
    args.cuda = (not args.no_cuda) and torch.cuda.is_available()
    del args.no_cuda
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
    args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

    # TODO
    # Remove placeholders
    #args.embeddings = "../../embedding_bias/data/embeddings/ft_embeddings.300.vec"
    #args.embeddings = None
    #args.embeddings = "../glove-twitter-100"
    #args.data_path = "~/embedding_bias/data/hateval2019/"
    #args.data_path = None
    #args.data_name = "task1_es_"
    #args.data_name = "task2_es_"
    #args.results_path = "../../embedding_bias/results/cnn/task1.txt"

    ### NEW
    def custom_tokenizer(text, tokenizer=TweetTokenizer(), max_filter_size=max(args.kernel_sizes)):
        # pad short inputs so no sequence is shorter than the widest conv kernel
        token = tokenizer.tokenize(text)
        #print(token)
        if len(token) < max_filter_size:
            for i in range(0, max_filter_size - len(token)):
                token.append('<PAD>')
        return token

    # load data
    print("\nLoading data...")
    text_field = data.Field(lower=True, tokenize=custom_tokenizer)
    label_field = data.Field(sequential=False)

    ### NEW
    if args.data_path is None:
        train_iter, dev_iter = mr(text_field, label_field, args, device=-1, repeat=False)
    else:
        train_iter, dev_iter, test_iter = load_dataset(text_field, label_field, args, device=-1, repeat=False)

    print(type(test_iter))  # note: test_iter only exists on the args.data_path branch above

    #### NEW
    if args.embeddings is not None:
        #text_field.build_vocab(args.embeddings)
        #vectors = vocab.GloVe(args.embeddings)
        vectors = vocab.Vectors(name=args.embeddings, cache='./cnn/')
        text_field.vocab.set_vectors(vectors.stoi, vectors.vectors, vectors.dim)
        args.text_field = text_field
        vec = torch.FloatTensor(args.text_field.vocab.vectors).shape[-1]
        print("\n\n", len(vectors), "\n\n")
        args.embed_dim = vec

    # update args and print
    args.embed_num = len(text_field.vocab)
    args.class_num = len(label_field.vocab) - 1
    
    print("\nParameters:")
    for attr, value in sorted(args.__dict__.items()):
        print("\t{}={}".format(attr.upper(), value))
    
    
            
    
    # train or predict
    if args.test:
        labels = ["HS","TR","AG"]
        models = [model.CNN_Text(args) for _ in labels]
        for i,M in enumerate(models):
            if args.snapshot is not None:
                print('\nLoading model from {}...'.format(args.snapshot))
                M.load_state_dict(torch.load(args.snapshot+labels[i]+".pt"))
            if args.cuda:
                torch.cuda.set_device(args.device)
                M.cuda()
        train.test(test_iter, models, args)

        #try:
        #    train.eval(test_iter, cnn, args)
        #except Exception as e:
        #    print("\nSorry. The test dataset doesn't exist.\n")
    else:
        print()
        try:
            # model
            cnn = model.CNN_Text(args)
            if args.snapshot is not None:
                print('\nLoading model from {}...'.format(args.snapshot))
                cnn.load_state_dict(torch.load(args.snapshot))
            
            if args.cuda:
                torch.cuda.set_device(args.device)
                cnn = cnn.cuda()
            train.train(train_iter, dev_iter, cnn, args)
        except KeyboardInterrupt:
            print('\n' + '-' * 89)
            print('Exiting from training early')
Example #9
)  # the label Field describes how the label data is preprocessed (no tokenization)
train_iter, val_iter = mr(text_field, label_field)

# update parameters and print
embed_num = len(text_field.vocab)
class_num = len(label_field.vocab) - 1
cuda = (not cuda) and torch.cuda.is_available()  # `cuda` initially holds a no-cuda flag (cf. args.no_cuda in other examples)

save_dir = os.path.join(
    save_dir,
    datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
)  # make a directory under snapshot/ named after the current date and time

# model- load from model.py
cnn = model.CNN_Text(embed_num, embed_dim, class_num, kernel_num, static)
if snapshot is not None:
    print('\nLoading model from {}...'.format(snapshot))
    cnn.load_state_dict(torch.load(snapshot))

# train or predict
if predict is not None:
    label = train.predict(predict, cnn, text_field, label_field)
    print('\n[Text]  {}\n[Label] {}\n'.format(predict, label))
elif test:
    try:
        train.eval(test_iter, cnn)
    except Exception as e:
        print("\nSorry. The test dataset doesn't  exist.\n")
else:
    print()
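
`train.predict` is used by several of these examples but defined elsewhere. A minimal sketch of what it plausibly does, assuming a single-sentence input and the usual torchtext fields (the +1 offset mirrors the `len(vocab) - 1` class count that skips <unk>; all names here are assumptions):

def predict(text, cnn, text_field, label_field, cuda_flag=False):
    # numericalize one sentence and return the argmax label string
    cnn.eval()
    tokens = text_field.preprocess(text)
    x = torch.LongTensor([[text_field.vocab.stoi[t] for t in tokens]])
    if cuda_flag:
        x = x.cuda()
    logits = cnn(x)
    return label_field.vocab.itos[logits.argmax(dim=1).item() + 1]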
Example #10
np.random.seed(0)
torch.manual_seed(0)
random.seed(0)

embedding_length = 17

fake, real = data_processor.loadHeadlines()
# f_real = open("Data/clean_real.txt")
# f_fake = open("Data/clean_fake.txt")
#
# real = [str.split(line) for line in f_real]
# fake = [str.split(line) for line in f_fake]

train, val, test, train_labels, val_labels, test_labels, embedding, vocab = classifier.prep_data(
    fake, real, embedding_length)

train = train.transpose(0, 1)
test = test.transpose(0, 1)
val = val.transpose(0, 1)
train_labels = Variable(torch.FloatTensor(train_labels))

model = model.CNN_Text(embedding, 1, embedding_length, 3)  # note: this rebinds the imported `model` module
#model = model.CNN_Text2(embedding)
model = classifier.train(model, train, train_labels, val, val_labels)

model.eval()  # eval() disables dropout in all submodules; setting model.training directly would not recurse

torch.save(model.state_dict(), 'model_state_dict')

print("test set acc: {}".format(classifier.testNN(model, test, test_labels)))
Example #11
                    port=args.port,
                    port_out=args.port_out,
                    check_version=False)
    mydata = data.MyTaskProcessor()
    device = torch.device("cuda" if (torch.cuda.is_available()) else "cpu")
    print('training device: ' + str(device))
    # update args
    args.class_num = len(mydata.get_labels())  # 4
    # define text cnn model
    if args.kernel == 2:
        kernel_size = [2]
    elif args.kernel == 3:
        kernel_size = [2, 3]
    elif args.kernel == 4:
        kernel_size = [2, 3, 4]
    else:
        # guard: without this, kernel_size would be undefined below
        raise ValueError('unsupported kernel setting: {}'.format(args.kernel))
    cnn = model.CNN_Text(1, args.kernel_num, kernel_size, args.class_num,
                         args.dropout).to(device)  #need to modify

    # training
    if args.training:
        # open eval_result.txt
        with open(args.eval_result, 'w') as f:
            now = datetime.datetime.now()
            f.write('[' + str(now) + ']\n')
            f.write('*' * 30 + '\n')
        # get eval data&label
        dev, dev_label = mydata.get_dev_examples(args.data_dir, args.batch, bc,
                                                 args.dev_file)
        print('start training')
        loss_func = nn.CrossEntropyLoss(weight=torch.from_numpy(
            np.array([1.05740498, 1, 1.10068213, 1.40410721])).float(),
                                        size_average=True)  # size_average is deprecated; reduction='mean' is the modern spelling
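
The hard-coded loss weights above look like per-class weights for an imbalanced 4-class problem. A hedged sketch of one common way to derive such weights from label counts (inverse frequency; the original constants may come from a different scheme):

import numpy as np

def inverse_frequency_weights(labels, num_classes):
    # weight each class by total / (num_classes * count): rarer classes weigh more
    counts = np.bincount(labels, minlength=num_classes).astype(float)
    return counts.sum() / (num_classes * counts)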
Example #12
def main(path=None):

    if path is None:
        args = read_arguments()
    else:
        args = import_arguments()

    #### MOVED ####
    # Update some arguments
    args.cuda = (not args.no_cuda) and torch.cuda.is_available()
    del args.no_cuda
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
    args.save_dir = os.path.join(
        args.save_dir,
        datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

    #### MOVED ####

    #### NEW ####
    # This uses the same tokenizer that we used when preprocessing the data (NLTK for tweets)
    def custom_tokenizer(text,
                         tokenizer=TweetTokenizer(),
                         max_filter_size=max(args.kernel_sizes)):
        token = tokenizer.tokenize(text)
        if len(token) < max_filter_size:
            for i in range(0, max_filter_size - len(token)):
                token.append('<PAD>')
        return token
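        # e.g. (hedged illustration; exact tokens depend on TweetTokenizer):
        #   custom_tokenizer("so good") -> ['so', 'good', '<PAD>', '<PAD>', '<PAD>']
        # with kernel sizes up to 5, so no sequence is shorter than the widest filter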

    #### NEW ####

    # load data
    print("\nLoading data...")
    text_field = data.Field(lower=True, tokenize=custom_tokenizer)
    label_field = data.Field(sequential=False)

    #### NEW ####
    # Loads the relevant dataset
    if args.data_path is None:
        train_iter, dev_iter = mr(text_field,
                                  label_field,
                                  args,
                                  device=-1,
                                  repeat=False)
    else:
        train_iter, dev_iter, test_iter = load_dataset(text_field,
                                                       label_field,
                                                       args,
                                                       device=-1,
                                                       repeat=False)
    #### NEW ####

    #### NEW ####
    # This part of the code loads the pretrained embeddings
    if args.embeddings is not None:
        vectors = vocab.Vectors(name=args.embeddings, cache='./cnn/')
        text_field.vocab.set_vectors(vectors.stoi, vectors.vectors,
                                     vectors.dim)
        args.text_field = text_field
        vec = torch.FloatTensor(args.text_field.vocab.vectors).shape[-1]
        args.embed_dim = vec
    #### NEW ####

    # update args and print
    args.embed_num = len(text_field.vocab)
    args.class_num = len(label_field.vocab) - 1

    print("\nParameters:")
    for attr, value in sorted(args.__dict__.items()):
        print("\t{}={}".format(attr.upper(), value))

    # model
    cnn = model.CNN_Text(args)
    if args.snapshot is not None:
        print('\nLoading model from {}...'.format(args.snapshot))
        cnn.load_state_dict(torch.load(args.snapshot))

    if args.cuda:
        torch.cuda.set_device(args.device)
        cnn = cnn.cuda()

    # train or predict
    if args.predict is not None:
        label = train.predict(args.predict, cnn, text_field, label_field,
                              args.cuda)
        print('\n[Text]  {}\n[Label] {}\n'.format(args.predict, label))
    elif args.test:
        #try:
        train.eval(test_iter, cnn, args)
        #except Exception as e:
        #    print("\nSorry. The test dataset doesn't exist.\n")
    else:
        print()
        try:
            train.train(train_iter, dev_iter, cnn, args)
        except KeyboardInterrupt:
            print('\n' + '-' * 89)
            print('Exiting from training early')
Example #13
                    class_num=359,
                    kernel_num=400,
                    char_kernel_sizes=[2, 3, 4, 5, 6],
                    word_kernel_sizes=[3, 4, 5],
                    ortho_init=False,
                    dropout=0.5,
                    static=args.static,
                    word_vector='w2v',
                    cuda=args.yes_cuda
                    and torch.cuda.is_available())  # ; del args.no_cuda

phn_mdl_path = os.path.join(args.model, '*')
phn_mdl_files = glob.glob(phn_mdl_path)
phn_mdls = []
for i in range(len(phn_mdl_files)):
    phn_mdls.append(model.CNN_Text(phn_args, 'char'))
    phn_mdls[i].load_state_dict(
        torch.load(phn_mdl_files[i], map_location=lambda stor, loc: stor))
#word_args = CNN_Args(embed_num = len(word_field.vocab), ## (should be 1715)
#                     char_embed_dim = 16,
#                     word_embed_dim = 300,
#                     class_num = 359,
#                     kernel_num = 300,
#                     char_kernel_sizes = [2,3,4,5,6],
#                     word_kernel_sizes = [3,4,5],
#                     ortho_init = False,
#                     dropout = 0.5,
#                     static = False,
#                     word_vector = 'w2v')
#word_mdl_path = os.path.join(conf['word_cnn_dir'], '*')
#word_mdl_files = glob.glob(word_mdl_path)
Example #14
    def __init__(self, master):
        ''' GUI layout '''
        frame = Frame(master)
        frame.pack()

        self.frame = frame
        self.text = ''
        self.isPreprocess = 0

        # global binding
        self.frame.bind("<Button-3>", self.delete)

        # fonts
        ft = tkFont.Font(family='Fixdsys', size=13, weight=tkFont.BOLD)
        ft_big = tkFont.Font(family='Fixdsys', size=21, weight=tkFont.BOLD)
        ft_middle = tkFont.Font(family='Fixdsys', size=18, weight=tkFont.BOLD)

        # label: "Please enter the resume URL:"
        self.label1 = Label(frame, text='请输入简历链接:', font=ft, pady=5)
        #self.label1.pack(side=TOP, anchor='w')
        self.label1.grid(row=0, column=0)

        # entry (resume URL)
        self.url = StringVar()
        self.urlEntry = Entry(frame, textvariable=self.url, font=ft, width=45)
        self.urlEntry.bind("<Return>", self.input_url)
        self.urlEntry.bind("<Button-3>", self.delete)
        #self.urlEntry.pack(side=TOP, anchor='w')
        self.urlEntry.grid(row=1, column=0)
        
        # label: "Or enter the resume text:"
        self.label2 = Label(frame, text='或输入简历文本:', font=ft, pady=5)
        #self.label2.pack(side=TOP, anchor='w')
        self.label2.grid(row=2, column=0)

        # text box (resume text)
        self.resumeText = Text(frame, width=50, height=30, font=ft)
        self.resumeText.bind("<Button-3>", self.delete)
        self.resumeText.bind("<Return>", self.input_text)
        #self.entryText.pack(side=TOP, anchor='w')
        self.resumeText.grid(row=3, column=0)

        # button: "Start prediction"
        self.predictButton = Button(frame, text="开始预测", fg="red", command=self.predict, font=ft_big)
        #self.next.pack(side=RIGHT)
        self.predictButton.grid(row=3, column=1)

        # label: "->"
        self.label5 = Label(frame, text="-> ", fg="blue", font=ft_big)
        #self.next.pack(side=RIGHT)
        self.label5.grid(row=3, column=2)

        # label (prediction result)
        self.result = StringVar()
        self.label4 = Label(frame, textvariable=self.result, bg='yellow', font=ft_middle)
        self.result.set('     ')
        #self.label4.pack(side=RIGHT)
        self.label4.grid(row=3, column=3)

        # label (status display; the initial value means "idle")
        self.condition = StringVar()
        self.label6 = Label(frame, textvariable=self.condition, fg="green", font=ft_big)
        self.condition.set('空闲中.')
        #self.next.pack(side=RIGHT)
        self.label6.grid(row=2, column=1)

        ''' load the model '''
        excel = './data/标注数据.xlsx'  # annotated-data spreadsheet
        text_path = './data/text/processed/'
        args.snapshot = './snapshot/TextCNN_Word2Vec/best_steps_4100.pt'
        #new_model = word2vec.Word2Vec.load(args.word2vec_model)
        
        print("\nLoading data...")
        tokenize = lambda x: re.split(' ', x)
        text_field = data.Field(tokenize=tokenize, lower=True,  stop_words=['\r', '\n', '\t', '\xa0', ' ', ''])
        label_field = data.Field(sequential=False)
        train_iter, dev_iter = mydatasets.resume(excel, text_path, text_field, label_field, 
                                                 args.batch_size, device=-1, repeat=False,
                                                 use_wv=True, 
                                                 wv_model=args.word2vec_model,
                                                 )
        
        args.embed_num = len(text_field.vocab)  # 9499
        args.class_num = len(label_field.vocab) - 1  # 16; the -1 removes <unk>
        args.cuda = (not args.no_cuda) and torch.cuda.is_available()
        del args.no_cuda
        args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
        #args.save_dir = os.path.join(args.save_dir, datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
        
        if args.model == 'CNN':
            print('Using CNN model')
            cnn = model.CNN_Text(args, text_field)
        elif args.model == 'LSTM':
            print('Using LSTM model')
            cnn = model.LSTM_Text(args, text_field)
        
        if args.snapshot is not None:
            print('\nLoading model from {}...'.format(args.snapshot))
            cnn.load_state_dict(torch.load(args.snapshot))

        if args.cuda:
            #torch.cuda.set_device(args.device)
            print('Using CUDA')
            cnn = cnn.cuda()
        else:
            print('Using CPU')
        
        self.cnn = cnn
        self.text_field = text_field
        self.label_field = label_field
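
A hedged sketch of the `predict` callback the button above binds to (the method itself is not shown in the snippet; `train.predict`'s signature is assumed from the other examples on this page):

        def predict(self):
            # hypothetical: classify the entered resume text and show the result
            self.condition.set('predicting...')
            label = train.predict(self.text, self.cnn, self.text_field, self.label_field, args.cuda)
            self.result.set(str(label))
            self.condition.set('idle')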
Example #15

args.embed_num = len(text_field.vocab)  # 9499
args.class_num = len(label_field.vocab) - 1  # 16; the -1 removes <unk>
print('embed num:', args.embed_num, '\nclass num:', args.class_num)

args.cuda = (not args.no_cuda) and torch.cuda.is_available()
del args.no_cuda
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
args.save_dir = os.path.join(
    args.save_dir,
    datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

if args.model == 'CNN':
    print('Using CNN model')
    cnn = model.CNN_Text(args, text_field)
elif args.model == 'LSTM':
    print('Using LSTM model')
    cnn = model.LSTM_Text(args, text_field)

if args.snapshot is not None:
    print('\nLoading model from {}...'.format(args.snapshot))
    cnn.load_state_dict(torch.load(args.snapshot))

if args.cuda:
    #torch.cuda.set_device(args.device)
    print('Using CUDA')
    cnn = cnn.cuda()
else:
    print('Using CPU')
Example #16
#print("Positive News Ratio", sum(y_test > 0) * 1. / (sum(y_test > 0) + sum(y_test < 0)))
X_test = X_test.astype('float32')
y_test = util.value2int_simple(y_test).astype("int")

# update args and print
args.class_num = 2
args.cuda = (not args.no_cuda) and torch.cuda.is_available()
del args.no_cuda
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

print("\nParameters:")
for attr, value in sorted(args.__dict__.items()):
    print("\t{}={}".format(attr.upper(), value))

# model
cnn = model.CNN_Text(args)
if args.cuda:
    torch.cuda.set_device(args.device)
    cnn = cnn.cuda()

# train or predict
if args.predict is not None:
    if args.date != '':
        util.daily_predict(cnn, args)
        output = './input/news/' + args.date[:4] + '/news_' + args.date + '.csv'
        os.system('mv ' + output + '_bak ' + output)
    else:
        mymodels, word2idx, stopWords = util.predictor_preprocess(cnn, args)
        print(util.predict(args.predict, mymodels, word2idx, stopWords, args))
elif args.eval is not False:
    mymodels, word2idx, stopWords = util.predictor_preprocess(cnn, args)
Example #17
    args.embed_num = len(text_field.vocab)
    args.lr = args.char_lr
    args.l2 = args.char_l2
    args.epochs = args.char_epochs
    args.batch_size = args.char_batch_size
    args.dropout = args.char_dropout
    args.max_norm = args.char_max_norm
    args.kernel_num = args.char_kernel_num
    args.optimizer = args.char_optimizer

    print("\nParameters:")
    for attr, value in sorted(args.__dict__.items()):
        print("  {}={}".format(attr.upper(), value))

    if args.snapshot is None and args.num_experts == 0:
        char_cnn = model.CNN_Text(args, 'char')
    elif args.snapshot is None and args.num_experts > 0:
        char_cnn = [model.CNN_Text(args, 'char') for i in range(args.num_experts)]
    else:
        print('\nLoading model from [%s]...' % args.snapshot)
        try:
            char_cnn = torch.load(args.snapshot)
        except Exception:
            print("Sorry, this snapshot doesn't exist.")
            exit()
    if args.num_experts > 0:
        acc, char_cnn = train.ensemble_train(train_iter, dev_iter, char_cnn, args,
                                             log_file_handle=log_file_handle, always_norm=False)
    else:
        acc, char_cnn = train.train(train_iter, dev_iter, char_cnn, args, log_file_handle=log_file_handle)
    char_dev_fold_accuracies.append(acc)
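
`train.ensemble_train` is defined elsewhere; a hedged sketch of the usual way such a list of expert CNNs is combined at inference time (averaging logits; the actual project may vote or weight experts differently):

def ensemble_logits(experts, batch):
    # average the class logits of all expert models for one input batch
    return torch.stack([m(batch) for m in experts]).mean(dim=0)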