batch_size=args.batch_size,
                           cuda=use_cuda)

# Validation batches: built from the 'valid' split with shuffling disabled,
# using args.batch_size and the use_cuda flag.
validation_data = DataLoader(
    data['valid']['src'],
    data['valid']['label'],
    args.max_len,
    shuffle=False,
    batch_size=args.batch_size,
    cuda=use_cuda,
)

# ##############################################################################
# Build model
# ##############################################################################
import model

# Instantiate the LSTM_Text model and move it to the GPU when CUDA is in use.
rnn = model.LSTM_Text(args)
rnn = rnn.cuda() if use_cuda else rnn

# Cross-entropy loss, optimised with Adam (learning rate from args, fixed
# weight decay of 0.001).
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    rnn.parameters(),
    lr=args.lr,
    weight_decay=0.001,
)

# ##############################################################################
# Training
# ##############################################################################
import time
from tqdm import tqdm

# Metric histories; each starts as its own empty list.
train_loss, valid_loss, accuracy = [], [], []
Example #2
0
# Convert the previously loaded `data` (produced above this excerpt;
# presumably the training split) into X, Y and a corpus object.
X, Y, corpus = util.read_data_omg(data)

# Development / validation set: built from the validation transcripts CSV and
# the validation videos CSV.
input_file = "../omg_ValidationTranscripts.csv"
input_file2 = "../omg_ValidationVideos.csv"
data = util.process_transcripts(input_file, input_file2)
# Prepare the validation data.
# NOTE(review): this rebinds `corpus`, so the embedding lookup below is keyed
# on whatever corpus this second call returns -- confirm it also covers the
# token ids used in X (e.g. that read_data_omg shares one corpus across calls).
Xv, Yv, corpus = util.read_data_omg(data)

# Load pre-trained GloVe vectors for the corpus vocabulary (word2index).
emb_file = "../glove/glove.6B.100d.txt"
w2v = util.load_embeddings_from_glove(emb_file, corpus.word2index)

# Model. emb_dim is also the width of the tensors that sent_to_tensor fills
# from w2v, so it has to match the dimensionality of the GloVe file above.
# NOTE(review): the meaning of 128 and 32 is defined by model.LSTM_Text --
# confirm there. The model is moved to the GPU unconditionally.
emb_dim = 100
lstm_model = model.LSTM_Text(emb_dim, 128, 32).cuda()

def sent_to_tensor(x):
    """Look up the embedding of every token id in *x* and stack the results.

    Args:
        x: sequence of token ids, each usable as a key/index into the
            module-level ``w2v`` table of NumPy vectors.

    Returns:
        A tensor of shape ``(len(x), 1, emb_dim)`` whose ``[i, 0]`` slice
        holds the ``w2v`` vector of the i-th token.
    """
    embedded = torch.zeros(len(x), 1, emb_dim)
    for position, token in enumerate(x):
        embedded[position, 0] = torch.from_numpy(w2v[token])
    return embedded

def eval(X, Y, model):
    model.cpu()
    y_pred = [[],[]]
    for i in range(len(X)):
        x = X[i]
        if len(x) == 0:
            y_pred[0].append(0.)
            y_pred[1].append(0.)
    def __init__(self, master):
        """Build the resume-prediction window and load the trained model.

        Args:
            master: parent Tk container; the main frame is packed into it.

        Raises:
            ValueError: if ``args.model`` is neither ``'CNN'`` nor ``'LSTM'``.

        NOTE(review): model loading mutates the module-level ``args``
        (``no_cuda`` is deleted, ``kernel_sizes`` becomes a list), so creating
        a second instance in the same process would fail -- confirm that only
        one instance is ever created.
        """
        frame = Frame(master)
        frame.pack()

        self.frame = frame
        self.text = ''
        self.isPreprocess = 0

        self._build_widgets(frame)
        self._load_model()

    def _build_widgets(self, frame):
        """Create all widgets inside *frame* and place them on its grid."""
        # Frame-wide binding: the right mouse button invokes self.delete.
        frame.bind("<Button-3>", self.delete)

        # Fonts
        ft = tkFont.Font(family='Fixdsys', size=13, weight=tkFont.BOLD)
        ft_big = tkFont.Font(family='Fixdsys', size=21, weight=tkFont.BOLD)
        ft_middle = tkFont.Font(family='Fixdsys', size=18, weight=tkFont.BOLD)

        # Label: prompt asking for the resume URL
        self.label1 = Label(frame, text='请输入简历链接:', font=ft, pady=5)
        self.label1.grid(row=0, column=0)

        # Entry: resume URL; <Return> is handled by self.input_url
        self.url = StringVar()
        self.urlEntry = Entry(frame, textvariable=self.url, font=ft, width=45)
        self.urlEntry.bind("<Return>", self.input_url)
        self.urlEntry.bind("<Button-3>", self.delete)
        self.urlEntry.grid(row=1, column=0)

        # Label: prompt asking for the resume text instead
        self.label2 = Label(frame, text='或输入简历文本:', font=ft, pady=5)
        self.label2.grid(row=2, column=0)

        # Text box: resume text; <Return> is handled by self.input_text
        self.resumeText = Text(frame, width=50, height=30, font=ft)
        self.resumeText.bind("<Button-3>", self.delete)
        self.resumeText.bind("<Return>", self.input_text)
        self.resumeText.grid(row=3, column=0)

        # Button: start prediction
        self.predictButton = Button(frame, text="开始预测", fg="red",
                                    command=self.predict, font=ft_big)
        self.predictButton.grid(row=3, column=1)

        # Label: arrow between the button and the result
        self.label5 = Label(frame, text="-> ", fg="blue", font=ft_big)
        self.label5.grid(row=3, column=2)

        # Label: prediction result (starts as blank padding)
        self.result = StringVar()
        self.label4 = Label(frame, textvariable=self.result, bg='yellow',
                            font=ft_middle)
        self.result.set('     ')
        self.label4.grid(row=3, column=3)

        # Label: status display (the initial text means "idle")
        self.condition = StringVar()
        self.label6 = Label(frame, textvariable=self.condition, fg="green",
                            font=ft_big)
        self.condition.set('空闲中.')
        self.label6.grid(row=2, column=1)

    def _load_model(self):
        """Build the text/label fields, then create and restore the network.

        Stores the results on the instance as ``self.cnn``,
        ``self.text_field`` and ``self.label_field``.

        Raises:
            ValueError: if ``args.model`` is neither ``'CNN'`` nor ``'LSTM'``.
        """
        excel = './data/标注数据.xlsx'
        text_path = './data/text/processed/'
        # The snapshot path is hard-coded here and overrides any earlier value.
        args.snapshot = './snapshot/TextCNN_Word2Vec/best_steps_4100.pt'

        print("\nLoading data...")

        def tokenize(text):
            return re.split(' ', text)

        text_field = data.Field(tokenize=tokenize, lower=True,
                                stop_words=['\r', '\n', '\t', '\xa0', ' ', ''])
        label_field = data.Field(sequential=False)
        # The iterators themselves are unused; the call is kept because the
        # vocabularies read just below are presumably built inside it --
        # TODO confirm against mydatasets.resume.
        _train_iter, _dev_iter = mydatasets.resume(
            excel, text_path, text_field, label_field,
            args.batch_size, device=-1, repeat=False,
            use_wv=True,
            wv_model=args.word2vec_model,
        )

        args.embed_num = len(text_field.vocab)  # 9499 per the original note
        args.class_num = len(label_field.vocab) - 1  # 16; the -1 excludes <unk>
        args.cuda = (not args.no_cuda) and torch.cuda.is_available()
        del args.no_cuda
        args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

        if args.model == 'CNN':
            print('Using CNN model')
            cnn = model.CNN_Text(args, text_field)
        elif args.model == 'LSTM':
            print('Using LSTM model')
            cnn = model.LSTM_Text(args, text_field)
        else:
            # Fail here with a clear message instead of a NameError on `cnn`
            # further down.
            raise ValueError(
                "Unsupported model type: {!r} (expected 'CNN' or 'LSTM')"
                .format(args.model))

        if args.snapshot is not None:
            print('\nLoading model from {}...'.format(args.snapshot))
            # Deserialize onto the CPU so a snapshot written on a GPU machine
            # can still be restored on a CPU-only one; the model is moved to
            # the GPU below when args.cuda is set.
            state_dict = torch.load(
                args.snapshot, map_location=lambda storage, loc: storage)
            cnn.load_state_dict(state_dict)

        if args.cuda:
            print('Using CUDA')
            cnn = cnn.cuda()
        else:
            print('Using CPU')

        self.cnn = cnn
        self.text_field = text_field
        self.label_field = label_field
Example #4
0
# Fill in the settings that depend on the built vocabularies and on the
# runtime environment, then dump every parameter.
args.embed_num = len(text_field.vocab)
args.class_num = len(label_field.vocab) - 1
args.vectors = text_field.vocab.vectors
args.cuda = (not args.no_cuda) and torch.cuda.is_available()
del args.no_cuda
args.save_dir = os.path.join(
    args.save_dir,
    datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'),
)

print("\nParameters:")
for attr, value in sorted(vars(args).items()):
    print("\t{}={}".format(attr.upper(), value))

# Attached after the dump above, so the field object is not printed.
args.TF = text_field

# Model creation and load.
# The network is always built from `args`; a snapshot, when given, only
# supplies the trained weights.
if args.snapshot is None:
    net = model.LSTM_Text(args)
else:
    print('\nLoading model from [%s]...' % args.snapshot)
    net = model.LSTM_Text(args)
    try:
        # Deserialize onto the CPU so a snapshot written on a GPU machine can
        # still be restored on a CPU-only one; the model is moved to the GPU
        # below when args.cuda is set.
        state_dict = torch.load(args.snapshot,
                                map_location=lambda storage, loc: storage)
    except (IOError, OSError):
        # Only a missing/unreadable file is reported this way. Any other
        # failure (e.g. weights that do not match the model) propagates with
        # its traceback instead of being mislabelled as a missing snapshot.
        print("Sorry, This snapshot doesn't exist.")
        # Non-zero status so callers can tell that loading failed.
        raise SystemExit(1)
    net.load_state_dict(state_dict)

if args.cuda:
    net = net.cuda()

# Call the main function (train.py)
if args.predict is not None:
    label = train.predict(args.predict, net, text_field, label_field,
                          args.cuda, args)
    print('\n[Text]  {}\n[Label] {}\n'.format(args.predict, label))
args.class_num = len(label_field.vocab) - 1  # 16; the -1 excludes <unk>
print('embed num:', args.embed_num, '\nclass num:', args.class_num)

# Resolve the runtime settings: CUDA only when requested and available,
# kernel sizes parsed from their comma-separated form, and a timestamped
# sub-directory for saved models.
args.cuda = (not args.no_cuda) and torch.cuda.is_available()
del args.no_cuda
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]
args.save_dir = os.path.join(
    args.save_dir,
    datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))

if args.model == 'CNN':
    print('Using CNN model')
    cnn = model.CNN_Text(args, text_field)
elif args.model == 'LSTM':
    print('Using LSTM model')
    cnn = model.LSTM_Text(args, text_field)
else:
    # Fail here with a clear message instead of a NameError on `cnn` below.
    raise ValueError(
        "Unsupported model type: {!r} (expected 'CNN' or 'LSTM')"
        .format(args.model))

if args.snapshot is not None:
    print('\nLoading model from {}...'.format(args.snapshot))
    # Deserialize onto the CPU so a snapshot written on a GPU machine can
    # still be restored on a CPU-only one; the model is moved to the GPU
    # below when args.cuda is set.
    cnn.load_state_dict(
        torch.load(args.snapshot, map_location=lambda storage, loc: storage))

if args.cuda:
    print('Using CUDA')
    cnn = cnn.cuda()
else:
    print('Using CPU')

#------ Prediction ------#
if args.predict or args.predict_url:
    args.snapshot = './snapshot/TextCNN_Word2Vec/best_steps_4100.pt'