def getLogin(self):
    """ return the unique login of the proprio """
    prenom = normalizeString(self.context.pro_prenom1[:3].lower())
    nom = normalizeString(self.context.pro_nom1[:3].lower())
    temp_pro_log = "%s%s%s" % (prenom, nom, self.getPostLoginString())
    if not temp_pro_log:
        # case where there is no prenom - nom
        temp_pro_log = generateRandomLogin()
    if temp_pro_log in PROPRIO_LOGIN_REGISTRY:
        self.count += 1
        temp_pro_log = self.getLogin()
    return temp_pro_log

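# NOTE: `normalizeString` itself is not defined in any of these snippets.
# A minimal sketch of what the helper typically does, modeled on the common
# PyTorch seq2seq tutorial version (an assumption; each project may define
# its own variant):
import re
import unicodedata

def normalizeString(s):
    # Lowercase, trim, and strip accents (NFD-decompose, drop combining marks).
    s = ''.join(c for c in unicodedata.normalize('NFD', s.lower().strip())
                if unicodedata.category(c) != 'Mn')
    # Pad sentence-ending punctuation with a space, then collapse everything
    # that is not a letter or ., !, ? into single spaces.
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()
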
def mp_func(data_en, data_zh, manager_d, index):
    samples = []
    for idx in tqdm(range(len(data_en))):
        sentence_zh = data_zh[idx].strip()
        seg_list = jieba.cut(sentence_zh)
        input_zh = encode_text(word_map_zh, list(seg_list))
        sentence_en = data_en[idx].strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)
                  if len(normalizeString(s)) > 0]
        output_en = encode_text(word_map_en, tokens)
        if (len(input_zh) <= max_len and len(output_en) <= max_len
                and UNK_token not in input_zh and UNK_token not in output_en):
            samples.append({'input': list(input_zh), 'output': list(output_en)})
    manager_d[index] = samples
    return manager_d

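# NOTE: `encode_text` is used throughout but never shown. A plausible minimal
# sketch (an assumption): map each token to its id, falling back to the <unk>
# id for out-of-vocabulary words, consistent with the word maps built below.
def encode_text(word_map, tokens):
    # Look each token up in the vocabulary, defaulting to the <unk> id.
    return [word_map.get(token, word_map['<unk>']) for token in tokens]
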
def syncToServer(db):
    '''Sync local recording info to the server.'''
    try:
        cr = db.cursor()
        # Only records whose mp3 recording file has already been uploaded.
        sql = 'select * from core_audiofile where status=1 and memo_status=0'
        cr.execute(sql)
        rs = fetchallDict(cr)
        for r in rs:
            if not getApp().running:
                return
            digest = r['digest']
            client_sid = r['client_sid']
            memo = r['memo']
            type = r['type']
            productid = r['productid']
            operator = utils.normalizeString(r['operator'])
            # Send each recording record that needs updating, one per request.
            params = urllib.urlencode({'token': getApp().getToken(),
                                       'spx_digest': digest,
                                       'client_sid': client_sid,
                                       'memo': memo,
                                       'type': type,
                                       'productid': productid,
                                       'operator': operator})
            server = getApp().getSettings().get('webserver')
            if server.find('http') == -1:
                server = 'http://' + server
            f = urllib.urlopen('%s/WebApi/Terminal/updateAudioMemo' % (server,), params)  # POST
            d = f.read()
            print d
            d = json.loads(d)
            if d['status'] != 0:
                return False
            sql = 'update core_audiofile set memo_status=1 where digest=?'
            cr.execute(sql, (digest,))
            db.commit()
            print 'archive (digest: %s) has been updated to server!' % (digest,)
        return True
    except:
        traceback.print_exc()
        return False

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8'). \
        read().strip().split('\n')
    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

def process(file, lang='zh'):
    print('processing {}...'.format(file))
    with open(file, 'r', encoding='utf-8') as f:
        data = f.readlines()
    word_freq = Counter()
    lengths = []
    for line in tqdm(data):
        sentence = line.strip()
        if lang == 'en':
            sentence_en = sentence.lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
            word_freq.update(list(tokens))
            vocab_size = n_src_vocab
        else:
            seg_list = jieba.cut(sentence.strip())
            tokens = list(seg_list)
            word_freq.update(list(tokens))
            vocab_size = n_tgt_vocab
        lengths.append(len(tokens))
    words = word_freq.most_common(vocab_size - 4)
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}
    word_map['<pad>'] = 0
    word_map['<sos>'] = 1
    word_map['<eos>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:100])

    # n, bins, patches = plt.hist(lengths, 50, density=True, facecolor='g', alpha=0.75)
    # plt.xlabel('Lengths')
    # plt.ylabel('Probability')
    # plt.title('Histogram of Lengths')
    # plt.grid(True)
    # plt.show()

    word2idx = word_map
    idx2char = {v: k for k, v in word2idx.items()}
    return word2idx, idx2char

def trainIters(encoder, decoder, data_df, n_iters, print_every=1000,
               plot_every=100, learning_rate=0.05):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()  # weight=weight_tensor
    for iter in range(1, n_iters + 1):
        sentence = data_df.iloc[iter - 1]["description"]
        sentence = normalizeString(sentence)
        input_tensor = embeddedTensorFromSentence(sentence, device, word_emb, N_word)
        target_class = data_df.iloc[iter - 1]["department_new"]
        # One-hot encode the target class.
        class_index = [0] * CLASS_size
        class_index[class_dict[target_class]] = 1
        target_tensor = torch.tensor(class_index, dtype=torch.long,
                                     device=device).view(1, CLASS_size)
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100,
                                         print_loss_avg))
        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    showPlot(plot_losses)

def process(file, lang='zh'):
    '''
    Build the vocabulary.
    :param file:
    :param lang:
    :return:
    '''
    print('processing {}...'.format(file))
    with open(file, 'r', encoding='utf-8') as f:
        data = f.readlines()
    word_freq = Counter()
    lengths = []
    for line in tqdm(data):
        sentence = line.strip()
        if lang == 'en':
            # English: lowercase, then tokenize.
            sentence_en = sentence.lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]  # tokenize, then clean each token
            word_freq.update(list(tokens))
            vocab_size = Config.n_src_vocab  # given by a hyperparameter
        else:
            # Chinese: segment with jieba.
            seg_list = jieba.cut(sentence.strip())
            tokens = list(seg_list)
            word_freq.update(list(tokens))
            vocab_size = Config.n_tgt_vocab
        lengths.append(len(tokens))  # record the true length of each sentence
    words = word_freq.most_common(vocab_size - 4)  # keep the vocab_size - 4 most frequent words
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}  # word -> id
    word_map['<pad>'] = 0
    word_map['<sos>'] = 1
    word_map['<eos>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:100])
    word2idx = word_map
    idx2char = {v: k for k, v in word2idx.items()}
    return word2idx, idx2char

def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while True:
        try:
            # Get the input sentence
            input_sentence = input('> ')
            # Check whether to quit
            if input_sentence == 'q' or input_sentence == 'quit':
                break
            # Normalize the sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate the sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print the response sentence
            output_words[:] = [x for x in output_words
                               if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))
        except KeyError:
            print("Error: Encountered unknown word.")

async def handle(request):
    my_input = request.query['input']
    input_sentence = normalizeString(my_input)
    # Evaluate sentence
    output_words, score = evaluate(encoder, decoder, searcher, voc, input_sentence)
    # Format and print response sentence
    output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
    response_obj = {
        'status': 'success',
        'response': ' '.join(output_words),
        'confidence': score
    }
    return web.Response(text=json.dumps(response_obj), status=200)

def evaluate(sentence, max_length=MAX_LENGTH):
    time_start = time.time()
    sentence = normalizeString(sentence)
    sentence = unicodedata.normalize('NFD', sentence)
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    tokens, score = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    result = ''
    for char in decoded_words:
        if char != 'EOS':
            result += char
        else:
            break
    time_pred = time.time() - time_start
    return result, torch.sum(score) / len(result), time_pred

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")
    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]
    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)
    return input_lang, output_lang, pairs

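# Example usage (a sketch): assumes a tab-separated 'data/eng-fra.txt' corpus,
# as in the PyTorch seq2seq tutorial this helper mirrors.
input_lang, output_lang, pairs = readLangs('eng', 'fra', reverse=True)
print(len(pairs), 'pairs; sample:', pairs[0])
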
def evaluateTest(encoder, decoder):
    test_size = test_df.shape[0]
    y_true = []
    y_pred = []
    # Iterate 1..test_size so iloc[iter - 1] visits each row exactly once.
    for iter in range(1, test_size + 1):
        sentence = test_df.iloc[iter - 1]["description"]
        sentence = normalizeString(sentence)
        input_tensor = embeddedTensorFromSentence(sentence, device, word_emb, N_word)
        target_class = test_df.iloc[iter - 1]["department_new"]
        target_index = class_dict[target_class]
        y_true.append(target_index)
        output = evaluate(encoder, decoder, input_tensor, max_length, device)
        topv, topi = output.topk(1)
        y_pred.append(topi.numpy()[0][0])
    cnf_matrix = confusion_matrix(y_true, y_pred)
    print("Accuracy")
    print(accuracy_score(y_true, y_pred))
    print(cnf_matrix)

def get_data(in_file, out_file):
    print('getting data {}->{}...'.format(in_file, out_file))
    with open(in_file, 'r', encoding='utf-8') as file:
        in_lines = file.readlines()
    with open(out_file, 'r', encoding='utf-8') as file:
        out_lines = file.readlines()
    samples = []
    for i in tqdm(range(len(in_lines))):
        sentence_en = in_lines[i].strip().lower()
        tokens = [normalizeString(s.strip()) for s in nltk.word_tokenize(sentence_en)]
        in_data = encode_text(src_char2idx, tokens)
        sentence_zh = out_lines[i].strip()
        tokens = jieba.cut(sentence_zh.strip())
        out_data = [sos_id] + encode_text(tgt_char2idx, tokens) + [eos_id]
        if (len(in_data) < maxlen_in and len(out_data) < maxlen_out
                and unk_id not in in_data and unk_id not in out_data):
            samples.append({'in': in_data, 'out': out_data})
    return samples

def read_test_dataset(self, input_path, lock=True):
    with open(input_path) as f:
        data = json.load(f)
    self.no_samples = len(data)
    for i in tqdm(range(self.no_samples)):
        entry = data[str(i)]
        text = entry["text"]
        text = normalizeString(text)
        tokens = tokenize(text)
        self.stcs_literals.append(tokens)
        tokens_id = [self.words_converter.T2id(token, lock=lock) for token in tokens]
        tokens_id.append(self.words_converter.T2id('</s>'))
        self.stcs.append(tokens_id)
        self.lengths.append(len(tokens_id))
    self.X = self.stcs

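# NOTE: `tokenize` is not defined in these snippets. A plausible minimal
# sketch (an assumption): since normalizeString has already lowercased the
# text and spaced out punctuation, whitespace splitting suffices.
def tokenize(text):
    # Split normalized text on whitespace.
    return text.split()
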
def build_samples():
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))
    for usage in ['train', 'valid']:
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder,
                                               train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder,
                                               train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder,
                                               valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder,
                                               valid_translation_zh_filename)
            filename = 'data/samples_valid.json'
        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()
        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()
        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            sentence_en = data_en[idx].strip().lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
            input_en = encode_text(word_map_en, tokens)
            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            output_zh = encode_text(word_map_zh, list(seg_list))
            if (len(input_en) <= max_len and len(output_zh) <= max_len
                    and UNK_token not in input_en and UNK_token not in output_zh):
                samples.append({'input': list(input_en), 'output': list(output_zh)})
        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)
        print('{} {} samples created at: {}.'.format(len(samples), usage, filename))

def gather_file(file_, max_len):
    en_sents, fra_sents, en_cut_count, fra_cut_count = [], [], 0, 0
    for sentences in open(file_):
        en_, fra_ = [normalizeString(s) for s in sentences.strip().split('\t')]
        en_ws = [word for word in en_.strip().split()]
        fra_ws = [word for word in fra_.strip().split()]
        if len(en_ws) > max_len:
            en_cut_count += 1
            en_ws = en_ws[:max_len]
        en_sents.append([WORD[BOS]] + en_ws + [WORD[EOS]])
        if len(fra_ws) > max_len:
            fra_cut_count += 1
            fra_ws = fra_ws[:max_len]
        fra_sents.append([WORD[BOS]] + fra_ws + [WORD[EOS]])
    return fra_sents, en_sents, fra_cut_count, en_cut_count

def _loadCachedArchiveNoteInfo(self, index):
    '''
    #2014.1.3
    '''
    cr = self.db.handle().cursor()
    sql = "select * from core_audiotemp where serial=?"
    cr.execute(sql, (index,))
    rs = fetchallDict(cr)
    if rs:
        r = rs[0]
        # Memo recorded temporarily before the recording file is produced.
        self.edtToneMemo.setText(r['memo'].decode('utf-8'))
        self.edtProductId.setText(r['productid'].decode('utf-8'))
        self.cbxCurrOperator.setEditText(
            utils.normalizeString(r['operator']).decode('utf-8'))
        for n in range(self.cbxToneType.count()):
            attr = self.cbxToneType.itemData(n).toInt()[0]
            try:
                r['type'] = int(r['type'])
            except:
                r['type'] = 1
            if attr == int(r['type']):
                self.cbxToneType.setCurrentIndex(n)
                break

def sent2tenosr(self, sentence):
    max_len = self.args.max_word_len - 2
    sentence = normalizeString(sentence)
    words = [w for w in sentence.strip().split()]
    if len(words) > max_len:
        words = words[:max_len]
    words = [WORD[BOS]] + words + [WORD[EOS]]
    idx = [self.src_dict[w] if w in self.src_dict else UNK for w in words]
    idx_data = torch.LongTensor(idx)
    idx_position = torch.LongTensor(
        [pos_i + 1 if w_i != PAD else 0 for pos_i, w_i in enumerate(idx)])
    # Variable(..., volatile=True) is the legacy (pre-0.4) PyTorch inference API.
    idx_data_tensor = Variable(idx_data.unsqueeze(0), volatile=True)
    idx_position_tensor = Variable(idx_position.unsqueeze(0), volatile=True)
    if self.cuda:
        idx_data_tensor = idx_data_tensor.cuda()
        idx_position_tensor = idx_position_tensor.cuda()
    return idx_data_tensor, idx_position_tensor

def readLangs(lang1, lang2, path):
    lines = open(path, 'r', encoding='utf-8').readlines()
    lang1_cls = Lang(lang1)
    lang2_cls = Lang(lang2)
    pairs = []
    for l in lines:
        l = l.split("\t")
        sentence1 = normalizeString(l[0])
        sentence2 = cht_to_chs(l[1])
        seg_list = jieba.cut(sentence2, cut_all=False)
        sentence2 = " ".join(seg_list)
        if len(sentence1.split(" ")) > MAX_LENGTH:
            continue
        if len(sentence2.split(" ")) > MAX_LENGTH:
            continue
        pairs.append([sentence1, sentence2])
        lang1_cls.addSentence(sentence1)
        lang2_cls.addSentence(sentence2)
    return lang1_cls, lang2_cls, pairs

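# NOTE: the Lang vocabulary class used by the readLangs variants is not shown.
# A minimal sketch (an assumption, modeled on the PyTorch seq2seq tutorial)
# that supports the addSentence calls above:
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1
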
def analyze_en():
    translation_path = os.path.join(train_translation_folder,
                                    train_translation_en_filename)
    with open(translation_path, 'r') as f:
        sentences = f.readlines()
    sent_lengths = []
    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Track the token count of each English sentence.
        sent_lengths.append(len(tokens))
    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths, num_bins, facecolor='blue', alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()

# for pair in qa_pairs:
#     writer.writerow(pair)
# print('Done writing to file')

# Visualise some lines
# datafile = os.path.join('data/cornell_movie_dialogs_corpus', 'formatted_movie_lines.txt')
# with open(datafile, 'rb') as file:
#     lines = file.readlines()
# for line in lines[:8]:
#     print(line)

# Read the datafile and split into lines
print('Reading and processing file. \nPlease wait ...')
lines = open(datafile, encoding='utf-8').read().strip().split('\n')
# Split every line into pairs and normalize
pairs = [[utils.normalizeString(s) for s in pair.split('\t')] for pair in lines]
print('Done Reading!')

# Instantiate a vocabulary class
voc = Vocabulary('Cornell Movie-Dialogue Corpus')
pairs = utils.filterPairs(pairs, MAX_LENGTH=10)
print('After filtering, there are {} conversation pairs'.format(len(pairs)))

# Loop through each pair and add them to the vocabulary
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])
print('Counted Words:', voc.num_words)

def showNote(self, note, allow_change_phone=False):
    '''
    note: {
        spx_digest,   # digest of the spx recording file
        spx_index,    # index number of the recording file
        calltype,     # call type
        phone,        # phone number
        duration,     # call length
    }
    '''
    if self.timer.isActive():
        self.timer.stop()
    self.duration = 0
    self.show()
    self.raise_()
    self.allow_change = allow_change_phone
    self.cbxClientNames.setCurrentIndex(-1)
    self.cbxClientNames.clearEditText()
    self.cbxClientNames.clear()
    self.edtHistoryToneMemoList.clear()
    self.note = note
    cr = self.db.handle().cursor()
    sql = ''
    digest = note.get('spx_digest')
    index = note.get('spx_index')
    calltype = note.get('calltype', 0)  # default: incoming call
    phone = note.get('phone', '')
    duration = note.get('duration', 0)
    if digest:  # the recording file already exists
        sql = 'select * from core_audiofile where digest=?'
        cr.execute(sql, (digest,))
        rs = fetchallDict(cr)
        if rs:
            r = rs[0]
            self.edtPhone.setText(r['phone'].decode('utf-8'))
            attr = u'未知'  # unknown
            if r['attr'] == 0:
                attr = u'来电'  # incoming
            elif r['attr'] == 1:
                attr = u'去电'  # outgoing
            elif r['attr'] == 2:
                attr = u'录音'  # recording
            elif r['attr'] == 3:
                attr = u'未接'  # missed
            self.edtToneAttr.setText(attr)
            self.edtToneTime.setText(utils.formatTimeLength(r['duration']))
            self.edtToneMemo.setText(r['memo'].decode('utf-8'))
            for n in range(self.cbxToneType.count()):
                attr = self.cbxToneType.itemData(n).toInt()[0]
                try:
                    r['type'] = int(r['type'])
                except:
                    r['type'] = 1
                if attr == int(r['type']):
                    self.cbxToneType.setCurrentIndex(n)
                    break
            self.edtProductId.setText(r['productid'].decode('utf-8'))
            sid = r['client_sid']
            print 'current archive: client_sid=', sid, r['phone']
            self.loadClientHistoryMemo(sid)  # 2013.11.9
            phone = r['phone']
            rr = r
            if not phone:
                phone = 'z*' * 20
            sql = "select * from core_client where (phone1 = ? " \
                  "or phone2 =? or phone3 =?) and memo!='%s'" % AppConst.CLIENT_DELETED_MARKER
            cr.execute(sql, (phone, phone, phone))
            rs = fetchallDict(cr)
            n = 0
            idx = -1
            for r in rs:
                value = QStringList()
                value.append(r['sid'])
                value.append(r['name'])
                self.cbxClientNames.addItem(r['name'].decode('utf-8'), value)
                if r['sid'] == sid:
                    idx = n
                n += 1
            self.cbxClientNames.setCurrentIndex(idx)
            r = rr
            self.cbxCurrOperator.setEditText(
                utils.normalizeString(r['operator']).decode('utf-8'))
    if index != None:  # live call: show the elapsed call time
        self.edtToneTime.setText(utils.formatTimeLength(0))
        self.timer.start(1000 * 1)
        attr = u'未知'  # unknown
        if calltype == 0:
            attr = u'来电'  # incoming
        elif calltype == 1:
            attr = u'去电'  # outgoing
        elif calltype == 2:
            attr = u'录音'  # recording
        elif calltype == 3:
            attr = u'未接'  # missed
        self.edtToneAttr.setText(attr)
        self.edtToneTime.setText(utils.formatTimeLength(duration))
        self.edtToneMemo.setText('')
        self.edtPhone.setText(phone.decode('utf-8'))
        self.edtProductId.setText('')
        # Match client names by phone number
        if not phone:
            phone = 'z*' * 20
        sql = "select * from core_client where (phone1 = ? " \
              "or phone2 =? or phone3 =?) and memo!='%s'" % AppConst.CLIENT_DELETED_MARKER
        cr.execute(sql, (phone, phone, phone))
        rs = fetchallDict(cr)
        idx = -1
        n = -1
        for r in rs:
            n += 1
            value = QStringList()
            value.append(r['sid'])
            value.append(r['name'])
            self.cbxClientNames.addItem(r['name'].decode('utf-8'), value)
            if currrent_dial_out_csid == r['sid']:
                idx = n
        if rs:
            if idx == -1:
                idx = 0
            self.cbxClientNames.setCurrentIndex(idx)  # 2013.11.8
        self.edtHistoryToneMemoList.clear()
        for r in rs:
            self.loadClientHistoryMemo(r['sid'], False)
        # 2014.1.3
        self._loadCachedArchiveNoteInfo(index)

    # (tail of a prepare_wordvec helper: build a (len, 300) float tensor)
    c = len(word_vector)
    word_vector = np.array(word_vector)
    word_vector = word_vector.reshape(c, 300)
    vec = Variable(torch.from_numpy(word_vector))
    return vec


model = model.LSTMClassifier()
model.load_state_dict(torch.load("./trained_model/sentiment.pt"))
model.eval()
print("fasttext loaded. Please enter your feedback\n")
while True:
    feedback = input()
    if len(feedback) == 0:
        continue
    else:
        test = utils.normalizeString(feedback)
        length = len(test.split())
        test = prepare_wordvec(test.split())
        test = test.view(1, length, 300)
        k = nn.utils.rnn.pack_padded_sequence(test, [length], batch_first=True)
        tag_scores = model(k, 1)
        print("prediction: " + str(torch.Tensor.item(tag_scores.argmax())))
        print(torch.exp(tag_scores))
        cont = input("Continue?")
        if cont == "N" or cont == 'n' or cont.lower() == "no":
            break
        else:
            print("Please enter your feedback\n")

cnf_matrix = confusion_matrix(y_true, y_pred)
print("Accuracy")
print(accuracy_score(y_true, y_pred))
print(cnf_matrix)

# if __name__ == '__main__':
encoder = EncoderRNN(N_word, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, CLASS_size, dropout_p=0.1,
                         max_length=max_length).to(device)
n_iterations = train_df.shape[0]
# trainIters(encoder, decoder, train_df, n_iterations, print_every=50, plot_every=10)
trainIters(encoder, decoder, train_df, 1, print_every=50, plot_every=10)

sentence = train_df.iloc[0]["description"]
sentence = normalizeString(sentence)
input_tensor = embeddedTensorFromSentence(sentence, device, word_emb, N_word)
target_class = train_df.iloc[0]["department_new"]
target_index = class_dict[target_class]
print(target_index)
output, attention = evaluate(encoder, decoder, input_tensor, max_length, device)
topv, topi = output.topk(1)

# torch.save(encoder.state_dict(), "encoder")
# torch.save(decoder.state_dict(), "decoder")
# encoder = torch.load("encoder")
# decoder = torch.load("decoder")
# desc1 = full_table.iloc[0]["description"]

def read_training_dataset(self, input_path):
    with open(input_path) as f:
        data = json.load(f)
    self.no_samples = len(data)
    # For padding.
    self.words_converter.T2id('<PAD>')
    self.words_converter.T2id('<SOS>')
    self.slots_converter.T2id('<PAD>')
    self.slots_converter.T2id('<SOS>')
    self.slots_converter.T2id('-')
    for i in tqdm(range(self.no_samples)):
        entry = data[str(i)]
        text = entry["text"]
        text = normalizeString(text)
        tokens = tokenize(text)
        self.stcs_literals.append(tokens)
        tokens_id = [self.words_converter.T2id(token) for token in tokens]
        tokens_id.append(self.words_converter.T2id('</s>'))
        self.stcs.append(tokens_id)
        self.lengths.append(len(tokens_id))
        intent = entry["intent"]
        self.intents.append(self.intent_converter.T2id(intent))
        slots_dictionary = entry["slots"]
        # Length matches tokens_id; slot 0 makes room for <SOS>.
        slots_id = [self.slots_converter.T2id('-')] * len(tokens_id)
        slots_id[0] = self.slots_converter.T2id('<SOS>')
        no_slots_in_stc = 0
        for slot, target_words in slots_dictionary.items():
            target_words = normalizeString(target_words)
            target_word_list = tokenize(target_words)
            for word in target_word_list:
                no_slots_in_stc += 1
                try:
                    idx = tokens.index(word)
                except:
                    idx = [i for i, s in enumerate(tokens) if word in s][0]
                # +1 accounts for <SOS>
                slots_id[idx + 1] = self.slots_converter.T2id(slot)
        # Keep count of the non-slot tokens.
        for j in range(len(tokens_id) - no_slots_in_stc):
            self.slots_converter.T2id('-')
        self.slots.append(slots_id)
        # self.slots.append(torch.tensor(slots_id, dtype=torch.long, device=self.device))
    # add padding
    ncols = max(self.lengths)
    self.X = self.stcs
    self.Y = self.slots
