def getLogin(self):
    """
    Return the unique login for the proprio (owner).
    """
    prenom = normalizeString(self.context.pro_prenom1[:3].lower())
    nom = normalizeString(self.context.pro_nom1[:3].lower())
    temp_pro_log = "%s%s%s" % (prenom, nom, self.getPostLoginString())
    if not temp_pro_log:  # case where there is no prenom / nom at all
        temp_pro_log = generateRandomLogin()
    if temp_pro_log in PROPRIO_LOGIN_REGISTRY:
        # login already taken: bump the counter and retry (getPostLoginString
        # presumably uses self.count to disambiguate)
        self.count += 1
        temp_pro_log = self.getLogin()
    return temp_pro_log
Example #2
        def mp_func(data_en, data_zh, manager_d, index):
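            # Multiprocessing worker: encodes one shard of the parallel corpus and
            # stores its samples in manager_d (presumably a multiprocessing.Manager
            # dict shared by all workers) under this worker's index.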
            samples = []
            for idx in tqdm(range(len(data_en))):
                sentence_zh = data_zh[idx].strip()
                seg_list = jieba.cut(sentence_zh)
                input_zh = encode_text(word_map_zh, list(seg_list))

                sentence_en = data_en[idx].strip().lower()
                tokens = [
                    normalizeString(s) for s in nltk.word_tokenize(sentence_en)
                    if len(normalizeString(s)) > 0
                ]
                output_en = encode_text(word_map_en, tokens)

                if (len(input_zh) <= max_len and len(output_en) <= max_len
                        and UNK_token not in input_zh
                        and UNK_token not in output_en):
                    samples.append({
                        'input': list(input_zh),
                        'output': list(output_en)
                    })

            manager_d[index] = samples
            return manager_d
Example #3
def syncToServer(db):
	'''
		Sync the local recording info to the server.
	'''
	try:
		cr = db.cursor()
		# make sure the recording's mp3 file has already been uploaded to the server
		sql = 'select * from core_audiofile where status=1 and memo_status=0'
		cr.execute(sql)
		rs = fetchallDict(cr)
		for r in rs:
			if not getApp().running:
				return
			digest = r['digest']
			client_sid = r['client_sid']
			memo = r['memo']
			type = r['type']
			productid=r['productid']
			operator = utils.normalizeString(r['operator'])

			# send one update request per recording record that needs syncing
			params = urllib.urlencode({'token':getApp().getToken(),
			                           'spx_digest':digest,
			                           'client_sid':client_sid,
			                           'memo':memo,
			                           'type':type,
			                           'productid':productid,
			                           'operator':operator
									})
			server = getApp().getSettings().get('webserver')
			if server.find('http')==-1:
				server = 'http://'+server
			f = urllib.urlopen('%s/WebApi/Terminal/updateAudioMemo'%(server),params)   # POST
			d = f.read()
			print d
			d = json.loads(d)
			if d['status'] != 0 :
				return False
			sql = 'update core_audiofile set memo_status=1 where digest=?'
			cr.execute(sql,(digest,))
			db.commit()
			print 'archive (digest: %s) has been updated on the server!'%(digest)
		return True
	except:
		traceback.print_exc()
	return False
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8'). \
        read().strip().split('\n')

    pairs = [[normalizeString(s) for s in line.split('\t')] for line in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
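
# Hypothetical usage sketch (not from the original snippet): the 'eng'/'fra' codes
# and the data/eng-fra.txt path are assumptions based on the data/%s-%s.txt pattern above.
if __name__ == '__main__':
    input_lang, output_lang, pairs = readLangs('eng', 'fra', reverse=True)
    print('Read %d sentence pairs' % len(pairs))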
Example #5
def process(file, lang='zh'):
    print('processing {}...'.format(file))
    with open(file, 'r', encoding='utf-8') as f:
        data = f.readlines()

    word_freq = Counter()
    lengths = []

    for line in tqdm(data):
        sentence = line.strip()
        if lang == 'en':
            sentence_en = sentence.lower()
            tokens = [
                normalizeString(s) for s in nltk.word_tokenize(sentence_en)
            ]
            word_freq.update(list(tokens))
            vocab_size = n_src_vocab
        else:
            seg_list = jieba.cut(sentence.strip())
            tokens = list(seg_list)
            word_freq.update(list(tokens))
            vocab_size = n_tgt_vocab

        lengths.append(len(tokens))

    words = word_freq.most_common(vocab_size - 4)
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}
    word_map['<pad>'] = 0
    word_map['<sos>'] = 1
    word_map['<eos>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:100])
    #
    # n, bins, patches = plt.hist(lengths, 50, density=True, facecolor='g', alpha=0.75)
    #
    # plt.xlabel('Lengths')
    # plt.ylabel('Probability')
    # plt.title('Histogram of Lengths')
    # plt.grid(True)
    # plt.show()

    word2idx = word_map
    idx2char = {v: k for k, v in word2idx.items()}

    return word2idx, idx2char
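
# Hypothetical usage sketch (not from the original snippet): the corpus file name and
# WORDMAP output path are assumptions; n_src_vocab and `import json` are assumed available.
if __name__ == '__main__':
    word2idx_en, idx2char_en = process('data/train.en', lang='en')
    with open('data/WORDMAP_en.json', 'w', encoding='utf-8') as f:
        json.dump(word2idx_en, f, indent=4)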
Example #6
def trainIters(encoder,
               decoder,
               data_df,
               n_iters,
               print_every=1000,
               plot_every=100,
               learning_rate=0.05):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()  #weight = weight_tensor)
    for iter in range(1, n_iters + 1):
        #print(iter)
        sentence = data_df.iloc[iter - 1]["description"]  # use the DataFrame passed in, not the global train_df
        sentence = normalizeString(sentence)
        input_tensor = embeddedTensorFromSentence(sentence, device, word_emb,
                                                  N_word)
        target_class = data_df.iloc[iter - 1]["department_new"]
        class_index = []
        for i in range(CLASS_size):
            class_index.append(0)
        class_index[class_dict[target_class]] = 1
        #import pdb; pdb.set_trace();
        #print(class_index)
        target_tensor = torch.tensor(class_index,
                                     dtype=torch.long,
                                     device=device).view(1, CLASS_size)
        loss = train(input_tensor, target_tensor, encoder, decoder,
                     encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss
        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' %
                  (timeSince(start, iter / n_iters), iter,
                   iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
    showPlot(plot_losses)
Example #7
def process(file, lang='zh'):
    '''
    Build the vocabulary (word table).
    :param file:
    :param lang:
    :return:
    '''
    print('processing {}...'.format(file))
    with open(file, 'r', encoding='utf-8') as f:
        data = f.readlines()

    word_freq = Counter()
    lengths = []

    for line in tqdm(data):
        sentence = line.strip()
        if lang == 'en':
            # English: lowercase, then tokenize
            sentence_en = sentence.lower()
            tokens = [
                normalizeString(s) for s in nltk.word_tokenize(sentence_en)
            ]  # tokenize, then clean each token
            word_freq.update(list(tokens))
            vocab_size = Config.n_src_vocab  # given as a hyperparameter
        else:
            # Chinese: segment with jieba
            seg_list = jieba.cut(sentence.strip())
            tokens = list(seg_list)
            word_freq.update(list(tokens))
            vocab_size = Config.n_tgt_vocab

        lengths.append(len(tokens))  # record the true length of each sentence

    words = word_freq.most_common(vocab_size - 4)  # keep the vocab_size - 4 most frequent words
    word_map = {k[0]: v + 4 for v, k in enumerate(words)}  # word -> id
    word_map['<pad>'] = 0
    word_map['<sos>'] = 1
    word_map['<eos>'] = 2
    word_map['<unk>'] = 3
    print(len(word_map))
    print(words[:100])

    word2idx = word_map
    idx2char = {v: k for k, v in word2idx.items()}

    return word2idx, idx2char
Example #8
def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is the quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print the response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")
Example #9
def evaluateInput(encoder, decoder, searcher, voc):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
            input_sentence = normalizeString(input_sentence)
            # Evaluate sentence
            output_words = evaluate(encoder, decoder, searcher, voc, input_sentence)
            # Format and print response sentence
            output_words[:] = [x for x in output_words if not (x == 'EOS' or x == 'PAD')]
            print('Bot:', ' '.join(output_words))

        except KeyError:
            print("Error: Encountered unknown word.")
Example #10
async def handle(request):
    my_input = request.query['input']
    input_sentence = normalizeString(my_input)
    # Evaluate sentence
    output_words, score = evaluate(encoder, decoder, searcher, voc,
                                   input_sentence)
    # Format and print response sentence
    output_words[:] = [
        x for x in output_words if not (x == 'EOS' or x == 'PAD')
    ]

    response_obj = {
        'status': 'success',
        "response": ' '.join(output_words),
        "confidence": score
    }

    return web.Response(text=json.dumps(response_obj), status=200)
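
# Hypothetical wiring sketch (not from the original snippet): the '/chat' route and
# port are assumptions; requires `from aiohttp import web` plus the loaded
# encoder/decoder/searcher/voc objects used by handle() above.
app = web.Application()
app.add_routes([web.get('/chat', handle)])
web.run_app(app, port=8080)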
Example #11
def evaluate(sentence, max_length=MAX_LENGTH):
    time_start = time.time()
    sentence = normalizeString(sentence)
    sentence = unicodedata.normalize('NFD', sentence)
    indexes_batch = [indexesFromSentence(voc, sentence)]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    tokens, score = searcher(input_batch, lengths, max_length)
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    result = ''
    for char in decoded_words:
        if char != 'EOS':
            result += char
        else:
            break
    time_pred = time.time() - time_start
    return result, torch.sum(score) / len(result), time_pred
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs
Example #13
def evaluateTest(encoder, decoder):
    test_size = test_df.shape[0]
    y_true = []
    y_pred = []
    for iter in range(1, test_size + 1):  # iloc[iter - 1] below, so start at 1 to avoid reading the last row twice
        sentence = test_df.iloc[iter - 1]["description"]
        sentence = normalizeString(sentence)
        input_tensor = embeddedTensorFromSentence(sentence, device, word_emb,
                                                  N_word)
        target_class = test_df.iloc[iter - 1]["department_new"]
        class_index = []
        target_index = class_dict[target_class]
        #print(target_index)
        y_true.append(target_index)
        output = evaluate(encoder, decoder, input_tensor, max_length, device)
        topv, topi = output.topk(1)
        y_pred.append(topi.numpy()[0][0])
    cnf_matrix = confusion_matrix(y_true, y_pred)
    print("Accuarcy")
    print(accuracy_score(y_true, y_pred))
    print(cnf_matrix)
Example #14
def get_data(in_file, out_file):
    print('getting data {}->{}...'.format(in_file, out_file))
    with open(in_file, 'r', encoding='utf-8') as file:
        in_lines = file.readlines()
    with open(out_file, 'r', encoding='utf-8') as file:
        out_lines = file.readlines()

    samples = []

    for i in tqdm(range(len(in_lines))):
        sentence_en = in_lines[i].strip().lower()
        tokens = [normalizeString(s.strip()) for s in nltk.word_tokenize(sentence_en)]
        in_data = encode_text(src_char2idx, tokens)

        sentence_zh = out_lines[i].strip()
        tokens = jieba.cut(sentence_zh.strip())
        out_data = [sos_id] + encode_text(tgt_char2idx, tokens) + [eos_id]

        if len(in_data) < maxlen_in and len(out_data) < maxlen_out and unk_id not in in_data and unk_id not in out_data:
            samples.append({'in': in_data, 'out': out_data})
    return samples
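
# Hypothetical usage sketch (not from the original snippet): the corpus file names
# are assumptions; src_char2idx / tgt_char2idx and the special-token ids must
# already be defined as in the surrounding project.
if __name__ == '__main__':
    train_samples = get_data('data/train.en', 'data/train.zh')
    print('{} samples built'.format(len(train_samples)))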
Example #15
    def read_test_dataset(self, input_path, lock=True):
        with open(input_path) as f:
            data = json.load(f)
            self.no_samples = len(data)

            for i in tqdm(range(self.no_samples)):

                entry = data[str(i)]

                text = entry["text"]
                text = normalizeString(text)
                tokens = tokenize(text)
                self.stcs_literals.append(tokens)
                tokens_id = [
                    self.words_converter.T2id(id, lock=lock) for id in tokens
                ]
                tokens_id.append(self.words_converter.T2id('</s>'))
                self.stcs.append(tokens_id)
                self.lengths.append(len(tokens_id))

            self.X = self.stcs
Example #16
def build_samples():
    word_map_zh = json.load(open('data/WORDMAP_zh.json', 'r'))
    word_map_en = json.load(open('data/WORDMAP_en.json', 'r'))

    for usage in ['train', 'valid']:
        if usage == 'train':
            translation_path_en = os.path.join(train_translation_folder, train_translation_en_filename)
            translation_path_zh = os.path.join(train_translation_folder, train_translation_zh_filename)
            filename = 'data/samples_train.json'
        else:
            translation_path_en = os.path.join(valid_translation_folder, valid_translation_en_filename)
            translation_path_zh = os.path.join(valid_translation_folder, valid_translation_zh_filename)
            filename = 'data/samples_valid.json'

        print('loading {} texts and vocab'.format(usage))
        with open(translation_path_en, 'r') as f:
            data_en = f.readlines()

        with open(translation_path_zh, 'r') as f:
            data_zh = f.readlines()

        print('building {} samples'.format(usage))
        samples = []
        for idx in tqdm(range(len(data_en))):
            sentence_en = data_en[idx].strip().lower()
            tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
            input_en = encode_text(word_map_en, tokens)

            sentence_zh = data_zh[idx].strip()
            seg_list = jieba.cut(sentence_zh)
            output_zh = encode_text(word_map_zh, list(seg_list))

            if (len(input_en) <= max_len and len(output_zh) <= max_len
                    and UNK_token not in input_en and UNK_token not in output_zh):
                samples.append({'input': list(input_en), 'output': list(output_zh)})

        with open(filename, 'w') as f:
            json.dump(samples, f, indent=4)

        print('{} {} samples created at: {}.'.format(len(samples), usage, filename))
Example #17
        def gather_file(file_, max_len):
            en_sents, fra_sents, en_cut_count, fra_cut_count = [], [], 0, 0

            for sentences in open(file_):
                en_, fra_ = [
                    normalizeString(s) for s in sentences.strip().split('\t')
                ]

                en_ws = en_.strip().split()
                fra_ws = fra_.strip().split()

                if len(en_ws) > max_len:
                    en_cut_count += 1
                    en_ws = en_ws[:max_len]
                en_sents.append([WORD[BOS]] + en_ws + [WORD[EOS]])

                if len(fra_ws) > max_len:
                    fra_cut_count += 1
                    fra_ws = fra_ws[:max_len]
                fra_sents.append([WORD[BOS]] + fra_ws + [WORD[EOS]])

            return fra_sents, en_sents, fra_cut_count, en_cut_count
Example #18
	def _loadCachedArchiveNoteInfo(self,index):
		'''
			#2014.1.3
		'''
		cr = self.db.handle().cursor()
		sql = "select * from core_audiotemp where serial=?"
		cr.execute(sql,(index,))
		rs = fetchallDict(cr)
		if rs:
			r = rs[0]
			self.edtToneMemo.setText(r['memo'].decode('utf-8')) # memo recorded temporarily before the audio file is created
			self.edtProductId.setText(r['productid'].decode('utf-8'))
			self.cbxCurrOperator.setEditText( utils.normalizeString(r['operator']).decode('utf-8'))
			for n in range(self.cbxToneType.count()):
				attr = self.cbxToneType.itemData(n).toInt()[0]
				try:
					r['type'] = int(r['type'])
				except: r['type'] = 1

				if attr == int(r['type']):
					self.cbxToneType.setCurrentIndex(n)
					break
Example #19
    def sent2tenosr(self, sentence):
        max_len = self.args.max_word_len - 2
        sentence = normalizeString(sentence)
        words = [w for w in sentence.strip().split()]

        if len(words) > max_len:
            words = words[:max_len]

        words = [WORD[BOS]] + words + [WORD[EOS]]
        idx = [self.src_dict[w] if w in self.src_dict else UNK for w in words]

        idx_data = torch.LongTensor(idx)
        idx_position = torch.LongTensor(
            [pos_i + 1 if w_i != PAD else 0 for pos_i, w_i in enumerate(idx)])
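        # Note: Variable(..., volatile=True) is the legacy pre-0.4 PyTorch inference
        # API; on current PyTorch the equivalent is running under torch.no_grad().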
        idx_data_tensor = Variable(idx_data.unsqueeze(0), volatile=True)
        idx_position_tensor = Variable(idx_position.unsqueeze(0),
                                       volatile=True)

        if self.cuda:
            idx_data_tensor = idx_data_tensor.cuda()
            idx_position_tensor = idx_position_tensor.cuda()

        return idx_data_tensor, idx_position_tensor
Example #20
def readLangs(lang1, lang2, path):
    lines = open(path, 'r', encoding='utf-8').readlines()

    lang1_cls = Lang(lang1)
    lang2_cls = Lang(lang2)
    pairs = []
    for l in lines:
        l = l.split("\t")
        sentence1 = normalizeString(l[0])
        sentence2 = cht_to_chs(l[1])
        seg_list = jieba.cut(sentence2, cut_all=False)
        sentence2 = " ".join(seg_list)

        if len(sentence1.split(" ")) > MAX_LENGTH:
            continue
        if len(sentence2.split(" ")) > MAX_LENGTH:
            continue

        pairs.append([sentence1, sentence2])
        lang1_cls.addSentence(sentence1)
        lang2_cls.addSentence(sentence2)

    return lang1_cls, lang2_cls, pairs
Example #21
def analyze_en():
    translation_path = os.path.join(train_translation_folder,
                                    train_translation_en_filename)

    with open(translation_path, 'r') as f:
        sentences = f.readlines()

    sent_lengths = []

    for sentence in tqdm(sentences):
        sentence_en = sentence.strip().lower()
        tokens = [normalizeString(s) for s in nltk.word_tokenize(sentence_en)]
        # Record the English sentence length; the original used jieba segmentation
        # here, which is meant for Chinese text, not English
        sent_lengths.append(len(tokens))

    num_bins = 100
    n, bins, patches = plt.hist(sent_lengths,
                                num_bins,
                                facecolor='blue',
                                alpha=0.5)
    title = 'English Sentence Lengths Distribution'
    plt.title(title)
    plt.show()
Example #22
#     for pair in qa_pairs:
#         writer.writerow(pair)
# print('Done writing to file')

# # Visualise some lines
# datafile = os.path.join('data/cornell_movie_dialogs_corpus','formatted_movie_lines.txt')
# with open(datafile,'rb') as file:
#     lines = file.readlines()
# for line in lines[:8]:
#     print(line)

# Read the datafile and split into lines
print('Reading and processing file. \nPlease wait ...')
lines = open(datafile,encoding='utf-8').read().strip().split('\n')
# Split every line into pairs and normalize
pairs = [[utils.normalizeString(s) for s in pair.split('\t')] for pair in lines]
print('Done Reading!')

# Instantiate a vocabulary class
voc = Vocabulary('Cornell Movie-Dialogue Corpus')

pairs = utils.filterPairs(pairs,MAX_LENGTH = 10)
print('After filtering, there are {} conversation pairs'.format(len(pairs)))

# Loop through each pair and add them to the vocabulary
for pair in pairs:
    voc.addSentence(pair[0])
    voc.addSentence(pair[1])

print('Counted Words:',voc.num_words)
Example #23
	def showNote(self,note,allow_change_phone= False):
		'''
			note: {
				spx_digest, digest of the spx file
				spx_index,  index of the recording file
				calltype,   call type
				phone,      phone number
				duration,   duration
			}
		'''
		if self.timer.isActive():
			self.timer.stop()
			self.duration = 0
		# print '--'*20
		# print repr(note)
		self.show()
		self.raise_()
		self.allow_change = allow_change_phone
		self.cbxClientNames.setCurrentIndex(-1)
		self.cbxClientNames.clearEditText()
		self.cbxClientNames.clear()
		self.edtHistoryToneMemoList.clear()

		self.note = note
		cr = self.db.handle().cursor()
		sql=''
		digest = note.get('spx_digest')

		index = note.get('spx_index')
		calltype = note.get('calltype',0) # default: incoming call
		phone = note.get('phone','')
		duration = note.get('duration',0)
		if digest:  # the recording file already exists
			sql = 'select * from core_audiofile where digest=?'
			cr.execute(sql,(digest,))
			rs = fetchallDict(cr)
			if rs:
				r = rs[0]
				self.edtPhone.setText(r['phone'].decode('utf-8'))
				attr = u'未知'
				if r['attr'] == 0 :
					attr=u'来电'
				elif r['attr'] == 1:
					attr=u'去电'
				elif r['attr'] == 2:
					attr=u'录音'
				elif r['attr'] == 3:
					attr = u'未接'
				self.edtToneAttr.setText(attr)
				self.edtToneTime.setText(utils.formatTimeLength(r['duration']))
				self.edtToneMemo.setText(r['memo'].decode('utf-8'))

				for n in range(self.cbxToneType.count()):
					attr = self.cbxToneType.itemData(n).toInt()[0]
					try:
						r['type'] = int(r['type'])
					except: r['type'] = 1

					if attr == int(r['type']):
						self.cbxToneType.setCurrentIndex(n)
						break

				self.edtProductId.setText(r['productid'].decode('utf-8'))
				sid = r['client_sid']
				print 'current archive: client_sid=',sid,r['phone']
				self.loadClientHistoryMemo(sid) #2013.11.9
				# if sid:
				# 	sql = 'select * from core_client where sid=?'
				# 	cr.execute(sql,(sid,))
				# 	rs = fetchallDict(cr)
				# 	if rs:
				# 		name = rs[0]['name']
				# 		#self.edtClientName.setText(name.decode('utf-8'))
				# 		self.cbxClientNames.setEditText(name.decode('utf-8'))
				phone = r['phone']
				rr = r
				if not phone:
					phone='z*'*20
				sql = "select * from core_client where (phone1 = ? " \
			      "or phone2 =? or phone3 =?) and memo!='%s'"%AppConst.CLIENT_DELETED_MARKER
				cr.execute(sql,(phone,phone,phone))
				rs = fetchallDict(cr)
				n = 0
				idx = -1
				for r in rs:
					value = QStringList()
					value.append(r['sid'])
					value.append(r['name'])
					#value = '%s,~,%s'%(r['sid'],r['name'])
					self.cbxClientNames.addItem(r['name'].decode('utf-8'),value)

					if r['sid'] == sid:
						idx = n

					n+=1
				self.cbxClientNames.setCurrentIndex(idx)
				r = rr
				self.cbxCurrOperator.setEditText( utils.normalizeString(r['operator']).decode('utf-8'))

		if index!=None:
			# in-call state: show the elapsed call time
			self.edtToneTime.setText(utils.formatTimeLength(0))
			self.timer.start(1000*1)

			attr = u'未知'
			if calltype == 0 :
				attr=u'来电'
			elif calltype == 1:
				attr=u'去电'
			elif calltype == 2:
				attr=u'录音'
			elif calltype == 3:
				attr = u'未接'
			self.edtToneAttr.setText(attr)
			self.edtToneTime.setText(utils.formatTimeLength(duration))
			self.edtToneMemo.setText('')
			self.edtPhone.setText(phone.decode('utf-8'))
			self.edtProductId.setText('')

			# match the client name by phone number
			if not phone:
				phone='z*'*20

			# sql = "select * from core_client where phone1 like '%%%s%%' " \
			#       "or phone2 like '%%%s%%' or phone3 like '%%%s%%' "%(phone,phone,phone)
			sql = "select * from core_client where (phone1 = ? " \
			      "or phone2 =? or phone3 =?) and memo!='%s'"%AppConst.CLIENT_DELETED_MARKER
			cr.execute(sql,(phone,phone,phone))
			rs = fetchallDict(cr)
			idx = -1
			n = -1
			for r in rs:
				n+=1
				value = QStringList()
				value.append(r['sid'])
				value.append(r['name'])
				#value = '%s,~,%s'%(r['sid'],r['name'])
				self.cbxClientNames.addItem(r['name'].decode('utf-8'),value)
				if currrent_dial_out_csid == r['sid']:
					idx = n
			if rs:
				if idx == -1:
					idx = 0
				self.cbxClientNames.setCurrentIndex(idx)
			#2013.11.8
			#print rs
			self.edtHistoryToneMemoList.clear()
			for r in rs:
				self.loadClientHistoryMemo(r['sid'],False)

			#--2014.1.3
			self._loadCachedArchiveNoteInfo(index)
Example #24
    c = len(word_vector)
    word_vector = np.array(word_vector)
    word_vector = word_vector.reshape(c, 300)
    vec = Variable(torch.from_numpy(word_vector))
    return vec


model = model.LSTMClassifier()
model.load_state_dict(torch.load("./trained_model/sentiment.pt"))
model.eval()

print("fasttext loaded. Please enter your feedback\n")
while 1:
    feedback = input()
    if len(feedback) == 0:
        continue
    else:
        test = utils.normalizeString(feedback)
        length = len(test.split())
        test = prepare_wordvec(test.split())
        test = test.view(1, length, 300)
        k = nn.utils.rnn.pack_padded_sequence(test, [length], batch_first=True)
        tag_scores = model(k, 1)
        print("prediction: " + str(torch.Tensor.item(tag_scores.argmax())))
        print(torch.exp(tag_scores))
        cont = input("Continue?")
        if cont == "N" or cont == 'n' or cont.lower() == "no":
            break
        else:
            print("Please enter your feedback\n")

#if __name__ == " __main__":
#import pdb;pdb.set_trace();
encoder = EncoderRNN(N_word, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, CLASS_size, dropout_p=0.1, max_length=max_length).to(device)
n_iterations = train_df.shape[0]
#trainIters(encoder, decoder, n_iterations, print_every=50, plot_every=10)
import pdb;pdb.set_trace();
trainIters(encoder, decoder, 1, print_every=50, plot_every=10)
sentence = train_df.iloc[0]["description"]
sentence = normalizeString(sentence)
input_tensor = embeddedTensorFromSentence(sentence,device,word_emb,N_word)
target_class = train_df.iloc[0]["department_new"]
class_index = []
target_index = class_dict[target_class]
print(target_index)
#y_true.append(target_index)
output, attention = evaluate(encoder, decoder, input_tensor,max_length,device)
#import pdb;pdb.set_trace();
topv, topi = output.topk(1)
#import pdb;pdb.set_trace();
#torch.save(encoder.state_dict(), "encoder")
#torch.save(decoder.state_dict(), "decoder")
#encoder = torch.load("encoder")
#decoder = torch.load("decoder")
#desc1 = full_table.iloc[0]["description"]
    def showNote(self, note, allow_change_phone=False):
        '''
            note: {
                spx_digest, digest of the spx file
                spx_index,  index of the recording file
                calltype,   call type
                phone,      phone number
                duration,   duration
            }
        '''
        if self.timer.isActive():
            self.timer.stop()
            self.duration = 0
        # print '--'*20
        # print repr(note)
        self.show()
        self.raise_()
        self.allow_change = allow_change_phone
        self.cbxClientNames.setCurrentIndex(-1)
        self.cbxClientNames.clearEditText()
        self.cbxClientNames.clear()
        self.edtHistoryToneMemoList.clear()

        self.note = note
        cr = self.db.handle().cursor()
        sql = ''
        digest = note.get('spx_digest')

        index = note.get('spx_index')
        calltype = note.get('calltype', 0)  # default: incoming call
        phone = note.get('phone', '')
        duration = note.get('duration', 0)
        if digest:  # the recording file already exists
            sql = 'select * from core_audiofile where digest=?'
            cr.execute(sql, (digest, ))
            rs = fetchallDict(cr)
            if rs:
                r = rs[0]
                self.edtPhone.setText(r['phone'].decode('utf-8'))
                attr = u'未知'
                if r['attr'] == 0:
                    attr = u'来电'
                elif r['attr'] == 1:
                    attr = u'去电'
                elif r['attr'] == 2:
                    attr = u'录音'
                elif r['attr'] == 3:
                    attr = u'未接'
                self.edtToneAttr.setText(attr)
                self.edtToneTime.setText(utils.formatTimeLength(r['duration']))
                self.edtToneMemo.setText(r['memo'].decode('utf-8'))

                for n in range(self.cbxToneType.count()):
                    attr = self.cbxToneType.itemData(n).toInt()[0]
                    try:
                        r['type'] = int(r['type'])
                    except:
                        r['type'] = 1

                    if attr == int(r['type']):
                        self.cbxToneType.setCurrentIndex(n)
                        break

                self.edtProductId.setText(r['productid'].decode('utf-8'))
                sid = r['client_sid']
                print 'current archive: client_sid=', sid, r['phone']
                self.loadClientHistoryMemo(sid)  #2013.11.9
                # if sid:
                # 	sql = 'select * from core_client where sid=?'
                # 	cr.execute(sql,(sid,))
                # 	rs = fetchallDict(cr)
                # 	if rs:
                # 		name = rs[0]['name']
                # 		#self.edtClientName.setText(name.decode('utf-8'))
                # 		self.cbxClientNames.setEditText(name.decode('utf-8'))
                phone = r['phone']
                rr = r
                if not phone:
                    phone = 'z*' * 20
                sql = "select * from core_client where (phone1 = ? " \
                     "or phone2 =? or phone3 =?) and memo!='%s'"%AppConst.CLIENT_DELETED_MARKER
                cr.execute(sql, (phone, phone, phone))
                rs = fetchallDict(cr)
                n = 0
                idx = -1
                for r in rs:
                    value = QStringList()
                    value.append(r['sid'])
                    value.append(r['name'])
                    #value = '%s,~,%s'%(r['sid'],r['name'])
                    self.cbxClientNames.addItem(r['name'].decode('utf-8'),
                                                value)

                    if r['sid'] == sid:
                        idx = n

                    n += 1
                self.cbxClientNames.setCurrentIndex(idx)
                r = rr
                self.cbxCurrOperator.setEditText(
                    utils.normalizeString(r['operator']).decode('utf-8'))

        if index != None:
            # in-call state: show the elapsed call time
            self.edtToneTime.setText(utils.formatTimeLength(0))
            self.timer.start(1000 * 1)

            attr = u'未知'
            if calltype == 0:
                attr = u'来电'
            elif calltype == 1:
                attr = u'去电'
            elif calltype == 2:
                attr = u'录音'
            elif calltype == 3:
                attr = u'未接'
            self.edtToneAttr.setText(attr)
            self.edtToneTime.setText(utils.formatTimeLength(duration))
            self.edtToneMemo.setText('')
            self.edtPhone.setText(phone.decode('utf-8'))
            self.edtProductId.setText('')

            # match the client name by phone number
            if not phone:
                phone = 'z*' * 20

            # sql = "select * from core_client where phone1 like '%%%s%%' " \
            #       "or phone2 like '%%%s%%' or phone3 like '%%%s%%' "%(phone,phone,phone)
            sql = "select * from core_client where (phone1 = ? " \
                  "or phone2 =? or phone3 =?) and memo!='%s'"%AppConst.CLIENT_DELETED_MARKER
            cr.execute(sql, (phone, phone, phone))
            rs = fetchallDict(cr)
            idx = -1
            n = -1
            for r in rs:
                n += 1
                value = QStringList()
                value.append(r['sid'])
                value.append(r['name'])
                #value = '%s,~,%s'%(r['sid'],r['name'])
                self.cbxClientNames.addItem(r['name'].decode('utf-8'), value)
                if currrent_dial_out_csid == r['sid']:
                    idx = n
            if rs:
                if idx == -1:
                    idx = 0
                self.cbxClientNames.setCurrentIndex(idx)
            #2013.11.8
            #print rs
            self.edtHistoryToneMemoList.clear()
            for r in rs:
                self.loadClientHistoryMemo(r['sid'], False)

            #--2014.1.3
            self._loadCachedArchiveNoteInfo(index)
Example #27
    def read_training_dataset(self, input_path):
        with open(input_path) as f:

            data = json.load(f)
            self.no_samples = len(data)

            # for padding.
            self.words_converter.T2id('<PAD>')

            self.words_converter.T2id('<SOS>')

            self.slots_converter.T2id('<PAD>')
            self.slots_converter.T2id('<SOS>')

            self.slots_converter.T2id('-')

            for i in tqdm(range(self.no_samples)):

                entry = data[str(i)]

                text = entry["text"]
                text = normalizeString(text)
                tokens = tokenize(text)
                self.stcs_literals.append(tokens)
                tokens_id = [self.words_converter.T2id(id) for id in tokens]
                tokens_id.append(self.words_converter.T2id('</s>'))
                self.stcs.append(tokens_id)
                self.lengths.append(len(tokens_id))

                intent = entry["intent"]

                self.intents.append(self.intent_converter.T2id(intent))

                slots_dictionary = entry["slots"]
                # +1 to make room for <SOS>
                slots_id = [self.slots_converter.T2id('-')] * len(tokens_id)
                slots_id[0] = self.slots_converter.T2id('<SOS>')

                no_slots_in_stc = 0
                for slot, target_words in slots_dictionary.items():
                    target_words = normalizeString(target_words)
                    target_word_list = tokenize(target_words)
                    for word in target_word_list:
                        no_slots_in_stc += 1
                        try:
                            idx = tokens.index(word)
                        except:
                            idx = [
                                i for i, s in enumerate(tokens) if word in s
                            ][0]

                        # +1 account for <SOS>
                        slots_id[idx + 1] = self.slots_converter.T2id(slot)

                # count '-' once for each token without a slot (keeps the converter's frequency counts)
                for j in range(len(tokens_id) - no_slots_in_stc):
                    self.slots_converter.T2id('-')

                self.slots.append(slots_id)
                # self.slots.append(torch.tensor(slots_id, dtype=torch.long, device=self.device))

            # width needed for padding: length of the longest sentence

            ncols = max(self.lengths)

            self.X = self.stcs
            self.Y = self.slots