def qu_words_detection(sen):
    """Detect misused Chinese quantifiers (measure words) in a sentence.

    :param sen: raw sentence string; re-bound below to its segmented word list.
    :return: list of detection dicts, each with ``location`` (cumulative
             character offset of the preceding words), ``word`` (the suspect
             character/word) and ``correction`` (one randomly chosen valid
             quantifier with ``score: None``).
    """
    # Segment into a word list (mode=True presumably selects a finer
    # segmentation mode -- confirm against segmentation.seg).
    sen = segmentation.seg(sen, mode=True)
    ans = []
    for w in Quantifier:
        if w in sen:
            # Only the first occurrence of w is examined (list.index).
            w_idx = sen.index(w)
            if w_idx == 0:
                # Quantifier at sentence start: nothing precedes it to check.
                pass
            else:
                if w_idx != 0 and len(sen[w_idx - 1]) > 1:
                    # Preceding word has >= 2 chars: check its last two chars,
                    # e.g. a "numeral + quantifier" fused into one word.
                    first = sen[w_idx - 1][-2]
                    second = sen[w_idx - 1][-1]
                    if first in QuantifierWords:
                        if second in Quantifier[w]:
                            # Quantifier is valid for w: no error.
                            continue
                        else:
                            # Wrong quantifier: report the character offset of
                            # all words before w.
                            word_len = 0
                            for i in range(w_idx):
                                word_len += len(sen[i])
                            detect = {
                                "location": word_len,
                                "word": second,
                                "correction": [{
                                    "word": random.choice(Quantifier[w]),
                                    "score": None
                                }]
                            }
                            ans.append(detect)
                else:
                    # Preceding word is a single char: the numeral (if any) is
                    # the word before it.
                    if w_idx > 1 and (sen[w_idx - 2][-1] in QuantifierWords):
                        if len(sen[w_idx - 1]) == 1 and (
                                sen[w_idx - 1] in Quantifier[w]):
                            # Single-char quantifier is valid for w.
                            pass
                        elif len(sen[w_idx - 1]) == 1 and (
                                sen[w_idx - 1] not in Quantifier[w]):
                            word_len = 0
                            for i in range(w_idx):
                                word_len += len(sen[i])
                            detect = {
                                "location": word_len,
                                "word": sen[w_idx - 1],
                                "correction": [{
                                    "word": random.choice(Quantifier[w]),
                                    "score": None
                                }]
                            }
                            ans.append(detect)
    return ans
def sentenceScorer(sentence):
    """Score a sentence with the language model.

    The language model expects its input pre-segmented, with words separated
    by single spaces, so the raw sentence is run through the segmenter first.

    :param sentence: raw (unsegmented) sentence string
    :return: language-model score of the space-joined segmentation
    """
    tokens = segmentation.seg(sentence)
    return LmModel.score(" ".join(tokens))
def replaceWord(sentence):
    """Detect wrong characters by swapping whole words for pinyin-confusable
    candidates and re-scoring the sentence with the language model.

    :param sentence: raw sentence string.
    :return: list of detection dicts; ``location`` is the 1-based character
             position of the differing character, ``correction`` carries the
             winning candidate character and its sentence score.
    """
    ans = []
    sen_list = segmentation.seg(sentence, mode=True)
    old_score = sentenceScorer(sentence)
    # Working copy that gets one word swapped at a time.
    temp_sen_list = sen_list[:]
    for idx in range(len(sen_list)):
        pinyin = PIN.get_pinyin(sen_list[idx], "")
        local_score = -10000  # best score among this word's candidates
        # Only multi-char words with a known pinyin confusion set are tried.
        if len(sen_list[idx]) > 1 and pinyin in pinyinConfusionset:
            for candi in pinyinConfusionset[pinyin]:
                temp_sen_list[idx] = candi
                current_score = sentenceScorer("".join(temp_sen_list))
                if current_score > local_score:
                    local_score = current_score
                    replace_word = candi
                    replace_score = current_score
            # Require a clear margin (> 4 -- presumably an empirically tuned
            # threshold) over the original sentence before reporting.
            if local_score - old_score > 4:
                # Character offset of everything before this word.
                word_len = 0
                for i in range(idx):
                    word_len += len(sen_list[i])
                # Only same-length replacements can be reported char-by-char.
                if len(sen_list[idx]) == len(replace_word):
                    for i in range(len(replace_word)):
                        if replace_word[i] != sen_list[idx][i]:
                            detect = {
                                "location": word_len + i + 1,
                                "sentence_score": old_score,
                                "word": sen_list[idx][i],
                                "correction": [{
                                    "word": replace_word[i],
                                    "score": replace_score
                                }]
                            }
                            ans.append(detect)
        # Undo this word's swap before moving on.
        temp_sen_list = sen_list[:]
    return ans
data = data[msg_size:] #pick_data=pickle.loads(frame_data, fix_imports=True, encoding="bytes") pick_data=pickle.loads(frame_data) #print(frame) #print(np.load(image)) print(pick_data) print(len(pick_data)) image = cv2.imdecode(pick_data[1], cv2.IMREAD_COLOR) #image = cv2.imdecode(pick_data, -1) cv2.imwrite("output1.png",image) break img = seg('/home/lab05/kaggle_dir/project_socket_server/output1.png') # print(answer) print(img.shape) print(type(img)) # img = cv2.imread(answer) print("사진크기:{}".format(img.shape)) encode_param = [int(cv2.IMWRITE_PNG_COMPRESSION), 9] img = cv2.imencode('.png', img, encode_param) data = pickle.dumps(img, protocol=3) # data = pickle.dumps(img, 0) size = len(data) conn.sendall(struct.pack(">L", size) + data) # cv2.imwrite("output2.png",answer)
ha='center', va='bottom') else: ax.text(rect.get_x() + rect.get_width() / 2., 1.02 * height, '%d' % int(height), ha='center', va='bottom') segs = [[1, 10], [10, 100], [100, 1000], [1000, 10000], [10000, 100000], [ 100000, ]] ret = segmentation.seg(seg(), segs) N = len(segs) menStd = (0) * N ind = np.arange(N) * 1 # the x locations for the groups width = 0.55 # the width of the bars fig, ax = plt.subplots() rects1 = ax.bar(ind, ret, width, color='grey', align='center') #rects2 = ax.bar(ind+1*width, z2, width, color='y', yerr=menStd) ax.set_ylabel(u'用户数(人/次)') ax.set_xlabel(u'应还款额(RMB)') ax.set_title(u'卡牛区间统计') ax.set_xticks(ind)
def autolabel(rects):
    """Attach a value label above each bar; values above 1000 are shown in K."""
    for rect in rects:
        height = rect.get_height()
        if(height > 1000):
            # e.g. 12345 -> "12K"
            ax.text(rect.get_x()+rect.get_width()/2., 1.02*height,
                    '%.0fK'%float(height/1000.00),
                    ha='center', va='bottom')
        else:
            ax.text(rect.get_x()+rect.get_width()/2., 1.02*height,
                    '%d'%int(height),
                    ha='center', va='bottom')

# Repayment-amount buckets; the last bucket [100000,] is open-ended.
segs = [[1,10],[10,100],[100,1000],[1000,10000],[10000,100000],[100000,]]
# Bucket counts produced by the project's segmentation helper.
ret = segmentation.seg(seg(),segs)
N = len(segs)
# NOTE(review): (0)*N is the integer 0, not a sequence -- probably meant
# (0,)*N or [0]*N. Only used by the commented-out rects2 below, so harmless
# as-is, but fix before reviving that line.
menStd = (0)*N
ind = np.arange(N)*1  # the x locations for the groups
width = 0.55  # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, ret, width, color='grey',align='center')
#rects2 = ax.bar(ind+1*width, z2, width, color='y', yerr=menStd)
ax.set_ylabel(u'用户数(人/次)')
ax.set_xlabel(u'应还款额(RMB)')
ax.set_title(u'卡牛区间统计')
ax.set_xticks(ind)
def replaceOneCharacter(idx, sen_list, sentence):
    """Try to correct the single-character word ``sen_list[idx]`` using its
    confusion set, scoring each candidate with the language model.

    :param idx: index of the suspect word in ``sen_list``
    :param sen_list: segmented sentence (list of words)
    :param sentence: the original raw sentence string
    :return: tuple ``(replacement_ch, word, candi_top, old_score,
             double_check, word_deviation, max_score)`` when a better-scoring
             replacement exists, otherwise ``None``.
    """
    # Bias the baseline upward so a candidate must beat the original by a
    # clear margin (3.7 -- presumably empirically tuned; confirm).
    old_score = sentenceScorer(sentence) + 3.7
    double_check = False
    word = sen_list[idx]
    word_next_dict = {}
    word_deviation = 0
    # See whether the same word re-occurs within the next few words; if so,
    # both occurrences get replaced together ("double check").
    if idx < len(sen_list) - 1:
        temp_count = 1
        check_len = 0  # number of characters scanned so far
        while temp_count < 4 and (
                idx + temp_count) < len(sen_list) - 1 and check_len <= 4:
            check_len += len(sen_list[idx + temp_count])
            temp_count += 1
            # NOTE(review): this records sen_list[idx + temp_count] AFTER the
            # increment, so sen_list[idx + 1] is never recorded -- looks like
            # an off-by-one; left unchanged pending confirmation.
            word_next_dict[sen_list[idx + temp_count]] = temp_count
        for word_next in word_next_dict:
            if word == word_next:
                double_check = True
                word_deviation = word_next_dict[word_next]
                break
    candis = dict()
    try:
        candidates = Confusionset[word]
    except KeyError:
        # Word has no confusion set: nothing to propose. (Was a bare
        # `except:`, which also hid unrelated errors such as NameError.)
        return None
    max_score = -100000
    replacement_ch = ""
    for ch in candidates:
        temp_list = sen_list[:]
        temp_list[idx] = ch
        if double_check:
            # Replace the repeated occurrence as well.
            temp_list[idx + word_deviation] = ch
        new_score = sentenceScorer("".join(temp_list))
        # Reward replacements that make the sentence segment into fewer
        # words (a sign the characters now form a known word).
        if len(segmentation.seg(sentence)) > len(
                segmentation.seg("".join(temp_list))):
            new_score += 5.4
        # Penalize near-ties so marginal gains are not reported.
        if abs(new_score - old_score) <= 1:
            new_score -= 4
        if new_score > old_score:
            candis[new_score] = ch
        if new_score > max_score:
            max_score = new_score
            replacement_ch = ch
    if max_score > old_score:
        candi_top = []
        # Keep only the single best-scoring candidate (slice of length 1).
        for i in sorted(candis, reverse=True)[:min(1, len(candis))]:
            candi_top.append({"word": candis[i], "score": i})
        return replacement_ch, word, candi_top, old_score, double_check, word_deviation, max_score
    else:
        return None
def detectSentence(sentence):
    """Run the full error-detection pipeline on one sentence.

    Combines single-character confusion-set replacement
    (``replaceOneCharacter``), the ``TeacherWord`` character map, quantifier
    checks (``qu_words_detection``) and whole-word pinyin replacement
    (``replaceWord``).

    :param sentence: raw sentence string.
    :return: dict with ``"error"`` (list of detection dicts) and
             ``"time_usage"`` (seconds spent in the character pass, as a
             4-decimal string).
    """
    t1 = time.time()
    final_result = dict()
    sen_list = segmentation.seg(sentence)
    temp_sen_list = sen_list[:]
    error = {"error": []}
    local_res = None  # best pending replaceOneCharacter result, not yet flushed
    skip_index = []  # word indices already covered by a double-check replacement
    location = 1  # 1-based character position of the current word
    for idx in range(len(temp_sen_list)):
        # The segmenter returns words, so the character location must be
        # accumulated from the lengths of the preceding words.
        if idx == 0:
            location = 1
        else:
            location += len(temp_sen_list[idx - 1])
        word = temp_sen_list[idx]
        # BUGFIX: the original tested `index in skip_index` where `index` was
        # initialised to 0 and never updated, so the double-check skip never
        # fired (and word 0 could be skipped spuriously). Use the loop index.
        if idx in skip_index:
            # If double check is True, we will not continue to detect the
            # following 3 words.
            continue
        # NOTE(review): detection is gated on `isPunc(word)` being truthy --
        # confirm isPunc's semantics (name suggests the opposite test).
        if len(word) == 1 and isPunc(
                word) and word not in SkipWord and word not in SingleWord:
            if LmModel.score(word) <= -6.2:
                ans = replaceOneCharacter(idx, temp_sen_list, sentence)
                # Keep only the highest-scoring pending detection;
                # ans[-1] is replaceOneCharacter's max_score.
                if ans is not None and (local_res is None
                                        or ans[-1] > local_res[-1]):
                    error_location = location
                    local_res = copy.deepcopy(ans)
                    error_idx = idx
        else:
            # A word that is not a suspect single character ends the current
            # run: flush the pending detection, if any.
            if local_res is not None:
                detect_word = local_res[1]
                tops = local_res[2]
                double_check = local_res[4]
                word_deviation = local_res[5]
                old_score = local_res[3]
                if not double_check:
                    detect = {
                        "location": error_location,
                        "sentence_score": old_score,
                        "word": detect_word,
                        "correction": tops
                    }
                    error["error"].append(detect)
                else:
                    # Same wrong character occurred twice: report both
                    # positions and skip the words in between.
                    detect1 = {
                        "location": error_location,
                        "sentence_score": old_score,
                        "word": detect_word,
                        "correction": tops
                    }
                    detect2 = {
                        "location": error_location + sum([
                            len(temp_sen_list[k])
                            for k in range(error_idx, error_idx + word_deviation)
                        ]),
                        "sentence_score": old_score,
                        "word": detect_word,
                        "correction": tops
                    }
                    error["error"].append(detect1)
                    error["error"].append(detect2)
                    skip_index.extend([
                        k for k in range(error_idx, error_idx + word_deviation)
                    ])
                local_res = None
        if word in TeacherWord:
            temp_location = location
            for i in range(len(word)):
                # NOTE(review): `+= i` accumulates 0,1,3,6,... rather than a
                # plain per-character offset (location + i) -- confirm intent.
                temp_location += i
                # NOTE(review): indexes TeacherWord by single character; will
                # raise KeyError if a char of `word` is not itself a key.
                if word[i] != TeacherWord[word[i]]:
                    detect = {
                        "location": temp_location,
                        "word": word[i],
                        "correction": [{
                            "word": TeacherWord[word[i]],
                            "score": None
                        }]
                    }
                    error["error"].append(detect)
    # Flush a detection still pending when the sentence ended.
    if local_res is not None:
        detect_word = local_res[1]
        tops = local_res[2]
        double_check = local_res[4]
        word_deviation = local_res[5]
        old_score = local_res[3]
        if not double_check:
            detect = {
                "location": error_location,
                "sentence_score": old_score,
                "word": detect_word,
                "correction": tops
            }
            error["error"].append(detect)
        else:
            detect1 = {
                "location": error_location,
                "sentence_score": old_score,
                "word": detect_word,
                "correction": tops
            }
            detect2 = {
                "location": error_location + sum([
                    len(temp_sen_list[k])
                    for k in range(error_idx, error_idx + word_deviation)
                ]),
                "sentence_score": old_score,
                "word": detect_word,
                "correction": tops
            }
            error["error"].append(detect1)
            error["error"].append(detect2)
    t2 = time.time()
    qu_res = qu_words_detection(sentence)
    if qu_res:
        error["error"].extend(qu_res)
    word_res = replaceWord(sentence)
    if len(word_res) > 0:
        error["error"].extend(word_res)
    time_usage = {"time_usage": "%.4f" % (t2 - t1)}
    final_result = dict(final_result, **error)
    final_result = dict(final_result, **time_usage)
    return final_result