def MainFunction():
    #CRIEA Start!
    #Clear any previous output directory
    if os.path.exists("Output/"):
        if Init.SystemJudge() == 0:
            os.system("rm -r Output")
        else:
            os.system("rmdir /s /q Output")
    NameArr = Pretreatment.FigureInput(1)
    try:
        if NameArr == -1:
            return
    except:
        pass
    #Figure traversal
    for kase in range(0, len(NameArr)):
        img = np.array(Image.open(NameArr[kase]).convert("L"))
        img = Pretreatment.BFSmooth(img)
        [Tobimg, NodeInfo] = Algorithm.Toboggan(img)
        [Upground, Background] = Algorithm.HandSeed(Tobimg, img, Surround)
        Seeds = Upground | Background
        ProbBlock = []
        VarL = 0
        if Method == "Lap":
            NodeInfo, VarL = Functions.SeedFirst(NodeInfo, Seeds)
            LapEqu = Algorithm.Laplacian(NodeInfo, VarL)
            ProbBlock = Functions.LinearEquation(LapEqu, len(NodeInfo) - VarL, VarL)
def NEW_PTM_WKNN(data_set):
    train, test = Pretreatment.partitioningTPMKNN(data_set)
    normalized_train_matrix = Pretreatment.normalization(train)
    normalized_test_matrix = Pretreatment.normalization(test)
    klist = np.zeros((normalized_train_matrix.shape[0], 1))
    predictions = np.zeros((normalized_test_matrix.shape[0], 1))
    for unknowid in range(0, len(normalized_train_matrix)):
        # Attach a local k label to each training sample
        klist[unknowid] = PTM_KNN.trainlocal_k(normalized_train_matrix, unknowid)
    new_normalized_train_matrix = np.column_stack(
        (normalized_train_matrix, klist))
    for unknowid_text in range(0, len(normalized_test_matrix)):
        # Take the largest local k among the test sample's three nearest
        # points as the best k
        best_c = bestc(new_normalized_train_matrix, normalized_test_matrix,
                       unknowid_text, data_set)  # search for the c value
        new_k1 = PTM_KNN.testlocal_k(new_normalized_train_matrix,
                                     normalized_test_matrix, unknowid_text,
                                     best_c)  # choose k through the c value
        predictions[unknowid_text] = WKNN.weighted_knn(
            unknowid_text, normalized_train_matrix, int(new_k1),
            normalized_test_matrix)  # predict the class with the highest weight among the k neighbours
    TP, FP, FN, TN = analysis.TPFPFNTN(normalized_test_matrix, predictions)
    return (analysis.recall(TP, FN), analysis.F_score(TP, FP, FN),
            analysis.G_mean(TP, FN, TN, FP), TP, FN, TN, FP)
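# Usage sketch for NEW_PTM_WKNN (hypothetical: a loader named load_data_set
# is assumed here for illustration; substitute whatever loader this repo
# actually uses to obtain the raw data matrix):
#
# data_set = Pretreatment.load_data_set("ecoli.csv")   # hypothetical loader
# recall, f_score, g_mean, TP, FN, TN, FP = NEW_PTM_WKNN(data_set)
# print("recall=%.3f  F=%.3f  G-mean=%.3f" % (recall, f_score, g_mean))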
def BWError():
    if os.path.exists("Output/"):
        if Init.SystemJudge() == 0:
            os.system("rm -r Output")
        else:
            os.system("rmdir /s /q Output")
    NameArr = Pretreatment.FigureInput(1)
    try:
        if NameArr == -1:
            return
    except:
        pass
    #Figure traversal
    for kase in range(0, len(NameArr)):
        img = np.array(Image.open(NameArr[kase]).convert("L"))
        """
        #Disabled: sprinkle Gaussian noise over ~2% of the pixels
        for i in range(0, len(img)):
            for j in range(0, len(img[i])):
                if random.randint(1, 50) == 1:
                    img[i][j] += np.random.normal(img[i][j], 64)
                    img[i][j] = max(0, img[i][j])
                    img[i][j] = min(255, img[i][j])
        """
        Name = "Figure_"
        Name += str(Init.GetTime())
        Name += ".png"
        Pretreatment.Output(img, Name, 2)
def bestc(normalized_train_matrix, normalized_test_matrix, unknowid, data_set):
    unknow_instance = normalized_test_matrix[unknowid, :-1]  # the unknown sample
    unknow_distance = distance.e_distance_calculation(
        unknow_instance, normalized_train_matrix[:, :-2])
    label_vector = normalized_train_matrix[:, -1]
    # Pair each distance with its local-k label, then sort by distance
    labeled_distance = np.column_stack((unknow_distance, label_vector))
    sorted_labeled_distance = labeled_distance[
        labeled_distance[:, 0].argsort()]
    max_k = -1  # renamed from 'max' to avoid shadowing the builtin
    real_accuracy = 0
    best_c = 1
    for cnumber in range(1, 3):  # candidate c values
        for c in range(0, cnumber):  # largest local k among the c nearest neighbours
            if sorted_labeled_distance[c][1] > max_k:
                max_k = sorted_labeled_distance[c][1]
        a = 0
        for k in range(1, 11):  # ten-fold cross validation
            train, test = Pretreatment.partitioning(data_set, k)  # ten-fold split
            normalized_train_matrix = Pretreatment.normalization(train)
            normalized_test_matrix = Pretreatment.normalization(test)
            predictions = np.zeros(
                (normalized_test_matrix.shape[0], 1))  # prediction buffer
            for unknowid_text in range(0, len(normalized_test_matrix)):
                predictions[unknowid_text] = WKNN.weighted_knn(
                    unknowid_text, normalized_train_matrix, int(max_k),
                    normalized_test_matrix)
            accuracy = analysis.getAccuracy(normalized_test_matrix, predictions)
            a = accuracy + a
        thistest_accuracy = float(a / 10)
        if thistest_accuracy > real_accuracy:  # keep the c with the best cross-validated accuracy
            real_accuracy = thistest_accuracy
            best_c = cnumber
    return best_c
def overtrace_area(temp1, temp2):  # overlap region of two strokes
    d1 = Pretreatment.distance(temp1[0][0], temp1[0][1], temp2[0][0], temp2[0][1])
    d2 = Pretreatment.distance(temp1[0][0], temp1[0][1], temp2[1][0], temp2[1][1])
    if d1 < d2:
        c = temp2[0]
        a = find_minpoint(c, temp1)
    else:
        a = temp1[0]
        c = find_minpoint(a, temp2)
    d3 = Pretreatment.distance(temp1[len(temp1) - 1][0], temp1[len(temp1) - 1][1],
                               temp2[len(temp2) - 1][0], temp2[len(temp2) - 1][1])
    d4 = Pretreatment.distance(temp1[len(temp1) - 1][0], temp1[len(temp1) - 1][1],
                               temp2[len(temp2) - 2][0], temp2[len(temp2) - 2][1])
    if d3 > d4:
        b = temp1[len(temp1) - 1]
        d = find_minpoint(b, temp2)
    else:
        d = temp2[len(temp2) - 1]
        b = find_minpoint(d, temp1)
    temp = [a, b, c, d]
    # print("The four corner points:", temp)
    return temp
def extend(temp1, temp2):  # merge two strokes of the extension type
    d1 = Pretreatment.distance(temp1[0][0], temp1[0][1], temp2[0][0], temp2[0][1])
    d2 = Pretreatment.distance(temp1[0][0], temp1[0][1], temp2[1][0], temp2[1][1])
    if d1 < d2:
        ftemp = temp1
        btemp = temp2
    else:
        ftemp = temp2
        btemp = temp1
    point = overtrace_area(ftemp, btemp)
    a = ftemp.index(point[0])
    b = ftemp.index(point[1])
    c = btemp.index(point[2])
    d = btemp.index(point[3])
    # print("a,b,c,d", a, b, c, d)
    if a > b:
        a, b = b, a
    if c > d:
        c, d = d, c
    # x = (point[0][0] + point[1][0] + point[2][0] + point[3][0]) / 4
    # y = (point[0][1] + point[1][1] + point[2][1] + point[3][1]) / 4
    # p1 = ftemp[a:b + 1]
    # p2 = btemp[c:d + 1]
    # # Find the points on the two strokes closest to the centre of the overlap region
    # n = find_minpoint([x, y], p1)
    # m = find_minpoint([x, y], p2)
    # i = ftemp.index(n)
    # j = btemp.index(m)
    # ntemp1 = ftemp[0:i + 1]
    # ntemp2 = btemp[j:len(btemp)]
    # mx, my = (n[0] + m[0]) / 2, (n[1] + m[1]) / 2
    # new_temp1 = Chord_weighting(ntemp1, [mx, my])  # new data points by chord-length weighting
    # new_temp1.append([mx, my])
    # ntemp2.reverse()
    # new_temp2 = Chord_weighting(ntemp2, [mx, my])
    # new_temp2.reverse()
    # new_temp = new_temp1 + new_temp2
    new_temp = []
    for i in range(a + 1):
        new_temp.append(ftemp[i])
    Mtemp = btemp[c:d + 1]
    Mtemp.reverse()
    ntemp = Chord_weighting(Mtemp, point[0])
    i = len(ntemp) - 1
    while i >= 0:
        new_temp.append(ntemp[i])
        i -= 1
    for i in range(d, len(btemp)):
        new_temp.append(btemp[i])
    print("new_temp", new_temp)
    return new_temp
def WKNN(data_set, best_k):
    train, test = Pretreatment.partitioningTPMKNN(data_set)
    normalized_train_matrix = Pretreatment.normalization(train)
    normalized_test_matrix = Pretreatment.normalization(test)
    predictions = np.zeros((normalized_test_matrix.shape[0], 1))
    for unknowid_text in range(0, len(normalized_test_matrix)):
        predictions[unknowid_text] = weighted_knn(unknowid_text,
                                                  normalized_train_matrix,
                                                  int(best_k),
                                                  normalized_test_matrix)
    TP, FP, FN, TN = analysis.TPFPFNTN(normalized_test_matrix, predictions)
    return (analysis.recall(TP, FN), analysis.F_score(TP, FP, FN),
            analysis.G_mean(TP, FN, TN, FP), TP, FN, TN, FP)
def the_direction_of_the_stroke(temp1, temp2):
    # Judge stroke direction from how the head/tail of the overlap region changes
    d1 = Pretreatment.distance(temp1[0][0], temp1[0][1], temp2[0][0], temp2[0][1])
    d2 = Pretreatment.distance(temp1[0][0], temp1[0][1],
                               temp2[len(temp2) - 1][0], temp2[len(temp2) - 1][1])
    return d1 < d2
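# Worked example for the_direction_of_the_stroke (a sketch; assumes
# Pretreatment.distance(x1, y1, x2, y2) is plain Euclidean distance):
# two strokes laid head-to-head point the same way, head-to-tail do not.
#
# temp1 = [[0, 0], [1, 0], [2, 0]]
# temp2 = [[0, 1], [1, 1], [2, 1]]                         # same direction as temp1
# print(the_direction_of_the_stroke(temp1, temp2))         # True: heads are closer
# print(the_direction_of_the_stroke(temp1, temp2[::-1]))   # False: reversed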
def Three_point_chord_weighting(p1, p2, p3):  # tangent direction at p2
    p1 = np.array(p1)
    p2 = np.array(p2)
    p3 = np.array(p3)
    v1 = p2 - p1
    v2 = p3 - p2
    l1 = Pretreatment.distance(p1[0], p1[1], p2[0], p2[1])
    l2 = Pretreatment.distance(p2[0], p2[1], p3[0], p3[1])
    nv1 = np.linalg.norm(v1)
    nv2 = np.linalg.norm(v2)
    # Chord-weighted blend of the two unit chord directions (the original used
    # v2 / nv2 in both terms, leaving v1 unused, which looks like a typo)
    t = (l1 / l2) * (v1 / nv1) + (l2 / l1) * (v2 / nv2)
    # print("tangent direction", t)
    return t
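# Quick numeric check for Three_point_chord_weighting (sketch; again assumes
# Pretreatment.distance is Euclidean). For three collinear points the blended
# tangent must point along the line:
#
# t = Three_point_chord_weighting([0, 0], [1, 0], [3, 0])
# # l1 = 1, l2 = 2, both unit chords are (1, 0), so
# # t = (1/2)*(1, 0) + (2/1)*(1, 0) = (2.5, 0) -- direction (1, 0) as expected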
def doMain(self):
    ioFunctions = IOFunctions.IOFunctions()
    pretreatment = Pretreatment.Pretreatment()
    raw = ioFunctions.ReadFile('D:\\codes\\Python\\PythonSpace\\NLTKTest\\datas\\articles.txt')
    # Strip the Unicode markers u" and u'
    raw = re.sub("u\"", "", raw)
    raw = re.sub("u\'", "", raw)
    sents = pretreatment.SenToken(raw)
    cleanSents = pretreatment.CleanSents(sents)
    words = pretreatment.WordToken(cleanSents)
    stemWords = pretreatment.StemWords(words)
    tagged_words = pretreatment.TagWords(stemWords)
    # chunked_words = pretreatment.ChunkWords(tagged_words)
    # iob = pretreatment.IOBTree(chunked_words)
    # Save the processed corpus to the output file
    file = open("datas\\output.txt", "w")
    file.truncate()
    for line in tagged_words:
        string = ""
        for mtuple in line:
            for word in mtuple:
                string += word + " "
            string += "\n"
        string += "\n"
        file.write(string)
    file.close()
def FouriorTrans():
    if os.path.exists("Output/"):
        if Init.SystemJudge() == 0:
            os.system("rm -r Output")
        else:
            os.system("rmdir /s /q Output")
    NameArr = Pretreatment.FigureInput(1)
    try:
        if NameArr == -1:
            return
    except:
        pass
    #Figure traversal
    for kase in range(0, len(NameArr)):
        img = np.array(Image.open(NameArr[kase]).convert("L"))
        Statistic = [0 for n in range(0, 260)]
        TTL = 0
        for i in range(0, len(img)):
            for j in range(0, len(img[i])):
                Statistic[img[i][j]] += 1
                TTL += 1
        #Discrete PDF of the grey levels
        Prob = [0.00 for n in range(260)]
        for i in range(0, len(Prob)):
            Prob[i] = Statistic[i] / TTL
        #HF = np.fft.fft(Prob).real
        fig1 = plt.figure()
        ax = fig1.add_subplot(111)
        plt.xlim(-1, 260)
        plt.ylim(0, 0.2)
        #Printing loop
        for i in range(0, len(Prob)):
            ax.add_patch(patches.Rectangle((i, 0), 1, Prob[i], color='black'))
        #Recover the base file name (between the last '/' and the last '.')
        Name = ""
        Hajimari = False
        for i in range(len(NameArr[kase]) - 1, -1, -1):
            if NameArr[kase][i] == ".":
                Hajimari = True
                continue
            elif NameArr[kase][i] == "/":
                break
            else:
                if Hajimari == False:
                    continue
                else:
                    Name = NameArr[kase][i] + Name
        Name += "_Histogram.png"
        print(Name)
        plt.savefig(Name)
    return
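# The per-pixel counting loop above can be expressed in one numpy call; a
# minimal equivalent sketch (same 260-bin layout, assuming img is a uint8
# grey-level array):
#
# counts = np.bincount(img.ravel(), minlength=260)
# Prob = counts / img.size   # discrete PDF, identical to Statistic / TTL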
def is_overdraw(max_temp, min_temp):  # test whether the two strokes overlap
    l = the_lengthest(max_temp)
    max_corpoint = break_point.turning_point1(max_temp)  # polyline corner points
    temp1 = []  # sample points of the short stroke that fall in the overlap region
    for i in range(1, len(max_corpoint)):
        f_point = max_corpoint[i - 1]
        b_point = max_corpoint[i]
        R, T = coordinate_transformation(f_point, b_point)
        length = Pretreatment.distance(f_point[0], f_point[1], b_point[0], b_point[1])
        x = length / 2  # tolerance along the segment
        y = 3 / 4 * float('%.2f' % (length**0.5)) + 2  # tolerance width
        h = 0
        while h < len(min_temp):
            X = np.array([min_temp[h][0], min_temp[h][1], 1])
            Y = R @ T @ X
            r1 = ((min_temp[h][0] - f_point[0])**2 + (min_temp[h][1] - f_point[1])**2)
            r2 = ((min_temp[h][0] - b_point[0])**2 + (min_temp[h][1] - b_point[1])**2)
            if math.fabs(Y[0]) <= x and math.fabs(Y[1]) <= y or r1 <= y * y or r2 <= y * y:
                if min_temp[h] not in temp1:
                    temp1.append(min_temp[h])
            h += 1
    k = len(temp1) / len(min_temp)
    return temp1, k
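# coordinate_transformation is not shown in this file; from its use above
# (Y = R @ T @ X on homogeneous points, then |Y[0]| tested against half the
# segment length and |Y[1]| against the tolerance width) it maps a point into
# a frame centred on the segment midpoint with the segment along the x-axis.
# A hypothetical sketch consistent with that usage (an assumption, not the
# repo's actual implementation):
#
# def _coordinate_transformation(f_point, b_point):
#     mx, my = (f_point[0] + b_point[0]) / 2, (f_point[1] + b_point[1]) / 2
#     theta = np.arctan2(b_point[1] - f_point[1], b_point[0] - f_point[0])
#     T = np.array([[1, 0, -mx], [0, 1, -my], [0, 0, 1]])  # midpoint to origin
#     R = np.array([[np.cos(theta), np.sin(theta), 0],
#                   [-np.sin(theta), np.cos(theta), 0],
#                   [0, 0, 1]])                            # segment onto x-axis
#     return R, T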
def extend_stroke(temp1, temp2):
    l = tolerance_zone.the_lengthest(temp1)
    y = 3 / 4 * float('%.2f' % (l**0.5)) + 2  # tolerance width
    max_corpoint = break_point.turning_point(temp1)  # polyline corner points
    max_corpoint.append(temp2[int(len(temp2) / 2)])
    max_corpoint.append(temp2[len(temp2) - 1])
    temp = []
    for i in range(1, len(max_corpoint)):
        f_point = max_corpoint[i - 1]
        b_point = max_corpoint[i]
        # print(f_point, b_point)
        R, T = tolerance_zone.coordinate_transformation(f_point, b_point)
        length = Pretreatment.distance(f_point[0], f_point[1], b_point[0], b_point[1])
        x = length / 2  # tolerance along the segment
        h = 0
        while h < len(temp2):
            X = np.array([temp2[h][0], temp2[h][1], 1])
            Y = R @ T @ X
            r1 = ((temp2[h][0] - f_point[0])**2 + (temp2[h][1] - f_point[1])**2)
            r2 = ((temp2[h][0] - b_point[0])**2 + (temp2[h][1] - b_point[1])**2)
            if math.fabs(Y[0]) <= x and math.fabs(Y[1]) <= y or r1 <= y * y or r2 <= y * y:
                if temp2[h] not in temp:
                    temp.append(temp2[h])
            h += 1
    k = len(temp) / len(temp2)
    print("Ratio of the extension stroke inside the tolerance zone", k)
    return k
def line_length(temp):
    l = 0
    for i in range(1, len(temp)):
        a = temp[i - 1]
        b = temp[i]
        d = Pretreatment.distance(a[0], a[1], b[0], b[1])
        l = l + d
    return l
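# Sanity check for line_length (sketch; assumes Pretreatment.distance is
# Euclidean). A 3-4-5 triangle leg followed by a unit step gives 5 + 1 = 6:
#
# print(line_length([[0, 0], [3, 4], [3, 5]]))   # -> 6.0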
def is_extend(temp1, temp2):
    d1 = Pretreatment.distance(temp1[0][0], temp1[0][1], temp2[0][0], temp2[0][1])
    d2 = Pretreatment.distance(temp1[0][0], temp1[0][1], temp2[1][0], temp2[1][1])
    if d1 < d2:
        ftemp = temp1
        btemp = temp2
    else:
        ftemp = temp2
        btemp = temp1
    # if angle_bisector(ftemp, btemp):
    return (angle(ftemp, btemp) and angle_bisector(ftemp, btemp)
            and extend_stroke(ftemp, btemp) > 0.8)
def the_lengthest(temp):  # longest chord of the stroke, measured from its first point
    s1 = temp[0]
    d_max = 0
    for i in range(1, len(temp)):
        s2 = temp[i]
        d = Pretreatment.distance(s1[0], s1[1], s2[0], s2[1])
        if d > d_max:
            d_max = d
    return d_max
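# Note: the_lengthest anchors at temp[0], so it returns the longest chord from
# the first point, not between an arbitrary pair. Sketch (Euclidean distance
# assumed):
#
# print(the_lengthest([[0, 0], [1, 0], [0, 5]]))   # -> 5.0, chord (0,0)-(0,5)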
def PTM_KNN(data_set):
    train, test = Pretreatment.partitioningTPMKNN(data_set)
    normalized_train_matrix = Pretreatment.normalization(train)
    normalized_test_matrix = Pretreatment.normalization(test)
    klist = np.zeros((normalized_train_matrix.shape[0], 1))
    predictions = np.zeros((normalized_test_matrix.shape[0], 1))
    for unknowid in range(0, len(normalized_train_matrix)):
        klist[unknowid] = trainlocal_k(normalized_train_matrix, unknowid)
    new_normalized_train_matrix = np.column_stack(
        (normalized_train_matrix, klist))
    for unknowid_text in range(0, len(normalized_test_matrix)):
        new_k = testlocal_k(new_normalized_train_matrix,
                            normalized_test_matrix, unknowid_text, 3)
        predictions[unknowid_text] = KNN.traditional_knn(
            unknowid_text, normalized_train_matrix, int(new_k),
            normalized_test_matrix)
    TP, FP, FN, TN = analysis.TPFPFNTN(normalized_test_matrix, predictions)
    return (analysis.recall(TP, FN), analysis.F_score(TP, FP, FN),
            analysis.G_mean(TP, FN, TN, FP), TP, FN, TN, FP)
def get_KNNbestK(data_set):
    real_accuracy = 0
    best_k = 1
    for ks in range(1, 20):  # candidate k values
        a = 0
        for k in range(1, 11):
            train, test = Pretreatment.partitioning(data_set, k)  # ten-fold split
            normalized_train_matrix = Pretreatment.normalization(train)
            normalized_test_matrix = Pretreatment.normalization(test)
            predictions = np.zeros(
                (normalized_test_matrix.shape[0], 1))  # prediction buffer
            for unknowid_text in range(0, len(normalized_test_matrix)):
                predictions[unknowid_text] = traditional_knn(
                    unknowid_text, normalized_train_matrix, int(ks),
                    normalized_test_matrix)
            accuracy = analysis.getAccuracy(normalized_test_matrix, predictions)
            a = accuracy + a
        thistest_accuracy = float(a / 10)
        if thistest_accuracy > real_accuracy:
            real_accuracy = thistest_accuracy
            best_k = ks
    return best_k
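# The same "grid-search k by ten-fold cross validation" pattern, written as a
# standalone sketch with scikit-learn (an assumption: sklearn is not used
# elsewhere in this repo, this only illustrates the idea on an (X, y) pair):
#
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.model_selection import cross_val_score
#
# def sklearn_best_k(X, y):
#     best_k, best_acc = 1, 0.0
#     for ks in range(1, 20):
#         acc = cross_val_score(KNeighborsClassifier(n_neighbors=ks),
#                               X, y, cv=10).mean()
#         if acc > best_acc:
#             best_acc, best_k = acc, ks
#     return best_k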
def MainFunction():
    if os.path.exists("Output/"):
        if Init.SystemJudge() == 0:
            os.system("rm -r Output")
        else:
            os.system("rmdir /s /q Output")
    NameArr = Pretreatment.FigureInput(1)
    try:
        if NameArr == -1:
            return
    except:
        pass
    #Figure traversal
    for kase in range(0, len(NameArr)):
        img = np.array(Image.open(NameArr[kase]).convert("L"))
        img1 = img.copy()  # ndarray has no .deepcopy(); .copy() gives an independent array
def curve_change_line(point1, point2, line, t):  # polyline approximation step
    d = 0
    x_max = 0
    y_max = 0
    line_d = Pretreatment.distance(point1[0], point1[1], point2[0], point2[1])
    # print(line)
    for each in line:
        dis = getDis(each[0], each[1], point1[0], point1[1], point2[0], point2[1])
        if dis >= d:
            d = dis
            x_max = each[0]
            y_max = each[1]
    if d > t * line_d:
        point_max = [x_max, y_max]
    else:
        point_max = []
    return point_max
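# curve_change_line is the split test of a Douglas-Peucker-style simplifier:
# it returns the farthest point from the chord point1-point2 when its distance
# exceeds t * chord_length, else an empty list. A self-contained sketch of the
# distance it relies on (getDis is assumed to be the perpendicular distance
# from (x, y) to the line through the two chord endpoints):
#
# import math
#
# def _point_line_dist(x, y, x1, y1, x2, y2):
#     num = abs((y2 - y1) * x - (x2 - x1) * y + x2 * y1 - y2 * x1)
#     return num / math.hypot(x2 - x1, y2 - y1)
#
# # The farthest point of [[1, 1]] from chord (0,0)-(2,0) lies 1.0 away,
# # and 1.0 > 0.2 * 2, so curve_change_line([0, 0], [2, 0], [[1, 1]], 0.2)
# # would return [1, 1].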
def Chord_weighting(temp, p):  # new data points by chord-length weighting
    # print("chord weighting:", temp, p)
    dists = []  # renamed from 'distance' to avoid shadowing the distance module
    s = 0
    for i in range(1, len(temp)):
        dis = Pretreatment.distance(temp[i - 1][0], temp[i - 1][1],
                                    temp[i][0], temp[i][1])
        s += dis
        dists.append(dis)
    x1, y1 = (p[0] - temp[len(temp) - 1][0]) / 2, (p[1] - temp[len(temp) - 1][1]) / 2
    new_temp = [temp[0]]
    l = 0
    for i in range(len(dists)):
        l += dists[i]
        w = l / s  # cumulative chord-length weight in [0, 1]
        x, y = temp[i + 1][0] + w * x1, temp[i + 1][1] + w * y1
        new_temp.append([x, y])
    return new_temp
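# Worked example for Chord_weighting (sketch, Euclidean distance assumed).
# Each point after the first is shifted toward the target p by half the gap
# between p and the stroke's last point, scaled by cumulative chord length:
#
# temp = [[0, 0], [1, 0], [2, 0]]
# p = [4, 0]                # gap to last point is (2, 0); half-gap is (1, 0)
# # weights are 0.5 and 1.0, so the result is
# # [[0, 0], [1.5, 0.0], [3.0, 0.0]]
# print(Chord_weighting(temp, p))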
def CNNSeed(img, TobImg, BlockSize, FileName):
    BlockInfo = [0 for n in range(BlockSize)]
    for i in range(0, BlockSize):  # BlockSize is a count, not a sequence
        print(str(BlockSize) + "\t:")
        OutImg = [[0 for n in range(len(TobImg[0]))] for n in range(len(TobImg))]
        for p in range(0, len(TobImg)):
            for q in range(0, len(TobImg[p])):
                if TobImg[p][q] == i:
                    OutImg[p][q] = img[p][q]
        Pretreatment.FigurePrint(OutImg, 2)  # 'kind' was undefined here; 2 matches the mode used elsewhere
        InpInt = Init.IntInput(str(BlockSize) + "\t:", "1", "3", "int")
        BlockInfo[i] = InpInt  # InpInt is already constrained to 1..3
    BuildFile("Histogram")
    File = open("Histogram", "a")
import numpy as np
import string
from gensim import corpora
import Pretreatment

# Build the LDA model
lda = Pretreatment.LDA()
Corpus, Product, Component, Developer = lda.read_csv(dir="AspectJ.csv",
                                                     summary=2,
                                                     description=3,
                                                     product=4,
                                                     component=5,
                                                     assigned_to=6,
                                                     comment=7)
Terms, Topics = lda.build_from_corpus(Corpus)

# Build the MLkNN model
mlknn = Pretreatment.BR_MLkNN()
Terms = mlknn.makematrix(Terms, lenth=len(lda.dictionary))
Topics = mlknn.makematrix(Topics, lenth=lda.num_topics)
X, y = mlknn.make_Xy(Terms=Terms, Topics=Topics, Product=Product,
                     Component=Component, Developer=Developer)
step = int(len(X) / 11)  # validate over 11 folds

# Results of the BR analysis alone
for n in range(step, len(X) - step, step):
    testX_Validation = X[n:n + step, :]
import Pretreatment
import KNN
'''
# Generate Image
trainDatas, trainLabels = Pretreatment.loadTrainData('/home/hadoop/workdatas/kaggle/DigitRecognizer/train_sort.csv')
trainLabels = trainLabels[0]
Pretreatment.generateImage('/home/hadoop/workdatas/kaggle/DigitRecognizer/imgs/', trainDatas, trainLabels)
'''
'''
# KNN Test
import numpy
group = numpy.array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
labels = ['A', 'A', 'B', 'B']
tar = [1.0, 1.2]
result = KNN.classify(tar, group, labels, 3)
print(result)
'''
# KNN Classify
trainDatas, trainLabels = Pretreatment.loadTrainData(
    '/home/hadoop/workdatas/kaggle/DigitRecognizer/train.csv')
trainLabels = trainLabels[0]
testDatas = Pretreatment.loadTestData(
    '/home/hadoop/workdatas/kaggle/DigitRecognizer/test.csv')
result = KNN.process(testDatas, trainDatas, trainLabels)
Pretreatment.generateResultFile(
    '/home/hadoop/workdatas/kaggle/DigitRecognizer/result_knn_10.csv', result)
import sys
import os
import Pretreatment
import Translation

print("-----------------------------------------")
file_name = input(
    "Enter '/帮助' for SCPython usage help\n"
    "Enter '/自定义' for custom mode\n"
    "Enter the path of an SCPython code file to interpret it\n\nInput:")
while file_name == "":
    file_name = input("You did not enter a command, please try again:")
if file_name == "/帮助":
    pass
elif file_name == "/自定义":
    pass
else:
    file = open(file_name, encoding='utf-8')
    file_text = file.read()
    List_Code = Pretreatment.Decomposition_Code(file_text)
    Text = Translation.Translation_Code(List_Code)
    file_new = open(os.path.join(os.environ["TMP"], "SCPYFile.py"), "w",
                    encoding='utf-8')
    file_new.write(Text)
    file_new.close()
    os.system('cls')
    os.system('"' + os.path.join(os.environ["TMP"], "SCPYFile.py") + '"')
def HandSeed(Tobimg, img, Surround):
    Upground = set()
    Background = set()
    for Kase in range(0, 2):
        Owari = False
        while 1:
            print(
                "\nInput the locations you choose as '[i1, j1] [i2, j2]'. \nAfter choosing, close the figure and press Enter to continue."
            )
            if Kase == 0:
                print("Upground Set: ")
            elif Kase == 1:
                print("Background Set: ")
            #Print the figure
            if Surround == "Nor":
                Pretreatment.FigurePrint(img, 2)
            else:
                pass
            #Pretreatment
            InpStr = input()
            RemStr1 = ""
            RemStr2 = ""
            Str2Int = False
            Error = False
            kind = 0
            #Get string
            for i in range(0, len(InpStr)):
                #Partial
                if InpStr[i] == "[":
                    kind = 1
                    continue
                if InpStr[i] == ",":
                    kind = 2
                    continue
                if InpStr[i] == "]":
                    kind = 0
                    Str2Int = True
                if kind == 1 and Str2Int == False:
                    RemStr1 += InpStr[i]
                if kind == 2 and Str2Int == False:
                    RemStr2 += InpStr[i]
                if Str2Int == True:
                    Int1 = 0
                    Int2 = 0
                    #print([RemStr1, RemStr2])
                    try:
                        Int1 = int(RemStr2)
                        Int2 = int(RemStr1)
                    except:
                        print("Input Error, Please input points again")
                        Error = True
                        break
                    #print(Tobimg[Int1][Int2])
                    try:
                        if Kase == 0:
                            Upground.add(Tobimg[Int1][Int2])
                        if Kase == 1:
                            Background.add(Tobimg[Int1][Int2])
                    except:
                        print("Input Error, Location exceed")
                    RemStr1 = ""
                    RemStr2 = ""
                    Str2Int = False
                    continue
            if Error == True:
                continue
            else:
                if Kase == 0:
                    if len(Upground) == 0:
                        print("You must input at least 1 node!")
                        continue
                    else:
                        Owari = True
                elif Kase == 1:
                    if len(Background) == 0:
                        print("You must input at least 1 node!")
                        continue
                    else:
                        Owari = True
            if Owari == True:
                break
            else:
                continue
    return Upground, Background
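# The character-by-character parser above can be compressed with a regular
# expression; a minimal sketch of the same "[i, j]" extraction (an
# illustration only, not what HandSeed actually uses):
#
# import re
#
# def parse_points(inp_str):
#     """Return [(i1, j1), (i2, j2), ...] from a string like '[1, 2] [3, 4]'."""
#     return [(int(a), int(b))
#             for a, b in re.findall(r"\[\s*(\d+)\s*,\s*(\d+)\s*\]", inp_str)]
#
# # parse_points("[1, 2] [3, 4]") -> [(1, 2), (3, 4)]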
if __name__ == "__main__": list_tweets = [] #list_tweets = Data.GetListTweets("output/datas_tweets.txt", 5000) date_start = '2019-09-05' date_end = '2019-09-06' n = 10 #决定聚类数量 pretreatment_result = "output/" + date_start + "_" + date_end + "_" + "pretreatment_result" + ".txt" hotspot_result = "output/" + date_start + "_" + date_end + "_" + "hotspot_result" + ".txt" # #实验获取手肘值 # for f in Select_date(date_start,date_end): # list_tweets.extend(Data.GetListTweets(f)) # list_result = Pretreatment.Pretreatment(list_tweets, "input/sensitive.txt", "input/emoji.txt", "input/stopwords.txt") # count = Data.OutputToFile(list_result, pretreatment_result) # list_tweets = Data.GetListTweets(pretreatment_result) for f in Select_date(date_start, date_end): list_tweets.extend(Data.GetListTweets(f)) list_result = Pretreatment.Pretreatment(list_tweets, "input/sensitive.txt", "input/emoji.txt", "input/stopwords.txt") count = Data.OutputToFile(list_result, pretreatment_result) list_tweets = Data.GetListTweets(pretreatment_result) dict_hotSpot = K_MEANS.GetDictHotSpot(list_tweets, count, n) list_hotSpot = GetListHotSpot(dict_hotSpot) Output(list_hotSpot, hotspot_result)
    xsize = tagger.xsize()
    for i in range(size):
        for j in range(xsize):
            char = tagger.x(i, j)
            tag = tagger.y2(i)
            if tag == 'O':
                test_result_file.write(char)
            elif tag in ('B_LOC', 'B_ORG', 'B_PRO', 'B_PER', 'B_TIME'):
                test_result_file.write('(' + char)
            elif tag in ('E_LOC', 'E_ORG', 'E_PRO', 'E_PER', 'E_TIME'):
                test_result_file.write(char + ')' + tag[2:])
            else:
                test_result_file.write(char)
        test_result_file.write('\n')
    test_file.close()
    test_result_file.close()


if __name__ == '__main__':
    pre = Pretreatment()
    pre.load_copus()
    name_indicator, loc_indicator, org_indicator, time_indicator, pro_indicator = pre.get_indicator()
    ner = NER()
    # ner.load_copus(name_indicator, loc_indicator, org_indicator, time_indicator, pro_indicator)
    ner.load_test_corpus(name_indicator, loc_indicator, org_indicator,
                         time_indicator, pro_indicator)
    # tagger = CRFPP.Tagger('-m' + crf_model)
    # ner.recognize(tagger)