import math

import token__  # project-local tokenizer module (assumed to provide tokenize2)


def tfidf_knn(news, category):
    # Build a TF-IDF vector for every document, keyed by topic and document id.
    c = idf_knn(news, category)
    wordlist = c.keys()
    tok = token__.tokenize2(news, category)
    label = tok.keys()
    matrix = {}
    # Initialise a zero vector (one slot per vocabulary word) per document.
    for topic in label:
        matrix[topic] = {}
        for t in tok[topic].keys():
            matrix[topic][t] = [0] * len(c)
    # Fill in tf * idf for every token that is in the vocabulary.
    for topic in label:
        for i in tok[topic].keys():
            for word in tok[topic][i]:
                if word in wordlist:
                    matrix[topic][i][wordlist.index(word)] = tok[topic][i].count(word) * c[word]
    # Normalise each document vector in place.
    for topic in matrix:
        for i in matrix[topic]:
            normalize(matrix[topic][i])
    return matrix
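
# `normalize` is not defined in this file and is assumed to live elsewhere in
# the project. A minimal sketch, assuming in-place L2 (unit-length)
# normalisation, which is the usual choice before cosine-style kNN:
def normalize(vec):
    # Scale vec so its Euclidean norm is 1; leave all-zero vectors untouched.
    norm = math.sqrt(sum(x * x for x in vec))
    if norm > 0:
        for k in range(len(vec)):
            vec[k] /= norm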

def idf_knn(news, category):
    # Compute an IDF weight for every word: idf(w) = log2(N / df(w)), where N
    # is the number of documents and df(w) the number of documents containing w.
    tok = token__.tokenize2(news, category)
    text = []
    n_docs = 0
    for topic in tok:
        for i in tok[topic]:
            # One copy of each distinct word per document, so text.count(word)
            # below equals the document frequency df(word).
            text += list(set(tok[topic][i]))
            n_docs += 1
    idf = {}
    for word in set(text):
        # Note: the numerator must be the document count n_docs, not
        # len(text) (the total number of (document, distinct word) pairs).
        idf[word] = math.log(float(n_docs) / float(text.count(word))) / math.log(2)
    return idf
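
# Worked example of the IDF formula above: with N = 4 documents, a word that
# appears in exactly one document gets idf = log2(4/1) = 2.0, while a word
# appearing in all four gets idf = log2(4/4) = 0.0, i.e. it carries no
# discriminative weight.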

def classify(guess, p, category):
    # Score every document in `guess` against each category using the
    # per-word log-probabilities in p, pick the argmax, then report
    # per-category precision/recall/F1 and the microaveraged F1.
    cl = {}
    tok = token__.tokenize2(guess, category)
    for i in range(len(tok)):
        cl[i] = {}
        for j in tok[i].keys():
            cl[i][j] = {}
            for k in range(len(category)):
                # Sum of log-probabilities of the document's words under
                # category k; words unseen in training are skipped.
                cl[i][j][k] = 0
                for word in tok[i][j]:
                    if word in p:
                        cl[i][j][k] += p[word][k]
    # result[i][j] = index of the highest-scoring category for document j
    # (whose true category is i).
    result = {}
    for i in range(len(cl)):
        result[i] = {}
        for j in cl[i].keys():
            result[i][j] = max(cl[i][j], key=cl[i][j].get)
    for i in range(len(result)):
        print category[i] + ':'
        for j in result[i].keys():
            print '    ' + category[result[i][j]] + ' ' + j
    TP_ = 0  # overall TP
    FP_ = 0  # overall FP
    FN_ = 0  # overall FN
    for i in range(len(result)):
        # TP: class-i documents predicted as i.
        TP = result[i].values().count(i)
        # FN: class-i documents predicted as some other class.
        FN = len(result[i].values()) - TP
        # FP: documents of other classes wrongly predicted as i.
        FP = 0
        for j in range(len(result)):
            if j != i:
                FP += result[j].values().count(i)
        print category[i] + ': TP = ' + str(TP) + ', FP = ' + str(FP) + ', FN = ' + str(FN)
        print '    Precision = ' + str(float(TP) / float(TP + FP)) + ', Recall = ' + str(float(TP) / float(TP + FN))
        print '    F1 = ' + str(float(2 * TP) / float(2 * TP + FP + FN))
        TP_ += TP
        FP_ += FP
        FN_ += FN
    print '    Microaveraged F1 = ' + str(float(2 * TP_) / float(2 * TP_ + FP_ + FN_))
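
# A minimal end-to-end sketch of how these functions are wired together.
# The layout of `train_news` / `test_news` is whatever token__.tokenize2
# expects; the variable names and category labels here are hypothetical:
#
#     category = ['sports', 'tech', 'politics']
#     p = bayes(train_news, category)    # or not_naive(train_news, category)
#     classify(test_news, p, category)   # prints per-class P/R/F1 and micro-F1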

def bayes(news, category):
    # Train a multinomial Naive Bayes model: for every vocabulary word, store
    # the Laplace-smoothed log-likelihood of the word under each category.
    tok = token__.tokenize2(news, category)
    text = []    # all tokens; reduced to the vocabulary below
    text_c = {}  # text_c[i]: all tokens of category i
    s = []       # s[i]: total token count of category i
    for i in range(len(tok)):
        tmp = 0
        text_c[i] = []
        for doc in tok[i]:
            text += tok[i][doc]
            text_c[i] += tok[i][doc]
            tmp += len(tok[i][doc])
        s.append(tmp)
    text = list(set(text))  # vocabulary
    p = {}
    for word in text:
        p[word] = {}
        for i in range(len(text_c)):
            # Laplace (add-one) smoothing: (count(word, i) + 1) / (s[i] + |V|).
            p[word][i] = math.log(float(text_c[i].count(word) + 1) / float(s[i] + len(text)))
    return p
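
# Worked example of the smoothed estimate above (hypothetical numbers): if
# category i has s[i] = 100 tokens, the vocabulary has |V| = 50 words, and
# `word` occurs 4 times in category i, then
#     P(word | i) = (4 + 1) / (100 + 50) = 1/30,
# so p[word][i] = log(1/30). A word never seen in category i still gets the
# non-zero floor 1/150 instead of a zero probability.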

def not_naive(news, category):
    # Same training as bayes(), but keep only "strong" features: words whose
    # count in their best-scoring category is at least half of their count in
    # the whole corpus.
    tok = token__.tokenize2(news, category)
    text = []
    text_c = {}
    text_raw = []  # all tokens, duplicates kept, for corpus-wide counts
    s = []
    for i in range(len(tok)):
        tmp = 0
        text_c[i] = []
        for doc in tok[i]:
            text += tok[i][doc]
            text_c[i] += tok[i][doc]
            text_raw += tok[i][doc]
            tmp += len(tok[i][doc])
        s.append(tmp)
    text = list(set(text))
    p = {}
    for word in text:
        p[word] = {}
        for i in range(len(text_c)):
            p[word][i] = math.log(float(text_c[i].count(word) + 1) / float(s[i] + len(text)))
    # Feature selection: keep `word` only if, in its highest-probability
    # category, it accounts for at least half of its corpus-wide occurrences.
    p_optimal = {}
    for word in p:
        for i in p[word].keys():
            if p[word][i] == max(p[word].values()):
                a = text_c[i].count(word)
                c = text_raw.count(word) * 0.5
                if a >= c:
                    p_optimal[word] = dict(p[word])
    return p_optimal
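
# p_optimal has the same shape as the output of bayes(), so it can be passed
# straight to classify(). The 0.5 threshold is the cutoff used above: a word
# survives only if its best category holds at least half of all of its
# occurrences in the training corpus.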