def KmeansForAgeEst2(db, where, users, n_clusters):
    """Fit one K-means model per user group and return the cluster centres.

    Args:
        db: Database handle, passed through to ``pymongo_utill.toTimeFreq``.
        where: Iterable of index collections, one per group.  Every index
            selects one entry of ``users``.
        users: Sequence of user identifiers addressed by the indices in
            ``where``.
        n_clusters: Number of clusters fitted for each group.

    Returns:
        A list holding one ``cluster_centers_`` array per group, in the same
        order as ``where``.  An empty ``where`` yields an empty list.
    """
    centers = []
    for at in where:
        group_users = [users[i] for i in at]
        features = pymongo_utill.toTimeFreq(db, group_users)

        # A fresh model per group: the groups are clustered independently.
        km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        km.fit(features)
        centers.append(km.cluster_centers_)
    return centers
def ageEstimation():
    """Cross-validate a linear SVM on time-frequency features and print scores.

    Users and labels are read with ``pymongo_utill.loadUsers`` from the
    ``TwitterInsert2`` database and turned into a feature matrix by
    ``pymongo_utill.toTimeFreq``.  A stratified, shuffled 5-fold split
    (``random_state=100``) is then evaluated with
    ``SVC(kernel="linear", C=100)``.

    Prints the per-fold accuracy, its mean, and the precision, recall and
    F-score of the first two classes averaged over the folds.  Returns
    nothing.
    """
    n_folds = 5

    conn = pymongo_utill.getConnectionToMongoDB()
    try:
        db = conn['TwitterInsert2']
        screen_names, labels = pymongo_utill.loadUsers(db, sample=1256)
        # The features do not depend on the fold, so they are queried once,
        # while the connection is still open.
        # NOTE(review): assumes toTimeFreq returns the same features for the
        # same user list on every call -- confirm.
        X = np.array(pymongo_utill.toTimeFreq(db, screen_names))
    finally:
        # Release the connection even when loading fails.
        conn.disconnect()

    skf = cross_validation.StratifiedKFold(labels, n_folds=n_folds,
                                           shuffle=True, random_state=100)

    score = []
    # Running per-class sums over the folds; divided by n_folds when printed.
    precision = [0, 0]
    recall = [0, 0]
    F_score = [0, 0]
    for train, test in skf:
        svr = SVC(kernel="linear", C=100)
        svr.fit(X=X[train], y=labels[train])
        score.append(svr.score(X=X[test], y=labels[test]))

        p_lab = svr.predict(X[test])
        scores = precision_recall_fscore_support(labels[test], p_lab)
        precision = [a + b for a, b in zip(precision, scores[0])]
        recall = [a + b for a, b in zip(recall, scores[1])]
        F_score = [a + b for a, b in zip(F_score, scores[2])]

    score = np.array(score)
    print('-' * 76)
    print("Cross-Validation scores:%s" % score)
    print("Mean Score:%s" % np.mean(score))
    print("Mean Precision:%s" % [float(p) / n_folds for p in precision])
    print("Mean recall:%s" % [float(r) / n_folds for r in recall])
    print("Mean F_score:%s" % [float(f) / n_folds for f in F_score])
    print('-' * 76)
def KmeansForAgeEst(db, where, users, n_clusters):
    """Cluster every user group separately and label each of its users.

    Args:
        db: Database handle, passed through to ``pymongo_utill.toTimeFreq``.
        where: Iterable of index collections, one per group.  Every index
            selects one entry of ``users``.
        users: Sequence of user identifiers addressed by the indices in
            ``where``.
        n_clusters: Number of clusters fitted for each group.

    Returns:
        A ``(cor_k, group_of)`` tuple of two parallel lists with one entry per
        clustered user, in group order.  ``cor_k[j]`` is the predicted cluster
        label shifted by ``group_index * n_clusters`` so that labels of
        different groups stay distinct; ``group_of[j]`` is the index of the
        group the user belongs to.  Both lists are empty when ``where`` is
        empty.
    """
    cor_k = []
    group_of = []
    for group_idx, at in enumerate(where):
        group_users = [users[i] for i in at]
        features = pymongo_utill.toTimeFreq(db, group_users)

        km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        km.fit(features)

        offset = group_idx * n_clusters
        cor_k += [label + offset for label in km.predict(features)]
        # Accumulate rather than overwrite, so the group list stays aligned
        # with cor_k across all groups.
        group_of += [group_idx] * len(features)
    return cor_k, group_of
def _fuse_age_prediction(sims, prob0, prob1, n_clusters):
    """Pick a class from cluster similarities and SVM class probabilities.

    ``sims`` holds the similarity of one sample to every cluster centre; the
    first ``n_clusters`` entries are treated as class 0 and the remaining
    ones as class 1.  Each class is scored by its best similarity multiplied
    by the SVM probability of that class.

    When the two scores are not strictly ordered (an exact tie, or NaN
    similarities) the SVM probabilities alone decide, with class 0 winning a
    probability tie.
    """
    score0 = max(sims[:n_clusters]) * prob0
    score1 = max(sims[n_clusters:]) * prob1
    if score0 > score1:
        return 0
    if score0 < score1:
        return 1
    return 0 if prob0 >= prob1 else 1


def ageEstimationByCluser(file):
    """Cross-validate the SVM + K-means age estimator and report its errors.

    For each of 5 stratified folds the function
      * fits a ``WordVectorizer`` on the training users' tweets and keeps the
        16000 best word features by chi-squared score,
      * trains a probabilistic linear SVM on those word features,
      * clusters the training users of class 0 and class 1 (3 clusters each)
        with ``clustering.KmeansForAgeEst2``,
      * predicts every test user by combining the SVM probabilities with the
        chi-squared-kernel similarity between the user's time-frequency
        vector and the cluster centres.

    Args:
        file: Object with a ``write(str)`` method.  One line per misclassified
            test user is written to it, prefixed with ``error_svm:`` (SVM
            alone was wrong, combined prediction right),
            ``error_propsed_msd:`` (SVM alone was right, combined prediction
            wrong) or ``error_both:``.

    Prints per-fold accuracy, mean accuracy and the precision, recall and
    F-score of the first two classes averaged over the folds.  Returns
    nothing.
    """
    n_folds = 5
    n_clusters = 3

    score = []
    # Running per-class sums over the folds; divided by n_folds when printed.
    precision = [0, 0]
    recall = [0, 0]
    F_score = [0, 0]
    error_svm = []
    error_proposed_msd = []
    error_both = []

    conn = pymongo_utill.getConnectionToMongoDB()
    try:
        db = conn['TwitterInsert2']
        screen_names, labels = pymongo_utill.loadUsers(db, sample=1254)
        # Fold-independent, so queried once.  asarray makes the fancy
        # indexing with the fold indices below valid even for a plain list.
        # NOTE(review): assumes toTimeFreq returns the same features for the
        # same user list on every call -- confirm.
        X_t = np.asarray(pymongo_utill.toTimeFreq(db, screen_names))

        skf = cross_validation.StratifiedKFold(labels, n_folds=n_folds,
                                               shuffle=True, random_state=100)
        for train, test in skf:
            y_train = labels[train]
            y_test = labels[test]
            screen_names_tr = [screen_names[i] for i in train]
            screen_names_ts = [screen_names[i] for i in test]

            # The vocabulary is learnt from the training users only.
            vectorizer = WordVectorizer()
            for screen_name in screen_names_tr:
                tweets = pymongo_utill.getUsersTweets(db, [screen_name],
                                                      sample=100)
                vectorizer.fit(tweets)
            vectorizer.sort_voc()

            X_w = []
            for screen_name in screen_names:
                tweets = pymongo_utill.getUsersTweets(db, [screen_name],
                                                      sample=100)
                X_w.append(vectorizer.transform(tweets)[0])
            X_w = np.array(X_w)

            selector = SelectKBest(score_func=chi2, k=16000)
            X_w_t = selector.fit_transform(X_w[train], y_train)
            X_w_ts = selector.transform(X_w[test])

            # One index set per class; the flattened centres therefore list
            # the class-0 centres first, then the class-1 centres.
            where = [np.argwhere(y_train == label) for label in (0, 1)]
            centers = clustering.KmeansForAgeEst2(db, where, screen_names_tr,
                                                  n_clusters)
            centers = [c for center in centers for c in center]

            svr = SVC(probability=True, kernel="linear", C=100)
            svr.fit(X_w_t, y_train)

            right = 0
            p_lab = []
            for i, (w, t) in enumerate(zip(X_w_ts, X_t[test])):
                sims = np.ravel(pairwise_kernels(centers, t, metric="chi2"))
                sims = sims / sims.sum()

                proba = svr.predict_proba(w)[0]
                prd_pro0, prd_pro1 = proba[0], proba[1]

                predic = _fuse_age_prediction(sims, prd_pro0, prd_pro1,
                                              n_clusters)
                p_lab.append(predic)

                truth = y_test[i]
                name = screen_names_ts[i]
                svm_wrong = ((prd_pro0 > prd_pro1 and truth == 1) or
                             (prd_pro0 < prd_pro1 and truth == 0))
                svm_right = ((prd_pro0 < prd_pro1 and truth == 1) or
                             (prd_pro0 > prd_pro1 and truth == 0))
                if predic == truth:
                    right += 1
                    if svm_wrong:
                        error_svm.append(name)
                elif svm_right:
                    error_proposed_msd.append(name)
                else:
                    error_both.append(name)

            scores = precision_recall_fscore_support(y_test, p_lab)
            precision = [a + b for a, b in zip(precision, scores[0])]
            recall = [a + b for a, b in zip(recall, scores[1])]
            F_score = [a + b for a, b in zip(F_score, scores[2])]
            score.append(float(right) / len(test))
    finally:
        # Release the connection whether or not the evaluation succeeded.
        conn.disconnect()

    for name in error_svm:
        file.write("error_svm:"+name+'\n')
    # The "propsed" spelling is part of the existing output format and is
    # kept so that consumers of the report keep working.
    for name in error_proposed_msd:
        file.write("error_propsed_msd:"+name+"\n")
    for name in error_both:
        file.write("error_both:"+name+"\n")

    score = np.array(score)
    print('-' * 76)
    print("Cross-Validation scores:%s" % score)
    print("Mean Score:%s" % np.mean(score))
    print("Mean Precision:%s" % [float(p) / n_folds for p in precision])
    print("Mean recall:%s" % [float(r) / n_folds for r in recall])
    print("Mean F_score:%s" % [float(f) / n_folds for f in F_score])
    print('-' * 76)