Esempio n. 1
0
def KmeansForAgeEst2(db, where, users, n_clusters):
    """Fit one k-means model per user group and return the cluster centers.

    For every index collection in ``where`` the selected entries of ``users``
    are turned into feature vectors by ``pymongo_utill.toTimeFreq`` and
    clustered with k-means++ (``n_init=10``).

    Args:
        db: Database handle, passed through to ``pymongo_utill.toTimeFreq``.
        where: Iterable of index collections; each one selects the members of
            one group out of ``users``.
        users: Sequence indexed by the values found in ``where``.
        n_clusters: Number of clusters fitted for every group.

    Returns:
        A list holding one ``cluster_centers_`` array per group, in the order
        of ``where``.
    """
    X = []
    X_users = []
    for at in where:
        _users = [users[i] for i in at]
        X.append(pymongo_utill.toTimeFreq(db, _users))
        X_users.append(_users)

    centers = []
    # NOTE(review): ``est`` / ``est_v`` (the users closest to the centers of
    # clusters 0 and 1, and their vectors) are collected but never returned.
    # They are kept so the computation stays available for debugging; confirm
    # whether they should be exposed or dropped.
    est = []
    est_v = []
    for c, x in enumerate(X):
        km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        km.fit(x)
        centers.append(km.cluster_centers_)

        max_0 = 0
        max_1 = 0
        # Default to None so the bookkeeping below cannot hit an unbound
        # local when no user has a strictly positive similarity.
        est_0 = None
        est_1 = None
        est_0_v = ""
        est_1_v = ""
        for i, u in enumerate(x):
            # One cosine similarity per cluster center for this user.
            sim = pairwise_kernels(km.cluster_centers_, u, metric="cosine")
            if max_0 < sim[0]:
                est_0 = X_users[c][i]
                max_0 = sim[0]
                est_0_v = u
            if max_1 < sim[1]:
                est_1 = X_users[c][i]
                max_1 = sim[1]
                est_1_v = u
        est.append((est_0, est_1))
        est_v.append((est_0_v, est_1_v))

    return centers
def ageEstimation():
    """Cross-validate a linear SVC on the ``toTimeFreq`` user features.

    Loads users and labels from the ``TwitterInsert2`` database, builds one
    feature vector per user with ``pymongo_utill.toTimeFreq`` and runs a
    shuffled, stratified k-fold evaluation (fixed ``random_state``). The
    per-fold accuracy and the fold-averaged precision, recall and F-score of
    the first two classes are printed.

    Returns:
        None. Results are written to standard output.
    """
    n_folds = 5

    conn = pymongo_utill.getConnectionToMongoDB()
    try:
        db = conn['TwitterInsert2']
        screen_names, labels = pymongo_utill.loadUsers(db, sample=1256)
        # The features do not depend on the fold split, so they are built
        # once, while the connection is still open, instead of once per fold.
        X = np.array(pymongo_utill.toTimeFreq(db, screen_names))
    finally:
        # Close the connection only after the last database read.
        conn.disconnect()

    skf = cross_validation.StratifiedKFold(
        labels, n_folds=n_folds, shuffle=True, random_state=100)
    score = []
    # Running sums over folds; two entries because only the first two
    # classes reported by precision_recall_fscore_support are tracked.
    precision = [0, 0]
    recall = [0, 0]
    F_score = [0, 0]
    for train, test in skf:
        svr = SVC(kernel="linear", C=100)
        svr.fit(X=X[train], y=labels[train])
        score.append(svr.score(X=X[test], y=labels[test]))
        p_lab = svr.predict(X[test])
        scores = precision_recall_fscore_support(labels[test], p_lab)
        precision = [a+b for a, b in zip(precision, scores[0])]
        recall = [a+b for a, b in zip(recall, scores[1])]
        F_score = [a+b for a, b in zip(F_score, scores[2])]

    score = np.array(score)
    print('-' * 76)
    print("Cross-Validation scores:%s" % score)
    print("Mean Score:%s" % np.mean(score))
    print("Mean Precision:%s" % [float(precision[0])/n_folds, float(precision[1])/n_folds])
    print("Mean recall:%s" % [float(recall[0])/n_folds, float(recall[1])/n_folds])
    print("Mean F_score:%s" % [float(F_score[0])/n_folds, float(F_score[1])/n_folds])
    print('-' * 76)
Esempio n. 3
0
def KmeansForAgeEst(db, where, users, n_clusters):
    """Cluster every user group with k-means and label each user.

    For every index collection in ``where`` the selected entries of ``users``
    are turned into feature vectors by ``pymongo_utill.toTimeFreq`` and
    clustered with k-means++ (``n_init=10``).

    Args:
        db: Database handle, passed through to ``pymongo_utill.toTimeFreq``.
        where: Iterable of index collections; each one selects the members of
            one group out of ``users``.
        users: Sequence indexed by the values found in ``where``.
        n_clusters: Number of clusters fitted for every group.

    Returns:
        A ``(cor_k, group_ids)`` pair of equally long lists with one entry per
        clustered user, in group order. ``cor_k`` holds the predicted cluster
        offset by ``group_index * n_clusters`` so ids are unique across
        groups; ``group_ids`` holds the index of the group the user came from.
    """
    X = []
    for at in where:
        _users = [users[i] for i in at]
        X.append(pymongo_utill.toTimeFreq(db, _users))

    cor_k = []
    group_ids = []
    for i, x in enumerate(X):
        km = KMeans(init='k-means++', n_clusters=n_clusters, n_init=10)
        km.fit(x)
        # Accumulate across groups so this list stays parallel to cor_k
        # rather than describing only the last group.
        group_ids += [i] * len(x)
        cor_k += [tmp + (i * n_clusters) for tmp in km.predict(x)]
    return cor_k, group_ids
def ageEstimationByCluser(file):
    """Cross-validate the combined word-SVM / time-cluster classifier.

    In every fold a linear SVC with probability output is trained on
    chi2-selected word features, and k-means centers are fitted on the
    training users of each label (0 and 1) via ``clustering.KmeansForAgeEst2``.
    A test user is assigned the label whose product "best normalised chi2
    kernel similarity to that label's centers * SVC probability of that
    label" is larger.

    Args:
        file: Object with a ``write(str)`` method. It receives one line per
            user of interest, prefixed with ``error_svm:`` (combined decision
            right, SVC alone wrong), ``error_propsed_msd:`` (combined decision
            wrong, SVC alone right) or ``error_both:`` (remaining errors).

    Returns:
        None. Fold accuracies and fold-averaged precision, recall and F-score
        are printed to standard output.
    """
    n_folds = 5
    n_clusters = 3

    score = []
    # Running sums over folds for the first two classes.
    precision = [0, 0]
    recall = [0, 0]
    F_score = [0, 0]

    error_svm = []
    error_proposed_msd = []
    error_both = []

    conn = pymongo_utill.getConnectionToMongoDB()
    try:
        db = conn['TwitterInsert2']
        screen_names, labels = pymongo_utill.loadUsers(db, sample=1254)

        skf = cross_validation.StratifiedKFold(
            labels, n_folds=n_folds, shuffle=True, random_state=100)

        # The time features do not depend on the fold, so build them once.
        # asarray guarantees that indexing with the fold's index array works
        # even if the helper hands back a plain list.
        X_t = np.asarray(pymongo_utill.toTimeFreq(db, screen_names))

        for train, test in skf:
            screen_names_tr = [screen_names[i] for i in train]
            screen_names_ts = [screen_names[i] for i in test]
            y_test = labels[test]

            # Vocabulary is learnt from the training users only.
            vectorizer = WordVectorizer()
            for screen_name in screen_names_tr:
                tweets = pymongo_utill.getUsersTweets(db, [screen_name], sample=100)
                vectorizer.fit(tweets)
            vectorizer.sort_voc()

            X_w = []
            for screen_name in screen_names:
                tweets = pymongo_utill.getUsersTweets(db, [screen_name], sample=100)
                X_w.append(vectorizer.transform(tweets)[0])
            X_w = np.array(X_w)

            # Feature selection is fitted on the training rows only.
            selector = SelectKBest(score_func=chi2, k=16000)
            X_w_t = selector.fit_transform(X_w[train], labels[train])
            X_w_ts = selector.transform(X_w[test])

            # Positions (within the training fold) of label 0 and label 1.
            where = [np.argwhere(labels[train] == threshold)
                     for threshold in [0, 1]]
            centers = clustering.KmeansForAgeEst2(
                db, where, screen_names_tr, n_clusters)
            # Flatten to one list: first n_clusters centers belong to
            # label 0, the remaining ones to label 1.
            flat_centers = [c for group in centers for c in group]

            svr = SVC(probability=True, kernel="linear", C=100)
            svr.fit(X_w_t, labels[train])

            right = 0
            p_lab = []
            for i, (w, t) in enumerate(zip(X_w_ts, X_t[test])):
                # Single samples are passed as 2-D rows, the form
                # scikit-learn expects.
                V_sim = pairwise_kernels(
                    flat_centers, np.atleast_2d(t), metric="chi2").ravel()
                V_sim = V_sim / V_sim.sum()
                proba = svr.predict_proba(np.atleast_2d(w))[0]
                prd_pro0 = proba[0]
                prd_pro1 = proba[1]

                vote_0 = V_sim[:n_clusters].max() * prd_pro0
                vote_1 = V_sim[n_clusters:].max() * prd_pro1
                if vote_0 > vote_1:
                    predic = 0
                elif vote_0 < vote_1:
                    predic = 1
                else:
                    # Undecidable by the combined vote: fall back to the
                    # SVC alone instead of reusing the previous sample's
                    # prediction (or failing on the very first sample).
                    predic = 0 if prd_pro0 >= prd_pro1 else 1
                p_lab.append(predic)

                if predic == y_test[i]:
                    right += 1
                    # Combined decision is right although the SVC alone
                    # would have been wrong.
                    if prd_pro0 > prd_pro1 and y_test[i] == 1:
                        error_svm.append(screen_names_ts[i])
                    if prd_pro0 < prd_pro1 and y_test[i] == 0:
                        error_svm.append(screen_names_ts[i])
                else:
                    # Combined decision is wrong; record whether the SVC
                    # alone would have been right.
                    if prd_pro0 < prd_pro1 and y_test[i] == 1:
                        error_proposed_msd.append(screen_names_ts[i])
                    elif prd_pro0 > prd_pro1 and y_test[i] == 0:
                        error_proposed_msd.append(screen_names_ts[i])
                    else:
                        error_both.append(screen_names_ts[i])

            scores = precision_recall_fscore_support(y_test, p_lab)
            precision = [a+b for a, b in zip(precision, scores[0])]
            recall = [a+b for a, b in zip(recall, scores[1])]
            F_score = [a+b for a, b in zip(F_score, scores[2])]
            score.append(float(right)/len(test))
    finally:
        # Release the connection once all folds are done (or on failure).
        conn.disconnect()

    for name in error_svm:
        file.write("error_svm:"+name+'\n')
    for name in error_proposed_msd:
        file.write("error_propsed_msd:"+name+"\n")
    for name in error_both:
        file.write("error_both:"+name+"\n")

    score = np.array(score)
    print('-' * 76)
    print("Cross-Validation scores:%s" % score)
    print("Mean Score:%s" % np.mean(score))
    print("Mean Precision:%s" % [float(precision[0])/n_folds, float(precision[1])/n_folds])
    print("Mean recall:%s" % [float(recall[0])/n_folds, float(recall[1])/n_folds])
    print("Mean F_score:%s" % [float(F_score[0])/n_folds, float(F_score[1])/n_folds])
    print('-' * 76)