Example #1
def testClusteringTopics():
    D = noPreprocessTopics('topics.txt')
    print(len(D))
    description, clusters, vectorspace, labels, distances = clustering(D, 'topics', number_clusters = 38)
    clusterToInterpret = list(filter(lambda x: 11 in x, clusters))
    print(clusterToInterpret)
    medoid, mean = interpret(np.array(clusterToInterpret), D, distances, 'topics')
    print("Medoid: ", medoid)
    print("Mean: ", mean)
    sil_score, avgCohesion, avgSeparation = evaluation(D, vectorspace, labels, distances)
    print("Silhouette: ", sil_score)
    print("AvgCohesion: ", avgCohesion)
    print("AvgSeparation: ", avgSeparation)
    cohesionValues = cohesion(labels, distances)
    separationValues = separation(labels, distances)

    plt.title('Cohesion values distribution for topics')
    plt.xlabel('Cohesion')
    plt.ylabel('Count')
    plt.hist(list(cohesionValues.values()), bins = 38)
    plt.show()

    plt.title('Separation values distribution for topics')
    plt.xlabel('Separation')
    plt.ylabel('Count')
    plt.hist(list(separationValues.values()), bins = 38)
    plt.show()
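The cohesion and separation helpers used throughout Examples #1-#3 are project-local. Judging only from the call sites (cluster labels plus a precomputed pairwise-distance matrix in, a dict of one value per cluster out), a minimal sketch could look like the following; the project's actual definitions may differ.

import numpy as np

def cohesion(labels, distances):
    # Sketch: cohesion of a cluster as the mean pairwise distance
    # between its members (an assumed, common definition).
    labels = np.asarray(labels)
    distances = np.asarray(distances)
    values = {}
    for c in np.unique(labels):
        idx = np.where(labels == c)[0]
        if len(idx) < 2:
            values[c] = 0.0
        else:
            sub = distances[np.ix_(idx, idx)]
            # average over the strict upper triangle (each pair once)
            values[c] = sub[np.triu_indices(len(idx), k=1)].mean()
    return values

def separation(labels, distances):
    # Sketch: separation of a cluster as the mean distance from its
    # members to every point outside the cluster.
    labels = np.asarray(labels)
    distances = np.asarray(distances)
    values = {}
    for c in np.unique(labels):
        inside = np.where(labels == c)[0]
        outside = np.where(labels != c)[0]
        if len(outside) == 0:
            values[c] = 0.0
        else:
            values[c] = distances[np.ix_(inside, outside)].mean()
    return values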
Example #2
def testClustering2():
    print("Getting docs...")
    #D = list(getEvaledDocs('qrels.train'))
    D = getAllFiles('./rcv1/D_train/')
    description, clusters, vectorspace, labels, distances = clustering(D, 'docs', number_clusters = 100)
    medoid, mean = interpret(clusters[0], D, distances, 'docs')
    print("Medoid: ", medoid)
    print("Mean: ", mean)
    sil_score, avgCohesion, avgSeparation = evaluation(D, vectorspace, labels, distances)
    print("Silhouette: ", sil_score)
    print("AvgCohesion: ", avgCohesion)
    print("AvgSeparation: ", avgSeparation)
    cohesionValues = cohesion(labels, distances)
    separationValues = separation(labels, distances)


    plt.title('Cohesion values distribution for Dtrain')
    plt.xlabel('Cohesion')
    plt.ylabel('Count')
    plt.hist(list(cohesionValues.values()), bins = 38)
    plt.show()

    plt.title('Separation values distribution for Dtrain')
    plt.xlabel('Separation')
    plt.ylabel('Count')
    plt.hist(list(separationValues.values()), bins = 38)
    plt.show()
Example #3
def testClustering1():
    D = []
    for i in ['R101', 'R121', 'R150', 'R170', 'R180']:
        D += getEvaledDocsForTopic('qrels.test', i, 'test')
    description, clusters, vectorspace, labels, distances = clustering(D, 'docs', number_clusters = 11)
    
    medoid, mean = interpret(clusters[0], D, distances, 'docs')
    print("Medoid: ", medoid)
    print("Mean: ", mean)
    sil_score, avgCohesion, avgSeparation = evaluation(D, vectorspace, labels, distances)
    print("Silhouette: ", sil_score)
    print("AvgCohesion: ", avgCohesion)
    print("AvgSeparation: ", avgSeparation)
    
    cohesionValues = cohesion(labels, distances)
    separationValues = separation(labels, distances)

    print(cohesionValues)
    print(separationValues)

    plt.title('Cohesion values distribution for document collection')
    plt.xlabel('Cohesion')
    plt.ylabel('Count')
    plt.hist(list(cohesionValues.values()), bins = 11)
    plt.show()

    plt.title('Separation values distribution for document collection')
    plt.xlabel('Separation')
    plt.ylabel('Count')
    plt.hist(list(separationValues.values()), bins = 11)
    plt.show()
Example #4
File: main.py Project: jesHrz/tipdm
def main():
    cluster.clustering(os.path.join(messages_path, '附件3_clean.xlsx'), os.path.join(results_path, 'result_last.xlsx'))
    hot_points.hotpoints_classify(os.path.join(results_path, 'result_last.xlsx'),
                                  os.path.join(results_path, 'result_hotpoints.xlsx'))

    max_label = merge_xlsx(os.path.join(results_path, 'result_last.xlsx'),
                           os.path.join(results_path, 'result_hotpoints.xlsx'),
                           os.path.join(results_path, 'result_merge.xlsx'))

    messages_dict = filter_data(os.path.join(results_path, 'result_merge.xlsx'),
                                os.path.join(results_path, 'result_filter.xlsx'), max_label=max_label)

    ans = get_score(os.path.join(results_path, 'result_filter.xlsx'))
    score = sorted(ans.items(), key=lambda x: x[1], reverse=True)

    print(score)
    generate_hotpoints_xlsx(messages_dict, score)
Example #5
def predictionLabel(data_test):

    label_pred = cluster.clustering()

    # Plot every sample's 24-hour load curve, one figure per cluster.
    x = list(range(24))
    for cluster_id, color in enumerate(['m', 'g', 'r', 'k']):
        plt.axis([0, 24, 0, 1.2])
        plt.xlabel('Time of day (h)')
        plt.ylabel('Load')
        plt.title('Load trend of cluster {}'.format(cluster_id + 1))
        for i in range(len(label_pred)):
            if label_pred[i] == cluster_id:
                plt.plot(x, data1[i], color)
        plt.show()

    model = decisionTree(date_rowDate, tem_rowDate, label_pred)
    label = model.predict(data_test)
    #print("This day belongs to class " + str(label))
    #print("This day belongs to class " + str(label + 1))
    #print(label)
    return label
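decisionTree is a project-local helper that evidently fits a classifier mapping per-day date/temperature features to the cluster labels found above. A minimal sketch with scikit-learn, under assumed input shapes (the names here are hypothetical):

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def decisionTree(dates, temperatures, labels):
    # Stack the per-day features into an (n_samples, n_features) matrix
    # and fit a tree that predicts a day's cluster label.
    X = np.column_stack((dates, temperatures))
    model = DecisionTreeClassifier()
    model.fit(X, labels)
    return model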
Example #6
def main(argv):
    evaluation_dir = ''
    output_dir = ''

    if len(sys.argv) == 5:
        try:
            opts, args = getopt.getopt(argv, "hc:o:", ["cfile=", "ofile="])
        except getopt.GetoptError:
            print('error')
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                print('main.py -c /path/to/evaluation/directory -o path/to/output/directory')
                sys.exit()
            elif opt in ("-c", "--cfile"):
                evaluation_dir = arg
            elif opt in ("-o", "--ofile"):
                output_dir = arg

    filelog = "Log.out"
    logging.basicConfig(filename=filelog, filemode='a', level=logging.DEBUG)

    info_path = os.path.join(evaluation_dir, "info.json")
    problem_folder = evaluation_dir
    dict_file = read_data.file_info(info_path)
    print(dict_file)
    # create folder to store csv file
    csv_folder = r"./csv"
    if not os.path.exists(csv_folder):
        os.makedirs(csv_folder)
    print("convert to csv..")
    # convert to csv
    read_data.convert_to_csv(problem_folder, csv_folder, dict_file)

    # create folder to store output file
    output_folder = output_dir
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    produce_output.create_output_folder_problem(output_folder, dict_file)

    print("processing each file..")
    # read each csv file and input to clustering algorithm
    for k, v in dict_file.items():
        csv_path = "./csv/" + k + "." + v + ".csv"
        print(csv_path)
        out_path = os.path.join(output_folder, k)
        print(out_path)
        text = pd.read_csv(csv_path, header=0, quoting=csv.QUOTE_MINIMAL)
        labels = []
        vec_features = []
        dict_output = {}
        dict_features = {}

        if v == 'en':
            # newer gensim versions expose this loader on KeyedVectors
            model = gensim.models.KeyedVectors.load_word2vec_format(
                'GoogleNews-vectors-negative300.bin', binary=True)
            vec_features = word2vec_average.getAvgFeatureVecs(
                word2vec_average.getCleanReviews(text, v), model, 300)
            labels = cluster.clustering_word2vec(vec_features)
        elif v == "nl":
            model = gensim.models.word2vec.Word2Vec.load(
                'nl-word2vec-model-300-word.bin')
            vec_features = word2vec_average.getAvgFeatureVecs(
                word2vec_average.getCleanReviews(text, v), model, 300)
            labels = cluster.clustering_word2vec(vec_features)
        elif v == 'gr':
            all_sent = []
            for article in text['article']:
                all_sent.append(article)
            # cluster once, after all articles are collected
            vec_features, labels = cluster.clustering(
                all_sent, dict_file[k])
        for i, doc_id in enumerate(text["id"]):
            dict_output[doc_id] = labels[i]
            dict_features[doc_id] = vec_features[i]
        list_all = produce_output.write_cluster(dict_output, out_path)

        # similarity between documents
        list_comb, all_sim = cluster.similarity_score(list_all, dict_features)
        list_sim = produce_output.write_ranking(list_comb, all_sim, out_path)
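getAvgFeatureVecs is project-local; the conventional technique behind a name like this is to represent each document by the average of its in-vocabulary word vectors. A minimal single-document sketch with a hypothetical signature (model is assumed to be a gensim KeyedVectors-style word-to-vector mapping):

import numpy as np

def avg_feature_vec(words, model, num_features):
    # Average the word2vec vectors of all in-vocabulary words of one document.
    vec = np.zeros(num_features, dtype=np.float32)
    n = 0
    for w in words:
        if w in model:       # skip out-of-vocabulary words
            vec += model[w]
            n += 1
    return vec / max(n, 1)   # avoid division by zero for empty documents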
Example #7
def mahalanobis_sq(xm, inverse_cov):
    # Reconstructed (hypothetical) header: the original snippet begins
    # mid-function with this squared-Mahalanobis-form return statement.
    return xm.dot(inverse_cov).dot(xm.T)

import numpy as np
import matplotlib.pyplot as plt
from optproblems.cec2005 import CEC2005  # assumed source of the CEC2005 test suite

function_id = 8
dimension = 2
obj = CEC2005(dimension)[function_id].objective_function
xlim = [1.5, 3.25]
ylim = [-2, -1]
positions = np.random.uniform([xlim[0], ylim[0]], [xlim[1], ylim[1]], size=(20, 2))
fitnesses = np.array([obj(x) for x in positions])
draw(obj, 'test.png', xlim=xlim, ylim=ylim, scatter=positions)  # draw() is a project-local helper
from cluster import weighted_gaussian
mean, cov = weighted_gaussian( positions, fitnesses )
from scipy.stats import multivariate_normal
rv = multivariate_normal(mean, cov)
from cluster import clustering
labels = clustering(positions, fitnesses)
print('mean:', mean)
print('cov:\n', cov)
print(labels)


x, y = np.mgrid[xlim[0]:xlim[1]:((xlim[1]-xlim[0])/100), ylim[0]:ylim[1]:((ylim[1]-ylim[0])/100)] 
pos = np.empty(x.shape+(2,))
pos[:,:,0] = x
pos[:,:,1] = y
plt.contourf(x, y, rv.pdf(pos))
plt.savefig('contourf.png')
plt.close()


new_p = np.array([2.35, -1.6])
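weighted_gaussian comes from the project's cluster module. A plausible reading of the call site is a fitness-weighted mean and covariance of the sampled positions, with better (lower) fitness weighted more heavily; a sketch under that assumption, with a rank-based weighting that the project may well replace by its own scheme:

import numpy as np

def weighted_gaussian(positions, fitnesses):
    # Rank-based weights: for minimization, the lowest fitness gets the
    # largest weight. (The project's own weighting scheme may differ.)
    order = np.argsort(fitnesses)
    weights = np.empty(len(fitnesses))
    weights[order] = np.arange(len(fitnesses), 0, -1)
    weights /= weights.sum()

    mean = weights @ positions
    centered = positions - mean
    cov = (weights[:, None] * centered).T @ centered
    return mean, cov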
Example #8
    # Rescale each projected dimension into the range (0.1, 0.9).
    for j in range(X_projected.shape[1]):
        _min = numpy.min(X_projected[:, j])
        _max = numpy.max(X_projected[:, j])
        if _min < 0:
            X_projected[:, j] += numpy.abs(_min)
            _max += numpy.abs(_min)
        X_projected[:, j] = 0.1 + 0.8 * (X_projected[:, j] / _max)

    # Plot the original data.
    for lbl in numpy.unique(y_truth[:, i]):
        ax[0,i].plot(X_projected[:,0][y_truth[:,i]==lbl], \
            X_projected[:,1][y_truth[:,i]==lbl], 'o', color=PLOTCOLS[lbl])

    # Cluster the data.
    y = clustering(X[:, :, i], mode="KMEANS", n_clusters=k)

    # Compute the silhouette score.
    sil = silhouette_score(X[:, :, i], y, metric="euclidean")

    # Compute overlap with the ground truth. Cluster label values are
    # arbitrary, so matching clusters can carry different labels; correct
    # for this by running through all possible permutations of the labels.
    perms = list(itertools.permutations(range(k)))
    overlap = numpy.zeros(y.shape[0], dtype=bool)
    max_overlap = 0
    closest_perm = list(numpy.unique(y))
    for perm in perms:
        # Create permuted array.
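The snippet is truncated inside the permutation loop. The idea it sets up, relabeling the clustering under each permutation and keeping the permutation that best matches the ground truth, can be sketched standalone like this (a hypothetical helper, not the original code):

import itertools
import numpy

def best_label_alignment(y_pred, y_true, k):
    # Try every permutation of the k cluster labels and keep the one
    # that maximizes agreement with the ground-truth labels.
    best_overlap, best_perm = -1.0, None
    for perm in itertools.permutations(range(k)):
        y_perm = numpy.array([perm[lbl] for lbl in y_pred])
        overlap = numpy.mean(y_perm == y_true)
        if overlap > best_overlap:
            best_overlap, best_perm = overlap, perm
    return best_perm, best_overlap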
Example #9
        # Load data.
        X = numpy.loadtxt(fpath_in, dtype=float, delimiter=",", \
            skiprows=1, unpack=False)
        y_truth = X[:,0].astype(int)
        X = X[:,1:]
        
        # Create NaN-filled matrices to hold the outcomes in.
        y = numpy.full((X.shape[0], len(CLUSTER_K[cluster_method])), numpy.nan)
        s = numpy.full((X.shape[0], len(CLUSTER_K[cluster_method])), numpy.nan)

        # Run through the requested numbers of clusters.
        for ki, k in enumerate(CLUSTER_K[cluster_method]):
            # Run cluster analysis.
            y[:,ki] = clustering(X, mode=cluster_method, n_clusters=k)
            # Compute a silhouette coefficient for each sample, but only if
            # more than one cluster was detected.
            if len(numpy.unique(y[:,ki])) == 1:
                s[:,ki] = 0.0
            else:
                s[:,ki] = silhouette_samples(X, y[:,ki], metric='euclidean')
        
        # Write to file.
        with open(fpath_out, "w") as f:
            header = ["cluster"]
            for ki, k in enumerate(CLUSTER_K[cluster_method]):
                header.append("y_k={}".format(k))
                header.append("s_k={}".format(k))
            f.write(",".join(map(str,header)))
            for i in range(X.shape[0]):
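The example cuts off inside the row-writing loop. A hypothetical continuation, assuming the leading "cluster" column holds the ground-truth label and each k contributes an (assignment, silhouette) pair per row; the original may differ:

                # Hypothetical continuation: one CSV row per sample.
                row = [y_truth[i]]
                for ki in range(len(CLUSTER_K[cluster_method])):
                    row.append(y[i, ki])
                    row.append(s[i, ki])
                f.write("\n" + ",".join(map(str, row)))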
Example #10
def main(videoPath, expect, showIt):
    start = time.time()
    '''
    Arguments:
    disHor        -- horizontal distance from the ring center to the robot
    disVer        -- vertical distance from the ring center to the robot
    ringActual    -- actual radius of the ring
    cameraHeight  -- height of the camera
    ballR         -- actual radius of the ball
    '''
    disHor = 600
    disVer = 200
    robotR = 20
    ringActual = 40
    cameraHeight = 20
    ballR = 17

    disHor -= robotR
    disVer += ringActual

    #pathL,pathR=videoPath

    camera = cv2.VideoCapture(videoPath)
    res, image = camera.read()
    res, image = camera.read()  # read a second frame; the first grab is discarded
    if not res:
        print("Image not captured!")
        return

    camera.release()

    x, y, r, points, isDetect = imageToPoints(image, disHor, videoPath, showIt)

    if (not isDetect):
        return decide(0, expect)
    if (len(points) > 7):
        points = cluster.clustering(points, showIt)

    # Too few detected balls make curve fitting impossible, so give up.
    print('points number:', len(points))
    if (len(points) < 4):
        print("Not enough balls detected, exiting")
        return decide(0, expect)

    coordinate, ringXYZ = obtainCoordinate(image, disHor, disVer, ringActual,
                                           cameraHeight, ballR, points, x, y, r)

    if (max(coordinate[1]) < disHor / 3):
        return decide(0, expect)

    line.drawGraph(coordinate.tolist(), showIt)
    bp = leastsq.draw3DLine(coordinate, showIt)
    bx, bz = leastsq.predictBallPos(disHor, bp)
    print("Ball position around ring\n (x,y,z)=({:.2f},{:.2f},{:.2f})".format(
        bx, disHor, bz))
    print("Ring position:\n (x,y,z)=({:.2f},{:.2f},{:.2f})".format(
        ringXYZ[0], ringXYZ[1], ringXYZ[2]))

    if (disVer - ringActual - 10 < bz < disVer + ringActual + 10):
        result = 1
    else:
        result = 0
    print("Use {:.2f} seconds".format(time.time() - start))
    return decide(result, expect), (bx, disHor, bz), ringXYZ
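leastsq.predictBallPos is project-local. One plausible minimal version fits the ball's flight as a parabola z(y) and evaluates it at the ring's horizontal distance; a sketch under that assumption (hypothetical name and data layout):

import numpy as np

def predict_ball_z(coordinate, disHor):
    # coordinate: array of observed ball positions, rows of (x, y, z).
    # Free flight is parabolic, so fit z as a quadratic in y and
    # evaluate the fit at the ring's horizontal distance disHor.
    y_obs, z_obs = coordinate[:, 1], coordinate[:, 2]
    a, b, c = np.polyfit(y_obs, z_obs, 2)  # highest-order coefficient first
    return a * disHor ** 2 + b * disHor + c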
Example #11
def cluster(df,
            NGroups=2,
            category_dic={
                PROD: [],
                COUNTRY: ['Ethiopia']
            },
            mode=0,
            Alg=0,
            init_mode=0,
            norm=False,
            PCA=False,
            dim=10):
    """
    Cluster the dataframe according to category_dic and mode.

    category_dic is what to categorize on. An empty list means everything.
    mode:
        - 0 takes PRICE as the value
        - 1 takes the Gradient as the value
        - 2 takes both PRICE and the Gradient as the value
    Alg:
        - 0 uses distance as the clustering method
        - 1 uses cosine as the clustering method
    init_mode:
        - 0 assigns clusters to the categories by cycling through range(NGroups) and padding with 0, e.g. 0 1 2 0 1 2 0 0
        - 1 assigns clusters to the categories by giving the first n categories cluster 0, then 1, and so on, e.g. 0 0 0 1 1 1 2 2
        - 2 assigns the categories to random clusters.
    norm:
        - False leaves the data unnormalized
        - True normalizes the data
    PCA:
        - False does not apply PCA
        - True applies PCA (plotting is then no longer possible, unless you do not use dim)
    """
    value = PRICE
    if mode == 1:
        value = 'Gradient'

    # create the dataset for k-means
    dates, categories, data = df_to_np_date_price(df,
                                                  category_dic,
                                                  value=value)
    # print(categories)  # print the categories being clustered
    # print(len(categories))

    if norm:
        data = (data - np.nanmin(data, axis=1)[:, None]) / (
            np.nanmax(data, axis=1) - np.nanmin(data, axis=1))[:, None]

    if mode == 2:
        _, _, data2 = df_to_np_date_price(df, category_dic, value='Gradient')
        tmp_data = data

        if norm:
            data2 = (data2 - np.nanmin(data2, axis=1)[:, None]) / (
                np.nanmax(data2, axis=1) - np.nanmin(data2, axis=1))[:, None]

        data = np.concatenate((data, data2), axis=1)

    # Cluster: iterate until the difference between the new and old group means converges (or 100 iterations pass).
    i = 0
    if PCA and norm:
        data = clus.PCA(data, dim)
    datagroup = clus.clustering(data, NGroups, init_mode)
    while np.max(
            np.sqrt(
                np.nansum((datagroup.GroupAvg - datagroup.NewGroupAvg)**2,
                          axis=1))) > 0.01 and i < 100:
        # print(datagroup.data[:,-1])  # print the intermediate steps of k-means
        if Alg == 0:
            datagroup.Clustering()
        elif Alg == 1:
            datagroup.Clustering2()
        i += 1

    # restore the data to PRICE only
    if mode == 2:
        data = tmp_data

    # build a dictionary of the cluster groups.
    dic = {}
    for cat, group in zip(categories, datagroup.data[:, -1]):
        if group in dic:
            dic[group].append(cat)
        else:
            dic[group] = [cat]

    # print the dictionary
    # i = 0
    # for group, catLst in dic.items():
    #     print(group, len(catLst))
    #     print(catLst)
    #     print(np.nanmean(datagroup.NewGroupAvg[i]))
    #     i += 1

    # plt.rcParams['axes.prop_cycle'] = "cycler('ls', ['-','--','-.',':']) * cycler(u'color', ['r','g','b','c','k','y','m','934c00'])" #changes the colour of the graph lines
    # for i, row in enumerate(data):
    #     # if i == 0:
    #     #     continue
    #     # if i > 3:
    #     #     break
    #     D = [float(date.split("-")[0]) + (float(date.split("-")[1]) - 1) / 12 for date in dates]
    #     plt.plot(D, row, label=categories[i])
    #
    # # # plot the cluster means
    # # for i in range(NGroups):
    # #     D = [float(date.split("-")[0]) + (float(date.split("-")[1]) - 1) / 12 for date in dates]
    # #     if mode == 2:
    # #         plt.plot(D, datagroup.NewGroupAvg[i, :data.shape[1]], label=i)
    # #     else:
    # #         plt.plot(D, datagroup.NewGroupAvg[i, :], label=i)
    #
    # # plot
    # plt.rcParams.update({'font.size': 16})
    # plt.rcParams['legend.fontsize'] = 16
    # plt.legend(fancybox=True,loc="best",framealpha=0.8)
    # plt.ylabel('Affordability index', fontsize=18)
    # plt.xlabel('Date (years)', fontsize=16)
    # plt.show(True)
    return dic, data
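A minimal usage sketch; df and the PROD/COUNTRY/PRICE constants are project-level names, so this is illustrative only:

# Cluster all products and countries on normalized prices into 3 groups.
groups, data = cluster(df, NGroups=3,
                       category_dic={PROD: [], COUNTRY: []},
                       mode=0, Alg=0, init_mode=2, norm=True)
for group_id, categories in sorted(groups.items()):
    print(group_id, len(categories), categories)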