Example #1
def calcBackProject(image, tags, histograms):
    probability = {}

    cl.kmeans(image)

    for tag in tags:
        print tag
        result = cv2.calcBackProject([image], [0, 1, 2], histograms[tag], [0, 180, 0, 256, 0, 256], 1)

        probabilityValue = cv2.countNonZero(result)/float(image.shape[0] * image.shape[1])
        probability[tag] = probabilityValue

    return probability
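Note: the example above depends on precomputed HSV histograms that are not shown; a minimal, self-contained sketch of the same back-projection idea with plain OpenCV follows. The random scene, patch, and bin counts are invented for illustration only.

import cv2
import numpy as np

# Hypothetical inputs: a random scene and a reference patch, both converted to
# HSV so the [0, 180, 0, 256, 0, 256] ranges make sense.
scene = cv2.cvtColor(np.random.randint(0, 256, (120, 160, 3), dtype=np.uint8),
                     cv2.COLOR_BGR2HSV)
patch = cv2.cvtColor(np.random.randint(0, 256, (40, 40, 3), dtype=np.uint8),
                     cv2.COLOR_BGR2HSV)

# Three-channel histogram of the patch, normalized to [0, 255].
hist = cv2.calcHist([patch], [0, 1, 2], None, [30, 32, 32],
                    [0, 180, 0, 256, 0, 256])
cv2.normalize(hist, hist, 0, 255, cv2.NORM_MINMAX)

# Back-project the patch histogram onto the scene and estimate the fraction of
# matching pixels, the same probability value computed above.
result = cv2.calcBackProject([scene], [0, 1, 2], hist,
                             [0, 180, 0, 256, 0, 256], 1)
probability = cv2.countNonZero(result) / float(scene.shape[0] * scene.shape[1])
print(probability)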
Example #2
File: cluster.py Project: darlliu/cscripts
def k (tags, data, K, tries=5):
    #first get the clustering info
    idxs=kmeans(str(data), K, tries)
    idxs=eval(idxs)
    print len(idxs)==len(tags)
    #for the clusters, get the numbers for each
    return idxs
Example #3
def onlytransfer(n_clusters, fb_kmeans=True):

    X = np.load('data/chan/8chan_pol/VGG16/fc1/featuresx.npy')
    X = X.astype('float32')
    pathfile = open('data/chan/8chan_pol/VGG16/fc1/paths.txt', "r")
    pathlist = pathfile.readlines()
    pathlist = [path[:-1] for path in pathlist]
    pathfile.close()

    if fb_kmeans:
        #features = torch.from_numpy(features)
        images_lists, loss = kmeans(X,
                                    nmb_clusters=n_clusters,
                                    preprocess=False)
        Y_pred = arrange_clustering(images_lists)
    else:
        km = KMeans(n_clusters=n_clusters, n_init=20)
        Y_pred = km.fit_predict(X)

    for y_pred, path in zip(Y_pred, pathlist):
        savedir = '/home/elahe/NortfaceProject/codes/DEC-keras/results/clusters/8chan_pol/%s/%s/%s' % (
            'transfer', 'fc1', y_pred)
        if not os.path.exists(savedir):
            os.makedirs(savedir)

        shutil.copy(path, savedir)
Example #4
def main():

    # check command-line arguments
    if len(sys.argv) not in [3, 4]:
        print "Error, incorrect number of arguments"
        usage()
        sys.exit(1)

    k = check_k(sys.argv[1])
    check_argv(sys.argv[2])
    if len(sys.argv) == 4:  # optional, file containing annotations
        check_argv(sys.argv[3])
        # initialize gene matrix using annotations
        geneMatrix = construct_geneMatrix(sys.argv[2], sys.argv[3])
    else:
        # index instances by line number in original file
        geneMatrix = construct_geneMatrix(sys.argv[2], None)

    #normalize matrix
    geneMatrix = normalize_matrix(geneMatrix)

    # call kmeans clustering on input files
    sse, aic, silhouette = kmeans(geneMatrix, k)

    # print results
    print "K-means with k = %d\n" % k
    print "%-15s %.2f\n%-15s %.2f\n%-15s %.2f\n" % ('SSE:', sse, 'AIC:', aic,
                                                    'Silhouette:', silhouette)
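Note: a rough, self-contained sketch of the three reported quantities using scikit-learn on synthetic data. The AIC formula here (SSE plus 2*k*d) is only one common k-means approximation and is not necessarily what the kmeans() call above computes.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = np.random.rand(200, 5)          # stand-in for the normalized gene matrix
k = 4
km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)

sse = km.inertia_                   # within-cluster sum of squares
aic = sse + 2 * k * X.shape[1]      # crude AIC-style penalty (assumption)
silhouette = silhouette_score(X, km.labels_)

print("K-means with k = %d\n" % k)
print("%-15s %.2f\n%-15s %.2f\n%-15s %.2f\n" % ('SSE:', sse, 'AIC:', aic,
                                                'Silhouette:', silhouette))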
Example #5
    def run_k_means(self):
        print("RUNNING KMEANS")
        print("Points")
        print(self.points)
        print("Starting Centers: {}".format([self.points[0], self.points[1]]))
        centers, clusters = clustering.kmeans(self.points, [self.points[0], self.points[1]] )
        # whichever center is lower is the blue center
        blue_center = centers[0]
        if centers[1][1] < centers[0][1]:
            blue_center = centers[1]

        # draw the centers and points
        self.click_canvas.delete("all")
        for center_i, center in enumerate(centers):
            color = None
            if center is blue_center:
                color='blue'
            else:
                color='red'
            print("Cluster {} with center {}".format(color, center))
            print("Cluster Points:")
            print(clusters[center_i])
            self.draw_center(center, color=color)
            for cluster_coord in clusters[center_i]:
                self.draw_dot(cluster_coord, color=color)
Example #6
def quantize_single_sample(points, k):
    """
    Applies quantization to a single sample with accelerometer observations.
    """
    X = np.asarray(points, dtype=np.float)
    centroids, _ = kmeans(X, n_clusters=k)
    feature_vector = centroids.flatten()
    return feature_vector
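Note: the kmeans import used above is not shown; a minimal sketch of the same quantization idea with scikit-learn's KMeans (synthetic accelerometer readings, illustrative k) follows.

import numpy as np
from sklearn.cluster import KMeans

points = np.random.randn(500, 3)    # e.g. (x, y, z) accelerometer readings
k = 8
km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(points)

# One fixed-length feature vector per sample: the flattened centroids.
feature_vector = km.cluster_centers_.flatten()
print(feature_vector.shape)         # (24,)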
Example #7
    def __init__(self):
        super(ClustersWidget, self).__init__()

        layout = QtGui.QVBoxLayout()
        clusters = kmeans()
        for cluster in clusters:
            if len(cluster.palettes) == 0: continue
            cw = ClusterWidget(cluster)
            layout.addWidget(cw)
        self.setLayout(layout)
Example #8
    def test_ffh_10(self):

        finalObj = []
        for rep in range(10):
            np.random.seed(1234 + rep)
            mu0 = clustering.initialize_clusters(self.X, 10, 'ffh')
            (mu, z, obj) = clustering.kmeans(self.X, mu0, doPlot=False)
            finalObj.append(obj[-1])

        targetObj = 0.44031610993896342
        self.assertTrue(abs(np.mean(finalObj) - targetObj) <= 1e-4)
Example #9
    def test_km_plus_plus_10(self):

        finalObj = []
        for rep in range(20):
            np.random.seed(1234 + rep)
            mu0 = clustering.initialize_clusters(self.X, 10, 'km++')
            (mu, z, obj) = clustering.kmeans(self.X, mu0, doPlot=False)
            finalObj.append(obj[-1])

        targetObj = 0.4392510535744174
        self.assertTrue(abs(np.mean(finalObj) - targetObj) <= 1e-4)
Example #10
def cluster(algorithm, similarity, encoding, outlier, rep, chart_id, key=None):
    """Returns the cluster each time series was placed in.

    Args:
        algorithm: The algorithm used for clustering. Must be "K-means"
            or "DBSCAN".
        similarity: The similarity measure used for scaling the data
            before clustering. Must be "Proximity" or "Correlation".
        encoding: The method used for encoding the labels. Must
            be "None" or "One-Hot".
        outlier: Whether outliers are identified, must be "on" or "off".
        rep: Whether the data is represented as "lines" or "bands".
        chart_id: The id of the file containing the data that k-means
            clustering is run on.
        key: The key for the time series labels that are saved. If None,
            then all label values may be kept, otherwise only label
            values with that key are kept.

    Returns:
        A json with a list containing the label of the cluster each
        time series was grouped in, and the min_max of each cluster and
        the corresponding dates for each value if rep == "bands",
        otherwise min_max and dates are empty lists.
    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        return data
    (time_series_data, label_dict, ts_to_labels, dates,
     old_range) = clustering.time_series_array(data, key)
    ts_data_updated = clustering.preprocess(time_series_data, encoding,
                                            similarity, ts_to_labels,
                                            algorithm)
    if algorithm == "k-means":
        labels = clustering.kmeans(ts_data_updated, outlier).tolist()
    elif algorithm == "k-means-constrained" or algorithm == "k-medians":
        labels = clustering.kmeans_kmedians(ts_data_updated, label_dict,
                                            ts_to_labels, algorithm,
                                            outlier).tolist()
    elif algorithm == "zone":
        labels = clustering.cluster_zone(label_dict, ts_to_labels)
    else:
        labels = clustering.dbscan(ts_data_updated, similarity, encoding,
                                   outlier).tolist()
    min_max, ordered_dates, outlier_indexes = [], [], []
    if rep == "bands":
        min_max, ordered_dates, outlier_indexes = clustering.clusters_min_max(
            time_series_data, labels, dates, old_range, outlier)
    return jsonify({
        "cluster_labels": labels,
        "min_max": min_max,
        "dates": ordered_dates,
        "outlier_indexes": outlier_indexes
    })
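Note: a hedged, self-contained sketch of the algorithm dispatch above, with scikit-learn estimators standing in for the project's clustering module; the data, cluster count, and DBSCAN parameters are illustrative only.

import numpy as np
from sklearn.cluster import DBSCAN, KMeans

ts_data = np.random.rand(30, 24)    # 30 time series, 24 points each

def run(algorithm):
    # Dispatch mirroring the handler: k-means for "k-means", DBSCAN otherwise.
    if algorithm == "k-means":
        return KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(ts_data)
    return DBSCAN(eps=0.9, min_samples=3).fit_predict(ts_data)

print(run("k-means").tolist())
print(run("dbscan").tolist())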
Example #11
    def test_kmeans(self):
        # Run k-means clustering on the test set using 4 clusters and check if the cluster centres
        # (centroids) lie close to the expected centroids based on a fixed seed
        centroids, assignments, numIts = clustering.kmeans(self.data,
                                                           self.numClusters,
                                                           maxNumIts=10)
        expectedCentroids = np.float32([[2.80, -2.73], [-3.38, -2.94],
                                        [2.62, 3.10], [-2.46, 2.78]])

        #		np.testing.assert_almost_equal(centroids, expectedCentroids, 2)

        if self.plotResults:
            self.plotData_kMeans(self.data, centroids, assignments, 1)
Example #12
    def step2_cal():
        """第二步计算,判断其他类是否需要分裂,若需要,则对其他类进行文本聚类,并做聚类评价
        """
        # Use the top TOPK_FREQ_WORD high-frequency words for cluster evaluation
        TOPK_FREQ_WORD = 50
        # Minimum cluster size for cluster evaluation
        LEAST_SIZE = 8

        # Decide whether the 'other' category needs to be split
        ifsplit = event.check_ifsplit(initializing)
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(timestamp)

        if ifsplit:
            inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(initializing)
            print eventid, ' after classify before split: ', len(inputs), kmeans_cluster_num, reserve_num
            if len(inputs) > 2:
                items = []
                for r in inputs:
                    r["title"] = r["title"].encode("utf-8")
                    r["content"] = r["content168"].encode("utf-8")
                    items.append(r)

                # k-means clustering
                kmeans_results = kmeans(items, k=kmeans_cluster_num)

                # Cluster evaluation
                if initializing or now_hour == 0:
                    min_tfidf = event.get_min_tfidf()
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=min_tfidf)
                else:
                    # For hourly clustering, skip the comparison with the minimum tfidf of existing clusters
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE)

                # Update news cluster labels and the sub-event table
                for label, items in final_cluster_results.iteritems():
                    if label == "other":
                        label = event.getOtherSubEventID()

                    event.save_subevent(label, timestamp)

                    if label != event.getOtherSubEventID():
                        # Update each cluster's tfidf
                        event.update_subevent_tfidf(label, tfidf_dict[label])

                    for r in items:
                        news = News(r["_id"], event.id)
                        news.update_news_subeventid(label)
            else:
                print 'inputs less than 2, kmeans aborted'

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)
Example #13
def kmeans(model_file=None, nclusters=None, output_file=None):
    word_vectors = KeyedVectors.load_word2vec_format(model_file, binary=False)
    clusters = clustering.kmeans(word_vectors, nclusters)[0]

    if output_file:
        out = open(output_file, 'w')
    else:
        out = sys.stdout

    for k in clusters:
        out.write("%s\n" % " ".join([fn for fn in clusters[k]]))

    if output_file:
        out.close()
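Note: a sketch of the same one-cluster-of-words-per-line output using scikit-learn and an in-memory vocabulary instead of a word2vec file; the vocabulary and vectors are synthetic.

import sys
from collections import defaultdict

import numpy as np
from sklearn.cluster import KMeans

vocab = ["cat", "dog", "fish", "car", "bus", "train"]
vectors = np.random.randn(len(vocab), 20)       # stand-in word embeddings

labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(vectors)
clusters = defaultdict(list)
for word, label in zip(vocab, labels):
    clusters[label].append(word)

for k in clusters:
    sys.stdout.write("%s\n" % " ".join(clusters[k]))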
Example #14
File: ai.py Project: KevinIoi/codenames-ai
    def getOptimalWordCluster(self, target_words, bomb_words):
        ''' Creates clusters of words by cosine similarity, with increasing
            centroid count until a cluster is created containing only target_words

            params:
                target_words (iterable, str):
                    words to maximize in clusters
                bomb_words (iterable, str):
                    words to be avoided in clusters
            returns:
                potential_groups (tuple)
                    A tuple containing the best cluster of words
                    format -> (group center word embedding, list of words in group)
        '''
        if len(target_words) < 1:
            raise ValueError("Empty target word list provided")
        if len(target_words) == 1:
            return target_words[0]

        full_word_set = np.array(target_words + bomb_words)

        embeddings = self.getWordEmbeddings(full_word_set)

        potential_groups = []
        num_groups = max(int(len(target_words) / 2), 1)

        # keep clustering until found a group of similar words without bomb_words
        # increase number of clusters after each failed iteration
        while not potential_groups:
            grouper = clustering.kmeans(k=num_groups, measure='cosine')
            grouper.fit(embeddings)
            cluster_labels = np.array(grouper.train_labels)

            #verify if any potential clusters were created
            for cluster in np.unique(cluster_labels):
                cluster_indices = cluster_labels == cluster
                current_cluster_words = full_word_set[cluster_indices]

                if len(current_cluster_words)>0 and \
                    not any([word in bomb_words for word in current_cluster_words]):
                    potential_groups.append(
                        [grouper.centroids[cluster], current_cluster_words])

            num_groups += 1
        # sort potential_groups by number of group members
        potential_groups = sorted(potential_groups,
                                  key=lambda x: len(x[1]),
                                  reverse=True)

        return potential_groups[0]
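Note: a simplified sketch of the grow-k-until-a-clean-cluster-appears loop described in the docstring, using scikit-learn's KMeans on L2-normalized vectors as a stand-in for the cosine-distance k-means used above; all words and embeddings are synthetic.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize

target_words = ["apple", "pear", "plum", "grape"]
bomb_words = ["rock"]
words = np.array(target_words + bomb_words)
embeddings = normalize(np.random.randn(len(words), 50))   # unit vectors

groups = []
num_groups = max(len(target_words) // 2, 1)
while not groups:
    labels = KMeans(n_clusters=num_groups, n_init=10,
                    random_state=0).fit_predict(embeddings)
    for c in np.unique(labels):
        members = words[labels == c]
        # Keep clusters that contain no bomb words.
        if len(members) and not any(w in bomb_words for w in members):
            groups.append(members)
    num_groups += 1

# Largest clean cluster, analogous to potential_groups[0] above.
print(sorted(groups, key=len, reverse=True)[0])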
Example #15
File: plots.py Project: devforfu/Blog
def main():
    np.random.seed(1)

    data_path = join(dirname(__file__), 'datasets', 'blobs.csv')
    X, _ = read_csv(data_path)

    figure, axes = plt.subplots(2, 2, figsize=(12, 12))
    axes = axes.flatten()
    font_path = join(dirname(__file__), 'fonts', 'RobotoSlab-Regular.ttf')
    font = font_manager.FontProperties(fname=font_path)
    font.set_size(16)
    gray = '#3c3c3c'

    for i, n_clusters in enumerate((2, 3, 4, 5)):
        print('Running K-means with k=%d' % n_clusters)
        centroids, score = kmeans(X, n_clusters)
        print('Best inertia score: %.2f' % score)

        letter = string.ascii_letters[i]
        title = '(%s) k=%d, inertia=%2.2f' % (letter, n_clusters, score)
        labels = assign_labels(X, centroids)
        get_color = palette()
        colors = [get_color(l) for l in labels]

        axes[i].scatter(X[:, 0], X[:, 1], c=colors, s=50, alpha=0.6)
        axes[i].set_title(title, fontproperties=font, color=gray)
        axes[i].set_xticks([])
        axes[i].set_yticks([])

        for spine in ('top', 'right', 'bottom', 'left'):
            axes[i].spines[spine].set_color(gray)

        for (x, y) in centroids:
            axes[i].plot(x,
                         y,
                         color='white',
                         markeredgewidth=1,
                         markeredgecolor=gray,
                         markersize=10,
                         marker='d')

    figure.tight_layout()
    figure.savefig('clusters.png', transparent=False)
Example #16
def frequency(similarity, algorithm, label_encoding, chart_id):
    """Runs kmeans and gets the frequencies of labels per time series
    and labels per cluster.

    Args:
        similarity: The similarity measure used for scaling the data
            before clustering. Must be "proximity" or "correlation".
        label_encoding: The method used for encoding the labels. Must
            be "none" or "one-hot".
        chart_id: The id of the file containing the data that k-means
            clustering is run on.

    Returns:
        A json with a list of cluster labels generated by running
        kmeans, an array of labels per time series and an array of
        labels per cluster.

    """
    data = load_data(chart_id)
    if "timeSeries" not in data:
        return data
    (time_series_data, label_dict, ts_to_labels, _,
     _) = clustering.time_series_array(data, None)
    time_series_data = clustering.preprocess(time_series_data, label_encoding,
                                             similarity, ts_to_labels,
                                             "k-means")
    if algorithm == "k-means":
        labels = clustering.kmeans(time_series_data, "off")
    elif algorithm == "k-means-constrained":
        labels = clustering.kmeans_kmedians(time_series_data, label_dict,
                                            ts_to_labels, algorithm, "off")

    cluster_labels = clustering.cluster_to_labels(labels, ts_to_labels)

    ordered_labels, ordered_clusters, ordered_ts = clustering.sort_labels(
        label_dict, cluster_labels, ts_to_labels)

    return jsonify({
        "labels": ordered_labels,
        "ts_labels": ordered_ts.tolist(),
        "cluster_labels": ordered_clusters.tolist()
    })
Example #17
def get_evaluation_scores(X,
                          ground_truth_labels,
                          clusters,
                          filepath,
                          ignored_indices=(),
                          result_file=None,
                          files=None):
    if result_file is not None:
        X = read_distance_matrix_from_file(result_file, (),
                                           len(ground_truth_labels))

    n_clusters_, predicted_labels = kmeans(X, clusters)
    ari_k = metrics.adjusted_rand_score(ground_truth_labels, predicted_labels)

    n_clusters_, predicted_labels = spectral_clustering(X, clusters)
    ari_s = metrics.adjusted_rand_score(ground_truth_labels, predicted_labels)

    filepath.write("Silhouttee, K-mean ARI, Spectral-ARI\n")
    filepath.write("%s,%s,%s\n" %
                   (silhouette_score(X, ground_truth_labels), ari_k, ari_s))
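Note: a compact, self-contained sketch of the same evaluation idea with scikit-learn: cluster, compare against ground truth with ARI, and report a silhouette score; the data and cluster count are synthetic stand-ins.

import numpy as np
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import adjusted_rand_score, silhouette_score

X = np.random.rand(100, 4)
ground_truth = np.random.randint(0, 3, size=100)

pred_k = KMeans(n_clusters=3, n_init=10, random_state=0).fit_predict(X)
pred_s = SpectralClustering(n_clusters=3, random_state=0).fit_predict(X)

print("Silhouette, K-means ARI, Spectral ARI")
print("%s,%s,%s" % (silhouette_score(X, ground_truth),
                    adjusted_rand_score(ground_truth, pred_k),
                    adjusted_rand_score(ground_truth, pred_s)))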
Example #18
File: conv.py Project: yujiali/pynn
def train_kmeans_layer(X, in_shape, K, ksize, n_patches_per_image, prep_type=None, pad_h=0, pad_w=0, repeat=1, **kwargs):
    train_data = get_random_patches(X, in_shape, ksize, n_patches_per_image, pad_h=pad_h, pad_w=pad_w)
    if prep_type is not None:
        prep = pp.choose_preprocessor_by_name(prep_type)
        prep.train(train_data)
        train_data = prep.process(train_data)
    else:
        prep = None

    C_best = None
    loss_best = None

    for i_repeat in xrange(repeat):
        print '*** repeat #%d ***' % (i_repeat + 1)
        gnp.free_reuse_cache()
        C, _, loss = clust.kmeans(train_data, K, **kwargs) 
        if loss_best is None or loss < loss_best:
            loss_best = loss
            C_best = C

    print '>>> best loss: %.2f' % loss_best
    return KMeansModel(C_best, kwargs.get('dist', 'euclidean'), in_shape.c, ksize, prep)
Example #19
File: conv.py Project: yujiali/pynn
def train_kmeans_layer(X,
                       in_shape,
                       K,
                       ksize,
                       n_patches_per_image,
                       prep_type=None,
                       pad_h=0,
                       pad_w=0,
                       repeat=1,
                       **kwargs):
    train_data = get_random_patches(X,
                                    in_shape,
                                    ksize,
                                    n_patches_per_image,
                                    pad_h=pad_h,
                                    pad_w=pad_w)
    if prep_type is not None:
        prep = pp.choose_preprocessor_by_name(prep_type)
        prep.train(train_data)
        train_data = prep.process(train_data)
    else:
        prep = None

    C_best = None
    loss_best = None

    for i_repeat in xrange(repeat):
        print '*** repeat #%d ***' % (i_repeat + 1)
        gnp.free_reuse_cache()
        C, _, loss = clust.kmeans(train_data, K, **kwargs)
        if loss_best is None or loss < loss_best:
            loss_best = loss
            C_best = C

    print '>>> best loss: %.2f' % loss_best
    return KMeansModel(C_best, kwargs.get('dist', 'euclidean'), in_shape.c,
                       ksize, prep)
Example #20
def recolor_left_half(rgb_lh_arr,
                      number_of_colors,
                      useElbowMethod=False,
                      low=2,
                      high=10,
                      num_of_iters=1000):
    rgb_lh_arr_flat_shape = (rgb_lh_arr.shape[0] * rgb_lh_arr.shape[1], 3)
    rgb_lh_arr_flat = rgb_lh_arr.reshape(
        rgb_lh_arr_flat_shape
    )  # matrix flattened into a vector of data points (r,g,b)
    if not useElbowMethod:  # if elbow method is False
        number_of_clusters = number_of_colors  # k = given number of colors
    else:
        elbow_data = cl.elbow(
            rgb_lh_arr_flat, low, high,
            num_of_iters)  # compute sse for k = low to k = high
        df = pd.DataFrame(elbow_data, columns=['k', 'cost'])
        cl.plot_elbow_data(df)
        number_of_clusters = int(
            input('Enter value of k at elbow point in the plot'))
        # number_of_clusters = number_of_colors
    centroids_arr, assigned_clusters_arr_flat = cl.kmeans(
        rgb_lh_arr_flat, number_of_clusters, 1000)  # execute k-means algorithm
    assigned_clusters_arr = assigned_clusters_arr_flat.reshape(
        (rgb_lh_arr.shape[0], rgb_lh_arr.shape[1]))  # reshape into a matrix
    recolored_lh_arr = np.zeros(
        (rgb_lh_arr.shape))  # array of zeros to recolor left-half of an image
    for index, centroid in enumerate(centroids_arr):
        bool_indices = (assigned_clusters_arr == index)  # boolean array
        recolored_lh_arr[bool_indices] = centroid  # vectorized assignment
    recolored_lh_arr = np.asarray(
        recolored_lh_arr,
        dtype='uint8')  # convert all float numbers into 8-bit integers (0-255)
    centroids_arr = centroids_arr.astype(
        dtype='uint8')  # convert all float numbers into 8-bit integers (0-255)
    return centroids_arr, assigned_clusters_arr, recolored_lh_arr, number_of_clusters
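Note: a hedged sketch of the elbow computation and the vectorized recoloring step, with scikit-learn's KMeans standing in for the cl.elbow/cl.kmeans helpers used above; the image and k values are synthetic.

import numpy as np
from sklearn.cluster import KMeans

pixels = np.random.randint(0, 256, (60, 80, 3), dtype=np.uint8)
flat = pixels.reshape(-1, 3).astype(float)

# "Elbow" data: SSE (inertia) for a range of k values.
for k in range(2, 6):
    print(k, KMeans(n_clusters=k, n_init=5, random_state=0).fit(flat).inertia_)

# Recolor every pixel with its assigned centroid.
km = KMeans(n_clusters=4, n_init=5, random_state=0).fit(flat)
assigned = km.labels_.reshape(pixels.shape[:2])
recolored = km.cluster_centers_[assigned].astype('uint8')
print(recolored.shape)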
Example #21
def add_filter(filters_list, column_index, column):
    from clustering import kmeans
    filters_list.append((column_index, kmeans(column)))
    return filters_list
Example #22
from clustering import svdpca,kmeans
import os, sys, string, numpy

ss=[[1,2],[4,2],[300,200]]
ll=svdpca(str(ss), 3, 2);
print ll;
ss=[[300,200],[1,2],[4,2]]
ll=kmeans(str(ss), 2, 2);
print ll;
#exchange
#sss = [[1,2],[4,2],[3,1]]
#ll=svdpca(str(ss), 3,2);
#print ll;
#ll=svdpca(str(sss), 3,2);
#print ll;
Example #23
    def _select_centers(self, X):
        # random_args = np.random.choice(len(X), self.n_neurons)
        # centers = X[random_args]
        centers, _, sigmas = kmeans(X, self.n_neurons, 'kmeanspp', 100)
        return centers, sigmas
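Note: the kmeans(X, n, 'kmeanspp', 100) helper above is project-specific; a sketch of the same idea with scikit-learn, selecting RBF centers via k-means++ and deriving a per-cluster width (sigma) as the mean distance to the center, follows. The data and neuron count are illustrative.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.randn(300, 2)
n_neurons = 10

km = KMeans(n_clusters=n_neurons, init='k-means++', n_init=10,
            random_state=0).fit(X)
centers = km.cluster_centers_
# One width per neuron: average distance of its members to the center.
sigmas = np.array([
    np.linalg.norm(X[km.labels_ == i] - centers[i], axis=1).mean()
    for i in range(n_neurons)
])
print(centers.shape, sigmas.shape)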
Example #24
def sensors_keywords_detection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = task_detail[7]

    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # 1. Aggregate all keyword-related original weibo posted by the sensor users in the previous 12 hours
    forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range, 1, social_sensors)
    # 2. Aggregate original weibo in the current period
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1, social_sensors)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list = list(set(all_mid_list))
    print len(all_mid_list)
    # 3. Query retweet and comment counts in the current window for the current originals and the previous 12 hours' originals, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1, social_sensors)
    current_total_count = statistics_count['total_count']
    # Total number of weibo in the current period
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # 4. Aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list, 1)
        sentiment_count = search_results
        print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # 5. Which social sensors took part in the event discussion
    query_body = {
        "query":{
            "filtered":{
                "filter":{
                    "bool":{
                        "must":[
                            {"range":{
                                "timestamp":{
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }},
                            {"terms":{"uid": social_sensors}}
                        ],
                        "should":[
                            {"terms": {"root_mid": all_mid_list}},
                            {"terms": {"mid": all_mid_list}}
                        ]
                    }
                }
            }
        },
        "size": 10000
    }

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1

    search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_body)['hits']['hits']
    attend_users = []
    if search_results:
        for item in search_results:
            attend_users.append(item['_source']['uid'])

    important_users = list(set(attend_users))
    print "important users", important_users


    # 6. Sensitive-word detection: if such a sensitive word appears in a sensor's weibo, a warning is raised ------ PS. sensitive words are a risky setting
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0

    if sensitive_words:
        query_sensitive_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must":[
                                {"range":{
                                    "timestamp":{
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }}
                                },
                                {"terms": {"keywords_string": sensitive_words}},
                                {"terms": {"uid": social_sensors}}
                            ]
                        }
                    }
                }
            },
            "aggs":{
                "all_list":{
                    "terms":{"field": "message_type"}
                }
            }
        }

        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass

            sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number


    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal # "0"

    if sensitive_total_weibo_number: # abnormal number of sensitive weibo
        print "======================"
        if forward_warning_status == signal_brust: # an event is already underway, switch to event tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # Use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count+1.96*std_count: # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust: # an event is already underway, switch to event tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition # count anomaly
        if negetive_count > mean_sentiment+1.96*std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust: # an event is already underway, switch to event tracking
                warning_status = signal_track

    if int(stop_time) <= ts: # check whether the task has finished
        finish = finish_signal

    tmp_burst_reason = burst_reason
    topic_list = []
    # 7. Sensed events, all_mid_list
    if burst_reason: # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "bool":{
                                "must":[
                                    {"range":{
                                        "timestamp":{
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }}
                                    },
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})

            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']["hits"]
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict) # cleaned text: mid, text
                        mid_set.add(iter_mid)
            burst_reason.replace(signal_sensitive_variation, "")


        if burst_reason and all_mid_list:
            sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": all_mid_list}, fields=["mid", "text"])["docs"]
            if sensing_text:
                for item in sensing_text:
                    if item['found']:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)
                            mid_set.add(iter_mid)

        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = top_word.keys()
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = "" #没有相关微博,归零
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list) # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list) # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)[0:5]
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list:", topic_list

    if not topic_list:
        tmp_burst_reason = signal_nothing_variation
        warning_status = signal_nothing

    results = dict()
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(important_users)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results["clustering_topic"] = json.dumps(topic_list)

    # Store this period's information in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # Update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result)

    return "1"
Example #25
def rbf_train(x,
              y,
              gamma=1,
              x_validation=None,
              y_validation=None,
              num_hidden_nodes=10,
              output="regression",
              num_epochs=100,
              alpha=1,
              mini_batch_size=1,
              compute_loss=True):

    # Preprocess the input and output data

    x_matrix = x

    y = y.copy()
    if len(y.shape) == 1:
        y = y[:, None]

    if y_validation is not None:
        y_validation = y_validation.copy()
        if len(y_validation.shape) == 1:
            y_validation = y_validation[:, None]

    # Select the hidden and output activation functions

    activation_function = activation_functions.rbf
    out_activation_function, out_grad_activation_function, loss = select_output_type(
        output, y)

    # Initialize the weights
    w_hidden = clustering.kmeans(x_matrix,
                                 num_hidden_nodes,
                                 max_iter=100,
                                 num_rep=10)['centroids']
    w_output = np.random.normal(size=(y.shape[1], num_hidden_nodes + 1))

    loss_history = []
    validation_loss_history = []
    for epoch in range(num_epochs):

        random_permutation = np.array_split(
            np.random.permutation(y.shape[0]),
            np.ceil(y.shape[0] / mini_batch_size))

        for i in random_permutation:

            xi = x_matrix[i]
            yi = y[i]

            # Forward pass
            layer_z = np.vstack((np.ones(
                (1, xi.shape[0])), activation_function(w_hidden, xi, gamma)))
            model_output = out_activation_function(w_output @ layer_z)

            # Compute error
            error = yi.T - model_output

            # Update the weights
            w_output += alpha * error @ layer_z.T / xi.shape[0]

        if compute_loss:

            layer_z = np.vstack((np.ones((1, x_matrix.shape[0])),
                                 activation_function(w_hidden, x_matrix,
                                                     gamma)))
            model_output = out_activation_function(w_output @ layer_z)

            loss_history.append(loss(y, model_output.T))

            if x_validation is not None:

                layer_z = np.vstack(
                    (np.ones((1, x_validation.shape[0])),
                     activation_function(w_hidden, x_validation, gamma)))
                model_output_validation = out_activation_function(
                    w_output @ layer_z)

                validation_loss_history.append(
                    loss(y_validation, model_output_validation.T))

    return {
        'w_hidden': w_hidden,
        'w_output': w_output,
        'gamma': gamma,
        'loss_history': loss_history,
        'validation_loss_history': validation_loss_history,
        'output': output,
        'activation_function': activation_function,
        'out_activation_function': out_activation_function
    }
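Note: rbf_train depends on project modules (clustering, activation_functions) that are not shown, so a direct call is not reproducible here. The following self-contained sketch captures the core idea: place RBF hidden units at k-means centroids, then fit the output weights, here with a single least-squares step instead of the mini-batch delta rule above.

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 2))
y = np.sin(x[:, 0]) + 0.1 * rng.normal(size=200)

gamma, n_hidden = 1.0, 10
centers = KMeans(n_clusters=n_hidden, n_init=10,
                 random_state=0).fit(x).cluster_centers_

# RBF activations plus a bias column.
dists = np.linalg.norm(x[:, None, :] - centers[None, :, :], axis=2)
layer_z = np.hstack([np.ones((len(x), 1)), np.exp(-gamma * dists ** 2)])

# Output weights via least squares.
w_output, *_ = np.linalg.lstsq(layer_z, y, rcond=None)
print("training MSE:", np.mean((layer_z @ w_output - y) ** 2))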
Example #26
    def cluster(self, X):
        centers, data_centers, sigmas = kmeans(X, self.n_neurons, 'kmeanspp',
                                               100)
        return centers, data_centers, sigmas
Example #27
  ffeatures, fids = get_sents_training( model_name )

  # Topic modelling
  decomposer = [lsa, lda, nmf, None][1]
  n_topics = 15
  x_topic = topic_model_run(decomposer, n_topics, ffeatures, fids)


  # Clustering
  from clustering import birch
  from clustering import kmeans
  from clustering import ward_linkage

  x, _ = count_vector( 
    x_topic, 
    ngram=(1,2),
    max_df=0.99, 
    min_df=0.1
  )
  n_clusters = 10

  plot_dimension = 2
  x, _ = pca(x, plot_dimension)
  km, (centroids, c, k) = kmeans(x, n_clusters)
  plot(x, centroids, c, k, "K-Means", plot_dimension)
  if fids: 
    print_clusters(c, fids)

  print_measure("K-Means", "silhouette", silhouette(x, c))

Example #28
def silhouette(k, word_vectors):
    km = kmeans(word_vectors, k)[1]
    return silhouette_score(word_vectors.syn0norm, km.labels_)
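Note: a hedged equivalent with scikit-learn that avoids relying on gensim's syn0norm attribute: fit k-means on an embedding matrix and score the assignment with the silhouette coefficient (synthetic vectors here).

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

vectors = np.random.randn(200, 50)      # stand-in for word_vectors
k = 5
labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(vectors)
print(silhouette_score(vectors, labels))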
Example #29
File: run.py Project: fliem/myelinconnect
            embedding_recort, embedding_dict = embedding(upper_corr, full_shape, mask, n_embedding)

            np.save(embed_file%(smooth, masktype, hemi, str(n_embedding)),embedding_recort)
            pkl_out = open(embed_dict_file%(smooth, masktype, hemi, str(n_embedding)), 'wb')
            pickle.dump(embedding_dict, pkl_out)
            pkl_out.close()
            

        '''clustering'''
        if calc_cluster:
            if not calc_embed:
                embedding_recort = np.load(embed_file%(smooth, masktype, hemi, str(n_embedding)))
                mask = np.load(mask_file%(hemi, masktype))
            for nk in n_kmeans:
                print 'clustering %s'%str(nk)
                kmeans_recort = kmeans(embedding_recort, nk, mask)
                np.save(kmeans_file%(smooth, masktype, hemi, str(n_embedding), str(nk)),
                        kmeans_recort)
                
                if calc_subcluster:
                    print 'subclustering %s'%str(nk)
                    v, f, d = read_vtk(mesh_file%hemi)
                    subclust_arr=subcluster(kmeans_recort, f)
                    np.save(subclust_file%(smooth, masktype, hemi, str(n_embedding), str(nk)), subclust_arr)  
                     
                        

        '''subclustering'''
        if not calc_cluster:
            if calc_subcluster:
                
Example #30
def specific_keywords_burst_dection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = int(task_detail[7])
    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # list of original weibo from the previous time period
    forward_origin_weibo_list = query_mid_list(ts-time_interval, keywords_list, forward_time_range)
    # list of original weibo in the current period
    current_mid_list = query_mid_list(ts, keywords_list, time_interval)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    print "all mid list: ", len(all_mid_list)
    # Query retweet and comment counts in the current window for the current originals and the previous 12 hours' originals, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval, keywords_list)
    current_total_count = statistics_count['total_count']
    # Total number of weibo in the current period
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    # Monitoring of sensitive weibo: given the sensors and the sensitive words, any mention of a sensitive word in a sensor's weibo is treated as a warning

    # Aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts-time_interval)
    if datetime != datetime_1:
        index_name = flow_text_index_name_pre + datetime_1
    else:
        index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval, keywords_list)

        sentiment_count = search_results
        print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # Aggregate important users in the current window
    important_uid_list = []
    if exist_es:
        #search_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
        search_results = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
        important_uid_list = search_results.keys()
        if datetime != datetime_1:
            index_name_1 = flow_text_index_name_pre + datetime_1
            if es_text.indices.exists(index_name_1):
                #search_results_1 = es_text.search(index=index_name_1, doc_type=flow_text_index_type, body=aggregation_sensor_keywords(ts-time_interval, ts, [], "root_uid", size=IMPORTANT_USER_NUMBER))['aggregations']['all_keywords']['buckets']
                search_results_1 = query_hot_weibo(ts, all_mid_list, time_interval, keywords_list, aggregation_field="root_uid", size=100)
                if search_results_1:
                    for item in search_results_1:
                        important_uid_list.append(item['key'])
    # Match important users from the profile database using the obtained uid_list
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name, doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                        filter_important_list.append(item['_id'])
    print filter_important_list

    # 6. Sensitive-word detection: if such a sensitive word appears in a sensor's weibo, a warning is raised ------ PS. sensitive words are a risky setting
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0

    if sensitive_words:
        query_sensitive_body = {
            "query":{
                "filtered":{
                    "filter":{
                        "bool":{
                            "must":[
                                {"range":{
                                    "timestamp":{
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }}
                                },
                                {"terms": {"keywords_string": sensitive_words}}
                            ]
                        }
                    }
                }
            },
            "aggs":{
                "all_list":{
                    "terms":{"field": "message_type"}
                }
            }
        }
        if social_sensors:
            query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})

        sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass

            sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number




    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal # "0"
    process_status = "1"

    if sensitive_total_weibo_number > WARNING_SENSITIVE_COUNT: # abnormal number of sensitive weibo
        print "======================"
        if forward_warning_status == signal_brust: # an event is already underway, switch to event tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # Use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count+1.96*std_count: # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust: # an event is already underway, switch to event tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition # count anomaly
        if negetive_count > mean_sentiment+1.96*std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition # negative-sentiment anomaly; "12" means both are anomalous
            if forward_warning_status == signal_brust: # an event is already underway, switch to event tracking
                warning_status = signal_track

    if int(stop_time) <= ts: # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # 7. Sensed events, all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []
    # Check whether sensitive weibo appeared: if so, aggregate the sensitive weibo (replace); otherwise aggregate ordinary weibo
    if burst_reason: # something happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "bool":{
                                "must":[
                                    {"range":{
                                        "timestamp":{
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }}
                                    },
                                    {"terms": {"keywords_string": sensitive_words}}
                                ]
                            }
                        }
                    }
                },
                "size": 10000
            }

            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool']['must'].append({"terms":{"uid": social_sensors}})

            sensitive_results = es_text.search(index=index_name, doc_type=flow_text_index_type, body=query_sensitive_body)['hits']['hits']
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict) # cleaned text: mid, text
                        mid_set.add(iter_mid)
            burst_reason.replace(signal_sensitive_variation, "")

        current_origin_mid_list = query_mid_list(ts, keywords_list, time_interval, 1)
        print "current_origin_mid_list:", len(current_origin_mid_list)
        if burst_reason and current_mid_list:
            origin_sensing_text = es_text.mget(index=index_name, doc_type=flow_text_index_type, body={"ids": current_origin_mid_list}, fields=["mid", "text"])["docs"]
            if origin_sensing_text:
                for item in origin_sensing_text:
                    if item["found"]:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict) # cleaned text: mid, text
                            mid_set.add(iter_mid)

        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = [top_word.keys()]
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = "" #没有相关微博,归零
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list) # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words, text_list) # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            print "========================================================================================"
            print "========================================================================================="
            sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list, ", topic_list

    if not topic_list:
        warning_status = signal_nothing
        tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results['sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # Store this period's information in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # Update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=task_name, body=temporal_result)

    return "1"
Example #31
#Module with functions to manipulate files
import manipulate_files
import fraud_simulation
import prediction
import clustering

# Reads .txt files
data=manipulate_files.read_files()
print('Files read')

# Sorts the files in order to be space relevant
data=manipulate_files.sort_files(data)
print('Files sorted')

# Finds consumers with full data for a year
# Output: index MeterID consumers vs halfhours of a year
timeseries=manipulate_files.find_full_values(data)
print('Dataset formatted to annual data')

# Applies types of attack
# Outputs X and Y with MeterID index
X, Y=fraud_simulation.typical_attack(timeseries)
print('Fraud Simulation done')

# Computes Performance
prediction.liblinear(X,Y)

# Clustering with PCA and K-Means
clustering.kmeans(X)
Example #32
    def step2_cal():
        """第二步计算,判断其他类是否需要分裂,若需要,则对其他类进行文本聚类,并做聚类评价
        """
        # Use the top TOPK_FREQ_WORD high-frequency words for cluster evaluation
        TOPK_FREQ_WORD = 50
        # Minimum cluster size for cluster evaluation
        LEAST_SIZE = 8

        # Decide whether the 'other' category needs to be split
        ifsplit = event.check_ifsplit(initializing)
        print '[%s] ' % ts2datetime(
            int(time.time())
        ), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(
            timestamp)

        if ifsplit:
            inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(
                initializing)
            print eventid, ' after classify before split: ', len(
                inputs), kmeans_cluster_num, reserve_num
            if len(inputs) > 2:
                items = []
                for r in inputs:
                    r["title"] = r["title"].encode("utf-8")
                    r["content"] = r["content168"].encode("utf-8")
                    items.append(r)

                # k-means clustering
                kmeans_results = kmeans(items, k=kmeans_cluster_num)

                # Cluster evaluation
                if initializing or now_hour == 0:
                    min_tfidf = event.get_min_tfidf()
                    final_cluster_results, tfidf_dict = cluster_evaluation(
                        kmeans_results,
                        top_num=reserve_num,
                        topk_freq=TOPK_FREQ_WORD,
                        least_size=LEAST_SIZE,
                        min_tfidf=min_tfidf)
                else:
                    # For hourly clustering, skip the comparison with the minimum tfidf of existing clusters
                    final_cluster_results, tfidf_dict = cluster_evaluation(
                        kmeans_results,
                        top_num=reserve_num,
                        topk_freq=TOPK_FREQ_WORD,
                        least_size=LEAST_SIZE)

                # Update news cluster labels and the sub-event table
                for label, items in final_cluster_results.iteritems():
                    if label == "other":
                        label = event.getOtherSubEventID()

                    event.save_subevent(label, timestamp)

                    if label != event.getOtherSubEventID():
                        # Update each cluster's tfidf
                        event.update_subevent_tfidf(label, tfidf_dict[label])

                    for r in items:
                        news = News(r["_id"], event.id)
                        news.update_news_subeventid(label)
            else:
                print 'inputs less than 2, kmeans aborted'

        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)
Example #33
if __name__ == "__main__":
    s_index = "20150701"
    e_index = "20150702"
    features_as_list, ids = indexed_features(s_index, e_index)

    n_topics = 0
    all_features = []
    for index, feature in features_as_list:
        all_features += feature
        round_topics = int(len(feature) / 25) + 1
        n_topics += round_topics

    dictionary, corpus = create_corpus(index, all_features)
    x, lda_model, lda_corpus = single_lda("20150703", corpus, dictionary,
                                          n_topics)

    x = pca(x, 2)
    km, (centroids, c, k) = kmeans(x, n_topics)
    cluster_plot_2d(x, centroids, c, k)

    clusters = {}
    for idx, el in enumerate(c):
        if not el in clusters:
            clusters[el] = []
        clusters[el].append(idx)

    for key, doc in clusters.items():
        for _id in doc[:5]:
            print(_id)
        print("-" * 40)

예제 #35
0

    plt.style.use(args.style)

    df = data.read_data(args.data_filepath)

    if(args.plot_raw_data):
        plotting.plot_all_data(df, "Raw Data", "Raw", os.path.join(args.output_folderpath, "raw"), data.column_names)

    if(args.plot_standardized_data):
        clean_df = data.clean_data(df)
        plotting.plot_all_data(clean_df, "Standardized Data", "Clean", os.path.join(args.output_folderpath, "clean"), data.column_names)

    if(args.perform_pca):
        num_dimensions = args.num_dimensions
        projected_df = pca.perform_pca(df, num_dimensions)

        output_path = os.path.join(args.output_folderpath, "PCA")
        if(not(os.path.exists(output_path))):
            os.makedirs(output_path)

        print "Saving Graphs"
        plotting.plot_all_data(projected_df, "PCA {0}-D".format(num_dimensions), "PCA", output_path, projected_df.columns)


    if(args.perform_kmeans):
        output_path = os.path.join(args.output_folderpath, "KMeans")
        if (not (os.path.exists(output_path))):
            os.makedirs(output_path)
        clustering.kmeans(df, 6, 7, 2, output_path, df.columns[6:8])
예제 #36
0

if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', level=logging.INFO)

    # Data
    model_name = "20150701"
    ffeatures, fids = get_sents_training(model_name)

    # Topic modelling
    decomposer = [lsa, lda, nmf, None][1]
    n_topics = 15
    x_topic = topic_model_run(decomposer, n_topics, ffeatures, fids)

    # Clustering
    from clustering import birch
    from clustering import kmeans
    from clustering import ward_linkage

    x, _ = count_vector(x_topic, ngram=(1, 2), max_df=0.99, min_df=0.1)
    n_clusters = 10

    plot_dimension = 2
    x, _ = pca(x, plot_dimension)
    km, (centroids, c, k) = kmeans(x, n_clusters)
    plot(x, centroids, c, k, "K-Means", plot_dimension)
    if fids:
        print_clusters(c, fids)

    print_measure("K-Means", "silhouette", silhouette(x, c))
예제 #37
0
def social_sensing(task_detail):
    # task name, sensors, stop time, previous warning status, creator, timestamp
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    stop_time = task_detail[2]
    forward_warning_status = task_detail[3]
    create_by = task_detail[4]
    ts = int(task_detail[5])

    # PART 1
    forward_result = get_forward_numerical_info(task_name, ts, create_by)
    # original / retweeted weibo lists from the previous time window
    forward_origin_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range)
    forward_retweeted_weibo_list = query_mid_list(ts-time_interval, social_sensors, forward_time_range, 3)
    # original weibo list in the current window
    current_mid_list = query_mid_list(ts, social_sensors, time_interval)
    current_retweeted_mid_list = query_mid_list(ts, social_sensors, time_interval, 3)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(current_retweeted_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list.extend(forward_retweeted_weibo_list)
    all_origin_list = []
    all_origin_list.extend(current_mid_list)
    all_origin_list.extend(forward_origin_weibo_list)
    all_retweeted_list = []
    all_retweeted_list.extend(current_retweeted_mid_list)
    all_retweeted_list.extend(forward_retweeted_weibo_list) # mid / root-mid of retweeted weibo
    print "all mid list: ", len(all_mid_list)
    print "all_origin_list", all_origin_list
    print "all_retweeted_list", all_retweeted_list

    # query retweet and comment counts of these weibo in the current window, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval)
    if all_origin_list:
        origin_weibo_detail = query_hot_weibo(ts, all_origin_list, time_interval) # details of original weibo
    else:
        origin_weibo_detail = {}
    if all_retweeted_list:
        retweeted_weibo_detail = query_hot_weibo(ts, all_retweeted_list, time_interval) # details of retweeted weibo
    else:
        retweeted_weibo_detail = {}
    current_total_count = statistics_count['total_count']

    # total number of weibo in the current window
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']


    # PART 2
    # aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    # sentiment_dict = {"0": "neutral", "1":"positive", "2":"sad", "3": "anger"}
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    search_results = aggregation_sentiment_related_weibo(ts, all_mid_list, time_interval)
    sentiment_count = search_results
    print "sentiment_count: ", sentiment_count
    negetive_key = ["2", "3", "4", "5", "6"]
    negetive_count = 0
    for key in negetive_key:
        negetive_count += sentiment_count[key]


    # aggregate important users in the current window
    important_uid_list = []
    datetime = ts2datetime(ts-time_interval)
    index_name = flow_text_index_name_pre + datetime
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = get_important_user(ts, all_mid_list, time_interval)
        important_uid_list = search_results.keys()
    # match important users from the user-portrait index using the uid_list obtained above
    if important_uid_list:
        important_results = es_user_portrait.mget(index=portrait_index_name,doc_type=portrait_index_type, body={"ids": important_uid_list})['docs']
    else:
        important_results = {}
    filter_important_list = [] # uid_list
    if important_results:
        for item in important_results:
            if item['found']:
                #if item['_source']['importance'] > IMPORTANT_USER_THRESHOULD:
                filter_important_list.append(item['_id'])
    print filter_important_list


    # sensing decision
    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal # "0"
    process_status = "1"

    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if mean_count >= MEAN_COUNT and current_total_count > mean_count+1.96*std_count or current_total_count >= len(social_sensors)*0.2*AVERAGE_COUNT: # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust: # an event is already in progress, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition # count anomaly

        if negetive_count > mean_sentiment+1.96*std_sentiment and mean_sentiment >= MEAN_COUNT or negetive_count >= len(social_sensors)*0.2*AVERAGE_COUNT:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition # negative-sentiment anomaly; "12" means both anomalies occurred
            if forward_warning_status == signal_brust: # an event is already in progress, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts: # check whether the task has finished
        finish = finish_signal
        process_status = "0"

    # sensed events, based on all_mid_list
    tmp_burst_reason = burst_reason
    topic_list = []

    # start only when an event is detected
    if warning_status:
        index_list = []
        important_words = []
        datetime_1 = ts2datetime(ts)
        index_name_1 = flow_text_index_name_pre + datetime_1
        exist_es = es_text.indices.exists(index=index_name_1)
        if exist_es:
            index_list.append(index_name_1)
        datetime_2 = ts2datetime(ts-DAY)
        index_name_2 = flow_text_index_name_pre + datetime_2
        exist_es = es_text.indices.exists(index=index_name_2)
        if exist_es:
            index_list.append(index_name_2)
        if index_list and all_mid_list:
            query_body = {
                "query":{
                    "filtered":{
                        "filter":{
                            "terms":{"mid": all_mid_list}
                        }
                    }
                },
                "size": 2000
            }
            search_results = es_text.search(index=index_list, doc_type="text", body=query_body)['hits']['hits']
            text_list = []
            if search_results:
                for item in search_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    text_list.append(temp_dict)
            for item in text_list:
                print item['text']
            if len(text_list) == 1:
                top_word = freq_word(text_list[0])
                topic_list = [top_word.keys()]
            elif len(text_list) == 0:
                topic_list = []
                tmp_burst_reason = "" #没有相关微博,归零
                print "***********************************"
            else:
                feature_words, input_word_dict = tfidf(text_list) # build feature words and input data
                word_label, evaluation_results = kmeans(feature_words, text_list) # clustering
                inputs = text_classify(text_list, word_label, feature_words)
                clustering_topic = cluster_evaluation(inputs)
                print "==============================================================="
                print "==============================================================="
                sorted_dict = sorted(clustering_topic.items(), key=lambda x:x[1], reverse=True)
                topic_list = []
                if sorted_dict:
                    for item in sorted_dict:
                        topic_list.append(word_label[item[0]])
            print "topic_list, ", topic_list

    #if not topic_list:
    #    warning_status = signal_nothing
    #    tmp_burst_reason = signal_nothing_variation

    results = dict()
    results['origin_weibo_number'] = len(all_origin_list)
    results['retweeted_weibo_number'] = len(all_retweeted_list)
    results['origin_weibo_detail'] = json.dumps(origin_weibo_detail)
    results['retweeted_weibo_detail'] = json.dumps(retweeted_weibo_detail)
    results['retweeted_weibo_count'] = current_retweeted_count
    results['comment_weibo_count'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(filter_important_list)
    results['unfilter_users'] = json.dumps(important_uid_list)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results['clustering_topic'] = json.dumps(topic_list)
    # store the results of the current window in ES
    doctype = create_by + '-' + task_name
    es_user_portrait.index(index=index_sensing_task, doc_type=doctype, id=ts, body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task, doc_type=task_doc_type, id=doctype)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, task_name, warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task, doc_type=task_doc_type, id=doctype, body=temporal_result)

    return "1"
예제 #38
0
def cluster_wine(k):
    table = load_wine_data('redwine.csv')
    labels = clus.kmeans(k, table)
    plot_scatter(table, labels, k)
예제 #39
0
def cluster_wine(k):
  table = load_wine_data('redwine.csv')
  labels = clus.kmeans(k, table)
  plot_scatter(table, labels, k)
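
clus.kmeans(k, table) here appears to return one cluster label per wine sample. A minimal sketch of such a helper, assuming a scikit-learn backend:

# Hypothetical sketch of a clus.kmeans(k, table) helper that returns one label per row.
from sklearn.cluster import KMeans

def kmeans(k, table):
    """table: 2-D array-like of numeric wine features; returns an integer cluster label per row."""
    return KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(table)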
예제 #40
0
def sensors_keywords_detection(task_detail):
    task_name = task_detail[0]
    social_sensors = task_detail[1]
    keywords_list = task_detail[2]
    sensitive_words = task_detail[3]
    stop_time = task_detail[4]
    forward_warning_status = task_detail[5]
    ts = task_detail[7]

    forward_result = get_forward_numerical_info(task_name, ts, keywords_list)
    # 1. aggregate all keyword-related original weibo posted by the sensor users in the previous 12 hours
    forward_origin_weibo_list = query_mid_list(ts - time_interval,
                                               keywords_list,
                                               forward_time_range, 1,
                                               social_sensors)
    # 2. aggregate original weibo in the current window
    current_mid_list = query_mid_list(ts, keywords_list, time_interval, 1,
                                      social_sensors)
    all_mid_list = []
    all_mid_list.extend(current_mid_list)
    all_mid_list.extend(forward_origin_weibo_list)
    all_mid_list = list(set(all_mid_list))
    print len(all_mid_list)
    # 3. query retweet and comment counts (within the current window) for the current and previous-12-hour original weibo, aggregated by message_type
    statistics_count = query_related_weibo(ts, all_mid_list, time_interval,
                                           keywords_list, 1, social_sensors)
    current_total_count = statistics_count['total_count']
    # total number of weibo in the current window
    print "current all weibo: ", statistics_count
    current_origin_count = statistics_count['origin']
    current_retweeted_count = statistics_count['retweeted']
    current_comment_count = statistics_count['comment']

    # 4. aggregate the distribution of positive, neutral, sad and angry sentiment in the current window
    sentiment_count = {"0": 0, "1": 0, "2": 0, "3": 0}
    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1
    exist_es = es_text.indices.exists(index_name)
    if exist_es:
        search_results = aggregation_sentiment_related_weibo(
            ts, all_mid_list, time_interval, keywords_list, 1)
        sentiment_count = search_results
        print "sentiment_count: ", sentiment_count
    negetive_count = sentiment_count['2'] + sentiment_count['3']

    # 5. which social sensors took part in the event discussion
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "must": [{
                            "range": {
                                "timestamp": {
                                    "gte": ts - time_interval,
                                    "lt": ts
                                }
                            }
                        }, {
                            "terms": {
                                "uid": social_sensors
                            }
                        }],
                        "should": [{
                            "terms": {
                                "root_mid": all_mid_list
                            }
                        }, {
                            "terms": {
                                "mid": all_mid_list
                            }
                        }]
                    }
                }
            }
        },
        "size": 10000
    }

    datetime = ts2datetime(ts)
    datetime_1 = ts2datetime(ts - time_interval)
    if datetime == datetime_1:
        index_name = flow_text_index_name_pre + datetime
    else:
        index_name = flow_text_index_name_pre + datetime_1

    search_results = es_text.search(index=index_name,
                                    doc_type=flow_text_index_type,
                                    body=query_body)['hits']['hits']
    attend_users = []
    if search_results:
        for item in search_results:
            attend_users.append(item['_source']['uid'])

    important_users = list(set(attend_users))
    print "important users", important_users

    # 6. sensitive-word detection: if a sensitive word appears in a sensor's weibo, raise an alert. Note: sensitive words are a risky setting.
    sensitive_origin_weibo_number = 0
    sensitive_retweeted_weibo_number = 0
    sensitive_comment_weibo_number = 0
    sensitive_total_weibo_number = 0

    if sensitive_words:
        query_sensitive_body = {
            "query": {
                "filtered": {
                    "filter": {
                        "bool": {
                            "must": [{
                                "range": {
                                    "timestamp": {
                                        "gte": ts - time_interval,
                                        "lt": ts
                                    }
                                }
                            }, {
                                "terms": {
                                    "keywords_string": sensitive_words
                                }
                            }, {
                                "terms": {
                                    "uid": social_sensors
                                }
                            }]
                        }
                    }
                }
            },
            "aggs": {
                "all_list": {
                    "terms": {
                        "field": "message_type"
                    }
                }
            }
        }

        sensitive_results = es_text.search(
            index=index_name,
            doc_type=flow_text_index_type,
            body=query_sensitive_body)['aggregations']['all_list']["buckets"]
        if sensitive_results:
            for item in sensitive_results:
                if int(item["key"]) == 1:
                    sensitive_origin_weibo_number = item['doc_count']
                elif int(item["key"]) == 2:
                    sensitive_comment_weibo_number = item['doc_count']
                elif int(item["key"]) == 3:
                    sensitive_retweeted_weibo_number = item["doc_count"]
                else:
                    pass

            sensitive_total_weibo_number = sensitive_origin_weibo_number + sensitive_comment_weibo_number + sensitive_retweeted_weibo_number

    burst_reason = signal_nothing_variation
    warning_status = signal_nothing
    finish = unfinish_signal  # "0"
    process_status = "1"

    if sensitive_total_weibo_number:  # anomalous number of sensitive weibo
        print "======================"
        if forward_warning_status == signal_brust:  # an event is already in progress, switch to tracking
            warning_status = signal_track
        else:
            warning_status = signal_brust
        burst_reason = signal_sensitive_variation

    if forward_result[0]:
        # use the moving average to decide whether an event has occurred
        mean_count = forward_result[1]
        std_count = forward_result[2]
        mean_sentiment = forward_result[3]
        std_sentiment = forward_result[4]
        if current_total_count > mean_count + 1.96 * std_count:  # anomaly detected
            print "====================================================="
            if forward_warning_status == signal_brust:  # an event is already in progress, switch to tracking
                warning_status = signal_track
            else:
                warning_status = signal_brust
            burst_reason += signal_count_varition  # count anomaly
        if negetive_count > mean_sentiment + 1.96 * std_sentiment:
            warning_status = signal_brust
            burst_reason += signal_sentiment_varition  # negative-sentiment anomaly; "12" means both anomalies occurred
            if forward_warning_status == signal_brust:  # an event is already in progress, switch to tracking
                warning_status = signal_track

    if int(stop_time) <= ts:  # check whether the task has finished
        finish = finish_signal
        process_status = '0'

    tmp_burst_reason = burst_reason
    topic_list = []
    # 7. sensed events, based on all_mid_list
    if burst_reason:  # something has happened
        text_list = []
        mid_set = set()
        if signal_sensitive_variation in burst_reason:
            query_sensitive_body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "bool": {
                                "must": [{
                                    "range": {
                                        "timestamp": {
                                            "gte": ts - time_interval,
                                            "lt": ts
                                        }
                                    }
                                }, {
                                    "terms": {
                                        "keywords_string": sensitive_words
                                    }
                                }]
                            }
                        }
                    }
                },
                "size": 10000
            }
            if social_sensors:
                query_sensitive_body['query']['filtered']['filter']['bool'][
                    'must'].append({"terms": {
                        "uid": social_sensors
                    }})

            sensitive_results = es_text.search(
                index=index_name,
                doc_type=flow_text_index_type,
                body=query_sensitive_body)['hits']["hits"]
            if sensitive_results:
                for item in sensitive_results:
                    iter_mid = item['_source']['mid']
                    iter_text = item['_source']['text']
                    temp_dict = dict()
                    temp_dict["mid"] = iter_mid
                    temp_dict["text"] = iter_text
                    if iter_mid not in mid_set:
                        text_list.append(temp_dict)  # cleaned text entries: mid, text
                        mid_set.add(iter_mid)
            # str.replace returns a new string, so assign the result back
            burst_reason = burst_reason.replace(signal_sensitive_variation, "")

        if burst_reason and all_mid_list:
            sensing_text = es_text.mget(index=index_name,
                                        doc_type=flow_text_index_type,
                                        body={"ids": all_mid_list},
                                        fields=["mid", "text"])["docs"]
            if sensing_text:
                for item in sensing_text:
                    if item['found']:
                        iter_mid = item["fields"]["mid"][0]
                        iter_text = item["fields"]["text"][0]
                        temp_dict = dict()
                        temp_dict["mid"] = iter_mid
                        temp_dict["text"] = iter_text
                        if iter_mid not in mid_set:
                            text_list.append(temp_dict)
                            mid_set.add(iter_mid)

        if len(text_list) == 1:
            top_word = freq_word(text_list[0])
            topic_list = [top_word.keys()]
        elif len(text_list) == 0:
            topic_list = []
            tmp_burst_reason = ""  #没有相关微博,归零
            print "***********************************"
        else:
            feature_words, input_word_dict = tfidf(text_list)  # build feature words and input data
            word_label, evaluation_results = kmeans(feature_words,
                                                    text_list)  # clustering
            inputs = text_classify(text_list, word_label, feature_words)
            clustering_topic = cluster_evaluation(inputs)
            sorted_dict = sorted(clustering_topic.items(),
                                 key=lambda x: x[1],
                                 reverse=True)[0:5]
            topic_list = []
            if sorted_dict:
                for item in sorted_dict:
                    topic_list.append(word_label[item[0]])
        print "topic_list:", topic_list

    if not topic_list:
        tmp_burst_reason = signal_nothing_variation
        warning_status = signal_nothing

    results = dict()
    results['sensitive_origin_weibo_number'] = sensitive_origin_weibo_number
    results[
        'sensitive_retweeted_weibo_number'] = sensitive_retweeted_weibo_number
    results['sensitive_comment_weibo_number'] = sensitive_comment_weibo_number
    results['sensitive_weibo_total_number'] = sensitive_total_weibo_number
    results['origin_weibo_number'] = current_origin_count
    results['retweeted_weibo_number'] = current_retweeted_count
    results['comment_weibo_number'] = current_comment_count
    results['weibo_total_number'] = current_total_count
    results['sentiment_distribution'] = json.dumps(sentiment_count)
    results['important_users'] = json.dumps(important_users)
    results['burst_reason'] = tmp_burst_reason
    results['timestamp'] = ts
    if tmp_burst_reason:
        results["clustering_topic"] = json.dumps(topic_list[:5])

    # store the results of the current window in ES
    doctype = task_name
    es_user_portrait.index(index=index_sensing_task,
                           doc_type=doctype,
                           id=ts,
                           body=results)

    # update the manage-social-sensing record in ES
    temporal_result = es_user_portrait.get(index=index_manage_social_task,
                                           doc_type=task_doc_type,
                                           id=task_name)['_source']
    temporal_result['warning_status'] = warning_status
    temporal_result['burst_reason'] = tmp_burst_reason
    temporal_result['finish'] = finish
    temporal_result['processing_status'] = process_status
    history_status = json.loads(temporal_result['history_status'])
    history_status.append([ts, ' '.join(keywords_list), warning_status])
    temporal_result['history_status'] = json.dumps(history_status)
    es_user_portrait.index(index=index_manage_social_task,
                           doc_type=task_doc_type,
                           id=task_name,
                           body=temporal_result)

    return "1"
    values = l.split()
    at = atom()
    at.PDBIndex = values[1]
    at.atomname = values[2]
    at.residue = values[3]
    at.chain    = values[4]
    at.resid    = values[5]
    at.coordinates = point(float(values[6]), float(values[7]), float(values[8]) )
    at.bfactor = float(values[10])* weight_be
    at.atomtype = values[11]
    at.element = 'H'
    coords = [at.coordinates.x, at.coordinates.y, at.coordinates.z, at.bfactor]
    p = clustering.Point(coords , reference=at)
    points.append(p)

clusters = clustering.kmeans(points, number_of_clusters, cutoff)
for i in range(len(clusters)):
    if ( not os.path.exists(str(i+1))  ): os.mkdir(str(i+1))
    pdb_cluster = PDB()
    tot_be = 0.0
    for p in clusters[i].points:
        p.reference.bfactor = p.reference.bfactor / weight_be
        pdb_cluster.AddNewAtom(p.reference)
        tot_be += p.reference.bfactor
        (box_num, cluster_id) = (ord(p.reference.chain) - ord('A') +1, p.reference.resid)
        target = "../../%s/cluster-%04d.pdbqt" % (box_num, int(cluster_id) )
        link_name = "./cluster-%04d-box%s.pdbqt" % (int(cluster_id), box_num)  
        os.chdir(str(i+1))
        os.system("ln -sf %s %s" % (target, link_name) )
        os.chdir("..")
    pdb_cluster.UserNotes.append("STATISTICS")