Esempio n. 1
0
    def getDtimeStatistics(self, cluster, db=None):
        """Print per-cluster detection-time counts and lifetime in days.

        For every ``(cluster_id, members)`` pair in ``cluster`` the member
        vertices are fetched from ``self.graph``. Only records whose
        'cluster_id' equals the pair's id and that carry a 'dtime' entry
        are counted. Two mappings are printed; nothing is returned:

        * ``dtimeDict``: ``{cluster_id: {to_time(dtime): occurrences}}``
        * ``dtimeRangeDict``: ``{cluster_id: {'alive': days}}`` where
          ``days`` is ``(latest - earliest).days + 1``. Clusters without
          any 'dtime' value get no entry here.

        Args:
            cluster: iterable of ``(cluster_id, members)`` pairs. It is
                consumed in a single pass, so a one-shot iterator works.
            db: unused; kept so existing callers stay valid.
        """
        dtimeDict = dict()
        dtimeRangeDict = dict()

        for cluster_id, members in cluster:
            counts = dict()
            for member in to_list(members):
                vertex = self.graph.getVertex(member).get()
                if vertex['cluster_id'] != cluster_id:
                    continue
                # Records without a detection time cannot be counted;
                # skipping them avoids a KeyError on vertex['dtime'].
                if 'dtime' not in vertex.keys():
                    continue
                when = to_time(vertex['dtime'])
                counts[when] = counts.get(when, 0) + 1

            dtimeDict[cluster_id] = counts
            if counts:
                # The keys of ``counts`` are the distinct detection times.
                dtimeRangeDict[cluster_id] = {
                    'alive': (max(counts) - min(counts)).days + 1
                }

        print('dtimeDict >> ', dtimeDict)
        print('dtimeRangeDict >> ', dtimeRangeDict)
Esempio n. 2
0
    def getClusterIdByVid(self, vid):
        """Return the id of the cluster whose member list contains ``vid``.

        Every ``(cluster_id, members)`` pair of ``self.cluster`` is
        examined. When several clusters contain ``vid`` the last one wins;
        when none does, the empty string is returned.
        """
        matches = [
            cid for cid, members in self.cluster if vid in to_list(members)
        ]
        return matches[-1] if matches else ''
Esempio n. 3
0
    def getDetectionReasonCluster(self, cluster):
        """Print the distinct detection reasons found in each cluster.

        For every ``(cluster_id, members)`` pair in ``cluster`` the member
        vertices are fetched from ``self.graph``; only records whose
        'cluster_id' equals the pair's id contribute their
        'detection_reason'. The resulting ``{cluster_id: set(reasons)}``
        mapping is printed; nothing is returned.
        """
        reasons_by_cluster = {}

        for cid, member_ids in cluster:
            # Fetch every member record up front, then keep only the ones
            # tagged with this cluster's id.
            records = [
                self.graph.getVertex(m).get() for m in to_list(member_ids)
            ]
            tagged = [rec for rec in records if rec['cluster_id'] == cid]
            reasons_by_cluster[cid] = {
                rec['detection_reason'] for rec in tagged
            }

        print(reasons_by_cluster)
Esempio n. 4
0
 def setClusterId(self):
     """Tag each member vertex with the id of the cluster listing it.

     Walks the ``(cluster_id, members)`` pairs of ``self.cluster``, looks
     every member up through ``self.graph.getVertex`` and calls
     ``setClusterId(cluster_id)`` on the returned vertex object.
     """
     for cid, member_ids in self.cluster:
         # Resolve the whole member list before tagging any vertex.
         vertices = [self.graph.getVertex(m) for m in to_list(member_ids)]
         for v in vertices:
             v.setClusterId(cid)
Esempio n. 5
0
    def insertClusterTfIdf(self, cluster=None, qt=None, qr=None):
        """Store the top TF-IDF keywords of every cluster through ``qt``.

        Each cluster is flattened into one text document made of the
        pre-processed, non-None property values of its member vertices.
        A CountVectorizer/TfidfTransformer pair is fitted on all documents
        and, per document, the five highest-scoring terms (taken from the
        top 100 returned by ``extract_topn_from_vector``) are written with
        the 'creation.table.insert_cluster_ch' query as a JSON object.

        Args:
            cluster: iterable of ``(cluster_id, members)`` pairs; defaults
                to ``self.cluster`` when None.
            qt: object whose ``doQuery(query, params)`` performs the insert.
            qr: object whose ``getQueryString(name)`` returns the query.
                Although both default to None, ``qt`` and ``qr`` are used
                unconditionally when results are written, so callers must
                supply them.
        """
        import json
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer

        if cluster is None:
            cluster = self.cluster

        # ``unicode`` only exists on Python 2, where such values are encoded
        # to UTF-8 byte strings before pre-processing. On Python 3 the name
        # is undefined and text values are already ``str``.
        try:
            unicode_type = unicode  # noqa: F821
        except NameError:
            unicode_type = None

        # Cluster feature extraction: every property value of every member
        # becomes part of a single line of text per cluster.
        docs = list()
        for _, members in cluster:
            pieces = list()
            for member in to_list(members):
                vertex = self.graph.getVertex(member).get()
                for key in vertex.keys():
                    raw = vertex[key]
                    if raw is None:
                        continue
                    if unicode_type is not None and isinstance(
                            raw, unicode_type):
                        value = raw.encode('utf-8')
                    else:
                        value = str(raw)
                    pieces.append(' ' + pre_process(value))
            docs.append(''.join(pieces))

        # Count vectorizer. The inline (?u) flag appears only once, at the
        # very start of the pattern: it is a global flag, and Python 3.11+
        # rejects global flags placed anywhere else in the expression.
        # NOTE(review): the '.' between the digit groups is unescaped and
        # matches any character -- confirm whether '\.' was intended.
        cv = CountVectorizer(
            max_df=0.85,
            stop_words='english',
            analyzer='word',
            token_pattern=
            r'(?u)\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}\:\d*|\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}|\b\w\w+\b'
        )

        # Build the vocabulary over all cluster documents.
        word_count_vector = cv.fit_transform(docs)

        # Fit the IDF weights on the term counts.
        tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
        tfidf_transformer.fit(word_count_vector)

        # scikit-learn 1.2 removed get_feature_names(); prefer the newer
        # accessor and fall back for older releases.
        if hasattr(cv, 'get_feature_names_out'):
            feature_names = list(cv.get_feature_names_out())
        else:
            feature_names = cv.get_feature_names()

        # Write the characteristic keywords of each cluster document.
        for idx, doc in enumerate(docs, 1):
            tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))

            sorted_items = sort_coo(tf_idf_vector.tocoo())
            keywords = extract_topn_from_vector(feature_names, sorted_items,
                                                100)

            # Keep the five best-scoring terms.
            top_terms = sorted(keywords,
                               key=lambda k: keywords[k],
                               reverse=True)[:5]
            param_keywords = {k: keywords[k] for k in top_terms}

            # NOTE(review): the stored 'cluster_id' is the 1-based position
            # of the document, not the cluster_id taken from ``cluster`` --
            # confirm this matches the ids used elsewhere.
            qt.doQuery(
                qr.getQueryString('creation.table.insert_cluster_ch'), {
                    'cluster_id': str(idx),
                    'characteristic': json.dumps(param_keywords)
                })
Esempio n. 6
0
 def getMaxMemberCluster(self, cluster):
     """Return the ``(cluster_id, members)`` pair with the most members.

     Args:
         cluster: iterable of ``(cluster_id, members)`` pairs; each
             ``members`` value is converted with ``to_list``.

     Returns:
         A ``(cluster_id, member_list)`` tuple. When several clusters
         share the largest size, the first one encountered is returned.

     Raises:
         TypeError: if ``cluster`` yields no pairs.
     """
     candidates = [(cluster_id, to_list(members))
                   for cluster_id, members in cluster]
     if not candidates:
         raise TypeError(
             'getMaxMemberCluster() of empty cluster sequence')
     # Measure the member list (pair[1]): len(pair) itself is always 2
     # for a (cluster_id, members) tuple and cannot rank clusters.
     return max(candidates, key=lambda pair: len(pair[1]))