def getDtimeStatistics(self, cluster, db=None):
    """Print per-cluster detection-time ('dtime') statistics.

    For every ``(cluster_id, members)`` pair in *cluster* the member
    vertices are fetched from ``self.graph``; only vertices whose
    ``'cluster_id'`` field equals the pair's id are taken into account.
    Two dictionaries are built and printed:

    * ``dtimeDict``: cluster id -> {converted dtime: number of vertices
      carrying that dtime}
    * ``dtimeRangeDict``: cluster id -> {'alive': number of days between
      the earliest and the latest dtime, both ends counted}

    Vertices without a ``'dtime'`` field are ignored.  A cluster in which
    no vertex has a dtime gets an empty counter and ``'alive'`` set to 0
    instead of failing on ``max()`` of an empty collection.

    Args:
        cluster: iterable of ``(cluster_id, members)`` pairs; *members* is
            normalised with ``to_list``.
        db: not used by this method; kept so existing callers still work.

    Returns:
        None.  The statistics are only printed.
    """
    dtimeDict = dict()
    dtimeRangeDict = dict()
    for cluster_id, members in cluster:
        vertices = [
            self.graph.getVertex(member).get() for member in to_list(members)
        ]

        # Count occurrences of each dtime in one pass.  The guard on
        # 'dtime' matters: vertices lacking the field must be skipped
        # here as well, otherwise the lookup raises KeyError.
        dtimeCntDict = dict()
        for vertex in vertices:
            if (vertex['cluster_id'] == cluster_id
                    and 'dtime' in vertex.keys()):
                dtime = to_time(vertex['dtime'])
                dtimeCntDict[dtime] = dtimeCntDict.get(dtime, 0) + 1
        dtimeDict[cluster_id] = dtimeCntDict

        if dtimeCntDict:
            # assumes to_time() yields values whose difference exposes
            # `.days` (e.g. datetime objects) -- TODO confirm
            alive = (max(dtimeCntDict) - min(dtimeCntDict)).days + 1
        else:
            alive = 0
        dtimeRangeDict[cluster_id] = {'alive': alive}

    print('dtimeDict >> ', dtimeDict)
    print('dtimeRangeDict >> ', dtimeRangeDict)
def getClusterIdByVid(self, vid):
    """Return the id of the cluster whose member list contains *vid*.

    Every ``(key, value)`` pair of ``self.cluster`` is inspected, with the
    value normalised through ``to_list``.  When several clusters contain
    *vid* the last one wins; when none does, an empty string is returned.
    """
    matching_ids = [
        cid for cid, raw_members in self.cluster
        if vid in to_list(raw_members)
    ]
    return matching_ids[-1] if matching_ids else ''
def getDetectionReasonCluster(self, cluster):
    """Print the distinct ``'detection_reason'`` values of each cluster.

    For every ``(cluster_id, members)`` pair in *cluster* the member
    vertices are fetched from ``self.graph``; those whose ``'cluster_id'``
    field equals the pair's id contribute their ``'detection_reason'`` to
    a set.  The resulting mapping (cluster id -> set of reasons) is
    printed; nothing is returned.
    """
    reasons_by_cluster = {}
    for cid, raw_members in cluster:
        fetched = [
            self.graph.getVertex(member).get()
            for member in to_list(raw_members)
        ]
        # Select first, collect afterwards, so all vertices are filtered
        # before any 'detection_reason' is read.
        own_vertices = [v for v in fetched if v['cluster_id'] == cid]
        reasons_by_cluster[cid] = set(
            [v['detection_reason'] for v in own_vertices])
    print(reasons_by_cluster)
def setClusterId(self):
    """Assign each cluster's id to all of its member vertices.

    Walks the ``(cluster_id, members)`` pairs of ``self.cluster``, looks
    every member up in ``self.graph`` and calls ``setClusterId`` on the
    returned vertex object.
    """
    for cid, raw_members in self.cluster:
        # Resolve all vertices of the cluster before touching any of them.
        resolved = []
        for member in to_list(raw_members):
            resolved.append(self.graph.getVertex(member))
        for vertex_obj in resolved:
            vertex_obj.setClusterId(cid)
def insertClusterTfIdf(self, cluster=None, qt=None, qr=None):
    """Store the top TF-IDF keywords of every cluster.

    Each cluster is flattened into one text document made of the
    pre-processed property values of its member vertices.  A
    ``CountVectorizer`` / ``TfidfTransformer`` pair is fitted on all
    documents, and for each document the five highest-scoring keywords
    are written as a JSON object through ``qt.doQuery`` using the query
    named ``'creation.table.insert_cluster_ch'``.

    Args:
        cluster: iterable of ``(cluster_id, members)`` pairs; defaults to
            ``self.cluster``.
        qt: object providing ``doQuery(query, params)``.  Required in
            practice even though it defaults to None.
        qr: object providing ``getQueryString(name)``.  Required in
            practice even though it defaults to None.

    Raises:
        AttributeError: when *qt* or *qr* is left as None.
        ValueError: presumably raised by scikit-learn when no vocabulary
            can be built (e.g. *cluster* is empty) -- confirm against the
            installed version.
    """
    import json
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfTransformer

    if cluster is None:
        cluster = self.cluster

    # Cluster feature extraction: all property values of a cluster are
    # joined into a single line of text.
    docs = list()
    for _, members in cluster:
        words = list()
        for vertex in [
                self.graph.getVertex(member).get()
                for member in to_list(members)
        ]:
            for key in vertex.keys():
                if vertex[key] is None:
                    continue
                # `unicode` is the Python 2 builtin: such values are
                # encoded to UTF-8 bytes, everything else goes through
                # str().
                if isinstance(vertex[key], unicode):
                    value = vertex[key].encode('utf-8')
                else:
                    value = str(vertex[key])
                words.append(pre_process(value))
        docs.append(' '.join(words).strip())

    # Count vectorizer.  Stop words are the built-in English list only.
    # NOTE(review): the dots in the IP-address alternatives are not
    # escaped, so they match any character, and the second inline
    # `(?u)` flag is not at the start of the pattern -- confirm this is
    # intended before changing it.
    cv = CountVectorizer(
        max_df=0.85,
        stop_words='english',
        analyzer='word',
        token_pattern=
        r'(?u)\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}\:\d*|(?u)\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}|\b\w\w+\b'
    )
    # Build the vocabulary over all cluster documents.
    word_count_vector = cv.fit_transform(docs)

    # Fit the IDF weights on the term counts.
    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    feature_names = cv.get_feature_names()

    # Emit the characteristic keywords cluster by cluster.
    # NOTE(review): the stored 'cluster_id' is the 1-based position of
    # the cluster in `cluster`, not the cluster's own id -- confirm that
    # the two always coincide.
    for idx, doc in enumerate(docs, 1):
        tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        keywords = extract_topn_from_vector(feature_names, sorted_items, 100)
        sorted_keywords = sorted(
            keywords, key=lambda k: keywords[k], reverse=True)

        # Keep only the five best-scoring keywords.
        param_keywords = dict((k, keywords[k]) for k in sorted_keywords[:5])
        qt.doQuery(
            qr.getQueryString('creation.table.insert_cluster_ch'), {
                'cluster_id': str(idx),
                'characteristic': json.dumps(param_keywords)
            })
def getMaxMemberCluster(self, cluster):
    """Return the cluster that has the most members.

    Args:
        cluster: iterable of ``(cluster_id, members)`` pairs; *members* is
            normalised with ``to_list``.

    Returns:
        The ``(cluster_id, member_list)`` tuple with the longest member
        list (the earliest one on ties), or None when *cluster* is empty.
    """
    candidates = [(cluster_id, to_list(members))
                  for cluster_id, members in cluster]
    if not candidates:
        return None
    # Compare the member lists themselves: the length of the
    # (cluster_id, members) tuple is always 2 and says nothing about the
    # cluster size.
    return max(candidates, key=lambda pair: len(pair[1]))