def return_data(union_index_list, corpus, min_sample):
    """Build one ClusterObj per sufficiently large cluster of corpus items.

    Args:
        union_index_list: iterable of clusters; each cluster is an iterable
            of indices into ``corpus``. A cluster's position in this
            iterable is used as its ``clusterId``.
        corpus: indexable collection whose items expose ``messageId``,
            ``messageTitle``, ``messagePublishtime`` and ``site_name``.
        min_sample: minimum number of members a cluster must have to be
            included in the result.

    Returns:
        A list of ClusterObj, one for every non-empty cluster that has at
        least ``min_sample`` members. Empty clusters are always skipped
        because they have neither a topic nor a publish-time range.
    """
    cluster_result = []
    for cluster_id, cluster_index in enumerate(union_index_list):
        member_ids = []
        title_counts = {}
        site_names = set()
        publish_times = []
        for index in cluster_index:
            message = corpus[index]
            title = message.messageTitle
            member_ids.append(str(message.messageId))
            title_counts[title] = title_counts.get(title, 0) + 1
            site_names.add(message.site_name)
            publish_times.append(message.messagePublishtime)

        member_count = len(member_ids)
        # Filter before deriving topic/time range: an empty cluster has no
        # title or publish time to pick from.
        if not member_ids or member_count < min_sample:
            continue

        # Topic is the most frequent title; on ties the first title in the
        # dict's iteration order wins.
        cluster_topic = max(title_counts, key=title_counts.get)

        cluster_result.append(ClusterObj(
            clusterId=cluster_id,
            clusterTopic=cluster_topic,
            clusterPublishBeginTime=min(publish_times),
            clusterPublishEndTime=max(publish_times),
            clusterMember="^A".join(member_ids),
            cluserMemeberCount=member_count,
            siteCount=len(site_names)))
    return cluster_result
def return_data(cluster_ids_list, first_cluster_results, min_sample):
    """Merge first-pass clusters into final clusters and wrap them as ClusterObj.

    Args:
        cluster_ids_list: iterable of merged clusters; each entry is an
            iterable of keys into ``first_cluster_results``. An entry's
            position is used as its ``clusterId``.
        first_cluster_results: mapping from first-pass cluster key to a dict
            with the keys ``"ids"``, ``"cluster_topic"``, ``"site_names"``
            (a set) and ``"publish_times"``.
        min_sample: minimum number of member ids a merged cluster must have
            to be included in the result.

    Returns:
        A list of ClusterObj, one for every non-empty merged cluster that
        has at least ``min_sample`` members. Empty merged clusters are
        always skipped because they have no publish-time range.
    """
    cObjArray = []
    for idx, cluster_ids in enumerate(cluster_ids_list):
        cluster_topic = ''
        largest_size = 0
        publishtimes = []
        ids = []
        sitenames = set()
        for cluster_id in cluster_ids:
            first_pass = first_cluster_results[cluster_id]
            # Materialise as a list so len() works on Python 3 as well.
            part_ids = [str(member_id) for member_id in first_pass["ids"]]
            part_topic = first_pass["cluster_topic"]
            part_sitenames = first_pass["site_names"]
            part_publishtimes = first_pass["publish_times"]

            # The merged topic is taken from the largest first-pass cluster
            # (the earliest one wins on ties).
            if len(part_ids) > largest_size:
                largest_size = len(part_ids)
                cluster_topic = part_topic

            publishtimes.extend(part_publishtimes)
            ids.extend(part_ids)
            sitenames |= part_sitenames

        # Filter before touching publishtimes[0]: an empty merged cluster
        # has no time range to report.
        if not ids or len(ids) < min_sample:
            continue

        publishtimes.sort()  # ascending, so first/last are begin/end
        cObjArray.append(ClusterObj(
            clusterId=idx,
            clusterTopic=cluster_topic,
            clusterPublishBeginTime=publishtimes[0],
            clusterPublishEndTime=publishtimes[-1],
            clusterMember="^A".join(ids),
            cluserMemeberCount=len(ids),
            siteCount=len(sitenames)))
    return cObjArray
def get_result_corpus(corpus, is_manual, cluster_type, manual_id, subtopic_id, language_type, save_group_id):
    """Wrap up to 10 messages from ``corpus`` as single-member clusters.

    Intended as the fallback for when first-pass clustering produced no
    results: each selected message becomes its own cluster.

    Args:
        corpus: sequence of message objects exposing ``messageTitle``,
            ``messagePublishtime`` and ``messageId``.
        is_manual, cluster_type, manual_id, subtopic_id, language_type,
        save_group_id: passed through unchanged to every ClusterObj.

    Returns:
        The list of ClusterObj after it has been handed to ``sort_bywords``
        (whose return value is ignored; presumably it orders the list in
        place -- confirm against its definition).
    """
    # Sample only when there are more than 10 messages; otherwise use all.
    chosen = random.sample(corpus, 10) if len(corpus) > 10 else corpus

    results = []
    for position, message in enumerate(chosen):
        # A fresh result id is requested before the message is inspected.
        result_id = get_cluster_result_id()
        topic = message.messageTitle
        # A single message: begin and end of the time range coincide.
        begin_time = message.messagePublishtime
        end_time = message.messagePublishtime
        member = str(message.messageId)
        results.append(ClusterObj(
            result_id, position, topic, begin_time, end_time, member,
            1,  # member count
            1,  # site count
            cluster_type, language_type, save_group_id,
            is_manual, manual_id, subtopic_id))

    # Additional ordering step applied to the finished list.
    sort_bywords(results)
    return results
def get_result_corpus(corpus):
    """Wrap up to 10 messages from ``corpus`` as single-member clusters.

    Intended as the fallback for when first-pass clustering produced no
    results: each selected message becomes its own cluster.

    Args:
        corpus: sequence of message objects exposing ``messageTitle``,
            ``messagePublishtime`` and ``messageId``.

    Returns:
        A list of ClusterObj, one per selected message. If ``corpus`` has
        more than 10 items, 10 are drawn with ``random.sample``; otherwise
        every item is used, in its original order.
    """
    # Sample only when there are more than 10 messages; otherwise use all.
    chosen = random.sample(corpus, 10) if len(corpus) > 10 else corpus

    # One cluster per message: the title is the topic, the publish time is
    # both ends of the time range, and the member/site counts are 1.
    return [
        ClusterObj(
            clusterId=position,
            clusterTopic=message.messageTitle,
            clusterPublishBeginTime=message.messagePublishtime,
            clusterPublishEndTime=message.messagePublishtime,
            clusterMember=str(message.messageId),
            cluserMemeberCount=1,
            siteCount=1)
        for position, message in enumerate(chosen)
    ]
def return_data(cluster_ids_list, first_cluster_results, is_manual, cluster_type, manual_id, subtopic_id,
                language_type, save_group_id):
    """Merge first-pass clusters into final clusters and wrap them as ClusterObj.

    Args:
        cluster_ids_list: iterable of merged clusters; each entry is an
            iterable of keys into ``first_cluster_results``. An entry's
            position is used as its ``clusterId``.
        first_cluster_results: mapping from first-pass cluster key to a dict
            with the keys ``"ids"``, ``"cluster_topic"``, ``"site_names"``
            (a set) and ``"publish_times"``.
        is_manual, cluster_type, manual_id, subtopic_id, language_type,
        save_group_id: passed through unchanged to every ClusterObj.

    Returns:
        The list of ClusterObj after it has been handed to ``sort_bywords``
        (whose return value is ignored; presumably it orders the list in
        place -- confirm against its definition). Merged clusters without
        any member are skipped because they have no publish-time range.
    """
    cObjArray = []
    for idx, cluster_ids in enumerate(cluster_ids_list):
        cluster_topic = ''
        largest_size = 0
        publishtimes = []
        ids = []
        sitenames = set()
        for cluster_id in cluster_ids:
            first_pass = first_cluster_results[cluster_id]
            # Materialise as a list so len() works on Python 3 as well.
            part_ids = [str(member_id) for member_id in first_pass["ids"]]
            part_topic = first_pass["cluster_topic"]
            part_sitenames = first_pass["site_names"]
            part_publishtimes = first_pass["publish_times"]

            # The merged topic is taken from the largest first-pass cluster
            # (the earliest one wins on ties).
            if len(part_ids) > largest_size:
                largest_size = len(part_ids)
                cluster_topic = part_topic

            publishtimes.extend(part_publishtimes)
            ids.extend(part_ids)
            sitenames |= part_sitenames

        # Skip before requesting a result id or reading publishtimes[0]:
        # a cluster without members has no time range to report.
        if not ids:
            continue

        result_id = get_cluster_result_id()
        publishtimes.sort()  # ascending, so first/last are begin/end
        cObjArray.append(ClusterObj(
            result_id, idx, cluster_topic,
            publishtimes[0], publishtimes[-1],
            "^A".join(ids), len(ids), len(sitenames),
            cluster_type, language_type, save_group_id,
            is_manual, manual_id, subtopic_id))

    # Additional ordering step applied to the finished list.
    sort_bywords(cObjArray)
    return cObjArray
def getReturnData(label, messageObjList):
    """Group messages by cluster label and build one ClusterObj per label.

    Args:
        label: iterable of cluster labels, aligned element-by-element with
            ``messageObjList``. Entries labelled ``-1`` are discarded.
        messageObjList: iterable of message objects exposing ``messageId``,
            ``messageTitle``, ``messagePublishtime`` and ``site_name``.

    Returns:
        A list of ClusterObj, one per distinct label other than ``-1``.
        For each cluster the topic is its most frequent title, the time
        range spans its earliest to latest publish time, and the members
        are the message ids joined with ``"^A"``. Returns an empty list
        when no message carries a label other than ``-1``.
    """
    groups = {}
    for cluster_label, message in zip(label, messageObjList):
        # Label -1 marks entries that must not be assigned to any cluster.
        if cluster_label == -1:
            continue

        group = groups.get(cluster_label)
        if group is None:
            group = groups[cluster_label] = {
                "id": [],           # member message ids, as strings
                "title": {},        # title -> number of occurrences
                "publishtime": [],  # publish times of all members
                "sitename": set(),  # distinct site names
            }

        title = message.messageTitle
        group["id"].append(str(message.messageId))
        group["title"][title] = group["title"].get(title, 0) + 1
        group["publishtime"].append(message.messagePublishtime)
        group["sitename"].add(message.site_name)

    if not groups:
        return []

    # Local import: only needed once there is at least one cluster to build.
    from entity.ClusterObj import ClusterObj

    cObjArray = []
    for cluster_label, group in groups.items():
        title_counts = group["title"]
        publish_times = group["publishtime"]
        member_ids = group["id"]
        cObjArray.append(ClusterObj(
            clusterId=cluster_label,
            # Most frequent title; on ties the first title in the dict's
            # iteration order wins.
            clusterTopic=max(title_counts, key=title_counts.get),
            clusterPublishBeginTime=min(publish_times),
            clusterPublishEndTime=max(publish_times),
            clusterMember="^A".join(member_ids),
            cluserMemeberCount=len(member_ids),
            siteCount=len(group["sitename"])))
    return cObjArray