def get_behavior_statics(uid_list, topic_dict):
    """Collect per-user activity statistics for the current group.

    For every uid in ``uid_list`` the result maps the uid to a three-item
    list ``[num_topics, num_comments, related_content]``:

    * ``num_topics``: number of topics in ``topic_dict`` created by the user.
    * ``num_comments``: number of the user's comments that belong to
      ``GROUP_ID``, sit on a topic present in ``topic_dict`` and are dated
      inside the training window.
    * ``related_content``: title and content of the user's own topics, plus,
      for every counted comment, the commented topic's title and content
      followed by field 6 of the comment record.

    The comment dump at ``COMMENT_ALL_FILE_PATH`` is read record by record; a
    record ends with a line equal to ``[*ROWEND*]`` and its fields are
    separated by ``[=]``.

    Side effect: when ``COMMENT_FILE_PATH`` does not exist yet it is created
    and receives every accepted record dated inside the training or the test
    window, so the per-group extraction only happens once.

    Args:
        uid_list: iterable of user ids to collect statistics for.
        topic_dict: mapping topic id -> topic dict with at least the keys
            ``'user_id'``, ``'title'`` and ``'content'``.

    Returns:
        dict mapping uid -> ``[num_topics, num_comments, related_content]``.
    """
    print('获取用户的行为信息...')
    # uid -> [num_topics, num_comments, related-content]
    behavior = dict()
    for uid in uid_list:
        behavior[uid] = [0, 0, '']

    # Pass 1: walk all topics and credit each one to its author.
    print('浏览所有的topic信息...')
    for topic_id in topic_dict:
        topic = topic_dict[topic_id]
        uid = topic['user_id']
        if uid in behavior:
            behavior[uid][0] += 1
            behavior[uid][2] += (topic['title'] + ' ' + topic['content'])

    # Pass 2: walk the full comment dump.
    print('浏览所有的留言信息...')
    f = open(COMMENT_ALL_FILE_PATH, 'r')
    fc = None
    try:
        # Only extract this group's comments once: if the per-group file
        # already exists it is left untouched.
        if not os.path.exists(COMMENT_FILE_PATH):
            fc = open(COMMENT_FILE_PATH, 'w')

        row = ''
        for line in f:
            line = line.strip()
            if line != '[*ROWEND*]':
                # Still inside a record: keep accumulating (line breaks inside
                # a record are dropped by the strip above).
                row += line
                continue

            # End of record: take the buffered text and reset the buffer so
            # every exit path below starts the next record from scratch.
            record, row = row, ''
            seg_list = record.split('[=]')
            pubdate = datetime.strptime(seg_list[4], "%Y-%m-%d %H:%M:%S")
            topic_id = seg_list[2]

            # Keep only comments of this group whose topic was loaded.
            if seg_list[1] != GROUP_ID or topic_id not in topic_dict:
                continue

            in_train = is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE)
            if fc is not None and (
                    in_train
                    or is_between(pubdate, TEST_START_DATE, TEST_END_DATE)):
                fc.write(record + '\n[*ROWEND*]\n')

            if in_train:
                uid = seg_list[3]
                if uid in behavior:
                    behavior[uid][1] += 1
                    # A user who commented on a topic is considered to be
                    # interested in it, so the topic's title and content are
                    # added to the user's text together with the comment
                    # itself. The content of a quoted comment is not included.
                    topic = topic_dict[topic_id]
                    behavior[uid][2] += (topic['title'] + ' ' + topic['content']
                                         + ' ' + seg_list[6])
    finally:
        # Release both handles even when a malformed record raises.
        f.close()
        if fc is not None:
            fc.close()

    return behavior
def score_inheritance(val: Optional[Union[int, float]]) -> str:
    """Return the display colour for an inheritance metric value.

    ``None`` maps to ``'white'``. Otherwise the first tier whose range
    contains ``val`` (as decided by ``is_between``) wins: 0-3 is ``'green'``,
    3-6 is ``'yellow'``; anything else is ``'red'``.
    """
    if val is None:
        return 'white'
    # Tiers are probed in order; boundary handling is whatever is_between does.
    for low, high, color in ((0, 3, 'green'), (3, 6, 'yellow')):
        if is_between(val, low, high):
            return color
    return 'red'
def score_c2c(val: Optional[Union[int, float]]) -> str:
    """Return the display colour for a c2c metric value.

    ``None`` maps to ``'white'``. Low values are bad here: 0-0.44 is
    ``'red'``, 0.44-0.55 is ``'yellow'``; anything else is ``'green'``.
    """
    if val is None:
        return 'white'
    # Tiers are probed in order; boundary handling is whatever is_between does.
    for low, high, color in ((0, 0.44, 'red'), (0.44, 0.55, 'yellow')):
        if is_between(val, low, high):
            return color
    return 'green'
def score_cohesion(val: Optional[Union[int, float]]) -> str:
    """Return the display colour for a cohesion metric value.

    ``None`` maps to ``'white'``. 0-33 is ``'green'``, 33-67 is
    ``'yellow'``; anything else is ``'red'``.
    """
    if val is None:
        return 'white'
    # Tiers are probed in order; boundary handling is whatever is_between does.
    for low, high, color in ((0, 33, 'green'), (33, 67, 'yellow')):
        if is_between(val, low, high):
            return color
    return 'red'
def score_nesting(val: Optional[Union[int, float]]) -> str:
    """Return the display colour for a nesting metric value.

    ``None`` maps to ``'white'``. 0-2 is ``'green'``, 2-5 is ``'yellow'``;
    anything else is ``'red'``.
    """
    if val is None:
        return 'white'
    # Tiers are probed in order; boundary handling is whatever is_between does.
    for low, high, color in ((0, 2, 'green'), (2, 5, 'yellow')):
        if is_between(val, low, high):
            return color
    return 'red'
def score_essential(val: Optional[Union[int, float]]) -> str:
    """Return the display colour for an essential-complexity metric value.

    ``None`` maps to ``'white'``. 0-4 is ``'green'``, 4-10 is ``'yellow'``;
    anything else is ``'red'``.
    """
    if val is None:
        return 'white'
    # Tiers are probed in order; boundary handling is whatever is_between does.
    for low, high, color in ((0, 4, 'green'), (4, 10, 'yellow')):
        if is_between(val, low, high):
            return color
    return 'red'
def score_sloc(val: Optional[Union[int, float]]) -> str:
    """Return the display colour for a source-lines-of-code value.

    ``None`` maps to ``'white'``. 0-100 is ``'green'``, 100-200 is
    ``'yellow'``, 200-500 is ``'red'``; anything else is ``'red'`` as well.
    """
    if val is None:
        return 'white'
    # The 200-500 tier is kept as its own entry although it currently yields
    # the same colour as the fallback.
    tiers = (
        (0, 100, 'green'),
        (100, 200, 'yellow'),
        (200, 500, 'red'),
    )
    for low, high, color in tiers:
        if is_between(val, low, high):
            return color
    return 'red'
def score_cyclo(val: Optional[Union[int, float]]) -> str:
    """Return the display colour for a cyclomatic-complexity value.

    ``None`` maps to ``'white'``. 0-4 is ``'green'``, 4-10 and 10-20 are
    ``'yellow'``, 20-50 is ``'red'``; anything else is ``'red'`` as well.
    """
    if val is None:
        return 'white'
    # Adjacent tiers that share a colour are deliberately not merged: whether
    # a boundary value belongs to a tier is decided by is_between, so merging
    # ranges could change which values fall through to the fallback.
    tiers = (
        (0, 4, 'green'),
        (4, 10, 'yellow'),
        (10, 20, 'yellow'),
        (20, 50, 'red'),
    )
    for low, high, color in tiers:
        if is_between(val, low, high):
            return color
    return 'red'
def cyclo_to_bad_fix(val: Optional[Union[int, float]]) -> Optional[int]:
    """Translate a cyclomatic-complexity value into a bad-fix percentage.

    Tiers are probed in order and the first match wins:

    ========  ======
    range     result
    ========  ======
    1-10      5
    10-20     10
    20-30     20
    30-50     30
    50-100    40
    other     60
    ========  ======

    The 40 tier used to be written as the degenerate range 50-50, which no
    value could reach after the 30-50 tier had been tried, so everything
    above 50 jumped straight to 60. It now spans 50-100, in line with the
    commonly quoted McCabe table (">50: 40%, approaching 100: 60%").

    Args:
        val: cyclomatic complexity, or ``None`` when unknown.

    Returns:
        The bad-fix percentage as an int, or ``None`` when ``val`` is ``None``.
    """
    if val is None:
        return None
    tiers = (
        (1, 10, 5),
        (10, 20, 10),
        (20, 30, 20),
        (30, 50, 30),
        (50, 100, 40),
    )
    for low, high, bad_fix in tiers:
        if is_between(val, low, high):
            return bad_fix
    # NOTE(review): values that match no tier at all (presumably also values
    # below 1) end up here and are reported as 60 - confirm this is intended.
    return 60
def get_interested_topic(uid_list, comment_path):
    """Count, per user, the comments posted inside the training window.

    Every line of ``comment_path`` (UTF-8) is one comment record whose fields
    are separated by ``[=]``; field 3 is read as the user id and field 4 as
    the publication time (``%Y-%m-%d %H:%M:%S``).

    Args:
        uid_list: iterable of user ids; each one gets an entry in the result
            even when the user has no comment at all.
        comment_path: path of the comment file to scan.

    Returns:
        dict mapping uid -> ``[num_comments, num_topics]``. Only slot 0 is
        filled in here; slot 1 is initialised to 0 and left untouched.
    """
    # uid -> [num_comments, num_topics]; users without any activity inside
    # the training window simply keep their zeros.
    behavior = dict()
    for uid in uid_list:
        behavior[uid] = [0, 0]

    # The context manager closes the file even when a malformed record raises.
    with codecs.open(comment_path, 'r', 'utf-8') as f:
        for line in f:
            seg_list = line.strip().split('[=]')
            uid = seg_list[3]
            pubdate = datetime.strptime(seg_list[4], "%Y-%m-%d %H:%M:%S")
            if uid in behavior and is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE):
                behavior[uid][0] += 1  # one more comment for this user

    return behavior
def score_bad_fix(_val: Optional[Union[int, float]]) -> str:
    """Return the display colour for the bad-fix value derived from ``_val``.

    ``_val`` is first converted with ``cyclo_to_bad_fix``. A ``None`` result
    maps to ``'white'``. Otherwise 0-10 and 10-20 are ``'green'``, 20-30 and
    30-50 are ``'yellow'``, 50-80 is ``'red'``; anything else is ``'red'``
    as well.
    """
    bad_fix = cyclo_to_bad_fix(_val)
    if bad_fix is None:
        return 'white'
    # Tiers sharing a colour stay separate so that boundary values are
    # classified exactly as is_between decides for each individual range.
    tiers = (
        (0, 10, 'green'),
        (10, 20, 'green'),
        (20, 30, 'yellow'),
        (30, 50, 'yellow'),
        (50, 80, 'red'),
    )
    for low, high, color in tiers:
        if is_between(bad_fix, low, high):
            return color
    return 'red'
def check_angular_position(self, name, bbox):
    """Check that ``bbox`` lies between the angular neighbours of ``name``.

    The neighbours are the entries right before and right after ``name`` in
    ``self.angular_order`` (wrapping around at both ends). Their angular
    positions are looked up from ``self.tmp_track`` and the angle of ``bbox``
    is tested against that span with ``utils.is_between``.

    Raises:
        ValueError: if ``name`` is not part of ``self.angular_order``.
        KeyError: if a neighbour has no entry in the angle lookup.
    """
    bbox_angle = utils.get_angle(bbox, self.cur_img.shape)

    count = len(self.angular_order)
    # index() returns the first occurrence, so searching the doubled order
    # yields the position inside the original order.
    position = (self.angular_order * 2).index(name)
    prev_name = self.angular_order[(position - 1) % count]
    next_name = self.angular_order[(position + 1) % count]

    tracked_angles = utils.get_bbox_dict_ang_pos(self.tmp_track,
                                                 self.cur_img.shape)
    span_start = tracked_angles[prev_name]
    span_end = tracked_angles[next_name]
    return utils.is_between(span_start, span_end, bbox_angle)
def get_bbox_between_id(self, id1, id2, bbox_fd_list):
    """Return the detections lying angularly between two tracked ids.

    The angles of the boxes currently tracked for ``id1`` and ``id2`` define
    a span; every box of ``bbox_fd_list`` whose angle falls inside it
    (according to ``utils.is_between``) is kept.

    Args:
        id1: key into ``self.tmp_track`` marking the start of the span.
        id2: key into ``self.tmp_track`` marking the end of the span.
        bbox_fd_list: candidate boxes to filter.

    Returns:
        ``(bbox_list, angle_list)``: the retained boxes and their angles, both
        ordered by ascending angle and aligned index by index.
    """
    img_shape = self.cur_img.shape
    a1 = utils.get_angle(self.tmp_track[id1][config.BBOX_KEY], img_shape)
    a2 = utils.get_angle(self.tmp_track[id2][config.BBOX_KEY], img_shape)

    # Single pass: each candidate's angle is computed and tested once. The
    # boxes used to be collected twice (a comprehension followed by a loop
    # that appended them again), which doubled the work and only produced
    # the right result because zip() cut the duplicated list short.
    pairs = []
    for bbox_fd in bbox_fd_list:
        angle = utils.get_angle(bbox_fd, img_shape)
        if utils.is_between(a1, a2, angle):
            pairs.append((angle, bbox_fd))

    # Plain tuple ordering: sorted by angle; candidates with an identical
    # angle fall back to comparing the boxes themselves.
    pairs.sort()
    bbox_list = [bbox for _, bbox in pairs]
    angle_list = [angle for angle, _ in pairs]
    return bbox_list, angle_list
def gen_interest_text(uid_list, behavior, topic_path):
    """Add the number of topics each user posted in the training window.

    Every line of ``topic_path`` (UTF-8) is one topic record whose fields are
    separated by ``[=]``; field 2 is read as the author's user id and field 3
    as the publication time (``%Y-%m-%d %H:%M:%S``). For every record whose
    author is a key of ``behavior`` and whose date lies inside the training
    window, ``behavior[uid][1]`` is incremented.

    Despite its name, the function does not build any text: it only updates
    ``behavior`` in place and returns ``None``.

    Args:
        uid_list: not used by this function; kept for interface compatibility.
        behavior: dict mapping uid -> list whose slot 1 holds the topic count.
        topic_path: path of the topic file to scan.
    """
    # The context manager closes the file even when a malformed record raises.
    with codecs.open(topic_path, 'r', 'utf-8') as f:
        for line in f:
            seg_list = line.strip().split('[=]')
            uid = seg_list[2]
            pubdate = datetime.strptime(seg_list[3], "%Y-%m-%d %H:%M:%S")
            # Count the topics published by known users during training.
            if uid in behavior and is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE):
                behavior[uid][1] += 1
def load_topic_user(filepath, start_date = VERY_EARLY_TIME, end_date = VERY_LATE_TIME):
    """Load every topic published inside a time range, plus its authors.

    Each line of ``filepath`` (UTF-8) is one topic record with ``[=]`` as
    field separator, in the order: topic id, group id, user id, publication
    time (``%Y-%m-%d %H:%M:%S``), title, content and, optionally, a comment
    list. Records with fewer than six fields are logged, counted and skipped.

    Args:
        filepath: path of the topic file.
        start_date: start of the accepted publication range.
        end_date: end of the accepted publication range.

    Returns:
        ``(topic_dict, user_set)`` where ``topic_dict`` maps topic id -> dict
        with the keys ``topic_id``, ``group_id``, ``user_id``, ``pubdate``
        (a ``datetime``), ``title``, ``content`` and ``comment_list``, and
        ``user_set`` holds the user ids of all loaded topics.
    """
    print('Loading topic from %s' % filepath)

    topic_dict = dict()  # topic_id -> topic dict
    user_set = set()
    count = 0  # number of malformed records

    # The context manager closes the file even when a malformed date raises.
    with codecs.open(filepath, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) < 6:
                log.info('Bad formatted topic: %s' % line)
                count += 1
                continue

            pubdate = datetime.strptime(seg_list[3], "%Y-%m-%d %H:%M:%S")
            if not is_between(pubdate, start_date, end_date):
                continue

            # The comment list is only taken when the record has exactly
            # seven fields; a single trailing comma is removed from it.
            comment_list = ''
            if len(seg_list) == 7:
                comment_list = seg_list[6]
                if comment_list.endswith(','):
                    comment_list = comment_list[:-1]

            topic = dict()
            topic['topic_id'] = seg_list[0]
            topic['group_id'] = seg_list[1]
            topic['user_id'] = seg_list[2]
            topic['pubdate'] = pubdate
            topic['title'] = seg_list[4]
            topic['content'] = seg_list[5]
            topic['comment_list'] = comment_list

            user_set.add(topic['user_id'])
            topic_dict[topic['topic_id']] = topic

    log.info('Number of bad formatted topic: %d' % count)
    return topic_dict, user_set
def load_comment_user(filepath, topic_dict, start_date = VERY_EARLY_TIME, end_date = VERY_LATE_TIME):
    """Load the comments of known topics published inside a time range.

    Each line of ``filepath`` (UTF-8) is one comment record with ``[=]`` as
    field separator, in the order: comment id, group id, topic id, user id,
    publication time (``%Y-%m-%d %H:%M:%S``), referenced comment id and
    content. Records that do not have exactly seven fields are logged,
    counted and skipped. A comment is only kept when its topic id is a key of
    ``topic_dict`` and its date lies inside the requested range.

    Args:
        filepath: path of the comment file.
        topic_dict: mapping of the topics whose comments should be collected.
        start_date: start of the accepted publication range.
        end_date: end of the accepted publication range.

    Returns:
        ``(comment_dict, user_set)`` where ``comment_dict`` maps comment id ->
        dict with the keys ``comment_id``, ``group_id``, ``topic_id``,
        ``user_id``, ``pubdate`` (a ``datetime``), ``ref_comment_id`` and
        ``content``, and ``user_set`` holds the user ids of all loaded
        comments.
    """
    print('Loading comment from %s' % filepath)

    comment_dict = dict()
    user_set = set()
    count = 0  # number of malformed records

    # The file used to stay open after the function returned; the context
    # manager now closes it on every exit path.
    with codecs.open(filepath, 'r', 'utf-8') as f:
        for line in f:
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) != 7:
                log.info('Bad formatted comment: %s' % line)
                count += 1
                continue

            # Parsed once and reused for both the filter and the stored value.
            pubdate = datetime.strptime(seg_list[4], "%Y-%m-%d %H:%M:%S")
            topic_id = seg_list[2]
            if topic_id not in topic_dict or not is_between(pubdate, start_date, end_date):
                continue

            comment = dict()
            comment['comment_id'] = seg_list[0]
            comment['group_id'] = seg_list[1]
            comment['topic_id'] = topic_id
            comment['user_id'] = seg_list[3]
            comment['pubdate'] = pubdate
            comment['ref_comment_id'] = seg_list[5]
            comment['content'] = seg_list[6]

            user_set.add(comment['user_id'])
            comment_dict[comment['comment_id']] = comment

    log.info('Number of bad formatted comment: %d' % count)
    return comment_dict, user_set
def move(self, colliders):
    """Advance the ball by its velocity, bouncing off ``colliders``.

    The movement is consumed piecewise: as long as some velocity remains, the
    first collider (in iteration order) that the remaining path hits is
    looked up, the ball is moved up to the bounce point, the matching
    velocity component is mirrored and the loop continues with what is left.
    When no collider is hit, the remaining movement is applied and the loop
    ends. Every displacement written to ``center_x`` / ``center_y`` is scaled
    by ``self.mod``.

    Args:
        colliders: iterable of ``(point1, point2)`` pairs describing line
            segments; only vertical and horizontal segments are supported.

    Raises:
        ValueError: if the collider that was hit is neither vertical nor
            horizontal.
    """
    # Velocity that still has to be turned into movement during this call.
    rem_vel_x = self.velocity_x
    rem_vel_y = self.velocity_y
    while rem_vel_x or rem_vel_y:  # while the ball can still move
        # Where the ball would end up if nothing were in the way.
        target_pos = Vector(rem_vel_x, rem_vel_y) + self.center
        for collider in colliders:
            point1, point2 = collider
            bounce_pos = utils.find_intersection(*self.center, *target_pos,
                                                 *point1, *point2)
            if bounce_pos is None:
                continue  # Will never collide unless angle changes
            # Distance to the bounce point, shortened by the ball's radius.
            distance_bounce = utils.distance(*self.center, *bounce_pos) - self.radius
            distance_target = utils.distance(*self.center, *target_pos)
            if distance_bounce > distance_target:
                continue  # Did not collide yet
            if not utils.is_between(*collider, bounce_pos):
                continue  # Moves past collider
            # `collider` and `bounce_pos` keep their values after this break
            # and are used by the bounce handling below.
            break
        else:  # Did not collide with any collider -> free to move
            self.center_x += rem_vel_x * self.mod
            self.center_y += rem_vel_y * self.mod
            break

        # Part of the remaining movement that is used up reaching the bounce
        # point. NOTE(review): presumably utils.to_zero limits the first value
        # by the second - confirm against utils.
        dist_x = utils.to_zero(bounce_pos[0] - self.center_x, rem_vel_x)
        dist_y = utils.to_zero(bounce_pos[1] - self.center_y, rem_vel_y)
        rem_vel_x -= dist_x
        rem_vel_y -= dist_y

        if collider[0][0] == collider[1][0]:  # collider is vertical
            # Both end points share x: mirror the horizontal components.
            dist_x = -dist_x
            rem_vel_x = -rem_vel_x
            self.velocity_x = -self.velocity_x
        elif collider[0][1] == collider[1][1]:  # collider is horizontal
            # Both end points share y: mirror the vertical components.
            dist_y = -dist_y
            rem_vel_y = -rem_vel_y
            self.velocity_y = -self.velocity_y
        else:
            raise ValueError("Collider", collider, "has to be a straight line")

        self.center_x += dist_x * self.mod
        self.center_y += dist_y * self.mod
        # The displacement multiplier grows by 0.1 with every bounce handled.
        self.mod += .1
def correct_faces_by_proximity(self, bbox_fd_list, score_fd_list, corrected_bbox):
    """Assign leftover face detections to tracked ids by angular position.

    Detections scoring above ``config.face_detection_angle_trh`` are compared
    with the angular positions of the already corrected boxes. A detection
    that passes ``self.check_angle_proximity`` and whose candidate id is
    confirmed by ``self.check_angular_position`` is handed to
    ``self.correct_tracker`` and removed from the returned lists.

    Args:
        bbox_fd_list: detected boxes.
        score_fd_list: detection scores, aligned with ``bbox_fd_list``.
        corrected_bbox: container of the ids that are already corrected; it
            is passed to ``utils.get_bbox_dict_ang_pos`` and used for
            membership tests (presumably a dict keyed by id - TODO confirm).

    Returns:
        ``(bbox_fd_list, score_fd_list)`` restricted to the detections that
        passed the score filter and were not assigned here. ``([], [])`` is
        returned when there is no detection at all or when every id in
        ``self.tmp_track`` is already part of ``corrected_bbox``.
    """
    if len(bbox_fd_list) == 0:
        return [], []

    # Keep only the detections above the score threshold; both lists are
    # filtered with the same condition so they stay aligned.
    bbox_fd_list = [
        bbox_fd_list[idx] for idx, score in enumerate(score_fd_list)
        if score > config.face_detection_angle_trh
    ]
    score_fd_list = [
        score for score in score_fd_list
        if score > config.face_detection_angle_trh
    ]
    indices = []  # positions of the detections assigned by this method

    # Get the angles of the sure bboxes ordered in ---> corrected_bbox_angles
    corrected_bbox_angles_tmp = utils.get_bbox_dict_ang_pos(
        corrected_bbox, self.cur_img.shape)
    corrected_bbox_angles = {}
    verified_names = []
    # Re-insert following self.angular_order so that iterating the dict
    # yields the corrected ids in that order.
    for name in self.angular_order:
        if name in corrected_bbox_angles_tmp.keys():
            corrected_bbox_angles[name] = corrected_bbox_angles_tmp[name]
            verified_names.append(name)

    # Angles of the tracked ids that have not been corrected yet.
    not_corrected_bbox_angles = {
        k: utils.get_angle(v[config.BBOX_KEY], self.cur_img.shape)
        for k, v in self.tmp_track.items() if k not in corrected_bbox
    }
    if not not_corrected_bbox_angles:
        # Nothing left to assign.
        return [], []

    bbox_fd_angles = [
        utils.get_angle(bbox_fd, self.cur_img.shape)
        for bbox_fd in bbox_fd_list
    ]

    # NOTE(review): tmp_order records, per detection, the pair of consecutive
    # verified ids whose angles enclose it, but it is not read anywhere else
    # in this method.
    tmp_order = {}
    l = len(verified_names)
    for idx, angle in enumerate(bbox_fd_angles):
        for i in range(l):
            if utils.is_between(
                    corrected_bbox_angles[verified_names[i]],
                    corrected_bbox_angles[verified_names[(i + 1) % l]],
                    angle):
                tmp_order[idx] = (verified_names[i],
                                  verified_names[(i + 1) % l])
                break

    for idx, bbox_fd in enumerate(bbox_fd_list):
        angle = utils.get_angle(bbox_fd, self.cur_img.shape)
        if corrected_bbox_angles:
            prev_id, next_id = None, None
            # First corrected id (in angular order) whose angle is smaller
            # than the detection's angle ...
            for name, value in corrected_bbox_angles.items():
                if angle > value:
                    prev_id = name
                    break
            # ... and first corrected id whose angle is larger.
            for name, value in corrected_bbox_angles.items():
                if angle < value:
                    next_id = name
                    break
            # Fall back to the last / first corrected id when none qualifies.
            if prev_id is None:
                prev_id = list(corrected_bbox_angles.keys())[-1]
            if next_id is None:
                next_id = list(corrected_bbox_angles.keys())[0]
            # Candidates are the ids strictly between prev_id and next_id;
            # the order is doubled so the slice can run past the end of the
            # original order.
            ang_order = self.angular_order * 2
            start = ang_order.index(prev_id)
            end = ang_order.index(next_id, start + 1)
            potential_id_list = [i for i in ang_order[start + 1:end]]
        else:
            # No corrected box to anchor on: every id is a candidate.
            potential_id_list = self.angular_order

        if self.check_angle_proximity(angle, corrected_bbox_angles):
            name = None
            if len(potential_id_list) == 1 and self.check_angular_position(
                    potential_id_list[0], bbox_fd):
                name = potential_id_list[0]
            elif len(potential_id_list) == 0:
                continue
            else:
                # Several candidates (or the single one was rejected): try
                # the not yet corrected id whose tracked angle is closest.
                key, value = min(not_corrected_bbox_angles.items(),
                                 key=lambda kv: abs(kv[1] - angle))
                if self.check_angular_position(key, bbox_fd):
                    name = key
            if name is not None:
                self.correct_tracker(name, bbox_fd, True)
                indices.append(idx)
                logging.info(
                    "Assigned {} to children {} by closest angular position"
                    .format(bbox_fd, name))

    # Drop the detections that were assigned above.
    bbox_fd_list = [
        i for j, i in enumerate(bbox_fd_list) if j not in indices
    ]
    score_fd_list = [
        i for j, i in enumerate(score_fd_list) if j not in indices
    ]
    return bbox_fd_list, score_fd_list
def main(argv):
    """Build the training and test topic files for one group.

    Steps:

    1. Load the group's topics published between ``TRAIN_START_DATE`` and
       ``TEST_END_DATE`` and the comments on them published up to
       ``COMMENT_END_DATE``.
    2. Attach to every topic the ids of its comments and of the users who
       commented on it.
    3. Write topics dated inside the training window to
       ``tables/<group>/train-topic-<group>`` and topics dated inside the test
       window, provided at least five distinct users commented on them, to
       ``tables/<group>/test-topic-<group>``.
    4. Write the ids of all users appearing in either file to
       ``social/<group>/all-users-<group>``.

    Args:
        argv: command-line arguments; ``argv[1]`` is the group id. The
            process exits with status 1 when it is missing.
    """
    if len(argv) < 2:
        print('Group ID not provided.')
        sys.exit(1)

    group_id = argv[1]
    log.info('Prepare training set and test set for group: %s' % group_id)

    # Load all topics of the group.
    path = 'tables/' + group_id + '/TopicInfo-raw-all-' + group_id
    topic_dict, _ = load_topic_user(path, TRAIN_START_DATE, TEST_END_DATE)
    message = 'Number of topics loaded: %d (From %s to %s)' % (len(topic_dict), str(TRAIN_START_DATE), str(TEST_END_DATE))
    print(message)
    log.info(message)

    path = 'tables/' + group_id + '/CommentInfo-raw-all-' + group_id
    comment_dict, _ = load_comment_user(path, topic_dict, TRAIN_START_DATE, COMMENT_END_DATE)
    message = 'Number of comments loaded: %d (From %s to %s))' % (len(comment_dict), str(TRAIN_START_DATE), str(COMMENT_END_DATE))
    print(message)
    log.info(message)

    print('Finding comment users for topics...')
    # For every topic, collect the ids of its comments and of its commenters.
    for topic_id in topic_dict:
        topic = topic_dict[topic_id]
        topic['comment_set'] = set()
        topic['comment_user_set'] = set()

    for comment_id in comment_dict:
        comment = comment_dict[comment_id]
        topic_id = comment['topic_id']
        if topic_id in topic_dict:
            topic = topic_dict[topic_id]
            topic['comment_set'].add(comment_id)
            topic['comment_user_set'].add(comment['user_id'])

    # Number of topics / commenting users that went into each data set.
    train_topic_count = 0
    train_comment_count = 0
    test_topic_count = 0
    test_comment_count = 0
    user_set = set()  # every user id appearing in the training or test set

    train_path = 'tables/' + group_id + '/train-topic-' + group_id
    test_path = 'tables/' + group_id + '/test-topic-' + group_id
    # Both output files are closed even when writing a row fails.
    with codecs.open(train_path, 'w', 'utf-8') as train_topic_file, \
            codecs.open(test_path, 'w', 'utf-8') as test_topic_file:
        print('Generating training and test dataset...')
        for topic_id, topic in topic_dict.items():
            pubdate = topic['pubdate']
            comment_user_set = topic['comment_user_set']
            log.info('Comment user number for topic %s is: %d' % (topic_id, len(comment_user_set)))

            if is_between(pubdate, TRAIN_START_DATE, TRAIN_END_DATE):
                train_topic_count += 1
                train_comment_count += len(comment_user_set)
                out_file = train_topic_file
            elif is_between(pubdate, TEST_START_DATE, TEST_END_DATE):
                # A test topic needs at least 5 distinct commenting users.
                if len(comment_user_set) < 5:
                    continue
                test_topic_count += 1
                test_comment_count += len(comment_user_set)
                out_file = test_topic_file
            else:
                # The topic belongs to neither window (e.g. it falls into a
                # gap between them). It used to be written to whichever file
                # the previous iteration had selected, or to raise NameError
                # when it came first; now it is skipped.
                continue

            user_set.add(topic['user_id'])  # topic creator
            user_set |= comment_user_set    # commenting users

            row = '[=]'.join([
                topic['topic_id'],
                topic['group_id'],
                topic['user_id'],
                str(pubdate),
                topic['title'],
                topic['content'],
                ','.join(topic['comment_set']),
                ','.join(comment_user_set),
            ]) + '\n'
            out_file.write(row)

    # Write all user ids to file.
    path = 'social/' + group_id + '/all-users-' + group_id
    print('Writing user list to file: %s' % path)
    with codecs.open(path, 'w', 'utf-8') as f:
        for uid in user_set:
            f.write(uid + '\n')

    print('Total users in train and test set: %d' % len(user_set))
    print('For training, number of topics: %d, number of commenting users: %d' % (train_topic_count, train_comment_count))
    print('For test, number of topics: %d, number of commenting users: %d' % (test_topic_count, test_comment_count))
    print('Done')