def time_seq_count():
    """Count accepted rows per time sequence and label across ``data_dir``.

    Every entry of ``data_dir`` is read as a text file of tab-separated
    lines with exactly four fields: ``created_at``, ``user_id``, ``label``
    and ``text``. Rows whose text is rejected by ``ok_without_rt`` are
    skipped; the remaining rows are bucketed by ``timeSeq(created_at)``.

    Returns:
        A ``(counts_per_time_unit, labels)`` tuple. ``labels`` is the list
        of labels that had at least one accepted row. ``counts_per_time_unit``
        maps each time sequence to a ``{label: count}`` dict that holds an
        entry for every label in ``labels`` (0 when that label has no rows
        in that time sequence).

    Raises:
        ValueError: if a line does not split into exactly four fields.
    """
    time_seqs_per_label = {}
    for fname in os.listdir(data_dir):
        full_fname = os.path.join(data_dir, fname)
        # The context manager closes each file as soon as it has been read,
        # instead of leaving the handle open until garbage collection.
        with open(full_fname) as data_file:
            for line in data_file:
                created_at, user_id, label, text = line.rstrip('\n').split('\t')
                time_seq = timeSeq(created_at)
                if not ok_without_rt(text):
                    continue
                time_seqs_per_label.setdefault(label, []).append(time_seq)

    # Aggregate into a single table keyed by time sequence.
    # `labels` must be a real list: it is iterated once per new table row
    # and also returned to the caller, so a one-shot iterator would not do.
    labels = list(time_seqs_per_label)
    counts_per_time_unit = {}
    for label, time_seqs in time_seqs_per_label.items():
        # Time sequences are unique Counter keys, so sorting the
        # (time_seq, count) pairs orders them by time sequence.
        for time_seq, count in sorted(Counter(time_seqs).items()):
            if time_seq not in counts_per_time_unit:
                # Start every row with a zero for each known label.
                counts_per_time_unit[time_seq] = dict.fromkeys(labels, 0)
            counts_per_time_unit[time_seq][label] = count
    return (counts_per_time_unit, labels)
def extract_specific_time_music(week_num, music_label):
    """Return the distinct user ids with a given label in a given time sequence.

    Every entry of ``data_dir`` is read as a text file of tab-separated
    lines with exactly four fields: ``created_at``, ``user_id``, ``label``
    and ``text``.

    Args:
        week_num: value compared for equality against ``timeSeq(created_at)``.
        music_label: value compared for equality against the row's label.

    Returns:
        A list of unique user ids from the matching rows, in no particular
        order.

    Raises:
        ValueError: if a line does not split into exactly four fields.
    """
    # NOTE(review): unlike time_seq_count and extract_show, rows are not
    # filtered through ok_without_rt here -- confirm this is intended.
    unique_user_ids = set()
    for fname in os.listdir(data_dir):
        full_fname = os.path.join(data_dir, fname)
        # The context manager closes each file as soon as it has been read,
        # instead of leaving the handle open until garbage collection.
        with open(full_fname) as data_file:
            for line in data_file:
                created_at, user_id, label, text = line.rstrip('\n').split('\t')
                time_seq = timeSeq(created_at)
                if time_seq == week_num and label == music_label:
                    unique_user_ids.add(user_id)
    return list(unique_user_ids)
def extract_show(show_label):
    """Return the distinct user ids of accepted rows carrying ``show_label``.

    Every entry of ``data_dir`` is read as a text file of tab-separated
    lines with exactly four fields: ``created_at``, ``user_id``, ``label``
    and ``text``. A row counts only if ``ok_without_rt(text)`` is truthy
    and its label equals ``show_label``.

    Args:
        show_label: value compared for equality against the row's label.

    Returns:
        A list of unique user ids from the matching rows, in no particular
        order.

    Raises:
        ValueError: if a line does not split into exactly four fields.
    """
    unique_user_ids = set()
    for fname in os.listdir(data_dir):
        full_fname = os.path.join(data_dir, fname)
        # The context manager closes each file as soon as it has been read,
        # instead of leaving the handle open until garbage collection.
        with open(full_fname) as data_file:
            for line in data_file:
                created_at, user_id, label, text = line.rstrip('\n').split('\t')
                # created_at is not needed here: the result does not depend
                # on the row's time sequence.
                if ok_without_rt(text) and label == show_label:
                    unique_user_ids.add(user_id)
    return list(unique_user_ids)