def q2(content):
    """Cluster question-2 responses by the day/time mentions they contain.

    Splits the corpus into reviews mentioning a day, reviews mentioning a
    specific time, empty reviews, and the rest.  Day reviews are grouped by
    their extracted day label (one cluster per unique day); all specific-time
    reviews form a single 'specific_time' cluster.

    Returns:
        (documents, cluster_info_all) where documents is the re-concatenated
        review list and each cluster_info_all entry is
        [question number, label, freq, centroid_sentence].
    """
    pos_tags = []
    doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus(
        content, pos_tags, question=2)
    doc_day, doc_time = doc_extracted[0], doc_extracted[1]

    label_day = []
    label_time = []
    cluster_info_all = []

    # Extract each label exactly once (the original called
    # day_extract/time_extract twice per review) and record it both in the
    # review tuple and in the label list used for grouping below.
    for idx, sing_review in enumerate(doc_day):
        day_label = day_extract(sing_review)
        doc_day[idx] = (sing_review[0], sing_review[1], day_label)
        label_day.append(day_label)
    for idx, sing_review in enumerate(doc_time):
        time_label = time_extract(sing_review)
        doc_time[idx] = (sing_review[0], sing_review[1], time_label)
        label_time.append(time_label)

    # One cluster per unique day label; unique_day_indices maps each review
    # back to its label's position in unique_day.
    unique_day, unique_day_indices = np.unique(label_day, return_inverse=True)
    unique_day_indices = np.array(unique_day_indices)
    for cluster_idx in range(len(unique_day)):
        label = unique_day[cluster_idx]
        idx_set = np.where(unique_day_indices == cluster_idx)[0]
        # Distinct comprehension variable: the original reused the outer loop
        # index, which leaks out of list comprehensions in Python 2.
        sent = get_Cluster_Centroid([doc_day[member][0] for member in idx_set])
        cluster_info_all.append([2, label, len(idx_set), sent])

    # All specific-time reviews form a single cluster.
    doc_time_cluster = [doc[0] for doc in doc_time]
    sent = get_Cluster_Centroid(doc_time_cluster)
    cluster_info_all.append([2, 'specific_time', len(doc_time), sent])

    return doc_day + doc_time + doc_noimprove + doc_other, cluster_info_all
def q6(content): pos_tags = ['NN', 'NNS'] doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus( content, pos_tags, question=6) doc_nn, nn_extracted = doc_extracted[0], doc_extracted[1] print 'Comment with keywords:', len(doc_nn) print 'No comments:', len(doc_noimprove) print 'Comment without keywords:', len(doc_other), "\n" # Rule-based clustering unclustered_index, nn_extracted, cluster_info = rule_based_clustering.clustering( nn_extracted, doc_nn, question=6) # cluster_info_all format: [[question number, label, freq, centroid_sentence],...] cluster_info_all = [[6, info[0], info[1], info[2]] for info in cluster_info] # LSI + Spectral Clustering nn_extracted_unclustered = [nn_extracted[i][0] for i in unclustered_index] doc_nn_unclustered = [doc_nn[i] for i in unclustered_index] similarity_matrix = auto_clustering.lsi(nn_extracted_unclustered) label_auto, cluster_info = auto_clustering.spectral_clustering( similarity_matrix, nn_extracted_unclustered, doc_nn_unclustered) for i, idx in enumerate(unclustered_index): nn_extracted[idx] = nn_extracted[idx] + (label_auto[i], ) for info in cluster_info: cluster_info_all.append([6, info[0], info[1], info[2]]) return nn_extracted + doc_noimprove + doc_other, cluster_info_all
def q9(content): pos_tags = ['NN', 'NNS'] doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus( content, pos_tags, question=9) doc_nn, nn_extracted = doc_extracted[0], doc_extracted[1] print 'Comment with keywords:', len(doc_nn) print 'No comments:', len(doc_noimprove) print 'Comment without keywords:', len(doc_other) df = post_processing.df_count_tuple(nn_extracted) nn_clean = post_processing.filter_ne(nn_extracted, doc_nn, df, question=9) df = post_processing.df_count(nn_clean) nn_extracted, cluster_info = post_processing.main_category_clustering( df, nn_extracted, nn_clean, doc_nn) # cluster_info_all format: [[question number, label, freq, centroid_sentence],...] cluster_info_all = [[9, info[0], info[1], info[2]] for info in cluster_info] return nn_extracted + doc_noimprove + doc_other, cluster_info_all
def q5(content): pos_tags = ['NN', 'NNS', 'JJ', 'JJR', 'JJS'] doc_noimprove, doc_extracted, doc_other = pre_processing.process_corpus( content, pos_tags, question=5) doc_nn, nn_extracted = doc_extracted[0], doc_extracted[1] print 'Comment with keywords:', len(doc_nn) print 'No comments:', len(doc_noimprove) print 'Comment without keywords:', len(doc_other), "\n" # LSI + Spectral Clustering nn_extracted_corpus = [nn_single[0] for nn_single in nn_extracted] similarity_matrix = auto_clustering.lsi(nn_extracted_corpus) label_auto, cluster_info = auto_clustering.spectral_clustering( similarity_matrix, nn_extracted_corpus, doc_nn, cluster_num=10) for idx in range(len(nn_extracted_corpus)): nn_extracted[idx] = nn_extracted[idx] + (label_auto[idx], ) cluster_info_all = [[5, info[0], info[1], info[2]] for info in cluster_info] return nn_extracted + doc_noimprove + doc_other, cluster_info_all