from pre_processing import to_process, get_senti_representation, vocabulary_pos

# For each review domain: load the pickled dataset, preprocess its documents,
# build the sentiment representation of the vocabulary and fit HDBSCAN on the
# resulting scores.
# NOTE(review): ``xlsxwriter``, ``pickle`` and ``hdbscan`` are used below but
# their imports are not visible here; presumably they sit above — confirm.
for src in ('books', 'electronics', 'kitchen', 'dvd'):
    # One workbook per domain.
    # NOTE(review): XlsxWriter produces the .xlsx format and its documentation
    # asks for an ``.xlsx`` file name (Excel warns otherwise). The path is
    # left untouched because other code may look for this exact file name.
    book = xlsxwriter.Workbook('Sheets/Clustering/DBScan' + src + '.xls')
    # book = xlsxwriter.Workbook(src + '.xls')

    # SECURITY: pickle.load can execute arbitrary code embedded in the file;
    # only load dataset files that come from a trusted source.
    with open('Datasets/dataset_' + src, 'rb') as fp:
        dataset = pickle.load(fp)

    # Preprocessing and getting swn representation
    data = to_process(dataset.docs, '6', 50)
    vocabulary = vocabulary_pos(data)
    vocab, scores = get_senti_representation(vocabulary, True)

    for n in (10, 20, 30, 40, 50, 75, 100, 150, 200, 250, 300, 350, 400):
        # Progress marker; stays on the same output line (end=' ').
        print(src, n, end=' ')

        # NOTE(review): none of the HDBSCAN arguments depends on ``n`` (only
        # the commented-out SpectralClustering / KMeans alternatives did). If
        # the rest of this loop body does not use ``n`` either, the same
        # model is refitted on the same scores for every value — confirm.
        clustering = hdbscan.HDBSCAN(
            algorithm='best',
            alpha=1.0,
            approx_min_span_tree=True,
            gen_min_span_tree=False,
            leaf_size=40,
            metric='euclidean',
            min_cluster_size=5,
            min_samples=None,
            p=None,
        )
        # clustering = DBSCAN(eps=1, min_samples=2, algorithm='kd_tree')
        # clustering = SpectralClustering(n_clusters=n, assign_labels="discretize", random_state=0)
        # clustering = KMeans(n_clusters=n, random_state=0)
        clustering.fit(scores)
# Tail of a helper whose ``def`` line lies above this chunk. The layout is
# reconstructed from a whitespace-collapsed line; the nesting depth (one level
# inside the function) is an assumption — confirm against the full file.
    for item in lists:
        # Concatenate the words of one item, each preceded by a single space,
        # so a non-empty result starts with a leading space.
        text = ""
        for word in item:
            text = text + " " + word
        new_docs.append(text)
    return new_docs


# Load the pickled object stored in the file 'dictionary'.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load files that come from a trusted source.
with open('dictionary', 'rb') as fp:
    grouped = pickle.load(fp)

# Source side: the kitchen dataset.
with open('Datasets/dataset_kitchen', 'rb') as fp:
    dataset = pickle.load(fp)
data_src = to_process(dataset.docs, '1011', 3)

# Target side: the electronics dataset. ``dataset`` is rebound here, so from
# this point on it refers to the electronics data.
with open('Datasets/dataset_electronics', 'rb') as fp:
    dataset = pickle.load(fp)
data_tgt = to_process(dataset.docs, '1011', 3)

# Vocabulary and sentiment representation of the source documents.
vocabulary_source = get_vocabulary(data_src)
vocab_source, scores_source, dicti_source = get_senti_representation(
    vocabulary_source, True)

# Same two steps for the target documents.
vocabulary_target = get_vocabulary(data_tgt)
vocab_target, scores_target, dicti_target = get_senti_representation(
    vocabulary_target, True)

# Empty accumulators; presumably filled by code below this chunk — not visible.
x = []
y = []
while len(x) < max_length: x.append(np.zeros(emb_size)) x_batch.append(x) return x_batch # -------------------------- preprocessing ------------------------------------ print("\npreprocessing=====================================\n") datasets = {} labels = {} with open('Datasets/dataset_books', 'rb') as fp: dataset = pickle.load(fp) data = to_process(dataset.docs, pos, 3, "Books") datasets['books'] = data labels['books'] = dataset.labels with open('Datasets/dataset_dvd', 'rb') as fp: dataset = pickle.load(fp) data = to_process(dataset.docs, pos, 3, "DVD") datasets['dvd'] = data labels['dvd'] = dataset.labels with open('Datasets/dataset_electronics', 'rb') as fp: dataset = pickle.load(fp) data = to_process(dataset.docs, pos, 3, "Eletronics") datasets['electronics'] = data labels['electronics'] = dataset.labels