Code example #1
import pickle

import hdbscan
import xlsxwriter
# from sklearn.cluster import DBSCAN, KMeans, SpectralClustering  # alternatives tried below

from pre_processing import to_process, get_senti_representation, vocabulary_pos

# Default values; both are overwritten by the loops below
src = 'books'
n = 10

for src in ('books', 'electronics', 'kitchen', 'dvd'):

    # One results workbook per source domain (note: xlsxwriter always writes
    # xlsx-format data, whatever the file extension says)
    book = xlsxwriter.Workbook('Sheets/Clustering/DBScan' + src + '.xls')
    # book = xlsxwriter.Workbook(src + '.xls')

    with open('Datasets/dataset_' + src, 'rb') as fp:
        dataset = pickle.load(fp)

    # Preprocessing and getting swn representation
    data = to_process(dataset.docs, '6', 50)
    vocabulary = vocabulary_pos(data)
    vocab, scores = get_senti_representation(vocabulary, True)

    for n in (10, 20, 30, 40, 50, 75, 100, 150, 200, 250, 300, 350, 400):

        print(src, n, end=' ')

        # Note: HDBSCAN chooses the number of clusters itself; n is only used by
        # the commented-out alternatives below.
        clustering = hdbscan.HDBSCAN(algorithm='best', alpha=1.0, approx_min_span_tree=True,
                                     gen_min_span_tree=False, leaf_size=40, metric='euclidean',
                                     min_cluster_size=5, min_samples=None, p=None)

        # clustering = DBSCAN(eps=1, min_samples=2, algorithm='kd_tree')
        # clustering = SpectralClustering(n_clusters=n, assign_labels="discretize", random_state=0)
        # clustering = KMeans(n_clusters=n, random_state=0)

        clustering.fit(scores)
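
The listing stops right after the call to fit. As a minimal, self-contained sketch (the toy data and variable names below are illustrative, not from the original code), the fitted clusterer exposes its assignments through the scikit-learn-style labels_ attribute, where -1 marks noise points:

import numpy as np
import hdbscan

# Toy stand-in for the SWN score vectors clustered above
toy_scores = np.random.rand(100, 3)

clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean')
clusterer.fit(toy_scores)

labels = clusterer.labels_  # one label per input row, -1 = noise
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print('clusters found:', n_clusters, '| noise points:', int(np.sum(labels == -1)))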
Code example #2
# Imports reconstructed for this fragment; get_vocabulary is assumed to live in
# the same pre_processing module as the other helpers (cf. example #1).
import pickle
from pre_processing import to_process, get_vocabulary, get_senti_representation


# Reconstructed header; the original function name is not shown in the fragment.
def join_tokens(lists):
    new_docs = []
    for item in lists:
        # Rebuild each tokenised document into a single whitespace-joined string
        text = ""
        for word in item:
            text = text + " " + word
        new_docs.append(text)

    return new_docs


# Previously pickled dictionary of word groups
with open('dictionary', 'rb') as fp:
    grouped = pickle.load(fp)

# Source domain: kitchen
with open('Datasets/dataset_kitchen', 'rb') as fp:
    dataset = pickle.load(fp)
data_src = to_process(dataset.docs, '1011', 3)

# Target domain: electronics
with open('Datasets/dataset_electronics', 'rb') as fp:
    dataset = pickle.load(fp)
data_tgt = to_process(dataset.docs, '1011', 3)

vocabulary_source = get_vocabulary(data_src)
vocab_source, scores_source, dicti_source = get_senti_representation(
    vocabulary_source, True)

vocabulary_target = get_vocabulary(data_tgt)
vocab_target, scores_target, dicti_target = get_senti_representation(
    vocabulary_target, True)

x = []
y = []
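
The listing ends just after the empty x and y lists are created. As a small, hedged sketch (assuming, as in example #1, that vocab_* and scores_* are parallel lists), the two domain vocabularies can be paired with their score vectors and compared:

# Assumption: vocab_source[i] is the word whose score vector is scores_source[i]
source_lookup = dict(zip(vocab_source, scores_source))
target_lookup = dict(zip(vocab_target, scores_target))

# Words present in both domain vocabularies
common = set(source_lookup) & set(target_lookup)
print(len(common), 'words shared between the kitchen and electronics vocabularies')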
Code example #3
# Imports and the function header/outer loop below are reconstructed (names
# assumed); the original fragment began mid-function. The POS filter `pos`
# used further below is assumed to be defined earlier in the original file.
import pickle
import numpy as np
from pre_processing import to_process


def pad_batch(docs_embeddings, max_length, emb_size):
    x_batch = []
    for doc in docs_embeddings:
        x = list(doc)  # one document's embedding vectors, zero-padded below
        while len(x) < max_length:
            x.append(np.zeros(emb_size))

        x_batch.append(x)
    return x_batch


# -------------------------- preprocessing ------------------------------------
print("\npreprocessing=====================================\n")

datasets = {}
labels = {}

with open('Datasets/dataset_books', 'rb') as fp:
    dataset = pickle.load(fp)
data = to_process(dataset.docs, pos, 3, "Books")
datasets['books'] = data
labels['books'] = dataset.labels

with open('Datasets/dataset_dvd', 'rb') as fp:
    dataset = pickle.load(fp)
data = to_process(dataset.docs, pos, 3, "DVD")
datasets['dvd'] = data
labels['dvd'] = dataset.labels

with open('Datasets/dataset_electronics', 'rb') as fp:
    dataset = pickle.load(fp)
data = to_process(dataset.docs, pos, 3, "Eletronics")
datasets['electronics'] = data
labels['electronics'] = dataset.labels
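
The three per-domain blocks above follow the same pattern, so they could equally be written as one loop; a minimal sketch under the same assumptions (the Datasets/dataset_<domain> file layout and the pos filter defined earlier):

domains = {'books': 'Books', 'dvd': 'DVD', 'electronics': 'Electronics'}

for key, display_name in domains.items():
    with open('Datasets/dataset_' + key, 'rb') as fp:
        dataset = pickle.load(fp)
    datasets[key] = to_process(dataset.docs, pos, 3, display_name)
    labels[key] = dataset.labels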