Example #1
import os

import joblib
import numpy as np
from sklearn.tree import DecisionTreeClassifier

import helpers


def process_chunks(chunks, word_count, tag_count, clf_folder,
                   classifier_filenames, bytes_processed, bytes_total, lock):
    for chunk in chunks:

        # Convert to sparse matrix
        X, target_indices = helpers.chunk_to_sparse_mat(chunk, word_count)

        if X is None: continue

        # Create target vector from target indices
        Y = np.zeros((len(target_indices), tag_count))
        for i, indices in enumerate(target_indices):
            Y[i, indices] = 1

        # Train decision tree
        clf = DecisionTreeClassifier(
            splitter='best',
            max_features='sqrt',  # formerly 'auto'; that alias was removed in newer scikit-learn
            max_depth=None,
        )

        # Fit data
        clf.fit(X.toarray(), Y)

        # Save trained classifier
        classifier_filename = os.path.join(clf_folder, 'clf-%s-%s.pkl' % chunk)
        joblib.dump(clf, classifier_filename)

        # Record the classifier filename and update shared progress
        with lock:
            classifier_filenames.append(classifier_filename)
            bytes_processed.value += chunk[1]
            print('Processed: %d/%d' % (bytes_processed.value, bytes_total))
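
The function above expects `classifier_filenames`, `bytes_processed`, and `lock` to be process-shared objects. A minimal driver sketch under that assumption is shown below; the worker count, the interleaved slicing of `chunks`, and the `train_in_parallel` name are illustrative and not part of the original snippet.

import multiprocessing


def train_in_parallel(chunks, word_count, tag_count, clf_folder,
                      bytes_total, n_workers=4):
    manager = multiprocessing.Manager()
    classifier_filenames = manager.list()    # shared list of saved model paths
    bytes_processed = manager.Value('l', 0)  # shared progress counter
    lock = manager.Lock()

    # Give every worker an interleaved slice of the chunk list.
    slices = [chunks[i::n_workers] for i in range(n_workers)]
    workers = [
        multiprocessing.Process(
            target=process_chunks,
            args=(s, word_count, tag_count, clf_folder,
                  classifier_filenames, bytes_processed, bytes_total, lock))
        for s in slices
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()

    return list(classifier_filenames)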
Example #2
import collections

import numpy as np

import helpers


def process_chunks(chunks, word_count, K, mu, cluster_sums, cluster_counts,
                   lock):
    for chunk in chunks:

        # Convert to sparse matrix
        X, _ = helpers.chunk_to_sparse_mat(chunk, word_count)

        if X is None: continue

        # Get closest cluster indices
        max_idx = helpers.sparse_matrix_to_cluster_indices(X, mu)

        mu_subs = collections.defaultdict(list)
        for i, k in enumerate(max_idx):
            mu_subs[k].append(X[i].toarray())

        # Compute sub-means
        for k in range(0, K):
            mu_sub = mu_subs[k]
            if len(mu_sub) == 0: continue

            with lock:
                cluster_sums[k] = cluster_sums[k] + np.asarray(
                    mu_sub, dtype=np.float32).mean(axis=0)
                cluster_counts[k] += 1
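
`helpers.sparse_matrix_to_cluster_indices` is project-specific and not shown here. As a purely hypothetical stand-in, a similarity-based assignment (which would match the `max_idx` name used above) could look like the sketch below, assuming `mu` is a dense `(K, word_count)` NumPy array.

import numpy as np


def closest_cluster_indices(X, mu):
    # Hypothetical replacement for helpers.sparse_matrix_to_cluster_indices:
    # score every row of the sparse matrix X against each centroid with a
    # dot product and pick the index of the highest-scoring centroid.
    scores = np.asarray(X.dot(mu.T))  # shape (n_rows, K)
    return scores.argmax(axis=1)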
Example #3
import collections
import time

import numpy as np

import config
import helpers

chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.TRAIN_DATA_IDX,
    chunk_size=config.data.CHUNK_SIZE)  # TODO: Change
chunks = list(chunk_reader)  # read all chunks into memory once

for iteration in range(0, config.algorithm.MAX_ITER):
    start = time.time()

    cluster_sums = {k: np.zeros((1, word_count)) for k in range(0, K)}
    cluster_counts = {k: 0 for k in range(0, K)}

    for chunk in chunks:

        # Convert to sparse matrix
        X, _ = helpers.chunk_to_sparse_mat(chunk, word_count)

        if X is None: continue

        # Get closest cluster indices
        max_idx = helpers.sparse_matrix_to_cluster_indices(X, mu)

        mu_subs = collections.defaultdict(list)
        for i, k in enumerate(max_idx):
            mu_subs[k].append(X[i].toarray())

        # Compute sub-means
        for k in range(0, K):
            mu_sub = mu_subs[k]
            if len(mu_sub) == 0: continue
            cluster_sums[k] += np.asarray(mu_sub,
                                          dtype=np.float32).mean(axis=0)
            cluster_counts[k] += 1
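
The listing stops before the end of the iteration body. A sketch of the step that would typically close each pass, still indented inside the `for iteration ...` loop, is given below; the convergence threshold and the exact update rule are assumptions, not taken from the original code.

    # Turn the accumulated per-chunk means into new centroids; clusters that
    # received no chunk keep their previous centroid.
    new_mu = np.array(mu, dtype=np.float32, copy=True)
    for k in range(0, K):
        if cluster_counts[k] > 0:
            new_mu[k] = np.ravel(cluster_sums[k] / cluster_counts[k])

    shift = float(np.abs(new_mu - mu).max())
    mu = new_mu
    print('Iteration %d finished in %.1fs (max centroid shift: %f)'
          % (iteration, time.time() - start, shift))

    if shift < 1e-4:  # assumed convergence threshold
        break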