import os

import joblib
import numpy as np
from sklearn.tree import DecisionTreeClassifier

import helpers


def process_chunks(chunks, word_count, tag_count, clf_folder,
                   classifier_filenames, bytes_processed, bytes_total, lock):
    for chunk in chunks:
        # Convert the chunk to a sparse feature matrix
        X, target_indices = helpers.chunk_to_sparse_mat(chunk, word_count)
        if X is None:
            continue

        # Build the multi-label target matrix from the target indices
        Y = np.zeros((len(target_indices), tag_count))
        for i, indices in enumerate(target_indices):
            Y[i, indices] = 1

        # Train a decision tree on this chunk
        clf = DecisionTreeClassifier(
            splitter='best',
            max_features='auto',
            max_depth=None,
        )
        clf.fit(X.toarray(), Y)

        # Save the trained classifier; chunk is assumed to be an (offset, size) pair
        classifier_filename = os.path.join(clf_folder, 'clf-%s-%s.pkl' % chunk)
        joblib.dump(clf, classifier_filename)

        # Record the classifier filename and update progress under the lock
        with lock:
            classifier_filenames.append(classifier_filename)
            bytes_processed.value += chunk[1]
            print('Processed: %d/%d' % (bytes_processed.value, bytes_total))
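# --- Usage sketch (illustrative, not from the original source) ---
# A minimal example of fanning process_chunks out over worker processes.
# N_PROCESSES, word_count, tag_count and clf_folder are hypothetical values;
# the chunk list and the shared-state wiring mirror what the worker expects.
if __name__ == '__main__':
    import multiprocessing

    import config

    N_PROCESSES = 4             # hypothetical worker count
    word_count = 10000          # hypothetical vocabulary size
    tag_count = 500             # hypothetical number of tags
    clf_folder = 'classifiers'  # hypothetical output folder

    chunk_reader = helpers.ChunkReader(
        post_filename=config.paths.TRAIN_DATA_IDX,
        chunk_size=config.data.CHUNK_SIZE)
    chunks = list(chunk_reader)
    bytes_total = sum(chunk[1] for chunk in chunks)

    manager = multiprocessing.Manager()
    classifier_filenames = manager.list()    # shared output list
    bytes_processed = manager.Value('i', 0)  # shared progress counter
    lock = manager.Lock()

    # Deal the chunks round-robin across the workers
    splits = [chunks[i::N_PROCESSES] for i in range(N_PROCESSES)]
    workers = [
        multiprocessing.Process(
            target=process_chunks,
            args=(split, word_count, tag_count, clf_folder,
                  classifier_filenames, bytes_processed, bytes_total, lock))
        for split in splits
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()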
import collections

import numpy as np

import helpers


def process_chunks(chunks, word_count, K, mu, cluster_sums, cluster_counts, lock):
    for chunk in chunks:
        # Convert the chunk to a sparse feature matrix
        X, _ = helpers.chunk_to_sparse_mat(chunk, word_count)
        if X is None:
            continue

        # Assign each row to its closest cluster centroid
        max_idx = helpers.sparse_matrix_to_cluster_indices(X, mu)
        mu_subs = collections.defaultdict(list)
        for i, k in enumerate(max_idx):
            mu_subs[k].append(X[i].toarray())

        # Accumulate this chunk's per-cluster sub-means into the shared state
        for k in range(K):
            mu_sub = mu_subs[k]
            if len(mu_sub) == 0:
                continue
            with lock:
                cluster_sums[k] = cluster_sums[k] + np.asarray(
                    mu_sub, dtype=np.float32).mean(axis=0)
                cluster_counts[k] += 1
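# --- Shared-state sketch (an assumption, not from the original source) ---
# A minimal setup for the shared containers this worker expects, using
# multiprocessing.Manager proxies. Note that the worker reassigns
# cluster_sums[k] rather than mutating the array in place: in-place mutation
# of an object fetched from a Manager proxy, without a reassignment through
# the proxy, would not be propagated to other processes.
import multiprocessing


def make_shared_state(K, word_count):
    manager = multiprocessing.Manager()
    cluster_sums = manager.dict(
        {k: np.zeros((1, word_count), dtype=np.float32) for k in range(K)})
    cluster_counts = manager.dict({k: 0 for k in range(K)})
    lock = manager.Lock()
    return cluster_sums, cluster_counts, lock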
import collections
import time

import numpy as np

import config
import helpers

# K, mu and word_count are assumed to be initialized earlier in this script.
chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.TRAIN_DATA_IDX,
    chunk_size=config.data.CHUNK_SIZE)
# TODO: Change; this materializes every chunk in memory up front
chunks = list(chunk_reader)

for iteration in range(config.algorithm.MAX_ITER):
    start = time.time()
    cluster_sums = {k: np.zeros((1, word_count)) for k in range(K)}
    cluster_counts = {k: 0 for k in range(K)}

    for chunk in chunks:
        # Convert the chunk to a sparse feature matrix
        X, _ = helpers.chunk_to_sparse_mat(chunk, word_count)
        if X is None:
            continue

        # Assign each row to its closest cluster centroid
        max_idx = helpers.sparse_matrix_to_cluster_indices(X, mu)
        mu_subs = collections.defaultdict(list)
        for i, k in enumerate(max_idx):
            mu_subs[k].append(X[i].toarray())

        # Accumulate this chunk's per-cluster sub-means
        for k in range(K):
            mu_sub = mu_subs[k]
            if len(mu_sub) == 0:
                continue
            cluster_sums[k] += np.asarray(mu_sub, dtype=np.float32).mean(axis=0)
            cluster_counts[k] += 1
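# --- Centroid-update sketch (an assumption; this step is not shown in the
# snippet above) ---
# Each iteration accumulates per-chunk sub-means into cluster_sums and counts
# the contributing chunks in cluster_counts, so a plausible close to the loop
# averages them into new centroids and reports the time captured in `start`.
def finish_iteration(mu, cluster_sums, cluster_counts, K, iteration, start):
    for k in range(K):
        if cluster_counts[k] > 0:
            mu[k] = cluster_sums[k] / cluster_counts[k]
    print('Iteration %d finished in %.1fs' % (iteration, time.time() - start))
    return mu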