"""Evaluate tag retrieval: count how many true tags are recovered in the
top-k cluster predictions for each test chunk.

NOTE(review): this script was collapsed onto a single physical line
(newlines lost), which turned every statement after a '#' into a comment.
Reformatted with inferred indentation -- confirm against the original
file. The visible source is truncated mid-loop.
"""
import numpy as np  # np.load is used below but numpy was not imported in the source

import helpers
import config

# Tag and word vocabularies shared with the training script.
tags, tag2idx, tag_count = helpers.read_tags()
words, word2idx, word_count = helpers.read_words()

# Load the cluster means produced by training.
with open(config.paths.MU, 'rb') as f:
    mu = np.load(f)

# Read the indexed test posts in fixed-size chunks.
chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.TEST_DATA_IDX,
    chunk_size=config.data.CHUNK_SIZE)
chunks = [chunk for chunk in chunk_reader]

# Mapping from cluster id to its associated tag(s).
cluster2tag = config.data.load_cluster_tags()

with open(config.paths.TEST_DATA_IDX, 'r') as f:
    # Count number of true retrieved tags in 'top k', for every k.
    true_counts_at_k = {k: 0 for k in range(0, tag_count)}
    total_tag_counts = 0
    for chunk in chunks:
        # Convert to sparse matrix (bag-of-words features X, true tags y_tags).
        X, y_tags = helpers.chunk_to_sparse_mat(chunk, word_count)
        # NOTE(review): the remainder of this loop body is missing from
        # the visible source (truncated) -- restore from the original file.
"""Train tag clusters: initialize K random cluster centers and refine
them over the indexed training data for MAX_ITER iterations.

NOTE(review): this fragment was collapsed onto a single physical line
(newlines lost), hiding all code behind comments. Reformatted with
inferred indentation -- confirm against the original file. 'np', 'time',
'helpers' and 'config' were referenced with no imports visible in this
fragment; they are imported explicitly here (harmless if the original
file already imports them above this point).
"""
import time

import numpy as np

import helpers
import config

# Tag and word vocabularies shared with the evaluation script.
tags, tag2idx, tag_count = helpers.read_tags()
words, word2idx, word_count = helpers.read_words()

# Clusters: one cluster per tag.
K = tag_count
# Initialize cluster centers uniformly at random in [0, 1).
mu = np.random.rand(K, word_count)

# Read the indexed training posts in fixed-size chunks.
# TODO: Change (kept from the original source)
chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.TRAIN_DATA_IDX,
    chunk_size=config.data.CHUNK_SIZE)
chunks = [chunk for chunk in chunk_reader]

# with open(config.paths.TRAIN_DATA_IDX, 'r') as f:   (commented out in source)
for iteration in range(0, config.algorithm.MAX_ITER):
    start = time.time()  # wall-clock timing of this iteration
    # Per-cluster accumulators used to recompute the means.
    cluster_sums = {k: np.zeros((1, word_count)) for k in range(0, K)}
    cluster_counts = {k: 0 for k in range(0, K)}
    for chunk in chunks:
        # Convert to sparse matrix (tag labels unused during training).
        X, _ = helpers.chunk_to_sparse_mat(chunk, word_count)
        # NOTE(review): the remainder of this loop body is missing from
        # the visible source (truncated) -- restore from the original file.
"""Split the indexed posts into train and test index files.

NOTE(review): this script was collapsed onto a single physical line
(newlines lost), which turned every statement after a '#' into a
comment. Reformatted with inferred indentation -- confirm against the
original file. The inner loop body is truncated in the visible source,
so a 'pass' placeholder marks where the original per-chunk logic belongs.
"""
import helpers
import config
from sklearn.model_selection import train_test_split

# Tag and word vocabularies shared with the other scripts.
tags, tag2idx, tag_count = helpers.read_tags()
words, word2idx, word_count = helpers.read_words()

# Get number of texts in data.
text_count = config.text.get_text_count()

# Read all posts in fixed-size chunks.
# TODO: Change (kept from the original source)
chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.POST,
    chunk_size=config.data.CHUNK_SIZE)
all_chunks = [chunk for chunk in chunk_reader]

# Split chunks into training and test portions.
chunks_train, chunks_test = train_test_split(
    all_chunks, test_size=config.data.TEST_FRACTION)

# Write one index file per split.
for chunks, target_filename in [
    (chunks_train, config.paths.TRAIN_DATA_IDX),
    (chunks_test, config.paths.TEST_DATA_IDX),
]:
    with open(config.paths.POST, 'rb') as f, \
            open(target_filename, 'w') as f_indices:
        for chunk in chunks:
            # NOTE(review): loop body truncated in the visible source --
            # restore the original per-chunk index-writing code here.
            pass