Example #1
import numpy as np

import helpers
import config

# Read tags
tags, tag2idx, tag_count = helpers.read_tags()

# Read words
words, word2idx, word_count = helpers.read_words()

# Load the saved cluster means
with open(config.paths.MU, 'rb') as f:
    mu = np.load(f)

# Get chunks
chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.TEST_DATA_IDX,
    chunk_size=config.data.CHUNK_SIZE)
chunks = list(chunk_reader)

# Load the cluster-to-tag mapping
cluster2tag = config.data.load_cluster_tags()

with open(config.paths.TEST_DATA_IDX, 'r') as f:

    # Count how many of the true tags are retrieved among the top k
    true_counts_at_k = {k: 0 for k in range(tag_count)}
    total_tag_counts = 0
    for chunk in chunks:

        # Convert the chunk to a sparse document-term matrix
        X, y_tags = helpers.chunk_to_sparse_mat(chunk, word_count)
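
Example #1 breaks off inside the chunk loop. Shown flat (without the surrounding `with` block), here is a minimal sketch of how the evaluation might continue, assuming Euclidean distance to the cluster means, that `cluster2tag` maps a cluster index to a tag index, and that `y_tags` holds one list of true tag indices per post; all of these are assumptions, since the real helpers are not shown:

from sklearn.metrics.pairwise import euclidean_distances

for chunk in chunks:
    X, y_tags = helpers.chunk_to_sparse_mat(chunk, word_count)
    dists = euclidean_distances(X, mu)      # (n_posts, K) distances to each mean
    ranked = np.argsort(dists, axis=1)      # cluster indices, nearest first
    for i, true_tags in enumerate(y_tags):
        total_tag_counts += len(true_tags)
        predicted = [cluster2tag[c] for c in ranked[i]]
        for k in range(tag_count):
            true_counts_at_k[k] += len(set(predicted[:k + 1]) & set(true_tags))

Dividing true_counts_at_k[k] by total_tag_counts would then give a recall-at-k figure.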
Example #2
import time

import numpy as np

import helpers
import config

# Read tags
tags, tag2idx, tag_count = helpers.read_tags()

# Read words
words, word2idx, word_count = helpers.read_words()

# Use one cluster per tag
K = tag_count

# Initialize cluster centers
mu = np.random.rand(K, word_count)

# Get chunks
chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.TRAIN_DATA_IDX,
    chunk_size=config.data.CHUNK_SIZE)  # TODO: Change
chunks = list(chunk_reader)

for iteration in range(config.algorithm.MAX_ITER):
    start = time.time()

    # Per-cluster accumulators for recomputing the means
    cluster_sums = {k: np.zeros((1, word_count)) for k in range(K)}
    cluster_counts = {k: 0 for k in range(K)}

    for chunk in chunks:

        # Convert the chunk to a sparse document-term matrix
        X, _ = helpers.chunk_to_sparse_mat(chunk, word_count)
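
Example #2 stops right after the sparse conversion. A plausible completion of one Lloyd-style k-means iteration, again shown flat as a sketch under the assumption that X is a scipy sparse matrix with one row per post (the update step would run once per outer iteration, after all chunks):

from sklearn.metrics.pairwise import euclidean_distances

for chunk in chunks:
    X, _ = helpers.chunk_to_sparse_mat(chunk, word_count)

    # Assignment step: attach every post in the chunk to its nearest mean
    nearest = np.argmin(euclidean_distances(X, mu), axis=1)
    for row, k in enumerate(nearest):
        cluster_sums[k] += X[row].toarray()
        cluster_counts[k] += 1

# Update step: move each mean to the centroid of its assigned posts
for k in range(K):
    if cluster_counts[k] > 0:
        mu[k] = cluster_sums[k].ravel() / cluster_counts[k]

print('Iteration {} took {:.1f}s'.format(iteration, time.time() - start))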
Example #3
from sklearn.model_selection import train_test_split

import helpers
import config

# Read tags
tags, tag2idx, tag_count = helpers.read_tags()

# Read words
words, word2idx, word_count = helpers.read_words()

# Get number of texts in data
text_count = config.text.get_text_count()

# Read chunks
chunk_reader = helpers.ChunkReader(
    post_filename=config.paths.POST,
    chunk_size=config.data.CHUNK_SIZE)  # TODO: Change
all_chunks = list(chunk_reader)

# Split the chunks into training and test sets
chunks_train, chunks_test = train_test_split(
    all_chunks, test_size=config.data.TEST_FRACTION)

for chunks, target_filename in [
    (chunks_train, config.paths.TRAIN_DATA_IDX),
    (chunks_test, config.paths.TEST_DATA_IDX),
]:

    with open(config.paths.POST, 'rb') as f, \
            open(target_filename, 'w') as f_indices:
        for chunk in chunks:
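
None of the examples show `helpers.ChunkReader` itself. A minimal stand-in consistent with how it is used above (constructed with `post_filename` and `chunk_size`, then iterated to exhaustion) could look like the following; this is a guess at the behavior, not the project's actual implementation:

class ChunkReader:
    """Yield lists of up to `chunk_size` consecutive lines from a file."""

    def __init__(self, post_filename, chunk_size):
        self.post_filename = post_filename
        self.chunk_size = chunk_size

    def __iter__(self):
        chunk = []
        with open(self.post_filename, 'r') as f:
            for line in f:
                chunk.append(line.rstrip('\n'))
                if len(chunk) == self.chunk_size:
                    yield chunk
                    chunk = []
            if chunk:  # emit a final, shorter chunk
                yield chunk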