# tok_neg_test, tok_pos_test = parallel_run(parse_tokens, test_neg), parallel_run(parse_tokens, test_pos)
test['paragraph_neg'], test['paragraph_pos'] = parallel_run(parse_paragraph, test_neg), parallel_run(parse_paragraph, test_pos)

# -- parameters to tune and set
WORDS_PER_SENTENCE = 50
SENTENCES_PER_PARAGRAPH = 50
PREPEND = False

log('normalizing training inputs...')
log(' --> building local word vector representation')
# Pad or truncate every sentence to WORDS_PER_SENTENCE indices, then every
# review to SENTENCES_PER_PARAGRAPH sentences, using an all-zero sentence
# as the paragraph-level padding element.
train_repr = normalize_sos(
    [
        normalize_sos(review, WORDS_PER_SENTENCE, prepend=PREPEND)
        for review in gb.get_indices(train['paragraph_pos'] + train['paragraph_neg'])
    ],
    SENTENCES_PER_PARAGRAPH,
    [0] * WORDS_PER_SENTENCE,
    PREPEND
)
train_text = np.array(train_repr)

log(' --> building global word vector representation')
# Same normalization, but with indices drawn from the global vocabulary.
global_train_repr = normalize_sos(
    [
        normalize_sos(review, WORDS_PER_SENTENCE, prepend=PREPEND)
        for review in global_gb.get_indices(train['paragraph_pos'] + train['paragraph_neg'])
    ],
    SENTENCES_PER_PARAGRAPH,
    [0] * WORDS_PER_SENTENCE,
    PREPEND
)
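# The nested normalize_sos calls above assume a pad-or-truncate helper whose
# real implementation lives elsewhere in this repo. A minimal sketch, with
# the signature inferred from the call sites (the prepend/truncation
# semantics here are an assumption, not the author's code):
def _normalize_sos_sketch(seq, length, pad=0, prepend=False):
    """Pad `seq` with `pad` (or truncate it) so that len(result) == length."""
    padding = [pad] * max(0, length - len(seq))
    out = padding + list(seq) if prepend else list(seq) + padding
    # When truncating, keep the tail if padding is prepended, else the head.
    return out[-length:] if prepend else out[:length]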
train_reviews = train_reviews[:NUM_TRAIN_REVIEWS]
train_labels.extend(dev_labels)
train_labels = train_labels[:NUM_TRAIN_REVIEWS]
test_reviews = test_reviews[:NUM_TEST_REVIEWS]
test_labels = test_labels[:NUM_TEST_REVIEWS]

log('Splitting training data into paragraphs')
train_text_sentences = parallel_run(parse_paragraph, train_reviews)
test_text_sentences = parallel_run(parse_paragraph, test_reviews)

log('normalizing training inputs...')
train_repr = normalize_sos(
    [
        normalize_sos(review, WORDS_PER_SENTENCE)
        for review in gb.get_indices(train_text_sentences)
    ],
    SENTENCES_PER_PARAGRAPH,
    [0] * WORDS_PER_SENTENCE
)
train_text = np.array(train_repr)

log('normalizing testing inputs...')
test_repr = normalize_sos(
    [
        normalize_sos(review, WORDS_PER_SENTENCE)
        for review in gb.get_indices(test_text_sentences)
    ],
    SENTENCES_PER_PARAGRAPH,
    [0] * WORDS_PER_SENTENCE
)
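# parallel_run is assumed to fan a function out over a list with a worker
# pool; a minimal sketch under that assumption (the real helper is defined
# elsewhere in this repo):
from multiprocessing import Pool

def _parallel_run_sketch(fn, data):
    """Apply `fn` to every element of `data` across a pool of processes."""
    with Pool() as pool:
        return pool.map(fn, data)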