def output_label_index():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    output_filename = fh.make_filename(output_dir, 'index_labels', 'html')
    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes',
                              'Republican-Dislikes', 'Republican-Likes'])
    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(html.make_header('Labels'))
        output_file.write(html.make_body_start())
        output_file.write(common.make_masthead(1))
        output_file.write(html.make_heading('Labels', align='center'))
        table_header = ['Label']
        output_file.write(html.make_table_start(style='sortable'))
        output_file.write(html.make_table_header(table_header))
        for index, code in enumerate(true.columns):
            code_name = code_names[index]
            link = html.make_link('label_' + html.replace_chars(code_name) + '.html', code_name)
            row = [link]
            output_file.write(html.make_table_row(row))
        output_file.write(html.make_table_end())
        output_file.write(html.make_body_end())
        output_file.write(html.make_footer())
def output_response_index():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    output_filename = fh.make_filename(output_dir, 'index_responses', 'html')
    datasets = ['Democrat-Dislikes', 'Democrat-Likes', 'Republican-Dislikes', 'Republican-Likes']
    text_file_dir = fh.makedirs(defines.data_dir, 'rnn')
    text = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn', 'json'))
    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(html.make_header('Democrats vs Republicans'))
        output_file.write(html.make_body_start())
        output_file.write(common.make_masthead(0))
        for dataset in datasets:
            true = labels.get_labels([dataset])
            all_items = ds.get_all_documents(dataset)
            train_items = ds.get_train_documents(dataset, 0, 0)
            dev_items = ds.get_dev_documents(dataset, 0, 0)
            test_items = ds.get_test_documents(dataset, 0)
            output_file.write(html.make_heading(dataset, align='center'))
            table_header = ['Response', 'Split', 'Snippet']
            col_widths = [130, 80, 800]
            output_file.write(html.make_table_start(col_widths=col_widths, style='sortable'))
            output_file.write(html.make_table_header(table_header))
            for subset in [train_items, dev_items, test_items]:
                subset.sort()
                for item in subset:
                    if item in train_items:
                        split = 'train'
                    elif item in dev_items:
                        split = 'dev'
                    else:
                        split = 'test'
                    words = text[item]
                    response = ' '.join(words)
                    # truncate long responses for the snippet column
                    if len(response) > 100:
                        response = response[:100] + '. . .'
                    num = item.split('_')[1]
                    link = html.make_link(item + '.html', num, new_window=False)
                    link2 = html.make_link(item + '.html', response, new_window=False)
                    row = [link, split, link2]
                    output_file.write(html.make_table_row(row))
            output_file.write(html.make_table_end())
        output_file.write(html.make_body_end())
        output_file.write(html.make_footer())
def output_label_pages():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    blm_dir = fh.makedirs(defines.exp_dir,
                          'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes',
                          'test_fold_0', 'L1LR_all_groups_a0', 'models')
    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes',
                              'Republican-Dislikes', 'Republican-Likes'])
    for code_index, code in enumerate(true.columns):
        code_name = code_names[code_index]
        output_filename = fh.make_filename(output_dir, 'label_' + html.replace_chars(code_name), 'html')
        with codecs.open(output_filename, 'w') as output_file:
            output_file.write(html.make_header(code_name))
            output_file.write(html.make_body_start())
            output_file.write(common.make_masthead(-1))
            output_file.write(html.make_heading('Label: ' + code_name, align='center'))
            output_file.write(html.make_paragraph('Coefficients for unigram model:', align="center"))
            table_header = ['Word', 'Value', 'Scaled']
            output_file.write(html.make_table_start(style='sortable'))
            output_file.write(html.make_table_header(table_header))
            # load coefficients from the unigram model
            model_filename = fh.make_filename(blm_dir, re.sub(' ', '_', code), 'json')
            model = fh.read_json(model_filename)
            intercept = float(model.get('intercept', 1.0))
            if 'coefs' in model:
                coefs = dict(model['coefs'])
                tokens = sorted(coefs.keys())
                for token in tokens:
                    cmax = 255
                    colours = [(0, 0, 0)] * 2
                    word = token.split('_')[-1]
                    coef = coefs[token]
                    # shade the 'Scaled' cell by coef/|intercept|: blue for positive, red for negative
                    scaled_coef = coef / abs(intercept)
                    val = int(cmax - (min(1, abs(scaled_coef)) * cmax))
                    if coef > 0:
                        colours += [(val, val, cmax)]
                    else:
                        colours += [(cmax, val, val)]
                    if len(word) > 0 and word[0] not in ascii_lowercase:
                        word = '_' + word
                    link = html.make_link('wordtype_' + word + '.html', word)
                    row = [link, '{:0.2f}'.format(coef), word]
                    output_file.write(html.make_table_row(row, colours=colours))
            output_file.write(html.make_table_end())
            output_file.write(html.make_body_end())
            output_file.write(html.make_footer())
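# The red/blue shading above is repeated almost verbatim in output_responses()
# and output_words(). A minimal sketch of that mapping as a standalone helper
# (hypothetical; not currently called by the functions in this file):
def scale_coef_to_colour(coef, intercept, cmax=255):
    """Map a coefficient to an RGB tuple, scaled by the model intercept.

    Saturation grows with |coef / intercept|, capped at 1:
    blue for positive coefficients, red for negative (or zero).
    """
    scaled = coef / abs(intercept)
    val = int(cmax - (min(1, abs(scaled)) * cmax))
    if coef > 0:
        return (val, val, cmax)   # shade of blue
    return (cmax, val, val)       # shade of red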
def output_word_index():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    blm_dir = fh.makedirs(defines.exp_dir,
                          'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes',
                          'test_fold_0', 'L1LR_all_groups_a0', 'models')
    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes',
                              'Republican-Dislikes', 'Republican-Likes'])
    word_list = set()
    for code in true.columns:
        # load coefficients from the unigram model
        model_filename = fh.make_filename(blm_dir, html.replace_chars(code), 'json')
        model = fh.read_json(model_filename)
        if 'coefs' in model:
            coefs = dict(model['coefs'])
            # strip the '_n1_' feature prefix to recover the word itself
            words = [word[4:] for word in coefs.keys()]
            word_list.update(words)
    word_list = list(word_list)
    word_list.sort()
    output_filename = fh.make_filename(output_dir, 'index_words', 'html')
    with codecs.open(output_filename, 'w') as output_file:
        output_file.write(html.make_header('Words'))
        output_file.write(html.make_body_start())
        output_file.write(common.make_masthead(2))
        output_file.write(html.make_heading('Words', align='center'))
        table_header = ['Words']
        output_file.write(html.make_table_start(style='sortable'))
        output_file.write(html.make_table_header(table_header))
        for word in word_list:
            link = html.make_link('wordtype_' + html.replace_chars(word) + '.html', word)
            row = [link]
            output_file.write(html.make_table_row(row))
        output_file.write(html.make_table_end())
        output_file.write(html.make_body_end())
        output_file.write(html.make_footer())
def output_responses(dataset):
    print dataset
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    rnn_dir = fh.makedirs(defines.exp_dir, 'rnn', 'bayes_opt_rnn_LSTM_reuse_mod_34_rerun',
                          'fold0', 'responses')
    blm_dir = fh.makedirs(defines.exp_dir,
                          'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes',
                          'test_fold_0', 'L1LR_all_groups_a0', 'models')
    predictions_dir = fh.makedirs(defines.exp_dir,
                                  'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes',
                                  'test_fold_0', 'L1LR_all_groups_a0', 'predictions')
    train_pred = pd.read_csv(fh.make_filename(predictions_dir, dataset + '_train', 'csv'),
                             header=0, index_col=0)
    test_pred = pd.read_csv(fh.make_filename(predictions_dir, dataset + '_test', 'csv'),
                            header=0, index_col=0)
    text_file_dir = fh.makedirs(defines.data_dir, 'rnn')
    text = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn', 'json'))
    true = labels.get_labels([dataset])
    all_items = ds.get_all_documents(dataset)
    word_list = common.get_word_list(true.columns, blm_dir)
    train_words = {}
    test_words = {}
    for i in all_items:
        true_i = true.loc[i]
        rnn_file = fh.make_filename(rnn_dir, i, 'csv')
        # per-response CSV of RNN probabilities has no header row
        rnn_vals = pd.read_csv(rnn_file, header=None)
        rnn_vals.columns = true.columns
        if i in train_pred.index:
            pred_i = train_pred.loc[i]
            train_item = True
        else:
            pred_i = test_pred.loc[i]
            train_item = False
        output_filename = fh.make_filename(output_dir, i, 'html')
        with codecs.open(output_filename, 'w') as output_file:
            output_file.write(html.make_header(i))
            output_file.write(html.make_body_start())
            output_file.write(common.make_masthead(-1))
            output_file.write(html.make_heading('Response: ' + i, align='center'))
            output_file.write(html.make_paragraph('The table below shows coefficients for the '
                                                  'unigram model (red-blue)', align="center"))
            output_file.write(html.make_paragraph('and sequence element probabilities for the '
                                                  'LSTM (white-green).', align="center"))
            links = [html.make_link('wordtype_' + w + '.html', w) if w in word_list else w
                     for w in text[i]]
            table_header = ['Label'] + links + ['True', 'Pred.']
            output_file.write(html.make_table_start(style='t1'))
            output_file.write(html.make_table_header(table_header))
            for code_index, code in enumerate(true.columns):
                # load coefficients from the unigram model
                words = text[i]
                model_filename = fh.make_filename(blm_dir, re.sub(' ', '_', code), 'json')
                model = fh.read_json(model_filename)
                intercept = float(model.get('intercept', 1.0))
                if 'coefs' in model:
                    coefs = dict(model['coefs'])
                    colours = [str((0, 0, 0))]
                    for word in words:
                        coef = coefs.get('_n1_' + word, 0.0) / abs(intercept)
                        val = int(255 - (min(1, abs(coef)) * 255))
                        if coef > 0:
                            colours += [(val, val, 255)]
                        else:
                            colours += [(255, val, val)]
                else:
                    colours = [str((0, 0, 0))]
                    colours += [(255, 255, 255) for w in words]
                colours += [str((0, 0, 0))] * 2
                code_name = code_names[code_index]
                link = html.make_link('label_' + html.replace_chars(code_name) + '.html', code_name)
                row = [link] + words + [str(true_i[code]), str(int(pred_i[code])) + ' (LR)']
                output_file.write(html.make_table_row(row, colours=colours))
                # tally the words on which the RNN fires (p >= 0.5), by split
                for i_v, v in enumerate(rnn_vals[code].values):
                    if v >= 0.5:
                        focal_word = text[i][i_v]
                        counts = train_words if train_item else test_words
                        counts[focal_word] = counts.get(focal_word, 0) + 1
                colours = [str((0, 0, 0))]
                vals = [int(235 - (v * 235)) for v in rnn_vals[code]]
                colours += [(v, 235, v) for v in vals]
                colours += [str((0, 0, 0))] * 2
                row = [' '] + text[i] + [' ', str(int(rnn_vals[code].max() >= 0.5)) + ' (RNN)']
                output_file.write(html.make_table_row(row, colours=colours))
            output_file.write(html.make_table_end())
            output_file.write(html.make_heading('LSTM Gates', align='center'))
            output_file.write(html.make_paragraph('The plot below shows LSTM gate values at each '
                                                  'sequence element.', align="center"))
            output_file.write(html.make_paragraph('Each grey line is one dimension; the colored '
                                                  'line shows the mean.', align="center"))
            output_file.write(html.make_image(os.path.join('gate_plots', i + '_gates.png')))
            output_file.write(html.make_heading('LSTM vectors', align='center'))
            output_file.write(html.make_paragraph('The plot below shows the LSTM hidden and memory '
                                                  'nodes for each sequence element.', align="center"))
            output_file.write(html.make_paragraph('Vectors have been projected to a common space.',
                                                  align="center"))
            output_file.write(html.make_image(os.path.join('vector_plots', i + '_vectors.png')))
            output_file.write(html.make_body_end())
            output_file.write(html.make_footer())
    return train_words, test_words
def output_words():
    output_dir = fh.makedirs(defines.web_dir, 'DRLD')
    blm_dir = fh.makedirs(defines.exp_dir,
                          'Democrat-Dislikes_Democrat-Likes_Republican-Dislikes_Republican-Likes',
                          'test_fold_0', 'L1LR_all_groups_a0', 'models')
    text_file_dir = fh.makedirs(defines.data_dir, 'rnn')
    text = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn', 'json'))
    vocab = fh.read_json(fh.make_filename(text_file_dir, 'ngrams_n1_m1_rnn_vocab', 'json'))
    true = labels.get_labels(['Democrat-Dislikes', 'Democrat-Likes',
                              'Republican-Dislikes', 'Republican-Likes'])
    word_list = set()
    for code in true.columns:
        # load coefficients from the unigram model
        model_filename = fh.make_filename(blm_dir, html.replace_chars(code), 'json')
        model = fh.read_json(model_filename)
        if 'coefs' in model:
            coefs = dict(model['coefs'])
            # strip the '_n1_' feature prefix to recover the word itself
            words = [word[4:] for word in coefs.keys()]
            word_list.update(words)
    word_list = list(word_list)
    # build an index from each word to the items in which it appears
    word_index = {}
    order = true.index.tolist()
    random.shuffle(order)
    for item in order:
        words = text[item]
        for word in words:
            if word in word_index:
                word_index[word].append(item)
            else:
                word_index[word] = [item]
    for word in word_list:
        output_filename = fh.make_filename(output_dir, 'wordtype_' + word, 'html')
        with codecs.open(output_filename, 'w') as output_file:
            output_file.write(html.make_header(word))
            output_file.write(html.make_body_start())
            output_file.write(common.make_masthead(-1))
            output_file.write(html.make_heading('Word: ' + word, align='center'))
            if word in word_index:
                output_file.write(html.make_paragraph('Sample usage:', align='center'))
                item_list = word_index[word][:]
                random.shuffle(item_list)
                # show up to five example contexts (ten words either side)
                for item in item_list[:5]:
                    item_text = text[item]
                    occurrence_index = item_text.index(word)
                    start = max(0, occurrence_index - 10)
                    end = min(len(item_text), occurrence_index + 10)
                    item_text = ['<b>' + w + '</b>' if w == word else w for w in item_text]
                    link = html.make_link(item + '.html', ' '.join(item_text[start:end]))
                    output_file.write(html.make_paragraph(link, align="center", id="psmall"))
            output_file.write(html.make_paragraph('Unigram model coefficients for each label:',
                                                  align='center'))
            table_header = ['Label', 'Value', 'Scaled']
            output_file.write(html.make_table_start(style='sortable'))
            output_file.write(html.make_table_header(table_header))
            for code_index, code in enumerate(true.columns):
                # load coefficients from the unigram model
                model_filename = fh.make_filename(blm_dir, re.sub(' ', '_', code), 'json')
                model = fh.read_json(model_filename)
                intercept = float(model.get('intercept', 1.0))
                cmax = 255
                if 'coefs' in model:
                    coefs = dict(model['coefs'])
                    colours = [str((0, 0, 0))] * 2
                    coef = coefs.get('_n1_' + word, 0.0)
                    scaled_coef = coef / abs(intercept)
                    val = int(cmax - (min(1, abs(scaled_coef)) * cmax))
                    if coef > 0:
                        colours += [(val, val, cmax)]
                    else:
                        colours += [(cmax, val, val)]
                else:
                    coef = 0.0
                    colours = [str((0, 0, 0)), str((0, 0, 0)), str((cmax, cmax, cmax))]
                code_name = code_names[code_index]
                link = html.make_link('label_' + html.replace_chars(code_name) + '.html', code_name)
                row = [link, '{:0.2f}'.format(coef), word]
                output_file.write(html.make_table_row(row, colours=colours))
            output_file.write(html.make_table_end())
            output_file.write(html.make_body_end())
            output_file.write(html.make_footer())
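# A minimal driver sketch (an assumed entry point; the original module may be
# invoked differently). Builds all index and detail pages, then pools the
# per-dataset word counts that output_responses() returns for words on which
# the RNN fires.
if __name__ == '__main__':
    datasets = ['Democrat-Dislikes', 'Democrat-Likes',
                'Republican-Dislikes', 'Republican-Likes']
    output_label_index()
    output_response_index()
    output_label_pages()
    output_word_index()
    output_words()
    all_train_words = {}
    all_test_words = {}
    for dataset in datasets:
        train_words, test_words = output_responses(dataset)
        for word, count in train_words.items():
            all_train_words[word] = all_train_words.get(word, 0) + count
        for word, count in test_words.items():
            all_test_words[word] = all_test_words.get(word, 0) + count
    print 'distinct words flagged by RNN (train):', len(all_train_words)
    print 'distinct words flagged by RNN (test):', len(all_test_words)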