def sentences(config, normalize, include_historic=True, include_modern=True):
    """Yields tokenized (and optionally name-normalized) sentences from the
    fiction files under config.data_dir, keeping only pre-1860 and/or
    post-1980 decades depending on the include_* flags."""
    if normalize:
        names = utils.load_pickle(config.names)
        wordlike_names = utils.load_pickle(config.wordlike_names)
    for decade in os.listdir(config.data_dir):
        decade_int = int(decade[:-1])
        if not ((include_historic and decade_int < 1860) or
                (include_modern and decade_int > 1980)):
            continue
        print "DECADE:", decade
        path = os.path.join(config.data_dir, decade)
        for fname in utils.logged_loop(
                [fname for fname in os.listdir(path) if fname[:3] == "fic"]):
            with open(os.path.join(path, fname)) as f:
                for line in f:
                    line = line.strip().replace("<p>", "")
                    for s in nltk.tokenize.sent_tokenize(line):
                        # Skip lines that are corpus markup rather than text.
                        if "@" in s or "Page image" in s or "PRINTED" in s or "nbsp" in s:
                            continue
                        words = nltk.tokenize.word_tokenize(s)
                        if normalize:
                            words = normalize_sentence(words, names, wordlike_names)
                        yield words
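# A minimal usage sketch, not part of the original module: `demo_count_sentences`
# is a hypothetical helper and assumes a config object exposing at least
# data_dir (plus names and wordlike_names when normalize=True).
def demo_count_sentences(config):
    # Iterate the generator without normalization and count how many
    # tokenized sentences it yields.
    total = 0
    for words in sentences(config, normalize=False):
        total += 1
    return total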
def make_attn_word_level(data, tokenizer, cased):
    """Converts each example's token-level attention maps to word-level maps
    by aligning wordpiece tokens back to the original words."""
    for features in utils.logged_loop(data):
        words_to_tokens = tokenize_and_align(tokenizer, features["words"], cased)
        assert sum(len(word) for word in words_to_tokens) == len(features["tokens"])
        features["attns"] = np.stack([[
            get_word_word_attention(attn_head, words_to_tokens)
            for attn_head in layer_attns]
            for layer_attns in features["attns"]])
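# A simplified sketch, not the repository's get_word_word_attention, of one
# common way to aggregate token-level attention into word-level attention:
# attention *to* a split word is summed over its wordpieces, and attention
# *from* a split word is averaged, so each row still sums to roughly one.
# It assumes numpy is imported as np (as in the surrounding code) and that
# words_to_tokens maps each word to the list of its token indices.
def word_level_attention_sketch(token_attention, words_to_tokens):
    n_words = len(words_to_tokens)
    word_attention = np.zeros([n_words, n_words])
    for i, from_tokens in enumerate(words_to_tokens):
        for j, to_tokens in enumerate(words_to_tokens):
            block = token_attention[np.ix_(from_tokens, to_tokens)]
            # Sum over the destination word's pieces, then average over the
            # source word's pieces.
            word_attention[i, j] = block.sum(axis=1).mean()
    return word_attention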
def write_probable_pairs(dataset_name, action_space_path, scores):
    """Filters candidate coreference pairs per document by score and margin
    thresholds and writes the surviving pairs to a pickle."""
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in utils.logged_loop(scores):
        doc_scores = scores[did]
        # Rank pairs by how much their score beats the "no antecedent" option;
        # sorting lets the loop below stop at the first below-threshold pair.
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] -
                       (-1 - 0.3 * doc_scores[(-1, pr[1])]),
                       reverse=True)
        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

        # Drop pairs that are not within MARGIN_THRESHOLD of the best score
        # seen for the same anaphor.
        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair],
                                          -1 - 0.3 * doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]], doc_scores[pair])
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [
            p for p in probable_pairs[did]
            if doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print "num docs:", len(scores)
    print "avg size without filter: {:.1f}".format(total_pairs / float(len(scores)))
    print "avg size: {:.1f}".format(total_size / float(len(scores)))
    print "margin removals size: {:.1f}".format(margin_removals / float(len(scores)))
    utils.write_pickle(probable_pairs,
                       action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
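# Illustrative sketch only (not from the original module): how the pruning
# score above is computed for a single candidate. It assumes the score keys
# are (antecedent, anaphor) mention-index pairs, with antecedent -1 standing
# for "no antecedent", and SCORE_THRESHOLD the module-level constant used above.
def demo_pair_score(doc_scores, pair):
    # The candidate survives the first filter when this value is at least
    # SCORE_THRESHOLD, i.e. when its score sufficiently beats the scaled
    # "no antecedent" score for the same anaphor.
    return doc_scores[pair] - (-1 - 0.3 * doc_scores[(-1, pair[1])])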
def make_attn_word_level(data, bert_attn_tokenizer, self_attn_tokenizer, args):
    """Variant of the word-level conversion that aligns the words against two
    tokenizations: one for the self-attention model and one for BERT."""
    for features in utils.logged_loop(data):
        self_words_to_tokens = tokenize_and_align(
            self_attn_tokenizer, features["words"], args, 'self',
            features['self_tokens'])
        bert_words_to_tokens = tokenize_and_align(
            bert_attn_tokenizer, features["words"], args, 'bert',
            features['bert_tokens'])
        assert sum(len(word) for word in self_words_to_tokens) == len(
            features["self_tokens"])
        assert sum(len(word) for word in bert_words_to_tokens) == len(
            features["bert_tokens"])
        features["attns"] = np.stack([[
            get_word_word_attention(attn_head, self_words_to_tokens,
                                    bert_words_to_tokens, args)
            for attn_head in layer_attns]
            for layer_attns in features["attns"]])
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--attn-data-file", required=True,
                        help="Pickle file containing extracted attention maps.")
    parser.add_argument("--outfile", required=True,
                        help="Where to write out the distances between heads.")
    args = parser.parse_args()

    print("Loading attention data")
    data = utils.load_pickle(args.attn_data_file)

    print("Computing head distances")
    js_distances = np.zeros([144, 144])
    for doc in utils.logged_loop(data, n_steps=None):
        if "attns" not in doc:
            continue
        tokens, attns = doc["tokens"], np.array(doc["attns"])
        # Flatten the 12x12 layer/head grid into 144 attention maps.
        attns_flat = attns.reshape([144, attns.shape[2], attns.shape[3]])
        for head in range(144):
            # Smooth the distributions so the log terms stay finite, then
            # accumulate the Jensen-Shannon divergence between this head and
            # every head (summed over all token positions).
            head_attns = np.expand_dims(attns_flat[head], 0)
            head_attns_smoothed = (0.001 / head_attns.shape[1]) + (head_attns * 0.999)
            attns_flat_smoothed = (0.001 / attns_flat.shape[1]) + (attns_flat * 0.999)
            m = (head_attns_smoothed + attns_flat_smoothed) / 2
            js = -head_attns_smoothed * np.log(m / head_attns_smoothed)
            js += -attns_flat_smoothed * np.log(m / attns_flat_smoothed)
            js /= 2
            js = js.sum(-1).sum(-1)
            js_distances[head] += js

    utils.write_pickle(js_distances, args.outfile)
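# A small follow-up sketch, not part of the original script: given the pickle
# written by main(), report the pair of distinct heads with the smallest
# accumulated JS divergence. Assumes numpy as np and the same utils module.
def most_similar_heads(distances_path):
    d = np.array(utils.load_pickle(distances_path), dtype=float)
    np.fill_diagonal(d, np.inf)  # ignore a head's distance to itself
    head_a, head_b = np.unravel_index(np.argmin(d), d.shape)
    # Given the reshape above, heads are indexed layer-major:
    # index = 12 * layer + head_within_layer.
    return (head_a // 12, head_a % 12), (head_b // 12, head_b % 12)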
def examples_in_batches(examples, batch_size):
    """Yields successive slices of `examples` with up to batch_size items each."""
    for i in utils.logged_loop(range(1 + ((len(examples) - 1) // batch_size))):
        yield examples[i * batch_size:(i + 1) * batch_size]
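# A quick usage sketch (illustrative only): 10 examples with batch_size 4
# should come back as batches of sizes 4, 4, and 2.
def demo_batching():
    batches = list(examples_in_batches(list(range(10)), batch_size=4))
    assert [len(b) for b in batches] == [4, 4, 2]
    return batches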