def docs(dataset_name):
    p = utils.Progbar(target=(utils.lines_in_file(directories.RAW + dataset_name)))
    for i, d in enumerate(utils.load_json_lines(directories.RAW + dataset_name)):
        p.update(i + 1)
        yield d
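# A hedged usage sketch (not from the repo): docs() is a lazy generator over the raw JSON
# documents, so it can be consumed incrementally, e.g. to count mentions per document.
def count_mentions(dataset_name):
    return {d["document_features"]["doc_id"]: len(d["mentions"]) for d in docs(dataset_name)}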
def build_dataset(vectors, name, tune_fraction=0.0, reduced=False, columns=None):
    doc_vectors = utils.load_pickle(directories.MISC + name.replace("_reduced", "") +
                                    "_document_vectors.pkl")

    main_pairs = PairDataBuilder(columns)
    tune_pairs = PairDataBuilder(columns)
    main_mentions = MentionDataBuilder(columns)
    tune_mentions = MentionDataBuilder(columns)
    main_docs = DocumentDataBuilder(columns)
    tune_docs = DocumentDataBuilder(columns)

    print("Building dataset", name + ("/tune" if tune_fraction > 0 else ""))
    p = utils.Progbar(target=(2 if reduced else utils.lines_in_file(directories.RAW + name)))
    for i, d in enumerate(utils.load_json_lines(directories.RAW + name)):
        if reduced and i > 2:
            break
        p.update(i + 1)

        # Route this document's data to either the main or the tune builders.
        if reduced and tune_fraction != 0:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if i == 0 else (tune_pairs, tune_mentions, tune_docs)
        else:
            pairs, mentions, docs = (main_pairs, main_mentions, main_docs) \
                if random.random() > tune_fraction else (tune_pairs, tune_mentions, tune_docs)

        ms, ps = mentions.size(), pairs.size()
        mention_positions = {}
        for mention_num in sorted(d["mentions"].keys(), key=int):
            mention_positions[mention_num] = mentions.size()
            mentions.add_mention(d["mentions"][mention_num], vectors,
                                 doc_vectors[d["mentions"][mention_num]["doc_id"]])

        for key in sorted(d["labels"].keys(),
                          key=lambda k: (int(k.split()[1]), int(k.split()[0]))):
            k1, k2 = key.split()
            pairs.add_pair(d["labels"][key], mention_positions[k1], mention_positions[k2],
                           int(d["mentions"][k1]["doc_id"]),
                           int(d["mentions"][k1]["mention_id"]),
                           int(d["mentions"][k2]["mention_id"]),
                           d["pair_features"][key])

        me, pe = mentions.size(), pairs.size()
        docs.add_doc(ms, me, ps, pe, d["document_features"])

    suffix = ("_reduced" if reduced else "")
    if tune_mentions.size() > 0:
        tune_mentions.write(name + "_tune" + suffix)
        tune_pairs.write(name + "_tune" + suffix)
        tune_docs.write(name + "_tune" + suffix)
        main_mentions.write(name + "_train" + suffix)
        main_pairs.write(name + "_train" + suffix)
        main_docs.write(name + "_train" + suffix)
    else:
        main_mentions.write(name + suffix)
        main_pairs.write(name + suffix)
        main_docs.write(name + suffix)
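# A hedged usage sketch, not taken from the repo: the pickled word-vector path and the 0.15
# tune fraction are illustrative assumptions; `vectors` just needs to be whatever lookup
# MentionDataBuilder.add_mention expects. With tune_fraction > 0, build_dataset writes
# <name>_train and <name>_tune splits; with the default 0.0 it writes a single <name> split
# (plus a "_reduced" suffix when reduced=True).
def build_all_datasets(vectors_path=directories.MISC + 'word_vectors.pkl'):  # hypothetical path
    vectors = utils.load_pickle(vectors_path)  # assumption: vectors are pickled like other artifacts
    build_dataset(vectors, 'train', tune_fraction=0.15)  # -> train_train and train_tune
    build_dataset(vectors, 'dev')                        # -> dev
    build_dataset(vectors, 'test')                       # -> test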
def load_gold(dataset_name):
    gold = {}
    mention_to_gold = {}
    for doc_gold in utils.load_json_lines(directories.GOLD + dataset_name):
        did = int(list(doc_gold.keys())[0])
        gold[did] = doc_gold[str(did)]
        mention_to_gold[did] = {}
        for gold_cluster in doc_gold[str(did)]:
            for m in gold_cluster:
                mention_to_gold[did][m] = tuple(gold_cluster)
    return gold, mention_to_gold
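# A hedged illustration (not part of the repo) of how the second return value is meant to be
# used: mention_to_gold[did] maps a mention to the tuple of mentions in its gold cluster, so
# checking whether two mentions are gold-coreferent is a membership test.
def coreferent_in_gold(mention_to_gold, did, m1, m2):
    return m1 in mention_to_gold[did] and m2 in mention_to_gold[did][m1]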
def explore_pairwise_features():
    pos_sum, neg_sum = np.zeros(9), np.zeros(9)
    pos_count, neg_count = 0, 0
    for i, d in enumerate(utils.load_json_lines(directories.RAW + "train")):
        for key in d["labels"].keys():
            if d["labels"][key] == 1:
                pos_sum += d["pair_features"][key]
                pos_count += 1
            else:
                neg_sum += d["pair_features"][key]
                neg_count += 1
    print("positive counts", list(pos_sum))
    print("negative counts", list(neg_sum))
    # "feature odds" = p / (p + n), where p and n are the mean feature values over positive
    # (coreferent) and negative pairs: values near 1 indicate a feature that fires mostly on
    # coreferent pairs, values near 0 the opposite, and 0.5 an uninformative feature.
    print("feature odds", list(np.divide(pos_sum / pos_count,
                                         (pos_sum / pos_count + neg_sum / neg_count))))
    print()
def write_feature_names():
    raw_train = directories.RAW + 'train'
    try:
        utils.write_pickle(
            {f: i for i, f in enumerate(
                next(utils.load_json_lines(raw_train))["pair_feature_names"])},
            directories.MISC + 'pair_feature_names.pkl')
    except FileNotFoundError as e:
        if e.filename == raw_train:
            raise FileNotFoundError(
                'Raw training data not found. Perhaps you need to copy the original '
                'dataset first: %s' % e.filename) from e
        else:
            raise
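# A hedged companion sketch (not from the repo): read the pickle written above and invert it,
# so column indices in the pair-feature vectors can be mapped back to readable feature names.
def load_feature_names():
    name_to_index = utils.load_pickle(directories.MISC + 'pair_feature_names.pkl')
    return {i: f for f, i in name_to_index.items()}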
def main(model_path, dataset_name):
    docs = utils.load_pickle(model_path + dataset_name + '_processed_docs.pkl')
    for doc_data in utils.load_json_lines(directories.RAW + dataset_name):
        sentences = doc_data["sentences"]
        mid_to_mention = {int(m["mention_id"]): m for m in doc_data["mentions"].values()}
        mid_to_position = {mid: int(m["mention_num"]) for mid, m in mid_to_mention.items()}
        doc = docs[doc_data["document_features"]["doc_id"]]

        # Only visualize clusters with more than one mention.
        clusters = [c for c in doc.clusters if len(c) > 1]
        cluster_to_endpoints = {}
        for c in clusters:
            positions = [mid_to_position[mid] for mid in c]
            cluster_to_endpoints[c] = (min(positions), max(positions))
        sorted_clusters = sorted(clusters, key=lambda c: cluster_to_endpoints[c])

        # Greedily assign colors so that overlapping clusters get distinct colors, falling
        # back to the least recently used color when all colors are taken.
        color_last_usage = {i: -1 for i in range(len(COLORS))}
        active_clusters = []
        cluster_to_color = {}
        for c in sorted_clusters:
            start, end = cluster_to_endpoints[c]
            for a in list(active_clusters):
                if cluster_to_endpoints[a][1] < start:
                    active_clusters.remove(a)
            used_colors = [cluster_to_color[a] for a in active_clusters]
            sorted_colors = sorted((u, i) for i, u in color_last_usage.items())
            next_color = None
            for u, i in sorted_colors:
                if i not in used_colors:
                    next_color = i
                    break
            if next_color is None:
                next_color = sorted_colors[0][1]
            color_last_usage[next_color] = start
            cluster_to_color[c] = next_color
            active_clusters.append(c)

        # Build per-sentence, per-token bracket annotations marking each cluster's mentions.
        annotations = defaultdict(lambda: defaultdict(list))
        for i, c in enumerate(sorted_clusters):
            color = COLORS[cluster_to_color[c]]
            for m in c:
                mention = mid_to_mention[m]
                start, end = mention["start_index"], mention["end_index"] - 1
                annotations[mention["sent_num"]][start].append(
                    (color + "[" + ENDC, 1 + end))
                annotations[mention["sent_num"]][end].append(
                    (color + "]" + subscript(i) + ENDC, -1 - start))

        for i, s in enumerate(sentences):
            for j, sentence_annotations in annotations[i].items():
                sentence_annotations = sorted(sentence_annotations, key=itemgetter(1))
                for (annotation, priority) in sentence_annotations:
                    if priority > 0:
                        s[j] = annotation + s[j]
                    else:
                        s[j] = s[j] + annotation
            print(" ".join(s))
        print()
        print(80 * "=")
        print()
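# A hedged command-line entry point (not from the repo); the real project may invoke this
# module differently. It simply forwards two positional arguments to main().
if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        sys.exit("usage: %s <model_path> <dataset_name>" % sys.argv[0])
    main(sys.argv[1], sys.argv[2])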