Esempio n. 1
0
def sort_mentions_for_pronoun(ment, ant_list, ment_attr, sents, trees, heads):
    """Reorder the antecedent candidates of ``ment`` by enclosing clause.

    The walk starts at the lowest tree node covering the head span of
    ``ment`` and climbs through the parents.  At every node whose label
    starts with 'S', and at the node without a parent, each candidate of
    ``ant_list`` that has not been ranked yet and whose (start, end) lies
    inside the node span is ranked next, in ``ant_list`` order.

    Returns ``ant_list`` unchanged when no node is found for the head span,
    otherwise the re-ordered list.  When some candidate could not be ranked
    an error is logged and the process exits with status 1.
    ``ment_attr`` is not used.
    """
    head_span, _, _ = coref.mention_head(ment, sents, trees, heads)
    current = trees[ment[0]].get_nodes('lowest', head_span[0], head_span[1])
    if current is None:
        return ant_list

    ranked = []      # candidates in the order they were reached
    reached = set()  # same candidates, for the membership test
    while current is not None:
        if current.label.startswith('S') or current.parent is None:
            for cand in ant_list:
                if cand in reached:
                    continue
                if (current.span[0] <= cand[1]
                        and cand[2] <= current.span[1]):
                    ranked.append(cand)
                    reached.add(cand)
        current = current.parent

    if len(ranked) != len(ant_list):
        logger.error("Inconsistent size 'ant_list': %d != %d" %
                     (len(ranked), len(ant_list)))
        sys.exit(1)

    return ranked
Esempio n. 2
0
File: sieve.py Progetto: bgshin/mps
 def check_ordered_mentions(self):
     """Print diagnostics on the ordering of the stored mentions.

     For every document part in ``self.data`` and every sentence index
     ``i``, each node span of ``trees[i]`` is numbered in iteration order.
     Then, for every mention listed under ``doc_mentions[i]``, two lines
     are printed:

     * ``True`` when the sentence index stored in the mention equals ``i``;
     * ``True`` when the tree position of the mention is not smaller than
       the position of the previous mention of that sentence.

     The position is looked up by the mention span, or, when that span is
     not a tree node, by the head span returned by
     ``coreference.mention_head`` (a ``KeyError`` is raised if that span is
     not a node either).
     """
     data = self.data
     for doc in data:
         for part in data[doc]:
             section = data[doc][part]
             sents = section['text']
             trees = section['parses']
             heads = section['heads']
             # range() instead of xrange(): same iteration, but also
             # defined on Python 3.
             for i in range(len(sents)):
                 sent_ments = section['doc_mentions'][i]
                 tree_order = {}
                 for position, node in enumerate(trees[i]):
                     tree_order[node.span] = position
                 prev_idx = -1
                 for ment in sent_ments:
                     # Parenthesised single-argument print gives the same
                     # output on Python 2 and Python 3.
                     print(True if (ment[0] == i) else False)
                     span = (ment[1], ment[2])
                     if span in tree_order:
                         idx = tree_order[span]
                     else:
                         head_span, head_word, head_pos = \
                             coreference.mention_head(
                                 ment, sents, trees, heads)
                         idx = tree_order[head_span]
                     print(True if (idx >= prev_idx) else False)
                     prev_idx = idx
Esempio n. 3
0
def extract_named_entitiy_mentions(clusters_by_head, sent_idx, sents, trees,
                                   heads, sner):
    """Record every named-entity mention of one sentence under its head span.

    Each key of ``sner[sent_idx]`` is treated as a mention; its head span is
    obtained from ``coref.mention_head`` and the mention is flagged with
    ``True`` in ``clusters_by_head[head_span]``.  Nothing is returned.
    """
    for ner_ment in sner[sent_idx].keys():
        span_of_head, _, _ = coref.mention_head(
            ner_ment, sents, trees, heads)
        clusters_by_head[span_of_head][ner_ment] = True
Esempio n. 4
0
def remove_phrase_after_head(attr, ment, sents, trees, heads):
    """Return the mention text, cut after the head when a phrase follows it.

    ``attr`` supplies the head position (``"head_idx"``) and the mention
    text (``"surface"``).  When fewer than two token positions remain
    between the head and the end of the mention, the stored surface is
    returned as is.  Otherwise the POS of the token right after the head is
    requested from ``coref.mention_head``: if it is "," or starts with "W",
    the result is the lower-cased, space-joined tokens from the mention
    start up to and including the head; in any other case it is the
    lower-cased surface.
    """
    sent_idx, start_idx, end_idx = ment
    head_idx = attr["head_idx"]
    surface = attr["surface"]

    after_head = head_idx + 1
    if after_head + 1 > end_idx:  # Unlikely to have a phrase
        return surface

    _, _, pos = coref.mention_head(
        (sent_idx, after_head, after_head + 1), sents, trees, heads)
    if pos == "," or pos.startswith("W"):
        kept_tokens = sents[sent_idx][start_idx:after_head]
        return " ".join(kept_tokens).lower()
    return surface.lower()
Esempio n. 5
0
def sort_mentions_for_pronoun(ment, ant_list, ment_attr, sents, trees, heads):
    """Sort candidate antecedents by the clause level that first holds them.

    Beginning with the lowest tree node over the head span of ``ment``, every
    ancestor is visited in turn.  Nodes whose label starts with 'S', and the
    parentless node, assign the next rank to each still unranked member of
    ``ant_list`` whose (start, end) falls within the node span.

    Returns ``ant_list`` itself if the tree has no node for the head span;
    otherwise the candidates sorted by rank.  If not every candidate got a
    rank, an error is logged and ``sys.exit(1)`` is called.
    ``ment_attr`` is not used.
    """
    head_span, _head_word, _head_pos = coref.mention_head(
        ment, sents, trees, heads)
    node = trees[ment[0]].get_nodes('lowest', head_span[0], head_span[1])
    if node is None:
        return ant_list

    rank = {}
    while node is not None:
        if node.label.startswith('S') or node.parent is None:
            for ant in ant_list:
                enclosed = (ant not in rank
                            and node.span[0] <= ant[1]
                            and ant[2] <= node.span[1])
                if enclosed:
                    # Ranks are handed out consecutively, starting at 0.
                    rank[ant] = len(rank)
        node = node.parent

    by_rank = sorted(rank, key=rank.get)
    if len(by_rank) != len(ant_list):
        logger.error("Inconsistent size 'ant_list': %d != %d" %
                     (len(by_rank), len(ant_list)))
        sys.exit(1)

    return by_rank
Esempio n. 6
0
 def check_ordered_mentions(self):
     """Print whether the stored mentions follow tree-traversal order.

     For every document part in ``self.data`` and every sentence index
     ``i``, the node spans of ``trees[i]`` are numbered in iteration order.
     For each mention listed under ``doc_mentions[i]`` two lines are
     printed:

     * ``True`` when the sentence index stored in the mention equals ``i``;
     * ``True`` when the tree position of the mention is greater than or
       equal to the position of the previous mention of the sentence,
       i.e. the mention does not break the traversal order.

     The position is looked up by the mention span or, when that span is
     not a tree node, by the head span returned by
     ``coreference.mention_head`` (a ``KeyError`` is raised if that span is
     not a node either).
     """
     data = self.data
     for doc in data:
         for part in data[doc]:
             section = data[doc][part]
             sents = section['text']
             trees = section['parses']
             heads = section['heads']
             # range() instead of xrange(): same iteration, but also
             # defined on Python 3.
             for i in range(len(sents)):
                 sent_ments = section['doc_mentions'][i]
                 tree_order = {}
                 for position, node in enumerate(trees[i]):
                     tree_order[node.span] = position
                 prev_idx = -1
                 for ment in sent_ments:
                     # Parenthesised single-argument print gives the same
                     # output on Python 2 and Python 3.
                     print(True if (ment[0] == i) else False)
                     span = (ment[1], ment[2])
                     if span in tree_order:
                         idx = tree_order[span]
                     else:
                         head_span, head_word, head_pos = \
                             coreference.mention_head(
                                 ment, sents, trees, heads)
                         idx = tree_order[head_span]
                     # Ordered means "not before the previous mention";
                     # testing for equality would report every correctly
                     # advancing mention as out of order.
                     print(True if (idx >= prev_idx) else False)
                     prev_idx = idx
Esempio n. 7
0
def remove_phrase_after_head(attr, ment, sents, trees, heads):
    """Strip what follows the head of a mention when it looks like a phrase.

    Reads ``attr['head_idx']`` and ``attr['surface']``.  If the mention has
    fewer than two token positions after the head, ``attr['surface']`` is
    returned unchanged.  Otherwise ``coref.mention_head`` is asked for the
    POS of the token following the head; a POS of ',' or one starting with
    'W' makes the function return the lower-cased tokens from the mention
    start through the head, joined by single spaces.  Any other POS yields
    the lower-cased surface.
    """
    sent_idx, start_idx, end_idx = ment

    head_idx = attr['head_idx']
    surface = attr['surface']
    if head_idx + 2 > end_idx:  # Unlikely to have a phrase
        return surface

    next_idx = head_idx + 1
    probe = (sent_idx, next_idx, next_idx + 1)
    _span, _word, pos = coref.mention_head(probe, sents, trees, heads)

    # Position at which the mention is cut; None keeps the whole surface.
    cut_idx = None
    if pos == ',' or pos.startswith('W'):
        cut_idx = next_idx

    if cut_idx is None:
        shortened = surface
    else:
        shortened = ' '.join(sents[sent_idx][start_idx:cut_idx])
    return shortened.lower()
Esempio n. 8
0
def remove_conll_spurious_mentions(ments, sents, trees, heads, sner, params):
    """Delete from ``ments`` every mention matching one of the filters.

    A mention is dropped, in place, when at least one of these holds:

    * ``is_generic`` accepts it;
    * it is tagged 'GPE' in ``sner`` and its lower-cased text is listed in
      ``dictionaries.gpe_acronyms``;
    * its lower-cased text is listed in ``dictionaries.stop_words``;
    * ``start_with_stop_prefixes`` or ``end_with_stop_suffixes`` accepts
      its lower-cased text;
    * its head span is tagged 'PERCENT' or 'MONEY' in ``sner``.

    All filters are evaluated for every mention.  ``params`` is not used and
    nothing is returned.
    """
    doomed = []
    for ment in ments.keys():
        verdicts = [is_generic(ment, sents, trees, heads)]

        surface = coref.mention_text(ment, sents).lower()
        sent_ner = sner[ment[0]]

        verdicts.append(ment in sent_ner
                        and sent_ner[ment] == 'GPE'
                        and surface in dictionaries.gpe_acronyms)
        verdicts.append(surface in dictionaries.stop_words)
        verdicts.append(start_with_stop_prefixes(surface))
        verdicts.append(end_with_stop_suffixes(surface))

        head_span, _, _ = coref.mention_head(ment, sents, trees, heads)
        head_ment = (ment[0], head_span[0], head_span[1])
        verdicts.append(head_ment in sent_ner
                        and sent_ner[head_ment] in {'PERCENT', 'MONEY'})

        if any(verdicts):
            doomed.append(ment)

    for ment in doomed:
        ments.pop(ment, None)
Esempio n. 9
0
def is_generic(ment, sents, trees, heads):
    """Tell whether a mention matches one of the two "generic" patterns.

    Returns True when either

    * the head POS is 'NN', the head word is not in
      ``dictionaries.temporals``, and the mention is one token long or its
      first token has POS 'JJ' or 'RB'; or
    * the first word is in ``dictionaries.quantifiers``.

    Head and first-token information both come from ``coref.mention_head``.
    """
    length = ment[2] - ment[1]
    _, head_word, head_pos = coref.mention_head(ment, sents, trees, heads)
    first_token = (ment[0], ment[1], ment[1] + 1)
    _, first_word, first_pos = coref.mention_head(
        first_token, sents, trees, heads)

    bare_common_noun = (
        head_pos == 'NN'
        and head_word not in dictionaries.temporals
        and (length == 1 or first_pos in {'JJ', 'RB'})
    )
    return bare_common_noun or first_word in dictionaries.quantifiers
Esempio n. 10
0
def keep_subpatterns(nodes_to_keep, head_span, ments, largest, sents, trees,
                     heads):
    """Flag sub-mentions of ``largest`` that border a comma or conjunction.

    For each mention in ``ments`` other than one with the same boundaries as
    ``largest``:

    * if it starts where ``largest`` starts and ends earlier, it is flagged
      when the token right after it has POS ',' or 'CC';
    * if it starts later and ends where ``largest`` ends, it is flagged when
      the token right before it has POS 'CC'.

    Flagging means ``nodes_to_keep[ment] = True``.  POS tags come from
    ``coref.mention_head``.  ``head_span`` is not used; nothing is returned.
    """
    for ment in ments:
        if ment[1] == largest[1]:
            if ment[2] >= largest[2]:
                # Same span as the largest mention, or longer: nothing to do.
                continue
            neighbour = (ment[0], ment[2], ment[2] + 1)
            accepted = (',', 'CC')
        elif largest[1] < ment[1] and ment[2] == largest[2]:
            neighbour = (ment[0], ment[1] - 1, ment[1])
            accepted = ('CC',)
        else:
            continue

        _, _, pos = coref.mention_head(neighbour, sents, trees, heads)
        if pos in accepted:
            nodes_to_keep[ment] = True
Esempio n. 11
0
def extract_parse_mentions(clusters_by_head, tree_order, i, sents, trees,
                           heads, sner):
    """Collect mentions from the parse of sentence ``i`` and number its nodes.

    Every node of ``trees[i]`` gets its iteration index stored in
    ``tree_order`` under the node span.  Nodes whose label is listed in
    ``my_constant.PARSE_TYPES_TO_KEEP`` become mentions
    ``(i, start, end)``; unless ``inside_named_entities`` accepts the
    mention, it is flagged with ``True`` in ``clusters_by_head`` under the
    head span given by ``coref.mention_head``.  Nothing is returned.
    """
    for position, node in enumerate(trees[i]):
        if node.label in my_constant.PARSE_TYPES_TO_KEEP:
            candidate = (i, node.span[0], node.span[1])
            if not inside_named_entities(candidate, sner[i]):
                head_span, _, _ = coref.mention_head(
                    candidate, sents, trees, heads)
                clusters_by_head[head_span][candidate] = True
        tree_order[node.span] = position
Esempio n. 12
0
def extract_modifiers(attr, ment, sents, trees, heads):
    """Return the lower-cased modifier words of a mention.

    Each token position of the mention is passed to ``coref.mention_head``
    as a one-token mention.  A word is kept when its POS starts with 'N',
    'V' or 'JJ', or equals 'CD', and the lower-cased word differs from
    ``attr['head_word']``.

    Returns a set of words; it is empty for an empty span.
    """
    modifiers = set()
    # range() rather than xrange(): same iteration over the (short) token
    # span, and the name also exists on Python 3.
    for i in range(ment[1], ment[2]):
        _, word, pos = \
            coref.mention_head((ment[0], i, i + 1), sents, trees, heads)
        word = word.lower()
        is_content_pos = pos.startswith(('N', 'V', 'JJ')) or pos == 'CD'
        if is_content_pos and word != attr['head_word']:
            modifiers.add(word)

    return modifiers
Esempio n. 13
0
def extract_modifiers(attr, ment, sents, trees, heads):
    """Collect the modifier words found inside a mention.

    Every token of the mention is looked up through ``coref.mention_head``
    as a one-token mention.  Words whose POS starts with "N", "V" or "JJ",
    or is exactly "CD", are added in lower case, except for the word equal
    to ``attr["head_word"]``.

    Returns the set of collected words (empty when the span is empty).
    """
    ret = set()
    # range() is used instead of the Python-2-only xrange(); the span of a
    # mention is short, so building a list on Python 2 costs nothing.
    for i in range(ment[1], ment[2]):
        span, word, pos = coref.mention_head((ment[0], i, i + 1), sents, trees, heads)
        word = word.lower()
        if not (pos.startswith(("N", "V", "JJ")) or pos == "CD"):
            continue
        if word == attr["head_word"]:
            continue
        ret.add(word)

    return ret
Esempio n. 14
0
def sort_by_tree_traversal_order(nodes_to_keep, tree_order, sents, trees,
                                 heads):
    """Return the mentions of ``nodes_to_keep`` sorted by tree position.

    The value stored for every mention in ``nodes_to_keep`` is overwritten
    with the ``tree_order`` index of its (start, end) span or, when that
    span is unknown, of the head span returned by ``coref.mention_head``.
    If neither span is in ``tree_order`` an error is logged and the process
    exits with status 1.

    Returns the list of mentions ordered by the stored index.
    """
    for ment in nodes_to_keep.keys():
        lookup = (ment[1], ment[2])
        if lookup not in tree_order:
            # Fall back to the span of the mention head.
            lookup, _, _ = coref.mention_head(ment, sents, trees, heads)
            if lookup not in tree_order:
                logger.error("Cannot find node '(%d, %d, %d)' in tree" %
                             (ment[0], ment[1], ment[2]))
                sys.exit(1)
        nodes_to_keep[ment] = tree_order[lookup]

    return sorted(nodes_to_keep, key=nodes_to_keep.get)
Esempio n. 15
0
def set_first_word(attr, ment, sents, trees, heads):
    """Store the first token of a mention in ``attr``.

    The one-token mention at the start of ``ment`` is passed to
    ``coref.mention_head``; the returned word is stored lower-cased under
    "first_word" and the returned POS under "first_pos".
    """
    sent_idx, start_idx = ment[0], ment[1]
    opening = (sent_idx, start_idx, start_idx + 1)
    _, word, pos = coref.mention_head(opening, sents, trees, heads)
    attr["first_word"] = word.lower()
    attr["first_pos"] = pos
Esempio n. 16
0
def set_first_word(attr, ment, sents, trees, heads):
    """Fill ``attr`` with the word and POS of the first mention token.

    ``coref.mention_head`` is called on the one-token mention that starts
    where ``ment`` starts.  Its word goes, lower-cased, to
    ``attr['first_word']`` and its POS to ``attr['first_pos']``.
    """
    first_token = (ment[0], ment[1], ment[1] + 1)
    head_info = coref.mention_head(first_token, sents, trees, heads)
    _span, token_word, token_pos = head_info
    attr['first_word'] = token_word.lower()
    attr['first_pos'] = token_pos
def match_boundaries(gold_mention_set, auto_mention_set, auto_mentions, auto_clusters, auto_cluster_set, text, parses, heads):
	"""Rename system mentions to the gold mentions they almost coincide with.

	Two passes are made.  After each one the pairs found are applied in
	place to auto_mention_set, auto_mentions, auto_clusters and
	auto_cluster_set:

	1. A system-only and a gold-only mention of the same sentence are paired
	   when their spans are equal once leading "the" tokens, trailing "'s"
	   tokens and leading/trailing one-character non-letter tokens have been
	   dropped (at least one token is always kept).
	2. The mentions still unique to one side are grouped by sentence and by
	   the first element of what coreference.mention_head returns for them;
	   a group with exactly one system and one gold mention gives a pair.

	Nothing is returned.
	"""
	def ignorable(token, filler):
		# Edge tokens that may be dropped: the filler word itself, or a
		# single character that is not a letter.
		return token == filler or (len(token) == 1 and token[0] not in string.letters)

	def core_span(sentence, start, end):
		# Shrink the span from the left, then from the right.
		while end - start > 1 and ignorable(text[sentence][start], "the"):
			start += 1
		while end - start > 1 and ignorable(text[sentence][end - 1], "'s"):
			end -= 1
		return start, end

	def rename_mentions(renames):
		# Substitute every key mention by its value in the system structures.
		for old in renames:
			new = renames[old]
			auto_mention_set.remove(old)
			auto_mention_set.add(new)
			cluster_id = auto_mentions.pop(old)
			auto_mentions[new] = cluster_id
			members = auto_clusters[cluster_id]
			members.remove(old)
			members.append(new)
			stale = None
			for cluster in auto_cluster_set:
				if old in cluster:
					# No break: the last cluster holding the mention wins.
					stale = cluster
			auto_cluster_set.remove(stale)
			auto_cluster_set.add(tuple(new if member == old else member for member in stale))

	# Pass 1: differences limited to ignorable edge tokens
	gold_only = gold_mention_set.difference(auto_mention_set)
	auto_only = auto_mention_set.difference(gold_mention_set)
	renames = {}
	taken = set()
	for amention in auto_only:
		sentence, astart, aend = amention
		auto_core = core_span(sentence, astart, aend)
		for gmention in gold_only:
			gsentence, gstart, gend = gmention
			if sentence != gsentence or gmention in taken:
				continue
			if core_span(sentence, gstart, gend) == auto_core:
				renames[amention] = gmention
				taken.add(gmention)
	rename_mentions(renames)

	# Pass 2: pair the leftovers through their heads
	by_head = defaultdict(lambda: {'auto': [], 'gold': []})
	sides = (
		('auto', auto_mention_set, gold_mention_set),
		('gold', gold_mention_set, auto_mention_set),
	)
	for side, own, other in sides:
		for mention in own.difference(other):
			sentence, start, end = mention
			# This will default to last word if the mention is not a
			# constituent, is there an alternative?
			head = coreference.mention_head(mention, text, parses, heads, default_last=True)
			if head is not None:
				by_head[(sentence, head[0])][side].append(mention)

	renames = {}
	for group in by_head.values():
		if len(group['auto']) == 1 and len(group['gold']) == 1:
			renames[group['auto'][0]] = group['gold'][0]
	rename_mentions(renames)
Esempio n. 18
0
def set_head(attr, ment, sents, trees, heads):
    """Store the head of a mention in ``attr``.

    Uses ``coref.mention_head`` and writes three entries: 'head_idx' (first
    element of the head span), 'head_word' (lower-cased) and 'head_pos'.
    """
    span, word, pos = coref.mention_head(ment, sents, trees, heads)
    head_start = span[0]
    attr['head_idx'] = head_start
    attr['head_word'] = word.lower()
    attr['head_pos'] = pos
Esempio n. 19
0
def match_boundaries(gold_mention_set, auto_mention_set, auto_mentions, auto_clusters, auto_cluster_set, text, parses, heads):
	"""Align system mention boundaries with gold mention boundaries.

	System mentions that have no exact gold counterpart are replaced, in
	place, by a gold mention in auto_mention_set, auto_mentions,
	auto_clusters and auto_cluster_set.  Pairs are found in two rounds:

	* first, mentions of the same sentence whose spans coincide after
	  trimming leading "the", trailing "'s" and one-character non-letter
	  tokens at either edge;
	* then, among the mentions still unmatched, those that are the only
	  system mention and the only gold mention sharing the same sentence and
	  the same first element of the coreference.mention_head result.

	Nothing is returned.
	"""
	unique_to_gold = gold_mention_set.difference(auto_mention_set)
	unique_to_auto = auto_mention_set.difference(gold_mention_set)

	# Round 1: spans equal up to ignorable edge tokens
	pairs = {}
	claimed = set()
	for amention in unique_to_auto:
		sentence, astart, aend = amention
		trimmed_auto = _trim_mention_edges(text, sentence, astart, aend)
		for gmention in unique_to_gold:
			gsentence, gstart, gend = gmention
			if gsentence == sentence and gmention not in claimed:
				if _trim_mention_edges(text, sentence, gstart, gend) == trimmed_auto:
					pairs[amention] = gmention
					claimed.add(gmention)
	_replace_mentions(pairs, auto_mention_set, auto_mentions, auto_clusters, auto_cluster_set)

	# Round 2: one system and one gold mention sharing a head
	head_dict = defaultdict(lambda: {'auto': [], 'gold': []})
	for mention in auto_mention_set.difference(gold_mention_set):
		sentence, start, end = mention
		# This will default to last word if the mention is not a constituent,
		# is there an alternative?
		found = coreference.mention_head(mention, text, parses, heads, default_last=True)
		if found is None:
			continue
		head_dict[(sentence, found[0])]['auto'].append(mention)
	for mention in gold_mention_set.difference(auto_mention_set):
		sentence, start, end = mention
		found = coreference.mention_head(mention, text, parses, heads, default_last=True)
		if found is None:
			continue
		head_dict[(sentence, found[0])]['gold'].append(mention)

	pairs = {}
	for head in head_dict:
		system_side = head_dict[head]['auto']
		gold_side = head_dict[head]['gold']
		if len(system_side) == 1 and len(gold_side) == 1:
			pairs[system_side[0]] = gold_side[0]
	_replace_mentions(pairs, auto_mention_set, auto_mentions, auto_clusters, auto_cluster_set)


def _trim_mention_edges(text, sentence, start, end):
	# Return (start, end) without ignorable edge tokens, keeping one token.
	def lone_non_letter(token):
		return len(token) == 1 and token[0] not in string.letters

	while end - start > 1 and (text[sentence][start] == "the" or lone_non_letter(text[sentence][start])):
		start += 1
	while end - start > 1 and (text[sentence][end - 1] == "'s" or lone_non_letter(text[sentence][end - 1])):
		end -= 1
	return start, end


def _replace_mentions(mapping, auto_mention_set, auto_mentions, auto_clusters, auto_cluster_set):
	# Swap each key of mapping for its value in all system mention structures.
	for old, new in mapping.items():
		auto_mention_set.remove(old)
		auto_mention_set.add(new)
		cluster_id = auto_mentions.pop(old)
		auto_mentions[new] = cluster_id
		auto_clusters[cluster_id].remove(old)
		auto_clusters[cluster_id].append(new)
		holder = None
		for cluster in auto_cluster_set:
			if old in cluster:
				# Keep scanning: the last cluster containing it is replaced.
				holder = cluster
		auto_cluster_set.remove(holder)
		rebuilt = []
		for member in holder:
			rebuilt.append(new if member == old else member)
		auto_cluster_set.add(tuple(rebuilt))
Esempio n. 20
0
def set_head(attr, ment, sents, trees, heads):
    """Record the head of ``ment`` in ``attr``.

    The head comes from ``coref.mention_head``.  Three keys are written:
    "head_idx" (first element of the head span), "head_word" (lower-cased
    head word) and "head_pos" (head POS).
    """
    head_info = coref.mention_head(ment, sents, trees, heads)
    head_span, head_word, head_pos = head_info
    attr["head_idx"] = head_span[0]
    attr["head_word"] = head_word.lower()
    attr["head_pos"] = head_pos