def item_based_to_class_based(lst):
    d = {}
    for item,class_cluster in enumerate(lst):
        update_list_dict(d, class_cluster, item)
    for class_cluster in d:
        d[class_cluster] = set(d[class_cluster])
    return d
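Every example on this page funnels through `helpers.update_list_dict`. The helper itself is not part of the listing; the sketch below is only an assumption inferred from the call sites (append a value to the list stored under a key, creating the list on first use), with `update_str_dict` as the assumed string analogue used by `sort_tokens_into_docs` further down.

def update_list_dict(d, key, value):
    # Append value to the list stored under key, creating the list on first use.
    d.setdefault(key, []).append(value)

def update_str_dict(d, key, value):
    # Assumed string analogue: concatenate words per key, separated by a space.
    if key in d:
        d[key] += ' ' + value
    else:
        d[key] = value

# Under that assumption, item_based_to_class_based([0, 1, 0, 2]) returns
# {0: set([0, 2]), 1: set([1]), 2: set([3])}.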
Example #2
def convert_to_conll(tokens, conll_path, isgold, events_only=False, data_set='all', save=True):
    s = ''
    fname = 'ECB+/ecbplus_all'  # document.fname
    empty_line = fname+' -\n'

    coref_ids = {}
    continuing_mids = []
    for i, token in enumerate(tokens):
        if token.not_mention():
            s += empty_line
        else:
            s += fname + ' '
            put = False
            new_continuing_mids = []

            # putting starters and single-tokeners
            for mid in token.coref_ids:
                if mid not in continuing_mids:
                    if token.is_last_token_for_mid(mid): # single-token mention
                        s += '(' + str(mid) + ')'
                    else: # start new mention
                        s += '(' + str(mid)
                        new_continuing_mids.append(mid)
                    put = True
                    helpers.update_list_dict(coref_ids, mid, token)

            # putting enders
            cont_mid_list = list(continuing_mids)
            for mid in cont_mid_list:
                if token.is_last_token_for_mid(mid) or i == len(tokens)-1:
                    s += str(mid)+')'
                    continuing_mids.remove(mid)
                    put = True

            if not put:
                s += '-'

            s += '\n'
            continuing_mids += new_continuing_mids

    print '( = %d, ) = %d'%(s.count('('), s.count(')'))
    # assert s.count('(') == s.count(')')

    total_mentions = 0
    non_singleton_chains = 0
    singletons = 0
    for cid in coref_ids:
        total_mentions += len(coref_ids[cid])
        non_singleton_chains += 1 if len(coref_ids[cid])>1 else 0
        singletons += 1 if len(coref_ids[cid])==1 else 0

    print 'TOTAL MENTIONS:',total_mentions
    print 'TOTAL CHAINS:',non_singleton_chains
    print 'TOTAL SINGLETONS:',singletons

    if save:
        with open(conll_path + conll_file_name(isgold, events_only, data_set), 'w') as f:
            f.write('#begin document (ECB+/ecbplus_all); part 000\n')
            f.write(s)
            f.write('\n#end document\n')
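For reference, the string s built above uses the standard CoNLL coreference column notation. A small hand-written excerpt of the kind of file the save branch would produce (illustrative, not real output):

#begin document (ECB+/ecbplus_all); part 000
ECB+/ecbplus_all -       <- token outside any mention (or between mention boundaries)
ECB+/ecbplus_all (12)    <- single-token mention in coreference chain 12
ECB+/ecbplus_all (7      <- first token of a multi-token mention in chain 7
ECB+/ecbplus_all 7)      <- last token of that mention
#end document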
Example #3
    def get_clusters(self, events_only=True, topics=helpers.ALL_TOPICS):
        clusters = {}
        for mention in self.itermentions():
            if mention.topic() in topics:
                if not events_only or mention.is_event():
                    helpers.update_list_dict(clusters, mention.coref_chain_id, mention)
        return clusters
Example #4
    def __init__(self, mention_coref_clusters, all_tokens, events_only=False, data_set='all', with_topics=False, topics=helpers.ALL_TOPICS):
        """
        :type mention_coref_clusters: MentionCoreferenceClusters
        :type all_tokens: list
        """
        self.mentions_dict = mention_coref_clusters.get_mentions_by_class(topics=topics)
        self.mentions_by_doc_dict = mention_coref_clusters.get_mentions_by_doc(topics=topics)
        self.gold_clusters = mention_coref_clusters.get_clusters(topics=topics)
        self.tokens = all_tokens
        self.token_hash_table = {(t.fname.replace('.txt',''), int(t.tid),):t for t in self.tokens} # hash a token table with filename and token id for quick access
        self.tokens_by_doc = {}
        for t in self.tokens:
            helpers.update_list_dict(self.tokens_by_doc, t.fname.replace('.txt',''), t)

        self.events_only = events_only
        self.data_set = data_set
        self.predictor_name = ''
        self.document_pairs = None
        self.positive_mention_pairs = None
        self.negative_mention_pairs = None
        self.with_topics = with_topics

        self.set_name()

        # only resetting token coreference values, NOT mentions
        for token in self.tokens:
            token.reset_coreference()

        for mention in self.itermentions():
            for token in mention.tokens:
                token.reset_coreference()

        self.new_coref_id = 1
Example #5
    def sort_tokens_into_docs(self, word_lists=False):
        d = {}
        for token in self.tokens:
            if not word_lists:
                helpers.update_list_dict(d, token.fname.replace('.txt',''), token)
            else:
                helpers.update_str_dict(d, token.fname.replace('.txt',''), token.word.decode('utf-8'))
        return d
Example #6
    def get_mentions_by_class(self, topics=helpers.ALL_TOPICS):
        d = {}
        for mention in self.itermentions():
            if mention.topic() in topics:
                tag = mention.get_class()
                if tag.startswith('NEG'):
                    tag = 'ACTION'
                helpers.update_list_dict(d, tag.split('_')[0], mention)
        return d
Example #7
    def get_mentions_by_class(self, topics=helpers.ALL_TOPICS):
        d = {}
        for mention in self.itermentions():
            if mention.topic() in topics:
                tag = mention.get_class()
                if tag.startswith('NEG'):
                    tag = 'ACTION'
                helpers.update_list_dict(d, tag.split('_')[0], mention)
        return d
Example #8
def get_annotated_sentences(fname):
    f_to_sent_dict = {}
    with open(fname, 'rb') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        csv_reader.next()
        for row in csv_reader:
            fname = row[0] + '_' + row[1] + '.xml'
            sentence_num = int(row[2])
            helpers.update_list_dict(f_to_sent_dict, fname, sentence_num)
    return f_to_sent_dict
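The CSV is assumed to look roughly like the hypothetical excerpt below (the column names are guesses; only the positions matter to the code): a header row skipped by `csv_reader.next()`, then one row per annotated sentence, whose first two fields are glued back into an ECB+ file name.

Topic,File,Sentence Number      <- header row, skipped
1,1ecb,0                        <- f_to_sent_dict['1_1ecb.xml'] = [0]
1,1ecbplus,2
1,1ecbplus,5                    <- appended: f_to_sent_dict['1_1ecbplus.xml'] = [2, 5]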
Example #9
def get_annotated_sentences(fname):
    f_to_sent_dict = {}
    with open(fname, 'rb') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        csv_reader.next()
        for row in csv_reader:
            fname = row[0] + '_' + row[1] + '.xml'
            sentence_num = int(row[2])
            helpers.update_list_dict(f_to_sent_dict, fname, sentence_num)
    return f_to_sent_dict
Example #10
    def get_mentions_by_doc(self, fname=None, topics=helpers.ALL_TOPICS):
        mentions_by_doc = {}
        mentions = []
        for mention in self.itermentions():
            if mention.topic() in topics:
                if fname and mention.fname == fname:
                    mentions.append(mention)
                else:
                    helpers.update_list_dict(mentions_by_doc, mention.fname, mention)
        if fname:
            return mentions
        return mentions_by_doc
Example #11
    def get_mentions_by_doc(self, fname=None, topics=helpers.ALL_TOPICS):
        mentions_by_doc = {}
        mentions = []
        for mention in self.itermentions():
            if mention.topic() in topics:
                if fname and mention.fname == fname:
                    mentions.append(mention)
                else:
                    helpers.update_list_dict(mentions_by_doc, mention.fname, mention)
        if fname:
            return mentions
        return mentions_by_doc
Example #12
def test_coref_extraction(coredocs):
    singletons = 0
    corefs_dict = {}
    clust = coredocs.get_clusters()
    for mention in clust.itermentions():
        helpers.update_list_dict(corefs_dict, mention.coref_chain_id, mention)
        if mention.is_singleton:
            singletons += 1

    total_count = 0
    single_item_lists = 0
    for iid in corefs_dict:
        if len(corefs_dict[iid]) == 1:
            single_item_lists += 1
        total_count += len(corefs_dict[iid])
Example #13
def test_coref_extraction(coredocs):
    singletons = 0
    corefs_dict = {}
    clust = coredocs.get_clusters()
    for mention in clust.itermentions():
        helpers.update_list_dict(corefs_dict, mention.coref_chain_id, mention)
        if mention.is_singleton:
            singletons += 1

    total_count = 0
    single_item_lists = 0
    for iid in corefs_dict:
        if len(corefs_dict[iid]) == 1:
            single_item_lists += 1
        total_count += len(corefs_dict[iid])
Example #14
def count_ecb(count_dict, path, xml_file, annotated_sentences):
    relevant_toks = []

    root = ET.parse(path + xml_file, parser=ET.XMLParser()).getroot()
    for tok in root.findall('token'):
        sentnum = int(tok.get('sentence'))
        if sentnum in annotated_sentences:
            relevant_toks.append(tok.get('t_id'))

    # get mentions
    for mention in root.find('Markables'):
        if mention.attrib.has_key('TAG_DESCRIPTOR'):
            continue
        mention_tok_ids = [child.get('t_id') for child in mention]
        if helpers.all_in_list(mention_tok_ids, relevant_toks):
            helpers.update_list_dict(count_dict, mention.tag, mention)
Example #15
def count_ecb(count_dict, path, xml_file, annotated_sentences):
    relevant_toks = []

    root = ET.parse(path + xml_file, parser=ET.XMLParser()).getroot()
    for tok in root.findall('token'):
        sentnum = int(tok.get('sentence'))
        if sentnum in annotated_sentences:
            relevant_toks.append(tok.get('t_id'))

    # get mentions
    for mention in root.find('Markables'):
        if mention.attrib.has_key('TAG_DESCRIPTOR'):
            continue
        mention_tok_ids = [child.get('t_id') for child in mention]
        if helpers.all_in_list(mention_tok_ids, relevant_toks):
            helpers.update_list_dict(count_dict, mention.tag, mention)
Example #16
    def delta_filter(self, clusters, delta, build_test_comparison=False):
        m_topics = self.get_mentions_by_topic(events_only=self.events_only, split_into_topics=False)
        all_mentions = []
        delta_clusters = collections.defaultdict(list)
        clust_key_idxs = collections.defaultdict(int)
        
        # iterate
        for mlst in m_topics.itervalues():
            mentions = sorted(mlst, key=Mention.get_comparator_function())
            mentions = [m for m in mentions if m.is_event() or not self.events_only]
            
            for i,mention in enumerate(mentions):
                # delta stuff
                clusters_to_check = clust_key_idxs[clusters[i]]
                
                for j in range(0, clusters_to_check+1):
                    clust_key = (clusters[i], j,) # define key for this mention
                    if clust_key in delta_clusters: # compare tfidf for lemma delta
                        my_tfidf = get_doc_tfidf_rep(mention.fname)
                        broken = False
                        for m in delta_clusters[clust_key]:
                            check_tfidf = get_doc_tfidf_rep(m.fname)
                            sim = cos_sim(mention.fname, m.fname, my_tfidf, check_tfidf)
                            if sim < delta:
                                broken = True
                                break
                        if not broken: # then it is similar enough, add it!
                            helpers.update_list_dict(delta_clusters, clust_key, mention)
                            break
                    
                    # then this was too different, so add a new cluster!
                    # this must only be reached if it wasn't added to a cluster
                    # otherwise it would have broken out of the loop
                    # also it cannot be in clusters if we are at end since
                    # otherwise we would not be at the end
                    if j == clusters_to_check:
                        delta_clusters[clust_key].append(mention)
                        clust_key_idxs[clusters[i]] += 1

            all_mentions += mentions

        test_comp = None
        if build_test_comparison:
            test_comp = [m.coref_chain_id for m in all_mentions]

        self.set_tokens_to_clusters({i:cluster for i,cluster in enumerate(delta_clusters.values())}) # this updates the coref_ids on all mentions
        return test_comp, [m.coref_chain_id for m in all_mentions]
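`get_doc_tfidf_rep` and `cos_sim` are not included in this listing. A plausible reading, sketched below with hypothetical names, is that the former returns a sparse {term: weight} mapping per document and the latter computes cosine similarity between two such mappings (the file-name arguments in the real call presumably exist for caching or logging).

import math

def cos_sim_sketch(rep_a, rep_b):
    # Cosine similarity between two sparse {term: tf-idf weight} dicts.
    dot = sum(w * rep_b.get(t, 0.0) for t, w in rep_a.items())
    norm_a = math.sqrt(sum(w * w for w in rep_a.values()))
    norm_b = math.sqrt(sum(w * w for w in rep_b.values()))
    if norm_a == 0.0 or norm_b == 0.0:
        return 0.0
    return dot / (norm_a * norm_b)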
Example #17
    def set_is_last_token(self, mid, boolean):
        helpers.update_list_dict(self.is_last_token_in_mention, mid, boolean)
Example #18
    def __init__(self,
                 xml_file,
                 path,
                 topic,
                 annotated_sentences,
                 pmention_clusters,
                 get_all=False,
                 events_only=False):
        """
        :type pmention_clusters: MentionCoreferenceClusters
        """

        # organizational attributes
        self.fname = xml_file
        self.category = helpers.get_category(xml_file)
        self.topic = topic
        self.docnum = int(xml_file.split('_')[1].strip(self.category + '.xml'))
        self.sentences = {}
        self.mentions = {}

        # parsing constants initialization
        tokens_dict = {}
        relevant_tokens = {}

        # token parsing
        root = ET.parse(path + xml_file, parser=ET.XMLParser()).getroot()
        for tok in root.findall('token'):
            sentnum = int(tok.get('sentence'))
            if get_all or sentnum in annotated_sentences:
                tid = tok.get('t_id')
                token = EcbToken(tok.text, tid, sentnum)
                helpers.update_list_dict(tokens_dict, sentnum, token)
                relevant_tokens[tid] = token

        self.sentences = {
            sentnum: EcbSentence(token_list, sentnum, xml_file)
            for sentnum, token_list in tokens_dict.iteritems()
        }

        # get mentions
        for item in root.find('Markables'):
            tag = item.tag
            mid = item.get('m_id')

            if is_not_action(tag) and events_only:
                continue

            if item.attrib.has_key('TAG_DESCRIPTOR'):
                pmention_clusters.add_instance(
                    RealInstance(tag, mid, item.get('TAG_DESCRIPTOR'),
                                 item.get('instance_id'), xml_file))
            else:
                orig_ids = [int(child.get('t_id')) for child in item]
                mention_tok_ids = map(str, range(orig_ids[0],
                                                 orig_ids[-1] + 1))

                if helpers.all_in_list(mention_tok_ids,
                                       relevant_tokens.keys()):
                    # old_len = len(corefed_token_ids)
                    # corefed_token_ids = corefed_token_ids.union(set(orig_ids))

                    # if len(corefed_token_ids) - old_len != len(orig_ids):
                    #     print 'OVERLAP - continuing'
                    #     continue

                    tokens = [relevant_tokens[tid] for tid in mention_tok_ids]
                    self.mentions[mid] = Mention(
                        self.fname,
                        mid,
                        tag,
                        tokens,
                        is_continuous=orig_ids == mention_tok_ids)

        # get coreference
        mids_mapped = set()
        for coreference in root.find('Relations'):
            iid = coreference.get('note')
            if iid is None:  #intra-doc-coref
                iid = helpers.get_intra_doc_iid(
                    coreference.find('target').get('m_id'), xml_file)

            for child in coreference:
                mid = child.get('m_id')
                if child.tag == 'source' and mid in self.mentions.keys():
                    try:
                        pmention_clusters.add_mention(iid, self.mentions[mid])
                        mids_mapped.add(mid)
                    except KeyError:
                        pass

        for mid, mention in self.mentions.iteritems():
            if mid not in mids_mapped:
                pmention_clusters.add_singleton_mention(mention)
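For orientation, the constructor above touches only a handful of pieces of the ECB+ XML. A schematic fragment (not verbatim ECB+; element names beyond the attributes the code actually reads are assumptions) showing just those pieces:

<Document>
  <token t_id="1" sentence="0">Police</token>
  <token t_id="2" sentence="0">arrested</token>
  <Markables>
    <!-- mention markable: the element tag is the mention class, children carry the t_id span -->
    <ACTION_OCCURRENCE m_id="25">
      <token_anchor t_id="2"/>
    </ACTION_OCCURRENCE>
    <!-- abstract instance markable: carries TAG_DESCRIPTOR and instance_id -->
    <ACTION_OCCURRENCE m_id="60" TAG_DESCRIPTOR="arrest_event" instance_id="ACT1"/>
  </Markables>
  <Relations>
    <!-- note = cross-doc chain id; when absent the code builds an intra-doc id instead -->
    <CROSS_DOC_COREF note="ACT1">
      <source m_id="25"/>
      <target m_id="60"/>
    </CROSS_DOC_COREF>
  </Relations>
</Document>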
Example #19
    def __init__(self, xml_file, path, topic, annotated_sentences, pmention_clusters, get_all=False, events_only=False):
        """
        :type pmention_clusters: MentionCoreferenceClusters
        """

        # organizational attributes
        self.fname = xml_file
        self.category = helpers.get_category(xml_file)
        self.topic = topic
        self.docnum = int(xml_file.split('_')[1].strip(self.category+'.xml'))
        self.sentences = {}
        self.mentions = {}

        # parsing constants initialization
        tokens_dict = {}
        relevant_tokens = {}

        # token parsing
        root = ET.parse(path+xml_file, parser=ET.XMLParser()).getroot()
        for tok in root.findall('token'):
            sentnum = int(tok.get('sentence'))
            if get_all or sentnum in annotated_sentences:
                tid = tok.get('t_id')
                token = EcbToken(tok.text, tid, sentnum)
                helpers.update_list_dict(tokens_dict, sentnum, token)
                relevant_tokens[tid] = token

        self.sentences = {sentnum:EcbSentence(token_list, sentnum, xml_file) for sentnum, token_list in tokens_dict.iteritems()}

        # get mentions
        for item in root.find('Markables'):
            tag = item.tag
            mid = item.get('m_id')

            if is_not_action(tag) and events_only:
                continue

            if item.attrib.has_key('TAG_DESCRIPTOR'):
                pmention_clusters.add_instance(RealInstance(tag, mid, item.get('TAG_DESCRIPTOR'), item.get('instance_id'), xml_file))
            else:
                orig_ids = [int(child.get('t_id')) for child in item]
                mention_tok_ids = map(str, range(orig_ids[0], orig_ids[-1]+1))

                if helpers.all_in_list(mention_tok_ids, relevant_tokens.keys()):
                    # old_len = len(corefed_token_ids)
                    # corefed_token_ids = corefed_token_ids.union(set(orig_ids))

                    # if len(corefed_token_ids) - old_len != len(orig_ids):
                    #     print 'OVERLAP - continuing'
                    #     continue

                    tokens = [relevant_tokens[tid] for tid in mention_tok_ids]
                    self.mentions[mid] = Mention(self.fname, mid, tag, tokens, is_continuous=orig_ids==mention_tok_ids)


        # get coreference
        mids_mapped = set()
        for coreference in root.find('Relations'):
            iid = coreference.get('note')
            if iid is None: #intra-doc-coref
                iid = helpers.get_intra_doc_iid(coreference.find('target').get('m_id'), xml_file)

            for child in coreference:
                mid = child.get('m_id')
                if child.tag == 'source' and mid in self.mentions.keys():
                    try:
                        pmention_clusters.add_mention(iid, self.mentions[mid])
                        mids_mapped.add(mid)
                    except KeyError:
                        pass

        for mid,mention in self.mentions.iteritems():
            if mid not in mids_mapped:
                pmention_clusters.add_singleton_mention(mention)
Example #20
    def get_clusters(self, events_only=True, topics=helpers.ALL_TOPICS):
        clusters = {}
        for mention in self.itermentions():
            if mention.topic() in topics:
                if not events_only or mention.is_event():
                    helpers.update_list_dict(clusters, mention.coref_chain_id, mention)
        return clusters
Example #21
def convert_to_conll(tokens,
                     conll_path,
                     isgold,
                     events_only=False,
                     data_set='all',
                     save=True):
    s = ''
    fname = 'ECB+/ecbplus_all'  # document.fname
    empty_line = fname + ' -\n'

    coref_ids = {}
    continuing_mids = []
    for i, token in enumerate(tokens):
        if token.not_mention():
            s += empty_line
        else:
            s += fname + ' '
            put = False
            new_continuing_mids = []

            # putting starters and single-tokeners
            for mid in token.coref_ids:
                if mid not in continuing_mids:
                    if token.is_last_token_for_mid(
                            mid):  # single-token mention
                        s += '(' + str(mid) + ')'
                    else:  # start new mention
                        s += '(' + str(mid)
                        new_continuing_mids.append(mid)
                    put = True
                    helpers.update_list_dict(coref_ids, mid, token)

            # putting enders
            cont_mid_list = list(continuing_mids)
            for mid in cont_mid_list:
                if token.is_last_token_for_mid(mid) or i == len(tokens) - 1:
                    s += str(mid) + ')'
                    continuing_mids.remove(mid)
                    put = True

            if not put:
                s += '-'

            s += '\n'
            continuing_mids += new_continuing_mids

    print '( = %d, ) = %d' % (s.count('('), s.count(')'))
    # assert s.count('(') == s.count(')')

    total_mentions = 0
    non_singleton_chains = 0
    singletons = 0
    for cid in coref_ids:
        total_mentions += len(coref_ids[cid])
        non_singleton_chains += 1 if len(coref_ids[cid]) > 1 else 0
        singletons += 1 if len(coref_ids[cid]) == 1 else 0

    print 'TOTAL MENTIONS:', total_mentions
    print 'TOTAL CHAINS:', non_singleton_chains
    print 'TOTAL SINGLETONS:', singletons

    if save:
        with open(conll_path + conll_file_name(isgold, events_only, data_set),
                  'w') as f:
            f.write('#begin document (ECB+/ecbplus_all); part 000\n')
            f.write(s)
            f.write('\n#end document\n')
Example #22
    def predict(self, build_test_comparison=True, delta=0.0):
        super(BaselineLemmaPredictor, self).predict()
        m_topics = self.get_mentions_by_topic(events_only=self.events_only,
                                             split_into_topics=False)
        all_mentions = []
        clusters = collections.defaultdict(list)
        lemma_keys_idxs = collections.defaultdict(int)
        for mlst in m_topics.itervalues():
            mentions = sorted(mlst, key=Mention.get_comparator_function())
            mentions = [m for m in mentions if m.is_event() or not self.events_only]
            for i,mention in enumerate(mentions):
                best_lemmas = []

                # set best_lemmas - for now it is by tuples, may be too restrictive..
                for token in mention.tokens:
                    if not helpers.is_stop_word(token.word):
                        best_lemmas.append(token.lemma)
                if len(best_lemmas) == 0:
                    best_lemmas = [t.lemma for t in mention.tokens]

                # HEAD LEMMA ONLY!!!! 
                best_lemmas = [mention.tokens[0].lemma]

                lemmas = tuple(best_lemmas+[mention.topic()])
                if not self.with_topics: # remove topic if we aint doin topics
                    lemmas = lemmas[:-1]

                # delta stuff
                clusters_to_check = lemma_keys_idxs[lemmas]
                
                for i in range(0, clusters_to_check+1):
                    lemmas_key = (lemmas, i,)
                    if lemmas_key in clusters: # compare tfidf for lemma delta
                        my_tfidf = get_doc_tfidf_rep(mention.fname)
                        broken = False
                        for m in clusters[lemmas_key]:
                            check_tfidf = get_doc_tfidf_rep(m.fname)
                            sim = cos_sim(mention.fname, m.fname, my_tfidf, check_tfidf)
                            if sim < delta:
                                broken = True
                                break
                        if not broken: # then it is similar enough, add it!
                            helpers.update_list_dict(clusters, lemmas_key, mention)
                            break
                    
                    # then this was too different, so add a new cluster!
                    # this must only be reached if it wasn't added to a cluster
                    # otherwise it would have broken out of the loop
                    # also it cannot be in clusters if we are at end since
                    # otherwise we would not be at the end
                    if i == clusters_to_check:
                        clusters[lemmas_key].append(mention)
                        lemma_keys_idxs[lemmas] += 1

            all_mentions += mentions

        test_comp = None
        if build_test_comparison:
            test_comp = [m.coref_chain_id for m in all_mentions]

        self.set_tokens_to_clusters({i:cluster for i,cluster in enumerate(clusters.values())}) # this updates the coref_ids on all mentions
        return test_comp, [m.coref_chain_id for m in all_mentions]
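As a toy illustration of the keying used above (hypothetical data; with `with_topics` off and `delta=0.0` the tf-idf check never splits a key), every mention whose head token shares a lemma lands in the same `(lemmas, 0)` cluster:

import collections

# Hypothetical mentions reduced to (mention_id, head_lemma) pairs.
toy_mentions = [(1, 'attack'), (2, 'attack'), (3, 'say'), (4, 'attack')]

clusters = collections.defaultdict(list)
for mention_id, head_lemma in toy_mentions:
    lemmas_key = ((head_lemma,), 0,)   # (tuple of best lemmas, sub-cluster index)
    clusters[lemmas_key].append(mention_id)

# clusters -> {(('attack',), 0): [1, 2, 4], (('say',), 0): [3]}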