def extract_story_elements():
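    # NOTE: assumes several module-level imports and helpers defined elsewhere in
    # this file: os, glob, numpy as np, collections.defaultdict, dirs, fh,
    # VocabWithCounts, pronoun_list, stopwords, and extract_story_elements_from_article.

    # minimum frequency / tuple-count thresholds used when filtering the vocabularies
    # and entities below (min_head_vocab is only referenced by the commented-out head filter)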
    min_head_vocab = 5
    min_role_vocab = 4
    min_tuples = 3

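    # integer codes identifying the type of each extracted (role, token) tuple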
    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    parsed_dir = os.path.join(dirs.data_stanford_dir, 'parsed')
    parsed_files = glob.glob(os.path.join(parsed_dir, '*.json'))

    dependencies_file = os.path.join(dirs.data_stanford_dir, 'dependency_tuple_ids.json')
    dependencies = fh.read_json(dependencies_file)

    coref_file = os.path.join(dirs.data_stanford_dir, 'coref_heads.json')
    coref_heads = fh.read_json(coref_file)

    supersense_tags = fh.read_json(os.path.join(dirs.data_amalgram_dir, 'all_tags.json'))

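    # raw frequency counts of head words, attributes, and roles across all articles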
    heads = defaultdict(int)
    tokens = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    print "Extracting story elements"
    for f_i, f in enumerate(parsed_files):
        sentences = fh.read_json(f)
        basename = fh.get_basename_wo_ext(f)
        element_list = extract_story_elements_from_article(sentences, dependencies[basename], coref_heads[basename], supersense_tags[basename], basename)
        story_elements[basename] = element_list
        for element in element_list:
            for h in element.head_words:
                heads[h] += 1
            for t in element.attributes:
                attributes[t] += 1
            for t in element.agent_roles:
                agent_roles[t] += 1
            for t in element.patient_roles:
                patient_roles[t] += 1

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    """
    common_tokens = [(v, k) for k, v in tokens.items()]
    common_tokens.sort()
    common_tokens.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_tokens.json')
    fh.write_to_json(common_tokens, output_filename, sort_keys=False)
    """

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.lda_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    print pronoun_list
    #most_common_heads = {k: v for v, k in common_heads if v >= min_head_vocab and k not in pronoun_list}
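    # keep tokens seen at least min_role_vocab times, excluding pronouns
    # (and stopwords for the agent/patient roles)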
    most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    output_filename = os.path.join(dirs.lda_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)

    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    valid_elements = defaultdict(list)
    for basename, element_list in story_elements.items():
        for se in element_list:
            se.valid_heads = [h for h in se.head_words if h not in pronoun_list]
            se.valid_phrases = [h for h in se.phrases if h not in pronoun_list]
            if len(se.valid_heads) > 0:
                se.valid_attributes = [t for t in se.attributes if t in most_common_attributes]
                se.valid_agent_roles = [t for t in se.agent_roles if t in most_common_agent_roles]
                se.valid_patient_roles = [t for t in se.patient_roles if t in most_common_patient_roles]
                se.tuples = [(ATTRIBUTE, t) for t in se.valid_attributes] + \
                            [(AGENT_ROLE, t) for t in se.valid_agent_roles] + \
                            [(PATIENT_ROLE, t) for t in se.valid_patient_roles]
                            #[(SURFACE_FORM, t) for t in se.valid_heads]

                if len(se.tuples) >= min_tuples:
                    valid_elements[basename].append(se)

    print "Constructing vocabulary"
    n_tuples = 0
    vocab = VocabWithCounts('', add_oov=False)
    n_entities = 0
    for basename, element_list in valid_elements.items():
        for se in element_list:
            tokens = [token for role, token in se.tuples]
            vocab.add_tokens(tokens)
            n_tuples += len(tokens)
            n_entities += 1

    head_word_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            tokens = [token for token in se.valid_heads]
            head_word_vocab.add_tokens(tokens)

    head_phrase_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            tokens = [token for token in se.valid_phrases]
            head_phrase_vocab.add_tokens(tokens)

    print "Building indices"
    tuple_vocab = np.zeros(n_tuples, dtype=int)     # vocab index of the ith tuple
    tuple_entity = np.zeros(n_tuples, dtype=int)    # entity index of the ith tuple
    tuple_role = []                                 # role code of the ith tuple
    entity_doc = np.zeros(n_entities, dtype=int)    # document index of the ith entity
    docs = valid_elements.keys()
    docs.sort()

    vocab_counts = np.zeros(len(vocab), dtype=int)

    article_mapping = []
    entity_index = 0
    head_word_vocab_list = []
    head_word_entity_list = []
    head_phrase_vocab_list = []
    head_phrase_entity_list = []
    t_i = 0
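    # walk the documents in sorted order, assigning consecutive entity and tuple
    # indices and filling in the flat index arrays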
    for d_i, d in enumerate(docs):
        element_list = valid_elements[d]
        for se in element_list:
            entity_doc[entity_index] = d_i
            for role, token in se.tuples:
                tuple_entity[t_i] = entity_index
                tuple_role.append(role)
                vocab_index = vocab.get_index(token)
                tuple_vocab[t_i] = vocab_index
                vocab_counts[vocab_index] += 1
                t_i += 1
            for token in se.valid_heads:
                head_word_vocab_index = head_word_vocab.get_index(token)
                head_word_vocab_list.append(head_word_vocab_index)
                head_word_entity_list.append(entity_index)
            for token in se.valid_phrases:
                head_phrase_vocab_index = head_phrase_vocab.get_index(token)
                head_phrase_vocab_list.append(head_phrase_vocab_index)
                head_phrase_entity_list.append(entity_index)

            article_mapping.append(':'.join([str(entity_index),
                                             d,
                                             ','.join(se.head_words),
                                             ','.join(se.valid_attributes),
                                             ','.join(se.valid_agent_roles),
                                             ','.join(se.valid_patient_roles)]))
            entity_index += 1

    print len(docs), "valid documents"
    print entity_index, "entities"
    print t_i, "tuples"
    print len(vocab), "word types"
    print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts)

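    # write the index arrays, vocabularies, and entity-to-article map to the LDA directory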
    output_filename = os.path.join(dirs.lda_dir, 'tuple_vocab.json')
    fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'tuple_role.json')
    fh.write_to_json(list(tuple_role), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'tuple_entity.json')
    fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'entity_doc.json')
    fh.write_to_json(list(entity_doc), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'vocab.json')
    fh.write_to_json(vocab.index2token, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'docs.json')
    fh.write_to_json(list(docs), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'article_map.json')
    fh.write_to_json(list(article_mapping), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab.json')
    fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab.json')
    fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'head_word_vocab_list.json')
    fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'head_word_entity_list.json')
    fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_vocab_list.json')
    fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.lda_dir, 'head_phrase_entity_list.json')
    fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)
def get_bamman_entities(all_trees, clustered_entity_indices, word2vec_file=None, min_role_vocab=4, min_tuples=3):
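    # Bamman-style entity extraction: build one entity per coreference cluster and
    # collect its attribute / agent-role / patient-role tuples.
    # Assumes module-level helpers defined elsewhere in this file: BammanEntity,
    # BammanEntityAppearance, get_compound_noun, get_attributes, get_agent_roles,
    # get_patient_roles, dirs, fh, VocabWithCounts, pronoun_list, and stopwords.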

    ATTRIBUTE = 0
    AGENT_ROLE = 1
    PATIENT_ROLE = 2
    SURFACE_FORM = 3

    tokens = defaultdict(int)
    heads = defaultdict(int)
    attributes = defaultdict(int)
    agent_roles = defaultdict(int)
    patient_roles = defaultdict(int)

    story_elements = {}
    for basename, trees in all_trees.items():
        story_elements[basename] = []
        article_clusters = clustered_entity_indices[basename]
        # go through each entity, represented by a list of tree/node locations
        for c_i, cluster_indices in enumerate(article_clusters):
            # create an entity for each cluster in this document
            entity = BammanEntity(basename)
            # for each appearance, create an appearance object for this entity
            for t_i, n_i in cluster_indices:
                word = trees[t_i].node_dict[n_i].word
                compound_word = get_compound_noun(trees[t_i], n_i)
                mention_attributes = get_attributes(trees[t_i], n_i)
                mention_agent_roles = get_agent_roles(trees[t_i], n_i)
                mention_patient_roles = get_patient_roles(trees[t_i], n_i)
                appearance = BammanEntityAppearance(t_i, n_i, word, mention_attributes, mention_agent_roles, mention_patient_roles, compound_word)
                entity.add_appearance(appearance)

                # count the total mentions of these words to build vocabularies
                heads[word] += 1
                for t in mention_attributes:
                    attributes[t[0]] += 1
                for t in mention_agent_roles:
                    agent_roles[t[0]] += 1
                for t in mention_patient_roles:
                    patient_roles[t[0]] += 1
            # add the newly created entity to a dict
            story_elements[basename].append(entity)

    print "Finding most common tokens"
    common_heads = [(v, k) for k, v in heads.items()]
    common_heads.sort()
    common_heads.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_heads.json')
    fh.write_to_json(common_heads, output_filename, sort_keys=False)

    common_attributes = [(v, k) for k, v in attributes.items()]
    common_attributes.sort()
    common_attributes.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_attributes.json')
    fh.write_to_json(common_attributes, output_filename, sort_keys=False)

    common_agent_roles = [(v, k) for k, v in agent_roles.items()]
    common_agent_roles.sort()
    common_agent_roles.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_agent_roles.json')
    fh.write_to_json(common_agent_roles, output_filename, sort_keys=False)

    common_patient_roles = [(v, k) for k, v in patient_roles.items()]
    common_patient_roles.sort()
    common_patient_roles.reverse()
    output_filename = os.path.join(dirs.persona_dir, 'common_patient_roles.json')
    fh.write_to_json(common_patient_roles, output_filename, sort_keys=False)

    # filter vocabularies based on frequency and stopwords
    most_common_attributes = {k: v for v, k in common_attributes if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_agent_roles = {k: v for v, k in common_agent_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}
    most_common_patient_roles = {k: v for v, k in common_patient_roles if (v >= min_role_vocab and k not in pronoun_list and k not in stopwords)}

    # save these vocabularies
    output_filename = os.path.join(dirs.persona_dir, 'most_common_attributes.json')
    fh.write_to_json(most_common_attributes, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'most_common_agent_roles.json')
    fh.write_to_json(most_common_agent_roles, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'most_common_patient_roles.json')
    fh.write_to_json(most_common_patient_roles, output_filename, sort_keys=False)

    print len(most_common_attributes)
    print len(most_common_agent_roles)
    print len(most_common_patient_roles)

    print "Filtering tuples"
    filtered_indices = {}
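    # NOTE: filtered_indices is returned below but stays empty unless the
    # commented-out updates are re-enabled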
    valid_elements = defaultdict(list)
    for basename, entity_list in story_elements.items():
        #filtered_indices[basename] = []
        for e_index, entity in enumerate(entity_list):
            appearances = entity.get_appearances()
            valid_heads = []
            for ap in appearances:
                if ap.head_word not in pronoun_list:
                    valid_heads.append(ap.head_word)
                    ap.valid_heads = [ap.head_word]
                    ap.valid_compound_heads = [ap.compound_word]
                else:
                    ap.valid_heads = []
                    ap.valid_compound_heads = []
            if len(valid_heads) > 0:
                for ap in appearances:
                    ap.valid_attributes = [t for t in ap.attributes if t[0] in most_common_attributes]
                    ap.valid_agent_roles = [t for t in ap.agent_roles if t[0] in most_common_agent_roles]
                    ap.valid_patient_roles = [t for t in ap.patient_roles if t[0] in most_common_patient_roles]
                    ap.tuples = [(ATTRIBUTE, t[0], t[1], t[2], t[3]) for t in ap.valid_attributes] + \
                                [(AGENT_ROLE, t[0], t[1], t[2], t[3]) for t in ap.valid_agent_roles] + \
                                [(PATIENT_ROLE, t[0], t[1], t[2], t[3]) for t in ap.valid_patient_roles]

            if entity.get_n_tuples() >= min_tuples:
                valid_elements[basename].append(entity)
                #filtered_indices[basename].append(clustered_entity_indices[basename][e_index])

    print "Constructing vocabulary"
    n_tuples = 0
    vocab = VocabWithCounts('', add_oov=False)
    n_entities = 0
    n_mentions = 0
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for role, token, relation, pos, tuple_token_index in appearance.tuples]
                vocab.add_tokens(tokens)
                n_tuples += len(tokens)
                if len(appearance.tuples) > 0:
                    n_mentions += 1
            n_entities += 1

    head_word_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for token in appearance.valid_heads]
                head_word_vocab.add_tokens(tokens)

    head_phrase_vocab = VocabWithCounts('', add_oov=False)
    for basename, element_list in valid_elements.items():
        for se in element_list:
            for appearance in se.appearances:
                tokens = [token for token in appearance.valid_compound_heads]
                head_phrase_vocab.add_tokens(tokens)

    print "Building indices"
    tuple_vocab = np.zeros(n_tuples, dtype=int)       # vocab index of the ith tuple
    tuple_entity = np.zeros(n_tuples, dtype=int)      # entity index of the ith tuple
    tuple_role = []                                   # role code of the ith tuple
    mention_entity = np.zeros(n_mentions, dtype=int)  # entity index of the ith mention
    tuple_mention = np.zeros(n_tuples, dtype=int)     # mention index of the ith tuple
    entity_doc = np.zeros(n_entities, dtype=int)      # document index of the ith entity
    docs = valid_elements.keys()
    docs.sort()


    """
    vocab_vectors = None
    if word2vec_file is not None:
        import gensim
        dx = 300
        vocab_vectors = np.zeros((len(vocab), dx))

        # load pre-trained word vectors
        print "Loading pre-trained word vectors"
        all_vectors = gensim.models.Word2Vec.load_word2vec_format(word2vec_file, binary=True)

        word2vec_vocab = set()

        for v in vocab.get_all_tokens():
            v_i = vocab.get_index(v)
            if v in all_vectors:
                vocab_vectors[v_i, :] = all_vectors[v]
                word2vec_vocab.add(v)
            else:
                vocab_vectors[v_i, :] = 0.05 * np.random.uniform(-1.0, 1.0, (1, dx))
        print len(list(set(vocab.get_all_tokens()) - word2vec_vocab)), "words in training vocabulary with no word2vec vector"
    """

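    # walk the documents in sorted order, assigning consecutive entity, mention, and
    # tuple indices and recording which sentences each entity appears in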
    vocab_counts = np.zeros(len(vocab), dtype=int)
    entity_appearances = {}
    entity_index = 0
    mention_index = 0
    head_word_vocab_list = []
    head_word_entity_list = []
    head_phrase_vocab_list = []
    head_phrase_entity_list = []
    entity_text_mentions = {}
    t_i = 0
    for d_i, d in enumerate(docs):
        print d
        basename = os.path.basename(d)
        entity_appearances[basename] = {}
        element_list = valid_elements[d]
        entity_text_mentions[d] = {}
        for se in element_list:
            entity_text_mentions[d][entity_index] = {'sent_indices': [], 'token_indices': [], 'roles': []}
            entity_doc[entity_index] = d_i
            for appearance in se.appearances:
                entity_text_mentions[d][entity_index]['sent_indices'].append(appearance.tree_index)
                entity_text_mentions[d][entity_index]['token_indices'].append(appearance.token_index)
                for role, token, relation, pos, tuple_token_index in appearance.tuples:
                    tuple_entity[t_i] = entity_index
                    tuple_mention[t_i] = mention_index
                    tuple_role.append(role)
                    vocab_index = vocab.get_index(token)
                    tuple_vocab[t_i] = vocab_index
                    vocab_counts[vocab_index] += 1
                    t_i += 1
                    entity_text_mentions[d][entity_index]['roles'].append((role, token, appearance.tree_index, tuple_token_index))
                for token in appearance.valid_heads:
                    head_word_vocab_index = head_word_vocab.get_index(token)
                    head_word_vocab_list.append(head_word_vocab_index)
                    head_word_entity_list.append(entity_index)
                for token in appearance.valid_compound_heads:
                    head_phrase_vocab_index = head_phrase_vocab.get_index(token)
                    head_phrase_vocab_list.append(head_phrase_vocab_index)
                    head_phrase_entity_list.append(entity_index)
                # keep track of which document / sentences this entity appears in
                s_i = appearance.tree_index
                if s_i in entity_appearances[basename]:
                    entity_appearances[basename][s_i].append(entity_index)
                else:
                    entity_appearances[basename][s_i] = [entity_index]
                if len(appearance.tuples):
                    mention_entity[mention_index] = entity_index
                    mention_index += 1
            entity_index += 1

    # as initial testing for Gaussian LDA, export a small vector for each tuple
    """
    tuple_vectors = None
    if word2vec_file is not None:
        vec_size = 10
        tuple_vectors = np.zeros([n_tuples, vec_size])
        for v_i, v in enumerate(tuple_vocab):
            tuple_vectors[v_i, :] = vocab_vectors[v, :vec_size]
    """

    # export network data
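    # one record per entity: [doc index, entity index, list of per-appearance tuple lists]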
    rnn_data = []
    t_i = 0
    entity_index = 0
    mention_index = 0
    for d_i, d in enumerate(docs):
        element_list = valid_elements[d]
        for entity in element_list:
            appearance_list = []
            for appearance in entity.appearances:
                tuple_list = []
                head_word = appearance.head_word
                head_phrase = appearance.compound_word
                for role, token, relation, pos, tuple_token_index in appearance.tuples:
                    tuple_list.append((t_i, role, token, relation, head_word, pos, head_phrase))
                    t_i += 1
                if len(tuple_list) > 0:
                    appearance_list.append(tuple_list)
            rnn_data.append([d_i, entity_index, appearance_list])
            entity_index += 1

    output_filename = os.path.join(dirs.persona_dir, 'rnn_data.json')
    fh.write_to_json(rnn_data, output_filename, sort_keys=False)

    print len(docs), "valid documents"
    print entity_index, "entities"
    print t_i, "tuples"
    print len(vocab), "word types"
    print np.min(vocab_counts), np.max(vocab_counts), np.sum(vocab_counts)

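    # write the index arrays, vocabularies, and mention/appearance maps to the persona directory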
    output_filename = os.path.join(dirs.persona_dir, 'tuple_vocab.json')
    fh.write_to_json(list(tuple_vocab), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'tuple_role.json')
    fh.write_to_json(list(tuple_role), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'tuple_entity.json')
    fh.write_to_json(list(tuple_entity), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'tuple_mention.json')
    fh.write_to_json(list(tuple_mention), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'mention_entity.json')
    fh.write_to_json(list(mention_entity), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'entity_doc.json')
    fh.write_to_json(list(entity_doc), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'vocab.json')
    fh.write_to_json(vocab.index2token, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'docs.json')
    fh.write_to_json(list(docs), output_filename, sort_keys=False)

    #output_filename = os.path.join(dirs.persona_dir, 'article_map.json')
    #fh.write_to_json(list(article_mapping), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'head_word_vocab.json')
    fh.write_to_json(head_word_vocab.index2token, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'head_word_vocab_list.json')
    fh.write_to_json(head_word_vocab_list, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'head_word_entity_list.json')
    fh.write_to_json(head_word_entity_list, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'entity_appearances.json')
    fh.write_to_json(entity_appearances, output_filename, sort_keys=False)

    #if tuple_vectors is not None:
    #    output_filename = os.path.join(dirs.persona_dir, 'tuple_vectors.json')
    #    fh.write_to_json(tuple_vectors.tolist(), output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_vocab.json')
    fh.write_to_json(head_phrase_vocab.index2token, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_vocab_list.json')
    fh.write_to_json(head_phrase_vocab_list, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'head_phrase_entity_list.json')
    fh.write_to_json(head_phrase_entity_list, output_filename, sort_keys=False)

    output_filename = os.path.join(dirs.persona_dir, 'entity_text_mentions.json')
    fh.write_to_json(entity_text_mentions, output_filename, sort_keys=False)

    return filtered_indices, valid_elements