Code example #1
File: predict.py  Project: ShengleiH/NLP_Project
# Imports assumed from the guillaumegenthial/sequence_tagging project layout,
# which this code appears to follow (example #6 below imports the same modules)
from model.config import Config
from model.data_utils import CoNLLDataset
from model.ner_model import NERModel


def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # create dataset
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    # run prediction on the test set
    model.predict(test)
Code example #2
def train_active(train, dev, test, select, config, modename):
    """
    Input: train set, dev set, test set, selection set, config, mode name
    Output: accuracy on the dev and test sets, predictions on the selection set
    Selects the most and least certain examples from the selection set.
    """
    # build model
    #tf.reset_default_graph()
    #gc.collect()
    #tf.get_variable_scope().reuse_variables()
    model = NERModel(config)
    model.build()
    print("Start training model...")
    print("Training size ", len(train))
    model.train(train, dev)

    # restore session
    model.restore_session(config.dir_model)

    # evaluate
    print("===Evaluating on test set:===")
    mode = "test" + modename
    model.evaluate(test, mode)

    # run on the selection set
    print("Selecting samples for active learning...")
    if len(select) == 0:
        return []
    scores = []
    for sent in select:
        output = model.predict(sent[0])
        # output[1][0] is taken as the per-sentence certainty score
        # (an assumption based on how the value is used here)
        scores.append(output[1][0])
    # TODO (from the original "sort l" note): sort the scores so the most
    # and least certain examples can be picked out
    return scores
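The docstring above implies this routine feeds an outer active-learning loop. A minimal sketch of such a driver follows; the round count, batch size, list-based datasets, and the assumption that lower scores mean lower certainty are all hypothetical, not part of the project.

# Hypothetical driver loop for train_active (sketch only).
# Assumes train/dev/test/select are lists of examples and that the returned
# scores align one-to-one with `select` (lower score = less certain, assumed).
NUM_ROUNDS = 5  # assumed number of active-learning rounds
BATCH = 50      # assumed number of examples moved per round

for round_id in range(NUM_ROUNDS):
    config = Config()
    scores = train_active(train, dev, test, select, config, str(round_id))
    if not scores:
        break
    # rank selection examples by certainty and move the BATCH most
    # uncertain ones into the training set
    ranked = sorted(range(len(select)), key=lambda i: scores[i])
    chosen = set(ranked[:BATCH])
    train += [select[i] for i in chosen]
    select = [ex for i, ex in enumerate(select) if i not in chosen]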
Code example #3
File: predict.py  Project: marlesson/sequence_tagging
# Assumed imports (same project layout as example #1)
from model.config import Config
from model.ner_model import NERModel


def main(args):
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # predict on the input sentence
    print(model.predict(args.sentence))
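main(args) expects an object with a sentence attribute; a minimal entry point in the style this file suggests might look like the sketch below. The CLI shape (a positional, multi-token sentence argument) is an assumption.

# Hypothetical CLI wrapper for example #3; the argument shape is an assumption.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Tag a sentence with the NER model")
    parser.add_argument("sentence", nargs="+", help="whitespace-separated tokens to tag")
    main(parser.parse_args())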
Code example #4
import json

# Assumed imports (same project layout as example #1)
from model.config import Config
from model.ner_model import NERModel


def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # predict
    path = "data-sequence-tagging/QA4IE-benchmark/"
    file_name_list = [
        "ie_test/0-400/ie_test.span",
        "seq/0-400/dev.seq",
        "seq/0-400/test.seq",
        "seq/0-400/train.seq",
        "seq/400-700/dev.seq",
        "seq/400-700/test.seq",
        "seq/400-700/train.seq",
        "seq/700-/dev.seq",
        "seq/700-/test.seq",
        "seq/700-/train.seq",
        "span/0-400/dev.span",
        "span/0-400/test.span",
        "span/0-400/train.span",
        "span/400-700/dev.span",
        "span/400-700/test.span",
        "span/400-700/train.span",
        "span/700-/dev.span",
        "span/700-/test.span",
        "span/700-/train.span"
    ]

    for file_name in file_name_list:
        with open(path + file_name + ".json", 'r') as ifs, \
                open(path + file_name + ".ner", 'w') as ofs:
            dataset = json.load(ifs)
            for index, passage in enumerate(dataset['data']):
                # start of one passage
                ofs.write('#' + str(index) + "\n\n")
                for paragraph in passage['paragraphs']:
                    context = paragraph['context']
                    word_list = context.split(' ')
                    preds = model.predict(word_list)
                    ofs.write('\n'.join(preds) + '\n\n')
                ofs.write('\n')
        print("successfully predicted " + file_name + '\n')
Code example #5
# Assumed imports (same project layout as example #1)
from nltk.tokenize import word_tokenize

from model.config import Config
from model.ner_model import NERModel

# ANSI colors per entity type: PESSOA (person) is blue, TEMPO (time) is green,
# LOCAL (place) is yellow, ORGANIZACAO (organization) is red,
# JURISPRUDENCIA (case law) is magenta, LEGISLACAO (legislation) is cyan
bcolors = {
    "PESSOA": '\033[94m',
    "TEMPO": '\033[92m',
    "LOCAL": '\033[93m',
    "ORGANIZACAO": '\033[91m',
    "JURISPRUDENCIA": '\033[35m',
    "LEGISLACAO": '\033[36m',
    "ENDC": '\033[0m',
    "O": ""
}

# create instance of config
config = Config()

# build model
model = NERModel(config)
model.build()
model.restore_session(config.dir_model)

while True:
    # prompt (Portuguese): "Write the sentence to be analyzed:"
    words = input("Escreva frase a ser analisada: ")
    words = word_tokenize(words, language='portuguese')
    preds = model.predict(words)
    for index, word in enumerate(words):
        if preds[index][0:2] in ['B-', 'I-', 'E-', 'S-']:
            preds[index] = preds[index][2:]
        print(bcolors[preds[index]] + word + bcolors["ENDC"], end=' ')
    print('\n')
Code example #6
import nltk as _nltk  # assumed module-level import; process_doc below relies on _nltk


class _EntityBase:
    def __init__(self, load_lstm):

        import sys

        if load_lstm:
            sys.path.append('/home/rbshaffer/sequence_tagging')

            from model.ner_model import NERModel
            from model.config import Config
            config = Config()

            # build model
            self.model = NERModel(config)
            self.model.build()
            self.model.restore_session(config.dir_model)

    def get_chunks(self, parsed):
        return []

    def do_entity_extraction(self, parsed):
        """ Somewhat complex function to actually do the entity extraction. """

        import networkx as nx
        import textwrap
        from numpy import mean

        chunks = self.get_chunks(parsed)

        def total_edge_count(count_obj, total_counter=0):
            """ Sub-function to calculate total number of edges in a container. """

            if count_obj:
                ceiling = count_obj.pop(next(iter(count_obj)))  # Python 3: dict views are not indexable
                total_counter += sum([min(ceiling, count_obj[c]) for c in count_obj])
                total_counter = total_edge_count(count_obj, total_counter)

            return total_counter

        def observed_edge_count(raw_obj):
            """ Sub-function to calculate the observed number of edges in a container. """

            observed_counter = 0

            for chunk_obj in raw_obj:
                chunk_entities = {e: chunk_obj.count(e) for e in set(chunk_obj)}
                observed_counter += total_edge_count(chunk_entities)

            return observed_counter
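        # Worked example (added for clarity): for raw_obj = [['A', 'A', 'B'], ['A', 'C']],
        # chunk 1 has counts {'A': 2, 'B': 1} -> min(2, 1) = 1 edge, and
        # chunk 2 has counts {'A': 1, 'C': 1} -> min(1, 1) = 1 edge,
        # so observed_edge_count returns 2.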

        # container to store all entities extracted, for matching use in-string
        # maybe consider shifting this inside the loop to only match in-chunk?
        # though note that the output generator currently depends on this
        all_entities = []

        # output container
        out = []

        # iterate over units of analysis, as defined in country-specific functions
        for chunk in chunks:
            entity_strings = []

            sentences = self.process_doc(chunk)

            for sent in sentences:
                entities = []
                tags = self.model.predict(sent)

                for i, t in enumerate(tags):
                    if t == 'B-MISC':
                        entities.append([sent[i]])
                    elif t == 'I-MISC' and len(entities) > 0:
                        # this condition shouldn't be necessary - need to figure out why this is happening
                        entities[-1].append(sent[i])

                new_entities = [' '.join(e) for e in entities]
                new_entities = ['\n'.join(textwrap.wrap(e.strip(), 20)) for e in new_entities]

                entity_strings += new_entities
                all_entities += new_entities

            out.append(entity_strings)

        # get the actual output
        entities_count = {e: all_entities.count(e) for e in set(all_entities)}

        out = [[e for e in row if e in entities_count] for row in out]

        edges = {}
        for chunk in out:
            if len(set(chunk)) > 1:
                entities = list(set(chunk))
                for i in range(len(entities)):
                    for j in range(i+1, len(entities)):
                        e1 = entities[i]
                        e2 = entities[j]

                        if (e1, e2) in edges:
                            edges[(e1, e2)] += min(chunk.count(e1), chunk.count(e2))
                        elif (e2, e1) in edges:
                            edges[(e2, e1)] += min(chunk.count(e1), chunk.count(e2))
                        else:
                            edges[(e1, e2)] = min(chunk.count(e1), chunk.count(e2))

        edges = [k + (w,) for k, w in edges.items()]  # Python 3: iteritems() -> items()

        if entities_count:
            graph = nx.Graph()
            for u, v, w in edges:
                graph.add_edge(u, v, weight=w)

            # networkx >= 2.0: degree() yields (node, degree) pairs, not a dict
            degree = [d for _, d in graph.degree(weight='weight')]
            average_degree = mean(degree) if degree else 0

            # count_zeros=True includes nodes with zero clustering in the average
            try:
                clustering_coeff = nx.average_clustering(graph, weight='weight', count_zeros=True)
            except ZeroDivisionError:
                clustering_coeff = 0

        else:
            graph = None
            clustering_coeff = None
            average_degree = None

        total_nodes = len(set(all_entities))
        total_edges = sum([e[2] for e in edges])

        return {'graph': graph, 'edges': edges, 'total_nodes': total_nodes, 'clustering': clustering_coeff,
                'total_edges': total_edges, 'average_degree': average_degree}

    @staticmethod
    def process_doc(document):

        sentences = _nltk.sent_tokenize(document)
        sentences = [_nltk.word_tokenize(sent) for sent in sentences]

        return sentences
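For orientation, _EntityBase appears designed to be subclassed with a corpus-specific get_chunks that defines the units of analysis. A hedged usage sketch follows; the subclass name, the paragraph-splitting rule, and the file path are illustrative assumptions, not part of the source project.

# Hypothetical subclass of _EntityBase (sketch only).
class ParagraphEntityExtractor(_EntityBase):
    def get_chunks(self, parsed):
        # assumed unit of analysis: one chunk per blank-line-separated paragraph
        return [p for p in parsed.split('\n\n') if p.strip()]

extractor = ParagraphEntityExtractor(load_lstm=True)
with open('document.txt') as f:  # 'document.txt' is a placeholder path
    results = extractor.do_entity_extraction(f.read())
print(results['total_nodes'], results['total_edges'], results['average_degree'])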