Code Example #1
    def __stanford_openie(self, input, output, verbose=False):
        with open(input, 'r') as input_file:
            contents = input_file.read()

        if verbose:
            print('Searching for triples using Stanford OpenIE ...')

        nlp = CoreNLPWrapper()
        annotated = nlp.annotate(
            contents,
            properties={
                'annotators':
                'tokenize, ssplit, pos, ner, depparse, parse, openie'
            })

        # Open the output file once; the with block handles closing.
        with open(output, 'a') as output_file:
            for sentence in annotated['sentences']:
                for openie in sentence['openie']:
                    triple = Triple(sentence['index'],
                                    NLPUtils.adjust_tokens(openie['subject']),
                                    openie['relation'],
                                    NLPUtils.adjust_tokens(openie['object']))
                    if verbose:
                        print(triple.to_string())
                    output_file.write(triple.to_string() + '\n')

        return output
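
All of the snippets on this page rely on a project-internal Triple class. A minimal sketch of what it plausibly looks like, inferred only from how it is used here (a constructor taking a sentence index, subject, relation, and object, plus a to_string() serializer); the tab-separated record format is an assumption, and the real KGen class may differ:

    class Triple:
        """Hypothetical reconstruction of KGen's Triple, for illustration."""

        def __init__(self, sentence_index, subject, relation, object_):
            self.sentence_index = sentence_index
            self.subject = subject
            self.relation = relation
            self.object_ = object_

        def to_string(self):
            # One record: index, subject, relation, object (format assumed).
            return '{}\t{}\t{}\t{}'.format(self.sentence_index, self.subject,
                                           self.relation, self.object_)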
Code Example #2
    def __clausie(self, input, output, verbose=False):
        with open(input, 'r') as input_file:
            contents = input_file.read()

        if verbose:
            print('Searching for triples using ClausIE ...')

        input_clausie = os.path.splitext(input)[0] + '_clausie_input.txt'
        open(input_clausie, 'w').close()

        print('Preparing contents to be processed by ClausIE at {}'.format(
            input_clausie))

        nlp = CoreNLPWrapper()
        annotated = nlp.annotate(
            contents, properties={'annotators': 'tokenize, ssplit, pos'})

        for sentence in annotated['sentences']:
            sent_str = ''
            for token in sentence['tokens']:
                # Attach possessive markers ("'s") to the preceding word by
                # dropping the trailing space before appending them.
                if token['pos'] == 'POS':
                    sent_str = sent_str.rstrip()

                sent_str += token['word'] + ' '

            with open(input_clausie, 'a') as clausie_file:
                clausie_file.write(
                    str(sentence['index']) + '\t' + sent_str.strip() + '\n')

        clausie_out = ClausIEWrapper.run_clausie(input_clausie, output,
                                                 verbose)

        os.remove(input_clausie)

        # We need to do some adjustments to the output.
        final_contents = ""
        with open(clausie_out, 'r') as clausie_out_file:
            for line in clausie_out_file:
                if not line.strip():
                    continue

                fields = line.replace('"', '').split('\t')
                triple = Triple(fields[0].strip(),
                                NLPUtils.adjust_tokens(fields[1].strip()),
                                fields[2].strip(),
                                NLPUtils.adjust_tokens(fields[3].strip()))
                if verbose:
                    print(triple.to_string())

                final_contents += triple.to_string() + '\n'

        final_file = open(clausie_out, "w")
        n = final_file.write(final_contents)
        final_file.close()

        return final_file
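
The adjustment loop above expects one tab-separated line per ClausIE extraction, with the sentence index first and the subject, relation, and object wrapped in double quotes. A standalone sketch of that parsing step on a hypothetical output line (the exact quoting can vary by ClausIE version and flags):

    raw = '3\t"the cat"\t"sat on"\t"the mat"\n'  # hypothetical ClausIE line
    fields = raw.replace('"', '').split('\t')
    index, subject, relation, object_ = (f.strip() for f in fields)
    print(index, subject, relation, object_)  # 3 the cat sat on the mat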
Code Example #3
File: csontologywrapper.py Project: ncirisaif/KGen
    def annotate(self, contents, verbose=False):
        candidates = NLPUtils.extract_candidate_entities(
            contents, grammar=False, stopwords=True).union(
                NLPUtils.extract_candidate_relations(contents))

        matches = self.find_matches(candidates, verbose)
        annotations = list()
        for match in matches:
            annotations.append({
                'instance': match,
                'link': self._links[matches[match]['matched']]
            })

        return annotations
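
A hedged illustration of the data shapes this loop implies: find_matches() appears to map each matched surface form to a dict whose 'matched' value indexes into self._links. The names and values below are invented, not taken from KGen:

    matches = {'neural network': {'matched': 'http://example.org/cso#NN'}}
    _links = {'http://example.org/cso#NN': 'cso:neural_network'}
    annotations = [{'instance': m, 'link': _links[matches[m]['matched']]}
                   for m in matches]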
Code Example #4
File: preprocessor.py Project: ping543f/KGen
    def preprocess(self, input_filename, verbose=False):
        if not input_filename.startswith('/'):
            input_filename = os.path.dirname(
                os.path.realpath(__file__)) + '/' + input_filename

        print('Processing text from {}'.format(input_filename))
        with open(input_filename, 'r') as input_file:
            contents = input_file.read()

        coref_resolver = CorefResolver(contents)
        abbrev_resolver = AbbrevResolver(coref_resolver.resolve(verbose))
        simplified_contents = Simplifier(
            NLPUtils.adjust_tokens(
                abbrev_resolver.resolve(verbose))).simplify(verbose)

        output_filename = os.path.splitext(
            input_filename)[0] + '_preprocessed.txt'
        with open(output_filename, 'w') as output_file:
            output_file.write(simplified_contents)

        print('Preprocessed text stored at {}'.format(output_filename))

        return output_filename
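
A usage sketch for the method above; the enclosing class name (Preprocessor) and the input file name are assumptions, not taken from the snippet:

    preprocessor = Preprocessor()  # assumed enclosing class name
    output_path = preprocessor.preprocess('sample_abstract.txt', verbose=True)
    # output_path ends in '_preprocessed.txt', next to the resolved input.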
Code Example #5
    def link(self,
             input_filename,
             k_base='babelfy',
             umls=False,
             verbose=False):
        if not input_filename.startswith('/'):
            input_filename = os.path.dirname(
                os.path.realpath(__file__)) + '/' + input_filename

        print('Processing text from {}'.format(input_filename))

        with open(input_filename, 'r') as input_file:
            contents = input_file.read()

        if umls:
            prefixes, entities, relations, links = KnowledgeBases(
                k_base).query(contents, verbose)

        else:
            entities, relations = NLPUtils.extract_entities_and_relations(
                contents)
            prefixes, links = KnowledgeBases(k_base).annotate(
                contents, verbose)

        output_filename = os.path.splitext(input_filename)[0] + '_links.txt'
        open(output_filename, 'w').close()  # Clean the file in case it exists

        with open(output_filename, 'a') as output_file:
            for key in prefixes.keys():
                output_file.write('@PREFIX\t{}:\t<{}>\t\n'.format(
                    prefixes[key], key))

            for key in relations:
                if key in links.keys():
                    output_file.write('@LINK\t{}\t{}\t\n'.format(
                        key, links[key]))
                else:
                    output_file.write(
                        '@LINK\t{0}\tno_match\tnot_found\t{0}\t\n'.format(key))

            for key in entities:
                if key in links.keys():
                    output_file.write('@LINK\t{}\t{}\t\n'.format(
                        key, links[key]))
                else:
                    output_file.write(
                        '@LINK\t{0}\tno_match\tnot_found\t{0}\t\n'.format(key))

        print('Linked entities were stored at {}'.format(output_filename))

        return output_filename
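
The resulting _links.txt interleaves tab-separated @PREFIX and @LINK records, each with a trailing tab. A minimal reader for that layout, assuming only what the write calls above guarantee (the file name is hypothetical):

    prefixes, linked = {}, {}
    with open('sample_links.txt', 'r') as links_file:  # hypothetical name
        for record in links_file:
            fields = record.rstrip('\t\n').split('\t')
            if fields[0] == '@PREFIX':
                # '@PREFIX', 'prefix:', '<uri>'
                prefixes[fields[1].rstrip(':')] = fields[2].strip('<>')
            elif fields[0] == '@LINK':
                # '@LINK', surface form, link ('no_match' when unresolved)
                linked[fields[1]] = fields[2]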
Code Example #6
File: linker.py Project: cemeiq/KGen
    def link(self, input_filename, k_base='babelfy', verbose=False):
        if not input_filename.startswith('/'):
            input_filename = os.path.dirname(
                os.path.realpath(__file__)) + '/' + input_filename

        print('Processing text from {}'.format(input_filename))

        with open(input_filename, 'r') as input_file:
            contents = input_file.read()

        prefixes, links = KnowledgeBases(k_base).annotate(contents, verbose)

        np_entities, verbs = NLPUtils.extract_np_and_verbs(contents)
        entities_linked = self.__associate_np_to_entities(np_entities, links)
        verbs_linked = self.__associate_verbs_to_entities(verbs, links)

        output_filename = os.path.splitext(input_filename)[0] + '_links.txt'
        open(output_filename, 'w').close()  # Clean the file in case it exists

        with open(output_filename, 'a') as output_file:
            for key in prefixes.keys():
                output_file.write('@PREFIX\t{}:\t<{}>\n'.format(
                    prefixes[key], key))

            for key in verbs_linked.keys():
                # Write keys as text; encoding them here would put b'...'
                # literals into the output under Python 3.
                output_file.write('@PREDICATE\t{};{}\n'.format(
                    key, verbs_linked[key]))

            for key in entities_linked.keys():
                output_file.write('@ENTITY\t{};{}\n'.format(
                    key, entities_linked[key]))
        print('Linked entities were stored at {}'.format(output_filename))

        return output_filename
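
This linker variant writes @PREDICATE and @ENTITY records instead, with a ';' between surface form and link. A minimal reader for that layout, again assuming only the write calls above (the file name is hypothetical):

    predicates, entities = {}, {}
    with open('sample_links.txt', 'r') as links_file:  # hypothetical name
        for record in links_file:
            tag, _, rest = record.rstrip('\n').partition('\t')
            if tag in ('@PREDICATE', '@ENTITY'):
                surface, _, link = rest.partition(';')
                target = predicates if tag == '@PREDICATE' else entities
                target[surface] = link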
Code Example #7
    def __senna(self, input_filename, output_filename, verbose=False):
        if verbose:
            print('Performing Semantic Role Labeling with SENNA...')

        senna = SennaWrapper()

        out_contents = ''
        with open(input_filename, 'r') as input_file:
            sentence_number = 0
            for line in input_file.readlines():
                # Skip blank lines; readlines() keeps the trailing newline.
                if not line.strip():
                    continue

                senna_output = senna.srl(NLPUtils.adjust_tokens(line), verbose=False)
                for predicate in senna_output.keys():
                    dict_contents = senna_output[predicate]
                    agent = None
                    patient = None

                    if 'A0' in dict_contents and 'A1' in dict_contents:
                        agent = dict_contents['A0']
                        patient = dict_contents['A1']

                    elif 'A0' in dict_contents: # No A1
                        agent = dict_contents['A0']
                        if 'A2' in dict_contents:
                            patient = dict_contents['A2']
                        else:
                            for key in dict_contents.keys():
                                if not key == 'A0':
                                    patient = dict_contents[key]

                    elif 'A1' in dict_contents: # No A0
                        patient = dict_contents['A1']
                        if 'A2' in dict_contents:
                            agent = dict_contents['A2']
                        else:
                            for key in dict_contents.keys():
                                if not key == 'A1':
                                    agent = dict_contents[key]

                    else: # Neither A0 nor A1
                        if 'A2' in dict_contents:
                            agent = dict_contents['A2']
                            for key in dict_contents.keys():
                                if not key == 'A2':
                                    patient = dict_contents[key]
                        else: # Very unlikely
                            # Sort labels by string length; dict views have
                            # no sort() method, so build a sorted list.
                            key_lst = sorted(dict_contents.keys(), key=len)
                            agent = dict_contents[key_lst[0]]
                            patient = dict_contents[key_lst[1]]

                    if agent is None or patient is None:
                        print('-Warning: No agent or patient determined for predicate {}'.format(predicate))
                        print('-- agent: {}'.format(agent))
                        print('-- patient: {}'.format(patient))
                        continue

                    triple = Triple(sentence_number, agent, predicate, patient)

                    if verbose:
                        print(triple.to_string())

                    out_contents += triple.to_string() + '\n'

                sentence_number += 1

        with open(output_filename, 'w') as output_file:
            output_file.write(out_contents)

        return output_filename
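
The agent/patient selection above presumes that senna.srl() returns a dict mapping each predicate to a dict of PropBank-style argument labels to text spans. A hypothetical return value, for illustration only:

    # Hypothetical SennaWrapper.srl() result for
    # "The committee approved the plan":
    senna_output = {
        'approved': {
            'A0': 'The committee',  # agent
            'A1': 'the plan',       # patient
        }
    }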
Code Example #8
    def __senna(self, input_filename, output_filename, verbose=False):
        if verbose:
            print('Performing Semantic Role Labeling with SENNA...')

        senna = SennaWrapper()

        out_contents = ''
        with open(input_filename, 'r') as input_file:
            sentence_number = 0
            for line in input_file.readlines():
                # Skip blank lines; readlines() keeps the trailing newline.
                if not line.strip():
                    continue

                dependency_list = NLPUtils.dependency_parse(
                    line,
                    deps_key='enhancedPlusPlusDependencies',
                    verbose=verbose)

                previous_term = ''
                previous_compound = ''
                dict_basic_to_most_specific = {}
                connective_dependencies = []
                while len(dependency_list) > 0:
                    elem = dependency_list.pop()

                    if (elem[1] in ['ROOT', 'punct', 'det']
                            or 'subj' in elem[1] or 'obj' in elem[1]):
                        continue

                    if (elem[1] in ['compound', 'nmod:poss', 'aux', 'neg']
                            or elem[1].endswith('mod')):
                        if previous_term == elem[0]:
                            updated_term = '{} {}'.format(
                                elem[2], previous_compound)
                        else:
                            updated_term = '{} {}'.format(elem[2], elem[0])
                            previous_compound = elem[0]
                        dict_basic_to_most_specific[elem[0]] = updated_term

                        triple = Triple(sentence_number, updated_term,
                                        'rdfs:subClassOf', previous_compound)

                        previous_compound = updated_term
                        previous_term = elem[0]

                        if verbose:
                            print(triple.to_string())

                        out_contents += triple.to_string() + '\n'

                    elif (elem[1] in ['acl', 'appos']
                          or elem[1].startswith('nmod:')):
                        connective_dependencies.append(elem)

                while len(connective_dependencies) > 0:
                    elem = connective_dependencies.pop()

                    if elem[1] == 'nmod:poss':
                        continue

                    if elem[1].find(':') > 0:  # e.g. 'nmod:of'
                        connector = elem[1][elem[1].find(':') + 1:]
                    elif elem[1] in ['acl', 'appos']:
                        connector = ''
                    else:
                        connector = elem[1]

                    first = elem[0]
                    if first in dict_basic_to_most_specific.keys():
                        first = dict_basic_to_most_specific[first]

                    second = elem[2]
                    if second in dict_basic_to_most_specific.keys():
                        second = dict_basic_to_most_specific[second]

                    if connector == '':
                        full = '{} {}'.format(first, second)
                    else:
                        full = '{} {} {}'.format(first, connector, second)

                    triple = Triple(
                        sentence_number, full,
                        'local:{}_{}'.format(connector,
                                             second.replace(' ', '')), first)
                    if verbose:
                        print(triple.to_string())
                    out_contents += triple.to_string() + '\n'

                    triple = Triple(
                        sentence_number, full,
                        'local:{}_{}'.format(first.replace(' ', ''),
                                             connector), second)
                    if verbose:
                        print(triple.to_string())
                    out_contents += triple.to_string() + '\n'

                    dict_basic_to_most_specific[elem[0]] = full

                senna_output = senna.srl(line, verbose=False)
                for predicate in senna_output.keys():
                    pred_args = senna_output[predicate]
                    pred_arg_names = NLPUtils.get_verbnet_args(
                        predicate, verbose=verbose)
                    if len(pred_arg_names) < 1:
                        print(
                            'WARNING -- Unable to retrieve predicate arg names for "{}"'
                            .format(predicate))

                    if verbose:
                        print('predicate: {}, args: {}'.format(
                            predicate, pred_args))

                    for pred_arg in pred_args:
                        if 'AM-NEG' == pred_arg:
                            predicate = 'not {}'.format(predicate)
                        elif 'AM-MOD' == pred_arg:
                            predicate = ' '.join(
                                [pred_args['AM-MOD'].strip(), predicate])
                        elif pred_arg.startswith('AM-'):
                            # Remove initial stopwords (e.g. determiners)
                            s = pred_args[pred_arg].strip()
                            split = s.split(' ', 1)
                            if len(split) > 1 and NLPUtils.is_stopword(split[0]):
                                s = split[1]

                            triple = Triple(sentence_number, predicate,
                                            'local:{}'.format(pred_arg), s)
                            if verbose:
                                print(triple.to_string())

                            out_contents += triple.to_string() + '\n'

                    for i in range(len(pred_arg_names)):
                        pred_args_index = 'A{}'.format(i)
                        if pred_args_index in pred_args:
                            # Remove initial stopwords (e.g. determiners)
                            s = pred_args[pred_args_index].strip()
                            split = s.split(' ', 1)
                            if len(split) > 1 and NLPUtils.is_stopword(split[0]):
                                s = split[1]

                            triple = Triple(
                                sentence_number, predicate,
                                'vn.role:{}'.format(pred_arg_names[i]), s)
                            if verbose:
                                print(triple.to_string())

                            out_contents += triple.to_string() + '\n'

                sentence_number += 1

        with open(output_filename, 'w') as output_file:
            output_file.write(out_contents)

        return output_filename
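
The dependency loop above presumes that NLPUtils.dependency_parse() yields (governor, relation, dependent) tuples over surface words: compound dependents are folded in front of their governor, and nmod:X edges become connective dependencies. A hypothetical value for the phrase "machine learning of graphs", for illustration only:

    # Hypothetical NLPUtils.dependency_parse() result; each entry is
    # (governor word, relation, dependent word).
    dependency_list = [
        ('learning', 'compound', 'machine'),  # folds into 'machine learning'
        ('learning', 'nmod:of', 'graphs'),    # becomes a connective edge
    ]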