コード例 #1
0
def highlight(text, language="en", iknow=None):
    """Print ``text`` to stdout with each iKnow entity coloured and styled.

    The text is indexed with the given engine; every entity of every
    sentence is then printed, followed by a space, using:

    * colour: red for entities inside a "Negation" path attribute, cyan for
      entities inside a "Certainty" path attribute, black otherwise. When an
      entity is covered by several attributes, the attribute listed last
      wins;
    * style: bright for "Concept" entities, dim for "NonRelevant" and
      "PathRelevant" entities, normal otherwise.

    A blank line is printed after each sentence.

    Parameters
    ----------
    text : str
        The text to index and print.
    language : str
        Language code handed to ``iknow.index``.
    iknow : engine object, optional
        Engine exposing ``index(text, language)`` and ``m_index``. When
        omitted, a new ``iknowpy.iKnowEngine`` is created for this call.
        (The default used to be built once at definition time, which
        created an engine on import and shared it between all calls.)

    Side effects
    ------------
    Adds a ``'colour'`` key to the affected entity dictionaries of the
    sentences obtained from ``iknow.m_index``.
    """
    if iknow is None:
        iknow = iknowpy.iKnowEngine()

    iknow.index(text, language)

    attribute_colours = {"Negation": Fore.RED, "Certainty": Fore.CYAN}
    dimmed_types = ('NonRelevant', 'PathRelevant')

    for s in iknow.m_index['sentences']:

        # first figure out where the attribute spans are and tag those entities
        for a in s['path_attributes']:
            colour = attribute_colours.get(a['type'])
            if colour is None:
                # attribute types without an assigned colour are not tagged
                continue

            # path attributes are expressed as positions within s['path'],
            # which in turn keys into the s['entities'] array
            first = s['path'][a['pos']]
            last = s['path'][a['pos'] + a['span'] - 1]
            for ent in range(first, last + 1):
                s['entities'][ent]['colour'] = colour

        for e in s['entities']:
            colour = e.get('colour', Fore.BLACK)

            if e['type'] in dimmed_types:
                style = Style.DIM
            elif e['type'] == 'Concept':
                style = Style.BRIGHT
            else:
                style = Style.NORMAL

            print(colour + style + text[e['offset_start']:e['offset_stop']],
                  end=' ')

        print("\n")
コード例 #2
0
ファイル: iksimilarity.py プロジェクト: mgoldenisc/iknow
 def __init__(self, corpus_path, use_iknow_entities, tokenize_concepts):
     """Store the corpus location and the entity-handling options.

     Parameters
     ----------
     corpus_path : str
         Path to the corpus; ``self.is_dir`` records whether it is a
         directory according to ``os.path.isdir``.
     use_iknow_entities : bool
         When truthy, an ``iknowpy.iKnowEngine`` is created and stored as
         ``self.engine``.
     tokenize_concepts : bool
         Stored as ``self.tokenize_concepts``.
     """
     self.corpus_path = corpus_path
     self.is_dir = os.path.isdir(corpus_path)
     # Always record the flags: previously they were only assigned when
     # use_iknow_entities was truthy, so reading them on an instance built
     # with a falsy flag raised AttributeError.
     self.use_iknow_entities = use_iknow_entities
     self.tokenize_concepts = tokenize_concepts
     if use_iknow_entities:
         self.engine = iknowpy.iKnowEngine()
コード例 #3
0
ファイル: iksimilarity.py プロジェクト: mgoldenisc/iknow
    def synonym_dict_from_file(self,
                               source_text,
                               use_iknow_entities=True,
                               num_similar=5):
        """Build a dictionary of synonyms for each term found in a text file.

        Every distinct term of the file is looked up once through
        ``self.most_similar``. Terms for which that lookup raises
        ``KeyError`` are left out of the result.

        Parameters
        --------------
        source_text (str) - The path to a file containing the source text

        use_iknow_entities (bool) - whether to find synonyms for iKnow entities
        (as opposed to whitespace-separated words). Entities of type
        'PathRelevant' and 'NonRelevant' are ignored. The file is indexed line
        by line as English ('en').

        num_similar (int) - Number of similar terms requested for each term in
        the source text. Higher num_similar ~ less strict similarity, lower
        num_similar ~ more strict similarity

        Returns
        --------------
        a dictionary mapping each entity or word to the value returned by
        ``self.most_similar``; when num_similar is 1 that value is wrapped in
        a one-element list.

        NOTE: Right now, using iKnow entities will only check for synonyms of
        the iKnow entities, not for their individual components. So it is one
        or the other.
        """
        dictionary = {}

        def add_term(term):
            # Each term is resolved at most once.
            if term in dictionary:
                return
            try:
                similar = self.most_similar(term, num_similar=num_similar)
            except KeyError:
                # No entry available for this term: skip it.
                return
            dictionary[term] = [similar] if num_similar == 1 else similar

        if use_iknow_entities:
            # index the source with iknow entities
            engine = iknowpy.iKnowEngine()
            # 'with' guarantees the file is closed, also on errors
            with open(source_text, 'r') as source:
                for line in source:
                    engine.index(line, 'en')
                    for s in engine.m_index['sentences']:
                        for e in s['entities']:
                            if e['type'] in ('PathRelevant', 'NonRelevant'):
                                continue
                            add_term(e['index'])
        else:
            # use words instead of entities
            with open(source_text, 'r') as source:
                for line in source:
                    # split() without an argument drops the trailing newline
                    # and empty tokens, which split(' ') kept as bogus "words"
                    for word in line.split():
                        add_term(word)
        return dictionary
コード例 #4
0
ファイル: strip_negation.py プロジェクト: rsi7700/iknow
def strip_negation(text, language="en", iknow=None):
    """Return ``text`` with every entity inside a negation span removed.

    The text is indexed with the given engine. Entities covered by a
    "Negation" path attribute are dropped; the text of every remaining
    entity is emitted in order, each followed by a single space (so a
    non-empty result ends with a trailing space).

    Parameters
    ----------
    text : str
        The text to index and filter.
    language : str
        Language code handed to ``iknow.index``.
    iknow : engine object, optional
        Engine exposing ``index(text, language)`` and ``m_index``. When
        omitted, a new ``iknowpy.iKnowEngine`` is created for this call.
        (The default used to be built once at definition time, which
        created an engine on import and shared it between all calls.)

    Returns
    -------
    str
        The concatenated text of all non-negated entities.

    Side effects
    ------------
    Adds a ``'neg'`` key to the negated entity dictionaries of the
    sentences obtained from ``iknow.m_index``.
    """
    if iknow is None:
        iknow = iknowpy.iKnowEngine()

    iknow.index(text, language)
    kept = []

    for s in iknow.m_index['sentences']:

        # first figure out where negation spans are and tag those entities
        for a in s['path_attributes']:
            if a['type'] != "Negation":
                continue

            # path attributes are expressed as positions within s['path'],
            # which in turn keys into the s['entities'] array
            first = s['path'][a['pos']]
            last = s['path'][a['pos'] + a['span'] - 1]
            for ent in range(first, last + 1):
                s['entities'][ent]['neg'] = 1

        kept.extend(text[e['offset_start']:e['offset_stop']]
                    for e in s['entities'] if "neg" not in e)

    # join once instead of growing a string with repeated +=
    return "".join(fragment + " " for fragment in kept)
コード例 #5
0
ファイル: genRAW-with-udct.py プロジェクト: mgoldenisc/iknow
def collect_files_recursive(in_path_par):
    """Append the path of every ``.txt`` file under ``in_path_par`` to ``f_rec``.

    The directory itself is scanned first, then each of its sub-directories
    is processed by a recursive call, in the order ``walk`` lists them.

    Parameters
    ----------
    in_path_par : str
        Directory to scan. A trailing path separator is no longer required:
        paths are built with ``os.path.join`` rather than by plain string
        concatenation, which produced wrong paths (e.g. ``"dirfile.txt"``)
        when the separator was missing.

    Side effects
    ------------
    Appends to the module-level list ``f_rec``; nothing is returned.
    """
    import os  # local import: only `walk` is known to be in scope in this file

    for (dirpath, dirnames, filenames) in walk(in_path_par):
        for single_file in filenames:
            if single_file.endswith('.txt'):
                f_rec.append(os.path.join(dirpath, single_file))
        for single_dir in dirnames:
            collect_files_recursive(os.path.join(dirpath, single_dir))
        # Only the first (top-level) entry yielded by walk is consumed here;
        # deeper levels are handled by the recursive calls above.
        break


# Fill f_rec with the .txt files found under in_path_par (both names are
# defined elsewhere in this file).
collect_files_recursive(in_path_par)

# Module-level iKnow engine instance.
# NOTE(review): presumably shared by the processing code further down - confirm.
engine = iknowpy.iKnowEngine()


def read_udct_file(file_, udct_):
    f_udct = open(file_, "r", True, "utf8")
    for txt_line in f_udct:
        # print('txt_line: ' + txt_line)
        txt_line = txt_line.rstrip()

        if ',' in txt_line and txt_line[0:2] != '/*':
            txt_list = txt_line.split(',')
            lexrep, action = txt_list[0], txt_list[1]
            if (lexrep[0] == '@'):
                literal = lexrep[1:]
                if action == "UDCertainty":
                    level = txt_list[2]
コード例 #6
0
ファイル: strip_negation.py プロジェクト: rsi7700/iknow
        for e in s['entities']:
            if "neg" in e:
                continue
            stripped += text[e['offset_start']:e['offset_stop']] + " "

    return stripped


# command-line processing
import sys, glob

# The optional second command-line argument selects the language; English
# ("en") is used when it is absent.
lang = sys.argv[2] if len(sys.argv) > 2 else "en"
iknow = iknowpy.iKnowEngine()

# The first command-line argument is a file pattern. Every matching file is
# read line by line and the negation-stripped line is written straight to
# stdout, so the output can be piped.
# Wrap file patterns in quotes, otherwise the shell expands them before they
# reach Python.
# usage:
#    $ python strip_negation.py test.txt
#    $ python strip_negation.py test.txt "fr"
#    $ python strip_negation.py "*.txt" | grep fix
for path in glob.glob(sys.argv[1]):
    with open(path, 'r') as file:
        for line in file:
            print(strip_negation(line, lang, iknow))

#
# variation: this reads piped text straight from stdin
# usage