Example no. 1
def query_frog_sentence(words):
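    # Relies on module-level globals from the original source file: have_frog, frog (a Frog instance), verbose, and an `import sys`.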
    if have_frog:
        try:
            frog_out = frog.process(words)
            if verbose:
                print("frog_out", frog_out)
            return frog_out
        except Exception:
            print("Unexpected Frog error:", sys.exc_info()[0])
            sys.exit(1)
    return None
Example no. 2
def find_keywords_and_groups(text, keywords, frog):
    """Returns a list of the keywords and a list of associated groups that occur in tokens."""
    tokens = frog.process(text)  # a list of dictionaries with frog's analysis per token
    kw = []
    groups = []
    for t in tokens:
        lemma = t["lemma"]
        g = keywords.get(lemma.lower(), None)
        if g is not None:
            kw.append(lemma)
            groups += g
    return list(set(kw)), list(set(groups))
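# Hypothetical usage sketch (the example data and `frog_instance` are assumptions, not from the original project);
# the lookups above imply that `keywords` maps a lowercased lemma to a list of group labels:
#
#     keywords = {"fiets": ["vervoer"], "school": ["onderwijs"]}
#     kw, groups = find_keywords_and_groups("De fiets staat bij de school.", keywords, frog_instance)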
Example no. 3
def lemmatise_frog(word, lemma, tag):
    #[{'index': '1', 'lemma': 'κρατήρ', 'pos': 'N--s---ma-', 'eos': True, 'posprob': 1.0, 'text': 'κρητῆρα'}]
    if have_frog:
        try:
            frog_out = frog.process(word)
            the_lemma = frog_out[0]["lemma"]
            the_tag = frog_out[0]["pos"]
            # Note: the caller-supplied tag is used for the Lemma; the Frog tag (the_tag) is looked up but not used here.
            new_lemma = Lemma(word, the_lemma, tag, 0)
            return (new_lemma, "FROG")
        except Exception:
            pass
    return (None, "UNKNOWN")
Example no. 4
def read_keywords(filename):
    """Returns a list of keywords from the datafile. Keywords are lemmatized."""
    keywords = []
    frog = get_frog()
    with open(filename, encoding="utf-8") as f:
        for line in f:
            word = line.strip().lower()
            if word.startswith("#"):
                word = word[1:]
            tokens = frog.process(word)
            if len(tokens) == 1:  # TODO: we skip over multi-word keywords
                keywords.append(tokens[0]["lemma"])
    return list(set(keywords))
Example no. 5
def frog_process():
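    # Flask view function: relies on module-level `app`, `request` and `jsonify` (Flask) plus a `frog` instance.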

    text = request.json["text"] if request.json and "text" in request.json else request.args.get('text', '')
    if not text:
        return jsonify({
            "message": "You must include a text as GET parameter or in the body."
        }), 400

    app.logger.debug("Analyzing text ..")
    app.logger.debug(text.replace('\n', ' '))

    return jsonify({
        "response": frog.process(text),
        "text": text
    })
Example no. 6
def find_keywords_and_groups(text, keywords, frog):
    """Returns a list of the keywords and a list of associated groups that occur in tokens."""
    tokens = frog.process(text)  # a list of dictionaries with frog's analysis per token
    kw = []
    groups = []
    for t in tokens:
        lemma = t["lemma"].lower()
        k = keywords.get(lemma, None)
        if k is not None:
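            # Enforce the POS match only when the tagger is reasonably confident (posprob > 0.6) about its tag.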
            if t["posprob"] > 0.6:
                if not t["pos"].startswith(k.pos + "("):
                    continue
            kw.append(lemma)
            groups += k.groups
    return list(set(kw)), list(set(groups))
Example no. 7
def read_keywords(filename):
    """Returns a list of Keyword objects from the datafile."""
    keywords = []
    lemmas = []
    frog = get_frog()
    with open(filename, encoding="utf-8") as f:
        for line in f:
            word, pos = line.strip().split(",")
            word = word.lower()
            if word.startswith("#"):
                word = word[1:]
            tokens = frog.process(word)
            if len(tokens) == 1:  # TODO: we skip over multi-word keywords
                lemma = tokens[0]["lemma"]
                if lemma not in lemmas:  # we only want unique lemmas
                    k = Keyword(lemma=lemma, pos=pos)
                    keywords.append(k)
                    lemmas.append(lemma)
    return keywords
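# The datafile is expected to hold one comma-separated "word,pos" pair per line, e.g. "fietsen,WW" (hypothetical line).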
Example no. 8
def clean_wordlist(filename):
    frog = get_frog()
    keywords = []

    with open("data/{}".format(filename)) as f:
        for line in f:
            word, pos = line.strip().split(",")
            if not pos.startswith("SPEC"):
                word = word.lower()
            if word.startswith("#"):
                word = word[1:]
            tokens = frog.process(word)
            if len(tokens) > 1:  # TODO: multi-word keywords are kept unlemmatized as-is
                lemma = word
            else:
                lemma = tokens[0]["lemma"]
            keywords.append((lemma, pos))

    keywords = sorted(set(keywords))

    with open("data/new_{}".format(filename), "w") as f:
        for (word, pos) in keywords:
            f.write("{},{}\n".format(word, pos))
Example no. 9
def lemmatize_word(word, lang):
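    # "D" presumably selects Dutch, lemmatized with Frog; any other language falls back to the wn_lemmatizer (WordNet) helper.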
    if lang == "D":
        word = frog.process(word)[0]['lemma']
    else:
        word = wn_lemmatizer(word)
    return word
Example no. 10
from __future__ import print_function, unicode_literals

import frog

frog = frog.Frog(frog.FrogOptions(parser=False), "/etc/frog/frog.cfg")
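# parser=False switches off Frog's dependency parser; the second argument is the path to Frog's configuration file.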
output = frog.process_raw("Dit is een test")
print("RAW OUTPUT=",output)
output = frog.process("Dit is nog een test.")
print("PARSED OUTPUT=",output)
with open("./data/neg.translated.tok", "r") as f_in:
    neg_trans_list = [l for l in f_in]



frog = frog.Frog(frog.FrogOptions(parser=False, ner=False, tok=False))
p = re.compile(r'(ADJ|BW|LID|N|SPEC|TSW|TW|VG|VNW|VZ|WW|LET)\((.*)\)')

def parse_pos(pos):
    m = p.match(pos)    
    coarse = m.group(1)
    fine = m.group(2)
    return coarse, fine.split(",")
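# e.g. parse_pos("N(soort,ev,basis,zijd,stan)") -> ("N", ["soort", "ev", "basis", "zijd", "stan"])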

X_pos = [
    [parse_pos(t["pos"])[0] for t in frog.process(sent)]
    for sent in pos_trans_list
]
X_neg = [
    [parse_pos(t["pos"])[0] for t in frog.process(sent)]
    for sent in neg_trans_list
]

with open("./data/parsed/positive_pos_coarse.txt", "w") as f:
    for s in X_pos:
        f.write(" ".join(s)+'\n')
with open("./data/parsed/negative_pos_coarse.txt", "w") as f:
    for s in X_neg:
        f.write(" ".join(s)+'\n')
Example no. 12
# run with LaMachine (see readme)
from __future__ import print_function, unicode_literals  # to make this work on Python 2 as well as Python 3
import json
import pickle

import frog

frog = frog.Frog(frog.FrogOptions(parser=False))
with open("50zinnen", "rb") as f:
    tekst = pickle.load(f)
output = frog.process("\n".join(tekst))
# write the per-token analyses to disk as JSON
with open("frog_resultaat.json", "w") as f:
    json.dump(output, f)
Example no. 13
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function, unicode_literals #to make this work on Python 2 as well as Python 3
import frog

filepath = '/Users/roelsmeets/desktop/af_corpora/af_corpus_1stpers_clean/AaVander_DeLichtekooiVanLoven_clean.txt'
text = open(filepath, encoding='utf-8').read()

frog = frog.Frog(frog.FrogOptions(parser=False))

raw_output = frog.process_raw(text)
# print ('**********************************************************')
# print("RAW OUTPUT=",raw_output)
# print ('**********************************************************')

parsed_output = frog.process(text)
# print ('**********************************************************')
# print("PARSED OUTPUT=",parsed_output)
# print ('**********************************************************')

named_entities = []

# for element in parsed_output:
#     if element['ner'] == 'B-PER':
#         named_entities.append(element['text'])

# print (named_entities)
Example no. 14

# In[7]:


import os

if not os.path.exists('data/' + 'dev' + '.POS.txt') or not os.path.exists('data/' + 'train' + '.POS.txt'):
    import frog

    frog = frog.Frog(frog.FrogOptions(parser=False))

    for t in ['dev', 'train']:
        with open('data/' + t + '.POS.txt', 'w') as out:
            with open('data/' + t + '.txt', 'r') as f:
                for line in f:
                    sentence, tag = line.strip().split("\t")
                    froggo = frog.process(sentence)
                    postext = []
                    for w in froggo:
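                        # keep only the coarse CGN tag, e.g. "N" from "N(soort,ev,basis,zijd,stan)"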
                        postext.append(w['pos'].split("(")[0])
                    out.write(" ".join(postext) + "\t" + tag + "\n")


# In[8]:


_X_pos_training = []
_y_pos_training = []
with open('data/train.POS.txt', 'r') as f:
    for line in f:
        sentence, tag = line.strip().split("\t")
        _X_pos_training.append(sentence)