Esempio n. 1
0
def frog_process_files(files, verbose=True):
    """Run Frog over a list of plain-text files and save the tagged output.

    Each file is read whole, fed to ``Frog.process_raw``, and the result is
    written to ``OUTPUT_FOLDER`` under the same basename with a
    ``.frog.out`` extension.

    Args:
        files: iterable of paths to ``.txt`` input files.
        verbose: when True, print per-file progress plus runtime/remaining
            time estimates (via the module-level ``duration_to_string``).
    """
    seen = []
    start_time = time.time()

    # One shared Frog instance: startup is expensive, so it is configured
    # once, with the components this job does not need switched off.
    frogger = frog.Frog(
        frog.FrogOptions(parser=False, mwu=False, ner=False, morph=False,
                         chunking=False, numThreads=8),
        '/etc/frog/frog.cfg')

    for filename in files:
        with open(filename, 'r') as in_file:
            output = frogger.process_raw(in_file.read())

        if verbose:
            # Fix: count the current file *before* printing, so progress
            # reads 1/N..N/N instead of the original's 0/N..(N-1)/N.
            seen.append(filename)
            print('> PROCESSING', filename,
                  str(len(seen)) + '/' + str(len(files)))

            # Timings (estimation of time remaining)
            runtime = time.time() - start_time
            per_document_time = runtime / len(seen)
            remaining_time = (len(files) - len(seen)) * per_document_time
            total_time = remaining_time + runtime

            print("RUNTIME", duration_to_string(runtime),
                  "(" + duration_to_string(per_document_time) + ")",
                  'REMAINING', duration_to_string(remaining_time),
                  'TOTAL', duration_to_string(total_time))

        frogged_filename = util.filename_without_extension(filename, '.txt')

        with open(OUTPUT_FOLDER + frogged_filename + '.frog.out', 'w') as f:
            f.write(output)
Esempio n. 2
0
 def process_data(self, X):
     """Replace each document in X by the space-joined POS tags of its tokens."""
     import frog
     tagger = frog.Frog(frog.FrogOptions(lemma=False, morph=False))
     tagged = []
     for doc in X:
         pos_tags = [token['pos'] for token in tagger.process(doc)]
         tagged.append(' '.join(pos_tags))
     return tagged
Esempio n. 3
0
 def process_data(self, X):
     """Map every document in X to the space-joined lemmas of its tokens."""
     options = frog.FrogOptions(morph=False, mwu=False, chunking=False,
                                ner=False)
     lemmatizer = frog.Frog(options)
     return [
         ' '.join(token['lemma'] for token in lemmatizer.process(doc))
         for doc in X
     ]
Esempio n. 4
0
 def __init__(self, lmdir, sleep=False):
     """Starts the frog server if the sleep function isn't on.

     Args:
         lmdir: path prefix of the LaMachine installation; the Frog
             Twitter config is resolved relative to it.
         sleep: when True, Frog is not started at all — note that
             ``self.frogger`` is then never set, so callers must not
             touch it in sleep mode.
     """
     if not sleep:
         import frog
         # Parser and NER are switched off — presumably for speed on
         # tweet-sized input; confirm nothing downstream needs them.
         opts = frog.FrogOptions(parser=False, ner=False)
         self.frogger = frog.Frog(
             opts, lmdir + "LaMachine/lamachine/etc/"
             "frog/frog-twitter.cfg")
Esempio n. 5
0
def preprocess(files):
    """
    Preprocess a list of XML-files
    The cleaned files will be saved in the output folder

    Remove the XML-tags and clean the remaining raw text
    to have one sentence per line with lemmatized words
    """
    # Only what the lemmatization step needs stays enabled (mwu=True);
    # tokenizer, morphology, chunking and NER are all off.
    frog_options = frog.FrogOptions(tok=False,
                                    morph=False,
                                    mwu=True,
                                    chunking=False,
                                    ner=False,
                                    numThreads=8)
    frogger = frog.Frog(frog_options,
                        '/vol/customopt/lamachine/etc/frog/frog.cfg')

    start_time = time.time()
    for i, file_name in enumerate(files):
        # Output path: same basename with the last 4 chars (assumes a
        # ".xml"-style 3-letter extension — TODO confirm) replaced by .txt.
        outfile = ntpath.basename(file_name)[:-4] + '.txt'
        out_name = os.path.join(OUTPUT_FOLDER, outfile)
        if os.path.isfile(out_name):
            # Cheap resumability: skip files written by a previous run.
            print('Already done:', out_name)
            continue
        with open(file_name, 'r', encoding='utf-8') as file:
            try:
                text = file.read()
                # Remove all XML tags
                text = re.sub('<[^>]*>', '', text)
                lines = text.splitlines()
                # Remove abundant whitespace
                lines = [line.strip() for line in lines]
                # One sentence per line
                # (inserts '\n' inside the string; each element may hold
                # several sentences separated by embedded newlines)
                lines = [
                    re.sub(r'(\w)\. ([A-Z])', '\\1.\n\\2', line)
                    for line in lines
                ]
                # Remove punctuation
                lines = [
                    re.sub(r'[\.,:;/\(\)\[\]\'\"]', '', line) for line in lines
                ]
                # Remove empty lines and make lower case
                lines = [line.lower() for line in lines if line != '']
                # Convert each word to its lemma
                lemmas = [lemmatize(line, frogger) for line in lines]
                # Change extension to .txt
                with open(out_name, 'w', encoding='utf-8') as out:
                    out.write('\n'.join(lemmas))
                # Progress report at i = 49, 98, ... (i is 0-based).
                if i % 49 == 0 and i != 0:
                    print('Done {}/{}'.format(i, len(files)))
                    time_per_doc = (time.time() - start_time) / i
                    print('Average time/document:',
                          sec_to_string(time_per_doc))
                    time_remaining = time_per_doc * (len(files) - i)
                    print('Time remaining:', sec_to_string(time_remaining))
            except UnicodeError:
                # Best-effort: undecodable files are reported and skipped.
                print('Skipping {}, UnicodeError'.format(file_name))
Esempio n. 6
0
def activate_lemmatizers():
    """Initialise the module-level lemmatizer state.

    Creates the English WordNet lemmatizer, loads the Dutch word->lemma
    cache from ``./data/lemmas_nl.csv``, and — when the ``frog`` package
    is importable — creates a Frog instance and re-opens the CSV in
    append mode (so new lemmas can be written back elsewhere).
    ``frog_installed`` ends up False when the import fails.
    """
    global frog_installed, frog_lemmatizer, lemmas_nl, lemmas_nl_file, wn_lemmatizer

    wn_lemmatizer = WordNetLemmatizer()
    frog_installed = True
    with open("./data/lemmas_nl.csv", 'r') as lemmas_nl_file:
        lemmas_nl_df = pandas.read_csv(lemmas_nl_file, sep=",")
        lemmas_nl = dict(zip(lemmas_nl_df["word"], lemmas_nl_df["lemma"]))
    try:
        import frog
        frog_lemmatizer = frog.Frog(frog.FrogOptions(parser=False))
        # NOTE(review): this rebinds lemmas_nl_file to an append-mode
        # handle that is deliberately kept open as a module global and is
        # never closed here — verify some caller closes/flushes it.
        lemmas_nl_file = open("./data/lemmas_nl.csv", 'a')
    except ImportError:
        frog_installed = False
Esempio n. 7
0
def get_frog():
    """Returns the interface object to frog NLP. (There should only be one
    instance, because it spawns a frog process that consumes a lot of RAM.)
    """
    global FROG
    if FROG is not None:
        return FROG
    # Lazily create the singleton on first use.
    options = frog.FrogOptions(tok=True,
                               lemma=True,
                               morph=False,
                               daringmorph=False,
                               mwu=True,
                               chunking=False,
                               ner=False,
                               parser=False)
    FROG = frog.Frog(options,
                     "/home/rahiel/hortiradar/venv/share/frog/nld/frog.cfg")
    return FROG
Esempio n. 8
0
def function_sents(X):
    """Reduce each document in X to its function words.

    Keeps only articles (LID), pronouns (VNW), conjunctions (VG) and
    verbs (WW); verbs are kept only when their lemma appears in
    ``data/ww.txt`` (auxiliaries) and are represented by that lemma,
    while the other kept words use their lowercased surface form.

    Args:
        X: iterable of raw document strings.

    Returns:
        A list with one token list per input document.
    """
    import frog
    frogg = frog.Frog(frog.FrogOptions(morph=False, mwu=False, chunking=False))
    # Fix: close the verb-list file (the original leaked the handle) and
    # use a set for O(1) membership tests in the inner loop.
    with open('data/ww.txt', 'r') as verb_file:
        aux = set(verb_file.read().splitlines())
    new_X = []
    for x in X:
        new_x = []
        for word in frogg.process(x):
            if word['pos'][:3] not in ['LID', 'VNW', 'VG(', 'WW(']:
                continue
            if word['pos'][:2] == 'WW':
                if word['lemma'] in aux:
                    new_x.append(word['lemma'])
                continue
            new_x.append(word['text'].lower())
        new_X.append(new_x)
    return new_X
Esempio n. 9
0
 def process_data(self, X):
     """Filter data. Leave only articles, pronouns, conjunctions and auxiliary verbs.

     Verbs (WW) are kept as their lemma and only when the lemma appears
     in ``config.VERB_FILE``; articles (LID), pronouns (VNW) and
     conjunctions (VG) are kept as their lowercased surface form.
     Returns a list with one token list per input document.
     """
     frogg = frog.Frog(
         frog.FrogOptions(morph=False, mwu=False, chunking=False))
     # Fix: close the verb-list file (the original leaked the handle) and
     # use a set for O(1) membership tests in the inner loop.
     with open(config.VERB_FILE, 'r') as verb_file:
         aux = set(verb_file.read().splitlines())
     new_X = []
     for x in X:
         new_x = []
         for word in frogg.process(x):
             if word['pos'][:3] not in ['LID', 'VNW', 'VG(', 'WW(']:
                 continue
             if word['pos'][:2] == 'WW':
                 if word['lemma'] in aux:
                     new_x.append(word['lemma'])
                 continue
             new_x.append(word['text'].lower())
         new_X.append(new_x)
     return new_X
Esempio n. 10
0
def delete_car(car):
    """Remove *car* once it has crossed past x = -325 (off the left edge)."""
    if car.xcor() >= -325:
        return
    car.hideturtle()
    car.clear()
    screen_cars.remove(car)


#Screen initialization
screen = turtle.Screen()
screen.setup(600, 600)  # 600x600 window
screen.tracer(0)  # disable auto-redraw; frames are drawn manually
screen.colormode(255)  # colors given as 0-255 RGB tuples
screen.title("Turtle crossing game")

#Classes initialization
player = frog.Frog()  # the player-controlled frog
scoreboard = score.Score()


#Functions
def exit_game():
    """Close the game window and stop the turtle event loop (bound to "c")."""
    screen.bye()
    turtle.done()


#Listeners for executing the game
screen.listen()
screen.onkeypress(exit_game, "c")  # "c" quits the game
screen.onkeypress(player.move, "w")  # "w" moves the frog

#Main loop
Esempio n. 11
0
import csv
import frog
from tqdm import tqdm

frogger = frog.Frog(frog.FrogOptions(parser=False, ner=False))

# Fix: use context managers so both CSV handles are closed deterministically
# (the original passed bare open() results to csv and leaked them).
with open('dmad_a.csv') as in_file:
    corp = list(csv.reader(in_file))

# newline='' is required by the csv module to avoid extra blank rows on
# platforms that translate line endings.
with open('dmad_a_tagged.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    for i, r in enumerate(tqdm(corp)):
        try:
            if i:
                # Append one "text<TAB>lemma<TAB>pos" line per token of
                # column 5 as a new column.
                r += ["\n".join(
                    ["\t".join([token["text"], token["lemma"], token["pos"]])
                     for token in frogger.process(r[4])])]
            else:
                r += ['frogs']  # header row gets the new column's name
            writer.writerow(r)
        except IndexError:
            # Rows with fewer than 5 columns cannot be tagged; report them.
            print(r)
Esempio n. 12
0
"""Frog every line of an input file and write the result.

Usage:
    sample_file_builder <input_file> <output_file>
"""
import nltk.data
import frog
import codecs
import docopt

# Fix: the usage text was a bare (syntactically invalid) block of code; it
# is now the module docstring, and docopt actually parses the CLI from it —
# the original never bound input_file/output_file at all.
args = docopt.docopt(__doc__)
input_file = args['<input_file>']
output_file = args['<output_file>']

sent_detector = nltk.data.load('tokenizers/punkt/dutch.pickle')
froggie = frog.Frog(frog.FrogOptions(parser=False), "/etc/frog/frog.cfg")

with codecs.open(output_file, "w", "utf-8") as of:
    with codecs.open(input_file, "r", "utf-8") as infile:
        for line in infile:
            # Fix: the original called process_raw(s) with `s` undefined;
            # each input line is what gets frogged.
            of.write(froggie.process_raw(line) + "\n")
Esempio n. 13
0
def lemmatize_sents(X):
    """Return each document of X as one space-joined string of token lemmas."""
    import frog
    lemmatizer = frog.Frog(
        frog.FrogOptions(morph=False, mwu=False, chunking=False, ner=False))
    result = []
    for sent in X:
        lemmas = (token['lemma'] for token in lemmatizer.process(sent))
        result.append(' '.join(lemmas))
    return result
Esempio n. 14
0
def load_level():
    """Reset all game state and populate the level.

    Shows the loading screen, clears every entity registry, spawns the
    walkable regions, flingable props, wasps, birds and frogs at their
    fixed coordinates, creates the player spider on the starting region,
    and starts the level music.
    """
    screen.blit_UI(loading, (0, 0))
    pygame.display.flip()
    global spider
    global frames
    frames = 0

    # Wipe every entity registry left over from a previous level.
    for registry in (movement, bird, bomb, flingable, frog,
                     hand, stinger, venom, wasp):
        registry.clear()
    screen.reset()

    # Walkable platforms as (sprite, anchor kwarg, position); the first
    # region doubles as the spider's starting spot.
    start_region = movement.WalkableRegion("tree4", topleft=(0, 0))
    for sprite, anchor, pos in (
            ("tree branch", "center", (1085, 360)),
            ("web bridge", "center", (2125, 810)),
            ("web bridge", "center", (2695, 107)),
            ("web bridge", "center", (4135, 660)),
            ("tree branch", "center", (4785, 295)),
            ("tree branch", "center", (4785, 510)),
            ("tree branch", "center", (4785, 725)),
            ("tree2", "topleft", (520, 0)),
            ("tree3", "topleft", (725, 0)),
            ("tree1", "topleft", (1205, 0)),
            ("tree1", "topleft", (1680, 0)),
            ("tree1", "topleft", (2235, 0)),
            ("tree4", "topleft", (2810, 0)),
            ("tree3", "topleft", (3320, 0)),
            ("tree1", "topleft", (3735, 0)),
            ("tree4", "topleft", (4207, 0)),
            ("tree4", "topleft", (4915, 0)),
            ("bathroom toothbrush", "topleft", (5357, 0)),
            ("bathroom sink", "topleft", (6268, 0)),
            ("bathroom window", "topleft", (7362, 0)),
            ("window", "topleft", (5357, 0))):
        movement.WalkableRegion(sprite, **{anchor: pos})

    # Throwable props scattered over the level.
    for kind, pos in (
            ("rock", (645, 410)), ("acorn", (1310, 140)),
            ("acorn", (1403, 175)), ("acorn", (1482, 205)),
            ("acorn", (1400, 267)), ("rock", (1310, 635)),
            ("rock", (1430, 665)), ("branch", (1430, 735)),
            ("rock", (1807, 293)), ("rock", (1770, 410)),
            ("acorn", (1870, 450)), ("acorn", (2395, 205)),
            ("rock", (2460, 710)), ("branch", (2410, 535)),
            ("acorn", (2935, 230)), ("rock", (3075, 330)),
            ("rock", (2945, 500)), ("acorn", (3105, 505)),
            ("rock", (3040, 700)), ("acorn", (3475, 335)),
            ("rock", (3805, 325)), ("rock", (3960, 460)),
            ("rock", (3840, 615)), ("rock", (4345, 390)),
            ("rock", (4540, 500)), ("branch", (4435, 720))):
        flingable.Flingable(kind, pos)

    for wx, wy in ((1100, 235), (1155, 500), (1255, 265), (1940, 165),
                   (1940, 320), (1940, 410), (1940, 775), (2372, 320),
                   (2640, 475), (2760, 630), (2815, 355), (2865, 805),
                   (3685, 680), (4105, 345), (4490, 205), (4665, 555),
                   (4785, 725), (4940, 610), (4995, 270), (5100, 470),
                   (5195, 730), (6135, 760), (6770, 360), (7320, 280),
                   (7640, 700)):
        wasp.Wasp(wx, wy)

    # Birds spawn at y=100 with one of three (min, max) pairs as their
    # second argument and a shared third argument of -.25.
    high = -10, 10
    med = -15, 15
    low = -18, 18
    for bx, pair in ((2675, med), (3705, high), (3925, low),
                     (4635, low), (4995, med), (5315, high)):
        bird.Bird((bx, 100), pair, -.25)

    # Frogs all sit at y = 980.
    for fx in (1850, 3250, 3915, 4455, 4788, 5135):
        frog.Frog((fx, 980))

    spider = Spider(start_region)
    music.music_begin()
    music.play_music()
Esempio n. 15
0
import os
import frog
from nltk import pos_tag
from nltk import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import csv

# NOTE(review): rebinding the name `frog` to the Frog instance shadows the
# imported module, so frog.FrogOptions becomes unreachable after this line;
# consider renaming the instance (e.g. `frogger`) — verify no later code
# needs the module itself.
frog = frog.Frog(frog.FrogOptions(parser=False))
lemmatizer = WordNetLemmatizer()


def wn_lemmatizer(word):
    """Lemmatize *word* with WordNet, deriving the POS from nltk's tagger."""
    penn_tag = pos_tag([word])[0][1]  # Converting it to WordNet format.
    # Map the Penn tag's first letter onto a WordNet POS constant,
    # defaulting to noun for anything unmapped.
    wn_tag = {'N': wn.NOUN, 'V': wn.VERB, 'R': wn.ADV, 'J': wn.ADJ}.get(
        penn_tag[0], wn.NOUN)
    return lemmatizer.lemmatize(word, wn_tag)


def preprocess_word(w):
    # An auxiliary function to clean test files.
    if "f-" in w:
        w = w[2:]
    if w[:3] == "vk-":
        w = w[3:]
    if w[:2] == "vk":
        w = w[2:]
    if w == "geen":
        return ""
    if "(" in w:
        w = w[:w.index("(")]
# -*- coding: utf-8 -*-

import frog
import re


# Load the translated, tokenised positive and negative corpora; each list
# element keeps its trailing newline.
with open("./data/pos.translated.tok", "r") as f_in:
    pos_trans_list = [l for l in f_in]
with open("./data/neg.translated.tok", "r") as f_in:
    neg_trans_list = [l for l in f_in]



# NOTE(review): rebinding `frog` shadows the imported module; FrogOptions
# is unreachable after this line — consider a different instance name.
frog = frog.Frog(frog.FrogOptions(parser=False, ner=False, tok=False))
# CGN-style POS tags look like COARSE(feat,feat,...): group 1 captures the
# coarse tag, group 2 the comma-separated feature list.
p = re.compile('(ADJ|BW|LID|N|SPEC|TSW|TW|VG|VNW|VZ|WW|LET)\((.*)\)')

def parse_pos(pos):
    """Split a CGN-style tag like 'N(soort,ev)' into (coarse, feature list)."""
    match = p.match(pos)
    coarse_tag = match.group(1)
    features = match.group(2)
    return coarse_tag, features.split(",")

X_pos = [
    [parse_pos(t["pos"])[0] for t in frog.process(sent)]
    
    for sent in pos_trans_list
]
X_neg = [
    [parse_pos(t["pos"])[0] for t in frog.process(sent)]
    
    for sent in neg_trans_list
Esempio n. 17
0
# Sanity checks, aborts if specified lexicon files not found.
# Empty/None entries are allowed (optional files); only named-but-missing
# paths are fatal. All errors are printed before exiting.
files_found = True
for f in [greekHDfile, filename, nofreqfile, extrafile, frog_cfg]:
    if f and not os.path.exists(f):
        print("ERROR: FILE NOT FOUND:", f, file=sys.stderr)
        files_found = False
if not files_found:
    sys.exit(1)

# Initialise Frog.
# NOTE(review): this rebinds `frog` over the imported module, making
# frog.FrogOptions unreachable afterwards — verify nothing later needs it.
if have_frog:
    print("INITIALISE FROG", file=sys.stderr)
    frog = frog.Frog(
        frog.FrogOptions(parser=True,
                         tok=False,
                         morph=False,
                         mwu=False,
                         chunking=False,
                         ner=False), frog_cfg)

# Statistics on lexicon files.
line_count = 0
new_entries = 0
zero_freq = 0

if greekHDfile:
    print("READING", greekHDfile, file=sys.stderr)
    with open(greekHDfile, 'r') as f:
        '''
        WORD            LEMMA       TAG             COUNT
        ἀλλήλοις            ἀλλήλων Pc-p---md--i    5
Esempio n. 18
0
def generatefrogs(numfrogs):
    """Append *numfrogs* frogs to game.frogs at random 32px-aligned spots."""
    for _ in range(numfrogs):
        spawn_x = 32 * ika.Random(0, engine.WIN_WIDTH / 32)
        spawn_y = 32 * ika.Random(0, engine.WIN_HEIGHT / 32)
        game.frogs.append(frog.Frog(spawn_x, spawn_y, False))