Example #1
    def extract_pos_tags(self):
        # create frogclient
        bLocal = False
        if bLocal:
            port = 8080
            frogclient = FrogClient("localhost", port, returnall=True)
        else:
            port = 443
            frogclient = FrogClient("https://languagemachines.github.io/frog",
                                    port,
                                    returnall=True)

        # create wordstream
        wordstream = ''
        for file in os.listdir(self.input_path):
            path = os.path.join(self.input_path, file)
            with open(path, 'rb') as reader:
                data = reader.read().decode('utf-8-sig')
                data = [line.split("\t")[-1] for line in data.split("\n")]
                wordstream += ' '.join(data)

        # extract pos tags
        window_size = 250   # characters per chunk sent to Frog
        window_shift = 50   # NB: shift < size, so windows overlap and text is tagged more than once
        index = 0
        with open(self.postag_file, 'w') as file:
            while index + window_size < len(wordstream):
                substream = wordstream[index:index + window_size]
                for data in frogclient.process(substream):
                    sys.stdout.write('\r')
                    percentage = round(100 * index / float(len(wordstream)), 2)
                    sys.stdout.write(str(percentage) + '%')
                    sys.stdout.flush()
                    file.write(json.dumps(data) + "\n")
                index += window_shift
Example #2
def tokenize(text):
    try:
        frogclient = FrogClient('localhost', FROGPORT, returnall=True)
    except Exception as e:
        sys.exit(COMMAND + ": cannot run frog: " + str(e))
    tokens, nbrOfSents = processFrogData(frogclient.process(text))
    return (tokens, nbrOfSents)
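
FROGPORT, COMMAND, and processFrogData() come from the surrounding script and are not shown. A minimal sketch of what processFrogData() might look like, assuming Frog's returnall=True output in which a row of None values marks a sentence boundary (as Examples #4 and #7 also rely on); this is a reconstruction, not the original helper:

def processFrogData(rows):
    # reconstruction: collect token rows and count sentences;
    # a row whose first field is None marks the end of a sentence
    tokens = []
    nbrOfSents = 0
    for row in rows:
        if row[0] is None:
            nbrOfSents += 1
        else:
            tokens.append(row)
    return tokens, nbrOfSents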
Example #3
def applyNer(lines):
    frogclient = FrogClient('localhost', PORT, returnall=True)
    nerOutput = ""
    for line in lines:
        data = frogclient.process(line)
        nerOutput += prettyPrint(data)
    return nerOutput
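
This applyNer() concatenates prettyPrint()'s return value, so its prettyPrint must build a string (the variant further down this page prints directly instead). A minimal sketch under that assumption, using the same row layout (token, lemma, morph, POS, NER, ...) that the other examples unpack:

def prettyPrint(data):
    # reconstruction: one "token POS NER-tag" line per token row
    lines = []
    for row in data:
        if row[0] is not None and len(row) >= 5:
            lines.append(row[0] + " " + row[3] + " " + row[4])
    return "\n".join(lines) + "\n"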
Example #4
def call_frog(text):
    """
    Call frog on the text and yield Token(sent, offset, word, lemma, pos, morphofeat, ner, chunk) tuples
    """

    host, port = os.environ.get('FROG_HOST', 'localhost:9887').split(":")
    frogclient = FrogClient(host, port, returnall=True)
    sent = 1
    offset = 0
    for word, lemma, morph, morphofeat, ner, chunk, _p1, _p2 in frogclient.process(text):
        if word is None:
            sent += 1
        else:
            pos = _POSMAP[morphofeat.split("(")[0]]
            yield Token(sent, offset, word, lemma, pos, morphofeat, ner, chunk)
            offset += len(word)
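
Token and _POSMAP are defined elsewhere in this example's project. A minimal sketch of what they might look like, assuming CGN-style morphofeat tags such as N(soort,ev) whose main tag is taken before the parenthesis (both definitions are reconstructions, not the originals):

from collections import namedtuple

# hypothetical definition; the original module defines its own Token
Token = namedtuple("Token", ["sent", "offset", "word", "lemma",
                             "pos", "morphofeat", "ner", "chunk"])

# illustrative CGN-main-tag to coarse POS mapping; the original table may differ
_POSMAP = {
    "N": "NOUN", "ADJ": "ADJ", "WW": "VERB", "VNW": "PRON",
    "LID": "DET", "VZ": "ADP", "VG": "CONJ", "BW": "ADV",
    "TW": "NUM", "TSW": "INTJ", "SPEC": "X", "LET": "PUNCT",
}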
Example #5
def frog_process(texts):
    frogclient = FrogClient(
        config.frog_hostname,
        config.frog_port,
        returnall=True,
        timeout=1800.0,
    )
    for text in texts:
        cache = Cache.get_or_new(hash_text(text))
        sentences = [s for s in sent_tokenize(text) if s]
        sentences = split_long_sentences(sentences, 250)
        tokens = frogclient.process(' '.join(sentences))
        tokens_no_none = [token for token in tokens if None not in token]
        cache.data = tokens_no_none
        cache.save()
    frogclient.socket.close()
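
config, Cache, and hash_text are project helpers, and sent_tokenize presumably comes from NLTK. A sketch of split_long_sentences, assuming it re-chunks any sentence longer than max_len characters at word boundaries (a reconstruction, not the project's code):

def split_long_sentences(sentences, max_len):
    # reconstruction: break overlong sentences into chunks of at most
    # max_len characters, splitting on whitespace
    chunks = []
    for sentence in sentences:
        chunk = ""
        for word in sentence.split():
            if chunk and len(chunk) + 1 + len(word) > max_len:
                chunks.append(chunk)
                chunk = word
            else:
                chunk = (chunk + " " + word).strip()
        if chunk:
            chunks.append(chunk)
    return chunks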
Example #6
def retag(doc, i):
    global threads
    print("\tRetagging:")
    # non-greedy, so each bracketed morpheme is captured separately
    r = re.compile(r'\[(.*?)\]')
    frogclient = FrogClient('localhost', 9000 + (i % threads))

    for sentence in doc.sentences():
        words = " ".join([w.text() for w in sentence.words()])
        for j, (word, lemma, morph,
                pos) in enumerate(frogclient.process(words)):
            wordelement = sentence.words(j)
            wordelement.replace(cgn.parse_cgn_postag(pos))
            wordelement.replace(folia.LemmaAnnotation, cls=lemma)

            #parse mbma
            morphemes = r.findall(morph)
            if morphemes:
                layer = wordelement.append(folia.MorphologyLayer)
                for morpheme in morphemes:
                    layer.append(folia.Morpheme, cls=morpheme)
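
Frog returns its morphological analysis as bracketed segments (e.g. [ver][dwaal][de]); the non-greedy pattern above yields one match per segment, whereas a greedy .* would capture everything between the first [ and the last ] as a single bogus morpheme.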
Example #7
def call_frog(self, text):
    """
    Call frog on the text and yield (sent, offset, word, lemma, morphofeat, ner, chunk) tuples
    """
    logging.debug("Creating frog client")
    frogclient = FrogClient(self.host,
                            self.port,
                            returnall=True,
                            timeout=600)
    sent = 1
    offset = 0
    logging.debug("Calling frog")
    tokens = list(frogclient.process(text))
    logging.debug("Got {} tokens".format(len(tokens)))
    for word, lemma, morph, morphofeat, ner, chunk, _p1, _p2 in tokens:
        if word is None:
            sent += 1
        else:
            yield (sent, offset, word, lemma, morphofeat, ner, chunk)
            offset += len(word)
Example #8

import os
import sys
import time

from pynlpl.clients.frogclient import FrogClient
from pynlpl.formats.sonar import CorpusX, ns  # assumed origin of CorpusX and ns

time.sleep(3)  # give the Frog server a moment to start
frogclient = FrogClient('localhost', 7551)

# read the *.tok files, on condition there are no *.pos equivalents (will not overwrite)
for doc in CorpusX(sonardir, 'tok', "", lambda f: not os.path.exists(f + '.pos')):
    processed_doc = False
    print(doc.filename + '\tPROCESSING')
    for sentence in doc.sentences():
            words = " ".join([ x.text for x in sentence ])

            process_sentence = False
            for x in sentence:
                if not ns('dcoi') + 'pos' in x.attrib or not ns('dcoi') + 'lemma' in x.attrib:
                    process_sentence = True
            if process_sentence:
                processed_doc = True
                for i, (word, lemma, morph, pos) in enumerate(frogclient.process(words)):
                    try:
                        word_id = sentence[i].attrib[ns('xml') + 'id']
                    except (IndexError, KeyError):
                        print("ERROR: words out of sync in " + sentence.attrib[ns('xml') + 'id'], file=sys.stderr)
                        break
                    if pos:
                        doc[word_id].attrib[ns('dcoi') + 'pos'] = pos
                    if lemma:
                        doc[word_id].attrib[ns('dcoi') + 'lemma'] = lemma
    if processed_doc:
        doc.save(doc.filename+'.pos', 'iso-8859-15') #write .tok.pos files

Example #9
if not foliadoc.declared(folia.AnnotationType.LEMMA):
    foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset', annotator='Frog', annotatortype=folia.AnnotatorType.AUTO)
foliadoc.language('nld')
text = foliadoc.data[-1]

for p in foliadoc.paragraphs():
    found_s = False
    for s in p.sentences():
        found_w = False
        for w in s.words():
            found_w = True
        found_s = True
        if found_w:
            # pass tokenised sentence
            words = s.words()
            response = frogclient.process(" ".join([str(w) for w in words]))
            for i, (word, lemma, morph, pos) in enumerate(response):
                if legacy: legacyout(i, word, lemma, morph, pos)
                if str(words[i]) == word:
                    if lemma:
                        words[i].append(folia.LemmaAnnotation(foliadoc, cls=lemma))
                    if pos:
                        words[i].append(folia.PosAnnotation(foliadoc, cls=pos))
                else:
                    print("WARNING: Out of sync after calling Frog! ", i, word, file=sys.stderr)

        else:
            # pass untokenised sentence
            try:
                sentext = s.text()
            except folia.NoSuchText:
                continue  # assumed handling; the original snippet breaks off here
Example #10
#!/usr/bin/env python

from __future__ import print_function, unicode_literals, division, absolute_import

import sys
import io
from pynlpl.clients.frogclient import FrogClient



frogclient = FrogClient('localhost',12345)

inputfile = sys.argv[1]


with io.open(inputfile + '.lem', 'w', encoding='utf-8') as f_lemma:
    with io.open(inputfile + '.pos', 'w', encoding='utf-8') as f_pos:
        with io.open(inputfile, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                print(i, file=sys.stderr)
                posline = []
                lemline = []
                for word, lemma, morph, pos in frogclient.process(line.strip()):
                    posline.append(pos)
                    lemline.append(lemma)
                f_lemma.write(" ".join(lemline).strip() + "\n")
                f_pos.write(" ".join(posline).strip() + "\n")
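
Run with the input file as the only argument and a Frog server listening on localhost:12345; given input.txt, the script writes input.txt.lem and input.txt.pos with one line of space-separated lemmas and POS tags per input line.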


Example #11
import json
import os
import sys

from pynlpl.clients.frogclient import FrogClient

port = 8020
frogclient = FrogClient('localhost', port, returnall=True)

wordstream = ''
data_folder = '../data/output/1_preprocessed/'
for file in os.listdir(data_folder):
    path = os.path.join(data_folder, file)
    with open(path, 'rb') as reader:
        data = reader.read().decode('utf-8-sig')
        data = [line.split("\t")[-1] for line in data.split("\n")]
        wordstream += ' '.join(data)

window_size = 250
window_shift = 50
index = 0
with open('pos_tags.txt', 'w') as file:
    while index + window_size < len(wordstream):
        substream = wordstream[index:index + window_size]
        for data in frogclient.process(substream):
            sys.stdout.write('\r')
            percentage = round(100 * index / float(len(wordstream)), 2)
            sys.stdout.write(str(percentage) + '%')
            sys.stdout.flush()
            file.write(json.dumps(data) + "\n")
        index += window_shift
Example #12
# ner.py: perform named entity recognition with frog
# usage: ner.py < text
# adapted from: https://www.tutorialspoint.com/python/python_networking.htm
# 20180604 erikt(at)xs4all.nl

from pynlpl.clients.frogclient import FrogClient
import re
import socket
import sys

PORT = 8080
MAXDATA = 1024
NERID = 4
POSID = 3
TOKENID = 0


def prettyPrint(data):
    for row in data:
        if len(row) >= NERID + 1 and row[0] is not None:
            lastLine = row[TOKENID] + " " + row[POSID] + " " + row[NERID]
            print(lastLine)
    print("")
    return ()


frogclient = FrogClient('localhost', PORT, returnall=True)
for line in sys.stdin:
    data = frogclient.process(line)
    prettyPrint(data)
Example #13
                elif not in_correction and sample_id:
                    text += c

            if "\\" in text:
                print("WARNING: backslash in text, skipping sample to prevent Frog bug.", file=sys.stderr)
                continue

            if skipsample:
                continue


            if text and corrections and sample_id:
                print("Invoking Frog and processing text: " + text, file=sys.stderr)
                paragraph = folia.Paragraph(doc, id=sample_id)
                sentence = paragraph.append(folia.Sentence)
                for j, (wordtext, lemma, morph, pos) in enumerate(frogclient.process(text)):
                    if not wordtext or not wordtext.strip():
                        print("Empty word, moving to next sentence", file=sys.stderr)
                        sentence = paragraph.append(folia.Sentence)
                    else:
                        word = sentence.append(folia.Word, text=wordtext)
                        if lemma:
                            word.append(folia.LemmaAnnotation, cls=lemma)
                        if pos:
                            word.append(folia.PosAnnotation, cls=pos)
                        if wordtext in corrections and not stripcorrections:
                            try:
                                word.correct(new=corrections[wordtext])
                            except ValueError as e:
                                print("Error correcting, ignoring:", e, file=sys.stderr)
                            correctioncount += 1