def extract_pos_tags(self):
    # create frogclient
    bLocal = False
    if bLocal:
        port = 8080
        frogclient = FrogClient("localhost", port, returnall=True)
    else:
        # note: FrogClient speaks a raw socket protocol, so it expects a plain
        # hostname; an https:// URL is unlikely to work as a host value
        port = 443
        frogclient = FrogClient("https://languagemachines.github.io/frog", port, returnall=True)

    # create wordstream: concatenate the last tab-separated field of every line
    wordstream = ''
    for file in os.listdir(self.input_path):
        path = os.path.join(self.input_path, file)
        with open(path, 'rb') as reader:
            data = reader.read().decode('utf-8-sig')
            data = [line.split("\t")[-1] for line in data.split("\n")]
            wordstream += ' '.join(data)

    # extract pos tags over a sliding character window
    window_size = 250
    window_shift = 50
    index = 0
    with open(self.postag_file, 'w') as file:
        while index + window_size < len(wordstream):
            substream = wordstream[index:index + window_size]
            for data in frogclient.process(substream):
                sys.stdout.write('\r')
                percentage = round(100 * index / float(len(wordstream)), 2)
                sys.stdout.write(str(percentage) + '%')
                sys.stdout.flush()
                file.write(json.dumps(data) + "\n")
            index += window_shift
def tokenize(text):
    try:
        frogclient = FrogClient('localhost', FROGPORT, returnall=True)
    except Exception as e:
        sys.exit(COMMAND + ": cannot run frog: " + str(e))
    tokens, nbrOfSents = processFrogData(frogclient.process(text))
    return (tokens, nbrOfSents)
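processFrogData is not defined in this snippet; a minimal sketch of what it could look like, assuming it collects the token column and counts sentences via the all-None boundary rows that Frog emits with returnall=True:

def processFrogData(rows):
    # hypothetical helper (not shown in the source): gather tokens and count
    # sentences; Frog yields an all-None row at each sentence boundary
    tokens = []
    nbrOfSents = 0
    inSentence = False
    for row in rows:
        if row[0] is None:  # sentence boundary marker
            nbrOfSents += 1
            inSentence = False
        else:
            tokens.append(row[0])
            inSentence = True
    if inSentence:  # count a trailing sentence without a boundary marker
        nbrOfSents += 1
    return tokens, nbrOfSents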
def applyNer(lines):
    frogclient = FrogClient('localhost', PORT, returnall=True)
    nerOutput = ""
    for line in lines:
        data = frogclient.process(line)
        nerOutput += prettyPrint(data)
    return nerOutput
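prettyPrint is not shown here, and since applyNer concatenates its result it must return a string; a sketch under that assumption, using the same token/POS/NER columns as the ner.py example further below:

def prettyPrint(data):
    # hypothetical string-returning variant (the source does not show it):
    # emit "token POS NER" per token, skipping sentence-boundary rows
    output = ""
    for row in data:
        if len(row) >= 5 and row[0] is not None:
            output += row[0] + " " + row[3] + " " + row[4] + "\n"
    return output + "\n"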
def call_frog(text):
    """
    Call frog on the text and yield Token(sent, offset, word, lemma, pos,
    morphofeat, ner, chunk) tuples
    """
    host, port = os.environ.get('FROG_HOST', 'localhost:9887').split(":")
    frogclient = FrogClient(host, port, returnall=True)
    sent = 1
    offset = 0
    for word, lemma, morph, morphofeat, ner, chunk, _p1, _p2 in frogclient.process(text):
        if word is None:  # an all-None row marks a sentence boundary
            sent += 1
        else:
            pos = _POSMAP[morphofeat.split("(")[0]]
            yield Token(sent, offset, word, lemma, pos, morphofeat, ner, chunk)
            offset += len(word)
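A hedged usage sketch for the generator above; it assumes a Frog server is reachable via FROG_HOST (default localhost:9887) and that Token is a namedtuple-like type with the fields passed to it (both come from elsewhere in the source module):

# illustrative only: requires a running Frog server and the Token / _POSMAP
# definitions from the surrounding module
for token in call_frog("Dit is een zin. Dit is nog een zin."):
    print(token.sent, token.offset, token.word, token.pos)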
def frog_process(texts):
    frogclient = FrogClient(
        config.frog_hostname,
        config.frog_port,
        returnall=True,
        timeout=1800.0,
    )
    for text in texts:
        cache = Cache.get_or_new(hash_text(text))
        sentences = [s for s in sent_tokenize(text) if s]
        sentences = split_long_sentences(sentences, 250)
        tokens = frogclient.process(' '.join(sentences))
        tokens_no_none = [token for token in tokens if None not in token]
        cache.data = tokens_no_none
        cache.save()
    frogclient.socket.close()
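split_long_sentences is not defined in this snippet; a minimal sketch under the assumption that it breaks any sentence longer than the given limit into chunks of at most that many whitespace-separated tokens:

def split_long_sentences(sentences, max_words):
    # hypothetical helper (not shown in the source): chunk overly long
    # sentences into pieces of at most max_words tokens
    chunks = []
    for sentence in sentences:
        words = sentence.split()
        for start in range(0, len(words), max_words):
            chunks.append(' '.join(words[start:start + max_words]))
    return chunks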
def retag(doc, i):
    global threads
    print "\tRetagging:"
    r = re.compile(r'\[(.*)\]')
    frogclient = FrogClient('localhost', 9000 + (i % threads))
    for sentence in doc.sentences():
        words = " ".join([w.text() for w in sentence.words()])
        for j, (word, lemma, morph, pos) in enumerate(frogclient.process(words)):
            wordelement = sentence.words(j)
            wordelement.replace(cgn.parse_cgn_postag(pos))
            wordelement.replace(folia.LemmaAnnotation, cls=lemma)
            # parse mbma
            morphemes = r.findall(morph)
            if morphemes:
                layer = wordelement.append(folia.MorphologyLayer)
                for morpheme in morphemes:
                    layer.append(folia.Morpheme, cls=morpheme)
def call_frog(self, text):
    """
    Call frog on the text and yield (sent, offset, word, lemma, morphofeat,
    ner, chunk) tuples
    """
    logging.debug("Creating frog client")
    frogclient = FrogClient(self.host, self.port, returnall=True, timeout=600)
    sent = 1
    offset = 0
    logging.debug("Calling frog")
    tokens = list(frogclient.process(text))
    logging.debug("Got {} tokens".format(len(tokens)))
    for word, lemma, morph, morphofeat, ner, chunk, _p1, _p2 in tokens:
        if word is None:  # an all-None row marks a sentence boundary
            sent += 1
        else:
            yield (sent, offset, word, lemma, morphofeat, ner, chunk)
            offset += len(word)
time.sleep(3)
frogclient = FrogClient('localhost', 7551)
# read the *.tok files, on condition there are no *.pos equivalents (will not overwrite)
for doc in CorpusX(sonardir, 'tok', "", lambda f: not os.path.exists(f + '.pos')):
    processed_doc = False
    print doc.filename + '\tPROCESSING'
    for sentence in doc.sentences():
        words = " ".join([x.text for x in sentence])
        process_sentence = False
        for x in sentence:
            if not ns('dcoi') + 'pos' in x.attrib or not ns('dcoi') + 'lemma' in x.attrib:
                process_sentence = True
        if process_sentence:
            processed_doc = True
            for i, (word, lemma, morph, pos) in enumerate(frogclient.process(words)):
                try:
                    word_id = sentence[i].attrib[ns('xml') + 'id']
                except (IndexError, KeyError):
                    print >>sys.stderr, "ERROR: words out of sync in " + sentence.attrib[ns('xml') + 'id']
                    break
                if pos:
                    doc[word_id].attrib[ns('dcoi') + 'pos'] = pos
                if lemma:
                    doc[word_id].attrib[ns('dcoi') + 'lemma'] = lemma
    if processed_doc:
        doc.save(doc.filename + '.pos', 'iso-8859-15')  # write .tok.pos files
if not foliadoc.declared(folia.AnnotationType.LEMMA):
    foliadoc.declare(folia.AnnotationType.LEMMA, set='http://ilk.uvt.nl/folia/sets/mblem-nl.foliaset',
                     annotator='Frog', annotatortype=folia.AnnotatorType.AUTO)
foliadoc.language('nld')
text = foliadoc.data[-1]
for p in foliadoc.paragraphs():
    found_s = False
    for s in p.sentences():
        found_w = False
        for w in s.words():
            found_w = True
            found_s = True
        if found_w:
            # pass tokenised sentence
            words = s.words()
            response = frogclient.process(" ".join([unicode(w) for w in words]))
            for i, (word, lemma, morph, pos) in enumerate(response):
                if legacy:
                    legacyout(i, word, lemma, morph, pos)
                if unicode(words[i]) == word:
                    if lemma:
                        words[i].append(folia.LemmaAnnotation(foliadoc, cls=lemma))
                    if pos:
                        words[i].append(folia.PosAnnotation(foliadoc, cls=pos))
                else:
                    print >>sys.stderr, "WARNING: Out of sync after calling Frog! ", i, word
        else:
            # pass untokenised sentence
            try:
                sentext = s.text()
            except folia.NoSuchText:
#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division, absolute_import
import sys
import io
from pynlpl.clients.frogclient import FrogClient

frogclient = FrogClient('localhost', 12345)

inputfile = sys.argv[1]
with io.open(inputfile + '.lem', 'w', encoding='utf-8') as f_lemma:
    with io.open(inputfile + '.pos', 'w', encoding='utf-8') as f_pos:
        with io.open(inputfile, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                print(i, file=sys.stderr)
                posline = []
                lemline = []
                for word, lemma, morph, pos in frogclient.process(line.strip()):
                    if word is None:
                        continue  # skip sentence-boundary rows (all-None tuples),
                                  # which would otherwise break the joins below
                    posline.append(pos)
                    lemline.append(lemma)
                f_lemma.write(" ".join(lemline).strip() + "\n")
                f_pos.write(" ".join(posline).strip() + "\n")
import json
import os
import sys

from pynlpl.clients.frogclient import FrogClient

port = 8020
frogclient = FrogClient('localhost', port, returnall=True)

# build the wordstream: concatenate the last tab-separated field of every line
wordstream = ''
data_folder = '../data/output/1_preprocessed/'
for file in os.listdir(data_folder):
    path = os.path.join(data_folder, file)
    with open(path, 'rb') as reader:
        data = reader.read().decode('utf-8-sig')
        data = [line.split("\t")[-1] for line in data.split("\n")]
        wordstream += ' '.join(data)

# extract pos tags over a sliding character window
window_size = 250
window_shift = 50
index = 0
with open('pos_tags.txt', 'w') as file:
    while index + window_size < len(wordstream):
        substream = wordstream[index:index + window_size]
        for data in frogclient.process(substream):
            sys.stdout.write('\r')
            percentage = round(100 * index / float(len(wordstream)), 2)
            sys.stdout.write(str(percentage) + '%')
            sys.stdout.flush()
            file.write(json.dumps(data) + "\n")
        index += window_shift
# ner.py: perform named entity recognition with frog
# usage: ner.py < text
# note: adapted from https://www.tutorialspoint.com/python/python_networking.htm
# 20180604 erikt(at)xs4all.nl

from pynlpl.clients.frogclient import FrogClient
import re
import socket
import sys

PORT = 8080
MAXDATA = 1024
NERID = 4
POSID = 3
TOKENID = 0

def prettyPrint(data):
    for row in data:
        # skip sentence-boundary rows and rows without a NER column
        if len(row) >= NERID + 1 and row[0] is not None:
            lastLine = row[TOKENID] + " " + row[POSID] + " " + row[NERID]
            print(lastLine)
    print("")
    return ()

frogclient = FrogClient('localhost', PORT, returnall=True)
for line in sys.stdin:
    data = frogclient.process(line)
    prettyPrint(data)
elif not in_correction and sample_id:
    text += c

if "\\" in text:
    print >>sys.stderr, "WARNING: backslash in text, skipping sample to prevent Frog bug."
    continue
if skipsample:
    continue
if text and corrections and sample_id:
    print >>sys.stderr, "Invoking Frog and processing text: " + text
    paragraph = folia.Paragraph(doc, id=sample_id)
    sentence = paragraph.append(folia.Sentence)
    for j, (wordtext, lemma, morph, pos) in enumerate(frogclient.process(text)):
        if not wordtext or not wordtext.strip():
            print >>sys.stderr, "Empty word, moving to next sentence"
            sentence = paragraph.append(folia.Sentence)
        else:
            word = sentence.append(folia.Word, text=wordtext)
            if lemma:
                word.append(folia.LemmaAnnotation, cls=lemma)
            if pos:
                word.append(folia.PosAnnotation, cls=pos)
            if wordtext in corrections and not stripcorrections:
                try:
                    word.correct(new=corrections[wordtext])
                except ValueError as e:
                    print >>sys.stderr, "Error correcting, ignoring:", e
                correctioncount += 1