class AbstractFrogChunker(ChunkParserI):
    """Base class for NLTK-style chunkers backed by the Frog Dutch NLP tool.

    Subclasses implement :meth:`parse`; the shared machinery here converts
    pre-tagged NLTK tokens into a FoLiA document, runs Frog over it, and
    rebuilds an NLTK tree from Frog's token-level output.
    """

    # Shared Frog instance; set per-instance in __init__.
    _frog = None

    def __init__(self, **kwargs):
        # parser/mwu/tok are disabled: the chunker only needs chunk labels,
        # and input arrives already tokenized as FoLiA XML (xmlIn=True).
        self._frog = Frog(FrogOptions(parser=False, mwu=False, tok=False, xmlIn=True, **kwargs))

    def __get_folia_doc__(self, tokens):
        """Build a single-sentence FoLiA document from (token, pos) pairs.

        Each POS tag is attached as a PosAnnotation in a 'custom' set so
        Frog can consume the pre-tagged input.
        """
        doc = folia.Document(id='nltk-sentence')
        folia_sent = doc.add(folia.Text)
        for tok, pos in tokens:
            word = folia_sent.add(folia.Word, tok)
            word.add(folia.PosAnnotation(None, set='custom', cls=pos))
        return doc

    def __create_tree__(self, tokens, key):
        """Run Frog on `tokens` and return a chunk tree built from field `key`.

        `key` names the Frog output field holding the IOB chunk tag
        (e.g. 'chunker' or 'ner' — depends on the subclass; not visible here).
        """
        _input = self.__get_folia_doc__(tokens)
        __output = self._frog.process(_input)
        for token in __output:
            # Strip CGN feature parentheses, e.g. 'N(soort,ev)' -> 'N'.
            token['pos'] = token['pos'].split('(')[0]
            # Map Frog's SPEC (special/proper-name-like) tags onto NNP so
            # conlltags2tree sees a conventional proper-noun tag.
            if token['pos'].startswith('SPEC'):
                token['pos'] = 'NNP'
        return conlltags2tree([(token['text'], token['pos'], token[key]) for token in __output])

    def parse(self, tokens):
        # Abstract: concrete chunkers must override.
        raise NotImplementedError()
def morph_counts_old_version(self, words):
    """Turn a word list into a Counter over all morphemes.

    Each word is sent to Frog individually (slow — one process() call per
    word); words that Frog does not split into multiple morphemes are
    excluded from the count.
    """
    print("len words: ")
    print(len(words))
    print("len unique words: ")
    print(len(set(words)))

    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    collected = []
    started = time.time()
    total = len(words)
    for position, word in enumerate(words, start=1):
        result = frog.process(word)
        raw_morph = result[0].get("morph")
        # Frog renders morphemes as "[a][b][c]"; strip '[' and split on ']'.
        pieces = raw_morph.replace('[', '').split(']')
        # Skip words that were not decomposed (no whole words in the count).
        if len(pieces) > 2:
            collected += pieces
        print(str(position) + " of " + str(total))

    print("Frog Processing Time:")
    print(self.format_time(time.time() - started))

    # Splitting on ']' leaves empty strings; drop them before counting.
    collected = list(filter(None, collected))
    return Counter(collected)
def morph_counts_new_version(self, words):
    """Turn a word list into a Counter over all morphemes.

    Unlike the old version, all words are joined into one string and sent
    to Frog in a single process() call. Words Frog does not decompose into
    multiple morphemes are excluded from the count.
    """
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    words_string = ' '.join(words)
    morphisms = []
    print_counter = 1
    t0 = time.time()
    print("Starting Frog Processing..")
    output = frog.process(words_string)
    print("Process time:")
    process_time = self.format_time(time.time() - t0)
    print(process_time)

    t1 = time.time()
    total_length = len(words)  # hoisted: loop-invariant
    # BUG FIX: was range(0, len(words) - 1), which silently skipped the
    # last word's morphemes.
    # NOTE(review): assumes Frog emits exactly one output token per input
    # word — TODO confirm for inputs Frog might re-tokenize.
    for i in range(total_length):
        morphisms_word = output[i].get("morph")
        # Frog renders morphemes as "[a][b][c]"; strip '[' and split on ']'.
        morphisms_word_list = morphisms_word.replace('[', '').split(']')
        # Currently whole (undecomposed) words are NOT counted.
        if len(morphisms_word_list) > 2:
            morphisms += morphisms_word_list
        print(str(print_counter) + " of " + str(total_length))
        print_counter += 1

    print("Process Time:")
    print(process_time)
    print("Getting Morphisms Time:")
    print(self.format_time(time.time() - t1))
    print("Total Time:")
    print(self.format_time(time.time() - t0))

    # Splitting on ']' leaves empty strings; drop them before counting.
    morphisms = list(filter(None, morphisms))
    morph_counts = Counter(morphisms)
    return morph_counts
class FrogTagger(TaggerI):
    """NLTK-style POS tagger backed by Frog.

    After :meth:`tag` has run, the auxiliary getters expose tag
    probabilities, lemmas, and morphological analyses for the last
    processed input.
    """

    def __init__(self, **kwargs):
        # Disable multiword recognition, which is performed by the chunker
        options = FrogOptions(parser=False, mwu=False, xmlIn=True, **kwargs)
        self.__frog = Frog(options)
        # BUG FIX: was never initialized, so the `is None` guards in the
        # getters could not fire — calling a getter before tag() raised
        # AttributeError instead of returning [].
        self.__output = None

    def tag(self, sentences):
        """Tag a list of word strings (or a ready FoLiA document).

        Returns (text, coarse_pos) pairs; the CGN feature parentheses are
        stripped, e.g. 'N(soort,ev)' -> 'N'.
        """
        if isinstance(sentences, list):
            # Wrap plain tokens into a one-sentence FoLiA document for
            # Frog's xmlIn mode.
            doc = folia.Document(id='nltk-sentence')
            folia_sent = doc.add(folia.Text)
            for sent in sentences:
                folia_sent.add(folia.Word, sent)
            _input = doc
        else:
            _input = sentences
        self.__output = self.__frog.process(_input)
        return [(token['text'], token['pos'].split('(')[0])
                for token in self.__output]

    def get_tag_probabilities(self):
        """(text, posprob) pairs for the last tagged input, or []."""
        if self.__output is None:
            return []
        return [(token['text'], token['posprob']) for token in self.__output]

    def get_lemmas(self):
        """(text, lemma) pairs for the last tagged input, or []."""
        if self.__output is None:
            return []
        return [(token['text'], token['lemma']) for token in self.__output]

    def get_morph(self):
        """(text, morph) pairs for the last tagged input, or []."""
        if self.__output is None:
            return []
        return [(token['text'], token['morph']) for token in self.__output]
def prep_nl(df, filename):
    """Tokenize, POS-tag, and lemmatize Dutch answers in `df` with Frog.

    For every answer column the cleaned tokens, lemmas, and POS tags are
    written into matching 'Tokenized*', 'Lemmatized*', and 'POS*' columns.
    Returns the mutated dataframe.
    """
    from frog import Frog, FrogOptions

    print("Tokenizing, POS tagging, and lemmatizing the Dutch data...")
    # Create 'frog' instance. Turn off various options to save time.
    frog = Frog(
        FrogOptions(parser=False, morph=False, chunking=False, ner=False))

    # Define set of possible answers (STAT_C files carry three answer columns).
    if "STAT_C" not in str(filename):
        answers = ['Answer']
    else:
        answers = ['Answer4a', 'Answer2aDec', 'Answer2aCaus']

    # Loop through answers
    for question_type in answers:
        for index in df.index:
            ans = df.loc[index, question_type]

            # Logging
            if index % 20 == 0:
                print(index, "/", df.index[-1], question_type[6:])

            # Remove numbers (raw strings: "\d"/"\w" are deprecated escapes)
            ans = re.sub(r"\d+", "", ans)

            # Remove tags in spelling-corrected data
            ans = ans.replace("_abbreviation", "")

            # Remove non-Dutch and illegible words (single pass over the
            # four disjoint word-tag patterns is equivalent to the
            # original sequential substitutions)
            ans = re.sub(r"\w+_(?:nonexistent|nonexisting|english|german)", "", ans)
            ans = re.sub(r"\?+_illegible", "", ans)

            # Preprocess the data with Frog
            ans_dict = frog.process(ans)

            tok_answer = []
            lem_answer = []
            pos_tags = []

            # Append outcomes to list, excluding punctuation ("LET()")
            for token in ans_dict:
                if token['pos'] != "LET()":
                    tok_answer.append(token['text'].lower())
                    lem_answer.append(token['lemma'])
                    pos_tags.append(token['pos'])

            # Fill in the dataframe
            df.at[index, 'Tokenized{}'.format(question_type[6:])] = tok_answer
            df.at[index, 'Lemmatized{}'.format(question_type[6:])] = lem_answer
            df.at[index, 'POS{}'.format(question_type[6:])] = pos_tags

    return df
def morph_counts_faster_version(self, words):
    """Turn a word list into a Counter over all morphemes, batching input.

    Words are joined and sent to Frog in batches of `batch_size` to bound
    per-call overhead. Words Frog does not decompose into multiple
    morphemes are excluded from the count.
    """
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    batch_size = 400
    morphisms = []
    print_batch_number = 1
    start_time = time.time()
    total_batch_number = math.ceil(len(words) / batch_size)
    total_process_time = 0
    total_getting_morphisms_time = 0

    for i in range(0, len(words), batch_size):
        t0 = time.time()
        words_batch = words[i:i + batch_size]
        words_batch_string = ' '.join(words_batch)
        output = frog.process(words_batch_string)
        process_time = time.time() - t0

        t1 = time.time()
        # BUG FIX: was range(0, len(words_batch) - 1), which silently
        # dropped the last word of every batch.
        # NOTE(review): assumes Frog emits one output token per input
        # word — TODO confirm for inputs Frog might re-tokenize.
        for j in range(len(words_batch)):
            morphisms_word = output[j].get("morph")
            # Frog renders morphemes as "[a][b][c]".
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            # Currently whole (undecomposed) words are NOT counted.
            if len(morphisms_word_list) > 2:
                morphisms += morphisms_word_list
        print_batch_number += 1
        getting_morphisms_time = time.time() - t1
        total_process_time += process_time
        total_getting_morphisms_time += getting_morphisms_time

    print("Total number of words: ")
    print(len(words))
    print("")
    print("Unique number words: ")
    print(len(set(words)))
    print("")
    print("Total Process Time:")
    print(self.format_time(total_process_time))
    print("")
    print("Total Getting Morphisms Time: ")
    print(self.format_time(total_getting_morphisms_time))
    print("")
    print("Total Time:")
    print(self.format_time(time.time() - start_time))
    print("")

    # Splitting on ']' leaves empty strings; drop them before counting.
    morphisms = list(filter(None, morphisms))
    morph_counts = Counter(morphisms)
    return morph_counts
def change_text_to_morphs(sentences, frog_merge=False, save=False, filename=None):
    """Convert each sentence to its space-joined Frog morpheme sequence.

    With frog_merge=True, "insertmergetoken" markers are interspersed
    between the morphemes of each word. When save is True the result list
    is pickled to `filename`. Returns the list of morpheme sentences.
    """
    morphSentences = []
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    # Progress is printed for every sentence (original behavior).
    for sentenceNumber, sentenceToBeProcessed in enumerate(sentences):
        print(sentenceNumber)
        print("of")
        print(len(sentences))
        sentenceToBeProcessed = sentenceToBeProcessed.replace("\n", " ")
        morphSentence = []
        output = frog.process(sentenceToBeProcessed)
        for token in output:
            morphisms_word = token.get("morph")
            # Frog renders morphemes as "[a][b][c]".
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            if frog_merge:
                morphisms_word_list = list(filter(None, morphisms_word_list))
                morphisms_word_list = intersperse(morphisms_word_list, "insertmergetoken")
            morphSentence += morphisms_word_list
        # Remove the empty strings left by splitting on ']'.
        morphSentence = list(filter(None, morphSentence))
        morphSentence = ' '.join(morphSentence)
        morphSentences.append(morphSentence)

    if save is True:
        # NOTE(review): filename=None with save=True raises TypeError here,
        # as in the original.
        with open(filename, 'wb') as outputfile:
            pickle.dump(morphSentences, outputfile)

    return morphSentences
def change_text_to_morphs(sentences, frog_merge=False, save=False, filename=None):
    """Convert each sentence to its space-joined Frog morpheme sequence.

    With frog_merge=True, "__add_merge__" markers are interspersed between
    the morphemes of each word. When save is True the result list is
    pickled to `filename`. Returns the list of morpheme sentences.
    """
    converted = []
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    total = len(sentences)
    for idx, raw_sentence in enumerate(sentences):
        # Print progress once every 1000 sentences.
        if idx % 1000 == 0:
            print(idx + 1)
            print("of")
            print(total)

        tokens = frog.process(raw_sentence.rstrip('\n'))

        pieces = []
        for token in tokens:
            # Frog renders morphemes as "[a][b][c]"; strip '[' and split on ']'.
            word_morphs = token.get("morph").replace('[', '').split(']')
            if frog_merge:
                word_morphs = list(filter(None, word_morphs))
                word_morphs = intersperse(word_morphs, "__add_merge__")
            pieces += word_morphs

        # Drop the empty strings left by splitting on ']', then join.
        converted.append(' '.join(filter(None, pieces)))

    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(converted, outputfile)

    return converted
line_txt = re.sub(r"^'s ", "''s ", line_txt) # if re.search(r'"".+""', line_txt): # print(re.search(r'"".+""', line_txt).group()) txt_dict[pair][part][spkr] += line_txt + " " frog = Frog(FrogOptions(mwu=False, ner=False)) for pair in txt_dict: for part in txt_dict[pair]: with open("{}pos/{}/{}_{}.pos".format(ecsd_path, pair, pair, part), "w", encoding="utf-8") as g: for spkr in txt_dict[pair][part]: print(pair, part, spkr) text = txt_dict[pair][part][spkr] word_list = frog.process(text) s_counter = 0 w_counter = 0 for word in word_list: if word["index"] == "1": s_counter += 1 w_counter = 0 g.write( "< file id: {}_{} speaker id: {} sentence: {} >\n". format(pair, part, spkr, s_counter)) if "LET" in word["pos"] and word["text"] != "&": continue else: g.write("\t".join([ word["text"], word["pos"], word["lemma"], str(word["posprob"])
#!/usr/bin/env python3
"""Smoke-test script for the Frog python binding and its FoLiA output."""
from __future__ import print_function, unicode_literals

from frog import Frog, FrogOptions
import folia.main as folia

frog = Frog(FrogOptions(parser=True))
output = frog.process_raw("Dit is een test")
print("RAW OUTPUT=", output)
output = frog.process("Dit is nog een test.")
print("PARSED OUTPUT=", output)

frog = Frog(FrogOptions(parser=True, xmlout=True))
output = frog.process("Dit is een FoLiA test.")
assert isinstance(output, folia.Document)
# BUG FIX: the original called isinstance() with a non-type second argument
# (an int / a string), which raises TypeError — these were meant as
# equality checks.
assert len(output.data) == 1
# BUG FIX: output.data is a list and has no .select(); select on the first
# Text element instead. NOTE(review): assumes data[0] is the folia.Text
# root — confirm against the folia library.
assert next(output.data[0].select(folia.Sentence)).text() == "Dit is een FoLiA test."
# output is now no longer a string but an instance of folia.Document,
# provided by the FoLiA library

print("FOLIA OUTPUT=")
print(output.xmlstring())

print("Inspecting FoLiA output (example):")
for word in output.words():
    print(word.text() + " " + word.pos() + " " + word.lemma())
# NOTE(review): "Dit is een FoLiA test ." looks like 6 tokens, not 5 —
# confirm whether Frog merges any of them before trusting this assert.
assert len(output.words()) == 5
"""Ad-hoc demo: run Frog on a fixed Dutch sentence and dump the output."""
import pickle
import re

Processed_Sentence = " lezen optimaal liep europese unie gekregen spellen rugzak super allesinds boomhut ontwikkelende gemeenschappen vermeenigvuldigde getallen Vereenvoudigd. ....... is werken lopen een kleine test gewoon om te zien of het wel werkt."

frog = Frog(
    FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                mwu=False, chunking=False, ner=False, parser=False))

output = frog.process(Processed_Sentence)

print("")
print("RAW OUTPUT")
print(output)
print("length")
print(len(output))
# Show the first four token dicts individually.
for token in output[:4]:
    print(token)
print("")

first = output[0]
print("index: " + str(first.get("index")))
print("text: " + str(first.get("text")))
print("lemma: " + str(first.get("lemma")))
trials_dict[t]["s"] = s_plural with open(f_path + "SonarVar_test.csv", "w") as g: g.write("s_plural,item,prosodic_break,next_stress,next_sound,next_vowel_length,ort\n") with open(f_path + "sonar_plurals2.csv", "r") as f: for line_num, i_line in enumerate(f, 1): print("Processing line " + str(line_num)) i_list = i_line[:-1].split("\t") sentence = i_list[1].strip('" ') foll_context = i_list[2].strip('" ') for t in trials: grep_text = "(^{}|[ ']{})({}|{})(?=[ .,?!':;])".format(t[0].upper(), t[0], trials_dict[t]["en"][1:], trials_dict[t]["s"][1:]) if re.search(grep_text, sentence): line = re.sub(" & ", " en ", sentence) frog_output = frog.process(line) frog_output_clean = [i for i in frog_output if i["pos"] != "LET()"] w_indices = [] for num, b in enumerate(frog_output, 0): if b["text"].lower() in [trials_dict[t]["en"], trials_dict[t]["s"]] and b["pos"] == "N(soort,mv,basis)": w_indices.append(num) for w_index in w_indices: s_plural = "1" if frog_output[w_index]["text"][-1] == "s" else "0" clean_index = [w["index"] for w in frog_output_clean].index(frog_output[w_index]["index"]) if clean_index + 1 < len(frog_output_clean): next_word = frog_output_clean[clean_index + 1]["text"].lower() else: context_frog_output = frog.process(re.sub(" & ", " en ", foll_context)) context_frog_output_clean = [i for i in context_frog_output if i["pos"] != "LET()"] if len(context_frog_output_clean) > 0: next_word = context_frog_output_clean[0]["text"].lower()
def morph_counts_fastest_version(self, words):
    """Turn a word list into a Counter over all morphemes.

    Only unique words are sent to Frog (in batches); each word's morphemes
    are then weighted by that word's frequency in the original list. Words
    Frog does not decompose into multiple morphemes are excluded.
    """
    word_counts = Counter(
        word for word in toolz.concat(map(self.word_tokenizer, words)))
    print("words_counts: ")
    print(word_counts)
    print("")
    print("Unique number words: " + str(len(set(words))))
    print("Total number of words: " + str(len(words)))
    print("")

    unique_words = list(set(words))
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))
    batch_size = 400
    morphisms = []
    print_batch_number = 1
    start_time = time.time()
    total_batch_number = math.ceil(len(unique_words) / batch_size)
    total_process_time = 0
    total_getting_morphisms_time = 0

    for i in range(0, len(unique_words), batch_size):
        t0 = time.time()
        words_batch = unique_words[i:i + batch_size]
        words_batch_string = ' '.join(words_batch)
        output = frog.process(words_batch_string)
        process_time = time.time() - t0

        t1 = time.time()
        # BUG FIX: was range(0, len(words_batch) - 1), which silently
        # dropped the last word of every batch.
        # NOTE(review): assumes Frog emits one output token per input
        # word — TODO confirm for inputs Frog might re-tokenize.
        for j in range(len(words_batch)):
            current_word = output[j].get("text")
            morphisms_word = output[j].get("morph")
            # Frog renders morphemes as "[a][b][c]".
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            current_word_count = word_counts[current_word]
            # Currently whole (undecomposed) words are NOT counted.
            if len(morphisms_word_list) > 2:
                # Weight the unique word's morphemes by its frequency.
                morphisms += morphisms_word_list * current_word_count
        print("batch" + " (batch_size: " + str(batch_size) + " words): " +
              str(print_batch_number) + " of " + str(total_batch_number))
        print_batch_number += 1
        getting_morphisms_time = time.time() - t1
        total_process_time += process_time
        total_getting_morphisms_time += getting_morphisms_time

    print("Total number of words: ")
    print(len(words))
    print("")
    print("Unique number words: ")
    print(len(set(words)))
    print("")
    print("Total Process Time:")
    print(self.format_time(total_process_time))
    print("")
    print("Total Getting Morphisms Time: ")
    print(self.format_time(total_getting_morphisms_time))
    print("")
    print("Total Time:")
    print(self.format_time(time.time() - start_time))
    print("")

    # Remove the empty strings left by splitting on ']'.
    morphisms = list(filter(None, morphisms))
    # Make a counter of all morphisms
    morph_counts = Counter(morphisms)
    return morph_counts
def _cgn_pos_to_tags(pos):
    """Render a CGN POS string as bracket tags.

    E.g. 'N(soort,ev)' -> '<N><soort><ev>'; a featureless tag like
    'LET()' collapses to '<LET>'.
    """
    tagged = "<" + pos
    tagged = tagged.replace("(", "><")
    tagged = tagged.replace(")", ">")
    tagged = tagged.replace(",", "><")
    # Empty feature lists leave a stray '<>' pair; drop it.
    return tagged.replace("<>", "")


def change_text_to_lemma_POS(sentences, save=False, filename=None):
    """Convert each sentence to 'lemma **<POS><tags>**' token sequences.

    When save is True the result list is pickled to `filename`.
    Returns the list of converted sentences.
    """
    lemmapos_sentences = []
    frog = Frog(
        FrogOptions(tok=True, lemma=True, morph=False, daringmorph=False,
                    mwu=False, chunking=False, ner=False, parser=False))

    for j, sentenceToBeProcessed in enumerate(sentences):
        # Print progress once every 1000 sentences.
        if j % 1000 == 0:
            print(j + 1)
            print("of")
            print(len(sentences))
        sentenceToBeProcessed = sentenceToBeProcessed.rstrip('\n')
        output = frog.process(sentenceToBeProcessed)

        # Build "lemma **<POS><feat>...**" per token, then join once
        # (was quadratic string concatenation followed by [1:]).
        words = []
        for token in output:
            pos = _cgn_pos_to_tags(str(token.get("pos")))
            lemma = str(token.get("lemma"))
            words.append(lemma + " " + "**" + pos + "**")
        lemmapos_sentences.append(" ".join(words))

    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(lemmapos_sentences, outputfile)

    return lemmapos_sentences
def main():
    """Evaluate Frog NER against gold-standard IOB files.

    Reads (token, tag) pairs per file, groups them into sentences and
    reference entities, runs Frog NER on each sentence, and prints
    macro-averaged precision/recall plus per-class micro-averages.
    """
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-s', '--nerset', type=str, help="NER FoLiA Set", action='store',
        default="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/namedentities.foliaset.ttl")
    parser.add_argument('-c', '--config', type=str,
                        help="Frog configuration", action='store',
                        required=True)
    parser.add_argument('--notexact', dest='exact', help="Loose evaluation",
                        action='store_false', default=True)
    parser.add_argument('files', nargs='+', help='bar help')
    args = parser.parse_args()

    frog = Frog(FrogOptions(ner=True, parser=False, xmlout=True), args.config)

    sentence = []
    entities = []
    precisions = []
    recalls = []
    entity = None
    entity_cls = None
    classeval = defaultdict(lambda: defaultdict(int))

    def flush_sentence():
        """Run Frog over the accumulated sentence and score its entities."""
        nonlocal sentence, entities
        if sentence:
            print("Processing: ", " ".join(sentence), file=sys.stderr)
            print(" Reference entities:", entities, file=sys.stderr)
            doc = frog.process(" ".join(sentence))
            precision, recall = evaluate(doc, entities, args.nerset,
                                         classeval, args.exact)
            print(" precision=", precision, " recall=", recall,
                  file=sys.stderr)
            if precision is not None:
                precisions.append(precision)
            if recall is not None:
                recalls.append(recall)
        sentence = []
        entities = []

    for filename in args.files:
        # extracttrain also works on test gold standard
        for token, tag in readdata(filename):
            if token is None:  # end of sentence
                if entity:
                    entities.append((" ".join(entity), entity_cls))
                entity = []
                flush_sentence()
            else:
                if tag[0] == 'B':
                    # Start of a new entity; close any open one first.
                    if entity:
                        entities.append((" ".join(entity), entity_cls))
                    entity = []
                    entity_cls = tag[2:]
                    entity.append(token)
                elif tag[0] == 'I':
                    entity.append(token)
                elif entity:
                    # O tag closes an open entity.
                    entities.append((" ".join(entity), entity_cls))
                    entity = []
                sentence.append(token)

    # BUG FIX: a trailing sentence (input not ending with a separator
    # line) was never evaluated.
    if entity:
        entities.append((" ".join(entity), entity_cls))
    flush_sentence()

    # BUG FIX: guard against division by zero when no sentences yielded
    # a precision/recall value.
    if precisions:
        print("overall precision (macroav):\t",
              sum(precisions) / len(precisions))
    else:
        print("overall precision (macroav):\tn/a")
    if recalls:
        print("overall recall (macroav):\t", sum(recalls) / len(recalls))
    else:
        print("overall recall (macroav):\tn/a")

    for cls, evaldata in classeval.items():
        try:
            print(cls + " precision (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fp']))
        except ZeroDivisionError:
            print(cls + " precision (microav):\tn/a")
        try:
            print(cls + " recall (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fn']))
        except ZeroDivisionError:
            print(cls + " recall (microav):\tn/a")
g.write( "s_plural,item,next_stress,next_sound,f1,f2,f3,f4,f5,f6,f7,f8,ort\n") for t in trials: print("Processing " + t) grep_text = "(^{}|[ ']{})({}|{})(?=[ .,?!':;])".format( t[0].upper(), t[0], trials_dict[t]["en"][1:], trials_dict[t]["s"][1:]) w_lines = subprocess.check_output( ["grep", "-P", grep_text, f_path + "OpenSubtitles2018_nl_raw.txt"], universal_newlines=True) w_lines_list = w_lines.split("\n") w_lines_list = list(set(w_lines_list)) w_lines_list = [i for i in w_lines_list if i != ""] for line in w_lines_list: line = re.sub(" & ", " en ", line) frog_output = frog.process(line) print(line) frog_output_clean = [i for i in frog_output if i["pos"] != "LET()"] lemmas = [i["lemma"] for i in frog_output_clean] if t not in lemmas: continue # get indices of matching words w_indices = [] for num, b in enumerate(frog_output_clean, 0): if b["text"].lower() in [ trials_dict[t]["en"], trials_dict[t]["s"] ] and b["pos"] == "N(soort,mv,basis)": w_indices.append(num) for w_index in w_indices: s_plural = "1" if frog_output_clean[w_index]["text"][ -1] == "s" else "0"