def morph_counts_old_version(self, words):
    # Word list to a flat list of all morphemes
    print("len words: ")
    print(len(words))
    print("len unique words: ")
    print(len(set(words)))
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    morphisms = []
    print_counter = 1
    total_length = len(words)
    t0 = time.time()
    for word in words:
        output = frog.process(word)
        morphisms_word = output[0].get("morph")
        morphisms_word_list = morphisms_word.replace('[', '').split(']')
        # Currently whole (monomorphemic) words are NOT included in the count
        if len(morphisms_word_list) > 2:
            morphisms += morphisms_word_list
        print(str(print_counter) + " of " + str(total_length))
        print_counter += 1
    print("Frog Processing Time:")
    print(self.format_time(time.time() - t0))
    morphisms = list(filter(None, morphisms))
    morph_counts = Counter(morphisms)
    return morph_counts
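# Note on the replace/split idiom used throughout these counters: Frog
# returns a word's morphological analysis as one bracketed string, e.g.
# (an assumed example of the format) "[ver][eenvoudig][d]". A minimal sketch:
#
#     morph = "[ver][eenvoudig][d]"
#     parts = morph.replace('[', '').split(']')   # ['ver', 'eenvoudig', 'd', '']
#     parts = list(filter(None, parts))           # ['ver', 'eenvoudig', 'd']
#
# A monomorphemic word such as "[lezen]" splits into only two parts, which
# is why only entries with len(morphisms_word_list) > 2 are counted.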
def morph_counts_new_version(self, words):
    # Word list to a flat list of all morphemes, processing the whole list
    # in a single Frog call instead of one call per word
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    words_string = ' '.join(words)
    morphisms = []
    print_counter = 1
    t0 = time.time()
    print("Starting Frog Processing..")
    output = frog.process(words_string)
    print("Process time:")
    process_time = self.format_time(time.time() - t0)
    print(process_time)
    t1 = time.time()
    total_length = len(words)
    # range(0, len(words) - 1) silently skipped the last word
    for i in range(len(words)):
        morphisms_word = output[i].get("morph")
        morphisms_word_list = morphisms_word.replace('[', '').split(']')
        # Currently whole (monomorphemic) words are NOT included in the count
        if len(morphisms_word_list) > 2:
            morphisms += morphisms_word_list
        print(str(print_counter) + " of " + str(total_length))
        print_counter += 1
    print("Process Time:")
    print(process_time)
    print("Getting Morphisms Time:")
    print(self.format_time(time.time() - t1))
    print("Total Time:")
    print(self.format_time(time.time() - t0))
    morphisms = list(filter(None, morphisms))
    morph_counts = Counter(morphisms)
    return morph_counts
def prep_nl(df, filename):
    from frog import Frog, FrogOptions

    print("Tokenizing, POS tagging, and lemmatizing the Dutch data...")
    # Create 'frog' instance. Turn off various options to save time.
    frog = Frog(FrogOptions(parser=False, morph=False, chunking=False, ner=False))

    # Define the set of possible answer columns
    if "STAT_C" not in str(filename):
        answers = ['Answer']
    else:
        answers = ['Answer4a', 'Answer2aDec', 'Answer2aCaus']

    # Loop through answers
    for question_type in answers:
        for index in df.index:
            ans = df.loc[index, question_type]

            # Logging
            if index % 20 == 0:
                print(index, "/", df.index[-1], question_type[6:])

            # Remove numbers
            ans = re.sub(r"\d+", "", ans)

            # Remove tags in spelling-corrected data
            ans = ans.replace("_abbreviation", "")

            # Remove non-Dutch and illegible words
            ans = re.sub(r"\w+_nonexistent", "", ans)
            ans = re.sub(r"\w+_nonexisting", "", ans)
            ans = re.sub(r"\w+_english", "", ans)
            ans = re.sub(r"\w+_german", "", ans)
            ans = re.sub(r"\?+_illegible", "", ans)

            # Preprocess the data with Frog
            ans_dict = frog.process(ans)
            tok_answer = []
            lem_answer = []
            pos_tags = []

            # Append outcomes to the lists
            for word_index in range(len(ans_dict)):
                if ans_dict[word_index]['pos'] != "LET()":  # Exclude punctuation
                    tok_answer.append(ans_dict[word_index]['text'].lower())
                    lem_answer.append(ans_dict[word_index]['lemma'])
                    pos_tags.append(ans_dict[word_index]['pos'])

            # Fill in the dataframe
            df.at[index, 'Tokenized{}'.format(question_type[6:])] = tok_answer
            df.at[index, 'Lemmatized{}'.format(question_type[6:])] = lem_answer
            df.at[index, 'POS{}'.format(question_type[6:])] = pos_tags

    return df
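# A usage sketch for prep_nl (the CSV filename and pandas usage are
# assumptions; the dataframe needs an 'Answer' column, or the three
# 'Answer4a'/'Answer2aDec'/'Answer2aCaus' columns for STAT_C files):
#
#     import pandas as pd
#     df = pd.read_csv("responses.csv")
#     df = prep_nl(df, "responses.csv")
#     print(df[["Tokenized", "Lemmatized", "POS"]].head())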
def morph_counts_faster_version(self, words):
    # Word list to a flat list of all morphemes, Frogging the words in
    # batches of batch_size
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    batch_size = 400
    morphisms = []
    print_batch_number = 1
    start_time = time.time()
    total_batch_number = math.ceil(len(words) / batch_size)
    total_process_time = 0
    total_getting_morphisms_time = 0
    for i in range(0, len(words), batch_size):
        t0 = time.time()
        words_batch = words[i:i + batch_size]
        words_batch_string = ' '.join(words_batch)
        output = frog.process(words_batch_string)
        process_time = time.time() - t0
        t1 = time.time()
        # range(0, len(words_batch) - 1) silently skipped the last word of
        # every batch
        for j in range(len(words_batch)):
            morphisms_word = output[j].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            # Currently whole (monomorphemic) words are NOT included in the count
            if len(morphisms_word_list) > 2:
                morphisms += morphisms_word_list
        print("batch (batch_size: " + str(batch_size) + " words): "
              + str(print_batch_number) + " of " + str(total_batch_number))
        print_batch_number += 1
        getting_morphisms_time = time.time() - t1
        total_process_time += process_time
        total_getting_morphisms_time += getting_morphisms_time
    print("Total number of words: ")
    print(len(words))
    print("")
    print("Unique number of words: ")
    print(len(set(words)))
    print("")
    print("Total Process Time:")
    print(self.format_time(total_process_time))
    print("")
    print("Total Getting Morphisms Time: ")
    print(self.format_time(total_getting_morphisms_time))
    print("")
    print("Total Time:")
    print(self.format_time(time.time() - start_time))
    print("")
    morphisms = list(filter(None, morphisms))
    morph_counts = Counter(morphisms)
    return morph_counts
def change_text_to_morphs(sentences, frog_merge=False, save=False, filename=None):
    # Sentence list to sentence list in Frog morpheme form
    morphSentences = []
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    for sentenceNumber in range(0, len(sentences)):
        print(sentenceNumber)
        print("of")
        print(len(sentences))
        sentenceToBeProcessed = sentences[sentenceNumber]
        sentenceToBeProcessed = sentenceToBeProcessed.replace("\n", " ")
        morphSentence = []
        output = frog.process(sentenceToBeProcessed)
        for i in range(0, len(output)):
            morphisms_word = output[i].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            if frog_merge:
                morphisms_word_list = list(filter(None, morphisms_word_list))
                morphisms_word_list = intersperse(morphisms_word_list,
                                                  "insertmergetoken")
            morphSentence += morphisms_word_list
        # Remove the empty strings
        morphSentence = list(filter(None, morphSentence))
        morphSentence = ' '.join(morphSentence)
        morphSentences.append(morphSentence)
    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(morphSentences, outputfile)
    return morphSentences
def change_text_to_morphs(sentences, frog_merge=False, save=False, filename=None):
    # Sentence list to sentence list in Frog morpheme form
    morphSentences = []
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    j = 0
    for sentenceToBeProcessed in sentences:
        if j % 1000 == 0:
            print(j + 1)
            print("of")
            print(len(sentences))
        j += 1
        sentenceToBeProcessed = sentenceToBeProcessed.rstrip('\n')
        morphSentence = []
        output = frog.process(sentenceToBeProcessed)
        for i in range(0, len(output)):
            morphisms_word = output[i].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            if frog_merge:
                morphisms_word_list = list(filter(None, morphisms_word_list))
                morphisms_word_list = intersperse(morphisms_word_list,
                                                  "__add_merge__")
            morphSentence += morphisms_word_list
        # Remove the empty strings
        morphSentence = list(filter(None, morphSentence))
        morphSentence = ' '.join(morphSentence)
        morphSentences.append(morphSentence)
    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(morphSentences, outputfile)
    return morphSentences
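# A usage sketch for change_text_to_morphs (the sentences and the pickle
# filename are assumptions, not from the original code):
#
#     sentences = ["Wij liepen naar de boomhut.", "Dit is een test."]
#     morph_sentences = change_text_to_morphs(sentences, frog_merge=True,
#                                             save=True, filename="morphs.pkl")
#
# With frog_merge=True, the morphemes of every word are interspersed with
# the "__add_merge__" token, so a later step can reassemble the words.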
def morph_counts_fastest_version(self, words):
    # Word list to a flat list of all morphemes: Frog each unique word once,
    # then weight its morphemes by the word's corpus frequency
    word_counts = Counter(
        word for word in toolz.concat(map(self.word_tokenizer, words)))
    print("word_counts: ")
    print(word_counts)
    print("")
    print("Unique number of words: " + str(len(set(words))))
    print("Total number of words: " + str(len(words)))
    print("")
    unique_words_set = set(words)
    unique_words = list(unique_words_set)
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    batch_size = 400
    morphisms = []
    print_batch_number = 1
    start_time = time.time()
    total_batch_number = math.ceil(len(unique_words) / batch_size)
    total_process_time = 0
    total_getting_morphisms_time = 0
    for i in range(0, len(unique_words), batch_size):
        t0 = time.time()
        words_batch = unique_words[i:i + batch_size]
        words_batch_string = ' '.join(words_batch)
        output = frog.process(words_batch_string)
        process_time = time.time() - t0
        t1 = time.time()
        # range(0, len(words_batch) - 1) silently skipped the last word of
        # every batch
        for j in range(len(words_batch)):
            current_word = output[j].get("text")
            morphisms_word = output[j].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            current_word_count = word_counts[current_word]
            # Currently whole (monomorphemic) words are NOT included in the count
            if len(morphisms_word_list) > 2:
                morphisms += morphisms_word_list * current_word_count
        print("batch (batch_size: " + str(batch_size) + " words): "
              + str(print_batch_number) + " of " + str(total_batch_number))
        print_batch_number += 1
        getting_morphisms_time = time.time() - t1
        total_process_time += process_time
        total_getting_morphisms_time += getting_morphisms_time
    print("Total number of words: ")
    print(len(words))
    print("")
    print("Unique number of words: ")
    print(len(set(words)))
    print("")
    print("Total Process Time:")
    print(self.format_time(total_process_time))
    print("")
    print("Total Getting Morphisms Time: ")
    print(self.format_time(total_getting_morphisms_time))
    print("")
    print("Total Time:")
    print(self.format_time(time.time() - start_time))
    print("")
    # Remove the empty strings
    morphisms = list(filter(None, morphisms))
    # Make a Counter of all morphemes
    morph_counts = Counter(morphisms)
    return morph_counts
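# A usage sketch for the counter variants above (the word list is an
# assumption). All four variants are meant to produce the same morpheme
# counts; the "fastest" one Frogs each unique word only once and weights
# its morphemes by the word's frequency, assuming self.word_tokenizer
# reproduces the words being counted:
#
#     words = ["gekregen", "liep", "liep", "boomhut"]
#     counts = self.morph_counts_fastest_version(words)
#     print(counts.most_common(10))   # the ten most frequent morphemes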
def __init__(self, **kwargs):
    # Disable parsing and multi-word units; expect pre-tokenised FoLiA XML
    # input (tok=False, xmlIn=True)
    self._frog = Frog(FrogOptions(parser=False, mwu=False, tok=False,
                                  xmlIn=True, **kwargs))
def main():
    global have_frog
    greekHDfile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "list_proiel_word_lemma_POS_freq")
    ghd_words = {}
    nofreqfile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              "list_proiel_perseus_merged_word_lemma_POS_nofreq")
    filenames = []  # list of globbed files
    filename = None  # test file
    extrafile = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             "extra-wlt.txt")
    frog_words = {}
    lookup_w = None  # specific word to look up
    lookup_l = None  # specific lemma to look up
    verbose = False
    wltmode = False  # if true, assume test file is columns; only first token is used
    frog_cfg = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            "pretrained_models/herodotus/frog.cfg.template")
    remove_root = True  # default is to remove ROOT from brat files, -R to disable
    suffix = ".lastrun"
    stats = False
    callstr = " ".join(sys.argv)
    try:
        opts, args = getopt.getopt(sys.argv[1:], "c:f:l:L:s:vw:DE:FM:RWS", [])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)
    for o, a in opts:
        if o in ("-f"):
            filenames = sorted(glob.glob(a))
        elif o in ("-c"):  # alternative frog config
            frog_cfg = a
        elif o in ("-l"):  # look up a specific lemma, print to screen
            lookup_l = a
        elif o in ("-L"):  # choose another lexicon file
            greekHDfile = a
        elif o in ("-M"):  # choose another merged (wlt) file
            nofreqfile = a
        elif o in ("-E"):  # choose another extra-wlt (wlt) file
            extrafile = a
        elif o in ("-s"):
            suffix = "." + a
        elif o in ("-v"):
            verbose = True
        elif o in ("-w"):  # look up a specific word, print to screen
            lookup_w = a
        elif o in ("-D"):
            debug = True
        elif o in ("-F"):  # disables Frog, use also when Frog is not available
            have_frog = False  # force ignore frog
            frog_cfg = None
        elif o in ("-R"):
            remove_root = not remove_root
        elif o in ("-W"):
            wltmode = True
        elif o in ("-S"):
            stats = True
        else:
            assert False, "unhandled option"
    logfile = "glem" + suffix + ".log"
    lgf = open(logfile, "w")  # or append?
    print(callstr, file=lgf, flush=True)

    # Sanity checks; abort if the specified lexicon files are not found.
    files_found = True
    for f in [greekHDfile, filename, nofreqfile, extrafile, frog_cfg]:
        if f and not os.path.exists(f):
            print("ERROR: FILE NOT FOUND:", f, file=lgf, flush=True)
            print("ERROR: FILE NOT FOUND:", f, file=sys.stderr)
            files_found = False
    if not files_found:
        print("ABORT: Necessary files not found", file=sys.stderr)
        print("ABORT: Necessary files not found", file=lgf, flush=True)
        lgf.close()
        sys.exit(1)

    # Initialise Frog.
    if have_frog:
        print("INITIALISE FROG", file=sys.stderr)
        frog = Frog(FrogOptions(parser=True, tok=False, morph=False,
                                mwu=False, chunking=False, ner=False),
                    frog_cfg)

    # Statistics on the lexicon files.
    line_count = 0
    new_entries = 0
    zero_freq = 0
    doubles = 0
    conflicts = 0
    print("READING", greekHDfile, file=sys.stderr)
    print("READING", greekHDfile, file=lgf, flush=True)
    with open(greekHDfile, 'r') as f:
        '''
        WORD       LEMMA     TAG          COUNT
        ἀλλήλοις   ἀλλήλων   Pc-p---md--i 5
        ἀλλήλοις   ἀλλήλων   Pc-p---nd--i 2
        ἀλλήλοισι  ἀλλήλων   Pc-p---md--i 9
        '''
        for l in f:
            l = l.strip()
            if len(l) > 0 and l[0] == "#":
                print("SKIP COMMENT", l, file=lgf, flush=True)
                continue
            bits = l.split()
            if len(bits) != 4:
                print("SKIP NOT 4 FIELDS", l, file=lgf, flush=True)
                continue
            line_count += 1
            word = normalize('NFC', bits[0])
            lemma = normalize('NFC', bits[1])
            tag = bits[2]
            try:
                freq = int(bits[3])
            except ValueError:
                print("SKIP FREQUENCY ERROR", l, file=lgf, flush=True)
                continue
            if freq == 0:
                #print("HAS 0 FREQUENCY", l, file=lgf, flush=True)
                zero_freq += 1
            DBG(word, lemma, tag, freq)
            # Store it.
            if word in ghd_words.keys():
                word_entry = ghd_words[word]
                new_lemma = Lemma(word, lemma, tag, freq)
                new_lemma.src = "greek_Haudag"  # proiel
                # Note we assume unique word-tag combinations.
                if tag in word_entry.lemmas:
                    # Double entry, e.g.:
                    #   τοσόνδε τοσόσδε Pd-s---na- 5
                    #   τοσόνδε τοσόσδε Pd-s---na- 0
                    # Normally, if the second one has a lower count, it is ignored.
                    if True or freq > word_entry.lemmas[tag].freq:
                        if lemma != word_entry.lemmas[tag].lemma:
                            print("CONFLICTING DOUBLE ENTRY", file=lgf, flush=True)
                            conflicts += 1
                        else:
                            print("DOUBLE ENTRY", file=lgf, flush=True)
                        print("STORED", word_entry.lemmas[tag], file=lgf, flush=True)
                        print("   NEW", new_lemma, file=lgf, flush=True)
                        doubles += 1
                word_entry.lemmas[tag] = new_lemma
                DBG("append entry", word)
            else:
                word_entry = Word(word)
                new_lemma = Lemma(word, lemma, tag, freq)
                new_lemma.src = "greek_Haudag"  # "proiel"
                word_entry.lemmas[tag] = new_lemma
                ghd_words[word] = word_entry
                new_entries += 1
                DBG("new entry", word)
    print("Added", new_entries, "new entries.", file=lgf, flush=True)
    print("Counted", zero_freq, "entries with frequency 0.", file=lgf, flush=True)
    print("Ignored", doubles, "double entries, of which", conflicts, "conflicts.",
          file=lgf, flush=True)
    new_entries = 0

    if nofreqfile:
        print("READING", nofreqfile, file=sys.stderr)
        print("READING", nofreqfile, file=lgf, flush=True)
        with open(nofreqfile, 'r') as f:
            for l in f:
                l = l.strip()
                if len(l) > 0 and l[0] == "#":
                    print("SKIP", l, file=lgf, flush=True)
                    continue
                bits = l.split()
                if len(bits) != 3:
                    print("SKIP", l, file=lgf, flush=True)
                    continue
                line_count += 1
                word = normalize('NFC', bits[0])
                lemma = normalize('NFC', bits[1])
                tag = bits[2]
                freq = 0  # unknown
                DBG(word, lemma, tag)
                if word in ghd_words.keys():
                    word_entry = ghd_words[word]
                    if tag in word_entry.lemmas:
                        # If already present, do nothing: we have it from
                        # the first list.
                        DBG("TAG ALREADY PRESENT", word, lemma, tag)
                    else:
                        new_lemma = Lemma(word, lemma, tag, freq)
                        new_lemma.src = "merged"  # "nofreq"
                        word_entry.lemmas[tag] = new_lemma
                        DBG("append entry", word)
                    DBG("skip existing entry", word)
                else:
                    word_entry = Word(word)
                    new_lemma = Lemma(word, lemma, tag, freq)
                    new_lemma.src = "merged"  # "nofreq"
                    word_entry.lemmas[tag] = new_lemma
                    ghd_words[word] = word_entry
                    new_entries += 1
                    DBG("new entry", word)
        print("Added", new_entries, "new entries.", file=lgf, flush=True)
        new_entries = 0

    # At the moment we have punctuation here.
    # Format is word-lemma-tag.
    if extrafile:
        print("READING", extrafile, file=sys.stderr)
        print("READING", extrafile, file=lgf, flush=True)
        with open(extrafile, 'r') as f:
            for l in f:
                l = l.strip()
                if len(l) > 0 and l[0] == "#":
                    print("SKIP COMMENT", l, file=lgf, flush=True)
                    continue
                bits = l.split()
                if len(bits) != 3:
                    print("SKIP NOT 3 FIELDS", l, file=lgf, flush=True)
                    continue
                line_count += 1
                word = normalize('NFC', bits[0])
                lemma = normalize('NFC', bits[1])
                tag = bits[2]
                if word in ghd_words.keys():
                    word_entry = ghd_words[word]
                    if tag in word_entry.lemmas:  # indexed by tag
                        word_entry.lemmas[tag].freq += 1
                    else:
                        new_lemma = Lemma(word, lemma, tag, 1)
                        new_lemma.src = "extra"
                        word_entry.lemmas[tag] = new_lemma
                else:
                    word_entry = Word(word)
                    # A new entry starts with count 1, as in the branch above
                    # (this previously used freq, which is undefined here).
                    new_lemma = Lemma(word, lemma, tag, 1)
                    new_lemma.src = "extra"
                    word_entry.lemmas[tag] = new_lemma
                    ghd_words[word] = word_entry
                    new_entries += 1
        print("Added", new_entries, "new entries.\n", file=lgf, flush=True)
        new_entries = 0

    # Print the five words with the most lemma entries, each with its
    # five most frequent lemmas.
    if verbose:
        sorted_words = sorted(ghd_words,
                              key=lambda k: len(ghd_words[k].lemmas),
                              reverse=True)
        for x in sorted_words[0:5]:
            print(ghd_words[x], file=sys.stderr)
            for l in sorted(sorted(ghd_words[x].lemmas.values(),
                                   key=attrgetter('tag'),
                                   reverse=False),
                            key=attrgetter('freq'),
                            reverse=True)[0:5]:
                print("  ", l, file=sys.stderr)

    # Count lemmatisation stats.
    lemmatiser_stats = Counter()
    # Possible lemmatiser "strategies"
    strategies = {
        "MLDTHF": "multi lemmas, no pos tag match, highest frequency",  # DT = different tag
        "MLNTHF": "multi lemmas, no tag, highest frequency",
        "MLSTHF": "multi lemmas, pos tag match, and highest frequency",
        "MLSTOF": "multi lemmas, pos tag match, but other frequency",
        "MLNTOF": "multi lemmas, no tag, other frequency",
        "OLDT": "one lemma, but different pos tag",
        "OLST": "one lemma, same pos tag",
        "OLNT": "one lemma, no tag",
        "FROG": "Frog lemma",
        "UNKNOWN": "unknown"
    }
    # Prefill the counters.
    lemmatiser_stats["unknown"] = 0
    for s in strategies:
        lemmatiser_stats[strategies[s]] = 0
    '''
    Lemmatiser strategy:
    Check if the word is in the dictionary. If it is:
      1) If it has only one tag/lemma entry, return it.
         ("one lemma, same pos tag" / "one lemma, different pos tag")
      2) More than one tag/lemma entry: go through the tag/lemmas:
         a) if a lemma with a similar pos tag is found, return it.
            ("multiple lemmas, same pos tag, highest frequency" /
             "multi lemmas, same pos tag, other frequency")
         b) otherwise, return the most frequent tag/lemma.
            ("multi lemmas, different pos tag, highest frequency")
         *) sorting was non-deterministic if same count?
    If it is not:
      1) Take the Frog entry, and return it. ("Frog" / "Frog list")
      2) If this fails: return None. ("unknown")
    '''

    # ---------------------------------
    # Process test file(s)
    # ---------------------------------
    # Look up a single word from the lexicon; this is mostly for debugging
    # and/or introspective purposes.
    if lookup_w:
        print("\nLOOKUP WORD", lookup_w)
        if lookup_w in ghd_words:
            print("  ", ghd_words[lookup_w])
            for l in sorted(ghd_words[lookup_w].lemmas.values(),
                            key=attrgetter('freq'),
                            reverse=True):
                print("    ", l)
        print("\nLEMMATISER", lemmatise(lookup_w, "", ghd_words, verbose))

    # Look up a single lemma in all words.
    if lookup_l:
        print("\nLOOKUP LEMMA", lookup_l)
        for x in ghd_words:
            output = []
            for l in sorted(ghd_words[x].lemmas.values(),
                            key=attrgetter('freq'),
                            reverse=True):
                if l.lemma == lookup_l:
                    output.append(l)
            if output:
                print(x)
                for o in output:
                    print("  ", o)

    # Test file format: lines of Greek text.
    if not filenames:
        print("\nNOTHING TO DO...", file=sys.stderr)
        lgf.close()
        sys.exit(0)

    for filename in filenames:
        # Check for my own output; a bit crude, but it prevents the worst mistakes.
        if filename.endswith(".stats.txt") or filename.endswith(".wlt.txt"):
            continue
        print("\nLEMMATISING", filename, file=sys.stderr)
        print("LEMMATISING", filename, file=lgf, flush=True)

        # Reset the counters.
        lemmatiser_stats["unknown"] = 0
        for s in strategies:
            lemmatiser_stats[strategies[s]] = 0

        # Output is put into these files.
        outprefix = filename
        if stats:
            outfile = outprefix + suffix + ".stats.txt"
        else:
            outfile = outprefix + suffix + ".wlt.txt"

        # Process the test file.
        lcount = 0
        hcount = 0  # count hash lemmas "foo#1"
        wcount = 0  # words processed
        if filename:
            with open(filename, 'r') as f:
                with open(outfile, 'w') as of:
                    for l in f:
                        l = l.strip()
                        if not l:
                            continue
                        words = l.split()
                        # We need a "wlt" mode for HDT text (and to check
                        # the results): only the first token is used.
                        if wltmode:
                            words = [words[0]]
                        if verbose:
                            print("words", words)
                        if remove_root and words and words[0] == "ROOT":
                            words.pop(0)
                        words = [normalize('NFC', w) for w in words]
                        if have_frog:
                            frog_out = query_frog_sentence(frog, " ".join(words),
                                                           verbose)
                        for word in words:
                            if verbose:
                                print("\n", word, lcount, wcount)
                            # First Frog for the POS tag, then the lemmatiser.
                            if have_frog:
                                try:
                                    frog_word = frog_out.pop(0)
                                except IndexError:
                                    print("ABORT. FROG OUTPUT EMPTY")
                                    sys.exit(1)
                                if verbose:
                                    print(frog_word)
                                frog_w = normalize('NFC', frog_word["text"])
                                frog_l = normalize('NFC', frog_word["lemma"])
                                frog_t = frog_word["pos"]
                                if verbose:
                                    print("frog(" + str(word) + "):",
                                          frog_w, frog_l, frog_t)
                            else:
                                frog_t = None
                            # Try our lemmatiser, with the Frog POS tag.
                            the_lemma, ltype = lemmatise(word, frog_t,
                                                         ghd_words, verbose)
                            if verbose:
                                print("lemmatiser:", word, frog_t,
                                      the_lemma, ltype)
                            # We possibly get (None, "UNKNOWN").
                            if not the_lemma:
                                # Use the Frog output for the lemma as well.
                                if have_frog and frog_w:
                                    the_lemma = Lemma(word, frog_l, frog_t, 0)
                                    the_lemma.src = "frog"
                                    ltype = "FROG"
                                else:
                                    the_lemma = None
                                    ltype = "UNKNOWN"
                            ltype = strategies[ltype]
                            lemmatiser_stats[ltype] += 1
                            if the_lemma:
                                # Note that the POS tag here is the one from
                                # the lexica, not the one supplied by Frog.
if verbose: print("lemma =", the_lemma) print(ltype) # if stats: of.write(word + "\t" + the_lemma.lemma + "\t" + the_lemma.tag + "\t" + repr(the_lemma) + "\t" + ltype + "\n") else: of.write(word + "\t" + the_lemma.lemma + "\t" + the_lemma.tag + "\n") else: #not the_lemma if stats: of.write(word + "\tUNKNOWN\tUNKNOWN\tNONE\t" + ltype + "\n") else: of.write(word + "\tNONE\tNONE\n") wcount += 1 lcount += 1 with open(outfile, 'a') as of: print("#", callstr, "[" + VERSION + "]", file=of, flush=True) print("#\n# line count", lcount, "word count", wcount, file=of, flush=True) for stat, count in sorted(lemmatiser_stats.items()): #for stat, count in lemmatiser_stats.most_common(): print("# {0:<60} {1:5n}".format(stat, count), file=of, flush=True) print("\nOutput in", file=lgf, flush=True) print(" ", outfile, file=lgf, flush=True) print("\nOutput in", file=sys.stderr) print(" ", outfile, file=sys.stderr)
            line_txt = re.sub(r'\\-', ' ', line_txt)  # spelling
            line_txt = re.sub(r'"', '', line_txt)
            line_txt = re.sub(r'[ ]+(?=[.,:;?!])', "", line_txt)
            # line_txt = re.sub(r'[\.!]*[!]+[\.!]*', '!', line_txt)  # replace combos including at least 1 '!'
            # line_txt = re.sub(r'[\.!?]*[?]+[\.!?]*', '?', line_txt)  # replace combos including at least 1 '?'
            # line_txt = re.sub(r'\.+', '.', line_txt)  # replace clusters of '.' with a single '.'
            line_txt = re.sub(r'[!.?]+', '!', line_txt)
            line_txt = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', '', line_txt)
            line_txt = re.sub(r" 's ", " ''s ", line_txt)
            line_txt = re.sub(r"^'s ", "''s ", line_txt)
            # if re.search(r'"".+""', line_txt):
            #     print(re.search(r'"".+""', line_txt).group())
            txt_dict[pair][part][spkr] += line_txt + " "

frog = Frog(FrogOptions(mwu=False, ner=False))

for pair in txt_dict:
    for part in txt_dict[pair]:
        with open("{}pos/{}/{}_{}.pos".format(ecsd_path, pair, pair, part),
                  "w", encoding="utf-8") as g:
            for spkr in txt_dict[pair][part]:
                print(pair, part, spkr)
                text = txt_dict[pair][part][spkr]
                word_list = frog.process(text)
                s_counter = 0
                w_counter = 0
                for word in word_list:
                    if word["index"] == "1":
                        s_counter += 1
from frog import Frog, FrogOptions

frog = Frog(FrogOptions(chunking=False, parser=False))


def get_words(s, debug=False):
    result = frog.process(s)
    kept, thrown = set(), set()
    for word in result:
        if all(not c.isalpha() for c in word['lemma']):
            thrown.add(word['lemma'])
        elif word['ner'] == 'O':
            kept.add(word['lemma'])
        else:
            if '_' in word['ner']:
                # Sometimes phrases are returned instead of words,
                # e.g. "zeg maar" will be returned as "zeggen_maar"
                # with the NER value being "O_O", so two types are
                # actually returned separated by an underscore. We
                # split these phrases again and add the "O" types to
                # the "kept" list.
                words = word['lemma'].split('_')
                types = word['ner'].split('_')
                if 'O' in types:
                    for w, t in zip(words, types):
                        if t == 'O':
                            kept.add(w)
                        else:
                            thrown.add(_format_thrown_word(w, t, debug))
                else:
                    # Assumed completion of the truncated branch: no "O"
                    # types at all, so the whole phrase is thrown away.
                    for w, t in zip(words, types):
                        thrown.add(_format_thrown_word(w, t, debug))
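# A usage sketch for get_words (the sentence is an assumption, and it is
# assumed the truncated function ends by returning the two sets):
#
#     kept, thrown = get_words("Jan liep gisteren door Amsterdam.")
#     # "kept" holds lemmas tagged "O" by the NER module; named entities
#     # such as "Jan" and "Amsterdam" end up in "thrown".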
import itertools
import json

from frog import Frog, FrogOptions
from flask import Flask, request

frog = Frog(FrogOptions())
app = Flask(__name__)


@app.route('/process', methods=['GET', 'POST'])
def process():
    if request.method == 'POST':
        return json.dumps(frog.process(request.form.get('document')))
    else:
        return ('Perform a POST request with a document parameter to get the '
                'Frog-tokenized and annotated sentence back')


@app.route('/organisations', methods=['GET', 'POST'])
def organisations():
    organisations = []
    if request.method == 'POST':
        processed_document = frog.process(request.form.get('document'))
        for is_organisation, organisation in itertools.groupby(
                processed_document, key=lambda x: x['ner'].endswith('ORG')):
            if is_organisation:
                organisations.append(" ".join(x['text'] for x in organisation))
    return json.dumps(organisations)


@app.route('/persons', methods=['GET', 'POST'])
def main():
    parser = argparse.ArgumentParser(
        description="",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '-s', '--nerset', type=str, help="NER FoLiA Set", action='store',
        default="https://raw.githubusercontent.com/proycon/folia/master/setdefinitions/namedentities.foliaset.ttl")
    parser.add_argument('-c', '--config', type=str, help="Frog configuration",
                        action='store', required=True)
    parser.add_argument('--notexact', dest='exact', help="Loose evaluation",
                        action='store_false', default=True)
    parser.add_argument('files', nargs='+', help='input files')
    args = parser.parse_args()

    frog = Frog(FrogOptions(ner=True, parser=False, xmlout=True), args.config)

    sentence = []
    entities = []
    precisions = []
    recalls = []
    entity = None
    entity_cls = None
    doc = None
    classeval = defaultdict(lambda: defaultdict(int))
    for filename in args.files:
        # extracttrain also works on a test gold standard
        for token, tag in readdata(filename):
            if token is None:  # end of sentence
                if entity:
                    entities.append((" ".join(entity), entity_cls))
                    entity = []
                if sentence:
                    print("Processing: ", " ".join(sentence), file=sys.stderr)
                    print("   Reference entities:", entities, file=sys.stderr)
                    doc = frog.process(" ".join(sentence))
                    precision, recall = evaluate(doc, entities, args.nerset,
                                                 classeval, args.exact)
                    print("   precision=", precision, " recall=", recall,
                          file=sys.stderr)
                    if precision is not None:
                        precisions.append(precision)
                    if recall is not None:
                        recalls.append(recall)
                sentence = []
                entities = []
            else:
                if tag[0] == 'B':
                    if entity:
                        entities.append((" ".join(entity), entity_cls))
                        entity = []
                    entity_cls = tag[2:]
                    entity.append(token)
                elif tag[0] == 'I':
                    entity.append(token)
                elif entity:
                    entities.append((" ".join(entity), entity_cls))
                    entity = []
                sentence.append(token)

    print("overall precision (macroav):\t", sum(precisions) / len(precisions))
    print("overall recall (macroav):\t", sum(recalls) / len(recalls))
    for cls, evaldata in classeval.items():
        try:
            print(cls + " precision (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fp']))
        except ZeroDivisionError:
            print(cls + " precision (microav):\tn/a")
        try:
            print(cls + " recall (microav):\t",
                  evaldata['tp'] / (evaldata['tp'] + evaldata['fn']))
        except ZeroDivisionError:
            print(cls + " recall (microav):\tn/a")
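# An example invocation of this NER evaluation script (the script name,
# config path and gold-standard file names are assumptions):
#
#     python3 ner_eval.py -c /etc/frog/frog.cfg gold1.iob gold2.iob
#
# Pass --notexact to switch from exact entity matching to loose evaluation.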
from frog import Frog, FrogOptions
from polyglot.downloader import downloader
from polyglot.text import Text, Word
import morfessor
import pickle
import re

Processed_Sentence = (" lezen optimaal liep europese unie gekregen spellen "
                      "rugzak super allesinds boomhut ontwikkelende "
                      "gemeenschappen vermeenigvuldigde getallen "
                      "Vereenvoudigd. ....... is werken lopen een kleine test "
                      "gewoon om te zien of het wel werkt.")
#Processed_Sentence = "Ik spring wat rond in het rond"

#frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
#                        mwu=False, chunking=True, ner=True, parser=False))
frog = Frog(FrogOptions(tok=True, lemma=True, morph=True, daringmorph=False,
                        mwu=False, chunking=False, ner=False, parser=False))

output = frog.process(Processed_Sentence)
print("")
print("RAW OUTPUT")
print(output)
print("length")
print(len(output))
print(output[0])
print(output[1])
print(output[2])
print(output[3])
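# For reference, each element of `output` is a dict per token; with this
# configuration the keys used elsewhere in this codebase are "index",
# "text", "lemma", "morph" and "pos". A sketch of the shape (values
# abbreviated, not actual Frog output):
#
#     {"index": "1", "text": "lezen", "lemma": "lezen",
#      "morph": "[lezen]", "pos": "WW(inf,vrij,zonder)", ...}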
def __init__(self, **kwargs):
    # Disable multiword recognition, which is performed by the chunker
    options = FrogOptions(parser=False, mwu=False, xmlIn=True, **kwargs)
    self.__frog = Frog(options)
#!/usr/bin/env python3

from __future__ import print_function, unicode_literals
from frog import Frog, FrogOptions
import folia.main as folia

frog = Frog(FrogOptions(parser=True))

output = frog.process_raw("Dit is een test")
print("RAW OUTPUT=", output)
output = frog.process("Dit is nog een test.")
print("PARSED OUTPUT=", output)

frog = Frog(FrogOptions(parser=True, xmlout=True))
output = frog.process("Dit is een FoLiA test.")
assert len(output.data) == 1
assert next(output.data[0].select(folia.Sentence)).text() == "Dit is een FoLiA test."
# output is now no longer a string but an instance of folia.Document,
# provided by the FoLiA library (folia.main)
print("FOLIA OUTPUT=")
print(output.xmlstring())

print("Inspecting FoLiA output (example):")
for word in output.words():
    print(word.text() + " " + word.pos() + " " + word.lemma())
assert len(output.words()) == 5
def change_text_to_lemma_POS(sentences, save=False, filename=None):
    # Sentence list to sentence list in Frog "lemma **<POS>**" form
    lemmapos_sentences = []
    frog = Frog(FrogOptions(tok=True, lemma=True, morph=False, daringmorph=False,
                            mwu=False, chunking=False, ner=False, parser=False))
    j = 0
    for sentenceToBeProcessed in sentences:
        if j % 1000 == 0:
            print(j + 1)
            print("of")
            print(len(sentences))
        j += 1
        sentenceToBeProcessed = sentenceToBeProcessed.rstrip('\n')
        output = frog.process(sentenceToBeProcessed)
        lemmapos_sentence = ""
        for i in range(0, len(output)):
            pos = str(output[i].get("pos"))
            lemma = str(output[i].get("lemma"))
            # Rewrite the CGN tag, e.g. "WW(pv,verl,ev)", as "<WW><pv><verl><ev>";
            # a featureless tag such as "LET()" becomes "<LET>"
            pos = "<" + pos
            pos = pos.replace("(", "><")
            pos = pos.replace(")", ">")
            pos = pos.replace(",", "><")
            pos = pos.replace("<>", "")
            lemmapos_word = lemma + " " + "**" + pos + "**"
            lemmapos_sentence = lemmapos_sentence + " " + lemmapos_word
        # Remove the leading space
        lemmapos_sentence = lemmapos_sentence[1:]
        lemmapos_sentences.append(lemmapos_sentence)
    if save is True:
        with open(filename, 'wb') as outputfile:
            pickle.dump(lemmapos_sentences, outputfile)
    return lemmapos_sentences
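# A usage sketch for change_text_to_lemma_POS (the input sentence is an
# assumption). Tracing the bracket rewriting above: a word such as "liep",
# with lemma "lopen" and CGN tag "WW(pv,verl,ev)", is rendered as
# "lopen **<WW><pv><verl><ev>**".
#
#     sentences = ["Ik liep naar huis."]
#     lemmapos_sentences = change_text_to_lemma_POS(sentences)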
                              line_txt)  # deal with d'rbij
            line_txt = re.sub(r'^\.', '', line_txt)
            # Prevent punctuation from being interpreted as SPEC(afk)
            line_txt = re.sub(r'[!?\.,:;]', lambda m: " " + m.group(), line_txt)
            # if re.search(r'"".+""', line_txt):
            #     print(re.search(r'"".+""', line_txt).group())
            # if len(line_txt) > 0:
            #     if line_txt[-1] not in [".", ",", "!", "?", ":", ";"]:
            #         # add . if chunk does not end in punctuation
            #         if re.search(r' [A-Za-z]$', line_txt):
            #             # prevent from being interpreted as SPEC(afk)
            #             line_txt += "!"
            #         else:
            #             line_txt += " ."
            txt_dict[file_n][speakers[file_n][spkr]] += line_txt + " "

frog = Frog(FrogOptions(parser=True))


def tag_files(files):
    for fl in files:
        with open(tens_path + "Annotations/pos/" + fl + ".pos", "w") as g:
            for spkr in txt_dict[fl]:
                print(fl, spkr)
                text = txt_dict[fl][spkr]
                # print(text)
                word_list = frog.process(text)
                print("BLA")
                s_counter = 0
                for word in word_list:
                    if word["index"] == "1":
                        s_counter += 1
import sys
import re
from frog import Frog, FrogOptions
import multiprocessing

frog = Frog(FrogOptions(parser=True, numThreads=1))

f_path = ("/Volumes/tensusers/timzee/other/" if sys.platform == "darwin"
          else "/vol/tensusers/timzee/other/")

trials = [
    "administrateur", "admiraal", "alarm", "balkon", "ballon", "bar", "baron",
    "bretel", "broer", "cabriolet", "champignon", "commandant", "compagnie",
    "crediteur", "dessert", "directeur", "donateur", "doorn", "duel",
    "dynastie", "epidemie", "expert", "galerie", "gazon", "gel", "generaal",
    "genie", "hoorn", "idee", "interieur", "journaal", "kanaal", "kapitein",
    "kopie", "luitenant", "magnetron", "majoor", "meneer", "mevrouw",
    "microfoon", "militair", "miljonair", "model", "monarch", "monogram",
    "monteur", "mortier", "officier", "perron", "pion", "pistool", "protocol",
    "redacteur", "regisseur", "reptiel", "residu", "restaurant", "saxofoon",
    "sergeant", "sjaal", "strategie", "telegram", "theorie", "trofee",
    "vampier"
]

print("Loading CELEX")
celex = {}
with open(f_path + "DPW3.CD", "r") as f:
    for line in f:
        l_list = line[:-1].split("\\")
        word = l_list[1]
        syls = l_list[4].split("-")
        syl_struc = [i.strip("[]") for i in l_list[5].split("[") if i != ""]