def lemmatizeParse(text):
    # For parsing sentences
    # print(text)
    reynir = Reynir()
    lemmas = []
    sents = reynir.parse(text)
    for sent in sents['sentences']:
        # print(sent)
        try:
            if sent.lemmas is None:
                raise AttributeError()
            else:
                lemmas.append(' '.join(sent.lemmas))
        except AttributeError:
            print("WARNING: lemmatize AttributeError from Reynir", sent)
            # TODO: CHECK HACKY LINE BELOW RVB
            # Fallback: lemmatize word by word via BÍN when Reynir cannot
            # produce lemmas for the sentence.
            sent = str(sent)
            bin = BIN_Compressed()
            bin_lemmas = []
            sent_words = sent_tokenize(sent.lower())
            sent_words = word_tokenize(' '.join(sent_words))
            for word in sent_words:
                word_lookup = bin.lookup(word)
                if word_lookup != []:
                    # Take the first field of the first BÍN match as the lemma.
                    bin_lemmas.append(word_lookup[0][0])
                else:
                    # Unknown word: keep it unchanged.
                    bin_lemmas.append(word)
            lemmas.append(' '.join(bin_lemmas))
    return lemmas
def lemmatizeParse(text):
    # For parsing sentences
    reynir = Reynir()
    lemmas = []
    sents = reynir.parse(text)
    for sent in sents['sentences']:
        try:
            lemmas.append(' '.join(sent.tree.lemmas))
        except AttributeError:
            # No parse tree: keep the raw sentence text instead.
            print("ERROR: lemmatize AttributeError, adding raw: " + str(sent))
            lemmas.append(str(sent))
    return lemmas
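# Usage sketch for lemmatizeParse (a minimal sketch; assumes the reynir
# package is installed, and the sample sentence below is illustrative only):
if __name__ == "__main__":
    for lemma_sentence in lemmatizeParse("Hundurinn elti köttinn út í garðinn."):
        print(lemma_sentence)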
def test_init():
    """ Test that importing and initializing the reynir module works """
    from reynir import Reynir
    global r
    r = Reynir()
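# Sketch of a possible follow-up test (an assumption, not part of the original
# suite): parse a sentence taken from elsewhere in this repo and check that a
# parse tree is produced.
def test_parse_single():
    sent = r.parse_single("Í fréttum er þetta helst")
    assert sent is not None
    # Assumes this sentence parses; if it doesn't, sent.tree is None.
    assert sent.tree is not None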
def remove_username_handle_and_clean(tweet):
    words = tweet.split()
    length_of_handle = len(words[0])
    cleaner_text = tweet[length_of_handle + 1:]
    no_handles_text = re.sub('@', '', cleaner_text)
    print(no_handles_text)
    no_hashtags_text = re.sub(r'\#[^\s]*', '', no_handles_text)
    return no_hashtags_text


textgen = textgenrnn(weights_path='model/colaboratory_weights.hdf5',
                     vocab_path='model/colaboratory_vocab.json',
                     config_path='model/colaboratory_config.json')

# corpus = open('althingi2.txt')
# gen = Markov(corpus)
r = Reynir()
debug = len(sys.argv) > 1
api = getApi()

# def is_non_type(tree):
#     return type(referenceSent.tree) == type(tree)

# def get_noun_phrase(tree):
#     if is_non_type(tree):
#         return 'NO'
#     if tree.is_terminal:
#         return 'NO'
#     if 'NP' == tree.tag:
#         print(tree.fl)
#         return tree.text
#     for child in tree.children:
from reynir import Reynir
import sys
from markovgen import Markov
import time
import datetime
import re

# corpus = open('althingi2.txt')
# gen = Markov(corpus)
r = Reynir()
referenceSent = r.parse_single('Í fréttum er þetta helst')


def is_grammatically_correct(test_string):
    sent = r.parse_single(test_string)
    return not is_non_type(sent.tree)


def is_non_type(tree):
    return type(referenceSent.tree) != type(tree)


def count_unique_words_in_BIN_list(lis):
    ids = []
    for item in lis:
        ids.append(item[1])
    return len(set(ids))
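# Usage sketch (inputs below are illustrative assumptions, not real data).
# is_grammatically_correct relies on Reynir returning a tree of the same type
# as the reference parse, and count_unique_words_in_BIN_list counts distinct
# ids in the second field of BÍN-style entries.
if __name__ == "__main__":
    print(is_grammatically_correct("Kötturinn sefur á stólnum."))
    print(count_unique_words_in_BIN_list([("hundur", 1), ("köttur", 2), ("hundur", 1)]))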
# Ignores the standard beginning line of files ('Name: Blabla')
def read_files():
    for file in sorted_files:
        with open(file, 'r', encoding='utf-8') as f:
            for line in f:
                if line.startswith('Name'):
                    pass
                elif line == '\n':
                    pass
                else:
                    txt = f.read().replace('\n', '')
                    yield txt


# For parsing sentences
reynir = Reynir()
# All files in dir
all_files = glob.glob('doc2vec/txts/*txt')
# All files sorted naturally
sorted_files = natural_sort(all_files)
# List created from generator
all_files = [w for w in read_files()]


# Yields all lemmas in every sentence
# Structure: [file[sent[lemmas]]]
def get_lemmas():
    for file in all_files:
        lemmas = []
        sents = reynir.parse(file)
        for sent in sents['sentences']:
            # Assumed continuation (the original snippet is truncated here);
            # mirrors lemmatizeParse above.
            try:
                lemmas.append(sent.tree.lemmas)
            except AttributeError:
                lemmas.append([str(sent)])
        yield lemmas
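# Usage sketch (assumes doc2vec/txts/ holds the corpus and that natural_sort
# is defined elsewhere in this module):
if __name__ == "__main__":
    first_file_lemmas = next(get_lemmas())
    print(first_file_lemmas[:2])  # lemma lists of the first two sentences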
class TermExtractor():
    def __init__(self, file_known_terms, file_patterns, file_stoplist):
        self.file_patterns = file_patterns
        self.file_known_terms = file_known_terms
        self.file_stoplist = file_stoplist
        # Note: Removing the dual-file option for now.
        # self.file_known_terms_lemmas = ""
        # self.file_known_terms_all = ""
        self.c_value_threshold = 3.0
        self.l_distance_threshold = 15
        self.s_ratio_threshold = 1.5
        # Reynir is *probably* more reentrant if kept as an instance variable,
        # given that it's going to be continually used for sentence parsing
        # and storage.
        self.r = Reynir()
        self.known_term_list = []
        self.pattern_list = {}
        self.known_term_list_roots = []
        self.term_candidate_list = []
        self.stop_list = []
        self.load_known_terms()
        self.populate_pattern_list()
        self.load_roots_from_known_terms()
        self.load_stop_list()

    def load_known_terms(self):
        # Note: Removing the dual-file option for now.
        """
        def load_known_terms(self, file_lemmas, file_all):
            if (file_lemmas) and (not file_all):
                with open(file_lemmas, "r", encoding="utf-8") as f_l:
                    for newline_l in iter(f_l.readline, ''):
                        line_l_full = str(newline_l)
                        line_l_string = line_l_full.rstrip()
                        if line_l_string:
                            known_term_list.append(line_l_string)
                f_l.close()
            elif (not file_lemmas) and (file_all):
                line_counter = 0
                with open(file_all, "r", encoding="utf-8") as f_a:
                    for newline_a in iter(f_a.readline, ''):
                        line_counter += 1
                        if line_counter % 3 == 2:
                            line_a_full = str(newline_a)
                            line_a_string = line_a_full.rstrip()
                            if line_a_string:
                                known_term_list.append(line_a_string)
                f_a.close()
            else:
                known_term_list = []
        """
        if self.file_known_terms:
            with open(self.file_known_terms, "r", encoding="utf-8") as file_l:
                for newline_l in iter(file_l.readline, ''):
                    line_l_full = str(newline_l)
                    line_l_string = line_l_full.rstrip()
                    if line_l_string:
                        self.known_term_list.append(line_l_string)
            file_l.close()

    def populate_pattern_list(self):
        if self.file_patterns:
            self.pattern_list[tuple()] = True
            with open(self.file_patterns, "r", encoding="utf-8") as file_p:
                for line_p in file_p:
                    tag_seq = []
                    for tag in line_p.split():
                        # only uses first character of tag
                        tag_seq.append(tag[0])
                        # set this sequence's value to false because the
                        # pattern is still incomplete.
                        if tuple(tag_seq) not in self.pattern_list:
                            self.pattern_list[tuple(tag_seq)] = False
                    self.pattern_list[tuple(tag_seq)] = True
                    # line_p_list = tuple(ifd_tag[0] for ifd_tag in line_p_str.split())
                    # self.pattern_list.add(line_p_list)
            # print(self.pattern_list)
            file_p.close()

    def load_roots_from_known_terms(self):
        if len(self.known_term_list) > 0:
            resources = {
                "modifiers": os.path.join(os.path.dirname(__file__), 'resources', 'modifiers.dawg'),
                "heads": os.path.join(os.path.dirname(__file__), 'resources', 'heads.dawg'),
                "templates": os.path.join(os.path.dirname(__file__), 'resources', 'templates.dawg'),
                "splits": os.path.join(os.path.dirname(__file__), 'resources', 'splits.dawg')
            }
            kv = kvistur.Kvistur(**resources)
            for line in self.known_term_list:
                root_line = []
                line_list = line.split()
                for word in line_list:
                    score, tree = kv.decompound(word)
                    compound_list = tree.get_atoms()
                    if len(compound_list) > 1:
                        root_line.append(compound_list)
                if len(root_line) > 0:
                    self.known_term_list_roots.append(root_line)

    def load_stop_list(self):
        if self.file_stoplist:
            with open(self.file_stoplist, "r", encoding="utf-8") as file_s:
                for newline_s in iter(file_s.readline, ''):
                    line_s_full = str(newline_s)
                    line_s_string = line_s_full.rstrip()
                    if line_s_string:
                        """
                        line_s_list = [word for word in line_s_string.split()]
                        stop_l.append(line_s_list)
                        """
                        self.stop_list.append(line_s_string)
            file_s.close()

    def line_tokenize(self, newline):
        list_of_tokenized_words = []
        for token in tokenize(newline):
            kind, txt, val = token
            if kind == TOK.WORD:
                list_of_tokenized_words.append(txt)
        return list_of_tokenized_words

    def line_tag(self, tokenized_list):
        str_tokens = " ".join(tokenized_list)
        parsed_tokens = self.r.parse_single(str_tokens)
        return parsed_tokens

    def line_lemmatize(self, pos_tagged_sentence):
        lemmatized_words = []
        if pos_tagged_sentence.tree is not None:
            number_of_tags = len(pos_tagged_sentence.ifd_tags)
            number_of_words = len(pos_tagged_sentence.lemmas)
            lemma_list_with_dashes = pos_tagged_sentence.lemmas
            ifd_tag_list_with_dashes = pos_tagged_sentence.ifd_tags
            """
            Reynir sometimes inserts dashes ("-") into lemmas and tags. Said
            dashes may break functionality in code that doesn't expect them,
            including 3rd party programs like ABLTagger and Nefnir. Moreover,
            anyone maintaining/expanding this code may not be aware that
            Reynir does this. So let's make sure the lemmas and the IFD tags
            are dash-free.
            """
            ifd_tag_list_full = [
                d.replace('-', '') for d in ifd_tag_list_with_dashes
            ]
            lemma_list_spaces = [
                le.replace('-', '') for le in lemma_list_with_dashes
            ]
            """
            Reynir also occasionally creates a single lemma out of more than
            one word, which can lead to a number of problems (including
            immediate misalignment between the list of words and the list of
            corresponding tags). So we check for empty spaces in each lemma
            and split it accordingly, ensuring we always end up with a list of
            single-word lemmas.
            """
            lemma_list = [
                space_split_words for unsplit_entry in lemma_list_spaces
                for space_split_words in unsplit_entry.split(" ")
            ]
            ifd_tag_list = [c[:1] for c in ifd_tag_list_full]
            for i in range(0, number_of_tags):
                word_tuple = (lemma_list[i], ifd_tag_list[i])
                lemmatized_words.append(word_tuple)
        return lemmatized_words

    def text_tag(self, tok_text, model_type):
        """
        ABLTagger strongly prefers that each phrase end with some kind of
        punctuation. Before we pass it our input, we check for a full stop or
        a question mark; if neither is present, we add the former.
""" with open("abl/deepmark.txt", "w+", encoding="utf-8") as file_out: for tokenized_phrase in tok_text: last_token = str(tokenized_phrase[-1]) if not ((last_token == ".") or (last_token == "?")): tokenized_phrase.append(".") for token in tokenized_phrase: file_out.write(str(token)) file_out.write('\n') file_out.close() subprocess.call([ "python3", "tag.py", "--input", "deepmark.txt", "--model", model_type ], cwd="abl") return "abl/deepmark.txt.tagged" def text_lemmatize(self, file_tokenized): with open(file_tokenized, "r", encoding="utf-8") as abl_extra_lines: with open("Nefnir/abl_output.txt", "w", encoding="utf-8") as abl_out: for line in abl_extra_lines: str_line_unstripped = str(line) str_line = str_line_unstripped.rstrip() if (str_line.startswith('.') or str_line.startswith('?')): abl_out.write(str_line + "\n\n") elif (str_line != ""): abl_out.write(str_line + "\n") abl_extra_lines.close() subprocess.call([ "python3", "nefnir.py", "-i", "abl_output.txt", "-o", "lemmas.txt" ], cwd="Nefnir") lemmatized_lists = [] current_list_of_tuples = [] with open("Nefnir/lemmas.txt", "r", encoding="utf-8") as nefnir_lemmas: for n_line in nefnir_lemmas: str_n_unstripped = str(n_line) str_n = str_n_unstripped.rstrip() """ Nefnir divides on blank spaces, so if the line is either blank or starts with something that's not alphanumeric, we've reached the end of our phrase and can add it to lemmatized_lists. """ str_starting_character = str_n[:1] #De Morgan's laws again: not A or not B <==> not (A and B) if not ((str_n) and (str_starting_character.isalnum())): if (len(current_list_of_tuples) > 0): lemmatized_lists.append(current_list_of_tuples) current_list_of_tuples = [] else: line_entries = str_n.split("\t") str_word = line_entries[2] str_category_full_ifd = line_entries[1] str_category_first_char = str_category_full_ifd[0] word_tuple = (str_word, str_category_first_char) current_list_of_tuples.append(word_tuple) nefnir_lemmas.close() return lemmatized_lists @staticmethod def tag_and_lemmatize(text): """ Input: A text string. 
        Output: A list of lists of tuples containing lemmas, tags, and words
        """
        HYPHENS = "-–—"  # HACK: hotfix because of stupid tokenizer normalisation
        DQUOTES = '"“„”«»'  # HACK: hotfix because of stupid tokenizer normalisation
        res = requests.post(url=API_LOCATION,
                            data={
                                'text': text,
                                'model_type': 'coarse',
                                'lemma': 'on'
                            })
        sentences = [
            sent for para in res.json()['paragraphs']
            for sent in para['sentences']
        ]
        # print(sentences)
        outputs = []
        sentence = []
        text_locations = []
        i = 0
        for sentence in sentences:
            output_sentence = []
            for word_obj in sentence:
                lemma, mark, ord = (word_obj['lemma'], word_obj['tag'],
                                    word_obj['word'])
                while not text[i:i + len(ord)] == ord:
                    # HACK: Same hack as above
                    if text[i:i + len(ord)] in HYPHENS and ord in HYPHENS:
                        break
                    if text[i:i + len(ord)] in DQUOTES and ord in DQUOTES:
                        break
                    i += 1
                    # print(text[i:i+len(ord)], ord)
                    if i > len(text):
                        break
                output_sentence.append((lemma, mark, ord, i))
                i += len(ord)
            outputs.append(output_sentence)
        return outputs

    def add_candidate_to_global_list(self,
                                     candidate_string,
                                     unlemmatized_string,
                                     start_end,
                                     term_candidate_list=None):
        if term_candidate_list is None:
            term_candidate_list = self.term_candidate_list
        term_wordcount = len(candidate_string.split())
        term_already_exists = False
        for existing_entry in term_candidate_list:
            if existing_entry["lemmas"] == candidate_string:
                term_already_exists = True
                # existing_entry["original"][1] += 1
                existing_entry["frequency"] += 1
                existing_entry["occurences"].append(unlemmatized_string)
                existing_entry["boundaries"].append(start_end)
                break
        if not term_already_exists:
            term_candidate_list.append({
                "lemmas": candidate_string,
                "frequency": 1,
                "parent_count": 0,
                "parent_types": 0,
                "wordcount": term_wordcount,
                "c_value": 0.0,
                "distance": 0,
                "s_ratio": -1,
                "occurences": [(unlemmatized_string, )],
                "boundaries": [start_end]
            })

    def check_for_stopwords(self, candidate_string):
        found_stopword = False
        for stop_string in self.stop_list:
            stop_string_regex = r"(^|\s)" + stop_string + r"(\s|\.|$)"
            stop_pattern = re.compile(stop_string_regex, re.IGNORECASE)
            if stop_pattern.search(candidate_string) is not None:
                found_stopword = True
                return found_stopword
        return found_stopword

    def parse(self, lemmatized_line, term_candidate_list=None):
        if term_candidate_list is None:
            term_candidate_list = self.term_candidate_list
        pl = self.pattern_list
        sentence_length = len(lemmatized_line)
        # O(n*(complexity of inner loop)) => O(n*O(1)) == O(n)
        # Improves upon the earlier implementation in that complexity doesn't
        # vary with the number of patterns.
        for i in range(len(lemmatized_line)):
            lemmas = []
            words = []
            tags = []
            start_index = lemmatized_line[i][3]
            tag_tuple = tuple()
            j = 0
            # Dict membership check is O(1); tuple conversion is O(k) with
            # k < length of longest pattern, so O(1).
            while tag_tuple in pl and i + j < sentence_length:
                lemma, tag, word, idx = lemmatized_line[i + j]
                lemmas.append(lemma)
                words.append(word)
                tags.append(tag)
                tag_tuple = tuple(tags)
                if tag_tuple in pl and pl[tag_tuple]:
                    end_index = idx + len(word)
                    lemma_string = " ".join(lemmas)
                    word_string = " ".join(words)
                    start_end = [str(start_index), str(end_index)]
                    if not self.check_for_stopwords(lemma_string):
                        self.add_candidate_to_global_list(
                            lemma_string, word_string, start_end,
                            term_candidate_list)
                j += 1

        # Earlier implementation, kept for reference. It ran in O(k*n*m),
        # where k is the number of patterns, n is the length of the sentence
        # and m is the length of the longest pattern; m is generally low, so
        # roughly O(k*n).
        #
        # #Starting at each successive word in our candidate sentence...
        # for sentence_word_index, current_word in enumerate(lemmatized_line):
        #     #...go through every category pattern that's sufficiently short...
        #     for pattern_index, pattern_type in enumerate(self.pattern_list):
        #         if len(pattern_type) + sentence_word_index <= sentence_length:
        #             match = True
        #             candidate_sentence = []
        #             unlemmatized_words = []
        #             pattern_range = len(pattern_type)
        #             # ...and compare side-by-side the sequence of pattern tags and word tags.
        #             for category_index, category_type in enumerate(pattern_type):
        #                 if category_type != lemmatized_line[sentence_word_index + category_index][1]:
        #                     """
        #                     If we spot a mismatch, immediately stop checking this particular pattern,
        #                     break the innermost "for" loop, and begin checking the next pattern.
        #                     """
        #                     match = False
        #                     break
        #                 else:
        #                     """
        #                     If there's a match between the pattern tag and the word tag at this
        #                     particular offset, add that one word to candidate_sentence[] and check
        #                     the next word in line.
        #                     """
        #                     candidate_sentence.append(lemmatized_line[sentence_word_index + category_index][0])
        #                     unlemmatized_words.append(lemmatized_line[sentence_word_index + category_index][2])
        #             if match:
        #                 """
        #                 We've completed all comparisons for this particular pattern at this particular
        #                 offset in our candidate, and we've found a match. Convert candidate_sentence to
        #                 a string, check it's free of any stoplist phrases and, if so, add it to our
        #                 global list of candidates.
        #                 Note that no matter whether this particular pattern occurred in the sentence,
        #                 we'll keep checking all other patterns from the *same* starting point in that
        #                 sentence *before* we move our starting point to the sentence's next word in
        #                 line. As a result, we're counting all candidate occurrences, including nested
        #                 ones.
        #                 """
        #                 sentence_string = " ".join(candidate_sentence)
        #                 unlemmatized_phrase = " ".join(unlemmatized_words)
        #                 if not self.check_for_stopwords(sentence_string):
        #                     self.add_candidate_to_global_list(
        #                         sentence_string,
        #                         unlemmatized_phrase,
        #                         lemmatized_line,
        #                         term_candidate_list)

    def calculate_c_values(self, term_candidate_list=None):
        if term_candidate_list is None:
            term_candidate_list = self.term_candidate_list
        term_candidate_list.sort(key=lambda x: x["wordcount"], reverse=True)
        start = 0
        max_index_number = len(term_candidate_list) - 1
        highest_wordcount = term_candidate_list[start]["wordcount"]
        while (start <= max_index_number) and (
                term_candidate_list[start]["wordcount"] >= highest_wordcount):
            start += 1
        """
        i and j are list indices used to repeatedly scan down the candidate
        list as we search for smaller terms nested inside larger ones. If
        every candidate in the entire list has the same number of words, the
        program will automatically skip the "range" for-loop below.
        """
        i = start
        for j, term in enumerate(term_candidate_list[start:max_index_number + 1]):
            if term["wordcount"] < term_candidate_list[i]["wordcount"]:
                i = j
            for larger_term in term_candidate_list[0:i]:
                if term["lemmas"] in larger_term["lemmas"]:
                    """
                    "parent_count" is the sum of non-nested occurrences of
                    every larger term that contains j as a subterm (i.e. each
                    larger term's total frequency minus its frequency
                    specifically as a subterm of some even *larger* term).
                    """
                    term["parent_count"] += (larger_term["frequency"] -
                                             larger_term["parent_count"])
                    # "parent_types" is the number of *unique* larger terms of
                    # which j is a subterm.
term["parent_types"] += 1 for term in term_candidate_list: log2a = math.log(term["wordcount"], 2.0) constant_i = 1.0 small_c = constant_i + log2a f_a = term["frequency"] SUM_bTa_f_b = term["parent_count"] P_Ta = term["parent_types"] if (term["wordcount"] == highest_wordcount) or (P_Ta < 1): term["c_value"] = small_c * f_a else: term["c_value"] = small_c * (f_a - ((1.0 / P_Ta) * SUM_bTa_f_b)) def find_levenshtein_distances(self, term_candidate_list=None): if term_candidate_list is None: term_candidate_list = self.term_candidate_list if (len(self.known_term_list) > 0): for t in term_candidate_list: lowest_distance = 1000 for known in self.known_term_list: """ 1) mode="NW" means the candidate must be an exact match for a known term. We do have an option ("HW") for substring searches, but that would lead to false positives. 2) task="distance" avoids wasting time trying to chart an optimal L-path (which we're not looking for anyway). """ curr_lev_comparison = edlib.align(t["lemmas"], known, mode="NW", task="distance") curr_lev_distance = curr_lev_comparison["editDistance"] if (curr_lev_distance < lowest_distance): lowest_distance = curr_lev_distance t["distance"] = lowest_distance def find_common_roots(self, term_candidate_list=None): if term_candidate_list is None: term_candidate_list = self.term_candidate_list resources = { "modifiers": os.path.join(os.path.dirname(__file__), 'resources', 'modifiers.dawg'), "heads": os.path.join(os.path.dirname(__file__), 'resources', 'heads.dawg'), "templates": os.path.join(os.path.dirname(__file__), 'resources', 'templates.dawg'), "splits": os.path.join(os.path.dirname(__file__), 'resources', 'splits.dawg') } kv = kvistur.Kvistur(**resources) for candidate_line in term_candidate_list: number_of_compound_words = 0 stem_match_counter = 0 match_ratio = 0.0 candidate_word_list = candidate_line["lemmas"].split() for candidate_word in candidate_word_list: score, tree = kv.decompound(candidate_word) candidate_compound_list = [] candidate_compound_list = tree.get_atoms() if (len(candidate_compound_list) > 1): number_of_compound_words += 1 candidate_last_stem = candidate_compound_list[-1] for known_roots_line in self.known_term_list_roots: for segmented_word in known_roots_line: if (candidate_last_stem == segmented_word[-1]): stem_match_counter += 1 if (number_of_compound_words < 1): match_ratio = -1.0 else: match_ratio = stem_match_counter / number_of_compound_words candidate_line["s_ratio"] = match_ratio def filter_results(self, use_extra_thresholds=False, term_candidate_list=None): filtered_terms = [] if term_candidate_list is None: term_candidate_list = self.term_candidate_list """ if( (l_distance_threshold is not None) and (stem_ratio_threshold is not None) ): extra_thresholds = True """ for t in term_candidate_list: """ First, let's eliminate any candidates that already exist in known_term_list. (We don't want do do this earlier in the program because these candidates may contain new and unknown *nested* terms, and we've a better chance of finding those in the program's statistical calculations if we haven't yet eliminated anything.) There are several ways to implement this particular check, some faster than others. The lists of term candidates and known terms aren't likely to be long enough to affect performance, but if that changes, using "set" or "bisect" instead of "in", and pre-alphabetizing the list of known terms, might help. 
""" if (use_extra_thresholds and (t["lemmas"] in self.known_term_list)): #Candidate is a known term, so we won't add it to our filtered list. continue passed_c = False passed_l = False s_exists = False passed_s = False if (t["c_value"] >= self.c_value_threshold): passed_c = True if (use_extra_thresholds): if (t["distance"] <= self.l_distance_threshold): passed_l = True if (t["s_ratio"] >= 0.0): s_exists = True if (t["s_ratio"] >= self.s_ratio_threshold): passed_s = True if ((passed_c) and (not use_extra_thresholds)): # t["sentence"] = " ".join([x[2] for x in t["sentence"]]) filtered_terms.append(t) elif ((passed_c) or ((use_extra_thresholds) and ((passed_l) or (passed_s)))): # t["sentence"] = " ".join([x[2] for x in t["sentence"]]) filtered_terms.append(t) return filtered_terms def convert_list_output(self, term_candidate_list=None): if term_candidate_list is None: term_candidate_list = self.term_candidate_list return [[ x["lemmas"], x["frequency"], x["parent_count"], x["parent_types"], x["wordcount"], x["c_value"], x["distance"], x["s_ratio"] ] for x in term_candidate_list]