"quels","qui","sa","sans","ses","si","sien","son","sont","sous","sur","ta","tandis","tellement","tels","tes","ton","tous", "tout","trop","très","tu","votre","vous","vu","ça","sa", "son", "ses", "de", "a"] en_stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now", "even"] pos_tagger = CoreNLPParser('http://localhost:9004', tagtype='pos') class PairOfEntitiesFeatures(): """ For a given pair of entities in a sentence, find the features between them Features for now include : - surface form entity 1 - surface form entity 2 - type entity 1 (PER, ORG, LOC...) - type entity 2 (PER, ORG, LOC...) - words between entites - x words before the entity 1 - x words after entity 2 - shortest dependency path between two entities
import csv
import math
import os

from nltk import CoreNLPParser

# Helpers such as compile_pattern, word_filter, skip_by_tags, skip_word_list
# and append_to_word_extract are assumed to be defined elsewhere in this module.


def create_word_csv(speaker_paths, word_extract_path, lex_table,
                    filename_annotation_map, file_timeseg_map):
    variant_match = dict()
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')

    # Create the output CSV (with header) if it does not exist yet.
    if not os.path.exists(word_extract_path):
        with open(word_extract_path, 'w', newline="") as word_extract_csv:
            csv_writer = csv.writer(word_extract_csv, delimiter=',',
                                    quotechar='"', quoting=csv.QUOTE_MINIMAL)
            csv_writer.writerow(['trans_id', 'beg_hms', 'sym_seq', 'Word_SWG',
                                 'var_code', 'Word_German', 'POS_tag'])

    # Build a dict with the compiled variant pattern as key; words that match
    # no pattern are tagged later via the POS tagger.
    for r in zip(lex_table['word_variant'], lex_table['word_standard'],
                 lex_table['word_vars'], lex_table['POS_tag']):
        v_pattern = compile_pattern(r[0], r[2])
        if v_pattern not in variant_match.keys():
            variant_match[v_pattern] = []
        variant_match[v_pattern].append(r)

    for w_var, w_val in variant_match.items():
        if len(w_val) > 1:
            print(w_var, w_val)

    # Check if the word's lemma is "gehen". If it is, don't tag the word as SAF5.
    gehen_variants = set()
    locations = lex_table.loc[lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
            gehen_variants.add(g_pattern)

    # Get speaker file names.
    for speaker in speaker_paths:
        file_list = [file for file in os.listdir(speaker)
                     if file.endswith('.TextGrid')]
        for file_name in file_list:
            outputs = []
            annotations = filename_annotation_map[file_name]
            time_seg_map = file_timeseg_map[file_name]
            # Now time stamps and word count.
            for word_annotation in annotations:
                beg_hms = word_annotation[-1]
                word_annotation = word_annotation[:-1]
                original_segment = time_seg_map[beg_hms]
                pointer_orgseg = 0
                for i, w in enumerate(word_annotation):
                    if not w:  # skip empty words
                        continue
                    # Find the word's position (sym_seq) in the original segment.
                    sym_seq = None
                    for org_idx, t in enumerate(original_segment):
                        if sym_seq is not None:
                            break
                        words = word_filter(t)
                        if words:
                            for word in words:
                                # If the same word occurs twice, take the later one.
                                if (w == word) and (org_idx >= i) \
                                        and (org_idx >= pointer_orgseg) \
                                        and (sym_seq is None):
                                    # TODO: clean the original segment in a helper method.
                                    sym_seq = org_idx + 1
                                    pointer_orgseg = org_idx
                    # Check for var: REL.
                    rel = False
                    if i + 1 < len(word_annotation):  # make sure the next word exists
                        w_next = word_annotation[i + 1]
                        if "[REL]" in w_next:
                            rel = True
                            if "wo" in w:
                                rel_var = " RELd"
                            elif "als" in w or w.startswith("d") \
                                    or w.startswith("wel") or w.startswith("jed"):
                                rel_var = " RELs"
                            elif ("was" in w) or ("wie" in w) or ("wer" in w):
                                rel_var = " RLOs"
                            else:
                                rel_var = " UNK"
                    # Regular DDM tagging.
                    std_list = set()
                    ddm_list = set()
                    pos_list = set()
                    no_match = True
                    for p in variant_match.keys():  # could be a separate method
                        if any("IRV" in d for d in ddm_list):
                            break
                        if p.search(w) is not None:
                            no_match = False
                            replace = True
                            for values in variant_match[p]:
                                w_var = values[0].replace("*", "")  # word variant
                                w_std = values[1].replace("*", "")  # word standard
                                if std_list:
                                    tmp_std = set()
                                    while std_list:
                                        s = std_list.pop()
                                        if p.search(s) is not None:
                                            std = s.replace(w_var, w_std) if replace else values[1]
                                            tmp_std.add(std)
                                        else:
                                            tmp_std.add(s)
                                    std_list.update(tmp_std)
                                else:
                                    std = w.replace(w_var, w_std) if replace else values[1]
                                    std_list.add(std)
                                if isinstance(values[2], float) and math.isnan(values[2]):
                                    # empty var_code, do nothing
                                    ddm_list.add(' ')
                                else:
                                    ddm_list.add(values[2])
                                    # TODO: another check for the lex table, or ignore
                                    # bad word_vars when reading the lex table.
                                pos_list.add(values[3])
                    if no_match:
                        standard = w
                        ddm = " "
                        pos = pos_tagger.tag([w])[0][1]
                    else:
                        standard = " ".join(std_list)
                        if len(std_list) > 1:
                            print(w, "std: ", standard)
                        ddm = " ".join(str(d) for d in ddm_list)
                        if any("SAF5" in d for d in ddm_list):
                            for g_pattern in gehen_variants:
                                if g_pattern.search(w) is not None:
                                    ddm = ddm.replace("SAF5d", "")
                                    ddm = ddm.replace("SAF5s", "")
                        pos = " ".join(str(p) for p in pos_list)
                    if rel:
                        ddm = ddm + rel_var
                    ddm = ddm.strip()
                    output = [file_name[file_name.rfind("_") + 1:-9], w, ddm,
                              standard, pos, beg_hms, sym_seq]
                    outputs.append(output)

            outputs = skip_by_tags(outputs, 'r')
            outputs = skip_by_tags(outputs, 'wl')
            outputs = skip_by_tags(outputs, 'wg')

            word_list1_start = ["Finger", "Flüge", "Biene", "Hunger", "immer",
                                "Äpfel", "Apfel", "Asche", "zum", "waschen"]
            word_list1_end = ["laufen", "Frage", "Linde", "meist", "Haar",
                              "Huhn", "Türe", "Kinder", "alle", "Gast"]
            word_list2_start = ["Flüge", "Fliege", "Söhne", "Sehne", "können",
                                "kennen", "Türe", "Tiere", "vermissen", "vermessen"]
            word_list2_end = ["heiter", "heute", "Feuer", "feiern", "Ofen",
                              "oben", "Kreide", "Kreuze", "Magen", "sagen"]
            ft_1_start = ["Vor", "Zeiten", "war", "ein", "König", "und",
                          "eine", "Königin", "die", "sprachen"]
            ft_1_end = ["alte", "Frau", "mit", "einer", "Spindel", "und",
                        "spann", "emsig", "ihren", "Flachs"]
            ft_2_start = ["Es", "war", "einmal", "eine", "alte", "Geiß",
                          "die", "hatte", "sieben", "junge"]
            ft_2_end = ["er", "in", "seinen", "Rachen", "nur", "das",
                        "jüngste", "fand", "er", "nicht"]
            ft_3_start = ["In", "den", "alten", "Zeiten", "wo", "das",
                          "Wünschen", "noch", "geholfen", "hat"]
            ft_3_end = ["bei", "seinesgleichen", "und", "quakt", "und",
                        "kann", "keines", "Menschen", "Geselle", "sein"]

            outputs = skip_word_list(outputs, word_list1_start, word_list1_end, 'wl')
            outputs = skip_word_list(outputs, word_list2_start, word_list2_end, 'wl')
            outputs = skip_word_list(outputs, ft_1_start, ft_1_end, 'ft')
            outputs = skip_word_list(outputs, ft_2_start, ft_2_end, 'ft')
            outputs = skip_word_list(outputs, ft_3_start, ft_3_end, 'ft')

            for output in outputs:
                append_to_word_extract(*output)
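# Hedged usage sketch (not from the source): the lexicon path, speaker layout
# and the two map arguments below are placeholders. lex_table must provide at
# least the columns word_variant, word_standard, word_vars, POS_tag and
# word_lemma; filename_annotation_map maps a TextGrid file name to its word
# annotations (the last element of each annotation is the begin timestamp
# beg_hms), and file_timeseg_map maps a file name to {beg_hms: segment tokens}.
if __name__ == '__main__':
    import pandas as pd

    example_lex_table = pd.read_csv('lexicon.csv')   # hypothetical lexicon file
    example_speakers = ['speakers/S001']             # directories holding .TextGrid files
    example_annotations = {}                         # built elsewhere in the pipeline
    example_timesegs = {}                            # built elsewhere in the pipeline
    create_word_csv(example_speakers, 'word_extract.csv', example_lex_table,
                    example_annotations, example_timesegs)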
import os
import re
from subprocess import Popen, PIPE

from nltk import CoreNLPParser


class np_extractor:

    def __init__(self, corpus_dir, output_dir):
        self.corpus_dir = corpus_dir
        self.noun_out_path = os.path.join(output_dir, 'nouns.csv')
        self.np_out_path = os.path.join(output_dir, 'noun_phrases.csv')
        self.parser = CoreNLPParser(url='http://localhost:9000')
        self.n_count = {}
        self.np_count = {}

    def get_parse_tree(self, sentence):
        return next(self.parser.raw_parse(sentence))

    def get_nps(self, tree):
        """Collect lower-cased NP strings longer than two characters."""
        nps = []
        np_trees = list(tree.subtrees(filter=lambda x: x.label() == 'NP'))
        for np_tree in np_trees:
            np_str = " ".join(np_tree.leaves())
            np_str = re.sub(r"^[\d\s]+", "", np_str)
            np_str = re.sub(r"[^a-zA-Z\s]+", "", np_str)
            np_str = np_str.lower()
            if len(np_str) > 2:
                nps.append(np_str)
        return nps

    def get_Nouns(self, tree):
        """Collect lower-cased NN tokens longer than two characters."""
        nouns = []
        for tag in tree.pos():
            if tag[1] == "NN" and len(tag[0]) > 2:
                nouns.append(tag[0].lower())
        return nouns

    def extract(self, sentence):
        try:
            tree = self.get_parse_tree(sentence)
            nps = self.get_nps(tree)
            nouns = self.get_Nouns(tree)
            return nps, nouns
        except StopIteration:
            return [], []

    def process_corpus(self):
        """Count nouns and noun phrases in every file under corpus_dir and
        write the sorted counts to nouns.csv and noun_phrases.csv."""
        assert os.path.isdir(self.corpus_dir)
        for root, sub, files in os.walk(self.corpus_dir):
            for file in files:
                file_path = os.path.join(root, file)
                print("processing {}".format(file))
                with open(file_path, encoding='utf8', errors="ignore") as fin:
                    for line in fin:
                        nps, nouns = self.extract(line)
                        for np in nps:
                            if np not in self.np_count:
                                self.np_count[np] = 0
                            self.np_count[np] += 1
                        for noun in nouns:
                            if noun not in self.n_count:
                                self.n_count[noun] = 0
                            self.n_count[noun] += 1
        sort_nouns = sorted(self.n_count.items(), key=lambda x: x[1], reverse=True)
        sort_nps = sorted(self.np_count.items(), key=lambda x: x[1], reverse=True)
        with open(self.noun_out_path, 'w') as fout:
            for (noun, count) in sort_nouns:
                fout.write("{},{}\n".format(noun, count))
        with open(self.np_out_path, 'w') as fout:
            for (np, count) in sort_nps:
                fout.write("{},{}\n".format(np, count))

    def start_standford(self):
        """Launch a local Stanford CoreNLP server and wait until it is listening."""
        stanforNLP_server_cmd = (
            "java -mx4g -cp * edu.stanford.nlp.pipeline.StanfordCoreNLPServer "
            "-preload tokenize,ssplit,pos,lemma,parse,depparse "
            "-status_port 9000 -port 9000 -timeout 15000")
        self.start_server = Popen(
            stanforNLP_server_cmd.split(),
            cwd=r"G:\lib\stanford-corenlp-full-2016-10-31",
            stderr=PIPE, stdout=PIPE, shell=True)
        while True:
            line = str(self.start_server.stderr.readline())
            print(line)
            success_mark = 'StanfordCoreNLPServer listening at'
            except_mark = 'Address already in use'
            if success_mark in line:
                print("server started...")
                break
            elif except_mark in line:
                print("server already started or port occupied...")
                break
        self.start_server.stderr.close()
        self.start_server.stdout.close()
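# Minimal usage sketch (paths are placeholders, not from the source): assumes a
# CoreNLP server is reachable at http://localhost:9000, either started via
# start_standford() with the CoreNLP jars under the hard-coded cwd above, or
# started by hand with the same java command.
if __name__ == '__main__':
    extractor = np_extractor(corpus_dir='./corpus', output_dir='./output')
    extractor.start_standford()   # optional if the server is already running
    extractor.process_corpus()    # writes nouns.csv and noun_phrases.csv to ./output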
from colorama import init, Fore, Back, Style
from nltk import CoreNLPParser

init()

# Read the input file to get the list of users.
input_data = []
user_data = {}
INPUT_FILE = './ios_vs_android.csv'
INPUT_DIR = './crawled_data'
OUTPUT_DIR = './nlp_data'
NLP_URL = 'http://nlp:9000'

# Set up the parser beforehand.
parser = CoreNLPParser(url=NLP_URL, encoding='utf8', tagtype='pos')

# Kinds of lines we would like to ignore.
IGNORE_LIST = [
    u' ',
    u'**%100$ percent dollars**',
    u'Don’t waste your money. I spent $10 for this feature and only works %10 of the time, usually just breaks the page loading at all requiring whitelisting instead.',
    u'She’s using a 4S??? Get her the phone. %100',
    u'Edit: also worth to note, if you choose “don’t trust” when plugging in it will only charge at .5 because it doesn’t report itself as an iPhone, but that should still be way faster than %10 in three hours',
    u'It was horrendously buggy for me. I never had an issue with the actual function of sync seemed to keep my stuff in order between my iPad and iPhone, but man I could immeaditly tell when the sync started because my CPU would ramp up to %100 and my battery would start draining like crazy, and the messages app was so laggy I couldn’t even type, and would be like this for like 15+ minutes.',
    u'[I got the mandelbrot.](https://anvaka.github.io/pplay/?tx=0&ty=0&scale=1&fc=vec4%20get_color%28vec2%10p%29%20%7B%0A%20%20float%20t%20%3D%200.%3B%0A%20%20vec2%20z%20%3D%20p%3B%0A%20%20vec2%20c%20%3D%20vec2%280.60891%2C%200.89098%29%3B%0A%20%20float%20frames%20%3D%20600.%3B%0A%20%20float%20a%20%3D%203.14*%202.%20*%20bease%28mod%28iFrame%2C%20frames%29%2Fframes%29%3B%0A%0A%0A%20%20for%28int%20i%20%3D%200%3B%20i%20%3C%2032%3B%20%2B%2Bi%29%20%7B%0A%20%20%20%20if%20%28length%28z%29%20%3E%202.%29%20break%3B%0A%20%20%20%20z%20%3D%20c_mul%28c_exp%28z%29%20*%20sin%28a%29%2C%20z%29%20%2B%20c%3B%0A%20%20%20%20t%20%3D%20float%28i%29%3B%0A%20%20%7D%0A%0A%20%20return%20vec4%28length%28z%29%20*%20t%20*%20vec3%281.%2F64.%2C%201.%2F32.%2C%201.%2F16.%29%2C%201.0%29%3B%0A%7D)',
]


def user_id_to_input_filename(user_id):
    '''
    Construct a file name from user_id
from nltk import pos_tag
from nltk import CoreNLPParser

text = 'The foods are eaten'.split()

# NLTK's bundled averaged-perceptron tagger.
print(pos_tag(text))

# Server-backed constituency parse; parse() returns an iterator of trees,
# so take the first parse instead of printing the generator object.
parser = CoreNLPParser()
print(next(parser.parse(text)))
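# The same CoreNLP server can also be used as a POS tagger by passing
# tagtype='pos', as done elsewhere in this repo. A minimal sketch, assuming a
# CoreNLP server is already running on the default http://localhost:9000:
corenlp_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
print(corenlp_tagger.tag(text))   # prints a list of (token, tag) pairs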
def read_from_textgrid(self, file_list):
    pos_tagger = CoreNLPParser('http://localhost:9002', tagtype='pos')
    # Translation table that strips punctuation but keeps "[", "\" and "]".
    table = str.maketrans(dict.fromkeys(string.punctuation.replace("[\\]", "")))
    variant_match = dict()
    # Dict with the compiled variant pattern as key; words that match no
    # pattern are tagged later via the POS tagger.
    for r in zip(self.lex_table['word_variant'], self.lex_table['word_standard'],
                 self.lex_table['word_vars'], self.lex_table['POS_tag'],
                 self.lex_table['word_lemma'], self.lex_table['word_stem']):
        v_pattern = compile_pattern(r[0], r[2])
        if v_pattern not in variant_match.keys():
            variant_match[v_pattern] = []
        # else:
        #     print(v_pattern)
        variant_match[v_pattern].append(r)

    gehen_variants = set()
    locations = self.lex_table.loc[self.lex_table['word_lemma'] == 'gehen']
    for gehen_var in zip(locations['word_variant'], locations['word_vars']):
        if "SAF5" not in gehen_var[1]:
            g_pattern = compile_pattern(gehen_var[0], gehen_var[1])
            gehen_variants.add(g_pattern)

    words_h = []
    skip_begin = False
    skip_begin_tag = ""
    skip_end_file = ""
    for each_file_name in file_list:
        original_words = read_txt(self.rootpath, each_file_name)
        context = []
        rel = False
        tag_pattern = re.compile(r"\[[^\[\]]*\]")
        # Collect all the tags. There can be more than one [REL] tag, e.g.
        # S016-17-I-1-Manni_692: ['der', '[REL]', 'halt', 'e', 'groß---',
        # '-eh-', 'dessen', '[REL', 'Garten', 'groß'].
        tags = []
        for i, ow in enumerate(original_words):
            find_tag = re.search(tag_pattern, ow)
            if find_tag:
                tag = find_tag.group(0)
                tags.append(tag)
                # print(tag)
            elif "[" in ow or "]" in ow:
                print("incorrect tag in:", each_file_name)
                print(ow)
                print(original_words)
        if tags:
            for tag in tags:
                if tag == '[REL]':
                    rel = True
                    context.append(original_words[i - 1].translate(table))
                elif tag in self.tags.keys():
                    print(each_file_name)
                    print("Skipping:", tag)
                    skip_begin = True
                    skip_begin_tag = tag
                elif tag in self.tags.values():
                    if tag == self.tags[skip_begin_tag]:
                        print(each_file_name)
                        print("Skipping:", tag)
                        # This will not skip the file which contains the end tag.
                        skip_begin = False
                        skip_end_file = each_file_name  # skip the file that contains the end tag
                    else:
                        print("Wrong end tag:", tag)
        # Maybe the skipping should happen before the Aligner instead: have one
        # step that operates on TextGrid and WAV, then no skipping is needed in
        # the extraction.
        if skip_begin:
            print("Skipping:", original_words)
            continue
        if each_file_name == skip_end_file:
            # print("Skipping:", original_words)
            continue
        # print("filename:", each_file_name)
        interval_num = 0
        file_path = self.rootpath + each_file_name
        try:
            file_textgrid_obj = textgrid.TextGrid.fromFile(file_path)
        except ValueError:
            print(each_file_name + ': value error has occurred')
            os.rename(self.rootpath + each_file_name,
                      working_directory + 'valueError/' + each_file_name)
            continue
        tier_list = file_textgrid_obj.tiers
        for each_tier in tier_list:
            if each_tier.name == 'words':
                tier_words = each_tier
                intervals_words = tier_words.intervals
            elif each_tier.name == 'segments':
                tier_segments = each_tier
                intervals_segments = tier_segments.intervals
        count = 0
        current_minTime = 0
        seg_num = 1
        diphthong_num = 0
        diphthong_dict = {'a͜i': {'ua', 'ai', 'êi', 'ei', 'âi', 'aî', 'ãi'},
                          'a͜u': {'au', 'âu'},
                          'ɔ͜y': {'ôi', 'eu', 'äu', 'oi', 'êu', 'eü', 'oî'}}
        # print(each_file_name)
        try:
            for i, each_word in enumerate(intervals_words):
                add_rel = False
                word_start_time = each_word.minTime
                word_end_time = each_word.maxTime
                word_mark = each_word.mark
                if word_mark not in original_words and "<" not in word_mark:
                    match = [ow.translate(table) for ow in original_words
                             if word_mark == clean_word(ow)]
                    if not match:
                        # Some words just turned to "h" for an unknown reason; investigate.
                        words_h.append((word_mark, original_words, each_file_name))
                        continue
                    else:
                        word_mark = match[0].replace("[ge]", "")
                if rel:
                    if word_mark == context[0] or word_mark == clean_word(context[0]):
                        add_rel = True
                        # Maybe it would be better not to do this here.
                        rel = False
                        if "wo" in word_mark:
                            rel_var = " RELd"
                        elif "als" in word_mark or word_mark.startswith("d") \
                                or word_mark.startswith("wel") or word_mark.startswith("jed"):
                            rel_var = " RELs"
                        elif ("was" in word_mark) or ("wie" in word_mark) or ("wer" in word_mark):
                            rel_var = " RLOs"
                        else:
                            rel_var = " UNK"
                std_list = set()
                ddm_list = set()
                pos_list = set()
                lemma_list = set()
                stem_list = set()
                no_match = True
                for p in variant_match.keys():
                    if p.search(word_mark) is not None:
                        if any("IRV" in d for d in ddm_list):
                            # print(" ".join(ddm_list))
                            break
                        no_match = False
                        replace = True
                        for values in variant_match[p]:
                            if "*" in values[0] and "*" not in values[1]:
                                replace = False
                            w_var = values[0].replace("*", "")  # word variant
                            w_std = values[1].replace("*", "")  # word standard
                            if std_list:
                                tmp_std = set()
                                while std_list:
                                    s = std_list.pop()
                                    if p.search(s) is not None:
                                        std = s.replace(w_var, w_std) if replace else values[1]
                                        tmp_std.add(std)
                                    else:
                                        tmp_std.add(s)
                                std_list.update(tmp_std)
                            else:
                                std = word_mark.replace(w_var, w_std) if replace else values[1]
                                std_list.add(std)
                            lemma = values[4]
                            stem = values[5]
                            lemma_list.add(lemma)
                            stem_list.add(stem)
                            # if "SAF5" in values[2]:
                            #     print(word_mark)
                            if isinstance(values[2], float) and math.isnan(values[2]):
                                # empty var_code, do nothing
                                ddm_list.add(' ')
                            else:
                                ddm_list.add(values[2])
                            pos_list.add(values[3])
                if no_match:
                    word_german = word_mark
                    var_code = " "
                    pos_tag = pos_tagger.tag([word_german])[0][1]
                    word_lemma = word_german
                    word_stem = " "
                else:
                    var_code = " ".join(str(d) for d in ddm_list)
                    if any("SAF5" in d for d in ddm_list):
                        for g_pattern in gehen_variants:
                            if g_pattern.search(word_mark) is not None:
                                var_code = var_code.replace("SAF5d", "")
                                var_code = var_code.replace("SAF5s", "")
                    word_german = " ".join(std_list)
                    if len(std_list) > 1:
                        print(word_mark, "std: ", word_german)
                    word_lemma = " ".join(lemma_list)
                    word_stem = " ".join(stem_list)
                    pos_tag = " ".join(str(p) for p in pos_list)
                if add_rel:
                    var_code = var_code + rel_var
                var_code = var_code.strip()
                try:
                    vowel_orthography = find_two_vowel(word_mark)
                    while (intervals_segments[interval_num].minTime >= word_start_time) & \
                          (intervals_segments[interval_num].maxTime <= word_end_time):
                        segment_start_time = intervals_segments[interval_num].minTime
                        segment_end_time = intervals_segments[interval_num].maxTime
                        segment_mark = intervals_segments[interval_num].mark
                        diphthong_orthography = " "
                        if len(segment_mark) == 3 and "_" not in segment_mark and "ː" not in segment_mark:
                            # print(segment_mark)
                            # print(word_mark)
                            if vowel_orthography[diphthong_num].lower() in diphthong_dict[segment_mark]:
                                diphthong_orthography = vowel_orthography[diphthong_num]
                            elif any(vow_bigram.lower() in diphthong_dict[segment_mark]
                                     for vow_bigram in vowel_orthography):
                                for vow_bigram in vowel_orthography:
                                    if vow_bigram.lower() in diphthong_dict[segment_mark]:
                                        diphthong_orthography = vow_bigram
                            else:
                                print(vowel_orthography)
                                print(vowel_orthography[diphthong_num])
                                print(word_mark)
                                print(segment_mark)
                            diphthong_num += 1
                        if word_start_time > current_minTime:
                            seg_num = 1
                            diphthong_num = 0
                            current_minTime = word_start_time
                        output_flag = False
                        if word_mark not in original_words:
                            match_ow = [ow for ow in original_words
                                        if word_mark == clean_word(ow)]
                            if match_ow:
                                word_original = match_ow[0]
                                if word_filter(word_original)[0]:
                                    output_flag = True
                        else:
                            output_flag = True
                        if var_code.strip():  # quick fix
                            output_flag = True
                        if output_flag:
                            self.output_as_csv(each_file_name[:-9], word_start_time, word_end_time,
                                               word_mark, seg_num, segment_start_time, segment_end_time,
                                               segment_mark, diphthong_orthography, var_code,
                                               word_german, word_lemma, word_stem, pos_tag)
                        else:
                            if "<" not in word_mark:
                                print("not a word: ", each_file_name[:-9], word_start_time,
                                      word_end_time, word_mark, var_code, word_german)
                        seg_num += 1
                        interval_num += 1
                except IndexError:
                    interval_num = 0
                if word_mark != '<P>':
                    count += 1
        except AttributeError as e:
            print(each_file_name + ': tier words is empty or does not exist')
            traceback.print_tb(e.__traceback__)
    with open('words_tran_error.txt', mode='w', newline="\n") as f:
        for item in words_h:
            f.write(str(item) + "\n")
def __init__(self, tokenizer=CoreNLPParser()):
    super(ByteDataPipe, self).__init__(tokenizer)
    self.tokenizer = tokenizer
    # Sentence-boundary heuristics: a lowercase letter followed by a period and
    # an uppercase letter, or a lowercase letter directly followed by an
    # uppercase letter.
    self.partten_1 = re.compile(r'[a-z]\.[A-Z]')
    self.partten_2 = re.compile(r'[a-z][A-Z]')
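# How the two patterns above are applied is not shown in this excerpt; a
# minimal sketch, under the assumption that they are meant to repair
# run-together sentence boundaries, with hypothetical names that are not part
# of ByteDataPipe:
import re

pattern_1 = re.compile(r'([a-z])\.([A-Z])')
pattern_2 = re.compile(r'([a-z])([A-Z])')

def normalize_boundaries(text):
    # "...end.Next..." -> "...end. Next..." and "...endNext..." -> "...end Next..."
    text = pattern_1.sub(r'\1. \2', text)
    text = pattern_2.sub(r'\1 \2', text)
    return text

print(normalize_boundaries("first sentence.Second sentenceThird word"))
# -> "first sentence. Second sentence Third word"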