def trans_list(text, root):
    """Collect translation synonyms for every line of *text*.

    For each line, words are matched greedily against the n-gram trie
    *root*: for every word, successively shorter prefixes are searched
    (ngram.search) and each candidate multi-word phrase is verified
    against the following words.  On a full phrase match, the stored
    value (ngram.findValue) is lower-cased, split on " & " and added to
    the line's synonym set; the consumed words are overwritten with " "
    so they cannot match again.

    Returns a list with one (deduplicated, unordered) list of
    translations per input line.
    """
    trans_set = []
    for line in text:
        tmp = []
        line = line.replace("\n", "")
        word = line.split(" ")
        flag = 0
        for x in range(len(word)):
            flag = 0
            # " " marks a word already consumed by an earlier phrase match
            if word[x] == " ":
                continue
            word_length = len(word[x])
            # try successively shorter prefixes of the current word
            while word_length > 1 and flag == 0:
                result_list = ngram.search(root, word[x][:word_length])
                if result_list != []:
                    for result in result_list:
                        value = result.split(" ")
                        # arr[k] == "1" records that word k of the candidate
                        # phrase matched contiguously; arr[0] is pre-seeded.
                        arr = []
                        count = 0
                        for i in range(len(value)):
                            arr.append("0")
                        arr[0] = "1"
                        for next_word_index in range(len(value)):
                            # word 0 may match via the shortened prefix;
                            # later words must match exactly AND follow an
                            # already-matched predecessor.
                            if next_word_index + x < len(word) and (
                                    word[x + next_word_index] == value[next_word_index]
                                    or next_word_index == 0
                                    and word[x][:word_length] == value[next_word_index]) and (
                                    next_word_index >= 1
                                    and arr[next_word_index - 1] == "1"
                                    or next_word_index == 0 and arr[0] == "1"):
                                arr[next_word_index] = "1"
                                count = count + 1
                        # accept only a complete phrase match (every word of
                        # the candidate matched, including the last one)
                        if count != 0 and (
                                count != 1 and count == len(value)
                                or len(value) == 1 and count == 1) and arr[len(arr) - 1] == "1":
                            val = ngram.findValue(root, result)
                            # "none" is the trie's miss marker — skip it
                            if val == 'none':
                                continue
                            val = val.lower()
                            synoni = val.split(" & ")
                            tmp += synoni
                            # dedupe (order is not preserved)
                            tmp = list(set(tmp))
                            # blank out the consumed words
                            for i in range(len(value)):
                                word[x + i] = " "
                            flag = 1
                            break
                word_length = word_length - 1
        trans_set.append(tmp)
    return trans_set
def trans_list(l, root):
    """Scan each line of *l* for dictionary phrases stored in *root*.

    For every word longer than one character, ngram.search proposes
    candidate keys; a key matches when the next words of the line,
    joined with spaces, start with it.  On a match the key's
    " & "-separated translations are collected for the line and the
    matched span of words is skipped.

    Returns two parallel lists (one entry per input line): the
    translations found, and the matched source keys.
    """
    trans_set = []
    kor_set = []
    word_idx = 0
    line_idx = 0
    while line_idx < len(l):
        line = l[line_idx].replace("\n", "")
        tokens = line.split()
        print(line)
        if not tokens:
            # keep output lists aligned with the input lines
            line_idx += 1
            trans_set.append([])
            kor_set.append([])
            continue
        translations = []
        matched_keys = []
        while True:
            if len(tokens[word_idx]) != 1:
                hits = ngram.search(root, tokens[word_idx])
                if hits != "none":
                    for key in hits:
                        span = len(key.split())
                        if word_idx + span < len(tokens):
                            # phrase built from the next `span` words
                            candidate = " ".join(tokens[word_idx:word_idx + span])
                            if candidate.startswith(key):
                                print("find : ", key)
                                c = ngram.findValue(root, key)
                                matched_keys.append(key)
                                print("ngram result : ", c)
                                # skip the words consumed by this match
                                word_idx = word_idx + span - 1
                                translations.extend(c.split(" & "))
                                break
            word_idx += 1
            if word_idx >= len(tokens):
                trans_set.append(translations)
                kor_set.append(matched_keys)
                line_idx += 1
                word_idx = 0
                break
    return trans_set, kor_set
def extractNNP_KOR(root, i):
    """Extract dictionary translations for Korean article number *i*.

    Opens the Korean header file for article index *i* and, per line,
    greedily matches multi-word phrases against the n-gram trie *root*
    (longest prefix of each word first).  Each fully matched phrase is
    looked up with ngram.findValue and its " & "-separated synonyms are
    collected; consumed words are overwritten with " " so they cannot
    match twice.

    Returns:
        One list of translations per input line, or -1 when the header
        file cannot be opened.
    """
    try:
        # Fix: mode "rU" was deprecated and removed in Python 3.11;
        # plain "r" text mode already performs universal-newline handling.
        read_page = open(header_path.format(lang='kor', idx=i), "r",
                         encoding='UTF8')
    except OSError:
        # Narrowed from a bare `except:` — only I/O failures mean
        # "article missing"; other bugs should surface.
        return -1
    trans_set = []
    with read_page:  # ensure the file is closed on every path
        for line in read_page:
            tmp = []
            line = line.replace("\n", "")
            word = line.split(" ")
            for x in range(len(word)):
                # " " marks a word consumed by an earlier phrase match
                if word[x] == " ":
                    continue
                flag = 0
                word_length = len(word[x])
                # try successively shorter prefixes of the current word
                while word_length > 1 and flag == 0:
                    result_list = ngram.search(root, word[x][:word_length])
                    if result_list != []:
                        for result in result_list:
                            value = result.split(" ")
                            # arr[k] == "1" records that word k of the
                            # candidate phrase matched contiguously.
                            arr = ["0"] * len(value)
                            arr[0] = "1"
                            count = 0
                            for next_word_index in range(len(value)):
                                # word 0 may match via the shortened prefix;
                                # later words must match exactly and follow
                                # an already-matched predecessor.
                                if next_word_index + x < len(word) and (
                                        word[x + next_word_index] == value[next_word_index]
                                        or next_word_index == 0
                                        and word[x][:word_length] == value[next_word_index]) and (
                                        next_word_index >= 1
                                        and arr[next_word_index - 1] == "1"
                                        or next_word_index == 0 and arr[0] == "1"):
                                    arr[next_word_index] = "1"
                                    count = count + 1
                            # count == len(value) <=> every word of the
                            # candidate matched (provably equivalent to the
                            # original compound condition, since each index
                            # can be counted at most once).
                            if count == len(value):
                                val = ngram.findValue(root, result)
                                # Fix: "none" is the trie's miss marker (the
                                # sibling trans_list skips it); previously the
                                # literal string leaked into the results.
                                if val == 'none':
                                    continue
                                synoni = val.split(" & ")
                                tmp += synoni
                                # blank out the consumed words; use `j` so the
                                # function parameter `i` is not shadowed
                                for j in range(len(value)):
                                    word[x + j] = " "
                                flag = 1
                                break
                    word_length = word_length - 1
            trans_set.append(tmp)
    return trans_set
def trans_list(l, root):
    """Scan lines of *l* for dictionary phrases, with a prefix fallback.

    Like the sibling trans_list variants: for every word longer than one
    character, ngram.search proposes candidate keys, and a key matches
    when the following words of the line (joined with spaces) start with
    it.  Additionally, a fallback looks up successively shorter prefixes
    of the current word directly via ngram.findValue, stopping at most 3
    characters below the word length.

    Returns two parallel lists (one entry per input line): the
    translations found, and the matched source keys.

    NOTE(review): this body was reconstructed from whitespace-mangled
    source; confirm the prefix-fallback block is meant to run regardless
    of whether the exact phrase search succeeded.
    """
    trans_set = []
    article_word_index = 0
    article_line_index = 0
    korset = []
    while 1:
        if (article_line_index >= len(l)):
            break
        line = l[article_line_index].replace("\n", "")
        split_one_line = line.split()
        if (len(split_one_line) == 0):
            # keep output lists aligned with the input lines
            korset.append([])
            trans_set.append([])
            article_line_index += 1
            continue
        line_set = []
        korTmp = []
        while 1:
            # single-character words are never looked up
            if (len(split_one_line[article_word_index]) != 1):
                search_result_list = ngram.search(
                    root, split_one_line[article_word_index])
                if (len(search_result_list) != 0):
                    for x in search_result_list:
                        length = len(x.split())
                        to_find_string = ""
                        if (article_word_index + length < len(split_one_line)):
                            # phrase built from the next `length` words
                            # (leading " " stripped below via [1:])
                            for y in range(length):
                                to_find_string = to_find_string + " " + split_one_line[
                                    article_word_index + y]
                            if (to_find_string[1:].find(x) == 0):
                                c = ngram.findValue(root, x)
                                # skip the words consumed by this match
                                article_word_index = article_word_index + length - 1
                                sp = c.split(" & ")
                                korTmp.append(x)
                                for i in sp:
                                    line_set.append(i)
                                break
                # fallback: direct lookup of shrinking prefixes of the
                # current word, down to at most 3 chars shorter
                length = len(split_one_line[article_word_index]) - 1
                if length > 3:
                    wordLimit = length - 3
                else:
                    wordLimit = 1
                while length > wordLimit:
                    candi = ngram.findValue(
                        root, split_one_line[article_word_index][:length])
                    # "none" is the trie's miss marker
                    if candi == "none":
                        length -= 1
                        continue
                    else:
                        sp = candi.split(" & ")
                        for i in sp:
                            line_set.append(i)
                        korTmp.append(
                            split_one_line[article_word_index][:length])
                        break
            article_word_index += 1
            if (article_word_index >= len(split_one_line)):
                korset.append(korTmp)
                trans_set.append(line_set)
                article_line_index += 1
                article_word_index = 0
                break
    return trans_set, korset