import re

from nltk import sent_tokenize
from functions import _read, write_lines

# Nepali sentence boundary: a danda (purnabiraam, U+0964) followed by one
# whitespace character. The danda is spelled as an escape inside a raw string
# so the pattern does not depend on how this source file is encoded, and so
# "\s" reaches the regex engine instead of being an invalid string escape.
NEPALI_SENTENCE_END = re.compile(r"\u0964\s")

# English: split every line with NLTK's sentence tokenizer and hand the
# flattened result to write_lines under the name it was read from.
lines = _read("global.en")
new = [sentence for line in lines for sentence in sent_tokenize(line)]
write_lines(new, "global.en")

# Nepali: split every line at the danda boundary. re.split drops the matched
# delimiter, so only the last piece of a line can keep its closing danda.
lines = _read("global.ne")
new = [piece for line in lines for piece in NEPALI_SENTENCE_END.split(line)]
write_lines(new, "global.ne")
from functions import (
    _read,
    write_lines,
    xml_to_text,
    text_to_docx,
    check_repetition_pair,
)

# Name handed to xml_to_text; the result is written to "mono.txt".
filename = "mono"

# Alternative output target, currently disabled:
# write_lines(xml_to_text(filename), filename + ".ne")
extracted = xml_to_text(filename)
write_lines(extracted, "mono.txt")

# Disabled repetition check:
# lines = check_repetition(_read("a.ne"))
# Stem of the .tmx input; also the stem of the .en / .ne outputs.
tmx_file = "NP8"

tree = etree.parse(tmx_file + ".tmx")
paragraphs = tree.findall(".//seg")

# Text of every <seg> element, with newlines removed and the ends trimmed.
lines = [
    etree.tostring(segment, method="text", encoding="unicode")
    .replace("\n", "")
    .strip()
    for segment in paragraphs
]

# Segments are split by position: 1st, 3rd, 5th, ... go to `en`,
# 2nd, 4th, 6th, ... go to `ne`.
en = lines[0::2]
ne = lines[1::2]

for group in (lines, en, ne):
    summarize(group)

write_lines(en, tmx_file + ".en")
write_lines(ne, tmx_file + ".ne")
import os

from functions import _read, write_lines, xml_to_text, summarize, length_filter

directory = "mono/"

# Gather the output of xml_to_text for every directory entry. The entry's
# extension is stripped before the path is passed on.
lines = []
for entry in os.listdir(directory):
    stem, _extension = os.path.splitext(entry)
    lines.extend(xml_to_text(directory + stem))

# Second argument is the threshold given to length_filter.
lines = length_filter(lines, 25)

print("Total: ")
summarize(lines)
write_lines(lines, "e-h.ne")
# NOTE(review): this fragment begins mid-script -- `i`, `a`, `b` and `re` are
# set up before the visible source -- and its final loop is cut off, so the
# indentation below is a reconstruction to be confirmed against the real file.

# Merges a line into the one before it whenever the line matches the pattern
# below: the previous line becomes "<previous> <current>" (both stripped) and
# the current line is deleted. `i` advances only when nothing was merged, so
# the line that slides into position i is checked as well.
# NOTE(review): if i is 0 on a match, a[i - 1] refers to the last element.
while i < len(a):
    if re.match("^([a-z0-9])+[^0-9i\.\)]", a[i]):
        a[i - 1] = a[i - 1].strip() + ' ' + a[i].strip()
        del (a[i])
    else:
        i += 1

# Joins a numeral line to the next line
# A line shorter than 3 characters that starts with one or two characters from
# [a-z0-9] followed by "." or ")" absorbs the following line, which is deleted.
i = 0
while i < len(a) - 1:
    if len(a[i]) < 3 and re.match("^([a-z0-9]){1,2}[\.\)]\s*", a[i]):
        a[i] = a[i].strip() + ' ' + a[i + 1].strip()
        del (a[i + 1])
    i += 1
write_lines(a, "1_bpf.en")

# For Nepali
# Removes lines with only purnabiraams
# NOTE(review): re.match only tests the start of the line, so any line that
# begins with the pattern is removed, not just lines consisting of it.
# NOTE(review): the literal looks like a mis-decoded danda (U+0964) -- confirm
# the file's encoding.
# NOTE(review): as reconstructed, `i` advances after a deletion too, so the
# line following a removed one is never checked -- confirm this is intended.
i = 0
while i < len(b):
    if re.match("^\ред", b[i]):
        del (b[i])
    i += 1

# Joins a numeral line to the next line
# Same rule as the English pass above, applied to `b`.
# NOTE(review): the rest of this loop lies beyond the visible source.
i = 0
while i < len(b) - 1:
    if len(b[i]) < 3 and re.match("^([a-z0-9]){1,2}[\.\)]\s*", b[i]):
        b[i] = b[i].strip() + ' ' + b[i + 1].strip()
# Disabled language-detection filter:
# sents = set()
# for (en, ne) in pairs:
#     try:
#         if len(ne.split()) > 3 and detect(ne) in ('hi', 'ne'):
#             sents.add((en, ne))
#     except Exception:
#         pass

# Keep the (en, ne) pairs whose length_similarity score is above 0.53.
scores = []
sents = []
for en, ne in zip(eng, nep):
    score = length_similarity(en, ne)
    # Record every score; without this the list stays empty and the count
    # printed below is always 0.
    scores.append(score)
    try:
        keep = score > 0.53
    except TypeError:
        # The score cannot be compared with a float (presumably None --
        # confirm against length_similarity); the pair is dropped.
        continue
    if keep:
        sents.append((en, ne))

# Number of pairs that were scored.
print(len(scores))

# Unzip the surviving pairs into the two parallel output lists.
g1 = [en for en, _ in sents]
g2 = [ne for _, ne in sents]
write_lines(g1, "gnome_final.en")
write_lines(g2, "gnome_final.ne")
# write_lines(g2, "PR_improved.ne")

# REMOVING REPETITIONS IN A SET
A, B = _read("globalvoices_improved.en"), _read("globalvoices_improved.ne")

# Distinct (English, Nepali) pairs.
lines = set(zip(A, B))

# Bookkeeping pass over the distinct pairs: the first pair met for each
# English sentence goes into `final`; later pairs with the same English side
# are recorded in `repetitions` and counted. `_A` holds the English sentences
# met so far and is a set so each membership test is O(1) rather than a scan
# of a growing list.
_A = set()
final = set()
repetitions = []
count = 0
for en, ne in lines:
    if en in _A:
        repetitions.append(en)
        count += 1
    else:
        _A.add(en)
        final.add((en, ne))
# print(len(repetitions))

# The files are written from check_repetition_pair applied to the raw inputs.
# g1 and g2 are bound here only: unzipping `final` into them first would be
# discarded by this assignment, so that step is not performed.
g1, g2 = check_repetition_pair(A, B)
write_lines(g1, "global.en")
write_lines(g2, "global.ne")
from functions import _read, write_lines, remove_blank_lines

# Stem of the file to clean; the same "<stem>.en" name is read and written.
file = "mono"
target = file + ".en"

lines = _read(target)
cleaned = remove_blank_lines(lines)
write_lines(cleaned, target)