# Python 2 idiom: force the interpreter's default string encoding to UTF-8 so
# mixed str/unicode operations on corpus text do not raise UnicodeDecodeError.
# (reload() is required because site.py removes sys.setdefaultencoding at startup.)
reload(sys)
sys.setdefaultencoding('utf-8')

# Command-line interface: four parallel corpus files plus an output path.
parser = argparse.ArgumentParser(description='Preprocess the corpus for generating input for reg_test')
parser.add_argument('SLF', help='Source Language file for training')
parser.add_argument('TLF', help='Target Language file for training')
parser.add_argument('SLFT', help='Source Language file for testing')
parser.add_argument('TLFT', help='Target Language file for testing')
# FIX: a 'default' on a positional argument is ignored unless nargs='?' is
# given — without it argparse always required OUT and 'out.txt' was dead code.
# With nargs='?' existing invocations that pass OUT behave exactly as before.
parser.add_argument('OUT', nargs='?', help='Output file for saving pairs', default='out.txt')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--max-len', help='Maximum length of sentences allowed.', default='25')
args = parser.parse_args()

# Make sure all files exist (assertion() aborts with the message on failure).
assertion(os.path.isfile(args.SLF), "Source Language file for training could not be found.")
assertion(os.path.isfile(args.TLF), "Target Language file for training could not be found.")
assertion(os.path.isfile(args.SLFT), "Source Language file for testing could not be found.")
assertion(os.path.isfile(args.TLFT), "Target Language file for testing could not be found.")
#TODO:Check lines are equal in SLFs and TLFs.

# Command line params (argparse values are strings, so convert here).
min_fms = float(args.min_fms)
max_len = int(args.max_len)

# Training file pointers (kept open for the processing that follows this chunk).
file1 = open(args.SLF)
file2 = open(args.TLF)
reload(sys) sys.setdefaultencoding('utf-8') parser = argparse.ArgumentParser(description='Regression test for repair.py') parser.add_argument('D', help='Corpus directory.') parser.add_argument('-d', help='Specify the lanuguage-pair installation directory') parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8') parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2') parser.add_argument('--max-len', help='Maximum length of sub-string allowed.', default='5') args = parser.parse_args() #Preprocessing path = args.D assertion(os.path.isdir(path), "Directory not found.") #Command line params lp_dir = args.d min_fms = float(args.min_fms) min_len = int(args.min_len) max_len = int(args.max_len) all_files = os.listdir(path) files_map = {} test_sentences = 0 fmses = [] for file1 in all_files: match = re.match(r'[a-z]{2}\.[a-z]{2}\-[a-z]{2}\.(test|train)', file1) if match:
# FIX: typo in user-visible help text ('lanuguage' -> 'language').
parser.add_argument('-d', help='Specify the language-pair installation directory')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.')
args = parser.parse_args()

#Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
t_sentence = preprocess(args.T)
s1_sentence = preprocess(args.S1)
lp = args.LP
lps = lp.split('-')

#Testing Input data
assertion(s_sentence != "", "S should not be blank.\nSee -h for help")
assertion(s1_sentence != "", "S1 should not be blank.\nSee -h for help")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

#Read optional params
lp_dir = args.d
min_fms = float(args.min_fms)
min_len = int(args.min_len)
# --max-len is optional: when absent, allow sub-strings up to the token
# length of the longer of the two sentences.
max_len = int(args.max_len) if args.max_len else max(len(s_sentence.split()), len(s1_sentence.split()))

#Calculate FMS between S and S1.
fms = FMS(s_sentence, s1_sentence).calculate()
#Exit if low FMS.
assertion(fms >= min_fms, "Sentences have low fuzzy match score of %.02f." %fms)
def _do_translations(self, dir=None):
    """Translate every phrase pair in self.phrases and build the lookup maps.

    Populates:
      self.mismatches_map  -- maps an S span (a, b) to the list of S1 spans
                              (c, d) paired with it in self.phrases.
      self.src_trans_map   -- maps an S span (a, b) to its translation.
      self.src_trans_map1  -- maps an S1 span (c, d) to its translation.

    When self.caching is on, translations are first looked up in self.cacher;
    Apertium is skipped only if *every* segment is found.  Otherwise all
    segments are joined with '.|' separators (the S and S1 halves joined by
    '.||.') and translated in a single Apertium call, then split back apart.
    `dir` is forwarded to self.apertium.translate() — presumably the
    language-pair installation directory (TODO confirm).
    """
    S = self.s_sentence.split()
    S1 = self.s1_sentence.split()
    src = ""
    src1 = ""
    self.mismatches_map = {}
    self.src_trans_map = {}
    self.src_trans_map1 = {}
    could_be_done_from_caching = True
    # Group the S1 spans under their paired S span (EAFP dict grouping).
    for a,b,c,d in self.phrases:
        try:
            self.mismatches_map[(a,b)].append((c,d))
        except KeyError:
            self.mismatches_map[(a,b)] = [(c,d)]
    if self.caching:
        # Try to serve every segment from the cache; bail out on the first miss.
        tgt_segments, tgt1_segments = [], []
        for a,b,c,d in self.phrases:
            str1 = ' '.join(S[a: b+1])    # S sub-segment (inclusive span)
            str2 = ' '.join(S1[c: d+1])   # S1 sub-segment (inclusive span)
            tgt1 = self.cacher.retrieve(str1)
            tgt2 = self.cacher.retrieve(str2)
            if not (tgt1 and tgt2):
                could_be_done_from_caching = False
                break
            tgt_segments.append(tgt1[0])
            tgt1_segments.append(tgt2[0])
        if could_be_done_from_caching:
            for (x, t, t1) in zip(self.phrases, tgt_segments, tgt1_segments):
                (a,b,c,d) = x
                self.src_trans_map[(a,b)] = t
                self.src_trans_map1[(c,d)] = t1
    if not self.caching or not could_be_done_from_caching:
        # Batch all segments into one Apertium invocation: segments are
        # separated by '.|' and the S / S1 halves by '.||.'.
        for a,b,c,d in self.phrases:
            str1 = ' '.join(S[a: b+1])
            str2 = ' '.join(S1[c: d+1])
            src += str1 + '.|'
            src1 += str2 + '.|'
        src_combined = src+'.||.'+src1
        #Get translations for segments.
        (out, err) = self.apertium.translate(src_combined, dir)
        # print(out, err)
        assertion(err == '', "Apertium error: "+err)
        (out, out1) = out.split('.||.')
        tgt_segments = out.split('.|')
        tgt1_segments = out1.split('.|')
        # Each half ends with a trailing '.|', so split() leaves an empty
        # final element — hence the [:-1] slices.
        for (x, t, t1) in zip(self.phrases, tgt_segments[:-1], tgt1_segments[:-1]):
            (a,b,c,d) = x
            self.src_trans_map[(a,b)] = t
            self.src_trans_map1[(c,d)] = t1
            if self.caching:
                # Best-effort cache write-back; failures are deliberately ignored.
                str1 = ' '.join(S[a: b+1])
                str2 = ' '.join(S1[c: d+1])
                try:
                    self.cacher.insert(str1, t)
                    self.cacher.insert(str2, t1)
                except Exception:
                    pass
from lib.fms import FMS
from lib.utilities import assertion
from lib.ap import Apertium
from lib.phrase_extractor import PhraseExtractor
from lib.utilities import preprocess, assertion, get_subsegment_locs, patch

parser = argparse.ArgumentParser(description='Calculate the distribution of FMS between pair of sentences.')
parser.add_argument('F', help='Corpus path.')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
args = parser.parse_args()

#Preprocessing
file1 = args.F
assertion(os.path.isfile(file1), "Corpus not found.")

# Command line params (argparse values are strings, so convert here).
min_fms = float(args.min_fms)

fmses = []           # fuzzy-match scores collected later in the script
src_sentences = []   # cleaned, non-empty corpus lines

f1 = open(file1)
while True:
    # FIX: test the *raw* line for EOF before preprocessing.  The original
    # preprocessed first and broke on any falsy result, so a line that became
    # empty after cleaning terminated the loop early and the `continue`
    # branch meant to skip blank lines was unreachable.
    raw = f1.readline()
    if not raw:        # readline() returns '' only at end of file
        break
    line = preprocess(raw)
    if line == '':     # skip lines that are blank after preprocessing
        continue
    src_sentences.append(line)
parser.add_argument('-d', help='Specify the language-pair installation directory')
parser.add_argument('-c', help='Specify the sqlite3 db to be used for caching', default='')
parser.add_argument('-v', help='Verbose Mode', action='store_true')
parser.add_argument('--mode', help="Modes('all', 'cam', 'compare')", default='all')
parser.add_argument('--go', help='To patch only grounded mismatches', action='store_true')
# FIX: typo in user-visible help text ('transalation' -> 'translation').
parser.add_argument('--bo', help='Uses the best possible translation only', action='store_true')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.', default='5')
args = parser.parse_args()

#Preprocessing
lp = args.LP
lps = lp.split('-')
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

#Make sure all files exist
# NOTE(review): args.out is read here but no 'out' argument is declared in
# this chunk — presumably added earlier in the file; verify.
assertion(os.path.isfile(args.out), args.out+" doesn't exist")
#TODO:Check lines are equal in SLFs and TLFs.

# Command line params (argparse values are strings).
cache = args.c
lp_dir = args.d
verbose = args.v
mode = args.mode.lower()
assertion(mode in ['all', 'cam', 'compare'], "Mode couldn't be identified.")
grounded = args.go
best_only = args.bo
# Set-A generation driver: reads two sentences from the command line and
# extracts their phrase-pair set via PhraseExtractor.
from lib.phrase_extractor import PhraseExtractor

parser = argparse.ArgumentParser(description='Generates set A.')
parser.add_argument('S', help='First Sentence')
parser.add_argument('S1', help='Second Sentence')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.')
args = parser.parse_args()

#Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
s1_sentence = preprocess(args.S1)

#Testing Input data
assertion(s_sentence != "", "S should not be blank.\nSee -h for help")
assertion(s1_sentence != "", "S1 should not be blank.\nSee -h for help")

# argparse values are strings; convert numeric options here.
min_fms = float(args.min_fms)
min_len = int(args.min_len)
# --max-len is optional: fall back to the token length of the longer sentence.
max_len = int(args.max_len) if args.max_len else max(len(s_sentence.split()), len(s1_sentence.split()))

# Abort when the sentences are too dissimilar to be useful.
fms = FMS(s_sentence, s1_sentence).calculate()
assertion(fms >= min_fms, "Sentences have low fuzzy match score of %.02f." %fms)

phrase_extractor = PhraseExtractor(s_sentence, s1_sentence, min_len, max_len)
a_set = phrase_extractor.extract_pairs()

# print set A
S = s_sentence.split()
parser.add_argument("--go", help="To patch only grounded mismatches", action="store_true")
# FIX: typo in user-visible help text ('transalation' -> 'translation').
parser.add_argument("--bo", help="Prints the best possible translation only", action="store_true")
parser.add_argument("--min-fms", help="Minimum value of fuzzy match score of S and S1.", default="0.8")
parser.add_argument("--min-len", help="Minimum length of sub-segment allowed.", default="2")
parser.add_argument("--max-len", help="Maximum length of sub-segment allowed.", default="5")
args = parser.parse_args()

# Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
tmxfile = preprocess(args.TM)
lp = args.LP
lps = lp.split("-")

# Testing Input data
assertion(s_sentence != "", "S should not be blank. See -h for help")
assertion(os.path.isfile(tmxfile), "TM does not exist")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

# Read optional params (argparse values are strings; convert numerics).
cache = args.c
lp_dir = args.d
verbose = args.v
show_traces = args.t
cover_all = args.cam
grounded = args.go
best_only = args.bo
min_fms = float(args.min_fms)
min_len = int(args.min_len)
max_len = int(args.max_len)