# TODO: Check that SLF and TLF contain the same number of lines.

# Command line params
min_fms = float(args.min_fms)
max_len = int(args.max_len)

# Training corpora: read the source (SLF) and target (TLF) files in
# lockstep, one preprocessed line per sentence.  Context managers
# guarantee both files are closed even if preprocess() raises
# (the original open()/close() pair leaked on the exception path).
src_sentences, tgt_sentences = [], []
with open(args.SLF) as file1, open(args.TLF) as file2:
    while True:
        line = preprocess(file1.readline())
        line1 = preprocess(file2.readline())
        # Stop at the first empty/exhausted line on either side.
        if not line or not line1:
            break
        # Drop pairs whose *source* side exceeds max_len tokens.
        # NOTE(review): only the source length is bounded — presumably the
        # target side should be checked too; confirm before changing.
        if len(line.split()) > max_len:
            continue
        src_sentences.append(line)
        tgt_sentences.append(line1)

# Testing file pointers (left open on purpose: consumed, and closed,
# further down the script, outside this view).
file3 = open(args.SLFT)
file4 = open(args.TLFT)
# Scan `path` for corpus files named like "xx.aa-bb.test" / "xx.aa-bb.train",
# load each matching file into src_sentences (one preprocessed line per
# sentence), then compute the fuzzy-match score (FMS, via the project's
# Wagner-Fischer edit-distance implementation) for every unordered pair of
# sentences, accumulating the scores in `fmses`.
# The recursion limit is raised because the FMS calculation recurses deeply
# on long sentences.
# NOTE(review): the placement of the trailing `break` is ambiguous in this
# flattened source — it appears to stop after the first matching file, but it
# could instead terminate the inner pair loop; confirm against the original
# layout before restructuring.
# NOTE(review): `if line == '': continue` is unreachable — `if not line:
# break` already catches the empty string returned by preprocess().
# NOTE(review): f1 is never closed and `files_map`/`test_sentences`/`min_len`
# are unused here — presumably consumed later in the script; verify.
min_len = int(args.min_len) max_len = int(args.max_len) all_files = os.listdir(path) files_map = {} test_sentences = 0 fmses = [] for file1 in all_files: match = re.match(r'[a-z]{2}\.[a-z]{2}\-[a-z]{2}\.(test|train)', file1) if match: print(file1) src_sentences = [] f1 = open(path+'/'+file1) while True: line = preprocess(f1.readline()) if not line: break if line == '': continue src_sentences.append(line) sys.setrecursionlimit(10000) for i in range(len(src_sentences)): for j in range(i+1, len(src_sentences)): s, s1 = src_sentences[i], src_sentences[j] fms = FMS(s, s1).calculate_using_wanger_fischer() fmses.append(fms) break
from lib.utilities import preprocess, assertion, is_subsegment

# Command-line interface: S/T are the first sentence and its translation,
# S1 is the second sentence to be fuzzy-matched against S.
parser = argparse.ArgumentParser(description='Generates set D.')
# FIX: help text previously said 'Second Sentence' (copy-paste from S1);
# T is documented as the *First* Sentence Translation, so S is the first.
parser.add_argument('S', help='First Sentence')
parser.add_argument('T', help='First Sentence Translation')
parser.add_argument('S1', help='Second Sentence')
parser.add_argument('LP', help='Language Pair')
# FIX: typo 'lanuguage' -> 'language'.
parser.add_argument('-d', help='Specify the language-pair installation directory')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.')
args = parser.parse_args()

# Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
t_sentence = preprocess(args.T)
s1_sentence = preprocess(args.S1)
lp = args.LP
lps = lp.split('-')

# Testing Input data
assertion(s_sentence != "", "S should not be blank.\nSee -h for help")
assertion(s1_sentence != "", "S1 should not be blank.\nSee -h for help")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

# Read optional params (numeric options arrive as strings; convert here).
lp_dir = args.d
min_fms = float(args.min_fms)
min_len = int(args.min_len)
# Command-line options for the TM patcher (parser object is created
# earlier in the script, outside this chunk).
parser.add_argument("LP", help="Language Pair for TM (for example en-eo)")
parser.add_argument("-v", help="Verbose Mode", action="store_true")
parser.add_argument("-t", help="Show patching traces", action="store_true")
parser.add_argument("-c", help="Specify the sqlite3 db to be used for caching", default="")
parser.add_argument("-d", help="Specify the language-pair installation directory")
parser.add_argument("--cam", help="Only those patches which cover all the mismatches", action="store_true")
parser.add_argument("--go", help="To patch only grounded mismatches", action="store_true")
# FIX: typo 'transalation' -> 'translation' in the help string.
parser.add_argument("--bo", help="Prints the best possible translation only", action="store_true")
parser.add_argument("--min-fms", help="Minimum value of fuzzy match score of S and S1.", default="0.8")
parser.add_argument("--min-len", help="Minimum length of sub-segment allowed.", default="2")
parser.add_argument("--max-len", help="Maximum length of sub-segment allowed.", default="5")
args = parser.parse_args()

# Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
tmxfile = preprocess(args.TM)
lp = args.LP
lps = lp.split("-")

# Testing Input data
assertion(s_sentence != "", "S should not be blank. See -h for help")
assertion(os.path.isfile(tmxfile), "TM does not exist")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

# Read optional params
cache = args.c
lp_dir = args.d
verbose = args.v
show_traces = args.t