Example #1
0
# NOTE(review): Python 2-only hack — reload(sys) re-exposes
# setdefaultencoding() so implicit str/unicode conversions use UTF-8.
# This has no Python 3 equivalent; confirm the project targets Python 2.
reload(sys)
sys.setdefaultencoding('utf-8')

# Command-line interface: four parallel corpus files plus an output path.
parser = argparse.ArgumentParser(description='Preprocess the corpus for generating input for reg_test')
parser.add_argument('SLF', help='Source Language file for training')
parser.add_argument('TLF', help='Target Language file for training')
parser.add_argument('SLFT', help='Source Language file for testing')
parser.add_argument('TLFT', help='Target Language file for testing')
# NOTE(review): 'OUT' is positional, so argparse always requires it and the
# default='out.txt' is never used — confirm whether nargs='?' was intended.
parser.add_argument('OUT', help='Output file for saving pairs', default='out.txt')

parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--max-len', help='Maximum length of sentences allowed.', default='25')
args = parser.parse_args()

#Make sure all files exist
assertion(os.path.isfile(args.SLF), "Source Language file for training could not be found.")
assertion(os.path.isfile(args.TLF), "Target Language file for training could not be found.")
assertion(os.path.isfile(args.SLFT), "Source Language file for testing could not be found.")
assertion(os.path.isfile(args.TLFT), "Target Language file for testing could not be found.")

#TODO:Check lines are equal in SLFs and TLFs.

#Command line params
# argparse option values are strings, so convert to numeric types here.
min_fms = float(args.min_fms)
max_len = int(args.max_len) 

#Training file pointers
# NOTE(review): opened without a context manager or explicit encoding; the
# handles are presumably closed further down this script (out of view).
file1 = open(args.SLF)
file2 = open(args.TLF)

Example #2
0
reload(sys)
sys.setdefaultencoding('utf-8')

parser = argparse.ArgumentParser(description='Regression test for repair.py')
parser.add_argument('D', help='Corpus directory.')

parser.add_argument('-d', help='Specify the lanuguage-pair installation directory')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.', default='5')
args = parser.parse_args()

#Preprocessing
path = args.D
assertion(os.path.isdir(path), "Directory not found.")

#Command line params
lp_dir = args.d
min_fms = float(args.min_fms)
min_len = int(args.min_len)
max_len = int(args.max_len) 

all_files = os.listdir(path)
files_map = {}
test_sentences = 0
fmses = []

for file1 in all_files:
	match = re.match(r'[a-z]{2}\.[a-z]{2}\-[a-z]{2}\.(test|train)', file1)
	if match:
Example #3
0
parser.add_argument('-d', help='Specify the language-pair installation directory')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
# --max-len intentionally has no default here: when omitted it falls back
# to the longer of the two sentences (computed below).
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.')
args = parser.parse_args()

#Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
t_sentence = preprocess(args.T)
s1_sentence = preprocess(args.S1)

# Language pair, e.g. 'en-eo' -> ['en', 'eo'].
lp = args.LP
lps = lp.split('-')

#Testing Input data
assertion(s_sentence != "", "S should not be blank.\nSee -h for help")
assertion(s1_sentence != "", "S1 should not be blank.\nSee -h for help")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

#Read optional params
# argparse option values are strings, so convert to numeric types here.
lp_dir = args.d
min_fms = float(args.min_fms)
min_len = int(args.min_len)
max_len = int(args.max_len) if args.max_len else max(len(s_sentence.split()), len(s1_sentence.split()))

#Calculate FMS between S and S1.
fms = FMS(s_sentence, s1_sentence).calculate()

#Exit if low FMS.
assertion(fms >= min_fms, "Sentences have low fuzzy match score of %.02f." %fms)
Example #4
0
	def _do_translations(self, dir=None):
		"""Translate every mismatched sub-segment pair of S and S1.

		Populates three instance maps:
		  * self.mismatches_map:  (a, b) -> list of matching (c, d) ranges
		  * self.src_trans_map:   (a, b) -> translation of S[a:b+1]
		  * self.src_trans_map1:  (c, d) -> translation of S1[c:d+1]

		When self.caching is set, segments are first looked up in the
		cache; if any segment is missing, all segments are translated in
		a single batched Apertium call and the results are cached.

		NOTE(review): `dir` shadows the builtin but is kept for interface
		compatibility; it is forwarded to Apertium.translate (presumably
		the language-pair installation directory — confirm).
		"""
		S = self.s_sentence.split()
		S1 = self.s1_sentence.split()

		src = ""
		src1 = ""
		self.mismatches_map = {}
		self.src_trans_map = {}
		self.src_trans_map1 = {}
		could_be_done_from_caching = True

		# Group the S1 ranges (c, d) under their matching S range (a, b).
		# setdefault replaces the original try/except KeyError idiom.
		for a, b, c, d in self.phrases:
			self.mismatches_map.setdefault((a, b), []).append((c, d))

		if self.caching:
			# Try to satisfy every segment from the cache first.
			tgt_segments, tgt1_segments = [], []
			for a, b, c, d in self.phrases:
				str1 = ' '.join(S[a: b+1])
				str2 = ' '.join(S1[c: d+1])

				tgt1 = self.cacher.retrieve(str1)
				tgt2 = self.cacher.retrieve(str2)

				# A single cache miss forces a full re-translation below.
				if not (tgt1 and tgt2):
					could_be_done_from_caching = False
					break
				tgt_segments.append(tgt1[0])
				tgt1_segments.append(tgt2[0])

			if could_be_done_from_caching:
				for (x, t, t1) in zip(self.phrases, tgt_segments, tgt1_segments):
					(a, b, c, d) = x
					self.src_trans_map[(a, b)] = t
					self.src_trans_map1[(c, d)] = t1

		if not self.caching or not could_be_done_from_caching:
			# Join all segments with '.|' markers and both sentences with
			# '.||.' so one Apertium invocation translates everything.
			for a, b, c, d in self.phrases:
				str1 = ' '.join(S[a: b+1])
				str2 = ' '.join(S1[c: d+1])

				src += str1 + '.|'
				src1 += str2 + '.|'

			src_combined = src+'.||.'+src1

			#Get translations for segments.
			(out, err) = self.apertium.translate(src_combined, dir)
			assertion(err == '', "Apertium error: "+err)
			(out, out1) = out.split('.||.')

			# Each half ends with a trailing '.|', so the final split item
			# is empty; the [:-1] slices below drop it.
			tgt_segments = out.split('.|')
			tgt1_segments = out1.split('.|')

			for (x, t, t1) in zip(self.phrases, tgt_segments[:-1], tgt1_segments[:-1]):
				(a, b, c, d) = x
				self.src_trans_map[(a, b)] = t
				self.src_trans_map1[(c, d)] = t1
				if self.caching:
					str1 = ' '.join(S[a: b+1])
					str2 = ' '.join(S1[c: d+1])
					# Best-effort cache write: a failed insert must not
					# abort the translation pass.
					try:
						self.cacher.insert(str1, t)
						self.cacher.insert(str2, t1)
					except Exception:
						pass
Example #5
0
from lib.fms import FMS
from lib.utilities import assertion
from lib.ap import Apertium
from lib.phrase_extractor import PhraseExtractor
# NOTE(review): 'assertion' is imported twice (also two lines above);
# harmless, but worth deduplicating when these imports are next touched.
from lib.utilities import preprocess, assertion, get_subsegment_locs, patch


parser = argparse.ArgumentParser(description='Calculate the distribution of FMS between pair of sentences.')
parser.add_argument('F', help='Corpus path.')

parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
args = parser.parse_args()

#Preprocessing
file1 = args.F
assertion(os.path.isfile(file1), "Corpus not found.")

#Command line params
# argparse option values are strings, so convert to float here.
min_fms = float(args.min_fms)

fmses = []
src_sentences = []
# NOTE(review): handle opened without a context manager; presumably
# closed further down this script, out of view.
f1 = open(file1)

# Read the corpus one line at a time until EOF.
while True:
	line = preprocess(f1.readline())
	# readline() returns '' at EOF; assumes preprocess('') stays falsy.
	if not line:
		break
	# NOTE(review): looks unreachable — '' is falsy, so the break above
	# already catches it. Kept as-is; confirm preprocess() cannot return
	# some other empty sentinel.
	if line == '':
		continue
	src_sentences.append(line)
Example #6
0
parser.add_argument('-d', help='Specify the language-pair installation directory')
parser.add_argument('-c', help='Specify the sqlite3 db to be used for caching', default='')
parser.add_argument('-v', help='Verbose Mode', action='store_true')
parser.add_argument('--mode', help="Modes('all', 'cam', 'compare')", default='all')
parser.add_argument('--go', help='To patch only grounded mismatches', action='store_true')
parser.add_argument('--bo', help='Uses the best possible translation only', action='store_true')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.', default='5')
args = parser.parse_args()

#Preprocessing
# Language pair, e.g. 'en-eo' -> ['en', 'eo'].
lp = args.LP
lps = lp.split('-')
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

#Make sure all files exist
assertion(os.path.isfile(args.out), args.out+" doesn't exist")

#TODO:Check lines are equal in SLFs and TLFs.

#Command line params
cache = args.c
lp_dir = args.d
verbose = args.v
# Normalise the mode so the membership check below is case-insensitive.
mode = args.mode.lower()

assertion(mode in ['all', 'cam', 'compare'], "Mode couldn't be identified.")
grounded = args.go
best_only = args.bo
Example #7
0
from lib.phrase_extractor import PhraseExtractor

parser = argparse.ArgumentParser(description='Generates set A.')
parser.add_argument('S', help='First Sentence')
parser.add_argument('S1', help='Second Sentence')
parser.add_argument('--min-fms', help='Minimum value of fuzzy match score of S and S1.', default='0.8')
parser.add_argument('--min-len', help='Minimum length of sub-string allowed.', default='2')
# --max-len intentionally has no default: when omitted it falls back to
# the longer of the two sentences (computed below).
parser.add_argument('--max-len', help='Maximum length of sub-string allowed.')
args = parser.parse_args()

#Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
s1_sentence = preprocess(args.S1)

#Testing Input data
assertion(s_sentence != "", "S should not be blank.\nSee -h for help")
assertion(s1_sentence != "", "S1 should not be blank.\nSee -h for help")

# argparse option values are strings, so convert to numeric types here.
min_fms = float(args.min_fms)
min_len = int(args.min_len)
max_len = int(args.max_len) if args.max_len else max(len(s_sentence.split()), len(s1_sentence.split()))

# Bail out early when the two sentences are not similar enough.
fms = FMS(s_sentence, s1_sentence).calculate()

assertion(fms >= min_fms, "Sentences have low fuzzy match score of %.02f." %fms)

# Extract the set of matching sub-segment pairs (set A).
phrase_extractor = PhraseExtractor(s_sentence, s1_sentence, min_len, max_len)
a_set = phrase_extractor.extract_pairs()

# print set A
S = s_sentence.split()
Example #8
0
parser.add_argument("--go", help="To patch only grounded mismatches", action="store_true")
parser.add_argument("--bo", help="Prints the best possible translation only", action="store_true")
parser.add_argument("--min-fms", help="Minimum value of fuzzy match score of S and S1.", default="0.8")
parser.add_argument("--min-len", help="Minimum length of sub-segment allowed.", default="2")
parser.add_argument("--max-len", help="Maximum length of sub-segment allowed.", default="5")
args = parser.parse_args()

# Applying some preprocessing on input data.
s_sentence = preprocess(args.S)
tmxfile = preprocess(args.TM)
# Language pair, e.g. 'en-eo' -> ['en', 'eo'].
lp = args.LP
lps = lp.split("-")


# Testing Input data
assertion(s_sentence != "", "S should not be blank. See -h for help")
assertion(os.path.isfile(tmxfile), "TM does not exist")
assertion(len(lps) == 2, "LP should be of type a-b, eg, 'en-eo'")

# Read optional params
cache = args.c
lp_dir = args.d
verbose = args.v
show_traces = args.t
cover_all = args.cam
grounded = args.go
best_only = args.bo
# argparse option values are strings, so convert to numeric types here.
min_fms = float(args.min_fms)
min_len = int(args.min_len)
max_len = int(args.max_len)