def generate_candidates(phrase, args, oov_lookup):
    """Collect the PTEntry translation candidates for one source phrase.

    Sources are tried in order, moving on only while nothing has been found:

    1. a phrase-table lookup of the space-joined phrase;
    2. for phrases of more than one token, ``generate_split_candidates``;
    3. ``babelnet_candidates`` for the space-joined phrase;
    4. a single fallback entry whose target is ``oov_lookup`` keyed by the
       tokens concatenated without spaces, or the literal ``"OOV"`` when that
       key is absent. The fallback entry has pdirect and pinverse of 1.

    Args:
        phrase: tuple of tokens (enforced with an assert).
        args: command-line args object; ``args.source`` and ``args.target``
            are forwarded to the split and BabelNet candidate generators.
        oov_lookup: container supporting ``in`` and ``[]``, keyed by the
            phrase's tokens joined with no separator.

    Returns:
        The list of PTEntry candidates; after the fallback step it always
        holds at least one entry.
    """
    assert isinstance(phrase, tuple)
    joined = " ".join(phrase)

    candidates = phrasetable.lookup(joined)

    # Only multi-token phrases can be broken into smaller pieces.
    if not candidates and len(phrase) > 1:
        candidates = generate_split_candidates(phrase, args.source, args.target)

    ## XXX: this seems completely correct.
    if not candidates:
        # NOTE(review): this extends, in place, whatever list the previous
        # step returned -- confirm phrasetable.lookup hands back a list that
        # is safe to mutate.
        from_babelnet = babelnet_candidates(joined, args.source, args.target)
        candidates.extend(from_babelnet)

    if candidates:
        return candidates

    # Nothing found anywhere: emit exactly one out-of-vocabulary entry.
    squashed = "".join(phrase)
    fallback_target = oov_lookup[squashed] if squashed in oov_lookup else "OOV"
    candidates.append(
        PTEntry(source=joined, target=fallback_target, pdirect=1, pinverse=1)
    )
    return candidates
def generate_split_candidates(phrase, sl, tl, max_candidates=10000):
    """Build PTEntry candidates for a phrase by translating it piece by piece.

    Every split of ``phrase`` returned by ``allsplits`` is tried, in the
    reverse of the order ``allsplits`` returns them. Each segment of a split
    is looked up in the phrase table; a segment with no phrase-table entries
    and no space in it is then passed to ``babelnet_candidates``. A split
    contributes only when every one of its segments has at least one
    candidate. It then yields one PTEntry per combination of segment
    candidates: the target is the segment targets joined with single spaces,
    and pdirect / pinverse are the products of the segment values.

    Args:
        phrase: iterable of token strings making up the source phrase.
        sl: forwarded unchanged to ``babelnet_candidates`` (the caller in this
            file passes ``args.source``).
        tl: forwarded unchanged to ``babelnet_candidates`` (the caller in this
            file passes ``args.target``).
        max_candidates: return as soon as this many entries have been
            collected. The check runs after each append, so values below 1
            behave like 1. ``None`` disables the cap. Defaults to 10000, the
            previously hard-coded limit.

    Returns:
        A list of PTEntry objects whose source is the space-joined phrase;
        empty when no split could be fully translated.
    """
    ptentries = []
    splits = list(reversed(allsplits(list(phrase))))
    dprint(splits)

    # The source side is identical for every candidate, so build it once.
    source = " ".join(phrase)

    for split in splits:
        segments = [" ".join(tokens) for tokens in split]

        per_segment = []
        for segment in segments:
            options = phrasetable.lookup(segment)
            if not options and " " not in segment:
                # Segments without a space get a second chance via BabelNet.
                options = babelnet_candidates(segment, sl, tl)
            if not options:
                # One untranslatable segment rules out the whole split, so
                # skip the remaining lookups for it.
                break
            per_segment.append(options)
        else:
            # Reached only when no segment failed: combine the candidates.
            for assignment in itertools.product(*per_segment):
                target = " ".join(pte.target for pte in assignment)
                pdirect = functools.reduce(
                    operator.mul, (pte.pdirect for pte in assignment), 1
                )
                pinverse = functools.reduce(
                    operator.mul, (pte.pinverse for pte in assignment), 1
                )
                ptentries.append(
                    PTEntry(
                        source=source,
                        target=target,
                        pdirect=pdirect,
                        pinverse=pinverse,
                    )
                )
                # The number of combinations grows multiplicatively with the
                # number of segments, so bail out once the cap is reached.
                # ">=" keeps the cap effective whatever value is passed in.
                if max_candidates is not None and len(ptentries) >= max_candidates:
                    return ptentries
    return ptentries