Esempio n. 1
0
def generate_candidates(phrase, args, oov_lookup):
    """Return the candidate PTEntry objects for a source phrase.

    Sources are tried in order; the first one that yields anything wins:

    1. a phrase-table lookup of the space-joined phrase;
    2. for phrases of more than one token, candidates assembled from
       splits of the phrase (``generate_split_candidates``);
    3. ``babelnet_candidates`` for the whole phrase;
    4. a single fallback entry with ``pdirect=1`` and ``pinverse=1`` whose
       target is ``oov_lookup[nospaces]`` when the phrase with its tokens
       concatenated (no spaces) is a key of ``oov_lookup``, and the
       literal string ``"OOV"`` otherwise.

    Args:
        phrase: tuple of source tokens (strings).
        args: parsed command-line arguments; only ``args.source`` and
            ``args.target`` are read, and they are forwarded to the split
            and BabelNet lookups.
        oov_lookup: container supporting ``in`` and ``[]``, keyed by the
            space-free source string.

    Returns:
        The phrase-table result when it is non-empty; otherwise a list
        holding at least one PTEntry.
    """
    assert isinstance(phrase, tuple)

    phrase_s = " ".join(phrase)
    ptentries = phrasetable.lookup(phrase_s)
    if ptentries:
        return ptentries

    # The lookup came back empty (or None).  Build the fallbacks in a list
    # owned by this call, so that extending it below can neither fail on a
    # non-list result nor modify an object that belongs to the phrase table.
    ptentries = []
    if len(phrase) > 1:
        ptentries = generate_split_candidates(phrase,
                                              args.source,
                                              args.target)

    # Fall back to BabelNet only when neither the phrase table nor the
    # split search produced anything.
    if not ptentries:
        frombabelnet = babelnet_candidates(phrase_s, args.source, args.target)
        if frombabelnet:
            ptentries.extend(frombabelnet)

    # Last resort: always hand back exactly one entry so callers never see
    # an empty candidate list.
    if not ptentries:
        nospaces = "".join(phrase)
        target = oov_lookup[nospaces] if nospaces in oov_lookup else "OOV"
        ptentries.append(
            PTEntry(source=phrase_s, target=target, pdirect=1, pinverse=1))

    return ptentries
Esempio n. 2
0
def generate_split_candidates(phrase, sl, tl, max_candidates=10000):
    """Build PTEntry candidates for ``phrase`` by translating its pieces.

    Every split returned by ``allsplits(list(phrase))`` is visited, in
    reverse of the order ``allsplits`` yields them.  Each piece of a split
    is joined with spaces and looked up in the phrase table; a piece that
    is a single token (contains no space) and is missing from the phrase
    table is additionally looked up with ``babelnet_candidates``.  A split
    contributes candidates only if every one of its pieces has at least
    one candidate.

    For a usable split, one PTEntry is produced per combination of piece
    candidates: its target is the piece targets joined with spaces, and
    its ``pdirect`` / ``pinverse`` are the products of the pieces'
    ``pdirect`` / ``pinverse`` values.  The source of every produced entry
    is the full space-joined phrase.

    Args:
        phrase: sequence of source tokens (strings).
        sl: forwarded to ``babelnet_candidates`` as its second argument.
        tl: forwarded to ``babelnet_candidates`` as its third argument.
        max_candidates: stop and return as soon as this many entries have
            been collected.  Expected to be a positive integer; ``None``
            disables the cap.  Defaults to 10000, the value that was
            previously hard-coded.

    Returns:
        A new list of PTEntry objects; empty if no split was usable.
    """
    ptentries = []
    source_s = " ".join(phrase)  # identical for every produced entry

    splits = list(reversed(allsplits(list(phrase))))
    dprint(splits)

    for split in splits:
        # One candidate list per piece; [] marks a piece with no candidates.
        per_piece = []
        for piece in split:
            piece_s = " ".join(piece)
            candidates = phrasetable.lookup(piece_s)
            # BabelNet is only consulted for single-token pieces.
            if not candidates and " " not in piece_s:
                candidates = babelnet_candidates(piece_s, sl, tl)
            per_piece.append(candidates or [])

        # Skip the split unless every piece can be translated.
        if not all(per_piece):
            continue

        for assignment in itertools.product(*per_piece):
            target = " ".join(pte.target for pte in assignment)
            product_pdirect = functools.reduce(
                operator.mul, (pte.pdirect for pte in assignment), 1)
            product_pinverse = functools.reduce(
                operator.mul, (pte.pinverse for pte in assignment), 1)

            ptentries.append(PTEntry(source=source_s,
                                     target=target,
                                     pdirect=product_pdirect,
                                     pinverse=product_pinverse))

            # The cartesian product can blow up quickly; cap the output.
            if max_candidates is not None and len(ptentries) >= max_candidates:
                return ptentries
    return ptentries