コード例 #1
0
ファイル: mcca.py プロジェクト: vaswani/LEXICON_INDUCTION
def readInput(options, filename_wordsX, filename_wordsY, filename_seed):
    # load data files
    if options.pickled:
        wordsX = IO.readPickledWords(filename_wordsX)
        wordsY = IO.readPickledWords(filename_wordsY)
    else:
        wordsX = IO.readWords(filename_wordsX)
        wordsY = IO.readWords(filename_wordsY)

    if options.filename_graphX is not None:
        print "loading graph -", options.filename_graphX
        wordsX.G = IO.unpickle(options.filename_graphX)
        print "loading graph -", options.filename_graphY
        wordsY.G = IO.unpickle(options.filename_graphY)

    seed_list = Struct()
    seed_list.X, seed_list.Y = IO.readSeed(filename_seed)  # read the seed list (X,Y)
    wordsX.pushSeedToEnd(seed_list.X)
    wordsY.pushSeedToEnd(seed_list.Y)

    # assert sizes are correct
    Nx = len(wordsX.words)
    Ny = len(wordsY.words)
    if Nx != Ny:
        log(0, "Number of words must be the same", Nx, Ny)
    else:
        log(0, Nx, "words loaded.")

    NSx = len(seed_list.X)
    NSy = len(seed_list.Y)

    if NSx != NSy:
        log(0, "Number of seed words must be the same", NSx, NSy)
    else:
        log(0, NSx, "seed words loaded.")
    assert NSx == NSy

    if options.filename_graphX is not None:
        (NGx0, NGx1) = wordsX.G.shape()
        (NGy0, NGy1) = wordsY.G.shape()
        assert NGx0 == NGx1, "GX is not a square adjacency matrix"
        assert NGy0 == NGy1, "GY is not a square adjacency matrix"

    # permute Y if rand_seed > 1, (this should only be used when testing on mock data)
    # wordsY.permuteFirstWords(perm.randperm(perm.ID(Ny)))
    # MU.printMatching(wordsX.words, wordsY.words, perm.ID(Ny))
    return wordsX, wordsY, seed_list
コード例 #2
0
ファイル: med.py プロジェクト: vaswani/LEXICON_INDUCTION
    else:
        print 'file', filename, 'not found'
        D = strings.pweditdist(X, Y)
        IO.writeNumpyArray(filename, D)
    (cost, pi, edge_cost) = MU.ApproxMatch(D)
    # TODO:
    # 3. set up an initial matching based on edit distance.
    return cost, pi, edge_cost

# Script entry point: reads two word files named on the command line, matches
# their word lists with med(), prints the matching and writes it out through
# IO.writeMatching.
if __name__ == '__main__':
    # load data
    # argv[1] / argv[2] are the paths of the X and Y word files.
    fileX = (sys.argv[1])
    fileY = (sys.argv[2])
    # Previously hard-coded input paths (disabled):
    #fileX = '../SCRIPTS/matlab/Jun10_en.txt'
    #fileY = '../SCRIPTS/matlab/Jun10_es.txt'
    X = IO.readWords(fileX)
    Y = IO.readWords(fileY)

    # Replace each feature set with the output of normalize_rows.
    # NOTE(review): med() below receives only the .words lists; the features
    # travel on with X / Y into the MU calls - confirm they are used there.
    X.features = normalize_rows(X.features)
    Y.features = normalize_rows(Y.features)

    # med() yields a total cost, the permutation pi and the per-edge costs.
    (cost, pi, edge_cost) = med(X.words, Y.words)
    # NOTE(review): `matching` is not read again anywhere in this block.
    matching = MU.getMatching(X.words, Y.words, pi, edge_cost)
    # Y is rebound to whatever MU.permuteFirstWords returns; Y.words is read
    # from that result when writing the matching below.
    Y = MU.permuteFirstWords(Y, pi)
    MU.printMatching(X, Y, edge_cost)

    # Fresh Options object with exp_id set to -1.
    # NOTE(review): presumably -1 marks a run outside a numbered experiment -
    # confirm against IO.writeMatching.
    options = Options()
    options.exp_id = -1

    IO.writeMatching(options, X.words, Y.words, pi, edge_cost)
コード例 #3
0
import IO
import BilexiconUtil
from common import *


if __name__ == '__main__':
    filename_wordsX = sys.argv[1]
    filename_wordsY = sys.argv[2]
    filename_lexicon = sys.argv[3]
    Nseed = int(sys.argv[4])

    if 'pickle' in filename_wordsX:
        wordsX = IO.readPickledWords(filename_wordsX)
        wordsY = IO.readPickledWords(filename_wordsY)
    else:
        wordsX = IO.readWords(filename_wordsX)
        wordsY = IO.readWords(filename_wordsY)

    if filename_lexicon == 'None':  # we don't have a lexicon. assume identity.
        log(100, 'Using identity lexicon')
        lex = None
        gold_lex = dict()  #
        for w in wordsX.words:
            gold_lex[w] = [w]
        log(100,  gold_lex)
    else:
        lex = BilexiconUtil.readLexicon(filename_lexicon)
        (gold_lex, times) = BilexiconUtil.filterLexicon(lex, wordsX.words, wordsY.words)
        log(100, 'Done filtering gold lexicon')

    seed = []