def readInput(options, filename_wordsX, filename_wordsY, filename_seed):
    """Load the X and Y word sets, optional graphs, and the seed list.

    Args:
        options: settings object. This function reads ``options.pickled``,
            ``options.filename_graphX`` and ``options.filename_graphY``.
        filename_wordsX: file passed to ``IO.readPickledWords`` (when
            ``options.pickled`` is truthy) or ``IO.readWords`` for the X side.
        filename_wordsY: same as above, for the Y side.
        filename_seed: file passed to ``IO.readSeed``.

    Returns:
        Tuple ``(wordsX, wordsY, seed_list)``. ``seed_list`` is a ``Struct``
        whose ``X`` and ``Y`` attributes hold the two values returned by
        ``IO.readSeed``.

    Raises:
        AssertionError: if the two seed lists differ in length, or if graphs
            were loaded and either ``G.shape()`` pair is not square.
    """
    # load data files
    if options.pickled:
        read_words = IO.readPickledWords
    else:
        read_words = IO.readWords
    wordsX = read_words(filename_wordsX)
    wordsY = read_words(filename_wordsY)

    # Graph loading is keyed on filename_graphX only; filename_graphY is used
    # unconditionally once X is set.
    graphs_loaded = options.filename_graphX is not None
    if graphs_loaded:
        # Single-argument print(...) emits the same text on Python 2 and 3.
        print("loading graph - %s" % (options.filename_graphX,))
        wordsX.G = IO.unpickle(options.filename_graphX)
        print("loading graph - %s" % (options.filename_graphY,))
        wordsY.G = IO.unpickle(options.filename_graphY)

    # read the seed list (X,Y)
    seed_list = Struct()
    seed_list.X, seed_list.Y = IO.readSeed(filename_seed)
    wordsX.pushSeedToEnd(seed_list.X)
    wordsY.pushSeedToEnd(seed_list.Y)

    # check sizes
    Nx = len(wordsX.words)
    Ny = len(wordsY.words)
    if Nx == Ny:
        log(0, Nx, "words loaded.")
    else:
        # NOTE(review): a word-count mismatch is only logged, whereas a
        # seed-count mismatch below is asserted - confirm this is intended.
        log(0, "Number of words must be the same", Nx, Ny)

    NSx = len(seed_list.X)
    NSy = len(seed_list.Y)
    if NSx == NSy:
        log(0, NSx, "seed words loaded.")
    else:
        log(0, "Number of seed words must be the same", NSx, NSy)
    assert NSx == NSy

    if graphs_loaded:
        # NOTE(review): shape is *called* here, so G must expose a shape()
        # method (a plain numpy array exposes shape as an attribute) - confirm
        # against the type that IO.unpickle returns.
        (NGx0, NGx1) = wordsX.G.shape()
        (NGy0, NGy1) = wordsY.G.shape()
        assert NGx0 == NGx1, "GX is not a square adjacency matrix"
        assert NGy0 == NGy1, "GY is not a square adjacency matrix"

    # permute Y if rand_seed > 1, (this should only be used when testing on mock data)
    # (disabled)
    # wordsY.permuteFirstWords(perm.randperm(perm.ID(Ny)))
    # MU.printMatching(wordsX.words, wordsY.words, perm.ID(Ny))
    return wordsX, wordsY, seed_list
else: print 'file', filename, 'not found' D = strings.pweditdist(X, Y) IO.writeNumpyArray(filename, D) (cost, pi, edge_cost) = MU.ApproxMatch(D) # TODO: # 3. set up an initial matching based on edit distance. return cost, pi, edge_cost
# NOTE(review): the line above is the tail of a definition whose beginning is
# outside this view; it is left exactly as found rather than re-indented by guess.


if __name__ == '__main__':
    # Script entry point. Takes two command-line arguments: the X word file
    # and the Y word file. Reads both, matches their word lists via med(),
    # prints the matching and writes it out through IO.writeMatching.
    # load data
    fileX = (sys.argv[1])
    fileY = (sys.argv[2])
    #fileX = '../SCRIPTS/matlab/Jun10_en.txt'
    #fileY = '../SCRIPTS/matlab/Jun10_es.txt'
    X = IO.readWords(fileX)
    Y = IO.readWords(fileY)
    # presumably scales each feature row; normalize_rows is defined elsewhere - TODO confirm
    X.features = normalize_rows(X.features)
    Y.features = normalize_rows(Y.features)
    # med is not visible in full from here; it is called on the raw word lists
    # and its result is unpacked into (cost, pi, edge_cost).
    (cost, pi, edge_cost) = med(X.words, Y.words)
    # NOTE(review): `matching` is never read again in this block.
    matching = MU.getMatching(X.words, Y.words, pi, edge_cost)
    # Y is rebound to the result of permuting it by pi, so Y.words below
    # refers to the permuted object.
    Y = MU.permuteFirstWords(Y, pi)
    MU.printMatching(X, Y, edge_cost)
    options = Options()
    # NOTE(review): meaning of exp_id = -1 is not visible here - presumably a
    # "no experiment" marker consumed by IO.writeMatching; confirm.
    options.exp_id = -1
    IO.writeMatching(options, X.words, Y.words, pi, edge_cost)
import IO import BilexiconUtil from common import * if __name__ == '__main__': filename_wordsX = sys.argv[1] filename_wordsY = sys.argv[2] filename_lexicon = sys.argv[3] Nseed = int(sys.argv[4]) if 'pickle' in filename_wordsX: wordsX = IO.readPickledWords(filename_wordsX) wordsY = IO.readPickledWords(filename_wordsY) else: wordsX = IO.readWords(filename_wordsX) wordsY = IO.readWords(filename_wordsY) if filename_lexicon == 'None': # we don't have a lexicon. assume identity. log(100, 'Using identity lexicon') lex = None gold_lex = dict() # for w in wordsX.words: gold_lex[w] = [w] log(100, gold_lex) else: lex = BilexiconUtil.readLexicon(filename_lexicon) (gold_lex, times) = BilexiconUtil.filterLexicon(lex, wordsX.words, wordsY.words) log(100, 'Done filtering gold lexicon') seed = []