def readInput(options, filename_wordsX, filename_wordsY, filename_seed):
    """Load both word collections, their optional graphs, and the seed list.

    Args:
        options: parsed options object. Reads ``options.pickled`` (selects the
            pickled vs. plain word reader) and ``options.filename_graphX`` /
            ``options.filename_graphY`` (graph pickles; graphs are loaded
            only when ``filename_graphX`` is not None).
        filename_wordsX: path passed to the word reader for the X side.
        filename_wordsY: path passed to the word reader for the Y side.
        filename_seed: path passed to ``IO.readSeed``.

    Returns:
        Tuple ``(wordsX, wordsY, seed_list)`` where ``seed_list`` is a
        ``Struct`` whose ``X`` and ``Y`` attributes hold the two halves of
        the seed returned by ``IO.readSeed``.

    Raises:
        AssertionError: if the two seed lists differ in length, or if a
            loaded graph does not report equal row and column counts.
    """
    # load data files
    if options.pickled:
        wordsX = IO.readPickledWords(filename_wordsX)
        wordsY = IO.readPickledWords(filename_wordsY)
    else:
        wordsX = IO.readWords(filename_wordsX)
        wordsY = IO.readWords(filename_wordsY)

    has_graphs = options.filename_graphX is not None
    if has_graphs:
        # Single-argument parenthesised print emits the same text under
        # both Python 2 and Python 3.
        print("loading graph - %s" % (options.filename_graphX,))
        wordsX.G = IO.unpickle(options.filename_graphX)
        print("loading graph - %s" % (options.filename_graphY,))
        wordsY.G = IO.unpickle(options.filename_graphY)

    # read the seed list (X,Y)
    seed_list = Struct()
    seed_list.X, seed_list.Y = IO.readSeed(filename_seed)
    wordsX.pushSeedToEnd(seed_list.X)
    wordsY.pushSeedToEnd(seed_list.Y)

    # A word-count mismatch is only logged; execution continues.
    Nx = len(wordsX.words)
    Ny = len(wordsY.words)
    if Nx != Ny:
        log(0, "Number of words must be the same", Nx, Ny)
    else:
        log(0, Nx, "words loaded.")

    # A seed-count mismatch is logged and then fails the assertion below.
    NSx = len(seed_list.X)
    NSy = len(seed_list.Y)
    if NSx != NSy:
        log(0, "Number of seed words must be the same", NSx, NSy)
    else:
        log(0, NSx, "seed words loaded.")
    assert NSx == NSy

    if has_graphs:
        # NOTE(review): shape is *called* here; numpy/scipy matrices expose
        # shape as an attribute, so confirm the unpickled graph type really
        # provides a shape() method.
        (NGx0, NGx1) = wordsX.G.shape()
        (NGy0, NGy1) = wordsY.G.shape()
        assert NGx0 == NGx1, "GX is not a square adjacency matrix"
        assert NGy0 == NGy1, "GY is not a square adjacency matrix"

    # permute Y if rand_seed > 1, (this should only be used when testing on mock data)
    # wordsY.permuteFirstWords(perm.randperm(perm.ID(Ny)))
    # MU.printMatching(wordsX.words, wordsY.words, perm.ID(Ny))
    return wordsX, wordsY, seed_list
def parseOptions():
    """Parse the command-line options used by the graph-building script.

    NOTE(review): the ``def`` line was absent from the collapsed source; the
    name and the empty parameter list are taken from the ``parseOptions()``
    call in the ``__main__`` block below -- confirm against the original file.

    Options (all parsed as integers):
        --sym        stored as ``options.sym`` (default 1)
        --stoc       stored as ``options.stochastic`` (default 1)
        --KNN        stored as ``options.KNN`` (default 10)
        --normalize  stored as ``options.normalize`` (default 1)

    Returns:
        The ``optparse`` values object. Positional arguments are parsed but
        discarded.
    """
    parser = OptionParser()
    # general setting
    parser.add_option('--sym', dest='sym', type="int", action='store', default=1)
    parser.add_option('--stoc', dest='stochastic', type="int", action='store', default=1)
    parser.add_option('--KNN', dest='KNN', type="int", action='store', default=10)
    parser.add_option('--normalize', dest='normalize', type="int", action='store', default=1)
    options, _positional = parser.parse_args()
    return options


if __name__ == '__main__':
    # The word file is taken from sys.argv[1] directly, so it must be the
    # first command-line argument, ahead of any --option flags.
    filename_wordsX = sys.argv[1]

    # read input
    wordsX = IO.readPickledWords(filename_wordsX)
    options = parseOptions()

    # make graph
    G = makeGraph(wordsX, options)
    G = G.todense()

    # --normalize 1 selects the 'l1' norm, 2 selects 'l2'; any other value
    # leaves G un-normalized.
    norm_name = {1: 'l1', 2: 'l2'}.get(options.normalize)
    if norm_name is not None:
        G = toSymmetricStochastic(G,
                                  sym=(options.sym == 1),
                                  stochastic=(options.stochastic == 1),
                                  norm=norm_name)

    msk = MSK(None, wordsX.words, wordsX.words)
    # save the matrix.
    # This is hacky, since we're trusting that G is generated with rows/columns that match the order of wordsX.words
    msk.M = G