コード例 #1
0
ファイル: mcca.py プロジェクト: vaswani/LEXICON_INDUCTION
def readInput(options, filename_wordsX, filename_wordsY, filename_seed):
    # load data files
    if options.pickled:
        wordsX = IO.readPickledWords(filename_wordsX)
        wordsY = IO.readPickledWords(filename_wordsY)
    else:
        wordsX = IO.readWords(filename_wordsX)
        wordsY = IO.readWords(filename_wordsY)

    if options.filename_graphX is not None:
        print "loading graph -", options.filename_graphX
        wordsX.G = IO.unpickle(options.filename_graphX)
        print "loading graph -", options.filename_graphY
        wordsY.G = IO.unpickle(options.filename_graphY)

    seed_list = Struct()
    seed_list.X, seed_list.Y = IO.readSeed(filename_seed)  # read the seed list (X,Y)
    wordsX.pushSeedToEnd(seed_list.X)
    wordsY.pushSeedToEnd(seed_list.Y)

    # assert sizes are correct
    Nx = len(wordsX.words)
    Ny = len(wordsY.words)
    if Nx != Ny:
        log(0, "Number of words must be the same", Nx, Ny)
    else:
        log(0, Nx, "words loaded.")

    NSx = len(seed_list.X)
    NSy = len(seed_list.Y)

    if NSx != NSy:
        log(0, "Number of seed words must be the same", NSx, NSy)
    else:
        log(0, NSx, "seed words loaded.")
    assert NSx == NSy

    if options.filename_graphX is not None:
        (NGx0, NGx1) = wordsX.G.shape()
        (NGy0, NGy1) = wordsY.G.shape()
        assert NGx0 == NGx1, "GX is not a square adjacency matrix"
        assert NGy0 == NGy1, "GY is not a square adjacency matrix"

    # permute Y if rand_seed > 1, (this should only be used when testing on mock data)
    # wordsY.permuteFirstWords(perm.randperm(perm.ID(Ny)))
    # MU.printMatching(wordsX.words, wordsY.words, perm.ID(Ny))
    return wordsX, wordsY, seed_list
コード例 #2
0
    # parse cmdline arguments
    parser = OptionParser()
    # general setting
    parser.add_option('--sym', dest='sym', type="int", action='store', default=1)
    parser.add_option('--stoc', dest='stochastic', type="int", action='store', default=1)
    parser.add_option('--KNN', dest='KNN', type="int", action='store', default=10)
    parser.add_option('--normalize', dest='normalize', type="int", action='store', default=1)
    (options, args) = parser.parse_args()
    return options

if __name__ == '__main__':
    # the first command-line argument names the pickled word file
    filename_wordsX = sys.argv[1]

    # load the words, then the remaining command-line options
    wordsX = IO.readPickledWords(filename_wordsX)
    options = parseOptions()

    # build the graph and convert it to a dense matrix
    G = makeGraph(wordsX, options).todense()

    # --normalize 1 selects the l1 norm, 2 selects l2; any other value
    # leaves G unnormalized
    norm_by_mode = {1: 'l1', 2: 'l2'}
    if options.normalize in norm_by_mode:
        G = toSymmetricStochastic(
            G,
            sym=(options.sym == 1),
            stochastic=(options.stochastic == 1),
            norm=norm_by_mode[options.normalize],
        )

    # wrap the matrix, using the word list for both constructor arguments
    msk = MSK(None, wordsX.words, wordsX.words)
    # This is hacky: we trust that G was generated with rows/columns in the
    # same order as wordsX.words
    msk.M = G