Example #1
import sys
import numpy as np  # used by the graph-mode fragment below
from optparse import OptionParser
# project-specific helpers (IO, MSK, makeGraph, toSymmetricStochastic) are
# imported elsewhere in the original source and are not shown in this excerpt

def parseOptions():
    parser = OptionParser()
    parser.add_option('--KNN', dest='KNN', type="int", action='store', default=10)
    parser.add_option('--normalize', dest='normalize', type="int", action='store', default=1)
    # options.sym, options.stochastic, options.M and options.minCoFreq are also
    # read below; their add_option calls were elided from this excerpt
    (options, args) = parser.parse_args()
    return options
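
# makeGraph itself is not shown in this example. The sketch below is a guess at
# its shape, not the original implementation: build a sparse KNN similarity
# graph, keeping the options.KNN strongest edges per word. The attribute
# wordsX.vectors and the dot-product similarity are assumptions.
import scipy.sparse as sp

def makeGraph_sketch(wordsX, options):
    X = np.asarray(wordsX.vectors)      # assumed: one feature row per word
    S = X.dot(X.T)                      # pairwise similarities
    np.fill_diagonal(S, -np.inf)        # exclude self-edges
    G = sp.lil_matrix(S.shape)
    if options.KNN > 0:                 # (KNN <= 0 handling omitted in this sketch)
        for i in range(S.shape[0]):
            nn = np.argsort(S[i])[-options.KNN:]   # K most similar words
            G[i, nn] = S[i, nn]
    return G.tocsr()                    # sparse, as G.todense() below expects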

if __name__ == '__main__':
    # parse arguments
    filename_wordsX = sys.argv[1]

    # read input
    wordsX = IO.readPickledWords(filename_wordsX)
    options = parseOptions()

    # make graph
    G = makeGraph(wordsX, options)
    G = G.todense()

    if options.normalize == 1:
        G = toSymmetricStochastic(G, sym=(options.sym == 1), stochastic=(options.stochastic == 1), norm='l1')
    elif options.normalize == 2:
        G = toSymmetricStochastic(G, sym=(options.sym == 1), stochastic=(options.stochastic == 1), norm='l2')
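    # (a minimal sketch of toSymmetricStochastic follows the __main__ block)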

    msk = MSK(None, wordsX.words, wordsX.words)
    # save the matrix.
    # This is hacky, since we're trusting that G is generated with rows/columns that match the order of wordsX.words
    msk.M = G
    # note: str.replace substitutes every '.', so this assumes the filename
    # contains exactly one '.' (its extension)
    graphFilename = filename_wordsX.replace(".", "_WG.")
    if options.KNN > 0:
        graphFilename = graphFilename.replace(".", "_KNN" + str(options.KNN) + ".")

    IO.pickle(graphFilename, msk)
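
# toSymmetricStochastic is not shown either. A plausible sketch, assuming it
# symmetrizes the affinity matrix and then normalizes each row ('l1' yields a
# row-stochastic random-walk matrix, 'l2' unit-length rows). The body is an
# assumption, not the original implementation.
from sklearn.preprocessing import normalize

def toSymmetricStochastic_sketch(G, sym=True, stochastic=True, norm='l1'):
    G = np.asarray(G, dtype=float)
    if sym:
        G = (G + G.T) / 2.0                     # average with the transpose
    if stochastic:
        G = normalize(G, norm=norm, axis=1)     # 'l1': each row sums to 1
    return G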
    # --- fragment from a separate graph-mode routine in the same example; its
    # enclosing function header is not shown. The branch below is inferred from
    # the `elif graph_mode == 2` that follows. ---
    if graph_mode == 1:  # raw co-occurrence graph (inferred)
        L = M.M.todense()
        I = (L > 0).sum(axis=0) >= options.M  # find words that co-occur with at least M distinct words
        J = np.nonzero(np.array(I)[0])[0]
        # pi_f = [M.features[i] for i in M.strings]
        # pi_s = [M.strings[i] for i in M.strings]
        # P = L[pi_s, pi_f]
        FCW = set([M.reverseFeatures[j] for j in J])
        print('FCW length:', len(FCW), file=sys.stderr)
        #too_frequent = FCW.intersection(wordsX.words)

        L = np.array(L)
        for w in FCW:
            i = M.features[w]
            L[:, i] = 0
        #L *= (L > options.minCoFreq)
        output_edges(M, L, M.reverseFeatures)
    elif graph_mode == 2:  # PMI
        M.M = M.M.todense()
        L = M.materialize(wordsX.words, wordsX.words)  # L's rows/columns are ordered by wordsX.words
        L[:, :1500] = 0  # remove common words
        L = np.array(L, dtype=float) * np.array(L > options.minCoFreq)  # remove low-occurring bigrams
        #L = normalize(L, norm, axis=1)  # normalize rows
        P = np.triu(L)            # upper triangle of the co-occurrence matrix
        P -= np.diag(np.diag(P))  # remove the diagonal
        P += P.T                  # symmetrize
        P /= P.sum()              # P now holds the joint probability P[i, j]

        unigram = np.asarray(wordsX.freq, dtype=float) / np.sum(wordsX.freq)
        Q = np.outer(unigram, unigram)  # Q[i, j] = u[i] * u[j] under independence
        PMI = P / Q  # ratio P(i, j) / (P(i) P(j)); PMI proper is the log of this
        output_edges(M, PMI, M.reverseStrings)
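
# Sanity check for the PMI block above: the ratio P/Q equals 1 when two words
# co-occur exactly as often as independence predicts. The counts below are
# illustrative, not from the original data.
C = np.array([[0., 8., 2.],
              [8., 0., 2.],
              [2., 2., 0.]])    # symmetric co-occurrence counts, zero diagonal
P_demo = C / C.sum()            # joint probability estimate P[i, j]
u = P_demo.sum(axis=1)          # marginal (unigram) probabilities
Q_demo = np.outer(u, u)         # independence baseline Q[i, j] = u[i] * u[j]
ratio = P_demo / Q_demo         # e.g. ratio[0, 1] ≈ 1.92: stronger than chance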