    # --- tail of the preceding graph_mode branch (its `if` header is above this chunk) ---
    # Densify the sparse co-occurrence matrix M.M so boolean reductions work row-wise.
    L = M.M.todense()
    # I[j] is True when feature-word j co-occurs with at least options.M distinct words
    # (count of strictly positive entries down each column).
    I = (L > 0).sum(axis=0) >= options.M
    # Column indices of those frequently co-occurring feature words.
    # np.array(I)[0] flattens the 1xN matrix row produced by the reduction above.
    J = np.nonzero(np.array(I)[0])[0]
    # pi_f = [M.features[i] for i in M.strings]
    # pi_s = [M.strings[i] for i in M.strings]
    # P = L[pi_s, pi_f]
    # FCW = the set of "frequently co-occurring words", mapped back from column
    # index to word via M.reverseFeatures (presumably index -> word; TODO confirm).
    FCW = set([M.reverseFeatures[j] for j in J])
    print >> sys.stderr, 'FCW length:', len(FCW)
    #too_frequent = FCW.intersection(wordsX.words)
    # Convert to a plain ndarray so column assignment below behaves elementwise.
    L = np.array(L)
    # Zero out the column of every frequently co-occurring word, removing their
    # edges from the graph before output.
    for w in FCW:
        i = M.features[w]
        L[:, i] = 0
    #L *= (L > options.minCoFreq)
    # Emit the remaining co-occurrence edges; reverseFeatures maps column
    # indices back to word labels.
    output_edges(M, L, M.reverseFeatures)
elif graph_mode == 2:  # PMI: weight edges by pointwise mutual information
    M.M = M.M.todense()
    # L is the word-by-word co-occurrence count matrix, rows and columns both
    # ordered by wordsX.words (per M.materialize's contract — not visible here).
    L = M.materialize(wordsX.words, wordsX.words)
    # Zero the first 1500 columns — assumes wordsX.words is sorted by frequency,
    # so this drops edges into the 1500 most common words; TODO confirm ordering.
    L[:, :1500] = 0
    # Mask out low-frequency bigrams: keep a count only where it exceeds
    # options.minCoFreq, otherwise zero it.
    L = np.array(L) * np.array(L>options.minCoFreq)
    #L = normalize(L, norm, axis=1)  # normalize rows
    # Build a symmetric joint-probability matrix from the upper triangle:
    P = np.triu(L)
    # Unigram distribution as a 1xN matrix (row vector of word probabilities).
    unigram = np.mat(wordsX.freq*1.0 / np.sum(wordsX.freq))
    P -= np.diag(np.diag(P))  # drop the diagonal (self co-occurrence)
    P += P.T                  # mirror the upper triangle so P is symmetric
    P /= P.sum()              # normalize: P[i, j] is now the joint probability
    # Q[i, j] = U_i * U_j, the product of the marginals (outer product of the
    # unigram row vector with itself).
    Q = np.array(unigram.T * unigram)
    # Pointwise mutual information ratio P(i,j) / (P(i)P(j)).
    # NOTE(review): zeroed cells of P give PMI == 0 here, and any zero unigram
    # entry would divide by zero — presumably every word in wordsX has freq > 0.
    PMI = P / Q
    # Emit PMI-weighted edges; reverseStrings maps row/column indices back to words.
    output_edges(M, PMI, M.reverseStrings)