Example 1
import sys
from optparse import OptionParser

def parseOptions():
    parser = OptionParser()
    # --sym and --stochastic are referenced in __main__ below; the defaults here are assumed
    parser.add_option('--sym', dest='sym', type="int", action='store', default=1)
    parser.add_option('--stochastic', dest='stochastic', type="int", action='store', default=1)
    parser.add_option('--KNN', dest='KNN', type="int", action='store', default=10)
    parser.add_option('--normalize', dest='normalize', type="int", action='store', default=1)
    (options, args) = parser.parse_args()
    return options

if __name__ == '__main__':
    # parse arguments
    filename_wordsX = sys.argv[1]

    # read input
    wordsX = IO.readPickledWords(filename_wordsX)
    options = parseOptions()

    # make graph
    G = makeGraph(wordsX, options)
    G = G.todense()

    if options.normalize == 1:
        G = toSymmetricStochastic(G, sym=(options.sym == 1), stochastic=(options.stochastic == 1), norm='l1')
    elif options.normalize == 2:
        G = toSymmetricStochastic(G, sym=(options.sym == 1), stochastic=(options.stochastic == 1), norm='l2')

    msk = MSK(None, wordsX.words, wordsX.words)
    # save the matrix.
    # This is hacky, since we're trusting that G is generated with rows/columns that match the order of wordsX.words
    msk.M = G
    graphFilename = filename_wordsX.replace(".", "_WG.")  # note: replaces every '.' in the filename
    if options.KNN > 0:
        graphFilename = graphFilename.replace(".", "_KNN"+str(options.KNN)+".")

    IO.pickle(graphFilename, msk)
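toSymmetricStochastic is defined elsewhere in this repo. As a rough guide to what a normalization with this signature plausibly does (symmetrize, then rescale rows), here is a minimal sketch assuming a dense numpy array; the name to_symmetric_stochastic_sketch and the details below are illustrative assumptions, not the repo's implementation:

import numpy as np

def to_symmetric_stochastic_sketch(G, sym=True, stochastic=True, norm='l1'):
    G = np.asarray(G, dtype=float)
    if sym:
        G = (G + G.T) / 2.0                        # symmetrize the adjacency matrix
    if stochastic:
        if norm == 'l1':
            scale = G.sum(axis=1)                  # l1: rows sum to 1 (row-stochastic)
        else:
            scale = np.sqrt((G ** 2).sum(axis=1))  # l2: rows get unit length
        scale[scale == 0] = 1.0                    # leave all-zero rows as-is
        G = G / scale[:, None]
    return G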
Example 3
class Words:
    def __init__(self, name):
        self.name = name           # source filename; also used to derive cache filenames
        self.words = []            # the word strings
        self.freq = []             # word frequencies, aligned with self.words
        self.features = []         # dense feature matrix, one row per word
        self.featureNames = []     # names of the context features
        self.G = None              # word graph, stored as an MSK
        self.repr = {}             # sparse context representation: word -> {feature: count}
        self.options = None

    def setOptions(self, options):
        self.options = options

    def toNP(self):  # convert the list fields to numpy arrays
        #self.words = np.array(self.words)
        self.freq = np.array(self.freq)
        self.features = np.array(self.features)
        if self.G is not None:  # keep None as None so materializeGraph's check still works
            self.G = np.array(self.G)

    def setupFeatures(self, options=None):
        #logFr = np.log(X.freq)
        # L = strings.strlen(self.words)

        # normalize the features
        if self.isPickled():
            (orthoDD, orthoFeatures) = strings.to_ngram_dictionary(self.words, affix=True)
            if options.log_features == 1:
                self.orthoMSK = MSK(orthoDD, self.words, orthoFeatures).log(offset=1).normalize(norm='l2')
                self.contextMSK = MSK(self.repr, self.words, self.featureNames).log(offset=1).normalize(norm='l2')
            else:
                self.orthoMSK = MSK(orthoDD, self.words, orthoFeatures).normalize(norm='l2')
                self.contextMSK = MSK(self.repr, self.words, self.featureNames).normalize(norm='l2')
        else:
            self.features = common.normalize_rows(self.features)
        # TODO: should we also add logFr and L?
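    # strings.to_ngram_dictionary is repo-internal; it presumably maps each
    # word to its character n-gram counts, with affix markers at the word
    # boundaries (the exact format shown here is an assumption), e.g.:
    #   "cat" -> {"^c": 1, "ca": 1, "at": 1, "t$": 1}
    # so orthoMSK is a word-by-ngram count matrix.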

    def computeOrthographicKernel(self):
        print >> sys.stderr, 'Computing Orthographic Kernel for', self.name
        return self.orthoMSK.makeLinearKernel()

    def computeContextKernel(self):
        print >> sys.stderr, 'Computing Context Kernel for', self.name
        return self.contextMSK.makeLinearKernel()

    def cacheOrComputeKernel(self, options, filename, f):
        if options.useCache == 1 and os.path.exists(filename):
            print >> sys.stderr, 'Loading kernel from file:', filename
            return IO.unpickle(filename)
        else:
            K = f(self)
            print >> sys.stderr, 'Saving kernel to file:', filename
            IO.pickle(filename, K)
            return K
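    # Caution: the cache filename depends only on self.name, so if feature
    # options change between runs the pickled kernel on disk goes stale;
    # delete the cached file (or run with useCache=0) to force recomputation.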

    def computeKernel(self, options):
        print >> sys.stderr, 'Computing Kernel for', self.name

        K_context = None
        K_ortho = None
        if options.useContextFeatures == 1:
            filename_ck = self.name.replace('.', '_context_kernel.')

            K_context = self.cacheOrComputeKernel(options, filename_ck, lambda self: self.computeContextKernel())
            #K_context0 = self.computeContextKernel()
            #print 'AAA: ', np.linalg.norm(K_context.K - K_context0.K)

        if options.useOrthoFeatures == 1:
            filename_ok = self.name.replace('.', '_ortho_kernel.')
            K_ortho = self.cacheOrComputeKernel(options, filename_ok, lambda self: self.computeOrthographicKernel())
            #K_ortho0 = self.computeOrthographicKernel()
            #print 'BBB: ', np.linalg.norm(K_ortho.K - K_ortho0.K)

        if K_ortho is None:
            self.kernel = K_context
        elif K_context is None:
            self.kernel = K_ortho
        else:
            assert K_context.strings == K_ortho.strings  # strings should be numbered the same.
            K_context.K += K_ortho.K
            self.kernel = K_context

        return self.kernel
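    # Note: the elementwise sum of two positive semidefinite kernel matrices is
    # itself a valid kernel, so adding K_ortho.K into K_context.K above is a
    # sound way to combine orthographic and context similarity.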

    def getKernel(self):
        return self.kernel.K

    def materializeGraph(self):
        if self.G is None:
            return None
        # otherwise, return the graph,
        # permuted according to the order of self.words
        return self.G.materialize(self.words, self.words)

    # This method permutes all the fields of X according to pi.
    # If pi is shorter than X.words, then only the first entries are permuted
    # and the rest stay in place.
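    # e.g., with words == ['a', 'b', 'c', 'd'] and pi == [2, 0, 1], the first
    # three entries become ['c', 'a', 'b'] and 'd' keeps its position.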
    def permuteFirstWords(self, pi):
        pi = np.array(pi)
        M = len(pi)
        idx = perm.ID(M)  # identity permutation [0..M-1]; avoids shadowing the builtin id
        self.words[idx] = self.words[pi]
        self.freq[idx] = self.freq[pi]
        if not common.isEmpty(self.features):
            self.features[idx, :] = self.features[pi, :]
        # note that there is no need to permute the graph, since it is stored as an MSK
        # and will be materialized according to the order of self.words

    def isPickled(self):
        # words read from a pickled file carry a sparse context repr
        return len(self.repr) > 0

    def pushSeedToEnd(self, seed):
        # Push the seed words to the end of the list. The order of the seed
        # matters; the order of the rest doesn't.
        S = set(seed)
        non_seed = [x for x in self.words if x not in S]
        self.words = np.array(non_seed + seed)
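    # e.g., words == ['a', 'b', 'c'] with seed == ['c', 'a'] becomes ['b', 'c', 'a']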

    def ICD_representation(self, Nt, eta):
        # step 1: compute the ICD model from the last N-Nt words
        # (those are supposed to be well aligned - the seed is at the end)
        # step 2: project all the data onto the model
        # (a generic incomplete-Cholesky sketch appears after this class)
        # use_ICD = True
        # if use_ICD:
        print >> sys.stderr, "Computing ICD model for",  self.name
        keys = self.words[Nt:]
        K = self.kernel.materialize(keys, keys)
        model = ICD.ichol_words(K, keys, eta)
        print >> sys.stderr, "Computing Representations"
        K = self.kernel.materialize(self.words, model.keys)
        self.features = ICD.getRepresentations_words(model, K)
        print >> sys.stderr, "Done ICD."
        return model

    def addReprNoise(self, noise):
        # add scaled Gaussian noise to every sparse feature count
        for k in self.repr.keys():
            d = self.repr[k]
            for kk in d.keys():
                d[kk] = d[kk] + noise * common.randn((1, 1))

    def asTuple(self):
        return self.words, self.freq, self.features

    # @staticmethod
    # def concat(A, B):
    #     C = Words(A.name)  # take the name of A
    #     C.words = np.append(A.words, B.words)
    #     C.freq = np.append(A.freq, B.freq)
    #     C.features = np.vstack((A.features, B.features))
    #     # union the two dictionaries (python!, what magic you have)
    #     C.repr = dict({}, **copy.deepcopy(A.repr))
    #     C.repr = dict(C.repr, **copy.deepcopy(B.repr))
    #     C.featureNames = A.featureNames
    #     assert A.featureNames == B.featureNames
    #     return C
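# ICD.ichol_words and ICD.getRepresentations_words above are repo-internal.
# For reference, here is a minimal sketch of the greedy incomplete Cholesky
# decomposition such a module typically implements; the function name and the
# assumption that K is a dense symmetric PSD numpy array are mine, not the repo's.
import numpy as np

def incomplete_cholesky_sketch(K, eta):
    # Greedily pivot on the largest residual diagonal entry until the residual
    # drops below eta. Returns R (k x n) with K ~= R.T.dot(R), plus the pivot
    # indices, which play the role of the model's "keys".
    n = K.shape[0]
    d = np.diag(K).astype(float)  # residual diagonal
    R = np.zeros((0, n))
    pivots = []
    while len(pivots) < n and d.max() > eta:
        j = int(np.argmax(d))
        pivots.append(j)
        row = (K[j, :] - R[:, j].dot(R)) / np.sqrt(d[j])
        R = np.vstack([R, row[None, :]])
        d -= row ** 2
    return R, pivots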
            if feature in M.strings and L[iw, jf] > 0:
                print "%s,%s,%f" % (word, feature, L[iw, jf])
                count += 1  # count printed neighbors so the K-limit below can trigger
            if count == options.K:
                break

# python extract_frequent_neighbors.py data/en-es/en_pickled_N\=3100.txt 10 > en_co_N=3100.edges
# python extract_frequent_neighbors.py data/en-es/es_pickled_N\=3100.txt 10 > es_co_N=3100.edges
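# Each run prints "word,neighbor,weight" CSV lines to stdout (see the print
# above), which the shell command redirects into an .edges file.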

if __name__ == '__main__':
    filename = sys.argv[1]
    graph_mode = int(sys.argv[2])
    options = parseOptions()

    wordsX = IO.readPickledWords(filename)
    M = MSK(wordsX.repr, wordsX.words, wordsX.featureNames)

    if graph_mode == 1:  # remove words (columns) that occur too frequently
        L = M.M.todense()
        I = (L > 0).sum(axis=0) >= options.M  # find words that co-occur with at least M distinct words
        J = np.nonzero(np.array(I)[0])[0]
        # pi_f = [M.features[i] for i in M.strings]
        # pi_s = [M.strings[i] for i in M.strings]
        # P = L[pi_s, pi_f]
        FCW = set([M.reverseFeatures[j] for j in J])
        print >> sys.stderr, 'FCW length:', len(FCW)
        #too_frequent = FCW.intersection(wordsX.words)

        L = np.array(L)
        for w in FCW:
            i = M.features[w]