Example #1
def perform_pca(embedding_file, pca_dim):
    freqs, words, _, _, _, A = read_embeddings(embedding_file)
    say('performing PCA to reduce dimensions from {} to {}'.format(
        A.shape[1], pca_dim))
    pca_trans, _, _ = pca_svd(A)
    A_pca = pca_trans[:, :pca_dim]
    write_embeddings(freqs, words, A_pca,
                     embedding_file + '.pca' + str(pca_dim))
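Note: the pca_svd helper referenced above is not defined in any of these examples. A minimal numpy sketch of what such a helper could look like (an assumption for illustration, not the project's actual implementation):

import numpy as np

def pca_svd(A):
    # Center the data, then take the SVD; the projected columns are ordered by
    # decreasing variance, so keeping the first pca_dim columns performs PCA.
    A_centered = A - A.mean(axis=0)
    U, s, Vt = np.linalg.svd(A_centered, full_matrices=False)
    return U * s, s, Vt  # projected data, singular values, principal axes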
Example #2
 def rec(self, string, newline=True):
     if newline:
         print >> self.logf, string
         self.logf.flush()
     else:
         print string,
         self.logf.flush()
     say(string)        
Example #3
 def write_U(self):
     say('Storing row-normalized U at: %s' % self.dirname+'/Ur')
     sorted_indices = [pair[0] for pair in sorted([(i, self.countX[i]) 
                                                   for i in self.wordmap], 
                                                  key=lambda x:x[1], 
                                                  reverse=True)]
     with open(self.dirname+'/Ur', 'wb') as f:
         for i in sorted_indices: write_row(f, self.countX[i], 
                                            self.wordmap[i], self.U[i,:]) 
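Note: write_row is not shown in these examples; presumably it writes one line per word. A hypothetical sketch of such a helper (an assumption, not the project's actual code):

def write_row(f, count, word, row):
    # One line per word: count, the word itself, then its vector entries.
    f.write('{} {} {}\n'.format(count, word, ' '.join(str(v) for v in row)))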
Example #4
def decide_vocab(unigrams, cutoff, vocab_size, want):
    assert (unigrams and os.path.isfile(unigrams))
    assert ((not (cutoff is None and vocab_size is None))
            and (cutoff is None or vocab_size is None))

    say('Reading unigrams')
    vocab = {}
    num_words = 0
    total_sum = 0.
    mysum = 0.

    wantname = ''
    if want:
        wanted_words = {}
        lines = open(want).readlines()
        for line in lines:
            toks = line.split()
            if len(toks) == 0: continue
            wanted_words[toks[0]] = True
        wantname = '.' + os.path.splitext(os.path.basename(want))[0]
        num_wanted = 0

    with open(unigrams) as f:
        for line in f:
            num_words += 1
            toks = line.split()
            if len(toks) != 2: continue
            word = toks[0]
            count = int(toks[1])
            total_sum += count

            if ((cutoff is not None) and
                (count <= cutoff)) or ((vocab_size is not None)
                                       and len(vocab) == vocab_size):
                if not (want and word in wanted_words): continue
            vocab[word] = count
            mysum += count
            if want and word in wanted_words: num_wanted += 1

    if cutoff is not None:
        say('Cutoff %i: keep %i out of %i words (%5.2f%% unigram mass)' %
            (cutoff, len(vocab), num_words, mysum / total_sum * 100))
        outfname = os.path.splitext(unigrams)[0] + '.cutoff' + str(
            cutoff) + wantname

    if vocab_size is not None:
        say('Vocab %i: keep %i out of %i words (%5.2f%% unigram mass)' %
            (vocab_size, len(vocab), num_words, mysum / total_sum * 100))
        outfname = os.path.splitext(unigrams)[0] + '.vocab' + str(
            vocab_size) + wantname

    if want:
        say(' - Have %i out of %i wanted words' %
            (num_wanted, len(wanted_words)))

    return vocab, outfname
Example #5
def count_ngrams(corpus, n_vals=False):
    assert (os.path.isfile(corpus))
    if n_vals == False:
        answer = raw_input('Type in the values of n (e.g., \"1 3\"): ')
        n_vals = [int(n) for n in answer.split()]

    num_tok = 0
    ngrams = [Counter() for n in n_vals]
    queues = [deque([_buffer_ for _ in range(n - 1)], n) for n in n_vals]
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # caching lines
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % (num_tok))
                    for i in range(len(n_vals)):
                        queues[i].append(tok)
                        ngrams[i][tuple(queues[i])] += 1

    for i in range(len(n_vals)):
        for _ in range(n_vals[i] - 1):
            queues[i].append(_buffer_)
            ngrams[i][tuple(queues[i])] += 1

    say('\nTotal {} tokens'.format(num_tok))
    files = [
        os.path.splitext(corpus)[0] + '.' + str(n) + 'grams' for n in n_vals
    ]
    for i in range(len(n_vals)):
        say('Sorting {} {}grams and writing to: {}'.format(
            len(ngrams[i]), n_vals[i], files[i]))
        sorted_ngrams = sorted(ngrams[i].items(),
                               key=lambda x: x[1],
                               reverse=True)
        with open(files[i], 'wb') as outf:
            for ngram, count in sorted_ngrams:
                for tok in ngram:
                    print >> outf, tok,
                print >> outf, count
Example #6
def call_matlab(stat, m, kappa):
    assert(m is not None and kappa is not None)
        
    outdirname = 'output/{}.m{}.kappa{}.matlab.out'.format(complete_path(stat)[:-1].rsplit('/',1)[1] , m, kappa)
    if not os.path.exists(outdirname): os.makedirs(outdirname)                

    commandstr = matlab + ' -nojvm -nodisplay -nosplash -r ' + '\"approx_cca(\'' + stat + '\',' + str(m) + ',' + str(kappa) + ',\'' + outdirname + '\')\"'
    os.system(commandstr)
    
    say('Postprocessing to sort rows by frequency...') 
    wordmap = read_wordmap(os.path.join(stat, 'wordmap'))
    freqmap = read_freqmap(os.path.join(stat, 'X'))
    sorted_indices = [pair[0] for pair in sorted([(i, freqmap[i]) for i in wordmap], key=lambda x:x[1], reverse=True)]
    
    lines = open(os.path.join(outdirname, 'Ur')).readlines()
    with open(os.path.join(outdirname, 'Ur'), 'wb') as outf:
        for i in sorted_indices: write_row(outf, freqmap[i], wordmap[i], lines[i].split())
    
    return outdirname
Example #7
def count_ngrams(corpus, n_vals=False):
    assert(os.path.isfile(corpus))    
    if n_vals == False:
        answer = raw_input('Type in the values of n (e.g., \"1 3\"): ')        
        n_vals = [int(n) for n in answer.split()]
    
    num_tok = 0
    ngrams = [Counter() for n in n_vals]                                      
    queues = [deque([_buffer_ for _ in range(n-1)], n) for n in n_vals]
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000) # caching lines
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0: inline_print('Processed %i tokens' % (num_tok))
                    for i in range(len(n_vals)):
                        queues[i].append(tok)
                        ngrams[i][tuple(queues[i])] += 1
                 
    for i in range(len(n_vals)):
        for _ in range(n_vals[i]-1):
            queues[i].append(_buffer_)
            ngrams[i][tuple(queues[i])] += 1

    say('\nTotal {} tokens'.format(num_tok))
    files = [os.path.splitext(corpus)[0]+'.'+str(n)+'grams' for n in n_vals]        
    for i in range(len(n_vals)):
        say('Sorting {} {}grams and writing to: {}'.format(len(ngrams[i]), n_vals[i], files[i]))
        sorted_ngrams = sorted(ngrams[i].items(), key=lambda x: x[1], reverse=True)
        with open(files[i], 'wb') as outf:
            for ngram, count in sorted_ngrams:
                for tok in ngram:
                    print >> outf, tok,
                print >> outf, count
Example #8
def decide_vocab(unigrams, cutoff, vocab_size, want):
    assert(unigrams and os.path.isfile(unigrams))     
    assert((not (cutoff is None and vocab_size is None)) and (cutoff is None or vocab_size is None))        

    say('Reading unigrams')
    vocab = {}
    num_words = 0 
    total_sum = 0.
    mysum = 0.
    
    wantname = ''
    if want:
        wanted_words = {} 
        lines = open(want).readlines()
        for line in lines:
            toks = line.split()
            if len(toks) == 0: continue 
            wanted_words[toks[0]] = True
        wantname = '.' + os.path.splitext(os.path.basename(want))[0]
        num_wanted = 0
    
    with open(unigrams) as f:
        for line in f:
            num_words += 1
            toks = line.split()
            if len(toks) != 2: continue
            word = toks[0]
            count = int(toks[1])
            total_sum += count            

            if ((cutoff is not None) and (count <= cutoff)) or ((vocab_size is not None) and len(vocab) == vocab_size):
                if not (want and word in wanted_words): continue             
            vocab[word] = count            
            mysum += count
            if want and word in wanted_words: num_wanted += 1  
    
    if cutoff is not None:
        say('Cutoff %i: keep %i out of %i words (%5.2f%% unigram mass)' % (cutoff, len(vocab), num_words, mysum/total_sum*100))
        outfname = os.path.splitext(unigrams)[0] + '.cutoff' + str(cutoff) + wantname
         
    if vocab_size is not None: 
        say('Vocab %i: keep %i out of %i words (%5.2f%% unigram mass)' % (vocab_size, len(vocab), num_words, mysum/total_sum*100))
        outfname = os.path.splitext(unigrams)[0] + '.vocab' + str(vocab_size) + wantname
    
    if want: say(' - Have %i out of %i wanted words' %(num_wanted, len(wanted_words)))
        
    return vocab, outfname
Example #9
    def __init__(self, n_d, words, embs=None, fix_emb=True, bos='<s>', eos='</s>', oov='<oov>', pad='<pad>'):
        '''
            Note: initialization of the extra tokens: [ bos, eos, oov, pad ]
        '''
        word2id = {}
        if embs is not None:
            embwords, embvecs = embs
            for word in embwords:
                assert word not in word2id, "Duplicate words in pre-trained embeddings"
                word2id[word] = len(word2id)

            say("{} pre-trained word embeddings loaded.\n".format(len(word2id)))
            if n_d != len(embvecs[0]):
                say("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.\n".format(
                    n_d, len(embvecs[0]), len(embvecs[0])
                ))
                n_d = len(embvecs[0])

        # if not fix_emb:
        for word in words:
            if word not in word2id:
                word2id[word] = len(word2id)

        extra_tokens = [bos, eos, oov, pad]
        for tok in extra_tokens:
            if tok not in word2id:
                word2id[tok] = len(word2id)

        say("{} embedded words in total\n".format(len(word2id)))

        self.word2id = word2id
        self.n_V, self.n_d = len(word2id), n_d
        self.oovid = word2id[oov]
        self.padid = word2id[pad]
        self.embedding = nn.Embedding(self.n_V, n_d)

        if embs is not None:
            weight = self.embedding.weight
            weight.data[:len(embwords)].copy_(torch.from_numpy(embvecs))
            say("embedding shape: {}\n".format(weight.size()))

        if fix_emb:
            self.embedding.weight.requires_grad = False
Example #10
    def __init__(self, n_d, words, embs=None, fix_emb=True, bos='<s>', eos='</s>', oov='<oov>', pad='<pad>'):
        '''
        if not fix_emb: use the words that only appears in training data
        '''
        word2id = {}
        if embs is not None:
            embwords, embvecs = embs
            if fix_emb: # only fill word2id with words in pre-trained embeddings when fix_emb
                for word in embwords:
                    word2id[word] = len(word2id)

            say("{} pre-trained word embeddings loaded.\n".format(len(embwords)))
            if n_d != len(embvecs[0]):
                say("[WARNING] n_d ({}) != word vector size ({}). Use {} for embeddings.\n".format(
                    n_d, len(embvecs[0]), len(embvecs[0])
                ))
                n_d = len(embvecs[0])

        if words is not None: # fix_emb should be False
            assert (not fix_emb)
            for word in words:
                if word not in word2id:
                    word2id[word] = len(word2id)

        extra_tokens = [bos, eos, oov, pad]
        for tok in extra_tokens:
            if tok not in word2id:
                word2id[tok] = len(word2id)

        say("{} embedded words in total\n".format(len(word2id)))

        self.word2id = word2id
        self.n_V, self.n_d = len(word2id), n_d
        self.oovid = word2id[oov]
        self.padid = word2id[pad]
        self.embedding = nn.Embedding(self.n_V, n_d)
        self.embedding.weight.data.uniform_(-0.25, 0.25)
        self.embedding.weight.data[self.padid].zero_()
        self.embedding.weight.data[self.oovid].zero_()

        if embs is not None:
            weight = self.embedding.weight
            tor_embvecs = torch.from_numpy(embvecs)
            if fix_emb:
                weight.data[:len(embwords)].copy_(tor_embvecs)
            else:
                for word, wid in word2id.items():
                    if word in embwords:
                        # initialize with pre-trained word embeddings
                        weight.data[wid].copy_(tor_embvecs[embwords.index(word)])

        if fix_emb:
            self.embedding.weight.requires_grad = False
Example #11
 def set_params(self, m, kappa):        
     say('m: {}'.format(m))
     say('kappa: {}'.format(kappa))
     self.m = m
     self.kappa = kappa
Example #12
 def write_sv(self):
     say('\nStoring singular values at: %s' % self.dirname+'/sv')
     with open(self.dirname+'/sv', 'wb') as outf:
         for i in range(len(self.sv)): print >> outf, self.sv[i]
Example #13
def extract_stat(corpus, vocab, stat, window):
    stat += '.window' + str(window)    
    assert(os.path.isfile(corpus))
    
    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()
    def inc_stats(q):
        center = (window - 1) / 2 # position of the current token
        if q[center] == _buffer_: return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        for i in range(window):
            if i != center:
                if q[i] == _buffer_: continue
                friend = q[i] if q[i] in vocab else _rare_
                rel_position = i-center
                position_marker = '<+'+str(rel_position)+'>' if rel_position > 0 else '<'+str(rel_position)+'>'
                friend += position_marker
                XYcount[(token, friend)] += 1
                Ycount[friend] += 1
            
    num_tok = 0
    q = deque([_buffer_ for _ in range(window-1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000) # caching lines
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0: inline_print('Processed %i tokens' % (num_tok))
                    q.append(tok)
                    inc_stats(q)                    
    inline_print('\n')
                 
    for _ in range(window-1):
        q.append(_buffer_)
        inc_stats(q)
    

    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat): os.makedirs(stat)                
    xi, yi = {}, {}
    xhead, yhead = 1, 1 # starting from 1 for matlab     

    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount: 
            if not token in xi: xi[token] = xhead; xhead += 1
            print >> Xfile, xi[token], Xcount[token]

    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi: print >> wordmapfile, xi[token], token
 
    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if not friend in yi: yi[friend] = yhead; yhead += 1  
            print >> Yfile, yi[friend], Ycount[friend]
    
    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]
            
    return XYcount, Xcount, Ycount, stat
Example #14
    def get_stat(self, stat):
        self.stat = complete_path(stat)
        XYstats = self.stat + 'XY'
        Xstats = self.stat + 'X' 
        Ystats = self.stat + 'Y' 
        
        assert(os.path.isfile(XYstats) and 
               os.path.isfile(Xstats) and 
               os.path.isfile(Ystats))

        say('XYstats: {}'.format(XYstats))
        say('Xstats: {}'.format(Xstats))
        say('Ystats: {}'.format(Ystats))
        self.wordmap = {}
        wordmapf = self.stat + 'wordmap'
        with open(wordmapf) as f:
            for line in f:
                toks = line.split()
                self.wordmap[int(toks[0])-1] = toks[1]
        
        pickle_file = self.stat + 'pickle'
        if os.path.isfile(pickle_file):
            with open(pickle_file) as f:
                self.countXY, self.countX, self.countY, self.num_samples = \
                    cPickle.load(f)
            return
        
        self.countXY = Counter()
        self.countX = Counter()
        self.countY = Counter()
        self.num_samples = 0. 
        
        num_lines = wc_l(XYstats)
        linenum = 0
        with open(XYstats) as f:
            for line in f:
                linenum += 1
                toks = line.split()
                x, y, count = int(toks[0])-1, int(toks[1])-1, int(toks[2])
                self.countXY[x, y] = count 
                if linenum % 1000 == 0:
                    inline_print('Processing line %i of %i' % 
                                 (linenum, num_lines))
        
        with open(Xstats) as f:
            for line in f:
                toks = line.split()
                x, count = int(toks[0])-1, int(toks[1])
                self.countX[x] = count
                self.num_samples += count
        
        with open(Ystats) as f:
            for line in f:
                toks = line.split()
                y, count = int(toks[0])-1, int(toks[1])
                self.countY[y] = count
        
        inline_print('\nConstructing matrices\n')
        self.countXY = csc_matrix((self.countXY.values(), 
                                   zip(*self.countXY.keys())), 
                                  shape=(len(self.countX), len(self.countY)))
        self.countX = array([self.countX[i] for i in range(len(self.countX))])
        self.countY = array([self.countY[i] for i in range(len(self.countY))])

        with open(pickle_file, 'wb') as outf:
            cPickle.dump((self.countXY, self.countX, self.countY, 
                          self.num_samples), outf, 
                         protocol=cPickle.HIGHEST_PROTOCOL) 
Example #15
def extract_stat(corpus, vocab, stat, window, hash_width = 32):
    stat += '.window' + str(window) + '.hashbits' + str(hash_width)    
    assert(os.path.isfile(corpus))
    
    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()
    CollisionCount = defaultdict(set)
    def inc_stats(q):
        center = (window - 1) / 2 # position of the current token
        if q[center] == _buffer_: return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        friend = ''
        for i in range(window):
            if i != center:
                if q[i] != _buffer_:
                    friend += q[i] if q[i] in vocab else _rare_
                rel_pos = i - center
                pos_marker = ('<+'+str(rel_pos)+'>' if rel_pos > 0 else
                                '<'+str(rel_pos)+'>')
                friend += pos_marker
        friend_hashv = fnv_hash(friend, hash_width)
        CollisionCount[friend_hashv].add(friend)
        XYcount[(token, friend_hashv)] += 1
        Ycount[friend_hashv] += 1
            
    num_tok = 0
    q = deque([_buffer_ for _ in range(window-1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000) # caching lines
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0: inline_print('Processed %i tokens' % (num_tok))
                    q.append(tok)
                    inc_stats(q)                    
    inline_print('\n')
                 
    for _ in range(window-1):
        q.append(_buffer_)
        inc_stats(q)
    
    collisions = 0
    for key, value in CollisionCount.iteritems():
        if len(value) > 1: collisions += len(value)
    say('Collisions: {}'.format(collisions))
    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat): os.makedirs(stat)                
    xi, yi = {}, {}
    xhead, yhead = 1, 1 # starting from 1 for matlab     

    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount: 
            if not token in xi: xi[token] = xhead; xhead += 1
            print >> Xfile, xi[token], Xcount[token]

    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi: print >> wordmapfile, xi[token], token
 
    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if not friend in yi: yi[friend] = yhead; yhead += 1  
            print >> Yfile, yi[friend], Ycount[friend]
    
    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]
            
    return XYcount, Xcount, Ycount, stat
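Note: the fnv_hash helper used above is not shown either. A minimal sketch of a 64-bit FNV-1a hash truncated to hash_width bits (an assumption about its behavior, not the project's actual code):

def fnv_hash(s, hash_width):
    # 64-bit FNV-1a over the characters of s, keeping the low hash_width bits.
    h = 14695981039346656037                          # FNV-1a 64-bit offset basis
    for ch in s:
        h ^= ord(ch)
        h = (h * 1099511628211) & 0xFFFFFFFFFFFFFFFF  # multiply by the FNV prime mod 2**64
    return h & ((1 << hash_width) - 1)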
Example #16
def count_unigrams(corpus):
    unigrams = os.path.splitext(corpus)[0] + '.1grams'
    if not os.path.isfile(unigrams): count_ngrams(corpus, n_vals=[1])
    else: say('{} exists'.format(unigrams))
    return unigrams
Example #17
def perform_pca(embedding_file, pca_dim):
    freqs, words, _, _, _, A = read_embeddings(embedding_file)
    say('performing PCA to reduce dimensions from {} to {}'.format(A.shape[1], pca_dim))            
    pca_trans, _, _ = pca_svd(A) 
    A_pca = pca_trans[:,:pca_dim]
    write_embeddings(freqs, words, A_pca, embedding_file + '.pca' + str(pca_dim))
Example #18
def extract_stat(corpus, vocab, stat, window):
    stat += '.window' + str(window)
    assert (os.path.isfile(corpus))

    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()

    def inc_stats(q):
        center = (window - 1) / 2  # position of the current token
        if q[center] == _buffer_: return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        for i in range(window):
            if i != center:
                if q[i] == _buffer_: continue
                friend = q[i] if q[i] in vocab else _rare_
                rel_position = i - center
                position_marker = '<+' + str(
                    rel_position) + '>' if rel_position > 0 else '<' + str(
                        rel_position) + '>'
                friend += position_marker
                XYcount[(token, friend)] += 1
                Ycount[friend] += 1

    num_tok = 0
    q = deque([_buffer_ for _ in range(window - 1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # caching lines
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % (num_tok))
                    q.append(tok)
                    inc_stats(q)
    inline_print('\n')

    for _ in range(window - 1):
        q.append(_buffer_)
        inc_stats(q)

    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat): os.makedirs(stat)
    xi, yi = {}, {}
    xhead, yhead = 1, 1  # starting from 1 for matlab

    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount:
            if not token in xi:
                xi[token] = xhead
                xhead += 1
            print >> Xfile, xi[token], Xcount[token]

    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi:
            print >> wordmapfile, xi[token], token

    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if not friend in yi:
                yi[friend] = yhead
                yhead += 1
            print >> Yfile, yi[friend], Ycount[friend]

    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]

    return XYcount, Xcount, Ycount, stat