Example #1
def rewrite_corpus(corpus, vocab, outfname):
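    """Copy `corpus` to <outfname>.corpus with one token per line, replacing
    any token not in `vocab` with the placeholder '<?>'."""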
    outfname += '.corpus'
    num_tok = 0
    with open(outfname, 'wb') as outf:
        with open(corpus) as corpusf:
            while True:
                lines = corpusf.readlines(10000000)  # read up to ~10 MB of lines at a time
                if not lines: break
                for line in lines:
                    toks = line.split()
                    for tok in toks:
                        num_tok += 1
                        if tok in vocab: outf.write(tok + '\n')
                        else:            outf.write('<?>\n')
                        if num_tok % 1000 == 0: inline_print('Processed %i tokens' % num_tok)
            inline_print('\n')
Example #2
def count_ngrams(corpus, n_vals=None):
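    """Count n-gram frequencies in `corpus` for each n in `n_vals`, then write
    each set of counts, sorted by frequency, to <corpus-stem>.<n>grams."""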
    assert os.path.isfile(corpus)
    if n_vals is None:
        answer = raw_input('Type in the values of n (e.g., "1 3"): ')
        n_vals = [int(n) for n in answer.split()]

    num_tok = 0
    ngrams = [Counter() for n in n_vals]
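    # queues[i] is a sliding window over the last n_vals[i] tokens,
    # seeded with n_vals[i] - 1 buffer symbols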
    queues = [deque([_buffer_ for _ in range(n - 1)], n) for n in n_vals]
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read up to ~10 MB of lines at a time
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    for i in range(len(n_vals)):
                        queues[i].append(tok)
                        ngrams[i][tuple(queues[i])] += 1

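    # append trailing buffer symbols so n-grams ending at the final tokens are counted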
    for i in range(len(n_vals)):
        for _ in range(n_vals[i] - 1):
            queues[i].append(_buffer_)
            ngrams[i][tuple(queues[i])] += 1

    say('\nTotal {} tokens'.format(num_tok))
    files = [
        os.path.splitext(corpus)[0] + '.' + str(n) + 'grams' for n in n_vals
    ]
    for i in range(len(n_vals)):
        say('Sorting {} {}grams and writing to: {}'.format(
            len(ngrams[i]), n_vals[i], files[i]))
        sorted_ngrams = sorted(ngrams[i].items(),
                               key=lambda x: x[1],
                               reverse=True)
        with open(files[i], 'wb') as outf:
            for ngram, count in sorted_ngrams:
                for tok in ngram:
                    print >> outf, tok,
                print >> outf, count
Example #3
    def get_stat(self, stat):
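        """Load cooccurrence statistics (XY, X, and Y counts plus the wordmap)
        from the directory `stat`, caching the parsed result as a pickle."""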
        self.stat = complete_path(stat)
        XYstats = self.stat + 'XY'
        Xstats = self.stat + 'X' 
        Ystats = self.stat + 'Y' 
        
        assert(os.path.isfile(XYstats) and 
               os.path.isfile(Xstats) and 
               os.path.isfile(Ystats))

        say('XYstats: {}'.format(XYstats))
        say('Xstats: {}'.format(Xstats))
        say('Ystats: {}'.format(Ystats))
        self.wordmap = {}
        wordmapf = self.stat + 'wordmap'
        with open(wordmapf) as f:
            for line in f:
                toks = line.split()
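                # the wordmap file is 1-indexed (MATLAB convention); shift to 0-based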
                self.wordmap[int(toks[0])-1] = toks[1]
        
        pickle_file = self.stat + 'pickle'
        if os.path.isfile(pickle_file):
            with open(pickle_file, 'rb') as f:
                self.countXY, self.countX, self.countY, self.num_samples = \
                    cPickle.load(f)
            return
        
        self.countXY = Counter()
        self.countX = Counter()
        self.countY = Counter()
        self.num_samples = 0. 
        
        num_lines = wc_l(XYstats)
        linenum = 0
        with open(XYstats) as f:
            for line in f:
                linenum += 1
                toks = line.split()
                x, y, count = int(toks[0])-1, int(toks[1])-1, int(toks[2])
                self.countXY[x, y] = count 
                if linenum % 1000 == 0:
                    inline_print('Processing line %i of %i' % 
                                 (linenum, num_lines))
        
        with open(Xstats) as f:
            for line in f:
                toks = line.split()
                x, count = int(toks[0])-1, int(toks[1])
                self.countX[x] = count
                self.num_samples += count
        
        with open(Ystats) as f:
            for line in f:
                toks = line.split()
                y, count = int(toks[0])-1, int(toks[1])
                self.countY[y] = count
        
        inline_print('\nConstructing matrices\n')
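        # .keys() and .values() stay aligned because the Counter is not
        # modified between the two calls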
        self.countXY = csc_matrix((self.countXY.values(), 
                                   zip(*self.countXY.keys())), 
                                  shape=(len(self.countX), len(self.countY)))
        self.countX = array([self.countX[i] for i in range(len(self.countX))])
        self.countY = array([self.countY[i] for i in range(len(self.countY))])

        with open(pickle_file, 'wb') as outf:
            cPickle.dump((self.countXY, self.countX, self.countY, 
                          self.num_samples), outf, 
                         protocol=cPickle.HIGHEST_PROTOCOL) 
Example #4
def extract_stat(corpus, vocab, stat, window, hash_width=32):
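    """Collect (token, hashed-context) cooccurrence counts over a sliding
    window and write them, 1-indexed for MATLAB, under the directory `stat`."""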
    stat += '.window' + str(window) + '.hashbits' + str(hash_width)
    assert os.path.isfile(corpus)
    
    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()
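    # record which raw context strings map to each hash value, to report collisions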
    CollisionCount = defaultdict(set)
    def inc_stats(q):
        center = (window - 1) / 2 # position of the current token
        if q[center] == _buffer_: return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        friend = ''
        for i in range(window):
            if i != center:
                if q[i] != _buffer_:
                    friend += q[i] if q[i] in vocab else _rare_
                rel_pos = i - center
                pos_marker = ('<+' + str(rel_pos) + '>' if rel_pos > 0
                              else '<' + str(rel_pos) + '>')
                friend += pos_marker
        friend_hashv = fnv_hash(friend, hash_width)
        CollisionCount[friend_hashv].add(friend)
        XYcount[(token, friend_hashv)] += 1
        Ycount[friend_hashv] += 1
            
    num_tok = 0
    q = deque([_buffer_ for _ in range(window-1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read up to ~10 MB of lines at a time
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0: inline_print('Processed %i tokens' % num_tok)
                    q.append(tok)
                    inc_stats(q)
    inline_print('\n')
                 
    for _ in range(window-1):
        q.append(_buffer_)
        inc_stats(q)
    
    collisions = 0
    for key, value in CollisionCount.iteritems():
        if len(value) > 1: collisions += len(value)
    say('Collisions: {}'.format(collisions))
    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat): os.makedirs(stat)                
    xi, yi = {}, {}
    xhead, yhead = 1, 1 # starting from 1 for matlab     

    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount: 
            if token not in xi:
                xi[token] = xhead
                xhead += 1
            print >> Xfile, xi[token], Xcount[token]

    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi: print >> wordmapfile, xi[token], token
 
    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if friend not in yi:
                yi[friend] = yhead
                yhead += 1
            print >> Yfile, yi[friend], Ycount[friend]
    
    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]
            
    return XYcount, Xcount, Ycount, stat
Example #5
def extract_stat(corpus, vocab, stat, window):
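    """Like the hashed variant above, but keep each raw positioned context
    string as its own Y feature instead of hashing it."""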
    stat += '.window' + str(window)
    assert os.path.isfile(corpus)

    XYcount = Counter()
    Xcount = Counter()
    Ycount = Counter()

    def inc_stats(q):
        center = (window - 1) / 2  # position of the current token
        if q[center] == _buffer_: return
        token = q[center] if q[center] in vocab else _rare_
        Xcount[token] += 1
        for i in range(window):
            if i != center:
                if q[i] == _buffer_: continue
                friend = q[i] if q[i] in vocab else _rare_
                rel_position = i - center
                position_marker = ('<+' + str(rel_position) + '>'
                                   if rel_position > 0
                                   else '<' + str(rel_position) + '>')
                friend += position_marker
                XYcount[(token, friend)] += 1
                Ycount[friend] += 1

    num_tok = 0
    q = deque([_buffer_ for _ in range(window - 1)], window)
    with open(corpus) as f:
        while True:
            lines = f.readlines(10000000)  # read up to ~10 MB of lines at a time
            if not lines: break
            for line in lines:
                toks = line.split()
                for tok in toks:
                    num_tok += 1
                    if num_tok % 1000 == 0:
                        inline_print('Processed %i tokens' % num_tok)
                    q.append(tok)
                    inc_stats(q)
    inline_print('\n')

    for _ in range(window - 1):
        q.append(_buffer_)
        inc_stats(q)

    say('Creating directory {}'.format(stat))
    if not os.path.exists(stat): os.makedirs(stat)
    xi, yi = {}, {}
    xhead, yhead = 1, 1  # starting from 1 for matlab

    with open(stat + '/X', 'wb') as Xfile:
        for token in Xcount:
            if token not in xi:
                xi[token] = xhead
                xhead += 1
            print >> Xfile, xi[token], Xcount[token]

    with open(stat + '/wordmap', 'wb') as wordmapfile:
        for token in xi:
            print >> wordmapfile, xi[token], token

    with open(stat + '/Y', 'wb') as Yfile:
        for friend in Ycount:
            if friend not in yi:
                yi[friend] = yhead
                yhead += 1
            print >> Yfile, yi[friend], Ycount[friend]

    with open(stat + '/XY', 'wb') as XYfile:
        for (token, friend) in XYcount:
            print >> XYfile, xi[token], yi[friend], XYcount[(token, friend)]

    return XYcount, Xcount, Ycount, stat