Example No. 1
 def __init__(self, path, normalize=True):
     self.wi, self.iw = load_vocabulary(path + '.words.vocab')
     self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
     self.m = load_matrix(path)
     self.m.data = np.log(self.m.data)
     self.normal = normalize
     if normalize:
         self.normalize()
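
Most of the examples below call load_vocabulary(path) on a vocabulary file and unpack the result as a token-to-index dict plus an index-to-token list (a few classmethods later on use a two-argument variant instead). A minimal sketch of the one-argument form, assuming the file simply lists one token per line; the project's actual helper may differ:

def load_vocabulary(vocab_path):
    # Hypothetical helper: read one token per line and build both mappings.
    with open(vocab_path) as f:
        index_to_word = [line.strip().split()[0] for line in f if line.strip()]
    word_to_index = {word: i for i, word in enumerate(index_to_word)}
    return word_to_index, index_to_word
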
Example No. 2
 def __init__(self, path, normalize=True):
     self.wi, self.iw = load_vocabulary(path + '.words.vocab')
     self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
     self.m = load_matrix(path)
     self.m.data = np.log(self.m.data)
     self.normal = normalize
     if normalize:
         self.normalize()
Example No. 3
 def __init__(self, path, normalize=True, k=1):
     Explicit.__init__(self, path, False)
     self.wi, self.iw = load_vocabulary(path + '.words.vocab')
     self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
     self.m = load_matrix(path)
     self.m.data = self.m.data - np.log(k)
     # self.normal = normalize
     if normalize:
         self.normalize()
Example No. 4
 def __init__(self, path, normalize=True, glen=5):
     self.wi, self.iw = load_vocabulary(path + '.words.vocab')
     self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
     self.sz, self.ng_freqs = self.load_counts(path)
     self.m = load_matrix(path)
     self.m.data = np.log(self.m.data)
     self.normal = normalize
     self.glen = glen
     if normalize:
         self.normalize()
Example No. 5
 def __init__(self, path, normalize=True):
     Explicit.__init__(self, path, False)
     self.wi, self.iw = load_vocabulary(path + '.words.vocab')
     self.ci, self.ic = load_vocabulary(path + '.contexts.vocab')
     self.m = load_matrix(path)
     self.m.data = np.log(self.m.data)
     self.m.data[self.m.data <= 0] = 0
     self.m.data[self.m.data > 0] = 1
     # self.normal = normalize
     if normalize:
         self.normalize()
Example No. 6
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --processes_num NUM        The number of processes [default: 12]
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 300]
        --iters NUM                The number of iterations [default: 1]
    """)

    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    processes_num = int(args['--processes_num'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    global_word_count = Value('l', 0)
    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    print()

    for i in range(iters):
        pool = Pool(processes=processes_num,
                    initializer=__init_process,
                    initargs=(w2i, c2i, syn0, syn1, table, negative, size,
                              alpha, processes_num, global_word_count,
                              pairs_num, iters, pairs_path))
        pool.map(train_process, range(processes_num))

    save(i2w, syn0, outputs_path)
    print("word2vecf finished")
Example No. 7
    def __init__(self, path, normalize=True):
        self.tmp_m = []
        self.m = []
        for i in range(5):
            ind = (i + 1) * 100
            self.tmp_m.append(np.load(path + '_' + str(ind) + '.words.npy'))

        self.wi, self.iw = load_vocabulary(path + '_500.words.vocab')
        diff_norms = np.linalg.norm(self.tmp_m[4], ord=2, axis=1)

        p_scores = [
            np.percentile(diff_norms, i)
            for i in [0.0, 20.0, 40.0, 60.0, 80.0, 100.0]
        ]
        for i in range(self.tmp_m[0].shape[0]):
            norm = diff_norms[i]
            ind = 5
            for j in range(len(p_scores)):
                if norm < p_scores[j]:
                    ind = j
                    break
            self.m.append(
                np.concatenate(
                    (self.tmp_m[ind - 1][i], np.zeros(500 - (ind) * 100))))

        self.m = np.asarray(self.m)

        if normalize:
            self.normalize()
        self.dim = self.m.shape[1]
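
The constructor above keeps five checkpointed matrices (100 to 500 columns), assigns each word to one of them according to where its L2 norm falls among percentile cut points, and zero-pads every row back to 500 columns so the rows stack into a single array. A toy illustration of just the percentile bucketing, on made-up norms:

import numpy as np

rng = np.random.default_rng(1)
norms = rng.random(10)
cuts = [np.percentile(norms, q) for q in (0.0, 20.0, 40.0, 60.0, 80.0, 100.0)]

buckets = []
for norm in norms:
    ind = 5                      # falls through to the largest bucket
    for j, cut in enumerate(cuts):
        if norm < cut:
            ind = j
            break
    buckets.append(ind)
print(buckets)                   # values land in 1..5; 0 would mean below the minimum
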
Example No. 8
 def load(cls, path, normalize=True, restricted_context=None, **kwargs):
     mat = load_matrix(path)
     word_vocab, context_vocab = load_vocabulary(mat, path)
     return cls(mat,
                word_vocab,
                context_vocab,
                normalize=normalize,
                restricted_context=restricted_context)
Example No. 9
 def __init__(self, path):
     self.m = []
     for line in open(path, 'r'):
         self.m.append([float(elem) for elem in line.split()[1:]])
     self.m = np.asarray(self.m)
     self.wi, self.iw = load_vocabulary(path + '.words.vocab')
     self.dim = self.m.shape[1]
     self.normalize()
Example No. 10
 def __init__(self, path, normalize=True, glen=5):
     self.m = np.load(path + '.npy')
     self.sz, self.ng_freqs = self.load_counts(path)
     self.glen = glen
     if normalize:
         self.normalize()
     self.dim = self.m.shape[1]
     self.wi, self.iw = load_vocabulary(path + '.vocab')
Example No. 11
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --processes_num NUM        The number of processes [default: 12]
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 300]
        --iters NUM                The number of iterations [default: 1]
    """)
    
    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    processes_num = int(args['--processes_num'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    global_word_count = Value('l', 0)
    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    print()

    for i in range(iters):
        pool = Pool(processes=processes_num,
                    initializer=__init_process,
                    initargs=(w2i, c2i, syn0, syn1, table, negative, size,
                              alpha, processes_num, global_word_count,
                              pairs_num, iters, pairs_path))
        pool.map(train_process, range(processes_num))

    save(i2w, syn0, outputs_path)
    print ("word2vecf finished")
Example No. 12
    def __init__(self, path, normalize=True, eig=0.0, transpose=False):
        if transpose:
            ut = np.load(path + '.vt.npy')
            self.wi, self.iw = load_vocabulary(path + '.contexts.vocab')
        else:
            ut = np.load(path + '.ut.npy')
            self.wi, self.iw = load_vocabulary(path + '.words.vocab')
        s = np.load(path + '.s.npy')

        if eig == 0.0:
            self.m = ut.T
        elif eig == 1.0:
            self.m = s * ut.T
        else:
            self.m = np.power(s, eig) * ut.T

        self.dim = self.m.shape[1]

        diff_norms = np.linalg.norm(self.m, ord=2, axis=1)

        p_scores = [
            np.percentile(diff_norms, i)
            for i in [0.0, 20.0, 40.0, 60.0, 80.0, 100.0]
        ]
        print(self.m.shape)
        dim = [600, 700, 800, 900, 1000]
        #dim = [1000, 1000, 1000, 1000, 1000]
        for i in range(self.m.shape[0]):
            norm = diff_norms[i]
            #ind = [j for j in range(len(p_scores)) if (p_scores[j] > norm) ]
            #ind = ind[0]
            ind = 0
            for j in range(len(p_scores)):
                if norm < p_scores[j]:
                    ind = j
                    break
            #print (ind)
            self.m[i] = ut.T[i] * np.power(
                np.concatenate(
                    (s[:dim[ind - 1]], np.zeros(self.dim - dim[ind - 1]))),
                eig)

        if normalize:
            self.normalize()
Example No. 13
    def __init__(self, path, pmi, normalize=True, neg=1):
        self.wi, self.iw = load_vocabulary(path + '.words.vocab')
        self.m = pmi
        self.m.data = np.log(self.m.data)

        self.m.data -= np.log(neg)
        self.m.data[self.m.data < 0] = 0
        self.m.eliminate_zeros()
        if normalize:
            self.normalize()
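
This constructor turns what is presumably a matrix of probability ratios into shifted positive PMI: take logs, subtract log(neg), clamp negative values to zero, and drop the explicit zeros from the sparse structure. A small self-contained illustration of the same transformation on a toy scipy matrix (the values are invented, not project data):

import numpy as np
from scipy.sparse import csr_matrix

# Toy P(w, c) / (P(w) * P(c)) ratios; the real code loads these from disk.
ratios = csr_matrix(np.array([[8.0, 0.5, 0.0],
                              [0.0, 2.0, 1.0]]))
neg = 2
ratios.data = np.log(ratios.data)   # PMI
ratios.data -= np.log(neg)          # shifted PMI
ratios.data[ratios.data < 0] = 0    # keep only the positive part
ratios.eliminate_zeros()            # make the sparsity explicit again
print(ratios.toarray())
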
Example No. 14
def read_counts_matrix(words_path, contexts_path, counts_path):
    wi, iw = load_vocabulary(words_path)
    ci, ic = load_vocabulary(contexts_path)
    counts_num = 0
    row = []
    col = []
    data = []
    with open(counts_path) as f:
        print str(counts_num/1000**2) + "M counts processed."
        for line in f:
            if counts_num % 1000**2 == 0:
                print "\x1b[1A" + str(counts_num/1000**2) + "M counts processed."
            word, context, count = line.strip().split()
            row.append(int(word))
            col.append(int(context))
            data.append(int(float(count)))
            counts_num += 1
    counts = csr_matrix((data, (row, col)), shape=(len(wi), len(ci)), dtype=np.float32)
    return counts
Example No. 15
    def __init__(self, path, normalize=True, eig=0.0, transpose=False):
        if transpose:
            ut = np.load(path + '.vt.npy')
            self.wi, self.iw = load_vocabulary(path + '.contexts.vocab')
        else:
            ut = np.load(path + '.ut.npy')
            self.wi, self.iw = load_vocabulary(path + '.words.vocab')
        s = np.load(path + '.s.npy')
        
        if eig == 0.0:
            self.m = ut.T
        elif eig == 1.0:
            self.m = s * ut.T
        else:
            self.m = np.power(s, eig) * ut.T

        self.dim = self.m.shape[1]

        if normalize:
            self.normalize()
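
Here ut presumably holds the transposed left singular vectors of a truncated SVD, with one column per word, so ut.T has one row per word and multiplying by np.power(s, eig) rescales each of its columns by the matching singular value raised to eig (0 means no weighting, 1 means full weighting). A compact numpy illustration of that broadcasting, with made-up sizes:

import numpy as np

rng = np.random.default_rng(0)
ut = rng.normal(size=(50, 1000))    # 50 singular directions x 1000 words
s = np.sort(rng.random(50))[::-1]   # 50 singular values, descending

for eig in (0.0, 0.5, 1.0):
    m = ut.T if eig == 0.0 else np.power(s, eig) * ut.T  # scales column j by s[j]**eig
    print(eig, m.shape)             # always (1000, 50): one row per word
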
Example No. 16
    def __init__(self, path, normalize=True, eig=0.0, transpose=False):
        if transpose:
            ut = np.load(path + '.vt.npy')
            self.wi, self.iw = load_vocabulary(path + '.contexts.vocab')
        else:
            ut = np.load(path + '.ut.npy')
            self.wi, self.iw = load_vocabulary(path + '.words.vocab')
        s = np.load(path + '.s.npy')

        if eig == 0.0:
            self.m = ut.T
        elif eig == 1.0:
            self.m = s * ut.T
        else:
            self.m = np.power(s, eig) * ut.T

        self.dim = self.m.shape[1]

        if normalize:
            self.normalize()
Example No. 17
def read_counts_matrix(words_path, contexts_path, counts_path):
    wi, iw = load_vocabulary(words_path)
    ci, ic = load_vocabulary(contexts_path)
    counts_num = 0
    row = []
    col = []
    data = []
    with open(counts_path) as f:
        print str(counts_num / 1000**2) + "M counts processed."
        for line in f:
            if counts_num % 1000**2 == 0:
                print "\x1b[1A" + str(
                    counts_num / 1000**2) + "M counts processed."
            word, context, count = line.strip().split()
            row.append(int(word))
            col.append(int(context))
            data.append(int(float(count)))
            counts_num += 1
    counts = csr_matrix((data, (row, col)),
                        shape=(len(wi), len(ci)),
                        dtype=np.float32)
    return counts
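
read_counts_matrix streams word-index / context-index / count triples into three parallel lists and hands them to scipy's (data, (row, col)) constructor; duplicate coordinates are summed when the triples are compressed into CSR form. A tiny self-contained version of that construction (toy triples, same layout as the counts file):

import numpy as np
from scipy.sparse import csr_matrix

triples = [(0, 1, 3.0), (0, 1, 2.0), (1, 0, 7.0)]   # word, context, count
row, col, data = zip(*triples)
counts = csr_matrix((data, (row, col)), shape=(2, 2), dtype=np.float32)
print(counts.toarray())   # the duplicate (0, 1) entries are summed to 5.0
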
Example No. 18
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 100]
        --iters NUM                The number of iterations [default: 1]
    """)

    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    for i in range(iters):
        train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha,
                      negative, pairs_num, iters)
    save(i2w, syn0, outputs_path)
    print("word2vecf finished")
Example No. 19
 def load(cls,
          path,
          normalize=True,
          restricted_context=None,
          thresh=None,
          neg=1):
     mat = load_matrix(path, thresh)
     word_vocab, context_vocab = load_vocabulary(mat, path)
     return cls(mat,
                word_vocab,
                context_vocab,
                normalize,
                restricted_context,
                neg=neg)
Example No. 20
def main():
    args = docopt("""
    Usage:
        word2vecf.py [options] <pairs> <words> <contexts> <outputs>

    Options:
        --negative NUM             Negative sampling [default: 5]
        --size NUM                 Embedding size [default: 100]
        --iters NUM                The number of iterations [default: 1]
    """)
    
    words_path = args['<words>']
    contexts_path = args['<contexts>']
    pairs_path = args['<pairs>']
    outputs_path = args['<outputs>']

    size = int(args['--size'])
    negative = int(args['--negative'])
    iters = int(args['--iters'])

    w2i, i2w = load_vocabulary(words_path)
    c2i, i2c = load_vocabulary(contexts_path)
    words = load_count_vocabulary(words_path)
    contexts = load_count_vocabulary(contexts_path)

    pairs_num = 0
    with open(pairs_path, 'r') as f:
        for l in f:
            pairs_num += 1

    alpha = 0.025
    syn0, syn1 = init_net(size, len(words), len(contexts))
    table = UnigramTable(i2c, contexts)
    for i in range(iters):
        train_process(pairs_path, size, syn0, syn1, w2i, c2i, table, alpha,
                      negative, pairs_num, iters)
    save(i2w, syn0, outputs_path)
    print ("word2vecf finished")
Example No. 21
def main():
    args = docopt("""
    Usage:
        text2numpy.py <path>
    """)
    
    path = args['<path>']
    matrix = read_vectors(path)
    wi, iw = load_vocabulary(path + ".vocab")
    
    new_matrix = np.zeros(shape=(len(iw), len(matrix[iw[0]])), dtype=np.float32)
    for i, word in enumerate(iw):
        if word in matrix:
            new_matrix[i, :] = matrix[word]
    
    np.save(path + '.npy', new_matrix)
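
text2numpy assumes a read_vectors helper that parses a text embedding file with one "word v1 v2 ..." entry per line into a {word: vector} dict; the rows are then re-ordered to match the .vocab file and saved as a dense .npy array. A plausible sketch of such a helper (an assumption, not the project's code):

import numpy as np

def read_vectors(path):
    # Hypothetical: 'word v1 v2 ...' per line -> {word: np.ndarray}.
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.rstrip().split()
            if len(parts) < 2:
                continue  # skip blank or malformed lines
            vectors[parts[0]] = np.asarray(parts[1:], dtype=np.float32)
    return vectors
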
Example No. 22
 def load(cls,
          path,
          normalize=True,
          restricted_context=None,
          thresh=None,
          neg=1):
     # The original call, mat = load_matrix(path, thresh), raises an error here
     # because this version of load_matrix() accepts only the path, so the
     # thresh argument is dropped:
     mat = load_matrix(path)
     word_vocab, context_vocab = load_vocabulary(mat, path)
     return cls(mat,
                word_vocab,
                context_vocab,
                normalize,
                restricted_context,
                neg=neg)
Example No. 23
 def __init__(self, path, normalize=True):
     self.m = np.load(path + '.npy')
     if normalize:
         self.normalize()
     self.dim = self.m.shape[1]
     self.wi, self.iw = load_vocabulary(path + '.vocab')
Example No. 24
def main():
    # get all parameters.
    args = docopt("""
    Usage:
        pairs2counts.py [options] <pairs> <vocab_word> <vocab_context> <counts>

    Options:
        --memory_size NUM        Memory size available [default: 8.0]

    """)

    print "**********************"
    print "pairs2counts"

    wi, iw = load_vocabulary(args['<vocab_word>'])
    ci, ic = load_vocabulary(args['<vocab_context>'])
    max_product = 10000
    memory_size = float(args['--memory_size']) * 1000**3
    D = {} #store bottom-right part of co-occurrence matrix in dictionary
    tmpfile_num = 1
    memory_size_used = 0

    #store top-left corner of co-occurrence matrix in array, which is the strategy used in GloVe
    lookup = [0,]
    for i in xrange(len(iw)):
        if max_product / (i + 1) == 0:
            break
        if max_product / (i + 1) > len(iw):
            lookup.append(lookup[-1] + len(iw))
        else:
            lookup.append(lookup[-1] + max_product / (i + 1))
    M = np.zeros(lookup[-1] + 1, dtype=np.int32)

    with open(args['<pairs>']) as f:
        pairs_num = 0
        print str(pairs_num/1000**2) + "M pairs processed."
        for line in f:
            pairs_num += 1
            if pairs_num % 1000**2 == 0:
                print "\x1b[1A" + str(pairs_num/1000**2) + "M pairs processed."
            if getsizeof(D) + memory_size_used + getsizeof(M) > memory_size * 0.8: #write dictionary to disk when memory is insufficient
                with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
                    tmp_sorted = sorted(D.keys())
                    for i in tmp_sorted:
                        pickle.dump((i, D[i]), f, True)
                    D.clear()
                    memory_size_used = 0
                    tmpfile_num += 1
            pair = line.strip().split()
            word_index = wi[pair[0]]
            context_index = ci[pair[1]]
            if (word_index + 1) * (context_index + 1) <= max_product: #store top-left corner in M, which stays in memory all time
                M[lookup[word_index] + context_index] += 1  
            else: #store bottom-right part in D, which is written to disk when memory is insufficient
                if word_index in D:
                    tmp_size = getsizeof(D[word_index])
                    D[word_index].update({context_index: 1})
                    memory_size_used += getsizeof(D[word_index]) - tmp_size #estimate the size of memory used
                else:
                    D[word_index] = Counter({context_index: 1})
                    memory_size_used += getsizeof(D[word_index])
    with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
        tmp_sorted = sorted(D.keys())
        for i in tmp_sorted:
            pickle.dump((i, D[i]), f, True)
        D.clear()
        tmpfile_num += 1

    for i in xrange(len(lookup) - 1): #transform M to dictionary structure; lookup[-1] is only the end pointer, not a row of its own
        D[i] = Counter()
        for j in xrange(lookup[i+1] - lookup[i]):
            if M[lookup[i] + j] > 0:
                D[i].update({j: M[lookup[i] + j]})
    with open(args['<counts>'] + '_' + str(0), 'wb') as f: #write top-left corner to disk
        tmp_sorted = sorted(D.keys())
        for i in tmp_sorted:
            pickle.dump((i, D[i]), f, True)
        D.clear()      


    #merge tmpfiles to co-occurrence matrix
    tmpfiles = []
    top_buffer = [] #store top elements of tmpfiles
    counts_num = 0
    counts_file = open(args['<counts>'], 'w')
    for i in xrange(tmpfile_num):
        tmpfiles.append(open(args['<counts>'] + '_' + str(i), 'rb'))
        top_buffer.append(pickle.load(tmpfiles[i]))
    old = top_buffer[0]
    top_buffer[0] = pickle.load(tmpfiles[0])
    print str(counts_num/1000**2) + "M counts processed."
    while True:
        arg_min = np.argmin(np.asarray([c[0] for c in top_buffer])) #find the element with smallest key (center word)
        if top_buffer[arg_min][0] == old[0]: #merge values when keys are the same
            old[1].update(top_buffer[arg_min][1])
        else:
            tmp_sorted = sorted(old[1].keys()) #write the old element when keys are different (which means all pairs whose center words are [old.key] are aggregated)
            for w in tmp_sorted:
                counts_num += 1
                if counts_num % 1000**2 == 0:
                    print "\x1b[1A" + str(counts_num/1000**2) + "M counts processed."
                counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            old = top_buffer[arg_min]
        try:
            top_buffer[arg_min] = pickle.load(tmpfiles[arg_min])
        except EOFError: #when elements in file are exhausted
            top_buffer[arg_min] = (np.inf, Counter())
            tmpfile_num -= 1
        if tmpfile_num == 0:
            tmp_sorted = sorted(old[1].keys())
            for w in tmp_sorted:
                counts_num += 1
                counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            break
    counts_file.close()
    print "number of counts: ", counts_num
    for i in xrange(len(top_buffer)): #remove tmpfiles
        os.remove(args['<counts>'] + '_' + str(i))

    print "pairs2counts finished"
Example No. 25
def main():
    # get all parameters.
    args = docopt("""
    Usage:
        pairs2counts.py [options] <pairs> <vocab_word> <vocab_context> <counts>

    Options:
        --memory_size NUM        Memory size available [default: 8.0]

    """)

    print "**********************"
    print "pairs2counts"

    wi, iw = load_vocabulary(args['<vocab_word>'])
    ci, ic = load_vocabulary(args['<vocab_context>'])
    memory_size = float(args['--memory_size']) * 1000**3
    D = {}  #store co-occurrence matrix in dictionary
    tmpfile_num = 0
    memory_size_used = 0

    with open(args['<pairs>']) as f:
        pairs_num = 0
        print str(pairs_num / 1000**2) + "M pairs processed."
        for line in f:
            pairs_num += 1
            if pairs_num % 1000**2 == 0:
                print "\x1b[1A" + str(
                    pairs_num / 1000**2) + "M pairs processed."
            if getsizeof(D) + memory_size_used > memory_size * 0.8:  #write dictionary to disk when memory is insufficient
                with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
                    tmp_sorted = sorted(D.keys())
                    for i in tmp_sorted:
                        pickle.dump((i, D[i]), f, True)
                    D.clear()
                    memory_size_used = 0
                    tmpfile_num += 1
            pair = line.strip().split()
            word_index = wi[pair[0]]
            context_index = ci[pair[1]]
            if word_index in D:
                tmp_size = getsizeof(D[word_index])
                D[word_index].update({context_index: 1})
                memory_size_used += getsizeof(D[word_index]) - tmp_size  #estimate the size of memory used
            else:
                D[word_index] = Counter({context_index: 1})
                memory_size_used += getsizeof(D[word_index])
    with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
        tmp_sorted = sorted(D.keys())
        for i in tmp_sorted:
            pickle.dump((i, D[i]), f, True)
        D.clear()
        tmpfile_num += 1

    #merge tmpfiles to co-occurrence matrix
    tmpfiles = []
    top_buffer = []  #store top elements of tmpfiles
    counts_num = 0
    counts_file = open(args['<counts>'], 'w')
    for i in xrange(tmpfile_num):
        tmpfiles.append(open(args['<counts>'] + '_' + str(i), 'rb'))
        top_buffer.append(pickle.load(tmpfiles[i]))
    old = top_buffer[0]
    top_buffer[0] = pickle.load(tmpfiles[0])
    print str(counts_num / 1000**2) + "M counts processed."
    while True:
        arg_min = np.argmin(np.asarray([c[0] for c in top_buffer]))  #find the element with smallest key (center word)
        if top_buffer[arg_min][0] == old[0]:  #merge values when keys are the same
            old[1].update(top_buffer[arg_min][1])
        else:
            tmp_sorted = sorted(old[1].keys())  #write the old element when keys are different (which means all pairs whose center word is [old.key] are aggregated)
            for w in tmp_sorted:
                counts_num += 1
                if counts_num % 1000**2 == 0:
                    print "\x1b[1A" + str(
                        counts_num / 1000**2) + "M counts processed."
                counts_file.write(
                    str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            old = top_buffer[arg_min]
        try:
            top_buffer[arg_min] = pickle.load(tmpfiles[arg_min])
        except EOFError:  #when elements in file are exhausted
            top_buffer[arg_min] = (np.inf, Counter())
            tmpfile_num -= 1
        if tmpfile_num == 0:
            tmp_sorted = sorted(old[1].keys())
            for w in tmp_sorted:
                counts_num += 1
                counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            break
    counts_file.close()
    print "number of counts: ", counts_num
    for i in xrange(len(top_buffer)):  #remove tmpfiles
        os.remove(args['<counts>'] + '_' + str(i))

    print "pairs2counts finished"
Example No. 26
def main():
    # get all parameters.
    args = docopt("""
    Usage:
        pairs2counts.py [options] <pairs> <vocab_word> <vocab_context> <counts>

    Options:
        --memory_size NUM        Memory size available [default: 8.0]

    """)

    print "**********************"
    print "pairs2counts"

    wi, iw = load_vocabulary(args['<vocab_word>'])
    ci, ic = load_vocabulary(args['<vocab_context>'])
    max_product = 10000
    memory_size = float(args['--memory_size']) * 1000**3
    D = {} #store bottom-right part of co-occurrence matrix in dictionary
    tmpfile_num = 1
    memory_size_used = 0

    #store top-left corner of co-occurrence matrix in array, which is the strategy used in GloVe
    lookup = [0,]
    for i in xrange(len(iw)):
        if max_product / (i + 1) == 0:
            break
        if max_product / (i + 1) > len(iw):
            lookup.append(lookup[-1] + len(iw))
        else:
            lookup.append(lookup[-1] + max_product / (i + 1))
    M = np.zeros(lookup[-1] + 1, dtype=np.int32)

    with open(args['<pairs>']) as f:
        pairs_num = 0
        print str(pairs_num/1000**2) + "M pairs processed."
        for line in f:
            pairs_num += 1
            if pairs_num % 1000**2 == 0:
                print "\x1b[1A" + str(pairs_num/1000**2) + "M pairs processed."
            if getsizeof(D) + memory_size_used + getsizeof(M) > memory_size * 0.8: #write dictionary to disk when memory is insufficient
                with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
                    tmp_sorted = sorted(D.keys())
                    for i in tmp_sorted:
                        pickle.dump((i, D[i]), f, True)
                    D.clear()
                    memory_size_used = 0
                    tmpfile_num += 1
            pair = line.strip().split()
            word_index = wi[pair[0]]
            context_index = ci[pair[1]]
            if (word_index + 1) * (context_index + 1) <= max_product: #store top-left corner in M, which stays in memory all time
                M[lookup[word_index] + context_index] += 1  
            else: #store bottom-right part in D, which is written to disk when memory is insufficient
                if word_index in D:
                    tmp_size = getsizeof(D[word_index])
                    D[word_index].update({context_index: 1})
                    memory_size_used += getsizeof(D[word_index]) - tmp_size #estimate the size of memory used
                else:
                    D[word_index] = Counter({context_index: 1})
                    memory_size_used += getsizeof(D[word_index])
    with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
        tmp_sorted = sorted(D.keys())
        for i in tmp_sorted:
            pickle.dump((i, D[i]), f, True)
        D.clear()
        tmpfile_num += 1

    for i in xrange(len(lookup) - 1): #transform M to dictionary structure; lookup[-1] is only the end pointer, not a row of its own
        D[i] = Counter()
        for j in xrange(lookup[i+1] - lookup[i]):
            if M[lookup[i] + j] > 0:
                D[i].update({j: M[lookup[i] + j]})
    with open(args['<counts>'] + '_' + str(0), 'wb') as f: #write top-left corner to disk
        tmp_sorted = sorted(D.keys())
        for i in tmp_sorted:
            pickle.dump((i, D[i]), f, True)
        D.clear()      


    #merge tmpfiles to co-occurrence matrix
    tmpfiles = []
    top_buffer = [] #store top elements of tmpfiles
    counts_num = 0
    counts_file = open(args['<counts>'], 'w')
    for i in xrange(tmpfile_num):
        tmpfiles.append(open(args['<counts>'] + '_' + str(i), 'rb'))
        top_buffer.append(pickle.load(tmpfiles[i]))
    old = top_buffer[0]
    top_buffer[0] = pickle.load(tmpfiles[0])
    print str(counts_num/1000**2) + "M counts processed."
    while True:
        arg_min = np.argmin(np.asarray([c[0] for c in top_buffer])) #find the element with smallest key (center word)
        if top_buffer[arg_min][0] == old[0]: #merge values when keys are the same
            old[1].update(top_buffer[arg_min][1])
        else:
            tmp_sorted = sorted(old[1].keys()) #write the old element when keys are different (which means all pairs whose center words are [old.key] are aggregated)
            for w in tmp_sorted:
                counts_num += 1
                if counts_num % 1000**2 == 0:
                    print "\x1b[1A" + str(counts_num/1000**2) + "M counts processed."
                counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            old = top_buffer[arg_min]
        try:
            top_buffer[arg_min] = pickle.load(tmpfiles[arg_min])
        except EOFError: #when elements in file are exhausted
            top_buffer[arg_min] = (np.inf, Counter())
            tmpfile_num -= 1
        if tmpfile_num == 0:
            tmp_sorted = sorted(old[1].keys())
            for w in tmp_sorted:
                counts_num += 1
                counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            break
    counts_file.close()
    print "number of counts: ", counts_num
    for i in xrange(len(top_buffer)): #remove tmpfiles
        os.remove(args['<counts>'] + '_' + str(i))

    print "pairs2counts finished"
Example No. 27
 def __init__(self, path, normalize=True):
     self.m = np.load(path + '.npy')
     if normalize:
         self.normalize()
     self.dim = self.m.shape[1]
     self.wi, self.iw = load_vocabulary(path + '.vocab')
Example No. 28
 def load(cls, path, normalize=True, restricted_context=None, **kwargs):
     mat = load_matrix(path)
     word_vocab, context_vocab = load_vocabulary(mat, path)
     return cls(mat, word_vocab, context_vocab, normalize=normalize, restricted_context=restricted_context)
Example No. 29
 def load(cls, path, normalize=True, restricted_context=None, thresh=None, neg=1):
     mat = load_matrix(path, thresh)
     word_vocab, context_vocab = load_vocabulary(mat, path)
     return cls(mat, word_vocab, context_vocab, normalize, restricted_context, neg=neg)
Example No. 30
def main():
    # get all parameters.
    args = docopt("""
    Usage:
        pairs2counts.py [options] <pairs> <vocab_word> <vocab_context> <counts>

    Options:
        --memory_size NUM        Memory size available [default: 8.0]

    """)

    print "**********************"
    print "pairs2counts"

    wi, iw = load_vocabulary(args['<vocab_word>'])
    ci, ic = load_vocabulary(args['<vocab_context>'])
    memory_size = float(args['--memory_size']) * 1000**3
    D = {} #store co-occurrence matrix in dictionary
    tmpfile_num = 0
    memory_size_used = 0

    with open(args['<pairs>']) as f:
        pairs_num = 0
        print str(pairs_num/1000**2) + "M pairs processed."
        for line in f:
            pairs_num += 1
            if pairs_num % 1000**2 == 0:
                print "\x1b[1A" + str(pairs_num/1000**2) + "M pairs processed."
            if getsizeof(D) + memory_size_used > memory_size * 0.8: #write dictionary to disk when memory is insufficient
                with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
                    tmp_sorted = sorted(D.keys())
                    for i in tmp_sorted:
                        pickle.dump((i, D[i]), f, True)
                    D.clear()
                    memory_size_used = 0
                    tmpfile_num += 1
            pair = line.strip().split()
            word_index = wi[pair[0]]
            context_index = ci[pair[1]]
            if word_index in D:
                tmp_size = getsizeof(D[word_index])
                D[word_index].update({context_index: 1})
                memory_size_used += getsizeof(D[word_index]) - tmp_size #estimate the size of memory used
            else:
                D[word_index] = Counter({context_index: 1})
                memory_size_used += getsizeof(D[word_index])
    with open(args['<counts>'] + '_' + str(tmpfile_num), 'wb') as f:
        tmp_sorted = sorted(D.keys())
        for i in tmp_sorted:
            pickle.dump((i, D[i]), f, True)
        D.clear()
        tmpfile_num += 1   


    #merge tmpfiles to co-occurrence matrix
    tmpfiles = []
    top_buffer = [] #store top elements of tmpfiles
    counts_num = 0
    counts_file = open(args['<counts>'], 'w')
    for i in xrange(tmpfile_num):
        tmpfiles.append(open(args['<counts>'] + '_' + str(i), 'rb'))
        top_buffer.append(pickle.load(tmpfiles[i]))
    old = top_buffer[0]
    top_buffer[0] = pickle.load(tmpfiles[0])
    print str(counts_num/1000**2) + "M counts processed."
    while True:
        arg_min = np.argmin(np.asarray([c[0] for c in top_buffer])) #find the element with smallest key (center word)
        if top_buffer[arg_min][0] == old[0]: #merge values when keys are the same
            old[1].update(top_buffer[arg_min][1])
        else:
            tmp_sorted = sorted(old[1].keys()) #write the old element when keys are different (which means all pairs whose center words are [old.key] are aggregated)
            for w in tmp_sorted:
                counts_num += 1
                if counts_num % 1000**2 == 0:
                    print "\x1b[1A" + str(counts_num/1000**2) + "M counts processed."
                counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            old = top_buffer[arg_min]
        try:
            top_buffer[arg_min] = pickle.load(tmpfiles[arg_min])
        except EOFError: #when elements in file are exhausted
            top_buffer[arg_min] = (np.inf, Counter())
            tmpfile_num -= 1
        if tmpfile_num == 0:
            tmp_sorted = sorted(old[1].keys())
            for w in tmp_sorted:
                counts_num += 1
                counts_file.write(str(old[0]) + " " + str(w) + " " + str(old[1][w]) + "\n")
            break
    counts_file.close()
    print "number of counts: ", counts_num
    for i in xrange(len(top_buffer)): #remove tmpfiles
        os.remove(args['<counts>'] + '_' + str(i))

    print "pairs2counts finished"