Beispiel #1
0
class Char2Vec(Singleton):
    """Character-to-vector lookup backed by an embedding matrix on disk.

    On construction the embedding file at ``char2vec_path`` is regenerated
    with ``_gen_char2vec()`` when ``check_uptodate`` reports it is stale,
    then loaded with ``np.load``. Rows of the matrix are addressed by the
    integer ids produced by ``CharDict.char2int``.
    """

    def __init__(self):
        if not check_uptodate(char2vec_path):
            _gen_char2vec()
        self.embedding = np.load(char2vec_path)
        self.char_dict = CharDict()

    def get_embedding(self):
        """Return the full embedding matrix as loaded from disk."""
        return self.embedding

    def get_vect(self, ch):
        """Return the embedding row for the single character ``ch``."""
        # NOTE(review): if char2int returns a negative id for characters
        # missing from the dictionary, numpy indexing would wrap around and
        # silently return a row from the end of the matrix -- confirm
        # against CharDict.
        return self.embedding[self.char_dict.char2int(ch)]

    def get_vects(self, text):
        """Return the stacked embedding rows for every character in ``text``.

        The result has one row per character. For empty ``text`` an empty
        array of shape ``(0, CHAR_VEC_DIM)`` is returned instead.
        """
        if len(text) > 0:
            # np.stack needs a real sequence: on Python 3 a bare ``map``
            # object is an iterator, which NumPy rejects.
            return np.stack([self.get_vect(ch) for ch in text])
        return np.reshape(np.array([[]]), [0, CHAR_VEC_DIM])
Beispiel #2
0
def _gen_poems():
    """Build the poems file from the raw corpus files.

    For every corpus in ``_corpus_list`` (read from ``raw_dir``), the first
    line is skipped and the last whitespace-separated field of each
    remaining line is split into sentences with ``split_sentences``. A poem
    is written to ``poems_path`` (sentences joined by single spaces, one
    poem per line) only when every character of every sentence is known to
    ``CharDict`` (``char2int`` returns a non-negative id).

    Blank lines in a corpus file are ignored.
    """
    print("Parsing poems ...")
    char_dict = CharDict()
    # NOTE(review): both files are opened with the platform default
    # encoding -- confirm that matches the corpus encoding.
    with open(poems_path, 'w') as fout:
        for corpus in _corpus_list:
            with open(os.path.join(raw_dir, corpus), 'r') as fin:
                # Drop the first line, then stream the rest instead of
                # loading the whole corpus into memory.
                next(fin, None)
                for line in fin:
                    fields = line.split()
                    if not fields:
                        # A blank line has no last field to parse.
                        continue
                    sentences = split_sentences(fields[-1])
                    if all(char_dict.char2int(ch) >= 0
                           for sentence in sentences
                           for ch in sentence):
                        fout.write(' '.join(sentences) + '\n')
            print("Finished parsing %s." % corpus)
Beispiel #3
0
def process(in_path, out_path):
    """Filter an embedding text file down to short, in-dictionary words.

    The first line of ``in_path`` is read as a header whose first field is
    the declared number of entries; it is used only for progress reporting
    and is not copied to the output. Every following line is copied
    verbatim to ``out_path`` when its first field (the word) is at most
    three characters long and every character is known to ``CharDict``
    (``char2int`` returns a non-negative id). Blank lines are skipped.

    A progress line is printed each time the number of kept lines reaches
    a multiple of 80000.

    Args:
        in_path: Path of the input file to read.
        out_path: Path of the filtered file to write.
    """
    # ``with`` guarantees both handles are closed even if parsing raises.
    with open(in_path, 'r') as f_in, open(out_path, 'w') as f_out:
        header = f_in.readline().split()
        num_of_lines = int(header[0])

        char_dict = CharDict()

        count = 0
        for line in f_in:
            data = line.split()
            if not data:
                # A blank line has no word field.
                continue

            word = data[0]

            if any(char_dict.char2int(c) < 0 for c in word):
                continue
            if len(word) > 3:
                continue

            f_out.write(line)
            count += 1

            if count % 80000 == 0:
                # Guard against a header that declares zero entries.
                percent = int(count * 100 / num_of_lines) if num_of_lines else 0
                print('\r {c} / {t}     {p}%'.format(c=count,
                                                     t=num_of_lines,
                                                     p=percent),
                      end='')