def __set_corpus(self):
    """Build ``self.corpus`` from the words found in ``self.text_list``.

    Every whitespace-separated token of every non-empty line is first added
    to a scratch ``Dictionary``.  A second ``Dictionary`` is then filled with
    only those keys whose entry in the scratch dictionary's ``count`` mapping
    is greater than 10 (presumably the number of occurrences -- confirm
    against ``Dictionary.add_word``), and stored as ``self.corpus``.
    """
    raw_vocab = Dictionary()
    for document in self.text_list:
        for text_line in document:
            # Skip empty lines; they contribute no tokens.
            if not len(text_line) > 0:
                continue
            for token in text_line.split():
                raw_vocab.add_word(token)

    # Second pass: keep only the entries whose count exceeds the cut-off.
    filtered_vocab = Dictionary()
    counts = raw_vocab.count
    for token in counts:
        if counts[token] > 10:
            filtered_vocab.add_word(token)

    self.corpus = filtered_vocab
# Script fragment: obtain doc2vec models/vectors and build bag-of-words count
# matrices for the training and test sentences.
#
# NOTE(review): the original line breaks and indentation of this fragment were
# lost (all statements sit on one physical line), so the nesting of several
# statements cannot be determined from here -- confirm against the original.
#
# What the visible statements do:
#   * `size`, `epoch_num` and `make_pred` are configuration values; `make_pred`
#     is not used anywhere in this fragment.
#   * The `if False:` branch (doc2vec_train) can never run, so the models come
#     from `doc2vec_load(size)`; `get_vectors(...)` then produces
#     `train_doc2vecs` / `test_doc2vecs` (presumably after the if/else, not
#     inside the else branch -- TODO confirm).
#   * A `Dictionary` is filled with every word of `train_sentences + test`,
#     then `dictionary.refactor(1)` is called (its effect is defined elsewhere).
#   * `voc_len` is `len(dictionary) + 1`, so both count matrices have one extra
#     trailing column beyond the vocabulary.
#   * `train_vecs` / `test_vecs` hold one row per sentence; each word adds 1 to
#     the column `dictionary.word2idx[word]`.
#   * Only `train_vecs` visibly increments the extra last column
#     (`voc_len - 1`); whether that happens once per word or once per sentence
#     depends on the lost indentation -- TODO confirm.  `test_vecs` never
#     touches that column within this fragment.
size = 400 epoch_num = 30 make_pred = False if False: model_dm, model_dbow = doc2vec_train(train_labelized, test_labelized, size, epoch_num) else: model_dm, model_dbow = doc2vec_load(size) train_doc2vecs, test_doc2vecs = get_vectors(model_dm, model_dbow, train_labelized, test_labelized, size) dictionary = Dictionary() for sentence in train_sentences + test: for word in sentence: dictionary.add_word(word) dictionary.refactor(1) print('vocab size = %d' % len(dictionary)) voc_len = len(dictionary) voc_len += 1 train_vecs = np.zeros((len(train_sentences), voc_len)) test_vecs = np.zeros((len(test), voc_len)) for i in range(len(train_sentences)): sentence = train_sentences[i] for word in sentence: train_vecs[i, dictionary.word2idx[word]] += 1 train_vecs[i, voc_len - 1] += 1 for i in range(len(test)): sentence = test[i] for word in sentence: test_vecs[i, dictionary.word2idx[word]] += 1
class SememeDictionary(object):
    """Word -> senses -> sememes dictionary parsed from a HowNet-style file.

    Attributes:
        word2idx: dict mapping a word string to its integer index.
        idx2word: list mapping an index back to the word string.
        idx2freq: per-word counter, incremented by ``add_word_f``.
        idx2senses: per-word list of senses; every sense is a list of
            sememe indices into ``sememe_dict``.
        sememe_dict: ``Dictionary`` holding the sememe vocabulary.
        threshold: minimum ``idx2freq`` value a word needs to be included
            by ``sememe_word_visit`` (starts at 0).
    """

    def __init__(self, path=None):
        """Register the special tokens, then load words from ``path``.

        The file is scanned line by line with a small state machine:
        a line starting with ``NO.`` opens an entry, the following ``W_C``
        line supplies the word (``line[4:-1]``), and the following ``DEF``
        line supplies the definition, from which every run of characters
        in the range U+4E00..U+9FA5 is taken as one sememe.

        Args:
            path: file to read; defaults to ``'data/HowNet.txt'``.
        """
        if path is None:
            path = 'data/HowNet.txt'
        self.word2idx = {}
        self.idx2word = []
        self.idx2freq = []
        self.idx2senses = []
        self.sememe_dict = Dictionary()
        self.threshold = 0
        # Splitting on anything outside U+4E00..U+9FA5 leaves only the
        # sememe names contained in a DEF line.
        re_chn = re.compile(u'[^\u4e00-\u9fa5]')

        # add sememe for special tokens (order fixes their indices)
        special_tokens = [
            ('<unk>', ['<unk>']),
            ('<eos>', ['<eos>']),
            ('<N>', ['基数']),
            ('<year>', ['时间', '年', '特定']),
            ('<date>', ['时间', '月', '特定']),
            ('<hour>', ['时间', '时', '特定']),
        ]
        punctuation = ['(', '『', '……', '●', '《', '—', '———', '』', '》',
                       '△', '、', ')', '℃', '▲']
        for token, token_sememes in special_tokens:
            self.add_word(token, token_sememes)
        for mark in punctuation:
            self.add_word(mark, ['标点'])

        # phase: 0 = idle, 1 = saw 'NO.', 2 = saw 'W_C', 3 = saw 'DEF'
        phase = 0
        cur_word = ''
        # The context manager guarantees the handle is closed even if
        # parsing raises.
        with open(path) as handle:
            for line in handle:
                tag = line[0:3]
                if tag == 'NO.':
                    phase = 1
                    continue
                # new word
                if phase == 1 and tag == 'W_C':
                    phase = 2
                    word = line[4:-1]
                    if word == '':
                        phase = 0
                    else:
                        cur_word = word
                    continue
                if phase == 2 and tag == 'DEF':
                    phase = 3
                    content = line[4:-1]
                    sememe_bag = [piece for piece in re_chn.split(content)
                                  if piece != '']
                    if cur_word != '':
                        self.add_word(cur_word, sememe_bag)

        # Reset sememe frequencies; they are only counted via add_word_f.
        self.sememe_dict.idx2freq = [0] * len(self.sememe_dict)

    def senses_belong(self, sememes_bag, senses_bag):
        """Return True if ``sememes_bag`` matches one sense in ``senses_bag``.

        A sense matches when it has the same length as ``sememes_bag`` and
        the union of both has that same number of distinct elements.
        """
        bag_size = len(sememes_bag)
        return any(
            len(set(sememes_bag + sense)) == bag_size
            and bag_size == len(sense)
            for sense in senses_bag
        )

    def add_word(self, word, sememes_bag):
        """Register ``word`` (if new) and attach one sense to it.

        Args:
            word: the word string.
            sememes_bag: iterable of sememe strings forming one sense; each
                is added to ``sememe_dict`` and the resulting indices are
                de-duplicated.

        Returns:
            The integer index of ``word``.
        """
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.idx2senses.append([])
            self.idx2freq.append(0)
            self.word2idx[word] = len(self.idx2word) - 1
        idx = self.word2idx[word]
        sememe_bag_idx = list(
            set([self.sememe_dict.add_word(sememe) for sememe in sememes_bag])
        )
        # Do not store the same sense twice for one word.
        if not self.senses_belong(sememe_bag_idx, self.idx2senses[idx]):
            self.idx2senses[idx].append(sememe_bag_idx)
        return idx

    def __len__(self):
        """Return the number of registered words."""
        return len(self.idx2word)

    def summary(self, print_sememes=False):
        """Print counts and mean senses/sememes statistics.

        Ratios are reported as 0.0 when their denominator is zero instead
        of raising ``ZeroDivisionError``.

        Args:
            print_sememes: when true, also print all sememe names joined
                by commas.
        """
        print('=' * 69)
        print('-' * 31 + 'SUMMARY' + '-' * 31)
        print('Number of Sememes: {}'.format(len(self.sememe_dict)))
        print('Number of Words: {}'.format(len(self.idx2word)))
        num_words = len(self.idx2word)
        tot_senses = 0
        tot_sememes = 0
        for idx in range(num_words):
            senses = self.idx2senses[idx]
            tot_senses += len(senses)
            tot_sememes += sum(len(sense) for sense in senses)
        ws_ratio = (tot_senses + 0.0) / num_words if num_words else 0.0
        ss_ratio = (tot_sememes + 0.0) / tot_senses if tot_senses else 0.0
        print('Mean Senses per Word: {}'.format(ws_ratio))
        print('Mean Sememes per Sense: {}'.format(ss_ratio))
        print('=' * 69)
        if print_sememes:
            print(','.join(self.sememe_dict.idx2word))

    def exist(self, word):
        """Return True if ``word`` is registered."""
        return word in self.word2idx

    def add_word_f(self, word):
        """Count one occurrence of ``word``.

        Increments the word's own frequency and the frequency of every
        sememe in every sense of the word.

        Raises:
            ValueError: if ``word`` is not registered.
        """
        if word not in self.word2idx:
            raise ValueError("Word don't exist")
        idx = self.word2idx[word]
        sememe_freq = self.sememe_dict.idx2freq
        for sense in self.idx2senses[idx]:
            for sememe in sense:
                sememe_freq[sememe] += 1
        self.idx2freq[idx] += 1

    def query_count(self, word):
        """Return the frequency recorded for ``word``.

        Raises:
            ValueError: if ``word`` is not registered.
        """
        if word not in self.word2idx:
            raise ValueError("Word don't exist")
        return self.idx2freq[self.word2idx[word]]

    def freq_le(self, k):
        """Return how many words have a frequency strictly below ``k``.

        Despite the name, the comparison is ``<``, which makes this the
        exact complement of ``freq_ge``.
        """
        return sum(1 for idx in range(len(self.idx2word))
                   if self.idx2freq[idx] < k)

    def freq_ge(self, k):
        """Return how many words have a frequency of at least ``k``."""
        return sum(1 for idx in range(len(self.idx2word))
                   if self.idx2freq[idx] >= k)

    def set_threshold(self, threshold):
        """Set the minimum frequency used by ``sememe_word_visit``."""
        self.threshold = threshold

    def sememe_word_visit(self, word_dict):
        """Build sememe/word/sense index tables and print statistics.

        Only words whose frequency is at least ``self.threshold`` are
        considered.  Senses are numbered globally in visiting order, and
        sememes that have at least one such word are renumbered compactly.

        Args:
            word_dict: mapping from word string to an integer index; it must
                support ``len()`` and item lookup, and the indices must be
                valid positions in a list of ``len(word_dict)`` entries.

        Returns:
            A 4-tuple ``(sememe_word_pair, sememe_idx, sememe_sense_pair,
            word_sense)``:

            * ``sememe_word_pair``: two parallel lists, compact sememe id
              and ``word_dict`` index, one entry per (sememe, sense) hit.
            * ``sememe_idx``: for each sememe of ``sememe_dict``, its
              compact id, or -1 when no word uses it.
            * ``sememe_sense_pair``: two parallel lists, compact sememe id
              and global sense id.
            * ``word_sense``: for each ``word_dict`` index, the
              de-duplicated list of its global sense ids.
        """
        num_sememes = len(self.sememe_dict)
        sememe_word = [[] for _ in range(num_sememes)]
        sememe_sense = [[] for _ in range(num_sememes)]
        maximum_senses = 0
        tot_senses = 0
        for word_id in range(len(self.word2idx)):
            if self.idx2freq[word_id] < self.threshold:
                continue
            senses = self.idx2senses[word_id]
            maximum_senses = max(maximum_senses, len(senses))
            for sense in senses:
                for sememe in sense:
                    sememe_word[sememe].append(word_id)
                    sememe_sense[sememe].append(tot_senses)
                # One global id per sense, shared by all of its sememes.
                tot_senses += 1

        tot = 0
        tot_sememes = 0
        max_words = 0
        seen_words = set()
        sememe_word_pair = [[], []]
        sememe_sense_pair = [[], []]
        sememe_idx = []
        word_sense = [[] for _ in range(len(word_dict))]
        for i in range(num_sememes):
            word_ids = sememe_word[i]
            for word_id, sense_id in zip(word_ids, sememe_sense[i]):
                target = word_dict[self.idx2word[word_id]]
                sememe_word_pair[0].append(tot_sememes)
                sememe_word_pair[1].append(target)
                sememe_sense_pair[0].append(tot_sememes)
                sememe_sense_pair[1].append(sense_id)
                word_sense[target].append(sense_id)
            tot += len(word_ids)
            max_words = max(max_words, len(word_ids))
            seen_words.update(word_ids)
            if word_ids:
                # Sememe is in use: give it the next compact id.
                sememe_idx.append(tot_sememes)
                tot_sememes += 1
            else:
                sememe_idx.append(-1)
        for i in range(len(word_dict)):
            word_sense[i] = list(set(word_sense[i]))
        print('Total words: {}'.format(len(seen_words)))
        print('Maximum words per sememe: {}'.format(max_words))
        print('Maximum sense per word: {}'.format(maximum_senses))
        print('Total respective semems: {}'.format(tot_sememes))
        print('Total sememe-word pairs: {}'.format(tot))
        return sememe_word_pair, sememe_idx, sememe_sense_pair, word_sense

    def visit(self, word, mode='full'):
        """Print the sememes of ``word``.

        Args:
            word: a registered word.
            mode: ``'sbag'`` prints the distinct sememes over all senses on
                one line; ``'full'`` prints one line per sense.  Any other
                value prints nothing.

        Raises:
            ValueError: if ``word`` is not registered.
        """
        if word not in self.word2idx:
            raise ValueError('No word!')
        senses = self.idx2senses[self.word2idx[word]]
        sememe_names = self.sememe_dict.idx2word
        if mode == 'sbag':
            distinct = set([sememe for sense in senses for sememe in sense])
            sememes_str = [sememe_names[sememe] for sememe in distinct]
            print(word + ':' + ','.join(sememes_str))
        if mode == 'full':
            print('Word: ' + word + ', total {} means'.
                  format(len(senses)))
            for number, sense in enumerate(senses, 1):
                sememes = ','.join([sememe_names[sememe] for sememe in sense])
                print('Sense #{}: '.format(number) + sememes)