def __init__(self):
    print('load original w2v...')
    w2v = load_w2v(config.originalw2v_path)
    id2word = {id_: key for id_, key in enumerate(w2v.keys())}
    word2id = dict_reverse(id2word)
    vectors = list(w2v.values())
    old_len = len(word2id)
    print('old number of words =', old_len)
    print('load original w2v finished')

    print('load synonym words...')
    synonym_words = get_words(config.synonym_path, w2v.keys())
    print('load synonym words finished')
    # Overlap between the synonym list and the existing vocabulary.
    print('synonym words in Kb =',
          len(set(synonym_words) & set(word2id.keys())))

    print('update w2v ...')
    # synonym_words = list(synonym_words)[:50000]
    for word in set(synonym_words) - set(word2id.keys()):
        id_ = len(id2word)
        id2word[id_] = word
        word2id[word] = id_
    # Initialize rows for the new words uniformly in [-0.1, 0.1];
    # alternatively: vectors.append(np.zeros((config.vec_len,), dtype=np.float64))
    append_vectors = np.random.uniform(
        -0.1, 0.1, (len(word2id) - old_len, config.vec_len))
    # Store as float16 to save memory.
    vectors = np.concatenate(
        [np.array(vectors, dtype=np.float16),
         np.array(append_vectors, dtype=np.float16)], axis=0)
    # alpha flags pre-trained rows (1) vs. newly added rows (0).
    alpha = old_len * [[1]] + (len(word2id) - old_len) * [[0]]
    self.word2id = word2id
    self.id2word = id2word
    print('new number of words =', len(word2id))

    print('build graph...')
    with tf.device('/cpu:0'):
        self.build_graph(vectors, alpha)
    print('build graph finished')
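# A minimal, self-contained sketch of the vocabulary-extension idea used in
# __init__ above, with a toy pre-trained map; `extend_vocab` and its argument
# names are hypothetical, not part of the project.
import numpy as np

def extend_vocab(w2v, new_words, vec_len):
    """Append unseen words with small random vectors; return maps and matrix."""
    id2word = {i: w for i, w in enumerate(w2v)}
    word2id = {w: i for i, w in id2word.items()}
    vectors = [np.asarray(w2v[w], dtype=np.float32) for w in id2word.values()]
    for word in set(new_words) - set(word2id):
        idx = len(id2word)
        id2word[idx] = word
        word2id[word] = idx
        vectors.append(np.random.uniform(-0.1, 0.1, vec_len).astype(np.float32))
    return word2id, id2word, np.stack(vectors)

# Usage: two pre-trained words; the new synonym gets a random row.
w2id, id2w, mat = extend_vocab(
    {'cat': [0.1, 0.2], 'dog': [0.3, 0.4]}, ['cat', 'kitten'], vec_len=2)
assert mat.shape == (3, 2) and w2id['kitten'] == 2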
def predict_file(self, file):
    words = du.get_words(file)
    ngrams = list()
    for model in self.models:
        model_ngrams = du.generate_n_grams(words, model[0])
        ngrams += model_ngrams

    # Initialize scores to the class priors (log probabilities).
    scores = self.priors.copy()
    for ngram in ngrams:
        if self.smoothing > 0:
            for category in scores.keys():
                # Unseen n-grams fall back to the add-k smoothed estimate.
                scores[category] += self.word_freq[category].get(
                    ngram,
                    np.log(self.smoothing) - np.log(
                        self.total_word_counts[category] +
                        self.smoothing * self.vocab_size))
        else:
            # Without smoothing, skip n-grams missing from any category.
            in_all = True
            for cat_freq in self.word_freq.values():
                if ngram not in cat_freq.keys():
                    in_all = False
            if in_all:
                for category in scores.keys():
                    scores[category] += self.word_freq[category][ngram]
    return 1 if scores['pos'] > scores['neg'] else 0
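# A worked sketch of the add-k fallback in predict_file: an n-gram unseen in
# a category contributes log(k) - log(total_count + k * vocab_size) to that
# category's log score. The counts below are hypothetical.
import numpy as np

k, vocab_size = 1.0, 10000
total_counts = {'pos': 50000, 'neg': 40000}
unseen = {c: np.log(k) - np.log(t + k * vocab_size)
          for c, t in total_counts.items()}
print(unseen)  # the class with fewer total words penalizes unseen n-grams less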
def test_get_words(self):
    file = "This isn't a random, test file!"
    words = du.get_words(file)
    self.assertEqual(
        words,
        ['this', 'is', "n't", 'a', 'random', ',', 'test', 'file', '!'])
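# A stand-in tokenizer that reproduces the behavior the test expects; this is
# NOT du.get_words itself, just one plausible implementation (lowercase, split
# contractions like "isn't" into "is" + "n't", keep punctuation as tokens).
import re

def toy_get_words(text):
    text = text.lower().replace("n't", " n't")
    return re.findall(r"n't|[a-z]+|[^\w\s]", text)

assert toy_get_words("This isn't a random, test file!") == \
    ['this', 'is', "n't", 'a', 'random', ',', 'test', 'file', '!']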
def load_data(self, datafile):
    dataset = pd.read_csv(datafile)
    if self.debug:
        dataset = dataset.iloc[:3000]
    text = 'comment_text'
    self.X = dataset[text].values
    labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]
    # labels = ['severe_toxic']
    assert len(labels) == self.config.label_size
    self.y = dataset[labels].values
    self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
        self.X, self.y, test_size=0.1, random_state=124)

    ## Build the vocabulary using the train data.
    self.vocab = Vocab()
    train_sents = [get_words(line) for line in self.X_train]
    self.vocab.construct(list(itertools.chain.from_iterable(train_sents)),
                         threshold=self.config.min_word_freq)
    print('Training on {} samples and validating on {} samples'.format(
        len(self.X_train), len(self.X_val)))
    print()

    self.embedding_matrix = np.random.uniform(
        -0.005, 0.005,
        size=[len(self.vocab), self.config.embed_size]).astype('float32')
    with tf.variable_scope("Embeddings") as scope:
        embedding = tf.get_variable("Embeds",
                                    initializer=self.embedding_matrix,
                                    dtype=tf.float32)
    if self.debug:
        return

    ## Populate embedding matrix from pre-trained word embeddings.
    pretrained_index = {}
    with open('./WordVectors/crawl-300d-2M.vec') as fh:
        for line in fh:
            word_vec = line.strip().split()
            if len(word_vec) != self.config.embed_size + 1:
                continue  # skip the fastText count/dim header and malformed rows
            word = word_vec[0]
            vector = np.asarray(word_vec[1:], dtype='float32')
            pretrained_index[word] = vector
    pw = 0.0
    for word, idx in self.vocab.word_to_idx.items():
        pretrained_vector = pretrained_index.get(word)
        if pretrained_vector is not None:
            self.embedding_matrix[idx] = pretrained_vector
            pw += 1
    print("Found pretrained vectors for {:.2f}% of data".format(
        pw / len(self.vocab) * 100))
    del pretrained_index  ## Done only for memory constraint. Don't do this!!
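# A self-contained sketch of the pretrained-lookup loop in load_data, with a
# tiny in-memory index standing in for the 2M-word fastText file; all names
# here are illustrative.
import numpy as np

vocab = {'the': 0, 'cat': 1, 'zzyzx': 2}
pretrained = {'the': np.ones(4, 'float32'), 'cat': np.full(4, 2.0, 'float32')}
emb = np.random.uniform(-0.005, 0.005, (len(vocab), 4)).astype('float32')
hits = 0
for word, idx in vocab.items():
    vec = pretrained.get(word)
    if vec is not None:
        emb[idx] = vec
        hits += 1
print("Found pretrained vectors for {:.2f}% of data".format(
    hits / len(vocab) * 100))  # 66.67% here: 'zzyzx' keeps its random row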
def count_word_occ(self, file_list):
    occurence_dict = dict()
    for model in self.models:
        tmp_occ_dict = dict()
        for file in file_list:
            words = du.get_words(file)
            ngrams = du.generate_n_grams(words, model[0])
            for ngram in ngrams:
                tmp_occ_dict[ngram] = tmp_occ_dict.get(ngram, 0) + 1
        # Cutoff: keep only n-grams seen at least model[1] times for this model.
        cutoff = model[1]
        for ngram, count in tmp_occ_dict.items():
            if count >= cutoff:
                occurence_dict[ngram] = occurence_dict.get(ngram, 0) + count
    return occurence_dict
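# A toy run of the per-model counting-with-cutoff logic in count_word_occ;
# `toy_ngrams` is a stand-in for du.generate_n_grams, assumed to return
# n-grams as tuples.
from collections import Counter

def toy_ngrams(words, n):
    return list(zip(*(words[i:] for i in range(n))))

words = ['good', 'movie', 'good', 'movie', 'bad', 'plot']
counts = Counter(toy_ngrams(words, 2))
cutoff = 2
kept = {ng: c for ng, c in counts.items() if c >= cutoff}
print(kept)  # only ('good', 'movie') occurs >= 2 times and survives the cutoff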
def prepare_dataset(sentences, char_to_id, word_to_id, tag_to_id,
                    lower=True, train=True):
    """
    Prepare the dataset. Return a list with one entry per sentence, each a
    list of features:
        - string (characters), char indexes, word indexes, seg features,
          POS features, tag indexes
    """
    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        pos_list = get_pos_list()
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>']
            for w in string
        ]
        words = [w[0][0] for w in s]
        sen = "".join(words)
        words = get_words(sen, word_to_id)
        segs, pos = get_seg_pos_features(sen, pos_list)
        if train:
            tags = [tag_to_id.get(w[-1]) for w in s]
        else:
            # At inference time, pad with the "O" tag.
            tags = [none_index for _ in chars]
        data.append([string, chars, words, segs, pos, tags])
    return data
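# Minimal usage sketch for prepare_dataset on one toy sentence of
# [char, tag] pairs; the id maps are illustrative, and get_pos_list /
# get_words / get_seg_pos_features must come from the project's data
# utilities, so this runs only with those importable.
sentences = [[['北', 'B-LOC'], ['京', 'I-LOC']]]
char_to_id = {'<UNK>': 0, '北': 1, '京': 2}
word_to_id = {'<UNK>': 0, '北京': 1}
tag_to_id = {'O': 0, 'B-LOC': 1, 'I-LOC': 2}
data = prepare_dataset(sentences, char_to_id, word_to_id, tag_to_id)
string, chars, words, segs, pos, tags = data[0]  # tags == [1, 2]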