Example 1
    def __init__(self):
        print('load original w2v...')
        w2v = load_w2v(config.originalw2v_path)
        id2word = {id_: key for id_, key in enumerate(w2v.keys())}
        word2id = dict_reverse(id2word)
        vectors = list(w2v.values())
        old_len = len(word2id)
        print('old number of words =', old_len)

        print('load original w2v finished')

        print('load synonym words...')
        synonym_words = get_words(config.synonym_path, w2v.keys())
        print('load synonym words finished')
        # Count how many synonym words are already in the original vocabulary.
        print('synonym words in Kb =', len(set(synonym_words) & set(word2id.keys())))

        print('update w2v ...')
        #synonym_words = list(synonym_words)[:50000]
        # Assign new ids to synonym words missing from the original vocabulary.
        for word in set(synonym_words) - set(word2id.keys()):
            id_ = len(id2word)
            id2word[id_] = word
            word2id[word] = id_
            #vectors.append(np.zeros((config.vec_len), dtype=np.float64))
        # Randomly initialize vectors for the new words and append them.
        append_vectors = np.random.uniform(-0.1, 0.1, (len(word2id) - old_len, config.vec_len))
        vectors = np.concatenate([np.array(vectors, dtype=np.float16),
                                  np.array(append_vectors, dtype=np.float16)], axis=0)
        # alpha marks pretrained rows with 1 and newly initialized rows with 0.
        alpha = old_len * [[1]] + (len(word2id) - old_len) * [[0]]
        self.word2id = word2id
        self.id2word = id2word
        print('new number of words =', len(word2id))

        print('build graph...')
        with tf.device('/cpu:0'):
            self.build_graph(vectors, alpha)
        print('build graph finished')
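The alpha list above marks rows that came from the pretrained word2vec file with 1 and the randomly initialized synonym rows with 0. Since build_graph is not shown, the following is only a minimal sketch of one common way such a mask is used: blending a frozen copy of the embedding matrix with a trainable one so that only the new rows are effectively updated. The function name blended_embeddings is illustrative.

import numpy as np
import tensorflow as tf

# Hypothetical sketch: alpha is 1 for pretrained rows and 0 for newly added rows.
def blended_embeddings(vectors, alpha):
    vectors = np.asarray(vectors, dtype=np.float32)
    mask = tf.constant(np.asarray(alpha, dtype=np.float32))  # shape (vocab_size, 1)
    fixed = tf.constant(vectors)      # pretrained rows, never updated
    trainable = tf.Variable(vectors)  # copy whose new rows can be learned
    # Rows with mask 1 always come from the frozen copy, so gradients only
    # reach the trainable copy where mask is 0.
    return mask * fixed + (1.0 - mask) * trainable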
Example 2
    def predict_file(self, file):
        words = du.get_words(file)
        ngrams = list()
        for model in self.models:
            model_ngrams = du.generate_n_grams(words, model[0])
            ngrams += model_ngrams
        # Initialize scores to the class priors.
        scores = self.priors.copy()
        for ngram in ngrams:
            if self.smoothing > 0:
                for category in scores.keys():
                    # Unseen n-grams fall back to the add-k smoothed estimate
                    # log(k / (total_count + k * vocab_size)).
                    default = np.log(self.smoothing) - np.log(
                        self.total_word_counts[category] + self.smoothing * self.vocab_size)
                    scores[category] += self.word_freq[category].get(ngram, default)
            else:
                # Skip if not in all categories
                in_all = True
                for cat_freq in self.word_freq.values():
                    if ngram not in cat_freq:
                        in_all = False
                        break
                if in_all:
                    for category in scores.keys():
                        scores[category] += self.word_freq[category][ngram]

        return 1 if scores['pos'] > scores['neg'] else 0
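When smoothing is enabled, an n-gram that never occurred in a category contributes the add-k smoothed default log(k) - log(N_c + k * |V|), i.e. the log of k / (N_c + k * |V|). A toy calculation with made-up numbers (not taken from the class above):

import numpy as np

k = 1.0       # smoothing constant (assumed)
N_c = 10000   # total n-gram count in category c (assumed)
V = 5000      # vocabulary size (assumed)
unseen_log_prob = np.log(k) - np.log(N_c + k * V)
print(unseen_log_prob)  # log(1 / 15000) ≈ -9.62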
Example 3
    def test_get_words(self):
        file = "This isn't a random, test file!"
        words = du.get_words(file)

        self.assertEqual(
            words,
            ['this', 'is', "n't", 'a', 'random', ',', 'test', 'file', '!'])
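The expected output implies that du.get_words lowercases the text, splits contractions Penn-Treebank style ("isn't" becomes "is" + "n't"), and keeps punctuation as separate tokens. Its implementation is not shown; below is a minimal sketch that satisfies this test, assuming NLTK's word_tokenize is the underlying tokenizer.

# Requires the NLTK 'punkt' data: nltk.download('punkt')
from nltk.tokenize import word_tokenize

def get_words(text):
    return [token.lower() for token in word_tokenize(text)]

print(get_words("This isn't a random, test file!"))
# ['this', 'is', "n't", 'a', 'random', ',', 'test', 'file', '!']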
Example 4
    def load_data(self, datafile):

        dataset = pd.read_csv(datafile)
        if self.debug:
            dataset = dataset.iloc[:3000]

        text = 'comment_text'
        self.X = dataset[text].values

        labels = [
            'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
            'identity_hate'
        ]
        # labels = ['severe_toxic']
        assert len(labels) == self.config.label_size
        self.y = dataset[labels].values
        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, test_size=0.1, random_state=124)

        ## Build the vocabulary using the train data.
        self.vocab = Vocab()
        train_sents = [get_words(line) for line in self.X_train]
        self.vocab.construct(list(itertools.chain.from_iterable(train_sents)),
                             threshold=self.config.min_word_freq)
        print('Training on {} samples and validating on {} samples'.format(
            len(self.X_train), len(self.X_val)))
        print()

        self.embedding_matrix = np.random.uniform(
            -0.005, 0.005, size=[len(self.vocab),
                                 self.config.embed_size]).astype('float32')
        with tf.variable_scope("Embeddings") as scope:
            embedding = tf.get_variable("Embeds",
                                        initializer=self.embedding_matrix,
                                        dtype=tf.float32)

        if self.debug:
            return

        ## Populate embedding matrix from pre-trained word embeddings
        pretrained_index = {}
        with open('./WordVectors/crawl-300d-2M.vec') as fh:
            for line in fh:
                word_vec = line.strip().split()
                word = word_vec[0]
                vector = np.asarray(word_vec[1:], dtype='float32')
                pretrained_index[word] = vector

        pw = 0  # number of vocabulary words covered by pretrained vectors

        for word, idx in self.vocab.word_to_idx.items():
            pretrained_vector = pretrained_index.get(word)
            if pretrained_vector is not None:
                self.embedding_matrix[idx] = pretrained_vector
                pw += 1

        print("Found pretrained vectors for {:.2f}% of data".format(
            pw / len(self.vocab) * 100))
        del pretrained_index  ## Done only for memory constraint. Don't do this!!
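The pretrained index holds all 2 million fastText crawl vectors before the vocabulary lookup, which is why it is deleted immediately afterwards. A hedged alternative that avoids the memory spike is to stream the .vec file and keep only in-vocabulary rows; the function below is illustrative and assumes the same word-to-index mapping and embedding matrix as above.

import numpy as np

def fill_from_pretrained(vec_path, word_to_idx, embedding_matrix):
    found = 0
    with open(vec_path) as fh:
        for line in fh:
            parts = line.rstrip().split(' ')
            idx = word_to_idx.get(parts[0])
            # Skip the header line and any malformed rows.
            if idx is not None and len(parts) == embedding_matrix.shape[1] + 1:
                embedding_matrix[idx] = np.asarray(parts[1:], dtype='float32')
                found += 1
    return found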
Example 5
    def count_word_occ(self, file_list):
        occurrence_dict = dict()
        # Each model is an (n-gram order, count cutoff) pair.
        for model in self.models:
            tmp_occ_dict = dict()
            for file in file_list:
                words = du.get_words(file)
                ngrams = du.generate_n_grams(words, model[0])
                for ngram in ngrams:
                    tmp_occ_dict[ngram] = tmp_occ_dict.get(ngram, 0) + 1

            # Cutoff
            cutoff = model[1]
            for ngram, count in tmp_occ_dict.items():
                if count >= cutoff:
                    occurrence_dict[ngram] = occurrence_dict.get(ngram, 0) + count
        return occurrence_dict
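Only n-grams whose count reaches a model's cutoff are merged into the shared dictionary. A toy run of just the cutoff step, with made-up counts:

tmp_occ_dict = {('good',): 5, ('bad',): 1, ('not', 'bad'): 3}
cutoff = 2
occurrence_dict = {}
for ngram, count in tmp_occ_dict.items():
    if count >= cutoff:
        occurrence_dict[ngram] = occurrence_dict.get(ngram, 0) + count
print(occurrence_dict)  # {('good',): 5, ('not', 'bad'): 3}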
Example 6
def prepare_dataset(sentences,
                    char_to_id,
                    word_to_id,
                    tag_to_id,
                    lower=True,
                    train=True):
    """
    Prepare the dataset. Return a list of lists of dictionaries containing:
        - word indexes
        - word char indexes
        - tag indexes
    """

    none_index = tag_to_id["O"]

    def f(x):
        return x.lower() if lower else x

    data = []
    for s in sentences:
        pos_list = get_pos_list()
        string = [w[0] for w in s]
        chars = [
            char_to_id[f(w) if f(w) in char_to_id else '<UNK>'] for w in string
        ]
        # Re-join the characters into the raw sentence, then derive word indexes
        # and segmentation/POS features from it.
        sentence_chars = [w[0][0] for w in s]
        sen = "".join(sentence_chars)
        words = get_words(sen, word_to_id)
        segs, pos = get_seg_pos_features(sen, pos_list)
        if train:
            tags = [tag_to_id.get(w[-1]) for w in s]
        else:
            tags = [none_index for _ in chars]
        data.append([string, chars, words, segs, pos, tags])
    return data
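Each entry appended to data is a per-sentence record [string, chars, words, segs, pos, tags]. The character lookup falls back to the '<UNK>' id for characters missing from char_to_id; a toy illustration with made-up mappings:

char_to_id = {'<UNK>': 0, '中': 1, '国': 2}
string = ['中', '国', '人']
chars = [char_to_id[c if c in char_to_id else '<UNK>'] for c in string]
print(chars)  # [1, 2, 0]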