Code example #1
import numpy as np
import gensim
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from bert_embedding import BertEmbedding


class WordEmbedding:
    def __init__(self, prep_obj, method='w2v_tfidf'):
        self.vector_corpus = []

        if method == 'w2v_tfidf':
            self.word_to_vec(prep_obj, use_tf_idf=True)
        elif method == 'w2v':
            self.word_to_vec(prep_obj, use_tf_idf=False)
        elif method == 'cv':
            self.count_vectorizer(prep_obj)
        elif method == 'tfidf':
            self.tf_idf(prep_obj)
        elif method == 'glove':
            self.glove(prep_obj)
        elif method == 'bert':
            self.bert(prep_obj)

    def bert(self, prep_obj):
        bert_embedding = BertEmbedding(
            model='bert_12_768_12', dataset_name='book_corpus_wiki_en_cased')
        # bert_embedding returns one (tokens, token_vectors) pair per sentence
        result = bert_embedding(prep_obj.detokenized_corpus)

        for sentence in result:
            # average the 768-dimensional token vectors of each sentence;
            # an empty sentence yields the zero vector
            vec = np.zeros(768)
            for word_vec in sentence[1]:
                vec = np.add(vec, np.array(word_vec))
            vec = np.true_divide(
                vec, 1 if len(sentence[1]) == 0 else len(sentence[1]))
            self.vector_corpus.append(vec)

    def count_vectorizer(self, prep_obj):
        self.vector_corpus = CountVectorizer().fit_transform(
            prep_obj.detokenized_corpus)

    def tf_idf(self, prep_obj):
        # tokens are whitespace-separated, so match runs of non-whitespace
        vectorizer = TfidfVectorizer(lowercase=False,
                                     analyzer='word',
                                     token_pattern=r"\S+")
        vec = vectorizer.fit_transform(
            prep_obj.detokenized_corpus).todense().tolist()
        feature_names = list(vectorizer.get_feature_names_out())
        # keep, for every tweet, the tf-idf weight of each of its tokens
        self.vector_corpus = [[
            vec[index][feature_names.index(word)] for word in tweet
        ] for index, tweet in enumerate(prep_obj.tokenized_corpus)]

    def word_to_vec(self, prep_obj, use_tf_idf=False):
        # pre-compute per-token tf-idf weights, then rebuild vector_corpus
        self.tf_idf(prep_obj)
        tf_idf_vector = self.vector_corpus
        self.vector_corpus = []

        features = 100
        model = gensim.models.Word2Vec(prep_obj.tokenized_corpus,
                                       min_count=1,
                                       workers=4,
                                       vector_size=features,  # `size=` before gensim 4.0
                                       window=5,
                                       sg=0)
        for row, tweet in enumerate(prep_obj.tokenized_corpus):
            # average the word vectors of each tweet, optionally tf-idf weighted;
            # an empty tweet yields the zero vector
            vec = np.zeros(features)
            for column, word in enumerate(tweet):
                mul = tf_idf_vector[row][column] if use_tf_idf else 1
                vec = np.add(vec, np.array(model.wv[word]) * mul)
            vec = np.true_divide(vec, 1 if len(tweet) == 0 else len(tweet))
            self.vector_corpus.append(vec)

    def glove(self, prep_obj):
        # load pre-trained GloVe Twitter vectors into a word -> vector dict
        embeddings_dict = {}
        features = 100
        with open("glove.twitter.27B." + str(features) + "d.txt",
                  'r',
                  encoding="utf-8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector

        # if a word is not in the dictionary, its vector is taken as zero
        self.vector_corpus = []
        for tweet in prep_obj.tokenized_corpus:
            vec = np.zeros(features)
            for word in tweet:
                try:
                    vec = np.add(vec, np.array(embeddings_dict[word]))
                except KeyError:
                    pass
            vec = np.true_divide(vec, 1 if len(tweet) == 0 else len(tweet))
            self.vector_corpus.append(vec)
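
For reference, a minimal usage sketch: SimplePrep below is a hypothetical stand-in for the preprocessing object, since WordEmbedding only reads its tokenized_corpus and detokenized_corpus attributes.

class SimplePrep:
    def __init__(self, texts):
        self.detokenized_corpus = texts                     # one string per tweet
        self.tokenized_corpus = [t.split() for t in texts]  # one token list per tweet


prep = SimplePrep(["good morning world", "heavy traffic again today"])
embedding = WordEmbedding(prep, method='w2v_tfidf')
print(len(embedding.vector_corpus))  # one averaged vector per tweet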
Code example #2
from collections import Counter
from itertools import chain

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


def prepare_sequences(k, k2):
    # run_optimize() (and the commented-out run_test()) are assumed to be
    # defined elsewhere in the project
    dataframe = pd.read_csv('./Data/address.txt', sep="\t", header=None)
    data = dataframe.values
    lst_labels_true = data[:, 2]
    sequences = data[:, 1]
    list_words = []
    max_review_length = 0
    bad_rows = []  # rows whose sequence is not a string
    for i, item in enumerate(sequences):
        if isinstance(item, str):
            tmp = CountVectorizer().build_tokenizer()(item)
            tmplen = len(tmp)
            if tmplen > max_review_length:
                max_review_length = tmplen
            list_words.append(tmp)
        else:
            bad_rows.append(i)
    # drop the labels of the skipped rows so labels stay aligned with list_words
    lst_labels_true = np.delete(lst_labels_true, bad_rows, 0)

    totals = Counter(chain.from_iterable(list_words))

    word_index = {}
    i = 1
    for item in totals.keys():
        if totals[item] > k2:
            word_index[item] = i
            i += 1

    # map each word to its index (0 for rare or most-frequent words) and
    # zero-pad every sequence to max_review_length
    max_count = max(totals.values())
    text = []
    for row in list_words:
        tmp = []
        for word in row:
            if k2 < totals[word] < max_count:
                tmp.append(word_index[word])
            else:
                tmp.append(0)
        while len(tmp) < max_review_length:
            tmp.append(0)
        text.append(tmp)

    totals = Counter(chain.from_iterable(text))
    top_words = len(totals.keys())

    # len_data_set = len(text)
    # items = [i for i in range(k)] * int(math.ceil(len_data_set / float(k)))
    # items = items[:len_data_set]
    # random.shuffle(items)
    #
    # str_file_new = './Data/cross_validation.txt'
    # the_file = open(str_file_new, 'w')
    # for element in items:
    #     the_file.write("%s\n" % element)
    # the_file.close()

    items = []
    with open('./Data/cross_validation.txt', 'r') as f:
        for line in f:
            tmp = line.strip('\n')
            items.append(int(tmp))

    class1 = 0
    class2 = 0
    for item in lst_labels_true:
        if item == 0:
            class1 += 1
        else:
            class2 += 1

    embedding_arr = range(10, 30, 5)
    dropout_arr1 = [i / 10 for i in range(0, 10, 2)]
    lstm_hidden = range(500, 1301, 50)
    dropout_arr2 = [i / 10 for i in range(0, 10, 2)]
    epoch = 35
    batch = 64

    n = k
    results_cv = np.zeros([n, 1])
    parameters = np.array([[800, 0.1, 750, 0.1, epoch, batch]])
    # parameters = np.load('./Data/Results_RNN/lstm2_parameters_cv0.npy')
    for cv in range(n):
        for i in range(2):
            results = np.zeros([len(embedding_arr), 1])
            for ii, embedding in enumerate(embedding_arr):
                print('lstm optimize #%d embedding %d' % (i, ii))
                parameters[0, 0] = embedding
                results[ii, :] = run_optimize(cv, text, items, lst_labels_true,
                                              parameters, max_review_length,
                                              top_words)

            results = results[~np.all(results == 0, axis=1)]
            tmp = np.argmax(results, axis=0)
            parameters[0, 0] = embedding_arr[tmp[0]]
            print(parameters)

            results = np.zeros([len(dropout_arr1), 1])
            for ii, dropout in enumerate(dropout_arr1):
                print('lstm optimize #%d dropout1 %d' % (i, ii))
                parameters[0, 1] = dropout
                results[ii, :] = run_optimize(cv, text, items, lst_labels_true,
                                              parameters, max_review_length,
                                              top_words)

            results = results[~np.all(results == 0, axis=1)]
            tmp = np.argmax(results, axis=0)
            parameters[0, 1] = dropout_arr1[tmp[0]]
            print(parameters)

            results = np.zeros([len(lstm_hidden), 1])
            for ii, hidden in enumerate(lstm_hidden):
                print('lstm optimize #%d hidden %d' % (i, ii))
                parameters[0, 2] = hidden
                results[ii, :] = run_optimize(cv, text, items, lst_labels_true,
                                              parameters, max_review_length,
                                              top_words)

            results = results[~np.all(results == 0, axis=1)]
            tmp = np.argmax(results, axis=0)
            parameters[0, 2] = lstm_hidden[tmp[0]]
            print(parameters)

            results = np.zeros([len(dropout_arr2), 1])
            for ii, dropout in enumerate(dropout_arr2):
                print('lstm optimize #%d dropout2 %d' % (i, ii))
                parameters[0, 3] = dropout
                results[ii, :] = run_optimize(cv, text, items, lst_labels_true,
                                              parameters, max_review_length,
                                              top_words)
            results = results[~np.all(results == 0, axis=1)]
            tmp = np.argmax(results, axis=0)
            parameters[0, 3] = dropout_arr2[tmp[0]]
            print(parameters)

        # print 'lstm test'
        # results_cv[cv, :] = run_test(cv, text, items, lst_labels_true, parameters)
        # gc.collect()
        np.save('./Data/Results_RNN/lstm2_parameters_cv%d.npy' % cv,
                parameters)
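
A hypothetical invocation sketch, assuming ./Data/address.txt and ./Data/cross_validation.txt are in place and run_optimize() is defined in the same module:

if __name__ == '__main__':
    # e.g. 10-fold cross-validation, keeping only words that occur more than once
    prepare_sequences(k=10, k2=1)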