Esempio n. 1
0
class DataPreparation(object):
    """Convert labelled TSV text files into fixed-length word-index rows.

    The vocabulary is built by the project's ``Vocabulary`` class and the
    raw (tokens, labels) pairs are read through
    ``Document2VecUtility.get_x_y``; both are defined outside this class.

    Attributes set by ``__init__``:
        path: Prefix that is concatenated (with ``+``) in front of every
            file name, so it normally has to end with a path separator.
        data_x: List of encoded rows, filled by ``get_train_test``.
        data_y: List of labels, filled by ``get_train_test``.
        column_x, column_y: Column indices handed to the vocabulary builder.
        remove_stopwords, stem_words, remove_html: Pre-processing flags
            handed to the vocabulary builder (False, True, False).
        voc: The ``Vocabulary`` instance.
        filelist: Files used for the vocabulary when no single file is given.
        single_file: Name of the single input file, or "" for the
            train/test layout.
        maxlength: Row length, copied from ``voc.maxlength``.
    """

    def __init__(self, original_path, column_x, column_y, single_file=""):
        """Build the vocabulary for the configured input file(s).

        Args:
            original_path: Non-empty path prefix of the data files.
            column_x: Text column index passed to ``Vocabulary``.
            column_y: Label column index (only stored on the instance).
            single_file: Optional file name; when empty, the vocabulary is
                built from ``train.tsv``, ``test.tsv`` and ``dev.tsv``.

        Raises:
            ValueError: If ``original_path`` is empty.
        """
        # The original code returned a message string from __init__ after
        # all the work was done, which Python rejects with a TypeError.
        # Fail early, with the same message, before touching any file.
        if not original_path:
            raise ValueError('please give the path to Vocabulary.')

        self.path = original_path
        self.data_x = []
        self.data_y = []
        self.column_x = column_x
        self.column_y = column_y
        self.remove_stopwords = False
        self.stem_words = True
        self.remove_html = False
        self.voc = Vocabulary(self.path)
        self.filelist = ['train.tsv', 'test.tsv', 'dev.tsv']
        self.single_file = single_file

        vocabulary_files = [single_file] if single_file else self.filelist
        self.voc.get_vocabulary(vocabulary_files, self.column_x,
                                self.remove_stopwords, self.stem_words,
                                self.remove_html)
        self.maxlength = self.voc.maxlength

    def get_train_test(self,
                       column_x=3,
                       column_y=2,
                       remove_stopwords=True,
                       stem_words=True,
                       remove_html=False):
        """Load, encode and filter the data into ``data_x`` / ``data_y``.

        Saves the vocabulary first (``voc.save_dict()``). Without a single
        file, ``train.tsv`` and ``test.tsv`` are read and appended in that
        order; otherwise only ``single_file`` is read. New rows are appended
        to whatever ``data_x`` / ``data_y`` already hold, then every row
        whose first index is 0 is dropped together with its label.

        NOTE(review): the defaults below are independent of the
        ``column_x`` / ``column_y`` / flag attributes used to build the
        vocabulary; pass them explicitly if they have to match.

        NOTE(review): labels that are not recognised are skipped (see
        ``_encode_labels``), which leaves ``data_y`` shorter than
        ``data_x``; the final filter then raises IndexError or pairs rows
        with the wrong label. Confirm the label column is clean.

        Args:
            column_x: Text column index passed to ``get_x_y``.
            column_y: Label column index passed to ``get_x_y``.
            remove_stopwords: Passed through to ``get_x_y``.
            stem_words: Passed through to ``get_x_y``.
            remove_html: Passed through to ``get_x_y``.
        """
        self.voc.save_dict()
        options = (column_x, column_y, remove_stopwords, stem_words,
                   remove_html)

        if not self.single_file:
            train_x, train_labels = self._read_file('train.tsv', options)
            print('train size: %s %s' % self._shape(train_x))
            train_y = self._encode_labels(train_labels)

            test_x, test_labels = self._read_file('test.tsv', options)
            print('test size: %s %s' % self._shape(test_x))
            test_y = self._encode_labels(test_labels)

            self.data_x.extend(train_x)
            self.data_x.extend(test_x)
            self.data_y.extend(train_y)
            self.data_y.extend(test_y)
            print('all x size: %s %s' % self._shape(self.data_x))
            print(self.data_x[:3])
            print('all y size: %s' % len(self.data_y))
            print(self.data_y[:3])
        else:
            rows, labels = self._read_file(self.single_file, options)
            self.data_x.extend(rows)
            # Guard the debug output so an empty file does not crash here.
            if self.data_x and self.data_x[0]:
                print('data_x: %s' % (self.data_x[0][0],))
            print('x size: %s %s' % self._shape(self.data_x))
            print('result_world: %s' % (labels[:20],))
            self.data_y.extend(self._encode_labels(labels))
            print('y size: %s' % len(self.data_y))

        # Drop rows whose first index is 0. For rows built by _encode_line
        # that happens when no word was found in the vocabulary (the row is
        # all padding) -- assuming index 0 is not assigned to a real word;
        # TODO confirm against Vocabulary.
        kept_x = []
        kept_y = []
        for position, row in enumerate(self.data_x):
            if row and row[0] != 0:
                kept_x.append(row)
                kept_y.append(self.data_y[position])
        self.data_x = kept_x
        self.data_y = kept_y

    def _read_file(self, file_name, options):
        """Read one file and return (encoded rows, raw labels)."""
        token_lines, labels = Document2VecUtility.get_x_y(
            self.path + file_name, *options)
        return [self._encode_line(line) for line in token_lines], labels

    def _encode_line(self, words):
        """Map known words to indices, then cut or zero-pad to maxlength."""
        word2index = self.voc.word2index
        indices = [word2index[word] for word in words if word in word2index]
        indices = indices[:self.maxlength]
        indices.extend([0] * (self.maxlength - len(indices)))
        return indices

    @staticmethod
    def _encode_labels(labels):
        """Turn raw labels into 0/1, choosing the scheme from the first one.

        'positive'/'negative' map to 1/0 (other values are skipped);
        numeric 0/1 labels are copied unchanged; string labels starting
        with '4' or '0' map '0' to 0 and everything else to 1. Any other
        first label, or an empty input, yields an empty list.
        """
        if len(labels) == 0:
            return []
        first = labels[0]
        if first == 'positive' or first == 'negative':
            encoded = []
            for label in labels:
                if label == 'positive':
                    encoded.append(1)
                elif label == 'negative':
                    encoded.append(0)
            return encoded
        if first == 1 or first == 0:
            return list(labels)
        if first == '4' or first == '0':
            return [0 if label == '0' else 1 for label in labels]
        return []

    @staticmethod
    def _shape(rows):
        """Return (row count, length of the first row or 0 when empty)."""
        return len(rows), (len(rows[0]) if rows else 0)

    def save_dict(self, file_name):
        """Pickle ``data_x`` and then ``data_y`` into ``path + file_name``.

        The two objects are written as two consecutive pickles in the same
        file, so they have to be read back with two ``pickle.load`` calls.
        """
        with open(self.path + file_name, 'wb') as handle:
            pickle.dump(self.data_x, handle)
            pickle.dump(self.data_y, handle)

    def test_dataset(self, file_name):
        """Print the shapes and up to five samples of a saved dataset.

        Reads the two pickles written by ``save_dict`` from
        ``path + file_name``.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # use this on files this program wrote itself.
        with open(self.path + file_name, 'rb') as handle:
            data_xx = pickle.load(handle)
            data_yy = pickle.load(handle)

        print(np.array(data_xx).shape)
        print(np.array(data_yy).shape)
        # Never index past the end when fewer than five samples were saved.
        for i in range(min(5, len(data_xx), len(data_yy))):
            print('&&&&&&&&&&&&&')
            print(data_xx[i])
            print(data_yy[i])