Example 1
    def _get_extended_questions():
        with open('data/extend/extra_questions.txt', 'r',
                  encoding='utf8') as f:
            raw = f.read().strip()

        question_frames = raw.split(
            "===================================================================================================="
        )
        question_frames = [qf.strip() for qf in question_frames[:-1]]

        def process(question_frame):
            # return original question and its permutations
            lines = question_frame.split('\n')
            lines = [l.strip() for l in lines]
            if lines[0][:2] == "No":
                return None

            original = lines[0][len("Permutations of '"):-2]
            permutations = [l for l in lines[1:] if l]
            return original, permutations

        pre_process = PreProcess()

        question_dict = {}
        for qf in question_frames:
            tmp = process(qf)
            if tmp:
                o, p = tmp
                k = " ".join(pre_process.process(o, remove_stop_words=False))
                question_dict[k] = [
                    " ".join(pre_process.process(i, remove_stop_words=False))
                    for i in p
                ]

        return question_dict
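
For reference, the frame layout this parser appears to assume (inferred from the parsing code above, not taken from the data file) is a header line followed by one permutation per line, with frames separated by a long row of '=' characters. A minimal, hypothetical illustration:

# hypothetical frame, matching the parsing logic above
sample_frame = (
    "Permutations of 'how do i reset my password?':\n"
    "how can i reset my password\n"
    "what is the way to reset my password"
)
lines = [l.strip() for l in sample_frame.split('\n')]
print(lines[0][len("Permutations of '"):-2])   # how do i reset my password?
print([l for l in lines[1:] if l])             # the two rephrasings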
Example 2
    def pre_process(self):
        # pre process the data to fit into the algorithm
        if self.processor is not None:
            # if we already ran this, ask the user whether to run it again
            result = tkMessageBox.askquestion(
                message="Pre-processing has already been run.\nAre you sure you want to run it again?",
                icon='warning', title=self.head_title)
            if result != 'yes':
                return
        self.processor = None
        self.is_pre_processed = False
        try:
            # verify the file can be pre-processed
            self.file_path = self.file_path_text.get()
            processor = PreProcess(self.file_path)
            if processor.verifications() is False:
                tkMessageBox.showerror(title=self.head_title, message=processor.error_message)
                return

            # process the data
            processor.pre_process()
            tkMessageBox.showinfo(title=self.head_title, message='Preprocessing completed successfully')
            self.processor = processor
            self.is_pre_processed = True
        except Exception as err:
            template = "An exception of type {0} occurred. Arguments:{1}"
            message = template.format(type(err).__name__, err)
            print_exc(file=stdout)
            tkMessageBox.showerror(title=self.head_title, message=message)
Example 3
 def run(self):
     """[read the arguments passed to check if is to train model, to run preprocess or run both]
     """
     config_json = self._open_config()
     my_parser = argparse.ArgumentParser(
         description='Model to classify if is speech or music')
     my_parser.add_argument('-m',
                            '--model',
                            required=False,
                            action='store_true')
     my_parser.add_argument('-d',
                            '--data',
                            required=False,
                            action='store_true')
     my_parser.add_argument('-y',
                            '--youtube',
                            required=False,
                            action='store_true')
     args = my_parser.parse_args()
     if (args.data):
         preProcess = PreProcess(config_json)
         preProcess.run()
     if (args.model):
         mlpClassifier = MlpClassifier(config_json)
         mlpClassifier.run()
     if (args.youtube):
         preProcessYoutube = PreProcessYoutube(config_json)
         preProcessYoutube.run()
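
A quick, hypothetical illustration (not part of the original script) of how the three store_true flags combine; any subset can be passed, e.g. preprocessing plus training:

import argparse

parser = argparse.ArgumentParser(description='Model to classify if is speech or music')
parser.add_argument('-m', '--model', action='store_true')
parser.add_argument('-d', '--data', action='store_true')
parser.add_argument('-y', '--youtube', action='store_true')

args = parser.parse_args(['-d', '-m'])
print(args.data, args.model, args.youtube)  # True True False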
Example 4
 def prepare_dataset(self):
     dat_obj = PreProcess()
     dat_obj.prepare_dataset()
     test_df = dat_obj.test_df2
     test_dataset = SentimentDataset(test_df, max_length=100, mode='test')
     self.test_loader = DataLoader(test_dataset,
                                   batch_size=1,
                                   num_workers=0,
                                   shuffle=True)

 def prepare_dataset(self):
     dat_obj = PreProcess()
     dat_obj.prepare_dataset()
     train_df = dat_obj.train_df
     val_df = dat_obj.val_df
     test_df = dat_obj.test_df1
     train_dataset = SentimentDataset(train_df, max_length=100)
     val_dataset = SentimentDataset(val_df, max_length=100)
     test_dataset = SentimentDataset(test_df, max_length=100, mode='test')
     self.train_loader = DataLoader(train_dataset, batch_size=32, num_workers=0, shuffle=True)
     self.val_loader = DataLoader(val_dataset, batch_size=32, num_workers=0, shuffle=True)
     self.test_loader = DataLoader(test_dataset, batch_size=32, num_workers=0, shuffle=True)
Example 6
    def __init__(self, model_name, dataset):
        self.model_name = TRAINED_MODELS + model_name + "/"
        self.dataset = dataset

        self.data = Dataset(self.dataset)
        self.data.tfidf_compressor.train()

        self.model = self._load_model()
        self.pre_process = PreProcess()

        idx = list(self.data.train_data.keys())
        idx.sort()
        self.train_c_word_set, self.train_c = self.data.get_all_c_word_set(
            self.data.train_data)
        self.all_train_contexts = np.array(
            [self.data.train_data[i]['context'] for i in idx])
        self.related_questions = np.array(
            [self.data.train_data[i]['qs'] for i in idx])
Example 7
    def _convert_data(self, data_obj):
        pre_process = PreProcess()

        train_data = {}
        dev_data = {}
        idx = 0
        for d in data_obj:
            # custom pre-process
            if d['answer'].startswith("Answer:"):
                d['answer'] = d['answer'][len("Answer:"):]

            context = " ".join(pre_process.process(d['answer'], url_norm=True))
            if not context:
                continue

            original_question = " ".join(
                pre_process.process(d['question'], remove_stop_words=False))
            extended_questions = self.extend_question_dict.get(
                original_question, [])

            if extended_questions:
                # split train and dev by questions
                train_questions, dev_questions = train_test_split(
                    extended_questions, test_size=0.1, random_state=42)

                train_data[idx] = {
                    'context': d['answer'],
                    'c': context,
                    'qs': [original_question] + train_questions
                }
                dev_data[idx] = {
                    'context': d['answer'],
                    'c': context,
                    'qs': dev_questions
                }
            else:
                train_data[idx] = {
                    'context': d['answer'],
                    'c': context,
                    'qs': [original_question]
                }
            idx += 1
        return train_data, dev_data
Example 8
    def _convert_data(data_obj):
        pre_process = PreProcess()

        data = {}
        idx = 0
        for d in data_obj:
            # custom pre-process
            if d['answer'].startswith("Answer:"):
                d['answer'] = d['answer'][len("Answer:"):]
            d['answer'] = re.sub(u"\xa0", " ", d['answer'])  # assumed: normalize non-breaking spaces

            context = " ".join(pre_process.process(d['answer'], url_norm=True))
            question = " ".join(
                pre_process.process(d['question'], remove_stop_words=False))
            if not (d['answer'] and context and question):
                continue
            data[idx] = {
                'context': d['answer'],
                'c': context,
                'qs': [question]
            }
            idx += 1
        return data

def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Connect to MongoDB and read the data to be classified
    corpus_collection = MongoClient(
        "mongodb://39.108.180.114:27017")["ennews"]["news"]
    reviews_cursor = corpus_collection.find(no_cursor_timeout=True)

    # Data preprocessing
    PreProcess(corpus_collection, reviews_cursor).data_filter()

    # Classification
    classify = Classify(corpus_collection, reviews_cursor)
    classify.run()

    reviews_cursor.close()
Example 10
def main():
    preprocess = PreProcess()
    X_train = preprocess.X_train
    y_train = preprocess.y_train
    X_test = preprocess.X_test
    y_test = preprocess.y_test
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    test_data = scaler.transform(preprocess.test)
    rmse_val = []
    for k in range(30):
        k = k + 1
        model = neighbors.KNeighborsRegressor(n_neighbors=k)
        model.fit(X_train, y_train)  #fit the model
        pred = model.predict(X_test)  #make prediction on test set
        error = sqrt(mean_squared_error(y_test, pred))  #calculate rmse
        rmse_val.append(error)  #store rmse values
        print('RMSE value for k= ', k, 'is:', error)

    curve = pd.DataFrame(rmse_val)  #elbow curve
    curve.plot()
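
A small follow-up sketch (not in the original snippet): pick the k with the lowest RMSE from the curve above and refit before predicting on the held-out test_data:

best_k = rmse_val.index(min(rmse_val)) + 1        # k values started at 1
best_model = neighbors.KNeighborsRegressor(n_neighbors=best_k)
best_model.fit(X_train, y_train)
final_pred = best_model.predict(test_data)
print('best k:', best_k)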
Example 11
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    dictionary_path = "models/dictionary.dict"
    corpus_path = "models/corpus.lda-c"
    lda_model_path = "models/lda_model.lda"

    # topics = ["World", "Sport", "Business", "Technology", "Lifestyle", "Health"]
    lda_num_topics = 6

    # Connect to MongoDB
    corpus_collection = MongoClient(
        "mongodb://39.108.180.114:27017")["ennews"]["news"]
    reviews_cursor = corpus_collection.find(no_cursor_timeout=True)

    # Data preprocessing
    PreProcess(corpus_collection, reviews_cursor).data_filter()
    # Build the dictionary
    dictionary = Dictionary(reviews_cursor, dictionary_path).build()
    # Serialize the corpus
    Corpus(reviews_cursor, dictionary, corpus_path).serialize()
    reviews_cursor.close()
    # Train the LDA model
    lda_model = Train.run(lda_model_path, corpus_path, lda_num_topics,
                          dictionary)

    # Print the topics
    # keep a reference to the original stdout
    stdout_backup = sys.stdout
    # open the log file that will receive the topic listing
    log_file = open(r".\lda_topics.log", "w")
    # redirect print output to log file
    sys.stdout = log_file
    lda_model.print_topics()
    log_file.close()
    # restore the original stdout
    sys.stdout = stdout_backup
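
The same redirection can be written more compactly with the standard library; an equivalent sketch (not from the original code):

from contextlib import redirect_stdout

with open(r".\lda_topics.log", "w") as log_file, redirect_stdout(log_file):
    lda_model.print_topics()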
Example 12
def main():
    preprocess = PreProcess()
    lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
    X_train = preprocess.X_train
    y_train = preprocess.y_train
    X_test = preprocess.X_test
    y_test = preprocess.y_test
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    test_data = scaler.transform(preprocess.test)
    print("")
    best_model = ""
    max_testing_score = 0.0
    for learning_rate in lr_list:
        n_estimators = 800
        max_depth = 3
        gb_clf = GradientBoostingRegressor(n_estimators=n_estimators,
                                           learning_rate=learning_rate,
                                           min_samples_split=20,
                                           max_depth=max_depth,
                                           random_state=0)
        gb_clf.fit(X_train, y_train.ravel())
        print("Learning rate: ", learning_rate)
        print("Accuracy score (training): {0:.3f}".format(
            gb_clf.score(X_train, y_train)))
        testing_score = gb_clf.score(X_test, y_test)
        print("Accuracy score (validation): {0:.3f}".format(testing_score))
        if (testing_score > max_testing_score):
            best_model = gb_clf
            max_testing_score = testing_score
            prediction = best_model.predict(test_data)
            filename = "predicted_gb_" + str(n_estimators) + "_" + str(
                max_depth) + "_" + str(learning_rate) + ".csv"
            format_to_csv(preprocess.test_instance, prediction, filename)
            print("CSV created")
Example 13
    def _get_extended_questions(self):
        with open(DATA + self.dataset + "/extra_questions.txt",
                  'r',
                  encoding='utf8') as f:
            raw = f.read().strip()

        question_frames = raw.split(
            "===================================================================================================="
        )
        question_frames = [qf.strip() for qf in question_frames[:-1]]

        def process(question_frame):
            # return original question and its permutations
            lines = question_frame.split('\n')
            lines = [l.strip() for l in lines]
            if lines[0][:2] == "No":
                return None

            original = lines[0][len("Permutations of '"):-2]
            permutations = [l for l in lines[1:] if l]
            return original, permutations

        pre_process = PreProcess()

        question_dict = {}
        t = Timer()
        for qf in question_frames:
            tmp = process(qf)
            if tmp:
                t.start("", verbal=False)
                o, p = tmp
                k = " ".join(pre_process.process(o, remove_stop_words=False))
                question_dict[k] = [
                    " ".join(pre_process.process(i, remove_stop_words=False))
                    for i in p
                ]

                # select the most diverse question set
                self.tf_idf.train([k] + question_dict[k])
                del_num = len(question_dict[k]) // self.top_k
                if del_num == 0:
                    t.remaining_time(t.stop(verbal=False),
                                     len(question_frames))
                    continue

                selected = []
                while question_dict[k]:
                    indices = self.tf_idf.distance(k, question_dict[k])
                    q = question_dict[k].pop(indices[0])
                    selected.append(q)
                    if not question_dict[k]:
                        break
                    close_q = self.tf_idf.distance(
                        q, question_dict[k])[::-1][:del_num]
                    question_dict[k] = [
                        question_dict[k][i]
                        for i in range(len(question_dict[k]))
                        if i not in close_q
                    ]
                question_dict[k] = selected
                t.remaining_time(t.stop(verbal=False), len(question_frames))

        return question_dict
Example 14
    def run(self,
            num_kernels=[25, 25],
            kernel_sizes=[(11, 11), (5, 5)],
            batch_size=256,
            epochs=100000,
            optimizer='RMSprop'):

        optimizerData = {}
        optimizerData['learning_rate'] = 0.0005 / 2
        optimizerData['rho'] = 0.9
        optimizerData['epsilon'] = 1e-2
        optimizerData['momentum'] = 0.9

        print '... Loading data'

        # load in and process data
        if data_size == 'large':
            preProcess = PreProcess()
            data = preProcess.run()
        elif data_size == 'medium':
            preProcess = Medium()
            data = preProcess.run()
        elif data_size == 'small':
            preProcess = Small()
            data = preProcess.run()
        else:
            print 'data_size must be small, medium or large.'
            exit()
        train_set_x, train_set_y = data[0], data[3]
        valid_set_x, valid_set_y = data[1], data[4]
        test_set_x, test_set_y = data[2], data[5]

        train_set_x = theano.tensor._shared(train_set_x, borrow=True)
        valid_set_x = theano.tensor._shared(valid_set_x, borrow=True)
        train_set_y = theano.tensor._shared(train_set_y, borrow=True)
        valid_set_y = theano.tensor._shared(valid_set_y, borrow=True)
        test_set_x = theano.tensor._shared(test_set_x, borrow=True)
        test_set_y = theano.tensor._shared(test_set_y, borrow=True)

        print '... Initializing network'

        # training parameters
        self.n_sports = 500

        # print error if batch size is too large
        if valid_set_y.get_value(borrow=True).size < batch_size:
            print 'Error: Batch size is larger than size of validation set.'

        # compute batch sizes for train/test/validation
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_train_batches /= batch_size
        n_valid_batches /= batch_size
        n_test_batches /= batch_size

        # symbolic variables
        x = T.matrix('x')  # input image data
        y = T.ivector('y')  # input label data

        self.model(batch_size, num_kernels, kernel_sizes, x, y)

        # Initialize parameters and functions
        cost = self.layer3.negative_log_likelihood(y)  # Cost function
        params = self.params  # List of parameters
        grads = T.grad(cost, params)  # Gradient
        index = T.lscalar()  # Index

        # Initialize optimizer
        updates = self.init_optimizer(optimizer, cost, params, optimizerData)

        # Train function
        train_model = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]
            })

        # Validation function
        validate_model = theano.function(
            [index],
            self.layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]
            })

        # Test function
        test_model = theano.function(
            [index],
            self.layer3.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]
            })

        def solve():
            costs = []
            for i in xrange(n_train_batches):
                costs.append(train_model(i))
            return costs

        def shuffle(train_set_x, train_set_y):
            #print train_set_x.get_value(borrow=True).shape[0]
            rand = np.random.permutation(
                range(train_set_x.get_value(borrow=True).shape[0]))
            train_set_x.set_value(train_set_x.get_value(borrow=True)[rand],
                                  borrow=True)
            train_set_y.set_value(train_set_y.get_value(borrow=True)[rand],
                                  borrow=True)
            return train_set_x, train_set_y, train_model

        # Solver
        try:
            print '... Solving'
            start_time = time.time()
            for epoch in range(epochs):
                t1 = time.time()
                train_set_x, train_set_y, train_model = shuffle(
                    train_set_x, train_set_y)
                costs = solve()
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                t2 = time.time()
                print "Epoch {}    NLL {:.2}    %err in validation set {:.1%}    Time (epoch/total) {:.2}/{:.2} mins".format(
                    epoch + 1, np.mean(costs), np.mean(validation_losses),
                    (t2 - t1) / 60., (t2 - start_time) / 60.)
                # f = open('workfile' , 'r+')
                # f.write("Epoch {}    NLL {:.2}    %err in validation set {:.1%}    Time (epoch/total) {:.2}/{:.2} mins".format(epoch + 1, np.mean(costs), np.mean(validation_losses),(t2-t1)/60.,(t2-start_time)/60.))
                # f.close()
                with open("workfile_batch_c", "a") as myfile:
                    myfile.write(
                        "Epoch {}    NLL {:.2}    %err in validation set {:.1%}    Time (epoch/total) {:.2}/{:.2} mins \n"
                        .format(epoch + 1, np.mean(costs),
                                np.mean(validation_losses), (t2 - t1) / 60.,
                                (t2 - start_time) / 60.))
                if epoch % 10 == 0:

                    test_errors = [
                        test_model(i) for i in range(n_test_batches)
                    ]
                    print "test errors: {:.1%}".format(np.mean(test_errors))
                    with open("workfile_batch_c2", "a") as myfile:
                        myfile.write("test errors: {:.1%}\n".format(
                            np.mean(test_errors)))

        except KeyboardInterrupt:
            print '... Exiting solver'
        # Evaluate performance

        predict = theano.function(
            inputs=[index],
            outputs=self.layer3.prediction(),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size]
            })

        test_errors = [test_model(i) for i in range(n_test_batches)]
        print "test errors: {:.1%}".format(np.mean(test_errors))

        pred = [predict(i) for i in range(n_test_batches)]
        print pred[0].shape
    def run(self,
            num_kernels  = [125,125],
            kernel_sizes = [(11, 11), (5, 5)],
            batch_size   = 50,
            epochs       = 100000,
            optimizer    = 'RMSprop'):
            
            
        optimizerData = {}
        optimizerData['learning_rate'] = 0.0005/2
        optimizerData['rho']           = 0.9
        optimizerData['epsilon']       = 1e-2
        optimizerData['momentum']      = 0.9
        
        print '... Loading data'
        
        # load in and process data
        if data_size == 'large':
            preProcess              = PreProcess()
            data                    = preProcess.run()
        elif data_size == 'medium':
            preProcess              = Medium()
            data                    = preProcess.run()
        elif data_size == 'small':
            preProcess             = Small()
            data                    = preProcess.run()   
        else:
            print 'data_size must be small, medium or large.'
            exit()
        train_set_x,train_set_y = data[0],data[3]
        valid_set_x,valid_set_y = data[1],data[4]
        test_set_x,test_set_y   = data[2],data[5]

        train_set_x = theano.tensor._shared(train_set_x,borrow=True)
        valid_set_x = theano.tensor._shared(valid_set_x,borrow=True)
        train_set_y = theano.tensor._shared(train_set_y,borrow=True)
        valid_set_y = theano.tensor._shared(valid_set_y,borrow=True)
        test_set_x  = theano.tensor._shared(test_set_x,borrow=True)
        test_set_y  = theano.tensor._shared(test_set_y,borrow=True)
        
        print '... Initializing network'
       
        # training parameters
        self.n_sports = 500
    
        # print error if batch size is too large
        if valid_set_y.get_value(borrow=True).size<batch_size:
            print 'Error: Batch size is larger than size of validation set.'

        # compute batch sizes for train/test/validation
        n_train_batches  = train_set_x.get_value(borrow=True).shape[0]
        n_valid_batches  = valid_set_x.get_value(borrow=True).shape[0]
        n_test_batches   = test_set_x.get_value(borrow=True).shape[0]
        n_train_batches /= batch_size
        n_valid_batches /= batch_size
        n_test_batches  /= batch_size

        # symbolic variables
        x = T.matrix('x')  # input image data
        y = T.ivector('y')  # input label data
        
        self.model(batch_size, num_kernels, kernel_sizes, x, y)

        # Initialize parameters and functions
        cost   = self.layer3.negative_log_likelihood(y)        # Cost function
        params = self.params                                   # List of parameters
        grads  = T.grad(cost, params)                          # Gradient
        index  = T.lscalar()                                   # Index
        
        # Initialize optimizer
        updates = self.init_optimizer(optimizer, cost, params, optimizerData)

        # Train function
        train_model = theano.function(
                        [index],
                        cost,
                        updates = updates,
                        givens  = {
                                        x: train_set_x[index * batch_size: (index + 1) * batch_size], 
                                        y: train_set_y[index * batch_size: (index + 1) * batch_size] 
                        }
                    )      
            

        # Validation function
        validate_model = theano.function(
                         [index],
                         self.layer3.errors(y),
                         givens = {
                                  x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                                  y: valid_set_y[index * batch_size: (index + 1) * batch_size]
                }
            )

        # Test function
        test_model = theano.function(
             [index],
             self.layer3.errors(y),
             givens = {
                      x: test_set_x[index * batch_size: (index + 1) * batch_size],
                      y: test_set_y[index * batch_size: (index + 1) * batch_size]
            }
        )

        predict = theano.function(inputs = [index],
                            outputs = self.layer3.prediction(),
                            givens = {
                            x: test_set_x[index*batch_size: (index+1)*batch_size]
            }
        )

        

        def solve():
            costs = []
            for i in xrange(n_train_batches):
                costs.append(train_model(i))
            return costs

        def shuffle(train_set_x,train_set_y):
            #print train_set_x.get_value(borrow=True).shape[0]
            rand = np.random.permutation(range(train_set_x.get_value(borrow=True).shape[0]))
            train_set_x.set_value(train_set_x.get_value(borrow=True)[rand],borrow=True)
            train_set_y.set_value(train_set_y.get_value(borrow=True)[rand],borrow=True)
            return train_set_x,train_set_y,train_model


        np.savetxt('y_vec_LARGE.txt', test_set_y.get_value(borrow=True), delimiter=',')
        length = test_set_y.get_value(borrow=True).shape[0]
        print length
        print 'saved'
        try:
            print '... Solving'
            start_time = time.time()    
            for epoch in range(epochs):
                t1 = time.time()
                train_set_x,train_set_y,train_model = shuffle(train_set_x,train_set_y) 
                costs                   = solve()
                validation_losses       = [validate_model(i) for i in xrange(n_valid_batches)]
                t2 = time.time()
                print "Epoch {}    NLL {:.2}    %err in validation set {:.1%}    Time (epoch/total) {:.2}/{:.2} mins".format(epoch + 1, np.mean(costs), np.mean(validation_losses),(t2-t1)/60.,(t2-start_time)/60.)
                # f = open('workfile' , 'r+')
                # f.write("Epoch {}    NLL {:.2}    %err in validation set {:.1%}    Time (epoch/total) {:.2}/{:.2} mins".format(epoch + 1, np.mean(costs), np.mean(validation_losses),(t2-t1)/60.,(t2-start_time)/60.))
                # f.close()
                with open("workfile_BIG31", "a") as myfile:
                     myfile.write("Epoch {}    NLL {:.2}    %err in validation set {:.1%}    Time (epoch/total) {:.2}/{:.2} mins \n".format(epoch + 1, np.mean(costs), np.mean(validation_losses),(t2-t1)/60.,(t2-start_time)/60.))
                if epoch%1== 0:
                    predictions = np.array([predict(i) for i in range(n_test_batches)])
                    print predictions[0].shape
                    print predictions.shape
                    predictions = predictions.reshape((length-length%batch_size),500) 
                    with open('workfile_BIG33', 'w') as outfile:
                        # I'm writing a header here just for the sake of readability
                        # Any line starting with "#" will be ignored by numpy.loadtxt
                        outfile.write('# Array shape: {0}\n'.format(len(predictions)))

                        # Iterating through a ndimensional array produces slices along
                        # the last axis. This is equivalent to data[i,:,:] in this case
                        for data_slice in predictions:

                            # The formatting string indicates that I'm writing out
                            # the values in left-justified columns 7 characters in width
                            # with 2 decimal places.  
                            np.savetxt(outfile, data_slice, fmt='%-7.2f')

                            # Writing out a break to indicate different slices...
                            #outfile.write('\n')
                

                    test_errors = [test_model(i) for i in range(n_test_batches)]
                    print "test errors: {:.1%}".format(np.mean(test_errors))
                    with open("workfile_BIG32", "a") as myfile:
                        myfile.write("test errors: {:.1%}\n".format(np.mean(test_errors)))

        except KeyboardInterrupt:
            print '... Exiting solver'
        # Evaluate performance 




        test_errors = [test_model(i) for i in range(n_test_batches)]
        print "test errors: {:.1%}".format(np.mean(test_errors))

        pred = [predict(i) for i in range(n_test_batches)]
        print pred[0].shape
Example 16
import tensorflow as tf
from pre_process import PreProcess
from tensorflow.keras.optimizers import Adam
from tensorflow import keras
import time
import os
from model.seq2seq_attention import encoder_model, decoder_model
from tensorflow.keras.layers import Input

encoder_weights_path = 'models/encoder.h5'
decoder_weights_path = 'models/decoder.h5'

process = PreProcess('./data/qingyun.tsv', samples_num=3000)
samples_num = process.length

# define params
batch_size = 64
embedding_dim = 50
units = 256
steps_per_epoch = samples_num // batch_size

encoder_input = Input((process.q_lenght, ))
encoder = encoder_model(encoder_input, process.q_vocab_size, embedding_dim,
                        units)

decoder_input, hidden_input, encoder_output_input = Input((1, )), Input(
    (units, )), Input((process.q_lenght, units))
decoder = decoder_model(decoder_input, hidden_input, encoder_output_input,
                        process.a_vocab_size, embedding_dim, units)

if os.path.exists(encoder_weights_path):
Example 17
from multiprocessing import cpu_count
from pathlib import Path
from yaml import load, FullLoader
from pre_process import PreProcess
from model_manager import ModelManager

if __name__ == '__main__':
    # Load parameters
    with open(".\\data\\config.yaml") as config_file:
        config = load(config_file, Loader=FullLoader)

    # Define locals
    datasets: dict = dict()
    batch_data: dict = dict()
    num_features: int = (len(config["COLUMNS"]["CATEGORICAL"]["NUMERIC"])
                         + len(config["COLUMNS"]["CATEGORICAL"]["STRING"])
                         + len(config["COLUMNS"]["CONTINUOUS"]) + 1)  # +1 for the "Id" column
    pre_proc: PreProcess = PreProcess(columns=config["COLUMNS"],
                                      num_workers=cpu_count())

    # Path checks
    if Path(config["PATH"]["PROCESSED"]["TRAIN"]).is_file() and\
            Path(config["PATH"]["PROCESSED"]["TEST"]).is_file():

        # Load processed data
        print("Pre-Processed data exists!\nLoading data...")
        datasets["train"] = pre_proc.load_data(
            path=config["PATH"]["PROCESSED"]["TRAIN"])
        datasets["test"] = pre_proc.load_data(
            path=config["PATH"]["PROCESSED"]["TEST"])
        print("Data loaded!")

    else:
        # Load raw data
    def __getitem__(self, index):
        row = self.df.iloc[index]
        text, label = row['pre_process'], row[0]
        if label != 0:
            label = 1
        out_dict = self.tokenizer.encode_plus(text=text,
                                              padding='max_length',
                                              max_length=200,
                                              return_tensors='pt')
        # print(out_dict)
        if self.mode != 'test':
            return [(out_dict['input_ids'][:, :self.max_length], out_dict['attention_mask'][:, :self.max_length]), label]
        else:
            return [text, (out_dict['input_ids'][:, :self.max_length], out_dict['attention_mask'][:, :self.max_length]), label]

    def __len__(self):
        # return int(self.df.shape[0])
        return 2000

if __name__ == '__main__':
    from pre_process import PreProcess
    dat_obj = PreProcess()
    dat_obj.prepare_dataset()
    train_df = dat_obj.train_df
    dataset = SentimentDataset(train_df, 200)
    train_loader = DataLoader(dataset, batch_size=5,num_workers=8)

    for i, j in enumerate(train_loader,0):
        print(i)
        print(j[0][0])
Example 19
class Inference:
    def __init__(self, model_name, dataset):
        self.model_name = TRAINED_MODELS + model_name + "/"
        self.dataset = dataset

        self.data = Dataset(self.dataset)
        self.data.tfidf_compressor.train()

        self.model = self._load_model()
        self.pre_process = PreProcess()

        idx = list(self.data.train_data.keys())
        idx.sort()
        self.train_c_word_set, self.train_c = self.data.get_all_c_word_set(
            self.data.train_data)
        self.all_train_contexts = np.array(
            [self.data.train_data[i]['context'] for i in idx])
        self.related_questions = np.array(
            [self.data.train_data[i]['qs'] for i in idx])

    def _load_model(self):
        # load model
        num_chars = self.data.get_num_chars()

        embeddings = get_trimmed_embeddings(DATA + "embedding_data.npz")

        model = NtuModel(model_name=self.model_name,
                         embeddings=embeddings,
                         num_chars=num_chars,
                         batch_size=32,
                         early_stopping=False,
                         k_neg=0)
        model.build()
        saver = tf.train.Saver()
        saver.restore(model.sess, tf.train.latest_checkpoint(self.model_name))

        return model

    def get_answer(self, question):
        question_example = self.pre_process.process(question,
                                                    remove_stop_words=False)
        q_word_set = set(question_example)
        question_example = self.data.process_sent(" ".join(question_example))

        filtered_idx = []
        for i in range(len(self.train_c_word_set)):
            if len(q_word_set.intersection(self.train_c_word_set[i])) > 0:
                filtered_idx.append(i)

        context_examples = [
            self.data.process_sent(self.data.tfidf_compressor.compress(c))
            for c in self.train_c[filtered_idx]
        ]

        scores = self.model.get_scores(question_example, context_examples)
        c_max = scores.argsort()[::-1][:10]
        if len(c_max) == 0:
            return "There is no answer for that.", ["None"]

        top_related_questions = self.related_questions[filtered_idx][c_max]
        top_original_context = self.all_train_contexts[filtered_idx][c_max]

        # process top related questions
        related_question_examples = [
            self.data.process_sent(i[0]) for i in top_related_questions
        ]

        q_closet = self._arg_closest_related_questions(
            question_example, related_question_examples)
        return top_original_context[q_closet], top_related_questions[q_closet]

    def _arg_closest_related_questions(self, question, related_questions):
        all_question = [question] + related_questions
        q_char_ids, q_word_ids = zip(*[zip(*zip(*x)) for x in all_question])

        padded_q_word_ids, q_sequence_lengths = pad_sequences(q_word_ids,
                                                              pad_tok=0)
        padded_q_char_ids, q_word_lengths = pad_sequences(q_char_ids,
                                                          pad_tok=0,
                                                          nlevels=2)

        feed_dict = {
            self.model.q_word_ids: padded_q_word_ids,
            self.model.q_char_ids: padded_q_char_ids,
            self.model.q_sequence_lengths: q_sequence_lengths,
            self.model.q_word_lengths: q_word_lengths,
            self.model.keep_op: 1.0,
            self.model.is_training: False
        }
        question_embeddings = self.model.sess.run(self.model.q_dense,
                                                  feed_dict=feed_dict)
        q = question_embeddings[0]  # 1, 300
        rq = question_embeddings[1:]
        scores = np.sum(np.square(rq - q), axis=-1)

        q_min = scores.argsort()[0]
        return q_min
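
A hypothetical usage sketch of the class above (the model_name and dataset values are made-up placeholders, not taken from the project):

inference = Inference(model_name="my_trained_model", dataset="faq")
context, related = inference.get_answer("How do I reset my password?")
print(context)
print(related)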
Example 20
    def run(self,
            num_kernels=[15, 15],
            kernel_sizes=[(11, 11), (5, 5)],
            batch_size=50,
            epochs=20,
            optimizer='RMSprop'):

        optimizerData = {}
        optimizerData['learning_rate'] = 0.001
        optimizerData['rho'] = 0.9
        optimizerData['epsilon'] = 1e-4
        optimizerData['momentum'] = 0.9

        print '... Loading data'

        # load in and process data
        preProcess = PreProcess()
        data = preProcess.run()
        train_set_x, train_set_y = data[0], data[3]
        valid_set_x, valid_set_y = data[1], data[4]
        test_set_x, test_set_y = data[2], data[5]

        print train_set_x.eval().shape

        print '... Initializing network'

        # training parameters
        self.n_sports = np.max(train_set_y.eval()) + 1

        # print error if batch size is too large
        if valid_set_y.eval().size < batch_size:
            print 'Error: Batch size is larger than size of validation set.'

        # compute batch sizes for train/test/validation
        n_train_batches = train_set_x.get_value(borrow=True).shape[0]
        n_test_batches = test_set_x.get_value(borrow=True).shape[0]
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
        n_train_batches /= batch_size
        n_test_batches /= batch_size
        n_valid_batches /= batch_size

        # symbolic variables
        x = T.matrix('x')  # input image data
        y = T.ivector('y')  # input label data

        self.model(batch_size, num_kernels, kernel_sizes, x, y)

        # Initialize parameters and functions
        cost = self.layer3.negative_log_likelihood(y)  # Cost function
        params = self.params  # List of parameters
        grads = T.grad(cost, params)  # Gradient
        index = T.lscalar()  # Index

        # Initialize optimizer
        updates = self.init_optimizer(optimizer, cost, params, optimizerData)

        # Training model
        train_model = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]
            })

        # Validation function
        validate_model = theano.function(
            [index],
            self.layer3.errors(y),
            givens={
                x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                y: valid_set_y[index * batch_size:(index + 1) * batch_size]
            })

        # Test function
        test_model = theano.function(
            [index],
            self.layer3.errors(y),
            givens={
                x: test_set_x[index * batch_size:(index + 1) * batch_size],
                y: test_set_y[index * batch_size:(index + 1) * batch_size]
            })

        def solve():
            costs = []
            for i in xrange(n_train_batches):
                costs.append(train_model(i))

                #if i % 1000 ==0:
                #    print i
            return costs

        # Solver
        try:
            print '... Solving'
            start_time = time.time()
            for epoch in range(epochs):
                t1 = time.time()
                costs = solve()
                validation_losses = [
                    validate_model(i) for i in xrange(n_valid_batches)
                ]
                t2 = time.time()
                print "Epoch {}    NLL {:.2}    %err in validation set {:.1%}    Time (epoch/total) {:.2}/{:.2} mins".format(
                    epoch + 1, np.mean(costs), np.mean(validation_losses),
                    (t2 - t1) / 60., (t2 - start_time) / 60.)
        except KeyboardInterrupt:
            print '... Exiting solver'
        # Evaluate performance
        test_errors = [test_model(i) for i in range(n_test_batches)]
        print "test errors: {:.1%}".format(np.mean(test_errors))
Example 21
from model.seq2seq_attention import encoder_model, decoder_model, inference
from pre_process import PreProcess
from tensorflow.keras.layers import Input
import argparse
import os

process = PreProcess('./data/qingyun.tsv')

# define params
embedding_dim = 50
units = 256


def _main(args):
    sentence = args.sentence
    encoder_weights_path = args.encoder_weights_path
    decoder_weights_path = args.decoder_weights_path
    if not os.path.exists(encoder_weights_path) or not os.path.exists(decoder_weights_path):
        raise ValueError('weights path should exist')
    # get model
    encoder_input = Input((process.q_lenght,))
    encoder = encoder_model(encoder_input, process.q_vocab_size, embedding_dim, units)

    decoder_input, hidden_input, encoder_output_input = Input((1,)), Input((units,)), Input((process.q_lenght, units))
    decoder = decoder_model(decoder_input, hidden_input, encoder_output_input, process.a_vocab_size, embedding_dim,
                            units)
    # load model weights
    encoder.load_weights(encoder_weights_path, by_name=True)
    decoder.load_weights(decoder_weights_path, by_name=True)
    result, sentence = inference(process, encoder, decoder, sentence)
    print(sentence + '-->' + result.replace(' ', ''))
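
The argparse wiring is not included in the snippet above; a minimal sketch of what it presumably looks like, based on the attribute names _main() reads (the default weight paths are assumptions borrowed from the training script in Example 16):

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Run seq2seq inference on a single sentence')
    parser.add_argument('--sentence', required=True)
    parser.add_argument('--encoder_weights_path', default='models/encoder.h5')
    parser.add_argument('--decoder_weights_path', default='models/decoder.h5')
    _main(parser.parse_args())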