def __init__(self, X_train, X_test, y_train, y_test, tags):
        self.y_train = y_train
        self.y_test = y_test
        self.tags = tags
        # The limit keeps only the vectors of the 500,000 most frequent words, which speeds up loading a little.
        glove_model = KeyedVectors.load_word2vec_format(
            "pretrained_vectors/gensim_glove_vectors.txt",
            binary=False,
            limit=500000)
        # Precompute the L2-normalized vectors (fills model.syn0norm); replace=True drops the raw vectors to save memory
        glove_model.init_sims(replace=True)

        print("Done Processing Pretrained Vectors")

        pre = PreProcessing()

        test_tokenized = X_test.apply(lambda item: self.tokenize_text(item))
        train_tokenized = X_train.apply(lambda item: self.tokenize_text(item))

        self.X_train_word_average = pre.word_averaging_list(
            glove_model, train_tokenized)
        self.X_test_word_average = pre.word_averaging_list(
            glove_model, test_tokenized)

        print("Done Applying Pretrained Vectors")
Example #2
    def main(self):
        preprocess = PreProcessing()
        self.wordsList, self.wordVectors = preprocess.load_glove()
        data, labels, types = preprocess.load_mutations()
        self.numClasses = len(types)

        # create a dictionary mapping each mutation type to an integer label
        count = 0
        for i in types:
            dic = {i: count}
            self.types.update(dic)
            count = count + 1

        train_seqs, test_seqs, train_labels, test_labels = self.normalize_data(
            data, labels)

        # Print details about the data
        classes = np.sort(np.unique(train_labels))
        print("\n=================================\nData details:")
        print("- Training-set:\t{}".format(len(train_seqs)))
        print("- Test-set:\t\t{}".format(len(test_seqs)))
        print("- Classes:\t\t{}".format(classes))
        print("=================================\n\n")

        self.prepare_lstm(train_seqs, train_labels, test_seqs, test_labels)
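
# The type-to-integer mapping built by the loop above can be expressed with enumerate;
# a small equivalent sketch (the type names are illustrative):
types = ["missense", "nonsense", "silent"]
type_to_int = {t: i for i, t in enumerate(types)}
# {'missense': 0, 'nonsense': 1, 'silent': 2}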
Example #3
    def run_experiment(self):
        '''
        Run the specified experiments.

        return: dict with the metrics of the last model evaluated
        '''

        pre = PreProcessing()
        ds = DataSource()
        met = Metrics()
        print('Reading Data')
        train_df = ds.read_data(train=True)
        test_df = ds.read_data(train=False)
        y_test = test_df['y']
        print('Preprocessing train data')
        X_train, y_train = pre.preprocess(train_df, train=True)
        print('Preprocessing test data')
        X_test = pre.preprocess(test_df[pre.train_features], train=False)
        print('Training model')
        models = Experiments().train_model(X_train, y_train)
        print('Running metrics')
        for model_name, model in models.items():
            print(model_name)
            y_pred = model.predict(X_test)
            metrics = met.calculate_classification(model_name, y_test,
                                                   pd.Series(y_pred))
            print(metrics)
            pd.DataFrame.from_dict(
                metrics,
                orient='index').to_csv('../output/' + model_name + '.csv')
        return metrics
Example #4
def music_test():

    # Load and normalize the music dataset
    dataset = PreProcessing("default_features_1059_tracks.txt")
    dataset.normalize(ignore_first_column=False)

    # Attributes to vary in the test
    n_layers = [1, 2]
    hidden_layer = [20, [10, 10]]
    momentums = [0.3, 0.5, 0.7]
    max_iterations = [100, 250, 500]
    etas = [0.3, 0.5, 0.7]
    ps = [0.5, 0.7, 0.9]

    # Test
    for layer in n_layers:
        for momentum in momentums:
            for eta in etas:
                for max_iteration in max_iterations:
                    for p in ps:
                        train, test = training.holdout(
                            p, dataset.normalized_dataframe)
                        example = test.values.tolist()
                        print("INPUT NEURONS = 68 HIDDEN NEURONS = " +
                              str(hidden_layer[layer - 1]) +
                              " OUTPUT NEURONS = 2 HIDDEN LAYER = " +
                              str(layer) + " ETA = " + str(eta) +
                              " MAX ITERATIONS = " + str(max_iteration) +
                              " MOMENTUM = " + str(momentum) + " P = " +
                              str(p))
                        print()
                        nn = Mlp(68,
                                 hidden_layer[layer - 1],
                                 2,
                                 n_hidden_layers=layer)
                        nn.backpropagation(train.values.tolist(),
                                           eta=eta,
                                           max_iterations=max_iteration)
                        print("SQUARED ERROR =",
                              training.squared_error(nn, test, n_classes=2))
                        print()

                        print("Input 1")
                        nn.feed_forward(example[0][:(-1 * 2)])
                        print(example[0])
                        print("Result 1")
                        nn.show_class()
                        print()

                        print("Input 2")
                        print(example[15])
                        nn.feed_forward(example[15][:(-1 * 2)])
                        print("Result 2")
                        nn.show_class()
                        print()
                        print(
                            "******************************************************//******************************************************"
                        )
                        print()
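
# The five nested loops above enumerate every hyperparameter combination; itertools.product
# expresses the same grid more compactly. A sketch of the pattern, not a drop-in replacement:
from itertools import product

n_layers, momentums, etas = [1, 2], [0.3, 0.5, 0.7], [0.3, 0.5, 0.7]
max_iterations, ps = [100, 250, 500], [0.5, 0.7, 0.9]

for layer, momentum, eta, max_iteration, p in product(n_layers, momentums, etas,
                                                      max_iterations, ps):
    pass  # same training / evaluation body as in music_test above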
Example #5
    def process(self, frame, name="TrainingSamples/Image_"):
        # preprocessing for contour detection
        preprocessed = PreProcessing().background_contour_removal(frame)

        # find contours using algorithm by Suzuki et al. (1985)
        contours, hierarchy = cv.findContours(preprocessed, cv.RETR_TREE,
                                              cv.CHAIN_APPROX_NONE)

        # limit observed contours
        if len(contours) > 500:
            contours = contours[:500]

        # ignore the first contour, as it is the outer border of the frame
        contours = contours[1:]
        hierarchy = hierarchy[0][1:] - 1
        hierarchy = np.where(hierarchy < 0, -1, hierarchy)

        if len(contours) == 0:
            return preprocessed

        # initialize contour object from each contour in contour list
        binarized = PreProcessing().custom_binarize(frame)
        contourList = [
            Contour(contour=cnt, imgShape=frame.shape, frameBinary=binarized)
            for cnt in contours
        ]

        # filter, classify and group segmented contours
        sg = Segmentation(contourList, hierarchy, frame.shape)
        sg.group_and_classify()

        filtered = sg.get_contours()

        if len(filtered) == 0:
            return preprocessed

        # convert the preprocessed image to BGR so coloured debug output can be drawn on it
        preprocessed = cv.cvtColor(preprocessed, cv.COLOR_GRAY2BGR)

        lines = LineOrdering(filtered).get_lines(frame)

        # label contours with additional positional information
        lines = sg.label_contours(lines)

        for l, line in enumerate(lines):
            for i, cnt in enumerate(line):
                cv.putText(frame,
                           str(l) + str(i), (cnt.center[0], cnt.center[1]),
                           cv.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 1)

        solutions = [
            self.solver.solve([cnt.unwrap() for cnt in line], frame)
            for line in lines if len(line) > 2
        ]

        return preprocessed  # orderedImage
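
# Note: cv.findContours returns (image, contours, hierarchy) in OpenCV 3.x but only
# (contours, hierarchy) in OpenCV 4.x; taking the last two elements of the result works
# for both. A small compatibility sketch with a placeholder image:
import cv2 as cv
import numpy as np

preprocessed = np.zeros((100, 100), dtype=np.uint8)
result = cv.findContours(preprocessed, cv.RETR_TREE, cv.CHAIN_APPROX_NONE)
contours, hierarchy = result[-2:]  # version-independent unpacking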
Example #6
    def get(self, preprocessing, c):
        global X_train
        global y_train
        global X_test
        global y_test

        if preprocessing == 'StandardScaler':
            X_train, X_test = PreProcessing.standard_scaler(X_train, X_test)
        if preprocessing == 'MinMaxScaler':
            X_train, X_test = PreProcessing.min_max_scaler(X_train, X_test)

        return Model.train_and_test(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, c=c)
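
# A minimal sketch of what PreProcessing.standard_scaler / min_max_scaler presumably wrap
# (sklearn assumed): fit the scaler on the training split only, then apply the same
# transform to the test split so no test information leaks into the scaling.
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def standard_scaler(X_train, X_test):
    scaler = StandardScaler().fit(X_train)  # fit on training data only
    return scaler.transform(X_train), scaler.transform(X_test)

def min_max_scaler(X_train, X_test):
    scaler = MinMaxScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)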
Example #7
def main():

    dataset = PreProcessing("wine_dataset.txt")
    dataset.normalize(ignore_first_column=True)
    dataset.switch_first_last_column()
    dataset.normalize_class()

    train, test = training.holdout(0.7, dataset.normalized_dataframe)

    nn = Mlp(13, 10, 3, n_hidden_layers=1)
    nn.backpropagation(train.values.tolist(), eta=0.5)

    example = test.values.tolist()
    print(len(example))
    input()
    #print(example)
    #print(example[17])
    #feed example
    nn.feed_forward(example[0][:(-1 * 3)])
    print(example[0])
    nn.show_class()

    nn.feed_forward(example[40][:(-1 * 3)])
    print(example[40])
    print(test.iloc[[40]].values.tolist())
    input()
    nn.show_class()

    nn.feed_forward(example[31][:(-1 * 3)])
    print(example[31])
    nn.show_class()

    print(training.accuracy(nn, test, n_classes=3))
    """
Beispiel #8
0
def wine_test():
    # Load and normalize the wine dataset
    dataset = PreProcessing("wine_dataset.txt")
    dataset.normalize(ignore_first_column=True)
    dataset.switch_first_last_column()
    dataset.normalize_class()

    # Attributes to vary in the tests
    n_layers = [1, 2]
    hidden_layer = [10, [5, 5]]
    momentums = [0.3, 0.5, 0.7]
    max_iterations = [100, 250, 500]
    etas = [0.3, 0.5, 0.7]
    ps = [0.5, 0.7, 0.9]

    # Test
    for layer in n_layers:
        for momentum in momentums:
            for eta in etas:
                for max_iteration in max_iterations:
                    for p in ps:
                        train, test = training.holdout(
                            p, dataset.normalized_dataframe)
                        example = test.values.tolist()
                        print("INPUT NEURONS = 13 HIDDEN NEURONS = " +
                              str(int(10 / layer)) +
                              " OUTPUT NEURONS = 3 HIDDEN LAYER = " +
                              str(layer) + " ETA = " + str(eta) +
                              " MAX ITERATIONS = " + str(max_iteration) +
                              " MOMENTUM = " + str(momentum) + " P = " +
                              str(p))
                        print()
                        nn = Mlp(13,
                                 hidden_layer[layer - 1],
                                 3,
                                 n_hidden_layers=layer)
                        nn.backpropagation(train.values.tolist(),
                                           eta=eta,
                                           max_iterations=max_iteration)
                        print("ACCURACY =",
                              training.accuracy(nn, test, n_classes=3))
                        print()

                        print("Input 1")
                        nn.feed_forward(example[0][:(-1 * 3)])
                        print(example[0])
                        print("Result 1")
                        nn.show_class()
                        print()

                        print("Input 2")
                        print(example[15])
                        nn.feed_forward(example[15][:(-1 * 3)])
                        print("Result 2")
                        nn.show_class()
                        print()
                        print(
                            "******************************************************//******************************************************"
                        )
                        print()
Example #9
def get_data():
    """Uses EntsoeDownloader and PreProcessing class to initiate the scraping and preprocessing process
    Parameters
    ----------
    Returns
    -------
    df
        A dataframe in form of json in order to dispatch to next services
    prices
        A list of electricity prices of the past 24 hours
    timestamps
        A list of timestamps of the past 24 hours and the next 24 hours (following day)
    """

    date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    print("Todays date & type: ", date, " & ", type(date))

    # Initiate Web Scraping
    # Feed ENTSOE Account details into the EntsoeDownloader class
    downloader = EntsoeDownloader(date, "username",
                                  "password").setup(headless=True)
    downloader.login_and_download()

    # Initiate PreProcessing
    pre_processing = PreProcessing()
    pre_processing.start_preprocess()

    df = pd.read_csv(os.getcwd() + "/download/final_dataset_kafka.csv")

    # Get Day-Ahead price and generate new dates
    temp = df[[df.columns[0], "Day-ahead Price [EUR/MWh]"]].dropna()
    temp.rename(columns={df.columns[0]: "cet_timestamp"}, inplace=True)
    temp["cet_timestamp"] = pd.to_datetime(temp["cet_timestamp"],
                                           format="%Y-%m-%d %H:%M")
    temp.set_index("cet_timestamp", inplace=True)

    time = df[df.columns[0]][-24:].values
    last_date = temp.index[-1:][0]

    timestamp_list = list(time)
    for i in range(1, 25):
        last_date += timedelta(hours=1)
        timestamp_list.append(last_date.strftime("%Y-%m-%d %H:%M:%S"))

    df = df.to_json(orient="split")
    price_list = list(temp["Day-ahead Price [EUR/MWh]"][-24:].values)

    return df, price_list, timestamp_list
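
# The hour-by-hour extension of the timestamp list above can also be produced with
# pandas.date_range; a small equivalent sketch (the starting timestamp is illustrative):
import pandas as pd
from datetime import timedelta

last_date = pd.Timestamp("2021-01-01 23:00:00")
next_day = pd.date_range(last_date + timedelta(hours=1), periods=24, freq="H")
extra_timestamps = [ts.strftime("%Y-%m-%d %H:%M:%S") for ts in next_day]
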
def preprocess_dataset(pca_processing: Union[KPCAPreprocessing,
                                             PCAPreprocessing],
                       preprocessing: PreProcessing,
                       dataset: np.ndarray) -> Any:
    ret_list = []
    for data_i in dataset:
        stnd_img = preprocessing.regular_preprocess(data_i)
        ret_list.append(pca_processing.apply_method(stnd_img))

    return ret_list
    def model_training(self):
        pre = PreProcessing()
        print('Reading data')
        df = self.data.read_data(train=True)
        print('Preprocessing data')
        X_train, y_train = pre.preprocess(df, train=True)
        print('Training model')
        steps = [('over', SMOTE()), ('model', CatBoostClassifier())]
        pipeline = Pipeline(steps=steps)
        pipeline.fit(X_train, y_train)
        modelo = pipeline['model']
        model = {
            'model': modelo,
            'preprocessing': pre,
            'columns': pre.feature_names
        }
        print(model)
        dump(model, '../output/modelo.pkl')
        return model
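
# Note: SMOTE is an imbalanced-learn resampler, so the Pipeline used above has to be
# imblearn.pipeline.Pipeline rather than sklearn's (sklearn's Pipeline rejects steps that
# implement fit_resample). A minimal sketch of that combination:
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

pipeline = Pipeline(steps=[('over', SMOTE()),
                           ('model', CatBoostClassifier(verbose=0))])
# pipeline.fit(X_train, y_train)  # resampling is applied to the training data only
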
def train_with_svm(
    dataset: np.ndarray, labels: np.ndarray, names: np.ndarray,
    classifier: Classifier, is_pca: bool
) -> Tuple[PreProcessing, Union[KPCAPreprocessing, PCAPreprocessing]]:
    preprocessing = PreProcessing(dataset, dataset.shape[1], dataset.shape[2],
                                  dataset.shape[3])

    c_matrix: np.ndarray
    if is_pca:
        c_matrix = np.matmul(preprocessing.training_set,
                             preprocessing.training_set.T)
    else:
        c_matrix = KPCAPreprocessing.get_kernel_pol_method(
            preprocessing.training_set)

    # Uses QR method to get eigenvalues and eigenvectors
    eigenvalues, eigenvectors = qr_eig_algorithm(c_matrix)
    total = np.sum(np.abs(eigenvalues))

    acum = 0
    i = 0
    while acum < PRECISION:
        acum += eigenvalues[i] / total
        i = i + 1

    print(
        f"Keeping {i} eigenvectors captures a cumulative variance ratio of {round(acum, 4)}"
    )

    # Grab the first i eigenvectors
    eigenvectors = eigenvectors[:i]

    if is_pca:
        processing = PCAPreprocessing(preprocessing.training_set,
                                      preprocessing.avg_face, eigenvectors,
                                      dataset.shape[1], dataset.shape[2],
                                      dataset.shape[3], names, labels)
    else:
        processing = KPCAPreprocessing(preprocessing.training_set,
                                       preprocessing.avg_face, eigenvectors,
                                       dataset.shape[1], dataset.shape[2],
                                       dataset.shape[3], names, labels,
                                       c_matrix)
    # Feature scaling
    sc = StandardScaler()
    scaled_training_set = sc.fit_transform(processing.training_set)

    # Train classifier with default C and gamma values
    classifier.train_classifier(scaled_training_set, labels)
    print("Training done!")
    classifier.save(preprocessing, processing)
    return preprocessing, processing
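
# The while loop above keeps eigenvectors until their share of the total absolute
# eigenvalue mass reaches PRECISION; a small self-contained sketch of that selection rule
# (the eigenvalues and threshold are illustrative and assumed sorted by magnitude):
import numpy as np

PRECISION = 0.9
eigenvalues = np.array([5.0, 3.0, 1.5, 0.4, 0.1])
ratios = np.abs(eigenvalues) / np.sum(np.abs(eigenvalues))
k = int(np.searchsorted(np.cumsum(ratios), PRECISION) + 1)
print(f"{k} eigenvectors capture {np.cumsum(ratios)[k - 1]:.4f} of the variance")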
Example #13
    def __init__(self):
        data_src = './data_repository/geological_similarity/'
        self.model_path = './trained_model/model_triplet/'

        self.dataset = PreProcessing(data_src)
        self.model = SiameseNetwork()

        # Define Tensor
        self.img_placeholder = tf.placeholder(tf.float32, [None, 28, 28, 3],
                                              name='img')
        self.net = self.model.conv_net(self.img_placeholder, reuse=False)
        self.normalized_training_vectors = self.generate_db_normed_vectors()
        print('Prediction object loaded successfully.')
Example #14
def main():

    preProcessing = PreProcessing("mnist_train.csv")
    #preProcessing.preProcessData()

    # number of hidden units
    processing = Processing(10)
    processing.load_data("mnist_train_scaled.csv",
                         "mnist_train_targetClass.csv")

    processing.processing()

    for arg in sys.argv[1:]:
        print(arg)
def testKMEANS():
    data = pd.read_table("datasets/mouse.txt", engine='python', sep=',', header=None)
    # pp.scale returns a plain array, so wrap it back into a DataFrame to set column names
    # TODO: set standard names for N columns or use pandas' default column names
    data = pd.DataFrame(pp.scale(data), columns=['x', 'y'])
    # instantiating kmeans
    km = K_means(data, k=3, alpha=0.001)

    # updating algorithm
    km.update()

    # plotting the final result
    km.plot()
Example #16
def train_with_svm(dataset_train, labels_train, classifier, is_pca, names):
    preprocessing = PreProcessing(dataset_train, dataset_train.shape[1],
                                  dataset_train.shape[2],
                                  dataset_train.shape[3])
    # Over this matrix we need to calculate eigenvectors
    if is_pca:
        C_matrix = np.matmul(preprocessing.training_set,
                             preprocessing.training_set.T)
    else:
        C_matrix = KPCAPreprocessing.rbf_kernel_pca(preprocessing.training_set)

    # Uses np.linalg.eig to get the eigenvalues and eigenvectors
    eigenvalues, eigenvec = np.linalg.eig(C_matrix)
    total = np.sum(np.abs(eigenvalues))

    accumulated = 0
    i = 0
    while accumulated < 0.50:
        accumulated = accumulated + eigenvalues[i] / total
        i = i + 1
    print(
        f"Keeping {i} eigenvectors captures a cumulative variance ratio of {round(accumulated, 4)}"
    )
    print("Training...")

    # Grab the first i eigenvectors
    eigenvectors = eigenvec[:i]

    if is_pca:
        # Apply PCA transformation to the training data
        pca_processing = PCAPreprocessing(preprocessing.training_set,
                                          preprocessing.avg_face, eigenvectors,
                                          dataset_train.shape[1],
                                          dataset_train.shape[2],
                                          dataset_train.shape[3], names,
                                          labels_train)
    else:
        # Apply KPCA transformation to the training data
        pca_processing = KPCAPreprocessing(
            preprocessing.training_set, preprocessing.avg_face, eigenvectors,
            dataset_train.shape[1], dataset_train.shape[2],
            dataset_train.shape[3], names, labels_train, C_matrix)

    # Train classifier with default C and gamma values
    classifier.train_classifier(pca_processing.training_set, labels_train)

    classifier.save(preprocessing, pca_processing)
    return preprocessing, pca_processing
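
# np.linalg.eig does not guarantee any ordering of its eigenvalues, and it returns the
# eigenvectors as columns, so the cumulative loop above implicitly assumes a sorted
# spectrum. A hedged sketch of sorting by absolute magnitude before accumulating:
import numpy as np

C_matrix = np.array([[2.0, 1.0], [1.0, 2.0]])  # illustrative symmetric matrix
eigenvalues, eigenvectors = np.linalg.eig(C_matrix)
order = np.argsort(np.abs(eigenvalues))[::-1]  # largest magnitude first
eigenvalues = eigenvalues[order]
eigenvectors = eigenvectors[:, order]          # reorder the columns accordingly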
Example #17
def main():

    #read dataset and preprocess it
    dataset = PreProcessing("seeds_dataset.txt", separator='\s+')
    dataset.normalize()
    dataset.normalize_class()

    #divide dataset into training and test sets
    train, test = training.holdout(0.7, dataset.normalized_dataframe)

    nn = Rbf(7, 3)

    nn.train(train, eta=0.5, max_iterations=500)

    print("RBF:", training.accuracy(nn, test, 3))

    mm = Mlp(7, 3, 3)

    mm.backpropagation(train.values.tolist(), max_iterations=500)
    print("MLP:", training.accuracy(mm, test, 3))
Example #18
FLAGS = flags.FLAGS
flags.DEFINE_integer('batch_size', 150, 'Batch size.')
flags.DEFINE_integer('train_iter', 100, 'Total training iter')
flags.DEFINE_integer('step', 50, 'Save after ... iteration')
flags.DEFINE_float('learning_rate', 0.01, 'Learning rate')
flags.DEFINE_float('momentum', 0.99, 'Momentum')
flags.DEFINE_string('model', 'conv_net', 'model to run')
flags.DEFINE_string('data_src', r'C:\OData',
                    'source of training dataset')  #Low_data OData all

tf.compat.v1.disable_eager_execution()

if __name__ == "__main__":

    # Setup Dataset
    dataset = PreProcessing(FLAGS.data_src)
    model = TripletLoss()
    placeholder_shape = [None] + list(dataset.images_train.shape[1:])
    print("placeholder_shape", placeholder_shape)

    # Setup Network
    next_batch = dataset.get_triplets_batch
    anchor_input = tf.placeholder(tf.float32,
                                  placeholder_shape,
                                  name='anchor_input')
    positive_input = tf.placeholder(tf.float32,
                                    placeholder_shape,
                                    name='positive_input')
    negative_input = tf.placeholder(tf.float32,
                                    placeholder_shape,
                                    name='negative_input')
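
# A minimal sketch of the triplet loss that the TripletLoss model presumably optimizes:
# pull the anchor towards the positive and push it away from the negative by at least a
# margin. Names and the margin value are illustrative, not the project's implementation.
import tensorflow as tf

def triplet_loss(anchor, positive, negative, margin=0.5):
    # squared Euclidean distances between the embeddings
    d_pos = tf.reduce_sum(tf.square(anchor - positive), axis=1)
    d_neg = tf.reduce_sum(tf.square(anchor - negative), axis=1)
    return tf.reduce_mean(tf.maximum(d_pos - d_neg + margin, 0.0))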
Example #19
from get_data import GetData
from preprocessing import PreProcessing
from autoencoder import AutoEncoder
from data_processing import DataProcessing
from model import NeuralNetwork

data = GetData("AAPL", "2000-01-01", "2018-10-01")
data.get_stock_data()
preprocess = PreProcessing(0.8, 0.25)
preprocess.make_wavelet_train()
preprocess.make_test_data()
autoencoder = AutoEncoder(20)
autoencoder.build_train_model(55, 40, 30, 30, 40)
process = DataProcessing(0.8, 0.25)
process.make_test_data()
process.make_train_data()
process.make_train_y()
process.make_test_y()
model = NeuralNetwork(20, True)
model.make_train_model()
Example #20
def seed_test():
    # Load and normalize the seeds dataset
    dataset = PreProcessing("seeds_dataset.txt", separator=r'\s+')
    dataset.normalize()
    dataset.normalize_class()

    # Attributes to vary in the tests
    n_layers = [1, 2]
    hidden_layer = [3, [6, 6]]
    momentums = [0.3, 0.5]
    max_iterations = [100, 250, 500]
    etas = [0.3, 0.5]
    ps = [0.7, 0.9]

    rbf_accuracy = 0
    mlp_accuracy = 0
    tests = 0

    # Test
    for layer in n_layers:
        for momentum in momentums:
            for eta in etas:
                for max_iteration in max_iterations:
                    for p in ps:
                        tests += 1

                        print("Test number", tests)

                        train, test = training.holdout(
                            p, dataset.normalized_dataframe)
                        print("INPUT NEURONS = 7 HIDDEN NEURONS = " +
                              str(hidden_layer[layer - 1]) +
                              " OUTPUT NEURONS = 3 HIDDEN LAYER = " +
                              str(layer) + " ETA = " + str(eta) +
                              " MAX ITERATIONS = " + str(max_iteration) +
                              " MOMENTUM = " + str(momentum) + " P = " +
                              str(p))
                        print()
                        print("RBF")

                        nn = Rbf(7, 3)

                        nn.train(train, eta=0.5, max_iterations=max_iteration)
                        ac = training.accuracy(nn, test, 3)
                        rbf_accuracy += ac
                        print("ACCURACY =", ac)

                        print()
                        print("MLP")
                        example = test.values.tolist()

                        mm = Mlp(7,
                                 hidden_layer[layer - 1],
                                 3,
                                 n_hidden_layers=layer)
                        mm.backpropagation(train.values.tolist(),
                                           eta=eta,
                                           max_iterations=max_iteration)
                        ac = training.accuracy(mm, test, n_classes=3)
                        mlp_accuracy += ac
                        print("ACCURACY =", ac)
                        print()

                        print("Rbf:")
                        nn.feed_forward(example[15][:(-1 * 3)])
                        print(example[15])
                        print("Result 1")
                        nn.show_class()
                        print()

                        print("Mlp")
                        print(example[15])
                        mm.feed_forward(example[15][:(-1 * 3)])
                        print("Result 2")
                        mm.show_class()
                        print()
                        print(
                            "******************************************************//******************************************************"
                        )
                        print()

    print(tests, " tests executed. Rbf accuracy:", rbf_accuracy / tests,
          " Mlp accuracy:", mlp_accuracy / tests)
Example #21
from utils import LoadData
from preprocessing import PreProcessing
from visuals import ClassifierVisual

# Import model library
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

# Import data
dataset = LoadData("Social_Network_Ads.csv").data

# Split the dataset
X = dataset.iloc[:, [2,3]].values
y = dataset.iloc[:, 4].values

# Lets do some preprocessing...
processor = PreProcessing()
# Split the data
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.25)
# scale the data
X_train = processor.fit_scaler(X_train)
X_test = processor.scale(X_test)

# Lets fit the model now
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# Predict!
y_pred = classifier.predict(X_test)

# Creating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
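
# The confusion matrix above already contains everything needed for a quick accuracy
# figure; a small sketch deriving it (the matrix values are illustrative, and
# sklearn's accuracy_score(y_test, y_pred) gives the same number):
import numpy as np

cm = np.array([[55, 3], [5, 37]])
accuracy = np.trace(cm) / cm.sum()  # (TP + TN) / total predictions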
Example #22
#############################################################################################################################

# preprocessing, make or load and save dataset
############################################################################################################################# 
if LOAD_DATA == True:

    dataset = load_dataset(DATASET_PATH)
    if SPLIT_DATASET:
        valid_dataset = load_dataset(VALID_DATASET_PATH)

else:

    # preprocessing
    source_path = r'./data/mnist-in-csv/mnist_test.csv'
    target_path = None
    sources, targets = PreProcessing(source_path, target_path, mode='csv')

    # make dataset
    ##############################################################################

    transform = transforms.Compose([self_transform()])

    if SPLIT_DATASET:
        pivot = int(len(sources) * SPLIT_RATIO)
        train_sources = sources[:pivot]
        train_targets = targets[:pivot]

        valid_sources = sources[pivot:]
        valid_targets = targets[pivot:]

        valid_dataset = Mydataset(valid_sources,valid_targets,transform)      
from flask import Flask, jsonify, request
from flask_cors import CORS
from preprocessing import PreProcessing
from seq2seq import TextSummarization
from config import *
import nltk
import pickle
import numpy as np

app = Flask(__name__)
CORS(app)

preprocessing = PreProcessing()
model = TextSummarization(forward_only=True)
file = open(WORD_DICT_PATH, "rb")
word_dict = pickle.load(file, encoding="utf-8")
reversed_word_dict = dict(zip(word_dict.values(), word_dict.keys()))


@app.route('/get_summary', methods=['POST'])
def get_summary():
    data = request.get_json()
    text = data["text"]
    clean_text = preprocessing.clean_string(text)
    x = nltk.word_tokenize(clean_text)
    x = [word_dict.get(d, word_dict["<unk>"]) for d in x]
    x = x[:(MAX_ARTICLE_LEN - 1)]
    x = [x + (MAX_ARTICLE_LEN - len(x)) * [word_dict["<padding>"]]]
    x = np.array(x)
    summary_text = model.get_summary(x, reversed_word_dict)
from preprocessing import PreProcessing
from sklearn.model_selection import train_test_split
from Models import LR, SVM, NaiveBayes, Word2VecDeep, BOWDeep, RNN
import matplotlib.pyplot as plt

pre = PreProcessing()
data = pre.clean_text()
X = data['post']
y = data['tags']
# Split to 20% test data and 80% training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

nb = NaiveBayes(X_train, X_test, y_train, y_test, pre.tags)
score = nb.train()

#lr = LR(X_train, X_test, y_train, y_test, pre.tags)
#lr.train()

#svm = SVM(X_train, X_test, y_train, y_test, pre.tags)
#svm.train()

#wv = Word2VecDeep(X_train, X_test, y_train, y_test, pre.tags)
#wv.train()

#bow = BOWDeep(X_train, X_test, y_train, y_test, pre.tags)
#print(bow.train())

# Use different function for pre-processing
"""pre = PreProcessing()
X, y = pre.filter_rnn()
# Split to 20% test data and 80% training data
"""this module generate inverted index for each query box, generate the article shelve, offer conjunctive query """

import shelve
import json
from preprocessing import PreProcessing
import time
from nltk import word_tokenize

# instantiate prep to access normalize and flatten, and to create the test corpus
prep = PreProcessing()


def timing(func):
    def wrapper(*args, **kwargs):
        t1 = time.time()
        func(*args, **kwargs)
        t2 = time.time()
        print("Time it took to build this index: " + str((t2 - t1)) + "\n")

    return wrapper


@timing
def main_query_inverted_index(shelvename1,
                              shelvename2,
                              corpus_name='2018_movies.json'):
    """
    create a title + free-text inverted index and store it across two shelve files (a single one cannot hold it all)
    :param shelvename1: String
    :param shelvename2: String
    :param corpus_name: String a json file
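
# A minimal sketch of the inverted-index idea described in the docstring above: map each
# lower-cased token to the set of document IDs containing it and persist the mapping in a
# shelve. The toy corpus and shelve name are illustrative, not the project's schema.
import shelve
from nltk import word_tokenize

corpus = {"doc1": "A quiet movie about time", "doc2": "Time travel movie"}
with shelve.open("toy_index") as index:
    for doc_id, text in corpus.items():
        for token in word_tokenize(text.lower()):
            postings = index.get(token, set())
            postings.add(doc_id)
            index[token] = postings  # shelve persists values only on re-assignment
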
# Import lib files
envs = load_dotenv(find_dotenv())
file = os.getenv("lib")
sys.path.insert(0, file)
from utils import LoadData
from preprocessing import PreProcessing

# Load data
dataset = LoadData("50_Startups.csv").data

# Split the dataset
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# Using the PreProcessing class from preprocessing
processor = PreProcessing()
# Encoding dummy variables
X = processor.dummy_encoding(data=X, feature_position=3)

# Avoiding the dummy variable trap
X = X[:, 1:]

# Building the optimal model using Backward Elimination
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()

X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()
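
# The two manual steps above drop one predictor at a time by reading the OLS summary;
# a hedged sketch of automating backward elimination with a p-value threshold
# (statsmodels assumed, the helper name is illustrative):
import numpy as np
import statsmodels.api as sm

def backward_elimination(X, y, significance=0.05):
    X = np.asarray(X, dtype=float)
    cols = list(range(X.shape[1]))
    while cols:
        model = sm.OLS(endog=y, exog=X[:, cols]).fit()
        worst = int(np.argmax(model.pvalues))
        if model.pvalues[worst] <= significance:
            break                # every remaining predictor is significant
        cols.pop(worst)          # drop the least significant predictor
    return X[:, cols], model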
Example #27
        ps.stem(word) for word in review
        if not word in stopwords.words('english')
    ]
    review = " ".join(review)
    corpus.append(review)

corpus
dataset['cleaned'] = corpus

# Create the Bag of Words model BoW
cv = CountVectorizer(max_features=500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1]

### Build a model using the BoW model
# Lets do some preprocessing...
processor = PreProcessing()
# Split the data
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.20)

# Lets fit the model now
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict!
y_pred = classifier.predict(X_test)

# Creating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm
Example #28
# Import lib files
envs = load_dotenv(find_dotenv())
file = os.getenv("lib")
sys.path.insert(0, file)
from utils import LoadData
from preprocessing import PreProcessing

# Load the data
dataset = LoadData("Salary_Data.csv").data

# Split the dataset
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 1].values

# Using the PreProcessing class from preprocessing
processor = PreProcessing()

# Split the data
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.2, random_state=0)

# Fit Simple Linear Regression model
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predict the test set
y_pred = regressor.predict(X_test)

# Visualizing the data
plt.scatter(X_train, y_train, color='red')
plt.plot(X_train, regressor.predict(X_train), color='blue')
plt.title('Salary vs. Exp.')
Example #29
        accuracy.append(acc)
    average = np.mean(accuracy)
    std = np.std(accuracy)
    ret_acc = []
    for i in range(len(test_y)-1):
        if test_y[i] != 0:
            acc = 100 - (np.abs(predicted_data[i] - test_y[i]))/test_y[i] * 100
            ret_acc.append(acc)
    ret_avg = np.mean(ret_acc)
    ret_std = np.std(ret_acc)
    pd.DataFrame(np.reshape(ret_acc, (len(ret_acc),))).to_csv(return_acc)
    prediction = np.exp(model.predict(np.reshape(test_data[-2], (1, 20))))*price[-2]
    print(prediction)

    return dataset, average, std



# if __name__ == "__main__":
preprocess = PreProcessing(0.8, 0.25, "stock_data.csv",
                           "preprocessing/rbm_train.csv",
                           "preprocessing/rbm_test.csv",
                           "preprocessing/log_train.csv")
preprocess.make_wavelet_train()
preprocess.make_test_data()

# if __name__ == "__main__":
autoencoder = AutoEncoder(20, True, "preprocessing/rbm_train.csv",
                          "preprocessing/rbm_test.csv",
                          "features/autoencoded_data.csv",
                          "preprocessing/log_train.csv")
autoencoder.build_train_model(55, 40, 30, 30, 40)

# if __name__ == "__main__":
dataset, average, std = nnmodel(500, 0.05, 0.01,
                                "features/autoencoded_data.csv",
                                "60_return_forex/encoded_return_test_data.csv",
                                "preprocessing/log_train.csv",
                                "forex_y/log_test_y.csv",
                                "forex_y/test_price.csv",
                                "60_return_forex/predicted_price.csv",
                                "60_return_forex/price.csv",
                                "60_return_forex/ret_acc.csv")
print(f"Price Accuracy Average = {average} \nPrice Accuracy Standard Deviation = {std}")
Example #30
# Import lib files
envs = load_dotenv(find_dotenv())
file = os.getenv("lib")
sys.path.insert(0, file)
from utils import LoadData
from preprocessing import PreProcessing
from visuals import ClassifierVisual

# Import data
dataset = LoadData("Churn_Modelling.csv").data
X = dataset.iloc[:, 3:13].values
y = dataset.iloc[:, 13].values

# Lets do some preprocessing...
processor = PreProcessing()
# Encode the data (Country/Gender)
X[:, 1] = processor.encode(X[:, 1])
X[:, 2] = processor.encode(X[:, 2])
X = processor.hot_encoding(data=X, features=[1])
X = X[:, 1:]

# Split the data into training+test
X_train, X_test, y_train, y_test = processor.split(X, y, test_size=0.2)

# Fitting XGboost
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Predicting the test results
y_pred = classifier.predict(X_test)