Example #1
    def test__check_is_list(self):

        df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv',
                                      header=True)

        Preprocess(df_labels=df_long, columns=['country', 'protein'])

        with self.assertRaises(AssertionError):
            Preprocess(df_labels=df_long, columns='protein')
Example #2
    def test__check_is_spark_data_frame(self):

        df_simple_table = self.spark.read.csv(
            'tests/fixtures/preprocess/simple_table.csv', header=True)
        pd_df_simple_table = pd.read_csv(
            'tests/fixtures/preprocess/simple_table.csv')

        Preprocess(df_labels=df_simple_table, columns=[''])

        with self.assertRaises(AssertionError):
            Preprocess(df_labels=pd_df_simple_table, columns=[''])
Example #3
    def test__check_nulls_in_index_column(self):

        df_nulls = self.spark.read.csv(
            'tests/fixtures/preprocess/nulls_recipe_id.csv', header=True)
        df_no_nulls = self.spark.read.csv(
            'tests/fixtures/preprocess/no_nulls_recipe_id.csv', header=True)

        Preprocess(df_labels=df_no_nulls, columns=[''])

        with self.assertRaises(AssertionError):
            Preprocess(df_labels=df_nulls, columns=[''])
Example #4
    def test__remove_columns(self):

        df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv',
                                      header=True)

        preprocessor_2_columns = Preprocess(df_labels=df_long,
                                            columns=['country', 'protein'])
        preprocessor_2_columns._remove_columns()

        self.assertEqual(len(preprocessor_2_columns.df_labels.columns), 1 + 2)

        preprocessor_all = Preprocess(df_labels=df_long, columns='all')
        preprocessor_all._remove_columns()

        self.assertEqual(len(preprocessor_all.df_labels.columns), 4)
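These two tests pin down the expected behaviour: with columns=['country', 'protein'] the index column plus the two requested label columns survive (1 + 2), and columns='all' keeps every column of the four-column fixture. Below is only a minimal sketch of column handling that would satisfy these tests, and it also reproduces the AssertionError from Example #1 and the 'all' conversion checked in Example #23; the index column name 'recipe_id' (guessed from the fixture names) and all attribute names are assumptions, not the library's actual implementation.

# Hypothetical sketch, not the real Preprocess class.
class Preprocess:
    INDEX_COLUMN = 'recipe_id'  # assumed index column, guessed from nulls_recipe_id.csv

    def __init__(self, df_labels, columns):
        self.df_labels = df_labels
        # 'all' expands to every label column except the index column
        if columns == 'all':
            columns = [c for c in df_labels.columns if c != self.INDEX_COLUMN]
        assert isinstance(columns, list), 'columns must be a list'
        self.columns = columns

    def _remove_columns(self):
        # keep the index column plus the requested label columns
        self.df_labels = self.df_labels.select(self.INDEX_COLUMN, *self.columns)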
Example #5
    def get_environment_state(self):
        state = self.game_environment.get_state()
        img = state.screen_buffer
        img = Preprocess(img).image
        game_vars = state.game_variables

        return img, game_vars
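Preprocess(img).image here wraps a single-frame transformation of the game screen. Below is a minimal sketch of such a wrapper, assuming OpenCV is available and that the buffer arrives as an H x W x C uint8 array; both assumptions, and the target size, are not part of the original snippet.

import cv2
import numpy as np


class Preprocess:
    """Hypothetical frame preprocessor: grayscale, resize, scale to [0, 1]."""

    def __init__(self, frame, size=(84, 84)):
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)   # drop the colour channels
        resized = cv2.resize(gray, size)                 # shrink to the network input size
        self.image = resized.astype(np.float32) / 255.0  # normalise pixel values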
Example #6
def controler_1(loc_fp, cat_fp, mongo_username, mongo_password, dbname,
                collectionname):
    query_dict = GetQuery(loc_fp, cat_fp).get_final_query_dict_bygugun()

    i = 0
    for category_query, local_category_query_list in query_dict.items():
        i += 1
        api_id, api_secret = get_api_id_secret(i)
        display = 10
        category_query_dictlist = NaverRequests(api_id, api_secret,
                                                local_category_query_list,
                                                display).get_category_query()

        if len(category_query_dictlist) < 1:
            print("{} / {} : {}, items : 0 #### Failed".format(
                i, len(query_dict.items()), category_query))
            pass
        else:
            processed_df = Preprocess(category_query,
                                      category_query_dictlist).preprocess()

            save_to_mongodb(processed_df,
                            username=mongo_username,
                            userpassword=mongo_password,
                            dbname=dbname,
                            collectionname=collectionname)

            print("{} / {} : {}, items : {}".format(i, len(query_dict.items()),
                                                    category_query,
                                                    processed_df.shape[0]))
Example #7
def text_preprocess(text):
    pre = Preprocess(text)
    text = pre.clean_text(text)
    text = pre.clean_contractions(text, pre.contraction_mapping)
    text = pre.correct_spelling(text, pre.mispell_dict)
    text = pre.clean_special_chars(text, pre.punct, pre.punct_mapping)
    return text
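A hypothetical call, purely for illustration; the contraction, spelling and punctuation mappings are attributes of the Preprocess instance and are assumed to behave as their names suggest:

raw = "I can't beleive it's alredy working!!"
cleaned = text_preprocess(raw)  # contractions expanded, spelling corrected, special characters cleaned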
Example #8
 def __init__(self, data_path):
     self.path = data_path
     self.preprocess = Preprocess()
     self.gender = []
     self.userid = []
     self.model = Model()
     self.model.load_model()
Example #9
def LoadData():
	print("Preprocess the dataset...", end = ' ')
	preprocess = Preprocess()
	SRC, TRG, tr, valid, ts = preprocess.Build()
	print("DONE")
	
	return SRC,TRG, tr, valid, ts
Example #10
    def __init__(self):
        self.vect = TfidfVectorizer()

        self.data = None
        self.vect_data = None

        self.pre = Preprocess()
Example #11
    def __init__(self):
        cursor.execute("SELECT content FROM data")
        scripts = cursor.fetchall()

        fw = open('vector.txt', 'w')
        fresult = open('result.txt', 'w')
        mPreprocess = Preprocess()
        mPairedToken = PairToken()
        mConvertVector = ConvertVector()
        stanford = StanfordCoreNLP('http://localhost:9000')
        for script in scripts:
            # if type(script) is tuple:
            listToken = mPreprocess.exec(script[0])
            # else:
            #     listToken = mPreprocess.exec(script)

            listCouple = mPairedToken.exec(listToken)

            output = stanford.annotate(script[0], properties={'annotators': 'coref', 'outputFormat': 'json'})

            for mCoupleToken in listCouple:
                if self.checkCoreF(output['corefs'], mCoupleToken):
                    # fresult.write(str(1) + ' ' + mCoupleToken.np1.text + '  ' + mCoupleToken.np2.text)
                    fresult.write(str(1))
                    fresult.write('\n')
                else:
                    # fresult.write(str(-1) + ' ' + mCoupleToken.np1.text + '  ' + mCoupleToken.np2.text)
                    fresult.write(str(-1))
                    fresult.write('\n')

                vector = mConvertVector.exec(mCoupleToken)

                fw.write(str(vector))
                fw.write('\n')
Example #12
 def preprocess_pipeline(self,
                         df,
                         upper_quantile,
                         lower_quantile,
                         target_name,
                         none_values,
                         outlier_column: str = None,
                         polytrans_columns: list = None,
                         corr_percentage=0.7,
                         test_size=0.25,
                         validation=False):
     preprocess = Preprocess(df)
     target_feature = preprocess.target_feature(none_values, target_name)
     if outlier_column is not None:
         preprocess.drop_outliers(outlier_column, upper_quantile,
                                  lower_quantile)
     preprocess.drop_multicoll_columns(ALLOWED_CORR_PER)
     preprocess.imputer()
     if polytrans_columns is not None:
         preprocess.polytrans(polytrans_columns)
     features = preprocess.one_hot_encoder()
     if validation:
         x_train, y_train, x_test, y_test, x_validation, y_validation = train_test_split(
             features, target_feature, test_size, validation)
         return x_train, y_train, x_test, y_test, x_validation, y_validation
     x_train, y_train, x_test, y_test = preprocess.train_test_split(
         features, target_feature, test_size, validation)
     return x_train, x_test, y_train, y_test
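A hypothetical call to preprocess_pipeline, assuming the method lives on some wrapper object pipeline and that df is a pandas DataFrame with a 'price' target and a skewed 'area' column; every name here is invented for illustration:

x_train, x_test, y_train, y_test = pipeline.preprocess_pipeline(
    df,
    upper_quantile=0.99,
    lower_quantile=0.01,
    target_name='price',
    none_values=['NA', 'missing'],
    outlier_column='area',
    polytrans_columns=['area'],
    test_size=0.25,
    validation=False)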
Example #13
def main():
    '''Training of the model on the preprocessed data. '''

    preprocess = Preprocess()
    data = preprocess.getData(
        path="creditcard.csv",  # path of the csv file
        feature_incides=[0, 29],  # column indices of the features
        label_indices=[30],  # column indices of the labels
        training_size=0.5,  # size for the training set 
        standardize=True,  # apply standardization?
        eval_set=True  # create evaluation set?
    )

    model = Model(
        batch_size=10,  # size of the training batch
        epochs=50,  # number of training epochs
        nodes=[29, 200, 2],  # neurons per layer: first entry is the input size,
        # last entry the output size, the values in between are the hidden layers
        learning_rate=0.0001,  # learning rate for the training
        hidden_activation="sigmoid",  # activation for the hidden nodes: "tanh", "sigmoid" or "relu"
        output_activation="linear",  # activation for the output nodes: "tanh", "sigmoid" or "linear"
        data=data,  # the loaded and preprocessed data from the csv file
        do_eval=True  # measure accuracy on the evaluation set?
    )

    model.train()
Example #14
    def getConstraints(self, setnumber=""):
        try:
            preprocess = Preprocess()
            absolute_path = path.join(self._path, self._data['params'])
            count = []
            constraints = []
            for filename in listdir(absolute_path):
                match = re.match(self._patterns['params'], filename)
                if match:
                    if (match.group(2) == setnumber):
                        count.append(match.group(3))
                        with open(path.join(absolute_path, filename),
                                  "r") as c:
                            constraints.append(
                                preprocess.preprocessConstraints(
                                    c.read().split("\n")))
            if constraints == []:
                return {"error": True, "message": "Something's up"}
            return {"error": False, "constraints": constraints, "count": count}
        except FileNotFoundError:
            return {
                "error": True,
                "message": ("Files not found. Please make sure that there is a "
                            "directory called 'params' in the given path, with the "
                            "files named as params.txt or params1.txt or params1-1.txt")
            }
Example #15
    def closed_form_extra_features(self):
        preprocess1 = Preprocess()

        x_set = preprocess1.matrixify(self.data, 60)
        y_set = Preprocess.get_y(self.data)
        lengths = []
        length_squared = []

        for datapoint in self.data:
            text_length = len(datapoint['text'])
            lengths.append(text_length)

        children_length_inter = []
        children_list = []
        log_children_list = []
        for datapoint in self.data:
            children_list.append(datapoint['children'])
            if datapoint['children'] != 0:
                log_children_list.append(math.log(datapoint['children']))
            else:
                log_children_list.append(0)

        for length, children in zip(lengths, children_list):
            children_length_inter.append(length * children)

        preprocess1.add_features(children_length_inter)
        x_set = preprocess1.add_features(log_children_list)
        x_set = feature_selector.backwardElimination(x_set, y_set, 0.1)
        return self.run_model(x_set, y_set)
Example #16
 def display_training_and_validation_error(self):
     num_words = 160
     word_nums = np.arange(num_words)
     val_error_list = []
     train_error_list = []
     preprocess1 = Preprocess()
     x_set = preprocess1.matrixify(self.data, num_words)
     y_set = Preprocess.get_y(self.data)
     for x in word_nums:
         cur = x_set[:, 3:3 + x]
         print("Running on top " + str(x) + " words")
         val_error, train_error = self.run_model(cur, y_set)
         val_error_list.append(val_error)
         train_error_list.append(train_error)
     fig, ax = plt.subplots()
     plt.scatter(word_nums,
                 val_error_list,
                 color='blue',
                 s=5,
                 label="Validation set")
     plt.scatter(word_nums,
                 train_error_list,
                 color='red',
                 s=5,
                 label="Training set")
     plt.title("MSE vs number of words used")
     ax.set_xlabel("Words Used")
     ax.set_ylabel("MSE")
     plt.legend(loc='upper right')
     plt.show()
Example #17
def main():
    # Load and merge review, business and user data
    loader = DataLoader(path="../data/")
    df = loader.merge()

    preprocessor = Preprocess(df)

    # Remove weekday columns
    preprocessor.rm_weekdays()

    # Convert categorical variables to one-hot encoded variables and convert
    # strings to numerical types if possible
    preprocessor.categorical_to_numerical()

    # Fill NA observations with the mode of the given feature
    preprocessor.fill_na()

    # Unravel and one-hot encode business categories
    preprocessor.unravel_categories()

    # Sort observations by date and reindex with this ordering
    preprocessor.sort_by_date()

    # Form temporal train-val-test split
    preprocessor.split_data()

    # Write the final dataframe to a pickle file
    preprocessor.dump("../data/yelp_df.pkl")
Example #18
def create_threads(detector):
    config = configparser.ConfigParser()
    config.read(FACE_DETEC_CONF)

    video_decoders = []
    for item in config['videostream']:
        preprocesser = Preprocess(config['videostream'][item],
                                  len(video_decoders), MODEL_WIDTH,
                                  MODEL_HEIGHT)
        video_decoders.append(preprocesser)

    rtsp_num = len(video_decoders)
    if rtsp_num == 0:
        log_error("No video stream name or addr configuration in ",
                  FACE_DETEC_CONF)
        return None, None

    postprocessor = Postprocess(detector)

    display_channel = int(config['display']['channel'])
    if (display_channel is None) or (display_channel >= rtsp_num):
        log_info("No video to display, display configuration: ",
                 config['display']['channel'])
    else:
        video_decoders[display_channel].set_display(True)
        ret = postprocessor.create_presenter_channel(FACE_DETEC_CONF)
        if ret == False:
            log_error("Create presenter channel failed")
            return None, None

    return video_decoders, postprocessor
Example #19
    def train(self, data, X_column, y_columns):
        if y_columns is None:
            _ = data.columns.to_list()
            y_columns = list(set(_) - set([X_column]))
        X = data[X_column]
        y = data.drop(X_column, axis=1)
        xtrain, xtest, ytrain, ytest = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        test_size=0.2)
        mlb = MultiLabelBinarizer()
        train_labels = mlb.fit_transform(ytrain[y_columns].values)
        # test_labels not used when training
        # test_labels = mlb.fit_transform(ytest[y_columns].values)

        train_cleaned = xtrain.copy(deep=True).apply(Preprocess().clean_text)
        # test cleaned not used when training
        # test_cleaned = xtest.copy(deep=True).apply(clean_text)
        vectorizer = TfidfVectorizer()
        vectorised_train_documents = vectorizer.fit_transform(train_cleaned)
        powersetsvc = LabelPowerset(LinearSVC())
        powersetsvc.fit(vectorised_train_documents, train_labels)
        with open("powersetsvc.pickle", "wb") as f0:
            dump(powersetsvc, f0)
        with open('vec.pickle', 'wb') as f1:
            dump(vectorizer, f1)
        return powersetsvc, vectorizer
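Since train() persists both the fitted LabelPowerset classifier and the TfidfVectorizer, a later inference step can reload them. The sketch below assumes dump/load are pickle's; note that the MultiLabelBinarizer is not saved by train(), so mapping predictions back to label names would require persisting it as well.

from pickle import load

with open('powersetsvc.pickle', 'rb') as f_model, open('vec.pickle', 'rb') as f_vec:
    powersetsvc = load(f_model)
    vectorizer = load(f_vec)

cleaned = Preprocess().clean_text("some unseen document text")
features = vectorizer.transform([cleaned])   # same TF-IDF space as during training
predicted = powersetsvc.predict(features)    # binary indicator matrix, one row per document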
Example #20
    def gettimeparams(self, method):
        time_window = self.time_win.get()
        entries = int(int(self.sr) * float(time_window))
        k = 0
        process = Preprocess()
        if self.fftflag == True:
            l = len(self.tfdata)
            n = len(self.tfdata[0])
            m = len(self.tfdata[0][0])
            for i in range(l):
                for j in range(n):
                    k = 0
                    self.classes_new = []
                    while k < m / entries:
                        if (all(x == self.classes[k * entries]
                                for x in self.classes[k * entries:(k + 1) *
                                                      entries])):
                            self.features.append(
                                getattr(process, method)(
                                    self.tfdata[i][j][k * entries:(k + 1) *
                                                      entries]))
                            self.classes_new.append(self.classes[k * entries])
                            k += 1
                        else:
                            k += 1

                    self.final_df[method +
                                  "_fft_{}_{}".format(i, j)] = self.features
                    self.features = []

        else:
            df = pd.read_csv(self.filename)
            [m, n] = df.shape
            df = pd.DataFrame(df.values, columns=range(n))
            self.classes = df[n - 1]
            self.classes_fin = self.classes
            df = df.drop(labels=n - 1, axis=1)
            [m, n] = df.shape
            for i in range(n):
                k = 0
                self.classes_new = []
                while k < m / entries:
                    if (all(x == self.classes[k * entries]
                            for x in self.classes[k * entries:(k + 1) *
                                                  entries])):
                        self.features.append(
                            getattr(process, method)(
                                df[i].iloc[k * entries:(k + 1) * entries]))
                        self.classes_new.append(self.classes[k * entries])
                        k += 1
                    else:
                        k += 1
                self.final_df[method + "_{}".format(i)] = self.features
                self.features = []
        self.classes_fin = self.classes_new
Example #21
def main(model_num=1):

    preprocess = Preprocess()

    texts_train, labels_train = preprocess.preprocessData(
        '../projet2/train.txt', mode="train")
    texts_dev, labels_dev = preprocess.preprocessData('../projet2/dev.txt',
                                                      mode="train")

    MAX_SEQUENCE_LENGTH = 24
    LSTM_DIM = 64
    HIDDEN_LAYER_DIM = 30
    NUM_CLASSES = 4
    GAUSSIAN_NOISE = 0.1
    DROPOUT = 0.2
    DROPOUT_LSTM = 0.2
    BATCH_SIZE = 200

    X_train, X_val, y_train, y_val = train_test_split(texts_train,
                                                      labels_train,
                                                      test_size=0.2,
                                                      random_state=42)

    labels_categorical_train = to_categorical(np.asarray(y_train))
    labels_categorical_val = to_categorical(np.asarray(y_val))
    labels_categorical_dev = to_categorical(np.asarray(labels_dev))

    embedding = Embedding('../projet2/emosense.300d.txt')
    embeddings = embedding.getMatrix()
    tokenizer = embedding.getTokenizer()

    message_first_message_train, message_second_message_train, message_third_message_train = get_sequences(
        X_train, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_val, message_second_message_val, message_third_message_val = get_sequences(
        X_val, MAX_SEQUENCE_LENGTH, tokenizer)
    message_first_message_dev, message_second_message_dev, message_third_message_dev = get_sequences(
        texts_dev, MAX_SEQUENCE_LENGTH, tokenizer)

    model = CustomModel(model_num)
    model.build(embeddings,
                MAX_SEQUENCE_LENGTH,
                LSTM_DIM,
                HIDDEN_LAYER_DIM,
                NUM_CLASSES,
                noise=GAUSSIAN_NOISE,
                dropout_lstm=DROPOUT_LSTM,
                dropout=DROPOUT)
    model.summary()
    history = model.train(message_first_message_train,
                          message_second_message_train,
                          message_third_message_train,
                          labels_categorical_train, message_first_message_val,
                          message_second_message_val,
                          message_third_message_val, labels_categorical_val)

    y_pred = model.predict([
        message_first_message_dev, message_second_message_dev,
        message_third_message_dev
    ])
Example #22
    def compile(self, dataset, lookahead, dense, dimension):

        prep = Preprocess(emb_file=self.emb_file, dataset=dataset,
                          lookahead=lookahead, dense=dense, dimension=dimension)
        num_classes = prep.num_classes

        x_train, y_train, x_test, y_test = prep.x_train, prep.y_train, prep.x_test, prep.y_test

        return x_train, y_train, x_test, y_test, num_classes
Example #23
    def test__convert_column_argument(self):

        df_long = self.spark.read.csv('tests/fixtures/preprocess/long.csv',
                                      header=True)

        preprocessor = Preprocess(df_labels=df_long, columns='all')

        self.assertEqual(len(df_long.columns) - 1, len(preprocessor.columns))
Example #24
    def setUp(self):
        self.test_size = 12000
        self.data = data[:self.test_size]

        self.training_set = data[:10000]
        self.validation_set = data[10000:11000]
        self.testing_set = data[11000:12000]

        preprocess1 = Preprocess()
        preprocess1.preprocess(self.training_set)
        self.x_train = preprocess1.matrixify(self.training_set)
        self.y_train = Preprocess.get_y(self.training_set)

        preprocess2 = Preprocess()
        preprocess2.preprocess(self.validation_set)
        self.x_val = preprocess2.matrixify(self.validation_set)
        self.y_val = Preprocess.get_y(self.validation_set)
Example #25
def main():

    preprocess = Preprocess()
    preprocess.check_data_distribution()

    print("\n\n*********** ANALYSIS PART I *******************")
    partI_classifier = Classifiers(1)
    partI_classifier.draw_auc_curve(1)
Example #26
def main():
    df = load_csv_to_df(path, input_filename)
    print(df.shape)

    pipeline = Pipeline([('apply_rules', Preprocess())])
    df = pipeline.fit_transform(df)

    dump_df_to_csv(df, path, output_filename)
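Because Preprocess() is dropped straight into a scikit-learn Pipeline, it must expose the fit/transform interface. A minimal sketch of a compatible transformer is given below; the concrete rule (stripping whitespace from string columns) is invented for illustration, not the project's actual rule set.

from sklearn.base import BaseEstimator, TransformerMixin


class Preprocess(BaseEstimator, TransformerMixin):
    """Hypothetical rule-applying transformer usable inside a Pipeline."""

    def fit(self, X, y=None):
        return self  # stateless: nothing to learn from the data

    def transform(self, X):
        X = X.copy()
        # example rule: strip surrounding whitespace in all string columns
        for col in X.select_dtypes(include='object').columns:
            X[col] = X[col].str.strip()
        return X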
Example #27
    def test_preprocess(self):

        df_recipe_info = self.spark.read.csv(
            'tests/fixtures/preprocess/recipe_info.csv', header=True)

        preprocessor_all = Preprocess(df_labels=df_recipe_info, columns='all')

        df_preprocessed_all = preprocessor_all.preprocess()
        self.assertEqual(df_preprocessed_all.count(),
                         df_recipe_info.count() - 1)

        preprocessor_country = Preprocess(df_labels=df_recipe_info,
                                          columns=['country'])
        df_preprocessed_country = preprocessor_country.preprocess()
        self.assertEqual(df_preprocessed_country.count(),
                         df_recipe_info.count() - 1)
        self.assertEqual(len(df_preprocessed_country.columns), 1 + 4)
Example #28
 def test_remove_non_alpha(self):
     preprocessor = Preprocess()
     preprocessor.preprocess_remove_non_alpha(self.data)
     for point in self.data:
         for word in point['text']:
             try:
                 self.assertTrue(word.isalpha())
             except AssertionError:
                 print(word)
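The test only requires that every remaining token is alphabetic, so an implementation consistent with it could simply filter each point's token list in place; this is a sketch under that assumption, not the repository's actual code:

def preprocess_remove_non_alpha(self, data):
    for point in data:
        # keep only purely alphabetic tokens
        point['text'] = [word for word in point['text'] if word.isalpha()]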
Example #29
    def __init__(self):
        """initialize dataset and load model"""
        self.model = load_model(config.model_path)
        print("[Log] Pretrained model was loaded.")

        self.preprocess = Preprocess(database_path=config.database_path)
        print("[Log] Preprocess object was created.")

        self.database = self.init_database()
Example #30
def main():
    #df = pd.read_csv(os.path.join(path, input_filename), dtype='unicode')
    df = load_csv_to_df(path, input_filename)
    print(df.shape)

    pipeline = Pipeline([('apply_rules', Preprocess())])
    df = pipeline.fit_transform(df)

    dump_df_to_csv(df, path, output_filename)