def load_data(self, sample, x_axi_attr_index, y_axi_attr_index):
    """Load the credit-card dataset, reduce and discretize it, then simplify.

    Args:
        sample: sample selector forwarded to data_simplification.
        x_axi_attr_index: index of the attribute used for the x axis.
        y_axi_attr_index: index of the attribute used for the y axis.

    Returns:
        Whatever self.data_simplification returns for the discretized
        training split.
    """
    from Preprocessing import Preprocess
    from Postprocessing import Postprocess

    creditdata = Preprocess("default of credit card clients.xls")
    # load() is kept for its side effects; its returned raw splits were
    # never used by this method (dead bindings removed).
    creditdata.load()
    low_x_train, low_x_test, low_y_train, low_y_test = \
        creditdata.dimension_decrease()
    postp = Postprocess(low_x_train, low_x_test, low_y_train, low_y_test)
    # Only the training split of the improved data is consumed here.
    x_train, _x_test, y_train, _y_test = postp.improve_data()
    return self.data_simplification(x_train, y_train, sample,
                                    x_axi_attr_index, y_axi_attr_index)
def __init__(self):
    """Load the credit-card workbook, reduce and discretize it, then build
    the classifier/processor lists and open the execution log."""
    self.classifier = []
    self.processor = []
    self.result = []

    creditdata = Preprocess("default of credit card clients.xls")
    self.raw_X_train, self.raw_X_test, self.raw_Y_train, self.raw_Y_test = \
        creditdata.load()
    (self.low_dim_X_train, self.low_dim_X_test,
     self.low_dim_Y_train, self.low_dim_Y_test) = creditdata.dimension_decrease()

    # Discretize the dimension-reduced splits.
    self.discretizer = Postprocess(self.low_dim_X_train, self.low_dim_X_test,
                                   self.low_dim_Y_train, self.low_dim_Y_test)
    (self.discretized_X_train, self.discretized_X_test,
     self.discretized_Y_train, self.discretized_Y_test) = \
        self.discretizer.improve_data()

    self.buildclf()
    self.buildprocessor()
    # NOTE(review): the handle is kept open (append mode) for the object's
    # lifetime; nothing in this block closes it.
    self.logfile = open("execution_Log", "a")
class FeatureEngineer:
    """Combine preprocessed features, encode labels, pad samples to a fixed
    (99, 13) frame, and produce train/val/test splits."""

    def __init__(self, feature, path, train, test):
        """Create the preprocessor and pull out combined features and frames."""
        preproc = Preprocess(feature, path, train, test)
        self.preprocessor = preproc
        self.train_feat, self.test_feat = preproc.combine_data(preproc.features)
        self.df_train = preproc.df_train
        self.df_test = preproc.df_test

    def convert_labels(self):
        """Map each unique word to an integer and one-hot encode the labels.

        Returns:
            (labels, levels): the binary-encoded label matrix and the
            word -> integer dictionary used to build it.
        """
        unique_words = self.preprocessor.df_train.word.unique()
        levels = dict(zip(unique_words, range(len(self.df_train.word.unique()))))
        indices = np.array([levels[word] for word in self.df_train["word"]],
                           dtype=np.float32)
        labels = to_categorical(indices)
        return labels, levels

    def scale(self):
        """Standard-scale every feature matrix in place (train and test).

        NOTE(review): fit_transform is re-fit per sample, so each matrix is
        scaled by its own statistics — behavior kept as in the original.
        """
        scaler = StandardScaler()
        for idx, feat in enumerate(self.train_feat):
            self.train_feat[idx] = scaler.fit_transform(feat)
        for idx, feat in enumerate(self.test_feat):
            self.test_feat[idx] = scaler.fit_transform(feat)

    def remake_array(self, arr):
        """Zero-pad every sample into a common (99, 13) frame and append a
        trailing channel axis, so all samples share one shape."""
        padded = np.zeros((len(arr), 99, 13))
        for i, sample in enumerate(arr):
            for j, row in enumerate(sample):
                for k, value in enumerate(row):
                    # padded starts at zero, so plain assignment matches the
                    # original accumulate-into-zeros.
                    padded[i, j, k] = value
        return padded[:, :, :, np.newaxis]

    def define_splitting(self, feat, lbl, ratio):
        """Split features/labels with a fixed seed for reproducibility."""
        np.random.seed(37555)
        part_a, part_b, lbl_a, lbl_b = ms.train_test_split(feat, lbl,
                                                           test_size=ratio)
        return part_a, part_b, lbl_a, lbl_b

    def splitting(self):
        """Produce ((X, y) train, val, test) via a 70/15/15 two-stage split."""
        remade_data = self.remake_array(self.train_feat)
        labels, _ = self.convert_labels()
        X_train, X_rest, y_train, y_rest = self.define_splitting(
            remade_data, labels, 0.3)
        X_val, X_test, y_val, y_test = self.define_splitting(X_rest, y_rest, 0.5)
        return (X_train, y_train), (X_val, y_val), (X_test, y_test)
def trainmodel(self):
    """Fit self.c on the full discretized credit dataset and print its score.

    NOTE(review): the model is evaluated on the same data it was fitted on,
    and the value printed as "Precision" is overall (resubstitution)
    accuracy, not precision in the classification-metric sense.
    """
    prep = Preprocess("default of credit card clients.xls")
    # load() presumably populates state consumed by dimension_decrease();
    # its return value is intentionally discarded here — TODO confirm.
    prep.load()
    low_x1, low_x2, low_y1, low_y2 = prep.dimension_decrease()

    postp = Postprocess(low_x1, low_x2, low_y1, low_y2)
    disc_x1, disc_x2, disc_y1, disc_y2 = postp.improve_data()

    # Train and score on the concatenation of both splits.
    x = np.concatenate((disc_x1, disc_x2))
    y = np.concatenate((disc_y1, disc_y2))
    self.c.fit(x, y)
    predictions = self.c.predict(x)

    mislabeled = (y != predictions).sum()
    totaltest = x.shape[0]
    print(
        "Mislabeled points (%s Classification) out of a total %d points : %d"
        % ("SVC", totaltest, mislabeled))
    Precision = 1 - mislabeled / totaltest
    print("Precision of %s is %4.2f%%" % ("SVC", Precision * 100))
def __init__(self, Method=None, data=pd.DataFrame(), orig_data=pd.DataFrame(),
             log=None, test_name="", random_state=42):
    """Constructor. Necessary information from the estimator class is
    provided later in the execution by QST_sim.

    Args:
        Method: Implementation method class instance (e.g. KMeans);
            defaults to a fresh KMeans() per construction.
        data: Raw data input.
        orig_data: Original (untransformed) training data.
        log: Master Log object reference (its test_number is incremented).
        test_name: Name of the test being run.
        random_state: Seed for random number generator.
    """
    # Avoid the mutable-default-argument trap: the original evaluated
    # KMeans() once at def time, so every instance built with the default
    # shared one estimator object (and its fitted state).
    if Method is None:
        Method = KMeans()

    # Take a single timestamp so the recorded date and time cannot
    # disagree when construction straddles midnight.
    now = dt.datetime.now()
    self.execution_date_start = now.date()
    self.execution_time_start = now.time().strftime("%H.%M.%S")

    # Set test name
    self.test_name = test_name

    # Attribute user inputs to object
    self.data = data
    self.Log = log
    self.Log.test_number += 1

    # Initialize a preprocessing object
    self.Preprocess = Preprocess(self)

    # Initialize Segmentation Method Object
    self.SegMethod = SegMethod(self, Method=Method)

    # Set random state to default input
    self.random_state = random_state

    # Visualization folder name
    self.viz_folder_name = self.test_name + "_Visualizations"

    # Set original data
    self.orig_train_data = orig_data

    # Initialize all data storage variables
    self.train_data = None
    self.class_label = None
def __init__(self):
    """Create preprocessing/embedding helpers and load the stop-word list."""
    self.preprocess = Preprocess.Preprocess()
    self.embeddings = Embeddings.Embeddings()

    # Load Spanish stop words, presumably one per line — TODO confirm format.
    stop_words_path = config.data_prefix_path + 'spanish.txt'
    with open(stop_words_path, 'r') as fr:
        self.stop_words = [line.strip() for line in fr.readlines()]
def __init__(self):
    """Model configuration: helper objects plus training hyper-parameters."""
    self.preprocessor = Preprocess.Preprocessor()
    self.embedding = Embeddings()

    # Optimisation settings
    self.lr = 5e-4
    self.batch_size = 128
    self.n_epoch = 10
    self.l2_reg = 0.0004

    # Input dimensions (taken from the helpers)
    self.sentence_length = self.preprocessor.sentence_length
    self.vec_dim = self.embedding.vec_dim
    self.vocab_size = 212237

    # Network architecture
    self.filter_sizes = [2, 3]
    self.num_filters = 64
    self.num_hidden = 100
    self.num_classes = 2
def __init__(self, lang):
    """Model configuration for one language.

    Args:
        lang: language code, either 'es' or 'en'; selects the maximum
            sentence length from the preprocessor.

    Raises:
        ValueError: if lang is neither 'es' nor 'en'.
    """
    self.preprocessor = Preprocess.Preprocess()
    self.embedding = Embeddings()
    self.Feature = Feature.Feature()
    self.Powerfulwords = PowerfulWord.PowerfulWord()
    self.Graph = GraphFeature.GraphFeature()
    self.lang = lang
    if lang == 'es':
        self.sentence_length = self.preprocessor.max_es_length
    elif lang == 'en':
        self.sentence_length = self.preprocessor.max_en_length
    else:
        # Fail fast: previously an unknown language silently left
        # sentence_length unset, deferring the failure to a confusing
        # AttributeError at first use.
        raise ValueError(
            "Unsupported language: %r (expected 'es' or 'en')" % (lang,))
    self.n_folds = 10
    self.num_classes = 2
    self.eclipse = 1e-10
    self.vec_dim = self.embedding.vec_dim
    self.clip_gradients = False
    self.max_grad_norm = 5.
def __init__(self):
    """Model configuration: helper objects plus training hyper-parameters."""
    self.preprocessor = Preprocess.Preprocess()
    self.embedding = Embeddings()
    self.Feature = Feature.Feature()

    # Optimisation settings
    self.lr = 0.0004
    self.keep_prob = 0.5
    self.l2_reg = 0.04
    self.batch_size = 128
    self.n_epoch = 20
    self.eclipse = 1e-10  # small epsilon constant; name kept for compatibility

    # Input dimensions (taken from the helpers)
    self.sentence_length = self.preprocessor.max_length
    self.vec_dim = self.embedding.vec_dim
    self.num_features = 15

    # Network architecture
    self.hidden_dim = 16
    self.num_classes = 2
    self.cosine = True
    self.psize1 = 3
    self.psize2 = 3
def __init__(self, model_type="ABCNN3", clip_gradients=True):
    """Model configuration.

    Args:
        model_type: which model variant to build (default "ABCNN3").
        clip_gradients: whether gradients are clipped to max_grad_norm.
    """
    self.model_type = model_type
    self.preprocessor = Preprocess.Preprocessor()
    self.embedding = Embeddings()

    # Optimisation settings
    self.lr = 0.05
    self.batch_size = 64
    self.n_epoch = 12
    self.clip_gradients = clip_gradients
    self.max_grad_norm = 5.
    self.eclipse = 1e-9  # small epsilon constant; name kept for compatibility

    # Input dimensions (taken from the helpers)
    self.sentence_length = self.preprocessor.sentence_length
    self.vec_dim = self.embedding.vec_dim
    self.vocab_size = 212237

    # Network architecture
    self.w = 4
    self.l2_reg = 0.0004
    self.di = 50  # The number of convolution kernels
    self.num_classes = 2
    self.num_layers = 2
def main_exec(config):
    """
    Main execution line. Dispatch processes according to parameter groups.
    Multiple processes here prevent main process from consuming too much memory.
    """
    # Ensure every working directory exists before dispatching any stage.
    if not os.path.isdir(config.bdir):
        os.mkdir(config.bdir)
    if not os.path.isdir(config.weights_path):
        os.mkdir(config.weights_path)
    if not os.path.isdir(config.model_path):
        os.mkdir(config.model_path)
    if not os.path.isdir(config.cache):
        os.mkdir(config.cache)
    if not os.path.isdir(config.logdir):
        os.mkdir(config.logdir)

    if config.preprocess:
        # Fall back to the module-level img_types when no type was requested.
        if config.img_type is None:
            imgt = img_types
        else:
            imgt = config.img_type
        if config.multiprocess:
            # Run preprocessing in a child process so its memory is
            # reclaimed when the child exits.
            proc = Process(target=Preprocess.preprocess_data,
                           args=(config, imgt))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                # NOTE(review): "enhace" typo exists in the original runtime
                # string and is deliberately left untouched here.
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            Preprocess.preprocess_data(config, imgt)

    if config.train:
        # Re-checked even though created above — harmless idempotent guard.
        if not os.path.isdir(config.weights_path):
            os.mkdir(config.weights_path)
        if not os.path.isdir(config.model_path):
            os.mkdir(config.model_path)
        if config.multiprocess:
            # Training child uses the 'spawn' start method and is handed the
            # cache locations explicitly.
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            proc = ctx.Process(target=GenericTrainer.run_training,
                               args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            GenericTrainer.run_training(config, None)

    if config.al:
        if not os.path.isdir(config.weights_path):
            os.mkdir(config.weights_path)
        if not os.path.isdir(config.model_path):
            os.mkdir(config.model_path)
        if config.multiprocess:
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            proc = ctx.Process(target=ALTrainer.run_training,
                               args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            # In-process path resolves the trainer class dynamically by the
            # configured strategy name from the 'Trainers' module.
            # NOTE(review): import_module's second argument is a package
            # anchor for relative imports — passing config.strategy here has
            # no effect for the absolute name 'Trainers'; verify intent.
            ts = importlib.import_module('Trainers', config.strategy)
            getattr(ts, config.strategy).run_training(config, None)

    if config.pred:
        if config.multiprocess:
            ctx = mp.get_context('spawn')
            cache_m = CacheManager()
            # NOTE(review): unlike the train/al branches this uses the plain
            # Process class, not ctx.Process, so the 'spawn' context obtained
            # above is unused here — confirm whether that is intentional.
            proc = Process(target=Predictions.run_prediction,
                           args=(config, cache_m.getLocations()))
            proc.start()
            proc.join()
            if proc.exitcode != Exitcodes.ALL_GOOD:
                print(
                    "System did not end well. Check logs or enhace verbosity level."
                )
                sys.exit(proc.exitcode)
        else:
            Predictions.run_prediction(config, None)

    if config.postproc:
        # Post-processing stage not implemented yet.
        pass

    if config.runtest:
        if config.tmode == 0:
            pass
        elif config.tmode == 1:
            # Run train test
            TrainTest.run(config)
        elif config.tmode == 2:
            DatasourcesTest.run(config)
        elif config.tmode == 3:
            PredictionTest.run(config)
        elif config.tmode == 4:
            ActiveLearningTest.run(config)

    # No stage selected at all: tell the user what the valid choices are.
    if not (config.preprocess or config.train or config.postproc
            or config.pred or config.runtest):
        print(
            "The problem begins with choice: preprocess, train, postprocess or predict"
        )
# group owed amount into different intervals if -100000 <= temp_owe < 0: self.x_test[row, 6] = -1 elif -500000 <= temp_owe < -100000: self.x_test[row, 6] = -2 elif temp_owe < -500000: self.x_test[row, 6] = -3 elif self.x_test[row, 6] == 0: continue elif 1 <= temp_owe < 100001: self.x_test[row, 6] = 1 elif 10000 <= temp_owe < 500001: self.x_test[row, 6] = 2 else: self.x_test[row, 6] = 3 def improve_data(self): self.set_age() self.set_amount() return self.x_train, self.x_test, self.y_train, self.y_test if __name__ == '__main__': a = Preprocess("default of credit card clients.xls") rx1, rx2, ry1, ry2 = a.load() x1, x2, y1, y2 = a.dimension_decrease() b = Postprocess(x1, x2, y1, y2) xd1, xd2, yd1, yd2 = b.improve_data()
def __init__(self):
    """Set embedding defaults and create the preprocessing helper."""
    self.vec_dim = 300  # embedding vector dimensionality
    self.scale = 0.1    # scaling factor; exact use not visible here — TODO confirm
    self.preprocessor = Preprocess.Preprocess()
def __init__(self):
    """Create the preprocessing and graph-feature helpers.

    NOTE(review): the attribute names "Proprocess" and "GraphFeture" look
    misspelled ("Preprocess"/"GraphFeature"), but they are part of this
    object's public surface, so they are documented rather than renamed.
    """
    self.Proprocess = Preprocess.Preprocess()
    self.GraphFeture = GraphFeature.GraphFeature()
import pandas as pd import sys from keras.models import model_from_json sys.path.append('./code/DataCleaning') sys.path.append('./code/Models') sys.path.append('./code/Evaluation') from Preprocessing import Preprocess from DataManipulation import Manipulations_Selector import Train import DataSeperator data = pd.read_csv( "/home/sultan/Desktop/bitbuket/Text-Classification-master/data/GT.csv") x = data["comment_text"] x = x.values.tolist() x = Preprocess(x) available_text_manipulation = sys.argv[1] available_class = sys.argv[2] available_model = sys.argv[3] Embed = False reshape = True if available_text_manipulation == "Embedding": Embed = True reshape = False if available_text_manipulation == "WORD2VEC": reshape = False if available_text_manipulation == "WORD2VEC_pre":
def __init__(self):
    """Create the preprocessing helper used by this object."""
    self.preprocess = Preprocess.Preprocess()
def __init__(self, feature, path, train, test):
    """Build the preprocessor and extract combined features and data frames.

    Args:
        feature: feature specification forwarded to Preprocess.
        path: data path forwarded to Preprocess.
        train: training-set specification forwarded to Preprocess.
        test: test-set specification forwarded to Preprocess.
    """
    preproc = Preprocess(feature, path, train, test)
    self.preprocessor = preproc
    self.train_feat, self.test_feat = preproc.combine_data(preproc.features)
    self.df_train = preproc.df_train
    self.df_test = preproc.df_test
def __init__(self):
    """Create the preprocessing and feature-extraction helpers."""
    self.preprocess = Preprocess.Preprocess()
    self.Feature = Feature.Feature()
def __init__(self):
    """Create preprocessing, feature, powerful-word and graph-feature helpers."""
    self.preprocessor = Preprocess.Preprocessor()
    self.Feature = Feature.Feature()
    self.Powerfulwords = PowerfulWord.PowerfulWord()
    self.Graph = GraphFeature.GraphFeature()