def __init__(self, word_dim, char_dim, max_sent_len, max_char_len, learning_rate, num_train_steps):
    self.word_dim = word_dim
    self.char_dim = char_dim
    self.max_sent_len = max_sent_len
    self.max_char_len = max_char_len
    self.learning_rate = learning_rate
    self.num_train_steps = num_train_steps

    ## Preprocess data
    self.prepro = preprocess.Preprocess(self.char_dim, self.max_sent_len, self.max_char_len)
    self.train_X, self.train_seq_length, self.train_Y, self.test_X, self.test_seq_length, self.test_Y = \
        self.prepro.load_data("./train.csv", "./test.csv", self.max_sent_len)
    self.word_embedding, self.char_embedding = self.prepro.prepare_embedding(self.char_dim)
    self.train_X, self.train_X_char, self.train_X_char_len, self.train_Y = \
        self.prepro.prepare_data(self.train_X, self.train_Y, "train")
    self.test_X, self.test_X_char, self.test_X_char_len, self.test_Y = \
        self.prepro.prepare_data(self.test_X, self.test_Y, "test")

    ## Placeholders
    self.word_input = tf.placeholder(tf.int32, shape=[None, max_sent_len], name='word')
    self.char_input = tf.placeholder(tf.int32, shape=[None, max_sent_len, max_char_len], name='char')
    self.label = tf.placeholder(tf.int32, shape=[None], name='label')
    self.seq_len = tf.placeholder(tf.int32, shape=[None])
    self.char_len = tf.placeholder(tf.int32, [None, max_sent_len])
    self.dropout = tf.placeholder(tf.float32, shape=())
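# A minimal training-step sketch for the placeholder graph above, assuming a
# `train_op` built elsewhere in the class and batches shaped like the
# placeholders. Everything here is an assumption, not the author's code:
def train_step(sess, model, train_op, batch, keep_prob=0.5):
    word, char, label, seq_len, char_len = batch
    return sess.run(train_op, feed_dict={model.word_input: word,
                                         model.char_input: char,
                                         model.label: label,
                                         model.seq_len: seq_len,
                                         model.char_len: char_len,
                                         model.dropout: keep_prob})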
def store_features2(name):
    pre = preprocess.Preprocess('./BTCUSDT/dol_bar.csv')
    df = pre.x_feature2()
    df.to_csv("./BTCUSDT/features_min.csv")
    df = pre.clean_df()
    df.to_csv("./BTCUSDT/features_min_clean.csv")
    print(name)
def run(): print "Loading data..." # load training data trainImages,trainLabels=dl.load_mnist_train() imDim = trainImages.shape[0] inputDim = 50 outputDim = 10 layerSizes = [16]*2 trainImages = trainImages.reshape(imDim**2,-1) pcer = pc.Preprocess() pcer.computePCA(trainImages) whitenedTrain = pcer.whiten(trainImages, inputDim) minibatch = whitenedTrain.shape[1] print "minibatch size: %d" % (minibatch) epochs = 10000 stepSize = 1e-2 nn = nnet.NNet(inputDim,outputDim,layerSizes,minibatch) nn.initParams() SGD = sgd.SGD(nn,alpha=stepSize,minibatch=minibatch) for e in range(epochs): print "Running epoch %d"%e SGD.run(whitenedTrain,trainLabels) SGD.dumptrace()
def write_preprocess_sentence_without_synonymous():
    with open(const.FIRST_PROCESS_REVIEW_PATH, 'r') as fr:
        fw = open(const.REVIEW_FOR_CLUSTER_PATH, 'w+')
        p = pre.Preprocess()
        for line in fr:
            l = line.split(',')
            cla = l[len(l) - 1].strip()
            # rebuild the sentence from columns 1 .. len(l) - 3
            s = ','.join(l[1:len(l) - 2])
            # preprocess
            p.set_sentence(s)
            res = p.preprocess(False)
            # non-English text is replaced by an empty string and labelled as class 4
            if res == '':
                cla = '4'
            fw.write(l[0] + ',' + res + ',' + cla + '\n')
        fw.close()
def get_labels_no_side(outfolder, inputfile="x_features.csv"):
    inputfile = outfolder + inputfile
    pre = preprocess.Preprocess(inputfile)
    df = pre.label_fix_no_side(is_infile=True, inputfile=inputfile)
    print(df.head())
    df.to_csv(outfolder + "labels_no_side.csv")
    return df
def write_preprocess_sentence():
    with open(const.MARKED_REVIEW_PATH, 'r') as fr:
        fw = open(const.FIRST_PROCESS_REVIEW_PATH, 'w+')
        i = 0
        p = pre.Preprocess()
        for line in fr:
            l = line.split(',')
            cla = l[len(l) - 1].strip()
            # rebuild the sentence from columns 5 .. len(l) - 3
            # (trailing comma kept, since the write below concatenates s + res)
            s = ''
            for j in range(5, len(l) - 2):
                s += l[j] + ','
            # preprocess
            p.set_sentence(s)
            res = p.preprocess()
            # non-English text is replaced by an empty string and labelled as class 4
            if res == '':
                cla = '4'
            fw.write(l[0] + ',' + s + res + ',' + cla + '\n')
            i += 1
            if i == 1300:
                break
        fw.close()
def main():
    root = utils.get_root_path(False)
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option('--learning_rate_rbm', action='store', type='string', dest='learning_rate_rbm')
    parser.add_option('--epochs_rbm', action='store', type='string', dest='epochs_rbm')
    parser.add_option('--batch_size', action='store', type='string', dest='batch_size')
    parser.add_option('--data_set', action='store', type='string', dest='data_set')
    (opts, args) = parser.parse_args()

    file_data = ReadFile.ReadFile(root + '/NSL_KDD-master', opts=opts).get_data()
    data_pp = preprocess.Preprocess(file_data).do_predict_preprocess()
    dbn_model.DBN(data_pp).do_dbn('yadlt', opts=opts)
    dbn_model.DBN(data_pp).do_dbn_with_weight_matrix(root + '/save')
    model.do_svm()
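# Hedged usage note: a typical invocation might look like the line below.
# The script name and option values are assumptions; only the option names
# come from the parser defined in main() above.
#   python main.py --learning_rate_rbm 0.01 --epochs_rbm 10 --batch_size 64 --data_set train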
def get_labels(outfolder, inputfile="x_features.csv"):
    inputfile = outfolder + inputfile
    pre = preprocess.Preprocess(inputfile)
    df = pd.read_csv(inputfile, index_col=0)
    df['date_time'] = pd.to_datetime(df.date_time)
    df.index = df.date_time
    df.drop(columns=['date_time'], inplace=True)
    df = pre.labeling(df)
    print(df.head())
    df.to_csv(outfolder + "labels.csv")

# Example driver (commented out): label several symbol folders in parallel.
# outfolder = ["./BNBUSDT/", "./ETHUSDT/"]
# p1 = Process(target=get_labels, args=(outfolder[0], "x_features.csv"))
# p1.start()
# p2 = Process(target=get_labels, args=(outfolder[1],))
# p2.start()
def change_tfidf(text):
    pre = preprocess.Preprocess()
    with open('tfidf.pickle', 'rb') as handle:
        vectorizer = pickle.load(handle)
    clean = ' '.join(e for e in pre.preprocess(text))
    vector = vectorizer.transform([clean])
    return vector.toarray()
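# A minimal sketch of how the 'tfidf.pickle' consumed above could be
# produced, assuming the pickled object is a scikit-learn TfidfVectorizer
# and that Preprocess.preprocess returns an iterable of token strings
# (both are assumptions, not confirmed by this snippet):
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

def build_tfidf(raw_documents, path='tfidf.pickle'):
    pre = preprocess.Preprocess()
    corpus = [' '.join(pre.preprocess(doc)) for doc in raw_documents]
    vectorizer = TfidfVectorizer()
    vectorizer.fit(corpus)
    with open(path, 'wb') as handle:
        pickle.dump(vectorizer, handle)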
def __init__(self, min_bid=0.0, max_bid=MAX_BID, min_cost=0.0, max_cost=MAX_EXP):
    self.min_bid = min_bid
    self.max_bid = max_bid
    self.min_cost = min_cost
    self.max_cost = max_cost

    kernel_cost = gpflow.kernels.SquaredExponential()  # * gpflow.kernels.Constant()
    kernel_rev = gpflow.kernels.SquaredExponential()   # * gpflow.kernels.Constant()
    self.__mean_cost = None  # gpflow.mean_functions.Linear()
    self.__mean_rev = None   # gpflow.mean_functions.Linear()

    self.__input_scaler = preprocess.Preprocess(with_scaler=True, with_mean=False,
                                                with_std=False)
    self.__output_cost_scaler = preprocess.Preprocess(scale_min=0.0, scale_max=50.0,
                                                      with_scaler=True, with_mean=True,
                                                      with_std=False)
    self.__output_rev_scaler = preprocess.Preprocess(scale_min=0.0, scale_max=50.0,
                                                     with_scaler=True, with_mean=True,
                                                     with_std=False)

    # data already known
    self.X = np.array(0.0).reshape(-1, 1)
    self.Y_cost = np.array(0.0).reshape(-1, 1)
    self.Y_rev = np.array(0.0).reshape(-1, 1)

    self.transformed_X = self.__input_scaler.fit(np.array(0.0).reshape(-1, 1))
    self.transformed_Y_cost = self.__output_cost_scaler.fit(np.array(0.0).reshape(-1,))
    self.transformed_Y_rev = self.__output_rev_scaler.fit(np.array(0.0).reshape(-1,))

    self._optimize()
def add_features(outfolder, inputfile="dol_bar.csv"):
    inputfile = outfolder + inputfile
    pre = preprocess.Preprocess(inputfile)
    print('compute x_features.csv')
    x_features = pre.x_feature2()
    x_features.to_csv(outfolder + "x_features.csv")
    df = pre.clean_df()
    df.to_csv(outfolder + "x_features_clean.csv")
def manual(self, args):
    """Manual execute."""
    # read manual configure file
    pre_process = preprocess.Preprocess()
    pre_process.readconfig('manual.ini')
    self.endoutput = pre_process.endoutput
    self.midoutput = pre_process.midoutput
    self.respath = pre_process.respath
    self.gsystem = pre_process.gsystem
    self.ctype = pre_process.ctype
    self.duration = pre_process.duration
    self.prn = pre_process.prn
    # choose module
    if args[1].upper() == '-A':
        # start processes
        process = list()
        process.append(multiprocessing.Process(target=self.enu))
        process.append(multiprocessing.Process(target=self.uh))
        process.append(multiprocessing.Process(target=self.satnum))
        process.append(multiprocessing.Process(target=self.satiode))
        process.append(multiprocessing.Process(target=self.satorbitc))
        [p.start() for p in process]
        [p.join() for p in process]
        print('All Done!')
    elif args[1].upper() == '-R':
        self.report()
        print('Done!')
    elif args[1].upper() == '--ENU':
        self.enu()
        print('Done!')
    elif args[1].upper() == '--HV':
        self.uh()
        print('Done!')
    elif args[1].upper() == '--HVM':
        self.uhmean()
        print('Done!')
    elif args[1].upper() == '--SAT':
        self.satnum()
        print('Done!')
    elif args[1].upper() == '--IODE':
        self.satiode()
        print('Done!')
    elif args[1].upper() == '--ORBITC':
        self.satorbitc()
        print('Done!')
    elif args[1].upper() == '--HELP' or args[1]:
        print('Args:')
        print('\t-a -A: execute all modules.')
        print('\t-r -R: zdpos report')
        print('\t--ENU: plot ENU')
        print('\t--HV: plot horizontal and vertical errors')
        print('\t--HVM: plot mean of horizontal and vertical errors')
        print('\t--SAT: plot satellite number')
        print('\t--IODE: plot satellite IODE')
        print('\t--ORBITC: plot orbit and clock errors')
def __init__(self, data_dir='data'):
    self.data_dir = data_dir
    # build the paths from data_dir instead of hard-coding backslash strings
    self.train_dir_name = os.path.join(self.data_dir, 'EASC-UTF-8', 'Articles')
    self.test_dir_name = os.path.join(self.data_dir, 'EASC-UTF-8', 'MTurk')
    self.data = pd.DataFrame(columns=['Orignal', 'Summary1', 'Summary2',
                                      'Summary3', 'Summary4', 'Summary5'])
    self.pr = preprocess.Preprocess()
def database_compare(images, query_img, i_size, n_angles):
    pre = preprocess.Preprocess(i_size, n_angles)
    pre.process_img(query_img)
    hitmap = pre.get_hitmap()
    edgel_count = pre.get_edgel_counts()[0]
    hit_counts = np.asarray([0.] * len(images))
    for i in range(len(images)):
        edgels = pre.get_edgels(images[i])
        # count every hitmap entry that each image edgel lands on
        for edgel in edgels:
            hit_counts[i] += len(hitmap[edgel[0]][edgel[1]][edgel[2]])
    return hit_counts / edgel_count
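# Hedged usage sketch (the file names are assumptions; 100 and 6 match the
# img_size/n_angles values used elsewhere in this codebase): rank database
# images against a query by shared edgel hits.
import cv2

if __name__ == '__main__':
    query = cv2.imread('query.jpg', 0)  # grayscale read
    db_imgs = [cv2.imread(f, 0) for f in ('a.jpg', 'b.jpg', 'c.jpg')]
    scores = database_compare(db_imgs, query, i_size=100, n_angles=6)
    print('best match index:', scores.argmax())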
def main():
    # silence warnings by swapping in the local warn stub
    warnings.warn = warn
    x = pr.Preprocess()
    # preprocessing
    X_train, Y_train, X_dev, Y_dev, X_test, Y_test = x.get_data()
    # dispatch on the first command-line argument
    if sys.argv[1] == 'train_b':
        train_baseline(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
    elif sys.argv[1] == 'train_e':
        train_extended(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
    elif sys.argv[1] == 'hyper_tune':
        feature_tuning(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)
def __init__(self, batch_size, word_dim, hidden_dim, num_layers, max_vocab_size,
             max_word_len, learning_rate, training_epochs, path, isTrain=True):
    self.batch_size = batch_size
    self.word_dim = word_dim
    self.hidden_dim = hidden_dim
    self.num_layers = num_layers
    self.max_vocab_size = max_vocab_size
    self.max_word_len = max_word_len
    self.learning_rate = learning_rate
    self.training_epochs = training_epochs
    self.path = path

    ## Preprocess data
    self.prepro = preprocess.Preprocess(word_dim=word_dim,
                                        max_vocab_size=max_vocab_size,
                                        path=path)
    self.word_embedding, self.clear_padding, self.word2idx, self.idx2word = self.prepro.build_embedding()

    ## Placeholders
    self.word_idx = tf.placeholder(tf.int32, shape=[None, max_word_len], name='word_idx')
    self.label = tf.placeholder(tf.int32, shape=[None], name='label')
    self.seq_len = tf.placeholder(tf.int32, shape=[None], name='seq_len')
    self.dropout = tf.placeholder(tf.float32, shape=(), name='dropout')

    ## Read file
    self.train_text, self.train_len, self.train_score = self.prepro.read_data(self.path + '/ratings_train.txt')
    self.test_text, self.test_len, self.test_score = self.prepro.read_data(self.path + '/ratings_test.txt')
    self.train_size, self.test_size = len(self.train_score), len(self.test_score)
    num_train_steps = int(self.train_size / self.batch_size) + 1

    ## Feedable datasets built from the placeholders above
    train_dataset = tf.data.Dataset.from_tensor_slices((self.word_idx, self.label, self.seq_len))
    train_dataset = train_dataset.shuffle(self.train_size)
    train_dataset = train_dataset.batch(self.batch_size)
    train_dataset = train_dataset.repeat()

    test_dataset = tf.data.Dataset.from_tensor_slices((self.word_idx, self.label, self.seq_len))
    test_dataset = test_dataset.batch(self.batch_size)
    test_dataset = test_dataset.repeat()

    iters = tf.data.Iterator.from_structure(train_dataset.output_types,
                                            train_dataset.output_shapes)
    self.iter_word_idx, self.iter_label, self.iter_seq_len = iters.get_next()

    ## Create the initialisation operations
    self.train_init_op = iters.make_initializer(train_dataset)
    self.test_init_op = iters.make_initializer(test_dataset)

    ## Build graph
    self.build_model(isTrain)
    self.build_optimizer(num_train_steps)
    self.get_accuracy()
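# Hedged sketch (not the author's code): with this feedable-placeholder
# Dataset pattern, each epoch starts by re-initializing the iterator with
# the real arrays. The argument names below are assumptions.
def init_train_iterator(sess, model, word_idx, labels, seq_lens):
    sess.run(model.train_init_op, feed_dict={model.word_idx: word_idx,
                                             model.label: labels,
                                             model.seq_len: seq_lens})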
def main():
    global files
    files = list(glob.glob('*.txt'))
    v = list()
    for filename in files:
        with open(filename) as f:
            text = f.read()
        p = preprocess.Preprocess(text)
        v.append(p.get_list())
    # Creating index
    idx = indexer.Index(v)
    display_menu(idx)
def compute_low_dimensions_data_matrix(weight, datas):
    _datas = datas
    # collapse the five layer weights into a single projection matrix
    weight_matrix = np.matmul(
        np.matmul(np.matmul(np.matmul(weight[0], weight[1]), weight[2]), weight[3]),
        weight[4])
    output_train_data = np.matmul(_datas[0][0], weight_matrix)
    output_test_data = np.matmul(_datas[1][0], weight_matrix)
    datas = preprocess.Preprocess(
        ((output_train_data, _datas[0][1]), (output_test_data, _datas[1][1])),
        type='svm').do_svm_preprocess()
    save_dbn_weight_as_svm(datas[0], 'train')
    save_dbn_weight_as_svm(datas[1], 'test')
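# Equivalent collapse of the weight chain, assuming each weight[i] is a 2-D
# matrix (a sketch: np.linalg.multi_dot picks the cheapest multiplication
# order but yields the same product as the nested matmuls above):
#   weight_matrix = np.linalg.multi_dot(weight[:5])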
def run(): input_dir = "input" output_dir = "output" ap = argparse.ArgumentParser() ap.add_argument("-i", "--input", required=True,help="path to input text document") #ap.add_argument("-o", "--output", required=True,help="path to output Summarized Document") args = vars(ap.parse_args()) input_path = os.path.join(input_dir,args['input']) output_path = os.path.join(output_dir,args['input']) pr = preprocess.Preprocess() input_text = pr.get_article_content(input_path) summary = get_summary(input_text) #pdb.set_trace() with open(output_path,'w' ,encoding = "utf-8") as f: f.write(summary)
def get_summary(input_text):
    pr = preprocess.Preprocess()
    original_text = input_text
    preprocessed_text = pr.get_clean_article(input_text)
    sentences = pr.get_article_sentences(preprocessed_text)
    original_sentences = pr.get_article_sentences(input_text)
    paragraphs = pr.get_cleaned_article_paragraphes(preprocessed_text)
    para_sent_list = pr.get_para_sentences(paragraphs)
    tokenized_word_sentences = pr.get_tokenized_word_sentences(sentences)
    doc = document.Doc(original_text=original_text,
                       original_sentences=original_sentences,
                       preprocessed_text=preprocessed_text.replace('ppp', ""),
                       sentences=sentences,
                       paragraphs=paragraphs,
                       para_sent_list=para_sent_list,
                       tokenized_word_sentences=tokenized_word_sentences)
    summary = doc.summarize()
    return summary
def get_sample_users(self, valid_users=None):
    # If sample users exist in the cache, read and return them
    if self._cache.exist_sample_users():
        if self._run is not None and self._run.verbose:
            print('Sample users cache found')
        sample_users = self._cache.read_sample_users()
        return sample_users
    else:
        if self._run is not None and self._run.verbose:
            print('Sample users cache not found. Generating.')
        # If not in cache, generate sample users and save them
        if valid_users is None:
            valid_users = self.get_valid_users()
        sample_users = random.sample(valid_users, self._config.sample_size)
        if self._run is not None and self._run.verbose:
            print('Sample users generated. Checking for outliers.')
        # Check for outliers in the selected sample
        self._db.open()
        time_start, time_end = self._db.get_time_min_max()
        prep = preprocess.Preprocess()
        links = self._db.get_links(time_start, time_end, sample_users)
        outliers = prep.outlier_nodes(links, sample_users,
                                      self._config.density_neighbors,
                                      self._config.density_cutoff, True)
        if self._run is not None and self._run.verbose:
            print(str(len(outliers)) + ' outliers found')
        # Remove the outliers from the sample
        for n in outliers:
            sample_users.remove(n)
        if self._run is not None and self._run.verbose:
            print('Outliers removed. Saving sample users in cache.')
        self._cache.save_sample_users(sample_users)
        self._db.close()
        return sample_users
def __init__(self, directory, load_file=None):
    self.img_size = 100
    self.n_angles = 6
    direc = directory
    self.pre = preprocess.Preprocess(self.img_size, self.n_angles)
    self.lookup = []
    for subdir, dirs, files in os.walk(direc):
        for f in files:
            filename = os.path.join(subdir, f)
            if filename[-3:] == 'jpg':
                if not load_file:
                    raw_img = cv2.imread(filename, 0)
                    self.pre.process_img(raw_img)
                self.lookup.append(filename)
                print('%i:\t%s' % (len(self.lookup), filename))
    if load_file:
        edgel_counts = database.load_data(self.pre.hits, load_file)
        self.pre.set_edgel_counts(edgel_counts)
def __init__(self, fname1, fname2):
    self.img_size = 100
    self.n_angles = 4
    self.pre = preprocess.Preprocess(self.img_size, self.n_angles)
    self.test_img = np.zeros((self.n_angles, self.img_size, self.img_size))
    query_img = cv2.imread(fname1, 0)
    database_img = cv2.imread(fname2, 0)
    self.query_set = self.pre.get_edgels(query_img)
    self.pre.process_img(database_img)
    self.hitmap = self.pre.get_hitmap()
    # paint database edgels grey (128) and query edgels white (255)
    for i, r in enumerate(self.hitmap):
        for j, c in enumerate(r):
            for theta, entry in enumerate(c):
                if len(entry) > 0:
                    self.test_img[theta, i, j] = 128
    for i, j, theta in self.query_set:
        self.test_img[theta, i, j] = 255
def autorun(self):
    """Auto run."""
    # read autorun configure file
    pre_process = preprocess.Preprocess()
    pre_process.readconfig('autorun.ini')
    self.endoutput = pre_process.endoutput
    self.midoutput = pre_process.midoutput
    self.respath = pre_process.respath
    self.gsystem = pre_process.gsystem
    self.ctype = pre_process.ctype
    yesterday = datetime.datetime.now().date() + datetime.timedelta(-1)
    self.duration = [yesterday, yesterday]
    # start processes
    process = list()
    process.append(multiprocessing.Process(target=self.enu))
    process.append(multiprocessing.Process(target=self.uh))
    process.append(multiprocessing.Process(target=self.satnum))
    [p.start() for p in process]
    [p.join() for p in process]
    # check evaluation quality
    check.check()
    now = datetime.datetime.now().replace(second=0, microsecond=0)
    print('%s: The process of %s Done!' % (str(now), str(yesterday)))
def startPreprocess(self):
    self.prepro = preprocess.Preprocess()
    self.prepro.run()
@app.route('/', methods=['GET', 'POST'])
def index():
    if request.method == 'POST':
        sentence = request.form["sentence"]
        result = infer_example(sentence, infer_graph, sess)
        return render_template("index.html", result=result, input_sent=sentence)
    return render_template("index.html")

if __name__ == '__main__':
    prepro = preprocess.Preprocess(word_dim=word_dim,
                                   max_vocab_size=max_vocab_size,
                                   path=corpuspath)
    word_embedding, clear_padding, word2idx, idx2word = prepro.build_embedding()
    ## Create model graph
    infer_graph = Model(word_embedding, max_word_len)
    infer_fn = infer_graph.build_model(hidden_dim, num_layers, None, False)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.restore(sess, modelpath + modelName)
    app.run(host='0.0.0.0', port=5000, debug=True)
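# Hedged client-side check (a sketch, assuming the server above is running
# locally; the form field name 'sentence' comes from index() above, while
# the example input string is an assumption):
import requests

resp = requests.post('http://localhost:5000/', data={'sentence': 'example sentence'})
print(resp.status_code, resp.text[:200])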
def __init__(self):
    self.preprocess = preprocess.Preprocess()
    self.batcher = batch.Batcher(self.preprocess)
    self.batchGen = self.batcher.batchGen
    self.embeddingMatrix = self.preprocess.embeddingMatrix
def run(args):
    print('> compute LCS')
    files, labels = pc.getFiles(args.inputfolder, args.suffix, args.labelfile,
                                exact=args.exact)
    if len(args.max_descriptors) == 0:
        descriptors, index_list = pc.loadDescriptors(files, rand=True,
                                                     return_index_list=1)
    else:
        descriptors, index_list = pc.loadDescriptors(
            files,
            max_descs=args.lcs_max_descriptors,
            max_descs_per_file=max(int(args.lcs_max_descriptors / len(files)), 1),
            rand=True, return_index_list=1)
    print('descriptors.shape', descriptors.shape)

    # per-descriptor labels:
    if len(index_list) - 1 != len(labels):
        raise ValueError('{} != {} + 1'.format(len(index_list), len(labels)))
    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)
    desc_labels = np.zeros(len(descriptors), dtype=np.uint32)
    for r in range(len(labels)):
        desc_labels[index_list[r]:index_list[r + 1]] = labels[r]

    prep = preprocess.Preprocess(args)
    ubm = ubm_adaption.loadGMM(args.load_ubm)
    if not args.no_assignment:
        assignments = encoding.getAssignment(ubm.means_, descriptors)

    lcs = []
    descr = []
    # Note: we could also compute the LCS afterwards using the 'multipca'
    # option of preprocess...
    for i in range(len(ubm.means_)):
        if args.no_assignment:
            diff = descriptors - ubm.means_[i]
        else:
            for_lcs = descriptors[assignments[:, i] > 0]
            diff = for_lcs - ubm.means_[i]
        if args.resnorm:
            diff = preprocessing.normalize(diff, norm='l2', copy=False)
        if not args.global_cs:
            prep.fit(diff, desc_labels[assignments[:, i] > 0])
            lcs.append(copy.deepcopy(prep.pca))
            prep.pca = None
        else:
            descr.append(diff)

    if args.global_cs:
        print('> compute global lcs')
        diff = np.concatenate(descr, axis=1)
        print('... from descr.shape', diff.shape)
        prep.fit(diff, desc_labels)
        print('< compute global lcs')
        lcs = copy.deepcopy(prep.pca)
        prep.pca = None

    folder = os.path.join(args.outputfolder, 'lcs.pkl.gz')
    pc.dump(folder, lcs)
    return folder
import preprocess
import plotting
import matplotlib.pyplot as plt
import pickle
import os
import sys
import database

model_pat = os.path.dirname(os.path.realpath(__file__)) + "/model.sav"
model = pickle.load(open(model_pat, "rb"))

env = preprocess.Preprocess("test_image/car4.jpg")
env.plate_detection()
segmented_characters = env.character_segmentation()
plotting.show()

segmented_characters.sort()
ans = []
for char in segmented_characters:
    ans.append(model.predict(char[1].reshape(1, -1)))

license_plate = []
for val in ans:
    license_plate.append(val[0])

# positions 0, 1, 4 and 5 hold letters: fix common digit/letter confusions
for idx in range(len(license_plate)):
    if idx in (0, 1, 4, 5):
        if license_plate[idx] == '0':
            license_plate[idx] = 'O'
        elif license_plate[idx] == '1':
            # assumed completion: the snippet was truncated here; '1' in a
            # letter position is presumably corrected to 'I'
            license_plate[idx] = 'I'
import os
import preprocess as pp
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
mpiSize = comm.Get_size()
name = MPI.Get_processor_name()

pre_process = pp.Preprocess('data.txt', mpiSize)
data_info = pre_process.data
terminate_flag = None

if pre_process.data.is_build_file == True:
    # rank 0 builds the file; every rank then receives the terminate flag
    if rank == 0:
        if not pre_process.make_file():
            print("The target file does not exist.")
            terminate_flag = True
    terminate_flag = comm.bcast(terminate_flag, root=0)
    if terminate_flag == True:
        print("the end: ", rank)
        exit()

task_set = pre_process.get_task_set(rank)
if len(task_set) == 0:
    exit()
try:
    if not pre_process.set_parameter(task_set, rank):
        exit()
    pre_process.do_montage(task_set, rank)
except Exception:
    # assumed handler: the original snippet is truncated after do_montage,
    # so the body of the except clause is not shown
    raise
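# Typical launch for this MPI-parallel montage script (the script name is
# an assumption; the rank/size logic above expects one process per task):
#   mpiexec -n 4 python run_montage.py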