def main():
    """Cross-validate the Bayes sentiment classifier, then classify one sample tweet.

    Loads the pickled tweet objects from ``tweets.p``, preprocesses them in
    place, runs 3-fold cross-validation (printing the accuracy of every fold)
    and finally prints the predicted sentiment and class probabilities for a
    hand-written example tweet.
    """
    # NOTE(security): unpickling executes whatever the file contains, so
    # 'tweets.p' must come from a trusted source.
    # The context manager closes the file handle, which the bare
    # ``pickle.load(open(...))`` form never did.
    with open('tweets.p', 'rb') as tweets_file:
        tweets = pickle.load(tweets_file)

    # preprocess each tweet's text
    preprocess(tweets)

    # perform KFold cross-validation with 3 folds to get a more reliable
    # accuracy estimate
    kf = KFold(n=len(tweets), n_folds=3, shuffle=True)
    for train_indices, test_indices in kf:
        tweets_train = [tweets[i] for i in train_indices]
        tweets_test = [tweets[i] for i in test_indices]

        clf = BayesSentimentClassifier()
        # train the classifier, i.e. populate the sentiment dictionary
        clf.train(tweets_train)
        # predict sentiment of each tweet using Bayes Theorem
        clf.predict(tweets_test)

        # calculate accuracy
        print(accuracy_score(tweets_test))

    # The sample tweet is classified with the classifier left over from the
    # last fold.
    test = Tweet("", "", "I am skeptical about this result",
                 "Tue Oct 18 18:05:50 +0000 2011", "")
    clf.predict([test])
    print(test.prediction.sentiment)
    print(test.prediction.probabilities)
def process_tweets(rank, input_file, processes):
    """Count languages and hashtags in this process's share of a tweet file.

    The file is read line by line and line ``i`` is handled only when
    ``i % processes == rank``, so several processes can split one file
    without coordinating.

    Args:
        rank: Index of the calling process; selects which lines it owns.
        input_file: Path of the file to read. Each line is parsed as one JSON
            document after a trailing ``",\\n"`` has been removed.
        processes: Total number of processes sharing the file.

    Returns:
        tuple: ``(ht_occurences, lang_occurences)``, two ``Counter`` objects
        keyed by preprocessed hashtag text and by ``doc.lang`` respectively.
    """
    ht_occurences = Counter()
    lang_occurences = Counter()
    with open(input_file) as f:
        logging.info(f"Process: {rank} | Initiating processing task.")
        try:
            for i, line in enumerate(f):
                # Skip lines owned by other ranks before doing any string work.
                if i % processes != rank:
                    continue
                line = line.replace(",\n", "")
                try:
                    data = json.loads(line)
                    lang_occurences[data['doc']['lang']] += 1
                    # Build the full list first so a failure inside
                    # preprocess() leaves the hashtag counter untouched for
                    # this line.
                    hashtags = [
                        preprocess(tag['text'])
                        for tag in data['doc']['entities']['hashtags']
                    ]
                    ht_occurences.update(hashtags)
                except ValueError:
                    logging.info(
                        f"Process: {rank} | Malformed JSON on line: {i}")
        except Exception:
            # Best-effort: keep whatever was counted so far, but record the
            # traceback so the real cause (e.g. a missing key) is visible.
            logging.exception("Problem reading file.")
    logging.info(f"Process: {rank} | I am done Processing.")
    return ht_occurences, lang_occurences
def __init__(self, image_path, transform=None):
    """Read the data under ``image_path``, preprocess it and prepare tensors.

    Args:
        image_path: Location of the data; it is made absolute and handed to
            ``self._init_dataset``.
        transform: Optional callable applied to every sample after the sample
            has been wrapped with ``Image.fromarray``. When ``None`` the
            dataset is converted to a ``torch.FloatTensor`` instead.
    """
    self.dataset = np.array([])
    self.label = []
    self.transform = transform
    # Initialize and read data in the specified path
    # (presumably fills self.dataset and self.label; confirm in _init_dataset)
    self._init_dataset(os.path.abspath(image_path))
    # Preprocess data
    self.dataset = preprocess(self.dataset)
    # For data augmentation
    if self.transform is not None:
        # Each sample is replaced in place by its transformed version.
        # NOTE(review): on this path self.dataset is not converted to a torch
        # tensor here, so the .cuda() call below relies on whatever type
        # preprocess()/transform produce; confirm.
        for i in range(len(self.dataset)):
            self.dataset[i] = self.transform(
                Image.fromarray(self.dataset[i]))
    else:
        self.dataset = torch.FloatTensor(
            self.dataset)  # a tensor of shape (n, 36, 128)
    self.label = torch.LongTensor(
        self.label).squeeze()  # a tensor of shape (n, 5)
    # Move everything to the GPU when one is available.
    if torch.cuda.is_available():
        self.dataset = self.dataset.cuda()
        self.label = self.label.cuda()
def main():
    """Build the training features, fit a random-forest regressor and save both."""
    print("Reading training data ...")
    raw_frame = data_io.read_train()
    # Missing values are replaced by 0, first in place and then once more on
    # the copy that the rest of the pipeline works with.
    raw_frame.fillna(0, inplace=True)
    filled_frame = raw_frame.fillna(value=0)

    features = ut.preprocess(filled_frame)
    target = ut.construct_target(filled_frame)
    # Earlier alternative target: filled_frame["booking_bool"].values

    # Keep the processed data around; it is useful for testing the model's
    # performance later.
    print("Saving processed training data ...")
    data_io.save_processed_data([features, target])

    print("Training the Regressor ...")
    # A RandomForestClassifier was the earlier choice here.
    forest_options = dict(
        n_estimators=10,
        verbose=2,
        n_jobs=-1,
        max_features="sqrt",
        min_samples_split=10,
        random_state=1,
    )
    regressor = RandomForestRegressor(**forest_options)
    regressor.fit(features, target)

    print("Saving the Regressor ...")
    data_io.save_model(regressor)
def main():
    """Cross-validate the Bayes sentiment classifier, then classify one sample tweet.

    Loads the pickled tweet objects from ``tweets.p``, preprocesses them in
    place, runs 3-fold cross-validation (printing the accuracy of every fold)
    and finally prints the predicted sentiment and class probabilities for a
    hand-written example tweet.
    """
    # NOTE(security): unpickling executes whatever the file contains, so
    # 'tweets.p' must come from a trusted source.
    # The context manager closes the file handle, which the bare
    # ``pickle.load(open(...))`` form never did.
    with open('tweets.p', 'rb') as tweets_file:
        tweets = pickle.load(tweets_file)

    # preprocess each tweet's text
    preprocess(tweets)

    # perform KFold cross-validation with 3 folds to get a more reliable
    # accuracy estimate
    kf = KFold(n=len(tweets), n_folds=3, shuffle=True)
    for train_indices, test_indices in kf:
        tweets_train = [tweets[i] for i in train_indices]
        tweets_test = [tweets[i] for i in test_indices]

        clf = BayesSentimentClassifier()
        # train the classifier, i.e. populate the sentiment dictionary
        clf.train(tweets_train)
        # predict sentiment of each tweet using Bayes Theorem
        clf.predict(tweets_test)

        # calculate accuracy
        print(accuracy_score(tweets_test))

    # The sample tweet is classified with the classifier left over from the
    # last fold.
    test = Tweet("", "", "I am skeptical about this result",
                 "Tue Oct 18 18:05:50 +0000 2011", "")
    clf.predict([test])
    print(test.prediction.sentiment)
    print(test.prediction.probabilities)
def predict(event, context):
    """Run inference on the data carried by an SQS event.

    The event is turned into a dataframe, preprocessed, scored with the model
    stored at ``MODEL_URI`` and the post-processed result is sent to
    ``WRITER_QUEUE``. ``context`` is accepted but not used.

    Returns:
        dict: ``{"status": "success"}`` once the message has been sent.
    """
    frame = utilities.load_dataframe_from_sqs_event(event)
    features = utilities.preprocess(frame)

    predictions = utilities.load_model(MODEL_URI).predict(features)
    results = utilities.postprocess(features, predictions)

    outgoing = utilities.SQSMessage()
    outgoing.dataframe = results
    outgoing.send(queue=WRITER_QUEUE)

    return {"status": "success"}
def main():
    """Train and evaluate the multivariate linear regression on the housing data.

    Loads ``input/housing.csv`` without the ignored columns, normalizes the
    features, trains a model saved under ``output/`` and prints the RMSE
    ratio measured on the held-out split.
    """
    model_file_path = os.path.join("output", "linear_regression_model_mv.sav")
    data_file_path = os.path.join("input", "housing.csv")
    ignored_columns = [
        'ZN', 'CHAS', 'NOX', 'RM', 'DIS',
        'RAD', 'TAX', 'PIRATIO', 'B', 'LSTAT',
    ]

    X, Y = load_data(data_file_path, False, ignored_columns)
    X = preprocess(X, "normalize")
    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    train(X_train, y_train, model_file_path)
    predictions = predict(X_test, model_file_path)

    print("rmse ratio:", calculate_rmse_ration(y_test, predictions))
def main(input_path, output_path, ignored_columns, preprocess_type,
         training_data_rate, step_length, threshold_rate, max_loop_num,
         dynamic_step):
    """Run one full train/evaluate cycle and report its settings and timing.

    Echoes every setting, loads and preprocesses the data, trains a model that
    is stored at ``output_path``, predicts on the held-out split and prints
    the RMSE ratio together with the elapsed time.
    """
    # --- echo the configuration -------------------------------------------
    print("input:", input_path)
    print("output:", output_path)
    print("\n")
    if ignored_columns is not None:
        print("ignored_columns:", ignored_columns)
        print("\n")
    print("preprocess_type:", preprocess_type)
    print("training_data_rate:", training_data_rate)
    print("\n")
    print("threshold_rate:", threshold_rate)
    print("max_loop_num:", max_loop_num)
    print("step_length:", step_length)
    print("dynamic stepping ..." if dynamic_step else "static stepping ...")
    print("\n")

    # --- timed pipeline -----------------------------------------------------
    started_at = datetime.now()

    X, Y = load_data(input_path, True, ignored_columns)
    X = preprocess(X, preprocess_type)
    X_train, y_train, X_test, y_test = split_dataset(X, Y, training_data_rate)

    threshold = gen_threshold(Y, threshold_rate)
    train(X_train, y_train, output_path, step_length, threshold,
          max_loop_num, dynamic_step)

    predictions = predict(output_path, X_test)
    rmse_ration = calculate_rmse_ration(y_test, predictions)
    print("rmse ratio (rmse / y_mean) is:", rmse_ration, "\n")

    finished_at = datetime.now()
    print("execution duration:", finished_at - started_at, "\n")
def master_worker(comm, input_file):
    """Process the master's share of the tweets and produce the top-10 lists.

    With more than one process the master counts its own share, merges the
    counts returned by ``gather_tweets`` and tells every other rank to exit.
    With a single process it simply counts the whole file.

    Args:
        comm: Communicator providing ``Get_rank``, ``Get_size`` and ``send``.
        input_file: Path of the tweet file to read.

    Returns:
        tuple: ``(top_hashtags, top_languages)``, each the result of
        ``Counter.most_common(10)``.
    """
    rank = comm.Get_rank()
    size = comm.Get_size()

    if size > 1:
        logging.info(f'Process: {rank} | I am Master!')
        ht_counts, lang_counts = process_tweets(rank, input_file, size)

        ht_temp, lang_temp = gather_tweets(comm)
        logging.info("Process: 0 (Master) | Shutting Down slave(s)")
        ht_counts += ht_temp
        lang_counts += lang_temp

        # Turn everything off
        for i in range(size - 1):
            comm.send('exit', dest=(i + 1), tag=(i + 1))
    else:
        logging.info(f'Process: {rank} | I am processing alone!')
        # With a single process every line belongs to this rank (i % 1 == 0),
        # so the shared helper replaces the former copy of its loop.
        ht_counts, lang_counts = process_tweets(rank, input_file, 1)

    return ht_counts.most_common(10), lang_counts.most_common(10)
def main():
    """Fit the multivariate least-squares model on the housing data and score it.

    Loads ``input/housing.csv`` without the ignored columns, normalizes the
    features, writes the fitted model to ``output/lsm_multivariant.csv`` and
    prints the RMSE ratio measured on the held-out split.
    """
    data_file_path = os.path.join('input', 'housing.csv')
    path = os.path.join('output', 'lsm_multivariant.csv')
    ignored_columns = [
        'ZN', 'CHAS', 'NOX', 'RM', 'DIS',
        'RAD', 'TAX', 'PIRATIO', 'B', 'LSTAT',
    ]

    X, Y = load_data(data_file_path, True, ignored_columns)
    X = preprocess(X, "normalize")
    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    lsm(X_train, y_train, path)
    predictions = predict(path, X_test)

    print("rmse ratio:", calculate_rmse_ration(y_test, predictions))
def cross_vad(examples, num_folds = 10): data = ut.dataCrossSplit(examples, num_folds, False) errorRates = [] for i in range(num_folds): egs = data[i] dt = DTree(SelectAtt) dt.training(egs[0], 1) # calculate error rate error = [0.] * 2 for j in range(2): for x in egs[j]: if dt.predict(x) != x[0]: error[j] = error[j] + 1 error[j] = error[j] / len(egs[j]) print "Fold ", i, " trainingData errorRate: ", error[0], " testData errorRate:", error[1] errorRates.append(error) arr = np.array(errorRates) print "Train Mean ErrorRate:", np.mean(arr[:,0]), " Test Mean ErrorRate:", np.mean(arr[:,1]) print "Train StdVar ErrorRate:", np.sqrt(np.var(arr[:,0])), " Test Mean ErrorRate:", np.sqrt(np.var(arr[:, 1])) if __name__ == '__main__': filename = sys.argv[1] egs = ut.importRawData(filename) egs = ut.preprocess(egs) cross_vad(egs) dt = DTree(SelectAtt) dt.training(egs) print "========= the decision tree ============" dt.printTree()
def find_math_elements(image, arrow_c, bound=20, show=False):
    """Detect math elements in the image.

    Args:
        image: The input image.
        arrow_c: Center of the arrow as an ``(x, y)`` pair. A 120x120 square
            around it is painted white so the arrow is not detected.
        bound: Half the side length of the square cropped around each element.
        show: Whether to show the annotated image.

    Return:
        list: 28x28 crops of all math elements.
        list: Center coordinates ``(x, y)`` of the math elements.
    """
    image_original = image.copy()
    image_copy = image.copy()

    # cover red arrow with white rectangle
    cv2.rectangle(image_copy, (arrow_c[0] - 60, arrow_c[1] - 60),
                  (arrow_c[0] + 60, arrow_c[1] + 60), (255, 255, 255), -1)

    # threshold on the V channel: 90% of the image's mean value
    value_threshold = int(
        cv2.cvtColor(image, cv2.COLOR_RGB2HSV)[:, :, 2].mean() * 0.9)
    masked = mask_image(image_copy, np.array([0, 0, 0]),
                        np.array([180, 255, value_threshold]),
                        np.array([100, 100, 0]), np.array([140, 255, 255]))
    preprocessed = preprocess(masked, 10, False)

    # get the contours of all shapes
    contours, _ = cv2.findContours(preprocessed, cv2.RETR_LIST,
                                   cv2.CHAIN_APPROX_SIMPLE)

    centers = []
    elements = []
    label_color = (214, 39, 40)
    for c in contours:
        # compute the centroid of the shapes
        M = cv2.moments(c)
        area = M['m00']
        elongation = compute_elongation(M)

        # these are either too small or too big or too elongated
        if area < 40 or area > 400 or elongation > 3000:
            continue

        cY = int(M['m01'] / area)
        cX = int(M['m10'] / area)
        center = (cX, cY)

        # if it is too close to a known element, it is not a valid element
        if any((kX - cX)**2 + (kY - cY)**2 < 4000 for kX, kY in centers):
            continue

        # save element and center; the slice start is clamped at 0 because a
        # negative start would wrap around and yield an empty crop that
        # cv2.resize rejects
        top = max(cY - bound, 0)
        left = max(cX - bound, 0)
        element = image[top:cY + bound, left:cX + bound]
        element = cv2.resize(element, (28, 28))
        elements.append(element)
        centers.append(center)

        # visualize the result on the image
        cv2.rectangle(image_original, (cX - bound, cY - bound),
                      (cX + bound, cY + bound), label_color, 2)
        cv2.putText(image_original, f'{len(elements) - 1}', (cX, cY + 40),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, label_color, 2)

    if show:
        imshow(image_original)

    return elements, centers
def find_red_arrow(image, show=False):
    """Detect the red arrow in the image.

    Args:
        image: The input image.
        show: Whether to show the annotated image.

    Return:
        array: Tip coordinates (one row of the approximated polygon).
        tuple: Center coordinates ``(x, y)`` of the arrow contour.

    Raises:
        ValueError: If no contour passes the area/elongation filter. This used
            to surface as an unbound-variable error at the return statement.
    """
    image_copy = image.copy()
    masked = mask_image(image_copy, np.array([0, 100, 0]),
                        np.array([20, 255, 255]), np.array([160, 100, 0]),
                        np.array([180, 255, 255]))
    preprocessed = preprocess(masked, 100)
    contours, _ = cv2.findContours(preprocessed, cv2.RETR_LIST,
                                   cv2.CHAIN_APPROX_SIMPLE)

    tip = None
    center = None
    for c in contours:
        # compute the centroid of the shapes
        M = cv2.moments(c)
        area = M['m00']
        elongation = compute_elongation(M)

        # these will not be the arrow (too small, too big or too elongated)
        if area < 1000 or area > 10000 or elongation > 100:
            continue

        cX = int(M['m10'] / area)
        cY = int(M['m01'] / area)
        center = (cX, cY)

        # find the corners of the arrow
        points = cv2.approxPolyDP(c, 4.7, True).squeeze(1)
        n_points = len(points)
        tip_idx = 0
        cand_tips = []
        angles = []

        # find tip candidates
        for i in range(n_points):
            # current point and its neighbours; index -1 and the modulo wrap
            # around the closed polygon
            x = points[i - 1]
            y = points[i]
            z = points[(i + 1) % n_points]

            # lengths of the two edges meeting at the current point
            l1 = np.linalg.norm(np.array(x) - np.array(y))
            l2 = np.linalg.norm(np.array(y) - np.array(z))
            ang = compute_angle(x, y, z)
            angles.append(ang)

            # save candidates
            if abs(ang - 100) < 15 and (l1 + l2 > 30):
                cand_tips.append(i)

        # choose the correct tip: the last candidate whose neighbouring
        # angles sum to a value strictly between 200 and 300
        for i in cand_tips:
            pang = angles[i - 1]
            nang = angles[(i + 1) % n_points]
            if 200 < pang + nang < 300:
                tip_idx = i

        tip = points[tip_idx]

        # visualize the result on the image
        cv2.drawContours(image_copy, [c], 0, (214, 39, 40), 2)
        cv2.circle(image_copy, tuple(center), 5, (0, 255, 0), -1)
        cv2.circle(image_copy, tuple(tip), 5, (0, 0, 255), -1)
        # only the first qualifying contour is used
        break

    if show:
        imshow(image_copy)

    if tip is None:
        raise ValueError("no red arrow found in the image")

    return tip, center
def main():
    """Parse the command line, build the SRGAN graph and run the training loop.

    The loop periodically evaluates the generator loss and the benchmarks,
    appends a line to ``loss.csv`` in the log directory and saves a
    checkpoint; in between it trains the discriminator (only with
    ``--use-gan``) and the generator on freshly drawn batches.
    """
    # ---- command-line arguments ------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument('--load',
                        type=str,
                        help='Checkpoint to load all weights from.')
    parser.add_argument('--load-gen',
                        type=str,
                        help='Checkpoint to load generator weights only from.')
    parser.add_argument('--name', type=str, help='Name of experiment.')
    parser.add_argument('--overfit',
                        action='store_true',
                        help='Overfit to a single image.')
    parser.add_argument('--batch-size',
                        type=int,
                        default=16,
                        help='Mini-batch size.')
    parser.add_argument(
        '--log-freq',
        type=int,
        default=10000,
        help='How many training iterations between validation/checkpoints.')
    parser.add_argument('--learning-rate',
                        type=float,
                        default=1e-4,
                        help='Learning rate for Adam.')
    parser.add_argument('--content-loss',
                        type=str,
                        default='mse',
                        choices=['mse', 'vgg22', 'vgg54'],
                        help='Metric to use for content loss.')
    parser.add_argument(
        '--use-gan',
        action='store_true',
        help='Add adversarial loss term to generator and trains discriminator.'
    )
    parser.add_argument('--image-size',
                        type=int,
                        default=96,
                        help='Size of random crops used for training samples.')
    parser.add_argument('--vgg-weights',
                        type=str,
                        default='vgg_19.ckpt',
                        help='File containing VGG19 weights (tf.slim)')
    parser.add_argument('--train-dir',
                        type=str,
                        help='Directory containing training images')
    parser.add_argument(
        '--validate-benchmarks',
        action='store_true',
        help=
        'If set, validates that the benchmarking metrics are correct for the images provided by the authors of the SRGAN paper.'
    )
    parser.add_argument('--gpu',
                        type=str,
                        default='0',
                        help='Which GPU to use')
    # NOTE(review): args.epoch is not read anywhere in this function; the
    # training loop below has no iteration limit.
    parser.add_argument('--epoch',
                        type=int,
                        default='1000000',
                        help='How many iterations ')
    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    # Set up models
    d_training = tf.placeholder(tf.bool, name='d_training')
    g_training = tf.placeholder(tf.bool, name='g_training')
    # NOTE(review): the discriminator receives g_training and the generator
    # d_training, which looks swapped. Both are always fed True below, so it
    # has no visible effect here; confirm the intent.
    discriminator = srgan.SRGanDiscriminator(training=g_training,
                                             image_size=args.image_size)
    generator = srgan.SRGanGenerator(discriminator=discriminator,
                                     training=d_training,
                                     learning_rate=args.learning_rate,
                                     content_loss=args.content_loss,
                                     use_gan=args.use_gan)

    # Generator
    g_x = tf.placeholder(tf.float32, [None, None, None, 3],
                         name='input_lowres')
    g_y = tf.placeholder(tf.float32, [None, None, None, 3],
                         name='input_highres')
    g_y_pred = generator.forward(g_x)
    g_loss = generator.loss_function(g_y, g_y_pred)
    g_train_step = generator.optimize(g_loss)

    # Discriminator
    d_x_real = tf.placeholder(tf.float32, [None, None, None, 3],
                              name='input_real')
    d_y_real_pred, d_y_real_pred_logits = discriminator.forward(d_x_real)
    d_y_fake_pred, d_y_fake_pred_logits = discriminator.forward(g_y_pred)
    d_loss = discriminator.loss_function(d_y_real_pred, d_y_fake_pred,
                                         d_y_real_pred_logits,
                                         d_y_fake_pred_logits)
    d_train_step = discriminator.optimize(d_loss)

    # Set up benchmarks
    benchmarks = [
        Benchmark('Benchmarks/Set5', name='Set5'),
        Benchmark('Benchmarks/Set14', name='Set14'),
        Benchmark('Benchmarks/BSD100', name='BSD100')
    ]
    if args.validate_benchmarks:
        for benchmark in benchmarks:
            benchmark.validate()

    # Create log folder
    # When resuming without an explicit experiment name, keep logging next to
    # the loaded checkpoint.
    if args.load and not args.name:
        log_path = os.path.dirname(args.load)
    else:
        log_path = build_log_dir(args, sys.argv)

    with tf.Session() as sess:
        # Build input pipeline
        get_train_batch, get_val_batch, get_eval_batch = input_setup(
            args, sess)

        # Initialize
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())

        # Start input pipeline thread(s)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Load saved weights
        iteration = 0
        saver = tf.train.Saver()
        # Load generator
        # The iteration counter is taken from the text after the last '-' in
        # the checkpoint path.
        if args.load_gen:
            gen_saver = tf.train.Saver(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  scope='generator'))
            iteration = int(args.load_gen.split('-')[-1])
            gen_saver.restore(sess, args.load_gen)
        # Load all
        if args.load:
            iteration = int(args.load.split('-')[-1])
            saver.restore(sess, args.load)
            # NOTE(review): the next two prints look like leftover debugging
            # output.
            print(saver)
            print("load_process_DEBUG")
        # Load VGG
        if 'vgg' in args.content_loss:
            vgg_saver = tf.train.Saver(
                tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  scope='vgg_19'))
            vgg_saver.restore(sess, args.vgg_weights)

        # Train
        # NOTE(review): this loop contains no break, so it only ends when an
        # exception propagates out of it.
        while True:
            if iteration % args.log_freq == 0:
                # Test every log-freq iterations
                val_error = evaluate_model(g_loss, get_val_batch, sess, 119,
                                           args.batch_size)
                eval_error = evaluate_model(g_loss, get_eval_batch, sess, 119,
                                            args.batch_size)
                # Log error
                # (val_error is printed under "Test", eval_error under "Train")
                print('[%d] Test: %.7f, Train: %.7f' %
                      (iteration, val_error, eval_error),
                      end='')
                # Evaluate benchmarks
                log_line = ''
                for benchmark in benchmarks:
                    psnr, ssim, _, _ = benchmark.evaluate(
                        sess, g_y_pred, log_path, iteration)
                    print(' [%s] PSNR: %.2f, SSIM: %.4f' %
                          (benchmark.name, psnr, ssim),
                          end='')
                    log_line += ',%.7f, %.7f' % (psnr, ssim)
                print()
                # Write to log
                with open(log_path + '/loss.csv', 'a') as f:
                    f.write('%d, %.15f, %.15f%s\n' %
                            (iteration, val_error, eval_error, log_line))
                # Save checkpoint
                saver.save(sess,
                           os.path.join(log_path, 'weights'),
                           global_step=iteration,
                           write_meta_graph=False)

            # Train discriminator
            if args.use_gan:
                batch_hr = sess.run(get_train_batch)
                # low-res input is the high-res batch downsampled by 4
                batch_lr = downsample_batch(batch_hr, factor=4)
                batch_lr, batch_hr = preprocess(batch_lr, batch_hr)
                sess.run(d_train_step,
                         feed_dict={
                             d_training: True,
                             g_training: True,
                             g_x: batch_lr,
                             g_y: batch_hr,
                             d_x_real: batch_hr
                         })

            # Train generator
            # (a fresh batch is drawn; the discriminator batch is not reused)
            batch_hr = sess.run(get_train_batch)
            batch_lr = downsample_batch(batch_hr, factor=4)
            batch_lr, batch_hr = preprocess(batch_lr, batch_hr)
            sess.run(g_train_step,
                     feed_dict={
                         d_training: True,
                         g_training: True,
                         g_x: batch_lr,
                         g_y: batch_hr
                     })

            iteration += 1

        # Stop queue threads
        # NOTE(review): not reached in normal flow because of the endless loop
        # above.
        coord.request_stop()
        coord.join(threads)
# Script: load the 32x32 train/test .mat files, show one test sample and
# start assembling a fully connected Keras model.
from keras import backend as K
from keras.optimizers import SGD
# Return value is discarded; presumably called to make the backend report or
# initialise the GPUs (private Keras API; confirm it is still needed).
K.tensorflow_backend._get_available_gpus()
from matplotlib.pyplot import imshow
import numpy as np
import scipy.io
import matplotlib.pyplot as plt
import utilities
# NOTE(review): Sequential and Dense are used below but are not imported in
# the visible part of this file; confirm they are imported elsewhere.

#load our dataset
train_data = scipy.io.loadmat('train_32x32.mat')
test_data = scipy.io.loadmat('test_32x32.mat')
#x_train : 73257x1024 , y_train : 73257 x 11
x_train, y_train = utilities.preprocess(train_data)
x_test, y_test = utilities.preprocess(test_data)

# Sanity check: display the test sample at index 4 and print its label.
imshow(x_test[4].reshape(32, 32))
plt.show()
print(y_test[4])

#training
model = Sequential()
model.add(Dense(input_dim=32 * 32, units=633, activation='relu'))
# Performance on the training set is not good yet, so dropout is left out for now
#model.add(Dropout(0.5))
model.add(Dense(units=633, activation='relu'))
model.add(Dense(units=633, activation='relu'))
model.add(Dense(units=633, activation='sigmoid'))
model.add(Dense(units=633, activation='sigmoid'))
# Training corpus: a single sonnet kept inline.
raw_text1 = """When forty winters shall besiege thy brow, And dig deep trenches in thy beauty's field, Thy youth's proud livery so gazed on now, Will be a totter'd weed of small worth held: Then being asked, where all thy beauty lies, Where all the treasure of thy lusty days; To say, within thine own deep sunken eyes, Were an all-eating shame, and thriftless praise. How much more praise deserv'd thy beauty's use, If thou couldst answer 'This fair child of mine Shall sum my count, and make my old excuse,' Proving his beauty by succession thine! This were to be new made when thou art old, And see thy blood warm when thou feel'st it cold."""

# Tokenize the corpus and build the training samples from it.
raw_text = preprocess(raw_text1)
data = prepareData(raw_text)

vocab = set(raw_text)
vocab_size = len(vocab)
print(vocab_size)

# Iterating a set of strings gives a different order on every interpreter
# run, so indices built straight from `vocab` would not be reproducible.
# Sorting makes the word -> index mapping stable across runs.
# (assumes the tokens are mutually comparable, e.g. all str; TODO confirm)
word_to_ix = {word: i for i, word in enumerate(sorted(vocab))}
print(len(data))

losses = []
loss_function = nn.NLLLoss()
# The model's input width is 2 * CONTEXT_SIZE.
model = CBOW(vocab_size, EMBEDDING_DIM, 2 * CONTEXT_SIZE)
optimizer = optim.SGD(model.parameters(), lr=0.001)