def __iter__(self):
    # custom iterator function that defines how to iterate over
    # records according to the configuration specified
    # INTERFACE DEFINITION: this iterator should always yield a string
    if self.msg_flag:
        print('\n\n\n\nRunning the following preprocessing actions:\n\n')
        print(utilities.get_config('./config/preprocessing.yaml'))
        self.msg_flag = 0
    if self.grouping == 'doc':
        for PDFObj, f in zip(self.data_map, self.files):
            pdf_reader = PDFR(PDFObj)
            text_file = ""
            for pg_num in range(pdf_reader.numPages):
                page_text = pdf_reader.getPage(pg_num).extractText()
                text_file = text_file + ' ' + page_text
            self.doc_ids.append(os.path.splitext(ntpath.basename(f))[0])
            yield Preprocessor(text_file, './config/preprocessing.yaml').run()
        self.__read_data(self.files)  # get data
    elif self.grouping == 'page':
        for PDFObj, f in zip(self.data_map, self.files):
            pdf_reader = PDFR(PDFObj)
            for pg_num in range(pdf_reader.numPages):
                page_text = pdf_reader.getPage(pg_num).extractText()
                self.doc_ids.append(
                    os.path.splitext(ntpath.basename(f))[0] + ' Page '
                    + str(pg_num + 1) + ' of ' + str(pdf_reader.numPages))
                yield Preprocessor(page_text, './config/preprocessing.yaml').run()
        self.__read_data(self.files)  # get data
def __iter__(self):
    if self.msg_flag:
        print('\n\n\n\nRunning the following preprocessing actions on group of files:\n\n')
        print(utilities.get_config('./config/preprocessing.yaml'))
        self.msg_flag = 0
    for csv_file, f in zip(self.data_map, self.files):
        reader = csv.reader(csv_file, delimiter=',')
        if self.grouping == "row":
            for row in reader:
                row_cells = ""
                for cell in row:
                    row_cells += ' ' + cell + ' '
                yield Preprocessor(row_cells, './config/preprocessing.yaml', self.files).run()
        elif self.grouping == "col":
            columns = zip(*reader)
            for column in columns:
                # reset per column so each yield covers one column only;
                # the original accumulated text across all columns
                col_text = ""
                for cell in column:
                    col_text += ' ' + cell + ' '
                yield Preprocessor(col_text, './config/preprocessing.yaml').run()
    self.__read_data(self.files)  # get data
def get_means_sigmas(args, x):
    if args.pre == 'kmeans':
        return Preprocessor().compute_gaussian_basis(x, deg=int(args.d), scale=args.scale)
    elif args.pre == 'grid':
        return Preprocessor().grid2d_means(np.min(x[:, 0]), np.max(x[:, 0]),
                                           np.min(x[:, 1]), np.max(x[:, 1]),
                                           step=args.gsize, scale=args.scale)
def preprocess(args, X, T):
    d = args.d
    X_normal, std = Preprocessor().normalize(X)
    if args.pre == 'pca':
        X_phi, phi = Preprocessor().pca(X_normal, k=d)
    elif args.pre == 'lda':
        X_phi, phi = Preprocessor().lda(X_normal, T, d=d)
    else:
        X_phi = X
        phi = np.ones(d)
    bias = np.ones(len(X))[:, np.newaxis]
    X_phi = np.hstack((bias, X_phi))
    return X_phi, phi, std
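# A minimal usage sketch (assumed, not from the original source): `args` only
# needs `pre` and `d` attributes, so argparse.Namespace stands in for real
# command-line arguments, and random data stands in for a real dataset.
from argparse import Namespace
import numpy as np

args = Namespace(pre='pca', d=2)
X = np.random.randn(100, 5)            # 100 samples, 5 raw features
T = np.random.randint(0, 2, size=100)  # labels (only used by the 'lda' branch)
X_phi, phi, std = preprocess(args, X, T)
# X_phi is expected to be (100, d + 1): d projected features plus a bias column.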
import pickle

def main():
    # initialize preprocessor
    preprocessor = Preprocessor()
    # serialize preprocessor
    with open('preprocessor.pkl', 'wb') as f:
        pickle.dump(preprocessor, f)
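# Companion sketch (assumed usage, not in the original source): load the
# pickled preprocessor back. This works as long as the Preprocessor class is
# importable at unpickling time.
def load_preprocessor(path='preprocessor.pkl'):
    with open(path, 'rb') as f:
        return pickle.load(f)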
def print_document(X, Y, T, cams):
    doc, tag, text = Doc().tagtext()
    prepro = Preprocessor(cache_path=cache_dir / 'train_text.json')
    # the original referenced undefined names X_sample / X_test here;
    # both are assumed to mean the X argument
    X_text = prepro.to_text(X)
    with tag('html'):
        with tag('body', style="width: 900px;"):
            for i, p in enumerate(X_text):
                cam = cams[i]
                # normalize cam to [0, 1] for use as heat/alpha values
                heatmap = cam / np.ptp(cam)
                color_map = cv2.applyColorMap(
                    np.uint8(255 * heatmap), cv2.COLORMAP_AUTUMN)
                with tag('div'):
                    with tag('p'):
                        words = p.split(' ')
                        for j, word in enumerate(words):
                            color = color_map[j][0]
                            with tag('span',
                                     style=f'background: rgba({color[2]}, {color[1]}, {color[0]}, {heatmap[j]});',
                                     title=int(X[i, j])):
                                text(word + ' ')
                    with tag('p'):
                        text(f'Pred: {Y[i]}, Label: {T[i]}')
                doc.stag('hr')
    with open(out_dir / 'out.html', 'w') as f:
        f.write(doc.getvalue())
def __init__(self, world, filename=None, simulator=None, once=False, headless=False):
    logging.info('Initialising vision')
    if simulator:
        self.capture = SimCapture(simulator)
    else:
        self.capture = Capture(self.rawSize, filename, once)
    self.headless = headless
    self.threshold = threshold.AltRaw()
    self.pre = Preprocessor(self.rawSize, self.threshold, simulator)
    self.featureEx = FeatureExtraction(self.pre.cropSize)
    self.interpreter = Interpreter()
    self.world = world
    self.gui = GUI(world, self.pre.cropSize, self.threshold)
    self.histogram = Histogram(self.pre.cropSize)
    self.times = []
    self.N = 0
    #debug.thresholdValues(self.threshold.Tblue, self.gui)
    logging.debug('Vision initialised')
def train():
    print('Preprocessing raw data')
    preprocessor = Preprocessor()
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)

    print('Training MF')
    mf = MF(preprocessor, dataset)
    mf.train_or_load_if_exists()

    print('Building I2I')
    i2i = Item2Item(dataset)

    print('Generating candidates')
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    X_train, y_train, q_train, q_train_reader = candidate_generator.generate_train()
    X_val, y_val, q_val, q_val_reader = candidate_generator.generate_val()

    import pickle
    try:
        with open('puke.pkl', 'wb') as f:
            pickle.dump((X_train, y_train, q_train, q_train_reader,
                         X_val, y_val, q_val, q_val_reader), f)
    except Exception:  # don't swallow KeyboardInterrupt/SystemExit
        print("Couldn't save puke")

    print('Training ranker')
    ranker = Ranker()
    ranker.train(X_train, y_train, q_train, X_val, y_val, q_val)
    ranker.save()

    print('Validating ranker')
    rank_scores = ranker.rank(X_val)
    print('ndcg', dataset.validate_ndcg(y_val, q_val, q_val_reader, rank_scores))
def ngram_model(Xtrain, ytrain, Xval, yval):
    model_name = "mnb_ngram"
    preprocessor = Preprocessor(vectorizer_mode=TFIDF_MODE, verbose=True)

    # apply tf-idf vectorization on the text
    Xtrain = preprocessor.vectorize_fit_transform_text(Xtrain)
    Xval = preprocessor.vectorize_transform_text(Xval)

    # define classifier and train
    model = MultinomialNB(alpha=0.4)
    model.fit(Xtrain, ytrain)

    # make predictions
    ytrain_pred = model.predict(Xtrain)
    yval_pred = model.predict(Xval)

    # class indices sorted by predicted probability, ascending; this assumes
    # labels are integers matching predict_proba's column order
    prediction_probs = model.predict_proba(Xval).argsort(axis=1)
    best_preds = prediction_probs[:, -1]
    second_best_preds = prediction_probs[:, -2]
    label_in_top_two_preds = np.full((len(yval), ), True)
    for i in range(len(yval)):
        if yval[i] != best_preds[i] and yval[i] != second_best_preds[i]:
            label_in_top_two_preds[i] = False
    val_acc_top_two = label_in_top_two_preds.sum() / len(yval)

    train_acc = accuracy_score(ytrain, ytrain_pred)
    val_acc = accuracy_score(yval, yval_pred)
    print("Validation accuracy (label in top 2 predictions): {}".format(
        val_acc_top_two))
    return model, model_name, preprocessor, ytrain_pred, yval_pred, "accuracy", train_acc, val_acc
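# An equivalent vectorized top-2 check (a sketch; like the loop above, it
# assumes integer labels that match the column order of predict_proba; names
# refer to the locals in ngram_model):
top2 = model.predict_proba(Xval).argsort(axis=1)[:, -2:]  # (n, 2) best class indices
val_acc_top_two = (top2 == np.asarray(yval)[:, None]).any(axis=1).mean()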
def get_next(self, set):
    """ Get next preprocessed batch """
    # NOTE: `set` shadows the builtin of the same name; kept for API compatibility
    raw_data = tf.placeholder(tf.float32, shape=[None, 3, 32, 32])
    preprocessor = Preprocessor(raw_data,
                                centered=self.centered,
                                rescaled=self.rescaled,
                                grayscale=self.grayscale,
                                shaped=self.shaped)
    if set == "train":
        batch = self.train_batch
        size = self.batch_size
        raw_x, y_batch = batch.get_next()
    elif set == "train_acc":
        batch = self.train_batch
        size = self.accuracy_size
        raw_x, y_batch = batch.get_first_size(self.accuracy_size)
    elif set == "test_acc":
        batch = self.test_batch
        size = self.validation_size
        raw_x, y_batch = batch.get_first_size(self.validation_size)
    with tf.Session() as sess:
        raw_x = raw_x.reshape(-1, 3, 32, 32)
        x_batch = sess.run(preprocessor.apply(raw_x, size),
                           feed_dict={raw_data: raw_x})
    return x_batch, y_batch
def __init__(self, n_action_space, n_training_frames=50 * 1000000,
             replay_memory_size=1000000, k=4, m=4):
    # Hyperparameters - dynamic
    self.n_action_space = n_action_space
    self.n_training_frames = n_training_frames
    self.replay_memory_size = replay_memory_size
    self.replay_memory = deque(maxlen=self.replay_memory_size)
    self.m = m
    self.k = k

    # Hyperparameters - static
    self.epsilon = 1.0
    self.minibatch_size = 32
    self.C = 10000
    self.gamma = 0.99
    # the original had a bare `self.update_frequency` statement (a no-op);
    # 4 is assumed here, matching the DQN paper's default
    self.update_frequency = 4
    self.epsilon_initial = 1.0
    self.epsilon_final = 0.1
    self.exploration_frame = 1000000  # 1 million
    self.epsilon_decay = (self.epsilon_initial - self.epsilon_final) / self.exploration_frame
    self.replay_start_frame = 50000

    # Parameters - etc
    self.action = None
    self.timestep = 0

    # Modules
    self.preprocessor = Preprocessor(m=self.m)
    self.Q = DQN()
    self.Q_hat = copy.deepcopy(self.Q)

    # Operations
    self.mode('train')
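# A sketch (assumed helper, not part of the original class) of the linear
# epsilon schedule the fields above imply: epsilon anneals from
# epsilon_initial to epsilon_final over the first exploration_frame frames,
# then stays at epsilon_final.
def epsilon_at(self, frame):
    return max(self.epsilon_final,
               self.epsilon_initial - self.epsilon_decay * frame)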
def main(args):
    preprocessor = Preprocessor(args.model_type, args.max_len)
    train_dataloader, val_dataloader, test_dataloader = get_dataloader(
        args, preprocessor)
    bert_finetuner = BertModel(args, train_dataloader, val_dataloader,
                               test_dataloader)

    logger = TensorBoardLogger(save_dir=args.log_dir, version=1, name="nsmc-bert")
    early_stop_callback = EarlyStopping(monitor='val_acc',
                                        min_delta=0.00,
                                        patience=5,
                                        verbose=False,
                                        mode='max')
    checkpoint_callback = ModelCheckpoint(filepath=args.checkpoint_path,
                                          verbose=True,
                                          monitor='val_acc',
                                          mode='max',
                                          save_top_k=3,
                                          prefix='')
    trainer = pl.Trainer(
        gpus=1,
        # distributed_backend='ddp'
        checkpoint_callback=checkpoint_callback,
        early_stop_callback=early_stop_callback,
        logger=logger)
    trainer.fit(bert_finetuner)
    trainer.test()
def classify(model):
    """
    Fits a model using the entire training set and stores its predictions
    on the held out test set in a csv file.
    """
    # Read datasets
    df = pd.read_csv("data/preprocessed_reddit_train_SnowballStemmer.csv")

    # Using preprocessor to transform data into tf-idf representation
    preprocessor = Preprocessor("stemmer")

    # Transform training data to tf_idf representation
    x_train = preprocessor.tf_idf_vectorizer.fit_transform(df["cleaned"])
    y_train = df["label"]

    # Preprocess test data and transform to tf_idf representation
    x_test_df = pd.read_csv(
        "data/preprocessed_reddit_test_SnowballStemmer.csv")
    x_test = preprocessor.tf_idf_vectorizer.transform(
        x_test_df["cleaned"].values.astype('U'))

    # Train model using whole training set
    model.fit(x_train, y_train)

    # Predict on test set
    predictions = model.predict(x_test)

    # Turn predictions back to original labels
    preprocessor.label_encoder.fit(df["subreddits"])
    predictions = preprocessor.label_encoder.inverse_transform(predictions)

    # save predictions
    pred_df = pd.DataFrame({"Id": x_test_df.id, "Category": predictions})
    pred_df.to_csv("predictions/predictions{}.csv".format(
        datetime.datetime.now()), index=False)
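# Example invocation (assumed, not from the original source): classify() only
# relies on the fit/predict interface, so any sklearn-style estimator works.
from sklearn.svm import LinearSVC

classify(LinearSVC())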
def preprocess(self, data_dir, re_seg=True, to_file=False, mid_data_paths=None,
               split_train_test=True, test_ratio=0.2, vec_method="count",
               feature_select=True, is_percent=True, feature_keep_percent=90,
               feature_keep_num=10, min_df=3):
    """Generate train/validation feature vectors from the given data directory."""
    preprocessor = Preprocessor(
        feature_gen_func=self.feature_label_gen,
        vec_method=vec_method,
        feature_keep_percent=feature_keep_percent,
        feature_keep_num=feature_keep_num,
        is_percent=is_percent,
        test_ratio=test_ratio,
        min_df=min_df)
    _, train_data, train_label, val_data, val_label = preprocessor.gen_data_vec(
        data_dir,
        self.feature_id_path,
        split_train_test=split_train_test,
        feature_select=feature_select,
        to_file=to_file,
        re_seg=re_seg,
        process_file_path=mid_data_paths)
def train_and_validate_viterbi2(_inputFile, _outputFile, _devFile,
                                _devOutputFile, _validateFile):
    """
    Create the Preprocessor object
    Train using the SG, EN, CN, FR datasets
    Generate the representer, vocabulary and states and feed them into an
    Emission object
    """
    preprocessor = Preprocessor(_inputFile)
    representer = preprocessor.get_representer()
    vocabulary = preprocessor.get_vocabulary()
    states = preprocessor.get_states()
    listOfWords = getAllTokens(_devFile)

    # Create the Emission and Transition objects.
    # Validate using the dev datasets.
    # Label the input sequence and output the file as dev.p3.out.
    emission = Emission(representer, vocabulary, states, listOfWords)
    transition = Transition2()
    transition.compute_params(preprocessor)
    label_viterbi(_devFile, _devOutputFile, emission, transition)

    # Calculate validation error
    evaluate(_validateFile, _devOutputFile)
def __init__(self):
    self.preprocessor = Preprocessor()
    self.feature_extractor = FeatureExtractor()
    self.crf_analyzer = CRFAnalyzer()
    self.sentiment_analyzer = SentimentAnalyzer()
    print("\nAll modules instantiated and ready to go...\n")
def generate_summary(text):
    preprocessor = Preprocessor()
    postprocessor = Postprocessor()
    SummaRise = SummaRiser('./path/to/data/vocab', './')
    sentences = sent_tokenize(text)
    totalSentences = len(sentences)
    tokens = 0  # count of tokens currently buffered
    tokenized = []
    summarys = ''
    for id, sentence in enumerate(sentences):
        tokenized += preprocessor.tokenize(sentence)
        # track the current buffer size; the original `tokens += len(tokenized)`
        # double-counted previously buffered tokens
        tokens = len(tokenized)
        if tokens >= MAX_TOKENS or (id == (totalSentences - 1)
                                    and tokens >= MIN_TOKENS):
            joined = ' '.join(tokenized)
            preprocessed_text = preprocessor.preprocess_text(
                joined.split('*N*'))
            print(preprocessed_text)
            summary = SummaRise.summarize([preprocessed_text])
            summarys += postprocessor.postprocess_text(summary[0])
            summarys += ' '
            tokens = 0
            tokenized = []
    return summarys
def generate_plots(path):
    """
    Generates plots for the first video found in a directory
    :param path: the directory to search for videos
    """
    videos = glob(path + '/*.mkv')
    print(path, len(videos), videos)
    if len(videos) == 0:
        return
    video = videos[0]  # only the first video in the directory is processed

    metadata_list = glob(path + '/metadata.txt')
    if len(metadata_list) == 0:
        return

    P = Preprocessor()
    P.import_video(str(video))
    P.read_metadata(path)
    P.preprocess()

    Im = P.frames_processed
    if len(Im) == 0:
        print('No processed frames; skipping', path)
        return

    z_start = P.z_start
    z_end = P.z_end
    mean, cov = analyze_image(Im)

    window_size = 10
    mean_smoothed = smoothing.mean_moving_average(mean, window_size)
    cov_smoothed = smoothing.cov_moving_average(cov, window_size)

    c = CubicFitRotated()
    c.fit(mean=mean_smoothed, cov=cov_smoothed, z_start=z_start, z_end=z_end)

    # create the output directory if needed; in the original, `path` was only
    # updated when mkdir succeeded, so reruns wrote next to the videos instead
    try:
        os.mkdir(path + '/analysis')
    except OSError:
        pass
    path += '/analysis'

    plots.plot_mean(mean, z_start, z_end).savefig(path + '/beam_center.png')
    plots.plot_beta(cov, z_start, z_end).savefig(path + '/sigma_squared.png')
    export.export_mean(mean=mean, filename=path + '/center.csv',
                       z_start=z_start, z_end=z_end)
    export.export_cov(cov=cov, filename=path + '/cov.csv',
                      z_start=z_start, z_end=z_end)
    plt.close('all')
def get_data_generator(args, model_args, schema, test=False):
    from cocoa.core.scenario_db import ScenarioDB
    from cocoa.core.dataset import read_dataset
    from cocoa.core.util import read_json
    from core.scenario import Scenario
    from core.lexicon import Lexicon
    from preprocess import DataGenerator, Preprocessor
    import os.path

    # TODO: move this to dataset
    dataset = read_dataset(args, Scenario)
    mappings_path = model_args.mappings

    lexicon = Lexicon(schema.values['item'])
    preprocessor = Preprocessor(schema, lexicon,
                                model_args.entity_encoding_form,
                                model_args.entity_decoding_form,
                                model_args.entity_target_form,
                                model=model_args.model)

    if test:
        model_args.dropout = 0
        train, dev, test = None, None, dataset.test_examples
    else:
        train, dev, test = dataset.train_examples, dataset.test_examples, None

    data_generator = DataGenerator(train, dev, test, preprocessor, args, schema,
                                   mappings_path,
                                   cache=args.cache,
                                   ignore_cache=args.ignore_cache,
                                   num_context=model_args.num_context,
                                   batch_size=args.batch_size,
                                   model=model_args.model)
    return data_generator
def preprocess(self, data_dir, re_seg=True, to_file=False, mid_data_paths=None,
               split_train_test=True, test_ratio=0.2, vec_method="count",
               feature_select=True, is_percent=True, feature_keep_percent=90,
               feature_keep_num=10, min_df=3):
    """Build data features from the given directory.
    [out] train_data_vec: matrix, feature vectors for the dataset
    """
    preprocessor = Preprocessor(
        feature_gen_func=self.feature_label_gen,
        vec_method=vec_method,
        feature_keep_percent=feature_keep_percent,
        feature_keep_num=feature_keep_num,
        is_percent=is_percent,
        test_ratio=test_ratio,
        min_df=min_df)
    # generate feature vectors from the data
    _, self.train_data_vec, _, _, _ = preprocessor.gen_data_vec(
        data_dir,
        self.feature_id_path,
        split_train_test=split_train_test,
        feature_select=feature_select,
        to_file=to_file,
        re_seg=re_seg,
        process_file_path=mid_data_paths)
def create_model_instance():
    from models import ADEM
    from preprocess import Preprocessor

    logger.info('loading model from %s', ADEM_MODEL)
    model = ADEM(Preprocessor(), None, ADEM_MODEL)
    logger.info('model loaded. config: %r', model.config)
    return model
def test_get_feature_names(self):
    feature_names = ["f1", "f2", "f3"]
    data_set = np.array([feature_names, ["1", "2", "3"], ["", "4", "5"]])
    preprocessor = Preprocessor(data_set)
    names = preprocessor.get_feature_names()
    self.assertEqual(feature_names[0], names[0])
    self.assertEqual(feature_names[1], names[1])
    self.assertEqual(feature_names[2], names[2])
def initComponents(self, crop=None):
    undistort = False
    self.pre = Preprocessor(self.rawSize, self.threshold, undistort, crop=crop)
    self.featureEx = FeatureExtraction(self.pre.cropSize)
    self.gui = GUI(self.world, self.pre.cropSize, self.threshold, self)
    self.world.setResolution(self.pre.cropSize)
def inference():
    preprocessor = Preprocessor(first_time=False)
    preprocessor.preprocess()
    dataset = Dataset(preprocessor)
    mf = MF(preprocessor, dataset)
    mf.load()
    i2i = Item2Item(dataset)
    candidate_generator = CandidateGenerator(preprocessor, dataset, mf, i2i)
    ranker = Ranker()
    ranker.load()

    X_submit, X_article_nums, q_submit, q_reader = candidate_generator.generate_submit()
    try:
        with open('submit_puke.pkl', 'wb') as f:
            pickle.dump((X_submit, X_article_nums, q_submit, q_reader), f)
    except Exception:  # don't swallow KeyboardInterrupt/SystemExit
        print("Couldn't save submit_puke")
    # X_submit, X_article_nums, q_submit, q_reader = pickle.load(open('submit_puke.pkl', 'rb'))

    rank_scores = ranker.rank(X_submit)

    base = 0
    entire_articles = []
    not_heavy_items = set(range(1, article_count + 1)) - set(preprocessor.heavy_items)
    not_heavy_items = sorted(not_heavy_items)
    cut = 50
    random.seed(0)
    with result_path.open('w') as fout:
        for group_size, reader in tqdm(zip(q_submit, q_reader), total=len(q_submit)):
            articles = X_article_nums[base:base + group_size]
            scores = rank_scores[base:base + group_size]
            # sort this reader's candidates by rank score, best first
            articles = [a for _, a in sorted(zip(scores, articles),
                                             key=lambda x: x[0], reverse=True)]
            articles = articles[:cut]

            from_followable = candidate_generator.get_readers_followable_articles(reader)
            # from_keywords = candidate_generator.get_readers_keyword_articles(reader)
            for item in from_followable:
                if len(articles) >= cut + 15:
                    break
                if item in articles:
                    continue
                articles.append(item)

            # pad with random non-heavy items until 100 recommendations
            while len(articles) < 100:
                item = random.choice(not_heavy_items)
                if item not in articles:
                    articles.append(item)

            entire_articles.extend(articles)
            reader_str = preprocessor.num2reader[reader]
            article_strs = map(preprocessor.num2article.get, articles)
            fout.write('%s %s\n' % (reader_str, ' '.join(article_strs)))
            base += group_size
    print('Entropy of candidates = ', entropy(entire_articles))
def main(matcher_path, test_path):
    m_trackers_paths = glob.glob(matcher_path + '/*')
    t_trackers_paths = glob.glob(test_path + '/*')
    tracker_manager = TrackerManager('test')
    matcher = FaissMatcher()
    preprocessor = Preprocessor()
    align_preprocessor = Preprocessor(algs=align_and_crop)
    face_rec_graph_face = FaceGraph()
    face_extractor = FacenetExtractor(face_rec_graph_face,
                                      model_path=Config.FACENET_DIR)
    detector = MTCNNDetector(face_rec_graph_face)

    # create matcher
    print('Creating matcher ...')
    for m_dir in m_trackers_paths:
        print('Processing ' + m_dir)
        face_id = m_dir.split('/')[-1]
        embs, labels = extract_embs(m_dir, preprocessor, face_extractor, None)
        face_id_labels = [face_id for i in range(len(labels))]
        matcher.update(embs, face_id_labels)

    # create trackers
    print('Creating trackers')
    for t_dir in t_trackers_paths:
        print('Processing ' + t_dir)
        embs, _ = extract_embs(t_dir, preprocessor, face_extractor, None)
        track_id = int(t_dir.split('/')[-1])

        first_emb = embs.pop()
        face_info = FaceInfo(None, first_emb, None, None, None, None)
        tracker_manager.current_trackers[track_id] = Tracker(
            track_id, face_info, None)
        for emb in embs:
            face_info = FaceInfo(None, emb, None, None, None, None)
            tracker_manager.current_trackers[track_id].update(face_info, None)

    # test matching
    print('Test matching ...')
    for fid in tracker_manager.current_trackers:
        print('Processing: ' + str(fid))
        tops = tracker_manager.recognize_current_tracker(fid, matcher, None)
        print('Track_id {}, recognize: {}'.format(fid, tops))
def preprocess(args, X, T):
    pre = Preprocessor()
    X_normal = X
    if args.pre == 'pca':
        logging.info('Preprocess with PCA(d = %d)' % args.deg)
        X_phi = pre.pca(X_normal, args.deg)
    elif args.pre == 'lda':
        logging.info('Preprocess with LDA(d = %d)' % args.deg)
        X_phi = pre.lda(X_normal, T, args.deg)
    else:
        # pass the data through unchanged; the original left X_phi undefined
        # here, which would raise a NameError for any other args.pre value
        X_phi = X_normal
    return X_phi, pre
def __init__(cls):
    cls.face_rec_graph_face = FaceGraph()
    cls.coeff_graph = FaceGraph()
    cls.face_extractor = FacenetExtractor(
        cls.face_rec_graph_face, model_path=Config.Model.FACENET_DIR)
    cls.coeff_extractor = FacenetExtractor(
        cls.coeff_graph, model_path=Config.Model.COEFF_DIR)
    cls.detector = MTCNNDetector(
        cls.face_rec_graph_face, scale_factor=Config.MTCNN.SCALE_FACTOR)
    cls.preprocessor = Preprocessor()
def test_encode_and_no_categorical(self):
    dask_data = dd.read_csv('data_encode.csv')
    x = Preprocessor(['feat1', 'feat2', 'feat3'], 'target', dask_data,
                     ['o', 'p', 'n'])
    x.execute(duplicates_invalid=True, missing=True, scale=True,
              transform=True, encode_target=True, train=True)
    expected_output_dict = {
        'target': {0: 2, 1: 2, 2: 0, 3: 0, 6: 1, 7: 2, 8: 0, 9: 2},
        'feat1': {0: -0.928, 1: -0.093, 2: -0.928, 3: -0.928,
                  6: 0.743, 7: -0.093, 8: -0.093, 9: -0.093},
        'feat2': {0: -0.844, 1: 0.998, 2: -0.844, 3: -0.844,
                  6: -0.23, 7: -0.844, 8: 0.384, 9: 0.0},
        'feat3': {0: -0.548, 1: 0.0, 2: 0.0, 3: -0.548,
                  6: 2.739, 7: -0.548, 8: 0.0, 9: -0.548},
    }
    self.assertEqual(expected_output_dict, x.df.round(3).head(8).to_dict())
def main(argv):
    if len(argv) < 2:
        print("Invalid arguments. Format is 'python nlp.py [file name]'")
    else:
        pp = Preprocessor()
        print("Starting NLP...")
        texts = pp.prepDoc(argv[1])
        c = Corpus(texts)
        c.createCorpus()
        scores = c.calc_tfidf()
        export(scores)
def create_dataset(filenames, batch_size, num_heatmap, is_train):
    preprocess = Preprocessor(
        IMAGE_SHAPE, (HEATMAP_SIZE[0], HEATMAP_SIZE[1], num_heatmap), is_train)
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(preprocess,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if is_train:
        dataset = dataset.shuffle(batch_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset
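# Usage sketch (assumed, not from the original source): build train and
# validation pipelines from TFRecord shards. The glob patterns, batch size,
# and num_heatmap value are placeholders.
train_files = tf.io.gfile.glob('data/train*.tfrecord')
val_files = tf.io.gfile.glob('data/val*.tfrecord')
train_ds = create_dataset(train_files, batch_size=32, num_heatmap=16, is_train=True)
val_ds = create_dataset(val_files, batch_size=32, num_heatmap=16, is_train=False)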