def load_netflix():
    data_home = get_data_home()
    path = os.path.join(data_home, "nf_prize", "X_tr.pkl")
    X_tr = joblib.load(path)
    path = os.path.join(data_home, "nf_prize", "X_te.pkl")
    X_te = joblib.load(path)
    return X_tr, X_te
def compute_nearest_docs(self, query, topn=10):
    t1 = time.time()
    query_affinity = []
    entity = load(self.file_entity, mmap_mode="r")
    relation_normal = load(self.file_relation_normal, mmap_mode="r")
    relation = load(self.file_relation, mmap_mode="r")
    for query_triple in query:
        candidates = [(query_triple, (n.left_entity, n.relation, n.right_entity))
                      for n in self.cluster_representative.values()]
        affinity = [(candidate,
                     self.relation_embedding.kernel_density_pair(
                         candidate, relation_normal=relation_normal,
                         entity=entity, relation=relation))
                    for candidate in candidates]
        affinity = {c: e for c, e in enumerate(affinity)}
        query_affinity.append(affinity)
    print query_affinity
    candidates = [(block_id, query, block_relations)
                  for block_id, block_relations in self.relation_by_doc.iteritems()]
    density_by_doc = self.parallel_pool(
        delayed(func)(self, candidate, query_affinity, entity, relation_normal, relation)
        for candidate in candidates)
    density_by_doc = sorted(density_by_doc, key=lambda e: e[1])[:topn]
    nearest_docs = [(block_id, score, self.doc_text[block_id])
                    for block_id, score in density_by_doc]
    t2 = time.time()
    print t2 - t1, " seconds"
    return nearest_docs
def load_experts(fname, max_files=float('inf'), min_return=None):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if hasattr(fname, '__iter__'):
        paths = []
        for fname_ in fname:
            tf.reset_default_graph()
            with tf.Session(config=config):
                snapshot_dict = joblib.load(fname_)
            paths.extend(snapshot_dict['paths'])
    else:
        with tf.Session(config=config):
            snapshot_dict = joblib.load(fname)
        paths = snapshot_dict['paths']
    tf.reset_default_graph()

    trajs = []
    for path in paths:
        obses = path['observations']
        actions = path['actions']
        returns = path['returns']
        total_return = np.sum(returns)
        if (min_return is None) or (total_return >= min_return):
            traj = {'observations': obses, 'actions': actions}
            trajs.append(traj)

    random.shuffle(trajs)
    print('Loaded %d trajectories' % len(trajs))
    return trajs
def transform(self, X, stride_size=1, save_to_file=None, memmap=False, force_rerun=False):
    """
    Expects X to be in the shape of (n, x, y, chan)
    """
    if not hasattr(self, 'centroids_'):
        raise RuntimeError("Model has not been fitted")

    if save_to_file is not None and os.path.exists(save_to_file) and not force_rerun:
        logger.info("File already exists, loading from {}".format(save_to_file))
        if memmap:
            res = joblib.load(save_to_file, mmap_mode='r+')
        else:
            res = joblib.load(save_to_file)
    else:
        all_rows = range(X.shape[0])
        chunked_rows = list(chunks(all_rows, self.n_jobs))
        logger.info("Transforming in {} jobs, chunk sizes: {}".format(
            self.n_jobs, [len(x) for x in chunked_rows]))
        res = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
            delayed(chunked_extract_features)(i, X, self.rf_size, self.centroids_,
                                              self.mean_, self.p_, True, stride_size,
                                              self.pool_method)
            for i in chunked_rows
        )
        res = np.vstack(res)

        if save_to_file is not None:
            logger.info("Saving results to file {}".format(save_to_file))
            joblib.dump(res, save_to_file)
            if memmap:
                res = joblib.load(save_to_file, mmap_mode='r+')
    return res
def find_best_features(df_train, y_train):
    rfr = RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=16)

    # vals_pearson = df_train.corr('pearson').values
    vals_pearson = joblib.load("vals_pearson.pkl")
    # vals_kendall = df_train.corr('kendall').values
    # vals_spearman = df_train.corr('spearman').values
    vals_spearman = joblib.load("vals_spearman.pkl")

    vals = (vals_pearson + vals_spearman) / 2

    dumped_cols = []
    res_cols = [True] * vals.shape[0]
    for i in range(vals.shape[0]):
        if i not in dumped_cols:
            for j in range(vals.shape[1]):
                if i != j:
                    if abs(vals[i, j]) > 0.90:
                        dumped_cols.append(j)
                        res_cols[j] = False

    # df_train2 = df_train[df_train.columns[res_cols]]
    rfecv = RFECV(rfr, step=10, cv=5, scoring=rmse_scorer, verbose=2)  # Float step gives error on the end
    # rfecv.fit(df_train2, y_train)
    rfecv = joblib.load("rfecv.pkl")

    return (res_cols, rfecv.get_support())
def __init__(self, start_url=settings.TEST_START_URL, domains=settings.ALLOWED_DOMAINS):
    self.name = 'find_data'
    self.start_urls = [start_url]
    self.allowed_domains = domains
    # load in the regressors
    self.page_reg = joblib.load(settings.DATA_DIRECTORY + '../clf/page_pipe.pkl')
    self.url_reg = joblib.load(settings.DATA_DIRECTORY + '../clf/url_pipe.pkl')
def make_counts(self, preprocessor, short_id, column_names, type_n, type_v):
    #count_vector_titles = CountVectorizer(
    #    read_column(train_filename, column_name),
    #    max_features=200)
    file_id = self._check_type_n(type_n)
    valid_file_id = self._check_type_n(type_v)
    name = "%s_%s_%s_%s"
    for column_name in column_names:
        vocabulary_path = path_join(self.cache_dir, name % (column_name, type_n, short_id, "vocabulary"))
        stop_words_path = path_join(self.cache_dir, name % (column_name, type_n, short_id, "stop_words"))
        valid_path = path_join(self.cache_dir, name % (column_name, type_v, short_id, "matrix"))
        cur_preprocessor = clone(preprocessor)
        print "Working on %s" % column_name
        if isfile(vocabulary_path) and isfile(stop_words_path):
            print "vocabulary exists"
            vocabulary = joblib.load(vocabulary_path)
            stop_words = joblib.load(stop_words_path)
            cur_preprocessor.set_params(vocabulary=vocabulary)
            cur_preprocessor.set_params(stop_words=stop_words)
        else:
            print "Fitting train"
            cur_preprocessor.set_params(input=self.read_column(file_id, column_name))
            titles = cur_preprocessor.fit_transform(self.read_column(file_id, column_name))
            joblib.dump(cur_preprocessor.vocabulary_, vocabulary_path)
            joblib.dump(cur_preprocessor.stop_words_, stop_words_path)
            print joblib.dump(titles, path_join(self.cache_dir, name % (column_name, type_n, short_id, "matrix")))
        if not isfile(valid_path):
            print "Fitting valid"
            titles_valid = cur_preprocessor.transform(self.read_column(valid_file_id, column_name))
            print joblib.dump(titles_valid, valid_path)
def my_form_post():
    title = request.form['title']
    description = request.form['description']
    model = joblib.load('./ListUp/ListupNLP_v2.pkl')
    count_vect = joblib.load('./ListUp/vect.pkl')
    tfidf_transformer = joblib.load('./ListUp/tfidf.pkl')

    def review_to_words(raw_review):
        review_text = BeautifulSoup(raw_review).get_text()
        letters_only = re.sub("[^a-zA-Z]", " ", review_text)
        words = letters_only.lower().split()
        more_meaningful_words = []
        # keep only words of three letters or more
        for word in words:
            if len(word) < 3:
                continue
            else:
                more_meaningful_words.append(word)
        return " ".join(more_meaningful_words)

    def stem_words(text):
        lemma = joblib.load('./ListUp/lemma.pkl')
        stemmed_words = [lemma.lemmatize(word) for word in text.split(" ")]
        return " ".join(stemmed_words)

    docs_new = [stem_words(review_to_words(title + " " + description))]
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = model.predict(X_new_tfidf)
    decision_function = model.decision_function(X_new_tfidf)
    confidence = 1 / (1 + np.exp(-np.amax(decision_function)))
    if confidence > 0.8:
        return 'The predicted class is %s with confidence of %d' % (predicted[0], confidence * 100)
    else:
        return 'Lets be honest, cant predict this as confidence is %s' % (round(confidence * 100, 2))
def retest(trainsvd):
    truncated_train_svd = joblib.load("truncated_train_svd_" + str(trainsvd) + ".o")
    truncated_test_svd = joblib.load("truncated_test_svd_" + str(trainsvd) + ".o")
    row_index = 0
    with open("../data/f_hashtag_prediction/test_data_tweets_processed_2K.txt") as ftest:
        test_set = ftest.read().splitlines()
    with open("prediction_result_1K_unique_" + str(trainsvd) + ".txt", "w") as output_prediction:
        with open("../data/f_hashtag_prediction/train_data_all_hashtags.txt") as ftrain:
            with open("../data/f_hashtag_prediction/test_data_all_hashtags.txt") as ftest:
                test_set_hashtags = ftest.read().splitlines()
                train_set_hashtags = ftrain.read().splitlines()
                begin_index = 0
                for row in truncated_test_svd[begin_index:]:
                    if row_index > 1000:
                        break
                    print "TEST TWEET (row: " + str(row_index) + ") : " + test_set[row_index]
                    cosine = cosine_similarity(truncated_test_svd[row_index], truncated_train_svd)
                    m = max(cosine[0])
                    mindex = [i for i, j in enumerate(cosine[0]) if j == m]
                    train_tags = set()
                    test_tags = set()
                    for num_line in mindex:
                        train_tags.update(train_set_hashtags[num_line].split(","))
                    test_tags.update(test_set_hashtags[row_index].split(","))
                    utr = set(list(itertools.chain(train_tags)))
                    ut = set(list(itertools.chain(test_tags)))
                    test_tweet = "TEST TWEET (row: " + str(row_index) + ") : " + str(test_set[row_index])
                    print "TRAIN TAGS: " + str(utr)
                    print "TEST TAGS:" + str(ut)
                    print "*****"
                    output_prediction.write("*****\n" + test_tweet + "\n" +
                                            "TRAIN TAGS: " + str(utr) + "\n" +
                                            "TEST TAGS:" + str(ut) + "\n" + "*****")
                    row_index += 1
def load_serial():
    print 'Deserializing learned model, vectorizers, and lexicons'
    char_vectorizer = joblib.load(saved_model_dir + 'char_vectorizer.pickle')
    word_vectorizer = joblib.load(saved_model_dir + 'word_vectorizer.pickle')
    model = joblib.load(saved_model_dir + 'svm_model.pk1')
    lexicons = joblib.load(saved_model_dir + 'lexicons.pickle')
    return model, char_vectorizer, word_vectorizer, lexicons
def motionEstTSS(curI, nextI, blockSize, stepSize, shiftSize):
    """ Computes motion vectors using 3-step search method

    Input:
        curI: The image for which we want to find motion vectors
        nextI: The reference image
        blockSize: size of the matching block
        stepSize: initial step size of the search
        shiftSize: shift between neighbouring blocks
    Output:
        velX, velY: the motion vectors for each direction
    """
    # check if two images have the same size
    if nextI.shape != curI.shape:
        print "Two images do not have the same size"
        return [], []

    # filepath for temp generated file used by parallel computation
    folder = tempfile.mkdtemp()
    curI_path = os.path.join(folder, 'curI')
    nextI_path = os.path.join(folder, 'nextI')
    velX_path = os.path.join(folder, 'velX')
    velY_path = os.path.join(folder, 'velY')

    # get pre-defined size
    height, width = curI.shape
    block_r = blockSize / 2
    velSize = ((height + 1 - 2 * block_r) / shiftSize,
               (width + 1 - 2 * block_r) / shiftSize)

    # get the number of system cores
    num_cores = multiprocessing.cpu_count()

    """Pre-allocate a writeable shared memory map as a container for the
    results motion vectors of the parallel computation
    """
    velX = np.memmap(velX_path, dtype=np.int32, shape=velSize, mode='w+')
    velY = np.memmap(velY_path, dtype=np.int32, shape=velSize, mode='w+')

    # Dump the input images to disk to free the memory
    dump(curI, curI_path)
    dump(nextI, nextI_path)

    """Release the reference on the original in memory array and replace it
    by a reference to the memmap array so that the garbage collector can
    release the memory before forking. gc.collect() is internally called
    in Parallel just before forking.
    """
    curI = load(curI_path, mmap_mode='r')
    nextI = load(nextI_path, mmap_mode='r')

    # Fork the worker processes to perform motion vector computation concurrently
    Parallel(n_jobs=num_cores)(delayed(estTSS)(curI, nextI, velX, velY, i, j,
                                               block_r, stepSize, shiftSize,
                                               height, width)
                               for i in range(velSize[0])
                               for j in range(velSize[1]))

    # try:
    #     shutil.rmtree(folder)
    # except:
    #     print("Failed to delete: " + folder)

    return velX, velY
def main():
    X = joblib.load('./X_words.jbl')
    y = joblib.load('./y_words.jbl')
    print('loaded data')
    model = models.create_model(X.shape[1], X.shape[2])
    print('model compiled')
    print(model.summary())
    model.fit(X, y, batch_size=128, nb_epoch=1)
    model.save_weights('word_model.h5', overwrite=True)
def worker_init(id_state, id_exp):
    """process initialization function. This function is only used when the
    child processes are spawned (instead of forked). When using the fork model
    of multiprocessing the data is just inherited in process memory."""
    import joblib

    global _mp_state
    state = joblib.load(id_state)
    experiment = joblib.load(id_exp)
    _mp_state = state + (experiment,)
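# For context: a parent process would typically dump the shared state and the
# experiment to disk and hand the file paths to the pool initializer above.
# This is only a minimal sketch of that setup -- the file names, the shape of
# `state`, and the use of the "spawn" context are assumptions, not part of the
# original code.
import multiprocessing
import joblib

def parent_setup(state, experiment, n_workers=4):
    # Persist the objects that worker_init() will reload in each spawned child.
    joblib.dump(state, "mp_state.joblib")
    joblib.dump(experiment, "mp_experiment.joblib")
    ctx = multiprocessing.get_context("spawn")
    return ctx.Pool(processes=n_workers,
                    initializer=worker_init,
                    initargs=("mp_state.joblib", "mp_experiment.joblib"))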
def main():
    os.system("taskset -p 0xff %d" % os.getpid())
    corpus = []
    query_queryvector_map_file = os.path.join(output_location, "query_queryvector_map.pkl")
    repvector_file = os.path.join(output_location, "repvector_nparray.pkl")
    query_queryvector_map = {}

    print "\nStarting ... "
    print "\nNow caching the corpus list with the list of queries... "

    # Load the Query Data
    corpus = joblib.load(inputfilepath)
    print "\nCaching of corpus list complete!"
    corpuscount = len(corpus)

    vocab_dict, repvector = getCorpusDict(vectorsfilepath)

    # Dump the large numpy array to disk
    if not os.path.exists(repvector_file):
        print "Dump the large numpy array to disk"
        joblib.dump(repvector, repvector_file)
        print "Dumping of the Vector file to Disk complete!"

    # Load the repvector into the memory map -- Shared memory to be used by the processes.
    print "Loading the representation vector into the memory map."
    repvector_memmap = joblib.load(repvector_file, mmap_mode='r+')

    print "\nStarting Query Vector Computation ... "

    # Multi-Processing Code using job-lib
    # Initiating Parallel jobs for compute intensive task of generating sentence vectors.
    # max_nbytes=None,
    results = Parallel(n_jobs=numJobs, max_nbytes=None, verbose=10)(
        delayed(generateQueryAndQueryVectorMap)(line_tmp, vocab_dict, repvector_memmap)
        for line_tmp in corpus[:100])
    # results = Parallel(n_jobs=numJobs, max_nbytes=None, verbose=10)(
    #     delayed(generateQueryAndQueryVectorMap)(line_tmp, vocab_dict, repvector)
    #     for line_tmp in corpus)

    # Aggregate the results into the query_queryvector_map dict.
    for indiv_res in results:
        key, value = indiv_res
        query_queryvector_map[key] = value

    print "\nQuery Vector Computation finished!"
    print 'Vector population dumped to disk ... '
    joblib.dump(query_queryvector_map, query_queryvector_map_file)
    print 'Data successfully dumped to disk!'
def test_old_pickle(tmpdir):
    import joblib

    # Check that a pickle that references sklearn.external.joblib can load
    f = tmpdir.join('foo.pkl')
    f.write(b'\x80\x02csklearn.externals.joblib.numpy_pickle\nNumpyArrayWrappe'
            b'r\nq\x00)\x81q\x01}q\x02(U\x05dtypeq\x03cnumpy\ndtype\nq\x04U'
            b'\x02i8q\x05K\x00K\x01\x87q\x06Rq\x07(K\x03U\x01<q\x08NNNJ\xff'
            b'\xff\xff\xffJ\xff\xff\xff\xffK\x00tq\tbU\x05shapeq\nK\x01\x85q'
            b'\x0bU\x05orderq\x0cU\x01Cq\rU\x08subclassq\x0ecnumpy\nndarray\nq'
            b'\x0fU\nallow_mmapq\x10\x88ub\x01\x00\x00\x00\x00\x00\x00\x00.',
            mode='wb')
    joblib.load(str(f))
def retrain(svdcomp):
    smatrix = joblib.load("test_tfidf_matrix.o")
    tfidf_matrix = joblib.load("train_tfidf_matrix.o")
    svd = TruncatedSVD(n_components=svdcomp, random_state=42)
    svd.fit(tfidf_matrix)
    truncated_train_svd = svd.transform(tfidf_matrix)
    truncated_test_svd = svd.transform(smatrix)
    print truncated_train_svd.shape
    print truncated_test_svd.shape
    joblib.dump(truncated_train_svd, "truncated_train_svd_" + str(svdcomp) + ".o")
    joblib.dump(truncated_test_svd, "truncated_test_svd_" + str(svdcomp) + ".o")
def run_binary(name, comments):
    """run a binary model (logistic regression)"""
    models_path = models_dir()
    vector = joblib.load('{}/{}_vectorizer.pkl'.format(models_path, name))
    vecs = vector.transform((c['body'] for c in comments))
    model = joblib.load('{}/{}.pkl'.format(models_path, name))
    probs = model.predict_proba(vecs)
    pred = [{'id': c['_id'], 'prob': prob[1]} for c, prob in zip(comments, probs)]
    return pred
def main(): """ Main function. Read the test and training data, and tokenization. Apply find_tags for test documents that do not have a duplicate title in the training data to compute tags. Finally, write all results to file. @return: None """ data = read_zip(trainingzip, trainingfile, cols=["Id", "Title", "Tags"], index_col=0, count=nrows).drop_duplicates( cols="Title", take_last=True) # TODO: take_last=True test = read_zip(testzip, testfile, cols=["Id", "Title", "Body"], count=nrows) logger.info(asctime() + " Reading tag counts from '{0}'...".format(tagcache)) tags = joblib.load(tagcache, mmap_mode="r") # no normalization done here multitoken_i = multitoken_index(tags) logger.info(asctime() + " Loading punkt_tokenizations index from '{0}'...".format(punkt_tokenizationsindexfile)) punkt_tokenizationindex = joblib.load(punkt_tokenizationsindexfile) punctword_tokenizationindex = joblib.load(punctword_tokenizationsindexfile) logger.info(asctime() + " Merging training data and test data...") predictions = pd.merge(test, data, on="Title", how="left").drop_duplicates("Id") missing = predictions.index[predictions.Tags.isnull()] logger.info("{0} Computing {1} missing tags between {2} and {3}...".format(asctime(), len(missing), predictions.Id[missing[0]], predictions.Id[missing[-1]])) punkt_tokenizations = pd.Series() wordpunct_tokenizations = pd.Series() counter = 0 for i in missing: counter += 1 if counter % 10000 == 0: logger.info(asctime() + " Done: {0} out of {1}.".format(counter, len(missing))) if predictions.Id[i] not in punkt_tokenizations.index: logger.info(asctime() + " Loading tokenizations for {0} from '{1}'".format(predictions.Id[i], punkt_tokenizationindex[ predictions.Id[i]])) punkt_tokenizations = joblib.load(punkt_tokenizationindex[predictions.Id[i]]) logger.info(asctime() + " Loading tokenizations for {0} from '{1}'".format(predictions.Id[i], punctword_tokenizationindex[ predictions.Id[i]])) wordpunct_tokenizations = joblib.load(punctword_tokenizationindex[predictions.Id[i]]) logger.info(asctime() + " Done reading '{0}'.".format(punctword_tokenizationindex[predictions.Id[i]])) tokenization = pd.Series(Counter(punkt_tokenizations[predictions.Id[i]].to_dict()) + Counter( wordpunct_tokenizations[predictions.Id[i]].to_dict())) predictions.Tags[i] = " ".join(find_tags(tokenization, tags, multitoken_i)) outfile = "/home/carsten/facebook/predictions_{0}documents.csv".format(nrows) logger.info(asctime() + " Writing predictions to '{0}'...".format(outfile)) predictions.sort(columns="Id").to_csv(outfile, index=False, cols=["Id", "Tags"], quoting=csv.QUOTE_ALL) logger.info(asctime() + " Done.")
def memmap(self):
    if isinstance(self.points, numpy.memmap):
        return
    dn = tempfile.mkdtemp(prefix='springmesh')
    Mesh._memmap_dirs.append(dn)
    pfn = os.path.join(dn, 'mesh_points.npy')
    sfn = os.path.join(dn, 'mesh_springs.npy')
    # dump
    dpfn = joblib.dump(self.points, pfn)[0]
    dsfn = joblib.dump(self.springs, sfn)[0]
    # load
    # TODO free originals?
    self.points = joblib.load(dpfn, 'r+')
    self.springs = joblib.load(dsfn, 'r+')
def test(netFile, dataSet, model='RNN', trees=None): if trees == None: if dataSet == "train": trees = tr.load_trees(TRAIN_DATA_FILE) elif dataSet == "dev": trees = tr.load_trees(DEV_DATA_FILE) assert netFile is not None, "Must give model to test" print "Testing netFile %s" % netFile #f = open(netFile, 'rb') #opts = pickle.load(f) #_ = pickle.load(f) opts = joblib.load(netFile + "_opts") _ = joblib.load(netFile + "_cost") if (model=='RNTN'): nn = RNTN(opts.wvecDim,opts.outputDim,opts.numWords,opts.minibatch) elif(model=='RNN'): nn = RNN(opts.wvecDim,opts.outputDim,opts.numWords,opts.minibatch) elif(model=='RNN2'): nn = RNN2(opts.wvecDim,opts.middleDim,opts.outputDim,opts.numWords,opts.minibatch) else: raise '%s is not a valid neural network so far only RNTN, RNN, and RNN2' % opts.model nn.initParams() #nn.stack = pickle.load(f) #nn.stack = np.load(f) nn.stack = joblib.load(netFile + "_stack") #f.close() print "Testing %s..." % model cost, correct, guess, total = nn.costAndGrad(trees, test=True) correct_sum = 0 for i in xrange(0, len(correct)): correct_sum += (guess[i] == correct[i]) # confusion matrix conf_arr = np.zeros((opts.outputDim, opts.outputDim)) for i in xrange(len(correct)): curr_correct = correct[i] curr_guess = guess[i] conf_arr[curr_correct][curr_guess] += 1.0 #makeconf(conf_arr) print "Cost %f, Acc %f" % (cost, correct_sum / float(total)) return correct_sum / float(total)
def get_grid_featurized_pdbbind_dataset(subset):
    """Downloads and caches grid featurized PDBBind dataset.

    Args:
        subset (str): subset name of PDBBind dataset.

    Returns (NumpyTupleDataset):
        grid featurized PDBBind dataset.
    """
    x_path, y_path = get_grid_featurized_pdbbind_filepath(subset)
    x = joblib.load(x_path).astype('i')
    y = joblib.load(y_path).astype('f')
    dataset = NumpyTupleDataset(x, y)
    return dataset
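# Hypothetical usage sketch -- 'core' is only a stand-in for whatever subset
# names the upstream get_grid_featurized_pdbbind_filepath() helper accepts.
dataset = get_grid_featurized_pdbbind_dataset('core')
print(len(dataset))  # number of (grid features, label) pairs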
def check_bad(self, delete_bad=True): """Check that the result dumps are not bad -> sometimes length does not match the batch. Optionally delete these so that they can be re-grown. Parameters ---------- delete_bad : bool Delete bad results as they are come across. Returns ------- bad_ids : tuple The bad batch numbers. """ # XXX: work out why this is needed sometimes on network filesystems. result_files = glob( os.path.join(self.location, "results", RSLT_NM.format("*"))) bad_ids = [] for result_file in result_files: # load corresponding batch file to check length. result_num = os.path.split( result_file)[-1].strip("xyz-result-").strip(".jbdmp") batch_file = os.path.join( self.location, "batches", BTCH_NM.format(result_num)) batch = joblib.load(batch_file) try: result = joblib.load(result_file) unloadable = False except Exception as e: unloadable = True err = e if unloadable or (len(result) != len(batch)): msg = "result {} is bad".format(result_file) msg += "." if not delete_bad else " - deleting it." msg += " Error was: {}".format(err) if unloadable else "" print(msg) if delete_bad: os.remove(result_file) bad_ids.append(result_num) return tuple(bad_ids)
def load_net(nnet_file):
    # path.splitext() returns the extension with a leading dot, e.g. '.joblib'
    if path.splitext(nnet_file)[1] == '.joblib':
        nnet = joblib.load(nnet_file)
    else:
        with open(nnet_file, 'rb') as fid:
            nnet = pickle.load(fid)
    return nnet
def create_test_prediction(dataset, model):
    """Create and yield test prediction, then delete.

    Params
    ------
    dataset : `models.Dataset` instance
        The dataset on which prediction will be performed.
    model : `models.Model` instance
        The model to use to create prediction.
    """
    with featureset.from_netcdf(model.featureset.file.uri,
                                engine=cfg['xr_engine']) as fset_data:
        model_data = joblib.load(model.file.uri)
        pred_data = predict.model_predictions(fset_data.load(), model_data)
    pred_path = pjoin(cfg['paths']['predictions_folder'],
                      '{}.nc'.format(str(uuid.uuid4())))
    pred_data.to_netcdf(pred_path, engine=cfg['xr_engine'])
    f, created = m.File.create_or_get(uri=pred_path)
    pred = m.Prediction.create(file=f, dataset=dataset, project=dataset.project,
                               model=model, finished=datetime.datetime.now())
    pred.save()
    try:
        yield pred
    finally:
        pred.delete_instance()
def load(self):
    if self.system_joblib:
        import joblib
    else:
        from sklearn.externals import joblib

    X, y = [], []
    filenames = sorted(glob.glob(expand_path(self.filenames)))
    if len(filenames) == 0:
        raise RuntimeError('no filenames matched by pattern: %s' % self.filenames)

    for fn in filenames:
        obj = joblib.load(fn)
        if isinstance(obj, (list, np.ndarray)):
            X.append(obj)
        else:
            X.append(obj[self.x_name])
            y.append(obj[self.y_name])

    if len(X) == 1:
        X = X[0]
    if len(y) == 1:
        y = y[0]
    elif len(y) == 0:
        y = None

    return X, y
def load_p_w(self, sub_folder):
    if os.path.exists(os.path.join(self.cache_path_, sub_folder, 'p_w.hdf')):
        p_w = utils.hdf_to_numpy(os.path.join(self.cache_path_, sub_folder), 'p_w')
    else:
        p_w = joblib.load(os.path.join(self.cache_path_, sub_folder, 'p_w.joblib'))
    return p_w
def dimReduction(corpus, mode, idx):
    print("Dimension reduction...")
    if sp.sparse.isspmatrix_csr(corpus):
        data_matrix = corpus.toarray()
    data_matrix = []
    if mode == 'train':
        dim_reduc_pipe = marcos.DIMREDUC_PIPE
        dim_reduc_pipe.set_params(pca__n_components=1000)
        # bow_transformer = BOWTransformer()
        data_matrix = dim_reduc_pipe.fit_transform(corpus)
        # save transform model
        jl.dump(dim_reduc_pipe, '{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR, idx))
    elif mode == 'test':
        dim_reduc_pipe = jl.load('{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR, idx))
        data_matrix = dim_reduc_pipe.transform(corpus)
    else:
        print("Unexpected mode in BOWtransform", file=sys.stderr)
        sys.exit()

    # turn dt matrix to list
    print("The shape of dt matrix is {} (after dimension reduction)\n".format(data_matrix.shape))
    return data_matrix.tolist()
def compute_distance_matrix(self):
    triples = (((ti.left_entity, ti.relation, ti.right_entity),
                (tj.left_entity, tj.relation, tj.right_entity))
               for ti, tj in product(*[self.kb_triples, self.kb_triples]))
    entity = load(self.file_entity, mmap_mode="r")
    relation_normal = load(self.file_relation_normal, mmap_mode="r")
    relation = load(self.file_relation, mmap_mode="r")
    distances = self.parallel_pool(delayed(kernel_density_pair)
                                   (self, pair,
                                    relation_normal=relation_normal,
                                    relation=relation,
                                    entity=entity,
                                    mmaped=True)
                                   for pair in triples)
    self.distances = distances
    distances = np.array(distances)
    self.distance_matrix = distances.reshape(len(self.kb_triples), len(self.kb_triples))
def BOWtransform(corpus, mode, idx):
    data_matrix = []
    print('Transform data...')
    if mode == 'train':
        bow_transformer = BOWTransformer()
        data_matrix = bow_transformer.fit_transform(corpus)
        # save transform model
        jl.dump(bow_transformer, '{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR, idx))
    elif mode == 'test':
        bow_transformer = jl.load('{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR, idx))
        data_matrix = bow_transformer.transform(corpus)
    else:
        print("Unexpected mode in BOWtransform", file=sys.stderr)
        sys.exit()

    # turn dt matrix to list
    print("The shape of dt matrix is {}\n".format(data_matrix.shape))
    if sp.sparse.isspmatrix_csr(data_matrix):
        data_matrix = data_matrix.toarray().tolist()
    else:
        # pass through dimension reduction pipe
        data_matrix = data_matrix.tolist()
    return data_matrix
def from_file(cls, objdump_path):
    '''
    Parameters
    ----------
    objdump_path: str
        Path to the object dump file.

    Returns
    -------
    instance
        New instance of an object from the pickle at the specified path.
    '''
    obj_version, object = joblib.load(objdump_path)

    # Check that we've actually loaded a PersistenceMixin (or sub-class)
    if not isinstance(object, cls):
        raise ValueError(('The pickle stored at {} does not contain ' +
                          'a {} object.').format(objdump_path, cls))
    # Check that versions are compatible. (Currently, this just checks
    # that major versions match)
    elif obj_version[0] == VERSION[0]:
        if not hasattr(object, 'sampler'):
            object.sampler = None
        return object
    else:
        raise ValueError(("{} stored in pickle file {} was created with version {} "
                          "of {}, which is incompatible with the current version "
                          "{}").format(cls, objdump_path, '.'.join(obj_version),
                                       cls.__name__, '.'.join(VERSION)))
def run(dataset_path=DEFAULT_DATASET, dataset_name='timit', iterator_type=ABX2OIterator, batch_size=100, nframes=13, features="fbank", init_lr=0.01, max_epochs=500, network_type="dropout_net", trainer_type="adadelta", layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression], layers_sizes=[2400, 2400, 2400, 2400], dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5], prefix_fname='', debug_print=0, debug_time=False, debug_plot=0): """ FIXME TODO """ output_file_name = dataset_name if prefix_fname != "": output_file_name = prefix_fname + "_" + dataset_name output_file_name += "_" + features + str(nframes) output_file_name += "_" + network_type + "_" + trainer_type output_file_name += "_emb_" + str(DIM_EMBEDDING) print "output file name:", output_file_name n_ins = None n_outs = None print "loading dataset from", dataset_path # TODO DO A FUNCTION if dataset_path[-7:] != '.joblib': print >> sys.stderr, "prepare your dataset with align_words.py or lucid.py or buckeye.py" sys.exit(-1) ### LOADING DATA data_same = joblib.load(dataset_path) shuffle(data_same) has_dev_and_test_set = True dev_dataset_path = dataset_path[:-7].replace("train", "") + 'dev.joblib' test_dataset_path = dataset_path[:-7].replace("train", "") + 'test.joblib' dev_split_at = len(data_same) test_split_at = len(data_same) if not os.path.exists(dev_dataset_path) or not os.path.exists(test_dataset_path): has_dev_and_test_set = False dev_split_at = int(0.8 * dev_split_at) test_split_at = int(0.9 * test_split_at) print data_same[0] print data_same[0][3].shape n_ins = data_same[0][3].shape[1] * nframes n_outs = DIM_EMBEDDING normalize = True min_max_scale = False marginf = (nframes-1)/2 # TODO ### TRAIN SET if has_dev_and_test_set: train_set_iterator = iterator_type(data_same, normalize=normalize, min_max_scale=min_max_scale, scale_f1=None, scale_f2=None, nframes=nframes, batch_size=batch_size, marginf=marginf) else: train_set_iterator = iterator_type( data_same[:dev_split_at], normalize=normalize, min_max_scale=min_max_scale, scale_f1=None, scale_f2=None, nframes=nframes, batch_size=batch_size, marginf=marginf) f1 = train_set_iterator._scale_f1 f2 = train_set_iterator._scale_f2 ### DEV SET if has_dev_and_test_set: data_same = joblib.load(dev_dataset_path) valid_set_iterator = iterator_type(data_same, normalize=normalize, min_max_scale=min_max_scale, scale_f1=f1, scale_f2=f2, nframes=nframes, batch_size=batch_size, marginf=marginf) else: valid_set_iterator = iterator_type( data_same[dev_split_at:test_split_at], normalize=normalize, min_max_scale=min_max_scale, scale_f1=f1, scale_f2=f2, nframes=nframes, batch_size=batch_size, marginf=marginf) ### TEST SET if has_dev_and_test_set: data_same = joblib.load(test_dataset_path) test_set_iterator = iterator_type(data_same, normalize=normalize, min_max_scale=min_max_scale, scale_f1=f1, scale_f2=f2, nframes=nframes, batch_size=batch_size, marginf=marginf) else: test_set_iterator = iterator_type( data_same[test_split_at:], normalize=normalize, min_max_scale=min_max_scale, scale_f1=f1, scale_f2=f2, nframes=nframes, batch_size=batch_size, marginf=marginf) assert n_ins != None assert n_outs != None # numpy random generator numpy_rng = numpy.random.RandomState(123) print '... 
building the model' # TODO the proper network type other than just dropout or not nnet = None fast_dropout = False if "dropout" in network_type: nnet = DropoutABNeuralNet(numpy_rng=numpy_rng, # TODO with 2 Outputs n_ins=n_ins, layers_types=layers_types, layers_sizes=layers_sizes, n_outs=n_outs, loss='cos_cos2', rho=0.95, eps=1.E-6, max_norm=4., fast_drop=fast_dropout, debugprint=debug_print) else: nnet = ABNeuralNet2Outputs(numpy_rng=numpy_rng, n_ins=n_ins, layers_types=layers_types, layers_sizes=layers_sizes, n_outs=n_outs, loss='cos_cos2', #loss='dot_prod', rho=0.90, eps=1.E-6, max_norm=0., debugprint=debug_print) print "Created a neural net as:", print str(nnet) # get the training, validation and testing function for the model print '... getting the training functions' print trainer_type train_fn = None if debug_plot or debug_print: if trainer_type == "adadelta": train_fn = nnet.get_adadelta_trainer(debug=True) elif trainer_type == "adagrad": train_fn = nnet.get_adagrad_trainer(debug=True) else: train_fn = nnet.get_SGD_trainer(debug=True) else: if trainer_type == "adadelta": train_fn = nnet.get_adadelta_trainer() elif trainer_type == "adagrad": train_fn = nnet.get_adagrad_trainer() else: train_fn = nnet.get_SGD_trainer() train_scoref_w = nnet.score_classif_same_diff_word_separated(train_set_iterator) valid_scoref_w = nnet.score_classif_same_diff_word_separated(valid_set_iterator) test_scoref_w = nnet.score_classif_same_diff_word_separated(test_set_iterator) train_scoref_s = nnet.score_classif_same_diff_spkr_separated(train_set_iterator) valid_scoref_s = nnet.score_classif_same_diff_spkr_separated(valid_set_iterator) test_scoref_s = nnet.score_classif_same_diff_spkr_separated(test_set_iterator) data_iterator = train_set_iterator print '... training the model' # early-stopping parameters patience = 1000 # look as this many examples regardless TODO patience_increase = 2. # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant best_validation_loss = numpy.inf test_score = 0. start_time = time.clock() done_looping = False epoch = 0 lr = init_lr timer = None if debug_plot: print_mean_weights_biases(nnet.params) #with open(output_file_name + 'epoch_0.pickle', 'wb') as f: # cPickle.dump(nnet, f, protocol=-1) while (epoch < max_epochs) and (not done_looping): epoch = epoch + 1 avg_costs = [] avg_params_gradients_updates = [] if debug_time: timer = time.time() for iteration, (x, y) in enumerate(data_iterator): #print "x[0][0]", x[0][0] #print "x[1][0]", x[1][0] #print "y[0][0]", y[0][0] #print "y[1][0]", y[1][0] avg_cost = 0. 
if "delta" in trainer_type: # TODO remove need for this if avg_cost = train_fn(x[0], x[1], y[0], y[1]) else: avg_cost = train_fn(x[0], x[1], y[0], y[1], lr) if debug_print >= 3: print "cost:", avg_cost[0] if debug_plot >= 2: plot_costs(avg_cost[0]) if not len(avg_params_gradients_updates): avg_params_gradients_updates = map(numpy.asarray, avg_cost[1:]) else: avg_params_gradients_updates = rolling_avg_pgu( iteration, avg_params_gradients_updates, map(numpy.asarray, avg_cost[1:])) if debug_plot >= 3: plot_params_gradients_updates(iteration, avg_cost[1:]) if type(avg_cost) == list: avg_costs.append(avg_cost[0]) else: avg_costs.append(avg_cost) if debug_print >= 2: print_mean_weights_biases(nnet.params) if debug_plot >= 2: plot_params_gradients_updates(epoch, avg_params_gradients_updates) if debug_time: print(' epoch %i took %f seconds' % (epoch, time.time() - timer)) avg_cost = numpy.mean(avg_costs) if numpy.isnan(avg_cost): print("avg costs is NaN so we're stopping here!") break print(' epoch %i, avg costs %f' % \ (epoch, avg_cost)) tmp_train = zip(*train_scoref_w()) print(' epoch %i, training sim same words %f, diff words %f' % \ (epoch, numpy.mean(tmp_train[0]), numpy.mean(tmp_train[1]))) tmp_train = zip(*train_scoref_s()) print(' epoch %i, training sim same spkrs %f, diff spkrs %f' % \ (epoch, numpy.mean(tmp_train[0]), numpy.mean(tmp_train[1]))) # TODO update lr(t) = lr(0) / (1 + lr(0) * lambda * t) lr = numpy.float32(init_lr / (numpy.sqrt(iteration) + 1.)) ### TODO #lr = numpy.float32(init_lr / (iteration + 1.)) ### TODO # or another scheme for learning rate decay #with open(output_file_name + 'epoch_' +str(epoch) + '.pickle', 'wb') as f: # cPickle.dump(nnet, f, protocol=-1) # we check the validation loss on every epoch validation_losses_w = zip(*valid_scoref_w()) validation_losses_s = zip(*valid_scoref_s()) this_validation_loss = 0.25*(1.-numpy.mean(validation_losses_w[0])) +\ 0.25*numpy.mean(validation_losses_w[1]) +\ 0.25*(1.-numpy.mean(validation_losses_s[0])) +\ 0.25*numpy.mean(validation_losses_s[1]) print(' epoch %i, valid sim same words %f, diff words %f' % \ (epoch, numpy.mean(validation_losses_w[0]), numpy.mean(validation_losses_w[1]))) print(' epoch %i, valid sim same spkrs %f, diff spkrs %f' % \ (epoch, numpy.mean(validation_losses_s[0]), numpy.mean(validation_losses_s[1]))) # if we got the best validation score until now if this_validation_loss < best_validation_loss: with open(output_file_name + '.pickle', 'wb') as f: cPickle.dump(nnet, f, protocol=-1) # improve patience if loss improvement is good enough if (this_validation_loss < best_validation_loss * improvement_threshold): patience = max(patience, iteration * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss # test it on the test set test_losses_w = zip(*test_scoref_w()) test_losses_s = zip(*test_scoref_s()) print(' epoch %i, test sim same words %f, diff words %f' % \ (epoch, numpy.mean(test_losses_w[0]), numpy.mean(test_losses_w[1]))) print(' epoch %i, test sim same spkrs %f, diff spkrs %f' % \ (epoch, numpy.mean(test_losses_s[0]), numpy.mean(test_losses_s[1]))) if patience <= iteration: # TODO correct that done_looping = True break end_time = time.clock() print(('Optimization complete with best validation score of %f, ' 'with test performance %f') % (best_validation_loss, test_score)) print >> sys.stderr, ('The fine tuning code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) with open(output_file_name + 
'_final.pickle', 'wb') as f: cPickle.dump(nnet, f, protocol=-1)
st.sidebar.title("Number of Samples")
number_samples = st.sidebar.slider('#Visualization Samples', 10, 1000)

# load related data
X_test = np.load('data/X_test.npy')
y_test = np.load('data/y_test.npy')
df = pd.read_csv('data/kc_house_data.csv', parse_dates=['date'])
df_new = df[['lat', 'long']]
df_new.columns = ['lat', 'lon']
st.map(df_new)
st.write('\n')

# plot LR model result
if model_name == "Linear Regression":
    model_lr = load('model/regr.joblib')
    y_pred_lr = model_lr.predict(X_test)
    st.write('R2 Error of ', model_name, 'is ', round(r2_score(y_test, y_pred_lr), 3))
    chart_plot(model_name, y_pred_lr, number_samples)
# plot RF model result
elif model_name == "Random Forest":
    model_rf = load('model/rf.joblib')
    y_pred_rf = model_rf.predict(X_test)
    st.write('R2 Error of ', model_name, 'is ', round(r2_score(y_test, y_pred_rf), 3))
    chart_plot(model_name, y_pred_rf, number_samples)
# compare models
else:
    model_lr = load('model/regr.joblib')
    y_pred_lr = model_lr.predict(X_test)
def main(): st.title('Trying out Sentiment Analysis with Streamlit!') st.subheader("EDA, Data Cleaning, & Modeling with Kaggle's \ Twitter US Ariline Sentiment Dataset.") main_image = Image.open('./Images/nlp-pipe.jpg') st.image(main_image, use_column_width=True) html_temp = """ <div style="background-color:tomato;"><p style="color:white; font-size:18px; text-align:center">Choose what to do:</p></div> """ st.markdown(html_temp, unsafe_allow_html=True) if st.checkbox('Exploratory Data Analysis'): explorer = EDA() n_rows = st.sidebar.slider('Displaying dataset, select number of rows', 10, 20) all_cols = explorer.df.columns.tolist() select_cols = st.sidebar.multiselect('Select column(s) to display:', all_cols, ['airline_sentiment', 'text']) 'Number of rows:', n_rows, # explorer.df[select_cols].head(n_rows), # if st.sidebar.checkbox('Most Frequent Words Per Category'): '---------------------------------------------', # st.info("Try with removing stopwords and/or tags('@'/'#')") st.write( 'Most Frequent Words for Positive(Blue), Negative(Red), and Neutral(Green) Tweets:' ) c = st.sidebar.slider( 'Select a number for the top frequent words to display', 10, 15, 10) c = int(c) remove_stop = False if st.sidebar.checkbox('Remove stop words'): remove_stop = True remove_at = False if st.sidebar.checkbox('Remove @ and #'): remove_at = True freqs = explorer.most_freq_words(c, remove_at, remove_stop) plt.show() st.pyplot() cat = st.sidebar.selectbox( "To view word counts, select a sentiment category", ('Positive', 'Negative', 'Neutral')) if cat == 'Positive': 'Top words in ', freqs[0][0], ' tweets', # freqs[0][1].head(c), # elif cat == 'Negative': 'Top words in ', freqs[1][0], ' tweets', # freqs[1][1].head(c), # else: 'Top words in ', freqs[2][0], ' tweets', # freqs[2][1].head(c), # if st.sidebar.checkbox('Word Counts'): '---------------------------------------------', # explorer.word_counts() st.pyplot() if st.sidebar.checkbox("View most frequent @'s and #'s"): '---------------------------------------------', # char = st.sidebar.radio('', ('@', '#')) if char == '@': explorer.find_at_hash() else: explorer.find_at_hash(at=False) st.pyplot() if st.sidebar.checkbox("View most frequent emojis and emoticons"): '---------------------------------------------', # c = st.sidebar.slider('Choose the number of top emojis to view', 10, 20) emojis = explorer.find_emojis() emojis.head(c), # st.balloons() if st.sidebar.checkbox('Target Field'): '---------------------------------------------', # explorer.show_target_field() st.pyplot() if st.checkbox("Text Preprocessing And Sentiment Analysis"): text = st.text_area( "Enter your text to analize:", "@americanairline Thanks for the #amazing flying experience!") cleaner = Cleaner(text) operations = st.sidebar.multiselect( "Choose the preprocessing steps to perform", [ 'Lowercasing', 'Remove html tags', 'Remove punctuations', 'Replace links', 'Replace emojis', 'Replace Mentions(@)', 'Replace Hashtags(#)', 'Remove stop words', 'Lemmatization', 'Spell correction' ], ['Remove stop words']) str_to_func = { 'Lowercasing': cleaner.lowercasing, 'Remove html tags': cleaner.remove_html, 'Remove punctuations': cleaner.remove_punc, 'Replace links': cleaner.replace_links, 'Replace Mentions(@)': cleaner.replace_mentions, 'Replace Hashtags(#)': cleaner.replace_hashtags, 'Replace emojis': cleaner.replace_emojis, 'Remove stop words': cleaner.remove_stop, 'Lemmatization': cleaner.lemmatize, 'Spell correction': cleaner.sepll_correct } if not operations: st.info('### No preprocessing steps 
selected') else: for op in operations: op = str_to_func[op] sample_text, findings = op() if findings: st.info(op.__doc__ + ', '.join(findings).strip()) st.write('#### Preprocessed text: ', sample_text) if st.button("Analyze Text Sentiment"): model = load('./Model/lr_clf.joblib') # confusion_matrix = Image.open('./Images/confusion_matrix.jpg') # 'Model Performance on the Test set:', # # st.image(confusion_matrix) class_names = ['negative', 'neutral', 'positive'] explainer = LimeTextExplainer(class_names=class_names) if text: model = load('./lr_clf.joblib') processed_text, sentiment = get_sentiment(text, model) 'Original text ---> ', text, # 'Processed text --> ', processed_text, # 'Text Sentiment --> {}'.format(sent_dict[sentiment]), # exp = explainer.explain_instance(processed_text, model.predict_proba) # exp.show_in_notebook() exp.as_pyplot_figure() st.pyplot()
from sklearn import datasets
from joblib import load
import numpy as np
import json

# load the model
my_model = load('svc_model.pkl')

iris_data = datasets.load_iris()
class_names = iris_data.target_names


def my_prediction(id):
    dummy = np.array(id)
    dummyT = dummy.reshape(1, -1)
    r = dummy.shape
    t = dummyT.shape
    r_str = json.dumps(r)
    t_str = json.dumps(t)
    prediction = my_model.predict(dummyT)
    name = class_names[prediction]
    name = name.tolist()
    name_str = json.dumps(name)
    result = [t_str, r_str, name_str]
    return result
import requests
from joblib import load, dump

url = 'http://localhost:5000/sample_predict'
data = load(r'F:\ML App\app\Model\sample_data.pkl')

# json={'LotArea':10084,'YearBuilt':2004,'1stFlrSF':1694,'2ndFlrSF':0,'FullBath':2,'BedroomAbvGr':3,'TotRmsAbvGrd':7}
r = requests.post(url)
print(r.json())
# print(data)
@app.get("/") def root(): return {"message": "hello world"} # In[39]: @app.get("/item/") async def create_item(item: Item): return item # In[40]: @app.get("/prediction/{stdInput}") async def prediction(stdInput): classT, proba = compare(clf.predict_proba([stdInput])[0]) result = " Classe : %d avec %f %%. " % (classT, proba) return result # ## Prediction # In[41]: clf = load('labelsTrained.joblib')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

# loading the model...
model = joblib.load('house_price_prediction.pkl')

# load the data set...
df = pd.read_csv('clean_house_data.csv')
X = df.drop('price', axis=1)  # train data
y = df['price']  # target data
# print(X.head())

# splitting the data for training and test....
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# feature scaling....
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# function to predict price....
def predict_house_price(bath, balcony, bhk, total_sqft_int, price_per_sqft,
# as csv and h5 files
filename = '_'.join(['all_evaluation', '_'.join(filenames)])
((all_evaluations.sort_values(['accuracy MEAN', 'accuracy SD'],
                              ascending=[False, True]))
 .to_csv(os.path.join(path, filename + '.csv'), sep=';', decimal=','))
((evaluation.sort_values(['accuracy MEAN', 'accuracy SD'],
                         ascending=[False, True]))
 .to_hdf(os.path.join(path, filename + '.h5'), key='evaluation', mode='w'))

print()
print('Done')

#%% create kaggle file
if False:
    # load model
    filename = 'ridge'
    f = os.path.join(path, 'ridge_0_scaling_cross-validation_bestEstimator.joblib')
    model = load(f)

    # encode test data
    test_data_encoded = enc.transform(test_data)

    # predict
    y_test_data_enc = model.predict(test_data_encoded)
    y_test_data = le.inverse_transform(y_test_data_enc)

    # write output file
    kaggleOutput = pd.DataFrame(data={'ID': test_data.index.values, 'Class': y_test_data})
    kaggleFile = f = os.path.join(path, 'kaggle.csv')
    kaggleOutput.to_csv(kaggleFile, index=False)
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.write("# Fake Message Recognition Engine")

message_text = st.text_area("Enter a message for evaluation")


def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)  # Effectively removes HTML markup tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text


model = joblib.load('spam_classifier.joblib')

message_submit = st.button('Evaluate')

if message_submit:
    label = model.predict([message_text])[0]
    spam_prob = model.predict_proba([message_text])

    if label == "spam":
        label = "fake"
    elif label == "ham":
        label = "real"

    result = {'label': label, 'probability': spam_prob[0][1]}

    st.write(result)
def predict_mlp(image, im_origin, model_name, mu, std):
    """
    Function that does the prediction based on an MLP model
    Inputs :
        - image --> original image to predict
        - model_name --> name of the model (in the same folder as the main)
    """
    # Loading the model
    model = load(model_name)

    # Find maximal dimension
    dim = max(image.shape)
    ind = np.argmin(image.shape)

    # Padding to have a square image
    if ind == 0:
        i = 0
        while image.shape[ind] != dim:
            if i % 2 == 0:
                image = pad(image, ((1, 0), (0, 0)), mode='maximum')
            else:
                image = pad(image, ((0, 1), (0, 0)), mode='maximum')
            i += 1
    else:
        i = 0
        while image.shape[ind] != dim:
            if i % 2 == 0:
                image = pad(image, ((0, 0), (1, 0)), mode='maximum')
            else:
                image = pad(image, ((0, 0), (0, 1)), mode='maximum')
            i += 1

    # Resize to have a correct dimension for the input of the MLP
    image = resize(image, (28, 28))
    image = median(image)

    # Threshold for the binarisation
    thresholds = threshold_multiotsu(im_origin, classes=2)
    thresh_background = thresholds[0]

    # Binarisation
    im = image.copy()
    """
    im[np.where(image>=thresh_background)] = 0
    im[np.where(image<thresh_background)] = 255
    """
    im[np.where(image >= thresh_background)] = 1
    im = invert(im)
    m = np.max(im)
    im = im / m * 255
    test = im.copy()  # Copy for the plot
    # plot = im.copy()
    # mu = image.mean()
    # std = image.std()
    im = im.reshape(1, -1)

    # Normalisation
    im_norm = (im - mu) / std

    prediction = model.predict(im_norm)
    prediction_string = str(prediction[0])
    prob = model.predict_proba(im_norm)

    # Plot
    # plt.imshow(plot, cmap='gray')
    # plt.title('Rotated binary box : %d , %f' % (int(prediction), prob[0][int(prediction)]))
    # plt.show()

    return prediction_string, prob, test
# Load credentials from .env
name = os.environ["DB_NAME_AWS"]
password = os.environ["DB_PW_AWS"]
host = os.environ["DB_HOST_AWS"]
user = os.environ["DB_USER_AWS"]

pg_conn = psycopg2.connect(dbname=name, user=user, password=password, host=host)

## Cursor is always open
pg_curs = pg_conn.cursor()

# Load in slimmed random forest pickled model
test_model = load("targetiterrobustforest.joblib")

# Load the craigslist cleaned data
df_cl = pd.read_csv("data/model_and_image_url_lookup.csv")

# List of unique CL cars
cl_models = sorted(df_cl.model.unique())


def status_200_or_nan(url):
    response = requests.get(url)
    if response.status_code == 200:
        return url
    else:
        return np.NaN
# env FLASK_APP=api.py flask run
# in order to start operating the api

from flask import Flask, request
app = Flask(__name__)

import sys
import string
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
import json

count_vect2 = load('count_vect2.joblib')
tf_transformer2 = load('tf_transformer2.joblib')
model = load('svm.joblib')


@app.route('/')
def hello_world():
    return 'Welcome to the API for review classification. To classify text, your API call should look something like: http://127.0.0.1:5000/classify/?text=my text \n'


@app.route('/hello')
def api_hello():
    if 'name' in request.args:
        return 'Hello ' + request.args['name']
    else:
        return 'Hello John Doe'
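# The welcome message above advertises a /classify endpoint that is not shown
# in this excerpt. A minimal sketch of what such a route could look like,
# reusing the vectorizer, TF-IDF transformer and SVM loaded above, is given
# below; the exact route name and response format are assumptions.
@app.route('/classify/')
def classify():
    text = request.args.get('text', '')
    counts = count_vect2.transform([text])      # bag-of-words counts
    tfidf = tf_transformer2.transform(counts)   # TF-IDF weighting
    label = model.predict(tfidf)[0]             # SVM prediction
    return json.dumps({'text': text, 'label': str(label)})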
import joblib
import tensorflow as tf
from tensorflow import keras

pickle_file = './FERDataset/FER.joblib'

with open(pickle_file, 'rb') as f:
    save = joblib.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save
    print('Training set', train_dataset.shape, train_labels.shape)
    print('Validation set', valid_dataset.shape, valid_labels.shape)
    print('Test set', test_dataset.shape, test_labels.shape)

image_size = 48
num_labels = 7
num_channels = 1  # grayscale

model = keras.Sequential([
    keras.layers.Reshape((1, 48, 48), input_shape=((48, 48))),
    keras.layers.Conv2D(filters=64, kernel_size=5, data_format='channels_first',
results = pd.DataFrame(dict(Counter(results[2])).items()).sort_values(1)
results[0] = results[0].apply(np.abs)
results = results.groupby(0).sum()
sum = results.sum().item()
results["diff"] = results[1] / sum
results = results.reset_index()
results["diff"]

# Save model
joblib.dump(vectorizer, open("tfidf.joblib", "wb"))
joblib.dump(lda, open("lda.joblib", "wb"))
joblib.dump(clf, open("lrclf.joblib", "wb"))
pickle.dump(stp.TextPreprocess(), open("textprocess.pkl", "wb"))

# Predict
vectorizer = joblib.load(path + "cvss_flask/tfidf.joblib")
lda = joblib.load(path + "cvss_flask/lda.joblib")
clf = joblib.load(path + "cvss_flask/lrclf.joblib")
tp = stp.TextPreprocess()


def pred_cvss(input_raw):
    """Predict CVSS score."""
    input = tp.transform_df(pd.DataFrame([input_raw]),
                            reformat="stopstemprocessonly",
                            columns=[0]).iloc[0][0]
    input_tfidf = vectorizer.transform([input])
    input_lda = lda.transform(input_tfidf.toarray())
    return clf.predict(input_lda)[0]
'''
batch_size = [128]
lr = [1e-3]
hidden_units = [128]
epochs = [10]

param_grid = dict(batch_size=batch_size, lr=lr, hidden_units=hidden_units, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid.fit(train_x, train_y)
joblib.dump(grid.best_estimator_, '{}/grid.pkl'.format(save_dir))

print()
print("Best parameters set found:")
print()
print(grid.best_params_)
print()

# predict
print("testing model ...")
loaded_model = joblib.load('{}/grid.pkl'.format(save_dir))
predictions = loaded_model.predict(test_x, batch_size=128, verbose=1)
predictions = (predictions >= 0.5).astype('int')
test_y = test_y.reshape((-1, 1))

print("Classification report:")
print(classification_report(test_y, predictions))
def load(self, filename: str):
    self.probs = joblib.load(filename)
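# A matching save method is not shown in this excerpt; a minimal sketch of a
# hypothetical counterpart, assuming the same `probs` attribute, could be:
def save(self, filename: str):
    joblib.dump(self.probs, filename)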
from flask import Flask, request, jsonify
app = Flask(__name__)
import numpy as np
from joblib import load

dt = load('dt1.joblib')


@app.route('/prueba/<uuid>', methods=['POST'])
def add_message(uuid):
    content = request.json
    print(content)
    ejemplo = np.array([
        content['c1'], content['c2'], content['c3'], content['c4'],
        content['c5'], content['c6'], content['c7'], content['c8'],
        content['c9'], content['c10'], content['c11']
    ])
    print(dt.predict(ejemplo.reshape(1, -1)))
    a = dt.predict(ejemplo.reshape(1, -1))
    return jsonify({"resultado": a[0]})


@app.route('/image', methods=['POST'])
def post():
    request_data = request.form['some_text']
    print(request_data)
    imagefile = request.files.get('imagefile', '')
    imagefile.save('test_image.jpg')
    return jsonify({"status": 'OK'})
    lstBrevet = ficBrevet['brevets']
    # if data.has_key('requete'):
    #     DataBrevet['requete'] = data["requete"]
    print("Found " + ndf + " datafile with " + str(len(lstBrevet)) + " patents!")
else:
    print('gather your data again')
    sys.exit()

cles = ['IPCR11', 'CitO', 'dateDate', 'inventor-nice', 'equivalents', 'CitedBy',
        'representative', 'Inventor-Country', 'date', 'inventor', 'kind',
        'priority-active-indicator', 'applicant-nice', 'IPCR1', 'country',
        'IPCR3', 'applicant', 'IPCR4', 'IPCR7', 'title', 'application-ref']

Titles = []
Labels = []
Abstracts = []  # Pure abstracts
IPCRsText = []

# Contents contains IPCRs (text of associated IPCR classes) + title + abstracts
Contents = joblib.load(os.path.normpath(ResultContentsPath + '//Contents-' + ndf + '.pkl'))
Titles = joblib.load(os.path.normpath(ResultContentsPath + '//Titles-' + ndf + '.pkl'))
Labels = joblib.load(os.path.normpath(ResultContentsPath + '//Labels-' + ndf + '.pkl'))
IPCRsText = joblib.load(os.path.normpath(ResultContentsPath + '//IPCRsText-' + ndf + '.pkl'))
Abstracts = joblib.load(os.path.normpath(ResultContentsPath + '//Abstracts-' + ndf + '.pkl'))
CIB = []

print("loading patents contents")

# Tit2FicName = joblib.load(os.path.normpath(ResultContentsPath + '//Titles_ficNames-' + ndf + '.pkl'))
# FreqTrie = joblib.load(os.path.normpath(ResultContentsPath + '//FreqTrie' + ndf + '.pkl'))
word_freq_df = joblib.load(os.path.normpath(ResultContentsPath + '//word_freq' + ndf + '.pkl'))

D0 = len(set(word_freq_df['term']))  # unique forms of the corpus
H0 = np.log(D0)
import glob
import flask
from dash.dependencies import Input, Output
from pvtm.pvtm import PVTM, Documents

# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()

# general
ap.add_argument("-m", "--model", required=True, help="path to the trained PVTM model")

parsed_args = ap.parse_args()
args = vars(parsed_args)

data = joblib.load(args['model'])

image_directory = 'Output/'
if not os.path.exists(os.path.dirname(image_directory)):
    try:
        os.makedirs(os.path.dirname(image_directory))
    except OSError as exc:  # Guard against race condition
        if exc.errno != errno.EEXIST:
            raise

for i in range(data.gmm.n_components):
    data.wordcloud_by_topic(i).to_file('Output/img_{}.png'.format(i))


def generate_table(dataframe, max_rows=20):
    return html.Table(
        # Header
def experiment(variant): with open('expert_demos_listing.yaml', 'r') as f: listings = yaml.load(f.read()) expert_demos_path = listings[variant['expert_name']]['file_paths'][ variant['expert_idx']] buffer_save_dict = joblib.load(expert_demos_path) expert_replay_buffer = buffer_save_dict['train'] env_specs = variant['env_specs'] env = get_env(env_specs) env.seed(env_specs['eval_env_seed']) training_env = get_env(env_specs) training_env.seed(env_specs['training_env_seed']) print('\n\nEnv: {}'.format(env_specs['env_name'])) print('kwargs: {}'.format(env_specs['env_kwargs'])) print('Obs Space: {}'.format(env.observation_space)) print('Act Space: {}\n\n'.format(env.action_space)) if variant['scale_env_with_demo_stats']: env = ScaledEnv( env, obs_mean=buffer_save_dict['obs_mean'], obs_std=buffer_save_dict['obs_std'], acts_mean=buffer_save_dict['acts_mean'], acts_std=buffer_save_dict['acts_std'], ) training_env = ScaledEnv( training_env, obs_mean=buffer_save_dict['obs_mean'], obs_std=buffer_save_dict['obs_std'], acts_mean=buffer_save_dict['acts_mean'], acts_std=buffer_save_dict['acts_std'], ) obs_space = env.observation_space act_space = env.action_space assert not isinstance(obs_space, Dict) assert len(obs_space.shape) == 1 assert len(act_space.shape) == 1 obs_dim = obs_space.shape[0] action_dim = act_space.shape[0] # build the policy models net_size = variant['policy_net_size'] num_hidden = variant['policy_num_hidden_layers'] qf1 = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim + action_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim + action_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=num_hidden * [net_size], input_size=obs_dim, output_size=1, ) policy = ReparamTanhMultivariateGaussianPolicy( hidden_sizes=num_hidden * [net_size], obs_dim=obs_dim, action_dim=action_dim, ) # build the discriminator model disc_model = MLPDisc( obs_dim + action_dim if not variant['adv_irl_params']['state_only'] else 2 * obs_dim, num_layer_blocks=variant['disc_num_blocks'], hid_dim=variant['disc_hid_dim'], hid_act=variant['disc_hid_act'], use_bn=variant['disc_use_bn'], clamp_magnitude=variant['disc_clamp_magnitude']) # set up the algorithm trainer = SoftActorCritic(policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['sac_params']) algorithm = AdvIRL(env=env, training_env=training_env, exploration_policy=policy, discriminator=disc_model, policy_trainer=trainer, expert_replay_buffer=expert_replay_buffer, **variant['adv_irl_params']) if ptu.gpu_enabled(): algorithm.to(ptu.device) algorithm.train() return 1
from joblib import load
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

model = load("models/randomforest.pkl")
col = load('models/column_list.pkl')

importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
# print("Feature ranking:")
feature_names = col
# for f in range(10):
#     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# sorted(zip(map(lambda x: round(x,4), model.steps[1][1].feature_importances_),

important_features = pd.Series(data=importances, index=feature_names)
important_features.sort_values(ascending=False, inplace=True)
important_features.nlargest(12).plot(kind='barh')
plt.title("Top important features")
plt.savefig('imgs/feat_import.png')

# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(12), importances[indices],
#         color="r", yerr=std[indices], align="center")
# plt.xticks(range(12), indices)
import hydro_serving_grpc as hs
import numpy as np
from joblib import load

clf = load('/model/files/random-forest-adult.joblib')

features = [
    'age', 'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'country'
]


def extract_value(proto):
    return np.array(proto.int64_val, dtype='int64')[0]


def predict(**kwargs):
    extracted = np.array([extract_value(kwargs[feature]) for feature in features])
    transformed = np.dstack(extracted).reshape(1, len(features))
    predicted = clf.predict(transformed)

    response = hs.TensorProto(int64_val=[predicted.item()],
                              dtype=hs.DT_INT64,
                              tensor_shape=hs.TensorShapeProto())

    return hs.PredictResponse(outputs={"classes": response})
path = input('Please enter the path of email you want to test: ')
fea = []
f = open(path, 'r').read()
contents = processing(f)
word_indices = create_feature(contents)
fea.append(word_indices)

dataframe = pd.DataFrame(fea)
# store the DataFrame as a .csv; index=False omits the row labels and sep=',' is the default separator
dataframe.to_csv("test_sample.csv", index=False, sep=',')

# read the test sample back in
df = pd.read_csv("test_sample.csv")
data_set = df.values
x_test = data_set[:, 1:len(word_indices)]  # features

predicts = []
bnb = joblib.load("BNB_model.m")
predicts.extend(bnb.predict(x_test))
svm = joblib.load("SVM_model.m")
predicts.extend(svm.predict(x_test))
mlp = joblib.load("MLP_model.m")
predicts.extend(mlp.predict(x_test))
dt = joblib.load("DT_model.m")
predicts.extend(dt.predict(x_test))
knn = joblib.load("KNN_model.m")
predicts.extend(knn.predict(x_test))

# combine the classifiers' outputs by taking the most frequently predicted label
print(predicts)
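# The closing comment above describes combining the classifiers by majority
# vote, but the code only prints the raw prediction list. A minimal sketch of
# that vote over the `predicts` list built above (an assumption about the
# intended aggregation) could be:
from collections import Counter

majority_label = Counter(predicts).most_common(1)[0][0]
print("Ensemble prediction:", majority_label)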
# 32. Base forms of verbs
import joblib

a = joblib.load("100knock_30")
base = []
for sentense in a:
    for i in range(len(sentense)):
        if sentense[i]['pos'] == '動詞':  # part of speech is "verb"
            base.append(sentense[i]['base'])
print(base)
        'output': {
            'probabilities': '[float]',
            'prediction': 'Iris Setosa, Iris Versicolour, Iris Virginica'
        }
    }
})

modelFilePath = 'models/iris-svc.joblib'

from joblib import dump
dump(toBePersisted, modelFilePath)

# Testing deserialized model
from joblib import load
dictionary = load(modelFilePath)
loaded_model = dictionary['model']

# 5.1,3.5,1.4,0.2,Iris-setosa
prediction = loaded_model.predict([[5.1, 3.5, 1.4, 0.2]])
print("prediction with serialized model: " + str(prediction) + " expect [0]")

# 6.0,3.4,4.5,1.6,Iris-versicolor
prediction = loaded_model.predict([[6.0, 3.4, 4.5, 1.6]])
print("prediction with serialized model: " + str(prediction) + " expect [1]")

# 6.3,2.5,5.0,1.9,Iris-virginica
prediction = loaded_model.predict([[6.3, 2.5, 5.0, 1.9]])
print("prediction with serialized model: " + str(prediction) + " expect [2]")
import joblib
import numpy as np
import shap
from numeral import int2roman
from oximachinerunner import OximachineRunner

from .utils import generate_csd_link  # pylint:disable=relative-beyond-top-level
from .utils import (
    load_pickle as read_pickle,  # pylint:disable=relative-beyond-top-level
)

RUNNER = OximachineRunner("mof")
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

EXPLAINER = joblib.load(os.path.join(THIS_DIR, "explainer.joblib"))
KDTREE = joblib.load(os.path.join(THIS_DIR, "kd_tree.joblib"))
NAMES = np.array(read_pickle(os.path.join(THIS_DIR, "names.pkl")))

warnings.simplefilter("ignore")

log = logging.getLogger("shap")  # pylint:disable=invalid-name
log.setLevel(logging.ERROR)

# adjust these features according to model
METAL_CENTER_FEATURES = [
    "column",
    "row",
    "valenceelectrons",
    "diffto18electrons",
    "sunfilled",
import joblib
import os

classifier = joblib.load('medical_appointment.joblib')

# ['idade', 'auxilio_bolsa_familia', 'hipertensao', 'diabetes',
#  'alcolismo', 'deficienca', 'sms_recebido', 'dias_para_consulta',
#  'genero_M']
instance = [
    [62, 0, 1, 0, 0, 0, 0, -1, 0],
    [23, 0, 0, 0, 0, 0, 0, 2, 0],
    [60, 1, 1, 1, 1, 1, 1, 5, 0],
    [50, 0, 0, 0, 0, 0, 1, 80, 0]
]

print(classifier.predict(instance))
from flask import Flask, jsonify, request
import spacy
import joblib
import re

app = Flask(__name__)

nlp = spacy.load('en_core_web_sm')
# nlp = joblib.load('nlp_pipeline.sav')
ot_classifier = joblib.load('ot_classifier.sav')  # Classifier model
transformer = joblib.load('tfidf_transformer.sav')  # TF-IDF model


def predict_tweet(tweet):
    x = re.sub(r'http\S+', '', tweet)  # remove URLs
    # remove special characters and extra spaces
    x = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", x).split())
    tweet = nlp(x)  # add the text to spacy pipeline
    # clean text by removing stopwords, punctuation, digits, lemmatize the tokens and turn them into lowercase.
    tweet = ' '.join([
        token.lemma_.lower() for token in tweet
        if not token.is_stop and not token.is_punct
        and not token.text.isdigit() and len(token.text) > 2
    ])

    # Predictions
    # pass the clean text to the TF-IDF to transform the text and then use the classifier to predict
    result = ot_classifier.predict(transformer.transform([tweet]))
    # convert results into readable classes
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

YOLOSIZE = (512, 512)
FRAMESIZE = (height, width)

tracker = Tracker(xs=xs, ys=ys, sigma_h=0.9, sigma_iou=0.7, metric=args.metric,
                  t_min=10, params_file=PARAMFILE, frameRate=fps)

all_detections = load(f"results/detections/video{args.video}")

i = 0
for frame_detections in all_detections:
    ret, frame = cap.read()
    img_in = cv2.resize(frame, YOLOSIZE)
    img_in = cv2.cvtColor(img_in, cv2.COLOR_BGR2RGB)

    bboxes, scores, classes, num_dets = frame_detections
    # print(bboxes)
    image = draw_bbox(frame, frame_detections)

    if len(frame_detections[0]) > 0:
        tracker.update(bboxes, i)
        image = tracker.write_velocities(image)
import logging
import random

from fastapi import APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
import pandas as pd
from pydantic import BaseModel, Field, validator
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import json
from joblib import load

model = load('knn_final.joblib')

df = pd.read_csv("https://raw.githubusercontent.com/BW-pilot/MachineLearning/master/spotify_final.csv")
spotify = df.drop(columns=['track_id'])

scaler = StandardScaler()
spotify_scaled = scaler.fit_transform(spotify)

log = logging.getLogger(__name__)
router = APIRouter()


def knn_predictor(audio_feats, k=20):
    """
    differences_df = knn_predictor(audio_features)
    """
    audio_feats_scaled = scaler.transform([audio_feats])

    ## Nearest Neighbors model
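# The snippet is cut off right after scaling the query point. A minimal sketch
# of how the nearest-neighbour lookup might continue, assuming the loaded
# `model` is a fitted NearestNeighbors-style estimator and `k` is the number
# of neighbours requested, follows.
def knn_predictor_sketch(audio_feats, k=20):
    audio_feats_scaled = scaler.transform([audio_feats])
    # Ask the fitted model for the k closest tracks in the scaled feature space.
    distances, indices = model.kneighbors(audio_feats_scaled, n_neighbors=k)
    # Return the matching rows from the original dataframe.
    return df.iloc[indices[0]]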