Example #1
def load_netflix():
    data_home = get_data_home()
    path = os.path.join(data_home, "nf_prize", "X_tr.pkl")
    X_tr = joblib.load(path)
    path = os.path.join(data_home, "nf_prize", "X_te.pkl")
    X_te = joblib.load(path)
    return X_tr, X_te
    def compute_nearest_docs(self, query, topn=10):
        t1 = time.time()
        query_affinity = []

        entity = load(self.file_entity, mmap_mode="r")
        relation_normal = load(self.file_relation_normal, mmap_mode="r")
        relation = load(self.file_relation, mmap_mode="r")

        for query_triple in query:
            candidates = [(query_triple, (n.left_entity, n.relation, n.right_entity))
                          for n in self.cluster_representative.values()]

            affinity = [(candidate, self.relation_embedding.kernel_density_pair
                        (candidate, relation_normal=relation_normal,
                         entity=entity, relation=relation)) for candidate in candidates]

            affinity = {c : e for c, e in enumerate(affinity)}
            query_affinity.append(affinity)

        print query_affinity
        candidates = [(block_id, query, block_relations) for block_id, block_relations
                      in self.relation_by_doc.iteritems()]

        density_by_doc = self.parallel_pool(delayed(func)(self, candidate, query_affinity,
                                                          entity, relation_normal, relation)
                                            for candidate in candidates)

        density_by_doc = sorted(density_by_doc, key=lambda e: e[1])[:topn]
        nearest_docs = [(block_id, score, self.doc_text[block_id]) for block_id, score in density_by_doc]
        t2 = time.time()
        print t2 - t1, " seconds"
        return nearest_docs
Example #3
def load_experts(fname, max_files=float('inf'), min_return=None):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    if hasattr(fname, '__iter__'):
        paths = []
        for fname_ in fname:
            tf.reset_default_graph()
            with tf.Session(config=config):
                snapshot_dict = joblib.load(fname_)
            paths.extend(snapshot_dict['paths'])
    else:
        with tf.Session(config=config):
            snapshot_dict = joblib.load(fname)
        paths = snapshot_dict['paths']
    tf.reset_default_graph()

    trajs = []
    for path in paths:
        obses = path['observations']
        actions = path['actions']
        returns = path['returns']
        total_return = np.sum(returns)
        if (min_return is None) or (total_return >= min_return):
            traj = {'observations': obses, 'actions': actions}
            trajs.append(traj)
    random.shuffle(trajs)
    print('Loaded %d trajectories' % len(trajs))
    return trajs
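
A minimal usage sketch for load_experts above; the snapshot file names and the return threshold are made-up assumptions.

# Hypothetical usage: merge two expert snapshots and keep only trajectories
# whose summed return reaches 100 (file names are placeholders).
expert_trajs = load_experts(['itr_100.pkl', 'itr_200.pkl'], min_return=100.0)
print('kept %d expert trajectories' % len(expert_trajs))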
    def transform(self, X, stride_size=1, save_to_file=None, memmap=False, force_rerun=False):
        """
        Expects X to be in the shape of (n, x, y, chan)
        """
        if not hasattr(self, 'centroids_'):
            raise RuntimeError("Model has not been fitted")

        if save_to_file is not None and os.path.exists(save_to_file) and not force_rerun:
            logger.info("File already exists, loading from {}".format(save_to_file))
            if memmap:
                res = joblib.load(save_to_file, mmap_mode='r+')
            else:
                res = joblib.load(save_to_file)
        else:
            all_rows = range(X.shape[0])
            chunked_rows = list(chunks(all_rows, self.n_jobs))
            logger.info("Transforming in {} jobs, chunk sizes: {}".format(self.n_jobs, [len(x) for x in chunked_rows]))
            res = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
                delayed(chunked_extract_features)(i, X, self.rf_size, self.centroids_, self.mean_, self.p_, True, stride_size, self.pool_method) for i in chunked_rows
            )
            res = np.vstack(res)
            if save_to_file is not None:
                logger.info("Saving results to file {}".format(save_to_file))
                joblib.dump(res, save_to_file)
                if memmap:
                    res = joblib.load(save_to_file, mmap_mode='r+')

        return res
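
A hedged usage sketch for the transform method above; the fitted extractor instance, the image array, and the cache path are assumptions.

# Assumed usage: extractor has already been fit (so centroids_ exists) and
# X_img has shape (n, x, y, chan). The first call computes and caches the
# features; re-running with memmap=True reloads them as a read/write memmap.
features = extractor.transform(X_img, stride_size=2,
                               save_to_file='features.pkl', memmap=True)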
def find_best_features(df_train, y_train):
    rfr = RandomForestRegressor(n_estimators=500, max_depth=6, n_jobs=16)

    # vals_pearson = df_train.corr('pearson').values
    vals_pearson = joblib.load("vals_pearson.pkl")
    # vals_kendall = df_train.corr('kendall').values
    # vals_spearman = df_train.corr('spearman').values
    vals_spearman = joblib.load("vals_spearman.pkl")

    vals = (vals_pearson + vals_spearman) / 2

    dumped_cols = []
    res_cols = [True] * vals.shape[0]
    for i in range(vals.shape[0]):
        if i not in dumped_cols:
            for j in range(vals.shape[1]):
                if i != j:
                    if abs(vals[i, j]) > 0.90:
                        dumped_cols.append(j)
                        res_cols[j] = False

    # df_train2 = df_train[df_train.columns[res_cols]]

    rfecv = RFECV(rfr, step=10, cv=5, scoring=rmse_scorer, verbose=2)  # Float step gives error on the end

    # rfecv.fit(df_train2, y_train)
    rfecv = joblib.load("rfecv.pkl")

    return (res_cols, rfecv.get_support())
Example #6
 def __init__(self, start_url=settings.TEST_START_URL, domains=settings.ALLOWED_DOMAINS):
     self.name = 'find_data'
     self.start_urls = [start_url]
     self.allowed_domains = domains
     # load in the regressors
     self.page_reg = joblib.load(settings.DATA_DIRECTORY+'../clf/page_pipe.pkl')
     self.url_reg = joblib.load(settings.DATA_DIRECTORY+'../clf/url_pipe.pkl')
Example #7
 def make_counts(self, preprocessor, short_id, column_names, type_n, type_v):
     #count_vector_titles = CountVectorizer(
         #read_column(train_filename, column_name),
         #max_features=200)
     file_id = self._check_type_n(type_n)
     valid_file_id = self._check_type_n(type_v)
     name = "%s_%s_%s_%s"
     for column_name in column_names:
         vocabulary_path = path_join(self.cache_dir, name % (column_name, type_n, short_id, "vocabulary"))
         stop_words_path = path_join(self.cache_dir, name % (column_name, type_n, short_id, "stop_words"))
         valid_path = path_join(self.cache_dir, name % (column_name, type_v, short_id, "matrix"))
         cur_preprocessor = clone(preprocessor)
         print "Working on %s" % column_name
         if isfile(vocabulary_path) and isfile(stop_words_path):
             print "vocabulary exists"
             vocabulary = joblib.load(vocabulary_path)
             stop_words = joblib.load(stop_words_path)
             cur_preprocessor.set_params(vocabulary=vocabulary)
             cur_preprocessor.set_params(stop_words=stop_words)
         else:
             print "Fitting train"
             cur_preprocessor.set_params(input=self.read_column(file_id, column_name))
             titles = cur_preprocessor.fit_transform(self.read_column(file_id, column_name))
             joblib.dump(cur_preprocessor.vocabulary_, vocabulary_path)
             joblib.dump(cur_preprocessor.stop_words_, stop_words_path)
             print joblib.dump(titles, path_join(self.cache_dir, name % (column_name, type_n, short_id, "matrix")))
         if not isfile(valid_path):
             print "Fitting valid"
             titles_valid = cur_preprocessor.transform(
                 self.read_column(valid_file_id, column_name))
             print joblib.dump(titles_valid, valid_path)
def my_form_post():

    title = request.form['title']
    description = request.form['description']
    model = joblib.load('./ListUp/ListupNLP_v2.pkl')
    count_vect = joblib.load('./ListUp/vect.pkl')
    tfidf_transformer = joblib.load('./ListUp/tfidf.pkl')
    def review_to_words( raw_review ):
        review_text = BeautifulSoup(raw_review).get_text()     
        letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
        words = letters_only.lower().split()                                               
        more_meaningful_words=[]
        for words in words:
            if len(words) < 3:
                continue
            else:
                more_meaningful_words.append(words)         
        return( " ".join( more_meaningful_words ))
    def stem_words(text):
        lemma = joblib.load('./ListUp/lemma.pkl')
        stemmed_words =[lemma.lemmatize(word) for word in text.split(" ")]
        return( " ".join( stemmed_words ))
    docs_new = [stem_words(review_to_words(title + " "+ description))]
    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    predicted = model.predict(X_new_tfidf)
    decison_function = model.decision_function(X_new_tfidf)
    confidence = 1/(1+np.exp(-np.amax(decison_function)))
    if confidence > 0.8:
        return 'The predicted class is %s with confidence of %d' %(predicted[0], confidence*100)
    else:
        return 'Lets be honest, cant predict this as confidence is %s' %(round(confidence*100, 2))
Example #9
def retest(trainsvd):
    truncated_train_svd = joblib.load("truncated_train_svd_" + str(trainsvd)+".o")
    truncated_test_svd = joblib.load("truncated_test_svd_" + str(trainsvd)+".o")
    row_index = 0
    with open("../data/f_hashtag_prediction/test_data_tweets_processed_2K.txt") as ftest:
        test_set = ftest.read().splitlines()
        with open("prediction_result_1K_unique_"+ str(trainsvd)+".txt","w") as output_prediction:
            with open("../data/f_hashtag_prediction/train_data_all_hashtags.txt") as ftrain:
                with open("../data/f_hashtag_prediction/test_data_all_hashtags.txt") as ftest:
                    test_set_hashtags = ftest.read().splitlines()
                    train_set_hashtags = ftrain.read().splitlines()
                    begin_index = 0
                    for row in truncated_test_svd[begin_index:]:
                        if row_index > 1000:
                            break
                        print "TEST TWEET (row: " + str(row_index) + ") : " + test_set[row_index]
                        cosine = cosine_similarity(truncated_test_svd[row_index], truncated_train_svd)
                        m = max(cosine[0])
                        mindex = [i for i, j in enumerate(cosine[0]) if j == m]
                        train_tags = set()
                        test_tags = set()
                        for num_line in mindex:
                            train_tags.update(train_set_hashtags[num_line].split(","))
                        test_tags.update(test_set_hashtags[row_index].split(","))

                        utr = set(list(itertools.chain(train_tags)))
                        ut = set(list(itertools.chain(test_tags)))
                        test_tweet = "TEST TWEET (row: " + str(row_index) + ") : " + str(test_set[row_index])
                        print "TRAIN TAGS: " + str(utr)
                        print "TEST TAGS:" + str(ut)
                        print "*****"
                        output_prediction.write("*****\n"+test_tweet +"\n" + "TRAIN TAGS: " + str(utr) + "\n" + "TEST TAGS:" + str(ut) + "\n" + "*****")

                        row_index += 1
def load_serial():
    print 'Deserializing learned model, vectorizers, and lexicons'
    char_vectorizer = joblib.load(saved_model_dir + 'char_vectorizer.pickle')
    word_vectorizer = joblib.load(saved_model_dir + 'word_vectorizer.pickle')
    model = joblib.load(saved_model_dir + 'svm_model.pk1')
    lexicons = joblib.load(saved_model_dir + 'lexicons.pickle')
    return model, char_vectorizer, word_vectorizer, lexicons
Example #11
def motionEstTSS(curI, nextI, blockSize, stepSize, shiftSize):
	""" Computes motion vectors using 3-step search method
		Input:
			curI: The image for which we want to find motion vectors
			nextI: The reference image
			blockSize:
		 	stepSize:
			shiftSize:
		Output:
		    velX, velY : the motion vectors for each direction
	"""
	# check if two images have the same size
	if nextI.shape != curI.shape:
		print "Two images do not have the same size"
		return [], []
	
	# filepath for temp generated file used by parallel computation
	folder = tempfile.mkdtemp()
	curI_path = os.path.join(folder, 'curI')
	nextI_path = os.path.join(folder, 'nextI')
	velX_path = os.path.join(folder, 'velX')
	velY_path = os.path.join(folder, 'velY')

	# get pre-defined size
	height, width = curI.shape
	
	block_r = blockSize / 2
	velSize = ((height + 1 - 2 * block_r) / shiftSize, (width + 1 - 2 * block_r) / shiftSize)
	
	# get the number of system cores
	num_cores = multiprocessing.cpu_count()

	"""Pre-allocate a writeable shared memory map as a container for the results
	motion vectors of the parallel computation
	"""
	velX = np.memmap(velX_path, dtype=np.int32, shape=velSize, mode='w+')
	velY = np.memmap(velY_path, dtype=np.int32, shape=velSize, mode='w+')

	# Dump the input images to disk to free the memory
	dump(curI, curI_path)
	dump(nextI, nextI_path)

	"""Release the reference on the original in memory array and replace it
	by a reference to the memmap array so that the garbage collector can
	release the memory before forking. gc.collect() is internally called
	in Parallel just before forking.
	"""
	curI = load(curI_path, mmap_mode='r')
	nextI = load(nextI_path, mmap_mode='r')

	# Fork the worker processes to perform motion vector computation concurrently
	Parallel(n_jobs=num_cores)(delayed(estTSS)(curI, nextI, velX, velY, i, j, block_r, stepSize, shiftSize, height, width) for i in range(velSize[0]) for j in range(velSize[1]))

	# try:
	# 	shutil.rmtree(folder)
	# except:
	# 	print("Failed to delete: " + folder)

	return velX, velY
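
A small usage sketch, assuming two equally sized grayscale frames already loaded as NumPy arrays; the block, step, and shift values are only illustrative.

# Illustrative call: 16x16 blocks, an initial search step of 8 pixels and a
# block shift of 8. velX and velY come back as memory-mapped int32 arrays.
velX, velY = motionEstTSS(frame0, frame1, blockSize=16, stepSize=8, shiftSize=8)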
def main():
    X = joblib.load('./X_words.jbl')
    y = joblib.load('./y_words.jbl')

    print('loaded data')
    model = models.create_model(X.shape[1], X.shape[2])
    print('model compiled')
    print(model.summary())
    model.fit(X, y, batch_size=128, nb_epoch=1)
    model.save_weights('word_model.h5', overwrite=True)
def worker_init(id_state, id_exp):
    """process initialization function. This function is only used when the
    child processes are spawned (instead of forked). When using the fork model
    of multiprocessing the data is just inherited in process memory."""
    import joblib

    global _mp_state
    state = joblib.load(id_state)
    experiment = joblib.load(id_exp)
    _mp_state = state + (experiment,)
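
A hedged sketch of how an initializer like worker_init could be wired into a spawned pool; the dump file names, the shared state, and the worker function are assumptions rather than part of the original code.

# Assumed wiring: the parent dumps the shared state once, then every spawned
# worker rebuilds it through worker_init instead of pickling it per task.
import multiprocessing as mp

import joblib

def _evaluate(task):
    # _mp_state was populated by worker_init inside each child process
    return task, len(_mp_state)

if __name__ == '__main__':
    joblib.dump(('weights', 'rng_seed'), 'state.joblib')      # hypothetical state tuple
    joblib.dump({'name': 'exp1'}, 'experiment.joblib')        # hypothetical experiment
    ctx = mp.get_context('spawn')
    with ctx.Pool(processes=2, initializer=worker_init,
                  initargs=('state.joblib', 'experiment.joblib')) as pool:
        print(pool.map(_evaluate, range(4)))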
def main():
    os.system("taskset -p 0xff %d" % os.getpid())
    corpus = []
    query_queryvector_map_file = os.path.join(output_location, "query_queryvector_map.pkl")
    repvector_file = os.path.join(output_location, "repvector_nparray.pkl")

    query_queryvector_map = {}

    print "\nStarting ... "
    print "\nNow caching the corpus list with the list of queries... "

    # Load the Query Data
    corpus = joblib.load(inputfilepath)

    print "\nCaching of corpus list complete!"
    corpuscount = len(corpus)

    vocab_dict, repvector = getCorpusDict(vectorsfilepath)

    # Dump the large numpy array to disk


    if not os.path.exists(repvector_file):
        print "Dump the large numpy array to disk"
        joblib.dump(repvector,repvector_file)
        print "Dumping of the Vector file to Disk complete!"

    # Load the repvector into the memory map -- Shared memory to be used by the processes.
    print "Loading the representation vector into the memory map."
    repvector_memmap = joblib.load(repvector_file, mmap_mode='r+')


    print "\nStarting Query Vector Computation ... "

    # Multi-Processing Code using job-lib
    # Initiating Parallel jobs for compute intensive task of generating sentence vectors.
    # max_nbytes=None,
    results = Parallel(n_jobs=numJobs,  max_nbytes=None, verbose=10)(delayed(generateQueryAndQueryVectorMap)(line_tmp, vocab_dict, repvector_memmap) \
                                       for line_tmp in corpus[:100])

    # results = Parallel(n_jobs=numJobs,  max_nbytes=None, verbose=10)(delayed(generateQueryAndQueryVectorMap)(line_tmp, vocab_dict, repvector) \
    #                                     for line_tmp in corpus)

    # Aggregate the results into the query_queryvector_map dict.
    for indiv_res in results:
        key, value = indiv_res
        query_queryvector_map[key] = value

    print "\nQuery Vector Computation finished!"
    print 'Vector population dumped to disk ... '
    joblib.dump(query_queryvector_map, query_queryvector_map_file)
    print 'Data successfully dumped to disk!'
def test_old_pickle(tmpdir):
    import joblib

    # Check that a pickle that references sklearn.external.joblib can load
    f = tmpdir.join('foo.pkl')
    f.write(b'\x80\x02csklearn.externals.joblib.numpy_pickle\nNumpyArrayWrappe'
            b'r\nq\x00)\x81q\x01}q\x02(U\x05dtypeq\x03cnumpy\ndtype\nq\x04U'
            b'\x02i8q\x05K\x00K\x01\x87q\x06Rq\x07(K\x03U\x01<q\x08NNNJ\xff'
            b'\xff\xff\xffJ\xff\xff\xff\xffK\x00tq\tbU\x05shapeq\nK\x01\x85q'
            b'\x0bU\x05orderq\x0cU\x01Cq\rU\x08subclassq\x0ecnumpy\nndarray\nq'
            b'\x0fU\nallow_mmapq\x10\x88ub\x01\x00\x00\x00\x00\x00\x00\x00.',
            mode='wb')

    joblib.load(str(f))
Example #16
def retrain(svdcomp):
            smatrix = joblib.load("test_tfidf_matrix.o")
            tfidf_matrix = joblib.load("train_tfidf_matrix.o")

            svd = TruncatedSVD(n_components=svdcomp, random_state=42)
            svd.fit(tfidf_matrix)
            truncated_train_svd = svd.transform(tfidf_matrix)
            truncated_test_svd = svd.transform(smatrix)

            print truncated_train_svd.shape
            print truncated_test_svd.shape

            joblib.dump(truncated_train_svd, "truncated_train_svd_" + str(svdcomp)+".o")
            joblib.dump(truncated_test_svd, "truncated_test_svd_" + str(svdcomp)+".o")
Example #17
def run_binary(name, comments):
    """run a binary model (logistic regression)"""
    models_path = models_dir()
    vector = joblib.load('{}/{}_vectorizer.pkl'.format(models_path, name))
    vecs = vector.transform((c['body'] for c in comments))

    model = joblib.load('{}/{}.pkl'.format(models_path, name))
    probs = model.predict_proba(vecs)

    pred = [{
        'id': c['_id'],
        'prob': prob[1]
    } for c, prob in zip(comments, probs)]
    return pred
Example #18
def main():
    """
    Main function. Read the test and training data and the tokenizations. Apply find_tags to test documents that do
    not have a duplicate title in the training data to compute tags. Finally, write all results to file.
    @return: None
    """
    data = read_zip(trainingzip, trainingfile, cols=["Id", "Title", "Tags"], index_col=0, count=nrows).drop_duplicates(
        cols="Title", take_last=True)   # TODO: take_last=True
    test = read_zip(testzip, testfile, cols=["Id", "Title", "Body"], count=nrows)

    logger.info(asctime() + " Reading tag counts from '{0}'...".format(tagcache))
    tags = joblib.load(tagcache, mmap_mode="r") # no normalization done here
    multitoken_i = multitoken_index(tags)

    logger.info(asctime() + " Loading punkt_tokenizations index from '{0}'...".format(punkt_tokenizationsindexfile))
    punkt_tokenizationindex = joblib.load(punkt_tokenizationsindexfile)
    punctword_tokenizationindex = joblib.load(punctword_tokenizationsindexfile)

    logger.info(asctime() + " Merging training data and test data...")
    predictions = pd.merge(test, data, on="Title", how="left").drop_duplicates("Id")

    missing = predictions.index[predictions.Tags.isnull()]
    logger.info("{0} Computing {1} missing tags between {2} and {3}...".format(asctime(), len(missing),
                                                                               predictions.Id[missing[0]],
                                                                               predictions.Id[missing[-1]]))
    punkt_tokenizations = pd.Series()
    wordpunct_tokenizations = pd.Series()
    counter = 0
    for i in missing:
        counter += 1
        if counter % 10000 == 0:
            logger.info(asctime() + " Done: {0} out of {1}.".format(counter, len(missing)))
        if predictions.Id[i] not in punkt_tokenizations.index:
            logger.info(asctime() + " Loading tokenizations for {0} from '{1}'".format(predictions.Id[i],
                                                                                       punkt_tokenizationindex[
                                                                                           predictions.Id[i]]))
            punkt_tokenizations = joblib.load(punkt_tokenizationindex[predictions.Id[i]])
            logger.info(asctime() + " Loading tokenizations for {0} from '{1}'".format(predictions.Id[i],
                                                                                       punctword_tokenizationindex[
                                                                                           predictions.Id[i]]))
            wordpunct_tokenizations = joblib.load(punctword_tokenizationindex[predictions.Id[i]])
            logger.info(asctime() + " Done reading '{0}'.".format(punctword_tokenizationindex[predictions.Id[i]]))
        tokenization = pd.Series(Counter(punkt_tokenizations[predictions.Id[i]].to_dict()) + Counter(
            wordpunct_tokenizations[predictions.Id[i]].to_dict()))
        predictions.Tags[i] = " ".join(find_tags(tokenization, tags, multitoken_i))

    outfile = "/home/carsten/facebook/predictions_{0}documents.csv".format(nrows)
    logger.info(asctime() + " Writing predictions to '{0}'...".format(outfile))
    predictions.sort(columns="Id").to_csv(outfile, index=False, cols=["Id", "Tags"], quoting=csv.QUOTE_ALL)
    logger.info(asctime() + " Done.")
Example #19
 def memmap(self):
     if isinstance(self.points, numpy.memmap):
         return
     dn = tempfile.mkdtemp(prefix='springmesh')
     Mesh._memmap_dirs.append(dn)
     pfn = os.path.join(dn, 'mesh_points.npy')
     sfn = os.path.join(dn, 'mesh_springs.npy')
     # dump
     dpfn = joblib.dump(self.points, pfn)[0]
     dsfn = joblib.dump(self.springs, sfn)[0]
     # load
     # TODO free originals?
     self.points = joblib.load(dpfn, 'r+')
     self.springs = joblib.load(dsfn, 'r+')
Example #20
def test(netFile, dataSet, model='RNN', trees=None):
    if trees == None:
        if dataSet == "train":
            trees = tr.load_trees(TRAIN_DATA_FILE)
        elif dataSet == "dev":
            trees = tr.load_trees(DEV_DATA_FILE)
    
    assert netFile is not None, "Must give model to test"
    print "Testing netFile %s" % netFile

    #f = open(netFile, 'rb')
    #opts = pickle.load(f)
    #_ = pickle.load(f)
    opts = joblib.load(netFile + "_opts")
    _ = joblib.load(netFile + "_cost")
    
    if (model=='RNTN'):
        nn = RNTN(opts.wvecDim,opts.outputDim,opts.numWords,opts.minibatch)
    elif(model=='RNN'):
        nn = RNN(opts.wvecDim,opts.outputDim,opts.numWords,opts.minibatch)
    elif(model=='RNN2'):
        nn = RNN2(opts.wvecDim,opts.middleDim,opts.outputDim,opts.numWords,opts.minibatch)
    else:
        raise ValueError('%s is not a valid neural network; so far only RNTN, RNN, and RNN2 are supported' % opts.model)
    
    nn.initParams()
    #nn.stack = pickle.load(f)
    #nn.stack = np.load(f)
    nn.stack = joblib.load(netFile + "_stack")
    #f.close()

    print "Testing %s..." % model

    cost, correct, guess, total = nn.costAndGrad(trees, test=True)
    correct_sum = 0
    for i in xrange(0, len(correct)):        
        correct_sum += (guess[i] == correct[i])
    
    # confusion matrix
    conf_arr = np.zeros((opts.outputDim, opts.outputDim))
    for i in xrange(len(correct)):
        curr_correct = correct[i]
        curr_guess = guess[i]
        conf_arr[curr_correct][curr_guess] += 1.0

    #makeconf(conf_arr)
    
    print "Cost %f, Acc %f" % (cost, correct_sum / float(total))
    return correct_sum / float(total)
Example #21
def get_grid_featurized_pdbbind_dataset(subset):
    """Downloads and caches grid featurized PDBBind dataset.

    Args:
        subset (str): subset name of PDBBind dataset.

    Returns (NumpyTupleDataset):
        grid featurized PDBBind dataset.

    """
    x_path, y_path = get_grid_featurized_pdbbind_filepath(subset)
    x = joblib.load(x_path).astype('i')
    y = joblib.load(y_path).astype('f')
    dataset = NumpyTupleDataset(x, y)
    return dataset
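
A brief usage note for the helper above; the 'core' subset name is an assumption about PDBBind naming, not taken from this snippet.

# Hypothetical call: fetch the grid-featurized 'core' subset and check its size.
dataset = get_grid_featurized_pdbbind_dataset('core')
print(len(dataset))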
Example #22
    def check_bad(self, delete_bad=True):
        """Check that the result dumps are not bad -> sometimes length does not
        match the batch. Optionally delete these so that they can be re-grown.

        Parameters
        ----------
        delete_bad : bool
            Delete bad results as they are come across.

        Returns
        -------
        bad_ids : tuple
            The bad batch numbers.
        """
        # XXX: work out why this is needed sometimes on network filesystems.
        result_files = glob(
            os.path.join(self.location, "results", RSLT_NM.format("*")))

        bad_ids = []

        for result_file in result_files:
            # load corresponding batch file to check length.
            result_num = os.path.split(
                result_file)[-1].strip("xyz-result-").strip(".jbdmp")
            batch_file = os.path.join(
                self.location, "batches", BTCH_NM.format(result_num))

            batch = joblib.load(batch_file)

            try:
                result = joblib.load(result_file)
                unloadable = False
            except Exception as e:
                unloadable = True
                err = e

            if unloadable or (len(result) != len(batch)):
                msg = "result {} is bad".format(result_file)
                msg += "." if not delete_bad else " - deleting it."
                msg += " Error was: {}".format(err) if unloadable else ""
                print(msg)

                if delete_bad:
                    os.remove(result_file)

                bad_ids.append(result_num)

        return tuple(bad_ids)
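
A hedged usage sketch; crop stands for whatever instance exposes check_bad and is not defined in this snippet.

# Assumed usage: first report bad result batches without touching them,
# then delete them on a second pass so they can be re-grown.
bad = crop.check_bad(delete_bad=False)
if bad:
    crop.check_bad(delete_bad=True)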
Example #23
def load_net(nnet_file):
    if path.splitext(nnet_file)[1] == '.joblib':
        nnet = joblib.load(nnet_file)
    else:
        with open(nnet_file, 'rb') as fid:
            nnet = pickle.load(fid)
    return nnet
Example #24
def create_test_prediction(dataset, model):
    """Create and yield test prediction, then delete.

    Params
    ------
    dataset : `models.Dataset` instance
        The dataset on which prediction will be performed.
    model  : `models.Model` instance
        The model to use to create prediction.

    """
    with featureset.from_netcdf(model.featureset.file.uri, engine=cfg['xr_engine']) as fset_data:
        model_data = joblib.load(model.file.uri)
        pred_data = predict.model_predictions(fset_data.load(), model_data)
    pred_path = pjoin(cfg['paths']['predictions_folder'],
                      '{}.nc'.format(str(uuid.uuid4())))
    pred_data.to_netcdf(pred_path, engine=cfg['xr_engine'])
    f, created = m.File.create_or_get(uri=pred_path)
    pred = m.Prediction.create(file=f, dataset=dataset, project=dataset.project,
                               model=model, finished=datetime.datetime.now())
    pred.save()
    try:
        yield pred
    finally:
        pred.delete_instance()
Example #25
    def load(self):
        if self.system_joblib:
            import joblib
        else:
            from sklearn.externals import joblib

        X, y = [], []

        filenames = sorted(glob.glob(expand_path(self.filenames)))
        if len(filenames) == 0:
            raise RuntimeError('no filenames matched by pattern: %s' %
                               self.filenames)

        for fn in filenames:
            obj = joblib.load(fn)
            if isinstance(obj, (list, np.ndarray)):
                X.append(obj)
            else:
                X.append(obj[self.x_name])
                y.append(obj[self.y_name])

        if len(X) == 1:
            X = X[0]
        if len(y) == 1:
            y = y[0]
        elif len(y) == 0:
            y = None

        return X, y
Example #26
	def load_p_w(self, sub_folder):
		if os.path.exists(os.path.join(self.cache_path_, sub_folder, 'p_w.hdf')):
			p_w = utils.hdf_to_numpy(os.path.join(self.cache_path_, sub_folder), 'p_w')
		else:
			p_w = joblib.load(os.path.join(self.cache_path_, sub_folder, 'p_w.joblib'))

		return p_w
def dimReduction(corpus,mode,idx):
    
    print("Dimension reduction...")
    if sp.sparse.isspmatrix_csr(corpus):
        data_matrix = corpus.toarray()
    data_matrix=[]
    if mode == 'train':
        dim_reduc_pipe = marcos.DIMREDUC_PIPE
        dim_reduc_pipe.set_params(pca__n_components=1000)

        # bow_transformer = BOWTransformer()
        data_matrix = dim_reduc_pipe.fit_transform(corpus)

        #save transform model
        jl.dump(dim_reduc_pipe,'{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR,idx))

    elif mode == 'test':
        dim_reduc_pipe = jl.load('{}/{}.model_reduc'.format(marcos.TRANSFORM_MODEL_DIR,idx))
        data_matrix = dim_reduc_pipe.transform(corpus)

    else:
        print("Unexpected mode in BOWtransform",file=sys.stderr)
        sys.exit()

    # turn dt matrix to list
    print ("The shape of dt matrix is {} (after dimension reduction)\n".format(data_matrix.shape))

    return data_matrix.tolist()
    def compute_distance_matrix(self):
        triples = (((ti.left_entity, ti.relation, ti.right_entity), (tj.left_entity, tj.relation, tj.right_entity))
                   for ti,tj in product(*[self.kb_triples, self.kb_triples]))

        entity = load(self.file_entity, mmap_mode="r")
        relation_normal = load(self.file_relation_normal, mmap_mode="r")
        relation = load(self.file_relation, mmap_mode="r")

        distances = self.parallel_pool(delayed(kernel_density_pair)
                                       (self, pair, relation_normal=relation_normal,
                                        relation=relation, entity=entity, mmaped=True)
                                       for pair in triples)

        self.distances = distances
        distances = np.array(distances)
        self.distance_matrix = distances.reshape(len(self.kb_triples), len(self.kb_triples))
def BOWtransform(corpus,mode,idx):

    data_matrix=[]
    print('Transform data...')

    if mode == 'train':
        bow_transformer = BOWTransformer()
        data_matrix = bow_transformer.fit_transform(corpus)

        #save transform model
        jl.dump(bow_transformer,'{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR,idx))

    elif mode == 'test':
        bow_transformer = jl.load('{}/{}.model'.format(marcos.TRANSFORM_MODEL_DIR,idx))
        data_matrix = bow_transformer.transform(corpus)

    else:
        print("Unexpected mode in BOWtransform",file=sys.stderr)
        sys.exit()

    # turn dt matrix to list
    print ("The shape of dt matrix is {}\n".format(data_matrix.shape))

    if sp.sparse.isspmatrix_csr(data_matrix):
        data_matrix = data_matrix.toarray().tolist()
    else: #pass through dimension reduction pipe
        data_matrix = data_matrix.tolist()

    return data_matrix
Example #30
    def from_file(cls, objdump_path):
        '''
        Parameters
        ----------
        objdump_path: str
            Path to the object dump file.

        Returns
        -------
        instance
            New instance of an object from the pickle at the specified path.
        '''
        obj_version, object = joblib.load(objdump_path)
        # Check that we've actually loaded a PersistenceMixin (or sub-class)
        if not isinstance(object, cls):
            raise ValueError(('The pickle stored at {} does not contain ' +
                              'a {} object.').format(objdump_path, cls))
        # Check that versions are compatible. (Currently, this just checks
        # that major versions match.)
        elif obj_version[0] == VERSION[0]:
            if not hasattr(object, 'sampler'):
                object.sampler = None
            return object
        else:
            raise ValueError(("{} stored in pickle file {} was created with version {} "
                              "of {}, which is incompatible with the current version "
                              "{}").format(cls.__name__, objdump_path,
                                           '.'.join(obj_version), cls.__name__,
                                           '.'.join(VERSION)))
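
A minimal usage sketch for the class method above, assuming a hypothetical SomeModel class that mixes in from_file and an existing dump file.

# Hypothetical call: restore the (version, object) pair dumped earlier.
model = SomeModel.from_file('some_model.joblib')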
Example #31
def run(dataset_path=DEFAULT_DATASET, dataset_name='timit',
        iterator_type=ABX2OIterator, batch_size=100,
        nframes=13, features="fbank",
        init_lr=0.01, max_epochs=500, 
        network_type="dropout_net", trainer_type="adadelta",
        layers_types=[ReLU, ReLU, ReLU, ReLU, LogisticRegression],
        layers_sizes=[2400, 2400, 2400, 2400],
        dropout_rates=[0.2, 0.5, 0.5, 0.5, 0.5],
        prefix_fname='',
        debug_print=0,
        debug_time=False,
        debug_plot=0):
    """
    FIXME TODO
    """

    output_file_name = dataset_name
    if prefix_fname != "":
        output_file_name = prefix_fname + "_" + dataset_name
    output_file_name += "_" + features + str(nframes)
    output_file_name += "_" + network_type + "_" + trainer_type
    output_file_name += "_emb_" + str(DIM_EMBEDDING)
    print "output file name:", output_file_name

    n_ins = None
    n_outs = None
    print "loading dataset from", dataset_path
     # TODO DO A FUNCTION
    if dataset_path[-7:] != '.joblib':
        print >> sys.stderr, "prepare your dataset with align_words.py or lucid.py or buckeye.py"
        sys.exit(-1)

    ### LOADING DATA
    data_same = joblib.load(dataset_path)
    shuffle(data_same)

    has_dev_and_test_set = True
    dev_dataset_path = dataset_path[:-7].replace("train", "") + 'dev.joblib'
    test_dataset_path = dataset_path[:-7].replace("train", "") + 'test.joblib'
    dev_split_at = len(data_same)
    test_split_at = len(data_same)
    if not os.path.exists(dev_dataset_path) or not os.path.exists(test_dataset_path):
        has_dev_and_test_set = False
        dev_split_at = int(0.8 * dev_split_at)
        test_split_at = int(0.9 * test_split_at)

    print data_same[0]
    print data_same[0][3].shape
    n_ins = data_same[0][3].shape[1] * nframes
    n_outs = DIM_EMBEDDING

    normalize = True
    min_max_scale = False
    marginf = (nframes-1)/2  # TODO

    ### TRAIN SET
    if has_dev_and_test_set:
        train_set_iterator = iterator_type(data_same,
                normalize=normalize, min_max_scale=min_max_scale,
                scale_f1=None, scale_f2=None, nframes=nframes,
                batch_size=batch_size, marginf=marginf)
    else:
        train_set_iterator = iterator_type(
                data_same[:dev_split_at], normalize=normalize,
                min_max_scale=min_max_scale, scale_f1=None, scale_f2=None,
                nframes=nframes, batch_size=batch_size, marginf=marginf)
    f1 = train_set_iterator._scale_f1
    f2 = train_set_iterator._scale_f2

    ### DEV SET
    if has_dev_and_test_set:
        data_same = joblib.load(dev_dataset_path)
        valid_set_iterator = iterator_type(data_same,
                normalize=normalize, min_max_scale=min_max_scale,
                scale_f1=f1, scale_f2=f2,
                nframes=nframes, batch_size=batch_size, marginf=marginf)
    else:
        valid_set_iterator = iterator_type(
                data_same[dev_split_at:test_split_at], normalize=normalize,
                min_max_scale=min_max_scale, scale_f1=f1, scale_f2=f2,
                nframes=nframes, batch_size=batch_size, marginf=marginf)

    ### TEST SET
    if has_dev_and_test_set:
        data_same = joblib.load(test_dataset_path)
        test_set_iterator = iterator_type(data_same,
                normalize=normalize, min_max_scale=min_max_scale,
                scale_f1=f1, scale_f2=f2, nframes=nframes,
                batch_size=batch_size, marginf=marginf)
    else:
        test_set_iterator = iterator_type(
                data_same[test_split_at:], normalize=normalize,
                min_max_scale=min_max_scale, scale_f1=f1, scale_f2=f2,
                nframes=nframes, batch_size=batch_size, marginf=marginf)

    assert n_ins != None
    assert n_outs != None

    # numpy random generator
    numpy_rng = numpy.random.RandomState(123)
    print '... building the model'

    # TODO the proper network type other than just dropout or not
    nnet = None
    fast_dropout = False
    if "dropout" in network_type:
        nnet = DropoutABNeuralNet(numpy_rng=numpy_rng,  # TODO with 2 Outputs
                n_ins=n_ins,
                layers_types=layers_types,
                layers_sizes=layers_sizes,
                n_outs=n_outs,
                loss='cos_cos2',
                rho=0.95,
                eps=1.E-6,
                max_norm=4.,
                fast_drop=fast_dropout,
                debugprint=debug_print)
    else:
        nnet = ABNeuralNet2Outputs(numpy_rng=numpy_rng, 
                n_ins=n_ins,
                layers_types=layers_types,
                layers_sizes=layers_sizes,
                n_outs=n_outs,
                loss='cos_cos2',
                #loss='dot_prod',
                rho=0.90,
                eps=1.E-6,
                max_norm=0.,
                debugprint=debug_print)
    print "Created a neural net as:",
    print str(nnet)

    # get the training, validation and testing function for the model
    print '... getting the training functions'
    print trainer_type
    train_fn = None
    if debug_plot or debug_print:
        if trainer_type == "adadelta":
            train_fn = nnet.get_adadelta_trainer(debug=True)
        elif trainer_type == "adagrad":
            train_fn = nnet.get_adagrad_trainer(debug=True)
        else:
            train_fn = nnet.get_SGD_trainer(debug=True)
    else:
        if trainer_type == "adadelta":
            train_fn = nnet.get_adadelta_trainer()
        elif trainer_type == "adagrad":
            train_fn = nnet.get_adagrad_trainer()
        else:
            train_fn = nnet.get_SGD_trainer()

    train_scoref_w = nnet.score_classif_same_diff_word_separated(train_set_iterator)
    valid_scoref_w = nnet.score_classif_same_diff_word_separated(valid_set_iterator)
    test_scoref_w = nnet.score_classif_same_diff_word_separated(test_set_iterator)
    train_scoref_s = nnet.score_classif_same_diff_spkr_separated(train_set_iterator)
    valid_scoref_s = nnet.score_classif_same_diff_spkr_separated(valid_set_iterator)
    test_scoref_s = nnet.score_classif_same_diff_spkr_separated(test_set_iterator)
    data_iterator = train_set_iterator

    print '... training the model'
    # early-stopping parameters
    patience = 1000  # look as this many examples regardless TODO
    patience_increase = 2.  # wait this much longer when a new best is
                            # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant

    best_validation_loss = numpy.inf
    test_score = 0.
    start_time = time.clock()

    done_looping = False
    epoch = 0
    lr = init_lr
    timer = None
    if debug_plot:
        print_mean_weights_biases(nnet.params)
    #with open(output_file_name + 'epoch_0.pickle', 'wb') as f:
    #    cPickle.dump(nnet, f, protocol=-1)

    while (epoch < max_epochs) and (not done_looping):
        epoch = epoch + 1
        avg_costs = []
        avg_params_gradients_updates = []
        if debug_time:
            timer = time.time()
        for iteration, (x, y) in enumerate(data_iterator):
            #print "x[0][0]", x[0][0]
            #print "x[1][0]", x[1][0]
            #print "y[0][0]", y[0][0]
            #print "y[1][0]", y[1][0]
            avg_cost = 0.
            if "delta" in trainer_type:  # TODO remove need for this if
                avg_cost = train_fn(x[0], x[1], y[0], y[1])
            else:
                avg_cost = train_fn(x[0], x[1], y[0], y[1], lr)
            if debug_print >= 3:
                print "cost:", avg_cost[0]
            if debug_plot >= 2:
                plot_costs(avg_cost[0])
                if not len(avg_params_gradients_updates):
                    avg_params_gradients_updates = map(numpy.asarray, avg_cost[1:])
                else:
                    avg_params_gradients_updates = rolling_avg_pgu(
                            iteration, avg_params_gradients_updates,
                            map(numpy.asarray, avg_cost[1:]))
            if debug_plot >= 3:
                plot_params_gradients_updates(iteration, avg_cost[1:])
            if type(avg_cost) == list:
                avg_costs.append(avg_cost[0])
            else:
                avg_costs.append(avg_cost)
        if debug_print >= 2:
            print_mean_weights_biases(nnet.params)
        if debug_plot >= 2:
            plot_params_gradients_updates(epoch, avg_params_gradients_updates)
        if debug_time:
            print('  epoch %i took %f seconds' % (epoch, time.time() - timer))
        avg_cost = numpy.mean(avg_costs)
        if numpy.isnan(avg_cost):
            print("avg costs is NaN so we're stopping here!")
            break
        print('  epoch %i, avg costs %f' % \
              (epoch, avg_cost))
        tmp_train = zip(*train_scoref_w())
        print('  epoch %i, training sim same words %f, diff words %f' % \
              (epoch, numpy.mean(tmp_train[0]), numpy.mean(tmp_train[1])))
        tmp_train = zip(*train_scoref_s())
        print('  epoch %i, training sim same spkrs %f, diff spkrs %f' % \
              (epoch, numpy.mean(tmp_train[0]), numpy.mean(tmp_train[1])))
        # TODO update lr(t) = lr(0) / (1 + lr(0) * lambda * t)
        lr = numpy.float32(init_lr / (numpy.sqrt(iteration) + 1.)) ### TODO
        #lr = numpy.float32(init_lr / (iteration + 1.)) ### TODO
        # or another scheme for learning rate decay
        #with open(output_file_name + 'epoch_' +str(epoch) + '.pickle', 'wb') as f:
        #    cPickle.dump(nnet, f, protocol=-1)

        # we check the validation loss on every epoch
        validation_losses_w = zip(*valid_scoref_w())
        validation_losses_s = zip(*valid_scoref_s())
        this_validation_loss = 0.25*(1.-numpy.mean(validation_losses_w[0])) +\
                0.25*numpy.mean(validation_losses_w[1]) +\
                0.25*(1.-numpy.mean(validation_losses_s[0])) +\
                0.25*numpy.mean(validation_losses_s[1])

        print('  epoch %i, valid sim same words %f, diff words %f' % \
              (epoch, numpy.mean(validation_losses_w[0]), numpy.mean(validation_losses_w[1])))
        print('  epoch %i, valid sim same spkrs %f, diff spkrs %f' % \
              (epoch, numpy.mean(validation_losses_s[0]), numpy.mean(validation_losses_s[1])))
        # if we got the best validation score until now
        if this_validation_loss < best_validation_loss:
            with open(output_file_name + '.pickle', 'wb') as f:
                cPickle.dump(nnet, f, protocol=-1)
            # improve patience if loss improvement is good enough
            if (this_validation_loss < best_validation_loss *
                improvement_threshold):
                patience = max(patience, iteration * patience_increase)
            # save best validation score and iteration number
            best_validation_loss = this_validation_loss
            # test it on the test set
            test_losses_w = zip(*test_scoref_w())
            test_losses_s = zip(*test_scoref_s())
            print('  epoch %i, test sim same words %f, diff words %f' % \
                  (epoch, numpy.mean(test_losses_w[0]), numpy.mean(test_losses_w[1])))
            print('  epoch %i, test sim same spkrs %f, diff spkrs %f' % \
                  (epoch, numpy.mean(test_losses_s[0]), numpy.mean(test_losses_s[1])))
        if patience <= iteration:  # TODO correct that
            done_looping = True
            break

    end_time = time.clock()
    print(('Optimization complete with best validation score of %f, '
           'with test performance %f') %
                 (best_validation_loss, test_score))
    print >> sys.stderr, ('The fine tuning code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time)
                                              / 60.))
    with open(output_file_name + '_final.pickle', 'wb') as f:
        cPickle.dump(nnet, f, protocol=-1)
Example #32
st.sidebar.title("Number of Samples")
number_samples = st.sidebar.slider('#Visuliazation Samples', 10, 1000)

# load related data
X_test = np.load('data/X_test.npy')
y_test = np.load('data/y_test.npy')
df = pd.read_csv('data/kc_house_data.csv', parse_dates=['date'])
df_new = df[['lat', 'long']]
df_new.columns = ['lat', 'lon']
st.map(df_new)

st.write('\n')

# plot LR model result
if model_name == "Linear Regression":
    model_lr = load('model/regr.joblib')
    y_pred_lr = model_lr.predict(X_test)
    st.write('R2 Error of ', model_name, 'is ',
             round(r2_score(y_test, y_pred_lr), 3))
    chart_plot(model_name, y_pred_lr, number_samples)
# plot RF model result
elif model_name == "Random Forest":
    model_rf = load('model/rf.joblib')
    y_pred_rf = model_rf.predict(X_test)
    st.write('R2 Error of ', model_name, 'is ',
             round(r2_score(y_test, y_pred_rf), 3))
    chart_plot(model_name, y_pred_rf, number_samples)
# compare models
else:
    model_lr = load('model/regr.joblib')
    y_pred_lr = model_lr.predict(X_test)
def main():
    st.title('Trying out Sentiment Analysis with Streamlit!')

    st.subheader("EDA, Data Cleaning, & Modeling with Kaggle's \
		Twitter US Airline Sentiment Dataset.")

    main_image = Image.open('./Images/nlp-pipe.jpg')
    st.image(main_image, use_column_width=True)

    html_temp = """
	<div style="background-color:tomato;"><p style="color:white; font-size:18px; text-align:center">Choose what to do:</p></div>
	"""
    st.markdown(html_temp, unsafe_allow_html=True)

    if st.checkbox('Exploratory Data Analysis'):
        explorer = EDA()
        n_rows = st.sidebar.slider('Displaying dataset, select number of rows',
                                   10, 20)

        all_cols = explorer.df.columns.tolist()
        select_cols = st.sidebar.multiselect('Select column(s) to display:',
                                             all_cols,
                                             ['airline_sentiment', 'text'])

        'Number of rows:', n_rows,  #
        explorer.df[select_cols].head(n_rows),  #

        if st.sidebar.checkbox('Most Frequent Words Per Category'):
            '---------------------------------------------',  #
            st.info("Try with removing stopwords and/or tags('@'/'#')")
            st.write(
                'Most Frequent Words for Positive(Blue), Negative(Red), and Neutral(Green) Tweets:'
            )
            c = st.sidebar.slider(
                'Select a number for the top frequent words to display', 10,
                15, 10)
            c = int(c)

            remove_stop = False
            if st.sidebar.checkbox('Remove stop words'):
                remove_stop = True

            remove_at = False
            if st.sidebar.checkbox('Remove @ and #'):
                remove_at = True

            freqs = explorer.most_freq_words(c, remove_at, remove_stop)
            plt.show()
            st.pyplot()

            cat = st.sidebar.selectbox(
                "To view word counts, select a sentiment category",
                ('Positive', 'Negative', 'Neutral'))

            if cat == 'Positive':
                'Top words in ', freqs[0][0], ' tweets',  #
                freqs[0][1].head(c),  #
            elif cat == 'Negative':
                'Top words in ', freqs[1][0], ' tweets',  #
                freqs[1][1].head(c),  #
            else:
                'Top words in ', freqs[2][0], ' tweets',  #
                freqs[2][1].head(c),  #

        if st.sidebar.checkbox('Word Counts'):
            '---------------------------------------------',  #
            explorer.word_counts()
            st.pyplot()

        if st.sidebar.checkbox("View most frequent @'s and #'s"):
            '---------------------------------------------',  #
            char = st.sidebar.radio('', ('@', '#'))
            if char == '@':
                explorer.find_at_hash()
            else:
                explorer.find_at_hash(at=False)
            st.pyplot()

        if st.sidebar.checkbox("View most frequent emojis and emoticons"):
            '---------------------------------------------',  #
            c = st.sidebar.slider('Choose the number of top emojis to view',
                                  10, 20)
            emojis = explorer.find_emojis()
            emojis.head(c),  #
            st.balloons()

        if st.sidebar.checkbox('Target Field'):
            '---------------------------------------------',  #
            explorer.show_target_field()
            st.pyplot()

    if st.checkbox("Text Preprocessing And Sentiment Analysis"):
        text = st.text_area(
            "Enter your text to analize:",
            "@americanairline Thanks for the #amazing flying experience!")
        cleaner = Cleaner(text)
        operations = st.sidebar.multiselect(
            "Choose the preprocessing steps to perform", [
                'Lowercasing', 'Remove html tags', 'Remove punctuations',
                'Replace links', 'Replace emojis', 'Replace Mentions(@)',
                'Replace Hashtags(#)', 'Remove stop words', 'Lemmatization',
                'Spell correction'
            ], ['Remove stop words'])

        str_to_func = {
            'Lowercasing': cleaner.lowercasing,
            'Remove html tags': cleaner.remove_html,
            'Remove punctuations': cleaner.remove_punc,
            'Replace links': cleaner.replace_links,
            'Replace Mentions(@)': cleaner.replace_mentions,
            'Replace Hashtags(#)': cleaner.replace_hashtags,
            'Replace emojis': cleaner.replace_emojis,
            'Remove stop words': cleaner.remove_stop,
            'Lemmatization': cleaner.lemmatize,
            'Spell correction': cleaner.sepll_correct
        }

        if not operations:
            st.info('### No preprocessing steps selected')
        else:
            for op in operations:
                op = str_to_func[op]
                sample_text, findings = op()

                if findings:
                    st.info(op.__doc__ + ', '.join(findings).strip())

            st.write('#### Preprocessed text: ', sample_text)

        if st.button("Analyze Text Sentiment"):
            model = load('./Model/lr_clf.joblib')
            # confusion_matrix = Image.open('./Images/confusion_matrix.jpg')
            # 'Model Performance on the Test set:', #
            # st.image(confusion_matrix)

            class_names = ['negative', 'neutral', 'positive']
            explainer = LimeTextExplainer(class_names=class_names)

            if text:
                model = load('./lr_clf.joblib')
                processed_text, sentiment = get_sentiment(text, model)
                'Original text ---> ', text,  #
                'Processed text --> ', processed_text,  #
                'Text Sentiment --> {}'.format(sent_dict[sentiment]),  #

                exp = explainer.explain_instance(processed_text,
                                                 model.predict_proba)
                # exp.show_in_notebook()
                exp.as_pyplot_figure()
                st.pyplot()
Example #34
from sklearn import datasets
from joblib import load
import numpy as np
import json

#load the model

my_model = load('svc_model.pkl')


iris_data = datasets.load_iris()
class_names = iris_data.target_names

def my_prediction(id):
    dummy = np.array(id)
    dummyT = dummy.reshape(1,-1)
    r = dummy.shape
    t = dummyT.shape
    r_str = json.dumps(r)
    t_str = json.dumps(t)
    prediction = my_model.predict(dummyT)
    name = class_names[prediction]
    name = name.tolist()
    name_str = json.dumps(name)
    str = [t_str, r_str, name_str]
    return str
Example #35
import requests
from joblib import load, dump
url = 'http://localhost:5000/sample_predict'
data = load(r'F:\ML App\app\Model\sample_data.pkl')
#json={'LotArea':10084,'YearBuilt':2004,'1stFlrSF':1694,'2ndFlrSF':0,'FullBath':2,'BedroomAbvGr':3,'TotRmsAbvGrd':7}
r = requests.post(url)
print(r.json())
#print(data)
Example #36
@app.get("/")
def root():
    return {"message": "hello world"}


# In[39]:


@app.get("/item/")
async def create_item(item: Item):
    return item


# In[40]:


@app.get("/prediction/{stdInput}")
async def prediction(stdInput):
    classT, proba = compare(clf.predict_proba([stdInput])[0])
    result = " Class: %d with %f %%. " % (classT, proba)
    return result


# ## Prediction

# In[41]:


clf = load('labelsTrained.joblib')

Example #37
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

#loading the model...
model = joblib.load('house_price_prediction.pkl')

#load the data set...
df = pd.read_csv('clean_house_data.csv')

X = df.drop('price', axis=1)  #train data
y = df['price']  #target data

#print(X.head())

#spliting the data for training and test....
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

#feature scalling....
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


#function to predict price....
def predict_house_price(bath, balcony, bhk, total_sqft_int, price_per_sqft,
Example #38
    # as csv and h5 files
    filename = '_'.join(['all_evaluation', '_'.join(filenames)])
    ((all_evaluations.sort_values(['accuracy MEAN', 'accuracy SD'],
                                  ascending=[False, True])).to_csv(os.path.join(path, filename + '.csv'), sep=';',
                                                                   decimal=','))
    ((evaluation.sort_values(['accuracy MEAN', 'accuracy SD'],
                             ascending=[False, True])).to_hdf(os.path.join(path, filename + '.h5'), key='evaluation',
                                                              mode='w'))

print()
print('Done')


#%% create kaggle file
if False:
    #load model
    filename='ridge'
    f=os.path.join(path, 'ridge_0_scaling_cross-validation_bestEstimator.joblib')
    model=load(f)
    
    #encode test data
    test_data_encoded = enc.transform(test_data)
    
    #predict
    y_test_data_enc=model.predict(test_data_encoded)
    
    y_test_data=le.inverse_transform(y_test_data_enc)
    #write output file
    kaggleOutput=pd.DataFrame( data={'ID':test_data.index.values,'Class':y_test_data})
    kaggleFile=f=os.path.join(path, 'kaggle.csv')
    kaggleOutput.to_csv(kaggleFile, index=False)
Example #39
st.markdown(hide_streamlit_style, unsafe_allow_html=True)

st.write("# Fake Message Recognition Engine")

message_text = st.text_area("Enter a message for  evaluation")


def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)  # Effectively removes HTML markup tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace(
        '-', '')
    return text


model = joblib.load('spam_classifier.joblib')

message_submit = st.button('Evaluate')

if message_submit:

    label = (model.predict([message_text])[0])
    spam_prob = (model.predict_proba([message_text]))
    if label == "spam":
        label = "fake"
    elif label == "ham":
        label = "real"

    result = {'label': label, 'probability': spam_prob[0][1]}

    st.write(result)
def predict_mlp(image, im_origin, model_name, mu, std) :
    """
    Function that makes the prediction based on an MLP model.

    Inputs : - image --> original image to predict
             - model_name --> name of the model (in the same folder as the main)
    """
    
    # Loading the model
    model = load(model_name)
    
    # Find maximal dimension
    dim = max(image.shape)
    ind = np.argmin(image.shape)
    
    # Padding to have a square image
    if ind == 0 :
        i = 0
        while image.shape[ind] != dim :
            if i%2 == 0 :
                image = pad(image, ((1,0),(0,0)), mode='maximum')
            else :
                image = pad(image, ((0,1),(0,0)), mode='maximum')
            i+=1
            
    else :
        i = 0
        while image.shape[ind] != dim :
            if i%2 == 0 :
                image = pad(image, ((0,0),(1,0)), mode='maximum')
            else :
                image = pad(image, ((0,0),(0,1)), mode='maximum')
            i+=1
        
    # Resize to have a correct dimension for the input of the mlp
    image = resize(image, (28, 28))
    image = median(image)
    
    # Threshold for the binarisation
    thresholds = threshold_multiotsu(im_origin, classes=2)
    thresh_background = thresholds[0]
    
    # Binarisation
    im = image.copy()
    """
    im[np.where(image>=thresh_background)] = 0
    im[np.where(image<thresh_background)] = 255
    """
    im[np.where(image>=thresh_background)] = 1
    im = invert(im)
    m = np.max(im)
    im = im/m*255

    test = im.copy()
    
    # Copy for the plot
    #plot = im.copy()
    
    #mu = image.mean()
    #std = image.std()
    im = im.reshape(1, -1)
    
    # Normalisation
    im_norm = (im - mu)/std
    prediction = model.predict(im_norm)
    prediction_string = str(prediction[0])
    prob = model.predict_proba(im_norm)
    
    # Plot
    #plt.imshow(plot, cmap='gray')
    #plt.title('Rotated binary box : %d , %f' %(int(prediction), prob[0][int(prediction)]))
    #plt.show()
    
    return prediction_string, prob, test
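# Hypothetical usage of predict_mlp (the names below are illustrative, not from
# the original code): `digit_crop` is a cropped grayscale region, `page_image`
# the image it came from, and `mu`/`std` the normalisation statistics saved
# alongside the model.
# label, probabilities, binarised = predict_mlp(digit_crop, page_image,
#                                               'mlp_model.joblib', mu, std)
# print(label, probabilities.max())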
Exemple #41
0
#Load credentials from .env
name = os.environ["DB_NAME_AWS"]
password = os.environ["DB_PW_AWS"]
host = os.environ["DB_HOST_AWS"]
user = os.environ["DB_USER_AWS"]

pg_conn = psycopg2.connect(dbname=name,
                           user=user,
                           password=password,
                           host=host)
## Cursor is always open
pg_curs = pg_conn.cursor()

# Load in slimmed random forest pickled model
test_model = load("targetiterrobustforest.joblib")

# Load the craigslist cleaned data
df_cl = pd.read_csv("data/model_and_image_url_lookup.csv")
# List of unique CL cars
cl_models = sorted(df_cl.model.unique())


def status_200_or_nan(url):
    response = requests.get(url)
    if response.status_code == 200:
        return url
    else:
        return np.nan

Exemple #42
0
# env FLASK_APP=api.py flask run
# in order to start operating the api

from flask import Flask, request

app = Flask(__name__)

import sys
import string
from joblib import dump, load
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.svm import LinearSVC
import json

count_vect2 = load('count_vect2.joblib')
tf_transformer2 = load('tf_transformer2.joblib')
model = load('svm.joblib')


@app.route('/')
def hello_world():
    return 'Welcome to the API for review classification. To classify text, your API call should look something like: http://127.0.0.1:5000/classify/?text=my text \n'


@app.route('/hello')
def api_hello():
    if 'name' in request.args:
        return 'Hello ' + request.args['name']
    else:
        return 'Hello John Doe'
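# The welcome message above mentions a /classify endpoint that is not shown in
# this excerpt. A minimal sketch of what it could look like, assuming the loaded
# objects form a CountVectorizer -> TfidfTransformer -> LinearSVC chain (the
# route name and response keys are assumptions):
@app.route('/classify/')
def classify():
    text = request.args.get('text', '')
    counts = count_vect2.transform([text])
    tfidf = tf_transformer2.transform(counts)
    prediction = model.predict(tfidf)[0]
    return json.dumps({'text': text, 'class': str(prediction)})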
Exemple #43
0
import joblib
import tensorflow as tf
from tensorflow import keras

pickle_file = './FERDataset/FER.joblib'

with open(pickle_file, 'rb') as f:
  save = joblib.load(f)

  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']

  del save

  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

image_size = 48
num_labels = 7
num_channels = 1 # grayscale

model = keras.Sequential([
  keras.layers.Reshape((1,48,48), input_shape=((48,48))),
  keras.layers.Conv2D(filters=64,
                      kernel_size=5,
                      data_format='channels_first',
Exemple #44
0
results = pd.DataFrame(dict(Counter(results[2])).items()).sort_values(1)
results[0] = results[0].apply(np.abs)
results = results.groupby(0).sum()
total = results.sum().item()
results["diff"] = results[1] / total
results = results.reset_index()

results["diff"]

# Save model
joblib.dump(vectorizer, open("tfidf.joblib", "wb"))
joblib.dump(lda, open("lda.joblib", "wb"))
joblib.dump(clf, open("lrclf.joblib", "wb"))
pickle.dump(stp.TextPreprocess(), open("textprocess.pkl", "wb"))

# Predict
vectorizer = joblib.load(path + "cvss_flask/tfidf.joblib")
lda = joblib.load(path + "cvss_flask/lda.joblib")
clf = joblib.load(path + "cvss_flask/lrclf.joblib")
tp = stp.TextPreprocess()


def pred_cvss(input_raw):
    """Predict CVSS score."""
    cleaned = tp.transform_df(pd.DataFrame([input_raw]),
                              reformat="stopstemprocessonly",
                              columns=[0]).iloc[0][0]
    input_tfidf = vectorizer.transform([cleaned])
    input_lda = lda.transform(input_tfidf.toarray())
    return clf.predict(input_lda)[0]
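# Hypothetical call (the description text is illustrative only):
# score = pred_cvss("Buffer overflow allows remote attackers to execute arbitrary code.")
# print(score)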
'''
batch_size = [128]
lr = [1e-3]
hidden_units = [128]
epochs = [10]

param_grid = dict(batch_size=batch_size,
                  lr=lr,
                  hidden_units=hidden_units,
                  epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid.fit(train_x, train_y)
joblib.dump(grid.best_estimator_, '{}/grid.pkl'.format(save_dir))

print()
print("Best parameters set found:")
print()
print(grid.best_params_)
print()

# predict
print("testing model ...")
loaded_model = joblib.load('{}/grid.pkl'.format(save_dir))
predictions = loaded_model.predict(test_x, batch_size=128, verbose=1)
predictions = (predictions >= 0.5).astype('int')

test_y = test_y.reshape((-1, 1))

print("Classfication report:")
print(classification_report(test_y, predictions))
Exemple #46
0
    def load(self, filename: str):
        self.probs = joblib.load(filename)
Exemple #47
0
from flask import Flask, request, jsonify
app = Flask(__name__)

import numpy as np
from joblib import load
dt = load('dt1.joblib')


@app.route('/prueba/<uuid>', methods=['POST'])
def add_message(uuid):
    content = request.json
    print(content)
    ejemplo = np.array([
        content['c1'], content['c2'], content['c3'], content['c4'],
        content['c5'], content['c6'], content['c7'], content['c8'],
        content['c9'], content['c10'], content['c11']
    ])
    a = dt.predict(ejemplo.reshape(1, -1))  # predict once and reuse the result
    print(a)
    return jsonify({"resultado": a[0]})


@app.route('/image', methods=['POST'])
def post():
    request_data = request.form['some_text']
    print(request_data)
    imagefile = request.files.get('imagefile', '')
    imagefile.save('test_image.jpg')
    return jsonify({"status": 'OK'})
Exemple #48
0
    lstBrevet = ficBrevet['brevets']
#        if data.has_key('requete'):
#            DataBrevet['requete'] = data["requete"]
    print ("Found "+ ndf+ " datafile with " +str(len(lstBrevet)) + " patents!")
else:
    print ('gather your data again')
    sys.exit()

cles =  ['IPCR11', 'CitO', 'dateDate', 'inventor-nice', 'equivalents', 'CitedBy', 'representative', 'Inventor-Country', 'date', 'inventor', 'kind', 'priority-active-indicator', 'applicant-nice', 'IPCR1', 'country', 'IPCR3', 'applicant', 'IPCR4', 'IPCR7', 'title', 'application-ref']
Titles = []
Labels = []

Abstracts  = [] # Pure abstracts
IPCRsText  = []

Contents = joblib.load(os.path.normpath(ResultContentsPath+'//Contents-'+ndf+'.pkl'))   # Contains IPCRs (text of associated IPCR classes) + title + abstracts 
Titles = joblib.load( os.path.normpath(ResultContentsPath+'//Titles-'+ndf+'.pkl'))
Labels = joblib.load( os.path.normpath(ResultContentsPath+'//Labels-'+ndf+'.pkl'))
IPCRsText = joblib.load( os.path.normpath(ResultContentsPath+'//IPCRsText-'+ndf+'.pkl'))
Abstracts = joblib.load( os.path.normpath(ResultContentsPath+'//Abstracts-'+ndf+'.pkl'))
CIB = []

print("loading patents contents")
#
Tit2FicName=joblib.load(os.path.normpath(ResultContentsPath+'//Titles_ficNames-'+ndf+'.pkl'))
#
FreqTrie= joblib.load(os.path.normpath(ResultContentsPath+'//FreqTrie'+ndf+'.pkl'))
word_freq_df = joblib.load(os.path.normpath(ResultContentsPath+'//word_freq'+ndf+'.pkl'))

D0 = len(set(word_freq_df['term']))  # number of unique terms in the corpus
H0 = np.log(D0)  # log of the vocabulary size (maximum possible entropy)
Exemple #49
0
import glob
import flask
from dash.dependencies import Input, Output
from pvtm.pvtm import PVTM, Documents

# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()

# general
ap.add_argument("-m", "--model", required=True,
                help="path to the trained PVTM model")

parsed_args = ap.parse_args()
args = vars(parsed_args)

data = joblib.load(args['model'])

image_directory = 'Output/'
if not os.path.exists(os.path.dirname(image_directory)):
    try:
        os.makedirs(os.path.dirname(image_directory))
    except OSError as exc: # Guard against race condition
        if exc.errno != errno.EEXIST:
            raise

for i in range(data.gmm.n_components):
    data.wordcloud_by_topic(i).to_file('Output/img_{}.png'.format(i))

def generate_table(dataframe, max_rows=20):
    return html.Table(
        # Header
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.safe_load(f.read())  # safe_load avoids the bare-Loader deprecation
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    disc_model = MLPDisc(
        obs_dim +
        action_dim if not variant['adv_irl_params']['state_only'] else 2 *
        obs_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvIRL(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       expert_replay_buffer=expert_replay_buffer,
                       **variant['adv_irl_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
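# A minimal sketch of the `variant` dictionary that experiment() expects, built
# only from the keys read above; every value is an illustrative placeholder,
# not a recommended or original setting:
example_variant = {
    'expert_name': 'halfcheetah_expert',      # assumed key in expert_demos_listing.yaml
    'expert_idx': 0,
    'env_specs': {
        'env_name': 'halfcheetah',            # assumed environment name
        'env_kwargs': {},
        'eval_env_seed': 0,
        'training_env_seed': 1,
    },
    'scale_env_with_demo_stats': True,
    'policy_net_size': 256,
    'policy_num_hidden_layers': 2,
    'disc_num_blocks': 2,
    'disc_hid_dim': 128,
    'disc_hid_act': 'tanh',
    'disc_use_bn': False,
    'disc_clamp_magnitude': 10.0,
    'sac_params': {},                         # forwarded to SoftActorCritic
    'adv_irl_params': {'state_only': False},  # forwarded to AdvIRL
}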
from joblib import load
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

model = load("models/randomforest.pkl")
col = load('models/column_list.pkl')
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
# print("Feature ranking:")
feature_names = col

# for f in range(10):
#     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

# sorted(zip(map(lambda x: round(x,4), model.steps[1][1].feature_importances_),

important_features = pd.Series(data=importances, index=feature_names)
important_features.sort_values(ascending=False, inplace=True)
important_features.nlargest(12).plot(kind='barh')
plt.title("Top important features")
plt.savefig('imgs/feat_import.png')

# plt.figure()
# plt.title("Feature importances")
# plt.bar(range(12), importances[indices],
#         color="r", yerr=std[indices], align="center")
# plt.xticks(range(12), indices)
import hydro_serving_grpc as hs
import numpy as np
from joblib import load

clf = load('/model/files/random-forest-adult.joblib')

features = [
    'age', 'workclass', 'education', 'marital_status', 'occupation',
    'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
    'hours_per_week', 'country'
]


def extract_value(proto):
    return np.array(proto.int64_val, dtype='int64')[0]


def predict(**kwargs):
    extracted = np.array(
        [extract_value(kwargs[feature]) for feature in features])
    transformed = np.dstack(extracted).reshape(1, len(features))
    predicted = clf.predict(transformed)

    response = hs.TensorProto(int64_val=[predicted.item()],
                              dtype=hs.DT_INT64,
                              tensor_shape=hs.TensorShapeProto())

    return hs.PredictResponse(outputs={"classes": response})
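# Hypothetical local smoke test (not part of the serving runtime): build one
# int64 TensorProto per expected feature, mirroring the response construction above.
# sample = {name: hs.TensorProto(int64_val=[1], dtype=hs.DT_INT64) for name in features}
# print(predict(**sample))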
    path = input('Please enter the path of email you want to test: ')
    fea = []
    f = open(path, 'r').read()
    contents = processing(f)
    word_indices = create_feature(contents)
    fea.append(word_indices)
    dataframe = pd.DataFrame(fea)
    # store the DataFrame as a .csv; index=False drops the row labels, sep=',' uses a comma delimiter
    dataframe.to_csv("test_sample.csv", index=False, sep=',')
    # read the test sample back in
    df = pd.read_csv("test_sample.csv")  # path to the test sample
    data_set = df.values
    x_test = data_set[:, 1:len(word_indices)]  # features

    predicts = []
    bnb = joblib.load("BNB_model.m")
    predicts.extend(bnb.predict(x_test))

    svm = joblib.load("SVM_model.m")
    predicts.extend(svm.predict(x_test))

    mlp = joblib.load("MLP_model.m")
    predicts.extend(mlp.predict(x_test))

    dt = joblib.load("DT_model.m")
    predicts.extend(dt.predict(x_test))

    knn = joblib.load("KNN_model.m")
    predicts.extend(knn.predict(x_test))
    # combine the classifiers' outputs and take the most frequent prediction (majority vote)
    print(predicts)
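    # One plausible way to realise the majority vote described in the comment
    # above (a sketch, not the original code):
    majority = max(set(predicts), key=predicts.count)
    print('Ensemble prediction:', majority)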
Exemple #54
0
# 32: base forms of verbs
import joblib
a = joblib.load("100knock_30")
base = []
for sentence in a:
    for morph in sentence:
        if morph['pos'] == '動詞':  # '動詞' = verb
            base.append(morph['base'])
print(base)
Exemple #55
0
        'output': {
            'probabilities': '[float]',
            'prediction': 'Iris Setosa, Iris Versicolour, Iris Virginica'
        }
    }
})

modelFilePath = 'models/iris-svc.joblib'
from joblib import dump

dump(toBePersisted, modelFilePath)

# Testing deserialized model

from joblib import load

dictionary = load(modelFilePath)
loaded_model = dictionary['model']

#5.1,3.5,1.4,0.2,Iris-setosa
prediction = loaded_model.predict([[5.1, 3.5, 1.4, 0.2]])
print("prediction with serialized model: " + str(prediction) + " expect [0]")

#6.0,3.4,4.5,1.6,Iris-versicolor
prediction = loaded_model.predict([[6.0, 3.4, 4.5, 1.6]])
print("prediction with serialized model: " + str(prediction) + " expect [1]")

#6.3,2.5,5.0,1.9,Iris-virginica
prediction = loaded_model.predict([[6.3, 2.5, 5.0, 1.9]])
print("prediction with serialized model: " + str(prediction) + " expect [2]")
Exemple #56
0
import joblib
import numpy as np
import shap
from numeral import int2roman
from oximachinerunner import OximachineRunner

from .utils import generate_csd_link  # pylint:disable=relative-beyond-top-level
from .utils import (
    load_pickle as read_pickle,  # pylint:disable=relative-beyond-top-level
)

RUNNER = OximachineRunner("mof")
THIS_DIR = os.path.dirname(os.path.realpath(__file__))

EXPLAINER = joblib.load(os.path.join(THIS_DIR, "explainer.joblib"))
KDTREE = joblib.load(os.path.join(THIS_DIR, "kd_tree.joblib"))
NAMES = np.array(read_pickle(os.path.join(THIS_DIR, "names.pkl")))

warnings.simplefilter("ignore")

log = logging.getLogger("shap")  # pylint:disable=invalid-name
log.setLevel(logging.ERROR)

# adjust these features according to model
METAL_CENTER_FEATURES = [
    "column",
    "row",
    "valenceelectrons",
    "diffto18electrons",
    "sunfilled",
import joblib
import os

classifier = joblib.load('medical_appointment.joblib')

#['idade', 'auxilio_bolsa_familia', 'hipertensao', 'diabetes',
# 'alcolismo', 'deficienca', 'sms_recebido', 'dias_para_consulta',
# 'genero_M']
instance=[
    [62, 0, 1, 0, 0, 0, 0, -1, 0],
    [23, 0, 0, 0, 0, 0, 0, 2, 0],
    [60, 1, 1, 1, 1, 1, 1, 5, 0],
    [50, 0, 0, 0, 0, 0, 1, 80, 0]
]
print(classifier.predict(instance))
Exemple #58
0
from flask import Flask, jsonify, request
import spacy
import joblib
import re

app = Flask(__name__)

nlp = spacy.load('en_core_web_sm')
#nlp = joblib.load('nlp_pipeline.sav')
ot_classifier = joblib.load('ot_classifier.sav')  # Classifier model
transformer = joblib.load('tfidf_transformer.sav')  # TF-IDF model


def predict_tweet(tweet):
    x = re.sub(r'http\S+', '', tweet)  # remove URLs
    x = ' '.join(
        re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ",
               x).split())  # remove special characters and extra spaces

    tweet = nlp(x)  # add the text to spacy pipeline
    # clean the text: remove stopwords, punctuation and digits, lemmatise the tokens and lowercase them
    tweet = ' '.join([
        token.lemma_.lower() for token in tweet
        if not token.is_stop and not token.is_punct
        and not token.text.isdigit() and len(token.text) > 2
    ])

    # Predictions
    # transform the cleaned text with the TF-IDF model, then run the classifier
    result = ot_classifier.predict(transformer.transform([tweet]))
    # convert the result into readable classes
Exemple #59
0
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
YOLOSIZE = (512, 512)
FRAMESIZE = (height, width)

tracker = Tracker(xs=xs,
                  ys=ys,
                  sigma_h=0.9,
                  sigma_iou=0.7,
                  metric=args.metric,
                  t_min=10,
                  params_file=PARAMFILE,
                  frameRate=fps)

all_detections = load(f"results/detections/video{args.video}")

i = 0

for frame_detections in all_detections:

    ret, frame = cap.read()
    img_in = cv2.resize(frame, YOLOSIZE)
    img_in = cv2.cvtColor(img_in, cv2.COLOR_BGR2RGB)
    bboxes, scores, classes, num_dets = frame_detections
    #print(bboxes)
    image = draw_bbox(frame, frame_detections)
    if len(frame_detections[0]) > 0:
        tracker.update(bboxes, i)
        image = tracker.write_velocities(image)
Exemple #60
0
import logging
import random

from fastapi import APIRouter
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
import pandas as pd
from pydantic import BaseModel, Field, validator

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
import json

from joblib import load
model=load('knn_final.joblib')
df = pd.read_csv("https://raw.githubusercontent.com/BW-pilot/MachineLearning/master/spotify_final.csv")
spotify = df.drop(columns = ['track_id'])
scaler = StandardScaler()
spotify_scaled = scaler.fit_transform(spotify)

log = logging.getLogger(__name__)
router = APIRouter()

def knn_predictor(audio_feats, k=20):
    """
    differences_df = knn_predictor(audio_features)
    """
    audio_feats_scaled = scaler.transform([audio_feats])

    ##Nearest Neighbors model