def main():
    df_train: pd.DataFrame = load_train_data()
    df_eval: pd.DataFrame = load_eval_data()
    df_train = preprocessing(df_train)
    df_eval = preprocessing(df_eval)

    X_train = df_train
    y_train = df_train["score"]
    X_eval = df_eval
    y_eval = df_eval["score"]

    model = Pipeline([
        ("features", ColumnTransformer([
            ("target_enc", ce.TargetEncoder(cols=["reviewerID", "asin"]), ["reviewerID", "asin"]),
        ])),
        ("model", SGDClassifier(loss='modified_huber')),
    ])
    # model = ce.TargetEncoder(cols=["reviewerID"])
    model.fit(X_train, y_train)
    print("Target encoding eval acc:", np.mean(model.predict(X_eval) == y_eval))
    print("Target encoding train acc:", np.mean(model.predict(X_train) == y_train))
    with open("models/target_encoding.pickle", "wb") as file:
        pickle.dump(model, file)
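# --- Hedged usage sketch (not from the project above) -----------------------------------
# A minimal, self-contained example of the same TargetEncoder-in-ColumnTransformer
# pattern on toy data, so the pipeline's behaviour can be checked in isolation.
# The toy DataFrame and its values are illustrative only.
import pandas as pd
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

toy = pd.DataFrame({
    "reviewerID": ["a", "a", "b", "b", "c", "c"],
    "asin": ["x", "y", "x", "y", "x", "y"],
    "score": [1, 0, 1, 1, 0, 0],
})
pipe = Pipeline([
    ("features", ColumnTransformer([
        ("target_enc", ce.TargetEncoder(cols=["reviewerID", "asin"]), ["reviewerID", "asin"]),
    ])),
    ("model", SGDClassifier(loss="modified_huber")),
])
pipe.fit(toy, toy["score"])  # columns not listed in the ColumnTransformer are dropped
print(pipe.predict(toy))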
def brain_extraction(args):
    print 'Starting pre-processing'
    preprocessing(args)
    print 'Starting Decomposition/Registration'
    main(args)
    print 'Starting post-processing'
    postprocessing(args)
    return
def __getData(self):
    for i in xrange(self.__classes):
        for j in xrange(self.__train_samples):
            # Load file (TODO: rewrite this loading logic)
            if j < 10:
                _file = self.__file_path + str(i) + "/" + str(i) + "0" + str(j) + ".pbm"
            else:
                _file = self.__file_path + str(i) + "/" + str(i) + str(j) + ".pbm"
            src_image = imread(_file, 0)

            # process file
            prs_image = preprocessing(src_image, self.__size, self.__size)

            # Set class label
            row = GetRow(self.__trainClasses, i * self.__train_samples + j)
            Set(row, i)

            # Set data
            row = GetRow(self.__trainData, i * self.__train_samples + j)
            img = CreateImage((self.__size, self.__size), IPL_DEPTH_32F, 1)
            # convert 8-bit image to 32-bit float image
            ConvertScale(fromarray(prs_image), img, scale=(1.0 / 255))
            data = GetSubRect(img, (0, 0, self.__size, self.__size))
            # convert the size x size data matrix to a row vector
            row1 = Reshape(data, 0, 1)
            Copy(row1, row)
def get_paragraphs(files_list, mystem, del_stopwords=False):
    file_text = {}
    data = []
    for i, file in enumerate(files_list):
        if file.endswith('.txt'):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
            file_text[file] = text
        else:
            text = file
            file = i
        paragraphs = splitter(text, 1)
        for paragraph in paragraphs:
            paragraph_lemmatized = preprocessing(paragraph, del_stopwords)
            data.append({'file': file, 'paragraph': paragraph_lemmatized})
    if file_text:
        with open('file_text', 'w', encoding='utf-8') as fw:
            json.dump(file_text, fw)
        return data, file_text
    else:
        return data
def predict():
    # Error checking
    data_csv = request.files['inputFile']
    if not data_csv:
        return "No file"
    DQ = pd.read_csv(data_csv)
    # print(DQ)
    patientids, data = preprocessing(DQ)
    print(type(data))

    # Convert the preprocessed data to a numpy array
    # predict_request = [data['FIRSTNAME'], data['LASTNAME'], data['GENDERCODE'], data['DATEOFBIRTH'], data['ETHNICITYCODE'], data['RACECODE'], data['MARITALSTATUS']]
    predict_request = np.array(data)
    # predict_request.astype(float)
    print(predict_request)

    # Predict using the XGBoost model
    y = xgb_model.predict(predict_request)
    y = np.array(y).reshape((y.shape[0], 1))
    patientids = np.array(patientids).reshape((patientids.shape[0], 1))
    print(y)
    print(patientids.shape)
    y = pd.DataFrame(y)
    patientids = pd.DataFrame(patientids)

    # Return prediction
    # output = y
    # d = {'col1': PATIENTID, 'col2': PREDICTIONS}
    # result = pd.DataFrame(data=d, index=index)
    results = pd.concat([patientids.reset_index(drop=True), y], axis=1)
    results.columns = ['Patient_id', 'Predictions']
    # results = pd.DataFrame.to_json(results)
    # print(result)
    return render_template("score.html", score=results.to_html())
def trainingModel(self):
    self.registerWorking.finishThread.emit()
    state = 0
    while True:
        if state == 0:
            # Pre-process
            obj = preprocessing(self.input_datadir, self.output_datadir)
            nrof_images_total, nrof_successfully_aligned = obj.alignProcessing()
            print('Total number of images: %d' % nrof_images_total)
            print('Number of successfully aligned images: %d' % nrof_successfully_aligned)
            state += 1
        # Classifier
        elif state == 1:
            print("Training Start")
            objModel = classifier(
                mode='TRAIN',
                datadir=self.datadir,
                modeldir=self.modeldir,
                classifierFilename=self.classifier_filename)
            get_file = objModel.main()
            sys.exit("All Done")
            state += 1
        else:
            break
def __getitem__(self, item):
    data = preprocessing(
        self.dataset[item][0],
        self.dataset[item][1],
        self.dataset[item][2],
        self.dataset[item][3],
        self.dataset[item][4],
        self.dataset[item][5]
    )

    temp = []
    for i in data["targets_class"]:
        temp.append(torch.tensor(i, dtype=torch.long))

    # Return the processed data where the lists are converted to `torch.tensor`s
    return {
        'ids': torch.tensor(data["ids"], dtype=torch.long),
        'mask': torch.tensor(data["mask"], dtype=torch.long),
        'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
        'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
        'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
        'targets_class': torch.tensor(data["targets_class"], dtype=torch.long),
        'orig_tweet': data["orig_text"],
        'orig_selected': data["orig_keyword"],
        'sentiment': data["class"],
        'offsets': torch.tensor(data["offsets"], dtype=torch.long),
        'error_index': torch.tensor(data["error_index"], dtype=torch.long)
    }
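# --- Minimal sketch of the Dataset/DataLoader pattern used above ------------------------
# Toy fields only; the real preprocessing() output has many more keys. This shows how a
# __getitem__ returning a dict of tensors is batched by the default collate function.
import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    def __init__(self, rows):
        self.rows = rows

    def __len__(self):
        return len(self.rows)

    def __getitem__(self, item):
        ids, label = self.rows[item]
        return {"ids": torch.tensor(ids, dtype=torch.long),
                "label": torch.tensor(label, dtype=torch.long)}

loader = DataLoader(ToyDataset([([1, 2, 3], 0), ([4, 5, 6], 1)]), batch_size=2)
for batch in loader:
    print(batch["ids"].shape, batch["label"])  # torch.Size([2, 3]) tensor([0, 1])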
def main():
    for column in ["summary", "reviewText"]:
        df_train: pd.DataFrame = load_train_data()
        df_eval: pd.DataFrame = load_eval_data()
        df_train = df_train[:50000]
        df_train = preprocessing(df_train)
        df_eval = preprocessing(df_eval)

        X_train = df_train
        y_train = df_train["score"]
        X_eval = df_eval
        y_eval = df_eval["score"]

        model = Pipeline([
            ("tfidf", ColumnTransformer([
                ("tfidf", TfidfVectorizer(min_df=10, max_df=0.3, preprocessor=preprocess,
                                          tokenizer=tokenize, ngram_range=(1, 2),
                                          stop_words='english'), column)
            ])),
            ("SVD", TruncatedSVD(n_components=500)),
            ("forest", RandomForestClassifier(n_estimators=10, random_state=0, max_depth=4)),
        ])

        param_grid = {
            'SVD__n_components': [500],
            'forest__n_estimators': [100, 500, 1000],
            'forest__max_depth': [4, 8],
        }
        # print(model.fit_transform(X_train[:100], y_train[:100]))
        tscv = TimeSeriesSplit(n_splits=2)
        search = GridSearchCV(model, param_grid, n_jobs=12, cv=tscv.split(X_train), verbose=True)
        search.fit(X_train, y_train)
        print("Best parameter (CV score=%0.3f):" % search.best_score_)
        print(search.best_params_)

        model = search.best_estimator_
        print(model[1].explained_variance_ratio_)
        print(f"Tfidf random forest column {column} eval acc:", np.mean(model.predict(X_eval) == y_eval))
        print(f"Tfidf random forest column {column} train acc:", np.mean(model.predict(X_train) == y_train))
        with open(f"models/random_forest_{column}.pickle", "wb") as file:
            pickle.dump(model, file)
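# --- Hedged sketch (toy data) ------------------------------------------------------------
# One detail the pipeline above depends on: when a TfidfVectorizer sits inside a
# ColumnTransformer, the column must be given as a plain string (not a list) so the
# vectorizer receives a 1-D series of documents. The toy DataFrame is illustrative only.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

toy = pd.DataFrame({"summary": ["great product", "bad product", "great value"]})
ct = ColumnTransformer([("tfidf", TfidfVectorizer(), "summary")])
print(ct.fit_transform(toy).shape)  # (3, number_of_terms)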
def preprocessing(self, ponct=0, spell=0, predict=0, stop=0, lem=0):
    tokens = []
    for i in range(len(self.dyads_index)):
        index = self.dyads_index[i]
        tokens = tokens + preprocessing(self.utterances[index], self.labels[index],
                                        ponct, spell, predict, stop, lem)
    self.tokens = tokens
class CombineClassifiers():

    def __init__(self, classifiers, nsamples=24, global_stats=False):
        """
        Initialize with a list of tuples where the first element is a classifier
        and the second is the name of the classifier.
        nsamples is a constant or a list; if a list, each element corresponds to the
        number of samples for the corresponding classifier.
        """
        self.classifiers = {}
        self.nsamples = {}
        self.nclassif = 0
        self.global_stats = global_stats
        for classifier, name in classifiers:
            self.classifiers[name] = classifier
            if type(nsamples) == int:
                self.nsamples[name] = nsamples
            elif len(nsamples) == len(classifiers):
                self.nsamples[name] = nsamples[self.nclassif]
            else:
                raise ValueError('nsamples has to be either an int or a list of the same length as classifiers')
            self.nclassif += 1

    def fit(self, X, y):
        """
        Provide X and y in the form of X, y = load_train();
        fit all the classifiers individually here.
        """
        for k in self.classifiers.keys():
            X_train, y_train = preprocessing(X, y, sampling_rate=1440 // self.nsamples[k])
            if 'RNN' in k:
                X_train = X_train.reshape(X_train.shape[0], -1, self.nsamples[k])
            self.classifiers[k].fit(X_train, y_train)

    def predict(self, X):
        """
        Predict the classes for X.
        """
        preds = np.zeros((self.nclassif, X.shape[0]))
        for i, k in enumerate(self.classifiers.keys()):
            # assumes preprocessing() accepts y=None when only the features are needed
            X_test, _ = preprocessing(X, None, sampling_rate=1440 // self.nsamples[k])
            if 'RNN' in k:
                X_test = X_test.reshape(X_test.shape[0], -1, self.nsamples[k])
            preds[i] = self.classifiers[k].predict_proba(X_test)
def test(line):
    line = preprocessing(line)
    tokens = tokenize(line)
    preferentially_evaluated_token = prioritizeParentheses(tokens)
    actualAnswer = evaluateAll(preferentially_evaluated_token)
    expectedAnswer = eval(line)
    if abs(actualAnswer - expectedAnswer) < 1e-8:
        print("PASS! (%s = %f)" % (line, expectedAnswer))
    else:
        print("FAIL! (%s should be %f but was %f)" % (line, expectedAnswer, actualAnswer))
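# --- Usage sketch ------------------------------------------------------------------------
# How the test() helper above might be exercised; assumes preprocessing, tokenize,
# prioritizeParentheses and evaluateAll are defined in the same module. The expressions
# are arbitrary examples.
if __name__ == "__main__":
    for expr in ["1 + 2 * 3", "(3.0 + 4) * (2 - 1)", "10 / 4"]:
        test(expr)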
def search(query, inverted_index, data, document_length, n_results=5):
    query = preprocessing(query, del_stopwords=False)
    search_result = get_search_result(query, inverted_index, data['corpus'],
                                      document_length, n_results)
    results = [(data.loc[index, 'url'], data.loc[index, 'corpus']) for index in search_result]
    return results
def __init__(self, size, makeData, noComp):
    self.pca = eigenHands(size)
    self.gabor = gaborFilters(False, size)
    self.classify = classifyHands(False, size)
    self.prep = preprocessing(size, noComp)
    if makeData == True:
        self.pca.makeMatrix("garb")
        self.pca.makeMatrix("hands")
        self.pca.makeMatrix("rock")
        self.pca.makeMatrix("paper")
        self.pca.makeMatrix("scissors")
def __init__(self, size, makeData, noComp):
    self.pca = eigenHands(size)
    self.gabor = gaborFilters(False, size)
    self.classify = classifyHands(False, size)
    self.prep = preprocessing(size, noComp)
    if makeData == True:
        self.pca.makeMatrix("garb")
        self.pca.makeMatrix("hands")
        self.pca.makeMatrix("rock")
        self.pca.makeMatrix("paper")
        self.pca.makeMatrix("scissors")
def fit(self, X, y):
    """
    Provide X and y in the form of X, y = load_train();
    fit all the classifiers individually here.
    """
    for k in self.classifiers.keys():
        X_train, y_train = preprocessing(X, y, sampling_rate=1440 // self.nsamples[k])
        if 'RNN' in k:
            X_train = X_train.reshape(X_train.shape[0], -1, self.nsamples[k])
        self.classifiers[k].fit(X_train, y_train)
def top():
    # Normalize full-width characters to half-width and strip whitespace
    line = preprocessing(request.args.get('line', ''))
    # Guard against invalid input (consecutive symbols, anything other than digits and operators, etc.)
    if validator(line) == True:
        tokens = tokenize(line)
        preferentially_evaluated_token = prioritizeParentheses(tokens)
        answer = evaluateAll(preferentially_evaluated_token)
    else:
        answer = validator(line)
    return render_template('top.html', ans=answer)
def main():
    df_train: pd.DataFrame = load_train_data()
    df_eval: pd.DataFrame = load_eval_data()
    df_train = preprocessing(df_train)
    df_eval = preprocessing(df_eval)

    X_train = [0] * len(df_train)
    y_train = df_train["score"]
    X_eval = [0] * len(df_eval)
    y_eval = df_eval["score"]

    model = DummyClassifier(strategy="most_frequent")
    model.fit(X_train, y_train)
    print("Baseline eval acc:", np.mean(model.predict(X_eval) == y_eval))
    print("Baseline train acc:", np.mean(model.predict(X_train) == y_train))
    with open("models/baseline.pickle", "wb") as file:
        pickle.dump(model, file)
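# --- Hedged sketch (toy data) ------------------------------------------------------------
# What the DummyClassifier baseline above does: with strategy="most_frequent" it always
# predicts the majority class seen during fit, so the feature values are irrelevant.
from sklearn.dummy import DummyClassifier

clf = DummyClassifier(strategy="most_frequent")
clf.fit([[0]] * 4, [1, 1, 1, 0])
print(clf.predict([[0], [0]]))  # -> [1 1]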
def search(query, search_method, inverted_index, data, document_length,
           w2v_model, w2v_base, d2v_model, d2v_base, n_results=5):
    query = preprocessing(query, del_stopwords=False)
    if search_method == ['inverted_index', 'word2vec', 'doc2vec']:
        res_inv_ind = get_search_result(query, inverted_index, data['corpus'],
                                        document_length, n_results * 50, return_sim=True)
        res_w2v = search_w2v(query, w2v_model, w2v_base, n_results * 50, return_sim=True)
        res_d2v = search_d2v(query, d2v_model, d2v_base, n_results * 50, return_sim=True)
        combination = res_inv_ind + res_w2v + res_d2v
        search_result = [index for index, _ in
                         sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]
    elif search_method == ['inverted_index', 'word2vec']:
        res_inv_ind = get_search_result(query, inverted_index, data['corpus'],
                                        document_length, n_results * 50, return_sim=True)
        res_w2v = search_w2v(query, w2v_model, w2v_base, n_results * 50, return_sim=True)
        combination = res_inv_ind + res_w2v
        search_result = [index for index, _ in
                         sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]
    elif search_method == ['inverted_index', 'doc2vec']:
        res_inv_ind = get_search_result(query, inverted_index, data['corpus'],
                                        document_length, n_results * 50, return_sim=True)
        res_d2v = search_d2v(query, d2v_model, d2v_base, n_results * 50, return_sim=True)
        combination = res_inv_ind + res_d2v
        search_result = [index for index, _ in
                         sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]
    elif search_method == ['word2vec', 'doc2vec']:
        res_w2v = search_w2v(query, w2v_model, w2v_base, n_results * 50, return_sim=True)
        res_d2v = search_d2v(query, d2v_model, d2v_base, n_results * 50, return_sim=True)
        combination = res_w2v + res_d2v
        search_result = [index for index, _ in
                         sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]
    elif search_method == ['inverted_index']:
        search_result = get_search_result(query, inverted_index, data['corpus'],
                                          document_length, n_results)
    elif search_method == ['word2vec']:
        search_result = search_w2v(query, w2v_model, w2v_base, n_results)
    elif search_method == ['doc2vec']:
        search_result = search_d2v(query, d2v_model, d2v_base, n_results)
    else:
        raise TypeError('unsupported search method')
    results = [(data.loc[index, 'url'], data.loc[index, 'corpus']) for index in search_result]
    return results
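# --- Toy sketch of the score merge used above --------------------------------------------
# Each retrieval method returns (doc_index, similarity) pairs; the combined ranking simply
# concatenates the lists and keeps the n best scores. The values below are made up.
res_a = [(0, 0.9), (2, 0.4)]
res_b = [(1, 0.7), (0, 0.2)]
top = [idx for idx, _ in sorted(res_a + res_b, key=lambda x: x[1], reverse=True)[:3]]
print(top)  # [0, 1, 2]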
def process_image(inputfolder, fn, f):
    cutted, ref_lines, lines_spacing = preprocessing(inputfolder, fn, f)
    last_acc = ''
    last_num = ''
    height_before = 0
    if len(cutted) > 1:
        f.write('{\n')
    for it in range(len(cutted)):
        f.write('[')
        is_started = False
        symbols_boundaries = segmentation(height_before, cutted[it])
        symbols_boundaries.sort(key=lambda x: (x[0], x[1]))
        for boundary in symbols_boundaries:
            label, cutted_boundaries = get_label_cutted_boundaries(
                boundary, height_before, cutted[it])
            if label == 'clef':
                is_started = True
            for cutted_boundary in cutted_boundaries:
                _, y1, _, y2 = cutted_boundary
                if is_started == True and label != 'barline' and label != 'clef':
                    text = text_operation(label, ref_lines[it], lines_spacing[it], y1, y2)
                    if (label == 't_2' or label == 't_4') and last_num == '':
                        last_num = text
                    elif label in accidentals:
                        last_acc = text
                    else:
                        if last_acc != '':
                            text = text[0] + last_acc + text[1:]
                            last_acc = ''
                        if last_num != '':
                            text = f'\meter<"{text}/{last_num}">'
                            last_num = ''
                        not_dot = label != 'dot'
                        f.write(not_dot * ' ' + text)
        height_before += cutted[it].shape[0]
        f.write(' ]\n')
    if len(cutted) > 1:
        f.write('}')
def search_w2v(query, model, w2v_base, n_results):
    query_vec = get_w2v_vectors(model, preprocessing(query))
    similarities = {}
    for doc in w2v_base:
        sim = similarity(query_vec, doc['vec'])
        # print(query_vec)
        similarities[sim] = doc['index']
    results = [
        re.split('/Friends - season [0-9]/Friends - ',
                 similarities[sim].strip('.ru.txt'))[1]
        for sim in sorted(similarities, reverse=True)[:n_results]
    ]
    return results
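# --- Hedged sketch (assumption, not the project's actual helper) --------------------------
# search_w2v() above depends on a similarity() helper; cosine similarity is the usual
# choice for word2vec document vectors, so this stand-in illustrates that behaviour.
import numpy as np

def cosine_similarity(a, b):
    # cosine of the angle between two dense vectors
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

print(cosine_similarity([1, 0, 1], [1, 1, 0]))  # 0.5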
def test(self):
    error = 0
    testCount = 0
    for i in xrange(self.__classes):
        for j in xrange(50, 50 + self.__train_samples):
            _file = self.__file_path + str(i) + "/" + str(i) + str(j) + ".pbm"
            src_image = imread(_file, 0)

            # process file
            prs_image = preprocessing(src_image, self.__size, self.__size)
            prs_np = prs_image
            r = self.classify(prs_np, 0)
            if int(r) != i:
                error += 1
            testCount += 1
    totalerror = 100 * error / float(testCount)
    print "System Error: " + str(totalerror)
def prediction(x, clf, multilabel_binarizer, vec):
    processed_text = preprocessing(x)
    data = vec.transform([processed_text])
    ops = clf.predict(data)
    labels = multilabel_binarizer.inverse_transform(ops)
    ops_prob = clf.predict_proba(data) * ops
    labels_prob = multilabel_binarizer.classes_[ops_prob[0] > 0]
    ops_list = ops_prob[ops_prob != 0]
    cat = decode_cat(labels)
    '''
    return data -> {categories: [...], root_cause: [...], proba: [...]}
    '''
    if len(cat[0]) > 0:
        categories = ""
        for i in cat[0][:]:
            categories += i + ","
        ops_list *= 100
        data = {
            'cat': [categories],
            'root_causes': labels,
            'proba': ops_list.tolist()
        }
        return data
        # for i, j in enumerate(labels[0]):
        #     st.write("### " + j.strip() + ' proba ' + str(ops_list[i] * 100))
    else:
        return "no root cause detected, please enter a valid input"
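# --- Self-contained sketch (toy labels) ---------------------------------------------------
# prediction() above assumes a fitted MultiLabelBinarizer; this shows the round-trip it
# relies on: binary indicator rows in, tuples of label names out. The label names are
# illustrative only.
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
Y = mlb.fit_transform([("leak", "sensor"), ("sensor",), ("valve",)])
print(mlb.classes_)                  # ['leak' 'sensor' 'valve']
print(mlb.inverse_transform(Y[:1]))  # [('leak', 'sensor')]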
def __init__(self, data_path='./data_bci', train=True, one_khz=False, filter=False,
             robust_scaler=False, num_samples=20, shift=10, force_cpu=False):
    # Load data
    self.input, self.target = dlc_bci.load(root=data_path, one_khz=one_khz, train=train)
    self.train = train
    self.force_cpu = force_cpu
    print('Input data loaded (size = {})'.format(self.input.shape))
    print('Target data loaded (size = {})'.format(self.target.shape))

    # Filtering
    if filter:
        if one_khz:
            fs = 1000
        else:
            fs = 100
        self.input = preprocessing(self.input, ignore_outliers=robust_scaler, fs=fs)

    if torch.cuda.is_available() and not force_cpu:
        self.input = self.input.cuda()
def test_main(checkpoint, data, batch_size, num_workers, num_classes,
              inner_emotion=-1, is_test=False):
    # note: the flag is named is_test so it does not shadow the test() evaluation helper called below
    max_len = 64
    model = BERTClassifier(num_classes=num_classes).build_model()
    model.load_state_dict(checkpoint)
    device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    eval_dtls = preprocessing(json2csv(data, test=is_test),
                              inner_emotion=inner_emotion, test=is_test)
    data_test = test_loader(eval_dtls, max_len, batch_size, num_workers)
    result_df = test(data_test, model, device)
    return result_df
def digit_process():
    if request.method == "POST":
        img = request.get_json()
        img = preprocessing(img)
        save_path = 'model/params.pkl'
        params, cost = pickle.load(open(save_path, 'rb'))
        [f1, f2, w3, w4, b1, b2, b3, b4] = params
        digit, probability = predict(img, params)
        # print(digit, "%0.2f" % probability)
        # l = int(digit)
        # p = float(probability)
        data = {
            "digit": int(digit),
            "probability": float(np.round(probability, 3))
        }
        data_json = json.dumps(data)
        return jsonify(data_json)
    print("done")
def save_w2v_base(files_list, model, mystem, save=True, title='w2v_base'):
    """Index the whole collection for word2vec search."""
    documents_info = []
    for i, file in tqdm_notebook(enumerate(files_list)):
        if file.endswith('.txt'):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
        else:
            text = file
            file = i
        lemmas = preprocessing(text)
        vec = get_w2v_vectors(model, lemmas)
        file_info = {'file': file, 'word2vec': vec}
        documents_info.append(file_info)
    if save:
        with open(title + '.pkl', 'wb') as fw:
            pickle.dump(documents_info, fw)
    return documents_info
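# --- Hedged sketch (assumption, not the project's actual helper) --------------------------
# save_w2v_base() above relies on a get_w2v_vectors() helper; a common implementation
# averages the vectors of the in-vocabulary lemmas. The version below is an illustrative
# guess at that behaviour, not the real function.
import numpy as np

def get_w2v_vectors_sketch(model, lemmas):
    # average word vectors for lemmas present in the model's vocabulary
    vecs = [model.wv[w] for w in lemmas if w in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

# usage (assumes gensim >= 4):
# from gensim.models import Word2Vec
# toy = Word2Vec([["кот", "сидит"], ["пёс", "бежит"]], vector_size=8, min_count=1)
# print(get_w2v_vectors_sketch(toy, ["кот", "собака"]).shape)  # (8,)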
def search_inv_index(query, inverted_index, term_doc_matrix, files_length, n_results) -> list:
    """
    Compute sim score between search query and all documents in collection
    :param query: input text
    :return: list of doc_ids
    """
    relevance_dict = defaultdict(float)
    lemmas = preprocessing(query)
    for lemma in lemmas:
        sims = compute_sim(lemma, inverted_index, term_doc_matrix, files_length)
        for doc in sims:
            relevance_dict[doc] += sims[doc]
    result = sorted(relevance_dict, key=relevance_dict.get, reverse=True)[:n_results]
    return [
        re.split('/Friends - season [0-9]/Friends - ',
                 files_list[doc].strip('.ru.txt'))[1]
        for doc in result
    ]
def classify(self, image_source, show_result):
    nearest = CreateMat(1, self.__K, CV_32FC1)

    # process file
    prs_image = preprocessing(image_source, self.__size, self.__size)

    # Set data
    img32 = CreateImage((self.__size, self.__size), IPL_DEPTH_32F, 1)
    ConvertScale(fromarray(prs_image), img32, scale=(1.0 / 255))
    data = GetSubRect(img32, (0, 0, self.__size, self.__size))
    row1 = Reshape(data, 0, 1)
    row1np = np.array(row1)

    retval, result, nearest, dists = self.__knn.find_nearest(row1np, self.__K)

    accuracy = 0
    for i in xrange(self.__K):
        if nearest[0][i] == result[0][0]:
            accuracy += 1
    pre = 100 * accuracy / float(self.__K)
    if show_result == 1:
        print "|\t" + str(result[0][0]) + "\t| \t" + str(pre) + " \t| \t" + str(accuracy) + " of " + str(self.__K) + " \t|"
        print " ---------------------------------------------------------------"
    return result
    tweet_set[len(tweet_set) - 1].tweetId = tweet['_id']
    dic_user[userId].addTweet(tweet_set[len(tweet_set) - 1])
    c += 1
    if c == num_tweets:
        break

# for userId, user in dic_user.iteritems():
#     print(str(userId) + " " + str(len(user.tweet_set)))

k_topics = num_topics
LDA_iterations = num_iterations

sentimentPoints = getSentimentPoints()
# print(sentimentPoints)

dictionary, corpus, out_set = preprocessing(doc_set)
for i in range(0, len(out_set)):
    tweet_set[i].wordSet = out_set[i]

sentimentsOfTweets = getSentimentScoreOfTweets(out_set)
model = LDA(dictionary, corpus, k_topics, LDA_iterations)

for i in range(0, len(sentimentsOfTweets)):
    tweet_set[i].russell_tuple = sentimentsOfTweets[i]

sentDic = loadDict()
dictByTopic = []
tempDic = {}
topics = model.get_topics()
# import important packages
from imports import *
from logs import log
from preprocessing import *

# set a logger file
logger = log(path="logs/", file="cross_val.logs")

# load dataset
data = pd.read_csv("data/loans_data.csv")

# preprocessing the loan data
data = preprocessing(data)

# split data into features and target
X = data.drop('Loan_Status', axis=1)
y = data.Loan_Status

# create a dictionary of classifiers
models = {
    "KNN": KNeighborsClassifier(),
    "RF": RandomForestClassifier(),
    "GB": GradientBoostingClassifier(),
    "DTC": DecisionTreeClassifier(),
    "BC": BaggingClassifier(),
    "XGB": XGBClassifier(),
    "EXT": ExtraTreesClassifier(),
    "LG": LogisticRegression(),
    "BBC": BalancedBaggingClassifier(),
    "EEC": EasyEnsembleClassifier(),
}
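# --- Hedged continuation sketch ------------------------------------------------------------
# One way the models dictionary above is typically scored; this assumes loans_data.csv is
# available and that preprocessing() leaves Loan_Status numeric. The scoring metric and
# fold count are illustrative choices, not the project's actual settings.
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    print(f"{name}: {scores.mean():.3f} +/- {scores.std():.3f}")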
def register(m1, m2, m3=""):
    if m3 != "":
        if m1 not in template_constraints:
            template_constraints[m1] = set()
        template_constraints[m1].add(m3)
    else:
        if m1 not in template_constraints:
            template_constraints[m1] = set()
        template_constraints[m1].add(m2)


filename = input("Enter file name\n")
input_expressions = preprocessing(filename)

datatype_pattern = r'int | double'
for datatype in temp_datatype_table:
    datatype_pattern += ' | ' + datatype

tokens = ('OB', 'CB', 'ID', 'DT', 'INT', 'DBL', 'COMMA', 'SC', 'EQ', 'COUT', 'CIN', 'RETURN',
          'EXT', 'INS', 'ASSN_OP', 'OR', 'AND', 'S_AND', 'EQUAL', 'N_EQUAL', 'REL_OP',
          'INCR', 'DECR', 'SO', 'MUL', 'DIV', 'PLUS', 'MINUS', 'NOT', 'MOD', 'OC', 'CONST')


@lex.TOKEN(datatype_pattern)
def t_DT(t):
    return t


def t_COMMA(t):
    r','
            yhat_e,
            label='(' + str(rmse_e.round(decimals=2)) + ') Bayesian Linear Regression (r/b)')
pyplot.xlabel('Sample Index')
pyplot.ylabel('Values')
pyplot.title('Comparison of Linear Regression Model answer to 1f')
pyplot.legend(loc="best")


if __name__ == '__main__':
    # partition the data into an 80% training set and a 20% test set (24 parameters in total)
    datasets = pd.read_csv('train.csv')

    # Set seed so we get the same random allocation on each run of the code
    # np.random.seed(7)
    preprocessed = preprocessing(datasets)
    train_set_x, train_set_y, test_set_x, test_set_y = train_test_split_preprocessed(preprocessed)

    # solution 1: Normal Linear Regression (1b)
    rmse_b, yhat_b = one_b.linear_regression_1b(train_set_x, train_set_y, test_set_x, test_set_y)

    # solution 2: Regularized Linear Regression (1c)
    rmse_c, yhat_c = one_c.Regularized_LinearRegression_1c(
        train_set_x, train_set_y, test_set_x, test_set_y)

    # solution 3: Regularized Linear Regression with bias term (1d)
    rmse_d, yhat_d = one_d.Regularized_Biased_LinearRegression_1d(
        train_set_x, train_set_y, test_set_x, test_set_y)
    dir_path = sys.argv[2]

# inFile = dir_path + "dvd.xml"
# inFile = dir_path + "dvdReviews.xml"
# inFile = dir_path + "example.xml"
inFile = dir_path + "all.review"
pickelfile = dir_path + "dvd_reviews_limited.pkl"
h5_file = dir_path + "data.h5"
# pickelfile = dir_path + "example.pkl"
# pickelfile = dir_path + "dvd_reviews.pkl"
mem_file_results = dir_path + "lda_results.h5"
# inFile = sys.argv[2] + "dvd.xml"  (huge file)

if preprocess == 'True':
    reviews, w, doc_words = preprocessing(inFile)

    print "Save objects to file %s" % pickelfile
    start = timer()
    with open(pickelfile, 'wb') as f:
        pickle.dump(reviews, f)
        pickle.dump(w, f)
        print "Number of reviews : %d" % len(reviews)
        print "# of words in bag %s %s" % doc_words.shape
        # pickle.dump(doc_words, f)
    end = timer()

    h5f = h5py.File(h5_file, 'w')
    h5f.create_dataset('doc_words', data=doc_words)
    h5f.close()
    print "Saved objects to file in %s seconds." % (end - start)
else:
    with open(pickelfile, 'rb') as f:
if len(sys.argv) == 1:
    preprocess = "True"
    # dir_path = 'F:/temp/topics/D - data/movie/test/'
    product = "electronics"
    dir_path = 'S:/Workspace/data/sports/'
else:
    preprocess = sys.argv[1]
    dir_path = sys.argv[2]

mem_file_results = dir_path + "mglda_" + product + "_" + str(N_GIBBS_SAMPLING_ITERATIONS) + ".mem"

if preprocess == 'True':
    inFile = dir_path + "all.review.xml"
    reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = preprocessing(inFile)
else:
    reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = load_objects(dir_path, product)

# check_doc_word_matrix(doc_words, reviews, w)

# last parameter is the max number of sentences for the corpus
doc_sentence_count, max_number_s = count_sent_docs(reviews)

# create LDAModel object and initialize counters for Gibbs sampling
lda = LDAModel(l_bag_of_words, m_docs_sentence_words, doc_sentence_count, max_number_s,
               K_GL, K_LOC, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, dir_path)

# initialize counters
start = timer()
print "LDA initialize..."
lda.initialize()
global reviews

if len(sys.argv) == 1:
    preprocess = "True"
    # dir_path = 'F:/temp/topics/D - data/movie/test/'
    product = "electronics"
    dir_path = 'S:/Workspace/data/sports/'
else:
    preprocess = sys.argv[1]
    dir_path = sys.argv[2]

mem_file_results = dir_path + "mglda_" + product + "_" + str(N_GIBBS_SAMPLING_ITERATIONS) + ".mem"

if preprocess == 'True':
    inFile = dir_path + "all.review.xml"
    reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = preprocessing(inFile)
else:
    reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = load_objects(dir_path, product)

# check_doc_word_matrix(doc_words, reviews, w)

# last parameter is the max number of sentences for the corpus
doc_sentence_count, max_number_s = count_sent_docs(reviews)

# create LDAModel object and initialize counters for Gibbs sampling
lda = LDAModel(l_bag_of_words, m_docs_sentence_words, doc_sentence_count, max_number_s,
               K_GL, K_LOC, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.005, dir_path)

# initialize counters
start = timer()
print "LDA initialize..."
lda.initialize()
# lda.print_counts()
end = timer()
        projData = hands.justGetDataMat(datas[dataset][i], "", True)
        hands.projPCA(projData, False, "PCA/", datas[dataset][i])
# ____________________________________________________________________________________________________
elif int(choice) == 2:
    dataset = raw_input('choose the dataset (r/p/s) ...')
    datas = {'r': 'rock', 'p': 'paper', 's': 'scissors'}
    gabor = gaborFilters(buildOpt[str(build)], int(sizeImg))
    gabor.setParameters(0.4, 0.8, 20, (numpy.pi * 3.0 / 4.0), 5.0, 4.0)
    data = cv.Load("data_train/" + datas[dataset] + "Train" + str(sizeImg) + ".dat")
    gabor.convolveImg(data, True)
# ____________________________________________________________________________________________________
elif int(choice) == 3:
    aNumber = raw_input('write an unused nr/word ...')
    prep = preprocessing(int(sizeImg), 0, 0)
    prep.getHandsVideo(aNumber)
# ____________________________________________________________________________________________________
elif int(choice) == 4:
    noComp = raw_input('number of components for PCA no ...')
    dataset = raw_input('choose the dataset c => rock & paper & scissors; h => hands vs garbage ...')
    datas = {'c': ['rock', 'paper', 'scissors'], 'h': ['hands', 'garb']}
    hands = eigenHands(int(sizeImg))
    _, data, txtLabels = hands.justGetDataMat(datas[dataset][0], "", False)
    prep = preprocessing(int(sizeImg), int(noComp))
    prep.doManyGabors(data, txtLabels, dataset, False)
# ____________________________________________________________________________________________________
elif int(choice) == 5:
    noComp = raw_input('number of components for PCA ...')
print(len(example))
df = pd.DataFrame({'example': example})
df['example'] = [word_tokenize(entry) for entry in df['example']]

stopWords = set(stopwords.words('french'))
stopWords_ang = set(stopwords.words('english'))
l = ["-", "d", "co", "si"]
modification(df, "example", stopWords_ang)
modification(df, "example", stopWords)
modification(df, "example", l)

# French stemming
stem_list = stem(df, "example")
preprocessing(df, 'example', stem_list)
example = df.values.tolist()
flat_list = lambda l: [item for sublist in l for item in sublist]
example = [i.lstrip() for i in df['example']]

# Model
dtm_lsa = LSA(example)

df = pd.read_excel(path + "\input_file.xlsx")
# example_des_b = df.reset_index()['Description'].values.tolist()

# Label the raw data
df[col2] = df[col2][(~df[col2].duplicated()) | df[col2].isna()]

# df to list
example_des_b = df.reset_index()[col1].values.tolist()