def project_solution(height, length, gs_x, gs_y, u_in, cc, au, av, ap, rho, mu):
    # Get Mesh
    u_grid, v_grid, p_grid = Preprocessing.gen_mesh(height, length, gs_x, gs_y)
    # Get Initial Values
    u, v, p = Preprocessing.initial_con(u_grid[0].shape[0], u_grid[1].shape[0], u=0., v=0., p=0.)
    # Get Boundary Conditions
    bc = Preprocessing.boundary_cond('velocity', 'velocity gradient', 'no slip', 'no slip',
                                     [u_in, None, None, None])
    # Create viewers
    p_viewer = Viewer.FlowContours(p, p_grid[0], p_grid[1], [0, 0, length, height], 'Pressure')
    x_v_viewer = Viewer.FlowContours(u, u_grid[0], u_grid[1], [0, 0, length, height], 'X Velocity')
    y_v_viewer = Viewer.FlowContours(v, v_grid[0], v_grid[1], [0, 0, length, height], 'Y Velocity')
    s = Solver.Solution(p, u, v, u_grid[0], v_grid[0], v_grid[1], u_grid[1], bc, cc, au, av, ap,
                        rho, mu, p_viewer, x_v_viewer, y_v_viewer)
    p = s.p_n[s.ni/2:-1, s.nj/2]
    dp = p[-1] - p[0]
    print 'dp/dx = ' + str(dp / (gs_x * (len(p) - 1)))
    print 'max U = ' + str(s.u_n[s.ni/2:-1, s.nj/2].max())
    print 'max V = ' + str(s.v_n[s.ni/2:-1, s.nj/2].max())
    Viewer.keep_open()
def oldNewDupes(sources=glob.glob("duplicate/sources/*.txt"),
                suspectDupe=glob.glob("duplicate/duplicates/*.txt")):
    dupesDictionary = {}
    for src in sources:
        duplicates = []
        with open(src, 'r', encoding='utf-8') as doc:
            string = doc.read()
        src_tokens = nltk.word_tokenize(Preprocessing.process(string))
        for dp in suspectDupe.copy():
            if src == dp:
                continue
            with open(dp, 'r', encoding='utf-8') as doc:
                dupe = doc.read()
            dupe = nltk.word_tokenize(Preprocessing.process(dupe))
            if jaccard(set(src_tokens), set(dupe)) > 0.9:
                duplicates.append(ntpath.basename(dp))
                suspectDupe.remove(dp)
                print(ntpath.basename(src))
                print(ntpath.basename(dp))
        # dupesDictionary is always initialised above, so the original
        # try/except around this assignment was dead code.
        dupesDictionary[ntpath.basename(src)] = duplicates
    return dupesDictionary
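# Note: the snippet above relies on a jaccard() helper that is not defined here.
# A minimal sketch of the standard Jaccard similarity (|A & B| / |A | B|) it
# presumably expects -- an assumption, not the original implementation:
def jaccard(set_a, set_b):
    """Jaccard similarity of two sets; returns 0.0 when both sets are empty."""
    if not set_a and not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)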
def runEpisode(ale, agent, stepsRemaining):
    maxEpisodeDuration = 60 * 60 * 5  # Max game duration is 5 minutes, at 60 fps
    framesElapsed = 0
    totalEpisodeReward = 0
    ale_game_over = False

    screenObservation = ale.getScreenRGB()
    preprocessedObservation = Preprocessing.preprocessALEObservation(screenObservation, agent.inputHeight, agent.inputWidth)
    action = agent.startEpisode(preprocessedObservation)

    while not ale_game_over and framesElapsed < stepsRemaining and framesElapsed < maxEpisodeDuration:
        framesElapsed += 1
        reward = ale.act(action)
        totalEpisodeReward += reward
        if ale.game_over():
            ale_game_over = True
        screenObservation = ale.getScreenRGB()
        preprocessedObservation = Preprocessing.preprocessALEObservation(screenObservation, agent.inputHeight, agent.inputWidth)
        action = agent.stepEpisode(reward, preprocessedObservation)

    ale.reset_game()
    avgLoss = agent.endEpisode(0)
    return framesElapsed, totalEpisodeReward, avgLoss
def getImage():
    while True:
        if lib.ftrScanGetFrame(hDevice, pointer(pBuffer), None) == 1:
            # print("Done!\n\nWriting to file......\n")
            vect = bytearray(pBuffer.raw)
            outputIm = Image.new("RGB", (ImageSize.nWidth, ImageSize.nHeight))
            outputIm.putdata(vect)
            base_name = ('/home/samara/Documentos/TG/Amostras/Valter/Valter1'
                         + str(datetime.datetime.now()).replace(':', '_').replace('/', '_') + '.jpeg')
            outputIm.save(base_name)
            # image_64 = base64.encodestring(open(img, "rb").read())
            improveImage = Preprocessing.improveImage(base_name)
            skeletonization = Preprocessing.skeletonization(improveImage)
            createKeyPoints = Preprocessing.createKeyPoints(skeletonization)
            encryptFingerprint = Preprocessing.encryptFingerprint(createKeyPoints)
            print(encryptFingerprint)
            # teste = Preprocessing.webservice(encryptFingerprint)
            return encryptFingerprint  # the break after this return was unreachable
        else:
            PrintErrorMessage(lib.ftrScanGetLastError())
            sleep(0.2)
    print('System Terminate')
    lib.ftrScanCloseDevice(hDevice)
def predict_and_write(self, min_df, max_df, alp):
    # parameters renamed from min/max to avoid shadowing the builtins
    mnb = MultinomialNB(alpha=alp)
    train_data = Preprocessing.process_train()
    test_set = Preprocessing.process_test()
    x_test = test_set[1]
    id_data = test_set[0]
    x_all = train_data[:, 0]
    y_all = train_data[:, 1]

    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df, ngram_range=(1, 1),
                                 stop_words='english', strip_accents='ascii')
    output = vectorizer.fit_transform(x_all)
    x_all = output[:, :]
    mnb.fit(x_all, y_all)

    x_test = vectorizer.transform(x_test.ravel())[:, :]
    y_pred = mnb.predict(x_test).ravel()
    y_pred = y_pred.reshape(len(y_pred), 1)
    y_pred = np.concatenate((id_data.reshape(len(id_data), 1), y_pred), axis=1)
    first = ["Id", "Category"]
    y_pred = np.concatenate((np.array(first).reshape(1, 2), y_pred), axis=0)
    np.savetxt("prediction.csv", y_pred, fmt="%s", delimiter=",")
    CSVChange.write()
    return
def preprocess_aalto_hand_data_sequences(
        data, time_per_frame=1 / 240., max_skipped_frames=30, remove_timestamp_column=True,
        min_seq_length=240, representation=Representations.REPRESENTATION_XYZ,
        reorder_columns=True, verbose=True):
    data, step_index = preprocess_aalto_hand_data(
        data, remove_timestamp_column=False, representation=representation,
        reorder_columns=reorder_columns, verbose=verbose, returnStepIndex=True)

    data = pre.assign_sequence_ids(data, time_per_frame, max_skipped_frames)
    print '%d. Added a sequence id column: Found %d sequences based on %.6fs per frame and a maximum gap of %d skipped frames.' \
        % (step_index, data['Sequence_id'].max(), time_per_frame, max_skipped_frames)
    step_index += 1

    if min_seq_length is not None:
        data = pre.remove_short_sequences(data, min_seq_length)
        data = data.reset_index()
        num_seqs_left = len(data.groupby('Sequence_id'))
        print '%d. Removed %d short sequences (shorter than %d frames), leaving %d sequences.' \
            % (step_index, data['Sequence_id'].max() - num_seqs_left, min_seq_length, num_seqs_left)
        step_index += 1

    if remove_timestamp_column:
        data = remove_time_column(data, step_index, verbose)
        step_index += 1

    return data
def SI():
    global svc_clf, P300_clf
    ind = random.randint(0, 25)
    print("num =", ind)
    Series = np.load("../npSave/Pavarisa280219R06.npy")[ind, 0, :, :]
    print(np.asarray(Series).shape)

    bb, a = pre.butter_bandpass(0.5, 30, 500, order=5)
    bandpassData = pre.lfilter(bb, a, Series)
    print(bandpassData.shape)

    KaiserData = []
    for i in range(8):
        tmp = pre.KaiserFil(bandpassData[i])
        KaiserData.append(tmp)

    phaseData = np.array([np.unwrap(np.angle(hilbert(i))) for i in bandpassData])
    powerData = np.array([np.abs(hilbert(i)) for i in bandpassData])
    aaa = np.ravel((phaseData, powerData))
    A = np.reshape(aaa, (1, -1))

    Seq = []
    output = svc_clf.decision_function(A)  # np.array([FeaturedData]))[0]
    for j in range(26):
        Seq.append([-output[0][j], j])
    Seq.sort()  # sort by score, most confident first

    SI_result = ''
    for t in Seq[0:9]:  # pick the 9 letters with the highest score
        SI_result = SI_result + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'[t[1]]  # map the index back to a letter
    print('Result:' + SI_result)
    return SI_result
def browse_file():
    global filename
    filename = filedialog.askopenfilename()
    print(filename)
    # Audio preprocessing
    pr.noise_reductionM1(filename)
def test_order_files_to_paires(self):
    self.assertEqual(
        Preprocessing.order_files_to_paires(["testfile35_1.fastq.gz", "testfile36_2.fastq.gz",
                                             "testfile36_1.fastq.gz", "testfile35_2.fastq.gz"]),
        [("testfile35_1.fastq.gz", "testfile35_2.fastq.gz"),
         ("testfile36_1.fastq.gz", "testfile36_2.fastq.gz")])
    with self.assertRaises(Exception) as context:
        Preprocessing.order_files_to_paires(["testfile35_1.fastq.gz", "testfile36_2.fastq.gz",
                                             "testfile36_1.fastq.gz"])
    self.assertTrue("File_list argument does not contain pair for '"
                    + "testfile35_1.fastq.gz" + "'" in str(context.exception))
def work(self):
    Prep.process(self.saveFilename)
    self.saveFilenameRes = 'imgResult.png'
    self.res = TI.KNNDigits(self.saveFilenameRes)
    self.result.set("Your number is " + str(self.res))
    # self.lblRight.labelText = self.result
    # self.lblRight.grid(row=0, column=0, sticky="nsew")
    self.displayPictureRight()
def upload():
    # Instantiate UploadForm from form.py
    form = UploadForm()
    docList = renderDocList()
    global fileList, wordList, mVec

    # If files were submitted, take each file name (secured by werkzeug),
    # check that the extension is .txt, and save the file to the test folder.
    if form.validate_on_submit():
        for files in form.file.data:
            filename = secure_filename(files.filename)
            file_ext = os.path.splitext(filename)[1]
            if file_ext != '.txt':
                flash('Format of file(s) uploaded is not allowed (.txt only), file submission canceled!', 'danger')
                return redirect(url_for('upload'))
            files.save('../test/' + filename)
        flash('File(s) added!', 'success')

        # New files were added, so the dictionary database is rebuilt.
        # Collect every file name into the fileList variable.
        fileList = []
        for root, dirs, files in os.walk('../test', topdown=False):
            for name in files:
                # Keep only the file name, without the directory part split off by '\'
                dir = os.path.join(root, name).split('\\')
                fileList.append(dir[1])

        contentList = []
        for name in fileList:
            # Read each file's content, then clean it, tokenize it, remove stopwords and stem it.
            content = Read.readfile('../test/' + name)
            content = Read.cleaning(content)
            content = Read.token(content)
            content = Preprocessing.stopwords(content)
            content = Preprocessing.stemming(content)
            # Collect all contents into one list
            contentList.append(content)

        # Build the vocabulary of words that occur in the documents
        wordList = []
        for content in contentList:
            for word in content:
                if word not in wordList:
                    wordList.append(word)

        # Build the term-count matrix: occurrences of each word in each document
        mVec = [[0 for x in range(len(wordList))] for y in range(len(fileList))]
        j = 0
        for content in contentList:
            for word in content:
                for i in range(len(wordList)):
                    if word == wordList[i]:
                        mVec[j][i] = mVec[j][i] + 1
            j = j + 1

        return redirect(url_for('upload'))

    return render_template('upload.html', form=form, docs=docList)
def prepare_faces():
    base_path = 'E:/BosphorusDB/ply/'
    persons = get_person_ids(base_path)
    for person in persons:
        print person
        person_path = base_path + person + '_filtered'
        s_save_path = person_path.replace('ply', 'npy').replace('_filtered', '')
        pr.convert_ply_to_npy(person_path, s_save_path)
def prepFiles(self, type):
    self.cleanRepository()
    tabFiles = []
    Prep.preprocess(self.saveFilename, type)
    for file in os.listdir(self.dirPrep):
        if file.endswith(".png"):
            tabFiles.append(self.dirPrep + '/' + file)
    return tabFiles
def solution_zscore(self, solution):
    new_solution = list()
    for index, x in enumerate(solution):
        new_x = (x - PRE.average(self.pre_processedX[index + 1])) / (
            PRE.standard_deviation(self.pre_processedX[index + 1]))
        new_solution.append(new_x)
    return new_solution
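# For reference, solution_zscore() above applies the usual z-score transform
# z = (x - mean) / std to each component. A self-contained illustration with the
# standard library (hypothetical data, independent of PRE's implementation):
import statistics

column = [2.0, 4.0, 6.0, 8.0]
mean = statistics.mean(column)
std = statistics.pstdev(column)  # population standard deviation
zscores = [(x - mean) / std for x in column]
print(zscores)  # values centred on zero with unit (population) variance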
def main():
    # Load Data
    train_data, test_data = prep.load_data()

    # Encode Categorical Variables
    x_train, y_train, x_test, id_test = prep.encode_categories(train_data, test_data)

    # Decompositions
    pca = PCA(n_components=5)
    ica = FastICA(n_components=5, max_iter=1000)
    tsvd = TruncatedSVD(n_components=5)
    gp = GaussianRandomProjection(n_components=5)
    sp = SparseRandomProjection(n_components=5, dense_output=True)
def describeLines(image, numPoints=8, radius=1, eps=1e-7):
    imageLines = preprocessing.getHorizontalImageLinesGray(image, 20)
    concattinatedImage = preprocessing.concatinateLines(imageLines)
    # compute the Local Binary Pattern representation of the image, and then
    # use the LBP representation to build the histogram of patterns
    lbp = feature.local_binary_pattern(concattinatedImage, numPoints, radius, method="default")
    (hist, _) = np.histogram(lbp, bins=256, range=(0, 256))
    # (hist, _) = np.histogram(lbp.ravel(), bins=np.arange(0, numPoints + 3), range=(0, numPoints + 2))
    # normalize the histogram (eps guards against an all-zero histogram)
    hist = hist.astype("float")
    hist /= (hist.sum() + eps)
    # return the histogram of Local Binary Patterns
    return hist[0:255].tolist()
def plot_examples(path):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    height = img.shape[0]
    width = img.shape[1]

    img_prewitt = Preprocessing.prewitt(img, width, height, True)
    img_roberts = Preprocessing.roberts(img, width, height, True)

    plt.imshow(img_prewitt, cmap='Greys')
    plt.title('Prewitt operator')
    plt.show()

    plt.imshow(img_roberts, cmap='Greys')
    plt.title('Roberts operator')
    plt.show()
def build_phoneme_object(pair_index, word_index, info):
    phonemes = []
    for phoneme_index in range(3, len(info[pair_index][word_index])):
        if info[pair_index][word_index][phoneme_index] == '':
            break
        start = 0 if phoneme_index == 3 else \
            Preprocessing.toInt(info[pair_index][word_index + 1][phoneme_index - 1])
        end = Preprocessing.toInt(info[pair_index][word_index + 1][phoneme_index])
        phoneme = {'phoneme': info[pair_index][word_index][phoneme_index],
                   'start': start,
                   'end': end}
        phonemes.append(phoneme)
    return phonemes
def train_regressor(filename):
    pipe = Pipeline([('reduce_dim', TruncatedSVD(n_components=70)),
                     ('regression', MLPRegressor(solver='lbfgs'))])
    param_grid = {'regression__hidden_layer_sizes': [(230,), (300,)],
                  'regression__alpha': [0.0001, 0.1, 0.01]}
    mlp = GridSearchCV(pipe, param_grid, cv=10)
    features, labels, sparse_encoder, int_encoder = Preprocessing.feature_extraction_regression_train(filename)
    mlp.fit(features, labels)
    return mlp, sparse_encoder, int_encoder
def text_clean_df(merged_df, trans_df, pipeline=TOPIC_PIPELINE):
    df1 = text_df(merged_df, trans_df)
    col_processed = [pre.text_preprocessing(text, pipeline) for text in tqdm(df1['Convo_1'])]
    df1['Convo_1'] = cc.untokenize(col_processed)
    return df1
def createJsonRepresentation(app):
    unit = getUnit(app)
    root = {}
    allWits = []
    rdgs = [el for el in app.childNodes if el.nodeType == 1]
    for rdg in rdgs:
        appLevel = {}
        appLevel['id'] = rdg.getAttribute('wit')
        tokenList = []
        ws = rdg.getElementsByTagName('w')
        for ind, w in enumerate(ws):
            if not 3 in [child.nodeType for child in w.childNodes]:
                continue
            currentWord = w
            if ind == 0:
                previousWord = ''
            else:
                previousWord = ws[ind - 1]
            token = {}
            token['t'] = currentWord.toxml()[8 + len(w.getAttribute('n')):-4]
            token['n'] = Preprocessing.conflate(currentWord)
            token['u'] = unit
            tokenList.append(token)
        appLevel['tokens'] = tokenList
        allWits.append(appLevel)
    root['witnesses'] = allWits
    return json.loads(json.dumps(root))
def search(sentence):
    result = {}
    words = Preprocessing.Clean(sentence)
    for word in words:
        q = db.search(qr.Word == word)
        rating = []
        try:
            for i in range(len(q[0]['Count'])):
                rating.append(q[0]['Place'][i] - q[0]['Count'][i])
            documents = q[0]['Documents']
            rating, documents = (list(t) for t in zip(*sorted(zip(rating, documents))))
            result[word] = documents
        except (IndexError, KeyError):
            # the word is not in the index; skip it
            pass
    if sentence.startswith('"') and sentence.endswith('"'):
        res = ()
        if len(words) > 1 and result:
            for r in result:
                if res:
                    res = res.intersection(result[r])
                else:
                    res = set(result[r])
            return {'result': list(res)}
        else:
            return result
    else:
        return result
def test(X, cutline, columns):
    X.drop("scores", axis='columns', inplace=True)
    anomaly_path = "./CIC-output/normal-1.pcap_Flow.csv"
    anomaly_data = Preprocessing.load_df(anomaly_path)
    result = []
    c = 1
    for xi in range(len(anomaly_data)):
        nth_data = list()
        for x in list(anomaly_data.columns):
            nth_data.append(anomaly_data.loc[xi, x])
        X.loc[len(X)] = nth_data

        # LOF
        clf = LocalOutlierFactor(n_neighbors=2, contamination=0.1)
        y_pred = clf.fit_predict(X.values)
        X_scores = clf.negative_outlier_factor_
        X_scores = np.array(X_scores, dtype=np.float64)

        print("[{}] - {}".format(c, -X_scores[-1]))
        if -X_scores[-1] >= cutline:
            result.append(-1)  # outlier
        else:
            result.append(1)   # inlier
        c += 1
        print('=====')
        X = X.drop(X.index[len(X) - 1])

    print("\n\n")
    print("FileName : {}".format(anomaly_path))
    print(result)
    print("-1 : {}".format(result.count(-1)))
    print("1 : {}".format(result.count(1)))
    return ''
def run_OMR(inputPath, classifiersPath):
    image, useAugmented = Preprocessing.read_and_preprocess_image(inputPath)
    Processing = Pipeline.Augmented if useAugmented else Pipeline.Standard
    Classifier.load_classifiers(classifiersPath)

    image = Processing.remove_brace(image)
    lineImage, staffDim = Processing.extract_staff_lines(image)
    groups = Processing.split_bars(image, lineImage, staffDim)

    output = []
    for group in groups:
        components, sanitized, staffDim, lineImage, dotBoxes = Processing.segment_image(group)

        Classifier.assign_components(sanitized, components, staffDim)
        Processing.join_meters(components)
        Processing.bind_accidentals_to_following_notes(components)
        Processing.bind_dots_to_notes(components, dotBoxes)
        Processing.assign_note_tones(components, sanitized, lineImage, staffDim, group)

        output.append(Display.get_guido_notation(components))

    return output
def random_forest(self):
    # the body uses self.filePath and self.canvas, so this is an instance method
    modelPath = "RandomForest1.joblib"
    dataFile = pd.read_csv(self.filePath)

    # Preprocessing:
    f = Preprocessing("output.csv")
    f.preprocessing("filetest.csv")
    file_data = pd.read_csv("output.csv")
    file_data_test = file_data.drop(['RainTomorrow'], axis=1)

    rf = load(modelPath)
    result = rf.predict(file_data_test)
    df = pd.DataFrame(result)

    label = Label(self.canvas, text=df)
    label.config(font=("Helvetica", 17))
    label.place(x=850, y=150)
def getFeatureVector(image):
    _, allcontour = preprocessing.segmentCharactersUsingProjection(image, "contorBasedOrientation")
    histDirections = np.zeros(9)
    totalContourpixels = 0
    for i in range(len(allcontour)):
        totalContourpixels += len(allcontour[i])
        for j in range(len(allcontour[i]) - 1):
            xdiff = allcontour[i][j][0] - allcontour[i][j + 1][0]
            ydiff = allcontour[i][j][1] - allcontour[i][j + 1][1]
            if xdiff < 0 and ydiff < 0:
                histDirections[1] += 1
            elif xdiff == 0 and ydiff > 0:
                histDirections[2] += 1
            elif xdiff > 0 and ydiff < 0:
                histDirections[3] += 1
            elif xdiff > 0 and ydiff == 0:
                histDirections[4] += 1
            elif xdiff > 0 and ydiff > 0:
                histDirections[5] += 1
            elif xdiff == 0 and ydiff < 0:  # was a duplicate of the second branch and never reachable
                histDirections[6] += 1
            elif xdiff < 0 and ydiff > 0:
                histDirections[7] += 1
            elif xdiff < 0 and ydiff == 0:
                histDirections[8] += 1
    return (histDirections / totalContourpixels).tolist()
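# The if/elif ladder in getFeatureVector() above bins each contour step into one
# of 8 directions by the signs of (xdiff, ydiff). A compact, hypothetical
# equivalent as a lookup table, assuming the duplicated branch was meant to
# cover the (0, negative) case:
DIRECTION_BIN = {
    (-1, -1): 1, (0, 1): 2, (1, -1): 3, (1, 0): 4,
    (1, 1): 5, (0, -1): 6, (-1, 1): 7, (-1, 0): 8,
}

def direction_bin(xdiff, ydiff):
    """Map a contour step to its histogram bin; returns None for a zero step."""
    key = ((xdiff > 0) - (xdiff < 0), (ydiff > 0) - (ydiff < 0))
    return DIRECTION_BIN.get(key)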
def train(filename):
    fileTrain = xlrd.open_workbook(filename)
    dataTrain = fileTrain.sheet_by_index(0)
    rowLen = dataTrain.nrows
    filePreprocessed = openpyxl.Workbook()
    dataPreprocessed = filePreprocessed.active

    for i in range(0, rowLen):
        data_i = dataTrain.cell(i, 0).value
        class_i = dataTrain.cell(i, 1).value
        prep = Preprocessing.preprocess(data_i)
        # print(prep)
        if prep:
            # inner loop variable renamed so it no longer shadows the row index
            for j in range(0, len(prep)):
                dataPreprocessed.append([''.join(prep[j]), class_i])

    filePreprocessed.save("dataset_preprocessing.xlsx")
    # FeatureSelection.mutualInformation()
    # FeatureSelection.elimination()
    # NaiveBayes.classify(filename)
def predict(self, X, means, std_devs):
    X = Preprocessing.zscore_norm_prediction(X, means, std_devs)
    solution = self.forward_propagation(X)[-1]
    print('----- SOLUTION -----')
    print(solution.item((0, 0)))
def train_dimension_reduction(filename):
    pipe = Pipeline([('reduce_dim', TruncatedSVD()), ('classification', SVC())])
    param_grid = {'reduce_dim__n_components': [70, 75, 85, 100, 120],
                  'classification__kernel': ['poly', 'linear', 'rbf'],
                  'classification__C': [0.1, 0.5, 1, 2]}
    svm = GridSearchCV(pipe, param_grid, cv=10)
    features, labels, sparse_encoder, int_encoder = Preprocessing.feature_extraction_sparse_train(filename)
    svm.fit(features, labels)
    return svm, sparse_encoder, int_encoder
def train_sparse(filename):
    svm = SVC()
    features, labels, sparse_encoder, int_encoder = Preprocessing.feature_extraction_sparse_train(filename)
    param_grid = {'kernel': ['poly', 'linear', 'rbf'], 'C': [0.1, 0.5, 0.9, 1, 2]}
    best_svm = GridSearchCV(svm, param_grid, cv=10)
    best_svm.fit(features, labels)
    return best_svm, sparse_encoder, int_encoder
def preprocess(self, img_face):
    Size_For_Eye_Detection = (48, 48)
    # cv2.resize expects an interpolation flag, not PIL's Image.ANTIALIAS
    img_face = cv2.resize(img_face, Size_For_Eye_Detection, interpolation=cv2.INTER_AREA)
    img_norm = Preprocessing.LBH_Norm(img_face)
    # img_norm = Preprocessing.mask_on_rect(img_norm)
    return img_norm, img_face
def save_preprocessed_tweets(self):
    with open(self.tweets_not_processed_file_path, 'r', encoding='utf-8') as csv_r_file:
        csv_reader = csv.reader(csv_r_file)
        path = r"./data"
        self.tweets_processed_file_path = os.path.join(
            path,
            self.tweets_not_processed_file_name[:14] + self.tweets_not_processed_file_name[18:])
        # print(self.tweets_processed_file_path)
        with open(self.tweets_processed_file_path, 'w', encoding='utf-8', newline='') as csv_w_file:
            csv_writer = csv.writer(csv_w_file)
            csv_writer.writerow(['tweet_id', 'processed_tweet'])
            # skip headers
            next(csv_reader)
            for tweet in csv_reader:
                if len(tweet) == 2:
                    processed_tweet = preprocess.preprocess_tweets("".join(tweet[1]))
                    # print(processed_tweet)
                    t = " ".join(processed_tweet)
                    csv_writer.writerow([tweet[0], t])
    return self.tweets_processed_file_path
def createWordCloud(text):
    # removes STOPWORDS from the chart to make it more readable
    return WordCloud(stopwords=Preprocessing.stemText(
                         STOPWORDS | {'endofsen', 'endofpar', 'said', 'say', 'will'}),
                     background_color="white", width=500, height=500).generate(text)
def integrator():
    K = 5
    Preprocessing.main()
    Boostrap.main(Preprocessing.inputSet)
    # print(Boostrap.testSet)
    global prediction
    prediction = {}
    # bootstrap voting
    for i in range(Boostrap.NO_OF_BOOTSTRAPS):
        print("*** %d BootStrap ****" % (i))
        # prediction[i] = KNN.main(Boostrap.bootstrap[i], Boostrap.testSet, K)
        (inputToRectifier, probabilityChart, priors, featureAndValues,
         trainSetLength) = NaiveBayes.NaiveBayes(Boostrap.bootstrap[i])
        print("******** DECISION TREE ************")
        tree = DecisionTree.GenerateTreeFromDatasetGivenByAssorter(inputToRectifier)
        print("******** DECISION TREE ENDS ************")
        print("******** TEST NAIVE START ************")
        probabilityDistributionChart = NaiveBayes.GetProbabilityDistributionTable(
            probabilityChart, priors, featureAndValues, Boostrap.testSet, trainSetLength)
        print("******** TEST NAIVE ENDS ************")
        print("******** PREDICTION START ************")
        prediction[i] = DecisionTree.PredictedList(probabilityDistributionChart, tree, Boostrap.testSet)
        # print("***prediction***", prediction)
    print("*********** BootStrapVoting *************")
    boostrapVoting()
    calculateTestError()
import datetime, json, os, Preprocessing, sys, xml.dom.minidom as minidom

os.chdir(os.path.abspath(os.path.dirname(__file__)))
args = sys.argv
assert len(args) == 3, "Expected exactly 2 arguments! -i followed by input directory path"
assert '-i' in args and os.path.exists(args[args.index('-i')+1]), "Invalid input directory"

path = args[args.index('-i')+1]
xmls = filter(lambda x: str(x.split('.')[len(x.split('.'))-1]) == 'xml', os.listdir(path))
l = len(xmls)
count = 0
print
for afile in xmls:
    count += 1
    Preprocessing.updateProgressBar('XMLtoJSON.py', float(100)*count/l)
    unit = Preprocessing.parseName(afile)
    root = {}
    alldocs = []
    rdgs = [el for el in minidom.parse(os.path.join(path, afile)).getElementsByTagName('*')
            if el.localName in ['lem', 'rdg']]
    for rdg in rdgs:
        docLevel = {}
        docLevel['id'] = rdg.getAttribute('wit')
        tokenList = []
        ws = rdg.getElementsByTagName('w')
        words = []
        for w in range(len(ws)):
            if not 3 in [child.nodeType for child in ws[w].childNodes]:
                # checking presence of text nodes inside the w
                continue
            currentWord = ws[w]
            previousWord = ''
            if textNodeValue != '-':
                normalizedAttrValue = token[0]['n']
            else:
                textNodeValue = ''
                normalizedAttrValue = ''
            tokenElement.appendChild(doc.createTextNode(textNodeValue))
            tokenElement.setAttribute('n', normalizedAttrValue)
            tokenElement.setAttribute('u', unitValue)
            tokenElement.setAttribute('witness', nameToNumber[number])
            blockElement.appendChild(tokenElement)
            number += 1
        line.appendChild(blockElement)
    return pseudoPrettyPrint(normalChars(line.toprettyxml().encode('utf-8')))


if os.path.exists('output.xml'):
    os.remove('output.xml')

with codecs.open('output.xml', 'a') as out:
    out.write('<collationOutput>\n')
    for app in apps:
        c += 1
        Preprocessing.updateProgressBar('Collation', float(100)*c/l)
        collationResults = collate_pretokenized_json(createJsonRepresentation(app), 'json')
        out.write(processColumn(collationResults, getUnit(app)))
        if c % FLUSH == 0:
            Preprocessing.updateProgressBar('Collation', float(100)*c/l, True)
            gc.collect()
    out.write('</collationOutput>')

print '\nTook', datetime.datetime.now() - startTime, 'to execute.'
filenameTesting = "../../data/testing_48x48_aligned_large.p_R.csv.gz"
totTime = 0
count = 0
show = True
ok = 0
preds = 0

import matplotlib.pyplot as plt
plt.ion()
pos = np.arange(6) + .5

with gzip.open(filenameTesting) as f:
    reader = csv.reader(f)
    for row in reader:
        truePerson = int(row[0])
        vals = np.asarray(row[1:], np.float)
        start = time.time()
        preprocessed = Preprocessing.preprocess(vals, None, 46, show)
        res = pred.getPrediction(preprocessed / 255.)
        totTime += time.time() - start
        predPerson = int(res.argmax())
        plt.clf()
        plt.yticks(pos, ('Dejan', 'Diego', 'Martin', 'Oliver', 'Rebekka', 'Ruedi'))
        predPValue = res[0][res.argmax()]
        if (predPValue > 0.9):
            preds += 1
        if predPerson == truePerson:
            if (predPValue > 0.9):
                ok += 1
            col = 'g'
        else:
            col = 'r'
        plt.barh(pos, np.asarray(res[0], dtype=float), align='center', color=col)
import pandas as pd
import Preprocessing as prep

train_data = pd.read_csv('data/train.csv')
preprocessed_data = prep.preprocess(train_data)

print preprocessed_data.describe()
def evaluate_lenet5(topo, learning_rate=0.005, n_epochs=500, datasetName='mnist.pkl.gz', batch_size=4242, stateIn = None, stateOut = None): rng = numpy.random.RandomState(23455) theano_rng = RandomStreams(numpy.random.randint(2 ** 30)) #Original #datasets = load_data(dataset) #n_out = 10 datasets = Preprocessing.load_pictures() # pickle.dump(datasets, open( datasetName, "wb" ) ) #Attention y is wrong # print("Saveing the pickeled data-set") #Loading the pickled images #print("Loading the pickels data-set " + str(datasetName)) #datasets = pickle.load(open(datasetName, "r")) n_out = 6 batch_size = 10 print(" Learning rate " + str(learning_rate)) # Images for face recognition #train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[0] test_set_x, test_set_y = datasets[1] # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_test_batches = test_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' print 'Number of Kernels' + str(topo.nkerns) in_2 = 14 #Input in second layer (layer1) # Reshape matrix of rasterized images of shape (batch_size,28*28) # to a 4D tensor, compatible with our LeNetConvPoolLayer layer0_input = x.reshape((batch_size, 1, topo.ishape[0], topo.ishape[1])) # Using presistent state from last run w0 = w1 = b0 = b1 = wHidden = bHidden = wLogReg = bLogReg = None if stateIn is not None: print(" Loading previous state ...") state = pickle.load(open(stateIn, "r")) convValues = state.convValues w0 = convValues[0][0] b0 = convValues[0][1] w1 = convValues[1][0] b1 = convValues[1][1] hiddenVals = state.hiddenValues wHidden = hiddenVals[0] bHidden = hiddenVals[1] logRegValues = state.logRegValues wLogReg = logRegValues[0] bLogReg = logRegValues[1] print("Hallo Gallo") # Construct the first convolutional pooling layer: # filtering reduces the image size to (28-5+1,28-5+1)=(24,24) # maxpooling reduces this further to (24/2,24/2) = (12,12) # 4D output tensor is thus of shape (batch_size,nkerns[0],12,12) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, topo.ishape[0], topo.ishape[0]), filter_shape=(topo.nkerns[0], 1, topo.filter_1, topo.filter_1), poolsize=(topo.pool_1, topo.pool_1), wOld=w0, bOld=b0) # Construct the second convolutional pooling layer # filtering reduces the image size to (12-5+1,12-5+1)=(8,8) # maxpooling reduces this further to (8/2,8/2) = (4,4) # 4D output tensor is thus of shape (nkerns[0],nkerns[1],4,4) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=(batch_size, topo.nkerns[0], topo.in_2, topo.in_2), filter_shape=(topo.nkerns[1], topo.nkerns[0], topo.filter_2, topo.filter_2), poolsize=(topo.pool_2, topo.pool_2), wOld=w1, bOld=b1) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size,num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (20,32*4*4) = (20,512) layer2_input = layer1.output.flatten(2) # Evt. some drop out for the fully connected layer # Achtung p=1 entspricht keinem Dropout. 
# layer2_input = theano_rng.binomial(size=layer2_input.shape, n=1, p=1 - 0.02) * layer2_input # paper_6 no dropout # paper_14 again 0.02 dropout # paper_15 again no dropout layer2 = HiddenLayer(rng, input=layer2_input, n_in=topo.nkerns[1] * topo.hidden_input, n_out=topo.numLogisticInput, activation=T.tanh, Wold = wHidden, bOld = bHidden) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=topo.numLogisticInput, n_out=n_out, Wold = wLogReg, bOld=bLogReg ) # Some regularisation (not for the conv-Kernels) L2_sqr = (layer2.W ** 2).sum() + (layer3.W ** 2).sum() # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) + 0.001 * L2_sqr # paper7 # paper9 back to 0.001 again # paper10 no reg. # paper12 back to 0.001 again # create a function to compute the mistakes that are made by the model test_model = theano.function([index], layer3.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) validate_model = theano.function([index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) # create a list of all model parameters to be fit by gradient descent params = layer3.params + layer2.params + layer1.params + layer0.params # create a list of gradients for all model parameters grads = T.grad(cost, params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i],grads[i]) pairs. updates = [] for param_i, grad_i in zip(params, grads): updates.append((param_i, param_i - learning_rate * grad_i)) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 10000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.995 # a relative improvement of this much is # considered significant # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = numpy.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False epoch_fraction = 0.0 while (epoch < n_epochs) and (not done_looping): # New epoch the training set is disturbed again print(" Starting new training epoch") print(" Manipulating the training set") train_set_x, train_set_y = Preprocessing.giveMeNewTraining() n_train_batches = train_set_x.get_value(borrow=True).shape[0] n_train_batches /= batch_size validation_frequency = min(n_train_batches, patience / 2) print(" Compiling new function") learning_rate *= 0.993 #See Paper from Cican train_model = theano.function([index], cost, updates=updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size]}) print(" Finished compiling the training set") epoch = epoch + 1 for minibatch_index in xrange(n_train_batches): #Alle einmal anfassen iter = (epoch - 1) * n_train_batches + minibatch_index epoch_fraction += 1.0 / float(n_train_batches) if iter % 100 == 0: print 'training @ iter = ', iter, ' epoch_fraction ', epoch_fraction cost_ij = train_model(minibatch_index) if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = numpy.mean(validation_losses) # test it on the test set test_start = time.clock(); test_losses = [test_model(i) for i in xrange(n_test_batches)] train_costs = [train_model(i) for i in xrange(n_test_batches)] dt = time.clock() - test_start print'Testing %i faces in %f msec image / sec %f', batch_size * n_test_batches, dt, dt/(n_test_batches * batch_size) test_score = numpy.mean(test_losses) train_cost = numpy.mean(train_costs) print('%i, %f, %f, %f, %f, 0.424242' % (epoch, this_validation_loss * 100.,test_score * 100., learning_rate, train_cost)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * improvement_threshold: patience = max(patience, iter * patience_increase) # save best validation score and iteration number best_validation_loss = this_validation_loss best_iter = iter # # test it on the test set # test_losses = [test_model(i) for i in xrange(n_test_batches)] # test_score = numpy.mean(test_losses) # print((' epoch %i, minibatch %i/%i, test error of best ' # 'model %f %%') % # (epoch, minibatch_index + 1, n_train_batches, # test_score * 100.)) # if (this_validation_loss < 0.02): # learning_rate /= 2 # print("Decreased learning rate due to low xval error to " + str(learning_rate)) if patience <= iter: print("--------- Finished Looping ----- earlier ") done_looping = True break end_time = time.clock() print('---------- Optimization complete -------------------------') print('Res: ', str(topo.nkerns)) print('Res: ', learning_rate) print('Res: Best validation score of %f %% obtained at iteration %i,' \ 'with test performance %f %%' % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print('Res: The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) # Oliver if not os.path.isdir("conv_images"): os.makedirs("conv_images") os.chdir("conv_images") # d = layer0.W.get_value() #e.g. 
(20, 1, 5, 5) number of filter, num of incomming filters, dim filter # for i in range(0, numpy.shape(d)[0]): # dd = d[i][0] # rescaled = (255.0 / dd.max() * (dd - dd.min())).astype(numpy.uint8) # img = Image.fromarray(rescaled) # img.save('filter_l0' + str(i) + '.png') # # d = layer1.W.get_value() #e.g. (20, 1, 5, 5) number of filter, num of incomming filters, dim filter # for i in range(0, numpy.shape(d)[0]): # dd = d[i][0] # rescaled = (255.0 / dd.max() * (dd - dd.min())).astype(numpy.uint8) # img = Image.fromarray(rescaled) # img.save('filter_l1' + str(i) + '.png') state = LeNet5State(topology=topo, convValues = [layer0.getParametersAsValues(), layer1.getParametersAsValues()], hiddenValues = layer2.getParametersAsValues(), logRegValues = layer3.getParametersAsValues()) print if stateOut is not None: pickle.dump(state, open(stateOut, 'wb') ) #Attention y is wrong print("Saved the pickeled data-set") return learning_rate
assert len(args) == 3, "Expected exactly 2 arguments!\n\n-i followed by input directory path"
assert '-i' in args and os.path.exists(args[args.index('-i')+1]), "Invalid input directory"


def normalChars(l):
    # undo the XML escaping introduced by minidom
    return l.replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"')


path = args[args.index('-i')+1]
jsons = filter(lambda x: str(x.split('.')[len(x.split('.'))-1]) == 'json', os.listdir(path))
os.chdir(path)
c = 0
l = len(jsons)
couldnt = []
print
for afile in jsons:
    c += 1
    Preprocessing.updateProgressBar('JSONtoXML.py', float(100)*c/l)
    data = json.loads(open(afile, 'r').read())
    nameToNumber = {number: name for number, name in enumerate(data['witnesses'])}
    with codecs.open(afile[:-4] + 'xml', 'w') as out:
        doc = minidom.Document()
        witnessElement = doc.createElement('witnesses')
        doc.appendChild(witnessElement)
        blockc = 0
        for block in data['table']:
            blockc += 1
            blockElement = doc.createElement('block')
            blockElement.setAttributeNode(doc.createAttribute('n'))
            blockElement.setAttribute('n', str(blockc-1))
            number = 0
            for token in block:
                tokenElement = doc.createElement('token')
def classify(self, article, dataModel):
    "Classify an article using the Bayes model that has been built"
    label = []
    # preprocessing for the input data
    token = Preprocessing.preprocess(article)
    # get the labels from the xls model
    for i in range(1, dataModel.ncols):
        label.append(dataModel.cell(0, i).value)

    if token:
        probability = []
        # look up the per-label probability of each token
        i = 0
        idx = 0
        while i < len(token):
            probability.append([])
            j = 2
            while j < dataModel.nrows:
                if token[i] == dataModel.cell(j, 0).value:
                    for k in range(0, len(label)):
                        pd = dataModel.cell(j, k + 1).value
                        probability[idx].append(pd)
                    break
                j += 1
            if j == dataModel.nrows:
                # token not found in the model
                del probability[-1]
                i += 1
            else:
                i += 1
                idx += 1

        # probability calculation
        pFinal = []
        for i in range(0, len(label)):
            pc = dataModel.cell(1, i + 1).value
            valP = 1
            for j in range(0, len(probability)):
                valP *= probability[j][i]
            if len(probability) == 1:
                if valP != 1:
                    value = valP
                else:
                    value = 0
            else:
                if valP != 1:
                    value = valP * pc
                else:
                    value = 0
            pFinal.append(value)

        maks = max(pFinal)
        if maks != 0:
            for i in range(0, len(pFinal)):
                if pFinal[i] == maks:
                    idxMax = i
            decision = dataModel.cell(0, idxMax + 1).value
        else:
            decision = 'ERROR'
    else:
        decision = 'ERROR'
    return decision
def runEpisode(ale, agent, stepsRemaining, currentEpisodeTask, frameSkip, maxNoActions):
    maxEpisodeDuration = 60 * 60 * 5  # Max game duration is 5 minutes, at 60 fps
    framesElapsed = 0
    totalEpisodeReward = 0
    ale_game_over = False

    width, height = ale.getScreenDims()
    screenBuffer = np.zeros((2, height, width), dtype=np.uint8)
    screenBufferIndex = 0

    frameSkipCounter = 0
    rewardPool = 0
    reward = 0
    startingLives = -1

    if maxNoActions > 0:
        numNoActionsToTake = np.random.randint(0, maxNoActions)
        for x in xrange(numNoActionsToTake):
            ale.act(0)

    screenObservation = ale.getScreenRGB()
    grayScreenObservation = Preprocessing.grayScaleALEObservation(screenObservation)
    screenBuffer[screenBufferIndex] = grayScreenObservation
    screenBufferIndex = (screenBufferIndex + 1) % 2
    preprocessedObservation = Preprocessing.resizeALEObservation(grayScreenObservation, agent.inputHeight, agent.inputWidth)
    action = agent.startEpisode(preprocessedObservation, currentEpisodeTask)
    startingLives = ale.lives()

    while not ale_game_over and framesElapsed < stepsRemaining and framesElapsed < maxEpisodeDuration:
        framesElapsed += 1

        frameSkipCounter = 0
        while frameSkipCounter < frameSkip:
            rewardPool += ale.act(action)
            # if not ale.game_over() and startingLives == -1:
            #     startingLives = ale.lives()
            screenObservation = ale.getScreenRGB()
            grayScreenObservation = Preprocessing.grayScaleALEObservation(screenObservation)
            screenBuffer[screenBufferIndex] = grayScreenObservation
            screenBufferIndex = (screenBufferIndex + 1) % 2
            frameSkipCounter += 1
            if ale.game_over() or (agent.training == True and agent.deathEndsEpisode and ale.lives() != startingLives):
                ale_game_over = True
                break

        reward = rewardPool
        rewardPool = 0
        totalEpisodeReward += reward

        # if not ale.game_over() and startingLives == -1:
        #     startingLives = ale.lives()
        # if ale.game_over() or (agent.deathEndsEpisode and ale.lives() != startingLives):
        #     ale_game_over = True

        maxImage = np.maximum(screenBuffer[screenBufferIndex, ...], screenBuffer[screenBufferIndex - 1, ...])
        preprocessedObservation = Preprocessing.resizeALEObservation(maxImage, agent.inputHeight, agent.inputWidth)
        action = agent.stepEpisode(reward, preprocessedObservation)

    ale.reset_game()
    avgLoss = agent.endEpisode(reward)
    return framesElapsed, totalEpisodeReward, avgLoss
assert len(args) == 5, "Expected 4 arguments!\n\n-i followed by input directory path\n-o followed by output file path"
assert '-i' in args and os.path.exists(args[args.index('-i')+1]), "Invalid input directory"
assert '-o' in args and not args.index('-o') == len(args)-1, "No output file path provided"

path = args[args.index('-i')+1]
jsonFileName = os.path.join(os.getcwd(), args[args.index('-o')+1])
xmls = filter(lambda x: str(x.split('.')[len(x.split('.'))-1]) == 'xml', os.listdir(path))
root = {}
alldocs = []
l = len(xmls)
count = 0
for afile in xmls:
    count += 1
    print 'XMLsToJSON.py: Processing', afile, 'file', count, 'out of', l
    unit = Preprocessing.parseName(afile)
    docLevel = {}
    docLevel['id'] = afile
    tokenList = []
    if debug:
        html.write('<h2>' + afile + '</h2><table border = "1"><th>Original<th>Conflated</th>')
    ws = minidom.parse(os.path.join(path, afile)).getElementsByTagName('w')
    words = []
    for w in range(len(ws)):
        if not 3 in [child.nodeType for child in ws[w].childNodes]:
            # checking presence of text nodes inside the w
            continue
        currentWord = ws[w]
        previousWord = ''
        try:
            previousWord = ws[w-1]
        except IndexError:
    return previous_row[-1]


def isBlank(node):
    return node.getAttribute('n') == ''


os.chdir(path)
if os.path.exists('Postprocessed'):
    shutil.rmtree('Postprocessed')
os.mkdir('Postprocessed')
print
for afile in xmls:
    c += 1
    Preprocessing.updateProgressBar('Postprocessing.py', float(100)*c/x)
    doc = minidom.parse(os.path.join(path, afile))
    blocks = doc.getElementsByTagName('block')
    tokens = doc.getElementsByTagName('token')
    blanks = [token for token in tokens if token.getAttribute('n') == '']
    if blanks:
        # generate dictionary of witness to its token nodes for each row
        column1Toks = blocks[0].getElementsByTagName('token')
        wit2toks = {}
        for token in column1Toks:
            wit = token.getAttribute('witness')
            row = [token for token in doc.getElementsByTagName('token')
                   if token.nodeType == 1 and token.getAttribute('witness') == wit]
            wit2toks[wit] = row
        for (wit, row) in wit2toks.items():
            # generate list of lists of sequences of empty tokens
            fin = []