def run_de_ting(start):
    """ Run the pipeline so far starting at given stage

    Stages run in order from ``start`` through stage 4:
    1 = preprocessing, 2 = entity extraction, 3 = feature vector
    generation, 4 = writing the WEKA file.

    start -- first stage to run; must be between 1 and 4 inclusive.

    Returns 0 (after printing a message) when ``start`` is out of range,
    otherwise returns None once stage four has finished.
    """
    if start < 1 or start > 4:
        print('Only stages between 1 and 4 exist!')
        return 0

    # take required inputs
    if start <= 2:
        # Python 2's input() evaluates whatever the user types as code;
        # read the raw text and convert it to an integer explicitly instead.
        no_orgs = int(raw_input(
            'Should organisations be excluded (1 for yes, 0 for no)? '))
    # start is already known to be <= 4 here, so the file name is always needed
    file_name = raw_input(
        'Enter a name for your WEKA file, .arff will be appended. ')

    # run de ting!
    if start == 1:
        preprocess.preprocessing()
        print('STAGE ONE FINISHED')
    if start <= 2:
        entity_extraction.extract_all_entities(no_orgs)
        print('STAGE TWO FINISHED')
    if start <= 3:
        feature_extraction.generate_feature_vectors()
        print('STAGE THREE FINISHED')
    weka.write_file(file_name)
    print('STAGE FOUR FINISHED')
def load_img(training_sample_size=1000,
             test_sample_size=200,
             x_size=18,
             y_size=5,
             all=False):
    """Load, preprocess and split the images stored under ./dataset.

    Images are read from ``./dataset/<i>.png`` for
    ``i`` in ``0 .. training_sample_size + test_sample_size - 1`` and each one
    is passed through ``preprocessing``. Labels come from
    ``./dataset/labels.txt`` (one label per line).

    training_sample_size -- number of leading samples used for training.
    test_sample_size     -- number of following samples used for testing.
    x_size, y_size       -- passed to ``cv2.resize`` as ``dsize=(x_size, y_size)``.
    all                  -- when true, return the preprocessed images and
                            their labels without splitting or resizing.

    Returns ``(images, labels)`` when ``all`` is true, otherwise
    ``(X_train, y_train, X_test, y_test)`` where the X values are lists of
    arrays flattened to length ``x_size * y_size``.
    """
    total = training_sample_size + test_sample_size

    # load dataset; the context manager closes the labels file promptly
    with open('./dataset/labels.txt') as label_file:
        labels = [line.rstrip('\n') for line in label_file]

    # Image 0 is always loaded, then 1 .. total-1. The frames are collected
    # in a list and stacked once, instead of re-copying the whole array with
    # np.insert on every iteration.
    frames = [preprocessing(cv2.imread('./dataset/%d.png' % 0))]
    for i in range(1, total):
        frames.append(preprocessing(cv2.imread('./dataset/%d.png' % i)))
    images = np.stack(frames)

    if all:
        return images, labels[:total]

    # test and training set
    X_train_original = images[:training_sample_size]
    y_train = np.squeeze(labels[:training_sample_size])
    X_test_original = images[training_sample_size:total]
    y_test = np.squeeze(labels[training_sample_size:total])

    # resize
    X_train_resized = [
        cv2.resize(img, dsize=(x_size, y_size)) for img in X_train_original
    ]
    X_test_resized = [
        cv2.resize(img, dsize=(x_size, y_size)) for img in X_test_original
    ]

    # reshape
    # NOTE(review): flattening to x_size * y_size presumably relies on the
    # preprocessed images being single-channel - confirm.
    X_train = [x.reshape(x_size * y_size) for x in X_train_resized]
    X_test = [x.reshape(x_size * y_size) for x in X_test_resized]

    return X_train, y_train, X_test, y_test
def dataImport(self):
    '''
    Load every available raw data file, preprocess it and collect the
    skeleton sequences with their labels.

    Files are looked up as LSTM_Train/data/<i>/<j>.txt for i in 0..6 and
    j in 0..18; files that cannot be read or parsed are skipped.

    output: skeleton,label
        skeleton is reshaped to [-1, self.n_steps, self.joints * 3]
        label is reshaped to [-1, self.size]
    '''
    from preprocess import preprocessing

    # Start from an empty float array, exactly as repeated np.append would,
    # then concatenate once at the end instead of copying on every file.
    skeleton_parts = [np.array([])]
    label_parts = [np.array([])]
    for i in range(7):
        for j in range(19):
            try:
                # ====pay attention to this address, every time move python file, remember to change it====#
                rawData = np.loadtxt('LSTM_Train/data/{}/{}.txt'.format(
                    i, j))
            except (IOError, OSError, ValueError):
                # Missing or unparsable file: skip it. Narrower than a bare
                # except, so KeyboardInterrupt and genuine bugs still surface.
                continue
            _pre = preprocessing(pos=rawData)
            _skeleton = _pre.run()
            skeleton, label = self.add2List(_skeleton, i)
            # np.append without an axis flattens its inputs; ravel keeps
            # that behaviour.
            skeleton_parts.append(np.ravel(skeleton))
            label_parts.append(np.ravel(label))

    all_skeleton = np.concatenate(skeleton_parts)
    all_label = np.concatenate(label_parts)
    all_label = np.reshape(all_label, [-1, self.size])
    all_skeleton = np.reshape(all_skeleton,
                              [-1, self.n_steps, self.joints * 3])
    return all_skeleton, all_label
def predict():
    """Classify the submitted news text and render the verdict.

    Reads the ``check_news`` form field, runs it through ``pp.preprocessing``
    and ``model1.predict``, and renders ``index.html`` with the resulting
    message in ``prediction_text``.
    """
    news_text = str(request.form['check_news'])
    print(news_text)

    # All text cleaning/encoding is delegated to pp.preprocessing.
    model_input = pp.preprocessing(news_text)
    score = model1.predict(model_input)

    # A score that rounds to 0 is reported as true news, anything else as fake.
    output = "News is true" if np.round(score) == 0 else "News is fake"
    return render_template("index.html", prediction_text=output)
def reconst(imgPath, maskPath, moving, templatePath, preFlag):
    """Reconstruct a harmonized image for one subject.

    imgPath, maskPath -- image and mask paths (replaced by the outputs of
                         ``preprocessing`` when ``preFlag`` is true).
    moving            -- passed to ``antsReg`` together with the subject FA path.
    templatePath      -- passed to ``antsApply``.
    preFlag           -- when true, run ``preprocessing`` first.

    Uses the module-level names ``N_shm``, ``bshell_b`` and ``debug``.

    Returns ``(imgPath, maskPath, harmImg, harmMask)``.
    """
    if preFlag:
        imgPath, maskPath = preprocessing(imgPath, maskPath)

    img = load(imgPath)

    subj_dir = dirname(imgPath)
    in_prefix = imgPath.split('.nii')[0]
    prefix = basename(in_prefix)
    rish_prefix = os.path.join(subj_dir, 'harm', prefix)
    b0, shm_coeff, qb_model = rish(imgPath, maskPath, in_prefix, rish_prefix,
                                   N_shm)

    print(f'Registering template FA to {imgPath} space ...')
    warp_prefix = pjoin(subj_dir, 'harm',
                        'ToSubjectSpace_' + prefix.replace(f'_b{bshell_b}', ''))
    # Registration is only run when the warp file is not already on disk.
    warp_file = warp_prefix + '1Warp.nii.gz'
    if not isfile(warp_file):
        fixed = pjoin(subj_dir, 'dti', f'{prefix}_FA.nii.gz')
        antsReg(fixed, maskPath, moving, warp_prefix)
    antsApply(templatePath, pjoin(subj_dir, 'harm'), prefix)

    print(f'Reconstructing signal from {imgPath} rish features ...')
    harmImg, harmMask = ring_masking(subj_dir, prefix, maskPath, shm_coeff,
                                     b0, qb_model, img.header)

    # Copy the .bvec/.bval files of the input next to the harmonized image.
    harm_prefix = harmImg.split('.nii')[0]
    for ext in ('.bvec', '.bval'):
        copyfile(in_prefix + ext, harm_prefix + ext)

    if debug:
        dti_harm(harmImg, harmMask)

    return (imgPath, maskPath, harmImg, harmMask)
def main():
    """Print the abstract, sentiment and keywords of each listed document.

    The list of documents to process is defined inside the function and is
    currently empty, so nothing is printed.
    """
    pending_docs = []
    for name in pending_docs:
        text = preprocessing(name)
        top_terms = compute_tfidf(get_corpus(), text, KEYWORD_AMOUNT)
        mood = get_sentiment(text)

        # Heading followed by its payload, in the fixed report order.
        for heading, payload in (("\nAbstract:", text),
                                 ("\nSentiment:", mood)):
            print(heading)
            print(payload)

        print("\nKeywords:")
        for term in top_terms:
            print(term, top_terms[term])
        print("==================")
    return
def text_classification():
    """Classify the text posted from the form and render the predicted class.

    On a POST request the ``text_class`` form field is preprocessed, stripped
    of stopwords, classified with ``nb_model`` and mapped back to a class
    name with ``label_encoder``. The result is appended to
    ``output/output_text_classification.csv`` and shown on ``home.html``.
    Elapsed times printed after each step are measured from the start of
    preprocessing.

    For any other request method the function falls through and returns None.
    """
    if request.method == 'POST':
        text = request.form['text_class']  ##get input data from form user types
        print(text + "\n")
        pre = preprocessing()
        start = time.time()

        ##data preprocessing such as remove_html, puntuation,...
        document = pre.text_preprocess(text)
        end2 = time.time()
        print(f"Runtime of text preprocessing is {end2 - start}")

        ###Remove stopwords of input data preprocessed
        document = pre.remove_stopwords(document)
        end3 = time.time()
        print(f"Runtime of removing stopwords is {end3 - start}")

        ##Text prediction such the_thao, thoi_su,...
        label = nb_model.predict([document])
        end4 = time.time()
        print(f"Runtime of prediction is {end4 - start}")

        ##inversion_transform label from index to class
        class1 = label_encoder.inverse_transform(label)
        end5 = time.time()
        print(f"Runtime of inverse_transform is {end5 - start}")
        print(f"Predict label: {class1}")

        ##write data into csv file
        with open("output/output_text_classification.csv",
                  'a',
                  newline='',
                  encoding="utf8",
                  errors="ignore") as out:
            fieldnames = ['class', 'content']
            writer = csv.DictWriter(out, fieldnames)
            # The file is opened in append mode, so the position is at the
            # end of the file. Write the header only when the file is still
            # empty, otherwise a header row would be repeated before every
            # record.
            if out.tell() == 0:
                writer.writeheader()
            writer.writerow({'class': class1, 'content': text.strip("\n")})
        end6 = time.time()
        print(f"Runtime of saving into cs file is {end6 - start}")
        return render_template('home.html', classification=class1)
def perform_ocr(raw_image, model_dict):
    """Read the digits in ``raw_image`` by voting across several models.

    raw_image  -- image array of the selected region of interest.
    model_dict -- mapping whose values hold 'model', 'graph' and 'session'
                  entries; its keys are used as indexes into the per-model
                  result list (assumes integer keys 0..len-1 - TODO confirm).

    Returns the string built from the best digit chosen for each segment.
    """
    # Delete everything currently in the output folder.
    output_dir = os.path.dirname(os.path.abspath(__file__)) + "/output_images"
    for stale_name in os.listdir(output_dir):
        os.remove(output_dir + "/" + stale_name)

    # Resize the selected region of interest to 100 px high; the width
    # follows from the original aspect ratio.
    src_height, src_width = raw_image.shape[0], raw_image.shape[1]
    aspect_ratio = src_width / (src_height * 1.0)
    target_height = 100
    target_width = int(target_height * aspect_ratio)
    raw_image = cv2.resize(raw_image, (target_width, target_height))

    # Apply the image preprocessing step.
    preprocessed_image = preprocessing(raw_image)

    # Segment the digits (resized so that they keep their aspect ratio).
    all_digits = segmentation(preprocessed_image)

    all_results = [""] * len(model_dict)
    best_result = ""
    for digit in all_digits:
        votes = []
        for key, entry in model_dict.items():
            predicted = str(
                predict_image(digit, entry['model'], entry['graph'],
                              entry['session']))
            all_results[key] += predicted
            votes.append(predicted)

        # Of all predictions, the digit assumed correct is the one predicted
        # most often.
        best_result += get_best_digit(votes)

    print(all_results)
    print(best_result)

    return best_result
import cv2
import os
import preprocess

# Folder that holds the raw images, grouped in one sub-folder per class.
root_path = "raw_leaf_data/"

all_image_files = preprocess.get_all_files(root_path)

for idx, f in enumerate(all_image_files):
    t = preprocess.preprocessing(f)

    # The class name is the second '/'-separated component of the part of
    # the path before the first backslash.
    target_class = f.split('\\')[0].split('/')[1]
    print(target_class)

    out_path = os.path.join('new_leaf_data', target_class,
                            "leaf-image" + str(idx)) + '.jpg'
    cv2.imwrite(out_path, t)
    print('saved', out_path)
def multi_process(input_dir):
    """Run ``preprocessing`` on every entry directly inside ``input_dir``.

    Each matched path is printed before it is processed.
    """
    pattern = os.path.join(input_dir, '*')
    matches = glob.glob(pattern)
    for path in matches:
        print(path)
        preprocessing(path)
use_normalized_coordinates=True, line_thickness=5, min_score_thresh=0.50) #if any objects are detected, generate the temp file if (coordinates): generate_temp(coordinates) dim = (950, 1000) resized = cv2.resize(image_np, dim, interpolation=cv2.INTER_AREA) cv2.imshow('output_image', resized) cv2.waitKey(0) cv2.destroyAllWindows() #the image has been processed #delete the temporary processes file if exists path = '/home/mohak/Music/implementation/' os.remove(path + 'temp.jpg') #run the generated html code in browser as a webpage #give the complete path to the generated html file file_path = 'PATH_TO_DIR/generated_code.html' new = 2 webbrowser.open(file_path, new=new) #For running on local testImages #path of the image using which the html code is to be generated path = 'COMPLETE_PATH_TO_INPUT_IMAGE' processed_image = preprocessing(path) processImage(processed_image)
import preprocess
import os

chapter_folder = "chapters"  # Folder to save result
chapter_split_mark = "$"  # Split mark to mark the end of chapter

# Create result folder
if not os.path.exists(chapter_folder):
    os.makedirs(chapter_folder)

# Split chapters: every chapter heading is replaced by the split mark, then
# the text is cut at each mark.
string = preprocess.input_file.read()
string = preprocess.str_replace_re(string, "正文 第.{1,5}回",
                                   chapter_split_mark)
chapters = string.split(chapter_split_mark)

# Save chapters
for chapter_no, chapter_string in enumerate(chapters):
    # Element 0 is the text before the first chapter heading; it is not saved.
    if chapter_no == 0:
        continue
    result = preprocess.preprocessing(chapter_string)
    file_name = os.path.join(chapter_folder, "%d.txt" % chapter_no)
    # Use a context manager so every chapter file is flushed and closed
    # instead of leaking one open handle per chapter.
    with open(file_name, "w") as chapter_file:
        chapter_file.write(result)
y= x['label'] #removing the column used in prediction x= x.drop('label',axis=1) x['content'] = x['title'] + x['text'] x['content'] # preprocessing and cleaning data # ps = PorterStemmer() corpus = [] for i in range(len(x['title'])): # final =[] #final.append(op) op = pp.preprocessing(x['title'][i]) corpus.append(op) # lst = x['title'][i].split() # for j in range(len(lst)): # if lst[j] == 'U.S.': # lst[j] = "USA" # # x['title'][i] = " ".join(map(str, lst)) # # review = re.sub('[^a-zA-Z]', ' ', x['title'][i]) # review = review.lower() # review = review.split() # review = [lemmatizer.lemmatize(word) for word in review if not word in set(stopwords.words('english'))] # review = ' '.join(review) #corpus.append(review) print(i, " row completed.")
def main():
    """Train word translation probabilities and write the best translations.

    Command line (Python 2):
        python ibmModel1.py file_source file_target iterations numberOfSentencesForTraining

    Sentence pairs come from ``preprocessing`` as a mapping of source
    sentence -> target sentence. For a number of iterations the translation
    probabilities are re-estimated and the perplexity returned by
    ``calculatePerplexity`` is printed. Finally, for every target word the
    source word with the highest probability is written to
    ``wordTranslation.txt`` as one "<target> <source>" line.
    """
    if len(sys.argv)!= 5:                                                   #check arguments
        print "Usage :: python ibmModel1.py file_source file_target iterations numberOfSentencesForTraining"
        sys.exit(0)

    numberOfSentences=int(sys.argv[4])                                      #initialisation
    numberOfIterations = int(sys.argv[3])
    sentencePair = preprocessing(numberOfSentences, sys.argv[1], sys.argv[2] )

    listOfSourceWords = defaultdict(list)                                   #create list of possible source words for a target word
    for pair in sentencePair:
        sourceWords=pair.split(' ')
        targetWords=sentencePair[pair].split(' ')
        for word in targetWords:
            if word!='':                                                    # skip empty tokens produced by repeated spaces
                for key in sourceWords:
                    if key!='':
                        listOfSourceWords[word].append(key)

    # de-duplicate the candidate source words of every target word
    for word in listOfSourceWords:
        listOfSourceWords[word] = list(set(listOfSourceWords[word]))

    translationProbability=defaultdict(dict)                                #initialize the translation probability
    for wordTarget in listOfSourceWords:
        uniqueWordsSource = listOfSourceWords[wordTarget]
        for wordSource in uniqueWordsSource:
            # uniform start: each candidate source word is equally likely
            translationProbability[wordTarget][wordSource]=1/float(len(uniqueWordsSource))

    perplexity=0                                                            #Expectation Maximisation
    iteration=0
    while iteration<numberOfIterations:
        iteration+=1
        print "Iteration: " +str(iteration)
        count=defaultdict(lambda: defaultdict(float))                       #initialisation
        sumTotal=defaultdict(float)
        total=defaultdict(float)
        # NOTE(review): sumTotal is reset once per iteration rather than once
        # per sentence pair, and the M step below runs inside the sentence
        # loop - confirm that both are intended.
        for pair in sentencePair:                                           #E Step
            sourceWords=pair.split(' ')
            targetWords=sentencePair[pair].split(' ')
            # normalisation term for every target word
            for words in targetWords:
                if words!='':
                    for key in sourceWords:
                        if key!='':
                            sumTotal[words]+=translationProbability[words][key]
            # collect the fractional counts
            for words in targetWords:
                if words!='':
                    for key in sourceWords:
                        if key!='':
                            count[words][key]+=(translationProbability[words][key]/sumTotal[words])
                            total[key]+=(translationProbability[words][key]/sumTotal[words])
            for key in sourceWords:                                         #M Step
                if key!='':
                    for words in targetWords:
                        if words!='':
                            translationProbability[words][key]=count[words][key]/total[key]
        newperplexity=calculatePerplexity(sentencePair, translationProbability)
        print newperplexity
        # reports whether the perplexity did not increase since the last iteration
        if perplexity>=newperplexity:
            print "Successful"
        else:
            print "Failed"
        perplexity=newperplexity

    data=[]
    print len(translationProbability)
    # keep, for every target word, the source word with the highest probability
    for key in translationProbability:
        possibleWords = translationProbability[key]
        data.append(key+' '+max(possibleWords.iteritems(), key=operator.itemgetter(1))[0])
    with open('wordTranslation.txt','w') as f:
        f.write('\n'.join(data))
def main_function(database_dir,
                  bore_IDs,
                  Lithology_table,
                  box_bottom_rate=1.1,
                  bottomlength=15,
                  predefined_angle_degree=None,
                  Merge_Layers=False,
                  bottom_box_type='normalbottombox',
                  xshifter=0.5,
                  yshifter=0.5,
                  epsbn_ratio=0.05,
                  eps_ratio=0.01,
                  ExtendLine_edit_distance=5,
                  TrimLine_edit_dangle_length=2,
                  Integrate_management_distance=0.01,
                  del_x=10,
                  del_y=10,
                  smooth_2d=False,
                  gen_polygons=True):
    """Run the whole borehole-to-cross-section pipeline, stage by stage.

    Each stage prints a banner before and after it runs, followed by the
    elapsed time of that stage. The stages, in order: pre-processing of the
    borehole IDs, reading the input tables from the database, layer merging,
    plane calculation / surface point projection / box and topography
    creation, point extraction, topography processing, critical zone
    processing, definite line generation, angle calculation, second stage
    line completion, final bottom boundary, 3D point and polyline feature
    class creation and, unless disabled, the 3D-to-2D projection and polygon
    generation.

    database_dir    -- database path; opened with pypyodbc.win_connect_mdb
                       and also set as arcpy.env.workspace.
    bore_IDs        -- passed to preprocess.preprocessing.
    Lithology_table -- passed to readinginputdata.readBorehole_LithoTable.
    box_bottom_rate, bottom_box_type -- passed to boxcreator.boxcreator2.
    bottomlength    -- passed to postbottombox.postbottombox.
    predefined_angle_degree -- passed to anglefinder.anglefinder.
    Merge_Layers    -- passed to layermerger.layermerge; also controls the
                       "Merging layers" banners.
    xshifter, yshifter -- passed to readinginputdata.readBorehole_table.
    del_x, del_y    -- passed to readinginputdata.readtopo_points.
    epsbn_ratio, eps_ratio, ExtendLine_edit_distance,
    TrimLine_edit_dangle_length, Integrate_management_distance, smooth_2d
                    -- passed to threeD_to_2d_projector_v2.cs_3d_to_2d.
    gen_polygons    -- when False, the process exits through sys.exit()
                       after the 3D point/polyline database is created.

    Returns None.
    """
    t_total_start = time.time()
    con_to_mdb = pypyodbc.win_connect_mdb(database_dir)
    arcpy.env.workspace = database_dir
    #arcpy.env.outputZFlag = "Enabled"
    ########################################################################################################
    '''Pre processing'''
    print "######### Pre-processing... #########"
    t0 = time.time()
    bore, indextemplist, mainpolylist = preprocess.preprocessing(bore_IDs)
    t1 = time.time()
    print "########## Pre-processing finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    print "######### Reading data... #########"
    t0 = time.time()
    '''reading Borehole_table'''
    boidtable, x, y, elev, indextemplist_with_coords = readinginputdata.readBorehole_table(
        bore, xshifter, yshifter, indextemplist, con_to_mdb)
    '''reading Borehole_Litho table'''
    boid, top, bot, lit, fault_points_in_rawdata = readinginputdata.readBorehole_LithoTable(
        bore, boidtable, elev, Lithology_table, con_to_mdb)
    '''Read and pre - processing priority table '''
    prior = readinginputdata.readpriority_table(con_to_mdb)
    #format = [prior 0:prioroty 1:[toplayer] 2:[bottomlayer] 3:type]
    #top layer and bottom layer type is list
    '''reading fault_table'''
    fault_table = readinginputdata.readfault_table(prior, con_to_mdb)
    #fault_table=[ [priority_number,[ [bhid,elevation] , [bhid,elevation] , ...] ]
    #, [priority_number,[ [bhid,elevation] , [bhid,elevation] , ...] , ... ] , ...]
    '''reading surface data'''
    temp_points = readinginputdata.readtopo_points(prior,
                                                   indextemplist_with_coords,
                                                   del_x, del_y, con_to_mdb)
    # all database reads are done; the connection is not used after this point
    con_to_mdb.close()
    t1 = time.time()
    print "######### Reading data finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    '''merging the layers with the same Lithology and change the raw data structure'''
    if Merge_Layers == True:
        print "######### Merging layers... #########"
    # layermerge is always called because rawdata is needed by later stages;
    # the Merge_Layers flag is handed to it.
    rawdata = layermerger.layermerge(
        boid, top, bot, lit, indextemplist_with_coords, bore,
        Merge_Layers)  #don't do the merge part when it is false
    if Merge_Layers == True:
        print "######### Merging layers finished #########"
    #########################################################################################################
    print "######### 3D plane calculator, Projecting surface points, Generating Topography and guide bottom boundary... #########"
    t0 = time.time()
    '''plane calculator'''
    planenormalslist = plane_calculator_topo_projector.cross_product_and_planenormalslist_maker(
        indextemplist_with_coords)
    '''Projecting surface points'''
    temp_points = plane_calculator_topo_projector.surface_point_projector(
        planenormalslist, temp_points)
    ''' drawing the Box '''
    #ratiobottombox or normalbottombox:
    #normalbottombox find the minimum, and increase the box by a user defined ratio (consider faults)
    #ratiobottombox do it based on minimum of every borehole (consider faults)
    mainpolylist, boxpoints, rawdata = boxcreator.boxcreator2(
        bottom_box_type, box_bottom_rate, fault_table, rawdata, prior,
        mainpolylist, indextemplist_with_coords)
    ''' drawing the topo'''
    mainpolylist, maxfirst, maxlast, temp_points = topolinecreator.topolinecreator(
        mainpolylist, indextemplist_with_coords, prior, temp_points)
    t2 = time.time()
    print "######### 3D plane calculator, Projecting surface points, Generating Topography and guide bottom boundary finished #########"
    print "Time:", t2 - t0
    print "\n \n \n"
    ########################################################################################################
    '''point extraction (normal and fault poin extraction)'''
    print "######### Point extraction... #########"
    t0 = time.time()
    mainpointlist, mainpointlistreverse, point_id = pointextraction.pointextraction(
        rawdata, prior, indextemplist_with_coords, x, y, boidtable,
        fault_table, elev)
    #mainpointlist=[0=point_id, 1=index, 2=bhid, 3=priority, 4=type, 5=coordinates,
    #6=connectedleft point id, 7=connectedright point id, 8=pointcode ]
    #pointcodes:
    # 0 = not connected
    # 1 = just left connected
    # 2 = just right connected
    # 3 = fully connected
    t1 = time.time()
    print "######### Point extraction finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    #################################################################
    '''topography reading data and processing'''
    print "####topography reading data and processing####"
    t0 = time.time()
    #to read and preprocess the surface data. Also to produce the maintopolist
    mainpointlist, mainpointlistreverse, mainpolylist, maintopolist = surface_pnt_process(
        mainpointlist, mainpointlistreverse, mainpolylist, prior, rawdata,
        temp_points, indextemplist_with_coords)
    t1 = time.time()
    print "####topography reading data and processing finished####"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''Manual stage'''
    print "######### Critical zones processing... #########"
    t0 = time.time()
    mainpointlist, mainpolylist = foldfaulintrusionsitunationdeterminer.foldfaulintrusionsitunationdeterminers(
        mainpointlist, mainpointlistreverse, mainpolylist, prior,
        indextemplist_with_coords, maintopolist)
    #IMPORTANT: output mainpointlist contains mainpointlistreverse also, the point situation for all points changed back
    t1 = time.time()
    print "######### Critical zones processing finish #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''create definite lines'''
    print "######### Generating automatic definite lines... #########"
    t0 = time.time()
    temres = definitelines.definitelines(prior, indextemplist_with_coords,
                                         mainpointlist, mainpolylist)
    # the result is indexed: element 0 is the point list, element 1 the polyline list
    mainpointlist = temres[0]
    mainpolylist = temres[1]
    t1 = time.time()
    print "######### Generating automatic definite lines finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''calculate angles'''
    print "######### Angle processing... #########"
    t0 = time.time()
    angles = anglefinder.anglefinder(mainpolylist, prior, fault_table,
                                     predefined_angle_degree)
    #angles= [ [prio_num,type, tan_angle, quantity, [ [index1,index2,startpoint,endpoint] ] ] ,...]
    t1 = time.time()
    print "######### Angle processing finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''create lines in stage 2 '''
    print "######### Automatic structure completion... #########"
    t0 = time.time()
    mainpointlist, mainpolylist = secondstagelinecompleter.secondstagelinecompleter(
        mainpointlist, prior, mainpolylist, rawdata, angles,
        indextemplist_with_coords)
    t1 = time.time()
    print "######### Automatic structure completion finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''post bottom box'''
    print "######### Ultimate bottom boundary... #########"
    t0 = time.time()
    intersectionpoints, postbottomboxlist, minsfirst, minslast = postbottombox.postbottombox(
        mainpolylist, mainpointlist, indextemplist_with_coords, bottomlength)
    t1 = time.time()
    print "######### Ultimate bottom boundary finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    '''Create point and polylines in 3d'''
    print "######### Creating 3D point and layers (polylines) database... #########"
    t0 = time.time()
    pfcadd = createpointfeatureclass.createpointfeatureclass(
        boxpoints, mainpointlist, intersectionpoints, postbottomboxlist,
        indextemplist_with_coords[-1][1])
    arcgistempdb = createpolyfeatureclass.createpolyfeatureclass(
        mainpolylist, pfcadd, postbottomboxlist, minsfirst, minslast,
        maxfirst, maxlast, prior)
    t1 = time.time()
    print "######### Creating 3D point and layers (polylines) database finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    #This is to stop the algorithm in case the user doesn't want the polygons
    # NOTE: sys.exit() ends the whole process here, not only this function.
    if gen_polygons == False:
        print "'gen_polygons' variable is False. In case of need to 3D polygons and/or 2D lines and polygons, set 'gen_polygons' to True"
        sys.exit()
    ########################################################################################################
    '''This function make the 3d polygons between boreholes individually, then merge them'''
    print "######### 3D to 2D Convertion, polygon determiner, 2D cross-section maker, 3D polygon database generator... #########"
    t0 = time.time()
    #epsbn_ratio=0.05
    #eps_ratio=0.01
    arcgistempdb_2d, polygns_3d = threeD_to_2d_projector_v2.cs_3d_to_2d(
        planenormalslist, mainpolylist, indextemplist_with_coords,
        postbottomboxlist, minsfirst, minslast, maxfirst, maxlast, prior,
        rawdata, epsbn_ratio, eps_ratio, ExtendLine_edit_distance,
        TrimLine_edit_dangle_length, Integrate_management_distance, smooth_2d)
    t1 = time.time()
    print "######### 3D to 2D Convertion, polygon determiner, 2D cross-section maker, 3D polygon database generator finished #########"
    print "Time:", t1 - t0
    print "\n \n \n"
    ########################################################################################################
    print "Total time:"
    t_total_end = time.time()
    print t_total_end - t_total_start
    print "\n \n \n"
    ########################################################################################################
    # Earlier variants returned intermediate results; nothing is returned now.
    #return pfcadd,arcgistempdb, mainpolylist,mainpointlist,indextemplist,prior,fault_table,rawdata,angles,arcgistempdb_2d
    #return arcgistempdb,arcgistempdb_2d, mainpolylist,mainpointlist,indextemplist,prior,fault_table,rawdata,angles
    #return pfcadd,arcgistempdb, mainpolylist,mainpointlist,indextemplist,prior,fault_table,rawdata
    return
def algo(exp_data, attr_data_, map_data, pop, n, prop1, prop2, propensity1, propensity2, sessionId, userId, cols): try: start_time = time.time() status = 0 is_opened = 0 print "Started" if int(exp_data['Market_flg'].unique()[0]) == 0: client = 'client' + str(exp_data['ClientNum'].unique()[0]) split_col_flg = 'SPLIT_FLG' market_flg = 0 else: split_col_flg = 'SPLIT_FLG_NATIONAL_FILE' market_flg = 1 attr_data = attr_data_[attr_data_['EXPERIAN_DB_Col_NAME'].isin( list(exp_data.columns))] req_cols = list(attr_data['EXPERIAN_DB_Col_NAME'].values) cat_col = list(attr_data[attr_data.loc[:, 'CLASS_DEFN'] == 'factor'] ['EXPERIAN_DB_Col_NAME'].values) num_col = list(attr_data[attr_data.loc[:, 'CLASS_DEFN'] == 'numeric'] ['EXPERIAN_DB_Col_NAME'].values) int_col = list(attr_data[attr_data.loc[:, 'CLASS_DEFN'] == 'integer'] ['EXPERIAN_DB_Col_NAME'].values) bin_col = list(attr_data[attr_data.loc[:, 'BINNING_FLG'] == 'Yes'] ['EXPERIAN_DB_Col_NAME'].values) split_data = attr_data[attr_data.loc[:, split_col_flg] != 'No'][[ 'EXPERIAN_DB_Col_NAME', split_col_flg ]] print "attriutes loaded" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() need_attr = attr_data[attr_data['VAR_NAME'].isin(list(cols))] cols = list(need_attr['EXPERIAN_DB_Col_NAME'].values) print "needed attributes loaded" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() #clean for modeling exp_data_ = exp_data[req_cols] con_col = num_col + int_col exp_data_clean_ = clean(exp_data_, con_col) print "data cleaned" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() if market_flg == 0: #scoring propensity #dc = joblib.load("/home/ec2-user/softwares/VerbatimModule_RCLCO/model_imp.pkl" ) dc = joblib.load("model_imp.pkl") test1 = exp_data_clean_.copy() test2 = pd.get_dummies(test1, columns=cat_col) prop1_scores = 0 prop1_flg = 0 prop2_scores = 0 prop2_flg = 0 if client in dc: models_imp = dc[client] if prop1 in models_imp: fi_prop1 = 
models_imp[prop1] #models = joblib.load("/home/ec2-user/softwares/VerbatimModule_RCLCO/"+client+"_"+prop1+".pkl") models = joblib.load(client + "_" + prop1 + ".pkl") model1 = models[0] fi = models[1] feats1 = [i[0] for i in fi] ls2 = list(set(feats1) - set(test2.columns)) ls1 = list(test2.columns) + ls2 test3 = test2.reindex(columns=ls1, fill_value=0) X1 = test3[feats1] prop1_scores = (model1.predict_proba(X1).T)[1] prop1_flg = 1 if prop2 in models_imp: fi_prop2 = models_imp[prop2] #models = joblib.load("/home/ec2-user/softwares/VerbatimModule_RCLCO/"+client+"_"+prop2+".pkl") models = joblib.load(client + "_" + prop2 + ".pkl") model2 = models[0] fi = models[1] feats2 = [i[0] for i in fi] ls2 = list(set(feats2) - set(test2.columns)) ls1 = list(test2.columns) + ls2 test3 = test2.reindex(columns=ls1, fill_value=0) X1 = test3[feats2] prop2_scores = (model2.predict_proba(X1).T)[1] prop2_flg = 1 print "Scored propensity" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() exp_data_clean = exp_data_clean_[req_cols] exp_data_proc, cat_col_new = preprocessing(exp_data_clean, cat_col, num_col, bin_col, split_data, cols, split_col_flg) print "data preprocessed" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() clus_labels = kmeans(exp_data_proc, n, cat_col_new) exp_data_clean['cluster'] = clus_labels print "segmented" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() cat_cols = cat_col + int_col d = { 0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J', 10: 'K', 11: 'L', 12: 'M', 13: 'N', 14: 'O', 15: 'P' } for i in range(0, n): exp_data_clean['cluster'].replace(i, d[i], inplace=True) profiling_data1, final_data1 = profile(exp_data_clean, req_cols, cat_cols, bin_col, n, 'cluster') print "profiled" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() exp_data_clean['prop1_scores'] = 0 exp_data_clean['prop2_scores'] = 0 if 
market_flg == 0: if prop1_flg == 1: exp_data_clean['prop1_scores'] = prop1_scores exp_data_clean['prop1_seg'] = 1 exp_data_clean.loc[exp_data_clean['prop1_scores'] < .25, 'prop1_seg'] = 1 exp_data_clean.loc[(exp_data_clean['prop1_scores'] >= .25) & (exp_data_clean['prop1_scores'] < .7), 'prop1_seg'] = 2 exp_data_clean.loc[exp_data_clean['prop1_scores'] >= .7, 'prop1_seg'] = 3 exp_data_clean['prop1_seg'] = exp_data_clean['cluster'].astype( str) + exp_data_clean['prop1_seg'].astype(str) profiling_data_1, final_data_1 = profile( exp_data_clean, req_cols, cat_cols, bin_col, n, 'prop1_seg') if prop2_flg == 1: exp_data_clean['prop2_scores'] = prop2_scores exp_data_clean['prop2_seg'] = 1 exp_data_clean.loc[exp_data_clean['prop2_scores'] < .25, 'prop2_seg'] = 1 exp_data_clean.loc[(exp_data_clean['prop2_scores'] >= .25) & (exp_data_clean['prop2_scores'] < .7), 'prop2_seg'] = 2 exp_data_clean.loc[exp_data_clean['prop2_scores'] >= .7, 'prop2_seg'] = 3 exp_data_clean['prop2_seg'] = exp_data_clean['cluster'].astype( str) + exp_data_clean['prop2_seg'].astype(str) profiling_data_2, final_data_2 = profile( exp_data_clean, req_cols, cat_cols, bin_col, n, 'prop2_seg') print "profiled again" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() temp = exp_data_clean.copy() prop_seg = dict(temp['cluster'].value_counts()) if market_flg == 0: if prop1_flg == 1: prop1_seg = dict(temp['prop1_seg'].value_counts()) if prop2_flg == 1: prop2_seg = dict(temp['prop2_seg'].value_counts()) print "start the dump" print("--- %s seconds ---" % (time.time() - start_time)) start_time = time.time() #------------------------------sql dump-------------------------------------------------- con = mysql.connector.connect( user='******', password='******', host='rclco.ctcznzw1aqdz.us-west-1.rds.amazonaws.com', port='3306', database='rclco_bi2i') #con= mysql.connector.connect(user='******', password='******',host='localhost',port='3306',database='rclco') cursor = con.cursor(True) 
is_opened = 1 for i in range(0, n): seg = d[i] value = prop_seg[seg] sql = "INSERT INTO SEGMENT_DONUT_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,NO_HH) VALUES('{}','{}',{},'{}','{}',{});".format( sessionId, userId, i + 1, pop, seg, value) cursor.execute(sql) if market_flg == 0: for i in range(0, n): for j in range(0, 3): seg1 = str(d[i]) + str(j + 1) if prop1_flg == 1: if seg1 in prop1_seg: value = prop1_seg[seg1] sql = "INSERT INTO SUB_SEG_DONUT_TMP(sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,NO_HH,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format( sessionId, userId, i + 1, seg1, propensity1, value, pop) #sqls1.append(sql) cursor.execute(sql) if prop2_flg == 1: if seg1 in prop2_seg: value1 = prop2_seg[seg1] sql = "INSERT INTO SUB_SEG_DONUT_TMP(sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,NO_HH,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format( sessionId, userId, i + 1, seg1, propensity2, value1, pop) #sqls2.append(sql) cursor.execute(sql) #mapping needs to be done for cat_cols map_cols = map_data['EXP_COL_NAME'].unique() temp1 = final_data1.T.to_dict(orient='list') for i in temp1: seg_id = (d.keys()[d.values().index(temp1[i][0])]) + 1 seg = temp1[i][0] attr_name = temp1[i][2] attr_value = str(temp1[i][3]) #if temp1[i][5] == 'cat_cols': if attr_name in map_cols: x = map_data[map_data['EXP_COL_NAME'] == attr_name] try: attr_value = x[x['RANGE_VALUE'] == attr_value]['DISPLAY_NAME'].values[0] except: try: attr_value = x[x['RANGE_VALUE'].map(float) == float( attr_value)]['DISPLAY_NAME'].values[0] except: try: attr_value = x[(x['RANGE_VALUE'].map( float) <= float(attr_value)) & ( x['MAX'].map(float) >= float(attr_value) )]['DISPLAY_NAME'].values[0] except: attr_value = attr_value #else: # attr_value = "{0:.2f}".format(float(attr_value)) attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] == attr_name]['VAR_NAME'].values[0] sql = "INSERT INTO 
SEGMENT_DEFN_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,ATTRIBUTE_NAME,ATTRIBUTE_VALUE) VALUES('{}','{}',{},'{}','{}','{}','{}');".format( sessionId, userId, seg_id, pop, seg, attr_name, attr_value) cursor.execute(sql) if market_flg == 0: if prop1_flg == 1: temp2 = final_data_1.T.to_dict(orient='list') for i in temp2: seg_id = (d.keys()[d.values().index(temp2[i][0][0])]) + 1 sub_seg = temp2[i][0] attr_name = temp2[i][2] attr_value = str(temp2[i][3]) #if temp2[i][5] == 'cat_cols': if attr_name in map_cols: x = map_data[map_data['EXP_COL_NAME'] == attr_name] try: attr_value = x[x['RANGE_VALUE'] == attr_value][ 'DISPLAY_NAME'].values[0] except: try: attr_value = x[x['RANGE_VALUE'].map( float) == float( attr_value)]['DISPLAY_NAME'].values[0] except: try: attr_value = x[(x['RANGE_VALUE'].map( float) <= float(attr_value)) & ( x['MAX'].map(float) >= float( attr_value) )]['DISPLAY_NAME'].values[0] except: attr_value = attr_value #else: # attr_value = "{0:.2f}".format(float(attr_value)) attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] == attr_name]['VAR_NAME'].values[0] sql = "INSERT INTO SUB_SEG_DEFN_TMP(sessionId,userId,SEG_ID,SUB_SEG_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRIBUTE_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}','{}','{}');".format( sessionId, userId, seg_id, sub_seg, propensity1, attr_name, attr_value, pop) cursor.execute(sql) if prop2_flg == 1: temp3 = final_data_2.T.to_dict(orient='list') for i in temp3: seg_id = (d.keys()[d.values().index(temp3[i][0][0])]) + 1 sub_seg = temp3[i][0] attr_name = temp3[i][2] attr_value = str(temp3[i][3]) #if temp3[i][5] == 'cat_cols': if attr_name in map_cols: x = map_data[map_data['EXP_COL_NAME'] == attr_name] try: attr_value = x[x['RANGE_VALUE'] == attr_value][ 'DISPLAY_NAME'].values[0] except: try: attr_value = x[x['RANGE_VALUE'].map( float) == float( attr_value)]['DISPLAY_NAME'].values[0] except: try: attr_value = x[(x['RANGE_VALUE'].map( float) <= float(attr_value)) & ( 
x['MAX'].map(float) >= float( attr_value) )]['DISPLAY_NAME'].values[0] except: attr_value = attr_value #else: # attr_value = "{0:.2f}".format(float(attr_value)) attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] == attr_name]['VAR_NAME'].values[0] sql = "INSERT INTO SUB_SEG_DEFN_TMP(sessionId,userId,SEG_ID,SUB_SEG_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRIBUTE_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}','{}','{}');".format( sessionId, userId, seg_id, sub_seg, propensity2, attr_name, attr_value, pop) cursor.execute(sql) for i in range(0, 4): if prop1_flg == 1: attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] == fi_prop1[i][0]]['VAR_NAME'].values[0] sql = "INSERT INTO ATRI_CORELATION_TMP(sessionId,userId,PROPENSITY_TYPE,ATTRIBUTE_NAME,CORERLATION,POPULATION_TYPE) VALUES('{}','{}','{}','{}',{},'{}');".format( sessionId, userId, propensity1, attr_name, round(fi_prop1[i][1], 3), pop) cursor.execute(sql) if prop2_flg == 1: attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] == fi_prop2[i][0]]['VAR_NAME'].values[0] sql = "INSERT INTO ATRI_CORELATION_TMP(sessionId,userId,PROPENSITY_TYPE,ATTRIBUTE_NAME,CORERLATION,POPULATION_TYPE) VALUES('{}','{}','{}','{}',{},'{}');".format( sessionId, userId, propensity2, attr_name, round(fi_prop2[i][1], 3), pop) cursor.execute(sql) for i in range(0, n): seg = d[i] if prop1_flg == 1: p1 = temp[temp['cluster'] == seg]['prop1_scores'].mean() if prop2_flg == 1: p2 = temp[temp['cluster'] == seg]['prop2_scores'].mean() sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',{},{});".format( 'PROP_' + propensity1, 'PROP_' + propensity2, sessionId, userId, i + 1, pop, seg, p1, p2) else: sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',{},NULL);".format( 'PROP_' + propensity1, 'PROP_' + propensity2, sessionId, userId, i + 1, pop, seg, p1) elif prop2_flg == 1: p2 
= temp[temp['cluster'] == seg]['prop2_scores'].mean() sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',NULL,{});".format( 'PROP_' + propensity1, 'PROP_' + propensity2, sessionId, userId, i + 1, pop, seg, p2) else: sql = "INSERT INTO SEGMENT_PROPENSITY_TMP(sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,{},{}) VALUES('{}','{}',{},'{}','{}',NULL,NULL);".format( 'PROP_' + propensity1, 'PROP_' + propensity2, sessionId, userId, i + 1, pop, seg) cursor.execute(sql) for j in range(0, 3): seg1 = str(d[i]) + str(j + 1) if prop1_flg == 1: if seg1 in prop1_seg: p1 = temp[temp['prop1_seg'] == seg1]['prop1_scores'].mean() sql = "INSERT INTO SUB_SEG_PROPENSITY_TMP(sessionId,userId,SEG_ID,PROPENSITY_TYPE,SEGMENT_TYPE,PROP_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format( sessionId, userId, i + 1, propensity1, seg1, p1, pop) cursor.execute(sql) if prop2_flg == 1: if seg1 in prop2_seg: p2 = temp[temp['prop2_seg'] == seg1]['prop2_scores'].mean() sql = "INSERT INTO SUB_SEG_PROPENSITY_TMP(sessionId,userId,SEG_ID,PROPENSITY_TYPE,SEGMENT_TYPE,PROP_VALUE,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}',{},'{}');".format( sessionId, userId, i + 1, propensity2, seg1, p2, pop) cursor.execute(sql) need_cols1 = final_data1['variable'].unique() for i in range(0, n): seg = d[i] attr = [] for j in need_cols1: value1 = profiling_data1[ (profiling_data1['cluster'] == seg) & (profiling_data1['variable'] == j)]['score'] if len(value1) == 0: value = 0 else: value = value1.values[0] if np.isnan(value): value = 0 value = "{0:.2f}".format(float(value)) attr_name = attr_data[attr_data['EXPERIAN_DB_Col_NAME'] == j]['VAR_NAME'].values[0] if attr_name not in attr: sql = "INSERT INTO SEGMENT_ATTR_QUAL_TMP (sessionId,userId,SEG_ID,POPULATION_TYPE,SEGMENT_TYPE,ATTRIBUTE_NAME,ATTRI_QUAL) VALUES('{}','{}',{},'{}','{}','{}',{});".format( sessionId, userId, i + 1, pop, seg, attr_name, value) 
cursor.execute(sql) attr.append(attr_name) if market_flg == 0: for i in range(0, n): seg = d[i] for l in range(0, 3): sub_seg = str(d[i]) + str(l + 1) if prop1_flg == 1: need_cols_1 = final_data_1[ final_data_1['cluster'].isin( [d[i] + '1', d[i] + '2', d[i] + '3'])]['variable'].unique() if sub_seg in prop1_seg: attr = [] for j in need_cols_1: value1 = profiling_data_1[ (profiling_data_1['cluster'] == sub_seg) & (profiling_data_1['variable'] == j )]['score'] if len(value1) == 0: value = 0 else: value = value1.values[0] if np.isnan(value): value = 0 value = "{0:.2f}".format(float(value)) attr_name = attr_data[ attr_data['EXPERIAN_DB_Col_NAME'] == j]['VAR_NAME'].values[0] if attr_name not in attr: sql = "INSERT INTO SUB_SEG_ATTR_QUAL_TMP (sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRI_QUAL,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}',{},'{}');".format( sessionId, userId, i + 1, sub_seg, propensity1, attr_name, value, pop) cursor.execute(sql) attr.append(attr_name) if prop2_flg == 1: need_cols_2 = final_data_2[ final_data_2['cluster'].isin( [d[i] + '1', d[i] + '2', d[i] + '3'])]['variable'].unique() if sub_seg in prop2_seg: attr1 = [] for k in need_cols_2: value2 = profiling_data_2[ (profiling_data_2['cluster'] == sub_seg) & (profiling_data_2['variable'] == k )]['score'] if len(value2) == 0: value = 0 else: value = value2.values[0] if np.isnan(value): value = 0 value = "{0:.2f}".format(float(value)) attr_name = attr_data[ attr_data['EXPERIAN_DB_Col_NAME'] == k]['VAR_NAME'].values[0] if attr_name not in attr1: sql = "INSERT INTO SUB_SEG_ATTR_QUAL_TMP (sessionId,userId,SEG_ID,SEGMENT_TYPE,PROPENSITY_TYPE,ATTRIBUTE_NAME,ATTRI_QUAL,POPULATION_TYPE) VALUES('{}','{}',{},'{}','{}','{}',{},'{}');".format( sessionId, userId, i + 1, sub_seg, propensity2, attr_name, value, pop) cursor.execute(sql) attr1.append(attr_name) con.commit() con.close() is_opened = 0 print "Success" print("--- %s seconds ---" % (time.time() - start_time)) except: if 
is_opened == 1: con.close() status = -1 print "Failed" print("--- %s seconds ---" % (time.time() - start_time)) traceback.print_exc(file=sys.stdout) return status
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from preprocess import preprocessing

# Columns of the survey data that are fed to the classifier.
cols = [
    'IFATHER', 'NRCH17_2', 'IRHHSIZ2', 'IIHHSIZ2', 'IRKI17_2', 'IIKI17_2',
    'IRHH65_2', 'IIHH65_2', 'PRXRETRY', 'PRXYDATA', 'MEDICARE', 'CAIDCHIP',
    'CHAMPUS', 'PRVHLTIN', 'GRPHLTIN', 'HLTINNOS', 'HLCNOTYR', 'HLCNOTMO',
    'HLCLAST', 'HLLOSRSN', 'HLNVCOST', 'HLNVOFFR', 'HLNVREF', 'HLNVNEED',
    'HLNVSOR', 'IRMCDCHP', 'IIMCDCHP', 'IRMEDICR', 'IIMEDICR', 'IRCHMPUS',
    'IICHMPUS', 'IRPRVHLT', 'IIPRVHLT', 'IROTHHLT', 'IIOTHHLT', 'HLCALLFG',
    'HLCALL99', 'ANYHLTI2', 'IRINSUR4', 'IIINSUR4', 'OTHINS', 'CELLNOTCL',
    'CELLWRKNG', 'IRFAMSOC', 'IIFAMSOC', 'IRFAMSSI', 'IIFAMSSI', 'IRFSTAMP',
    'IIFSTAMP', 'IRFAMPMT', 'IIFAMPMT', 'IRFAMSVC', 'IIFAMSVC', 'IRWELMOS',
    'IIWELMOS', 'IRPINC3', 'IRFAMIN3', 'IIPINC3', 'IIFAMIN3', 'GOVTPROG',
    'POVERTY3', 'TOOLONG', 'TROUBUND', 'PDEN10', 'COUTYP2', 'MAIIN102',
    'AIIND102', 'ANALWT_C', 'VESTR', 'VEREP',
]

# Read the training CSV and pass it through the project's preprocessing step.
train = preprocessing(pd.read_csv('criminal_train.csv'))
train_x, train_y = train[cols], train['Criminal']

# Fit an ExtraTrees model with default settings.
model = ExtraTreesClassifier()
model.fit(train_x, train_y)

# Print one (importance, column name) pair per feature, in `cols` order.
importance_pairs = [
    (score, name) for score, name in zip(model.feature_importances_, cols)
]
print(importance_pairs)
import cv2
import numpy as np
from matplotlib import pyplot as plt
from random import randint
from preprocess import preprocessing
from sklearn.cluster import KMeans

# Pick one dataset image at random (index drawn from 0..1000 inclusive)
# and run it through the project's preprocessing step.
new_image = preprocessing(cv2.imread('./dataset/%d.png' % randint(0, 1000)))

# Keep a copy of the preprocessed image on disk.
cv2.imwrite("./org.jpg", new_image)

# Coordinates of every pixel whose value is exactly 255 are the k-means samples.
np_img = np.argwhere(np.asarray(new_image) == 255)
print(np_img[:, 0])
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(np_img)

# Mark each cluster centre on a BGR copy of the image.
new_image = cv2.cvtColor(new_image, cv2.COLOR_GRAY2BGR)
for center in kmeans.cluster_centers_:
    # argwhere coordinates are (row, col); cv2.circle takes (x, y), so swap.
    cv2.circle(new_image, (int(center[1]), int(center[0])), 1, (0, 0, 255), 2)
cv2.imwrite("./kmeans.jpg", new_image)
# draw plot :
# Project-local helpers: dataset loading/splitting and the network builder.
from preprocess import preprocessing
from cnn import build_dcnn
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Flatten, Dense, Conv2D, MaxPooling2D, Dropout, BatchNormalization, LeakyReLU, Activation, TimeDistributed, LSTM, Bidirectional, GlobalAvgPool2D, GlobalMaxPool2D, ZeroPadding2D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, TensorBoard, LearningRateScheduler
# NOTE(review): every other Keras import here comes from tensorflow.keras;
# confirm that taking the backend from the standalone `keras` package is intended.
from keras import backend as K

# preprocessing() yields six values, unpacked as train / validation / test
# inputs and labels.
X_train, y_train, X_val, y_val, X_test, y_test = preprocessing('./data/fer2013.csv')

# Random augmentation for the training images: rotation, horizontal/vertical
# shifts, shear, zoom and horizontal flips.
train_datagen = ImageDataGenerator(
    rotation_range=15,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
)
train_datagen.fit(X_train)

# Nadam optimizer with explicitly spelled-out hyper-parameters.
optimizer = optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, name='Nadam')

# presumably (48, 48, 1) is the input image shape — confirm against build_dcnn
model = build_dcnn((48,48,1))
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 20 16:16:20 2019

@author: raahul46
"""
####DEPENDENCIES####
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from preprocess import preprocessing

####PREPROCESSING OF DATASET####
# preprocessing() supplies the train/test inputs and labels, in this order.
x_train, y_train, x_test, y_test = preprocessing()

####MODEL TRAINING####
print("...training model...")
classifier1 = RandomForestClassifier(
    n_estimators=300,
    criterion="entropy",
)
classifier1.fit(x_train, y_train)

####PREDICTION####
y_pred = classifier1.predict(x_test)

#SAVING THE MODEL(Pickle File)
filename = 'RF_model_final1.sav'
# Use a context manager so the file is flushed and closed even if pickling
# fails; the previous bare open() left the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(classifier1, model_file)
def st_app(df_read):
    """Render the Streamlit page for the "best books" analysis.

    Preprocesses *df_read* and then lays out, top to bottom: the optional
    data preview, the analysis section, the plots, and the place look-up
    with its map.

    Args:
        df_read: raw data handed to ``preprocessing(df=...)``. The result is
            expected to provide the ``Title_URL``, ``year_published``,
            ``minmax_norm_rating``, ``Author`` and ``Title`` columns that are
            read below.

    NOTE(review): the source reached review with its indentation collapsed;
    the nesting under the "Generate Data" checkbox and the "Submit" button
    is reconstructed — confirm against the original file.
    """
    # Shared look for every styled table on the page.
    table_style = {
        'background-color': 'black',
        'color': 'white',
        'border-color': 'blue'
    }

    set_png_as_page_bg('hog-1.png')
    st.sidebar.markdown(html_temp_1.format("Team : McGonagall"),
                        unsafe_allow_html=True)
    st.sidebar.selectbox("Choose", ['Project', 'About'])
    st.markdown(html_temp_1.format("Best books of the decade:2000 "),
                unsafe_allow_html=True)
    #columns = ['Title', 'Author', 'minirating', 'num_reviews', 'num_pages', 'awards', 'genres', 'series', 'year_published', 'places']
    df = preprocessing(df=df_read)
    df = df.drop("Title_URL", axis=1)

    my_bar = st.progress(0)
    if st.checkbox("Generate Data"):
        with st.spinner("Waiting .."):
            # Steps 0, 20, ..., 100: a short cosmetic progress animation.
            for p in range(0, 120, 20):
                time.sleep(0.1)
                my_bar = my_bar.progress(p)
        st.dataframe(df.style.set_properties(**table_style))
        st.success("Generated Dataframe")

    # ANALYSIS
    ## EX-1
    st.markdown(html_temp_3.format("ANALYSIS"), unsafe_allow_html=True)
    str1 = "Group the books by `original_publish_year` and get the mean of the `minmax_norm_ratings` of the groups."
    st.markdown(html_temp_2.format(str1), unsafe_allow_html=True)
    groupby_minmax = df.groupby('year_published').agg(
        {'minmax_norm_rating': 'mean'})
    col1, col2 = st.beta_columns([2, 4])
    # From here on groupby_minmax is a pandas Styler; both widgets receive it.
    groupby_minmax = groupby_minmax.style.set_properties(**table_style)
    with col1:
        st.dataframe(groupby_minmax)
    with col2:
        st.area_chart(groupby_minmax)

    # EX-2
    str2 = 'Create a function that given an author as input it returns her/his book with the highest minmax_norm_ratings.'
    st.markdown(html_temp_2.format(str2), unsafe_allow_html=True)
    col3, col4 = st.beta_columns(2)
    with col3:
        auth = st.selectbox("Select Author", df['Author'].unique().tolist())
    with col4:
        st.success(authors_best(auth, df))

    st.markdown(html_temp_3.format("VISUALIZATION"), unsafe_allow_html=True)
    # EX-1
    str3 = 'Create a 2D scatterplot with `pages` on the x-axis and `num_ratings` on the y-axis.'
    st.markdown(html_temp_2.format(str3), unsafe_allow_html=True)
    ex_1 = scatter_pages_num_rating(df)
    st.plotly_chart(ex_1)
    st.subheader("Same plot using Streamlit-line_vega_chart")
    plotly_line_vega(df)

    # EX-2
    str4 = 'Can you compute numerically the correlation coefficient of these two columns?'
    st.markdown(html_temp_2.format(str4), unsafe_allow_html=True)
    st.write(plot_correlation(df))

    # EX-3
    str5 = 'Visualise the `avg_rating` distribution.'
    st.markdown(html_temp_2.format(str5), unsafe_allow_html=True)
    ex_3 = avg_rating_dist(df)
    st.plotly_chart(ex_3)

    # EX-4
    str6 = 'Visualise the `minmax_norm_rating` distribution.'
    st.markdown(html_temp_2.format(str6), unsafe_allow_html=True)
    st.plotly_chart(minmax_norm_dist(df))

    # EX-5
    str7 = 'Visualise the `mean_norm_rating` distribution.'
    st.markdown(html_temp_2.format(str7), unsafe_allow_html=True)
    st.plotly_chart(mean_norm_dist(df))
    st.plotly_chart(all_three_dist(df))

    # EX-6
    str8 = 'Create one graph that represents in the same figure both `minmax_norm_rating` and `mean_norm_rating` distributions.'
    st.markdown(html_temp_2.format(str8), unsafe_allow_html=True)
    st.plotly_chart(norm_comparison(df))

    # EX-8
    str9 = 'Visualize the awards distribution in a boxplot and aggregtated bars.'
    st.markdown(html_temp_2.format(str9), unsafe_allow_html=True)
    st.plotly_chart(awards_boxplot(df))

    # EX-9
    str10 = 'Group the `books` by `original_publish_year` and get the mean of the `minmax_norm_ratings` of the groups.'
    st.markdown(html_temp_2.format(str10), unsafe_allow_html=True)
    st.plotly_chart(yearly_minmax_mean(df))

    # EX-10
    str11 = 'Make a scatterplot to represent minmax_norm_ratings in function of the number of awards won by the book.'
    st.markdown(html_temp_2.format(str11), unsafe_allow_html=True)
    st.plotly_chart(minmax_awards(df))
    # # st.pyplot(minmax_awards_2(df,fig_size=(10,10))) ## Old matplotlib plot

    # EX-7 Not working
    # str12='What is the best fit in terms of a distribution (normal, chi-squared...) to represent each of those graphs?'
    # st.markdown(html_temp_2.format(str12), unsafe_allow_html=True)
    # st.image("D:/Strive/st/goodreads_best2000-main/pngs/distribution_fit.png")

    # Explore maps in streamlit
    str_m = "Books and Places."
    st.markdown(html_temp_2.format(str_m), unsafe_allow_html=True)
    book = st.selectbox("Select Book", df['Title'].unique())
    df_res = pd.DataFrame(place_title(book, df))
    st.write(df_res.style.set_properties(**table_style))

    # Bare string statement kept exactly as written: presumably rendered as
    # markdown by Streamlit "magic" — TODO confirm before touching it.
    """### *Type the Location you received above*"""
    where = st.text_area("\n", " Type here...")
    if st.button("Submit"):
        geolocator = Nominatim(user_agent="a")
        location = geolocator.geocode(where)
        if location is None:
            # geocode() yields None when nothing matches; tell the user
            # instead of failing on `location.latitude`.
            st.warning("Location not found. Please check the spelling and try again.")
        else:
            map_df = pd.DataFrame.from_dict({
                "lat": [location.latitude],
                "lon": [location.longitude]
            })
            st.map(map_df)
# -*- coding: utf-8 -*-
import preprocess
from sklearn.linear_model import LinearRegression
import pandas as pd
from matplotlib import pyplot as plt

# The preprocessing object hands back the train/test split.
obj = preprocess.preprocessing()
x_train, x_test, y_train, y_test = obj.getData()

# Fit the regression and wrap its test predictions in a frame that shares
# y_test's index, so the two can be placed side by side.
lr = LinearRegression().fit(x_train, y_train)
predict = pd.DataFrame(data=lr.predict(x_test),
                       index=y_test.index,
                       columns=['tahmin'])
comparision = pd.concat([y_test, predict], axis=1)

# Sort by index; this does not break the month-sales pairing.
x_train, y_train = x_train.sort_index(), y_train.sort_index()

plt.title('Aylara Göre Satış Tahmin Grafiği', color='r')
plt.xlabel('Aylar', color='r')
plt.ylabel('Satışlar', color='r')

# Training data as points and as a line, then the predictions on the test inputs.
plt.scatter(x_train, y_train)
plt.plot(x_train, y_train)
plt.plot(x_test, predict)
xaxis.set_minor_formatter(dates.DateFormatter('%H')) # minor locator timme xaxis.set_tick_params(which='major', pad=18) # sätter ner datumen 18 steg plt.title("Nesting box activities") plt.ylabel("In/out movements per hour") save_plot() plt.show() # --------------------- TASK 7 (EXTRA) ---------------------------------------- def save_plot(): save = input('Press s to save graph as png file or enter to skip: ') if(save == 's'): plt.savefig('bird_movements.png', bbox_inches='tight') print("File saved as bird_movements.png in your working directory") if __name__ == '__main__': list_dates, list_data = preprocessing("birds.txt") convert_local_timezone() continue_loop = 'y' while (continue_loop is 'y'): graph_dates, graph_data, sun_indexes = compute_data() plot_graph(graph_dates, graph_data, sun_indexes) # day_night_cycle() continue_loop = input('Do you want to plot something more? [y/n]') if (continue_loop is not 'y' and continue_loop is not 'n'): print( "Since you apparently can't read, I'll shut it down for you.") continue_loop = 'n'
# import necessary packages
import os
import random
import cv2
from preprocess import preprocessing

# Gather every file path under each sub-folder of root_path; paths are
# sorted within a folder, folders come in os.listdir order.
root_path = "raw_leaf_data/"
all_image_files = []
for fold in os.listdir(root_path):
    fold_dir = os.path.join(root_path, fold)
    fold_files = sorted(
        os.path.join(fold_dir, name) for name in os.listdir(fold_dir))
    all_image_files.extend(fold_files)

# Pick one path at random, preprocess it and write the result to test.jpg.
r_no = random.randint(0, len(all_image_files) - 1)
test_image = all_image_files[r_no]
print('displaying', test_image)
random_save = preprocessing(test_image)
cv2.imwrite('test.jpg', random_save)
'CAIDCHIP', 'HLLOSRSN', 'MAIIN102', 'IRMCDCHP'
]  # closes a list literal that opens before this chunk


# to filter -1 rows
# (original note: "with preprocess 0.63387 w/0 0.67071" — presumably scores
# recorded with and without preprocessing; confirm)
def isnt(*cols):
    """Return True when none of the positional arguments equals -1."""
    for c in cols:
        if c == -1:
            return False
    return True


#train data
train = pandas.read_csv('criminal_train.csv')
train = preprocessing(train)  #0.68185 with preprocess
train_x = train[cols]
train_y = train['Criminal']

#Predict data
predict_data = pandas.read_csv('criminal_test.csv')
predict_data = preprocessing(predict_data)
predict_id = predict_data['PERID']
predict_x = predict_data[cols]

# Report the mean cross-validated accuracy, then fit on the full training set.
lr = LogisticRegression()
cv_score = np.mean(cross_val_score(lr, train_x, train_y, scoring='accuracy'))
print('CV score for class is {}'.format(cv_score))
lr.fit(train_x, train_y)
def setRepo():
    """Run the preprocess module on the path currently held by ``repo_entry``."""
    preprocess.preprocessing(repo_entry.get())
# NOTE(review): the two returns below close a function whose header precedes
# this chunk; their indentation is assumed — confirm against the original file.
            return False
    return True


def isnt2(*cols):
    """Return True when at least one positional argument equals -1."""
    for c in cols:
        if c == -1:
            return True
    return False


# Disabled row filters built on isnt / isnt2:
##train = train[train[cols].apply(lambda x: isnt(*x), axis=1)]
##test_missing = test[test[cols].apply(lambda x: isnt2(*x), axis=1)]
##test = test[test[cols].apply(lambda x: isnt(*x), axis=1)]
train = preprocessing(train)
test = preprocessing(test)

# Plain arrays for the model: inputs from `cols`, labels from "Criminal".
X_train = train[cols].values
y_train = train["Criminal"].values
X_test = test[cols].values

max_features = 200000  # max value of data
maxlen = len(cols)  #70 # len of input
embed_size = 300

# Input of length maxlen -> non-trainable embedding -> spatial dropout.
sequence_input = Input(shape=(maxlen, ))
x = Embedding(max_features, embed_size, trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
def train(hp_dict, args, data_dir, save_path):
    """Train a GA_reader model, validating periodically and testing.

    Builds the three batch loaders and the model, resumes from
    ``best_model.pkl`` or ``init_model.pkl`` in *save_path* when one exists,
    then runs ``NUM_EPOCHS`` epochs. Checkpoints are written to *save_path*.

    Args:
        hp_dict: hyper-parameter mapping; the keys read here are
            ``char_dim``, ``use_bin``, ``embed_file``, ``nhidden``,
            ``train_emb``, ``char_nhidden``, ``nlayers``, ``gating_fn``,
            ``use_feat`` and ``dropout``.
        args: not referenced in this function.
        data_dir: passed to ``preprocessing().preprocess``.
        save_path: directory for ``config.py`` and the model checkpoints.

    Returns:
        None.

    NOTE(review): the source reached review with its indentation collapsed.
    The nesting below (what sits inside the batch loop, and the test pass
    running once per epoch) is reconstructed — confirm against the original.
    """
    # Character-level inputs are enabled by a positive char_dim.
    use_chars = hp_dict['char_dim'] > 0
    # load data
    dp = preprocessing()
    data = dp.preprocess(data_dir, no_training_set=False, use_chars=use_chars)
    # build minibatch loader
    train_batch_loader = mini_batch_loader(data.training,
                                           BATCH_SIZE,
                                           sample_rate=1.0,
                                           len_bin=hp_dict['use_bin'])
    valid_batch_loader = mini_batch_loader(data.validation,
                                           BATCH_SIZE,
                                           shuffle=False,
                                           len_bin=hp_dict['use_bin'])
    test_batch_loader = mini_batch_loader(data.test,
                                          BATCH_SIZE,
                                          shuffle=False,
                                          len_bin=hp_dict['use_bin'])
    logging.info("loading word2vec file ...")
    embed_init, embed_dim = \
        load_word2vec_embeddings(data.dictionary[0], hp_dict['embed_file'],EMBED_SIZE)
    logging.info("embedding dim: {}".format(embed_dim))
    logging.info("initialize model ...")
    model = GA_reader(hp_dict['nhidden'], data.vocab_size, embed_dim,
                      embed_init, hp_dict['train_emb'], use_chars,
                      hp_dict['char_nhidden'], data.n_chars,
                      hp_dict['char_dim'], hp_dict['nlayers'],
                      hp_dict['gating_fn'], hp_dict['use_feat'],
                      hp_dict['dropout'])
    if USE_CUDA:
        model.cuda()
    logging.info("Running on cuda: {}".format(USE_CUDA))
    # training phase
    # Only parameters that require gradients are handed to the optimizer.
    opt = torch.optim.Adam(params=filter(lambda p: p.requires_grad,
                                         model.parameters()),
                           lr=LEARNING_RATE)
    # Keep a copy of the configuration next to the checkpoints.
    shutil.copyfile('config.py', os.path.join(save_path, 'config.py'))
    # # load existing best model
    if os.path.isfile(os.path.join(save_path, 'best_model.pkl')):
        print('loading previously best model')
        model.load_state_dict(
            torch.load(os.path.join(save_path, 'best_model.pkl')))
    # load existing train_model
    elif os.path.isfile(os.path.join(save_path, 'init_model.pkl')):
        print('loading init model')
        model.load_state_dict(
            torch.load(os.path.join(save_path, 'init_model.pkl')))
    logging.info('-' * 50)
    logging.info("Start training ...")
    best_valid_acc = best_test_acc = 0
    for epoch in range(NUM_EPOCHS):
        # new_max is assigned below but never read in this function.
        new_max = False
        # From the third epoch on, halve every group's learning rate at the
        # start of the epoch.
        if epoch >= 2:
            for param_group in opt.param_groups:
                param_group['lr'] /= 2
        model.train()
        # Running totals since the last progress log; `it` counts batches
        # in this epoch.
        acc = loss = n_examples = it = 0
        start = time.time()
        for dw, dw_m,qw,qw_m,dt,qt,tt,tm, \
                answear, candidate, candi_m, cloze_pos, fnames in train_batch_loader:
            n_examples += dw.shape[0]
            feat = feat_fuc(dw, qw)
            #-------train-------#
            dw, dw_m,qw,qw_m,dt,qt,tt,tm, answear, candidate, candi_m, cloze_pos,feat=to_vars(\
                [dw, dw_m,qw,qw_m,dt,qt,tt,tm, answear, candidate, candi_m, cloze_pos,feat], use_cuda=USE_CUDA)
            loss_, acc_ = model(dw, dw_m, qw, qw_m, dt, qt, tt, tm, answear,
                                candidate, candi_m, cloze_pos,
                                feat)  # tensor.float size 1
            #print(acc_.cpu().data.numpy())
            loss += loss_.cpu().data.numpy()[0]  # numpy [1]
            acc += acc_.cpu().data.numpy()[0]
            it += 1
            opt.zero_grad()
            loss_.backward()
            clip_grad_norm(parameters=filter(lambda p: p.requires_grad,
                                             model.parameters()),
                           max_norm=GRAD_CLIP)
            opt.step()
            # Log every print_every batches and on the epoch's last batch.
            if it % print_every == 0 \
                    or it % len(train_batch_loader) == 0:
                spend = (time.time() - start) / 60
                statement = "Epoch: {}, it: {} (max: {}), "\
                    .format(epoch, it, len(train_batch_loader))
                statement += "loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                    .format(loss / print_every, acc / n_examples, spend)
                logging.info(statement)
                # Reset the running totals for the next logging window.
                del acc, loss, n_examples
                acc = loss = n_examples = 0
                start = time.time()
                # save every print
                torch.save(model.state_dict(),
                           os.path.join(save_path, 'init_model.pkl'))
                # torch.save(model,os.path.join(save_path,'init_model.pkl'))
            #-------valid-------#
            if it % eval_every == 0:
                start = time.time()
                model.eval()
                # test_loss / test_acc hold validation results here.
                test_loss, test_acc = evaluate(model, valid_batch_loader,
                                               USE_CUDA)
                spend = (time.time() - start) / 60
                statement = "Valid loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
                    .format(test_loss, test_acc, spend)
                logging.info(statement)
                if best_valid_acc < test_acc:
                    best_valid_acc = test_acc
                    new_max = True
                    # store best valid model
                    torch.save(model.state_dict(),
                               os.path.join(save_path, 'best_model.pkl'))
                    #torch.save(model,os.path.join(save_path,'best_model.pkl'))
                logging.info("Best valid acc: {:.3f}".format(best_valid_acc))
                # Back to training mode after evaluation.
                model.train()
                start = time.time()
        #-------test-------#
        start = time.time()
        model.eval()
        test_loss, test_acc = evaluate(model, test_batch_loader, USE_CUDA)
        spend = (time.time() - start) / 60
        logging.info("Test loss: {:.3f}, acc: {:.3f}, time: {:.1f}(m)"\
            .format(test_loss, test_acc, spend))
        if best_test_acc < test_acc:
            best_test_acc = test_acc
        logging.info("Best test acc: {:.3f}".format(best_test_acc))
def multi_process_contrast(input_dir):
    """Preprocess, then contrast-enhance, each ``*.png`` directly in *input_dir*.

    Every matching path is printed before the two steps are applied to it.
    """
    png_pattern = os.path.join(input_dir, '*.png')
    for png_path in glob.glob(png_pattern):
        print(png_path)
        preprocessing(png_path)
        contrast_enhancement(png_path)
def save_model(model, model_path):
    """Write the model's state dict to *model_path*."""
    weights = model.state_dict()
    torch.save(weights, model_path)


def load_model(model, model_path, use_cuda=False):
    """Load the state dict stored at *model_path* into *model* and return it.

    The checkpoint is mapped to ``cuda:0`` only when *use_cuda* is set and
    CUDA is available; otherwise it is mapped to the CPU.
    """
    wants_gpu = use_cuda and torch.cuda.is_available()
    target = 'cuda:0' if wants_gpu else 'cpu'
    state = torch.load(model_path, target)
    model.load_state_dict(state)
    return model


# Build the datasets and embeddings from the CSV / embedding files.
data_dict = preprocessing("data/train.csv", "data/test.csv",
                          "embeddings/malayalam200.txt")
train_data, val_data, test_data = (data_dict["data_train"],
                                   data_dict["data_val"],
                                   data_dict["data_test"])

classifier = MalayalamModel(data_dict["pretrained_embeddings"],
                            data_dict["padding_idx"])

# First GPU when CUDA is available, CPU otherwise.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

batch_size = 256
epochs = 20
optimizer = torch.optim.Adam(classifier.parameters(), lr=1e-2)
def analyze():
    """Command-line entry point for the CST analysis pipeline.

    Reads ``--d`` (input directory), ``--o`` (output directory) and the
    optional ``--e`` from the command line, loads every file in the input
    directory except ``.DS_Store``, and runs the stages in order:
    preprocessing, pair selection, rule application, feature extraction and
    classification. A file that cannot be read is reported and the error is
    re-raised.
    """
    parser = argparse.ArgumentParser(description='CST Parser command line')
    # (flag, help text, required, destination attribute)
    option_specs = (
        ("--d", "path of directory with the texts", True, "directory_path"),
        ("--o", "path of directory to store results", True, "analysis_path"),
        ("--e", "embed sentence text into cst analysis", False, "embed"),
    )
    for flag, help_text, is_required, dest_name in option_specs:
        parser.add_argument(flag,
                            help=help_text,
                            required=is_required,
                            action="store",
                            dest=dest_name)
    args = parser.parse_args()

    texts = []
    logging.info('Getting documents ...')
    for entry in os.listdir(args.directory_path):
        # if macOS
        if entry == '.DS_Store':
            continue
        entry_path = os.path.join(args.directory_path, entry)
        try:
            with open(entry_path, 'r') as handle:
                texts.append(handle.read())
        except:
            print(
                "The following error occurred while opening file {}: ".format(
                    entry_path),
                sys.exc_info()[0])
            raise

    # Prepare XML data with files tokenized by sentence
    logging.info('Preprocessing documents ...')
    generated_files = preprocessing(texts, args.analysis_path)

    # Select pairs of sentence to be related using word overlap
    logging.info('Selecting candidate sentence pairs ...')
    selected_pairs = select_pairs(generated_files, 0.12)

    # Apply rules on selected pairs
    logging.info('Applying rules ...')
    apply_rules(selected_pairs, args.analysis_path, args.embed)

    # Extract features
    logging.info('Extracting features ...')
    features = extract_features(selected_pairs, args.analysis_path)

    # Applying classifier
    logging.info('Applying classifier ...')
    multiclass_classify(selected_pairs, features, args.analysis_path,
                        args.embed)

    logging.info('Done! CST analysis out at {}'.format(args.analysis_path))