def download_binary(self, binary_name, download_url):
    if not os.path.exists(self.binary_path):
        utilities.mkdir(self.binary_path)
    binary_file_path = self.get_required_binary_path(binary_name)
    utilities.log_info("Download url: %s" % download_url)
    with open(binary_file_path, 'wb') as file:
        response = requests.get(download_url, stream=True)
        total = response.headers.get('content-length')
        # Treat a missing or implausibly small content-length as a failure
        # (e.g. an error page returned instead of the binary).
        if total is None or int(total) < 100000:
            utilities.log_error(
                "Download binary %s failed, please check the existence of binary version %s"
                % (binary_name, self.version))
            return False
        utilities.log_info(
            "* Download %s from %s\n* size: %fMB, dst_path: %s"
            % (binary_name, download_url,
               float(total) / float(1000000), binary_file_path))
        downloaded = 0
        total = int(total)
        # Stream the body in roughly 1000 chunks (at least 1MB each) while
        # rendering a 50-character progress bar.
        for data in response.iter_content(
                chunk_size=max(int(total / 1000), 1024 * 1024)):
            downloaded += len(data)
            file.write(data)
            done = int(50 * downloaded / total)
            utilities.log_info("Download percent: %d%%" % (downloaded / total * 100))
            sys.stdout.write('\r[{}{}]'.format('█' * done, '.' * (50 - done)))
            sys.stdout.flush()
        sys.stdout.write('\n')
    utilities.log_info("* Download %s from %s success" % (binary_name, download_url))
    return True
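# Hardening note (a sketch, not part of the original code): requests.get()
# above has no timeout, and the HTTP status code is never checked, so a 404
# error page can be mistaken for a (small) binary. Both issues can be
# addressed with the standard requests API:
#
#     response = requests.get(download_url, stream=True, timeout=60)
#     response.raise_for_status()  # raises on 4xx/5xx responses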
def main():
    # Load the raw documents and transform them to LDA format.
    samples = preprocessing.load_dataset_from_disk(
        'dataset/the_thao/2', remove_tags=True)
    lda_data, vocab = preprocessing.build_lda_data(samples)

    # Run configuration (hard-coded here rather than read from the environment).
    model_folder = 'models/ml-ope'
    tops = 10

    # Create the model folder if it doesn't exist.
    utilities.mkdir('models')
    utilities.mkdir(model_folder)

    # Build algorithm settings from corpus size and vocabulary size.
    settings = build_setting(len(lda_data), len(vocab))

    # Run the ML-OPE algorithm to get the document-topic (theta) and
    # topic-word (beta) matrices.
    runmlope = run_ML_OPE.runMLOPE(lda_data, settings, model_folder, tops)
    theta, beta = runmlope.run()

    # Merge duplicate topics, then drop near-duplicate documents.
    duplicate_topics = get_duplicate_topics(beta, topn=20)
    new_theta = rebuild_theta(theta, duplicate_topics)
    unique_idx, unique_samples = remove_duplicate(new_theta, samples)
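# The deduplication helpers used in main() are project-specific. Below is a
# minimal sketch of one plausible implementation, assuming duplicate topics
# are detected by overlap of their top-n words and merged by summing their
# theta columns. The names and the overlap threshold (0.8) are illustrative
# assumptions, not taken from the original code.
import numpy as np

def get_duplicate_topics_sketch(beta, topn=20, overlap=0.8):
    """Group topics whose top-`topn` word sets overlap by at least `overlap`."""
    top_words = [set(np.argsort(row)[-topn:]) for row in beta]
    groups, assigned = [], set()
    for i in range(len(top_words)):
        if i in assigned:
            continue
        group = [i]
        for j in range(i + 1, len(top_words)):
            if j not in assigned and len(top_words[i] & top_words[j]) >= overlap * topn:
                group.append(j)
                assigned.add(j)
        assigned.add(i)
        groups.append(group)
    return groups

def rebuild_theta_sketch(theta, groups):
    """Collapse the theta columns of each duplicate-topic group into one column."""
    return np.column_stack([theta[:, g].sum(axis=1) for g in groups])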
# Module-level imports used by the analysis functions below. `u` and `c` are
# project-local modules (file/plot helpers and configuration, respectively).
import re
import time

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


def sim_mat_dist(**kwargs):
    '''
    Save the similarity matrix to a CSV file, with each topic's median and
    second-highest cosine similarity in the last two columns. Also plot the
    histogram and box plot of the per-topic median and max cosim.
    '''
    data = np.load(kwargs.get("file", "sim_mat"))
    labels = data['labels']
    sim_mat = data['sim_mat']
    sim_mat = np.round(sim_mat, decimals=6)
    medians = np.median(sim_mat, axis=1)
    maxs = []
    # Clamp rounding artifacts: cosine similarity cannot exceed 1.
    sim_mat[sim_mat > 1] = 1
    sim_mat_sorted = np.copy(sim_mat)
    # The largest value in each row is the topic's similarity with itself (1),
    # so take the second-highest value as the max.
    for row in sim_mat_sorted:
        row.sort()
        maxs.append(row[-2])
    print("saving sim_mat in stats_data/sim_mat/" + c.query_name + ".csv")
    u.mkdir("stats_data/sim_mat/")
    with open("stats_data/sim_mat/" + c.query_name + "_sim_mat.csv", "w") as f:
        f.write("," + ",".join(labels) + ",median,max\n")
        for i, med in enumerate(medians):
            f.write(labels[i] + ","
                    + ",".join(map("{:.6f}".format, sim_mat[i]))
                    + "," + str(med) + "," + str(maxs[i]) + "\n")
    u.box_plot(maxs, "stats_data/sim_mat/box_plot_" + c.query_name + "_maxcosim.png")
    u.hist_plot(maxs, "stats_data/sim_mat/hist_plot_" + c.query_name + "_maxcosim.png",
                xlabel='Maximum cosine similarity', ylabel='Number of topics', xticks=0)
    u.box_plot(medians, "stats_data/sim_mat/box_plot_" + c.query_name + "_median.png")
    u.hist_plot(medians, "stats_data/sim_mat/hist_plot_" + c.query_name + "_median.png",
                xlabel='Median cosine similarity', ylabel='Number of topics', xticks=0)
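# Vectorized alternative (a sketch) to the per-row sort above: np.partition
# moves each row's two largest values into the last two slots without a full
# sort, so column -2 holds the max similarity excluding the self-similarity 1.
def second_highest_per_row(sim_mat):
    return np.partition(sim_mat, -2, axis=1)[:, -2]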
def column_analysis(**kwargs):
    fpath = kwargs.get("file")
    path = u.mkdir("./stats_data/desc/")
    df = pd.read_csv(fpath)
    # Plot the distribution of inter-topic median cosine similarity per domain.
    for domain in c.domains:
        df2 = df[df.domain == domain]
        u.box_plot(df2["med.cosim.inter"],
                   path + "box_med.cosim.inter_" + domain + ".png")
        u.dist_plot(df2["med.cosim.inter"],
                    path + "dist_med.cosim.inter_" + domain + ".png")
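# The plotting helpers hanging off `u` are project-local. Below is a minimal
# sketch of what two of them might look like with matplotlib; the names mirror
# the calls above, and the meaning of the `xticks` flag is an assumption, not
# taken from the source.
import matplotlib
matplotlib.use("Agg")  # render to files without a display
import matplotlib.pyplot as plt

def box_plot_sketch(values, out_path):
    fig, ax = plt.subplots()
    ax.boxplot(values)
    fig.savefig(out_path)
    plt.close(fig)

def hist_plot_sketch(values, out_path, xlabel='', ylabel='', xticks=0):
    fig, ax = plt.subplots()
    ax.hist(values)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    if xticks:
        ax.tick_params(axis="x", rotation=90)  # assumed: flag rotates tick labels
    fig.savefig(out_path)
    plt.close(fig)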
def desc_stat_data(**kwargs):
    for fpath in kwargs.get("files"):
        # Path where the output files will be saved.
        path = u.mkdir("./stats_data/desc/")
        df = pd.read_csv(fpath)
        r = re.compile(r'/(.*?)\.csv')
        fname = r.search(fpath).group(1).split("-")[0]
        # correlation between columns of stat_all
def create_domain_df():
    '''
    Build the combined per-domain data frame and write it to CSV.
    '''
    start_time = time.time()
    print("Creating dataframe")
    # Make sure the output directory exists before writing any CSVs.
    u.mkdir("./stats_data")
    stat_data = pd.DataFrame()
    for d in c.domains:
        df = collect_data(d)
        print(d, df.shape)
        df.to_csv('stats_data/' + d + '_data.csv', sep=',', index=False)
        # if not stat_data: stat_data = pd.DataFrame(columns=df.columns)
        stat_data = pd.concat([stat_data, df], ignore_index=True)
    # print("all", stat_data.shape)
    df = collect_network_inter()
    stat_data = pd.merge(stat_data, df, how="inner")
    stat_data.to_csv('stats_data/all_data.csv', sep=',', index=False)
    print("--- time for dataframe generation "
          + str((time.time() - start_time) / 60) + " minutes ---")
def reg_analysis(**kwargs):
    for fpath in kwargs.get("files"):
        # Path where the output files will be saved.
        path = u.mkdir("./stats_data/desc/")
        df = pd.read_csv(fpath)
        r = re.compile(r'/(.*?)\.csv')
        fname = r.search(fpath).group(1).split("-")[0]
        fname = fname[:3] + "." + fname[3:]
        # Predict the target column from all other columns (except 'id').
        cols = [col for col in df.columns if col != fname and col != 'id']
        print("Running Regression Analysis for", fname)
        y = np.array(df[fname])
        X = np.array(df[cols])
        # print("X", X.shape, "y", y.shape)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)
        # print("X_train", X_train.shape, "y_train", y_train.shape)
        regr = RandomForestRegressor(max_depth=2, random_state=0)
        regr.fit(X_train, y_train)
        y_pred = regr.predict(X_test)
        # Rank features by importance (ascending).
        zipped = sorted(zip(regr.feature_importances_, df[cols].columns),
                        key=lambda t: t[0])
        for imp, f in zipped:
            print(f, ":", imp)
        print("-------- R2 score", regr.score(X_test, y_test), "----------")
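# Example invocation (a sketch): reg_analysis() derives the target column from
# each file name, so a file named e.g. "medcosim-train.csv" would target the
# "med.cosim" column. The path below is illustrative, not taken from the source.
#
#     reg_analysis(files=["stats_data/medcosim-train.csv"])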