Example #1
def run(idir, odir):
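    # assumes `import os` plus the project helpers read_json / output_authors / output_venues / output_papers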
    pfile = idir + "/papers.json"
    afile = idir + "/authors.json"
    vfile = idir + "/venues.json"
    authors = read_json(afile)
    venues = read_json(vfile)
    papers = read_json(pfile)
    # assert must take the condition and message directly; a parenthesized
    # (condition, message) pair is a non-empty tuple, which is always truthy
    assert os.path.isdir(odir), odir + " should be a directory"
    output_authors(authors, odir)
    output_venues(venues, odir)
    output_papers(papers, odir)
Example #2
 def on_click(self):
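     # Check the entered username/password against user.json, which (as built in
     # Example #3) maps each username to a list of [username, password] pairs.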
     self.close()
     list_json = read_json()
     flag = False
     for item in list_json:
         for i in list_json[item]:
             if i[0] == self.user_text.text() and i[1] == self.password_text.text():
                 flag = True
     if flag:
         self.main = MainWindow()
         self.main.show()
     else:
         self.error = QtWidgets.QErrorMessage()
         self.error.showMessage('Tên đăng nhập và mật khẩu chưa đúng !')  # "Incorrect username or password!"
         self.error.setWindowTitle('Error')
Example #3
 def add(self):
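     # Register a new user and persist it to user.json (assumes json, read_json,
     # error_form, main2 and QtWidgets are imported elsewhere in this module)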
     if not self.user_text.text():
         self.error = error_form("Hãy nhập vào username")  # "Please enter a username"
         self.error.show()
     else:
         data = read_json()
         if self.password_text.text() == self.cfm_password.text():
             data.update({
                 self.user_text.text():
                 [[self.user_text.text(),
                   self.password_text.text()]]
             })
             with open('user.json', mode='w', encoding='utf-8') as f:
                 json.dump(data, f)
             self.success = QtWidgets.QErrorMessage()
             self.success.showMessage('Đăng ký thành công !')  # "Registration successful!"
             self.close()
             self.login = main2.Login()
             self.login.show()
         else:
             self.error = error_form('Mật khẩu chưa trùng khớp')  # "Passwords do not match"
             self.error.show()
Example #4
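# Excerpt from a larger engagement-modeling script: imports (ROOT, sys, CountVectorizer,
# sklearn's text module) and variables such as Ngram_Range_Low and Ngram_Range_High are
# presumably defined earlier in the full file and are not shown in this excerpt.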
#build vocabulary matrix
ROOT.gStyle.SetTitleSize(.05, "XY")

Decode_Error = "ignore"

#widget_selection = "assetallocationcalculator"
widget_selection = "budgetcalculator"
#widget_selection = "careercalculator"

if widget_selection=="budgetcalculator": hist_title="Ad Type 1"
if widget_selection=="careercalculator": hist_title="Ad Type 2"
if widget_selection=="homeaffordability": hist_title="Ad Type 3"
if widget_selection=="assetallocationcalculator": hist_title="Ad Type 4"
figures_folder = "figures/"+widget_selection + "/"
corpus, engagement_rate, page_stats = read_json("web_text_v12_data_set_1_2.json",widget_selection, 0, 0)

vectorizer = CountVectorizer(analyzer="word", stop_words="english", decode_error=Decode_Error, ngram_range=(1,1))#, min_df=0.15)
vectorizer_bigram = CountVectorizer(analyzer="word", stop_words="english", decode_error=Decode_Error, ngram_range=(2,2))#, min_df=0.15)

X = vectorizer.fit_transform(corpus)
X_2 = vectorizer_bigram.fit_transform(corpus)

corpus_array = X.toarray()
corpus_array_bigram = X_2.toarray()

switch=1
setoptstat=1111
ROOT.gStyle.SetOptStat(0)
ROOT.gStyle.SetStatX(0.85)
ROOT.gStyle.SetStatY(0.90)
#Modeling metrics
Max_Iter=1000
Fit_Intercept=False
Return_Models=False
Positive=True
Verbose=False
N_Jobs=-1
N_Alphas=1000
Normalize=False
Alphas=[0]
Tol=0.001
Min_DF=float(sys.argv[4])

N_Estimators=10

corpus, engagement_rate, page_stats = read_json("web_text_v9c.json",widget_selection)

my_words = ["considering","proper","agree", "soon", "changing", "wish", "flickr", "protect","including", "example", "want", "concept", "photo", "like" ,"comes", "things", "com", "don", "help"]#, "improve wisegeek", "related article", "u'improve wisegeek"]

my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

#build vocabulary matrix
print "size of corpus:%d" % len(corpus)

vectorizer = CountVectorizer(analyzer="word", stop_words=set(my_stop_words), decode_error=Decode_Error, 
				ngram_range=(Ngram_Range_Low,Ngram_Range_High),  min_df=Min_DF)#, max_df=0.85)
vectorizer_binary = CountVectorizer(analyzer="word", stop_words=set(my_stop_words), decode_error=Decode_Error, 
			ngram_range=(Ngram_Range_Low,Ngram_Range_High), binary=True,  min_df=Min_DF)#, max_df=0.85)

vectorizer_unigram = CountVectorizer(analyzer="word", stop_words=set(my_stop_words), decode_error=Decode_Error, 
				ngram_range=(1,1),  min_df=Min_DF)#, max_df=0.85)

N_Estimators=1000

DSampling=False
DSampling_Rate=0.50

PageLoaded = int(sys.argv[5])
WidgetViewed = int(sys.argv[6])
ite = int(sys.argv[7])

RSeed=ite

input_json = "web_text_v12_data_set_1_2.json"

print "%s, %s, %s, %s, %s, %s" % (widget_selection, Ngram_Range_Low, Ngram_Range_High, Min_DF, PageLoaded, WidgetViewed)

corpus, engagement_rate, page_stats = read_json(input_json, widget_selection, PageLoaded, WidgetViewed)

print "size of corpus:%d" % len(corpus)
print "size of corpus target:%d" % len(engagement_rate)
print len(engagement_rate)/4.

#Test_Size=1./(CV-1)
Test_Size=0.50

print "Relative test data size:%.3g" % Test_Size
#ADDITIONAL STOPWORDS
my_words = ["considering","proper","agree", "soon", "changing", "wish", "flickr", "protect","including", 
		"example", "want", "concept", "photo", "like" ,"comes", "things", "com", "don", "help"] 

my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)
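
# Histogram the percentage error between calculated and experimental cell volumes,
# and the cohesive energies, for ICSD entries loaded via read_json.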
from read_json import *
import sys
from pylab import *
import numpy as np

mset = read_json(sys.argv[1])

volerror = []
for atoms in mset:
    if atoms.exptvol is not None:
        volerror.append((atoms.calcvol - atoms.exptvol) / atoms.exptvol * 100)
print np.mean(volerror), np.min(volerror), np.max(volerror)

fig = figure(figsize=(7.5, 5.2))
hist(volerror, 50)
xlabel("Error in calculated volume [%]", fontsize=18)
ylabel("Number of ICSD entries", fontsize=18)
savefig('volume_error.pdf', bbox_inches='tight')

#################################################
Eref = attribute_tolist(mset, attr="Eref")
fig = figure(figsize=(7.5, 5.2))
hist(Eref, 50)
xlabel("Cohesive Energy [eV]", fontsize=18)
ylabel("Number of ICSD entries", fontsize=18)
savefig('hist_Ecoh.pdf', bbox_inches='tight')

#show()
Example #9
from read_json import *
from split_dataset import *
from ml import *
from visualize import *
from pylab import *
#np.random.seed(0)


elmethod = "composition"
sigma = 13 ; lamda = 0.001 ; kernel = "gaussian"
maxrun = 1

MAEtrain = []
MAEcross = []
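# For each run: load the formation-energy dataset, split off a cross-validation set,
# fit kernel ridge regression (KRR), and record the train / cross-validation MAE.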
for i in range(maxrun):
    mset = read_json("include_ML_natoms_30/data.json", energytype="formation")
    mcross, mtrain = get_testset(mset)
#    mcross = read_json("MnSi.json", energytype="formation")
    #mtest, mset = get_testset(mset)
    #mtrain, mcross, mset = get_train_validation_set(mset)
    elmap = get_elements_map(mset)

    result = krr_regression(mtrain, mcross, sigma, lamda, kernel=kernel, elmap=elmap, elmethod=elmethod,
                            loadalpha=False, alphanum=i)
    MAEtrain.append(result[0])
    MAEcross.append(result[1])
    print result[0], result[1]

#hist(MAEtrain, 10)
#hist(MAEcross, 10)
#show()
def feature_extraction_model(widget_selection, Ngram_Range_Low, Ngram_Range_High, Min_DF, PageLoaded, WidgetViewed, ite, Find, input_json_name):
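	# Build a bag-of-words representation of the page text and train three classifiers
	# (random forest, SGD, logistic regression) to predict non-zero engagement; returns
	# the per-model results, scores, and the data used for training and testing.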

	print "###FEATURE EXTRACTION MODEL###"

	figures_folder = "figures/"+widget_selection + "/"

	#NLP knobs
	Decode_Error='ignore'

	#CV metrics
	CV = 3
	N_Jobs = 2 

	#Forest parameters
	N_Estimators = 10 #Increasing this dramatically slows code
	Max_Iter = 100
	Criterion = 'entropy'	
	
	#Logistic parameters
	Tol = 0.001
	Logistic_Penalty = 'l2'

	#SGD parameters
	Loss = 'hinge'
	SGD_Penalty = 'elasticnet' 

	DSampling = False
	DSampling_Rate = 0.50 #Relative downsampling percentage

	Scoring_2 = 'f1'
	if ite >= 0: RSeed = ite
	else: RSeed = random.randint(1, 100000)

	print "Widget:%s, nL:%s, nH:%s, mDF:%s, pLoad:%s, wView:%s, Seed:%s, f:%s" % \
		(widget_selection, Ngram_Range_Low, Ngram_Range_High, Min_DF, PageLoaded, WidgetViewed, RSeed, Find)

	corpus, engagement_rate, page_stats = read_json(input_json_name, widget_selection, PageLoaded, WidgetViewed)

	print "Size of corpus:%d" % len(corpus)
	print "Size of corpus target:%d" % len(engagement_rate)

	if Find > 0 : Test_Size = Find
	elif Find == 0 : Test_Size = 0
	else: Test_Size = 0.5
	
	print "Relative test data size:%.3g" % Test_Size

	########UNIGRAM STOPWORDS##########################
	############APPEND TO LIST TO ADD##################
	my_words = ["says", "comment", "jeff", "rose", "2015", "considering", "proper","agree", "soon", "changing", "wish", "flickr", "protect", "including", 
			"example", "want", "concept", "photo", "like" ,"comes", "things", "com", "don", "help"] 

	my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

	###########BUILD VOCABULARY MATRIX#####################
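	# Bag-of-words counts over the corpus with the custom stop-word list, the requested
	# n-gram range, and a minimum document frequency of Min_DF.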

	vectorizer = CountVectorizer(analyzer="word", stop_words=set(my_stop_words), decode_error=Decode_Error, 
					ngram_range=(Ngram_Range_Low,Ngram_Range_High),  min_df=Min_DF)#, max_df=0.85)

	X = vectorizer.fit_transform(corpus)

	corpus_array = X.toarray()

	print "Number of zeros:%d" % engagement_rate.count(0)
	print "Total number of engagements:%d" % len(engagement_rate)
	print "Total number of non-zero:%d" % (len(engagement_rate) - engagement_rate.count(0))
	zero_rate = float(engagement_rate.count(0))/float(len(engagement_rate))

	print "Zero rate:%.3g" % zero_rate

	#######DOWNSAMPLING BEGIN############
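	# Enable downsampling of zero-engagement rows when their share exceeds 1 - DSampling_Rate.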
	if zero_rate>(1-DSampling_Rate): DSampling=True

	training_matrix = np.array(corpus_array)
	engagement_matrix = np.array(engagement_rate)

	print "Training matrix:%d, engagement matrix:%d" % (len(training_matrix), len(engagement_matrix))
	
	total_matrix =  np.column_stack((training_matrix, engagement_matrix))
	matrix_of_zeros = []
	matrix_of_nonzeros = []

	for i in range(len(total_matrix)):
		if total_matrix[i][len(total_matrix[i])-1]>0:
			matrix_of_nonzeros.append(total_matrix[i])
		else: matrix_of_zeros.append(total_matrix[i])


	#print matrix_of_nonzeros
	if DSampling:
		target_downsampling = DSampling_Rate
		downsampling=int(np.round((len(matrix_of_nonzeros)/target_downsampling)*(1-target_downsampling)))
		#print downsampling
		# note: despite the name, this variable holds the downsampled zero-engagement rows
		downsampled_nonzeros=resample(matrix_of_zeros, n_samples=downsampling, random_state=0, replace = False)

		print len(downsampled_nonzeros)

		downsampled_total = np.concatenate((downsampled_nonzeros,matrix_of_nonzeros))
		downsampled_engagement = downsampled_total[:,(len(downsampled_total[0])-1):(len(downsampled_total[0]))]
		downsampled_training = downsampled_total[:,:-1]
		corpus_array = downsampled_training
	
		temp_y = []
		for i in range(len(downsampled_engagement)):
			temp_y.append(downsampled_engagement[i][0])	
		engagement_rate = temp_y
		print "Resampled engagement length %d" % len(engagement_rate)

	####NORMALIZATION AND TF-IDF WEIGHTING BEGIN#######
	transformer = TfidfTransformer()
	tfidf = transformer.fit_transform(corpus_array)
	tfidf_array = tfidf.toarray()

	corpus_array = np.array(tfidf_array)

	########NORMALIZATION AND TF-IDF WEIGHTING END#########

	#####SPLITTING TRAINING AND TEST DATASETS BEGIN##########
	x_train, x_test, y_train, y_test = train_test_split(corpus_array, engagement_rate, test_size=Test_Size, random_state=ite)

	#print x_train.shape
	#print y_train.shape

	print "Zeroes in engagement vector y_train:%d" % list(y_train).count(0)
	print "Zeroes in engagement vector in y_test:%d" % list(y_test).count(0)
	print "Total in engagement vector in y_train:%d" % len(list(y_train))
	print "Total in engagement vector in y_test:%d" % len(list(y_test))

	X = x_train

	y = y_train

	print "Size of training X:%d, Training y:%d, Test X:%d, Test y:%d" % (x_train.shape[0], y_train.shape[0], x_test.shape[0], y_test.shape[0])
	#####SPLITTING TRAINING AND TEST DATASETS END##########


	#####MODEL TRAINING##################

	number_of_features = len(vectorizer.get_feature_names())
	list_of_features = vectorizer.get_feature_names()
	print "Number of features :%d" % number_of_features

	binary_y = np.array(make_binary(y))
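	# make_binary (project helper) presumably collapses the engagement values to a 0/1 label;
	# binary_y is the classification target used by the three models below.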

	coef_path_SGD_cv = SGDClassifier(loss = Loss, penalty = SGD_Penalty) 
	coef_path_logistic_cv = LogisticRegression(penalty = Logistic_Penalty, tol = Tol)
	coef_path_forest_cv = RandomForestClassifier(n_estimators = N_Estimators, random_state = ite, criterion = Criterion, max_features = number_of_features)


	coef_path_forest_cv.fit(X,binary_y)
	coef_path_SGD_cv.fit(X,binary_y)
	coef_path_logistic_cv.fit(X,binary_y)

	####MODEL CROSS VALIDATION#########
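	# 3-fold cross-validation (CV) scored with F1 (Scoring_2) for each of the three classifiers.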

	forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs = N_Jobs, cv = CV, scoring = Scoring_2)
	SGD_cv_score = cross_validation.cross_val_score(coef_path_SGD_cv, X, binary_y, n_jobs = N_Jobs, cv = CV, scoring = Scoring_2)
	logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y, n_jobs = N_Jobs, cv = CV, scoring = Scoring_2)

	forest_prediction_training = coef_path_forest_cv.predict(X)

	forest_results_parameters = [ forest_prediction_training, coef_path_forest_cv.get_params, coef_path_forest_cv.feature_importances_, 
					 coef_path_forest_cv.predict(x_test), np.array(make_binary(y_test)), coef_path_forest_cv.classes_] 

	forest_scores = [forest_cv_score, classification_report(binary_y, forest_results_parameters[0]), 'forest',
	  				 precision_score(np.array(make_binary(y)), forest_prediction_training),
			                recall_score(np.array(make_binary(y)), forest_prediction_training),
			                accuracy_score(np.array(make_binary(y)), forest_prediction_training),
			                confusion_matrix(np.array(make_binary(y)), forest_prediction_training)
					]
	SGD_prediction_training = coef_path_SGD_cv.predict(X)

	SGD_results_parameters = [ SGD_prediction_training, coef_path_SGD_cv.get_params, coef_path_SGD_cv.coef_, 
					 coef_path_SGD_cv.predict(x_test), np.array(make_binary(y_test))] 

	SGD_scores = [SGD_cv_score, classification_report(binary_y, SGD_results_parameters[0]), 'SGD',
					 precision_score(np.array(make_binary(y)), SGD_prediction_training),
			                recall_score(np.array(make_binary(y)), SGD_prediction_training),
			                accuracy_score(np.array(make_binary(y)), SGD_prediction_training),
			                confusion_matrix(np.array(make_binary(y)), SGD_prediction_training)
					]

	logistic_prediction_training = coef_path_logistic_cv.predict(X)

	logistic_results_parameters = [logistic_prediction_training, coef_path_logistic_cv.get_params, coef_path_logistic_cv.coef_, 
					coef_path_logistic_cv.predict(x_test), np.array(make_binary(y_test)), coef_path_logistic_cv.predict_proba(x_test)]
	
	logistic_scores = [logistic_cv_score, classification_report(binary_y, logistic_results_parameters[0]), 'logistic'
					, precision_score(np.array(make_binary(y)), logistic_prediction_training), 
					recall_score(np.array(make_binary(y)), logistic_prediction_training), 
					accuracy_score(np.array(make_binary(y)), logistic_prediction_training),
					confusion_matrix(np.array(make_binary(y)), logistic_prediction_training)
					]
	
	model_results = [forest_results_parameters, SGD_results_parameters, logistic_results_parameters]

	model_scores = [forest_scores, SGD_scores, logistic_scores]

	return model_results, model_scores, X, y, widget_selection, list_of_features,\
						 Ngram_Range_Low, Ngram_Range_High, Min_DF, PageLoaded, WidgetViewed, ite, x_test
Example #11
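# Excerpt from the same engagement-modeling script: widget_selection, Ngram_Range_Low,
# Ngram_Range_High and Decode_Error are presumably defined earlier in the full file.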
N_Jobs=-1
N_Alphas=1000
Normalize=False
Alphas=[0]
Tol=0.001
Min_DF=float(sys.argv[4])

N_Estimators=1000
DSampling=False
DSampling_Rate=0.50

PageLoaded = int(sys.argv[5])
WidgetViewed = int(sys.argv[6])

print "%s, %s, %s, %s, %s, %s" % (widget_selection, Ngram_Range_Low, Ngram_Range_High, Min_DF, PageLoaded, WidgetViewed)
corpus, engagement_rate, page_stats = read_json("web_text_v9c.json",widget_selection, PageLoaded, WidgetViewed)

##DOWNSAMPLING TEST


#for i in range(len(engagement_rate)):
#	print engagement_rate[i]
my_words = ["considering","proper","agree", "soon", "changing", "wish", "flickr", "protect","including", "example", "want", "concept", "photo", "like" ,"comes", "things", "com", "don", "help"]#, "improve wisegeek", "related article", "u'improve wisegeek"]

my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

#build vocabulary matrix
print "size of corpus:%d" % len(corpus)

vectorizer = CountVectorizer(analyzer="word", stop_words=set(my_stop_words), decode_error=Decode_Error, 
				ngram_range=(Ngram_Range_Low,Ngram_Range_High),  min_df=Min_DF)#, max_df=0.85)