def check_rotation(test_name, factors, method, rotation,
                   rel_tol=0, abs_tol=0.1, **kwargs):
    """
    Check the rotation results.

    Parameters
    ----------
    test_name : str
        The name of the test (e.g. 'test01').
    factors : int
        The number of factors.
    method : str
        The factor-extraction method (e.g. 'uls').
    rotation : str
        The type of rotation (e.g. 'varimax').
    rel_tol : float, optional
        The relative tolerance.
        Defaults to 0.0.
    abs_tol : float, optional
        The absolute tolerance.
        Defaults to 0.1.

    Returns
    -------
    check : float
        The proportion that match between the calculated and expected.
    """
    r_input = collect_r_output(test_name, factors, method, 'none',
                               output_types=['loading'])
    r_loading = r_input['loading']
    r_loading = normalize(r_loading, absolute=False)

    rotator = Rotator(method=rotation, **kwargs)
    rotated_loading = rotator.fit_transform(r_loading)

    r_output = collect_r_output(test_name, factors, method, rotation,
                                output_types=['loading'])
    expected_loading = r_output['loading']

    data1 = normalize(rotated_loading)
    data2 = normalize(expected_loading)
    return check_close(data1, data2, rel_tol, abs_tol)
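# A minimal sketch, not part of the original test helper, of the round-trip
# it verifies: rotate unrotated loadings with Rotator and inspect the result.
# Assumes only that factor_analyzer is installed; the loading values are
# made up.
import numpy as np
from factor_analyzer import Rotator

synthetic_loadings = np.array([[0.7, 0.3],
                               [0.6, 0.4],
                               [0.2, 0.8],
                               [0.1, 0.7]])
rotated = Rotator(method='varimax').fit_transform(synthetic_loadings)
print(np.round(rotated, 3))  # columns are the varimax-rotated factors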
def _out_load_matrix(df2, a_columns, m):
    fa = factor_analyzer.FactorAnalyzer(n_factors=m, method='principal',
                                        rotation='varimax')
    fa.fit(df2)

    # Factor loading matrix after rotation
    rotator = Rotator()
    load_matrix = pd.DataFrame(rotator.fit_transform(fa.loadings_),
                               columns=a_columns, index=df2.columns)
    print("\nFactor rotation:\n", load_matrix)

    out_load_matrix_name = pd.DataFrame([[None], ['Factor rotation']])
    out_load_matrix_header = pd.DataFrame([a_columns])
    out_load_matrix_data = pd.DataFrame(rotator.fit_transform(fa.loadings_))
    out_load_matrix = pd.concat(
        [out_load_matrix_name, out_load_matrix_header, out_load_matrix_data])
    out_load_matrix_dum_index = [None, None, None]
    out_load_matrix_dum_index.extend(df2.columns)
    out_load_matrix.insert(0, 'index', out_load_matrix_dum_index)
    out_load_matrix.columns = np.arange(out_load_matrix.shape[1])
    print(out_load_matrix)
    return load_matrix, out_load_matrix
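# A usage sketch for the helper above on synthetic data; the DataFrame,
# column labels, and factor count are placeholders, not from the original.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
df2 = pd.DataFrame(rng.normal(size=(100, 4)),
                   columns=['v1', 'v2', 'v3', 'v4'])  # made-up variables
load_m, out_m = _out_load_matrix(df2, ['F_1', 'F_2'], 2)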
def varimax(self):
    # Fit a varimax rotation to the stored loading matrix. (The original
    # also built an unused FactorAnalyzer(rotation=None); that dead code
    # is dropped here.)
    rotator = Rotator()
    a = rotator.fit(self.loading3)
    return a.rotation_, a.loadings_
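# A sanity-check sketch, an assumption rather than part of the original
# class: after fit(), Rotator exposes rotation_ (the rotation matrix) and
# loadings_ (the rotated loadings). For an orthogonal rotation such as
# varimax, loadings @ rotation_ should reproduce loadings_ even with the
# default Kaiser normalization. The loading values are made up.
import numpy as np
from factor_analyzer import Rotator

loading3 = np.array([[0.8, 0.2], [0.7, 0.3], [0.2, 0.9]])
rot = Rotator(method='varimax').fit(loading3)
assert np.allclose(loading3 @ rot.rotation_, rot.loadings_)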
def varclus(self, speedup=True):
    self.speedup = speedup
    if self.speedup is True:
        return self._varclusspu()

    ClusInfo = collections.namedtuple(
        'ClusInfo', ['clus', 'eigval1', 'eigval2', 'pc1', 'varprop'])
    c_eigvals, _, c_princomps, c_varprops = VarClusHi.pca(
        self.df[self.feat_list])
    clus0 = ClusInfo(clus=self.feat_list,
                     eigval1=c_eigvals[0],
                     eigval2=c_eigvals[1],
                     pc1=c_princomps[:, 0],
                     varprop=c_varprops[0])
    self.clusters = collections.OrderedDict([(0, clus0)])

    while True:
        if self.maxclus is not None and len(self.clusters) >= self.maxclus:
            break
        idx = max(self.clusters, key=lambda x: self.clusters.get(x).eigval2)
        if self.clusters[idx].eigval2 > self.maxeigval2:
            split_clus = self.clusters[idx].clus
            c_eigvals, c_eigvecs, _, _ = VarClusHi.pca(self.df[split_clus])
        else:
            break
        if c_eigvals[1] > self.maxeigval2:
            clus1, clus2 = [], []
            rotator = Rotator(method='quartimax')
            r_eigvecs = rotator.fit_transform(pd.DataFrame(c_eigvecs))
            stand_df = (self.df - self.df.mean()) / self.df.std()
            r_pcs = np.dot(stand_df[split_clus].values, r_eigvecs)
            for feat in split_clus:
                corr_pc1 = np.corrcoef(self.df[feat].values.T,
                                       r_pcs[:, 0])[0, 1]
                corr_pc2 = np.corrcoef(self.df[feat].values.T,
                                       r_pcs[:, 1])[0, 1]
                if abs(corr_pc1) > abs(corr_pc2):
                    clus1.append(feat)
                else:
                    clus2.append(feat)
            fin_clus1, fin_clus2, _ = VarClusHi._reassign_rs(
                self.df, clus1, clus2, self.n_rs)
            c1_eigvals, _, c1_princomps, c1_varprops = VarClusHi.pca(
                self.df[fin_clus1])
            c2_eigvals, _, c2_princomps, c2_varprops = VarClusHi.pca(
                self.df[fin_clus2])
            self.clusters[idx] = ClusInfo(clus=fin_clus1,
                                          eigval1=c1_eigvals[0],
                                          eigval2=c1_eigvals[1],
                                          pc1=c1_princomps[:, 0],
                                          varprop=c1_varprops[0])
            self.clusters[len(self.clusters)] = ClusInfo(
                clus=fin_clus2,
                eigval1=c2_eigvals[0],
                eigval2=c2_eigvals[1],
                pc1=c2_princomps[:, 0],
                varprop=c2_varprops[0])
        else:
            break
    return self
def _varclusspu(self):
    ClusInfo = collections.namedtuple(
        'ClusInfo', ['clus', 'eigval1', 'eigval2', 'eigvecs', 'varprop'])
    c_eigvals, c_eigvecs, c_corrs, c_varprops = VarClusHi.correig(
        self.df[self.feat_list])
    self.corrs = c_corrs
    clus0 = ClusInfo(clus=self.feat_list,
                     eigval1=c_eigvals[0],
                     eigval2=c_eigvals[1],
                     eigvecs=c_eigvecs,
                     varprop=c_varprops[0])
    self.clusters = collections.OrderedDict([(0, clus0)])

    while True:
        if self.maxclus is not None and len(self.clusters) >= self.maxclus:
            break
        idx = max(self.clusters, key=lambda x: self.clusters.get(x).eigval2)
        if self.clusters[idx].eigval2 > self.maxeigval2:
            split_clus = self.clusters[idx].clus
            c_eigvals, c_eigvecs, split_corrs, _ = VarClusHi.correig(
                self.df[split_clus])
        else:
            break
        if c_eigvals[1] > self.maxeigval2:
            clus1, clus2 = [], []
            rotator = Rotator(method='quartimax')
            r_eigvecs = rotator.fit_transform(pd.DataFrame(c_eigvecs))
            comb_sigma1 = math.sqrt(
                np.dot(np.dot(r_eigvecs[:, 0], split_corrs.values),
                       r_eigvecs[:, 0].T))
            comb_sigma2 = math.sqrt(
                np.dot(np.dot(r_eigvecs[:, 1], split_corrs.values),
                       r_eigvecs[:, 1].T))
            for feat in split_clus:
                comb_cov1 = np.dot(r_eigvecs[:, 0], split_corrs[feat].values.T)
                comb_cov2 = np.dot(r_eigvecs[:, 1], split_corrs[feat].values.T)
                corr_pc1 = comb_cov1 / comb_sigma1
                corr_pc2 = comb_cov2 / comb_sigma2
                if abs(corr_pc1) > abs(corr_pc2):
                    clus1.append(feat)
                else:
                    clus2.append(feat)
            fin_clus1, fin_clus2, _ = VarClusHi._reassign_rs(
                self.df, clus1, clus2, self.n_rs)
            c1_eigvals, c1_eigvecs, _, c1_varprops = VarClusHi.correig(
                self.df[fin_clus1])
            c2_eigvals, c2_eigvecs, _, c2_varprops = VarClusHi.correig(
                self.df[fin_clus2])
            self.clusters[idx] = ClusInfo(clus=fin_clus1,
                                          eigval1=c1_eigvals[0],
                                          eigval2=c1_eigvals[1],
                                          eigvecs=c1_eigvecs,
                                          varprop=c1_varprops[0])
            self.clusters[len(self.clusters)] = ClusInfo(
                clus=fin_clus2,
                eigval1=c2_eigvals[0],
                eigval2=c2_eigvals[1],
                eigvecs=c2_eigvecs,
                varprop=c2_varprops[0])
        else:
            break
    return self
for i in range(len(a.columns)):
    q = 'F_' + str(i + 1)
    a_list.append(q)
    ma_list.append(q)
a.columns = a_list
ma.columns = ma_list
a.index = xlabel
ma.index = xlabel

component_matrix = copy.deepcopy(ma)
component_matrix.insert(0, u'Column name', ma.index)

# Sample correlation matrix and rotated component matrix -- the key output
# to inspect. If rowvar is 1 (the default), each row of the input represents
# a variable and each column an observation (sample); with rowvar=0 it is
# the other way around.
covr = np.corrcoef(data, rowvar=0)  # compute the correlation matrix
covrr = pd.DataFrame(covr)
# The older Rotator.rotate(loadings, 'varimax') call returned a
# (loadings, rotation) tuple; the current factor_analyzer API is
# fit_transform, used here instead.
rotator = Rotator(method='varimax')
covrr = pd.DataFrame(rotator.fit_transform(covrr))

# Factor score coefficient matrix (the component score matrix)
b = np.dot(np.linalg.inv(covr), a)
score_matrix = pd.DataFrame(b)
# rotate stop

b_list = []
for i in range(len(score_matrix.columns)):
    q = 'F_' + str(i + 1)
    b_list.append(q)
score_matrix.columns = b_list
score_matrix.insert(0, u'Column name', a.index)
def _varimax(X):
    return Rotator(normalize=False).fit_transform(X)
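# An illustrative comparison, not part of the original wrapper:
# normalize=False skips the Kaiser row-normalization that Rotator applies
# by default, which can change the result when variables have unequal
# communalities. The loading values are made up.
import numpy as np
from factor_analyzer import Rotator

X = np.array([[0.9, 0.1], [0.8, 0.2], [0.1, 0.9], [0.2, 0.4]])
print(_varimax(X))                               # raw varimax
print(Rotator(normalize=True).fit_transform(X))  # Kaiser-normalized varimax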
def varclus(self):
    """
    Steps of the varclus procedure.
    :return:
    """
    ClusInfo = collections.namedtuple(
        'ClusInfo', ['cluster', 'eigval1', 'eigval2', 'eigvecs', 'varprop'])
    c_eigvals, c_eigvecs, c_corrs, c_varprops = VarClus.correig(
        self.df[self.feat_list])
    self.corrs = c_corrs
    cluster0 = ClusInfo(cluster=self.feat_list,
                        eigval1=c_eigvals[0],
                        eigval2=c_eigvals[1],
                        eigvecs=c_eigvecs,
                        varprop=c_varprops[0])
    self.clusters = collections.OrderedDict([(0, cluster0)])

    while True:
        # Stop once the maximum number of clusters is reached
        if self.max_cluster is not None and \
                len(self.clusters) >= self.max_cluster:
            break
        # Find the cluster with the largest second eigenvalue
        idx = max(self.clusters, key=lambda x: self.clusters.get(x).eigval2)
        # If the largest second eigenvalue is at or below the threshold, stop
        if self.clusters[idx].eigval2 <= self.max_second_eig_val:
            break
        # To split a cluster in two: compute its first two principal
        # components, rotate them (quartimax), and assign each variable to
        # the rotated component with which it has the largest absolute
        # correlation.
        c_cluster = self.clusters[idx].cluster
        c_eigvals, c_eigvecs, c_corrs, _ = VarClus.correig(self.df[c_cluster])
        cluster1, cluster2 = [], []
        rotator = Rotator(method='quartimax')
        r_eigvecs = rotator.fit_transform(pd.DataFrame(c_eigvecs))
        comb_sigma1 = math.sqrt(np.dot(np.dot(r_eigvecs[:, 0], c_corrs.values),
                                       r_eigvecs[:, 0].T))
        comb_sigma2 = math.sqrt(np.dot(np.dot(r_eigvecs[:, 1], c_corrs.values),
                                       r_eigvecs[:, 1].T))
        for feat in c_cluster:
            comb_cov1 = np.dot(r_eigvecs[:, 0], c_corrs[feat].values.T)
            comb_cov2 = np.dot(r_eigvecs[:, 1], c_corrs[feat].values.T)
            corr_pc1 = comb_cov1 / comb_sigma1
            corr_pc2 = comb_cov2 / comb_sigma2
            if abs(corr_pc1) > abs(corr_pc2):
                cluster1.append(feat)
            else:
                cluster2.append(feat)
        fin_cluster1, fin_cluster2, _ = VarClus._reassign_rs(
            self.df, cluster1, cluster2, self.n_rs)
        c1_eigvals, c1_eigvecs, _, c1_varprops = VarClus.correig(
            self.df[fin_cluster1])
        c2_eigvals, c2_eigvecs, _, c2_varprops = VarClus.correig(
            self.df[fin_cluster2])
        self.clusters[idx] = ClusInfo(cluster=fin_cluster1,
                                      eigval1=c1_eigvals[0],
                                      eigval2=c1_eigvals[1],
                                      eigvecs=c1_eigvecs,
                                      varprop=c1_varprops[0])
        self.clusters[len(self.clusters)] = ClusInfo(cluster=fin_cluster2,
                                                     eigval1=c2_eigvals[0],
                                                     eigval2=c2_eigvals[1],
                                                     eigvecs=c2_eigvecs,
                                                     varprop=c2_varprops[0])
    return self
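# A self-contained sketch of the split step shared by the varclus
# implementations above: take the two leading eigenvectors of a correlation
# matrix, quartimax-rotate them, and assign each variable to the rotated
# component it correlates with most strongly. Random data, for illustration
# only; assumes factor_analyzer is installed.
import numpy as np
import pandas as pd
from factor_analyzer import Rotator

rng = np.random.default_rng(0)
demo = pd.DataFrame(rng.normal(size=(300, 5)), columns=list('abcde'))
corr = demo.corr()
eigvals, eigvecs = np.linalg.eigh(corr.values)
top2 = eigvecs[:, ::-1][:, :2]  # eigh sorts ascending; take the leading two
r_eigvecs = Rotator(method='quartimax').fit_transform(pd.DataFrame(top2))
sigma = [np.sqrt(r_eigvecs[:, k] @ corr.values @ r_eigvecs[:, k])
         for k in (0, 1)]
clus1, clus2 = [], []
for feat in corr.columns:
    corr_pc = [r_eigvecs[:, k] @ corr[feat].values / sigma[k] for k in (0, 1)]
    (clus1 if abs(corr_pc[0]) > abs(corr_pc[1]) else clus2).append(feat)
print(clus1, clus2)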
X = att_data.copy().values
X = check_array(X, force_all_finite='allow-nan')

statistic, p_value = calculate_bartlett_sphericity(X)
# print("\nBartlett sphericity p={0}".format(p_value))
kmo_per_variable, kmo_total = calculate_kmo(X)
# print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n"
#       .format(kmo_total))

# Create factor analysis object and perform factor analysis
# using maximum likelihood analysis (ml)
n_factors = 5
fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml")
fa.fit(att_data)

# Kaiser normalization and oblimin rotation
rotator = Rotator(method="oblimin", normalize=True, max_iter=25)
loadings = rotator.fit_transform(fa.loadings_)

# Set FA loadings to be rotator loadings
fa.loadings_ = loadings
# print(loadings)

# Get factor scores
factor_scores = fa.transform(att_data)
factor_scores = pd.DataFrame(data=factor_scores,
                             index=att_data.index,
                             columns=["Factor " + str(i + 1)
                                      for i in range(n_factors)])
# print("\nFactor scores: \n", factor_scores)

factor_names = ["Numerical Self Efficacy", "School Math", "Academic maturity",
                "Numerical Relevancy", "Math Anxiety"]

# Convert factor loadings to a df
loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names)
plt.grid()
plt.show()

fa = FactorAnalyzer(n_factors=2, method='principal', rotation='varimax')
fa.fit(datas)
# Communalities
print(fa.get_communalities())
# Eigenvalues
print("\nEigenvalues:\n", fa.get_factor_variance()[0])
# Proportion of variance explained
print("\nProportion of variance:\n", fa.get_factor_variance()[1])
# Cumulative proportion of variance explained
print("\nCumulative proportion of variance:\n", fa.get_factor_variance()[2])
print("\nComponent matrix:\n", fa.loadings_)

rotator = Rotator()
load_matrix = rotator.fit_transform(fa.loadings_)
print(load_matrix)

# Factor score coefficient matrix
# Correlation matrix
corr = datas.corr()
# Convert the array to a matrix
corr = np.mat(corr)
# Factor loading matrix
load_matrix = np.mat(load_matrix)
pr_matrix_score = np.dot(nlg.inv(corr), load_matrix)
print(pr_matrix_score)
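# An equivalent identity, offered as a sketch rather than part of the
# original snippet: the score coefficients B solve corr @ B = load_matrix,
# so np.linalg.solve avoids forming the explicit inverse. Reuses `corr`,
# `load_matrix`, and `pr_matrix_score` from the snippet above.
B = np.linalg.solve(np.asarray(corr), np.asarray(load_matrix))
print(np.allclose(B, pr_matrix_score))  # should agree with the inv() result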
def score(database, semester, year, season, answer_key, savedname):
    '''
    The score function reads in a QuaRCS dataset and answer key file to
    create a series of columns to add to the dataset. The function creates
    columns for:

    - score on a binary scale (1 for correct, 0 for incorrect)
    - total score
    - totals and means by category
    - number of questions answered
    - total and mean confidence

    Args:
        database: pre or post QuaRCS dataset for a semester
        semester: 'PRE' or 'PST'
        year: two-digit year, e.g. '17'
        season: term code, e.g. 'Fa'
        answer_key: QuaRCS Assessment Answer Key
        savedname: base name for the output file

    Output:
        name of file + '_scored' as .csv file

    Example:
        score('QuaRCS_Summer_2017_Pre.csv', 'PRE', '17', 'Su',
              'QuaRCS Assessment Answer Key.csv', 'QuaRCS_Summer_2017_Pre')
        New file saved under QuaRCS_Summer_2017_Pre_scored.csv.
        Check folder for files.

    By: Abdoulaye Sanogo, 08/11/2017
    Modified so that it uses numerical values of question/answer rather
    than string values. By: Ilija Nikolov, 5 March 2018

    Future improvements: add columns for confidence means and totals by
    category; add extra columns after insert so the deletion of columns
    will not be necessary.
    '''
    question = semester + "_Q"  # question = 'PRE_Q' or 'PST_Q'
    data = pd.read_csv(database, encoding='utf-8', skiprows=[1, 2], header=0)
    df = pd.read_csv(answer_key, encoding='utf-8')

    cols = list(data.columns.values)
    c = len(cols)
    e = 0
    h = len(data)

    # Adds the Q#_SCORE column right next to each question
    questions = np.unique(df['Question #'])
    for item in questions:
        if question + str(item) in data.columns:
            data.insert(data.columns.get_loc(question + str(item)) + 1,
                        question + str(item) + '_SCORE', 0)

    # e >= 50 --> Full, e < 50 --> Lite
    for d in range(c):
        column = cols[d]
        column = column[0:5]
        if question == column:
            e = e + 1

    data.insert(6, 'VERSION', " ")

    if e == 50:
        if year == "16" and season == "Fa":
            data['VERSION'] = "Fl_2.0"
            # If the value "progress bar" is in comments, change the
            # version to 2.1
            for v in range(h):
                if 'COMMENTS' in data.columns:
                    if data.loc[v, 'COMMENTS'] == "progress bar":
                        data.loc[v, 'VERSION'] = "Fl_2.1"
        else:
            data['VERSION'] = "Fl_1.1"
    elif e == 54:
        data['VERSION'] = "Fl_1.0"
        data = data.drop([semester + '_Q18'], axis=1)
        data = data.drop([semester + '_Q18CF'], axis=1)
        data = data.drop([semester + '_Q25'], axis=1)
        data = data.drop([semester + '_Q25CF'], axis=1)
        e = 50
    elif e == 22:
        data['VERSION'] = "Lt_1.0"
    elif e == 30:
        intyr = int(year)
        if intyr >= 19 or (year == "18" and season == "Fa"):
            data['VERSION'] = "Lt_2.1"
        else:
            data['VERSION'] = "Lt_2.0"
    elif e == 28:
        data['VERSION'] = "SM_1.0"

    # New columns for the totals
    data[semester + '_TOTAL'] = np.nan
    data[semester + '_PCT_TOTAL'] = np.nan
    data[semester + '_GR_TOTAL'] = np.nan
    data[semester + '_GR_MEAN'] = np.nan
    data[semester + '_AR_TOTAL'] = np.nan
    data[semester + '_AR_MEAN'] = np.nan
    data[semester + '_PR_TOTAL'] = np.nan
    data[semester + '_PR_MEAN'] = np.nan
    data[semester + '_PC_TOTAL'] = np.nan
    data[semester + '_PC_MEAN'] = np.nan
    data[semester + '_SP_TOTAL'] = np.nan
    data[semester + '_SP_MEAN'] = np.nan
    data[semester + '_TR_TOTAL'] = np.nan
    data[semester + '_TR_MEAN'] = np.nan
    data[semester + '_AV_TOTAL'] = np.nan
    data[semester + '_AV_MEAN'] = np.nan
    # data[semester + '_ER_MEAN'] = np.nan
    data[semester + '_UD_TOTAL'] = np.nan
    data[semester + '_UD_MEAN'] = np.nan
    data[semester + '_ES_TOTAL'] = np.nan
    data[semester + '_ES_MEAN'] = np.nan

    # Composite variables
    data[semester + '_SELFEFF'] = np.nan
    data[semester + '_MATHANX'] = np.nan
    data[semester + '_MATHREL'] = np.nan
    data[semester + '_ACADMAT'] = np.nan
    data[semester + '_SCHMATH'] = np.nan

    corr_ans = {15: 0, 12: 0, 14: 0, 26: 0, 27: 0, 23: 0, 28: 0, 19: 0,
                3: 0, 16: 0, 13: 0, 31: 0, 32: 0, 29: 0, 30: 0, 5: 0,
                6: 0, 7: 0, 10: 0, 11: 0, 20: 0, 21: 0, 33: 0, 34: 0, 35: 0}
    for item in corr_ans:
        corr_ans[item] = int(
            list(df.loc[df['Question #'] == item]['Correct Answer'])[0])

    # Adds totals and means to total and means columns
    for nn in range(h):
        qn = {15: 0, 12: 0, 14: 0, 26: 0, 27: 0, 23: 0, 28: 0, 19: 0,
              3: 0, 16: 0, 13: 0, 31: 0, 32: 0, 29: 0, 30: 0, 5: 0,
              6: 0, 7: 0, 10: 0, 11: 0, 20: 0, 21: 0, 33: 0, 34: 0, 35: 0}
        for q_num in qn:
            try:
                if int(data.loc[nn, question + str(q_num)]) == corr_ans[q_num]:
                    qn[q_num] = 1
                    data.loc[nn, question + str(q_num) + '_SCORE'] = 1
            except:
                pass

        GR = int(np.nansum([qn[15], qn[14], qn[12], qn[29], qn[30], qn[13]]))
        AR = int(np.nansum([qn[15], qn[14], qn[26], qn[27], qn[23], qn[28],
                            qn[19], qn[3], qn[16], qn[31], qn[32], qn[5],
                            qn[6], qn[7], qn[29], qn[30], qn[10], qn[11],
                            qn[20], qn[21], qn[33], qn[34], qn[35]]))
        PR = int(np.nansum([qn[15], qn[12], qn[14], qn[23], qn[28], qn[3],
                            qn[16], qn[7], qn[10], qn[11], qn[20], qn[21],
                            qn[33], qn[35], qn[13]]))
        PC = int(np.nansum([qn[27], qn[3], qn[32], qn[20], qn[21]]))
        SP = int(np.nansum([qn[27], qn[23], qn[28], qn[29], qn[30], qn[20],
                            qn[21]]))
        TR = int(np.nansum([qn[26], qn[27], qn[23]]))
        AV = int(np.nansum([qn[31], qn[10], qn[11], qn[33], qn[34]]))
        UD = int(np.nansum([qn[31], qn[6], qn[7], qn[35], qn[16]]))
        ES = int(np.nansum([qn[15], qn[12], qn[14], qn[16], qn[13]]))

        data.loc[nn, semester + '_GR_TOTAL'] = GR
        data.loc[nn, semester + '_AR_TOTAL'] = AR
        data.loc[nn, semester + '_PR_TOTAL'] = PR
        data.loc[nn, semester + '_PC_TOTAL'] = PC
        data.loc[nn, semester + '_SP_TOTAL'] = SP
        data.loc[nn, semester + '_TR_TOTAL'] = TR
        data.loc[nn, semester + '_AV_TOTAL'] = AV
        data.loc[nn, semester + '_UD_TOTAL'] = UD
        data.loc[nn, semester + '_ES_TOTAL'] = ES

        total_full = 0
        for q_num in qn:
            total_full += qn[q_num]

        if e == 50:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 25
            data.loc[nn, semester + '_GR_MEAN'] = GR / 6
            data.loc[nn, semester + '_AR_MEAN'] = AR / 23
            data.loc[nn, semester + '_PR_MEAN'] = PR / 15
            data.loc[nn, semester + '_PC_MEAN'] = PC / 5
            data.loc[nn, semester + '_SP_MEAN'] = SP / 7
            data.loc[nn, semester + '_TR_MEAN'] = TR / 3
            data.loc[nn, semester + '_AV_MEAN'] = AV / 5
            data.loc[nn, semester + '_UD_MEAN'] = UD / 5
            data.loc[nn, semester + '_ES_MEAN'] = ES / 5
        elif e == 22:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 11
            data.loc[nn, semester + '_GR_MEAN'] = GR / 4
            data.loc[nn, semester + '_AR_MEAN'] = AR / 9
            data.loc[nn, semester + '_PR_MEAN'] = PR / 8
            data.loc[nn, semester + '_SP_MEAN'] = SP / 3
            data.loc[nn, semester + '_TR_MEAN'] = TR / 3
            data.loc[nn, semester + '_ES_MEAN'] = ES / 5
            # Lacks the number of questions for a meaningful subscore
            # 1 question
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 questions
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
            # 1 question
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan
        elif e == 30:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 15
            data.loc[nn, semester + '_GR_MEAN'] = GR / 4
            data.loc[nn, semester + '_AR_MEAN'] = AR / 13
            data.loc[nn, semester + '_PR_MEAN'] = PR / 11
            data.loc[nn, semester + '_SP_MEAN'] = SP / 3
            data.loc[nn, semester + '_TR_MEAN'] = TR / 3
            data.loc[nn, semester + '_AV_MEAN'] = AV / 4
            data.loc[nn, semester + '_ES_MEAN'] = ES / 5
            # Lacks the number of questions for a meaningful subscore
            # 1 question
            data.loc[nn, semester + '_UD_MEAN'] = np.nan
            data.loc[nn, semester + '_UD_TOTAL'] = np.nan
            # 2 questions
            data.loc[nn, semester + '_PC_MEAN'] = np.nan
            data.loc[nn, semester + '_PC_TOTAL'] = np.nan
        elif e == 28:
            data.loc[nn, semester + '_TOTAL'] = total_full
            data.loc[nn, semester + '_PCT_TOTAL'] = total_full / 14
            data.loc[nn, semester + '_GR_MEAN'] = GR / 4
            data.loc[nn, semester + '_AR_MEAN'] = AR / 13
            data.loc[nn, semester + '_PR_MEAN'] = PR / 9
            data.loc[nn, semester + '_PC_MEAN'] = PC / 3
            data.loc[nn, semester + '_SP_MEAN'] = SP / 7
            data.loc[nn, semester + '_UD_MEAN'] = UD / 5
            data.loc[nn, semester + '_ES_MEAN'] = ES / 3
            # Lacks the number of questions for a meaningful subscore
            # 2 questions
            data.loc[nn, semester + '_TR_MEAN'] = np.nan
            data.loc[nn, semester + '_TR_TOTAL'] = np.nan
            # 1 question
            data.loc[nn, semester + '_AV_MEAN'] = np.nan
            data.loc[nn, semester + '_AV_TOTAL'] = np.nan

    data[semester + '_CF_TOTAL'] = np.nan
    data[semester + '_CF_TOTAL_CORR'] = np.nan
    data[semester + '_CF_TOTAL_INCORR'] = np.nan
    data[semester + '_CF_MEAN'] = np.nan
    data[semester + '_CF_MEAN_CORR'] = np.nan
    data[semester + '_CF_MEAN_INCORR'] = np.nan

    # Calculates confidence totals and means; adds to respective columns
    for u in range(h):
        qcf = {'15': 0, '12': 0, '14': 0, '26': 0, '27': 0, '23': 0,
               '28': 0, '19': 0, '3': 0, '16': 0, '13': 0, '31': 0,
               '32': 0, '29': 0, '30': 0, '5': 0, '6': 0, '7': 0,
               '10': 0, '11': 0, '20': 0, '21': 0, '33': 0, '34': 0,
               '35': 0}
        qc = {'15': 0, '12': 0, '14': 0, '26': 0, '27': 0, '23': 0,
              '28': 0, '19': 0, '3': 0, '16': 0, '13': 0, '31': 0,
              '32': 0, '29': 0, '30': 0, '5': 0, '6': 0, '7': 0,
              '10': 0, '11': 0, '20': 0, '21': 0, '33': 0, '34': 0,
              '35': 0}
        for q_num in qcf:
            try:
                qcf[q_num] = int(data.loc[u, question + str(q_num) + "CF"])
                qc[q_num] = int(data.loc[u, question + str(q_num) + '_SCORE'])
            except:
                pass

        medscore = 0
        corrscore = 0
        incorrscore = 0
        confcount = 0
        for item in qcf:
            medscore += qcf[item]
            if qcf[item] > 0:
                confcount += 1
            if qc[item] == 1:
                corrscore += qcf[item]
            else:
                incorrscore += qcf[item]
        # print(confcount)
        if confcount == 0:
            confcount = 1

        # Student's score
        numcorr = data.loc[u, semester + '_TOTAL']

        # Calculate confidence scores
        if e == 30:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore / confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore / numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = \
                    incorrscore / (confcount - numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 22:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore / confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore / numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = \
                    incorrscore / (confcount - numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 28:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = np.nan
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore / confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore / numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = \
                    incorrscore / (confcount - numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0
        elif e == 50:
            data.loc[u, semester + '_CF_TOTAL'] = medscore
            data.loc[u, semester + '_CF_TOTAL_CORR'] = corrscore
            data.loc[u, semester + '_CF_TOTAL_INCORR'] = incorrscore
            data.loc[u, semester + '_CF_MEAN'] = medscore / confcount
            if numcorr != 0:
                data.loc[u, semester + '_CF_MEAN_CORR'] = corrscore / numcorr
            else:
                data.loc[u, semester + '_CF_MEAN_CORR'] = 0
            if numcorr != confcount:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = \
                    incorrscore / (confcount - numcorr)
            else:
                data.loc[u, semester + '_CF_MEAN_INCORR'] = 0

    data[semester + '_QCOMPLETE'] = 0
    data[semester + '_COMPFLAG'] = 0
    data[semester + '_EFFFLAG'] = 0

    # Counts number of completed columns
    try:
        if e == 50:
            q = [15, 12, 14, 26, 27, 23, 28, 19, 3, 16, 13, 31, 32, 29, 30,
                 5, 6, 7, 10, 11, 20, 21, 33, 34, 35]
        elif e == 22:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16]
        elif e == 30:
            q = [15, 12, 13, 14, 26, 27, 23, 28, 19, 3, 16, 10, 11, 33, 34]
        elif e == 28:
            q = [6, 7, 13, 14, 16, 20, 21, 23, 27, 28, 29, 30, 31, 35]
        for v in range(h):
            # Count up totals
            total = 0
            for w in q:
                count = question + str(w)
                answered = data.loc[v, count]
                if str(answered) == 'nan' or str(answered) == ' ':
                    continue
                else:
                    total = int(np.nansum([total, 1]))
            data.loc[v, semester + '_QCOMPLETE'] = total
            # Add completed flag
            if total == len(q):
                data.loc[v, semester + '_COMPFLAG'] = 1
            else:
                data.loc[v, semester + '_COMPFLAG'] = 0
    except KeyError:
        pass

    # Calculating effort column
    for v in range(h):
        # If there is no response for effort, mark completion as 0 for
        # that student!
        if pd.isnull(data.loc[v, semester + '_EFFORT']):
            data.loc[v, semester + '_COMPFLAG'] = 0
        # If there is high effort, give full marks in flag
        if data.loc[v, semester + '_EFFORT'] == 4 or \
                data.loc[v, semester + '_EFFORT'] == 5:
            data.loc[v, semester + '_EFFFLAG'] = 1
        # Some effort gives you only so many marks...
        elif data.loc[v, semester + '_EFFORT'] == 3:
            data.loc[v, semester + '_EFFFLAG'] = 0.5
        # NO EFFORT!! :-(
        elif data.loc[v, semester + '_EFFORT'] == 2 or \
                data.loc[v, semester + '_EFFORT'] == 1:
            data.loc[v, semester + '_EFFFLAG'] = 0

    # Factor Analysis!
if (semester == "PRE" and e == 30) or (semester == "PRE" and e == 22) or (semester == "PRE" and e == 28): # Fill out whymajs with 0 instead of NaN values so we can # perform FA on them nan_columns = [semester + "_WHYMAJ_1", semester + "_WHYMAJ_2", semester + "_WHYMAJ_3", semester + "_WHYMAJ_4", semester + "_WHYMAJ_5", semester + "_WHYMAJ_6", semester + "_WHYMAJ_7", semester + "_WHYMAJ_8", semester + "_WHYCS_1", semester + "_WHYCS_2", semester + "_WHYCS_3", semester + "_WHYCS_4", semester + "_WHYCS_5", semester + "_WHYCS_6", semester + "_WHYCS_7" ] for i in data.index: for column in nan_columns: if pd.isna(data.at[i, column]): data.at[i, column] = 0 # Factor Analysis variables att = [semester + '_FREQEN', semester + '_DAILYM', semester + '_DAILYG', semester + '_ATT_DL_3', semester + '_ATT_SC_1', semester + '_ATT_SC_2', semester + '_ATT_SC_4', semester + '_ATT_SC_5', semester + '_LK1', semester + '_LK2', semester + '_LK5', semester + '_ANX#1_1', semester + '_ANX#1_2', semester + '_ANX#1_3', semester + '_ANX#1_4', semester + '_CF_TOTAL', semester + '_ATT_DL_2', semester + '_ATT_SC_3', semester + "_WHYCS_1", semester + "_WHYCS_3", semester + "_WHYCS_5", semester + "_WHYCS_6", semester + "_EFFORT" ] # Variable selection att_data = data.loc[ data[semester + '_COMPFLAG']==1 ] att_data = att_data[att] # Drop all rows with NaN values att_data.dropna(inplace=True) swapList = ['_ATT_DL_2', '_ATT_DL_3', '_ATT_SC_1', '_ATT_SC_2', '_ATT_SC_3', '_ATT_SC_4', '_ATT_SC_5' ] for i in att_data.index: for col in swapList: swapOrdering(att_data, i, semester + col) # KMO and Barlett tests X = att_data.copy().values X = check_array(X, force_all_finite='allow-nan') statistic, p_value = calculate_bartlett_sphericity(X) print("\nBarlett sphericity p={0}".format(p_value)) kmo_per_variable, kmo_total = calculate_kmo(X) print("Kaiser-Meyer-Olkin measure of sampling adequacy = {0}\n".format(kmo_total)) # Create factor analysis object and perform factor analysis # Using maximum likelihood analysis (ml) n_factors = 5 fa = FactorAnalyzer(rotation=None, n_factors=n_factors, method="ml") fa.fit(att_data) # Kaiser normalization and oblimin rotation rotator = Rotator(method="oblimin", normalize=True, max_iter=25) loadings = rotator.fit_transform(fa.loadings_) # Set FA loadings to be rotator loadings fa.loadings_ = loadings # Get factor scores factor_scores = fa.transform(att_data) factor_scores = pd.DataFrame(data=factor_scores, index=att_data.index, columns=["Factor "+str(i+1) for i in range(n_factors)]) # print("\nFactor scores: \n", factor_scores) factor_names = ["Numerical Self Efficacy", "School Math", "Academic maturity", "Numerical Relevancy", "Math Anxiety"] # Convert factor loadings to a df loadings = pd.DataFrame(data=loadings, index=att, columns=factor_names) # Drop non-meaningful values loadings = loadings.where(abs(loadings) > 0.32) print("Factor loadings: \n", loadings) scores1 = factor_scores['Factor 1'].tolist() plt.hist(scores1, bins=[x for x in np.arange(-4.0, 4.0, 0.2)]) plt.title("Numerical Self Efficacy") # plt.show() scores2 = factor_scores['Factor 2'].tolist() plt.hist(scores2, bins=[x for x in np.arange(-4.0, 4.0, 0.2)]) plt.title("School Math") # plt.show() scores3 = factor_scores['Factor 3'].tolist() plt.hist(scores3, bins=[x for x in np.arange(-4.0, 4.0, 0.2)]) plt.title("Academic maturity") # plt.show() scores4 = factor_scores['Factor 4'].tolist() plt.hist(scores4, bins=[x for x in np.arange(-4.0, 4.0, 0.2)]) plt.title("Numerical Relevancy") # plt.show() scores5 = factor_scores['Factor 5'].tolist() 
        plt.hist(scores5, bins=[x for x in np.arange(-4.0, 4.0, 0.2)])
        plt.title("Math Anxiety")
        # plt.show()

        # Update composite variables
        for i in factor_scores.index:
            data.at[i, semester + '_SELFEFF'] = factor_scores.at[i, 'Factor 1']
            data.at[i, semester + '_SCHMATH'] = factor_scores.at[i, 'Factor 2']
            data.at[i, semester + '_ACADMAT'] = factor_scores.at[i, 'Factor 3']
            data.at[i, semester + '_MATHREL'] = factor_scores.at[i, 'Factor 4']
            data.at[i, semester + '_MATHANX'] = factor_scores.at[i, 'Factor 5']

    # data.to_csv(semester + "_scored.csv", encoding='utf-8', index=False)
    # print("Results saved to " + savedname + "_scored.csv")
    return data
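# A hypothetical invocation of score(), following the docstring example;
# the file names and the year/season codes are placeholders, not files
# shipped with this code.
scored = score('QuaRCS_Summer_2017_Pre.csv', 'PRE', '17', 'Su',
               'QuaRCS Assessment Answer Key.csv', 'QuaRCS_Summer_2017_Pre')
print(scored[['PRE_TOTAL', 'PRE_PCT_TOTAL']].head())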