# Example #1
# 0
def R_Factor_Analysis( comm_str,
                       csv_data, csv_colvars, csv_coltypes, fpref,
                       test_arr, # -> can be NULL for interior calc
                       Nfac,     # -> can be 0    for interior calc
                       Ntopload, # -> can be 0    for interior calc
                       flab,
                       DO_GRAPH,
                       N_cent = 99,   # 'centile'
                       N_iter = 5000,  # 'iterations'
                       ftype = 'jpeg'):
    '''Perform factor analysis using the R function factanal().

    If Nfac is 0, the number of latent factors is estimated with the R
    paran() function, which implements Horn's parallel-analysis test.

    Arguments:
      comm_str     -- command string echoed into the '<fpref>.log' file
      csv_data     -- 2D table of data (rows = subjects, cols = variables)
      csv_colvars  -- column/variable names of csv_data
      csv_coltypes -- per-column python types; only int/float columns are used
      fpref        -- prefix for the output .log and graph files
      test_arr     -- variables to analyze; empty -> all columns
      Nfac         -- number of factors; 0 -> estimate via Horn's test
      Ntopload     -- loadings listed per factor; 0 -> min(Nvars,5), <0 -> all
      flab         -- label prefix for factor titles; empty -> 'FACTOR'
      DO_GRAPH     -- whether to graph the parallel analysis
      N_cent       -- percentile used in Horn's test
      N_iter       -- Monte Carlo iterations for Horn's test
      ftype        -- graphics file type (must be in PARN_OUT_types)

    Returns:  Factor scores, factor titles, and the analyzed variable names'''

    # R libraries used here.
    paran = importr('paran')

    # Translate the requested graphics type into the matching R device.
    if ftype in PARN_OUT_types:
        ii = PARN_OUT_types.index(ftype)
        OUT_dev = PARN_OUT_devs[ii]
        OUT_paran = fpref+'.'+ftype
    else:
        print "** Error! ",
        print "Output file type '%s' is not valid. Select from:" % (ftype)
        print "\t",
        for x in PARN_OUT_types:
            print " '"+x+"' ",
        print "\n"
        sys.exit(32)

    # Log file: close via try/finally so it isn't leaked if the
    # analysis raises partway through.
    fff = open(fpref+'.log','w')
    try:
        if comm_str:
            fff.write('# '+comm_str+"\n")

        # SETUP THE VARIABLE VALUES
        Lx,Ly = np.shape(csv_data)

        # if user hasn't entered a selection, then use 'em all.
        if not test_arr:
            test_arr = list(csv_colvars)

        # Get rid of variable columns with 'NA'
        test_arr = Cut_ColVars_with_NAs(csv_data, csv_colvars, test_arr)

        # check for duplicate columns, which lead to bad singularities
        test_arr = CheckForDuplicates( test_arr )

        # if user hasn't entered a label, then use:
        if not flab:
            flab = 'FACTOR'

        # only select variables that are represented in the csv_data
        # headings, as well as being either int or float
        VARS_inds = []
        VARS_names = []
        for x in test_arr:
            if x in csv_colvars:
                ii = csv_colvars.index(x)
                if csv_coltypes[ii] in (int, float):
                    VARS_inds.append(ii)
                    VARS_names.append(x)

        Nvars = len(VARS_names)
        Y = np.zeros((Lx,Nvars), dtype=float)

        print "++ Factor analysis contains %s variables:" % (Nvars)
        fff.write("\n++ Factor analysis contains %s variables:\n" % (Nvars))
        for j in range(Nvars):
            jj = VARS_inds[j]
            print "\t %s" % (VARS_names[j])
            fff.write("\t %s\n" % (VARS_names[j]))
            for i in range(Lx):
                Y[i,j] = csv_data[i][jj]

        # diagnostic check of the correlation matrix; the return value
        # is not needed here
        CorMatCheck(Y, VARS_names)

        # SETUP THE NUMBER OF FACTORS
        # use eval info to pick number of vars, if user hasn't
        if not Nfac:
            print "++ Graphing of parallel analysis (PA) Horn's test is:",
            if DO_GRAPH:
                print "ON."
            else:
                print "OFF."
            print "++ PA percentile in Horn's test is: ", N_cent
            print "++ Number of PA Monte Carlo iterations: ", N_iter

            # mostly default values, some user control
            PARN = r.paran( Y, iterations=N_iter, centile=N_cent,
                            quietly=False, status=True, all=True,
                            cfa=True, graph=DO_GRAPH, color=True,
                            col=r.c("black","red","blue"), lty=r.c(1,2,3),
                            lwd=1, legend=True, file=OUT_paran, width=640,
                            height=640, grdevice=OUT_dev, seed=0)

            if DO_GRAPH:
                grDevices.dev_off()
                print "++ Don't worry about the briefly passing image."
                print "\tIt has been saved as: %s\n\n" % ( OUT_paran )

            # number of factors retained by Horn's test
            N_PARN_arr = np.array(PARN.rx2('Retained'))
            Nfac = int(N_PARN_arr[0])

        else:
            if Nfac > Nvars:
                print "*+ Warning! The user has selected a number of factors larger"
                print "\tthan the number of variables (%d > %d)!" % (Nfac, Nvars)
                print "\t-> Therefore, we're setting it to be %d," % (Nvars)
                print "\t  but you might still want to check if anything went awry?"
                # actually apply the clamp promised by the message above
                # (previously it was announced but never assigned)
                Nfac = Nvars
            else:
                print "++ The user has selected the number of factors"
                print "\tto be %d out of %d." % (Nfac, Nvars)

        # RUN THE FACTOR ANALYSIS IN R
        FA_out = r.factanal(Y,
                            factors=Nfac,
                            scores='regression',
                            rotation="varimax")

        FA_scores = np.array(FA_out.rx2('scores'))
        FA_loadings = np.array(FA_out.rx2('loadings'))

        # match up highest loadings with the variable names, so we have an
        # idea of what's going into the sausage

        # how many loadings to output.
        # Can be: ALL, 5, or user-entered other
        if not Ntopload:
            Ntopload = min(Nvars, 5)
        elif Ntopload < 0:
            Ntopload = Nvars
        else:
            Ntopload = min(Nvars, Ntopload)
        if Ntopload == Nvars:
            strNtopload = "ALL "+str(Nvars)
        else:
            strNtopload = 'top '+str(Ntopload)+'/'+str(Nvars)

        # ordering process: report each factor's largest loadings
        FA_titles = []
        print "\n++ Factor loading contributions (%s):" % (strNtopload)
        fff.write("\n++ Factor loading contributions (%s):\n" % (strNtopload))
        for i in range(Nfac):
            P = list(FA_loadings[:,i])
            Q = list(VARS_names)
            PQ = sorted(zip(P,Q), reverse=True)
            str_title = "%s_%02d" % (flab, i+1)
            FA_titles.append(str_title)
            print "\n\t"+str_title
            fff.write("\n\t"+str_title+"\n")
            for j in range(Ntopload):
                print "\t%20s  %12.5f" % (PQ[j][1],PQ[j][0])
                fff.write("\t%20s  %12.5f\n" % (PQ[j][1],PQ[j][0]))
    finally:
        fff.close()

    return FA_scores, FA_titles, VARS_names
# Example #2
# 0
#############################
# Let's get going!

filthresh=0.6
filtper=94
nComponents=15
numSkills=6 # Number of Skills to display
permitted=allDat.columns

for count1 in range(10):

ad2=allDat[permitted]
ad3=np.array(ad2) #added by Ioannis
# Factor Analysis
#lff=pd.DataFrame(np.array(r.factanal(ad2,nComponents)[1])).T
lff=pd.DataFrame(np.array(r.factanal(ad3,nComponents,scores='regression', rotation = "varimax")[1])).T
fit = r.factanal(ad3,nComponents, scores='regression', rotation = "varimax")
corr = fit[3]
scores= np.array(fit.rx2('scores'))

if True: # If want compatibility with ICIS paper set to False
filthresh=np.percentile(lff,filtper)

# Compiling list of "permitted" elements
permitted=[]
for count2 in range(len(lff[0])):
foo=ad2.columns[lff.iloc[count2,:].apply(abs)>filthresh]
if len(foo)>2:
permitted.extend(foo)
permitted=np.unique(permitted)
print "#Permitted: "+str(len(permitted))
# Example #3
# 0
def fa(source=False, use_filter="default", data_file="latest", participant_subset="", drop_metadata=True, drop=[], clean=7, factors=5, facecolor="#ffffff"):
    '''Load survey data, filter it, run an R factanal() factor analysis,
    and draw the question loadings on each factor as a heatmap.

    source             -- config key naming the data address; False ->
                          read the default key from the 'Source' section
    use_filter         -- key into the "Filters" config section; names the
                          filter .csv file to load
    data_file          -- basename (without .csv) of the data file
    participant_subset -- "odd", "even", "male", "female", or "" for all
    drop_metadata      -- if True, drop columns listed in filters["metadata"]
    drop               -- filter-column names whose listed data columns are
                          dropped (NOTE(review): mutable default argument --
                          harmless here only because it is never mutated)
    clean              -- keep only respondents whose answers use more than
                          this many distinct values
    factors            -- number of factors passed to r.factanal
    facecolor          -- matplotlib figure background color

    NOTE(review): nothing visible in this excerpt saves or returns the
    figure -- presumably handled after the last line shown; confirm.
    '''
    #gets config file:
    config = get_config_file(localpath=path.dirname(path.realpath(__file__))+'/')

    #IMPORT VARIABLES
    if not source:
	    source = config.get('Source', 'source')
    data_path = config.get('Addresses', source)
    filter_dir = config.get('Paths', "filter_dir")
    filter_name = config.get("Filters", use_filter)
    #END IMPORT VARIABLES

    filter_path = path.dirname(path.realpath(__file__)) + '/' + filter_dir + filter_name + '.csv'

    filters = DataFrame.from_csv(filter_path, header=None).transpose() # transpose filters because of .csv file formatting
    all_data = DataFrame.from_csv(data_path + data_file + ".csv")
    all_data = all_data.reset_index(level=0)
    #~ print filters["metadata"]

    #clean data of respondents who only check extreme answers
    #(i.e. rows with `clean` or fewer distinct values):
    all_data = all_data[map(lambda y: len(set(y)) > clean,np.array(all_data))]

    if drop_metadata == True:
        # drops metadata
        all_data = all_data.drop(filters["metadata"][Series.notnull(filters["metadata"])], axis=1)

    drop_list = []
    for drop_item in drop:
        # compile list of column names to be dropped:
        drop_list += list(filters[drop_item][Series.notnull(filters[drop_item])])
    #get unique column names (the list may contain duplicates if overlaying multiple filters):
    drop_list = list(set(drop_list))

    all_data = all_data.drop(drop_list, axis=1)

    if participant_subset == "odd":
        # selects only odd indexes (keep the other dataset half for validation)
        keep_rows = all_data.index.values[1::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "even":
        # selects only even indexes (keep the other dataset half for validation)
        keep_rows = all_data.index.values[0::2]
        filtered_data = all_data.ix[keep_rows]
    elif participant_subset == "male":
        # selects only male participants
        filtered_data = all_data[all_data['My legal gender:'] == 'Male']
    elif participant_subset == "female":
        # selects only female participants
        filtered_data = all_data[all_data['My legal gender:'] == 'Female']
    else:
        filtered_data = all_data

    #convert to correct type for analysis:
    filtered_data_array = np.array(filtered_data, dtype='float64')

    # rescale to 0-1 (answers appear to be on a 0-100 scale -- TODO confirm)
    filtered_data_array = filtered_data_array / 100

    # promax (oblique) rotation, unlike the varimax used elsewhere in this file
    fit = r.factanal(filtered_data_array, factors, rotation='promax')
    load = r.loadings(fit)
    load = numpy2ri.ri2numpy(load)

    # transpose (in R) so rows = factors, columns = questions
    load = r.t(load)

    # recenter the diverging colormap so a loading of 0 sits at the color
    # midpoint (assumes load spans negative and positive values -- confirm)
    remapped_cmap = remappedColorMap(cm.PiYG, start=(np.max(load)-abs(np.min(load)))/(2*np.max(load)), midpoint=abs(np.min(load))/(np.max(load)+abs(np.min(load))), name='shrunk')

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(17.5, 5), facecolor=facecolor)
    graphic = ax.imshow(load, cmap = remapped_cmap, interpolation='none')
    # one tick per question column / per factor row
    ax.xaxis.set_major_locator(matplotlib.ticker.MultipleLocator(base=1.0))
    ax.yaxis.set_major_locator(matplotlib.ticker.MultipleLocator(base=1.0))
    ax.set_xticklabels([0]+filtered_data.columns.tolist(),fontsize=8,rotation=90)
    ax.set_yticklabels(np.arange(factors+1))
    ax.set_ylabel('Factors')
    ax.set_title("Question Loadings on Factors")

    #Recolor plot spines:
    for spine_side in ["bottom", "top", "left", "right"]:
        ax.spines[spine_side].set_color("#777777")

    #Remove ticks:
    plt.tick_params(axis='both', which='both', left="off", right="off", bottom='off', top='off')

    divider = make_axes_locatable(ax)
    #calculate width for cbar so that it is equal to the question column width:
    cbar_width = str(100/np.shape(load)[1])+ "%"
    cax = divider.append_axes("right", size=cbar_width, pad=0.05)
    cbar = colorbar(graphic, cax=cax, drawedges=True)

    #Limit the number of ticks:
    tick_locator = ticker.MaxNLocator(nbins=6)
    cbar.locator = tick_locator
    cbar.update_ticks()

    #Align ticklabels so that negative values are not misaligned (meaning right align):
    for t in cbar.ax.get_yticklabels():
        t.set_horizontalalignment('right')
        t.set_x(0.045*(np.shape(load)[1]+6))

    #Tweak color bar borders
    cbar.outline.set_color("#666666")
    cbar.dividers.set_linewidth(0)