# NOTE(review): the physical line below is an entire multi-line script whose
# newlines were lost in a paste/extraction. Because it begins with "##", every
# token on it — including the imports and the loop — is now syntactically a
# comment and the chunk does nothing as written.
# Reconstructed intent, from the visible tokens (confirm against VCS history):
#   * reads a tab-separated file from sys.argv[1] via
#     pandas.read_csv(..., quoting=3, delimiter="\t"); columns are named
#     response_<scaleIdentifier> with a single-letter scale id, each cell a
#     comma-separated list; intended output format is word,scale,tau_k;
#   * runs Cultural Consensus Theory: cct.processData(data) yields
#     (competencies, eigenRatios, originalCompetencies), and
#     cct.buildMatrices(data) yields one response matrix per scale;
#   * per scale: cct.reverseNegatives flips the rankings of informants whose
#     original competency was negative, then sklearn preprocessing.scale(axis=1)
#     z-scores each informant's row, rebuilt as a DataFrame with the original
#     column labels;
#   * finally iterates columns ("for col in responseMatrix:") to compute tau_k
#     per word — the loop body is cut off at the end of this line, so this
#     chunk is incomplete as seen here.
# TODO(review): restore the original line breaks before making any functional
# edit; the line is kept byte-identical below on purpose.
## response_<scaleIdentifier>, where scaleIdentifier is a single letter. Each cell ## should have a comma seperated list in it that will be split and handled by the program ## The output has a format of word,scale,tau_k import cct import pandas import sys import numpy from sklearn import preprocessing data = pandas.read_csv(sys.argv[1],quoting=3,delimiter="\t") #Calculate competency scores competencies, eigenRatios, originalCompetencies = cct.processData(data) responseMatrices = cct.buildMatrices(data) sortedWords = [] #For each scale in the dataset for dataset, responseMatrix in responseMatrices.items(): datasetLetter = dataset.split('_')[1] estimatedRank = {} # Transform the response matrix into a scaled matrix # Also flip the rankings of the informants that orginially had negative competencies responseMatrix = pandas.DataFrame(preprocessing.scale(cct.reverseNegatives(responseMatrix,originalCompetencies[dataset]),axis=1), columns = responseMatrix.columns) local_comp = competencies[dataset] #Calculate tau_k for each word in the scale for col in responseMatrix:
# NOTE(review): the physical line below is a second collapsed multi-line script
# (newlines lost), and it does not parse as written. Two structural problems
# are visible even before reflowing:
#   1. It opens mid-function: "a = 1 - D ... return X * numpy.log(ratio1) -
#      numpy.log(ratio2)" is the tail of a def whose header (and the meaning of
#      D, g, X) lies outside this view — presumably a per-word log-likelihood
#      term; do not guess at the missing signature here.
#   2. "print pandas.Series(cctResults[1])" is a Python-2 print statement, so
#      this file targets Python 2; porting to Python 3 needs print(...) and a
#      review of integer division elsewhere.
# Reconstructed intent of the script portion (confirm against VCS history):
#   * cleaning.processFile(sys.argv[1]) spellchecks/lowercases the input,
#     returning (responses, prompts);
#   * cct.processData(responses, prompts) gives CCT results; competencies > 1
#     are capped at .999 in competencyMatrix;
#   * per scale: local_comp selects that scale's competencies, avgResponse is
#     the mean count of 1-entries per informant row, length the number of
#     columns, prompt the matching 'Answer.prompt_<letter>' entry — inputs to
#     a G_k-per-word calculation whose loop body is cut off at the end of this
#     line; `cleaning` and `defaultdict` are imported somewhere outside this
#     view — verify.
# TODO(review): restore the original line breaks before making any functional
# edit; the line is kept byte-identical below on purpose.
a = 1 - D ratio1 = ((D + g * a) * ( 1 - g * a))/(a ** 2 * g * (1 - g)) ratio2 = (1 - g * a)/(a * (1 - g)) return X * numpy.log(ratio1) - numpy.log(ratio2) # Perform spellchecking and lowercasing on data responses, prompts = cleaning.processFile(sys.argv[1]) # Perform CCT on data and cap competencies that are great than 1 # at .999 cctResults = cct.processData(responses,prompts) competencyMatrix = pandas.DataFrame(cctResults[0]) competencyMatrix[competencyMatrix > 1] = .999 print pandas.Series(cctResults[1]) responseMatrices = cct.buildMatrices(responses) correctSet = {} correct = defaultdict(lambda:defaultdict()) # Calculate G_k for each word in each scale for dataset, responseMatrix in responseMatrices.items(): datasetLetter = dataset.split('_')[1] local_comp = competencyMatrix[dataset][responseMatrix.index] # Get the average number of words per informant for each scale # and the total number of words given per scale. This is used to derive the bias variable avgResponse = responseMatrix[responseMatrix==1].count(1).mean() length = responseMatrix.shape[1] prompt = prompts['Answer.prompt_' + datasetLetter]