Esempio n. 1
0
## response_<scaleIdentifier>, where scaleIdentifier is a single letter. Each cell
## should have a comma-separated list in it that will be split and handled by the program

## The output has a format of word,scale,tau_k

import cct
import pandas
import sys
import numpy
from sklearn import preprocessing

# Load the input table from the path given as the first command-line
# argument. The file is read as tab-delimited; quoting=3 is the numeric value
# of csv.QUOTE_NONE, so quote characters are treated as ordinary text.
data = pandas.read_csv(sys.argv[1],quoting=3,delimiter="\t")

#Calculate competency scores
# cct.processData returns three values, unpacked here; further down,
# competencies and originalCompetencies are both indexed by dataset name.
# NOTE(review): eigenRatios is not used in the visible part of the script --
# confirm whether later code needs it.
competencies, eigenRatios, originalCompetencies = cct.processData(data)
# Mapping from dataset name to that dataset's response matrix (it is iterated
# with .items() in the per-scale loop).
responseMatrices = cct.buildMatrices(data)

# Starts empty. NOTE(review): presumably filled by later code that is not
# visible here -- confirm.
sortedWords = []

#For each scale in the dataset
for dataset, responseMatrix in responseMatrices.items():
    datasetLetter = dataset.split('_')[1]
    estimatedRank = {}

    # Transform the response matrix into a scaled matrix
    # Also flip the rankings of the informants that originally had negative competencies
    responseMatrix = pandas.DataFrame(preprocessing.scale(cct.reverseNegatives(responseMatrix,originalCompetencies[dataset]),axis=1), columns = responseMatrix.columns)
    local_comp = competencies[dataset]

    #Calculate tau_k for each word in the scale
    for col in responseMatrix:
Esempio n. 2
0
    a = 1 - D
    ratio1 = ((D + g * a) * ( 1 - g * a))/(a ** 2 * g * (1 - g))
    ratio2 = (1 - g * a)/(a * (1 - g))
    return X * numpy.log(ratio1) - numpy.log(ratio2)

# Perform spellchecking and lowercasing on data
# The input path is taken from the first command-line argument.
responses, prompts = cleaning.processFile(sys.argv[1])

# Perform CCT on data and cap competencies that are greater than 1
# at .999
cctResults = cct.processData(responses,prompts)
# Element 0 of the result is wrapped in a DataFrame so it can be capped with a
# boolean mask here and indexed by dataset name in the loop further down.
competencyMatrix = pandas.DataFrame(cctResults[0])
competencyMatrix[competencyMatrix > 1] = .999
# The parenthesized single-argument form prints identically under the
# Python 2 print statement and also works as the Python 3 print function.
# NOTE(review): element 1 is presumably a per-dataset diagnostic (e.g.
# eigenvalue ratios) -- confirm against cct.processData.
print(pandas.Series(cctResults[1]))

# One response matrix per dataset, keyed by dataset name.
responseMatrices = cct.buildMatrices(responses)
correctSet = {}

# Outer level auto-creates an inner mapping on first access.
# NOTE(review): the inner defaultdict() has no default factory, so a missing
# inner key still raises KeyError -- confirm this is intended.
correct = defaultdict(lambda:defaultdict())
# Calculate G_k for each word in each scale
for dataset, responseMatrix in responseMatrices.items():
    datasetLetter = dataset.split('_')[1]
    local_comp = competencyMatrix[dataset][responseMatrix.index]

    # Get the average number of words per informant for each scale
    # and the total number of words given per scale. This is used to derive the bias variable

    avgResponse =  responseMatrix[responseMatrix==1].count(1).mean()
    length = responseMatrix.shape[1]

    prompt = prompts['Answer.prompt_' + datasetLetter]