# Module-level imports these functions rely on (they may already exist at the
# top of the file); qbPre and qbGbl are project-local helper modules.
import numpy
import pandas as pd
import scipy.stats
import pylab as P
from sklearn.preprocessing import MultiLabelBinarizer

import qbGbl
import qbPre


def generateY(filData):
    """Build the binary topic-label matrix Y from the '|'-separated answers."""
    # Convert each answer string into its list of topic labels.
    # ('None' answers become ordinary labels here; an earlier CountVectorizer
    # sketch considered stop_words=[u'None'] to turn them into all-zero
    # negative examples instead.)
    newY = []
    for answer in filData['answer']:
        newY.append(qbPre.convClasses(answer, '|'))
    # MultiLabelBinarizer stands in for the commented-out binary
    # CountVectorizer: it accepts lists of labels directly and exposes
    # .classes_, which this function already relied on. Pass
    # sparse_output=True if callers expect a sparse matrix.
    yTransformer = MultiLabelBinarizer()
    Y = yTransformer.fit_transform(newY)
    # Save the topic labels as a reference dictionary: classes_[i] names
    # column i of Y.
    qbGbl.classDict = yTransformer.classes_
    return Y
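
# Hedged sketch (not part of the pipeline): shows the label matrix that
# generateY produces, assuming qbPre.convClasses splits an answer string on
# the given delimiter. The sample answers below are made up.
def _demoGenerateY():
    answers = ['sports|politics', 'None', 'politics']
    labels = [a.split('|') for a in answers]  # stand-in for qbPre.convClasses
    mlb = MultiLabelBinarizer()
    Y = mlb.fit_transform(labels)
    print mlb.classes_  # column order, e.g. ['None' 'politics' 'sports']
    print Y             # one binary indicator row per answer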
def analyse(filename):
    """Score a new batch of answers against the gold PerfectDataset."""
    # NOTE: PerfectDataset.csv was originally produced by a one-off pass
    # (previously inlined here as commented-out code) that kept, for every
    # declaration in the new batch, the matching gold row from
    # qbGbl.filFileName and wrote the result to
    # '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName).
    new = qbPre.readDataFrame(filename, None, 0)
    new = new[['WorkerId', 'Input.declaration', 'Answer.Q1']]
    filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)
    old = qbPre.readDataFrame(filename, None, 0)
    # Unique gold declarations to score against.
    goldDecs = pd.Series(old['declaration']).drop_duplicates()
    print goldDecs
    # Deduplicated view of the new batch, as a sanity check.
    # (cols= is the old pandas keyword; modern pandas spells it subset=.)
    tempSer = new.drop_duplicates(cols=['Input.declaration'])
    print len(new)
    print len(tempSer)
    accuracy = []
    count = 0
    for dec in goldDecs:
        matches = new[new['Input.declaration'] == dec]
        if matches.empty:
            continue
        if len(matches) > 1:
            print matches
        count += len(matches)
        # Gold and submitted topic lists for this declaration.
        tempOld = qbPre.convClasses(list(old[old['declaration'] == dec]['answer'])[0], '|')
        tempNew = qbPre.convClasses(list(matches['Answer.Q1'])[0], '|')
        # Score = fraction of the submitted topics found in the gold answer.
        tempScore = 0.0
        for topic in tempNew:
            if topic in tempOld:
                tempScore += 1.0
        tempScore /= float(len(tempNew))
        accuracy.append(tempScore)
    print scipy.stats.tmean(accuracy)
    print count
    # Cumulative, normalised histogram of the per-declaration scores.
    P.figure()
    n, bins, patches = P.hist(accuracy, len(set(accuracy)),
                              histtype='step', cumulative=True, normed=1)
    P.title("Score distribution")
    P.xlabel("score")
    P.ylabel("Cumulative fraction")
    P.show()
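
# Hedged worked example of the per-declaration score used in analyse(): the
# fraction of the submitted answer's topics that also appear in the gold
# answer (i.e. the precision of the submission). The sample topics are made up.
def _demoScore():
    gold = ['sports', 'politics']
    submitted = ['sports', 'weather']
    score = 0.0
    for topic in submitted:
        if topic in gold:
            score += 1.0
    score /= float(len(submitted))
    print score  # 0.5: one of the two submitted topics matches the gold set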
def analyseWorkers():
    """Per-worker, per-submission scores over the gold declarations."""
    filData = qbPre.readDataFrame(qbGbl.filFileName, None, 0)
    filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)
    old = qbPre.readDataFrame(filename, None, 0)
    perfectDecs = old.declaration.drop_duplicates()
    # Keep only the rows whose declaration also appears in the gold set.
    dataSet = pd.DataFrame()
    for dec in perfectDecs:
        matches = filData[filData['Input.declaration'] == dec]
        if matches.empty:
            continue
        if dataSet.empty:
            dataSet = matches
        else:
            dataSet = dataSet.append(matches)
    dataSet.SubmitTime = pd.to_datetime(dataSet.SubmitTime)
    # Chronological order (old pandas API; modern pandas uses sort_values).
    dataSet = dataSet.sort(columns=['SubmitTime'])
    records = []
    workers = dataSet.WorkerId.drop_duplicates()
    for worker in workers:
        tempStats = {'score': 0.0, 'freq': 0}
        tempRecords = dataSet[dataSet.WorkerId == worker]
        for row in tempRecords.itertuples():
            newRow = list(row)
            # Gold and submitted topic lists for this row's declaration.
            tempOld = qbPre.convClasses(list(old[old['declaration'] == newRow[-2]].answer)[0], '|')
            tempNew = qbPre.convClasses(newRow[-1], '|')
            # Per-row score: fraction of submitted topics in the gold answer.
            tempScore = 0.0
            for topic in tempNew:
                if topic in tempOld:
                    tempScore += 1.0
            tempScore /= float(len(tempNew))
            tempStats['freq'] += 1
            tempStats['score'] += tempScore
            # Running mean score for this worker so far.
            aggrScore = tempStats['score'] / tempStats['freq']
            # Submit time as a fractional hour of day.
            tm = row[1].time()
            tm = float(tm.hour) + float(tm.minute) / 60
            # Fraction of this worker's records seen so far.
            compl = float(tempStats['freq']) / float(len(tempRecords))
            newRow.extend([tm, tempScore, aggrScore, compl])
            records.append(newRow)
    # itertuples puts the index first; keep it as the frame's index.
    records = numpy.array(records)
    dataSet = pd.DataFrame(records[:, 1:],
                           columns=['SubmitTime', 'WorkerId', 'Input.declaration',
                                    'Answer.Q1', 'Time', 'TempScore',
                                    'AggrScore', 'Completion'],
                           index=records[:, 0])
    return dataSet
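
# Hedged usage sketch for analyseWorkers(): assumes qbGbl.filFileName and
# PerfectDataset.csv exist. The returned frame is built from an object-dtype
# numpy array, so numeric columns need an explicit cast before plotting.
def _demoAnalyseWorkers():
    ds = analyseWorkers()
    P.figure()
    P.scatter(ds['Time'].astype(float), ds['AggrScore'].astype(float))
    P.xlabel("hour of day")
    P.ylabel("aggregate score")
    P.title("Worker aggregate score vs. time of submission")
    P.show()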