# Imports inferred from usage in this module; qbGbl, qbPre and qbPrepare are
# project-local helpers referenced throughout.
import random

import numpy
import pandas as pd
import pylab as P
import scipy.stats

import qbGbl
import qbPre
import qbPrepare


def preProcessData(type):
    # load the cleaned dataset
    filData = qbPre.readDataFrame(
        '{0}_{1}.csv'.format(qbGbl.dataSetFileName, type), None, 0)
    # drop duplicate declarations
    filData = filData.drop_duplicates(cols=['declaration'])
    # pick all the observations labelled 'None of the above' and drop them
    sample = filData[filData['answer'].str.contains('None')]
    filData = filData.drop(sample.index)
    # shuffle the remaining observations
    rows = list(filData.index)
    random.shuffle(rows)
    filData = filData.ix[rows]
    # build the feature matrix and label vector
    X = qbPrepare.generateX(filData)
    Y = qbPrepare.generateY(filData)
    return [filData, X, Y]
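# Example usage (a sketch): build the design matrix and labels for one
# dataset type; the 'train' suffix is hypothetical and depends on how the
# cleaned csv files were named.
#
#   filData, X, Y = preProcessData('train')
#   print 'Number of observations: {0}'.format(len(filData))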
def pickRowDataset():
    # load the raw batch results and keep only the columns of interest
    newData = qbPre.readDataFrame(
        '{0}/Batch_1123120_batch_results.csv'.format(qbGbl.oriFileName),
        None, 0)
    data = pd.DataFrame({'pv_id': newData['Input.pv_id']})
    data['global_user_id'] = newData['Input.global_user_id']
    data['time'] = newData['Input.time']
    data['declaration'] = newData['Input.declaration']
    return data
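# Sketch of the frame returned above (column names taken from the code):
#
#   data = pickRowDataset()
#   print data.columns  # pv_id, global_user_id, time, declaration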
def cleanExistingData(filename1, filename2):
    # declarations that have already been labelled
    labelled = qbPre.readDataFrame(filename1, None, 0)
    labelled = labelled['Input.declaration']
    print labelled
    # remove every labelled declaration from the unlabelled set
    unlabelled = qbPre.readDataFrame(filename2, None, 0)
    unlabelled = unlabelled.drop_duplicates(cols=['declaration'])
    for row in labelled:
        unlabelled = unlabelled.drop(
            unlabelled[unlabelled['declaration'] == row].index)
    print unlabelled
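# Example usage (a sketch; both paths are hypothetical placeholders for a
# labelled batch-results csv and an unlabelled declarations csv):
#
#   cleanExistingData('data/labelled_batch.csv', 'data/unlabelled.csv')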
def generateSample(fulConSet, m, newFileName, n):
    # generate old data sample ================================
    oldSample = pd.DataFrame(
        columns=('pv_id', 'global_user_id', 'time', 'declaration'))
    # pick full information
    oldData = pickRowDataset()
    # write the fully concorded set to disk for later reference
    tempFulConSet = pd.DataFrame(list(fulConSet), columns=['declaration'])
    tempFulConSet.to_csv('data/relAnalytics/fulConSet.csv', index=False)
    # find the full info of the fully concorded occurrences
    for feedback in fulConSet:
        p = oldData[oldData['declaration'] == feedback]
        oldSample = oldSample.append(p[0:1], ignore_index=True)
    # duplicate the population until it exceeds the sample size
    while len(oldSample) < m:
        oldSample = oldSample.append(oldSample, ignore_index=True)
    # shuffle the observations
    rows = list(oldSample.index)
    random.shuffle(rows)
    oldSample = oldSample.ix[rows]
    print oldSample.declaration
    # pick a random sample of size m from the old set
    rows = random.sample(oldSample.index, m)
    sample = oldSample.ix[rows]
    # generate new data sample =================================
    # load the filtered dataset and pick a random sample of size n
    newData = qbPre.readDataFrame(newFileName, None, 0)
    newRows = random.sample(newData.index, n)
    newSample = newData.ix[newRows]
    # aggregate the old and new samples together and shuffle them
    sample = sample.append(newSample, ignore_index=True)
    print sample.declaration
    rows = list(sample.index)
    random.shuffle(rows)
    sample = sample.ix[rows]
    print sample.declaration
    # write the combined sample to csv
    sample.to_csv('data/write/newFeedbackSample2.csv', index=False)
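# Example usage (a sketch): fulConSet is assumed to be an iterable of fully
# concorded declarations; the sample sizes m and n are illustrative only.
#
#   generateSample(fulConSet, 100, 'data/write/filteredFeedback.csv', 50)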
def analyseWorkers():
    # load the filtered batch results and the reference ('perfect') dataset
    filData = qbPre.readDataFrame(qbGbl.filFileName, None, 0)
    filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)
    old = qbPre.readDataFrame(filename, None, 0)
    perfectDecs = old.declaration.drop_duplicates()
    # keep only the observations whose declaration appears in the perfect set
    dataSet = pd.DataFrame()
    for row in perfectDecs:
        if not filData[filData['Input.declaration'] == row].empty:
            if dataSet.empty:
                dataSet = filData[filData['Input.declaration'] == row]
            else:
                dataSet = dataSet.append(
                    filData[filData['Input.declaration'] == row])
    dataSet.SubmitTime = pd.to_datetime(dataSet.SubmitTime)
    dataSet = dataSet.sort(columns=['SubmitTime'])
    records = []
    workers = dataSet.WorkerId.drop_duplicates()
    for worker in workers:
        tempStats = {'score': 0.0, 'freq': 0}
        tempRecords = dataSet[dataSet.WorkerId == worker]
        for row in tempRecords.itertuples():
            newRow = list(row)
            # reference topics vs the topics this worker picked
            tempOld = qbPre.convClasses(
                list(old[old['declaration'] == newRow[-2]].answer)[0], '|')
            tempNew = qbPre.convClasses(newRow[-1], '|')
            # score = fraction of the worker's topics found in the reference
            tempScore = 0.0
            for topic in tempNew:
                if topic in tempOld:
                    tempScore += 1.0
            tempScore /= float(len(tempNew))
            tempStats['freq'] += 1
            tempStats['score'] += tempScore
            aggrScore = tempStats['score'] / tempStats['freq']
            # submission time of day as a fractional hour
            tm = row[1].time()
            tm = float(tm.hour) + float(tm.minute) / 60
            # fraction of this worker's records processed so far
            compl = float(tempStats['freq']) / float(len(tempRecords))
            newRow.extend([tm, tempScore, aggrScore, compl])
            records.append(newRow)
    records = numpy.array(records)
    dataSet = pd.DataFrame(
        records[:, 1:],
        columns=['SubmitTime', 'WorkerId', 'Input.declaration', 'Answer.Q1',
                 'Time', 'TempScore', 'AggrScore', 'Completion'],
        index=records[:, 0])
    return dataSet
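# Example usage (a sketch): per-worker scores against the perfect dataset.
#
#   workerStats = analyseWorkers()
#   print workerStats[['WorkerId', 'TempScore', 'AggrScore', 'Completion']]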
def filterNoneObs(type):
    # write every observation labelled 'None' to its own csv file
    filData = qbPre.readDataFrame(
        '{0}_{1}.csv'.format(qbGbl.dataSetFileName, type), None, 0)
    noneData = filData[filData.answer == 'None']
    noneData.to_csv(
        '{0}_{1}.csv'.format(qbGbl.noneSetFileName, type), index=False)
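# Example usage (a sketch; 'train' is a hypothetical dataset type):
#
#   filterNoneObs('train')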
def analyse(filename):
    # load the new batch and keep only the columns we need
    filData = qbPre.readDataFrame(filename, None, 0)
    filData = filData[['WorkerId', 'Input.declaration', 'Answer.Q1']]
    new = filData
    # load the reference ('perfect') dataset
    filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)
    old = qbPre.readDataFrame(filename, None, 0)
    filData = pd.Series(old['declaration']).drop_duplicates()
    print filData
    tempSer = new.drop_duplicates(cols=['Input.declaration'])
    print len(new)
    print len(tempSer)
    accuracy = []
    count = 0
    for row in filData:
        if not new[new['Input.declaration'] == row].empty:
            # report declarations answered more than once
            if len(new[new['Input.declaration'] == row]) > 1:
                print new[new['Input.declaration'] == row]
                count += len(new[new['Input.declaration'] == row])
            # compare the reference topics against the first new answer
            tempOld = qbPre.convClasses(
                list(old[old['declaration'] == row]['answer'])[0], '|')
            tempNew = qbPre.convClasses(
                list(new[new['Input.declaration'] == row]['Answer.Q1'])[0],
                '|')
            # score = fraction of new topics found in the reference
            tempScore = 0.0
            for topic in tempNew:
                if topic in tempOld:
                    tempScore += 1.0
            tempScore /= float(len(tempNew))
            accuracy.append(tempScore)
    print scipy.stats.tmean(accuracy)
    print count
    # plot the cumulative score distribution
    P.figure()
    n, bins, patches = P.hist(accuracy, len(set(accuracy)),
                              histtype='step', cumulative=True, normed=1)
    P.title("Score distribution")
    P.xlabel("score")
    P.ylabel("Frequency")
    P.show()
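# Minimal driver (a sketch, not part of the original pipeline): the batch
# results path reuses the file referenced in pickRowDataset above.
if __name__ == '__main__':
    analyse('{0}/Batch_1123120_batch_results.csv'.format(qbGbl.oriFileName))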