Example #1
0
def preProcessData(type):
	"""Load the cleaned dataset for *type*, drop duplicate declarations
	and the 'None of the above' observations, shuffle what remains, and
	build the model matrices.

	Returns [filData, X, Y] where X and Y come from qbPrepare.
	"""
	# Read the cleaned per-type CSV (no names row override, header at row 0).
	filData = qbPre.readDataFrame('{0}_{1}.csv'.format(qbGbl.dataSetFileName, type), None, 0)

	# Keep only the first occurrence of each declaration.
	filData = filData.drop_duplicates(cols=['declaration'])

	# Discard every observation answered as 'None of the above'.
	noneRows = filData[filData['answer'].str.contains('None')]
	filData = filData.drop(noneRows.index)

	# Shuffle the surviving observations.
	order = list(filData.index)
	random.shuffle(order)
	filData = filData.ix[order]

	# Feature matrix and label vector for the learner.
	X = qbPrepare.generateX(filData)
	Y = qbPrepare.generateY(filData)

	return [filData, X, Y]
Example #2
0
def preProcessData(type):
    """Load the cleaned dataset for *type*, drop duplicate declarations
    and the 'None of the above' observations, shuffle the rows, and
    build the model matrices.

    Returns [filData, X, Y].

    NOTE(review): drop_duplicates(cols=...) and DataFrame.ix are removed
    in modern pandas (subset=... / .loc); this code targets a legacy
    pandas release — confirm the pinned version before upgrading.
    """
    # load the cleaned dataset
    # filData = qbPre.readFile('{0}_{1}.csv'.format(qbGbl.dataSetFileName,type),type);

    filData = qbPre.readDataFrame(
        '{0}_{1}.csv'.format(qbGbl.dataSetFileName, type), None, 0)
    filData = filData.drop_duplicates(cols=['declaration'
                                            ])  # drop all the duplicates
    sample = filData[filData['answer'].str.contains(
        'None')]  # pick all the observations answered 'None of the above'
    filData = filData.drop(sample.index)  # drop them as well

    rows = list(filData.index)
    random.shuffle(rows)
    # print rows
    filData = filData.ix[rows]

    # print oldSample.declaration
    # filData = qbPre.readFile('{0}_{1}.csv'.format(qbGbl.dataSetFileName,type),type);
    # print filData

    # m = int(round(len(filData)*1.0))
    # print m
    # rows = random.sample(filData.index,m)

    # # random sample generated for old set
    # filData = filData.ix[rows]
    # print 'Number of observations: {0}'.format(len(filData))

    # feature matrix and label vector for the learner
    X = qbPrepare.generateX(filData)

    Y = qbPrepare.generateY(filData)

    return [filData, X, Y]
Example #3
0
def pickRowDataset():
	"""Read the raw batch-results CSV and return a frame holding the four
	columns the sampling code needs: pv_id, global_user_id, time,
	declaration (the 'Input.' prefix is stripped from the names)."""
	raw = qbPre.readDataFrame('{0}/Batch_1123120_batch_results.csv'.format(qbGbl.oriFileName), None, 0)

	# Seed the result frame from pv_id so the column order is fixed,
	# then copy the remaining columns across one by one.
	picked = pd.DataFrame(raw['Input.pv_id'], columns=['pv_id'])
	picked['global_user_id'] = raw['Input.global_user_id']
	picked['time'] = raw['Input.time']
	picked['declaration'] = raw['Input.declaration']

	return picked
Example #4
0
def cleanExistingData(filename1,filename2):
	"""Drop from the second file every declaration already present in the
	first, then print the remainder.

	filename1 -- CSV of already-labelled rows (column 'Input.declaration')
	filename2 -- CSV of candidate rows (column 'declaration'); duplicates
	             are removed before filtering
	"""

	labelled = qbPre.readDataFrame(filename1,None,0)
	labelled = labelled['Input.declaration']
	print labelled
	
	unlabelled = qbPre.readDataFrame(filename2,None,0);

	# de-duplicate declarations before comparing against the labelled set
	unlabelled = unlabelled.drop_duplicates(cols=['declaration'])

	# print unlabelled[unlabelled['declaration']==labelled]
	# drop every unlabelled row whose declaration is already labelled
	for row in labelled:
		unlabelled = unlabelled.drop(unlabelled[unlabelled['declaration']==row].index)
		# if not unlabelled[unlabelled['declaration']==row].empty:
		# 	unlabelled.drop
		# if not (unlabelled[unlabelled['Input.declaration'] == row].empty):
	
	print unlabelled
Example #5
0
def cleanExistingData(filename1, filename2):
    """Drop from the second file every declaration already present in
    the first, then print the remainder.

    filename1 -- CSV of already-labelled rows (column 'Input.declaration')
    filename2 -- CSV of candidate rows (column 'declaration'); duplicates
                 are removed before filtering
    """

    labelled = qbPre.readDataFrame(filename1, None, 0)
    labelled = labelled['Input.declaration']
    print labelled

    unlabelled = qbPre.readDataFrame(filename2, None, 0)

    # de-duplicate declarations before comparing against the labelled set
    unlabelled = unlabelled.drop_duplicates(cols=['declaration'])

    # print unlabelled[unlabelled['declaration']==labelled]
    # drop every unlabelled row whose declaration is already labelled
    for row in labelled:
        unlabelled = unlabelled.drop(
            unlabelled[unlabelled['declaration'] == row].index)
        # if not unlabelled[unlabelled['declaration']==row].empty:
        # 	unlabelled.drop
        # if not (unlabelled[unlabelled['Input.declaration'] == row].empty):

    print unlabelled
Example #6
0
def generateSample(fulConSet,m,newFileName,n):
	"""Build a mixed evaluation sample: m rows drawn from the fully
	concorded old feedback set (grown by duplication until it can cover
	m) plus n random rows from the new filtered file, shuffled together
	and written to data/write/newFeedbackSample2.csv.

	fulConSet   -- iterable of declaration strings that were fully concorded
	m           -- number of old-set observations to draw
	newFileName -- CSV path of the new (filtered) dataset
	n           -- number of new-set observations to draw
	"""

	# generate old data sample ================================

	oldSample = pd.DataFrame(columns=('pv_id', 'global_user_id', 'time', 'declaration'));

	# pick full information
	oldData = pickRowDataset()

	# generate and write the fully concorded for later reference to HDD
	tempFulConSet = pd.DataFrame(list(fulConSet),columns = ['declaration'])

	tempFulConSet.to_csv('data/relAnalytics/fulConSet.csv',index = False);

	# find full info of the fully concorded occurrences
	# (only the first matching row per declaration is kept: p[0:1])
	for feedback in fulConSet:
		p = oldData[oldData['declaration'] == feedback];
		oldSample = oldSample.append(p[0:1],ignore_index=True)

	rows = list(oldSample.index)
	# print oldSample
	# until the population > sample size
	# NOTE(review): the loop body below mixes tabs and spaces in its
	# indentation; it parses under Python 2 but should be normalised.
	while len(oldSample) < m:
		# double the population by duplicating 
	 	oldSample = oldSample.append(oldSample,ignore_index=True);
	 	# shuffle the observations
	 	rows = list(oldSample.index);
		random.shuffle(rows);
		oldSample = oldSample.ix[rows]

	print oldSample.declaration
	# pick a random sample of size m
	rows = random.sample(oldSample.index,m)
	
	# random sample generated for old set
	sample = oldSample.ix[rows]

	# generate new data sample =================================
	# load the filtered dataset
	newData = qbPre.readDataFrame(newFileName,None,0);
	# pick a random sample of size
	newRows = random.sample(newData.index,n);

	newSample = newData.ix[newRows];

	# aggregate the old and new samples together
	sample = sample.append(newSample,ignore_index=True);
	print sample.declaration
	rows = list(sample.index);
	# shuffle them
	random.shuffle(rows);
	sample = sample.ix[rows];
	print sample.declaration
	# generate csv file :D
	sample.to_csv('data/write/newFeedbackSample2.csv',index = False);
Example #7
0
def analyseWorkers():
	"""Score each worker's answers against the 'perfect' reference
	dataset and return one row per (worker, answered declaration).

	Result columns: SubmitTime, WorkerId, Input.declaration, Answer.Q1,
	Time (fractional hour of submission), TempScore (per-answer score),
	AggrScore (running mean score), Completion (fraction of the worker's
	records processed so far).
	"""
	filData = qbPre.readDataFrame(qbGbl.filFileName,None,0);

	workers = filData.WorkerId.drop_duplicates();

	filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)

	old = qbPre.readDataFrame(filename,None,0)

	perfectDecs = old.declaration.drop_duplicates()

	dataSet = pd.DataFrame()

	# keep only filtered rows whose declaration appears in the reference set
	for row in perfectDecs:
		if not filData[filData['Input.declaration']==row].empty:
			if dataSet.empty:
				dataSet = filData[filData['Input.declaration']==row]
			else:
				dataSet = dataSet.append(filData[filData['Input.declaration']==row])


	dataSet.SubmitTime = pd.to_datetime(dataSet.SubmitTime)

	# dataSet = pd.DataFrame(dataSet.values,
	# 	columns=['SubmitTime','WorkerId','Input.declaration','Answer.Q1'])

	# print dataSet

	# chronological order so the running aggregate score is meaningful
	dataSet = dataSet.sort(columns=['SubmitTime'])
	# firstDate = list(dataSet.SubmitTime)[0]

	records = [];
	# print dataSet	
	workers = dataSet.WorkerId.drop_duplicates();
	


	for worker in workers:
		# running totals for this worker: summed score and record count
		tempStats = {'score':0.0,'freq':0}
		tempRecords = dataSet[dataSet.WorkerId==worker]
		for row in tempRecords.itertuples():
			newRow = list(row)
			# print newRow[-2]
		 	# print old[old['declaration'] == newRow[-2]]
			# newRow[-2] is Input.declaration, newRow[-1] is Answer.Q1;
			# convClasses splits the '|'-separated class string
			tempOld = qbPre.convClasses(list(old[old['declaration'] == newRow[-2]].answer)[0],'|')
			# print tempOld
			tempNew = qbPre.convClasses(newRow[-1],'|')
			# print tempNew
			# fraction of the worker's topics that match the reference
			tempScore = 0.0;
			for topic in tempNew:
				if topic in tempOld:
					tempScore += 1.0;
			tempScore /= float(len(tempNew))

		 	tempStats['freq']+=1; # frequency ++
			tempStats['score']+=tempScore;
			
			aggrScore = tempStats['score']/tempStats['freq']
			# submission time as fractional hours of the day (row[1] is SubmitTime)
			tm = row[1].time()
			tm = float(tm.hour) + float(tm.minute) / 60
			# print tm

			# fraction of this worker's records processed so far
			compl = float(tempStats['freq'])/float(len(tempRecords))
			# del newRow[0]
			newRow.extend([tm,tempScore,aggrScore,compl])

			records.append(newRow)

	records = numpy.array(records);

	# column 0 of each itertuples row is the original index; reuse it
	dataSet = pd.DataFrame(records[:,1:],
		columns = ['SubmitTime','WorkerId','Input.declaration','Answer.Q1','Time','TempScore','AggrScore','Completion'],
		index=records[:,0])

	return dataSet
Example #8
0
def filterNoneObs(type):
	"""Pick the observations whose answer is exactly 'None' from the
	per-type dataset and write them to the matching none-set CSV."""
	dataset = qbPre.readDataFrame('{0}_{1}.csv'.format(qbGbl.dataSetFileName, type), None, 0)
	noneOnly = dataset[dataset.answer == 'None']
	noneOnly.to_csv('{0}_{1}.csv'.format(qbGbl.noneSetFileName, type), index=False)
Example #9
0
def analyse(filename):
	"""Compare the worker answers in *filename* against the 'perfect'
	reference dataset and report per-declaration accuracy.

	Prints diagnostics, the mean accuracy and the matched-row count,
	then shows a cumulative histogram of the scores.
	"""

	# filData = qbPre.readDataFrame(filename,None,0);
	# filData = filData[['WorkerId','Input.declaration','Answer.Q1']]

	# new = filData

	# filename = '{0}'.format(qbGbl.filFileName) 
	# # filData = pd.DataFrame(columns=['index','worker','declaration','answer'])
	# filData = qbPre.readDataFrame(filename,None,None);
	# filData.columns = ['index','worker','declaration','answer'];

	# del filData['index']

	# old = filData

	# oldDecs = []
	
	# for row in new['Input.declaration']:
	# 	if (old[old['declaration'] == row].empty):
	# 		continue;
	# 	else:
	# 		oldDecs.append(numpy.array(old[old['declaration'] == row])[1])

	# oldDecs = pd.DataFrame(oldDecs,columns=['worker','declaration','answer'])
	
	# oldDecs.to_csv('{0}/PerfectDataset.csv'.format(qbGbl.oriFileName),index = False);
	
	## ===============================================================================
	filData = qbPre.readDataFrame(filename,None,0);
	# filData = filData[['WorkerId','Input.declaration','Answer.Q1']]
	filData = filData[['WorkerId','Input.declaration','Answer.Q1']]

	new = filData

	filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)

	old = qbPre.readDataFrame(filename,None,0)

	# print new['Input.declaration'].nunique()
	# print len(old['declaration'].unique())
	# unique reference declarations to score against
	filData = pd.Series(old['declaration']).drop_duplicates()
	print filData
	# tempSer = pd.Series(new['Input.declaration']).drop_duplicates()
	tempSer = new.drop_duplicates(cols=['Input.declaration'])
			
	print len (new)
	print len(tempSer)

	# print len(filData)
	# print '================='
	accuracy = []
	count=0;
	# print len(new)
	# print len(old)

	# score each reference declaration that also appears in the new data:
	# accuracy = fraction of the first new answer's topics present in the
	# reference answer (convClasses splits the '|'-separated class string)
	for row in filData:
		# print row
		if not (new[new['Input.declaration'] == row].empty):
			if len(new[new['Input.declaration'] == row])>1:
				print new[new['Input.declaration'] == row]
			count += len(new[new['Input.declaration'] == row])
			tempOld = qbPre.convClasses(list(old[old['declaration'] == row]['answer'])[0],'|')
			# print tempOld
			tempNew = qbPre.convClasses(list(new[new['Input.declaration'] == row]['Answer.Q1'])[0],'|')
			# print tempNew
			tempScore = 0.0;
			for topic in tempNew:
				if topic in tempOld:
					tempScore += 1.0;

			tempScore /= float(len(tempNew))
			accuracy.append(tempScore)
	
	print scipy.stats.tmean(accuracy) 

	print count

	# cumulative, normalised score distribution
	P.figure();

	n, bins, patches = P.hist(accuracy,len(set(accuracy)), histtype='step',cumulative=True,normed=1)

	P.title("Score distribution")
	P.xlabel("score")
	P.ylabel("Frequency")
	P.show()
Example #10
0
def filterNoneObs(type):
    """Pick the observations whose answer is exactly 'None' from the
    per-type dataset and write them to the matching none-set CSV."""
    filData = qbPre.readDataFrame(
        '{0}_{1}.csv'.format(qbGbl.dataSetFileName, type), None, 0)
    noneData = filData[filData.answer == 'None']
    noneData.to_csv('{0}_{1}.csv'.format(qbGbl.noneSetFileName, type),
                    index=False)
Example #11
0
def analyse(filename):
    """Compare the worker answers in *filename* against the 'perfect'
    reference dataset and report per-declaration accuracy.

    Prints diagnostics, the mean accuracy and the matched-row count,
    then shows a cumulative histogram of the scores.
    """

    # filData = qbPre.readDataFrame(filename,None,0);
    # filData = filData[['WorkerId','Input.declaration','Answer.Q1']]

    # new = filData

    # filename = '{0}'.format(qbGbl.filFileName)
    # # filData = pd.DataFrame(columns=['index','worker','declaration','answer'])
    # filData = qbPre.readDataFrame(filename,None,None);
    # filData.columns = ['index','worker','declaration','answer'];

    # del filData['index']

    # old = filData

    # oldDecs = []

    # for row in new['Input.declaration']:
    # 	if (old[old['declaration'] == row].empty):
    # 		continue;
    # 	else:
    # 		oldDecs.append(numpy.array(old[old['declaration'] == row])[1])

    # oldDecs = pd.DataFrame(oldDecs,columns=['worker','declaration','answer'])

    # oldDecs.to_csv('{0}/PerfectDataset.csv'.format(qbGbl.oriFileName),index = False);

    ## ===============================================================================
    filData = qbPre.readDataFrame(filename, None, 0)
    # filData = filData[['WorkerId','Input.declaration','Answer.Q1']]
    filData = filData[['WorkerId', 'Input.declaration', 'Answer.Q1']]

    new = filData

    filename = '{0}/PerfectDataset.csv'.format(qbGbl.oriFileName)

    old = qbPre.readDataFrame(filename, None, 0)

    # print new['Input.declaration'].nunique()
    # print len(old['declaration'].unique())
    # unique reference declarations to score against
    filData = pd.Series(old['declaration']).drop_duplicates()
    print filData
    # tempSer = pd.Series(new['Input.declaration']).drop_duplicates()
    tempSer = new.drop_duplicates(cols=['Input.declaration'])

    print len(new)
    print len(tempSer)

    # print len(filData)
    # print '================='
    accuracy = []
    count = 0
    # print len(new)
    # print len(old)

    # score each reference declaration that also appears in the new data:
    # accuracy = fraction of the first new answer's topics present in the
    # reference answer (convClasses splits the '|'-separated class string)
    for row in filData:
        # print row
        if not (new[new['Input.declaration'] == row].empty):
            if len(new[new['Input.declaration'] == row]) > 1:
                print new[new['Input.declaration'] == row]
            count += len(new[new['Input.declaration'] == row])
            tempOld = qbPre.convClasses(
                list(old[old['declaration'] == row]['answer'])[0], '|')
            # print tempOld
            tempNew = qbPre.convClasses(
                list(new[new['Input.declaration'] == row]['Answer.Q1'])[0],
                '|')
            # print tempNew
            tempScore = 0.0
            for topic in tempNew:
                if topic in tempOld:
                    tempScore += 1.0

            tempScore /= float(len(tempNew))
            accuracy.append(tempScore)

    print scipy.stats.tmean(accuracy)

    print count

    # cumulative, normalised score distribution
    P.figure()

    n, bins, patches = P.hist(accuracy,
                              len(set(accuracy)),
                              histtype='step',
                              cumulative=True,
                              normed=1)

    P.title("Score distribution")
    P.xlabel("score")
    P.ylabel("Frequency")
    P.show()