Code example #1
0
File: main.py  Project: sahanbull/QubitProject
def filterData2(cols=None):
	"""Filter the raw MTurk log files down to *cols* and save them.

	Reads every original log CSV listed under ``qbGbl.oriFileName``,
	keeps only the requested columns, and writes the result to
	``qbGbl.filFileName`` as UTF-8 CSV without the index.

	Args:
		cols: Column names to keep. Defaults to the full MTurk
			result-file schema plus the project's Input/Answer fields.
	"""
	# Build the default inside the body: a list literal in the signature
	# would be a shared mutable default argument.
	if cols is None:
		cols = ['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords',
			'Reward', 'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
			'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
			'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
			'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
			'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
			'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
			'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.pv_id',
			'Input.global_user_id', 'Input.time', 'Input.declaration',
			'Answer.Q1', 'Approve', 'Reject']

	paths = qbPre.listFiles(qbGbl.oriFileName)
	filData = qbPre.readFiles(paths)

	filData = filData[cols]

	filData.to_csv(qbGbl.filFileName, index=False, encoding='utf-8')
Code example #2
0
File: main.py  Project: sahanbull/QubitProject
def getReliableData(cols=None):
    """Build the final reader file from reliable worker judgements.

    Keeps the entries of workers judged reliable (the "silver set"),
    removes those declarations from the full dataset, drops remaining
    bad observations, and writes the combined
    ``WorkerId / Input.declaration / Answer.Q1`` table to
    ``qbGbl.finalReaderFile`` (no header row).

    Args:
        cols: Column names to read from the original logs. Defaults to
            the full MTurk result-file schema.
    """
    # Build the default inside the body: a list literal in the signature
    # would be a shared mutable default argument.
    if cols is None:
        cols = [
            'HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
            'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
            'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
            'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
            'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
            'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
            'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
            'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.pv_id',
            'Input.global_user_id', 'Input.time', 'Input.declaration',
            'Answer.Q1', 'Approve', 'Reject',
        ]

    dataSet = qbRel.analyseWorkers()
    badEntries = qbRel.pickBadEntries(dataSet)

    # Drop entries from workers flagged as unreliable.
    dataSet = dataSet.drop(badEntries.index)

    dataSet = dataSet[['WorkerId', 'Input.declaration', 'Answer.Q1']]
    # Declarations already verified via reliable workers ("silver set").
    silverSet = dataSet['Input.declaration']

    # Load the full dataset from the original log files.
    paths = qbPre.listFiles(qbGbl.oriFileName)
    filData = qbPre.readFiles(paths)

    filData = filData[cols]

    # range (not Python-2-only xrange) keeps this Python-3 compatible.
    filData.index = range(len(filData))

    # Remove the observations already covered by the silver set in one
    # vectorised pass instead of dropping rows per declaration (O(n*m)).
    filData = filData[~filData['Input.declaration'].isin(set(silverSet))]

    badEntries = qbRel.pickBadObs(filData)

    filData = filData.drop(badEntries.index)

    # DataFrame.append was removed in pandas 2.0; concat is equivalent.
    dataSet = pd.concat([dataSet, filData])
    dataSet = dataSet[['WorkerId', 'Input.declaration', 'Answer.Q1']]
    dataSet.index = range(len(dataSet))

    dataSet.to_csv(qbGbl.finalReaderFile, header=False)
Code example #3
0
File: main.py  Project: sahanbull/QubitProject
def filterData():
	"""Filter every original CSV log file and persist the filtered rows.

	Walks all log files listed under ``qbGbl.oriFileName``, filters each
	one via ``qbPre.filterFile`` (threading a running record counter
	through the calls), and writes the accumulated rows to
	``qbGbl.filFileName``.
	"""
	rows = []   # accumulated filtered records
	count = 0   # running counter threaded through filterFile

	# Enumerate all the original csv log files.
	paths = qbPre.listFiles(qbGbl.oriFileName)

	for path in paths:
		result = qbPre.filterFile(path, count)
		rows.extend(result[0])   # collect this file's filtered rows
		count = result[1]        # carry the updated counter forward

	# Persist the filtered dataset to disk.
	qbPre.writeFilCSV(qbGbl.filFileName, rows)
Code example #4
0
File: main.py  Project: sahanbull/QubitProject
def getReliableData(cols=None):
	"""Build the final reader file from reliable worker judgements.

	Keeps the entries of workers judged reliable (the "silver set"),
	removes those declarations from the full dataset, drops remaining
	bad observations, and writes the combined
	``WorkerId / Input.declaration / Answer.Q1`` table to
	``qbGbl.finalReaderFile`` (no header row).

	Args:
		cols: Column names to read from the original logs. Defaults to
			the full MTurk result-file schema.
	"""
	# Build the default inside the body: a list literal in the signature
	# would be a shared mutable default argument.
	if cols is None:
		cols = ['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords',
			'Reward', 'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
			'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
			'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
			'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
			'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
			'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
			'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.pv_id',
			'Input.global_user_id', 'Input.time', 'Input.declaration',
			'Answer.Q1', 'Approve', 'Reject']

	dataSet = qbRel.analyseWorkers()
	badEntries = qbRel.pickBadEntries(dataSet)

	# Drop entries from workers flagged as unreliable.
	dataSet = dataSet.drop(badEntries.index)

	dataSet = dataSet[['WorkerId', 'Input.declaration', 'Answer.Q1']]
	# Declarations already verified via reliable workers ("silver set").
	silverSet = dataSet['Input.declaration']

	# Load the full dataset from the original log files.
	paths = qbPre.listFiles(qbGbl.oriFileName)
	filData = qbPre.readFiles(paths)

	filData = filData[cols]

	# range (not Python-2-only xrange) keeps this Python-3 compatible.
	filData.index = range(len(filData))

	# Remove the observations already covered by the silver set in one
	# vectorised pass instead of dropping rows per declaration (O(n*m)).
	filData = filData[~filData['Input.declaration'].isin(set(silverSet))]

	badEntries = qbRel.pickBadObs(filData)

	filData = filData.drop(badEntries.index)

	# DataFrame.append was removed in pandas 2.0; concat is equivalent.
	dataSet = pd.concat([dataSet, filData])
	dataSet = dataSet[['WorkerId', 'Input.declaration', 'Answer.Q1']]
	dataSet.index = range(len(dataSet))

	dataSet.to_csv(qbGbl.finalReaderFile, header=False)
Code example #5
0
File: main.py  Project: sahanbull/QubitProject
def filterData2(cols=None):
    """Filter the raw MTurk log files down to *cols* and save them.

    Reads every original log CSV listed under ``qbGbl.oriFileName``,
    keeps only the requested columns, and writes the result to
    ``qbGbl.filFileName`` as UTF-8 CSV without the index.

    Args:
        cols: Column names to keep. Defaults to the full MTurk
            result-file schema plus the project's Input/Answer fields.
    """
    # Build the default inside the body: a list literal in the signature
    # would be a shared mutable default argument.
    if cols is None:
        cols = [
            'HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
            'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
            'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
            'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
            'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
            'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
            'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
            'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.pv_id',
            'Input.global_user_id', 'Input.time', 'Input.declaration',
            'Answer.Q1', 'Approve', 'Reject',
        ]

    paths = qbPre.listFiles(qbGbl.oriFileName)
    filData = qbPre.readFiles(paths)

    filData = filData[cols]

    filData.to_csv(qbGbl.filFileName, index=False, encoding='utf-8')
Code example #6
0
File: main.py  Project: sahanbull/QubitProject
def filterData():
    """Filter every original CSV log file and persist the filtered rows.

    Walks all log files listed under ``qbGbl.oriFileName``, filters each
    one via ``qbPre.filterFile`` (threading a running record counter
    through the calls), and writes the accumulated rows to
    ``qbGbl.filFileName``.
    """
    rows = []   # accumulated filtered records
    count = 0   # running counter threaded through filterFile

    # Enumerate all the original csv log files.
    paths = qbPre.listFiles(qbGbl.oriFileName)

    for path in paths:
        result = qbPre.filterFile(path, count)
        rows.extend(result[0])   # collect this file's filtered rows
        count = result[1]        # carry the updated counter forward

    # Persist the filtered dataset to disk.
    qbPre.writeFilCSV(qbGbl.filFileName, rows)