def __init__(self, testSize, category, type=1):
    """Initialise a limit-based similarity experiment.

    INPUT:
        testSize -- % of model size (nr of documents in model) to test with
        category -- category identifier; its max depth is looked up in the DB
        type     -- 1 = full data, 2 = training data
    """
    # Pick grouping strategies and per-model document limits for this mode.
    if type == 1:
        groups = ["CATID", "FATHERID", "GENERAL"]
        limits = [1000, 2500, 5000, 7500, 10000, 20000]
    elif type == 2:
        # NOTE(review): "GENERAL" appears twice in this list -- confirm intent.
        groups = ["GENERAL", "FATHERID", "GENERAL"]
        limits = [1000]
    else:
        sys.exit("Wrong 'type' parameter in createData.__init__")
    self.GROUPTYPE = groups
    self.limitList = limits
    print("SimilarityLevel created")
    # Sheva collaborator objects (DB access, text preprocessing, similarity,
    # CSV I/O, classification metrics, filesystem utilities).
    self.shevaDB = ShevaDB()
    self.shevaTPF = ShevaTPF()
    self.shevaSimilarity = ShevaSimilarity()
    self.shevaCSV = ShevaCSV()
    self.shevaClassificationMetrics = ShevaClassificationMetrics()
    self.shevaUtils = ShevaUtils()
    # Experiment settings.
    self.rootDir = "LimitModels/"
    self.testSize = testSize
    self.category = category
    self.maxDepth = self.shevaDB.getCategorymaxDepth(self.category)
def __init__(self, testSize):
    """Initialise a level-based similarity experiment.

    INPUT:
        testSize -- % of model size (nr of documents in model) to test with
    """
    # NOTE(review): a stale copy of the limit-selection logic (GROUPTYPE /
    # percentageList branching on a nonexistent 'type' parameter) used to sit
    # here as a no-op triple-quoted string literal; it was dead code and has
    # been removed.
    print("SimilarityLevel created")
    # Sheva collaborator objects (DB access, text preprocessing, similarity,
    # CSV I/O, classification metrics, filesystem utilities).
    self.shevaDB = ShevaDB()
    self.shevaTPF = ShevaTPF()
    self.shevaSimilarity = ShevaSimilarity()
    self.shevaCSV = ShevaCSV()
    self.shevaClassificationMetrics = ShevaClassificationMetrics()
    self.shevaUtils = ShevaUtils()
    # SimilarityLevel variables: root folder holding the level models.
    self.rootDir = "LevelModels/"
    self.testSize = testSize
class SimilarityLimit:
    """Similarity experiments that cap the number of documents ("limit")
    used to build each model.

    For every (limit, grouping type) pair, the driver loads the model's
    row->originalID mapping CSV, obtains a similarity index / tf-idf model /
    dictionary, samples test documents from the database, scores them
    against the index, and stores three precision/recall/F1 variants in the
    analysis_results_limit table.
    """

    ##@profile w  -- disabled profiler decorator (trailing "w" looks accidental)
    def __init__(self, testSize, category, type=1):
        """Set up one limit experiment.

        INPUT:
            type     -- 1 = full data, 2 = training data
            testSize -- % of model size (nr of documents in model) to test with
            category -- category identifier; its max depth is read from the DB
        """
        #percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"]
            self.limitList = [1000, 2500, 5000, 7500, 10000, 20000]
        elif type == 2:
            # NOTE(review): "GENERAL" appears twice in this list -- possibly a
            # copy/paste slip (compare the type == 1 list); confirm intent.
            self.GROUPTYPE = ["GENERAL", "FATHERID", "GENERAL"]
            self.limitList = [1000]
        else:
            # NOTE(review): message names createData.__init__, not this class.
            sys.exit("Wrong 'type' parameter in createData.__init__")
        # NOTE(review): says "SimilarityLevel" but this class is SimilarityLimit.
        print "SimilarityLevel created"
        #Sheva Objects (DB access, text preprocessing, similarity, CSV I/O,
        #classification metrics, filesystem utilities)
        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaSimilarity = ShevaSimilarity()
        self.shevaCSV = ShevaCSV()
        self.shevaClassificationMetrics = ShevaClassificationMetrics()
        self.shevaUtils = ShevaUtils()
        #SimilarityLevel Variables
        self.rootDir = "LimitModels/"  # root folder for the per-limit models
        self.testSize = testSize
        self.category = category
        self.maxDepth = self.shevaDB.getCategorymaxDepth(self.category)

    def __del__(self):
        # NOTE(review): message says "SimilarityLevel"; class is SimilarityLimit.
        print 'SimilarityLevel destroyed'

    #@profile
    def calculateLimitSimilarity(self):
        """Score test documents against every (limit, group) model and
        persist precision/recall/F1 rows into analysis_results_limit.

        Per (limit, group) iteration:
          1. read the modelRow -> originalID mapping CSV,
          2. obtain similarity index, tf-idf model and dictionary,
          3. sample test documents from the DB,
          4. compute similarities with a 0.1 threshold,
          5. insert classic / relative / exclusive metric variants,
          6. aggressively release the large objects before the next round.
        """
        for limit in self.limitList:
            for group in self.GROUPTYPE:
                print "####################################################################"
                #print category, group, percentage, debth
                # NOTE(review): sim is assigned here but never used below.
                sim = []
                vec_bow = []
                allCategoryDataOID = []
                categoryDataOID = []
                categoryData = []
                print "created variables"
                #path & csv file, e.g. LimitModels/<group>/<limit>/
                path = "%s%s/%s/" %(self.rootDir,group,limit)
                fileName = "%s_%s" %(limit,self.category)
                IODfilePath = "%soriginalID/%s.csv" %(path,fileName)
                print "Setup paths"
                #get data from original ID csv; unique ID
                allCategoryDataOID = self.shevaCSV.getModelCSV(IODfilePath)
                #categoryDataOID = self.shevaCSV.getIDfromModel(IODfilePath)
                print "Got all modelRow->originalID mappings"
                #get sim index, model, dict (ensure the index dir exists first)
                indexDir = "%sindexFiles/" %(path)
                self.shevaUtils.createDirOne(indexDir)
                index, tfidfModel, dictionary, corpusSize = self.shevaSimilarity.getSimilarityIndex(path, fileName, group)
                #return sample from original data
                categoryDataOID, categoryData = self.shevaDB.getSample(limit,self.testSize,self.category,self.maxDepth, group)
                #calculate similarites: clean text -> bag-of-words -> VSM vectors
                cleanText = self.shevaTPF.returnClean(categoryData, 1)
                cleanTextBoW = [dictionary.doc2bow(cleanText[i]) for i in range(0, len(cleanText))]
                print "Done with bow representation"
                vec_bow = self.shevaSimilarity.convert2VSM(cleanTextBoW, tfidfModel)
                print len(vec_bow)
                # 0.1 = similarity cutoff -- presumably docs scoring below it
                # are not counted as matches; confirm in calculateSimilarity.
                simCalculation = self.shevaSimilarity.calculateSimilarity(index, vec_bow, 0.1)
                #calcualte IR measures
                # NOTE(review): the SQL below is built via % interpolation --
                # injection-prone if category/group ever come from untrusted
                # input; use parameterized queries if dbQuery supports them.
                cPrecision, cRecall, cF1 = self.shevaClassificationMetrics.computeClassificationMetrics(categoryDataOID, allCategoryDataOID, simCalculation)
                print "All data measures :\t\t\t\tPrecision:\t", cPrecision, "\t\tRecall\t", cRecall, "\t\tF1:\t", cF1
                sqlClassic = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth, testSize, measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category,group,limit,self.maxDepth,self.testSize, "computeClassificationMetrics",cPrecision, cRecall, cF1)
                self.shevaDB.dbQuery(sqlClassic)
                cPrecisionR, cRecallR, cF1R = self.shevaClassificationMetrics.computeClassificationMetricsRelative(categoryDataOID, allCategoryDataOID, simCalculation)
                print "Relative (with or) data measures :\t\tPrecision:\t", cPrecisionR, "\t\tRecall\t", cRecallR, "\t\tF1:\t", cF1R
                sqlRelative = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth, testSize,measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category,group,limit,self.maxDepth,self.testSize, "computeClassificationMetricsRelative",cPrecisionR, cRecallR, cF1R)
                self.shevaDB.dbQuery(sqlRelative)
                cPrecisionE, cRecallE, cF1E = self.shevaClassificationMetrics.computeClassificationMetricsExclusive(categoryDataOID, allCategoryDataOID, simCalculation)
                print "Exclusive (with and) data measures :\t\tPrecision:\t", cPrecisionE, "\t\tRecall\t", cRecallE, "\t\tF1:\t", cF1E
                sqlExclusive = "INSERT INTO analysis_results_limit (category, groupingType, limitValue, levelDepth,testSize, measureType, P, R, F1) VALUES ('%s', '%s', '%s', '%i', '%i', '%s','%f','%f','%f')" % (self.category,group,limit,self.maxDepth,self.testSize, "computeClassificationMetricsExclusive",cPrecisionE, cRecallE, cF1E)
                self.shevaDB.dbQuery(sqlExclusive)
                #trying to figure out the memory thing. needs speed-up in performance otherwise...
                # (rebind to empty lists, del, then gc.collect() to coax
                # Python into releasing the large index/model objects)
                dbData = []
                simCalculation = []
                cleanText = []
                cleanTextBoW = []
                vec_bow = []
                del index
                del tfidfModel
                del dictionary
                del corpusSize
                del simCalculation
                del vec_bow
                del allCategoryDataOID
                del categoryDataOID
                del categoryData
                del dbData
                gc.collect()