def __init__(self, type, rootDir):
    """
    Configure the run mode, create the helper objects and register rootDir.

    type:       1 -> full data
                2 -> limited data
    rootDir:    root directory handed to ShevaUtils.createDirOne and kept
                (as a string) in self.rootDir; must not be ""

    Terminates through sys.exit() when type is neither 1 nor 2, or when
    rootDir is the empty string.
    """
    # reject unknown modes before any helper object is created
    if type != 1 and type != 2:
        sys.exit("Wrong 'type' parameter in createData.__init__")

    # both modes work on the same single group, only the value list differs
    self.GROUPTYPE = ["GENERAL"]
    # values stepped through for the model build (the attribute keeps the
    # "percentage" name used across the file)
    if type == 1:
        self.percentageList = [1000, 2500, 5000, 7500, 10000, 20000]
    else:
        self.percentageList = [1000]

    # helper objects are created before rootDir is validated
    self.shevaDB = ShevaDB()
    self.shevaTPF = ShevaTPF()
    self.shevaUtils = ShevaUtils()
    self.shevaVect = ShevaVect()

    if rootDir == "":
        sys.exit("Wrong 'rootDir' parameter in createData.__init__")

    rootPath = str(rootDir)
    # presumably creates the directory if needed -- confirm in ShevaUtils
    self.shevaUtils.createDirOne(rootPath)
    self.rootDir = rootPath
class createDataSingleLimit:
    """
    Build one single-document corpus/vector model per row limit of a category.

    For every group in GROUPTYPE the rows of the category are queried once.
    For every value in percentageList the first that-many rows are split
    into words, passed through ShevaTPF.returnClean, flattened into one
    document and handed to ShevaVect.createCorpusAndVectorModel.
    """

    def __init__(self, type, rootDir):
        """
        type:       1 -> full data
                    2 -> limited data
        rootDir:    root directory for the generated output, must not be ""

        Terminates through sys.exit() on an unknown type or an empty rootDir.
        """
        # row limits for the model build; the attribute keeps its historical
        # "percentage" name although createData slices rows with the values
        if type == 1:
            self.percentageList = [1000, 2500, 5000, 7500, 10000, 20000]
        elif type == 2:
            self.percentageList = [1000]
        else:
            sys.exit("Wrong 'type' parameter in createDataSingleLimit.__init__")
        self.GROUPTYPE = ["GENERAL"]

        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()

        if rootDir == "":
            sys.exit("Wrong 'rootDir' parameter in createDataSingleLimit.__init__")
        self.shevaUtils.createDirOne(str(rootDir))
        self.rootDir = str(rootDir)

    def createData(self, category):
        """
        Create the single-document models of one main category.

        1. per group, query (Description, id) rows of the category
           (id is fatherid for the FATHERID group, catid otherwise)
        2. per limit in percentageList, keep the first `limit` rows
        3. turn the rows into word lists (see _buildDocuments)
        4. flatten the cleaned word lists into one document and call
           createCorpusAndVectorModel with file name "<limit>_<category>"
           and path "<rootDir>/<group>/<limit>/"

        category:   value matched against dmoz_combined.mainCategory

        Terminates through sys.exit() when dbQuery reports failure (the
        code treats a return value of 0 as the failure marker).
        """
        for group in self.GROUPTYPE:
            # grouping dependent query: only the id column differs
            idColumn = "fatherid" if group == "FATHERID" else "catid"
            # NOTE(review): category is interpolated into the SQL text; if it
            # can come from untrusted input this should become a parameterised
            # query -- dbQuery's signature is not visible from here.
            sqlCategory = "select Description, %s from dmoz_combined where mainCategory = '%s' limit 20000" % (idColumn, category)
            sqlQueryResults = self.shevaDB.dbQuery(sqlCategory)
            if sqlQueryResults == 0:
                # sys.exit accepts a single argument, so the context is
                # formatted into one message
                sys.exit("SQL code error in level: \t%s\t%s" % (category, sqlCategory))

            for percentageItem in self.percentageList:
                limitedRows = sqlQueryResults[:percentageItem]
                path = "%s/%s/%s/" % (self.rootDir, group, percentageItem)
                self.shevaUtils.createDir(self.rootDir, group, percentageItem)

                documents = self._buildDocuments(group, limitedRows)
                cleanedDocuments = self.shevaTPF.returnClean(documents, 1)

                # single model for all documents: one flat word list
                singleDocument = [word for document in cleanedDocuments for word in document]
                dataCategorySingleAll = [singleDocument]
                fileNameSingleAll = "%s_%s" % (percentageItem, category)

                # One pre-formatted string per print so the line parses the
                # same as a Python 2 statement and a Python 3 call; spacing
                # mirrors the output of the former comma-separated prints.
                print("%s %s" % (len(dataCategorySingleAll), len(singleDocument)))
                self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll, path)
                print("Done with:\t%s \t%s \t\t%s" % (group, category, percentageItem))

    def _buildDocuments(self, group, rows):
        """
        Return one word list per document for the given (Description, id) rows.

        GENERAL:    every row is its own document
        otherwise:  rows sharing an id are merged into one document, ids
                    kept in order of first appearance
        """
        if group == "GENERAL":
            return [row[0].split() for row in rows]

        idOrder = []
        wordsById = {}
        for row in rows:
            rowId = row[1]
            if rowId not in wordsById:
                wordsById[rowId] = []
                idOrder.append(rowId)
            wordsById[rowId].extend(row[0].split())
        return [wordsById[rowId] for rowId in idOrder]
class createDataSingleLevel:
    """
    Build single-document corpus/vector models per category depth level.

    For every group in GROUPTYPE the rows of the category are queried once.
    For every value in percentageList and every level returned by
    ShevaDB.getCategoryDepth, the rows of that level are limited through
    ShevaUtils.setLimit, split into words, passed through
    ShevaTPF.returnClean and accumulated into one document that is handed
    to ShevaVect.createCorpusAndVectorModel.
    """

    def __init__(self, type, rootDir):
        """
        type:       1 -> full data
                    2 -> limited data
        rootDir:    root directory for the generated output, must not be ""

        Terminates through sys.exit() on an unknown type or an empty rootDir.
        """
        # percentage of data to be used for model build
        if type == 1:
            self.GROUPTYPE = ["CATID", "FATHERID", "GENERAL"]
            self.percentageList = [25, 50, 75, 100]
        elif type == 2:
            self.GROUPTYPE = ["GENERAL"]
            self.percentageList = [5]
        else:
            sys.exit("Wrong 'type' parameter in createDataSingleLevel.__init__")

        self.shevaDB = ShevaDB()
        self.shevaTPF = ShevaTPF()
        self.shevaUtils = ShevaUtils()
        self.shevaVect = ShevaVect()
        self.shevaCSV = ShevaCSV()

        if rootDir == "":
            sys.exit("Wrong 'rootDir' parameter in createDataSingleLevel.__init__")
        self.shevaUtils.createDirOne(str(rootDir))
        self.rootDir = str(rootDir)

    def createData(self, category):
        """
        Create the per-level single-document models of one main category.

        1. get the depth levels of the category (getCategoryDepth)
        2. per group, query (Description, id, categoryDepth) rows
           (id is fatherid for the FATHERID group, catid otherwise)
        3. per value in percentageList and per level, build the word lists
           of the level (see _buildLevelDocuments) and clean them
        4. extend the single document and call createCorpusAndVectorModel
           with file name "<percentage>_<category>_<level>" and path
           "<rootDir>/<group>/<percentage>/"

        category:   value matched against dmoz_combined.mainCategory

        Terminates through sys.exit() when dbQuery reports failure (the
        code treats a return value of 0 as the failure marker).
        """
        ranger = self.shevaDB.getCategoryDepth(category)

        for group in self.GROUPTYPE:
            # grouping dependent query: only the id column differs
            idColumn = "fatherid" if group == "FATHERID" else "catid"
            # NOTE(review): category is interpolated into the SQL text; if it
            # can come from untrusted input this should become a parameterised
            # query -- dbQuery's signature is not visible from here.
            sqlCategory = "select Description, %s, categoryDepth from dmoz_combined where mainCategory = '%s'" % (idColumn, category)
            sqlQueryResults = self.shevaDB.dbQuery(sqlCategory)
            if sqlQueryResults == 0:
                # sys.exit accepts a single argument, so the context is
                # formatted into one message
                sys.exit("SQL code error in level: \t%s\t%s" % (category, sqlCategory))

            # The rows of a level do not depend on the percentage, so the
            # split by level is done once per group.
            rowsByLevel = [
                (indeks, [row for row in sqlQueryResults if row[2] == indeks])
                for indeks in ranger
            ]

            for percentageItem in self.percentageList:
                path = "%s/%s/%s/" % (self.rootDir, group, percentageItem)
                self.shevaUtils.createDir(self.rootDir, group, percentageItem)

                # flat word list of all levels processed so far
                wordsSoFar = []
                # single model for all documents; the same list object is
                # passed on every level and keeps growing
                dataCategorySingleAll = [[]]

                for indeks, levelRows in rowsByLevel:
                    documents = self._buildLevelDocuments(group, percentageItem, levelRows)
                    cleanedDocuments = self.shevaTPF.returnClean(documents, 1)
                    wordsSoFar.extend(word for document in cleanedDocuments for word in document)

                    # NOTE(review): the whole cumulative word list is appended
                    # on every level, so words of earlier levels are repeated
                    # in the single document (level 1 appears once per level
                    # processed). Kept exactly as before -- confirm whether
                    # this weighting is intended.
                    dataCategorySingleAll[0].extend(wordsSoFar)

                    fileNameSingleAll = "%s_%s_%s" % (percentageItem, category, indeks)
                    self.shevaVect.createCorpusAndVectorModel(dataCategorySingleAll, fileNameSingleAll, path)

                    # One pre-formatted string so the line parses the same as
                    # a Python 2 statement and a Python 3 call; spacing mirrors
                    # the output of the former comma-separated print.
                    print("Done with:\t%s \t%s \t%s \t%s" % (group, category, indeks, percentageItem))

                    # release the per-level buffers before the next level
                    del documents
                    del cleanedDocuments
                    gc.collect()

                del wordsSoFar
                del dataCategorySingleAll
                gc.collect()

            # the query result is only dropped once every percentage is done
            del rowsByLevel
            del sqlQueryResults
            gc.collect()

    def _buildLevelDocuments(self, group, percentageItem, levelRows):
        """
        Return one word list per document for the rows of a single level.

        GENERAL:    setLimit is asked once for the level; each of the first
                    `limit` rows is its own document
        otherwise:  rows are grouped by id (order of first appearance);
                    setLimit is asked per id with that id's descriptions and
                    the first `limit` descriptions are merged into one
                    document
        """
        if group == "GENERAL":
            percentageLevel = self.shevaUtils.setLimit(percentageItem, levelRows)
            return [row[0].split() for row in levelRows[:percentageLevel]]

        idOrder = []
        descriptionsById = {}
        for row in levelRows:
            rowId = row[1]
            if rowId not in descriptionsById:
                descriptionsById[rowId] = []
                idOrder.append(rowId)
            descriptionsById[rowId].append(row[0])

        documents = []
        for rowId in idOrder:
            descriptions = descriptionsById[rowId]
            percentageLevel = self.shevaUtils.setLimit(percentageItem, descriptions)
            wordLists = (text.split() for text in descriptions[:percentageLevel])
            documents.append(list(itertools.chain.from_iterable(wordLists)))
        return documents