def __init__(self, dataLines): if not len(dataLines) > 0: print '(!) dataLines list is empty' self.sizes = Sizes(dataLines) self.dataLines = [] self.features = []
def main(): # preprocess() io = IO() dataLines = io.ReadFile(preprocessOutFile) sizes = Sizes(dataLines) bounds = sizes.getSizesBounds() for bound in bounds: print bound, bounds[bound]
def preprocess():
    # Run the full cleanup pipeline over the raw sizes file and write the
    # fixed lines (with a header) to preprocessOutFile.
    io = IO()
    # Each stage returns a fresh list of data lines that seeds the next
    # Sizes object, exactly as the original chained objects did.
    lines = io.ReadFile(inSizesFile)
    lines = Sizes(lines).doUpperCase()              # upper-case all columns except url and size
    lines = Sizes(lines).changeURLs()               # sizes-source urls -> brand urls
    lines = Sizes(lines).mergeSynonymousSizeTypes() # collapse synonymous sizeTypes
    fixed = Sizes(lines).getFixedLines()
    io.WriteSizesHeader(preprocessOutFile)
    io.WriteFile(preprocessOutFile, fixed, 'a')
def main():
    # One-shot run: preprocess the raw size data, construct the size catalog,
    # generate virtual people, and build/bulk-load the database.
    #
    # Every preprocessing stage writes its intermediate output to its own file
    # so each step can be inspected (or re-run) independently.
    #
    # NOTE(review): this function used to carry a large bank of commented-out
    # usage examples for the individual Sizes/IO/Generator operations; they
    # duplicated the live code below and were removed as dead code.
    inSizesFile = "sizes.txt"
    upperOutFileName = "upperCaseSizes.txt"
    changedUrlsOutFileName = "changedUrlsSizes.txt"
    mergedOutFileName = "mergedSynonymousSizeTypes_sizes.txt"
    fixedSizesOutFile = "fixedSizes.txt"
    outPeopleFile = "generatedPeople.txt"
    sizeCatalogFile = "sizeCatalog.txt"

    # load file as list of string datalines
    io = IO()
    sizesDataLines = io.ReadFile(inSizesFile)
    sizes = Sizes(sizesDataLines)

    ### starts preprocessing ###

    # 1. make all column data upperCase (exceptions are the url and size
    #    columns), and write the result to file
    upperDataLines = sizes.doUpperCase()
    io.WriteSizesHeader(upperOutFileName)
    io.WriteFile(upperOutFileName, upperDataLines, 'a')

    # 2. change brands' URLs (from sizes source url to the brand url),
    #    and write the result to file
    upperSizes = Sizes(upperDataLines)
    changedURLsDataLines = upperSizes.changeURLs()
    io.WriteSizesHeader(changedUrlsOutFileName)
    io.WriteFile(changedUrlsOutFileName, changedURLsDataLines, 'a')

    # 3. merge synonymous sizeTypes, and write the result to file
    changedUrlsSizes = Sizes(changedURLsDataLines)
    mergedSynonymousSizeTypesLines = changedUrlsSizes.mergeSynonymousSizeTypes()
    io.WriteSizesHeader(mergedOutFileName)
    io.WriteFile(mergedOutFileName, mergedSynonymousSizeTypesLines, 'a')

    # 4. get fixed lines as list of string datalines, and write them to file
    mergedSizes = Sizes(mergedSynonymousSizeTypesLines)
    fixedLines = mergedSizes.getFixedLines()
    io.WriteSizesHeader(fixedSizesOutFile)
    io.WriteFile(fixedSizesOutFile, fixedLines, 'a')

    ### ends preprocessing ###

    # construct the size catalog from the fixed lines, and write it to file
    fixedSizes = Sizes(fixedLines)
    sizeTypesProjections = fixedSizes.getSizeTypesProjections()
    sizeCatalog = fixedSizes.constructSizeCatalog(sizeTypesProjections)
    io.WriteSizeCatalogHeader(sizeCatalogFile)
    io.WriteFile(sizeCatalogFile, sizeCatalog, 'a')

    # generate virtual people, and write them to file
    generator = Generator(fixedSizesOutFile)
    generator.GeneratePeople(10, outPeopleFile)

    # create a dbHelper object, connect to database, and construct the db schema
    dbHelper = DBHelper("betaDB", fixedSizesOutFile, outPeopleFile, sizeCatalogFile)
    dbHelper.constructDbSchema()
    # single-arg parenthesized print behaves identically on Python 2 and 3
    print('all ok')
def preprocess():
    # Preprocess the raw sizes file end to end and persist only the final
    # fixed lines (preceded by the standard sizes header).
    io = IO()
    rawLines = io.ReadFile(inSizesFile)

    # Stage 1: upper-case every column except the url and size columns.
    upperLines = Sizes(rawLines).doUpperCase()
    # Stage 2: swap each brand's sizes-source url for the brand url.
    urlLines = Sizes(upperLines).changeURLs()
    # Stage 3: merge synonymous sizeTypes.
    mergedLines = Sizes(urlLines).mergeSynonymousSizeTypes()
    # Stage 4: produce the fixed data lines.
    fixedLines = Sizes(mergedLines).getFixedLines()

    io.WriteSizesHeader(preprocessOutFile)
    io.WriteFile(preprocessOutFile, fixedLines, 'a')
def main():
    # From unprocessed sizes data to database, as a one-shot run:
    #   preprocessing -> size catalog construction -> virtual people
    #   generation -> database construction and bulk load.
    #
    # Intermediate results of every preprocessing stage are written to their
    # own files for inspection.
    #
    # NOTE(review): the long commented-out "usage examples" section that
    # previously lived here duplicated the live pipeline below and was
    # removed as dead code; the unused constants outFileName and
    # outBrandsFile were dropped for the same reason.
    inSizesFile = "sizes.txt"
    upperOutFileName = "upperCaseSizes.txt"
    changedUrlsOutFileName = "changedUrlsSizes.txt"
    mergedOutFileName = "mergedSynonymousSizeTypes_sizes.txt"
    fixedSizesOutFile = "fixedSizes.txt"
    outPeopleFile = "generatedPeople.txt"
    sizeCatalogFile = "sizeCatalog.txt"

    io = IO()
    sizesDataLines = io.ReadFile(inSizesFile)
    sizes = Sizes(sizesDataLines)

    ### starts preprocessing ###

    # upper-case all column data (the url and size columns are exempt)
    upperDataLines = sizes.doUpperCase()
    io.WriteSizesHeader(upperOutFileName)
    io.WriteFile(upperOutFileName, upperDataLines, 'a')

    # replace the sizes-source urls with the brand urls
    upperSizes = Sizes(upperDataLines)
    changedURLsDataLines = upperSizes.changeURLs()
    io.WriteSizesHeader(changedUrlsOutFileName)
    io.WriteFile(changedUrlsOutFileName, changedURLsDataLines, 'a')

    # merge synonymous sizeTypes
    changedUrlsSizes = Sizes(changedURLsDataLines)
    mergedSynonymousSizeTypesLines = changedUrlsSizes.mergeSynonymousSizeTypes()
    io.WriteSizesHeader(mergedOutFileName)
    io.WriteFile(mergedOutFileName, mergedSynonymousSizeTypesLines, 'a')

    # fix the size values and persist the final preprocessed lines
    mergedSizes = Sizes(mergedSynonymousSizeTypesLines)
    fixedLines = mergedSizes.getFixedLines()
    io.WriteSizesHeader(fixedSizesOutFile)
    io.WriteFile(fixedSizesOutFile, fixedLines, 'a')

    ### ends preprocessing ###

    # build the size catalog out of the fixed lines
    fixedSizes = Sizes(fixedLines)
    sizeTypesProjections = fixedSizes.getSizeTypesProjections()
    sizeCatalog = fixedSizes.constructSizeCatalog(sizeTypesProjections)
    io.WriteSizeCatalogHeader(sizeCatalogFile)
    io.WriteFile(sizeCatalogFile, sizeCatalog, 'a')

    # generate virtual people from the fixed sizes file
    generator = Generator(fixedSizesOutFile)
    generator.GeneratePeople(10, outPeopleFile)

    # connect to the database and construct the schema, bulk-loading the files
    dbHelper = DBHelper("betaDB", fixedSizesOutFile, outPeopleFile, sizeCatalogFile)
    dbHelper.constructDbSchema()
    # single-arg parenthesized print behaves identically on Python 2 and 3
    print('all ok')
class Dataset():
    # Builds a numeric feature matrix (one float row per garment entry and
    # size bound) out of preprocessed size data lines.
    #
    # NOTE(review): data lines are assumed to be tab-separated with columns
    # sizeType, size, label, brand, url, clotheCategory, sizeCategory, and
    # every size value is assumed to be a 'low-high' range -- TODO confirm
    # against the preprocessing output format.

    # constructor
    #
    # - dataLines, list, preprocessed size data
    def __init__(self, dataLines):
        if not len(dataLines) > 0:
            # single-arg parenthesized print is identical on Python 2 and 3
            print('(!) dataLines list is empty')
        self.sizes = Sizes(dataLines)
        self.dataLines = []  # numeric rows produced by getDataset()
        self.features = []   # ordered sizeType names, one per column

    # creates a dataset line for every size catalog entry
    #
    # - features, list of sizeType names; each becomes one dataset column
    #
    # returns self.dataLines, a list of float rows ordered like `features`
    # (missing sizeTypes are 0), or None when a requested feature is not a
    # known sizeType
    def getDataset(self, features):
        self.features = features
        # BUGFIX: reset so repeated calls do not accumulate stale rows
        self.dataLines = []
        sizeTypes = self.sizes.getSizeTypesList()
        for feature in features:
            if feature not in sizeTypes:
                print('(!) feature ' + feature + ' is not a proper sizeType')
                return

        # group the (sizeType, size) pairs of all lines that describe the same
        # garment entry: equal brand, url, clothe category, size category, label
        groupDict = {}
        for line_i in self.sizes.sizesDataLines:
            columns = line_i.split('\t')
            size_type_i = columns[0]
            size_i = columns[1]
            label_i = columns[2]
            brand_i = columns[3]
            url_i = columns[4]
            clothe_category_i = columns[5]
            size_category_i = columns[6].split('\n')[0]
            for line_j in self.sizes.sizesDataLines:
                columns_j = line_j.split('\t')
                label_j = columns_j[2]
                brand_j = columns_j[3]
                url_j = columns_j[4]
                clothe_category_j = columns_j[5]
                # BUGFIX: this used to read the OUTER line's split result
                # (parts[0] instead of parts_j[0]), which made the
                # size-category comparison below always succeed
                size_category_j = columns_j[6].split('\n')[0]
                if (brand_i == brand_j) and (url_i == url_j) and (
                        clothe_category_i == clothe_category_j) and (
                        size_category_i == size_category_j) and (label_i == label_j):
                    key = brand_i + " : " + clothe_category_i + " : " + label_i + " : " + size_category_i
                    if key in groupDict:
                        groupDict[key].append((size_type_i, size_i))
                    else:
                        groupDict[key] = [(size_type_i, size_i)]

        # resolve entries where the same sizeType carries several different
        # sizes: split such an entry into one line per conflicting size
        fixedLines = {}
        for key in list(groupDict.keys()):
            featureList = list(set(groupDict[key]))
            seen = {}
            doubles = []  # sizeTypes that occur with more than one size
            for feature in featureList:
                if feature[0] not in seen:
                    seen[feature[0]] = feature[1]
                else:
                    doubles.append(feature[0])
            uniqueFeatures = [f for f in featureList if f[0] not in doubles]
            if len(featureList) == len(uniqueFeatures):
                # no conflicts: keep the entry as-is
                fixedLines[key] = featureList
            else:
                # one new entry per conflicting pair, sharing the unique part
                id_counter = 0
                for feature in featureList:
                    if feature[0] in doubles:
                        fixedLines[key + " : " + str(id_counter)] = uniqueFeatures + [feature]
                        id_counter += 1

        # every size value is a 'low-high' range: expand each entry into two
        # vectors, one built from the low bounds, one from the high bounds
        ext_dataset = {}
        id_counter = 0
        for key in list(fixedLines.keys()):
            lows = []
            highs = []
            for feature in fixedLines[key]:
                bounds = feature[1].split('-')
                lows.append((feature[0], bounds[0]))
                highs.append((feature[0], bounds[1]))
            ext_dataset[id_counter] = lows
            id_counter += 1
            ext_dataset[id_counter] = highs
            id_counter += 1

        # project every entry onto the requested feature columns (0 = missing)
        dataset = {}
        id_counter = 0
        feature_vector = {}
        for feature in features:
            feature_vector[feature] = 0
        for key in ext_dataset:
            new_feature_vector = dict(feature_vector)
            for feature in ext_dataset[key]:
                new_feature_vector[feature[0]] = feature[1]
            dataset[id_counter] = new_feature_vector
            id_counter += 1

        # flatten each projected vector into an ordered row of floats
        for key in dataset.keys():
            self.dataLines.append(
                [float(dataset[key][feature]) for feature in self.features])
        return self.dataLines

    # computes, over the rows produced by getDataset():
    # - featuresStats: for each feature, how many rows have a non-zero value
    # - patternDict:   how often each combination of present features occurs
    #
    # returns the tuple (featuresStats, patternDict)
    def getStats(self):
        featuresStats = {}
        patternDict = {}
        for feature in self.features:
            featuresStats[feature] = 0
        for data in self.dataLines:
            pattern = ''
            for i, feature in enumerate(self.features):
                if data[i] != 0:
                    featuresStats[feature] += 1
                    pattern += feature + ' + '
            patternDict[pattern] = patternDict.get(pattern, 0) + 1
        return (featuresStats, patternDict)