Example #1
    def __init__(self, dataLines):

        if not dataLines:
            print('(!) dataLines list is empty')

        self.sizes = Sizes(dataLines)

        self.dataLines = []

        self.features = []
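
This fragment is the constructor of the Dataset class shown in full in Example #7. A minimal instantiation sketch, assuming the preprocessed file written by Example #3 (preprocessOutFile is the module-level name used there):

io = IO()
dataset = Dataset(io.ReadFile(preprocessOutFile))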
Example #2
def main():

#	preprocess()
	io = IO()
	dataLines = io.ReadFile(preprocessOutFile)

	sizes = Sizes(dataLines)	
	bounds = sizes.getSizesBounds()

	for bound in bounds:
		print(bound, bounds[bound])
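
A minimal sketch of what Sizes.getSizesBounds() might compute, assuming tab-separated data lines with the sizeType in column 0 and a fixed size range such as "30-32" in column 1 (the column layout follows Example #7; the body below is an illustration, not the library's implementation):

def get_sizes_bounds(dataLines):
    # map each sizeType to the (min, max) observed across all lines
    bounds = {}
    for line in dataLines:
        columns = line.rstrip('\n').split('\t')
        sizeType = columns[0]
        low, high = (float(part) for part in columns[1].split('-'))
        if sizeType in bounds:
            curLow, curHigh = bounds[sizeType]
            bounds[sizeType] = (min(curLow, low), max(curHigh, high))
        else:
            bounds[sizeType] = (low, high)
    return bounds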
Example #3
def preprocess():

	# load file as list of string datalines
	io = IO()
	sizesDataLines = io.ReadFile(inSizesFile)

	# create Sizes object
	sizes = Sizes(sizesDataLines)	


	### preprocessing starts ###

	# make all column data upperCase (exceptions are the url and size columns)
	upperDataLines = sizes.doUpperCase()

	# create new Sizes object using the upperCase sizes dataLines
	upperSizes = Sizes(upperDataLines)

	# change brands' URLs (from the sizes source URL to the brand URL)
	changedURLsDataLines = upperSizes.changeURLs()

	# create new Sizes object using the changedUrls sizes dataLines
	changedUrlsSizes = Sizes(changedURLsDataLines)

	# merge synonymous sizeTypes
	mergedSynonymousSizeTypesLines = changedUrlsSizes.mergeSynonymousSizeTypes()

	# create new Sizes object using the merged sizes dataLines
	mergedSizes = Sizes(mergedSynonymousSizeTypesLines)

	# get fixed lines as list of string datalines, and write them to file
	fixedLines = mergedSizes.getFixedLines()
	io.WriteSizesHeader(preprocessOutFile)
	io.WriteFile(preprocessOutFile, fixedLines, 'a')
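
These examples lean on an IO helper that is not shown. A minimal stand-in sketch, assuming ReadFile returns a file's raw lines and WriteFile takes an open mode ('w' or 'a'); the header column names are guessed from the column layout in Example #7, not taken from the real WriteSizesHeader:

class IO:

    # read a file and return its raw lines (newlines included)
    def ReadFile(self, fileName):
        with open(fileName) as f:
            return f.readlines()

    # write the given lines with the supplied mode ('w' truncates, 'a' appends)
    def WriteFile(self, fileName, dataLines, mode):
        with open(fileName, mode) as f:
            f.writelines(dataLines)

    # write the sizes column header (column names here are an assumption)
    def WriteSizesHeader(self, fileName):
        header = 'sizeType\tsize\tlabel\tbrand\turl\tclotheCategory\tsizeCategory\n'
        self.WriteFile(fileName, [header], 'w')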
Example #4
def main():

	inSizesFile = "sizes.txt"
	upperOutFileName = "upperCaseSizes.txt"
	changedUrlsOutFileName = "changedUrlsSizes.txt"
	mergedOutFileName = "mergedSynonymousSizeTypes_sizes.txt"
	outFileName = "fixedSizes.txt"
	fixedSizesOutFile = "fixedSizes.txt"
	outPeopleFile = "generatedPeople.txt"
	outBrandsFile = "generatedBrands.txt"
	sizeCatalogFile = "sizeCatalog.txt"




########################
#    usage examples    #
########################



### get useful stats from size data ###
#
#	# load file as list of string datalines
#	io = IO()
#	sizesDataLines = io.ReadFile(inSizesFile)
#
#	# create Sizes object
#	sizes = Sizes(sizesDataLines)	
#
#	# get the unique brands
#	brands = sizes.getBrands()
#
#	# get the unique clothe categories
#	clotheCategories = sizes.getClotheCategories()
#
#	# get the unique size types for every clothe category
#	clotheSizeTypes = sizes.getSizeTypes()
#
#	# get a list with all the unique size types
#	sizeTypes = sizes.getSizeTypesList()
#
#	# get the unique size categories for every clothe category
#	clotheSizeCategories = sizes.getSizeCats()		
#


### get size lines with merged synonymous SizeTypes ###
#
#	# load file as list of string datalines
#	io = IO()
#	sizesDataLines = io.ReadFile(inSizesFile)
#
#	# create Sizes object
#	sizes = Sizes(sizesDataLines)
#
#	# merge synonymous sizeTypes
#	mergedSynonymousSizeTypesLines = sizes.mergeSynonymousSizeTypes()
#
#	# write mergedSynonymousSizeTypes lines to file
#	io.WriteSizesHeader(mergedOutFileName)
#	io.WriteFile(mergedOutFileName, mergedSynonymousSizeTypesLines, 'a')	


### get fixed size lines ###
#
#	# load file as list of string datalines
#	io = IO()
#	sizesDataLines = io.ReadFile(inSizesFile)
#
#	# create Sizes object
#	sizes = Sizes(sizesDataLines)	
#
#	# get fixed lines as list of string datalines
#	fixedLines = sizes.getFixedLines()
#
#	# write fixed lines to file
#	io.WriteSizesHeader(fixedSizesOutFile)
#	io.WriteFile(fixedSizesOutFile, fixedLines, 'a')



### get the size bounds (min, max) for every sizeType ###
#
#	# load file as list of string datalines
#	io = IO()
#	sizesDataLines = io.ReadFile(inSizesFile)
#
#	# create Sizes object
#	sizes = Sizes(sizesDataLines)	
#
#	# get fixed lines as list of string datalines
#	fixedLines = sizes.getFixedLines()
#
#	# create new Sizes object using the fixed sizes dataLines
#	fixedSizes = Sizes(fixedLines)
#
#	# get the size bounds (min, max) for every sizeType as dictionary
#	# (!) works for size data lines with fixed sizes
#	sizeBounds = fixedSizes.getSizesBounds()



### construct the size catalog ###
#
#	# load file as list of string datalines
#	io = IO()
#	sizesDataLines = io.ReadFile(inSizesFile)
#	
#	# create Sizes object
#	sizes = Sizes(sizesDataLines)
#
#	mergedSynonymousSizeTypesLines = sizes.mergeSynonymousSizeTypes()
#
#	# create new Sizes object using the merged sizes dataLines
#	mergedSizes = Sizes(mergedSynonymousSizeTypesLines)
#
#	# get fixed lines as list of string datalines
#	fixedLines = mergedSizes.getFixedLines()
#
#	# create new Sizes object using the fixed sizes dataLines
#	fixedSizes = Sizes(fixedLines)	
#
#	# get the size type projections for every size catalog entry
#	sizeTypesProjections = fixedSizes.getSizeTypesProjections()
#
#	# get the size catalog as list of string datalines
#	sizeCatalog = fixedSizes.constructSizeCatalog(sizeTypesProjections)
#
#	# write size catalog to file
#	io.WriteSizeCatalogHeader(sizeCatalogFile)
#	io.WriteFile(sizeCatalogFile, sizeCatalog, 'a')



### generate virtual people ###
#
#	# load file as list of string datalines
#	io = IO()
#	sizesDataLines = io.ReadFile(inSizesFile)
#
#	# create Sizes object
#	sizes = Sizes(sizesDataLines)	
#
#	mergedSynonymousSizeTypesLines = sizes.mergeSynonymousSizeTypes()
#
#	# create new Sizes object using the merged sizes dataLines
#	mergedSizes = Sizes(mergedSynonymousSizeTypesLines)
#
#	# get fixed lines as list of string datalines
#	fixedLines = mergedSizes.getFixedLines()
#
#	# write fixed lines to file
#	io.WriteSizesHeader(fixedSizesOutFile)
#	io.WriteFile(fixedSizesOutFile, fixedLines, 'a')
#
#	generator = Generator(fixedSizesOutFile)
#	generator.GeneratePeople(10, outPeopleFile)



### from unprocessed sizes data to database - as one shot run ###
#
# preprocessing
# size catalog construction
# generate virtual people
# database construction and bulk load  


	# load file as list of string datalines
	io = IO()
	sizesDataLines = io.ReadFile(inSizesFile)

	# create Sizes object
	sizes = Sizes(sizesDataLines)	


	### preprocessing starts ###

	# make all column data upperCase (exceptions are the url and size columns), and write them to file
	upperDataLines = sizes.doUpperCase()
	io.WriteSizesHeader(upperOutFileName)
	io.WriteFile(upperOutFileName, upperDataLines, 'a')

	# create new Sizes object using the upperCase sizes dataLines
	upperSizes = Sizes(upperDataLines)

	# change brands' URLs (from the sizes source URL to the brand URL), and write them to file
	changedURLsDataLines = upperSizes.changeURLs()
	io.WriteSizesHeader(changedUrlsOutFileName)
	io.WriteFile(changedUrlsOutFileName, changedURLsDataLines, 'a')

	# create new Sizes object using the changedUrls sizes dataLines
	changedUrlsSizes = Sizes(changedURLsDataLines)

	# merge synonymous sizeTypes, and write them to file
	mergedSynonymousSizeTypesLines = changedUrlsSizes.mergeSynonymousSizeTypes()
	io.WriteSizesHeader(mergedOutFileName)
	io.WriteFile(mergedOutFileName, mergedSynonymousSizeTypesLines, 'a')

	# create new Sizes object using the merged sizes dataLines
	mergedSizes = Sizes(mergedSynonymousSizeTypesLines)

	# get fixed lines as list of string datalines, and write them to file
	fixedLines = mergedSizes.getFixedLines()
	io.WriteSizesHeader(fixedSizesOutFile)
	io.WriteFile(fixedSizesOutFile, fixedLines, 'a')

	### preprocessing ends ###


	# create new Sizes object using the fixed sizes dataLines
	fixedSizes = Sizes(fixedLines)	

	# get the size type projections for every size catalog entry
	sizeTypesProjections = fixedSizes.getSizeTypesProjections()

	# get the size catalog as list of string datalines, and write it to file
	sizeCatalog = fixedSizes.constructSizeCatalog(sizeTypesProjections)
	io.WriteSizeCatalogHeader(sizeCatalogFile)
	io.WriteFile(sizeCatalogFile, sizeCatalog, 'a')

	# generate virtual people, and write them to file
	generator = Generator(fixedSizesOutFile)
	generator.GeneratePeople(10, outPeopleFile)

	# create a dbHelper object, connect to database, and construct the db schema
	dbHelper = DBHelper("betaDB", fixedSizesOutFile, outPeopleFile, sizeCatalogFile)
	dbHelper.constructDbSchema()
	
	print('all ok')
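
The one-shot run above only defines main(); a conventional entry-point guard (assumed, since the original snippet stops at the final print):

if __name__ == '__main__':
    main()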
Example #5
def preprocess():

    # load file as list of string datalines
    io = IO()
    sizesDataLines = io.ReadFile(inSizesFile)

    # create Sizes object
    sizes = Sizes(sizesDataLines)

    ### preprocessing starts ###

    # make all column data upperCase (exceptions are the url and size columns)
    upperDataLines = sizes.doUpperCase()

    # create new Sizes object using the upperCase sizes dataLines
    upperSizes = Sizes(upperDataLines)

    # change brands' URLs (from the sizes source URL to the brand URL)
    changedURLsDataLines = upperSizes.changeURLs()

    # create new Sizes object using the changedUrls sizes dataLines
    changedUrlsSizes = Sizes(changedURLsDataLines)

    # merge synonymous sizeTypes
    mergedSynonymousSizeTypesLines = changedUrlsSizes.mergeSynonymousSizeTypes()

    # create new Sizes object using the merged sizes dataLines
    mergedSizes = Sizes(mergedSynonymousSizeTypesLines)

    # get fixed lines as list of string datalines, and write them to file
    fixedLines = mergedSizes.getFixedLines()
    io.WriteSizesHeader(preprocessOutFile)
    io.WriteFile(preprocessOutFile, fixedLines, 'a')
Example #6
def main():

    inSizesFile = "sizes.txt"
    upperOutFileName = "upperCaseSizes.txt"
    changedUrlsOutFileName = "changedUrlsSizes.txt"
    mergedOutFileName = "mergedSynonymousSizeTypes_sizes.txt"
    outFileName = "fixedSizes.txt"
    fixedSizesOutFile = "fixedSizes.txt"
    outPeopleFile = "generatedPeople.txt"
    outBrandsFile = "generatedBrands.txt"
    sizeCatalogFile = "sizeCatalog.txt"

    ########################
    #    usage examples    #
    ########################

    ### get useful stats from size data ###
    #
    #	# load file as list of string datalines
    #	io = IO()
    #	sizesDataLines = io.ReadFile(inSizesFile)
    #
    #	# create Sizes object
    #	sizes = Sizes(sizesDataLines)
    #
    #	# get the unique brands
    #	brands = sizes.getBrands()
    #
    #	# get the unique clothe categories
    #	clotheCategories = sizes.getClotheCategories()
    #
    #	# get the unique size types for every clothe category
    #	clotheSizeTypes = sizes.getSizeTypes()
    #
    #	# get a list with all the unique size types
    #	sizeTypes = sizes.getSizeTypesList()
    #
    #	# get the unique size categories for every clothe category
    #	clotheSizeCategories = sizes.getSizeCats()
    #

    ### get size lines with merged synonymous SizeTypes ###
    #
    #	# load file as list of string datalines
    #	io = IO()
    #	sizesDataLines = io.ReadFile(inSizesFile)
    #
    #	# create Sizes object
    #	sizes = Sizes(sizesDataLines)
    #
    #	# merge synonymous sizeTypes
    #	mergedSynonymousSizeTypesLines = sizes.mergeSynonymousSizeTypes()
    #
    #	# write mergedSynonymousSizeTypes lines to file
    #	io.WriteSizesHeader(mergedOutFileName)
    #	io.WriteFile(mergedOutFileName, mergedSynonymousSizeTypesLines, 'a')

    ### get fixed size lines ###
    #
    #	# load file as list of string datalines
    #	io = IO()
    #	sizesDataLines = io.ReadFile(inSizesFile)
    #
    #	# create Sizes object
    #	sizes = Sizes(sizesDataLines)
    #
    #	# get fixed lines as list of string datalines
    #	fixedLines = sizes.getFixedLines()
    #
    #	# write fixed lines to file
    #	io.WriteSizesHeader(fixedSizesOutFile)
    #	io.WriteFile(fixedSizesOutFile, fixedLines, 'a')

    ### get the size bounds (min, max) for every sizeType ###
    #
    #	# load file as list of string datalines
    #	io = IO()
    #	sizesDataLines = io.ReadFile(inSizesFile)
    #
    #	# create Sizes object
    #	sizes = Sizes(sizesDataLines)
    #
    #	# get fixed lines as list of string datalines
    #	fixedLines = sizes.getFixedLines()
    #
    #	# create new Sizes object using the fixed sizes dataLines
    #	fixedSizes = Sizes(fixedLines)
    #
    #	# get the size bounds (min, max) for every sizeType as dictionary
    #	# (!) works for size data lines with fixed sizes
    #	sizeBounds = fixedSizes.getSizesBounds()

    ### construct the size catalog ###
    #
    #	# load file as list of string datalines
    #	io = IO()
    #	sizesDataLines = io.ReadFile(inSizesFile)
    #
    #	# create Sizes object
    #	sizes = Sizes(sizesDataLines)
    #
    #	mergedSynonymousSizeTypesLines = sizes.mergeSynonymousSizeTypes()
    #
    #	# create new Sizes object using the merged sizes dataLines
    #	mergedSizes = Sizes(mergedSynonymousSizeTypesLines)
    #
    #	# get fixed lines as list of string datalines
    #	fixedLines = mergedSizes.getFixedLines()
    #
    #	# create new Sizes object using the fixed sizes dataLines
    #	fixedSizes = Sizes(fixedLines)
    #
    #	# get the size type projections for every size catalog entry
    #	sizeTypesProjections = fixedSizes.getSizeTypesProjections()
    #
    #	# get the size catalog as list of string datalines
    #	sizeCatalog = fixedSizes.constructSizeCatalog(sizeTypesProjections)
    #
    #	# write size catalog to file
    #	io.WriteSizeCatalogHeader(sizeCatalogFile)
    #	io.WriteFile(sizeCatalogFile, sizeCatalog, 'a')

    ### generate virtual people ###
    #
    #	# load file as list of string datalines
    #	io = IO()
    #	sizesDataLines = io.ReadFile(inSizesFile)
    #
    #	# create Sizes object
    #	sizes = Sizes(sizesDataLines)
    #
    #	mergedSynonymousSizeTypesLines = sizes.mergeSynonymousSizeTypes()
    #
    #	# create new Sizes object using the merged sizes dataLines
    #	mergedSizes = Sizes(mergedSynonymousSizeTypesLines)
    #
    #	# get fixed lines as list of string datalines
    #	fixedLines = mergedSizes.getFixedLines()
    #
    #	# write fixed lines to file
    #	io.WriteSizesHeader(fixedSizesOutFile)
    #	io.WriteFile(fixedSizesOutFile, fixedLines, 'a')
    #
    #	generator = Generator(fixedSizesOutFile)
    #	generator.GeneratePeople(10, outPeopleFile)

    ### from unprocessed sizes data to database - as one shot run ###
    #
    # preprocessing
    # size catalog construction
    # generate virtual people
    # database construction and bulk load

    # load file as list of string datalines
    io = IO()
    sizesDataLines = io.ReadFile(inSizesFile)

    # create Sizes object
    sizes = Sizes(sizesDataLines)

    ### preprocessing starts ###

    # make all column data upperCase (exceptions are the url and size columns), and write them to file
    upperDataLines = sizes.doUpperCase()
    io.WriteSizesHeader(upperOutFileName)
    io.WriteFile(upperOutFileName, upperDataLines, 'a')

    # create new Sizes object using the upperCase sizes dataLines
    upperSizes = Sizes(upperDataLines)

    # change brands' URLs (from the sizes source URL to the brand URL), and write them to file
    changedURLsDataLines = upperSizes.changeURLs()
    io.WriteSizesHeader(changedUrlsOutFileName)
    io.WriteFile(changedUrlsOutFileName, changedURLsDataLines, 'a')

    # create new Sizes object using the changedUrls sizes dataLines
    changedUrlsSizes = Sizes(changedURLsDataLines)

    # merge synonymous sizeTypes, and write them to file
    mergedSynonymousSizeTypesLines = changedUrlsSizes.mergeSynonymousSizeTypes()
    io.WriteSizesHeader(mergedOutFileName)
    io.WriteFile(mergedOutFileName, mergedSynonymousSizeTypesLines, 'a')

    # create new Sizes object using the merged sizes dataLines
    mergedSizes = Sizes(mergedSynonymousSizeTypesLines)

    # get fixed lines as list of string datalines, and write them to file
    fixedLines = mergedSizes.getFixedLines()
    io.WriteSizesHeader(fixedSizesOutFile)
    io.WriteFile(fixedSizesOutFile, fixedLines, 'a')

    ### preprocessing ends ###

    # create new Sizes object using the fixed sizes dataLines
    fixedSizes = Sizes(fixedLines)

    # get the size type projections for every size catalog entry
    sizeTypesProjections = fixedSizes.getSizeTypesProjections()

    # get the size catalog as list of string datalines, and write it to file
    sizeCatalog = fixedSizes.constructSizeCatalog(sizeTypesProjections)
    io.WriteSizeCatalogHeader(sizeCatalogFile)
    io.WriteFile(sizeCatalogFile, sizeCatalog, 'a')

    # generate virtual people, and write them to file
    generator = Generator(fixedSizesOutFile)
    generator.GeneratePeople(10, outPeopleFile)

    # create a dbHelper object, connect to database, and construct the db schema
    dbHelper = DBHelper("betaDB", fixedSizesOutFile, outPeopleFile,
                        sizeCatalogFile)
    dbHelper.constructDbSchema()

    print('all ok')
Example #7
class Dataset():

    #constructor
    #
    # - dataLines, list, preprocessed size data
    def __init__(self, dataLines):

        if not dataLines:
            print('(!) dataLines list is empty')

        self.sizes = Sizes(dataLines)

        self.dataLines = []

        self.features = []

    # builds one numeric feature row per grouped size-catalog entry
    # returns a list of rows: one float per requested feature, with 0.0 when
    # that sizeType is absent from the entry
    def getDataset(self, features):

        self.features = features

        sizeTypes = self.sizes.getSizeTypesList()
        for feature in features:
            if feature not in sizeTypes:
                print('(!) feature ' + feature + ' is not a proper sizeType')
                return

        groupDict = {}

        # for every line
        for line_i in self.sizes.sizesDataLines:

            # split line to columns using 'tab'
            columns = line_i.split('\t')
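            # columns: sizeType, size, label, brand, url, clotheCategory, sizeCategory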

            size_type_i = columns[0]
            size_i = columns[1]
            label_i = columns[2]
            brand_i = columns[3]
            url_i = columns[4]
            clothe_category_i = columns[5]
            parts = columns[6].split('\n')
            size_category_i = parts[0]

            # pair it with every other line (quadratic grouping pass)
            for line_j in self.sizes.sizesDataLines:

                # split line to columns using 'tab'
                columns = line_j.split('\t')

                size_type_j = columns[0]
                size_j = columns[1]
                label_j = columns[2]
                brand_j = columns[3]
                url_j = columns[4]
                clothe_category_j = columns[5]
                parts_j = columns[6].split('\n')
                size_category_j = parts_j[0]  # fix: was parts[0], which reused line_i's split

                if (brand_i == brand_j and url_i == url_j
                        and clothe_category_i == clothe_category_j
                        and size_category_i == size_category_j
                        and label_i == label_j):

                    key = brand_i + " : " + clothe_category_i + " : " + label_i + " : " + size_category_i
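                    # hypothetical example key: "LEVIS : JEANS : MEN : US"
                    # (format: brand : clotheCategory : label : sizeCategory)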

                    if key in groupDict:
                        groupDict[key].append((size_type_i, size_i))
                    else:
                        sizeList = []
                        sizeList.append((size_type_i, size_i))
                        groupDict[key] = sizeList

        fixedLines = {}

        for key in list(groupDict.keys()):

            featureList = list(set(groupDict[key]))

            id_counter = 0
            tempDict = {}
            doubles = []

            for feature in featureList:
                if feature[0] not in tempDict:
                    tempDict[feature[0]] = feature[1]
                else:
                    doubles.append(feature[0])

            tempFeatureList = []

            for feature in featureList:
                if feature[0] not in doubles:
                    tempFeatureList.append(feature)

            if len(featureList) == len(tempFeatureList):
                fixedLines[key] = featureList
            else:
                for feature in featureList:
                    if feature[0] in doubles:

                        key_new = key + " : " + str(id_counter)
                        id_counter += 1

                        featureList_new = list(tempFeatureList)
                        featureList_new.append(feature)

                        fixedLines[key_new] = featureList_new

        ext_dataset = {}
        id_counter = 0

        for key in list(fixedLines.keys()):

            features1 = []
            features2 = []

            for feature in fixedLines[key]:

                parts = feature[1].split('-')

                features1.append((feature[0], parts[0]))
                features2.append((feature[0], parts[1]))

            ext_dataset[id_counter] = features1
            id_counter += 1
            ext_dataset[id_counter] = features2
            id_counter += 1

        dataset = {}
        id_counter = 0

        feature_vector = {}
        for feature in features:
            feature_vector[feature] = 0

        for key in ext_dataset:

            new_feature_vector = dict(feature_vector)

            for feature in ext_dataset[key]:
                new_feature_vector[feature[0]] = feature[1]

            dataset[id_counter] = new_feature_vector
            id_counter += 1

        # reset the stored rows in case getDataset is called more than once
        self.dataLines = []

        for key in dataset.keys():

            dataLine = []

            for feature in self.features:
                dataLine.append(float(dataset[key][feature]))

            self.dataLines.append(dataLine)

        return self.dataLines

    def getStats(self):

        featuresStats = {}
        patternDict = {}

        for feature in self.features:
            featuresStats[feature] = 0

        for data in self.dataLines:

            row = ''

            for i in range(len(self.features)):
                if data[i] != 0:
                    featuresStats[self.features[i]] += 1
                    row += self.features[i] + ' + '

            if row in patternDict.keys():
                patternDict[row] += 1
            else:
                patternDict[row] = 1

        return (featuresStats, patternDict)
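
A hedged usage sketch for Dataset: the input file name follows the earlier examples, and the feature names are hypothetical sizeTypes that would have to exist in the data. getDataset() returns one numeric row per grouped entry; getStats() then counts how often each feature, and each feature combination, is set:

io = IO()
fixedLines = io.ReadFile('fixedSizes.txt')  # preprocessed, fixed size data

dataset = Dataset(fixedLines)
rows = dataset.getDataset(['WAIST', 'CHEST', 'HIPS'])  # hypothetical sizeTypes

featureStats, patternStats = dataset.getStats()
print(featureStats)   # how many rows set each feature
print(patternStats)   # how often each feature combination occurs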