コード例 #1
0
ファイル: Generator.py プロジェクト: janmoravec/czech-crime
    def __init__(self, year, month, omitZeroValues, onCompleteCallback):
        """Run the generation pipeline for a single year/month.

        Loads the area and time lookup workbooks, optionally writes the lookup
        CSVs (depending on the ``generate*`` flags read from ``self``), then
        processes the Excel files found in ``../files/<year>/<month>``, merges
        the Letiste / train-station sheets into their target districts and
        writes the generated rows to
        ``../generated/crimeData-<year>:01-<month>[:with-zeros].csv``.

        Args:
            year: Year to process; compared with folder names as a string.
            month: Month to process; compared with folder names as a string.
            omitZeroValues: Stored on the instance; when falsy the output file
                name gets a ``:with-zeros`` suffix.
            onCompleteCallback: Optional callable, invoked without arguments
                after the crime-data CSV has been written and closed.

        Raises:
            KeyError: If a district sheet named in the transport table was not
                loaded for the requested period.
        """

        # null everything
        self.areaLookup = ""
        self.crimeLookup = ""
        self.timeLookup = ""
        self.writer = ""
        self.areaWriter = ""
        self.crimeWriter = ""
        self.timeWriter = ""
        self.recordId = 1

        # The sheet containers are filled during processing but were never
        # created here, i.e. they lived on the class and were shared by every
        # instance. Give each instance its own so runs cannot leak into each
        # other.
        self.districtCrimeDataSheetsByCode = {}
        self.districtCrimeDataSheets = []

        self.year = year
        self.month = month
        self.omitZeroValues = omitZeroValues
        self.onCompleteCallback = onCompleteCallback

        # 1) a) area lookup
        with xlrd.open_workbook("areaLookup.xls") as wb:
            self.areaLookup = AreaLookup(wb.sheet_by_index(0))

        # b) time lookup
        with xlrd.open_workbook("timeLookup.xls") as wb:
            self.timeLookup = TimeLookup(wb.sheet_by_index(0))

        # 2) generate lookups

        # area lookup
        if self.generateAreaLookup:
            with open("../generated/AreaLookup.csv", "wb") as f:
                rows = self.areaLookup.generate()
                self.areaWriter = UnicodeWriter(f)
                for row in rows:
                    self.areaWriter.writerow(row)

        # crime lookup - is generated manually
        if self.generateCrimeLookup:
            # open_workbook's second positional parameter is the log file, not
            # a file mode, so the stray "wb" that used to be passed is dropped.
            with xlrd.open_workbook("../files/a______.xls") as wb:
                self.crimeLookup = CrimeLookup(wb.sheet_by_index(0))

                # NOTE(review): the file is still opened (and therefore
                # truncated) although writing the rows is disabled - confirm
                # this is intended for a manually maintained lookup.
                with open("../generated/CrimeLookup.csv", "wb") as f:
                    self.crimeLookup.generate()

        # time lookup
        if self.generateTimeLookup:
            with open("../generated/TimeLookup.csv", "wb") as f:
                rows = self.timeLookup.generate()
                self.timeWriter = UnicodeWriter(f)
                for row in rows:
                    self.timeWriter.writerow(row)

        # 3) walk ../files/<year>/<month> and process every eligible workbook
        if self.generateCrimeData:

            directory = "../files"

            for yearFolder in listdir(directory):
                # only the year we're interested in
                if str(yearFolder) != str(self.year):
                    continue

                yearPath = directory + "/" + yearFolder
                for monthFolder in listdir(yearPath):
                    # only the month we're interested in
                    if str(monthFolder) != str(self.month):
                        continue

                    monthPath = yearPath + "/" + monthFolder
                    entries = listdir(monthPath)

                    # get time period id
                    periodId = self.timeLookup.getTimeIdByYearAndMonth(int(yearFolder), int(monthFolder))

                    for entry in entries:
                        # only excel files (".xlsx" also contains ".xls")
                        if ".xls" not in entry:
                            continue
                        # omit the __L / __R / __X variants
                        if "__L" in entry or "__R" in entry or "__X" in entry:
                            continue

                        districtSheet = self.processFile(monthPath + "/" + entry, periodId)
                        if districtSheet:
                            self.districtCrimeDataSheetsByCode[districtSheet.code] = districtSheet

            # 4) add Letiste to respective districts
            transports = [
                {"from": "x004110", "to": "0011"},
                {"from": "x064160", "to": "0602"},
                {"from": "x074170", "to": "0704"},
                {"from": "x174150", "to": "1706"},
                {"from": "x194130", "to": "1903"},
                # add train stations
                {"from": "x060050", "to": "0602"},
                {"from": "x070050", "to": "0707"},
            ]

            for transport in transports:
                baseDistrictSheet = self.districtCrimeDataSheetsByCode[transport["to"]]
                addingDistrictSheet = self.districtCrimeDataSheetsByCode[transport["from"]]
                baseDistrictSheet.addDistrictCrimeDataSheet(addingDistrictSheet)

            # 5) generate all rows from district crime data
            rows = self.generate()

            # don't need all the objects any more
            self.clear()

            # 6) write to csv file
            fileName = str(self.year) + ":" + "01-" + str(self.month)
            if not self.omitZeroValues:
                fileName = fileName + ":with-zeros"

            with open("../generated/crimeData-" + fileName + ".csv", "wb") as f:
                print("start writing file " + unicode(fileName))
                self.writer = UnicodeWriter(f)

                # no header row is written, only the content
                for row in rows:
                    self.writer.writerow(row)

            # complete callback - fired only after the CSV has been closed, so
            # the callback never sees a half-flushed file
            if self.onCompleteCallback:
                self.onCompleteCallback()
コード例 #2
0
ファイル: Generator.py プロジェクト: janmoravec/czech-crime
class Generator:
    """Build the crime-data CSV for one year/month from the source workbooks.

    All work is done by the constructor: it loads the lookup workbooks,
    optionally writes the lookup CSVs, processes the Excel files of the
    requested period and writes the generated rows to ``../generated``.
    """

    # Column names of the generated rows. The header row is currently not
    # written to the CSV.
    COLUMN_NAMES = [
        "id",
        "time",
        "area",
        "crime",
        "found",
        "found-end",
        "found-total",
        "solved",
        "solved-perc",
        "solved-additionally",
        "commited-drugged",
        "commited-alcohol",
        "commited-recidivst",
        "commited-under-15",
        "comitted-15-17",
        "comitted-under-18",
        "charged-total",
        "charged-recidivist",
        "charged-under-15",
        "charged-15-17",
        "charged-women",
        "damage-total",
        "damage-found",
    ]

    # Class-level defaults, kept for backward compatibility. __init__ replaces
    # each of them with a per-instance value; the mutable ones in particular
    # must never be filled through the class.
    areaLookup = ""
    crimeLookup = ""
    timeLookup = ""
    writer = ""
    areaWriter = ""
    crimeWriter = ""
    timeWriter = ""
    recordId = 1
    files = []
    districtCrimeDataSheetsByCode = {}
    districtCrimeDataSheets = []

    # store areasheets with keys, so we can find them retrospectively to add another areasheet
    areaSheets = {}

    # Switches selecting which outputs the constructor produces.
    generateAreaLookup = False
    generateCrimeLookup = False
    generateTimeLookup = False
    generateCrimeData = True

    def __init__(self, year, month, omitZeroValues, onCompleteCallback):
        """Run the generation pipeline for a single year/month.

        Args:
            year: Year to process; compared with folder names as a string.
            month: Month to process; compared with folder names as a string.
            omitZeroValues: Forwarded to every ``AreaCrimeDataSheet``; when
                falsy the output file name gets a ``:with-zeros`` suffix.
            onCompleteCallback: Optional callable, invoked without arguments
                after the crime-data CSV has been written and closed.

        Raises:
            KeyError: If a district sheet named in the transport table was not
                loaded for the requested period.
        """

        # null everything
        self.areaLookup = ""
        self.crimeLookup = ""
        self.timeLookup = ""
        self.writer = ""
        self.areaWriter = ""
        self.crimeWriter = ""
        self.timeWriter = ""
        self.recordId = 1

        # Per-instance containers. Previously processFile() appended to the
        # class-level list and clear() only rebound the instance attribute, so
        # a second Generator re-emitted every row of the first one.
        self.files = []
        self.districtCrimeDataSheetsByCode = {}
        self.districtCrimeDataSheets = []
        self.areaSheets = {}

        self.year = year
        self.month = month
        self.omitZeroValues = omitZeroValues
        self.onCompleteCallback = onCompleteCallback

        # 1) a) area lookup
        with xlrd.open_workbook("areaLookup.xls") as wb:
            self.areaLookup = AreaLookup(wb.sheet_by_index(0))

        # b) time lookup
        with xlrd.open_workbook("timeLookup.xls") as wb:
            self.timeLookup = TimeLookup(wb.sheet_by_index(0))

        # 2) generate lookups

        # area lookup
        if self.generateAreaLookup:
            with open("../generated/AreaLookup.csv", "wb") as f:
                rows = self.areaLookup.generate()
                self.areaWriter = UnicodeWriter(f)
                for row in rows:
                    self.areaWriter.writerow(row)

        # crime lookup - is generated manually
        if self.generateCrimeLookup:
            # open_workbook's second positional parameter is the log file, not
            # a file mode, so the stray "wb" that used to be passed is dropped.
            with xlrd.open_workbook("../files/a______.xls") as wb:
                self.crimeLookup = CrimeLookup(wb.sheet_by_index(0))

                # NOTE(review): the file is still opened (and therefore
                # truncated) although writing the rows is disabled - confirm
                # this is intended for a manually maintained lookup.
                with open("../generated/CrimeLookup.csv", "wb") as f:
                    self.crimeLookup.generate()

        # time lookup
        if self.generateTimeLookup:
            with open("../generated/TimeLookup.csv", "wb") as f:
                rows = self.timeLookup.generate()
                self.timeWriter = UnicodeWriter(f)
                for row in rows:
                    self.timeWriter.writerow(row)

        if self.generateCrimeData:

            # 3) go through ../files/<year>/<month> and process all files
            self._loadDistrictSheets("../files")

            # 4) add Letiste to respective districts
            self._mergeTransportSheets()

            # 5) generate all rows from district crime data
            rows = self.generate()

            # don't need all the objects any more
            self.clear()

            # 6) write to csv file
            fileName = str(self.year) + ":" + "01-" + str(self.month)
            if not self.omitZeroValues:
                fileName = fileName + ":with-zeros"

            with open("../generated/crimeData-" + fileName + ".csv", "wb") as f:
                print("start writing file " + unicode(fileName))
                self.writer = UnicodeWriter(f)

                # no header row is written, only the content
                for row in rows:
                    self.writer.writerow(row)

            # complete callback - fired only after the CSV has been closed, so
            # the callback never sees a half-flushed file
            if self.onCompleteCallback:
                self.onCompleteCallback()

    def _loadDistrictSheets(self, directory):
        """Process every eligible workbook in ``directory/<year>/<month>``.

        Each truthy result of ``processFile`` is stored in
        ``districtCrimeDataSheetsByCode`` under its ``code``.
        """
        for yearFolder in listdir(directory):
            # only the year we're interested in
            if str(yearFolder) != str(self.year):
                continue

            yearPath = directory + "/" + yearFolder
            for monthFolder in listdir(yearPath):
                # only the month we're interested in
                if str(monthFolder) != str(self.month):
                    continue

                monthPath = yearPath + "/" + monthFolder
                entries = listdir(monthPath)

                # get time period id
                periodId = self.timeLookup.getTimeIdByYearAndMonth(int(yearFolder), int(monthFolder))

                for entry in entries:
                    # only excel files (".xlsx" also contains ".xls")
                    if ".xls" not in entry:
                        continue
                    # omit the __L / __R / __X variants
                    if "__L" in entry or "__R" in entry or "__X" in entry:
                        continue

                    districtSheet = self.processFile(monthPath + "/" + entry, periodId)
                    if districtSheet:
                        self.districtCrimeDataSheetsByCode[districtSheet.code] = districtSheet

    def _mergeTransportSheets(self):
        """Add the Letiste and train-station sheets to their target districts.

        Raises KeyError when one of the listed sheets has not been loaded.
        """
        transports = [
            {"from": "x004110", "to": "0011"},
            {"from": "x064160", "to": "0602"},
            {"from": "x074170", "to": "0704"},
            {"from": "x174150", "to": "1706"},
            {"from": "x194130", "to": "1903"},
            # add train stations
            {"from": "x060050", "to": "0602"},
            {"from": "x070050", "to": "0707"},
        ]

        for transport in transports:
            baseDistrictSheet = self.districtCrimeDataSheetsByCode[transport["to"]]
            addingDistrictSheet = self.districtCrimeDataSheetsByCode[transport["from"]]
            baseDistrictSheet.addDistrictCrimeDataSheet(addingDistrictSheet)

    def addRecordArray(self, sourceArr, arrToAdd):
        """Add the numeric columns of ``arrToAdd`` onto ``sourceArr``.

        The first three columns only identify the record and are left alone;
        every later column is summed as a float and stored back as a string.
        A non-empty ``sourceArr`` is modified in place and returned; an empty
        one yields a copy of ``arrToAdd`` instead.
        """
        if len(sourceArr) == 0:
            return copy(arrToAdd)

        for i in range(3, len(sourceArr)):
            sourceArr[i] = str(float(sourceArr[i]) + float(arrToAdd[i]))
        return sourceArr

    def processFile(self, fileUrl, timeId):
        """Read one workbook and build the district sheet it describes.

        The district name and area name are taken from column 2 of rows 4 and
        5 of the first sheet. Every sheet of the workbook is wrapped in an
        ``AreaCrimeDataSheet`` and added to the district sheet, which is also
        appended to ``self.districtCrimeDataSheets``.

        Args:
            fileUrl: Path of the workbook to open.
            timeId: Time period id passed on to the created sheets.

        Returns:
            The ``DistrictCrimeDataSheet`` created for this workbook.
        """
        with xlrd.open_workbook(fileUrl) as wb:

            firstSheet = wb.sheet_by_index(0)
            districtName = firstSheet.row(4)[2].value
            areaName = firstSheet.row(5)[2].value
            districtCode = str(self.areaLookup.getAreaCodeByName(districtName))

            # if Letiste or train station, need to add next row to make it more specific
            if districtCode == "-1" or firstSheet.name in ("a060050", "a070050"):
                districtName = districtName + "-" + areaName
                districtCode = str(self.areaLookup.getAreaCodeByName(districtName))

            # data records for districts
            districtCrimeDataSheet = DistrictCrimeDataSheet(districtCode, districtName, timeId)
            self.districtCrimeDataSheets.append(districtCrimeDataSheet)

            print(unicode(districtName) + " - " + unicode(districtCode))

            # districtCode is a string at this point, so the "not found" marker
            # has to be compared as "-1"; comparing with the int -1 never matched
            if districtCode == "-1":
                print(districtCode)
                print(districtName)

            for sheetIndex in range(wb.nsheets):
                sheet = wb.sheet_by_index(sheetIndex)
                areaSheet = AreaCrimeDataSheet(sheet, timeId, self.omitZeroValues)
                districtCrimeDataSheet.addAreaCrimeDataSheet(areaSheet)

                # testing correct naming
                areaName = areaSheet.name
                # taking code straight from name of the sheet
                areaCode = areaSheet.code

                # NOTE(review): if `code` is a string this check can never
                # fire - confirm the type AreaCrimeDataSheet uses.
                if areaCode == -1:
                    Logger.throwError("Unknown Area: " + unicode(areaCode) + "," + unicode(areaName))

                print(unicode(areaCode) + "," + unicode(areaName))

            return districtCrimeDataSheet

    def generate(self):
        """Return the rows of all loaded district sheets as one flat list.

        Rows are concatenated in the order the district sheets were loaded.
        """
        files = []
        for districtSheet in self.districtCrimeDataSheets:
            files.extend(districtSheet.generate())
        return files

    def clear(self):
        """Drop all loaded district sheets so they can be garbage collected."""
        self.districtCrimeDataSheets = []
        self.districtCrimeDataSheetsByCode.clear()