def timeSeries(fullTable, fromDay, toDay, byCriteria, nameColumn, Altersgruppen, Geschlechter):
    regions = fullTable[:, [dt.first(nameColumn)], dt.by(byCriteria)]
    #regions = regions[:5, :]
    print("Creating time series for regions:")
    print(regions)
    dailysByCriteria = {}
    start = time.perf_counter()
    for i, lk in enumerate(regions[:, byCriteria].to_list()[0]):
        print("Processing Region '{}'".format(regions[i, nameColumn][0, 0]))
        start_region = time.perf_counter()
        pmu.printMemoryUsage("pre analyzeDailyAltersgruppenGeschlechter")
        dailysByCriteria[lk] = analyzeDailyAltersgruppenGeschlechter(
            fullTable, filterByDayAndCriteria(fromDay, toDay, (byCriteria == lk)),
            Altersgruppen, Geschlechter)
        finish = time.perf_counter()
        duration = finish - start
        print("Region took {:.2f} seconds, elapsed {:.2f} minutes, time to completion: {:.2f} minutes"
              .format(finish - start_region, duration / 60,
                      duration / (i + 1) * (regions.nrows - i) / 60))
        pmu.printMemoryUsage("post analyzeDailyAltersgruppenGeschlechter")
        print("Done {} of {}, key = {} name = {}".format(
            i + 1, regions.nrows, lk, regions[i, nameColumn][0, 0]))
        #if lk >= 0:
        #    break
    return regions, dailysByCriteria
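
# A minimal, self-contained sketch (assuming the H2O datatable package, as
# used above) of the grouping idiom timeSeries() starts with: dt.first()
# inside dt.by() yields one row per key, pairing each region id with the
# first name observed for it. The column values here are toy stand-ins.
def _demo_region_enumeration():
    import datatable as dt
    frame = dt.Frame(IdLandkreis=[1001, 1001, 1002],
                     Landkreis=["SK Flensburg", "SK Flensburg", "SK Kiel"])
    # one row per IdLandkreis, carrying the first Landkreis name seen
    regions = frame[:, [dt.first(dt.f.Landkreis)], dt.by(dt.f.IdLandkreis)]
    return regions[:, dt.f.IdLandkreis].to_list()[0]  # [1001, 1002]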
def main():
    dt.options.progress.enabled = False
    parser = argparse.ArgumentParser(description='Fast queries on all data')
    parser.add_argument('file', metavar='fileName', type=str, nargs='?',
                        help='Full unified NPGEO COVID19 Germany data as .csv or .jay file',
                        default="archive_v2/all-data.jay")
    parser.add_argument('-d', '--output-dir', dest='outputDir', default="series")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument('-i', '--incremental-update-dir', dest='incrementalUpdateDir', default="")
    parser.add_argument("--agegroups", help="also create columns for all separate age groups",
                        action="store_true")
    parser.add_argument("--gender", help="also create columns for all separate gender groups",
                        action="store_true")
    parser.add_argument("-v", "--verbose", help="make more noise", action="store_true")
    parser.add_argument("--memorylimit", type=int, help="maximum memory limit for a database file")
    args = parser.parse_args()
    #print(args)
    partitioned = False
    pmu.printMemoryUsage("after start")
    if len(pmu.getJayTablePartitions(args.file)) > 0:
        fullTable = pmu.loadJayTablePartioned(args.file, tempDir=args.tempDir,
                                              memoryLimit=args.memorylimit, verbose=args.verbose)
        partitioned = True
    elif os.path.isfile(args.file):
        print("Loading " + args.file)
        fullTable = dt.fread(args.file, tempdir=args.tempDir,
                             memory_limit=args.memorylimit, verbose=args.verbose)
    print("Done loading table from '{}', rows: {} cols: {}".format(
        args.file, fullTable.nrows, fullTable.ncols))
    pmu.printMemoryUsage("after load")
    oldTables = {}
    if args.incrementalUpdateDir != "":
        updateFiles = sorted(glob.glob(args.incrementalUpdateDir + "/*.csv"))
        for f in updateFiles:
            if not f.endswith("--nicht erhoben-.csv"):
                table = dt.fread(f)
                print("Load {}".format(f))
                lkID = table[0, "IdLandkreis"]  #.to_list()[0][0]
                print("lkID {}".format(lkID))
                oldTables[lkID] = table
    analyze(fullTable, args, oldTables)
def main():
    parser = argparse.ArgumentParser(description='Fast queries on all data')
    parser.add_argument('file', metavar='fileName', type=str, nargs='?',
                        help='Full unified NPGEO COVID19 Germany data as .csv or .jay file',
                        default="archive_v2/all-data.jay")
    parser.add_argument('-d', '--output-dir', dest='outputDir', default="series")
    parser.add_argument("--agegroups", help="also create columns for all separate age groups",
                        action="store_true")
    args = parser.parse_args()
    #print(args)
    pmu.printMemoryUsage("after start")
    print("Loading " + args.file)
    fullTable = dt.fread(args.file)
    print("Done loading table from '{}', rows: {} cols: {}".format(
        args.file, fullTable.nrows, fullTable.ncols))
    pmu.printMemoryUsage("after load")
    if False:  # disabled: materializing the full table costs extra memory
        print("Materializing fullTable")
        fullTable.materialize(to_memory=True)
        pmu.printMemoryUsage("after materialize")
    analyze(fullTable, args)
def main():
    dt.options.progress.enabled = False
    parser = argparse.ArgumentParser(description='Fast queries on all data')
    parser.add_argument('file', metavar='fileName', type=str, nargs='?',
                        help='Full unified NPGEO COVID19 Germany data as .csv or .jay file',
                        default="archive_v2/all-data.jay")
    parser.add_argument('-d', '--output-dir', dest='outputDir', default="series")
    parser.add_argument('-i', '--incremental-update-dir', dest='incrementalUpdateDir', default="")
    parser.add_argument("--agegroups", help="also create columns for all separate age groups",
                        action="store_true")
    parser.add_argument("--gender", help="also create columns for all separate gender groups",
                        action="store_true")
    args = parser.parse_args()
    #print(args)
    pmu.printMemoryUsage("after start")
    print("Loading " + args.file)
    fullTable = dt.fread(args.file)
    print("Done loading table from '{}', rows: {} cols: {}".format(
        args.file, fullTable.nrows, fullTable.ncols))
    pmu.printMemoryUsage("after load")
    oldTables = {}
    if args.incrementalUpdateDir != "":
        updateFiles = sorted(glob.glob(args.incrementalUpdateDir + "/*.csv"))
        for f in updateFiles:
            if not f.endswith("--nicht erhoben-.csv"):
                table = dt.fread(f)
                print("Load {}".format(f))
                lkID = table[0, "IdLandkreis"]  #.to_list()[0][0]
                print("lkID {}".format(lkID))
                oldTables[lkID] = table
    if False:  # disabled: materializing the full table costs extra memory
        print("Materializing fullTable")
        fullTable.materialize(to_memory=True)
        pmu.printMemoryUsage("after materialize")
    analyze(fullTable, args, oldTables)
def isNewData(dataFilename, daysIncluded):
    pmu.printMemoryUsage("begin of isNewData")
    peekTable = dt.fread(dataFilename, max_nrows=1)
    print("Checking " + dataFilename)
    ##print(peekTable)
    ##datenStand = peekTable[0, dt.f.DatenstandISO]
    dss = peekTable[0, "Datenstand"]
    print("Datenstand", dss)
    ds = cd.datetimeFromDatenstandAny(dss)
    dsdy = cd.dayFromDate(ds)
    pmu.printMemoryUsage("isNewData")
    isNew = dsdy not in daysIncluded
    if isNew:
        print("contains new day {}".format(dsdy))
    else:
        print("contains day {} already in full table".format(dsdy))
    pmu.printMemoryUsage("end of isNewData")
    return isNew
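
# A minimal sketch of the cheap "peek" isNewData() relies on: dt.fread() with
# max_nrows=1 parses only the first data row, so the Datenstand stamp can be
# inspected without loading a multi-hundred-megabyte dump. The conversion to
# a day number goes through the project-local cd helpers and is omitted here.
def _demo_peek_datenstand(dataFilename):
    import datatable as dt
    peek = dt.fread(dataFilename, max_nrows=1)  # reads a single row only
    return peek[0, "Datenstand"]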
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(description='Create a unified data file from daily dumps')
    parser.add_argument('files', metavar='fileName', type=str, nargs='+',
                        help='NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument("--flushread",
                        help="flush full table and re-read after checkpoint for a lower memory footprint",
                        action="store_true")
    parser.add_argument("--force", help="build new database anyway", action="store_true")
    parser.add_argument("--destructivesave",
                        help="release memory gradually while saving and reload after saving",
                        action="store_true")
    parser.add_argument("-v", "--verbose", help="make more noise", action="store_true")
    parser.add_argument("--partitionsize", type=int, help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit", type=int, help="maximum memory limit for a database file")
    parser.add_argument("--checkpoint", type=int,
                        help="write checkpoint after amount of minutes elapsed", default=10)
    parser.add_argument("--nthreads", type=int,
                        help="number of concurrent threads used by python dataframes, "
                             "0 = as many as cores, 1 = single-threaded, -3 = 3 threads less than cores",
                        default=0)
    args = parser.parse_args()
    print(args)
    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)
    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")
    partitioned = False
    daysIncluded = []
    if len(pmu.getJayTablePartitions(jayPath)) > 0:
        fullTable = pmu.loadJayTablePartioned(jayPath, tempDir=args.tempDir,
                                              memoryLimit=args.memorylimit, verbose=args.verbose)
        if fullTable is None:
            print("The file {} is not a valid jay file, please remove it and retry".format(jayPath))
            exit(1)
        partitioned = True
    elif os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)
    if fullTable is not None:
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(fullTable[:, [dt.first(dt.f.DatenstandTag)],
                                        dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")
    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")
                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print("-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
                    secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    pmu.saveJayTablePartioned(fullTable, jayFile, args.outputDir,
                                              args.partitionsize, True, args.destructivesave)
                    if args.flushread or args.destructivesave:
                        print("Re-reading checkpoint @ {}".format(datetime.now()))
                        fullTable = None
                        fullTable = pmu.loadJayTablePartioned(jayPath, tempDir=args.tempDir,
                                                              memoryLimit=args.memorylimit,
                                                              verbose=args.verbose)
                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))
    if addedData or not partitioned:
        pmu.printMemoryUsage("before full save")
        #pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.saveJayTablePartioned(fullTable, "all-data.jay", args.outputDir,
                                  args.partitionsize, True, args.destructivesave)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.jay'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
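
# A minimal sketch of the accumulate-and-rbind pattern in main() above: each
# unified daily table is appended in place with Frame.rbind(), which is the
# point where memory consumption grows (hence the printMemoryUsage() calls
# bracketing it in the real code).
def _demo_rbind_accumulation():
    import datatable as dt
    full = dt.Frame(DatenstandTag=[1, 1], AnzahlFall=[5, 3])
    new = dt.Frame(DatenstandTag=[2, 2], AnzahlFall=[7, 4])
    full.rbind(new)    # append rows in place; column sets must match
    return full.nrows  # 4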
def unify(table):
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)
    dsdy = cd.dayFromDate(ds)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if "DatenstandISO" not in table.names:
        isodate = cd.dateStrYMDFromDay(dsdy)
        t = t[:, dt.f[:].extend({"DatenstandISO": isodate})]
    if not hasRefdatum:
        t = t[:, dt.f[:].extend({"Refdatum": str(cd.day0d), "RefdatumISO": dt.f.MeldedatumISO})]
    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]
    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]
    t = t[:, dt.f[:].extend({"FallGruppe": "", "MeldeTag": nan, "RefTag": nan,
                             "DatenstandTag": dsdy})]
    #t = t[:, dt.f[:].extend({"Bevoelkerung": 0, "FaellePro100k": 0.0,
    #                         "TodesfaellePro100k": 0.0, "isStadt": False})]
    #t = t[:, dt.f[:].extend({"Flaeche": 0.0, "FaelleProKm2": 0.0,
    #                         "TodesfaelleProKm2": 0.0, "Dichte": 0.0})]
    #print("unified fields", t.names)
    #Bevoelkerung = loadLandkreisBeveolkerung()
    #Flaeche = loadLandkreisFlaeche()
    #Census = loadCensus()
    #pmu.printMemoryUsage("unify pre realize")
    #t.materialize(to_memory=True)
    pmu.printMemoryUsage("unify pre dict")
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")
    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = str(md)
            d["RefTag"][r] = mdy
        fg = str(d["IdLandkreis"][r]) + d["Altersgruppe"][r] + d["Geschlecht"][r] + \
            str(int(d["MeldeTag"][r]))
        if int(d["IstErkrankungsbeginn"][r]) == 1:
            rds = d["Refdatum"][r]
            if pmu.is_int(rds):
                rd = cd.datetimeFromStampStr(rds)
            else:
                rd = datetimeFromDateStr3(rds)
            rdy = cd.dayFromDate(rd)
            d["RefTag"][r] = rdy
            fg = fg + ":" + str(rdy)
        d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)
    finish = time.perf_counter()
    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))
    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t
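
# A minimal sketch of the column-surgery idioms unify() is built on:
# dt.f[:].remove() drops a column by name and dt.f[:].extend() appends new
# columns (constants here; f-expressions also work). The column names are toy
# stand-ins for the real RKI fields.
def _demo_column_surgery():
    import datatable as dt
    t = dt.Frame(AnzahlFall=[1, 2], Altersgruppe2=["A", "B"])
    t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]   # drop the unwanted column
    t = t[:, dt.f[:].extend({"DatenstandTag": 42})]   # add a constant column
    return t.names  # ('AnzahlFall', 'DatenstandTag')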
def analyze(fullTable, args):
    #fullTable = fullTable[dt.f.DatenstandTag > 382 - 20, :]
    print("Analyzing")
    pmu.printMemoryUsage("begin analyze")
    print("Keys:")
    print(fullTable.keys())
    firstDumpDay = cint(fullTable[:, "DatenstandTag"].min())
    lastDumpDay = cint(fullTable[:, "DatenstandTag"].max())
    print("firstDumpDay", firstDumpDay)
    print("lastDumpDay", lastDumpDay)
    #fromDay = lastDumpDay - 27
    fromDay = firstDumpDay
    toDay = lastDumpDay + 1
    fullTable = fullTable[:, dt.f[:].extend({"MeldeDelay": dt.f.DatenstandTag - dt.f.MeldeTag - 1})]
    fullTable = fullTable[:, dt.f[:].extend({"RefDelay": dt.f.DatenstandTag - dt.f.RefTag - 1})]
    fullTable.materialize()
    Altersgruppen = []
    if args.agegroups:
        Altersgruppen = dt.unique(fullTable[:, "Altersgruppe"]).to_list()[0]
    print("Altersgruppen", Altersgruppen)
    Geschlechter = dt.unique(fullTable[:, "Geschlecht"]).to_list()[0]
    print("Geschlechter", Geschlechter)
    census = dt.fread("CensusByRKIAgeGroups.csv")
    censusDeutschland = census[dt.f.Name == "Deutschland", :]
    print(censusDeutschland)
    flaechen = loadFlaechen()
    #for id in range(1, 16):
    #    censusBL = census[dt.f.Code == id, :]
    #    print(censusBL)
    print("Processing 'Deutschland'")
    pmu.printMemoryUsage("begin Deutschland")
    deutschland = analyzeDailyAltersgruppenGeschlechter(
        fullTable, filterByDay(fromDay, toDay), Altersgruppen, Geschlechter)
    deutschland = insertDates(deutschland)
    deutschland = insertRegionInfo(deutschland, 0, "Deutschland", "BR", 0, "Deutschland", flaechen[0])
    print(deutschland)
    pmu.printMemoryUsage("pre makeIncidenceColumns")
    deutschland = makeIncidenceColumns(deutschland, censusDeutschland, Altersgruppen, Geschlechter)
    print(deutschland)
    pmu.printMemoryUsage("pre save")
    pmu.saveCsvTable(deutschland, "series-{}-{}.csv".format(0, "Deutschland"), args.outputDir)
    pmu.printMemoryUsage("post save")
    deutschland = None
    #exit(0)
    print("Processing Bundesländer")
    bundeslaender, bundeslaender_numbers = timeSeries(
        fullTable, fromDay, toDay, dt.f.IdBundesland, dt.f.Bundesland, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Bundesländer timeSeries")
    for i in range(bundeslaender.nrows):
        bl_name = bundeslaender[i, dt.f.Bundesland].to_list()[0][0]
        bl_id = bundeslaender[i, dt.f.IdBundesland].to_list()[0][0]
        if bl_id > 0:
            #bundeslaender_numbers[bl_id] = bundeslaender_numbers[bl_id][:, dt.f[:].extend(
            #    {"IdLandkreis": bl_id, "Landkreis": bl_name, "IdBundesland": bl_id,
            #     "Bundesland": bl_name, "Flaeche": flaechen[bl_id]})]
            bundeslaender_numbers[bl_id] = insertDates(bundeslaender_numbers[bl_id])
            bundeslaender_numbers[bl_id] = insertRegionInfo(
                bundeslaender_numbers[bl_id], bl_id, bl_name, "BL", bl_id, bl_name, flaechen[0])
            censusBL = census[dt.f.IdLandkreis == bl_id, :]
            print(censusBL)
            bundeslaender_numbers[bl_id] = makeIncidenceColumns(
                bundeslaender_numbers[bl_id], censusBL, Altersgruppen, Geschlechter)
            pmu.printMemoryUsage("pre save {}".format(bl_name))
            pmu.saveCsvTable(bundeslaender_numbers[bl_id],
                             "series-{}-{}.csv".format(bl_id, bl_name), args.outputDir)
    bundeslaender = None
    bundeslaender_numbers = None
    print("Processing Landkreise")
    landKreise, landkreise_numbers = timeSeries(
        fullTable, fromDay, toDay, dt.f.IdLandkreis, dt.f.Landkreis, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Landkreise timeSeries")
    #print(landKreise)
    #print(landkreise_numbers)
    for i in range(landKreise.nrows):
        print(i)
        lk_name = landKreise[i, dt.f.Landkreis].to_list()[0][0]
        lk_id = landKreise[i, dt.f.IdLandkreis].to_list()[0][0]
        if lk_id > 0:
            censusLK = census[dt.f.IdLandkreis == lk_id, :]
            bl_name = censusLK[0, dt.f.Bundesland].to_list()[0][0]
            bl_id = censusLK[0, dt.f.IdBundesland].to_list()[0][0]
            lk_typ = landKreisTyp(lk_id, lk_name)
            landkreise_numbers[lk_id] = insertDates(landkreise_numbers[lk_id])
            landkreise_numbers[lk_id] = insertRegionInfo(
                landkreise_numbers[lk_id], lk_id, lk_name, lk_typ, bl_id, bl_name, flaechen[lk_id])
            #landkreise_numbers[lk_id] = landkreise_numbers[lk_id][:, dt.f[:].extend(
            #    {"IdLandkreis": lk_id, "Landkreis": lk_name, "IdBundesland": bl_id,
            #     "Bundesland": bl_name, "Flaeche": flaechen[lk_id]})]
            print(censusLK)
            landkreise_numbers[lk_id] = makeIncidenceColumns(
                landkreise_numbers[lk_id], censusLK, Altersgruppen, Geschlechter)
            pmu.printMemoryUsage("pre save {}".format(lk_name))
            pmu.saveCsvTable(landkreise_numbers[lk_id],
                             "series-{}-{}.csv".format(lk_id, lk_name), args.outputDir)
    #print(landKreise)
    return fullTable
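
# A minimal sketch of the derived-column step at the top of analyze(): the
# reporting delay is computed from two existing columns inside a single
# extend(), evaluated column-wise by datatable.
def _demo_melde_delay():
    import datatable as dt
    t = dt.Frame(DatenstandTag=[10, 11], MeldeTag=[8, 8])
    t = t[:, dt.f[:].extend({"MeldeDelay": dt.f.DatenstandTag - dt.f.MeldeTag - 1})]
    return t[:, "MeldeDelay"].to_list()[0]  # [1, 2]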
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(description='Create a unified data file from daily dumps')
    parser.add_argument('files', metavar='fileName', type=str, nargs='+',
                        help='NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    #parser.add_argument("--flushmemfull", help="flush full table to disk for lower memory footprint",
    #                    action="store_true")
    parser.add_argument("--materializeNew",
                        help="materialize new table to disk for lower memory footprint",
                        action="store_true")
    parser.add_argument("--noMaterialize",
                        help="run with higher memory footprint, or much higher memory footprint "
                             "with --inMemory",
                        action="store_true")
    parser.add_argument("--inMemory", help="run faster but with higher memory footprint",
                        action="store_true")
    parser.add_argument("--checkpoint", help="write checkpoint after amount of minutes elapsed",
                        default=10)
    parser.add_argument("--nthreads",
                        help="number of concurrent threads used by python dataframes, "
                             "0 = as many as cores, 1 = single-threaded, -3 = 3 threads less than cores",
                        default=0)
    args = parser.parse_args()
    print(args)
    print("args.inMemory", args.inMemory)
    print("args.materializeNew", args.materializeNew)
    print("args.noMaterialize", args.noMaterialize)
    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)
    fullTable = None
    jayPath = args.outputDir + "/all-data.jay"
    print(jayPath)
    pmu.printMemoryUsage("after start")
    daysIncluded = []
    if os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(fullTable[:, [dt.first(dt.f.DatenstandTag)],
                                        dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")
    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")
                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("after checkColumns")
                    if not args.noMaterialize:
                        fullTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize fullTable")
                    if args.materializeNew:
                        newTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize newTable")
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print("-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
                    secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(args.checkpoint) * 60:
                    #checkname = args.outputDir + "/" + "all-data.check.jay"
                    #print("Saving checkpoint: " + checkname)
                    #pmu.saveJayTable(fullTable, "all-data.check.jay", args.outputDir)
                    pmu.saveCsvTable(fullTable, "all-data.check.csv", args.outputDir)
                    fullTable = None
                    # re-read the checkpoint just written (the original read
                    # "all-data.check.jay" here, which this code path never writes)
                    fullTable = dt.fread(args.outputDir + "/all-data.check.csv")
                    #fullTable.to_jay(checkname)
                    #print("Saving done:" + checkname)
                    lastCheckPointTime = time.perf_counter()
    if addedData:
        pmu.printMemoryUsage("before full save")
        pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.jay'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
def analyze(fullTable, args, oldTables):
    #fullTable = fullTable[dt.f.DatenstandTag <= 387, :]
    print("Analyzing")
    pmu.printMemoryUsage("begin analyze")
    print("Keys:")
    print(fullTable.keys())
    print(list(zip(fullTable.names, fullTable.stypes)))
    daysInfullTable = dt.unique(fullTable[:, "DatenstandTag"]).to_list()[0]
    firstDumpDay = min(daysInfullTable)
    lastDumpDay = max(daysInfullTable)
    maxMeldeDay = cint(fullTable[:, "MeldeTag"].max())
    if maxMeldeDay > lastDumpDay:
        print("Future date in MeldeTag ({}), clipping to yesterday, DatenstandTag-1 = {}".format(
            maxMeldeDay, lastDumpDay - 1))
        fullTable[dt.f.MeldeTag >= lastDumpDay, "MeldeTag"] = lastDumpDay - 1
    print("firstDumpDay", firstDumpDay)
    print("lastDumpDay", lastDumpDay)
    print("maxMeldeDay", maxMeldeDay)
    fromDay = firstDumpDay
    toDay = lastDumpDay + 1
    #fromDay = lastDumpDay - 1
    if len(oldTables) > 0:
        # calculate which rows are needed for the update
        daysInOldTables = dt.unique(oldTables[0][:, "DatenstandTag"]).to_list()[0]
        newDays = sorted(list(set(daysInfullTable).difference(set(daysInOldTables))))
        print("newDays", newDays)
        if len(newDays) == 0:
            print("Nothing to update")
            exit(9)
        minNewDay = min(newDays)
        maxNewDay = max(newDays)
        minNewDay7daysAgo = minNewDay - 7
        maxNewDay7daysAgo = maxNewDay - 7
        # keep the new days plus the rows from 7 days earlier (needed for 7-day windows)
        fullTable = fullTable[((dt.f.DatenstandTag >= minNewDay) & (dt.f.DatenstandTag <= maxNewDay)) |
                              ((dt.f.DatenstandTag >= minNewDay7daysAgo) &
                               (dt.f.DatenstandTag <= maxNewDay7daysAgo)), :]
        #fullTable.materialize()
        daysInfullTable = dt.unique(fullTable[:, "DatenstandTag"]).to_list()[0]
        print("daysInfullTable", daysInfullTable)
    fullTable = fullTable[:, dt.f[:].extend({"MeldeDelay": dt.f.DatenstandTag - dt.f.MeldeTag - 1})]
    fullTable = fullTable[:, dt.f[:].extend({"RefDelay": dt.f.DatenstandTag - dt.f.RefTag - 1})]
    #fullTable.materialize()
    Altersgruppen = []
    if args.agegroups:
        Altersgruppen = dt.unique(fullTable[:, "Altersgruppe"]).to_list()[0]
    print("Altersgruppen", Altersgruppen)
    Geschlechter = []
    if args.gender:
        Geschlechter = dt.unique(fullTable[:, "Geschlecht"]).to_list()[0]
    print("Geschlechter", Geschlechter)
    census = dt.fread("CensusByRKIAgeGroups.csv")
    censusDeutschland = census[dt.f.Name == "Deutschland", :]
    print(censusDeutschland)
    flaechen = loadFlaechen()
    print("Processing 'Deutschland'")
    pmu.printMemoryUsage("begin Deutschland")
    deutschland = analyzeDailyAltersgruppenGeschlechter(
        fullTable, fromDay, toDay, True, True, Altersgruppen, Geschlechter)
    deutschland = insertDates(deutschland)
    deutschland = insertRegionInfo(deutschland, 0, "Deutschland", "BR", 0, "Deutschland", flaechen[0])
    deutschland = insertEinwohnerColumns(deutschland, censusDeutschland, Altersgruppen,
                                         Geschlechter, "Flaeche")
    print(deutschland)
    pmu.printMemoryUsage("pre makeIncidenceColumns")
    #deutschland = makeIncidenceColumns(deutschland, censusDeutschland, Altersgruppen, Geschlechter)
    #print(deutschland)
    if len(oldTables) > 0:
        deutschland = updateOldTable(oldTables[0], deutschland)
    pmu.printMemoryUsage("pre save")
    pmu.saveCsvTable(deutschland, "series-{}-{}.csv".format(0, "Deutschland"), args.outputDir)
    pmu.printMemoryUsage("post save")
    deutschland = None
    #exit(0)
    print("Processing Bundesländer")
    bundeslaender, bundeslaender_numbers = timeSeries(
        fullTable, fromDay, toDay, dt.f.IdBundesland, dt.f.Bundesland, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Bundesländer timeSeries")
    for i in range(bundeslaender.nrows):
        bl_name = bundeslaender[i, dt.f.Bundesland].to_list()[0][0]
        bl_id = bundeslaender[i, dt.f.IdBundesland].to_list()[0][0]
        if bl_id > 0:
            bundeslaender_numbers[bl_id] = insertDates(bundeslaender_numbers[bl_id])
            bundeslaender_numbers[bl_id] = insertRegionInfo(
                bundeslaender_numbers[bl_id], bl_id, bl_name, "BL", bl_id, bl_name, flaechen[0])
            censusBL = census[dt.f.IdLandkreis == bl_id, :]
            bundeslaender_numbers[bl_id] = insertEinwohnerColumns(
                bundeslaender_numbers[bl_id], censusBL, Altersgruppen, Geschlechter, "Flaeche")
            if len(oldTables) > 0:
                bundeslaender_numbers[bl_id] = updateOldTable(oldTables[bl_id],
                                                              bundeslaender_numbers[bl_id])
            print(censusBL)
            pmu.printMemoryUsage("pre save {}".format(bl_name))
            pmu.saveCsvTable(bundeslaender_numbers[bl_id],
                             "series-{}-{}.csv".format(bl_id, bl_name), args.outputDir)
    bundeslaender = None
    bundeslaender_numbers = None
    print("Processing Landkreise")
    landKreise, landkreise_numbers = timeSeries(
        fullTable, fromDay, toDay, dt.f.IdLandkreis, dt.f.Landkreis, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Landkreise timeSeries")
    #print(landKreise)
    #print(landkreise_numbers)
    for i in range(landKreise.nrows):
        print(i)
        lk_name = landKreise[i, dt.f.Landkreis].to_list()[0][0]
        lk_id = landKreise[i, dt.f.IdLandkreis].to_list()[0][0]
        if lk_name == "LK Saarpfalz-Kreis":
            lk_name = "LK Saar-Pfalz-Kreis"
        if lk_id > 0:
            censusLK = census[dt.f.IdLandkreis == lk_id, :]
            bl_name = censusLK[0, dt.f.Bundesland].to_list()[0][0]
            bl_id = censusLK[0, dt.f.IdBundesland].to_list()[0][0]
            lk_typ = landKreisTyp(lk_id, lk_name)
            landkreise_numbers[lk_id] = insertDates(landkreise_numbers[lk_id])
            landkreise_numbers[lk_id] = insertRegionInfo(
                landkreise_numbers[lk_id], lk_id, lk_name, lk_typ, bl_id, bl_name, flaechen[lk_id])
            #print(censusLK)
            landkreise_numbers[lk_id] = insertEinwohnerColumns(
                landkreise_numbers[lk_id], censusLK, Altersgruppen, Geschlechter, "Flaeche")
            if len(oldTables) > 0:
                landkreise_numbers[lk_id] = updateOldTable(oldTables[lk_id],
                                                           landkreise_numbers[lk_id])
            pmu.printMemoryUsage("pre save {}".format(lk_name))
            pmu.saveCsvTable(landkreise_numbers[lk_id],
                             "series-{}-{}.csv".format(lk_id, lk_name), args.outputDir)
    #print(landKreise)
    return fullTable
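
# A minimal sketch of how the incremental branch of analyze() decides what to
# recompute: the DatenstandTag values already present in a previously written
# series file are set-subtracted from those in the full table, leaving only
# the new days (plus, in the real code, the matching rows from 7 days earlier
# for the 7-day windows).
def _demo_new_days():
    import datatable as dt
    full = dt.Frame(DatenstandTag=[1, 1, 2, 3, 4])
    daysInFull = dt.unique(full[:, "DatenstandTag"]).to_list()[0]
    daysInOld = [1, 2]
    return sorted(set(daysInFull) - set(daysInOld))  # [3, 4]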
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(description='Create a unified data file from daily dumps')
    parser.add_argument('files', metavar='fileName', type=str, nargs='+',
                        help='NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument("--flushread",
                        help="flush full table and re-read after checkpoint for a lower memory footprint",
                        action="store_true")
    parser.add_argument("--partition",
                        help="save data in partitions instead of one file; slower, but you can see "
                             "progress and it may need less memory, but ymmv",
                        action="store_true")
    parser.add_argument("--backup", help="create backup files before overwriting",
                        action="store_true")
    parser.add_argument("--resume", help="read already unified .csv files first",
                        action="store_true")
    parser.add_argument("--unsafe",
                        help="directly overwrite output files; will corrupt the output file when "
                             "killed while writing, but uses less disk space (only applies to a "
                             "single .jay file in non-partition mode)",
                        action="store_true")
    parser.add_argument("--force", help="build new database anyway", action="store_true")
    parser.add_argument("--destructivesave",
                        help="release memory gradually while saving and reload after saving "
                             "(experimental, untested, only applies to partitioned write)",
                        action="store_true")
    #parser.add_argument("--incremental", help="only load partial data", action="store_true")
    parser.add_argument("-v", "--verbose", help="make more noise", action="store_true")
    parser.add_argument("--partitionsize", type=int, help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit", type=int, help="maximum memory limit for a database file")
    parser.add_argument("--checkpoint", type=int,
                        help="write checkpoint after amount of minutes elapsed", default=60)
    parser.add_argument("--nthreads", type=int,
                        help="number of concurrent threads used by python dataframes, "
                             "0 = as many as cores, 1 = single-threaded, -3 = 3 threads less than cores",
                        default=0)
    args = parser.parse_args()
    print(args)
    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)
    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")
    partitioned = False
    if not args.force:
        if os.path.isfile(jayPath):
            print("Loading " + jayPath)
            fullTable = dt.fread(jayPath, tempdir=args.tempDir,
                                 memory_limit=args.memorylimit, verbose=args.verbose)
        elif len(pmu.getJayTablePartitions(jayPath)) > 0:
            fullTable = pmu.loadJayTablePartioned(jayPath, tempdir=args.tempDir,
                                                  memory_limit=args.memorylimit,
                                                  verbose=args.verbose)
            if fullTable is None:
                print("The file {} is not a valid jay file, please remove it and retry".format(jayPath))
                exit(1)
            partitioned = True
    daysIncluded = []
    addedData = False
    version = 1
    lastversion = 0
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if fullTable is not None and version != lastversion:
                pmu.printMemoryUsage("after load")
                daysIncluded = sorted(fullTable[:, [dt.first(dt.f.DatenstandTag)],
                                                dt.by(dt.f.DatenstandTag)].to_list()[0])
                print("Days in full table:")
                print(daysIncluded)
                pmu.printMemoryUsage("after first query")
                lastversion = version
            if isNewData(f, daysIncluded):
                pmu.printMemoryUsage("after isNewData query")
                fstart = time.perf_counter()
                unifiedTable = None
                if args.resume:
                    unifiedTable = load(f, args.outputDir)
                addedData = True
                version = version + 1
                if unifiedTable is None:
                    t = tableData(f)
                    pmu.printMemoryUsage("after tabledata query")
                    print("Unifying " + f)
                    unifiedTable = unify(t)
                    pmu.printMemoryUsage("after hashing")
                    save(unifiedTable, f, args.outputDir)
                    pmu.printMemoryUsage("after unifiedTable save")
                if fullTable is None:
                    fullTable = unifiedTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, unifiedTable.names)
                    #print("unifiedTable.names", unifiedTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(unifiedTable)  # memory gets used here
                    #print("fullTable.names", fullTable.names)
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("unifiedTable rows = {}".format(unifiedTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print("-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
                    secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    if args.partition:
                        pmu.saveJayTablePartioned(fullTable, jayFile, args.outputDir,
                                                  args.partitionsize, True, args.destructivesave)
                        if args.flushread or args.destructivesave:
                            print("Re-reading checkpoint @ {}".format(datetime.now()))
                            fullTable = None
                            fullTable = pmu.loadJayTablePartioned(jayPath, tempdir=args.tempDir,
                                                                  memory_limit=args.memorylimit,
                                                                  verbose=args.verbose)
                    else:
                        pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir,
                                         args.backup, args.unsafe)
                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))
    if addedData or (args.partition != partitioned):
        pmu.printMemoryUsage("before full save")
        if args.partition:
            pmu.saveJayTablePartioned(fullTable, "all-data.jay", args.outputDir,
                                      args.partitionsize, True, args.destructivesave)
        else:
            pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir, args.backup, args.unsafe)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving.")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("Finished in {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
def unify(table, makeFallGruppe=False):
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)
    if 'FID' in table.names:
        table.names = {"FID": "ObjectId"}
    dsdy = cd.dayFromDate(ds)
    dsisodate = cd.dateStrYMDFromDay(dsdy)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if "DatenstandISO" not in table.names:
        t = t[:, dt.f[:].extend({"DatenstandISO": dsisodate})]
    if not hasRefdatum:
        t = t[:, dt.f[:].extend({"Refdatum": 0})]
    hasRefdatumISO = "RefdatumISO" in table.names
    if not hasRefdatumISO:
        #print("t1", t.names)
        t = t[:, dt.f[:].extend({"RefdatumISO": ""})]
        #print("t2", t.names)
    hasMeldedatumISO = "MeldedatumISO" in table.names
    if not hasMeldedatumISO:
        t = t[:, dt.f[:].extend({"MeldedatumISO": ""})]
    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]
    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]
    if makeFallGruppe:
        t = t[:, dt.f[:].extend({"FallGruppe": "", "MeldeTag": nan, "RefTag": nan,
                                 "DatenstandTag": dsdy})]
    else:
        t = t[:, dt.f[:].extend({"MeldeTag": nan, "RefTag": nan, "DatenstandTag": dsdy})]
    pmu.printMemoryUsage("unify pre dict")
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")
    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
        d["Meldedatum"][r] = ticksFromDateTime(md)
        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = ticksFromDateTime(md)
            d["RefTag"][r] = mdy
        if not hasMeldedatumISO:
            d["MeldedatumISO"][r] = cd.dateStrYMDFromDay(mdy)
        if makeFallGruppe:
            fg = str(d["IdLandkreis"][r]) + d["Altersgruppe"][r] + d["Geschlecht"][r] + \
                str(int(d["MeldeTag"][r]))
        #if int(d["IstErkrankungsbeginn"][r]) == 1:
        rds = d["Refdatum"][r]
        if pmu.is_int(rds):
            rd = cd.datetimeFromStampStr(rds)
        else:
            rd = datetimeFromDateStr3(rds)
        d["Refdatum"][r] = ticksFromDateTime(rd)
        rdy = cd.dayFromDate(rd)
        d["RefTag"][r] = rdy
        if not hasRefdatumISO:
            d["RefdatumISO"][r] = cd.dateStrYMDFromDay(rdy)
        if makeFallGruppe:
            fg = fg + ":" + str(rdy)
            d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)
    finish = time.perf_counter()
    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))
    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t
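
# A minimal sketch of the to_dict()/Frame() round trip that both unify()
# variants use for their per-row pass: columns come out as plain Python lists,
# are mutated in place, and are reassembled into a new Frame. The date parsing
# of the real code is replaced by a trivial stand-in.
def _demo_row_pass():
    import datatable as dt
    t = dt.Frame(Meldedatum=["2020/03/01", "2020/03/02"], MeldeTag=[0, 0])
    d = t.to_dict()
    for r in range(t.nrows):
        d["MeldeTag"][r] = r + 100  # stand-in for the real date-to-day conversion
    return dt.Frame(d)[:, "MeldeTag"].to_list()[0]  # [100, 101]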