Code Example #1
def timeSeries(fullTable, fromDay, toDay, byCriteria, nameColumn,
               Altersgruppen, Geschlechter):
    regions = fullTable[:, [dt.first(nameColumn)], dt.by(byCriteria)]
    #regions = regions[:5,:]
    print("Creating time series for regions:")
    print(regions)
    dailysByCriteria = {}
    start = time.perf_counter()
    for i, lk in enumerate(regions[:, byCriteria].to_list()[0]):
        print("Processing Region '{}'".format(regions[i, nameColumn][0, 0]))
        start_region = time.perf_counter()

        pmu.printMemoryUsage("pre analyzeDailyAltersgruppenGeschlechter")
        dailysByCriteria[lk] = analyzeDailyAltersgruppenGeschlechter(
            fullTable,
            filterByDayAndCriteria(fromDay, toDay, (byCriteria == lk)),
            Altersgruppen, Geschlechter)
        finish = time.perf_counter()
        duration = finish - start
        print(
            "Region took {:.2f} seconds, elapsed {:.2f} minutes, time to completion: {:.2f} minutes"
            .format(finish - start_region, duration / 60,
                    duration / (i + 1) * (regions.nrows - (i + 1)) / 60))

        pmu.printMemoryUsage("post analyzeDailyAltersgruppenGeschlechter")
        print("Done {} of {}, key = {} name = {}".format(
            i + 1, regions.nrows, lk, regions[i, nameColumn][0, 0]))
        #if lk >= 0:
        #    break
    return regions, dailysByCriteria
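
A minimal sketch of the grouping idiom timeSeries() uses to enumerate regions, with an invented toy frame (only the datatable package is assumed; the values are illustrative, not from the project):

import datatable as dt

# Toy stand-in for fullTable: several rows per region.
toy = dt.Frame(IdLandkreis=[1001, 1001, 1002, 1003],
               Landkreis=["SK Flensburg", "SK Flensburg", "SK Kiel", "SK Luebeck"])

# One row per region: group by the id, keep the first name seen in each group.
regions = toy[:, [dt.first(dt.f.Landkreis)], dt.by(dt.f.IdLandkreis)]

# Iterate the group keys the same way timeSeries() does.
for i, lk in enumerate(regions[:, dt.f.IdLandkreis].to_list()[0]):
    print(i, lk, regions[i, "Landkreis"])
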
Code Example #2
File: database.py Project: kathsel/Corona
def main():
    dt.options.progress.enabled = False

    parser = argparse.ArgumentParser(description='Fast queries on all data')
    parser.add_argument('file', metavar='fileName', type=str, nargs='?',
                        help='Full unified NPGEO COVID19 Germany data as .csv or .jay file',
                        default="archive_v2/all-data.jay")
    parser.add_argument('-d', '--output-dir', dest='outputDir', default="series")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument('-i', '--incremental-update-dir', dest='incrementalUpdateDir', default="")
    parser.add_argument("--agegroups", help="also create columns for all seperate age groups", action="store_true")
    parser.add_argument("--gender", help="also create columns for all seperate gender groups", action="store_true")
    parser.add_argument("-v", "--verbose", help="make more noise", action="store_true")
    parser.add_argument("--memorylimit", type=int, help="number of records per partition")

    args = parser.parse_args()
    #print(args)
    partitioned = False
    pmu.printMemoryUsage("after start")
    if len(pmu.getJayTablePartitions(args.file)) > 0:
        fullTable = pmu.loadJayTablePartioned(args.file, tempDir=args.tempDir, memoryLimit=args.memorylimit, verbose=args.verbose)
        partitioned = True
    elif os.path.isfile(args.file):
        print("Loading " + args.file)
        fullTable = dt.fread(args.file, tempdir=args.tempDir, memory_limit=args.memorylimit, verbose=args.verbose)
    else:
        print("File not found: " + args.file)
        exit(1)

    print("Done loading table from '{}', rows: {} cols: {}".format(args.file, fullTable.nrows, fullTable.ncols))
    pmu.printMemoryUsage("after load")

    oldTables = {}
    if args.incrementalUpdateDir != "":
        updateFiles = sorted(glob.glob(args.incrementalUpdateDir+"/*.csv"))
        for f in updateFiles:
            if not f.endswith("--nicht erhoben-.csv"):
                table = dt.fread(f)
                print("Load {}".format(f))
                lkID = table[0,"IdLandkreis"]#.to_list()[0][0]
                print("lkID {}".format(lkID))
                oldTables[lkID] = table

    analyze(fullTable, args, oldTables)
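
A note on the lkID lookup above: in datatable, selecting a single cell with frame[row, "column"] already returns a plain Python value, which is why the trailing .to_list()[0][0] stayed commented out. A tiny sketch (toy data, only datatable assumed):

import datatable as dt

table = dt.Frame(IdLandkreis=[9162, 9162], AnzahlFall=[3, 5])
lkID = table[0, "IdLandkreis"]  # plain int 9162, not a 1x1 Frame
oldTables = {lkID: table}       # usable directly as a dict key
print(lkID, type(lkID))
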
Code Example #3
def main():
    parser = argparse.ArgumentParser(description='Fast queries on all data')
    parser.add_argument(
        'file',
        metavar='fileName',
        type=str,
        nargs='?',
        help='Full unified NPGEO COVID19 Germany data as .csv or .jay file',
        default="archive_v2/all-data.jay")
    parser.add_argument('-d',
                        '--output-dir',
                        dest='outputDir',
                        default="series")
    parser.add_argument("--agegroups",
                        help="also create columns for all seperate age groups",
                        action="store_true")

    args = parser.parse_args()
    #print(args)
    pmu.printMemoryUsage("after start")
    print("Loading " + args.file)
    fullTable = dt.fread(args.file)
    print("Loading done loading table from ‘{}‘, rows: {} cols: {}".format(
        args.file, fullTable.nrows, fullTable.ncols))
    pmu.printMemoryUsage("after load")

    if False:
        print("Materializing fullTable")
        fullTable.materialize(to_memory=True)
        pmu.printMemoryUsage("after materialize")

    analyze(fullTable, args)
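
The disabled block above relies on Frame.materialize(to_memory=True), which pulls a frame's data fully into RAM instead of leaving it memory-mapped from the .jay file. A quick sketch of the call (the inline CSV is invented for illustration):

import datatable as dt

df = dt.fread("A,B\n1,2\n3,4\n")  # fread also accepts CSV text directly
df.materialize(to_memory=True)    # copy all column data into main memory
print(df)
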
Code Example #4
File: database.py Project: ralfr/Corona
def main():
    dt.options.progress.enabled = False

    parser = argparse.ArgumentParser(description='Fast queries on all data')
    parser.add_argument(
        'file',
        metavar='fileName',
        type=str,
        nargs='?',
        help='Full unified NPGEO COVID19 Germany data as .csv or .jay file',
        default="archive_v2/all-data.jay")
    parser.add_argument('-d',
                        '--output-dir',
                        dest='outputDir',
                        default="series")
    parser.add_argument('-i',
                        '--incremental-update-dir',
                        dest='incrementalUpdateDir',
                        default="")
    parser.add_argument("--agegroups",
                        help="also create columns for all seperate age groups",
                        action="store_true")
    parser.add_argument(
        "--gender",
        help="also create columns for all seperate gender groups",
        action="store_true")

    args = parser.parse_args()
    #print(args)
    pmu.printMemoryUsage("after start")
    print("Loading " + args.file)
    fullTable = dt.fread(args.file)
    print("Loading done loading table from ‘{}‘, rows: {} cols: {}".format(
        args.file, fullTable.nrows, fullTable.ncols))
    pmu.printMemoryUsage("after load")

    oldTables = {}
    if args.incrementalUpdateDir != "":
        updateFiles = sorted(glob.glob(args.incrementalUpdateDir + "/*.csv"))
        for f in updateFiles:
            if not f.endswith("--nicht erhoben-.csv"):
                table = dt.fread(f)
                print("Load {}".format(f))
                lkID = table[0, "IdLandkreis"]  #.to_list()[0][0]
                print("lkID {}".format(lkID))
                oldTables[lkID] = table

    if False:
        print("Materializing fullTable")
        fullTable.materialize(to_memory=True)
        pmu.printMemoryUsage("after materialize")

    analyze(fullTable, args, oldTables)
Code Example #5
def isNewData(dataFilename, daysIncluded):
    pmu.printMemoryUsage("begin of isNewData")
    peekTable = dt.fread(dataFilename, max_nrows=1)
    print("Checking " + dataFilename)
    ##print(peekTable)
    ##datenStand = peekTable[0, dt.f.DatenstandISO]
    dss = peekTable[0, "Datenstand"]
    print("Datenstand", dss)
    ds = cd.datetimeFromDatenstandAny(dss)
    dsdy = cd.dayFromDate(ds)
    pmu.printMemoryUsage("isNewData")
    isNew = dsdy not in daysIncluded
    if isNew:
        print("contains new day {}".format(dsdy))
    else:
        print("contains day {} already in full table".format(dsdy))
    pmu.printMemoryUsage("end of isNewData")

    return isNew
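
isNewData() avoids loading each daily dump in full: fread's max_nrows parameter stops after the first data row, which is enough to read the dump's "Datenstand" stamp. A self-contained sketch (the inline CSV is invented for illustration):

import datatable as dt

csv_text = "Datenstand,AnzahlFall\n28.03.2020 00:00,3\n29.03.2020 00:00,5\n"
peek = dt.fread(csv_text, max_nrows=1)     # parses only the first row
print(peek.nrows, peek[0, "Datenstand"])   # -> 1 28.03.2020 00:00
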
Code Example #6
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files',
                        metavar='fileName',
                        type=str,
                        nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument(
        "--flushread",
        help=
        "flush full table an re-read after checkpoint lower memory footprint",
        action="store_true")
    parser.add_argument("--force",
                        help="build new database anyway",
                        action="store_true")
    parser.add_argument(
        "--destructivesave",
        help="release memory gradually while saving and reload after saving",
        action="store_true")
    parser.add_argument("-v",
                        "--verbose",
                        help="make more noise",
                        action="store_true")
    parser.add_argument("--partitionsize",
                        type=int,
                        help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit",
                        type=int,
                        help="maximum memory limit for a database file")
    parser.add_argument(
        "--checkpoint",
        type=int,
        help="write checkpoint after amount of minutes elapsed",
        default=10)
    parser.add_argument(
        "--nthreads",
        type=int,
        help=
        "number of concurrent threads used by python dataframes, 0 = as many as cores, 1 single-thread, -3 = 3 threads less than cores",
        default=0)

    args = parser.parse_args()
    print(args)
    # print("args.inMemory",args.inMemory)
    # print("args.materializeNew",args.materializeNew)
    # print("args.noMaterialize",args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")

    partitioned = False
    daysIncluded = []
    if len(pmu.getJayTablePartitions(jayPath)) > 0:
        fullTable = pmu.loadJayTablePartioned(jayPath,
                                              tempDir=args.tempDir,
                                              memoryLimit=args.memorylimit,
                                              verbose=args.verbose)
        if fullTable is None:
            print(
                "The file {} is not a valid jay file, please remove it and retry"
                .format(jayPath))
            exit(1)
        partitioned = True
    elif os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)

    if fullTable is not None:
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(
            fullTable[:, [dt.first(dt.f.DatenstandTag)],
                      dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")

    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")

                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print(
                    "-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".
                    format(secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(
                        args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    pmu.saveJayTablePartioned(fullTable, jayFile,
                                              args.outputDir,
                                              args.partitionsize, True,
                                              args.destructivesave)
                    if args.flushread or args.destructivesave:
                        print("Re-reading checkpoint @ {}".format(
                            datetime.now()))
                        fullTable = None
                        fullTable = pmu.loadJayTablePartioned(
                            jayPath,
                            tempDir=args.tempDir,
                            memoryLimit=args.memorylimit,
                            verbose=args.verbose)
                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))

    if addedData or not partitioned:
        pmu.printMemoryUsage("before full save")
        #pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.saveJayTablePartioned(fullTable, "all-data.jay", args.outputDir,
                                  args.partitionsize, True,
                                  args.destructivesave)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.jay'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
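
The checkpoint logic in main() is a plain wall-clock pattern: remember when the last checkpoint was written and compare against an elapsed-minutes budget on every iteration. Stripped of the datatable details, the skeleton looks like this (names invented; the save step is a placeholder):

import time

CHECKPOINT_MINUTES = 10  # stand-in for args.checkpoint
lastCheckPointTime = time.perf_counter()

for item in range(100):  # stand-in for the per-file loop
    # ... process one input file ...
    if time.perf_counter() - lastCheckPointTime > CHECKPOINT_MINUTES * 60:
        # ... write fullTable to disk here ...
        lastCheckPointTime = time.perf_counter()  # restart the budget
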
Code Example #7
def unify(table):
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)

    dsdy = cd.dayFromDate(ds)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if not "DatenstandISO" in table.names:
        isodate = cd.dateStrYMDFromDay(dsdy)
        t = t[:, dt.f[:].extend({"DatenstandISO": isodate})]
    if not hasRefdatum:
        t = t[:, dt.f[:].extend({
            "Refdatum": str(cd.day0d),
            "RefdatumISO": dt.f.MeldedatumISO
        })]
    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]

    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]

    t = t[:, dt.f[:].extend({
        "FallGruppe": "",
        "MeldeTag": nan,
        "RefTag": nan,
        "DatenstandTag": dsdy
    })]

    #t = t[:, dt.f[:].extend({"Bevoelkerung":0, "FaellePro100k":0.0, "TodesfaellePro100k":0.0, "isStadt":False})]
    #t = t[:, dt.f[:].extend({"Flaeche":0.0, "FaelleProKm2":0.0, "TodesfaelleProKm2":0.0, "Dichte":0.0})]

    #print("unified fields", t.names)

    #Bevoelkerung = loadLandkreisBeveolkerung()
    #Flaeche = loadLandkreisFlaeche()
    #Census = loadCensus()

    #pmu.printMemoryUsage("unify pre realize ")
    #t.materialize(to_memory=True)

    pmu.printMemoryUsage("unify pre dict")
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")

    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = str(md)
            d["RefTag"][r] = mdy

        fg = str(d["IdLandkreis"]
                 [r]) + d["Altersgruppe"][r] + d["Geschlecht"][r] + str(
                     int(d["MeldeTag"][r]))

        if int(d["IstErkrankungsbeginn"][r]) == 1:
            rds = d["Refdatum"][r]
            if pmu.is_int(rds):
                rd = cd.datetimeFromStampStr(rds)
            else:
                rd = datetimeFromDateStr3(rds)
            rdy = cd.dayFromDate(rd)
            d["RefTag"][r] = rdy
            fg = fg + ":" + str(rdy)
        d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)

    finish = time.perf_counter()

    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))

    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t
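
unify() does its per-row work through a to_dict()/dt.Frame() round trip: the columns are pulled out as plain Python lists, mutated in a loop, and reassembled into a new Frame. A minimal sketch of that round trip (toy column, invented values):

import datatable as dt

t = dt.Frame(Meldedatum=["2020/03/28", "2020/03/29"], MeldeTag=[0.0, 0.0])
d = t.to_dict()                   # {'Meldedatum': [...], 'MeldeTag': [...]}
for r in range(t.nrows):
    d["MeldeTag"][r] = r + 100.0  # stand-in for cd.dayFromDate(...)
t = dt.Frame(d)                   # rebuild the frame from the mutated dict
print(t)
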
Code Example #8
def analyze(fullTable, args):
    #fullTable = fullTable[dt.f.DatenstandTag > 382 - 20,:]
    print("Analyzing")
    pmu.printMemoryUsage("begin analyze")
    print("Keys:")
    print(fullTable.keys())
    firstDumpDay = cint(fullTable[:, "DatenstandTag"].min())
    lastDumpDay = cint(fullTable[:, "DatenstandTag"].max())
    print("firstDumpDay", firstDumpDay)
    print("lastDumpDay", lastDumpDay)

    #fromDay = lastDumpDay-27
    fromDay = firstDumpDay
    toDay = lastDumpDay + 1

    fullTable = fullTable[:, dt.f[:].extend(
        {"MeldeDelay": dt.f.DatenstandTag - dt.f.MeldeTag - 1})]
    fullTable = fullTable[:, dt.f[:].extend(
        {"RefDelay": dt.f.DatenstandTag - dt.f.RefTag - 1})]
    fullTable.materialize()

    Altersgruppen = []
    if args.agegroups:
        Altersgruppen = dt.unique(fullTable[:, "Altersgruppe"]).to_list()[0]

    print("Altersgruppen", Altersgruppen)

    Geschlechter = dt.unique(fullTable[:, "Geschlecht"]).to_list()[0]
    print("Geschlechter", Geschlechter)

    census = dt.fread("CensusByRKIAgeGroups.csv")
    censusDeutschland = census[dt.f.Name == "Deutschland", :]
    print(censusDeutschland)

    flaechen = loadFlaechen()
    #for id in range(1,16):
    #    censusBL = census[dt.f.Code == id, :]
    #    print(censusBL)

    print("Processing 'Deutschland'")
    pmu.printMemoryUsage("begin Deutschland")
    deutschland = analyzeDailyAltersgruppenGeschlechter(
        fullTable, filterByDay(fromDay, toDay), Altersgruppen, Geschlechter)
    deutschland = insertDates(deutschland)
    deutschland = insertRegionInfo(deutschland, 0, "Deutschland", "BR", 0,
                                   "Deutschland", flaechen[0])

    print(deutschland)
    pmu.printMemoryUsage("pre makeIncidenceColumns")

    deutschland = makeIncidenceColumns(deutschland, censusDeutschland,
                                       Altersgruppen, Geschlechter)
    print(deutschland)
    pmu.printMemoryUsage("pre save")
    pmu.saveCsvTable(deutschland, "series-{}-{}.csv".format(0, "Deutschland"),
                     args.outputDir)
    pmu.printMemoryUsage("post save")
    deutschland = None

    #exit(0)

    print("Processing Bundesländer")
    bundeslaender, bundeslaender_numbers = timeSeries(fullTable, fromDay,
                                                      toDay, dt.f.IdBundesland,
                                                      dt.f.Bundesland,
                                                      Altersgruppen,
                                                      Geschlechter)
    pmu.printMemoryUsage("post Bundesländer timeSeries")
    for i in range(bundeslaender.nrows):
        bl_name = bundeslaender[i, dt.f.Bundesland].to_list()[0][0]
        bl_id = bundeslaender[i, dt.f.IdBundesland].to_list()[0][0]

        if bl_id > 0:
            #bundeslaender_numbers[bl_id] = bundeslaender_numbers[bl_id][:, dt.f[:].extend(
            #    {"IdLandkreis": bl_id, "Landkreis": bl_name, "IdBundesland": bl_id, "Bundesland": bl_name, "Flaeche" : flaechen[bl_id]})]
            bundeslaender_numbers[bl_id] = insertDates(
                bundeslaender_numbers[bl_id])
            bundeslaender_numbers[bl_id] = insertRegionInfo(
                bundeslaender_numbers[bl_id], bl_id, bl_name, "BL", bl_id,
                bl_name, flaechen[0])
            censusBL = census[dt.f.IdLandkreis == bl_id, :]
            print(censusBL)
            bundeslaender_numbers[bl_id] = makeIncidenceColumns(
                bundeslaender_numbers[bl_id], censusBL, Altersgruppen,
                Geschlechter)
        pmu.printMemoryUsage("pre save {}".format(bl_name))

        pmu.saveCsvTable(bundeslaender_numbers[bl_id],
                         "series-{}-{}.csv".format(bl_id,
                                                   bl_name), args.outputDir)
    bundeslaender = None
    bundeslaender_numbers = None

    print("Processing Landkreise'")
    landKreise, landkreise_numbers = timeSeries(fullTable, fromDay, toDay,
                                                dt.f.IdLandkreis,
                                                dt.f.Landkreis, Altersgruppen,
                                                Geschlechter)
    pmu.printMemoryUsage("post Landkreise timeSeries")
    #print(landKreise)
    #print(landkreise_numbers)
    for i in range(landKreise.nrows):
        print(i)
        lk_name = landKreise[i, dt.f.Landkreis].to_list()[0][0]
        lk_id = landKreise[i, dt.f.IdLandkreis].to_list()[0][0]
        if lk_id > 0:
            censusLK = census[dt.f.IdLandkreis == lk_id, :]
            bl_name = censusLK[0, dt.f.Bundesland].to_list()[0][0]
            bl_id = censusLK[0, dt.f.IdBundesland].to_list()[0][0]
            lk_typ = landKreisTyp(lk_id, lk_name)

            landkreise_numbers[lk_id] = insertDates(landkreise_numbers[lk_id])
            landkreise_numbers[lk_id] = insertRegionInfo(
                landkreise_numbers[lk_id], lk_id, lk_name, lk_typ, bl_id,
                bl_name, flaechen[lk_id])
            #landkreise_numbers[lk_id] = landkreise_numbers[lk_id][:, dt.f[:].extend(
            #   {"IdLandkreis": lk_id, "Landkreis": lk_name, "IdBundesland": bl_id, "Bundesland": bl_name,
            #    "Flaeche": flaechen[lk_id]})]
            print(censusLK)
            landkreise_numbers[lk_id] = makeIncidenceColumns(
                landkreise_numbers[lk_id], censusLK, Altersgruppen,
                Geschlechter)
        pmu.printMemoryUsage("pre save {}".format(lk_name))
        pmu.saveCsvTable(landkreise_numbers[lk_id],
                         "series-{}-{}.csv".format(lk_id,
                                                   lk_name), args.outputDir)
    #print(landKreise)

    return fullTable
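
The MeldeDelay/RefDelay columns above are appended with the dt.f[:].extend(...) idiom: dt.f[:] selects all existing columns and extend() appends computed ones, so nothing is lost. A compact sketch (toy values):

import datatable as dt

t = dt.Frame(DatenstandTag=[10, 11], MeldeTag=[8, 8])
t = t[:, dt.f[:].extend({"MeldeDelay": dt.f.DatenstandTag - dt.f.MeldeTag - 1})]
print(t)  # the MeldeDelay column holds 1 and 2
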
Code Example #9
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files',
                        metavar='fileName',
                        type=str,
                        nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    #parser.add_argument("--flushmemfull", help="flush full table to disk for lower memory footprint",
    #                    action="store_true")
    parser.add_argument(
        "--materializeNew",
        help="materialize new table to disk for lower memory footprint",
        action="store_true")
    parser.add_argument(
        "--noMaterialize",
        help=
        "run with higher memory footprint, or much higher memory footprint with --in-memory",
        action="store_true")
    parser.add_argument("--inMemory",
                        help="run faster but with higher memory footprint",
                        action="store_true")
    parser.add_argument(
        "--checkpoint",
        help="write checkpoint after amount of minutes elapsed",
        default=10)
    parser.add_argument(
        "--nthreads",
        help=
        "number of concurrent threads used by python dataframes, 0 = as many as cores, 1 single-thread, -3 = 3 threads less than cores",
        default=0)

    args = parser.parse_args()
    print(args)
    print("args.inMemory", args.inMemory)
    print("args.materializeNew", args.materializeNew)
    print("args.noMaterialize", args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayPath = args.outputDir + "/all-data.jay"
    print(jayPath)
    pmu.printMemoryUsage("after start")

    daysIncluded = []
    if os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(
            fullTable[:, [dt.first(dt.f.DatenstandTag)],
                      dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")

    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")

                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("after checkColumns")
                    if not args.noMaterialize:
                        fullTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize fullTable")
                    if args.materializeNew:
                        newTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize newTable")

                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print(
                    "-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".
                    format(secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(
                        args.checkpoint) * 60:
                    pmu.saveCsvTable(fullTable, "all-data.check.csv",
                                     args.outputDir)
                    fullTable = None  # release the in-memory table first
                    # re-read the checkpoint that was just written (.csv, not .jay)
                    fullTable = dt.fread(args.outputDir +
                                         "/all-data.check.csv")
                    lastCheckPointTime = time.perf_counter()

    if addedData:
        pmu.printMemoryUsage("before full save")
        pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.ja'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
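
Both unify drivers grow the database with Frame.rbind(), which appends rows in place (this is the "memory gets used here" step). A tiny sketch:

import datatable as dt

fullTable = dt.Frame(A=[1, 2], B=["x", "y"])
newTable = dt.Frame(A=[3], B=["z"])
fullTable.rbind(newTable)  # in-place append; no new frame is returned
print(fullTable.nrows)     # -> 3
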
Code Example #10
File: database.py Project: kathsel/Corona
def analyze(fullTable, args, oldTables):
    #fullTable = fullTable[dt.f.DatenstandTag <= 387,:]

    print("Analyzing")
    pmu.printMemoryUsage("begin analyze")
    print("Keys:")
    print(fullTable.keys())
    print(list(zip(fullTable.names, fullTable.stypes)))

    daysInfullTable = dt.unique(fullTable[:, "DatenstandTag"]).to_list()[0]
    firstDumpDay = min(daysInfullTable)
    lastDumpDay = max(daysInfullTable)
    maxMeldeDay = cint(fullTable[:, "MeldeTag"].max())
    if maxMeldeDay > lastDumpDay:
        print("Future date in MeldeTag ({}), clipping to yesterday, DatenstandTag-1 = {}".format(maxMeldeDay, lastDumpDay - 1))
        # the row filter must be an f-expression; comparing the bare string "MeldeTag" to an int raises a TypeError
        fullTable[dt.f.MeldeTag >= lastDumpDay, "MeldeTag"] = lastDumpDay - 1

    print("firstDumpDay", firstDumpDay)
    print("lastDumpDay", lastDumpDay)
    print("maxMeldeDay", maxMeldeDay)

    fromDay = firstDumpDay
    toDay = lastDumpDay + 1
    #fromDay = lastDumpDay-1
    if len(oldTables) > 0:

        # calculate which rows are needed for the update
        daysInOldTables = dt.unique(oldTables[0][:, "DatenstandTag"]).to_list()[0]
        newDays = sorted(list(set(daysInfullTable).difference(set(daysInOldTables))))
        print("newDays",newDays)
        if len(newDays) == 0:
            print("Nothing to update")
            exit(9)
        minNewDay = min(newDays)
        maxNewDay = max(newDays)
        minNewDay7daysAgo = minNewDay - 7
        maxNewDay7daysAgo = maxNewDay - 7

        fullTable = fullTable[((dt.f.DatenstandTag >= minNewDay) & (dt.f.DatenstandTag <= maxNewDay)) |
                                ((dt.f.DatenstandTag >= minNewDay7daysAgo) & (dt.f.DatenstandTag <= maxNewDay7daysAgo)),:]
        #fullTable.materialize()
        daysInfullTable = dt.unique(fullTable[:, "DatenstandTag"]).to_list()[0]
        print("daysInfullTable",daysInfullTable)

    fullTable = fullTable[:, dt.f[:].extend({"MeldeDelay": dt.f.DatenstandTag-dt.f.MeldeTag-1})]
    fullTable = fullTable[:, dt.f[:].extend({"RefDelay": dt.f.DatenstandTag-dt.f.RefTag-1})]
    #fullTable.materialize()

    Altersgruppen = []
    if args.agegroups:
        Altersgruppen = dt.unique(fullTable[:,"Altersgruppe"]).to_list()[0]

    print("Altersgruppen", Altersgruppen)

    Geschlechter = []
    if args.gender:
        Geschlechter = dt.unique(fullTable[:,"Geschlecht"]).to_list()[0]
    print("Geschlechter", Geschlechter)

    census = dt.fread("CensusByRKIAgeGroups.csv")
    censusDeutschland = census[dt.f.Name == "Deutschland",:]
    print(censusDeutschland)

    flaechen = loadFlaechen()

    print("Processing 'Deutschland'")
    pmu.printMemoryUsage("begin Deutschland")

    deutschland = analyzeDailyAltersgruppenGeschlechter(fullTable, fromDay, toDay, True, True, Altersgruppen, Geschlechter)
    deutschland = insertDates(deutschland)
    deutschland = insertRegionInfo(deutschland, 0, "Deutschland", "BR", 0, "Deutschland", flaechen[0])
    deutschland = insertEinwohnerColumns(deutschland, censusDeutschland, Altersgruppen, Geschlechter, "Flaeche")

    print(deutschland)
    pmu.printMemoryUsage("pre makeIncidenceColumns")

    #deutschland = makeIncidenceColumns(deutschland, censusDeutschland, Altersgruppen, Geschlechter)
    #print(deutschland)
    if len(oldTables) > 0: deutschland = updateOldTable(oldTables[0], deutschland)
    pmu.printMemoryUsage("pre save")
    pmu.saveCsvTable(deutschland, "series-{}-{}.csv".format(0, "Deutschland"), args.outputDir)
    pmu.printMemoryUsage("post save")
    deutschland = None

    #exit(0)

    print("Processing Bundesländer")
    bundeslaender, bundeslaender_numbers = timeSeries(fullTable, fromDay, toDay, dt.f.IdBundesland, dt.f.Bundesland, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Bundesländer timeSeries")
    for i in range(bundeslaender.nrows):
        bl_name = bundeslaender[i, dt.f.Bundesland].to_list()[0][0]
        bl_id = bundeslaender[i, dt.f.IdBundesland].to_list()[0][0]

        if bl_id > 0:
            bundeslaender_numbers[bl_id] = insertDates(bundeslaender_numbers[bl_id])
            bundeslaender_numbers[bl_id] = insertRegionInfo(bundeslaender_numbers[bl_id], bl_id, bl_name, "BL", bl_id, bl_name, flaechen[0])
            censusBL = census[dt.f.IdLandkreis == bl_id, :]
            bundeslaender_numbers[bl_id] = insertEinwohnerColumns(bundeslaender_numbers[bl_id], censusBL, Altersgruppen, Geschlechter, "Flaeche")
            if len(oldTables) > 0:
                bundeslaender_numbers[bl_id] = updateOldTable(oldTables[bl_id], bundeslaender_numbers[bl_id])
            print(censusBL)

        pmu.printMemoryUsage("pre save {}".format(bl_name))
        pmu.saveCsvTable(bundeslaender_numbers[bl_id], "series-{}-{}.csv".format(bl_id, bl_name), args.outputDir)
    bundeslaender = None
    bundeslaender_numbers = None

    print("Processing Landkreise'")
    landKreise, landkreise_numbers = timeSeries(fullTable, fromDay, toDay, dt.f.IdLandkreis, dt.f.Landkreis, Altersgruppen, Geschlechter)
    pmu.printMemoryUsage("post Landkreise timeSeries")
    #print(landKreise)
    #print(landkreise_numbers)
    for i in range(landKreise.nrows):
        print(i)
        lk_name = landKreise[i, dt.f.Landkreis].to_list()[0][0]
        lk_id = landKreise[i, dt.f.IdLandkreis].to_list()[0][0]
        if lk_name == "LK Saarpfalz-Kreis":
            lk_name = "LK Saar-Pfalz-Kreis"

        if lk_id > 0:
            censusLK = census[dt.f.IdLandkreis == lk_id, :]
            bl_name = censusLK[0, dt.f.Bundesland].to_list()[0][0]
            bl_id = censusLK[0, dt.f.IdBundesland].to_list()[0][0]
            lk_typ = landKreisTyp(lk_id, lk_name)

            landkreise_numbers[lk_id] = insertDates(landkreise_numbers[lk_id])
            landkreise_numbers[lk_id] = insertRegionInfo(landkreise_numbers[lk_id], lk_id, lk_name, lk_typ, bl_id,
                                                             bl_name, flaechen[lk_id])
            #print(censusLK)
            landkreise_numbers[lk_id] = insertEinwohnerColumns(landkreise_numbers[lk_id], censusLK, Altersgruppen,
                                                                Geschlechter, "Flaeche")
            if len(oldTables) > 0:
                landkreise_numbers[lk_id] = updateOldTable(oldTables[lk_id], landkreise_numbers[lk_id])

        pmu.printMemoryUsage("pre save {}".format(lk_name))
        pmu.saveCsvTable(landkreise_numbers[lk_id], "series-{}-{}.csv".format(lk_id, lk_name), args.outputDir)
    #print(landKreise)

    return fullTable
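
Two datatable idioms carry the incremental update above: assigning into a filtered subset (the MeldeTag clipping) and slicing by a boolean f-expression (the day-window restriction). Both in a toy sketch:

import datatable as dt

t = dt.Frame(DatenstandTag=[5, 6, 7, 8], MeldeTag=[4, 9, 6, 8])
lastDumpDay = 8
# Clip implausible future report days, as analyze() does:
t[dt.f.MeldeTag >= lastDumpDay, "MeldeTag"] = lastDumpDay - 1
# Keep only the rows inside a day window:
t = t[(dt.f.DatenstandTag >= 6) & (dt.f.DatenstandTag <= 8), :]
print(t)
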
Code Example #11
File: unify.py Project: scharrenberg/Corona
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files',
                        metavar='fileName',
                        type=str,
                        nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument(
        "--flushread",
        help=
        "flush full table an re-read after checkpoint lower memory footprint",
        action="store_true")
    parser.add_argument(
        "--partition",
        help=
        "save data in partionions instead of one file; slower, but you can see progress and maybe need less memory, but ymmv",
        action="store_true")
    parser.add_argument("--backup",
                        help="create backup files before overwriting",
                        action="store_true")
    parser.add_argument("--resume",
                        help="read already unified .csv files first",
                        action="store_true")
    parser.add_argument(
        "--unsafe",
        help=
        "directly overwrite output files, will corrupt the output file when killed while writing, but uses less disk space (only applies to single .jay file in non-partition mode)",
        action="store_true")
    parser.add_argument("--force",
                        help="build new database anyway",
                        action="store_true")
    parser.add_argument(
        "--destructivesave",
        help=
        "release memory gradually while saving and reload after saving (experimental, untested, only applies to partiioned write)",
        action="store_true")
    #parser.add_argument("--incremental", help="only load partial data", action="store_true")
    parser.add_argument("-v",
                        "--verbose",
                        help="make more noise",
                        action="store_true")
    parser.add_argument("--partitionsize",
                        type=int,
                        help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit",
                        type=int,
                        help="maximum memory limit for a database file")
    parser.add_argument(
        "--checkpoint",
        type=int,
        help="write checkpoint after amount of minutes elapsed",
        default=60)
    parser.add_argument(
        "--nthreads",
        type=int,
        help=
        "number of concurrent threads used by python dataframes, 0 = as many as cores, 1 single-thread, -3 = 3 threads less than cores",
        default=0)

    args = parser.parse_args()
    print(args)
    # print("args.inMemory",args.inMemory)
    # print("args.materializeNew",args.materializeNew)
    # print("args.noMaterialize",args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")

    partitioned = False
    if not args.force:
        if os.path.isfile(jayPath):
            print("Loading " + jayPath)
            fullTable = dt.fread(jayPath,
                                 tempdir=args.tempDir,
                                 memory_limit=args.memorylimit,
                                 verbose=args.verbose)
        elif len(pmu.getJayTablePartitions(jayPath)) > 0:
            fullTable = pmu.loadJayTablePartioned(
                jayPath,
                tempdir=args.tempDir,
                memory_limit=args.memorylimit,
                verbose=args.verbose)
            if fullTable is None:
                print(
                    "The file {} is not a valid jay file, please remove it and retry"
                    .format(jayPath))
                exit(1)
            partitioned = True

    daysIncluded = []
    addedData = False
    version = 1
    lastversion = 0
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if fullTable is not None and version != lastversion:
                pmu.printMemoryUsage("after load")
                daysIncluded = sorted(
                    fullTable[:, [dt.first(dt.f.DatenstandTag)],
                              dt.by(dt.f.DatenstandTag)].to_list()[0])
                print("Days in full table:")
                print(daysIncluded)
                pmu.printMemoryUsage("after first query")
                lastversion = version

            if isNewData(f, daysIncluded):
                pmu.printMemoryUsage("after isNewData query")
                fstart = time.perf_counter()
                unifiedTable = None
                if args.resume:
                    unifiedTable = load(f, args.outputDir)

                addedData = True
                version = version + 1
                if unifiedTable is None:
                    t = tableData(f)
                    pmu.printMemoryUsage("after tabledata query")

                    print("Unifying " + f)
                    unifiedTable = unify(t)
                    pmu.printMemoryUsage("after hashing")
                    save(unifiedTable, f, args.outputDir)
                    pmu.printMemoryUsage("after unifiedTable save")
                if fullTable is None:
                    fullTable = unifiedTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, unifiedTable.names)
                    #print("unifiedTable.names",unifiedTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(unifiedTable)  # memory gets used here
                    #print("fullTable.names",fullTable.names)

                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("unifiedTable rows = {}".format(unifiedTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print(
                    "-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".
                    format(secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(
                        args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    if args.partition:
                        pmu.saveJayTablePartioned(fullTable, jayFile,
                                                  args.outputDir,
                                                  args.partitionsize, True,
                                                  args.destructivesave)
                        if args.flushread or args.destructivesave:
                            print("Re-reading checkpoint @ {}".format(
                                datetime.now()))
                            fullTable = None
                            fullTable = pmu.loadJayTablePartioned(
                                jayPath,
                                tempdir=args.tempDir,
                                memory_limit=args.memorylimit,
                                verbose=args.verbose)
                    else:
                        pmu.saveJayTable(fullTable, "all-data.jay",
                                         args.outputDir, args.backup,
                                         args.unsafe)

                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))

    if addedData or (args.partition != partitioned):
        pmu.printMemoryUsage("before full save")
        if args.partition:
            pmu.saveJayTablePartioned(fullTable, "all-data.jay",
                                      args.outputDir, args.partitionsize, True,
                                      args.destructivesave)
        else:
            pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir,
                             args.backup, args.unsafe)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving.'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("Finished in {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
Code Example #12
File: unify.py Project: scharrenberg/Corona
def unify(table, makeFallGruppe=False):
    dss = table[0, "Datenstand"]
    ds = cd.datetimeFromDatenstandAny(dss)

    if 'FID' in table.names:
        table.names = {"FID": "ObjectId"}

    dsdy = cd.dayFromDate(ds)
    dsisodate = cd.dateStrYMDFromDay(dsdy)
    hasRefdatum = "Refdatum" in table.names
    hasErkrankungsbeginn = "IstErkrankungsbeginn" in table.names
    #t = table.copy()
    t = table
    if "Altersgruppe2" in table.names:
        t = t[:, dt.f[:].remove(dt.f["Altersgruppe2"])]
    if not "DatenstandISO" in table.names:
        t = t[:, dt.f[:].extend({"DatenstandISO": dsisodate})]
    if not hasRefdatum:
        t = t[:, dt.f[:].extend({"Refdatum": 0})]

    hasRefdatumISO = "RefdatumISO" in table.names
    if not hasRefdatumISO:
        #print("t1",t.names)
        t = t[:, dt.f[:].extend({"RefdatumISO": ""})]
        #print("t2",t.names)

    hasMeldedatumISO = "MeldedatumISO" in table.names
    if not hasMeldedatumISO:
        t = t[:, dt.f[:].extend({"MeldedatumISO": ""})]

    if not hasErkrankungsbeginn:
        t = t[:, dt.f[:].extend({"IstErkrankungsbeginn": 0})]

    if "NeuGenesen" not in table.names:
        t = t[:, dt.f[:].extend({"NeuGenesen": -9, "AnzahlGenesen": 0})]

    if makeFallGruppe:
        t = t[:, dt.f[:].extend({
            "FallGruppe": "",
            "MeldeTag": nan,
            "RefTag": nan,
            "DatenstandTag": dsdy
        })]
    else:  # avoid extending MeldeTag/RefTag/DatenstandTag a second time
        t = t[:, dt.f[:].extend({
            "MeldeTag": nan,
            "RefTag": nan,
            "DatenstandTag": dsdy
        })]

    pmu.printMemoryUsage("unify pre dict")
    d = t.to_dict()
    pmu.printMemoryUsage("unify post dict")

    print("> iterating through {} rows".format(t.nrows))
    start = time.perf_counter()
    for r in range(t.nrows):
        mds = d["Meldedatum"][r]
        if pmu.is_int(mds):
            md = cd.datetimeFromStampStr(mds)
        else:
            md = datetimeFromDateStr3(mds)
            d["Meldedatum"][r] = ticksFromDateTime(md)

        mdy = cd.dayFromDate(md)
        d["MeldeTag"][r] = mdy
        if not hasRefdatum:
            d["Refdatum"][r] = ticksFromDateTime(md)
            d["RefTag"][r] = mdy
        if not hasMeldedatumISO:
            d["MeldedatumISO"][r] = cd.dateStrYMDFromDay(mdy)

        if makeFallGruppe:
            fg = str(d["IdLandkreis"]
                     [r]) + d["Altersgruppe"][r] + d["Geschlecht"][r] + str(
                         int(d["MeldeTag"][r]))

        #if int(d["IstErkrankungsbeginn"][r]) == 1:
        rds = d["Refdatum"][r]
        if pmu.is_int(rds):
            rd = cd.datetimeFromStampStr(rds)
        else:
            rd = datetimeFromDateStr3(rds)
            d["Refdatum"][r] = ticksFromDateTime(rd)
        rdy = cd.dayFromDate(rd)
        d["RefTag"][r] = rdy
        if not hasRefdatumISO:
            d["RefdatumISO"][r] = cd.dateStrYMDFromDay(rdy)
        if makeFallGruppe:
            fg = fg + ":" + str(rdy)

        if makeFallGruppe:
            d["FallGruppe"][r] = fg
        checkLandkreisData(d, r, Census, Flaeche)

    finish = time.perf_counter()

    print("< iterating through {} rows done, {:.1f} rows/sec".format(
        t.nrows, t.nrows / (finish - start)))

    pmu.printMemoryUsage("end of unify, pre frame")
    t = dt.Frame(d)
    pmu.printMemoryUsage("end of unify, post frame")
    return t
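
The FID-to-ObjectId handling at the top of this unify() uses datatable's partial rename: assigning a dict to Frame.names renames only the listed columns. A one-liner sketch (toy frame):

import datatable as dt

t = dt.Frame(FID=[1, 2], AnzahlFall=[4, 5])
if "FID" in t.names:
    t.names = {"FID": "ObjectId"}  # only FID is renamed; other columns untouched
print(t.names)  # -> ('ObjectId', 'AnzahlFall')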