Example #1
def test_first_2d_dt():
    df_in = dt.Frame([[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
                      [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, 8, None, 1]])
    df_reduce = df_in[:, [first(f.C0), first(f.C1)], "C0"]
    assert_equals(df_reduce, dt.Frame([[None, 0, 1, 2, 3, 5, 8, 9],
                                       [None, 0, 1, 2, 3, 5, 8, 9],
                                       [3, 0, 1, 0, 5, 2, 1, 0]],
                                      names=["C0", "C1", "C2"]))
Example #2
def test_first():
    assert str(dt.first(f.A)) == str(f.A.first())
    assert str(dt.first(f[:])) == str(f[:].first())
    DT = dt.Frame({'A': ['1', '1', '2', '1', '2'],
                   'B': [None, '2', '3', '4', '5'],
                   'C': [1, 2, 1, 1, 2]})

    assert_equals(DT[:, f.A.first()], DT[:, dt.first(f.A)])
    assert_equals(DT[:, f[:].first()], DT[:, dt.first(f[:])])
Example #3
def test_first_2d_dt():
    df_in = dt.Frame([[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
                      [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, 8, None, 1]])
    df_reduce = df_in[:, [first(f.C0), first(f.C1)], "C0"]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (8, 3)
    assert df_reduce.ltypes == (ltype.int, ltype.int, ltype.int,)
    assert df_reduce.to_list() == [[None, 0, 1, 2, 3, 5, 8, 9],
                                   [None, 0, 1, 2, 3, 5, 8, 9],
                                   [3, 0, 1, 0, 5, 2, 1, 0]]
Example #4
def test_first_dt_range():
    df_in = dt.Frame(A=range(10))[3::3, :]
    df_reduce = df_in[:, first(f.A)]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int,)
    assert df_reduce.to_list() == [[3]]
Example #5
def test_first_dt():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0)]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int,)
    assert df_reduce.to_list() == [[9]]
Example #6
def test_first_dt_groupby():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0), "C0"]
    assert_equals(
        df_reduce,
        dt.Frame([[None, 0, 1, 2, 3, 5, 8, 9], [None, 0, 1, 2, 3, 5, 8, 9]],
                 names=["C0", "C1"]))
Example #7
def timeSeries(fullTable, fromDay, toDay, byCriteria, nameColumn,
               Altersgruppen, Geschlechter):
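    # group by byCriteria and take the first nameColumn value in each group:
    # yields one row per region (criteria value) with its display name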
    regions = fullTable[:, [dt.first(nameColumn)], dt.by(byCriteria)]
    #regions = regions[:5,:]
    print("Creating time series for regions:")
    print(regions)
    dailysByCriteria = {}
    start = time.perf_counter()
    for i, lk in enumerate(regions[:, byCriteria].to_list()[0]):
        print("Processing Region '{}'".format(regions[i, nameColumn][0, 0]))
        start_region = time.perf_counter()

        pmu.printMemoryUsage("pre analyzeDailyAltersgruppenGeschlechter")
        dailysByCriteria[lk] = analyzeDailyAltersgruppenGeschlechter(
            fullTable,
            filterByDayAndCriteria(fromDay, toDay, (byCriteria == lk)),
            Altersgruppen, Geschlechter)
        finish = time.perf_counter()
        duration = finish - start
        print(
            "Region took {:.2f} seconds, elapsed {:.2f} minutes, time to completion: {:.2f} minutes"
            .format(finish - start_region, duration / 60,
                    duration / (i + 1) * (regions.nrows - i) / 60))

        pmu.printMemoryUsage("post analyzeDailyAltersgruppenGeschlechter")
        print("Done {} of {}, key = {} name = {}".format(
            i + 1, regions.nrows, lk, regions[i, nameColumn][0, 0]))
        #if lk >= 0:
        #    break
    return regions, dailysByCriteria
Example #8
def test_first_dt():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0)]
    df_reduce.internal.check()
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int, )
    assert df_reduce.topython() == [[9]]
Example #9
def test_first_dt_range():
    df_in = dt.Frame(A=range(10))[3::3, :]
    df_reduce = df_in[:, first(f.A)]
    df_reduce.internal.check()
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int, )
    assert df_reduce.topython() == [[3]]
Example #10
def test_first_dt_groupby():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0), "C0"]
    frame_integrity_check(df_reduce)
    assert df_reduce.shape == (8, 2)
    assert df_reduce.ltypes == (ltype.int, ltype.int,)
    assert df_reduce.to_list() == [[None, 0, 1, 2, 3, 5, 8, 9],
                                   [None, 0, 1, 2, 3, 5, 8, 9]]
Example #11
def test_first_dt_integer_large(numpy):
    n = 12345678
    a_in = numpy.random.randint(2**20, size=n, dtype=numpy.int32)
    df_in = dt.Frame(a_in)
    df_reduce = df_in[:, first(f.C0)]
    assert df_reduce.shape == (1, 1)
    assert df_reduce.ltypes == (ltype.int, )
    assert df_reduce.topython() == [[a_in[0]]]
Example #12
def test_first_dt_groupby():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0), "C0"]
    df_reduce.internal.check()
    assert df_reduce.shape == (8, 2)
    assert df_reduce.ltypes == (
        ltype.int,
        ltype.int,
    )
    assert df_reduce.topython() == [[None, 0, 1, 2, 3, 5, 8, 9],
                                    [None, 0, 1, 2, 3, 5, 8, 9]]
Example #13
def test_first_array():
    assert first([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]) == 9
    assert first((3.5, 17.9, -4.4)) == 3.5
    assert first([]) == None
Example #14
# collapse release
statusMsg('Collapsing emx-release....')
subjects['associatedRD3Releases'] = dt.Frame([
    flattenValueArray(
        array=subjects[f.subjectID==d, f.release][f.release != None, :].to_list()[0]
    )
    for d in subjects[:, f.subjectID].to_list()[0]
])

# DISTINCT RECORDS ONLY
# since all information has been flattened and repeated by subject, it is
# possible to select only the distinct records.
statusMsg('Complete! Selecting distinct records only....')

subjects = subjects[:, first(f[:]), dt.by(f.subjectID)]
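
# Illustration (not part of the original script): grouping by subjectID and
# taking first(f[:]) keeps the first record per subject, e.g. on a toy frame:
#   toy = dt.Frame(subjectID=['s1', 's1', 's2'], release=['DF1', 'DF2', 'DF1'])
#   toy[:, first(f[:]), dt.by(f.subjectID)]   # -> one row each for 's1' and 's2'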

#//////////////////////////////////////////////////////////////////////////////

# ~ 2 ~ 
# RESHAPE SAMPLES
# Sample metadata will need to be processed a bit differently than subject
# metadata. The idea is to have all samples listed horizontally by subject.
# This means that for each subject there will be a column for all samples
# released in DF1, DF2, DF3, and so on. It was done this way so that
# references to other tables can be made.
statusMsg('Summarizing sample metadata....')
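
# Illustration only (hypothetical frame and column names): one possible way to
# build one column of sample IDs per release, listed horizontally by subject:
#   releases = dt.unique(samples['release']).to_list()[0]
#   for rel in releases:
#       subjects['samples_' + rel] = dt.Frame([
#           ','.join(samples[(f.subjectID == sid) & (f.release == rel),
#                            'sampleID'].to_list()[0])
#           for sid in subjects[:, 'subjectID'].to_list()[0]
#       ])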

# recode subjectID --- extract subject ID only (i.e., remove '_original', etc.)
samples.names={'subject': 'subjectID'}
samples['subjectID']=dt.Frame([
Example #15
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files',
                        metavar='fileName',
                        type=str,
                        nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    #parser.add_argument("--flushmemfull", help="flush full table to disk for lower memory footprint",
    #                    action="store_true")
    parser.add_argument(
        "--materializeNew",
        help="materialize new table to disk for lower memory footprint",
        action="store_true")
    parser.add_argument(
        "--noMaterialize",
        help=
        "run with higher memory footprint, or much higher memory footprint with --in-memory",
        action="store_true")
    parser.add_argument("--inMemory",
                        help="run faster but with higher memory footprint",
                        action="store_true")
    parser.add_argument(
        "--checkpoint",
        help="write checkpoint after amount of minutes elapsed",
        default=10)
    parser.add_argument(
        "--nthreads",
        help=
        "number of concurrent threads used by python dataframes, 0 = as many as cores, 1 single-thread, -3 = 3 threads less than cores",
        default=0)

    args = parser.parse_args()
    print(args)
    print("args.inMemory", args.inMemory)
    print("args.materializeNew", args.materializeNew)
    print("args.noMaterialize", args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayPath = args.outputDir + "/all-data.jay"
    print(jayPath)
    pmu.printMemoryUsage("after start")

    daysIncluded = []
    if os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(
            fullTable[:, [dt.first(dt.f.DatenstandTag)],
                      dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")

    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")

                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("after checkColumns")
                    if not args.noMaterialize:
                        fullTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize fullTable")
                    if args.materializeNew:
                        newTable.materialize(to_memory=args.inMemory)
                        pmu.printMemoryUsage("after materialize newTable")

                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print(
                    "-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".
                    format(secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(
                        args.checkpoint) * 60:
                    #checkname = args.outputDir+"/"+"all-data.check.jay"
                    #print("Saving checkpoint: " + checkname)
                    #pmu.saveJayTable(fullTable,"all-data.check.jay",args.outputDir)
                    pmu.saveCsvTable(fullTable, "all-data.check.csv",
                                     args.outputDir)
                    fullTable = None
                    #fullTable = dt.fread(args.outputDir+"/all-data.check.csv")
                    fullTable = dt.fread(args.outputDir +
                                         "/all-data.check.jay")
                    #fullTable.to_jay(checkname)
                    #print("Saving done:" + checkname)
                    lastCheckPointTime = time.perf_counter()

    if addedData:
        pmu.printMemoryUsage("before full save")
        pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.ja'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
Example #16
def loadAndProcessData(dataFilename):
    print("Loading " + dataFilename)

    fullTable = dt.fread(dataFilename)
    print("Loading done loading table from ‘" + dataFilename + "‘, keys:")
    print(fullTable.keys())
    cases = fullTable[:, 'AnzahlFall'].sum()[0, 0]
    dead = fullTable[:, 'AnzahlTodesfall'].sum()[0, 0]

    lastDay = fullTable[:, 'MeldeDay'].max()[0, 0]
    lastnewCaseOnDay = fullTable[:, 'newCaseOnDay'].max()[0, 0]
    print("File stats: lastDay {} lastnewCaseOnDay {} cases {} dead {}".format(
        lastDay, lastnewCaseOnDay, cases, dead))

    newTable = fullTable[:, dt.f[:].
                         extend({"erkMeldeDelay": dt.f.MeldeDay -
                                 dt.f.RefDay})]
    #print(newTable.keys())

    #dt.by(dt.f.Bundesland)]
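    # per-Landkreis aggregation; dt.first() keeps one representative value for
    # columns assumed constant within a district (LandkreisTyp, Bundesland)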
    alldays = fullTable[:, [
        dt.sum(dt.f.AnzahlFall),
        dt.sum(dt.f.FaellePro100k),
        dt.sum(dt.f.AnzahlTodesfall),
        dt.sum(dt.f.TodesfaellePro100k),
        dt.mean(dt.f.Bevoelkerung),
        dt.max(dt.f.MeldeDay),
        dt.first(dt.f.LandkreisTyp),
        dt.first(dt.f.Bundesland)
    ],
                        dt.by(dt.f.Landkreis)]

    last7days = fullTable[dt.f.newCaseOnDay > lastDay -
                          7, :][:, [
                              dt.sum(dt.f.AnzahlFall),
                              dt.sum(dt.f.FaellePro100k),
                              dt.sum(dt.f.AnzahlTodesfall),
                              dt.sum(dt.f.TodesfaellePro100k)
                          ],
                                dt.by(dt.f.Landkreis)]
    last7days.names = [
        "Landkreis", "AnzahlFallLetzte7Tage", "FaellePro100kLetzte7Tage",
        "AnzahlTodesfallLetzte7Tage", "TodesfaellePro100kLetzte7Tage"
    ]
    last7days[dt.f.AnzahlFallLetzte7Tage < 0, "AnzahlFallLetzte7Tage"] = 0
    last7days[dt.f.FaellePro100kLetzte7Tage < 0,
              "FaellePro100kLetzte7Tage"] = 0
    last7days[dt.f.AnzahlTodesfallLetzte7Tage < 0,
              "AnzahlTodesfallLetzte7Tage"] = 0
    last7days[dt.f.TodesfaellePro100kLetzte7Tage < 0,
              "TodesfaellePro100kLetzte7Tage"] = 0

    lastWeek7days = fullTable[(dt.f.newCaseOnDay > lastDay - 14) & (
        dt.f.newCaseOnDay <= lastDay - 7), :][:, [
            dt.sum(dt.f.AnzahlFall),
            dt.sum(dt.f.FaellePro100k),
            dt.sum(dt.f.AnzahlTodesfall),
            dt.sum(dt.f.TodesfaellePro100k)
        ],
                                              dt.by(dt.f.Landkreis)]
    #lastWeek7days[dt.f[1:] < 0, dt.f[1:]] = 0
    lastWeek7days.names = [
        "Landkreis", "AnzahlFallLetzte7TageDavor",
        "FaellePro100kLetzte7TageDavor", "AnzahlTodesfallLetzte7TageDavor",
        "TodesfaellePro100kLetzte7TageDavor"
    ]
    lastWeek7days[dt.f.AnzahlFallLetzte7TageDavor < 0,
                  "AnzahlFallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.FaellePro100kLetzte7TageDavor < 0,
                  "FaellePro100kLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.AnzahlTodesfallLetzte7TageDavor < 0,
                  "AnzahlTodesfallLetzte7TageDavor"] = 0
    lastWeek7days[dt.f.TodesfaellePro100kLetzte7TageDavor < 0,
                  "TodesfaellePro100kLetzte7TageDavor"] = 0

    allDaysExt0 = merge(alldays, last7days, "Landkreis")
    allDaysExt1 = merge(allDaysExt0, lastWeek7days, "Landkreis")

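    # week-over-week ratio: cases in the last 7 days vs. the 7 days before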
    Rw = dt.f.AnzahlFallLetzte7Tage / dt.f.AnzahlFallLetzte7TageDavor

    allDaysExt2 = allDaysExt1[:, dt.f[:].extend({"AnzahlFallTrend": Rw})]
    allDaysExt3 = allDaysExt2[:, dt.f[:].extend({
        "FaellePro100kTrend":
        dt.f.FaellePro100kLetzte7Tage - dt.f.FaellePro100kLetzte7TageDavor
    })]
    allDaysExt4 = allDaysExt3[:, dt.f[:].extend({
        "TodesfaellePro100kTrend":
        dt.f.TodesfaellePro100kLetzte7Tage -
        dt.f.TodesfaellePro100kLetzte7TageDavor
    })]

    allDaysExt5 = allDaysExt4[:, dt.f[:].extend({
        "Kontaktrisiko":
        dt.f.Bevoelkerung / 6.25 /
        ((dt.f.AnzahlFallLetzte7Tage + dt.f.AnzahlFallLetzte7TageDavor) * Rw)
    })]
    allDaysExt6 = allDaysExt5[:, dt.f[:].extend(
        {"LetzteMeldung": lastDay - dt.f.MeldeDay})]

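    # x * 2 == x only holds for 0 and +/-inf, so this likely replaces infinite
    # Kontaktrisiko values (division by zero above) with a large sentinel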
    allDaysExt6[dt.f.Kontaktrisiko * 2 == dt.f.Kontaktrisiko,
                "Kontaktrisiko"] = 999999

    sortedByRisk = allDaysExt6.sort(
        ["Kontaktrisiko", "LetzteMeldung", "FaellePro100k"])
    #print(sortedByRisk)
    allDaysExt = sortedByRisk[:, dt.f[:].extend({"Rang": 0})]
    allDaysExt[:, "Rang"] = np.arange(1, allDaysExt.nrows + 1)
    #print(allDaysExt)

    print("Column names frame order:", list(enumerate(allDaysExt.names)))

    data = allDaysExt.to_pandas()
    return data
Example #17
def test_first_dt():
    df_in = dt.Frame([9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1])
    df_reduce = df_in[:, first(f.C0)]
    assert_equals(df_reduce, dt.Frame(C0=[9]))
Example #18
def test_first_dt_range():
    df_in = dt.Frame(A=range(10))[3::3, :]
    df_reduce = df_in[:, first(f.A)]
    assert_equals(df_reduce, dt.Frame(A=[3]))
Example #19
def test_first_empty_frame():
    DT = dt.Frame(A=[], stype=dt.float32)
    RZ = DT[:, first(f.A)]
    assert_equals(RZ, dt.Frame(A=[None], stype=dt.float32))
Example #20
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files',
                        metavar='fileName',
                        type=str,
                        nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument(
        "--flushread",
        help=
        "flush full table an re-read after checkpoint lower memory footprint",
        action="store_true")
    parser.add_argument("--force",
                        help="build new database anyway",
                        action="store_true")
    parser.add_argument(
        "--destructivesave",
        help="release memory gradually while saving and reload after saving",
        action="store_true")
    parser.add_argument("-v",
                        "--verbose",
                        help="make more noise",
                        action="store_true")
    parser.add_argument("--partitionsize",
                        type=int,
                        help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit",
                        type=int,
                        help="maximum memory limit for a database file")
    parser.add_argument(
        "--checkpoint",
        type=int,
        help="write checkpoint after amount of minutes elapsed",
        default=10)
    parser.add_argument(
        "--nthreads",
        type=int,
        help=
        "number of concurrent threads used by python dataframes, 0 = as many as cores, 1 single-thread, -3 = 3 threads less than cores",
        default=0)

    args = parser.parse_args()
    print(args)
    # print("args.inMemory",args.inMemory)
    # print("args.materializeNew",args.materializeNew)
    # print("args.noMaterialize",args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")

    partitioned = False
    daysIncluded = []
    if len(pmu.getJayTablePartitions(jayPath)) > 0:
        fullTable = pmu.loadJayTablePartioned(jayPath,
                                              tempDir=args.tempDir,
                                              memoryLimit=args.memorylimit,
                                              verbose=args.verbose)
        if fullTable is None:
            print(
                "The file {} is not a valid jay file, please remove it and retry"
                .format(jayPath))
            exit(1)
        partitioned = True
    elif os.path.isfile(jayPath):
        print("Loading " + jayPath)
        fullTable = dt.fread(jayPath)

    if fullTable is not None:
        pmu.printMemoryUsage("after load")
        daysIncluded = sorted(
            fullTable[:, [dt.first(dt.f.DatenstandTag)],
                      dt.by(dt.f.DatenstandTag)].to_list()[0])
        print("Days in full table:")
        print(daysIncluded)
        pmu.printMemoryUsage("after first query")

    addedData = False
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if isNewData(f, daysIncluded):
                addedData = True
                fstart = time.perf_counter()
                pmu.printMemoryUsage("after isNewData query")
                t = tableData(f)
                pmu.printMemoryUsage("after tabledata query")

                print("Hashing " + f)
                newTable = unify(t)
                pmu.printMemoryUsage("after hashing")
                save(newTable, f, args.outputDir)
                pmu.printMemoryUsage("after newTable save")
                if fullTable is None:
                    fullTable = newTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, newTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(newTable)  # memory gets used here
                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("newTable rows = {}".format(newTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print(
                    "-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".
                    format(secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(
                        args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    pmu.saveJayTablePartioned(fullTable, jayFile,
                                              args.outputDir,
                                              args.partitionsize, True,
                                              args.destructivesave)
                    if args.flushread or args.destructivesave:
                        print("Re-reading checkpoint @ {}".format(
                            datetime.now()))
                        fullTable = None
                        fullTable = pmu.loadJayTablePartioned(
                            jayPath,
                            tempDir=args.tempDir,
                            memoryLimit=args.memorylimit,
                            verbose=args.verbose)
                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))

    if addedData or not partitioned:
        pmu.printMemoryUsage("before full save")
        #pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir)
        pmu.saveJayTablePartioned(fullTable, "all-data.jay", args.outputDir,
                                  args.partitionsize, True,
                                  args.destructivesave)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving 'all-data.jay'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("--> Wall time {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))
Example #21
def test_first_dt_integer_large(numpy):
    n = 12345678
    a_in = numpy.random.randint(2**20, size=n, dtype=numpy.int32)
    df_in = dt.Frame(a_in)
    df_reduce = df_in[:, first(f.C0)]
    assert_equals(df_reduce, dt.Frame(C0=[a_in[0]]))
Example #22
def test_first_array():
    a_in = [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
    a_reduce = first(a_in)
    assert a_reduce == 9
Example #23
def test_first_2d_array():
    a_in = [[9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1],
            [0, 1, 0, 5, 3, 8, 1, 0, 2, 5, 8, None, 1]]
    a_reduce = first(a_in)
    assert a_reduce == [9, 8, 2, 3, None, None, 3, 0, 5, 5, 8, None, 1]
Example #24
def main():
    #testDatePerf()
    start = time.perf_counter()
    lastCheckPointTime = start
    parser = argparse.ArgumentParser(
        description='Create a unified data file from daily dumps')
    parser.add_argument('files',
                        metavar='fileName',
                        type=str,
                        nargs='+',
                        help='.NPGEO COVID19 Germany data as .csv file')
    parser.add_argument('-d', '--output-dir', dest='outputDir', default=".")
    parser.add_argument('-t', '--temp-dir', dest='tempDir', default=".")
    parser.add_argument(
        "--flushread",
        help=
        "flush full table an re-read after checkpoint lower memory footprint",
        action="store_true")
    parser.add_argument(
        "--partition",
        help=
        "save data in partionions instead of one file; slower, but you can see progress and maybe need less memory, but ymmv",
        action="store_true")
    parser.add_argument("--backup",
                        help="create backup files before overwriting",
                        action="store_true")
    parser.add_argument("--resume",
                        help="read already unified .csv files first",
                        action="store_true")
    parser.add_argument(
        "--unsafe",
        help=
        "directly overwrite output files, will corrupt the output file when killed while writing, but uses less disk space (only applies to single .jay file in non-partition mode)",
        action="store_true")
    parser.add_argument("--force",
                        help="build new database anyway",
                        action="store_true")
    parser.add_argument(
        "--destructivesave",
        help=
        "release memory gradually while saving and reload after saving (experimental, untested, only applies to partiioned write)",
        action="store_true")
    #parser.add_argument("--incremental", help="only load partial data", action="store_true")
    parser.add_argument("-v",
                        "--verbose",
                        help="make more noise",
                        action="store_true")
    parser.add_argument("--partitionsize",
                        type=int,
                        help="number of records per partition",
                        default=10000000)
    parser.add_argument("--memorylimit",
                        type=int,
                        help="maximum memory limit for a database file")
    parser.add_argument(
        "--checkpoint",
        type=int,
        help="write checkpoint after amount of minutes elapsed",
        default=60)
    parser.add_argument(
        "--nthreads",
        type=int,
        help=
        "number of concurrent threads used by python dataframes, 0 = as many as cores, 1 single-thread, -3 = 3 threads less than cores",
        default=0)

    args = parser.parse_args()
    print(args)
    # print("args.inMemory",args.inMemory)
    # print("args.materializeNew",args.materializeNew)
    # print("args.noMaterialize",args.noMaterialize)

    if args.nthreads != 0:
        dt.options.nthreads = args.nthreads
    print("dt.options.nthreads", dt.options.nthreads)

    fullTable = None
    jayFile = "all-data.jay"
    jayPath = os.path.join(args.outputDir, jayFile)
    print(jayPath)
    pmu.printMemoryUsage("after start")

    partitioned = False
    if not args.force:
        if os.path.isfile(jayPath):
            print("Loading " + jayPath)
            fullTable = dt.fread(jayPath,
                                 tempdir=args.tempDir,
                                 memory_limit=args.memorylimit,
                                 verbose=args.verbose)
        elif len(pmu.getJayTablePartitions(jayPath)) > 0:
            fullTable = pmu.loadJayTablePartioned(
                jayPath,
                tempdir=args.tempDir,
                memory_limit=args.memorylimit,
                verbose=args.verbose)
            if fullTable is None:
                print(
                    "The file {} is not a valid jay file, please remove it and retry"
                    .format(jayPath))
                exit(1)
            partitioned = True

    daysIncluded = []
    addedData = False
    version = 1
    lastversion = 0
    for fa in args.files:
        files = sorted(glob.glob(fa))
        for f in files:
            if fullTable is not None and version != lastversion:
                pmu.printMemoryUsage("after load")
                daysIncluded = sorted(
                    fullTable[:, [dt.first(dt.f.DatenstandTag)],
                              dt.by(dt.f.DatenstandTag)].to_list()[0])
                print("Days in full table:")
                print(daysIncluded)
                pmu.printMemoryUsage("after first query")
                lastversion = version

            if isNewData(f, daysIncluded):
                pmu.printMemoryUsage("after isNewData query")
                fstart = time.perf_counter()
                unifiedTable = None
                if args.resume:
                    unifiedTable = load(f, args.outputDir)

                addedData = True
                version = version + 1
                if unifiedTable is None:
                    t = tableData(f)
                    pmu.printMemoryUsage("after tabledata query")

                    print("Unifying " + f)
                    unifiedTable = unify(t)
                    pmu.printMemoryUsage("after hashing")
                    save(unifiedTable, f, args.outputDir)
                    pmu.printMemoryUsage("after unifiedTable save")
                if fullTable is None:
                    fullTable = unifiedTable
                else:
                    #print("full fields", fullTable.names)
                    checkColumns(fullTable.names, unifiedTable.names)
                    #print("unifiedTable.names",unifiedTable.names)
                    pmu.printMemoryUsage("before fulltable rbind")
                    fullTable.rbind(unifiedTable)  # memory gets used here
                    #print("fullTable.names",fullTable.names)

                    pmu.printMemoryUsage("after rbind")
                ffinish = time.perf_counter()
                secs = ffinish - fstart
                #print("fullTable", fullTable)
                print("unifiedTable rows = {}".format(unifiedTable.nrows))
                print("fullTable rows = {}".format(fullTable.nrows))
                print(
                    "-> File time {:.1f} secs or {:.1f} mins or {:.1f} hours".
                    format(secs, secs / 60, secs / 60 / 60))
                if time.perf_counter() - lastCheckPointTime > float(
                        args.checkpoint) * 60:
                    print("Saving checkpoint @ {}".format(datetime.now()))
                    if args.partition:
                        pmu.saveJayTablePartioned(fullTable, jayFile,
                                                  args.outputDir,
                                                  args.partitionsize, True,
                                                  args.destructivesave)
                        if args.flushread or args.destructivesave:
                            print("Re-reading checkpoint @ {}".format(
                                datetime.now()))
                            fullTable = None
                            fullTable = pmu.loadJayTablePartioned(
                                jayPath,
                                tempdir=args.tempDir,
                                memory_limit=args.memorylimit,
                                verbose=args.verbose)
                    else:
                        pmu.saveJayTable(fullTable, "all-data.jay",
                                         args.outputDir, args.backup,
                                         args.unsafe)

                    lastCheckPointTime = time.perf_counter()
                    print("Checkpoint done @ {}".format(datetime.now()))

    if addedData or (args.partition != partitioned):
        pmu.printMemoryUsage("before full save")
        if args.partition:
            pmu.saveJayTablePartioned(fullTable, "all-data.jay",
                                      args.outputDir, args.partitionsize, True,
                                      args.destructivesave)
        else:
            pmu.saveJayTable(fullTable, "all-data.jay", args.outputDir,
                             args.backup, args.unsafe)
        pmu.printMemoryUsage("after full save")
    else:
        print("No new data added, not saving.'")
    #pmu.saveCsvTable(fullTable, "all-data.csv", args.outputDir)
    finish = time.perf_counter()
    secs = finish - start
    print("Finished in {:.1f} secs or {:.1f} mins or {:.1f} hours".format(
        secs, secs / 60, secs / 60 / 60))