Beispiel #1
0
def test_txt_append():
    """Check that each ``just.append`` call adds one item readable via ``just.iread``."""
    path = "testobj.txt"
    line = "bla"
    just.append(line, path)
    try:
        # After the first append exactly one item is read back ...
        assert list(just.iread(path)) == [line]
        just.append(line, path)
        # ... and after the second append the same item appears twice.
        assert list(just.iread(path)) == [line, line]
    finally:
        # Always clean up the scratch file, even when an assertion fails.
        os.remove(path)
Beispiel #2
0
 def load_object_per_newline(cls, fname, nrows=None):
     """
     Build a DataFrame from a file holding one object per line (e.g. .jsonl or .txt).

     Each object is converted with ``cls.object_to_row``; objects for which it
     returns ``None`` are dropped. Your class should implement
     ``object_to_row(cls, row)`` returning a dictionary.

     With ``nrows=None`` the load is incremental: the number of lines already
     handled is looked up via ``get_newline_count``, those lines are skipped,
     the new rows are concatenated with the result of ``load_df``, and the
     combined frame plus the new line count are stored again.

     With ``nrows`` given nothing is skipped or stored: reading stops once the
     line index exceeds ``nrows`` and only the last ``nrows`` rows are returned.
     """
     cache_name = fname + "_" + normalize_name(cls.__name__)
     seen_count = get_newline_count(cache_name)
     incremental = nrows is None
     rows = []
     for index, obj in enumerate(just.iread(fname)):
         # Incremental mode: lines below the stored count were handled earlier.
         if incremental and index < seen_count:
             continue
         row = cls.object_to_row(obj)
         if row is None:
             continue
         rows.append(row)
         # Sampling mode: stop shortly after roughly `nrows` lines were read.
         if not incremental and index > nrows:
             break
     if not rows:
         # Nothing new was produced: fall back to what was stored before.
         frame = load_df(cache_name)
     else:
         frame = pd.DataFrame(rows)
         if incremental:
             if seen_count:
                 frame = pd.concat((frame, load_df(cache_name)))
             frame = save_df(frame, cache_name)
             # `index` is the position of the last line read, so +1 is the line count.
             save_newline_count(index + 1, cache_name)
     if not incremental:
         frame = frame.iloc[-nrows:]
     return frame
Beispiel #3
0
def test_txt_iread():
    """Check that ``just.iread`` yields a written .txt file back line by line."""
    path = "testobj.txt"
    content = "1\n2\n3\n4\n5"
    just.write(content, path)
    try:
        lines_read = list(just.iread(path))
        assert lines_read == content.split("\n")
    finally:
        # Always clean up the scratch file, even when the assertion fails.
        os.remove(path)
Beispiel #4
0
def test_newl_iread():
    """Check that a list written to a .newl file is read back item by item."""
    path = "testobj.newl"
    items = ["1", "2"]
    just.write(items, path)
    try:
        items_read = list(just.iread(path))
        assert items_read == list(items)
    finally:
        # Always clean up the scratch file, even when the assertion fails.
        os.remove(path)
Beispiel #5
0
def test_csv_iread():
    """Check that rows written to a .csv file are read back unchanged by ``just.iread``."""
    path = "testobj.csv"
    header = ['"a"', '"b"']
    # One header row followed by 99 identical data rows.
    rows = [header] + [['"1"', '"2"']] * 99
    just.write(rows, path)
    try:
        rows_read = list(just.iread(path))
        assert rows_read == list(rows)
    finally:
        # Always clean up the scratch file, even when the assertion fails.
        os.remove(path)
Beispiel #6
0
def test_csv_iread_error():
    """Check that ``just.iread`` raises ``ValueError`` on a malformed .csv file.

    The file has a two-column header while every data row carries an extra
    third field consisting of a lone double quote.
    """
    fname = "testobj.csv"
    obj = [['"a"', '"b"']] + [['"1"', '"2"', '"']] * 100
    just.write(obj, fname)
    try:
        list(just.iread(fname))
    except ValueError:
        # Expected outcome: the malformed file is rejected.
        pass
    else:
        # Without this branch the test would pass silently when nothing is raised.
        raise AssertionError("just.iread did not raise ValueError for a malformed csv")
    finally:
        # Always clean up the scratch file.
        os.remove(fname)
def get_training_xy(data_path="./tracktrack/"):
    """Load matching image inputs and position targets from ``data_path``.

    Positions are read from ``positions.jsonl`` and images from every file
    matching ``im*.png`` below ``data_path``. Only the last ``m`` entries of
    each are used, where ``m`` is the smaller of the two counts.

    NOTE(review): pairing images with positions by index assumes that
    ``just.glob`` returns the images in the order the positions were recorded;
    confirm against the recorder.

    Returns:
        A tuple ``(X, y)``: ``X`` is ``prep_images`` applied to the selected
        images, ``y`` is a numpy array of the selected positions.
    """
    positions = list(just.iread(data_path + "positions.jsonl"))
    images = [
        matplotlib.image.imread(path)
        for path in just.glob(data_path + "im*.png")
    ]
    m = min(len(images), len(positions))

    # Slice from an explicit start index: `seq[-m:]` returns the WHOLE sequence
    # when m == 0, which would pair every image with zero positions.
    X = prep_images(images[len(images) - m:])
    y = np.array(positions[len(positions) - m:])

    return X, y
Beispiel #8
0
    def load(cls, nrows=None, **kwargs):
        """Build an instance from the chat exports in ~/nostalgia_data/input/whatsapp/.

        Every ``*.txt`` file found there is read line by line. The first
        ``offset`` characters of a line (``offset`` is defined outside this
        method) are parsed as a ``"%d/%m/%Y, %H:%M - "`` timestamp and the
        remainder is split into sender and text on the first ``": "``.

        A line without a parseable timestamp is treated as the continuation of
        the previously stored message and appended to its text, separated by a
        newline. The "Messages to this chat and calls are now secured" notice
        is skipped; timestamped lines without a ``": "`` separator are printed
        with an ``ERR`` prefix and skipped.

        Args:
            nrows: Maximum number of messages to keep per file. ``None`` (or
                any falsy value) means no limit.
            **kwargs: Accepted but not used by this method.

        Returns:
            ``cls(df)`` where ``df`` has the columns ``time``, ``sender`` and
            ``text``.
        """
        results = []
        nrows = nrows or float("inf")
        for file_path in just.glob("~/nostalgia_data/input/whatsapp/*.txt"):
            row = 0
            # True only while the latest timestamped line of THIS file was
            # stored, i.e. while untimestamped lines belong to results[-1].
            in_message = False
            for line in just.iread(file_path):
                try:
                    time = datetime_from_format(line[:offset],
                                                "%d/%m/%Y, %H:%M - ")
                except ValueError:
                    # Continuation of a multi-line message: attach it to the
                    # message it follows instead of prepending it to the next
                    # one (which would corrupt that message's sender).
                    if in_message:
                        prev_time, prev_sender, prev_text = results[-1]
                        results[-1] = (prev_time, prev_sender,
                                       prev_text + "\n" + line)
                    continue
                in_message = False
                line = line[offset:]
                if line.startswith(
                        "Messages to this chat and calls are now secured"):
                    continue
                try:
                    sender, text = line.split(": ", 1)
                except ValueError:
                    print("ERR", line)
                    continue
                # `>=` so that at most `nrows` messages are kept per file.
                if row >= nrows:
                    break
                row += 1
                results.append((time, sender, text))
                in_message = True

        df = pd.DataFrame(results, columns=["time", "sender", "text"])
        # hack "order" into minute data: timestamps only have minute
        # resolution, so the k-th consecutive message sharing the previous
        # row's minute is shifted by 60 * k / (k + 1) seconds, which keeps the
        # offsets increasing and below one minute.
        same_minute = df.time == df.shift(1).time
        seconds = []
        second_prop = 0
        for x in same_minute:
            if x:
                second_prop += 1
            else:
                second_prop = 0
            seconds.append(
                pd.Timedelta(seconds=60 * second_prop / (second_prop + 1)))
        df["time"] = df["time"] + pd.Series(seconds)
        return cls(df)
Beispiel #9
0
    # https://schema.org/Blog
    # "http://schema.org/blogPost"
    # http://schema.org/WebPage # http://schema.org/BlogPosting

    # exclude http://schema.org/QAPage

    it = 0
    imo = 0
    nice = 0
    wrong = 0
    from collections import Counter
    import just

    c = Counter()

    for x in just.iread("/home/pascal/nostal_tmp/person.jsonl"):
        if "/Person" in (str(x.get("microdata"))):
            it += 1
            y = x
            score = 0
            mc_count = 0
            for mc in y["microdata"]:
                if mc.get("type") in [
                        "http://schema.org/ImageObject",
                        "http://schema.org/QAPage",
                        "http://schema.org/Movie",
                        "http://schema.org/videoObject",
                        "http://schema.org/Organization",
                        "http://schema.org/VideoObject",
                        "http://schema.org/Question",
                        "http://schema.org/CreativeWork",
Beispiel #10
0
# Scratch script exercising a `pdc` object that is created outside this snippet.

# Random frame with 10000 rows and a random column count; int(random.random() * 500)
# is in 0..499 assuming `random` is the stdlib module.
df = pd.DataFrame(np.random.random((10000, int(random.random() * 500))))

# Benchmark that single frame; `save=True` presumably persists the results -- confirm against pdc.
pdc.run_benchmarks([df], save=True)

# Minimal one-cell frame used to try the infer / save / load calls.
df = pd.DataFrame({"a": [1]})
inferred = pdc.infer(df)
# Bare expression: has no effect when run as a script (only echoes in a REPL/notebook).
inferred
pdc.save(df, "test")
pdc.save(df, "~/test")
# Keep the value returned by save() and hand it to load(); the loaded result is discarded.
path = pdc.save(df, "~/test")
pdc.load(path)

# pdc.bench_exceptions = ()
# data_gen = (pd.DataFrame(np.random.random((1000, int(random.random() * 500)))) for i in range(2))

# Feed the items yielded by just.iread for ~/csvlist.txt into the benchmarks.
# NOTE(review): the file name suggests one csv path per line -- confirm what run_benchmarks expects.
pdc.run_benchmarks(just.iread("~/csvlist.txt"))

pdc.train_model("size")

# Hand-written feature dict; the bare NaN tokens are not strict JSON but are accepted
# by json.loads by default. `feats` is not used in the lines visible here.
feats = json.loads(
    '{"num_obs": 1000, "num_cols": 289, "num_float_vars": 289, "num_str_vars": 0, "percent_float": 1.0, "percent_str": 0.0, "str_missing_proportion": NaN, "float_missing_proportion": NaN, "cardinality_quantile_proportion_25": 1.0, "cardinality_quantile_proportion_50": 1.0, "cardinality_quantile_proportion_75": 1.0, "float_equal_0_proportion": 0.0, "str_len_quantile_25": NaN, "str_len_quantile_50": NaN, "str_len_quantile_75": NaN}'
)

# NOTE(review): the column count here can reach 499999; with 10000 rows that is a very
# large allocation (tens of GB as float64) -- confirm that 500000 is intended.
pdc.predict(
    pd.DataFrame(np.random.random((10000, int(random.random() * 500000)))))

# pdc = PandasCompressor("test1")

# d = pdc.get_features(pd.read_csv("/home/pascal/Downloads/results-20190622-143623.csv"))

# f**k = just.glob("/home/pascal/egoroot/tradex/data/cmc/*.csv.gz") * 30