import os

import just


def test_txt_append():
    fname = "testobj.txt"
    obj = "bla"
    just.append(obj, fname)
    try:
        assert list(just.iread(fname)) == [obj]
        just.append(obj, fname)
        assert list(just.iread(fname)) == [obj, obj]
    finally:
        os.remove(fname)
def load_object_per_newline(cls, fname, nrows=None):
    """
    Iterates over a file containing one object per line (e.g. .jsonl or .txt).
    Only handles lines not seen earlier; it detects this by storing the
    number of objects seen so far. Your class should implement
    `object_to_row(cls, row)`, returning a dictionary (or None to skip a line).
    """
    data = []
    name = fname + "_" + normalize_name(cls.__name__)
    newline_count = get_newline_count(name)
    for i, x in enumerate(just.iread(fname)):
        if nrows is None:
            # skip lines already processed in an earlier run
            if i < newline_count:
                continue
        row = cls.object_to_row(x)
        if row is None:
            continue
        data.append(row)
        # when sampling, stop after roughly nrows rows
        if nrows is not None and i > nrows:
            break
    if data:
        data = pd.DataFrame(data)
        if newline_count and nrows is None:
            data = pd.concat((data, load_df(name)))
        if nrows is None:
            data = save_df(data, name)
            n = i + 1
            save_newline_count(n, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
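# A minimal sketch of a class that plugs into load_object_per_newline.
# The class, file name, and fields below are hypothetical examples, not
# taken from this codebase; the only contract is that object_to_row
# returns a dict (or None to skip a line).
class Tweet:
    @classmethod
    def object_to_row(cls, obj):
        if "text" not in obj:
            return None
        return {"time": obj["created_at"], "text": obj["text"]}


df = load_object_per_newline(Tweet, "tweets.jsonl")  # full, incrementally cached load
sample = load_object_per_newline(Tweet, "tweets.jsonl", nrows=5)  # quick sample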
def test_txt_iread():
    fname = "testobj.txt"
    obj = "1\n2\n3\n4\n5"
    just.write(obj, fname)
    try:
        assert list(just.iread(fname)) == obj.split("\n")
    finally:
        os.remove(fname)
def test_newl_iread():
    fname = "testobj.newl"
    obj = ["1", "2"]
    just.write(obj, fname)
    try:
        assert list(just.iread(fname)) == obj
    finally:
        os.remove(fname)
def test_csv_iread():
    fname = "testobj.csv"
    obj = [['"a"', '"b"']] + [['"1"', '"2"']] * 99
    just.write(obj, fname)
    try:
        assert list(just.iread(fname)) == obj
    finally:
        os.remove(fname)
def test_csv_iread_error():
    fname = "testobj.csv"
    # the lone quote in the third column makes the rows unparsable as CSV
    obj = [['"a"', '"b"']] + [['"1"', '"2"', '"']] * 100
    just.write(obj, fname)
    try:
        list(just.iread(fname))
        assert False, "expected ValueError"
    except ValueError:
        pass
    finally:
        os.remove(fname)
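# The same check reads more idiomatically with pytest.raises, assuming
# pytest is (or can be made) available to this test suite:
import pytest


def test_csv_iread_error_pytest():
    fname = "testobj.csv"
    obj = [['"a"', '"b"']] + [['"1"', '"2"', '"']] * 100
    just.write(obj, fname)
    try:
        with pytest.raises(ValueError):
            list(just.iread(fname))
    finally:
        os.remove(fname)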
import just
import matplotlib.image
import numpy as np


def get_training_xy(data_path="./tracktrack/"):
    positions = list(just.iread(data_path + "positions.jsonl"))
    images = [matplotlib.image.imread(x) for x in just.glob(data_path + "im*.png")]
    # keep only the trailing window where images and positions line up
    m = min(len(images), len(positions))
    X = prep_images(images[-m:])  # prep_images is defined elsewhere in this module
    positions = positions[-m:]
    y = np.array(positions)
    return X, y
def load(cls, nrows=None, **kwargs):
    old_text = ""
    results = []
    nrows = nrows or float("inf")
    for file_path in just.glob("~/nostalgia_data/input/whatsapp/*.txt"):
        row = 0
        for line in just.iread(file_path):
            # `offset` is the length of the timestamp prefix
            # ("%d/%m/%Y, %H:%M - "), defined at module level
            try:
                time = datetime_from_format(line[:offset], "%d/%m/%Y, %H:%M - ")
            except ValueError:
                # no timestamp: this line continues the previous message
                old_text += line + "\n"
                continue
            line = old_text + line[offset:]
            old_text = ""
            try:
                if line.startswith("Messages to this chat and calls are now secured"):
                    continue
                sender, text = line.split(": ", 1)
            except ValueError:
                print("ERR", line)
                continue
            if line:
                if row >= nrows:
                    break
                row += 1
                results.append((time, sender, text))
    df = pd.DataFrame(results, columns=["time", "sender", "text"])
    # WhatsApp timestamps have minute resolution; spread messages that share
    # a minute over increasing sub-minute offsets so their order survives
    same_minute = df.time == df.shift(1).time
    seconds = []
    second_prop = 0
    for x in same_minute:
        if x:
            second_prop += 1
        else:
            second_prop = 0
        seconds.append(pd.Timedelta(seconds=60 * second_prop / (second_prop + 1)))
    df["time"] = df["time"] + pd.Series(seconds)
    return cls(df)
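# A self-contained toy demonstration of the sub-minute ordering hack above
# (the data is made up). Messages sharing a minute get offsets of
# 60 * n / (n + 1) seconds, which grow with n but never cross the
# 60-second boundary into the next minute:
import pandas as pd

df = pd.DataFrame({"time": pd.to_datetime(
    ["2020-01-01 10:00", "2020-01-01 10:00", "2020-01-01 10:00", "2020-01-01 10:01"]
)})
same_minute = df.time == df.shift(1).time
seconds, second_prop = [], 0
for x in same_minute:
    second_prop = second_prop + 1 if x else 0
    seconds.append(pd.Timedelta(seconds=60 * second_prop / (second_prop + 1)))
df["time"] = df["time"] + pd.Series(seconds)
print(df)  # offsets come out as 0s, 30s, 40s, 0s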
# https://schema.org/Blog
# "http://schema.org/blogPost"
# http://schema.org/WebPage
# http://schema.org/BlogPosting
# exclude http://schema.org/QAPage
from collections import Counter

import just

it = 0
imo = 0
nice = 0
wrong = 0
c = Counter()
for x in just.iread("/home/pascal/nostal_tmp/person.jsonl"):
    if "/Person" in str(x.get("microdata")):
        it += 1
        y = x
        score = 0
        mc_count = 0
        for mc in y["microdata"]:
            if mc.get("type") in [
                "http://schema.org/ImageObject",
                "http://schema.org/QAPage",
                "http://schema.org/Movie",
                "http://schema.org/videoObject",
                "http://schema.org/Organization",
                "http://schema.org/VideoObject",
                "http://schema.org/Question",
                "http://schema.org/CreativeWork",
import json
import random

import just
import numpy as np
import pandas as pd

# pdc is a PandasCompressor instance (see the commented construction below)
df = pd.DataFrame(np.random.random((10000, int(random.random() * 500))))
pdc.run_benchmarks([df], save=True)

df = pd.DataFrame({"a": [1]})
inferred = pdc.infer(df)
inferred  # inspect the inferred compression settings in the REPL

pdc.save(df, "test")
pdc.save(df, "~/test")
path = pdc.save(df, "~/test")
pdc.load(path)

# pdc.bench_exceptions = ()
# data_gen = (pd.DataFrame(np.random.random((1000, int(random.random() * 500)))) for i in range(2))
pdc.run_benchmarks(just.iread("~/csvlist.txt"))
pdc.train_model("size")
feats = json.loads(
    '{"num_obs": 1000, "num_cols": 289, "num_float_vars": 289, "num_str_vars": 0,'
    ' "percent_float": 1.0, "percent_str": 0.0, "str_missing_proportion": NaN,'
    ' "float_missing_proportion": NaN, "cardinality_quantile_proportion_25": 1.0,'
    ' "cardinality_quantile_proportion_50": 1.0, "cardinality_quantile_proportion_75": 1.0,'
    ' "float_equal_0_proportion": 0.0, "str_len_quantile_25": NaN,'
    ' "str_len_quantile_50": NaN, "str_len_quantile_75": NaN}'
)
pdc.predict(pd.DataFrame(np.random.random((10000, int(random.random() * 500000)))))

# pdc = PandasCompressor("test1")
# d = pdc.get_features(pd.read_csv("/home/pascal/Downloads/results-20190622-143623.csv"))
# f**k = just.glob("/home/pascal/egoroot/tradex/data/cmc/*.csv.gz") * 30