def load_image_texts(cls, glob_pattern_s, nrows=None):
    import pytesseract
    from PIL import Image

    # Accept either a single glob pattern or a list of patterns.
    if isinstance(glob_pattern_s, list):
        fnames = set()
        for glob_pattern in glob_pattern_s:
            fnames.update(set(just.glob(glob_pattern)))
        glob_pattern = "_".join(glob_pattern_s)
    else:
        # The original referenced glob_pattern before assignment here.
        glob_pattern = glob_pattern_s
        fnames = set(just.glob(glob_pattern))
    name = glob_pattern + "_" + normalize_name(cls.__name__)
    processed_files = get_processed_files(name)
    to_process = fnames.difference(processed_files)
    objects = []
    cache = get_cache("tesseract")
    if nrows is not None:
        if not to_process:
            return load_df(name).iloc[-nrows:]
        to_process = list(to_process)[-nrows:]
    if to_process:
        for fname in to_process:
            if fname in cache:
                text = cache[fname]
            else:
                try:
                    text = pytesseract.image_to_string(Image.open(just.make_path(fname)))
                except OSError as e:
                    print("ERR", fname, e)
                    continue
                cache[fname] = text
            time = datetime_from_timestamp(os.path.getmtime(fname), "utc")
            data = {"text": text, "path": fname, "title": fname.split("/")[-1], "time": time}
            objects.append(data)
        data = pd.DataFrame(objects)
        if processed_files and nrows is None:
            data = pd.concat((data, load_df(name)))
        # Sort by the first time-like column that exists.
        for x in ["time", "start", "end"]:
            if x in data:
                data = data.sort_values(x)
                break
        if nrows is None:
            save_df(data, name)
            save_processed_files(fnames | processed_files, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
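# A minimal usage sketch (hypothetical subclass name; assumes the nostalgia
# base class exposes load_image_texts as a classmethod):
#
#   screenshots = Screenshots.load_image_texts("~/screenshots/*.png", nrows=50)
#   print(screenshots[["time", "title", "text"]].head())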
def load(cls, nrows=None, from_cache=True, **kwargs):
    dfs = [
        cls.load_data_file_modified_time(file_path, nrows=nrows, from_cache=from_cache)
        for file_path in just.glob("~/nostalgia_data/input/google/Takeout/Mail/*.mbox")
    ]
    dfs = [x for x in dfs if not x.empty]
    return cls(pd.concat(dfs))
def load(cls, file_glob="~/Downloads/timeline_data-*", nrows=None):
    df = pd.read_csv(max(just.glob(file_glob)), nrows=nrows)
    unique_locs = set(
        [((y, z), x) for x, y, z in zip(df.name, df.lat, df.lon) if y != "nan"]
    )
    excluded_transport_names = set(df[df.name == df.category].name)
    details_data = []
    for (latitude, longitude), name in unique_locs:
        d = get_results((latitude, longitude), name, excluded_transport_names)
        if d is None:
            continue
        d["lat"] = latitude
        d["lon"] = longitude
        d["name"] = name
        details_data.append(d)
    details_data = pd.DataFrame(details_data)
    # all_loc = df.merge(details_data, on=["name", "lat", "lon"], how="outer")
    places = df.merge(details_data, on=["name", "lat", "lon"], how="inner")
    # all_loc = process(all_loc, excluded_transport_names)
    home_regex = "|".join(cls.home)
    work_regex = "|".join(cls.work)
    hometown_regex = "|".join(cls.hometown)
    places = process(places, excluded_transport_names, home_regex, work_regex, hometown_regex)
    return cls(places)
def load_dataframe_per_json_file(cls, glob_pattern, key="", nrows=None):
    fnames = set(just.glob(glob_pattern))
    name = glob_pattern + "_" + normalize_name(cls.__name__)
    processed_files = get_processed_files(name)
    to_process = fnames.difference(processed_files)
    objects = []
    if nrows is not None:
        if not to_process:
            to_process = list(processed_files)[-nrows:]
        else:
            to_process = list(to_process)[-nrows:]
    if to_process:
        print("processing {} files".format(len(to_process)))
        for fname in to_process:
            data = read_array_of_dict_from_json(fname, key, nrows)
            data = cls.handle_dataframe_per_file(data, fname)
            if data is None:
                continue
            objects.append(data)
        data = pd.concat(objects)
        if processed_files and nrows is None:
            data = pd.concat((data, load_df(name)))
        for x in ["time", "start", "end"]:
            if x in data:
                data = data.sort_values(x)
                break
        if nrows is None:
            save_df(data, name)
            save_processed_files(fnames | processed_files, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
def load(cls, nrows=None):
    photo_glob = "~/nostalgia_data/input/google/Takeout/Google Photos/*/*"
    pics = []
    nrows = nrows or float("inf")
    rows = 0
    for fname in just.glob(photo_glob):
        if fname.endswith(".json"):
            continue
        try:
            meta = just.read(fname + ".json")
        except FileNotFoundError:
            continue
        if rows == nrows:
            break
        date = datetime.fromtimestamp(int(meta["photoTakenTime"]["timestamp"]), tz)
        latitude, longitude = format_latlng(
            (meta["geoData"]["latitude"], meta["geoData"]["longitude"])
        ).split(", ")
        title = meta["title"]
        pics.append({
            "path": "file://" + fname,
            "lat": latitude,
            "lon": longitude,
            "title": title,
            "time": date,
        })
        rows += 1
    pics = pd.DataFrame(pics)
    return cls(pics)
def load(cls, nrows=None): files = just.glob("~/nostalgia_data/input/mijn_chipkaart/*.csv") data = pd.concat([pd.read_csv(x, sep=";", nrows=nrows) for x in files]) data["Bedrag"] = [float(x.replace(",", ".")) for x in data["Bedrag"]] data["Datum"] = [ parse_date_tz(x + " " + y).start_date for x, y in zip(data.Datum, data["Check-uit"]) ] return cls(data)
def multi_read(star_path, no_exist=None, unknown_type="RAISE", ignore_exceptions=None):
    return {x: read(x, no_exist, unknown_type, ignore_exceptions) for x in glob(star_path)}
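# A minimal usage sketch (assumes two JSON files exist under ./config/):
#
#   contents = multi_read("config/*.json")
#   # -> {"config/a.json": {...}, "config/b.json": {...}}, keyed by matched path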
def load(cls, nrows=None): files = just.glob( "~/nostalgia_data/input/ns_chipkaart/reistransacties-*.xls") data = pd.concat([pd.read_excel(x, nrows=nrows) for x in files]).iloc[:-1] data["Datum"] = data["Datum"].apply(parse_date_tz) data["Vertrek"], data["Bestemming"] = data["Omschrijving"].str.split( ":", 1).str[1].str.split(" - ", 1).str return cls(data)
def load(cls, nrows=None): files = "~/nostalgia_data/input/apple/*/Apple_Media_Services/Stores Activity/Account and Transaction History/Store*History.csv" dfs = [ pd.read_csv(f, parse_dates=["Item Purchased Date"]) for f in just.glob(files) ] applications = pd.merge(dfs[0], dfs[1], how="outer", on="Item Purchased Date") return cls(applications)
def get_training_xy(data_path="./tracktrack/"):
    positions = list(just.iread(data_path + "positions.jsonl"))
    images = [matplotlib.image.imread(x) for x in just.glob(data_path + "im*.png")]
    # Keep only the trailing, equally sized part of both sequences.
    m = min(len(images), len(positions))
    X = prep_images(images[-m:])
    positions = positions[-m:]
    y = np.array(positions)
    return X, y
def test_multi_read():
    obj = ["a", "b"]
    fnames = ["a.txt", "b.txt"]
    just.multi_write(obj, fnames)
    try:
        full_names = just.glob("*.txt")
        multi_content = just.multi_read("*.txt")
        for o, f in zip(obj, fnames):
            full_name = [x for x in full_names if x.endswith(f)][0]
            assert multi_content[full_name] == o
    finally:
        for fname in fnames:
            os.remove(fname)
def load(cls, nrows=None):
    path = "~/nostalgia_data/input/samsung/samsunghealth_*/com.samsung.health.sleep_stage.*.csv"
    fname = just.glob(path)[0]
    data = cls.load_data_file_modified_time(fname, nrows=nrows, skiprows=1)
    for col in ["start_time", "end_time"]:
        data[col] = [datetime_from_format(x, "%Y-%m-%d %H:%M:%S.%f") for x in data[col]]
    return cls(data)
def load(cls, nrows=None): files = "~/nostalgia_data/input/apple/*/iCloudUsageData Set*.csv" icloud = pd.concat([ pd.read_csv(f, skiprows=1, error_bad_lines=False) for f in just.glob(files) ]) icloud = icloud.iloc[:icloud.loc[ icloud.Date == "Photos: Delete photo/video from iCloud Photo Library"].index. to_list()[0]] icloud["File Capture Date"] = icloud["File Capture Date"].apply( lambda x: datetime_from_format(x, "%Y-%m-%d")) return cls(icloud)
def latest_file_is_historic(cls, glob, key_name="", nrows=None, from_cache=True):
    """
    Glob is a wildcard pattern; the most recently created matching file is loaded.
    See `load_data_file_modified_time` for further reference.
    Returns a pd.DataFrame.
    """
    # Skip duplicate downloads such as "file (1).zip".
    recent = max([x for x in just.glob(glob) if "(" not in x], key=os.path.getctime)
    return cls.load_data_file_modified_time(recent, key_name, nrows, from_cache)
def load_from_download(ingest_glob, vendor, recent_only=True, delete_existing=True):
    ingest_files = just.glob(ingest_glob)
    if not ingest_files:
        raise ValueError(f"Nothing to extract using {ingest_glob} - Aborting")
    nostalgia_input = "~/nostalgia_data/input/{}".format(vendor)
    if delete_existing:
        just.remove(nostalgia_input, allow_recursive=True)
    elif just.exists(nostalgia_input):
        raise ValueError(f"Cannot overwrite path {nostalgia_input}, pass delete_existing=True")
    fnames = sorted(ingest_files, key=os.path.getctime)
    if recent_only:
        fnames = fnames[-1:]
    for fname in fnames:
        with zipfile.ZipFile(fname, 'r') as zip_ref:
            out = os.path.expanduser(nostalgia_input)
            print("unpacking from", fname, "to", out)
            zip_ref.extractall(out)
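# A minimal usage sketch (hypothetical download path; unpacks the newest
# matching Takeout zip into ~/nostalgia_data/input/google):
#
#   load_from_download("~/Downloads/takeout-*.zip", vendor="google")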
def load(cls, nrows=None): files = just.glob("~/nostalgia_data/input/abnamro/*.xls") abn = pd.concat([ pd.read_excel(x, nrows=nrows, converters={"transactiondate": convert_date}) for x in files ]) abn["preciseDate"] = abn["description"].apply(find_date) if abn.preciseDate.isnull().iloc[0]: abn.preciseDate.iloc[0] = abn.transactiondate.iloc[0] if abn.preciseDate.isnull().iloc[-1]: abn.preciseDate.iloc[-1] = abn.transactiondate.iloc[-1] abn.preciseDate = (abn.preciseDate.map( lambda x: time.mktime(pd.datetime.timetuple(x)) if not pd.isna(x) else np.nan).interpolate("values").map(datetime_from_timestamp)) return cls(abn)
def load(cls, nrows=None): file_path = "~/nostalgia_data/input/facebook" chat_paths = just.glob(f"{file_path}/messages/inbox/*/message_1.json") face = pd.concat( [read_array_of_dict_from_json(chat_file, "messages", nrows) for chat_file in chat_paths] ) face = face.reset_index(drop=True).sort_values("timestamp_ms") face["time"] = pd.to_datetime(face["timestamp_ms"], unit='ms', utc=True).dt.tz_convert(tz) face.drop("timestamp_ms", axis=1, inplace=True) face.loc[ (face["type"] != "Generic") | face["content"].isnull(), "content" ] = "<INTERACTIVE>" face["path"] = "" if "photos" in face: not_null = face["photos"].notnull() face.loc[not_null, "path"] = [file_path + x[0]["uri"] for x in face[not_null]["photos"]] # if "photos" in face and isinstance(face["photos"]): # face["path"] = [x.get("uri") if x else x for x in face["photos"]] return cls(face)
def load(cls, nrows=None, **kwargs):
    # Length of the leading "25/12/2019, 13:37 - " timestamp prefix
    # (assumed here; in the original this is likely a module-level constant).
    offset = len("25/12/2019, 13:37 - ")
    old_text = ""
    results = []
    nrows = nrows or float("inf")
    for file_path in just.glob("~/nostalgia_data/input/whatsapp/*.txt"):
        row = 0
        for line in just.iread(file_path):
            try:
                time = datetime_from_format(line[:offset], "%d/%m/%Y, %H:%M - ")
            except ValueError:
                # Continuation of a multi-line message
                old_text += line + "\n"
                continue
            line = old_text + line[offset:]
            old_text = ""
            try:
                if line.startswith("Messages to this chat and calls are now secured"):
                    continue
                sender, text = line.split(": ", 1)
            except ValueError:
                print("ERR", line)
                continue
            if line:
                if row > nrows:
                    break
                row += 1
                results.append((time, sender, text))
    df = pd.DataFrame(results, columns=["time", "sender", "text"])
    # hack "order" into minute data: WhatsApp timestamps have minute resolution,
    # so spread consecutive same-minute messages across the minute to keep order.
    same_minute = df.time == df.shift(1).time
    seconds = []
    second_prop = 0
    for x in same_minute:
        if x:
            second_prop += 1
        else:
            second_prop = 0
        seconds.append(pd.Timedelta(seconds=60 * second_prop / (second_prop + 1)))
    df["time"] = df["time"] + pd.Series(seconds)
    return cls(df)
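# Worked example of the same-minute spreading above: three messages at 13:37
# get offsets of 0s, 60*(1/2)=30s, and 60*(2/3)=40s, so they stay ordered and
# never spill into 13:38 (the offset approaches but never reaches 60s).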
def load(cls, nrows=None): file_path = "~/nostalgia_data/input/facebook" chat_paths = just.glob(f"{file_path}/messages/inbox/*/message_*.json") face = [read_array_of_dict_from_json(chat_file, "messages", nrows) for chat_file in chat_paths] for df in face: senders = df.sender_name.unique() if len(senders) == 2: df.loc[df.sender_name == senders[0], "receiver_name"] = senders[1] df.loc[df.sender_name == senders[1], "receiver_name"] = senders[0] elif len(senders) > 2: df["receiver_name"] = ", ".join([x for x in senders if isinstance(x, str)]) face = [x for x in face] face = pd.concat(face) face = face.reset_index(drop=True).sort_values("timestamp_ms") face["time"] = pd.to_datetime(face["timestamp_ms"], unit="ms", utc=True).dt.tz_convert(tz) face.drop("timestamp_ms", axis=1, inplace=True) face.loc[(face["type"] != "Generic") | face["content"].isnull(), "content"] = "<INTERACTIVE>" face["path"] = "" if "photos" in face: not_null = face["photos"].notnull() face.loc[not_null, "path"] = [file_path + "/" + x[0]["uri"] for x in face[not_null]["photos"]] # if "photos" in face and isinstance(face["photos"]): # face["path"] = [x.get("uri") if x else x for x in face["photos"]] return cls(face)
import re
import os
import just
import json
import gzip
from auto_extract import parse_article
import tqdm
from utils import KEYS_TO_KEEP


def slug_url(url):
    pre_slug = re.sub(r"[-\s]+", "-", url)
    slugged_url = re.sub(r"[^\w\s-]", "", pre_slug).strip().lower()[-150:]
    return slugged_url


for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia/old/html/*.json")):
    ctime = os.path.getctime(x)
    with open(x) as f:
        print("processing", x)
        data = json.load(f)
    html = data["html"]
    url = data["url"]
    slugged_url = slug_url(url)
    article = parse_article(html, url)
    meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
    meta["creation_time"] = ctime
    meta["slugged_url"] = slugged_url
    html_path = "/home/pascal/.nostalgia/html/{}_{}.html.gz".format(ctime, slugged_url)
    with gzip.GzipFile(html_path, "w") as f:
        f.write(html.encode("utf8"))
#     </TimeSpan>
# </Placemark>
import pandas as pd
import lxml.etree
from dateutil.parser import parse as date_parse
import just
from nostalgia.times import yesterday
from nostalgia.utils import format_latlng

# Misspelled-but-consistent "klm" prefix for the KML namespace.
N = {"klm": "http://www.opengis.net/kml/2.2"}

# cats = set()
days = []
for fname in sorted(just.glob("/home/pascal/Downloads/ghistory/history-*.kml")):
    # for fname in sorted(["/home/pascal/Downloads/ghistory/history-2018-07-25.kml"]):
    tree = lxml.etree.parse(fname)
    for placemark in tree.xpath("//klm:Placemark", namespaces=N):
        name = placemark.xpath("./klm:name/text()", namespaces=N)[0]
        address = placemark.xpath("./klm:address/text()", namespaces=N)
        address = address[0] if address else None
        start = date_parse(
            placemark.xpath("./klm:TimeSpan/klm:begin/text()", namespaces=N)[0]
        )
        end = date_parse(
            placemark.xpath("./klm:TimeSpan/klm:end/text()", namespaces=N)[0]
        )
        category = placemark.xpath(
            "./klm:ExtendedData/klm:Data[@name='Category']/klm:value/text()",
            namespaces=N,
        )
        category = category[0] if category else None
import just
import pandas as pd
import numpy as np

fnames = sorted(just.glob("data/*"))
titles = []
for fname in fnames:
    titles.extend(list(pd.read_csv(fname)["title"]))
titles = np.array(titles)

class_list = ["Discussion", "News", "Project", "Research"]
# Map the first letter of a tag like "[D]" to its class name and index.
classes = {x[0]: x for x in class_list}
class_ind = {x: n for n, x in enumerate(class_list)}

X_train = [x for x in titles if x[0] == "[" and x[2] == "]" and x[1] in classes]
y_train = np.array([classes[x[1]] for x in X_train])
y_train_multi = np.array(pd.get_dummies(y_train))
# hide class labels ;)
X_train = np.array([x[3:] for x in X_train])

# Even/odd split into validation and training halves.
X_val = X_train[1::2]
y_val = y_train[1::2]
y_val_multi = y_train_multi[1::2]
X_train = X_train[0::2]
y_train = y_train[0::2]
y_train_multi = y_train_multi[0::2]

X_test = titles[np.array([x[0] != "[" for x in titles])]
def record(data_name, data_path="~/tracktrack/"):
    path = just.make_path(data_path + data_name + "/")
    # Continue numbering after any previously recorded frames.
    offset = len(just.glob(path + "/im*.png"))
    for image, it, mouse_pos in yield_images():
        cv2.imwrite(path + "/im_{}.png".format(it + offset), image)
        just.append(mouse_pos, path + "/positions.jsonl")
import re
import os
import just
import json
import gzip
from auto_extract import parse_article
import tqdm
from utils import KEYS_TO_KEEP


def slug_url(url):
    pre_slug = re.sub(r"[-\s]+", "-", url)
    slugged_url = re.sub(r"[^\w\s-]", "", pre_slug).strip().lower()[-150:]
    return slugged_url


for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia_chrome/old/html/*.json")):
    ctime = os.path.getctime(x)
    with open(x) as f:
        print("processing", x)
        data = json.load(f)
    html = data["html"]
    url = data["url"]
    slugged_url = slug_url(url)
    article = parse_article(html, url)
    meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
    meta["creation_time"] = ctime
    meta["slugged_url"] = slugged_url
    html_path = "/home/pascal/.nostalgia_chrome/html/{}_{}.html.gz".format(ctime, slugged_url)
    with gzip.GzipFile(html_path, "w") as f:
        f.write(html.encode("utf8"))
import gzip
import os
import just
from auto_extract import parse_article
import tqdm
from urllib.parse import urlparse
import tldextract
from utils import KEYS_TO_KEEP

for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia/meta/v1/*.json")):
    print("processing", x)
    meta = just.read(x)
    if "extruct" in meta:
        print("skipping", x)
        continue
    # rstrip(".json") would strip any trailing ., j, s, o, n characters;
    # slice off the exact suffix instead.
    html_path = "/home/pascal/.nostalgia/html/" + x.split("/")[-1][:-len(".json")] + ".html.gz"
    if os.path.exists(html_path):
        with gzip.GzipFile(html_path, "r") as f:
            html = f.read()
        article = parse_article(html, meta["url"])
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        just.write(meta, x)
        # Preserve the html file's timestamp on the rewritten metadata file.
        os.system("touch '{}' -r '{}'".format(x, html_path))
        print("done", x)
import os
import random

import just

# This snippet starts mid-function; the header below is reconstructed
# (the name human_size is assumed, the parameters follow from the body).
def human_size(size, decimal_places=2):
    for unit in ['b', 'kb', 'mb', 'gb', 'tb']:
        if size < 1000.0:
            break
        size /= 1000.0
    return f"{size:.{decimal_places}f}{unit}"


dir_path = "/home/pascal/shrynk"
print("Processing:", dir_path)
old_total = 0
new_total = 0
if not dir_path.endswith("/"):
    dir_path += "/"
fnames = just.glob(dir_path + "*.json") + just.glob(dir_path + "*.csv")
random.shuffle(fnames)
for x in fnames:
    print(x)
    if x.endswith(".json"):
        shrynk = jc  # assumed: a shrynk JSON compressor instance defined elsewhere
        tp = "JSON"
    elif x.endswith(".csv"):
        shrynk = pdc  # assumed: a shrynk pandas compressor instance defined elsewhere
        tp = "CSV"
    old_size = os.path.getsize(x)
    old_total += old_size
    data = shrynk.load(x)
    new_file = shrynk.save(
        data,
import json
import time
import os
import just
import numpy as np
import tqdm
from features import get_features
from compress import COMPRESSIONS
from itertools import islice
import pandas as pd

# Batch loose CSVs into 500-file parquet chunks, then delete the originals.
# The original looped on `while g:` (always truthy for a generator) and
# checked the undefined name `elements`; both are fixed below.
g = (x for x in just.glob("/root/data/data/cmc/201*"))
while True:
    paths = list(islice(g, 500))
    if not paths:
        break
    data = pd.concat([pd.read_csv(x) for x in paths])
    data.to_parquet(
        paths[0].split("/")[-1].split(".")[0] + ".parquet",
        engine="pyarrow",
        compression="brotli",
    )
    for x in paths:
        os.remove(x)
    # Fewer than 500 files means the generator is exhausted.
    if len(paths) < 500:
        break

# data = pd.concat([pd.read_csv(x) for x in just.glob("/root/data/data/cmc/20181210*")[:100]])
# for engine, compression in [
#     ("csv", None),
import gzip
import os
import just
from auto_extract import parse_article
import tqdm
from urllib.parse import urlparse
import tldextract
from utils import KEYS_TO_KEEP

for x in tqdm.tqdm(just.glob("/home/pascal/nostalgia_data_chrome/meta/v1/*.json")):
    print("processing", x)
    meta = just.read(x)
    if "extruct" in meta:
        print("skipping", x)
        continue
    # Slice off the exact ".json" suffix (rstrip would eat trailing j/s/o/n).
    html_path = (
        "/home/pascal/nostalgia_data_chrome/html/"
        + x.split("/")[-1][:-len(".json")]
        + ".html.gz"
    )
    if os.path.exists(html_path):
        with gzip.GzipFile(html_path, "r") as f:
            html = f.read()
        article = parse_article(html, meta["url"])
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        just.write(meta, x)
        os.system("touch '{}' -r '{}'".format(x, html_path))
        print("done", x)
# for i, x in enumerate(tree.xpath("//a[text() = ' CSV ']/@href")):
#     if skip > i:
#         continue
#     skip = i
#     url = "https://vincentarelbundock.github.io/Rdatasets/" + x
#     try:
#         txt = s.get(url).text
#     except Exception as e:
#         print("error", e)
#         continue
#     with open(os.path.expanduser("~/rcsvs/" + secure_filename(x)), "w") as f:
#         f.write(txt)
#     print(i)

dfs = []
for x in just.glob("/home/pascal/rcsvs/*"):
    try:
        dfs.append(pd.read_csv(x))
    except Exception:
        pass

from shrynk.pandas import save, infer, PandasCompressor

# pdc = PandasCompressor("default")
# pdc.run_benchmarks(dfs)
# original size 130M of all .csvs bundled in R packages (blind test-set)
# optimize=write_time = 113M in 47.7s
# optimize=size = 21M in 6m29s
# zip each file in folder = 30M in 5.8s