Example #1
    def load_image_texts(cls, glob_pattern_s, nrows=None):
        import pytesseract
        from PIL import Image

        if isinstance(glob_pattern_s, list):
            fnames = set()
            for glob_pattern in glob_pattern_s:
                fnames.update(set(just.glob(glob_pattern)))
            glob_pattern = "_".join(glob_pattern_s)
        else:
            glob_pattern = glob_pattern_s
            fnames = set(just.glob(glob_pattern))
        name = glob_pattern + "_" + normalize_name(cls.__name__)
        processed_files = get_processed_files(name)
        to_process = fnames.difference(processed_files)
        objects = []

        cache = get_cache("tesseract")

        if nrows is not None:
            if not to_process:
                return load_df(name).iloc[-nrows:]
            else:
                to_process = list(to_process)[-nrows:]
        if to_process:
            for fname in to_process:
                if fname in cache:
                    text = cache[fname]
                else:
                    try:
                        text = pytesseract.image_to_string(
                            Image.open(just.make_path(fname)))
                    except OSError as e:
                        print("ERR", fname, e)
                        continue
                    cache[fname] = text
                time = datetime_from_timestamp(os.path.getmtime(fname), "utc")
                data = {
                    "text": text,
                    "path": fname,
                    "title": fname.split("/")[-1],
                    "time": time
                }
                objects.append(data)
            data = pd.DataFrame(objects)
            if processed_files and nrows is None:
                data = pd.concat((data, load_df(name)))
            for x in ["time", "start", "end"]:
                if x in data:
                    data = data.sort_values(x)
                    break
            if nrows is None:
                save_df(data, name)
                save_processed_files(fnames | processed_files, name)
        else:
            data = load_df(name)
        if nrows is not None:
            data = data.iloc[-nrows:]
        return data
Example #2
 def load(cls, nrows=None, from_cache=True, **kwargs):
     dfs = [
         cls.load_data_file_modified_time(file_path, nrows=nrows, from_cache=from_cache)
         for file_path in just.glob("~/nostalgia_data/input/google/Takeout/Mail/*.mbox")
     ]
     dfs = [x for x in dfs if not x.empty]
     return cls(pd.concat(dfs))
Example #3
    def load(cls, file_glob="~/Downloads/timeline_data-*", nrows=None):
        df = pd.read_csv(max(just.glob(file_glob)), nrows=nrows)

        unique_locs = set([((y, z), x)
                           for x, y, z in zip(df.name, df.lat, df.lon)
                           if y != "nan"])

        excluded_transport_names = set(df[df.name == df.category].name)

        details_data = []
        for (latitude, longitude), name in unique_locs:
            d = get_results((latitude, longitude), name,
                            excluded_transport_names)
            if d is None:
                continue
            d["lat"] = latitude
            d["lon"] = longitude
            d["name"] = name
            details_data.append(d)

        details_data = pd.DataFrame(details_data)

        # all_loc = df.merge(details_data, on=["name", "lat", "lon"], how="outer")
        places = df.merge(details_data, on=["name", "lat", "lon"], how="inner")

        # all_loc = process(all_loc, excluded_transport_names)

        home_regex = "|".join(cls.home)
        work_regex = "|".join(cls.work)
        hometown_regex = "|".join(cls.hometown)
        places = process(places, excluded_transport_names, home_regex,
                         work_regex, hometown_regex)

        return cls(places)
Example #4
 def load_dataframe_per_json_file(cls, glob_pattern, key="", nrows=None):
     fnames = set(just.glob(glob_pattern))
     name = glob_pattern + "_" + normalize_name(cls.__name__)
     processed_files = get_processed_files(name)
     to_process = fnames.difference(processed_files)
     objects = []
     if nrows is not None:
         if not to_process:
             to_process = list(processed_files)[-nrows:]
         else:
             to_process = list(to_process)[-nrows:]
     if to_process:
         print("processing {} files".format(len(to_process)))
         for fname in to_process:
             data = read_array_of_dict_from_json(fname, key, nrows)
             data = cls.handle_dataframe_per_file(data, fname)
             if data is None:
                 continue
             objects.append(data)
         data = pd.concat(objects)
         if processed_files and nrows is None:
             data = pd.concat((data, load_df(name)))
         for x in ["time", "start", "end"]:
             if x in data:
                 data = data.sort_values(x)
                 break
         if nrows is None:
             save_df(data, name)
             save_processed_files(fnames | processed_files, name)
     else:
         data = load_df(name)
     if nrows is not None:
         data = data.iloc[-nrows:]
     return data
Example #5
    def load(cls, nrows=None):
        photo_glob = "~/nostalgia_data/input/google/Takeout/Google Photos/*/*"
        pics = []
        nrows = nrows or float("inf")
        rows = 0
        for fname in just.glob(photo_glob):
            if fname.endswith(".json"):
                continue
            try:
                meta = just.read(fname + ".json")
            except FileNotFoundError:
                continue
            if rows == nrows:
                break
            date = datetime.fromtimestamp(
                int(meta["photoTakenTime"]["timestamp"]), tz)
            latitude, longitude = format_latlng(
                (meta["geoData"]["latitude"],
                 meta["geoData"]["longitude"])).split(", ")
            title = meta["title"]
            pics.append({
                "path": "file://" + fname,
                "lat": latitude,
                "lon": longitude,
                "title": title,
                "time": date,
            })
            rows += 1

        pics = pd.DataFrame(pics)
        return cls(pics)
Example #6
 def load(cls, nrows=None):
     files = just.glob("~/nostalgia_data/input/mijn_chipkaart/*.csv")
     data = pd.concat([pd.read_csv(x, sep=";", nrows=nrows) for x in files])
     data["Bedrag"] = [float(x.replace(",", ".")) for x in data["Bedrag"]]
     data["Datum"] = [
         parse_date_tz(x + " " + y).start_date for x, y in zip(data.Datum, data["Check-uit"])
     ]
     return cls(data)
Example #7
def multi_read(star_path,
               no_exist=None,
               unknown_type="RAISE",
               ignore_exceptions=None):
    return {
        x: read(x, no_exist, unknown_type, ignore_exceptions)
        for x in glob(star_path)
    }
Example #8
 def load(cls, nrows=None):
     files = just.glob(
         "~/nostalgia_data/input/ns_chipkaart/reistransacties-*.xls")
     data = pd.concat([pd.read_excel(x, nrows=nrows)
                       for x in files]).iloc[:-1]
     data["Datum"] = data["Datum"].apply(parse_date_tz)
     data["Vertrek"], data["Bestemming"] = data["Omschrijving"].str.split(
         ":", 1).str[1].str.split(" - ", 1).str
     return cls(data)
Example #9
 def load(cls, nrows=None):
     files = "~/nostalgia_data/input/apple/*/Apple_Media_Services/Stores Activity/Account and Transaction History/Store*History.csv"
     dfs = [
         pd.read_csv(f, parse_dates=["Item Purchased Date"])
         for f in just.glob(files)
     ]
     applications = pd.merge(dfs[0],
                             dfs[1],
                             how="outer",
                             on="Item Purchased Date")
     return cls(applications)
Example #10
def get_training_xy(data_path="./tracktrack/"):
    positions = list(just.iread(data_path + "positions.jsonl"))
    images = [
        matplotlib.image.imread(x) for x in just.glob(data_path + "im*.png")
    ]
    m = min(len(images), len(positions))

    X = prep_images(images[-m:])
    positions = positions[-m:]
    y = np.array(positions)

    return X, y
Example #11
def test_multi_read():
    obj = ["a", "b"]
    fnames = ["a.txt", "b.txt"]
    just.multi_write(obj, fnames)
    try:
        full_names = just.glob("*.txt")
        multi_content = just.multi_read("*.txt")
        for o, f in zip(obj, fnames):
            full_name = [x for x in full_names if x.endswith(f)][0]
            assert multi_content[full_name] == o
    finally:
        for fname in fnames:
            os.remove(fname)
Example #12
 def load(cls, nrows=None):
     path = "~/nostalgia_data/input/samsung/samsunghealth_*/com.samsung.health.sleep_stage.*.csv"
     fname = just.glob(path)[0]
     data = cls.load_data_file_modified_time(fname, nrows=nrows, skiprows=1)
     data["start_time"] = [
         datetime_from_format(x, "%Y-%m-%d %H:%M:%S.%f")
         for x in data["start_time"]
     ]
     data["end_time"] = [
         datetime_from_format(x, "%Y-%m-%d %H:%M:%S.%f")
         for x in data["end_time"]
     ]
     return cls(data)
Example #13
    def load(cls, nrows=None):
        files = "~/nostalgia_data/input/apple/*/iCloudUsageData Set*.csv"

        icloud = pd.concat([
            pd.read_csv(f, skiprows=1, error_bad_lines=False)
            for f in just.glob(files)
        ])
        cutoff = icloud.loc[
            icloud.Date == "Photos: Delete photo/video from iCloud Photo Library"
        ].index.to_list()[0]
        icloud = icloud.iloc[:cutoff]
        icloud["File Capture Date"] = icloud["File Capture Date"].apply(
            lambda x: datetime_from_format(x, "%Y-%m-%d"))
        return cls(icloud)
Example #14
 def latest_file_is_historic(cls,
                             glob,
                             key_name="",
                             nrows=None,
                             from_cache=True):
     """
     Glob is for using a wildcard pattern, and the last created file will be loaded.
     See `load_data_file_modified_time` for further reference.
     Returns a pd.DataFrame
     """
     recent = max([x for x in just.glob(glob) if "(" not in x],
                  key=os.path.getctime)
     return cls.load_data_file_modified_time(recent, key_name, nrows,
                                             from_cache)
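A hedged usage sketch (the subclass name and glob below are hypothetical): pick the most recently created export in a Downloads folder, forwarding nrows to `load_data_file_modified_time`.

    # Hypothetical usage: load the newest matching export; files with "(" in
    # their name (browser duplicate downloads) are ignored.
    df = SomeSource.latest_file_is_historic("~/Downloads/export-*.csv", nrows=1000)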
Example #15
def load_from_download(ingest_glob, vendor, recent_only=True, delete_existing=True):
    ingest_files = just.glob(ingest_glob)
    if not ingest_files:
        raise ValueError(f"Nothing to extract using {ingest_glob} - Aborting")
    nostalgia_input = "~/nostalgia_data/input/{}".format(vendor)
    if delete_existing:
        just.remove(nostalgia_input, allow_recursive=True)
    elif just.exists(nostalgia_input):
        raise ValueError(f"Cannot overwrite path {nostalgia_input}, pass delete_existing=True")
    fnames = sorted(ingest_files, key=os.path.getctime)
    if recent_only:
        fnames = fnames[-1:]
    for fname in fnames:
        with zipfile.ZipFile(fname, 'r') as zip_ref:
            out = os.path.expanduser(nostalgia_input)
            print("unpacking from", fname, "to", out)
            zip_ref.extractall(out)
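A minimal usage sketch (glob and vendor are hypothetical): unpack the newest matching archive into ~/nostalgia_data/input/google, removing any previous extraction since delete_existing defaults to True.

    # Hypothetical call: extract only the most recently downloaded Takeout zip.
    load_from_download("~/Downloads/takeout-*.zip", vendor="google")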
Example #16
    def load(cls, nrows=None):
        files = just.glob("~/nostalgia_data/input/abnamro/*.xls")
        abn = pd.concat([
            pd.read_excel(x,
                          nrows=nrows,
                          converters={"transactiondate": convert_date})
            for x in files
        ])
        abn["preciseDate"] = abn["description"].apply(find_date)
        if abn.preciseDate.isnull().iloc[0]:
            abn.preciseDate.iloc[0] = abn.transactiondate.iloc[0]
        if abn.preciseDate.isnull().iloc[-1]:
            abn.preciseDate.iloc[-1] = abn.transactiondate.iloc[-1]

        abn.preciseDate = (
            abn.preciseDate.map(
                lambda x: time.mktime(x.timetuple()) if not pd.isna(x) else np.nan
            )
            .interpolate("values")
            .map(datetime_from_timestamp)
        )

        return cls(abn)
Example #17
 def load(cls, nrows=None):
     file_path = "~/nostalgia_data/input/facebook"
     chat_paths = just.glob(f"{file_path}/messages/inbox/*/message_1.json")
     face = pd.concat(
         [read_array_of_dict_from_json(chat_file, "messages", nrows) for chat_file in chat_paths]
     )
     face = face.reset_index(drop=True).sort_values("timestamp_ms")
     face["time"] = pd.to_datetime(face["timestamp_ms"], unit='ms', utc=True).dt.tz_convert(tz)
     face.drop("timestamp_ms", axis=1, inplace=True)
     face.loc[
         (face["type"] != "Generic") | face["content"].isnull(), "content"
     ] = "<INTERACTIVE>"
     face["path"] = ""
     if "photos" in face:
         not_null = face["photos"].notnull()
         face.loc[not_null, "path"] = [file_path + x[0]["uri"] for x in face[not_null]["photos"]]
     # if "photos" in face and isinstance(face["photos"]):
     #     face["path"] = [x.get("uri") if x else x for x in face["photos"]]
     return cls(face)
Example #18
    def load(cls, nrows=None, **kwargs):
        old_text = ""
        results = []
        nrows = nrows or float("inf")
        for file_path in just.glob("~/nostalgia_data/input/whatsapp/*.txt"):
            row = 0
            for line in just.iread(file_path):
                try:
                    time = datetime_from_format(line[:offset],
                                                "%d/%m/%Y, %H:%M - ")
                except ValueError:
                    old_text += line + "\n"
                    continue
                line = old_text + line[offset:]
                old_text = ""
                try:
                    if line.startswith(
                            "Messages to this chat and calls are now secured"):
                        continue
                    sender, text = line.split(": ", 1)
                except ValueError:
                    print("ERR", line)
                    continue
                if line:
                    if row > nrows:
                        break
                    row += 1
                    results.append((time, sender, text))

        df = pd.DataFrame(results, columns=["time", "sender", "text"])
        # hack "order" into minute data
        same_minute = df.time == df.shift(1).time
        seconds = []
        second_prop = 0
        for x in same_minute:
            if x:
                second_prop += 1
            else:
                second_prop = 0
            seconds.append(
                pd.Timedelta(seconds=60 * second_prop / (second_prop + 1)))
        df["time"] = df["time"] + pd.Series(seconds)
        return cls(df)
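The loop tagged "hack 'order' into minute data" spreads messages that share the same minute across that minute so that sorting on "time" preserves their original order: the k-th message in a run of identical timestamps gets an offset of 60 * k / (k + 1) seconds. A standalone sketch of the same arithmetic:

    import pandas as pd

    # four messages logged in the same minute: first of the run, then three repeats
    same_minute = [False, True, True, True]
    offsets, run = [], 0
    for repeat in same_minute:
        run = run + 1 if repeat else 0
        offsets.append(pd.Timedelta(seconds=60 * run / (run + 1)))
    print(offsets)  # 0s, 30s, 40s, 45s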
Example #19
 def load(cls, nrows=None):
     file_path = "~/nostalgia_data/input/facebook"
     chat_paths = just.glob(f"{file_path}/messages/inbox/*/message_*.json")
     face = [read_array_of_dict_from_json(chat_file, "messages", nrows) for chat_file in chat_paths]
     for df in face:
         senders = df.sender_name.unique()
         if len(senders) == 2:
             df.loc[df.sender_name == senders[0], "receiver_name"] = senders[1]
             df.loc[df.sender_name == senders[1], "receiver_name"] = senders[0]
         elif len(senders) > 2:
             df["receiver_name"] = ", ".join([x for x in senders if isinstance(x, str)])
     face = pd.concat(face)
     face = face.reset_index(drop=True).sort_values("timestamp_ms")
     face["time"] = pd.to_datetime(face["timestamp_ms"], unit="ms", utc=True).dt.tz_convert(tz)
     face.drop("timestamp_ms", axis=1, inplace=True)
     face.loc[(face["type"] != "Generic") | face["content"].isnull(), "content"] = "<INTERACTIVE>"
     face["path"] = ""
     if "photos" in face:
         not_null = face["photos"].notnull()
         face.loc[not_null, "path"] = [file_path + "/" + x[0]["uri"] for x in face[not_null]["photos"]]
     # if "photos" in face and isinstance(face["photos"]):
     #     face["path"] = [x.get("uri") if x else x for x in face["photos"]]
     return cls(face)
Example #20
import re
import os
import just
import json
import gzip
from auto_extract import parse_article
import tqdm
from utils import KEYS_TO_KEEP


def slug_url(url):
    pre_slug = re.sub(r"[-\s]+", "-", url)
    slugged_url = re.sub(r"[^\w\s-]", "", pre_slug).strip().lower()[-150:]
    return slugged_url
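# For illustration (not in the original script):
# slug_url("https://example.com/Some Page?q=1") -> "httpsexamplecomsome-pageq1"
# runs of whitespace/hyphens collapse to "-", other punctuation is dropped,
# and the result is lowercased and capped at its last 150 characters.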


for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia/old/html/*.json")):
    ctime = os.path.getctime(x)
    with open(x) as f:
        print("processing", x)
        data = json.load(f)
        html = data["html"]
        url = data["url"]
        slugged_url = slug_url(url)
        article = parse_article(html, url)
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        meta["creation_time"] = ctime
        meta["slugged_url"] = slugged_url
        html_path = "/home/pascal/.nostalgia/html/{}_{}.html.gz".format(
            ctime, slugged_url)
        with gzip.GzipFile(html_path, "w") as f:
            f.write(html.encode("utf8"))
Example #21
#   </TimeSpan>
# </Placemark>

import pandas as pd
import lxml.etree
from dateutil.parser import parse as date_parse
import just
from nostalgia.times import yesterday
from nostalgia.utils import format_latlng

N = {"klm": "http://www.opengis.net/kml/2.2"}

# cats = set()
days = []
for fname in sorted(
        just.glob("/home/pascal/Downloads/ghistory/history-*.kml")):
    # for fname in sorted(["/home/pascal/Downloads/ghistory/history-2018-07-25.kml"]):
    tree = lxml.etree.parse(fname)
    for placemark in tree.xpath("//klm:Placemark", namespaces=N):
        name = placemark.xpath("./klm:name/text()", namespaces=N)[0]
        address = placemark.xpath("./klm:address/text()", namespaces=N)
        address = address[0] if address else None
        start = date_parse(
            placemark.xpath("./klm:TimeSpan/klm:begin/text()",
                            namespaces=N)[0])
        end = date_parse(
            placemark.xpath("./klm:TimeSpan/klm:end/text()", namespaces=N)[0])
        category = placemark.xpath(
            "./klm:ExtendedData/klm:Data[@name='Category']/klm:value/text()",
            namespaces=N)
        category = category[0] if category else None
Example #22
import just
import pandas as pd
import numpy as np

fnames = sorted(just.glob("data/*"))
titles = []
for fname in fnames:
    titles.extend(list(pd.read_csv(fname)["title"]))

titles = np.array(titles)

class_list = ["Discussion", "News", "Project", "Research"]
classes = {x[0]: x for x in class_list}
class_ind = {x: n for n, x in enumerate(class_list)}

X_train = [x for x in titles if x[0] == "[" and x[2] == "]" and x[1] in classes]
y_train = np.array([classes[x[1]] for x in X_train])
y_train_multi = np.array(pd.get_dummies(y_train))

# hide class labels ;)
X_train = np.array([x[3:] for x in X_train])
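# For illustration: a title like "[N] Some headline" passes the filter above,
# gets the label "News", and becomes " Some headline" once the "[N]" prefix is sliced off.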


X_val = X_train[1::2]
y_val = y_train[1::2]
y_val_multi = y_train_multi[1::2]
X_train = X_train[0::2]
y_train = y_train[0::2]
y_train_multi = y_train_multi[0::2]

X_test = titles[np.array([x[0] != "[" for x in titles])]
Example #23
def record(data_name, data_path="~/tracktrack/"):
    path = just.make_path(data_path + data_name + "/")
    offset = len(just.glob(path + "/im*.png"))
    for image, it, mouse_pos in yield_images():
        cv2.imwrite(path + "/im_{}.png".format(it + offset), image)
        just.append(mouse_pos, path + "/positions.jsonl")
Example #24
import re
import os
import just
import json
import gzip
from auto_extract import parse_article
import tqdm
from utils import KEYS_TO_KEEP


def slug_url(url):
    pre_slug = re.sub(r"[-\s]+", "-", url)
    slugged_url = re.sub(r"[^\w\s-]", "", pre_slug).strip().lower()[-150:]
    return slugged_url


for x in tqdm.tqdm(
        just.glob("/home/pascal/.nostalgia_chrome/old/html/*.json")):
    ctime = os.path.getctime(x)
    with open(x) as f:
        print("processing", x)
        data = json.load(f)
        html = data["html"]
        url = data["url"]
        slugged_url = slug_url(url)
        article = parse_article(html, url)
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        meta["creation_time"] = ctime
        meta["slugged_url"] = slugged_url
        html_path = "/home/pascal/.nostalgia_chrome/html/{}_{}.html.gz".format(
            ctime, slugged_url)
        with gzip.GzipFile(html_path, "w") as f:
            f.write(html.encode("utf8"))
Example #25
import gzip
import os
import just
from auto_extract import parse_article
import tqdm
from urllib.parse import urlparse
import tldextract
from utils import KEYS_TO_KEEP

for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia/meta/v1/*.json")):
    print("processing", x)
    meta = just.read(x)
    if "extruct" in meta:
        print("skipping", x)
        continue
    # strip the ".json" suffix (rstrip would also remove trailing j/s/o/n characters)
    base = x.split("/")[-1][:-len(".json")]
    html_path = "/home/pascal/.nostalgia/html/" + base + ".html.gz"
    if os.path.exists(html_path):
        with gzip.GzipFile(html_path, "r") as f:
            html = f.read()
        article = parse_article(html, meta["url"])
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        just.write(meta, x)
        os.system("touch '{}' -r '{}'".format(x, html_path))
        print("done", x)
Example #26
    for unit in ['b', 'kb', 'mb', 'gb', 'tb']:
        if size < 1000.0:
            break
        size /= 1000.0
    return f"{size:.{decimal_places}f}{unit}"


dir_path = "/home/pascal/shrynk"
print("Processing:", dir_path)

old_total = 0
new_total = 0
if not dir_path.endswith("/"):
    dir_path += "/"

fnames = just.glob(dir_path + "*.json") + just.glob(dir_path + "*.csv")
random.shuffle(fnames)

for x in fnames:
    print(x)
    if x.endswith(".json"):
        shrynk = jc
        tp = "JSON"
    elif x.endswith(".csv"):
        shrynk = pdc
        tp = "CSV"
    old_size = os.path.getsize(x)
    old_total += old_size
    data = shrynk.load(x)
    new_file = shrynk.save(
        data,
Example #27
import json
import time
import os
import just
import numpy as np

import tqdm

from features import get_features
from compress import COMPRESSIONS

import just
from itertools import islice
import pandas as pd

g = (x for x in just.glob("/root/data/data/cmc/201*"))

# consume the glob lazily in batches of 500 files, bundle each batch into one
# brotli-compressed parquet file, then delete the originals
while True:
    paths = list(islice(g, 500))
    if not paths:
        break
    data = pd.concat([pd.read_csv(x) for x in paths])
    data.to_parquet(paths[0].split("/")[-1].split(".")[0] + ".parquet",
                    engine="pyarrow",
                    compression="brotli")
    for x in paths:
        os.remove(x)
    if len(paths) < 500:
        break

# data = pd.concat([pd.read_csv(x) for x in just.glob("/root/data/data/cmc/20181210*")[:100]])
# for engine, compression in [
#     ("csv", None),
Example #28
import gzip
import os
import just
from auto_extract import parse_article
import tqdm
from urllib.parse import urlparse
import tldextract
from utils import KEYS_TO_KEEP

for x in tqdm.tqdm(
        just.glob("/home/pascal/nostalgia_data_chrome/meta/v1/*.json")):
    print("processing", x)
    meta = just.read(x)
    if "extruct" in meta:
        print("skipping", x)
        continue

    # strip the ".json" suffix (rstrip would also remove trailing j/s/o/n characters)
    base = x.split("/")[-1][:-len(".json")]
    html_path = "/home/pascal/nostalgia_data_chrome/html/" + base + ".html.gz"
    if os.path.exists(html_path):
        with gzip.GzipFile(html_path, "r") as f:
            html = f.read()
        article = parse_article(html, meta["url"])
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        just.write(meta, x)
        os.system("touch '{}' -r '{}'".format(x, html_path))
        print("done", x)
Example #29
# for i, x in enumerate(tree.xpath("//a[text() = ' CSV ']/@href")):
#     if skip > i:
#         continue
#     skip = i
#     url = "https://vincentarelbundock.github.io/Rdatasets/" + x
#     try:
#         txt = s.get(url).text
#     except Exception as e:
#         print("error", e)
#         continue
#     with open(os.path.expanduser("~/rcsvs/" + secure_filename(x)), "w") as f:
#         f.write(txt)
#     print(i)

import just
import pandas as pd

dfs = []
for x in just.glob("/home/pascal/rcsvs/*"):
    try:
        dfs.append(pd.read_csv(x))
    except Exception as e:
        pass

from shrynk.pandas import save, infer, PandasCompressor

# pdc = PandasCompressor("default")
# pdc.run_benchmarks(dfs)

# original size 130M of all .csvs bundled in R packages (blind test-set)
# optimize=write_time        = 113M in 47.7s
# optimize=size              = 21M in 6m29s
# zip each file in folder    = 30M in 5.8s