def test_csv_iread_problem_lines():
    fname = "testobj.csv"
    obj = ["a"] + [['"a"', '"b"']]
    just.write(obj, "testobj.csv")
    try:
        just.read(fname)
    except ValueError:
        assert True
    finally:
        os.remove(fname)
def read_array_of_dict_from_json(fname, key_name=None, nrows=None):
    """
    This is an iterative way to read a json file without having to construct
    Python elements for everything. It can be a lot faster.

    Example data:
    {"participants": [{"name": "a"}, {"name": "b"}],
     "messages": [{"sender": "a", "time": 123}, {"sender": "b", "time": 124}]}

    Function call:
    read_array_of_dict_from_json(fname, "messages", nrows=1)

    Returns:
    pd.DataFrame([{"sender": "a", "time": 123}])
    """
    if fname.endswith(".jsonl"):
        if not key_name:
            return pd.read_json(fname, lines=True)
        return pd.DataFrame([x[key_name] for x in just.read(fname)])
    if nrows is None:
        if not key_name:
            return pd.read_json(fname)
        return pd.DataFrame(just.read(fname)[key_name])
    import ijson

    with open(just.make_path(fname)) as f:
        parser = ijson.parse(f)
        capture = False
        rows = []
        row = {}
        map_key = ""
        num = 0
        for prefix, event, value in parser:
            if num >= nrows:  # stop once nrows rows have been collected
                break
            if prefix == key_name and event == "start_array":
                capture = True
            if not capture:
                continue
            if event == "start_map":
                continue
            elif event == "map_key":
                map_key = value
            elif event == "end_map":
                rows.append(row)
                row = {}
                num += 1
            elif map_key:
                row[map_key] = value
    return pd.DataFrame(rows)
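# Hedged usage sketch (not from the original source): the file name and the
# payload below are invented for illustration, and the streaming nrows path
# assumes the ijson package is installed.
sample = {
    "participants": [{"name": "a"}, {"name": "b"}],
    "messages": [{"sender": "a", "time": 123}, {"sender": "b", "time": 124}],
}
just.write(sample, "messages.json")
df = read_array_of_dict_from_json("messages.json", key_name="messages", nrows=1)
print(df)  # a single-row DataFrame with columns "sender" and "time"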
def get_model(model_name=None, models_path="models/"):
    if model_name:
        model_file = models_path + model_name + ".json"
        weights_file = models_path + model_name + ".h5"
        if os.path.isfile(model_file):
            model = model_from_json(json.dumps(just.read(model_file)))
            # load weights into new model
            model.load_weights(weights_file)
            print("Loaded model from disk")
            return model
    print("Cannot read model, creating fresh one")
    # Create the model
    model = Sequential()
    model.add(BatchNormalization(input_shape=(3, 72, 128)))
    model.add(Convolution2D(32, 3, 3, border_mode='same', activation='relu'))
    model.add(Activation('relu'))
    model.add(Dropout(0.15))
    model.add(BatchNormalization())
    model.add(Convolution2D(32, 3, 3, activation='relu', border_mode='same'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    # to prediction
    model.add(Dense(2))
    model.add(Activation('linear'))
    model.compile(loss='mean_squared_error', optimizer="adam")
    return model
def load(cls, nrows=None):
    photo_glob = "~/nostalgia_data/input/google/Takeout/Google Photos/*/*"
    pics = []
    nrows = nrows or float("inf")
    rows = 0
    for fname in just.glob(photo_glob):
        if fname.endswith(".json"):
            continue
        try:
            meta = just.read(fname + ".json")
        except FileNotFoundError:
            continue
        if rows == nrows:
            break
        date = datetime.fromtimestamp(int(meta["photoTakenTime"]["timestamp"]), tz)
        latitude, longitude = format_latlng(
            (meta["geoData"]["latitude"], meta["geoData"]["longitude"])
        ).split(", ")
        title = meta["title"]
        pics.append(
            {
                "path": "file://" + fname,
                "lat": latitude,
                "lon": longitude,
                "title": title,
                "time": date,
            }
        )
        rows += 1
    pics = pd.DataFrame(pics)
    return cls(pics)
def file_modified_since_last(fname, name):
    path = just.make_path("~/nostalgia_data/seen/" + slugify(name) + ".json")
    last_run_mt = float(just.read(path, no_exist=0))
    modified_time = os.path.getmtime(fname)
    if last_run_mt != modified_time:
        return modified_time
    else:
        return None
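# Hedged usage sketch (file and name invented): a non-None return means the
# file changed since the last recorded run, so the caller reprocesses it and
# then stores the returned mtime under the same slugified "seen" path.
mt = file_modified_since_last("/home/user/export.json", "my-loader")
if mt is not None:
    # ... reprocess the file here ...
    just.write(mt, "~/nostalgia_data/seen/" + slugify("my-loader") + ".json")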
def view(path):
    if path.endswith("gz"):
        from requests_viewer import view_html

        view_html(just.read(path))
    else:
        import webbrowser

        webbrowser.open("file://" + path)
def __init__(self, model_name, encoder_decoder=None, hidden_units=128, base_path="models/"):
    self.model_name = model_name
    self.h5_path = base_path + model_name + ".h5"
    self.pkl_path = base_path + model_name + ".pkl"
    self.model = None
    self.hidden_units = hidden_units
    if encoder_decoder is None:
        self.encoder_decoder = just.read(self.pkl_path)
    else:
        self.encoder_decoder = encoder_decoder
def load(cls, file_path="~/nostalgia_data/input/shazam.json", nrows=None):
    shazam = pd.DataFrame(
        [
            (
                datetime_from_timestamp(x["timestamp"], x["timezone"]),
                x["track"]["heading"]["title"],
                x["track"]["heading"]["subtitle"],
            )
            for x in just.read(file_path)["tags"]
        ],
        columns=["time", "title", "artist"],
    )
    return cls(shazam)
def check_seen(name, value):
    path = "~/nostalgia_data/seen/" + slugify(name) + ".json"
    is_new = True
    res = just.read(path, no_exist=False)
    if res:
        if isinstance(value, tuple):
            value = list(value)
        is_new = res != value
    if is_new:
        just.write(value, path)
    return is_new
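# Hedged sketch of the check_seen contract (name and values invented): the
# first call with a new value persists it and returns True; repeating the
# same value returns False, so it doubles as a cheap change detector.
assert check_seen("indexer", (1, "abc")) is True   # nothing stored yet
assert check_seen("indexer", (1, "abc")) is False  # unchanged, already seen
assert check_seen("indexer", (2, "abc")) is True   # value changed, re-stored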
def get_title(x):
    if not x.get("domain"):
        return ""
    if x["path"] in CACHE:
        return CACHE[x["path"]]
    # tree = lxml.html.fromstring(just.read(x["path"]))
    # title = tree.xpath("/html/head/title/text()") or tree.xpath("//title/text()") or [""]
    # destroy_tree(tree)
    # title = title[0]
    match = re.search("<title>([^<]+)</title", just.read(x["path"]), re.MULTILINE)
    title = match.groups()[0].strip() if match is not None else ""
    CACHE[x["path"]] = title
    return title
def __init__(self, model_name, encoder_decoder=None, hidden_units=128, base_path="models/"):
    self.model_name = model_name  # model name
    self.h5_path = base_path + model_name + ".h5"  # path of the model .h5 file
    self.pkl_path = base_path + model_name + ".pkl"  # path of the data .pkl file
    self.model = None  # the model
    self.hidden_units = hidden_units  # hidden units
    if encoder_decoder is None:  # check whether an encoder/decoder was given
        self.encoder_decoder = just.read(self.pkl_path)
    else:
        self.encoder_decoder = encoder_decoder
def search():
    global matches
    word = "".join(letters).lower()
    matches = [x for x in emojis if word in x.lower()] or emojis[:len(imgs)]
    for i in range(len(imgs)):
        color = active_color if selected_index == i else None
        if i in range(len(matches)):
            uni, txt, code = matches[i].split("| ")
            if code not in cache:
                cache[code] = just.read(f"~/emoji_picker_images/{code}.base64", unknown_type="txt")
            imgs[i].update(data=cache[code])
        else:
            txt = ""
            imgs[i].update(data="")
        inps[i].update(value=txt, text_color=color)
def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    art = parse_article(html, x["url"])
    linked_data = get_linked_data_md(art)
    if linked_data is None:
        linked_data = get_linked_data_jd(art)
    CACHE[path] = linked_data
    return linked_data
def ensure_access_token(self):
    if self.access_token:
        return
    now = int(time.time())
    if just.exists(self.ACCESS_TOKEN_FILE):
        access_token = just.read(self.ACCESS_TOKEN_FILE, unknown_type="json")
        if now > access_token['time'] + access_token['expires_in']:
            log.info('Cached access token is expired')
            os.unlink(self.ACCESS_TOKEN_FILE)
        else:
            self.access_token = access_token
            return
    self.access_token = self.get_access_token()
    self.access_token['time'] = now
    just.write(self.access_token, self.ACCESS_TOKEN_FILE, unknown_type="json")
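# Hedged usage sketch (credentials elided, token field values invented): after
# the call, the token is cached on disk in ACCESS_TOKEN_FILE as the provider
# response plus a "time" stamp, so expiry is checked as time + expires_in.
fa = FitbitAuth(client_id="...", client_secret="...")
fa.ensure_access_token()
# fa.access_token now looks roughly like:
# {"access_token": "<bearer token>", "expires_in": 28800, "time": 1588888888}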
def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    art = parse_article(html, x["url"])
    if "youtube" not in art.domain:
        return None
    title = re.sub(" - YouTube$", "", art.tree.xpath("//title/text()")[0])
    if title == "YouTube":
        CACHE[path] = None
        return None
    if not title:
        return None
    vc = art.tree.xpath("//span[contains(@class, 'view-count')]/text()")
    vc = re.sub("[^0-9]", "", vc[0]) if vc else None
    watch_part = urllib.parse.parse_qs(urllib.parse.urlparse(x["url"]).query)["v"]
    if watch_part:
        image = "http://i3.ytimg.com/vi/{}/maxresdefault.jpg".format(watch_part[0])
    else:
        image = None
    channel = art.tree.xpath("//ytd-video-owner-renderer//a/text()")
    if not channel:
        channel = art.tree.xpath("//ytd-channel-name//a/text()")
    channel = " ".join(channel)
    linked_data = {
        "title": title,
        "type": "video",
        "source": "youtube",
        "image": image,
        "view_count": vc,
        "channel": channel,
    }
    CACHE[path] = linked_data
    return linked_data
def mockle(name, good=None, exception=""):
    name = get_path(name)
    if get_env() == "dev":
        if not exception:
            print("storing mocked", name)
            just.write(good, name)
            return good
        else:
            try:
                print("load mocked", name, "ignoring exception:")
                print(exception)
                return just.read(name)
            except:
                # do not raise "this" exception, but the original
                raise exception
    elif exception == "":
        return good
    else:
        raise exception
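# Hedged usage sketch for mockle (fetch_prices is hypothetical): in a "dev"
# env a good result is stored as the mock and a failure falls back to the
# stored copy; outside dev, mockle passes the result or re-raises.
try:
    result = mockle("prices", good=fetch_prices())
except Exception as e:
    result = mockle("prices", exception=e)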
def __enter__(self):
    good = None  # default to "no result / no exception" before calling fn
    exception = None
    try:
        good = self.fn(self.args)
    except Exception as e:
        exception = e
    if self.env == "dev" and exception:
        try:
            print("load mocked", self.name, "ignoring exception:")
            print(exception)
            return just.read(self.name)
        except:
            # do not raise this exception, but the original
            raise exception
    elif self.env == "dev" and good is not None:
        print("storing mocked", self.name)
        just.write(good, self.name)
        return good
    elif exception:
        raise exception
def load_json_file_modified_time(cls, fname, nrows=None, from_cache=True, **kwargs):
    name = fname + "_" + normalize_name(cls.__name__)
    modified_time = os.path.getmtime(os.path.expanduser(fname))
    last_modified = get_last_mod_time(name)
    if modified_time != last_modified or not from_cache:
        data = just.read(fname)
        data = cls.handle_json(data, **kwargs)
        data = pd.DataFrame(data)
        if nrows is None:
            save_df(data, name)
            save_last_mod_time(modified_time, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
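# Hedged usage sketch (SomeSource and the file name are invented): a full
# load parses the JSON and caches the DataFrame with the file's mtime; while
# the file is unchanged, later calls read the cache, and nrows slices the tail.
df = SomeSource.load_json_file_modified_time("~/data/events.json")
tail = SomeSource.load_json_file_modified_time("~/data/events.json", nrows=100)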
def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    tree = lxml.html.fromstring(html)
    res = tree.xpath("//input[@name='q' and @type='text']")
    if not res:
        linked_data = None
    else:
        linked_data = {"title": res[0].value}
    CACHE[path] = linked_data
    return linked_data
def main():
    conf_path = "~/nostalgia_data/config/fitbit/config.json"
    if not just.exists(conf_path):
        webbrowser.open("https://dev.fitbit.com/apps/new")
        webbrowser.open(
            "https://raw.githubusercontent.com/nostalgia-dev/nostalgia_fitbit/master/docs/fitbit_app.png"
        )
        client_id = getpass.getpass("Client ID: ")
        client_secret = getpass.getpass("Client Secret: ")
        info = {"client_id": client_id, "client_secret": client_secret}
        just.write(info, conf_path)
        print("Saved in:", conf_path)
    config = just.read(conf_path)
    if not config["client_id"] or not config["client_secret"]:
        msg = "Fill in a value for client_id and client_secret in '{}'".format(conf_path)
        raise ValueError(msg)
    fa = FitbitAuth(client_id=config['client_id'], client_secret=config['client_secret'])
    fa.ensure_access_token()
    try:
        f = Fitbit(access_token=fa.access_token['access_token'])
        print(json.dumps(f.profile, indent=2))
    except requests.exceptions.HTTPError as e:
        print(e.response.status_code)
        if e.response.status_code == 429:
            print(e.response.headers)
            return
        raise
    export = FitbitExport(f, profile=f.profile)
    export.sync_sleep()
    export.sync_heartrate_intraday()
import gzip
import os
from urllib.parse import urlparse

import just
import tqdm
import tldextract
from auto_extract import parse_article

from utils import KEYS_TO_KEEP

for x in tqdm.tqdm(just.glob("/home/pascal/.nostalgia/meta/v1/*.json")):
    print("processing", x)
    meta = just.read(x)
    if "extruct" in meta:
        print("skipping", x)
        continue
    # drop the ".json" suffix to recover the matching html file name
    # (str.rstrip strips a character set, not a suffix)
    html_path = "/home/pascal/.nostalgia/html/" + x.split("/")[-1][:-len(".json")] + ".html.gz"
    if os.path.exists(html_path):
        with gzip.GzipFile(html_path, "r") as f:
            html = f.read()
        article = parse_article(html, meta["url"])
        meta = article.to_dict(keys=KEYS_TO_KEEP, skip_if_empty=True)
        just.write(meta, x)
        os.system("touch '{}' -r '{}'".format(x, html_path))
        print("done", x)
def get_processed_files(name):
    path = "~/nostalgia_data/seen/" + slugify(name) + ".json"
    return set(just.read(path, no_exist=[]))
def get_newline_count(name):
    """Return the stored newline (row) count for a file, or 0 if none yet."""
    path = "~/nostalgia_data/seen/" + slugify(name) + ".json"
    return just.read(path, no_exist=0)
def get_last_latest_file(name):
    path = "~/nostalgia_data/seen/" + slugify(name) + ".json"
    return just.read(path, no_exist=0)
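# Hedged note (names and values invented): the three "seen" helpers above
# share one slugified path scheme and differ only in the no_exist default,
# an empty list for processed files versus 0 for counts and timestamps.
just.write(["a.csv", "b.csv"], "~/nostalgia_data/seen/my-loader.json")
assert get_processed_files("my-loader") == {"a.csv", "b.csv"}
assert get_newline_count("fresh-loader") == 0  # nothing recorded yet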
import just

usm = just.read("data/money/us.json")
usd_min = set(
    [x[0] for x in usm['symbols']["$"]]
    + [x[0] for x in usm['currencies']["dollar"]]
    + [x[0] for x in usm['currencies']["bucks"]]
)
usm['keywords'] = {k: v for k, v in usm['keywords'].items() if v in usd_min}
del usm['keywords']["Tongan"]
usm['symbols'] = {"$": usm['symbols']['$']}
usm['currencies'] = {
    "dollar": usm['currencies']['dollar'],
    "bucks": usm['currencies']['bucks'],
}
usm['abbrevs'] = [x for x in usm['abbrevs'] if x in usm['keywords'].values()]
usm['abbrevs'].remove("TOP")
usm = just.write(usm, "data/money/us_min.json")

###

import just

usm = just.read("/Users/pascal/egoroot/natura/data/money/us.json")
eur_min = set(
    [x[0] for x in usm['symbols']["$"]]
    + [x[0] for x in usm['currencies']["dollar"]]
    + [x[0] for x in usm['currencies']["bucks"]]
    + [x[0] for x in usm['symbols']["€"]]
    + [x[0] for x in usm['currencies']["euro"]]
)
usm['keywords'] = {k: v for k, v in usm['keywords'].items() if v in eur_min}
usm['symbols'] = {"$": usm['symbols']['$'], "€": usm['symbols']["€"]}
def replace_ndf_class():
    ndf_replace = "class NDF:"
    original = just.read(NDF_PATH)
    ndf = [x for x in original.split("\n") if x.startswith("class NDF")][0]
    just.write(original.replace(ndf, ndf_replace), NDF_PATH)
    return original
def test_unsuccesful_read():
    assert just.read("A" * 100, 42) == 42
                cache[code] = just.read(f"~/emoji_picker_images/{code}.base64", unknown_type="txt")
            imgs[i].update(data=cache[code])
        else:
            txt = ""
            imgs[i].update(data="")
        inps[i].update(value=txt, text_color=color)

while True:
    event, values = window.read()
    print(event)
    do_search = True
    if emojis is None:
        print("loading emoji")
        emojis = just.read("~/emojis2.txt").split("\n")
    if event in (None, "Cancel"):  # if user closes window or clicks cancel
        break
    elif event.startswith("Ok"):
        do_search = False
    elif event.startswith("Control"):
        if last_key.startswith("BackSpace") or event == "\x7f":
            letters = []
            sofar.update(value="".join(letters))
            search()
    elif event.startswith("Shift"):
        do_search = False
    elif event.startswith("Alt"):
        do_search = False
    elif event.startswith("Super"):
        do_search = False
def read(self, index):
    return just.read(self.path[index])
def write_error(fname):
    with open("/home/pascal/csvres.jsonl", "a") as f:
        f.write(json.dumps({"meta": {"fname": fname}, "error": True}) + "\n")


if __name__ == "__main__":
    tot = 0
    results = []
    done = set()
    with open("/home/pascal/csvres.jsonl") as f:
        for line in f:
            done.add(json.loads(line)["meta"]["fname"])
    lines = just.read("~/csvlist.txt").split("\n")
    for line in tqdm.tqdm(random.sample(lines, len(lines))):
        if line in done:
            continue
        try:
            mb = os.path.getsize(line) / 1000 / 1000
        except FileNotFoundError:
            done.add(line)
            write_error(line)
            continue
        # tot += mb
        # if mb > 1:
        #     print(mb, line)
        df, meta, X = get_meta(line)
        if X is not None:
            # print(mb, line)