def load_image_texts(cls, glob_pattern_s, nrows=None):
    import pytesseract
    from PIL import Image

    if isinstance(glob_pattern_s, list):
        fnames = set()
        for glob_pattern in glob_pattern_s:
            fnames.update(set(just.glob(glob_pattern)))
        glob_pattern = "_".join(glob_pattern_s)
    else:
        # single pattern: use it both for globbing and for the cache name
        glob_pattern = glob_pattern_s
        fnames = set(just.glob(glob_pattern))
    name = glob_pattern + "_" + normalize_name(cls.__name__)
    processed_files = get_processed_files(name)
    to_process = fnames.difference(processed_files)
    objects = []
    cache = get_cache("tesseract")
    if nrows is not None:
        if not to_process:
            return load_df(name).iloc[-nrows:]
        else:
            to_process = list(to_process)[-nrows:]
    if to_process:
        for fname in to_process:
            # OCR each new image, memoizing the extracted text per path
            if fname in cache:
                text = cache[fname]
            else:
                try:
                    text = pytesseract.image_to_string(Image.open(just.make_path(fname)))
                except OSError as e:
                    print("ERR", fname, e)
                    continue
                cache[fname] = text
            time = datetime_from_timestamp(os.path.getmtime(fname), "utc")
            data = {
                "text": text,
                "path": fname,
                "title": fname.split("/")[-1],
                "time": time,
            }
            objects.append(data)
        data = pd.DataFrame(objects)
        if processed_files and nrows is None:
            data = pd.concat((data, load_df(name)))
        for x in ["time", "start", "end"]:
            if x in data:
                data = data.sort_values(x)
                break
        if nrows is None:
            save_df(data, name)
            save_processed_files(fnames | processed_files, name)
    else:
        data = load_df(name)
    if nrows is not None:
        data = data.iloc[-nrows:]
    return data
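# Usage sketch (hedged): load_image_texts is written as a classmethod-style
# loader for an NDF subclass; the class name and path below are hypothetical
# and only illustrate the call.
#
#   df = Screenshots.load_image_texts("~/nostalgia_data/screenshots/*.png", nrows=50)
#   df[["time", "title", "text"]].tail()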
import re
import just
import numpy as np
import pandas as pd
from nostalgia.ndf import NDF
import lxml.html
import diskcache
from auto_extract import parse_article
from nostalgia.nlp import nlp
from nostalgia.cache import get_cache
from nostalgia.data_loading import read_array_of_dict_from_json
from nostalgia.times import datetime_from_timestamp

CACHE = get_cache("chrome_history")

# def destroy_tree(tree):
#     node_tracker = {tree: [0, None]}
#     for node in tree.iterdescendants():
#         parent = node.getparent()
#         node_tracker[node] = [node_tracker[parent][0] + 1, parent]
#     node_tracker = sorted(
#         [(depth, parent, child) for child, (depth, parent) in node_tracker.items()],
#         key=lambda x: x[0],
#         reverse=True,
#     )
#     for _, parent, child in node_tracker:
#         if parent is None:
#             break
import os

from googleapiclient.discovery import build
from googleapiclient.http import HttpError
import dotenv

from nostalgia.cache import get_cache

CACHE = get_cache("google_custom_search")

dotenv.load_dotenv("google_custom_search/.env")
dotenv.load_dotenv(".env")

errored_count = 0


def google_custom_search(search_term, **kwargs):
    global errored_count
    search_term = search_term.lower()
    if search_term in CACHE:
        return CACHE[search_term]
    # give up once the API has failed repeatedly in this session
    if errored_count > 4:
        return []
    service = build("customsearch", "v1", developerKey=os.environ["MY_API_KEY"])
    try:
        res = service.cse().list(q=search_term, cx=os.environ["MY_CSE_ID"], **kwargs).execute()
    except HttpError as e:
        print("error", e)
        errored_count += 1
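# Usage sketch (hedged): assumes MY_API_KEY / MY_CSE_ID are provided by the
# .env files loaded above, and that the remainder of the function (cut off
# above) caches and returns the CSE result items.
#
#   for item in google_custom_search("nostalgia data") or []:
#       print(item.get("title"), item.get("link"))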
import just
import pandas as pd
import lxml.html
from nostalgia.cache import get_cache
from nostalgia.ndf import NDF
from datetime import datetime
from nostalgia.times import tz

CACHE = get_cache("linked_google_search")


def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    tree = lxml.html.fromstring(html)
    res = tree.xpath("//input[@name='q' and @type='text']")
    if not res:
        linked_data = None
    else:
import json
from datetime import datetime

import just
import pandas as pd

from nostalgia.times import tz
from auto_extract import parse_article
from nostalgia.cache import get_cache
from nostalgia.ndf import NDF
from nostalgia.utils import normalize_name

CACHE = get_cache("linked_events")


def getter(dc, key, default=None):
    res = dc.get(key, default)
    if isinstance(res, list):
        res = res[0]
    elif isinstance(res, dict):
        res = json.dumps(res)
    return res


def get_linked_data_jd(art):
    data = None
    try:
        jdata = art.jsonld
    except json.JSONDecodeError:
import json
from datetime import datetime

import just
import pandas as pd

from nostalgia.times import tz
from auto_extract import parse_article
from nostalgia.cache import get_cache
from nostalgia.ndf import NDF
from nostalgia.utils import normalize_name

CACHE = get_cache("linked_person")


def getter(dc, key, default=None):
    res = dc.get(key, default)
    if isinstance(res, list):
        res = res[0]
    elif isinstance(res, dict):
        res = json.dumps(res)
    return res


def get_linked_data_jd(art):
    data = None
    try:
        jdata = art.jsonld
    except json.JSONDecodeError:
import json
from datetime import datetime
from urllib.parse import urljoin

import just
import pandas as pd

from nostalgia.utils import parse_price
from nostalgia.times import tz
from nostalgia.nlp import nlp
from nostalgia.ndf import NDF
from auto_extract import parse_article
from nostalgia.sources.web.get_keywords_for_product import get_keywords_for_product
from nostalgia.cache import get_cache

CACHE = get_cache("linked_offers")


def getter(dc, key, default=None):
    res = dc.get(key, default)
    if isinstance(res, list):
        res = res[0]
    elif isinstance(res, dict):
        res = json.dumps(res)
    return res


from natura import Finder

finder = Finder()
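# Worked example for getter (hypothetical inputs): a JSON-LD value may be a
# scalar, a list, or a nested dict, and getter normalizes it to a single value.
#
#   getter({"price": "9.99"}, "price")                  -> "9.99"
#   getter({"name": ["iPhone", "Smartphone"]}, "name")  -> "iPhone"
#   getter({"brand": {"name": "Apple"}}, "brand")       -> '{"name": "Apple"}'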
import just
import urllib.parse
import pandas as pd
from auto_extract import parse_article
from nostalgia.cache import get_cache
from datetime import datetime
from nostalgia.times import tz
from nostalgia.ndf import NDF
from nostalgia.nlp import nlp

CACHE = get_cache("linked_data_videos")


def get_linked_data(x):
    path = x["path"]
    if path in CACHE:
        return CACHE[path]
    try:
        html = just.read(path)
    except EOFError:
        CACHE[path] = None
        return None
    if not html.strip():
        CACHE[path] = None
        return None
    art = parse_article(html, x["url"])
import os

import requests
import dotenv

from nostalgia.cache import get_cache

CACHE = get_cache("darksky_weather")

dotenv.load_dotenv(".env")
dotenv.load_dotenv("~/nostalgia_data/.env")


def _historic_weather(latitude, longitude, epoch_time):
    q = f"{latitude},{longitude},{epoch_time}"
    if q in CACHE:
        return CACHE[q]
    key = os.environ["DARKSKY_WEATHER_KEY"]
    resp = requests.get(f"https://api.darksky.net/forecast/{key}/{q}?units=si")
    json_response = resp.json()
    # only cache successful responses; error payloads are returned uncached
    if "error" not in json_response:
        CACHE[q] = json_response
    return json_response


def get_weather_at_nearest_hour(latitude, longitude, dt):
    day_timestamp = int(dt.replace(hour=0, minute=0, second=1).timestamp())
    json_response = _historic_weather(latitude, longitude, day_timestamp)
    t = dt.timestamp()
    try:
        # pick the hourly record whose timestamp is closest to dt
        return min([(abs(x["time"] - t), x) for x in json_response["hourly"]["data"]])[1]
    except (IndexError, KeyError) as e:
    # place_id = json_response.get("results", [{}])[0].get("place_id")
    country = geo_get_(json_response, "country")
    address = get_address(json_response)
    return {"city": city, "country": country, "formatted_address": address}


dotenv.load_dotenv("google/.env")
dotenv.load_dotenv(".env")

PYTHON_ENV = os.environ.get("PYTHON_ENV", "dev")

if PYTHON_ENV != "prod":
    KEY = None
else:
    KEY = os.environ.get("GOOGLE_API_KEY", None)

CACHE = get_cache("google_timeline")

DETAILS_URL = "https://maps.googleapis.com/maps/api/place/details/json"
NEARBY_URL = "https://maps.googleapis.com/maps/api/place/nearbysearch/json"

s = requests.Session()

# {'Boating', 'Cycling', 'Driving', 'Flying', 'In transit', 'Moving', 'On a bus', 'On a ferry',
#  'On a train', 'On a tram', 'On the subway', 'Running', 'Walking'}


def get_results(latlng, name, excluded_transport_names):
    # transport-like segments are only reverse-geocoded; anything else is
    # looked up as a nearby place and enriched with place details
    if name in excluded_transport_names:
        return geo_get_info(latlng)
    near_result = get_nearby_results(latlng, name, excluded_transport_names)
    if near_result is None:
        return None
    details = get_details(near_result["place_id"])