def user_summary_for(rtype, storages, output_path: Path):
    """Render a sortable HTML table of per-user item counts across *storages*.

    Each row is a user; each column is one storage (query) name, cell value is
    how many items that user contributed there. The document is written to
    *output_path*. Digests are computed in parallel, one per storage path.
    """
    # ustats[user][query] -> item count for that user in that query/storage
    ustats = {}

    def reg(user, query, stats):
        # accumulate one (user, query) -> count entry
        if user not in ustats:
            ustats[user] = {}
        ustats[user][query] = stats

    # digest extraction is CPU-heavy enough to justify a process pool here
    with ProcessPoolExecutor() as pp:
        digests = pp.map(get_digest, [s.path for s in storages])
    for s, digest in zip(storages, digests):
        everything = flatten([ch for ch in digest.changes.values()])
        for user, items in group_by_key(everything, key=lambda x: x.user).items():
            reg(user, s.name, len(items))

    now = datetime.now()
    doc = dominate.document(
        title=f'axol tags summary for {[s.name for s in storages]}, rendered at {fdate(now)}'
    )
    with doc.head:
        T.style(STYLE)
        raw_script(JS)  # TODO necessary?
        # TODO FIXME can't inline due to some utf shit
        sortable_js = Path(__file__).absolute().parent / 'js' / 'sorttable.js'
        T.script(src=str(sortable_js))
    ft = FormatTrait.for_(rtype)
    with doc.body:
        with T.table(cls='sortable'):
            emitted_head = False
            # most prolific users first; ties broken by the (user, stats) tuple itself
            for user, stats in sorted(ustats.items(), key=lambda x: (-len(x[1]), x)):
                if not emitted_head:
                    # header columns are taken from the first user's stats dict;
                    # NOTE(review): presumably all users share the same query set — confirm
                    with T.thead():
                        T.td('user')
                        for q, _ in stats.items():
                            T.td(q)
                    emitted_head = True
                with T.tr():
                    T.td(ft.user_link(user))
                    for q, st in stats.items():
                        # sorttable_customkey lets the JS sorter order cells numerically
                        with T.td(sorttable_customkey=str(st)):
                            # TODO I guess unclear which tag to choose though.
                            T.a(q, href=f'summary/{q}.html')
                            # TODO link to source in index? or on pinboard maybe
                            # TODO also project onto user's tags straight away
                            # highlight counts >= 5 in bold red
                            T.sup(str(st) if st < 5 else T.b(T.font(str(st), color='red')))  # TODO css
    output_path.write_text(str(doc))
    logger.info('Dumped user summary to %s', output_path)
def sleeps_by_date() -> Dict[date, SleepEntry]:
    """Return a mapping from date to its single sleep entry.

    Entries without an existing graph file are dropped up front; any date with
    more than one entry is logged as an error and omitted from the result.
    """
    log = get_logger()
    # keep only entries whose graph file is actually present  # TODO careful..
    with_graph = [entry for entry in load_sleeps() if entry.graph.exists()]
    by_date = {}
    for day, entries in group_by_key(with_graph, key=lambda entry: entry.date_).items():
        if len(entries) != 1:
            # TODO short ones I can ignore I guess. but won't bother now
            log.error('multiple sleeps on %s: %s', day, entries)
            continue
        by_date[day] = entries[0]
    return by_date
def get_tg_tasks():
    """Collect forwarded telegram messages from the backup log and group them into tasks.

    Reads BACKUP_PATH line by line (one JSON object per line), keeps forwarded
    messages, groups them by forwarding date, and returns a list of
    (id, title, texts) tuples — one per group, in date order.
    """
    forwarded = []
    # builtin open() with encoding= supersedes the dated codecs.open();
    # iterating the file streams lines instead of materializing via readlines()
    with open(BACKUP_PATH, 'r', encoding='utf-8') as bp:
        for line in bp:
            j = json_loads(line)
            # forwarded messages carry a 'fwd_from' field
            if j['event'] == 'message' and 'fwd_from' in j:
                forwarded.append(j)
    # apparently, date is appropriate as a 'unit of forwarding'
    grouped = group_by_key(forwarded, lambda f: f['date'])
    tasks = []
    for _, group in sorted(grouped.items(), key=lambda kv: kv[0]):
        id_, title, texts = format_group(group)
        tasks.append((id_, title, texts))
    return tasks
def render_summary(repo: Path, digest: Changes[Any], rendered: Path) -> Path: rtype = get_result_type(repo) # TODO ?? # ODO just get trait for type?? Cumulative = CumulativeBase.for_(rtype) NOW = datetime.now() name = repo.stem everything = flatten([ch for ch in digest.changes.values()]) before = len(everything) grouped = group_by_key(everything, key=Cumulative.cumkey) print(f'before: {before}, after: {len(grouped)}') cumulatives = list(map(Cumulative, grouped.values())) cumulatives = list(sorted(cumulatives, key=Cumulative.sortkey)) doc = dominate.document( title=f'axol results for {name}, rendered at {fdate(NOW)}') with doc.head: T.style(STYLE) raw_script(JS) with doc: T.h3("This is axol search summary") T.div( "You can use 'hide' function in JS (chrome debugger) to hide certain tags/subreddits/users" ) T.h4("Sources summary") # TODO wrap in div? with T.div(): Cumulative.sources_summary(everything) for cc in cumulatives: T.div(cc.format(), cls='item') rendered.mkdir(exist_ok=True, parents=True) sf = rendered.joinpath(name + '.html') with sf.open('w') as fo: fo.write(str(doc)) return sf
def _get_kml(items):
    """Build a KML string: one folder per place-list, one placemark per place.

    Places are grouped by their .lst attribute; the first place's color in each
    group (if any) determines the folder's icon style.
    """
    from kython import group_by_key

    builder = KmlMaker()

    def as_placemark(place, style):
        # one KML placemark positioned at the place's coordinates
        mark = K.Placemark(
            id=place.name,
            name=place.name,
            description=place.description,
            styleUrl=style,
        )
        mark.geometry = Point(place.lng, place.lat)
        return mark

    for folder_name, places in group_by_key(items, key=lambda p: p.lst).items():
        first_color = places[0].color
        style = builder.make_icon_style(color=first_color) if first_color is not None else None
        builder.add_folder(
            name=folder_name,
            items=[as_placemark(p, style) for p in places],
        )
    body = builder.to_string(prettyprint=True)
    return '<?xml version="1.0" encoding="UTF-8"?>\n' + body
def process():
    """Load annotations from ./res.json and print per-user counts plus visited links.

    Filters out the RhoChiPlanReviews source, groups annotations by user, and
    for each user (least prolific first) prints the count followed by the
    (uri, incontext-link) pairs of annotations whose uri has been visited.
    """
    import json
    from pathlib import Path
    from kython import group_by_key

    anns = json.loads(Path('./res.json').read_text())
    # this source is noise for our purposes
    anns = [a for a in anns if 'RhoChiPlanReviews' not in a['uri']]
    groups = group_by_key(anns, key=lambda a: a['user'])
    # least prolific users first
    for user, user_anns in sorted(groups.items(), key=lambda kv: len(kv[1])):
        print(f'{user}: {len(user_anns)}')
        docs = []
        for a in user_anns:
            uri = a['uri']
            if is_visited(uri):
                docs.append((uri, a['links']['incontext']))  # TODO incontext??
        for u, x in sorted(docs):
            print(' ' + u)
            print(' ' + x)
def user_summary(storages, output_dir: Path):
    """Emit one '<source>_users.html' per storage source under *output_dir*."""
    by_source = group_by_key(storages, key=lambda s: s.source)
    for source, source_storages in by_source.items():
        # all storages of a single source are expected to share one result type
        result_type = the(get_result_type(s) for s in source_storages)
        target = output_dir / (For(source).name + '_users.html')
        user_summary_for(rtype=result_type, storages=source_storages, output_path=target)
def render_latest(repo: Path, digest, rendered: Path):
    """Render the 'latest changes' HTML page for one repo and dump an Atom feed.

    Items are flattened from the digest, grouped by link (so repeat occurrences
    of the same item stay together), then grouped again by the earliest crawl
    date of each link-group. The Atom feed goes to '<rendered>/atom/<name>.xml',
    the HTML page to '<rendered>/<name>.html' (returned).
    """
    logger.info('processing %s', repo)
    rtype = get_result_type(repo)
    Format = FormatTrait.for_(rtype)
    Ignore = IgnoreTrait.for_(rtype)
    import pytz
    NOW = datetime.now(tz=pytz.utc)
    name = repo.stem
    doc = dominate.document(title=f'axol results for {name}, rendered at {fdate(NOW)}')
    with doc.head:
        T.style(STYLE)
        raw_script(JS)
        T.link(
            rel='stylesheet',
            href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.48.2/codemirror.min.css"
        )
        T.script(
            src='https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.48.2/codemirror.js'
        )
        # TODO use min?
    # flatten changes into (crawl date, item) pairs
    citems: Iterator[Tuple[datetime, Item]] = chain.from_iterable(
        ((d, x) for x in zz) for d, zz in digest.changes.items())
    # group according to link, so we can display already occuring items along with newer occurences
    items2: List[Sequence[Tuple[datetime, Item]]] = [
        grp for _, grp in group_by_key(citems, key=lambda p: f'{p[1].link}').items()
    ]
    # TODO sort within each group?

    def min_dt(group: Sequence[Tuple[datetime, Item]]) -> datetime:
        # earliest crawl date within one link-group
        return min(g[0] for g in group)

    # TODO ok, this is def too many types here...
    items3: Mapping[datetime, List[Sequence[Tuple[datetime, Item]]]] = group_by_key(items2, key=min_dt)
    rss = True
    if rss:
        # pip3 install feedgen
        from feedgen.feed import FeedGenerator  # type: ignore
        fg = FeedGenerator()
        # TODO memorize items?
        fg.title(name)
        fg.id('axol/' + name)
        first = True
        for d, items in sorted(items3.items()):
            litems = list(items)
            logger.info('%s %s: atom, dumping %d items', name, d, len(litems))
            if first:
                # the oldest batch is usually the initial bulk crawl
                logger.info("SKIPPING first batch to prevent RSS bloat")
                first = False
                continue
            for zz in litems:
                fe = fg.add_entry()
                # TODO not sure about css?
                # TODO not sure which date should use? I gues crawling date makes more sense..
                _d, z = zz[0]  # TODO meh!
                id_ = z.uid
                # TODO FIXME!!
                fe.id(id_)
                title = Format.title(zz) or '<no title>'  # meh
                fe.title(title)
                fe.link(href=Format.link(zz))
                # TODO not sure if it's a reasonable date to use...
                fe.published(published=d)
                fe.author(author={'name': z.user})  # TODO maybe, concat users?
                ignored = Ignore.ignore_group(zz)
                if ignored is not None:
                    # TODO not sure if it highlights with read or something?
                    content = ignored
                else:
                    content = Format.format(zz)
                # eh, XML was complaining at some non-utf characters
                content = str(content)
                # https://stackoverflow.com/a/25920392/706389 make lxml happy...
                content = re.sub(
                    u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+',
                    '', content)
                fe.content(content=content, type='CDATA')
                # fe.updated(updated=NOW)
                # TODO assemble a summary similar to HTML?
                # fe.summary()
        atomfeed = fg.atom_str(pretty=True)
        # eh, my feed reader (miniflux) can't handle it if it's 'cdata'
        # not sure which one is right
        # ugh, that didn't work because escaping decision is based on CDATA attribute...
        atomfeed = atomfeed.replace(b'type="CDATA"', b'type="html"')
        # fe._FeedEntry__atom_content['type'] = 'html'
        atomdir = rendered / 'atom'
        atomdir.mkdir(parents=True, exist_ok=True)
        (atomdir / (name + '.xml')).write_bytes(atomfeed)
    with doc:
        with T.div(id='sidebar'):
            T.label('Blacklisted:', for_='blacklisted')
            T.div(id='blacklisted')
            T.textarea(id='blacklist-edit', rows=10)
            T.button('apply', id='blacklist-apply')
        odd = True
        # newest day first; odd/even alternates the background class
        for d, items in sorted(items3.items(), reverse=True):
            litems = list(items)
            odd = not odd
            logger.info('%s %s: dumping %d items', name, d, len(litems))
            with T.div(cls='day-changes'):
                with T.div():
                    T.b(fdate(d))
                    T.span(f'{len(litems)} items')
                with T.div(cls=f'day-changes-inner {"odd" if odd else "even"}'):
                    for i in items:
                        # TODO FIXME use getattr to specialise trait?
                        # TODO FIXME ignore should be at changes collecting stage?
                        ignored = Ignore.ignore_group(i)
                        if ignored is not None:
                            # TODO maybe let format result handle that... not sure
                            T.div(ignored, cls='item ignored')
                            # TODO log maybe?
                            # TODO eh. need to handle in cumulatives...
                        else:
                            fi = Format.format(i)
                            T.div(fi, cls='item')
    # NOTE: couldn't manage to render content inside an iframe no matter what was tried..
    # with T.iframe(id='blacklist', src=''):
    #     pass
    # TODO perhaps needs to be iterative...
    rf = rendered / (name + '.html')
    with rf.open('w') as fo:
        fo.write(str(doc))
    return rf