def get_ts_from_fn(fn):
    """Return fn's mtime as a PST datetime, or None if fn is None or missing."""
    if fn is None:
        return None
    try:
        return fromtimestamp_pst(os.path.getmtime(fn))
    except FileNotFoundError:
        print(f"couldn't find {fn}")
        # falls through to an implicit None return
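# Usage sketch for get_ts_from_fn; "checkpoints/model.ckpt" is a hypothetical
# path. A None argument and a missing file both yield None, so callers only
# need one check.
def _demo_get_ts_from_fn():
    ts = get_ts_from_fn("checkpoints/model.ckpt")
    if ts is not None:
        print(f"file last modified (PST): {ts.isoformat()}")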
def construct_head_training_texts(thread: TumblrThread,
                                  base_head_timestamp: datetime,
                                  blog_name: str = bot_name):
    head_timestamp = roll_head_timestamp(
        base_head_timestamp=base_head_timestamp,
        actual_timestamp=fromtimestamp_pst(thread.timestamp),
    )

    _, text_selector, text_autoreviewer = make_nwo_prompts(
        thread,
        head_timestamp=head_timestamp,
        blog_name=blog_name,
        ml_prompt_format=False,
    )
    return text_selector, text_autoreviewer
def count_posts_since_ts(post_payloads, ts):
    is_after = [
        (fromtimestamp_pst(entry['timestamp']) - ts).total_seconds() > 0
        for entry in post_payloads
        if not entry.get('is_pinned')
    ]
    if all(is_after):
        msg = f"count_posts_since_ts: all {len(is_after)} posts passed in are after the given ts."
        msg += " Count returned will be a lower bound."
        print(msg)
    return sum(is_after)
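# A minimal sketch of count_posts_since_ts on synthetic payloads. The payload
# shape (posix 'timestamp', optional 'is_pinned') mirrors the post dicts used
# elsewhere in this file; the specific values are made up.
def _demo_count_posts_since_ts():
    fake_posts = [
        {"timestamp": 1_600_000_000},
        {"timestamp": 1_600_100_000},
        {"timestamp": 1_600_200_000, "is_pinned": True},  # pinned: excluded
    ]
    cutoff = fromtimestamp_pst(1_600_050_000)
    # one of the two unpinned posts is after the cutoff, so this returns 1
    return count_posts_since_ts(fake_posts, cutoff)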
def fetch_and_process(blog_name: str = bot_name,
                      n: Optional[int] = None,
                      offset: int = 0,
                      include_unused_types=False,
                      fetch_only=False,
                      process_only=False):
    with open("data/head_training_data_raw_posts.pkl.gz", "rb") as f:
        posts = pickle.load(f)

    max_ts_posix = max(pp["timestamp"] for pp in posts)
    max_ts = fromtimestamp_pst(max_ts_posix).isoformat()
    print(f"loaded {len(posts)} raw posts, max ts {max_ts}")

    lines = load()
    max_processed_id = max(line["id"] for line in lines)
    print(f"loaded {len(lines)} existing records, max id {max_processed_id}")

    if process_only:
        new_posts = [pp for pp in posts if pp["id"] > max_processed_id]
    else:
        pool = ClientPool()
        new_posts = fetch_posts(pool, blog_name, n, offset,
                                needs_private_client=True,
                                stop_at_id=max_processed_id)

        posts.extend(new_posts)
        print(f"saving {len(posts)} raw posts")
        with open("data/head_training_data_raw_posts.pkl.gz", "wb") as f:
            pickle.dump(posts, f)

    if fetch_only:
        return lines

    base_head_timestamp = now_pst()

    lines_new = [
        post_to_line_entry(pp, base_head_timestamp, blog_name=blog_name,
                           include_unused_types=include_unused_types)
        for pp in tqdm(new_posts, mininterval=0.3, smoothing=0)
    ]
    lines.extend(lines_new)
    return lines
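# Sketch of the two-phase workflow fetch_and_process supports: a network pass
# that only refreshes the raw-post pickle, then an offline pass that rebuilds
# line entries from it. n=500 is an arbitrary cap chosen for illustration.
def _demo_fetch_and_process():
    fetch_and_process(n=500, fetch_only=True)     # fetch + save raw posts, skip processing
    lines = fetch_and_process(process_only=True)  # reprocess saved posts, no network
    return lines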
def fetch_posts(pool: ClientPool,
                blog_name: str = bot_name,
                n: Optional[int] = None,
                offset: int = 0,
                report_cadence=5000,
                needs_private_client=False,
                needs_dash_client=False,
                stop_at_id=0,
                before=None,
                screener=None):
    posts = []
    ids = set()

    since_last_report = 0
    n_ok = 0
    n_full = 0
    rejection_reasons = Counter()
    tqdm_bar = None

    if needs_private_client and needs_dash_client:
        raise ValueError(
            "fetch_posts: only one of needs_private_client and needs_dash_client can be true"
        )

    client_getter = pool.get_client
    if needs_private_client:
        client_getter = pool.get_private_client
    if needs_dash_client:
        client_getter = pool.get_dashboard_client

    while True:
        client = client_getter()
        page, next_offset, total_posts = fetch_next_page(
            client, offset=offset, blog_name=blog_name, before=before
        )

        if not tqdm_bar:
            tqdm_bar = tqdm(total=total_posts)
            tqdm_bar.update(offset)
            tqdm_bar.set_postfix(cl=pool.client_name(client))

        if (len(page) == 0) or (next_offset == offset):
            print(f"stopping, empty page after {len(posts)} posts")
            return posts

        since_last_report += len(page)
        if since_last_report >= report_cadence:
            pool.report()
            since_last_report = 0

        nraw = len(page)

        page = [pp for pp in page if pp['id'] not in ids]
        ndedup = len(page)

        page = [
            pp for pp in page
            if pp['id'] > stop_at_id or pp.get('is_pinned')  # pins make id non-monotonic
        ]
        nafter = len(page)
        nbefore = ndedup - nafter

        page_ids = {pp['id'] for pp in page}

        delta_full = len(page)
        n_full += delta_full

        if screener:
            _page = []
            reasons = []
            for pp in page:
                ok, reason, _ = screener(pp)
                if ok:
                    _page.append(pp)
                else:
                    reasons.append(reason)
            rejection_reasons.update(reasons)
            page = _page

        n_ok += len(page)

        ids.update(page_ids)
        posts.extend(page)
        offset = next_offset

        if len(page) == 0:
            min_ts = None
        else:
            min_ts = fromtimestamp_pst(min(pp['timestamp'] for pp in page)).isoformat()

        tqdm_bar.update(delta_full)
        tqdm_bar.set_postfix(cl=pool.client_name(client), min_ts=min_ts,
                             n_ok=n_ok, n_full=n_full)

        max_n = total_posts
        if n:
            max_n = min(n, max_n)

        if n_full >= max_n:
            print(f"stopping with {n_full} posts, {n_ok} OK: reached maximum {max_n}")
            print(f"rejection_reasons: {rejection_reasons.most_common()}")
            return posts

        if nbefore > 0:
            print(
                f"stopping with {n_full} posts, {n_ok} OK: {nbefore}/{ndedup} in current page are before id {stop_at_id}"
            )
            print(f"rejection_reasons: {rejection_reasons.most_common()}")
            return posts
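# Example of a screener compatible with fetch_posts: it must return an
# (ok, reason, extra) triple, of which fetch_posts uses the first two. The
# length-on-"summary" criterion here is made up for illustration.
def _demo_screener(pp):
    if len(pp.get("summary", "")) < 10:
        return False, "too_short", None
    return True, None, None

# e.g. fetch_posts(pool, screener=_demo_screener)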
def sample_year_and_set_timestamp(thread: TumblrThread) -> TumblrThread:
    timestamp = fromtimestamp_pst(thread.timestamp)
    timestamp = sample_year_and_set(timestamp)
    return set_timestamp(thread, timestamp)
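# Judging by the names, sample_year_and_set resamples only the year component
# of the timestamp (an augmentation so the model doesn't memorize training-era
# years); that reading is an assumption, not confirmed here. A round-trip sketch:
def _demo_resample_year(thread: TumblrThread) -> str:
    new_thread = sample_year_and_set_timestamp(thread)
    return fromtimestamp_pst(new_thread.timestamp).isoformat()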
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dryrun", action="store_true")
    parser.add_argument("--hot-only", action="store_true")
    args = parser.parse_args()

    base_head_timestamp = now_pst()

    # trace
    print("loading trace logs")
    if args.hot_only:
        import persistence.traceability_singleton
        trace_logs = persistence.traceability_singleton.TRACE_LOGS.logs["data"]
    else:
        trace_logs = traceability.load_full_traceability_logs()["data"]
    print(f"loaded trace logs: {len(trace_logs)} rows")

    trace_logs = [
        row for row in trace_logs
        if row.get("requested__state") in {"draft", "queue"}
    ]
    print(f"subsetted trace logs to draft/queue: {len(trace_logs)} rows")

    required_keys = [
        "api__id", "prompt_autoreviewer", "choice_ix", "all_continuations",
        "timestamp_manual", "post_type", "state_reasons"
    ]
    keycounts = Counter()
    key_nonnull_counts = Counter()
    for row in trace_logs:
        for k in required_keys:
            keycounts[k] += (k in row)
            key_nonnull_counts[k] += (row.get(k) is not None)
    print(f"keycounts: {keycounts}\nkey_nonnull_counts: {key_nonnull_counts}")

    trace_logs = [
        row for row in trace_logs
        if all(row.get(k) is not None for k in required_keys)
    ]
    print(f"subsetted trace logs to nwo / usable: {len(trace_logs)} rows")

    # don't let the model learn from its own use of the rts tag
    trace_logs = [
        row for row in trace_logs if not row['state_reasons'].get('ml_rejected')
    ]
    print(f"removed model-rejected drafts from trace logs: {len(trace_logs)} rows")

    pool = ClientPool()

    current_queue = [
        pp['id']
        for pp in pool.get_private_client().queue('nostalgebraist-autoresponder')['posts']
    ]

    trace_logs = [
        row for row in trace_logs if row.get("api__id") not in current_queue
    ]
    print(f"removed currently queued posts: {len(trace_logs)} rows")

    trace_indices_to_texts = {}
    for i, row in enumerate(trace_logs):
        actual_timestamp = fromtimestamp_pst(row["timestamp_manual"])
        subbed = sub_prompt_timestamp(base_head_timestamp, actual_timestamp,
                                      row["prompt_autoreviewer"])
        trace_indices_to_texts[i] = subbed + row["all_continuations"][row["choice_ix"]]

    trace_map = defaultdict(list)
    for i, row in enumerate(trace_logs):
        trace_map[row["api__id"]].append(i)

    # pub
    print("loading pub logs")
    with open("data/head_training_data.json", "r", encoding="utf-8") as f:
        pub_logs = json.load(f)
    print(f"loaded pub logs: {len(pub_logs)} rows")

    for row in pub_logs:
        gid = row["genesis_post_id"]
        row["genesis_or_published_id"] = gid if gid is not None else row["id"]

    pub_map = defaultdict(list)
    for i, row in enumerate(pub_logs):
        pub_map[row["genesis_or_published_id"]].append(i)

    # match
    print("matching...")
    trace_indices_to_targets = {}
    trace_indices_to_published_ids = {}

    n_accept = 0
    n_reject = 0
    n_skip = 0
    n_multimatch = 0

    iter_ = tqdm(trace_map.items(), total=len(trace_map), mininterval=1, smoothing=0)
    for api__id, group_trace_indices in iter_:
        pub_gids_matching_trace_id = pub_map.get(api__id, [])

        if len(pub_gids_matching_trace_id) == 0:
            # never published
            for trace_index in group_trace_indices:
                trace_indices_to_targets[trace_index] = "reject"
                trace_indices_to_published_ids[trace_index] = None
            n_reject += len(group_trace_indices)
        else:
            if len(pub_gids_matching_trace_id) > 1:
                # ???
                n_multimatch += 1

            matching_pub_row = pub_logs[pub_gids_matching_trace_id[0]]

            # assumes trace is ordered by time -- i believe this is true
            pubd_ix = group_trace_indices[-1]

            if trace_logs[pubd_ix]['requested__state'] != 'queue':
                trace_indices_to_targets[pubd_ix] = "accept"
                trace_indices_to_published_ids[pubd_ix] = matching_pub_row["id"]
                n_accept += 1
            else:
                # queued posts i don't delete aren't signal
                trace_indices_to_targets[pubd_ix] = "skip"
                n_skip += 1

            for trace_index in group_trace_indices[:-1]:
                trace_indices_to_targets[trace_index] = "reject"
                trace_indices_to_published_ids[trace_index] = None
            n_reject += len(group_trace_indices) - 1

        iter_.set_postfix(n_accept=n_accept, n_reject=n_reject, n_skip=n_skip,
                          zz_n_multimatch=n_multimatch)

    # verify
    n_accept_verify = sum(v == "accept" for v in trace_indices_to_targets.values())
    n_reject_verify = sum(v == "reject" for v in trace_indices_to_targets.values())
    n_skip_verify = sum(v == "skip" for v in trace_indices_to_targets.values())

    print(f"\nn_accept: {n_accept_verify} vs {n_accept}")
    print(f"n_reject: {n_reject_verify} vs {n_reject}")
    print(f"n_skip: {n_skip_verify} vs {n_skip}")

    autoreview_train_data = []
    for ix in sorted(trace_indices_to_targets.keys()):
        if trace_indices_to_targets[ix] == 'skip':
            continue
        autoreview_train_data.append({
            "text": trace_indices_to_texts[ix],
            "target": trace_indices_to_targets[ix],
            "trace_api__id": trace_logs[ix]["api__id"],
            "pub_api__id": trace_indices_to_published_ids[ix],
            "post_type": trace_logs[ix]["post_type"]
        })

    if not args.dryrun:
        with open("data/autoreview_train_data.json", "w", encoding="utf-8") as f:
            json.dump(autoreview_train_data, f, indent=1)
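# Shape of each record written to data/autoreview_train_data.json; the values
# below are illustrative placeholders, not real data:
#
#     {
#         "text": "<autoreviewer prompt + chosen continuation>",
#         "target": "accept",        # or "reject"; "skip" rows are dropped
#         "trace_api__id": 123456789,
#         "pub_api__id": 987654321,  # None for rejected drafts
#         "post_type": "answer"
#     }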
def compute_dynamic_mood_inputs(
    response_cache: ResponseCache,
    weighted_avg_start_time: pd.Timestamp = WEIGHTED_AVG_START_TIME,
    system: DynamicMoodSystem = None,
) -> pd.DataFrame:
    if system is None:
        system = DynamicMoodSystem()

    df = pd.DataFrame.from_records(
        [
            {
                "timestamp": ident.timestamp,
                "blog_name": ident.blog_name,
                "logit_diff": sent["logit_diff"],
                "generated_logit_diff": sent.get("generated_logit_diff")
                if sent.get("generated_logit_diff")
                else (
                    [
                        pos_sent_to_logit_diff(entry)
                        for entry in sent.get("generated_pos_sent")
                    ]
                    if "generated_pos_sent" in sent
                    else None
                ),
                "p75_generated_logit_diff": sent.get("p75_generated_logit_diff"),
                "text_for_sentiment": sent.get("text_for_sentiment"),
                "generated_ts": sent.get("generated_ts"),
            }
            for ident, sent in response_cache.user_input_sentiments.items()
        ]
    ).drop_duplicates(subset=["timestamp"])

    _filter = df.generated_logit_diff.notnull() & df.p75_generated_logit_diff.isnull()
    df.loc[_filter, "p75_generated_logit_diff"] = df.loc[
        _filter, "generated_logit_diff"
    ].apply(lambda l: np.percentile(l, 75))

    df["time"] = df.timestamp.apply(lambda ts: fromtimestamp_pst(ts))

    _filter = df.generated_ts.notnull()
    df.loc[_filter, "time"] = df.loc[_filter, "generated_ts"]

    df = df.sort_values(by="time")

    _filter = (df["time"] < GENERATED_TS_FIRST_STABLE) | (df["generated_ts"].notnull())
    if sum(_filter) < len(df):
        print(f"keeping {sum(_filter)} of {len(df)} rows")
        df = df[_filter]

    df["using_weighted_avg"] = df["time"] >= weighted_avg_start_time

    can_compute_determiner = df.p75_generated_logit_diff.notnull() & df.generated_ts.notnull()

    df["determiner"] = 0.
    df.loc[~can_compute_determiner, "determiner"] = df.logit_diff

    compute_as_legacy = can_compute_determiner & ~df.using_weighted_avg
    compute_as_weighted_avg = can_compute_determiner & df.using_weighted_avg

    df.loc[compute_as_legacy, "determiner"] = df.loc[compute_as_legacy, :].apply(
        compute_determiner_legacy, axis=1
    )
    df.loc[compute_as_weighted_avg, "determiner"] = df.loc[compute_as_weighted_avg, :].apply(
        compute_determiner_weighted_avg, axis=1
    )

    mood_inputs = df.set_index("time")
    mood_inputs = system.set_centered_scaled_determiner(mood_inputs)

    duplicate_bug_filter = ~mood_inputs.index.duplicated(keep="first")
    duplicate_bug_filter = duplicate_bug_filter | (
        mood_inputs.index < DUPLICATES_BUGFIX_START_TS
    )
    mood_inputs = mood_inputs[duplicate_bug_filter]

    return mood_inputs
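# A sketch of driving compute_dynamic_mood_inputs. Constructing a ResponseCache
# is out of scope here, so the sketch takes one as an argument.
def _demo_dynamic_mood_inputs(response_cache: ResponseCache) -> pd.DataFrame:
    # one row per unique user-input timestamp, indexed by "time" (generated_ts
    # where available), with "determiner" filled by the legacy or
    # weighted-average path and then centered/scaled by the mood system
    return compute_dynamic_mood_inputs(response_cache)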