Example #1
def get_ts_from_fn(fn):
    if fn is None:
        return None
    try:
        return fromtimestamp_pst(os.path.getmtime(fn))
    except FileNotFoundError:
        print(f"couldn't find {fn}")
Example #2
def construct_head_training_texts(thread: TumblrThread, base_head_timestamp: datetime, blog_name: str = bot_name):
    head_timestamp = roll_head_timestamp(base_head_timestamp=base_head_timestamp,
                                         actual_timestamp=fromtimestamp_pst(thread.timestamp))
    _, text_selector, text_autoreviewer = make_nwo_prompts(thread,
                                                           head_timestamp=head_timestamp,
                                                           blog_name=blog_name,
                                                           ml_prompt_format=False)
    return text_selector, text_autoreviewer
Example #3
def count_posts_since_ts(post_payloads, ts):
    is_after = [
        (fromtimestamp_pst(entry['timestamp']) - ts).total_seconds() > 0
        for entry in post_payloads if not entry.get('is_pinned')
    ]

    if all(is_after):
        msg = f"count_posts_since_ts: all {len(is_after)} posts passed are after ts passed."
        msg += " Count returned will be a lower bound."
        print(msg)

    return sum(is_after)
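A hypothetical call to count_posts_since_ts, with the payload shape inferred from the code above (a POSIX 'timestamp' field plus an optional 'is_pinned' flag). The names cutoff and payloads are made up for illustration, and now_pst is assumed to be the helper sketched under Example #1:

from datetime import timedelta

cutoff = now_pst() - timedelta(hours=6)
payloads = [
    {"id": 1, "timestamp": int(cutoff.timestamp()) + 3600, "is_pinned": True},  # pinned: ignored
    {"id": 2, "timestamp": int(cutoff.timestamp()) + 3600},                     # after the cutoff
    {"id": 3, "timestamp": int(cutoff.timestamp()) - 3600},                     # before the cutoff
]
print(count_posts_since_ts(payloads, cutoff))  # prints 1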
Example #4
def fetch_and_process(blog_name: str = bot_name,
                      n: Optional[int] = None,
                      offset: int = 0,
                      include_unused_types=False,
                      fetch_only=False,
                      process_only=False):
    with open("data/head_training_data_raw_posts.pkl.gz", "rb") as f:
        posts = pickle.load(f)

    max_ts_posix = max(pp["timestamp"] for pp in posts)
    max_ts = fromtimestamp_pst(max_ts_posix).isoformat()
    print(f"loaded {len(posts)} raw posts, max ts {max_ts}")

    lines = load()
    max_processed_id = max(line["id"] for line in lines)
    print(f"loaded {len(lines)} existing records, max id {max_processed_id}")

    if process_only:
        new_posts = [pp for pp in posts if pp["id"] > max_processed_id]
    else:
        pool = ClientPool()

        new_posts = fetch_posts(pool, blog_name, n, offset, needs_private_client=True, stop_at_id=max_processed_id)

        posts.extend(new_posts)

        print(f"saving {len(posts)} raw posts")

        with open("data/head_training_data_raw_posts.pkl.gz", "wb") as f:
            pickle.dump(posts, f)

    if fetch_only:
        return lines

    base_head_timestamp = now_pst()

    lines_new = [post_to_line_entry(pp,
                                    base_head_timestamp,
                                    blog_name=blog_name,
                                    include_unused_types=include_unused_types)
                 for pp in tqdm(new_posts, mininterval=0.3, smoothing=0)]
    lines.extend(lines_new)
    return lines
Example #5
def fetch_posts(pool: ClientPool,
                blog_name: str = bot_name,
                n: Optional[int] = None,
                offset: int = 0,
                report_cadence=5000,
                needs_private_client=False,
                needs_dash_client=False,
                stop_at_id=0,
                before=None,
                screener=None):
    posts = []
    ids = set()
    since_last_report = 0
    n_ok = 0
    n_full = 0

    rejection_reasons = Counter()

    tqdm_bar = None

    if needs_private_client and needs_dash_client:
        raise ValueError(
            "fetch_posts: only one of needs_private_client and needs_dash_client can be true"
        )

    client_getter = pool.get_client
    if needs_private_client:
        client_getter = pool.get_private_client
    if needs_dash_client:
        client_getter = pool.get_dashboard_client

    while True:
        client = client_getter()
        page, next_offset, total_posts = fetch_next_page(client,
                                                         offset=offset,
                                                         blog_name=blog_name,
                                                         before=before)

        if not tqdm_bar:
            tqdm_bar = tqdm(total=total_posts)
            tqdm_bar.update(offset)
            tqdm_bar.set_postfix(cl=pool.client_name(client))

        if (len(page) == 0) or (next_offset == offset):
            print(f"stopping, empty page after {len(posts)} posts")
            return posts

        since_last_report += len(page)
        if since_last_report >= report_cadence:
            pool.report()
            since_last_report = 0

        nraw = len(page)
        page = [pp for pp in page if pp['id'] not in ids]
        ndedup = len(page)

        page = [
            pp for pp in page if pp['id'] > stop_at_id
            or pp.get('is_pinned')  # pins make id non-monotonic
        ]
        nafter = len(page)
        nbefore = ndedup - nafter

        page_ids = {pp['id'] for pp in page}

        delta_full = len(page)
        n_full += delta_full

        if screener:
            _page = []
            reasons = []
            for pp in page:
                ok, reason, _ = screener(pp)
                if ok:
                    _page.append(pp)
                else:
                    reasons.append(reason)
            rejection_reasons.update(reasons)
            page = _page
        n_ok += len(page)

        ids.update(page_ids)
        posts.extend(page)
        offset = next_offset

        if len(page) == 0:
            min_ts = None
        else:
            min_ts = fromtimestamp_pst(min(pp['timestamp']
                                           for pp in page)).isoformat()
        tqdm_bar.update(delta_full)
        tqdm_bar.set_postfix(cl=pool.client_name(client),
                             min_ts=min_ts,
                             n_ok=n_ok,
                             n_full=n_full)

        max_n = total_posts
        if n:
            max_n = min(n, max_n)

        if n_full >= max_n:
            print(
                f"stopping with {n_full} posts, {n_ok} OK: reached maximum {max_n}"
            )
            print(f"rejection_reasons: {rejection_reasons.most_common()}")
            return posts

        if nbefore > 0:
            print(
                f"stopping with {n_full} posts, {n_ok} OK: {nbefore}/{ndedup} in current page are before id {stop_at_id}"
            )
            print(f"rejection_reasons: {rejection_reasons.most_common()}")
            return posts
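The screener argument, when provided, is called on each post payload and must return a 3-tuple whose first two elements are an accept flag and a rejection reason (the third element is unused by fetch_posts). A hypothetical screener that keeps only text posts, relying on the "type" field of Tumblr post payloads:

def text_only_screener(pp):
    # returns (accept flag, rejection reason, extra payload); the third value is ignored here
    if pp.get("type") == "text":
        return True, None, None
    return False, f"type={pp.get('type')}", None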
Example #6
def sample_year_and_set_timestamp(thread: TumblrThread) -> TumblrThread:
    timestamp = fromtimestamp_pst(thread.timestamp)

    timestamp = sample_year_and_set(timestamp)

    return set_timestamp(thread, timestamp)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dryrun", action="store_true")
    parser.add_argument("--hot-only", action="store_true")
    args = parser.parse_args()

    base_head_timestamp = now_pst()

    # trace
    print("loading trace logs")
    if args.hot_only:
        import persistence.traceability_singleton

        trace_logs = persistence.traceability_singleton.TRACE_LOGS.logs["data"]
    else:
        trace_logs = traceability.load_full_traceability_logs()["data"]

    print(f"loaded trace logs: {len(trace_logs)} rows")

    trace_logs = [
        row for row in trace_logs
        if row.get("requested__state") in {"draft", "queue"}
    ]

    print(f"subsetted trace logs to draft/queue:  {len(trace_logs)} rows")

    required_keys = [
        "api__id", "prompt_autoreviewer", "choice_ix", "all_continuations",
        "timestamp_manual", "post_type", "state_reasons"
    ]
    keycounts = Counter()
    key_nonnull_counts = Counter()

    for row in trace_logs:
        for k in required_keys:
            keycounts[k] += (k in row)
            key_nonnull_counts[k] += (row.get(k) is not None)

    print(f"keycounts: {keycounts}\nkey_nonnull_counts: {key_nonnull_counts}")

    trace_logs = [
        row for row in trace_logs if all(
            row.get(k) is not None for k in required_keys)
    ]

    print(f"subsetted trace logs to nwo / usable:  {len(trace_logs)} rows")

    # don't let the model learn from its own use of the rts tag
    trace_logs = [
        row for row in trace_logs
        if not row['state_reasons'].get('ml_rejected')
    ]

    print(
        f"removed model-rejected drafts from trace logs:  {len(trace_logs)} rows"
    )

    pool = ClientPool()
    current_queue = [
        pp['id'] for pp in pool.get_private_client().queue(
            'nostalgebraist-autoresponder')['posts']
    ]

    trace_logs = [
        row for row in trace_logs if row.get("api__id") not in current_queue
    ]

    print(f"removed currently queued posts:  {len(trace_logs)} rows")

    trace_indices_to_texts = {}
    for i, row in enumerate(trace_logs):
        actual_timestamp = fromtimestamp_pst(row["timestamp_manual"])

        subbed = sub_prompt_timestamp(base_head_timestamp, actual_timestamp,
                                      row["prompt_autoreviewer"])
        trace_indices_to_texts[i] = subbed + row["all_continuations"][
            row["choice_ix"]]

    trace_map = defaultdict(list)

    for i, row in enumerate(trace_logs):
        trace_map[row["api__id"]].append(i)

    # pub
    print("loading pub logs")
    with open("data/head_training_data.json", "r", encoding="utf-8") as f:
        pub_logs = json.load(f)

    print(f"loaded pub logs: {len(pub_logs)} rows")

    for row in pub_logs:
        gid = row["genesis_post_id"]
        row["genesis_or_published_id"] = gid if gid is not None else row["id"]

    pub_map = defaultdict(list)

    for i, row in enumerate(pub_logs):
        pub_map[row["genesis_or_published_id"]].append(i)

    # match
    print("matching...")

    trace_indices_to_targets = {}
    trace_indices_to_published_ids = {}

    n_accept = 0
    n_reject = 0
    n_skip = 0
    n_multimatch = 0

    iter_ = tqdm(trace_map.items(),
                 total=len(trace_map),
                 mininterval=1,
                 smoothing=0)

    for api__id, group_trace_indices in iter_:
        pub_gids_matching_trace_id = pub_map.get(api__id, [])

        if len(pub_gids_matching_trace_id) == 0:
            # never published
            for trace_index in group_trace_indices:
                trace_indices_to_targets[trace_index] = "reject"
                trace_indices_to_published_ids[trace_index] = None
            n_reject += len(group_trace_indices)
        else:
            if len(pub_gids_matching_trace_id) > 1:
                # ???
                n_multimatch += 1

            matching_pub_row = pub_logs[pub_gids_matching_trace_id[0]]

            # assumes trace is ordered by time -- i believe this is true
            pubd_ix = group_trace_indices[-1]

            if trace_logs[pubd_ix]['requested__state'] != 'queue':
                trace_indices_to_targets[pubd_ix] = "accept"
                trace_indices_to_published_ids[pubd_ix] = matching_pub_row[
                    "id"]
                n_accept += 1
            else:
                # queued posts i don't delete aren't signal
                trace_indices_to_targets[pubd_ix] = "skip"
                n_skip += 1

            for trace_index in group_trace_indices[:-1]:
                trace_indices_to_targets[trace_index] = "reject"
                trace_indices_to_published_ids[trace_index] = None

            n_reject += len(group_trace_indices) - 1

        iter_.set_postfix(n_accept=n_accept,
                          n_reject=n_reject,
                          n_skip=n_skip,
                          zz_n_multimatch=n_multimatch)

    # verify
    n_accept_verify = sum(v == "accept"
                          for v in trace_indices_to_targets.values())
    n_reject_verify = sum(v == "reject"
                          for v in trace_indices_to_targets.values())
    n_skip_verify = sum(v == "skip" for v in trace_indices_to_targets.values())

    print(f"\nn_accept: {n_accept_verify} vs {n_accept}")
    print(f"n_reject: {n_reject_verify} vs {n_reject}")
    print(f"n_skip: {n_skip_verify} vs {n_skip}")

    autoreview_train_data = []
    for ix in sorted(trace_indices_to_targets.keys()):
        if trace_indices_to_targets[ix] == 'skip':
            continue
        autoreview_train_data.append({
            "text": trace_indices_to_texts[ix],
            "target": trace_indices_to_targets[ix],
            "trace_api__id": trace_logs[ix]["api__id"],
            "pub_api__id": trace_indices_to_published_ids[ix],
            "post_type": trace_logs[ix]["post_type"],
        })

    if not args.dryrun:
        with open("data/autoreview_train_data.json", "w",
                  encoding="utf-8") as f:
            json.dump(autoreview_train_data, f, indent=1)
Example #8
def compute_dynamic_mood_inputs(
    response_cache: ResponseCache,
    weighted_avg_start_time: pd.Timestamp = WEIGHTED_AVG_START_TIME,
    system: DynamicMoodSystem = None,
) -> pd.DataFrame:
    if system is None:
        system = DynamicMoodSystem()

    df = pd.DataFrame.from_records(
        [
            {
                "timestamp": ident.timestamp,
                "blog_name": ident.blog_name,
                "logit_diff": sent["logit_diff"],
                "generated_logit_diff": sent.get("generated_logit_diff")
                if sent.get("generated_logit_diff")
                else (
                    [
                        pos_sent_to_logit_diff(entry)
                        for entry in sent.get("generated_pos_sent")
                    ]
                    if "generated_pos_sent" in sent
                    else None
                ),
                "p75_generated_logit_diff": sent.get("p75_generated_logit_diff"),
                "text_for_sentiment": sent.get("text_for_sentiment"),
                "generated_ts": sent.get("generated_ts"),
            }
            for ident, sent in response_cache.user_input_sentiments.items()
        ]
    ).drop_duplicates(subset=["timestamp"])

    _filter = df.generated_logit_diff.notnull() & df.p75_generated_logit_diff.isnull()
    df.loc[_filter, "p75_generated_logit_diff"] = df.loc[
        _filter, "generated_logit_diff"
    ].apply(lambda l: np.percentile(l, 75))

    df["time"] = df.timestamp.apply(lambda ts: fromtimestamp_pst(ts))
    _filter = df.generated_ts.notnull()
    df.loc[_filter, "time"] = df.loc[_filter, "generated_ts"]
    df = df.sort_values(by="time")

    _filter = (df["time"] < GENERATED_TS_FIRST_STABLE) | (df["generated_ts"].notnull())
    if sum(_filter) < len(df):
        print(f"keeping {sum(_filter)} of {len(df)} rows")
    df = df[_filter]

    df["using_weighted_avg"] = df["time"] >= weighted_avg_start_time

    can_compute_determiner = df.p75_generated_logit_diff.notnull() & df.generated_ts.notnull()
    df["determiner"] = 0.
    df.loc[~can_compute_determiner, "determiner"] = df.logit_diff

    compute_as_legacy = can_compute_determiner & ~df.using_weighted_avg
    compute_as_weighted_avg = can_compute_determiner & df.using_weighted_avg
    df.loc[compute_as_legacy, "determiner"] = df.loc[compute_as_legacy, :].apply(compute_determiner_legacy, axis=1)
    df.loc[compute_as_weighted_avg, "determiner"] = df.loc[compute_as_weighted_avg, :].apply(compute_determiner_weighted_avg, axis=1)

    mood_inputs = df.set_index("time")

    mood_inputs = system.set_centered_scaled_determiner(mood_inputs)

    duplicate_bug_filter = ~mood_inputs.index.duplicated(keep="first")
    duplicate_bug_filter = duplicate_bug_filter | (
        mood_inputs.index < DUPLICATES_BUGFIX_START_TS
    )
    mood_inputs = mood_inputs[duplicate_bug_filter]

    return mood_inputs