def prune():
    """Prune machines which haven't removed themselves properly."""
    with session() as sess:
        sess.execute("""
        delete from machines where updated_at < now() - interval '5 minutes'
        """)
        sess.commit()
def await_job(jid):
    with session() as sess:
        params = {'jid': jid}
        i = 0
        while True:
            time.sleep(1)
            job = sess.execute(text("""
            select state, method from jobs where id=:jid
            """), params).fetchone()
            # TODO notify them of error?
            # 10 minutes, give up
            if job.state == 'error' or i > 60 * 10:
                return Box(method=job.method, data_out=False)
            if job.state == 'done':
                ## don't delete actually, let Job.prune handle that. Need last_job (created_at)
                # delete from jobs where id=:jid returning method, data_out
                job = sess.execute(text("""
                select method, data_out from jobs where id=:jid
                """), params).fetchone()
                sess.commit()
                return job
            i += 1
def ga(uid: Union[str, UUID4], category: str, action: str):
    """
    I'm only tracking interesting server-side events right now (no cookies). I want to see
    what features are being used, and important bits like sign-ups & book-thumbs.
    user-id is obfuscated.
    https://developers.google.com/analytics/devguides/collection/protocol/v1/devguide#event
    """
    # We don't actually care about uid, just need a unique identifier. Note this is a
    # 1-way hash, so it can't be reversed; we just want a unique per-feature track.
    uid_ = str(uid).encode()  # to bytes
    uid_ = hashlib.sha256(uid_).hexdigest()
    url = "https://ssl.google-analytics.com/"
    url += "debug/collect" if DEBUG else "collect"
    res = requests.post(url, params=dict(
        v=1, tid=vars.GA, cid=uid_, t='event', ec=category, ea=action))
    # if DEBUG: print(res.json())
    if action in ('register', 'like', 'dislike', 'therapist', 'notes'):
        with session() as sess:
            if sess.execute(text("""
            select is_superuser su from users where id=:uid
            """), dict(uid=uid)).fetchone().su:
                # don't notify of my own or Lisa's entries
                return
        send_mail('*****@*****.**', 'action', dict(category=category, action=action))
def nlp_on_rows(method, id, job_id):
    for_entries = method == 'entries'
    with session() as sess:
        uids = nlp_on_rows_(method, id, job_id, sess, set())
        # profiles doesn't use uids, but create 1-el array to iterate anyway
        if not for_entries:
            uids = [None]
        for i, uid in enumerate(uids):
            uid = str(uid)
            gen_keywords(for_entries, uid, sess)
            if for_entries:
                # Then add a book job for every user who was affected. Delay by x minutes per
                # job (jobs are pruned by updated_at < ?, so posting to the future)
                # 9131155e: only update every x entries
                future = datetime.utcnow() + timedelta(minutes=i * 3)
                sess.add(M.Job(
                    method='books',
                    data_in={'args': [uid]},
                    created_at=future,
                    updated_at=future,
                ))
                sess.commit()
            else:
                match_profiles(sess)
    return {}
def themes(eids, algo='agglomorative'):
    logger.info("Themes")
    with session() as sess:
        # use Model to decrypt fields
        E, CE = M.Entry, M.CacheEntry
        res = sess.query(CE) \
            .with_entities(CE.paras, CE.clean, CE.vectors) \
            .join(E, E.id == CE.entry_id) \
            .filter(E.id.in_(eids), func.array_length(CE.vectors, 1) > 0) \
            .order_by(E.created_at.desc()) \
            .all()
        # assert len(eids) == len(res)
        entries = pd.Series([e for r in res for e in r.paras])
        stripped = pd.Series([c for r in res for c in r.clean])
        vecs = np.vstack([r.vectors for r in res]).astype(np.float32)

        chain = Similars(vecs)
        if False and os.path.exists(vars.AE_PATH):
            chain = chain.autoencode(filename=vars.AE_PATH).cluster(algo=algo)
        else:
            chain = chain.normalize().cluster(algo=algo)
        clusters = chain.value()
        labels = chain.data.labels

        topics = []
        for l, center in enumerate(clusters):
            mask = labels == l
            n_entries = mask.sum().item()
            print('n_entries', n_entries)
            if n_entries < 2:
                print('skipping')
                continue
            vecs_, stripped_, entries_ = vecs[mask], stripped[mask], entries[mask]
            dists = Similars(center, vecs_).normalize().cosine(abs=True).value().squeeze()
            entries_ = entries_.iloc[dists.argsort()].tolist()[:5]
            terms = top_terms(stripped_.tolist())
            topics.append({
                'n_entries': n_entries,
                'terms': terms,
                'summary': entries_,  # add full thing, will batch-compute next
                'sentiment': None,
            })

        groups = [t['summary'] for t in topics]
        batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
        for i, res in enumerate(batch_summaries):
            print(res)
            topics[i]['summary'] = res['summary']
            topics[i]['sentiment'] = res['sentiment']

        topics = {'terms': top_terms(stripped, 10), 'themes': topics}
        return topics
def prune():
    """Prune completed or stuck jobs. Completed jobs aren't too useful for admins; error is."""
    with session() as sess:
        sess.execute("""
        delete from jobs
        where updated_at < now() - interval '10 minutes'
            and state in ('working', 'done')
        """)
        sess.commit()
def db(client):
    """await client to init_db"""
    with D.session() as sess:
        # wait for GPU to restart from no-db crash
        while True:
            sql = "select 1 from machines where status='on'"
            if M.await_row(sess, sql):
                break
            time.sleep(.5)
        yield sess
def themes(eids, algo='agglomorative'):
    with session() as sess:
        # use Model to decrypt fields
        res = sess.query(M.CacheEntry) \
            .with_entities(M.CacheEntry.paras, M.CacheEntry.clean, M.CacheEntry.vectors) \
            .join(M.Entry, M.Entry.id == M.CacheEntry.entry_id) \
            .filter(M.Entry.id.in_(eids)) \
            .order_by(M.Entry.created_at.desc()) \
            .all()
        # assert len(eids) == len(res)
        entries = pd.Series([e for r in res for e in r.paras])
        stripped = pd.Series([c for r in res for c in r.clean])
        vecs = []
        for r in res:
            if r.vectors:
                vecs += r.vectors
        # if not vecs: return False  # TODO something else to return?
        vecs = np.vstack(vecs).astype(np.float32)
        clusters = Similars(vecs).normalize().cluster(algo=algo).value()

        topics = []
        for l in range(clusters.max()):
            in_clust = clusters == l
            n_entries = in_clust.sum().item()
            print('n_entries', n_entries)
            if n_entries < 2:
                print('skipping')
                continue
            vecs_, stripped_, entries_ = vecs[in_clust], \
                stripped.iloc[in_clust], entries.iloc[in_clust]
            center = vecs_.mean(axis=0)[np.newaxis, :]
            dists = Similars(center, vecs_).normalize().cosine().value().squeeze()
            entries_ = entries_.iloc[dists.argsort()].tolist()[:5]
            terms = top_terms(stripped_.tolist())
            topics.append({
                'n_entries': n_entries,
                'terms': terms,
                'summary': entries_,  # add full thing, will batch-compute next
                'sentiment': None,
            })

        groups = [t['summary'] for t in topics]
        batch_summaries = nlp_.summarization(groups, min_length=50, max_length=300)
        for i, res in enumerate(batch_summaries):
            print(res)
            topics[i]['summary'] = res['summary']
            topics[i]['sentiment'] = res['sentiment']

        topics = {'terms': top_terms(stripped, 10), 'themes': topics}
        return topics
def influencers():
    with session() as sess:
        users = sess.execute(text("""
        select id::text from users
        where
            -- has logged in recently
            updated_at > now() - interval '2 days' and
            -- has been 1d since last-run (or never run)
            (extract(day from now() - last_influencers) >= 1 or last_influencers is null)
        """)).fetchall()
        for u in users:
            uid_ = dict(uid=u.id)
            sess.execute(text("""
            update users set last_influencers=now() where id=:uid
            """), uid_)
            sess.commit()

            res = influencers_(sess, u.id)
            if not res:
                continue

            # A field can get deleted while running XGB, causing a fkey constraint error.
            # https://docs.sqlalchemy.org/en/13/dialects/postgresql.html
            # Can't do on_conflict for FK constraints, get fresh ids and filter out missing ones.
            fids = [x.id for x in sess.execute(text("""
            select id::text from fields where user_id=:uid
            """), uid_).fetchall()]

            next_preds, importances, all_imps = res
            for fid, others in importances.items():
                if fid not in fids:
                    continue
                inf_score, next_pred = all_imps[fid], next_preds[fid]
                insert = postgresql.insert(M.Influencer.__table__).values([
                    dict(field_id=fid, influencer_id=inf_id, score=score)
                    for inf_id, score in others.items()
                    if inf_id in fids
                ])
                sess.execute(insert.on_conflict_do_update(
                    constraint=M.Influencer.__table__.primary_key,
                    set_=dict(score=insert.excluded.score)
                ))
                sess.execute(text("""
                update fields set influencer_score=:score, next_pred=:pred
                where id=:fid
                """), dict(score=inf_score, pred=next_pred, fid=fid))
                sess.commit()
    return {}
def cloud_up_maybe():
    if is_dev():
        return
    with session() as sess:
        if M.User.last_checkin(sess) > 15:
            return
        if M.Machine.gpu_status(sess) in ("on", "pending"):
            return

        logger.warning("Initing AWS Batch")
        M.Machine.notify_online(sess, 'batch', 'pending')
        boto3.client('batch').submit_job(
            jobName=str(uuid4()),
            jobQueue='gnothi-jq',
            jobDefinition='gnothi-jd',
        )
def predict_books(user_id, vecs_user, n_recs=30, centroids=False):
    with session() as sess:
        # TODO should I move this down further, to get more lines to test?
        fixt = fixtures.load_books(user_id)
        if fixt is not None:
            return fixt
        vecs_books, books = load_books(sess)
        sql = "select book_id as id, user_id, shelf from bookshelf where user_id=%(uid)s"
        shelf = pd.read_sql(sql, sess.bind, params={'uid': user_id}).set_index('id', drop=False)
        shelf_idx = books.id.isin(shelf.id)

        # normalize for cosine, and downstream DNN
        chain = Similars(vecs_user, vecs_books).normalize()
        vecs_user, vecs_books = chain.value()

        logger.info("Finding cosine similarities")
        if centroids:
            # cluster the user's vectors, then compare the books against those cluster centroids
            labels = chain.agglomorative().value()
            lhs = np.vstack([
                vecs_user[labels == l].mean(0)
                for l in range(labels.max())
            ])
            chain = Similars(lhs, vecs_books)

        # Take best cluster-score for every book
        dist = chain.cosine().value().min(axis=0)
        # 0f29e591: minmax_scale(dist). norm_out=True works better
        # then map back onto books, so they're back in order (pandas index-matching)
        books['dist'] = dist

        if shelf_idx.sum() > 0:
            like, dislike = dist.min() - dist.std(), dist.max() + dist.std()
            shelf_map = dict(like=like, already_read=like, recommend=like,
                             dislike=dislike, remove=None, ai=None)
            shelf['dist'] = shelf.shelf.apply(lambda k: shelf_map[k])
            shelf.dist.fillna(books.dist, inplace=True)  # fill in "remove"
            books.loc[shelf.index, 'dist'] = shelf.dist  # indexes (id) match, so assigns correctly
            assert not books.dist.isna().any(), "Messed up merging shelf/books.dist by index"

        # e2eaea3f: save/load dnn
        dnn = train_books_predictor(books, vecs_books, shelf_idx)

        books['dist'] = dnn.predict(vecs_books)

        # dupes by title in libgen
        # r = books.sort_values('dist')\
        df = books[~shelf_idx].sort_values('dist') \
            .drop_duplicates('title', keep='first') \
            .iloc[:n_recs]
        fixtures.save_books(user_id, df)
        return df
def multiple_book_jobs(uids):
    with session() as sess:
        sess.execute(satext("""
        update users set last_books=null where id in :uids
        """), dict(uids=tuple(uids)))
        sess.commit()

    # TODO handle this in run.py when it's consuming jobs
    def delay_books(uid, i):
        time.sleep(i * 60 * 5)  # 5m
        Job.create_job('books', data_in=dict(args=[str(uid)]))

    for i, uid in enumerate(uids):
        threading.Thread(target=delay_books, args=(uid, i)).start()
def wrap_job(jid, method, fn):
    logger.info(f"Run job {method}")
    try:
        start = time.time()
        res = fn()
        sql = "update jobs set state='done', data_out=:data where id=:jid"
        logger.info(f"Job {method} complete {time.time() - start}")
    except Exception as err:
        err = str(traceback.format_exc())  # str(err)
        res = dict(error=err)
        sql = "update jobs set state='error', data_out=:data where id=:jid"
        logger.error(f"Job {method} error {time.time() - start} {err}")
    with session() as sess:
        sess.execute(satext(sql), dict(data=jsonb(res), jid=str(jid)))
        sess.commit()
def match_profiles():
    with session() as sess:
        df = pd.read_sql("""
        select e.user_id, c.vectors from cache_entries c
        inner join entries e on e.id=c.entry_id
        where c.vectors is not null
        """, sess.bind)
        if not df.shape[0]:
            return

        # flatten multi-paragraph entries
        df['vectors'] = df.vectors.apply(mean_)
        # then mean the semantic of all entries for this user.
        # TODO cluster or something, just mean-ing all their entries is stupid
        df = df.groupby(['user_id']).vectors.agg(mean_)
        uids = df.index.tolist()
        vecs_entries = np.vstack(df.values)

        # TODO add community (M.User.public == True)
        df = pd.read_sql("""
        select c.user_id, c.vectors from cache_users c
        inner join users u on c.user_id=u.id
        where u.therapist=true and c.vectors is not null
        """, sess.bind)
        if not df.shape[0]:
            return
        match_ids = df.user_id.tolist()
        # This on the other hand is OK to mean, it's just their profile
        vecs_profiles = np.vstack(df.vectors.apply(mean_).values)

        dists = Similars(vecs_entries, vecs_profiles).normalize().cosine().value()

        sess.execute(text("""
        delete from profile_matches where user_id in :uids
        """), dict(uids=tuple(uids)))
        sess.commit()

        # everything is in same order at this point
        sess.bulk_save_objects([
            M.ProfileMatch(user_id=uid, match_id=mid, score=dists[i, j])
            for i, uid in enumerate(uids)
            for j, mid in enumerate(match_ids)
        ])
        sess.commit()
def clear_fixtures(self):
    all_ = FRESH == 'all'
    if 'books' in FRESH or all_:
        self.rm(f"{BASE}/books.pkl")
    if 'entries' in FRESH or all_:
        self.rm(f"{BASE}/entries.pkl")
        self.rm(f"{BASE}/nlp_entries.pkl")
    if 'profiles' in FRESH or all_:
        self.rm(f"{BASE}/nlp_profiles.pkl")
    if 'wiki' in FRESH or all_:
        self.rm(f"{BASE}/wiki", isdir=True)
    if 'influencers' in FRESH or all_:
        self.rm(f"{BASE}/xgb_hypers.pkl")
    if 'liben' in FRESH or all_:
        self.rm("/storage/libgen_testing.npy")
        with session() as sess:
            sess.execute("delete from books")
            sess.commit()
def run_job(job):
    jid_, k = str(job.id), job.method
    jid = {'jid': jid_}
    with session() as sess:
        data = sess.execute("select data_in from jobs where id=:jid", jid).fetchone().data_in
    args = data.get('args', [])
    kwargs = data.get('kwargs', {})

    if k in ('entries', 'profiles'):
        kwargs['job_id'] = jid_
    if k == 'books':
        # free this process's models, then run books in its own process
        nlp_.clear()
        os.system(f"python app/books.py --jid={jid_} --uid={args[0]}")
        return

    def fn():
        # m: module-level mapping of job method name -> handler function (defined elsewhere)
        return m[k](*args, **kwargs)
    M.Job.wrap_job(jid_, k, fn)
def cloud_up_maybe():
    if is_dev():
        return
    with session() as sess:
        if M.User.last_checkin(sess) > 10:
            return
        if M.Machine.gpu_status(sess) != "off":
            return

        logger.warning("Initing Paperspace")
        M.Machine.notify_online(sess, 'paperspace', 'pending')

        jobs = job_client.list()
        if any([j.state in up_states for j in jobs]):
            return
        vars_ = {**dict(vars), **{'MACHINE': 'paperspace'}}
        return job_client.create(
            machine_type='K80',
            container='lefnire/gnothi:gpu-0.0.13',
            project_id=vars.PAPERSPACE_PROJECT_ID,
            is_preemptible=True,
            command='python app/run.py',
            env_vars=vars_
        )
def run_books(user_id):
    with session() as sess:
        user_id = str(user_id)
        uid = {'uid': user_id}
        # don't run if ran recently (notice the inverse if & comparator, simpler)
        if sess.execute(text(f"""
        select 1 from users
        where id=:uid and last_books > {utcnow} - interval '10 minutes'
        """), uid).fetchone():
            return
        sess.execute(text(f"""
        update users set last_books={utcnow} where id=:uid
        """), uid)
        sess.commit()

        entries = sess.execute(text("""
        select c.vectors from cache_entries c
        inner join entries e on e.id=c.entry_id and e.user_id=:uid
        order by e.created_at desc;
        """), uid).fetchall()
        profile = sess.execute(text("""
        select vectors from cache_users where user_id=:uid
        """), uid).fetchone()

        vecs = []
        if profile and profile.vectors:
            vecs += profile.vectors
        for e in entries:
            if e.vectors:
                vecs += e.vectors
        vecs = np.vstack(vecs).astype(np.float32)
        res = predict_books(user_id, vecs)

        sess.execute(text("""
        delete from bookshelf where user_id=:uid and shelf='ai'
        """), uid)
        sess.commit()

        res = res.rename(columns=dict(id='book_id', dist='score'))[['book_id', 'score']]
        res['user_id'] = user_id
        res['shelf'] = 'ai'
        res['created_at'] = res['updated_at'] = datetime.datetime.utcnow()
        res.to_sql('bookshelf', sess.bind, if_exists='append', index=False)
def create_job(method, data_in={}, **kwargs):
    """
    Ensures certain jobs are only created once at a time. Never manually add Job();
    call this instead.
    """
    with session() as sess:
        arg0 = data_in.get('args', [None])[0]
        if type(arg0) != str:
            arg0 = None

        # For entries, profiles: set ai_ran=False to queue them into the next batch,
        # then arg0 isn't used downstream (it was previously).
        if method in ('entries', 'profiles') and arg0:
            table = dict(entries='entries', profiles='users')[method]
            sess.execute(satext(f"""
            update {table} set ai_ran=False where id=:id
            """), dict(id=arg0))
            sess.commit()

        exists = sess.execute(satext("""
        select 1 from jobs
        -- maybe if we're mid-job, things have changed; so don't incl. working? rethink
        --where method=:method and state in ('new', 'working') and
        where method=:method and state='new' and
        case
            when method='influencers' then true
            when method='books' and data_in->'args'->>0=:arg0 then true
            when method='entries' then true
            when method='profiles' then true
            when method='habitica' then true
            else false
        end
        """), dict(method=method, arg0=arg0)).fetchone()
        if exists:
            return False

        j = Job(method=method, data_in=data_in, **kwargs)
        sess.add(j)
        sess.commit()
        sess.refresh(j)
        return str(j.id)
def predict_books(user_id, vecs_user, n_recs=30):
    with session() as sess:
        # TODO should I move this down further, to get more lines to test?
        fixt = fixtures.load_books(user_id)
        if fixt is not None:
            return fixt
        vecs_books, books = load_books(sess, user_id)

        # normalize for cosine, and downstream DNN
        chain = Similars(vecs_user, vecs_books).normalize()
        vecs_user, vecs_books = chain.value()

        logger.info("Finding cosine similarities")
        # Take best cluster-score for every book
        dist = chain.cosine(abs=True).value().min(axis=0)
        # 0f29e591: minmax_scale(dist). norm_out=True works better
        # then map back onto books, so they're back in order (pandas index-matching)

        # Push highly-rated books up, low-rated books down. Do that even stronger for user's own ratings.
        # Using negative-score because cosine DISTANCE (less is better)
        books['dist'] = dist
        books['dist'] = books.dist \
            + (books.dist.std() * -books.global_score / 2.) \
            + (books.dist.std() * -books.user_score)
        assert not books.dist.isna().any(), "Messed up merging shelf/books.dist by index"

        # e2eaea3f: save/load dnn
        dnn = train_books_predictor(books, vecs_books)

        books['dist'] = dnn.predict(vecs_books)

        # dupes by title in libgen
        # r = books.sort_values('dist')\
        df = books[~books.user_rated].sort_values('dist') \
            .drop_duplicates('title', keep='first') \
            .iloc[:n_recs]
        fixtures.save_books(user_id, df)
        return df
def await_job(jid):
    with session() as sess:
        params = {'jid': jid}
        i = 0
        while True:
            time.sleep(1)
            job = sess.execute(text("""
            select state, method from jobs where id=:jid
            """), params).fetchone()
            # TODO notify them of error?
            # 10 minutes, give up
            if job.state == 'error' or i > 60 * 10:
                return Box(method=job.method, data_out=False)
            if job.state == 'done':
                job = sess.execute(text("""
                select method, data_out from jobs where id=:jid
                """), params).fetchone()
                sess.commit()
                return job
            i += 1
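# A minimal usage sketch (not from the original source): pairing create_job() with
# await_job() above to enqueue a 'books' job and block on its result. The wrapper name
# and `some_user_id` are hypothetical; create_job returns False when an equivalent job
# is already queued, and await_job returns data_out=False on error or timeout.
def books_for_user_sketch(some_user_id):
    jid = Job.create_job('books', data_in=dict(args=[str(some_user_id)]))
    if jid is False:
        return None  # a matching job is already in state='new'; let that one run
    job = await_job(jid)
    return job.data_out  # False if the job errored or timed out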
    def fn():
        return m[k](*args, **kwargs)
    M.Job.wrap_job(jid_, k, fn)


# 3eb71b3: unloading models. multiprocessing handles better
if __name__ == '__main__':
    logger.info(f"torch.cuda.current_device() {torch.cuda.current_device()}")
    logger.info(f"torch.cuda.device(0) {torch.cuda.device(0)}")
    logger.info(f"torch.cuda.device_count() {torch.cuda.device_count()}")
    logger.info(f"torch.cuda.get_device_name(0) {torch.cuda.get_device_name(0)}")
    logger.info(f"torch.cuda.is_available() {torch.cuda.is_available()}")
    logger.info("\n\n")

    with session() as sess:
        while True:
            M.Machine.notify_online(sess, vars.MACHINE)
            cloud_down_maybe(sess)

            # only allow 2 jobs at a time
            if M.Machine.job_ct_on_machine(sess, vars.MACHINE) >= 2:
                time.sleep(1)
                continue

            # Find jobs
            job = M.Job.take_job(sess, "run_on='gpu'")
            if job:
                # aaf1ec95: multiprocessing.Process for problem models
                threading.Thread(target=run_job, args=(job,)).start()
                # run_job(job.id)
def eid_to_title(self, eid):
    with session() as sess:
        return sess.execute(text("""
        select title from entries where id=:eid
        """), dict(eid=eid)).fetchone().title
def uid_to_email(self, uid):
    with session() as sess:
        return sess.execute(text("""
        select email from users where id=:uid
        """), dict(uid=uid)).fetchone().email
def load_books_df(sess):
    # sort asc since that's how we mapped to vecs in first place (order_values)
    df = pd.read_sql("select * from books order by id asc", sess.bind)
    if df.shape[0]:
        return df.set_index('id', drop=False)

    logger.info("Load books MySQL")

    FIND_PROBLEMS = False
    ALL_BOOKS = False

    # for-sure psych. See tmp/topics.txt, or libgen.sql topics(lang='en')
    psych_topics = 'psychology|self-help|therapy'
    # good other psych topics, either mis-categorized or other
    psych_topics += '|anthropology|social|religion'
    psych_topics += '|^history|^education'

    sql = Box(
        select="select u.ID, u.Title, u.Author, d.descr, t.topic_descr",
        body="""
        from updated u
            inner join description d on d.md5=u.MD5
            inner join topics t on u.Topic=t.topic_id and t.lang='en'
        where u.Language = 'English'
            and title not regexp 'sams|teach yourself'  -- remove junk
            and (length(d.descr) + length(u.Title)) > 200
            and u.ID not in ('62056','72779','111551','165602','165606','239835','240399','272945',
                '310202','339718','390651','530739','570667','581466','862274','862275','879029',
                '935149','1157279','1204687','1210652','1307307','1410416','1517634','1568907',
                '1592543','2103755','2128089','2130515','2187329','2270690','2270720','2275684',
                '2275804','2277017','2284616','2285559','2314405','2325313','2329959','2340421',
                '2347272','2374055','2397307','2412259','2420958','2421152','2421413','2423975')
        """,
        # handle u.Topic='' (1326526 rows)
        just_psych=f"and t.topic_descr regexp '{psych_topics}'",
        # find_problems
        just_ids="select distinct u.ID",
        where_id="and u.ID=:id"
    )

    if FIND_PROBLEMS:
        # Those MD5s: UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 636: character maps to <undefined>
        # TODO try instead create_engine(convert_unicode=True)
        with session('books') as sessb:
            ids = ' '.join([sql.just_ids, sql.body])
            ids = [x.ID for x in sessb.execute(ids).fetchall()]
            problem_ids = []
            for i, id in enumerate(tqdm(ids)):
                if i % 10000 == 0:
                    problems = len(problem_ids) / len(ids) * 100
                    logger.info(f"{problems}% problems")
                try:
                    row = ' '.join([sql.select, sql.body, sql.where_id])
                    sessb.execute(text(row), {'id': id})
                except:
                    problem_ids.append(id)
        problem_ids = ','.join([f"'{id}'" for id in problem_ids])
        logger.info(f"and u.ID not in ({problem_ids})")
        exit(0)

    sql_ = [sql.select, sql.body]
    if not ALL_BOOKS:
        sql_ += [sql.just_psych]
    sql_ = ' '.join(sql_)
    with session('books') as sessb:
        df = pd.read_sql(sql_, sessb.bind)
    df = df.drop_duplicates(['Title', 'Author'])

    logger.info(f"n_books before cleanup {df.shape[0]}")
    logger.info("Removing HTML")
    broken = r'(\?\?\?|\#\#\#)'  # russian / other. FIXME better way to handle
    df = df[~(df.Title + df.descr).str.contains(broken)] \
        .drop_duplicates(['Title', 'Author'])  # TODO reconsider
    df['descr'] = cleantext.multiple(df.descr.tolist(), [
        cleantext.strip_html,
        cleantext.fix_punct,
        cleantext.only_ascii,
        cleantext.multiple_whitespace,
        cleantext.unmark
    ])
    # books = books[books.clean.apply(lambda x: detect(x) == 'en')]
    logger.info(f"n_books after cleanup {df.shape[0]}")

    df = df.rename(columns=dict(
        ID='id',
        descr='text',
        Title='title',
        Author='author',
        topic_descr='topic',
    ))

    # drop dupes, keep longest desc
    df['txt_len'] = df.text.str.len()
    df = df.sort_values('txt_len', ascending=False) \
        .drop_duplicates('id') \
        .drop(columns=['txt_len']) \
        .sort_values('id')
    df['thumbs'] = 0

    logger.info("Saving books to DB")
    df.to_sql('books', sess.bind, index=False, chunksize=500, if_exists='append', method='multi')
    return df.set_index('id', drop=False)
create_database(vars.DB_FULL)

import pytest, time
from box import Box
from fastapi.testclient import TestClient
from lorem_text import lorem
import common.database as D
import common.models as M
from common.fixtures import fixtures
from app.main import app

import logging
logger = logging.getLogger(__name__)

with D.session() as sess:
    for t in """
    bookshelf
    cache_entries
    cache_users
    entries
    entries_tags
    field_entries
    fields
    jobs
    machines
    notes
    people
    shares
    shares_tags
    tags
def db_books():
    with D.session('books') as sess:
        yield sess
def load_df(self):
    if exists(paths.df):
        logger.info("Load books.df")
        self.df = pd.read_feather(paths.df) \
            .drop(columns=['index']) \
            .set_index('id', drop=False)
        return

    # invalidate embeddings, they're out of sync
    try:
        os.remove(paths.vecs)
    except:
        pass

    logger.info("Load books MySQL")
    # 58fbd36a: limit to psychology topics
    sql = """
    select u.ID, u.Title, u.Author, d.descr, t.topic_descr
    from updated u
        inner join description d on d.md5=u.MD5
        inner join topics t on u.Topic=t.topic_id
            -- later more languages; but I think it's only Russian in Libgen?
            and t.lang='en'
    where u.Language = 'English'
        -- Make sure there's some content to work with
        and length(d.descr) > 200 and length(u.Title) > 1
    """
    with session('books') as sessb:
        df = pd.read_sql(sql, sessb.bind)
    df = df.rename(columns=dict(
        ID='id',
        descr='text',
        Title='title',
        Author='author',
        topic_descr='topic',
    ))

    logger.info(f"n_books before cleanup {df.shape[0]}")
    logger.info("Remove HTML")
    # some books are literally just ########
    df = df[~(df.title + df.text).str.contains(r'(\?\?\?|\#\#\#)')]
    df['text'] = CleanText(df.text.tolist()) \
        .strip_html() \
        .only_ascii() \
        .multiple_whitespace() \
        .value()
    df['txt_len'] = df.text.str.len()
    # Ensure has content. Drop dupes, keeping those w longest description
    df = df[df.txt_len > 150] \
        .sort_values('txt_len', ascending=False) \
        .drop_duplicates('id') \
        .drop_duplicates(['title', 'author']) \
        .drop(columns=['txt_len'])
    # books = books[books.clean.apply(lambda x: detect(x) == 'en')]
    logger.info(f"n_books after cleanup {df.shape[0]}")

    logger.info("Save books.df")
    # Error: feather does not support serializing a non-default index for the index; you can .reset_index() to make the index into column(s)
    # I get ^ even though no index has yet been set. Have to manually reset_index() anyway
    df = df.reset_index()
    df.to_feather(paths.df)
    # call self, which returns newly-saved df (ensures consistent order, etc)
    self.load_df()
def influencers_(user_id):
    logging.info("Influencers")
    with session() as sess:
        fes = pd.read_sql("""
        -- remove duplicates, use average. FIXME find the dupes bug
        with fe_clean as (
            select field_id, created_at::date, avg(value) as value
            from field_entries
            group by field_id, created_at::date
        ),
        -- ensure enough data
        fe_ct as (
            select field_id from fe_clean
            group by field_id having count(value) > 5
        )
        select
            fe.created_at,      -- index
            fe.field_id::text,  -- column, uuid->string
            fe.value            -- value
        from fe_clean fe
        inner join fe_ct on fe_ct.field_id=fe.field_id  -- just removes rows
        inner join fields f on f.id=fe.field_id
        where f.user_id=%(uid)s
            and f.excluded_at is null
        order by fe.created_at asc
        """, sess.bind, params={'uid': user_id})
        if not fes.size:
            return None  # not enough entries

        params = dict(
            uid=user_id,
            fids=tuple(fes.field_id.unique().tolist())
        )
        fs = pd.read_sql("""
        select id::text, default_value, default_value_value
        from fields
        where user_id=%(uid)s
            and id in %(fids)s
            and excluded_at is null
        """, sess.bind, params=params)
        fs = {r.id: r for i, r in fs.iterrows()}

    # Easier pivot debugging
    # fields['field_id'] = fields.field_id.apply(lambda x: x[0:4])
    fes = fes.pivot(index='created_at', columns='field_id', values='value')
    # fes = fes.resample('D')
    cols = fes.columns.tolist()

    hypers = hyperopt(fes, fs, user_id)
    xgb_args = {}  # {'tree_method': 'gpu_hist', 'gpu_id': 0}

    next_preds = {}
    importances = {}
    all_imps = []
    for t in cols:
        # remove up until they start tracking; we'll impute from there on up
        fvi = fes[t].first_valid_index()
        fes_ = impute_and_roll(fes[fvi:].copy(), fs)

        ### Next Preds
        ### ----------
        # For next-pred, we keep target column. Yes, likely most predictive; but a rolling
        # trend is important info
        X = fes_
        y = X[t]
        model = XGBRegressor(**xgb_args, **hypers)
        model.fit(X, y)
        preds = model.predict(X.iloc[-1:])
        next_preds[t] = float(preds[0])
        # model.fit(X, y)  # what's this? was I fitting twice?

        ### Importances
        ### -----------
        X = fes_.drop(columns=[t])
        y = fes_[t]
        model = XGBRegressor(**xgb_args, **hypers)
        model.fit(X, y)
        imps = [float(x) for x in model.feature_importances_]

        # FIXME
        # /xgboost/sklearn.py:695: RuntimeWarning: invalid value encountered in true_divide return all_features / all_features.sum()
        # I think this is due to target having no different value, in which case
        # just leave like this.
        imps = [0. if np.isnan(imp) else imp for imp in imps]

        # put target col back in
        imps.insert(cols.index(t), 0.0)
        dict_ = dict(zip(cols, imps))
        all_imps.append(dict_)
        importances[t] = dict_

    all_imps = dict(pd.DataFrame(all_imps).mean())

    return next_preds, importances, all_imps
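# Sketch of influencers_()'s return shape, inferred from how influencers() consumes it
# above (field ids and values here are hypothetical, for illustration only):
#   next_preds  = {'<field_id>': 4.2, ...}                       # next predicted value per field
#   importances = {'<field_id>': {'<field_id>': 0.7, ...}, ...}  # per-target feature importances
#   all_imps    = {'<field_id>': 0.35, ...}                      # mean importance of each field across targets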
def nlp_on_rows(method='entries'):
    for_entries = method == 'entries'  # else is_profile
    with session() as sess:
        if for_entries:
            rows = sess.query(M.Entry) \
                .filter(
                    func.length(M.Entry.text) > 64,
                    M.Entry.no_ai.isnot(True),
                    M.Entry.ai_ran.isnot(True)
                )
        else:
            rows = sess.query(M.User) \
                .filter(
                    func.length(M.User.bio) > 32,
                    M.User.ai_ran.isnot(True)
                )
        rows = rows.all()
        if not rows:
            return {}

        paras_grouped = []
        uids = set()
        for r in rows:
            txt = r.text if for_entries \
                else r.bio  # r.profile_to_text()  # TODO profile_to_text adds people
            paras_grouped.append(cleantext.markdown_split_paragraphs([txt]))
            if for_entries:
                uids.add(r.user_id)
        paras_flat = [p for paras in paras_grouped for p in paras]

        fkeys = [r.title for r in rows] \
            if for_entries else [r.email for r in rows]
        fixt = fixtures.load_nlp_rows(fkeys, method=method)
        if fixt:
            if for_entries:
                clean_txt, embeds, titles, texts = fixt
            else:
                clean_txt, embeds = fixt
        else:
            clean_txt = cleantext.keywords(
                paras_flat, postags=['NOUN', 'ADJ', 'VERB', 'PROPN'])
            embeds = nlp_.sentence_encode(paras_flat).tolist()
            if for_entries:
                titles = nlp_.summarization(paras_grouped, min_length=5, max_length=20, with_sentiment=False)
                texts = nlp_.summarization(paras_grouped, min_length=30, max_length=250)

        for i, r in enumerate(rows):
            CM = M.CacheEntry if for_entries else M.CacheUser
            c = sess.query(CM).get(r.id)
            if not c:
                c = CM(entry_id=r.id) if for_entries else CM(user_id=r.id)
                sess.add(c)
            # Save the cache_entry (paras, clean, vectors)
            paras = paras_grouped[i]
            c.paras = paras
            ct = len(paras)
            c.clean = [' '.join(e) for e in clean_txt[:ct]]
            c.vectors = embeds[:ct]
            sess.commit()

            # Save the fixture for later
            fixt = (clean_txt[:ct], embeds[:ct], titles[i], texts[i]) \
                if for_entries else (clean_txt[:ct], embeds[:ct])
            fixt_k = r.title if for_entries else r.email
            fixtures.save_nlp_row(fixt_k, fixt, method=method)

            clean_txt, embeds = clean_txt[ct:], embeds[ct:]

            if for_entries:
                r.title_summary = titles[i]["summary"]
                r.text_summary = texts[i]["summary"]
                r.sentiment = texts[i]["sentiment"]
            r.ai_ran = True
            sess.commit()

        if for_entries:
            # 9131155e: only update every x entries
            M.Job.multiple_book_jobs(list(uids))
    return {}