def random_sent_summary():
    """Evaluate retrieval quality using one random child sentence per paper.

    For every paper in the 'meta' repo, pick one random child-sentence id,
    send it to the query service, and find the rank at which the paper
    itself appears in the returned similarity list.  Prints the mean rank
    and the top-3 / top-10 hit counts.

    Relies on module-level `q` (query RPC client) — TODO confirm it is in scope.
    """
    meta_repo = CassandraDatabase(project_name='papers', repo_name='meta',
                                  id_sql_type='BIGINT', content_sql_type="TEXT")
    loc = 0    # accumulated 0-based rank of the correct paper
    top3 = 0   # queries whose paper ranked in the first 3 results
    top10 = 0  # queries whose paper ranked in the first 10 results
    for paper_id, row in meta_repo.list():
        # Stored meta is a Python-literal dict; 'nan' placeholders are not
        # valid literals, so normalise them to empty strings before eval.
        meta = ast.literal_eval(row.replace('nan', '\'\''))
        children = meta['children']
        random_sent_id = random.choice(children)
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": random_sent_id,
                "count": 205
            })))
        sims = result['result']  # ordered mapping: candidate id -> similarity
        # 0-based position of the true paper among the ranked candidate ids.
        index = np.where(np.array(list(sims.keys())) == paper_id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index
    print('{:.2f} top3 {} top 10 {}'.format(loc / meta_repo.count(), top3, top10))
    # Recorded results — total count 203.
    #   USE:     [9.46, 3: 135, 10: 159]
    #   upvotes: [9.68, 3: 150, 10: 165]
def title_summary():
    """Evaluate retrieval quality using each paper's title as the query.

    For every paper in the 'title' repo, query the search service with the
    title text and find the rank at which the paper itself appears in the
    returned similarity list.  Prints the mean rank and the top-3 / top-10
    hit counts.

    Relies on module-level `q` (query RPC client) — TODO confirm it is in scope.
    """
    repo = CassandraDatabase(project_name='papers', repo_name='title',
                             id_sql_type='BIGINT', content_sql_type="TEXT")
    loc = 0    # accumulated 0-based rank of the correct paper
    top3 = 0   # queries whose paper ranked in the first 3 results
    top10 = 0  # queries whose paper ranked in the first 10 results
    for paper_id, title in repo.list():
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": title,
                "count": 203
            })))
        sims = result['result']  # ordered mapping: candidate id -> similarity
        # 0-based position of the true paper among the ranked candidate ids.
        index = np.where(np.array(list(sims.keys())) == paper_id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index
    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))
    # Recorded results — total count 203.
    #   USE:     [13.62, 3: 106, 10: 148]
    #   upvotes: [20.56, 3: 83, 10: 124]
def encode():
    """Encode every paper title and summary with the Infer RPC encoder.

    Reads titles from the 'title' repo and the matching summary (by paper id)
    from the 'summary' repo, encodes both, and writes the stringified vectors
    into the 'title_reviews' / 'summary_reviews' repos keyed by paper id.
    """
    title_repo = CassandraDatabase(project_name='papers', repo_name='title',
                                   id_sql_type='BIGINT', content_sql_type="TEXT")
    summary_repo = CassandraDatabase(project_name='papers', repo_name='summary',
                                     id_sql_type='BIGINT', content_sql_type="TEXT")
    encoded_title_repo = CassandraDatabase(project_name='papers', repo_name='title_reviews',
                                           id_sql_type='BIGINT', content_sql_type="TEXT")
    encoded_summary_repo = CassandraDatabase(project_name='papers', repo_name='summary_reviews',
                                             id_sql_type='BIGINT', content_sql_type="TEXT")
    # Alternative encoders (USERpcClient, a joblib CountVectorizer) were tried
    # previously; InferRpcClient is the one in use.
    encoder = InferRpcClient()
    for i, (paper_id, title) in enumerate(title_repo.list()):
        print(i)  # progress indicator
        title_vec = str(encoder.encode(title)['encoded'])
        summary_vec = str(encoder.encode(summary_repo.read(paper_id)[0])['encoded'])
        encoded_title_repo.write(paper_id, title_vec)
        encoded_summary_repo.write(paper_id, summary_vec)
def get_docs(repo_name):
    """Load all encoded documents from *repo_name*.

    Each stored row is a Python-literal vector; it is literal-eval'd back
    into a Python object.  Returns parallel numpy arrays ``(ids, docs)``.
    """
    vecs_repo = CassandraDatabase(project_name='papers', repo_name=repo_name,
                                  id_sql_type='BIGINT', content_sql_type="TEXT")
    pairs = [(row_id, ast.literal_eval(content))
             for row_id, content in vecs_repo.list()]
    row_ids = [row_id for row_id, _ in pairs]
    vectors = [vec for _, vec in pairs]
    return np.array(row_ids), np.array(vectors)
def title_sents():
    """Evaluate sentence-level retrieval using each paper's title as query.

    Queries the search service with the title, maps each returned sentence id
    back to its paper via the 'sent_sum_map' repo, and records the 0-based
    rank (over distinct papers) at which the correct paper first appears.
    Prints the mean rank and the top-3 / top-10 hit counts.

    Relies on module-level `q` (query RPC client) — TODO confirm it is in scope.
    """
    repo = CassandraDatabase(project_name='papers', repo_name='title',
                             id_sql_type='BIGINT', content_sql_type="TEXT")
    sent_sum_map_repo = CassandraDatabase(project_name='papers', repo_name='sent_sum_map',
                                          id_sql_type='BIGINT', content_sql_type="TEXT")
    loc = 0    # accumulated 0-based rank of the correct paper
    top3 = 0   # queries whose paper ranked in the first 3 distinct papers
    top10 = 0  # queries whose paper ranked in the first 10 distinct papers
    k = 0
    for paper_id, title in repo.list():
        k += 1
        print(k)  # progress indicator
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": title,
                "count": 203
            })))
        sims = result['result']  # ordered mapping: sentence id -> similarity
        seen_papers = []  # distinct paper ids, in ranked order of first sentence hit
        index = None
        for sent_id in sims.keys():
            candidate = int(sent_sum_map_repo.read(sent_id)[0])
            if candidate not in seen_papers:
                seen_papers.append(candidate)
            if candidate == paper_id:
                # 0-based rank, consistent with title_summary()/random_sent_summary()
                # (the original used len(seen_papers), a 1-based rank, which made
                # the <3 / <10 thresholds effectively top-2 / top-9).
                index = len(seen_papers) - 1
                break
        if index is None:
            # Paper never surfaced in the returned sentences (original code
            # would hit a NameError / stale index here); count as worst case.
            index = len(seen_papers)
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index
    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))
    # Recorded results — total count 203.
    #   USE:     [11.40, 3: 11, 10: 143]
    #   upvotes: [17.77, 3: 107, 10: 141]
def encode_sents():
    """Encode every sentence with the fitted CountVectorizer.

    Loads the persisted vectorizer from disk, transforms each sentence from
    the 'sentences' repo into a count vector, and writes the stringified
    vector into the 'sents_count_vec' repo keyed by sentence id.
    """
    sents_repo = CassandraDatabase(project_name='papers', repo_name='sentences',
                                   id_sql_type='BIGINT', content_sql_type="TEXT")
    encoded_sents_repo = CassandraDatabase(project_name='papers', repo_name='sents_count_vec',
                                           id_sql_type='BIGINT', content_sql_type="TEXT")
    # Raw string: the original 'C:\zorba\...' relied on '\z' not being an
    # escape, which raises SyntaxWarning on modern CPython. Same path value.
    path = r'C:\zorba\storage\vectorizer.joblib'
    vectorizer = load(path)
    for i, (sent_id, sentence) in enumerate(sents_repo.list()):
        print(i)  # progress indicator
        sent_vec = str(vectorizer.transform([sentence]).toarray()[0].tolist())
        encoded_sents_repo.write(sent_id, sent_vec)
from services.cassandra_ import CassandraDatabase
import ast
import random

# Script: populate the 'sent_sum_map' repo (sentence id -> owning paper id)
# from the 'meta' repo's per-paper 'children' sentence-id lists.
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")
sent_sum_map_repo = CassandraDatabase(project_name='papers', repo_name='sent_sum_map', id_sql_type='BIGINT', content_sql_type="TEXT")
encoded_sents_repo = CassandraDatabase(project_name='papers', repo_name='sents_count_vec', id_sql_type='BIGINT', content_sql_type="TEXT")

for id, row in meta_repo.list():
    # 'nan' placeholders are not valid Python literals; replace before eval.
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    ids = meta['children']
    print(id, ids, random.choice(ids))
    print(encoded_sents_repo.read(random.choice(ids))[0])
    break  # NOTE(review): debug probe — only the first meta row is inspected

# NOTE(review): because of the `break` above, `ids` and `id` here refer to the
# first paper only, so only that paper's sentence mapping is written.
# Presumably the loop below was meant to run for every paper — confirm intent.
for child_id in ids:
    sent_sum_map_repo.write(child_id, str(id))
import time

from joblib import dump, load
from sklearn.feature_extraction import stop_words
# Missing in the original script — CountVectorizer is used below and would
# have raised NameError at runtime.
from sklearn.feature_extraction.text import CountVectorizer

from services.cassandra_ import CassandraDatabase

# Script: fit a bag-of-words CountVectorizer over all paper titles and
# summaries and persist it to disk for later encoding runs.
title_repo = CassandraDatabase(project_name='papers', repo_name='title',
                               id_sql_type='BIGINT', content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary',
                                 id_sql_type='BIGINT', content_sql_type="TEXT")

# Renamed so the frozen-set of words no longer shadows the imported
# `stop_words` module.
english_stop_words = stop_words.ENGLISH_STOP_WORDS

corpus = []
for paper_id, summary in summary_repo.list():
    corpus.append(summary)
    corpus.append(title_repo.read(paper_id)[0])

# Raw string: the original 'C:\zorba\...' relied on '\z' not being an
# escape, which raises SyntaxWarning on modern CPython. Same path value.
path = r'C:\zorba\storage\vectorizer.joblib'
vectorizer = CountVectorizer(stop_words=english_stop_words)
t1 = time.time()
vectorizer.fit(corpus)  # fit() returns the vectorizer itself; no need to bind it
print(time.time() - t1)  # fitting wall-time, seconds
print(len(vectorizer.get_feature_names()))  # vocabulary size
dump(vectorizer, path)
# vectorizer = load(path)  # reload for reuse in encode_sents()