def title_summary(): repo = CassandraDatabase(project_name='papers', repo_name='title', id_sql_type='BIGINT', content_sql_type="TEXT") loc = 0 top3 = 0 top10 = 0 for id, row in repo.list(): result = ast.literal_eval( q.query(json.dumps({ "text": row, "count": 203 }))) sims = result['result'] inter = result['keywords'] index = np.where(np.array(list(sims.keys())) == id)[0][0] if index < 3: top3 += 1 if index < 10: top10 += 1 loc += index print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10)) '''tootal count 203. USE: [13.62, 3: 106, 10: 148], upvotes: [20.56, 3: 83, 10: 124],
def random_sent_summary(): meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT") encoded_sents_repo = CassandraDatabase(project_name='papers', repo_name='sents', id_sql_type='BIGINT', content_sql_type="TEXT") loc = 0 top3 = 0 top10 = 0 for id, row in meta_repo.list(): meta = ast.literal_eval(row.replace('nan', '\'\'')) ids = meta['children'] random_sent_id = random.choice(ids) result = ast.literal_eval( q.query(json.dumps({ "text": random_sent_id, "count": 205 }))) sims = result['result'] inter = result['keywords'] index = np.where(np.array(list(sims.keys())) == id)[0][0] if index < 3: top3 += 1 if index < 10: top10 += 1 loc += index print('{:.2f} top3 {} top 10 {}'.format(loc / meta_repo.count(), top3, top10)) '''tootal count 203. USE: [9.46, 3: 135, 10: 159], upvotes: [9.68,3: 150, 10: 165],
def get_docs(repo_name):
    """Load every stored (id, vector) pair from the given repo.

    Each row's content is a stringified Python list; it is parsed with
    ast.literal_eval. Returns (ids, docs) as two numpy arrays in repo
    iteration order.
    """
    vecs_repo = CassandraDatabase(project_name='papers', repo_name=repo_name,
                                  id_sql_type='BIGINT', content_sql_type="TEXT")
    pairs = [(row_id, ast.literal_eval(content))
             for row_id, content in vecs_repo.list()]
    ids = [row_id for row_id, _ in pairs]
    docs = [vec for _, vec in pairs]
    return np.array(ids), np.array(docs)
def split_into_sents():
    """Split every summary into sentences and persist them.

    Each sentence is written to the 'sentences' repo under a globally
    sequential id, and the list of those ids is recorded in the paper's
    meta dict under 'children'. Relies on module-level summary_repo and
    meta_repo.
    """
    sents_repo = CassandraDatabase(project_name='papers', repo_name='sentences',
                                   id_sql_type='BIGINT', content_sql_type="TEXT")
    next_sent_id = 0
    for paper_id, text in summary_repo.list():
        meta_row = meta_repo.read(paper_id)[0]
        # Serialized meta may contain bare `nan` tokens; neutralize them
        # so literal_eval succeeds.
        meta = ast.literal_eval(meta_row.replace('nan', "''"))
        children = []
        for sentence in nltk.sent_tokenize(text):
            sents_repo.write(next_sent_id, sentence)
            children.append(next_sent_id)
            next_sent_id += 1
        meta['children'] = children
        meta_repo.write(paper_id, str(meta))
def title_sents():
    """Evaluate title queries against the sentence index: each returned
    sentence id is mapped back to its parent paper via sent_sum_map, and
    the rank of the paper's own id among the de-duplicated parents is
    accumulated. Prints mean rank plus top-3/top-10 hit counts.
    """
    repo = CassandraDatabase(project_name='papers', repo_name='title', id_sql_type='BIGINT', content_sql_type="TEXT")
    sent_sum_map_repo = CassandraDatabase(project_name='papers', repo_name='sent_sum_map', id_sql_type='BIGINT', content_sql_type="TEXT")
    loc = 0    # accumulated ranks
    top3 = 0
    top10 = 0
    k = 0      # progress counter
    for id, row in repo.list():
        k += 1
        print(k)
        # Querier returns a stringified dict; parse it back.
        result = ast.literal_eval(
            q.query(json.dumps({ "text": row, "count": 203 })))
        sims = result['result']        # ordered {sentence_id: similarity}
        inter = result['keywords']     # unused here
        papers_ids = []
        for sent_id in list(sims.keys()):
            # Map the hit sentence back to its parent paper id.
            paper_id = int(sent_sum_map_repo.read(sent_id)[0])
            if paper_id not in papers_ids:
                papers_ids.append(paper_id)
                if paper_id == id:
                    # NOTE(review): this is a 1-based rank (len after append),
                    # whereas title_summary() uses 0-based np.where indices —
                    # the two metrics are off by one; confirm intent.
                    index = len(papers_ids)
                    break
        # NOTE(review): if the paper never appears in the results, `index`
        # is unbound on the first iteration (NameError) and *stale* from the
        # previous iteration afterwards — likely a latent bug.
        # papers_ids = np.array(papers_ids)
        # index = np.where(np.array(papers_ids) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index
    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))
'''tootal count 203. USE: [11.40, 3: 11, 10: 143], upvotes: [17.77, 3: 107, 10: 141],
def encode_sents():
    """Encode every stored sentence with the fitted CountVectorizer and
    persist each vector (as a stringified list) to 'sents_count_vec'.

    Commented-out alternatives show the RPC encoders used by sibling
    scripts (Infer / USE).
    """
    sents_repo = CassandraDatabase(project_name='papers', repo_name='sentences',
                                   id_sql_type='BIGINT', content_sql_type="TEXT")
    encoded_sents_repo = CassandraDatabase(project_name='papers', repo_name='sents_count_vec',
                                           id_sql_type='BIGINT', content_sql_type="TEXT")
    # encoder = InferRpcClient()
    # encoder = USERpcClient()
    # Raw string: the original 'C:\zorba\storage\\vectorizer.joblib' relied on
    # CPython preserving the invalid escapes \z and \s verbatim — that emits a
    # DeprecationWarning today and becomes a SyntaxError in future versions.
    # The runtime value is unchanged.
    path = r'C:\zorba\storage\vectorizer.joblib'
    vectorizer = load(path)
    i = 0
    for sent_id, sentence in sents_repo.list():
        print(i)  # progress indicator
        i += 1
        # sent_vec = str(encoder.encode(sentence)['encoded'])
        sent_vec = str(vectorizer.transform([sentence]).toarray()[0].tolist())
        encoded_sents_repo.write(sent_id, sent_vec)
from services.cassandra_ import CassandraDatabase
import ast
import random

# Script: inspect the first meta record, then build the sentence-id ->
# paper-id mapping ('sent_sum_map') from each paper's 'children' list.
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")
sent_sum_map_repo = CassandraDatabase(project_name='papers', repo_name='sent_sum_map', id_sql_type='BIGINT', content_sql_type="TEXT")
encoded_sents_repo = CassandraDatabase(project_name='papers', repo_name='sents_count_vec', id_sql_type='BIGINT', content_sql_type="TEXT")

for id, row in meta_repo.list():
    # Serialized meta may contain bare `nan` tokens; neutralize before parsing.
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    ids = meta['children']
    # Debug inspection of one record, then stop the scan.
    print(id, ids, random.choice(ids))
    print(encoded_sents_repo.read(random.choice(ids))[0])
    break
# NOTE(review): because of the unconditional `break` above, only the FIRST
# record's bindings (`id`, `ids`) reach this mapping loop — the break looks
# like debug leftover; to populate the full map it should be removed (and
# this loop nested back inside the scan). Confirm against the original file.
for child_id in ids:
    sent_sum_map_repo.write(child_id, str(id))
from services.keywords_RPC import KeywordExtractorRpcClient
from services.cassandra_ import CassandraDatabase
from services.cassandra_ import CassandraDatabase  # NOTE(review): exact duplicate import — safe but redundant
import ast
import spacy

# Script: enrich each paper's meta with extracted keywords and their
# spaCy lemmas, then write the updated meta back.
nlp = spacy.load('en_core_web_sm')
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary', id_sql_type='BIGINT', content_sql_type="TEXT")
extractor = KeywordExtractorRpcClient()
i = 0  # NOTE(review): never incremented or read — dead counter
for id, row in meta_repo.list():
    # Serialized meta may contain bare `nan` tokens; neutralize before parsing.
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    keywords = meta['keywords']
    text = summary_repo.read(id)[0]
    # A single-element keyword list is treated as a placeholder and replaced;
    # otherwise extracted keywords are appended to the existing ones.
    # (presumably len == 1 means "no real keywords" — confirm upstream.)
    if len(keywords) != 1:
        keywords += list(extractor.extract(text)['keywords'])
    else:
        keywords = list(extractor.extract(text)['keywords'])
    keywords_lemmas = [token.lemma_ for token in nlp(' '.join(keywords))]
    # De-duplicate; note set() loses ordering.
    meta['keywords_lemmas'] = list(set(keywords_lemmas))
    meta['keywords'] = list(set(keywords))
    meta_repo.write(id, str(meta))
import ast
from sklearn.metrics.pairwise import cosine_similarity
from services.encoders.USE_RPC import USERpcClient
from services.encoders.infer_RPC import InferRpcClient
from joblib import dump, load
import spacy
from collections import defaultdict

# NOTE(review): `os` is used on the next line but not imported in this view —
# NameError unless imported earlier in the file; confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
'''vectors repos [title_, summary_], [USE, Infer, Gru5, upvotes, jokes, reviews upv_score256, upvotes256, reviews256, jokes256, gru256, count_vec] '''
nlp = spacy.load('en_core_web_sm')
# NOTE(review): CassandraDatabase is not imported in this chunk — presumably
# imported elsewhere in the file; confirm.
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")
repo_name = 'summary_USE'
encoder = USERpcClient()
# encoder = InferRpcClient()
# path = 'C:\zorba\storage\\vectorizer.joblib'
# vectorizer = load(path)

def encode(text):
    """Encode text via the configured RPC encoder; returns the vector as a
    stringified list (the storage format used by the *_vec repos)."""
    vector = str(encoder.encode(text)['encoded'])
    # vector = str(vectorizer.transform([text]).toarray()[0].tolist())
    return vector

def get_docs(repo_name):
    # NOTE(review): definition truncated in this view — the full version
    # (see the sibling get_docs) fills these and returns numpy arrays.
    docs = []
    ids = []
import pandas as pd
from dateutil import parser
import nltk
import ast
from services.cassandra_ import CassandraDatabase

# Shared repos for the papers ingest pipeline.
title_repo = CassandraDatabase(project_name='papers', repo_name='title', id_sql_type='BIGINT', content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary', id_sql_type='BIGINT', content_sql_type="TEXT")
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")

def split_into_sents():
    """Split every summary into sentences, store each under a sequential id
    in the 'sentences' repo, and record the ids in meta['children'].
    (Duplicated in a sibling chunk of this codebase.)"""
    count = 0
    for id, text in summary_repo.list():
        row = meta_repo.read(id)[0]
        # Serialized meta may contain bare `nan` tokens; neutralize before parsing.
        meta = ast.literal_eval(row.replace('nan', '\'\''))
        meta['children'] = []
        sent_text = nltk.sent_tokenize(text)
        for sentence in sent_text:
            sents_repo.write(count, sentence)
            meta['children'].append(count)
            count+=1
        meta_repo.write(id, str(meta))
    # NOTE(review): in the original one-liner, `sents_repo` is created at the
    # top of this function; it is referenced above — confirm placement.

def parse_record(record):
    """Normalize one scraped record: lower-case, strip non-breaking spaces."""
    title = record['paper'].lower().replace(u'\xa0', u' ').strip()
    summary = record['summary'].lower().replace(u'\xa0', u' ').strip()
    try:
        date = parser.parse(record['publish date'].replace(u'\xa0', u' ').strip())
    except:
from sklearn.feature_extraction.text import CountVectorizer
from joblib import dump, load
from services.cassandra_ import CassandraDatabase
import time
from sklearn.feature_extraction import stop_words

# Script: fit a CountVectorizer over all summaries + titles and persist it.
# NOTE(review): `sklearn.feature_extraction.stop_words` is a deprecated module
# path in newer sklearn (moved to `_stop_words`); confirm installed version.
title_repo = CassandraDatabase(project_name='papers', repo_name='title', id_sql_type='BIGINT', content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary', id_sql_type='BIGINT', content_sql_type="TEXT")
# NOTE(review): rebinding `stop_words` shadows the imported module name —
# works, but confusing; a distinct name would be clearer.
stop_words = stop_words.ENGLISH_STOP_WORDS
corpus = []
for id, row in summary_repo.list():
    corpus.append(row)
    corpus.append(title_repo.read(id)[0])
# NOTE(review): '\z' and '\s' are invalid escape sequences (kept verbatim by
# CPython today, SyntaxError in the future) — a raw string would be safer.
path = 'C:\zorba\storage\\vectorizer.joblib'
vectorizer = CountVectorizer(stop_words=stop_words)
t1 = time.time()
x = vectorizer.fit(corpus)
print(time.time() - t1)  # fit duration in seconds
print(len(vectorizer.get_feature_names()))  # vocabulary size
dump(vectorizer, path)
# vectorizer = load(path)
from services.querier_RPC import QuerierRpcClient
import numpy as np
from services.encoders.USE_RPC import USERpcClient
from services.encoders.infer_RPC import InferRpcClient
from joblib import dump, load
import json
import ast

# Debug script: query the similarity service with the first 10 titles and
# print where each paper's own id ranks in its result.
q = QuerierRpcClient()
# path = 'C:\zorba\storage\\vectorizer.joblib'
# vectorizer = load(path)
# encoder = USERpcClient()
encoder = InferRpcClient()
# NOTE(review): CassandraDatabase is not imported in this chunk — NameError
# unless imported elsewhere in the file; confirm. Also note the variable is
# named summary_repo but reads the 'title' repo.
summary_repo = CassandraDatabase(project_name='papers', repo_name= 'title', id_sql_type='BIGINT', content_sql_type="TEXT")
loc = 0
top3 = 0
top10 = 0
j = 0  # records inspected so far; stop after 10
for id, row in summary_repo.list():
    # Querier returns a stringified dict; parse it back.
    result = ast.literal_eval(q.query(json.dumps({"text":row, "count":203})))
    sims = result['result']      # ordered {candidate_id: similarity}
    inter = result['keywords']   # unused here
    # 0-based rank of this paper's own id in the result keys.
    index = np.where(np.array(list(sims.keys())) == id)[0][0]
    print(index, id)
    j+=1
    if j==10:
        break
def encode():
    """Encode every title and its matching summary with the Infer RPC
    encoder and persist the vectors (as stringified lists) into the
    'title_reviews' / 'summary_reviews' repos.
    """
    title_repo = CassandraDatabase(project_name='papers', repo_name='title',
                                   id_sql_type='BIGINT', content_sql_type="TEXT")
    summary_repo = CassandraDatabase(project_name='papers', repo_name='summary',
                                     id_sql_type='BIGINT', content_sql_type="TEXT")
    encoded_title_repo = CassandraDatabase(project_name='papers', repo_name='title_reviews',
                                           id_sql_type='BIGINT', content_sql_type="TEXT")
    encoded_summary_repo = CassandraDatabase(project_name='papers', repo_name='summary_reviews',
                                             id_sql_type='BIGINT', content_sql_type="TEXT")
    # encoder = USERpcClient()
    encoder = InferRpcClient()
    for progress, (doc_id, title_text) in enumerate(title_repo.list()):
        print(progress)  # progress indicator, 0-based like the original counter
        title_vec = str(encoder.encode(title_text)['encoded'])
        summary_vec = str(encoder.encode(summary_repo.read(doc_id)[0])['encoded'])
        encoded_title_repo.write(doc_id, title_vec)
        encoded_summary_repo.write(doc_id, summary_vec)