Example #1
def title_summary():
    repo = CassandraDatabase(project_name='papers',
                             repo_name='title',
                             id_sql_type='BIGINT',
                             content_sql_type="TEXT")
    loc = 0
    top3 = 0
    top10 = 0
    for id, row in repo.list():
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": row,
                "count": 203
            })))
        sims = result['result']
        inter = result['keywords']
        index = np.where(np.array(list(sims.keys())) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index

    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))
    '''total count 203. USE: [13.62, 3: 106, 10: 148], upvotes: [20.56, 3: 83, 10: 124],'''
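All of these examples go through the same four repository calls. A minimal in-memory stand-in for the interface they assume (hypothetical sketch; the real services.cassandra_.CassandraDatabase is backed by Cassandra):

# Hypothetical stand-in matching only the calls used in these examples.
class FakeRepo:
    def __init__(self, project_name, repo_name, id_sql_type='BIGINT',
                 content_sql_type='TEXT'):
        self.name = f'{project_name}.{repo_name}'
        self._rows = {}

    def write(self, id_, content):
        self._rows[id_] = content    # upsert one (id, content) row

    def read(self, id_):
        return [self._rows[id_]]     # the examples index the result with [0]

    def list(self):
        return self._rows.items()    # yields (id, content) pairs

    def count(self):
        return len(self._rows)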
Example #2
def random_sent_summary():
    meta_repo = CassandraDatabase(project_name='papers',
                                  repo_name='meta',
                                  id_sql_type='BIGINT',
                                  content_sql_type="TEXT")
    encoded_sents_repo = CassandraDatabase(project_name='papers',
                                           repo_name='sents',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    loc = 0
    top3 = 0
    top10 = 0
    for id, row in meta_repo.list():
        meta = ast.literal_eval(row.replace('nan', '\'\''))
        ids = meta['children']
        random_sent_id = random.choice(ids)
        # look up the chosen sentence's stored content and query with it
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": encoded_sents_repo.read(random_sent_id)[0],
                "count": 205
            })))
        sims = result['result']
        inter = result['keywords']
        index = np.where(np.array(list(sims.keys())) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index

    print('{:.2f} top3 {} top 10 {}'.format(loc / meta_repo.count(), top3,
                                            top10))
    '''total count 203. USE: [9.46, 3: 135, 10: 159], upvotes: [9.68, 3: 150, 10: 165],'''
Example #3
def get_docs(repo_name):
    docs = []
    ids = []
    vecs_repo = CassandraDatabase(project_name='papers', repo_name=repo_name,
                                  id_sql_type='BIGINT', content_sql_type="TEXT")
    for id, row in vecs_repo.list():
        docs.append(ast.literal_eval(row))
        ids.append(id)
    return np.array(ids), np.array(docs)
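A usage sketch for get_docs (assumes the 'summary_USE' vector repo from the repo list in Example #9 exists and stores vectors as list literals):

# Illustrative only: rank all documents against the first one by cosine
# similarity and keep the ids of the ten nearest.
from sklearn.metrics.pairwise import cosine_similarity

ids, docs = get_docs('summary_USE')
sims = cosine_similarity(docs[:1], docs)[0]
nearest = ids[sims.argsort()[::-1][:10]]
print(nearest)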
def split_into_sents():
    # summary_repo and meta_repo are the module-level repos constructed in
    # the fuller listing further below
    count = 0
    sents_repo = CassandraDatabase(project_name='papers', repo_name='sentences',
                                   id_sql_type='BIGINT', content_sql_type="TEXT")
    for id, text in summary_repo.list():
        row = meta_repo.read(id)[0]
        meta = ast.literal_eval(row.replace('nan', '\'\''))
        meta['children'] = []
        sent_text = nltk.sent_tokenize(text)
        for sentence in sent_text:
            sents_repo.write(count, sentence)
            meta['children'].append(count)
            count += 1
        meta_repo.write(id, str(meta))
Example #5
def title_sents():
    repo = CassandraDatabase(project_name='papers',
                             repo_name='title',
                             id_sql_type='BIGINT',
                             content_sql_type="TEXT")
    sent_sum_map_repo = CassandraDatabase(project_name='papers',
                                          repo_name='sent_sum_map',
                                          id_sql_type='BIGINT',
                                          content_sql_type="TEXT")
    loc = 0
    top3 = 0
    top10 = 0
    k = 0
    for id, row in repo.list():
        k += 1
        print(k)
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": row,
                "count": 203
            })))
        sims = result['result']
        inter = result['keywords']

        papers_ids = []
        for sent_id in list(sims.keys()):
            paper_id = int(sent_sum_map_repo.read(sent_id)[0])
            if paper_id not in papers_ids:
                papers_ids.append(paper_id)
            if paper_id == id:
                # 0-based rank, matching the np.where index used by the
                # other evaluation functions
                index = len(papers_ids) - 1
                break
        # papers_ids = np.array(papers_ids)
        # index = np.where(np.array(papers_ids) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index

    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))
    '''total count 203. USE: [11.40, 3: 11, 10: 143], upvotes: [17.77, 3: 107, 10: 141],'''
def encode_sents():
    sents_repo = CassandraDatabase(project_name='papers',
                                   repo_name='sentences',
                                   id_sql_type='BIGINT',
                                   content_sql_type="TEXT")
    encoded_sents_repo = CassandraDatabase(project_name='papers',
                                           repo_name='sents_count_vec',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    # encoder = InferRpcClient()
    # encoder = USERpcClient()
    path = r'C:\zorba\storage\vectorizer.joblib'
    vectorizer = load(path)
    i = 0
    for id, row in sents_repo.list():
        print(i)
        i += 1
        # sent_vec = str(encoder.encode(row)['encoded'])
        sent_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        encoded_sents_repo.write(id, sent_vec)
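Since encode_sents persists each vector as str(list), reading one back needs an ast.literal_eval round-trip; a small sketch (assumes sentence id 0 exists in sents_count_vec):

# Decode a stored vector before any numeric work (illustrative sketch).
repo = CassandraDatabase(project_name='papers', repo_name='sents_count_vec',
                         id_sql_type='BIGINT', content_sql_type="TEXT")
vec = ast.literal_eval(repo.read(0)[0])
print(len(vec))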
Example #7
from services.cassandra_ import CassandraDatabase
import ast
import random

meta_repo = CassandraDatabase(project_name='papers',
                              repo_name='meta',
                              id_sql_type='BIGINT',
                              content_sql_type="TEXT")
sent_sum_map_repo = CassandraDatabase(project_name='papers',
                                      repo_name='sent_sum_map',
                                      id_sql_type='BIGINT',
                                      content_sql_type="TEXT")
encoded_sents_repo = CassandraDatabase(project_name='papers',
                                       repo_name='sents_count_vec',
                                       id_sql_type='BIGINT',
                                       content_sql_type="TEXT")

for id, row in meta_repo.list():
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    ids = meta['children']
    print(id, ids, random.choice(ids))
    print(encoded_sents_repo.read(random.choice(ids))[0])
    break  # debug stop after one record; remove to run the mapping loop below
    for child_id in ids:
        sent_sum_map_repo.write(child_id, str(id))
from services.keywords_RPC import KeywordExtractorRpcClient
from services.cassandra_ import CassandraDatabase
import ast
import spacy

nlp = spacy.load('en_core_web_sm')
meta_repo = CassandraDatabase(project_name='papers',
                              repo_name='meta',
                              id_sql_type='BIGINT',
                              content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers',
                                 repo_name='summary',
                                 id_sql_type='BIGINT',
                                 content_sql_type="TEXT")

extractor = KeywordExtractorRpcClient()
i = 0
for id, row in meta_repo.list():
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    keywords = meta['keywords']
    text = summary_repo.read(id)[0]
    if len(keywords) != 1:
        keywords += list(extractor.extract(text)['keywords'])
    else:
        keywords = list(extractor.extract(text)['keywords'])

    keywords_lemmas = [token.lemma_ for token in nlp(' '.join(keywords))]
    meta['keywords_lemmas'] = list(set(keywords_lemmas))
    meta['keywords'] = list(set(keywords))
    meta_repo.write(id, str(meta))
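The lemma pass is what lets the set() dedup above catch inflected variants of the same keyword; a quick illustration with the same spacy model (the input string is made up):

# Lemmatisation collapses 'networks'/'network' and 'embeddings'/'embedding'.
doc = nlp('networks network embeddings embedding')
print(sorted({token.lemma_ for token in doc}))  # ['embedding', 'network']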
Example #9
import os
import ast
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from services.cassandra_ import CassandraDatabase
from services.encoders.USE_RPC import USERpcClient
from services.encoders.infer_RPC import InferRpcClient
from joblib import dump, load
import spacy
from collections import defaultdict

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'


'''vectors repos [title_, summary_], [USE, Infer, Gru5, upvotes, jokes, reviews,
 upv_score256, upvotes256, reviews256, jokes256, gru256, count_vec] '''

nlp = spacy.load('en_core_web_sm')
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")
repo_name = 'summary_USE'
encoder = USERpcClient()
# encoder = InferRpcClient()
# path = r'C:\zorba\storage\vectorizer.joblib'
# vectorizer = load(path)
def encode(text):
    vector = str(encoder.encode(text)['encoded'])
    # vector = str(vectorizer.transform([text]).toarray()[0].tolist())
    return vector
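A sketch of comparing two encoded texts with the imports already in this file (the sample strings are hypothetical; assumes the USE RPC service is reachable):

# encode() returns a stringified vector, hence the ast.literal_eval round-trip.
v1 = ast.literal_eval(encode('graph neural networks'))
v2 = ast.literal_eval(encode('convolutional networks for images'))
print(cosine_similarity([v1], [v2])[0][0])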



def get_docs(repo_name):
    docs = []
    ids = []
    vecs_repo = CassandraDatabase(project_name='papers', repo_name=repo_name,
                                  id_sql_type='BIGINT', content_sql_type="TEXT")
    for id, row in vecs_repo.list():
        docs.append(ast.literal_eval(row))
        ids.append(id)
    return np.array(ids), np.array(docs)
import pandas as pd
from dateutil import parser
import nltk
import ast
from services.cassandra_ import CassandraDatabase

title_repo = CassandraDatabase(project_name='papers', repo_name='title', id_sql_type='BIGINT', content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers', repo_name='summary', id_sql_type='BIGINT',
                                 content_sql_type="TEXT")
meta_repo = CassandraDatabase(project_name='papers', repo_name='meta', id_sql_type='BIGINT', content_sql_type="TEXT")

def split_into_sents():
    count = 0
    sents_repo = CassandraDatabase(project_name='papers', repo_name='sentences', id_sql_type='BIGINT', content_sql_type="TEXT")
    for id, text in summary_repo.list():
        row = meta_repo.read(id)[0]
        meta = ast.literal_eval(row.replace('nan', '\'\''))
        meta['children'] = []
        sent_text = nltk.sent_tokenize(text)
        for sentence in sent_text:
            sents_repo.write(count, sentence)
            meta['children'].append(count)
            count += 1
        meta_repo.write(id, str(meta))

def parse_record(record):
    title = record['paper'].lower().replace(u'\xa0', u' ').strip()
    summary = record['summary'].lower().replace(u'\xa0', u' ').strip()
    try:
        date = parser.parse(record['publish date'].replace(u'\xa0', u' ').strip())
    except:
from sklearn.feature_extraction.text import CountVectorizer
from joblib import dump, load
from services.cassandra_ import CassandraDatabase
import time
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

title_repo = CassandraDatabase(project_name='papers',
                               repo_name='title',
                               id_sql_type='BIGINT',
                               content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers',
                                 repo_name='summary',
                                 id_sql_type='BIGINT',
                                 content_sql_type="TEXT")

stop_words = ENGLISH_STOP_WORDS
corpus = []
for id, row in summary_repo.list():
    corpus.append(row)
    corpus.append(title_repo.read(id)[0])

path = r'C:\zorba\storage\vectorizer.joblib'

vectorizer = CountVectorizer(stop_words=stop_words)
t1 = time.time()
x = vectorizer.fit(corpus)
print(time.time() - t1)
print(len(vectorizer.get_feature_names_out()))

dump(vectorizer, path)
# vectorizer = load(path)
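A later session can reload the fitted vectorizer instead of refitting it (sketch; the query string is hypothetical):

vectorizer = load(path)
# transform() yields a sparse count matrix; take the first row as a dense vector
query_vec = vectorizer.transform(['attention is all you need']).toarray()[0]
print(int(query_vec.sum()), 'token occurrences counted for the query')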
from services.querier_RPC import QuerierRpcClient
from services.cassandra_ import CassandraDatabase
import numpy as np
from services.encoders.USE_RPC import USERpcClient
from services.encoders.infer_RPC import InferRpcClient
from joblib import dump, load
import json
import ast


q = QuerierRpcClient()
# path = r'C:\zorba\storage\vectorizer.joblib'
# vectorizer = load(path)
# encoder = USERpcClient()
encoder = InferRpcClient()

title_repo = CassandraDatabase(project_name='papers', repo_name='title',
                               id_sql_type='BIGINT', content_sql_type="TEXT")
loc = 0
top3 = 0
top10 = 0
j = 0
for id, row in title_repo.list():
    result = ast.literal_eval(q.query(json.dumps({"text":row, "count":203})))
    sims = result['result']
    inter = result['keywords']
    index = np.where(np.array(list(sims.keys())) == id)[0][0]
    print(index, id)
    j += 1
    if j == 10:
        break

def encode():
    title_repo = CassandraDatabase(project_name='papers',
                                   repo_name='title',
                                   id_sql_type='BIGINT',
                                   content_sql_type="TEXT")
    summary_repo = CassandraDatabase(project_name='papers',
                                     repo_name='summary',
                                     id_sql_type='BIGINT',
                                     content_sql_type="TEXT")

    encoded_title_repo = CassandraDatabase(project_name='papers',
                                           repo_name='title_reviews',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    encoded_summary_repo = CassandraDatabase(project_name='papers',
                                             repo_name='summary_reviews',
                                             id_sql_type='BIGINT',
                                             content_sql_type="TEXT")
    # encoder = USERpcClient()
    encoder = InferRpcClient()
    # path = r'C:\zorba\storage\vectorizer.joblib'
    # vectorizer = load(path)
    i = 0
    for id, row in title_repo.list():
        print(i)
        i += 1
        title_vec = str(encoder.encode(row)['encoded'])
        summary_vec = str(encoder.encode(summary_repo.read(id)[0])['encoded'])
        # title_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        # summary_vec = str(vectorizer.transform([summary_repo.read(id)[0]]).toarray()[0].tolist())
        encoded_title_repo.write(id, title_vec)
        encoded_summary_repo.write(id, summary_vec)