Example no. 1
import ast
import json
import random

import numpy as np

from services.cassandra_ import CassandraDatabase

# `q` is assumed to be a similarity-query RPC client defined elsewhere in the project.


def random_sent_summary():
    meta_repo = CassandraDatabase(project_name='papers',
                                  repo_name='meta',
                                  id_sql_type='BIGINT',
                                  content_sql_type="TEXT")
    encoded_sents_repo = CassandraDatabase(project_name='papers',
                                           repo_name='sents',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
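    # Accumulators: summed rank of the true paper (for the mean) plus top-3 / top-10 hit counts.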
    loc = 0
    top3 = 0
    top10 = 0
    for id, row in meta_repo.list():
        meta = ast.literal_eval(row.replace('nan', '\'\''))
        ids = meta['children']
        random_sent_id = random.choice(ids)
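        # Query the similarity service with the random child sentence id and parse its reply.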
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": random_sent_id,
                "count": 205
            })))
        sims = result['result']
        inter = result['keywords']
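        # 0-based rank of the ground-truth paper id among the returned similarity keys.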
        index = np.where(np.array(list(sims.keys())) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index

    print('{:.2f} top3 {} top 10 {}'.format(loc / meta_repo.count(), top3,
                                            top10))
    # total count 203. USE: [9.46, 3: 135, 10: 159], upvotes: [9.68, 3: 150, 10: 165],
Example no. 2
import ast
import json

import numpy as np

from services.cassandra_ import CassandraDatabase

# As above, `q` is assumed to be a similarity-query RPC client defined elsewhere.


def title_summary():
    repo = CassandraDatabase(project_name='papers',
                             repo_name='title',
                             id_sql_type='BIGINT',
                             content_sql_type="TEXT")
    loc = 0
    top3 = 0
    top10 = 0
    for id, row in repo.list():
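        # Query the similarity service with the paper title; ideally the paper ranks itself first.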
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": row,
                "count": 203
            })))
        sims = result['result']
        inter = result['keywords']
        index = np.where(np.array(list(sims.keys())) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index

    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))
    # total count 203. USE: [13.62, 3: 106, 10: 148], upvotes: [20.56, 3: 83, 10: 124],
Example no. 3
from services.cassandra_ import CassandraDatabase

# InferRpcClient (and the commented-out USERpcClient) are assumed to be
# project-local RPC clients exposing an encode() method.


def encode():
    title_repo = CassandraDatabase(project_name='papers',
                                   repo_name='title',
                                   id_sql_type='BIGINT',
                                   content_sql_type="TEXT")
    summary_repo = CassandraDatabase(project_name='papers',
                                     repo_name='summary',
                                     id_sql_type='BIGINT',
                                     content_sql_type="TEXT")

    encoded_title_repo = CassandraDatabase(project_name='papers',
                                           repo_name='title_reviews',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    encoded_summary_repo = CassandraDatabase(project_name='papers',
                                             repo_name='summary_reviews',
                                             id_sql_type='BIGINT',
                                             content_sql_type="TEXT")
    # encoder = USERpcClient()
    encoder = InferRpcClient()
    # path = r'C:\zorba\storage\vectorizer.joblib'
    # vectorizer = load(path)
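    # Encode every title and its matching summary with the RPC encoder and persist both vectors.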
    i = 0
    for id, row in title_repo.list():
        print(i)
        i += 1
        title_vec = str(encoder.encode(row)['encoded'])
        summary_vec = str(encoder.encode(summary_repo.read(id)[0])['encoded'])
        # title_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        # summary_vec = str(vectorizer.transform([summary_repo.read(id)[0]]).toarray()[0].tolist())
        encoded_title_repo.write(id, title_vec)
        encoded_summary_repo.write(id, summary_vec)
Example no. 4
import ast

import numpy as np

from services.cassandra_ import CassandraDatabase


def get_docs(repo_name):
    docs = []
    ids = []
    vecs_repo = CassandraDatabase(project_name='papers',
                                  repo_name=repo_name,
                                  id_sql_type='BIGINT',
                                  content_sql_type="TEXT")
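    # Parse each stored vector literal back into a list, keeping ids aligned with vectors.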
    for id, row in vecs_repo.list():
        docs.append(ast.literal_eval(row))
        ids.append(id)
    return np.array(ids), np.array(docs)
Example no. 5
import ast
import json

from services.cassandra_ import CassandraDatabase

# Again, `q` is assumed to be a similarity-query RPC client defined elsewhere.


def title_sents():
    repo = CassandraDatabase(project_name='papers',
                             repo_name='title',
                             id_sql_type='BIGINT',
                             content_sql_type="TEXT")
    sent_sum_map_repo = CassandraDatabase(project_name='papers',
                                          repo_name='sent_sum_map',
                                          id_sql_type='BIGINT',
                                          content_sql_type="TEXT")
    loc = 0
    top3 = 0
    top10 = 0
    k = 0
    for id, row in repo.list():
        k += 1
        print(k)
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": row,
                "count": 203
            })))
        sims = result['result']
        inter = result['keywords']

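        # Map each ranked sentence id back to its paper; the rank is the position of the
        # ground-truth paper among the distinct papers seen so far. Note this rank is
        # 1-based, unlike the 0-based np.where rank used in the other evaluations.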
        papers_ids = []
        for sent_id in list(sims.keys()):
            paper_id = int(sent_sum_map_repo.read(sent_id)[0])
            if paper_id not in papers_ids:
                papers_ids.append(paper_id)
            if paper_id == id:
                index = len(papers_ids)
                break
        # papers_ids = np.array(papers_ids)
        # index = np.where(np.array(papers_ids) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index

    print('{:.2f} top3 {} top 10 {}'.format(loc / repo.count(), top3, top10))
    # total count 203. USE: [11.40, 3: 11, 10: 143], upvotes: [17.77, 3: 107, 10: 141],
Example no. 6
from joblib import load

from services.cassandra_ import CassandraDatabase


def encode_sents():
    sents_repo = CassandraDatabase(project_name='papers',
                                   repo_name='sentences',
                                   id_sql_type='BIGINT',
                                   content_sql_type="TEXT")
    encoded_sents_repo = CassandraDatabase(project_name='papers',
                                           repo_name='sents_count_vec',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    # encoder = InferRpcClient()
    # encoder = USERpcClient()
    path = r'C:\zorba\storage\vectorizer.joblib'
    vectorizer = load(path)
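    # Transform every sentence with the persisted CountVectorizer and store the dense vector.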
    i = 0
    for id, row in sents_repo.list():
        print(i)
        i += 1
        # sent_vec = str(encoder.encode(row)['encoded'])
        sent_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        encoded_sents_repo.write(id, sent_vec)
Example no. 7
from services.cassandra_ import CassandraDatabase
import ast
import random

meta_repo = CassandraDatabase(project_name='papers',
                              repo_name='meta',
                              id_sql_type='BIGINT',
                              content_sql_type="TEXT")
sent_sum_map_repo = CassandraDatabase(project_name='papers',
                                      repo_name='sent_sum_map',
                                      id_sql_type='BIGINT',
                                      content_sql_type="TEXT")
encoded_sents_repo = CassandraDatabase(project_name='papers',
                                       repo_name='sents_count_vec',
                                       id_sql_type='BIGINT',
                                       content_sql_type="TEXT")

for id, row in meta_repo.list():
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    ids = meta['children']
    print(id, ids, random.choice(ids))
    print(encoded_sents_repo.read(random.choice(ids))[0])
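    # NOTE: this break exits after inspecting the first record, so the mapping
    # loop below never runs; remove it to actually populate sent_sum_map_repo.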
    break
    for child_id in ids:
        sent_sum_map_repo.write(child_id, str(id))
Example no. 8
from joblib import dump, load
from services.cassandra_ import CassandraDatabase
import time
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

title_repo = CassandraDatabase(project_name='papers',
                               repo_name='title',
                               id_sql_type='BIGINT',
                               content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers',
                                 repo_name='summary',
                                 id_sql_type='BIGINT',
                                 content_sql_type="TEXT")

stop_words = ENGLISH_STOP_WORDS
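# Build a corpus of every summary plus its paper's title, then fit the vocabulary on it.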
corpus = []
for id, row in summary_repo.list():
    corpus.append(row)
    corpus.append(title_repo.read(id)[0])

path = r'C:\zorba\storage\vectorizer.joblib'

vectorizer = CountVectorizer(stop_words=stop_words)
t1 = time.time()
vectorizer.fit(corpus)
print(time.time() - t1)
print(len(vectorizer.get_feature_names_out()))

dump(vectorizer, path)
# vectorizer = load(path)