def encode():
    # Source repos holding the raw title and summary texts.
    title_repo = CassandraDatabase(project_name='papers',
                                   repo_name='title',
                                   id_sql_type='BIGINT',
                                   content_sql_type="TEXT")
    summary_repo = CassandraDatabase(project_name='papers',
                                     repo_name='summary',
                                     id_sql_type='BIGINT',
                                     content_sql_type="TEXT")

    # Target repos for the encoded vectors.
    encoded_title_repo = CassandraDatabase(project_name='papers',
                                           repo_name='title_reviews',
                                           id_sql_type='BIGINT',
                                           content_sql_type="TEXT")
    encoded_summary_repo = CassandraDatabase(project_name='papers',
                                             repo_name='summary_reviews',
                                             id_sql_type='BIGINT',
                                             content_sql_type="TEXT")
    # The encoder RPC clients are assumed to come from the project's services;
    # USERpcClient is an alternative encoder kept for reference.
    # encoder = USERpcClient()
    encoder = InferRpcClient()
    # path = 'C:\zorba\storage\\vectorizer.joblib'
    # vectorizer = load(path)
    i = 0
    for id, row in title_repo.list():
        # Progress counter.
        print(i)
        i += 1
        # Encode the title and the matching summary via the RPC encoder and keep
        # the stringified vectors.
        title_vec = str(encoder.encode(row)['encoded'])
        summary_vec = str(encoder.encode(summary_repo.read(id)[0])['encoded'])
        # title_vec = str(vectorizer.transform([row]).toarray()[0].tolist())
        # summary_vec = str(vectorizer.transform([summary_repo.read(id)[0]]).toarray()[0].tolist())
        encoded_title_repo.write(id, title_vec)
        encoded_summary_repo.write(id, summary_vec)
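
All of these snippets go through CassandraDatabase from services.cassandra_, whose implementation is not shown here. The sketch below only restates the interface the examples rely on; it is a minimal sketch inferred from the calls used above, not the project's actual class.

# Minimal sketch of the CassandraDatabase interface assumed throughout these
# examples; inferred from the calls above (list, read, write, count). The real
# class in services.cassandra_ may differ.
from typing import Iterator, List, Tuple


class CassandraDatabase:
    def __init__(self, project_name: str, repo_name: str,
                 id_sql_type: str, content_sql_type: str) -> None:
        ...

    def list(self) -> Iterator[Tuple[int, str]]:
        """Yield (id, content) pairs for every row in the repo."""
        ...

    def read(self, id: int) -> List[str]:
        """Return the stored content for an id; callers use read(id)[0]."""
        ...

    def write(self, id: int, content: str) -> None:
        """Insert or overwrite the content stored under an id."""
        ...

    def count(self) -> int:
        """Return the number of rows in the repo."""
        ...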
Example #2
def title_sents():
    # Evaluate retrieval quality: for each title, query the sentence index and
    # record the rank at which the paper itself comes back.
    repo = CassandraDatabase(project_name='papers',
                             repo_name='title',
                             id_sql_type='BIGINT',
                             content_sql_type="TEXT")
    sent_sum_map_repo = CassandraDatabase(project_name='papers',
                                          repo_name='sent_sum_map',
                                          id_sql_type='BIGINT',
                                          content_sql_type="TEXT")
    loc = 0    # accumulated rank, used for the mean rank printed at the end
    top3 = 0   # queries whose own paper lands in the top 3
    top10 = 0  # queries whose own paper lands in the top 10
    k = 0      # progress counter
    for id, row in repo.list():
        k += 1
        print(k)
        # q is the query-service client (assumed to be set up elsewhere); it takes
        # a JSON payload and returns a Python-literal string with the results.
        result = ast.literal_eval(
            q.query(json.dumps({
                "text": row,
                "count": 203
            })))
        sims = result['result']       # sentence id -> similarity score
        inter = result['keywords']    # keywords returned for the query

        papers_ids = []
        for sent_id in list(sims.keys()):
            # Map each returned sentence back to its parent paper.
            paper_id = int(sent_sum_map_repo.read(sent_id)[0])
            if paper_id not in papers_ids:
                papers_ids.append(paper_id)
            if paper_id == id:
                # 0-based rank of the correct paper, matching the NumPy variant
                # commented out below.
                index = len(papers_ids) - 1
                break
        # papers_ids = np.array(papers_ids)
        # index = np.where(np.array(papers_ids) == id)[0][0]
        if index < 3:
            top3 += 1
        if index < 10:
            top10 += 1
        loc += index

    print('{:.2f} top3 {} top10 {}'.format(loc / repo.count(), top3, top10))
    '''total count 203. USE: [11.40, 3: 11, 10: 143], upvotes: [17.77, 3: 107, 10: 141],
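
The q client used above is not defined in this snippet; the code only assumes that q.query takes a JSON string with text and count fields and returns a Python-literal string containing result and keywords. A hypothetical round trip, with made-up ids and scores:

# Hypothetical request/response for the query client used in title_sents().
# Only the field names are taken from the code above; ids and scores are made up.
import ast
import json

request = json.dumps({"text": "graph neural networks for ranking", "count": 203})
# q.query(request) is assumed to return a string such as:
response = "{'result': {101: 0.93, 257: 0.88}, 'keywords': ['graph', 'ranking']}"

parsed = ast.literal_eval(response)
sims = parsed['result']        # sentence id -> similarity score
keywords = parsed['keywords']  # keywords returned for the query text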
Example #3
from services.cassandra_ import CassandraDatabase
import ast
import random

meta_repo = CassandraDatabase(project_name='papers',
                              repo_name='meta',
                              id_sql_type='BIGINT',
                              content_sql_type="TEXT")
sent_sum_map_repo = CassandraDatabase(project_name='papers',
                                      repo_name='sent_sum_map',
                                      id_sql_type='BIGINT',
                                      content_sql_type="TEXT")
encoded_sents_repo = CassandraDatabase(project_name='papers',
                                       repo_name='sents_count_vec',
                                       id_sql_type='BIGINT',
                                       content_sql_type="TEXT")

for id, row in meta_repo.list():
    # Meta rows are Python-literal dict strings; bare nan values are blanked out first.
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    ids = meta['children']
    # Debug-only inspection of a random child sentence:
    # print(id, ids, random.choice(ids))
    # print(encoded_sents_repo.read(random.choice(ids))[0])
    # break
    for child_id in ids:
        # Map each child sentence id back to its parent paper id.
        sent_sum_map_repo.write(child_id, str(id))
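
The meta rows used above and in the next script are stored as Python-literal dict strings; the loops only rely on the children and keywords fields. A made-up row in that shape, to illustrate the nan handling:

# Made-up meta row in the shape the scripts above rely on; real rows hold more
# metadata, and missing values appear as a bare nan (hence the replace below).
import ast

example_row = "{'children': [101, 102, 103], 'keywords': ['ranking'], 'journal': nan}"
meta = ast.literal_eval(example_row.replace('nan', "''"))

print(meta['children'])   # [101, 102, 103] -> child ids mapped back to this paper
print(meta['keywords'])   # ['ranking']
print(meta['journal'])    # ''  (the nan placeholder became an empty string)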
Example #4
from services.cassandra_ import CassandraDatabase
import ast
import spacy

nlp = spacy.load('en_core_web_sm')
meta_repo = CassandraDatabase(project_name='papers',
                              repo_name='meta',
                              id_sql_type='BIGINT',
                              content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers',
                                 repo_name='summary',
                                 id_sql_type='BIGINT',
                                 content_sql_type="TEXT")

# KeywordExtractorRpcClient is assumed to be provided by the project's RPC services.
extractor = KeywordExtractorRpcClient()
for id, row in meta_repo.list():
    meta = ast.literal_eval(row.replace('nan', '\'\''))
    keywords = meta['keywords']
    text = summary_repo.read(id)[0]
    # Extend the stored keywords with extracted ones; if only a single keyword
    # is stored, use the extracted keywords alone.
    if len(keywords) != 1:
        keywords += list(extractor.extract(text)['keywords'])
    else:
        keywords = list(extractor.extract(text)['keywords'])

    # Lemmatise the keywords with spaCy and store the deduplicated keywords and
    # lemmas back in the meta row.
    keywords_lemmas = [token.lemma_ for token in nlp(' '.join(keywords))]
    meta['keywords_lemmas'] = list(set(keywords_lemmas))
    meta['keywords'] = list(set(keywords))
    meta_repo.write(id, str(meta))
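
The lemma pass above simply runs the joined keywords through spaCy and keeps token.lemma_; a tiny standalone illustration with a made-up keyword list:

# Standalone illustration of the keyword lemmatisation step above; the keyword
# list is made up.
import spacy

nlp = spacy.load('en_core_web_sm')
keywords = ['ranking models', 'embeddings']
lemmas = [token.lemma_ for token in nlp(' '.join(keywords))]
print(sorted(set(lemmas)))  # deduplicated lemma forms of the keywords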
Example #5
from joblib import dump, load
from services.cassandra_ import CassandraDatabase
import time
from sklearn.feature_extraction.text import CountVectorizer, ENGLISH_STOP_WORDS

title_repo = CassandraDatabase(project_name='papers',
                               repo_name='title',
                               id_sql_type='BIGINT',
                               content_sql_type="TEXT")
summary_repo = CassandraDatabase(project_name='papers',
                                 repo_name='summary',
                                 id_sql_type='BIGINT',
                                 content_sql_type="TEXT")

stop_words = ENGLISH_STOP_WORDS
# Build a corpus containing every summary and its title.
corpus = []
for id, row in summary_repo.list():
    corpus.append(row)
    corpus.append(title_repo.read(id)[0])

path = r'C:\zorba\storage\vectorizer.joblib'

vectorizer = CountVectorizer(stop_words=stop_words)
t1 = time.time()
vectorizer.fit(corpus)
print(time.time() - t1)                          # fitting time in seconds
print(len(vectorizer.get_feature_names_out()))   # vocabulary size

dump(vectorizer, path)  # persist the fitted vectorizer for later reuse
# vectorizer = load(path)
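
The commented-out lines in encode() show how the dumped vectorizer is meant to be reused; a minimal round trip under that assumption (the sample text is a placeholder):

# Minimal reuse of the persisted vectorizer, mirroring the commented-out
# transform calls in encode(); the sample text is a placeholder.
from joblib import load

vectorizer = load(r'C:\zorba\storage\vectorizer.joblib')
vec = vectorizer.transform(['Deep learning for paper retrieval']).toarray()[0].tolist()
print(len(vec))   # vocabulary size, same as len(vectorizer.get_feature_names_out())
str(vec)          # the stringified vector that encode() writes to Cassandra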