Example #1
0
def load_sentence():
    """Collect every 'sentence' annotation of every stored document and
    index the resulting records into Solr under the 'anno' collection."""
    docs = []
    for doc_id in datastore.get_all_ids(limit=-1):
        record = datastore.get(doc_id, ['content', 'meta_info'])
        content = record['content']
        meta = json.loads(record['meta_info'])
        sentences = datastore.get_annotation(doc_id, 'sentence')
        for idx, sent in enumerate(sentences):
            # Convert each annotation into the flat document structure
            # that the Solr schema expects.
            docs.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, 'sentence', idx),
                'doc_id_i': doc_id,
                'anno_id_i': idx,
                'name_s': 'sentence',
                'sentence_txt_ja': content[sent['begin']:sent['end']],
                'title_txt_ja': meta['title'],
                'url_s': meta['url'],
            })
    # Push the accumulated records to Solr in one call.
    indexer.load('anno', docs)
Example #2
0
def load_affiliation():
    """Index every 'affiliation' annotation into Solr, pairing each one
    with the sentence that contains it."""
    anno_name = 'affiliation'
    docs = []
    for doc_id in datastore.get_all_ids(limit=-1):
        record = datastore.get(doc_id, ['content', 'meta_info'])
        content = record['content']
        meta = json.loads(record['meta_info'])
        sentences = datastore.get_annotation(doc_id, 'sentence')
        for idx, anno in enumerate(datastore.get_annotation(doc_id, anno_name)):
            # Locate the enclosing sentence, then convert the annotation
            # into the flat document structure the Solr schema expects.
            sent = find_x_including_y(sentences, anno)
            docs.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, idx),
                'doc_id_i': doc_id,
                'anno_id_i': idx,
                'name_s': anno_name,
                'sentence_txt_ja': content[sent['begin']:sent['end']],
                anno_name + '_txt_ja': content[anno['begin']:anno['end']],
                'title_txt_ja': meta['title'],
                'url_s': meta['url'],
            })
    # Push the accumulated records to Solr in one call.
    indexer.load('anno', docs)
Example #3
0
import sqlitedatastore as datastore

if __name__ == '__main__':
    # Dump id, raw metadata, and the first 100 characters of the content
    # of every document held in the datastore.
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=-1):
        record = datastore.get(doc_id, ['id', 'content', 'meta_info'])
        print(record['id'], record['meta_info'], record['content'][:100])
    datastore.close()
Example #4
0
import glob
import json
import urllib.request
import scraping
import sqlitedatastore as datastore

if __name__ == '__main__':
    # Explicit submodule import: the quote() call below needs urllib.parse,
    # which was previously reachable only as a side effect of
    # `import urllib.request`.
    import urllib.parse

    datastore.connect()
    values = []
    # Scrape every saved Wikipedia HTML page and stage (content, metadata)
    # pairs for bulk insertion into the datastore.
    for filename in glob.glob('./data/wikipedia/*.html'):
        with open(filename) as fin:
            html = fin.read()
            text, title = scraping.scrape(html)
            print('scraped:', title)
            # BUG FIX: the original template '...wiki/{{}}' rendered a
            # literal '{}' and discarded the quoted title, so every stored
            # URL was 'https://ja.wikipedia.org/wiki/{}'. A single-brace
            # placeholder inserts the percent-encoded title as intended.
            url = 'https://ja.wikipedia.org/wiki/{}'.format(
                urllib.parse.quote(title))
            values.append((text, json.dumps({'url': url, 'title': title})))
    datastore.load(values)

    print(list(datastore.get_all_ids(limit=-1)))
    datastore.close()
Example #5
0
import statistics

import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    # NOTE(review): `statistics` is the project-local module providing
    # create_language_model, not the stdlib `statistics` module — confirm.
    lm = statistics.create_language_model(datastore.get_all_ids(limit=-1), N=3)
    context = ('古く', 'から')
    print(context, '->')

    # Score every word the model has seen after this context and rank the
    # candidates from most to least probable.
    candidates = sorted(
        ((word, lm.score(word, context))
         for word in lm.context_counts(lm.vocab.lookup(context))),
        key=lambda pair: pair[1],
        reverse=True)
    for word, prob in candidates:
        print('\t{:s}: {:f}'.format(word, prob))
    datastore.close()
Example #6
0
import json
import urllib

import wiki_source
import sqlitedatastore

if __name__ == '__main__':
    # BUG FIX: the file only does `import urllib`, which does not import
    # the urllib.parse submodule; quote() below could then fail with
    # AttributeError. Import it explicitly here.
    import urllib.parse

    sqlitedatastore.connect()
    values = []
    # Mine the Wikipedia article of every nation listed in the input file
    # and stage (content, metadata) pairs for bulk insertion.
    with open('./data/nation_list.txt') as f:
        for nation_name in f:
            name = nation_name.rstrip('\n')
            print(name)
            title, text = wiki_source.wiki_mining(name)
            # BUG FIX: quote the stripped name. The original quoted the raw
            # line including its trailing newline, appending '%0A' to every
            # stored URL.
            nation_url = 'https://ja.wikipedia.org/wiki/' + urllib.parse.quote(
                name)
            values.append((text, json.dumps({
                'url': nation_url,
                'title': title
            })))
    sqlitedatastore.load(values)

    print(list(sqlitedatastore.get_all_ids(limit=-1)))

    # Read everything back to verify the load: id, parsed metadata, and the
    # first 100 characters of each document.
    for doc_id in sqlitedatastore.get_all_ids(limit=-1):
        print(doc_id)
        row = sqlitedatastore.get(doc_id, ['id', 'content', 'meta_info'])
        print(row['id'], json.loads(row['meta_info']), row['content'][:100])
    sqlitedatastore.close()