import json

import solrindexer as indexer  # assumed name of the Solr indexing helper module
import sqlitedatastore as datastore


def load_sentence():
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        for i, sent in enumerate(
                datastore.get_annotation(doc_id, 'sentence')):
            # Convert each sentence to the document structure Solr expects
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, 'sentence', i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': 'sentence',
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Register the documents with Solr
    indexer.load('anno', data)
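# The solrindexer module called above is not shown in this listing. Below is
# a minimal sketch of its load() function, assuming a standalone local Solr
# instance and the pysolr client; the URL and core layout are assumptions,
# not the original implementation.
import pysolr


def load(core_name, data):
    # Send the documents to the named Solr core and commit immediately
    solr = pysolr.Solr(
        'http://localhost:8983/solr/{}'.format(core_name),
        always_commit=True)
    solr.add(data)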
def load_affiliation():
    anno_name = 'affiliation'
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sents = datastore.get_annotation(doc_id, 'sentence')
        for i, anno in enumerate(
                datastore.get_annotation(doc_id, anno_name)):
            # Convert each annotation to the document structure Solr expects,
            # together with the sentence that contains it
            sent = find_x_including_y(sents, anno)
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': anno_name,
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                anno_name + '_txt_ja': text[anno['begin']:anno['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Register the documents with Solr
    indexer.load('anno', data)
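# find_x_including_y() is referenced above but not defined in this listing.
# Judging from its usage, it returns the annotation from xs whose span
# contains y; a minimal sketch under that assumption:
def find_x_including_y(xs, y):
    # Return the first annotation in xs whose span covers y, or None
    for x in xs:
        if x['begin'] <= y['begin'] and y['end'] <= x['end']:
            return x
    return None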
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['id', 'content', 'meta_info'])
        print(row['id'], row['meta_info'], row['content'][:100])
    datastore.close()
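# sqlitedatastore itself is not shown here. A minimal sketch of the accessors
# used in these listings (connect, get_all_ids, get, load, close), assuming a
# single SQLite table named docs; the table and column names are assumptions.
import sqlite3

conn = None


def connect(dbname='datastore.db'):
    global conn
    conn = sqlite3.connect(dbname)


def close():
    conn.close()


def get_all_ids(limit):
    sql = 'SELECT id FROM docs'
    if limit >= 0:
        sql += ' LIMIT {:d}'.format(limit)
    return [row[0] for row in conn.execute(sql)]


def get(doc_id, fl):
    row = conn.execute(
        'SELECT {} FROM docs WHERE id = ?'.format(', '.join(fl)),
        (doc_id,)).fetchone()
    return dict(zip(fl, row))


def load(values):
    # values is a list of (content, meta_info) tuples
    conn.executemany(
        'INSERT INTO docs (content, meta_info) VALUES (?, ?)', values)
    conn.commit()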
import glob
import json
import urllib.parse

import scraping
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    values = []
    for filename in glob.glob('./data/wikipedia/*.html'):
        with open(filename) as fin:
            html = fin.read()
        text, title = scraping.scrape(html)
        print('scraped:', title)
        url = 'https://ja.wikipedia.org/wiki/{}'.format(
            urllib.parse.quote(title))
        values.append((text, json.dumps({'url': url, 'title': title})))
    datastore.load(values)
    print(list(datastore.get_all_ids(limit=-1)))
    datastore.close()
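# scraping.scrape() is not shown in this listing. One plausible implementation
# extracts the page title and paragraph text with BeautifulSoup; the exact
# selectors are assumptions about the saved Wikipedia HTML.
from bs4 import BeautifulSoup


def scrape(html):
    # Return (body text, page title) for a saved Wikipedia page;
    # Wikipedia <title> tags look like "ページ名 - Wikipedia"
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('title').get_text().split(' - ')[0]
    text = '\n'.join(p.get_text() for p in soup.find_all('p'))
    return text, title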
import statistics  # project-local module, not the standard library
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    lm = statistics.create_language_model(
        datastore.get_all_ids(limit=-1), N=3)
    # List every word observed after the context, ordered by probability
    context = ('古く', 'から')
    print(context, '->')
    prob_list = [(word, lm.score(word, context))
                 for word in lm.context_counts(lm.vocab.lookup(context))]
    prob_list.sort(key=lambda x: x[1], reverse=True)
    for word, prob in prob_list:
        print('\t{:s}: {:f}'.format(word, prob))
    datastore.close()
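# statistics.create_language_model() is not shown in this listing. A minimal
# sketch with nltk.lm that matches the score()/context_counts() calls above,
# assuming 'sentence' and 'token' annotations are stored in the datastore;
# the body is an illustration, not the original implementation.
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

import sqlitedatastore as datastore


def create_language_model(doc_ids, N=3):
    sents = []
    for doc_id in doc_ids:
        text = datastore.get(doc_id, ['content'])['content']
        tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            # Collect the surface forms of the tokens inside each sentence
            sents.append([
                text[t['begin']:t['end']] for t in tokens
                if sent['begin'] <= t['begin'] and t['end'] <= sent['end']])
    # Build a maximum-likelihood N-gram model over the padded sentences
    train, vocab = padded_everygram_pipeline(N, sents)
    lm = MLE(N)
    lm.fit(train, vocab)
    return lm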
import json
import urllib.parse

import wiki_source
import sqlitedatastore

if __name__ == '__main__':
    sqlitedatastore.connect()
    values = []
    with open('./data/nation_list.txt') as f:
        for nation_name in f:
            nation_name = nation_name.rstrip('\n')
            print(nation_name)
            title, text = wiki_source.wiki_mining(nation_name)
            nation_url = ('https://ja.wikipedia.org/wiki/'
                          + urllib.parse.quote(nation_name))
            values.append((text, json.dumps({
                'url': nation_url,
                'title': title,
            })))
    sqlitedatastore.load(values)
    print(list(sqlitedatastore.get_all_ids(limit=-1)))
    for doc_id in sqlitedatastore.get_all_ids(limit=-1):
        print(doc_id)
        row = sqlitedatastore.get(doc_id, ['id', 'content', 'meta_info'])
        print(row['id'], json.loads(row['meta_info']), row['content'][:100])
    sqlitedatastore.close()
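# wiki_source.wiki_mining() is not shown in this listing. A minimal sketch
# that fetches plain-text page content through the MediaWiki API; only the
# (title, text) return shape is taken from the caller above, the body is an
# assumption.
import json
import urllib.parse
import urllib.request


def wiki_mining(page_title):
    # Query the Japanese Wikipedia API for a plain-text extract of the page
    params = urllib.parse.urlencode({
        'action': 'query',
        'prop': 'extracts',
        'explaintext': 1,
        'redirects': 1,
        'titles': page_title,
        'format': 'json',
    })
    url = 'https://ja.wikipedia.org/w/api.php?' + params
    with urllib.request.urlopen(url) as res:
        body = json.loads(res.read().decode('utf-8'))
    page = next(iter(body['query']['pages'].values()))
    return page.get('title', page_title), page.get('extract', '')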