def get():
    """Bottle handler: build a brat-style JSON payload for one document.

    Reads ``id`` (document id) and ``names`` (whitespace-separated
    annotation names) from the request parameters and returns JSON with:
      - ``collection.entity_types``: one display entry per annotation name
      - ``annotation.entities``: a ``Tn`` span per annotation instance
      - ``annotation.relations``: ``Rn`` links derived from ``link`` fields

    Fix vs. previous version: the entity-type entry is emitted once per
    name (it used to be duplicated once per annotation instance), and the
    annotations are fetched from the datastore only once per name.
    """
    doc_id = bottle.request.params.id
    names = bottle.request.params.names.split()
    row = datastore.get(doc_id, fl=['content'])
    text = row['content']
    data = {
        'collection': {
            'entity_types': [],
        },
        'annotation': {
            'text': text,
            'entities': [],
            'relations': [],
        },
    }
    # Cache per-name annotation lists so the relation pass below does not
    # query the datastore a second time.
    annos_by_name = {}
    # (name, index-within-name) -> brat entity id "Tn"
    mapping = {}
    for name in names:
        annos_by_name[name] = datastore.get_annotation(doc_id, name)
        # One entity-type entry per name, not per annotation instance.
        data['collection']['entity_types'].append({
            'type': name,
            'bgColor': '#7fa2ff',
            'borderColor': 'darken'
        })
        for i, anno in enumerate(annos_by_name[name]):
            Ti = 'T{0:d}'.format(len(data['annotation']['entities']) + 1)
            data['annotation']['entities'].append([
                Ti, name, [[anno['begin'], anno['end']]]
            ])
            mapping[(name, i)] = Ti
    # Second pass: relations can only be emitted once all entity ids exist.
    for name in names:
        for i, anno in enumerate(annos_by_name[name]):
            if 'link' not in anno:
                continue
            name_linked, i_linked = anno['link']
            # Skip links whose endpoints were not rendered as entities.
            if (name, i) not in mapping or (name_linked, i_linked) not in mapping:
                continue
            data['annotation']['relations'].append([
                'R{0:d}'.format(len(data['annotation']['relations']) + 1),
                'arg',
                [['src', mapping[(name, i)]],
                 ['tgt', mapping[(name_linked, i_linked)]]]
            ])
    return json.dumps(data, ensure_ascii=False)
def create_annotation(doc_id, ptn):
    """Scan every 'chunk' annotation of *doc_id* for the compiled
    pattern *ptn* and return a list of ``{'begin': …, 'end': …}`` spans
    expressed as offsets into the full document text.

    Each matched span is also echoed to stdout.
    """
    text = datastore.get(doc_id, fl=['content'])['content']
    spans = []
    for chunk in datastore.get_annotation(doc_id, 'chunk'):
        begin = chunk['begin']
        # Search only within this chunk's slice of the document.
        match = ptn.search(text[begin:chunk['end']])
        if match is None:
            continue
        # Translate match offsets back into document coordinates.
        span = {
            'begin': begin + match.start(),
            'end': begin + match.end(),
        }
        print(text[span['begin']:span['end']])
        spans.append(span)
    return spans
def load_sentence():
    """Index every 'sentence' annotation of every document into Solr
    (collection 'anno'), one record per sentence.
    """
    records = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sentences = datastore.get_annotation(doc_id, 'sentence')
        for i, sent in enumerate(sentences):
            # Convert into the record structure expected by Solr.
            records.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, 'sentence', i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': 'sentence',
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Run the Solr load.
    indexer.load('anno', records)
def load_affiliation():
    """Index every 'affiliation' annotation into Solr (collection 'anno').

    Each record carries both the annotation text and the text of its
    enclosing sentence. Annotations that are not contained in any
    sentence are skipped instead of crashing on a ``None`` lookup.
    """
    anno_name = 'affiliation'
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sents = datastore.get_annotation(doc_id, 'sentence')
        for i, anno in enumerate(datastore.get_annotation(doc_id, anno_name)):
            # Locate the sentence that contains this annotation.
            sent = find_x_including_y(sents, anno)
            if sent is None:
                # find_x_including_y can return None (see its other call
                # sites); subscripting it below would raise TypeError.
                continue
            # Convert into the record structure expected by Solr.
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': anno_name,
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                anno_name + '_txt_ja': text[anno['begin']:anno['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Run the Solr load.
    indexer.load('anno', data)
import sqlitedatastore as datastore

if __name__ == '__main__':
    # Dump id, metadata, and a 100-character content preview of every
    # document in the datastore.
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=-1):
        record = datastore.get(doc_id, ['id', 'content', 'meta_info'])
        preview = record['content'][:100]
        print(record['id'], record['meta_info'], preview)
    datastore.close()
'doc_id_i': doc_id, 'anno_id_i': i, 'name_s': anno_name, 'sentence_txt_ja': text[sent['begin']:sent['end']], anno_name + '_txt_ja': text[anno['begin']:anno['end']], ref_anno_name + '_txt_ja': text[ref_anno['begin']:ref_anno['end']], 'title_txt_ja': meta_info['title'], 'url_s': meta_info['url'], } return data if __name__ == '__main__': datastore.connect() anno_name = 'cause' data = [] for doc_id in datastore.get_all_ids(limit=-1): row = datastore.get(doc_id, fl=['content', 'meta_info']) text = row['content'] meta_info = json.loads(row['meta_info']) sents = datastore.get_annotation(doc_id, 'sentence') for i, anno in enumerate(datastore.get_annotation(doc_id, anno_name)): sent = find_x_including_y(sents, anno) data.append( create_index_data(doc_id, meta_info, anno_name, anno, i, sent, text)) # Solr への登録を実行 indexer.load('anno', data) datastore.close()
import sqlitedatastore as datastore from annoutil import find_x_including_y, find_xs_in_y if __name__ == '__main__': datastore.connect() anno_name = 'affiliation' for doc_id in datastore.get_all_ids(limit=-1): row = datastore.get(doc_id, fl=['content']) text = row['content'] sentences = datastore.get_annotation(doc_id, 'sentence') tokens = datastore.get_annotation(doc_id, 'token') annos = datastore.get_annotation(doc_id, anno_name) for sentence in sentences: annos_in_sentence = find_xs_in_y(annos, sentence) if annos_in_sentence == []: continue prev = False for token in find_xs_in_y(tokens, sentence): if find_x_including_y(annos_in_sentence, token) is None: prev = False print('{0}\t{1}\t{2}'.format( text[token['begin']:token['end']], token['POS'], 'O')) else: if prev: print('{0}\t{1}\tI-{2}'.format( text[token['begin']:token['end']], token['POS'], anno_name)) else: print('{0}\t{1}\tB-{2}'.format( text[token['begin']:token['end']], token['POS'],
import sqlitedatastore as datastore

if __name__ == '__main__':
    # Build one whitespace-joined lemma string per document, fit a tf-idf
    # model over the corpus, and print each document's top-10 terms.
    datastore.connect()
    data = []
    doc_ids = []
    for doc_id in datastore.get_all_ids(limit=-1):
        data.append(' '.join([
            token['lemma']
            for token in datastore.get_annotation(doc_id, 'token')
        ]))
        doc_ids.append(doc_id)
    vectorizer = TfidfVectorizer(analyzer='word', max_df=0.9)
    vecs = vectorizer.fit_transform(data)
    # Hoisted out of the loops: get_feature_names() rebuilds the whole
    # vocabulary list on every call, which was repeated per printed term.
    # NOTE(review): get_feature_names() is removed in scikit-learn >= 1.2
    # in favour of get_feature_names_out() — confirm the pinned version.
    feature_names = vectorizer.get_feature_names()
    for doc_id, vec in zip(doc_ids, vecs.toarray()):
        meta_info = json.loads(
            datastore.get(doc_id, ['meta_info'])['meta_info'])
        title = meta_info['title']
        print(doc_id, title)
        # Top-10 lemmas by tf-idf weight for this document.
        for w_id, tfidf in sorted(enumerate(vec),
                                  key=lambda x: x[1],
                                  reverse=True)[:10]:
            lemma = feature_names[w_id]
            print('\t{0:s}: {1:f}'.format(lemma, tfidf))
    datastore.close()
import sqlitedatastore as datastore
import solrindexer as indexer

if __name__ == '__main__':
    # Query Solr for sentences matching the keyword set and emit them in
    # a label-file layout: label, doc id, sentence id, sentence text.
    datastore.connect()
    print('#label', 'doc_id', 'sentence_id', 'text')
    results = indexer.search_annotation(
        fl_keyword_pairs=[
            ('sentence_txt_ja', [['教育', '治安', '経済']]),
            ('name_s', [['sentence']]),
        ],
        rows=1000)
    for doc in results['response']['docs']:
        doc_id = doc['doc_id_i']
        sent_id = doc['anno_id_i']
        content = datastore.get(doc_id, ['content'])['content']
        sentence = datastore.get_annotation(doc_id, 'sentence')[sent_id]
        # Label 0 is the placeholder to be filled in by hand later.
        print(0, doc_id, sent_id, content[sentence['begin']:sentence['end']])
    datastore.close()
sentences_train = sentences[:num_train] labels_train = labels[:num_train] features, vocab = mlclassifier.convert_into_features(sentences_train) # 学習 time_s = time.time() print(':::TRAIN START') model = mlclassifier.train(labels_train, features) print(':::TRAIN FINISHED', time.time() - time_s) # 学習モデルをファイルに保存 joblib.dump(model, 'result/model.pkl') joblib.dump(vocab, 'result/vocab.pkl') # 分類の実行 features_test = mlclassifier.convert_into_features_using_vocab( sentences[num_train:], vocab) predicteds = mlclassifier.classify(features_test, model) for predicted, (doc_id, sent, tokens), label in zip(predicteds, sentences[num_train:], labels[num_train:]): # 結果の確認 text = datastore.get(doc_id, ['content'])['content'] if predicted == label: print('correct ', ' ', label, predicted, text[sent['begin']:sent['end']]) else: print('incorrect', ' ', label, predicted, text[sent['begin']:sent['end']]) datastore.close()
def extract_relation(doc_id):
    """Yield ``(sentence, relation)`` pairs for cause/effect patterns.

    Looks for dependency chunks containing the verb lemma '与える'
    ("give/exert") with a child chunk '影響を' ("influence"), i.e.
    clauses of the shape "X は/も/が Y に 影響を与える", and extracts the
    X span as 'cause' and the Y span as 'effect'.

    NOTE(review): the exact matching semantics of find_child /
    extend_phrase (defined elsewhere) are assumed from their call shape —
    confirm against their definitions.
    """
    text = datastore.get(doc_id, fl=['content'])['content']
    all_chunks = datastore.get_annotation(doc_id, 'chunk')
    all_tokens = datastore.get_annotation(doc_id, 'token')
    anno_id = 0
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        # Restrict chunks/tokens to the current sentence.
        chunks = find_xs_in_y(all_chunks, sent)
        tokens = find_xs_in_y(all_tokens, sent)
        for chunk in chunks:
            chunk_tokens = find_xs_in_y(tokens, chunk)
            # Only chunks containing the lemma '与える' are candidates.
            if not any([
                    chunk_token['lemma'] == '与える'
                    for chunk_token in chunk_tokens
            ]):
                continue
            # The chunk must have a child chunk whose text is '影響を'.
            affect, affect_tokens = find_child(chunk, chunks, tokens, text,
                                               all_chunks,
                                               child_cond={'text': ['影響を']})
            if affect is None:
                continue
            # Cause: child marked by topic/subject particles は/も/が
            # ('助詞' = particle) and not followed by another particle.
            cause, cause_tokens = find_child(chunk, chunks, tokens, text,
                                             all_chunks,
                                             child_cond={
                                                 'pos1': ['助詞'],
                                                 'lemma1': ['は', 'も', 'が'],
                                                 'pos2_ng': ['助詞'],
                                             })
            if cause is None:
                continue
            # Effect: child marked by the particle に.
            effect, effect_tokens = find_child(chunk, chunks, tokens, text,
                                               all_chunks,
                                               child_cond={
                                                   'pos1': ['助詞'],
                                                   'lemma1': ['に'],
                                                   'pos2_ng': ['助詞'],
                                               })
            if effect is None:
                continue
            # Widen both spans to full phrases before recording them.
            cause = extend_phrase(cause, cause_tokens, tokens, all_chunks)
            effect = extend_phrase(effect, effect_tokens, tokens, all_chunks)
            relation = {
                'cause': {
                    'begin': cause['begin'],
                    'end': cause['end'],
                    # Link the cause span to its paired effect span.
                    'link': ('effect', anno_id),
                },
                'effect': {
                    'begin': effect['begin'],
                    'end': effect['end'],
                }
            }
            anno_id += 1
            yield sent, relation