def get():
    keywords = bottle.request.params.keywords.split()
    classifier_name = bottle.request.params.classifier
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']])
    ], rows=1000)
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        # Classify each hit with the classifier selected via the request
        # parameter. vocab_ml/model_ml, vocab_dl/model_dl, and rule are
        # globals assumed to be loaded at server startup.
        if classifier_name == 'ml':
            features = mlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_ml)
            predicteds = mlclassifier.classify(features, model_ml)
        elif classifier_name == 'dl':
            features = dlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_dl)
            predicteds = dlclassifier.classify(features, model_dl)
        elif classifier_name == 'rule':
            features = ruleclassifier.convert_into_features_using_rules(
                [(r['doc_id_i'], sent, tokens)], rule)
            predicteds = ruleclassifier.classify(features, rule)
        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])
    return json.dumps(results, ensure_ascii=False)

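# Note: the handlers here rely on find_xs_in_y (imported from annoutil in
# the standalone scripts below) to pick out the token annotations that fall
# inside a sentence span. Its implementation is not part of this listing;
# a minimal sketch consistent with how it is called, assuming annotations
# are dicts with 'begin'/'end' character offsets (an assumption, not the
# actual code), could be:
def find_xs_in_y(xs, y):
    # Keep the annotations in xs whose spans lie entirely within the span of y.
    return [x for x in xs
            if y['begin'] <= x['begin'] and x['end'] <= y['end']]
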
def get():
    name = bottle.request.params.name
    keywords = bottle.request.params.keywords.split()
    keywords_expanded = [[keyword] + [
        synonym['term']
        for synonym in dbpediaknowledge.get_synonyms(keyword)
    ] for keyword in keywords]
    if keywords_expanded != []:
        fl_keyword_pairs = [(name + '_txt_ja', keywords_expanded)]
    else:
        fl_keyword_pairs = [('name_s', [[name]])]
    results = indexer.search_annotation(fl_keyword_pairs)
    return json.dumps(results, ensure_ascii=False)

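# Each inner list of keywords_expanded groups a keyword together with its
# DBpedia synonyms, so that each group can be ORed inside a single field
# query. A hypothetical illustration of the resulting shape (the synonym
# terms are made up, and the dict stands in for
# dbpediaknowledge.get_synonyms):
keywords = ['米', '麦']
synonyms = {'米': [{'term': 'コメ'}], '麦': []}
keywords_expanded = [[kw] + [s['term'] for s in synonyms[kw]]
                     for kw in keywords]
print(keywords_expanded)  # [['米', 'コメ'], ['麦']]
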
def get():
    title = bottle.request.params.title.strip()
    keywords = bottle.request.params.keywords.split()
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('title_txt_ja', [[title]]),
        ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']])
    ], rows=1000)
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        features = sentimentclassifier.convert_into_features_using_vocab(
            [(r['doc_id_i'], sent, tokens)], vocab)
        predicteds = mlclassifier.classify(features, model)
        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])
    return json.dumps(results, ensure_ascii=False)

import sqlitedatastore as datastore
import solrindexer as indexer

if __name__ == '__main__':
    datastore.connect()
    print('#label', 'doc_id', 'sentence_id', 'text')
    results = indexer.search_annotation(
        fl_keyword_pairs=[
            ('sentence_txt_ja', [['教育', '治安', '経済']]),
            ('name_s', [['sentence']]),
        ], rows=1000)
    for r in results['response']['docs']:
        text = datastore.get(r['doc_id_i'], ['content'])['content']
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        # Convert to the label-file format: label, doc_id, sentence_id, text
        print(0, r['doc_id_i'], r['anno_id_i'], text[sent['begin']:sent['end']])
    datastore.close()

# Create the data for labeling
import sqlitedatastore as datastore
import solrindexer as indexer

if __name__ == '__main__':
    datastore.connect()
    print('#label', 'doc_id', 'sentence_id', 'text')
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [[
            '肉', '魚', '茶', '塩', '野菜', '油', '森林',
            '砂漠', '草原', '海', '木材', '果樹', '麦', '米',
        ]]),
        ('name_s', [['sentence']]),
    ], rows=1000)
    for r in results['response']['docs']:
        text = datastore.get(r['doc_id_i'], ['content'])['content']
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        # Convert to the label-file format: label, doc_id, sentence_id, text
        print(0, r['doc_id_i'], r['anno_id_i'],
              text[sent['begin']:sent['end']])
    datastore.close()

import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('name_s', [['sentence']]),
    ], rows=3000)
    sentences = []
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        sentences.append((r['doc_id_i'], sent, tokens))

    # Fetch the rule
    rule = ruleclassifier.get_rule()

    # Classify
    features = ruleclassifier.convert_into_features_using_rules(
        sentences, rule)
    predicteds = ruleclassifier.classify(features, rule)
    for predicted, (doc_id, sent, tokens) in zip(predicteds, sentences):
        if predicted == 1:
            text = datastore.get(doc_id, ['content'])['content']
            print(predicted, text[sent['begin']:sent['end']])
    datastore.close()

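# The rule format returned by ruleclassifier.get_rule is not shown in these
# listings. A minimal, purely hypothetical sketch of the technique it names
# (keyword matching, where the match result serves as both feature and
# prediction; the rule format and the token 'lemma' field are assumptions):
def get_rule():
    return {'keywords': {'肉', '魚', '米'}}

def convert_into_features_using_rules(sentences, rule):
    # One binary feature per sentence: 1 if any token lemma is a rule keyword.
    return [int(any(t.get('lemma') in rule['keywords'] for t in tokens))
            for _doc_id, _sent, tokens in sentences]

def classify(features, rule):
    return features  # with binary features, the feature is already the decision
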
import json
import solrindexer as indexer

if __name__ == '__main__':
    results = indexer.search_annotation(
        fl_keyword_pairs=[
            ('cause_txt_ja', [['気候変動']]),
            ('name_s', [['cause']])
        ])
    print(json.dumps(results, indent=4, ensure_ascii=False))

import json
import solrindexer as indexer

if __name__ == '__main__':
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('affiliation_txt_ja', [['インド']])
    ], rows=5)
    print(json.dumps(results, indent=4, ensure_ascii=False))

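# search_annotation's query construction is not shown in these listings.
# Judging from the call sites, each (field, groups) pair in fl_keyword_pairs
# names a Solr field and a list of keyword groups. One plausible translation
# into a Solr q parameter (an assumption, not the actual implementation) is
# to OR the keywords within a group and AND the resulting clauses:
def build_query(fl_keyword_pairs):
    clauses = []
    for field, groups in fl_keyword_pairs:
        for group in groups:
            clauses.append('(%s)' % ' OR '.join(
                '%s:%s' % (field, keyword) for keyword in group))
    return ' AND '.join(clauses)

print(build_query([('cause_txt_ja', [['気候変動']]),
                   ('name_s', [['cause']])]))
# (cause_txt_ja:気候変動) AND (name_s:cause)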