コード例 #1
0
def get():
    """Search annotated sentences and attach a classifier prediction to each hit.

    Reads ``keywords`` and ``classifier`` from the bottle request parameters,
    queries the Solr index for matching sentence annotations, classifies each
    hit with the requested classifier ('ml', 'dl' or 'rule'), and returns the
    enriched search results as a JSON string.

    Raises:
        ValueError: if an unknown classifier name is supplied.
    """
    keywords = bottle.request.params.keywords.split()
    classifier_name = bottle.request.params.classifier

    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [keywords]), ('name_s', [['sentence']])
    ],
                                        rows=1000)

    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'),
                              sent)

        if classifier_name == 'ml':
            features = mlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_ml)
            predicteds = mlclassifier.classify(features, model_ml)
        elif classifier_name == 'dl':
            features = dlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_dl)
            predicteds = dlclassifier.classify(features, model_dl)
        elif classifier_name == 'rule':
            features = ruleclassifier.convert_into_features_using_rules(
                [(r['doc_id_i'], sent, tokens)], rule)
            predicteds = ruleclassifier.classify(features, rule)
        else:
            # BUG FIX: an unrecognized classifier name previously left
            # ``predicteds`` unbound (NameError on the first hit) or silently
            # reused the value from the previous loop iteration.
            raise ValueError('unknown classifier: %s' % classifier_name)

        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])

    return json.dumps(results, ensure_ascii=False)
コード例 #2
0
def get():
    """Search annotations of type *name*, expanding keywords via DBpedia.

    Reads ``name`` and ``keywords`` from the bottle request parameters.
    Each keyword is expanded into a group containing the keyword itself plus
    its DBpedia synonyms; if no keywords were given, the search falls back to
    matching on the annotation name alone. Returns the results as JSON.
    """
    name = bottle.request.params.name
    keywords = bottle.request.params.keywords.split()
    # One group per keyword: the keyword followed by its DBpedia synonyms.
    keywords_expanded = [[keyword] + [
        synonym['term'] for synonym in dbpediaknowledge.get_synonyms(keyword)
    ] for keyword in keywords]
    # Idiom fix: test the list's truthiness instead of comparing with [].
    if keywords_expanded:
        fl_keyword_pairs = [(name + '_txt_ja', keywords_expanded)]
    else:
        fl_keyword_pairs = [('name_s', [[name]])]

    results = indexer.search_annotation(fl_keyword_pairs)
    return json.dumps(results, ensure_ascii=False)
コード例 #3
0
def get():
    """Search sentences by title and keywords, score each hit with the
    sentiment classifier, and return the enriched results as JSON."""
    title = bottle.request.params.title.strip()
    keywords = bottle.request.params.keywords.split()

    pairs = [
        ('title_txt_ja', [[title]]),
        ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']]),
    ]
    results = indexer.search_annotation(fl_keyword_pairs=pairs, rows=1000)

    for hit in results['response']['docs']:
        doc_id = hit['doc_id_i']
        sentence = datastore.get_annotation(doc_id, 'sentence')[hit['anno_id_i']]
        token_anns = find_xs_in_y(datastore.get_annotation(doc_id, 'token'),
                                  sentence)

        feats = sentimentclassifier.convert_into_features_using_vocab(
            [(doc_id, sentence, token_anns)], vocab)
        labels = mlclassifier.classify(feats, model)

        # numpy.int is not JSON-serializable; coerce to a plain int.
        hit['predicted'] = int(labels[0])
        print(hit['predicted'], hit['sentence_txt_ja'])

    return json.dumps(results, ensure_ascii=False)
コード例 #4
0
import sqlitedatastore as datastore
import solrindexer as indexer

if __name__ == '__main__':
    datastore.connect()
    print('#label', 'doc_id', 'sentence_id', 'text')
    pairs = [
        ('sentence_txt_ja', [['教育', '治安', '経済']]),
        ('name_s', [['sentence']]),
    ]
    results = indexer.search_annotation(fl_keyword_pairs=pairs, rows=1000)
    for hit in results['response']['docs']:
        doc_id = hit['doc_id_i']
        body = datastore.get(doc_id, ['content'])['content']
        span = datastore.get_annotation(doc_id, 'sentence')[hit['anno_id_i']]
        # Emit one row in the label-file format: label, doc id, sentence id, text.
        print(0, doc_id, hit['anno_id_i'], body[span['begin']:span['end']])
    datastore.close()
コード例 #5
0
# ラベル付与用データの作成
if __name__ == '__main__':
    datastore.connect()
    print('#label', 'doc_id', 'sentence_id', 'text')

    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [[
            '肉',
            '魚',
            '茶',
            '塩',
            '野菜',
            '油',
            '森林',
            '砂漠',
            '草原',
            '海',
            '木材',
            '果樹',
            '麦',
            '米',
        ]]),
        ('name_s', [['sentence']]),
    ],
                                        rows=1000)
    for r in results['response']['docs']:
        text = datastore.get(r['doc_id_i'], ['content'])['content']
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        # ラベルファイルのデータ構造へ変換
        print(0, r['doc_id_i'], r['anno_id_i'],
コード例 #6
0
import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    results = indexer.search_annotation(
        fl_keyword_pairs=[('name_s', [['sentence']])],
        rows=3000)

    # Collect (doc_id, sentence annotation, token annotations) triples.
    sentences = []
    for hit in results['response']['docs']:
        doc_id = hit['doc_id_i']
        sentence = datastore.get_annotation(doc_id, 'sentence')[hit['anno_id_i']]
        token_anns = find_xs_in_y(datastore.get_annotation(doc_id, 'token'),
                                  sentence)
        sentences.append((doc_id, sentence, token_anns))

    # Load the classification rules.
    rule = ruleclassifier.get_rule()

    # Classify every sentence and print the ones labeled positive.
    features = ruleclassifier.convert_into_features_using_rules(sentences, rule)
    predicteds = ruleclassifier.classify(features, rule)
    for label, (doc_id, sentence, _) in zip(predicteds, sentences):
        if label == 1:
            body = datastore.get(doc_id, ['content'])['content']
            print(label, body[sentence['begin']:sentence['end']])
    datastore.close()
コード例 #7
0
import json

import solrindexer as indexer

if __name__ == '__main__':
    # Look up 'cause' annotations whose Japanese text mentions 気候変動.
    pairs = [
        ('cause_txt_ja', [['気候変動']]),
        ('name_s', [['cause']]),
    ]
    results = indexer.search_annotation(fl_keyword_pairs=pairs)
    print(json.dumps(results, indent=4, ensure_ascii=False))
コード例 #8
0
import json

import solrindexer as indexer

if __name__ == '__main__':
    # Fetch up to five hits whose affiliation field mentions インド.
    hits = indexer.search_annotation(
        fl_keyword_pairs=[('affiliation_txt_ja', [['インド']])],
        rows=5)
    print(json.dumps(hits, indent=4, ensure_ascii=False))