Example #1
def get():
    keywords = bottle.request.params.keywords.split()
    classifier_name = bottle.request.params.classifier

    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [keywords]), ('name_s', [['sentence']])
    ],
                                        rows=1000)

    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'),
                              sent)

        if classifier_name == 'ml':
            features = mlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_ml)
            predicteds = mlclassifier.classify(features, model_ml)
        elif classifier_name == 'dl':
            features = dlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_dl)
            predicteds = dlclassifier.classify(features, model_dl)
        elif classifier_name == 'rule':
            features = ruleclassifier.convert_into_features_using_rules(
                [(r['doc_id_i'], sent, tokens)], rule)
            predicteds = ruleclassifier.classify(features, rule)

        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])

    return json.dumps(results, ensure_ascii=False)
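A minimal sketch of how a handler like this could be mounted with Bottle (the route path and port are hypothetical, not taken from the original application):

import bottle

# Hypothetical wiring; the real application defines its own routes
bottle.route('/classify', 'GET', get)
bottle.run(host='localhost', port=8080)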
Example #2
def get():
    doc_id = bottle.request.params.id
    names = bottle.request.params.names.split()

    row = datastore.get(doc_id, fl=['content'])
    text = row['content']
    # text = re.sub(r'[。!]', '\n', text)

    data = {
        'collection': {
            'entity_types':     [],
        },
        'annotation': {
            'text':             text,
            'entities':         [],
            'relations':        [],
        },
    }

    mapping = {}
    for name in names:
        # Register each annotation name once as an entity type for brat
        data['collection']['entity_types'].append({
            'type':        name,
            'bgColor':     '#7fa2ff',
            'borderColor': 'darken'
        })

        annos = datastore.get_annotation(doc_id, name)
        for i, anno in enumerate(annos):
            Ti = 'T{0:d}'.format(len(data['annotation']['entities']) + 1)
            data['annotation']['entities'].append([
                Ti,
                name,
                [[anno['begin'], anno['end']]]
            ])
            mapping[(name, i)] = Ti

    for name in names:
        annos = datastore.get_annotation(doc_id, name)
        for i, anno in enumerate(annos):
            if 'link' not in anno:
                continue
            name_linked, i_linked = anno['link']
            if (name, i) not in mapping or (name_linked, i_linked) not in mapping:
                continue

            data['annotation']['relations'].append([
                'R{0:d}'.format(len(data['annotation']['relations']) + 1),
                'arg',
                [['src', mapping[(name, i)]], ['tgt', mapping[(name_linked, i_linked)]]]
            ])

    return json.dumps(data, ensure_ascii=False)
Example #3
# Assumed imports: NLTK's language-model API and this project's datastore helpers
from nltk.lm import MLE, Vocabulary
from nltk.util import ngrams

import sqlitedatastore as datastore
from annoutil import find_xs_in_y


def create_language_model(doc_ids, N=3):
    sents = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sents.append(['__BOS__'] + [token['lemma']
                                        for token in tokens] + ['__EOS__'])
    vocab = Vocabulary([word for sent in sents for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sents]
    lm = MLE(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm
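A minimal usage sketch (the document ids and the lemmas in the query are hypothetical; it assumes the datastore connection is already open):

lm = create_language_model(doc_ids=[1, 2, 3], N=3)
# MLE probability of a lemma given the two preceding lemmas (trigram context)
print(lm.score('影響', ('__BOS__', '経済')))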
Example #4
def load_affiliation():
    anno_name = 'affiliation'
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sents = datastore.get_annotation(doc_id, 'sentence')
        for i, anno in enumerate(datastore.get_annotation(doc_id, anno_name)):
            # Convert into the data structure to be indexed in Solr
            sent = find_x_including_y(sents, anno)
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': anno_name,
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                anno_name + '_txt_ja': text[anno['begin']:anno['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Register the data in Solr
    indexer.load('anno', data)
Example #5
def create_index_data(doc_id, meta_info, anno_name, anno, i, sent, text):
    ref_anno_name, link = anno['link']
    ref_anno = datastore.get_annotation(doc_id, ref_anno_name)[link]
    data = {
        'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
        'doc_id_i': doc_id,
        'anno_id_i': i,
        'name_s': anno_name,
        'sentence_txt_ja': text[sent['begin']:sent['end']],
        anno_name + '_txt_ja': text[anno['begin']:anno['end']],
        ref_anno_name + '_txt_ja': text[ref_anno['begin']:ref_anno['end']],
        'title_txt_ja': meta_info['title'],
        'url_s': meta_info['url'],
    }
    return data
Example #6
def get():
    title = bottle.request.params.title.strip()
    keywords = bottle.request.params.keywords.split()

    results = indexer.search_annotation(fl_keyword_pairs=[
        ('title_txt_ja', [[title]]), ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']])
    ],
                                        rows=1000)

    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'),
                              sent)

        features = sentimentclassifier.convert_into_features_using_vocab(
            [(r['doc_id_i'], sent, tokens)], vocab)
        predicteds = mlclassifier.classify(features, model)

        r['predicted'] = int(predicteds[0])  # convert from numpy.int to int
        print(r['predicted'], r['sentence_txt_ja'])

    return json.dumps(results, ensure_ascii=False)
Example #7
def create_annotation(doc_id, ptn):
    row = datastore.get(doc_id, fl=['content'])
    text = row['content']
    annos = []
    for chunk in datastore.get_annotation(doc_id, 'chunk'):
        chunk_str = text[chunk['begin']:chunk['end']]
        m = ptn.search(chunk_str)
        if not m:
            continue
        anno = {
            'begin': chunk['begin'] + m.start(),
            'end': chunk['begin'] + m.end(),
        }
        print(text[anno['begin']:anno['end']])
        annos.append(anno)
    return annos
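A minimal usage sketch (the document id and the pattern are hypothetical; it assumes sqlitedatastore is the datastore module used throughout these examples):

import re

import sqlitedatastore as datastore

datastore.connect()
# Hypothetical pattern: chunks mentioning 影響 ("influence")
annos = create_annotation(doc_id=1, ptn=re.compile(r'影響'))
datastore.close()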
Example #8
def load_sentence():
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        for i, sent in enumerate(datastore.get_annotation(doc_id, 'sentence')):
            # Convert into the data structure to be indexed in Solr
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, 'sentence', i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': 'sentence',
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Register the data in Solr
    indexer.load('anno', data)
Example #9
import time

import mlclassifier
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    # Load the labeled data
    sentences = []
    labels = []
    with open('./data/labels.txt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            d = line.rstrip().split()
            label, doc_id, sent_id = int(d[0]), d[1], int(d[2])
            sent = datastore.get_annotation(doc_id, 'sentence')[sent_id]
            tokens = find_xs_in_y(datastore.get_annotation(doc_id, 'token'),
                                  sent)
            sentences.append((doc_id, sent, tokens))
            labels.append(label)

    # Generate features from the training data
    num_train = int(len(sentences) * 0.8)
    sentences_train = sentences[:num_train]
    labels_train = labels[:num_train]
    features, vocab = mlclassifier.convert_into_features(sentences_train)

    # Train the model
    time_s = time.time()
    print(':::TRAIN START')
    model = mlclassifier.train(labels_train, features)
Example #10
import time

import mlclassifier
import sentimentclassifier
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    # Load the labeled data
    sentences = []
    labels = []
    with open('data/labels_sentiment.txt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            d = line.rstrip().split()
            label, doc_id, sent_id = int(d[0]), d[1], int(d[2])
            sent = datastore.get_annotation(doc_id, 'sentence')[sent_id]
            tokens = find_xs_in_y(
                datastore.get_annotation(doc_id, 'token'), sent)
            sentences.append((doc_id, sent, tokens))
            labels.append(label)

    # Generate features from the training data
    num_train = int(len(sentences) * 0.8)
    sentences_train = sentences[:num_train]
    labels_train = labels[:num_train]
    features, vocab = sentimentclassifier.convert_into_features(sentences_train)

    # Train the model
    time_s = time.time()
    print(':::TRAIN START')
    model = mlclassifier.train(labels_train, features)
Example #11
import itertools
import logging
import math

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

from annoutil import find_xs_in_y
import sqlitedatastore as datastore

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':
    datastore.connect()
    sentences = []
    for doc_id in datastore.get_all_ids(limit=-1):
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sentences.append(
                [token['lemma'] for token in tokens if token.get('NE') == 'O'])

    n_sent = 20
    docs = [
        list(itertools.chain.from_iterable(sentences[i:i + n_sent]))
        for i in range(0, len(sentences), n_sent)
    ]

    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=2, no_above=0.3)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
Example #12
import sqlitedatastore as datastore
import triematcher as matcher
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    dic_positive, dic_negative = matcher.get_sentiment_dictionaries()
    doc_id = 1
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        tokens = find_xs_in_y(datastore.get_annotation(doc_id, 'token'), sent)
        text = ''.join([token['lemma'] for token in tokens])
        print(text, '-->')
        print('\tpositive:', matcher.search_terms(text, dic_positive))
        print('\tnegative:', matcher.search_terms(text, dic_negative))
    datastore.close()
Example #13
import json

import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_x_including_y


def create_index_data(doc_id, meta_info, anno_name, anno, i, sent, text):
    ref_anno_name, link = anno['link']
    ref_anno = datastore.get_annotation(doc_id, ref_anno_name)[link]
    data = {
        'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
        'doc_id_i': doc_id,
        'anno_id_i': i,
        'name_s': anno_name,
        'sentence_txt_ja': text[sent['begin']:sent['end']],
        anno_name + '_txt_ja': text[anno['begin']:anno['end']],
        ref_anno_name + '_txt_ja': text[ref_anno['begin']:ref_anno['end']],
        'title_txt_ja': meta_info['title'],
        'url_s': meta_info['url'],
    }
    return data


if __name__ == '__main__':
    datastore.connect()
    anno_name = 'cause'
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sents = datastore.get_annotation(doc_id, 'sentence')
        for i, anno in enumerate(datastore.get_annotation(doc_id, anno_name)):
            sent = find_x_including_y(sents, anno)
            data.append(
                create_index_data(doc_id, meta_info, anno_name, anno, i, sent,
                                  text))

    # Register the data in Solr
    indexer.load('anno', data)
    datastore.close()
Example #14
import json

from sklearn.feature_extraction.text import TfidfVectorizer

import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()

    data = []
    doc_ids = []
    for doc_id in datastore.get_all_ids(limit=-1):
        data.append(' '.join([
            token['lemma']
            for token in datastore.get_annotation(doc_id, 'token')
        ]))
        doc_ids.append(doc_id)

    vectorizer = TfidfVectorizer(analyzer='word', max_df=0.9)
    vecs = vectorizer.fit_transform(data)

    for doc_id, vec in zip(doc_ids, vecs.toarray()):
        meta_info = json.loads(
            datastore.get(doc_id, ['meta_info'])['meta_info'])
        title = meta_info['title']
        print(doc_id, title)

        for w_id, tfidf in sorted(enumerate(vec),
                                  key=lambda x: x[1],
                                  reverse=True)[:10]:
            lemma = vectorizer.get_feature_names()[w_id]
Example #15
import sqlitedatastore as datastore
from annoutil import find_x_including_y, find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    anno_name = 'affiliation'

    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        sentences = datastore.get_annotation(doc_id, 'sentence')
        tokens = datastore.get_annotation(doc_id, 'token')
        annos = datastore.get_annotation(doc_id, anno_name)
        for sentence in sentences:
            annos_in_sentence = find_xs_in_y(annos, sentence)
            if annos_in_sentence == []:
                continue
            prev = False
            for token in find_xs_in_y(tokens, sentence):
                if find_x_including_y(annos_in_sentence, token) is None:
                    prev = False
                    print('{0}\t{1}\t{2}'.format(
                        text[token['begin']:token['end']], token['POS'], 'O'))
                else:
                    if prev:
                        print('{0}\t{1}\tI-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    else:
                        print('{0}\t{1}\tB-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    prev = True
Example #16
import sqlitedatastore as datastore
import solrindexer as indexer

if __name__ == '__main__':
    datastore.connect()
    print('#label', 'doc_id', 'sentence_id', 'text')
    results = indexer.search_annotation(
        fl_keyword_pairs=[
            ('sentence_txt_ja', [['教育', '治安', '経済']]),
            ('name_s',         [['sentence']]),
        ], rows=1000)
    for r in results['response']['docs']:
        text = datastore.get(r['doc_id_i'], ['content'])['content']
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        # Convert into the label-file data structure
        print(0, r['doc_id_i'], r['anno_id_i'], text[sent['begin']:sent['end']])
    datastore.close()
Example #17
#!/usr/bin/env python

import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=3):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']

        print('tokens:')
        for token in datastore.get_annotation(doc_id, 'token'):
            print('    ', token['POS'], '\t',
                  text[token['begin']:token['end']])

        print('chunks:')
        chunks = datastore.get_annotation(doc_id, 'chunk')
        for chunk in chunks:
            _, link = chunk['link']
            print('    ', text[chunk['begin']:chunk['end']])
            if link != -1:
                parent = chunks[link]
                print('\t-->', text[parent['begin']:parent['end']])
            else:
                print('\t-->', 'None')

        print('sentences:')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            print('    ', text[sent['begin']:sent['end']])

    datastore.close()
Example #18
def extract_relation(doc_id):
    text = datastore.get(doc_id, fl=['content'])['content']
    all_chunks = datastore.get_annotation(doc_id, 'chunk')
    all_tokens = datastore.get_annotation(doc_id, 'token')
    anno_id = 0
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        chunks = find_xs_in_y(all_chunks, sent)
        tokens = find_xs_in_y(all_tokens, sent)
        for chunk in chunks:
            chunk_tokens = find_xs_in_y(tokens, chunk)
            # Only consider chunks containing the verb 与える (as in 影響を与える, "to affect")
            if not any([
                    chunk_token['lemma'] == '与える'
                    for chunk_token in chunk_tokens
            ]):
                continue

            # The trigger: a child chunk whose text is 影響を ("influence" + object marker)
            affect, affect_tokens = find_child(chunk,
                                               chunks,
                                               tokens,
                                               text,
                                               all_chunks,
                                               child_cond={'text': ['影響を']})
            if affect is None:
                continue

            # The cause: a child chunk marked with a topic/subject particle (は, も, が)
            cause, cause_tokens = find_child(chunk,
                                             chunks,
                                             tokens,
                                             text,
                                             all_chunks,
                                             child_cond={
                                                 'pos1': ['助詞'],
                                                 'lemma1': ['は', 'も', 'が'],
                                                 'pos2_ng': ['助詞'],
                                             })
            if cause is None:
                continue

            # The effect: a child chunk marked with the particle に
            effect, effect_tokens = find_child(chunk,
                                               chunks,
                                               tokens,
                                               text,
                                               all_chunks,
                                               child_cond={
                                                   'pos1': ['助詞'],
                                                   'lemma1': ['に'],
                                                   'pos2_ng': ['助詞'],
                                               })
            if effect is None:
                continue

            cause = extend_phrase(cause, cause_tokens, tokens, all_chunks)
            effect = extend_phrase(effect, effect_tokens, tokens, all_chunks)

            relation = {
                'cause': {
                    'begin': cause['begin'],
                    'end': cause['end'],
                    'link': ('effect', anno_id),
                },
                'effect': {
                    'begin': effect['begin'],
                    'end': effect['end'],
                }
            }

            anno_id += 1
            yield sent, relation
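A minimal usage sketch of the generator above (the document id is hypothetical; it assumes an open datastore connection):

import sqlitedatastore as datastore

datastore.connect()
doc_id = 1  # hypothetical document id
text = datastore.get(doc_id, fl=['content'])['content']
for sent, relation in extract_relation(doc_id):
    cause, effect = relation['cause'], relation['effect']
    print(text[cause['begin']:cause['end']], '-->',
          text[effect['begin']:effect['end']])
datastore.close()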
Example #19
import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('name_s', [['sentence']]),
    ],
                                        rows=3000)
    sentences = []
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'],
                                        'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'),
                              sent)
        sentences.append((r['doc_id_i'], sent, tokens))

    # Get the rules
    rule = ruleclassifier.get_rule()

    # Classify
    features = ruleclassifier.convert_into_features_using_rules(
        sentences, rule)
    predicteds = ruleclassifier.classify(features, rule)
    for predicted, (doc_id, sent, tokens) in zip(predicteds, sentences):
        if predicted == 1:
            text = datastore.get(doc_id, ['content'])['content']
            print(predicted, text[sent['begin']:sent['end']])
    datastore.close()
Example #20
import sqlitedatastore as datastore

if __name__ == '__main__':
    datastore.connect()
    anno_name = 'affiliation'
    for doc_id in datastore.get_all_ids(limit=-1):
        text = datastore.get(doc_id, fl=['content'])['content']
        with open('result/brat/{0}.txt'.format(doc_id), 'w') as f:
            f.write(text)
        with open('result/brat/{0}.ann'.format(doc_id), 'w') as f:
            for i, anno in enumerate(
                    datastore.get_annotation(doc_id, anno_name)):
                f.write('T{0}\t{1} {2} {3}\t{4}\n'.format(
                    i, 'affiliation', anno['begin'], anno['end'],
                    text[anno['begin']:anno['end']]))
    datastore.close()