Example #1
0
def get():
    """Return visualization data for one document as a JSON string.

    Reads two request parameters from ``bottle.request.params``:

    * ``id``    -- identifier of the document to load from the datastore.
    * ``names`` -- whitespace-separated annotation names to display.

    The result has a ``collection`` part (``entity_types``) and an
    ``annotation`` part (``text``, ``entities``, ``relations``):

    * every annotation becomes an entity ``[Tn, name, [[begin, end]]]``;
    * an annotation with a ``link`` entry ``(name, index)`` additionally
      becomes a relation ``[Rn, 'arg', [['src', Tn], ['tgt', Tm]]]``,
      provided both ends were emitted as entities;
    * each annotation name that has at least one annotation is declared
      exactly once in ``entity_types``.

    NOTE(review): the layout looks like the brat embedded-visualization
    format (collData / docData) -- confirm against the front end.

    Returns:
        str: the JSON document, serialized with ``ensure_ascii=False``.
    """
    doc_id = bottle.request.params.id
    names = bottle.request.params.names.split()

    row = datastore.get(doc_id, fl=['content'])
    text = row['content']

    entity_types = []
    entities = []
    relations = []

    # Query each annotation layer once and reuse it for both passes below.
    annos_by_name = {}
    for name in names:
        if name not in annos_by_name:
            annos_by_name[name] = datastore.get_annotation(doc_id, name)

    # Pass 1: emit the entities and remember which id each (name, index)
    # received so that links can be resolved in pass 2.
    mapping = {}
    declared_types = set()
    for name in names:
        for i, anno in enumerate(annos_by_name[name]):
            if name not in declared_types:
                # One declaration per type is enough; repeating it for every
                # annotation only bloats the response.
                declared_types.add(name)
                entity_types.append({
                    'type':        name,
                    'bgColor':     '#7fa2ff',
                    'borderColor': 'darken',
                })

            Ti = 'T{0:d}'.format(len(entities) + 1)
            entities.append([
                Ti,
                name,
                [[anno['begin'], anno['end']]],
            ])
            mapping[(name, i)] = Ti

    # Pass 2: turn 'link' entries into relations between emitted entities.
    for name in names:
        for i, anno in enumerate(annos_by_name[name]):
            if 'link' not in anno:
                continue
            name_linked, i_linked = anno['link']
            src = mapping.get((name, i))
            tgt = mapping.get((name_linked, i_linked))
            if src is None or tgt is None:
                # The link target's layer was not requested; nothing to draw.
                continue

            relations.append([
                'R{0:d}'.format(len(relations) + 1),
                'arg',
                [['src', src], ['tgt', tgt]],
            ])

    data = {
        'collection': {
            'entity_types': entity_types,
        },
        'annotation': {
            'text':      text,
            'entities':  entities,
            'relations': relations,
        },
    }
    return json.dumps(data, ensure_ascii=False)
def create_annotation(doc_id, ptn):
    """Collect the first match of ``ptn`` inside every chunk of a document.

    Args:
        doc_id: identifier of the document in the datastore.
        ptn: object with a ``search(str)`` method whose truthy result
            exposes ``start()`` and ``end()`` (presumably a compiled regular
            expression -- confirm with callers).

    Returns:
        list: one ``{'begin': ..., 'end': ...}`` dict per chunk in which
        ``ptn`` matched, with offsets relative to the whole document text.

    Side effects:
        Prints the matched text of every hit.
    """
    content = datastore.get(doc_id, fl=['content'])['content']
    found = []
    for chunk in datastore.get_annotation(doc_id, 'chunk'):
        offset = chunk['begin']
        hit = ptn.search(content[offset:chunk['end']])
        if hit:
            # Match positions are relative to the chunk; shift them so they
            # index into the full document text.
            span = {
                'begin': offset + hit.start(),
                'end': offset + hit.end(),
            }
            print(content[span['begin']:span['end']])
            found.append(span)
    return found
Example #3
0
def load_sentence():
    """Load every 'sentence' annotation of every document into the index.

    For each document id returned by the datastore, reads the document's
    content and JSON-encoded meta information, builds one record per
    sentence annotation and finally hands all records to
    ``indexer.load('anno', ...)``.
    """
    records = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        body = row['content']
        meta = json.loads(row['meta_info'])
        sentences = datastore.get_annotation(doc_id, 'sentence')
        # Convert each sentence into the record structure registered in Solr.
        records.extend(
            {
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, 'sentence', idx),
                'doc_id_i': doc_id,
                'anno_id_i': idx,
                'name_s': 'sentence',
                'sentence_txt_ja': body[span['begin']:span['end']],
                'title_txt_ja': meta['title'],
                'url_s': meta['url'],
            }
            for idx, span in enumerate(sentences)
        )
    # Perform the registration in Solr.
    indexer.load('anno', records)
Example #4
0
def load_affiliation():
    """Load every 'affiliation' annotation of every document into the index.

    For each document, reads its content and JSON-encoded meta information,
    looks up the sentence enclosing each affiliation annotation and builds
    one record per annotation containing both the sentence text and the
    annotated text.  All records are handed to ``indexer.load('anno', ...)``
    in a single call at the end.

    Annotations for which ``find_x_including_y`` reports no enclosing
    sentence (``None``) are skipped instead of aborting the whole run with a
    ``TypeError``.  Record ids still use the annotation's original index, so
    ids of the remaining records are unaffected.
    """
    anno_name = 'affiliation'
    data = []
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, ['content', 'meta_info'])
        text = row['content']
        meta_info = json.loads(row['meta_info'])
        sents = datastore.get_annotation(doc_id, 'sentence')
        for i, anno in enumerate(datastore.get_annotation(doc_id, anno_name)):
            sent = find_x_including_y(sents, anno)
            if sent is None:
                # No sentence contains this annotation; there is no sentence
                # text to index for it.
                continue
            # Convert into the record structure registered in Solr.
            data.append({
                'id': '{0:d}.{1:s}.{2:d}'.format(doc_id, anno_name, i),
                'doc_id_i': doc_id,
                'anno_id_i': i,
                'name_s': anno_name,
                'sentence_txt_ja': text[sent['begin']:sent['end']],
                anno_name + '_txt_ja': text[anno['begin']:anno['end']],
                'title_txt_ja': meta_info['title'],
                'url_s': meta_info['url'],
            })
    # Perform the registration in Solr.
    indexer.load('anno', data)
Example #5
0
import sqlitedatastore as datastore

if __name__ == '__main__':
    # Print the id, meta information and the leading 100 characters of the
    # content of every document in the datastore.
    datastore.connect()
    try:
        for doc_id in datastore.get_all_ids(limit=-1):
            row = datastore.get(doc_id, ['id', 'content', 'meta_info'])
            print(row['id'], row['meta_info'], row['content'][:100])
    finally:
        # Close the connection even when a lookup or print fails.
        datastore.close()
    
Example #6
0
        'doc_id_i': doc_id,
        'anno_id_i': i,
        'name_s': anno_name,
        'sentence_txt_ja': text[sent['begin']:sent['end']],
        anno_name + '_txt_ja': text[anno['begin']:anno['end']],
        ref_anno_name + '_txt_ja': text[ref_anno['begin']:ref_anno['end']],
        'title_txt_ja': meta_info['title'],
        'url_s': meta_info['url'],
    }
    return data


if __name__ == '__main__':
    # Build one index record per 'cause' annotation of every document and
    # load them all into the 'anno' index in a single call.
    datastore.connect()
    try:
        anno_name = 'cause'
        data = []
        for doc_id in datastore.get_all_ids(limit=-1):
            row = datastore.get(doc_id, fl=['content', 'meta_info'])
            text = row['content']
            meta_info = json.loads(row['meta_info'])
            sents = datastore.get_annotation(doc_id, 'sentence')
            for i, anno in enumerate(
                    datastore.get_annotation(doc_id, anno_name)):
                sent = find_x_including_y(sents, anno)
                if sent is None:
                    # No sentence contains this annotation, so there is no
                    # sentence text to index for it; skip rather than crash.
                    # NOTE(review): assumes create_index_data needs a real
                    # sentence dict -- confirm against its definition.
                    continue
                data.append(
                    create_index_data(doc_id, meta_info, anno_name, anno, i,
                                      sent, text))

        # Perform the registration in Solr.
        indexer.load('anno', data)
    finally:
        # Close the connection even when indexing fails part-way.
        datastore.close()
Example #7
0
import sqlitedatastore as datastore
from annoutil import find_x_including_y, find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    anno_name = 'affiliation'

    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        sentences = datastore.get_annotation(doc_id, 'sentence')
        tokens = datastore.get_annotation(doc_id, 'token')
        annos = datastore.get_annotation(doc_id, anno_name)
        for sentence in sentences:
            annos_in_sentence = find_xs_in_y(annos, sentence)
            if annos_in_sentence == []:
                continue
            prev = False
            for token in find_xs_in_y(tokens, sentence):
                if find_x_including_y(annos_in_sentence, token) is None:
                    prev = False
                    print('{0}\t{1}\t{2}'.format(
                        text[token['begin']:token['end']], token['POS'], 'O'))
                else:
                    if prev:
                        print('{0}\t{1}\tI-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    else:
                        print('{0}\t{1}\tB-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
Example #8
0
import sqlitedatastore as datastore

if __name__ == '__main__':
    # For every document, print its title followed by the ten lemmas with
    # the highest TF-IDF weight.
    datastore.connect()
    try:
        # One space-joined string of token lemmas per document, in the same
        # order as doc_ids.
        data = []
        doc_ids = []
        for doc_id in datastore.get_all_ids(limit=-1):
            data.append(' '.join([
                token['lemma']
                for token in datastore.get_annotation(doc_id, 'token')
            ]))
            doc_ids.append(doc_id)

        vectorizer = TfidfVectorizer(analyzer='word', max_df=0.9)
        vecs = vectorizer.fit_transform(data)

        # Resolve the vocabulary once instead of once per printed term.
        # get_feature_names() was removed in scikit-learn 1.2; use its
        # replacement when available and fall back for older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out()
        else:
            feature_names = vectorizer.get_feature_names()

        for doc_id, vec in zip(doc_ids, vecs.toarray()):
            meta_info = json.loads(
                datastore.get(doc_id, ['meta_info'])['meta_info'])
            title = meta_info['title']
            print(doc_id, title)

            # Highest-weighted terms first; keep the top ten.
            for w_id, tfidf in sorted(enumerate(vec),
                                      key=lambda x: x[1],
                                      reverse=True)[:10]:
                lemma = feature_names[w_id]
                print('\t{0:s}: {1:f}'.format(lemma, tfidf))
    finally:
        # Close the connection even when vectorization or a lookup fails.
        datastore.close()
Example #9
0
import sqlitedatastore as datastore
import solrindexer as indexer

if __name__ == '__main__':
    # Search the index for sentence annotations matching the keywords and
    # print one line per hit, preceded by a header line.
    datastore.connect()
    try:
        print('#label', 'doc_id', 'sentence_id', 'text')
        results = indexer.search_annotation(
            fl_keyword_pairs=[
                ('sentence_txt_ja', [['教育', '治安', '経済']]),
                ('name_s',         [['sentence']]),
            ], rows=1000)
        for r in results['response']['docs']:
            text = datastore.get(r['doc_id_i'], ['content'])['content']
            sent = datastore.get_annotation(
                r['doc_id_i'], 'sentence')[r['anno_id_i']]
            # Convert into the label-file line structure; every line is
            # emitted with the label 0.
            print(0, r['doc_id_i'], r['anno_id_i'],
                  text[sent['begin']:sent['end']])
    finally:
        # Close the connection even when the search or a lookup fails.
        datastore.close()
Example #10
0
    sentences_train = sentences[:num_train]
    labels_train = labels[:num_train]
    features, vocab = mlclassifier.convert_into_features(sentences_train)

    # 学習
    time_s = time.time()
    print(':::TRAIN START')
    model = mlclassifier.train(labels_train, features)
    print(':::TRAIN FINISHED', time.time() - time_s)

    # 学習モデルをファイルに保存
    joblib.dump(model, 'result/model.pkl')
    joblib.dump(vocab, 'result/vocab.pkl')

    # 分類の実行
    features_test = mlclassifier.convert_into_features_using_vocab(
        sentences[num_train:], vocab)
    predicteds = mlclassifier.classify(features_test, model)
    for predicted, (doc_id, sent,
                    tokens), label in zip(predicteds, sentences[num_train:],
                                          labels[num_train:]):
        # 結果の確認
        text = datastore.get(doc_id, ['content'])['content']
        if predicted == label:
            print('correct  ', '  ', label, predicted,
                  text[sent['begin']:sent['end']])
        else:
            print('incorrect', '  ', label, predicted,
                  text[sent['begin']:sent['end']])
    datastore.close()
Example #11
0
def extract_relation(doc_id):
    """Yield ``(sentence, relation)`` pairs found in one document.

    Within every sentence, each chunk containing a token whose lemma is
    '与える' is examined.  ``find_child`` is asked for three children of that
    chunk, in this order, and the chunk is abandoned as soon as one of them
    is missing:

    1. a child matching ``{'text': ['影響を']}``;
    2. the cause: ``pos1`` '助詞', ``lemma1`` one of 'は' / 'も' / 'が',
       ``pos2_ng`` '助詞';
    3. the effect: ``pos1`` '助詞', ``lemma1`` 'に', ``pos2_ng`` '助詞'.

    The exact meaning of these conditions is defined by ``find_child``.
    Cause and effect are then widened with ``extend_phrase``.

    Args:
        doc_id: identifier of the document in the datastore.

    Yields:
        tuple: ``(sent, relation)`` where ``relation`` is
        ``{'cause': {'begin', 'end', 'link'}, 'effect': {'begin', 'end'}}``.
        ``link`` is ``('effect', k)`` with ``k`` counting the relations
        yielded so far for this document, starting at 0.
    """
    content = datastore.get(doc_id, fl=['content'])['content']
    doc_chunks = datastore.get_annotation(doc_id, 'chunk')
    doc_tokens = datastore.get_annotation(doc_id, 'token')
    next_id = 0
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        sent_chunks = find_xs_in_y(doc_chunks, sent)
        sent_tokens = find_xs_in_y(doc_tokens, sent)
        for chunk in sent_chunks:
            lemmas = [tok['lemma'] for tok in find_xs_in_y(sent_tokens, chunk)]
            if '与える' not in lemmas:
                continue

            # Positional arguments shared by the three child look-ups below.
            lookup_args = (chunk, sent_chunks, sent_tokens, content,
                           doc_chunks)

            affect, _ = find_child(*lookup_args,
                                   child_cond={'text': ['影響を']})
            if affect is None:
                continue

            cause, cause_tokens = find_child(
                *lookup_args,
                child_cond={
                    'pos1': ['助詞'],
                    'lemma1': ['は', 'も', 'が'],
                    'pos2_ng': ['助詞'],
                })
            if cause is None:
                continue

            effect, effect_tokens = find_child(
                *lookup_args,
                child_cond={
                    'pos1': ['助詞'],
                    'lemma1': ['に'],
                    'pos2_ng': ['助詞'],
                })
            if effect is None:
                continue

            cause = extend_phrase(cause, cause_tokens, sent_tokens,
                                  doc_chunks)
            effect = extend_phrase(effect, effect_tokens, sent_tokens,
                                   doc_chunks)

            # The cause points at the effect carrying the same running index.
            link_target = ('effect', next_id)
            next_id += 1
            yield sent, {
                'cause': {
                    'begin': cause['begin'],
                    'end': cause['end'],
                    'link': link_target,
                },
                'effect': {
                    'begin': effect['begin'],
                    'end': effect['end'],
                },
            }