import json

import bottle

import dlclassifier
import mlclassifier
import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

# vocab_ml, model_ml, vocab_dl, model_dl and rule are assumed to be
# loaded elsewhere in the module before this handler runs.


def get():
    keywords = bottle.request.params.keywords.split()
    classifier_name = bottle.request.params.classifier
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']])
    ], rows=1000)
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        if classifier_name == 'ml':
            features = mlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_ml)
            predicteds = mlclassifier.classify(features, model_ml)
        elif classifier_name == 'dl':
            features = dlclassifier.convert_into_features_using_vocab(
                [(r['doc_id_i'], sent, tokens)], vocab_dl)
            predicteds = dlclassifier.classify(features, model_dl)
        elif classifier_name == 'rule':
            features = ruleclassifier.convert_into_features_using_rules(
                [(r['doc_id_i'], sent, tokens)], rule)
            predicteds = ruleclassifier.classify(features, rule)
        r['predicted'] = int(predicteds[0])  # convert from numpy int to plain int
        print(r['predicted'], r['sentence_txt_ja'])
    return json.dumps(results, ensure_ascii=False)
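A minimal sketch of how this handler might be exposed with bottle; the route path and port are illustrative assumptions, not taken from the source:

# Hypothetical wiring; '/classify' and port 8080 are assumptions.
bottle.route('/classify', method='GET', callback=get)
bottle.run(host='localhost', port=8080)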
def _extend(chunk, chunk_tokens):
    # Recursively collect dependent chunks whose first token shares the
    # POS of the current chunk's first token. `all_chunks` and `tokens`
    # are read from the enclosing scope.
    for child in all_chunks:
        _, link = child['link']
        if link == -1:
            continue
        if all_chunks[link] != chunk:
            continue
        child_tokens = find_xs_in_y(tokens, child)
        if child_tokens[0]['POS'] == chunk_tokens[0]['POS']:
            return [child] + _extend(child, child_tokens)
    return []
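Because `_extend` reads `all_chunks` and `tokens` without taking them as parameters, it is presumably nested inside `extend_phrase`, which the relation-extraction listing below calls. A sketch of such a wrapper, assuming that nesting; the span-widening rule is also an assumption:

def extend_phrase(chunk, chunk_tokens, tokens, all_chunks):
    # _extend as defined above, nested here so that it can read
    # tokens and all_chunks from this scope.
    def _extend(chunk, chunk_tokens):
        for child in all_chunks:
            _, link = child['link']
            if link == -1 or all_chunks[link] != chunk:
                continue
            child_tokens = find_xs_in_y(tokens, child)
            if child_tokens[0]['POS'] == chunk_tokens[0]['POS']:
                return [child] + _extend(child, child_tokens)
        return []

    children = _extend(chunk, chunk_tokens)
    if not children:
        return chunk
    # Assumption: in Japanese dependency trees dependents precede their
    # head, so widening the phrase means moving the begin offset to the
    # left-most collected child.
    extended = dict(chunk)
    extended['begin'] = min([chunk['begin']] + [c['begin'] for c in children])
    return extended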
from nltk.lm import MLE, Vocabulary
from nltk.util import ngrams

import sqlitedatastore as datastore
from annoutil import find_xs_in_y


def create_language_model(doc_ids, N=3):
    sents = []
    for doc_id in doc_ids:
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sents.append(['__BOS__']
                         + [token['lemma'] for token in tokens]
                         + ['__EOS__'])
    vocab = Vocabulary([word for sent in sents for word in sent])
    text_ngrams = [ngrams(sent, N) for sent in sents]
    lm = MLE(order=N, vocabulary=vocab)
    lm.fit(text_ngrams)
    return lm
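A short usage sketch, assuming the model is built over all stored documents as in the other listings; `score` and `generate` are standard `nltk.lm` methods, and the query words are illustrative:

datastore.connect()
lm = create_language_model(datastore.get_all_ids(limit=-1), N=3)
# P(word | context); the context length is N-1 = 2 for a trigram model.
print(lm.score('影響', ['__BOS__', '大きな']))  # illustrative words
# Sample 10 tokens following the start-of-sentence marker.
print(lm.generate(10, text_seed=['__BOS__']))
datastore.close()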
def find_child(parent, chunks_in_sent, tokens_in_sent, text, all_chunks,
               child_cond):
    # Find a chunk that depends on `parent` and matches either a surface-text
    # condition or a POS/lemma condition on its trailing tokens.
    for child in chunks_in_sent:
        _, link = child['link']
        if link == -1 or all_chunks[link] != parent:
            continue
        child_tokens = find_xs_in_y(tokens_in_sent, child)
        if text[child['begin']:child['end']] in child_cond.get('text', []):
            return child, child_tokens
        # The length guard avoids an IndexError on single-token chunks.
        if child_tokens[-1]['POS'] in child_cond.get('pos1', []) and \
                child_tokens[-1]['lemma'] in child_cond.get('lemma1', []) and \
                (len(child_tokens) < 2 or
                 child_tokens[-2]['POS'] not in child_cond.get('pos2_ng', [])):
            return child, child_tokens
    return None, None
import json

import bottle

import mlclassifier
import sentimentclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

# vocab and model are assumed to be loaded at module level.


def get():
    title = bottle.request.params.title.strip()
    keywords = bottle.request.params.keywords.split()
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('title_txt_ja', [[title]]),
        ('sentence_txt_ja', [keywords]),
        ('name_s', [['sentence']])
    ], rows=1000)
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        features = sentimentclassifier.convert_into_features_using_vocab(
            [(r['doc_id_i'], sent, tokens)], vocab)
        predicteds = mlclassifier.classify(features, model)
        r['predicted'] = int(predicteds[0])  # convert from numpy int to plain int
        print(r['predicted'], r['sentence_txt_ja'])
    return json.dumps(results, ensure_ascii=False)
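A sketch of a client call against this handler; the endpoint path, port, and query values are assumptions, not taken from the source:

import requests

# Hypothetical client call; '/sentiment' and port 8080 are assumptions.
resp = requests.get('http://localhost:8080/sentiment',
                    params={'title': '経済', 'keywords': '影響 増加'})
for doc in resp.json()['response']['docs']:
    print(doc['predicted'], doc['sentence_txt_ja'])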
import sqlitedatastore as datastore
from annoutil import find_x_including_y, find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    anno_name = 'affiliation'
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        sentences = datastore.get_annotation(doc_id, 'sentence')
        tokens = datastore.get_annotation(doc_id, 'token')
        annos = datastore.get_annotation(doc_id, anno_name)
        for sentence in sentences:
            annos_in_sentence = find_xs_in_y(annos, sentence)
            if annos_in_sentence == []:
                continue
            prev = False
            for token in find_xs_in_y(tokens, sentence):
                if find_x_including_y(annos_in_sentence, token) is None:
                    prev = False
                    print('{0}\t{1}\t{2}'.format(
                        text[token['begin']:token['end']], token['POS'], 'O'))
                else:
                    if prev:
                        print('{0}\t{1}\tI-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                    else:
                        print('{0}\t{1}\tB-{2}'.format(
                            text[token['begin']:token['end']], token['POS'],
                            anno_name))
                        prev = True
    datastore.close()
import time

import mlclassifier
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    # Load labelled data
    sentences = []
    labels = []
    with open('./data/labels.txt') as f:
        for line in f:
            if line.startswith('#'):
                continue
            d = line.rstrip().split()
            label, doc_id, sent_id = int(d[0]), d[1], int(d[2])
            sent = datastore.get_annotation(doc_id, 'sentence')[sent_id]
            tokens = find_xs_in_y(
                datastore.get_annotation(doc_id, 'token'), sent)
            sentences.append((doc_id, sent, tokens))
            labels.append(label)
    # Build features for the training split
    num_train = int(len(sentences) * 0.8)
    sentences_train = sentences[:num_train]
    labels_train = labels[:num_train]
    features, vocab = mlclassifier.convert_into_features(sentences_train)
    # Train
    time_s = time.time()
    print(':::TRAIN START')
    model = mlclassifier.train(labels_train, features)
    print(':::TRAIN FINISHED', time.time() - time_s)
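A possible evaluation step on the held-out 20%, reusing the vocabulary-based feature converter that the search handlers above rely on; the accuracy computation is a sketch:

    # Evaluate on the remaining 20% with the training vocabulary.
    sentences_test = sentences[num_train:]
    labels_test = labels[num_train:]
    features_test = mlclassifier.convert_into_features_using_vocab(
        sentences_test, vocab)
    predicteds = mlclassifier.classify(features_test, model)
    accuracy = sum(int(p) == l
                   for p, l in zip(predicteds, labels_test)) / len(labels_test)
    print('accuracy:', accuracy)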
import ruleclassifier
import solrindexer as indexer
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    results = indexer.search_annotation(fl_keyword_pairs=[
        ('name_s', [['sentence']]),
    ], rows=3000)
    sentences = []
    for r in results['response']['docs']:
        sent = datastore.get_annotation(r['doc_id_i'], 'sentence')[r['anno_id_i']]
        tokens = find_xs_in_y(
            datastore.get_annotation(r['doc_id_i'], 'token'), sent)
        sentences.append((r['doc_id_i'], sent, tokens))

    # Load the rules
    rule = ruleclassifier.get_rule()

    # Classify
    features = ruleclassifier.convert_into_features_using_rules(
        sentences, rule)
    predicteds = ruleclassifier.classify(features, rule)
    for predicted, (doc_id, sent, tokens) in zip(predicteds, sentences):
        if predicted == 1:
            text = datastore.get(doc_id, ['content'])['content']
            print(predicted, text[sent['begin']:sent['end']])
    datastore.close()
import sqlitedatastore as datastore
from annoutil import find_xs_in_y

if __name__ == '__main__':
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=-1):
        row = datastore.get(doc_id, fl=['content'])
        text = row['content']
        sentences = datastore.get_annotation(doc_id, 'sentence')
        tokens = datastore.get_annotation(doc_id, 'token')
        for sentence in sentences:
            for token in find_xs_in_y(tokens, sentence):
                print('{0}\t{1}\t{2}\t{3}\t{4}'.format(
                    text[token['begin']:token['end']],
                    token['POS'],
                    doc_id,
                    token['begin'],
                    token['end']))
            print()  # sentence boundary
    datastore.close()
import itertools
import logging

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

import sqlitedatastore as datastore
from annoutil import find_xs_in_y

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

if __name__ == '__main__':
    datastore.connect()
    sentences = []
    for doc_id in datastore.get_all_ids(limit=-1):
        all_tokens = datastore.get_annotation(doc_id, 'token')
        for sent in datastore.get_annotation(doc_id, 'sentence'):
            tokens = find_xs_in_y(all_tokens, sent)
            sentences.append(
                [token['lemma'] for token in tokens
                 if token.get('NE') == 'O'])
    # Group every 20 sentences into one pseudo-document.
    n_sent = 20
    docs = [
        list(itertools.chain.from_iterable(sentences[i:i + n_sent]))
        for i in range(0, len(sentences), n_sent)
    ]
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=2, no_above=0.3)
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10)
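A short follow-up sketch for inspecting the trained model; `print_topics` is the standard gensim `LdaModel` method:

    # Show the top words of each learned topic.
    for topic_id, words in lda.print_topics(num_topics=10, num_words=10):
        print(topic_id, words)
    datastore.close()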
def extract_relation(doc_id):
    text = datastore.get(doc_id, fl=['content'])['content']
    all_chunks = datastore.get_annotation(doc_id, 'chunk')
    all_tokens = datastore.get_annotation(doc_id, 'token')
    anno_id = 0
    for sent in datastore.get_annotation(doc_id, 'sentence'):
        chunks = find_xs_in_y(all_chunks, sent)
        tokens = find_xs_in_y(all_tokens, sent)
        for chunk in chunks:
            chunk_tokens = find_xs_in_y(tokens, chunk)
            if not any(chunk_token['lemma'] == '与える'
                       for chunk_token in chunk_tokens):
                continue
            affect, affect_tokens = find_child(
                chunk, chunks, tokens, text, all_chunks,
                child_cond={'text': ['影響を']})
            if affect is None:
                continue
            cause, cause_tokens = find_child(
                chunk, chunks, tokens, text, all_chunks,
                child_cond={
                    'pos1': ['助詞'],
                    'lemma1': ['は', 'も', 'が'],
                    'pos2_ng': ['助詞'],
                })
            if cause is None:
                continue
            effect, effect_tokens = find_child(
                chunk, chunks, tokens, text, all_chunks,
                child_cond={
                    'pos1': ['助詞'],
                    'lemma1': ['に'],
                    'pos2_ng': ['助詞'],
                })
            if effect is None:
                continue
            cause = extend_phrase(cause, cause_tokens, tokens, all_chunks)
            effect = extend_phrase(effect, effect_tokens, tokens, all_chunks)
            relation = {
                'cause': {
                    'begin': cause['begin'],
                    'end': cause['end'],
                    'link': ('effect', anno_id),
                },
                'effect': {
                    'begin': effect['begin'],
                    'end': effect['end'],
                },
            }
            anno_id += 1
            yield sent, relation
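A usage sketch that prints each extracted cause/effect pair; iterating over all stored document IDs follows the pattern of the other listings:

if __name__ == '__main__':
    datastore.connect()
    for doc_id in datastore.get_all_ids(limit=-1):
        text = datastore.get(doc_id, fl=['content'])['content']
        for sent, relation in extract_relation(doc_id):
            cause = relation['cause']
            effect = relation['effect']
            print(text[cause['begin']:cause['end']], '->',
                  text[effect['begin']:effect['end']])
    datastore.close()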