def get_parsed_xml_from_rel_path(path):
    """Load parsed XML given an extension-less path relative to this file.

    Looks for ``<path>.xml`` first.  If it does not exist, reads the
    same-named ``<path>.txt``, converts it to XML, caches the result as
    ``<path>.xml`` for subsequent calls, and returns the XML text.

    Args:
        path (str): extension-less path relative to this script.

    Returns:
        str: the XML document text.
    """
    xml_path = path_helpers.get_rel_path_from_working_directory(
        __file__, path + '.xml')
    try:
        with open(xml_path) as xml_file:
            return xml_file.read()
    except FileNotFoundError:
        txt_path = path_helpers.get_rel_path_from_working_directory(
            __file__, path + '.txt')
        with open(txt_path) as txt_file:
            xml_text = convert_into_xml(txt_file.read())
        # Cache the conversion so the next call takes the fast path.
        with open(xml_path, 'w') as cache:
            cache.write(xml_text)
        return xml_text
def convert_xml_sentence_into_png(xml_sentence, *, output='graph', format='png'):
    """Render one parsed <sentence> element as a dependency graph.

    Builds a directed graph whose nodes are the sentence's tokens and
    whose edges are its collapsed dependencies, then writes a PNG image
    and/or a Graphviz dot file depending on *output*.

    Args:
        xml_sentence: ElementTree element for one <sentence>.
        output (str): 'graph' (PNG), 'dot', or 'both'.
        format (str): accepted for interface compatibility.
    """
    sentence_id = int(xml_sentence.get('id'))
    graph = pd.Dot(graph_type='digraph', rankdir='BT', splines='false',
                   margin='0', fontsize='9.5', layout='dot')
    for token in xml_sentence.findall('tokens/token'):
        token_id = token.get('id')
        surface = token.find('word').text
        # NOTE: label="," raises an error for some reason, while '","'
        # goes through — hence the extra quoting around every label.
        graph.add_node(pd.Node(token_id, label='"%s"' % surface))
    for dep in xml_sentence.findall(
            'dependencies[@type="collapsed-dependencies"]/dep'):
        relation = dep.get('type')
        governor_idx = dep.find('governor').get('idx')
        dependent_idx = dep.find('dependent').get('idx')
        # Edges point from the dependent token to its governor.
        graph.add_edge(
            pd.Edge(dependent_idx, governor_idx, label='"%s"' % relation))
    if output in ['graph', 'both']:
        graph_path = get_rel_path_from_working_directory(
            __file__, '../data/graph57/sentence_%s.png' % sentence_id)
        print('creating graph of sentence%s' % sentence_id)
        graph.write_png(graph_path)
    if output in ['dot', 'both']:
        dot_path = get_rel_path_from_working_directory(
            __file__, '../data/dot57/sentence_%s.dot' % sentence_id)
        print('creating dot file of sentence%s' % sentence_id)
        with open(dot_path, 'w') as f:
            f.write(graph.to_string())
def remove_no_appear_features(features, sentences): feature_candidates = np.array( [stem(w) for s in sentences for w in s.split()[1:]]) features = np.array(features) removed_features = np.intersect1d(features, feature_candidates) print('narraw %d features to %d.' % (len(features), len(removed_features))) return removed_features if __name__ == '__main__': N = 5 ETA = 1e-3 EPOCH = 5000 source_file = get_rel_path_from_working_directory(__file__, '../data/sentiment.txt') features_file = get_rel_path_from_working_directory( __file__, '../data/features.txt') with open(source_file, encoding='Windows-1252') as f: sentences = f.readlines() with open(features_file) as f: features = f.read().split() shuffle(sentences) size = len(sentences) result = [] for k in range(N): print('Learning%d' % k) training_sentences = sentences[:round(k / N * size)] + sentences[round( (k + 1) / N * size):] testing_sentences = sentences[round(k / N * size):round((k + 1) / N *
(i, with_data_num)) # consoleに上書き出力 else: print('\nsetting data with area info to redis.') r.mset(dic) name_list_without_data = list(names_without_data - names_with_data) with_data_num = len(names_with_data) without_data_num = len(name_list_without_data) i = 0 while True: partial_name_list_without_data = name_list_without_data[ i * max_dic_size:(i + 1) * max_dic_size] if not partial_name_list_without_data: break dic = {name: 'None' for name in partial_name_list_without_data} print('setting data without area info to redis.') r.mset(dic) i += 1 print( 'all %d artists, %d artists have area info, %d artists have no area info.' % (with_data_num + without_data_num, with_data_num, without_data_num)) if __name__ == '__main__': # r = redis.Redis(host='localhost', port=6379, db=0) pool = redis.ConnectionPool(host='localhost', port=6379, db=0) r = redis.StrictRedis(connection_pool=pool) r.flushall() json_file = get_rel_path_from_working_directory(__file__, '../data/artist.json') create_name_area_kvs(json_file)
50. 文区切り (. or ; or : or ? or !) → 空白文字 → 英大文字というパターンを文の区切りと見なし,入力された文書を1行1文の形式で出力せよ. """ from mymodule.path_helpers import get_rel_path_from_working_directory import sys import re def devide_text_into_sentences(text): """ 文章を文に分ける。 Args: text(str): 文章 Returns sentences(list of str): 文のリスト """ pattern = r'([.:;?!])\s+([A-Z])' splited_text = re.split(pattern, text) splited_text.insert(0, '') it = iter(splited_text) sentences = [''.join([x, y, z]) for x, y, z in zip(it, it, it)] return sentences if __name__ == '__main__': source_file = get_rel_path_from_working_directory(__file__, '../data/nlp.txt') with open(source_file) as f: text = f.read() sentences = devide_text_into_sentences(text) for s in sentences: print(s)
added_list = [1, 1, 0] if sentiment == '+1' else [1, 0, 1] for w in s[3:].split(): stemmed_word = stem(w) if stemmed_word in stopwords: continue elif stemmed_word not in stems_dict: stems_dict[stemmed_word] = added_list else: stems_dict[stemmed_word] = [ a + b for a, b in zip(stems_dict[stemmed_word], added_list) ] filtered_dict = { k: v for k, v in stems_dict.items() if v[0] > 3 # 14639 -> 4538 and not 5 / 11 < v[1] / v[0] < 6 / 11 # 4538 -> 3893 } return filtered_dict.keys() if __name__ == '__main__': file_path = get_rel_path_from_working_directory(__file__, '../data/sentiment.txt') with open(file_path, encoding='Windows-1252') as f: sentences = f.readlines() features = extract_features(sentences) output_file_path = get_rel_path_from_working_directory( __file__, '../data/features.txt') with open(output_file_path, encoding='Windows-1252', mode='w') as f: f.write(' '.join(features)) print('features.txt file is made.')
from mymodule.path_helpers import get_rel_path_from_working_directory
import numpy as np
from stemming.porter2 import stem

from nlp100_73 import sigmoid


def create_x_data(sentence, features):
    """Build the binary feature vector for one sentence.

    Bug fix: the original body read the free name ``sentence`` instead
    of its own parameter (misnamed ``sentences``), so it only worked by
    accident when called from this script's interactive loop, where a
    global ``sentence`` happened to exist.

    Args:
        sentence (str): a whitespace-separated sentence.
        features (list of str): stemmed feature words, in fixed order.

    Returns:
        numpy.ndarray: [1, f1, ..., fn] where fi is 1 if feature i's
        stem occurs in the sentence, else 0; the leading 1 is the bias
        term.
    """
    # Set membership is O(1) per feature lookup.
    stemmed_words = {stem(w) for w in sentence.split()}
    x_line = [1] + [1 if f in stemmed_words else 0 for f in features]
    return np.array(x_line)


if __name__ == '__main__':
    features_file = get_rel_path_from_working_directory(
        __file__, '../data/features.txt')
    theta_file = get_rel_path_from_working_directory(__file__,
                                                     '../data/theta.npy')
    with open(features_file) as f:
        features = f.read().split()
    theta = np.load(theta_file)
    while True:
        sentence = input('sentence: ')
        x = create_x_data(sentence, features)
        h = sigmoid(x, theta)
        if h > 0.5:
            print('label: +1 (prediction: %f)' % h)
        else:
            print('label: -1 (prediction: %f)' % (1 - h))
""" 指定した番号のファイルを実行する。 """ import subprocess import sys import os from mymodule.path_helpers import get_rel_path_from_working_directory from IPython import embed # 環境変数PYTHONPATHにこのファイルのディレクトリを追加 # sys.path.append()だとsubprocess実行時にsys.pathが初期化されるので意味がないことに注意。 directory_path = os.path.dirname(os.path.realpath(__file__)) # subprocess.run(["export", "PYTHONPATH=$PYTHONPATH:%s" % directory_path], shell = True) os.putenv("PYTHONPATH", os.pathsep.join([os.getenv("PYTHONPATH", ""), directory_path])) if len(sys.argv) == 1: # sys.exit() print("実行したい問題の番号(0~99)を入力してください。") sys.argv.append(input()) file_num = sys.argv[1] chapter_num = int(file_num[0]) + 1 rel_path = "./chapter%d/code/nlp100_%s.py" % (chapter_num, file_num) execute_file_path = get_rel_path_from_working_directory(__file__, rel_path) subprocess.run(["python", execute_file_path, *sys.argv[2:]])
stems_dict = {} for s in sentences: for w in s.split(): stemmed_word = stem(w) stems_dict[ stemmed_word] = 1 if stemmed_word not in stems_dict else stems_dict[ stemmed_word] + 1 sorted_stems_dict = sorted(stems_dict.items(), key=lambda x: x[1]) pprint(sorted_stems_dict) stopwords = '. the , a and of it to is that in as but with this for an be on you not by one more about are has at from than have " all -- his so if or what i too there who just into will can'.split( ) """ 与えられた文章群から単語分割してステミングして出現回数順に並べたものから目視でピックアップ。 映画に関する文章なので映画に関連する語も上位に出現したがそれはストップワードではないので除外。 """ def is_stopword(word): from stemming.porter2 import stem return stem(word.lower()) in stopwords if __name__ == '__main__': file_path = get_rel_path_from_working_directory(__file__, '../data/sentiment.txt') with open(file_path, encoding='Windows-1252') as f: sentences = f.readlines() check_stems_dict_from_sentences(sentences)
def count_sentiment_sentences(file_path):
    """Count labelled lines in a sentiment file.

    Args:
        file_path (str): path to a file whose every line starts with
            '+1 ' (positive) or '-1 ' (negative).

    Returns:
        tuple of int: (positive count, negative count).

    Raises:
        ValueError: if a line carries neither label.  (ValueError is a
            subclass of Exception, so callers catching the original
            bare ``Exception`` still work.)
    """
    pos_count = 0
    neg_count = 0
    with open(file_path, encoding='Windows-1252') as f:
        for line_no, line in enumerate(f, start=1):
            if line.startswith('+1 '):
                pos_count += 1
            elif line.startswith('-1 '):
                neg_count += 1
            else:
                # Include the offending line for debuggability instead
                # of raising a message-less Exception.
                raise ValueError(
                    'line %d has no sentiment label: %r' % (line_no, line))
    return pos_count, neg_count


if __name__ == '__main__':
    pos_file = get_rel_path_from_working_directory(
        __file__, '../data/rt-polaritydata/rt-polaritydata/rt-polarity.pos')
    neg_file = get_rel_path_from_working_directory(
        __file__, '../data/rt-polaritydata/rt-polaritydata/rt-polarity.neg')
    # detect_file_encoding(pos_file)
    res_list = []
    with open(pos_file, encoding='Windows-1252') as f:
        res_list.extend(['+1 ' + line for line in f])
    with open(neg_file, encoding='Windows-1252') as f:
        res_list.extend(['-1 ' + line for line in f])
    shuffle(res_list)
    output_file = get_rel_path_from_working_directory(
        __file__, '../data/sentiment.txt')
    with open(output_file, 'w') as f:
        f.write(''.join(res_list))
    print('sentiment.txt file is made.')
    pos_count, neg_count = count_sentiment_sentences(output_file)