Code example #1
File: nlp100_53.py  Project: teanakamura/Training
def get_parsed_xml_from_rel_path(path):
    """
    Read the XML file via a path relative to this script. If the XML file does not exist, parse the same-named .txt file instead, write the parsed result to the XML file, and return it.
    """
    source_xml_file = path_helpers.get_rel_path_from_working_directory(
        __file__, path + '.xml')
    try:
        with open(source_xml_file) as f:
            xml_text = f.read()
    except FileNotFoundError:
        source_txt_file = path_helpers.get_rel_path_from_working_directory(
            __file__, path + '.txt')
        with open(source_txt_file) as source:
            text = source.read()
        xml_text = convert_into_xml(text)
        with open(source_xml_file, 'w') as output:
            output.write(xml_text)
    return xml_text
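The docstring above describes a read-or-regenerate cache: the parsed XML is reused if it already exists, otherwise it is rebuilt from the .txt source and written back. path_helpers and convert_into_xml are project modules not shown in this excerpt; a minimal self-contained sketch of the same pattern, with hypothetical names, might look like this:

def read_or_build_xml(base_path, build_xml):
    # base_path is a path without extension; build_xml stands in for convert_into_xml.
    xml_path = base_path + '.xml'
    try:
        with open(xml_path) as f:
            return f.read()
    except FileNotFoundError:
        with open(base_path + '.txt') as f:
            xml_text = build_xml(f.read())
        with open(xml_path, 'w') as f:
            f.write(xml_text)
        return xml_text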
Code example #2
def convert_xml_sentence_into_png(xml_sentence,
                                  *,
                                  output='graph',
                                  format='png'):
    sentence_id = int(xml_sentence.get('id'))
    g = pd.Dot(graph_type='digraph',
               rankdir='BT',
               splines='false',
               margin='0',
               fontsize='9.5',
               layout='dot')
    for token in xml_sentence.findall('tokens/token'):
        idx = token.get('id')
        word = token.find('word').text
        n = pd.Node(idx,
                    label='"%s"' % word)  # a bare label="," raises an error for some reason; '","' works
        g.add_node(n)
    for dep in xml_sentence.findall(
            'dependencies[@type="collapsed-dependencies"]/dep'):
        dep_type = dep.get('type')
        governor_idx = dep.find('governor').get('idx')
        dependent_idx = dep.find('dependent').get('idx')
        e = pd.Edge(dependent_idx, governor_idx,  # edge points from the dependent up to its governor
                    label='"%s"' % dep_type)  # a bare label="," raises an error for some reason; '","' works
        g.add_edge(e)
    if output in ['graph', 'both']:
        graph_path = get_rel_path_from_working_directory(
            __file__, '../data/graph57/sentence_%s.png' % sentence_id)
        print('creating graph of sentence %s' % sentence_id)
        g.write_png(graph_path)
    if output in ['dot', 'both']:
        dot_path = get_rel_path_from_working_directory(
            __file__, '../data/dot57/sentence_%s.dot' % sentence_id)
        print('creating dot file of sentence %s' % sentence_id)
        with open(dot_path, 'w') as f:
            f.write(g.to_string())
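pd in this excerpt appears to be an alias for pydot (the import is not shown). A minimal self-contained sketch of the same explicit-quoting workaround for punctuation labels, assuming pydot and Graphviz are installed:

import pydot

g = pydot.Dot(graph_type='digraph', rankdir='BT')
g.add_node(pydot.Node('1', label='"word"'))
g.add_node(pydot.Node('2', label='","'))  # a comma label needs the extra quotes
g.add_edge(pydot.Edge('2', '1', label='"punct"'))
g.write_png('tiny_graph.png')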
Code example #3
File: nlp100_78.py  Project: teanakamura/Training

def remove_no_appear_features(features, sentences):
    feature_candidates = np.array(
        [stem(w) for s in sentences for w in s.split()[1:]])
    features = np.array(features)
    removed_features = np.intersect1d(features, feature_candidates)
    print('narrowed %d features down to %d.' % (len(features), len(removed_features)))
    return removed_features


if __name__ == '__main__':
    N = 5
    ETA = 1e-3
    EPOCH = 5000
    source_file = get_rel_path_from_working_directory(__file__,
                                                      '../data/sentiment.txt')
    features_file = get_rel_path_from_working_directory(
        __file__, '../data/features.txt')
    with open(source_file, encoding='Windows-1252') as f:
        sentences = f.readlines()
    with open(features_file) as f:
        features = f.read().split()
    shuffle(sentences)
    size = len(sentences)
    result = []

    for k in range(N):
        print('Learning fold %d' % k)
        training_sentences = sentences[:round(k / N * size)] + sentences[round(
            (k + 1) / N * size):]
        testing_sentences = sentences[round(k / N * size):round((k + 1) / N * size)]
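The excerpt is cut off here, but the two slices above implement a plain N-fold split: fold k is held out for testing and the remaining sentences are used for training. A toy illustration of the index arithmetic (not part of the original file):

N, size = 5, 10
for k in range(N):
    start, end = round(k / N * size), round((k + 1) / N * size)
    print('fold %d: test indices %s' % (k, list(range(start, end))))
# fold 0: test indices [0, 1] ... fold 4: test indices [8, 9]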
Code example #4
                             (i, with_data_num))  # overwrite the progress line on the console
        else:
            print('\nsetting data with area info to redis.')
            r.mset(dic)
            name_list_without_data = list(names_without_data - names_with_data)
            with_data_num = len(names_with_data)
            without_data_num = len(name_list_without_data)
            i = 0
            while True:
                partial_name_list_without_data = name_list_without_data[
                    i * max_dic_size:(i + 1) * max_dic_size]
                if not partial_name_list_without_data:
                    break
                dic = {name: 'None' for name in partial_name_list_without_data}
                print('setting data without area info to redis.')
                r.mset(dic)
                i += 1
            print(
                '%d artists in total: %d with area info, %d without.' %
                (with_data_num + without_data_num, with_data_num,
                 without_data_num))


if __name__ == '__main__':
    # r = redis.Redis(host='localhost', port=6379, db=0)
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.StrictRedis(connection_pool=pool)
    r.flushall()
    json_file = get_rel_path_from_working_directory(__file__,
                                                    '../data/artist.json')
    create_name_area_kvs(json_file)
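The while loop above sends the names without area information to Redis in batches of max_dic_size keys, so no single MSET command grows too large. The same batching idea as a small hedged helper (mset_in_chunks is a hypothetical name; on redis-py 3.x, mset takes a single mapping argument):

def mset_in_chunks(client, mapping, chunk_size):
    # Split one large mapping into several smaller MSET calls.
    items = list(mapping.items())
    for start in range(0, len(items), chunk_size):
        client.mset(dict(items[start:start + chunk_size]))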
Code example #5
50. Sentence segmentation
Treat the pattern (. or ; or : or ? or !) → whitespace → uppercase letter as a sentence boundary, and print the input document in one-sentence-per-line format.
"""

from mymodule.path_helpers import get_rel_path_from_working_directory
import sys
import re

def devide_text_into_sentences(text):
    """
    Split a document into sentences.
    Args:
        text (str): the document
    Returns:
        sentences (list of str): list of sentences
    """
    pattern = r'([.:;?!])\s+([A-Z])'
    split_text = re.split(pattern, text)
    split_text.insert(0, '')  # pad the front so the first sentence lines up with the (capital, body, punctuation) grouping
    split_text.append('')     # pad the end; without this, zip() silently drops the final sentence
    it = iter(split_text)
    sentences = [''.join([x, y, z]) for x, y, z in zip(it, it, it)]
    return sentences

if __name__ == '__main__':
    source_file = get_rel_path_from_working_directory(__file__, '../data/nlp.txt')
    with open(source_file) as f:
        text = f.read()
    sentences = devide_text_into_sentences(text)
    for s in sentences:
        print(s)
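Because the pattern contains two capture groups, re.split returns the text between boundaries interleaved with the captured punctuation and the captured capital letter; padding the list and reading it three items at a time through a single shared iterator reassembles each sentence. A toy run showing the intermediate values (the list shown is what re.split actually produces):

import re

parts = re.split(r'([.:;?!])\s+([A-Z])', 'Hello world. This is a pen! And more.')
# parts == ['Hello world', '.', 'T', 'his is a pen', '!', 'A', 'nd more.']
parts.insert(0, '')  # pad the front so the first triple is ('', body, punctuation)
parts.append('')     # pad the end so the final sentence also forms a full triple
it = iter(parts)
print([''.join(t) for t in zip(it, it, it)])
# ['Hello world.', 'This is a pen!', 'And more.']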
Code example #6
File: nlp100_72.py  Project: teanakamura/Training
        added_list = [1, 1, 0] if sentiment == '+1' else [1, 0, 1]  # increments for [total, positive, negative]
        for w in s[3:].split():  # skip the leading '+1 ' / '-1 ' label
            stemmed_word = stem(w)
            if stemmed_word in stopwords:
                continue
            elif stemmed_word not in stems_dict:
                stems_dict[stemmed_word] = added_list
            else:
                stems_dict[stemmed_word] = [
                    a + b for a, b in zip(stems_dict[stemmed_word], added_list)
                ]
    filtered_dict = {
        k: v
        for k, v in stems_dict.items()
        if v[0] > 3  # keep stems appearing in more than 3 sentences (14639 -> 4538)
        and not 5 / 11 < v[1] / v[0] < 6 / 11  # drop stems whose positive ratio is close to neutral (4538 -> 3893)
    }
    return filtered_dict.keys()


if __name__ == '__main__':
    file_path = get_rel_path_from_working_directory(__file__,
                                                    '../data/sentiment.txt')
    with open(file_path, encoding='Windows-1252') as f:
        sentences = f.readlines()
    features = extract_features(sentences)
    output_file_path = get_rel_path_from_working_directory(
        __file__, '../data/features.txt')
    with open(output_file_path, encoding='Windows-1252', mode='w') as f:
        f.write(' '.join(features))
    print('features.txt has been created.')
Code example #7
File: nlp100_74.py  Project: teanakamura/Training
from mymodule.path_helpers import get_rel_path_from_working_directory
import numpy as np
from stemming.porter2 import stem
from nlp100_73 import sigmoid


def create_x_data(sentence, features):
    words = sentence.split()
    stemmed_words = [stem(w) for w in words]
    # bias term followed by binary indicators for each feature word
    x_line = [1] + [1 if f in stemmed_words else 0 for f in features]
    x = np.array(x_line)
    return x


if __name__ == '__main__':
    features_file = get_rel_path_from_working_directory(
        __file__, '../data/features.txt')
    theta_file = get_rel_path_from_working_directory(__file__,
                                                     '../data/theta.npy')
    with open(features_file) as f:
        features = f.read().split()
    theta = np.load(theta_file)
    while True:
        sentence = input('sentence: ')
        x = create_x_data(sentence, features)
        h = sigmoid(x, theta)
        if h > 0.5:
            print('label: +1 (prediction: %f)' % h)
        else:
            print('label: -1 (prediction: %f)' % (1 - h))
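sigmoid is imported from nlp100_73, which is not included in this excerpt. For a logistic-regression hypothesis h = σ(θ·x) it presumably looks roughly like the following sketch (an assumption, not the original code):

import numpy as np

def sigmoid(x, theta):
    # hypothesis value sigma(theta . x), in the open interval (0, 1)
    return 1.0 / (1.0 + np.exp(-np.dot(x, theta)))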
Code example #8
"""
Run the exercise file with the given number.
"""

import subprocess
import sys
import os
from mymodule.path_helpers import get_rel_path_from_working_directory
from IPython import embed

# Add this file's directory to the PYTHONPATH environment variable.
# Note that sys.path.append() would have no effect here, because sys.path is re-initialized when the subprocess starts.
directory_path = os.path.dirname(os.path.realpath(__file__))
# subprocess.run(["export", "PYTHONPATH=$PYTHONPATH:%s" % directory_path], shell = True)
os.putenv("PYTHONPATH",
          os.pathsep.join([os.getenv("PYTHONPATH", ""), directory_path]))

if len(sys.argv) == 1:
    # sys.exit()
    print("Enter the number (0-99) of the exercise you want to run.")
    sys.argv.append(input())
file_num = sys.argv[1]
chapter_num = int(file_num[0]) + 1
rel_path = "./chapter%d/code/nlp100_%s.py" % (chapter_num, file_num)
execute_file_path = get_rel_path_from_working_directory(__file__, rel_path)
subprocess.run(["python", execute_file_path, *sys.argv[2:]])
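The script uses os.putenv so that the child process inherits the updated PYTHONPATH; note that os.putenv does not update os.environ in the parent. An alternative sketch (reusing directory_path and execute_file_path from above) is to build the child environment explicitly and pass it to subprocess.run:

child_env = dict(os.environ)
child_env["PYTHONPATH"] = os.pathsep.join(
    p for p in [child_env.get("PYTHONPATH", ""), directory_path] if p)
subprocess.run(["python", execute_file_path, *sys.argv[2:]], env=child_env)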
Code example #9
    stems_dict = {}
    for s in sentences:
        for w in s.split():
            stemmed_word = stem(w)
            stems_dict[stemmed_word] = stems_dict.get(stemmed_word, 0) + 1
    sorted_stems_dict = sorted(stems_dict.items(), key=lambda x: x[1])
    pprint(sorted_stems_dict)


stopwords = '. the , a and of it to is that in as but with this for an be on you not by one more about are has at from than have " all -- his so if or what i too there who just into will can'.split()
"""
Picked by hand after tokenizing the given sentences, stemming the words, and sorting them by frequency.
Because the sentences are about movies, movie-related words also ranked high, but they were excluded since they are not stopwords.
"""


def is_stopword(word):
    from stemming.porter2 import stem
    return stem(word.lower()) in stopwords


if __name__ == '__main__':
    file_path = get_rel_path_from_working_directory(__file__,
                                                    '../data/sentiment.txt')
    with open(file_path, encoding='Windows-1252') as f:
        sentences = f.readlines()
    check_stems_dict_from_sentences(sentences)
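check_stems_dict_from_sentences above only counts stemmed-word frequencies; the standard library's collections.Counter expresses the same idea more compactly. A rough equivalent sketch (note that most_common lists the highest counts first, the reverse of the ascending sort above):

from collections import Counter
from pprint import pprint
from stemming.porter2 import stem

def check_stems_counter(sentences):
    counts = Counter(stem(w) for s in sentences for w in s.split())
    pprint(counts.most_common())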
Code example #10
def count_sentiment_sentences(file_path):
    pos_count = 0
    neg_count = 0
    with open(file_path, encoding='Windows-1252') as f:
        for line in f:
            if line.startswith('+1 '):
                pos_count += 1
            elif line.startswith('-1 '):
                neg_count += 1
            else:
                raise Exception('unexpected sentiment label: %r' % line[:3])
    return pos_count, neg_count


if __name__ == '__main__':
    pos_file = get_rel_path_from_working_directory(__file__, '../data/rt-polaritydata/rt-polaritydata/rt-polarity.pos')
    neg_file = get_rel_path_from_working_directory(__file__, '../data/rt-polaritydata/rt-polaritydata/rt-polarity.neg')
    # detect_file_encoding(pos_file)
    res_list = []
    with open(pos_file, encoding='Windows-1252') as f:
        res_list.extend(['+1 ' + line for line in f])
    with open(neg_file, encoding='Windows-1252') as f:
        res_list.extend(['-1 ' + line for line in f])
    shuffle(res_list)

    output_file = get_rel_path_from_working_directory(__file__, '../data/sentiment.txt')
    with open(output_file, 'w') as f:
        f.write(''.join(res_list))
    print('sentiment.txt has been created.')

    pos_count, neg_count = count_sentiment_sentences(output_file)