Ejemplo n.º 1
0
    'Daily Mail'
]), ['TITLE', 'CATEGORY']]

# Split the data: hold out 20% (stratified by category), then halve that
# hold-out into validation and test sets (10% of the data each).
train, valid_test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=123, stratify=df['CATEGORY'])
valid, test = train_test_split(
    valid_test, test_size=0.5, shuffle=True, random_state=123,
    stratify=valid_test['CATEGORY'])

# Build one feature tensor per split by stacking the per-title output of
# transform_w2v (defined elsewhere; presumably a fixed-size embedding -- confirm).
X_train, X_valid, X_test = (
    torch.stack([transform_w2v(title) for title in split['TITLE']])
    for split in (train, valid, test)
)

# Build the label tensors: each category code is replaced by its integer id.
category_dict = {'b': 0, 't': 1, 'e': 2, 'm': 3}
y_train, y_valid, y_test = (
    torch.tensor(split['CATEGORY'].map(lambda code: category_dict[code]).values)
    for split in (train, valid, test)
)

# Wrap each (features, labels) pair in a NewsDataset.
dataset_train, dataset_valid, dataset_test = (
    NewsDataset(features, labels)
    for features, labels in (
        (X_train, y_train), (X_valid, y_valid), (X_test, y_test))
)
Ejemplo n.º 2
0
from torch import nn
# Load the data: a tab-separated file with no header row, so the column
# names are supplied explicitly.
df = pd.read_csv(
    './../chapter06/data/NewsAggregatorDataset/newsCorpora_re.csv',
    header=None,
    sep='\t',
    names=[
        'ID', 'TITLE', 'URL', 'PUBLISHER',
        'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP',
    ],
)

# Extract the data: keep only rows from the selected publishers, and only
# the title and category columns.
df = df.loc[
    df['PUBLISHER'].isin([
        'Reuters',
        'Huffington Post',
        'Businessweek',
        'Contactmusic.com',
        'Daily Mail',
    ]),
    ['TITLE', 'CATEGORY'],
]

# Split the data: hold out 20% (stratified by category), then halve that
# hold-out into validation and test sets.
train, valid_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=123,
    stratify=df['CATEGORY'],
)
valid, test = train_test_split(
    valid_test,
    test_size=0.5,
    shuffle=True,
    random_state=123,
    stratify=valid_test['CATEGORY'],
)

# Feature tensor for the training split: stack the per-title output of
# transform_w2v (defined elsewhere).
X_train = torch.stack([transform_w2v(title) for title in train['TITLE']])

# Define SGLNet, a single-layer neural network
class SGLNet(nn.Module):
    """Single-layer network: one bias-free linear map from input to output.

    The layer's weights are re-initialised from a normal distribution with
    mean 0 and standard deviation 1.
    """

    def __init__(self, input_size, output_size):
        """Create the network's only layer.

        Args:
            input_size: number of input features of the linear layer.
            output_size: number of output features of the linear layer.
        """
        super().__init__()
        linear = nn.Linear(input_size, output_size, bias=False)
        # Replace nn.Linear's default initialisation with standard-normal weights.
        nn.init.normal_(linear.weight, mean=0.0, std=1.0)
        self.fc = linear

    def forward(self, x):
        """Pass ``x`` through the linear layer and return the result."""
        return self.fc(x)