Beispiel #1
0
# coding:utf-8
import random
import torch.nn as nn
import torch.optim as optim
import dataHandler
from model import PoetryModel
from utils import *
import pickle as p

# Select the compute device: the first CUDA GPU when CUDA is available,
# otherwise the CPU.  The device string is written as "cuda:0" with no
# whitespace, which is the documented "<type>:<index>" form; the previous
# "cuda: 0" spelling is not a documented form and may be rejected as an
# invalid device string depending on the PyTorch release.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load data
print("load data...")
data = dataHandler.parseRawData(constrain=5)

# Dump the parsed poems to disk, one poem per line, so the exact training
# corpus can be inspected later.  UTF-8 is forced because the poems are
# Chinese text and the platform default encoding may not be able to encode it.
with open("train_data.txt", 'w', encoding='utf-8') as f:
    for poem in data:
        f.write(poem + "\n")

# build word_to_idx
# Every distinct token gets the next free integer id, in order of first
# appearance.  Iterating a poem yields its individual tokens (characters,
# assuming each poem is a plain string -- the file dump above concatenates
# it with "\n").
print("build word_to_idx...")
word_to_ix = {}
for sent in data:
    for word in sent:
        # setdefault only inserts when the token is new, so ids stay dense.
        word_to_ix.setdefault(word, len(word_to_ix))
# The two special markers take the last two ids of the vocabulary.
word_to_ix['<EOP>'] = len(word_to_ix)
word_to_ix['<START>'] = len(word_to_ix)

VOCAB_SIZE = len(word_to_ix)
print("VOCAB_SIZE:", VOCAB_SIZE)
Beispiel #2
0
# coding:utf-8
import random
import torch.nn as nn
import torch.optim as optim
import dataHandler
from model import PoetryModel
from utils import *
import cPickle as p

# Parse the raw corpus.  Called without arguments, so no author filter is
# applied (per the original note: "All if author=None").
data = dataHandler.parseRawData()  # All if author=None
# Alternative call restricted to a single author (Li Bai), kept for reference:
# data = dataHandler.parseRawData(author="李白".decode('utf-8'),constrain=5)  # All if author=None
# random.shuffle(data)

# Echo every parsed poem.  print(...) with a single argument is valid on both
# Python 2 and Python 3 and prints the same text on each; the bare
# `print s` statement used before is a SyntaxError on Python 3.
for s in data:
    print(s)

# Build the vocabulary: every distinct token gets the next free integer id,
# in order of first appearance.
word_to_ix = {}
for sent in data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
# The two special markers take the last two ids of the vocabulary.
word_to_ix['<EOP>'] = len(word_to_ix)
word_to_ix['<START>'] = len(word_to_ix)

VOCAB_SIZE = len(word_to_ix)

# %-formatting keeps the output byte-identical to the old print statements
# ("VOCAB_SIZE: N" / "data_size N") while remaining valid Python 3.
print("VOCAB_SIZE: %d" % VOCAB_SIZE)
print("data_size %d" % len(data))

# Replace each poem, in place, by the result of toList() with the "<EOP>"
# marker appended.  toList comes from the utils star-import; it is assumed
# to return a mutable list -- confirm against utils.
for i, poem in enumerate(data):
    tokens = toList(poem)
    tokens.append("<EOP>")
    data[i] = tokens
Beispiel #3
0
from sys import exit	#exit(0) for debug purpose
import json


#
# CUDA/GPU selection flag, documented by the original author as:
#   0:  No GPU cuda support
#   1:  Use CPU instead of GPU
# NOTE(review): the description of 1 reads like the same CPU-only setting as
# 0; presumably 1 was meant to enable the GPU -- confirm against the code
# that reads CUDA_GPU (not visible in this part of the file).
#
CUDA_GPU = 0


#
# Parse the raw data from the Chinese-poetry json files.
# author=None requests all authors (per the "All if author=None" note);
# constrain=5 is passed straight to dataHandler.parseRawData -- its exact
# meaning is defined there (TODO confirm).
#
data = dataHandler.parseRawData(author=None, constrain=5)  # All if author=None
# Alternative calls restricted to one author (Li Bai / Du Fu), kept for reference:
#data = dataHandler.parseRawData(author="李白".decode('utf-8'),constrain=5)  # All if author=None
#data = dataHandler.parseRawData(author="杜甫".decode('utf-8'),constrain=5)  # All if author=None
# random.shuffle(data)
# Debug hook: walks over every parsed poem.  The print is commented out, so
# the loop body currently does nothing for each item.
for s in data:
    #print s
    pass
# Starts out empty; the comment block that follows describes its intended contents.
word_to_ix = {}


#
# Convert the result to word_to_ix data structure
# word_to_ix is a dictionary.  
#    Key is the Chinese word, 
#    Value is the counter of the Key appeared in the poems
#