Ejemplo n.º 1
0
from config import VOCAB_HOME

from acqdivReader import AcqdivReader

import argparse
# Collect every distinct space-separated token produced by the corpus reader
# for the requested language and save the tokens, one per line, to
# VOCAB_HOME + "<language>-vocab.txt".
parser = argparse.ArgumentParser()
parser.add_argument("--language", dest="language", type=str)
import random

args = parser.parse_args()
print(args)

acqdivCorpusReader = AcqdivReader(args.language)

# Splitting on a single space means consecutive spaces contribute the empty
# string as a "word"; the set removes duplicates.
vocabulary = set()
for line in acqdivCorpusReader.iterator():
    vocabulary.update(line.split(" "))

# Set iteration order is arbitrary, so the order of lines in the output file
# is not fixed either.
with open(VOCAB_HOME + args.language + '-vocab.txt', "w") as outFile:
    outFile.writelines(token + "\n" for token in vocabulary)
# Optional numeric command-line options, registered in this order on the
# already existing `parser`: (flag, value type, default).
for option_flag, option_type, option_default in (
    ("--weight_dropout_in", float, 0.01),
    ("--weight_dropout_hidden", float, 0.1),
    ("--char_dropout_prob", float, 0.33),
    ("--char_noise_prob", float, 0.01),
    ("--learning_rate", float, 0.1),
    # The default identifier is drawn once, when the option is registered.
    ("--myID", int, random.randint(0,1000000000)),
    ("--sequence_length", int, 20),
):
    parser.add_argument(option_flag, type=option_type, default=option_default)


# Parse the command line with the options above and echo the result.
args = parser.parse_args()
print(args)


from acqdivReader import AcqdivReader, AcqdivReaderPartition

# NOTE(review): relies on a --language option having been registered on
# `parser` before this point; that registration is not part of these lines.
acqdivCorpusReader = AcqdivReader(args.language)



def plus(it1, it2):
   """Yield every item of ``it1`` followed by every item of ``it2``.

   Both arguments may be any iterable. ``it2`` is not iterated until
   ``it1`` has been exhausted.
   """
   for source in (it1, it2):
      for item in source:
         yield item

# Read the character vocabulary for the selected language into `itos`,
# one entry per line of the file.
# NOTE(review): the location is a hard-coded absolute path under
# /checkpoint/mhahn, so this only works where that directory exists.
try:
   with open("/checkpoint/mhahn/char-vocab-acqdiv-"+args.language, "r") as inFile:
     # strip() removes leading/trailing whitespace of the whole file content
     # before it is split into lines, so a final newline adds no empty entry.
     itos = inFile.read().strip().split("\n")
except FileNotFoundError: # or, if that fails, construct one for the language
    # NOTE(review): in the lines visible here this branch only prints a
    # message; `itos` is not assigned on this path, so the code that builds
    # the vocabulary appears to be missing at this point.
    print("Creating new vocab")
import argparse

# Command-line interface: a single --language option, available afterwards
# as args.language.
parser = argparse.ArgumentParser()
parser.add_argument("--language", dest="language", type=str)
import random

args = parser.parse_args()
# Echo the parsed options.
print(args)

from acqdivReader import AcqdivReader, AcqdivReaderPartition

# Corpus reader for the selected language; its iterator() is consumed chunk
# by chunk in the loop that follows.
acqdivCorpusReader = AcqdivReader(args.language)

import syllabificationJapanese

# Starts out empty.
# NOTE(review): the code that fills this dict is not visible in this excerpt;
# presumably it is keyed by syllable -- confirm against the rest of the loop.
syllables = {}

for chunk in acqdivCorpusReader.iterator():
    #       print(len(chunk))

    # tokenize chunk into valid syllables
    words = chunk.split(" ")
    for word in words:
        if word == "\n":
            continue
        if len(word) == 0:
            continue
        if word == "n":
            continue
        if word == "???":
            continue