"""Collect the set of space-separated tokens of one Acqdiv corpus.

The language is selected with ``--language``.  Every distinct token is
written on its own line to ``VOCAB_HOME + <language> + '-vocab.txt'``.
"""
from config import VOCAB_HOME
from acqdivReader import AcqdivReader
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument("--language", dest="language", type=str)
args = parser.parse_args()
print(args)

acqdivCorpusReader = AcqdivReader(args.language)

# Tokens are obtained by splitting on single spaces only, so consecutive
# spaces contribute an empty token, exactly as before.
vocabulary = set()
for utterance in acqdivCorpusReader.iterator():
    vocabulary.update(utterance.split(" "))

# Write with an explicit encoding so non-ASCII tokens do not depend on the
# locale, and in sorted order so the file is identical from run to run
# (set iteration order of strings changes with hash randomization).
with open(VOCAB_HOME + args.language + "-vocab.txt", "w", encoding="utf-8") as outFile:
    for word in sorted(vocabulary):
        print(word, file=outFile)
# Optional hyper-parameter flags as (flag, value type, default) rows, registered
# in this order.  The default run ID is drawn once, at start-up.
_OPTIONAL_FLAGS = (
    ("--weight_dropout_in", float, 0.01),
    ("--weight_dropout_hidden", float, 0.1),
    ("--char_dropout_prob", float, 0.33),
    ("--char_noise_prob", float, 0.01),
    ("--learning_rate", float, 0.1),
    ("--myID", int, random.randint(0, 1000000000)),
    ("--sequence_length", int, 20),
)
for _flag, _type, _default in _OPTIONAL_FLAGS:
    parser.add_argument(_flag, type=_type, default=_default)

args = parser.parse_args()
print(args)

from acqdivReader import AcqdivReader, AcqdivReaderPartition

acqdivCorpusReader = AcqdivReader(args.language)


def plus(it1, it2):
    """Yield every item of ``it1`` and, once it is exhausted, every item of ``it2``."""
    for source in (it1, it2):
        for item in source:
            yield item


# Load the stored character vocabulary: one entry per line of the file.
try:
    with open("/checkpoint/mhahn/char-vocab-acqdiv-" + args.language, "r") as vocab_file:
        raw_vocab = vocab_file.read()
        itos = raw_vocab.strip().split("\n")
except FileNotFoundError:
    # No stored vocabulary for this language yet; a new one has to be built.
    print("Creating new vocab")
import argparse
import random

parser = argparse.ArgumentParser()
parser.add_argument("--language", dest="language", type=str)
args = parser.parse_args()
print(args)

from acqdivReader import AcqdivReader, AcqdivReaderPartition

acqdivCorpusReader = AcqdivReader(args.language)

import syllabificationJapanese

syllables = {}

for chunk in acqdivCorpusReader.iterator():
    # Split the chunk into space-separated words before syllabification.
    words = chunk.split(" ")
    for word in words:
        # Skip tokens that carry no material: a bare newline, the empty
        # string, a lone "n", and the "???" marker.
        if word in ("\n", "", "n", "???"):
            continue