Esempio n. 1
0
from config import VOCAB_HOME

from acqdivReader import AcqdivReader

import argparse
# Command-line interface: one optional string option, --language, stored as
# args.language.
# NOTE(review): the option has no default and is not marked required, so
# args.language is None when the flag is omitted.
parser = argparse.ArgumentParser()
parser.add_argument("--language", dest="language", type=str)
import random

# Parse sys.argv and echo the resulting namespace to stdout.
args = parser.parse_args()
print(args)

# Project-local corpus reader, constructed from the requested language.
acqdivCorpusReader = AcqdivReader(args.language)

# Collect every distinct space-separated token yielded by the corpus reader.
vocabulary = set()
for utterance in acqdivCorpusReader.iterator():
    # split(" ") rather than split() is kept deliberately: it preserves the
    # existing tokenisation, including the empty tokens that consecutive
    # spaces produce.
    vocabulary.update(utterance.split(" "))

# Write one token per line to <VOCAB_HOME><language>-vocab.txt.
# The tokens are sorted so the file is identical from run to run; iterating
# the set directly follows string-hash order, which Python randomises per
# process.
# NOTE(review): the file is still opened with the platform default encoding;
# confirm what the readers of this file expect before pinning one.
with open(VOCAB_HOME + args.language + '-vocab.txt', "w") as outFile:
    for word in sorted(vocabulary):
        print(word, file=outFile)
# Build a fresh parser with a single optional --language string option and
# parse sys.argv again; this rebinds the module-level names `parser` and
# `args`.
parser = argparse.ArgumentParser()
parser.add_argument("--language", dest="language", type=str)
import random

# Echo the parsed namespace to stdout.
args = parser.parse_args()
print(args)

from acqdivReader import AcqdivReader, AcqdivReaderPartition

# Project-local corpus reader, constructed from the requested language.
acqdivCorpusReader = AcqdivReader(args.language)

import syllabificationJapanese

# Starts out empty; the code that fills it is not visible in this part of
# the file. NOTE(review): presumably keyed by syllable — confirm.
syllables = {}

for chunk in acqdivCorpusReader.iterator():
    #       print(len(chunk))

    # tokenize chunk into valid syllables
    words = chunk.split(" ")
    for word in words:
        if word == "\n":
            continue
        if len(word) == 0:
            continue
        if word == "n":
            continue
        if word == "???":
            continue
        syllabification = syllabificationJapanese.syllabify(word)
        if syllabification is None: