Beispiel #1
0
def main():
    """Generate ``--count`` poems with rupo's Engine and return them joined.

    Command-line flags mirror ``Engine.generate_poem`` keyword arguments;
    the parsed namespace is forwarded verbatim (minus ``count``).

    Returns:
        str: concatenation of every generated poem and every assertion
        error message produced while generating.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-path', type=str, default=GENERATOR_MODEL_DIR)
    parser.add_argument('--token-vocab-path', type=str, default=None)
    parser.add_argument('--stress-vocab-path', type=str, default=None)
    parser.add_argument('--metre-schema', type=str, default='+-')
    parser.add_argument('--rhyme-pattern', type=str, default='abab')
    parser.add_argument('--n-syllables', type=int, default=8)
    parser.add_argument('--sampling-k', type=int, default=50000)
    parser.add_argument('--beam-width', type=int, default=None)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--last-text', type=str, default="")
    parser.add_argument('--count', type=int, default=1)
    args = parser.parse_args()

    kwargs = vars(args)
    count = kwargs.pop('count')  # not an Engine.generate_poem argument

    engine = Engine()
    engine.load(RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT)
    pieces = []  # collect parts; ''.join avoids quadratic str +=
    for seed in range(count):
        print(seed)
        try:
            poem = engine.generate_poem(seed=seed, **kwargs)
            print(poem)
            pieces.append(poem)
        except AssertionError as e:
            print("Error: ", e)
            # BUG FIX: the original did `s += e`, which raises TypeError
            # (cannot concatenate str and AssertionError).
            pieces.append(str(e))
    return ''.join(pieces)
Beispiel #2
0
    def handle(self, *args, **options):
        """Export poem markups to XML/RAW files and optionally the database.

        Options read from ``options``:
            from / to: integer slice bounds over all poems (``to`` defaults
                to the total poem count).
            xml / raw: output file paths relative to BASE_DIR; a writer is
                opened only when the corresponding path is given.
            db: when truthy, persist freshly computed markups as
                ModelMarkup rows.
            author: markup author name. Any author containing "Automatic"
                triggers re-markup via the stress predictor; otherwise the
                poem's first stored markup for that author is exported.
        """
        engine = Engine(language="ru")
        poems = Poem.objects.all()
        begin = int(options.get('from'))
        # NOTE(review): len(poems) evaluates the full queryset just to get a
        # default upper bound; Poem.objects.count() would be cheaper.
        end = int(
            options.get('to')) if options.get('to') is not None else len(poems)
        poems = Poem.objects.all()[begin:end]

        xml_path = str(
            options.get('xml')) if options.get('xml') is not None else None
        raw_path = str(
            options.get('raw')) if options.get('raw') is not None else None

        db = options.get('db')
        author = options.get("author")
        # Reset this markup version: drop any previously stored model markups.
        markup_version = MarkupVersion.objects.get_or_create(name=author)[0]
        ModelMarkup.objects.filter(markup_version=markup_version).delete()

        # Open only the writers whose output paths were requested.
        xml_writer = None
        raw_writer = None
        if xml_path is not None:
            xml_path = os.path.join(BASE_DIR, xml_path)
            xml_writer = Writer(FileType.XML, xml_path)
            xml_writer.open()
        if raw_path is not None:
            raw_path = os.path.join(BASE_DIR, raw_path)
            raw_writer = Writer(FileType.RAW, raw_path)
            raw_writer.open()
        i = 0
        stress_predictor = engine.get_stress_predictor(
            stress_model_path=STRESS_MODEL,
            zalyzniak_dict=ZALYZNYAK_DICT,
            stress_trie_path=TRIE_PATH,
            raw_stress_dict_path=RAW_DICT_PATH)
        for p in poems:
            if "Automatic" in author:
                # Re-markup the raw text and let the metre classifier fix it up.
                markup = Markup.process_text(p.text, stress_predictor)
                markup, result = MetreClassifier.improve_markup(markup)
                if xml_writer is not None:
                    xml_writer.write_markup(markup)
                if raw_writer is not None:
                    raw_writer.write_markup(markup)
                if db:
                    # NOTE(review): author is hard-coded to "Automatic2" here
                    # even when the --author option differs — confirm intended.
                    ModelMarkup.objects.create(poem=p,
                                               text=markup.to_json(),
                                               author="Automatic2",
                                               additional=result.to_json(),
                                               markup_version=markup_version)
            else:
                # Export the poem's first stored markup for this author.
                markup = p.markups.filter(author=author)[0]
                if xml_writer is not None:
                    xml_writer.write_markup(markup.get_markup())
                if raw_writer is not None:
                    raw_writer.write_markup(markup.get_markup())
            i += 1
            print(i)  # simple progress indicator
        if raw_writer is not None:
            raw_writer.close()
        if xml_writer is not None:
            xml_writer.close()
Beispiel #3
0
 def setUpClass(cls):
     """Load one shared Russian Engine for every test in this class."""
     load_kwargs = dict(
         stress_model_path=RU_STRESS_DEFAULT_MODEL,
         g2p_model_path=RU_G2P_DEFAULT_MODEL,
         zalyzniak_dict=ZALYZNYAK_DICT,
         raw_stress_dict_path=RU_GRAPHEME_STRESS_PATH,
         stress_trie_path=RU_GRAPHEME_STRESS_TRIE_PATH,
     )
     cls.engine = Engine(language="ru")
     cls.engine.load(**load_kwargs)
Beispiel #4
0
def run(stress_model_path, g2p_model_path, grapheme_set, g2p_dict_path, aligner_dump_path, raw_stress_dict_path,
        stress_trie_path, zalyzniak_dict, ru_wiki_dict, cmu_dict):
    """Mark up every poem under /media/data/stihi_ru_clean into a RAW dump.

    Parses ``<div ...> ... </div>`` blocks out of the corpus files, skips
    texts containing Latin letters (or the stray "Ј" character), runs the
    engine's improved markup on the rest, and writes the results with a RAW
    Writer. All arguments are forwarded to ``Engine.load``.
    """
    raw_writer = Writer(FileType.RAW, MARKUPS_DUMP_RAW_PATH)
    raw_writer.open()
    i = 0
    path = "/media/data/stihi_ru_clean"
    paths = get_paths(path, "")
    engine = Engine(language="ru")
    engine.load(stress_model_path, g2p_model_path, grapheme_set, g2p_dict_path, aligner_dump_path, raw_stress_dict_path,
        stress_trie_path, zalyzniak_dict, ru_wiki_dict, cmu_dict)
    try:
        for filename in paths:
            with open(filename, "r", encoding="utf-8") as file:
                text = ""
                is_text = False
                try:
                    for file_line in file:
                        if "<div" in file_line:
                            is_text = True
                        elif "</div>" in file_line:
                            # End of one poem: clean it, decide whether to skip.
                            is_text = False
                            clean_text = ""
                            skip = False
                            lines = text.split("\n")
                            for line in lines:
                                if line == "":
                                    continue
                                for ch in line:
                                    # BUG FIX: the original used strict `<`,
                                    # which let the boundary letters a/z/A/Z
                                    # slip through the Latin-letter check.
                                    if "a" <= ch <= "z" or "A" <= ch <= "Z" or ch == "Ј":
                                        skip = True
                                        break
                                clean_text += line.strip() + "\n"
                            if not skip:
                                print(clean_text.split("\n")[:2])
                                markup, result = engine.get_improved_markup(clean_text)
                                raw_writer.write_markup(markup)
                            else:
                                print("Skipped")
                            i += 1
                            print(i)
                            text = ""
                        elif is_text:
                            text += file_line.strip() + "\n"
                except Exception as e:
                    # Best-effort over a huge corpus: keep going, but at least
                    # report the failure instead of swallowing it silently.
                    print("Error in {0}: {1}".format(filename, e))
    finally:
        # Close the dump even if iteration blows up midway.
        raw_writer.close()
Beispiel #5
0
 def setUpClass(cls):
     """Initialise one fully-loaded Russian engine shared by all tests."""
     engine = Engine(language="ru")
     resources = {
         "stress_model_path": RU_STRESS_DEFAULT_MODEL,
         "g2p_model_path": RU_G2P_DEFAULT_MODEL,
         "zalyzniak_dict": ZALYZNYAK_DICT,
         "ru_wiki_dict": RU_WIKI_DICT,
         "cmu_dict": CMU_DICT,
         "raw_stress_dict_path": RU_GRAPHEME_STRESS_PATH,
         "stress_trie_path": RU_GRAPHEME_STRESS_TRIE_PATH,
         "aligner_dump_path": RU_ALIGNER_DEFAULT_PATH,
         "g2p_dict_path": RU_G2P_DICT_PATH,
     }
     engine.load(**resources)
     cls.engine = engine
Beispiel #6
0
from rupo.settings import RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT, GENERATOR_MODEL_DIR


if __name__ == "__main__":
    # CLI mirroring Engine.generate_poem keyword arguments; the flag table
    # keeps each (name, type, default) triple in one place.
    arg_parser = argparse.ArgumentParser()
    for flag, kind, fallback in [
        ('--model-path', str, GENERATOR_MODEL_DIR),
        ('--token-vocab-path', str, None),
        ('--stress-vocab-path', str, None),
        ('--metre-schema', str, '+-'),
        ('--rhyme-pattern', str, 'abab'),
        ('--n-syllables', int, 16),
        ('--sampling-k', int, None),
        ('--beam-width', int, 8),
        ('--temperature', float, 2.0),
        ('--last-text', str, ""),
        ('--count', int, 20),
    ]:
        arg_parser.add_argument(flag, type=kind, default=fallback)
    generate_kwargs = vars(arg_parser.parse_args())
    n_poems = generate_kwargs.pop('count')  # loop count, not an engine arg

    engine = Engine()
    engine.load(RU_STRESS_DEFAULT_MODEL, ZALYZNYAK_DICT)
    for seed in range(n_poems):
        print(seed)
        try:
            # Poems that fail the engine's internal assertions are reported
            # but do not stop the run.
            print(engine.generate_poem(seed=seed, **generate_kwargs))
        except AssertionError as e:
            print("Error: ", e)
Beispiel #7
0
def _select_words(fp):
    """Return sorted unique words with frequency >= 10 that pass check_word.

    Each useful line in *fp* looks like ``<frequency> <word>``. Malformed
    lines (fewer than two fields, or a non-numeric frequency) are skipped
    instead of crashing, unlike the original lambda pipeline which raised
    IndexError/ValueError on them.
    """
    selected = set()
    for raw_line in fp.readlines():
        fields = raw_line.lower().strip().split()
        if len(fields) < 2 or not fields[0].isdigit():
            continue
        if check_word(fields[1]) and int(fields[0]) >= 10:
            selected.add(fields[1])
    return sorted(selected)


def main():
    """Build a JSON dictionary: morphology tag -> syllable count -> words.

    Reads a frequency dictionary (``<count> <word>`` per line), keeps words
    that pass ``check_word`` and occur at least 10 times, counts syllables
    with rupo's Engine, analyses morphology with pymorphy2 (converted to
    UD 2.0 tags), and dumps the nested dictionary as JSON.
    """
    parser = ArgumentParser()
    parser.add_argument('-s', '--src', dest='src_name', type=str, required=True,
                        help='A text file with source dictionary.')
    parser.add_argument('-d', '--dst', dest='dst_name', type=str, required=True,
                        help='A JSON file with destination dictionary.')
    args = parser.parse_args()

    with codecs.open(args.src_name, mode='r', encoding='utf-8', errors='ignore') as fp:
        all_words = _select_words(fp)
    print('Number of selected words is {0}.'.format(len(all_words)))

    words_dict = {}
    morph = pymorphy2.MorphAnalyzer()
    to_ud20 = converters.converter('opencorpora-int', 'ud20')
    engine = Engine(language="ru")
    engine.load(
        os.path.join(os.path.dirname(__file__), 'rupo', 'rupo', 'data', 'stress_models',
                     'stress_ru_LSTM64_dropout0.2_acc99_wer8.h5'),
        os.path.join(os.path.dirname(__file__), 'rupo', 'rupo', 'data', 'dict', 'zaliznyak.txt')
    )
    syllables_of_words = {}  # cache: word -> syllable count
    counter = 0
    unknown_counter = 0
    for cur_word in all_words:
        if cur_word not in syllables_of_words:
            syllables_of_words[cur_word] = len(engine.get_word_syllables(cur_word))
        n_syllables = syllables_of_words[cur_word]
        if n_syllables == 0:
            # No syllables detected — nothing useful to index.
            continue
        parsing = morph.parse(cur_word)
        if unknown_word(parsing):
            unknown_counter += 1
        else:
            # A word may have several parses; index it under every tag set.
            for it in parsing:
                morphodata = get_morphodata(to_ud20(str(it.tag)))
                if morphodata is None:
                    continue
                words_dict.setdefault(morphodata, {}).setdefault(n_syllables, set()).add(cur_word)
        counter += 1
        if counter % 10000 == 0:
            print('{0} words have been processed...'.format(counter))
    print('There are {0} unknown words.'.format(unknown_counter))

    # Sets are not JSON-serialisable; freeze them to sorted lists.
    for morphodata in words_dict:
        for n_syllables in words_dict[morphodata]:
            words_dict[morphodata][n_syllables] = sorted(words_dict[morphodata][n_syllables])
    with codecs.open(args.dst_name, mode='w', encoding='utf-8', errors='ignore') as fp:
        json.dump(words_dict, fp, ensure_ascii=False, indent=4)
Beispiel #8
0
 def setUpClass(cls):
     """Create and load the shared Russian engine used by all tests."""
     shared_engine = Engine(language="ru")
     shared_engine.load(stress_model_path=RU_STRESS_DEFAULT_MODEL,
                        zalyzniak_dict=ZALYZNYAK_DICT)
     cls.engine = shared_engine
Beispiel #9
0

# Prepare the Pushkin corpus and a loaded rupo engine for later generation.
nltk.download('punkt')
# Read the whole corpus as one string.
with open('pushkin.txt', encoding='utf-8') as f:
    text = f.read()

# TODO: DELETE THIS (debug truncation of the corpus)
#text = text[:10000]

text = clean_text(text)
tokens = word_tokenize(text)  # ~100k words

print('Loading engine...')

# Load engine — NOTE(review): hard-coded per-user paths; parameterize before reuse.
ENGINE = Engine(language='ru')

# Takes long time
ENGINE.load('~/AutoPoetry/stress_ru.h5', '/home/sp/AutoPoetry/zaliznyak.txt')

print('Engine loaded!')

words = set(tokens)
len(words)  # no-op in a script — presumably a notebook/REPL artifact

"""
word_freq = {}
for _word in text:
    word_freq[_word] = word_freq.get(_word, 0) + 1

ignored_words = set()