Example #1
    def __init__(self, lang='es', text_analysis=None):
        try:
            if text_analysis is None:
                self.ta = TextAnalysis(lang=lang)
            else:
                self.ta = text_analysis
            file_lexicon = DIR_INPUT + 'NRC-VAD-Lexicon.txt'
            file_word_embedding_en = DIR_MODELS + 'word_embedding_en.model'
            file_word_embedding_es = DIR_MODELS + 'word_embedding_es.model'
            file_syllable_embedding_en = DIR_MODELS + 'syllable_embedding_en.model'
            file_syllable_embedding_es = DIR_MODELS + 'syllable_embedding_es.model'
            file_phoneme_embedding_en = DIR_MODELS + 'phoneme_embedding_en.model'
            file_phoneme_embedding_es = DIR_MODELS + 'phoneme_embedding_es.model'
            print('Loading lexicons and embeddings...')
            if lang == 'es':
                epi = epitran.Epitran('spa-Latn')
                lexicon = self.ta.import_lexicon_vad(file_lexicon, lang=lang)
                word_embedding = Word2Vec.load(file_word_embedding_es)
                syllable_embedding = Word2Vec.load(file_syllable_embedding_es)
                phoneme_embedding = Word2Vec.load(file_phoneme_embedding_es)
            else:
                epi = epitran.Epitran('eng-Latn')
                lexicon = self.ta.import_lexicon_vad(file_lexicon, lang=lang)
                word_embedding = Word2Vec.load(file_word_embedding_en)
                syllable_embedding = Word2Vec.load(file_syllable_embedding_en)
                phoneme_embedding = Word2Vec.load(file_phoneme_embedding_en)

            self.epi = epi
            self.lexicon = lexicon
            self.word_embedding = word_embedding
            self.syllable_embedding = syllable_embedding
            self.phoneme_embedding = phoneme_embedding
        except Exception as e:
            Utils.standard_error(sys.exc_info())
            print('Error FeatureExtraction: {0}'.format(e))
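For reference, the epitran calls this constructor builds on reduce to a small pattern; a minimal sketch (the language codes come from the example, the sample word is an assumption):

import epitran

# Minimal sketch of the epitran usage Example #1 wraps.
codes = {'es': 'spa-Latn', 'en': 'eng-Latn'}
epi = epitran.Epitran(codes['es'])
print(epi.transliterate('fonema'))  # prints the word's IPA rendering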
Example #2
class FicheEng(CardLanguage):
    # Phonetic transliterator (class attribute, shared by all instances)
    eng = epitran.Epitran('eng-Latn')

    # Apply phonetic transliteration; the bare `eng` in the original was a
    # NameError, and Python 3 strings need no unicode() conversion
    def translate(self, word):
        self.phonetic = self.eng.transliterate(word)
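A hypothetical use of the class above, assuming CardLanguage can be instantiated without arguments (the base class is not shown):

# Hypothetical usage; the no-argument constructor is an assumption.
card = FicheEng()
card.translate('hello')
print(card.phonetic)  # IPA for 'hello'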
Example #3
class FicheSp(CardLanguage):
    # Phonetic transliterator (class attribute, shared by all instances)
    sp = epitran.Epitran('spa-Latn')

    # Apply phonetic transliteration (self.sp fixes the NameError in the original)
    def translate(self, word):
        self.phonetic = self.sp.transliterate(word)
Example #4
    def __init__(self):
        import epitran
        print(".. load rus-Cyrl")
        self.epi = epitran.Epitran('rus-Cyrl')

        self.target_file = f'{cf.conf_dir}/langs/voc/ru-map.json'
        self.target_file_rev = f'{cf.conf_dir}/langs/voc/ru-rev-map.json'
Example #5
 def __init__(self, lang):
     lang_ipa = {'es': 'spa-Latn', 'en': 'eng-Latn'}
     lang_stemm = {'es': 'spanish', 'en': 'english'}
     self.lang = lang
     self.stemmer = SnowballStemmer(language=lang_stemm[lang])
     self.epi = epitran.Epitran(lang_ipa[lang])
     self.nlp = self.load_sapcy(lang)
Example #6
def gettrans():
    lang = request.args.get('lang', 'eng-Latn', type=str)
    textin = request.args.get('textin', '', type=str)
    epi = epitran.Epitran(lang,
                          cedict_file='data/cedict_1_0_ts_utf-8_mdbg.txt')
    trans = ' '.join([epi.transliterate(w) for w in textin.split(' ')])
    return trans
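gettrans reads request.args, so it is presumably a Flask view; a hypothetical wiring sketch (the app object and URL are assumptions):

from flask import Flask

# Hypothetical route registration for the view above.
app = Flask(__name__)
app.add_url_rule('/gettrans', 'gettrans', gettrans)
# GET /gettrans?lang=eng-Latn&textin=hello+world would then return the IPA text.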
Example #7
class FicheFr(CardLanguage):
    # Phonetic transliterator (class attribute, shared by all instances)
    fr = epitran.Epitran('fra-Latn')

    # Apply phonetic transliteration (self.fr fixes the NameError in the original)
    def translate(self, word):
        self.phonetic = self.fr.transliterate(word)
Example #8
def main(code):
    epi = epitran.Epitran(code)
    for line in sys.stdin:  # Python 3: stdin already yields str, no decode needed
        line = unicodedata.normalize('NFD', line.lower())
        line = epi.transliterate(line)
        sys.stdout.write(line)
Example #9
def main(fn):
    epi = epitran.Epitran('uig-Arab')
    vwis = epitran.vector.VectorsWithIPASpace('uig-Arab', ['uig-Arab'])
    tree = etree.parse(fn)
    root = tree.getroot()
    for token in root.findall('.//TOKEN'):
        # token.text is already str in Python 3; no unicode()/encode() needed
        print(epi.transliterate(token.text))
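The vwis object is built above but never used; for context, a hedged sketch of what VectorsWithIPASpace provides (word_to_segs follows the epitran README; the input word is arbitrary):

# Hedged sketch: segment a word into tuples pairing each orthographic segment
# with its phonetic form and a phonological feature vector.
for seg in vwis.word_to_segs('token'):
    print(seg)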
Example #10
def main(mode, fn):
    epi = epitran.Epitran(mode)
    with open(fn, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            a, b, gloss = line.split('\t')
            ipa = epi.transliterate(a)
            print('\t'.join([ipa, a, b, gloss]))
Example #11
def convert_to_ipa(texts):
    epi = epitran.Epitran('eng-Latn')
    for text_mel_pair in texts:
        text_mel_pair[1] = ipa.convert(text_mel_pair[1])
        foreign_words = re.findall(r"[^ ]{0,}\*", text_mel_pair[1])
        for word in foreign_words:
            text_mel_pair[1] = text_mel_pair[1].replace(
                word, epi.transliterate(word[0:len(word) - 1]))
Example #12
def detect_languages(langs, trans):
    with open("./text_logger/languges.json", "r") as read_file:
        lg = json.load(read_file)
    
    with open("./text_logger/languges_for_transcription.json", "r") as read_file:
        tc = json.load(read_file)
    
    rs = []
    nd = []
    global epis
    
    def add_print(langu):
        print_on_magenta(f'---> added {langu} language')
    
    def add2_print(langu, is_not_sup):
        if is_not_sup:
            print(f'\tlanguage {langu} will be transcribed (with limited support)')
        else:
            print(f'\tlanguage {langu} will be transcribed')

    for lang, need in zip(langs, trans):
        
        f = False
        
        if lang in lg.values():
            rs.append(lang)
            f = True
            add_print(lang)
        elif lang in lg.keys():
            rs.append(lg[lang])
            f = True
            add_print(lang)
        else:
            for k in lg.keys():
                if k.startswith(lang):
                    rs.append(lg[k])
                    add_print(k)
                    f = True
                    break
        
        if not f:
            print_on_red(f"Unknown language: '{lang}'. See the JSON file to correct it")
        else:
            nd.append(need)

            itlang = rs[-1]
            epitran_lang = [key for key, _ in tc.items() if key.startswith(itlang)][0]

            if need:
                epis[itlang] = epitran.Epitran(epitran_lang)
                add2_print(*tc[epitran_lang])

    if len(rs) == 0:
        print_on_red('There are no valid languages in your list. See the JSON file to correct them')

    return rs, nd
Example #13
def convert_to_ipa(texts):
    print("Converting training files to IPA notation...")
    epi = epitran.Epitran('eng-Latn', ligatures=True)
    for text_mel_pair in texts:
        text_mel_pair[1] = ipa.convert(english_cleaners(text_mel_pair[1]))
        foreign_words = re.findall(r"[^ ]{0,}\*", text_mel_pair[1])
        for word in foreign_words:
            text_mel_pair[1] = text_mel_pair[1].replace(
                word, epi.transliterate(word[0:len(word) - 1]))
Example #14
def main(code, op, infiles, output):
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        logging.debug('Scanning:\t%s', fn)
        add_file = add_file_op if op else add_file_gen
        space.update(add_file(epi, ft, fn))
    print_space(output, space)
Example #15
def generate_from_file(tacotron2_path, waveglow_path, text_file, output_directory):

  # Make synthesis paths

  if not os.path.exists(output_directory):
    os.makedirs(output_directory)
    print("Creating directory " + output_directory + "...")

  hparams = create_hparams()
  hparams.sampling_rate = 22050

  print("Loading models...")
  model = load_model(hparams)
  model.load_state_dict(torch.load(tacotron2_path)['state_dict'])
  _ = model.cuda().eval().half()

  waveglow = torch.load(waveglow_path)['model']
  waveglow.cuda().eval().half()
  for k in waveglow.convinv:
      k.float()
  denoiser = Denoiser(waveglow)

  genlist = []
  with open(text_file) as file:
    for line in file:
      genlist.append(line.strip())

  epi = epitran.Epitran('eng-Latn', ligatures=True)  # construct once, outside the loop
  for entry in genlist:
    wav_name = "_".join(entry.split(" ")[:4]).lower() + ".wav"

    if hparams.preprocessing == "ipa":
      entry = ipa.convert(english_cleaners(entry))
      foreign_words = re.findall(r"[^ ]{0,}\*", entry)
      for word in foreign_words:
        entry = entry.replace(word, epi.transliterate(word[0:len(word)-1]))
    if hparams.preprocessing == "arpabet":
      entry = make_arpabet(entry)

    # Text sequencer
    if hparams.preprocessing is not None:
      sequence = np.array(text_to_sequence(entry, None))[None, :]
    else:
      sequence = np.array(text_to_sequence(entry, ['english_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
      torch.from_numpy(sequence)).cuda().long()

    # Synthesis
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
    with torch.no_grad():
      audio = waveglow.infer(mel_outputs_postnet, sigma=0.666)
    audio_denoised = denoiser(audio, strength=0.01)[:, 0]

    # Save audio
    print ("Saving " + wav_name)
    write(os.path.join(output_directory, wav_name), hparams.sampling_rate, audio_denoised[0].data.cpu().numpy())
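A hypothetical invocation of the synthesis routine above; every path is a placeholder and a CUDA-capable GPU is assumed:

# Hypothetical paths; requires trained Tacotron 2 and WaveGlow checkpoints.
generate_from_file('tacotron2_checkpoint.pt', 'waveglow_checkpoint.pt',
                   'sentences.txt', 'output_wavs')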
Example #16
def main(unused_argv):
  if not FLAGS.language_id:
    raise ValueError("Specify --language_id!")

  logging.info("Initializing Epitran for \"%s\" ...", FLAGS.language_id)
  epi = epitran.Epitran(FLAGS.language_id)

  logging.info("Processing Bible ...")
  reader = BibleEpitranReader(epi)
  reader.read()
Example #17
def main(mode, fnin, fnout):
    epi = epitran.Epitran(mode)
    tree = etree.parse(fnin)
    root = tree.getroot()
    with open(fnout, 'w', encoding='utf-8') as fout:
        writer = csv.writer(fout, dialect='excel-tab')
        for entry in root.xpath('//ENTRY'):
            lemma = entry.find('LEMMA').text
            gloss = entry.find('GLOSS').text
            writer.writerow([epi.transliterate(lemma), gloss])
Example #18
def main(fn):
    epi = epitran.Epitran('ori-Orya')
    with open(fn, encoding='utf-8') as f:
        reader = csv.reader(f, dialect='excel-tab')
        for lemma, surface, props, gloss in reader:  # avoid shadowing builtin input
            props = props.replace(' ', '+')
            gold_analysis = lemma + props
            phonemic_input = epi.transliterate(surface)
            print('\t'.join(
                [surface, phonemic_input, lemma, gold_analysis, gloss]))
Example #19
 def test_quick(self):
     epi = epitran.Epitran('eng-Latn')
     y = epi.transliterate(
         "At Müller's execution there was great competition for front seats,"
     )
     y += "?"
     res = extract_from_sentence(y, ignore_tones=True, ignore_arcs=True)
     print(res)
Example #20
def init():
    """Init all the phonemisers."""
    languages = [
        p.name for p in pathlib.Path("lib/data/phon").glob("*")
        if not p.name == "README.md"
    ]
    lookup_tables = {}

    # If we have Epitran
    for language in iso_2to3:
        if language.startswith("zh-"):
            lookup_tables[language] = epitran.Epitran(
                iso_2to3[language],
                cedict_file="lib/data/dict/zh",
            )
        else:
            lookup_tables[language] = epitran.Epitran(iso_2to3[language])

    # Otherwise fallback to TSV-style
    for language in languages:
        if language == "zh":
            continue
        if language.startswith("ja-"):
            continue
        lines = open("lib/data/phon/" + language).readlines()
        if len(lines) == 0:
            continue
        lookup_tables[language] = {}
        for line in lines:
            if line.strip() == "":
                continue
            kv = line.strip().split("\t")
            if len(kv) != 2:
                print("!", kv, file=sys.stderr)
                continue
            k = kv[0].strip()
            v = kv[1].strip()
            if k not in lookup_tables[language]:
                lookup_tables[language][k] = []
            lookup_tables[language][k].append(v)

    return lookup_tables
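Note that init() returns a heterogeneous mapping: Epitran objects for languages covered by iso_2to3 and plain lookup dicts for the TSV fallback, so a caller has to branch on type; a hedged sketch (the 'fi' key and sample word are assumptions):

# Hedged consumer sketch for the mixed table built by init().
tables = init()
phon = tables.get('fi')
if isinstance(phon, epitran.Epitran):
    print(phon.transliterate('kissa'))
elif isinstance(phon, dict):
    print(phon.get('kissa', ['?'])[0])  # TSV entries map strings to lists of variants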
Example #21
def create_epitran_dict():
    """Return a dictionary of languages to Epitran Objects."""
    codes = pd.read_csv(SUPPORTED_LANGS_PATH, sep='\t', header=0, error_bad_lines=False)['Code']
    epitran_dict = {}
    for code in codes:
        if code[:3] in epitran_dict: continue
        try:
            epitran_dict[code[:3]] = epitran.Epitran(code)
        except OSError:
            continue
    return epitran_dict
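A hypothetical lookup against the resulting dictionary (the 'fra' key and sample word are assumptions):

# Keys are the 3-letter prefixes of the supported codes.
epis = create_epitran_dict()
if 'fra' in epis:
    print(epis['fra'].transliterate('bonjour'))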
Example #22
def read_input(input_, langscript):
    space = set()
    epi = epitran.Epitran(langscript)
    ft = panphon.featuretable.FeatureTable()
    for dirname in input_[0]:
        for fn in glob.glob(os.path.join(dirname, '*.ltf.xml')):
            for token in read_tokens(fn):
                ipa = epi.transliterate(token)
                for seg in ft.segs_safe(ipa):
                    space.add(seg)
    return space
Example #23
def to_ipa(fname, lang1, lang2):
    epitran_map = {}
    with open(map_file, "r", encoding="utf-8") as f:
        for line in f:
            tks = line.strip().split("\t")
            epitran_map[tks[1]] = tks[0]
    epi1 = epitran.Epitran(epitran_map[lang1])
    epi2 = epitran.Epitran(epitran_map[lang2])
    fsave = fname + ".ipa"
    with open(fname, "r",
              encoding="utf-8") as f, open(fsave, "w+",
                                           encoding="utf-8") as fout:
        for line in f:
            tks = line.strip().split(" ||| ")
            if len(tks) < 3:
                continue
            _tks = [x for x in tks]
            tks[1] = epi1.transliterate(tks[1])
            tks[2] = epi2.transliterate(tks[2])
            # tks = tks[:3] + _tks[2:3] + tks[3:]
            fout.write(" ||| ".join(tks) + "\n")
Example #24
def transcribe_song_fr(text):
    import epitran
    import codecs
    epi = epitran.Epitran('fra-Latn')

    # f = codecs.open(filename, 'r', 'utf8')
    # text = f.read()
    # f.close()

    transcribed_song = epi.transliterate(text)
    print(text, transcribed_song)
    return transcribed_song
Example #25
def _phonemize(text, language):
    try:
        separators = Separator(word=' ', phone='')
        phonemes = phonemize(text,
                             separator=separators,
                             backend='espeak',
                             language=language)
    except RuntimeError:
        epi = epitran.Epitran(language)
        phonemes = epi.transliterate(text, normpunc=True)
    # str.replace returns a new string; the original discarded the result
    phonemes = phonemes.replace('\n', ' ', 1)
    return phonemes
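The espeak backend is tried first and epitran only serves as the RuntimeError fallback; note the two libraries use different language codes (espeak 'en' vs. epitran 'eng-Latn'), so a single language argument rarely satisfies both. A hedged call:

# Hypothetical call: 'eng-Latn' is an epitran code, so this relies on the
# espeak attempt failing and the epitran fallback running.
print(_phonemize('hello world', 'eng-Latn'))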
Example #26
    def __init__(self, code, table, decompose=True, cedict_file=None):
        """Construct object for re-romanizing Epitran output.

        This class converts orthographic input, via Epitran, to a more
        conventional romanization that should be more readable to most humans.

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
            table (str): Name of re-romanization table
            decompose (bool): apply decomposing normalization
            cedict_file (str): path to a CC-CEDICT dictionary (needed for Mandarin)
        """
        self.epi = epitran.Epitran(code, cedict_file=cedict_file)
        self.mapping = self._load_reromanizer(table, decompose)
Example #27
def multip_write_features_to_csv():
    lexicon = 'saldo'
    corpus = 'news'
    gold_blends = blend_keys()

    wg_path = '/home/adam/Documents/Magisteruppsats_VT18/ddata/word_embeddings/corpora/w2v_newsa_min1'
    wsm = gs.models.Word2Vec.load(wg_path)
    cg_path = '/home/adam/Documents/lexical_blends_project/embeddings/saldo_embeddings_window5_skipgram_negsampling_fasttext'
    csm = gs.models.Word2Vec.load(cg_path)
    epit = epitran.Epitran('swe-Latn')

    csvf = open('{0}_features_overlap_split_020818.csv'.format(lexicon),
                '+w',
                newline='')
    csvw = csv.writer(csvf, delimiter=',')

    T, F = 0, 0

    dataf = f'/home/adam/Documents/lexical_blends_project/lexicon_wordlists/{lexicon}_{corpus}_wordlist_f.pickle'

    with open(dataf, 'rb') as f:
        freqd = pickle.load(f)

    # overlap
    candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blend_candidates_1/'
    # noverlap
    #candidate_folder = '/home/adam/Documents/lexical_blends_project/saldo_blends_candidates_noverlap_1/'

    cand_set = []

    for i, filename in enumerate(listdir(candidate_folder)):
        blend = filename.split('_')[0]
        #print('#', i ,'reading', blend, 'from', candidate_folder+filename)
        with open(candidate_folder + filename) as f:
            for ln in f:
                cw1, cw2 = ln.rstrip().split(',')
                if blend in [cw1, cw2]:
                    continue
                sw1, sw2 = gold_blends[blend]
                cand_set.append((blend, cw1, cw2, lexicon, corpus, sw1, sw2,
                                 freqd, csm, wsm, epit))

    for cand_chunk in chunks(cand_set, 10):
        with Pool(3) as p:
            entries = p.starmap(extract_sample_features, cand_chunk)
            print('# writing entries')
            for entry in entries:
                for e in entry:
                    csvw.writerow([str(x) for x in e[0].values()])

    csvf.close()
Example #28
def result():
    db = sqlite3.connect('coarsewords.db')
    wordsearch = request.args.get('word')
    langsearch = request.args.get('lang')
    import epitran
    epi = epitran.Epitran(epitran_langs[langsearch])
    wordipa = ''
    try:
        wordipa = epi.transliterate(wordsearch)
    except KeyError:
        pass
    # Parameterized query: avoids the SQL injection in the original concatenation
    res = db.execute(
        "select word, categories, phonetics, definition, etymology, langs"
        " from coarseword WHERE word=? or phonetics=?",
        (wordsearch, wordipa))
    words = res.fetchall()
    return render_template('result.html', words=words)
Example #29
def main(fnin):
    epi = epitran.Epitran('hin-Deva')
    st = mstem.Stemmer('ben-IPA', ['../lexicons/ben.tsv'])
    with open(fnin, encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            for token in line.split(' '):
                ipa = epi.transliterate(token)
                parse = st.gloss(ipa)
                lemma = parse[0]
                morph = '+'.join(parse[1:])
                morph = morph if morph else '<unk>'
                print('w:{}~l:{}~m:{}~ipa:{}'.format(token, lemma, morph, ipa))
            print('')
Example #30
 def to_phoneme_de(self, language='deu-Latn'):
     epi = epitran.Epitran(language)  # lang='deu-Latn' or 'eng-Latn'
     for line in self.data:
         temp = ''
         for word in line['s'].split():
             try:
                 phoneme = epi.transliterate(word)
                 temp = temp + phoneme + ' '
             except Exception:
                 # the original `word.join(' ')` appended only a space;
                 # fall back to the raw word instead
                 temp = temp + word + ' '
         line['s'] = temp.strip()