Example #1
0
    def __init__(self, embed_size, hidden_size, panphon, model_name, load_model=False, train_file=None, val_file=None):
        """Build the parameter collection, vocabularies/feature table, and BiLSTM encoders.

        Args:
            embed_size (int): dimensionality of the symbol/feature embeddings
            hidden_size (int): total BiLSTM hidden size (split across the two directions)
            panphon (bool): if True, embed panphon articulatory feature vectors
                            instead of learned per-character embeddings
            model_name (str): path used to populate the saved model
            load_model (bool): if True, load parameters from model_name; set to
                               True for fine-tuning or encoding
            train_file (str): training-data path handed to read_train
            val_file (str): optional validation-data path handed to read_data
        """
        self.model_name = model_name
        self.model = dy.ParameterCollection()
        self.panphon = panphon

        if self.panphon:
            # Linear projection from the fixed panphon feature size to embed_size.
            self.ft = pp.FeatureTable()
            self.ws_panphon = self.model.add_parameters((embed_size, VEC_SIZE))
            self.bs_panphon = self.model.add_parameters((embed_size))
        else:
            # Vocabularies grow lazily: an unseen symbol gets the next free id.
            self.source_vocab = defaultdict(lambda: len(self.source_vocab))
            self.target_vocab = defaultdict(lambda: len(self.target_vocab))
            # NOTE(review): both vocabs are empty here, so the lookup tables are
            # created with 0 rows -- confirm read_train populates them first or
            # that the tables are rebuilt afterwards.
            self.source_lookup = self.model.add_lookup_parameters((len(self.source_vocab), embed_size))
            self.target_lookup = self.model.add_lookup_parameters((len(self.target_vocab), embed_size))

        self.training_data = self.read_train(train_file)
        if val_file:
            self.validation_data = self.read_data(val_file)

        # Each direction receives half the hidden size.  Use integer division:
        # under Python 3, hidden_size / 2 is a float, which LSTMBuilder's
        # dimension arguments do not accept.
        half_hidden = hidden_size // 2
        self.source_lstm_forward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)
        self.source_lstm_backward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)
        self.target_lstm_forward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)
        self.target_lstm_backward = dy.LSTMBuilder(1, embed_size, half_hidden, self.model)

        # load model only if flag is true. will overwrite existing model if flag is false. set flag to True for fine-tuning or encoding
        if load_model:
            self.model.populate(self.model_name)
            print("Populated! " + self.model_name)

        print('done')
Example #2
0
    def __init__(self, code, preproc=True, postproc=True, ligatures=False,
                 rev=False, rev_preproc=True, rev_postproc=True):
        """Constructs the backend object epitran uses for most languages

        Args:
            code (str): ISO 639-3 code and ISO 15924 code joined with a hyphen
            preproc (bool): if True, apply preprocessor
            postproc (bool): if True, apply postprocessors
            ligatures (bool): if True, use phonetic ligatures for affricates
                              instead of standard IPA
            rev (bool): if True, load reverse transliteration
            rev_preproc (bool): if True, apply preprocessor when reverse transliterating
            rev_postproc (bool): if True, apply postprocessor when reverse transliterating
        """
        self.rev = rev
        # Forward grapheme-to-phoneme map; the scanning regex is derived from
        # its keys so the map and regex always stay in sync.
        self.g2p = self._load_g2p_map(code, False)
        self.regexp = self._construct_regex(self.g2p.keys())
        self.puncnorm = PuncNorm()
        # Panphon feature table and the number of features it defines.
        self.ft = panphon.FeatureTable()
        self.num_panphon_fts = len(self.ft.names)
        # Pre/post processors and diacritic stripping are language-specific.
        self.preprocessor = PrePostProcessor(code, 'pre', False)
        self.postprocessor = PrePostProcessor(code, 'post', False)
        self.strip_diacritics = StripDiacritics(code)
        self.preproc = preproc
        self.postproc = postproc
        self.ligatures = ligatures
        self.rev_preproc = rev_preproc
        self.rev_postproc = rev_postproc
        if rev:
            # Reverse-transliteration resources are loaded only when requested.
            self.rev_g2p = self._load_g2p_map(code, True)
            self.rev_regexp = self._construct_regex(self.rev_g2p.keys())
            self.rev_preprocessor = PrePostProcessor(code, 'pre', True)
            self.rev_postprocessor = PrePostProcessor(code, 'post', True)

        # Counter of unmatched inputs -- presumably for diagnostics; verify
        # against the methods that update it.
        self.nils = defaultdict(int)
Example #3
0
    def __init__(self, incl_stress=True, incl_syllables=True):
        """Load the pronunciation rows at self.PATH and normalize them to IPA.

        Args:
            incl_stress (bool): keep stress markers; if False, strip "1"
            incl_syllables (bool): keep syllable markers; if False, strip "-"
        """
        def preproc(row):
            """Normalize one whitespace-separated row of segments."""
            row = row.split()

            # List comprehensions instead of map(): on Python 3 a map object
            # cannot be indexed or mutated as the stress-moving branch requires.
            if not incl_stress:
                row = [seg.replace("1", "") for seg in row]
            if not incl_syllables:
                row = [seg.replace("-", "") for seg in row]

            if incl_stress and incl_syllables:
                # Then let's move the stress to the beginning of its syllable.
                # NOTE(review): insert() shifts the indices of later elements
                # relative to the enumerate over the copy, so with multiple
                # stresses per row the replacement index may drift -- confirm
                # rows carry at most one stress marker.
                last_syl_marker_idx = -1
                for idx, unit in enumerate(row[:]):
                    last_syl_marker_idx = idx if unit == self.SYLL else last_syl_marker_idx
                    if self.STRESS in unit:
                        row[idx] = unit.replace(self.STRESS, "")
                        row.insert(last_syl_marker_idx + 1, self.STRESS)

            # Map each segment to IPA where a mapping exists; pass through otherwise.
            return " ".join([
                self.ipa_map[segment] if segment in self.ipa_map else segment
                for segment in row
            ])

        # Context manager closes the data file (the original
        # open(...).readlines() leaked the handle).
        with open(self.PATH) as data_file:
            raw_rows = data_file.readlines()
        self.rows = [preproc(row) for row in raw_rows]

        self._phonemes = self.get_phonemes()
        self.feature_table = panphon.FeatureTable()
Example #4
0
 def __init__(self, code):
     """Load the transliteration resources for *code*.

     Builds the grapheme-to-phoneme map, its scanning regex, punctuation
     normalization data, a panphon feature table, and the preprocessor.
     """
     self.g2p = self._load_g2p_map(code)
     self.regexp = self._construct_regex()
     self.puncnorm = self._load_punc_norm_map()
     # NOTE(review): on Python 3, dict.values() is a live view over
     # self.puncnorm (it reflects later mutation) -- confirm that is intended.
     self.puncnorm_vals = self.puncnorm.values()
     self.ft = panphon.FeatureTable()
     self.num_panphon_fts = len(self.ft.names)
     self.preprocessor = PrePostProcessor(code, 'pre')
Example #5
0
    def __init__(self, infile=sys.stdin):
        """Validate Unicode IPA from file relative to panphon database.

        infile -- File from which input is taken; by default, STDIN.
        """
        # Delimiters: comma, quote, and whitespace.  NOTE(review): re.V1 is a
        # flag of the third-party `regex` package (presumably imported as
        # `re`); the stdlib re module has no V1 flag -- confirm the import.
        self.ws_punc_regex = re.compile(r'[," \t\n]', re.V1 | re.U)
        self.ft = panphon.FeatureTable()
        # Validation runs eagerly at construction time.
        self._validate_file(infile)
Example #6
0
def main(infiles, output):
    """Accumulate the phoneme space over *infiles* and write it via print_space.

    Args:
        infiles (iterable): paths of the input files to scan
        output: destination handed to print_space
    """
    flite = epitran.flite.Flite()
    ft = panphon.FeatureTable()
    space = Counter()
    for fn in infiles:
        # Lazy %-style logging arguments.  The original eagerly formatted and
        # then .encode('utf-8')-ed, which logs a b'...' repr under Python 3.
        logging.debug(u'Scanning:\t%s', fn)
        space.update(add_file(flite, ft, fn))
    print_space(output, space)
Example #7
0
def main(code, op, infiles, output):
    """Accumulate the phoneme space over *infiles* and write it via print_space.

    Args:
        code (str): epitran language-script code
        op: if truthy, scan with add_file_op; otherwise with add_file_gen
        infiles (iterable): paths of the input files to scan
        output: destination handed to print_space
    """
    epi = epitran.Epitran(code)
    ft = panphon.FeatureTable()
    space = Counter()
    # The reader function depends only on `op`: choose it once, outside the
    # loop (the original re-evaluated the conditional on every file).
    add_file = add_file_op if op else add_file_gen
    for fn in infiles:
        # Lazy %-style logging; avoids the Python 3 bytes-repr of .encode().
        logging.debug(u'Scanning:\t%s', fn)
        space.update(add_file(epi, ft, fn))
    print_space(output, space)
Example #8
0
def main(fn):
    """Print the number of distinct phones in *fn* and their sorted X-SAMPA forms.

    Args:
        fn (str): path to a CSV of (orthography, phonemic form) rows with a
                  header line, opened in binary mode.
    """
    ft = panphon.FeatureTable()
    xs = epitran.xsampa.XSampa()
    with open(fn, 'rb') as f:
        # NOTE(review): stdlib csv.reader has no `encoding` kwarg; this
        # presumably relies on the unicodecsv package imported as csv -- confirm.
        reader = csv.reader(f, encoding='utf-8')
        next(reader)  # skip the header row
        phones = set()
        for orth, phon in reader:
            # In-place update avoids allocating a fresh set per row
            # (the original used phones = phones.union(...)).
            phones.update(ft.segs_safe(phon))
    print(len(phones))
    # sorted() accepts any iterable and returns a list; the extra list() the
    # original wrapped around map() was redundant.
    print(sorted(map(xs.ipa2xs, phones)))
	def __init__(self, model, C2I):
		"""Register embedding tables for feature-aware character encoding.

		Args:
			model: parameter collection to register lookup parameters on
			C2I (dict): character-to-index map sizing the char embedding table
		"""
		self.model = model
		self.C2I = C2I
		# One 22-row table per feature polarity (+ / - / not relevant);
		# 22 is presumably the panphon feature count -- confirm.
		self.E_plus = model.add_lookup_parameters((22, EMBEDDING_SIZE))
		self.E_minus = model.add_lookup_parameters((22, EMBEDDING_SIZE))
		self.E_not_relevant = model.add_lookup_parameters((22, EMBEDDING_SIZE))
		# Plain character embeddings, one row per symbol in C2I.
		self.E = model.add_lookup_parameters((len(C2I), EMBEDDING_SIZE))
		self.ft = panphon.FeatureTable()
		# Language embeddings: six language tags plus a separator symbol.
		self.E_lang = model.add_lookup_parameters((7, EMBEDDING_SIZE))
		self.langs = ["s", "i", "r", "f", "p", "l", "sep"]
		# Projects the concatenation of two embeddings back to EMBEDDING_SIZE.
		self.W_combine = model.add_parameters((EMBEDDING_SIZE, 2 * EMBEDDING_SIZE))
Example #10
0
    def __init__(self, filename, panphon=False):
        """Read *filename*, using panphon features or learned character ids.

        Args:
            filename (str): data file handed to read_data
            panphon (bool): if True, use a panphon feature table; otherwise
                            build lazily-growing source/target vocabularies
        """
        self.panphon = panphon
        if panphon:
            # Feature mode: segments are represented by articulatory features,
            # so no character-to-int function is needed.
            self.ft = pp.FeatureTable()
            self.data = self.read_data(filename, None)
        else:
            # Vocabulary mode: an unseen symbol gets the next free id, and the
            # UNK symbol is forced into both vocabularies up front.
            self.source_vocab = defaultdict(lambda: len(self.source_vocab))
            self.target_vocab = defaultdict(lambda: len(self.target_vocab))
            self.source_vocab[UNK]
            self.target_vocab[UNK]
            self.data = self.read_data(filename, self.char2int)
Example #11
0
def main():
    """Partition the wav/phone mapping into train/dev/test splits (70/10/20)."""
    feature_table = panphon.FeatureTable()
    with open('mapping.yml') as handle:
        # NOTE: yaml.Loader can construct arbitrary Python objects; only use
        # on trusted input (switching to safe_load could change behavior).
        mapping = yaml.load(handle, Loader=yaml.Loader)

    # Group wav files by their tone-diacritic transcription.
    wavs_by_phon = defaultdict(list)
    for wav, phon in mapping.items():
        wavs_by_phon[phon['tone_dias']].append(wav)

    # Fixed seed makes the shuffle -- and thus the splits -- reproducible.
    phons = list(wavs_by_phon.keys())
    random.seed(256)
    random.shuffle(phons)

    first_cut = int(0.7 * len(phons))
    second_cut = int(0.8 * len(phons))
    serialize_partition('train', mapping, phons[:first_cut], feature_table, wavs_by_phon)
    serialize_partition('dev', mapping, phons[first_cut:second_cut], feature_table, wavs_by_phon)
    serialize_partition('test', mapping, phons[second_cut:], feature_table, wavs_by_phon)
Example #12
0
    def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
        """Construct a Flite "wrapper"

        Args:
            arpabet (str): file containing ARPAbet to IPA mapping
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA
            cedict_file (str): path to CC-CEDict dictionary (accepted only for
                               interface compatibility; not used here)
        """
        # Resolve the ARPAbet CSV bundled with the package, then load the map.
        csv_relpath = os.path.join('data', arpabet + '.csv')
        arpabet = pkg_resources.resource_filename(__name__, csv_relpath)
        self.arpa_map = self._read_arpabet(arpabet)

        # chunk_re splits text into alternating letter / non-letter runs
        # (apostrophes count as letters); letter_re recognizes letter runs.
        self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
        self.letter_re = re.compile(r"[A-Za-z'’]+")

        self.puncnorm = PuncNorm()
        self.ligatures = ligatures
        self.ft = panphon.FeatureTable()
Example #13
0
    def __init__(self, arpabet='arpabet', ligatures=False, **kwargs):
        """Construct a Flite "wrapper"

        Args:
            arpabet (str): file containing ARPAbet to IPA mapping
            ligatures (bool): if True, use non-standard ligatures instead of
                              standard IPA
            **kwargs: accepted and ignored, for interface compatibility
        """
        # Locate the packaged ARPAbet-to-IPA CSV and parse it.
        csv_relpath = os.path.join('data', arpabet + '.csv')
        arpabet = pkg_resources.resource_filename(__name__, csv_relpath)
        self.arpa_map = self._read_arpabet(arpabet)

        # chunk_re splits text into letter / non-letter runs (apostrophes
        # count as letters); letter_re matches letter runs; regexp matches a
        # single ASCII letter.
        self.chunk_re = re.compile(r"([A-Za-z'’]+|[^A-Za-z'’]+)", re.U)
        self.letter_re = re.compile(r"[A-Za-z'’]+")
        self.regexp = re.compile(r'[A-Za-z]')

        self.puncnorm = PuncNorm()
        self.ligatures = ligatures
        self.ft = panphon.FeatureTable()
        self.num_panphon_fts = len(self.ft.names)
Example #14
0
 def setUp(self):
     """Create the panphon feature-table and X-SAMPA converter fixtures."""
     self.ft = panphon.FeatureTable()
     self.xs = panphon.xsampa.XSampa()
Example #15
0
 def __init__(self):
     """Construct an IPA-XSampa conversion object
     """
     # Trie mapping IPA strings to X-SAMPA, loaded from package data.
     self.trie = self._read_ipa2xs()
     self.ft = panphon.FeatureTable()
def acousticArrayValuesNeg(word):
    """Sum, per feature, the negative feature values over *word*'s segments.

    Positive entries contribute nothing; the result is one non-positive
    total per phonological feature.
    """
    feature_names = ['syl', 'son', 'cons', 'cont', 'delrel', 'lat', 'nas', 'strid', 'voi', 'sg', 'cg', 'ant', 'cor', 'distr', 'lab', 'hi', 'lo', 'back', 'round', 'velaric', 'tense', 'long']
    table = panphon.FeatureTable()
    feature_matrix = table.word_array(feature_names, word)
    # clip(max=0) zeroes the positive entries, equivalent to (x < 0) * x.
    return feature_matrix.clip(max=0).sum(axis=0)
Example #17
0
 def setUp(self):
     """Build the distance and feature-table fixtures shared by the tests."""
     # feature_model comes from module scope and configures Distance.
     self.dist = distance.Distance(feature_model=feature_model)
     self.ft = panphon.FeatureTable()
def phonemeCount(word):
    """Return the number of phonological segments panphon finds in *word*."""
    feature_names = ['syl', 'son', 'cons', 'cont', 'delrel', 'lat', 'nas', 'strid', 'voi', 'sg', 'cg', 'ant', 'cor', 'distr', 'lab', 'hi', 'lo', 'back', 'round', 'velaric', 'tense', 'long']
    table = panphon.FeatureTable()
    # word_array yields one row per segment, so the row count is the answer.
    segment_matrix = table.word_array(feature_names, word)
    return segment_matrix.shape[0]
"""Script to make ground truth phonological feature representations.

$ python src/features.py
"""

import os

import fire
import pandas as pd
import panphon
from tqdm import tqdm

from utils import write
from wikipron import LANGUAGES, load_inventory

# Module-level feature table, shared by all calls to feature_vector().
FEATURES = panphon.FeatureTable()


def feature_vector(phoneme):
    """Return the numeric panphon feature vector of *phoneme*'s first segment."""
    segments = FEATURES.word_fts(phoneme)
    return segments[0].numeric()


def create_features(language):
    """Build ground-truth phonological feature vectors for *language*.

    Loads the language's phoneme inventory, computes one panphon feature
    vector per phoneme, and assembles a DataFrame with phonemes as rows and
    feature names as columns.
    """
    inventory = load_inventory(language)
    phonemes = inventory["Phoneme"]
    representations = {
        phoneme: feature_vector(phoneme)
        for phoneme in phonemes
    }
    features = pd.DataFrame(representations, index=FEATURES.names).T
    # NOTE(review): the output directory is created but no write of
    # `features` is visible in this chunk -- presumably serialization
    # follows; confirm against the full file.
    os.makedirs(f"data/phoneme/features/{language}", exist_ok=True)
Example #20
0
import panphon
import difflib
import unicodedata
# Shared panphon feature table used by the segmentation checks below.
ft = panphon.FeatureTable()


def prefilter(string):
    """Normalize a transcription before segmentation.

    Devoiced d̥ / ɡ̥ / b̥ are rewritten to t / k / b, and the ASCII
    apostrophe becomes the modifier letter ʼ.
    """
    replacements = (
        ('d̥', "t"),
        ("ɡ̥", "k"),
        ("b̥", "b"),
        ("'", "ʼ"),
    )
    for old, new in replacements:
        string = string.replace(old, new)
    return string


# Smoke-test input and an accumulator for segments that fail to parse.
s = u"thi"
errorlist = []

# Segment the sample string.  NOTE(review): despite the name, ipa_segs
# presumably returns a list of segment strings, not a dict -- confirm.
segdict = ft.ipa_segs(s)
# Full pile of attested segments, and the subset previously found problematic.
segpile = u"ɪaaːăbʲbʷb̞b̥cddʒdʲdːd̚d̥d͡zd͡ʑd͡ʒd͡ʒːeeːe̞ffʲfʷfːɡɡʲɡʷɡːɡ̟ʲhhʷiiːi̞i̥i̯jkk'kxkʰkʲkʷkʷ'kːk̟ʲk̟̚k͡p̚llʲlːmmʲmʷmːnnʲnːn̺ooːo̞o̥pp'pfpʰpʲpʷpːp̚rrːssʲsːtt'tstsʰtɕtɕʰtʃtʰtʲtʷ'tːt̚t̪t̪ʰt̪̚t͡st͡sʼt͡ɕt͡ɬt͡ʃt͡ʃʲt͡ʃʼt͡ʃːuuəuːvvʲvʷvːv̞v̞ʲwxyzzʲäæçðøŋŋ̟ŋ͡mœɐɐ̞ɑɓɔɕɕːɗəɛɟɡɡ̥ɣɤɤɐ̞ɤ̆ɥɦɨɪɫɯɯ̟ɯ̥ɰɱɲɴɸɹɹ̩ɻɻ̩ɽɾɾʲɾ̠ʀʂʃʃʲːʊʋʋʲʌʎʏʐʑʒʒ͡ɣʔʝββ̞θχḁ"
segpilebad = u"ăb̥d̚d̥d͡zd͡ʑd͡ʒd͡ʒːeeːe̞ɡɡʲɡʷɡːɡ̟ʲhhʷk'kxkʰkʲkʷkʷ'k̟ʲk̟̚k͡p̚llʲlːmmʲmʷmːnnʲnːn̺ooːo̞o̥pp'pfpʰpʲpʷpːp̚rrːssʲsːtt'tstsʰtɕtɕʰtʃtʰtʲtʷ'tːt̚t̪t̪ʰt̪̚t͡st͡sʼt͡ɕt͡ɬt͡ʃt͡ʃʲt͡ʃʼt͡ʃːuuəuːvvʲvʷvːv̞v̞ʲwxyzzʲäæçðøŋŋ̟ŋ͡mœɐɐ̞ɑɓɔɕɕːɗəɛɟɡɡ̥ɣɤɤɐ̞ɤ̆ɥɦɨɪɫɯɯ̟ɯ̥ɰɱɲɴɸɹɹ̩ɻɻ̩ɽɾɾʲɾ̠ʀʂʃʃʲːʊʋʋʲʌʎʏʐʑʒʒ͡ɣʔʝββ̞θχḁ"
seglist = [
    u"ɪ", u"a", u"aː", u"ă", u"b", u"bʲ", u"bʷ", u"bː", u"b̞", u"b̥", u"c",
    u"d", u"dʒ", u"dʲ", u"dː", u"d̚", u"d̥", u"d͡z", u"d͡ʑ", u"d͡ʒ", u"d͡ʒː",
    u"e", u"eː", u"e̞", u"f", u"fʲ", u"fʷ", u"fː", u"ɡ", u"ɡʲ", u"ɡʷ", u"ɡː",
    u"ɡ̟ʲ", u"h", u"hʷ", u"i", u"iː", u"i̞", u"i̥", u"i̯", u"j", u"k", u"k'",
    u"kx", u"kʰ", u"kʲ", u"kʷ", u"kʷ'", u"kː", u"k̟ʲ", u"k̟̚", u"k͡p̚", u"l",
    u"lʲ", u"lː", u"m", u"mʲ", u"mʷ", u"mː", u"n", u"nʲ", u"nː", u"n̺", u"o",
    u"oː", u"o̞", u"o̥", u"p", u"p'", u"pf", u"pʰ", u"pʲ", u"pʷ", u"pː", u"p̚",
    u"r", u"rː", u"s", u"sʲ", u"sː", u"t", u"t'", u"ts", u"tsʰ", u"tɕ", u"tɕʰ",
    u"tʃ", u"tʰ", u"tʲ", u"tʷ'", u"tː", u"t̚", u"t̪", u"t̪ʰ", u"t̪̚", u"t͡s",
    u"t͡sʼ", u"t͡ɕ", u"t͡ɬ", u"t͡ʃ", u"t͡ʃʲ", u"t͡ʃʼ", u"t͡ʃː", u"u", u"uə",
Example #21
0
    def __init__(self):
        """Create the panphon feature table this object relies on."""
        self.feature_table = panphon.FeatureTable()
Example #22
0
class XYGram:
    """Compare strings across two languages via phonological-feature n-grams."""

    # Subset of panphon features used to describe each segment.
    features = [ 'syl', 'son', 'cont', 'nas', 'ant', 'cor', 'hi', 'lo', 'back' ]
    # Class-level feature table: built once and shared by all instances.
    ft       = panphon.FeatureTable()

    def __init__(self, lang1, lang2, max_offset=3, max_features=3):
        """Set up transliterators and n-gram bounds for the two languages.

        Args:
            lang1, lang2: epitran language codes for the two sides
            max_offset (int): bound on the sliding-window extent
            max_features (int): max number of features combined per segment
                                (capped at the size of XYGram.features)
        """
        self.epi          = (epitran.Epitran(lang1), epitran.Epitran(lang2))
        self.max_offset   = max_offset
        self.max_features = min(max_features, len(XYGram.features))

    def _allFeatureCombos(self, v):
        """Return all non-empty combinations of v up to size max_features."""
        result = []
        for r in range(1, self.max_features + 1):
            result += list(itertools.combinations(v, r))
        return result

    # lang: 1 or 2 based on which of the two languages
    def generateXYGram(self, s, lang):
        """Count feature-combination n-grams of *s* under language *lang*."""
        epi = self.epi[lang - 1]
        ft_vector = XYGram.ft.word_array(XYGram.features, epi.transliterate(s))

        # NOTE(review): the loops below index ft_vector by character positions
        # of *s*, but ft_vector has one row per IPA segment of the
        # transliteration; the two lengths need not agree -- confirm.
        d = {}
        for i in range(len(s)):
            for j in range(i + 1, min(i + self.max_offset, len(s) + 1)):
                fv = ft_vector[i:j]
                # Per segment row: indices of the non-negative feature values.
                tmp1 = [ [ k for k, x in enumerate(v) if x >= 0 ] for v in fv ]
                tmp2 = [ self._allFeatureCombos(v) for v in tmp1 ]
                # The cross-product over segment positions forms the n-gram keys.
                keys = list(itertools.product(*tmp2))
                for k in keys:
                    d[k] = d.get(k, 0) + 1
        return d

    # Prereq: v1, v2 int lists of equal length
    def cosineSimilarity(self, v1, v2):
        """Cosine similarity of two equal-length vectors; ValueError otherwise."""
        if (len(v1) != len(v2)):
            raise ValueError
        return 1 - spatial.distance.cosine(v1, v2)

    # Prereq: v1, v2 int lists of equal length
    def jaccardSimilarity(self, v1, v2):
        """Jaccard similarity of two equal-length vectors; ValueError otherwise."""
        # NOTE(review): sklearn's jaccard_similarity_score was removed in
        # modern releases (renamed jaccard_score) -- confirm the pinned version.
        if (len(v1) != len(v2)):
            raise ValueError
        return jaccard_similarity_score(v1, v2)

    def compareXYGram(self, xy1, xy2):
        """Jaccard-compare two n-gram count dicts over the union of their keys."""
        # Vectorize dictionaries with same keys
        v1 = []
        v2 = []

        k1 = set(xy1.keys())
        k2 = set(xy2.keys())
        k  = k1.union(k2)
        for key in k:
            v1.append(xy1.get(key, 0))
            v2.append(xy2.get(key, 0))

        return self.jaccardSimilarity(v1, v2)

    def compareRaw(self, s1, s2):
        """Generate XY-grams for both strings (one per language) and compare."""
        xy1 = self.generateXYGram(s1, 1)
        xy2 = self.generateXYGram(s2, 2)
        return self.compareXYGram(xy1, xy2)
Example #23
0
 def setUp(self):
     """Create the panphon feature-table fixture used by the tests."""
     self.ft = panphon.FeatureTable()