# Helper: parse an ini string and run it through commit.validate.
def v(ini_str):
    c = iphon_configparser.parser()
    c.read_string(string=ini_str)
    return commit.validate(c)
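# Hypothetical usage sketch (not part of the original file): the section and key
# names below are taken from the ini files built elsewhere in this repo, but the
# exact set of fields commit.validate requires is an assumption here.
#
#     ok = v('\n'.join([
#         '[core]',
#         'name: Example',
#         'glottocode: abcd1234',
#         '[phonemes]',
#         'p',
#         't',
#         'k',
#     ]))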
if args.bibkey:
    ref_info.update(from_bibkey(args.glottocode, args.bibkey))
if args.sil_pacific:
    ref_info.update(from_sil_pacific(args.sil_pacific))
if args.phon_invs_tb:
    args.simple = True
    args.bibkey = 'hh:hld:Namkung:Tibeto-Burman'
    ref_info.update(from_bibkey(args.glottocode, args.bibkey))
    ref_info['url'] = 'https://stedt.berkeley.edu/pubs_and_prods/STEDT_Monograph3_Phonological-Inv-TB.pdf'
    ref_info['author'] = 'Namkung, Ju'

ini_path = find_path(args.glottocode)

# build the file
ini = iphon_configparser.parser()
ini['core'] = { 'name': args.name
              , 'glottocode': args.glottocode
              , 'dialect': 'OPTIONAL'
              , 'dialect_name': args.dialect_name }
ini['source'] = { 'glottolog': (args.bibkey if args.bibkey else "IDEAL")
                , 'url': maybe(ref_info, 'url', "IDEAL")
                , 'doi': maybe(ref_info, 'doi', "OPTIONAL")
                , 'author': maybe(ref_info, 'author', "OPTIONAL (but REQUIRED if there's no glottolog ID)")
                , 'title': maybe(ref_info, 'title', "OPTIONAL (but REQUIRED if there's no glottolog ID)")
                , 'publisher': maybe(ref_info, 'publisher', "OPTIONAL")
                , 'volume': maybe(ref_info, 'volume', "OPTIONAL")
    if len(phonemes) == 0 or phonemes[0] == 'required':
        raise MissingPropertyError('No phonemes given')
    if 'y' in phonemes:
        print('Warning: /y/ listed in phonemes - make sure you don\'t mean /j/!')
    canonical_phonemes = [get_canonical(phoneme) for phoneme in phonemes]

    # -- Allophonic rules tests --
    for rule_raw in doculect['allophonic_rules']:
        rule = parse_allophonic_rule(rule_raw)
        for phoneme in rule['phonemes']:
            if phoneme not in canonical_phonemes:
                raise InvalidPropertyError(
                    'Phoneme {} in rule {} not listed as canonical in phonemes section'.format(
                        phoneme, rule))

    return True


if __name__ == '__main__':
    filename = sys.argv[1]
    doculect = iphon_configparser.parser()
    file_path = path.join('doculects', '{}.ini'.format(filename))
    if not path.isdir('doculects'):
        raise NotFoundError('Doculects directory not found - this script must be run from the main IPHON directory')
    if not path.isfile(file_path):
        raise NotFoundError('File not found')
    doculect.read(file_path, encoding='utf-8')
    validate(doculect)  # if it's invalid, this will throw an exception
    subprocess.run(['git', 'add', file_path])
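# Usage sketch (the script name "commit.py" is inferred from the commit.validate
# call elsewhere in the repo and may differ):
#
#     python commit.py <filename-without-.ini>
#
# Run from the main IPHON directory: it validates doculects/<filename>.ini and,
# if validation passes, stages the file with `git add`.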
def read_ini(path, sql):
    ini = iphon_configparser.parser()
    ini.read(path, encoding='utf-8')
    validate(ini)

    doculect = OrderedDict()
    doculect['inventory_id'] = os.path.split(path)[-1][:-4]  # get the filename minus the .ini
    doculect[DOCULECT_NAME_COL] = ini['core']['name']
    doculect['glottocode'] = ini['core']['glottocode']
    doculect['dialect'] = maybe(ini['core'], 'dialect')
    doculect['dialect_name'] = maybe(ini['core'], 'dialect_name')
    doculect['notes'] = '\n'.join(maybe(ini, 'notes', []))
    if doculect['notes'] == '':
        doculect['notes'] = None
    doculect['source_bibkey'] = maybe(ini['source'], 'glottolog', filters=INI_DEFAULTS)
    doculect['source_url'] = maybe(ini['source'], 'url', filters=INI_DEFAULTS)
    doculect['source_author'] = maybe(ini['source'], 'author', filters=INI_DEFAULTS)
    doculect['source_title'] = maybe(ini['source'], 'title', filters=INI_DEFAULTS)
    doculect['source_publisher'] = maybe(ini['source'], 'publisher', filters=INI_DEFAULTS)
    doculect['source_volume'] = maybe(ini['source'], 'volume', filters=INI_DEFAULTS)
    doculect['source_number'] = maybe(ini['source'], 'number', filters=INI_DEFAULTS)
    doculect['source_year'] = maybe(ini['source'], 'year', filters=INI_DEFAULTS)
    doculect['source_pages'] = maybe(ini['source'], 'pages', filters=INI_DEFAULTS)
    doculect['source_doi'] = maybe(ini['source'], 'doi', filters=INI_DEFAULTS)
    if doculect['source_year'] == 'Unknown':
        doculect['source_year'] = None

    phonemes = list(ini['phonemes'])
    allophonic_rules = [parse_allophonic_rule(a) for a in ini['allophonic_rules']]

    # Start writing - first the doculect...
    insert('doculects', doculect, return_id=True, sql=sql)
    doculect_id = sql.fetchone()[0]

    # ...then the language, if necessary...
    language_id = find_or_create_language(doculect['glottocode'], sql=sql)

    # ...then the segments...
    # We'll handle featuralization later.
    # We'll also figure out how to store alternate forms later;
    # for now, we'll mostly mirror PSMITH's db structure,
    # and store non-canonical forms (that aren't listed as allophones) as conditionless allophonic rules.
    # (The eventual goal is to have everything in Haskell, so this is good enough for v0.1,
    # but it's not strictly correct.)
    # Another thing we'll figure out later is phoneme junctions in allophonic rules -
    # e.g. s+i > s̩ / unstressed.
    # These will be stored as:
    #   s > s̩ / unstressed (_+i)
    #   i > s̩ / unstressed (s+_)
    # This also isn't strictly correct, but it's good enough for now.
    # Rules with + in the output are just ignored, because I don't know how they should be handled,
    # and see above re: deadlines.
    # Maybe we want NoSQL for the final DB.
    doculect_segment_ids = {}
    for phoneme_txt in phonemes:
        phoneme = parse_phoneme(phoneme_txt)
        canonical_form_id = find_or_create_segment(phoneme['canonical_form'], sql=sql)
        noncanonical_forms = phoneme['noncanonical_forms']
        # OK, let's just do this as string processing to ensure consistency
        form_rule_str = '{} > {} / (non-canonical form - i.e. alternate representation of the phoneme)'
        form_rules = [
            parse_allophonic_rule(form_rule_str.format(phoneme['canonical_form'], noncanonical_form))
            for noncanonical_form in noncanonical_forms]
        allophonic_rules += form_rules

        # Then build the doculect_segments, and save them in a mapping of
        # phoneme -> doculect_segment_id for this doculect.
        insert(DOC_SEG_JOIN_TBL, OrderedDict({
            'doculect_id': doculect_id,
            'segment_id': canonical_form_id,
            'marginal': phoneme['marginal'],
            'loan': phoneme['loan'],
        }), return_id=True, sql=sql)
        doculect_segment_ids[phoneme['canonical_form']] = sql.fetchone()[0]

    # ...then the allophones...
    for allophonic_rule in allophonic_rules:
        serialized_rules = []
        # Note that 'variation' refers to whether the rule itself is obligatory or optional,
        # not whether there's variance in outputs (all of which are distinct from the input) of the rule.
        # That is, the presence of variation means the rule may output its input,
        # and the absence of variation means it may not.
        # That's the ideal, anyway. What +variation means here *in practice* is
        # the presence of a ~ right after the >.
        for allophone in allophonic_rule['allophones']:
            # Skip anything with + in the output because we can't handle that in v0.1
            if '+' in allophone:
                continue
            allophone_segment_id = find_or_create_segment(allophone, sql=sql)
            # If it's a compound, store the whole compound as text, I guess.
            compound = None
            if len(allophonic_rule['phonemes']) > 1:
                compound = '+'.join(allophonic_rule['phonemes'])
            for phoneme in allophonic_rule['phonemes']:
                rule = OrderedDict({
                    'doculect_segment_id': doculect_segment_ids[phoneme],
                    'allophone_id': allophone_segment_id,
                    'variation': allophonic_rule['rule_type'] == 'variant',
                    'compound': compound,
                    'environment': allophonic_rule['environment'],
                })
                insert('allophones', rule, sql=sql)
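# Illustrative note (an assumption, since parse_allophonic_rule isn't shown here):
# for a junction rule like "s+i > s̩ / unstressed" from the comments above, if
# rule['phonemes'] comes back as ['s', 'i'], the inner loop inserts one
# 'allophones' row per input phoneme, both pointing at the segment for s̩ and
# sharing the compound string 's+i'.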