def scrape_adverbs(filename):
    """Gerunds (lyap) and infinitives (tum).

    'tvA' gerunds come from SL_adverbs.xml.

    Walks the entries produced by ``scrape_utils.iter_xml(filename)`` and
    keeps only those that have a ``vu`` child.

    Returns CSV text: the header line ``name,root,pos,modification``
    followed by one line per kept entry, joined with newlines.

    Raises ``Exception`` if the tag found under ``iv`` is neither ``abs``
    nor ``inf``.
    """
    labels = ['name', 'root', 'pos', 'modification']
    row_template = ','.join('{%s}' % label for label in labels)
    # Tag found under <iv> -> part-of-speech name written to the CSV.
    pos_names = {'abs': 'gerund', 'inf': 'infinitive'}

    lines = [','.join(labels)]
    for entry in scrape_utils.iter_xml(filename):
        vu = entry.find('vu')
        if vu is None:
            continue

        name = entry.attrib['form']
        root = entry.find('s').attrib['stem']
        # `trans` is a lookup defined outside this function.
        modification = trans[vu.find('cj')[0].tag]

        huet_pos = vu.find('iv')[0].tag
        if huet_pos not in pos_names:
            raise Exception("Unknown POS %s" % huet_pos)

        lines.append(row_template.format(
            name=name,
            root=root,
            pos=pos_names[huet_pos],
            # A falsy modification is written as an empty CSV field.
            modification=modification or '',
        ))
    return '\n'.join(lines)
def scrape_adverbs(filename):
    """Gerunds (ktvA).

    Infinitives (tum) come from SL_final.xml, and other indeclinables come
    from the MW data.

    Walks the entries produced by ``scrape_utils.iter_xml(filename)``,
    keeps those that carry a non-empty ``ab`` child and whose form ends in
    ``vA`` or ``ya``, and labels every kept entry as a gerund.

    Returns CSV text: the header line ``name,root,pos,modification``
    followed by one line per kept entry, joined with newlines.
    """
    labels = ['name', 'root', 'pos', 'modification']
    format_str = ','.join('{%s}' % x for x in labels)

    output = [','.join(labels)]
    for xml in scrape_utils.iter_xml(filename):
        # Only gerunds. The checks are spelled out instead of `not ab`:
        # truth-testing an Element means "has no children", which is easy
        # to misread as "not found" and is deprecated in ElementTree.
        ab = xml.find('ab')
        if ab is None or len(ab) == 0:
            continue

        name = xml.attrib['form']
        root = xml.find('s').attrib['stem']
        # `trans` is a lookup defined outside this function.
        modification = trans[ab.find('cj')[0].tag]

        # Filter out e.g. "Asam"
        if not name.endswith(('vA', 'ya')):
            continue

        output.append(format_str.format(
            name=name,
            root=root,
            pos='gerund',
            # A falsy modification is written as an empty CSV field.
            modification=modification or '',
        ))
    return '\n'.join(output)
def scrape_adverbs(filename):
    """Gerunds (ktvA).

    Infinitives (tum) come from SL_final.xml, and other indeclinables come
    from the MW data.

    For each entry produced by ``scrape_utils.iter_xml(filename)`` that has
    a non-empty ``ab`` child and a form ending in ``vA`` or ``ya``, one CSV
    line is produced with ``pos`` fixed to ``gerund``.

    Returns the CSV text, starting with the header line
    ``name,root,pos,modification``; lines are joined with newlines.
    """
    labels = ['name', 'root', 'pos', 'modification']
    format_str = ','.join('{%s}' % x for x in labels)
    gerund_endings = ('vA', 'ya')

    output = [','.join(labels)]
    for xml in scrape_utils.iter_xml(filename):
        # Only gerunds. An Element with no children is falsy, so the
        # original `not ab` skipped both a missing and a childless <ab>;
        # state both conditions explicitly rather than relying on the
        # deprecated Element truth test.
        ab = xml.find('ab')
        if ab is None or len(ab) == 0:
            continue

        name = xml.attrib['form']
        root = xml.find('s').attrib['stem']
        # `trans` is a lookup defined outside this function.
        modification = trans[ab.find('cj')[0].tag]

        # Filter out e.g. "Asam"
        if not name.endswith(gerund_endings):
            continue

        row = {
            'name': name,
            'root': root,
            'pos': 'gerund',
            # A falsy modification is written as an empty CSV field.
            'modification': modification or '',
        }
        output.append(format_str.format(**row))
    return '\n'.join(output)
def scrape(filename):
    """Inflected verbs.

    Walks the entries produced by ``scrape_utils.iter_xml(filename)`` and
    builds one row per entry from its ``v`` child (tense system, person and
    number, modification) and its ``s`` child (root stem).

    Returns ``(labels, rows)`` where ``labels`` is the list of column names
    and each row is the tuple
    ``(form, root, class, person, number, mode, voice, modification)``.
    Class ``'11'`` is reported as ``'denom'``; forms without a class get
    ``''``.

    Entries whose tense tag is not one of ``prs``, ``tp``, ``pas`` or
    ``pef`` are dumped to stdout and skipped.
    """
    labels = ['form', 'root', 'class', 'person', 'number', 'mode', 'voice',
              'modification']
    rows = []

    for xml in scrape_utils.iter_xml(filename):
        v = xml.find('v')
        cj = v.find('cj')
        tense = v.find('sys')[0]
        person_number = v.find('np')
        s = xml.find('s')

        # Present system (present, imperfect, imperative, optative)
        if tense.tag == 'prs':
            vclass = tense.attrib.get('k')
            mode = trans[tense.find('md')[0].tag]
            voice = trans[tense[1].tag]
        # "Tense paradigm" (future, aorist, conditional, perfect,
        # injunctive, benedictive)
        elif tense.tag == 'tp':
            vclass = None
            mode = trans[tense[0].tag]
            voice = trans[tense[1].tag]
        # Passive
        elif tense.tag == 'pas':
            vclass = None
            mode = trans[tense.find('md')[0].tag]
            voice = trans[tense.tag]
        # Periphrastic future
        elif tense.tag == 'pef':
            vclass = None
            mode = trans[tense.tag]
            voice = trans[tense[0].tag]
        else:
            # Unknown tense tag: dump the entry for inspection and skip it.
            # Falling through would build a row from the previous entry's
            # class/mode/voice (or hit a NameError on the first entry).
            # The parenthesized single-argument form behaves the same as a
            # Python 2 print statement and a Python 3 print call.
            print(ET.tostring(xml))
            continue

        name = xml.attrib['form']
        root = s.attrib['stem']
        person = trans[person_number[1].tag]
        number = trans[person_number[0].tag]
        modification = trans[cj[0].tag]

        # Denominative
        if vclass == '11':
            vclass = 'denom'

        # For non-classed verb forms.
        vclass = vclass or ''

        rows.append((name, root, vclass, person, number, mode, voice,
                     modification))

    return labels, rows
def scrape(parts_file):
    """Participles.

    Walks the entries produced by ``scrape_utils.iter_xml(parts_file)``.
    For every ``pa`` child whose inflection translates to
    (gender 'm', case '1', number 's'), a stem is derived from the entry's
    form and one row is recorded.

    Returns ``(labels, rows)`` where each row is the tuple
    ``(stem, root, class, mode, voice, modification)``.
    """
    labels = ['stem', 'root', 'class', 'mode', 'voice', 'modification']
    rows = []

    for entry in scrape_utils.iter_xml(parts_file):
        form = entry.attrib['form']
        root = entry.find('s').attrib['stem']

        for participle in entry.findall('pa'):
            # Inflectional info: keep only the ('m', '1', 's') form.
            inflection = participle.find('na')
            case = trans[inflection[0].tag]
            number = trans[inflection[1].tag]
            gender = trans[inflection[2].tag]
            if not (gender == 'm' and case == '1' and number == 's'):
                continue

            # Morphological info (stem)
            modification = trans[participle.find('cj')[0].tag]
            mode_elem = participle.find('no')[0]
            mode, voice = trans[mode_elem.tag]

            # Only the present active carries a verb class. For present,
            # future and perfect actives the voice is re-read from a child
            # element; every other combination keeps the default voice.
            vclass = None
            if voice == 'active':
                if mode == 'pres':
                    vclass = mode_elem[0].text
                    voice = trans[mode_elem[1].tag]
                elif mode in ('fut', 'perf'):
                    voice = trans[mode_elem[0].tag]

            # '11', '12', and '13' refer to "modified" verb classes. We can
            # just discard these.
            if vclass and modification is not None:
                vclass = None

            # Construct stem
            if form[-1] == 's':
                stem = form[:-1]
            elif form.endswith('an'):
                # -an -> -at
                stem = form[:-1] + 't'
            elif mode == 'perf':
                # -vAn -> -vas
                stem = form[:-2] + 'as'
            elif mode == 'past':
                # -vAn -> -vat
                stem = form[:-2] + 'at'
            else:
                # Encoding error, but high recall is OK.
                stem = form

            rows.append((stem, root, vclass, mode, voice, modification))

    return labels, rows
def scrape(parts_file):
    """Participles.

    Walks the entries produced by ``scrape_utils.iter_xml(parts_file)``.
    For every ``pa`` child whose inflection translates to
    (gender 'm', case '1', number 's'), a stem is derived from the entry's
    form and one row is recorded.

    Returns ``(labels, rows)`` where each row is the tuple
    ``(stem, root, class, mode, voice, modification)``.
    """
    labels = ['stem', 'root', 'class', 'mode', 'voice', 'modification']
    rows = []

    for xml in scrape_utils.iter_xml(parts_file):
        form = xml.attrib['form']
        root = xml.find('s').attrib['stem']

        for pa in xml.findall('pa'):
            # Inflectional info
            na = pa.find('na')
            case = trans[na[0].tag]
            number = trans[na[1].tag]
            gender = trans[na[2].tag]
            if (gender, case, number) != ('m', '1', 's'):
                continue

            # Morphological info (stem)
            modification = trans[pa.find('cj')[0].tag]
            mode_elem = pa.find('no')[0]
            mode, voice = trans[mode_elem.tag]

            if (mode, voice) == ('pres', 'active'):
                vclass = mode_elem[0].text
                voice = trans[mode_elem[1].tag]
            elif (mode, voice) in [('fut', 'active'), ('perf', 'active')]:
                vclass = None
                voice = trans[mode_elem[0].tag]
            else:
                vclass = None
                # voice = default

            # '11', '12', and '13' refer to "modified" verb classes. We can
            # just discard these.
            if vclass and modification is not None:
                vclass = None

            # Construct stem
            if form[-1] == 's':
                stem = form[:-1]
            elif form.endswith('an'):
                stem = form[:-1] + 't'  # -an -> -at
            elif mode == 'perf':
                stem = form[:-2] + 'as'  # -vAn -> -vas
            elif mode == 'past':
                stem = form[:-2] + 'at'  # -vAn -> -vat
            else:
                # Encoding error, but high recall is OK.
                stem = form

            rows.append((stem, root, vclass, mode, voice, modification))

    # Hand the collected data back to the caller; without this the function
    # built `rows` and then implicitly returned None.
    return labels, rows
def scrape_adverbs(filename):
    """Gerunds (lyap) and infinitives (tum).

    'tvA' gerunds come from SL_adverbs.xml.

    Each entry produced by ``scrape_utils.iter_xml(filename)`` that has a
    ``vu`` child becomes one CSV line; entries without one are skipped.

    Returns the CSV text, starting with the header line
    ``name,root,pos,modification``; lines are joined with newlines.

    Raises ``Exception`` if the tag found under ``iv`` is neither ``abs``
    nor ``inf``.
    """
    columns = ['name', 'root', 'pos', 'modification']
    line_format = ','.join('{%s}' % column for column in columns)

    csv_lines = [','.join(columns)]
    for item in scrape_utils.iter_xml(filename):
        vu = item.find('vu')
        if vu is None:
            continue

        record = {}
        record['name'] = item.attrib['form']
        record['root'] = item.find('s').attrib['stem']
        # `trans` is a lookup defined outside this function; a falsy
        # result is written as an empty CSV field.
        record['modification'] = trans[vu.find('cj')[0].tag] or ''

        huet_pos = vu.find('iv')[0].tag
        if huet_pos == 'inf':
            record['pos'] = 'infinitive'
        elif huet_pos == 'abs':
            record['pos'] = 'gerund'
        else:
            raise Exception("Unknown POS %s" % huet_pos)

        csv_lines.append(line_format.format(**record))
    return '\n'.join(csv_lines)