def scrape(xml_path):
    """Scrape verb prefixes from the MW dictionary.

    Considers entries whose <key1> headword ends in 'kf' or 'BU' and whose
    <key2> contains a <root> element. Whatever text is left in <key2> once
    the root and markup are removed is taken as the prefix, labelled
    'cvi', 'DAc' or 'other', de-duplicated by name, sorted with
    `util.key_fn` and printed as CSV with columns 'name', 'prefix_type'.

    :param xml_path: path to the MW XML file, handed to `util.iter_mw_xml`.
    """
    upasargas = set(UPASARGAS.splitlines())
    labels = ['name', 'prefix_type']
    regexp = 'root'

    rows = []
    for xml in util.iter_mw_xml(xml_path, regexp=regexp):
        entry = xml.find('h/key1').text
        if not entry.endswith(('kf', 'BU')):
            continue

        # A root is prefixed iff it has a <root> element. Any matches
        # without one are almost certainly nominals, which we can disregard.
        key2 = xml.find('h/key2')
        root = key2.find('.//root')
        if root is None:
            continue

        # Remove lingering XML. `clear()` empties the <root> element (its
        # text, tail, attributes and children), and unsetting the tag makes
        # the serializer emit only the contents of <key2>.
        root.clear()
        key2.tag = None
        name = ET.tostring(key2)
        name = re.sub('(<.*?>)|/', '', name)

        # Remove groups ending in upasargas.
        splits = [x for x in name.split('-') if x]
        if not splits:
            # Nothing but the root was present, so there is no prefix text
            # to classify; indexing `splits[-1]` would raise IndexError.
            continue
        last = splits[-1]
        if last in upasargas or make_tidy(last) in upasargas:
            continue

        name = ''.join(splits)

        # 'sampra' is suggested as a prefix. This is wrong.
        if name == 'sampra':
            continue

        # Add prefixes to the proper category, keyed on the final letter.
        if name[-1] in ('I', 'U'):
            prefix_type = 'cvi'
        elif name.endswith('A'):
            prefix_type = 'DAc'
        else:
            prefix_type = 'other'

        rows.append((name, prefix_type))

    rows = util.unique(rows, lambda x: x[0])
    rows.sort(key=lambda x: util.key_fn(x[0]))
    # Parenthesized so the line parses under both Python 2 and Python 3.
    print(util.make_csv_string(labels, rows))
def scrape(xml_path):
    """Scrape indeclinables from the MW dictionary.

    Collects the <key1> headword of every entry that `util.iter_mw_xml`
    yields for the indeclinable <lex> pattern, sorts the words with
    `util.key_fn` and prints them as a one-column CSV ('name').

    :param xml_path: path to the MW XML file, handed to `util.iter_mw_xml`.
    """
    labels = ['name']
    # Raw string: `\s` must reach the regex engine untouched, and a plain
    # literal with an unknown escape triggers a warning on modern Python.
    regexp = r'body>\s*<lex>ind'

    # Each row is a single-element list, as the CSV helper receives rows.
    rows = [[xml.find('h/key1').text]
            for xml in util.iter_mw_xml(xml_path, regexp=regexp)]

    rows.sort(key=lambda x: util.key_fn(x[0]))
    # Parenthesized so the line parses under both Python 2 and Python 3.
    print(util.make_csv_string(labels, rows))
def scrape(xml_path):
    """Scrape nouns and adjectives from the MW dictionary.

    For each entry yielded by `util.iter_mw_xml`, reads the first <lex>
    element (directly under <body>, else under <body>/<p>), reduces it to
    a bare lowercase gender code and, when that code is recognized, records
    the <key1> stem with its normalized genders. Unique (stem, genders)
    pairs are sorted with `util.key_fn` and printed as CSV.

    :param xml_path: path to the MW XML file, handed to `util.iter_mw_xml`.
    """
    # Accepted <lex> spellings mapped to their normalized gender string.
    noun_lexes = {
        'm': 'm',
        'f': 'f',
        'n': 'n',
        'mf': 'mf',
        'fn': 'fn',
        'nf': 'fn',
        'mn': 'mn',
        'nm': 'mn',
    }
    adj_lexes = {'mfn': 'mfn'}

    labels = ['stem', 'stem_genders']
    # NOTE(review): `[^i]` presumably keeps indeclinable ('ind') entries
    # out of the scan -- confirm against the data.
    regexp = '(<lex>[^i].*?</lex>)'

    rows = []
    seen = set()

    for xml in util.iter_mw_xml(xml_path, regexp=regexp):
        # Genders: locate the <lex> element, trying the nested path second.
        lex_elem = xml.find('body/lex')
        if lex_elem is None:
            lex_elem = xml.find('body/p/lex')
            if lex_elem is None:
                continue

        # Serialize only the element's own content: no wrapping tag and no
        # trailing text, then strip nested markup and non-letter characters.
        lex_elem.tag = None
        lex_elem.tail = None
        code = re.sub('<.*>', '', ET.tostring(lex_elem))
        code = re.sub('[^a-z]', '', code)

        if not (code in noun_lexes or code in adj_lexes):
            continue
        genders = noun_lexes.get(code) or adj_lexes.get(code)
        assert genders

        # Stem: emit each (stem, genders) pair only once.
        pair = (xml.find('h/key1').text, genders)
        if pair in seen:
            continue
        seen.add(pair)
        rows.append(pair)

    rows.sort(key=lambda x: util.key_fn(x[0]))
    print(util.make_csv_string(labels, rows))
def scrape(xml_path):
    """Scrape nouns and adjectives from the MW dictionary.

    Prints a CSV of ('stem', 'stem_genders') rows: one row per distinct
    pairing of an entry's <key1> stem with the normalized gender code read
    from its <lex> element. Entries without a <lex> in either searched
    location, or with an unrecognized code, are skipped.

    :param xml_path: path to the MW XML file, handed to `util.iter_mw_xml`.
    """
    noun_lexes = {
        'm': 'm', 'f': 'f', 'n': 'n',
        'mf': 'mf',
        'fn': 'fn', 'nf': 'fn',
        'mn': 'mn', 'nm': 'mn',
    }
    adj_lexes = {'mfn': 'mfn'}
    labels = ['stem', 'stem_genders']
    regexp = '(<lex>[^i].*?</lex>)'

    seen = set()
    rows = []
    for xml in util.iter_mw_xml(xml_path, regexp=regexp):
        # Genders -- take the first location that actually holds a <lex>.
        lex = None
        for path in ('body/lex', 'body/p/lex'):
            lex = xml.find(path)
            if lex is not None:
                break
        if lex is None:
            continue

        # Drop the tag and tail so only the element's content is
        # serialized, then reduce that to lowercase letters.
        lex.tag = None
        lex.tail = None
        text = ET.tostring(lex)
        text = re.sub('<.*>', '', text)
        text = re.sub('[^a-z]', '', text)

        if text in noun_lexes or text in adj_lexes:
            genders = noun_lexes.get(text) or adj_lexes.get(text)
            assert genders

            # Stem
            stem = xml.find('h/key1').text
            if (stem, genders) not in seen:
                seen.add((stem, genders))
                rows.append((stem, genders))

    rows.sort(key=lambda row: util.key_fn(row[0]))
    print(util.make_csv_string(labels, rows))
def scrape(xml_path):
    """Scrape prefixed roots from the MW dictionary.

    Coverage is knowingly incomplete, but sufficient in practice. Output is
    a CSV of (prefixed_root, unprefixed_root, hom) rows sorted with
    `util.key_fn` on the prefixed root.

    :param xml_path: path to the MW XML file, handed to `util.iter_mw_xml`.
    """
    labels = ["prefixed_root", "unprefixed_root", "hom"]
    rows = []

    for xml in util.iter_mw_xml(xml_path, "vlex"):
        if not has_prefix(xml):
            continue

        prefixed_root = xml.find("h/key1").text

        # The <root> element wraps the unprefixed root; an entry lacking it
        # is probably not a prefixed root, so it is skipped.
        #
        # TODO: The following prefixed roots have no <root> element:
        # - gavez
        # - pAWAntaraya
        # - sampalAy
        # - samprAv
        root_elem = xml.find(".//root")
        if root_elem is None:
            continue

        # Prefer the element's own text; fall back to the text following it.
        unprefixed_root = root_elem.text
        if not unprefixed_root and root_elem.tail:
            unprefixed_root = root_elem.tail.strip()

        if not unprefixed_root or "~" in unprefixed_root:
            continue

        # Homonymous roots are told apart by the MW <hom> element.
        hom = xml.find(".//root/hom")
        if hom is None:
            hom_value = None
        else:
            hom_value = hom.text

        rows.append((prefixed_root, unprefixed_root, hom_value))

    rows.sort(key=lambda row: util.key_fn(row[0]))
    print(util.make_csv_string(labels, rows))
def scrape(xml_path):
    """Scrape prefixed roots from the MW dictionary.

    Not exhaustive, but good enough. Each kept entry contributes one
    (prefixed_root, unprefixed_root, hom) row; rows are sorted with
    `util.key_fn` and printed as CSV.

    :param xml_path: path to the MW XML file, handed to `util.iter_mw_xml`.
    """
    labels = ['prefixed_root', 'unprefixed_root', 'hom']

    rows = []
    for xml in util.iter_mw_xml(xml_path, 'vlex'):
        if has_prefix(xml):
            prefixed_root = xml.find('h/key1').text

            # <root> wraps the unprefixed root. Without it the entry is
            # probably not a prefixed root and is dropped below.
            #
            # TODO: The following prefixed roots have no <root> element:
            # - gavez
            # - pAWAntaraya
            # - sampalAy
            # - samprAv
            root_elem = xml.find('.//root')
            has_root = root_elem is not None
            unprefixed_root = root_elem.text if has_root else None
            # Empty element text: use the text trailing the element instead.
            if has_root and not unprefixed_root and root_elem.tail:
                unprefixed_root = root_elem.tail.strip()

            if unprefixed_root and '~' not in unprefixed_root:
                # The MW <hom> element separates one sense of a homonymous
                # root from another.
                hom = xml.find('.//root/hom')
                hom_value = None if hom is None else hom.text
                rows.append((prefixed_root, unprefixed_root, hom_value))

    rows.sort(key=lambda entry: util.key_fn(entry[0]))
    print(util.make_csv_string(labels, rows))
def scrape(xml_path):
    """Scrape unprefixed roots from the MW dictionary.

    For every entry without a prefix, derives (class, voice) paradigms from
    the tokens of `tokenized_vlexes`, falling back to heuristics when one
    of the two is missing, and prints a CSV with columns 'root', 'hom',
    'class', 'voice' sorted with `util.key_fn` on the root.

    :param xml_path: path to the MW XML file, handed to `util.iter_mw_xml`.
    """
    labels = ['root', 'hom', 'class', 'voice']
    rows = []

    all_vclasses = set('1 2 3 4 5 6 7 8 9 10 denom'.split())
    all_voices = set('para atma'.split())
    voice_translator = {'p': 'para', 'a': 'atma', 'a1': 'atma'}

    # Word-final endings used to infer the voice, compiled once. The
    # alternation is grouped so the delimiter class applies to every
    # ending; ungrouped, it bound only to 'mARa' and let 'te' / 'mAna'
    # match anywhere in the body.
    para_ending = re.compile(r'ti[,. <]')
    atma_ending = re.compile(r'(?:te|mAna|mARa)[,. <]')

    for xml in util.iter_mw_xml(xml_path):
        if has_prefix(xml):
            continue

        root = xml.find('h/key1').text
        paradigms = []
        vclasses = []
        voice = None

        # To make a paradigm, we need a class and voice. Viable roots come
        # in three flavors:
        #
        # - class and voice: gam
        # - class, no voice: patAkaya
        # - voice, no class: candrikAya
        #
        # Some roots have neither class nor voice. These are currently
        # ignored.
        for token in tokenized_vlexes(xml):
            if token in all_vclasses:
                vclasses.append(token)
            elif token in voice_translator:
                # A voice token closes the run of classes collected so far.
                voice = voice_translator[token]
                paradigms.extend((vclass, voice) for vclass in vclasses)
                vclasses = []

        # If the voice is not specified, search Sanskrit strings in the
        # entry to infer it.
        if vclasses and not paradigms:
            body = ET.tostring(xml.find('body'))
            inferred = None
            if para_ending.search(body):
                # 'ti' at the end of a word
                inferred = voice_translator['p']
            elif atma_ending.search(body):
                # 'te', 'mAna' or 'mARa' at the end of a word
                inferred = voice_translator['a']
            if inferred is not None:
                voice = inferred
                paradigms.extend((vclass, voice) for vclass in vclasses)

        # If the class is not specified, make some high-precision
        # assumptions about it.
        if voice and not paradigms:
            if root.endswith(('Aya', 'aya', 'Iya')):
                paradigms.append(('denom', voice))

        paradigms = [list(x) for x in util.unique(paradigms)]
        if not paradigms:
            continue

        # Some roots are homonymous. The MW <hom> element distinguishes one
        # root sense from another.
        hom = xml.find('h/hom')
        hom_value = hom.text if hom is not None else None

        for vclass, paradigm_voice in paradigms:
            # Internal invariants: both values come from the tables above.
            assert vclass in all_vclasses
            assert paradigm_voice in all_voices
            rows.append((root, hom_value, vclass, paradigm_voice))

    rows.sort(key=lambda x: util.key_fn(x[0]))
    # Parenthesized so the line parses under both Python 2 and Python 3.
    print(util.make_csv_string(labels, rows))