def add_rules_from_file(self, path):
    """
    Read sandhi rules from a file and register each one.

    Every non-blank line of the file holds a single rule, e.g.
    ``अ + अ = आ``.  Lines whose first character is ``#`` are treated
    as comments; blank lines are skipped as well.

    :param path: file to read rules from

    See also add_rules_from_dir
    """
    filename = os.path.basename(path)
    with codecs.open(path, "rb", 'utf-8') as f:
        for linenum, raw in enumerate(f, start=1):
            stripped = raw.strip()
            # Skip blank lines and comment lines
            if not stripped or stripped[0] == '#':
                continue
            self.logger.debug("Processing rule %s", stripped)
            canonical_rule = SanskritObject(stripped).canonical()
            # Annotate each expansion with its origin (file:line)
            annotation = "%s:%d" % (filename, linenum)
            for expanded in self.expand_rule(canonical_rule):
                self.add_rule(*expanded, annotation=annotation)
def test_medium_split(lexan):
    """A three-word sandhied string must yield the expected split path."""
    source = SanskritObject("budDaMSaraRaNgacCAmi", encoding=SLP1)
    split_graph = lexan.getSandhiSplits(source)
    paths = split_graph.findAllPaths()
    candidate_splits = [list(map(str, path)) for path in paths]
    assert [u'budDam', u'SaraRam', u'gacCAmi'] in candidate_splits
def test_simple_split(lexan):
    """A simple two-word sandhied string must yield the expected split path."""
    # gaNeshannamAmi
    source = SanskritObject("gaReSannamAmi", encoding=SLP1)
    split_graph = lexan.getSandhiSplits(source)
    paths = split_graph.findAllPaths()
    candidate_splits = [list(map(str, path)) for path in paths]
    assert [u'gaReSam', u'namAmi'] in candidate_splits
def _strip_joiners(term):
    """Strip whitespace, danda, and zero-width (non-)joiner characters."""
    term = term.strip()
    term = term.replace(u'|', '')
    term = term.replace(u"\u200c", "")  # zero width non-joiner
    term = term.replace(u"\u200d", "")  # zero width joiner
    return term


def process_line(lnum, l):
    '''Process a single line of split data.

    Expected line format: ``<full form> => <word1> + <word2> + ...``.
    Comment lines (starting with '#') and lines without '=>' are skipped.
    Applies fixups for known UOHD data errors (missing final visarga,
    sandhied final words) before validating/splitting each word.

    :param lnum: line number, used only for logging
    :param l: raw input line
    :return: [full, splits, original_full, original_split, subsplitp]
             or None when the line was skipped
    '''
    logging.info("Processing Line {}: {}".format(lnum, l))
    r = None
    subsplitp = False
    line = l.strip()
    if line and line[0] == '#':
        logging.info("Skipping Comment")
        return None
    if line.find('=>') == -1:
        logging.info("Cannot find =>")
        return None
    full, split = line.split('=>')
    full = _strip_joiners(full)
    ofull = full  # Save pre-transcoding form
    full = _dumpchars(SanskritObject(full).transcoded(SLP1))
    split = _strip_joiners(split)
    osplit = split  # Save pre-transcoding form
    splits = [_dumpchars(SanskritObject(x).transcoded(SLP1).strip())
              for x in split.split('+')]
    if splits and splits[-1] == '':
        splits.pop()
    # Empty full string or no split words: nothing to do.
    # FIX: previously only logged and fell through, which could raise
    # IndexError on full[-1] below.
    if len(full) == 0 or not splits:
        logger.info("Skipping")
        return None
    # UOHD errors, final visarga is sometimes missing
    if len(splits[-1]) > 1 and splits[-1][-2:] == "AH" and \
            full[-1] == "A":
        full = full + "H"
    if len(splits[-1]) > 1 and splits[-1][-2:] == "aH" and \
            full[-1] == "a":
        full = full + "H"
    if splits[-1][-1] == "A" and len(full) > 1 and full[-2:] == "AH":
        splits[-1] = splits[-1] + "H"
    if splits[-1][-1] == "a" and len(full) > 1 and full[-2:] == "aH":
        splits[-1] = splits[-1] + "H"
    # FIXME - this creates problems, eg on 'aho', 'prabho'
    # UOHD stores sandhied final words! This is not a full fix
    full = re.sub("o$", "aH", full)
    # Modified splits
    s = []
    for ss in splits:
        # Check if this word is in our db
        # Rakarantas
        # FIX: the replacements below used to restart from `ss` each
        # time, so only the last one ever took effect; they now chain.
        sss = ss.replace('punaH', 'punar')
        sss = sss.replace('antaH', 'antar')
        sss = sss.replace('bahiH', 'bahir')
        sss = sss.replace('prAtaH', 'prAtar')
        # Sakarantas
        sss = re.sub('H$', 's', sss)
        if sss.find('punas') != -1:
            logger.error("ERROR: found {}".format(sss))
        # Is in our database
        if (subsplitp == "Skip") or lexan.forms.valid(sss):
            s.append(sss)
        else:
            # If not, treat it as a word to be split
            try:
                graph = lexan.getSandhiSplits(SanskritObject(ss,
                                                             encoding=SLP1))
                if graph is None:
                    # Catch stray unicode symbols with the encode
                    logger.warning("Skipping: {} is not in db".format(
                        ss.encode('utf-8')))
                    subsplitp = "Skip"
                    s.append(sss)
                    continue
                else:
                    subsplitp = True
            except Exception:  # FIX: narrowed from bare except
                logger.warning("Split Error: {}".format(ss.encode('utf-8')))
                s.append(sss)
                continue
            # First split path only
            ssp = list(map(str, graph.findAllPaths(max_paths=1)[0]))
            # Add it to split list
            s.extend(ssp)
    logger.info(u"{} => {}".format(full, " ".join(s)))
    r = [full, s, ofull, osplit, subsplitp]
    return r
def expand_rule(self, rule):
    """
    Expands a given sandhi rule from the rules file to generate all
    possible combinations

    Rule syntax (as processed below): ``<left> + <right> = <after1> / <after2> ...``
    Bracketed groups ``[...]`` on either side of the ``+`` denote character
    classes: either a comma-separated list of alternatives, or a
    ``*``-prefixed mAheswara sUtra pratyAhAra, optionally adjusted with
    ``-`` (remove varnas) or, on the right side, ``+`` (add varnas).

    :param rule: Rule to expand
    :return: A generator of all possible expanded rules
    """
    self.logger.debug("Expanding rule %s", rule)
    ms = MaheshvaraSutras()
    # LHS before '=', alternatives of the RHS after it
    b, afters = map(six.text_type.strip, rule.split("="))
    # Split LHS into the part before and after '+' (at most one split)
    before = list(map(six.text_type.strip, b.split("+", 1)))
    # re.split with a capturing group keeps the bracketed class contents
    # interleaved with the literal text around them
    left_classes = re.split(r'\[(.*?)\]', before[0])
    self.logger.debug("Left classes = %s", left_classes)
    # Split after forms into individual forms
    afters = map(six.text_type.strip, afters.split("/"))
    before_left = []
    for c in left_classes:
        if c != '':
            if c.startswith("*"):
                # This is a mAheswara sUtra pratyAhAra
                # An optional '-<varnas>' suffix removes varnas from the set
                splits = list(map(six.text_type.strip, c.split('-')))
                varnas = set(
                    ms.getPratyahara(SanskritObject(splits[0][1:],
                                                    encoding=SLP1),
                                     longp=False, remove_a=True,
                                     dirghas=True).canonical())
                if len(splits) == 2:
                    varnas -= set(splits[1])
                self.logger.debug("Found pratyAhAra %s = %s", c, varnas)
                before_left.append(varnas)
            else:
                # Comma-separated alternatives; NOTE(review): this appends a
                # one-shot map iterator — consumed once by itertools.product
                # below, which materializes its inputs
                before_left.append(map(six.text_type.strip, c.split(",")))
    self.logger.debug("before_left iterator = %s", before_left)
    right_classes = re.split(r'\[(.*?)\]', before[1])
    # Could have used list comprehension, but this is easier to read
    self.logger.debug("right_classes = %s", right_classes)
    if right_classes:
        before_right = []
        for c in right_classes:
            if c != '':
                if c.startswith("*"):
                    # This is a mAheswara sUtra pratyAhAra
                    # Right side allows both '-' (remove) and '+' (add);
                    # the capturing group keeps the operator in splits[1]
                    splits = list(
                        map(six.text_type.strip, re.split('([+-])', c)))
                    varnas = set(
                        ms.getPratyahara(SanskritObject(splits[0][1:],
                                                        encoding=SLP1),
                                         longp=False, remove_a=True,
                                         dirghas=True).canonical())
                    if len(splits) == 3:
                        if splits[1] == '-':
                            varnas -= set(splits[2])
                        elif splits[1] == '+':
                            varnas |= set(splits[2])
                    self.logger.debug("Found pratyAhAra %s (%s) = %s",
                                      c, splits[0][1:], varnas)
                    before_right.append(varnas)
                else:
                    before_right.append(
                        map(six.text_type.strip, c.split(",")))
    else:
        before_right = [before[1].strip()]
    self.logger.debug("before_right iterator = %s", before_right)
    # Cartesian product of every 'after' form with every concrete
    # left-side and right-side character combination
    for after, before_l, before_r in itertools.product(
            afters,
            itertools.product(*before_left),
            itertools.product(*before_right)):
        left = ''.join(before_l)
        right = ''.join(before_r)
        list_before_r = list(before_r)
        left_right = (left, right)
        # The 'after' template may reference matched varnas positionally,
        # left-side entries first, then right-side
        a = after.format(*(list(before_l) + list_before_r))
        # The below is just too much logging - should be silenced in production:
        # self.logger.debug("Final rule = %s -> %s", left_right, a)
        yield (left_right, a)
def jtag(tag):
    """Convert a (root, tag-set) pair into a serializable tuple of
    Devanagari strings."""
    root, morph_tags = tag[0], tag[1]
    root_dev = SanskritObject(root, encoding=SLP1).devanagari(strict_io=False)
    tags_dev = []
    for t in list(morph_tags):
        tags_dev.append(t.devanagari(strict_io=False))
    return (root_dev, tags_dev)
def jedge(pred, node, label):
    """Serialize a graph edge (predecessor, node, label) as a tuple of
    Devanagari strings."""
    node_pada = node.pada.devanagari(strict_io=False)
    node_tag = jtag(node.getMorphologicalTags())
    edge_label = SanskritObject(label, encoding=SLP1).devanagari(
        strict_io=False)
    pred_pada = pred.pada.devanagari(strict_io=False)
    return (node_pada, node_tag, edge_label, pred_pada)