def tags(argv=None):
    args = getTagsArgs(argv)
    if args.strict_io:
        print("Interpreting input strictly")
    else:
        print("Interpreting input loosely (strict_io set to false)")
    logger.info(f"Input String: {args.data}")
    if args.input_encoding is None:
        ie = None
    else:
        ie = SCHEMES[args.input_encoding]
    s = LexicalSandhiAnalyzer(args.lexical_lookup)
    with outputctx(args.strict_io):
        i = SanskritNormalizedString(args.data, encoding=ie,
                                     strict_io=args.strict_io,
                                     replace_ending_visarga='s')
        print("Input String in SLP1:", i.canonical())
        ts = s.getMorphologicalTags(i, tmap=args.map_tags)
        print("Morphological tags:")
        if ts is not None:
            for t in ts:
                print(t)
        # Possible rakaranta
        # Try by replacing end visarga with 'r' instead
        elif not args.strict_io:
            i = SanskritNormalizedString(args.data, encoding=ie,
                                         strict_io=args.strict_io,
                                         replace_ending_visarga='r')
            ts = s.getMorphologicalTags(i)
            if ts is not None:
                print("Input String in SLP1:", i.canonical())
                for t in ts:
                    print(t)
        if args.tag_set or args.base:
            if args.tag_set is not None:
                g = set(args.tag_set)
            else:
                g = None
            if args.base is not None:
                b = SanskritNormalizedString(args.base)
            else:
                b = None
            print(s.hasTag(i, b, g))
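# NOTE (illustrative sketch): getTagsArgs() is defined elsewhere in the
# package. The argparse stub below is inferred solely from the attributes
# that tags() reads (data, input_encoding, lexical_lookup, map_tags,
# strict_io, tag_set, base); the real option names, defaults, and help
# strings in sanskrit_parser may differ.
import argparse


def getTagsArgs_sketch(argv=None):
    parser = argparse.ArgumentParser(description="Print morphological tags (sketch)")
    parser.add_argument('data', help='pada to tag')
    parser.add_argument('--input-encoding', dest='input_encoding', default=None)
    parser.add_argument('--lexical-lookup', dest='lexical_lookup', default='combined')
    parser.add_argument('--map-tags', dest='map_tags', action='store_true')
    parser.add_argument('--strict-io', dest='strict_io', action='store_true')
    parser.add_argument('--tag-set', dest='tag_set', nargs='*', default=None)
    parser.add_argument('--base', default=None)
    return parser.parse_args(argv)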
def parse(
    self,
    input_string: str,
):
    s = SanskritNormalizedString(
        input_string,
        encoding=self.input_encoding,
        strict_io=self.strict_io,
        replace_ending_visarga=self.replace_ending_visarga)
    logger.info(f"Input String in SLP1: {s.canonical()}")
    sandhi_analyzer = LexicalSandhiAnalyzer(self.lexical_lookup)
    logger.debug("Start Split")
    graph = sandhi_analyzer.getSandhiSplits(s, tag=True)
    logger.debug("End DAG generation")
    if graph is None:
        warnings.warn(
            "No splits found. Please check the input to ensure there are no typos."
        )
        return None
    return ParseResult(self, input_string, graph)
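# Usage sketch (assumptions: parse() is bound to the Parser class defined
# later in this document, which supplies self.input_encoding and friends;
# the sample input is the SLP1 example string used in the project README).
def _parse_demo():
    parser = Parser(input_encoding='SLP1')
    result = parser.parse('astyuttarasyAMdiSi')
    if result is not None:
        print(result)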
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from sanskrit_parser.parser.sandhi_analyzer import LexicalSandhiAnalyzer
from sanskrit_parser.base.sanskrit_base import SanskritObject, DEVANAGARI, outputctx
import pandas as pd
import multiprocessing
import os.path
import inspect

le = LexicalSandhiAnalyzer()


def get_kosh_entries(test_count=0):
    kosh_entries = []
    base_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    data_dir = os.path.join(base_dir, 'SandhiKosh')
    aa_kosh = pd.read_excel(os.path.join(
        data_dir, "Astaadhyaayii Corpus.xls"))[['S. No.', 'Word', 'Split']]
    aa_kosh['File'] = "Astaadhyaayii Corpus.xls"
    bg_kosh = pd.read_excel(os.path.join(
        data_dir, "Bhagvad_Gita Corpus.xls"))[['S. No.', 'Word', 'Split']]
    bg_kosh['File'] = "Bhagvad_Gita Corpus.xls"
    uoh_kosh = pd.read_excel(os.path.join(
        data_dir, "UoH_Corpus.xls"))[['S. No.', 'Word', 'Split']]
    uoh_kosh['File'] = "UoH_Corpus.xls"
    lit_kosh_dict = pd.read_excel(os.path.join(
        data_dir, "Rule-based Corpus and Literature Corpus.xls"),
        sheet_name=None)
    for k in ['Internal', 'External', 'Literature']:
def lexan():
    return LexicalSandhiAnalyzer()
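# Usage sketch: lexan() appears to be a test helper/fixture returning the
# analyzer. getMorphologicalTags() is the same API exercised by tags()
# above; the word and its encoding here are illustrative assumptions.
def _tags_demo():
    analyzer = lexan()
    word = SanskritObject('gacCati')  # SLP1; encoding assumed auto-detected
    tags = analyzer.getMorphologicalTags(word)
    if tags is not None:
        for t in tags:
            print(t)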
class Parser():
    def __init__(self,
                 strict_io: bool = False,
                 input_encoding: str = None,
                 output_encoding: str = 'SLP1',
                 lexical_lookup: str = "combined",
                 score: bool = True,
                 split_above: int = 5,
                 replace_ending_visarga: str = None,
                 fast_merge: bool = True):
        self.strict_io = strict_io
        if input_encoding is not None:
            self.input_encoding = SCHEMES[input_encoding]
        else:
            self.input_encoding = None
        self.output_encoding = SCHEMES[output_encoding]
        self.lexical_lookup = lexical_lookup
        self.score = score
        self.split_above = split_above
        self.replace_ending_visarga = replace_ending_visarga
        self.fast_merge = fast_merge
        self.sandhi_analyzer = LexicalSandhiAnalyzer(self.lexical_lookup)

    def _maybe_pre_segment(self, input_string: str, pre_segmented: bool):
        ''' Pre-process pre-segmented input if necessary '''
        if not pre_segmented:
            s = SanskritNormalizedString(
                input_string,
                encoding=self.input_encoding,
                strict_io=self.strict_io,
                replace_ending_visarga=self.replace_ending_visarga)
            logger.info(f"Input String in SLP1: {s.canonical()}")
            return s
        else:
            logger.debug("Pre-Segmented")
            s = []
            for seg in input_string.split(" "):
                o = SanskritObject(seg,
                                   encoding=self.input_encoding,
                                   strict_io=self.strict_io,
                                   replace_ending_visarga='r')
                ts = self.sandhi_analyzer.getMorphologicalTags(o, tmap=True)
                if ts is None:
                    # Possible sakaranta
                    # Try by replacing end visarga with 's' instead
                    o = SanskritObject(seg,
                                       encoding=self.input_encoding,
                                       strict_io=self.strict_io,
                                       replace_ending_visarga='s')
                    ts = self.sandhi_analyzer.getMorphologicalTags(o, tmap=True)
                if ts is None:
                    logger.warning(f"Unknown pada {seg} - will be split")
                    # split() returns None when no splits are found, so
                    # check before dereferencing the result
                    _splits = self.split(seg, pre_segmented=False, limit=1)
                    if _splits is None:
                        logger.warning(f"Unknown pada {seg} - cannot be split")
                    else:
                        _s = list(_splits)[0]
                        logger.info(f"Split {_s}")
                        s.extend(_s.split)
                else:
                    s.append(o)
            logger.info(
                f"Input String in SLP1: {' '.join([x.canonical() for x in s])}"
            )
            return s

    def split(
        self,
        input_string: str,
        limit: int = 10,
        pre_segmented: bool = False,
        dot_file=None,
    ):
        s = self._maybe_pre_segment(input_string, pre_segmented)
        logger.debug("Start Split")
        graph = self.sandhi_analyzer.getSandhiSplits(
            s, tag=True, pre_segmented=pre_segmented)
        logger.debug("End DAG generation")
        if graph is None:
            warnings.warn(
                "No splits found. Please check the input to ensure there are no typos."
            )
            return None
        else:
            if dot_file is not None:
                graph.write_dot(dot_file)
            splits = graph.find_all_paths(max_paths=limit,
                                          sort=True,
                                          score=self.score)
            return [Split(self, input_string, split) for split in splits]
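# End-to-end usage sketch for Parser.split(), following the project README
# example (assumption: SCHEMES, SanskritObject, SanskritNormalizedString,
# Split, logger, and warnings are imported at the top of the real module):
if __name__ == '__main__':
    parser = Parser(output_encoding='SLP1')
    splits = parser.split('astyuttarasyAMdiSi', limit=2)
    if splits is not None:
        for split in splits:
            print(split)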