def tags(argv=None):
    args = getTagsArgs(argv)
    if args.strict_io:
        print("Interpreting input strictly")
    else:
        print("Interpreting input loosely (strict_io set to false)")
    logger.info(f"Input String: {args.data}")
    if args.input_encoding is None:
        ie = None
    else:
        ie = SCHEMES[args.input_encoding]
    s = LexicalSandhiAnalyzer(args.lexical_lookup)
    with outputctx(args.strict_io):
        i = SanskritNormalizedString(args.data,
                                     encoding=ie,
                                     strict_io=args.strict_io,
                                     replace_ending_visarga='s')
        print("Input String in SLP1:", i.canonical())
        ts = s.getMorphologicalTags(i, tmap=args.map_tags)
        print("Morphological tags:")
        if ts is not None:
            for t in ts:
                print(t)
        elif not args.strict_io:
            # Possible rakaranta: retry with the ending visarga
            # replaced by 'r' instead of 's'
            i = SanskritNormalizedString(args.data,
                                         encoding=ie,
                                         strict_io=args.strict_io,
                                         replace_ending_visarga='r')
            ts = s.getMorphologicalTags(i, tmap=args.map_tags)
            if ts is not None:
                print("Input String in SLP1:", i.canonical())
                for t in ts:
                    print(t)
        if args.tag_set or args.base:
            if args.tag_set is not None:
                g = set(args.tag_set)
            else:
                g = None
            if args.base is not None:
                b = SanskritNormalizedString(args.base)
            else:
                b = None
            print(s.hasTag(i, b, g))
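For reference, a hedged invocation sketch; the actual option spellings are defined by getTagsArgs(), so the flags below are assumptions inferred from the attributes read above:

# Hypothetical invocation; '--input-encoding' and the positional data
# argument are assumptions, not confirmed flag names.
tags(['--input-encoding', 'SLP1', 'gacCati'])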
Example #2
def parse(
    self,
    input_string: str,
):
    s = SanskritNormalizedString(
        input_string,
        encoding=self.input_encoding,
        strict_io=self.strict_io,
        replace_ending_visarga=self.replace_ending_visarga)
    logger.info(f"Input String in SLP1: {s.canonical()}")
    sandhi_analyzer = LexicalSandhiAnalyzer(self.lexical_lookup)
    logger.debug("Start Split")
    graph = sandhi_analyzer.getSandhiSplits(s, tag=True)
    logger.debug("End DAG generation")
    if graph is None:
        warnings.warn(
            "No splits found. Please check the input to ensure there are no typos."
        )
        return None
    return ParseResult(self, input_string, graph)
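A minimal usage sketch, assuming this is the parse() method of the Parser class shown in Example #6 and the library's documented top-level import:

from sanskrit_parser import Parser

parser = Parser(input_encoding='SLP1')
result = parser.parse('astyuttarasyAMdiSi')  # arbitrary SLP1 sample text
if result is not None:
    print(result)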
Example #3
def __init__(self,
             strict_io: bool = False,
             input_encoding: str = None,
             output_encoding: str = 'SLP1',
             lexical_lookup: str = "combined",
             score: bool = True,
             split_above: int = 5,
             replace_ending_visarga: str = None,
             fast_merge: bool = True):
    self.strict_io = strict_io
    if input_encoding is not None:
        self.input_encoding = SCHEMES[input_encoding]
    else:
        self.input_encoding = None
    self.output_encoding = SCHEMES[output_encoding]
    self.lexical_lookup = lexical_lookup
    self.score = score
    self.split_above = split_above
    self.replace_ending_visarga = replace_ending_visarga
    self.fast_merge = fast_merge
    self.sandhi_analyzer = LexicalSandhiAnalyzer(self.lexical_lookup)
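The constructor is pure configuration; a brief sketch of overriding the defaults, assuming this __init__ belongs to the Parser class of Example #6 ('SLP1' is confirmed as a SCHEMES key by the default output_encoding, 'Devanagari' is an assumption):

parser = Parser(input_encoding='SLP1',         # confirmed SCHEMES key
                output_encoding='Devanagari',  # assumed SCHEMES key
                lexical_lookup='combined')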
Example #4
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from sanskrit_parser.parser.sandhi_analyzer import LexicalSandhiAnalyzer
from sanskrit_parser.base.sanskrit_base import SanskritObject, DEVANAGARI, outputctx
import pandas as pd
import multiprocessing
import os.path
import inspect

# Module-level analyzer shared by the kosh checks below
le = LexicalSandhiAnalyzer()


def get_kosh_entries(test_count=0):
    kosh_entries = []
    base_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(inspect.currentframe())))
    data_dir = os.path.join(base_dir, 'SandhiKosh')
    aa_kosh = pd.read_excel(os.path.join(
        data_dir, "Astaadhyaayii Corpus.xls"))[['S. No.', 'Word', 'Split']]
    aa_kosh['File'] = "Astaadhyaayii Corpus.xls"
    bg_kosh = pd.read_excel(os.path.join(
        data_dir, "Bhagvad_Gita Corpus.xls"))[['S. No.', 'Word', 'Split']]
    bg_kosh['File'] = "Bhagvad_Gita Corpus.xls"
    uoh_kosh = pd.read_excel(os.path.join(
        data_dir, "UoH_Corpus.xls"))[['S. No.', 'Word', 'Split']]
    uoh_kosh['File'] = "UoH_Corpus.xls"
    lit_kosh_dict = pd.read_excel(os.path.join(
        data_dir, "Rule-based Corpus and Literature Corpus.xls"),
                                  sheet_name=None)
    for k in ['Internal', 'External', 'Literature']:
        # Assumed completion: the literature sheets are presumed to share
        # the 'S. No.'/'Word'/'Split' columns of the other koshas.
        sheet = lit_kosh_dict[k][['S. No.', 'Word', 'Split']].copy()
        sheet['File'] = "Rule-based Corpus and Literature Corpus.xls:" + k
        kosh_entries.extend(sheet.to_dict('records'))
    for kosh in (aa_kosh, bg_kosh, uoh_kosh):
        kosh_entries.extend(kosh.to_dict('records'))
    if test_count:
        kosh_entries = kosh_entries[:test_count]
    return kosh_entries
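A hedged sketch of driving the module-level analyzer over a few kosh entries; treating the 'Word' column as Devanagari follows from the DEVANAGARI import, and getSandhiSplits is called as in Example #2:

for entry in get_kosh_entries(test_count=5):
    word = SanskritObject(entry['Word'], encoding=DEVANAGARI)
    graph = le.getSandhiSplits(word)
    status = "no splits" if graph is None else "split graph built"
    print(entry['File'], entry['S. No.'], status)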
Example #5
def lexan():
    # Factory for a fresh LexicalSandhiAnalyzer
    return LexicalSandhiAnalyzer()
Example #6
class Parser():
    def __init__(self,
                 strict_io: bool = False,
                 input_encoding: str = None,
                 output_encoding: str = 'SLP1',
                 lexical_lookup: str = "combined",
                 score: bool = True,
                 split_above: int = 5,
                 replace_ending_visarga: str = None,
                 fast_merge: bool = True):
        self.strict_io = strict_io
        if input_encoding is not None:
            self.input_encoding = SCHEMES[input_encoding]
        else:
            self.input_encoding = None
        self.output_encoding = SCHEMES[output_encoding]
        self.lexical_lookup = lexical_lookup
        self.score = score
        self.split_above = split_above
        self.replace_ending_visarga = replace_ending_visarga
        self.fast_merge = fast_merge
        self.sandhi_analyzer = LexicalSandhiAnalyzer(self.lexical_lookup)

    def _maybe_pre_segment(self, input_string: str, pre_segmented: bool):
        ''' Pre-process pre-segmented input if necessary '''
        if not pre_segmented:
            s = SanskritNormalizedString(
                input_string,
                encoding=self.input_encoding,
                strict_io=self.strict_io,
                replace_ending_visarga=self.replace_ending_visarga)
            logger.info(f"Input String in SLP1: {s.canonical()}")
            return s
        else:
            logger.debug("Pre-Segmented")
            s = []
            for seg in input_string.split(" "):
                o = SanskritObject(seg,
                                   encoding=self.input_encoding,
                                   strict_io=self.strict_io,
                                   replace_ending_visarga='r')
                ts = self.sandhi_analyzer.getMorphologicalTags(o, tmap=True)
                if ts is None:
                    # Possible sakaranta
                    # Try by replacing end visarga with 's' instead
                    o = SanskritObject(seg,
                                       encoding=self.input_encoding,
                                       strict_io=self.strict_io,
                                       replace_ending_visarga='s')
                    ts = self.sandhi_analyzer.getMorphologicalTags(o,
                                                                   tmap=True)
                if ts is None:
                    logger.warning(f"Unknown pada {seg} - will be split")
                    # Guard against a failed split before using the result
                    splits = self.split(seg, pre_segmented=False, limit=1)
                    if splits is None:
                        logger.warning(f"Unknown pada {seg} - cannot be split")
                    else:
                        _s = splits[0]
                        logger.info(f"Split {_s}")
                        s.extend(_s.split)
                else:
                    s.append(o)
            logger.info(
                f"Input String in SLP1: {' '.join([x.canonical() for x in s])}"
            )
            return s

    def split(
        self,
        input_string: str,
        limit: int = 10,
        pre_segmented: bool = False,
        dot_file=None,
    ):
        s = self._maybe_pre_segment(input_string, pre_segmented)
        logger.debug("Start Split")
        graph = self.sandhi_analyzer.getSandhiSplits(
            s, tag=True, pre_segmented=pre_segmented)
        logger.debug("End DAG generation")
        if graph is None:
            warnings.warn(
                "No splits found. Please check the input to ensure there are no typos."
            )
            return None
        if dot_file is not None:
            graph.write_dot(dot_file)
        splits = graph.find_all_paths(max_paths=limit,
                                      sort=True,
                                      score=self.score)
        return [Split(self, input_string, split) for split in splits]
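Putting the class together, a minimal end-to-end sketch (the sample string is arbitrary SLP1 input):

parser = Parser(input_encoding='SLP1')
splits = parser.split('astyuttarasyAMdiSi', limit=3)
if splits is not None:
    for split in splits:
        print(split)  # each item is a Split wrapping one segmentation path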