Example #1
0
class TestSandhisplitter(TestCase):
    """Tests for Sandhisplitter driven by the bundled ``samples.txt`` file.

    ``setUp`` builds a fresh ``Model`` and ``Sandhisplitter`` and opens the
    samples resource; the file is closed automatically after each test.
    """

    def setUp(self):
        super(TestSandhisplitter, self).setUp()
        self.model = Model(depth=3, skip=1)
        self.SS = Sandhisplitter()
        testcases = resource_filename("sandhisplitter.tests",
                                      "resources/samples.txt")
        self.entries = open(testcases, "r", encoding='utf-8')
        # Registered right after opening so the handle is released even
        # when a test fails or errors out.
        self.addCleanup(self.entries.close)

    def test_splits(self):
        """Train the model on every sample, then check each sample splits back."""
        # Materialise the lines: they are walked twice below, and a lazy
        # map() object would already be exhausted by the training loop,
        # leaving the verification loop (and its assertions) unexecuted.
        entries = [line.strip() for line in self.entries.readlines()]

        # Pass 1: feed every sample into the model.
        for line in entries:
            (word, splits, locs) = extract(line)
            self.model.add_entry(word, splits, locs)

        # Hand the trained, serialized model to the splitter.
        m = self.model.serialize()
        self.SS.set_model(m)

        # Pass 2: the splitter must reproduce the recorded splits/positions.
        for line in entries:
            (word, splits, locs) = extract(line)
            obtained, pos = self.SS.split(word)
            self.assertEqual(locs, pos)
            self.assertEqual(splits, obtained)

    def test_details(self):
        """Module name and info strings are the expected constants."""
        self.assertEqual(self.SS.get_module_name(), "Sandhi-Splitter")
        self.assertEqual(self.SS.get_info(), "Sandhi-splitter for malayalam")

    def test_instance(self):
        """``getInstance()`` hands back a Sandhisplitter."""
        self.assertIsInstance(getInstance(), Sandhisplitter)
Example #2
0
def sandhi_split(token_words):
    """Run every word of ``token_words`` through a Sandhisplitter.

    For each word, the first element of the splitter's result is printed
    and collected.

    Returns:
        A list with one collected element per input word, in input order.
    """
    splitter = Sandhisplitter()
    pieces = []
    for token in token_words:
        # split() returns an indexable result; only its first item is kept.
        first = splitter.split(token)[0]
        pieces.append(first)
        print(first)
    return pieces
Example #3
0
class Malayalam(BaseMalayalam, object):
    """Spellchecker that augments the base checker with sandhi splitting.

    A word the base checker rejects is split with ``Sandhisplitter``; the
    parts are then checked (or corrected) individually.
    """

    def __init__(self):
        super(Malayalam, self).__init__()
        # Let's give the spellchecker a boost.
        self.sandhi = Sandhisplitter()

    def check(self, word):
        """Return True if ``word`` or all of its sandhi parts pass the base check.

        Args:
            word: the word to verify.

        Returns:
            True when the base checker accepts ``word`` itself, or accepts
            every part produced by splitting it; False otherwise.
        """
        base_check = super(Malayalam, self).check

        # Trivial case, word is in corpus
        if base_check(word):
            return True

        # Sandhisplitter additions
        # Check for each split word if word exists in corpus
        # Increases True Positives, Reduces False Negatives
        words, _ = self.sandhi.split(word)
        return all(base_check(w) for w in words)

    def suggest(self, word, n=5):
        """Return up to ``n`` distinct suggestions for ``word``, closest first.

        Base suggestions are combined with candidates built by splitting
        ``word``, correcting each part, and joining the parts back together.

        Args:
            word: the (possibly misspelt) word.
            n: maximum number of suggestions to return, also passed to the
                base ``suggest`` for the word and for each part.

        Returns:
            A list of at most ``n`` suggestions ordered by Levenshtein
            distance to ``word`` (ties broken by the suggestion itself).
        """
        base = super(Malayalam, self)

        # Start with bases suggestions. Copy the list so the object handed
        # back by the base class is never mutated by the appends below.
        suggestions = list(base.suggest(word, n))

        # Sandhisplitter additions: a part already in the dictionary stands
        # for itself, any other part is replaced by its base suggestions.
        words, _ = self.sandhi.split(word)
        corrections = [
            [w] if base.check(w) else base.suggest(w, n)
            for w in words
        ]

        # Cross product to get all possibilities, joined back into words.
        for group in product(*corrections):
            suggestions.append(self.sandhi.join(group))

        # A joined candidate can equal a base suggestion; drop repeats
        # (keeping first occurrence) so they do not use up the top-n slots.
        seen = set()
        unique = []
        for suggestion in suggestions:
            if suggestion not in seen:
                seen.add(suggestion)
                unique.append(suggestion)

        # Scoring via levenshtein, sort by (distance, suggestion).
        ranked = sorted(
            (base.levenshtein_distance(suggestion, word), suggestion)
            for suggestion in unique
        )

        # Trim off to match n
        return [suggestion for _, suggestion in ranked[:n]]
Example #4
0
class Malayalam(BaseMalayalam, object):
    """Spellchecker that augments the base checker with sandhi splitting.

    A word the base checker rejects is split with ``Sandhisplitter``; the
    parts are then checked (or corrected) individually.
    """

    def __init__(self):
        super(Malayalam, self).__init__()
        # Let's give the spellchecker a boost.
        self.sandhi = Sandhisplitter()

    def check(self, word):
        """Return True if ``word`` or all of its sandhi parts pass the base check.

        Args:
            word: the word to verify.

        Returns:
            True when the base checker accepts ``word`` itself, or accepts
            every part produced by splitting it; False otherwise.
        """
        base_check = super(Malayalam, self).check

        # Trivial case, word is in corpus
        if base_check(word):
            return True

        # Sandhisplitter additions
        # Check for each split word if word exists in corpus
        # Increases True Positives, Reduces False Negatives
        words, _ = self.sandhi.split(word)
        return all(base_check(w) for w in words)

    def suggest(self, word, n=5):
        """Return up to ``n`` distinct suggestions for ``word``, closest first.

        Base suggestions are combined with candidates built by splitting
        ``word``, correcting each part, and joining the parts back together.

        Args:
            word: the (possibly misspelt) word.
            n: maximum number of suggestions to return, also passed to the
                base ``suggest`` for the word and for each part.

        Returns:
            A list of at most ``n`` suggestions ordered by Levenshtein
            distance to ``word`` (ties broken by the suggestion itself).
        """
        base = super(Malayalam, self)

        # Start with bases suggestions. Copy the list so the object handed
        # back by the base class is never mutated by the appends below.
        suggestions = list(base.suggest(word, n))

        # Sandhisplitter additions: a part already in the dictionary stands
        # for itself, any other part is replaced by its base suggestions.
        words, _ = self.sandhi.split(word)
        corrections = [
            [w] if base.check(w) else base.suggest(w, n)
            for w in words
        ]

        # Cross product to get all possibilities, joined back into words.
        for group in product(*corrections):
            suggestions.append(self.sandhi.join(group))

        # A joined candidate can equal a base suggestion; drop repeats
        # (keeping first occurrence) so they do not use up the top-n slots.
        seen = set()
        unique = []
        for suggestion in suggestions:
            if suggestion not in seen:
                seen.add(suggestion)
                unique.append(suggestion)

        # Scoring via levenshtein, sort by (distance, suggestion).
        ranked = sorted(
            (base.levenshtein_distance(suggestion, word), suggestion)
            for suggestion in unique
        )

        # Trim off to match n
        return [suggestion for _, suggestion in ranked[:n]]