Example #1
0
 def check(self, word, language=None):
     """Report whether ``word`` is spelled correctly.

     Args:
         word: The word to check; surrounding whitespace is ignored.
         language: Language code of the dictionary to use. When ``None``
             the language is taken from ``detect_lang(word)[word]``.

     Returns:
         ``None`` for an empty/blank word, ``True`` for numbers and for
         words found in the word list, ``False`` otherwise (including
         when no word list could be loaded).
     """
     word = word.strip()
     if word == "":
         return None
     # Numbers are never spell-checked.
     if silpautils.is_number(word):
         return True
     # Resolve the effective language *before* comparing it with the
     # cached one; comparing against a raw ``None`` argument would throw
     # the cached word list away on every auto-detected call.
     if language is None:
         language = detect_lang(word)[word]
     if self.lang != language:
         self.NWORDS = None
     self.lang = language

     if self.NWORDS is None:
         self.NWORDS = self.get_wordlist(word)
     if self.NWORDS is None:
         # Dictionary not found
         return False
     if word in self.NWORDS:
         return True
     # For scripts that have letter case, retry with the first letter
     # lowered: the word may be capitalised only because it starts a
     # sentence.
     if word.upper() != word.lower():
         newword = word[0].lower() + word[1:]
         self.NWORDS = self.get_wordlist(newword)
         if self.NWORDS is None:
             # The reload found no dictionary; avoid ``in None``.
             return False
         return newword in self.NWORDS
     return False
Example #2
0
 def suggest(self, word, language=None, distance=2):
     """Suggest replacements for a possibly misspelled ``word``.

     Args:
         word: The word to look up; surrounding whitespace is ignored.
         language: Language code of the dictionary to use. When ``None``
             the language is taken from ``detect_lang(word)[word]``.
         distance: Maximum edit distance (as computed by
             ``self.levenshtein``) for a dictionary word to be offered.

     Returns:
         ``None`` for an empty/blank word, the word itself when it is
         already in the word list, otherwise a list of candidate strings
         (empty when nothing matches or no word list could be loaded).
     """
     word = word.strip()
     if word == "":
         return None
     # Resolve the effective language *before* comparing it with the
     # cached one; comparing against a raw ``None`` argument would throw
     # the cached word list away on every auto-detected call.
     if language is None:
         language = detect_lang(word)[word]
     if self.lang != language:
         self.NWORDS = None
     self.lang = language
     if self.NWORDS is None:
         self.NWORDS = self.get_wordlist(word)
     if self.NWORDS is None:
         # No dictionary available: there is nothing to suggest from.
         return []
     if word in self.NWORDS:
         return word
     # The cheap length comparison runs first so the edit distance is
     # only computed for words that could possibly be close enough.
     candidates = [
         candidate
         for candidate in self.NWORDS
         if abs(len(candidate) - len(word)) <= distance
         and self.levenshtein(candidate, word) <= distance
     ]
     candidates = self.filter_candidates(word, candidates)
     if len(candidates) == 0:
         # Try splitting the word in two, in case two words got merged.
         for pos in range(2, len(word) - 2):
             head, tail = word[:pos], word[pos:]
             if self.check(head, self.lang) and self.check(tail, self.lang):
                 candidates.append(head + " " + tail)
                 candidates.append(head + "-" + tail)
     return candidates
Example #3
0
 def suggest(self, word, language=None, distance=2):
     """Return spelling suggestions for ``word``.

     Args:
         word: Word to find suggestions for (whitespace is stripped).
         language: Dictionary language code; if ``None`` it is looked up
             via ``detect_lang(word)[word]``.
         distance: Largest ``self.levenshtein`` distance a dictionary
             entry may have from ``word`` and still be suggested.

     Returns:
         ``None`` when ``word`` is blank; ``word`` itself when the word
         list already contains it; otherwise a list of suggestions, which
         is empty when there are none or when no word list is available.
     """
     word = word.strip()
     if word == "":
         return None
     # Work out the language first so the cached word list is discarded
     # only when the language really changed (the raw argument is None
     # on every auto-detected call and would never match the cache).
     if language is None:
         language = detect_lang(word)[word]
     if self.lang != language:
         self.NWORDS = None
     self.lang = language
     if self.NWORDS is None:
         self.NWORDS = self.get_wordlist(word)
     if self.NWORDS is None:
         # Without a dictionary a membership test would raise TypeError.
         return []
     if word in self.NWORDS:
         return word
     candidates = []
     for candidate in self.NWORDS:
         # A length gap larger than the threshold already rules the
         # candidate out, so skip the costlier edit-distance call.
         if abs(len(candidate) - len(word)) > distance:
             continue
         if self.levenshtein(candidate, word) <= distance:
             candidates.append(candidate)
     candidates = self.filter_candidates(word, candidates)
     if len(candidates) == 0:
         # Nothing close enough: see whether the word is two valid words
         # that were merged, and offer them spaced and hyphenated.
         for pos in range(2, len(word) - 2):
             left, right = word[:pos], word[pos:]
             if self.check(left, self.lang) and self.check(right, self.lang):
                 candidates.append(left + " " + right)
                 candidates.append(left + "-" + right)
     return candidates
Example #4
0
    def check(self, word, language=None):
        """Tell whether ``word`` is a correctly spelled word.

        Args:
            word: Word to verify (whitespace is stripped first).
            language: Dictionary language code; if ``None`` it is looked
                up via ``detect_lang(word)[word]``.

        Returns:
            ``None`` when ``word`` is blank; ``True`` when it is a number
            or is present in the word list; ``False`` when it is absent
            or when no word list could be loaded.
        """
        word = word.strip()
        if word == "":
            return None
        # If it is a number, don't do spellcheck
        if silpautils.is_number(word):
            return True
        # Work out the language first so the cached word list is
        # discarded only when the language really changed (the raw
        # argument is None on every auto-detected call).
        if language is None:
            language = detect_lang(word)[word]
        if self.lang != language:
            self.NWORDS = None
        self.lang = language

        if self.NWORDS is None:
            self.NWORDS = self.get_wordlist(word)
        if self.NWORDS is None:
            # Dictionary not found
            return False
        if word in self.NWORDS:
            return True
        # upper() != lower() holds only for text with letter case. Such a
        # word may be capitalised because it begins a sentence, so retry
        # with the first letter in lower case.
        if word.upper() != word.lower():
            newword = word[0].lower() + word[1:]
            self.NWORDS = self.get_wordlist(newword)
            if self.NWORDS is None:
                # Reload found no dictionary; a membership test on None
                # would raise TypeError.
                return False
            return newword in self.NWORDS
        return False
Example #5
0
        # print(author)
        for poem_link in poem_links:
            sleep(1)
            if ctrlc:
                quit()
            else:
                link2, title = poem_link.xpath('./@href')[0], poem_link.xpath(
                    './@title')[0]
                data['title'] = title
                poem_page = requests.get(website + link2)
                poem = html.fromstring(
                    poem_page.content).xpath('//div[@class="poem"]/p/text()')
                poem_str = "".join(poem).strip()
                if poem_str:
                    data['text'] = poem_str
                    lang = detect_lang(poem_str)
                    if lang == "ne":
                        data['lang'] = "नेपाली"
                    else:
                        with open(os.path.join(dir, "undetected.txt"),
                                  "a") as f:
                            f.write(lang + "\n")
                            f.write(poem_str + "\n")
                        continue

                    with open(
                            os.path.join(dir,
                                         title.split('/')[0]) + '.txt',
                            "w") as f:
                        f.write(json.dumps(data))
                        continue
                    content = sahitya_content.xpath(
                        '//div[@class="entry-content"]/p')
                    author = content[0].xpath('//strong/text()')[0]
                    data['author'] = author
                    content_str = ''.join(
                        map(
                            lambda x: html.tostring(
                                x, encoding='unicode', pretty_print=True),
                            content[1:]))
                    data['text'] = content_str
                    if sahitya_type in lang_map:
                        data['lang'] = lang_map[sahitya_type]
                    else:
                        try:
                            lang = detect_lang(content_str)
                        except lang_detect_exception.LangDetectException:
                            print("error caught")
                            continue
                        except Exception:
                            print("Exception")
                            continue
                        if lang == 'ne':
                            data['lang'] = "नेपाली"
                        elif lang == 'hi':
                            data['lang'] = "हिन्दी"
                        # elif lang == 'Unknown':
                        # data['lang'] = lang
                        else:
                            continue