def get_words_from_random_n_roots(number_of_roots: int = 1000):
    """Run word generation for a random sample of dictionary roots.

    Reads the root/POS/rule CSV, samples ``number_of_roots`` entries from it
    and calls ``get_words`` once for every sampled root that has at least one
    non-empty rule.

    :param number_of_roots: how many dictionary entries to sample
    :raises ValueError: propagated from ``random.sample`` when the requested
        sample is larger than the number of dictionary entries
    """
    nepali_dictionary = read_csv(helper.get_root_pos_rule_csv_path())
    nepali_dictionary = random.sample(nepali_dictionary, number_of_roots)
    for d in nepali_dictionary:
        word = d.get('word').strip()
        # Materialise the rules once. A bare ``filter`` object is always
        # truthy (so the emptiness check below would never skip anything) and
        # it can be consumed only a single time (so a second ``list(...)``
        # over it would always come back empty).
        rules = list(filter(None, d.get('rules').strip().split(',')))
        if not rules:
            continue
        # Hand over two independent lists, as the original call shape did.
        get_words(list(rules), [(word, list(rules))])
def _make_dict_trie():
    """Build a ``Trie`` from the root/POS/rule CSV.

    Every row's stripped ``word`` is inserted together with the non-empty,
    comma-separated entries of its ``rules`` field.

    :return: the populated trie
    """
    dictionary_trie = Trie()
    for row in read_csv(get_root_pos_rule_csv_path()):
        root_word = row.get('word').strip()
        # Drop empty fragments produced by blank fields or stray commas.
        row_rules = [rule for rule in row.get('rules').strip().split(',') if rule]
        dictionary_trie.insert(root_word, row_rules)
    return dictionary_trie
def make_dict_trie() -> TrieNode:
    """Build a trie rooted at a ``'*'`` node from the root/POS/rule CSV.

    For each row, the stripped ``word`` is added to the trie along with its
    non-empty rules and its part of speech (``None`` when the ``pos`` field
    is blank).

    :return: the root node of the populated trie
    """
    trie_root = TrieNode('*')
    for row in read_csv(helper.get_root_pos_rule_csv_path()):
        root_word = row.get('word').strip()
        pos_tag = row.get('pos').strip()
        if not pos_tag:
            # A blank part of speech is stored as None rather than ''.
            pos_tag = None
        row_rules = [rule for rule in row.get('rules').strip().split(',') if rule]
        add(trie_root, root_word, row_rules, pos_tag)
    return trie_root
def get_rules_for_words_starting_with(
        starting: str, full_word: str) -> Tuple[Set, List[Tuple[str, List]]]:
    """
    Collect the dictionary words that start with a given prefix, along with
    their word-formation rules.

    Only entries that have at least one rule and are no longer than
    ``full_word`` are kept.

    :param starting: starting letter of the word to be lemmatized, स
    :param full_word: word to be lemmatized; सम्बन्धमा
    :return: set of rules across all words found, list of (word, rules)
    """
    collected_rules = set()
    matches = []
    for row in read_csv(helper.get_root_pos_rule_csv_path()):
        candidate = row.get('word').strip()
        candidate_rules = [rule for rule in row.get('rules').strip().split(',') if rule]
        # Guard clauses, checked in the same order as the original condition.
        if not candidate_rules:
            continue
        if not candidate.startswith(starting):
            continue
        if len(candidate) > len(full_word):
            continue
        collected_rules.update(candidate_rules)
        matches.append((candidate, candidate_rules))
    return collected_rules, matches
def insert(self, word):
    # NOTE(review): this takes ``self`` and is called below as
    # ``trie.insert(...)``, so it is presumably a method of TrieNode whose
    # class header lies outside this excerpt — confirm before re-indenting.
    #
    # Walk down from this node one letter at a time, creating a child node
    # for any letter that is not present yet, then record the complete word
    # on the node reached by the last letter.
    node = self
    for letter in word:
        if letter not in node.children:
            node.children[letter] = TrieNode()
        node = node.children[letter]
    node.word = word

# read dictionary file into a trie
trie = TrieNode()
for d in read_csv(get_root_pos_rule_csv_path()):
    # WordCount is incremented here; its initialisation is not visible in
    # this excerpt.
    WordCount += 1
    trie.insert(d.get('word').strip())
# NodeCount is only read here; presumably it is maintained wherever TrieNode
# instances are created — TODO confirm.
print("Read %d words into %d nodes" % (WordCount, NodeCount))

# The search function returns a list of all words that are less than the given
# maximum distance from the target word
def search(word, max_cost):
    # build first row
    # (the integers 0 .. len(word), one entry per prefix length of ``word``)
    current_row = range(len(word) + 1)
    print(current_row)
    results = []
    # recursively search each branch of the trie
    # NOTE(review): the remainder of this function is not visible in this
    # excerpt.
def __init__(self):
    """Load the root/POS/rule CSV and keep the parsed result on ``self.data``."""
    self.data = read_csv(helper.get_root_pos_rule_csv_path())