from collections import Counter


def find_sentence_anagrams1(sentence: str, anagram_dictionary: list, temp=None):
    """
    >>> d = ["Je", "suis", "Voldemort", "Veldomort", "Tom", "Jedusor", "Harry", "Potter", "Hermione", "Granger", "Ron"]
    >>> find_sentence_anagrams1("Tom Elvis Jedusor".lower(), [i.lower() for i in d])
    ['je', 'suis', 'voldemort']
    >>> d.extend(["ja", "sais"])
    >>> find_sentence_anagrams1("Tom Elvis Jedasor".lower(), [i.lower() for i in d])
    ['je', 'sais', 'voldemort']

    :param sentence: the sentence we want to find an anagram of
    :param anagram_dictionary: candidate words the anagram may be built from
    :param temp: words accumulated so far (used internally by the recursion)
    :return: the first anagram found, as a sorted list of words, or None
    """
    if temp is None:
        temp = set()
    if len(sentence) == 0:
        return sorted(temp)
    key = "".join(sorted(sentence.replace(" ", "").lower()))
    # Iterate over a copy so that removing words does not skip entries.
    for word in list(anagram_dictionary):
        if is_valid_subanagram(key, word.lower()):
            temp.add(word)
            anagram_dictionary.remove(word)
            # Keep the letters the word did not consume; elements() preserves
            # multiplicities, unlike iterating over the Counter's keys.
            sentence = "".join((Counter(key) - Counter(word)).elements())
            res = find_sentence_anagrams1(sentence, anagram_dictionary, temp)
            if res is not None:
                return res
    return None
def is_valid_subanagram(hashed_sentence, word):
    """
    >>> is_valid_subanagram(Counter("jesuisvoldemort"), Counter("voldemort"))
    True
    >>> is_valid_subanagram(Counter("jesuisvoldemort"), Counter("tromodlov"))
    False

    :param hashed_sentence: the letters available in the sentence (string or Counter)
    :param word: the candidate word (string or Counter)
    :return: True if every letter of word occurs at least as often in hashed_sentence
    """
    count_hashed_sentence = Counter(hashed_sentence)
    count_word = Counter(word)
    for c in set(word):
        if c not in hashed_sentence or count_word[c] > count_hashed_sentence[c]:
            return False
    return True
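# The same availability check can also be expressed with Counter arithmetic.
# This is only an equivalent sketch, not the version the functions above call:
# Counter subtraction drops non-positive counts, so the difference is empty
# exactly when the sentence supplies every letter of the word often enough.
def is_valid_subanagram_alt(hashed_sentence, word):
    return not (Counter(word) - Counter(hashed_sentence))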
def find_sentence_anagrams2(sentence: str, anagram_dictionary: list, temp=None, res=None):
    """
    >>> d = ["Je", "suis", "Voldemort", "Veldomort", "Tom", "Jedusor", "Harry", "Potter", "Hermione", "Granger", "Ron"]
    >>> a = find_sentence_anagrams2("Tom Elvis Jedusor".lower(), [i.lower() for i in d])
    >>> a
    ['je', 'suis', 'voldemort']
    >>> d.extend(["ja", "sais"])
    >>> find_sentence_anagrams2("Tom Elvis Jedasor".lower(), [i.lower() for i in d])
    ['je', 'sais', 'voldemort']

    :param sentence: the sentence we want to find anagrams of
    :param anagram_dictionary: candidate words the anagrams may be built from
    :param temp: the phrase built so far (used internally by the recursion)
    :param res: the set of anagram phrases found so far
    :return: the set of anagram phrases accumulated in res
    """
    if temp is None:
        temp = ""
    if res is None:
        res = set()
    if len(sentence) == 0:
        return {temp}
    key = "".join(sorted(sentence.replace(" ", "").lower()))
    # Iterate over a copy so that removing words does not skip entries.
    for word in list(anagram_dictionary):
        if is_valid_subanagram(key, word.lower()):
            temp = temp + " " + word if temp else word
            anagram_dictionary.remove(word)
            # elements() keeps letter multiplicities when rebuilding the sentence.
            sentence = "".join((Counter(key) - Counter(word)).elements())
            res.update(find_sentence_anagrams2(sentence, anagram_dictionary, temp, res))
    return res
def create_lexicon(pos, neg):
    lexicon = []
    for file_name in [pos, neg]:
        with open(file_name, 'r') as f:
            contents = f.read()
            for line in contents.split('\n'):
                data = line.strip('\n')
                if data:
                    all_words = word_tokenize(data)
                    lexicon += [w.lower() for w in all_words]

    # Drop stop words.
    lexicons = []
    for word in lexicon:
        if word not in stop_words:
            lexicons.append(word)

    # Counter maps each remaining word to its number of occurrences.
    word_counts = Counter(lexicons)

    # Keep only the words that occur fewer than 4000 times.
    l2 = []
    for word in word_counts:
        if word_counts[word] < 4000:
            l2.append(word)
    return l2
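# create_lexicon relies on word_tokenize and stop_words being defined in the
# enclosing module. A minimal setup sketch; the file names below are only
# placeholders for the real positive/negative review files:
from collections import Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

lexicon = create_lexicon('pos.txt', 'neg.txt')
print(len(lexicon))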
def count_letters(string: str):
    """
    Count letter frequencies, ignoring spaces.

    >>> count_letters("bonjour oui non")
    Counter({'o': 4, 'n': 3, 'u': 2, 'b': 1, 'j': 1, 'r': 1, 'i': 1})
    """
    return Counter(string.replace(" ", ""))
# ---------------- get the dataset --------------------------------
data_main = get_data.Datasets()

# ---------------- get the vocab -----------------------------------
train_reviews, train_sentences, train_tokens = data_main.get_normalized_data("train")
unlabeled_reviews, unlabeled_sentences, unlabeled_tokens = data_main.get_normalized_data("unlabeled")
test_reviews, test_sentences, test_tokens = data_main.get_normalized_data("test")

all_cleaned_reviews = train_reviews + unlabeled_reviews + test_reviews

# create a counter over the tokens of every split
all_tokens = train_tokens + test_tokens + unlabeled_tokens
counter = Counter()
for tokens in all_tokens:
    counter.update(tokens)
print(len(counter))

common_keys = get_most_common_vocab(MAX_NB_WORDS, counter)
print("common keys")
print(common_keys)

# load the vocab
vocab_name = "tuning_vocab_txt"
text = load_vocab(vocab_name)
slices = set(text.split())
vocabulary = create_vocab_dict(slices)
print(vocabulary)
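# get_most_common_vocab, load_vocab and create_vocab_dict are project helpers
# that are not shown here. A plausible sketch of the first one, assuming it
# simply keeps the MAX_NB_WORDS most frequent tokens from the counter:
def get_most_common_vocab(max_nb_words, counter):
    return {word for word, _count in counter.most_common(max_nb_words)}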
import nltk
from collections import Counter
from nltk.util import ngrams

data = []
arr = nltk.word_tokenize(res)
result = []
for index in range(len(arr)):
    if arr[index] == "<":
        # Glue a "<" token back together with the two tokens that follow it.
        btw = arr[index] + arr[index + 1] + arr[index + 2]
        result.append(btw)
    elif arr[index] == ">" or arr[index] == "s" or arr[index] == "f":
        # Skip stray marker pieces.
        continue
    else:
        result.append(arr[index])
data.append(result)

frequencies = Counter()
freq_bi = Counter()
freq_tr = Counter()
freq_fo = Counter()
freq_fi = Counter()

unigram = ngrams(result, 1)
bigrams = ngrams(result, 2)
trigrams = ngrams(result, 3)
fourgrams = ngrams(result, 4)
fivegrams = ngrams(result, 5)

frequencies += Counter(unigram)
freq_bi += Counter(bigrams)
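# The remaining n-gram counters would presumably be filled the same way;
# a sketch of that continuation, plus a quick look at the most frequent entries:
freq_tr += Counter(trigrams)
freq_fo += Counter(fourgrams)
freq_fi += Counter(fivegrams)

print(frequencies.most_common(10))
print(freq_bi.most_common(10))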