def extract_ngrams(df, key, by=[], ng_range=(1, 1), pad=False):
    """Nest terms as n-grams.

    Args:
    -----
    df : DataFrame with columns: term, author, doc_id
    key : name of the column holding the terms
    ng_range : (min_gram, max_gram)
    by : list containing fields to group by
    pad : whether to add <start>/<end> symbols when extracting n-grams
    """
    if pad:
        new_df = df.groupby(by)[key]\
            .apply(lambda x: list(everygrams(x,
                                             min_len=ng_range[0],
                                             max_len=ng_range[1],
                                             pad_left=True,
                                             pad_right=True,
                                             left_pad_symbol='<start>',
                                             right_pad_symbol='<end>')))\
            .explode()\
            .reset_index()
    else:
        new_df = df.groupby(by)[key]\
            .apply(lambda x: list(everygrams(x,
                                             min_len=ng_range[0],
                                             max_len=ng_range[1])))\
            .explode()\
            .reset_index()
    return new_df
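# A minimal usage sketch for extract_ngrams above, not part of the original
# source; the toy DataFrame and its column values are assumptions for
# illustration only.
import pandas as pd
from nltk.util import everygrams  # required by extract_ngrams

toy = pd.DataFrame({
    "doc_id": [1, 1, 1, 2, 2],
    "author": ["a", "a", "a", "b", "b"],
    "term": ["deep", "learning", "models", "graph", "theory"],
})
# One row per extracted n-gram tuple (unigrams and bigrams), grouped per document.
ngrams_df = extract_ngrams(toy, key="term", by=["doc_id"], ng_range=(1, 2))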
def get_feature_weights(raw, lower_nopunc, raw_nopunc, cls, tfidf, gram_length):
    X = tfidf.transform([lower_nopunc])
    coef = cls.coef_[0]
    lower_grams = list(
        everygrams(lower_nopunc.split(), gram_length, gram_length))
    original_grams = list(
        everygrams(raw_nopunc.split(), gram_length, gram_length))
    vocab = tfidf.vocabulary_
    weights = []
    for i in range(len(lower_grams)):
        lower_token, original_token = " ".join(lower_grams[i]), " ".join(
            original_grams[i])
        if lower_token in vocab and original_token in raw:
            idx = vocab[lower_token]
            weights.append([original_token, coef[idx] * tfidf.idf_[idx]])
    filtered = []
    if gram_length > 1:
        filtered = remove_overlap(weights, lower_nopunc, raw, raw_nopunc)
    else:
        weights = remove_duplicate(weights)
    weights.sort(key=lambda x: abs(x[1]), reverse=True)
    return weights, filtered
def json_reader(fname, count=1000, stemming=False, bigrams=False):
    """Read reviews from a JSON-lines file.

    Args:
        fname: str: input file
        count: maximum number of lines to read
        stemming: whether to remove stopwords and stem tokens
        bigrams: whether to expand each review into unigrams and bigrams

    Returns:
        generator: iterator over documents
    """
    en_stop = set(stopwords.words('english'))
    p_stemmer = PorterStemmer()

    def convertAscii(text):
        return ''.join([i if ord(i) < 128 else '' for i in text])

    with open(fname, mode="r") as f:
        for line in f:
            if count <= 0:
                break
            count -= 1
            rating = re.search('stars": (.+?),', line)
            if rating:
                rating = int(float(rating.group(1)))
            else:
                continue
            review = re.search('"text": "(.+?)"', line)
            if review:
                review = review.group(1)
            else:
                continue
            review = np.array(word_tokenize(review))
            if not stemming:
                if bigrams:
                    if len(review) < 3:  # too short for bigrams, skip it
                        continue
                    review = np.array(list(everygrams(review, 1, 2)))
                yield {'rating': rating, 'review': review}
            else:
                stopped_tokens = filter(lambda token: token not in en_stop, review)
                stemmed_tokens = map(lambda token: p_stemmer.stem(token), stopped_tokens)
                review = np.array(list(stemmed_tokens))
                if bigrams:
                    review = np.array(list(everygrams(review, 1, 2)))
                yield {'rating': rating, 'review': review}
def get_ranks(doc_id, doc, get_weight, get_personalization=None, weight='weight'):
    G = nx.Graph(name=doc_id)
    doc = nlp(doc)
    for sentence in doc.sents:
        tokens = [str(t) for t in sentence if t.is_alpha and not t.is_stop]
        grams = everygrams(tokens, min_len=1, max_len=3)
        grams = [' '.join(g) for g in grams]
        G.add_nodes_from(grams)
        edges = list(itertools.combinations(grams, 2))
        weighted_edges = [(v1, v2, get_weight(v1, v2)) for v1, v2 in edges]
        G.add_weighted_edges_from(weighted_edges)
    personalization = ({node: get_personalization(node) for node in G.nodes}
                       if get_personalization else None)
    rank = nx.pagerank(G,
                       alpha=1 - 0.15,
                       max_iter=50,
                       weight=weight,
                       personalization=personalization)
    return rank
def activity_two(frec_letters, msg, n_grams):
    R = math.log(len(frec_letters), 2)
    print("\nAbsolute rate R = ", R)
    # Build the different n-grams
    grams = list(everygrams(msg.lower(), n_grams, n_grams))
    # Convert to a set to drop repeated elements
    realgram = list(set(grams))
    # Compute the rates r
    rangos_grams = graficar_rangos(n_grams + 1, msg.lower(), R)
    # Print the rates
    print("\nRates 'r' for each n-gram")
    for i in range(n_grams):
        print("n-gram[", i + 1, "] - R =", rangos_grams[i])
    # Redundancy for each n
    print("\nRedundancy 'D' for each rate 'r'")
    for i in range(n_grams):
        print("n-gram[", i + 1, "] - D =", R - rangos_grams[i])
    print("\nSee figure 'grafica_rango.png'")
    # Information content of each character, plus the entropy
    entropia = 0
    print("\nBits of information for each symbol")
    for key, frec in frec_letters:
        bit_info = math.log2(1 / frec)
        print(key, bit_info)
        entropia += (frec * bit_info)
    print("\nEntropy = ", entropia)
def get_new_ngrams(texts, n=3, vocabulary=None):
    ngrams = {}
    for item in texts:
        # Processing incoming text:
        text = item.get("text")
        probability = item.get("probability")
        text = re.sub(r"[^а-я\-\s]", "", text.lower().strip())
        for i in range(1, n + 1):
            # Getting i-grams:
            igrams = list(nltk.everygrams(text.split(), i, i))
            # Removing i-grams that do not have any new words
            bad_indeces = []
            for index, igram in enumerate(igrams):
                bad_igram = True
                for word in igram:
                    if word not in vocabulary:
                        bad_igram = False
                        break
                if bad_igram:
                    bad_indeces.append(index)
            for bad_index in bad_indeces[::-1]:
                igrams.pop(bad_index)
            # Collecting i-grams with new words
            if len(igrams) > 0:
                if i not in ngrams:
                    ngrams[i] = {}
                ngrams[i].update({
                    " ".join(igram): probability
                    for igram in igrams
                    if " ".join(igram) not in ngrams[i]
                    or probability > ngrams[i][" ".join(igram)]
                })
    return ngrams
def freq_by_cuisine():
    # create word frequency plots for each cuisine in the database
    df = pd.read_csv('concat_uncleaned_recipes.csv').dropna()  # read the file and drop the NaNs
    # build a cleaned ingredients string for each recipe
    df['Ingredients'] = df.apply(
        lambda row: ' '.join(pre_processing.clean_strings(row['Ingredients'])),
        axis=1)
    df_freq_mex = df[df['Cuisine'] == 2]
    df_freq_ital = df[df['Cuisine'] == 3]
    df_freq_fren = df[df['Cuisine'] == 5]
    df_freq_amer = df[df['Cuisine'] == 6]
    df_freq_brit = df[df['Cuisine'] == 7]
    df_freq_ch = df[df['Cuisine'] == 8]
    df_freq_ind = df[df['Cuisine'] == 9]
    df_freq_japan = df[df['Cuisine'] == 13]
    df_list = [
        df_freq_mex, df_freq_ital, df_freq_fren, df_freq_amer, df_freq_brit,
        df_freq_ch, df_freq_ind, df_freq_japan
    ]
    # go through each cuisine and plot its word frequency for the top 20 terms
    for cuisine in df_list:
        data = cuisine['Ingredients'].apply(
            lambda row: list(everygrams(row.split(' '), min_len=2, max_len=2)))
        flat_data = [item for sublist in data for item in sublist]
        fdist = FreqDist(flat_data)
        print(fdist.most_common(20))
        word_distro_plot(fdist)
def extract_skills(input_text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(input_text)

    # remove the stop words
    filtered_tokens = [w for w in word_tokens if w not in stop_words]

    # remove the punctuation (filter the stop-word-filtered tokens, not the raw tokens)
    filtered_tokens = [w for w in filtered_tokens if w.isalpha()]

    # generate bigrams and trigrams (such as artificial intelligence)
    bigrams_trigrams = list(map(' '.join, nltk.everygrams(filtered_tokens, 2, 3)))

    # we create a set to keep the results in
    found_skills = set()

    # we search for each token in our skills database
    for token in filtered_tokens:
        if token.lower() in SKILLS_DB:
            found_skills.add(token)

    # we search for each bigram and trigram in our skills database
    for ngram in bigrams_trigrams:
        if ngram.lower() in SKILLS_DB:
            found_skills.add(ngram)

    return found_skills
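# Hedged, self-contained sketch of the matching pattern used above; SKILLS_DB
# is not shown in the snippet, so a toy skill set stands in for it here.
import nltk

TOY_SKILLS_DB = {"python", "machine learning", "natural language processing"}

def toy_extract_skills(text):
    stop_words = set(nltk.corpus.stopwords.words("english"))
    tokens = [w.lower() for w in nltk.tokenize.word_tokenize(text)
              if w.isalpha() and w.lower() not in stop_words]
    candidates = set(tokens)
    candidates |= set(map(" ".join, nltk.everygrams(tokens, 2, 3)))
    return candidates & TOY_SKILLS_DB

# toy_extract_skills("Experienced in Python and machine learning")
# -> {'python', 'machine learning'}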
def get_ngram_in_query(docs, query, min_n, max_n):
    docs = preprocess_docs(docs, query)
    ngram = []
    for doc in docs:
        for sent in doc.split_sent():
            ngram += [
                ''.join(token) for token in everygrams(sent, min_n, max_n)
                for char in query if char in token
            ]
    return ngram
def long_gram_first(ngram):
    for token in ngram:
        tf = ngram[token]
        for short in [
                ''.join(short)
                for short in everygrams(token, len(token) - 1, len(token) - 1)
        ]:
            ngram[short] -= tf
    return ngram
def graficar_rangos(max_rango, msg, R):
    lis_values_rangos = []
    for i in range(1, max_rango):
        # Build the different n-grams
        grams = list(everygrams(msg, i, i))
        # Convert to a set to drop repeated elements
        realgram = list(set(grams))
        # Rate r_i = log base 2**i of the number of distinct i-grams
        r = math.log(len(realgram), 2**i)
        lis_values_rangos.append(r)
    return lis_values_rangos
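# Worked illustration of the rate/redundancy quantities computed above, assuming
# a 26-letter alphabet and a text with 300 distinct bigrams (numbers invented):
#   absolute rate  R   = log2(26)      ~ 4.70 bits/symbol
#   bigram rate    r_2 = log2(300) / 2 ~ 4.11 bits/symbol
#   redundancy     D   = R - r_2       ~ 0.59 bits/symbol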
def check_spelling(word):
    d = enchant.Dict('en_US')
    try:
        if d.check(word):
            return word.strip()
        else:
            # fall back to one of the word's sub-grams that passes the spell check
            sub_string_arr = [
                ''.join(_ngram) for _ngram in everygrams(word)
                if d.check(''.join(_ngram)) and len(_ngram) > 1
            ]
            word = sub_string_arr[-1].strip()
            word = ''.join(e for e in word if e.isalpha())
            return word
    except Exception:
        return False
def tokenize_lemmatize_ngram(sentence):
    sentence = sentence.lower().strip('"')
    lemmatizer = WordNetLemmatizer()
    tokenized_list = nltk.word_tokenize(sentence)
    tokenized_list = [
        word for word in tokenized_list if word not in stop_words
    ]
    lemmatized_word_list = [lemmatizer.lemmatize(w) for w in tokenized_list]
    every_gramm_list = list(everygrams(lemmatized_word_list, 2, 4))
    return [' '.join(gram) for gram in every_gramm_list]
def tokenize(self, text: str) -> List[Token]:
    """
    Splits sentences into a set of all possible ngrams up to
    self._max_ngram_degree using nltk
    """
    ngrams_iterator = everygrams(text.split(), max_len=self._max_ngram_degree)
    tokens = [Token(" ".join(ngram)) for ngram in ngrams_iterator]

    for start_token in self._start_tokens:
        tokens.insert(0, Token(start_token, 0))
    for end_token in self._end_tokens:
        tokens.append(Token(end_token, -1))

    return tokens
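# A minimal sketch of the core behaviour of tokenize above, outside the class;
# the maximum n-gram degree of 2 is an assumed value and the Token wrapping is
# omitted.
from nltk.util import everygrams

text = "deep learning models"
ngram_tokens = [" ".join(g) for g in everygrams(text.split(), max_len=2)]
# ngram_tokens holds every unigram and bigram of the sentence, e.g.
# "deep", "deep learning", "learning", "learning models", "models"
# (the exact ordering depends on the installed NLTK version).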
def ngrams_to_topics(phrases, merge=True, min_similarity=.96):
    # Core analysis: find matches
    found_topics = {}
    successful_grams = {}
    for concept in phrases:
        for ngram in everygrams(concept.split(), 1, 3):
            # TODO: pick between 'phrase' and 'concept' terminology
            concept = "_".join(ngram)
            if concept in MODEL:
                # there's an exact match for the '_'-concatenated ngram in the ontology
                matches = MODEL[concept]
            else:
                # we'll instead search for ontology elements proximate in vector space
                matches = match_ngram(ngram, merge=merge)
            for match in matches:
                topic = match["topic"]
                sim_t = match["sim_t"]
                wet = match["wet"]
                sim_w = match["sim_w"]
                if sim_t >= min_similarity and topic in CSO["topics_wu"]:
                    if topic in found_topics:
                        # tracking this match
                        found_topics[topic]["times"] += 1
                        found_topics[topic]["gram_similarity"].append(sim_w)
                        # tracking the matched gram
                        if concept in found_topics[topic]["grams"]:
                            found_topics[topic]["grams"][concept] += 1
                        else:
                            found_topics[topic]["grams"][concept] = 1
                        # tracking the most similar gram to the topic
                        if sim_t > found_topics[topic]["embedding_similarity"]:
                            found_topics[topic]["embedding_similarity"] = sim_t
                            found_topics[topic]["embedding_matched"] = wet
                    else:
                        # creating a new topic in the result set
                        found_topics[topic] = {
                            'grams': {concept: 1},
                            'embedding_matched': wet,
                            'embedding_similarity': sim_t,
                            'gram_similarity': [sim_w],
                            'times': 1,
                            'topic': topic
                        }
                    if sim_w == 1:
                        found_topics[topic]["syntactic"] = True
                    # reporting successful grams: it is the inverse of found_topics[topic]["grams"]
                    if concept in successful_grams:
                        successful_grams[concept].append(topic)
                    else:
                        successful_grams[concept] = [topic]
    return found_topics, successful_grams
def cal_freq_ngrams(word_lv_sents):
    gram_freq = dict()
    with tqdm(total=len(word_lv_sents)) as bar:
        for sent in word_lv_sents:
            max_l = min([len(sent), 4])
            for gram in everygrams(sent, max_len=max_l):
                if gram not in gram_freq:
                    gram_freq[gram] = 1
                else:
                    gram_freq[gram] += 1
            bar.update()
    return gram_freq
def extract_skills(corpus, filename):
    '''
    Parses a string to extract resume skills

    Parameters:
        corpus (string): The extracted text of a resume
        filename (string): The filepath to the resume being parsed

    Returns:
        skills (set): The extracted skills of the corpus
    '''
    global UNKNOWNS
    stop_words = set(nltk.corpus.stopwords.words('english'))
    word_tokens = nltk.tokenize.word_tokenize(corpus)
    filtered_tokens = [
        w.lower() for w in word_tokens if w not in stop_words and w.isalpha()
    ]
    bitri = nltk.everygrams(filtered_tokens, 2, 3)
    filtered_tokens = set(filtered_tokens)
    for gram in bitri:
        gram = ' '.join(gram)
        gram = gram.lower()
        filtered_tokens.add(gram)

    db = DatabaseInterface()
    skills = db.getKnownSkills(filtered_tokens)
    unknown_skills = db.getUnknowns(filtered_tokens)
    UNKNOWNS = unknown_skills.copy()
    try:
        session.run(asyncAPICheck)
    except SystemExit:
        pass
    db.recordSkills(UNKNOWNS)
    db.recordNotSkills(unknown_skills.difference(UNKNOWNS))

    extraction_package_skills = set([
        elem.lower()
        for elem in ResumeParser(filename).get_extracted_data()['skills']
    ])
    db.recordSkills(extraction_package_skills)
    skills = skills.union(extraction_package_skills)
    skills = skills.union(UNKNOWNS)
    db.close()
    return skills
def extract_people(data, list1):
    """
    Extracts potential People nominees from an individual tweet
    """
    result = []
    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe',
                    'golden globe', 'goldenglobes', 'goldenglobe', 'golden',
                    'globe', 'globes']
    stop = remove_terms + list1
    for tweet in data:
        tweet = re.sub(r"\d+", "", tweet)      # strip nums
        tweet = re.sub(r'http\S+', '', tweet)  # strip urls
        tweet = re.sub(r'#\S+', '', tweet)     # strip hashtags
        tweet = tweet.translate(translator)    # strip punctuation
        tweet = tweet.split()                  # tokenize
        tweet = [term for term in tweet if term.lower() not in stop_words]  # remove stop words
        # drop any token that contains one of the stop terms
        # (rebuilt as a comprehension instead of removing items while iterating)
        tweet = [j for j in tweet
                 if not any(i.lower() in j.lower() for i in stop)]
        result.append(tweet)

    grams = []
    for tweet in result:
        if tweet:
            # Get all possible bigrams & trigrams in a tweet
            gram = list(nltk.everygrams(tweet, 2, 3))
            # Keep only grams whose words all look like capitalized names
            for g in gram:
                if all(re.match(r'\b[A-Z][a-z]+\b', word) for word in g):
                    grams.append(' '.join(g))
    fdist = nltk.FreqDist(grams)
    try:
        names = fdist.most_common()
    except Exception:
        names = ""
    return names
def get_word_vector(self,
                    word: str,
                    lemma: str,
                    pos_tag: str = '',
                    morph_tags: Tuple[str] = tuple()):
    morphemes = self.word2morph[lemma]
    ngrams = tuple([
        ''.join(g) for g in everygrams(word,
                                       min_len=self.ngram_min_len,
                                       max_len=self.ngram_max_len)
    ])
    vector = self.morph2vec.get_vector(word=word,
                                       lemma=lemma,
                                       pos=pos_tag,
                                       morph_tags=morph_tags,
                                       morphemes=morphemes.segments,
                                       ngrams=ngrams)
    return vector
def train(self):
    logger.info("Training model...")
    logger.info("tokenizing...")
    corpus = " ".join(self.df["haiku"])
    if self.tokenization == "words":
        tokens = nltk.word_tokenize(corpus)
    elif self.tokenization == "characters":
        tokens = list(corpus)
    ngrams = nltk.everygrams(tokens, max_len=self.order)
    logger.info("fitting...")
    self.model.fit([ngrams], vocabulary_text=self.vocab)
    logger.info("Trained model.")
def extract_presenters(data, list1, winners):
    result = []
    translator = str.maketrans('', '', string.punctuation)
    remove_terms = ['#goldenglobes', 'golden globes', '#goldenglobe',
                    'golden globe', 'goldenglobes', 'goldenglobe', 'golden',
                    'globe', 'globes']
    if winners:
        stop = remove_terms + list1 + winners.split()
    else:
        stop = remove_terms + list1
    for tweet in data:
        tweet = re.sub(r"\d+", "", tweet)      # strip nums
        tweet = re.sub(r'http\S+', '', tweet)  # strip urls
        tweet = re.sub(r'#\S+', '', tweet)     # strip hashtags
        tweet = tweet.translate(translator)    # strip punctuation
        tweet = tweet.split()                  # tokenize
        # drop any token that contains one of the stop terms
        # (rebuilt as a comprehension instead of removing items while iterating)
        tweet = [j for j in tweet
                 if not any(i.lower() in j.lower() for i in stop)]
        result.append(tweet)

    grams = []
    for tweet in result:
        if tweet:
            # Get all possible bigrams & trigrams in a tweet
            gram = list(nltk.everygrams(tweet, 2, 3))
            # Keep only grams whose words all look like capitalized names
            for g in gram:
                if all(re.match(r'\b[A-Z][a-z]+\b', word) for word in g):
                    grams.append(' '.join(g))
    fdist = nltk.FreqDist(grams)
    try:
        names = fdist.most_common()
    except Exception:
        names = ""
    return names
def get_everygrams(data: List[List[str]], n: int):
    grams = []
    for sentence in data:
        gram = everygrams(sentence,
                          min_len=2,
                          max_len=n,
                          pad_left=True,
                          pad_right=True,
                          left_pad_symbol="<s>",
                          right_pad_symbol="</s>")
        grams.append(gram)
    return grams
def get_collocations(words):
    # returns n-grams up to trigrams that appear more than `minimum_frequency`
    # times, pruning grams that are redundant
    minimum_frequency = 3
    ngrams = {
        "_".join(ngram): frequency / len(words)
        for ngram, frequency in FreqDist(everygrams(words, max_len=3)).items()
        if frequency > minimum_frequency
    }
    collocations = dict(ngrams)
    for ngram, likelihood in dict(ngrams).items():  # .iteritems() is Python 2 only
        grams = ngram.split("_")
        if len(grams) != 1:
            gram_likelihoods = [ngrams[gram] for gram in grams]
            if likelihood < 0.5 * np.prod(gram_likelihoods)**(1 / len(grams)):
                collocations.pop(ngram, None)
            else:
                for gram in grams:
                    collocations.pop(gram, None)
    return sorted(collocations.items(), key=itemgetter(1), reverse=True)
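# Hedged usage sketch for get_collocations above; it assumes the imports the
# function relies on (FreqDist and everygrams from nltk, numpy as np,
# operator.itemgetter) and uses an invented token list.
words = ["new", "york", "city"] * 5 + ["pizza"] * 5
collocations = get_collocations(words)
# -> multi-word grams such as "new_york" and "new_york_city" ranked by relative
#    frequency, with their constituent unigrams pruned from the result.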
def exists(gram, keyword):
    "checks if an n-gram already exists in the keywords or not"
    keyword = keyword.lower()
    # build 1- to 5-grams of the keyword to match against the given gram
    keyword = list(everygrams(keyword.split(), 1, 5))
    new_keyword = []
    for p in keyword:
        p = " ".join(p)
        new_keyword.append(p)
    keyword = new_keyword
    if gram.lower() in keyword:
        return True
    else:
        return False
def count_words(s, n_gram_min, n_gram_max, nb_words):
    stop_words = set(stopwords.words("english"))
    # keep negation words out of the stop word list so they are not filtered
    l_w = ['aren', "aren't", "couldn'", 'couldn', "couldn't", 'didn', "didn't",
           'doesn', "doesn't", 'don', "don't", "hadn't", 'hasn', "hasn't",
           'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
           "needn't", 'no', 'nor', 'not', "shan't", 'shouldn', "shouldn't",
           'wasn', "wasn't", "won't", 'wouldn', "wouldn't"]
    for i in l_w:
        stop_words.discard(i)
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    filtered_sentence = [w for w in tokens if w not in stop_words]
    output = list(everygrams(filtered_sentence, n_gram_min, n_gram_max))
    c = Counter(output)
    counts = c.most_common(nb_words)
    return counts
def df_get_tokens(df, col_name, n_gram=1):
    """
    Tokenize 'col_name' and store the result in a new 'tokenized_text' column of df
    """
    def _token_creator(sentence):
        replaced_punctation = list(
            map(lambda token: re.sub(r'[^\wa-zA-Z0-9!?]+', '', token), sentence))
        removed_punctation = list(
            filter(lambda token: not token.isdigit(), replaced_punctation))
        removed_empty = list(filter(None, removed_punctation))
        replace_ = list(
            map(lambda token: re.sub(r'^_|(\d)+(_$|)|_\W|\W_|_$', '', token), removed_empty))
        replace_ = list(map(lambda token: re.sub(r'^_|_$', '', token), replace_))
        removed_empty = list(filter(None, replace_))
        return removed_empty

    if n_gram == 1:
        df['tokenized_text'] = list(map(nltk.word_tokenize, df[col_name]))
    else:
        df['tokenized_text'] = df[col_name].apply(
            lambda x: ['_'.join(ng)
                       for ng in nltk.everygrams(nltk.word_tokenize(x), 1, n_gram)])
    df['tokenized_text'] = list(map(_token_creator, df.tokenized_text))
def getDaySubjects(identifier):
    """
    Attempt to add subjects based on the 50 most frequent n-grams in this IA item
    """
    # Cf. https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
    # https://agailloty.rbind.io/en/project/nlp_clean-text/
    # https://stackoverflow.com/a/58656665
    r = requests.get("https://archive.org/download/{}/{}_djvu.txt".format(
        identifier, identifier))
    tokens = nltk.tokenize.word_tokenize(r.text, language="italian")
    tokens = [
        word.lower() for word in tokens
        if word.isalnum() and word not in nltk.corpus.stopwords.words("italian")
    ]
    grams = nltk.FreqDist(nltk.everygrams(tokens, min_len=2, max_len=5))
    commongrams = [
        " ".join(gram[0]) for gram in grams.most_common() if gram[1] > 3
    ][:50]
    return commongrams
def get(filename, bookname):
    # open, read, lowercase and split the input book file
    with open(filename, "r", encoding='utf8', errors="ignore") as f:
        text_list = f.read().lower().split()
    # --------------------------------------------------------------------------
    # remove stopwords (common words that carry little meaning)
    stop_list = []
    stop_words = list(stopwords.words("english"))
    for word in text_list:
        if word not in stop_words:
            stop_list.append(word)
    # --------------------------------------------------------------------------
    # str.maketrans/translate is used here to strip punctuation characters
    trans = str.maketrans("", "", '~!@#$%^&*()`,.<>/?\\|[]{};-\n\':"')
    strip_list = []
    for word in stop_list:
        word = word.translate(trans)
        strip_list.append(word)
    # --------------------------------------------------------------------------
    # stemming reduces each word to its base/root form
    stem_list = []
    ss = SnowballStemmer("english")
    for word in strip_list:
        word = ss.stem(word)
        stem_list.append(word)
    # --------------------------------------------------------------------------
    # generate all possible 3- to 5-grams from the text
    ngram = list(everygrams(stem_list, min_len=3, max_len=5))
    # count the occurrences of each n-gram
    count_dict = defaultdict(lambda: 0)
    for tuples in ngram:
        count_dict[tuples] += 1
    df = pd.DataFrame(count_dict, index=[bookname])
    return df
def extract_skills(nlp_text, noun_chunks, skills_file=None):
    '''
    Helper function to extract skills from spacy nlp text

    :param nlp_text: object of `spacy.tokens.doc.Doc`
    :param noun_chunks: noun chunks extracted from nlp text
    :param skills_file: optional path to a CSV file listing skills
    :return: list of skills extracted
    '''
    tokens = [token.text for token in nlp_text if not token.is_stop]
    if not skills_file:
        data = pd.read_csv(
            os.path.join(os.path.dirname(__file__), 'skills.csv'))
    else:
        data = pd.read_csv(skills_file)
    skills = list(data.columns.values)
    skills = [s.replace('_', ' ') for s in skills]
    skillset = []

    # generating n-grams from text
    ngram_len = max(len(group.split()) for group in skills)
    list_ngrams = list(everygrams(tokens, 1, ngram_len))
    chunks = [' '.join(words) for words in list_ngrams]

    # match each n-gram against the list of skills
    for token in chunks:
        token = token.lower().strip()
        if token in skills:
            skillset.append(token)
        else:
            highest = process.extractOne(token, skills)
            if highest[1] == 100:
                skillset.append(highest[0])

    skill_list = [
        i.replace(' ', '_') for i in set([i.lower() for i in skillset])
    ]
    return sorted(skill_list)
def from_conll_line(self, line):
    parts = line.split('\t')
    word = parts[1].replace(self.special_char, '-')
    lemma = parts[2].replace(self.special_char, '-').lower()
    morphemes = (self.word2morphemes[lemma].segments
                 if self.word2morphemes and lemma else tuple())
    return Token(
        index=int(parts[0]),
        word=word,
        lemma=lemma,
        pos=parts[3],
        xpos=parts[4],
        morphological_tags=tuple(parts[5].split('|')),
        morphemes=morphemes,
        ngrams=tuple([
            ''.join(g) for g in everygrams(
                word, min_len=self.min_ngram_len, max_len=self.max_ngram_len)
        ]),
    )
def build(n, input_file, output_file_name):
    sentences = []
    with codecs.open(input_file, 'r', encoding='utf8') as f:
        for line in f:
            sentences.append(line)

    ngrams = []
    exclude = set(string.punctuation)
    for sentence in sentences:
        sentence = ''.join(ch for ch in sentence.lower() if ch not in exclude)
        ngrams.extend(list(everygrams(sentence.split(), max_len=n, min_len=n)))
    unique_ngrams = sorted(set(ngrams))

    output_file = output_file_name + "_unclean.txt"
    with codecs.open(output_file, 'w', encoding='utf8') as f:
        for ngram in unique_ngrams:
            line = ngram + (str(ngrams.count(ngram)),)
            for word in line:
                f.write(''.join(word) + ' ')
            f.write('\n')
    return output_file