# Assumed module-level context from the source repo: `tokenizer` (an NLTK-style
# tokenizer object) and a `Rouge` class providing `get_score` and `jacknifing`.
from nltk.util import ngrams, skipgrams


def rouge_s(references, candidate, beta, d_skip=None, averaging=True, smoothing=False):
    rouge_s_list = []
    # With d_skip unset, fall back to the string length, i.e. an effectively
    # unlimited skip distance, so every skip-bigram is generated.
    k_c = len(candidate) if d_skip is None else d_skip
    cand_skip_list = list(skipgrams(tokenizer.tokenize(candidate), n=2, k=k_c))
    for ref in references:
        k_ref = len(ref) if d_skip is None else d_skip
        ref_skip_list = list(skipgrams(tokenizer.tokenize(ref), n=2, k=k_ref))
        count = 0
        for bigram in cand_skip_list:
            if bigram in ref_skip_list:
                count += 1
        if not smoothing:
            r_skip = count / len(ref_skip_list)
            p_skip = count / len(cand_skip_list)
        else:
            # ROUGE-SU style smoothing: extend the match counts with unigrams.
            cand_ungm = list(ngrams(tokenizer.tokenize(candidate), n=1))
            ref_ungm = list(ngrams(tokenizer.tokenize(ref), n=1))
            for ungm in cand_ungm:
                if ungm in ref_ungm:
                    count += 1
            r_skip = count / (len(ref_skip_list) + len(ref_ungm))
            p_skip = count / (len(cand_skip_list) + len(cand_ungm))
        score = Rouge.get_score(r_skip, p_skip, beta)
        rouge_s_list.append(score)
    return Rouge.jacknifing(rouge_s_list, averaging=averaging)
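# For context, a self-contained sketch of the same skip-bigram F-measure,
# using plain whitespace tokenization in place of the repo-specific
# `tokenizer` and `Rouge` helpers (both are assumptions here):
from nltk.util import skipgrams


def rouge_s_sketch(reference, candidate, beta=1.0):
    # Skip-bigrams with an effectively unlimited gap (k = sequence length).
    ref_toks, cand_toks = reference.split(), candidate.split()
    ref_sb = list(skipgrams(ref_toks, 2, len(ref_toks)))
    cand_sb = list(skipgrams(cand_toks, 2, len(cand_toks)))
    overlap = sum(1 for bg in cand_sb if bg in ref_sb)
    if not ref_sb or not cand_sb or overlap == 0:
        return 0.0
    r = overlap / len(ref_sb)
    p = overlap / len(cand_sb)
    return (1 + beta ** 2) * p * r / (r + beta ** 2 * p)


print(rouge_s_sketch("police killed the gunman", "police kill the gunman"))  # 0.5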
def skipgrams(self, n, k, words=False, filtrate=False, lower=True, **kwargs):
    method = self.words if words else self.lemmas
    yield from nltk.skipgrams(method(filtrate=filtrate, lower=lower), n, k, **kwargs)
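# As a quick reference for what `nltk.skipgrams` itself yields (a plain demo,
# independent of the class method above):
import nltk

tokens = "if it is well hidden".split()
# k is the maximum number of skipped tokens between gram members.
print(list(nltk.skipgrams(tokens, 2, 0)))
# [('if', 'it'), ('it', 'is'), ('is', 'well'), ('well', 'hidden')]
print(list(nltk.skipgrams(tokens, 2, 1)))
# adds pairs that skip one token, e.g. ('if', 'is'), ('it', 'well')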
def skip_grams(tokens, n, k):
    # Score a token sequence by VADER polarity over its skip-gram members.
    # `sid` is a SentimentIntensityAnalyzer instance defined at module level
    # in the source. For tokens = ['if', 'it', 'is', 'well', 'hidden']:
    #   k=0 -> [('if', 'it'), ('it', 'is'), ('is', 'well'), ('well', 'hidden')]
    #   k=1 -> [('if', 'it'), ('if', 'is'), ('it', 'is'), ('it', 'well'),
    #           ('is', 'well'), ('is', 'hidden'), ('well', 'hidden')]
    #   k=2 -> [('if', 'it'), ('if', 'is'), ('if', 'well'), ('it', 'is'),
    #           ('it', 'well'), ('it', 'hidden'), ('is', 'well'),
    #           ('is', 'hidden'), ('well', 'hidden')]
    skip_gram_value = 0
    a = list(nltk.skipgrams(tokens, n, k))
    for j in range(len(a)):
        for m in range(n):  # renamed from `k` to avoid shadowing the skip parameter
            ss = sid.polarity_scores(a[j][m])
            if ss["pos"] == 1:
                skip_gram_value += 1
            if ss["neg"] == 1:
                skip_gram_value -= 1
    return skip_gram_value
def __call__(self, doc):
    tokens = list(self.sentiment_aware_tokenize(doc))
    if self.negate:
        tokens = nltk.sentiment.util.mark_negation(tokens)
    if self.n == 1:
        return tokens
    skipgrams = list(nltk.skipgrams(tokens, self.n, self.k))
    return [' '.join(s) for s in skipgrams]
def SkipBigrams(x):
    all_skip_bigrams = []
    final_list = []
    for sentence in x:
        all_skip_bigrams.append(nltk.skipgrams(sentence, 2, 5))
    for skipgram_sentence in all_skip_bigrams:
        for tupl in skipgram_sentence:
            final_list.append(tupl[0] + ' ' + tupl[1])
    return final_list
def rouge_s(summary: str, ref: str, beta: float = 1) -> float:
    """Computes the ROUGE-S score of a summary, i.e. ROUGE over skip-bigram overlap.

    Args:
        summary: A `str` corresponding to a summary we want to evaluate.
        ref: A `str` corresponding to a reference summary we use to evaluate.
        beta: A `float` giving the importance of the precision in comparison
            to the recall.

    Returns:
        A `float` between 0 and 1 giving the ROUGE-S score.
    """
    # `words` is the repo's tokenization helper, returning a list of tokens.
    vocab_summary = words(summary)
    vocab_ref = words(ref)
    summary_skip2 = set(skipgrams(vocab_summary, 2, len(vocab_summary)))
    ref_skip2 = set(skipgrams(vocab_ref, 2, len(vocab_ref)))
    # Guard against empty skip-bigram sets to avoid division by zero.
    if not ref_skip2 or not summary_skip2:
        return 0
    p = len(summary_skip2 & ref_skip2) / len(summary_skip2)
    r = len(summary_skip2 & ref_skip2) / len(ref_skip2)
    if r != 0 or p != 0:
        return ((1 + beta**2) * p * r) / (r + (beta**2 * p))
    return 0
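# A quick call to the function above, with `words` stubbed as plain whitespace
# splitting (the repo's real tokenizer is unknown; this is an assumption):
words = str.split
print(rouge_s("the cat sat", "the cat sat down"))  # p = 1.0, r = 0.5 -> 2/3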
def SkipTrigrams(tokenized_sentence):
    all_skip_bigrams = []
    final_list = []
    # With k=0 these are plain contiguous trigrams.
    all_skip_bigrams.append(nltk.skipgrams(tokenized_sentence, 3, 0))
    for tupl in list(all_skip_bigrams[0]):
        final_list.append(tupl[0] + ' ' + tupl[1] + ' ' + tupl[2])
    return final_list
def SkipBigramsSentence(x):
    all_skip_bigrams = []
    final_list = []
    all_skip_bigrams.append(list(nltk.skipgrams(x, 2, 5)))
    for tupl in all_skip_bigrams[0]:
        final_list.append(tupl[0] + ' ' + tupl[1])
    return final_list
def get_sg(seqs, scores, min_n, max_n, skip):
    """
    Return all skipgrams of seqs and scores of length [min_n, max_n],
    with at most `skip` skipped tokens.

    :param seqs: token sequences, 2D list or similar
    :param scores: importance-score sequences, 2D list or similar
    :param min_n: minimum skipgram length
    :param max_n: maximum skipgram length (inclusive)
    :param skip: max number of tokens to skip
    :return: 2D list of skipgrams of seqs and scores (n_inst * n_sg)
    """
    cur_inst_sg_seqs, cur_inst_sg_scores = list(), list()
    for n in range(min_n, max_n + 1):
        if not n:
            continue
        if n == 1:
            # Unigrams pass through unchanged.
            cur_inst_sg_seqs.extend(seqs)
            cur_inst_sg_scores.extend(scores)
            continue
        cur_inst_sg_seqs.extend([' '.join(sg) for sg in skipgrams(seqs, n=n, k=skip)])
        # A skipgram's score is the mean of its members' scores.
        cur_inst_sg_scores.extend([np.mean(sg) for sg in skipgrams(scores, n=n, k=skip)])
    return cur_inst_sg_seqs, cur_inst_sg_scores
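# A toy call, treating `seqs` as one token sequence and `scores` as its
# per-token importance values (numpy and nltk.util.skipgrams assumed in scope):
import numpy as np
from nltk.util import skipgrams

tokens = ["cheap", "flights", "to", "boston"]
weights = [0.9, 0.8, 0.1, 0.7]
sg_seqs, sg_scores = get_sg(tokens, weights, min_n=1, max_n=2, skip=1)
print(sg_seqs)    # unigrams first, then skip-bigrams like 'cheap flights'
print(sg_scores)  # matching means, e.g. (0.9 + 0.8) / 2 for 'cheap flights'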
def skip_grams(tokens, n, dist):
    # Variant that collects the positive and negative words themselves;
    # `sia` is a SentimentIntensityAnalyzer instance from the source module.
    skip_gram_value = 0  # accumulated but not returned by this variant
    pos_words = []
    neg_words = []
    arr = list(nltk.skipgrams(tokens, n, dist))
    for j in range(len(arr)):
        for k in range(n):
            ss = sia.polarity_scores(arr[j][k])
            if ss["pos"] == 1:
                skip_gram_value += 1
                pos_words.append(arr[j][k])
            if ss["neg"] == 1:
                skip_gram_value -= 1
                neg_words.append(arr[j][k])
    return pos_words, neg_words
def get_n_skipgrams(tokens, n, k):
    """
    Function to extract the skipgrams of a given list of tokens.

    Args:
    -----
    tokens (list of strings) >>> list of tokens extracted by using the
        TextPreprocessing class.
    n (int) >>> number of gram.
    k (int) >>> skip param for the skip gram.

    Returns:
    --------
    A list of tuples that contains the skipgrams.
    """
    sent = " ".join(tokens).split()
    return list(nltk.skipgrams(sent, n, k))
def remove_html(raw_html):
    # `cleanr`, REPLACE_BY_SPACE_RE, BAD_SYMBOLS_RE, STOPWORDS, and word_lemma
    # are module-level globals in the source.
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = cleantext.lower()
    cleantext = re.sub('\xa0', ' ', cleantext)  # non-breaking spaces
    cleantext = re.sub('•', ' ', cleantext)
    cleantext = re.sub(REPLACE_BY_SPACE_RE, " ", cleantext)
    cleantext = re.sub(BAD_SYMBOLS_RE, "", cleantext)
    cleantext = " ".join([
        word_lemma.lemmatize(w) for w in cleantext.split(" ")
        if w not in STOPWORDS
    ])
    # Append skip-gram features built from the first 50 tokens (n=3, k=1).
    cleantext = cleantext + ' '.join([
        ' '.join(x)
        for x in list(skipgrams(itertools.islice(cleantext.split(), 50), 3, 1))
    ])
    return cleantext
from nltk import skipgrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import itertools

stopwords = list(stopwords.words('english'))
# Tokens are strings, so the numeric stopwords need to be strings too.
numbers = [str(n) for n in range(0, 100)]
forumSpecificWords = ['>', '<', '?', '[', ']', '*'] + numbers
stopwords += forumSpecificWords

with open("shakespeare.txt", 'rb') as f:
    content = f.read().decode('utf-8')

tokens = word_tokenize(content)
withoutStopwords = [w.strip() for w in tokens if w not in stopwords]

# Skipgrams over the first 100 content tokens for several (n, k) settings.
print("\n-----2,2:",
      list(skipgrams(itertools.islice(withoutStopwords, 100), 2, 2)))
print("\n-----2,3:",
      list(skipgrams(itertools.islice(withoutStopwords, 100), 2, 3)))
print("\n-----3,2:",
      list(skipgrams(itertools.islice(withoutStopwords, 100), 3, 2)))
import collections
import pickle
import sys
import time

import nltk
import numpy as np
import pandas as pd
import scipy.sparse.linalg
from nltk.util import skipgrams


def main():
    # Start the counter
    start = time.time()

    # Load raw data and tokenize
    print("Loading data...")
    corpus_file = str(sys.argv[1])
    corpus = open(corpus_file, 'r')
    text = corpus.readlines()
    text = list(map(str.strip, text))
    text_string = ' '.join(text)
    print("Tokenizing...")
    tokens = nltk.word_tokenize(text_string)

    # Function to create a dataframe with counts and probabilities
    def create_count_df(list_to_count):
        list_with_counts = collections.Counter(list_to_count)
        df = pd.DataFrame()
        df['word'] = list_with_counts.keys()
        df['count'] = list_with_counts.values()
        df['prob'] = df['count'] / sum(df['count'])
        return df

    # Create the list of unigrams with the count and normalized probability
    print("Creating the list of unigrams...")
    unigram_df = create_count_df(tokens)
    print("Creating the list of skipgrams...")
    skipgram_list = list(skipgrams(tokens, 2, 2))
    skipgram_df = create_count_df(skipgram_list)
    print("# tokens: ", len(tokens))
    print("# unigrams: ", unigram_df.shape[0])
    print("# skipgrams: ", skipgram_df.shape[0])

    # For each pair of words calculate the PMI and create a data frame
    print("Calculating PMI values for each skipgram...")
    skipgram_df[['word1', 'word2']] = skipgram_df['word'].apply(pd.Series)
    unigram_df = unigram_df.set_index('word')
    skipgram_df['prob1'] = skipgram_df['word1'].map(unigram_df['prob'].get)
    skipgram_df['prob2'] = skipgram_df['word2'].map(unigram_df['prob'].get)
    skipgram_df['pmi'] = np.log(skipgram_df['prob'] /
                                (skipgram_df['prob1'] * skipgram_df['prob2']))
    skipgram_df = skipgram_df[['word1', 'word2', 'pmi']]

    # Pivot the data frame into a sparse matrix, and convert NaNs into 0s
    print("Converting into a matrix...")
    pmi_matrix = skipgram_df.pivot(index='word1', columns='word2', values='pmi')
    pmi_matrix = pmi_matrix.fillna(0)

    # Apply SVD to reduce the size of the matrix to get word vectors
    print("Extracting word vectors...")
    U, S, V = scipy.sparse.linalg.svds(pmi_matrix, k=int(sys.argv[2]))
    word_list = unigram_df.index.get_values()

    # Save the model
    print("Saving model...")
    word_list_name = '_'.join([sys.argv[3], 'wordlist.p'])
    vectors_name = '_'.join([sys.argv[3], 'vectors.p'])
    output_word_list = open(word_list_name, 'wb')
    pickle.dump(word_list, output_word_list)
    output_word_list.close()
    output_vectors = open(vectors_name, 'wb')
    pickle.dump(U, output_vectors)
    output_vectors.close()

    # Print out overall statistics of the run
    end = time.time()
    print("Running time: ", str(round(end - start, 1)), "seconds")
    return
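# The `pmi` column above implements pmi(w1, w2) = log(p(w1, w2) / (p(w1) * p(w2))).
# A toy check with made-up probabilities:
import numpy as np

p_joint, p1, p2 = 0.02, 0.1, 0.1
print(np.log(p_joint / (p1 * p2)))  # log(2) ~= 0.693: the pair co-occurs twice as often as chance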
def get_char_skip_grams(text, n, k):
    text = ' '.join(text)
    chars = list(text)
    return set(''.join(t) for t in skipgrams(chars, n, k))
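# A quick illustration of the character-level variant above:
print(get_char_skip_grams(["ab", "cd"], 2, 1))
# {'ab', 'a ', 'b ', 'bc', ' c', ' d', 'cd'} (set order varies)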
from nltk import skipgrams
import jieba
import csv
import re
import gc

# Strip Latin characters and digits, then segment the Chinese text with jieba.
with open('D:/wiki_texts_data.txt', 'r', encoding='utf-8') as f:
    sent = re.sub("[A-Za-z0-9]", "", f.read())
sent_list = list(jieba.cut(sent))

# Write one CSV per (n, k) pair; n grows 2, 4, ..., 14 while k runs 0..6,
# and the outer loop repeats the sweep with a fresh file index each time.
j = 0
for i in range(6):
    c = 2
    for k in range(7):
        with open(str(j) + '.csv', 'w', encoding="utf_8_sig", newline='') as csvfile:
            writer = csv.writer(csvfile)
            skipsent = list(skipgrams(sent_list, c, k))
            writer.writerows(zip(skipsent))
        del skipsent
        gc.collect()
        j = j + 1
        c = c + 2
def skip_analyzer(self, doc):
    tokens = super().build_analyzer()(doc)
    if self.n <= 1:
        return nltk.ngrams(tokens, n=self.n)
    return nltk.skipgrams(tokens, self.n, self.k)
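# The method above reads like part of a scikit-learn CountVectorizer subclass;
# a minimal sketch of how it might be wired up (the class name and constructor
# are assumptions, not the original author's code):
import nltk
from sklearn.feature_extraction.text import CountVectorizer


class SkipgramVectorizer(CountVectorizer):
    def __init__(self, n=2, k=2):
        super().__init__()
        self.n = n
        self.k = k

    def build_analyzer(self):
        return self.skip_analyzer

    def skip_analyzer(self, doc):
        tokens = super().build_analyzer()(doc)
        if self.n <= 1:
            return nltk.ngrams(tokens, n=self.n)
        return nltk.skipgrams(tokens, self.n, self.k)


X = SkipgramVectorizer(n=2, k=2).fit_transform(["the quick brown fox jumps"])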
from collections import Counter
from os import listdir
from os.path import isfile, join

import networkx as nx
import nltk

# `data_path` and STOP_WORDS are defined earlier in the source script.
freq_path = "data/processed/freqs"
lyrics_files = [f for f in listdir(data_path) if isfile(join(data_path, f))]
for lyrics_file in lyrics_files:
    with open(join(data_path, lyrics_file), "r") as infile:
        text = infile.read().strip().split()
    wfreqs = Counter(text)
    wfreqs = {
        k: v
        for k, v in sorted(wfreqs.items(), key=lambda item: item[1], reverse=True)
        if v > 2 and k not in STOP_WORDS
    }
    # Build a co-occurrence graph from skip-bigram counts and rank words with PageRank.
    skipgrams = Counter(list(nltk.skipgrams(text, 2, 3)))
    skipgrams = {
        k: v
        for k, v in sorted(skipgrams.items(), key=lambda item: item[1], reverse=True)
        if v > 2
    }
    text_graph = nx.Graph()
    for k, v in skipgrams.items():
        text_graph.add_edge(k[0], k[1], weight=v)
    pr = nx.pagerank(text_graph, weight='weight')
    pr = {
        k: v
        for k, v in sorted(pr.items(), key=lambda item: item[1], reverse=True)
        if k not in STOP_WORDS
    }
from os import listdir
from os.path import isfile, join

import networkx as nx
from bounter import bounter
from nltk import skipgrams

in_path = "data/seedonly"
txts = [f for f in listdir(in_path) if isfile(join(in_path, f))]
with open("data/external/stopwords.txt", "r") as f:
    stopwords = set(f.read().strip().split("\n"))

# Count order-normalized skip-bigram pairs with a bounded-memory counter.
counts = bounter(size_mb=1024)
for txt in txts:
    with open(join(in_path, txt), "r") as f:
        text = f.read().split()
    text = [wd for wd in text if wd not in stopwords]
    skips = list(skipgrams(text, 2, 5))
    skips = [sorted(t) for t in skips]
    skips = ['@'.join(t) for t in skips]
    counts.update(skips)

# Keep only frequent pairs as weighted edges (add_edge creates missing nodes).
G = nx.Graph()
for skip, freq in counts.iteritems():
    if freq > 50:
        try:
            source, target = skip.split("@")
            G.add_edge(source, target, weight=freq)
        except ValueError:  # assumed handler; the original snippet is truncated here
            continue