def stem_metn(request):
    soz_class = NameForm
    cumle_class = TextForm
    morf_class = SozForm
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    k = request.POST.get('metn', '')
    alqo = request.POST.get('alqo', '')
    txt = k
    if alqo == 'Bizim Alqoritm':
        txt = metn_oxu(k)
    elif alqo == 'Porter Alqoritmi':
        txt = porter.stem(txt)
    elif alqo == 'Lancaster Alqoritmi':
        txt = lancaster.stem(txt)
    elif alqo == 'WordNet Alqoritmi':
        wordnet_lemmatizer = WordNetLemmatizer()
        # WordNetLemmatizer has no stem() method; its API is lemmatize()
        txt = metn_oxu(wordnet_lemmatizer.lemmatize(k))
    return render(request, 'metn.html', {
        'form': soz_class,
        'cumle': cumle_class,
        'morf': morf_class,
        'txt': txt
    })
def data_prepare(language_dict, ru=False, en=False, es=False):
    if ru:
        stop_words = set(stopwords.words('russian'))
        lemmatizer = pymorphy2.MorphAnalyzer()
    elif en:
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
    elif es:
        stop_words = set(stopwords.words('spanish'))
        lemmatizer = nltk.stem.SnowballStemmer('spanish')
    dict_prepared = []
    for text in tqdm(language_dict):
        text = re.sub(r'[^\w\s]', '', text.lower())
        text = re.sub(r'[0-9]', '', text)
        word_tokens = word_tokenize(text)
        word_tokens = [w for w in word_tokens if w not in stop_words]
        if ru:
            word_tokens = [lemmatizer.parse(w)[0].normal_form for w in word_tokens]
        elif en:
            word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens]
        elif es:
            word_tokens = [lemmatizer.stem(w) for w in word_tokens]
        word_tokens = [w for w in word_tokens if w not in stop_words]
        filtered_text = ' '.join(word_tokens)
        dict_prepared.append(filtered_text)
    return dict_prepared
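A minimal, hedged usage sketch for data_prepare above, assuming the NLTK stopwords/punkt resources are downloaded and the snippet's own imports (nltk, re, tqdm, word_tokenize, WordNetLemmatizer) are in scope; the expected output is only illustrative:

english_docs = ["The cats are running quickly!", "Dogs ran 2 times."]
cleaned = data_prepare(english_docs, en=True)
# e.g. ['cat running quickly', 'dog ran time'] -- WordNetLemmatizer defaults to
# treating every token as a noun, so verb forms such as "running" are left as-is.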
def stemming_word_with_WordNet(word):
    print("actual word " + word)
    word_net = WordNetLemmatizer()
    """
    WordNetLemmatizer.lemmatize
    arg-1 : word
    arg-2 : part-of-speech tag (noun, verb, adjective, etc.), e.g. pos="v"
    """
    # WordNetLemmatizer exposes lemmatize(), not stem()
    print("stemmed word " + word_net.lemmatize(word))
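A short illustration of the pos argument mentioned in the docstring above; NLTK's WordNetLemmatizer treats words as nouns unless told otherwise:

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))           # 'running' (default pos is 'n')
print(lemmatizer.lemmatize("running", pos="v"))  # 'run'
print(lemmatizer.lemmatize("better", pos="a"))   # 'good'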
def stem_cluster(data, mode=10, length_at_least=3):
    global stemmer
    # load default stemmer (nltk lemmatizer)
    if stemmer is None:
        try:
            # import if corpus exists
            from nltk.stem import WordNetLemmatizer
        except ImportError:
            # download corpora if it does not exist
            import nltk
            if not nltk.download('wordnet'):
                raise Exception('Error in downloading wordnet. '
                                'Please make sure you are connected to the network, '
                                'or try downloading manually.')
            from nltk.stem import WordNetLemmatizer
        # cache the default stemmer
        stemmer = WordNetLemmatizer()
        # port the lemmatizer as the stemmer
        stemmer.stem = stemmer.lemmatize

    from algoutils import flatten, split
    from collections import defaultdict

    # split data into words
    words = flatten(split(data, ' '))

    # collect frequency of individual words
    frequency = defaultdict(int)
    for word in words:
        if len(word) >= length_at_least:
            frequency[word] += 1

    # filter words by frequency
    # (the original Python 2 tuple-unpacking lambda is not valid Python 3)
    words = [word for word, freq in frequency.items() if freq >= mode]

    # trim stems
    stem_map = defaultdict(list)
    stem = stemmer.stem
    for word in words:
        stem_map[stem(word)].append(word)

    # only return the representative,
    # i.e. the word with the least length
    return [min(rep, key=len) for rep in stem_map.values()]
class Stemmer:
    stemmer = None
    mode = None
    join = True

    def __init__(self, mode='Porter', join=True):
        if mode == 'Porter':
            from nltk.stem import PorterStemmer
            self.stemmer = PorterStemmer()
        elif mode == 'Lancaster':
            from nltk.stem import LancasterStemmer
            self.stemmer = LancasterStemmer()
        elif mode == 'Lemmatize':
            from nltk.stem import WordNetLemmatizer
            self.stemmer = WordNetLemmatizer()
        elif mode == 'Snowball':
            raise Exception("TODO")
        elif mode == 'Regexp':
            raise Exception("TODO")
        self.mode = mode
        self.join = join

    def __str__(self):
        return 'NLTK Stemmer using ' + self.mode + ' Stemming'

    # data is a list of strings or a list of lists of strings
    # returns either a list of words or a joined list
    def fitTransform(self, data):
        return self.transform(data)

    # data is a list of lists of words
    # returns either a list of words or a joined list
    def transform(self, data):
        # for each list of words in the data list, lemmatize/stem each word
        if self.mode == "Lemmatize":
            result = [[self.stemmer.lemmatize(word) for word in doc]
                      for doc in data]
        else:
            result = [[self.stemmer.stem(word) for word in doc]
                      for doc in data]
        # if necessary, join the words in each list of words in the result list
        if self.join:
            result = [' '.join(doc) for doc in result]
        return result
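A brief, hedged usage sketch for the Stemmer wrapper above; the commented outputs are typical Porter/WordNet results, shown for illustration only:

docs = [["the", "cats", "were", "running"],
        ["studies", "showed", "improvement"]]

porter = Stemmer(mode='Porter', join=True)
print(porter.transform(docs))   # e.g. ['the cat were run', 'studi show improv']

lemma = Stemmer(mode='Lemmatize', join=False)
print(lemma.transform(docs))    # e.g. [['the', 'cat', 'were', 'running'],
                                #       ['study', 'showed', 'improvement']]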
def _tokenize(doc, filter_stopwords=True, normalize='lemma'):
    import nltk.corpus
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    from string import punctuation

    # use NLTK's default set of english stop words
    stops_list = nltk.corpus.stopwords.words('english')

    if normalize == 'lemma':
        # lemmatize with WordNet
        normalizer = WordNetLemmatizer()
    elif normalize == 'stem':
        # stem with Porter
        normalizer = PorterStemmer()

    # tokenize the document into sentences with NLTK default
    sents = sent_tokenize(doc)

    # tokenize each sentence into words with NLTK default
    tokenized_sents = [wordpunct_tokenize(sent) for sent in sents]

    # filter out "bad" words, normalize good ones
    normalized_sents = []
    for tokenized_sent in tokenized_sents:
        good_words = [word for word in tokenized_sent
                      # filter out too-long words
                      if len(word) < 25
                      # filter out bare punctuation
                      if word not in list(punctuation)]
        if filter_stopwords is True:
            good_words = [word for word in good_words
                          # filter out stop words
                          if word not in stops_list]
        if normalize == 'lemma':
            normalized_sents.append(
                [normalizer.lemmatize(word, 'v') for word in good_words])
        elif normalize == 'stem':
            normalized_sents.append(
                [normalizer.stem(word) for word in good_words])
        else:
            normalized_sents.append([word for word in good_words])
    return normalized_sents
def text_normalize(text, method='lemmas'):
    """
    Parameters
    ----------
    text: list
        List or pandas column of texts as strings
    method: str, {'lemmas', 'stems'}, default 'lemmas'
        Normalization method used on the text.

    Returns
    -------
    normalized_text: list
        List of lists with lemmas or stems as strings

    Examples
    --------
    Normalize text with lemmas

    >>> dickens = ["It was the best of times.", "it was the worst of times!"]
    >>> normalized_text = px.text_normalize(dickens, method='lemmas')
    >>> normalized_text
    [['it', 'be', 'the', 'best', 'of', 'time'],
     ['it', 'be', 'the', 'worst', 'of', 'time']]
    """
    if method == 'lemmas':
        normalizer = WordNetLemmatizer()
    if method == 'stems':
        normalizer = PorterStemmer()
    temp = []
    for i in tqdm(range(len(text))):
        words = word_tokenize(text[i])
        words = [word.lower() for word in words if word.isalpha()]
        if method == 'lemmas':
            temp.append([normalizer.lemmatize(w, pos='v') for w in words])
        if method == 'stems':
            temp.append([normalizer.stem(w) for w in words])
    return temp
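For symmetry with the docstring's lemma example, a hedged sketch of the 'stems' branch of text_normalize; Porter stemming clips words aggressively, so 'was' becomes 'wa':

dickens = ["It was the best of times.", "it was the worst of times!"]
stemmed = text_normalize(dickens, method='stems')
# e.g. [['it', 'wa', 'the', 'best', 'of', 'time'],
#       ['it', 'wa', 'the', 'worst', 'of', 'time']]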
class CrazyTokenizer(object): """ Tokenizer with Reddit- and Twitter-specific options Parameters ---------- lowercase : bool, optional If True, lowercase all tokens. Defaults to True. keepcaps: bool, optional If True, keep ALL CAPS WORDS uppercased. Defaults to False. normalize: int or bool, optional If not False, perform normalization of repeated charachers ("awesoooooome" -> "awesooome"). The value of parameter determines the number of occurences to keep. Defaults to 3. ignore_quotes: bool, optional If True, ignore tokens contained within double quotes. Defaults to False. ignore_reddit_quotes: bool, optional If True, remove quotes from the Reddit comments. Defaults to False. ignore_stopwords: str, list, or boolean, optional Whether to ignore stopwords - str: language to get a list of stopwords for from NLTK package - list: list of stopwords to remove - True: use built-in list of the english stop words - False: keep all tokens Defaults to False stem: {False, 'stem', 'lemm'}, optional Whether to perform word stemming - False: do not perform word stemming - 'stem': use PorterStemmer from NLTK package - 'lemm': use WordNetLemmatizer from NLTK package remove_punct: bool, optional If True, remove punctuation tokens. Defaults to True. remove_breaks: bool, optional If True, remove linebreak tokens. Defaults to True. decontract: bool, optional If True, attempt to expand certain contractions. Defaults to False. Example: "'ll" -> " will" numbers, subreddits, reddit_usernames, emails: False or str, optional Replacement of the different types of tokens - False: leaves these tokens intact - str: replacement token - '': removes all occurrences of these tokens twitter_handles: False, 'realname' or str, optional Processing of twitter handles - False: do nothing - str: replacement token - 'realname': replace with the real screen name of Twitter account - 'split': split handles using Viterbi algorithm Example: "#vladimirputinisthebest" -> "vladimir putin is the best" hashtags: False or str, optional Processing of hashtags - False: do nothing - str: replacement token - 'split': split hashtags according using Viterbi algorithm urls: False or str, optional Replacement of parsed URLs - False: leave URL intact - str: replacement token - dict: replace all URLs stored in keys with the corresponding values - '': removes all occurrences of these tokens - 'domain': extract domain ("http://cnn.com" -> "cnn") - 'domain_unwrap_fast': extract domain after unwraping links for a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com) - 'domain_unwrap': extract domain after unwraping all links - 'title': extract and tokenize title of each link after unwraping it Defaults to False. extra_patterns: None or list of tuples, optional Replacement of any user-supplied extra patterns. Tuples must have the following form: (name, re_pattern, replacement_token): - name (str): name of the pattern - re_pattern (_sre.SRE_Pattern): compiled re pattern - replacement_token (str): replacement token Defaults to None keep_untokenized: None or list, optional List of expressions to keep untokenized Example: ["New York", "Los Angeles", "San Francisco"] whitespaces_to_underscores: boolean, optional If True, replace all whitespace characters with underscores in the final tokens. Defaults to True. remove_nonunicode: boolean, optional If True, remove all non-unicode characters. Defaults to False. 
pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional Replace positive, negative, and neutral emojis with the special tokens - None: do not perform replacement - True: perform replacement of the default lists of emojis - list: list of emojis to replace print_url_warnings: bool, optional If True, print URL-related warnings. Defaults to False. latin_chars_fix: bool, optional Try applying this fix if you have a lot of \\xe2\\x80\\x99-like or U+1F601-like strings in your data. Defaults to False. ngrams: int, optional Add ngrams of tokens after tokenizing """ def __init__(self, lowercase=True, keepcaps=False, normalize=3, ignore_quotes=False, ignore_reddit_quotes=False, ignore_stopwords=False, stem=False, remove_punct=True, remove_breaks=True, decontract=False, twitter_handles=False, urls=False, hashtags=False, numbers=False, subreddits=False, reddit_usernames=False, emails=False, extra_patterns=None, keep_untokenized=None, whitespaces_to_underscores=True, remove_nonunicode=False, pos_emojis=None, neg_emojis=None, neutral_emojis=None, print_url_warnings=False, latin_chars_fix=False, ngrams=1): self.params = locals() self._nlp = English() self._merging_matcher = Matcher(self._nlp.vocab) self._matcher = Matcher(self._nlp.vocab) self._replacements = {} self._domains = {} self._realnames = {} self._stopwords = None alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check) hashtag_flag = self._nlp.vocab.add_flag(hashtag_check) twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check) self._merging_matcher.add('HASHTAG', None, [{ 'ORTH': '#' }, { 'IS_ASCII': True }]) self._merging_matcher.add('SUBREDDIT', None, [{ 'ORTH': '/r' }, { 'ORTH': '/' }, { alpha_digits_flag: True }], [{ 'ORTH': 'r' }, { 'ORTH': '/' }, { alpha_digits_flag: True }]) self._merging_matcher.add('REDDIT_USERNAME', None, [{ 'ORTH': '/u' }, { 'ORTH': '/' }, { alpha_digits_flag: True }], [{ 'ORTH': 'u' }, { 'ORTH': '/' }, { alpha_digits_flag: True }]) if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules): try: self._stopwords = stopwords.words(ignore_stopwords) except OSError: raise ValueError('Language {} was not found by NLTK'.format( ignore_stopwords)) elif ignore_stopwords is True: self._matcher.add('STOPWORDS', self._remove_token, [{ 'IS_STOP': True }]) elif isinstance(ignore_stopwords, list): self._stopwords = [word.lower() for word in ignore_stopwords] elif ignore_stopwords is not False: raise TypeError( 'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed' .format(type(ignore_stopwords))) if lowercase and (not keepcaps): self._matcher.add('LOWERCASE', self._lowercase, [{ 'IS_LOWER': False }]) elif lowercase and keepcaps: self._matcher.add('LOWERCASE', self._lowercase, [{ 'IS_LOWER': False, 'IS_UPPER': False }]) if remove_punct: self._matcher.add('PUNCTUATION', self._remove_token, [{ 'IS_PUNCT': True }]) if remove_breaks: def break_check(text): return bool(BREAKS_RE.fullmatch(text)) break_flag = self._nlp.vocab.add_flag(break_check) self._matcher.add('BREAK', self._remove_token, [{ break_flag: True }]) if normalize: def normalize_check(text): return bool(NORMALIZE_RE.search(text)) normalize_flag = self._nlp.vocab.add_flag(normalize_check) self._matcher.add('NORMALIZE', self._normalize, [{ normalize_flag: True }]) if numbers is not False: self._matcher.add('NUMBER', self._replace_token, [{ 'LIKE_NUM': True }]) self._replacements['NUMBER'] = numbers if urls is not False: if urls in [ 'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title' ]: self._urls = 
urls self._matcher.add('URL', self._process_url, [{ 'LIKE_URL': True }]) elif isinstance(urls, dict): self._domains = urls self._urls = 'domain_unwrap_fast' self._matcher.add('URL', self._process_url, [{ 'LIKE_URL': True }]) else: self._matcher.add('URL', self._replace_token, [{ 'LIKE_URL': True }]) self._replacements['URL'] = urls if emails is not False: self._matcher.add('EMAIL', self._replace_token, [{ 'LIKE_EMAIL': True }]) self._replacements['EMAIL'] = emails if reddit_usernames is not False: def reddit_username_check(text): return bool(REDDITORS_RE.fullmatch(text)) reddit_username_flag = self._nlp.vocab.add_flag( reddit_username_check) self._matcher.add('REDDIT_USERNAME', self._replace_token, [{ reddit_username_flag: True }]) self._replacements['REDDIT_USERNAME'] = reddit_usernames if subreddits is not False: def subreddit_check(text): return bool(SUBREDDITS_RE.fullmatch(text)) subreddit_flag = self._nlp.vocab.add_flag(subreddit_check) self._matcher.add('SUBREDDIT', self._replace_token, [{ subreddit_flag: True }]) self._replacements['SUBREDDIT'] = subreddits if twitter_handles is not False: self._matcher.add('TWITTER_HANDLE', self._handles_postprocess, [{ twitter_handle_flag: True }]) if hashtags is not False: self._matcher.add('HASHTAG', self._hashtag_postprocess, [{ hashtag_flag: True }]) if hashtags == 'split' or twitter_handles == 'split': file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt') with open(file) as f: self._words = f.read().split() self._wordcost = dict((k, log((i + 1) * log(len(self._words)))) for i, k in enumerate(self._words)) self._maxword = max(len(x) for x in self._words) if twitter_handles == 'realname': with open(os.path.join(DATA_PATH, 'realnames.json')) as f: self._realnames = json.load(f) if ignore_quotes: self._merging_matcher.add('QUOTE', None, [{ 'ORTH': '"' }, { 'OP': '*', 'IS_ASCII': True }, { 'ORTH': '"' }]) def doublequote_check(text): return bool(QUOTES_RE.fullmatch(text)) doublequote_flag = self._nlp.vocab.add_flag(doublequote_check) self._matcher.add('DOUBLE_QUOTES', self._remove_token, [{ doublequote_flag: True }]) if self._stopwords: def stopword_check(text): return bool(text.lower() in self._stopwords) stopword_flag = self._nlp.vocab.add_flag(stopword_check) self._matcher.add('STOPWORD', self._remove_token, [{ stopword_flag: True }]) if keep_untokenized is not None: if not isinstance(keep_untokenized, list): raise ValueError( "keep_untokenized has to be either None or a list") for i, phrase in enumerate(keep_untokenized): phrase_tokens = phrase.split(' ') rule = [] for token in phrase_tokens: rule.append({'LOWER': token.lower()}) self._merging_matcher.add('RULE_' + str(i), None, rule) if pos_emojis: if not isinstance(pos_emojis, list): pos_emojis = POS_EMOJIS pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis] self._matcher.add('HAPPY', self._replace_token, *pos_patterns) self._replacements['HAPPY'] = 'POS_EMOJI' if neg_emojis: if not isinstance(neg_emojis, list): neg_emojis = NEG_EMOJIS neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis] self._matcher.add('SAD', self._replace_token, *neg_patterns) self._replacements['SAD'] = 'NEG_EMOJI' if neutral_emojis: if not isinstance(neutral_emojis, list): neutral_emojis = NEUTRAL_EMOJIS neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis] self._matcher.add('NEUTRAL', self._replace_token, *neutral_patterns) self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI' if isinstance(extra_patterns, list): self._flags = {} for name, re_pattern, replacement_token in extra_patterns: def 
flag(text): return bool(re_pattern.match(text)) self._flags[name] = self._nlp.vocab.add_flag(flag) self._matcher.add(name, self._replace_token, [{ self._flags[name]: True }]) self._replacements[name] = replacement_token if stem and ('nltk' in sys.modules): if stem == 'stem': self._stemmer = PorterStemmer() elif stem == 'lemm': self._stemmer = WordNetLemmatizer() else: raise ValueError( 'Stemming method {} is not supported'.format(stem)) self._matcher.add('WORD_TO_STEM', self._stem_word, [{ 'IS_ALPHA': True }]) retokenize_flag = self._nlp.vocab.add_flag(retokenize_check) self._matcher.add('RETOKENIZE', self._retokenize, [{ retokenize_flag: True, 'IS_PUNCT': False, 'LIKE_URL': False, 'LIKE_EMAIL': False, 'LIKE_NUM': False, hashtag_flag: False, twitter_handle_flag: False }]) self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True) self._nlp.add_pipe(self._match_doc, name='match_doc', last=True) self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True) @staticmethod def _lowercase(__, doc, i, matches): # Lowercase tokens __, start, end = matches[i] span = doc[start:end] for tok in span: tok._.transformed_text = tok._.transformed_text.lower() def _stem_word(self, __, doc, i, matches): # Stem tokens __, start, end = matches[i] span = doc[start:end] for tok in span: if self.params['stem'] == 'stem': tok._.transformed_text = self._stemmer.stem( tok._.transformed_text) elif self.params['stem'] == 'lemm': tok._.transformed_text = self._stemmer.lemmatize( tok._.transformed_text) def _normalize(self, __, doc, i, matches): # Normalize repeating symbols __, start, end = matches[i] span = doc[start:end] for tok in span: tok._.transformed_text = NORMALIZE_RE.sub( r"\1" * self.params['normalize'], tok._.transformed_text) def _process_url(self, __, doc, i, matches): # Process found URLs __, start, end = matches[i] span = doc[start:end] for tok in span: found_urls = URLS_RE.findall(tok.text) if found_urls: if found_urls[0] in self._domains: tok._.transformed_text = self._domains[found_urls[0]] elif self._urls == 'domain': tok._.transformed_text = tldextract.extract( found_urls[0]).domain elif self._urls != 'title': if self._urls == 'domain_unwrap': domain = unshorten_url( found_urls[0], None, self.params['print_url_warnings']) else: domain = unshorten_url( found_urls[0], URL_SHORTENERS, self.params['print_url_warnings']) self._domains[found_urls[0]] = domain tok._.transformed_text = domain elif self._urls == 'title': domain = unshorten_url(found_urls[0], URL_SHORTENERS) if domain != 'twitter': title = get_url_title( found_urls[0], self.params['print_url_warnings']) title = self.tokenize(URLS_RE.sub('', title)) else: title = '' tok._.transformed_text = title self._domains[found_urls[0]] = title def _replace_token(self, __, doc, i, matches): # Replace tokens with something else match_id, start, end = matches[i] span = doc[start:end] replacement_token = self._replacements[doc.vocab.strings[match_id]] for tok in span: tok._.transformed_text = replacement_token @staticmethod def _remove_token(__, doc, i, matches): # Remove tokens __, start, end = matches[i] span = doc[start:end] for tok in span: tok._.transformed_text = '' def _retokenize(self, __, doc, i, matches): # Retokenize __, start, end = matches[i] span = doc[start:end] for tok in span: text = tok.text text = re.sub(r'([#@])', r' \1', text) text = re.sub(r'\s{2,}', ' ', text).strip() tok._.transformed_text = self.tokenize(text) def _infer_spaces(self, text): # Infer location of spaces in hashtags text = text.lower() text = 
re.sub(r'[^\w\s]', '', text) def best_match(i): # Find the best match for the first i characters # assuming costs has been built for the first (i-1) characters candidates = enumerate(reversed(cost[max(0, i - self._maxword):i])) return min( (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1) for k, c in candidates) cost = [0] for i in range(1, len(text) + 1): cur_cost, k = best_match(i) cost.append(cur_cost) out = [] i = len(text) while i > 0: cur_cost, k = best_match(i) assert cur_cost == cost[i] out.append(text[i - k:i]) i -= k return list(reversed(out)) def _handles_postprocess(self, __, doc, i, matches): # Process twitter handles __, start, end = matches[i] span = doc[start:end] for tok in span: if self.params['twitter_handles'] == 'realname': if tok.text in self._realnames: tok._.transformed_text = self._realnames[tok.text] else: handle = get_twitter_realname(tok.text) realname = self.tokenize(TWITTER_HANDLES_RE.sub( '', handle)) tok._.transformed_text = realname self._realnames[tok.text] = realname elif self.params['twitter_handles'] == 'split': poss = self._infer_spaces(tok._.transformed_text[1:]) if poss: tok._.transformed_text = poss else: tok._.transformed_text = self.params['twitter_handles'] def _hashtag_postprocess(self, __, doc, i, matches): # Process hashtags __, start, end = matches[i] span = doc[start:end] for tok in span: if self.params['hashtags'] == 'split': poss = self._infer_spaces(tok._.transformed_text[1:]) if poss: tok._.transformed_text = poss else: tok._.transformed_text = self.params['hashtags'] @staticmethod def _decontract(text): # Expand contractions for contraction, decontraction in DECONTRACTIONS.items(): text = re.sub(contraction, decontraction, text) return text def _preprocess_text(self, text): # Do some preprocessing text = re.sub("’", "'", text) if self.params['remove_nonunicode']: try: text = text.encode('utf-8').decode('unicode-escape') text = ''.join(filter(lambda x: x in string.printable, text)).strip() except UnicodeDecodeError: warnings.warn( '(UnicodeDecodeError while trying to remove non-unicode characters' ) if self.params['decontract']: text = self._decontract(text) text = html.unescape(text) if self.params['latin_chars_fix']: if EMOJIS_UTF_RE.findall(text): text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text) for utf_code, emoji in EMOJIS_UTF.items(): text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text) if EMOJIS_UNICODE_RE.findall(text): text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text) for utf_code, emoji in EMOJIS_UNICODE.items(): text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text) if LATIN_CHARS_RE.findall(text): for _hex, _char in LATIN_CHARS.items(): text = LATIN_CHARS_PATS[_hex].sub(_char, text) if self.params['ignore_reddit_quotes']: text = REDDIT_QUOTES_RE.sub(text, ' ') text = text.replace('.@', '. 
@') text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text) text = re.sub(r'\s{2,}', ' ', text) return text.strip() def _merge_doc(self, doc): # Perform merging for certain types of tokens matches = self._merging_matcher(doc) spans = [] for __, start, end in matches: spans.append(doc[start:end]) for span in spans: span.merge() for tok in doc: tok._.transformed_text = tok.text return doc def _match_doc(self, doc): # Perform all additional processing self._matcher(doc) return doc def _postproc_doc(self, doc): # Perform postprocessing doc._.tokens = [] for tok in doc: if isinstance(tok._.transformed_text, list): doc._.tokens.extend(tok._.transformed_text) elif tok._.transformed_text.strip() != '': if self.params['whitespaces_to_underscores']: tok._.transformed_text = "_".join( tok._.transformed_text.split()) doc._.tokens.append(tok._.transformed_text.strip()) return doc def tokenize(self, text): """ Tokenize document Parameters ---------- text : str Document to tokenize Returns ------- list List of tokens Examples -------- >>> from redditscore.tokenizer import CrazyTokenizer >>> tokenizer = CrazyTokenizer(splithashtags=True, hashtags=False) >>> tokenizer.tokenize("#makeamericagreatagain") ["make", "america", "great", "again"] """ if not isinstance(text, str): warnings.warn('Document {} is not a string'.format(text)) return [] text = self._preprocess_text(text) doc = self._nlp(text) tokens = doc._.tokens if self.params['ngrams'] > 1: if self.params['whitespaces_to_underscores']: tokens = word_ngrams(tokens, (1, self.params['ngrams']), separator='_') else: tokens = word_ngrams(tokens, (1, self.params['ngrams'])) return tokens
class CorpusIterator(six.Iterator): """Class to do tokenization, tagging, stemming, etc. and yield each document 1 at a time. Only ever loads one doc into memory""" def __init__(self, indir, n, stem=None, stop_words=True, tag=None, tag_pattern=None, punctuation=True, split_clauses=False, outdir=None): """Constructor Input: indir: path to directory of txt files n: order of n gram stem: {'snowball','porter','lemma',None} stemmer to use Defaults to None. stop_words: Boolean. include stopwords. Defaults to True tag: {'nltk',None}. POS tagger to use. Defaults to None tag_pattern: list of of tag patterns to allow in simplified form. Defaults to None. if tag_pattern = "default", use default tag pattern. punctuation: Boolean. include punctuation. Defaults to True split_clauses: Boolean. Split on clauses outdir: directory to write to. Defaults to indir/ngram_results """ self.indir = indir # check if directory is zip archive or directory and act accordingly if not zipfile.is_zipfile(indir): # list the files in the directory self.files = sorted([ os.path.join(indir, f) for f in os.listdir(indir) if os.path.splitext(f)[1] == ".txt" ]) # create directory for results if outdir is None: outdir = os.path.join(indir, "ngram_results") # check if directory exists, if not create direcotry if not os.path.exists(outdir): os.mkdir(outdir) # set zip_corpus to None self.zip_corpus = None else: # files is the namelist of the zip archive self.zip_corpus = zipfile.ZipFile(indir) self.files = self.zip_corpus.namelist() # create directory for results in the directory of the zip archive if outdir is None: # get the directory of the zip archive tmp = os.path.split(indir)[0] outdir = os.path.join(tmp, "ngram_results") # check if the directory exists, if not create the directory if not os.path.exists(outdir): os.mkdir(outdir) # assign option variables self.n = n self.stem = stem self.stop_words = stop_words self.tag = tag self.tag_pattern = tag_pattern self.punctuation = punctuation self.outdir = outdir # keep an index for the __next__() function self.index = 0 # class variable holding default tag patterns and dict for conversion # to universal tag set self.default_tagpatterns = set([ 'AN', 'NN', 'VN', 'VV', 'VP', 'NNN', 'AAN', 'ANN', 'NAN', 'NPN', 'VAN', 'VNN', 'VPN', 'ANV', 'NVV', 'VDN', 'VVV', 'VVP' ]) self.default_tagset = set(''.join(self.default_tagpatterns)) self.tagdict = keydefaultdict( lambda x: x, { 'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'NNPS': 'N', 'JJ': 'A', 'JJR': 'A', 'JJS': 'A', 'VBG': 'A', 'RB': 'A', 'DT': 'D', 'IN': 'P', 'TO': 'P', 'VB': 'V', 'VBD': 'V', 'VBN': 'V', 'VBP': 'V', 'VBZ': 'V', 'MD': 'V', 'RP': 'P' }) # class variable which contains english stop words as a set self.stop = set(stopwords.words('english')) # set up tagger if tag is not None if tag is not None: if tag == 'nltk': # create a named tuple which holds nltk.pos_tag_sents as # tag_sents NLTKTagger = namedtuple("NLTKTagger", ["tag_sents", "tag"]) self.tagger = NLTKTagger(nltk.pos_tag_sents, nltk.pos_tag) else: # raise a value error if an unsupproted tagger is included raise ValueError('Not an available tagger') # initialize stemmer if stem is not None if stem is not None: if stem == 'porter': self.stemmer = PorterStemmer() elif stem == 'snowball': self.stemmer = SnowballStemmer("english") elif stem == 'lemma': self.stemmer = WordNetLemmatizer() # add stem as another name for lemmatize self.stemmer.stem = stemmer.lemmatize else: # raise a value error if a wrong stemmer is chosen raise ValueError('Not an available stemmer') # set splitting on 
clauses self.split_clauses = split_clauses # current clauses self.curr_clauses = [] def __len__(self): """len function, number of documents""" return (len(self.files)) def __next__(self): """Next function for iterator""" if self.index >= len(self.files): raise StopIteration # if not splitting on clauses if not self.split_clauses: # get sentences from file sents = doc_sents(self.files[self.index], zipped=self.zip_corpus) #get ngrams of doc and yield ngrams = self.ngrams_from_sents(sents, self.n, self.stem, self.stop_words, self.tag, self.tag_pattern, self.punctuation) self.index += 1 return (ngrams) #if splitting on clauses use the clauses else: if len(self.curr_clauses) == 0: #get the sentences for the current clauses self.curr_clauses = doc_sents(self.files[self.index], zipped=self.zip_corpus, clauses=True) self.index += 1 #pop one clauses from self.curr_clauses sents = self.curr_clauses.pop() ngrams = self.ngrams_from_sents(sents, self.n, self.stem, self.stop_words, self.tag, self.tag_pattern, self.punctuation) return (ngrams) def __iter__(self): """Iterator, does tokenization,stemming,tagging,etc on a doc before returning it""" if not self.split_clauses: for i, fName in enumerate(sorted(self.files)): if i % 100 == 0: logging.info("Computing N-grams for %ith file %s" % (i, fName)) #get sentences from file sents = doc_sents(fName, zipped=self.zip_corpus) #get ngrams of doc and yield ngrams = self.ngrams_from_sents(sents, self.n, self.stem, self.stop_words, self.tag, self.tag_pattern, self.punctuation) yield (ngrams) else: for i, fName in enumerate(self.files): if i % 100 == 0: logging.info("Computing N-grams for %ith file %s" % (i, fName)) #get sentences for clauses clauses = doc_sents(fName, zipped=self.zip_corpus, clauses=True) for sents in clauses: ngrams = self.ngrams_from_sents(sents, self.n, self.stem, self.stop_words, self.tag, self.tag_pattern, self.punctuation) yield (ngrams) def custom_ngrams(self, words, n): """Faster n gram generation than nltk.ngrams Input: words: word tokenized sentence n: order of ngram Output: ngrams: list of ngrams """ ngrams = zip(*[words[i:] for i in range(n)]) return (ngrams) def word_tokenize(self, words): """Faster word tokenization than nltk.word_tokenize Input: words: a string to be tokenized Output: tokens: tokenized words """ tokens = re.findall(r"[a-z]+-?[a-z]+", words.lower(), flags=re.UNICODE | re.LOCALE) return (tokens) def ngrams_from_sents(self, sents, n, stem=None, stop_words=True, tag=None, tag_pattern=None, punctuation=True): """Gets the ngrams from a list of sentences Input: sents: list of sentences as strings n: order of n gram stem: {'snowball','porter','lemma',None} stemmer to use Defaults to None. stop_words: Boolean. include stopwords. Defaults to True tag: {'ap','nltk','stanford',None}. POS tagger to use. Defaults to None tag_pattern: list of of tag patterns to allow in simplified form. Defaults to None. if tag_pattern = "default", use default tag pattern. punctuation: Boolean. include punctuation. 
Defaults to True Output: ngrams: list of ngrams as "word1-word2" strings """ #tag sentences first if tag is not None: #tokenize the sentences tmp = [] for sent in sents: tmp.append([word.lower() for word in self.word_tokenize(sent)]) sents = tmp if tag == 'nltk': # tag words tags = self.tagger.tag_sents(sents) # extract the tags without the words tags = [[self.tagdict[tagWord[1]] for tagWord in tag[i]] for i in range(len(sents))] else: #raise a value error if an unsupproted tagger is included raise ValueError('Not an available tagger') #iterate through sentences and get ngrams ngrams = [] for i, words in enumerate(sents): if tag is None: #if tag is None then word tokenization hasn't happend words = self.word_tokenize(words) #stem words if stem is not None if stem is not None: words = [self.stemmer.stem(word) for word in words] #join tags and words if tag is not None if tag is not None: words = ['::'.join(tagWord) for tagWord in zip(words, tags[i])] #remove stop words if stop = False if not stop_words: words = [ word for word in words if not word.split("::")[0] in self.stop ] #remove punctuation if punctuation is false if not punctuation: pun = string.punctuation words = [ word for word in words if not word.split("::")[0] in pun ] #get n grams and add to ngrams list sent_grams = [ "_".join(gram) for gram in self.custom_ngrams(words, n) ] #if tag_pattern isn't None, go through sent_grams and only keep those #ngrams with the proper tag pattern if tag_pattern is not None: #assign default tag pattern if tag_pattern == 'default' if tag_pattern == 'default': tag_pattern = self.default_tagpatterns tmp = [] #maybe make this a list comprehension? for gram in sent_grams: #get tags separately tags_squash = [t.split("::")[1] for t in gram.split("_")] #check if the tag pattern is allowed if ''.join(tags_squash) in tag_pattern: tmp.append(gram) sent_grams = tmp ngrams.extend(sent_grams) return (ngrams)
# Parts of speech tagging
print("Processing data to tag parts of speech...")
pos_word_data = nltk.pos_tag(word_data)
print(pos_word_data)

# Stemming and Lemmatization

## Stemming with Porter Stemmer
print("Stemming with Porter Stemmer...")
porter_stemmer = PorterStemmer()
for w in word_data[:20]:
    print("Actual: %s Stem: %s" % (w, porter_stemmer.stem(w)))

## Stemming with Lancaster Stemmer
print("Stemming with Lancaster Stemmer...")
lancaster_stemmer = LancasterStemmer()
for w in word_data[:20]:
    print("Actual: %s Stem: %s" % (w, lancaster_stemmer.stem(w)))

## Stemming with Snowball Algorithm
print("Stemming with Snowball Stemmer...")
snowball_stemmer = SnowballStemmer("english")
for w in word_data[:20]:
    print("Actual: %s Stem: %s" % (w, snowball_stemmer.stem(w)))

## Lemmatization with WordNet
# WordNetLemmatizer has no stem() method; use lemmatize()
wordnet_lemmatizer = WordNetLemmatizer()
for w in word_data[:20]:
    print("Actual: %s Lemma: %s" % (w, wordnet_lemmatizer.lemmatize(w)))
class ConceptBasedILPSummarizer(LoadFile): """Implementation of the concept-based ILP model for summarization. The original algorithm was published and described in: * Dan Gillick and Benoit Favre, A Scalable Global Model for Summarization, *Proceedings of the NAACL HLT Workshop on Integer Linear Programming for Natural Language Processing*, pages 10–18, 2009. """ def __init__(self, input_directory, language): """ Args: input_directory (str): the directory from which text documents to be summarized are loaded. @type language: str """ self.input_directory = input_directory self.sentences = [] self.weights = {} self.c2s = defaultdict(set) self.concept_sets = defaultdict(frozenset) self.LANGUAGE = language # type: str self.stoplist = set(stopwords.words(self.LANGUAGE)) self.stemmer = WordNetLemmatizer() self.word_frequencies = defaultdict(int) self.w2s = defaultdict(set) def extract_ngrams2(self, concept_type='ngrams', n=2): """Extract the ngrams of words from the input sentences. Args: n (int): the number of words for ngrams, defaults to 2 """ for i, sentence in enumerate(self.sentences): untokenized_concepts = [] if concept_type == 'ngrams': ngrams = extract_ngrams2([sentence.untokenized_form], self.stemmer, self.LANGUAGE, n) pruned_list = prune_ngrams(ngrams, self.stoplist, n) elif concept_type == 'phrase': pruned_list = self.sentences[i].phrases for concept in pruned_list: wrds = unstem_ngram(concept, sentence) untokenized_concepts.append(" ".join(wrds)) self.sentences[i].concepts = pruned_list self.sentences[i].untokenized_concepts = untokenized_concepts #print(untokenized_concepts) if len(self.sentences[i].concepts) != len( self.sentences[i].untokenized_concepts): raise BaseException( "unexpected length difference between concepts and untokenized_concepts" ) def extract_ngrams(self, n=2): """Extract the ngrams of words from the input sentences. Args: n (int): the number of words for ngrams, defaults to 2 """ for i, sentence in enumerate(self.sentences): # for each ngram of words for j in range(len(sentence.tokens) - (n - 1)): # initialize ngram container ngram = [] # for each token of the ngram for k in range(j, j + n): ngram.append(sentence.tokens[k].lower()) # do not consider ngrams containing punctuation marks marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)] if len(marks) > 0: continue # do not consider ngrams composed of only stopwords stops = [t for t in ngram if t in self.stoplist] if len(stops) == len(ngram): continue # stem the ngram ngram = [self.stemmer.stem(t) for t in ngram] #ngram = [self.stemmer.lemmatize(t) for t in ngram] # add the ngram to the concepts self.sentences[i].concepts.append(' '.join(ngram)) def compute_document_frequency(self): """Compute the document frequency of each concept. """ for i in range(len(self.sentences)): # for each concept for concept in self.sentences[i].concepts: # add the document id to the concept weight container if concept not in self.weights: self.weights[concept] = set([]) self.weights[concept].add(self.sentences[i].doc_id) # loop over the concepts and compute the document frequency for concept in self.weights: self.weights[concept] = len(self.weights[concept]) def compute_word_frequency(self): """Compute the frequency of each word in the set of documents. 
""" for i, sentence in enumerate(self.sentences): for token in sentence.tokens: t = token.lower() if not re.search('[a-zA-Z0-9]', t) or t in self.stoplist: continue #t = self.stemmer.stem(t) t = self.stemmer.lemmatize(t) self.w2s[t].add(i) self.word_frequencies[t] += 1 def prune_sentences(self, mininum_sentence_length=5, remove_citations=True, remove_redundancy=True, imp_list=None): """Prune the sentences. Remove the sentences that are shorter than a given length, redundant sentences and citations from entering the summary. Args: mininum_sentence_length (int): the minimum number of words for a sentence to enter the summary, defaults to 5 remove_citations (bool): indicates that citations are pruned, defaults to True remove_redundancy (bool): indicates that redundant sentences are pruned, defaults to True """ if imp_list is None: imp_list = [] retained_sentences = [] # loop over the sentences for i, sentence in enumerate(self.sentences): if imp_list: if imp_list[i] == 0: continue # prune short sentences if sentence.length < mininum_sentence_length: continue # prune citations first_token, last_token = sentence.tokens[0], sentence.tokens[-1] if remove_citations and \ (first_token == u"``" or first_token == u'"' \ or last_token == u"''" or first_token == u'"' \ or last_token== u"'" or first_token==u"'") \ or last_token == u'"': continue # prune ___ said citations # if remove_citations and \ # (sentence.tokens[0]==u"``" or sentence.tokens[0]==u'"') and \ # re.search('(?i)(''|") \w{,30} (said|reported|told)\.$', # sentence.untokenized_form): # continue # prune identical and almost identical sentences if remove_redundancy: is_redundant = False for prev_sentence in retained_sentences: if sentence.tokens == prev_sentence.tokens: is_redundant = True break if is_redundant: continue # otherwise add the sentence to the pruned sentence container retained_sentences.append(sentence) # from all concepts that are going to be pruned, keep only those that also appear elsewhere retained_concepts = [ concept for s in retained_sentences for concept in s.concepts ] for sentence in set(self.sentences).difference(retained_sentences): for concept in sentence.concepts: if concept not in retained_concepts \ and self.weights.has_key(concept): del self.weights[concept] log.debug("keeping %s unique sentences of %s sentences" % (len(retained_sentences), len(self.sentences))) self.sentences = retained_sentences def prune_concepts(self, method="threshold", value=3, rejected_list=None): """Prune the concepts for efficient summarization. Args: method (str): the method for pruning concepts that can be whether by using a minimal value for concept scores (threshold) or using the top-N highest scoring concepts (top-n), defaults to threshold. value (int): the value used for pruning concepts, defaults to 3. 
""" if rejected_list is None: rejected_list = [] if method == 'stopwords': concepts = self.weights.keys() for concept in concepts: pruned_list = prune_ngrams(concept, self.stoplist, 1) if not pruned_list: #print concept, self.weights[concept] del self.weights[concept] if method == "list": concepts = self.weights.keys() for concept in concepts: if concept in rejected_list: #print concept, self.weights[concept] del self.weights[concept] # 'threshold' pruning method if method == "threshold": # iterates over the concept weights concepts = self.weights.keys() for concept in concepts: if self.weights[concept] < value: del self.weights[concept] # 'top-n' pruning method elif method == "top-n": # sort concepts by scores sorted_concepts = sorted(self.weights, key=lambda x: self.weights[x], reverse=True) # iterates over the concept weights concepts = self.weights.keys() for concept in concepts: if concept not in sorted_concepts[:value]: del self.weights[concept] # iterates over the sentences for i in range(len(self.sentences)): # current sentence concepts concepts = self.sentences[i].concepts # prune concepts self.sentences[i].concepts = [ c for c in concepts if c in self.weights ] def compute_c2s(self): """Compute the inverted concept to sentences dictionary. """ for i, sentence in enumerate(self.sentences): for concept in sentence.concepts: self.c2s[concept].add(i) def compute_concept_sets(self): """Compute the concept sets for each sentence.""" for i, sentence in enumerate(self.sentences): for concept in sentence.concepts: self.concept_sets[i] |= {concept} def greedy_approximation(self, summary_size=100): """Greedy approximation of the ILP model. Args: summary_size (int): the maximum size in words of the summary, defaults to 100. Returns: (value, set) tuple (int, list): the value of the approximated objective function and the set of selected sentences as a tuple. 
""" # initialize the inverted c2s dictionary if not already created if not self.c2s: self.compute_c2s() # initialize weights weights = {} # initialize the score of the best singleton best_singleton_score = 0 # compute indices of our sentences sentences = range(len(self.sentences)) # compute initial weights and fill the reverse index # while keeping track of the best singleton solution for i, sentence in enumerate(self.sentences): weights[i] = sum(self.weights[c] for c in set(sentence.concepts)) if sentence.length <= summary_size\ and weights[i] > best_singleton_score: best_singleton_score = weights[i] best_singleton = i # initialize the selected solution properties sel_subset, sel_concepts, sel_length, sel_score = set(), set(), 0, 0 # greedily select a sentence while True: ################################################################### # RETRIEVE THE BEST SENTENCE ################################################################### # sort the sentences by gain and reverse length sort_sent = sorted(((weights[i] / float(self.sentences[i].length), -self.sentences[i].length, i) for i in sentences), reverse=True) # select the first sentence that fits in the length limit for sentence_gain, rev_length, sentence_index in sort_sent: if sel_length - rev_length <= summary_size: break # if we don't find a sentence, break out of the main while loop else: break # if the gain is null, break out of the main while loop if not weights[sentence_index]: break # update the selected subset properties sel_subset.add(sentence_index) sel_score += weights[sentence_index] sel_length -= rev_length # update sentence weights with the reverse index for concept in set(self.sentences[sentence_index].concepts): if concept not in sel_concepts: for sentence in self.c2s[concept]: weights[sentence] -= self.weights[concept] # update the last selected subset property sel_concepts.update(self.sentences[sentence_index].concepts) # check if a singleton has a better score than our greedy solution if best_singleton_score > sel_score: return best_singleton_score, set([best_singleton]) # returns the (objective function value, solution) tuple return sel_score, sel_subset def tabu_search(self, summary_size=100, memory_size=10, iterations=100, mutation_size=2, mutation_group=True): """Greedy approximation of the ILP model with a tabu search meta-heuristic. Args: summary_size (int): the maximum size in words of the summary, defaults to 100. memory_size (int): the maximum size of the pool of sentences to ban at a given time, defaults at 5. iterations (int): the number of iterations to run, defaults at 30. mutation_size (int): number of sentences to unselect and add to the tabu list at each iteration. mutation_group (boolean): flag to consider the mutations as a group: we'll check sentence combinations in the tabu list, not sentences alone. Returns: (value, set) tuple (int, list): the value of the approximated objective function and the set of selected sentences as a tuple. 
""" # compute concept to sentences and concept sets for each sentence if not self.c2s: self.compute_c2s() if not self.concept_sets: self.compute_concept_sets() # initialize weights weights = {} # initialize the score of the best singleton best_singleton_score = 0 # compute initial weights and fill the reverse index # while keeping track of the best singleton solution for i, sentence in enumerate(self.sentences): weights[i] = sum(self.weights[c] for c in set(sentence.concepts)) if sentence.length <= summary_size\ and weights[i] > best_singleton_score: best_singleton_score = weights[i] best_singleton = i best_subset, best_score = None, 0 state = State() for i in xrange(iterations): queue = deque([], memory_size) # greedily select sentences state = self.select_sentences(summary_size, weights, state, queue, mutation_group) if state.score > best_score: best_subset = state.subset.copy() best_score = state.score to_tabu = set(random.sample(state.subset, mutation_size)) state = self.unselect_sentences(weights, state, to_tabu) queue.extend(to_tabu) # check if a singleton has a better score than our greedy solution if best_singleton_score > best_score: return best_singleton_score, set([best_singleton]) # returns the (objective function value, solution) tuple return best_score, best_subset def select_sentences(self, summary_size, weights, state, tabu_set, mutation_group): """Greedy sentence selector. Args: summary_size (int): the maximum size in words of the summary, defaults to 100. weights (dictionary): the sentence weights dictionary. This dictionnary is updated during this method call (in-place). state (State): the state of the tabu search from which to start selecting sentences. tabu_set (iterable): set of sentences that are tabu: this selector will not consider them. mutation_group (boolean): flag to consider the mutations as a group: we'll check sentence combinations in the tabu list, not sentences alone. Returns: state (State): the new state of the search. Also note that weights is modified in-place. """ # greedily select a sentence while respecting the tabu while True: ################################################################### # RETRIEVE THE BEST SENTENCE ################################################################### # sort the sentences by gain and reverse length sort_sent = sorted( ((weights[i] / float(self.sentences[i].length), -self.sentences[i].length, i) for i in range(len(self.sentences)) if self.sentences[i].length + state.length <= summary_size), reverse=True) # select the first sentence that fits in the length limit for sentence_gain, rev_length, sentence_index in sort_sent: if mutation_group: subset = state.subset | {sentence_index} for tabu in tabu_set: if tabu <= subset: break else: break else: if sentence_index not in tabu_set: break # if we don't find a sentence, break out of the main while loop else: break # if the gain is null, break out of the main while loop if not weights[sentence_index]: break # update state state.subset |= {sentence_index} state.concepts.update(self.concept_sets[sentence_index]) state.length -= rev_length state.score += weights[sentence_index] # update sentence weights with the reverse index for concept in set(self.concept_sets[sentence_index]): if state.concepts[concept] == 1: for sentence in self.c2s[concept]: weights[sentence] -= self.weights[concept] return state def unselect_sentences(self, weights, state, to_remove): """Sentence ``un-selector'' (reverse operation of the select_sentences method). 
Args: weights (dictionary): the sentence weights dictionary. This dictionnary is updated during this method call (in-place). state (State): the state of the tabu search from which to start un-selecting sentences. to_remove (iterable): set of sentences to unselect. Returns: state (State): the new state of the search. Also note that weights is modified in-place. """ # remove the sentence indices from the solution subset state.subset -= to_remove for sentence_index in to_remove: # update state state.concepts.subtract(self.concept_sets[sentence_index]) state.length -= self.sentences[sentence_index].length # update sentence weights with the reverse index for concept in set(self.concept_sets[sentence_index]): if not state.concepts[concept]: for sentence in self.c2s[concept]: weights[sentence] += self.weights[concept] state.score -= weights[sentence_index] return state def solve_ilp_problem(self, summary_size=100, units="WORDS", solver='glpk', excluded_solutions=None, unique=False): """Solve the ILP formulation of the concept-based model. :param summary_size: the maximum size in words of the summary, defaults to 100. :param units: defaults to "WORDS" :param solver: the solver used, defaults to glpk :param excluded_solutions: (list of list): a list of subsets of sentences that are to be excluded, defaults to [] :param unique: (bool): modify the model so that it produces only one optimal solution, defaults to False :return: (value, set) tuple (int, list): the value of the objective function and the set of selected sentences as a tuple. """ if excluded_solutions is None: excluded_solutions = [] # initialize container shortcuts concepts = self.weights.keys() w = self.weights L = summary_size C = len(concepts) S = len(self.sentences) if not self.word_frequencies: self.compute_word_frequency() tokens = self.word_frequencies.keys() f = self.word_frequencies T = len(tokens) # HACK Sort keys concepts = sorted(self.weights, key=self.weights.get, reverse=True) # formulation of the ILP problem prob = pulp.LpProblem(self.input_directory, pulp.LpMaximize) # initialize the concepts binary variables c = pulp.LpVariable.dicts(name='c', indexs=range(C), lowBound=0, upBound=1, cat='Integer') # initialize the sentences binary variables s = pulp.LpVariable.dicts(name='s', indexs=range(S), lowBound=0, upBound=1, cat='Integer') # initialize the word binary variables t = pulp.LpVariable.dicts(name='t', indexs=range(T), lowBound=0, upBound=1, cat='Integer') # OBJECTIVE FUNCTION prob += pulp.lpSum(w[concepts[i]] * c[i] for i in range(C)) if unique: prob += pulp.lpSum(w[concepts[i]] * c[i] for i in range(C)) + \ 10e-6 * pulp.lpSum(f[tokens[k]] * t[k] for k in range(T)) # CONSTRAINT FOR SUMMARY SIZE if units == "WORDS": prob += pulp.lpSum(s[j] * self.sentences[j].length for j in range(S)) <= L if units == "CHARACTERS": prob += pulp.lpSum(s[j] * len(self.sentences[j].untokenized_form) for j in range(S)) <= L # INTEGRITY CONSTRAINTS for i in range(C): for j in range(S): if concepts[i] in self.sentences[j].concepts: prob += s[j] <= c[i] for i in range(C): prob += pulp.lpSum( s[j] for j in range(S) if concepts[i] in self.sentences[j].concepts) >= c[i] # WORD INTEGRITY CONSTRAINTS if unique: for k in range(T): for j in self.w2s[tokens[k]]: prob += s[j] <= t[k] for k in range(T): prob += pulp.lpSum(s[j] for j in self.w2s[tokens[k]]) >= t[k] # CONSTRAINTS FOR FINDING OPTIMAL SOLUTIONS for sentence_set in excluded_solutions: prob += pulp.lpSum([s[j] for j in sentence_set ]) <= len(sentence_set) - 1 # prob.writeLP('test.lp') # 
solving the ilp problem # solving the ilp problem try: print('BASEILP with CPLEX') prob.solve(pulp.CPLEX(msg=0)) except: #print('BASEILP fallback to %s' % (solver)) if solver == 'gurobi': prob.solve(pulp.GUROBI(msg=0)) elif solver == 'glpk': print('BASEILP with GLPK') prob.solve(pulp.GLPK(msg=0)) else: sys.exit('no solver specified') # retreive the optimal subset of sentences solution = set([j for j in range(S) if s[j].varValue == 1]) # returns the (objective function value, solution) tuple return (pulp.value(prob.objective), solution)
class Preprocessor:
    def __init__(self, normalizer):
        self.label_encoder = LabelEncoder()
        self.tf_idf_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            # list.append() returns None, so the original
            # stopwords.words('english').append([...]) silently passed
            # stop_words=None; concatenate the extra terms instead
            stop_words=stopwords.words('english') + [
                "nt", "get", "like", "would", "peopl", "one", "think",
                "time", "becaus"
            ],
            smooth_idf=True,
            norm="l2",
            lowercase=True,
            max_features=30000,
            use_idf=True,
            encoding="utf-8",
            decode_error='ignore',
            strip_accents='unicode',
            analyzer="word")
        if normalizer == "stemmer":
            self.normalizer = SnowballStemmer("english")
        elif normalizer == "lemmatizer":
            self.normalizer = WordNetLemmatizer()
        else:
            raise Exception("Normalizer must be \"stemmer\" or \"lemmatizer\".")

    def preprocess_reddit_train(self):
        """Stores a cleaned-up version of the dataset in the current directory"""
        # Read dataset
        df = pd.read_csv("data/reddit_train.csv")
        # Apply cleaning function
        df["cleaned"] = df["comments"].apply(self.clean_text)
        # Transform each subreddit into a unique integer
        df["label"] = self.label_encoder.fit_transform(df["subreddits"])
        # Save cleaned dataset
        df.to_csv("data/preprocessed_reddit_train_" +
                  type(self.normalizer).__name__ + ".csv",
                  index=False)
        # TODO: Implement Regularization (i.e. PCA, SVD, L1, L2...?)

    def clean_text(self, sentence):
        # Put all words to lower case
        sentence = sentence.lower()
        # Tokenize words
        word_tokens = word_tokenize(sentence)
        # Remove punctuation
        word_tokens = [_ for _ in word_tokens if _ not in string.punctuation]
        # Remove non-alphanumeric characters
        word_tokens = [
            re.sub(pattern=r"[^a-zA-Z0-9\s]", repl="", string=_)
            for _ in word_tokens
        ]
        # Remove empty strings
        word_tokens = [_ for _ in word_tokens if _]
        # Stem or lemmatize words
        processed_sentence = self.normalize(" ".join(word_tokens))
        # TODO: Remove links?
        return processed_sentence.strip()

    def normalize(self, sentence):
        normalized_str = []
        word_tokens = word_tokenize(sentence)
        if type(self.normalizer).__name__ == "SnowballStemmer":
            for i in word_tokens:
                normalized_str.append(self.normalizer.stem(i))
        elif type(self.normalizer).__name__ == "WordNetLemmatizer":
            for i in word_tokens:
                normalized_str.append(self.normalizer.lemmatize(i))
        else:
            raise Exception("Normalizer must be \"stemmer\" or \"lemmatizer\".")
        return " ".join(normalized_str)

    def preprocess_reddit_test(self):
        """Returns a cleaned-up version of the test dataset"""
        # Read dataset
        df = pd.read_csv("data/reddit_test.csv")
        # Apply cleaning function
        df["cleaned"] = df["comments"].apply(self.clean_text)
        # Store cleaned test set
        df.to_csv("data/preprocessed_reddit_test_" +
                  type(self.normalizer).__name__ + ".csv",
                  index=False)
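A hedged sketch of how clean_text in the Preprocessor above behaves on a single comment (exact output depends on NLTK's tokenizer and the WordNet lemmatizer's noun default; no CSV files are needed for this call):

prep = Preprocessor("lemmatizer")
print(prep.clean_text("The dogs aren't RUNNING!!"))
# e.g. 'the dog are nt running'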
def stem_test():
    stemmer = WordNetLemmatizer()
    print(stemmer.lemmatize("environment"))
    print(stemmer.lemmatize("environmental"))
    stemmer = PorterStemmer()
    print(stemmer.stem("environmental"))