Code Example #1
File: views.py Project: reshng10/Pro
def stem_metn(request):
    soz_class = NameForm
    cumle_class = TextForm
    morf_class = SozForm
    porter = PorterStemmer()
    lancaster = LancasterStemmer()
    k = request.POST.get('metn', '')
    alqo = request.POST.get('alqo', '')
    txt = k
    if alqo == 'Bizim Alqoritm':
        txt = metn_oxu(k)
    elif alqo == 'Porter Alqoritmi':
        txt = porter.stem(txt)
    elif alqo == 'Lancaster Alqoritmi':
        txt = lancaster.stem(txt)
    elif alqo == 'WordNet Alqoritmi':
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = metn_oxu(wordnet_lemmatizer.lemmatize(k))

    return render(request, 'metn.html', {
        'form': soz_class,
        'cumle': cumle_class,
        'morf': morf_class,
        'txt': txt
    })
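The view above dispatches between two stemmers and the WordNet lemmatizer, and the fix on the last branch matters because WordNetLemmatizer exposes lemmatize(), not stem(). A minimal standalone sketch of the difference between the three algorithms (assuming NLTK and its wordnet corpus are installed; the variable names are illustrative only):

from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

word = "running"

# Stemmers clip suffixes with rule-based heuristics.
print(PorterStemmer().stem(word))     # 'run'
print(LancasterStemmer().stem(word))  # 'run'

# The lemmatizer looks the word up in WordNet; without a POS hint it
# assumes a noun and leaves "running" unchanged.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize(word))           # 'running'
print(lemmatizer.lemmatize(word, pos="v"))  # 'run'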
Code Example #2
import re

import nltk
import pymorphy2
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tqdm import tqdm


def data_prepare(language_dict, ru=False, en=False, es=False):
    if ru:
        stop_words = set(stopwords.words('russian'))
        lemmatizer = pymorphy2.MorphAnalyzer()
    elif en:
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
    elif es:
        stop_words = set(stopwords.words('spanish'))
        lemmatizer = nltk.stem.SnowballStemmer('spanish')

    dict_prepared = []

    for text in tqdm(language_dict):
        text = re.sub(r'[^\w\s]', '', text.lower())
        text = re.sub(r'[0-9]', '', text)

        word_tokens = word_tokenize(text)
        word_tokens = [w for w in word_tokens if w not in stop_words]
        if ru:
            word_tokens = [
                lemmatizer.parse(w)[0].normal_form for w in word_tokens
            ]
        elif en:
            word_tokens = [lemmatizer.lemmatize(w) for w in word_tokens]
        elif es:
            word_tokens = [lemmatizer.stem(w) for w in word_tokens]

        word_tokens = [w for w in word_tokens if w not in stop_words]

        filtered_text = ' '.join(word_tokens)
        dict_prepared.append(filtered_text)
    return dict_prepared
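A usage sketch for data_prepare (hypothetical input list; assumes the NLTK punkt, stopwords, and wordnet resources have already been downloaded):

docs_en = [
    "The cats were running across 3 gardens!",
    "Dogs are usually friendlier than cats.",
]
# returns one cleaned string per document: lowercased, stripped of digits and
# punctuation, stopword-filtered, and lemmatized with WordNet (noun default)
prepared = data_prepare(docs_en, en=True)
print(prepared)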
Code Example #3
from nltk.stem import WordNetLemmatizer


def stemming_word_with_WordNet(word):
    print("actual word " + word)
    word_net = WordNetLemmatizer()
    """
    WordNetLemmatizer
    arg-1 : word
    arg-2 : noun, verb, adjective etc.. eg: (pos ="v")
    """

    print("stemmed word " + word_net.lemmatize(word))
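As the docstring notes, lemmatize() takes the word and an optional part-of-speech tag; a short sketch of how the pos hint changes the result (assuming the wordnet corpus is available):

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("mice"))             # 'mouse' (default pos is noun)
print(lemmatizer.lemmatize("was", pos="v"))     # 'be'
print(lemmatizer.lemmatize("better", pos="a"))  # 'good'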
Code Example #4
File: clustering.py Project: Hikari9/Matching
def stem_cluster(data, mode = 10, length_at_least = 3):

	global stemmer
	
	# load default stemmer (nltk lemmatizer)
	if stemmer is None:
		
		try: # import if corpus exists
			from nltk.stem import WordNetLemmatizer

		except ImportError: # download the corpus if it does not exist
			import nltk
			if not nltk.download('wordnet'):
				raise Exception('Error in downloading wordnet. \
								Please make sure you are connected to the network, \
								or try downloading manually.')
			from nltk.stem import WordNetLemmatizer

		# cache the default stemmer
		stemmer = WordNetLemmatizer()

		# port the lemmatizer as the stemmer
		stemmer.stem = stemmer.lemmatize

	from algoutils import flatten, split
	from collections import defaultdict
	
	# split data into words
	words = flatten(split(data, ' '))

	# collect frequency of individual words
	frequency = defaultdict(int)
	for word in words:
		if len(word) >= length_at_least:
			frequency[word] += 1
	
	# filter words by frequency
	words = [word for word, freq in frequency.items() if freq >= mode]
	
	# trim stems
	stem_map = defaultdict(list)
	stem = stemmer.stem
	for word in words:
		stem_map[stem(word)].append(word)
	
	# only return representative
	# aka. the word with least length
	return [min(reps, key=len) for reps in stem_map.values()]
Code Example #5
def stem_cluster(data, mode=10, length_at_least=3):

    global stemmer

    # load default stemmer (nltk lemmatizer)
    if stemmer is None:

        try:  # import if corpus exists
            from nltk.stem import WordNetLemmatizer

        except ImportError:  # download the corpus if it does not exist
            import nltk
            if not nltk.download('wordnet'):
                raise Exception('Error in downloading wordnet. \
								Please make sure you are connected to the network, \
								or try downloading manually.')
            from nltk.stem import WordNetLemmatizer

        # cache the default stemmer
        stemmer = WordNetLemmatizer()

        # port the lemmatizer as the stemmer
        stemmer.stem = stemmer.lemmatize

    from algoutils import flatten, split
    from collections import defaultdict

    # split data into words
    words = flatten(split(data, ' '))

    # collect frequency of individual words
    frequency = defaultdict(int)
    for word in words:
        if len(word) >= length_at_least:
            frequency[word] += 1

    # filter words by frequency
    words = [word for word, freq in frequency.items() if freq >= mode]

    # trim stems
    stem_map = defaultdict(list)
    stem = stemmer.stem
    for word in words:
        stem_map[stem(word)].append(word)

    # only return representative
    # aka. the word with least length
    return [min(reps, key=len) for reps in stem_map.values()]
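Both versions of stem_cluster rely on the same trick: binding lemmatize under the name stem so the lemmatizer can stand in wherever a .stem() interface is expected. A standalone sketch of that pattern (make_stemmer is a hypothetical helper, not part of the project above):

from nltk.stem import PorterStemmer, WordNetLemmatizer

def make_stemmer(use_wordnet=True):
    """Return an object exposing .stem(), whichever backend is chosen."""
    if use_wordnet:
        backend = WordNetLemmatizer()
        # alias lemmatize() under the stemmer interface
        backend.stem = backend.lemmatize
    else:
        backend = PorterStemmer()
    return backend

print(make_stemmer(True).stem("geese"))   # 'goose' via the WordNet lookup
print(make_stemmer(False).stem("geese"))  # Porter has no lookup and just trims suffixes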
Code Example #6
File: Stemmer.py Project: dchu1/NLP_Class_Project
class Stemmer:
    stemmer = None
    mode = None
    join = True

    def __init__(self, mode='Porter', join=True):
        if (mode == 'Porter'):
            from nltk.stem import PorterStemmer
            self.stemmer = PorterStemmer()
        elif (mode == 'Lancaster'):
            from nltk.stem import LancasterStemmer
            self.stemmer = LancasterStemmer()
        elif (mode == 'Lemmatize'):
            from nltk.stem import WordNetLemmatizer
            self.stemmer = WordNetLemmatizer()
        elif (mode == 'Snowball'):
            raise Exception("TODO")
        elif (mode == 'Regexp'):
            raise Exception("TODO")
        self.mode = mode
        self.join = join

    def __str__(self):
        return ('NLTK Stemmer using ' + self.mode + ' Stemming')

    # data is a list of strings or a list of list of strings
    # returns either a list of words or a joined list
    def fitTransform(self, data):
        return self.transform(data)

    # data is a list of list of words
    # returns either a list of words or a joined list
    def transform(self, data):
        # for each list of words in data list, lemmatize/stem each word
        if (self.mode == "Lemmatize"):
            result = [[self.stemmer.lemmatize(word) for word in doc]
                      for doc in data]
        else:
            result = [[self.stemmer.stem(word) for word in doc]
                      for doc in data]

        # if necessary join words in each list of words in result list
        if (self.join):
            result = [' '.join(doc) for doc in result]
        return result
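A brief usage sketch for the wrapper class above (hypothetical token lists; the NLTK data required by each backend is assumed to be installed):

docs = [["the", "cats", "were", "running"], ["she", "studies", "daily"]]

porter = Stemmer(mode='Porter', join=True)
print(porter.transform(docs))   # one joined, stemmed string per document

lemma = Stemmer(mode='Lemmatize', join=False)
print(lemma.transform(docs))    # one list of lemmatized tokens per document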
Code Example #7
File: app.py Project: ArthurGoodman/paraphrase
def _tokenize(doc, filter_stopwords=True, normalize='lemma'):
    import nltk.corpus
    from nltk.stem import PorterStemmer, WordNetLemmatizer
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    from string import punctuation

    # use NLTK's default set of english stop words
    stops_list = nltk.corpus.stopwords.words('english')

    if normalize == 'lemma':
        # lemmatize with WordNet
        normalizer = WordNetLemmatizer()
    elif normalize == 'stem':
        # stem with Porter
        normalizer = PorterStemmer()

    # tokenize the document into sentences with NLTK default
    sents = sent_tokenize(doc)
    # tokenize each sentence into words with NLTK default
    tokenized_sents = [wordpunct_tokenize(sent) for sent in sents]
    # filter out "bad" words, normalize good ones
    normalized_sents = []
    for tokenized_sent in tokenized_sents:
        good_words = [word for word in tokenized_sent
                      # filter out too-long words
                      if len(word) < 25
                      # filter out bare punctuation
                      if word not in list(punctuation)]
        if filter_stopwords is True:
            good_words = [word for word in good_words
                          # filter out stop words
                          if word not in stops_list]
        if normalize == 'lemma':
            normalized_sents.append(
                [normalizer.lemmatize(word, 'v') for word in good_words])
        elif normalize == 'stem':
            normalized_sents.append([normalizer.stem(word)
                                     for word in good_words])
        else:
            normalized_sents.append([word for word in good_words])

    return normalized_sents
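A quick call sketch for _tokenize (the sample sentence is arbitrary; NLTK's punkt, stopwords, and wordnet data are assumed to be available):

doc = "The striped bats were hanging on their feet. They were eating fruit."
sents = _tokenize(doc, filter_stopwords=True, normalize='lemma')
print(sents)  # one list of lemmatized, stopword-free tokens per sentence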
Code Example #8
File: utils.py Project: halmga/pantext
def text_normalize(text, method='lemmas'):
    """
    
    Parameters
    ----------
    text: list
        List or pandas column of texts as strings
    method: str, {'lemmas','stems'}, default 'lemmas'
        Normalization method used on text.
    
    Returns
    -------
    normalized_text: list
        List of lists with lemmas or stems as strings
    
    Examples
    --------
    normalize text with lemmas
    
    >>> dickens = ["It was the best of times.", "it was the worst of times!"]
    >>> normalized_text = px.text_normalize(dickens, method = 'lemmas')
    >>> normalized_text
    [['it', 'be', 'the', 'best', 'of', 'time'],
    ['it', 'be', 'the', 'worst', 'of', 'time']]
    
    """
    if method == 'lemmas':
        normalizer = WordNetLemmatizer()
    if method == 'stems':
        normalizer = PorterStemmer()
    temp = []
    for i in tqdm(range(len(text))):
        words = word_tokenize(text[i])
        words = [word.lower() for word in words if word.isalpha()]
        if method == 'lemmas':
            temp.append([normalizer.lemmatize(w, pos='v') for w in words])
        if method == 'stems':
            temp.append([normalizer.stem(w) for w in words])
    return temp
Code Example #9
File: tokenizer.py Project: cande1gut/RedditScore
class CrazyTokenizer(object):
    """
    Tokenizer with Reddit- and Twitter-specific options

    Parameters
    ----------
    lowercase : bool, optional
        If True, lowercase all tokens. Defaults to True.

    keepcaps: bool, optional
        If True, keep ALL CAPS WORDS uppercased. Defaults to False.

    normalize: int or bool, optional
        If not False, perform normalization of repeated characters
        ("awesoooooome" -> "awesooome"). The value of the parameter
        determines the number of occurrences to keep. Defaults to 3.

    ignore_quotes: bool, optional
        If True, ignore tokens contained within double quotes.
        Defaults to False.

    ignore_reddit_quotes: bool, optional
        If True, remove quotes from the Reddit comments. Defaults to False.

    ignore_stopwords: str, list, or boolean, optional
        Whether to ignore stopwords

        - str: language to get the NLTK stopword list for
        - list: list of stopwords to remove
        - True: use the built-in list of English stop words
        - False: keep all tokens

        Defaults to False

    stem: {False, 'stem', 'lemm'}, optional
        Whether to perform word stemming

        - False: do not perform word stemming
        - 'stem': use PorterStemmer from NLTK package
        - 'lemm': use WordNetLemmatizer from NLTK package

    remove_punct: bool, optional
        If True, remove punctuation tokens. Defaults to True.

    remove_breaks: bool, optional
        If True, remove linebreak tokens. Defaults to True.

    decontract: bool, optional
        If True, attempt to expand certain contractions. Defaults to False.
        Example: "'ll" -> " will"

    numbers, subreddits, reddit_usernames, emails:
    False or str, optional
        Replacement of the different types of tokens

        - False: leaves these tokens intact
        - str: replacement token
        - '': removes all occurrences of these tokens

    twitter_handles: False, 'realname' or str, optional
        Processing of twitter handles

        - False: do nothing
        - str: replacement token
        - 'realname': replace with the real screen name of Twitter account
        - 'split': split handles using Viterbi algorithm

        Example: "#vladimirputinisthebest" -> "vladimir putin is the best"

    hashtags: False or str, optional
        Processing of hashtags

        - False: do nothing
        - str: replacement token
        - 'split': split hashtags using the Viterbi algorithm

    urls: False or str, optional
        Replacement of parsed URLs

        - False: leave URL intact
        - str: replacement token
        - dict: replace all URLs stored in keys with the corresponding values
        - '': removes all occurrences of these tokens
        - 'domain': extract domain ("http://cnn.com" -> "cnn")
        - 'domain_unwrap_fast': extract domain after unwrapping links
        for a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com)
        - 'domain_unwrap': extract domain after unwrapping all links
        - 'title': extract and tokenize the title of each link after unwrapping it

        Defaults to False.

    extra_patterns: None or list of tuples, optional
        Replacement of any user-supplied extra patterns.
        Tuples must have the following form: (name, re_pattern, replacement_token):

        - name (str): name of the pattern
        - re_pattern (_sre.SRE_Pattern): compiled re pattern
        - replacement_token (str): replacement token

        Defaults to None

    keep_untokenized: None or list, optional
        List of expressions to keep untokenized

        Example: ["New York", "Los Angeles", "San Francisco"]

    whitespaces_to_underscores: boolean, optional
        If True, replace all whitespace characters with
        underscores in the final tokens. Defaults to True.

    remove_nonunicode: boolean, optional
        If True, remove all non-unicode characters. Defaults to False.

    pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional
        Replace positive, negative, and neutral emojis with the special tokens

        - None: do not perform replacement
        - True: perform replacement of the default lists of emojis
        - list: list of emojis to replace

    print_url_warnings: bool, optional
        If True, print URL-related warnings. Defaults to False.

    latin_chars_fix: bool, optional
        Try applying this fix if you have a lot of \\xe2\\x80\\x99-like
        or U+1F601-like strings in your data. Defaults to False.

    ngrams: int, optional
        Add ngrams of tokens after tokenizing
    """
    def __init__(self,
                 lowercase=True,
                 keepcaps=False,
                 normalize=3,
                 ignore_quotes=False,
                 ignore_reddit_quotes=False,
                 ignore_stopwords=False,
                 stem=False,
                 remove_punct=True,
                 remove_breaks=True,
                 decontract=False,
                 twitter_handles=False,
                 urls=False,
                 hashtags=False,
                 numbers=False,
                 subreddits=False,
                 reddit_usernames=False,
                 emails=False,
                 extra_patterns=None,
                 keep_untokenized=None,
                 whitespaces_to_underscores=True,
                 remove_nonunicode=False,
                 pos_emojis=None,
                 neg_emojis=None,
                 neutral_emojis=None,
                 print_url_warnings=False,
                 latin_chars_fix=False,
                 ngrams=1):
        self.params = locals()

        self._nlp = English()
        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add('HASHTAG', None, [{
            'ORTH': '#'
        }, {
            'IS_ASCII': True
        }])
        self._merging_matcher.add('SUBREDDIT', None, [{
            'ORTH': '/r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }], [{
            'ORTH': 'r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }])
        self._merging_matcher.add('REDDIT_USERNAME', None,
                                  [{
                                      'ORTH': '/u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }], [{
                                      'ORTH': 'u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }])

        if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
            try:
                self._stopwords = stopwords.words(ignore_stopwords)
            except OSError:
                raise ValueError('Language {} was not found by NLTK'.format(
                    ignore_stopwords))
        elif ignore_stopwords is True:
            self._matcher.add('STOPWORDS', self._remove_token, [{
                'IS_STOP': True
            }])
        elif isinstance(ignore_stopwords, list):
            self._stopwords = [word.lower() for word in ignore_stopwords]
        elif ignore_stopwords is not False:
            raise TypeError(
                'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed'
                .format(type(ignore_stopwords)))

        if lowercase and (not keepcaps):
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False
            }])
        elif lowercase and keepcaps:
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False,
                'IS_UPPER': False
            }])

        if remove_punct:
            self._matcher.add('PUNCTUATION', self._remove_token,
                              [{
                                  'IS_PUNCT': True
                              }])

        if remove_breaks:

            def break_check(text):
                return bool(BREAKS_RE.fullmatch(text))

            break_flag = self._nlp.vocab.add_flag(break_check)
            self._matcher.add('BREAK', self._remove_token, [{
                break_flag: True
            }])

        if normalize:

            def normalize_check(text):
                return bool(NORMALIZE_RE.search(text))

            normalize_flag = self._nlp.vocab.add_flag(normalize_check)
            self._matcher.add('NORMALIZE', self._normalize,
                              [{
                                  normalize_flag: True
                              }])

        if numbers is not False:
            self._matcher.add('NUMBER', self._replace_token, [{
                'LIKE_NUM': True
            }])
            self._replacements['NUMBER'] = numbers

        if urls is not False:
            if urls in [
                    'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title'
            ]:
                self._urls = urls
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            elif isinstance(urls, dict):
                self._domains = urls
                self._urls = 'domain_unwrap_fast'
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            else:
                self._matcher.add('URL', self._replace_token, [{
                    'LIKE_URL': True
                }])
                self._replacements['URL'] = urls

        if emails is not False:
            self._matcher.add('EMAIL', self._replace_token, [{
                'LIKE_EMAIL': True
            }])
            self._replacements['EMAIL'] = emails

        if reddit_usernames is not False:

            def reddit_username_check(text):
                return bool(REDDITORS_RE.fullmatch(text))

            reddit_username_flag = self._nlp.vocab.add_flag(
                reddit_username_check)
            self._matcher.add('REDDIT_USERNAME', self._replace_token,
                              [{
                                  reddit_username_flag: True
                              }])
            self._replacements['REDDIT_USERNAME'] = reddit_usernames

        if subreddits is not False:

            def subreddit_check(text):
                return bool(SUBREDDITS_RE.fullmatch(text))

            subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
            self._matcher.add('SUBREDDIT', self._replace_token,
                              [{
                                  subreddit_flag: True
                              }])
            self._replacements['SUBREDDIT'] = subreddits

        if twitter_handles is not False:
            self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                              [{
                                  twitter_handle_flag: True
                              }])

        if hashtags is not False:
            self._matcher.add('HASHTAG', self._hashtag_postprocess,
                              [{
                                  hashtag_flag: True
                              }])

        if hashtags == 'split' or twitter_handles == 'split':
            file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
            with open(file) as f:
                self._words = f.read().split()
            self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                                  for i, k in enumerate(self._words))
            self._maxword = max(len(x) for x in self._words)

        if twitter_handles == 'realname':
            with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
                self._realnames = json.load(f)

        if ignore_quotes:
            self._merging_matcher.add('QUOTE', None, [{
                'ORTH': '"'
            }, {
                'OP': '*',
                'IS_ASCII': True
            }, {
                'ORTH': '"'
            }])

            def doublequote_check(text):
                return bool(QUOTES_RE.fullmatch(text))

            doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
            self._matcher.add('DOUBLE_QUOTES', self._remove_token,
                              [{
                                  doublequote_flag: True
                              }])

        if self._stopwords:

            def stopword_check(text):
                return bool(text.lower() in self._stopwords)

            stopword_flag = self._nlp.vocab.add_flag(stopword_check)
            self._matcher.add('STOPWORD', self._remove_token,
                              [{
                                  stopword_flag: True
                              }])

        if keep_untokenized is not None:
            if not isinstance(keep_untokenized, list):
                raise ValueError(
                    "keep_untokenized has to be either None or a list")
            for i, phrase in enumerate(keep_untokenized):
                phrase_tokens = phrase.split(' ')
                rule = []
                for token in phrase_tokens:
                    rule.append({'LOWER': token.lower()})
                self._merging_matcher.add('RULE_' + str(i), None, rule)

        if pos_emojis:
            if not isinstance(pos_emojis, list):
                pos_emojis = POS_EMOJIS
            pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
            self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
            self._replacements['HAPPY'] = 'POS_EMOJI'

        if neg_emojis:
            if not isinstance(neg_emojis, list):
                neg_emojis = NEG_EMOJIS
            neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
            self._matcher.add('SAD', self._replace_token, *neg_patterns)
            self._replacements['SAD'] = 'NEG_EMOJI'

        if neutral_emojis:
            if not isinstance(neutral_emojis, list):
                neutral_emojis = NEUTRAL_EMOJIS
            neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
            self._matcher.add('NEUTRAL', self._replace_token,
                              *neutral_patterns)
            self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

        if isinstance(extra_patterns, list):
            self._flags = {}
            for name, re_pattern, replacement_token in extra_patterns:

                def flag(text):
                    return bool(re_pattern.match(text))

                self._flags[name] = self._nlp.vocab.add_flag(flag)
                self._matcher.add(name, self._replace_token,
                                  [{
                                      self._flags[name]: True
                                  }])
                self._replacements[name] = replacement_token

        if stem and ('nltk' in sys.modules):
            if stem == 'stem':
                self._stemmer = PorterStemmer()
            elif stem == 'lemm':
                self._stemmer = WordNetLemmatizer()
            else:
                raise ValueError(
                    'Stemming method {} is not supported'.format(stem))
            self._matcher.add('WORD_TO_STEM', self._stem_word,
                              [{
                                  'IS_ALPHA': True
                              }])

        retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
        self._matcher.add('RETOKENIZE', self._retokenize,
                          [{
                              retokenize_flag: True,
                              'IS_PUNCT': False,
                              'LIKE_URL': False,
                              'LIKE_EMAIL': False,
                              'LIKE_NUM': False,
                              hashtag_flag: False,
                              twitter_handle_flag: False
                          }])

        self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
        self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
        self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)

    @staticmethod
    def _lowercase(__, doc, i, matches):
        # Lowercase tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = tok._.transformed_text.lower()

    def _stem_word(self, __, doc, i, matches):
        # Stem tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['stem'] == 'stem':
                tok._.transformed_text = self._stemmer.stem(
                    tok._.transformed_text)
            elif self.params['stem'] == 'lemm':
                tok._.transformed_text = self._stemmer.lemmatize(
                    tok._.transformed_text)

    def _normalize(self, __, doc, i, matches):
        # Normalize repeating symbols
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = NORMALIZE_RE.sub(
                r"\1" * self.params['normalize'], tok._.transformed_text)

    def _process_url(self, __, doc, i, matches):
        # Process found URLs
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            found_urls = URLS_RE.findall(tok.text)
            if found_urls:
                if found_urls[0] in self._domains:
                    tok._.transformed_text = self._domains[found_urls[0]]
                elif self._urls == 'domain':
                    tok._.transformed_text = tldextract.extract(
                        found_urls[0]).domain
                elif self._urls != 'title':
                    if self._urls == 'domain_unwrap':
                        domain = unshorten_url(
                            found_urls[0], None,
                            self.params['print_url_warnings'])
                    else:
                        domain = unshorten_url(
                            found_urls[0], URL_SHORTENERS,
                            self.params['print_url_warnings'])
                    self._domains[found_urls[0]] = domain
                    tok._.transformed_text = domain
                elif self._urls == 'title':
                    domain = unshorten_url(found_urls[0], URL_SHORTENERS)
                    if domain != 'twitter':
                        title = get_url_title(
                            found_urls[0], self.params['print_url_warnings'])
                        title = self.tokenize(URLS_RE.sub('', title))
                    else:
                        title = ''
                    tok._.transformed_text = title
                    self._domains[found_urls[0]] = title

    def _replace_token(self, __, doc, i, matches):
        # Replace tokens with something else
        match_id, start, end = matches[i]
        span = doc[start:end]
        replacement_token = self._replacements[doc.vocab.strings[match_id]]
        for tok in span:
            tok._.transformed_text = replacement_token

    @staticmethod
    def _remove_token(__, doc, i, matches):
        # Remove tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = ''

    def _retokenize(self, __, doc, i, matches):
        # Retokenize
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            text = tok.text
            text = re.sub(r'([#@])', r' \1', text)
            text = re.sub(r'\s{2,}', ' ', text).strip()
            tok._.transformed_text = self.tokenize(text)

    def _infer_spaces(self, text):
        # Infer location of spaces in hashtags
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        def best_match(i):
            # Find the best match for the first i characters
            # assuming costs has been built for the first (i-1) characters
            candidates = enumerate(reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1)
                for k, c in candidates)

        cost = [0]
        for i in range(1, len(text) + 1):
            cur_cost, k = best_match(i)
            cost.append(cur_cost)

        out = []
        i = len(text)
        while i > 0:
            cur_cost, k = best_match(i)
            assert cur_cost == cost[i]
            out.append(text[i - k:i])
            i -= k

        return list(reversed(out))

    def _handles_postprocess(self, __, doc, i, matches):
        # Process twitter handles
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['twitter_handles'] == 'realname':
                if tok.text in self._realnames:
                    tok._.transformed_text = self._realnames[tok.text]
                else:
                    handle = get_twitter_realname(tok.text)
                    realname = self.tokenize(TWITTER_HANDLES_RE.sub(
                        '', handle))
                    tok._.transformed_text = realname
                    self._realnames[tok.text] = realname
            elif self.params['twitter_handles'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['twitter_handles']

    def _hashtag_postprocess(self, __, doc, i, matches):
        # Process hashtags
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['hashtags'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['hashtags']

    @staticmethod
    def _decontract(text):
        # Expand contractions
        for contraction, decontraction in DECONTRACTIONS.items():
            text = re.sub(contraction, decontraction, text)
        return text

    def _preprocess_text(self, text):
        # Do some preprocessing
        text = re.sub("’", "'", text)
        if self.params['remove_nonunicode']:
            try:
                text = text.encode('utf-8').decode('unicode-escape')
                text = ''.join(filter(lambda x: x in string.printable,
                                      text)).strip()
            except UnicodeDecodeError:
                warnings.warn(
                    'UnicodeDecodeError while trying to remove non-unicode characters'
                )
        if self.params['decontract']:
            text = self._decontract(text)
        text = html.unescape(text)

        if self.params['latin_chars_fix']:
            if EMOJIS_UTF_RE.findall(text):
                text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text)
                for utf_code, emoji in EMOJIS_UTF.items():
                    text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text)

            if EMOJIS_UNICODE_RE.findall(text):
                text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text)
                for utf_code, emoji in EMOJIS_UNICODE.items():
                    text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text)

            if LATIN_CHARS_RE.findall(text):
                for _hex, _char in LATIN_CHARS.items():
                    text = LATIN_CHARS_PATS[_hex].sub(_char, text)

        if self.params['ignore_reddit_quotes']:
            text = REDDIT_QUOTES_RE.sub(' ', text)

        text = text.replace('.@', '. @')
        text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text)
        text = re.sub(r'\s{2,}', ' ', text)

        return text.strip()

    def _merge_doc(self, doc):
        # Perform merging for certain types of tokens
        matches = self._merging_matcher(doc)
        spans = []
        for __, start, end in matches:
            spans.append(doc[start:end])
        for span in spans:
            span.merge()
        for tok in doc:
            tok._.transformed_text = tok.text

        return doc

    def _match_doc(self, doc):
        # Perform all additional processing
        self._matcher(doc)
        return doc

    def _postproc_doc(self, doc):
        # Perform postprocessing
        doc._.tokens = []
        for tok in doc:
            if isinstance(tok._.transformed_text, list):
                doc._.tokens.extend(tok._.transformed_text)
            elif tok._.transformed_text.strip() != '':
                if self.params['whitespaces_to_underscores']:
                    tok._.transformed_text = "_".join(
                        tok._.transformed_text.split())
                doc._.tokens.append(tok._.transformed_text.strip())
        return doc

    def tokenize(self, text):
        """
        Tokenize document

        Parameters
        ----------
        text : str
            Document to tokenize

        Returns
        -------
        list
            List of tokens

        Examples
        --------
        >>> from redditscore.tokenizer import CrazyTokenizer
        >>> tokenizer = CrazyTokenizer(hashtags='split')
        >>> tokenizer.tokenize("#makeamericagreatagain")
        ["make", "america", "great", "again"]
        """
        if not isinstance(text, str):
            warnings.warn('Document {} is not a string'.format(text))
            return []
        text = self._preprocess_text(text)
        doc = self._nlp(text)
        tokens = doc._.tokens
        if self.params['ngrams'] > 1:
            if self.params['whitespaces_to_underscores']:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']),
                                     separator='_')
            else:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']))
        return tokens
Code Example #10
class CorpusIterator(six.Iterator):
    """Class to do tokenization, tagging, stemming, etc. and yield each
        document 1 at a time. Only ever loads one doc into memory"""
    def __init__(self,
                 indir,
                 n,
                 stem=None,
                 stop_words=True,
                 tag=None,
                 tag_pattern=None,
                 punctuation=True,
                 split_clauses=False,
                 outdir=None):
        """Constructor
        Input:
            indir: path to directory of txt files
            n: order of n gram
            stem: {'snowball','porter','lemma',None} stemmer to use
                    Defaults to None.
            stop_words: Boolean. include stopwords. Defaults to True
            tag: {'nltk',None}. POS tagger to use. Defaults 
                    to None
            tag_pattern: list of tag patterns to allow in simplified form.
                         Defaults to None. If tag_pattern = "default",
                         use the default tag pattern.
            punctuation: Boolean. include punctuation. Defaults to True
            split_clauses: Boolean. Split on clauses
            outdir: directory to write to. Defaults to indir/ngram_results
        """
        self.indir = indir
        # check if directory is zip archive or directory and act accordingly
        if not zipfile.is_zipfile(indir):
            # list the files in the directory
            self.files = sorted([
                os.path.join(indir, f) for f in os.listdir(indir)
                if os.path.splitext(f)[1] == ".txt"
            ])
            # create directory for results
            if outdir is None:
                outdir = os.path.join(indir, "ngram_results")
            # check if the directory exists; if not, create it
            if not os.path.exists(outdir):
                os.mkdir(outdir)
            # set zip_corpus to None
            self.zip_corpus = None
        else:
            # files is the namelist of the zip archive
            self.zip_corpus = zipfile.ZipFile(indir)
            self.files = self.zip_corpus.namelist()
            # create directory for results in the directory of the zip archive
            if outdir is None:
                # get the directory of the zip archive
                tmp = os.path.split(indir)[0]
                outdir = os.path.join(tmp, "ngram_results")
            # check if the directory exists, if not create the directory
            if not os.path.exists(outdir):
                os.mkdir(outdir)
        # assign option variables
        self.n = n
        self.stem = stem
        self.stop_words = stop_words
        self.tag = tag
        self.tag_pattern = tag_pattern
        self.punctuation = punctuation
        self.outdir = outdir

        # keep an index for the __next__() function
        self.index = 0

        # class variable holding default tag patterns and dict for conversion
        # to universal tag set
        self.default_tagpatterns = set([
            'AN', 'NN', 'VN', 'VV', 'VP', 'NNN', 'AAN', 'ANN', 'NAN', 'NPN',
            'VAN', 'VNN', 'VPN', 'ANV', 'NVV', 'VDN', 'VVV', 'VVP'
        ])
        self.default_tagset = set(''.join(self.default_tagpatterns))
        self.tagdict = keydefaultdict(
            lambda x: x, {
                'NN': 'N',
                'NNS': 'N',
                'NNP': 'N',
                'NNPS': 'N',
                'JJ': 'A',
                'JJR': 'A',
                'JJS': 'A',
                'VBG': 'A',
                'RB': 'A',
                'DT': 'D',
                'IN': 'P',
                'TO': 'P',
                'VB': 'V',
                'VBD': 'V',
                'VBN': 'V',
                'VBP': 'V',
                'VBZ': 'V',
                'MD': 'V',
                'RP': 'P'
            })

        # class variable which contains english stop words as a set
        self.stop = set(stopwords.words('english'))

        # set up tagger if tag is not None
        if tag is not None:
            if tag == 'nltk':
                # create a named tuple which holds nltk.pos_tag_sents as
                # tag_sents
                NLTKTagger = namedtuple("NLTKTagger", ["tag_sents", "tag"])
                self.tagger = NLTKTagger(nltk.pos_tag_sents, nltk.pos_tag)
            else:
                # raise a value error if an unsupported tagger is included
                raise ValueError('Not an available tagger')
        # initialize stemmer if stem is not None
        if stem is not None:
            if stem == 'porter':
                self.stemmer = PorterStemmer()
            elif stem == 'snowball':
                self.stemmer = SnowballStemmer("english")
            elif stem == 'lemma':
                self.stemmer = WordNetLemmatizer()
                # add stem as another name for lemmatize
                self.stemmer.stem = self.stemmer.lemmatize
            else:
                # raise a value error if a wrong stemmer is chosen
                raise ValueError('Not an available stemmer')
        # set splitting on clauses
        self.split_clauses = split_clauses
        # current clauses
        self.curr_clauses = []

    def __len__(self):
        """len function, number of documents"""
        return (len(self.files))

    def __next__(self):
        """Next function for iterator"""
        if self.index >= len(self.files):
            raise StopIteration
        # if not splitting on clauses
        if not self.split_clauses:
            # get sentences from file
            sents = doc_sents(self.files[self.index], zipped=self.zip_corpus)
            #get ngrams of doc and yield
            ngrams = self.ngrams_from_sents(sents, self.n, self.stem,
                                            self.stop_words, self.tag,
                                            self.tag_pattern, self.punctuation)
            self.index += 1
            return (ngrams)
        #if splitting on clauses use the clauses
        else:
            if len(self.curr_clauses) == 0:
                #get the sentences for the current clauses
                self.curr_clauses = doc_sents(self.files[self.index],
                                              zipped=self.zip_corpus,
                                              clauses=True)
                self.index += 1
            #pop one clauses from self.curr_clauses
            sents = self.curr_clauses.pop()
            ngrams = self.ngrams_from_sents(sents, self.n, self.stem,
                                            self.stop_words, self.tag,
                                            self.tag_pattern, self.punctuation)
            return (ngrams)

    def __iter__(self):
        """Iterator, does tokenization,stemming,tagging,etc on a doc before
            returning it"""
        if not self.split_clauses:
            for i, fName in enumerate(sorted(self.files)):
                if i % 100 == 0:
                    logging.info("Computing N-grams for %ith file %s" %
                                 (i, fName))
                #get sentences from file
                sents = doc_sents(fName, zipped=self.zip_corpus)
                #get ngrams of doc and yield
                ngrams = self.ngrams_from_sents(sents, self.n, self.stem,
                                                self.stop_words, self.tag,
                                                self.tag_pattern,
                                                self.punctuation)
                yield (ngrams)
        else:
            for i, fName in enumerate(self.files):
                if i % 100 == 0:
                    logging.info("Computing N-grams for %ith file %s" %
                                 (i, fName))
                #get sentences for clauses
                clauses = doc_sents(fName,
                                    zipped=self.zip_corpus,
                                    clauses=True)
                for sents in clauses:
                    ngrams = self.ngrams_from_sents(sents, self.n, self.stem,
                                                    self.stop_words, self.tag,
                                                    self.tag_pattern,
                                                    self.punctuation)
                    yield (ngrams)

    def custom_ngrams(self, words, n):
        """Faster n gram generation than nltk.ngrams
        Input:
            words: word tokenized sentence
            n: order of ngram
        Output:
            ngrams: list of ngrams
        """
        ngrams = zip(*[words[i:] for i in range(n)])
        return (ngrams)

    def word_tokenize(self, words):
        """Faster word tokenization than nltk.word_tokenize
        Input:
            words: a string to be tokenized
        Output:
            tokens: tokenized words
        """
        tokens = re.findall(r"[a-z]+-?[a-z]+",
                            words.lower(),
                            flags=re.UNICODE)
        return (tokens)

    def ngrams_from_sents(self,
                          sents,
                          n,
                          stem=None,
                          stop_words=True,
                          tag=None,
                          tag_pattern=None,
                          punctuation=True):
        """Gets the ngrams from a list of sentences
        Input:
            sents: list of sentences as strings
            n: order of n gram
            stem: {'snowball','porter','lemma',None} stemmer to use
                    Defaults to None.
            stop_words: Boolean. include stopwords. Defaults to True
            tag: {'nltk',None}. POS tagger to use. Defaults
                    to None
            tag_pattern: list of tag patterns to allow in simplified form.
                         Defaults to None. If tag_pattern = "default",
                         use the default tag pattern.
            punctuation: Boolean. include punctuation. Defaults to True
        Output:
            ngrams: list of ngrams as "word1-word2" strings
        """

        #tag sentences first
        if tag is not None:
            #tokenize the sentences
            tmp = []
            for sent in sents:
                tmp.append([word.lower() for word in self.word_tokenize(sent)])
            sents = tmp
            if tag == 'nltk':
                # tag words
                tags = self.tagger.tag_sents(sents)
                # extract the tags without the words
                tags = [[self.tagdict[tagWord[1]] for tagWord in tags[i]]
                        for i in range(len(sents))]
            else:
                #raise a value error if an unsupported tagger is included
                raise ValueError('Not an available tagger')
        #iterate through sentences and get ngrams
        ngrams = []
        for i, words in enumerate(sents):
            if tag is None:
                #if tag is None then word tokenization hasn't happened
                words = self.word_tokenize(words)
            #stem words if stem is not None
            if stem is not None:
                words = [self.stemmer.stem(word) for word in words]
            #join tags and words if tag is not None
            if tag is not None:
                words = ['::'.join(tagWord) for tagWord in zip(words, tags[i])]
            #remove stop words if stop = False
            if not stop_words:
                words = [
                    word for word in words
                    if not word.split("::")[0] in self.stop
                ]
            #remove punctuation if punctuation is false
            if not punctuation:
                pun = string.punctuation
                words = [
                    word for word in words if not word.split("::")[0] in pun
                ]

            #get n grams and add to ngrams list
            sent_grams = [
                "_".join(gram) for gram in self.custom_ngrams(words, n)
            ]
            #if tag_pattern isn't None, go through sent_grams and only keep those
            #ngrams with the proper tag pattern
            if tag_pattern is not None:
                #assign default tag pattern if tag_pattern == 'default'
                if tag_pattern == 'default':
                    tag_pattern = self.default_tagpatterns
                tmp = []
                #maybe make this a list comprehension?
                for gram in sent_grams:
                    #get tags separately
                    tags_squash = [t.split("::")[1] for t in gram.split("_")]
                    #check if the tag pattern is allowed
                    if ''.join(tags_squash) in tag_pattern:
                        tmp.append(gram)
                sent_grams = tmp

            ngrams.extend(sent_grams)

        return (ngrams)
Code Example #11
# Parts of speech tagging

print "Processing data to tag parts of speech..."
pos_word_data = nltk.pos_tag(word_data)
print pos_word_data

# Stemming and Lemmatization

## Stemming with Porter Stemmer
print "Stemming with Porter Stemmer..."
porter_stemmer = PorterStemmer()
for w in word_data[:20]:
    print "Actual: %s Stem: %s" % (w, porter_stemmer.stem(w))

## Stemming with Lancaster Stemmer
print "Stemming with Lancaster Stemmer..."
lancaster_stemmer = LancasterStemmer()
for w in word_data[:20]:
    print "Actual: %s Stem: %s" % (w, lancaster_stemmer.stem(w))

## Stemming with Snowball Algorithm
print "Stemming with Snowball Stemmer..."
snowball_stemmer = SnowballStemmer("english")
for w in word_data[:20]:
    print "Actual: %s Stem: %s" % (w, snowball_stemmer.stem(w))

## Lemmatization with WordNet
wordnet_lemmatizer = WordNetLemmatizer()
for w in word_data[:20]:
    print "Actual: %s Lemma: %s" % (w, wordnet_lemmatizer.lemmatize(w))
Code Example #12
class ConceptBasedILPSummarizer(LoadFile):
    """Implementation of the concept-based ILP model for summarization.

    The original algorithm was published and described in:

      * Dan Gillick and Benoit Favre, A Scalable Global Model for Summarization,
        *Proceedings of the NAACL HLT Workshop on Integer Linear Programming for
        Natural Language Processing*, pages 10–18, 2009.
        
    """
    def __init__(self, input_directory, language):
        """
        Args:
            input_directory (str): the directory from which text documents to
              be summarized are loaded.

        @type language: str

        """
        self.input_directory = input_directory
        self.sentences = []
        self.weights = {}
        self.c2s = defaultdict(set)
        self.concept_sets = defaultdict(frozenset)
        self.LANGUAGE = language
        # type: str

        self.stoplist = set(stopwords.words(self.LANGUAGE))
        self.stemmer = WordNetLemmatizer()

        self.word_frequencies = defaultdict(int)
        self.w2s = defaultdict(set)

    def extract_ngrams2(self, concept_type='ngrams', n=2):
        """Extract the ngrams of words from the input sentences.

        Args:
            n (int): the number of words for ngrams, defaults to 2
        """
        for i, sentence in enumerate(self.sentences):
            untokenized_concepts = []
            if concept_type == 'ngrams':
                ngrams = extract_ngrams2([sentence.untokenized_form],
                                         self.stemmer, self.LANGUAGE, n)
                pruned_list = prune_ngrams(ngrams, self.stoplist, n)
            elif concept_type == 'phrase':
                pruned_list = self.sentences[i].phrases

            for concept in pruned_list:
                wrds = unstem_ngram(concept, sentence)
                untokenized_concepts.append(" ".join(wrds))

            self.sentences[i].concepts = pruned_list
            self.sentences[i].untokenized_concepts = untokenized_concepts
            #print(untokenized_concepts)
            if len(self.sentences[i].concepts) != len(
                    self.sentences[i].untokenized_concepts):
                raise BaseException(
                    "unexpected length difference between concepts and untokenized_concepts"
                )

    def extract_ngrams(self, n=2):
        """Extract the ngrams of words from the input sentences.

        Args:
            n (int): the number of words for ngrams, defaults to 2
        """
        for i, sentence in enumerate(self.sentences):

            # for each ngram of words
            for j in range(len(sentence.tokens) - (n - 1)):

                # initialize ngram container
                ngram = []

                # for each token of the ngram
                for k in range(j, j + n):
                    ngram.append(sentence.tokens[k].lower())

                # do not consider ngrams containing punctuation marks
                marks = [t for t in ngram if not re.search('[a-zA-Z0-9]', t)]
                if len(marks) > 0:
                    continue

                # do not consider ngrams composed of only stopwords
                stops = [t for t in ngram if t in self.stoplist]
                if len(stops) == len(ngram):
                    continue

                # lemmatize the ngram (self.stemmer is a WordNetLemmatizer)
                ngram = [self.stemmer.lemmatize(t) for t in ngram]

                # add the ngram to the concepts
                self.sentences[i].concepts.append(' '.join(ngram))

    def compute_document_frequency(self):
        """Compute the document frequency of each concept.

        """
        for i in range(len(self.sentences)):

            # for each concept
            for concept in self.sentences[i].concepts:

                # add the document id to the concept weight container
                if concept not in self.weights:
                    self.weights[concept] = set([])
                self.weights[concept].add(self.sentences[i].doc_id)

        # loop over the concepts and compute the document frequency
        for concept in self.weights:
            self.weights[concept] = len(self.weights[concept])

    def compute_word_frequency(self):
        """Compute the frequency of each word in the set of documents. """

        for i, sentence in enumerate(self.sentences):
            for token in sentence.tokens:
                t = token.lower()
                if not re.search('[a-zA-Z0-9]', t) or t in self.stoplist:
                    continue
                #t = self.stemmer.stem(t)
                t = self.stemmer.lemmatize(t)
                self.w2s[t].add(i)
                self.word_frequencies[t] += 1

    def prune_sentences(self,
                        mininum_sentence_length=5,
                        remove_citations=True,
                        remove_redundancy=True,
                        imp_list=None):
        """Prune the sentences.

        Remove the sentences that are shorter than a given length, redundant
        sentences and citations from entering the summary.

        Args:
            mininum_sentence_length (int): the minimum number of words for a
              sentence to enter the summary, defaults to 5
            remove_citations (bool): indicates that citations are pruned,
              defaults to True
            remove_redundancy (bool): indicates that redundant sentences are
              pruned, defaults to True

        """
        if imp_list is None:
            imp_list = []
        retained_sentences = []

        # loop over the sentences
        for i, sentence in enumerate(self.sentences):
            if imp_list:
                if imp_list[i] == 0:
                    continue
            # prune short sentences
            if sentence.length < mininum_sentence_length:
                continue

            # prune citations
            first_token, last_token = sentence.tokens[0], sentence.tokens[-1]

            if remove_citations and \
               (first_token == u"``" or first_token == u'"' \
                or last_token == u"''" or first_token == u'"' \
                or last_token== u"'" or first_token==u"'") \
                or last_token == u'"':
                continue

            # prune ___ said citations
            # if remove_citations and \
            #     (sentence.tokens[0]==u"``" or sentence.tokens[0]==u'"') and \
            #     re.search('(?i)(''|") \w{,30} (said|reported|told)\.$',
            #               sentence.untokenized_form):
            #     continue

            # prune identical and almost identical sentences
            if remove_redundancy:
                is_redundant = False
                for prev_sentence in retained_sentences:
                    if sentence.tokens == prev_sentence.tokens:
                        is_redundant = True
                        break

                if is_redundant:
                    continue

            # otherwise add the sentence to the pruned sentence container
            retained_sentences.append(sentence)

        # from all concepts that are going to be pruned, keep only those that also appear elsewhere
        retained_concepts = [
            concept for s in retained_sentences for concept in s.concepts
        ]

        for sentence in set(self.sentences).difference(retained_sentences):
            for concept in sentence.concepts:
                if concept not in retained_concepts \
                        and concept in self.weights:
                    del self.weights[concept]

        log.debug("keeping %s unique sentences of %s sentences" %
                  (len(retained_sentences), len(self.sentences)))
        self.sentences = retained_sentences

    def prune_concepts(self, method="threshold", value=3, rejected_list=None):
        """Prune the concepts for efficient summarization.

        Args:
            method (str): the pruning method: 'threshold' (keep concepts whose
              score reaches a minimal value), 'top-n' (keep the N highest
              scoring concepts), 'stopwords' (drop concepts rejected by
              prune_ngrams with the stoplist) or 'list' (drop concepts found
              in rejected_list), defaults to threshold.
            value (int): the value used for pruning concepts, defaults to 3.
            rejected_list (list): the concepts to drop when method is 'list',
              defaults to None.

        """
        if rejected_list is None:
            rejected_list = []

        if method == 'stopwords':
            concepts = list(self.weights.keys())
            for concept in concepts:
                pruned_list = prune_ngrams(concept, self.stoplist, 1)
                if not pruned_list:
                    #print concept, self.weights[concept]
                    del self.weights[concept]

        if method == "list":
            concepts = list(self.weights.keys())
            for concept in concepts:
                if concept in rejected_list:
                    #print concept, self.weights[concept]
                    del self.weights[concept]

        # 'threshold' pruning method
        if method == "threshold":

            # iterates over the concept weights
            concepts = list(self.weights.keys())
            for concept in concepts:
                if self.weights[concept] < value:
                    del self.weights[concept]

        # 'top-n' pruning method
        elif method == "top-n":

            # sort concepts by scores
            sorted_concepts = sorted(self.weights,
                                     key=lambda x: self.weights[x],
                                     reverse=True)

            # iterates over the concept weights
            concepts = list(self.weights.keys())
            for concept in concepts:
                if concept not in sorted_concepts[:value]:
                    del self.weights[concept]

        # iterates over the sentences
        for i in range(len(self.sentences)):

            # current sentence concepts
            concepts = self.sentences[i].concepts

            # prune concepts
            self.sentences[i].concepts = [
                c for c in concepts if c in self.weights
            ]

    def compute_c2s(self):
        """Compute the inverted concept to sentences dictionary. """

        for i, sentence in enumerate(self.sentences):
            for concept in sentence.concepts:
                self.c2s[concept].add(i)

    def compute_concept_sets(self):
        """Compute the concept sets for each sentence."""

        for i, sentence in enumerate(self.sentences):
            for concept in sentence.concepts:
                self.concept_sets[i] |= {concept}

    def greedy_approximation(self, summary_size=100):
        """Greedy approximation of the ILP model.

        Args:
            summary_size (int): the maximum size in words of the summary,
              defaults to 100.

        Returns:
            (value, set) tuple (int, list): the value of the approximated
              objective function and the set of selected sentences as a tuple.

        """
        # initialize the inverted c2s dictionary if not already created
        if not self.c2s:
            self.compute_c2s()

        # initialize weights
        weights = {}

        # initialize the score of the best singleton
        best_singleton_score = 0

        # compute indices of our sentences
        sentences = range(len(self.sentences))

        # compute initial weights and fill the reverse index
        # while keeping track of the best singleton solution
        for i, sentence in enumerate(self.sentences):
            weights[i] = sum(self.weights[c] for c in set(sentence.concepts))
            if sentence.length <= summary_size\
               and weights[i] > best_singleton_score:
                best_singleton_score = weights[i]
                best_singleton = i

        # initialize the selected solution properties
        sel_subset, sel_concepts, sel_length, sel_score = set(), set(), 0, 0

        # greedily select a sentence
        while True:

            ###################################################################
            # RETRIEVE THE BEST SENTENCE
            ###################################################################

            # sort the sentences by gain and reverse length
            sort_sent = sorted(((weights[i] / float(self.sentences[i].length),
                                 -self.sentences[i].length, i)
                                for i in sentences),
                               reverse=True)

            # select the first sentence that fits in the length limit
            for sentence_gain, rev_length, sentence_index in sort_sent:
                if sel_length - rev_length <= summary_size:
                    break
            # if we don't find a sentence, break out of the main while loop
            else:
                break

            # if the gain is null, break out of the main while loop
            if not weights[sentence_index]:
                break

            # update the selected subset properties
            sel_subset.add(sentence_index)
            sel_score += weights[sentence_index]
            sel_length -= rev_length

            # update sentence weights with the reverse index
            for concept in set(self.sentences[sentence_index].concepts):
                if concept not in sel_concepts:
                    for sentence in self.c2s[concept]:
                        weights[sentence] -= self.weights[concept]

            # update the last selected subset property
            sel_concepts.update(self.sentences[sentence_index].concepts)

        # check if a singleton has a better score than our greedy solution
        if best_singleton_score > sel_score:
            return best_singleton_score, set([best_singleton])

        # returns the (objective function value, solution) tuple
        return sel_score, sel_subset
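
    # A tiny worked illustration of the gain-ratio selection above, with
    # hypothetical numbers (not taken from any dataset): three sentences with
    # concept-weight sums 6, 5 and 4 and lengths 12, 5 and 10 have gains
    # 6/12=0.50, 5/5=1.00 and 4/10=0.40, so the second sentence is selected
    # first. The weights of its concepts are then subtracted, through the c2s
    # reverse index, from every other sentence that shares them, so fully
    # redundant sentences drop to a zero gain. Selection stops when no
    # remaining sentence fits the budget or the best remaining gain is zero,
    # and the best single sentence is returned instead if it scores higher
    # than the greedy subset.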

    def tabu_search(self,
                    summary_size=100,
                    memory_size=10,
                    iterations=100,
                    mutation_size=2,
                    mutation_group=True):
        """Greedy approximation of the ILP model with a tabu search
          meta-heuristic.

        Args:
            summary_size (int): the maximum size in words of the summary,
              defaults to 100.
            memory_size (int): the maximum size of the pool of sentences
              to ban at a given time, defaults to 10.
            iterations (int): the number of iterations to run, defaults to
              100.
            mutation_size (int): number of sentences to unselect and add to
              the tabu list at each iteration.
            mutation_group (boolean): flag to consider the mutations as a
              group: we'll check sentence combinations in the tabu list, not
              sentences alone.
        Returns:
            (value, set) tuple (int, list): the value of the approximated
              objective function and the set of selected sentences as a tuple.

        """
        # compute concept to sentences and concept sets for each sentence
        if not self.c2s:
            self.compute_c2s()
        if not self.concept_sets:
            self.compute_concept_sets()

        # initialize weights
        weights = {}

        # initialize the score of the best singleton
        best_singleton_score = 0

        # compute initial weights and fill the reverse index
        # while keeping track of the best singleton solution
        for i, sentence in enumerate(self.sentences):
            weights[i] = sum(self.weights[c] for c in set(sentence.concepts))
            if sentence.length <= summary_size\
               and weights[i] > best_singleton_score:
                best_singleton_score = weights[i]
                best_singleton = i

        best_subset, best_score = None, 0
        state = State()
        for i in range(iterations):
            queue = deque([], memory_size)
            # greedily select sentences
            state = self.select_sentences(summary_size, weights, state, queue,
                                          mutation_group)
            if state.score > best_score:
                best_subset = state.subset.copy()
                best_score = state.score
            to_tabu = set(random.sample(list(state.subset), mutation_size))
            state = self.unselect_sentences(weights, state, to_tabu)
            # ban the removed sentences as one combination when mutations are
            # treated as a group, otherwise ban them individually
            if mutation_group:
                queue.append(to_tabu)
            else:
                queue.extend(to_tabu)

        # check if a singleton has a better score than our greedy solution
        if best_singleton_score > best_score:
            return best_singleton_score, set([best_singleton])

        # returns the (objective function value, solution) tuple
        return best_score, best_subset

    def select_sentences(self, summary_size, weights, state, tabu_set,
                         mutation_group):
        """Greedy sentence selector.

        Args:
            summary_size (int): the maximum size in words of the summary,
              defaults to 100.
            weights (dictionary): the sentence weights dictionary. This
              dictionary is updated during this method call (in-place).
            state (State): the state of the tabu search from which to start
              selecting sentences.
            tabu_set (iterable): set of sentences that are tabu: this
              selector will not consider them.
            mutation_group (boolean): flag to consider the mutations as a
              group: we'll check sentence combinations in the tabu list, not
              sentences alone.

        Returns:
            state (State): the new state of the search. Also note that
              weights is modified in-place.

        """
        # greedily select a sentence while respecting the tabu
        while True:

            ###################################################################
            # RETRIEVE THE BEST SENTENCE
            ###################################################################

            # sort the sentences by gain and reverse length
            sort_sent = sorted(
                ((weights[i] / float(self.sentences[i].length),
                  -self.sentences[i].length, i)
                 for i in range(len(self.sentences))
                 if self.sentences[i].length + state.length <= summary_size),
                reverse=True)

            # select the first sentence that fits in the length limit
            for sentence_gain, rev_length, sentence_index in sort_sent:
                if mutation_group:
                    subset = state.subset | {sentence_index}
                    for tabu in tabu_set:
                        if tabu <= subset:
                            break
                    else:
                        break
                else:
                    if sentence_index not in tabu_set:
                        break
            # if we don't find a sentence, break out of the main while loop
            else:
                break

            # if the gain is null, break out of the main while loop
            if not weights[sentence_index]:
                break

            # update state
            state.subset |= {sentence_index}
            state.concepts.update(self.concept_sets[sentence_index])
            state.length -= rev_length
            state.score += weights[sentence_index]

            # update sentence weights with the reverse index
            for concept in set(self.concept_sets[sentence_index]):
                if state.concepts[concept] == 1:
                    for sentence in self.c2s[concept]:
                        weights[sentence] -= self.weights[concept]
        return state

    def unselect_sentences(self, weights, state, to_remove):
        """Sentence ``un-selector'' (reverse operation of the
          select_sentences method).

        Args:
            weights (dictionary): the sentence weights dictionary. This
              dictionary is updated during this method call (in-place).
            state (State): the state of the tabu search from which to start
              un-selecting sentences.
            to_remove (iterable): set of sentences to unselect.

        Returns:
            state (State): the new state of the search. Also note that
              weights is modified in-place.

        """
        # remove the sentence indices from the solution subset
        state.subset -= to_remove
        for sentence_index in to_remove:
            # update state
            state.concepts.subtract(self.concept_sets[sentence_index])
            state.length -= self.sentences[sentence_index].length
            # update sentence weights with the reverse index
            for concept in set(self.concept_sets[sentence_index]):
                if not state.concepts[concept]:
                    for sentence in self.c2s[concept]:
                        weights[sentence] += self.weights[concept]
            state.score -= weights[sentence_index]
        return state

    def solve_ilp_problem(self,
                          summary_size=100,
                          units="WORDS",
                          solver='glpk',
                          excluded_solutions=None,
                          unique=False):
        """Solve the ILP formulation of the concept-based model.

            :param summary_size: the maximum size in words of the summary, defaults to 100.
            :param units: defaults to "WORDS"
            :param solver: the solver used, defaults to glpk
            :param excluded_solutions: (list of list): a list of subsets of sentences that are to be excluded,
                defaults to []
            :param unique: (bool): modify the model so that it produces only one optimal solution, defaults to False

            :return: (value, set) tuple (int, list): the value of the objective function
                and the set of selected sentences as a tuple.

        """

        if excluded_solutions is None:
            excluded_solutions = []
        # initialize container shortcuts
        concepts = self.weights.keys()

        w = self.weights
        L = summary_size
        C = len(concepts)
        S = len(self.sentences)

        if not self.word_frequencies:
            self.compute_word_frequency()

        tokens = list(self.word_frequencies.keys())
        f = self.word_frequencies
        T = len(tokens)

        # HACK Sort keys
        concepts = sorted(self.weights, key=self.weights.get, reverse=True)

        # formulation of the ILP problem
        prob = pulp.LpProblem(self.input_directory, pulp.LpMaximize)

        # initialize the concepts binary variables
        c = pulp.LpVariable.dicts(name='c',
                                  indexs=range(C),
                                  lowBound=0,
                                  upBound=1,
                                  cat='Integer')

        # initialize the sentences binary variables
        s = pulp.LpVariable.dicts(name='s',
                                  indexs=range(S),
                                  lowBound=0,
                                  upBound=1,
                                  cat='Integer')

        # initialize the word binary variables
        t = pulp.LpVariable.dicts(name='t',
                                  indexs=range(T),
                                  lowBound=0,
                                  upBound=1,
                                  cat='Integer')

        # OBJECTIVE FUNCTION
        prob += pulp.lpSum(w[concepts[i]] * c[i] for i in range(C))

        if unique:
            prob += pulp.lpSum(w[concepts[i]] * c[i] for i in range(C)) + \
                    10e-6 * pulp.lpSum(f[tokens[k]] * t[k] for k in range(T))

        # CONSTRAINT FOR SUMMARY SIZE
        if units == "WORDS":
            prob += pulp.lpSum(s[j] * self.sentences[j].length
                               for j in range(S)) <= L
        if units == "CHARACTERS":
            prob += pulp.lpSum(s[j] * len(self.sentences[j].untokenized_form)
                               for j in range(S)) <= L

        # INTEGRITY CONSTRAINTS
        for i in range(C):
            for j in range(S):
                if concepts[i] in self.sentences[j].concepts:
                    prob += s[j] <= c[i]

        for i in range(C):
            prob += pulp.lpSum(
                s[j] for j in range(S)
                if concepts[i] in self.sentences[j].concepts) >= c[i]

        # WORD INTEGRITY CONSTRAINTS
        if unique:
            for k in range(T):
                for j in self.w2s[tokens[k]]:
                    prob += s[j] <= t[k]

            for k in range(T):
                prob += pulp.lpSum(s[j] for j in self.w2s[tokens[k]]) >= t[k]

        # CONSTRAINTS FOR FINDING OPTIMAL SOLUTIONS
        for sentence_set in excluded_solutions:
            prob += pulp.lpSum([s[j] for j in sentence_set
                                ]) <= len(sentence_set) - 1

        # prob.writeLP('test.lp')

        # solving the ilp problem
        try:
            print('BASEILP with CPLEX')
            prob.solve(pulp.CPLEX(msg=0))
        except Exception:
            #print('BASEILP fallback to %s' % (solver))
            if solver == 'gurobi':
                prob.solve(pulp.GUROBI(msg=0))
            elif solver == 'glpk':
                print('BASEILP with GLPK')
                prob.solve(pulp.GLPK(msg=0))
            else:
                sys.exit('no solver specified')

        # retrieve the optimal subset of sentences
        solution = set([j for j in range(S) if s[j].varValue == 1])

        # returns the (objective function value, solution) tuple
        return (pulp.value(prob.objective), solution)
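
A minimal driver sketch for the concept-based summarizer defined above. The
class name, its constructor signature and the name of the n-gram extraction
step are not visible in this excerpt, so they are assumptions (marked below);
the remaining calls use the methods exactly as defined in this example.

# hypothetical driver: ConceptBasedILPSummarizer, its constructor argument and
# the extract_ngrams() step are assumed, everything else is defined above
summarizer = ConceptBasedILPSummarizer('/path/to/documents')
summarizer.extract_ngrams()                   # concept extraction (assumed name)
summarizer.compute_document_frequency()
summarizer.prune_sentences(mininum_sentence_length=10)
summarizer.prune_concepts(method='threshold', value=3)

# exact solution with an ILP solver, or the greedy fallback without one
value, subset = summarizer.solve_ilp_problem(summary_size=100, solver='glpk')
# value, subset = summarizer.greedy_approximation(summary_size=100)

summary = '\n'.join(summarizer.sentences[j].untokenized_form
                    for j in sorted(subset))
print(summary)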
Code example #13
0
File: preprocess.py Project: mlej8/COMP551
import re
import string

import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder


class Preprocessor:
    def __init__(self, normalizer):
        self.label_encoder = LabelEncoder()
        self.tf_idf_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            stop_words=stopwords.words('english') + [
                "nt", "get", "like", "would", "peopl", "one", "think", "time",
                "becaus"
            ],
            smooth_idf=True,
            norm="l2",
            lowercase=True,
            max_features=30000,
            use_idf=True,
            encoding="utf-8",
            decode_error='ignore',
            strip_accents='unicode',
            analyzer="word")
        if normalizer == "stemmer":
            self.normalizer = SnowballStemmer("english")
        elif normalizer == "lemmatizer":
            self.normalizer = WordNetLemmatizer()
        else:
            raise Exception(
                "Normalizer must be \"stemmer\" or \"lemmatizer\".")

    def preprocess_reddit_train(self):
        """ Stores a cleaned up version of the dataset on the current directory """
        # Read dataset
        df = pd.read_csv("data/reddit_train.csv")
        # Apply stemming function
        df["cleaned"] = df["comments"].apply(self.clean_text)
        # Transform each subreddit into an unique integer
        df["label"] = self.label_encoder.fit_transform(df["subreddits"])
        # Save cleaned dataset
        df.to_csv("data/preprocessed_reddit_train_" +
                  type(self.normalizer).__name__ + ".csv",
                  index=False)
        # TODO: Implement Regularization (i.e. PCA, SVD, L1, L2...?)

    def clean_text(self, sentence):
        # Put all words to lower case
        sentence = sentence.lower()
        # Tokenize words
        word_tokens = word_tokenize(sentence)
        # Remove punctuation
        word_tokens = [_ for _ in word_tokens if _ not in string.punctuation]
        # Remove non-alphabetical char
        word_tokens = [
            re.sub(pattern=r"[^a-zA-Z0-9\s]", repl="", string=_)
            for _ in word_tokens
        ]
        # Remove empty strings
        word_tokens = [_ for _ in word_tokens if _]
        # Stem words
        processed_sentence = self.normalize(" ".join(word_tokens))
        # TODO: Remove links?
        return processed_sentence.strip()

    def normalize(self, sentence):
        normalized_str = []
        word_tokens = word_tokenize(sentence)
        if type(self.normalizer).__name__ == "SnowballStemmer":
            for i in word_tokens:
                normalized_str.append(self.normalizer.stem(i))
        elif type(self.normalizer).__name__ == "WordNetLemmatizer":
            for i in word_tokens:
                normalized_str.append(self.normalizer.lemmatize(i))
        else:
            raise Exception(
                "Normalizer must be \"stemmer\" or \"lemmatizer\".")

        return " ".join(normalized_str)

    def preprocess_reddit_test(self):
        """ Returns a cleaned up version of the test dataset """
        # Read dataset
        df = pd.read_csv("data/reddit_test.csv")
        # Apply stemming function
        df["cleaned"] = df["comments"].apply(self.clean_text)
        # Store cleaned test set in the data directory
        df.to_csv("data/preprocessed_reddit_test_" +
                  type(self.normalizer).__name__ + ".csv",
                  index=False)
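
A short usage sketch for the Preprocessor above. It assumes the NLTK resources
it relies on (punkt, stopwords, wordnet) have been downloaded and, for the two
preprocess_* calls, that the data/reddit_train.csv and data/reddit_test.csv
files referenced in the class are present.

# clean a single comment without touching any file
pre = Preprocessor("lemmatizer")
print(pre.clean_text("I would think these people get it, like, most of the time!"))

# write data/preprocessed_reddit_train_WordNetLemmatizer.csv and
# data/preprocessed_reddit_test_WordNetLemmatizer.csv
pre.preprocess_reddit_train()
pre.preprocess_reddit_test()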
Code example #14
0
File: test.py Project: wushengcq/lsa
from nltk.stem import PorterStemmer, WordNetLemmatizer


def stem_test():
    stemmer = WordNetLemmatizer()
    print(stemmer.lemmatize("environment"))
    print(stemmer.lemmatize("environmental"))
    stemmer = PorterStemmer()
    print(stemmer.stem("environmental"))
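
A hedged extension of the test above (not taken from the wushengcq/lsa
project): WordNetLemmatizer.lemmatize accepts an optional part-of-speech
argument that often changes the result, while the Porter stemmer applies the
same suffix rules regardless of part of speech.

from nltk.stem import PorterStemmer, WordNetLemmatizer


def pos_test():
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    for word in ("running", "environments", "environmental"):
        print(word,
              lemmatizer.lemmatize(word),           # pos defaults to 'n'
              lemmatizer.lemmatize(word, pos='v'),  # treat the word as a verb
              stemmer.stem(word))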