Example #1
def demo_similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.
        
        @param word: The word used to seed the similarity search
        @type word: C{str} 
        @param num: The number of words to generate (default=20)
        @type num: C{int}
        @seealso: L{ContextIndex.similar_words()}
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.text.ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        while 1:
          word = raw_input('Enter a Chinese word such as "開心"(type 0 to exit):')
          print "word='"+ word + "'"
          if word == '0': break
          word = word.decode('utf-8')
          wci = self._word_context_index._word_to_contexts
          if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            print tokenwrap(words)
          else:
            print "No matches"
Example #2
def demo_collocations(self, num=40, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        @seealso: L{find_collocations}
        @param num: The maximum number of collocations to print.
        @type num: C{int}
        @param window_size: The number of tokens spanned by a collocation (default=2)
        @type window_size: C{int}
        """
        if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size):
            self._num = num
            self._window_size = window_size
            print "Building collocations list"
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            from nltk.collocations import BigramCollocationFinder
            finder = BigramCollocationFinder.from_words(self.tokens, window_size) 
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            from nltk.metrics import f_measure, BigramAssocMeasures
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1+u' '+w2 for w1, w2 in self._collocations]
        print "List {0} collocations".format(num)
        print tokenwrap(colloc_strings, separator=u'; ')
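For reference, a stand-alone sketch of the same collocation pipeline outside the class; it assumes the NLTK stopwords corpus has been downloaded, and my_tokens / some_text.txt are illustrative placeholders:

from nltk.corpus import stopwords
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.util import tokenwrap

my_tokens = open('some_text.txt').read().split()   # illustrative token source
ignored_words = set(stopwords.words('english'))

finder = BigramCollocationFinder.from_words(my_tokens, window_size=2)
finder.apply_freq_filter(2)                        # drop bigrams seen only once
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)

best = finder.nbest(BigramAssocMeasures().likelihood_ratio, 20)
print(tokenwrap(w1 + ' ' + w2 for w1, w2 in best))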
Example #3
def demo_common_context(self, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.
        @seealso: L{ContextIndex.common_contexts()}
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.text.ContextIndex(self.tokens,
                                                    key=lambda s:s.lower())
        while 1:
          inp = raw_input('Enter two Chinese words such as "我 你"(type 0 to exit):'); 
          print "inp='"+ inp+"'"
          if inp == '0': break
          inp = inp.decode('utf-8')
          words = inp.split(u' ')
          try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print "No common contexts were found"
            else:
                ranked_contexts = fd.keys()[:num]
                print tokenwrap(w1+"_"+w2 for w1,w2 in ranked_contexts)
          except ValueError, e:
            print e
    def sandwich(cls, word):
        """
        """
        ind = cls.corpora_health.index(max(cls.corpora_health))
        results = cls.corpora[ind].sandwich(word)
#        results = [corpus.sandwich(word) for corpus in cls.corpora]
        return tokenwrap(results)
Example #5
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not ('_collocations' in self.__dict__ and self._num == num
                and self._window_size == window_size):
            self._num = num
            self._window_size = window_size

            #print("Building collocations list")
            from nltk.corpus import stopwords
            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(
                self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(
                lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio,
                                              num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Example #6
    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words whose common contexts are sought
        :type words: list(str)
        :param num: The maximum number of common contexts to print (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, key=lambda s: s.lower()
            )

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)
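A minimal usage sketch of the underlying index call; the sample sentence and word pair are illustrative:

from nltk.text import ContextIndex
from nltk.util import tokenwrap

tokens = "he made a strong argument and she made a weak argument in reply".split()
index = ContextIndex(tokens, key=lambda s: s.lower())

fd = index.common_contexts(["strong", "weak"], True)
if fd:
    ranked = [c for c, _ in fd.most_common(20)]
    print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked))   # e.g. a_argument
else:
    print("No common contexts were found")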
Example #7
    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))
Example #8
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :seealso: find_collocations
        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """
        if not (
            '_collocations' in self.__dict__
            and self._num == num
            and self._window_size == window_size
        ):
            self._num = num
            self._window_size = window_size

            # print("Building collocations list")
            from nltk.corpus import stopwords

            ignored_words = stopwords.words('english')
            finder = BigramCollocationFinder.from_words(self.tokens, window_size)
            finder.apply_freq_filter(2)
            finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
            bigram_measures = BigramAssocMeasures()
            self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num)
        colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations]
        print(tokenwrap(colloc_strings, separator="; "))
Example #9
    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            # print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens, filter=lambda x: x.isalpha(), key=lambda s: s.lower()
            )

        # words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(
                w
                for w in wci.conditions()
                for c in wci[w]
                if c in contexts and not w == word
            )
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")
Example #10
    def findall(self, regexp):
        """
        Find instances of the regular expression in the text.
        The text is a list of tokens, and a regexp pattern to match
        a single token must be surrounded by angle brackets.  E.g.

        >>> print('hack'); from nltk.book import text1, text5, text9
        hack...
        >>> text5.findall("<.*><.*><bro>")
        you rule bro; telling you bro; u twizted bro
        >>> text1.findall("<a>(<.*>)<man>")
        monied; nervous; dangerous; white; white; white; pious; queer; good;
        mature; white; Cape; great; wise; wise; butterless; white; fiendish;
        pale; furious; better; certain; complete; dismasted; younger; brave;
        brave; brave; brave
        >>> text9.findall("<th.*>{3,}")
        thread through those; the thought that; that the thing; the thing
        that; that that thing; through these than through; them that the;
        through the thick; them that they; thought that the

        :param regexp: A regular expression
        :type regexp: str
        """

        if "_token_searcher" not in self.__dict__:
            self._token_searcher = TokenSearcher(self)

        hits = self._token_searcher.findall(regexp)
        hits = [' '.join(h) for h in hits]
        print(tokenwrap(hits, "; "))
Example #11
    def common_contexts(self, words, num=20):
        """
        Find contexts where the specified words appear; list
        most frequent common contexts first.

        :param words: The words whose common contexts are sought
        :type words: list(str)
        :param num: The maximum number of common contexts to print (default=20)
        :type num: int
        :seealso: ContextIndex.common_contexts()
        """
        if '_word_context_index' not in self.__dict__:
            #print('Building word-context index...')
            self._word_context_index = ContextIndex(self.tokens,
                                                    key=lambda s: s.lower())

        try:
            fd = self._word_context_index.common_contexts(words, True)
            if not fd:
                print("No common contexts were found")
            else:
                ranked_contexts = [w for w, _ in fd.most_common(num)]
                print(tokenwrap(w1 + "_" + w2 for w1, w2 in ranked_contexts))

        except ValueError as e:
            print(e)
Example #12
    def similar(self, word, num=20):
        """
        Distributional similarity: find other words which appear in the
        same contexts as the specified word; list most similar words first.

        :param word: The word used to seed the similarity search
        :type word: str
        :param num: The number of words to generate (default=20)
        :type num: int
        :seealso: ContextIndex.similar_words()
        """
        if '_word_context_index' not in self.__dict__:
            #print('Building word-context index...')
            self._word_context_index = ContextIndex(
                self.tokens,
                filter=lambda x: x.isalpha(),
                key=lambda s: s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = Counter(w for w in wci.conditions() for c in wci[w]
                         if c in contexts and not w == word)
            words = [w for w, _ in fd.most_common(num)]
            print(tokenwrap(words))
        else:
            print("No matches")
Example #13
def gen(context='', hashtag='', tries=30):
    tokens = nltk.word_tokenize(corpus)
    text = nltk.Text(tokens)
    text.generate(0) #generate model

    n = 10
    r = tokenwrap(text._trigram_model.generate(n, context))
    return r[:140-len(hashtag)]+' '+hashtag
def main():
    # Parse Book into Array
    parsed_book = open(str(sys.argv[1])).read().split()

    # Default Values and Parsing Input Values
    # Graph Values
    num_points = 30
    if "--numPoints" in sys.argv:
        num_points = sys.argv[sys.argv.index("--numPoints") + 1]

    y_label = "Frequencies"
    line_width = 3

    title = "Top " + str(num_points) + " Useful Words For " + str(sys.argv[1])
    if "--title" in sys.argv:
        title = sys.argv[sys.argv.index("--title") + 1]

    if "--yLabel" in sys.argv:
        y_label = sys.argv[sys.argv.index("--yLabel") + 1]

    if "--lineWidth" in sys.argv:
        line_width = sys.argv[sys.argv.index("--lineWidth") + 1]

    # Stop Words Values
    blacklist = []
    if "--blacklist" in sys.argv:
        blacklist = sys.argv[sys.argv.index("--blacklist") + 1].replace(
            " ", "").split(',')

    # Collocations Values
    num_collocations = 20
    if "--numCollocations" in sys.argv:
        num_collocations = sys.argv[sys.argv.index("--numCollocations") + 1]

    window_size = 4
    if "--windowSize" in sys.argv:
        window_size = sys.argv[sys.argv.index("--windowSize") + 1]

    # Collocations
    # Paper Explaining The Math
    # https://nlp.stanford.edu/fsnlp/promo/colloc.pdf
    print("Words Commonly Used Together:")
    print(
        tokenwrap(find_collocations(parsed_book,
                                    blacklist,
                                    num=int(num_collocations),
                                    window_size=int(window_size)),
                  separator=" ; "))

    # Filter out Stop Words
    filtered_freq_dist = word_filter(FreqDist(parsed_book), blacklist)

    # Plot
    plot_most_common(filtered_freq_dist, int(num_points), title, y_label,
                     int(line_width))
def preprocessing(comment):

    """
    Function to clean the comment. Lower all words and remove stop words
    """

    words = nltk.word_tokenize(comment)
    clean_words = [word.lower() for word in words if word.lower() not in stopwords.words('danish')]
    cleaned_comment = tokenwrap(clean_words)

    return cleaned_comment
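An illustrative call; it assumes the NLTK punkt and stopwords data are available, and the Danish sample comment is made up:

comment = "Det er en rigtig god film"   # "It is a really good film"
print(preprocessing(comment))           # stopwords removed, e.g. "rigtig god film"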
Example #16
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """

        collocation_strings = [w1 + ' ' + w2 for w1, w2 in self.collocation_list(num, window_size)]
        print(tokenwrap(collocation_strings, separator="; "))
Example #17
def demo_findall(text):
  while 1:
    inp = raw_input('Enter two Chinese words such as "我:2 手:4"(type 0 to exit):'); 
    print "inp='"+ inp+"'"
    if inp == '0': break
    inp = inp.decode('big5')
    reg = "<1> <2> <3> <4> <5>"
    if len(inp) == 0:
      print 'no input words'
    else:
      for wp in inp.split(' '):	
        (w, p) = wp.split(':')
  #        reg = re.sub(p, w, reg)
        reg = re.sub(p, ''.join(['.*', w, '.*']), reg)
    reg = re.sub('\d', '.*', reg)
    print "reg=", reg
#    text.findall(reg)
    if "_token_searcher" not in text.__dict__:
      text._token_searcher = nltk.text.TokenSearcher(text)
    hits = text._token_searcher.findall(reg)
    hits = [' '.join(h) for h in hits]
    print tokenwrap(hits, u"; ") 
Example #18
    def generate(self, length=100, context=()):
        """
        Return random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print "Building ngram index..."
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length, context=context)
        return tokenwrap(text)
Example #19
    def generate(self, length=100):
        """
        Print random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print("Building ngram index...")
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length)
        print(tokenwrap(text))
Example #20
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """

        collocation_strings = [
            w1 + " " + w2
            for w1, w2 in self.collocation_list(num, window_size)
        ]
        print(tokenwrap(collocation_strings, separator="; "))
    def synonyms(word):
        ## todo: this should move because we want to cache the results so we can calculate health!!
        results = []
        for synset in wn.synsets(word):
            results.extend(synset.lemma_names)

        result_set = set(results)        
        if word in result_set:
            result_set.remove(word)

        ### todo: stopped here... should filter these down to some reasonable thing
        ############ todo:check if the above needs to be cached somewhere (maybe it is cached by wn.synsets?)
        results = list(result_set)
        results = results[:MAX_SYNONYMS_TO_RETURN]

        return tokenwrap(results)
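A stand-alone sketch of the same WordNet lookup; it assumes the WordNet data has been downloaded, and note that in NLTK 3.x lemma_names is a method and needs to be called:

from nltk.corpus import wordnet as wn
from nltk.util import tokenwrap

word = "happy"                           # illustrative query word
names = {name for synset in wn.synsets(word) for name in synset.lemma_names()}
names.discard(word)
print(tokenwrap(sorted(names)))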
Example #22
    def generate(self, length=100, text_seed=None, random_seed=42):
        """
        Print random text, generated using a trigram language model.
        See also `help(nltk.lm)`.

        :param length: The length of text to generate (default=100)
        :type length: int

        :param text_seed: Generation can be conditioned on preceding context.
        :type text_seed: list(str)

        :param random_seed: A random seed or an instance of `random.Random`. If provided,
        makes the random sampling part of generation reproducible. (default=42)
        :type random_seed: int

        """
        # Create the model when using it the first time.
        self._tokenized_sents = [
            sent.split(" ") for sent in sent_tokenize(" ".join(self.tokens))
        ]
        if not hasattr(self, "_trigram_model"):
            print("Building ngram index...", file=sys.stderr)
            self._trigram_model = self._train_default_ngram_lm(
                self._tokenized_sents, n=3
            )

        generated_tokens = []

        assert length > 0, "The `length` must be more than 0."
        while len(generated_tokens) < length:
            for idx, token in enumerate(
                self._trigram_model.generate(
                    length, text_seed=text_seed, random_seed=random_seed
                )
            ):
                if token == "<s>":
                    continue
                if token == "</s>":
                    break
                generated_tokens.append(token)
            random_seed += 1

        prefix = " ".join(text_seed) + " " if text_seed else ""
        output_str = prefix + tokenwrap(generated_tokens[:length])
        print(output_str)
        return output_str
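A hypothetical usage sketch against this implementation; it assumes a recent NLTK where Text.generate is backed by nltk.lm and the punkt tokenizer data is available, and the toy corpus is far too small to produce interesting text:

import nltk

# Illustrative toy corpus; any list of word tokens works here.
tokens = ("the cat sat on the mat . the dog sat on the log . "
          "the cat saw the dog .").split()
text = nltk.Text(tokens)
text.generate(length=15, random_seed=7)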
Example #23
    def collocations(self, num=20, window_size=2):
        """
        Print collocations derived from the text, ignoring stopwords.
        
            >>> from nltk.book import text4
            >>> text4.collocations() # doctest: +ELLIPSIS
            United States; fellow citizens; four years; ...

        :param num: The maximum number of collocations to print.
        :type num: int
        :param window_size: The number of tokens spanned by a collocation (default=2)
        :type window_size: int
        """

        collocation_strings = [
            w1 + " " + w2 for w1, w2 in self.collocation_list(num, window_size)
        ]
        print(tokenwrap(collocation_strings, separator="; "))
Example #24
    def similar(self, word, num=20):
        """
        Return the most similar words as a single string.
        """
        if '_word_context_index' not in self.__dict__:
            print 'Building word-context index...'
            self._word_context_index = nltk.ContextIndex(self.tokens,
                                                    filter=lambda x:x.isalpha(),
                                                    key=lambda s:s.lower())

#        words = self._word_context_index.similar_words(word, num)

        word = word.lower()
        wci = self._word_context_index._word_to_contexts
        if word in wci.conditions():
            contexts = set(wci[word])
            fd = FreqDist(w for w in wci.conditions() for c in wci[w]
                          if c in contexts and not w == word)
            words = fd.keys()[:num]
            return tokenwrap(words)
        else:
            print "No matches"
# NLTK processing
words = [w for t in status_texts for w in t.split()]

nltk_text = nltk.Text(words)
nltk_text.collocations()
ignored_words = stopwords.words('english')
finder = BigramCollocationFinder.from_words(words, 2)
finder.apply_freq_filter(2)
finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words)
bigram_measures = nltk.collocations.BigramAssocMeasures()
collocations = finder.nbest(bigram_measures.likelihood_ratio, 20)
colloc_strings = [w1+' '+w2 for w1, w2 in collocations]
#finder = BigramCollocationFinder(word_fd, bigram_fd)
print tokenwrap(colloc_strings, separator="; ")




#create unstylized HTML
summarizedLinks = Counter(urls)

html_file = open('{0}_{1}_statuses.html'.format(data_file, file_time), 'w')
html_file.write('<!DOCTYPE html><html><head></head><body><h1>Analysis of past tweets: "{0}"</h1><h2>{1}</h2>'.format(q, now_time.strftime(fmt) ))
html_file.write('<br /><br /><h2>Collocations of commonly occurring pairs of words</h2>')
html_file.write('<ul>')
for collocation in colloc_strings:
    html_file.write('<li>{0}</li>'.format(collocation))
html_file.write('</ul>')
html_file.write('<h2>Most common referenced URLs, unshortened and sorted</h2>')
Example #26
def wrap(iterable):
    return tokenwrap(iterable)
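Since every example above funnels its output through tokenwrap, here is a minimal sketch of the helper itself: it joins the tokens with a separator and wraps the joined string to roughly 70 columns. The sample tokens are illustrative:

from nltk.util import tokenwrap

words = ["monied", "nervous", "dangerous", "white", "pious", "queer"] * 4
print(tokenwrap(words))                  # space-separated, wrapped into lines
print(tokenwrap(words, separator="; "))  # custom separator, as in the collocation examples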
Example #27
    def sandwich(cls, word):
        """
        """
        results = [corpus.sandwich(word) for corpus in cls.corpora]
        return tokenwrap(results)