Python LanguageUtils.tokenize Examples

Programming Language: Python

Class/Type: LanguageUtils

Method/Function: tokenize

Examples at hotexamples.com: 2

Python LanguageUtils.tokenize - 2 examples found. These are the top-rated real-world Python examples of LanguageUtils.tokenize extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

tokenize(2)

punkt_tokenize(1)

Frequently Used Methods

tokenize (2)

punkt_tokenize (1)

Example #1

Show file

File: DataSet.py Project: dgboy2000/cs224u

    def getAllBoW(self):
        """Return unigram, bigram words as bag of words.

        The result is a list with one entry per item yielded by
        ``self.getRawText()``.  Each entry is the token list produced by
        ``LanguageUtils.tokenize`` for that item, followed by every bigram
        returned by the collocation finder (scored with PMI).

        The list is cached as a pickle under ``cache/``, keyed by the file
        name, essay set and domain.  A readable cache file is returned as-is;
        otherwise the bag of words is rebuilt and the cache is rewritten.
        """

        fname = 'cache/word_ngrams.%s.set%d.dom%d.pickle' % (
                 self.getFilename(), self.getEssaySet(), self.getDomain())

        # NOTE: unpickling is only safe for trusted files; this cache is
        # expected to have been written by this same method.
        try:
            with open(fname, 'rb') as f:
                return pickle.load(f)
        except Exception:
            # Best-effort cache: a missing, truncated or otherwise unreadable
            # file simply means the data is rebuilt below.  Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            pass

        bigram_measures = nltk.collocations.BigramAssocMeasures()

        bow = []
        for line in self.getRawText():
            cur = LanguageUtils.tokenize(line)

            # The finder is built from the unigrams only, before any bigram
            # is added to ``cur``.
            finder = BigramCollocationFinder.from_words(cur)
            scored = finder.score_ngrams(bigram_measures.pmi)
            cur.extend([bigram for bigram, _score in scored])

            bow.append(cur)

        # Pickle data is binary: write with 'wb' to match the 'rb' read above
        # (text mode fails on Python 3), and close the handle deterministically.
        with open(fname, 'wb') as f:
            pickle.dump(bow, f)

        return bow

Example #2

Show file

    def extractFeatures(self, ds, corpus):
        """Extracts features from a DataSet ds.

        Reads the prompt text for ``ds``'s essay set, turns it into unigram
        and bigram tokens, projects it through ``corpus``'s TF-IDF and LSA
        models, and compares it against ``ds``'s gensim corpus in that space.

        Args:
            ds: data set providing ``getEssaySet()`` and ``getGensimCorpus()``.
            corpus: provides ``getWordDictionary()``, ``getLSA()`` and
                ``getTfidf()``.

        Returns:
            None.  The result is stored in ``self.features`` as a NumPy array
            built from one single-element row per similarity value.
        """

        # load into memory the string from data/essay_set_desc_?.txt
        # (the ``with`` block closes the file, which was previously leaked)
        with open('data/essay_set_desc_%d.txt' % ds.getEssaySet(), 'r') as f:
            prompt = f.read()

        # tokenize into unigrams & bigrams
        tokens = LanguageUtils.tokenize(prompt)

        bigram_measures = nltk.collocations.BigramAssocMeasures()
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        tokens.extend([bigram for bigram, _score in scored])

        # Get feature bows for projection into LSI
        dictionary = corpus.getWordDictionary()

        # get lsi
        lsi = corpus.getLSA()
        tfidf = corpus.getTfidf()
        # Fetched once and reused below; previously it was fetched twice and
        # the first result was discarded.
        mm_corpus = ds.getGensimCorpus()

        # project into lsi space
        vec_bow = dictionary.doc2bow(tokens)
        vec_lsi = lsi[tfidf[vec_bow]]

        index = gensim.similarities.MatrixSimilarity(lsi[tfidf[mm_corpus]])

        # similarity of the prompt against each entry of the index
        sims = index[vec_lsi]

        # wrap every similarity in its own row
        self.features = np.asarray([[sim] for sim in sims])
        return