Python tokenize_document Examples, feature_extraction.tokenize_document Python Examples

Example #1

0

Show file

def train_word2vec(categories, comments, n_dim):
    """Fit a word2vec model on the comments and a linear classifier on its vectors.

    Each comment is tokenized with ``tokenize_document`` (default options),
    the tokenized comments are handed to ``word2vec_model`` together with
    ``n_dim``, and the resulting model is used by ``w2vectorize`` to turn the
    same tokenized comments into training vectors for an ``SGDClassifier``.

    ``w2vectorize`` is not imported here; it has to be available in the
    enclosing module.

    Args:
        categories: training labels passed as ``y`` to ``SGDClassifier.fit``.
        comments: iterable of raw comments.
        n_dim: dimensionality argument forwarded to ``word2vec_model`` and
            ``w2vectorize``.

    Returns:
        A ``(model, classifier)`` tuple: the word2vec model and the fitted
        classifier.
    """
    from feature_extraction import tokenize_document, word2vec_model
    from sklearn.linear_model import SGDClassifier

    tokenized = []
    for comment in comments:
        tokenized.append(tokenize_document(comment))

    embedding = word2vec_model(tokenized, n_dim)
    features = w2vectorize(tokenized, embedding, n_dim)

    # NOTE(review): recent scikit-learn releases spell this loss 'log_loss';
    # confirm the installed version still accepts 'log'.
    sgd = SGDClassifier(penalty='l1', loss='log')
    sgd.fit(features, categories)
    return embedding, sgd

Example #2

0

Show file

File: train.py Project: chrisdamba/trolling_detection

def train_word2vec(categories, comments, n_dim):
    """Train a word2vec embedding plus an L1-penalised SGD classifier.

    The comments are tokenized once (``tokenize_document`` with its default
    options); the token lists feed both ``word2vec_model`` and
    ``w2vectorize``, and the vectors produced by the latter are what the
    classifier is fitted on.

    ``w2vectorize`` is looked up in the enclosing module; it is not imported
    inside this function.

    Args:
        categories: labels, passed as the target to ``fit``.
        comments: iterable of raw comments to train on.
        n_dim: value forwarded unchanged to ``word2vec_model`` and
            ``w2vectorize``.

    Returns:
        Tuple ``(model, classifier)``.
    """
    from feature_extraction import tokenize_document, word2vec_model
    from sklearn.linear_model import SGDClassifier

    token_lists = [tokenize_document(text) for text in comments]
    w2v = word2vec_model(token_lists, n_dim)

    clf = SGDClassifier(loss='log', penalty='l1')
    clf.fit(w2vectorize(token_lists, w2v, n_dim), categories)
    return w2v, clf

Example #3

0

Show file

        # Pretty-print whatever model["sound"].samples() returns.
        # NOTE(review): `model` is assigned before this excerpt starts -
        # confirm what kind of object it is.
        print "\nSamples: "
        import pprint
        printer = pprint.PrettyPrinter(indent=4)
        printer.pprint(model["sound"].samples())

        print "\n"

        # `model` is rebound here: from this point on it is the object
        # returned by word2vec_model(comments), and the similarity of two
        # sample words is printed as a quick check.
        model = word2vec_model(comments)
        print model.similarity('retarded', 'loser')

    # Word2Vec experiment: runs only when the 'WordVec' boolean option in the
    # EXECUTION_SECTION of the config is true.
    if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
        from train import train_word2vec
        from train import w2vectorize
        from feature_extraction import tokenize_document
        # 500 is passed as n_dim both for training and for vectorizing the
        # test documents below; the two values have to stay in sync.
        model, classifier = train_word2vec(categories, comments, 500)
        # NOTE(review): the test comments are tokenized with
        # stopwords='english'; confirm train_word2vec tokenizes the training
        # comments with the same option.
        test_documents = [
            tokenize_document(document, stopwords='english')
            for document in test_comments
        ]
        test_vecs = w2vectorize(test_documents, model, 500)
        predictions = classifier.predict(test_vecs)
        print "\nWord2Vec Model Result\n"
        prediction_info(predictions, test_categories)

    # Feature-ensemble experiment: runs only when the 'Final' boolean option
    # in the EXECUTION_SECTION of the config is true.
    if config_parser.getboolean(EXECUTION_SECTION, 'Final'):
        from train import train_assembling
        classifier = train_assembling(categories, comments, badwords)
        # The classifier is given the raw test comments directly.
        predictions = classifier.predict(test_comments)
        print "\nFeature Ensemble Model Result\n"
        prediction_info(predictions, test_categories)

Example #4

0

Show file

def tokenize_collection(collection, lowercase=True, stopwords=True, min_length=3):
    """Tokenize every document of *collection* into one flat token list.

    Each document is passed to ``tokenize_document`` together with the
    ``lowercase``, ``stopwords`` and ``min_length`` options, and the tokens
    of all documents are concatenated in document order.

    Args:
        collection: iterable of documents.
        lowercase: forwarded to ``tokenize_document``.
        stopwords: forwarded to ``tokenize_document``.
        min_length: forwarded to ``tokenize_document``.

    Returns:
        A single list with the tokens of all documents; empty when the
        collection is empty.
    """
    flat_tokens = []
    for entry in collection:
        entry_tokens = tokenize_document(
            entry,
            lowercase=lowercase,
            stopwords=stopwords,
            min_length=min_length,
        )
        flat_tokens.extend(entry_tokens)
    return flat_tokens

Example #5

0

Show file

    def transform(self, documents):
        """Build a per-comment matrix of hand-crafted text features.

        For every comment string this computes simple counts (words,
        characters, dictionary words, matcher hits, ``!`` and ``@``
        characters, all-uppercase words, ``'fakeinsult'`` placeholders),
        three ratios of those counts to the word count, and an averaged
        positive sentiment score looked up in the SWN3 lexicon.

        Args:
            documents: sequence of comment strings.  It is iterated more
                than once, so it must not be a one-shot iterator.

        Returns:
            A 2-D numpy array with one row per comment and twelve columns,
            in this order: n_words, n_chars, n_dwords, n_you_re,
            exclamation, all_caps, addressing, bad_ratio, n_bad,
            allcaps_ratio, dic_ratio, sent_pos.
        """
        import enchant
        import sentlex
        from feature_extraction import tokenize_document

        d = enchant.Dict("en_US")
        swn = sentlex.SWN3Lexicon()
        # Tag prefix -> lexicon lookup.  The prefixes are mutually exclusive,
        # so at most one lookup runs per token.
        lookups = (
            ('RB', swn.getadverb),
            ('NN', swn.getnoun),
            ('JJ', swn.getadjective),
            ('VB', swn.getverb),
        )

        # Number of tokens per document that the en_US dictionary accepts.
        n_dwords = [
            sum(1 for word in tokenize_document(document) if d.check(word))
            for document in documents
        ]

        n_words = []
        n_chars = []
        all_caps = []
        n_bad = []
        exclamation = []
        addressing = []
        sent_pos = []
        n_you_re = []
        for comment in documents:
            words = comment.split()
            n_words.append(len(words))
            n_chars.append(len(comment))
            all_caps.append(sum(1 for w in words if w.isupper()))
            n_bad.append(comment.count('fakeinsult'))
            exclamation.append(comment.count("!"))
            addressing.append(comment.count("@"))

            doc = nlp(comment)
            count = 0.
            pos_sum = 0.
            for token in doc:
                if token.text == 'fakeinsult':
                    # The placeholder adds nothing to the positive sum but
                    # still counts towards the average's denominator.
                    count += 1.
                    continue
                # NOTE(review): assumes `nlp` is a spaCy pipeline.  There the
                # RB/NN/JJ/VB prefixes live in the fine-grained `tag_`; the
                # coarse `pos_` (ADV, NOUN, ADJ, VERB, ...) never starts with
                # any of them, which left these lookups unreachable.
                for prefix, lookup in lookups:
                    if token.tag_.startswith(prefix):
                        pos_sum += lookup(token.text)[0]
                        count += 1.
                        break
            # Only the positive score is part of the returned matrix, so the
            # second lexicon component is not accumulated.
            sent_pos.append(pos_sum / count if count else pos_sum)
            n_you_re.append(len(self.__matcher(doc)))

        # Builtin float instead of the np.float alias, which newer NumPy
        # releases no longer provide.  Clamping the denominator to 1 keeps
        # comments without any words at a ratio of 0 instead of NaN/inf.
        word_counts = np.maximum(np.array(n_words, dtype=float), 1.0)
        allcaps_ratio = np.array(all_caps) / word_counts
        bad_ratio = np.array(n_bad) / word_counts
        dic_ratio = np.array(n_dwords) / word_counts

        return np.array([n_words, n_chars, n_dwords, n_you_re, exclamation, all_caps,
                         addressing, bad_ratio, n_bad, allcaps_ratio, dic_ratio,
                         sent_pos]).T

Example #6

0

Show file

    def transform(self, documents):
        """Build a per-comment matrix of hand-crafted text features.

        For every comment string this computes simple counts (words,
        characters, dictionary words, regex hits for the two "you"
        patterns, ``!`` and ``@`` characters, all-uppercase words, bad-word
        occurrences), three ratios of those counts to the word count, and
        an averaged positive sentiment score looked up in the SWN3 lexicon.

        Args:
            documents: sequence of comment strings.  It is iterated several
                times, so it must not be a one-shot iterator.

        Returns:
            A 2-D numpy array with one row per comment and thirteen
            columns, in this order: n_words, n_chars, n_dwords, n_you_re,
            n_you, exclamation, allcaps, addressing, bad_ratio, n_bad,
            allcaps_ratio, dic_ratio, sent_pos.
        """
        import enchant
        import re
        import sentlex
        from pattern.en import tag as tagger
        from feature_extraction import tokenize_document

        d = enchant.Dict("en_US")
        SWN = sentlex.SWN3Lexicon()
        # Tag prefix -> lexicon lookup.  The prefixes are mutually exclusive,
        # so at most one lookup runs per word.
        lookups = (
            ('RB', SWN.getadverb),
            ('NN', SWN.getnoun),
            ('JJ', SWN.getadjective),
            ('VB', SWN.getverb),
        )

        n_words = [len(c.split()) for c in documents]
        n_chars = [len(c) for c in documents]
        # Number of tokens per document that the en_US dictionary accepts.
        n_dwords = [
            sum(1 for word in tokenize_document(document) if d.check(word))
            for document in documents
        ]

        sent_pos = []
        n_bad = []
        for comment in documents:
            lowered = comment.lower()

            count = 0.
            pos_sum = 0.
            for word, tag in tagger(lowered):
                if word == 'fakeinsult':
                    # The placeholder adds nothing to the positive sum but
                    # still counts towards the average's denominator.
                    count += 1.
                    continue
                for prefix, lookup in lookups:
                    if tag.startswith(prefix):
                        pos_sum += lookup(word)[0]
                        count += 1.
                        break
            # Only the positive score is part of the returned matrix, so the
            # second lexicon component is not accumulated.
            sent_pos.append(pos_sum / count if count else pos_sum)

            # Substring occurrences of every bad word in the lowercased text.
            n_bad.append(sum(lowered.count(w) for w in self.__badwords))

        n_you_re = [
            len(re.findall(self.__you_re, document)) for document in documents
        ]
        n_you = [
            len(re.findall(self.__you, document)) for document in documents
        ]

        # number of uppercase words
        allcaps = [
            sum(1 for w in comment.split() if w.isupper())
            for comment in documents
        ]
        exclamation = [c.count("!") for c in documents]
        addressing = [c.count("@") for c in documents]

        # Builtin float instead of the np.float alias, which newer NumPy
        # releases no longer provide.  Clamping the denominator to 1 keeps
        # comments without any words at a ratio of 0 instead of NaN/inf.
        word_counts = np.maximum(np.array(n_words, dtype=float), 1.0)
        allcaps_ratio = np.array(allcaps) / word_counts
        bad_ratio = np.array(n_bad) / word_counts
        dic_ratio = np.array(n_dwords) / word_counts

        return np.array([
            n_words, n_chars, n_dwords, n_you_re, n_you, exclamation, allcaps,
            addressing, bad_ratio, n_bad, allcaps_ratio, dic_ratio, sent_pos
        ]).T

Example #7

0

Show file

File: train.py Project: chrisdamba/trolling_detection

    def transform(self, documents):
        """Turn raw comments into a numeric matrix of hand-crafted features.

        Per comment the method derives: word and character counts, the
        number of tokens accepted by the en_US dictionary, regex hit counts
        for the two "you" patterns, ``!`` and ``@`` counts, the number of
        all-uppercase words, the number of bad-word occurrences, the ratios
        of the uppercase, bad-word and dictionary counts to the word count,
        and an averaged positive sentiment score from the SWN3 lexicon.

        Args:
            documents: sequence of comment strings.  It is iterated several
                times, so it must not be a one-shot iterator.

        Returns:
            A 2-D numpy array with one row per comment and thirteen
            columns, in this order: n_words, n_chars, n_dwords, n_you_re,
            n_you, exclamation, allcaps, addressing, bad_ratio, n_bad,
            allcaps_ratio, dic_ratio, sent_pos.
        """
        import enchant
        import re
        import sentlex
        from pattern.en import tag as tagger
        from feature_extraction import tokenize_document

        d = enchant.Dict("en_US")
        SWN = sentlex.SWN3Lexicon()
        # Tag prefix -> lexicon lookup.  A tag can start with at most one of
        # these prefixes, so at most one lookup runs per word.
        lookups = (
            ('RB', SWN.getadverb),
            ('NN', SWN.getnoun),
            ('JJ', SWN.getadjective),
            ('VB', SWN.getverb),
        )

        n_words = [len(c.split()) for c in documents]
        n_chars = [len(c) for c in documents]
        n_dwords = [sum(1 for word in tokenize_document(document) if d.check(word))
                    for document in documents]

        sent_pos = []
        for comment in documents:
            count = 0.
            pos_sum = 0.
            for word, tag in tagger(comment.lower()):
                if word == 'fakeinsult':
                    # Counts towards the denominator without adding any
                    # positive score.
                    count += 1.
                    continue
                for prefix, lookup in lookups:
                    if tag.startswith(prefix):
                        pos_sum += lookup(word)[0]
                        count += 1.
                        break
            # Only the positive score ends up in the returned matrix, so the
            # second lexicon component is not accumulated.
            if count != 0:
                pos_sum /= count
            sent_pos.append(pos_sum)

        n_you_re = [len(re.findall(self.__you_re, document)) for document in documents]
        n_you = [len(re.findall(self.__you, document)) for document in documents]

        # number of uppercase words
        allcaps = [sum(1 for w in comment.split() if w.isupper())
                   for comment in documents]
        # number of bad-word occurrences (substring matches, case-insensitive
        # on the comment side)
        n_bad = []
        for c in documents:
            lowered = c.lower()
            n_bad.append(sum(lowered.count(w) for w in self.__badwords))
        exclamation = [c.count("!") for c in documents]
        addressing = [c.count("@") for c in documents]

        # Builtin float instead of the np.float alias, which newer NumPy
        # releases no longer provide.  Clamping the denominator to 1 keeps
        # comments without any words at a ratio of 0 instead of NaN/inf.
        word_counts = np.maximum(np.array(n_words, dtype=float), 1.0)
        allcaps_ratio = np.array(allcaps) / word_counts
        bad_ratio = np.array(n_bad) / word_counts
        dic_ratio = np.array(n_dwords) / word_counts

        return np.array([n_words, n_chars, n_dwords, n_you_re, n_you, exclamation, allcaps,
                         addressing, bad_ratio, n_bad, allcaps_ratio, dic_ratio,
                         sent_pos]).T

Example #8

0

Show file

File: main.py Project: chrisdamba/trolling_detection

        # Build a model from the training comments with language_model().
        model = language_model(comments)

        # Pretty-print whatever model["sound"].samples() returns.
        print "\nSamples: "
        import pprint
        printer = pprint.PrettyPrinter(indent=4)
        printer.pprint(model["sound"].samples())

        print "\n"

        # `model` is rebound here: from this point on it is the object
        # returned by word2vec_model(comments), and the similarity of two
        # sample words is printed as a quick check.
        model = word2vec_model(comments)
        print model.similarity('retarded', 'loser')



    # Word2Vec experiment: runs only when the 'WordVec' boolean option in the
    # EXECUTION_SECTION of the config is true.
    if config_parser.getboolean(EXECUTION_SECTION, 'WordVec'):
        from train import train_word2vec
        from train import w2vectorize
        from feature_extraction import tokenize_document
        # 500 is passed as n_dim both for training and for vectorizing the
        # test documents below; the two values have to stay in sync.
        model, classifier = train_word2vec(categories, comments, 500)
        # NOTE(review): the test comments are tokenized with
        # stopwords='english'; confirm train_word2vec tokenizes the training
        # comments with the same option.
        test_documents = [tokenize_document(document, stopwords='english') for document in test_comments]
        test_vecs = w2vectorize(test_documents, model, 500)
        predictions = classifier.predict(test_vecs)
        print "\nWord2Vec Model Result\n"
        prediction_info(predictions, test_categories)

    # Feature-ensemble experiment: runs only when the 'Final' boolean option
    # in the EXECUTION_SECTION of the config is true.
    if config_parser.getboolean(EXECUTION_SECTION, 'Final'):
        from train import train_assembling
        classifier = train_assembling(categories, comments, badwords)
        # The classifier is given the raw test comments directly.
        predictions = classifier.predict(test_comments)
        print "\nFeature Ensemble Model Result\n"
        prediction_info(predictions, test_categories)