def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        print "Loading classifier"
        classifier_path = os.path.join(
            settings.BASE_DIR,
            "lda_stats/classifiers/naive_bayes.classifier.cpickle")
        with open(classifier_path, 'rb') as classifier_file:
            nb = cPickle.load(classifier_file)

        filters = {}
        if options['only_newsitems']:
            filters['newsitem__isnull'] = False
        elif options['only_comments']:
            filters['comment__isnull'] = False

        for database in self.selected_dbs:

            print "Processing database " + database

            algorithm_naive_bayes, created = Algorithm.objects.using(database)\
                                                .get_or_create(name="naive_bayes")

            print "Removing previous results"
            query = """
            DELETE FROM result WHERE algorithm_id IN (
                SELECT id
                  FROM algorithm 
                 WHERE name = 'naive_bayes'
            )"""
            cursor = connections[database].cursor()
            cursor.execute(query)
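            # The same cleanup could also be expressed through the ORM rather
            # than raw SQL (a sketch, assuming Result has a ForeignKey named
            # "algorithm" pointing at Algorithm):
            #
            #   Result.objects.using(database) \
            #       .filter(algorithm__name='naive_bayes').delete()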

            print "Querying database"
            queryset = Text.objects.using(database).filter(**filters)

            results = []

            print "Calculating..."
            self.pbar_setup(maxval=queryset.count())
            for text in queryset_iterator(queryset, chunksize=2000):
                estimate = nb.predict([
                    utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))
                ])
                results.append(
                    Result(algorithm=algorithm_naive_bayes,
                           text=text,
                           value=str(estimate[0])))
                self.pbar_increment()

                if len(results) > 100000:
                    print "\nSaving partial results..."
                    Result.objects.using(database).bulk_create(results)
                    results = []

            self.pbar_destroy()

            print "Saving results"
            Result.objects.using(database).bulk_create(results)

        self.stdout.write(self.style.SUCCESS('Command executed successfully'))
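# The command above relies on queryset_iterator() to stream large querysets in
# fixed-size chunks instead of loading every Text row into memory at once. The
# real helper lives elsewhere in this project; a minimal sketch of what it is
# assumed to do (primary-key ordered slicing, with garbage collection between
# chunks) could look like this:
import gc

def queryset_iterator(queryset, chunksize=1000):
    # Yield rows in primary-key order, chunksize at a time.
    if not queryset.exists():
        return
    pk = 0
    last_pk = queryset.order_by('-pk')[0].pk
    queryset = queryset.order_by('pk')
    while pk < last_pk:
        for row in queryset.filter(pk__gt=pk)[:chunksize]:
            pk = row.pk
            yield row
        gc.collect()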
Example #2
def estimate(text):
    return (text,
            nb.predict(
                [utils.TFIDF_to_list(utils.TFIDF(utils.tokenize(text.text)))]))
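# estimate() is defined at module level with nb as a global, which is the shape
# a multiprocessing.Pool needs (workers created by fork inherit the module-level
# nb). A hedged usage sketch, assuming nb has already been loaded and texts is
# an iterable of Text objects:
from multiprocessing import Pool

def classify_in_parallel(texts, processes=4):
    # Each worker returns the (text, prediction) pair produced by estimate().
    pool = Pool(processes=processes)
    try:
        return pool.map(estimate, list(texts))
    finally:
        pool.close()
        pool.join()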
Example #3
    def handle(self, *args, **options):

        k = 5

        print "Reading data to memory"

        TFIDF_list = []
        label = []

        queryset = Newsitem.objects.all()
        self.pbar_setup(maxval=queryset.count())
        newsitems = queryset_iterator(queryset, chunksize=100)

        for newsitem in newsitems:
            TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
            if newsitem.cat1 in [1, 2]:
                label.append(1)
            else:
                label.append(0)
            self.pbar_increment()

        self.pbar_destroy()

        print "Creating traing and test data..."

        TFIDF_svm = []
        for i in TFIDF_list:
            TFIDF_svm.append(utils.TFIDF_to_list(i))
        # TFIDF_svm is the input matrix of SVM

        # train_len could be read from the command line instead of being
        # hard-coded here
        #train_len = int(sys.argv[1])
        train_len = 200

        # Indices of training samples from class 0
        indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
        # Indices of training samples from class 1
        indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]
        # We take train_len samples from each class for training

        train = []
        train_label = []
        for i in indexZero + indexOne:
            train.append(TFIDF_svm[i])
            train_label.append(label[i])
        # train: training matrix
        # train_label: labels of the training data

        # The other samples are test samples.
        test = [
            TFIDF_svm[i] for i in range(len(TFIDF_svm))
            if i not in indexZero + indexOne
        ]
        test_label = [
            label[i] for i in range(len(label))
            if i not in indexZero + indexOne
        ]
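        # The manual split above keeps the first train_len samples of each
        # class for training and everything else for testing. A comparable,
        # shuffled and stratified split could be obtained with scikit-learn
        # (a sketch, not what this command does):
        #
        #   from sklearn.model_selection import train_test_split
        #   train, test, train_label, test_label = train_test_split(
        #       TFIDF_svm, label, train_size=2 * train_len, stratify=label)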

        print "Fitting..."
        clf = svm.SVC(probability=True)
        # Train the model
        clf.fit(train, train_label)

        #print "Score: " + clf.score(train, train_label)

        print "Generating probabilities"
        pred_probas = clf.predict_proba(test)[:, 1]

        fpr, tpr, _ = roc_curve(test_label, pred_probas)
        roc_auc = auc(fpr, tpr)

        print "Plotting..."
        plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.legend(loc='lower right')

        print "Saving!"
        plt.savefig('out.png')

        self.stdout.write(self.style.SUCCESS('Command executed successfully'))
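# For reference, the same roc_curve()/auc() machinery on a tiny self-contained
# toy example (only scikit-learn is assumed, nothing project-specific):
from sklearn.metrics import roc_curve, auc

def toy_roc_auc():
    y_true = [0, 0, 1, 1]             # true binary labels
    y_score = [0.1, 0.4, 0.35, 0.8]   # predicted probability of class 1
    fpr, tpr, _ = roc_curve(y_true, y_score)
    return auc(fpr, tpr)              # 0.75 for this toy data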
    def handle(self, *args, **options):

        k = 5  # number of neighbours used in the k-NN vote

        TFIDF_list = []
        label = []

        queryset = Newsitem.objects
        self.pbar_setup(maxval=queryset.count())
        newsitems = queryset_iterator(queryset.all(), chunksize=100)

        for newsitem in newsitems:
            TFIDF_list.append(self.tokenize(newsitem.text.text))
            if newsitem.cat1 in [1, 2]:
                label.append(1)
            else:
                label.append(0)
            self.pbar_increment()

        self.pbar_destroy()

        self.train()

        print "Estimating..."
        self.pbar_setup(maxval=len(TFIDF_list))

        counter1 = 0
        TP = 0
        TN = 0
        FP = 0
        FN = 0
        while counter1 < len(TFIDF_list):
            # Distances from the current item to every other item, keeping the
            # original index of each candidate so the labels line up correctly
            # when the k nearest neighbours vote.
            candidates = [i for i in range(len(TFIDF_list)) if i != counter1]
            distance_list = [
                utils.TFIDF_distance(TFIDF_list[counter1], TFIDF_list[i])
                for i in candidates
            ]
            nearest_list = [
                candidates[j]
                for j in sorted(range(len(distance_list)),
                                key=lambda j: distance_list[j])[:k]
            ]
            repeat_dic = {}
            for i in nearest_list:
                if label[i] in repeat_dic:
                    repeat_dic[label[i]] += 1
                else:
                    repeat_dic[label[i]] = 1
            estimate_label = max(repeat_dic, key=repeat_dic.get)
            if estimate_label == 1 and label[counter1] == 1:
                TP += 1
            elif estimate_label == 1 and label[counter1] == 0:
                FP += 1
            elif estimate_label == 0 and label[counter1] == 0:
                TN += 1
            else:
                FN += 1
            counter1 += 1
            self.pbar_increment()

        self.pbar_destroy()

        data = [
            ('algo_knn_tp', TP),
            ('algo_knn_fn', FN),
            ('algo_knn_fp', FP),
            ('algo_knn_tn', TN),
        ]

        print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
        #print 'F1 Measurement: ', float(TP+TN)/(TP+FN+FP+TN), float(TP)/(TP+FP), float(TP)/(TP+FN), TP, FN, FP, TN

        print "Saving algorithm results"

        for item in data:
            algorithm_name = item[0]
            value = item[1]
            algorithm, create = Algorithm.objects.get_or_create(
                name=algorithm_name)
            result, created = Result.objects.get_or_create(algorithm=algorithm)
            result.value = str(value)
            result.save()

        algo_knn_uniform_estimative, create = Algorithm.objects.get_or_create(
            name="algo_knn_uniform_estimative")

        print "Calculating estimatives and saving result"

        queryset = Newsitem.objects
        self.pbar_setup(maxval=queryset.count())
        newsitems = queryset_iterator(queryset.all(), chunksize=100)

        for newsitem in newsitems:
            data = utils.TFIDF(utils.tokenize(newsitem.text.text))
            distance_list = []
            for i in range(len(TFIDF_list)):
                distance_list.append(utils.TFIDF_distance(data, TFIDF_list[i]))
            nearest_list = sorted(range(len(distance_list)),
                                  key=lambda i: distance_list[i])[:k]
            repeat_dic = {}
            for i in nearest_list:
                if distance_list[i] != 0:
                    if label[i] in repeat_dic:
                        repeat_dic[label[i]] += 1
                    else:
                        repeat_dic[label[i]] = 1
            estimate = max(repeat_dic, key=repeat_dic.get)
            Result.objects.create(algorithm=algo_knn_uniform_estimative,
                                  text=newsitem.text,
                                  value=str(estimate))
            self.pbar_increment()

        self.pbar_destroy()

        self.stdout.write(self.style.SUCCESS('Command executed successfully'))
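# The hand-rolled leave-one-out vote above can be cross-checked with
# scikit-learn's KNeighborsClassifier. This is a sketch, not the command's
# actual implementation: it assumes the TFIDF dicts have been converted to
# dense vectors (e.g. with utils.TFIDF_to_list) and that a Euclidean metric is
# an acceptable stand-in for utils.TFIDF_distance.
from sklearn.neighbors import KNeighborsClassifier

def knn_leave_one_out_accuracy(vectors, labels, k=5):
    correct = 0
    for i in range(len(vectors)):
        # Fit on everything except sample i, then predict sample i.
        clf = KNeighborsClassifier(n_neighbors=k, weights='uniform')
        clf.fit(vectors[:i] + vectors[i + 1:], labels[:i] + labels[i + 1:])
        if clf.predict([vectors[i]])[0] == labels[i]:
            correct += 1
    return float(correct) / len(vectors)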
    def tokenize(self, text):
        return utils.TFIDF(utils.tokenize(text))
    def handle(self, *args, **options):
        super(Command, self).handle(*args, **options)

        k = 5

        print "Reading data to memory"

        TFIDF_list = []
        label = []

        queryset = Newsitem.objects.all()
        self.pbar_setup(maxval=queryset.count())
        newsitems = queryset_iterator(queryset, chunksize=100)

        for newsitem in newsitems:
            TFIDF_list.append(utils.TFIDF(utils.tokenize(newsitem.text.text)))
            if newsitem.cat1 in [1, 2]:
                label.append(1)
            else:
                label.append(0)
            self.pbar_increment()

        self.pbar_destroy()

        print "Training..."

        TFIDF_svm = []
        for i in TFIDF_list:
            TFIDF_svm.append(utils.TFIDF_to_list(i))
        # TFIDF_svm is the input matrix of SVM

        # train_len could be read from the command line instead of being
        # hard-coded here
        #train_len = int(sys.argv[1])
        train_len = 200

        # Indices of training samples from class 0
        indexZero = [i for i in range(len(label)) if label[i] == 0][:train_len]
        # Indices of training samples from class 1
        indexOne = [i for i in range(len(label)) if label[i] == 1][:train_len]
        # We take train_len samples from each class for training

        train = []
        train_label = []
        for i in indexZero + indexOne:
            train.append(TFIDF_svm[i])
            train_label.append(label[i])
        # train: training matrix
        # train_label: labels of the training data

        # The other samples are test samples.
        test = [
            TFIDF_svm[i] for i in range(len(TFIDF_svm))
            if i not in indexZero + indexOne
        ]
        test_label = [
            label[i] for i in range(len(label))
            if i not in indexZero + indexOne
        ]

        clf = svm.SVC()
        # Train the model
        clf.fit(train, train_label)

        counter1 = 0
        TP = 0
        TN = 0
        FP = 0
        FN = 0

        print "Estimating..."
        self.pbar_setup(maxval=len(test))

        for i in test:
            estimate_label = clf.predict([i])[0]
            # Compare against the label of the corresponding test sample, not
            # the full label list.
            if estimate_label == 1 and test_label[counter1] == 1:
                TP += 1
            elif estimate_label == 1 and test_label[counter1] == 0:
                FP += 1
            elif estimate_label == 0 and test_label[counter1] == 0:
                TN += 1
            else:
                FN += 1
            counter1 += 1
            self.pbar_increment()

        self.pbar_destroy()

        print 'TP=>', TP, 'FN=>', FN, 'FP=>', FP, 'TN=>', TN
        #print 'F1 Measurement: ', float(TP+TN)/(TP+FN+FP+TN), float(TP)/(TP+FP), float(TP)/(TP+FN), TP, FN, FP, TN

        data = [
            ('algo_svm_tp', TP),
            ('algo_svm_fn', FN),
            ('algo_svm_fp', FP),
            ('algo_svm_tn', TN),
            ('algo_svm_score', clf.score(train, train_label)),
        ]

        print "Saving algorithm results"

        for item in data:
            algorithm_name = item[0]
            value = item[1]
            algorithm, create = Algorithm.objects.get_or_create(
                name=algorithm_name)
            result, create = Result.objects.get_or_create(algorithm=algorithm)
            result.value = str(value)
            result.save()

        algo_svm_estimative, create = Algorithm.objects.get_or_create(
            name="algo_svm_estimative")

        print "Calculating estimatives and saving result"

        queryset = Newsitem.objects
        self.pbar_setup(maxval=queryset.count())
        newsitems = queryset_iterator(queryset.all(), chunksize=100)

        for newsitem in newsitems:
            estimate = clf.predict([
                utils.TFIDF_to_list(
                    utils.TFIDF(utils.tokenize(newsitem.text.text)))
            ])
            Result.objects.create(algorithm=algo_svm_estimative,
                                  text=newsitem.text,
                                  value=str(estimate[0]))
            self.pbar_increment()

        self.pbar_destroy()

        self.stdout.write(self.style.SUCCESS('Command executed successfully'))
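# The TP/FN/FP/TN bookkeeping used by these commands can be sanity-checked with
# scikit-learn. A small sketch, assuming plain 0/1 label lists:
from sklearn.metrics import confusion_matrix

def confusion_counts(true_labels, predicted_labels):
    # For labels=[0, 1] the matrix is [[TN, FP], [FN, TP]].
    tn, fp, fn, tp = confusion_matrix(
        true_labels, predicted_labels, labels=[0, 1]).ravel()
    return {'tp': tp, 'fn': fn, 'fp': fp, 'tn': tn}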