# Standard-library modules used by the functions below; project-specific
# helpers (debugPrint, writeDebugListToFile, the feature constants, the Django
# models and `connection`) are assumed to be imported elsewhere in this module.
import codecs
import json
import traceback


def selectFeatureAndNegSamples(keyinfo,
                               feat_type,
                               feat_size,
                               neg_sample_by_feat_size,
                               debug_category=''):
    '''
    High-level function that selects features of the given type and then
    selects negative samples containing those features.
    Params:
        keyinfo: output from getKeyInfoForClassifier
        feat_type: name of the feature type, from CS4242_Assg2.constants
        feat_size: number of features to select
        neg_sample_by_feat_size: maximum number of negative samples to select
        debug_category (optional): category name used when naming debug files
    Output:
        tuple
        [0]: list of selected features
        [1]: list of selected negative sample tweets
    (An illustrative usage sketch follows this function.)
    '''
    pos_tweets = keyinfo[POSITIVE][PROCESSED_TWEETS]
    pos_sample_size = len(pos_tweets)
    neg_tweets = keyinfo[NEGATIVE][PROCESSED_TWEETS]
    neg_sample_size = len(neg_tweets)

    sel_feat = []
    sel_neg_by_feat = []

    if feat_type in keyinfo[UNIQUE_FEATURES] \
        and feat_type in keyinfo[POSITIVE][FEATURES] \
        and feat_type in keyinfo[NEGATIVE][FEATURES]:
        unique_feat = keyinfo[UNIQUE_FEATURES][feat_type]
        pos_feat = keyinfo[POSITIVE][FEATURES][feat_type]
        neg_feat = keyinfo[NEGATIVE][FEATURES][feat_type]

        chi2_feat = selectFeatureByChi2(unique_feat, pos_feat, neg_feat,
                                        pos_sample_size, neg_sample_size,
                                        feat_size)
        writeDebugListToFile(
            "%s_%s_chi2_sel_feat.txt" % (debug_category, feat_type), chi2_feat)

        sel_feat = [x[0] for x in chi2_feat]
        writeDebugListToFile(
            "%s_%s_chi2_sel_feat_only.txt" % (debug_category, feat_type),
            sel_feat)

        debugPrint("%s feature count (intial): %s" %
                   (feat_type, len(pos_feat)))
        debugPrint("%s feature count (selected): %s" %
                   (feat_type, len(sel_feat)))

        sel_neg_by_feat = selectTweetIfFeatureExists(neg_tweets,
                                                     neg_sample_by_feat_size,
                                                     sel_feat, feat_type)
        debugPrint("%s selected neg tweet count: %s" %
                   (feat_type, len(sel_neg_by_feat)))

    else:
        debugPrint("%s not in use" % feat_type)

    return (sel_feat, sel_neg_by_feat)
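

# Illustrative usage sketch (not from the original project): it assumes that
# `keyinfo` comes from getKeyInfoForClassifier and that FEATURE_TEXT is one of
# the feature-type constants; the helper name, sizes and category are made up.
def _example_select_text_features(keyinfo):
    # Keep the 200 highest-scoring chi-square text features and up to 400
    # negative tweets that contain at least one of them.
    sel_feat, sel_neg_tweets = selectFeatureAndNegSamples(
        keyinfo, FEATURE_TEXT, 200, 400, debug_category='apple')
    return sel_feat, sel_neg_tweets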
def createGroundTruth(labelfile, groundtruth_cat, appendname=''):
    '''
    Debug helper: reads the label file and writes a binary ground-truth list
    (1 where a line's category equals groundtruth_cat, 0 otherwise) to a
    debug file. Only runs when DEBUG_CODE is set.
    '''
    if DEBUG_CODE:
        groundtruth_list = []
        with codecs.open(labelfile, encoding='cp1252') as the_file:
            for line in the_file:
                splitarray = line.strip().split(',')

                category = splitarray[0][1:-1]
                if category == groundtruth_cat:
                    groundtruth_list.append(1)
                else:
                    groundtruth_list.append(0)


        # print groundtruth_list
        writeDebugListToFile(
            "%s_groundtruth_%s.txt" % (groundtruth_cat, appendname),
            groundtruth_list)
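

# Minimal sketch of the quote-stripping parse used above, applied to an
# in-memory line (the sample line and category are invented for illustration;
# the real label-file layout may differ):
def _example_parse_label_line(line, groundtruth_cat):
    splitarray = line.strip().split(',')
    category = splitarray[0][1:-1]  # drop the surrounding quotes
    return 1 if category == groundtruth_cat else 0
# _example_parse_label_line('"apple",12345', 'apple') -> 1
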
def performClassification(test_input_filename, features_used=FEATURES_DEFAULT):
    '''
    Classifies every tweet in the test file with each trained SVM.
    Returns:
        combined_results: dict mapping each classifier (category) name to a
        list of 0/1 predictions, one per tweet in the test file, e.g.
            {
                'apple': [0, 0, 1, 0, ...],
                'twitter': [0, 0, 0, ...],
            }
    (An illustrative sketch of consuming this dict follows the function.)
    '''
    with codecs.open(test_input_filename, encoding='cp1252') as k:
        tweet_features_list = []
        categorieslist = []
        svmstates = SVMStatesClassifier.objects.all()
        for svm in svmstates:
            categorieslist.append(svm.classifier_name)
        for line in k:
            json_data = json.loads(line, encoding='cp1252')
            featureline = extractFeaturesFromTweet(json_data, categorieslist,
                                                   features_used)
            # e.g. {'tweet': 'This was a triumph',
            #       'features': {FEATURE_TEXT: ..., 'geolocation': ...}}
            tweet_features_list.append(featureline)
        writeDebugListToFile("test_tweets_feature.txt", tweet_features_list)

        # classify the test tweets with each trained SVM
        combined_results = {}
        for svm in svmstates:
            featurematrix_classifier = svm.featurematrix
            svm_matrix = getSVMMatrixForClassification(
                featurematrix_classifier, tweet_features_list)
            writeDebugListToFile(
                "%s_test_svm_matrix.txt" % featurematrix_classifier.category,
                svm_matrix[SVM_X])

            # print type(featurematrix_classifier.category)
            reslist = performSVMClassification(svm, svm_matrix)
            combined_results[svm.classifier_name] = reslist

        for key, value in combined_results.iteritems():
            writeDebugListToFile("%s_results.txt" % key, value)

        return combined_results
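

# Illustrative usage sketch (not from the original project): summarises how
# many tweets each classifier marked positive; the helper name and the test
# file path are placeholders.
def _example_summarise_results(test_input_filename):
    combined_results = performClassification(test_input_filename)
    summary = {}
    for category, labels in combined_results.iteritems():
        # `labels` is a list of 0/1 predictions, one per test tweet
        summary[category] = sum(labels)
    return summary  # e.g. {'apple': 12, 'twitter': 3}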
def performTrainingForSA(data_filename,
                         label_filename,
                         features_used=FEATURES_SA_DEFAULT,
                         job_id=None):
    '''
    Trains one sentiment-analysis SVM per category: selects features over the
    positive/negative/neutral tweets, builds the feature and SVM matrices,
    and creates the SVM via createSVMForSA. Errors are appended to errlog.txt.
    '''
    # extract & preprocess features
    try:
        debugPrint("feature extraction and preprocessing...")
        if job_id != None:
            connection.close()
            jobstatus = JobStatusSA.objects.get(id=job_id)

        # Note: the module-level training paths are used here, so the
        # data_filename / label_filename arguments are effectively ignored.
        gen = parseLabelFile(PATH_GROUNDTRUTH_TRAINING)
        categories_list = gen['categories']
        groundtruth_list = gen['groundtruth_list']
        all_keyinfo = getKeyInfoForSA(
            PATH_TRAINING_DATA, categories_list, groundtruth_list,
            features_used)  # A test for unicode errors

        for category, keyinfo in all_keyinfo.iteritems():
            debugPrint("training category: %s" % category)
            if job_id != None:
                updateJobStatus(jobstatus, "Training Category: %s" % category)
            pos_tweets = keyinfo[CLASS_SVM_POSITIVE][PROCESSED_TWEETS]
            neg_tweets = keyinfo[CLASS_SVM_NEGATIVE][PROCESSED_TWEETS]
            neu_tweets = keyinfo[CLASS_SVM_NEUTRAL][PROCESSED_TWEETS]

            #         size = min(len(pos_tweets), len(neg_tweets), len(neu_tweets))
            #         max_size = 2*size
            #         if max_size < 100:
            #             max_size = 100
            #         print size, max_size

            # feature selection
            debugPrint(">> feature selection")

            # create feature matrix for each tweet
            debugPrint(">> get feature matrix")
            training_tweets = {
                CLASS_SVM_POSITIVE: pos_tweets,
                CLASS_SVM_NEGATIVE: neg_tweets,
                CLASS_SVM_NEUTRAL: neu_tweets
            }

            selected_feat_tweets = selectFeaturesForTraining(
                keyinfo, features_used)
            selected_feat = selected_feat_tweets[0]
            #             training_tweets = selected_feat_tweets[1]

            #         selected_feat = selectFeaturesForSA(keyinfo, training_tweets, features_used)
            writeDebugListToFile("%s_sa_selected_feat.txt" % category,
                                 selected_feat)
            writeDebugListToFile("%s_sa_pos_tweets.txt" % category, pos_tweets)
            writeDebugListToFile("%s_sa_neg_tweets.txt" % category, neg_tweets)
            writeDebugListToFile("%s_sa_neu_tweets.txt" % category, neu_tweets)

            feature_matrix = getFeatureMatrixForSA(category, training_tweets,
                                                   selected_feat,
                                                   features_used)

            debugPrint("feature count: %s" %
                       len(feature_matrix.feature_to_id_map))
            writeDebugCountDictToFile("%s_sa_feature_to_id_map.txt" % category,
                                      feature_matrix.feature_to_id_map)
            writeDebugListToFile(
                "%s_sa_tweet_feature_matrix_list.txt" % category,
                feature_matrix.tweet_feature_matrix_list)

            # create svm matrix
            debugPrint(">> create svm matrix")
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Creating SVM Matrix for category %s" % (category))
            svm_matrix = getSVMMatrixForSA(feature_matrix, features_used)
            createSVMForSA(category, feature_matrix, svm_matrix, features_used)
            writeDebugListToFile("%s_sa_svm_matrix_X.txt" % category,
                                 svm_matrix[SVM_X])
            writeDebugListToFile("%s_sa_svm_matrix_Y.txt" % category,
                                 svm_matrix[SVM_Y])

            debugPrint("training completed for category: %s" % category)
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Training completed for category: %s" % (category))

            # TODO: remove!
    #         break

        if job_id != None:
            updateJobStatus(jobstatus, "Completed!")
    except:
        traceback.print_exc(file=open("%s/svmstates/errlog.txt" %
                                      (BASE_DIR), "a"))
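

# Illustrative invocation sketch (not from the original project): the job id
# is a placeholder, and as noted above the SA trainer currently reads the
# module-level training constants rather than the arguments passed in.
def _example_run_sa_training():
    performTrainingForSA(PATH_TRAINING_DATA, PATH_GROUNDTRUTH_TRAINING,
                         features_used=FEATURES_SA_DEFAULT, job_id=None)
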
def performTraining(data_filename,
                    label_filename,
                    features_used=FEATURES_DEFAULT,
                    job_id=None):
    '''
    Trains one binary SVM per category: selects features and negative
    samples, builds the feature and SVM matrices, and creates the SVM via
    createSVM. Errors are appended to errlog.txt.
    '''
    try:
        # extract & preprocess features
        debugPrint("feature extraction and preprocessing...")
        if job_id != None:
            connection.close()
            jobstatus = JobStatus.objects.get(id=job_id)
            updateJobStatus(jobstatus, "Acquiring Key info")

        gen = parseLabelFile(label_filename)
        categories_list = gen['categories']
        groundtruth_list = gen['groundtruth_list']
        all_keyinfo = getKeyInfoForClassifier(data_filename, categories_list,
                                              groundtruth_list, features_used)

        for category, keyinfo in all_keyinfo.iteritems():
            debugPrint("training category: %s" % category)
            if job_id != None:
                updateJobStatus(jobstatus,
                                "Training category: %s" % (category))
            pos_tweets = keyinfo[POSITIVE][PROCESSED_TWEETS]
            pos_sample_size = len(pos_tweets)
            neg_tweets = keyinfo[NEGATIVE][PROCESSED_TWEETS]
            neg_sample_size = len(neg_tweets)

            # feature selection
            debugPrint(">> feature selection")
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Feature selection on category: %s" % (category))
            select_results = selectTrainingFeaturesAndNegSamples(
                keyinfo, features_used, pos_sample_size, category)
            selected_feat = select_results[0]
            selected_neg_tweets = select_results[1]
            #
            writeDebugListToFile("%s_selected_feat.txt" % category,
                                 selected_feat)
            writeDebugListToFile("%s_selected_neg_tweets.txt" % category,
                                 selected_neg_tweets)
            writeDebugListToFile("%s_pos_tweets.txt" % category, pos_tweets)

            # create feature matrix for each tweet
            debugPrint(">> get feature matrix")
            training_tweets = {
                POSITIVE: pos_tweets,
                NEGATIVE: selected_neg_tweets
            }
            feature_matrix = getFeatureMatrix(category, training_tweets,
                                              selected_feat, features_used)
            writeDebugCountDictToFile("%s_feature_to_id_map.txt" % category,
                                      feature_matrix.feature_to_id_map)
            writeDebugListToFile("%s_tweet_feature_ids_list.txt" % category,
                                 feature_matrix.tweet_feature_ids_list)

            debugPrint('feature count: %s' %
                       len(feature_matrix.feature_to_id_map))
            debugPrint("positive tweets count: %s" % pos_sample_size)
            debugPrint("negative tweets count: %s" % len(selected_neg_tweets))

            # create svm matrix
            debugPrint(">> create svm matrix")

            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Creating SVM Matrix for category %s" % (category))
            svm_matrix = getSVMMatrixForClassification(feature_matrix)
            writeDebugListToFile("%s_svm_matrix_X.txt" % category,
                                 svm_matrix[SVM_X])
            writeDebugListToFile("%s_svm_matrix_Y.txt" % category,
                                 svm_matrix[SVM_Y])
            createSVM(category, feature_matrix, svm_matrix)

            debugPrint("training completed for category: %s" % category)
            if job_id != None:
                updateJobStatus(
                    jobstatus,
                    "Training completed for category: %s" % (category))

        if job_id != None:
            updateJobStatus(jobstatus, "Completed!")

    except:
        traceback.print_exc(file=open("%s/svmstates/errlog.txt" %
                                      (BASE_DIR), "a"))
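

# Illustrative invocation sketch (not from the original project; the paths
# below are placeholders): trains one binary SVM per category listed in the
# label file, using the default feature set and no job-status tracking.
def _example_run_training(data_path, label_path):
    performTraining(data_path, label_path,
                    features_used=FEATURES_DEFAULT, job_id=None)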