Beispiel #1
0
def test(sc, file_positive, files_negative, file_model):
    """
    Tests a classification model using positive samples in file_positive and
    negative samples in files_negative. It prints the results to standard
    output.

    Errors raised while loading the model or computing the predictions are
    caught and printed instead of being propagated.

    :param sc: The spark context
    :type sc: SparkContext
    :param file_positive: The file with tweets to predict
    :type file_positive: str
    :param files_negative: The files with tweets to reject (must contain at
        least one file)
    :type files_negative: list[str]
    :param file_model: The file where the model is located
    :type file_model: str
    """
    def load_tweets(path):
        # Keep only the tweets accepted by both is_valid and is_english.
        return sc.textFile(path).map(parse_json).filter(lambda x: is_valid(x) and is_english(x))

    tweets_positive = load_tweets(file_positive).cache()
    list_negatives = [load_tweets(file_negative) for file_negative in files_negative]
    # Merge every negative file into a single RDD.
    tweets_negative = list_negatives[0]
    for ln in list_negatives[1:]:
        tweets_negative = tweets_negative.union(ln)
    try:
        print("Reading stored classification model")
        # NOTE: pickle executes arbitrary code on load; file_model must come
        # from a trusted source.
        with open(file_model, 'rb') as model_file:
            model = pickle.load(model_file)
        print("Computing predictions")
        # A prediction strictly above the threshold counts as positive.
        threshold = 0.0
        total_positive = tweets_positive.count()
        total_negative = tweets_negative.count()
        true_positives = tweets_positive.filter(lambda x: model.predict(parse(x)) > threshold).count()
        true_negatives = tweets_negative.filter(lambda x: model.predict(parse(x)) <= threshold).count()
        false_negatives = total_positive - true_positives
        false_positives = total_negative - true_negatives
        print("Results for %s:" % file_model)
        print("  Total positives: %d" % total_positive)
        print("  Total negatives: %d" % total_negative)
        print("  False positives: %d" % false_positives)
        print("  False negatives: %d" % false_negatives)
        # Report 0.0 when a metric is undefined (empty denominator) instead of
        # relying on a swallowed ZeroDivisionError.
        predicted_positives = true_positives + false_positives
        actual_positives = true_positives + false_negatives
        precision = 0.0
        recall = 0.0
        if predicted_positives:
            precision = float(true_positives) / float(predicted_positives)
        if actual_positives:
            recall = float(true_positives) / float(actual_positives)
        print("  Precision: %f" % precision)
        print("  Recall: %f" % recall)
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
Beispiel #2
0
def fast_predict(sc, file_input, file_output, sports_model, politics_model,
                 technology_model):
    """
    Predicts using the provided models

    Each tweet read from file_input is scored by the three models. Only the
    tweets that receive exactly one label are written to file_output. Errors
    raised while loading the models or computing the predictions are caught
    and printed instead of being propagated.

    :param sc: The spark context
    :param file_input: The file with the tweets to label
    :param file_output: The path where the predictions are saved as text
    :param sports_model: The file with the pickled sports model
    :param politics_model: The file with the pickled politics model
    :param technology_model: The file with the pickled technology model
    """
    tweets = sc.textFile(file_input).map(parse_json).filter(
        lambda x: is_valid(x) and is_english(x))
    try:
        print("Reading stored classification model")
        # NOTE: pickle executes arbitrary code on load; the model files must
        # come from a trusted source.
        with open(sports_model, 'rb') as model_file:
            sports = pickle.load(model_file)
        with open(politics_model, 'rb') as model_file:
            politics = pickle.load(model_file)
        with open(technology_model, 'rb') as model_file:
            technology = pickle.load(model_file)

        def predict_labels(tweet):
            # Collect the label of every model whose prediction is positive.
            x = parse(tweet)
            labels = []
            if sports.predict(x) > 0.0:
                labels.append("sports")
            if politics.predict(x) > 0.0:
                labels.append("politics")
            # Compare against 0.0 like the other models: a bare truthiness
            # check would also accept negative scores.
            if technology.predict(x) > 0.0:
                labels.append("technology")
            return labels

        print("Computing predictions")
        predictions = tweets.map(lambda t: (t, predict_labels(t)))
        # Drop tweets with no label or with more than one label.
        filtered_predictions = predictions.filter(lambda t: len(t[1]) == 1)
        filtered_predictions.map(prediction_string).saveAsTextFile(file_output)
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)
Beispiel #3
0
def fast_predict(sc, file_input, file_output, sports_model, politics_model, technology_model):
    """
    Predicts using the provided models

    Each tweet read from file_input is scored by the three models. Only the
    tweets that receive exactly one label are written to file_output. Errors
    raised while loading the models or computing the predictions are caught
    and printed instead of being propagated.

    :param sc: The spark context
    :param file_input: The file with the tweets to label
    :param file_output: The path where the predictions are saved as text
    :param sports_model: The file with the pickled sports model
    :param politics_model: The file with the pickled politics model
    :param technology_model: The file with the pickled technology model
    """
    tweets = sc.textFile(file_input).map(parse_json).filter(lambda x: is_valid(x) and is_english(x))
    try:
        print("Reading stored classification model")
        # NOTE: pickle executes arbitrary code on load; the model files must
        # come from a trusted source.
        with open(sports_model, 'rb') as model_file:
            sports = pickle.load(model_file)
        with open(politics_model, 'rb') as model_file:
            politics = pickle.load(model_file)
        with open(technology_model, 'rb') as model_file:
            technology = pickle.load(model_file)

        def predict_labels(tweet):
            # Collect the label of every model whose prediction is positive.
            x = parse(tweet)
            labels = []
            if sports.predict(x) > 0.0:
                labels.append("sports")
            if politics.predict(x) > 0.0:
                labels.append("politics")
            # Compare against 0.0 like the other models: a bare truthiness
            # check would also accept negative scores.
            if technology.predict(x) > 0.0:
                labels.append("technology")
            return labels

        print("Computing predictions")
        predictions = tweets.map(lambda t: (t, predict_labels(t)))
        # Drop tweets with no label or with more than one label.
        filtered_predictions = predictions.filter(lambda t: len(t[1]) == 1)
        filtered_predictions.map(prediction_string).saveAsTextFile(file_output)
        print("Done!")
    except Exception as e:
        print("Error:")
        print(e)