예제 #1
0
def predict(test_list: list, special=None):
    """Classify each sentence in test_list and return the predicted labels.

    Parameters
    ----------
    test_list: [str]
        list of strings to be labeled

    DEFAULTS:
        special: dict = None
            special lexicon dictionary used in scoring words; when omitted
            (or None) a fresh empty dictionary is used for this call

    Returns
    -------
    labels: list
        list of labels for each item in test_list e.g. [1,0,0,-1,...],
        in the same order as test_list
    """
    # A `{}` default would be a single dict object shared by every call and
    # handed to swn_pipeline; build a new one per call instead.
    if special is None:
        special = {}

    # Load the shared resources once rather than once per sentence.
    contractions = rneg.getContractions()
    emoti_dict = emoticons.load_emoticon_sentiment()

    # swn_pipeline returns (label, word details); only the label is needed here.
    return [
        swn_pipeline(item, contractions=contractions, emoti_dict=emoti_dict, special=special)[0]
        for item in test_list
    ]
예제 #2
0
def senti_train(
    train_list: list,
    special_dict=None,
    iterations=5,
    stop_iter_threshold=0.0,
    diff_thresh=0,
    word_err_threshold=0.2,
    output_filename="outputNew.txt",
    learn_nouns=False,
    count_nouns=True,
):
    """Train on train_list to create a specialized lexicon dictionary.

    Each iteration builds a bag of words counting every word that appears in
    an incorrectly labeled tweet. Words that show up in more than
    word_err_threshold * (number of errors) mislabeled tweets get their
    positive/negative scores adjusted (zeroed, initialized, swapped or
    incremented by gv.score_increment) and stored in the special dictionary.

    This repeats `iterations` times or until the error rate is no longer
    above stop_iter_threshold.

    Parameters
    ----------
    train_list: list[list]
        in form [[label1, tweet1], [label2, tweet2],...]
        Label must be either 1, 0 or -1 for positive, neutral, negative
        respectively; both int labels and their string forms ("1", "0",
        "-1") are recognized when tallying the expected sentiment.

    DEFAULTS:
        special_dict: dict = None
            dictionary of specialized lexicon to score words on top of
            SentiWordNet. When omitted (or None) training starts from a new
            empty dictionary. A dictionary that is passed in is updated in
            place and is the same object that is returned.

        iterations: int = 5
            how many times we want to iterate through the list to train

        stop_iter_threshold: float = 0.0
            error rate at or below which training stops

        diff_thresh: int = 0
            how far we want the positive and negative score to differ to be
            considered not neutral (forwarded to swn_pipeline)

        word_err_threshold: float = 0.2
            fraction of the iteration's errors a word must exceed in order to
            have its scores changed

        output_filename: str = "outputNew.txt"
            file to output debugging results (overwritten)

        learn_nouns: bool = False
            whether or not to add nouns (part of speech "n") to the outputted
            specialized dictionary

        count_nouns: bool = True
            forwarded to swn_pipeline to decide whether nouns are included in
            the scoring; when true, every word of a mislabeled tweet is also
            written to the debugging file

    Returns
    -------
    tuple(dict, list)
        dict is the specialized sentiment lexicon mapping word -> (pos, neg)
        list is a list of error rates, one per completed iteration

    Raises
    ------
    TypeError
        if iterations is not an int
    ValueError
        if iterations, stop_iter_threshold, word_err_threshold or diff_thresh
        is negative
    """
    if type(iterations) is not int:
        raise TypeError("Expected 'iterations' to be of type int but was of type {}".format(type(iterations)))
    if iterations < 0:
        raise ValueError("Parameter 'iterations' must be >= 0")
    if stop_iter_threshold < 0:
        raise ValueError("Parameter 'stop_iter_threshold' must be >= 0")
    if word_err_threshold < 0:
        raise ValueError("Parameter 'word_err_threshold' must be >= 0")
    if diff_thresh < 0:
        raise ValueError("Parameter 'diff_thresh' must be >= 0")

    # A `{}` default would be one dict shared by every call, so entries learned
    # in one training run would leak into the next; start fresh instead.
    special = {} if special_dict is None else special_dict

    it = 0  # current iteration
    err_rate = 1  # current error rate (initialized to 1 because assume everything wrong)
    err_list = []  # list of error rates at each iteration
    flip_dict = {}  # word -> error count at its last score swap (damps back-and-forth flipping)

    # The context manager guarantees the debug file is closed even if the
    # pipeline raises part-way through training.
    with open(output_filename, "w", encoding="utf8") as output:
        output.write(
            "Training on {} items with a specialized dictionary of {} items\n".format(len(train_list), len(special))
        )
        output.write(
            "Iterations={}; Stopping Error Threshold={}; word error to be in dictionary threshold = {}\n".format(
                iterations, stop_iter_threshold, word_err_threshold
            )
        )
        output.write(
            "Words must be {} apart to not be neutral. Learning nouns set to {} and counting nounse set to = {}\n".format(
                diff_thresh, learn_nouns, count_nouns
            )
        )
        output.flush()
        contractions = rneg.getContractions()
        emoti_dict = emoticons.load_emoticon_sentiment()

        while it < iterations and err_rate > stop_iter_threshold:
            err_count = 0  # how many items were mislabeled in this iteration
            # word -> {count, pos_score, neg_score, pos_count, neg_count, neutral_count, part_of_speech}
            err_bow = {}

            # look for words that appear in mislabeled items
            for item in train_list:
                label, text = item[0], item[1]
                # pow_word_tup holds tuples of (word, positive score, negative score, part_of_speech)
                prediction, pow_word_tup = swn_pipeline(
                    text,
                    contractions=contractions,
                    label_diff_thresh=diff_thresh,
                    special=special,
                    emoti_dict=emoti_dict,
                    count_nouns=count_nouns,
                )

                # NOTE(review): this comparison assumes swn_pipeline returns labels of
                # the same type as the labels in train_list -- confirm against caller.
                if prediction == label:
                    continue
                err_count += 1

                # Normalize through str() so that int labels (1/-1, as documented) and
                # string labels ("1"/"-1") are tallied the same way; anything else
                # counts as neutral.
                label_text = str(label)
                if label_text == "-1":
                    expected_key = "neg_count"
                elif label_text == "1":
                    expected_key = "pos_count"
                else:
                    expected_key = "neutral_count"

                for word, pos_score, neg_score, pos in pow_word_tup:
                    entry = err_bow.get(word)
                    if entry is None:
                        # scores and part of speech are taken from the first occurrence
                        entry = err_bow[word] = {
                            "count": 0,
                            "pos_score": pos_score,
                            "neg_score": neg_score,
                            "pos_count": 0,
                            "neg_count": 0,
                            "neutral_count": 0,
                            "part_of_speech": pos,
                        }
                    entry["count"] += 1
                    entry[expected_key] += 1

                    if count_nouns:
                        output.write(
                            "word: {}; p-o-s: {}; pos_score: {}; neg_score: {}\n".format(
                                word, pos, pos_score, neg_score
                            )
                        )

                # record which incorrect labeling has been done
                output.write("label: {} ==> {} for tweet ({})\n".format(prediction, label, text))
                output.write("__________________________________________________________\n")
                sys.stdout.flush()

            # adjust scores of the words that caused a lot of errors
            for word, info_dict in err_bow.items():
                if info_dict["count"] > word_err_threshold * err_count and (
                    learn_nouns or info_dict["part_of_speech"] != "n"
                ):
                    new_scores = _rescore_word(word, info_dict, flip_dict)
                    # None means "leave this word alone"; never store leftover
                    # scores computed for a different word.
                    if new_scores is not None:
                        special[word] = new_scores

            # an empty training list has no errors; avoid dividing by zero
            err_rate = err_count / len(train_list) if train_list else 0.0
            it += 1
            err_list.append(err_rate)
            print("current special dictionary:", special)
            print("it: {}, err_rate: {}".format(it - 1, err_rate))
            print("###########################################################")
            output.write("current special dictionary: {}\n".format(special))
            output.write("it: {}, err_rate: {}\n".format(it - 1, err_rate))
            output.write("###########################################################\n")
            output.flush()
    return special, err_list


def _rescore_word(word, info_dict, flip_dict):
    """Return the new (pos, neg) scores for a high-error word, or None to leave it unchanged.

    info_dict is one err_bow entry; flip_dict is updated in place when the
    word's scores are swapped.
    """
    pos_count = info_dict["pos_count"]
    neg_count = info_dict["neg_count"]
    neutral_count = info_dict["neutral_count"]
    pos_score = info_dict["pos_score"]
    neg_score = info_dict["neg_score"]

    # mostly expected neutral, or no majority between positive and negative
    if (neutral_count > pos_count and neutral_count > neg_count) or pos_count == neg_count:
        return 0.0, 0.0

    # pos_count != neg_count from here on, so exactly one polarity wins
    should_be_positive = pos_count > neg_count

    # word had no score yet: add it with an initial score on the expected side
    if pos_score is None:
        return (0.5, 0.0) if should_be_positive else (0.0, 0.5)

    # existing scores point the opposite way from the expected polarity: swap them
    if should_be_positive:
        scores_inverted = pos_score < neg_score
    else:
        scores_inverted = pos_score > neg_score
    if scores_inverted:
        # if it was swapped before, swap again only if it now causes more errors
        if word in flip_dict and flip_dict[word] >= info_dict["count"]:
            return None
        flip_dict[word] = info_dict["count"]
        return neg_score, pos_score

    # scores already lean the right way but not strongly enough: nudge, capped at 1.0
    if should_be_positive:
        return min(pos_score + gv.score_increment, 1.0), neg_score
    return pos_score, min(neg_score + gv.score_increment, 1.0)