Ejemplo n.º 1
0
def get_randomised_sample(number_tweets=12):
    """
    Get some random tweets from the whole data set.

    :param number_tweets: number of tweets to collect; clamped to
        [0, 2 * NB_TWEETS_PER_FILE]
    :return: list of cleaned lines (one per sampled tweet)
    """
    # Clamp to a sane range: a negative request would otherwise keep the
    # `while` loop truthy until randbelow(0) raised ValueError.
    number_tweets = max(0, min(number_tweets, 2 * NB_TWEETS_PER_FILE))
    sample_list = []
    with open(get_path_resource('Sentiment_analysis_dataset_1.csv'), 'rb') as file_part1, \
            open(get_path_resource('Sentiment_analysis_dataset_2.csv'), 'rb') as file_part2:
        global_file = file_part1.readlines() + file_part2.readlines()
        # Sample without replacement: remove a random line on each draw.
        for _ in range(number_tweets):
            sample_list.append(clean_line(global_file.pop(randbelow(len(global_file)))))
    return sample_list
Ejemplo n.º 2
0
def _count_pos_neg_sample():
    """
    Compute the number of positive and negative samples contained in our data set.

    :return: None; prints the negative count, the positive count and their sum
    """
    count_pos = 0
    count_neg = 0
    # NOTE(review): one less than the combined line count — presumably skips a
    # header line at the start of the first file; confirm against the CSVs.
    remaining = 2 * NB_TWEETS_PER_FILE - 1
    with open(get_path_resource('Sentiment_analysis_dataset_1.csv'), 'rb') as file_part1:
        with open(get_path_resource('Sentiment_analysis_dataset_2.csv'), 'rb') as file_part2:
            global_file = file_part1.readlines() + file_part2.readlines()
            for _ in range(remaining):
                # Line layout: ItemID,Sentiment,Sentiment140,SentimentText —
                # isolate the Sentiment field ('1' marks a positive tweet).
                sentiment = global_file.pop().split(b',Sentiment140,')[0].split(b',')[1]
                if sentiment == b'1':
                    count_pos += 1
                else:
                    count_neg += 1
    print(count_neg, count_pos, count_neg + count_pos)
Ejemplo n.º 3
0
def get_some_sample(number_tweets=12):
    """
    Collect from our data sets the desired amount of tweets.

    :param number_tweets: number of tweets to collect (clamped to the size of
        the data set, 2 * NB_TWEETS_PER_FILE)
    :return: tuple of (actual number of tweets collected, list of tweets)
    """
    # data set csv file : ItemID,Sentiment,SentimentSource,SentimentText
    number_tweets = min(number_tweets, 2 * NB_TWEETS_PER_FILE)
    with open(get_path_resource('Sentiment_analysis_dataset_1.csv'), 'rb') as file_part1:
        number_lines, sample_list = _get_lines_file(number_tweets, file_part1)
        if number_tweets == number_lines:
            # First file alone satisfied the request.
            return number_tweets, sample_list
        with open(get_path_resource('Sentiment_analysis_dataset_2.csv'), 'rb') as file_part2:
            number_lines_2, sample_list_2 = _get_lines_file(number_tweets - number_lines, file_part2)
            # BUG FIX: the shortfall branch used to report NB_TWEETS_PER_FILE * 2
            # even when fewer lines were actually gathered. Reporting the real
            # total is identical when both files cover the request
            # (number_lines + number_lines_2 == number_tweets) and is the honest
            # "actual number collected" otherwise.
            return number_lines + number_lines_2, sample_list + sample_list_2
Ejemplo n.º 4
0
def get_randomised_pos_neg_sample(num_pos=12, num_neg=12):
    """
    Get two lists of negative and positive texts from the labelled tweets in the
    whole data set, up to the number of positive and negative tweets it contains.

    :param num_pos: number of positive tweets to collect, up to the total number
        of positive tweets in the data set
    :param num_neg: number of negative tweets to collect, up to the total number
        of negative tweets in the data set
    :return: 2 lists containing negative texts and positive texts from the
        tweets in the data set
    """
    num_pos = min(num_pos, NB_TOTAL_POSITIVE_TWEETS)
    num_neg = min(num_neg, NB_TOTAL_NEGATIVE_TWEETS)
    positives, negatives = [], []
    with open(get_path_resource('Sentiment_analysis_dataset_1.csv'), 'rb') as file_part1:
        with open(get_path_resource('Sentiment_analysis_dataset_2.csv'), 'rb') as file_part2:
            global_file = file_part1.readlines() + file_part2.readlines()
            while num_pos or num_neg:
                label, text = clean_line(global_file.pop(randbelow(len(global_file))))
                # BUG FIX: route strictly by label. Previously a *positive* tweet
                # drawn after the positive quota was filled fell through to the
                # `elif num_neg` branch and was appended to the negatives list,
                # polluting it with mislabelled texts.
                if label == b'1':
                    if num_pos:
                        positives.append(text)
                        num_pos -= 1
                elif num_neg:
                    negatives.append(text)
                    num_neg -= 1
    return negatives, positives
Ejemplo n.º 5
0
def get_characteristic_label_vectors(nb, randomness, pos_equal_neg, Resource, keep_null_vector=False, language='en'):
    """
    Collect the desired number of label vectors regarding the parameters given.
    Provide 2 booleans to get a collection randomised or not and equal in number
    of positive and negative vectors, or not.

    :param nb: number of vectors to collect (clamped to the data set size; when
        pos_equal_neg is True an odd nb is rounded down to the nearest even value
        so both half-quotas can be filled exactly)
    :param randomness: if the collection should be randomised among all the
        tweets in the sample
    :param pos_equal_neg: if we want the same amount of positive and negative
        vectors
    :param Resource: class object containing all the resources (positive words,
        negative words, positive emoticons, negative emoticons, stop words)
    :param keep_null_vector: False or True
        False : if we only want non null vectors
        True : if we want tweets only, with the corresponding eventually null vector
    :param language: Choose the language from french to english
        'fr' | 'en'
    :return: tuple of arrays containing the feature vectors and the
        corresponding label vectors
    """
    m_features, m_labels = list(), list()
    nb_pos, nb_neg, nb_tweet = 0, 0, 0
    # Clamp the request to what the chosen source can provide.
    nb = min(nb, NB_TWEETS_PER_FILE * 2 if keep_null_vector else NB_NON_NULL_VECTORS)
    if pos_equal_neg:
        # BUG FIX: with an odd nb the two half-quotas (nb // 2 each) sum to
        # nb - 1, so `nb_tweet < nb` could never become False and the loop
        # kept popping until it hit an empty list. Round down to even.
        nb -= nb % 2
    # NOTE(review): with pos_equal_neg the loops still assume the data holds at
    # least nb // 2 samples of each class — confirm against the data-set constants.
    if keep_null_vector:
        with open(get_path_resource('Sentiment_analysis_dataset_1.csv'), 'rb') as file_part1:
            with open(get_path_resource('Sentiment_analysis_dataset_2.csv'), 'rb') as file_part2:
                global_file = file_part1.readlines() + file_part2.readlines()
                while nb_tweet < nb:
                    if randomness:
                        label, text = clean_line(global_file.pop(randbelow(len(global_file))))
                    else:
                        label, text = clean_line(global_file.pop())
                    feature_vector = characteristic_vector(clean_text(text, get_correct_stop_word(Resource, language)),
                                                           Resource)
                    float_label = float(label)
                    if pos_equal_neg:
                        # Accept only while the matching class quota is open.
                        if float_label == 1.0 and nb_pos < nb // 2:
                            m_features.append(feature_vector)
                            m_labels.append(float_label)
                            nb_tweet += 1
                            nb_pos += 1
                        elif float_label == 0.0 and nb_neg < nb // 2:
                            m_features.append(feature_vector)
                            m_labels.append(float_label)
                            nb_tweet += 1
                            nb_neg += 1
                    else:
                        m_features.append(feature_vector)
                        m_labels.append(float_label)
                        nb_tweet += 1
    else:
        with open(get_path_resource('Features_labels_dataset.json'), 'r') as f_l_file:
            global_file = loads(f_l_file.read())
            while nb_tweet < nb:
                if randomness:
                    index = randbelow(len(global_file["vectors"]))
                else:
                    index = 0
                # BUG FIX: pop label AND vector together. Previously the label
                # was always popped but the vector only on acceptance, so the
                # two lists drifted out of alignment after the first rejected
                # sample and every later pair was mismatched.
                label = float(global_file["labels"].pop(index))
                vector = global_file["vectors"].pop(index)
                if pos_equal_neg:
                    if label == 1.0 and nb_pos < nb // 2:
                        m_features.append(vector)
                        m_labels.append(label)
                        nb_tweet += 1
                        nb_pos += 1
                    elif label == 0.0 and nb_neg < nb // 2:
                        m_features.append(vector)
                        m_labels.append(label)
                        nb_tweet += 1
                        nb_neg += 1
                else:
                    m_features.append(vector)
                    m_labels.append(label)
                    nb_tweet += 1

    return array(m_features), array(m_labels)