Example #1
    def validated_tweet(self, data):
        ''' Runs a series of checks to determine whether the tweet is valid.
            Also returns the emojis contained in the tweet.
        '''

        # The ordering of the validations matters for speed
        # A tweet that passes all checks is validated for usage

        # Skip incomplete tweets (the text field is expected at index 9)
        if len(data) <= 9:
            return False, []

        text = data[9]

        if self.ignore_retweets and RETWEETS_RE.search(text):
            return False, []

        if self.ignore_url_tweets and URLS_RE.search(text):
            return False, []

        if self.ignore_mention_tweets and MENTION_RE.search(text):
            return False, []

        if self.wanted_emojis is not None:
            uniq_emojis = np.unique(extract_emojis(text, self.wanted_emojis))
            if len(uniq_emojis) == 0:
                return False, []
        else:
            uniq_emojis = []

        if self.non_english_user_set is not None and \
           non_english_user(data[1], self.non_english_user_set):
            return False, []
        return True, uniq_emojis
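
The extract_emojis() helper used here (and in the examples below) is not shown. A minimal sketch of what it might look like, assuming wanted_emojis is a collection of single-character emoji strings; this is an assumption, not the project's actual implementation:

def extract_emojis(text, wanted_emojis):
    # Assumed behavior: collect every character of `text` that appears in
    # wanted_emojis, keeping order and duplicates (callers deduplicate the
    # result with np.unique)
    return [ch for ch in text if ch in wanted_emojis]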
Example #2
def generate_validation_data():
    file_number = 0
    data_val = []
    label_val = []
    while True:
        print("Loading Validation file: " + validation_files[file_number])
        with open(os.path.join(path_to_train,
                               validation_files[file_number])) as json_file:
            data_preprocess = json_file.readlines()
            data_preprocess = [pre_process(x) for x in data_preprocess]
            # Materialize the filter: in Python 3, filter() returns an
            # iterator, which len() and random.shuffle() cannot consume
            data_preprocess = list(filter(lambda y: y != '', data_preprocess))
            random.shuffle(data_preprocess)
            for i in range(len(data_preprocess)):
                text = data_preprocess[i]
                emojis = np.unique(extract_emojis(text, wanted_emojis))
                # Strip the emojis out of the text, then emit one
                # (text, label) pair per unique emoji found
                for emoji in emojis:
                    text = text.replace(emoji, '')
                for emoji in emojis:
                    data_val.append(text)
                    label_val.append(wanted_emojis_index.get_loc(emoji))
                if i % 1024 == 1023 or i == len(data_preprocess) - 1:
                    sequences = tokenizer.texts_to_sequences(data_val)
                    X_val = pad_sequences(sequences,
                                          maxlen=max_sequence_length)
                    # One-hot encode integer labels over the 64 emoji classes
                    Y_val = np.eye(64)[label_val]
                    print("Loaded Validation " + str(len(X_val)) +
                          " records from" + validation_files[file_number])
                    yield (X_val, Y_val)
                    data_val = []
                    label_val = []
        file_number = (file_number + 1) % len(validation_files)
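
A hedged usage sketch: since generate_validation_data() is an infinite generator yielding (X_val, Y_val) batches, it can be consumed with next() for a quick check, or passed as validation data to Keras' fit_generator together with a validation_steps count. This assumes the module-level globals it relies on (validation_files, path_to_train, tokenizer, wanted_emojis, etc.) are already set up:

val_gen = generate_validation_data()
X_batch, Y_batch = next(val_gen)      # one padded batch with one-hot labels
print(X_batch.shape, Y_batch.shape)   # (batch, max_sequence_length), (batch, 64)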
Example #3
def prepare_data_from_excel(path_to_load):
    final_texts = []
    final_labels = []
    data = pd.read_excel(path_to_load)
    content = data["Message"].tolist()
    content = [convert_to_unicode(x) for x in content]
    # Materialize the filter: filter() returns an iterator in Python 3,
    # which cannot be indexed or measured with len()
    content = list(filter(lambda y: y != '', content))
    for text in content:
        emojis = np.unique(extract_emojis(text, wanted_emojis))
        # Strip the emojis from the text, then emit one (text, label)
        # pair per unique emoji found
        for emoji in emojis:
            text = text.replace(emoji, '')
        for emoji in emojis:
            final_texts.append(text)
            final_labels.append(wanted_emojis_index.get_loc(emoji))
    data = prepare_texts(final_texts)
    final_labels = np.eye(64)[final_labels]
    return data, final_labels
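
A hedged usage sketch of the Excel loader: the file name and the compiled Keras model are assumptions for illustration, not part of the source:

X_test, Y_test = prepare_data_from_excel("labeled_messages.xlsx")  # hypothetical file
loss, acc = model.evaluate(X_test, Y_test, batch_size=128)  # `model` is assumed
print("validation accuracy:", acc)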
Example #4
def prepare_data_from_json(file_to_load):
    final_texts = []
    final_labels = []
    print("Loading file: " + file_to_load)
    with open(os.path.join(path_to_train, file_to_load)) as json_file:
        content = json_file.readlines()
        content = [pre_process(x) for x in content]
        # list() is needed here: filter() is lazy in Python 3
        content = list(filter(lambda y: y != '', content))
        for text in content:
            emojis = np.unique(extract_emojis(text, wanted_emojis))
            for emoji in emojis:
                text = text.replace(emoji, '')
            for emoji in emojis:
                final_texts.append(text)
                final_labels.append(wanted_emojis_index.get_loc(emoji))
    data = prepare_texts(final_texts)
    final_labels = np.eye(64)[final_labels]
    return data, final_labels
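
All three loaders one-hot encode their integer labels with np.eye(64)[labels], i.e. they index rows of a 64x64 identity matrix. A minimal worked example of that step:

import numpy as np

labels = [2, 0, 63]
one_hot = np.eye(64)[labels]   # shape (3, 64); row i is all zeros except
                               # for a 1 at column labels[i]
print(one_hot[0][:4])          # [0. 0. 1. 0.]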