Code Example #1
def TkloadingTweetsAndUserInfoData(args, resultTextbox, window):

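    # If the final pickled datasets do not exist yet, build them, either from
    # scratch or from the intermediate "beforeMapToIdx" pickle if that exists.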
    if not os.path.isfile(os.path.join(args.dataset, args.pickle_name)):

        if not os.path.isfile(
                os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)):

            resultTextbox.insert(
                "end",
                ("Loading " +
                 str(os.path.join(args.dataset,
                                  "FullTweetsDataNoOrdered.html")) + ' and ' +
                 str(
                     os.path.join(args.dataset,
                                  "FullExtraInfoDataNoOrdered.csv")) +
                 " to do the Proprocessing\n"))
            window.update_idletasks()

            tweets_df = pd.read_html(
                os.path.join(args.dataset, "FullTweetsDataNoOrdered.html"))
            extraInfo_df = pd.read_csv(
                os.path.join(args.dataset, "FullExtraInfoDataNoOrdered.csv"))
            tweets_df = pd.DataFrame(list(tweets_df[0].iloc[1:][0]))
            tweets_df.columns = ['text']
            df = pd.concat([tweets_df, extraInfo_df], axis=1)
            del tweets_df
            del extraInfo_df

            resultTextbox.insert("end",
                                 ("Dataset size: " + str(len(df)) + "\n"))
            window.update_idletasks()

            def preprocessingInputTextData(colName):
                input = df[colName]
                ps = nltk.stem.PorterStemmer()
                tknzr = TweetTokenizer()
                allText = [i for i in input]
                preprocessedText = [[
                    ps.stem(word) for word in tknzr.tokenize(
                        re.sub(
                            r'\d+', '',
                            re.sub(r"http\S+|www.\S+", matchingURL,
                                   sentence)).lower())
                    if word not in nltk.corpus.stopwords.words('english')
                    and len(word) >= 3
                ] for sentence in allText]
                df[colName] = preprocessedText

            def fillingNullValue(colName):
                if args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MOST_COMMON:
                    # Fill NaN with the most frequent category
                    df[colName] = df[colName].astype('category')
                    df[colName].fillna(
                        df[colName].describe()['top'], inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MEAN:
                    # Fill NaN with the column mean
                    df[colName] = df[colName].astype('float')
                    df[colName].fillna(df[colName].mean(), inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.CERTAIN_VALUE:
                    df[colName] = df[colName].astype('category')
                    df[colName] = df[colName].cat.add_categories(
                        [args.preprocessingStra[colName]['fillingNullValue']])
                    df[colName].fillna(
                        args.preprocessingStra[colName]['fillingNullValue'],
                        inplace=True)

            def TweetsWithUserInfoPreprocessing():
                for colName in args.preprocessingStra.keys():
                    resultTextbox.insert(
                        "end",
                        ("Preprocessing feature: " + str(colName) + "\n"))
                    window.update_idletasks()
                    for step in args.preprocessingStra[colName]['steps']:
                        if step is not None:
                            step(colName)

            ############### Hiding Preprocessing Strategy ###############

            args.preprocessingStra = defaultdict(dict)
            args.preprocessingStra['text']['steps'] = [
                preprocessingInputTextData
            ]
            args.preprocessingStra["numberOfHashtags_c"]['steps'] = [None]
            args.preprocessingStra['favorite_count']['steps'] = [None]
            args.preprocessingStra['retweet_count']['steps'] = [None]
            args.preprocessingStra['possibly_sensitive'] = {
                'fillingNullMethod': filling_method.CERTAIN_VALUE,
                'fillingNullValue': 'UNKNOWN',
                'steps': [fillingNullValue],
            }
            args.preprocessingStra['followers_count']['steps'] = [None]
            args.preprocessingStra['friends_count']['steps'] = [None]
            # args.preprocessingStra['description']= {
            #    'steps': [ fillingNullValue,preprocessingInputTextData],
            #     'fillingNullMethod': filling_method.CERTAIN_VALUE,
            #     'fillingNullValue': 'NULLDescription'
            # }
            args.preprocessingStra['default_profile']['steps'] = [None]
            args.preprocessingStra['default_profile_image']['steps'] = [None]
            args.preprocessingStra['favourites_count']['steps'] = [None]
            args.preprocessingStra['listed_count']['steps'] = [None]
            args.preprocessingStra['statuses_count']['steps'] = [None]
            args.preprocessingStra['verified']['steps'] = [None]

            resultTextbox.insert("end", ('Preprocessing Strategy Set\n'))
            window.update_idletasks()

            #############################################################

            resultTextbox.insert("end", ('Start Preprocessing...\n'))
            window.update_idletasks()
            TweetsWithUserInfoPreprocessing()  # Apply inplace preprocessing
            df = pd.get_dummies(df,
                                drop_first=True,
                                columns=[
                                    'possibly_sensitive', 'default_profile',
                                    'default_profile_image', 'verified'
                                ])

            # resultTextbox.insert("end", ('Spliting Datasets...\n'))
            # window.update_idletasks()
            # X_train, X_test, Y_train, Y_test = train_test_split(df.drop(
            #     'maliciousMark', axis=1), df['maliciousMark'], test_size=args.validation_portion, stratify=df['maliciousMark'],  random_state=args.random_seed)
            # X_validation, X_test, Y_validation, Y_test = train_test_split(
            #     X_test, Y_test, test_size=args.test_portion, stratify=Y_test, random_state=args.random_seed)

            # resultTextbox.insert("end", ('Creating Tweets_text...\n'))
            # window.update_idletasks()
            # tweets_text = nltk.Text(list(itertools.chain(*X_train['text'])))

            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx),
                    "wb") as fp:  # Pickling
                # pickle.dump([X_train, X_validation, X_test,
                #              Y_train, Y_validation, Y_test,  tweets_text], fp)
                pickle.dump(df, fp)
                resultTextbox.insert(
                    "end", ("The Pickle Data beforeMapToIdx Dumped to: " + str(
                        os.path.join(args.dataset,
                                     args.pickle_name_beforeMapToIdx)) + "\n"))
                window.update_idletasks()

        else:
            print("Loading Existing BeforeMapToIdx file for Tweets and User: "******"end", (
                "Loading Existing BeforeMapToIdx file for Tweets and User: "******"\n"))
            window.update_idletasks()
            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx),
                    "rb") as fp:  # Unpickling
                # [X_train, X_validation, X_test,
                #  Y_train, Y_validation, Y_test,  tweets_text] = pickle.load(fp)
                df = pickle.load(fp)

        #################

        resultTextbox.insert("end", ('Spliting Datasets...\n'))
        window.update_idletasks()

        #### A preliminary split to obtain a small (2%) subset ###

        if args.runningOnSmallDataset:
            X_temp, X_train, Y_temp, Y_train = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=0.02,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_train,
                Y_train,
                test_size=args.validation_portion,
                stratify=Y_train,
                random_state=args.random_seed)
            del X_temp
            del Y_temp
        else:
            X_train, X_test, Y_train, Y_test = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=args.validation_portion,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)

        X_validation, X_test, Y_validation, Y_test = train_test_split(
            X_test,
            Y_test,
            test_size=args.test_portion,
            stratify=Y_test,
            random_state=args.random_seed)

        resultTextbox.insert(
            "end",
            ("Dataset Size: " +
             str(len(X_train) + len(X_validation) + len(X_test)) + "\n"))
        resultTextbox.insert("end",
                             ("TrainingSet Size: " + str(len(X_train)) + "\n"))
        resultTextbox.insert(
            "end", ("ValidationSet Size: " + str(len(X_validation)) + "\n"))
        resultTextbox.insert("end",
                             ("TestSet Size: " + str(len(X_test)) + "\n"))
        window.update_idletasks()

        resultTextbox.insert("end", ('Creating Tweets_text...\n'))
        window.update_idletasks()
        tweets_text = nltk.Text(list(itertools.chain(*X_train['text'])))

        ####################

        args.vocab_size = args.vocab_size or len(tweets_text.tokens)
        if args.vocab_size:
            # Keep only the most frequent words, reserving room for the special tokens
            tweets_text.tokens = specialTokenList + \
                [w for w, _ in tweets_text.vocab().most_common(
                    args.vocab_size - len(specialTokenList))]
        else:
            tweets_text.tokens = specialTokenList + tweets_text.tokens
        args.vocab_size = len(tweets_text.tokens)  # update vocab_size to the final vocabulary length

        resultTextbox.insert("end", ('Maping Word To Idx: training set\n'))
        window.update_idletasks()
        X_train['text'] = mapFromWordToIdx(X_train['text'], tweets_text)
        resultTextbox.insert("end", ('Maping Word To Idx: validation set\n'))
        window.update_idletasks()
        X_validation['text'] = mapFromWordToIdx(X_validation['text'],
                                                tweets_text)
        resultTextbox.insert("end", ('Maping Word To Idx: test set\n'))
        window.update_idletasks()
        X_test['text'] = mapFromWordToIdx(X_test['text'], tweets_text)

        resultTextbox.insert("end", ('Creating Torch Training Datasets...\n'))
        window.update_idletasks()

        args.X_train = X_train
        args.Y_train = Y_train
        training_dataset = CreateTweetsWithUserInfoDatatset(
            X_train, list(map(int, list(Y_train))))

        resultTextbox.insert("end",
                             ('Creating Torch Validation Datasets...\n'))
        window.update_idletasks()
        validation_dataset = CreateTweetsWithUserInfoDatatset(
            X_validation, list(map(int, list(Y_validation))))

        resultTextbox.insert("end", ('Creating Torch Test Datasets...\n'))
        window.update_idletasks()
        test_dataset = CreateTweetsWithUserInfoDatatset(
            X_test, list(map(int, list(Y_test))))

        resultTextbox.insert("end", ('Dumping data...\n'))
        window.update_idletasks()

        with open(os.path.join(args.dataset, args.pickle_name),
                  "wb") as fp:  # Pickling
            pickle.dump([
                training_dataset, validation_dataset, test_dataset, tweets_text
            ], fp)
            print("The Pickle Data Dumped to: ",
                  os.path.join(args.dataset, args.pickle_name))
            resultTextbox.insert(
                "end",
                ("The Pickle Data Dumped to: " +
                 str(os.path.join(args.dataset, args.pickle_name)) + "\n"))
            window.update_idletasks()

    else:
        resultTextbox.insert(
            "end", ("Loading Existing File: " +
                    str(os.path.join(args.dataset, args.pickle_name)) + '\n'))
        window.update_idletasks()

        with open(os.path.join(args.dataset, args.pickle_name),
                  "rb") as fp:  # Unpickling
            training_dataset, validation_dataset, test_dataset, tweets_text = pickle.load(
                fp)

    args.vocab_size = len(tweets_text.tokens)
    args.num_extra_info = len(training_dataset[0][1])
    args.num_features = len(training_dataset[0][1]) + 1

    return training_dataset, validation_dataset, test_dataset, tweets_text
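
The snippets above and below assume a number of imports and project-level helpers that are never shown. The following is a minimal sketch of the imports they appear to rely on; the names grouped under utils (matchingURL, specialTokenList, mapFromWordToIdx, filling_method, the dataset classes) are assumptions about where those helpers live, not something confirmed by the source.

# A hedged sketch of the imports these examples appear to assume.
import os
import re
import pickle
import itertools
from collections import defaultdict

import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split

# Project-specific helpers referenced but not defined in the snippets
# (assumed, for illustration only, to live in a utils module):
# from utils import (matchingURL, specialTokenList, mapFromWordToIdx,
#                    filling_method, CreateTweetsWithUserInfoDatatset,
#                    CreateDatatset, preprocessingInputData)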
Code Example #2
def TkloadingTweetsAndUserInfoData(args, resultTextbox, window):
    '''
    This function loads the data and performs the preprocessing
    '''

    # Check whether the pre-processed datasets (training, validation, test) already exist.
    # If they do, load them instead of running the preprocessing again.
    if not os.path.isfile(os.path.join(args.dataset, args.pickle_name)):

        # Check whether the pre-processed df (the overall Pandas DataFrame) exists.
        # If it doesn't, load the original data and run the preprocessing.
        if not os.path.isfile(
                os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)):
            '''
            resultTextbox.insert("end", "String_Here") -> This function for adding a String to the result box
            window.update_idletasks() -> This function make the wondow to update the result box.
            '''

            # Adding the loading information to result box
            resultTextbox.insert(
                "end",
                ("Loading " +
                 str(os.path.join(args.dataset,
                                  "FullTweetsDataNoOrdered.html")) + ' and ' +
                 str(
                     os.path.join(args.dataset,
                                  "FullExtraInfoDataNoOrdered.csv")) +
                 " to do the Proprocessing\n"))
            window.update_idletasks()

            # Load the original dataset, which consists of an HTML file (the tweet text)
            # and a CSV file (other tweet and user information)

            # Load the tweets text
            tweets_df = pd.read_html(
                os.path.join(args.dataset, "FullTweetsDataNoOrdered.html"))
            tweets_df = pd.DataFrame(list(tweets_df[0].iloc[1:][0]))
            tweets_df.columns = ['text']

            # Load other information
            extraInfo_df = pd.read_csv(
                os.path.join(args.dataset, "FullExtraInfoDataNoOrdered.csv"))

            # Concatenate the two loaded dataframes
            df = pd.concat([tweets_df, extraInfo_df], axis=1)

            # Free the source dataframes now that df holds the combined data
            del tweets_df
            del extraInfo_df

            # Show the dataset size in the result box
            resultTextbox.insert("end",
                                 ("Dataset size: " + str(len(df)) + "\n"))
            window.update_idletasks()

            def preprocessingInputTextData(colName):
                '''
                Tokenize, clean and stem the text in the given column
                '''
                input = df[colName]
                ps = nltk.stem.PorterStemmer()  # Init Porter Stemmer
                tknzr = TweetTokenizer()  # Init Tweet Tokenizer
                allText = [i for i in input]

                ## The detailed preprocessing steps are described in the report
                preprocessedText = [[
                    ps.stem(word) for word in tknzr.tokenize(
                        re.sub(
                            r'\d+', '',
                            re.sub(r"http\S+|www.\S+", matchingURL,
                                   sentence)).lower())
                    if word not in nltk.corpus.stopwords.words('english')
                    and len(word) >= 3
                ] for sentence in allText]
                df[colName] = preprocessedText

            def fillingNullValue(colName):
                '''
                Replace NaN values in the given column according to the configured filling method
                '''

                if args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MOST_COMMON:
                    ## replace the NaN with the most common value
                    df[colName] = df[colName].astype('category')
                    df[colName].fillna(
                        df[colName].describe()['top'], inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MEAN:
                    ## replace the NaN with the column mean
                    df[colName] = df[colName].astype('float')
                    df[colName].fillna(df[colName].mean(), inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.CERTAIN_VALUE:
                    ## replace the nan by a certain value
                    df[colName] = df[colName].astype('category')
                    df[colName] = df[colName].cat.add_categories(
                        [args.preprocessingStra[colName]['fillingNullValue']])
                    df[colName].fillna(
                        args.preprocessingStra[colName]['fillingNullValue'],
                        inplace=True)

            def TweetsWithUserInfoPreprocessing():
                '''
                Perform the preprocessing here
                '''

                for colName in args.preprocessingStra.keys():
                    resultTextbox.insert(
                        "end",
                        ("Preprocessing feature: " + str(colName) + "\n"))
                    window.update_idletasks()
                    for step in args.preprocessingStra[colName]['steps']:
                        if step is not None:
                            step(colName)

            ###############  Preprocessing Strategy ###############

            args.preprocessingStra = defaultdict(dict)
            args.preprocessingStra['text']['steps'] = [
                preprocessingInputTextData
            ]
            args.preprocessingStra["numberOfHashtags_c"]['steps'] = [None]
            args.preprocessingStra['favorite_count']['steps'] = [None]
            args.preprocessingStra['retweet_count']['steps'] = [None]
            args.preprocessingStra['possibly_sensitive'] = {
                'fillingNullMethod': filling_method.CERTAIN_VALUE,
                'fillingNullValue': 'UNKNOWN',
                'steps': [fillingNullValue],
            }
            args.preprocessingStra['followers_count']['steps'] = [None]
            args.preprocessingStra['friends_count']['steps'] = [None]
            args.preprocessingStra['default_profile']['steps'] = [None]
            args.preprocessingStra['default_profile_image']['steps'] = [None]
            args.preprocessingStra['favourites_count']['steps'] = [None]
            args.preprocessingStra['listed_count']['steps'] = [None]
            args.preprocessingStra['statuses_count']['steps'] = [None]
            args.preprocessingStra['verified']['steps'] = [None]

            resultTextbox.insert("end", ('Preprocessing Strategy Set\n'))
            window.update_idletasks()

            #############################################################

            resultTextbox.insert("end", ('Start Preprocessing...\n'))
            window.update_idletasks()

            TweetsWithUserInfoPreprocessing()  # Apply inplace preprocessing

            # Get dummy variable
            df = pd.get_dummies(df,
                                drop_first=True,
                                columns=[
                                    'possibly_sensitive', 'default_profile',
                                    'default_profile_image', 'verified'
                                ])

            # Save the preprocessed-df
            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx), "wb") as fp:
                pickle.dump(df, fp)
                resultTextbox.insert(
                    "end", ("The Pickle Data beforeMapToIdx Dumped to: " + str(
                        os.path.join(args.dataset,
                                     args.pickle_name_beforeMapToIdx)) + "\n"))
                window.update_idletasks()

        else:
            # If the preprocessed df already exists, load it.
            print("Loading Existing BeforeMapToIdx file for Tweets and User: ",
                  os.path.join(args.dataset, args.pickle_name_beforeMapToIdx))
            resultTextbox.insert("end", (
                "Loading Existing BeforeMapToIdx file for Tweets and User: " +
                str(os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx)) + "\n"))
            window.update_idletasks()
            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx), "rb") as fp:
                df = pickle.load(fp)

        #################### After having the pre-processed df ####################

        resultTextbox.insert("end", ('Spliting Datasets...\n'))
        window.update_idletasks()

        ## split the df to training, validation and test set.

        if args.runningOnSmallDataset:
            # If the user wants to test the program on a small dataset, do a
            # preliminary split: X_temp (deleted later) keeps 98% of the data,
            # leaving a 2% subset to work with.

            # Preliminary split
            X_temp, X_train, Y_temp, Y_train = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=0.02,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)
            # Get the actual training and test sets.
            X_train, X_test, Y_train, Y_test = train_test_split(
                X_train,
                Y_train,
                test_size=args.validation_portion,
                stratify=Y_train,
                random_state=args.random_seed)

            # delete X_temp and Y_temp
            del X_temp
            del Y_temp
        else:
            # If not running on the small dataset, do the normal data split.
            X_train, X_test, Y_train, Y_test = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=args.validation_portion,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)

        X_validation, X_test, Y_validation, Y_test = train_test_split(
            X_test,
            Y_test,
            test_size=args.test_portion,
            stratify=Y_test,
            random_state=args.random_seed)

        ## Show the datasets' sizes
        resultTextbox.insert(
            "end",
            ("Dataset Size: " +
             str(len(X_train) + len(X_validation) + len(X_test)) + "\n"))
        resultTextbox.insert("end",
                             ("TrainingSet Size: " + str(len(X_train)) + "\n"))
        resultTextbox.insert(
            "end", ("ValidationSet Size: " + str(len(X_validation)) + "\n"))
        resultTextbox.insert("end",
                             ("TestSet Size: " + str(len(X_test)) + "\n"))
        window.update_idletasks()

        resultTextbox.insert("end", ('Creating Tweets_text...\n'))
        window.update_idletasks()

        ## Create an nltk.Text, which will be used as the vocabulary dictionary.
        tweets_text = nltk.Text(list(itertools.chain(*X_train['text'])))

        # If the hyper-parameter vocab_size is set, keep only that many of the most
        # frequent words (low-frequency words are dropped); otherwise keep them all.
        args.vocab_size = args.vocab_size or len(tweets_text.tokens)
        if args.vocab_size:
            tweets_text.tokens = specialTokenList + \
                [w for w, _ in tweets_text.vocab().most_common(
                    args.vocab_size - len(specialTokenList))]
        else:
            tweets_text.tokens = specialTokenList + tweets_text.tokens
        args.vocab_size = len(tweets_text.tokens)  # update vocab_size to the final vocabulary length

        ## Map the words to indices for every dataset
        resultTextbox.insert("end", ('Mapping Word To Idx: training set\n'))
        window.update_idletasks()
        X_train['text'] = mapFromWordToIdx(X_train['text'], tweets_text)
        resultTextbox.insert("end", ('Mapping Word To Idx: validation set\n'))
        window.update_idletasks()
        X_validation['text'] = mapFromWordToIdx(X_validation['text'],
                                                tweets_text)
        resultTextbox.insert("end", ('Mapping Word To Idx: test set\n'))
        window.update_idletasks()
        X_test['text'] = mapFromWordToIdx(X_test['text'], tweets_text)

        resultTextbox.insert("end", ('Creating Torch Training Datasets...\n'))
        window.update_idletasks()

        # args.X_train = X_train
        # args.Y_train = Y_train

        # Create the training, validation and test PyTorch datasets for feeding data
        # into the PyTorch neural network.
        # More details are in the CreateTweetsWithUserInfoDataset function in utils.py
        training_dataset = CreateTweetsWithUserInfoDataset(
            X_train, list(map(int, list(Y_train))))

        resultTextbox.insert("end",
                             ('Creating Torch Validation Datasets...\n'))
        window.update_idletasks()
        validation_dataset = CreateTweetsWithUserInfoDataset(
            X_validation, list(map(int, list(Y_validation))))

        resultTextbox.insert("end", ('Creating Torch Test Datasets...\n'))
        window.update_idletasks()
        test_dataset = CreateTweetsWithUserInfoDataset(
            X_test, list(map(int, list(Y_test))))

        resultTextbox.insert("end", ('Dumping data...\n'))
        window.update_idletasks()

        # Dump the pre-processed datasets
        with open(os.path.join(args.dataset, args.pickle_name),
                  "wb") as fp:  # Pickling
            pickle.dump([
                training_dataset, validation_dataset, test_dataset, tweets_text
            ], fp)
            print("The Pickle Data Dumped to: ",
                  os.path.join(args.dataset, args.pickle_name))
            resultTextbox.insert(
                "end",
                ("The Pickle Data Dumped to: " +
                 str(os.path.join(args.dataset, args.pickle_name)) + "\n"))
            window.update_idletasks()

    else:

        # If the pre-processed datasets exist, load them.
        resultTextbox.insert(
            "end", ("Loading Existing File: " +
                    str(os.path.join(args.dataset, args.pickle_name)) + '\n'))
        window.update_idletasks()

        with open(os.path.join(args.dataset, args.pickle_name),
                  "rb") as fp:  # Unpickling
            training_dataset, validation_dataset, test_dataset, tweets_text = pickle.load(
                fp)

    ## Some dataset hyper-parameters that will be used later
    args.vocab_size = len(tweets_text.tokens)
    args.num_extra_info = len(training_dataset[0][1])
    args.num_features = len(training_dataset[0][1]) + 1

    # return the loaded or generated dataset.
    return training_dataset, validation_dataset, test_dataset, tweets_text
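
mapFromWordToIdx and specialTokenList are used by every example here but defined elsewhere in the project. The sketch below shows one plausible shape for them, inferred only from how they are called (a token-to-index lookup against tweets_text.tokens with an out-of-vocabulary fallback); the token names and the fallback behaviour are assumptions, not the project's confirmed implementation.

# Hypothetical sketch of the vocabulary helpers assumed by these examples.
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
specialTokenList = [PAD_TOKEN, UNK_TOKEN]  # assumed special tokens


def mapFromWordToIdx(textColumn, text):
    """Map each tokenized sentence to a list of vocabulary indices.

    `text` is the nltk.Text whose `tokens` define the vocabulary; words
    outside the (possibly truncated) vocabulary fall back to <UNK>.
    """
    word2idx = {w: i for i, w in enumerate(text.tokens)}
    unk_idx = word2idx[UNK_TOKEN]
    return [[word2idx.get(w, unk_idx) for w in sentence]
            for sentence in textColumn]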
Code Example #3
def TkloadingData(args, resultTextbox, window):

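    # Same two-level caching as the loaders above: reuse the final pickle if it
    # exists, otherwise fall back to the "beforeMapToIdx" pickle or the raw files.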
    if not os.path.isfile(os.path.join(args.dataset, args.pickle_name)):

        if not os.path.isfile(
                os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)):

            resultTextbox.insert(
                "end", ("Loading Origin Data and do the Proprocessing\n"))
            window.update_idletasks()

            if args.dataset == "HSpam14":
                resultTextbox.insert("end", ("Loading HSpam14 dataset\n"))
                df = pd.read_html(
                    os.path.join(args.dataset,
                                 "FullDataFromSQLHSpam14.html"))[0].iloc[1:, :]
                df.columns = ['text', 'maliciousMark']
            elif args.dataset == "Honeypot":
                df_Nonspammer = pd.read_csv("./Honeypot/nonspam_tweets.csv",
                                            encoding="ISO-8859-1")[[
                                                'text', 'maliciousMark'
                                            ]]
                df_Spammer = pd.read_csv("./Honeypot/spam_tweets.csv",
                                         encoding="ISO-8859-1")[[
                                             'text', 'maliciousMark'
                                         ]]
                df = pd.concat([df_Nonspammer, df_Spammer])
                del df_Nonspammer, df_Spammer
                resultTextbox.insert("end", ("Loading Honeypot dataset\n"))
            else:
                resultTextbox.insert(
                    "end",
                    ("Please input a valid dataset name: HSpam14, Honeypot\n"))
                raise ValueError

            window.update_idletasks()
            resultTextbox.insert("end", ("Data Splitation\n"))
            window.update_idletasks()
            X_train, X_test, Y_train, Y_test = train_test_split(
                df['text'],
                df['maliciousMark'],
                test_size=args.validation_portion,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)
            X_validation, X_test, Y_validation, Y_test = train_test_split(
                X_test,
                Y_test,
                test_size=args.test_portion,
                stratify=Y_test,
                random_state=args.random_seed)

            resultTextbox.insert(
                "end",
                ("Number of Training Data: " + str(len(X_train)) + "\n"))
            resultTextbox.insert("end", ("Number of Validation Data: " +
                                         str(len(X_validation)) + "\n"))
            resultTextbox.insert(
                "end", ("Number of Test Data: " + str(len(X_test)) + "\n"))
            window.update_idletasks()

            resultTextbox.insert("end", ("Preprocessing X_train\n"))
            window.update_idletasks()

            X_train = preprocessingInputData(X_train)

            resultTextbox.insert("end", ("Preprocessing X_validation\n"))
            window.update_idletasks()

            X_validation = preprocessingInputData(X_validation)

            resultTextbox.insert("end", ("Preprocessing X_test\n"))
            window.update_idletasks()

            X_test = preprocessingInputData(X_test)

            resultTextbox.insert("end", ("Generating text\n"))
            window.update_idletasks()

            # Preparing the dictionary
            text = nltk.Text(list(itertools.chain(*X_train)))

            resultTextbox.insert(
                "end",
                ("Original Vocab Size: " + str(len(text.tokens)) + "\n"))
            window.update_idletasks()

            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx),
                    "wb") as fp:  # Pickling
                pickle.dump([
                    X_train, X_validation, X_test, Y_train, Y_validation,
                    Y_test, text
                ], fp)
                resultTextbox.insert(
                    "end", ("The Pickle Data beforeMapToIdx Dumped to:" + str(
                        os.path.join(args.dataset,
                                     args.pickle_name_beforeMapToIdx)) + "\n"))
                window.update_idletasks()

        else:
            resultTextbox.insert(
                "end", ("Loading Existing BeforeMapToIdx file: " + str(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx)) + "\n"))
            window.update_idletasks()
            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx),
                    "rb") as fp:  # Unpickling
                [
                    X_train, X_validation, X_test, Y_train, Y_validation,
                    Y_test, text
                ] = pickle.load(fp)

        # The vocab_size will start to affect the data from here
        args.vocab_size = args.vocab_size or len(text.tokens)
        if args.vocab_size:
            # Keep only the most frequent words, reserving room for the special tokens
            text.tokens = specialTokenList + \
                [w for w, _ in text.vocab().most_common(
                    args.vocab_size - len(specialTokenList))]
        else:
            text.tokens = specialTokenList + text.tokens
        args.vocab_size = len(text.tokens)  # update vocab_size to the final vocabulary length

        resultTextbox.insert("end", ("Generating Datasets\n"))

        resultTextbox.insert("end", ("Training set map to Idx\n"))
        window.update_idletasks()

        training_dataset = CreateDatatset(X_train,
                                          mapFromWordToIdx(X_train, text),
                                          list(map(int, list(Y_train))))

        resultTextbox.insert("end", ("Validation set map to Idx\n"))
        window.update_idletasks()

        validation_dataset = CreateDatatset(
            X_validation, mapFromWordToIdx(X_validation, text),
            list(map(int, list(Y_validation))))

        resultTextbox.insert("end", ("Test set map to Idx\n"))
        window.update_idletasks()

        test_dataset = CreateDatatset(X_test, mapFromWordToIdx(X_test, text),
                                      list(map(int, list(Y_test))))

        resultTextbox.insert("end", ("Dumping Data\n"))
        window.update_idletasks()

        with open(os.path.join(args.dataset, args.pickle_name),
                  "wb") as fp:  # Pickling
            pickle.dump(
                [training_dataset, validation_dataset, test_dataset, text], fp)
            resultTextbox.insert("end", ("The Pickle Data Dumped\n"))
            window.update_idletasks()

    else:
        resultTextbox.insert(
            "end", ("Loading Existing File: " + args.pickle_name + "\n"))
        window.update_idletasks()
        with open(os.path.join(args.dataset, args.pickle_name),
                  "rb") as fp:  # Unpickling
            training_dataset, validation_dataset, test_dataset, text = pickle.load(
                fp)
            args.vocab_size = len(text.tokens)

    return training_dataset, validation_dataset, test_dataset, text
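
Several of the examples compare against a filling_method enumeration and pass a matchingURL callback to re.sub when cleaning tweet text. Neither is defined in the snippets; a hedged sketch of what they might look like follows (the member names match the comparisons above, while the URL placeholder string is purely an assumption).

# Hypothetical definitions for helpers referenced but not shown above.
from enum import Enum, auto


class filling_method(Enum):
    MOST_COMMON = auto()    # fill NaN with the most frequent value
    MEAN = auto()           # fill NaN with the column mean
    CERTAIN_VALUE = auto()  # fill NaN with a fixed, user-supplied value


def matchingURL(match):
    # Passed as the repl argument of re.sub(), so it receives a match object;
    # every matched URL is collapsed to a single placeholder token.
    return '<URL>'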
Code Example #4
def loadingTweetsAndUserInfoData(args):

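    # Console (non-Tk) variant of the loaders above: same caching and
    # preprocessing pipeline, but progress is reported via print() instead of
    # the Tkinter result box.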
    if not os.path.isfile(os.path.join(args.dataset, args.pickle_name)):

        if not os.path.isfile(
                os.path.join(args.dataset, args.pickle_name_beforeMapToIdx)):

            print(
                "Loading ",
                os.path.join(args.dataset,
                             "FullTweetsWithUserInfoSelected.html"),
                " to do the Proprocessing")

            df = pd.read_html(
                os.path.join(args.dataset,
                             "FullTweetsWithUserInfoSelected.html"))

            columnNames = list(df[0].loc[0])
            df = df[0].iloc[1:, :]
            df.columns = columnNames

            def preprocessingInputTextData(colName):
                input = df[colName]
                ps = nltk.stem.PorterStemmer()
                tknzr = TweetTokenizer()
                allText = [i for i in input]
                preprocessedText = [[
                    ps.stem(word) for word in tknzr.tokenize(
                        re.sub(
                            r'\d+', '',
                            re.sub(r"http\S+|www.\S+", matchingURL,
                                   sentence)).lower())
                    if word not in nltk.corpus.stopwords.words('english')
                    and len(word) >= 3
                ] for sentence in allText]
                df[colName] = preprocessedText

            def fillingNullValue(colName):
                if args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MOST_COMMON:
                    # Fill NaN with the most frequent category
                    df[colName] = df[colName].astype('category')
                    df[colName].fillna(
                        df[colName].describe()['top'], inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.MEAN:
                    # Fill NaN with the column mean
                    df[colName] = df[colName].astype('float')
                    df[colName].fillna(df[colName].mean(), inplace=True)
                elif args.preprocessingStra[colName][
                        'fillingNullMethod'] == filling_method.CERTAIN_VALUE:
                    df[colName] = df[colName].astype('category')
                    df[colName] = df[colName].cat.add_categories(
                        [args.preprocessingStra[colName]['fillingNullValue']])
                    df[colName].fillna(
                        args.preprocessingStra[colName]['fillingNullValue'],
                        inplace=True)

            def TweetsWithUserInfoPreprocessing():
                for colName in args.preprocessingStra.keys():
                    print("Preprocessing column: ", colName)
                    for step in args.preprocessingStra[colName]['steps']:
                        if step is not None:
                            step(colName)

            ############### Hiding Preprocessing Strategy ###############

            args.preprocessingStra = defaultdict(dict)
            args.preprocessingStra['text']['steps'] = [
                preprocessingInputTextData
            ]
            args.preprocessingStra["numberOfHashtags_c"]['steps'] = [None]
            args.preprocessingStra['favorite_count']['steps'] = [None]
            args.preprocessingStra['retweet_count']['steps'] = [None]
            args.preprocessingStra['possibly_sensitive'] = {
                'fillingNullMethod': filling_method.CERTAIN_VALUE,
                'fillingNullValue': 'UNKNOWN',
                'steps': [fillingNullValue],
            }
            args.preprocessingStra['followers_count']['steps'] = [None]
            args.preprocessingStra['friends_count']['steps'] = [None]
            # args.preprocessingStra['description']= {
            #    'steps': [ fillingNullValue,preprocessingInputTextData],
            #     'fillingNullMethod': filling_method.CERTAIN_VALUE,
            #     'fillingNullValue': 'NULLDescription'
            # }
            args.preprocessingStra['default_profile']['steps'] = [None]
            args.preprocessingStra['default_profile_image']['steps'] = [None]
            args.preprocessingStra['favourites_count']['steps'] = [None]
            args.preprocessingStra['listed_count']['steps'] = [None]
            args.preprocessingStra['statuses_count']['steps'] = [None]
            args.preprocessingStra['verified']['steps'] = [None]

            print('Preprocessing Strategy Set')

            #############################################################

            print('Start Preprocessing...')
            TweetsWithUserInfoPreprocessing()  # Apply inplace preprocessing
            df = pd.get_dummies(df,
                                drop_first=True,
                                columns=[
                                    'possibly_sensitive', 'default_profile',
                                    'default_profile_image', 'verified'
                                ])

            print('Splitting Datasets...')
            X_train, X_test, Y_train, Y_test = train_test_split(
                df.drop('maliciousMark', axis=1),
                df['maliciousMark'],
                test_size=args.validation_portion,
                stratify=df['maliciousMark'],
                random_state=args.random_seed)
            X_validation, X_test, Y_validation, Y_test = train_test_split(
                X_test,
                Y_test,
                test_size=args.test_portion,
                stratify=Y_test,
                random_state=args.random_seed)

            print('Creating Tweets_text')
            tweets_text = nltk.Text(list(itertools.chain(*X_train['text'])))

            with open(
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx),
                    "wb") as fp:  # Pickling
                pickle.dump([
                    X_train, X_validation, X_test, Y_train, Y_validation,
                    Y_test, tweets_text
                ], fp)
                print(
                    "The Pickle Data beforeMapToIdx Dumped to:",
                    os.path.join(args.dataset,
                                 args.pickle_name_beforeMapToIdx))

        else:
            print("Loading Existing BeforeMapToIdx file for Tweets and User: "******"rb") as fp:  # Unpickling
                [
                    X_train, X_validation, X_test, Y_train, Y_validation,
                    Y_test, tweets_text
                ] = pickle.load(fp)

        args.vocab_size = args.vocab_size or len(tweets_text.tokens)
        if args.vocab_size:
            # Keep only the most frequent words, reserving room for the special tokens
            tweets_text.tokens = specialTokenList + \
                [w for w, _ in tweets_text.vocab().most_common(
                    args.vocab_size - len(specialTokenList))]
        else:
            tweets_text.tokens = specialTokenList + tweets_text.tokens
        args.vocab_size = len(tweets_text.tokens)  # update vocab_size to the final vocabulary length

        print("Maping Word To Idx: training set")
        X_train['text'] = mapFromWordToIdx(X_train['text'], tweets_text)
        print("Maping Word To Idx: validation set")
        X_validation['text'] = mapFromWordToIdx(X_validation['text'],
                                                tweets_text)
        print("Maping Word To Idx: test set")
        X_test['text'] = mapFromWordToIdx(X_test['text'], tweets_text)

        print("Creating Torch Datasets...")
        training_dataset = CreateTweetsWithUserInfoDatatset(
            X_train, list(map(int, list(Y_train))))
        validation_dataset = CreateTweetsWithUserInfoDatatset(
            X_validation, list(map(int, list(Y_validation))))
        test_dataset = CreateTweetsWithUserInfoDatatset(
            X_test, list(map(int, list(Y_test))))

        print("Dumping Data")

        with open(os.path.join(args.dataset, args.pickle_name),
                  "wb") as fp:  # Pickling
            pickle.dump([
                training_dataset, validation_dataset, test_dataset, tweets_text
            ], fp)
            print("The Pickle Data Dumped to: ",
                  os.path.join(args.dataset, args.pickle_name))

    else:
        print("Loading Existing File: ",
              os.path.join(args.dataset, args.pickle_name))
        with open(os.path.join(args.dataset, args.pickle_name),
                  "rb") as fp:  # Unpickling
            training_dataset, validation_dataset, test_dataset, tweets_text = pickle.load(
                fp)

    args.vocab_size = len(tweets_text.tokens)
    args.num_extra_info = len(training_dataset[0][1])
    args.num_features = len(training_dataset[0][1]) + 1

    return training_dataset, validation_dataset, test_dataset, tweets_text
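
CreateTweetsWithUserInfoDatatset wraps the split dataframes so a PyTorch DataLoader can consume them; its implementation is not part of these snippets. Below is a rough sketch of the kind of Dataset class the calls imply, based only on how items are indexed above (item[1] is the extra-info vector used for num_extra_info), so every detail is an assumption rather than the project's actual class.

# Hypothetical torch Dataset matching how the examples index the result:
# item[0] -> token-index sequence, item[1] -> extra user-info features,
# item[2] -> label.
import torch
from torch.utils.data import Dataset


class CreateTweetsWithUserInfoDatatset(Dataset):
    def __init__(self, X, Y):
        self.text = list(X['text'])                  # lists of word indices
        self.extra = X.drop('text', axis=1).values.astype('float32')
        self.labels = list(Y)                        # integer labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (torch.tensor(self.text[idx], dtype=torch.long),
                torch.tensor(self.extra[idx]),
                self.labels[idx])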