コード例 #1
0
ファイル: Ilwar.py プロジェクト: MacLunch/MacLunch
    def _predict(self, json_test):
        """Build the feature frame used for prediction from raw JSON input.

        The input is run through ``self.pre_process`` in non-training mode,
        bag-of-words features are derived from the ``text_pos_sentences``
        column using the vectorizer stored in ``self.text_model``, and the
        raw text columns are dropped afterwards.

        Args:
            json_test: Raw test data, handed unchanged to
                ``self.pre_process``.

        Returns:
            A ``pandas.DataFrame`` of features whose column labels are the
            positional indices ``"0"``, ``"1"``, ... as strings.

        Note:
            This method only prepares features; it does not call the
            classifier itself. ``self.text_model`` must already be set
            (``fit`` assigns it).
        """
        test = self.pre_process(json_test, istrain=False)

        # Only the text bag-of-words features are active. Reuse the vectorizer
        # saved on the instance instead of fitting a new one on the test data.
        bow_vectorizer = BagOfWordsVectorizer()
        bow_vectorizer.set_vectorizer(self.text_model)
        text_features = bow_vectorizer.transform(test["text_pos_sentences"], "text")

        # NOTE(review): concat with axis=1 aligns rows on the index; this
        # assumes text_features shares the index of `test` -- confirm.
        test = pd.concat([test, text_features], axis=1)
        test = test.drop(["text", "text_pos", "text_pos_sentences"], axis=1)

        # Replace the column labels with their positions as strings, the same
        # relabelling `fit` applies to the training frame.
        test.columns = [str(position) for position in range(len(test.columns))]

        return test
コード例 #2
0
ファイル: Ilwar.py プロジェクト: MacLunch/MacLunch
    def fit(self, json_train, n_estimators=10, is_xgb=True):
        """Train the classifier on raw JSON training data.

        The input is run through ``self.pre_process`` in training mode, a
        bag-of-words vectorizer is fitted on the ``text_pos_sentences``
        column, and the resulting features are used to fit either an
        ``XGBClassifier`` or a ``RandomForestClassifier`` against the
        ``istroll`` column.

        Args:
            json_train: Raw training data, handed unchanged to
                ``self.pre_process``.
            n_estimators: Passed as ``n_estimators`` to whichever classifier
                is built.
            is_xgb: When equal to ``False`` a ``RandomForestClassifier`` is
                trained; any other value trains an ``XGBClassifier``.

        Returns:
            None. The fitted vectorizer is stored in ``self.text_model`` and
            the fitted classifier in ``self.model``.

        Side effects:
            Prints the feature column labels and the feature frame shape to
            standard output.
        """
        train = self.pre_process(json_train, istrain=True)

        # Only the text bag-of-words features are active.
        # NOTE(review): 1000 is presumably the vocabulary/feature cap --
        # confirm against BagOfWordsVectorizer.fit.
        bow_vectorizer = BagOfWordsVectorizer()
        bow_vectorizer.fit(train["text_pos_sentences"], 1000)
        text_features = bow_vectorizer.transform(train["text_pos_sentences"], "text")
        # Keep the fitted vectorizer so `_predict` can reuse it.
        self.text_model = bow_vectorizer.get_vectorizer()

        # NOTE(review): concat with axis=1 aligns rows on the index; this
        # assumes text_features shares the index of `train` -- confirm.
        train = pd.concat([train, text_features], axis=1)

        # Split off the target, then drop it together with the raw text
        # columns so that only feature columns remain.
        label = train["istroll"]
        train = train.drop(
            ["istroll", "text", "text_pos", "text_pos_sentences"], axis=1
        )

        print(train.columns)

        # Replace the column labels with their positions as strings, the same
        # relabelling `_predict` applies to the test frame.
        train.columns = [str(position) for position in range(len(train.columns))]

        # Kept as an equality test on purpose: only False (or 0) selects the
        # random forest, so any other value, including None, still means
        # XGBoost exactly as before.
        if is_xgb == False:  # noqa: E712
            self.model = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1)
        else:
            self.model = XGBClassifier(n_estimators=n_estimators, max_depth=10)

        print(train.shape)
        self.model.fit(train, label)