def prepare_test_set(self):
    """Load the test features and draw a balanced sample of positive and
    negative examples."""
    self.test_df = io.readData(self.test_df_file)
    self.pos_test_df = self.test_df[self.test_df['relevant'] == True].sample(KeytermClassifier.TEST_DATASET_SIZE)
    self.neg_test_df = self.test_df[self.test_df['relevant'] == False].sample(KeytermClassifier.TEST_DATASET_SIZE)
    self.selected_test_df = pd.concat([self.pos_test_df, self.neg_test_df])

    # Earlier feature selections, kept for reference:
    # self.X_test = self.selected_test_df.drop(['relevant', 'doc_url', 'term'], axis=1)
    # self.X_test = self.selected_test_df.drop(['relevant', 'doc_url', 'term', 'df', 'tfidf'], axis=1)
    self.X_test = self.selected_test_df.drop(['relevant', 'doc_url', 'term', 'df', 'tfidf', 'is_url'], axis=1)
    self.y_test = self.selected_test_df['relevant']
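# Sanity-check sketch (illustrative, not part of the original class): the
# selected test frame is balanced by construction, with TEST_DATASET_SIZE rows
# per class. The function name is hypothetical.
def check_test_balance(keyterm_classifier):
    keyterm_classifier.prepare_test_set()
    # Expect equal counts for True and False
    print(keyterm_classifier.selected_test_df['relevant'].value_counts())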
def create_term_train_test_dataset(global_term_feature_file, extracted_test_terms_file,
                                   train_feature_file, test_feature_file):
    import json
    import ioData as io

    df = io.readData(global_term_feature_file)

    # The URLs of the held-out documents come from the extracted test terms file
    with open(extracted_test_terms_file, encoding="utf-8") as fp:
        cvalRes = json.load(fp)
    test_urls = set(cvalRes.keys())

    # Rows belonging to held-out documents form the test set ...
    test_df = df.loc[df['doc_url'].isin(test_urls)]
    io.writeData(test_df, test_feature_file)

    # ... and everything else forms the training set
    train_df = df.loc[~df.index.isin(test_df.index)]
    io.writeData(train_df, train_feature_file)
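# Illustrative usage of the split helper above (a sketch; the file names are
# hypothetical placeholders following the dataset/ layout used elsewhere in
# this file):
def example_term_dataset_split():
    create_term_train_test_dataset(
        "dataset/termFeatures.json",        # input: features of all terms
        "dataset/extractedTestTerms.json",  # input: terms of held-out documents
        "dataset/trainFeatures.json",       # output: training feature rows
        "dataset/testFeatures.json")        # output: test feature rows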
def prepare_training_set(self):
    """Load the training features and draw a balanced sample of positive and
    negative examples."""
    self.train_df = io.readData(self.train_df_file)
    self.pos_train_df = self.train_df[self.train_df['relevant'] == True].sample(KeytermClassifier.TRAIN_DATASET_SIZE)
    self.neg_train_df = self.train_df[self.train_df['relevant'] == False].sample(KeytermClassifier.TRAIN_DATASET_SIZE)
    self.selected_train_df = pd.concat([self.pos_train_df, self.neg_train_df])

    # Earlier patsy-based design-matrix construction, kept for reference:
    # self.y, self.X = dmatrices('relevant ~ cvalue + tf + df + tfidf + doc_pos + is_title + is_url + '
    #                            'is_anchor + is_description + is_first_par + is_last_par + is_img_desc',
    #                            self.selected_train_df, return_type="dataframe")
    # self.y = np.ravel(self.y)

    # Earlier feature selections, kept for reference:
    # self.X = self.selected_train_df.drop(['relevant', 'doc_url', 'term'], axis=1)
    # self.X = self.selected_train_df.drop(['relevant', 'doc_url', 'term', 'df', 'tfidf'], axis=1)
    self.X = self.selected_train_df.drop(['relevant', 'doc_url', 'term', 'df', 'tfidf', 'is_url'], axis=1)
    self.y = self.selected_train_df['relevant']
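# A minimal end-to-end sketch (illustrative, not part of the original class):
# fit a model on the balanced training split and score it on the test split.
# LogisticRegression is an assumption here; any estimator exposing the
# scikit-learn fit/score interface would work the same way.
def fit_and_score(keyterm_classifier):
    from sklearn.linear_model import LogisticRegression

    keyterm_classifier.prepare_training_set()
    keyterm_classifier.prepare_test_set()

    model = LogisticRegression(max_iter=1000)
    model.fit(keyterm_classifier.X, keyterm_classifier.y)
    accuracy = model.score(keyterm_classifier.X_test, keyterm_classifier.y_test)
    print("Balanced test accuracy: %.3f" % accuracy)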
import codecs

import ioData

df = ioData.readData("dataset/preProcData.json")
f = codecs.open("dataset/cValueTest_small.txt", "w", "utf-8")

print("Starting processing ...")
# start=1 so the printed index matches the page's position in textZone,
# since the slice begins at the second element
for index, content in enumerate(df.textZone[1:20], start=1):
    for par in content:
        # Concatenate the paragraph's text fragments into a single line
        par_text = "".join(par)
        f.write(par_text)
        f.write("\n")
    # Page delimiter for the downstream C-value extraction
    f.write("##########END##########")
    f.write("\n")
    print("Done processing page %i" % index)
print("Done processing!")
f.close()