def prepare_test_set(self):
        """Load the held-out test split and build a balanced X/y test set.

        Reads ``self.test_df_file``, samples ``TEST_DATASET_SIZE`` positive
        and ``TEST_DATASET_SIZE`` negative rows (balanced classes), and sets:

        - ``self.X_test``: feature matrix with the label, row identifiers
          and excluded feature columns dropped
        - ``self.y_test``: the boolean 'relevant' labels

        Raises ValueError (from ``DataFrame.sample``) if either class has
        fewer than ``TEST_DATASET_SIZE`` rows.
        """
        self.test_df = io.readData(self.test_df_file)

        # Balanced sampling: equal numbers of relevant and non-relevant terms.
        self.pos_test_df = self.test_df[self.test_df['relevant'] == True].sample(KeytermClassifier.TEST_DATASET_SIZE)
        self.neg_test_df = self.test_df[self.test_df['relevant'] == False].sample(KeytermClassifier.TEST_DATASET_SIZE)
        self.selected_test_df = pd.concat([self.pos_test_df, self.neg_test_df])

        # Drop the label, the row identifiers ('doc_url', 'term') and the
        # features excluded from the current model ('df', 'tfidf', 'is_url').
        # Must mirror the columns dropped in prepare_training_set so the
        # train and test feature matrices stay aligned.
        self.X_test = self.selected_test_df.drop(['relevant', 'doc_url', 'term', 'df', 'tfidf', 'is_url'], axis=1)
        self.y_test = self.selected_test_df['relevant']
def create_term_train_test_dataset(global_term_feature_file, extracted_test_terms_file, train_feature_file, test_feature_file):
    """Split the global term-feature dataset into train and test files.

    Documents whose URL appears as a key in the extracted-test-terms JSON
    file become the test set; all remaining rows become the training set.

    :param global_term_feature_file: path to the full feature dataset
    :param extracted_test_terms_file: path to a JSON file whose keys are
        the test document URLs (the values are not used here)
    :param train_feature_file: output path for the training rows
    :param test_feature_file: output path for the test rows
    """
    import ioData as io
    df = io.readData(global_term_feature_file)

    # json.load() no longer accepts an `encoding` argument (removed in
    # Python 3.9); decode the file as UTF-8 at open() time instead.
    with open(extracted_test_terms_file, encoding="utf-8") as fp:
        cvalRes = json.load(fp)

    # Use a set for O(1) membership lookups inside isin().
    test_urls = set(cvalRes.keys())
    test_df = df.loc[df['doc_url'].isin(test_urls)]
    io.writeData(test_df, test_feature_file)

    # Everything not selected for testing becomes training data.
    train_df = df.loc[~df.index.isin(test_df.index)]
    io.writeData(train_df, train_feature_file)
    def prepare_training_set(self):
        """Load the training split and build a balanced X/y training set.

        Reads ``self.train_df_file``, samples ``TRAIN_DATASET_SIZE``
        positive and ``TRAIN_DATASET_SIZE`` negative rows (balanced
        classes), and sets:

        - ``self.X``: feature matrix with the label, row identifiers and
          excluded feature columns dropped
        - ``self.y``: the boolean 'relevant' labels

        Raises ValueError (from ``DataFrame.sample``) if either class has
        fewer than ``TRAIN_DATASET_SIZE`` rows.
        """
        self.train_df = io.readData(self.train_df_file)

        # Balanced sampling: equal numbers of relevant and non-relevant terms.
        self.pos_train_df = self.train_df[self.train_df['relevant'] == True].sample(KeytermClassifier.TRAIN_DATASET_SIZE)
        self.neg_train_df = self.train_df[self.train_df['relevant'] == False].sample(KeytermClassifier.TRAIN_DATASET_SIZE)
        self.selected_train_df = pd.concat([self.pos_train_df, self.neg_train_df])

        # Drop the label, the row identifiers ('doc_url', 'term') and the
        # features excluded from the current model ('df', 'tfidf', 'is_url').
        # Must mirror the columns dropped in prepare_test_set so the train
        # and test feature matrices stay aligned.
        self.X = self.selected_train_df.drop(['relevant', 'doc_url', 'term', 'df', 'tfidf', 'is_url'], axis=1)
        self.y = self.selected_train_df['relevant']
import ioData, codecs

# Export pages 1..19 of the pre-processed dataset as plain text for the
# cValue test, one paragraph per line, pages separated by an END marker.
df = ioData.readData("dataset/preProcData.json")

# `with` guarantees the output file is closed even if processing fails.
with codecs.open("dataset/cValueTest_small.txt", "w", "utf-8") as f:
    print("Starting processing ...")

    for index, content in enumerate(df.textZone[1:20]):
        for par in content:
            # `reduce` is not a builtin in Python 3 (and was never imported
            # here); str.join is the idiomatic, linear-time concatenation.
            par_text = "".join(par)
            f.write(par_text)
            f.write("\n")

        # Marker line separating one page's text from the next.
        f.write("##########END##########")
        f.write("\n")

        # NOTE(review): `index` counts from 0 although the slice starts at
        # row 1 — kept as-is to preserve the original log output.
        print("Done processing page %i" % index)

    print("Done processing!")