def train(datafile=paths.get_dataset_path(name),
          model_file=paths.get_model_path(name)):
    data = pd.read_csv(datafile)
    X, Y = data_prep(data, y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
    clf = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='word', ngram_range=(1, 2))),
        ('clf', ComplementNB(norm=True)),
    ])
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)
    # Note: the report is computed over the full dataset, not just the held-out split.
    report = classification_report(Y, clf.predict(X))
    print(report)
    with open(paths.result_path + name + '_CNB_report.txt', "w") as r:
        r.write(report)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
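# Minimal usage sketch (an assumed helper, not part of the original module):
# the pickled object is the full sklearn Pipeline, so it can be fed raw
# heading strings directly.
def classify_headings(texts, model_file=paths.get_model_path(name)):
    with open(model_file, "rb") as f:
        clf = pickle.load(f)
    return clf.predict(texts)  # the TfidfVectorizer step handles tokenisation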
def sanitise_datasets():
    # Remove reports whose title or type marks them as bad training data.
    rtitle = 'QGMJ'
    rtype = 'WELCOM'
    ref = pd.read_excel(
        'C:/Users/andraszeka/Documents/gsq-boreholes/investigations/QDEX_metada_export.xlsx',
        dtype={'REPNO': int})
    bad = ref.loc[ref.RTITLE.str.contains(rtitle) | ref.RTYPE.str.contains(rtype)]
    bad_docids = bad.REPNO.values
    # page id and page extraction datasets don't have a DocID attribute
    names = ['marginal_lines', 'toc', 'fig', 'heading_id_toc', 'heading_id_intext']
    datasets = [paths.get_dataset_path(name) for name in names]
    for dataset in datasets:
        if os.path.exists(dataset):
            try:
                data = pd.read_csv(dataset, dtype={'DocID': int})
            except ValueError:
                # DocID has NaNs, so it can't be read as int directly
                data = pd.read_csv(dataset)
                data.dropna(subset=['DocID'], inplace=True)
                data.DocID = data.DocID.astype(int)
            prelen = data.shape[0]
            data = data.loc[~data.DocID.isin(bad_docids)]
            postlen = data.shape[0]
            data.to_csv(dataset, index=False)
            print('Removed ', str(prelen - postlen), ' bad values from ', dataset)
def train(n_queries=10, mode=paths.dataset_version, spec_name=name):
    datafile = paths.get_dataset_path(name, mode)
    model_file = paths.get_model_path(spec_name, mode)
    data = pd.read_csv(datafile)
    if 'no_toc' in model_file:
        # the no-TOC variant can't use TOC-derived features
        limit_cols.extend(['MatchesHeading', 'MatchesType'])
    estimator = Pipeline([
        ('text', ColumnTransformer(
            # Column index 1 must correspond to the 'Text' column; an integer
            # index is used because the active learner passes numpy arrays,
            # not DataFrames.
            [('cnb', Text2CNBPrediction(), 1)],
            remainder="passthrough")),
        ('forest', RandomForestClassifier()),
    ], verbose=True)
    accuracy, learner = active_learning.train(data, y_column, n_queries,
                                              estimator, datafile, limit_cols)
    print(accuracy)
    with open(model_file, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
def train(n_queries=10, mode='boreholes'):
    datafile = paths.get_dataset_path(name, mode)
    df = pd.read_csv(datafile)
    df = df.loc[df['Content'] != '[]']  # drop rows with empty table content
    clf = Pipeline([
        ('list2str', FunctionTransformer(concat_tables)),
        # min_df discourages overfitting to rare terms
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2), min_df=0.0025)),
        ('cnb', ComplementNB(alpha=0.2)),
    ], verbose=True)
    accuracy, learner = active_learning.train(df, y_column, n_queries, clf,
                                              datafile, limit_cols=limit_cols,
                                              mode=mode)
    model_loc = paths.get_model_path(name, mode)
    with open(model_loc, "wb") as file:
        pickle.dump(learner, file)
    return learner
def create_dataset():
    sourcefile = paths.get_dataset_path('heading_id_intext')
    df = pd.read_csv(sourcefile)
    df = df.loc[df['Heading'] > 0]
    df = df.drop(columns=['Heading', 'MatchesHeading', 'MatchesType', 'MatchesI'])
    df['HeadingClass'] = ''
    return df
def train(n_queries=10, mode=paths.dataset_version):
    # define these here because mode may be 'production'
    datafile = paths.get_dataset_path(name, mode)
    model_path = paths.get_model_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data, y_column, n_queries,
                                              estimator, datafile,
                                              limit_cols=limit_cols)
    with open(model_path, "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
def save_dataset(df, name):
    path = paths.get_dataset_path(name)
    version_dir = os.path.join(paths.dataset_path, paths.dataset_version)
    if not os.path.exists(version_dir):
        os.mkdir(version_dir)
    if not os.path.exists(path):
        df.to_csv(path, index=False)
    else:
        print('Dataset already exists here. To prevent overwriting annotation, '
              'delete it manually first.')
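# Example call (illustrative, mirroring create_training_sets_pt2 below):
#   save_dataset(heading_id_intext.create_dataset(), 'heading_id_intext')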
def create_training_sets_pt2():
    proc_df = heading_id_toc.pre_process_id_dataset(
        datafile=paths.get_dataset_path('heading_id_toc'))
    save_dataset(proc_df, 'processed_heading_id_toc')
    # page_id dataset creation is currently disabled
    heading_id_intext_df = heading_id_intext.create_dataset()
    save_dataset(heading_id_intext_df, 'heading_id_intext')
def create_dataset():
    sourcefile = paths.get_dataset_path('marginal_lines')
    texts = pd.read_csv(sourcefile, usecols=['Text', 'Marginal'])
    texts = texts.loc[texts['Marginal'] > 0]
    new_texts = pd.DataFrame(columns=columns)
    new_texts['original'] = texts['Text']
    new_texts['transformed'] = texts.Text.apply(transform_text)
    new_texts['tag'] = None
    return new_texts
def run_model(mode=paths.production):
    nn = NeuralNetwork()
    model_loc = paths.get_model_path(name, mode=mode)
    nn.load_model_from_file(model_loc=model_loc)
    df = pd.read_csv(paths.get_dataset_path(name, mode=mode), usecols=['original'])
    # Predict on the dataset itself: p and r must align with df's row index
    # for the loop below to print matching predictions.
    p, r = nn.predict(df.original)
    for i, row in df.iterrows():
        print(row.original, ', ', p[i], ', ', r[i])
def create_dataset():
    df = pd.DataFrame(columns=columns)
    pageinfos = sorted(glob.glob('training/restructpageinfo/*.json'))
    for pagesinfo in pageinfos:
        with open(pagesinfo) as f:
            pi = json.load(f)
        # os.path.basename is portable, unlike splitting on backslashes
        docid = os.path.basename(pagesinfo).replace('_1_restructpageinfo.json', '')
        docset = write_to_dataset(pi, docid)
        pgdf = pd.DataFrame(data=docset, columns=columns)
        df = df.append(pgdf, ignore_index=True)
    prev_dataset = paths.get_dataset_path(name, paths.production)
    df = mlh.add_legacy_y(prev_dataset, df, y_column)
    return df
def train(n_queries=10, mode=paths.dataset_version):
    datafile = paths.get_dataset_path(name, mode)
    data = pd.read_csv(datafile)
    accuracy, learner = active_learning.train(data, y_column, n_queries,
                                              estimator, datafile, mode=mode)
    if isinstance(learner, tree.DecisionTreeClassifier):
        tree.plot_tree(learner, feature_names=include_cols, class_names=True)
        plt.show()
    with open(paths.get_model_path(name, mode), "wb") as file:
        pickle.dump(learner, file)
    print("End of training stage. Re-run to train again")
    return accuracy
def train(n_queries=10, mode=paths.dataset_version):
    datafile = paths.get_dataset_path(name, mode)
    if not os.path.exists(datafile):
        data = create_dataset()
        data.to_csv(datafile, index=False)
    else:
        data = pd.read_csv(datafile)
    clf = RandomForestClassifier()
    accuracy, clf = al.train(data, y_column, n_queries, clf, datafile, limited_cols)
    print(accuracy)
    model_file = paths.get_model_path(name, mode)
    with open(model_file, "wb") as file:
        pickle.dump(clf, file)
def train(self, n_queries=10, mode=paths.dataset_version):
    file = paths.get_dataset_path(name, mode)
    df = pd.read_csv(file)
    self.max_words, self.max_len = check_maxlens(df)
    lstm = KerasClassifier(build_fn=self.LSTM, batch_size=self.batch_size,
                           epochs=self.epochs, validation_split=0.2)
    estimator = Pipeline([
        ('transform_text', FunctionTransformer(transform_text_wrapper)),
        ('transform2', Text2Seq(classes=2)),
        ('lstm', lstm),
    ], verbose=True)
    accuracy, learner = active_learning.train(df, y_column, n_queries,
                                              estimator, file,
                                              limit_cols=limit_cols)
    self.model = learner
    self.model_loc = paths.get_model_path(name, mode)
    with open(self.model_loc, "wb") as f:
        pickle.dump(self.model, f)
def automatically_tag(type, classification_function, y_column,
                      mode=paths.dataset_version):
    source = paths.get_dataset_path(type, mode)  # e.g. 'toc'
    df = pd.read_csv(source)
    df = df.reset_index(drop=True)
    # a mode parameter could be added if this is ever used on the production set
    new_tags = classification_function(df, masked=False)
    # Re-tag rows that were previously auto-tagged, have no TagMethod yet, or
    # have no y value. `x != x` is true only for NaN, so it doubles as isna().
    idx = df.loc[(df['TagMethod'] == 'auto')
                 | (df['TagMethod'] != df['TagMethod'])
                 | (df[y_column] != df[y_column])].index.values
    df.loc[idx, y_column] = new_tags.loc[idx]
    df.loc[idx, 'TagMethod'] = 'auto'
    print(len(idx), " automatically tagged")
    if 'proba' in df.columns:
        df = df.drop(columns=['proba'])
    df.to_csv(source, index=False)
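# Illustration of the NaN trick used above (assumed example data): in pandas,
# NaN compares unequal to itself, so `s != s` is equivalent to `s.isna()`.
#   s = pd.Series(['auto', None, 'manual'])
#   (s != s).tolist()    # [False, True, False]
#   s.isna().tolist()    # [False, True, False]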
def train(n_queries=10, mode=paths.dataset_version):
    datafile = paths.get_dataset_path('page_id', mode)
    if not os.path.exists(datafile):
        df = create_dataset()
        df.to_csv(datafile, index=False)
    nn = NeuralNetwork()
    nn.train(n_queries=n_queries, mode=mode)
def create_dataset(datafile=paths.get_dataset_path(name), docid=False):
    sourcefile = paths.get_dataset_path('marginal_lines')
    df = pd.read_csv(sourcefile, dtype={'DocID': int, 'PageNum': int,
                                        'LineNum': int, 'Heading': int})
    if docid:
        df = df.loc[df['DocID'] == float(docid)]
    df = df.drop(['ContainsTab', 'ContainsPage'], axis=1)
    # keep only non-marginal lines (Marginal == 0 or NaN), then drop the column
    df = df.loc[(df.Marginal == 0) | (df.Marginal != df.Marginal)]
    df = df.drop(['Marginal'], axis=1)
    # find all the TOC pages and remove their lines from the dataset
    # (fig-page removal is currently disabled)
    toc_dataset = pd.read_csv(paths.get_dataset_path('toc'))
    tocs = toc_dataset.loc[toc_dataset.TOCPage == 1]
    toc_tuples = [(id, page) for id, page in zip(tocs.DocID, tocs.PageNum)]
    to_drop = []
    for i, row in df.iterrows():
        if (row.DocID, row.PageNum) in toc_tuples:
            to_drop.append(i)
    df = df.drop(index=to_drop)
    # recompute ContainsNum with the simpler contains_num() regex
    df['ContainsNum'] = df.Text.apply(contains_num)
    df.dropna(subset=['Text'], inplace=True)  # remove NaN text lines
    df['WordCount'] = df.Text.apply(lambda x: len(x.split()))
    # compare each line against the processed TOC headings for its document
    proc_df = pd.read_csv(paths.get_dataset_path('proc_heading_id_toc'))
    # Heading > 0 excludes NaN rows; .copy() avoids a SettingWithCopyWarning
    proc_head_df = proc_df.loc[proc_df.Heading > 0].copy()
    proc_head_df['Text'] = proc_head_df.apply(
        lambda x: str(x.SectionPrefix) + ' ' + x.SectionText, axis=1)
    series_mh = pd.Series(dtype=float)
    series_mt = pd.Series(dtype=float)
    series_mi = pd.Series(dtype=float)
    for id in df.DocID.unique():
        doc_toc = proc_head_df.loc[proc_head_df.DocID == float(id)]
        df_doc = df.loc[df.DocID == float(id)]
        matches_heading, matches_type, matches_i = compare_lines2headings(
            df_doc.Text, doc_toc)
        print(len(matches_heading) == df_doc.shape[0], id)
        series_mh = series_mh.append(pd.Series(matches_heading), ignore_index=True)
        series_mt = series_mt.append(pd.Series(matches_type), ignore_index=True)
        series_mi = series_mi.append(pd.Series(matches_i), ignore_index=True)
    df['MatchesHeading'], df['MatchesType'], df['MatchesI'] = \
        series_mh, series_mt, series_mi
    df['TagMethod'] = None
    df[y_column] = None
    prev_dataset = paths.dataset_path + 'heading_id_intext_dataset.csv'
    df = mlh.add_legacy_y(prev_dataset, df, y_column, line=True)
    if not docid:
        df.to_csv(datafile, index=False)
    return df
import pickle
import re

from sklearn import ensemble

import paths
from report import active_learning, machine_learning_helper as mlh

name = 'marginal_lines'
y_column = 'Marginal'
columns = ['DocID', 'PageNum', 'LineNum', 'NormedLineNum', 'Text',
           'Words2Width', 'WordsWidth', 'Width', 'Height', 'Left', 'Top',
           'ContainsNum', 'ContainsTab', 'ContainsPage', 'Centrality',
           y_column, 'TagMethod']
limit_cols = ['DocID', 'Text', 'LineNum']
include_cols = ['PageNum', 'NormedLineNum', 'Words2Width', 'WordsWidth',
                'Width', 'Height', 'Left', 'Top', 'ContainsNum',
                'ContainsTab', 'ContainsPage', 'Centrality']
estimator = ensemble.RandomForestClassifier()
data_path = paths.get_dataset_path(name)
model_path = paths.get_model_path(name)


def contains_num(string):
    # flag standalone numbers only (bounded by whitespace or string edges)
    if re.search(r'(\s|^)[0-9]+(\s|$)', string):
        return 1
    return 0


def contains_tab(string):
    if re.search(r'\t', string):
        return 1
    return 0
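# Expected behaviour of the helpers above (illustrative examples):
#   contains_num('page 12')  -> 1   standalone number, bounded by a space
#   contains_num('epm3424')  -> 0   digits fused to letters don't match
#   contains_tab('a\tb')     -> 1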
def create_dataset(ids=False, save=True, docids_only=False, training=True):
    if ids:
        save = False
    if save:
        dataset = paths.get_dataset_path('tables', 'boreholes')
        dataset = dataset.split('../')[1]
        ids = paths.get_files_from_path(type='tables', training=training)
    cols = ['DocID', 'TableNum', 'Content', 'FullTable']
    all_columns = pd.DataFrame(columns=cols)
    if docids_only:
        new_ids = []
        for id in ids:
            i = paths.get_files_from_path('tables', one_docid=id, training=training)
            new_ids.extend(i)
        ids = new_ids
    for id in ids:
        docid, file_num = id[0], id[1]
        tables = get_tables(docid, file_num=file_num, training=training)
        full_tables = []
        for table in tables:
            t = table.to_numpy().astype(str)
            t = np.insert(t, 0, table.columns.values, 0)  # prepend header row
            full_tables.append(t)
        tables_values = [list(table.columns.values) for table in tables]
        for t, i in zip(tables, range(len(tables))):
            for j, row in t.iterrows():
                tables_values[i] = np.concatenate((tables_values[i], row.values))
            # keep only values containing letters; drop unnamed and NaN cells
            tables_values[i] = [v for v in tables_values[i]
                                if re.match(r'[A-Za-z]+', str(v))]
            tables_values[i] = [v for v in tables_values[i]
                                if 'Unnamed:' not in str(v)]
            tables_values[i] = [v for v in tables_values[i] if str(v) != 'nan']
        tables_values = pd.Series(tables_values)
        docids = pd.Series([docid for x in range(len(tables_values))])
        tablenums = pd.Series([x + 1 for x in range(len(tables_values))])
        fulls = pd.Series(full_tables)
        iddf = pd.concat([docids, tablenums, tables_values, fulls], axis=1)
        iddf.columns = cols
        all_columns = all_columns.append(iddf, ignore_index=True)
    if save:
        all_columns.to_csv(dataset, index=False)
        print('Done creating ', dataset)
    else:
        return all_columns
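# Illustration of the header-prepend step above (assumed example data):
#   table = pd.DataFrame({'Depth': [1.5], 'Unit': ['sand']})
#   t = np.insert(table.to_numpy().astype(str), 0, table.columns.values, 0)
#   # t == [['Depth', 'Unit'], ['1.5', 'sand']]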
def train(self, n_queries=10, mode=paths.dataset_version):
    file = paths.get_dataset_path(name, mode)
    df = pd.read_csv(file)
    self.Y = df['position']  # use the y position rather than the y value
    self.max_len = 20  # assume a line holds at most 20 words
    self.classes, y_vectorised = self.position2int()
    self.inv_classes = {v: k for k, v in self.classes.items()}
    # one-hot encode positions into fixed-length mask rows
    y_masked = np.zeros((self.Y.size, self.max_len))
    for i, j in zip(y_masked, y_vectorised):
        p = self.inv_classes[j]
        i[p] = 1
    self.num_classes = len(self.classes.items())
    nn = KerasClassifier(build_fn=self.NN, batch_size=self.batch_size,
                         epochs=self.epochs, validation_split=0.2)
    clf = Pipeline([
        ('transform_text', FunctionTransformer(transform_text_wrapper)),
        ('transform', Text2Seq(classes=self.num_classes, pad_len=self.max_len)),
        ('nn', nn),
    ], verbose=True)
    accuracy, learner = active_learning.train(df, y_column, n_queries, clf,
                                              file, limit_cols=limit_cols)
    self.model = learner
    self.model_loc = paths.get_model_path(name, mode)
    with open(self.model_loc, "wb") as f:
        pickle.dump(self.model, f)
    self.classes_loc = paths.get_model_path(name, mode, classes=True)
    joblib.dump(self.inv_classes, self.classes_loc)
    print("End of training stage. Re-run to train again")
    return accuracy