def get_lr(self):
    lr = LogisticRegression(random_state=66,
                            multi_class="auto",
                            class_weight='balanced',
                            solver="lbfgs",
                            max_iter=340)
    lr.fit(self.X_train, self.Y_train)

    # Validation results
    Y_val_pred = lr.predict(self.X_validate)
    predicted_validation_labels = [LABELS[int(pred)] for pred in Y_val_pred]
    actual_validation_labels = [LABELS[int(actual)] for actual in self.Y_validate]
    validation_score, validation_confusion_matrix = score_submission(
        actual_validation_labels, predicted_validation_labels)
    print_confusion_matrix(validation_confusion_matrix)
    null_score, max_score = score_defaults(actual_validation_labels)
    print("Percentage of validation score for Logistic Regression is:",
          validation_score / float(max_score))

    # Test results
    Y_test_pred = lr.predict(self.X_test)
    predicted_test_labels = [LABELS[int(pred)] for pred in Y_test_pred]
    actual_test_labels = [LABELS[int(actual)] for actual in self.Y_test]

    # Print the per-stance counts in the test set
    count_stances(actual_test_labels)

    # CSV output
    write_to_csv(output + "/" + "lr_actual_labels.csv", actual_test_labels)
    write_to_csv(output + "/" + "lr_predicted_labels.csv", predicted_test_labels)

    test_score, test_confusion_matrix = score_submission(
        actual_test_labels, predicted_test_labels)
    print_confusion_matrix(test_confusion_matrix)
    null_score, max_score = score_defaults(actual_test_labels)
    print("Percentage of test score for Logistic Regression is:",
          test_score / float(max_score))

    # Note: precision/recall/F1 are computed from the validation
    # confusion matrix, not the test one.
    precision, recall, f1 = metrics.performance_metrics(validation_confusion_matrix)
    print("Precision for LR:", precision)
    print("Recall for LR:", recall)
    print("F1 Score for LR:", f1)
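get_lr and the three sibling methods that follow lean on scoring helpers (LABELS, score_submission, score_defaults) that are not shown in this section; print_confusion_matrix simply pretty-prints the matrix. The names and weighting below follow the Fake News Challenge (FNC-1) baseline convention, which these call sites appear to match; treat this as a sketch under that assumption, not the project's actual utils.

# Sketch of the scoring helpers the four classifier methods assume.
# Weighting follows the FNC-1 convention: 0.25 for getting
# related-vs-unrelated right, a further 0.75 for the exact stance.
LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
RELATED = LABELS[:3]

def score_submission(gold_labels, test_labels):
    score = 0.0
    cm = [[0] * len(LABELS) for _ in LABELS]  # cm[gold][pred]
    for gold, test in zip(gold_labels, test_labels):
        if gold == test:
            score += 0.25
            if gold != 'unrelated':
                score += 0.50
        if gold in RELATED and test in RELATED:
            score += 0.25
        cm[LABELS.index(gold)][LABELS.index(test)] += 1
    return score, cm

def score_defaults(gold_labels):
    # null_score: the all-'unrelated' baseline; max_score: a perfect run
    unrelated = sum(1 for g in gold_labels if g == 'unrelated')
    null_score = 0.25 * unrelated
    max_score = null_score + (len(gold_labels) - unrelated)
    return null_score, max_score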
def get_rf(self):
    rf = RandomForestClassifier(n_estimators=50, random_state=66, verbose=True)
    rf.fit(self.X_train, self.Y_train)

    # Validation results
    Y_val_pred = rf.predict(self.X_validate)
    predicted_validation_labels = [LABELS[int(pred)] for pred in Y_val_pred]
    actual_validation_labels = [LABELS[int(actual)] for actual in self.Y_validate]
    validation_score, validation_confusion_matrix = score_submission(
        actual_validation_labels, predicted_validation_labels)
    print_confusion_matrix(validation_confusion_matrix)
    null_score, max_score = score_defaults(actual_validation_labels)
    print("Percentage of validation score for Random Forest Classifier is:",
          validation_score / float(max_score))

    # Test results
    Y_test_pred = rf.predict(self.X_test)
    predicted_test_labels = [LABELS[int(pred)] for pred in Y_test_pred]
    actual_test_labels = [LABELS[int(actual)] for actual in self.Y_test]

    write_to_csv(output + "/" + "rf_actual_labels.csv", actual_test_labels)
    write_to_csv(output + "/" + "rf_predicted_labels.csv", predicted_test_labels)

    test_score, test_confusion_matrix = score_submission(
        actual_test_labels, predicted_test_labels)
    print_confusion_matrix(test_confusion_matrix)
    null_score, max_score = score_defaults(actual_test_labels)
    print("Percentage of test score for Random Forest Classifier is:",
          test_score / float(max_score))

    precision, recall, f1 = metrics.performance_metrics(validation_confusion_matrix)
    print("Precision for RF:", precision)
    print("Recall for RF:", recall)
    print("F1 Score for RF:", f1)
def get_dt(self):
    dt = DecisionTreeClassifier(random_state=66, max_depth=10)
    dt.fit(self.X_train, self.Y_train)

    # Validation results
    Y_val_pred = dt.predict(self.X_validate)
    predicted_validation_labels = [LABELS[int(pred)] for pred in Y_val_pred]
    actual_validation_labels = [LABELS[int(actual)] for actual in self.Y_validate]
    validation_score, validation_confusion_matrix = score_submission(
        actual_validation_labels, predicted_validation_labels)
    print_confusion_matrix(validation_confusion_matrix)
    null_score, max_score = score_defaults(actual_validation_labels)
    print("Percentage of validation score for Decision Tree is:",
          validation_score / float(max_score))

    # Test results
    Y_test_pred = dt.predict(self.X_test)
    predicted_test_labels = [LABELS[int(pred)] for pred in Y_test_pred]
    actual_test_labels = [LABELS[int(actual)] for actual in self.Y_test]

    write_to_csv(output + "/" + "dt_actual_labels.csv", actual_test_labels)
    write_to_csv(output + "/" + "dt_predicted_labels.csv", predicted_test_labels)

    test_score, test_confusion_matrix = score_submission(
        actual_test_labels, predicted_test_labels)
    print_confusion_matrix(test_confusion_matrix)
    null_score, max_score = score_defaults(actual_test_labels)
    print("Percentage of test score for Decision Tree is:",
          test_score / float(max_score))

    precision, recall, f1 = metrics.performance_metrics(validation_confusion_matrix)
    print("Precision for DT:", precision)
    print("Recall for DT:", recall)
    print("F1 Score for DT:", f1)
def get_nb(self):
    nb = MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)
    nb.fit(self.X_train, self.Y_train)

    # Validation results
    Y_val_pred = nb.predict(self.X_validate)
    predicted_validation_labels = [LABELS[int(pred)] for pred in Y_val_pred]
    actual_validation_labels = [LABELS[int(actual)] for actual in self.Y_validate]
    validation_score, validation_confusion_matrix = score_submission(
        actual_validation_labels, predicted_validation_labels)
    print_confusion_matrix(validation_confusion_matrix)
    null_score, max_score = score_defaults(actual_validation_labels)
    print("Percentage of validation score for Naive Bayes is:",
          validation_score / float(max_score))

    # Test results
    Y_test_pred = nb.predict(self.X_test)
    predicted_test_labels = [LABELS[int(pred)] for pred in Y_test_pred]
    actual_test_labels = [LABELS[int(actual)] for actual in self.Y_test]

    write_to_csv(output + "/" + "nb_actual_labels.csv", actual_test_labels)
    write_to_csv(output + "/" + "nb_predicted_labels.csv", predicted_test_labels)

    test_score, test_confusion_matrix = score_submission(
        actual_test_labels, predicted_test_labels)
    print_confusion_matrix(test_confusion_matrix)
    null_score, max_score = score_defaults(actual_test_labels)
    print("Percentage of test score for Naive Bayes is:",
          test_score / float(max_score))

    precision, recall, f1 = metrics.performance_metrics(validation_confusion_matrix)
    print("Precision for NB:", precision)
    print("Recall for NB:", recall)
    print("F1 Score for NB:", f1)
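All four classifier methods end by calling metrics.performance_metrics on the validation confusion matrix, a module not shown here. A plausible macro-averaged sketch, assuming the cm[gold][pred] layout used by the scorer above; the project's actual metrics code may differ:

def performance_metrics(cm):
    """Macro-averaged precision, recall and F1 from a square confusion
    matrix where cm[i][j] counts gold class i predicted as class j.
    A sketch under that layout assumption, not the project's code."""
    n = len(cm)
    precisions, recalls, f1s = [], [], []
    for i in range(n):
        tp = cm[i][i]
        predicted = sum(cm[j][i] for j in range(n))  # column sum: predicted as i
        actual = sum(cm[i])                          # row sum: truly i
        p = tp / predicted if predicted else 0.0
        r = tp / actual if actual else 0.0
        f = 2 * p * r / (p + r) if (p + r) else 0.0
        precisions.append(p)
        recalls.append(r)
        f1s.append(f)
    return sum(precisions) / n, sum(recalls) / n, sum(f1s) / n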
def dump_output(self, datumid2pred: dict, path):
    fieldnames = [
        'raw_image_id', 'image_id', 'utterance', 'response', 'label', 'pred'
    ]
    rows = []
    for datumid, pred in datumid2pred.items():
        datum = self.dataset.id2datum[datumid]
        rows.append({
            'raw_image_id': datum['raw_image_id'],
            'image_id': datum['image_id'],
            'utterance': datum['utterance'],
            'response': datum['response'],
            'label': int(datum['label']),
            'pred': int(pred),
        })
    write_to_csv(path, fieldnames, rows, delimiter='\t')
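dump_output (and dump_results further down) call write_to_csv(path, fieldnames, rows, delimiter='\t') with dict-shaped rows, a different signature from the write_to_csv used by the other snippets in this section. A minimal csv.DictWriter sketch matching that call site; the actual helper is not shown here:

import csv

def write_to_csv(path, fieldnames, rows, delimiter=','):
    # Sketch of the dict-row writer dump_output/dump_results assume;
    # signature inferred from the call sites.
    with open(path, 'w', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=fieldnames, delimiter=delimiter)
        writer.writeheader()
        writer.writerows(rows)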
def extract_portfolio_to_csv(self, data_dir):
    portfolio_data_dir = self._get_data_dir(data_dir, 'portfolio')
    data_header = "table[class='table portfolio'] thead tr td"
    data_row = "table[class='table portfolio'] tbody tr"

    headers = []
    row_data = []
    for f in list_files(portfolio_data_dir, '*.html'):
        with open(f, 'r') as fp:
            html = fp.read()
        soup = BeautifulSoup(html, "html.parser")
        if soup:
            headers = [item.text.strip() for item in soup.select(data_header)]
            for row in soup.select(data_row):
                data = dict()
                for index, cell in enumerate(row.select('td')):
                    data[headers[index]] = cell.text.strip()
                row_data.append(data)
    print('Num rows:', len(row_data))

    # Normalize and export. (The original also normalized a 'balance'
    # key, but portfolio rows have no such column, so that dead line
    # has been dropped.)
    records = []
    for row in row_data:
        row['ID'] = int(row['ID'])
        row['Prestado'] = self._norm_money(row['Prestado'])
        row['Pagado'] = self._norm_money(row['Pagado'])
        row['Te debe'] = self._norm_money(row['Te debe'])
        records.append(tuple(row[header] for header in headers))

    headers = tuple(map(str.upper, headers))
    records.sort(key=lambda r: r[0])
    write_to_csv(records, './data/portfolio.csv', headers=headers)
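Both extractors normalize formatted money strings through self._norm_money, which is not shown in this section. A plausible sketch, assuming inputs like '$1,234.56'; it would live on the same class, and a real normalizer may need locale-specific separator handling:

import re

def _norm_money(self, value):
    # Sketch: strip currency symbols and thousands separators, keeping
    # digits, sign and decimal point. Assumes US-style formatting.
    cleaned = re.sub(r'[^0-9.\-]', '', value or '0')
    return float(cleaned) if cleaned else 0.0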
def dump_results(self, datumid2pred: dict, path):
    fieldnames = [
        'image_id', 'utterance', 'response', 'raw_image_id', 'pred_raw_id'
    ]
    rows = []
    for datumid, pred in datumid2pred.items():
        datum = self.dataset.id2datum[datumid]
        pred_datum = self.dataset.id2datum[pred]
        rows.append({
            'image_id': datum['image_id'],
            'utterance': datum['utterance'],
            'response': datum['response'],
            'raw_image_id': datum['raw_image_id'],
            'pred_raw_id': pred_datum['raw_image_id'],
        })
    write_to_csv(path, fieldnames, rows, delimiter='\t')
import tensorflow as tf

from src import utils
from model import resnet

# Load the model architecture and trained weights
model = resnet.small_resnet()
model.load_weights("./weights/small_resnet")

# Load and preprocess the test images
images_list = utils.get_images_from_csv("./datasets/test.csv", train=False)
images_list = utils.preprocess(images=images_list, train=False)

# Predict class indices and pair each with a 1-based sample id
out = model.predict(images_list)
out = tf.argmax(out, axis=1).numpy().tolist()
out = [[i + 1, data] for i, data in enumerate(out)]

# Write the results to CSV
utils.write_to_csv("./datasets/result.csv", out)
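This script's utils.write_to_csv takes the output path followed by the [id, prediction] rows, yet another signature. A minimal sketch matching that call; the 'id'/'label' header names are an assumption, not from the source:

import csv

def write_to_csv(path, rows, header=("id", "label")):
    # Sketch of the writer this script assumes: one [id, prediction]
    # row per test image. Header names are hypothetical.
    with open(path, "w", newline="") as fp:
        writer = csv.writer(fp)
        writer.writerow(header)
        writer.writerows(rows)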
def extract_transactions_to_csv(self, data_dir):
    transactions_data_dir = self._get_data_dir(data_dir, 'transactions')
    data_row = "tr[class^='account_statements']"
    # Maps each output field to (td CSS class, optional sub-element)
    fields = {
        'record_id': ('aut', 'span'),
        'date': ('date', ''),
        'time': ('date', 'span'),
        'reference': ('reference', 'span'),
        'type': ('type', ''),
        'amount': ('amount', ''),
        'balance': ('balance', ''),
    }

    records = []
    for f in list_files(transactions_data_dir, '*.html'):
        with open(f, 'r') as fp:
            html = fp.read()
        soup = BeautifulSoup(html, "html.parser")
        if not soup:
            continue

        row_data = list()
        for item in soup.select(data_row):
            data = dict()
            for data_key, (css_class, sub_elem) in fields.items():
                selector = f"td[class='{css_class}']"
                if sub_elem:
                    selector += f' {sub_elem}'
                elems = item.select(selector)
                # select() returns a list; guard against no match
                elem = elems[0] if elems else None
                if elem and elem.text:
                    text = ' '.join(elem.text.strip().split())
                else:
                    text = ''
                data[data_key] = text
            row_data.append(data)

        for row in row_data:
            record_id = row.get('record_id', '')
            assert record_id

            date = row.get('date', '').split()[0]
            date = datetime.datetime.strptime(date, "%d/%m/%Y").strftime('%Y-%m-%d')

            # Convert e.g. '9:30pm' or '09:30pm' to 24-hour time.
            # strptime handles two-digit hours and the 12am/12pm edge
            # cases that the original manual slice-and-add-12 missed.
            time = row.get('time', '').lower()
            time = datetime.datetime.strptime(time, '%I:%M%p').strftime('%H:%M')

            reference = row.get('reference', '')
            type_ = row.get('type', '').lower()
            amount = self._norm_money(row.get('amount', '0'))
            balance = self._norm_money(row.get('balance', '0'))

            records.append(
                (record_id, date, time, reference, type_, amount, balance))

    headers = ('ID', 'DATE', 'TIME', 'REFERENCE', 'TYPE', 'AMOUNT', 'BALANCE')
    records.sort(key=lambda r: (r[1], r[2]))
    write_to_csv(records, './data/transactions.csv', headers=headers)
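The two extractors call a tuple-record write_to_csv: positional records first, then the path, plus a headers keyword. A sketch inferred from those call sites, not the project's actual implementation:

import csv

def write_to_csv(records, path, headers=None):
    # Sketch of the tuple-record writer both extractors assume.
    with open(path, 'w', newline='') as fp:
        writer = csv.writer(fp)
        if headers:
            writer.writerow(headers)
        writer.writerows(records)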