Ejemplo n.º 1
0
    def get_lr(self):
        """Train a logistic-regression classifier and report its results.

        The model is fitted on ``self.X_train`` / ``self.Y_train``.  Its
        predictions on the validation and test splits are mapped to label
        names through ``LABELS``, scored with ``score_submission`` and the
        confusion matrices and score ratios are printed.  The actual and
        predicted test labels are written to two CSV files under ``output``.

        Note that the final precision/recall/F1 figures are derived from the
        *validation* confusion matrix, not the test one.

        Nothing is returned; all results go to stdout and to the CSV files.
        """
        model = LogisticRegression(solver="lbfgs",
                                   multi_class="auto",
                                   class_weight='balanced',
                                   max_iter=340,
                                   random_state=66)
        model.fit(self.X_train, self.Y_train)

        # ---- validation split ----
        val_predicted = [
            LABELS[int(y)] for y in model.predict(self.X_validate)
        ]
        val_actual = [LABELS[int(y)] for y in self.Y_validate]

        val_score, val_matrix = score_submission(val_actual, val_predicted)
        print_confusion_matrix(val_matrix)

        # Only the best achievable score is needed for the ratio.
        _, val_best = score_defaults(val_actual)
        print("Percentage of validation score for Logistic Regression is:",
              val_score / float(val_best))

        # ---- test split ----
        test_predicted = [LABELS[int(y)] for y in model.predict(self.X_test)]
        test_actual = [LABELS[int(y)] for y in self.Y_test]

        # Report how many examples of each stance the test set contains.
        count_stances(test_actual)

        # Persist the actual labels first, then the predictions.
        for kind, labels in (("actual", test_actual),
                             ("predicted", test_predicted)):
            write_to_csv(output + "/" + "lr_" + kind + "_labels.csv", labels)

        test_score, test_matrix = score_submission(test_actual,
                                                   test_predicted)
        print_confusion_matrix(test_matrix)
        _, test_best = score_defaults(test_actual)
        print("Percentage of test score for Logistic Regression is:",
              test_score / float(test_best))

        # Summary metrics (computed on the validation confusion matrix).
        precision, recall, f1 = metrics.performance_metrics(val_matrix)
        for caption, value in (("Precision for LR: ", precision),
                               ("Recall for LR:", recall),
                               ("F1 Score for LR:", f1)):
            print(caption, value)
Ejemplo n.º 2
0
    def get_rf(self):
        """Train a random-forest classifier and report its results.

        The forest (50 estimators, verbose fitting) is fitted on
        ``self.X_train`` / ``self.Y_train``.  Predictions on the validation
        and test splits are mapped to label names through ``LABELS``, scored
        with ``score_submission`` and the confusion matrices and score ratios
        are printed.  The actual and predicted test labels are written to two
        CSV files under ``output``.

        Note that the final precision/recall/F1 figures are derived from the
        *validation* confusion matrix, not the test one.

        Nothing is returned; all results go to stdout and to the CSV files.
        """
        model = RandomForestClassifier(verbose=True,
                                       random_state=66,
                                       n_estimators=50)
        model.fit(self.X_train, self.Y_train)

        # ---- validation split ----
        val_predicted = [
            LABELS[int(y)] for y in model.predict(self.X_validate)
        ]
        val_actual = [LABELS[int(y)] for y in self.Y_validate]

        val_score, val_matrix = score_submission(val_actual, val_predicted)
        print_confusion_matrix(val_matrix)

        # Only the best achievable score is needed for the ratio.
        _, val_best = score_defaults(val_actual)
        print(
            "Percentage of validation score for Random Forest Classifier is:",
            val_score / float(val_best))

        # ---- test split ----
        test_predicted = [LABELS[int(y)] for y in model.predict(self.X_test)]
        test_actual = [LABELS[int(y)] for y in self.Y_test]

        # Persist the actual labels first, then the predictions.
        for kind, labels in (("actual", test_actual),
                             ("predicted", test_predicted)):
            write_to_csv(output + "/" + "rf_" + kind + "_labels.csv", labels)

        test_score, test_matrix = score_submission(test_actual,
                                                   test_predicted)
        print_confusion_matrix(test_matrix)
        _, test_best = score_defaults(test_actual)
        print("Percentage of test score for Random Forest Classifier is:",
              test_score / float(test_best))

        # Summary metrics (computed on the validation confusion matrix).
        precision, recall, f1 = metrics.performance_metrics(val_matrix)
        for caption, value in (("Precision for RF: ", precision),
                               ("Recall for RF:", recall),
                               ("F1 Score for RF:", f1)):
            print(caption, value)
Ejemplo n.º 3
0
    def get_dt(self):
        """Train a decision-tree classifier and report its results.

        The tree (maximum depth 10) is fitted on ``self.X_train`` /
        ``self.Y_train``.  Predictions on the validation and test splits are
        mapped to label names through ``LABELS``, scored with
        ``score_submission`` and the confusion matrices and score ratios are
        printed.  The actual and predicted test labels are written to two CSV
        files under ``output``.

        Note that the final precision/recall/F1 figures are derived from the
        *validation* confusion matrix, not the test one.

        Nothing is returned; all results go to stdout and to the CSV files.
        """
        model = DecisionTreeClassifier(max_depth=10, random_state=66)
        model.fit(self.X_train, self.Y_train)

        # ---- validation split ----
        val_predicted = [
            LABELS[int(y)] for y in model.predict(self.X_validate)
        ]
        val_actual = [LABELS[int(y)] for y in self.Y_validate]

        val_score, val_matrix = score_submission(val_actual, val_predicted)
        print_confusion_matrix(val_matrix)

        # Only the best achievable score is needed for the ratio.
        _, val_best = score_defaults(val_actual)
        print("Percentage of validation score for Decision Tree is:",
              val_score / float(val_best))

        # ---- test split ----
        test_predicted = [LABELS[int(y)] for y in model.predict(self.X_test)]
        test_actual = [LABELS[int(y)] for y in self.Y_test]

        # Persist the actual labels first, then the predictions.
        for kind, labels in (("actual", test_actual),
                             ("predicted", test_predicted)):
            write_to_csv(output + "/" + "dt_" + kind + "_labels.csv", labels)

        test_score, test_matrix = score_submission(test_actual,
                                                   test_predicted)
        print_confusion_matrix(test_matrix)
        _, test_best = score_defaults(test_actual)
        print("Percentage of test score for Decision Tree is:",
              test_score / float(test_best))

        # Summary metrics (computed on the validation confusion matrix).
        precision, recall, f1 = metrics.performance_metrics(val_matrix)
        for caption, value in (("Precision for DT: ", precision),
                               ("Recall for DT:", recall),
                               ("F1 Score for DT:", f1)):
            print(caption, value)
Ejemplo n.º 4
0
    def get_nb(self):
        """Train a multinomial naive-Bayes classifier and report its results.

        The model (``alpha=0.01``, priors fitted from the data) is fitted on
        ``self.X_train`` / ``self.Y_train``.  Predictions on the validation
        and test splits are mapped to label names through ``LABELS``, scored
        with ``score_submission`` and the confusion matrices and score ratios
        are printed.  The actual and predicted test labels are written to two
        CSV files under ``output``.

        Note that the final precision/recall/F1 figures are derived from the
        *validation* confusion matrix, not the test one.

        Nothing is returned; all results go to stdout and to the CSV files.
        """
        model = MultinomialNB(fit_prior=True, class_prior=None, alpha=0.01)
        model.fit(self.X_train, self.Y_train)

        # ---- validation split ----
        val_predicted = [
            LABELS[int(y)] for y in model.predict(self.X_validate)
        ]
        val_actual = [LABELS[int(y)] for y in self.Y_validate]

        val_score, val_matrix = score_submission(val_actual, val_predicted)
        print_confusion_matrix(val_matrix)

        # Only the best achievable score is needed for the ratio.
        _, val_best = score_defaults(val_actual)
        print("Percentage of validation score for Naive Bayes is:",
              val_score / float(val_best))

        # ---- test split ----
        test_predicted = [LABELS[int(y)] for y in model.predict(self.X_test)]
        test_actual = [LABELS[int(y)] for y in self.Y_test]

        # Persist the actual labels first, then the predictions.
        for kind, labels in (("actual", test_actual),
                             ("predicted", test_predicted)):
            write_to_csv(output + "/" + "nb_" + kind + "_labels.csv", labels)

        test_score, test_matrix = score_submission(test_actual,
                                                   test_predicted)
        print_confusion_matrix(test_matrix)
        _, test_best = score_defaults(test_actual)
        print("Percentage of test score for Naive Bayes is:",
              test_score / float(test_best))

        # Summary metrics (computed on the validation confusion matrix).
        precision, recall, f1 = metrics.performance_metrics(val_matrix)
        for caption, value in (("Precision for NB: ", precision),
                               ("Recall for NB:", recall),
                               ("F1 Score for NB:", f1)):
            print(caption, value)
Ejemplo n.º 5
0
    def dump_output(self, datumid2pred: dict, path):
        """Write each prediction next to its datum's label as a TSV file.

        Args:
            datumid2pred: Mapping from datum id to the predicted value; every
                id is looked up in ``self.dataset.id2datum``.
            path: Destination handed to ``write_to_csv``.

        Both the prediction and the stored label are converted with ``int``
        before being written.
        """
        columns = [
            'raw_image_id', 'image_id', 'utterance', 'response', 'label',
            'pred'
        ]

        def as_row(datum_id, raw_pred):
            # Look the datum up first, then coerce prediction and label.
            datum = self.dataset.id2datum[datum_id]
            gold = datum['label']
            predicted = int(raw_pred)
            gold = int(gold)
            return {
                'raw_image_id': datum['raw_image_id'],
                'image_id': datum['image_id'],
                'utterance': datum['utterance'],
                'response': datum['response'],
                'label': gold,
                'pred': predicted
            }

        records = [
            as_row(datum_id, raw_pred)
            for datum_id, raw_pred in datumid2pred.items()
        ]
        write_to_csv(path, columns, records, delimiter='\t')
Ejemplo n.º 6
0
    def extract_portfolio_to_csv(self, data_dir):
        """Scrape the saved portfolio HTML pages and export them as one CSV.

        Every ``*.html`` file in the portfolio data directory (resolved with
        ``self._get_data_dir(data_dir, 'portfolio')``) is parsed.  The header
        cells and body rows of the ``table portfolio`` table are collected,
        the ``ID`` column is converted to ``int`` and the ``Prestado``,
        ``Pagado`` and ``Te debe`` columns are passed through
        ``self._norm_money``.  The records are sorted by their first column
        and written to ``./data/portfolio.csv`` with upper-cased headers.

        Args:
            data_dir: Base directory passed to ``self._get_data_dir``.

        Raises:
            KeyError: If a row lacks one of the normalized columns or one of
                the export headers.
            IndexError: If a row has more cells than its file has headers.
        """
        portfolio_data_dir = self._get_data_dir(data_dir, 'portfolio')
        data_header = "table[class='table portfolio'] thead tr td"
        data_row = "table[class='table portfolio'] tbody tr"

        # Bound before the loop so the export step still works when the
        # directory contains no HTML file at all.
        headers = []
        row_data = []
        for f in list_files(portfolio_data_dir, '*.html'):
            with open(f, 'r') as fp:
                soup = BeautifulSoup(fp.read(), "html.parser")

            if not soup:
                continue

            file_headers = [
                item.text.strip() for item in soup.select(data_header)
            ]
            # Keep the most recent non-empty header row: a file without a
            # table must not wipe the headers found in earlier files.
            if file_headers:
                headers = file_headers

            for row in soup.select(data_row):
                row_data.append({
                    file_headers[index]: cell.text.strip()
                    for index, cell in enumerate(row.select('td'))
                })

        print('Num rows:', len(row_data))

        # Normalize and export
        records = []
        for row in row_data:
            row['ID'] = int(row['ID'])
            for money_column in ('Prestado', 'Pagado', 'Te debe'):
                row[money_column] = self._norm_money(row[money_column])
            records.append(tuple(row[header] for header in headers))

        headers = tuple(map(str.upper, headers))
        records.sort(key=lambda r: r[0])
        write_to_csv(records, './data/portfolio.csv', headers=headers)
Ejemplo n.º 7
0
    def dump_results(self, datumid2pred: dict, path):
        """Write each datum together with its predicted datum's raw image id.

        Args:
            datumid2pred: Mapping from a datum id to the id of the predicted
                datum; both ids are looked up in ``self.dataset.id2datum``.
            path: Destination handed to ``write_to_csv``.

        The output is tab-separated.
        """
        columns = [
            'image_id', 'utterance', 'response', 'raw_image_id', 'pred_raw_id'
        ]

        def as_row(datum_id, predicted_id):
            # Resolve the query datum first, then the predicted one.
            source = self.dataset.id2datum[datum_id]
            target = self.dataset.id2datum[predicted_id]

            text = source['utterance']
            reply = source['response']
            raw_image = source['raw_image_id']
            image = source['image_id']
            predicted_raw_image = target['raw_image_id']
            return {
                'image_id': image,
                'utterance': text,
                'response': reply,
                'raw_image_id': raw_image,
                'pred_raw_id': predicted_raw_image
            }

        records = [
            as_row(datum_id, predicted_id)
            for datum_id, predicted_id in datumid2pred.items()
        ]
        write_to_csv(path, columns, records, delimiter='\t')
Ejemplo n.º 8
0
import tensorflow as tf
from src import utils
from model import resnet

# Build the network and restore its trained weights.
model = resnet.small_resnet()
model.load_weights("./weights/small_resnet")

# Read the test images listed in the CSV and preprocess them for inference.
images_list = utils.preprocess(
    images=utils.get_images_from_csv("./datasets/test.csv", train=False),
    train=False,
)

# Predict, keep the index of the highest score per image, and pair each
# result with a 1-based row id.
out = tf.argmax(model.predict(images_list), axis=1).numpy().tolist()
out = [[row_id, label] for row_id, label in enumerate(out, start=1)]

# Save the (id, label) pairs.
utils.write_to_csv("./datasets/result.csv", out)

Ejemplo n.º 9
0
    def extract_transactions_to_csv(self, data_dir):
        """Scrape the saved account-statement pages and export them as a CSV.

        Every ``*.html`` file in the transactions data directory (resolved
        with ``self._get_data_dir(data_dir, 'transactions')``) is parsed.  For
        each ``account_statements`` table row the id, date, time, reference,
        type, amount and balance cells are read; a cell that is absent yields
        an empty string.  Dates are rewritten from ``dd/mm/YYYY`` to
        ``YYYY-mm-dd``, times to zero-padded 24-hour ``HH:MM``, and amount and
        balance are passed through ``self._norm_money``.  Records are sorted
        by date and time and written to ``./data/transactions.csv``.

        Args:
            data_dir: Base directory passed to ``self._get_data_dir``.

        Raises:
            AssertionError: If a row has no record id.
            ValueError: If a date or time value cannot be parsed.
            IndexError: If a row has an empty date cell.
        """
        transactions_data_dir = self._get_data_dir(data_dir, 'transactions')
        data_row = "tr[class^='account_statements']"
        # output field -> (css class of the <td>, optional child element)
        fields = {
            'record_id': ('aut', 'span'),
            'date': ('date', ''),
            'time': ('date', 'span'),
            'reference': ('reference', 'span'),
            'type': ('type', ''),
            'amount': ('amount', ''),
            'balance': ('balance', ''),
        }

        records = []
        for f in list_files(transactions_data_dir, '*.html'):
            with open(f, 'r') as fp:
                soup = BeautifulSoup(fp.read(), "html.parser")

            if not soup:
                continue

            # Collect the raw cell texts of every row in this file first.
            row_data = []
            for item in soup.select(data_row):
                data = {}
                for data_key, (css_class, sub_elem) in fields.items():
                    selector = f"td[class='{css_class}']"
                    if sub_elem:
                        selector += f' {sub_elem}'

                    # An empty match list means the cell is absent; fall
                    # back to '' instead of indexing into the empty list.
                    matches = item.select(selector)
                    elem = matches[0] if matches else None

                    if elem is not None and elem.text:
                        # Collapse internal whitespace runs to one space.
                        text = ' '.join(elem.text.strip().split())
                    else:
                        text = ''

                    data[data_key] = text
                row_data.append(data)

            for row in row_data:
                record_id = row.get('record_id', '')
                assert record_id, 'transaction row without a record id'

                # The date cell text starts with the date itself.
                date = row.get('date', '').split()[0]
                date = datetime.datetime.strptime(
                    date, "%d/%m/%Y").strftime('%Y-%m-%d')

                time_text = self._to_24h_time(row.get('time', ''))

                reference = row.get('reference', '')
                type_ = row.get('type', '').lower()
                amount = self._norm_money(row.get('amount', '0'))
                balance = self._norm_money(row.get('balance', '0'))
                records.append((
                    record_id,
                    date,
                    time_text,
                    reference,
                    type_,
                    amount,
                    balance,
                ))

        headers = ('ID', 'DATE', 'TIME', 'REFERENCE', 'TYPE', 'AMOUNT',
                   'BALANCE')
        records.sort(key=lambda r: (r[1], r[2]))
        write_to_csv(records, './data/transactions.csv', headers=headers)

    @staticmethod
    def _to_24h_time(text):
        """Convert a clock string such as ``'9:30pm'`` to ``'HH:MM'``.

        Accepts one- or two-digit hours and an optional am/pm marker.  With
        a marker, 12 am becomes ``00`` and 12 pm stays ``12``; without one
        the hour is taken as already being on the 24-hour clock.

        Raises:
            ValueError: If ``text`` does not start with ``H:MM`` or ``HH:MM``.
        """
        import re

        match = re.match(r'\s*(\d{1,2}):(\d{2})\s*([ap])?', text.lower())
        if match is None:
            raise ValueError(f'Unrecognized time value: {text!r}')

        hour = int(match.group(1))
        minute = int(match.group(2))
        meridiem = match.group(3)
        if meridiem == 'p' and hour < 12:
            hour += 12
        elif meridiem == 'a' and hour == 12:
            hour = 0

        return f'{hour:02d}:{minute:02d}'