def create_template_csv_from_serie(serie, list_name):
    """Build a CrowdTangle list-template table from the series' index and export it.

    The template has two columns: the URLs (taken from ``serie.index``) and
    the list name repeated on every row. It is written out via
    ``export_data`` under ``crowdtangle_list/<list_name>.csv`` and returned.
    """
    columns = ["Page or Account URL", "List"]
    template = pd.DataFrame(columns=columns)
    template[columns[0]] = serie.index
    template[columns[1]] = list_name
    export_data(template, 'crowdtangle_list', list_name + '.csv')
    return template
def _exec_crf_test(input_text):
    """Feed *input_text* (via utils.export_data) to crf_test and return its output.

    The exported text is staged in a temporary file that is cleaned up
    automatically when the ``with`` block exits.
    """
    with tempfile.NamedTemporaryFile(mode='w') as crf_input:
        crf_input.write(utils.export_data(input_text))
        crf_input.flush()  # make the buffered data visible to the child process
        command = [
            'crf_test',
            '--verbose=1',
            '--model', 'ingredient-parser.crfmodel',
            crf_input.name,
        ]
        return check_output(command)
def last_export():
    """Return the date token embedded in the newest export's filename.

    If no export exists yet, one is generated from the database first.
    Filenames are assumed to look like ``<prefix>_<date>.<ext>``.
    """
    filename = utils.get_latest_export()
    if filename is None:
        filename = utils.export_data(db)
    stem = filename.split(".")[0]
    return stem.split("_")[1]
def admin_index():
    """Render the admin page; with ``?export=1`` first re-export the DB to CSV.

    When the export is requested, an absolute URL to the generated file is
    passed to the template as ``exported_data``.
    """
    if not (len(request.args) > 0 and int(request.args.get("export")) == 1):
        return render_template("admin/index.html")
    # Re-export data from the database to a CSV file.
    exported_name = utils.export_data(db)
    relative_path = url_for("static", filename="data/" + exported_name)
    # url_root already ends with "/", so drop a single leading slash.
    if relative_path.startswith("/"):
        relative_path = relative_path[1:]
    absolute_path = request.url_root + relative_path
    return render_template("admin/index.html", exported_data=absolute_path)
def parse_ingredientForCRF(ingredients):
    """Run the CRF model over *ingredients* and group its labelled tokens.

    The exported ingredients are written to a temp file, ``crf_test`` is run
    on it with output redirected to a second temp file, and the tab-separated
    (token, label) output is parsed. Tokens are stripped of punctuation and
    lowercased; the label kept is the part after the last dash (e.g. the
    "NAME" of "B-NAME"). Blank lines close the current ingredient group.

    Returns a list of groups, each a list of (name, label) tuples.

    NOTE(review): this is Python 2 code (``xrange``, ``str.translate(None, …)``).
    """
    returnArr = []
    eachIngre = []
    _, tmpFile = tempfile.mkstemp()
    _, tmpFile2 = tempfile.mkstemp()
    with open(tmpFile, 'w') as outfile:
        # ``with`` closes the file; the old explicit close() was redundant.
        outfile.write(utils.export_data(ingredients))
    tmpFilePath = "./tmp/model_file"
    modelFilename = os.path.join(os.path.dirname(__file__), tmpFilePath)
    # mkstemp paths contain no shell metacharacters; the model path comes
    # from __file__, so building a shell command string is acceptable here.
    fullCommand = "crf_test -m %s %s  > %s  || exit 1 " % (modelFilename,
                                                           tmpFile, tmpFile2)
    os.system(fullCommand)
    # Remove the input file exactly once (it used to be `rm`-ed twice, the
    # second attempt printing an error), and without spawning a shell.
    os.remove(tmpFile)
    with open(tmpFile2) as f:
        parsedData = f.read()
    os.remove(tmpFile2)
    data = parsedData.split('\n')
    for sen in xrange(len(data)):
        words = data[sen].split('\t')
        name = words[0]
        name = name.translate(None, string.punctuation).lower()
        val = words[len(words) - 1]
        val2 = val.split('-')
        if val2[len(val2) - 1] != '' and name != '':
            obj = (name, val2[len(val2) - 1])
            eachIngre.append(obj)
        else:
            # Sentence boundary: flush the accumulated group, if any.
            if len(eachIngre) > 0:
                returnArr.append(eachIngre)
                eachIngre = []
    return returnArr
    # (Tail of a filtering helper whose `def` lies above this chunk.)
    url_df = url_df.dropna(subset=['scientific_topic'])
    return url_df


def keep_only_topic_data(url_df, TOPIC):
    """Restrict the table to one scientific topic; any other value keeps all rows."""
    if TOPIC in ["climate", "health", "covid"]:
        return url_df[url_df["scientific_topic"] == TOPIC]
    else:
        return url_df


if __name__ == "__main__":
    # Usage: <script> DATE [TOPIC] — builds the fake-news URL table for a
    # Science Feedback appearances export and writes it back as CSV.
    DATE = sys.argv[1]
    TOPIC = sys.argv[2] if len(sys.argv) >= 3 else ""
    url_df = import_data(folder="sciencefeedback",
                         file_name="Appearances-Grid view " + DATE + ".csv")
    url_df = keep_only_the_urls_considered_fake_by_facebook(url_df)
    url_df = clean_url_format(url_df)
    url_df = add_info_from_fact_check_table(url_df)
    url_df = keep_only_topic_data(url_df, TOPIC)
    # Keep only the columns used downstream.
    url_df = url_df[[
        'url', 'url_cleaned', 'domain_name', 'Item reviewed',
        'Date of publication', 'scientific_topic'
    ]]
    print("There are {} fake news urls.".format(len(url_df)))
    export_data(url_df, 'sciencefeedback',
                "appearances_" + DATE + "_" + TOPIC + ".csv")
#!/usr/bin/env python
"""Export the ingredients in FILENAME through the crf_test model (CLI script)."""
from __future__ import print_function
import os
import sys
import tempfile
import utils

if len(sys.argv) < 2:
    sys.stderr.write('Usage: parse-ingredients.py FILENAME')
    sys.exit(1)

FILENAME = str(sys.argv[1])
_, crf_input_path = tempfile.mkstemp()
with open(FILENAME) as source, open(crf_input_path, 'w') as staged:
    staged.write(utils.export_data(source.readlines()))

# NOTE(review): joining with an absolute second path makes os.path.join
# discard the script directory, so the model is always read from
# /tmp/model_file — confirm this is intended.
model_path = os.path.join(os.path.dirname(__file__), "/tmp/model_file")
os.system("crf_test -v 1 -m %s %s" % (model_path, crf_input_path))
os.system("rm %s" % crf_input_path)
    # (Tail of `clean_columns(df)` — its `def` lies above this chunk.)
    clean_df = pd.DataFrame(columns=[
        "account_name", "account_id", "date", "share", "comment", "reaction"
    ])
    clean_df['account_name'] = df['account_name'].astype(str)
    clean_df['account_id'] = df['account_id'].astype(int)
    clean_df['date'] = pd.to_datetime(df['date'])
    clean_df["share"] = df[["actual_share_count"]].astype(int)
    clean_df["comment"] = df[["actual_comment_count"]].astype(int)
    # Collapse every reaction type into a single per-row "reaction" count.
    clean_df["reaction"] = df[[
        "actual_like_count", "actual_favorite_count", "actual_love_count",
        "actual_wow_count", "actual_haha_count", "actual_sad_count",
        "actual_angry_count", "actual_thankful_count"
    ]].sum(axis=1).astype(int)
    return clean_df


if __name__ == "__main__":
    # Usage: <script> DATE SUFFIX — cleans the CrowdTangle group-posts dump
    # for DATE and re-exports it under the given SUFFIX.
    DATE = sys.argv[1]
    SUFFIX = sys.argv[2]
    df = import_data(folder="crowdtangle_group",
                     file_name='posts_group_' + DATE + '.csv')
    clean_df = clean_columns(df)
    export_data(clean_df, 'crowdtangle_group', 'posts_' + SUFFIX + '.csv')
def index():
    """Render the landing page with a link to the most recent data export."""
    export_name = utils.get_latest_export()
    if export_name is None:
        # Nothing exported yet: generate a fresh export from the database.
        export_name = utils.export_data(db)
    return render_template("index.html", raw_data="data/" + export_name)
def main():
    """Score every client profile per feature (Task 3) and persist the scores."""

    def age_category(age):
        """Bucket an age into a half-open interval label; NaN maps to 'None'."""
        if 18 <= age < 25:
            return '[18, 25)'
        elif 25 <= age < 35:
            return '[25, 35)'
        elif 35 <= age < 55:
            return '[35, 55)'
        elif age != age:
            # NaN is the only value not equal to itself.
            return 'None'
        else:
            return '[55, inf)'

    def income_category(income):
        """Bucket an income; empty strings are treated as missing (NaN)."""
        if isinstance(income, str) and not len(income):
            income = float('nan')
        else:
            income = float(income)
        if income < 40000:
            return '[0-40000)'
        elif 40000 <= income < 100000:
            return '[40000, 100000)'
        elif 100000 <= income < 200000:
            return '[100000, 200000)'
        elif 200000 <= income < 500000:
            return '[200000, 500000)'
        elif income != income:
            return 'None'
        else:
            return '[500000, inf)'

    def age_of_car_category(age_of_car):
        """Bucket a car age; empty strings are treated as missing (NaN)."""
        if isinstance(age_of_car, str) and not len(age_of_car):
            age_of_car = float('nan')
        else:
            age_of_car = float(age_of_car)
        if age_of_car < 1:
            return '[0, 1)'
        elif 1 <= age_of_car < 3:
            return '[1, 3)'
        elif age_of_car != age_of_car:
            return 'None'
        else:
            return '[3, inf)'

    def score_client(client):
        """Return a dict of per-feature scores for one client row."""

        def age_score(age):
            # Returns None for labels outside the known set (no final else).
            if age == '[18, 25)':
                return 10
            elif age == '[25, 35)':
                return 20
            elif age == '[35, 55)':
                return 25
            elif age == 'None':
                return 0
            elif age == '[55, inf)':
                return 15

        def family_score(family):
            if family == 1:
                return 10
            elif family == 2:
                return 15
            elif family == 3:
                return 25
            elif family == 4:
                return 20
            else:
                return 0

        def income_score(income):
            if income == '[0-40000)':
                return 5
            elif income == '[40000, 100000)':
                return 10
            elif income == '[100000, 200000)':
                return 15
            elif income == '[200000, 500000)':
                return 20
            elif income == 'None':
                return 0
            elif income == '[500000, inf)':
                return 25

        def house_ownership_score(house_ownership):
            if house_ownership:
                return 25
            else:
                return 0

        def age_of_car_score(age_of_car):
            if age_of_car == '[0, 1)':
                return 25
            elif age_of_car == '[1, 3)':
                return 15
            elif age_of_car == 'None':
                return 0
            elif age_of_car == '[3, inf)':
                return 5

        def employed_by_score(employed_by):
            # Unknown employer types fall through to the default score of 10.
            employed_by_score_dict = {
                "Business Entity Type 3": 25,
                "Business Entity Type 2": 25,
                "Government": 20,
                "Military": 5,
                "Security Ministries": 15,
                "Emergency": 5,
                "Security": 5,
                "Construction": 5,
                "Electricity": 5,
                "XNA": 0,
                "Other": 0,
            }
            return employed_by_score_dict.get(employed_by, 10)

        def education_score(education):
            if education == "Higher education":
                return 25
            elif education == "Secondary / secondary special":
                return 10
            elif education == "Incomplete higher":
                return 20
            else:
                return 0

        def marital_status_score(marital_status):
            if marital_status == "Married":
                return 25
            elif marital_status == "Single / not married":
                return 10
            elif marital_status == "Civil marriage":
                return 15
            elif marital_status == "Widow":
                return 5
            elif marital_status == "Separated":
                return 5
            else:
                return 0

        def position_score(position):
            if position == "Core staff":
                return 25
            elif position == "Accountants":
                return 25
            elif position == "Managers":
                return 25
            elif position == "Security staff":
                return 10
            elif position == "<undefined>":
                return 0
            else:
                return 15

        def income_type_score(income_type):
            if income_type == "State servant":
                return 15
            if income_type == "Working":
                return 10
            if income_type == "Commercial associate":
                return 25
            if income_type == "Pensioner":
                return 5
            else:
                return 0

        def housing_score(housing):
            if housing == "House / apartment":
                return 25
            elif housing == "Rented apartment":
                return 15
            elif housing == "With parents":
                return 10
            else:
                return 0

        score_dict = dict()
        score_dict['age'] = age_score(client.age)
        score_dict['family'] = family_score(client.family)
        score_dict['income'] = income_score(client.income)
        score_dict['house_ownership'] = house_ownership_score(
            client.house_ownership)
        score_dict['age_of_car'] = age_of_car_score(client.age_of_car)
        score_dict['employed_by'] = employed_by_score(client.employed_by)
        score_dict['education'] = education_score(client.education)
        score_dict['marital_status'] = marital_status_score(
            client.marital_status)
        score_dict['position'] = position_score(client.position)
        score_dict['income_type'] = income_type_score(client.income_type)
        score_dict['housing'] = housing_score(client.housing)
        return score_dict

    db_connection = DBhandler(db_path=settings.DATABASE_PATH)
    profile_data = pd.read_sql_query(
        "SELECT * FROM %s" % settings.PROFILE_TABLE, db_connection.connection)
    profile_data.birth = profile_data.birth.apply(parse_date)
    # Approximate age in whole years from the birth date.
    profile_data['age'] = profile_data.birth.apply(
        lambda x: (datetime.date.today() - x).days // 365)
    used_columns = [
        'employed_by', 'education', 'marital_status', 'position',
        'income_type', 'housing'
    ]
    # Decode the categorical codes back to their string labels.
    for column in used_columns:
        profile_data[column] = convert_data_from_categorical(
            profile_data[column].values, column)
    # Replace raw numeric values with their interval labels.
    profile_data['age'] = profile_data.age.apply(age_category)
    profile_data['income'] = profile_data.income.apply(income_category)
    profile_data['age_of_car'] = profile_data.age_of_car.apply(
        age_of_car_category)
    score_table = pd.DataFrame([], columns=[
        'id',
        'age',
        'family',
        'income',
        'house_ownership',
        'age_of_car',
        'employed_by',
        'education',
        'marital_status',
        'position',
        'income_type',
        'housing',
    ])
    # Build one score row per client, in profile order.
    curr_pos = 0
    for i in range(len(profile_data)):
        client = profile_data.iloc[i]
        score_dict = score_client(client)
        score_table.loc[curr_pos] = [
            client.id,
            score_dict['age'],
            score_dict['family'],
            score_dict['income'],
            score_dict['house_ownership'],
            score_dict['age_of_car'],
            score_dict['employed_by'],
            score_dict['education'],
            score_dict['marital_status'],
            score_dict['position'],
            score_dict['income_type'],
            score_dict['housing'],
        ]
        curr_pos = curr_pos + 1
    print(score_table)
    # Recreate the scores table from scratch, then export it as CSV.
    db_connection.execute_sql("DROP TABLE IF EXISTS %s" %
                              settings.CLIENT_SCORES_TABLE)
    score_table.set_index('id', drop=True).to_sql(settings.CLIENT_SCORES_TABLE,
                                                  db_connection.connection)
    export_data("scores table", "clients_scores.csv", score_table.to_csv)
    print("Task 3 done!")
def main():
    """Compute weighted total scores (Task 4), flag defaults on a user-chosen
    risk horizon, and plot the resulting ROC curve."""
    db_connection = DBhandler(db_path=settings.DATABASE_PATH)
    clients_scores = pd.read_sql_query("SELECT * FROM %s" % settings.CLIENT_SCORES_TABLE,
                                       db_connection.connection)
    # Per-feature weights applied to the stored scores.
    score_weights = {
        "age": 0.4919,
        "age_of_car": 0.3890,
        "education": 0.0389,
        "employed_by": 1.0320,
        "family": 0.3844,
        "house_ownership": 0.0023,
        "housing": 0.1372,
        "income": 0.4485,
        "income_type": 0.2288,
        "marital_status": 0.1281,
        "position": 0.7185,
    }
    # Total score = weighted sum of the per-feature scores.
    clients_scores['total_score'] = 0
    for ind in clients_scores.index:
        row = clients_scores.loc[ind]
        temp_scores = dict()
        for column_name in score_weights.keys():
            temp_scores[column_name] = row[column_name] * score_weights[column_name]
        clients_scores.loc[ind, 'total_score'] = sum(temp_scores.values())
    print(clients_scores[['id', 'total_score']])
    # Recreate the totals table and export it as CSV.
    db_connection.execute_sql("DROP TABLE IF EXISTS %s" % settings.TOTAL_SCORES_TABLE)
    clients_scores[['id', 'total_score']].set_index('id', drop=True).to_sql(settings.TOTAL_SCORES_TABLE,
                                                                            db_connection.connection)
    export_data("client scores table", "total_scores.csv",
                clients_scores[['id', 'total_score']].to_csv)
    # risk horizon
    payments_df = pd.read_sql_query("SELECT * FROM %s" % settings.PAYMENT_TABLE,
                                    db_connection.connection)
    payments_df.payment_date = payments_df.payment_date.apply(parse_date)
    # -1 marks "no overdue"; per contract, walk payments in date order and
    # accumulate overdue days (payments are assumed exactly one month apart).
    payments_df['overdue_days'] = -1
    for contract in payments_df.contract_id.unique():
        temp_df = payments_df[payments_df.contract_id == contract].sort_values(['payment_date'])
        index_arr = temp_df.index.tolist()
        for i in range(1, len(index_arr)):
            prev_ind = index_arr[i - 1]
            ind = index_arr[i]
            if payments_df.loc[ind, 'amount_paid'] < payments_df.loc[ind, 'amount_due']:
                if payments_df.loc[prev_ind, 'overdue_days'] == -1:
                    # Overdue starts at the current date, but is 0 for now.
                    payments_df.loc[ind, 'overdue_days'] = 0
                else:
                    new_ov_days = 30
                    payments_df.loc[ind, 'overdue_days'] = payments_df.loc[prev_ind,
                                                                           'overdue_days'] + new_ov_days
            elif payments_df.loc[ind, 'amount_paid'] > payments_df.loc[ind, 'amount_due']:
                # Overpayment covers this many extra months of overdue.
                months_covered = int(round(payments_df.loc[ind, 'amount_paid'] /
                                           payments_df.loc[ind, 'amount_due'], 0))
                months_covered = months_covered - 1
                payments_df.loc[ind, 'overdue_days'] = payments_df.loc[prev_ind,
                                                                      'overdue_days'] - months_covered * 30
                if payments_df.loc[ind, 'overdue_days'] < 0:
                    payments_df.loc[ind, 'overdue_days'] = -1  # If the whole debt was repaid there is no overdue
    # Latest date each contract hit exactly 90 days overdue = default date.
    default_dates = payments_df[payments_df.overdue_days == 90].groupby(['contract_id'],
                                                                        as_index=False).agg({"payment_date": "max"})
    contracts_df = pd.read_sql_query("SELECT * FROM %s" % settings.CONTRACT_TABLE,
                                     db_connection.connection,
                                     index_col='contract_id')
    contracts_df.contract_date = contracts_df.contract_date.apply(parse_date)
    # Attach the contract date and borrower id to every payment row.
    payments_df['contract_date'] = np.nan
    payments_df['id_number'] = 0
    for ind in payments_df.index:
        row = payments_df.loc[ind]
        payments_df.loc[ind, 'contract_date'] = contracts_df.loc[row.contract_id].contract_date
        payments_df.loc[ind, 'id_number'] = contracts_df.loc[row.contract_id].id
    # Contract "age" in whole months at each payment date.
    payments_df['age'] = 0
    for contract in payments_df.contract_id.unique():
        temp_df = payments_df[payments_df.contract_id == contract].sort_values(['payment_date'])
        index_arr = temp_df.index.tolist()
        contract_date = temp_df.contract_date.iloc[0]  # Take the first contract date from the table, since they are all identical
        for i in range(0, len(index_arr)):
            ind = index_arr[i]
            curr_date = payments_df.loc[ind, 'payment_date']
            curr_age = (curr_date.year - contract_date.year) * 12 + (curr_date.month - contract_date.month)
            payments_df.loc[ind, 'age'] = curr_age
    period_months = int(input("Please enter risk horizon (in months): "))
    out_df = payments_df[payments_df.age == period_months][['contract_id', 'payment_date',
                                                            'contract_date', 'id_number',
                                                            'age']].reset_index(drop=True)  # Keep only the needed fields and reset the index (start from 0)
    # A contract is in default on the horizon if its snapshot date is at or
    # after its default date.
    out_df['Default?'] = False
    for contract in out_df.contract_id.unique():
        if contract in default_dates.contract_id.values:
            default_date = default_dates[default_dates.contract_id == contract].payment_date.iloc[0]
            current_date = out_df.loc[out_df.contract_id == contract, "payment_date"].iloc[0]
            if current_date >= default_date:
                out_df.loc[(out_df.contract_id == contract), "Default?"] = True
    # One row per borrower; keep only borrowers that have a score.
    out_df_gb = out_df.groupby(['id_number'], as_index=False).agg({"age": "max",
                                                                   "Default?": "max"})
    out_df_gb = out_df_gb[out_df_gb.id_number.isin(clients_scores.id.unique())]
    out_df_gb.sort_values(['id_number'], inplace=True)
    out_df_gb['score'] = out_df_gb.id_number.apply(lambda x: clients_scores.set_index(['id']).loc[x].total_score)
    # Sweep the threshold over every observed score to trace the ROC curve
    # (prediction: default when score <= threshold).
    scores_arr = sorted(out_df_gb.score.unique())
    x_plot = []
    y_plot = []
    y_true = out_df_gb['Default?'].astype(int).values
    for threshold in ([0.0] + scores_arr + [100.0]):
        y_pred = (out_df_gb.score <= threshold).values.astype(int)
        conf_mx = confusion_matrix(y_true, y_pred)
        tpr = conf_mx[1, 1] / conf_mx[1].sum()
        fpr = conf_mx[0, 1] / conf_mx[0].sum()
        x_plot.append(fpr)
        y_plot.append(tpr)
    roc_curve = plt.figure()
    plt.plot(x_plot, y_plot, label='ROC curve (area = %0.2f)' % auc(x_plot, y_plot))
    plt.plot([0, 1], [0, 1], '--r')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve')
    plt.legend(loc="lower right")
    plt.ion()
    plt.show()
    plt.pause(0.01)
    input("Press [enter] to continue...")
    export_data("roc plot", "roc_plot.png", roc_curve.savefig)
    print("Task 4 done!")
def main():
    """Build overdue/default labels on a risk horizon and run a WOE/IV
    analysis for one user-chosen client attribute (Task 2)."""
    db_connection = DBhandler(db_path=settings.DATABASE_PATH)
    payments_df = pd.read_sql_query(
        "SELECT * FROM %s" % settings.PAYMENT_TABLE, db_connection.connection)
    payments_df.payment_date = payments_df.payment_date.apply(parse_date)
    # First payment date per contract, with its due/paid amounts.
    s_dates = payments_df.groupby(['contract_id'],
                                  as_index=False).agg({'payment_date': 'min'})
    s_dates['amount_due'] = np.nan
    s_dates['amount_paid'] = np.nan
    for ind in s_dates.index:
        row = s_dates.loc[ind]
        t_df = payments_df[(payments_df.contract_id == row.contract_id)
                           & (payments_df.payment_date == row.payment_date)]
        s_dates.loc[ind, 'amount_due'] = t_df.amount_due.iloc[0]
        s_dates.loc[ind, 'amount_paid'] = t_df.amount_paid.iloc[0]
    if np.any(s_dates.amount_paid - s_dates.amount_due < 0):
        mb_zero_default = s_dates[s_dates.amount_paid -
                                  s_dates.amount_due < 0].contract_id.unique()
        print("There may be overdues in the first period! Check contracts:",
              mb_zero_default)
    # Fill in the number of overdue days
    payments_df[
        'overdue_days'] = -1  # -1 where there was no overdue (removed later); 0 marks a payment that was due but did not happen
    for contract in payments_df.contract_id.unique():
        temp_df = payments_df[payments_df.contract_id == contract].sort_values(
            ['payment_date'])
        # Grab the index array, since slices of a pandas.DataFrame keep the original indexing
        index_arr = temp_df.index.tolist()
        for i in range(
                1, len(index_arr)
        ):  # Start from index 1: the first payment always has overdue 0
            prev_ind = index_arr[i - 1]
            ind = index_arr[i]
            if payments_df.loc[ind, 'amount_paid'] < payments_df.loc[
                    ind, 'amount_due']:  # Payment missing or smaller than due
                if payments_df.loc[prev_ind, 'overdue_days'] == -1:  # No overdue so far
                    payments_df.loc[
                        ind,
                        'overdue_days'] = 0  # Overdue starts at the current date, but is 0 for now
                else:
                    # For simplicity just add 30 days, since payments are exactly one month apart.
                    new_ov_days = 30
                    payments_df.loc[ind, 'overdue_days'] = payments_df.loc[
                        prev_ind, 'overdue_days'] + new_ov_days  # Add the previous overdue to the current one
            elif payments_df.loc[ind, 'amount_paid'] > payments_df.loc[
                    ind, 'amount_due']:
                months_covered = int(
                    round(
                        payments_df.loc[ind, 'amount_paid'] /
                        payments_df.loc[ind, 'amount_due'],
                        0))  # Count how many months this payment covers
                months_covered = months_covered - 1  # Subtract one month, since 30 days were not added for the current month
                # Subtract the number of "repaid" overdue payments from the overdue.
                # More complex accrual (interest, uneven payments) would need more elaborate logic.
                payments_df.loc[ind, 'overdue_days'] = payments_df.loc[
                    prev_ind, 'overdue_days'] - months_covered * 30
                if payments_df.loc[ind, 'overdue_days'] < 0:
                    payments_df.loc[
                        ind,
                        'overdue_days'] = -1  # If the whole debt was repaid there is no overdue
    # Take the max date of reaching 90+ overdue, since the client may have cured 90 and fallen back to 60.
    default_dates = payments_df[payments_df.overdue_days == 90].groupby(
        ['contract_id'], as_index=False).agg({"payment_date": "max"})
    print(
        default_dates.rename(
            {
                "contract_id": "Contract Number",
                "payment_date": "Default Date"
            }, axis=1))
    contracts_df = pd.read_sql_query("SELECT * FROM %s" % settings.CONTRACT_TABLE,
                                     db_connection.connection,
                                     index_col='contract_id')
    contracts_df.contract_date = contracts_df.contract_date.apply(parse_date)
    # Determine the contract date and the borrower for each payment row.
    payments_df['contract_date'] = np.nan
    payments_df['id_number'] = 0
    for ind in payments_df.index:
        row = payments_df.loc[ind]
        payments_df.loc[ind, 'contract_date'] = contracts_df.loc[
            row.contract_id].contract_date
        payments_df.loc[ind, 'id_number'] = contracts_df.loc[row.contract_id].id
    # Determine the contract "age" in months at each date.
    payments_df['age'] = 0
    for contract in payments_df.contract_id.unique():
        temp_df = payments_df[payments_df.contract_id == contract].sort_values(
            ['payment_date'])
        index_arr = temp_df.index.tolist()
        contract_date = temp_df.contract_date.iloc[
            0]  # Take the first contract date from the table, since they are all identical
        for i in range(0, len(index_arr)):
            ind = index_arr[i]
            curr_date = payments_df.loc[ind, 'payment_date']
            curr_age = (curr_date.year - contract_date.year) * 12 + (
                curr_date.month - contract_date.month)
            payments_df.loc[ind, 'age'] = curr_age
    period_months = int(input("Please enter risk horizon (in months): "))
    out_df = payments_df[payments_df.age == period_months][[
        'contract_id', 'payment_date', 'contract_date', 'id_number', 'age'
    ]].reset_index(
        drop=True
    )  # Keep only the needed fields and reset the index (start from 0)
    out_df['Default?'] = False
    for contract in out_df.contract_id.unique():
        if contract in default_dates.contract_id.values:
            default_date = default_dates[default_dates.contract_id ==
                                         contract].payment_date.iloc[0]
            current_date = out_df.loc[out_df.contract_id == contract,
                                      "payment_date"].iloc[0]
            if current_date >= default_date:
                out_df.loc[(out_df.contract_id == contract), "Default?"] = True
    out_df_gb = out_df.groupby(['id_number'], as_index=False).agg({
        "age": "max",
        "Default?": "max"
    }).rename({"age": "Age (months)"}, axis=1)
    print(out_df_gb)
    # Select the needed client data
    applications_df = pd.read_sql_query(""" SELECT * FROM %s """ % settings.PROFILE_TABLE,
                                        db_connection.connection,
                                        index_col='id')
    # Drop unneeded columns
    applications_df.drop(["issue_date"], axis=1, inplace=True)
    applications_df.birth = applications_df.birth.apply(parse_date)
    # Drop the client for which there is too little data
    if 100076 in applications_df.index:
        applications_df.drop([100076], inplace=True)
    # Fill empty fields
    applications_df.fillna(0.0, inplace=True)
    # Label each client 1/0 by whether any of their contracts defaulted.
    applications_df['default'] = 0
    applications_df.loc[applications_df.index.isin(out_df_gb[
        out_df_gb['Default?']].id_number.unique()), "default"] = 1
    applications_df['age'] = (
        datetime.date.today() -
        applications_df.birth).apply(lambda x: x.days // 365)
    applications_df.drop(['birth'], axis=1, inplace=True)
    # Quantile-bucket the continuous attributes.
    applications_df.income = pd.qcut(applications_df.income, 5)

    def get_age_of_car_category(age):
        """Coarse car-age bucket: no car / up to 3 years / older."""
        if age == 0:
            return '0'
        elif age <= 3:
            return '<=3'
        else:
            return '>3'

    applications_df.age_of_car = applications_df.age_of_car.apply(
        get_age_of_car_category)
    applications_df.age = pd.qcut(applications_df.age, 5)
    quantilized_columns = ['income', 'age', 'age_of_car']
    categorical_columns = [
        'gender',
        'employed_by',
        'education',
        'marital_status',
        'position',
        'income_type',
        'housing',
        'house_ownership',
        'children',
        'family',
    ]
    # Let the user pick which attribute to analyse.
    choices = dict(
        zip(range(len(categorical_columns + quantilized_columns)),
            sorted(categorical_columns + quantilized_columns)))
    for i in choices.keys():
        print("%d: %s" % (i, choices[i]))
    column_choice = choices.get(
        int(input("Please choose column (default=0): ")), 0)
    # Per category: default count plus event / non-event shares.
    temp_df = applications_df[[column_choice, 'default']].copy()
    temp_df['event'] = temp_df.default
    temp_df['non_event'] = 1 - temp_df.default
    df_gb = temp_df.groupby([column_choice]).agg({
        "default": "sum",
        "event": lambda x: x.sum() / temp_df.event.sum(),
        "non_event": lambda x: x.sum() / temp_df.non_event.sum()
    })
    # Weight of evidence (clipped to avoid log(0)) and information value.
    df_gb['woe'] = np.log(np.clip(df_gb.event / df_gb.non_event, 0.001, np.inf))
    df_gb['IV'] = (df_gb.event - df_gb.non_event) * df_gb.woe
    df_gb.index = df_gb.index.astype(str)
    woe_fig = plt.figure(figsize=(12, 7))
    plt.scatter(list(range(df_gb.shape[0])), df_gb.woe)
    plt.grid()
    plt.xticks(list(range(df_gb.shape[0])), df_gb.index)
    plt.ylabel("WOE", fontsize=14)
    plt.xlabel(column_choice, fontsize=14)
    plt.title("WOE plot", fontsize=16)
    plt.ion()
    plt.show()
    iv_fig = plt.figure(figsize=(12, 7))
    plt.scatter(list(range(df_gb.shape[0])), df_gb.IV)
    plt.grid()
    plt.xticks(list(range(df_gb.shape[0])), df_gb.index)
    plt.ylabel("IV", fontsize=14)
    plt.xlabel(column_choice, fontsize=14)
    plt.title("Information Value plot", fontsize=16)
    plt.ion()
    plt.show()
    df_gb.loc['SUM'] = df_gb.sum()
    print(df_gb)
    plt.pause(0.001)
    input("Press [enter] to continue...")
    export_data("woe plot", "woe_plot.png", woe_fig.savefig)
    export_data("iv plot", "iv_plot.png", iv_fig.savefig)
    export_data("woe iv table", "woe_iv_table.csv", df_gb.to_csv)
    export_data("defaults on horizon table", "defaults_horizon.csv",
                out_df_gb.to_csv)
    print("Task 2 done!")