Ejemplo n.º 1
0
def test_get_train_data_with_UFED(_MockCursor):
    """Check that a UFED code is appended as a filter to the train query.

    ``_MockCursor`` is the mocked database cursor handed to this test
    (how it is injected is not visible in this excerpt).
    """
    # Program the mocked cursor with canned metadata and rows.
    _MockCursor.fetchall.return_value = MOCK_VALUES
    _MockCursor.description = MOCK_DESCRIPTION
    _MockCursor.execute.return_value = None

    # The base query is expected to be extended with the UFED condition.
    ufed_filter = " AND B.SNCA_UFED_DK = 33"
    expected_query = TRAIN_QUERY + ufed_filter

    result = get_train_data(_MockCursor, 33)
    expected_frame = pd.DataFrame(MOCK_VALUES, columns=MOCK_COLUMNS)

    # The returned frame must match the mocked rows, and the cursor must
    # have been asked to run exactly the extended query.
    assert_frame_equal(result, expected_frame)
    _MockCursor.execute.assert_called_with(expected_query)
Ejemplo n.º 2
0
def test_get_train_data_with_end_date(_MockCursor):
    """Check that ``end_date`` adds an upper date bound to the train query.

    ``_MockCursor`` is the mocked database cursor handed to this test
    (how it is injected is not visible in this excerpt).
    """
    # Program the mocked cursor with canned metadata and rows.
    _MockCursor.fetchall.return_value = MOCK_VALUES
    _MockCursor.description = MOCK_DESCRIPTION
    _MockCursor.execute.return_value = None

    # The base query is expected to be extended with the date condition.
    date_filter = (" AND A.ATSD_DT_REGISTRO <= "
                   "TO_DATE('2018-01-01', 'YYYY-MM-DD')")
    expected_query = TRAIN_QUERY + date_filter

    result = get_train_data(_MockCursor, end_date='2018-01-01')
    expected_frame = pd.DataFrame(MOCK_VALUES, columns=MOCK_COLUMNS)

    # The returned frame must match the mocked rows, and the cursor must
    # have been asked to run exactly the extended query.
    assert_frame_equal(result, expected_frame)
    _MockCursor.execute.assert_called_with(expected_query)
Ejemplo n.º 3
0
# Training script fragment: query the database for training documents,
# stop if nothing came back, then clean the text column.

# NOTE(review): presumably the name of the DataFrame column holding the
# label to predict -- confirm against the code that consumes it.
LABEL_COLUMN = 'DMDE_MDEC_DK'

# Vectorizer parameters
# NOTE(review): these look like n-gram range and document-frequency bounds
# for a text vectorizer; the vectorizer itself is not visible here.
NGRAM_RANGE = (1, 3)
MAX_DF = 0.6
MIN_DF = 1

print('Running train script:')
print('Querying database...')
# Client built from HDFS_URL (presumably an HDFS client; it is not used in
# the lines visible here).
client = KerberosClient(HDFS_URL)

# Open a JDBC connection with the Oracle driver class and grab a cursor.
conn = jdbc.connect("oracle.jdbc.driver.OracleDriver", URL_ORACLE_SERVER,
                    [USER_ORACLE, PASSWD_ORACLE], ORACLE_DRIVER_PATH)
curs = conn.cursor()

# Fetch the training data, passing the START_DATE / END_DATE bounds.
df = get_train_data(curs, start_date=START_DATE, end_date=END_DATE)

# Exit the script early when the query returned no rows; otherwise report
# how many documents are available.
nb_documents = len(df)
if nb_documents == 0:
    print('No data to train model!')
    sys.exit()
else:
    print('{} documents available to train model.\n'.format(nb_documents))

# get_keys is defined elsewhere; presumably it collects the ID_COLUMN values
# of the training rows -- TODO confirm.
train_keys = get_keys(df, ID_COLUMN)

print('Preparing data...')
# Replace every value of the text column with its clean_text() result.
df[TEXT_COLUMN] = df[TEXT_COLUMN].apply(clean_text)

# Labels need to be grouped to be passed to the MultiLabelBinarizer
df = df.groupby(TEXT_COLUMN)\
Ejemplo n.º 4
0
def test_get_train_data_UFED_not_int(_MockCursor):
    """Check that a non-integer UFED argument raises ``TypeError``.

    ``_MockCursor`` is the mocked database cursor handed to this test
    (how it is injected is not visible in this excerpt).
    """
    # Program the mocked cursor with canned metadata and rows.
    _MockCursor.fetchall.return_value = MOCK_VALUES
    _MockCursor.description = MOCK_DESCRIPTION

    invalid_ufed = 'not an int'
    with pytest.raises(TypeError):
        get_train_data(_MockCursor, invalid_ufed)