def test_numeric_pipe(self):

        Insert = extract.Insert(server_name, driver_name, database_name)

        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        numeric_pipe = Transform.numeric_pipeline(
            numeric_attributes=NUM_ATTRIBUTES)

        df_clean = clean_pipe.fit_transform(dataset_raw)
        df_numeric = numeric_pipe.fit_transform(df_clean)

        return None
    def test_categorical_pipe(self):

        Insert = extract.Insert(server_name, driver_name, database_name)

        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        categorical_pipe = Transform.categorical_pipeline(
            categorical_attributes=CATEGORICAL_ATTRIBUTES,
            handle_unknown='ignore',
            categories_file=CATEGORIES_FILE)

        df_clean = clean_pipe.fit_transform(dataset_raw)
        ohe_array = categorical_pipe.fit_transform(df_clean).toarray()
        print("Example OneHotEcoded Array: ", ohe_array[0])

        # Find more about categorical pipe
        ohe = categorical_pipe.named_steps['OneHotEncoder']
        print("Categories used for OneHotEncoder", ohe.categories)

        return None
Example No. 3
def get_train_test_id_sql(train_pct=0.8):
    """Returns primary keys of all unique customers
    The complete set of customer_ids are split into training and testing
    sets
    inputs
    -------
    train_pct: (float) percent of docuemnt _ids to be considered for training
        outputs_
    -------
    (train_ids, test_ids): (list) of training and testing _ids """

    # Set up connection to SQL
    Insert = extract.Insert(server_name=server_name,
                            driver_name=driver_name,
                            database_name=database_name)

    # Query SQL for all customer primary keys
    sel = sqlalchemy.select([Customers.id])
    customer_ids = Insert.core_select_execute(sel)
    customer_ids = [x.id for x in customer_ids]

    # Permute all primary keys into training and testing sets
    index = np.arange(len(customer_ids))
    np.random.shuffle(index)
    n_train = int(len(customer_ids) * train_pct)
    train_index = index[:n_train]
    test_index = index[n_train:]

    train_ids = [customer_ids[idx] for idx in train_index]
    test_ids = [customer_ids[idx] for idx in test_index]

    return train_ids, test_ids
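
# A minimal usage sketch (assuming the module-level SQL configuration used
# above is available); the returned id lists are disjoint and together cover
# every customer primary key.
train_ids, test_ids = get_train_test_id_sql(train_pct=0.8)
assert set(train_ids).isdisjoint(test_ids)
print("Training ids: {}, testing ids: {}".format(len(train_ids), len(test_ids)))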
    def test_text_pipe(self):

        Insert = extract.Insert(server_name, driver_name, database_name)

        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        # Create pipeline specifically for clustering text features
        name_vocabulary = VocabularyText.read_vocabulary_disc(
            POINTNAME_VOCABULARY_FILENAME)
        name_text_pipe = Transform.text_pipeline_label(
            attributes=['NAME'], vocabulary=name_vocabulary)

        full_pipeline = Pipeline([
            ('clean_pipe', clean_pipe),
            ('text_pipe', name_text_pipe),
        ])

        dataset = full_pipeline.fit_transform(dataset_raw)

        return None
Example No. 5
def test_get_building_suffix():
    """Test whether a set of points is a building suffix word"""

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(18))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Split name variable
    token_pattern = r'\.'
    tokenizer = re.compile(token_pattern)

    # Keep track of words
    words = []

    # Split each name into tokens
    for idx, word in dataset_raw['NAME'].items():
        parts = tokenizer.split(word)
        words.append(parts)

    # Get vocabulary
    VocabularyText = transform_pipeline.VocabularyText()
    suffix = VocabularyText.get_building_suffix(words)
    print("Suffix found : ", suffix)

    return None
Example No. 6
def test_get_text_vocabulary():
    """Generate data to find Vocabulary"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    df_clean = clean_pipe.fit_transform(dataset_raw)

    # Get vocabulary for DESCRIPTOR feature - a text feature
    VocabularyText = transform_pipeline.VocabularyText()
    vocabulary = VocabularyText\
        .get_text_vocabulary(X=df_clean,
                             col_name='DESCRIPTOR',
                             remove_suffix=False,
                             max_features=80)

    # Save vocabulary
    file_name = r'../data/vocab_descriptor.txt'
    transform_pipeline.VocabularyText.save_vocabulary(vocabulary, file_name)

    return None
Example No. 7
def test_cleaning_pipe():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)

    df = clean_pipe.fit_transform(dataset_raw)

    return df
Example No. 8
def test_get_database_features():

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([('clean_pipe', clean_pipe),
                              ('text_pipe', text_pipe),
                              ])
    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer_id))
    database = Insert.pandas_select_execute(sel)
    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    database_features = ExtractLabels.get_database_features(database,
                                                            full_pipeline,
                                                            instance_name=customer_name)
    return database_features
    def test_read_categories(self):

        # Initialize
        categories = Transform._read_categories(CATEGORICAL_ATTRIBUTES,
                                                CATEGORIES_FILE)

        replaceNone = ReplaceNone(CATEGORICAL_ATTRIBUTES)
        dataFrameSelector = DataFrameSelector(CATEGORICAL_ATTRIBUTES)
        oneHotEncoder = OneHotEncoder(categories=categories,
                                      handle_unknown='ignore')

        # Get raw database
        Insert = extract.Insert(server_name, driver_name, database_name)
        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)
        df_clean1 = clean_pipe.fit_transform(dataset_raw)

        # Transform
        df0 = replaceNone.fit_transform(df_clean1)
        df1_array = dataFrameSelector.fit_transform(df0)
        ohearray = oneHotEncoder.fit_transform(df1_array).toarray()

        return None
Example No. 10
def test_categorical_pipe():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)

    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        categories_file=r'../data/categorical_categories.dat')

    df_clean = clean_pipe.fit_transform(dataset_raw)
    ohe_array = categorical_pipe.fit_transform(df_clean).toarray()

    # Find more about categorical pipe
    ohe = categorical_pipe.named_steps['catEncoder']
    ohe.categories  # ohe.categories_ when categories='auto'

    return ohe_array
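
# Hedged illustration of the attribute noted above (standalone sklearn example,
# not part of this project's pipeline): with categories='auto' the fitted
# categories are exposed as .categories_, while an explicit categories argument
# is echoed back through .categories.
import numpy as np
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
enc.fit(np.array([['a'], ['b'], ['a']]))
print(enc.categories_)  # Categories learned from the data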
Example No. 11
def test_time():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    RemoveAttribute = transform_pipeline.RemoveAttribute(
        Transform.drop_attributes)
    RemoveNan = transform_pipeline.RemoveNan(Transform.nan_replace_dict)
    SetDtypes = transform_pipeline.SetDtypes(Transform.type_dict)
    TextCleaner = transform_pipeline.TextCleaner(Transform._text_clean_attrs,
                                                 replace_numbers=True)
    UnitCleaner = transform_pipeline.UnitCleaner(Transform.unit_dict)
    DuplicateRemover = transform_pipeline.DuplicateRemover(Transform.dupe_cols,
                                                           remove_dupe=True)
    VirtualRemover = transform_pipeline.VirtualRemover(remove_virtual=True)

    t0 = time.time()
    df0 = RemoveAttribute.fit_transform(dataset_raw)

    t1 = time.time()
    df1 = RemoveNan.fit_transform(df0)

    t2 = time.time()
    df2 = SetDtypes.fit_transform(df1)

    t3 = time.time()
    df3 = TextCleaner.fit_transform(df2)

    t4 = time.time()
    df4 = UnitCleaner.fit_transform(df3)

    t5 = time.time()
    indicies = DuplicateRemover.get_duplicate_indicies(df4, 'NAME')
    print('Duplicate names')
    print(df4['NAME'].iloc[indicies[:50]])
    df5 = DuplicateRemover.fit_transform(df4)

    t6 = time.time()
    df6 = VirtualRemover.fit_transform(df5)
    t7 = time.time()

    print('RemoveAttribute : {}'.format(t1 - t0))
    print('RemoveNan : {}'.format(t2 - t1))
    print('SetDtypes : {}'.format(t3 - t2))
    print('TextCleaner : {}'.format(t4 - t3))
    print('UnitCleaner : {}'.format(t5 - t4))
    print('DuplicateRemover : {}'.format(t6 - t5))
    print('VirtualRemover : {}'.format(t7 - t6))

    return None
    def test_calc_categories_dict(self):

        # Generate data to find categories
        Insert = extract.Insert(server_name, driver_name, database_name)

        sel = sqlalchemy.select([Points])
        dataset_raw = Insert.pandas_select_execute(sel)

        # Create 'clean' data processing pipeline
        clean_pipe = Transform.cleaning_pipeline(
            drop_attributes=DROP_ATTRIBUTES,
            nan_replace_dict=NAN_REPLACE_DICT,
            dtype_dict=TYPE_DICT,
            unit_dict=UNIT_DICT,
            dupe_cols=DUPE_COLS,
            remove_dupe=REMOVE_DUPE,
            replace_numbers=REPLACE_NUMBERS,
            remove_virtual=REMOVE_VIRTUAL,
            text_clean_attributes=TEXT_CLEAN_ATTRS)

        string_pipe = SetDtypes(
            type_dict={
                'TYPE': str,
                'ALARMTYPE': str,
                'FUNCTION': str,
                'VIRTUAL': str,
                'CS': str,
                'SENSORTYPE': str,
                'DEVUNITS': str
            })

        categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                          ('string_pipe', string_pipe)])

        df_clean = categories_clean_pipe.fit_transform(dataset_raw)

        # Calculate categories to be used later
        Encoding = EncodingCategories()
        columns = [
            'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
            'DEVUNITS'
        ]
        categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

        if not os.path.exists(CATEGORIES_FILE):
            raise OSError(
                "Categories file not found: {}".format(CATEGORIES_FILE))

        # Compare categories read to those saved on disc
        categories_dict_read = Encoding.read_categories_from_disc(
            CATEGORIES_FILE)
        for key in set(
            (*categories_dict_calc.keys(), *categories_dict_read.keys())):
            self.assertEqual(set(categories_dict_calc[key]),
                             set(categories_dict_read[key]))

        return None
def test_cluster_with_hyperparameters():
    """Test clustering with hyperparameters"""

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    #_word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
    #df_text = pd.DataFrame(X, columns=_word_vocab)

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }

    result = UnsupervisedCluster.cluster_with_hyperparameters(
        hyperparameters, X)

    best_nc_df = result.best_nc_dataframe

    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k

    return result
Example No. 14
    def test_serialize_example_in_example(self):

        # Requires MSSQL server, data in 'Clustering' database,
        # configuration file, and other stuff
        config = configparser.ConfigParser()
        config.read(r'../extract/sql_config.ini')
        server_name = config['sql_server']['DEFAULT_SQL_SERVER_NAME']
        driver_name = config['sql_server']['DEFAULT_SQL_DRIVER_NAME']
        database_name = config['sql_server']['DEFAULT_DATABASE_NAME']

        Insert = extract.Insert(server_name=server_name,
                                driver_name=driver_name,
                                database_name=database_name)

        # Load an example from SQL database
        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        database = Insert.pandas_select_execute(sel)
        sel = sqlalchemy.select([Customers.name
                                 ]).where(Customers.id.__eq__(customer_id))
        customer_name = Insert.core_select_execute(sel)[0].name

        # Transformation pipeline
        full_pipeline = Transform.get_ranking_pipeline()

        # Dictionary with keys ['n_instance', 'n_features', 'len_var', 'uniq_ratio',
        #                    'n_len1', 'n_len2', 'n_len3', 'n_len4', 'n_len5',
        #                    'n_len6', 'n_len7']
        database_features = get_database_features(database,
                                                  full_pipeline,
                                                  instance_name=customer_name)
        database_features.pop('instance')

        #1. Context features (bytes object)
        serialized_context = serialize_context_from_dictionary(
            database_features)

        #2. Peritem features (bytes object)
        serialized_peritem = serialize_examples_model4(
            HYPERPARAMETER_LIST, list_size=_LIST_SIZE_MODEL4)

        # Prepare serialized feature spec for EIE format
        serialized_dict = {
            'serialized_context': _bytes_feature([serialized_context]),
            'serialized_examples': _bytes_feature(serialized_peritem)
        }

        # Convert to tf.train.Example object
        serialized_proto = tf.train.Example(features=tf.train.Features(
            feature=serialized_dict))
        serialized_example_in_example = serialized_proto.SerializeToString()

        return serialized_example_in_example
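
# The _bytes_feature helper referenced above is not shown in this snippet; a
# common definition (an assumption here, matching the usual TFRecord examples)
# wraps a list of byte strings into a tf.train.Feature:
import tensorflow as tf

def _bytes_feature(values):
    """Return a tf.train.Feature holding a list of byte strings."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))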
def test_unsupervised_cluster():

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    _word_vocab = text_pipe.named_steps[
        'WordDictToSparseTransformer'].vocabulary
    df_text = pd.DataFrame(X, columns=_word_vocab)

    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k

    if X.shape[0] <= 3 or correct_k == 1:
        # Dont cluster - just pass 1 cluster total
        prediction_agglo = np.ones((X.shape[0]))

    else:
        # Cluster
        agglomerative = AgglomerativeClustering(n_clusters=correct_k,
                                                affinity='euclidean',
                                                linkage='ward')
        prediction_agglo = agglomerative.fit_predict(X)

    return df_clean, prediction_agglo
    def test_timeself(self):

        Insert = extract.Insert(server_name, driver_name, database_name)

        customer_id = 15
        sel = sqlalchemy.select([Points]).where(
            Points.customer_id.__eq__(customer_id))
        dataset_raw = Insert.pandas_select_execute(sel)

        removeAttribute = RemoveAttribute(DROP_ATTRIBUTES)
        removeNan = RemoveNan(NAN_REPLACE_DICT)
        setDtypes = SetDtypes(TYPE_DICT)
        textCleaner = TextCleaner(TEXT_CLEAN_ATTRS, replace_numbers=True)
        unitCleaner = UnitCleaner(UNIT_DICT)
        duplicateRemover = DuplicateRemover(DUPE_COLS, remove_dupe=True)
        virtualRemover = VirtualRemover(remove_virtual=True)

        t0 = time.time()
        df0 = removeAttribute.fit_transform(dataset_raw)

        t1 = time.time()
        df1 = removeNan.fit_transform(df0)

        t2 = time.time()
        df2 = setDtypes.fit_transform(df1)

        t3 = time.time()
        df3 = textCleaner.fit_transform(df2)

        t4 = time.time()
        df4 = unitCleaner.fit_transform(df3)

        t5 = time.time()
        indicies = duplicateRemover.get_duplicate_indicies(df4, 'NAME')
        print('Duplicate names')
        print(df4['NAME'].iloc[indicies[:50]])
        df5 = duplicateRemover.fit_transform(df4)

        t6 = time.time()
        virtualRemover.fit_transform(df5)
        t7 = time.time()

        print('RemoveAttribute : {}'.format(t1 - t0))
        print('RemoveNan : {}'.format(t2 - t1))
        print('SetDtypes : {}'.format(t3 - t2))
        print('TextCleaner : {}'.format(t4 - t3))
        print('UnitCleaner : {}'.format(t5 - t4))
        print('DuplicateRemover : {}'.format(t6 - t5))
        print('VirtualRemover : {}'.format(t7 - t6))

        return None
def test_serialize_examples_from_dictionary():
    """This module has (3) methods of serializing peritem features.

    Set up how I want to assign labels to objects:
    reciprocal will cause labels to be the inverse of the loss metric;
    set it to True if I do not want labels to be binned
    (see the illustrative sketch after this function)."""
    reciprocal = False  # Reciprocal of relevance label - use if you don't bin labels
    n_bins = 5  # Number of bins for relevance label

    label_key = 'relevance'

    # These are the per-item feature column names
    peritem_keys = ['by_size', 'n_components', 'clusterer', 'reduce', 'index']


    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id.__eq__(customer_id))
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k

    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k,
                                            error_scale=0.8, var_scale=0.2)
    example_features = []
    for label in best_labels:
        feature_dict = {}
        for key in peritem_keys:
            feature_dict[key] = label.hyperparameter_dict[key]
        feature_dict[label_key] = label.loss
        example_features.append(feature_dict)

    serialized_example = serialize_examples_from_dictionary(example_features,
                                                            label_key,
                                                            peritem_keys,
                                                            reciprocal=reciprocal,
                                                            n_bins=n_bins,
                                                            shuffle_peritem=True)

    return serialized_example
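
# Illustrative sketch only (not the project's serializer): the two labeling
# schemes described in the function above map a loss value to a relevance label
# either by taking its reciprocal or by binning losses into n_bins ordinal levels.
import numpy as np

def example_relevance_labels(losses, reciprocal=False, n_bins=5):
    losses = np.asarray(losses, dtype=float)
    if reciprocal:
        # Smaller loss -> larger relevance, no binning
        return 1.0 / losses
    # Otherwise rank the losses and assign n_bins ordinal levels;
    # the smallest losses land in the highest relevance bin
    order = losses.argsort()
    ranks = np.empty_like(order)
    ranks[order] = np.arange(len(losses))
    return (n_bins - 1) - (ranks * n_bins // len(losses))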
Example No. 18
def test_calc_categories_dict():
    """Generate data to find categories"""
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    string_pipe = transform_pipeline.SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })

    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])

    df_clean = categories_clean_pipe.fit_transform(dataset_raw)
    """Calculate and save categories to be used later"""
    Encoding = transform_pipeline.EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict = Encoding.calc_categories_dict(df_clean, columns)
    save_path = r'../data/categorical_categories.dat'

    Encoding.save_categories_to_disc(categories_dict, save_path)
    categories_dict1 = Encoding.read_categories_from_disc(save_path)
    for key in set((*categories_dict.keys(), *categories_dict1.keys())):
        assert (np.array_equal(categories_dict[key], categories_dict1[key]))

    return None
Example No. 19
    def test_legacy_numeric_transform_pipeline_MIL(self):

        # Get some raw data
        Insert = extract.Insert(server_name, driver_name, database_name)
        group_id = 15
        sel = sqlalchemy.select([Points
                                 ]).where(Points.group_id.__eq__(group_id))
        dfraw = Insert.pandas_select_execute(sel)

        # Get the legacy pipeline
        full_pipeline = legacy_numeric_transform_pipeline_MIL()

        # Transform data
        bag = full_pipeline.fit_transform(dfraw)

        # Observe output number of attributes; should be 3236 for compatibility
        self.assertEqual(bag.shape[1], 3236)

        return None
Example No. 20
def test_read_categories():

    # Initialize
    Transform = transform_pipeline.Transform()
    categories_file = r'../data/categorical_categories.dat'
    categories = Transform._read_categories(Transform.cat_attributes,
                                            categories_file)
    categorical_attributes = Transform.cat_attributes

    ReplaceNone = transform_pipeline.ReplaceNone(categorical_attributes)
    DataFrameSelector = transform_pipeline.DataFrameSelector(
        categorical_attributes)
    OneHotEncoder = transform_pipeline.OneHotEncoder(categories=categories)

    # Get raw database
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=False,
                                             remove_virtual=True)
    df_clean1 = clean_pipe.fit_transform(dataset_raw)

    # Transform
    df0 = ReplaceNone.fit_transform(df_clean1)
    df1_array = DataFrameSelector.fit_transform(df0)
    ohearray = OneHotEncoder.fit_transform(df1_array).toarray()

    # Examine the transformers
    print(df0[categorical_attributes].iloc[:5])
    print(df1_array[:5])
    OneHotEncoder.categories

    return None
Example No. 21
def test_get_database_labels():

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get all records relating to one customer
    customer_id = 15
    sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
        .where(Clustering.customer_id.__eq__(customer_id))
    res = Insert.core_select_execute(sel)
    primary_keys = [x.id for x in res]
    correct_k = res[0].correct_k

    sel = sqlalchemy.select([Customers.name]).where(Customers.id.__eq__(customer_id))
    customer_name = Insert.core_select_execute(sel)[0].name

    # Calculate ranking of all records
    records = get_records(primary_keys)
    best_labels = ExtractLabels.calc_labels(records, correct_k, error_scale=0.8, var_scale=0.2)

    return best_labels
Example No. 22
def test_text_pipe():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    customer_id = 15
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    dataset_raw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    full_pipeline = Pipeline([
        ('clean_pipe', clean_pipe),
        ('text_pipe', text_pipe),
    ])

    dataset = full_pipeline.fit_transform(dataset_raw)

    return dataset
Example No. 23
def test_full_pipeline():

    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')
    # group_id = 4
    group_id = 15
    sel = sqlalchemy.select([Points]).where(Points.group_id.__eq__(group_id))
    dfraw = Insert.pandas_select_execute(sel)

    # Transform pipeline
    Transform = transform_pipeline.Transform()

    # Cleaning pipeline
    clean_pipe = Transform.cleaning_pipeline(drop_attributes=None,
                                             nan_replace_dict=None,
                                             dtype_dict=None,
                                             unit_dict=None,
                                             remove_dupe=True,
                                             replace_numbers=True,
                                             remove_virtual=True)

    # Text feature encoders
    name_file = r'../data/vocab_name.txt'
    name_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        name_file)
    name_text_pipe = Transform.text_pipeline_label(attributes=['NAME'],
                                                   vocabulary=name_vocabulary)
    descriptor_file = r'../data/vocab_descriptor.txt'
    descriptor_vocabulary = transform_pipeline.VocabularyText.read_vocabulary_disc(
        descriptor_file)
    descriptor_text_pipe = Transform.text_pipeline_label(
        attributes=['DESCRIPTOR'], vocabulary=descriptor_vocabulary)

    # Categorical Features
    categorical_pipe = Transform.categorical_pipeline(
        categorical_attributes=None,
        handle_unknown='ignore',
        categories_file=r'../data/categorical_categories.dat')

    # Numeric features
    numeric_pipe = Transform.numeric_pipeline(numeric_attributes=None)

    # Union
    combined_features = FeatureUnion(transformer_list=[
        ('CategoricalPipe', categorical_pipe),
        ('NameTextPipe', name_text_pipe),
        ('DescriptorTextPipe', descriptor_text_pipe),
        ('NumericPipe', numeric_pipe),
    ])
    full_pipeline = Pipeline([
        ('CleaningPipe', clean_pipe),
        ('CombinedFeatures', combined_features),
    ])

    combined_csr = full_pipeline.fit_transform(dfraw)
    combined_csr.shape

    CleaningPipe = full_pipeline.steps[0][1]  # CleaningPipe
    RemoveAttribute = full_pipeline.steps[0][1][0]  # RemoveAttribute
    RemoveNan = full_pipeline.steps[0][1][1]
    SetDtypes = full_pipeline.steps[0][1][2]
    TextCleaner = full_pipeline.steps[0][1][3]
    UnitCleaner = full_pipeline.steps[0][1][4]
    DuplicateRemover = full_pipeline.steps[0][1][5]
    VirtualRemover = full_pipeline.steps[0][1][6]

    df0 = RemoveAttribute.fit_transform(copy.deepcopy(dfraw))
    df1 = RemoveNan.fit_transform(copy.deepcopy(df0))
    df2 = SetDtypes.fit_transform(copy.deepcopy(df1))
    df3 = TextCleaner.fit_transform(copy.deepcopy(df2))
    df4 = UnitCleaner.fit_transform(copy.deepcopy(df3))
    df5 = DuplicateRemover.fit_transform(copy.deepcopy(df4))
    df6 = VirtualRemover.fit_transform(copy.deepcopy(df5))

    return None
Example No. 24
def calc_save_categories_vocabulary():

    # Read raw data from database
    Insert = extract.Insert(server_name, driver_name, database_name)
    sel = sqlalchemy.select([Points])
    dataset_raw = Insert.pandas_select_execute(sel)

    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(
        drop_attributes=DROP_ATTRIBUTES,
        nan_replace_dict=NAN_REPLACE_DICT,
        dtype_dict=TYPE_DICT,
        unit_dict=UNIT_DICT,
        dupe_cols=DUPE_COLS,
        remove_dupe=REMOVE_DUPE,
        replace_numbers=REPLACE_NUMBERS,
        remove_virtual=REMOVE_VIRTUAL,
        text_clean_attributes=TEXT_CLEAN_ATTRS)

    string_pipe = SetDtypes(
        type_dict={
            'TYPE': str,
            'ALARMTYPE': str,
            'FUNCTION': str,
            'VIRTUAL': str,
            'CS': str,
            'SENSORTYPE': str,
            'DEVUNITS': str
        })

    categories_clean_pipe = Pipeline([('clean_pipe', clean_pipe),
                                      ('string_pipe', string_pipe)])

    # Process raw data with pipeline
    df_clean = categories_clean_pipe.fit_transform(dataset_raw)

    # Calculate categories to be used later
    Encoding = EncodingCategories()
    columns = [
        'TYPE', 'ALARMTYPE', 'FUNCTION', 'VIRTUAL', 'CS', 'SENSORTYPE',
        'DEVUNITS'
    ]
    categories_dict_calc = Encoding.calc_categories_dict(df_clean, columns)

    # Save categories in numpy array to be used later
    Encoding.save_categories_to_disc(categories_dict_calc, CATEGORIES_FILE)

    # Save vocabulary to file
    VOCAB_ALARMTYPE_PATH = '../data/vocab_alarmtype.txt'
    save_numpy_string_array_to_text(categories_dict_calc['ALARMTYPE'],
                                    VOCAB_ALARMTYPE_PATH)
    VOCAB_CS_PATH = '../data/vocab_cs.txt'
    save_numpy_string_array_to_text(categories_dict_calc['CS'], VOCAB_CS_PATH)
    VOCAB_DEVUNITS_PATH = '../data/vocab_devunits.txt'
    save_numpy_string_array_to_text(categories_dict_calc['DEVUNITS'],
                                    VOCAB_DEVUNITS_PATH)
    VOCAB_FUNCTION_PATH = '../data/vocab_function.txt'
    save_numpy_string_array_to_text(categories_dict_calc['FUNCTION'],
                                    VOCAB_FUNCTION_PATH)
    VOCAB_SENSORTYPE_PATH = '../data/vocab_sensortype.txt'
    save_numpy_string_array_to_text(categories_dict_calc['SENSORTYPE'],
                                    VOCAB_SENSORTYPE_PATH)
    VOCAB_TYPE_PATH = '../data/vocab_type.txt'
    save_numpy_string_array_to_text(categories_dict_calc['TYPE'],
                                    VOCAB_TYPE_PATH)
    VOCAB_VIRTUAL_PATH = '../data/vocab_virtual.txt'
    save_numpy_string_array_to_text(categories_dict_calc['VIRTUAL'],
                                    VOCAB_VIRTUAL_PATH)

    return None
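
# save_numpy_string_array_to_text is a project helper not shown in this snippet;
# a minimal stand-in (an assumption, for illustration only) could write one
# category value per line:
import numpy as np

def save_numpy_string_array_to_text(string_array, file_path):
    """Write a 1-D array of strings to a text file, one entry per line."""
    np.savetxt(file_path, np.asarray(string_array, dtype=str), fmt='%s')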
Example No. 25
def main():

    # Hyperparameters
    hyperparams = {
        'by_size': False,
        'n_components': 8,
        'reduce': 'MDS',
        'clusterer': 'ward.D',
        'distance': 'euclidean',
        'index': 'all'}

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Clustering class
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()

    # Save hyperparameters to SQL
    # See if its already inserted
    sel = sqlalchemy.select([ClusteringHyperparameter]).where(
        sqlalchemy.sql.and_(ClusteringHyperparameter.by_size == hyperparams['by_size'],
                            ClusteringHyperparameter.clusterer == hyperparams['clusterer'],
                            ClusteringHyperparameter.distance == hyperparams['distance'],
                            ClusteringHyperparameter.reduce == hyperparams['reduce'],
                            ClusteringHyperparameter.n_components == hyperparams['n_components']))
    with Insert.engine.connect() as connection:
        res = connection.execute(sel).fetchall()

    if len(res):
        # Get hyperparameters id of existing hyperparameter set
        hyperparameter_id = res[0].id
    else:
        # Insert new object
        res = Insert.core_insert_instance(ClusteringHyperparameter, hyperparams)
        hyperparameter_id = res.inserted_primary_key[0]

    # Get customer list from SQL
    sel = sqlalchemy.select([Customers])
    customers = Insert.core_select_execute(sel)

    # Iterate through customers and cluster
    for customer in customers:

        # Get points from SQL
        sel = sqlalchemy.select([Points]).where(Points.customer_id.__eq__(customer.id))
        database = Insert.pandas_select_execute(sel)
        if database.shape[0] == 0:
            print('Customer ID {} Skipped, points shape {}'.format(customer.id, database.shape[0]))
            continue
        else:
            df_clean = clean_pipe.fit_transform(database)
            X = text_pipe.fit_transform(df_clean).toarray()
            #_word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary
            #df_text = pd.DataFrame(X, columns=_word_vocab)

        # NbClust clustering
        print('Customer ID {}\nDB Size : {}'.format(customer.id, X.shape))
        try:
            print('Starting NbClust')
            # Perform clustering with NbClust package
            result = UnsupervisedCluster.cluster_with_hyperparameters(hyperparams, X)
            best_nc_df = result.best_nc_dataframe
        except RRuntimeError as e:
            if 'computationally singular' in str(e):
                # The eigenvalue matrix is singular; retry with half the dimensions
                _hyperparams = dict(hyperparams)
                _hyperparams['n_components'] = int(_hyperparams['n_components'] / 2)
                result = UnsupervisedCluster.cluster_with_hyperparameters(_hyperparams, X)
                best_nc_df = result.best_nc_dataframe
            else:
                print(e)
                continue

        # Build dictionary for SQL
        sel = sqlalchemy.select([Customers]).where(Customers.id.__eq__(customer.id))
        with Insert.engine.connect() as connection:
            res = connection.execute(sel).fetchone()
            correct_k = res.correct_k
        values = best_nc_df.loc['Number_clusters'].to_dict()
        values['correct_k'] = correct_k
        values['customer_id'] = customer.id
        values['hyperparameter_id'] = hyperparameter_id
        n_lens = Clustering.get_n_len_features(X)
        for key, val in n_lens.items():
            values[key] = int(val)

        # Save results to SQL
        res = Insert.core_insert_instance(Clustering, values)
        print("Inserted {}".format(res.inserted_primary_key))

    return None
Example No. 26
from extract import extract
from extract.SQLAlchemyDataDefinition import (Customers, Points, Netdev,
                                              ClusteringHyperparameter, Clustering,
                                              Labeling)

# Local declarations
config = configparser.ConfigParser()
config.read(r'../extract/sql_config.ini')
server_name = config['sql_server']['DEFAULT_SQL_SERVER_NAME']
driver_name = config['sql_server']['DEFAULT_SQL_DRIVER_NAME']
database_name = config['sql_server']['DEFAULT_DATABASE_NAME']

Extract = extract.Extract()
Insert = extract.Insert(server_name,
                        driver_name,
                        database_name)

#%%
"""
Save databases from server to local machine
"""

def main_copy_to_local():

    search_directory = r"R:\JOBS"
    save_directory = r"D:\Z - Saved SQL Databases"

    Extract.search_and_save(search_directory, save_directory)

    return None
Example No. 27
    def __init__(self, server_name, driver_name, database_name):
        self.Insert = extract.Insert(server_name=server_name,
                                     driver_name=driver_name,
                                     database_name=database_name)
        return None
def test_cluster_with_hyperparameters2():

    # Instantiate local classes
    Transform = transform_pipeline.Transform()
    UnsupervisedCluster = unsupervised_cluster.UnsupervisedClusterPoints()
    # Create 'clean' data processing pipeline
    clean_pipe = Transform.cleaning_pipeline(remove_dupe=False,
                                             replace_numbers=False,
                                             remove_virtual=True)

    # Create pipeline specifically for clustering text features
    text_pipe = Transform.text_pipeline(vocab_size='all',
                                        attributes='NAME',
                                        seperator='.',
                                        heirarchial_weight_word_pattern=True)

    # Set up connection to SQL
    Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                            driver_name='SQL Server Native Client 10.0',
                            database_name='Clustering')

    # Get a points dataframe
    customer_id = 13
    sel = sqlalchemy.select([Points
                             ]).where(Points.customer_id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        # res = connection.execute(sel).fetchone()
        database = pd.read_sql(sel, connection)

    df_clean = clean_pipe.fit_transform(database)
    X = text_pipe.fit_transform(df_clean).toarray()
    #_word_vocab = text_pipe.named_steps['WordDictToSparseTransformer'].vocabulary

    hyperparameters = {
        'by_size': False,
        'distance': 'euclidean',
        'clusterer': 'ward.D',
        'n_components': 8,
        'reduce': 'MDS',
        'index': 'Ratkowsky'
    }
    # Clean hyperparameters
    hyperparams = UnsupervisedCluster._parse_hyperparameter_dictionary(
        hyperparameters)

    # Perform dimensionality reduction on data
    X_dim_reduced = UnsupervisedCluster._dimensionality_reduction(
        X,
        method=hyperparams['reduce'],
        n_components=hyperparams['n_components'])

    # Conditionally call nbclust package or optimalk package
    # based on input clustering hyperparameters
    if hyperparams['index'] in UnsupervisedCluster.nbclust_indicies:
        # Cluster with nbclust and clustering algorithm
        min_nc = 3  # Static
        max_nc = UnsupervisedCluster._get_max_nc(X)  # Based on actual data

        best_nc_df = UnsupervisedCluster._nbclust_calc(
            X_dim_reduced,
            index=hyperparams['index'],
            clusterer=hyperparams['clusterer'],
            distance=hyperparams['distance'],
            min_nc=min_nc,
            max_nc=max_nc)
    # Get number of clusters
    sel = sqlalchemy.select([Customers])\
        .where(Customers.id.__eq__(customer_id))
    with Insert.engine.begin() as connection:
        res = connection.execute(sel).fetchone()
        correct_k = res.correct_k
    print(correct_k)

    pass
Example No. 29
    # Remove the drive letter on windows
    _CWD = os.path.splitdrive(os.getcwd())[1]
    _PARTS = _CWD.split(os.sep)
    # Project dir is one level above cwd
    _PROJECT_DIR = os.path.join(os.sep, *_PARTS[:-1])
    if _PROJECT_DIR not in sys.path:
        sys.path.insert(0, _PROJECT_DIR)

from extract import extract
from extract.SQLAlchemyDataDefinition import (Clustering, Points, Netdev,
                                              Customers,
                                              ClusteringHyperparameter,
                                              Labeling)

Insert = extract.Insert(server_name='.\\DT_SQLEXPR2008',
                        driver_name='SQL Server Native Client 10.0',
                        database_name='Clustering')

#%%


class Record():
    """Keep track of individual dataframes and their related information
    parameters
    -------
    dataframe : a dataframe containing error metric information.
    See import_error_dfs()
    parent_file : original csv file
    hyper_dict : a dictionary of the predicted set's hyperparameters. For example
    {'hyper1': value1, [...]}"""
    def __init__(self, indicies_dictionary, hyperparameter_dictionary):
Example No. 30
def get_hyperparameters_serving():
    """The ranking model imputs a tensor of context features and per-item features
    The per-item features are clusterering hyperparameters turned to indicator
    columns.
    In order to predict on a new database, you must input the per-item
    clustering hyperparameters into the model.
    In training, I have been doing this with actual recorded hyperparameters
    For prediction I must generate the clustering hyperparameters. These must
    be known before this module will generate an array of clustering 
    hyperparameters like: 
    [['False', 'kmeans', '8', 'TSNE', 'optk_TSNE_gap*_max'],
     ['True', 'ward.D', '8', 'MDS', 'SDbw'],
     [...]]
    This can be fed to tf.feature_columns or TFRecords in order to generate
    inputs to a ranking model for prediction
    """

    # Instantiate a class for reading SQL data
    Insert = extract.Insert(server_name,
                            driver_name,
                            database_name)

    """Get most frequent hyperparameter occurences for each customer
    Customer IDs are used to retrieve clustering results for each customer"""
    sel = sqlalchemy.select([Customers.id])
    customer_ids = Insert.core_select_execute(sel)

    # Keep track of the best clustering hyperparameters for all datasets
    all_labels = []

    for _id in customer_ids:
        customer_id = _id.id

        """Get primary key of clusterings related to customer
        Each primary key is used to create Record objects with get_records"""
        sel = sqlalchemy.select([Clustering.id, Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))
        res = Insert.core_select_execute(sel)
        primary_keys = [x.id for x in res]

        # Create records for feeding while calculating the best labels
        records = get_records(primary_keys)
        if len(records) <= 1:
            # Not enough examples to append
            continue
        sel = sqlalchemy.select([Clustering.correct_k])\
            .where(Clustering.customer_id.__eq__(customer_id))\
            .limit(1)
        res = Insert.core_select_execute(sel)
        correct_k = res[0].correct_k
        """best_labels is a list of namedtuple objects
        each tuple has a name hyperparameter_dict which contains hyperparameters
        used to cluster that customers database
        A unique list of clustering hyperparameters will be used for model serving"""
        best_labels = ClusteringLabels.calc_labels(records, correct_k, error_scale=0.8, var_scale=0.2)

        """Keep the 10 best best_lables for each customer_id
        The idea is we should predict between some of the best available
        hyperparameters for ranking model"""
        if best_labels.__len__() > 10:
            for i in range(0,10):
                all_labels.append(best_labels[i])
        else:
            n = int(len(best_labels) * 0.5)
            for i in range(n):
                all_labels.append(best_labels[i])

    """Each hyperparameter_dict in all_labels is not unique
    To create a unique set of dictionary values use the frozenset object
    The frozenset is hashable (unlike normal set) which means it can be used
    in Counter objects"""
    hyperparams = []
    for x in all_labels:
        y = x.hyperparameter_dict # Dictionary
        hyperparams_set = frozenset(y.values())
        hyperparams.append(hyperparams_set)

    # Counter counts occurrences of each unique hyperparameter frozenset
    c = Counter(hyperparams)
    c.most_common()

    """Convert to dictionary and save in list
    Convert hyperparameter frozenset back to a nomral dictionary"""
    hyperparameters_serving = []
    for x in c.keys():
        hyperparameter_dict = ClusteringLabels._hyperparameter_set_2_dict(x)
        hyperparameters_serving.append(hyperparameter_dict)

    return hyperparameters_serving
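
# Hedged usage sketch: the function above returns a list of plain dictionaries,
# one per unique hyperparameter set, which can then be serialized as per-item
# features for the ranking model (the docstring's values are illustrative).
hyperparameters_serving = get_hyperparameters_serving()
for hyperparameter_dict in hyperparameters_serving[:5]:
    print(hyperparameter_dict)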