Example #1
import os

import numpy as np

from util.base_util import normalize_genre_string
# Training, chi_squared_feature_select, GraphCutParams, and alpha_beta_swap
# come from project modules that are not shown in this snippet.


pickle_dir="C:\\Users\\Kevin\\Desktop\\GitHub\\Research\\Webscraper\\pickle_dir"

if __name__=="__main__":
    mapping={"short_genres":"short_genre","index":"ref_index","bow":"attr_map"}

    #s=SourceMapper(URLBow.objects(),mapping)
    X_pickle_path=os.path.join(pickle_dir,"X_summary_pickle")
    y_pickle_path=os.path.join(pickle_dir,"y_summary_pickle")
    ref_index_pickle_path=os.path.join(pickle_dir,"refIndex_summary_pickle")

    mapping={"short_genres":"short_genre","index":"ref_index","bow":"attr_map"}


    label="summary_unsupervised_chi_top1cls_10000"

    #generate_random_sample(unpickle_obj(X_pickle_path),unpickle_obj(y_pickle_path),unpickle_obj(ref_index_pickle_path),1000)

    # load the training set, normalize each genre list to its top (LV1) level,
    # and keep the 10,000 best features by chi-squared score
    train_set=Training(label,pickle_dir=pickle_dir)
    train_set.load_training()
    train_set.y=np.array([list(set(normalize_genre_string(genre,1) for genre in g_list)) for g_list in train_set.y])
    train_set.X=chi_squared_feature_select(train_set.X,train_set.y,k_best=10000)

    # build graph-cut parameters and run alpha-beta swap clustering
    # (note: this reaches into Training's private _ref_index and _X attributes)
    params=GraphCutParams(X=train_set.X,y=train_set.y,ref_id=train_set._ref_index,
                   k_closest_neighbors=4,vocab_size=train_set._X.shape[1],num_clusters=3)

    alpha_beta_swap(params)
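
The helper chi_squared_feature_select is project-specific and not shown here. A minimal sketch of the operation it presumably performs, assuming scikit-learn and a multi-label y (every name below is hypothetical):

import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_selection import chi2

def chi_squared_feature_select_sketch(X, y, k_best=10000):
    """Keep the k_best columns of X with the highest chi-squared score (assumption)."""
    Y = MultiLabelBinarizer().fit_transform(y)   # lists of genres -> indicator matrix
    scores = np.zeros(X.shape[1])
    for col in range(Y.shape[1]):
        s, _ = chi2(X, Y[:, col])                # score every feature against one label
        scores = np.maximum(scores, np.nan_to_num(s))
    keep = np.argsort(scores)[-k_best:]          # indexes of the top-scoring features
    return X[:, keep]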
Example #2
def load_training_testing(Xs,ys,ref_indexes,settings,train_set_index,test_set_index):
    """
    Load training and testing set based on indexes provided by crossvalidation

    :return: List of train_set and test_set objs
    """
    train_sets=[]
    test_sets=[]

    for c,setting in enumerate(settings):
        train_set=Training(setting,pickle_dir=setting.pickle_dir)
        train_set.X=Xs[c][train_set_index]
        train_set.y=ys[c][train_set_index]
        train_set.ref_index=ref_indexes[c][train_set_index]

        test_set=Testing(setting,pickle_dir=setting.pickle_dir)
        test_set.X=Xs[c][test_set_index]
        test_set.y=ys[c][test_set_index]
        test_set.ref_index=ref_indexes[c][test_set_index]

        train_sets.append(train_set)
        test_sets.append(test_set)

    #flatten each training set (a sketch of flatten_train_set follows this function)
    for train_set in train_sets:
        flatten_train_set(train_set)

    #make sure the sets match
    classification_logger.info("Checking the sets match")
    ys=[train_set.y for train_set in train_sets]
    ref_indexes=[train_set.ref_index for train_set in train_sets]

    test_ys=np.array([test_set.y for test_set in test_sets])
    test_ref_indexes=[test_set.ref_index for test_set in test_sets]

    for c,elem in enumerate((ys,ref_indexes,test_ys,test_ref_indexes)):
        # every setting should yield identical labels and reference indexes
        first=elem[0]
        match=True
        for e in elem[1:]:
            match=match and (e==first).all()
        if not match:
            raise ValueError("Sets do not match for element {}".format(c))

    return train_sets,test_sets
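
flatten_train_set is another project helper. A plausible sketch, assuming it expands each multi-label row into one (sample, single label) pair so downstream classifiers see a flat label vector (the attribute names mirror the code above; the behavior itself is an assumption):

import numpy as np

def flatten_train_set_sketch(train_set):
    rows, labels, refs = [], [], []
    for i, label_list in enumerate(train_set.y):
        for label in label_list:                 # one output row per label
            rows.append(i)
            labels.append(label)
            refs.append(train_set.ref_index[i])
    train_set.X = train_set.X[rows]              # duplicate sample rows as needed
    train_set.y = np.array(labels)
    train_set.ref_index = np.array(refs)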


# def select_training_testing_sets(settings,Xs,y,ref_index,num,do_pickle=True):
#     """
#     Randomly draw num samples from the data set for training; the remainder
#         becomes the test set. Uses _pick_random_samples.
#
#     :param settings:
#     :param Xs:
#     :param y:
#     :param ref_index:
#     :param num:
#     :return: train_objs,test_objs
#     """
#
#     selector=np.not_equal(ref_index,None)
#     ref_index=ref_index[selector]
#     Xs=[X[selector] for X in Xs]
#     y=y[selector]
#
#     train_Xs,train_y,train_ref_index,test_Xs,test_y,test_ref_index=_pick_random_samples(Xs,y,ref_index,num)
#
#     train_objs=[]
#     test_objs=[]
#     if do_pickle:
#         for c,setting in enumerate(settings):
#             train_X=train_Xs[c]
#             test_X=test_Xs[c]
#
#             _pickle_training_testing(setting,train_X,train_y,train_ref_index,test_X,test_y,test_ref_index)
#
#             training_obj=Training(label=setting,pickle_dir=setting.pickle_dir)
#             training_obj.set_data(train_X,train_y,train_ref_index)
#
#             testing_obj=Testing(label=setting,pickle_dir=setting.pickle_dir)
#             testing_obj.set_data(test_X,test_y,test_ref_index)
#
#             train_objs.append(training_obj)
#             test_objs.append(testing_obj)
#
#     return train_objs,test_objs
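
A usage sketch for load_training_testing, assuming the index arrays come from scikit-learn's KFold and that Xs, ys, ref_indexes, and settings are the parallel collections the function expects:

from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in kf.split(ys[0]):
    train_sets, test_sets = load_training_testing(
        Xs, ys, ref_indexes, settings, train_idx, test_idx)
    # ...fit on train_sets, evaluate on test_sets...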
Example #3
    # (fragment of a __main__ block: LearningSettings, KMeans, Pipeline, SelectKBest,
    # chi2, np, Training, Clustering, clustering_logger, and PICKLE_DIR are assumed
    # to be imported/defined by the surrounding module)
    settings=LearningSettings(type="unsupervised",dim_reduction="chi",feature_selection="summary",num_attributes=10000)
    settings.parent_clusters=[] #used to record a tree of parent clusters for the current cluster

    #settings.clustering_alg="kNN_agglomerative"
    settings.clustering_alg="lda"
    clustering_alg=KMeans #note: this local variable differs from settings.clustering_alg above
    settings.num_clusters=[16]
    settings.num_top_words=20 #LDA only
    settings.max_cluster_size=10000 #clusters larger than this are broken up further
    settings.break_up_clusters=True
    settings.spectre_clustering_limit=15000 #use spectral clustering for clusters under 15K in size

    #LOAD DATA
    #generate_random_sample(unpickle_obj(X_pickle_path),unpickle_obj(y_pickle_path),unpickle_obj(ref_index_pickle_path),50000)

    train_set=Training(settings,pickle_dir=PICKLE_DIR)
    train_set.load_training()

    #FEATURE SELECTION
    best_k_attr=10000
    feature_selector=Pipeline([("chi2",SelectKBest(chi2,k=best_k_attr))])

    clustering_logger.info("Choosing best {} features".format(best_k_attr))

    clustering_logger.debug("Normalizing to LV1")
    #normalize each genre list to its top (LV1) level
    train_set.y=np.array([[normalize_genre_string(g,1) for g in row] for row in train_set.y])

    clusterer=Clustering()
    clusterer.feature_selection(train_set,feature_selector,fit=True)
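
Clustering.feature_selection is project code; the usual pattern behind such a method is to fit the selector on the training data and transform X in place. A hedged sketch (the single-label reduction below is an assumption, since chi2 needs a flat label vector):

def feature_selection_sketch(data_set, selector, fit=True):
    # take the first LV1 genre of each row as a stand-in single label
    flat_y = [row[0] for row in data_set.y]
    if fit:
        data_set.X = selector.fit_transform(data_set.X, flat_y)  # fit on training data
    else:
        data_set.X = selector.transform(data_set.X)              # reuse the fitted selector
    return data_set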