Example #1
File: dcl.py  Project: shuber2/pyfsa
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # Setup vanilla CLI parsing and add custom arg(s).
    parser = utils.setup_cli_parsing()
    parser.add_option("",
                      "--codewords",
                      help="number of codewords.",
                      default=50,
                      type="int")
    (options, args) = parser.parse_args()

    # Setup logging
    utils.setup_logging(options)
    logger = logging.getLogger()

    # Read graph file list and label file list
    graph_file_list = utils.read_graph_file_list(options)
    if options.globalLabelFile is not None:
        label_file_list = [options.globalLabelFile] * len(graph_file_list)
    else:
        label_file_list = utils.read_label_file_list(options,
                                                     graph_file_list)

    # Read class info and grouping info
    class_info = utils.read_class_info(options)
    group_info = utils.read_group_info(options)

    assert (group_info.shape[0] ==
            len(class_info) ==
            len(graph_file_list) ==
            len(label_file_list))

    # Zip lists together
    data = zip(graph_file_list,
               label_file_list,
               class_info)

    # Run fine-structure analysis
    fsa_res = fsa.run_fsa(data,
                          options.radii,
                          options.recompute,
                          options.writeAs,
                          options.skip,
                          options.omitDegenerate)
    data_mat = fsa_res['data_mat']
    data_idx = fsa_res['data_idx']

    # Create cross-validation folds
    # NOTE: random_state=0 makes the splits reproducible across runs
    n_graphs = len(class_info)
    cv = ShuffleSplit(n_graphs,
                      n_iter=options.cvRuns,
                      test_size=0.2,
                      random_state=0)

    # Try inplace feature normalization
    if options.normalize:
        logger.info("Running feature normalization ...")
        scaler = preprocessing.StandardScaler(copy=False)
        scaler.fit_transform(fsa_res['data_mat'])

    scores = []
    todisk = []
    for cv_id, (trn, tst) in enumerate(cv):

        # Compose training data
        pos = []
        for i in trn:
            tmp = np.where(data_idx==i)[0]
            pos.extend(list(tmp))
        np_pos = np.array(pos)

        # Learn a codebook from training data
        codebook = fsa.learn_codebook(data_mat[np_pos,:],
                                      options.codewords,
                                      options.seed)

        # Compute BoW histograms for training data
        bow_trn_mat = np.zeros((len(trn), options.codewords))
        for cnt, i in enumerate(trn):
            np_pos = np.where(data_idx==i)[0]
            bow_trn_mat[cnt,:] = np.asarray(fsa.bow(data_mat[np_pos,:],
                                                    codebook))

        # Cross-validate (5-fold) SVM classifier and parameters
        param_selection = [{'kernel': ['rbf'],
                            'gamma': np.logspace(-6,2,10),
                            'C': [1, 10, 100, 1000]},
                           {'kernel': ['linear'],
                            'C': [1, 10, 100, 1000]}]
        clf = GridSearchCV(svm.SVC(C=1), param_selection, cv=5)
        clf.fit(bow_trn_mat, np.asarray(class_info)[trn])

        # Compute BoW histograms for testing data
        bow_tst_mat = np.zeros((len(tst), options.codewords))
        for cnt, i in enumerate(tst):
            pos = np.where(data_idx==i)[0]
            bow_tst_mat[cnt,:] = fsa.bow(data_mat[pos,:], codebook)

        yhat = clf.predict(bow_tst_mat)
        gold = np.asarray(class_info)[tst]

        print "yhat : ", yhat
        print "gold : ", gold

        tmp = {"yhat" : list(yhat), 
               "gold" : list(gold)}
        todisk.append(tmp)

        # Score the classifier
        score = clf.score(bow_tst_mat, np.asarray(class_info)[tst])
        scores.append(score)
        logger.info("Score (%.2d): %.2f" % (cv_id,100*score))

    json_file = "%s.json" % options.writeAs
    with open(json_file, 'w') as outfile:
        json.dump(todisk, outfile)

    utils.show_summary(scores)
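
The heavy lifting in this example happens inside fsa.learn_codebook and fsa.bow. As a rough, generic sketch of that codebook / bag-of-words step (using scikit-learn's KMeans as a stand-in, not pyfsa's actual implementation):

# Generic codebook / bag-of-words sketch with scikit-learn's KMeans.
# Stand-in for the idea behind fsa.learn_codebook / fsa.bow, not pyfsa code.
import numpy as np
from sklearn.cluster import KMeans

def learn_codebook(features, n_codewords, seed=None):
    # Cluster feature rows into n_codewords centers (the codebook).
    km = KMeans(n_clusters=n_codewords, random_state=seed)
    km.fit(features)
    return km

def bow(features, codebook):
    # Histogram of codeword assignments, normalized to sum to 1.
    assignments = codebook.predict(features)
    hist = np.bincount(assignments, minlength=codebook.n_clusters)
    return hist / float(hist.sum())

Each graph's feature rows in data_mat are thereby reduced to one fixed-length histogram per graph, which is what the SVM above is trained and scored on.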
Example #2
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # Setup vanilla CLI parsing and add custom arg(s).
    parser = utils.setup_cli_parsing()
    parser.add_option("",
                      "--mixComp",
                      help="number of GMM components.",
                      default=3,
                      type="int")
    (options, args) = parser.parse_args()

    # Setup logging
    utils.setup_logging(options)
    logger = logging.getLogger()

    # Read graph file list and label file list
    graph_file_list = utils.read_graph_file_list(options)
    label_file_list = utils.read_label_file_list(options, graph_file_list)

    # Read class info and grouping info
    class_info = utils.read_class_info(options)
    group_info = utils.read_group_info(options)

    assert (group_info.shape[0] == len(class_info) == len(graph_file_list) ==
            len(label_file_list))

    # Zip lists together
    data = zip(graph_file_list, label_file_list, class_info)

    # Run fine-structure analysis
    fsa_res = fsa.run_fsa(data, options.radii, options.recompute,
                          options.writeAs, options.skip,
                          options.omitDegenerate)
    data_mat = fsa_res['data_mat']
    data_idx = fsa_res['data_idx']

    # Create cross-validation folds (20% testing)
    n_graphs = len(class_info)
    cv = ShuffleSplit(n_graphs,
                      n_iter=options.cvRuns,
                      test_size=0.2,
                      random_state=0)

    # Our unique class labels
    label_set = np.unique(class_info)

    if options.normalize:
        logger.info("Running feature normalization ...")
        scaler = preprocessing.StandardScaler(copy=False)
        scaler.fit_transform(fsa_res['data_mat'])

    scores = []
    for cv_id, (trn, tst) in enumerate(cv):

        models = []
        for l in label_set:
            l_idx = np.where(class_info == l)[0]
            l_idx = np.asarray(l_idx).ravel()
            l_trn = np.intersect1d(l_idx, trn)

            pos = []
            for i in l_trn:
                tmp = np.where(fsa_res['data_idx'] == i)[0]
                pos.extend(list(tmp))

            np_pos = np.asarray(pos)
            gmm_model = fsa.estimate_gm(data_mat[np_pos, :], options.mixComp)
            models.append(gmm_model)

        predict = []
        for i in tst:
            pos = np.where(data_idx == i)[0]
            map_idx = fsa.pp_gmm(data_mat[pos, :], models, argmax=True)
            predict.append(label_set[map_idx])

        # Score the MAP classifier
        truth = [class_info[i] for i in tst]
        score = accuracy_score(truth, predict)

        print "yhat :", predict
        print "gold :", truth

        logger.info("Score (%.2d): %.2f" % (cv_id, 100 * score))
        scores.append(score)

    utils.show_summary(scores)
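
The classifier here is generative: one GMM is estimated per class label on that class's training rows, and each test graph receives the label of the model with the highest likelihood (the argmax in fsa.pp_gmm). A minimal sketch of this per-class GMM / MAP scheme, using scikit-learn's GaussianMixture as a stand-in for fsa.estimate_gm and fsa.pp_gmm:

# Generic per-class GMM / MAP classification sketch (scikit-learn),
# illustrating the idea behind fsa.estimate_gm and fsa.pp_gmm, not pyfsa code.
import numpy as np
from sklearn.mixture import GaussianMixture

def fit_class_models(features_per_class, n_components=3):
    # Fit one GMM per class; features_per_class is a list of 2-D arrays.
    models = []
    for feats in features_per_class:
        gmm = GaussianMixture(n_components=n_components)
        gmm.fit(feats)
        models.append(gmm)
    return models

def map_predict(features, models):
    # Index of the model with the highest total log-likelihood over all rows.
    scores = [m.score_samples(features).sum() for m in models]
    return int(np.argmax(scores))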
Example #3
def main(argv=None):
    if argv is None:
        argv = sys.argv

    # Setup vanilla CLI parsing and add custom arg(s).
    parser = utils.setup_cli_parsing()
    parser.add_option("",
                      "--codewords",
                      help="number of codewords.",
                      default=50,
                      type="int")
    (options, args) = parser.parse_args()

    # Setup logging
    utils.setup_logging(options)
    logger = logging.getLogger()

    # Read graph file list and label file list
    graph_file_list = utils.read_graph_file_list(options)
    if options.globalLabelFile is not None:
        label_file_list = [options.globalLabelFile] * len(graph_file_list)
    else:
        label_file_list = utils.read_label_file_list(options,
                                                     graph_file_list)

    # Read class info and grouping info
    class_info = utils.read_class_info(options)
    group_info = utils.read_group_info(options)

    assert (group_info.shape[0] ==
            len(class_info) ==
            len(graph_file_list) ==
            len(label_file_list))

    # Zip lists together
    data = zip(graph_file_list,
               label_file_list,
               class_info)

    # Run fine-structure analysis
    fsa_res = fsa.run_fsa(data,
                          options.radii,
                          options.recompute,
                          options.writeAs,
                          options.skip,
                          options.omitDegenerate)
    data_mat = fsa_res['data_mat']
    data_idx = fsa_res['data_idx']

    # Create cross-validation folds
    n_graphs = len(class_info)
    cv = ShuffleSplit(n_graphs,
                      n_iter=options.cvRuns,
                      test_size=0.2,
                      random_state=0)

    # Try inplace feature normalization
    if options.normalize:
        logger.info("Running feature normalization ...")
        scaler = preprocessing.StandardScaler(copy=False)
        scaler.fit_transform(fsa_res['data_mat'])

    scores = []
    for cv_id, (trn, tst) in enumerate(cv):

        # Compose training data
        pos = []
        for i in trn:
            tmp = np.where(data_idx==i)[0]
            pos.extend(list(tmp))
        np_pos = np.array(pos)

        # Learn a codebook from training data
        codebook = fsa.learn_codebook(data_mat[np_pos,:],
                                      options.codewords,
                                      options.seed)

        # Compute BoW histograms for training data
        bow_trn_mat = np.zeros((len(trn), options.codewords))
        for cnt, i in enumerate(trn):
            np_pos = np.where(data_idx==i)[0]
            bow_trn_mat[cnt,:] = np.asarray(fsa.bow(data_mat[np_pos,:],
                                                    codebook))

        # Cross-validate (5-fold) SVM classifier and parameters
        param_selection = [{'kernel': ['rbf'],
                            'gamma': np.logspace(-6,2,10),
                            'C': [1, 10, 100, 1000]},
                           {'kernel': ['linear'],
                            'C': [1, 10, 100, 1000]}]
        clf = GridSearchCV(svm.SVC(C=1), param_selection, cv=5)
        clf.fit(bow_trn_mat, np.asarray(class_info)[trn])

        # Compute BoW histograms for testing data
        bow_tst_mat = np.zeros((len(tst), options.codewords))
        for cnt, i in enumerate(tst):
            pos = np.where(data_idx==i)[0]
            bow_tst_mat[cnt,:] = fsa.bow(data_mat[pos,:], codebook)

        print "yhat : ", clf.predict(bow_tst_mat)
        print "gold : ", np.asarray(class_info)[tst]

        # Score the classifier
        score = clf.score(bow_tst_mat, np.asarray(class_info)[tst])
        scores.append(score)

        logger.info("Score (%.2d): %.2f" % (cv_id,100*score))

    utils.show_summary(scores)
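
The examples above use the pre-0.18 scikit-learn API, where ShuffleSplit is constructed with the sample count plus n_iter and iterated directly. In current scikit-learn both ShuffleSplit and GridSearchCV live in sklearn.model_selection, and the equivalent cross-validation setup would look roughly like this (placeholder data, not pyfsa's):

# Rough modern-scikit-learn equivalent of the cross-validation setup above.
import numpy as np
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn import svm

n_graphs = 30                              # placeholder sample count
X = np.random.rand(n_graphs, 50)           # placeholder BoW histograms
y = np.random.randint(0, 2, n_graphs)      # placeholder class labels

cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
for cv_id, (trn, tst) in enumerate(cv.split(X)):
    clf = GridSearchCV(svm.SVC(C=1),
                       [{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}],
                       cv=5)
    clf.fit(X[trn], y[trn])
    print("Score (%.2d): %.2f" % (cv_id, 100 * clf.score(X[tst], y[tst])))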