## Re-training ##
    #################
    print("For training and testing...")
    path = "%s/All" % config.feat_folder
    for feat_name, column_name in zip(feat_names, column_names):
        print("generate %s feat" % feat_name)
        tfv = getTFV(ngram_range=ngram_range)
        X_tfidf_train = tfv.fit_transform(dfTrain[column_name])
        X_tfidf_test = tfv.transform(dfTest[column_name])
        with open("%s/train.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_tfidf_train, f, -1)
        with open("%s/test.%s.feat.pkl" % (path, feat_name), "wb") as f:
            cPickle.dump(X_tfidf_test, f, -1)

        ## svd
        svd = TruncatedSVD(n_components=svd_n_components, n_iter=15)
        X_svd_train = svd.fit_transform(X_tfidf_train)
        X_svd_test = svd.transform(X_tfidf_test)
        with open("%s/train.%s_individual_svd%d.feat.pkl" % (path, feat_name, svd_n_components), "wb") as f:
            cPickle.dump(X_svd_train, f, -1)
        with open("%s/test.%s_individual_svd%d.feat.pkl" % (path, feat_name, svd_n_components), "wb") as f:
            cPickle.dump(X_svd_test, f, -1)

    print("Done.")

    ## save feat names
    print("Feature names are stored in %s" % feat_name_file)
    feat_names += [ "%s_individual_svd%d"%(f, svd_n_components) for f in feat_names ]
    dump_feat_name(feat_names, feat_name_file)

    print("All Done.")
                    with open("%s/valid.%s.feat.pkl" % (path, feat_name),
                              "wb") as f:
                        cPickle.dump(X_valid, f, -1)
                    print(X_train.shape)
                    print(X_valid.shape)
                ## extract statistical distance features
                if stats_feat_flag:
                    dfTrain2 = dfTrain.iloc[trainInd].copy()
                    dfValid = dfTrain.iloc[validInd].copy()
                    extract_statistical_distance_feat(path, dfTrain2, dfValid,
                                                      "valid", feat_names)

        ## save feat names
        print("Feature names are stored in %s" % feat_name_file)
        ## dump feat name
        dump_feat_name(feat_names, feat_name_file)
        print("----time elapsed----",
              str(timedelta(seconds=time.time() - start_time)))
        print("Done.")

    #===========================================
    ### For testing...
    print(sys.argv[1])
    if sys.argv[1] == "Testing":
        if sys.argv[2] == "All":
            Ntest = range(config.test_subset_number)
        else:
            exec("Ntest =" + sys.argv[2])
        path = "%s/All" % config.feat_folder
        ## use full version for X_train
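
## ------------------------------------------------------------------
## Note: the exec() call above evaluates an arbitrary expression taken from
## the command line. A more defensive alternative (a sketch, not what the
## original script does) parses the argument as a plain Python literal such
## as "[0, 1, 2]" via ast.literal_eval:
import ast

def parse_test_subsets(arg, n_subsets):
    ## "All" -> every subset index; otherwise expect a literal list of indices
    if arg == "All":
        return range(n_subsets)
    return ast.literal_eval(arg)
## ------------------------------------------------------------------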
                    #    for fold, (trainInd, validInd) in enumerate(skf[run]):
                    print("Run: %d, Fold: %d" % (run + 1, fold + 1))
                    path = "%s/Run%d/Fold%d" % (config.feat_folder, run + 1,
                                                fold + 1)

                    dfTrain2 = dfTrain.iloc[trainInd].copy()
                    dfValid = dfTrain.iloc[validInd].copy()
                    print(dfTrain2.shape, dfValid.shape)
                    ## extract feat
                    new_feat_names = extract_feat(path, dfTrain2, dfValid,
                                                  "valid", feat_names,
                                                  column_names)
                    print("----time elapsed----",
                          str(timedelta(seconds=time.time() - start_time)))
            ## dump feat name
            dump_feat_name(new_feat_names, feat_name_file)
            print("Done.")

    #===========================================
    ### For testing...
    if sys.argv[1] == "Testing":
        if sys.argv[2] == "All":
            Ntest = range(config.test_subset_number)
        else:
            exec("Ntest =" + sys.argv[2])

        for vec_type in vec_types:
            print "Processing vector %s" % vec_type
            feat_names = ["question1", "question2"]
            feat_names = [
                name + "_%s_%s_vocabulary" % (vec_type, vocabulary_type)