Example #1
def pso_train(doc_dir: str, ref_dir: str, config):
    docs = sorted(os.listdir(doc_dir))
    refs = sorted(os.listdir(ref_dir))

    documents: List[List[List[str]]] = []
    references: List[str] = []
    features = []
    for d, r in zip(docs, refs):
        doc, ref = Utils.load_document(os.path.join(doc_dir, d),
                                       os.path.join(ref_dir, r))
        p_doc: List[List[str]] = Utils.process_document(
            doc, config.use_stopwords, config.use_lemmatizer)
        p_doc_wo_title = Utils.process_document(Utils.remove_headings(doc),
                                                config.use_stopwords,
                                                config.use_lemmatizer)
        p_ref: str = Utils.join_sentences(
            Utils.process_document(ref, config.use_stopwords,
                                   config.use_lemmatizer))
        features.append(PSO.extract_features(p_doc, config))
        documents.append(p_doc_wo_title)
        references.append(p_ref)

    # Initialize a Binary PSO model.
    # python main.py  -mode train  -w_max 0.9  -w_min 0.4  -v_max 4  -v_min -4  -c1 1  -c2 1  -num_particles 2  -num_iterations 20
    # -num_features 3  -summary_size 75  -similarity_score 0.12  -n_grams 1  -freq_thresh 0.4  -max_sent_thresh 0.8  -min_sent_thresh 0.2  -use_stopwords True  -use_lemmatizer False  -file None  -index 25
    # TODO: check why w is hard-coded to 0.9 below (matching -w_max above)
    # instead of coming from config
    model = PSO.Swarm(documents,
                      references,
                      n_features=config.num_features,
                      n_particles=config.num_particles,
                      n_iterations=config.num_iterations,
                      w=0.9,
                      c1=config.c1,
                      c2=config.c2,
                      sum_size=config.summary_size,
                      config=config)

    # Train the model with extracted features.
    weights = model.train(features)

    # Generate summary with weights.
    rouge_scores = [0.0] * len(documents)
    for i, feature in enumerate(features):
        p_sum_idx = np.argsort(np.dot(feature, weights))[-config.summary_size:]
        p_sum = Utils.join_sentences([documents[i][idx] for idx in p_sum_idx])
        rouge_scores[i] = Utils.calculate_rouge(p_sum, [references[i]], 1)

    print(rouge_scores)
    print(weights)
    return weights
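
The selection step above scores each sentence as the dot product of its feature vector with the learned weights, then keeps the config.summary_size highest-scoring sentences. One subtlety: np.argsort returns indices in ascending score order, so the chosen sentences come back ordered by score, not by position in the document. A minimal sketch of the same step (select_top_sentences is a hypothetical helper, not part of the project's PSO or Utils modules) that also restores document order:

import numpy as np

def select_top_sentences(feature_matrix, weights, k):
    # one score per sentence: weighted sum of its features
    scores = np.dot(feature_matrix, weights)
    # indices of the k highest-scoring sentences...
    top_k = np.argsort(scores)[-k:]
    # ...restored to document order so the summary reads naturally
    return np.sort(top_k)

# e.g. 5 sentences, 3 features, keep the best 2
idx = select_top_sentences(np.random.rand(5, 3), np.array([0.5, 0.3, 0.2]), 2)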
Example #2
fn = "history_ncert_class10/chap_3.txt" # sys.argv[1]
ref_dir_n = "history_ncert_class10/annotations/chapter3" # sys.argv[2]

# load file
document, ref_sum = Utils.load_documents(fn, ref_dir_n)

# Pre-process with Stemmer and/or Lemmatizer.
processed_doc = Utils.process_document(document)
processed_ref_sum = Utils.process_documents(ref_sum)

# Extract features
features = PSO.extract_features(processed_doc)

# Initialize a Binary PSO model.
model = PSO.Swarm(processed_doc, processed_ref_sum)

# Train the model with extracted features.
weights = model.train(features)

# Generate summary with weights.
p_sum_idx = np.argsort(np.dot(features, weights))[-PSO.SUMMARY_SIZE:]
p_sum = Utils.generate_summary([document[idx] for idx in p_sum_idx])
p_sum1 = Utils.join_sentences([processed_doc[idx] for idx in p_sum_idx])
ref_sum = Utils.join_docs(processed_ref_sum)
print("Final Rouge Score: ", Utils.calculate_rouge(p_sum1, ref_sum, 1))

with open("predicted_summary.txt", 'w', encoding='utf-8') as f:
    f.write(p_sum)
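
Utils.calculate_rouge is defined elsewhere in the project; a minimal ROUGE-1 recall sketch, assuming both arguments are plain whitespace-tokenizable strings (the real helper may compute precision and F1 as well, or delegate to a ROUGE library):

from collections import Counter

def rouge_1_recall(candidate: str, reference: str) -> float:
    # clipped unigram overlap divided by the reference length
    cand = Counter(candidate.split())
    ref = Counter(reference.split())
    overlap = sum(min(cand[w], ref[w]) for w in ref)
    return overlap / max(sum(ref.values()), 1)

print(rouge_1_recall("the cat sat", "the cat sat on the mat"))  # 0.5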
Example #3
def main(dataset, run, alg):
    '''
    :param dataset: dataset name; features and labels are loaded from
        <dataset>.mat
    :param run: run index, used to seed the random number generator
    :param alg: algorithm to run, either 'PSO' or 'PSOL'
    :return: None; results are printed to stdout
    '''

    # seed numpy's RNG; this seed drives the PSO search in both the
    # PSO and PSOL branches below
    np.random.seed(1617 * run)

    # load data
    mat = scipy.io.loadmat('/home/nguyenhoai2/Grid/data/FSMathlab/' + dataset +
                           '.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # ensure that class labels start from 0, not 1
    num_class, count = np.unique(y, return_counts=True)
    n_classes = np.unique(y).shape[0]
    min_class = np.min(count)
    if np.max(y) >= len(num_class):
        y = y - 1
    n_features = X.shape[1]

    # ensure that the division is the same for all algorithms, in all runs
    n_splits = min(min_class, 10)
    # shuffle=True is required for random_state to take effect
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1617)

    to_print = 'Apply %d folds\n' % n_splits

    if alg == 'PSO':
        if n_features < 100:
            # use an integer step: n_features / 10 is a float in Python 3
            # and range() rejects float steps
            step = max(1, n_features // 10)
            num_selected_features = list(range(1, n_features, step))
        else:
            num_selected_features = [i for i in range(10, 110, 10)]
        selected_test_svm = np.array([0.0] * len(num_selected_features))
        selected_test_knn = np.array([0.0] * len(num_selected_features))
        full_test_svm = []
        full_test_knn = []

        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            to_print += '=========Fold ' + str(fold) + '=========\n'
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # normalize data
            scaler = preprocessing.StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
            X_train = np.nan_to_num(X_train)
            X_test = np.nan_to_num(X_test)
            normalize = preprocessing.MinMaxScaler().fit(X_train)
            X_train = normalize.transform(X_train)
            X_test = normalize.transform(X_test)
            X_train = np.nan_to_num(X_train)
            X_test = np.nan_to_num(X_test)

            # full results
            clf = svm.LinearSVC(random_state=1617)
            clf.fit(X_train, y_train)
            full_test_svm.append(np.mean(clf.predict(X_test) == y_test))

            clf = KNeighborsClassifier()
            clf.fit(X_train, y_train)
            full_test_knn.append(np.mean(clf.predict(X_test) == y_test))

            # prepare for PSO
            n_part = 50
            n_iter = 1000
            max_range = X_train.max(axis=0)
            min_range = X_train.min(axis=0)
            max_pos = np.tile(max_range, (n_classes, ))
            min_pos = np.tile(min_range, (n_classes, ))
            length = n_features * n_classes
            max_vel = np.array([0.05] * max_pos.shape[0])
            min_vel = -max_vel
            prob = Problem.CentroidClassification(minimized=True,
                                                  X=X_train,
                                                  y=y_train)
            swarm = PSO.Swarm(n_particle=n_part,
                              length=length,
                              problem=prob,
                              n_iterations=n_iter,
                              max_pos=max_pos,
                              min_pos=min_pos,
                              max_vel=max_vel,
                              min_vel=min_vel)
            sol, fit, loss, dist = swarm.iterate()

            centroids = np.reshape(sol, (n_classes, n_features))
            normalize = preprocessing.MinMaxScaler().fit(centroids)
            centroids_n = normalize.transform(centroids)
            # rank features by how much their (normalized) centroid
            # coordinates vary across classes
            variances = np.var(centroids_n, axis=0)
            idx = np.argsort(variances)[::-1]

            for index, n_selected in enumerate(num_selected_features):
                X_train_selected = X_train[:, idx[0:n_selected]]
                X_test_selected = X_test[:, idx[0:n_selected]]

                # D_train = cdist(X_train, centroids)
                # pseu_train = np.argmin(D_train, axis=1)
                # print("Training accuracy: %f" %np.mean(y_train == pseu_train))
                #
                # D_test = cdist(X_test, centroids)
                # pseu_test = np.argmin(D_test, axis=1)
                # print("Testing accuracy: %f" %np.mean(y_test == pseu_test))

                clf = svm.LinearSVC(random_state=1617)
                clf.fit(X_train_selected, y_train)
                selected_test_svm[index] += np.mean(
                    clf.predict(X_test_selected) == y_test)

                clf = KNeighborsClassifier()
                clf.fit(X_train_selected, y_train)
                selected_test_knn[index] += np.mean(
                    clf.predict(X_test_selected) == y_test)

        selected_test_svm /= n_splits
        selected_test_knn /= n_splits
        test_svm = np.mean(full_test_svm)
        test_knn = np.mean(full_test_knn)

        print("-------------------KNN----------------------")
        print('Full test: %f' % test_knn)
        for n_selected, selected_test in zip(num_selected_features,
                                             selected_test_knn):
            print('%d features: %f' % (n_selected, selected_test))

        print("-------------------SVM----------------------")
        print('Full test: %f' % test_svm)
        for n_selected, selected_test in zip(num_selected_features,
                                             selected_test_svm):
            print('%d features: %f' % (n_selected, selected_test))

    elif alg == 'PSOL':
        num_selected_features = []
        selected_test_knn = []
        selected_test_svm = []
        selected_test_embed = []
        full_test_svm = []
        full_test_knn = []

        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            to_print += '=========Fold ' + str(fold) + '=========\n'
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # normalize data
            scaler = preprocessing.StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
            X_train = np.nan_to_num(X_train)
            X_test = np.nan_to_num(X_test)
            normalize = preprocessing.MinMaxScaler().fit(X_train)
            X_train = normalize.transform(X_train)
            X_test = normalize.transform(X_test)
            X_train = np.nan_to_num(X_train)
            X_test = np.nan_to_num(X_test)

            # full results
            clf = svm.LinearSVC(random_state=1617)
            clf.fit(X_train, y_train)
            full_test_svm.append(np.mean(clf.predict(X_test) == y_test))

            clf = KNeighborsClassifier()
            clf.fit(X_train, y_train)
            full_test_knn.append(np.mean(clf.predict(X_test) == y_test))

            # prepare for PSO
            n_part = 50
            n_iter = 1000
            max_range = X_train.max(axis=0)
            min_range = X_train.min(axis=0)
            max_pos = np.tile(max_range, (n_classes, ))
            max_pos = np.append(max_pos, 1.0)
            min_pos = np.tile(min_range, (n_classes, ))
            min_pos = np.append(min_pos, 0.0)
            length = n_features * n_classes + 1
            max_vel = np.array([0.05] * max_pos.shape[0])
            min_vel = -max_vel

            prob = Problem.CentroidClassificationLimit(minimized=True,
                                                       X=X_train,
                                                       y=y_train)
            swarm = PSO.Swarm(n_particle=n_part,
                              length=length,
                              problem=prob,
                              n_iterations=n_iter,
                              max_pos=max_pos,
                              min_pos=min_pos,
                              max_vel=max_vel,
                              min_vel=min_vel)
            sol, fit, loss, dist = swarm.iterate()

            centroids = np.reshape(sol[0:n_features * n_classes],
                                   (n_classes, n_features))
            n_selected_features = int(sol[n_features * n_classes] * n_features)
            # normalize = preprocessing.MinMaxScaler().fit(centroids)
            # centroids_n = normalize.transform(centroids)
            # rank features by the variance of their centroid coordinates
            # across classes
            variances = np.var(centroids, axis=0)
            idx = np.argsort(variances)[::-1]
            X_train_selected = X_train[:, idx[0:n_selected_features]]
            X_test_selected = X_test[:, idx[0:n_selected_features]]
            centroids_selected = centroids[:, idx[0:n_selected_features]]

            num_selected_features.append(n_selected_features)

            D = cdist(X_test_selected, centroids_selected, metric='cityblock')
            pseu = np.argmin(D, axis=1)
            selected_test_embed.append(np.mean(pseu == y_test))

            clf = svm.LinearSVC(random_state=1617)
            clf.fit(X_train_selected, y_train)
            selected_test_svm.append(
                np.mean(clf.predict(X_test_selected) == y_test))

            clf = KNeighborsClassifier()
            clf.fit(X_train_selected, y_train)
            selected_test_knn.append(
                np.mean(clf.predict(X_test_selected) == y_test))
            print(selected_test_embed[-1])
            print(full_test_knn[-1], selected_test_knn[-1])
            print(full_test_svm[-1], selected_test_svm[-1])

        print "-------------------Centroid----------------------"
        print 'Centroid: %f' % np.mean(selected_test_embed)

        print "-------------------KNN----------------------"
        print 'Full test: %f' % np.mean(full_test_knn)
        print 'Select %f features with accuracy of %f' % (
            np.mean(num_selected_features), np.mean(selected_test_knn))

        print "-------------------SVM----------------------"
        print 'Full test: %f' % np.mean(full_test_svm)
        print 'Select %f features with accuracy of %f' % (
            np.mean(num_selected_features), np.mean(selected_test_svm))

    else:
        raise ValueError('Algorithm %s has not been implemented!' % alg)
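
Problem.CentroidClassification is external to this snippet, but the commented-out cdist/argmin block above shows how candidate solutions are evaluated. A minimal sketch of the fitness being minimized, assuming a particle is a flat vector of n_classes * n_features centroid coordinates and fitness is the nearest-centroid training error:

import numpy as np
from scipy.spatial.distance import cdist

def centroid_fitness(sol, X_train, y_train, n_classes):
    # reshape the flat particle position into one centroid per class
    centroids = np.reshape(sol, (n_classes, X_train.shape[1]))
    # assign each training instance to its nearest centroid
    pseu = np.argmin(cdist(X_train, centroids), axis=1)
    # minimized: training error rate of the nearest-centroid classifier
    return np.mean(pseu != y_train)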
Example #4
import matplotlib.pyplot as plt
import Utils
import PSO
import os

directory = 'graphs/function'
if not os.path.exists(directory):
    os.makedirs(directory)
directoryboxplot = 'graphs/boxplot'
if not os.path.exists(directoryboxplot):
    os.makedirs(directoryboxplot)

particleSwarmOpt = PSO.Swarm()
for inertia in Utils.constants.inertiaTypes:
    for function in Utils.constants.functionTypes:
        listselems = []
        subtitlesElems = []
        finals = []
        for comunication in Utils.constants.comunicationTypes:
            listelems, finalgbestLists = particleSwarmOpt.run(
                inertia, comunication, function)
            listselems.append(listelems)
            subtitlesElems.append(comunication.name)
            finals.append(finalgbestLists)
        for i in range(len(listselems)):
            funcselems = listselems[i]
            plt.plot(funcselems, label=subtitlesElems[i])
        plt.legend()
        plt.savefig(
            os.path.join(directory,
                         function.name + '_' + inertia.name + '.png'))
        plt.clf()  # start a fresh figure so curves don't accumulate across plots
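
graphs/boxplot is created above but never written to in this snippet. A minimal sketch of how the collected finals could be box-plotted per communication topology, placed inside the function loop right after the savefig call (assuming each entry of finals is a list of final gbest values across repeated runs):

        fig, ax = plt.subplots()
        ax.boxplot(finals, labels=subtitlesElems)
        ax.set_title(function.name + ' / ' + inertia.name)
        fig.savefig(
            os.path.join(directoryboxplot,
                         function.name + '_' + inertia.name + '.png'))
        plt.close(fig)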
Example #5
import PSO as PSO
from MyMath import distance3d

# a simple 3d geometric search example
# the heuristic for each particle is the inverse of distance from DESTINATION
# verdict: works pretty well, runs into some trouble due to the distance function
# did help me debug the PSO though

DESTINATION = (25, 25, 25)
geometric_mins = (0, 0, 0)
geometric_maxs = (50, 50, 50)
geometric_dimensions = 3
geometric_particles = 20

geometric_search = PSO.Swarm(geometric_particles, geometric_dimensions,
                             geometric_mins, geometric_maxs)
thisIteration = 0
bestFoundScore = float('-inf')

print "Looking for point 25,25,25 "

while geometric_search.getIterations() < 100:
    currentParticle = geometric_search.getCurrentParticle()
    particleLocation = currentParticle.getPosition()
    distanceFromTarget = distance3d(DESTINATION, particleLocation)

    currentParticle.setHeuristic(-distanceFromTarget) # since heuristic is maximized

    geometric_search.tickCurrentParticle()

    if geometric_search.getIterations() > thisIteration:
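
distance3d comes from the author's MyMath module and is not shown; a minimal sketch, assuming plain Euclidean distance between two (x, y, z) tuples:

import math

def distance3d(a, b):
    # straight-line distance between two 3d points
    return math.sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))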