Esempio n. 1
0
def select_best_corpora():
    """Score every processed news feature file and log the results as CSV.

    For each file in the processed news features directory, ``classify`` is
    run with a fixed feature selection and the returned model is handed to
    ``get_meter_features`` / ``meter_classify``.  One ';'-separated row per
    file is written to ``select_best_corpora_scores_file.csv`` in the saved
    models directory.  The corpus flags and the corpus size are read from
    the '_'-separated parts of the file name (without its last 4 characters).
    """
    chosen_features = [
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        'Word Lengths', 'Word Level Levenshtein', 'NE Coupling', 'NE LCS syn'
    ]
    # Tokens looked up in the file name, in the order of the flag columns.
    corpus_tokens = ('mscoco', 'msrp', 'msrpa', 'opinosis', 'p4p', 'quora')
    columns = [
        'filename', 'mscoco', 'msrp', 'msrp-a', 'opinosis', 'p4p', 'quora',
        'balanced', 'corpussize', 'loss', 'accuracy', 'loss on meter',
        'accuracy on meter'
    ]

    news_feature_files = os.listdir(slash.join(processed_news_features_path))

    scores_path = slash.join(saved_models_path +
                             ['select_best_corpora_scores_file.csv'])
    with open(scores_path, 'w', newline='') as scores_file:
        writer = csv.writer(scores_file, delimiter=';')
        writer.writerow(columns)

        for modelnum, f_name in enumerate(news_feature_files):
            start = time.time()

            model, (loss, accuracy), _rounded, _y = classify(
                chosen_features, 0.1, f_name)
            meter_split = get_meter_features(model, 0.1, chosen_features,
                                             None)
            X_train, X_test, y_train, y_test = meter_split
            loss_on_meter, accuracy_on_meter = meter_classify(
                X_train, X_test, y_train, y_test)

            # e.g. the trailing part of the name is parsed as corpus size
            name_parts = f_name[:-4].split('_')
            row = [f_name]
            row.extend(1 if token in name_parts else 0
                       for token in corpus_tokens)
            row.append(1 if name_parts[0] == 'balance' else 0)
            row.append(int(name_parts[-1]))  # corpussize
            row.extend([loss, accuracy, loss_on_meter, accuracy_on_meter])
            writer.writerow(row)

            print(modelnum, 'done in', time.time() - start)
            K.clear_session()
Esempio n. 2
0
def save(model, modelnum):
    """Write a model's JSON description and its weights to disk.

    The files end up in ``<saved_models_path>/<modelnum>/`` as
    ``<modelnum>.json`` and ``<modelnum>.h5``.

    :param model: object providing ``to_json()`` and ``save_weights(path)``
        (presumably a Keras model -- confirm against callers).
    :param modelnum: identifier used as directory and file name.  It is
        converted with ``str`` so that integer counters are accepted too.
    """
    name = str(modelnum)

    # create directory; exist_ok keeps a re-run from failing at save time
    # only because the target directory is already there
    os.makedirs(slash.join(saved_models_path + [name]), exist_ok=True)
    path = slash.join(saved_models_path + [name, name])

    model_json = model.to_json()
    with open(path + '.json', 'w') as f:
        f.write(model_json)
    model.save_weights(path + '.h5')
Esempio n. 3
0
def parse(file_name):
    """Read one processed meter corpus CSV file into per-feature columns.

    Empty rows and rows whose first cell is ``'source'`` (the header) are
    skipped.  For every other row, cell 2 is remembered as ``derived`` and
    the cells from index 3 onwards are converted to ``float`` and appended,
    in order, to the lists of the corresponding feature names.

    :param file_name: name of a file inside the processed meter corpus
        directory.
    :return: ``(features, derived)`` where ``features`` maps every feature
        name to the list of its values and ``derived`` is cell 2 of the last
        data row, or ``None`` if the file contains no data rows.
    :raises ValueError: if a feature cell cannot be converted to ``float``.
    """
    file_path = slash.join(processed_meter_corpus_path + [file_name])

    feature_names = ['TARGET_ID', 'SOURCE_ID',  'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
                     # 'TFIDF': tfidf_X,
                     'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein',
                     'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
                     'NE Coupling', 'NE Coupling syn', 'NE GT',
                     'NE GT syn', 'NE LCS', 'NE LCS syn', 'NE Overlap', 'NE Overlap syn', 'LCSubstring',
                     'Sentence Lengths', 'String Matching', 'Punctuation Overlap']

    # One (initially empty) column per feature name, in the same order.
    features = {name: [] for name in feature_names}

    # Stays None when the file has no data rows, instead of leaving the
    # name unbound at the return statement.
    derived = None

    with open(file_path, 'r') as csv_file:
        for line in csv.reader(csv_file, delimiter=','):
            if not line or line[0] == 'source':
                continue
            derived = line[2]
            # Convert the whole row first so a bad cell aborts before any
            # column has been extended for this row.
            values = [float(n) for n in line[3:]]
            for label, val in zip(feature_names, values):
                features[label].append(val)

    return features, derived
    def export(self):
        """
        Serialize ``self.similarity_dict`` as JSON into the file
        ``sim_matrix.txt`` located under ``vand_folder_path``.
        """
        target_path = slash.join(vand_folder_path + ['sim_matrix.txt'])
        with open(target_path, 'w') as out_file:
            json.dump(self.similarity_dict, out_file)
Esempio n. 5
0
def load(name):
    """Restore a model stored under ``<saved_models_path>/<name>/``.

    The description is read from ``<name>.json`` and passed to
    ``model_from_json``; afterwards the weights in ``<name>.h5`` are loaded
    into the resulting model.

    :param name: directory and base file name of the stored model.
    :return: the model returned by ``model_from_json`` with weights loaded.
    """
    base_path = slash.join(saved_models_path + [name, name])

    with open(base_path + '.json', 'r') as json_file:
        description = json_file.read()

    restored = model_from_json(description)
    restored.load_weights(base_path + '.h5')
    return restored
Esempio n. 6
0
def feature_selection():
    """Rank the candidate features with recursive feature elimination.

    Data for the first file of the processed news features directory is
    obtained through ``classify``; NaN entries are replaced by 0.  An
    ``RFE`` wrapper around ``LogisticRegression`` is fitted with a single
    feature to select, then the selected feature name(s) and the 11
    best-ranked ``(rank, name)`` pairs are printed.
    """
    candidate_features = [
        'GST',
        'GST syn',
        'LCS',
        'LCS syn',
        'TO',
        'TO syn',
        'Sparse Vector',
        # 'TFIDF': tfidf_X,
        'Word Embeddings',
        'Word Lengths',
        'Word Level Levenshtein',
        # 'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
        'NE Coupling',
        'NE Coupling syn',
        'NE GT',
        'NE GT syn',
        'NE LCS',
        'NE LCS syn',
        'NE Overlap',
        'NE Overlap syn',
        'LCSubstring',
        'Sentence Lengths',
        'String Matching',
        'Punctuation Overlap'
    ]

    feature_files = os.listdir(slash.join(processed_news_features_path))

    # must be changed in method to return this
    # NOTE(review): this unpacking expects ``classify`` to return ``(X, y)``.
    X, y = classify(candidate_features, 0.1, feature_files[0])
    X[numpy.isnan(X)] = 0
    # Sanity output: prints False once no NaN is left in X.
    print(numpy.isnan(numpy.min(X)))

    # n_features_to_select is passed by keyword: recent scikit-learn
    # releases no longer accept it positionally.
    rfe = RFE(LogisticRegression(), n_features_to_select=1)
    fit = rfe.fit(X, y)

    selected = [
        name for name, keep in zip(candidate_features, fit.support_) if keep
    ]
    print(selected)

    # Sort by rank only, so equally ranked features keep their list order.
    features_ranked_sorted = sorted(zip(fit.ranking_, candidate_features),
                                    key=lambda pair: pair[0])
    print(features_ranked_sorted[:11])
Esempio n. 7
0
def test_5_features():
    """Print the number of 5-element combinations of the candidate features.

    The processed news features directory is listed and entry 75 is looked
    up first (the value is not used afterwards, but the lookup fails when
    fewer files are present).
    """
    news_files = os.listdir(slash.join(processed_news_features_path))
    _best_corpus = news_files[75]

    candidates = [
        'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        'Word Lengths', 'Word Level Levenshtein', 'NE Coupling', 'NE LCS syn'
    ]

    subsets = list(itertools.combinations(candidates, 5))
    print(len(subsets))
def run(primary_model, test_size, feature_selection, b_mode):
    """Featurize the processed meter corpus and split it into train/test.

    Every file of the processed meter corpus directory is passed to
    ``featurize``; files for which it returns ``None`` are skipped.  The
    label ``'0'`` is mapped to 0, every other label to 1.

    :param primary_model: forwarded to ``featurize``.
    :param test_size: forwarded to ``train_test_split``.
    :param feature_selection: forwarded to ``featurize``.
    :param b_mode: 0 -> ``balance_both``, 1 -> ``balance_down``,
        2 -> ``balance_up``; any other value leaves the data unbalanced.
    :return: ``X_train, X_test, y_train, y_test`` as produced by
        ``train_test_split`` (with shuffling enabled).
    """
    rows = []
    targets = []
    for file_name in os.listdir(slash.join(processed_meter_corpus_path)):
        result = featurize(feature_selection, file_name, primary_model)
        if result is None:
            continue
        row, label = result
        rows.append(row)
        if label == '0':
            targets.append(0)
        else:
            targets.append(1)

    X = np.array([np.array(row) for row in rows])

    # Default: unbalanced data; overwritten for the known balancing modes.
    balanced = (X, targets)
    if b_mode == 0:
        balanced = balance_both(X, targets)
    elif b_mode == 1:
        balanced = balance_down(X, targets)
    elif b_mode == 2:
        balanced = balance_up(X, targets)
    b_X, b_y = balanced

    return train_test_split(b_X, b_y, test_size=test_size, shuffle=True)
Esempio n. 9
0
def parse(file_name):
    """Read one processed test CSV file into per-feature columns.

    Empty rows and rows whose first cell is ``'source'`` (the header) are
    skipped.  In every other row the first four cells are
    ``source, target, source_id, target_id``; the two ids are converted to
    ``int`` and the remaining cells to ``float`` before being appended, in
    order, to the lists of the corresponding feature names.

    :param file_name: name of a file inside the processed test files
        directory.
    :return: ``(source, target, features)`` where ``source`` and ``target``
        are taken from the last data row (``None`` if there is none) and
        ``features`` maps every feature name to the list of its values.
    :raises ValueError: if a data row has fewer than four cells or a cell
        cannot be converted to a number.
    """
    feature_names = ['source_id', 'target_id', 'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
     # 'TFIDF': tfidf_X,
     'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein',
     'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
     'NE Coupling', 'NE Coupling syn', 'NE GT',
     'NE GT syn', 'NE LCS', 'NE LCS syn', 'NE Overlap', 'NE Overlap syn', 'LCSubstring',
     'Sentence Lengths', 'String Matching', 'Punctuation Overlap']

    file_path = slash.join(processed_test_files_path + [file_name])

    # One (initially empty) column per feature name, in the same order.
    features = {name: [] for name in feature_names}

    # Stay None when the file has no data rows, instead of leaving the
    # names unbound at the return statement.
    source = None
    target = None

    with open(file_path, 'r') as csv_file:
        for line in csv.reader(csv_file, delimiter=','):
            # Blank rows would otherwise fail on line[0].
            if not line or line[0] == 'source':
                continue
            source, target, source_id, target_id = line[:4]
            values = [int(source_id), int(target_id)]
            values += [float(n) for n in line[4:]]
            for label, val in zip(feature_names, values):
                features[label].append(val)

    return source, target, features
Esempio n. 10
0
def test_different_feature_selections():
    """Score growing prefixes of the feature list on selected corpora.

    For each of the feature files at the hard-coded indices, ``classify`` is
    run with the first 5, 6, ... 11 entries of ``f_selections``; the returned
    model is then passed to ``get_meter_features`` / ``meter_classify``.
    One ';'-separated row per run is written to
    ``test_different_features_scores_file_new.csv`` in the saved models
    directory: corpus index, the four scores, and one 0/1 column per
    feature telling whether it was part of the run.
    """
    f_selections = [
        'GST syn', 'LCS syn', 'TO syn', 'GST', 'TO', 'LCS', 'LCSubstring',
        'Word Lengths', 'Word Level Levenshtein', 'String Matching',
        'NE Coupling'
    ]

    # The flag columns are exactly the feature names, so both header and
    # flags are derived from f_selections; spelling the names out a second
    # time is what previously left two columns permanently at 0.
    header = [
        'corpusnum', 'loss', 'accuracy', 'loss on meter', 'accuracy on meter'
    ] + f_selections

    feature_files = os.listdir(slash.join(processed_news_features_path))
    best_c_indeces = [75, 4, 61, 33, 104]
    best_corpora = [(feature_files[i], i) for i in best_c_indeces]

    scores_path = slash.join(
        saved_models_path + ['test_different_features_scores_file_new.csv'])
    with open(scores_path, 'w', newline='') as scores_file:
        writer = csv.writer(scores_file, delimiter=';')
        writer.writerow(header)

        modelnum = 0
        for f_name, num in best_corpora:
            for i in range(5, len(f_selections) + 1):
                f = f_selections[:i]
                start = time.time()
                model, score, _rounded, _y = classify(f, 0.1, f_name)
                loss, accuracy = score
                X_train, X_test, y_train, y_test = get_meter_features(
                    model, 0.1, f, None)
                loss_on_meter, accuracy_on_meter = meter_classify(
                    X_train, X_test, y_train, y_test)

                used_flags = [1 if name in f else 0 for name in f_selections]
                writer.writerow(
                    [num, loss, accuracy, loss_on_meter, accuracy_on_meter] +
                    used_flags)

                print(modelnum, 'done in', time.time() - start)
                K.clear_session()
                modelnum += 1
                # NOTE(review): this stops the prefix loop of the first
                # corpus after three runs only (modelnum never equals 3
                # again later); looks like a debugging leftover -- confirm.
                if modelnum == 3:
                    break
Esempio n. 11
0
def iterate_all_feature_selections_and_files():
    """Score selected feature files once per combination of best features.

    All combinations of 2 to 4 entries of ``best_feautures`` are built.  For
    every combination and every feature file at the hard-coded indices, one
    ';'-separated row is written to ``multiple_features_scores_file.csv`` in
    the saved models directory: file index, corpus flags and corpus size
    parsed from the file name, the scores returned by ``classify`` and
    ``meter_classify``, and one 0/1 column per best feature.

    NOTE(review): the combination is only recorded in the row; ``classify``
    and ``get_meter_features`` are always called with ``feature_selection``
    -- confirm whether that is intended.
    """

    # Fixed feature list passed to classify/get_meter_features on every run.
    feature_selection = [  #'GST', 'GST syn', 'LCS', 'LCS syn', 'TO', 'TO syn', 'Sparse Vector',
        # 'TFIDF': tfidf_X,
        #'Word Embeddings', 'Word Lengths', 'Word Level Levenshtein',
        #'WN Path Matrix', 'WN LCH Matrix', 'WN WUP Matrix',
        #'NE Coupling', 'NE Coupling syn',
        'NE GT',
        'NE GT syn',
        'NE LCS',
        'NE LCS syn',
        'NE Overlap',
        'NE Overlap syn',
        'LCSubstring',
        'Sentence Lengths',
        'String Matching',
        'Punctuation Overlap'
    ]

    best_feautures = [
        'Sparse Vector', 'Word Level Levenshtein', 'TO', 'GST', 'NE LCS'
    ]
    # Every subset of size 2, 3 and 4 of the best features.
    best_features_all_combinations = []
    for i in range(2, 5):
        for c in itertools.combinations(best_feautures, i):
            best_features_all_combinations.append(c)

    feature_files = os.listdir(slash.join(processed_news_features_path))

    #nums = [18, 94, 9, 78, 93, 76, 40, 59, 45, 49, 15, 54, 85, 25]
    # Indices into the directory listing; each entry is (file name, index).
    nums = [9, 15, 18, 49, 78, 94]
    best_files = [(feature_files[i], i) for i in nums]

    with open(slash.join(saved_models_path +
                         ['multiple_features_scores_file.csv']),
              'w',
              newline='') as f:
        writer = csv.writer(f, delimiter=';')
        # add featureselection
        #header = ['modelnum', 'usedcorpora', 'corpussize', 'loss', 'accuracy',
        #          'loss on meter', 'accuracy on meter', 'featureselection']
        header = [
            'modelnum', 'mscoco', 'msrp', 'msrp-a', 'opinosis', 'p4p', 'quora',
            'balanced', 'corpussize', 'loss', 'accuracy', 'loss on meter',
            'accuracy on meter', 'Sparse Vector', 'Word Level Levenshtein',
            'TO', 'GST', 'NE LCS'
        ]

        writer.writerow(header)
        modelnum = 0
        for feature_combination in best_features_all_combinations:
            for feature_file_name in best_files:
                start = time.time()
                # Row pre-filled with zeros, one slot per header column.
                new_model_row = [0 for _ in list(range(len(header)))]
                #new_model_row[0] = modelnum
                # The 'modelnum' column holds the file's directory index.
                new_model_row[0] = feature_file_name[1]
                #usedcorpora = feature_file_name[0][:-4].split('_')
                # File name without its last 4 characters, split on '_'.
                usedcorpora = feature_file_name[0][:-4].split('_')
                new_model_row[1] = 1 if 'mscoco' in usedcorpora else 0
                new_model_row[2] = 1 if 'msrp' in usedcorpora else 0
                new_model_row[3] = 1 if 'msrpa' in usedcorpora else 0
                new_model_row[4] = 1 if 'opinosis' in usedcorpora else 0
                new_model_row[5] = 1 if 'p4p' in usedcorpora else 0
                new_model_row[6] = 1 if 'quora' in usedcorpora else 0
                new_model_row[7] = 1 if usedcorpora[0] == 'balance' else 0
                new_model_row[8] = int(usedcorpora[-1])  # corpussize
                # NOTE(review): expects classify to return exactly
                # (model, score) -- verify against its current signature.
                model, score = classify(feature_selection, 0.1,
                                        feature_file_name[0])
                #model, score = classify(feature_selection, 0.1, feature_file_name)
                loss, accuracy = score

                # meterstuff
                X_train, X_test, y_train, y_test = get_meter_features(
                    model, 0.1, feature_selection)
                loss_on_meter, accuracy_on_meter = meter_classify(
                    X_train, X_test, y_train, y_test)

                new_model_row[9] = loss
                new_model_row[10] = accuracy
                new_model_row[11] = loss_on_meter
                new_model_row[12] = accuracy_on_meter
                # Columns 13-17 flag which best features are in the
                # current combination.
                new_model_row[
                    13] = 1 if 'Sparse Vector' in feature_combination else 0
                new_model_row[
                    14] = 1 if 'Word Level Levenshtein' in feature_combination else 0
                new_model_row[15] = 1 if 'TO' in feature_combination else 0
                new_model_row[16] = 1 if 'GST' in feature_combination else 0
                new_model_row[17] = 1 if 'NE LCS' in feature_combination else 0

                writer.writerow(new_model_row)

                print(modelnum, 'done in', time.time() - start)
                K.clear_session()
                modelnum += 1
        """for feature_file_name in feature_files:
Esempio n. 12
0
def parse(file_name):
    """
    Build the path of a processed news feature file.

    As written, the function returns the placeholder tuple
    ``('s', 't', file_path)`` right after the path has been built; all code
    below that return statement is unreachable.  The unreachable part would
    read the CSV file, skip empty rows and the ``'source'`` header row, and
    return ``(features, labels)``.

    :param file_name: name of a file inside the processed news features
        directory.
    :return: ``('s', 't', file_path)``
    """

    feature_names = [
        'GST',
        'GST syn',
        'LCS',
        'LCS syn',
        'TO',
        'TO syn',
        'Sparse Vector',
        # 'TFIDF': tfidf_X,
        'Word Embeddings',
        'Word Lengths',
        'Word Level Levenshtein',
        'WN Path Matrix',
        'WN LCH Matrix',
        'WN WUP Matrix',
        'NE Coupling',
        'NE Coupling syn',
        'NE GT',
        'NE GT syn',
        'NE LCS',
        'NE LCS syn',
        'NE Overlap',
        'NE Overlap syn',
        'LCSubstring',
        'Sentence Lengths',
        'String Matching',
        'Punctuation Overlap'
    ]

    file_path = slash.join(processed_news_features_path + [file_name])

    # NOTE(review): this early return makes everything below dead code; it
    # looks like a temporary stub -- confirm with callers before removing.
    return 's', 't', file_path

    # ---- unreachable from here on ----
    phrase_pairs = []
    labels = []
    origins = []
    features = {
        'GST': [],
        'GST syn': [],
        'LCS': [],
        'LCS syn': [],
        'TO': [],
        'TO syn': [],
        'Sparse Vector': [],  # 'TFIDF': tfidf_X,
        'Word Embeddings': [],
        'Word Lengths': [],
        'Word Level Levenshtein': [],
        'WN Path Matrix': [],
        'WN LCH Matrix': [],
        'WN WUP Matrix': [],
        'NE Coupling': [],
        'NE Coupling syn': [],
        'NE GT': [],
        'NE GT syn': [],
        'NE LCS': [],
        'NE LCS syn': [],
        'NE Overlap': [],
        'NE Overlap syn': [],
        'LCSubstring': [],
        'Sentence Lengths': [],
        'String Matching': [],
        'Punctuation Overlap': []
    }
    with open(file_path, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        #i = 0
        for line in reader:
            # Skip blank rows and the header row.
            if len(line) > 0 and line[0] != 'source':
                #print(line)
                # Cells 0-1: phrase pair, cell 2: integer label,
                # cell 3: origin, remaining cells: feature values.
                phrase_pairs.append(line[:2])
                #labels.append([1,0] if line[2] == '1' else [0,1])
                labels.append(int(line[2]))
                #labels.append(float(line[2]))
                origins.append(line[3])
                # Rebinds ``f``; the reader keeps its own reference to the
                # open file, so iteration is not affected.
                f = list(zip(feature_names, [float(n) for n in line[4:]]))
                #print(f)
                for label, val in f:
                    features[label].append(val)
                #i+=1
            #if i == 800:
            #    break

    return features, labels