Exemple #1
0
def main():
    """Train a MultinomialNB text classifier and report its test accuracy.

    Loads the training set from ``data/train_set.txt``, the test inputs from
    column ``"A"`` of ``data/test_set.xlsx`` and the test labels from
    ``data/test_set_y.txt``. Fits a lemmatizing bag-of-words vectorizer plus a
    multinomial naive Bayes classifier, then prints the number of correct
    predictions, the elapsed time and a classification report.

    When the module-level flag ``compare_datasets`` is truthy, the test labels
    are first compared against ``data/test_dataset.csv`` and the differing
    positions are printed.
    """
    t0 = time.time()
    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]

    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]

    ts_y_raw = load_txt("data/test_set_y.txt")
    # Drop the last entry: the file ends with a trailing new line.
    ts_y = ts_y_raw[1][:-1]

    if compare_datasets:
        # Check our test labels against Eysteinn's
        ts_y_alternate = load_csv("data/test_dataset.csv")
        # Compare by value (`!=`), not identity: two equal labels held in
        # distinct objects must not count as different. zip() stops at the
        # shorter sequence, so a length mismatch cannot raise IndexError.
        different = [
            i
            for i, (ours, theirs) in enumerate(zip(ts_y, ts_y_alternate))
            if ours != theirs
        ]
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)

    print("Creating MultinomialNB classifier..")
    clf = MultinomialNB()
    clf.fit(tr_vectors, tr_y)
    ts_x_featurized = vectorizer.transform(ts_x)

    print("Making predictions..")
    predictions = clf.predict(ts_x_featurized)
    # The timer stops here on purpose: scoring below is not part of the
    # reported duration.
    dt = time.time() - t0

    correct_predictions = sum(
        1 for predicted, expected in zip(predictions, ts_y)
        if predicted == expected
    )

    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n" %
          (correct_predictions, len(predictions),
           100. * correct_predictions / len(predictions), dt))
    print(classification_report(ts_y, predictions))
Exemple #2
0
def _inner_load_stopwords():
    """Return the de-duplicated union of all stop-word sources as a list.

    Two sources are combined for French, German, English and Italian: the
    lists returned by ``get_stop_words`` (stop_words python package) and the
    ``smart_stop_words_*.txt`` files (from Peter Graham on GitHub) read via
    ``load_txt``. The result passes through a ``set``, so its order is
    unspecified.
    """
    language_codes = ('fr', 'de', 'en', 'it')
    graham_paths = (
        'stop_words/smart_stop_words_fr.txt',
        'stop_words/smart_stop_words_de.txt',
        'stop_words/smart_stop_words_en.txt',
        'stop_words/smart_stop_words_it.txt',
    )

    # Package lists first, then the text files, in the same language order.
    sources = [get_stop_words(code) for code in language_codes]
    sources.extend(load_txt(path) for path in graham_paths)

    merged = set()
    for words in sources:
        merged.update(words)
    return list(merged)
Exemple #3
0
    def __init__(self,
                 ids_file,
                 x_dir,
                 y_dir,
                 get_label=True,
                 x_suffix='.jpg',
                 y_suffix='.png',
                 transforms=None,
                 seed=None,
                 verbose=False):
        """
        Store the dataset configuration and load the sample ids.

        Args:
        - ids_file: path to a text file containing (only) samples' ids in each line
            - eg: '0000\n0201\n0299'
        - x_dir (str or Path): path to the directory containing data images (x)
        - y_dir (str or Path): path to the directory containing label images (y)
        - get_label (bool): if True, returns `y` (as np.array of shape (h,w,1)) in addition to `x`
        - x_suffix (str): stored as `self.x_suffix`; presumably the file
            extension of the data images -- confirm where items are loaded
        - y_suffix (str): stored as `self.y_suffix`; presumably the file
            extension of the label images -- confirm where items are loaded
        - transforms (tuple or None): a tuple of transforms to apply to `x` and `y`, respectively
        - seed: stored as `self.seed`; how it is used is not visible in this method
        - verbose (bool): if True, print a short summary of the created dataset
        """

        # Ids come from `load_txt`; per the `ids_file` description above this
        # is expected to be one id per line.
        self.ids = load_txt(ids_file)

        # Normalize both directories to Path objects.
        self.x_dir = x_dir if isinstance(x_dir, Path) else Path(x_dir)
        self.y_dir = y_dir if isinstance(y_dir, Path) else Path(y_dir)
        self.x_suffix, self.y_suffix = x_suffix, y_suffix
        self.get_label = get_label
        self.transforms = transforms
        self.seed = seed

        if verbose:
            n_ids = len(self.ids)
            # An empty ids file must not crash the summary: show the first
            # and last id only when there is at least one.
            id_range = (self.ids[0], self.ids[-1]) if n_ids > 0 else ()
            print("Created dataset: ", n_ids, '[', *id_range, ']')
            print('\txdir: ', self.x_dir)
            print('\tydir: ', self.y_dir)
            print('\tnumber of ids: ', n_ids)
Exemple #4
0
 def load(self, filename):
     """Replace ``self.filter_list`` with the result of ``load_txt(filename)``."""
     loaded_entries = load_txt(filename)
     self.filter_list = loaded_entries
Exemple #5
0
def main():
    """Grid-search SVM hyper-parameters and report the best test accuracy.

    Loads the training set from ``data/train_set.txt``, the test inputs from
    column ``"A"`` of ``data/test_set.xlsx`` and the test labels from
    ``data/test_set_y.txt``. Fits one ``svm.SVC`` per combination of kernel,
    ``C``, ``degree`` and ``coef0``, and scores each on the test set. Every
    new best candidate is printed as it is found; the best result, the
    elapsed time and a classification report are printed at the end.

    When the module-level flag ``compare_datasets`` is truthy, the test labels
    are first compared against ``data/test_dataset.csv`` and the differing
    positions are printed.
    """
    # Local import: only this grid search needs itertools.
    import itertools

    t0 = time.time()
    print("Fetching training and testing datasets..")
    tr = load_txt("data/train_set.txt")
    tr_x = tr[0]
    tr_y = tr[1]

    ts_x_raw = load_xlsx("data/test_set.xlsx")
    ts_x = [row["A"] for row in ts_x_raw]

    ts_y_raw = load_txt("data/test_set_y.txt")
    # Drop the last entry: the file ends with a trailing new line.
    ts_y = ts_y_raw[1][:-1]

    if compare_datasets:
        # Check our test labels against Eysteinn's
        ts_y_alternate = load_csv("data/test_dataset.csv")
        # Compare by value (`!=`), not identity: two equal labels held in
        # distinct objects must not count as different. zip() stops at the
        # shorter sequence, so a length mismatch cannot raise IndexError.
        different = [
            i
            for i, (ours, theirs) in enumerate(zip(ts_y, ts_y_alternate))
            if ours != theirs
        ]
        print("Number of different entries:")
        print(len(different))
        print(different)

    print("Creating features from training set..")
    vectorizer = CountVectorizer(tokenizer=LemmaTokenizer(), lowercase=True)
    tr_vectors = vectorizer.fit_transform(tr_x)
    # The test features do not depend on the SVM parameters, so they are
    # computed once instead of once per candidate.
    ts_x_featurized = vectorizer.transform(ts_x)

    print("Grid searching params for SVM classifier..")
    params = {
        'kernel': ('linear', 'poly', 'rbf'),
        'C': [1, 10],
        'degree': [2, 3, 4, 5],
        'coef0': [5, 7, 10, 15, 17, 20]
    }

    # Start below any reachable score so the first candidate is always
    # recorded; otherwise an all-wrong search would leave no predictions to
    # report on.
    best_res = -1
    best_predictions = None
    t1 = t0
    # product() varies the last factor fastest, matching nested loops with
    # kernel outermost and coef0 innermost.
    grid = itertools.product(params['kernel'], params['C'],
                             params['degree'], params['coef0'])
    for kernel, c, d, coef in grid:
        clf = svm.SVC(kernel=kernel, C=c, degree=d, coef0=coef)
        clf.fit(tr_vectors, tr_y)
        predictions = clf.predict(ts_x_featurized)
        t1 = time.time()
        correct_predictions = sum(
            1 for predicted, expected in zip(predictions, ts_y)
            if predicted == expected
        )
        # Strict comparison: on ties the earliest candidate is kept.
        if correct_predictions > best_res:
            best_res = correct_predictions
            best_predictions = predictions
            print('kernel:', kernel, 'C:', c, 'degree:', d,
                  'coef0:', coef)
            print('Numcorrect', best_res)

    dt = t1 - t0
    print("Result: %d/%d correct predictions (%.2f%%), in %.2fs.\n" %
          (best_res, len(best_predictions),
           100. * best_res / len(best_predictions), dt))
    print(classification_report(ts_y, best_predictions))