def save_all_partners():
    """Fetch every partner from the Coursera Catalog API and store them in RethinkDB.

    Pages through the partners endpoint 100 records at a time, recreates the
    'partners' table (dropping any stale rows), inserts all records, and
    returns a JSON success message.

    Returns:
        flask.Response: JSON body with a success message.
    """
    fields = ','.join([
        'id', 'name', 'shortName', 'description', 'banner', 'courseIds',
        'instructorIds', 'primaryColor', 'logo', 'squareLogo',
        'rectangularLogo', 'links', 'location'
    ])
    partners = []
    # Let requests build/encode the query string instead of hand-concatenating.
    params = {'fields': fields, 'start': 0, 'limit': 100}
    res = requests.get(partners_URL, params=params).json()
    partners.extend(res['elements'])
    # The API signals additional pages via paging.next, which is the next
    # 'start' offset to request.
    while res['paging'].get('next') is not None:
        params['start'] = res['paging']['next']
        res = requests.get(partners_URL, params=params).json()
        partners.extend(res['elements'])
    # delete=True drops the existing table so old rows are not retained.
    create_or_delete_table('partners', delete=True)
    r.table('partners').insert(partners).run(connection)
    print('Successfully inserted all the partners.')
    return jsonify({'message': 'Successfully inserted all the partners.'})
def save_all_instructors():
    """Fetch every instructor from the Coursera Catalog API and store them in RethinkDB.

    Pages through the instructors endpoint 100 records at a time, recreates
    the 'instructors' table (dropping any stale rows), inserts all records,
    and returns a JSON success message.

    Returns:
        flask.Response: JSON body with a success message.
    """
    fields = ','.join([
        'id', 'photo', 'photo150', 'bio', 'prefixName', 'firstName',
        'middleName', 'lastName', 'suffixName', 'fullName', 'title',
        'department', 'website', 'websiteTwitter', 'websiteFacebook',
        'websiteLinkedin', 'websiteGplus', 'shortName'
    ])
    instructors = []
    # Let requests build/encode the query string instead of hand-concatenating.
    params = {'fields': fields, 'start': 0, 'limit': 100}
    res = requests.get(instructors_URL, params=params).json()
    instructors.extend(res['elements'])
    # The API signals additional pages via paging.next, which is the next
    # 'start' offset to request.
    while res['paging'].get('next') is not None:
        params['start'] = res['paging']['next']
        res = requests.get(instructors_URL, params=params).json()
        instructors.extend(res['elements'])
    # delete=True drops the existing table so old rows are not retained.
    create_or_delete_table('instructors', delete=True)
    r.table('instructors').insert(instructors).run(connection)
    print('Successfully inserted all the instructors.')
    return jsonify({'message': 'Successfully inserted all the instructors.'})
def save_all_courses():
    """Fetch every course from the Coursera Catalog API and store them in RethinkDB.

    Pages through the courses endpoint 100 records at a time, recreates the
    'courses' table (dropping any stale rows), inserts all records, and
    returns a JSON success message.

    Returns:
        flask.Response: JSON body with a success message.
    """
    fields = ','.join([
        'id', 'slug', 'courseType', 'name', 'primaryLanguages',
        'subtitleLanguages', 'partnerLogo', 'instructorIds', 'partnerIds',
        'photoUrl', 'certificates', 'description', 'startDate', 'workload',
        'previewLink', 'specializations', 's12nIds', 'domainTypes',
        'categories'
    ])
    courses = []
    # Let requests build/encode the query string instead of hand-concatenating.
    params = {'fields': fields, 'start': 0, 'limit': 100}
    res = requests.get(courses_URL, params=params).json()
    courses.extend(res['elements'])
    # The API signals additional pages via paging.next, which is the next
    # 'start' offset to request.
    while res['paging'].get('next') is not None:
        params['start'] = res['paging']['next']
        res = requests.get(courses_URL, params=params).json()
        courses.extend(res['elements'])
    # delete=True drops the existing table so old rows are not retained.
    create_or_delete_table('courses', delete=True)
    r.table('courses').insert(courses).run(connection)
    print('Successfully inserted all the courses.')
    return jsonify({'message': 'Successfully inserted all the courses.'})
def save_to_performance(data_size, train_score, test_score, train_size,
                        test_size, cm_train, cm_test, f1_score,
                        precision_score, recall_score, classifier,
                        vocab_model, tf_idf, corrected):
    """Persist one training run's metrics to the 'performance' table.

    The combination (data_size, classifier, vocab_model, tf_idf, corrected)
    acts as the row's identity: any existing row matching that combination is
    deleted before the fresh metrics are inserted, so each configuration has
    at most one row.
    """
    create_or_delete_table('performance')
    # Identity key shared by the delete filter and the inserted row.
    identity = {
        'data_size': data_size,
        'classifier': classifier,
        'vocab_model': vocab_model,
        'tf_idf': tf_idf,
        'corrected': corrected,
    }
    # Drop the previous record for this exact configuration, if any.
    r.table('performance').filter(identity).delete().run(connection)
    # Insert the current data: identity key plus the measured metrics.
    row = dict(identity)
    row.update({
        'train_score': train_score,
        'train_size': train_size,
        'test_score': test_score,
        'test_size': test_size,
        'cm_train': cm_train.tolist(),
        'cm_test': cm_test.tolist(),
        'f1_score': f1_score,
        'precision_score': precision_score,
        'recall_score': recall_score,
    })
    r.table('performance').insert(row).run(connection)
def main(data_size, test_size=0.2, min_df=5, vocab_model='unigram',
         tf_idf=False, corrected=False):
    """Train and evaluate a Multi-layer Perceptron classifier on the reviews.

    Parameters:
        data_size (int): Number of reviews to use; -1 means the full corpus,
            in which case the trained model and vocabulary are persisted to
            disk.
        test_size (float): Fraction of the data held out for testing.
        min_df (int): Minimum document frequency for CountVectorizer.
        vocab_model (str): 'unigram', 'bigram', or 'trigram' — selects the
            n-gram range of the vocabulary.
        tf_idf (bool): Whether to apply inverse-document-frequency weighting.
        corrected (bool): Whether to load the spell-corrected review set.

    Returns:
        tuple: (train_score, test_score, train_size, test_size, cm_train,
        cm_test, f1, precision, recall, vocabulary, data_size)
    """
    if corrected:
        reviews = load_files(dir_path + '/../data/reviews/corrected')
    else:
        reviews = load_files(dir_path + '/../data/reviews/not_corrected')

    if data_size != -1:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data[:data_size], reviews.target[:data_size],
            test_size=test_size, random_state=0)
    else:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data, reviews.target,
            test_size=test_size, random_state=0)

    # Map the vocab model name to its n-gram range. An unknown name raises
    # KeyError here instead of an UnboundLocalError further down, as the
    # original chained ifs would have.
    ngram_range = {'unigram': (1, 1), 'bigram': (2, 2),
                   'trigram': (3, 3)}[vocab_model]
    vect = CountVectorizer(min_df=min_df, stop_words=stopwords,
                           ngram_range=ngram_range)
    transformer = TfidfTransformer(use_idf=tf_idf)

    # TRAIN-TEST SPLIT (80-20)
    clf = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic',
                        verbose=True)
    # BUG FIX: the transformer was previously built but never added to the
    # pipeline, so the tf_idf flag had no effect on the trained model and
    # performance rows saved with tf_idf=True were mislabeled.
    # NOTE(review): persist_to_disk below saves only clf and the vocabulary;
    # any external consumer of the persisted model must now also apply the
    # transformer — confirm downstream usage.
    pipe = make_pipeline(vect, transformer, clf)
    pipe.fit(text_train, y_train)
    y_train_pred = pipe.predict(text_train)
    y_test_pred = pipe.predict(text_test)

    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)
    score_f1 = f1_score(y_test, y_test_pred, average='weighted')
    score_precision = precision_score(y_test, y_test_pred, average='weighted')
    score_recall = recall_score(y_test, y_test_pred, average='weighted')
    print('(sklearn) Train data accuracy:', train_score)
    print('(sklearn) Test data accuracy:', test_score)
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    # Save the features to RethinkDB server
    create_or_delete_table('features')
    insert_features(vocab_model, tf_idf, vect.vocabulary_.keys())

    # -1 means "full corpus": persist the model and report the real size.
    if data_size == -1:
        persist_to_disk('MLP', vocab_model, tf_idf, corrected, clf,
                        vect.vocabulary_)
        data_size = len(reviews.data)

    return (train_score, test_score, len(text_train), len(text_test),
            cm_train, cm_test, score_f1, score_precision, score_recall,
            vect.vocabulary_, data_size)
def main(data_size, test_size=0.2, min_df=5, vocab_model='unigram',
         tf_idf=False, corrected=False):
    """Train and evaluate a Logistic Regression classifier on the reviews.

    Parameters:
        data_size (int): Number of reviews to use; -1 means the full corpus,
            in which case the trained model and vocabulary are persisted to
            disk.
        test_size (float): Fraction of the data held out for testing.
        min_df (int): Minimum document frequency for CountVectorizer.
        vocab_model (str): 'unigram', 'bigram', or 'trigram' — selects the
            n-gram range of the vocabulary.
        tf_idf (bool): Whether to apply inverse-document-frequency weighting.
        corrected (bool): Whether to load the spell-corrected review set.

    Returns:
        tuple: (train_score, test_score, train_size, test_size, cm_train,
        cm_test, f1, precision, recall, vocabulary, data_size)
    """
    if corrected:
        reviews = load_files(dir_path + '/../data/reviews/corrected')
    else:
        reviews = load_files(dir_path + '/../data/reviews/not_corrected')

    if data_size != -1:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data[:data_size], reviews.target[:data_size],
            test_size=test_size, random_state=0)
    else:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data, reviews.target,
            test_size=test_size, random_state=0)

    # Map the vocab model name to its n-gram range. An unknown name raises
    # KeyError here instead of an UnboundLocalError further down, as the
    # original chained ifs would have.
    ngram_range = {'unigram': (1, 1), 'bigram': (2, 2),
                   'trigram': (3, 3)}[vocab_model]
    vect = CountVectorizer(min_df=min_df, stop_words=stopwords,
                           ngram_range=ngram_range)
    transformer = TfidfTransformer(use_idf=tf_idf)

    # TRAIN-TEST SPLIT (80-20)
    clf = LogisticRegression(solver='newton-cg', verbose=False)
    # BUG FIX: the transformer was previously built but never added to the
    # pipeline, so the tf_idf flag had no effect on the trained model and
    # performance rows saved with tf_idf=True were mislabeled.
    # NOTE(review): persist_to_disk below saves only clf and the vocabulary;
    # any external consumer of the persisted model must now also apply the
    # transformer — confirm downstream usage.
    pipe = make_pipeline(vect, transformer, clf)
    pipe.fit(text_train, y_train)
    y_train_pred = pipe.predict(text_train)
    y_test_pred = pipe.predict(text_test)

    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)
    score_f1 = f1_score(y_test, y_test_pred, average='weighted')
    score_precision = precision_score(y_test, y_test_pred, average='weighted')
    score_recall = recall_score(y_test, y_test_pred, average='weighted')
    print('(sklearn) Train data accuracy:', train_score)
    print('(sklearn) Test data accuracy:', test_score)
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    # Save the features to RethinkDB server
    create_or_delete_table('features')
    insert_features(vocab_model, tf_idf, vect.vocabulary_.keys())

    # -1 means "full corpus": persist the model and report the real size.
    if data_size == -1:
        persist_to_disk('LR', vocab_model, tf_idf, corrected, clf,
                        vect.vocabulary_)
        data_size = len(reviews.data)

    return (train_score, test_score, len(text_train), len(text_test),
            cm_train, cm_test, score_f1, score_precision, score_recall,
            vect.vocabulary_, data_size)