Example #1
def save_all_partners():
    """
	Retrieves all the partners and their fields directly from the Coursera Catalog API and saves it to the RethinkDB server
	"""
    fields = ','.join([
        'id', 'name', 'shortName', 'description', 'banner', 'courseIds',
        'instructorIds', 'primaryColor', 'logo', 'squareLogo',
        'rectangularLogo', 'links', 'location'
    ])

    partners = []

    res = requests.get(partners_URL + '?fields=' + fields +
                       '&start=0&limit=100').json()
    partners.extend(res['elements'])

    while res['paging'].get('next') is not None:
        res = requests.get(partners_URL + '?fields=' + fields + '&start=' +
                           res['paging']['next'] + '&limit=100').json()
        partners.extend(res['elements'])

    create_or_delete_table('partners', delete=True)
    r.table('partners').insert(partners).run(connection)
    print('Successfully inserted all the partners.')

    return jsonify({'message': 'Successfully inserted all the partners.'})
Example #2
def save_all_instructors():
    """
	Retrieves all the instructors and their fields directly from the Coursera Catalog API and saves it to the RethinkDB server
	"""
    fields = ','.join([
        'id', 'photo', 'photo150', 'bio', 'prefixName', 'firstName',
        'middleName', 'lastName', 'suffixName', 'fullName', 'title',
        'department', 'website', 'websiteTwitter', 'websiteFacebook',
        'websiteLinkedin', 'websiteGplus', 'shortName'
    ])

    instructors = []

    res = requests.get(instructors_URL + '?fields=' + fields +
                       '&start=0&limit=100').json()
    instructors.extend(res['elements'])

    while res['paging'].get('next') is not None:
        res = requests.get(instructors_URL + '?fields=' + fields + '&start=' +
                           res['paging']['next'] + '&limit=100').json()
        instructors.extend(res['elements'])

    create_or_delete_table('instructors', delete=True)
    r.table('instructors').insert(instructors).run(connection)
    print('Successfully inserted all the instructors.')

    return jsonify({'message': 'Successfully inserted all the instructors.'})
Example #3
def save_all_courses():
    """
	Retrieves all the courses and their fields directly from the Coursera Catalog API and saves it to the RethinkDB server
	"""
    fields = ','.join([
        'id', 'slug', 'courseType', 'name', 'primaryLanguages',
        'subtitleLanguages', 'partnerLogo', 'instructorIds', 'partnerIds',
        'photoUrl', 'certificates', 'description', 'startDate', 'workload',
        'previewLink', 'specializations', 's12nIds', 'domainTypes',
        'categories'
    ])

    courses = []

    res = requests.get(courses_URL + '?fields=' + fields +
                       '&start=0&limit=100').json()
    courses.extend(res['elements'])

    while res['paging'].get('next') is not None:
        res = requests.get(courses_URL + '?fields=' + fields + '&start=' +
                           res['paging']['next'] + '&limit=100').json()
        courses.extend(res['elements'])

    create_or_delete_table('courses', delete=True)
    r.table('courses').insert(courses).run(connection)
    print('Successfully inserted all the courses.')

    return jsonify({'message': 'Successfully inserted all the courses.'})
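Examples #1 through #3 differ only in the endpoint URL, the field list, and the target table, so the paginated fetch-and-save pattern can be factored out. A minimal sketch under that assumption; fetch_and_save is a hypothetical name, while requests, r, connection, and create_or_delete_table are the names the examples above already use:

def fetch_and_save(base_url, fields, table):
    """Fetch every page from a Coursera Catalog endpoint and save it."""
    elements = []
    start = '0'
    while start is not None:
        # Keep the same page size (100) as the examples above
        res = requests.get(base_url + '?fields=' + ','.join(fields) +
                           '&start=' + str(start) + '&limit=100').json()
        elements.extend(res['elements'])
        start = res['paging'].get('next')  # absent on the last page

    create_or_delete_table(table, delete=True)
    r.table(table).insert(elements).run(connection)
    print('Successfully inserted all the ' + table + '.')

With this helper, save_all_partners() reduces to one fetch_and_save(partners_URL, [...], 'partners') call followed by the jsonify response.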
Example #4
def save_to_performance(data_size, train_score, test_score, train_size,
                        test_size, cm_train, cm_test, f1_score,
                        precision_score, recall_score, classifier, vocab_model,
                        tf_idf, corrected):
    """
	Save all the important performance measures to the table performance
	"""

    create_or_delete_table('performance')

    # Delete any existing row with the same data_size, classifier, vocab_model,
    # tf_idf, and corrected (together these serve as the unique identifier per row)
    r.table('performance').filter({
        'data_size': data_size,
        'classifier': classifier,
        'vocab_model': vocab_model,
        'tf_idf': tf_idf,
        'corrected': corrected
    }).delete().run(connection)

    # Insert the current data
    r.table('performance').insert({
        'data_size': data_size,
        'classifier': classifier,
        'vocab_model': vocab_model,
        'tf_idf': tf_idf,
        'corrected': corrected,
        'train_score': train_score,
        'train_size': train_size,
        'test_score': test_score,
        'test_size': test_size,
        'cm_train': cm_train.tolist(),
        'cm_test': cm_test.tolist(),
        'f1_score': f1_score,
        'precision_score': precision_score,
        'recall_score': recall_score
    }).run(connection)
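The delete-then-insert above is a two-step upsert keyed on five fields. RethinkDB's insert also accepts a conflict argument, so folding those fields into a deterministic primary key performs the same upsert in a single atomic call. A minimal sketch, assuming the table keeps the default id primary key (the composite-id scheme below is an illustration, not the project's actual format):

def upsert_performance(row):
    # Hypothetical composite primary key built from the identifying fields
    row['id'] = '{data_size}:{classifier}:{vocab_model}:{tf_idf}:{corrected}'.format(**row)
    create_or_delete_table('performance')
    # conflict='replace' overwrites any existing document with the same id
    r.table('performance').insert(row, conflict='replace').run(connection)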
Example #5
File: MLP.py Project: septa97/SP-server
def main(data_size,
         test_size=0.2,
         min_df=5,
         vocab_model='unigram',
         tf_idf=False,
         corrected=False):
    """
	Perform Multi-layer Perceptron Classifier on the current data
	"""
    if corrected:
        reviews = load_files(dir_path + '/../data/reviews/corrected')
    else:
        reviews = load_files(dir_path + '/../data/reviews/not_corrected')

    if data_size != -1:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data[:data_size],
            reviews.target[:data_size],
            test_size=test_size,
            random_state=0)
    else:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data, reviews.target, test_size=test_size, random_state=0)

    # The vocabulary model determines the n-gram range; the tf_idf flag
    # controls whether inverse-document-frequency weighting is applied.
    ngram_ranges = {'unigram': (1, 1), 'bigram': (2, 2), 'trigram': (3, 3)}
    vect = CountVectorizer(min_df=min_df,
                           stop_words=stopwords,
                           ngram_range=ngram_ranges[vocab_model])
    transformer = TfidfTransformer(use_idf=tf_idf)

    # PIPELINED WITH K-FOLD CROSS VALIDATION
    # Multi-layer Perceptron Classifier (with 1 hidden layer with 100 hidden units)
    # clf = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', verbose=False)

    # parameters = {
    # 	'clf__alpha': np.logspace(-4, 4, 10).tolist()
    # }

    # pipe = Pipeline([
    # 		('vect', vect),
    # 		('transformer', transformer),
    # 		('clf', clf)
    # 	])

    # grid_search = GridSearchCV(pipe, parameters, cv=10, n_jobs=-1, verbose=True)
    # grid_search.fit(text_train, y_train)

    # y_train_pred = grid_search.predict(text_train)
    # y_test_pred = grid_search.predict(text_test)

    # TRAIN-TEST SPLIT (80-20)
    clf = MLPClassifier(hidden_layer_sizes=(100,),
                        activation='logistic',
                        verbose=True)

    # Include the transformer so that the tf_idf flag actually takes effect
    pipe = make_pipeline(vect, transformer, clf)
    pipe.fit(text_train, y_train)

    y_train_pred = pipe.predict(text_train)
    y_test_pred = pipe.predict(text_test)
    ###################################################
    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)
    score_f1 = f1_score(y_test, y_test_pred, average='weighted')
    score_precision = precision_score(y_test, y_test_pred, average='weighted')
    score_recall = recall_score(y_test, y_test_pred, average='weighted')

    print('(sklearn) Train data accuracy:', train_score)
    print('(sklearn) Test data accuracy:', test_score)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    # Save the features to RethinkDB server
    create_or_delete_table('features')
    insert_features(vocab_model, tf_idf, vect.vocabulary_.keys())

    if data_size == -1:
        persist_to_disk('MLP', vocab_model, tf_idf, corrected, clf,
                        vect.vocabulary_)
        data_size = len(reviews.data)

    return (train_score, test_score, len(text_train), len(text_test),
            cm_train, cm_test, score_f1, score_precision, score_recall,
            vect.vocabulary_, data_size)
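The tuple returned here lines up with the parameters of save_to_performance from example #4. A hedged wiring sketch, assuming main is importable as MLP.main and save_to_performance is in scope (both module paths are assumptions):

from MLP import main as run_mlp  # assumed module layout

(train_score, test_score, train_size, test_size, cm_train, cm_test,
 score_f1, score_precision, score_recall, vocabulary,
 data_size) = run_mlp(data_size=-1, vocab_model='unigram', tf_idf=True)

save_to_performance(data_size, train_score, test_score, train_size,
                    test_size, cm_train, cm_test, score_f1, score_precision,
                    score_recall, 'MLP', 'unigram', True, corrected=False)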
Example #6
File: LR.py Project: septa97/SP-server
def main(data_size,
         test_size=0.2,
         min_df=5,
         vocab_model='unigram',
         tf_idf=False,
         corrected=False):
    """
	Perform Logistic Regression on the current data
	"""
    if corrected:
        reviews = load_files(dir_path + '/../data/reviews/corrected')
    else:
        reviews = load_files(dir_path + '/../data/reviews/not_corrected')

    if data_size != -1:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data[:data_size],
            reviews.target[:data_size],
            test_size=test_size,
            random_state=0)
    else:
        text_train, text_test, y_train, y_test = train_test_split(
            reviews.data, reviews.target, test_size=test_size, random_state=0)

    # The vocabulary model determines the n-gram range; the tf_idf flag
    # controls whether inverse-document-frequency weighting is applied.
    ngram_ranges = {'unigram': (1, 1), 'bigram': (2, 2), 'trigram': (3, 3)}
    vect = CountVectorizer(min_df=min_df,
                           stop_words=stopwords,
                           ngram_range=ngram_ranges[vocab_model])
    transformer = TfidfTransformer(use_idf=tf_idf)

    # Logistic Regression with Stratified K-Fold. Uses all the CPU cores
    # clf = LogisticRegressionCV(solver='newton-cg', n_jobs=-1, cv=10, verbose=False)

    # pipe = Pipeline([
    # 		('vect', vect),
    # 		('transformer', transformer),
    # 		('clf', clf)
    # 	])
    # pipe.fit(text_train, y_train)

    # y_train_pred = pipe.predict(text_train)
    # y_test_pred = pipe.predict(text_test)

    # TRAIN-TEST SPLIT (80-20)
    clf = LogisticRegression(solver='newton-cg', verbose=False)

    # Include the transformer so that the tf_idf flag actually takes effect
    pipe = make_pipeline(vect, transformer, clf)
    pipe.fit(text_train, y_train)

    y_train_pred = pipe.predict(text_train)
    y_test_pred = pipe.predict(text_test)
    ###################################################
    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)
    score_f1 = f1_score(y_test, y_test_pred, average='weighted')
    score_precision = precision_score(y_test, y_test_pred, average='weighted')
    score_recall = recall_score(y_test, y_test_pred, average='weighted')

    print('(sklearn) Train data accuracy:', train_score)
    print('(sklearn) Test data accuracy:', test_score)

    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    # Save the features to RethinkDB server
    create_or_delete_table('features')
    insert_features(vocab_model, tf_idf, vect.vocabulary_.keys())

    if data_size == -1:
        persist_to_disk('LR', vocab_model, tf_idf, corrected, clf,
                        vect.vocabulary_)
        data_size = len(reviews.data)

    return (train_score, test_score, len(text_train), len(text_test),
            cm_train, cm_test, score_f1, score_precision, score_recall,
            vect.vocabulary_, data_size)
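Since MLP.py and LR.py expose the same main signature and return the same tuple, a small driver can sweep both classifiers across the vocabulary models and persist every run. A minimal sketch, assuming the two modules and save_to_performance are importable under these names:

import MLP
import LR

for name, run in (('MLP', MLP.main), ('LR', LR.main)):
    for vocab_model in ('unigram', 'bigram', 'trigram'):
        for tf_idf in (False, True):
            (train_score, test_score, train_size, test_size, cm_train,
             cm_test, score_f1, score_precision, score_recall, _vocab,
             data_size) = run(data_size=5000, vocab_model=vocab_model,
                              tf_idf=tf_idf)
            save_to_performance(data_size, train_score, test_score,
                                train_size, test_size, cm_train, cm_test,
                                score_f1, score_precision, score_recall,
                                name, vocab_model, tf_idf, corrected=False)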