Example #1
import os

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# text, score_names, train_and_evaluate and results_to_file are project-level
# helpers assumed to be defined elsewhere in the package.


def analyze(path, alg='lasso', score='val_auc', limit=0, kfolds=10, sw=True, kwr=5000, alpha=(1,)):
    # Craft path for outputs from parameters
    out_path = 'results/analysis/'
    expname = os.path.splitext(os.path.basename(path))[0] + '_' + alg
    if kwr is not None:
        expname += '_rank' + str(kwr)
    if sw:
        expname += '_sw'
    out_path += expname
    os.makedirs(out_path, exist_ok=True)
    log = open(out_path + '/log.txt', 'w')

    # Translate a short score name (e.g. 'val_auc') into the long key used in
    # the results dict returned by train_and_evaluate
    if score not in score_names.values():
        score = score_names[score]
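    # score_names itself is defined elsewhere in the project; judging by the
    # keys used below, it presumably looks something like (assumption):
    #   score_names = {'val_auc': 'Validation_AUC', 'val_f1': 'Validation_F1', ...}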

    # Load the CSV of tweets and separate the label column
    tweets_df = pd.read_csv(path, delimiter=';', encoding='utf-8')
    label = tweets_df['gender']
    tweets_df = tweets_df.drop(columns='gender')

    # Preprocess the text, then hold out 20% of the data for testing; the split
    # happens before vectorization so the test vocabulary cannot leak into training
    tweets_df, label = text.preprocess(tweets_df, label)
    y = np.array(label).reshape((-1,))
    X_train, X_test, y_train, y_test = train_test_split(tweets_df, y, test_size=0.2)

    # Binary bag-of-words over single tokens, capped at the kwr most frequent
    # terms; the token pattern assumes text.preprocess leaves space-separated tokens
    feat_ext = CountVectorizer(encoding='utf-8', token_pattern=r'[^ ]+', ngram_range=(1, 1),
                               analyzer='word', max_features=kwr, binary=True)
    X_train = feat_ext.fit_transform(X_train)
    X_test = feat_ext.transform(X_test)

    best = 0
    for a in alpha:
        # scikit-learn's C is the inverse of the regularization strength alpha
        c_reg = 1 / a
        if alg in ['lasso', 'l1']:
            model = LogisticRegression(class_weight='balanced', solver='liblinear', penalty='l1', C=c_reg)
        elif alg in ['ridge', 'l2']:
            model = LogisticRegression(class_weight='balanced', C=c_reg)
        elif alg == 'svm':
            model = LinearSVC(class_weight='balanced', C=c_reg)
        else:
            raise ValueError('Unknown algorithm: ' + alg)

        # Run the experiment; train_and_evaluate is assumed to return both the
        # scores dict and the fitted analyzer object referenced below
        res, analyzer = train_and_evaluate(X_train, y_train, X_test, y_test, model, kfolds)

        # Write per-alpha results (the output directory already exists)
        with open(out_path + '/a' + str(a).replace('.', '_') + '.txt', 'w') as res_file:
            results_to_file(top=500, to=res_file)

        # Write to log file
        print('Experiment', expname, ', alpha', a, '...', file=log)
        print('Train - Validation', file=log)
        print('F1:', res['Train_F1'], '-', res['Validation_F1'], file=log)
        print('Precision:', res['Train_Precision'], '-', res['Validation_Precision'], file=log)
        print('Recall:', res['Train_Recall'], '-', res['Validation_Recall'], file=log)
        print('AUC:', res['Train_AUC'], '-', res['Validation_AUC'], file=log)
        print('', file=log)

        if res[score] > best:
            best = res[score]
            best_analyzer = analyzer
            best_a = a

    print('Best experiment: Alpha =', best_a, file=log)
    print('Test scores', file=log)
    print('F1:', best_analyzer.scores['Test_F1'], file=log)
    print('Precision:', best_analyzer.scores['Test_Precision'], file=log)
    print('Recall:', best_analyzer.scores['Test_Recall'], file=log)
    print('AUC:', best_analyzer.scores['Test_AUC'], file=log)
    log.close()
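
A minimal usage sketch, assuming a semicolon-separated CSV with a 'gender' column and the project helpers imported above; the file name and alpha grid here are illustrative:

if __name__ == '__main__':
    # Sweep several regularization strengths; analyze() logs per-alpha
    # validation scores and the test scores of the best run
    analyze('tweets.csv', alg='lasso', score='val_auc',
            kfolds=10, sw=True, kwr=5000, alpha=(0.01, 0.1, 1, 10))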