Example #1
def run_supervised_models(models, variables):
    with open('working_data/machinecoding/datasets_likelihood.pickle', 'rb') as f:
        datasets_likelihood = pickle.load(f)
    with open('working_data/machinecoding/datasets_binary.pickle', 'rb') as f:
        datasets_binary = pickle.load(f)

    # Combine the likelihood-based and binary datasets into a single lookup
    datasets = {**datasets_likelihood, **datasets_binary}

    for model in models['models']:
        for scoring in models['scoring_parameters']:
            for dataset_name, dataset in datasets.items():
                for variable in variables:
                    train_set = dataset[dataset['traintest']=='train']
                    test_set = dataset[dataset['traintest']=='test']
                    features = get_features(dataset_name, dataset)
                    pipeline = model['pipeline']
                    tuned_parameters = model['tuned_parameters']
                    classifier_name = 'supervised_' + model['model_name'] + '_' + variable + '_' + dataset_name + '_' + scoring
                    logmsg('Starting ' + classifier_name)
                    try:
                        execute_pipeline(features, variable, pipeline, tuned_parameters, scoring, train_set, test_set, classifier_name)
                        calculate_precision_recall(classifier_name+'.pkl', [variable,], classifier_name)
                    except Exception as e:
                        logmsg('error: ' + classifier_name)
                        logmsg(str(e))
    return
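
The loop above expects the models argument to have a particular shape: a dict with a 'models' list (each entry carrying 'model_name', 'pipeline' and 'tuned_parameters') and a 'scoring_parameters' list. A minimal sketch of such a configuration, assuming scikit-learn pipelines (illustrative only; the actual models_supervised config used by the project may differ):

from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

models_supervised_example = {
    'models': [
        {
            'model_name': 'naive_bayes',
            'pipeline': Pipeline([('clf', MultinomialNB())]),
            # hypothetical parameter grid handed to the search inside execute_pipeline
            'tuned_parameters': {'clf__alpha': [0.1, 1.0]},
        },
    ],
    'scoring_parameters': ['f1'],
}

run_supervised_models(models_supervised_example,
                      variables=['gen_people', 'gen_profit', 'gen_planet'])
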
Example #2
def run_expert_models(variables):
    logmsg('Starting with predictions for Expert coding')
    logmsg(
        '... looking for expert coding selections at working_data/expertcoding'
    )
    logmsg('... files should be named expert_coding_<<MODELNAME>>.xlsx')
    models = [
        item for item in os.listdir('working_data/expertcoding')
        if ('.xlsx' in item) and (item.startswith('expert_coding_'))
    ]
    if len(models) == 0:
        logmsg('ERROR: no files with coding selections found')
        return

    logmsg('models_found: ' + ', '.join(models))

    traintest = pd.read_pickle(
        'working_data/manualcoding/manualcoding_traintest.pkl')
    logmsg('loaded manual coding of subsample. N = ' + str(len(traintest)))

    traintest_unique_ids = traintest['unique_photo_id'].unique().tolist()
    logmsg('total of unique IDs in subsample. N = ' +
           str(len(traintest_unique_ids)))

    # Loading machine coding
    google = pd.read_pickle('working_data/machinecoding/google_parsed.pkl')
    clarifai = pd.read_pickle('working_data/machinecoding/clarifai_parsed.pkl')
    microsoft = pd.read_pickle(
        'working_data/machinecoding/microsoft_parsed.pkl')

    google['unique_photo_id'] = google['unique_photo_id'].apply(cleanuniqueid)
    clarifai['unique_photo_id'] = clarifai['unique_photo_id'].apply(
        cleanuniqueid)
    microsoft['unique_photo_id'] = microsoft['unique_photo_id'].apply(
        cleanuniqueid)

    # Ensuring alignment between manual and machine coding

    clarifai = clarifai[clarifai['unique_photo_id'].isin(traintest_unique_ids)]
    google = google[google['unique_photo_id'].isin(traintest_unique_ids)]
    microsoft = microsoft[microsoft['unique_photo_id'].isin(
        traintest_unique_ids)]

    logmsg(
        'Clarifai, Google and Microsoft datasets now aligned with train/test dataset per unique id'
    )

    # Google Labels
    google_label_detection = google[google['classifier'] ==
                                    'google_label_detection'][[
                                        'unique_photo_id',
                                        'label_description',
                                        'label_mid',
                                        'label_score',
                                    ]]
    logmsg('Loaded Google Labels, total rows = ' +
           str(len(google_label_detection)))
    google_label_detection['label_description'] = google_label_detection[
        'label_description'].apply(renametags,
                                   args=('google_label_detection_', ))
    # google_label_detection_reworked = pd.pivot_table(google_label_detection, values='label_score', index=['unique_photo_id', ],
    #                  columns=['label_description'], aggfunc=np.sum).fillna(0).reset_index()

    google_label_detection = google_label_detection.rename(
        columns={
            'label_description': 'label',
            'label_score': 'likelihood'
        })
    google_label_detection['classifier'] = 'google_label_detection'

    # Clarifai Labels
    clarifai_labels = clarifai[[
        'unique_photo_id', 'clarifai_label', 'clarifai_likelihood_value'
    ]]
    logmsg('Loaded Clarifai Labels, total rows = ' + str(len(clarifai_labels)))

    clarifai_labels['clarifai_label'] = clarifai_labels[
        'clarifai_label'].apply(renametags, args=('clarifai_', ))
    # clarifai_labels_reworked = pd.pivot_table(clarifai_labels, values='clarifai_likelihood_value', index=['unique_photo_id', ],
    #                  columns=['clarifai_label'], aggfunc=np.sum).fillna(0).reset_index()
    clarifai_labels = clarifai_labels.rename(
        columns={
            'clarifai_label': 'label',
            'clarifai_likelihood_value': 'likelihood'
        })
    clarifai_labels['classifier'] = 'clarifai'

    # Microsoft Labels
    microsoft_tags = microsoft[microsoft.classifier == 'microsoft_tags'][[
        'unique_photo_id', 'microsoft_tags_name', 'microsoft_tags_score'
    ]]
    logmsg('Loaded Microsoft Tags, total rows = ' + str(len(microsoft_tags)))

    microsoft_tags['microsoft_tags_name'] = microsoft_tags[
        'microsoft_tags_name'].apply(renametags, args=('microsoft_tags', ))
    # microsoft_tags_reworked = pd.pivot_table(microsoft_tags, values='microsoft_tags_score', index=['unique_photo_id', ],
    #                     columns=['microsoft_tags_name'], aggfunc=np.sum).fillna(0).reset_index()
    microsoft_tags = microsoft_tags.rename(columns={
        'microsoft_tags_name': 'label',
        'microsoft_tags_score': 'likelihood'
    })
    microsoft_tags['classifier'] = 'microsoft_tags'

    microsoft_category = microsoft[
        microsoft.classifier == 'microsoft_category'][[
            'unique_photo_id', 'microsoft_category_label',
            'microsoft_category_score'
        ]]
    logmsg('Loaded Microsoft Category, total rows = ' +
           str(len(microsoft_category)))

    microsoft_category['microsoft_category_label'] = microsoft_category[
        'microsoft_category_label'].apply(renametags,
                                          args=('microsoft_category', ))
    # microsoft_category_reworked = pd.pivot_table(microsoft_category, values='microsoft_category_score', index=['unique_photo_id', ],
    #                     columns=['microsoft_category_label'], aggfunc=np.sum).fillna(0).reset_index()
    microsoft_category = microsoft_category.rename(
        columns={
            'microsoft_category_label': 'label',
            'microsoft_category_score': 'likelihood'
        })
    microsoft_category['classifier'] = 'microsoft_category'

    logmsg('Creating dataset with tags for all APIs')
    # dataset = traintest.merge(clarifai_labels_reworked, how="left").merge(microsoft_tags_reworked, how="left").merge(microsoft_category_reworked, how="left").merge(google_label_detection_reworked, how="left")
    # dataset = dataset.fillna(0)
    dataset = pd.concat(
        [google_label_detection, microsoft_category, microsoft_tags, clarifai_labels])
    dataset = dataset[['unique_photo_id', 'classifier', 'label', 'likelihood']]

    for model in models:
        model_name = model.replace('.xlsx', '')
        # the last underscore-separated token of the filename encodes the
        # confidence threshold, prefixed with 'l' (e.g. '..._l0.9' -> 0.9)
        confidence = float(model_name.split('_')[-1].replace('l', ''))
        expert_rules = pd.read_excel('working_data/expertcoding/' + model)
        expert_rules = expert_rules.fillna(0)
        test_set = traintest[traintest['traintest'] == 'test']

        logmsg('...total rows for labels ' + str(len(dataset)))
        logmsg('...eliminating all labels with confidence lower than ' +
               str(confidence))
        dataset_for_predictions = dataset[dataset['likelihood'] >= confidence]
        logmsg('...total rows now = ' + str(len(dataset_for_predictions)))

        logmsg('Starting with predictions for ' + model_name)
        for variable in variables:
            classifier_name = model_name
            logmsg('...' + variable)
            test_set = predict_expert_coding(test_set, dataset_for_predictions,
                                             expert_rules, variable,
                                             classifier_name)

        test_set.to_pickle('working_data/predictions/' + model_name + '.pkl')
        calculate_precision_recall(model_name + '.pkl', variables, model_name)

    logmsg('...completed predictions based on expert coding')

    return
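
The filename parsing in the loop above implies a naming convention in which the last underscore-separated token carries the confidence threshold, prefixed with an 'l'. A small illustration, using a hypothetical file name:

filename = 'expert_coding_mymodel_l0.9.xlsx'   # hypothetical example file
model_name = filename.replace('.xlsx', '')     # 'expert_coding_mymodel_l0.9'
confidence = float(model_name.split('_')[-1].replace('l', ''))
print(confidence)                              # 0.9
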
Example #3
def process_dataset(datasetname, dataset, variables, num_topics, alpha,
                    traintest, overall_coherence, unsupervised_models_config):
    # Loading corpus, dictionary and LDA model associated with dataset
    corpus = corpora.MmCorpus('working_data/unsupervised/topics/corpus_' +
                              datasetname + '.mm')
    dictionary = corpora.Dictionary.load(
        'working_data/unsupervised/topics/dic_' + datasetname + '.dict')
    lda = models.LdaModel.load('working_data/unsupervised/topics/model_' +
                               datasetname + '_' + str(num_topics) + '_' +
                               str(alpha) + '.lda')

    # Creating and storing predictions
    topics_full = lda.inference(corpus)
    # Reset the index twice so that a positional 'level_0' column is created;
    # it is used below to align rows with the topic-loadings frame.
    dataset = dataset.reset_index()
    dataset = dataset.reset_index()

    topics_full = pd.DataFrame(topics_full[0])
    topics_full = topics_full.reset_index()
    topics_full = topics_full.rename(columns={'index': 'level_0'})
    topicnames = [item for item in topics_full.columns if type(item) == int]
    dataset = dataset.merge(topics_full)

    # Identifying topic for the document based on highest loading
    max_topic = dataset[topicnames].idxmax(axis=1)
    max_topic = pd.DataFrame(max_topic).reset_index().rename(columns={
        'index': 'level_0',
        0: 'maxtopic'
    })
    dataset = dataset.merge(max_topic)

    dataset = dataset.apply(check_equality, axis=1, args=(topicnames, ))

    ## Dropping missing values
    dataset = dataset.dropna(subset=['maxtopic'])

    # Adding manual coding targets
    dataset = traintest.merge(dataset)

    # Performing NB
    train_set = dataset[dataset['traintest'] == 'train']
    test_set = dataset[dataset['traintest'] == 'test']

    for model in unsupervised_models_config['models']:
        for scoring in unsupervised_models_config['scoring_parameters']:
            for variable in variables:
                features = ['maxtopic']
                pipeline = model['pipeline']
                tuned_parameters = model['tuned_parameters']
                classifier_name = 'unsupervised_' + datasetname + '_' + str(
                    num_topics) + '_' + str(alpha) + '_' + model[
                        'model_name'] + '_' + variable + '_' + scoring
                logmsg('Starting ' + classifier_name)
                try:
                    execute_pipeline_unsupervised(features, variable, pipeline,
                                                  tuned_parameters, scoring,
                                                  train_set, test_set,
                                                  classifier_name)
                    calculate_precision_recall(classifier_name + '.pkl', [
                        variable,
                    ], classifier_name)
                except Exception as e:
                    logmsg('error: ' + classifier_name)
                    logmsg(str(e))
    return
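
The core pattern above is: run LDA inference over the corpus, turn the document-topic loadings into a DataFrame, and take the highest-loading topic per document as the 'maxtopic' feature. A self-contained sketch of that pattern on a toy corpus (the real corpus, dictionary and model are loaded from working_data/unsupervised/topics/):

import pandas as pd
from gensim import corpora, models

texts = [['solar', 'panel', 'energy'],
         ['profit', 'revenue', 'growth'],
         ['community', 'volunteer', 'people']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=1)

gamma, _ = lda.inference(corpus)    # document-topic loadings, shape (n_docs, n_topics)
loadings = pd.DataFrame(gamma)      # integer column names are topic ids
maxtopic = loadings.idxmax(axis=1)  # highest-loading topic per document
print(maxtopic.tolist())
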
Example #4
def run_routine():
    print('Executing the first step')

    print('Loading the API keys')
    #1. Run the images through the APIs

    # Locating API keys
    from keys import api_keys
    for key in api_keys:
        if api_keys[key] is None:
            raise Exception('API key information for ' + key +
                            ' not filled out.\nPlease update the keys.py file.')

    print('API keys loaded:', ' '.join(list(api_keys.keys())))

    # Running clarifai

    from helpers.models.clarifai_classifier import run_clarifai_classifier

    run_clarifai_classifier(api_keys['clarifai_api_key'],
                            'working_data/machinecoding/clarifai',
                            'source/subsample')

    from helpers.models.clarifai_parser import run_clarifai_parser

    run_clarifai_parser('working_data/machinecoding/',
                        'working_data/machinecoding/clarifai')

    # Running Microsoft
    from helpers.models.microsoft_classifier import run_microsoft_classifier
    run_microsoft_classifier(api_keys['microsoft_api_key'],
                             'working_data/machinecoding/microsoft',
                             'source/subsample')

    from helpers.models.microsoft_parser import run_microsoft_parser

    run_microsoft_parser('working_data/machinecoding/',
                         'working_data/machinecoding/microsoft')

    # Running Google
    from helpers.models.google_classifier import run_google_classifier

    run_google_classifier('working_data/machinecoding/google',
                          'source/subsample')

    from helpers.models.google_parser import run_google_parser

    run_google_parser('working_data/machinecoding/',
                      'working_data/machinecoding/google')

    #2. Do the train/test split

    from helpers.analysis.train_test_split import run_train_test_split
    variables = run_train_test_split(train_test_ratio=0.9)

    #3. Do the API training and report results

    from helpers.custommodels.clarifai_trainer import create_clarifai_model, add_images_clarifai, train_clarifai_model, predict_testset_clarifai

    create_clarifai_model(api_keys['clarifai_api_key'],
                          api_keys['clarifai_custom_model_name'], variables)

    add_images_clarifai(api_keys['clarifai_api_key'])

    model = train_clarifai_model(api_keys['clarifai_api_key'],
                                 api_keys['clarifai_custom_model_name'])

    predict_testset_clarifai(api_keys['clarifai_api_key'],
                             api_keys['clarifai_custom_model_name'],
                             confidence=0.5)

    from helpers.analysis.classification_report import calculate_precision_recall

    calculate_precision_recall('clarifai_custom_model.pkl',
                               ['gen_people', 'gen_profit', 'gen_planet'],
                               'clarifaicustom')

    #4. Export tags for Expert training

    from helpers.analysis.expert_coding import export_tags_expert_coding

    export_tags_expert_coding(['gen_people', 'gen_profit', 'gen_planet'],
                              confidence=[0.7, 0.9, 0.95, 0.99])

    #5. Run supervised machine learning
    from helpers.analysis.supervised_models import create_features, run_supervised_models
    from supervised_models_config import models_supervised
    # Set the confidence threshold for the binary datasets to 0: any tag with a
    # likelihood above 0 (i.e. detected at all, even marginally) is treated as present.
    create_features(confidence_for_binary=0)
    run_supervised_models(models_supervised,
                          variables=['gen_people', 'gen_profit', 'gen_planet'])

    #6. Retrieve expert coding and run predictions
    from helpers.analysis.expert_coding import run_expert_models
    from helpers.analysis.classification_report import calculate_precision_recall
    run_expert_models(['gen_people', 'gen_profit', 'gen_planet'])

    #7. Unsupervised
    from helpers.analysis.unsupervised_models import create_features_unsupervised, create_unsupervised_LDA_models, select_unsupervised_models
    from unsupervised_models_config import models_unsupervised
    create_features_unsupervised(
        likelihoods=api_keys['unsupervised_likelihoods'],
        freqthresholds=api_keys['unsupervised_thresholds'])
    create_unsupervised_LDA_models(
        num_topics_list=api_keys['unsupervised_num_topics_list'],
        alphas=api_keys['unsupervised_alphas'])
    select_unsupervised_models(['gen_people', 'gen_profit', 'gen_planet'],
                               models_unsupervised,
                               multiclass=False)
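
run_routine pulls its credentials and tuning parameters from a keys.py module exposing an api_keys dict. A hypothetical sketch of that module, using only the key names the routine actually accesses (all values are placeholders; the routine raises if any entry is left as None):

# keys.py (hypothetical placeholder values)
api_keys = {
    'clarifai_api_key': 'YOUR_CLARIFAI_API_KEY',
    'clarifai_custom_model_name': 'my_custom_model',
    'microsoft_api_key': 'YOUR_MICROSOFT_API_KEY',
    # parameters consumed by the unsupervised step (placeholder values)
    'unsupervised_likelihoods': [0.7, 0.9],
    'unsupervised_thresholds': [5, 10],
    'unsupervised_num_topics_list': [10, 20],
    'unsupervised_alphas': ['symmetric'],
}
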