def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Only Retain relevant data
    data.loc[data.names.isnull(), 'names'] = ""
    data = data.loc[~(data.names == '')][['names']]

    # Clean Names
    data['clean_names'] = ''
    for i, row in data.iterrows():
        names = row['names']
        data.loc[i, 'clean_names'] = clean_labeled_names(names, names)

    # Write out annotated file
    if args.use_civis:
        export_civis(data, args.output_file.replace(".csv", ""),
                     args.database_name)
    else:
        data.to_csv(Path(home, "Output_Data", args.output_file), index=False)
def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Fix NA Values
    data.loc[data.triplemessage.isnull(), 'triplemessage'] = ""
    data.loc[data.voterresponse.isnull(), 'voterresponse'] = ""
    data.loc[data.voterfinal.isnull(), 'voterfinal'] = ""
    data.loc[data.voterpost.isnull(), 'voterpost'] = ""
    data.loc[data.names.isnull(), 'names'] = ""

    # Only Retain relevant data
    data = data.loc[~(data.names == '')]

    # Clean Names
    data['clean_names'] = ''
    data['review'] = False
    for i, row in data.iterrows():
        names = row['names']
        response = row['voterresponse'] + ' ' + row['voterfinal'] + ' ' + row['voterpost']
        clean_names, review = clean_labeled_names(names, response)
        data.loc[i, 'clean_names'] = clean_names
        data.loc[i, 'review'] = review

    # Write out annotated file
    if args.use_civis:
        export_civis(data, args.output_file.replace(".csv", ""),
                     args.database_name)
    else:
        data.to_csv(Path(home, "Output_Data", args.output_file), index=False)
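# --- Illustrative usage sketch (not part of the original scripts) ---
# A minimal argparse entrypoint showing how main(args) above could be invoked.
# The flag names mirror the attributes read from `args` (home_folder,
# input_data_filename, output_file, database_name, use_civis); the defaults
# and the store_true convention for --use_civis are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Clean labeled names")
    parser.add_argument("--home_folder", default=".")
    parser.add_argument("--input_data_filename", default="labeled_names.csv")  # hypothetical default
    parser.add_argument("--output_file", default="clean_names.csv")  # hypothetical default
    parser.add_argument("--database_name", default="")
    parser.add_argument("--use_civis", action="store_true")
    main(parser.parse_args())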
def main(args):

    # Set home directory
    home = Path(args.home_folder)
    print(args.database_name)
    print(args.input_data_filename)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
        for col in [
                'noresponse', 'negresponse', 'posresponse', 'affirmresponse',
                'finalaffirmresponse'
        ]:
            data[col] = (data[col] == 't').astype(bool)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75
    MID_BOUND = .5

    # Ensure data has the right columns
    for col in [
            'noresponse', 'negresponse', 'posresponse', 'affirmresponse',
            'finalaffirmresponse', 'triplemessage', 'voterresponse',
            'voterfinal', 'voterpost', 'conversationid', 'contact_phone'
    ]:
        if col not in data.columns:
            raise Exception("%s must be a valid column in the dataset" % col)

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)
        model_van_name = pickle.load(f)
        van_vectorizer = pickle.load(f)
        Features = pickle.load(f)
        model_token_bow = pickle.load(f)
        van_token_vectorizer = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    print("Cleaning and Featurizing...")

    # Fix NA Values
    data.loc[data.triplemessage.isnull(), 'triplemessage'] = ""
    data.loc[data.voterresponse.isnull(), 'voterresponse'] = ""
    data.loc[data.voterfinal.isnull(), 'voterfinal'] = ""
    data.loc[data.voterpost.isnull(), 'voterpost'] = ""

    # Fix Auto Replies
    auto_reply_reg = re.compile("(^\\[Auto[- ]?Reply\\])|(Sent from my car)",
                                re.I)
    data.loc[data.voterresponse.str.contains(auto_reply_reg),
             "voterresponse"] = ""
    data.loc[data.voterfinal.str.contains(auto_reply_reg), "voterfinal"] = ""
    data.loc[data.voterpost.str.contains(auto_reply_reg), "voterpost"] = ""

    # Number of tokens in each response field
    data['num_tokens_response'] = data.voterresponse.str.count(" ") + ~(
        data.voterresponse == "")
    data['num_tokens_final'] = data.voterfinal.str.count(" ") + ~(
        data.voterfinal == "")
    data['num_tokens_post'] = data.voterpost.str.count(" ") + ~(
        data.voterpost == "")

    # Build Token Features
    data = add_token_features(data,
                              van_token_vectorizer,
                              model_token_bow,
                              token_model,
                              Features,
                              english_dict,
                              census_dict,
                              census_last_dict,
                              token_counter,
                              LOWER_BOUND=LOWER_BOUND,
                              UPPER_BOUND=UPPER_BOUND)

    # Build Features
    X = featurize_conversation(data, response_vectorizer, final_vectorizer,
                               post_vectorizer)

    print("Annotating with Predictions...")

    # Add Predictions
    data['tripler_probability'] = model_tripler.predict_proba(X)[:, 1]
    data['name_provided_probability'] = model_name.predict_proba(X)[:, 1]
    data['optout_probability'] = model_opt.predict_proba(X)[:, 1]
    data['wrongnumber_probability'] = model_wrongnumber.predict_proba(X)[:, 1]

    # Create Dataset for triplers
    triplers = data.loc[(data.tripler_probability > UPPER_BOUND)
                        & ((data.name_provided_probability > UPPER_BOUND)
                           | (data.name_provided_probability < LOWER_BOUND))
                        & ((data.optout_probability > UPPER_BOUND)
                           | (data.optout_probability < LOWER_BOUND))
                        & (data.manual_review == False)].copy()
    triplers['is_tripler'] = 'yes'
    triplers.loc[triplers.name_provided_probability < UPPER_BOUND,
                 'names_extract'] = ''
    triplers['opted_out'] = np.where(
        triplers.optout_probability < UPPER_BOUND, 'no', 'yes')
    triplers['wrong_number'] = np.where(
        triplers.wrongnumber_probability < UPPER_BOUND, 'no', 'yes')
    triplers = triplers[[
        'conversationid', 'contact_phone', 'is_tripler', 'opted_out',
        'wrong_number', 'names_extract'
    ]]

    # Create Dataset for optouts
    optouts = data.loc[(data.tripler_probability < LOWER_BOUND)
                       & ((data.optout_probability > UPPER_BOUND)
                          | (data.wrongnumber_probability > UPPER_BOUND))].copy()
    optouts['opted_out'] = np.where(optouts.optout_probability < UPPER_BOUND,
                                    'no', 'yes')
    optouts['wrong_number'] = np.where(
        optouts.wrongnumber_probability < UPPER_BOUND, 'no', 'yes')
    optouts = optouts[[
        'conversationid', 'contact_phone', 'opted_out', 'wrong_number'
    ]]

    # Create Dataset for manual review
    review = data.loc[(data.tripler_probability > LOWER_BOUND)
                      & ((data.tripler_probability < UPPER_BOUND)
                         | ((data.name_provided_probability < UPPER_BOUND)
                            & (data.name_provided_probability > LOWER_BOUND))
                         | ((data.optout_probability < UPPER_BOUND)
                            & (data.optout_probability > LOWER_BOUND))
                         | (data.manual_review == True))].copy()

    # Also review cases where we extracted two names and likely missed a third
    two_name_review = data.loc[(data.name_prob1 > UPPER_BOUND)
                               & (data.name_prob2 > UPPER_BOUND)
                               & (data.name_prob3 < LOWER_BOUND)
                               & (data.name_prob3 > 0)
                               & (data.num_tokens_final < 5)].copy()
    review = pd.concat([review, two_name_review])
    review['is_tripler'] = np.where(review.tripler_probability < MID_BOUND,
                                    'no', 'yes')
    review.loc[review.name_provided_probability < MID_BOUND,
               'names_extract'] = ''
    review['opted_out'] = np.where(review.optout_probability < MID_BOUND,
                                   'no', 'yes')
    review['wrong_number'] = np.where(
        review.wrongnumber_probability < MID_BOUND, 'no', 'yes')
    review = review[[
        'conversationid', 'contact_phone', 'voterresponse', 'voterfinal',
        'voterpost', 'is_tripler', 'opted_out', 'wrong_number', 'names_extract'
    ]]

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(optouts, args.optouts_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        optouts.to_csv(Path(home, "Output_Data", args.optouts_filename),
                       index=False,
                       encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')
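# --- Illustrative sketch of the thresholding above (hypothetical helper, not in the repo) ---
# Roughly: probabilities below LOWER_BOUND (.4) or above UPPER_BOUND (.75) are
# treated as confident no/yes calls, anything in between is routed to manual
# review, and MID_BOUND (.5) picks the provisional label on reviewed rows.
def route_probability(p, lower=0.4, upper=0.75, mid=0.5):
    """Return (label, needs_review) for a single predicted probability."""
    needs_review = lower < p < upper
    label = "yes" if p >= mid else "no"
    return label, needs_review

# Example: route_probability(0.6) -> ("yes", True); route_probability(0.8) -> ("yes", False)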
def main(args):

    # Set home directory
    home = Path(args.home_folder)
    print(args.database_name)
    print(args.input_data_filename)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
        for col in [
                'noresponse', 'negresponse', 'posresponse', 'affirmresponse',
                'finalaffirmresponse'
        ]:
            data[col] = data[col].astype(bool)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75
    MID_BOUND = .5

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    print("Cleaning and Featurizing...")

    # Fix NA Values
    data.loc[data.triplemessage.isnull(), 'triplemessage'] = ""
    data.loc[data.voterresponse.isnull(), 'voterresponse'] = ""
    data.loc[data.voterfinal.isnull(), 'voterfinal'] = ""
    data.loc[data.voterpost.isnull(), 'voterpost'] = ""

    # Number of tokens in final response
    data['num_tokens'] = data.voterfinal.str.count(" ") + ~(
        data.voterfinal == "")

    # Build Token Features
    data = add_token_features(data,
                              token_model,
                              english_dict,
                              census_dict,
                              census_last_dict,
                              token_counter,
                              threshold=LOWER_BOUND)

    # Build Features
    X = featurize_conversation(data, response_vectorizer, final_vectorizer,
                               post_vectorizer)

    print("Annotating with Predictions...")

    # Add Predictions
    data['tripler_probability'] = model_tripler.predict_proba(X)[:, 1]
    data['name_provided_probability'] = model_name.predict_proba(X)[:, 1]
    data['optout_probability'] = model_opt.predict_proba(X)[:, 1]
    data['wrongnumber_probability'] = model_wrongnumber.predict_proba(X)[:, 1]

    # Create Dataset for triplers
    triplers = data.loc[(data.tripler_probability > UPPER_BOUND)
                        & ((data.name_provided_probability > UPPER_BOUND)
                           | (data.name_provided_probability < LOWER_BOUND))
                        & ((data.optout_probability > UPPER_BOUND)
                           | (data.optout_probability < LOWER_BOUND))
                        & ((data.name_prob1 > UPPER_BOUND)
                           | (data.name_prob1 < LOWER_BOUND))
                        & ((data.name_prob2 > UPPER_BOUND)
                           | (data.name_prob2 < LOWER_BOUND))
                        & ((data.name_prob3 > UPPER_BOUND)
                           | (data.name_prob3 < LOWER_BOUND))].copy()
    triplers['is_tripler'] = 'yes'
    triplers.loc[triplers.name_provided_probability < UPPER_BOUND,
                 'names_extract'] = ''
    triplers['opted_out'] = np.where(
        triplers.optout_probability < UPPER_BOUND, 'no', 'yes')
    triplers['wrong_number'] = np.where(
        triplers.wrongnumber_probability < UPPER_BOUND, 'no', 'yes')
    triplers = triplers[[
        'conversationid', 'contact_phone', 'is_tripler', 'opted_out',
        'wrong_number', 'names_extract'
    ]]

    # Create Dataset for manual review
    review = data.loc[((data.tripler_probability < UPPER_BOUND)
                       & (data.tripler_probability > LOWER_BOUND))
                      | ((data.name_provided_probability < UPPER_BOUND)
                         & (data.name_provided_probability > LOWER_BOUND))
                      | ((data.optout_probability < UPPER_BOUND)
                         & (data.optout_probability > LOWER_BOUND))
                      | ((data.name_prob1 < UPPER_BOUND)
                         & (data.name_prob1 > LOWER_BOUND))
                      | ((data.name_prob2 < UPPER_BOUND)
                         & (data.name_prob2 > LOWER_BOUND))
                      | ((data.name_prob3 < UPPER_BOUND)
                         & (data.name_prob3 > LOWER_BOUND))].copy()
    review['is_tripler'] = np.where(review.tripler_probability < MID_BOUND,
                                    'no', 'yes')
    review.loc[review.name_provided_probability < MID_BOUND,
               'names_extract'] = ''
    review['opted_out'] = np.where(review.optout_probability < MID_BOUND,
                                   'no', 'yes')
    review['wrong_number'] = np.where(
        review.wrongnumber_probability < MID_BOUND, 'no', 'yes')
    review = review[[
        'conversationid', 'contact_phone', 'voterresponse', 'voterfinal',
        'voterpost', 'is_tripler', 'opted_out', 'wrong_number', 'names_extract'
    ]]

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')
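# --- Illustrative sketch (not from the original repo) ---
# annotation_models.pkl is read back above with repeated pickle.load() calls,
# so it must have been written with pickle.dump() calls in exactly the same
# order. A minimal sketch of that convention; save_models/load_models are
# hypothetical helper names.
import pickle

def save_models(path, models):
    """Dump an ordered sequence of fitted objects into one pickle file."""
    with open(path, "wb") as f:
        for model in models:
            pickle.dump(model, f)

def load_models(path, n):
    """Load back n objects in the order they were dumped."""
    with open(path, "rb") as f:
        return [pickle.load(f) for _ in range(n)]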
def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        van = load_civis(args.input_data_filename.replace(".csv", ""),
                         args.database_name)
    else:
        van = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)
        model_van_name = pickle.load(f)
        van_vectorizer = pickle.load(f)
        Features = pickle.load(f)
        model_token_bow = pickle.load(f)
        van_token_vectorizer = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    # Ensure data has the right columns
    for col in ['voter_file_vanid', 'contactname', 'notetext']:
        if col not in van.columns:
            raise Exception("%s must be a valid column in the dataset" % col)

    # Clean NA values
    van.loc[van.notetext.isnull(), 'notetext'] = ""
    van.loc[van.contactname.isnull(), 'contactname'] = ""

    # Aggregate by van id, combine notetext
    van = van.loc[~(van['notetext'] == "")]
    van['notetext'] = van.groupby(
        ['voter_file_vanid',
         'contactname'])['notetext'].transform(lambda x: ','.join(x))
    van = van[['voter_file_vanid', 'contactname',
               'notetext']].drop_duplicates()

    # Number of tokens
    van['num_tokens'] = van.notetext.str.count(" ") + ~(van.notetext == "")

    # Build Token Features
    van = add_token_features_van(van,
                                 van_token_vectorizer,
                                 model_token_bow,
                                 token_model,
                                 Features,
                                 english_dict,
                                 census_dict,
                                 census_last_dict,
                                 token_counter,
                                 LOWER_BOUND=LOWER_BOUND,
                                 UPPER_BOUND=UPPER_BOUND)

    # Build Features
    X = featurize_conversation_van(van, van_vectorizer)

    print("Annotating with Predictions...")

    # Add Predictions
    van['names_probability'] = model_van_name.predict_proba(X)[:, 1]

    # Don't put any names if we don't know what the names should be
    van.loc[(van.names_probability < LOWER_BOUND) | (van.names_extract == ""),
            'names_extract'] = ""

    # Get those with confirmed names
    triplers = van.loc[(~(van.manual_review == True)
                        & (van.names_probability > UPPER_BOUND))
                       | (van.names_probability < LOWER_BOUND)]
    review = van.loc[(van.names_probability > LOWER_BOUND)
                     & ((van.manual_review == True)
                        | (van.names_probability < UPPER_BOUND))]

    # Also review cases where we extracted two names and likely missed a third
    two_name_review = van.loc[(van.name_prob1 > UPPER_BOUND)
                              & (van.name_prob2 > UPPER_BOUND)
                              & (van.name_prob3 < LOWER_BOUND)
                              & (van.name_prob3 > 0) & (van.num_tokens < 5)
                              & ~(van.manual_review == True)].copy()
    review = pd.concat([review, two_name_review])

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')
def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        van = load_civis(args.input_data_filename.replace(".csv", ""),
                         args.database_name)
    else:
        van = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    # Clean NA values
    van.loc[van.notetext.isnull(), 'notetext'] = ""
    van.loc[van.contactname.isnull(), 'contactname'] = ""

    # Get Extracted Names
    names_extract = []
    manual_review = []
    for i, row in van.iterrows():
        response = row['notetext']
        if cleanString(response) == "":
            names_extract.append("")
            manual_review.append(False)
            continue
        X_tokens_row = pd.DataFrame(
            get_token_features(response, row['contactname'], english_dict,
                               census_dict, census_last_dict,
                               token_counter)).values.astype(float)
        y_pred = token_model.predict_proba(X_tokens_row)
        doc = get_doc(response)
        clean_tokens = [normalize_token(t.string) for t in doc]
        clean_tokens = [t for t in clean_tokens if not t == ""]
        # Extract any plausible tokens
        names_extract.append(
            extract_good_tokens(clean_tokens=clean_tokens,
                                triple_message=row['contactname'],
                                y_pred=y_pred,
                                response=response,
                                threshold=LOWER_BOUND))
        # Send to Manual Review if there are any tokens in the unclear range
        manual_review.append(((y_pred[:, 1] > LOWER_BOUND)
                              & (y_pred[:, 1] < UPPER_BOUND)).sum() > 0)
    van['names_extract'] = names_extract
    van['manual_review'] = manual_review

    # Get those with confirmed names
    triplers = van.loc[(van.manual_review == False)
                       & ~(van.names_extract == "")][[
                           'vanid', 'names_extract'
                       ]]
    review = van.loc[van.manual_review == True][[
        'vanid', 'contactname', 'notetext', 'names_extract'
    ]]

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')