def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Only Retain relevant data
    data.loc[data.names.isnull(), 'names'] = ""
    data = data.loc[~(data.names == '')][['names']]

    # Clean Names
    data['clean_names'] = ''
    for i, row in data.iterrows():
        names = row['names']
        data.loc[i, 'clean_names'] = clean_labeled_names(names, names)

    # Write out annotated file
    if args.use_civis:
        export_civis(data, args.output_file.replace(".csv", ""),
                     args.database_name)
    else:
        data.to_csv(Path(home, "Output_Data", args.output_file), index=False)
def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Fix NA Values
    data.loc[data.triplemessage.isnull(), 'triplemessage'] = ""
    data.loc[data.voterresponse.isnull(), 'voterresponse'] = ""
    data.loc[data.voterfinal.isnull(), 'voterfinal'] = ""
    data.loc[data.voterpost.isnull(), 'voterpost'] = ""
    data.loc[data.names.isnull(), 'names'] = ""

    # Only Retain relevant data
    data = data.loc[~(data.names == '')]

    # Clean Names
    data['clean_names'] = ''
    data['review'] = False
    for i, row in data.iterrows():
        names = row['names']
        response = row['voterresponse'] + ' ' + row['voterfinal'] + ' ' + row['voterpost']
        clean_names, review = clean_labeled_names(names, response)
        data.loc[i, 'clean_names'] = clean_names
        data.loc[i, 'review'] = review

    # Write out annotated file
    if args.use_civis:
        export_civis(data, args.output_file.replace(".csv", ""),
                     args.database_name)
    else:
        data.to_csv(Path(home, "Output_Data", args.output_file), index=False)
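# --- Illustrative usage sketch (not part of the original scripts) ---
# A minimal argparse entrypoint showing how main(args) above could be invoked.
# The flag names mirror the attributes read from `args` (home_folder,
# input_data_filename, output_file, database_name, use_civis); the defaults
# and the store_true convention for --use_civis are assumptions.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Clean labeled names")
    parser.add_argument("--home_folder", default=".")
    parser.add_argument("--input_data_filename", default="labeled_names.csv")  # hypothetical default
    parser.add_argument("--output_file", default="clean_names.csv")  # hypothetical default
    parser.add_argument("--database_name", default="")
    parser.add_argument("--use_civis", action="store_true")
    main(parser.parse_args())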
def main(args):

    # Set home directory
    home = Path(args.home_folder)
    print(args.database_name)
    print(args.input_data_filename)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
        for col in [
                'noresponse', 'negresponse', 'posresponse', 'affirmresponse',
                'finalaffirmresponse'
        ]:
            data[col] = (data[col] == 't').astype(bool)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75
    MID_BOUND = .5

    # Ensure data has the right columns
    for col in [
            'noresponse', 'negresponse', 'posresponse', 'affirmresponse',
            'finalaffirmresponse', 'triplemessage', 'voterresponse',
            'voterfinal', 'voterpost', 'conversationid', 'contact_phone'
    ]:
        if col not in data.columns:
            raise Exception("%s must be a valid column in the dataset" % col)

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)
        model_van_name = pickle.load(f)
        van_vectorizer = pickle.load(f)
        Features = pickle.load(f)
        model_token_bow = pickle.load(f)
        van_token_vectorizer = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    print("Cleaning and Featurizing...")

    # Fix NA Values
    data.loc[data.triplemessage.isnull(), 'triplemessage'] = ""
    data.loc[data.voterresponse.isnull(), 'voterresponse'] = ""
    data.loc[data.voterfinal.isnull(), 'voterfinal'] = ""
    data.loc[data.voterpost.isnull(), 'voterpost'] = ""

    # Fix Auto Replies
    auto_reply_reg = re.compile("(^\\[Auto[- ]?Reply\\])|(Sent from my car)",
                                re.I)
    data.loc[data.voterresponse.str.contains(auto_reply_reg),
             "voterresponse"] = ""
    data.loc[data.voterfinal.str.contains(auto_reply_reg), "voterfinal"] = ""
    data.loc[data.voterpost.str.contains(auto_reply_reg), "voterpost"] = ""

    # Number of tokens in each response field
    data['num_tokens_response'] = data.voterresponse.str.count(" ") + ~(
        data.voterresponse == "")
    data['num_tokens_final'] = data.voterfinal.str.count(" ") + ~(
        data.voterfinal == "")
    data['num_tokens_post'] = data.voterpost.str.count(" ") + ~(
        data.voterpost == "")

    # Build Token Features
    data = add_token_features(data,
                              van_token_vectorizer,
                              model_token_bow,
                              token_model,
                              Features,
                              english_dict,
                              census_dict,
                              census_last_dict,
                              token_counter,
                              LOWER_BOUND=LOWER_BOUND,
                              UPPER_BOUND=UPPER_BOUND)

    # Build Features
    X = featurize_conversation(data, response_vectorizer, final_vectorizer,
                               post_vectorizer)

    print("Annotating with Predictions...")

    # Add Predictions
    data['tripler_probability'] = model_tripler.predict_proba(X)[:, 1]
    data['name_provided_probability'] = model_name.predict_proba(X)[:, 1]
    data['optout_probability'] = model_opt.predict_proba(X)[:, 1]
    data['wrongnumber_probability'] = model_wrongnumber.predict_proba(X)[:, 1]

    # Create Dataset for triplers
    triplers = data.loc[(data.tripler_probability > UPPER_BOUND)
                        & ((data.name_provided_probability > UPPER_BOUND)
                           | (data.name_provided_probability < LOWER_BOUND))
                        & ((data.optout_probability > UPPER_BOUND)
                           | (data.optout_probability < LOWER_BOUND))
                        & (data.manual_review == False)].copy()
    triplers['is_tripler'] = 'yes'
    triplers.loc[triplers.name_provided_probability < UPPER_BOUND,
                 'names_extract'] = ''
    triplers['opted_out'] = np.where(
        triplers.optout_probability < UPPER_BOUND, 'no', 'yes')
    triplers['wrong_number'] = np.where(
        triplers.wrongnumber_probability < UPPER_BOUND, 'no', 'yes')
    triplers = triplers[[
        'conversationid', 'contact_phone', 'is_tripler', 'opted_out',
        'wrong_number', 'names_extract'
    ]]

    # Create Dataset for optouts
    optouts = data.loc[(data.tripler_probability < LOWER_BOUND)
                       & ((data.optout_probability > UPPER_BOUND)
                          | (data.wrongnumber_probability > UPPER_BOUND))].copy()
    optouts['opted_out'] = np.where(optouts.optout_probability < UPPER_BOUND,
                                    'no', 'yes')
    optouts['wrong_number'] = np.where(
        optouts.wrongnumber_probability < UPPER_BOUND, 'no', 'yes')
    optouts = optouts[[
        'conversationid', 'contact_phone', 'opted_out', 'wrong_number'
    ]]

    # Create Dataset for manual review
    review = data.loc[(data.tripler_probability > LOWER_BOUND)
                      & ((data.tripler_probability < UPPER_BOUND)
                         | ((data.name_provided_probability < UPPER_BOUND)
                            & (data.name_provided_probability > LOWER_BOUND))
                         | ((data.optout_probability < UPPER_BOUND)
                            & (data.optout_probability > LOWER_BOUND))
                         | (data.manual_review == True))].copy()

    # Also review cases where we extracted two names and likely missed a third
    two_name_review = data.loc[(data.name_prob1 > UPPER_BOUND)
                               & (data.name_prob2 > UPPER_BOUND)
                               & (data.name_prob3 < LOWER_BOUND)
                               & (data.name_prob3 > 0)
                               & (data.num_tokens_final < 5)].copy()
    review = pd.concat([review, two_name_review])
    review['is_tripler'] = np.where(review.tripler_probability < MID_BOUND,
                                    'no', 'yes')
    review.loc[review.name_provided_probability < MID_BOUND,
               'names_extract'] = ''
    review['opted_out'] = np.where(review.optout_probability < MID_BOUND,
                                   'no', 'yes')
    review['wrong_number'] = np.where(
        review.wrongnumber_probability < MID_BOUND, 'no', 'yes')
    review = review[[
        'conversationid', 'contact_phone', 'voterresponse', 'voterfinal',
        'voterpost', 'is_tripler', 'opted_out', 'wrong_number', 'names_extract'
    ]]

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(optouts, args.optouts_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        optouts.to_csv(Path(home, "Output_Data", args.optouts_filename),
                       index=False,
                       encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')
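# --- Illustrative sketch of the thresholding above (hypothetical helper, not in the repo) ---
# Roughly: probabilities below LOWER_BOUND (.4) or above UPPER_BOUND (.75) are
# treated as confident no/yes calls, anything in between is routed to manual
# review, and MID_BOUND (.5) picks the provisional label on reviewed rows.
def route_probability(p, lower=0.4, upper=0.75, mid=0.5):
    """Return (label, needs_review) for a single predicted probability."""
    needs_review = lower < p < upper
    label = "yes" if p >= mid else "no"
    return label, needs_review

# Example: route_probability(0.6) -> ("yes", True); route_probability(0.8) -> ("yes", False)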
def main(args):

    # Set home directory
    home = Path(args.home_folder)
    print(args.database_name)
    print(args.input_data_filename)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        data = load_civis(args.input_data_filename.replace(".csv", ""),
                          args.database_name)
        for col in [
                'noresponse', 'negresponse', 'posresponse', 'affirmresponse',
                'finalaffirmresponse'
        ]:
            data[col] = data[col].astype(bool)
    else:
        data = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75
    MID_BOUND = .5

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    print("Cleaning and Featurizing...")

    # Fix NA Values
    data.loc[data.triplemessage.isnull(), 'triplemessage'] = ""
    data.loc[data.voterresponse.isnull(), 'voterresponse'] = ""
    data.loc[data.voterfinal.isnull(), 'voterfinal'] = ""
    data.loc[data.voterpost.isnull(), 'voterpost'] = ""

    # Number of tokens in final response
    data['num_tokens'] = data.voterfinal.str.count(" ") + ~(
        data.voterfinal == "")

    # Build Token Features
    data = add_token_features(data,
                              token_model,
                              english_dict,
                              census_dict,
                              census_last_dict,
                              token_counter,
                              threshold=LOWER_BOUND)

    # Build Features
    X = featurize_conversation(data, response_vectorizer, final_vectorizer,
                               post_vectorizer)

    print("Annotating with Predictions...")

    # Add Predictions
    data['tripler_probability'] = model_tripler.predict_proba(X)[:, 1]
    data['name_provided_probability'] = model_name.predict_proba(X)[:, 1]
    data['optout_probability'] = model_opt.predict_proba(X)[:, 1]
    data['wrongnumber_probability'] = model_wrongnumber.predict_proba(X)[:, 1]

    # Create Dataset for triplers
    triplers = data.loc[(data.tripler_probability > UPPER_BOUND)
                        & ((data.name_provided_probability > UPPER_BOUND)
                           | (data.name_provided_probability < LOWER_BOUND))
                        & ((data.optout_probability > UPPER_BOUND)
                           | (data.optout_probability < LOWER_BOUND))
                        & ((data.name_prob1 > UPPER_BOUND)
                           | (data.name_prob1 < LOWER_BOUND))
                        & ((data.name_prob2 > UPPER_BOUND)
                           | (data.name_prob2 < LOWER_BOUND))
                        & ((data.name_prob3 > UPPER_BOUND)
                           | (data.name_prob3 < LOWER_BOUND))].copy()
    triplers['is_tripler'] = 'yes'
    triplers.loc[triplers.name_provided_probability < UPPER_BOUND,
                 'names_extract'] = ''
    triplers['opted_out'] = np.where(
        triplers.optout_probability < UPPER_BOUND, 'no', 'yes')
    triplers['wrong_number'] = np.where(
        triplers.wrongnumber_probability < UPPER_BOUND, 'no', 'yes')
    triplers = triplers[[
        'conversationid', 'contact_phone', 'is_tripler', 'opted_out',
        'wrong_number', 'names_extract'
    ]]

    # Create Dataset for manual review
    review = data.loc[((data.tripler_probability < UPPER_BOUND)
                       & (data.tripler_probability > LOWER_BOUND))
                      | ((data.name_provided_probability < UPPER_BOUND)
                         & (data.name_provided_probability > LOWER_BOUND))
                      | ((data.optout_probability < UPPER_BOUND)
                         & (data.optout_probability > LOWER_BOUND))
                      | ((data.name_prob1 < UPPER_BOUND)
                         & (data.name_prob1 > LOWER_BOUND))
                      | ((data.name_prob2 < UPPER_BOUND)
                         & (data.name_prob2 > LOWER_BOUND))
                      | ((data.name_prob3 < UPPER_BOUND)
                         & (data.name_prob3 > LOWER_BOUND))].copy()
    review['is_tripler'] = np.where(review.tripler_probability < MID_BOUND,
                                    'no', 'yes')
    review.loc[review.name_provided_probability < MID_BOUND,
               'names_extract'] = ''
    review['opted_out'] = np.where(review.optout_probability < MID_BOUND,
                                   'no', 'yes')
    review['wrong_number'] = np.where(
        review.wrongnumber_probability < MID_BOUND, 'no', 'yes')
    review = review[[
        'conversationid', 'contact_phone', 'voterresponse', 'voterfinal',
        'voterpost', 'is_tripler', 'opted_out', 'wrong_number', 'names_extract'
    ]]

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')
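# --- Illustrative sketch (not from the original repo) ---
# annotation_models.pkl is read back above with repeated pickle.load() calls,
# so it must have been written with pickle.dump() calls in exactly the same
# order. A minimal sketch of that convention; save_models/load_models are
# hypothetical helper names.
import pickle

def save_models(path, models):
    """Dump an ordered sequence of fitted objects into one pickle file."""
    with open(path, "wb") as f:
        for model in models:
            pickle.dump(model, f)

def load_models(path, n):
    """Load back n objects in the order they were dumped."""
    with open(path, "rb") as f:
        return [pickle.load(f) for _ in range(n)]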
def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        van = load_civis(args.input_data_filename.replace(".csv", ""),
                         args.database_name)
    else:
        van = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)
        model_van_name = pickle.load(f)
        van_vectorizer = pickle.load(f)
        Features = pickle.load(f)
        model_token_bow = pickle.load(f)
        van_token_vectorizer = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    # Ensure data has the right columns
    for col in ['voter_file_vanid', 'contactname', 'notetext']:
        if col not in van.columns:
            raise Exception("%s must be a valid column in the dataset" % col)

    # Clean NA values
    van.loc[van.notetext.isnull(), 'notetext'] = ""
    van.loc[van.contactname.isnull(), 'contactname'] = ""

    # Aggregate by van id, combine notetext
    van = van.loc[~(van['notetext'] == "")]
    van['notetext'] = van.groupby(
        ['voter_file_vanid',
         'contactname'])['notetext'].transform(lambda x: ','.join(x))
    van = van[['voter_file_vanid', 'contactname',
               'notetext']].drop_duplicates()

    # Number of tokens
    van['num_tokens'] = van.notetext.str.count(" ") + ~(van.notetext == "")

    # Build Token Features
    van = add_token_features_van(van,
                                 van_token_vectorizer,
                                 model_token_bow,
                                 token_model,
                                 Features,
                                 english_dict,
                                 census_dict,
                                 census_last_dict,
                                 token_counter,
                                 LOWER_BOUND=LOWER_BOUND,
                                 UPPER_BOUND=UPPER_BOUND)

    # Build Features
    X = featurize_conversation_van(van, van_vectorizer)

    print("Annotating with Predictions...")

    # Add Predictions
    van['names_probability'] = model_van_name.predict_proba(X)[:, 1]

    # Don't put any names if we don't know what the names should be
    van.loc[(van.names_probability < LOWER_BOUND) | (van.names_extract == ""),
            'names_extract'] = ""

    # Get those with confirmed names
    triplers = van.loc[(~(van.manual_review == True)
                        & (van.names_probability > UPPER_BOUND))
                       | (van.names_probability < LOWER_BOUND)]
    review = van.loc[(van.names_probability > LOWER_BOUND)
                     & ((van.manual_review == True)
                        | (van.names_probability < UPPER_BOUND))]

    # Also review cases where we extracted two names and likely missed a third
    two_name_review = van.loc[(van.name_prob1 > UPPER_BOUND)
                              & (van.name_prob2 > UPPER_BOUND)
                              & (van.name_prob3 < LOWER_BOUND)
                              & (van.name_prob3 > 0) & (van.num_tokens < 5)
                              & ~(van.manual_review == True)].copy()
    review = pd.concat([review, two_name_review])

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')
def main(args):

    # Set home directory
    home = Path(args.home_folder)

    # Read in data either from flat file or civis
    if args.use_civis:
        home = Path("./Projects/NLP/SMS_Annotation/")
        van = load_civis(args.input_data_filename.replace(".csv", ""),
                         args.database_name)
    else:
        van = load_flat_file(home, args.input_data_filename)

    # Thresholds for manual review and labeling
    LOWER_BOUND = .4
    UPPER_BOUND = .75

    print("Loading Models...")
    pickle_file = Path(home, "Models", "annotation_models.pkl")
    with open(pickle_file, "rb") as f:
        # N-Gram Featurizers
        response_vectorizer = pickle.load(f)
        final_vectorizer = pickle.load(f)
        post_vectorizer = pickle.load(f)
        # Logistic Regressions
        token_model = pickle.load(f)
        model_tripler = pickle.load(f)
        model_name = pickle.load(f)
        model_opt = pickle.load(f)
        model_wrongnumber = pickle.load(f)
        token_counter = pickle.load(f)

    print("Loading Data...")

    # US Census Data
    census = pd.read_csv(
        Path(home, "Utility_Data", "census_first_names_all.csv"))
    census_dict = {}
    for i, row in census.iterrows():
        census_dict[row['name']] = np.log(row['census_count'])

    # Last Name Data
    census_last = pd.read_csv(
        Path(home, "Utility_Data", "census_last_names_all.csv"))
    census_last_dict = {}
    for i, row in census_last.iterrows():
        census_last_dict[row['name']] = np.log(row['census_count'])

    # US Word Freq Data
    english = pd.read_csv(Path(home, "Utility_Data", "english.csv"))
    english_dict = {}
    for i, row in english.iterrows():
        english_dict[row['name']] = row['freq']

    # Clean NA values
    van.loc[van.notetext.isnull(), 'notetext'] = ""
    van.loc[van.contactname.isnull(), 'contactname'] = ""

    # Get Extracted Names
    names_extract = []
    manual_review = []
    for i, row in van.iterrows():
        response = row['notetext']
        if cleanString(response) == "":
            names_extract.append("")
            manual_review.append(False)
            continue
        X_tokens_row = pd.DataFrame(
            get_token_features(response, row['contactname'], english_dict,
                               census_dict, census_last_dict,
                               token_counter)).values.astype(float)
        y_pred = token_model.predict_proba(X_tokens_row)
        doc = get_doc(response)
        clean_tokens = [normalize_token(t.string) for t in doc]
        clean_tokens = [t for t in clean_tokens if not t == ""]
        # Extract any plausible tokens
        names_extract.append(
            extract_good_tokens(clean_tokens=clean_tokens,
                                triple_message=row['contactname'],
                                y_pred=y_pred,
                                response=response,
                                threshold=LOWER_BOUND))
        # Send to Manual Review if there are any tokens in the unclear range
        manual_review.append(((y_pred[:, 1] > LOWER_BOUND)
                              & (y_pred[:, 1] < UPPER_BOUND)).sum() > 0)
    van['names_extract'] = names_extract
    van['manual_review'] = manual_review

    # Get those with confirmed names
    triplers = van.loc[(van.manual_review == False)
                       & ~(van.names_extract == "")][[
                           'vanid', 'names_extract'
                       ]]
    review = van.loc[van.manual_review == True][[
        'vanid', 'contactname', 'notetext', 'names_extract'
    ]]

    # Write out annotated files
    if args.use_civis:
        export_civis(triplers, args.output_filename.replace(".csv", ""),
                     args.database_name)
        export_civis(review, args.manual_review_filename.replace(".csv", ""),
                     args.database_name)
    else:
        triplers.to_csv(Path(home, "Output_Data", args.output_filename),
                        index=False,
                        encoding='latin1')
        review.to_csv(Path(home, "Output_Data", args.manual_review_filename),
                      index=False,
                      encoding='latin1')