def remake_counter(counter_str):
    """Rebuild a Counter from its repr string.

    Inverse of str(Counter): parses e.g. "Counter({5: 3, 2: 1})" back into
    C_({5: 3, 2: 1}). Keys and counts are both parsed as ints.

    Parameters:
        counter_str : string produced by repr()/str() of a Counter whose
                      keys are ints (e.g. word ids from word_dict).

    Returns:
        A C_ (Counter) with the parsed {key: count} pairs; empty for the
        empty reprs "Counter()" and "Counter({})".
    """
    counter_out = C_()
    # Strip the "Counter({" prefix and "})" suffix, leaving "key: count, ...".
    # The optional brace also accepts the empty repr "Counter()".
    counter_str = re.sub(r"^Counter\(\{?", "", counter_str)
    counter_str = re.sub(r"\}?\)$", "", counter_str)
    # BUG FIX: the original fed '' to int() for an empty Counter repr.
    if not counter_str:
        return counter_out
    for item in counter_str.split(", "):
        # Each item looks like "key: count"; split on the first ": ".
        key_str, _, count_str = item.partition(": ")
        counter_out[int(key_str)] = int(count_str)
    return counter_out
def count_words_counter(ref_dict, list_to_count):
    """Tally the tokens in list_to_count, keyed by their ids in ref_dict.

    Parameters:
        ref_dict      : mapping from token -> integer id
        list_to_count : iterable of tokens; every token must be a key
                        of ref_dict (raises KeyError otherwise)

    Returns:
        A C_ (Counter) mapping ref_dict[token] -> occurrence count.
    """
    # Counter consumes the mapped ids directly, one increment per token —
    # identical to looping and doing cnt_out[ref_dict[token]] += 1.
    return C_(ref_dict[token] for token in list_to_count)
w_d_rev = {} # word_dict with keys and values reversed stemmed_dict = { } # Dict of all stems in format {first_stem_to_appear:0, ..., nth_stem_to_appear:n} s_d_rev = {} # stemmed_dict with keys and values reversed s_blacklist = [] # List of stems to remove w_blacklist = [] # List of words to remove if exclude_stopwords == True: w_blacklist += stopwords.words('english') if custom_stopwords == True: stoplist_directory = stoppath custom_stoplist = custom_stopword_list(stoplist_file, stoplist_directory) w_blacklist += custom_stoplist counter_full = C_( {} ) # Counter of all words (or stems) and their counts in format Counter({value_of_word_in_word_dict_or_stemmed_dict: count, ...}) file_count = 1 doc_grouper = "" # Prepares for grouping of mem. cont. by metadata counter_to_print = C_() working_df['index1'] = working_df.index w_mc = [] #Open ref_file so you can write out words ref_file = open(outpath + 'full-reflist.txt', "w") ####################################### #Begin iterating over rows #######################################
def pandasDF_to_mult(inputdf, outpath, custom_tokens=None, custom_clean=None,
                     exclude_stopwords=True, stem_mc=False,
                     doc_col_name="speech", min_chars_for_word=2):
    '''Baseline mult (DAT) generator for a pandas dataframe.

    Tokenizes each document, optionally removes NLTK stopwords and/or stems
    the tokens, assigns each distinct word (or stem) an integer id in order
    of first appearance, and writes a sparse count representation per row.

    Words are matched by the regex '[A-Za-z]{min_chars_for_word,}', i.e.
    strings of letters at least the minimum length; custom token patterns
    are tried first. There is no custom stopword FILE option in this variant.

    Requires columns: ['id', 'day', 'month', 'year', 'speaker', doc_col_name].

    Outputs TO FILE in outpath:
        full-reflist.txt : reference wordlist (one word/stem per line)
        final_meta.csv   : metadata plus the DAT 'Full String' per document
        full-mult.dat    : the 'Full String' column only

    Returns:
        A pandas dataframe with all input data plus the 'Full String' column.

    Parameters:
        inputdf            : pandas dataframe, one row per document
        outpath            : directory for the output files
        custom_tokens      : list of regex patterns to keep as tokens.
                             ORDER MATTERS — alternation matches left to
                             right. A single pre-joined 'a|b' string is also
                             accepted for backward compatibility.
        custom_clean       : list of regex patterns substituted with a space
        exclude_stopwords  : exclude NLTK English stopwords
        stem_mc            : stem words (via the project stem_list helper)
        doc_col_name       : dataframe column holding the raw document text
        min_chars_for_word : minimum letters for a token to count as a word
    '''
    # Normalize the pattern arguments. BUG FIX: the original used mutable
    # list defaults and then evaluated `custom_tokens + '|[A-Za-z]{...'`,
    # which raises TypeError whenever custom_tokens is a list — the
    # documented type and the default. Join the patterns with '|' instead.
    if custom_tokens is None:
        custom_tokens = []
    elif isinstance(custom_tokens, str):
        custom_tokens = [custom_tokens]
    else:
        custom_tokens = list(custom_tokens)
    custom_clean = list(custom_clean) if custom_clean else []

    # Prep working DF. reset_index returns a copy, so the caller's frame is
    # not mutated by the 'Full String' writes below.
    working_df = inputdf.reset_index()
    working_df['index1'] = working_df.index

    # Custom patterns first, generic word pattern last: ORDER MATTERS!
    word_pattern = '[A-Za-z]{' + str(min_chars_for_word) + ',}'
    tokenizer = token_re('|'.join(custom_tokens + [word_pattern]))

    word_dict = {}     # {word: order of first appearance}
    w_d_rev = {}       # word_dict with keys and values reversed
    stemmed_dict = {}  # {stem: order of first appearance}
    s_d_rev = {}       # stemmed_dict with keys and values reversed
    s_blacklist = []   # stems to remove
    w_blacklist = []   # words to remove
    if exclude_stopwords:
        w_blacklist += stopwords.words('english')

    # Running counts over ALL documents, keyed by word/stem id.
    counter_full = C_()
    # Per-document {id: count} dicts, in row order.
    w_mc = []

    # add_to_master appends each newly seen word/stem to ref_file, so the
    # handle stays open across the loop. try/finally guarantees it is closed
    # even if a row raises mid-loop (the original leaked it on error).
    ref_file = open(outpath + 'full-reflist.txt', "w")
    try:
        print('CHECKPOINT 2 - prep work complete')
        for index1, row in working_df.iterrows():
            working_row = row.copy()
            # BUG FIX: the document column was hard-coded to "speech",
            # silently ignoring the doc_col_name parameter.
            mem_cont = str(working_row[doc_col_name]).lower()

            # If there's custom stuff to remove, do it here.
            for stuff_to_clean in custom_clean:
                mem_cont = re.sub(str(stuff_to_clean), ' ', mem_cont)

            # Tokenize and drop blacklisted words.
            words_in_mc = tokenizer.tokenize(mem_cont)
            words_in_mc = clean_list(w_blacklist, words_in_mc)

            # Count words (or stems), registering new ones in the master
            # dict and the reference file as a side effect of add_to_master.
            if stem_mc:
                stemmed_in_mc = stem_list(words_in_mc)
                stemmed_in_mc = clean_list(s_blacklist, stemmed_in_mc)
                ustemmed_in_mc = combine_lists([], stemmed_in_mc)
                stemmed_dict, s_d_rev = add_to_master(
                    stemmed_dict, s_d_rev, ustemmed_in_mc, ref_file)
                counter_for_mc = count_words_counter(stemmed_dict, stemmed_in_mc)
            else:
                unique_in_mc = combine_lists([], words_in_mc)
                word_dict, w_d_rev = add_to_master(
                    word_dict, w_d_rev, unique_in_mc, ref_file)
                counter_for_mc = count_words_counter(word_dict, words_in_mc)
            counter_full += counter_for_mc

            # Build the sparse DAT representation for this document.
            multi_index, multi_count = make_multi(counter_for_mc)
            mc_str = make_multi_full(counter_for_mc)
            mc = pandas.Series(multi_count, index=multi_index, dtype='int')
            w_mc.append(mc.to_dict())
            print('Processing: ' + str(index1))
            working_df.loc[index1, 'Full String'] = str(mc_str)
    finally:
        # Close your ref file; you're done appending it.
        ref_file.close()

    #######################
    # Write out the outputs
    #######################
    os.chdir(outpath)
    final_result = working_df

    # Metadata plus the DAT string per document.
    final_result_meta = working_df[[
        'index1', 'id', 'Full String', 'day', 'month', 'year', 'speaker',
    ]]
    final_result_meta.to_csv('final_meta.csv', encoding='utf-8', index=False)

    # DAT file: the sparse count strings only, one row per document.
    datfile = working_df[['Full String']]
    datfile.to_csv('full-mult.dat', header=False, index=False)

    return final_result