import re
from collections import Counter as C_


def remake_counter(counter_str):
    """Rebuild a Counter from its str() form, e.g. "Counter({3: 5, 7: 2})"."""
    counter_out = C_()
    counter_str = re.sub(r"^Counter\(\{", "", counter_str)
    counter_str = re.sub(r"\}\)$", "", counter_str)
    for item in re.split(", ", counter_str):
        key, _, value = item.partition(": ")
        counter_out[int(key)] = int(value)
    return counter_out
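A quick round-trip sketch (hypothetical values; this assumes the original Counter had integer keys, as in the examples below):

original = C_({3: 5, 7: 2})
assert remake_counter(str(original)) == original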
Example #2
from collections import Counter as C_


def count_words_counter(ref_dict, list_to_count):
    """Count the items in list_to_count, keyed by their integer ids in ref_dict."""
    cnt_out = C_()
    for item in list_to_count:
        cnt_out[ref_dict[item]] += 1
    return cnt_out
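A usage sketch with a hypothetical word-to-id mapping (the same shape as the word_dict built in Example #3):

ref_dict = {'cat': 0, 'dog': 1}
print(count_words_counter(ref_dict, ['cat', 'dog', 'cat']))
# Counter({0: 2, 1: 1})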
Example #3
import os
import re
from collections import Counter as C_

import nltk
import pandas
from nltk.corpus import stopwords
# tokenizer.tokenize() below matches NLTK's RegexpTokenizer API, so token_re is
# presumably that class (an assumption; the original alias is defined elsewhere).
from nltk.tokenize import RegexpTokenizer as token_re

# Helpers assumed to be defined elsewhere in the source module:
# clean_list, stem_list, combine_lists, add_to_master, make_multi,
# make_multi_full, and count_words_counter (Example #2 above).

def pandasDF_to_mult(inputdf,
                     outpath,
                     custom_tokens=[],
                     custom_clean=[],
                     exclude_stopwords=True,
                     stem_mc=False,
                     doc_col_name="speech",
                     min_chars_for_word=2):
    '''This is the baseline mult generator for a pandas dataframe. There's no option for a custom stopword FILE,
    but it does include an option for custom removals (regex patterns), which you can use to remove certain words.
    Includes an option to exclude NLTK stopwords, and an option for custom tokens via regex patterns.
    Words are defined by the regex '[A-Za-z]{min_chars_for_word,}', i.e. only strings of letters at least
    min_chars_for_word characters long count as words.

    Requires header: ['id', 'day', 'month', 'year', 'speaker', 'speech']

    Outputs TO FILE in output path:
        # full-reflist.txt = reference wordlist
        # full-mult.dat = DAT info
        # final_meta.csv = metadata with DAT info

    Returns:
        A pandas dataframe which includes all data.

    Parameters:
    (-) inputdf = a pandas dataframe with each row corresponding to a document
    (-) outpath = directory to write out the output files above

    (-) custom_tokens = list of regex patterns to keep as tokens. ORDER MATTERS! Regex matches from left to right; patterns are joined with '|'.
    (-) custom_clean = list of regex patterns to substitute with a space in the text
    (-) exclude_stopwords = exclude NLTK stopwords
    (-) stem_mc = stem words with the NLTK Snowball stemmer?
    (-) doc_col_name = column name in the pandas dataframe holding the raw document text
    (-) min_chars_for_word = minimum number of characters for a token to count as a word
    '''

    #Prep working DF
    working_df = inputdf.reset_index()
    working_df['index1'] = working_df.index

    #Prepare your toys for wordplay
    snowball = nltk.stem.snowball.EnglishStemmer(ignore_stopwords=False)
    # custom_tokens is a list, so join it with the default word pattern using
    # '|'.join() (e.g. custom_tokens=[r'\d{4}'] yields r'\d{4}|[A-Za-z]{2,}').
    final_token_set = '|'.join(
        custom_tokens + ['[A-Za-z]{' + str(min_chars_for_word) + ',}'])
    tokenizer = token_re(final_token_set)  # group cites + words as tokens: ORDER MATTERS!!!!

    word_dict = {}  # Dict of all words in format {first_word_to_appear: 0, ..., nth_word_to_appear: n}
    w_d_rev = {}  # word_dict with keys and values reversed
    stemmed_dict = {}  # Dict of all stems in format {first_stem_to_appear: 0, ..., nth_stem_to_appear: n}
    s_d_rev = {}  # stemmed_dict with keys and values reversed
    s_blacklist = []  # List of stems to remove
    w_blacklist = []  # List of words to remove
    if exclude_stopwords:
        w_blacklist += stopwords.words('english')

    #############
    #Currently does not support custom stopword option
    #############
    # if custom_stopwords == True:
    #         stoplist_directory = stoppath
    #         custom_stoplist=custom_list(stoplist_file,stoplist_directory)
    #         w_blacklist += custom_stoplist

    counter_full = C_()  # Counter of all words (or stems) in format Counter({word_or_stem_id: count, ...})
    file_count = 1

    doc_grouper = ""  # Prepares for grouping of mem. cont. by metadata
    counter_to_print = C_()
    w_mc = []

    #Open ref_file so you can write out words
    ref_file = open(os.path.join(outpath, 'full-reflist.txt'), "w")

    #######################################
    #Begin iterating over rows
    #######################################

    print('CHECKPOINT 2 - prep work complete')

    for index1, row in working_df.iterrows():
        working_row = row.copy()
        mem_cont_raw = str(working_row[doc_col_name]).lower()

        #If there's custom stuff to remove, do it here
        for stuff_to_clean in custom_clean:
            mem_cont_raw = re.sub(str(stuff_to_clean), ' ', mem_cont_raw)
        mem_cont = mem_cont_raw

        #Tokenize words
        words_in_mc = tokenizer.tokenize(mem_cont)
        words_in_mc = clean_list(w_blacklist, words_in_mc)

        ##########
        #Currently does not support custom stopwords/stems
        ##########
        # #Custom stoplist stemming
        # if custom_stopwords == True and custom_stems == True:
        # 	if stem_exclude:
        # 		excl_list=custom_list(stem_exclude,stoplist_directory)
        # 	else:
        # 		excl_list=[]
        # 	custom_stemlist=[i for i in custom_stoplist if i not in excl_list]
        # 	words_in_mc = clean_stems(custom_stemlist,words_in_mc)

        #Get wordcount
        if stem_mc:
            ustemmed_in_mc = []
            stemmed_in_mc = stem_list(words_in_mc)
            stemmed_in_mc = clean_list(s_blacklist, stemmed_in_mc)
            ustemmed_in_mc = combine_lists(ustemmed_in_mc, stemmed_in_mc)
            stemmed_dict, s_d_rev = add_to_master(stemmed_dict, s_d_rev,
                                                  ustemmed_in_mc, ref_file)
            counter_for_mc = count_words_counter(stemmed_dict, stemmed_in_mc)

        else:
            unique_in_mc = []
            unique_in_mc = combine_lists(unique_in_mc, words_in_mc)
            word_dict, w_d_rev = add_to_master(word_dict, w_d_rev,
                                               unique_in_mc, ref_file)
            counter_for_mc = count_words_counter(word_dict, words_in_mc)
        counter_full += counter_for_mc

        multi_index, multi_count = make_multi(counter_for_mc)
        mc_str = make_multi_full(counter_for_mc)

        #Process the heck out of it
        mc = pandas.Series(multi_count, index=multi_index, dtype='int')
        mc_dict = mc.to_dict()
        print('Processing: ' + str(index1))
        mc_str = str(mc_str)
        working_df.loc[index1, 'Full String'] = mc_str
        w_mc.append(mc_dict)
        file_count += 1

    #######################
    #Write out the outputs
    #######################

    os.chdir(outpath)
    final_result = working_df

    #Write out metafile
    final_result_meta = working_df[[
        'index1',
        'id',
        'Full String',
        'day',
        'month',
        'year',
        'speaker',
    ]]
    final_result_meta.to_csv('final_meta.csv', encoding='utf-8', index=False)

    #Write out your datfile
    datfile = working_df[['Full String']]
    datfile.to_csv('full-mult.dat', header=False, index=False)

    #Close your ref file; you're done appending it.
    ref_file.close()

    return final_result
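A hypothetical invocation, assuming the helper functions noted above are importable and the output directory exists (the dataframe values and the path here are made up):

df = pandas.DataFrame({
    'id': [1, 2],
    'day': [4, 5],
    'month': [7, 7],
    'year': [2020, 2020],
    'speaker': ['Smith', 'Jones'],
    'speech': ['The cat sat on the mat.', 'Dogs chase cats.'],
})
result = pandasDF_to_mult(df, '/tmp/mult-out/')
print(result['Full String'].tolist())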