def group_years(df, fill_years=True, export=False, export_path='.'):
    """Return DataFrame with texts grouped by year.

    Texts are grouped based on the year of the 'date' column. By default,
    intermittent years without texts are included for a continuous
    timeline suitable for linear plotting. If needed for external
    analysis, texts grouped by year can be exported as .txt-files.

    Parameters
    ----------
    df : DataFrame
        Frame holding a 'text' column and a 'date' column that
        ``pd.to_datetime`` can parse.
    fill_years : bool, default True
        Insert rows with ``doc_count`` 0 and an empty text for years
        inside the covered range that have no texts. Years before the
        first or after the last populated year are never added.
    export : bool, default False
        Write the joined texts of every populated year to a text file.
    export_path : str, default '.'
        Path prefix used to build the export file names.

    Returns
    -------
    DataFrame
        Indexed by year, sorted chronologically, with the columns
        'doc_count' and 'text' (texts of a year joined by newlines).
    """
    col = 'text'
    # frame_check is defined elsewhere in the file; presumably it
    # validates that `col` is usable in `df`.
    frame_check(df, col)

    # Drop rows without text: rows whose text contains 'PDF' and rows with
    # a missing text (na=True marks them for removal instead of leaving NaN
    # in the mask, which cannot be inverted).
    df = df[~df[col].str.contains('PDF', na=True)].copy()

    # Convert dates to datetime format. Plain column assignment replaces
    # the column; `df.loc[:, 'date'] = ...` writes into the existing column
    # on recent pandas and can leave an object column without `.dt`.
    df['date'] = pd.to_datetime(df['date'])
    years = df['date'].dt.year

    # Create empty DataFrame indexed by year.
    year_frame = pd.DataFrame(columns=['doc_count', col], index=years.unique())

    # Iterate over years and insert texts accordingly.
    for year in year_frame.index:
        texts = list(df.loc[years == year, col])
        year_text = '\n'.join(texts)
        year_frame.loc[year] = {'doc_count': len(texts), col: year_text}

        if export:
            # Export grouped texts to new text file. An explicit encoding
            # keeps the output independent of the platform default.
            text_file = generate_filename(f"{export_path}/texts_{year}", 'txt', try_original=True)
            with open(text_file, 'w', encoding='utf-8') as f:
                f.write(year_text)

    # Nothing to fill when no rows survived the filtering above.
    if fill_years and len(year_frame.index) > 0:
        # Add empty years to DataFrame.
        # NOTE: Empty years at beginning and end of range are not included.
        first_year = year_frame.index.min()
        last_year = year_frame.index.max()
        # The last year is always present already, so the exclusive upper
        # bound of range() skips nothing.
        for year in range(first_year, last_year):
            if year not in year_frame.index:
                year_frame.loc[year] = {'doc_count': 0, col: ''}

    # Sort index for coherent timeline.
    year_frame = year_frame.sort_index()

    return year_frame
def relative_frequencies(df, term, col='text', strict=False):
    """Return Series of relative frequencies.

    For every row the number of matches of term in the column is divided
    by the number of whitespace-separated words in that cell. Rows where
    the ratio is undefined (NaN) are reported as 0.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyse.
    term : str
        Pattern handed to ``Series.str.count``; also used as the name of
        the returned Series.
    col : str, default 'text'
        Name of the column holding the texts.
    strict : bool, default False
        Count case-sensitively when True, case-insensitively otherwise.
    """
    frame_check(df, col)

    texts = df[col]
    totals = texts.str.split().str.len()

    # No flags means case-sensitive matching; IGNORECASE relaxes it.
    flags = 0 if strict else re.IGNORECASE
    hits = texts.str.count(term, flags)

    ratios = hits / totals
    return pd.Series(ratios, name=term).fillna(0)
def get_edges(df, col, sep=';'):
    """Return list of edges (tuples of pairs) and a dictionary of values.

    Every cell of the column that holds more than one value is split on
    the separator, and each pair of values sharing a cell becomes an edge.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyse.
    col : str
        Name of the column holding the separated values.
    sep : str, default ';'
        Separator between values in a cell. A single character is matched
        literally; longer separators are interpreted by pandas as regular
        expressions.

    Returns
    -------
    edge_list : list of tuple
        One ``(first_id, second_id)`` tuple per pair of values in a cell,
        in cell order.
    values : dict
        Maps every value to its numeric ID (position in sorted order).
    """
    from itertools import combinations

    frame_check(df, col)

    # Prepare DataFrame for processing.
    df = df.dropna(axis=0, subset=[col])

    # Get rows with multiple values. `str.split` treats a one-character
    # separator literally, so the filter must do the same; otherwise a
    # separator such as '|' or '.' would match every row.
    df = df[df[col].str.contains(sep, regex=len(sep) != 1)]
    split_frame = df[col].str.split(sep, expand=True)

    # expand=True pads short rows with missing values; keep only real,
    # non-empty strings.
    rows = [
        [item for item in row if isinstance(item, str) and item]
        for row in split_frame.values.tolist()
    ]

    # Generate dictionary of values and IDs for cross referencing.
    unique_values = sorted({item for row in rows for item in row})
    values = {item: num for num, item in enumerate(unique_values)}

    # Generate list of edges: every pair of values within the same cell.
    edge_list = [
        (values[first_item], values[second_item])
        for row in rows
        for first_item, second_item in combinations(row, 2)
    ]

    return edge_list, values
def filter_frame(df, terms, col='subjects', strict=False):
    """Return slice of DataFrame.

    Filter DataFrame based on presence of term in specified column
    (Default: "subjects"). Terms can be passed as a string or a list of
    strings. For multiple terms, documents containing any of the terms are
    included; a document matching several terms is returned only once.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to filter.
    terms : str or list of str
        Search term(s).
    col : str, default 'subjects'
        Name of the column to search. Rows with a missing value in this
        column are never returned.
    strict : bool, default False
        When True, a cell must equal the term exactly. Otherwise the term
        is matched case-insensitively anywhere in the cell and is handed
        to ``Series.str.contains`` as the pattern.

    Returns
    -------
    DataFrame
        The matching rows, in their original order.

    Raises
    ------
    TypeError
        If terms is neither a string nor a list.
    Exception
        If strict is True and a term does not occur in the column.
    """
    frame_check(df, col)

    # Filter out NA values.
    df = df.dropna(axis=0, subset=[col])

    if isinstance(terms, str):
        terms = [terms]
    if not isinstance(terms, list):
        raise TypeError('Search terms must be a string or a list of strings.')

    # Accumulate one boolean mask across all terms so that rows matching
    # several terms are not duplicated in the result.
    keep = pd.Series(False, index=df.index, dtype=bool)

    for term in terms:
        if strict:
            # Matching exact content of cell.
            if term not in df[col].values:
                raise Exception(f'Passed term "{term}" not found in {col}.')
            term_mask = df[col] == term
        else:
            # Case-insensitive matching of cells containing term.
            # na=False treats non-string cells as non-matching.
            term_mask = df[col].str.contains(term, case=False, na=False)
        keep = keep | term_mask

    return df[keep]
def counter(df, col, sep=';', drop_na=True, na_value='N/A', sort='values', top=None, min_val=None):
    """Return dictionary of value counts.

    Count and sort values of specified column in DataFrame.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyse.
    col : str
        Name of column to count.
    sep : str, default ';'
        Value of separator for parsing multi-value cells.
    drop_na : bool, default True
        Exclude NA values from counting.
    na_value : str, default 'N/A'
        NA value to use when "drop_na=False".
    sort : {'values', 'keys'}, default 'values'
        Sort the dictionary by values (descending) or keys (ascending).
        Any other value prints a notice and falls back to 'values'.
    top : int, default None, optional
        Limit the dictionary to the first `top` items of the sorted
        result.
    min_val : int, default None, optional
        Specify the minimum number of occurrences to include in
        dictionary. Applied before `top` when both are given.

    Returns
    -------
    dict
        Maps every value (stripped of surrounding whitespace) to its
        number of occurrences.

    Raises
    ------
    TypeError
        If `top` or `min_val` is set to something other than an integer.
    """
    from collections import Counter

    frame_check(df, col)

    # Falsy limits (None, 0) mean "no limit"; bool is rejected even though
    # it subclasses int.
    if top and (not isinstance(top, int) or isinstance(top, bool)):
        raise TypeError(f'"top" must be an integer. Passed value is of type {type(top)}.')
    if min_val and (not isinstance(min_val, int) or isinstance(min_val, bool)):
        raise TypeError(f'"min_val" must be an integer. Passed value is of type {type(min_val)}.')

    # Handle NA values before counting. Only the counted column matters.
    if drop_na:
        cells = df[col].dropna()
    else:
        cells = df[col].fillna(na_value)

    # Iterate over cell values (not index labels, which may repeat) and
    # count every separated item.
    count_dict = Counter(
        item.strip()
        for cell in cells
        for item in cell.split(sep)
    )

    # Sort dictionary by keys or values
    if sort not in ('keys', 'values'):
        print('Sort method not recognised. Must be "keys" or "values". Default sorting by values.')
        sort = 'values'

    if sort == 'values':
        # The sort is stable, so ties keep their first-seen order.
        ordered = sorted(count_dict.items(), key=lambda pair: pair[1], reverse=True)
    else:
        ordered = sorted(count_dict.items())

    # Modify dictionary before returning.
    if min_val:
        ordered = [(item, count) for item, count in ordered if count >= min_val]
    if top:
        # A negative `top` yields an empty result rather than slicing
        # from the end.
        ordered = ordered[:max(top, 0)]

    return dict(ordered)