Exemple #1
0
def group_years(df, fill_years=True, export=False, export_path='.'):
    """Return DataFrame with texts grouped by year.

    The texts in the 'text' column are joined (newline-separated) per year of
    the 'date' column. By default, intermittent years without texts are
    included for a continuous timeline suitable for linear plotting.

    If needed for external analysis, texts grouped by year can be exported as
    .txt-files.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns 'text' and 'date'; 'date' must be parseable
        by ``pd.to_datetime``.
    fill_years : bool, default True
        Add rows with ``doc_count`` 0 and empty text for years without texts
        between the first and last year, and sort the result by year.
    export : bool, default False
        Write the joined text of every year to its own text file.
    export_path : str, default '.'
        Directory prefix used for the exported files.

    Returns
    -------
    DataFrame
        Indexed by year, with the columns 'doc_count' and 'text'.
    """
    col = 'text'
    frame_check(df, col)

    # Drop rows without text: rows whose text contains 'PDF' are excluded
    # (presumably a placeholder for missing text -- confirm against the data).
    df = df[~df[col].str.contains('PDF')].copy()

    # Convert dates to datetime-format.
    # The column is replaced rather than assigned through ``.loc[:, 'date']``:
    # pandas >= 2.0 writes ``.loc`` assignments into the existing column in
    # place, which leaves a string column with dtype object and makes the
    # ``.dt`` accessor below fail.
    df['date'] = pd.to_datetime(df['date'])
    years = df['date'].dt.year

    # Create empty DataFrame indexed by year.
    year_frame = pd.DataFrame(columns=['doc_count', col], index=years.unique())

    # Iterate over years and insert texts accordingly.
    for year in year_frame.index:
        texts = list(df.loc[years == year, col])
        year_text = '\n'.join(texts)

        year_frame.loc[year] = {'doc_count': len(texts), col: year_text}

        if export:
            # Export grouped texts to new text file.
            text_file = generate_filename(f"{export_path}/texts_{year}", 'txt', try_original=True)
            # Explicit encoding so non-ASCII texts do not depend on the
            # platform's default encoding.
            with open(text_file, 'w', encoding='utf-8') as f:
                f.write(year_text)

    if fill_years:
        # Add empty years to DataFrame.
        # NOTE: Empty years at beginning and end of range are not included.
        # With no rows at all there is no range to fill (min/max are NaN).
        if len(year_frame.index) > 0:
            for year in range(year_frame.index.min(), year_frame.index.max()):
                if year not in year_frame.index:
                    year_frame.loc[year] = {'doc_count': 0, col: ''}

        # Sort index for coherent timeline.
        year_frame = year_frame.sort_index()

    return year_frame
Exemple #2
0
def relative_frequencies(df, term, col='text', strict=False):
    """Return Series of relative frequencies.

    For every row, the number of matches of ``term`` in ``col`` (as counted by
    ``Series.str.count``) is divided by the number of whitespace-separated
    words in that cell. Undefined ratios (NaN) are replaced by 0.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyse.
    term : str
        Pattern to count; also used as the name of the returned Series.
    col : str, default 'text'
        Name of the column holding the texts.
    strict : bool, default False
        Count case-sensitively when True, otherwise ignore case.
    """
    frame_check(df, col)

    texts = df[col]

    # No regex flags for strict (case-sensitive) counting, IGNORECASE otherwise.
    flags = 0 if strict else re.IGNORECASE
    matches = texts.str.count(term, flags)

    totals = texts.str.split().str.len()

    ratios = pd.Series(matches / totals, name=term)
    return ratios.fillna(0)
Exemple #3
0
def get_edges(df, col, sep=';'):
    """Return a list of edges (tuples of pairs) and a dictionary of values.

    Only cells containing ``sep`` (i.e. holding several values) are used.
    Every distinct non-empty value gets an integer ID, assigned in sorted
    order, and every pair of values sharing a cell becomes one edge.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyse.
    col : str
        Name of the column holding the multi-value cells.
    sep : str, default ';'
        Separator between values within a cell.

    Returns
    -------
    tuple of (list, dict)
        ``edge_list`` holds ``(first_id, second_id)`` tuples in row order;
        ``values`` maps each value to its ID. Both are empty when no cell
        holds several values.
    """
    from itertools import combinations

    frame_check(df, col)

    # Prepare DataFrame for processing.
    df = df.dropna(axis=0, subset=[col])

    # Nothing to split; also avoids the .str accessor failing on an empty,
    # non-string column.
    if df.empty:
        return [], {}

    # Get rows with multiple values.
    df = df[df[col].str.contains(sep)]

    if df.empty:
        return [], {}

    row_list = df[col].str.split(sep, expand=True).values.tolist()

    # expand=True pads shorter rows; drop the padding and empty strings.
    rows = [[item for item in row if item] for row in row_list]

    # Generate dictionary of values and IDs for cross referencing.
    unique_items = sorted({item for row in rows for item in row})
    values = {item: num for num, item in enumerate(unique_items)}

    # Generate list of edges: every pair of values within the same row,
    # keeping the order in which the values appear.
    edge_list = [
        (values[first_item], values[second_item])
        for row in rows
        for first_item, second_item in combinations(row, 2)
    ]

    return edge_list, values
Exemple #4
0
def filter_frame(df, terms, col='subjects', strict=False):
    """Return slice of DataFrame.

    Filter DataFrame based on presence of term in specified column
    (Default: "subjects"). Rows with NA in that column are excluded.

    Terms can be passed as a string or a list of strings.
    For multiple terms, documents containing any of the terms are included;
    each matching row is returned once, in its original order.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to filter.
    terms : str or list of str
        Search term(s).
    col : str, default 'subjects'
        Name of the column to search.
    strict : bool, default False
        If True, a cell must equal the term exactly. Otherwise cells
        containing the term are matched case-insensitively.

    Returns
    -------
    DataFrame
        The rows of ``df`` matching at least one term.

    Raises
    ------
    TypeError
        If ``terms`` is neither a string nor a list.
    Exception
        If ``strict`` is True and a term does not occur in the column.
    """

    frame_check(df, col)

    # Filter out NA values.
    df = df.dropna(axis=0, subset=[col])

    if isinstance(terms, str):
        terms = [terms]

    if not isinstance(terms, list):
        raise TypeError('Search terms must be a string or a list of strings.')

    # Nothing can match; also avoids the .str accessor failing on an empty,
    # non-string column.
    if df.empty and not strict:
        return df

    # One boolean mask collects the matches of all terms, so a row matching
    # several terms is still returned only once.
    mask = pd.Series(False, index=df.index)

    for term in terms:
        if strict:
            # Matching exact content of cell.
            hits = df[col] == term
            if not hits.any():
                raise Exception(f'Passed term "{term}" not found in {col}.')
        else:
            # Case-insensitive matching of cells containing term.
            hits = df[col].str.contains(term, case=False, na=False)

        mask = mask | hits

    return df[mask]
Exemple #5
0
def counter(df, col, sep=';', drop_na=True, na_value='N/A', sort='values', top=None, min_val=None):
    """Return dictionary of value counts.

    Count and sort values of specified column in DataFrame. Each cell is
    split on ``sep`` and every item is stripped of surrounding whitespace
    before counting.

    Parameters
    ----------
    df : DataFrame
        The DataFrame to analyse.
    col : str
        Name of column to count.
    sep : str, default ';'
        Value of separator for parsing multi-value cells.
    drop_na : bool, default True
        Exclude NA values from counting.
    na_value : str, default 'N/A'
        NA value to use when "drop_na=False".
    sort : {'values', 'keys'}, default 'values'
        Sort the dictionary by values (descending) or keys (ascending).
        Any other value prints a notice and falls back to 'values'.
    top : int, default None, optional
        Limit the dictionary to the first ``top`` items of the sorted result.
    min_val : int, default None, optional
        Specify the minimum number of occurrences to include in dictionary.
        When combined with ``top``, the minimum is applied first.

    Returns
    -------
    dict
        Mapping of item to count, in sorted order.

    Raises
    ------
    TypeError
        If ``top`` or ``min_val`` is given but is not an integer.
    """
    from collections import Counter

    frame_check(df, col)

    # Handle NA values before counting.
    if drop_na:
        df = df.dropna(axis=0, subset=[col])
    else:
        df = df.fillna(value=na_value)

    # Count items cell by cell. Iterating the values directly (instead of
    # looking cells up by index label) also works with duplicate index labels.
    counts = Counter(
        item.strip()
        for cell in df[col]
        for item in cell.split(sep)
    )

    # Sort dictionary by keys or values
    if sort not in ('keys', 'values'):
        print('Sort method not recognised. Must be "keys" or "values". Default sorting by values.')
        sort = 'values'

    if sort == 'values':
        count_dict = dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))
    else:
        count_dict = dict(sorted(counts.items()))

    # Validate the optional limits; bool is rejected although it subclasses int.
    if top and (isinstance(top, bool) or not isinstance(top, int)):
        raise TypeError(f'"top" must be an integer. Passed value is of type {type(top)}.')

    if min_val and (isinstance(min_val, bool) or not isinstance(min_val, int)):
        raise TypeError(f'"min_val" must be an integer. Passed value is of type {type(min_val)}.')

    # Modify dictionary before returning.
    if min_val:
        count_dict = {item: count for item, count in count_dict.items() if count >= min_val}

    if top:
        # zip() stops after ``top`` items (none for a negative ``top``).
        count_dict = {item: count_dict[item] for item, _ in zip(count_dict, range(top))}

    return count_dict