Example #1
0
def test_frequencies_entries():
    """Compare the ``frequencies`` result for ``df['col2']`` with the expected pairs."""
    expected = [
        ('random', 2),
        ('1 world', 1),
        ('hello world 2', 1),
        ('hello hello', 1),
        ('world world', 1),
    ]
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["frequencies"] == expected
def test_frequencies_entries():
    """The ``frequencies`` entry for ``df["col2"]`` must equal the expected list of pairs."""
    analysis = analyze.textAnalysis(df["col2"])
    expected_pairs = [("random", 2)]
    # Every remaining entry is expected to occur exactly once.
    expected_pairs += [
        (entry, 1)
        for entry in ("1 world", "hello world 2", "hello hello", "world world")
    ]
    assert analysis["frequencies"] == expected_pairs
Example #3
0
def frequency(df, columnIndex, options=None):
    """Render a horizontal frequency bar chart for one data frame column.

    The column at position ``columnIndex`` is summarised either by row value
    (``value_counts()``) or, in word mode, by the ``"word_frequencies"`` entry
    returned by ``textAnalysis``. The chart is drawn with matplotlib, saved as
    a PNG and returned as Base64 text.

    Recognised ``options`` keys (ignored unless ``options`` is a ``dict``):

    *   **useWords** : must be exactly ``True``; only honoured when the column
        dtype is neither numeric nor ``datetime64``
    *   **cutoff** : number of entries to plot; accepted only when
        ``0 < cutoff <= 50`` (converted with ``int``), otherwise 50 is used

    Args:
        df (pandas.DataFrame): data frame
        columnIndex (int): positional index of the column to plot
        options (dict, optional): options dictionary

    Returns:
        dict: ``{'image': <Base64-encoded PNG as text>}``
    """
    # Local import keeps this block self-contained; PNG output is binary, so
    # a bytes buffer is required (a text StringIO rejects bytes on Python 3).
    from io import BytesIO

    cutoff = 50
    useWords = False
    column = df[df.columns[columnIndex]]

    if isinstance(options, dict):
        if options.get("useWords", False) is True and not issubclass(
                column.dtype.type, (np.datetime64, np.number)):
            useWords = True
        requested = options.get("cutoff", -1)
        if 0 < requested <= 50:
            cutoff = int(requested)

    def as_label(value):
        # Only byte strings need decoding. Calling .decode() on text that is
        # already unicode triggers an implicit ASCII encode on Python 2 (which
        # raises for non-ASCII text) and does not exist at all on Python 3.
        if isinstance(value, bytes):
            return value.decode("utf-8", "replace")
        return value

    # Entries are collected in reverse rank order (lowest plotted rank first),
    # presumably so the most frequent entry ends up at the top of the chart.
    values = []
    counts = []
    if useWords:
        ranked = textAnalysis(column)["word_frequencies"]
        for entry in reversed(ranked[:cutoff]):
            values.append(as_label(entry[0]))
            counts.append(entry[1])
    else:
        ranked = column.value_counts()
        for position in reversed(range(min(cutoff, len(ranked)))):
            values.append(as_label(ranked.index[position]))
            counts.append(ranked.iloc[position])

    pyplot.style.use('ggplot')
    fig = pyplot.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)

    ax.set_ylim(-0.5, len(values) - 0.5)
    ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Value")

    stream = BytesIO()
    try:
        fig.savefig(stream, format="png", dpi=300)
    finally:
        # Always release the figure, even if rendering fails.
        pyplot.close(fig)

    return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
Example #4
0
def frequency(df, columnIndex, options=None):
	"""Render a horizontal frequency bar chart for one data frame column.

	The column at position ``columnIndex`` is summarised either by row value
	(``value_counts()``) or, in word mode, by the ``"word_frequencies"`` entry
	returned by ``textAnalysis``. The chart is drawn with matplotlib, saved as
	a PNG and returned as Base64 text.

	Recognised ``options`` keys (ignored unless ``options`` is a ``dict``):

	*	**useWords** : must be exactly ``True``; only honoured when the column
		dtype is neither numeric nor ``datetime64``
	*	**cutoff** : number of entries to plot; accepted only when
		``0 < cutoff <= 50`` (converted with ``int``), otherwise 50 is used

	Args:
		df (pandas.DataFrame): data frame
		columnIndex (int): positional index of the column to plot
		options (dict, optional): options dictionary

	Returns:
		dict: ``{'image': <Base64-encoded PNG as text>}``
	"""
	# Local import keeps this block self-contained; PNG output is binary, so
	# a bytes buffer is required (a text StringIO rejects bytes on Python 3).
	from io import BytesIO

	cutoff = 50
	useWords = False
	column = df[df.columns[columnIndex]]

	if isinstance(options, dict):
		if options.get("useWords", False) is True and not issubclass(column.dtype.type, (np.datetime64, np.number)):
			useWords = True
		requested = options.get("cutoff", -1)
		if 0 < requested <= 50:
			cutoff = int(requested)

	def as_label(value):
		# Only byte strings need decoding. Calling .decode() on text that is
		# already unicode triggers an implicit ASCII encode on Python 2 (which
		# raises for non-ASCII text) and does not exist at all on Python 3.
		if isinstance(value, bytes):
			return value.decode("utf-8", "replace")
		return value

	# Entries are collected in reverse rank order (lowest plotted rank first),
	# presumably so the most frequent entry ends up at the top of the chart.
	values = []
	counts = []
	if useWords:
		ranked = textAnalysis(column)["word_frequencies"]
		for entry in reversed(ranked[:cutoff]):
			values.append(as_label(entry[0]))
			counts.append(entry[1])
	else:
		ranked = column.value_counts()
		for position in reversed(range(min(cutoff, len(ranked)))):
			values.append(as_label(ranked.index[position]))
			counts.append(ranked.iloc[position])

	pyplot.style.use('ggplot')
	fig = pyplot.figure(figsize=(10, 8))
	ax = fig.add_subplot(111)

	ax.set_ylim(-0.5, len(values) - 0.5)
	ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
	ax.set_xlabel("Frequency")
	ax.set_ylabel("Value")

	stream = BytesIO()
	try:
		fig.savefig(stream, format="png", dpi=300)
	finally:
		# Always release the figure, even if rendering fails.
		pyplot.close(fig)

	return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
Example #5
0
def test_common_word_mode():
    """The analysis of ``df['col2']`` must report a ``word_mode_frequency`` of 4."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_mode_frequency"] == 4
Example #6
0
def test_unique_words():
    """The analysis of ``df['col2']`` must report a ``word_unique_count`` of 5."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_unique_count"] == 5
Example #7
0
def test_words_average():
    """The analysis of ``df['col2']`` must report a ``word_count_average`` of 1."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_count_average"] == 1
Example #8
0
def test_string_average():
    """The analysis of ``df['col2']`` must report a ``word_length_average`` of 4."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_length_average"] == 4
Example #9
0
def test_mode_count_entries():
    """The analysis of ``df['col2']`` must report a ``mode_frequency`` of 2."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["mode_frequency"] == 2
def test_total_words():
    """Check that ``word_total`` for ``df["col2"]`` equals 11."""
    expected_total = 11
    observed = analyze.textAnalysis(df["col2"])["word_total"]
    assert observed == expected_total
def test_words_average():
    """Check that ``word_count_average`` for ``df["col2"]`` equals 1."""
    expected_average = 1
    observed = analyze.textAnalysis(df["col2"])["word_count_average"]
    assert observed == expected_average
def test_words_max():
    """Check that ``word_count_max`` for ``df["col2"]`` equals 3."""
    expected_max = 3
    observed = analyze.textAnalysis(df["col2"])["word_count_max"]
    assert observed == expected_max
def test_string_average():
    """Check that ``word_length_average`` for ``df["col2"]`` equals 4."""
    expected_average = 4
    observed = analyze.textAnalysis(df["col2"])["word_length_average"]
    assert observed == expected_average
def test_string_max_length():
    """Check that ``word_length_max`` for ``df["col2"]`` equals 6."""
    expected_length = 6
    observed = analyze.textAnalysis(df["col2"])["word_length_max"]
    assert observed == expected_length
def test_mode_count_entries():
    """Check that ``mode_frequency`` for ``df["col2"]`` equals 2."""
    expected_frequency = 2
    observed = analyze.textAnalysis(df["col2"])["mode_frequency"]
    assert observed == expected_frequency
def test_mode_entries():
    """Check that ``mode`` for ``df["col2"]`` is the single-item list ``["random"]``."""
    expected_mode = ["random"]
    observed = analyze.textAnalysis(df["col2"])["mode"]
    assert observed == expected_mode
Example #17
0
def test_mode_entries():
    """The analysis of ``df['col2']`` must report ``['random']`` as its ``mode``."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["mode"] == ['random']
def test_unique_words():
    """Check that ``word_unique_count`` for ``df["col2"]`` equals 5."""
    expected_count = 5
    observed = analyze.textAnalysis(df["col2"])["word_unique_count"]
    assert observed == expected_count
Example #19
0
def test_string_max_length():
    """The analysis of ``df['col2']`` must report a ``word_length_max`` of 6."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_length_max"] == 6
def test_common_word():
    """Check that ``word_mode`` for ``df["col2"]`` is the single-item list ``["world"]``."""
    expected_words = ["world"]
    observed = analyze.textAnalysis(df["col2"])["word_mode"]
    assert observed == expected_words
Example #21
0
def test_words_max():
    """The analysis of ``df['col2']`` must report a ``word_count_max`` of 3."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_count_max"] == 3
def test_common_word_mode():
    """Check that ``word_mode_frequency`` for ``df["col2"]`` equals 4."""
    expected_frequency = 4
    observed = analyze.textAnalysis(df["col2"])["word_mode_frequency"]
    assert observed == expected_frequency
Example #23
0
def test_total_words():
    """The analysis of ``df['col2']`` must report a ``word_total`` of 11."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_total"] == 11
def test_word_frequency():
    """Check ``word_frequencies`` for ``df["col2"]`` against the expected list of pairs."""
    expected_pairs = [
        ("world", 4),
        ("hello", 3),
        ("random", 2),
        ("1", 1),
        ("2", 1),
    ]
    observed = analyze.textAnalysis(df["col2"])["word_frequencies"]
    assert observed == expected_pairs
Example #25
0
def test_common_word():
    """The analysis of ``df['col2']`` must report ``['world']`` as its ``word_mode``."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_mode"] == ['world']
def test_invalid_entries():
    """Check that the ``invalid`` count for ``df["col2"]`` equals 2."""
    expected_invalid = 2
    observed = analyze.textAnalysis(df["col2"])["invalid"]
    assert observed == expected_invalid
Example #27
0
def test_word_frequency():
    """The ``word_frequencies`` result for ``df['col2']`` must equal the expected pairs."""
    words = ('world', 'hello', 'random', '1', '2')
    tallies = (4, 3, 2, 1, 1)
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["word_frequencies"] == list(zip(words, tallies))
Example #28
0
def frequency(df, columnIndex, options=None):
    """Uses ``matplotlib`` to generate a horizontal frequency bar chart of the specified :class:`pandas.DataFrame` column

    This function uses the :meth:`pandas.Series.value_counts` method (or :func:`dcs.analyze.textAnalysis`['word_frequencies'] if plotting word frequency)
    to get the (value, frequency) tuples for the specified column. A horizontal bar chart is generated with the :func:`matplotlib.axes.Axes.barh` function,
    and the chart is exported to a PNG image and then encoded into a string using Base64.

    .. note::

        The *options* kwarg can be used to customize the plot and may have the following key-value pairs
        (it is ignored entirely unless it is a ``dict``):

        *   **useWords** : a ``bool`` flag which may be set to ``True`` to plot word frequencies instead of row value frequencies;
            only honoured when the column dtype is neither numeric nor ``datetime64``
        *   **cutoff** : an ``int`` specifying the top *n* values by frequency to plot, default is 50, maximum is 50;
            values outside ``0 < cutoff <= 50`` fall back to the default

    The function returns a dictionary with the following key-value pairs:

    *   **image** : *str* – Base64 encoded PNG image of the generated plot

    Args:
        df (pandas.DataFrame): data frame
        columnIndex (int): positional index of the column to plot
        options (dict, optional): options dictionary

    Returns:
        dict: dictionary containing image
    """
    # Local import keeps this block self-contained; PNG output is binary, so
    # a bytes buffer is required (a text StringIO rejects bytes on Python 3).
    from io import BytesIO

    cutoff = 50
    useWords = False
    column = df[df.columns[columnIndex]]

    if isinstance(options, dict):
        if options.get("useWords", False) is True and not issubclass(
                column.dtype.type, (np.datetime64, np.number)):
            useWords = True
        requested = options.get("cutoff", -1)
        if 0 < requested <= 50:
            cutoff = int(requested)

    def as_label(value):
        # Only byte strings need decoding. Calling .decode() on text that is
        # already unicode triggers an implicit ASCII encode on Python 2 (which
        # raises for non-ASCII text) and does not exist at all on Python 3.
        if isinstance(value, bytes):
            return value.decode("utf-8", "replace")
        return value

    # Entries are collected in reverse rank order (lowest plotted rank first),
    # presumably so the most frequent entry ends up at the top of the chart.
    values = []
    counts = []
    if useWords:
        ranked = textAnalysis(column)["word_frequencies"]
        for entry in reversed(ranked[:cutoff]):
            values.append(as_label(entry[0]))
            counts.append(entry[1])
    else:
        ranked = column.value_counts()
        for position in reversed(range(min(cutoff, len(ranked)))):
            values.append(as_label(ranked.index[position]))
            counts.append(ranked.iloc[position])

    pyplot.style.use('ggplot')
    fig = pyplot.figure(figsize=(10, 8))
    ax = fig.add_subplot(111)

    ax.set_ylim(-0.5, len(values) - 0.5)
    ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Value")

    stream = BytesIO()
    try:
        fig.savefig(stream, format="png", dpi=300)
    finally:
        # Always release the figure, even if rendering fails.
        pyplot.close(fig)

    return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
Example #29
0
def frequency(df, columnIndex, options=None):
	"""Uses ``matplotlib`` to generate a horizontal frequency bar chart of the specified :class:`pandas.DataFrame` column

	This function uses the :meth:`pandas.Series.value_counts` method (or :func:`dcs.analyze.textAnalysis`['word_frequencies'] if plotting word frequency)
	to get the (value, frequency) tuples for the specified column. A horizontal bar chart is generated with the :func:`matplotlib.axes.Axes.barh` function,
	and the chart is exported to a PNG image and then encoded into a string using Base64.

	.. note::

		The *options* kwarg can be used to customize the plot and may have the following key-value pairs
		(it is ignored entirely unless it is a ``dict``):

		*	**useWords** : a ``bool`` flag which may be set to ``True`` to plot word frequencies instead of row value frequencies;
			only honoured when the column dtype is neither numeric nor ``datetime64``
		*	**cutoff** : an ``int`` specifying the top *n* values by frequency to plot, default is 50, maximum is 50;
			values outside ``0 < cutoff <= 50`` fall back to the default

	The function returns a dictionary with the following key-value pairs:

	*	**image** : *str* – Base64 encoded PNG image of the generated plot

	Args:
		df (pandas.DataFrame): data frame
		columnIndex (int): positional index of the column to plot
		options (dict, optional): options dictionary

	Returns:
		dict: dictionary containing image
	"""
	# Local import keeps this block self-contained; PNG output is binary, so
	# a bytes buffer is required (a text StringIO rejects bytes on Python 3).
	from io import BytesIO

	cutoff = 50
	useWords = False
	column = df[df.columns[columnIndex]]

	if isinstance(options, dict):
		if options.get("useWords", False) is True and not issubclass(column.dtype.type, (np.datetime64, np.number)):
			useWords = True
		requested = options.get("cutoff", -1)
		if 0 < requested <= 50:
			cutoff = int(requested)

	def as_label(value):
		# Only byte strings need decoding. Calling .decode() on text that is
		# already unicode triggers an implicit ASCII encode on Python 2 (which
		# raises for non-ASCII text) and does not exist at all on Python 3.
		if isinstance(value, bytes):
			return value.decode("utf-8", "replace")
		return value

	# Entries are collected in reverse rank order (lowest plotted rank first),
	# presumably so the most frequent entry ends up at the top of the chart.
	values = []
	counts = []
	if useWords:
		ranked = textAnalysis(column)["word_frequencies"]
		for entry in reversed(ranked[:cutoff]):
			values.append(as_label(entry[0]))
			counts.append(entry[1])
	else:
		ranked = column.value_counts()
		for position in reversed(range(min(cutoff, len(ranked)))):
			values.append(as_label(ranked.index[position]))
			counts.append(ranked.iloc[position])

	pyplot.style.use('ggplot')
	fig = pyplot.figure(figsize=(10, 8))
	ax = fig.add_subplot(111)

	ax.set_ylim(-0.5, len(values) - 0.5)
	ax.barh(np.arange(len(values)), counts, tick_label=values, align="center")
	ax.set_xlabel("Frequency")
	ax.set_ylabel("Value")

	stream = BytesIO()
	try:
		fig.savefig(stream, format="png", dpi=300)
	finally:
		# Always release the figure, even if rendering fails.
		pyplot.close(fig)

	return {'image': base64.b64encode(stream.getvalue()).decode('utf-8')}
Example #30
0
def test_invalid_entries():
    """The analysis of ``df['col2']`` must report an ``invalid`` count of 2."""
    analysis = analyze.textAnalysis(df['col2'])
    assert analysis["invalid"] == 2