def output_html(lr):
    """Score the lines of OUT_FILE with ``lr`` and render the confident ones.

    Lines starting with "RT " are skipped, as is any line that starts with
    the first 30 characters of an already-kept line. The surviving lines are
    reversed, turned into features by ``make_feature_matrix`` and scored with
    ``lr.predict_proba`` (column 1). Lines scoring below 0.6 are dropped.

    Each kept line is split on tabs: field 0 is used as the text and fields
    1 and 2 fill the status URL. The resulting list of dicts is passed to
    ``filter.render`` together with OUT_FILE.

    Args:
        lr: object exposing ``predict_proba(X)`` that returns a 2-D array.
    """
    lines = []
    used = []
    # Close the input before render() is handed the very same path below.
    with open(OUT_FILE) as fi:
        for line in fi:
            if line.startswith("RT "):
                continue
            if any(line.startswith(x) for x in used):
                continue
            used.append(line[:30])
            lines.append(line)
    lines.reverse()

    X = make_feature_matrix(lines)
    ps = lr.predict_proba(X)[:, 1]

    data = []
    for line, p in zip(lines, ps):
        if p < 0.6:
            continue
        items = line.split('\t')
        url = "https://twitter.com/{1}/status/{2}".format(*items)
        data.append(dict(url=url, score=p, text=items[0]))

    # Parenthesised form prints the same thing on Python 2 and Python 3.
    print(len(data))
    from filter import render
    render(data, OUT_FILE)
def test_bad_date(self):
    """'Date is' must return an error string for each bad input below."""
    bad_inputs = [
        # columns that aren't dates -> error
        ('a', '2015-7-31'),
        ('b', '2015-7-31'),
        # string that isn't a date -> error
        ('date', 'gibberish'),
    ]
    for column, value in bad_inputs:
        params = {
            'column': column,
            'condition': menu.index('Date is'),
            'value': value,
        }
        out = render(self.table, params)
        # An error is reported by returning a message instead of a table.
        self.assertTrue(isinstance(out, str))
def test_not_empty(self):
    """``cell_is_not_empty`` on column 'c' keeps fixture rows 0 and 3."""
    row_mask = [True, False, False, True, False]

    actual = render(self.table,
                    simple_params('c', 'cell_is_not_empty', 'nonsense'))
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)

    # The condition should not require a value: '' gives the same result.
    actual = render(self.table, simple_params('c', 'cell_is_not_empty', ''))
    assert_frame_equal(actual, wanted)
def test_empty(self):
    """``cell_is_empty`` on column "c" keeps fixture rows 1, 2 and 4."""
    row_mask = [False, True, True, False, True]

    actual = render(self.table, simple_params("c", "cell_is_empty", "nonsense"))
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)

    # The condition should not require a value: "" gives the same result.
    actual = render(self.table, simple_params("c", "cell_is_empty", ""))
    assert_frame_equal(actual, wanted)
def test_not_contains(self):
    """Exercise 'Text does not contain' across case/regex/keep settings."""
    scenarios = [
        # (value, casesensitive, regex, keep-menu label, expected row mask)
        # Case-insensitive, no regex, keep
        ('fred', False, False, 'Keep', [False, False, True, True, False]),
        # Case-sensitive, no regex, keep
        ('fred', True, False, 'Keep', [False, False, True, True, True]),
        # Case-sensitive, regex, keep
        ('f[a-zA-Z]+d', True, True, 'Keep', [False, False, True, True, True]),
        # Case-sensitive, regex, drop
        ('f[a-zA-Z]+d', True, True, 'Drop', [True, True, False, False, False]),
    ]
    for needle, case_flag, regex_flag, keep_label, mask in scenarios:
        params = {
            'column': 'a',
            'condition': menu.index('Text does not contain'),
            'value': needle,
            'casesensitive': case_flag,
            'regex': regex_flag,
            'keep': keepmenu.index(keep_label),
        }
        out = render(self.table, params)
        # The reference keeps the fixture's original index (no reset).
        self.assertTrue(out.equals(self.table[mask]))
def test_not_exactly_case_insensitive(self):
    """Case-insensitive ``text_is_not_exactly`` "x" leaves only "y"."""
    source = pd.DataFrame({"a": ["x", "X", "y"]})
    survivors = pd.DataFrame({"a": ["y"]})

    actual = render(
        source,
        simple_params("a", "text_is_not_exactly", "x", case_sensitive=False),
    )
    assert_frame_equal(actual, survivors)
def _test_render(
    arrow_table: pa.Table,
    params: Dict[str, Any],
    expected_table: Optional[pa.Table],
    expected_errors: Optional[List[I18nMessage]] = None,
):
    """Call ``render`` with a temp output path and check what it produced.

    Args:
        arrow_table: input table handed to ``render``.
        params: parameters handed to ``render``.
        expected_table: table expected in the output file, or ``None`` when
            the output file is expected to stay empty.
        expected_errors: value ``render`` is expected to return; omitted or
            ``None`` means an empty list.

    Raises:
        AssertionError: if errors, column names, column types, column values
            or (for dictionary columns) per-chunk dictionaries differ.
    """
    # Avoid a shared mutable default; None stands in for "no errors".
    if expected_errors is None:
        expected_errors = []

    with tempfile.NamedTemporaryFile() as tf:
        path = Path(tf.name)
        actual_errors = render(arrow_table, params, path)
        # A zero-byte output file is treated as "no table written".
        if path.stat().st_size == 0:
            actual_table = None
        else:
            with pa.ipc.open_file(path) as f:
                actual_table = f.read_all()

    assert actual_errors == expected_errors

    if expected_table is None:
        assert actual_table is None
    else:
        assert actual_table is not None
        assert actual_table.column_names == expected_table.column_names
        for output_column, expected_column in zip(
            actual_table.itercolumns(), expected_table.itercolumns()
        ):
            assert output_column.type == expected_column.type
            assert output_column.to_pylist() == expected_column.to_pylist()
            if pa.types.is_dictionary(output_column.type):
                # Equal values can still hide different dictionaries, so
                # compare the dictionaries chunk by chunk as well.
                for output_chunk, expected_chunk in zip(
                    output_column.iterchunks(), expected_column.iterchunks()
                ):
                    assert (
                        output_chunk.dictionary.to_pylist()
                        == expected_chunk.dictionary.to_pylist()
                    )
def test_two_subfilters_or(self):
    """Two numeric subfilters joined by "or" keep rows matching either."""
    a_below_two = {
        "colname": "A",
        "condition": "number_is_less_than",
        "value": 2,
        "case_sensitive": False,
    }
    b_above_three = {
        "colname": "B",
        "condition": "number_is_greater_than",
        "value": 3,
        "case_sensitive": False,
    }
    or_group = {"operator": "or", "subfilters": [a_below_two, b_above_three]}
    params = {
        "keep": True,
        "filters": {"operator": "and", "filters": [or_group]},
    }

    source = pd.DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]})
    actual = render(source, params)
    # Row 0 has A < 2, row 2 has B > 3; row 1 matches neither.
    assert_frame_equal(actual, pd.DataFrame({"A": [1, 3], "B": [2, 4]}))
def test_two_subfilters_or(self):
    """An 'or' group of two numeric subfilters keeps rows matching either."""
    def numeric_subfilter(colname, condition, value):
        # Build one subfilter dict with the keys in the original order.
        return {
            'colname': colname,
            'condition': condition,
            'value': value,
            'case_sensitive': False,
        }

    source = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
    either = {
        'operator': 'or',
        'subfilters': [
            numeric_subfilter('A', 'number_is_less_than', 2),
            numeric_subfilter('B', 'number_is_greater_than', 3),
        ],
    }
    params = {
        'keep': True,
        'filters': {'operator': 'and', 'filters': [either]},
    }

    actual = render(source, params)
    # Row 0 has A < 2, row 2 has B > 3; row 1 matches neither.
    assert_frame_equal(actual, pd.DataFrame({'A': [1, 3], 'B': [2, 4]}))
def test_equals(self):
    """``number_equals`` '3' on column 'c' keeps only fixture row 0."""
    only_first = [True, False, False, False, False]

    actual = render(self.table, simple_params('c', 'number_equals', '3'))
    wanted = self.table[only_first].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_greater_equals(self):
    """``number_is_greater_than_or_equals`` '2' on 'b' includes the bound."""
    # Edge case: the first row has b=2 and must therefore be kept.
    row_mask = [True, True, False, True, True]

    actual = render(
        self.table,
        simple_params('b', 'number_is_greater_than_or_equals', '2'),
    )
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_less_equals(self):
    """``number_is_less_than_or_equals`` '5' on 'b' includes the bound."""
    # Edge case: the second and last rows have b=5 and must be kept.
    row_mask = [True, True, False, False, True]

    actual = render(
        self.table,
        simple_params('b', 'number_is_less_than_or_equals', '5'),
    )
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_datetime_before(self):
    """``date_is_before`` "2019-01-01" keeps only the 04:59+0500 entry."""
    early = "2019-01-01T04:59+0500"
    late = "2019-01-01T05:01+0500"
    source = pd.DataFrame({"date": [early, late]})

    actual = render(source, simple_params("date", "date_is_before", "2019-01-01"))
    assert_frame_equal(actual, pd.DataFrame({"date": [early]}))
def test_contains_regex_nan(self):
    """A NaN cell is dropped by ``text_contains_regex`` rather than matched."""
    source = pd.DataFrame({'A': ['a', np.nan]})
    wanted = pd.DataFrame({'A': ['a']})

    actual = render(
        source,
        simple_params('A', 'text_contains_regex', 'a', case_sensitive=True),
    )
    assert_frame_equal(actual, wanted)
def test_contains_regex_parse_error_case_insensitive(self):
    """An unparseable regex yields an error message, not a table."""
    source = pd.DataFrame({'A': ['a']})
    # '(' is an unterminated group, so the pattern cannot be parsed.
    broken_pattern = simple_params(
        'A', 'text_contains_regex', '(', case_sensitive=False
    )

    outcome = render(source, broken_pattern)
    self.assertEqual(outcome, 'Regex parse error: missing ): (')
def test_contains_regex_parse_error_case_insensitive(self):
    """A regex that fails to parse is reported as an error string."""
    # "(" is an unterminated group, so the pattern cannot be parsed.
    bad_regex_params = simple_params(
        "A", "text_contains_regex", "(", case_sensitive=False
    )
    frame = pd.DataFrame({"A": ["a"]})

    message = render(frame, bad_regex_params)
    self.assertEqual(message, "Regex parse error: missing ): (")
def test_bad_date(self):
    """``date_is`` reports a specific message for each kind of bad input."""
    column_error = "Column is not dates. Please convert to dates."
    value_error = "Value is not a date. Please enter a date and time."
    cases = [
        # columns that aren't dates -> error
        ("a", "2015-7-31", column_error),
        ("b", "2015-7-31", column_error),
        # string that isn't a date -> error
        ("date", "gibberish", value_error),
    ]
    for column, value, message in cases:
        outcome = render(self.table, simple_params(column, "date_is", value))
        self.assertEqual(outcome, message)
def test_contains_regex_nan(self):
    """``text_contains_regex`` skips a NaN cell and keeps the matching one."""
    regex_params = simple_params(
        "A", "text_contains_regex", "a", case_sensitive=True
    )
    frame = pd.DataFrame({"A": ["a", np.nan]})

    filtered = render(frame, regex_params)
    assert_frame_equal(filtered, pd.DataFrame({"A": ["a"]}))
def test_contains_regex_case_insensitive(self):
    """Case-insensitive regex 'a' matches both 'a' and 'A' but not 'b'."""
    source = pd.DataFrame({'A': ['a', 'A', 'b']})
    wanted = pd.DataFrame({'A': ['a', 'A']})

    actual = render(
        source,
        simple_params('A', 'text_contains_regex', 'a', case_sensitive=False),
    )
    assert_frame_equal(actual, wanted)
def test_contains_regex_case_insensitive(self):
    """With case sensitivity off, regex "a" keeps "a" and "A", drops "b"."""
    regex_params = simple_params(
        "A", "text_contains_regex", "a", case_sensitive=False
    )
    frame = pd.DataFrame({"A": ["a", "A", "b"]})

    filtered = render(frame, regex_params)
    assert_frame_equal(filtered, pd.DataFrame({"A": ["a", "A"]}))
def test_date_after(self):
    """``date_is_after`` '2018-01-12' keeps fixture rows 1 and 4."""
    # Edge case: first row is 2018-1-12 08:15 so after implied midnight of
    # date without time.
    # NOTE(review): the mask below excludes row 0 even though the note above
    # says it falls after midnight -- confirm against the fixture table.
    row_mask = [False, True, False, False, True]

    actual = render(self.table,
                    simple_params('date', 'date_is_after', '2018-01-12'))
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_bad_date(self):
    """Check the error message ``date_is`` returns for each bad input."""
    not_a_date_column = 'Column is not dates. Please convert to dates.'
    not_a_date_value = 'Value is not a date. Please enter a date and time.'

    def outcome_for(column, value):
        # Run the filter against the shared fixture table.
        return render(self.table, simple_params(column, 'date_is', value))

    # columns that aren't dates -> error
    self.assertEqual(outcome_for('a', '2015-7-31'), not_a_date_column)
    self.assertEqual(outcome_for('b', '2015-7-31'), not_a_date_column)

    # string that isn't a date -> error
    self.assertEqual(outcome_for('date', 'gibberish'), not_a_date_value)
def test_not_contains_regex(self):
    """Case-sensitive ``text_does_not_contain_regex`` keeps rows 2, 3, 4."""
    row_mask = [False, False, True, True, True]
    regex_params = simple_params(
        'a', 'text_does_not_contain_regex', 'f[a-zA-Z]+d', case_sensitive=True
    )

    actual = render(self.table, regex_params)
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_equals_non_number_errors(self):
    """``number_equals`` returns an error message for non-numeric input."""
    column_error = "Column is not numbers. Please convert to numbers."
    value_error = "Value is not a number. Please enter a valid number."
    cases = [
        # non-numeric column should return error message
        ("a", "3", column_error),
        ("date", "3", column_error),
        # non-numeric value should return error message
        ("c", "gibberish", value_error),
    ]
    for column, value, message in cases:
        outcome = render(
            self.table, simple_params(column, "number_equals", value)
        )
        self.assertEqual(outcome, message)
def test_category_equals(self):
    """``text_is_exactly`` on a categorical column yields a categorical frame."""
    source = pd.DataFrame({'A': ['foo', np.nan, 'bar']}, dtype='category')
    wanted = pd.DataFrame({'A': ['foo']}, dtype='category')

    actual = render(
        source,
        simple_params('A', 'text_is_exactly', 'foo', case_sensitive=True),
    )
    assert_frame_equal(actual, wanted)
def test_category_equals(self):
    """Exact-match filtering of a "category" column keeps only "foo"."""
    exact_params = simple_params(
        "A", "text_is_exactly", "foo", case_sensitive=True
    )
    frame = pd.DataFrame({"A": ["foo", np.nan, "bar"]}, dtype="category")

    filtered = render(frame, exact_params)
    # The expected frame is categorical as well.
    assert_frame_equal(
        filtered, pd.DataFrame({"A": ["foo"]}, dtype="category")
    )
def test_equals_non_number_errors(self):
    """Check the message ``number_equals`` returns for non-numeric input."""
    not_numeric_column = 'Column is not numbers. Please convert to numbers.'
    not_numeric_value = 'Value is not a number. Please enter a valid number.'

    def outcome_for(column, value):
        # Run the filter against the shared fixture table.
        return render(self.table, simple_params(column, 'number_equals', value))

    # non-numeric column should return error message
    self.assertEqual(outcome_for('a', '3'), not_numeric_column)
    self.assertEqual(outcome_for('date', '3'), not_numeric_column)

    # non-numeric value should return error message
    self.assertEqual(outcome_for('c', 'gibberish'), not_numeric_value)
def test_contains_case_insensitive(self):
    """Case-insensitive ``text_contains`` 'fred' keeps rows 0, 1 and 4."""
    row_mask = [True, True, False, False, True]
    contains_params = simple_params(
        'a', 'text_contains', 'fred', case_sensitive=False
    )

    actual = render(self.table, contains_params)
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_contains_regex(self):
    """Case-sensitive ``text_contains_regex`` keeps fixture rows 0 and 1."""
    row_mask = [True, True, False, False, False]
    regex_params = simple_params(
        "a", "text_contains_regex", "f[a-zA-Z]+d", case_sensitive=True
    )

    actual = render(self.table, regex_params)
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_exactly(self):
    """Case-sensitive ``text_is_exactly`` 'fred' keeps only fixture row 0."""
    only_first = [True, False, False, False, False]
    exact_params = simple_params(
        'a', 'text_is_exactly', 'fred', case_sensitive=True
    )

    actual = render(self.table, exact_params)
    wanted = self.table[only_first].reset_index(drop=True)
    assert_frame_equal(actual, wanted)
def test_not_contains_case_sensitive(self):
    """Case-sensitive ``text_does_not_contain`` "fred" keeps rows 2, 3, 4."""
    row_mask = [False, False, True, True, True]
    not_contains_params = simple_params(
        "a", "text_does_not_contain", "fred", case_sensitive=True
    )

    actual = render(self.table, not_contains_params)
    wanted = self.table[row_mask].reset_index(drop=True)
    assert_frame_equal(actual, wanted)