def test_count_syll(self):
    """Visit a string column with CountSyllable and compare the result.

    The expected frame holds 0, 4 and 1 for the inputs "wdp",
    "tree tree" and "buh" respectively.
    """
    col = "test_column"

    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame({col: ["wdp", "tree tree", "buh"]}))

    expected = FeatureSet()
    expected.set_featureset(pandas.DataFrame({col: [0, 4, 1]}))

    # The visitor is applied to the feature set under test only.
    CountSyllable(col).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_removechar(self):
    """Visit a string column with RemoveChar("s") and compare the result.

    In the expected frame every lowercase "s" is gone while the
    uppercase "S" of "TestString" is still present.
    """
    col = "test_column"
    unwanted = "s"

    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame({col: ["TestString", "super", "wow"]}))

    expected = FeatureSet()
    expected.set_featureset(
        pandas.DataFrame({col: ["TetString", "uper", "wow"]}))

    RemoveChar(col, unwanted).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_lemmatize(self):
    """Visit a string column with Lemmatizer and compare the result."""
    col = "test_column"

    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame(
            {col: ["dogs", "churches", "aardwolves", "abaci"]}))

    # NOTE(review): the expected values are identical to the inputs, so
    # this only passes when the visitor leaves these plurals untouched --
    # confirm that this is the intended Lemmatizer contract.
    expected = FeatureSet()
    expected.set_featureset(
        pandas.DataFrame(
            {col: ["dogs", "churches", "aardwolves", "abaci"]}))

    Lemmatizer(col).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_stem(self):
    """Visit a string column with Stemmer and compare the result."""
    col = "test_column"

    words = ["maximum", "presumably", "multiply", "provision"]
    stems = ["maxim", "presum", "multiply", "provid"]

    actual = FeatureSet()
    actual.set_featureset(pandas.DataFrame({col: words}))

    expected = FeatureSet()
    expected.set_featureset(pandas.DataFrame({col: stems}))

    Stemmer(col).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_ngram(self):
    """Visit a string column with NGram(n=3) and compare the result.

    The single input "12345" is expected to become a list of three
    character triples.
    """
    col = "test_column"
    size = 3

    actual = FeatureSet()
    actual.set_featureset(pandas.DataFrame({col: ["12345"]}))

    trigrams = [("1", "2", "3"), ("2", "3", "4"), ("3", "4", "5")]
    expected = FeatureSet()
    # One row whose cell is the whole list of tuples.
    expected.set_featureset(pandas.DataFrame({col: [trigrams]}))

    NGram(col, size).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_uppercase(self):
    """Visit a string column with UpperCase and compare the result.

    Digits and punctuation ("0!") appear unchanged in the expected frame.
    """
    col = "test_column"

    mixed_case = ["TestString", "TESTSTRING", "teststring", "TestString0!"]
    upper_case = ["TESTSTRING", "TESTSTRING", "TESTSTRING", "TESTSTRING0!"]

    actual = FeatureSet()
    actual.set_featureset(pandas.DataFrame({col: mixed_case}))

    expected = FeatureSet()
    expected.set_featureset(pandas.DataFrame({col: upper_case}))

    UpperCase(col).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_removepunct(self):
    """Visit a string column with RemovePunctuation and compare the result.

    Every input is a variant of "TestString" with ":", ".", "," or "!"
    inserted; the expected frame holds plain "TestString" in each row.
    """
    col = "test_column"

    punctuated = [
        "Test:String",
        "Test.St.ri.ng",
        "Tes,t:String",
        "TestSt::ring!",
    ]
    cleaned = ["TestString", "TestString", "TestString", "TestString"]

    actual = FeatureSet()
    actual.set_featureset(pandas.DataFrame({col: punctuated}))

    expected = FeatureSet()
    expected.set_featureset(pandas.DataFrame({col: cleaned}))

    RemovePunctuation(col).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def __init__(self,
             id=None,
             name=None,
             featureset=None,
             truth_column=None,
             index=""):
    """Initialise the component and wrap the optional data in a FeatureSet.

    Args:
        id: Identifier stored on the component.
        name: Name stored on the component.
        featureset: Optional data handed to ``FeatureSet.set_featureset``;
            when ``None`` the wrapped FeatureSet is left as constructed.
        truth_column: Truth column(s) stored on the component. Defaults
            to a new empty list per instance.
        index: Index value stored on the component.
    """
    self._id = id
    self._name = name

    # FeatureSet wrapper; always present, only filled when data was given.
    self._featureset = FeatureSet()
    if featureset is not None:
        self._featureset.set_featureset(featureset)

    # truth is the string to search for in the DataFrame.
    self._truth = None
    self._truth_shadow = None

    # A literal ``[]`` default would be one list object shared by every
    # instance created without the argument; build a fresh one instead.
    self._truth_column = [] if truth_column is None else truth_column
    self._index = index
    self._processing_history = []
    self._store = False
def test_join(self):
    """Visit a mixed column with JoinOperation("!wow!") and compare.

    The column holds a plain string, a list of strings and an empty
    string. In the expected frame the separator sits between every
    character of the plain string and between the list items, and the
    empty string stays empty.
    """
    col = "test_column"
    separator = "!wow!"

    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame({col: ["TestString", ["Test", "String"], ""]}))

    joined = [
        "T!wow!e!wow!s!wow!t!wow!S!wow!t!wow!r!wow!i!wow!n!wow!g",
        "Test!wow!String",
        "",
    ]
    expected = FeatureSet()
    expected.set_featureset(pandas.DataFrame({col: joined}))

    JoinOperation(col, separator).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_tokenizer(self):
    """Visit a string column with Tokenizer and compare the result.

    Each expected cell is the list of space-separated words of the
    corresponding input sentence.
    """
    col = "test_column"

    sentences = [
        "Test String",
        "anothertest string",
        "Another test string",
        "test",
    ]
    tokens = [
        ["Test", "String"],
        ["anothertest", "string"],
        ["Another", "test", "string"],
        ["test"],
    ]

    actual = FeatureSet()
    actual.set_featureset(pandas.DataFrame({col: sentences}))

    expected = FeatureSet()
    expected.set_featureset(pandas.DataFrame({col: tokens}))

    Tokenizer(col).visit(actual)

    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_mask(self):
    """Visit a numeric frame with Mask("featureset > 3") and compare.

    In the expected frame every value greater than 3 has been replaced
    by NaN, in both columns.
    """
    column = "test_column"
    column_1 = "second_column"
    condition = "featureset > 3"

    test_data = pandas.DataFrame({
        column: [1, 2, 3, 4, 5],
        column_1: [5, 4, 3, 2, 1],
    })
    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    verify_data = pandas.DataFrame({
        column: [1, 2, 3, np.nan, np.nan],
        column_1: [np.nan, np.nan, 3, 2, 1],
    })

    test_featureset = FeatureSet()
    verify_featureset = FeatureSet()
    test_featureset.set_featureset(test_data)
    verify_featureset.set_featureset(verify_data)

    visitor = Mask(condition)
    visitor.visit(test_featureset)

    test.assert_frame_equal(test_featureset.get_featureset(),
                            verify_featureset.get_featureset())
def test_sort(self):
    """Visit a frame with Sort in "column" mode and in "index" mode.

    Case 1 uses Sort("column", <column name>) and expects the rows
    ordered by that column while keeping their original index labels.
    Case 2 uses Sort("index") and expects the rows ordered by index label.
    """
    col = "test_column"

    # --- Case 1: order by the values of ``col`` -------------------------
    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame({
            col: [3, 2, 1, 4, 5],
            "species": ["bb", "ba", "ab", "aa", "bc"],
        }))

    expected = FeatureSet()
    expected.set_featureset(
        pandas.DataFrame(
            {
                col: [1, 2, 3, 4, 5],
                "species": ["ab", "ba", "bb", "aa", "bc"],
            },
            index=[2, 1, 0, 3, 4]))

    Sort("column", col).visit(actual)
    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())

    # --- Case 2: order by the index labels ------------------------------
    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame(
            {
                col: [3, 2, 1, 4, 5],
                "species": ["bb", "ba", "ab", "aa", "bc"],
            },
            index=[3, 4, 2, 1, 0]))

    expected = FeatureSet()
    expected.set_featureset(
        pandas.DataFrame(
            {
                col: [5, 4, 1, 3, 2],
                "species": ["bc", "aa", "ab", "bb", "ba"],
            },
            index=[0, 1, 2, 3, 4]))

    Sort("index").visit(actual)
    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
class FeatureSetComponent(Component):
    """Component that owns a FeatureSet and lets visitors operate on it.

    Besides the wrapped FeatureSet it stores an id, a name, truth
    information, an index, a processing history and a store flag.
    """

    def __init__(self,
                 id=None,
                 name=None,
                 featureset=None,
                 truth_column=None,
                 index=""):
        """Initialise the component and wrap the optional data.

        Args:
            id: Identifier stored on the component.
            name: Name stored on the component.
            featureset: Optional data handed to
                ``FeatureSet.set_featureset``; when ``None`` the wrapped
                FeatureSet is left as constructed.
            truth_column: Truth column(s) stored on the component.
                Defaults to a new empty list per instance.
            index: Index value stored on the component.
        """
        self._id = id
        self._name = name

        # FeatureSet wrapper; always present, only filled when data was
        # given.
        self._featureset = FeatureSet()
        if featureset is not None:
            self._featureset.set_featureset(featureset)

        # truth is the string to search for in the DataFrame.
        self._truth = None
        self._truth_shadow = None

        # A literal ``[]`` default would be one list object shared by
        # every instance created without the argument; build a fresh one.
        self._truth_column = [] if truth_column is None else truth_column
        self._index = index
        self._processing_history = []
        self._store = False

    def operation(self):
        """Do nothing; placeholder implementation."""
        pass

    def accept(self, visitor):
        """Apply ``visitor`` to the wrapped FeatureSet and return its result."""
        return visitor.visit(self._featureset)

    def get_featureset(self):
        """Return the wrapped FeatureSet, or ``0`` when it is ``None``."""
        if self._featureset is None:
            return 0
        return self._featureset

    def set_featureset(self, featureset):
        """Replace the wrapped FeatureSet.

        Any exception raised by the assignment is printed rather than
        propagated.
        """
        try:
            self._featureset = featureset
        except Exception as error:
            print("Unable to set feature")
            print(error)

    def get_id(self):
        """Return the component id."""
        return self._id

    def get_truth(self):
        """Return the stored truth value."""
        return self._truth

    def get_truth_column(self):
        """Return the stored truth column(s)."""
        return self._truth_column

    def get_index(self):
        """Return the stored index value."""
        return self._index

    def set_index(self, index):
        """Store a new index value."""
        self._index = index

    def set_truth(self, column):
        """Store a new truth value."""
        self._truth = column

    def set_truth_column(self, column):
        """Store new truth column(s)."""
        self._truth_column = column

    def set_id(self, id):
        """Store a new component id."""
        self._id = id

    def set_store(self, bool):
        """Store the store flag."""
        self._store = bool

    def add_history(self, history):
        """Append an entry to the processing history."""
        self._processing_history.append(history)
def test_interpolate(self):
    """Visit a frame containing NaNs with Interpolate in two modes.

    The first pass uses mode "nearest", the second mode "linear"; each
    result is compared with a hand-written expected frame.
    """
    column = "test_column"
    column_1 = "second_column"
    mode_0 = "nearest"
    mode_1 = "linear"

    # ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    test_data = pandas.DataFrame({
        column: [1, 2, np.nan, np.nan, 5],
        column_1: [5.3, np.nan, 3.5, np.nan, 14.31],
    })
    verify_data = pandas.DataFrame({
        column: [1, 2, 2.0, 5.0, 5],
        column_1: [5.3, 5.3, 3.5, 3.5, 14.31],
    })

    test_featureset = FeatureSet()
    verify_featureset = FeatureSet()
    test_featureset.set_featureset(test_data)
    verify_featureset.set_featureset(verify_data)

    visitor = Interpolate(mode_0)
    visitor.visit(test_featureset)
    test.assert_frame_equal(test_featureset.get_featureset(),
                            verify_featureset.get_featureset())

    verify_data = pandas.DataFrame({
        column: [1.0, 2.0, 3.0, 4.0, 5.0],
        column_1: [5.3, 4.4, 3.5, 8.905, 14.31],
    })

    # NOTE(review): the same ``test_data`` object is handed in again; this
    # relies on the first visit not having modified that frame in place --
    # confirm against FeatureSet/Interpolate.
    test_featureset.set_featureset(test_data)
    verify_featureset.set_featureset(verify_data)

    visitor = Interpolate(mode_1)
    visitor.visit(test_featureset)
    test.assert_frame_equal(test_featureset.get_featureset(),
                            verify_featureset.get_featureset())
def test_condense(self):
    """Visit a frame with Condense on "species", with and without the
    sequential flag.

    Case 1 passes ``True`` as second argument and expects one row per
    consecutive run of equal species (three rows). Case 2 omits the flag
    and expects one row per distinct species (two rows). The numeric
    columns of the expected frames hold the mean of each group.
    """
    col = "test_column"
    other_col = "test_column0"
    run_wise = True

    # --- Case 1: Condense("species", True) ------------------------------
    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame({
            col: [1, 2, 3, 4, 5],
            other_col: [5, 4, 3, 2, 1],
            "species": ["blue", "blue", "red", "red", "blue"],
        }))

    expected = FeatureSet()
    expected.set_featureset(
        pandas.DataFrame(
            {
                col: [1.5, 3.5, 5],
                other_col: [4.5, 2.5, 1],
                "species": ["blue", "red", "blue"],
            },
            index=[0, 2, 4]))

    Condense("species", run_wise).visit(actual)
    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())

    # --- Case 2: Condense("species") ------------------------------------
    actual = FeatureSet()
    actual.set_featureset(
        pandas.DataFrame({
            col: [1, 2, 3, 4, 5],
            "species": ["blue", "blue", "red", "red", "blue"],
        }))

    expected = FeatureSet()
    expected.set_featureset(
        pandas.DataFrame(
            {
                col: [2, 3.5],
                "species": ["blue", "red"],
            },
            index=[0, 2]))

    Condense("species").visit(actual)
    test.assert_frame_equal(actual.get_featureset(),
                            expected.get_featureset())
def test_represent(self):
    """Visit a string column with RepresentByWordlist in several setups.

    Four cases are checked, each against a hand-written expected frame
    that keeps the original column and adds one
    ``<word>_<column>_<mode>`` column per word:

    1. a single word, mode "presence"
    2. a list of words, mode "count"
    3. a single word, mode "count"
    4. a word list loaded from ``phrases.txt`` next to this file,
       mode "presence"
    """
    column = "test_column"
    wordlist_0 = "Hallo"
    wordlist_1 = ["Hi", "Ok"]
    # Directory of this test file, used to locate the phrase list.
    here = os.path.realpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__)))
    wordlist_2 = os.path.join(here, "phrases.txt")
    mode_0 = "presence"
    mode_1 = "count"
    frompath = True

    # The same two FeatureSet objects are refilled for every case.
    test_featureset = FeatureSet()
    verify_featureset = FeatureSet()

    # Case 1: single word, presence.
    test_data = pandas.DataFrame(
        {column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"]})
    verify_data = pandas.DataFrame({
        column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"],
        wordlist_0 + "_" + column + "_presence": [1, 1, 0],
    })
    test_featureset.set_featureset(test_data)
    verify_featureset.set_featureset(verify_data)
    visitor = RepresentByWordlist(column, wordlist_0, mode_0)
    visitor.visit(test_featureset)
    test.assert_frame_equal(test_featureset.get_featureset(),
                            verify_featureset.get_featureset())

    # Case 2: list of words, count.
    test_data = pandas.DataFrame({
        column: [
            "Hallo hey was geht ab Hi Ok",
            "Hallo Ok Hallo hallo",
            "hi Hi Hi hallo",
        ]
    })
    verify_data = pandas.DataFrame({
        column: [
            "Hallo hey was geht ab Hi Ok",
            "Hallo Ok Hallo hallo",
            "hi Hi Hi hallo",
        ],
        wordlist_1[0] + "_" + column + "_count": [1, 0, 2],
        wordlist_1[1] + "_" + column + "_count": [1, 1, 0],
    })
    test_featureset.set_featureset(test_data)
    verify_featureset.set_featureset(verify_data)
    visitor = RepresentByWordlist(column, wordlist_1, mode_1)
    visitor.visit(test_featureset)
    test.assert_frame_equal(test_featureset.get_featureset(),
                            verify_featureset.get_featureset())

    # Case 3: single word, count.
    test_data = pandas.DataFrame(
        {column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"]})
    verify_data = pandas.DataFrame({
        column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"],
        wordlist_0 + "_" + column + "_count": [1, 2, 0],
    })
    test_featureset.set_featureset(test_data)
    verify_featureset.set_featureset(verify_data)
    visitor = RepresentByWordlist(column, wordlist_0, mode_1)
    visitor.visit(test_featureset)
    test.assert_frame_equal(test_featureset.get_featureset(),
                            verify_featureset.get_featureset())

    # Case 4: word list loaded from path, presence.
    test_data = pandas.DataFrame(
        {column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"]})
    verify_data = pandas.DataFrame({
        column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"],
        "Hallo_" + column + "_presence": [1, 1, 0],
        "was geht_" + column + "_presence": [1, 0, 0],
    })
    test_featureset.set_featureset(test_data)
    verify_featureset.set_featureset(verify_data)
    visitor = RepresentByWordlist(column, wordlist_2, mode_0, frompath)
    visitor.visit(test_featureset)
    test.assert_frame_equal(test_featureset.get_featureset(),
                            verify_featureset.get_featureset())
def test_split(self):
    """Visit a ten-row frame with Split in "sequential" mode and compare.

    The ratios 0.2 / 0.4 / 0.4 are expected to yield consecutive slices
    of 2, 4 and 4 rows that keep their original index labels. The visitor
    result is indexed by the same keys as the ratio mapping.
    """
    col = "test_column"
    ratios = {"a": 0.2, "b": 0.4, "c": 0.4}

    source = FeatureSet()
    source.set_featureset(
        pandas.DataFrame({col: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}))

    expected_frames = {
        "a": pandas.DataFrame({col: [0, 1]}, index=[0, 1]),
        "b": pandas.DataFrame({col: [2, 3, 4, 5]}, index=[2, 3, 4, 5]),
        "c": pandas.DataFrame({col: [6, 7, 8, 9]}, index=[6, 7, 8, 9]),
    }

    # Wrap every expected frame in its own FeatureSet before visiting.
    expected = {}
    for key, frame in expected_frames.items():
        wrapper = FeatureSet()
        wrapper.set_featureset(frame)
        expected[key] = wrapper

    parts = Split(ratios, "sequential").visit(source)

    for key, wrapper in expected.items():
        test.assert_frame_equal(parts[key], wrapper.get_featureset())