def test_count_syll(self):
     """Check CountSyllable against fixed expected counts.

     The visitor runs on a one-column frame holding "wdp", "tree tree" and
     "buh"; afterwards that column must equal [0, 4, 1].
     """
     col = "test_column"

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(
         pandas.DataFrame({col: ["wdp", "tree tree", "buh"]}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(pandas.DataFrame({col: [0, 4, 1]}))

     CountSyllable(col).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def test_removechar(self):
     """Check RemoveChar with the character "s".

     Expected result per the fixture: "TestString" -> "TetString",
     "super" -> "uper" and "wow" unchanged (the upper-case "S" stays).
     """
     col = "test_column"
     unwanted = "s"

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(
         pandas.DataFrame({col: ["TestString", "super", "wow"]}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(
         pandas.DataFrame({col: ["TetString", "uper", "wow"]}))

     RemoveChar(col, unwanted).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def test_lemmatize(self):
     """Check Lemmatizer against the expected frame.

     NOTE(review): the expected words are identical to the input words
     ("dogs", "churches", "aardwolves", "abaci"), so the test only passes
     if the visitor leaves them unchanged -- confirm this is intended.
     """
     col = "test_column"

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(
         pandas.DataFrame(
             {col: ["dogs", "churches", "aardwolves", "abaci"]}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(
         pandas.DataFrame(
             {col: ["dogs", "churches", "aardwolves", "abaci"]}))

     Lemmatizer(col).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def test_stem(self):
     """Check Stemmer against fixed expected stems.

     Expected mapping per the fixture: "maximum" -> "maxim",
     "presumably" -> "presum", "multiply" -> "multiply" and
     "provision" -> "provid".
     """
     col = "test_column"
     words = ["maximum", "presumably", "multiply", "provision"]
     stems = ["maxim", "presum", "multiply", "provid"]

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(pandas.DataFrame({col: words}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(pandas.DataFrame({col: stems}))

     Stemmer(col).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def test_ngram(self):
     """Check NGram with n = 3 on the string "12345".

     The single cell must become the list of character triples
     ("1", "2", "3"), ("2", "3", "4") and ("3", "4", "5").
     """
     col = "test_column"
     size = 3
     triples = [("1", "2", "3"), ("2", "3", "4"), ("3", "4", "5")]

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(pandas.DataFrame({col: ["12345"]}))

     # Featureset holding the expected outcome: one cell containing a list.
     expected = FeatureSet()
     expected.set_featureset(pandas.DataFrame({col: [triples]}))

     NGram(col, size).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def test_uppercase(self):
     """Check UpperCase on mixed-case, upper-case and lower-case strings.

     Every letter must come back upper-cased; the digit and the "!" in
     "TestString0!" must be kept as they are.
     """
     col = "test_column"
     raw = ["TestString", "TESTSTRING", "teststring", "TestString0!"]
     shouted = ["TESTSTRING", "TESTSTRING", "TESTSTRING", "TESTSTRING0!"]

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(pandas.DataFrame({col: raw}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(pandas.DataFrame({col: shouted}))

     UpperCase(col).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def test_removepunct(self):
     """Check RemovePunctuation on strings containing ":", ".", "," and "!".

     Each of the four inputs must be reduced to "TestString".
     """
     col = "test_column"
     noisy = ["Test:String", "Test.St.ri.ng", "Tes,t:String", "TestSt::ring!"]
     clean = ["TestString", "TestString", "TestString", "TestString"]

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(pandas.DataFrame({col: noisy}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(pandas.DataFrame({col: clean}))

     RemovePunctuation(col).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def __init__(self,
              id=None,
              name=None,
              featureset=None,
              truth_column=None,
              index=""):
     """Initialise the component's identity, featureset and bookkeeping.

     Args:
         id: Identifier, stored unchanged in ``_id``.
         name: Name, stored unchanged in ``_name``.
         featureset: Optional data passed to ``set_featureset`` of a newly
             created ``FeatureSet``; nothing is passed when it is ``None``.
         truth_column: Value stored in ``_truth_column``. ``None`` (the
             default) yields a new empty list for this instance, so
             instances no longer share one default list object.
         index: Value stored in ``_index``.
     """
     self._id = id
     self._name = name
     # A FeatureSet wrapper always exists, even when no data was supplied.
     self._featureset = FeatureSet()
     if featureset is not None:
         self._featureset.set_featureset(featureset)
     # Original note: truth is a string to search for in the DataFrame.
     self._truth = None
     self._truth_shadow = None
     self._truth_column = [] if truth_column is None else truth_column
     self._index = index
     self._processing_history = []
     self._store = False
 def test_join(self):
     """Check JoinOperation with the separator "!wow!".

     Per the fixture, a plain string is joined character by character, a
     list of strings is joined element by element, and the empty string
     stays empty.
     """
     col = "test_column"
     glue = "!wow!"

     # Featureset the visitor operates on: a string, a list and "".
     actual = FeatureSet()
     actual.set_featureset(
         pandas.DataFrame({col: ["TestString", ["Test", "String"], ""]}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(
         pandas.DataFrame({
             col: [
                 "T!wow!e!wow!s!wow!t!wow!S!wow!t!wow!r!wow!i!wow!n!wow!g",
                 "Test!wow!String", ""
             ]
         }))

     JoinOperation(col, glue).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
 def test_tokenizer(self):
     """Check Tokenizer on one-, two- and three-word strings.

     Each cell must become the list of its space-separated words, e.g.
     "Another test string" -> ["Another", "test", "string"].
     """
     col = "test_column"
     sentences = [
         "Test String", "anothertest string", "Another test string", "test"
     ]
     tokens = [["Test", "String"], ["anothertest", "string"],
               ["Another", "test", "string"], ["test"]]

     # Featureset the visitor operates on.
     actual = FeatureSet()
     actual.set_featureset(pandas.DataFrame({col: sentences}))

     # Featureset holding the expected outcome.
     expected = FeatureSet()
     expected.set_featureset(pandas.DataFrame({col: tokens}))

     Tokenizer(col).visit(actual)

     test.assert_frame_equal(actual.get_featureset(),
                             expected.get_featureset())
    def test_mask(self):
        """Check Mask with the condition ``"featureset > 3"``.

        Per the expected frame, every cell greater than 3 must be replaced
        by NaN while all other cells keep their value.
        """
        column = "test_column"
        column_1 = "second_column"
        condition = "featureset > 3"
        test_data = pandas.DataFrame({
            column: [1, 2, 3, 4, 5],
            column_1: [5, 4, 3, 2, 1]
        })
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0, while
        # np.nan is available in every NumPy release.
        verify_data = pandas.DataFrame({
            column: [1, 2, 3, np.nan, np.nan],
            column_1: [np.nan, np.nan, 3, 2, 1]
        })

        test_featureset = FeatureSet()
        verify_featureset = FeatureSet()
        test_featureset.set_featureset(test_data)
        verify_featureset.set_featureset(verify_data)
        visitor = Mask(condition)
        visitor.visit(test_featureset)
        test.assert_frame_equal(test_featureset.get_featureset(),
                                verify_featureset.get_featureset())
    def test_sort(self):
        """Check Sort in its "column" and "index" modes.

        Scenario 1: Sort("column", "test_column") must order the rows by
        ascending ``test_column`` while each row keeps its original index
        label. Scenario 2: Sort("index") must order the rows by ascending
        index label.
        """
        col = "test_column"

        # --- Scenario 1: order rows by the values in ``col`` -------------
        by_column_actual = FeatureSet()
        by_column_actual.set_featureset(
            pandas.DataFrame({
                col: [3, 2, 1, 4, 5],
                "species": ["bb", "ba", "ab", "aa", "bc"]
            }))
        by_column_expected = FeatureSet()
        by_column_expected.set_featureset(
            pandas.DataFrame(
                {
                    col: [1, 2, 3, 4, 5],
                    "species": ["ab", "ba", "bb", "aa", "bc"]
                },
                index=[2, 1, 0, 3, 4]))

        Sort("column", col).visit(by_column_actual)
        test.assert_frame_equal(by_column_actual.get_featureset(),
                                by_column_expected.get_featureset())

        # --- Scenario 2: order rows by their index labels ----------------
        by_index_actual = FeatureSet()
        by_index_actual.set_featureset(
            pandas.DataFrame(
                {
                    col: [3, 2, 1, 4, 5],
                    "species": ["bb", "ba", "ab", "aa", "bc"]
                },
                index=[3, 4, 2, 1, 0]))
        by_index_expected = FeatureSet()
        by_index_expected.set_featureset(
            pandas.DataFrame(
                {
                    col: [5, 4, 1, 3, 2],
                    "species": ["bc", "aa", "ab", "bb", "ba"]
                },
                index=[0, 1, 2, 3, 4]))

        Sort("index").visit(by_index_actual)
        test.assert_frame_equal(by_index_actual.get_featureset(),
                                by_index_expected.get_featureset())
# Example no. 13
# 0
class FeatureSetComponent(Component):
    """Component that holds a ``FeatureSet`` plus id, name, truth and index
    metadata, and forwards visitors to the held featureset."""

    def __init__(self,
                 id=None,
                 name=None,
                 featureset=None,
                 truth_column=None,
                 index=""):
        """Initialise the component's identity, featureset and bookkeeping.

        Args:
            id: Identifier returned by ``get_id``.
            name: Name, stored unchanged in ``_name``.
            featureset: Optional data passed to ``set_featureset`` of a
                newly created ``FeatureSet``; nothing is passed when it is
                ``None``.
            truth_column: Value returned by ``get_truth_column``. ``None``
                (the default) yields a new empty list for this instance, so
                instances no longer share one default list object.
            index: Value returned by ``get_index``.
        """
        self._id = id
        self._name = name
        # A FeatureSet wrapper always exists, even when no data was supplied.
        self._featureset = FeatureSet()
        if featureset is not None:
            self._featureset.set_featureset(featureset)
        # Original note: truth is a string to search for in the DataFrame.
        self._truth = None
        self._truth_shadow = None
        self._truth_column = [] if truth_column is None else truth_column
        self._index = index
        self._processing_history = []
        self._store = False

    def operation(self):
        """Do nothing; this implementation is an empty placeholder."""
        pass

    def accept(self, visitor):
        """Apply ``visitor`` to the held featureset and return its result."""
        return visitor.visit(self._featureset)

    def get_featureset(self):
        """Return the held featureset, or ``0`` when it is ``None``."""
        if self._featureset is None:
            return 0
        return self._featureset

    def set_featureset(self, featureset):
        """Replace the held featureset object with ``featureset``.

        The argument is stored directly; it is not routed through
        ``FeatureSet.set_featureset`` the way ``__init__`` does.
        """
        try:
            self._featureset = featureset
        except Exception as error:
            # Best-effort: report the failure instead of propagating it.
            print("Unable to set feature")
            print(error)

    def get_id(self):
        """Return the stored identifier."""
        return self._id

    def get_truth(self):
        """Return the stored truth value (``None`` until ``set_truth``)."""
        return self._truth

    def get_truth_column(self):
        """Return the stored truth column value."""
        return self._truth_column

    def get_index(self):
        """Return the stored index value."""
        return self._index

    def set_index(self, index):
        """Store ``index`` as the index value."""
        self._index = index

    def set_truth(self, column):
        """Store ``column`` as the truth value."""
        self._truth = column

    def set_truth_column(self, column):
        """Store ``column`` as the truth column value."""
        self._truth_column = column

    def set_id(self, id):
        """Store ``id`` as the identifier."""
        self._id = id

    def set_store(self, bool):
        """Store ``bool`` as the ``_store`` flag."""
        self._store = bool

    def add_history(self, history):
        """Append ``history`` to the processing-history list."""
        self._processing_history.append(history)
    def test_interpolate(self):
        """Check Interpolate in "nearest" and "linear" modes.

        Both runs start from the same ``test_data`` frame, whose two columns
        contain NaN gaps. In "nearest" mode the gaps must be filled with the
        values given in the first expected frame; in "linear" mode with the
        evenly spaced values of the second expected frame (e.g. 3.0 and 4.0
        between 2 and 5).
        """
        column = "test_column"
        column_1 = "second_column"
        mode_0 = "nearest"
        mode_1 = "linear"
        # Use np.nan: the np.NaN alias was removed in NumPy 2.0, while
        # np.nan is available in every NumPy release.
        test_data = pandas.DataFrame({
            column: [1, 2, np.nan, np.nan, 5],
            column_1: [5.3, np.nan, 3.5, np.nan, 14.31]
        })
        verify_data = pandas.DataFrame({
            column: [1, 2, 2.0, 5.0, 5],
            column_1: [5.3, 5.3, 3.5, 3.5, 14.31]
        })

        test_featureset = FeatureSet()
        verify_featureset = FeatureSet()
        test_featureset.set_featureset(test_data)
        verify_featureset.set_featureset(verify_data)

        visitor = Interpolate(mode_0)
        visitor.visit(test_featureset)
        test.assert_frame_equal(test_featureset.get_featureset(),
                                verify_featureset.get_featureset())

        verify_data = pandas.DataFrame({
            column: [1.0, 2.0, 3.0, 4.0, 5.0],
            column_1: [5.3, 4.4, 3.5, 8.905, 14.31]
        })

        # The same test_data object is handed over again for the second run.
        test_featureset.set_featureset(test_data)
        verify_featureset.set_featureset(verify_data)

        visitor = Interpolate(mode_1)
        visitor.visit(test_featureset)
        test.assert_frame_equal(test_featureset.get_featureset(),
                                verify_featureset.get_featureset())
    def test_condense(self):
        """Check Condense on the "species" column, with and without the flag.

        With the second argument set to True, each run of consecutive rows
        sharing a species must collapse into one row that carries the index
        label of the run's first row (expected values 1.5/3.5/5 and
        4.5/2.5/1). With the flag omitted, the expected frame has one row
        per species: "blue" -> 2 at index 0 and "red" -> 3.5 at index 2.
        """
        col = "test_column"
        other_col = "test_column0"
        sequential = True

        # --- Scenario 1: flag passed as True ------------------------------
        run_actual = FeatureSet()
        run_actual.set_featureset(
            pandas.DataFrame({
                col: [1, 2, 3, 4, 5],
                other_col: [5, 4, 3, 2, 1],
                "species": ["blue", "blue", "red", "red", "blue"]
            }))
        run_expected = FeatureSet()
        run_expected.set_featureset(
            pandas.DataFrame(
                {
                    col: [1.5, 3.5, 5],
                    other_col: [4.5, 2.5, 1],
                    "species": ["blue", "red", "blue"]
                },
                index=[0, 2, 4]))

        Condense("species", sequential).visit(run_actual)
        test.assert_frame_equal(run_actual.get_featureset(),
                                run_expected.get_featureset())

        # --- Scenario 2: flag omitted -------------------------------------
        group_actual = FeatureSet()
        group_actual.set_featureset(
            pandas.DataFrame({
                col: [1, 2, 3, 4, 5],
                "species": ["blue", "blue", "red", "red", "blue"]
            }))
        group_expected = FeatureSet()
        group_expected.set_featureset(
            pandas.DataFrame(
                {
                    col: [2, 3.5],
                    "species": ["blue", "red"]
                }, index=[0, 2]))

        Condense("species").visit(group_actual)
        test.assert_frame_equal(group_actual.get_featureset(),
                                group_expected.get_featureset())
    def test_represent(self):
        """Check RepresentByWordlist in "presence" and "count" modes.

        Four scenarios reuse the same pair of featuresets. In each one the
        expected frame is the input column plus one extra column per word,
        named ``<word>_<column>_<mode>``:

        1. single word "Hallo", mode "presence";
        2. word list ["Hi", "Ok"], mode "count";
        3. single word "Hallo", mode "count";
        4. words loaded from ``phrases.txt`` next to this module, mode
           "presence" (the expected columns are for "Hallo" and "was geht",
           so the file presumably lists those phrases -- verify).

        Per the expected values, lower-case "hallo"/"hi" are not matched.
        """
        column = "test_column"
        wordlist_0 = "Hallo"
        wordlist_1 = ["Hi", "Ok"]
        # Directory of this test module; phrases.txt is looked up in it.
        here = os.path.realpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))
        wordlist_2 = os.path.join(here, "phrases.txt")
        mode_0 = "presence"
        mode_1 = "count"
        frompath = True
        test_featureset = FeatureSet()
        verify_featureset = FeatureSet()

        # Scenario 1: presence of a single word.
        test_data = pandas.DataFrame(
            {column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"]})
        verify_data = pandas.DataFrame({
            column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"],
            wordlist_0 + "_" + column + "_presence": [1, 1, 0]
        })
        test_featureset.set_featureset(test_data)
        verify_featureset.set_featureset(verify_data)
        visitor = RepresentByWordlist(column, wordlist_0, mode_0)
        visitor.visit(test_featureset)
        test.assert_frame_equal(test_featureset.get_featureset(),
                                verify_featureset.get_featureset())

        # Scenario 2: counts for every word of a list.
        test_data = pandas.DataFrame({
            column: [
                "Hallo hey was geht ab Hi Ok", "Hallo Ok Hallo hallo",
                "hi Hi Hi hallo"
            ]
        })
        verify_data = pandas.DataFrame({
            column: [
                "Hallo hey was geht ab Hi Ok", "Hallo Ok Hallo hallo",
                "hi Hi Hi hallo"
            ],
            wordlist_1[0] + "_" + column + "_count": [1, 0, 2],
            wordlist_1[1] + "_" + column + "_count": [1, 1, 0]
        })
        test_featureset.set_featureset(test_data)
        verify_featureset.set_featureset(verify_data)
        visitor = RepresentByWordlist(column, wordlist_1, mode_1)
        visitor.visit(test_featureset)
        test.assert_frame_equal(test_featureset.get_featureset(),
                                verify_featureset.get_featureset())

        # Scenario 3: count of a single word.
        test_data = pandas.DataFrame(
            {column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"]})
        verify_data = pandas.DataFrame({
            column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"],
            wordlist_0 + "_" + column + "_count": [1, 2, 0]
        })
        test_featureset.set_featureset(test_data)
        verify_featureset.set_featureset(verify_data)
        visitor = RepresentByWordlist(column, wordlist_0, mode_1)
        visitor.visit(test_featureset)
        test.assert_frame_equal(test_featureset.get_featureset(),
                                verify_featureset.get_featureset())

        # Scenario 4: word list loaded from a file path.
        test_data = pandas.DataFrame(
            {column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"]})
        verify_data = pandas.DataFrame({
            column: ["Hallo hey was geht ab", "Hallo Hallo hallo", "hallo"],
            "Hallo_" + column + "_presence": [1, 1, 0],
            "was geht_" + column + "_presence": [1, 0, 0]
        })
        test_featureset.set_featureset(test_data)
        verify_featureset.set_featureset(verify_data)
        visitor = RepresentByWordlist(column, wordlist_2, mode_0, frompath)
        visitor.visit(test_featureset)
        test.assert_frame_equal(test_featureset.get_featureset(),
                                verify_featureset.get_featureset())
    def test_split(self):
        """Check Split in "sequential" mode with shares a=0.2, b=0.4, c=0.4.

        A ten-row frame must be cut into consecutive parts: rows 0-1 for
        "a", rows 2-5 for "b" and rows 6-9 for "c", each keeping its
        original index labels. The parts returned by ``visit`` are compared
        directly with the expected frames.
        """
        col = "test_column"
        shares = {"a": 0.2, "b": 0.4, "c": 0.4}

        # Featureset handed to the visitor: values 0..9 with default index.
        source = FeatureSet()
        source.set_featureset(pandas.DataFrame({col: list(range(10))}))

        # Expected part per id; in each part the values equal the labels.
        expected = {}
        for key, rows in (("a", [0, 1]),
                          ("b", [2, 3, 4, 5]),
                          ("c", [6, 7, 8, 9])):
            holder = FeatureSet()
            holder.set_featureset(
                pandas.DataFrame({col: rows}, index=list(rows)))
            expected[key] = holder

        parts = Split(shares, "sequential").visit(source)

        for key in ("a", "b", "c"):
            test.assert_frame_equal(parts[key],
                                    expected[key].get_featureset())