Esempio n. 1
0
    def process_table(self, df, overlap_attr, qgram, rem_stop_words):
        """Tokenize every value in one column of ``df``.

        Each value of ``df[overlap_attr]`` is passed through
        ``remove_non_ascii`` and ``self.rem_punctuations``, lower-cased,
        split on whitespace and de-duplicated.  Stop words are optionally
        filtered out and, when ``qgram`` is given, the remaining words are
        re-joined with single spaces and handed to ``ngrams``.

        Args:
            df: Table to read; accessed as ``df[overlap_attr].values``.
            overlap_attr: Name of the column to process.
            qgram: Forwarded as the second argument of ``ngrams``.  ``None``
                keeps the word tokens instead of building q-grams.
            rem_stop_words: When truthy, every token list is passed through
                ``self.rem_stopwords``.

        Returns:
            A list with one entry per row.  An entry is the row's unique
            word tokens (after ``self.rem_stopwords`` if requested), or
            whatever ``ngrams`` returns for the re-joined tokens when
            ``qgram`` is not ``None``.
        """
        col_values_chopped = []
        for raw_value in df[overlap_attr].values:
            # Strip non-ascii chars, then special characters, then normalise
            # the case before splitting into words.
            cleaned = self.rem_punctuations(remove_non_ascii(raw_value)).lower()

            # De-duplicate while keeping first-occurrence order.  A plain
            # ``list(set(...))`` yields the words in arbitrary order, which
            # made the q-grams built from the re-joined words below differ
            # from run to run.
            seen = set()
            tokens = []
            for token in cleaned.split():
                if token not in seen:
                    seen.add(token)
                    tokens.append(token)

            if rem_stop_words:
                tokens = self.rem_stopwords(tokens)
            if qgram is not None:
                tokens = ngrams(' '.join(tokens), qgram)

            col_values_chopped.append(tokens)

        return col_values_chopped
Esempio n. 2
0
 def process_val(self, val, overlap_attr, qgram, rem_stop_words):
     """Tokenize a single value into a list of unique tokens.

     ``val`` is passed through ``remove_non_ascii`` and
     ``self.rem_punctuations``, lower-cased and split on whitespace.  Stop
     words are optionally filtered out and, when ``qgram`` is given, the
     words are re-joined with single spaces and handed to ``ngrams``.

     Args:
         val: The value to tokenize.
         overlap_attr: Not used by this method; kept in the signature for
             callers that pass it.
         qgram: Forwarded as the second argument of ``ngrams``.  ``None``
             keeps the word tokens instead of building q-grams.
         rem_stop_words: When truthy, the word list is passed through
             ``self.rem_stopwords``.

     Returns:
         A list of the distinct tokens.  The list is built from a ``set``,
         so its order is unspecified.
     """
     cleaned = self.rem_punctuations(remove_non_ascii(val)).lower()
     chopped_vals = cleaned.split()
     if rem_stop_words:
         chopped_vals = self.rem_stopwords(chopped_vals)
     # Identity comparison is the correct test against the None singleton.
     if qgram is not None:
         chopped_vals = ngrams(' '.join(chopped_vals), qgram)
     return list(set(chopped_vals))
Esempio n. 3
0
 def process_val(self, val, overlap_attr, qgram, rem_stop_words):
     """Return the distinct tokens of one value.

     The value goes through ``remove_non_ascii`` and
     ``self.rem_punctuations``, is lower-cased and split on whitespace.
     Depending on the flags the words are then filtered by
     ``self.rem_stopwords`` and/or re-joined with single spaces and turned
     into q-grams by ``ngrams``.

     Args:
         val: The value to tokenize.
         overlap_attr: Accepted for signature compatibility; this method
             does not read it.
         qgram: Second argument for ``ngrams``; pass ``None`` to keep word
             tokens.
         rem_stop_words: Truthy to run the words through
             ``self.rem_stopwords``.

     Returns:
         A list holding each token once.  It is produced from a ``set``, so
         callers must not rely on its order.
     """
     val = remove_non_ascii(val)
     val = self.rem_punctuations(val).lower()
     tokens = val.split()
     if rem_stop_words:
         tokens = self.rem_stopwords(tokens)
     # ``is not None`` rather than ``!= None``: compare to the singleton by
     # identity so a custom ``__ne__`` can never influence the check.
     if qgram is not None:
         joined = ' '.join(tokens)
         tokens = ngrams(joined, qgram)
     return list(set(tokens))
Esempio n. 4
0
    def process_table(self, df, overlap_attr, qgram, rem_stop_words):
        """Build the token list for each row of column ``overlap_attr``.

        For every value in ``df[overlap_attr].values`` the method applies
        ``remove_non_ascii`` and ``self.rem_punctuations``, lower-cases the
        result, splits it on whitespace and drops repeated words.  The words
        are optionally filtered by ``self.rem_stopwords`` and, when ``qgram``
        is not ``None``, re-joined with single spaces and passed to
        ``ngrams``.

        Args:
            df: Table to read; accessed as ``df[overlap_attr].values``.
            overlap_attr: Name of the column to process.
            qgram: Second argument for ``ngrams``; ``None`` keeps word tokens.
            rem_stop_words: Truthy to run each word list through
                ``self.rem_stopwords``.

        Returns:
            A list with one entry per row: the row's unique words (after
            ``self.rem_stopwords`` if requested), or the result of ``ngrams``
            on the re-joined words when ``qgram`` is not ``None``.
        """
        # remove non-ascii chars, then special characters, and lower-case
        attr_col_values = [
            self.rem_punctuations(remove_non_ascii(v)).lower()
            for v in df[overlap_attr].values
        ]

        # chop the attribute values and drop repeated words, keeping the
        # order of first appearance.  Going through ``set`` here would
        # shuffle the words, and the q-grams computed from the re-joined
        # string below would then vary between runs.
        col_values_chopped = []
        for value in attr_col_values:
            already_seen = set()
            unique_words = []
            for word in value.split():
                if word in already_seen:
                    continue
                already_seen.add(word)
                unique_words.append(word)
            col_values_chopped.append(unique_words)

        # remove stop words
        if rem_stop_words:
            col_values_chopped = [
                self.rem_stopwords(words) for words in col_values_chopped
            ]

        # build q-grams from the space-joined words
        if qgram is not None:
            col_values_chopped = [
                ngrams(' '.join(words), qgram) for words in col_values_chopped
            ]

        return col_values_chopped