def process_table(self, df, overlap_attr, qgram, rem_stop_words):
    """Tokenize every value in one column of a table.

    Each cell is run through ``process_val`` so that table values and
    single values are normalized by exactly the same pipeline: non-ASCII
    removal, punctuation removal, lower-casing, whitespace splitting,
    optional stop-word removal, optional q-gram generation and a final
    de-duplication.

    Args:
        df: Table indexable by column name; the selected column must expose
            a ``.values`` sequence (presumably a pandas DataFrame -- confirm
            against callers).
        overlap_attr: Name of the column to tokenize.
        qgram: q-gram size forwarded to ``process_val``; ``None`` keeps
            word-level tokens.
        rem_stop_words: Whether stop words are removed before tokens are
            returned.

    Returns:
        A list with one entry per row, each entry being the list of unique
        tokens produced by ``process_val`` for that row's value.
    """
    # The earlier implementation de-duplicated the words with ``set`` before
    # re-joining them for q-gram generation. Set iteration order is
    # arbitrary, so q-grams spanning word boundaries changed from run to run
    # and did not match what ``process_val`` yields for the same string.
    # Delegating keeps word order intact and de-duplicates only at the end.
    return [
        self.process_val(cell, overlap_attr, qgram, rem_stop_words)
        for cell in df[overlap_attr].values
    ]
def process_val(self, val, overlap_attr, qgram, rem_stop_words):
    """Normalize a single value and return its unique tokens.

    The value is stripped of non-ASCII characters and punctuation,
    lower-cased and split on whitespace. Stop words are optionally removed,
    and the remaining words are optionally re-joined with single spaces and
    converted to q-grams.

    Args:
        val: The value to tokenize; it is passed to ``remove_non_ascii``
            and must be accepted by it (presumably a string -- confirm).
        overlap_attr: Not used by this method; kept so the signature
            parallels ``process_table``.
        qgram: q-gram size handed to ``ngrams``; ``None`` keeps word-level
            tokens.
        rem_stop_words: When truthy, ``self.rem_stopwords`` is applied to
            the word list.

    Returns:
        A list of the distinct tokens, in arbitrary order because
        de-duplication goes through a ``set``.
    """
    cleaned = self.rem_punctuations(remove_non_ascii(val)).lower()
    tokens = cleaned.split()

    if rem_stop_words:
        tokens = self.rem_stopwords(tokens)

    # Identity comparison is the idiomatic None check and matches the check
    # used in process_table.
    if qgram is not None:
        # q-grams are computed over the words re-joined in their original
        # order, so grams that span word boundaries are deterministic.
        tokens = ngrams(' '.join(tokens), qgram)

    return list(set(tokens))
def process_table(self, df, overlap_attr, qgram, rem_stop_words):
    """Tokenize every value in one column of a table.

    Every cell of the column is handed to ``process_val``, so a table value
    and a standalone value made of the same text always produce the same
    token set.

    Args:
        df: Table indexable by column name; the selected column must expose
            a ``.values`` sequence (presumably a pandas DataFrame -- confirm
            against callers).
        overlap_attr: Name of the column to tokenize.
        qgram: q-gram size forwarded to ``process_val``; ``None`` keeps
            word-level tokens.
        rem_stop_words: Whether stop words are removed before tokens are
            returned.

    Returns:
        A list with one entry per row, each entry being the list of unique
        tokens produced by ``process_val`` for that row's value.
    """
    # NOTE(review): another definition of process_table appears earlier in
    # this file. If both live in the same class, this later one is the
    # definition that takes effect -- consider removing the duplicate.

    # Previously words were pushed through ``set`` before being re-joined
    # for q-gram generation. That scrambled word order, so q-grams crossing
    # word boundaries were non-deterministic and inconsistent with
    # ``process_val``. Delegating removes that divergence.
    column = df[overlap_attr].values
    return [
        self.process_val(cell, overlap_attr, qgram, rem_stop_words)
        for cell in column
    ]