Ejemplo n.º 1
0
def jaro(s1, s2):
    """Score two values with ``sim.jaro``, propagating missing inputs as NaN.

    Args:
        s1: First value. Strings are passed through ``remove_non_ascii``
            (helper defined elsewhere; presumably strips non-ASCII
            characters -- confirm); every value is converted with ``str()``.
        s2: Second value, treated the same way as ``s1``.

    Returns:
        ``np.nan`` if either input is ``None`` or null according to
        ``pd.isnull``; otherwise whatever ``sim.jaro`` returns for the two
        stringified inputs.
    """
    if s1 is None or s2 is None:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    if isinstance(s1, six.string_types):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, six.string_types):
        s2 = remove_non_ascii(s2)
    return sim.jaro(str(s1), str(s2))
Ejemplo n.º 2
0
def lev(s1, s2):
    """Score two values with ``sim.levenshtein``, propagating missing inputs.

    Args:
        s1: First value. Strings are passed through ``remove_non_ascii``
            (helper defined elsewhere; presumably strips non-ASCII
            characters -- confirm); every value is converted with ``str()``.
        s2: Second value, treated the same way as ``s1``.

    Returns:
        ``np.nan`` if either input is ``None`` or null according to
        ``pd.isnull``; otherwise whatever ``sim.levenshtein`` returns for the
        two stringified inputs.
    """
    if s1 is None or s2 is None:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    if isinstance(s1, six.string_types):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, six.string_types):
        s2 = remove_non_ascii(s2)
    return sim.levenshtein(str(s1), str(s2))
Ejemplo n.º 3
0
def smith_waterman(s1, s2):
    """Score two values with ``sim.smith_waterman``, propagating missing inputs.

    Args:
        s1: First value. Strings are passed through ``remove_non_ascii``
            (helper defined elsewhere; presumably strips non-ASCII
            characters -- confirm); every value is converted with ``str()``.
        s2: Second value, treated the same way as ``s1``.

    Returns:
        ``np.nan`` if either input is ``None`` or null according to
        ``pd.isnull``; otherwise whatever ``sim.smith_waterman`` returns for
        the two stringified inputs.
    """
    if s1 is None or s2 is None:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    if isinstance(s1, six.string_types):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, six.string_types):
        s2 = remove_non_ascii(s2)
    return sim.smith_waterman(str(s1), str(s2))
Ejemplo n.º 4
0
def needleman_wunsch(s1, s2):
    """Score two values with ``sim.needleman_wunsch``, propagating missing inputs.

    Args:
        s1: First value. Strings are passed through ``remove_non_ascii``
            (helper defined elsewhere; presumably strips non-ASCII
            characters -- confirm); every value is converted with ``str()``.
        s2: Second value, treated the same way as ``s1``.

    Returns:
        ``np.nan`` if either input is ``None`` or null according to
        ``pd.isnull``; otherwise whatever ``sim.needleman_wunsch`` returns
        for the two stringified inputs.
    """
    if s1 is None or s2 is None:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    if isinstance(s1, six.string_types):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, six.string_types):
        s2 = remove_non_ascii(s2)
    return sim.needleman_wunsch(str(s1), str(s2))
Ejemplo n.º 5
0
def jaro_winkler(s1, s2):
    """Score two values with ``sim.jaro_winkler``, propagating missing inputs.

    Args:
        s1: First value. Strings are passed through ``remove_non_ascii``
            (helper defined elsewhere; presumably strips non-ASCII
            characters -- confirm); every value is converted with ``str()``.
        s2: Second value, treated the same way as ``s1``.

    Returns:
        ``np.nan`` if either input is ``None`` or null according to
        ``pd.isnull``; otherwise whatever ``sim.jaro_winkler`` returns for
        the two stringified inputs.
    """
    if s1 is None or s2 is None:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    if pd.isnull(s1) or pd.isnull(s2):
        return np.nan
    if isinstance(s1, six.string_types):
        s1 = remove_non_ascii(s1)
    if isinstance(s2, six.string_types):
        s2 = remove_non_ascii(s2)
    return sim.jaro_winkler(str(s1), str(s2))
Ejemplo n.º 6
0
    def process_table(self, table, overlap_attr, q_val, rem_stop_words):
        """Tokenize every value of one column of ``table``.

        Args:
            table: Object indexable by ``overlap_attr`` that yields the
                column's values when iterated (presumably a pandas
                DataFrame -- confirm against callers).
            overlap_attr: Key of the column to process.
            q_val: When not ``None``, the surviving words are re-joined with
                single spaces and handed to ``qgram(text, q_val)``.
            rem_stop_words: Stop words are removed through
                ``self.rem_stopwords`` only when this compares equal to
                ``True``.

        Returns:
            A list with one token collection per column value, in column
            order.
        """
        # Evaluate the flag once; equality (not truthiness) is intentional.
        drop_stop_words = bool(rem_stop_words == True)  # noqa: E712

        tokenized_rows = []
        for raw_value in table[overlap_attr]:
            # Strip non-ASCII characters, then punctuation, then lower-case.
            cleaned = self.rem_punctuations(remove_non_ascii(raw_value)).lower()

            # Split on whitespace and drop duplicate words; going through a
            # set means the original word order is not preserved.
            tokens = list(set(cleaned.split()))

            if drop_stop_words:
                tokens = self.rem_stopwords(tokens)

            if q_val is not None:
                tokens = qgram(' '.join(tokens), q_val)

            tokenized_rows.append(tokens)

        return tokenized_rows
Ejemplo n.º 7
0
 def process_val(self, val, q_val, rem_stop_words):
     """Tokenize a single value into a de-duplicated list of tokens.

     Args:
         val: Value to tokenize; it is passed to ``remove_non_ascii``
             (helper defined elsewhere) and then to
             ``self.rem_punctuations`` before being lower-cased.
         q_val: When not ``None``, the words are re-joined with single
             spaces and handed to ``qgram(text, q_val)``.
         rem_stop_words: Stop words are removed through
             ``self.rem_stopwords`` only when this compares equal to
             ``True``.

     Returns:
         A list of the distinct tokens; because it is built from a set, the
         order of tokens is not preserved.
     """
     val = remove_non_ascii(val)
     val = self.rem_punctuations(val).lower()
     chopped_vals = val.split()
     # Equality test kept on purpose so that only True (or 1) enables
     # stop-word removal, exactly as before.
     if rem_stop_words == True:  # noqa: E712
         chopped_vals = self.rem_stopwords(chopped_vals)
     # Identity comparison is the correct test for the None singleton.
     if q_val is not None:
         values = ' '.join(chopped_vals)
         chopped_vals = qgram(values, q_val)
     return list(set(chopped_vals))
Ejemplo n.º 8
0
def _cast_val(v, i):
    if v == "None":
        return None
    elif isinstance(i, bool):
        return bool(v)
    elif isinstance(i, float):
        return float(v)
    elif isinstance(i, int):
        return int(v)
    elif isinstance(i, six.string_types):
        v = remove_non_ascii(str(v))
        return str(v)
    elif isinstance(i, object):
        return v
    else:
        logger.warning('Input value did not match any of the known types')
        return v
Ejemplo n.º 9
0
 def tok_delim(s):
     """Split ``s`` on the delimiter ``d`` taken from the enclosing scope.

     Null inputs (as reported by ``pd.isnull``) are returned unchanged;
     any other input is passed through ``remove_non_ascii`` before being
     split.
     """
     if not pd.isnull(s):
         return remove_non_ascii(s).split(d)
     # Missing value: hand it back untouched.
     return s
Ejemplo n.º 10
0
 def tok_delim(s):
     """Tokenize ``s`` by splitting on ``d``, a name from the outer scope.

     A value that ``pd.isnull`` reports as null is returned as is;
     otherwise the value goes through ``remove_non_ascii`` and the result
     of ``split(d)`` is returned.
     """
     is_missing = pd.isnull(s)
     if is_missing:
         return s
     ascii_only = remove_non_ascii(s)
     return ascii_only.split(d)