Example n. 1
import pandas as pd
import numpy as np
import six
import py_stringmatching as sm


def tok_delim(input_string, d):
    """
    This function splits the input string into a list of tokens
    (based on the delimiter).

    Args:
        input_string (string): Input string that should be tokenized.
        d (string): Delimiter string.

    Returns:
        A list of tokens if the input string is not NaN;
        otherwise, returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_delim('data science', ' ')
        ['data', 'science']
        >>> em.tok_delim('data$#$science', '$#$')
        ['data', 'science']
        >>> em.tok_delim(None, ' ')
        nan


    """

    # Missing values are returned as NaN instead of being tokenized.
    if pd.isnull(input_string):
        return np.nan
    # Normalize the input: non-string values are converted to str,
    # and bytes are decoded as UTF-8.
    if not (isinstance(input_string, six.string_types)
            or isinstance(input_string, bytes)):
        input_string = str(input_string)
    elif isinstance(input_string, bytes):
        input_string = input_string.decode('utf-8')
    # Tokenize on the given delimiter using py_stringmatching.
    measure = sm.DelimiterTokenizer(delim_set=[d])
    return measure.tokenize(input_string)
Example n. 2
import pandas as pd
import numpy as np
import py_stringmatching as sm
# `gh` is py_entitymatching's generic helper module; the import path below is
# an assumption:
# import py_entitymatching.utils.generic_helper as gh


def tok_delim(input_string, d):
    """
    This function splits the input string into a list of tokens
    (based on the delimiter).

    Args:
        input_string (string): Input string that should be tokenized.
        d (string): Delimiter string.

    Returns:
        A list of tokens if the input string is not NaN;
        otherwise, returns NaN.

    Examples:
        >>> import py_entitymatching as em
        >>> em.tok_delim('data science', ' ')
        ['data', 'science']
        >>> em.tok_delim('data$#$science', '$#$')
        ['data', 'science']
        >>> em.tok_delim(None, ' ')
        nan


    """

    # Missing values are returned as NaN instead of being tokenized.
    if pd.isnull(input_string):
        return np.nan

    # Normalize the input to a unicode string via the generic helper.
    input_string = gh.convert_to_str_unicode(input_string)

    # Tokenize on the given delimiter using py_stringmatching.
    measure = sm.DelimiterTokenizer(delim_set=[d])

    return measure.tokenize(input_string)
Example n. 3
import py_stringmatching as sm


def matchHeaders(headers):
    # `headers` is a list of per-table column-name lists. Each column name of
    # table i is compared with each column name of table i + 1 using three
    # string similarity measures from py_stringmatching.
    jac = sm.Jaccard()
    lev = sm.Levenshtein()
    oc = sm.OverlapCoefficient()
    delim_tok = sm.DelimiterTokenizer(delim_set=['_'])

    header_len = len(headers)

    for i in range(0, header_len - 1):
        for first in headers[i]:
            j = i + 1
            for second in headers[j]:
                jacScore = jac.get_sim_score(delim_tok.tokenize(first),
                                             delim_tok.tokenize(second))
                levScore = lev.get_sim_score(first, second)
                ocScore = oc.get_sim_score(delim_tok.tokenize(first),
                                           delim_tok.tokenize(second))

                # Report the pair as matched if any of the three scores is
                # high enough.
                if ocScore == 1 or levScore >= 0.5 or jacScore >= 0.5:
                    print(first + ' of Table' + str(i + 1) + ' and ' + second +
                          ' of Table' + str(j + 1) + ' matched')
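A minimal usage sketch for matchHeaders above; the table and column names are
made up for illustration, and `headers` is assumed to be a list of per-table
column-name lists:

table1_cols = ['person_id', 'first_name', 'birth_year']
table2_cols = ['person_id', 'first_name', 'year_of_birth']
matchHeaders([table1_cols, table2_cols])
# Expected to print something like:
# person_id of Table1 and person_id of Table2 matched
# first_name of Table1 and first_name of Table2 matched
# birth_year of Table1 and year_of_birth of Table2 matched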
Example n. 4
    def tok_delim(s):
        # Pass missing values through unchanged.
        if pd.isnull(s):
            return s
        # Remove non-ASCII characters. Note: This should be fixed in the
        # next version.
        #s = remove_non_ascii(s)

        # Normalize the input to a unicode string (gh is py_entitymatching's
        # generic helper module).
        s = gh.convert_to_str_unicode(s)

        # Initialize the tokenizer object; `d` is the delimiter bound in the
        # enclosing scope.
        measure = sm.DelimiterTokenizer(delim_set=[d])
        # Call the function that will tokenize the input string.
        return measure.tokenize(s)
Example n. 5
    def tok_delim(s):
        # Pass missing values through unchanged.
        if pd.isnull(s):
            return s
        # Remove non-ASCII characters. Note: This should be fixed in the
        # next version.
        #s = remove_non_ascii(s)

        # Normalize the input: non-string values are converted to str,
        # and bytes are decoded as UTF-8.
        if not (isinstance(s, six.string_types) or isinstance(s, bytes)):
            s = str(s)
        elif isinstance(s, bytes):
            s = s.decode('utf-8')

        # Initialize the tokenizer object; `d` is the delimiter bound in the
        # enclosing scope.
        measure = sm.DelimiterTokenizer(delim_set=[d])
        # Call the function that will tokenize the input string.
        return measure.tokenize(s)
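A hypothetical sketch of how a nested tokenizer like the ones in Examples 4
and 5 can be produced by binding the delimiter `d` in an enclosing function
(the factory name make_delim_tokenizer is made up; pandas as pd and
py_stringmatching as sm are assumed to be imported as in Example n. 1):

def make_delim_tokenizer(d):
    def tok_delim(s):
        # Pass missing values through unchanged.
        if pd.isnull(s):
            return s
        measure = sm.DelimiterTokenizer(delim_set=[d])
        return measure.tokenize(s)
    return tok_delim

tokenize_on_underscore = make_delim_tokenizer('_')
tokenize_on_underscore('year_of_birth')   # ['year', 'of', 'birth']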
Example n. 6
import pandas as pd
import numpy as np
import py_stringmatching as sm


def tok_delim(input_string, d):
    # Missing values are returned as NaN instead of being tokenized.
    if pd.isnull(input_string):
        return np.nan

    measure = sm.DelimiterTokenizer(delim_set=[d])
    return measure.tokenize(input_string)
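A minimal usage sketch (the column values are made up) showing how the NaN
guard above keeps missing entries as NaN when tok_delim is applied over a
pandas Series:

names = pd.Series(['data science', 'entity matching', None])
names.apply(lambda s: tok_delim(s, ' '))
# Roughly:
# 0       [data, science]
# 1    [entity, matching]
# 2                   NaN
# dtype: object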