Exemple #1
0
    def _process_text(self, text, **kw):
        """
        Normalise raw text before further processing.

        The text is always transliterated with ``unidecode`` and lower-cased.
        Every other step is enabled by default and can be switched off by
        passing the matching keyword flag as a false value.

        :param text: UTF-8 encoded byte string, or already-decoded text.
        :param remove_html: pass the text through ``html.strip_tags``.
        :param remove_punct: replace each character found in ``punct``
            with a space.
        :param remove_digits: replace each character found in ``digits``
            with a space.
        :param remove_whitespace: replace every ``re_whitespace`` match with
            a single space and strip both ends.
        :returns: the processed text.
        """
        # Decode *before* changing case. Lower-casing the raw bytes is
        # locale-dependent on Python 2 and never touches multi-byte
        # characters, so non-ASCII capitals (and transliterations that
        # unidecode emits capitalised) would survive the "always lower
        # case" step.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        text = unicode(unidecode(text).lower(), errors='ignore')

        # optionally strip HTML tags. This has to run before punctuation is
        # blanked: if `punct` contains '<' and '>' (as string.punctuation
        # does -- confirm against the module's definition) the tags would
        # otherwise be unrecognisable by the time strip_tags sees them.
        if kw.get('remove_html', True):
            text = html.strip_tags(text)

        # optionally replace punctuation with spaces
        if kw.get('remove_punct', True):
            text = "".join(" " if ch in punct else ch for ch in text)

        # optionally replace digits with spaces
        if kw.get('remove_digits', True):
            text = "".join(" " if ch in digits else ch for ch in text)

        # optionally normalise whitespace
        if kw.get('remove_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
Exemple #2
0
    def _process_text(self, text, **kw):
        """
        Normalise raw text before further processing.

        Transliteration through ``unidecode`` and lower-casing always happen.
        The remaining steps default to on and are disabled by passing the
        corresponding keyword flag as a false value.

        :param text: UTF-8 encoded byte string, or already-decoded text.
        :param remove_html: pass the text through ``html.strip_tags``.
        :param remove_punct: replace each character found in ``punct``
            with a space.
        :param remove_digits: replace each character found in ``digits``
            with a space.
        :param remove_whitespace: replace every ``re_whitespace`` match with
            a single space and strip both ends.
        :returns: the processed text.
        """
        # Case folding must follow decoding/transliteration: calling
        # lower() on undecoded bytes is locale-dependent on Python 2 and
        # leaves multi-byte characters alone, so the result was not
        # reliably lower case.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        text = unicode(unidecode(text).lower(),
                       errors='ignore')

        # optionally strip HTML tags -- done first, because blanking
        # punctuation would destroy the angle brackets that delimit tags
        # (assuming `punct` includes '<' and '>', as string.punctuation
        # does; verify against the module's definition).
        if kw.get('remove_html', True):
            text = html.strip_tags(text)

        # optionally replace punctuation with spaces
        if kw.get('remove_punct', True):
            text = "".join(" " if ch in punct else ch for ch in text)

        # optionally replace digits with spaces
        if kw.get('remove_digits', True):
            text = "".join(" " if ch in digits else ch for ch in text)

        # optionally normalise whitespace
        if kw.get('remove_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
Exemple #3
0
def parse(search_str):
    """
    Extract author name(s) from a candidate byline string.

    The string is stripped of HTML tags and of whatever ``re_by`` matches,
    split into tokens with ``re_name_token``, and the tokens are grouped
    into names at delimiter tokens (members of ``DELIM``). Names containing
    any of ``BAD_TOKENS`` are dropped. The collected names are handed to
    ``_format`` and its result is returned.

    Example carried over from the original docstring (it calls the function
    ``authors_from_string`` -- presumably an earlier name for this one;
    the output shown is unverified here):

    >>> string = 'By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV'
    >>> authors_from_string(string)
    ['Brian Abelson', 'Michael H Keller', 'DR Stijn Debrouwere IV']
    """
    # clean the candidate: drop markup, then the byline prefix, then padding
    cleaned = re_by.sub('', html.strip_tags(search_str)).strip()

    # tokenize
    tokens = [piece.strip() for piece in re_name_token.split(cleaned)]

    found = []       # completed names, in order of appearance
    parts = []       # tokens of the name currently being assembled
    n_initials = 0   # initials seen in the current name

    for tok in tokens:
        # initials are upper-cased and counted
        if _is_initial(parts, tok):
            tok = tok.upper()
            n_initials += 1

        if tok.lower() in DELIM:
            # a delimiter only closes the name when _end_name agrees;
            # otherwise the delimiter is dropped and the name keeps growing
            if _end_name(parts, n_initials):
                candidate = ' '.join(parts)
                lowered = candidate.lower()
                if not any(bad in lowered for bad in BAD_TOKENS):
                    found.append(candidate)

                # start a fresh name
                parts = []
                n_initials = 0
            continue

        # non-delimiter tokens join the name unless they contain digits
        if not re_digits.search(tok):
            parts.append(tok)

    # whatever is left over after the last token may still be a name
    if len(parts) >= MIN_NAME_TOKENS:
        candidate = ' '.join(parts)
        lowered = candidate.lower()
        if not any(bad in lowered for bad in BAD_TOKENS):
            found.append(candidate)

    return _format(found)
Exemple #4
0
def parse(search_str):
    """
    Pull the author name(s) out of a candidate byline string.

    Processing steps, as implemented below:

    1. remove HTML tags, remove ``re_by`` matches, strip outer whitespace;
    2. split on ``re_name_token`` and strip each token;
    3. accumulate tokens into a name until a ``DELIM`` token arrives and
       ``_end_name`` accepts the accumulated tokens;
    4. discard names that contain any entry of ``BAD_TOKENS``;
    5. return ``_format`` applied to the surviving names.

    Example carried over from the original docstring (it refers to the
    function as ``authors_from_string`` -- presumably a former name; the
    output shown is unverified here):

    >>> string = 'By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV'
    >>> authors_from_string(string)
    ['Brian Abelson', 'Michael H Keller', 'DR Stijn Debrouwere IV']
    """
    # clean string
    stripped = html.strip_tags(search_str)
    without_by = re_by.sub('', stripped)
    candidate_str = without_by.strip()

    # tokenize
    raw_tokens = re_name_token.split(candidate_str)
    name_tokens = [raw.strip() for raw in raw_tokens]

    authors = []
    current = []        # tokens of the name being built
    initials_seen = 0   # number of initials in `current`

    for tok in name_tokens:
        # upper-case and count anything that looks like an initial
        if _is_initial(current, tok):
            tok = tok.upper()
            initials_seen += 1

        is_delimiter = tok.lower() in DELIM

        if is_delimiter:
            # only close the name (and reset) when it is judged complete
            if _end_name(current, initials_seen):
                full_name = ' '.join(current)
                full_name_lc = full_name.lower()
                if not any(bad in full_name_lc for bad in BAD_TOKENS):
                    authors.append(full_name)

                current = []
                initials_seen = 0

        elif not re_digits.search(tok):
            # ordinary token without digits: part of the current name
            current.append(tok)

    # final check for a trailing name with no closing delimiter
    if len(current) >= MIN_NAME_TOKENS:
        full_name = ' '.join(current)
        full_name_lc = full_name.lower()
        if not any(bad in full_name_lc for bad in BAD_TOKENS):
            authors.append(full_name)

    return _format(authors)
Exemple #5
0
    def _process_text(self, text, **kw):
        """
        Preprocess text.
        """

        # optionally remove punctuation
        if kw.get('rm_punct', True):
            text = "".join(map(lambda x: x if x not in punct else " ", text))

        # optionally remove digits
        if kw.get('rm_digits', True):
            text = "".join(map(lambda x: x if x not in digits else " ", text))

        # optionally remove whitespace
        if kw.get('rm_html', True):
            text = html.strip_tags(text)

        # optionally remove whitespace
        if kw.get('rm_whitespace', True):
            text = re_whitespace.sub(" ", text).strip()

        return text
Exemple #6
0
    def _process_text(self, text, **kw):
        """
        Preprocess text.
        """

        # optionally remove punctuation
        if kw.get("rm_punct", True):
            text = "".join(map(lambda x: x if x not in punct else " ", text))

        # optionally remove digits
        if kw.get("rm_digits", True):
            text = "".join(map(lambda x: x if x not in digits else " ", text))

        # optionally remove whitespace
        if kw.get("rm_html", True):
            text = html.strip_tags(text)

        # optionally remove whitespace
        if kw.get("rm_whitespace", True):
            text = re_whitespace.sub(" ", text).strip()

        return text