def _process_text(self, text, **kw):
    """
    Normalise a piece of text for downstream processing.

    The text is always passed through ``unidecode`` and lower-cased.
    The remaining steps are individually switchable.

    :param text: UTF-8 encoded byte string, or an already decoded
        unicode string.
    :param kw: Optional boolean switches, each defaulting to ``True``:

        * ``remove_html`` -- run ``html.strip_tags`` over the text.
        * ``remove_punct`` -- replace every character found in ``punct``
          with a space.
        * ``remove_digits`` -- replace every character found in
          ``digits`` with a space.
        * ``remove_whitespace`` -- replace each ``re_whitespace`` match
          with a single space and strip both ends.
    :returns: The processed text.
    """
    # Only byte strings need decoding. Calling .decode() on text that is
    # already unicode forces an implicit ASCII encode first, which blows
    # up on any non-ASCII character.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # Transliterate *before* lower-casing: lower-casing undecoded bytes
    # only touches ASCII letters, so an accented capital would come out
    # of unidecode still upper-case.
    text = unicode(unidecode(text).lower(), errors='ignore')

    # Strip markup first. Once punctuation has been blanked out the tag
    # delimiters are gone and the tag names would be left in the text.
    # NOTE(review): assumes `punct` contains '<' and '>' -- confirm.
    if kw.get('remove_html', True):
        text = html.strip_tags(text)

    # Punctuation becomes a space (not dropped) so words stay separated.
    if kw.get('remove_punct', True):
        text = "".join(" " if ch in punct else ch for ch in text)

    # Same treatment for digits.
    if kw.get('remove_digits', True):
        text = "".join(" " if ch in digits else ch for ch in text)

    # Collapse the gaps left behind by the replacements above.
    if kw.get('remove_whitespace', True):
        text = re_whitespace.sub(" ", text).strip()

    return text
def _process_text(self, text, **kw):
    """
    Preprocess text: unidecode + lower-case, then optional clean-up.

    :param text: UTF-8 encoded byte string or unicode string.
    :param kw: Boolean switches (all default to ``True``):
        ``remove_html``, ``remove_punct``, ``remove_digits`` and
        ``remove_whitespace``. Punctuation and digit characters are
        replaced by a space rather than deleted.
    :returns: The processed text.
    """
    remove_html = kw.get('remove_html', True)
    remove_punct = kw.get('remove_punct', True)
    remove_digits = kw.get('remove_digits', True)
    remove_whitespace = kw.get('remove_whitespace', True)

    # Decode byte strings only; unicode input has no bytes to decode and
    # would otherwise go through a failing implicit ASCII encode.
    if isinstance(text, bytes):
        text = text.decode('utf-8')

    # unidecode first, lower() second, so that characters which only
    # become ASCII letters after transliteration are lower-cased as well.
    text = unicode(unidecode(text).lower(), errors='ignore')

    # HTML has to go before punctuation: the punctuation pass would
    # otherwise destroy the tag delimiters and leave tag names behind.
    # NOTE(review): assumes `punct` contains '<' and '>' -- confirm.
    if remove_html:
        text = html.strip_tags(text)

    if remove_punct:
        # swap each punctuation character for a space
        text = "".join(" " if c in punct else c for c in text)

    if remove_digits:
        # swap each digit for a space
        text = "".join(" " if c in digits else c for c in text)

    if remove_whitespace:
        # squeeze whitespace runs and trim the ends
        text = re_whitespace.sub(" ", text).strip()

    return text
def parse(search_str):
    """
    Pull the author name(s) out of a byline-like string.

    The string is stripped of HTML and of whatever ``re_by`` matches,
    split into tokens with ``re_name_token``, and the tokens are grouped
    into names at every token found in ``DELIM``. Names containing any
    entry of ``BAD_TOKENS`` (case-insensitive substring match) are
    dropped. The surviving names are passed through ``_format``.

    Example from the original documentation (the exact result depends
    on the module-level patterns and token lists)::

        parse('By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV')
        # ['Brian Abelson', 'Michael H Keller', 'DR Stijn Debrouwere IV']

    :param search_str: Candidate string that may contain author names.
    :returns: Whatever ``_format`` returns for the collected names.
    """
    def _is_clean(candidate):
        # True when no blacklisted token occurs in the candidate name
        lowered = candidate.lower()
        return not any(bad in lowered for bad in BAD_TOKENS)

    # strip markup and the byline prefix, then tokenize
    cleaned = re_by.sub('', html.strip_tags(search_str)).strip()
    tokens = [piece.strip() for piece in re_name_token.split(cleaned)]

    found = []
    pending = []      # tokens of the name currently being assembled
    n_initials = 0    # initials seen in the pending name

    for tok in tokens:
        # initials are upper-cased and counted
        if _is_initial(pending, tok):
            tok = tok.upper()
            n_initials += 1

        # ordinary token: keep it unless it contains a digit
        if tok.lower() not in DELIM:
            if not re_digits.search(tok):
                pending.append(tok)
            continue

        # delimiter, but the pending name is not complete yet:
        # keep accumulating into the same name
        if not _end_name(pending, n_initials):
            continue

        # delimiter closing a complete name
        candidate = ' '.join(pending)
        if _is_clean(candidate):
            found.append(candidate)
        pending = []
        n_initials = 0

    # whatever is left after the last token may still be a name
    if len(pending) >= MIN_NAME_TOKENS:
        candidate = ' '.join(pending)
        if _is_clean(candidate):
            found.append(candidate)

    return _format(found)
def parse(search_str):
    """
    Extract author names, as a list, from a candidate string.

    Steps: remove HTML tags and the ``re_by`` match, split on
    ``re_name_token``, then walk the tokens, closing a name whenever a
    ``DELIM`` token is reached and ``_end_name`` accepts the tokens
    gathered so far. Tokens matching ``re_digits`` are skipped, and
    names containing a ``BAD_TOKENS`` entry are discarded.

    Example from the original documentation (result depends on the
    module-level patterns and token lists)::

        parse('By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV')
        # ['Brian Abelson', 'Michael H Keller', 'DR Stijn Debrouwere IV']

    :param search_str: String to search for author names.
    :returns: The result of ``_format`` applied to the names found.
    """
    authors = []

    def _collect(parts):
        # Join the tokens into a name and keep it unless it contains a
        # blacklisted token (compared against the lower-cased name).
        joined = ' '.join(parts)
        haystack = joined.lower()
        for bad in BAD_TOKENS:
            if bad in haystack:
                return
        authors.append(joined)

    # clean, then tokenize
    text = html.strip_tags(search_str)
    text = re_by.sub('', text).strip()
    words = [raw.strip() for raw in re_name_token.split(text)]

    parts = []          # tokens of the name in progress
    initials_seen = 0   # how many of them were treated as initials

    for word in words:
        # a token that looks like an initial is upper-cased and counted
        if _is_initial(parts, word):
            word = word.upper()
            initials_seen += 1

        at_delimiter = word.lower() in DELIM

        if at_delimiter and _end_name(parts, initials_seen):
            # complete name: store it and start afresh
            _collect(parts)
            parts, initials_seen = [], 0
        elif not at_delimiter and not re_digits.search(word):
            # regular token without digits extends the current name
            parts.append(word)
        # a delimiter on an incomplete name leaves the state untouched

    # final check for a name that was not followed by a delimiter
    if len(parts) >= MIN_NAME_TOKENS:
        _collect(parts)

    return _format(authors)
def _process_text(self, text, **kw):
    """
    Preprocess text with a set of optional clean-up steps.

    :param text: The text to clean.
    :param kw: Optional boolean switches, each defaulting to ``True``:

        * ``rm_html`` -- run ``html.strip_tags`` over the text.
        * ``rm_punct`` -- replace every character found in ``punct``
          with a space.
        * ``rm_digits`` -- replace every character found in ``digits``
          with a space.
        * ``rm_whitespace`` -- replace each ``re_whitespace`` match with
          a single space and strip both ends.
    :returns: The processed text.
    """
    # Strip markup before anything else. If punctuation were blanked
    # first, the tag delimiters would already be gone and the tag names
    # would be left behind as ordinary words.
    # NOTE(review): assumes `punct` contains '<' and '>' -- confirm.
    if kw.get('rm_html', True):
        text = html.strip_tags(text)

    # Punctuation becomes a space (not dropped) so words stay separated.
    if kw.get('rm_punct', True):
        text = "".join(" " if ch in punct else ch for ch in text)

    # Same treatment for digits.
    if kw.get('rm_digits', True):
        text = "".join(" " if ch in digits else ch for ch in text)

    # Collapse the gaps left behind by the replacements above.
    if kw.get('rm_whitespace', True):
        text = re_whitespace.sub(" ", text).strip()

    return text
def _process_text(self, text, **kw):
    """
    Clean up text; every step can be switched off via a keyword flag.

    :param text: The text to clean.
    :param kw: Boolean switches (all default to ``True``):
        ``rm_html``, ``rm_punct``, ``rm_digits`` and ``rm_whitespace``.
        Punctuation and digit characters are replaced by a space rather
        than deleted.
    :returns: The processed text.
    """
    rm_html = kw.get("rm_html", True)
    rm_punct = kw.get("rm_punct", True)
    rm_digits = kw.get("rm_digits", True)
    rm_whitespace = kw.get("rm_whitespace", True)

    # HTML has to go before punctuation: the punctuation pass would
    # otherwise destroy the tag delimiters and leave tag names behind.
    # NOTE(review): assumes `punct` contains '<' and '>' -- confirm.
    if rm_html:
        text = html.strip_tags(text)

    if rm_punct:
        # swap each punctuation character for a space
        text = "".join(" " if c in punct else c for c in text)

    if rm_digits:
        # swap each digit for a space
        text = "".join(" " if c in digits else c for c in text)

    if rm_whitespace:
        # squeeze whitespace runs and trim the ends
        text = re_whitespace.sub(" ", text).strip()

    return text