def test_split_camel_case():
    # Table of (input, expected output) pairs, checked in order. A leading
    # lowercase letter stays attached to the first capitalised chunk
    # ('iBla'), and words already separated by a space are split
    # independently.
    expectations = (
        ('BlaBla', 'Bla Bla'),
        ('Bla', 'Bla'),
        ('iBla', 'iBla'),
        ('iBlaBla', 'iBla Bla'),
        ('BlaBlaBlaaa', 'Bla Bla Blaaa'),
        ('iBlaBla BlaaaBla', 'iBla Bla Blaaa Bla'),
    )
    for raw, wanted in expectations:
        eq_(split_camel_case(raw), wanted)
def extract_text(doc, title_weight=None, header_weights=None, use_pdf=True,
                 use_stemmer=False, ukkonen_len=0):
    """
    Extracts cleaned text from an HTML or PDF.

    The raw text is obtained with ``extract_text_pdf`` when ``is_pdf(doc)``
    is true and with ``extract_text_html`` otherwise. It is then normalised:
    whitespace collapsed, ``clean_text`` applied, punctuation replaced by
    spaces, camel-case words split, digits and single-character words
    removed, and the result lower-cased.

    :param doc: The document content (HTML or PDF).
    :param title_weight: Passed through to ``extract_text_html``.
    :param header_weights: Passed through to ``extract_text_html``.
    :param use_pdf: If false, PDF documents are skipped and ``''`` is
        returned for them.
    :param use_stemmer: If true, ``apply_stemmer`` is run on the text.
    :param ukkonen_len: If non-zero, passed to
        ``remove_repeated_long_strings`` to drop long repeated strings.
    :returns: The cleaned, lower-cased text; ``''`` for a PDF when
        ``use_pdf`` is false or when PDF extraction raises.
    """
    if is_pdf(doc):
        if not use_pdf:
            return ''
        try:
            text = extract_text_pdf(doc)
        except Exception:
            # TODO: Nice error handling
            # Best effort: a PDF that cannot be parsed yields no text.
            # Only ``Exception`` is caught so that KeyboardInterrupt and
            # SystemExit still propagate.
            return ''
    else:
        text = extract_text_html(
            str2unicode(doc),
            title_weight=title_weight,
            header_weights=header_weights)

    # Replace newlines etc.
    text = re.sub(r'\s+', ' ', text)

    # Replace Umlaute and apply 'unidecode'
    text = clean_text(text)

    # Remove punctuation (each punctuation character becomes a space).
    # ``string.maketrans`` only exists on Python 2; fall back to
    # ``str.maketrans`` where it is missing.
    make_table = getattr(string, 'maketrans', None) or str.maketrans
    replace_punctuation = make_table(
        string.punctuation, ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)

    # Remove multiple spaces
    text = re.sub(' +', ' ', text)

    # Strip
    text = text.strip()

    # Split camel case words
    text = ' '.join(split_camel_case(word) for word in text.split(' '))

    # Remove digits
    text = ''.join(c for c in text if not c.isdigit())

    # Remove single characters. This also drops the empty strings produced
    # by consecutive spaces, so the joined result is already single-spaced
    # and needs no further whitespace collapsing.
    text = ' '.join(word for word in text.split(' ') if len(word) > 1)

    # Stem
    if use_stemmer:
        text = apply_stemmer(text)

    # Remove long repeated strings
    if ukkonen_len:
        text = remove_repeated_long_strings(text, ukkonen_len)

    return text.lower()