def doc_preprocessing(doc): # Removes HTML tags doc = BeautifulSoup(doc, features="lxml").get_text() # Remove numbers doc = doc.translate(str.maketrans('', '', "0123456789")) # Remove punctuation doc = doc.translate( str.maketrans('', '', self.strip_punctuation)) return doc
def doc_preprocessing(doc): # Removes HTML tags doc = BeautifulSoup(doc, features="lxml").get_text() # Remove accentuation doc = unicodedata.normalize('NFKD', doc).encode( 'ASCII', 'ignore').decode('ASCII') # Remove numbers doc = doc.translate(str.maketrans('', '', "0123456789")) # Remove punctuation doc = doc.translate( str.maketrans('', '', self.strip_punctuation)) return doc
def word_clean(words):
    text = BeautifulSoup(words, 'lxml')
    text = text.get_text()
    text = text.encode('ascii', 'replace').decode()
    text = str(' '.join(text.split('\n'))).lower()
    # str.translate(None, ...) is Python 2 only; use str.maketrans in Python 3
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    words = [word for word in text.split(' ') if word != '']
    spwords = spell_check(words)
    # keep only the words that the spell checker leaves unchanged
    res = " ".join(
        [word1 for word1, word2 in zip(words, spwords) if word1 == word2])
    return res

def pre_processing(question):
    def lemmatize_with_pos_tag(sentence):
        tokenized_sentence = TextBlob(sentence)
        tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
        words_and_tags = [(word, tag_dict.get(pos[0], 'n'))
                          for word, pos in tokenized_sentence.tags]
        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
        return " ".join(lemmatized_list)

    question = BeautifulSoup(question, 'html.parser').get_text()
    question = question.lower()
    # str.translate returns a new string, so the result must be reassigned
    question = question.translate(str.maketrans(" ", " ", string.punctuation))
    question = lemmatize_with_pos_tag(question)
    return question

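# Illustrative usage of pre_processing above (assumes TextBlob and its NLTK corpora are
# installed; the sample HTML string is invented for demonstration). The output should be
# the lowercased, punctuation-free, lemmatized text, roughly "the cat be run".
sample_question = pre_processing("<p>The cats were running!</p>")
print(sample_question)
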
def set_body_word_count(self):
    body_basic_html = self.body.stream_block.render_basic(self.body)
    body_text = BeautifulSoup(body_basic_html, 'html.parser').get_text()
    remove_chars = string.punctuation + '“”’'
    body_words = body_text.translate(
        body_text.maketrans(dict.fromkeys(remove_chars))).split()
    self.body_word_count = len(body_words)

def clean_html(raw_html):
    # remove roman numerals inside brackets, e.g. (iv), (ix)
    raw_html = re.sub(r'\([vix]+\)', '', raw_html)
    raw_html = re.sub(r'\s\d+\s', '', raw_html)
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    cleantext = BeautifulSoup(raw_html, "lxml").text
    cleantext = " ".join(cleantext.split())
    # clean all arabic numerals
    numbers = re.findall(r'\d+', cleantext)
    for number in numbers:
        cleantext = cleantext.replace(number, " ")
    # remove punctuation
    table = cleantext.maketrans("", "", string.punctuation)
    cleantext = cleantext.translate(table)
    # remove non-ascii characters
    printable = set(string.printable)
    cleantext = list(filter(lambda x: x in printable, cleantext))
    cleantext = "".join(cleantext)
    # remove free-standing roman numerals from the string;
    # the original split on the literal string "\s+" (which never splits),
    # so plain whitespace splitting is used instead
    toremove = ['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
    text_array = cleantext.split()
    cleantext = [word.strip() for word in text_array if word not in toremove]
    cleantext = " ".join(cleantext)
    return cleantext.strip()

def doc_preprocessing(doc): # Removes HTML tags doc = BeautifulSoup(doc, features="lxml").get_text() # Lowercase doc = doc.lower() # Remove numbers doc = doc.translate(str.maketrans('', '', "0123456789")) return doc
def doc_preprocessing(doc): # Removes HTML tags doc = BeautifulSoup(doc, features="lxml").get_text() # Lowercase doc = doc.lower() # Remove punctuation doc = doc.translate( str.maketrans('', '', self.strip_punctuation)) return doc
def get_keywords(id, name):
    name = name.replace(" ", "_")
    temp = requests.get("https://wiki.metakgp.org/w/" + id + ":_" + name).text
    soup = BeautifulSoup(temp, 'html.parser')
    soup = soup.find_all("p")
    # use the text of the third <p> element on the page
    soup = soup[2].text
    soup = soup.translate(str.maketrans('', '', string.punctuation))
    soup = soup.split(" ")
    return soup

def tokenize_and_remove_punctuations(s):
    s = BeautifulSoup(s, "lxml").text
    # Sastrawi stemmer (Indonesian)
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    s = stemmer.stem(s)
    # strip punctuation and digits before tokenizing
    translator = str.maketrans('', '', string.punctuation)
    modified_string = s.translate(translator)
    modified_string = ''.join([i for i in modified_string if not i.isdigit()])
    return nltk.word_tokenize(modified_string)

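# Illustrative usage of tokenize_and_remove_punctuations (assumes the PySastrawi and
# nltk packages plus the 'punkt' tokenizer data are installed; the imports and the
# sample Indonesian sentence below are for demonstration only):
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from bs4 import BeautifulSoup
import nltk
import string

tokens = tokenize_and_remove_punctuations("<p>Saya sedang membaca 3 buku!</p>")
print(tokens)  # stemmed tokens with punctuation and digits removed
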
def clean_html_and_extract_text(raw_html):
    '''
    Clean an html string that comes from "cleaned_value" column
    '''
    # global foo
    ## use regular expressions to remove roman numerals inside brackets
    ## e.g. (iv), (ix) etc.
    raw_html = re.sub(r'\([vix]+\)', '', raw_html)
    raw_html = re.sub(r'\s\d+\s', '', raw_html)

    ## clear off the non ascii characters, remove the html tags
    ## and get just the text from the document
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    cleantext = BeautifulSoup(raw_html, "lxml").text
    cleantext = " ".join(cleantext.split())
    cleantext = ''.join(x for x in cleantext if x in string.printable)
    # foo.append(cleantext)  # for checking on various libraries
    # extract_fog_score(cleantext)

    ## clear off punctuations in the text
    table = cleantext.maketrans("", "", string.punctuation)
    cleantext = cleantext.translate(table)

    ## clear off all arabic numerals / digits in the text which are attached
    ## together with text
    numbers = re.findall(r'\d+', cleantext)
    for number in numbers:
        cleantext = cleantext.replace(number, " ")

    ## clear off numbers, normalize spaces between words and lowercase it
    ## (the identity comparisons with "is" were replaced by ordinary
    ## comparisons, which is what was intended)
    cleantext = " ".join([
        text for text in cleantext.split(" ")
        if text.strip() != "" and not text.isdigit()
    ]).lower()

    ## remove any non-printable (non-ascii) characters in the text
    printable = set(string.printable)
    cleantext = list(filter(lambda x: x in printable, cleantext))
    cleantext = "".join(cleantext)

    ## remove roman numerals from the string which are not in brackets;
    ## the original split on the literal string "\s+" (which never splits),
    ## so plain whitespace splitting is used instead
    toremove = ['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
    text_array = cleantext.split()
    cleantext = [word.strip() for word in text_array if word not in toremove]
    cleantext = " ".join(cleantext)

    return cleantext.strip()

def clean_text(text):
    # MD - This is unnecessary
    text = re.sub(r'\{\{.*?\}\}', '', text, flags=re.S)
    text = re.sub(r'<ref>.*?</ref>', '', text, flags=re.S)
    # keep only the caption (last pipe-separated field) of [[File:...]] links
    text = re.sub(r'\[\[File:.*?\|.*?\|.*?\|(.*?)\]\]', r'\1', text, flags=re.S)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = text.translate(UGLY_TEXT_MAP)
    # convert wiki bold/italic markers to plain quotes
    text = text.replace("'''", '"').replace("''", '"')
    text = text.strip()
    return text

def sentencePreProcess(body):
    # remove all the HTML tags
    clean_body = BeautifulSoup(body, "lxml").text
    # remove all the punctuation
    clean_body = clean_body.translate(translator)
    # tokenize the given sentence
    word_tokens = word_tokenize(clean_body)
    # remove the stop words
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    # join the remaining tokens back into a sentence
    body = ' '.join(word for word in filtered_sentence)
    return body

def scrubData(text):
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove html
    text = BeautifulSoup(text, "html.parser").get_text()
    # remove the square brackets
    text = re.sub(r'\[[^]]*\]', '', text)
    # remove URLs
    text = re.sub(r'http\S+', '', text)
    # remove punctuation (str.translate is not in-place, so reassign the result)
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)
    # split string into word tokens and lowercase
    word_tokens = toktok.tokenize(text)
    words = [word for word in word_tokens if word.isalpha()]
    words = [token.lower() for token in words]
    # remove stopwords
    words = [w for w in words if w not in stop_Words]
    # stemming each token word
    #words = [porter.stem(word) for word in words]
    #words = [sno.stem(word) for word in words]
    # lemmatizing each token word
    words = [lem.lemmatize(word) for word in words]
    # join all the word tokens back into one string
    text_review = ''
    for word in words:
        text_review += str(word) + ' '
    text = text_review.lower()
    return text

def clean_filing(text, remove_xbrl=True, to_lower=True, remove_punctuation=True):
    if remove_xbrl:
        # drop everything from the first <XBRL>/<xbrl> tag onward, if present
        # (without the guard, a missing tag would silently truncate the last character)
        xml_start = max(text.find('<XBRL>'), text.find('<xbrl>'))
        if xml_start != -1:
            text = text[:xml_start]
    text = BeautifulSoup(text, "lxml").text.encode('ascii', 'ignore').decode("utf-8")
    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\"', '', text)
    if to_lower:
        text = text.lower()
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))
    return text

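# Illustrative usage of clean_filing (the filing snippet is invented; a real SEC filing
# would normally be read from disk or fetched from EDGAR):
filing = "<html><body>Item 1. BUSINESS: revenue grew 12% in 2020.</body></html><XBRL>...</XBRL>"
print(clean_filing(filing))                  # lowercased, digits and punctuation removed
print(clean_filing(filing, to_lower=False,
                   remove_punctuation=False))  # keep case and punctuation
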
def calculate_stats(instance):
    if instance._content is not None:
        stats = {}
        content = instance._content
        # How fast do average people read?
        WPM = 250
        # Use BeautifulSoup to get readable/visible text
        raw_text = BeautifulSoup(content, 'html.parser').getText()
        # Process the text to remove entities
        entities = r'\&\#?.+?;'
        raw_text = raw_text.replace(' ', ' ')
        raw_text = re.sub(entities, '', raw_text)
        # Flesch-Kincaid readability stats count sentences,
        # so save the text before removing punctuation
        tmp = raw_text
        # Process the text to remove punctuation
        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
        raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))
        # Count the words in the text
        words = raw_text.lower().split()
        word_count = Counter(words)
        # Return the stats
        stats['word_counts'] = word_count
        stats['wc'] = sum(word_count.values())
        # Calculate how long it'll take to read, rounding up
        stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
        if stats['read_mins'] == 0:
            stats['read_mins'] = 1
        # Calculate Flesch-Kincaid readability stats
        readability_stats = stcs, words, sbls = text_stats(tmp, stats['wc'])
        stats['fi'] = "{:.2f}".format(flesch_index(readability_stats))
        stats['fk'] = "{:.2f}".format(flesch_kincaid_level(readability_stats))
        instance.stats = stats

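# For reference: text_stats above appears to return a (sentences, words, syllables)
# tuple, and flesch_index / flesch_kincaid_level are assumed to implement the standard
# readability formulas. A minimal sketch under those assumptions (the *_sketch helpers
# below are illustrative, not the project's actual implementation):
def flesch_index_sketch(readability_stats):
    sentences, words, syllables = readability_stats
    # Flesch Reading Ease: higher scores mean easier text
    return 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)


def flesch_kincaid_level_sketch(readability_stats):
    sentences, words, syllables = readability_stats
    # Flesch-Kincaid Grade Level: roughly the US school grade needed to read the text
    return 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
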
def process_text(text):
    """Remove any punctuation, numbers, newlines, and stopwords. Convert to
    lower case. Split the text string into individual words, stem each word,
    and append the stemmed word to words. Make sure there's a single space
    between each stemmed word.

    Args:
        text (str): A text.

    Returns:
        str: Cleaned, normalized, and stemmed text.
    """
    # Remove HTML tags.
    text = BeautifulSoup(text, "html.parser").get_text()
    # Normalize links replacing them with the str 'link'.
    text = re.sub(r'http\S+', 'link', text)
    # Normalize numbers replacing them with the str 'number'.
    text = re.sub(r'\d+', 'number', text)
    # Normalize emails replacing them with the str 'email'.
    text = re.sub(r'\S+@\S+', 'email', text, flags=re.MULTILINE)
    # Remove punctuation.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove whitespaces.
    text = text.strip()
    # Convert all letters to lower case.
    text = text.lower()
    # Create the stemmer.
    stemmer = SnowballStemmer('english')
    # Split text into words.
    words = text.split()
    # Remove stopwords.
    words = [w for w in words if w not in stopwords.words('english')]
    # Stem words.
    words = [stemmer.stem(w) for w in words]
    return ' '.join(words)

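# Illustrative usage of process_text (assumes nltk with the 'stopwords' corpus and
# BeautifulSoup are installed; the sample string is invented for demonstration):
print(process_text("<p>Contact me at user@example.com or visit "
                   "https://example.com, offer ends in 3 days!</p>"))
# expected: something like "contact email visit link offer end number day"
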
def _fetch_CDN_(self, resp):
    if 'alt="Upgrade to Pornhub Premium to enjoy this video."' in resp:
        # upgrade-to-premium message with nothing to fetch;
        # just remove that link from the file and move on
        return True
    if 'var player_quality_' in resp:
        p720 = resp.find('var player_quality_720p = \'')
        if p720 == -1:
            p420 = resp.find('var player_quality_480p = \'')
            if p420 == -1:
                p240 = resp.find('var player_quality_240p = \'')
                if p240 == -1:
                    # nothing is there
                    print(
                        "\n[None] No Video Format could be found -- Removing the Link"
                    )
                    return True
                else:
                    print("[FETCHED -- 240px]")
                    start = p240 + 27
                    end = resp.find('\'', p240 + 30)
            else:
                print("[FETCHED -- 480px]")
                start = p420 + 27
                end = resp.find('\'', p420 + 30)
        else:
            print("[FETCHED -- 720px]")
            start = p720 + 27
            end = resp.find('\'', p720 + 30)
        # print(resp[start:end])
        file_name = BeautifulSoup(resp, "html.parser")
        file_name = str(file_name.title.string)
        # str.translate(None, ...) is Python 2 only; strip characters that are
        # invalid in file names with str.maketrans instead
        file_name = file_name.translate(str.maketrans('', '', "'*:\"\\/?<>|"))
        download = Download(resp[start:end], "%s.mp4" % (file_name))
        download = download.now()
        if download:
            return True
        return False
    else:
        pass

def clean_text(text):
    """
    Purpose: cleans the text of undesired symbols
    Parameters: text string
    Returns: text string
    """
    text = BeautifulSoup(text, "lxml").text  # remove html tags
    text = text.lower()  # lowercase the text
    text = text.strip()  # remove leading/trailing spaces
    text = re.sub(r' +', ' ', text)  # collapse repeated spaces
    text = re.sub(r"[/(){}\[\]\|@,;]", ' ', text)  # replace special symbols by space
    text = re.sub(r"[-()\"#/@;:{}`+=~|.!?,']", "", text)  # drop remaining special characters
    text = re.sub(r'[0-9]+', '', text)  # drop numbers
    # remove punctuation and keep alphabetic tokens only
    text = " ".join(
        word.translate(str.maketrans('', '', string.punctuation))
        for word in text.split() if word.isalpha())
    return text

def calculate_stats(instance):
    if instance._content is not None:
        stats = {}
        content = instance._content
        # How fast do average people read?
        WPM = 180
        # Use BeautifulSoup to get readable/visible text
        raw_text = BeautifulSoup(content, 'html.parser').getText()
        # Process the text to remove entities
        entities = r'\&\#?.+?;'
        raw_text = raw_text.replace(' ', ' ')
        raw_text = re.sub(entities, '', raw_text)
        # Process the text to remove punctuation
        drop = u'.,?!@#$%^&*()_+-=\|/[]{}`~:;\'\"‘’—…“”'
        raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))
        # Count the words in the text
        words = raw_text.lower().split()
        word_count = Counter(words)
        # Return the stats
        total_words = sum(word_count.values())
        stats['word_counts'] = word_count
        stats['total_words'] = total_words
        stats['wc'] = total_words
        # Calculate how long it'll take to read, rounding up
        stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
        if stats['read_mins'] == 0:
            stats['read_mins'] = 1
        instance.stats = stats

def _fetch_CDN_(self, resp):
    if 'alt="Upgrade to Pornhub Premium to enjoy this video."' in resp:
        # upgrade-to-premium message with nothing to fetch;
        # just remove that link from the file and move on
        return True
    if 'var player_quality_' in resp:
        p720 = resp.find('var player_quality_720p = \'')
        if p720 == -1:
            p420 = resp.find('var player_quality_480p = \'')
            if p420 == -1:
                p240 = resp.find('var player_quality_240p = \'')
                if p240 == -1:
                    # nothing is there
                    print("\n[None] No Video Format could be found -- Removing the Link")
                    return True
                else:
                    print("[FETCHED -- 240px]")
                    start = p240 + 27
                    end = resp.find('\'', p240 + 30)
            else:
                print("[FETCHED -- 480px]")
                start = p420 + 27
                end = resp.find('\'', p420 + 30)
        else:
            print("[FETCHED -- 720px]")
            start = p720 + 27
            end = resp.find('\'', p720 + 30)
        # print(resp[start:end])
        file_name = BeautifulSoup(resp, "html.parser")
        file_name = str(file_name.title.string)
        # str.translate(None, ...) is Python 2 only; strip characters that are
        # invalid in file names with str.maketrans instead
        file_name = file_name.translate(str.maketrans('', '', "'*:\"\\/?<>|"))
        download = Download(resp[start:end], "%s.mp4" % (file_name))
        download = download.now()
        if download:
            return True
        return False
    else:
        pass

def process_text(text):
    mentions = text.count('@')
    hashtags = text.count('#')
    urls = len(find_urls(text))
    # Remove links
    text = ' '.join(re.sub(r"(\w+:\/\/\S+)", " ", text).split())
    # Remove mentions
    text = ' '.join(
        re.sub(r"(@[A-Za-z0-9^\w]+)", " ",
               text.replace('@ ', '@').replace('# ', '#')).split())
    # Replace hashtags with words
    if text.count('#') > 0:
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text.replace('#', ' ')))
    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Save content length (excluding links and mentions)
    length = len(text)
    # Remove punctuation symbols
    text = ' '.join(
        re.sub(r"[\.\,\¡\¿\!\?\:\;\-\=\*\(\)\[\]\"\'\“\_\+\”\%\/\‘\’]", " ",
               text).split())
    text = text.translate(remove_digits).translate(remove_punctuation)
    # Lower case to avoid case sensitive problems
    text = text.lower()
    # Replace emojis with names
    text = emoji.demojize(text)
    # Add space between emojis and other characters
    ind = -2
    for c in range(text.count(':')):
        ind = text.find(':', ind + 2)
        if c % 2 == 0:
            newLetter = ' :'
        else:
            newLetter = ': '
        text = "".join((text[:ind], newLetter, text[ind + 1:]))
    # Replace emoji names with spanish meaning
    result = []
    parts = text.split(' ')
    for part in parts:
        if part:
            if part[0] == ':':
                em = handle_emoji_tone(part)
                em = emoji_meaning(em)
                if em:
                    result.append(em)
            else:
                result.append(part)
    text = ' '.join(result)
    # Filter using the NLTK library and join back into a string
    word_tokens = word_tokenize(text)
    result = [w for w in word_tokens if w not in stop_words]
    text = ' '.join(result)
    # Check if text contains at least a word
    analysis = TextBlob(text)
    try:
        # Sentiment analysis (note: TextBlob's translate() calls the Google
        # Translate API and is deprecated in newer TextBlob releases)
        eng = analysis.translate(to='en')
        sentiment = eng.sentiment
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity
    except Exception as e:
        polarity = 0.0
        subjectivity = 0.0
    result = {
        'no_hashtags': [hashtags],
        'no_mentions': [mentions],
        'no_urls': [urls],
        'effective_length': [length],
        'polarity': [polarity],
        'subjectivity': [subjectivity]
    }
    return result

    # tail of a tokenizer helper (its def header is not included in this snippet)
    words = [i for i in tokens if i not in stop_words]  # remove stop words
    stems = stem_tokens(words, stemmer)
    return stems


# Pre-processing step
print("Pre-processing documents...")
for file in file_name:
    if counter < 201:
        start = timer()
        with open(os.path.join(folder_dir, rel_path, file), 'rb') as f:
            read_data = f.read()  # Read from file
            input_str = BeautifulSoup(read_data, "lxml").get_text()  # Extract text from document
            input_str = input_str.casefold()  # Convert to lower-case
            input_str = re.sub(r'\d+', '', input_str)  # Remove numbers
            input_str = input_str.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
            input_str = " ".join(input_str.split())  # Remove extra whitespace
            input_str = input_str.replace("\n", " ")  # Remove newlines
            input_str = unicodedata.normalize("NFKD", input_str)  # Normalize unicode characters
            corpus[file] = input_str
            print(counter)
            counter += 1
            f.close()  # redundant: the with-statement already closes the file
    else:
        break

# print(list(corpus.values())[0])  # Print first document's text for testing
values = []
files = []
for k, v in corpus.items():
    values.append(v)

def cleanText(text):
    text = BeautifulSoup(text).get_text()
    text = text.translate(UGLY_TEXT_MAP)
    text = text.replace("'''", '"')
    return text

#!/usr/bin/env python
# coding: utf-8

# In[18]:

import nltk
import numpy as np
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
from nltk import word_tokenize
import re

# In[14]:

url = 'https://www.bbc.com/urdu/sport-49174685'
html = request.urlopen(url).read().decode('utf8')

# In[55]:

raw = BeautifulSoup(html, 'html.parser').get_text()
# replace ASCII punctuation and special characters with spaces
raw = raw.translate(
    {ord(c): " " for c in "\\!@#$%^&*\'\"()[]{};:,./<>?\|`~-=_+\n"})
tokens = word_tokenize(raw)
# drop tokens that start with Latin letters or digits, keeping the Urdu words
tokens = [w for w in tokens if not re.match(r'[A-Z]+', w, re.I)]
tokens = [w for w in tokens if not re.match(r'[0-9]+', w, re.I)]
len(tokens)

# In[ ]:

# Hardcoded ZIP codes known for their respective categories
# (int('02108') keeps the ZIP numeric; a leading-zero integer literal is invalid in Python 3)
categories = {
    'beach': [33109, 32407, 29572, 90266],
    'nightlife': [70130, int('02108'), 10017, 60642],
    'hiking': [98101, 37202, 80303, 84104],
    'architecture': [70130, int('02108'), 98101, 37202]
}

# TODO see about the jq stuff implemented in the cloud functions
with open('secrets', 'r') as s:
    key = s.readlines()
    key = key[0].strip('\n')

with io.open('meetup_bio_data', 'w', encoding='utf8') as f:
    for topic, locations in categories.items():
        for location in locations:
            endpoint = 'https://api.meetup.com/find/groups?key={}&zip={}&text={}'.format(
                key, location, topic)
            r = requests.get(endpoint)
            r = r.json()
            # Group level description
            for group in r:
                description = BeautifulSoup(group['description'], "lxml").text
                description = description.translate(
                    {ord(c): None for c in '\r\n\"\''})
                if len(description) > 0:
                    f.write("\"" + description[:1000] + "\"" + "," + topic + '\n')

#END OUTPUT HELPER SECTION#

#MAIN METHOD SECTION#
#This is the main method. It takes the user's initial sentence,
#gives menu options for the perturbation type, and outputs the perturbed sentence.
if __name__ == '__main__':
    #model = Word2Vec.load("new_model")  #"300features_40minwords_10context")
    #TODO this should not be hard coded
    #sentence = input("Please enter a sentence to perturb: ")
    with open('sentence.txt', 'r') as myfile:
        sentence = myfile.read().replace('\n', ' ')
    raw = BeautifulSoup(sentence).get_text()
    sentence = raw.translate(str.maketrans('', '', string.punctuation))
    original_sentence = word_tokenize(sentence)
    perturbation_type = input(
        "Please select a perturbation type. 1) Random, 2) Anchor Points 3) Reverse Engineering: "
    )
    perturbation_type = int(perturbation_type)
    perturbed_sentence = original_sentence
    if perturbation_type == 1:
        num_words_to_perturb = input(
            "Please enter a number of words to perturb: ")
        num_words_to_perturb = int(num_words_to_perturb)
        num_features_to_perturb = input(
            "Please enter a number of features to perturb as a number between 1 and 300: "
        )
        num_features_to_perturb = int(num_features_to_perturb)

def clean_raw_text(self,
                   text,
                   remove_html=False,
                   lower_case=False,
                   remove_punctuation=False,
                   remove_stopwords=False,
                   remove_digits=False,
                   remove_emoji=False,
                   remove_urls=False,
                   stemming=False,
                   spell_correction=False):
    '''
    text --> text data to be cleaned
    if remove_html is True, it will remove the html tags and then return the text
    if lower_case is True, it will convert the text to lower case
    if remove_punctuation is True, it will remove the punctuation (!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)
    if remove_stopwords is True, it will remove all the stopwords in our text
    if remove_digits is True, it will remove all the digits in our text
    if remove_emoji is True, it will remove the emojis in our text
    if remove_urls is True, it will remove the urls in our text
    if stemming is True, then it will do stemming
    if spell_correction is True, then it will correct our spellings
    '''
    if remove_html:
        text = BeautifulSoup(text, "lxml").text
    if lower_case:
        text = str(text).lower()
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        text = (' '.join([
            word for word in str(text).split() if word not in stop_words
        ]))
    if remove_digits:
        text = text.translate(str.maketrans('', '', digits))
    if remove_emoji:
        # https://stackoverflow.com/a/49146722/330558
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE)
        text = emoji_pattern.sub(r'', text)
    if remove_urls:
        url_pattern = re.compile(r'https?://\S+|www\.\S+')
        text = url_pattern.sub(r'', text)
    if stemming:
        stemmer = PorterStemmer()
        text = (' '.join(
            [stemmer.stem(word) for word in str(text).split()]))
    if spell_correction:
        # https://norvig.com/spell-correct.html
        spell = SpellChecker()
        corrected_text = []
        misspelled_words = spell.unknown(text.split())
        for word in text.split():
            if word in misspelled_words:
                corrected_text.append(spell.correction(word))
            else:
                corrected_text.append(word)
        text = " ".join(corrected_text)
    return text

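# Illustrative usage of clean_raw_text (the TextCleaner class name and the sample
# string are invented for demonstration; the method only applies the steps whose
# flags are True):
#
#   cleaner = TextCleaner()
#   cleaner.clean_raw_text("<p>Check https://example.com, it's GREAT!!! 123 :)</p>",
#                          remove_html=True, lower_case=True,
#                          remove_urls=True, remove_digits=True)
#
# Note: the steps run in the order the flags appear in the method body, so
# remove_punctuation is applied before remove_urls; if both are set, the punctuation
# pass mangles URLs before the URL regex can match them.
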
# add a property holding only the month
mois = re.findall("[0-3][0-9]/([0-1][0-9])/[0-9]{4}", data_dict["dateStr"])
data_dict["mois"] = mois[0]

# add a property holding only the year
annee = re.findall("[0-3][0-9]/[0-1][0-9]/([0-9]{4})", data_dict["dateStr"])
data_dict["année"] = annee[0]

# add the URL of the avalanche page to the dictionary
data_dict["URL"] = driver.current_url

# clean up the avalanche description
raw_desc = BeautifulSoup(data_dict["description"], features="html.parser").get_text()  # strip all html tags
# build a translation table for special whitespace characters
# (str.maketrans requires both strings to have the same length,
# so each of \n, \t and \r is mapped to its own space)
substitut = str.maketrans("\n\t\r", "   ")
clean_desc = raw_desc.translate(substitut)  # apply the substitution to the description
data_dict["description"] = clean_desc  # store the cleaned description on the avalanche record

# remove the auteurs_photos property (not needed)
del data_dict["auteurs_photos"]

# If the file does not exist, create it and insert the first avalanche
if not os.path.isfile(FICHIER_CIBLE):
    print("# Création du fichier ' " + FICHIER_CIBLE + "'")
    with open(FICHIER_CIBLE, 'w', encoding='utf-8') as json_file:
        json.dump(data_dict, json_file)
    print("| Ajout de l'avalanche " + str(data_dict["id"]) + " au fichier")
else:
    # accumulate the data in a list
    list_avalanche.append(data_dict)
    print("Ajout de l'avalanche " + str(data_dict["id"]) + " à la liste cache")

def set_body_word_count(self):
    body_basic_html = self.body.stream_block.render_basic(self.body)
    body_text = BeautifulSoup(body_basic_html, 'html.parser').get_text()
    remove_chars = string.punctuation + '“”’'
    body_words = body_text.translate(
        body_text.maketrans(dict.fromkeys(remove_chars))).split()
    self.body_word_count = len(body_words)

def _parse_energy_level_section(str, last_data=None):
    data = {}
    splitted_str = str.split('\n')
    for i, line in enumerate(splitted_str):
        clean_str = BeautifulSoup(line.strip(), "lxml").text
        if sys.version_info[0] < 3:
            # Python 2 needs the text encoded explicitly
            clean_str = clean_str.encode("utf-8")
        if clean_str.strip() == '':
            continue
        if i == 0:
            data['configuration'] = clean_str.replace('\xa0', '')
        if i == 1:
            data['term'] = clean_str.replace('\xa0', '')
        if i == 3:
            if ',' in clean_str:
                data['J'] = clean_str.strip()
            else:
                # J may be a fraction such as "3/2"
                resplit = re.split(r"a?/a?", clean_str)
                if len(resplit) == 2:
                    data['J'] = float(resplit[0].replace(' ', '')) / float(
                        resplit[1])
                else:
                    data['J'] = int(clean_str.strip())
        if i == 4:
            clean_str = clean_str.strip()
            clean_str = clean_str.translate(
                {ord(i): None for i in ' ()[]+x'})
            data['level (eV)'] = float(clean_str)
            # refind1 = re.findall(r"\d+\.\d+", clean_str.replace(' ', ''))[0]
            # if type(refind1) == float:
            #     data['level (eV)'] = refind1
            # else:
            #     data['level (eV)'] = float(re.findall(r"\d+", clean_str.replace(' ', ''))[0])
        if i == 5:
            data['uncertainty (eV)'] = float(clean_str.replace(' ', ''))
        if i == 6:
            data['level splittings (eV)'] = float(
                clean_str.replace(' ', ''))
        try:
            if i == 7:
                data['leading percentages'] = float(clean_str)
        except ValueError:
            # leading percentage is not always there
            if i == 7:
                data['reference'] = clean_str.replace('\xa0', '')

    if 'configuration' not in data:
        data['configuration'] = ''
    if 'term' not in data:
        data['term'] = ''
    if data['configuration'] == '':
        # data['configuration'] = last_data['configuration']
        if data['term'] == '':
            data['term'] = last_data['term']
    return data
