Example #1
 def doc_preprocessing(self, doc):
     # Removes HTML tags
     doc = BeautifulSoup(doc, features="lxml").get_text()
     # Remove numbers
     doc = doc.translate(str.maketrans('', '', "0123456789"))
     # Remove punctuation
     doc = doc.translate(
         str.maketrans('', '', self.strip_punctuation))
     return doc
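All of these snippets lean on the same deletion idiom: str.maketrans('', '', chars) builds a translation table that maps every character in chars to None, and str.translate applies it in one pass. A minimal standalone sketch (the sample string is purely illustrative):

import string

sample = "Price: $1,234.50 (approx.)"
no_digits = sample.translate(str.maketrans('', '', "0123456789"))
no_punct = no_digits.translate(str.maketrans('', '', string.punctuation))
print(no_punct)  # 'Price  approx'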
Example #2
 def doc_preprocessing(self, doc):
     # Removes HTML tags
     doc = BeautifulSoup(doc, features="lxml").get_text()
     # Remove accentuation
     doc = unicodedata.normalize('NFKD', doc).encode(
         'ASCII', 'ignore').decode('ASCII')
     # Remove numbers
     doc = doc.translate(str.maketrans('', '', "0123456789"))
     # Remove punctuation
     doc = doc.translate(
         str.maketrans('', '', self.strip_punctuation))
     return doc
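The accent-stripping line is worth seeing in isolation: NFKD normalization splits each accented character into a base letter plus a combining mark, and the ASCII encode/decode round trip drops the marks. A small self-contained sketch (the sample text is illustrative):

import unicodedata

text = "Café naïve façade"
ascii_text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
print(ascii_text)  # 'Cafe naive facade'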
Example #3
def word_clean(words):
    text = BeautifulSoup(words, 'lxml')
    text = text.get_text()
    text = text.encode('ascii', 'replace').decode()
    text = str(' '.join(text.split('\n'))).lower()
    # str.translate takes a single translation table in Python 3
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    words = [word for word in text.split(' ') if word != '']
    spwords = spell_check(words)
    res = " ".join(
        [word1 for word1, word2 in zip(words, spwords) if word1 == word2])
    return res
Example #4
def pre_processing(question):
    def lemmatize_with_pos_tag(sentence):
        tokenized_sentence = TextBlob(sentence)
        tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
        words_and_tags = [(word, tag_dict.get(pos[0], 'n'))
                          for word, pos in tokenized_sentence.tags]
        lemmatized_list = [wd.lemmatize(tag) for wd, tag in words_and_tags]
        return " ".join(lemmatized_list)

    question = BeautifulSoup(question, 'html.parser').get_text()
    question = question.lower()
    # assign the result, otherwise the punctuation stripping is a no-op
    question = question.translate(str.maketrans('', '', string.punctuation))
    question = lemmatize_with_pos_tag(question)
    return question
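A quick sketch of the POS-aware lemmatization idiom used above, assuming textblob is installed along with its corpora (python -m textblob.download_corpora); the sample sentence is illustrative:

from textblob import TextBlob

blob = TextBlob("The striped bats were hanging on their feet")
# .tags yields (word, Penn Treebank tag) pairs; map the tag's first letter
# to a WordNet part of speech so lemmatize() picks the right form.
tag_dict = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}
print(" ".join(word.lemmatize(tag_dict.get(pos[0], 'n')) for word, pos in blob.tags))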
Example #5
 def set_body_word_count(self):
     body_basic_html = self.body.stream_block.render_basic(self.body)
     body_text = BeautifulSoup(body_basic_html, 'html.parser').get_text()
     remove_chars = string.punctuation + '“”’'
     body_words = body_text.translate(
         body_text.maketrans(dict.fromkeys(remove_chars))).split()
     self.body_word_count = len(body_words)
Example #6
def clean_html(raw_html):

    # remove roman numerals inside brackets, e.g. (iv), (ix)
    raw_html = re.sub(r'\([ivx]+\)', '', raw_html)
    raw_html = re.sub(r'\s\d+\s', '', raw_html)

    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')

    cleantext = BeautifulSoup(raw_html, 'html.parser').text  # explicit parser avoids bs4's "no parser" warning
    cleantext = " ".join(cleantext.split())

    # clean all arabic numerals
    numbers = re.findall(r'\d+', cleantext)
    for number in numbers:
        cleantext = cleantext.replace(number, " ")

    # remove punctuations
    table = cleantext.maketrans("", "", string.punctuation)
    cleantext = cleantext.translate(table)

    # remove non - ascii
    printable = set(string.printable)
    cleantext = list(filter(lambda x: x in printable, cleantext))
    cleantext = "".join(cleantext)

    # remove freestanding roman numerals from the string
    toremove = ['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
    # split on whitespace so the membership test below actually sees bare tokens
    text_array = re.split(r'\s+', cleantext)
    cleantext = [word.strip() for word in text_array if word not in toremove]
    cleantext = " ".join(cleantext)

    return cleantext.strip()
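Note that str.split only splits on a literal substring; splitting on arbitrary runs of whitespace needs re.split(r'\s+') or a bare split(). A minimal sketch (the sample text is illustrative):

import re

text = "one   two\tthree  iv "
print(re.split(r'\s+', text))   # ['one', 'two', 'three', 'iv', ''] (trailing empty string)
print(text.split())             # ['one', 'two', 'three', 'iv']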
Example #7
 def doc_preprocessing(doc):
     # Removes HTML tags
     doc = BeautifulSoup(doc, features="lxml").get_text()
     # Lowercase
     doc = doc.lower()
     # Remove numbers
     doc = doc.translate(str.maketrans('', '', "0123456789"))
     return doc
Example #8
 def doc_preprocessing(self, doc):
     # Removes HTML tags
     doc = BeautifulSoup(doc, features="lxml").get_text()
     # Lowercase
     doc = doc.lower()
     # Remove punctuation
     doc = doc.translate(
         str.maketrans('', '', self.strip_punctuation))
     return doc
Example #9
def get_keywords(id, name):
    name = name.replace(" ", "_")
    temp = requests.get("https://wiki.metakgp.org/w/" + id + ":_" + name).text
    soup = BeautifulSoup(temp, 'html.parser')
    soup = soup.find_all("p")
    soup = soup[2].text
    soup = soup.translate(str.maketrans('', '', string.punctuation))
    soup = soup.split(" ")
    return soup
Example #10
def tokenize_and_remove_punctuations(s):
    s = BeautifulSoup(s, "lxml").text
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    s = stemmer.stem(s)
    translator = str.maketrans('', '', string.punctuation)
    modified_string = s.translate(translator)
    modified_string = ''.join([i for i in modified_string if not i.isdigit()])
    return nltk.word_tokenize(modified_string)
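This snippet appears to rely on the PySastrawi Indonesian stemmer plus bs4 (lxml parser) and nltk; a hedged sketch of the imports and setup it would typically need (the sample sentence is illustrative, and the NLTK punkt tokenizer data must be downloaded once):

import string
import nltk
from bs4 import BeautifulSoup
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('punkt')  # one-time download for nltk.word_tokenize
print(tokenize_and_remove_punctuations("<p>Dia sedang membaca 2 buku di perpustakaan!</p>"))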
Example #11
def clean_html_and_extract_text(raw_html):
    '''
    Clean an HTML string that comes from the "cleaned_value" column.
    '''
    #    global foo

    ## use regular expressions to remove roman numerals inside brackets
    ## e.g. (iv), (ix) etc.
    raw_html = re.sub(r'\([ivx]+\)', '', raw_html)
    raw_html = re.sub(r'\s\d+\s', '', raw_html)

    ## clear off the non ascii characters, remove the html tags
    ## and get just the text from the document
    raw_html = bytes(raw_html, 'utf-16').decode("utf-16", 'ignore')
    cleantext = BeautifulSoup(raw_html, 'html.parser').text  # explicit parser avoids bs4's "no parser" warning
    cleantext = " ".join(cleantext.split())
    cleantext = ''.join(x for x in cleantext if x in string.printable)

    # foo.append(cleantext)

    # for checking on various libraries
    # extract_fog_score(cleantext)

    ## clear off punctuations in the text
    table = cleantext.maketrans("", "", string.punctuation)
    cleantext = cleantext.translate(table)

    ## clear off all arabic numerals / digits in the text which are attached
    ## together with text
    numbers = re.findall(r'\d+', cleantext)
    for number in numbers:
        cleantext = cleantext.replace(number, " ")

    ## clear off numbers and normalize spaces between words
    ## and lowercase it
    cleantext = " ".join([
        text for text in cleantext.split(" ")
        if text.strip() is not "" and text.isdigit() is False
    ]).lower()

    ## remove any non-printable (non-ascii) characters in the text
    printable = set(string.printable)
    cleantext = list(filter(lambda x: x in printable, cleantext))
    cleantext = "".join(cleantext)

    ## remove roman numerals which are not in brackets
    toremove = ['ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x']
    ## split on whitespace so the membership test below actually sees bare tokens
    text_array = re.split(r'\s+', cleantext)
    cleantext = [word.strip() for word in text_array if word not in toremove]
    cleantext = " ".join(cleantext)

    return cleantext.strip()
Example #12
def clean_text(text):
    # MD - This is unnecessary
    text = re.sub(r'\{\{.*?\}\}', '', text, flags=re.S)
    text = re.sub(r'<ref>.*?</ref>', '', text, flags=re.S)
    text = re.sub(r'\[\[File:.*?\|.*?\|.*?\|(.*?)\]\]', r'\1', text, flags=re.S)
    text = BeautifulSoup(text, 'lxml').get_text()
    text = text.translate(UGLY_TEXT_MAP)
    text = text.replace("'''", '"').replace("''", '"')
    text = text.strip()
    return text
Example #13
def sentencePreProcess(body):
    #this removes all the HTML tags
    clean_body = BeautifulSoup(body, "lxml").text
    #this removes all the punctuation
    clean_body = clean_body.translate(translator)
    #tokenize the given sentence
    word_tokens = word_tokenize(clean_body)
    #remove the stop words
    filtered_sentence = [w for w in word_tokens if w not in stop_words]
    #convert from list to sentence
    body = ' '.join(filtered_sentence)
    return body
Example #14
def scrubData(text):
    # remove numbers
    text = re.sub(r'\d+', '', text) 

    # remove html
    text = BeautifulSoup(text, "html.parser").get_text()

    #Removing the square brackets
    text = re.sub(r'\[[^]]*\]', '', text)

    # Removing URL's
    text = re.sub(r'http\S+', '', text)

    # remove punctuation (the result must be assigned back, or it is a no-op)
    translator = str.maketrans('', '', string.punctuation) 
    text = text.translate(translator)

    # split string into word tokens and lowercase
    word_tokens = toktok.tokenize(text)
    words = [word for word in word_tokens if word.isalpha()]
    words = [token.lower() for token in words]

    # remove stopwords
    words = [w for w in words if not w in stop_Words]

    # stemming each token word
    #words = [porter.stem(word) for word in words]
    #words = [sno.stem(word) for word in words]
    
    # lemmatizing each token word
    words = [lem.lemmatize(word) for word in words]

    # Join all the word tokens back into one string
    text = ' '.join(words)
    return text
Example #15
def clean_filing(text, remove_xbrl=True, to_lower=True, remove_punctuation=True):
    if remove_xbrl:
        xml_start = max(text.find('<XBRL>'), text.find('<xbrl>'))
        if xml_start != -1:  # guard: find() returns -1 when there is no XBRL section
            text = text[:xml_start]
    
    text = BeautifulSoup(text, "lxml").text.encode('ascii', 'ignore').decode("utf-8")

    text = re.sub(r'[0-9]+', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\"', '', text)
        
    if to_lower:
        text = text.lower()
        
    if remove_punctuation:
        text = text.translate(str.maketrans('', '', string.punctuation))
        
    return text
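A short usage sketch for clean_filing, assuming bs4 (with the lxml parser), re, and string are imported in the surrounding module; the filing text is a stand-in, not a real SEC document:

sample_filing = "<html><body><p>Item 1. Business: revenue grew 12% in 2020.</p></body></html>"
print(clean_filing(sample_filing))                   # lowercased, digits/quotes/punctuation stripped
print(clean_filing(sample_filing, to_lower=False,
                   remove_punctuation=False))        # keep the original case and punctuation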
Example #16
def calculate_stats(instance):

    if instance._content is not None:
        stats = {}
        content = instance._content

        # How fast do average people read?
        WPM = 250

        # Use BeautifulSoup to get readable/visible text
        raw_text = BeautifulSoup(content, 'html.parser').getText()

        # Process the text to remove entities
        entities = r'\&\#?.+?;'
        raw_text = raw_text.replace('&nbsp;', ' ')
        raw_text = re.sub(entities, '', raw_text)

        # Flesch-Kincaid readability stats count sentences,
        # so save the text before removing punctuation
        tmp = raw_text

        # Process the text to remove punctuation
        drop = u'.,?!@#$%^&*()_+-=\\|/[]{}`~:;\'\"‘’—…“”'
        raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))

        # Count the words in the text
        words = raw_text.lower().split()
        word_count = Counter(words)

        # Return the stats
        stats['word_counts'] = word_count
        stats['wc'] = sum(word_count.values())

        # Calculate how long it'll take to read, rounding up
        stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
        if stats['read_mins'] == 0:
            stats['read_mins'] = 1

        # Calculate Flesch-Kincaid readability stats
        readability_stats = stcs, words, sbls = text_stats(tmp, stats['wc'])
        stats['fi'] = "{:.2f}".format(flesch_index(readability_stats))
        stats['fk'] = "{:.2f}".format(flesch_kincaid_level(readability_stats))

        instance.stats = stats
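The read_mins line is integer ceiling division; a tiny worked sketch of the same arithmetic:

WPM = 250
for wc in (0, 1, 250, 251, 700):
    read_mins = (wc + WPM - 1) // WPM   # equivalent to ceil(wc / WPM) without math.ceil
    if read_mins == 0:
        read_mins = 1
    print(wc, read_mins)                # 0 -> 1, 1 -> 1, 250 -> 1, 251 -> 2, 700 -> 3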
Example #17
def process_text(text):
    """Remove any punctuation, numbers, newlines, and stopwords. Convert to lower case. Split the text string into individual words, stem each word, and append the stemmed word to words. Make sure there's a single space between each stemmed word.

    Args:
        text (str): A text.

    Returns:
        str: Cleaned, normalized, and stemmed text.
    """
    # Remove HTML tags.
    text = BeautifulSoup(text, "html.parser").get_text()

    # Normalize links replacing them with the str 'link'.
    text = re.sub(r'http\S+', 'link', text)

    # Normalize numbers replacing them with the str 'number'.
    text = re.sub(r'\d+', 'number', text)

    # Normalize emails replacing them with the str 'email'.
    text = re.sub(r'\S+@\S+', 'email', text, flags=re.MULTILINE)
    
    # Remove punctuation.    
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove whitespaces.
    text = text.strip()
    
    # Convert all letters to lower case.
    text = text.lower()
    
    # Create the stemmer.
    stemmer = SnowballStemmer('english')
    
    # Split text into words.
    words = text.split()
    
    # Remove stopwords.
    words = [w for w in words if w not in stopwords.words('english')]
    
    # Stem words.
    words = [stemmer.stem(w) for w in words]
    
    return ' '.join(words)
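A usage sketch for process_text, assuming the surrounding module already imports bs4, re, string, and nltk's SnowballStemmer and stopwords, and that the stopwords corpus has been downloaded with nltk.download('stopwords'); the input string is illustrative:

sample = "<p>Visit https://example.com or email me at foo@bar.com, I counted 42 items!</p>"
print(process_text(sample))
# URLs, numbers and email addresses are normalized to 'link', 'number' and 'email',
# stopwords are dropped, and the remaining words come back stemmed and lowercased.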
Example #18
 def _fetch_CDN_(self, resp):
     if 'alt="Upgrade to Pornhub Premium to enjoy this video."' in resp:
         #upgrade to premium message with nothing to fetch just remove that link from file and move on
         return True
     if 'var player_quality_' in resp:
         p720 = resp.find('var player_quality_720p = \'')
         if p720 == -1:
             p420 = resp.find('var player_quality_480p = \'')
             if p420 == -1:
                 p240 = resp.find('var player_quality_240p = \'')
                 if p240 == -1:
                     #nothing is there
                     print(
                         "\n[None] No Video Format could be found -- Removing the Link"
                     )
                     return True
                 else:
                     print("[FETCHED -- 240px]")
                     start = p240 + 27
                     end = resp.find('\'', p240 + 30)
             else:
                 print("[FETCHED -- 420px]")
                 start = p420 + 27
                 end = resp.find('\'', p420 + 30)
         else:
             print("[FETCHED -- 720px]")
             start = p720 + 27
             end = resp.find('\'', p720 + 30)
         #print resp[start:end]
         file_name = BeautifulSoup(resp, "html.parser")
         file_name = str(file_name.title.string)
         # Python 3: build a deletion table instead of the old two-argument translate
         file_name = file_name.translate(str.maketrans('', '', "'*:\"/?<>|"))
         download = Download(resp[start:end], "%s.mp4" % (file_name))
         download = download.now()
         if download:
             return True
         return False
     else:
         pass
Example #19
def clean_text(text):
    """
    Purpose: This function performs cleaning of text from undesired symbols
    
    parameters: text string
    Returns:  text string
    """

    text = BeautifulSoup(text, "lxml").text  #remove html tags
    text = text.lower()  #lowercase the etst
    text = text.strip()  #remove trailing spaces
    text = re.sub(r' +', ' ', text)  # replace replace_symbols by space in text
    text = re.sub(r"[/(){}\[\]\|@,;]", ' ',
                  text)  # replace replace_symbols by space in text
    text = re.sub(r"[-()\"#/@;:{}`+=~|.!?,']", "",
                  text)  #Replacing special character with none
    text = re.sub(r'[0-9]+', '', text)  #Replacing numbers with none

    # remove punctuation and keep alphabetic tokens only
    text = " ".join(
        word.translate(str.maketrans('', '', string.punctuation))
        for word in text.split() if word.isalpha())
    return (text)
Example #20
def calculate_stats(instance):

    if instance._content is not None:
        stats = {}
        content = instance._content

        # How fast do average people read?
        WPM = 180

        # Use BeautifulSoup to get readable/visible text
        raw_text = BeautifulSoup(content, 'html.parser').getText()

        # Process the text to remove entities
        entities = r'\&\#?.+?;'
        raw_text = raw_text.replace('&nbsp;', ' ')
        raw_text = re.sub(entities, '', raw_text)

        # Process the text to remove punctuation
        drop = u'.,?!@#$%^&*()_+-=\\|/[]{}`~:;\'\"‘’—…“”'
        raw_text = raw_text.translate(dict((ord(c), u'') for c in drop))

        # Count the words in the text
        words = raw_text.lower().split()
        word_count = Counter(words)

        # Return the stats
        total_words = sum(word_count.values())
        stats['word_counts'] = word_count
        stats['total_words'] = total_words
        stats['wc'] = total_words

        # Calculate how long it'll take to read, rounding up
        stats['read_mins'] = (stats['wc'] + WPM - 1) // WPM
        if stats['read_mins'] == 0:
            stats['read_mins'] = 1

        instance.stats = stats
Example #21
	def _fetch_CDN_(self,resp):		
		if 'alt="Upgrade to Pornhub Premium to enjoy this video."' in resp:
			#upgrade to premium message with nothing to fetch just remove that link from file and move on
			return True
		if 'var player_quality_' in resp:			
			p720 = resp.find('var player_quality_720p = \'')
			if p720 == -1:
				p420 = resp.find('var player_quality_480p = \'')
				if p420 == -1:
					p240 = resp.find('var player_quality_240p = \'')
					if p240 == -1:
						#nothing is there
						print("\n[None] No Video Format could be found -- Removing the Link")
						return True
					else:
						print("[FETCHED -- 240px]")
						start = p240 + 27
						end = resp.find('\'',p240+30)
				else:
					print("[FETCHED -- 420px]")
					start = p420 + 27
					end = resp.find('\'',p420+30)
			else:
				print("[FETCHED -- 720px]")
				start = p720 + 27
				end = resp.find('\'',p720+30)
			#print resp[start:end]				
			file_name = BeautifulSoup(resp,"html.parser")
			file_name = str(file_name.title.string)
			file_name = file_name.translate(str.maketrans('', '', "'*:\"/?<>|"))
			download = Download(resp[start:end],"%s.mp4"%(file_name))
			download = download.now()			
			if download:				
				return True
			return False
		else:
			pass
Example #22
def process_text(text):
    mentions = text.count('@')
    hashtags = text.count('#')
    urls = len(find_urls(text))

    # Remove links
    text = ' '.join(re.sub("(\w+:\/\/\S+)", " ", text).split())

    # Remove mentions
    text = ' '.join(
        re.sub("(@[A-Za-z0-9^\w]+)", " ",
               text.replace('@ ', '@').replace('# ', '#')).split())

    # Replace hashtags with words
    if text.count('#') > 0:
        text = ' '.join(re.findall('[A-Z][^A-Z]*', text.replace('#', ' ')))

    # Remove HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Save content length (excluding links and mentions)
    length = len(text)

    # Remove punctuation symbols
    text = ' '.join(
        re.sub("[\.\,\¡\¿\!\?\:\;\-\=\*\(\)\[\]\"\'\“\_\+\”\%\/\‘\’]", " ",
               text).split())
    text = text.translate(remove_digits).translate(remove_punctuation)

    # Lower case to avoid case sensitive problems
    text = text.lower()

    # Replace emojis with names
    text = emoji.demojize(text)

    # Add space between emojis and other characters
    ind = -2
    for c in range(text.count(':')):
        ind = text.find(':', ind + 2)
        if c % 2 == 0:
            newLetter = ' :'
        else:
            newLetter = ': '
        text = "".join((text[:ind], newLetter, text[ind + 1:]))

    # Replace emoji names with spanish meaning
    result = []
    parts = text.split(' ')
    for part in parts:
        if part:
            if part[0] == ':':
                em = handle_emoji_tone(part)
                em = emoji_meaning(em)
                if em:
                    result.append(em)
            else:
                result.append(part)

    text = ' '.join(result)

    # Filter using NLTK library append it to a string
    word_tokens = word_tokenize(text)
    result = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(result)

    # Check if text contains at least a word
    analysis = TextBlob(text)
    try:
        # Sentiment analysis
        eng = analysis.translate(to='en')
        sentiment = eng.sentiment
        polarity = sentiment.polarity
        subjectivity = sentiment.subjectivity

    except Exception as e:
        polarity = 0.0
        subjectivity = 0.0

    result = {
        'no_hashtags': [hashtags],
        'no_mentions': [mentions],
        'no_urls': [urls],
        'effective_length': [length],
        'polarity': [polarity],
        'subjectivity': [subjectivity]
    }

    return result
Example #23
    words = [i for i in tokens if not i in stop_words] #remove stop words
    stems = stem_tokens(words, stemmer)
    return stems

#Pre-processing step
print("Pre-processing documents...")
for file in file_name:
    if(counter < 201):
        start = timer()
        with open(os.path.join(folder_dir,rel_path,file), 'rb') as f:
            read_data = f.read() #Read from file

        input_str = BeautifulSoup(read_data, "lxml").get_text() # Extract text from document
        input_str = input_str.casefold() #Convert to lower-case
        input_str = re.sub(r'\d+', '', input_str) #Remove numbers 
        input_str = input_str.translate(str.maketrans("","",string.punctuation)) #Remove punctuation
        input_str = " ".join(input_str.split()) #Removes whitespaces
        input_str = input_str.replace("\n"," ") #Removes newline
        input_str = unicodedata.normalize("NFKD", input_str) #Removes unicode characters.
        corpus[file] = input_str
        print(counter)
        counter+=1
        f.close()
    else:
        break   
#print(list(corpus.values())[0]) --Print first document's text for testing
values = []
files = []

for k,v in corpus.items():
    values.append(v)
Example #24
def cleanText(text):
    text = BeautifulSoup(text).get_text()
    text = text.translate(UGLY_TEXT_MAP)
    text = text.replace("'''", '"')
    return text
Example #25
#!/usr/bin/env python
# coding: utf-8

# In[18]:

import nltk
import numpy as np
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
from nltk import word_tokenize
import re

# In[14]:

url = 'https://www.bbc.com/urdu/sport-49174685'
html = request.urlopen(url).read().decode('utf8')

# In[55]:

raw = BeautifulSoup(html, 'html.parser').get_text()
raw = raw.translate(
    {ord(c): " "
     for c in "\\!@#$%^&*\'\"()[]{};:,./<>?\|`~-=_+\n"})
tokens = word_tokenize(raw)
tokens = [w for w in tokens if not re.match(r'[A-Z]+', w, re.I)]
tokens = [w for w in tokens if not re.match(r'[0-9]+', w, re.I)]
len(tokens)

# In[ ]:
Example #26
# Hardcoded ZIP codes known for their respective categories
categories = {
    'beach': [33109, 32407, 29572, 90266],
    'nightlife': [70130, '02108', 10017, 60642],      # ZIPs with leading zeros stay strings
    'hiking': [98101, 37202, 80303, 84104],
    'architecture': [70130, '02108', 98101, 37202]
}

#TODO see about the jq stuff implemented in the cloud functions
with open('secrets', 'r') as s:
    key = s.readlines()
    key = key[0].strip('\n')

with io.open('meetup_bio_data', 'w', encoding='utf8') as f:
    for topic, locations in categories.items():
        for location in locations:
            endpoint = 'https://api.meetup.com/find/groups?key={}&zip={}&text={}'.format(
                key, location, topic)
            r = requests.get(endpoint)
            r = r.json()
            # Group level description
            for group in r:
                description = BeautifulSoup(group['description'], "lxml").text
                description = description.translate(
                    {ord(c): None
                     for c in '\r\n\"\''})
                if len(description) > 0:
                    f.write("\"" + description[:1000] + "\"" + "," + topic +
                            '\n')
Example #27
def cleanText(text):
	text = BeautifulSoup(text).get_text()
	text = text.translate(UGLY_TEXT_MAP)
	text = text.replace("'''", '"')
	return text
Example #28
#END OUTPUT HELPER SECTION#

#MAIN METHOD SECTION#
#This is the main method.  It takes the user's initial sentence,
#gives menu options for the perturbation type, and outputs the perturbed sentence.
if __name__ == '__main__':

    #model = Word2Vec.load("new_model")#"300features_40minwords_10context") #TODO this should not be hard coded

    #sentence = input("Please enter a sentence to perturb: ")
    with open('sentence.txt', 'r') as myfile:
        sentence = myfile.read().replace('\n', ' ')

    raw = BeautifulSoup(sentence).get_text()
    sentence = raw.translate(str.maketrans('', '', string.punctuation))
    original_sentence = word_tokenize(sentence)

    perturbation_type = input(
        "Please select a perturbation type.  1) Random, 2) Anchor Points 3) Reverse Engineering: "
    )
    perturbation_type = int(perturbation_type)
    perturbed_sentence = original_sentence
    if (perturbation_type == 1):
        num_words_to_perturb = input(
            "Please enter a number of words to perturb: ")
        num_words_to_perturb = int(num_words_to_perturb)
        num_features_to_perturb = input(
            "Please enter a number of features to perturb as a number between 1 and 300: "
        )
        num_features_to_perturb = int(num_features_to_perturb)
Example #29
 def clean_raw_text(self,
                    text,
                    remove_html=False,
                    lower_case=False,
                    remove_punctuation=False,
                    remove_stopwords=False,
                    remove_digits=False,
                    remove_emoji=False,
                    remove_urls=False,
                    stemming=False,
                    spell_correction=False):
     '''
     text --> text data to be cleaned
     if remove_html is True, remove the HTML tags and return the text
     if lower_case is True, convert the text to lower case
     if remove_punctuation is True, remove punctuation (!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~)
     if remove_stopwords is True, remove all stopwords from the text
     if remove_digits is True, remove all digits from the text
     if remove_emoji is True, remove emojis from the text
     if remove_urls is True, remove URLs from the text
     if stemming is True, apply stemming
     if spell_correction is True, correct the spelling
     '''
     if remove_html:
         text = BeautifulSoup(text, "lxml").text
     if lower_case:
         text = str(text).lower()
     if remove_punctuation:
         text = text.translate(str.maketrans('', '', string.punctuation))
     if remove_stopwords:
         stop_words = set(stopwords.words('english'))
         text = (' '.join([
             word for word in str(text).split() if word not in stop_words
         ]))
     if remove_digits:
         text = text.translate(str.maketrans('', '', digits))
     if remove_emoji:
         # https://stackoverflow.com/a/49146722/330558
         emoji_pattern = re.compile(
             "["
             u"\U0001F600-\U0001F64F"  # emoticons
             u"\U0001F300-\U0001F5FF"  # symbols & pictographs
             u"\U0001F680-\U0001F6FF"  # transport & map symbols
             u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
             u"\U00002702-\U000027B0"
             u"\U000024C2-\U0001F251"
             "]+",
             flags=re.UNICODE)
         text = emoji_pattern.sub(r'', text)
     if remove_urls:
         url_pattern = re.compile(r'https?://\S+|www\.\S+')
         text = url_pattern.sub(r'', text)
     if stemming:
         stemmer = PorterStemmer()
         text = (' '.join(
             [stemmer.stem(word) for word in str(text).split()]))
     if spell_correction:
         # https://norvig.com/spell-correct.html
         spell = SpellChecker()
         corrected_text = []
         misspelled_words = spell.unknown(text.split())
         for word in text.split():
             if word in misspelled_words:
                 corrected_text.append(spell.correction(word))
             else:
                 corrected_text.append(word)
         text = " ".join(corrected_text)
     return text
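A quick usage sketch for clean_raw_text, assuming the method above is available with its module's imports (bs4, re, string, nltk, spellchecker). It never touches self, so for a standalone test it can be called with None in the self slot; in real use it would live on a preprocessing class:

cleaned = clean_raw_text(
    None,  # stand-in for self; the method does not use it
    "<b>Check https://example.com, it has 3 offers!</b>",
    remove_html=True, lower_case=True, remove_digits=True, remove_urls=True)
print(cleaned)  # the HTML tags, the digit and the URL are stripped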
Example #30
    # add a property holding only the month
    mois = re.findall("[0-3][0-9]/([0-1][0-9])/[0-9]{4}", data_dict["dateStr"])
    data_dict["mois"] = mois[0]

    # add a property holding only the year
    annee = re.findall("[0-3][0-9]/[0-1][0-9]/([0-9]{4})", data_dict["dateStr"])
    data_dict["année"] = annee[0]

    # add the URL of the avalanche page to the dictionary
    data_dict["URL"] = driver.current_url

    # clean up the avalanche description
    raw_desc = BeautifulSoup(data_dict["description"], features="html.parser").get_text() # strip all html tags
    substitut = str.maketrans("\n\t\r", "   ") # build a substitution table for the special characters
    clean_desc = raw_desc.translate(substitut) # apply the substitution to the description
    data_dict["description"] = clean_desc # assign the cleaned description back to the avalanche record

    # remove the auteurs_photos property (not needed)
    del data_dict["auteurs_photos"]

    # If the file does not exist, create it and insert the first avalanche
    if not (os.path.isfile(FICHIER_CIBLE)):
        print("# Création du fichier ' " + FICHIER_CIBLE + "'")
        with open(FICHIER_CIBLE, 'w', encoding='utf-8') as json_file:
            json.dump(data_dict, json_file)
        print("| Ajout de l'avalanche " + str(data_dict["id"]) + " au fichier")
    else:
        # otherwise, accumulate the data in a list
        list_avalanche.append(data_dict)
        print("Ajout de l'avalanche " + str(data_dict["id"]) + " à la liste cache")
Example #31
 def set_body_word_count(self):
     body_basic_html = self.body.stream_block.render_basic(self.body)
     body_text = BeautifulSoup(body_basic_html, 'html.parser').get_text()
     remove_chars = string.punctuation + '“”’'
     body_words = body_text.translate(body_text.maketrans(dict.fromkeys(remove_chars))).split()
     self.body_word_count = len(body_words)
Example #32
    def _parse_energy_level_section(section_str, last_data=None):

        data = {}
        splitted_str = section_str.split('\n')
        for i, line in enumerate(splitted_str):
            clean_str = BeautifulSoup(line.strip(), "lxml").text
            if sys.version_info[0] < 3:  # Python 2 compatibility
                clean_str = clean_str.encode("utf-8")

            if clean_str.strip() == '': continue

            if i == 0: data['configuration'] = clean_str.replace('\xa0', '')

            if i == 1: data['term'] = clean_str.replace('\xa0', '')

            if i == 3:
                if ',' in clean_str:
                    data['J'] = clean_str.strip()
                else:
                    resplit = re.split(r"a?/a?", clean_str)
                    if len(resplit) == 2:
                        data['J'] = float(resplit[0].replace(' ', '')) / float(
                            resplit[1])
                    else:
                        data['J'] = int(clean_str.strip())

            if i == 4:
                clean_str = clean_str.strip()
                clean_str = clean_str.translate(
                    {ord(i): None
                     for i in ' ()[]+x'})
                data['level (eV)'] = float(clean_str)


#                refind1 = re.findall(r"\d+\.\d+", clean_str.replace(' ', ''))[0]
#                if type(refind1) == float:
#                    data['level (eV)'] = refind1
#                else:
#                    data['level (eV)'] = float(re.findall(r"\d+", clean_str.replace(' ', ''))[0])

            if i == 5:
                data['uncertainty (eV)'] = float(clean_str.replace(' ', ''))

            if i == 6:
                data['level splittings (eV)'] = float(
                    clean_str.replace(' ', ''))

            try:
                if i == 7: data['leading percentages'] = float(clean_str)
            except ValueError:  # leading percentage is not always there
                if i == 7: data['reference'] = clean_str.replace('\xa0', '')

        if 'configuration' not in data:
            data['configuration'] = ''

        if 'term' not in data:
            data['term'] = ''

        if data['configuration'] == '':  #
            data['configuration'] = last_data['configuration']

        if data['term'] == '':
            data['term'] = last_data['term']

        return data