Example #1
 def txt2words(self, txt, remove_stopwords=True):
   txt = BeautifulSoup(txt).get_text()
   txt = ftfy.fix_text(txt)
   txt = txt.replace("\\n", '')
   txt = re.sub("[^0-9a-zA-Z]"," ", txt)
   if remove_stopwords:
     words = [self.save_stem(w) for w in txt.lower().split() if w not in self.stopwords and len(w) > 2 and not w.isdigit()]
   else:
     words = [self.save_stem(w) for w in txt.lower().split() if len(w) > 2 and not w.isdigit()]
   return words
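A minimal harness to exercise this method, assuming it is available at module level and that the enclosing class provides `stopwords` (a set) and `save_stem` (a stemming helper); the `Cleaner` class below is an assumption, not the original:

import re
import ftfy
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

class Cleaner:
    def __init__(self):
        self.stopwords = set(stopwords.words("english"))

    def save_stem(self, word):
        # assumption: stem defensively, falling back to the raw word
        try:
            return PorterStemmer().stem(word)
        except Exception:
            return word

Cleaner.txt2words = txt2words  # attach the method shown above

print(Cleaner().txt2words("<p>Running 3 quick tests!</p>"))
# -> ['run', 'quick', 'test']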
Example #2
    def report_to_wordlist(report):
        # Function to convert document text to a sequence of words,
        # removing stop words.  Returns a list of [word, count] pairs.

        # Remove HTML tags and related
        report_text = BeautifulSoup(report).get_text()
    
        # Remove non-letters
        report_text = re.sub("[^a-zA-Z]"," ", report_text)
       
        # Convert words to lower case and split them
        words = report_text.lower().split()
        myStops = ["any", "my","like","another","one","two","else","bras","ago","cos","get","yet","k","go", "every", "sort", "push","pull"]

        stoplist = set(stopwords.words("english") + myStops)
        words = [w for w in words if (not w in stoplist and len(w)>3)]
        
        wordListTuple = Counter(words).most_common()

        # Convert the (word, count) tuples into [word, count] lists
        listofWords = [[pair[0], pair[1]] for pair in wordListTuple]

        # Return a list of [word, count] pairs ordered by frequency
        return listofWords
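For instance (imports assumed: re, BeautifulSoup from bs4, Counter from collections, stopwords from nltk.corpus):

print(report_to_wordlist("<p>Rain, rain, rain in Paris. Paris!</p>"))
# -> [['rain', 3], ['paris', 2]]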
Example #3
def process_song(song, remove_stopwords = True):
    # Function to convert raw song lyrics to a sequence of words,
    # optionally removing stop words. Returns a list of words.
    #
    # 1. Remove HTML
    song_text = BeautifulSoup(song).get_text()
    #  
    # 2. Remove literal "\n" sequences, separate out comma and ! symbols
    # from words, and remove the rest of the characters.
    song_text = re.sub(r"\\n"," ", song_text)
    # TODO: Should we keep comma and ! ??
    song_text = re.sub("(,|!)",r" \1", song_text)
    song_text = re.sub("[^a-zA-Z',!]"," ", song_text)
    #
    # 3. Convert words to lower case and split them
    words = song_text.lower().split()
    #
    # 4. Optionally remove stop words (enabled by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    #    return(words)
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return( " ".join(words)) 
Example #4
def clean_review(raw_review, remove_stopwords = False, output_format = "string"):
    """
    Input:
            raw_review: raw text of a movie review
            remove_stopwords: a boolean variable to indicate whether to remove stop words
            output_format: if "string", return a cleaned string 
                           if "list", a list of words extracted from cleaned string.
    Output:
            Cleaned string or list.
    """
    
    # Remove HTML markup
    text = BeautifulSoup(raw_review)
    
    # Keep only characters
    text = re.sub("[^a-zA-Z]", " ", text.get_text())
    
    # Split words and store to list
    text = text.lower().split()
    
    if remove_stopwords:
    
        # Use set as it has O(1) lookup time
        stops = set(stopwords.words("english"))
        words = [w for w in text if w not in stops]
    
    else:
        words = text
    
    # Return a cleaned string or list
    if output_format == "string":
        return " ".join(words)
        
    elif output_format == "list":
        return words
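Example call (imports assumed: re, BeautifulSoup from bs4, stopwords from nltk.corpus):

review = "<br />A wonderful, WONDERFUL film from 1994!"
print(clean_review(review, remove_stopwords=True, output_format="list"))
# -> ['wonderful', 'wonderful', 'film']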
Example #5
    def review_to_words(raw_review, remove_stopwords = False):
        # BeautifulSoup pulls data out of html file
        # here it removes html tags and markups
        text = BeautifulSoup(raw_review).get_text()

        # replace numbers by word number
        text=re.sub(r'[0-9]+','number',text)

        # remove punctuations (they can be analyzed for better results)
        text = re.sub(r'[^a-zA-Z]', ' ', text)
        text = text.lower()

        #make a list of words
        words_list = text.split()

        #download nltk text data sets, including stop words
        #nltk.download()

        if remove_stopwords:
            # get stopwords, searching a set is faster than searching a list
            stops = set(stopwords.words('english'))
            # remove stopwords
            words_list = [word for word in words_list if not word in stops]

        # reduce words to their stems
        stemmer=PorterStemmer()
        words_list=[stemmer.stem(word) for word in words_list]
        # return the list of words
        return words_list
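Sample output showing the Porter stems, assuming the function is reachable at module level (imports: re, BeautifulSoup from bs4, stopwords from nltk.corpus, PorterStemmer from nltk.stem):

print(review_to_words("<b>The movies were amazing</b>", remove_stopwords=True))
# -> ['movi', 'amaz']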
Example #6
def reviewToWordList(rawReview, removeStopWords = False):
    """
    Converts a document to sequence of words
    optionally removing stop words
    will later extend to optionally remove numbers
    
    I/O
    -Input: raw html in string form
    -Output: list of words
    """
    
    #Remove HTML
    cleanedReview = BeautifulSoup(rawReview).get_text()
    
    #Remove non-letters
    cleanedReview = re.sub("[^a-zA-Z]",
                           " ",
                           cleanedReview)
    
    #Convert words to lowerCase
    cleanedReview = cleanedReview.lower()
    
    #Split Words
    wordList = cleanedReview.split()
    
    #Optionally remove stop words
    if ( removeStopWords ):
        stops = set(stopwords.words('english'))
        wordList = [ word for word in wordList if word not in stops]
    
    #Return list of words
    return(wordList)
Example #7
def process_strings( string ):
    # 1. Remove HTML
    words = BeautifulSoup(string).get_text()
    
       
    # separate words joined in camelCase
    words = re.sub(r'(\w+)([A-Z][a-z]+)', r'\1 \2', words)

    # 3. Convert to lower case
    words = words.lower()

    # remove unwanted characters and split glued number/letter tokens
    words = re.sub(r'[^a-zA-Z0-9\s]', ' ', words)
    words = re.sub(r'(\d+)x(\d+)', r'\1 \2', words)  # e.g. "4x8" -> "4 8"
    words = re.sub(r'(\d+)x\s', r'\1 ', words)
    words = re.sub(r'\sx(\d+)', r' \1', words)
    words = re.sub(r'\sx\s', ' ', words)
    words = re.sub(r'(\D+)(\d+)', r'\1 \2', words)
    words = re.sub(r'(\d+)(\D+)', r'\1 \2', words)
    words = re.sub(r'(\d+)(\D+)(\d+)', r'\1 \2 \3', words)

    # drop measurement-unit abbreviations; repeated passes are needed
    # because each substitution consumes the surrounding whitespace
    unit_re = (r'\s(ft|sq|in|gal|cu|h|oz|dia|yd|yds|a|p|qt|ah|amp|gpm|mp'
               r'|quart|watt|cc|d|inc|incl|lb|lbs|lin|ln|mil|mm|no|n|oc'
               r'|od|pc|pal|pt|s|sch|cs|case|pallet|w)\s')
    for _ in range(9):
        words = re.sub(unit_re, ' ', words)

    # Split into words and return the list
    return words.split()
Example #8
def get_flickr_image_title(url):
    def meta(tag):
        return tag.name == 'meta' and 'name' in tag.attrs and tag['name'] == 'title'

    html = HTTP.request('GET', url + '/sizes/o/')
    title =  BeautifulSoup(html.data, 'html5lib').find(meta)['content'].split('|')[0]
    return title.lower()
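`HTTP` is not defined in the snippet; a urllib3 pool manager matches the `.request(...).data` interface it uses (the variable name and the sample URL are the only assumptions):

import urllib3
HTTP = urllib3.PoolManager()
# title = get_flickr_image_title("https://www.flickr.com/photos/someuser/1234567890")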
Example #9
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]

        # Reduce words to their stems; english_stemmer is assumed to be
        # a module-level stemmer (e.g. SnowballStemmer("english"))
        stemmer = english_stemmer  # PorterStemmer() also works
        stemmed = [stemmer.stem(word) for word in words]

        # 5. Return the list of stemmed words
        return stemmed
Example #10
def review_to_words(raw_review):
    # function to convert a raw review to a string of words
    # the input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    # 1. remove html
    review_text = BeautifulSoup(raw_review).get_text() 
    #
    # 2. remove non-letters        
    # letters_only = re.sub("[^a-za-z]", " ", review_text) 
    #
    # 3. convert to lower case, split into individual words
    # words = letters_only.lower().split()                             
    words = review_text.lower().split()                             
    #
    # 4. in python, searching a set is much faster than searching
    #   a list, so convert the stop words to a set
    # stops = set(stopwords.words("english"))                  
    # 
    # 5. remove stop words
    # meaningful_words = [w for w in words if not w in stops]   
    #
    # 6. join the words back into one string separated by space, 
    # and return the result.
    # return( " ".join( meaningful_words ))
    return( " ".join( words ))
Example #11
def reviewToWords(rawReview):
    """
    Converts raw review to a string of words
    -Input is single html string
    -Output is preprocessed single string
    """
    cleanedReview = None
    
    #Remove HTML
    cleanedReview = BeautifulSoup(rawReview)
    
    #Remove numbers and punctuation
    cleanedReview = re.sub("[^a-zA-Z]",
                           " ",
                           cleanedReview.get_text())
    
    #Make all words lowercase
    cleanedReview = cleanedReview.lower()
    
    #Split into individual words
    cleanedReviewWords = cleanedReview.split()
    
    #Convert to set instead of list for efficiency
    stops = set(stopwords.words("english"))
    
    #Remove stop words
    meaningfulWords = [word for word in cleanedReviewWords if word not in stops]
    
    #Join words back into one string
    return (" ".join( meaningfulWords ))
Example #12
def _extract_date(tag: str, el: bs4.element.Tag, verbose: bool = False) -> list:
    result = []

    if len(el) > 300:
        return []

    # if verbose:
    #     print(el)

    if tag == 'meta' and el.has_attr('content'):
        result.append(el['content'])
    if tag == 'abbr' and all([el.has_attr('itemprop'), el.has_attr('title')]):
        result.append(el['title'])
    # if tag == 'time' and el.has_attr('datetime'):
    #     result.append(el['datetime'])

    _ = el.prettify()
    _ = BeautifulSoup(_, "lxml").getText()
    _ = _[:300]
    if _:
        result.append(_.lower().strip())
    #
    # if verbose:
    #     pprint.pprint(result)
    return result
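A small demonstration with a meta tag (bs4 with the lxml parser assumed):

import bs4
soup = bs4.BeautifulSoup('<meta name="date" content="2020-01-02">', 'lxml')
print(_extract_date('meta', soup.find('meta')))
# -> ['2020-01-02']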
Example #13
    def review_str_to_wordlist(raw_review, clean_method,
                               remove_numbers=True, remove_punct=True,
                               remove_stopwords=True):
        """Clean one single review item (string) and return it as a list of
        words

        :param raw_review: the unprocessed raw review string
        :param clean_method: the method to clean review, e.g., BeautifulSoup
        :param remove_numbers: boolean if remove numbers
        :param remove_punct: boolean, if remove punctuations
        :param remove_stopwords: boolean, if remove stopwords
        :returns: cleaned reviews,
        :rtype: string

        """
        if clean_method == 'BeautifulSoup':
            word_list = BeautifulSoup(raw_review, 'lxml').get_text()
        else:
            sys.exit(('review_str_to_wordlist: The clean method not '
                      'supported yet!'))

        if remove_numbers and remove_punct:
            word_list = re.sub('[^a-zA-Z]', ' ', word_list).lower().split()
        elif remove_numbers and not remove_punct:
            word_list = re.sub('[0-9]', ' ', word_list).lower().split()
        elif not remove_numbers and remove_punct:
            word_list = re.sub('[^a-zA-Z0-9]', ' ', word_list).lower().split()
        else:
            word_list = word_list.lower().split()

        if remove_stopwords:
            stops = set(stopwords.words('english'))
            word_list = [word for word in word_list if word not in stops]

        return word_list
Example #14
class MovieReview(object):
	def __init__(self, mreview):
		self.mreview = mreview
		self.mreview_clean = None
		self.mreview_word_list = []
		self.mreview_sentence_list = []

	def clean_review(self):
		# function to clean the review by stripping html from review text body
		self.mreview_clean = BeautifulSoup(self.mreview).get_text()

	def remove_punctuation_and_nums(self):
		self.mreview_clean = re.sub("[^a-zA-Z]", " ", self.mreview_clean)

	def split_review_into_words(self):
		# function to split the review text to list of words
	    self.mreview_word_list = self.mreview_clean.lower().split()

	def remove_stop_words(self):
		# build the stopword set once instead of once per word
		stops = set(stopwords.words("english"))
		self.mreview_word_list = [word for word in self.mreview_word_list if word not in stops]
		self.mreview_clean = " ".join(self.mreview_word_list)

	def split_review_into_sentences(self):
		# function to split review into list of sentences
		# where each setence is a list of words
		extracted_sentences = TOKENIZER.tokenize(self.mreview_clean.strip())
		for extracted_sentence in extracted_sentences:
			if len(extracted_sentence) > 0:
				# extracted_sentence needs to be operated on if stopword or punctuation
				# removal is required eventually(not required for word2Vec)
				self.mreview_sentence_list.append(extracted_sentence.lower().split())
Example #15
    def review_to_wordlist( review, remove_stopwords=False , generate_bigrams=False):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z\'\"]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()

        if generate_bigrams:
            bigrams = []
            for gram in KaggleWord2VecUtility.generate_ngrams(words, 2):
                bigrams.append('{0} {1}'.format(gram[0], gram[1]))
            words.extend(bigrams)
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]
        #
        # 5. Return a list of words
        return(words)
Example #16
def review_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words, stopwords are needed here
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    
    review_text = BeautifulSoup(review).get_text()
    #  
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    
    #3.5 remove more words
#    review_text = re.sub(filter_words, " ", review_text)
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    #
    # 5. Return a list of words
    return(words)
Example #17
def text_to_wordlist( review, remove_stopwords=False ):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #
    # 1. Remove HTML
    text = BeautifulSoup(review,'html.parser').get_text()
    #
    # 2. Remove non-letters (keep digits and basic punctuation)
    text = re.sub(r"[^A-Za-z0-9^,?!.\/'+\-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\?", " ? ", text)
    #
    # 3. Convert words to lower case and split them
    words = text.lower().split()
    #
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    # 5. Return a list
    return(words)
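For example (imports assumed: re, BeautifulSoup from bs4, stopwords from nltk.corpus):

print(text_to_wordlist("I can't believe it's great!"))
# -> ['i', 'cannot', 'believe', 'it', 'great', '!']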
Example #18
def retrieve_from_url(url):
    """
    Retrieves text from url, removes all string formatting \n and \t
    """
    soup = BeautifulSoup(requests.get(url).text)
    soup = soup.text.replace("\n", " ").replace("\t", " ")
    return soup.lower()
Example #19
def sentimentToWordlist(rawReview, removeStopwords=False, removeNumbers=False, removeSmileys=False):
    
    # use BeautifulSoup library to remove the HTML/XML tags (e.g., <br />)
    reviewText = BeautifulSoup(rawReview).get_text()

    # Emotional symbols may affect the meaning of the review
    smileys = """:-) :) :o) :] :3 :c) :> =] 8) =) :} :^)
                :D 8-D 8D x-D xD X-D XD =-D =D =-3 =3 B^D :( :/ :-( :'( :D :P""".split()
    smiley_pattern = "|".join(map(re.escape, smileys))

    # [^] matches a single character that is not contained within the brackets
    # re.sub() replaces the pattern by the desired character/string
    
    # Check to see how we need to perform cleanup
    if removeNumbers and removeSmileys:
        reviewText = re.sub("[^a-zA-Z]", " ", reviewText)
    elif removeSmileys:
        reviewText = re.sub("[^a-zA-Z0-9]", " ", reviewText)
    elif removeNumbers:
        reviewText = re.sub("[^a-zA-Z" + smiley_pattern + "]", " ", reviewText)
    else:
        reviewText = re.sub("[^a-zA-Z0-9" + smiley_pattern + "]", " ", reviewText)

    # split in to a list of words
    words = reviewText.lower().split()

    if removeStopwords:
        # create a set of all stop words
        stops = set(stopwords.words("english"))
        # remove stop words from the list
        words = [w for w in words if w not in stops]
               
    return words
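Smileys survive cleanup because their characters stay in the keep-set (strictly, the character class preserves individual smiley characters rather than whole smileys, which happens to work here):

print(sentimentToWordlist("Great movie :) !!", removeNumbers=True))
# -> ['great', 'movie', ':)']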
Example #20
    def review_to_wordlist( review, remove_stopwords=False ):
        # Function to convert a document to a sequence of words,
        # optionally removing stop words.  Returns a list of words.
        #
        # 1. Remove HTML
        review_text = BeautifulSoup(review).get_text()
        #
        # 2. Remove non-letters
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        #
        # 3. Convert words to lower case and split them
        words = review_text.lower().split()
        #
        # 4. Optionally remove stop words (false by default)
        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]

        # * Keep only more common appeared words, seem to make result worse
        # common_words = nltk.FreqDist(words).most_common(50)
        # words = [ w[0] for w in common_words ]

        # * Morphological processing, no clear improvement
        # words = filter(lambda w: w != None, [ wn.morphy(w) for w in words ])

        # 5. Return a list of words
        return(words)
Example #21
def processing(raw_review):
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).get_text() 
    # 2. Convert all to lower Case
    review_text=review_text.lower()    
    # 3. Remove Punctuations        
    letters_only = remove_punctuations(review_text)
    return letters_only
Example #22
def review_to_wordlist(review, remove_stopwords=False):
	review_text = BeautifulSoup(review).get_text()
	review_text = re.sub("^[a-zA-Z]", " ", review_text)
	words = review_text.lower().split()
	if remove_stopwords:
		stops = set(stopwords.words("english"))
		words = [w for w in words if not w in stops]
	return(words)
Example #23
 def sentenceToWordList(self, review, remove_stopwords=False):
     review_text = BeautifulSoup(review).getText()
     review_text = re.sub("[^a-zA-Z]", " ", review_text)
     words = review_text.lower().split()
     if remove_stopwords:
         stops = set(stopwords.words("english"))
         words = [w for w in words if not w in stops]
     return(words)
Example #24
 def review_to_words(review, stopwords):
     #Function to convert a review into a sequence of words
     review_text = BeautifulSoup(review).get_text()
     review_text = re.sub("[^a-zA-Z]",
                          ' ',
                          review_text)
     words = review_text.lower().split()
     words = [w for w in words if w not in stopwords]
     return " ".join(words)
Example #25
def review_to_words( raw_review ):
    review_words_only = BeautifulSoup(raw_review)
    review_words_only = re.sub("[^0-9a-zA-Z]", " ", review_words_only.get_text())
    review_words_only = review_words_only.lower()
    # words_in_lower_case = only_letters_lower.split()
    
    # words_without_stopwords = [w for w in words_in_lower_case if not w in stopwords.words("english")]
    # return " ".join( words_without_stopwords )
    return review_words_only
Example #26
def preprocess_post(text):
    """Preprocessor for MSE questions, to be applied before TFIDF.

       Strips out HTML, converts everything to lowercase, and removes digits.
    """
    result = BeautifulSoup(text).get_text()
    result = result.lower()
    result = ''.join(c for c in result if not c.isdigit())
    return result
Example #27
def get_restaurants(url):
    try:
        urls = get_page_urls(url)
        for url in urls:
            data = get_text_from_url(url)
            search_div = BeautifulSoup(str(data)).find('div', class_='search-results-content')
            uls = BeautifulSoup(str(search_div)).findAll('ul', class_='ylist ylist-bordered search-results')
            for restaurant in BeautifulSoup(str(uls[1])).findAll('li', class_='regular-search-result'):
                main_attrs = BeautifulSoup(str(restaurant)).find('div', class_='main-attributes')
    
                rating = BeautifulSoup(str(main_attrs)).find('div', class_='rating-large')
                rating_data = str(BeautifulSoup(str(rating)).find('i').attrs['title'])
                rating_data = rating_data.replace('star rating', '')
                
                review_count = str(BeautifulSoup(str(main_attrs)).find('span', class_='review-count rating-qualifier').text.strip())
                review_count = review_count.replace(' reviews', '')
    
                sub_url = BeautifulSoup(str(main_attrs)).find('a').attrs['href']
                url = 'http://www.yelp.com' + sub_url
    
                category_data = BeautifulSoup(str(main_attrs)).find('div', class_='price-category')
                category_str_list = BeautifulSoup(str(category_data)).findAll('span', class_='category-str-list')
                categories = ''
                for a in BeautifulSoup(str(category_str_list)).findAll('a'):
                    categories = categories +  a.text.strip() + ','
    
                expensive_level = BeautifulSoup(str(category_data)).find('span', 'business-attribute price-range').text
                
    
                h3 = BeautifulSoup(str(restaurant)).find('h3', class_='search-result-title')
                h3_a = BeautifulSoup(str(h3)).find('a').text
                name = h3_a.strip()
    
                sec_attrs = BeautifulSoup(str(restaurant)).find('div', class_='secondary-attributes')
                address = BeautifulSoup(str(sec_attrs)).find('address')
                if '<br/>' in str(address):
                    address = str(address).replace('<br/>', ' ')
                address = BeautifulSoup(str(address)).find('address').text.strip()
    
                city = get_city_from_address(address)
    
                if not str(city).lower() in address.lower():
                    print('Invalid city detected')
                RestaurantModel.objects.create(
                    name=name,
                    expensivelevel=expensive_level,
                    city=city,
                    current_rating=float(rating_data),
                    url=url,
                    category=categories,
                    address=address,
                    reviewcount=review_count
                        )
        set_db_status(False)
    except Exception as e:
        print(str(e) + ' get_restaurants')
        set_db_status(False)
Example #28
def review_to_wordlist(review,remove_stopwords=False):
    review_text = BeautifulSoup(review).get_text() 
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    review_text = re.sub(r'(.)\1+', r'\1\1', review_text)  # collapse letter runs, e.g. "coool" -> "cool"
    words = review_text.lower().split()
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return(words)
Example #29
def preproc(review, use_stopwords=False):
    review_text = BeautifulSoup(review, "lxml").get_text()
    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    if use_stopwords:
        stops = set(nltk.corpus.stopwords.words("english"))
        words = [w for w in review_text.lower().split() if w not in stops]
        return " ".join(words)

    return review_text.lower()
Example #30
def clean_text(text):
    # Remove HTML
    review_text = BeautifulSoup(text, 'lxml').get_text()
    # Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #Convert words to lower case and split them
    words = review_text.lower().split()
    #Remove stopwords
    stops = set(stopwords.words('english'))
    words = [w for w in words if not w in stops]
    return words
Example #31
def tweet_cleaning_for_sentiment_analysis(tweet):

    #Escaping HTML characters
    tweet = BeautifulSoup(tweet).get_text()

    #Special case not handled previously.
    tweet = tweet.replace('\x92', "'")
    tweet = tweet.replace('"', "'")
    tweet = tweet.replace("…", ".")
    tweet = tweet.replace("\\\'", "'")
    tweet = tweet.replace("#", "")
    tweet = tweet.replace("—", "")

    #Removal of hastags/account
    tweet = ' '.join(
        re.sub("(@[A-Za-z0-9]+)|(#[A-Za-z0-9]+)", " ", tweet).split())

    #Removal of address
    tweet = ' '.join(re.sub("(\w+:\/\/\S+)", " ", tweet).split())

    #Removal of Punctuation

    tweet = ' '.join(re.sub("[\[\]\'\\\.\,\!\?\:\;\-\=]", " ", tweet).split())

    #Lower case
    tweet = tweet.lower()

    #CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    CONTRACTIONS = load_dict_contractions()
    tweet = tweet.replace("’", "'")
    tweet = tweet.replace("   ", " ")
    tweet = tweet.replace("  ", " ")
    words = tweet.split()
    reformed = [
        CONTRACTIONS[word] if word in CONTRACTIONS else word for word in words
    ]
    tweet = " ".join(reformed)

    # Standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))

    #Deal with emoticons source: https://en.wikipedia.org/wiki/List_of_emoticons
    SMILEY = load_dict_smileys()
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)

    #Deal with emojis
    tweet = emoji.demojize(tweet)

    tweet = tweet.replace(":", " ")
    tweet = ' '.join(tweet.split())

    return tweet
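`load_dict_contractions` and `load_dict_smileys` are project helpers that are not shown; minimal stubs like these make the pipeline runnable for a smoke test (the dictionary contents are placeholders, far smaller than the real ones):

import re
import itertools
import emoji
from bs4 import BeautifulSoup

def load_dict_contractions():
    # placeholder stub; the project dictionary is far larger
    return {"can't": "cannot", "it's": "it is"}

def load_dict_smileys():
    # placeholder stub
    return {":)": "smiling", ":(": "sad"}

print(tweet_cleaning_for_sentiment_analysis("It's here!! :) https://t.co/x #hype"))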
Example #32
def cleanReview(review):
    #1.Remove HTML
    review_text = BeautifulSoup(review).get_text()
    #2.Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #3.Convert words to lower case
    review_text = review_text.lower()
    #4.remove stop words from the cleaned text
    review_words = word_tokenize(review_text)
    stops = set(stopwords.words("english"))
    words = [w for w in review_words if w not in stops]
    return ' '.join(words)
Example #33
def install(name):
    """
    Usage:
        dash.py install <name>...

    Options:
        -h --help             Show this screen and exit.
    """
    if isinstance(name, list):
        return [install(n) for n in name]

    content = ""
    name = name.lower()
    if os.path.exists(name):
        content = open(name, "r").read()
    else:
        if '//' in name:
            url = name
        else:
            url = "https://raw.github.com/whtsky/Dash.py/" \
                  "master/dash_py/packages/%s.yaml" % name
        if resource_exist(url):
            r = requests.get(url)
            content = r.content

    if content:
        package = yaml.load(content)
        install_package(package)
        return

    # Try to download document from rtfd
    r = requests.get("https://readthedocs.org/projects/%s/downloads/" % name)
    if r.status_code != 200:
        logger.error("Can't find package %s" % name)
        return

    name = BeautifulSoup(r.content).title.string.split("|")[0].strip()

    for branch in ['stable', 'master', 'latest']:
        if branch not in r.content:
            continue
        docset_url = "https://media.readthedocs.org/dash/" \
                     "{0}/{1}/{2}.tgz".format(name.lower(), branch, name)
        if resource_exist(docset_url):
            install_package({
                "name": name,
                "type": "docset",
                "url": docset_url,
                "format": "tar"
            })
            return

    logger.error("Can't find package %s" % name)
    return -1
Example #34
def headline_to_words(headline):
    
    stemmer = WordNetLemmatizer()
    
    text = BeautifulSoup(headline, "html.parser").get_text() # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower()) # Convert to lower case
    words = text.split() # Split string into words
    stops = set(stopwords.words("english"))
    words = [w for w in words if w not in stops] # Remove stopwords
    words = [stemmer.lemmatize(w) for w in words] # lemmatize
    
    return words
Example #35
def review_to_words(review):
    nltk.download("stopwords", quiet=True)

    text = BeautifulSoup(review, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Convert to lower case
    words = text.split()  # Split string into words
    words = [w for w in words
             if w not in stopwords.words("english")]  # Remove stopwords
    words = [PorterStemmer().stem(w) for w in words]  # stem

    return words
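Example (imports assumed: nltk, re, BeautifulSoup from bs4, stopwords from nltk.corpus, PorterStemmer from nltk.stem):

print(review_to_words("<p>The movies were amazing!</p>"))
# -> ['movi', 'amaz']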
Example #36
    def review_to_wordlist(review, remove_stopwords=False):
        review_text = BeautifulSoup(review).get_text()
        review_text = re.sub("[^a-zA-Z]", " ", review_text)

        words = review_text.lower().split()

        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if w not in stops]

        return (words)
Example #37
def clean_sentences(df):
    reviews = []

    for sent in tqdm(df['Phrase']):
        review_text = BeautifulSoup(sent).get_text()
        review_text = re.sub("[^a-zA-Z]"," ", review_text)
        words = word_tokenize(review_text.lower())
        lemma_words = [lemmatizer.lemmatize(i) for i in words]
        reviews.append(lemma_words)

    return(reviews)
Example #38
def clean_review(raw):
    # remove HTML
    review_text = BeautifulSoup(raw).get_text()
    # remove no-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    words = review_text.lower().split()
    # remove stopwords
    stops = stopwords.words("english")
    meaningful_words = [w for w in words if w not in stops]

    return " ".join(meaningful_words)
Example #39
 def clean_text(text):
     text = BeautifulSoup(text, "lxml").text  # HTML decoding
     text = text.lower()  # lowercase text
     text = REPLACE_BY_SPACE_RE.sub(
         ' ', text)  # replace REPLACE_BY_SPACE_RE symbols by space in text
     text = BAD_SYMBOLS_RE.sub(
         ' ', text)  # delete symbols which are in BAD_SYMBOLS_RE from text
     text = ' '.join(word for word in text.split()
                     if word not in STOPWORDS)  # delete stopwors from text
     return text
Example #40
 def clean_text(self, text):
     """
         text: a string  
         return: modified initial string
     """
     text = BeautifulSoup(text, "lxml").text                                   # HTML decoding
     text = text.lower()                                                       # lowercase text
     text = REPLACE_BY_SPACE_RE.sub(' ', text)                                 # replace REPLACE_BY_SPACE_RE symbols by space in text
     text = BAD_SYMBOLS_RE.sub('', text)                                       # delete symbols which are in BAD_SYMBOLS_RE from text
     text = ' '.join(word for word in text.split() if word not in STOPWORDS)   # delete stopwors from text
     return text
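`REPLACE_BY_SPACE_RE`, `BAD_SYMBOLS_RE` and `STOPWORDS` are module-level constants in both `clean_text` variants above; Example #58 further down defines them as (imports assumed: re, stopwords from nltk.corpus):

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))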
Example #41
def clean_data(text):
    cleaned = []
    for t in text:
        t = (re.sub(r'@[A-Za-z0-9]+', '', t))  # remove @mentions
        t = re.sub('https?://[A-Za-z0-9./]+', '', t)  # remove links
        t = re.sub("[^a-zA-Z]", " ", t)  # Remove numbers and punctuations
        t = BeautifulSoup(t, 'lxml')  # remove html encoded text
        t = t.text.replace("RT", "")
        t = t.lower()
        cleaned.append(t)
    return cleaned
Example #42
def review_to_wordlist(review, remove_stopwords=False):
    # 1. Remove HTML
    review_text = BeautifulSoup(review, features="html.parser").get_text()
    # 2. Remove non-letters
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
    return words
Example #43
def review_to_wordlist(review):
    '''
    Convert an IMDB review into a sequence of words
    '''
    # Strip the HTML tags and keep the text content
    review_text = BeautifulSoup(review).get_text()
    # Keep only letters, using a regular expression
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # Lowercase all words and split into a list
    words = review_text.lower().split()
    # Return the word list
    return words
Example #44
def clean_sentences(sentence):
    # Remove HTML tags
    review = BeautifulSoup(sentence, features="html5lib").get_text()
    # Remove punctuation
    review = re.sub("[^a-zA-Z]", " ", review)
    # Lowercase all letters
    review = review.lower()
    # Split the sentence into a list of words
    words_lists = review.split()

    # return like ['word0', 'word1', ...]
    return words_lists
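Quick check (imports assumed: re and BeautifulSoup from bs4, with html5lib installed):

print(clean_sentences("An <br/> example, with 3 tags!"))
# -> ['an', 'example', 'with', 'tags']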
Example #45
def clean_text(text):
    """
        text: a string
        return: cleaned initial string
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = re.compile('[/(){}\[\]\|@,;]').sub(
        ' ', text)  # replace matched symbols by space in text
    text = re.compile('[^0-9a-z #+_]').sub(
        '', text)  # delete symbols which are in symbols_re from text
    text = text.lower()
    return text
Example #46
def simplified_answer(answer):
    # nltk.download("stopwords", quiet=True)

    text = BeautifulSoup(answer, "html.parser").get_text()  # Remove HTML tags
    # text = re.sub(r"[^a-zA-Z0-9\-]", " ", text.lower())  # Convert to lower case
    # Removed "-" to better leverage the pretrained vocabulary
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())  # Convert to lower case

    words = text.split()  # Split string into words
    # words = [w for w in words if w not in stopwords.words("english")]  # Remove stopwords
    # words = [PorterStemmer().stem(w) for w in words]  # stem
    return ' '.join(words)
Example #47
def clean_text(text):
    # Strip HTML tags
    text = BeautifulSoup(text, 'html.parser').get_text()

    # Remove punctuation
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Lowercase and split
    words = text.lower().split()
    # Stopword removal (loads a local stopwords.txt, one word per line)
    stopwords = {}.fromkeys([line.rstrip() for line in open('stopwords.txt')])
    # words = [word for word in words if word not in stopwords]
    return ' '.join(words)
Example #48
def email_list_action():
    try:
        banner()
        loading_4()
    except KeyboardInterrupt:
        print()
        error_text("Detected Ctrl+C. Shutting down...")
        exit(1)
    email_code()
    info_text("Installing awk if not installed...")
    os.system("apt-get -qq install -y awk 2> /dev/null")
    HASH_FILE = str(sys.argv[2])
    info_text("Changing email:hash to hash...")
    os.system("awk -F: \'{print $2}\' %s > only_hahes.txt" % HASH_FILE)
    HASH_FILE = "only_hahes.txt"
    os.system("echo " " > results.txt")
    info_text("Checking the hashes. This might take a while...")
    with open(HASH_FILE, "r") as reader:
        while True:
            line = reader.readline()
            if not line:
                break
            HASHED = line.strip()
            URL = "https://md5decrypt.net/en/Api/api.php?hash=%s&hash_type=md5&email=%s&code=%s" % (
                HASHED, USER_EMAIL, API_CODE)
            try:
                PAGE = requests.get(
                    URL, headers=HEADERS
                )  # Uses requests lib to get the content of the page
                PAGE_CONTENT = BeautifulSoup(PAGE.content,
                                             "html.parser").get_text()
            except Exception as e:
                error_text(
                    "Exception happened while connecting to md5decrypt: %s" % e)
                continue
            if PAGE_CONTENT.strip() != "":
                if "error" not in PAGE_CONTENT.lower():
                    with open("results.txt",
                              "a") as add_text:  # this is better
                        add_text.write("{}:{}".format(HASHED, PAGE_CONTENT))
                elif "ERROR CODE : 002" in PAGE_CONTENT:
                    os.system("rm results.txt")
                    print()
                    print(" %s%s[!] Error. Wrong email / code.%s" %
                          (Style.RESET_ALL, Fore.RED, Style.RESET_ALL))
                    print()
                    exit(1)
    os.system("echo " " >> results.txt")
    success_text("All done! Results sent to results.txt [hash:text]")
    warning_text(
        "Make sure to move results.txt if you are going to run the script again. It will delete the actual one!"
    )
    print()
    exit(1)
Example #49
def tweet_cleaning_for_sentiment_analysis(tweet):
    translator = Translator()

    #Escaping HTML characters

    tweet = BeautifulSoup(tweet).get_text()

    #Deal with emoticons
    words = tweet.split()
    reformed = [SMILEY[word] if word in SMILEY else word for word in words]
    tweet = " ".join(reformed)

    #Special case not handled previously.
    tweet = tweet.replace('\x92', "")
    tweet = tweet.replace('\x85', "")

    #Removal of hastags/account
    tweet = ' '.join(re.sub("(@[A-Za-z0-9]+)", " user ",
                            tweet).split())  #verificare posizione nel testo
    tweet = ' '.join(re.sub("#", "", tweet).split())

    #|(#[A-Za-z0-9]+)
    #Removal of address
    tweet = ' '.join(
        re.sub("(\w+:\/\/\S+)", " url ",
               tweet).split())  # consider dropping the url when it ends the sentence

    #Removal of Punctuation
    tweet = ' '.join(re.sub(r"[\.\,\!\?\:\;\-\=]", " ",
                            tweet).split())  # TODO: also remove << or ""

    #CONTRACTIONS source: https://en.wikipedia.org/wiki/Contraction_%28grammar%29
    tweet = tweet.replace("’", " ")

    # Standardizing words
    tweet = ''.join(''.join(s)[:2] for _, s in itertools.groupby(tweet))

    number = emoji.emoji_count(tweet)

    #Deal with emojis

    tweet = emoji.demojize(tweet, use_aliases=False, delimiters=("", ""))
    tweet = tweet.replace("_", " ")

    tweet = tweet.replace(":", " ")
    if number != 0:
        tweet = translator.translate(tweet, src='en', dest='it').text

    tweet = ' '.join(tweet.split())

    #Lower case
    tweet = tweet.lower()
    return tweet
Example #50
def cleanText(text):
    text = BeautifulSoup(text, "lxml").text
    text = re.sub(r'\|\|\|', r' ', text)
    text = re.sub(r'http\S+', r'<URL>', text)
    text = text.lower()
    text = text.replace('x', '')

    clean_text = []
    for w in word_tokenize(text):
        if w.lower() not in stop:
            clean_text.append(w)
    return clean_text
Example #51
def preprocess_text(text):
    # Removing html tags
    processed_text = BeautifulSoup(text, features="html.parser").get_text()
    # Remove capitalization
    processed_text = processed_text.lower()
    # Remove punctuations and numbers
    processed_text = re.sub(r"[^a-z'\s]", "", processed_text)
    # Remove quotations
    processed_text = re.sub(r"([^a-z])\'|\'([^a-z])", r"\1\2", processed_text)
    # Remove excessive whitespace
    processed_text = re.sub(r"\s+", r" ", processed_text)
    return processed_text
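The quotation pass strips quoting apostrophes while preserving contractions:

print(preprocess_text("It's a 'quoted' WORD."))
# -> "it's a quoted word"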
Example #52
def reviewToWordlist(review):
    #First remove the HTML.
    reviewText = BeautifulSoup(review, features="html.parser").get_text()

    #Use regular expressions to only include words.
    reviewText = re.sub("[^a-zA-Z]", " ", reviewText)

    #Convert words to lower case and split them into separate words.
    words = reviewText.lower().split()

    #Return a list of words
    return words
Example #53
    def tweet_to_wordlist(tweet, remove_stopwords=False):

        tweet_text = BeautifulSoup(tweet).get_text()
        tweet_text = re.sub("[^a-zA-Z]", " ", tweet_text)

        words = tweet_text.lower().split()

        if remove_stopwords:
            stops = set(stopwords.words("english"))
            words = [w for w in words if not w in stops]

        return (words)
Example #54
def review_to_words(review):
    nltk.download('stopwords', quiet=True)
    stemmer = PorterStemmer()

    text = BeautifulSoup(review, 'html.parser').get_text()  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text.lower())  # Convert to lower case
    words = text.split()  # Split string into words
    stops = set(stopwords.words('english'))
    words = [w for w in words if w not in stops]  # Remove stopwords
    words = [stemmer.stem(w) for w in words]  # stem

    return words
Example #55
def review_wordlist(review, remove_stopwords=False):
    # 1. Removing html tags
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # 2. Removing non-letter.
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # 3. Converting to lower case and splitting
    words = review_text.lower().split()
    # 4. Optionally remove stopwords
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    return words
Example #56
def singleBook( location ):
    myDir = location
    book = epub.read_epub(myDir)
    items = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    outputtext = ""
    for item in items:
        cleantext = BeautifulSoup(item.get_content(), "lxml").text
        cleantext = re.sub(r'[^\w\s]', '', cleantext)
        cleantext = cleantext.lower()
        outputtext += cleantext
    print(outputtext)
    return outputtext
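Imports this function relies on (ebooklib and bs4 with the lxml parser), plus a sketch of a call; the path is illustrative:

import re
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub

# text = singleBook("path/to/book.epub")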
Example #57
def review_to_wordlist(review):
 
    # Strip tags such as <br /><br />
    review_text = BeautifulSoup(review, "html.parser").get_text()
    # Remove punctuation
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    # Tokenize into lowercase words
    words = review_text.lower().split()
    # Remove stopwords (disabled)
    #words = [w for w in words if not w in stopwords.words("chinese")]
    # Return the word list
    return words
Example #58
def clean_text(text):

    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    STOPWORDS = set(stopwords.words('english'))

    text = BeautifulSoup(text, "lxml").text
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text
Example #59
 def doc_preprocessing(self, doc):
     # Removes HTML tags
     doc = BeautifulSoup(doc, features="lxml").get_text()
     # Lowercase
     doc = doc.lower()
     # Remove accentuation
     doc = unicodedata.normalize('NFKD', doc).encode(
         'ASCII', 'ignore').decode('ASCII')
     # Remove punctuation
     doc = doc.translate(
         str.maketrans('', '', self.strip_punctuation))
     return doc
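`self.strip_punctuation` is an attribute of the enclosing class; a plausible host, assuming it holds the standard ASCII punctuation set (the `DocCleaner` class is hypothetical):

import string
import unicodedata
from bs4 import BeautifulSoup

class DocCleaner:
    def __init__(self):
        # assumption: standard punctuation; the original class may differ
        self.strip_punctuation = string.punctuation

DocCleaner.doc_preprocessing = doc_preprocessing  # attach the method above
print(DocCleaner().doc_preprocessing("<p>Café, déjà vu!</p>"))
# -> "cafe deja vu"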
Example #60
    def review_to_wordlist(review, remove_stopwords=False):
        review_text = BeautifulSoup(review, 'html.parser').get_text()
        review_text = re.sub('[^a-zA-Z]', ' ', review_text)
        words = review_text.lower().split()

        if remove_stopwords:
            stops = set(stopwords.words('english'))
            words = [w for w in words if not w in stops]

        stemmer = SnowballStemmer('english')
        words = [stemmer.stem(w) for w in words]
        return words
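Usage, assuming the function is reachable at module level (imports: re, BeautifulSoup from bs4, stopwords from nltk.corpus, SnowballStemmer from nltk.stem):

print(review_to_wordlist("Loving these amazing movies!", remove_stopwords=True))
# -> ['love', 'amaz', 'movi']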