Python removeSpecialCharacter Examples, Utilities.messageCleaner.removeSpecialCharacter Python Examples

Example #1

0

Show file

File: additionalDataGatherer.py Project: robienoor/NLTKForumScraper

def getListOfFromCSV(fileName):
    """Return the lines of fileName, each run through the message cleaner.

    The file is opened as UTF-8 with undecodable bytes ignored and read in
    full with ``readlines``. Each line is then passed, unmodified, to
    ``messageCleaner.removeSpecialCharacter`` and the cleaned values are
    returned as a list in file order. An empty file yields an empty list.
    """
    with open(fileName, encoding='utf-8', errors='ignore') as handle:
        rawLines = handle.readlines()

    # Clean after the file has been closed, exactly one cleaner call per line.
    return [messageCleaner.removeSpecialCharacter(rawLine) for rawLine in rawLines]

Example #2

0

Show file

File: postsGatherer.py Project: robienoor/NLTKForumScraper

def __gatherWebMD(forumDetails):
    """Scrape one WebMD review page and wrap its posts in a ``forum`` object.

    Args:
        forumDetails: indexable pair; element 0 is the page URL to fetch and
            element 1 is the treatment value handed through to ``forum``.

    Returns:
        The object built by ``forum(forumName, url, maxRating, posts, treatment)``
        where ``posts`` holds one ``post`` per scraped entry that had both a
        non-empty review text and at least one number in its rating text.
    """
    url = forumDetails[0]
    treatment = forumDetails[1]

    # WebMD is problematic in that it provides three types of ratings. To simplify the problem we will only be pulling the satisfaction rating, as this
    # seems to be the most general one
    forumName = 'Webmd.com'
    results = requests.get(url)
    soup = BeautifulSoup(results.content, 'html.parser')

    rawPosts = soup.find_all(name="div", attrs={'class': 'userPost'})
    posts = []
    maxRating = 5

    # The id pattern does not change between posts, so compile it once.
    fullCommentId = re.compile('comFull.*')

    for rawPost in rawPosts:

        try:
            review = rawPost.find(name="p",
                                  attrs={
                                      'id': fullCommentId
                                  }).get_text()
            rating = rawPost.find(name="div",
                                  attrs={
                                      'class': 'catRatings lastEl clearfix'
                                  }).get_text()

        except AttributeError:
            # find() returns None when the element is absent, so the chained
            # .get_text() raises AttributeError. Catching only that keeps
            # KeyboardInterrupt/SystemExit and unrelated bugs from being
            # silently swallowed, as the previous bare ``except:`` did.
            print(
                " Error[__gatherWebMD(url)]= Threw an error whilst looking for a review or rating"
            )
            continue

        # Removing special characters here
        review = messageCleaner.removeSpecialCharacter(review)
        review = review.replace("Comment:", "")
        review = review.replace("Hide Full Comment", "")

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        rating = re.findall(r'\d+', rating)
        if (review and rating):
            # As mentioned, WebMD provides three ratings; only the first
            # number found in the rating text is used.
            forumPost = post(review, __scaleRatings(rating[0], maxRating), url)
            posts.append(forumPost)

    webMD = forum(forumName, url, maxRating, posts, treatment)
    return webMD

Example #3

0

Show file

File: postsGatherer.py Project: robienoor/NLTKForumScraper

def __gatherDrugsCom(forumDetails):
    """Scrape one Drugs.com review page and wrap its posts in a ``forum`` object.

    Args:
        forumDetails: indexable pair; element 0 is the page URL to fetch and
            element 1 is the treatment value handed through to ``forum``.

    Returns:
        The object built by ``forum(forumName, url, maxRating, posts, treatment)``
        where ``posts`` holds one ``post`` per scraped entry that had both a
        non-empty review text and a non-empty rating text.
    """
    url = forumDetails[0]
    treatment = forumDetails[1]

    forumName = 'Drugs.com'
    results = requests.get(url)
    soup = BeautifulSoup(results.content, 'html.parser')

    rawPosts = soup.find_all(name="div", attrs={'class': 'boxList'})
    posts = []
    maxRating = 10

    # Extract the reviews and the ratings. If a post does not provide a review and a rating we will
    # dismiss it because it will not be useful during the learning phase

    for rawPost in rawPosts:

        try:
            review = rawPost.find(name="div", attrs={'class': 'user-comment'})
            review = review.find(name="span").get_text()

            rating = rawPost.find(name="div", attrs={
                'class': 'rating-score'
            }).get_text()

        except AttributeError:
            # find() returns None when an element is absent, so the follow-up
            # attribute access raises AttributeError. Catching only that keeps
            # KeyboardInterrupt/SystemExit and unrelated bugs from being
            # silently swallowed, as the previous bare ``except:`` did.
            logger.error(
                " Error[__gatherDrugsCom(url)] = Threw an error whilst looking for a review or rating"
            )
            continue

        # Removing special characters here
        review = messageCleaner.removeSpecialCharacter(review)

        # The rating is passed on as the raw element text.
        if (review and rating):
            forumPost = post(review, __scaleRatings(rating, maxRating), url)
            posts.append(forumPost)

    drugsComForum = forum(forumName, url, maxRating, posts, treatment)
    return drugsComForum

Example #4

0

Show file

File: postsGatherer.py Project: robienoor/NLTKForumScraper

def __gatherWebMD(forumDetails):
    """Fetch a WebMD review page and collect its usable posts into a ``forum``.

    Args:
        forumDetails: indexable pair; ``forumDetails[0]`` is the URL that is
            requested and ``forumDetails[1]`` is the treatment value passed
            straight through to ``forum``.

    Returns:
        ``forum(forumName, url, maxRating, posts, treatment)``; ``posts``
        contains a ``post`` for every ``userPost`` div whose review text is
        non-empty and whose rating text contains at least one number.
    """
    url = forumDetails[0]
    treatment = forumDetails[1]

    # WebMD is problematic in that it provides three types of ratings. To simplify the problem we will only be pulling the satisfaction rating, as this
    # seems to be the most general one
    forumName = 'Webmd.com'
    results = requests.get(url)
    soup = BeautifulSoup(results.content, 'html.parser')

    rawPosts = soup.find_all(name="div", attrs = {'class' : 'userPost'})
    posts = []
    maxRating = 5

    # Loop-invariant pattern for the full-comment paragraph id; build it once.
    commentIdPattern = re.compile('comFull.*')

    for rawPost in rawPosts:

        try:
            review = rawPost.find(name = "p" , attrs = {'id' : commentIdPattern}).get_text()
            rating = rawPost.find(name = "div" , attrs = {'class' : 'catRatings lastEl clearfix'}).get_text()

        except AttributeError:
            # A missing element makes find() return None, and None has no
            # get_text(). Only that case is skipped; a bare ``except:`` would
            # also hide KeyboardInterrupt/SystemExit and genuine bugs.
            print(" Error[__gatherWebMD(url)]= Threw an error whilst looking for a review or rating")
            continue

        # Removing special characters here
        review = messageCleaner.removeSpecialCharacter(review)
        review = review.replace("Comment:", "")
        review = review.replace("Hide Full Comment", "")

        # Raw string avoids the invalid '\d' escape in a plain literal.
        rating = re.findall(r'\d+', rating)
        if(review and rating):
            # As mentioned, WebMD provides three ratings; only the first
            # number found in the rating text is used.
            forumPost = post(review, __scaleRatings(rating[0], maxRating), url)
            posts.append(forumPost)

    webMD = forum(forumName, url, maxRating, posts, treatment)
    return webMD

Example #5

0

Show file

File: postsGatherer.py Project: robienoor/NLTKForumScraper

def __gatherDrugsCom(forumDetails):
    """Fetch a Drugs.com review page and collect its usable posts into a ``forum``.

    Args:
        forumDetails: indexable pair; ``forumDetails[0]`` is the URL that is
            requested and ``forumDetails[1]`` is the treatment value passed
            straight through to ``forum``.

    Returns:
        ``forum(forumName, url, maxRating, posts, treatment)``; ``posts``
        contains a ``post`` for every ``boxList`` div that yielded both a
        non-empty review text and a non-empty rating text.
    """
    url = forumDetails[0]
    treatment = forumDetails[1]

    forumName = 'Drugs.com'
    results = requests.get(url)
    soup = BeautifulSoup(results.content, 'html.parser')

    rawPosts = soup.find_all(name = "div",  attrs = {'class' : 'boxList'})
    posts = []
    maxRating = 10

    # Extract the reviews and the ratings. If a post does not provide a review and a rating we will
    # dismiss it because it will not be useful during the learning phase

    for rawPost in rawPosts:

        try:
            review = rawPost.find(name = "div",  attrs = {'class' : 'user-comment'})
            review = review.find(name = "span").get_text()

            rating = rawPost.find(name = "div",  attrs = {'class' : 'rating-score'}).get_text()

        except AttributeError:
            # A missing element makes find() return None, so the next
            # attribute access fails. Only that case is skipped; a bare
            # ``except:`` would also hide KeyboardInterrupt/SystemExit and
            # genuine bugs.
            logger.error(" Error[__gatherDrugsCom(url)] = Threw an error whilst looking for a review or rating")
            continue

        # Removing special characters here
        review = messageCleaner.removeSpecialCharacter(review)

        # The rating is forwarded as the raw element text.
        if(review and rating):
            forumPost = post(review, __scaleRatings(rating, maxRating), url)
            posts.append(forumPost)

    drugsComForum = forum(forumName , url, maxRating, posts, treatment)
    return drugsComForum