Example #1
def main():
    """ Main function.
    """

    input_url = "https://www.webmd.com/drugs/drugreview-1701-citalopram-oral.aspx?drugid=1701&drugname=citalopram-oral"
    scraper = WebMDScraper("citalopram_train.csv")
    scraper.scrape(input_url, 10)
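In this example the constructor appears to take an output CSV path and scrape a page count; neither is guaranteed across versions of the library. If the reviews instead end up on the scraper object (as in the later examples), here is a minimal sketch for persisting them yourself, assuming scraper.reviews is a list of dicts and using a hypothetical save_reviews_csv helper:

import csv

def save_reviews_csv(reviews, path):
    """Write a list of review dicts to a CSV file (hypothetical helper)."""
    if not reviews:
        return
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(reviews[0].keys()))
        writer.writeheader()
        writer.writerows(reviews)

# e.g. save_reviews_csv(scraper.reviews, 'citalopram_train.csv')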
Example #2
def test_scrape_no_reviews():
    """
    Tests that the scrape function works for a page with no reviews
    """
    scraper = WebMDScraper()
    scraper.scrape(
        'https://www.webmd.com/drugs/drugreview-174349-8HR-Muscle-Aches-'
        'Pain-oral.aspx?drugid=174349&drugname=8HR-Muscle-Aches-Pain-oral')
    assert len(scraper.reviews) == 0
Example #3
def test_webmd_scrape():
    """Test webmd scrape"""
    input_url = 'https://www.webmd.com/drugs/drugreview-151652-banzel.aspx?drugid=151652&drugname=banzel'
    webmd_scraper = WebMDScraper()
    webmd_scraper.scrape(input_url)
    assert len(webmd_scraper.review_list) > 5

    keys = list(webmd_scraper.review_list[-1].keys())
    assert 'comment' in keys
    assert 'effectiveness' in keys
    assert 'ease of use' in keys
    assert 'satisfaction' in keys
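The assertions above imply that, in this version of the scraper, each review_list entry keeps the three rating categories as top-level keys; an illustrative entry (values made up):

review = {
    'comment': 'free-text review body',
    'effectiveness': 4,  # star ratings, 1-5
    'ease of use': 5,
    'satisfaction': 3,
}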
Example #4
def test_scrape_with_parameters():
    """
    Tests that, when the scrape function is called on a scraper with
    non-default parameters, the correct types of data are stored in the
    'reviews' attribute
    """
    scraper = WebMDScraper(collect_urls=True, collect_user_ids=True)
    scraper.scrape(
        'https://www.webmd.com/drugs/drugreview-5659-'
        'methotrexate-sodium-injection.aspx?drugid=5659&drugname=methotrexate-sodium-injection'
    )
    assert len(scraper.reviews) > 5
    data_collected = list(scraper.reviews[0].keys())
    assert len(data_collected) == 6
    assert 'user id' in data_collected
    assert 'url' in data_collected
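Together with the tests below, these assertions suggest that the review dicts in this version nest the ratings and gain two extra keys when the optional flags are set; an illustrative entry (values drawn partly from the tests that follow):

review = {
    'comment': 'free-text review body',
    'rating': {'effectiveness': 5, 'ease of use': 5, 'satisfaction': 5},
    'date': '10/6/2010 10:10:35 PM',
    'drug': 'methotrexate-sodium-injection',
    'user id': 'A95, 13-18 Female  on Treatment for 1 to 6 months (Patient)',  # collect_user_ids=True
    'url': 'https://www.webmd.com/drugs/drugreview-5659-methotrexate-sodium-injection.aspx',  # collect_urls=True
}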
Example #5
    def collect(self, url, testing=False):
        """Scrapes drug reviews and saves them as dictionary property

        Args:
            url: WebMD URL where all the reviews are
        """
        if self.meta['locked']:
            print('Dataset locked. Please load a different dataset.')
            return

        self.meta['startTimestamp'] = time()
        self.meta['drugs'] = [self.drug_name]

        scraper = None
        if self.scraper == 'WebMD':
            scraper = WebMDScraper()
        elif self.scraper == 'EverydayHealth':
            scraper = EverydayHealthScraper()
        elif self.scraper == 'Drugs':
            scraper = DrugsScraper()
        elif self.scraper == 'DrugRatingz':
            scraper = DrugRatingzScraper()

        if testing:
            scraper = WebMDScraper(False, 1)

        self.reviews = scraper.scrape(url)
        self.meta['endTimestamp'] = time()
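collect reads self.meta, self.drug_name, self.scraper, and self.reviews, so it evidently lives on a dataset wrapper class; a minimal hypothetical host, inferred only from the attributes this method touches:

from time import time

class Dataset:  # hypothetical: the class name and defaults are assumptions
    def __init__(self, drug_name, scraper='WebMD'):
        self.drug_name = drug_name
        self.scraper = scraper  # 'WebMD', 'EverydayHealth', 'Drugs', or 'DrugRatingz'
        self.reviews = []
        self.meta = {'locked': False, 'drugs': [],
                     'startTimestamp': None, 'endTimestamp': None}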
Example #6
def test_scrape_empty_reviews():
    """
    Tests that if the scrape function is called on a scraper that has
    already collected data in 'reviews', those reviews are discarded
    """
    scraper = WebMDScraper()
    scraper.scrape(
        'https://www.webmd.com/drugs/drugreview-5659-'
        'methotrexate-sodium-injection.aspx?drugid=5659&drugname=methotrexate-sodium-injection'
    )
    num_reviews = len(scraper.reviews)
    scraper.scrape(
        'https://www.webmd.com/drugs/drugreview-5659-'
        'methotrexate-sodium-injection.aspx?drugid=5659&drugname=methotrexate-sodium-injection'
    )
    assert num_reviews == len(scraper.reviews)
Example #7
def test_scrape_correct_review_data():
    """
    Tests that when the scrape function is called, the last review in the
    scraped reviews list has the correct data (the data from the oldest
    review for this drug)
    """
    scraper = WebMDScraper(collect_user_ids=True, collect_urls=True)
    scraper.scrape(
        'https://www.webmd.com/drugs/drugreview-8953-A-G-Pro-oral.aspx?drugid=8953&drugname=A-G-Pro-oral'
    )
    assert scraper.reviews[-1]['comment'][:10] == 'I started '
    assert scraper.reviews[-1]['comment'][-10:] == 'vitamin :)'
    assert scraper.reviews[-1][
        'user id'] == 'A95, 13-18 Female  on Treatment for 1 to 6 months (Patient)'
    assert scraper.reviews[-1]['rating']['effectiveness'] == 5
    assert scraper.reviews[-1]['rating']['ease of use'] == 5
    assert scraper.reviews[-1]['rating']['satisfaction'] == 5
    assert scraper.reviews[-1]['date'] == '10/6/2010 10:10:35 PM'
Example #8
def test_scrape_assert_title_error():
    """
    Tests that when the scrape function is called with an invalid url whose
    page does have a title, but the wrong one (missing the phrase
    'User Reviews & Ratings - '), an AssertionError is raised internally
    and the function returns 0
    """
    scraper = WebMDScraper()
    returned = scraper.scrape('https://www.webmd.com/drugs/2/index')
    assert returned == 0
Example #9
def test_scrape_invalid_url_no_title():
    """
    Tests that when the scrape function is called on a url whose page lacks
    a title (an invalid url), an AttributeError is raised internally and
    the function returns 0
    """
    scraper = WebMDScraper()
    returned = scraper.scrape(
        'https://www.webmd.com/drugs/drugreview-bhilknhj')
    assert returned == 0
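These two tests together imply an early title check in scrape that converts both failure modes into a return value of 0; a hedged sketch of that guard (not the library's actual code), assuming requests and BeautifulSoup:

import requests
from bs4 import BeautifulSoup

def scrape(self, url):  # sketch of the method body, not the real implementation
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        title = soup.find('title').text              # AttributeError if the page has no <title>
        assert 'User Reviews & Ratings - ' in title  # AssertionError on a wrong-but-titled page
    except (AssertionError, AttributeError):
        return 0
    # otherwise proceed to collect reviews into self.reviews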
Example #10
def test_scrape_default_parameter():
    """
    Tests that, when the scrape function is called on a scraper with default
    parameters, the correct types of data are stored in the 'reviews'
    attribute, and that more than 5 reviews are collected (proving that
    multiple pages are scraped)
    """
    scraper = WebMDScraper()
    scraper.scrape(
        'https://www.webmd.com/drugs/drugreview-5659-'
        'methotrexate-sodium-injection.aspx?drugid=5659&drugname=methotrexate-sodium-injection'
    )
    assert len(scraper.reviews) > 5
    data_collected = list(scraper.reviews[0].keys())
    assert len(data_collected) == 4
    assert 'comment' in data_collected
    assert 'rating' in data_collected
    assert 'date' in data_collected
    assert 'drug' in data_collected
Example #11
def main():
    """ Main function.
    """
    input_url = 'https://www.webmd.com/drugs/drugreview-1701-citalopram-oral.aspx?drugid=1701&drugname=citalopram-oral'
    webmd_scraper = WebMDScraper()
    webmd_scraper.scrape(input_url)
Example #12
    def collect_urls(self, file_path, start=0):
        """Scrape all reviews for all drug urls in a file

        Args:
            file_path: path to a CSV file with 'Drug' and 'URL' columns
            start: index to start at if continuing from a previous run
        """
        if self.meta['locked']:
            print('Dataset locked. Please load a different dataset.')
            return

        scraper = None
        if self.scraper == 'WebMD':
            scraper = WebMDScraper()
        elif self.scraper == 'EverydayHealth':
            scraper = EverydayHealthScraper()
        elif self.scraper == 'Drugs':
            scraper = DrugsScraper()
        elif self.scraper == 'DrugRatingz':
            scraper = DrugRatingzScraper()

        urls = []

        with open(file_path) as csv_file:
            reader = csv.DictReader(csv_file)
            for row in reader:
                if row['URL'] != 'Not found':
                    urls.append({'name': row['Drug'], 'url': row['URL']})
        print('Found {} urls.'.format(len(urls)))

        if os.path.isfile(self.drug_name.lower() + '-dataset.pickle'):
            self.load()
        else:
            print('Saving meta...')
            drug_names = [x['name'] for x in urls]
            self.meta['drugs'] = drug_names
            self.meta['startTimestamp'] = time()
            self.save()

        # Loop through urls starting at start index
        for i in range(start, len(urls)):
            drug = urls[i]
            print('\n{} drugs left to scrape.'.format(len(urls) - i))
            print('Scraping {}...'.format(drug['name']))
            reviews = scraper.scrape(drug['url'])

            # If it's the first drug then replace self.reviews instead of appending
            if drug['name'] == urls[0]['name']:
                self.reviews = reviews
            else:
                self.reviews += reviews

            # Save our progress and let the user know the data is safe
            self.meta['endTimestamp'] = time()
            self.save()
            print('{} reviews saved. Safe to quit.'.format(drug['name']))

            # Let the user know what start index to use to continue later
            if i < len(urls) - 1:
                print('To continue run with parameter start={}'.format(i + 1))

        print('\nAll urls scraped!')
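The file read by collect_urls is expected to be a CSV with 'Drug' and 'URL' columns, where 'Not found' marks drugs whose URL lookup failed; an illustrative file:

Drug,URL
citalopram,https://www.webmd.com/drugs/drugreview-1701-citalopram-oral.aspx?drugid=1701&drugname=citalopram-oral
ExampleDrug,Not found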
Example #13
def main():
    scraper = WebMDScraper()  # or DrugsScraper(), DrugRatingzScraper(), or EverydayHealthScraper()
    url = scraper.get_url('Galzin')  # or any other drug name
    scraper.scrape(url)
    print('Scraped %d reviews.' % len(scraper.reviews))
Example #14
def main():
    scraper = WebMDScraper()  # DrugsScraper() doesn't work, DrugRatingzScraper() doesn't work; or EverydayHealthScraper()
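    # Assumed context: input_list (the drug names to aggregate),
    # webmd_names_errors, and everydayhealth_names_errors are defined
    # at module level outside this snippet.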
    url = ""
    json_aggregrationReviews = {"website": "webmd.com"}
    json_aggregrationReviews["ratingSystem"] = "stars"
    json_aggregrationReviews["itemsNamesAggregration"] = input_list
    reviewsAggregrate = []
    for i in range(len(input_list)):
        json_reviews = {"name": input_list[i]}
        try:
            url = scraper.get_url(input_list[i])  # or any other drug name
            scraper.scrape(url)
            dataframe_reviews = pd.DataFrame.from_dict(scraper.reviews)
            json_reviews["averageEffectiveness"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["effectiveness"].mean(), 1)
            json_reviews["averageEaseOfUse"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["ease of use"].mean(), 1)
            json_reviews["averageSatisfaction"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["satisfaction"].mean(), 1)
            json_reviews["minRating"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["satisfaction"].min(), 1)
            json_reviews["maxRating"] = round(
                pd.DataFrame.from_records(
                    dataframe_reviews["rating"])["satisfaction"].max(), 1)
            json_reviews["reviews"] = scraper.reviews
        except Exception:
            print("Could not get " + input_list[i] + " from webmd website")
            webmd_names_errors.append(input_list[i])
        reviewsAggregrate.append(json_reviews)
    json_aggregrationReviews["aggregrateReviews"] = reviewsAggregrate

    with open("webmdresult.json", "w") as f:
        obj = json.dumps(json_aggregrationReviews, indent=4)
        f.write(obj)

    scraper2 = EverydayHealthScraper()
    json_aggregrationReviews = {"website": "everydayhealth.com"}
    json_aggregrationReviews["ratingSystem"] = "stars"
    json_aggregrationReviews["itemsNamesAggregration"] = input_list
    reviewsAggregrate = []
    for i in range(len(input_list)):
        json_reviews = {"name": input_list[i]}
        try:
            url = scraper2.get_url(input_list[i])
            print(url)
            scraper2.scrape(url)
            dataframe_reviews = pd.DataFrame.from_dict(scraper2.reviews)
            json_reviews["averageRating"] = round(
                dataframe_reviews["rating"].mean(), 1)
            json_reviews["minRating"] = round(
                dataframe_reviews["rating"].min(), 1)
            json_reviews["maxRating"] = round(
                dataframe_reviews["rating"].max(), 1)
            json_reviews["reviews"] = scraper2.reviews
        except Exception:
            print("Could not get " + input_list[i] +
                  " from everydayhealth website")
            everydayhealth_names_errors.append(input_list[i])
        reviewsAggregrate.append(json_reviews)

    json_aggregrationReviews["aggregrateReviews"] = reviewsAggregrate

    with open("everydayhealth.json", "w") as f:
        obj = json.dumps(json_aggregrationReviews, indent=4)
        f.write(obj)

    if webmd_names_errors:
        print("I could not get from webmd " + str(webmd_names_errors))

    if everydayhealth_names_errors:
        print("I could not get from everydayhealth " +
              str(everydayhealth_names_errors))
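For reference, the webmdresult.json written above ends up with roughly this structure, shown here as a Python literal with made-up values (key spellings follow the code's own names):

{
    "website": "webmd.com",
    "ratingSystem": "stars",
    "itemsNamesAggregration": ["Galzin"],
    "aggregrateReviews": [{
        "name": "Galzin",
        "averageEffectiveness": 4.2,
        "averageEaseOfUse": 4.5,
        "averageSatisfaction": 3.9,
        "minRating": 1,  # min/max are taken from the satisfaction column
        "maxRating": 5,
        "reviews": [],   # the full scraped review dicts
    }],
}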