def get_1001tracklists_data(dataframe):
    """
    A function to retrieve data from 1001Tracklists.com
    :param dataframe: A reference dataframe (with 1001Tracklists Track ID)
    :return: A dataframe with number of plays and unique DJ supports.
    """
    dataframe["1001T_TotPlays"] = 0
    dataframe["1001T_Supports"] = 0
    initialize_VPN(save=1, area_input=['random countries europe 20'])
    for idx, row in dataframe.iterrows():
        print(f'{idx} | {row["1001Tracklists_ID"]} | {row["Track_Name"]}')
        if pd.notna(row['1001Tracklists_ID']) and row['1001Tracklists_ID'] not in exception_1001T:
            call = get_1001tracklists_track_data(row['1001Tracklists_ID'])
            if isinstance(call, str):  # the scraper returns an error message string when blocked
                print(call)
                rotate_VPN()
                data_1001tt = get_1001tracklists_track_data(row['1001Tracklists_ID'])
            else:
                data_1001tt = call
            dataframe.loc[idx, "1001T_Supports"], dataframe.loc[idx, "1001T_TotPlays"] = data_1001tt
    terminate_VPN()
    return dataframe
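# The call-check-rotate-retry pattern above recurs in several of these snippets.
# A minimal generic sketch of it, assuming any zero-argument callable that raises
# requests.exceptions.ConnectionError when the current IP is blocked
# (fetch_with_rotation and max_rotations are illustrative names, not part of
# nordvpn_switcher):
from requests.exceptions import ConnectionError
from nordvpn_switcher import rotate_VPN

def fetch_with_rotation(fetch, settings, max_rotations=5):
    """Call fetch(); on a ConnectionError, rotate the VPN and try again."""
    for attempt in range(max_rotations):
        try:
            return fetch()
        except ConnectionError:
            print(f'IP BLOCKED - rotating VPN (attempt {attempt + 1})')
            rotate_VPN(settings)
    return fetch()  # one last try after the final rotation

# e.g.: plays = fetch_with_rotation(lambda: soundcloud_scrapping(track_url), settings)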
def get_soundcloud_data(data_frame):
    """
    A function to get data from Soundcloud (here, plays for each music).
    :param data_frame: A dataframe with the Soundcloud links associated to each music.
    :return: The same dataframe but with the total number of plays for each music.
    """
    df = data_frame.fillna("NONE")
    soundcloud_dict = {}
    tracks_plays = {}
    # map each Soundcloud link to the row index it belongs to
    for an_idx, a_row in df.iterrows():
        soundcloud_dict[a_row["Soundcloud_Link1"]] = [an_idx]
        soundcloud_dict[a_row["Soundcloud_Link2"]] = [an_idx]
    df1 = pd.DataFrame(soundcloud_dict).transpose().rename({0: "idx"}, axis=1) \
        .drop(index="NONE")
    tracks = list(df1.index)
    initialize_VPN(save=1, area_input=['complete rotation'])
    for idx, track_url in enumerate(tracks):
        print(f'{idx} | {track_url}')
        try:
            tracks_plays[track_url] = {'plays': soundcloud_scrapping(track_url)}
        except (ConnectionError, IndexError):
            print('IP BLOCKED - Need Rotation')
            rotate_VPN()
            tracks_plays[track_url] = {'plays': soundcloud_scrapping(track_url)}
        sleep(1)
    terminate_VPN()
    df2 = pd.DataFrame(tracks_plays).transpose()  # single column: 'plays'
    concat = pd.merge(df1, df2, left_index=True, right_index=True)
    plays_sum = concat.groupby("idx").sum()
    final = data_frame.join(plays_sum, how='outer').rename({'plays': "Soundcloud_Plays"}, axis=1)
    final.Soundcloud_Plays = final.Soundcloud_Plays.fillna(0)
    return final
def soundcloud_scrapping(soundcloud_url):
    """
    Scrape the play count of a single SoundCloud track.
    :param soundcloud_url: The SoundCloud path of the track (e.g. 'artist/track-name').
    :return: The number of plays as an integer (0 if it could not be retrieved).
    """
    plays = 0
    page_link = f'https://soundcloud.com/{soundcloud_url}/'
    success = False
    n_fail = 0
    while not success:
        if n_fail < 3:
            try:
                page_response = requests.get(page_link, headers=Headers().generate())
                soup = BeautifulSoup(page_response.content, "html.parser")
                plays = str(soup.find_all("meta", property="soundcloud:play_count")[0])
                plays = int(re.search('meta content="(.+?)"', plays).group(1))
                success = True
            except requests.exceptions.ConnectionError:
                n_fail += 1
                print("ConnectionError (from \"requests\"): retrying in 5 sec...")
                sleep(5)
        else:
            print('IP SOFT-LOCKED - Need Rotation')
            rotate_VPN()
            n_fail = 0  # reset the failure counter after rotating, otherwise this branch loops forever
    return plays
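# A minimal standalone use of soundcloud_scrapping, following the same
# initialize-with-save pattern the other snippets use so that the bare
# rotate_VPN() call inside the function can load the saved settings file.
# "some-artist/some-track" is a hypothetical SoundCloud path for illustration.
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN

initialize_VPN(save=1, area_input=['complete rotation'])
rotate_VPN()
print(soundcloud_scrapping("some-artist/some-track"))
terminate_VPN()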
class AmazonrSpider(scrapy.Spider):
    name = 'amazonr'
    allowed_domains = ['*']
    start_urls = ['http://amazon.com/']
    df = pd.read_csv(
        r'C:\Amazon Reviews scraper(part1)\Scraper\reviewsScraper\cleanedProducts.csv')
    link_asin = list(zip(df.SeeAllReviews, df.asin))[500:1500]
    DRIVER_PATH = r"E:\ChromeDriver\chromedriver.exe"
    driver = webdriver.Chrome(executable_path=DRIVER_PATH)
    counter = 0
    settings = initialize_VPN(save=1, area_input=['complete rotation'])
    rotate_VPN(settings)

    def start_requests(self):
        for pair in self.link_asin:
            self.driver.get(pair[0])
            request = Request(self.driver.current_url,
                              callback=self.parseReview,
                              dont_filter=True,
                              meta={'item': pair[1]})
            yield request

    def parseReview(self, response):
        asin = response.meta['item']
        self.driver.get(response.url)
        try:
            # wait until at least one review star rating is present on the page
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-star-rating"]'
                )))
            time.sleep(1)
            soup = BeautifulSoup(self.driver.page_source, "html.parser")
            # build an lxml tree so we can query the rendered page with XPath
            dom = etree.HTML(str(soup))
            reviewTitle = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-title"]//text()')
            reviewTitle = [i.strip() for i in reviewTitle if i.strip() != '']
            reviewRatings = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-star-rating"]//text()')
            reviewText = dom.xpath(
                '//*[@class="a-section a-spacing-none review-views celwidget"]//*[@data-hook="review-body"]//text()')
            reviewText = [i.strip() for i in reviewText if i.strip() != '']
            reviewDate = dom.xpath('//*[@data-hook="review-date"]/text()')
            for rtitle, rtext, rrating, rdate in zip(reviewTitle, reviewText,
                                                     reviewRatings, reviewDate):
                dict_ = {
                    "asin": asin,
                    "reviewTitle": rtitle,
                    "reviewText": rtext,
                    "reviewRatings": rrating,
                    "reviewDate": rdate
                }
                with open("Reviews.json", "a") as fl:
                    json.dump(dict_, fl)
                    fl.write('\n')
            nextpage = dom.xpath('//*[@class="a-last"]//@href')  # next-page link
            self.counter += 1
            if self.counter % 200 == 0:  # rotate the VPN after every 200 pages
                print('--- rotating VPN ---')
                settings = initialize_VPN(save=1, area_input=['complete rotation'])
                rotate_VPN(settings)
            if nextpage:  # follow the next page of reviews
                nextpage = "https://www.amazon.com" + nextpage[0]
                yield Request(nextpage,
                              callback=self.parseReview,
                              dont_filter=True,
                              meta={'item': asin})
        except Exception:
            pass  # skip products whose review section never loads
def run():
    email_addr, verification_code_email, password = get_email()
    if not email_addr:
        return
    print(email_addr)
    use_vpn, status, url, code_sender = get_input_args()
    if use_vpn:
        initialize_VPN(save=1,
                       area_input=["random countries europe 30"],
                       skip_settings=1)
        rotate_VPN()
    driver = get_driver()
    answers = get_answers()
    while status <= 0:
        driver.get(url)
        if status == -1:
            claim_prize(driver, email_addr)
        print("current status: %s" % status)
        time.sleep(2)
        if use_vpn:
            try:
                switch_to_frame(driver)
                skip_button = WebDriverWait(driver, 5).until(
                    EC.visibility_of_element_located(
                        (By.CSS_SELECTOR, "button.bg-transparent")))
                button_click(driver, skip_button)
            except:
                pass
        elif status != -1:
            switch_to_frame(driver)
            input_email_and_accept_terms(driver, email_addr)
            # Wait for the verification email to arrive
            time.sleep(6)
            verification_code = get_verification_code(verification_code_email,
                                                      password, code_sender)
            switch_to_frame(driver)
            enter_verification_code(driver, verification_code)
            time.sleep(2)
        switch_to_frame2(driver)
        start_quiz(driver, status)
        if status > -2:
            status = game_loop(driver, answers)
        current_url = driver.current_url
        if "http" in current_url:
            url = current_url
        if status != -1:
            status = claim_prize(driver, email_addr)
    driver.close()
    terminate_VPN()
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN
import time

settings = initialize_VPN(save=1, area_input=['Ireland'])
while True:
    rotate_VPN(settings, google_check=1)
    time.sleep(1800)  # e.g. rotate servers every 30 minutes
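# Variant of the loop above that also releases the VPN connection when the
# script is interrupted: a sketch using the same nordvpn_switcher calls,
# with terminate_VPN() invoked bare as in the other snippets here.
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN
import time

settings = initialize_VPN(save=1, area_input=['Ireland'])
try:
    while True:
        rotate_VPN(settings, google_check=1)  # google_check verifies the new connection
        time.sleep(1800)
except KeyboardInterrupt:
    terminate_VPN()  # disconnect cleanly on Ctrl-C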
import os
import time

import pandas as pd
from nordvpn_switcher import initialize_VPN, rotate_VPN, terminate_VPN

settings = initialize_VPN()
rotate_VPN(settings)

from auxiliary.unitedstateszipcode_scraper import ZipCodeUSA

zipCodeScraper = ZipCodeUSA()
donors = pd.read_csv(
    r"D:\Programming\Python\DonorsChoose\data\DonorsChoose\Donors.csv")
#donors = donors.loc[donors["Donor State"] != 'other']
zipcodes = pd.Series(donors["Donor Zip"].unique(), name="zipcodes")
zipcodes = pd.to_numeric(zipcodes, errors="coerce").dropna().astype(int).astype(str)
# pad zip prefixes shorter than 3 digits with trailing zeros
for i in range(1, 3):
    zipcodes.loc[zipcodes.str.len() == i] = (
        zipcodes.loc[zipcodes.str.len() == i] + "0" * (3 - i)).values
zipcodes = zipcodes.unique()
zipcode_df = pd.DataFrame(index=range(len(zipcodes)),
                          columns=[
                              "id", "Population", "Population Density",
                              "Housing Units", "Median Home Value",
                              "Land Area", "Water Area",
                              "Occupied Housing Units",
                              "Median Household Income"
                          ])
additions = [
from nordvpn_switcher import initialize_VPN, rotate_VPN
import time

#######################
## WINDOWS OR LINUX ###
#######################

# [1] save the settings file as a variable
instructions = initialize_VPN()  # this will guide you through a step-by-step menu, including a help menu with connection options

for i in range(3):
    rotate_VPN(instructions)  # refer to the instructions variable here
    print('\nDo whatever you want here (e.g. scraping). Pausing for 10 seconds...\n')
    time.sleep(10)

# [2] if you'd like to skip the step-by-step menu (because you want to automate your script fully, without any human intervention), use the area_input parameter
instructions = initialize_VPN(area_input=['Belgium,France,Netherlands'])  # <-- be aware: the area_input parameter expects a list, not a string

for i in range(3):
    rotate_VPN(instructions)  # refer to the instructions variable here
    print('\nDo whatever you want here (e.g. scraping). Pausing for 10 seconds...\n')
    time.sleep(10)
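# [3] when you're done, drop the VPN connection with terminate_VPN,
# as several of the snippets above do
from nordvpn_switcher import terminate_VPN

terminate_VPN()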