def box_office_scraper():
    # Save log file
    trigger_log_save()
    # Run scraper for latest data refresh
    scraper()
    # return "Scrape complete (Flask)!"  # returns HTML/text (r.text)
    return {"data": "Flask data key"}  # returns JSON (r.json())
def box_office_scraper_view():
    # Save the log file
    trigger_log_save()
    # Run the scraper for latest data refresh
    scraper()
    # return "Scrape complete! (FastAPI)"  # Sending HTML (r.text)
    return {"data": "FastAPI data key"}  # Sending JSON (r.json())
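# Client-side usage sketch for the two view functions above (assumption: either app
# exposes them at a hypothetical /box-office route on localhost; host, port, and path
# are placeholders, not taken from the original code). It mirrors the r.text / r.json()
# notes in the handlers' comments.
import requests

r = requests.get("http://localhost:8000/box-office")  # hypothetical URL
print(r.text)            # raw HTML/text body, as in the commented-out string return
print(r.json()["data"])  # JSON body, as returned by the dict responses above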
def onClick(self, event):
    window = Toplevel(self.root)
    window.geometry("670x500")
    x = scraper()
    res = x.search(self.entry.get(), self.entry2.get(), int(self.limitentry.get()))
    self.text = Text(window, borderwidth=0, relief=SUNKEN)
    self.text.insert(INSERT, res)
    self.text.place(x=0, y=0)
import requests
from requests.exceptions import HTTPError


def scrapeDataOnePage(pageNum, pageSize, statusId, organizationId, outputFile):
    target_url = (
        'https://bidsandtenders.ic9.esolg.ca/Modules/BidsAndTenders/services/bidsSearch.ashx'
        f'?pageNum={pageNum}&pageSize={pageSize}&statusId={statusId}'
        f'&organizationId={organizationId}&sortColumn=UtcPublishDate&sortDir=DESC'
    )

    # Request the URL and parse the JSON
    try:
        response = requests.get(target_url)
        response.raise_for_status()
        jsonRes = response.json()
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        return
    except Exception as err:
        print(f'Other error occurred: {err}')
        return

    # Extract viewUrl and run scraper on each view site
    data = jsonRes["data"]["tenders"]

    # Scrape data for each tender
    for each in data:
        scraper(each["viewUrl"], outputFile)
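# Usage sketch for scrapeDataOnePage (assumption: the page size, status ID,
# organization ID, and output file below are placeholder values, not taken from the
# original code). Walks the first few result pages into one output file.
if __name__ == '__main__':
    for page in range(1, 4):  # hypothetical: first three result pages
        scrapeDataOnePage(pageNum=page, pageSize=20, statusId=2,
                          organizationId=100, outputFile='tenders-output')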
def main():
    # Create an empty dict (one list per subreddit) to fill with post information
    posts = {
        "hearthstone": [],
        "blizzard": [],
        "HongKong": [],
        "overwatch": [],
        "gaming": []
    }

    # API endpoint to query for submissions
    submission_endpoint = "https://api.pushshift.io/reddit/search/submission/"

    # Set the scraper start time as the beginning of two days before the Blizzard announcement
    start_time = datetime.datetime(2019, 10, 7, 0, 0, 0).timestamp()  # converts to epoch time

    # The most recent time to get posts from
    # end_time = 1570824407  # Timestamp of the last post we got when we first pulled the data
    end_time = datetime.datetime(2019, 10, 12, 0, 0, 0).timestamp()  # Get the posts through Friday night THIS IS A NEW RUN

    # Call the scraper function to continuously build up a JSON file with the posts returned from each subreddit
    # The full JSON object is realized with the final scraper return value in all_posts
    hearthstone_posts = scraper(submission_endpoint, subreddits[0], start_time, end_time, posts)
    blizzard_posts = scraper(submission_endpoint, subreddits[1], start_time, end_time, hearthstone_posts)
    HongKong_posts = scraper(submission_endpoint, subreddits[2], start_time, end_time, blizzard_posts)
    overwatch_posts = scraper(submission_endpoint, subreddits[3], start_time, end_time, HongKong_posts)
    all_posts = scraper(submission_endpoint, subreddits[4], start_time, end_time, overwatch_posts)

    # Save all_posts
    with open("posts/all_posts_new.txt", 'w') as output:
        json.dump(all_posts, output)
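# Minimal sketch of the scraper() helper assumed by main() above (assumption: it queries
# the Pushshift submission endpoint with subreddit/after/before parameters and appends
# the returned submissions to the shared posts dict under the subreddit's key; the size
# parameter is a placeholder).
import requests

def scraper(endpoint, subreddit, start_time, end_time, posts):
    params = {
        "subreddit": subreddit,
        "after": int(start_time),
        "before": int(end_time),
        "size": 500,  # hypothetical batch size
    }
    response = requests.get(endpoint, params=params)
    response.raise_for_status()
    for submission in response.json().get("data", []):
        posts[subreddit].append(submission)
    return posts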
def main():
    cases = None
    casualties = None
    recoveries = None

    while True:
        if DEBUG:
            print("Going for another run")

        wp = world_pop(WP_URL, DEBUG)
        results = scraper(URL, DEBUG)

        if results is False:
            logger.error(
                "An error occurred retrieving stats from {}. Trying again in {}"
                .format(URL, WAIT_TIME))
        else:
            try:
                r_keys = list(results.keys())
                r_cases = int(results[r_keys[0]].replace(',', ''))
                r_casualties = int(results[r_keys[1]].replace(',', ''))
                r_recoveries = int(results[r_keys[2]].replace(',', ''))

                clean_fatal_rate = r_casualties / r_cases * 100
                fatal_rate = "{0:.2f}%".format(clean_fatal_rate)

                clean_recov_rate = r_recoveries / r_cases * 100
                recover_rate = "{0:.2f}%".format(clean_recov_rate)

                clean_active_cases = r_cases - r_casualties - r_recoveries
                active_rate = "{0:.2f}%".format(clean_active_cases / r_cases * 100)
                active_cases = f'{clean_active_cases:,}'

                clean_closed_cases = r_cases - clean_active_cases
                closed_rate = "{0:.2f}%".format(clean_closed_cases / r_cases * 100)
                closed_cases = f'{clean_closed_cases:,}'

                closed_fatal_rate = "{0:.2f}%".format(
                    r_casualties / clean_closed_cases * 100)
                closed_recov_rate = "{0:.2f}%".format(
                    r_recoveries / clean_closed_cases * 100)

                results["Fatality Rate"] = fatal_rate
                results["Recovered Rate"] = recover_rate
                results["Active Cases"] = active_cases
                results["Active Cases %"] = active_rate
                results["Closed Cases"] = closed_cases
                results["Closed Cases %"] = closed_rate
                results["Closed Fatality Rate"] = closed_fatal_rate
                results["Closed Recovered Rate"] = closed_recov_rate
                results.update(wp)

                wp_keys = list(wp.keys())
                wp_count = int(wp[wp_keys[0]].replace(',', ''))
                clean_infect_rate = r_cases / wp_count * 100
                infect_rate = "{0:.2f}%".format(clean_infect_rate)
                results["Total Population Infected"] = infect_rate

                logger.info(results)

                if cases is None:
                    cases = r_cases
                    casualties = r_casualties
                    recoveries = r_recoveries

                if cases != r_cases:
                    if cases < r_cases:
                        diff = r_cases - cases
                        cases = r_cases
                        logger.info(
                            "Confirmed cases have risen by: {}. Count now stands at: {}"
                            .format(diff, cases))
                    elif cases > r_cases:
                        diff = cases - r_cases
                        cases = r_cases
                        logger.info(
                            "Confirmed cases have decreased by: {}. Count now stands at: {}"
                            .format(diff, cases))

                if casualties != r_casualties:
                    if casualties < r_casualties:
                        diff = r_casualties - casualties
                        casualties = r_casualties
                        logger.info(
                            "Fatal cases have risen by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, casualties, fatal_rate))
                    elif casualties > r_casualties:
                        diff = casualties - r_casualties
                        casualties = r_casualties
                        logger.info(
                            "Fatal cases have decreased by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, casualties, fatal_rate))

                if recoveries != r_recoveries:
                    if recoveries < r_recoveries:
                        diff = r_recoveries - recoveries
                        recoveries = r_recoveries
                        logger.info(
                            "Recovery cases have risen by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, recoveries, recover_rate))
                    elif recoveries > r_recoveries:
                        diff = recoveries - r_recoveries
                        recoveries = r_recoveries
                        logger.info(
                            "Recovery cases have decreased by: {}. Count now stands at: {}. Rate: {}"
                            .format(diff, recoveries, recover_rate))
            except KeyError:
                logger.error(
                    "KeyError in data. Trying again in {}. Data: {}".format(
                        WAIT_TIME, results))
            except ValueError:
                logger.error(
                    "ValueError in data. Trying again in {}. Data: {}".format(
                        WAIT_TIME, wp))

        time.sleep(WAIT_TIME)
if __name__ == "__main__": # get 50 posts each from the above subreddits in the past 2 days submission_endpoint = "https://api.pushshift.io/reddit/search/submission/" start_time = datetime.datetime(2019, 10, 30, 0, 0, 0).timestamp() end_time = datetime.datetime(2019, 11, 1, 0, 0, 0).timestamp() posts = { "hearthstone": [], "blizzard": [], "HongKong": [], "overwatch": [], "gaming": [] } subreddits = ["hearthstone", "blizzard", "HongKong", "overwatch", "gaming"] sample = [] hearthstone_posts = scraper(submission_endpoint, subreddits[0], start_time, end_time, posts) # randomly select 50 posts hearthstone_sample = random.sample(hearthstone_posts["hearthstone"], 10) sample.extend(hearthstone_sample) blizzard_posts = scraper(submission_endpoint, subreddits[1], start_time, end_time, hearthstone_posts) blizzard_sample = random.sample(blizzard_posts["blizzard"], 10) sample.extend(blizzard_sample) hongkong_posts = scraper(submission_endpoint, subreddits[2], start_time, end_time, blizzard_posts) hongkong_samples = random.sample(hongkong_posts["HongKong"], 10) sample.extend(hongkong_samples) overwatch_posts = scraper(submission_endpoint, subreddits[3], start_time, end_time, hongkong_posts) overwatch_samples = random.sample(overwatch_posts["overwatch"], 10) sample.extend(overwatch_samples)
# This script scrapes the data from one single view site
from scrape import scraper

URL = 'https://lkdsb.bidsandtenders.ca/Module/Tenders/en/Tender/Detail/be1fba85-2eb6-4094-b7ad-f7fdd0c3a1f2#'
filename = "single-view"

scraper(URL, filename)
from scrape import scraper
from file_utils import read_csv, write_csv, read_file, write_file
from generate import _render_template, preprocess
from image import pass_gen
from mail import sendmail
import json

# Scrape the webpage and store the data in a CSV
data = scraper('http://scrape.kjscecodecell.com/')
write_csv(data)

# Read the scraped data from the CSV and preprocess it
participants = read_csv()
participants = preprocess(participants)

# Get the list of addresses that have already been sent a mail
sent_mails = read_file()

# Loop over all participants
for participant in participants:
    # Check whether the participant was sent a mail previously
    if participant['email'] not in sent_mails:
        name = participant['name']
        email = participant['email']
        phone = participant['phone']
        payment_status = participant['payment']

        # Generate a message from the template
        message = _render_template(name, payment_status)

        # Generate a custom image
from app import db, Bitcoin
from scrape import scraper

url = "https://coinmarketcap.com/2/"
scraper_data = scraper(url)


def put_together():
    # Make sure the database is empty
    db.drop_all()
    # Create the columns of the database
    db.create_all()

    # Iterate over the scraped table rows and fill the columns with the info
    # Fills the columns with the data and the column names
    for coin in scraper_data:
        new_row = Bitcoin(Name=coin[0],
                          Price=coin[1],
                          _24h=coin[2],
                          _7d=coin[3],
                          Market_Cap=coin[4],
                          Volume=coin[5],
                          Circulating_Supply=coin[6])
        db.session.add(new_row)
        db.session.commit()


if __name__ == '__main__':
    put_together()
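# Sketch of the Bitcoin model imported from app above (assumption: a Flask-SQLAlchemy
# model whose columns mirror the keyword arguments used in put_together(); the database
# URI and column types are placeholders).
from flask import Flask
from flask_sqlalchemy import SQLAlchemy

app = Flask(__name__)
app.config['SQLALCHEMY_DATABASE_URI'] = 'sqlite:///coins.db'  # hypothetical URI
db = SQLAlchemy(app)

class Bitcoin(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    Name = db.Column(db.String(80))
    Price = db.Column(db.String(40))
    _24h = db.Column(db.String(40))
    _7d = db.Column(db.String(40))
    Market_Cap = db.Column(db.String(40))
    Volume = db.Column(db.String(40))
    Circulating_Supply = db.Column(db.String(40))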
from scrape import scraper
from csv_utils import read_csv, write_csv, get_unpaid_participants
from generate import _render_template
from mail import sendmail
import json

data = scraper('http://scrape.surge.sh/')
write_csv(data, "studentdetails.csv")

unpaid_participants, paid_count = get_unpaid_participants("studentdetails.csv")
total_seats = 500

for participant in unpaid_participants:
    html = _render_template(participant[0], total_seats - paid_count)
    sendmail(to_email=participant[0], html=html)
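# Minimal sketch of the sendmail() helper assumed above (assumption: SMTP over SSL with
# credentials read from environment variables; the host, subject line, and variable
# names are placeholders, not taken from the original code).
import os
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

def sendmail(to_email, html):
    sender = os.environ['MAIL_USER']        # hypothetical env var
    password = os.environ['MAIL_PASSWORD']  # hypothetical env var
    msg = MIMEMultipart('alternative')
    msg['Subject'] = 'Payment reminder'     # hypothetical subject
    msg['From'] = sender
    msg['To'] = to_email
    msg.attach(MIMEText(html, 'html'))
    with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
        server.login(sender, password)
        server.sendmail(sender, to_email, msg.as_string())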
import os
import pickle

from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from scrape import scraper

# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets']

# The ID and range of the spreadsheet you are adding to.
SAMPLE_SPREADSHEET_ID = '1GR5X2Ryk-S3cs1CE7fl60bpXw0TI-lLv7EMAkzAtUj0'
# SAMPLE_RANGE_NAME = 'Sheet1!A2:C'
range_name = 'Sheet1!A2:V'
# FirstTime = True

# The stats of all players on the Toronto Raptors for each game, in a list
stats = scraper()


def main():
    """Shows basic usage of the Sheets API.
    Prints values from a sample spreadsheet.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
def scrape_data():
    print('scraping...')
    # Clear the collection, then insert the freshly scraped documents
    myColl.delete_many({})
    resp = scraper()
    myColl.insert_many(resp)
    return 'Scraped'
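# Context sketch for scrape_data() above (assumption: myColl is a PyMongo collection and
# the function is exposed as a Flask route; the connection string, database and
# collection names, and the route path are placeholders).
from flask import Flask
from pymongo import MongoClient

app = Flask(__name__)
client = MongoClient("mongodb://localhost:27017")   # hypothetical connection string
myColl = client["scraper_db"]["results"]            # hypothetical database/collection

app.add_url_rule("/scrape", view_func=scrape_data)  # hypothetical route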
def main():
    parser = argparse.ArgumentParser()
    # Positional arguments
    parser.add_argument('url',
                        help='URL to the 4chan thread you want to scrape',
                        type=str)
    parser.add_argument('destination',
                        help='Destination folder in your home folder',
                        type=str)
    # Optional arguments
    parser.add_argument('-q', '--quiet',
                        help='Run the script in quiet mode, no outputs',
                        action='store_true',
                        default=False)
    parser.add_argument('-v', '--verbose',
                        help='Run with increased verbosity',
                        action='store_true',
                        default=False)
    parser.add_argument(
        '-w', '--watch',
        help='Watch the thread; checks the thread every 5 minutes for new posts until the thread 404s',
        action='store_true',
        default=False)
    parser.add_argument(
        '-i', '--interval',
        help='Specify the wait time when watching a thread, in seconds',
        type=int)
    args = parser.parse_args()

    # Set behaviour
    verbose = args.verbose
    quiet = args.quiet
    watch = args.watch
    interval = 300 if not args.interval else args.interval

    # Set variables
    link = args.url
    destination = args.destination
    board = link.split('/')[3]      # 'tv', 'wg', or similar
    thread_id = link.split('/')[5]  # '114804039' or similar
    url = f'https://a.4cdn.org/{board}/thread/{thread_id}.json'
    content_url = f'https://i.4cdn.org/{board}/'

    # Determine platform/OS and set the appropriate path
    system = platform.system()
    if system == 'Linux':
        home = os.environ['HOME']
        destination = f'{home}/{destination}/'
    elif system == 'Windows':
        home = os.environ['HOMEPATH']
        destination = f'{home}\\{destination}\\'
    else:
        if not quiet:
            print('Unsupported system, exiting')
        sys.exit(2)

    if verbose:
        print('Will scrape using the following information:')
        print(f'\tLink: \t\t{link}')
        print(f'\tBoard: \t\t{board}')
        print(f'\tThread ID: \t{thread_id}')
        print(f'\tURL: \t\t{url}')
        print(f'\tContent URL: \t{content_url}')
        print(f'\tDestination: \t{destination}\n')
        if watch:
            print(f'Will watch the thread. Interval: {interval}')

    # Create the destination folder
    if verbose:
        print(f'--> creating folder: {destination}')
    try:
        os.makedirs(destination, exist_ok=True)
    except Exception as e:
        if not quiet:
            print(f'Could not create destination folder: {e}')
        sys.exit(3)

    # Get the thread in JSON representation
    try:
        if verbose:
            print(f'--> getting the thread metadata from thread id: {board}/{thread_id}')
        posts = refresh_post_list(url, quiet, verbose)
    except Exception as e:
        if not quiet:
            print(f'Could not get thread metadata, reason: {e}')
        sys.exit(4)

    # Set timestamp
    start_time = posts[0]['time']

    # Provide more verbose information about the thread
    if verbose:
        first_post = posts[0]
        if first_post.get('sub'):
            title = first_post['sub']
        else:
            title = None
        no_of_images = first_post['images']
        no_of_replies = first_post['replies']
        time_of_first_post = datetime.utcfromtimestamp(start_time).strftime('%Y-%m-%d %H:%M:%S')
        print('--> metainformation about the thread:')
        if title:
            print(f'\tTitle: {title}')
        print(f'\tNumber of images: {no_of_images}')
        print(f'\tNumber of replies: {no_of_replies}')
        print(f'\tTime of first post: {time_of_first_post} UTC')

    new_time = scraper(posts, start_time, content_url, destination, quiet, verbose)

    if verbose:
        print(f'--> timestamp of last post: {new_time}')

    if watch:
        if not quiet:
            print('--- watching thread ---')
        while True:
            if verbose:
                print(f'--> waiting {timedelta(seconds=interval)} before refreshing thread')
            time.sleep(interval)
            if verbose:
                print('--> refreshing list of posts')
            posts = refresh_post_list(url, quiet, verbose)
            # Check if the thread is closed
            if posts[0].get('closed'):
                if not quiet:
                    print('Thread is closed, exiting')
                break
            if verbose:
                print('--> attempting to download new images')
            new_time = scraper(posts, new_time, content_url, destination, quiet, verbose)
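# Minimal sketch of the refresh_post_list() helper assumed by main() above (assumption:
# it fetches the thread JSON from a.4cdn.org and returns the list stored under the
# "posts" key of the response).
import requests

def refresh_post_list(url, quiet, verbose):
    if verbose:
        print(f'--> GET {url}')
    response = requests.get(url)
    response.raise_for_status()
    return response.json()['posts']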