def collect(url, visited=None):
    '''
    Used to access a new page and put its data into a page instance.
    Controls the user's specified operations.
    Runs only if it needs to.
    Running time depends on factors such as internet speed and cached results.
    '''
    # Find the location of the cached files for this domain
    file_path = os.getcwd() + '/data/' + getDomain(url)

    # Create an empty set if visited is not supplied
    if visited is None:
        if os.path.exists(file_path):
            visited = set(os.listdir(file_path))
        else:
            visited = set()

    # Check if there is existing data for the website
    if os.path.exists(file_path):
        # Only scrape the page if it has not been visited yet
        if url.replace('/', '|') + '.txt' not in visited:
            scrape(url)
    else:
        scrape(url)

    return visited
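A minimal driver sketch for collect(), assuming the function above is in scope; the URL list below is a placeholder, not from the source. The visited set returned by each call is passed back in so the on-disk cache is only listed once.

# Hypothetical usage of collect(); the URLs are placeholders.
urls = [
    'https://example.com/',
    'https://example.com/about',
]

visited = None
for url in urls:
    # collect() returns the set of cached page files for the domain,
    # so reusing it avoids re-reading the data directory on every call.
    visited = collect(url, visited)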
def start_updater(DB_LOCATION, SCRAPE_PERIOD, SMTP_USERNAME, SMTP_PASSWORD,
                  webapp_context):
    # Used in a thread to scrape periodically.
    def scrape_and_update(webapp_context):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        while webapp_context["running"]:
            time.sleep(SCRAPE_PERIOD)
            db_connection = sqlite3.connect(DB_LOCATION)
            db_cursor = db_connection.cursor()
            previous_scrape_datetime = datetime.datetime.now()
            try:
                scrape(DB_LOCATION)
                email_new_urops(DB_LOCATION, previous_scrape_datetime,
                                SMTP_USERNAME, SMTP_PASSWORD)
                webapp_context["data_json"] = create_data_json(
                    DB_LOCATION, previous_scrape_datetime)
            except:
                traceback.print_exc()
            db_connection.commit()
            db_connection.close()

    # Make initial data_json, start scraping thread, before serving.
    previous_scrape_datetime = datetime.datetime.now()
    scrape(DB_LOCATION)
    email_new_urops(DB_LOCATION, previous_scrape_datetime, SMTP_USERNAME,
                    SMTP_PASSWORD)
    webapp_context["data_json"] = create_data_json(DB_LOCATION,
                                                   previous_scrape_datetime)

    updater = threading.Thread(target=scrape_and_update, args=(webapp_context, ))
    updater.daemon = True
    updater.start()
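A sketch of how start_updater() might be wired up before serving the web app. The database path, scrape period, and credentials are placeholder assumptions; only the webapp_context keys ("running" and "data_json") come from the snippet above.

# Hypothetical wiring for start_updater(); concrete values are assumptions.
webapp_context = {
    "running": True,    # the background loop checks this flag every cycle
    "data_json": None,  # refreshed by the updater after each scrape
}

start_updater(
    DB_LOCATION="urops.db",           # assumed database path
    SCRAPE_PERIOD=60 * 60,            # assumed: rescrape hourly (seconds)
    SMTP_USERNAME="bot@example.com",  # assumed credentials
    SMTP_PASSWORD="app-password",
    webapp_context=webapp_context,
)

# ... serve the web app here; setting webapp_context["running"] = False lets
# the daemon thread exit after its current sleep/scrape cycle.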
def iterate(urlList, n):
    if len(urlList) >= 1:
        for i in range(0, len(urlList)):
            # Catch unsupported links
            print "ATTEMPT: %s" % (urlList[i])
            if urlList[i][-1] == 'd':
                print "Not supported"
                continue
            elif urlList[i].find('"') != -1 or urlList[i].find(".js") != -1:
                print "Not supported"
                continue
            html = urllib2.urlopen(urlList[i]).read()
            scrape(html, urlList)
            if len(urlList) > n:
                break
def main():
    if input("Do you want to scrape (s) new images or use an existing (e) folder? (s/e): ") == 's':
        folder = scrape()
    else:
        folder = None

    if input("\nWould you like to use classifier.py to remove outliers? (y/n): ") != 'n':
        classifier(folder)

    if input("\nWould you like to use average.py to generate an image? (y/n): ") != 'n':
        average(folder)

    return
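A conventional entry-point guard, shown as an assumption (the original module may already include one), so the interactive prompts in main() only run when the script is executed directly.

# Assumed entry point; not shown in the snippet above.
if __name__ == '__main__':
    main()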
    'c++', 'vulkan', 'opengl', 'python 3', 'golang'
]

# print the options five rows at a time
print("OPTIONS:")
i = 0
while i < len(options_list):
    print(', '.join(options_list[i:i + int((len(options_list) / 5))]))
    i = i + int((len(options_list) / 5))

mail_id = input("Enter your e-mail (gmail) id here: ")
password = input("Enter your password: ")
tech_need = input("Enter what you want from the above stack: ")
tech_need_list = []

# take all the technologies the user wants until they type 'exit'
while tech_need.lower() != 'exit':
    tech_need_list.append(tech_need.lower())
    tech_need = input("Enter what you want from the above stack: ")

# call the scrape function, which returns rows with the required data
site_data = scrape(tech_need_list)

# if there is no matching data, do nothing; otherwise write to file and send mail
if len(site_data) != 0:
    writeFile(site_data)
    sendMail(mail_id, password)
    print("DONE!! :)")
else:
    print('NO matching data found for you :(')
def get_data(self, url, wd):
    # Unpack the scraped fields for this page onto the instance
    (self.data.class_name, self.data.quality, self.data.difficulty,
     self.data.comments, self.name, self.data.date) = scrape(self.url, wd)
You can then download the full dataframe as an Excel sheet for convenience.

**NOTE: Parsing through all job descriptions can take some time (up to 30 seconds).**
'''

# Hacky way of allowing downloading
global_df = None

######################################################################################
#                    Creating app instance and designing layout                      #
######################################################################################

# create scrape instance
scraper = scrape()

# better stylesheet
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# Create app
app = dash.Dash(external_stylesheets=external_stylesheets)

# assign server instance
server = app.server

# Create layout
app.layout = html.Div(children=[
    dcc.Markdown(heading),
    dbc.Progress(id="progress", value=0, striped=True, animated=True),
    html.Div(children=[
# UrlScraper
import urllib2

from scrape import *
from iterate import *

url = ""

# How to download images
f = open("test.gif", "wb")
f.write(urllib2.urlopen(url).read())
f.close()

urlList = []
site = urllib2.urlopen(url)
html = site.read()
scrape(html, urlList)

print len(urlList)
#for i in range(0, len(urlList)):
#    print urlList[i]

iterate(urlList, 150)

for i in range(0, len(urlList)):
    print urlList[i]
print len(urlList)

scrapeHTML(urlList)
import time, dropbox, os, datetime

from dropbox.files import WriteMode
from dropbox.exceptions import ApiError
from scrape import *
from BellBot import *

listings = 'Listings.txt'
client = dropbox.Dropbox('key')
counter = 0

while True:
    if counter == 3:
        counter = 0

        try:
            print('scraping cattle exchange now')
            listings = scrape()
            print('getting cattle listings')
            get_listings_text(listings)
            print('scrape is complete')
        except:
            print('exception occurred')

        # upload the listings file to the Dropbox folder
        with open(listings, 'rb') as f:
            dest_path = datetime.datetime.today().strftime('%d-%m-%Y')
            dest_path = '/Cowbell/{}_listings.txt'.format(dest_path)
            try:
                client.files_upload(f.read(), dest_path, mode=WriteMode('add'))
            except ApiError:
                print('API Error occurred')
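An optional sanity check, sketched here as an assumption rather than part of the original script; it could be dropped in right after the files_upload call to confirm the upload landed at dest_path by asking Dropbox for the file's metadata.

# Hypothetical follow-up check; not in the original script.
try:
    meta = client.files_get_metadata(dest_path)
    print('upload confirmed:', meta.path_display)
except ApiError:
    print('could not confirm upload of {}'.format(dest_path))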