def generate():
    "----------Start up the course times------------"
    autotable = AutoTable()
    scraper = Scraper(autotable)
    autotable = scraper.build_table()
    #builder = Builder(autotable)
    #autotable = builder.build_table()

    start_time = time.time()

    print("Year Space")
    year_solutions = generate_semester(autotable.year.courses)
    if len(year_solutions) == 0:
        optimal_solution = fall_winter_merge(autotable.fall.courses, autotable.winter.courses)
    else:
        optimal_solution = year_fall_winter_merge(year_solutions, autotable.fall.courses, autotable.winter.courses)

    print("Fall")
    for day in optimal_solution[0][0]:
        for timeslot in day:
            print(timeslot)

    print("Winter")
    for day in optimal_solution[0][1]:
        for timeslot in day:
            print(timeslot)

    print("Fall Distance: " + str(optimal_solution[1]) + " Winter Distance: " + str(optimal_solution[2]))
    print("--- Full algorithm %s seconds ---" % (time.time() - start_time))

    root = Tk()
    gui1 = MyFirstGUI(optimal_solution[0][0], "Fall", root)
    root.mainloop()

    root = Tk()
    gui2 = MyFirstGUI(optimal_solution[0][1], "Winter", root)
    root.mainloop()
def generate():
    "----------Start up the course times------------"
    autotable = AutoTable()
    scraper = Scraper(autotable)
    autotable = scraper.build_table()
    #builder = Builder(autotable)
    #autotable = builder.build_table()

    start_time = time.time()

    "----------Get all Fall Timetables------------"
    courses = autotable.fall.courses
    courses.extend(autotable.year.courses)
    space1 = autotable.solution_space(courses)

    "----------Get all Winter Timetables------------"
    courses = autotable.winter.courses
    courses.extend(autotable.year.courses)
    space2 = autotable.solution_space(courses)

    "-----------Combine fall and winter-------------"
    listed = autotable.index_year_courses(autotable.year.courses)
    compatible = autotable.construct_year(space1, space2, listed)

    print("Fall:")
    for section in compatible[0][0][0]:
        print(section.name)
        print(section)

    print("Winter:")
    for section in compatible[0][1][0]:
        print(section.name)
        print(section)

    print("Distance: " + str(compatible[0][2]))
    print("--- %s seconds ---" % (time.time() - start_time))
def crawler_main(self, indice_link):
    source = re.findall(REGEX_SOURCE, self.__url)
    url = self.__url
    parsed = Crawler.crawlear_web(self, url)
    ultima_pagina = (parsed.xpath(XPATH_ULTIMA_PAGINA)[0])

    # convert the string that uses "," as the thousands separator into an integer
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    last_page = int(locale.atof((re.findall(REGEX_ULTIMA_PAGINA, ultima_pagina))[0]))
    cantidad_paginas = math.ceil(last_page / 24)

    indice_pagina = 1  # index used to advance to the next page
    while cantidad_paginas >= indice_pagina:
        contador = 1
        sigiente_pagina = self.__nuevo_link.format(indice_pagina)
        parsed3 = Crawler.crawlear_web(self, sigiente_pagina)
        unidades_href = parsed3.xpath(XPATH_HREF_UNIDADES)
        for elemento in unidades_href:
            parsed2 = Crawler.crawlear_web(self, elemento)
            instancia_scraper = Scraper.Scraper()
            instancia_scraper.crear_dicc(parsed2, elemento, source, indice_link)
            print("Nuevo Aviso", contador)
            contador += 1
        indice_pagina += 1
        print("Cambio de pagina")
        print("Nueva Url")
def init():
    global crawler
    global domain
    global bot
    global timeout
    global lag
    global depth
    global emailscan
    global tprint

    # Remove previous files
    try:
        shutil.rmtree(".cache")
    except OSError:
        pass
    try:
        os.remove("emails.txt")
    except OSError:
        pass

    # Process cmd line arguments
    cmdArgs()

    # Initialize scraper object
    crawler = Scraper(domain, bot, timeout, lag, depth, emailscan)

    # Pretty print thread
    tprint = threading.Thread(target=ThreadPrettyPrint)
def writeToCSV(self):
    scraper = Scraper()
    finishedUltimaList = scraper.getUltimaList()
    f = open("{}.csv".format(self.animeName.get()), 'w', newline='')
    writer = csv.writer(f)
    for row in finishedUltimaList:
        self.progressBar.step(10)
        writer.writerow(row)
    self.progressBar.stop()
    f.close()
def update_navigation() -> None:
    scraper = Scraper.Scraper()
    nav_data = scraper.get_all_nav()

    # Subject Names
    json.dump(nav_data["subject_names"], open(make_path(navigation_dir, 'subject_names.json'), 'w'))
    # Subject -> Colleges
    json.dump(nav_data["subject_colleges"], open(make_path(navigation_dir, 'subject_colleges.json'), 'w'))
    # College -> Subjects
    json.dump(nav_data["college_subjects"], open(make_path(navigation_dir, 'college_subjects.json'), 'w'))
    # Errors
    json.dump(nav_data["errors"], open(make_path(error_dir, 'update_nav.json'), 'w'))
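The make_path helper used above is not shown in this example; a minimal sketch of what it presumably does (join a directory and a file name, creating the directory if needed) — the exact behavior is an assumption:

import os

def make_path(directory: str, filename: str) -> str:
    # Ensure the output directory exists, then return the full file path.
    os.makedirs(directory, exist_ok=True)
    return os.path.join(directory, filename)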
def __init__(self):
    self.scraper = Scraper.Scraper()
    self.materias = {}  # Dictionary: subject_code -> Materia object
    self.nomes_materias = {}  # Dictionary: subject_code -> subject name
    self.curso_materias = {}  # Dictionary: program_code -> list of that program's subjects (alphabetical order)
    self.lista_cursos = []  # List with the names of all programs, stored as (code, name) tuples

    if os.path.exists(DICT_FILE):
        with open(DICT_FILE, "rb") as f:
            self.materias, self.nomes_materias, self.curso_materias, self.lista_cursos = cPickle.load(f)
def initUI(self):
    # Instantiate the Scraper class
    self.scraper = Scraper.Scraper()

    self.setWindowTitle(self.title)
    self.mainWidget = MainWidget()
    self.setCentralWidget(self.mainWidget)

    self.statusBar = QStatusBar(self)
    self.setStatusBar(self.statusBar)

    self.show()

    self.mainWidget.pathSetButton.clicked.connect(self.openFileNameDialog)
    self.openFileNameDialog()

    # Connect the scrape function to the scrape button
    self.mainWidget.scrapeButton.clicked.connect(self.scrape)
def on_click(self):
    s = Scraper.Scraper(self.url.text(), self.container.text(), self.records.text())

    formatted = ""
    for record in s.run():
        for item in record:
            formatted += item + '; '
        formatted += '\n'

    msg = QMessageBox()
    msg.setIcon(QMessageBox.Information)
    msg.setText("Data Scrape Complete")
    msg.setInformativeText("Press Show Details to view")
    msg.setWindowTitle("Complete")
    msg.setDetailedText(str(formatted))
    msg.setStandardButtons(QMessageBox.Ok | QMessageBox.Close)
    msg.setFixedWidth(500)
    msg.exec_()
def influencer_bin_data(influencers, user, pw):
    for i in range(len(influencers)):
        print('Extracting info from ' + influencers[i].decode('utf-8') + '... using ' + user)
        scraper = Scraper(influencers[i], user, pw)
        user_data = scraper.run()
        print('\n' + 'No. of followers scraped for ' + influencers[i].decode('utf-8') + ' : ' + str(len(user_data)))
        scraper.close()

        # save data for each user
        file_name = 'data/followers_' + influencers[i].decode('utf-8') + '.pickle'
        with open(file_name, 'wb') as file:
            pickle.dump(user_data, file)

        # track done list of users
        with open('done_list.txt', 'a') as file:
            file.write(influencers[i].decode('utf-8') + '\n')
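A small follow-up sketch, not part of the original example, showing how one of the pickled follower lists written above could be read back later; the file name is hypothetical:

import pickle

# Load a previously saved follower list (hypothetical file name for illustration).
with open('data/followers_example_user.pickle', 'rb') as file:
    followers = pickle.load(file)
print('Followers loaded:', len(followers))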
def main():
    # initialize core objects
    scraper = Scraper(Config.praw('CLIENT_ID'), Config.praw('CLIENT_SECRET'),
                      Config.praw('USER_AGENT'), Config.praw('USERNAME'),
                      Config.praw('PASSWORD'))
    parser = Parser()
    notifier = Notifier(Config.twilio('ACCOUNT_SID'), Config.twilio('AUTH_TOKEN'))

    # initialize time for loop
    startTime = time.time()

    while True:
        try:
            # grab last 100 new mechmarket posts
            posts = scraper.grabNewPosts('mechmarket', 100)

            # loop through posts
            for post in posts:
                have = parser.parseHave(post.title)

                # does this need the or? or is the search case insensitive?
                if parser.keywordSearch('milkshake', have) or parser.keywordSearch('Milkshake', have):
                    # notify if we found it
                    notify(notifier, post,
                           f'Milkshake found '
                           f'{post.title} '
                           f'{post.url} ')

            # sleep for 60 seconds
            time.sleep(60.0 - ((time.time() - startTime) % 60.0))
            print(f'Starting new loop at {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}')
        except:
            # disable during testing
            # notifier.sendMessage('8042100901','17572092974','Something broke and the bot has stopped')
            break
def executeScript(self):
    scraper = Scraper()
    tempName = self.animeName.get()
    tempName = tempName.split()
    tempName.append('episode')

    animeName = ""
    for i in tempName:
        animeName = animeName + i + "-"
    permAnimeName = animeName

    self.count = 1
    a = True
    while a:
        animeName = animeName + str(self.count)
        a = scraper.getHTMLTags("dummy string", animeName)
        animeName = permAnimeName
        self.count += 1

    self.writeToCSV()
def startScraping(keyword=None):
    global resultDict
    SID = request.cookies.get(session.sessionID, None)

    if keyword == "":
        return """
        <script>
        window.location.href = "/"
        </script>"""

    if SID in scraperDict.keys():
        if threadDict[SID].is_alive():
            return "ただいま処理中です。お待ちください。{now}".format(now=str(datetime.datetime.now()))
        else:
            try:
                with open(resultJsonPath + keyword, "r", encoding="utf-8") as res:
                    return res.read()
            except FileNotFoundError:
                response = make_response("""<script>
                window.location.href = "/scraping/{keyword}"
                </script>""".format(keyword=keyword))
                max_age = liveLimit
                expires = int(datetime.datetime.now().timestamp()) + max_age
                response.set_cookie(session.sessionID, value=str(session.getSID()), max_age=max_age,
                                    expires=expires, path='/', secure=None, httponly=False)
                return response
    else:
        scraperDict[SID] = Scraper.Scraper(0, 0)
        threadDict[SID] = threading.Thread(target=wrapScraping, args=(scraperDict[SID], keyword, 10,))
        threadDict[SID].daemon = True
        threadDict[SID].start()
        return "ただいま処理中です。お待ちください。{now}".format(now=str(datetime.datetime.now()))
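The wrapScraping function used as the thread target above is not shown; a minimal sketch, assuming the Scraper object exposes a scrape(keyword, limit) entry point (hypothetical method name) and that results are cached as JSON at resultJsonPath + keyword, where startScraping looks for them on the next request:

import json

def wrapScraping(scraper, keyword, limit):
    # Hypothetical: run the scrape for this keyword, then cache the result to disk.
    # resultJsonPath is the module-level path used by startScraping above.
    results = scraper.scrape(keyword, limit)
    with open(resultJsonPath + keyword, "w", encoding="utf-8") as out:
        json.dump(results, out, ensure_ascii=False)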
def find_courses():
    username = "******"
    password = get_password()

    driver = Driver('firefox')
    driver.home()
    assert (driver.login(username, password))
    driver.goto_menu("Onderwijs")
    driver.fillout()

    scraper = Scraper(driver)
    scraper.find_courses()
    scraper.print_courses()
    scraper.find_courses_names()
    scraper.scrape_course_elements()
    driver.shutdown()

    with open("courses.dat", 'wb+') as courses_file:
        pickle.dump(scraper.courses, courses_file)

    for course in scraper.courses:
        print(course)
    return scraper.courses
#encoding:utf-8
import sys
import Queue
from Scraper import *
import time
import urlparse
import bs4
import threading

test_obj = Scraper(single_page=False, workers_num=15)
test_obj.feed(['http://www.freebuf.com'])
time.sleep(5)

z = test_obj.get_result_urls_queue()
while True:
    try:
        print z.get(timeout=4)
    except:
        break
'''
Created on Oct 19, 2017

@author: FB
'''
import datetime
from pandas.io.pytables import HDFStore
import Scraper
import os

if __name__ == '__main__':
    '''TODO: Dynamic lookup of the date up to which the database already holds patents'''
    startdate = datetime.date(2004, 3, 13)
    waiting_time = 50  # Waiting time to not overstrain google
    cwd = os.getcwd()
    download_dir = cwd + "\\PatentScraper"

    Patent_info = Scraper().downloadCSV(startdate=startdate,
                                        waiting_time=50,
                                        download_dir=download_dir,
                                        DaysTillStore=10)
import Scraper

foo = Scraper.Scraper(wr_bb_100=10, wr_observed=10, std_dev=140, hands=151)

#foo.scrape_page_x_times(500, file_name="rawdatafrom500scrapes.csv")
#print(foo.average_with_rake_exclude_first_x_cols(rake=3, cols=8))
#foo.sort_data_by_plus_minus(file_name="plus_minus_500_scrapes.csv")

foo.read_raw_file_from_csv("rawdatafrom500scrapes.csv")
#foo.sort_data_by_plus_minus(file_name="plus_minus_500.csv")
print(foo.average_with_rake_exclude_first_x_cols())
#print(foo.get_avg_of_plus_minus(two_col_array=foo.read_plus_minus_file_from_csv("plus_minus_1k_no_bw.csv")))
import requests
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from fake_useragent import UserAgent

import Scraper

if __name__ == '__main__':
    s = Scraper.Scraper()
    s.scrape_product_page()
'''
DeviantArt Image downloader - Takes one or more keywords as input and downloads
the specified number of images from deviantArt
'''
import sys, os
import re
import requests
import Downloader
import Scraper

#Input keywords
keywords = input("Enter search terms : ").split()

d = Downloader.Downloader()
sc = Scraper.Scraper()

#Download the corresponding DeviantArt page
response = d.downloadPage(keywords)

#Scrape all page urls
linkTags = sc.getLinkTags(response)

#Open all page urls, download them and scrape the image urls along with names
imageURLs, imageNames = sc.getImageTags(linkTags)

#Create a new directory in C:\Users\HP\Pictures\Scraped if it does not already exist. Then switch to that directory
try:
    os.chdir(r'C:\Users\HP\Pictures\Scraped' + '\\' + " ".join(keywords).title())
except FileNotFoundError:
    os.makedirs(r'C:\Users\HP\Pictures\Scraped' + '\\' + " ".join(keywords).title())
    os.chdir(r'C:\Users\HP\Pictures\Scraped' + '\\' + " ".join(keywords).title())
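The excerpt stops after switching into the output directory; a minimal continuation sketch (not part of the original), assuming imageURLs and imageNames are parallel lists of direct image URLs and base file names — the .jpg extension is an assumption:

# Download each scraped image into the current directory.
for url, name in zip(imageURLs, imageNames):
    image = requests.get(url)
    with open(name + '.jpg', 'wb') as f:
        f.write(image.content)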
def main(argv):
    logging.basicConfig(filename='AutoDownload.log',
                        level=logging.INFO,
                        format='%(asctime)s - %(levelname)s:%(message)s')

    # define a Handler which writes INFO messages or higher to the sys.stderr console
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    # add the handler to the root logger
    logging.getLogger('').addHandler(console)

    rootlogger = logging.getLogger('AutoDownload')
    logger = logging.getLogger('AutoDownload.main')
    logger.info("*** START ***")
    logger.info("command args: [%s]", " ".join(argv))

    try:
        current_month = date.today().month
        current_year = date.today().year
        current_day = 0
        month_year_str = '{!s}-{!s}'.format(month_digit_to_string(current_month), str(current_year))
        cfgstr = "config.ini"
        card_type = 'cd'
        statement_issued_time = 'd'
        security_mode = False

        try:
            opts, args = getopt.getopt(argv, "hdasc:m:t:", ["month="])
        except getopt.GetoptError:
            logger.exception("usage error")
            print_usage()
            sys.exit(2)

        for opt, arg in opts:
            if opt == '-h':
                print_usage()
                sys.exit()
            elif opt == '-c':
                cfgstr = arg
            elif opt == "-m":
                statement_issued_time = 'm'
                month_year_str = arg
            elif opt == '-d':
                statement_issued_time = 'd'
                current_day = int(date.today().day)
            elif opt == '-a':
                statement_issued_time = 'a'
            elif opt == '-t':
                card_type = arg
            elif opt == '-s':
                security_mode = True
            else:
                print("Invalid arguments!")
                logger.error("invalid argument [%s]", opt)
                return

        cfg_path = Path(cfgstr)
        if not cfg_path.is_file():
            print("missing config file")
            logger.error("missing config file (%s)", str(cfg_path))
            return

        cfg = configparser.ConfigParser()
        cfg.read(str(cfg_path))

        new_log_level = logging.getLevelName(cfg['Log']['level'])
        logging.getLogger('AutoDownload').setLevel(new_log_level)
        console.setLevel(new_log_level)
        logger.info("log level is %s", logger.getEffectiveLevel())

        scraper = Scraper.Scraper(cfg)
        scraper.download(statement_issued_time, month_year_str, current_day, card_type, security_mode)
        logger.info("--- DONE ---")
    except:
        logger.exception("!!! FAILED !!!")
        sys.exit(3)

    logger.info("### END ###")
#encoding:utf-8
import sys
import Queue
from Scraper import *
import time
import urlparse
import bs4
import threading

test_obj = Scraper(single_page=True, workers_num=15)
test_obj.feed(['http://freebuf.com'])
time.sleep(5)

z = test_obj.get_result_urls_queue()
while True:
    try:
        print z.get(timeout=4)
    except:
        print "error"
        break
# imports needed to run this excerpt standalone (not shown in the original snippet)
import os
from Scraper import *

from FileDownloader import *
from ArchiveExtractor import *

current_folder = os.path.dirname(os.path.realpath(__file__))

brand_name = "HARTÔ"
skip_web_scraping = False
download_files = True

output_json = []
output_json_path = current_folder + "/exported_datas/" + brand_name.lower() + "_product_pages_data.json"

# init the scraper
scraper = Scraper(export_dir=current_folder + "/exported_datas",
                  download_dir=current_folder + "/downloads_product_pages")

# >>> brand data ----------------------------------------------------------------------------------------------------

# name that will be used in the database
scraper.set_brand_name(brand_name)

# homepage of the site to scrape
# sometimes the links are relative, so homepage + link must be combined to get the correct url
scraper.homepage = "https://www.hartodesign.fr/"

# first page to load, not necessarily the homepage
scraper.load_page(scraper.homepage)

# <<< brand data ----------------------------------------------------------------------------------------------------
def do_GET(self):
    mode = getValue("mode", self.requestline)
    message = "%s is not a mode" % mode

    if mode == "echo":
        message = "Hola Mundo"
    if mode == "init":
        message = getHTML(None)
    if mode == 'train':
        modelName = getValue("modelName", self.requestline)
        ml = Classifier()
        ml.train()
        ml.save(modelName)
        message = "Training Successful"
    if mode == "test":
        print(self.requestline)
        link = getValue("link", self.requestline)
        modelName = getValue("modelName", self.requestline)
        scraper = Scraper()
        scraper.scrape(link)
        ml = Classifier()
        ml.load(modelName)
        outputs = ml.test('./Articles/UserQuery.txt')
        result = biasCalculation(outputs)
        message = getHTML(result)
    # else:
    #     print(self.requestline)
    #     string = self.requestline
    #     string = string.replace("GET /", "")
    #     index = string.index(" ")
    #     file = open(string[:index],"rb")
    #     message = "hello"#file.read()
    #     return file(string)

    # Send response status code
    self.send_response(200)

    # Send headers
    self.send_header('Content-type', 'text/html')
    self.send_header("Access-Control-Allow-Origin", "*")
    self.send_header('Access-Control-Allow-Headers', 'Content-Type,Authorization')
    self.send_header('Access-Control-Allow-Methods', 'GET')
    self.end_headers()

    # Write content as utf-8 data
    self.wfile.write(bytes(message, "utf8"))
    return
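The getValue helper called throughout do_GET is not defined in this excerpt; a minimal sketch, assuming it pulls a named query parameter out of the raw HTTP request line (e.g. "GET /?mode=test&link=... HTTP/1.1"):

from urllib.parse import urlparse, parse_qs

def getValue(name, requestline):
    # requestline looks like "GET /?mode=test&... HTTP/1.1"; take the path component.
    path = requestline.split(" ")[1]
    params = parse_qs(urlparse(path).query)
    # parse_qs maps each parameter to a list of values; return the first, or None.
    return params.get(name, [None])[0]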
def scrape():
    scraper = Scraper()
    print('Done!')
import logging
import pymongo
import schedule
import time
from Scraper import *

logging.basicConfig(
    filename='scheduledscraperlog.log',
    format='%(asctime)s %(levelname)s %(name)s %(threadName)s : '
           '%(message)s',
    level=logging.INFO)

url = 'https://www.espn.com/nba/scoreboard'
scraper = Scraper(url)

mongo_url = 'mongodb+srv://dbBoris:[email protected]/test?retryWrites=true&w=majority'
mongo_client = pymongo.MongoClient(mongo_url, maxPoolSize=50, connect=False)
db = pymongo.database.Database(mongo_client, 'games_database')
collection = pymongo.collection.Collection(db, 'games_collection')

scraper.scrape()  # Initial scraping
logging.info("Scheduled scraper initialized.")
schedule.every(1).minutes.do(scraper.scrape)

while 1:
    schedule.run_pending()
    for game in scraper.data:
        # New games enter the database, while existing games (match both name and date) are updated (only if needed).
        if not db.collection.count_documents({
                'name': game.get('name'),
from os import listdir
import re
import json
from Scraper import *

scraper = Scraper()

brand_name = "VERPAN"

new_json = []

# new_format = {
#     "name": product name
#     "details": component name
#     "collection": collection name
#     "type": string
#     "designers": string[],
#     "description": string,
#     "brandWebsiteUrl": string
#     "width": int in mm,
#     "height": int in mm,
#     "depth": int in mm,
#     "seatHeight": int in mm,
#     "diameter": int in mm,
#     "weight": int in g,
#     "designedDate": 4-digit year,
#     "thumbnail": string,
#     "sourceFiles": string[]
# }

# value of type
# - Furniture
def update():
    # field initialization
    host = "https://hamstudy.org"
    first, last = (0, 5)  # control variable range for extracting specific links
    print("Web Host: ")
    scraper = Scraper(host)
    secondaryURL = scraper.href(last)
    allQuestions = []
    allanswers = []
    allcorrectAnswers = []
    print()

    # gets the different sections (Technician, General, Amateur Extra)
    first, last = (5, 8)
    sections = scraper.href(first, last)

    # loops through each section and gets the questions and answers
    for type in sections:
        typeHandle = type.replace('/', '')
        link = host + type
        scraper.modifyURL(link)
        first, last = (0, 17)
        extension = host + ''.join(scraper.href(21, 22))

        # open Main Links
        scraper.modifyURL(extension)
        first, last = (7, 17)
        mainLinks = scraper.href(first, last)

        # getting sublinks from mainLinks
        print("\nTop Links in " + typeHandle + ": ")
        first, last = (17, None)
        allLinks = []
        for extension in mainLinks:
            scraper.modifyURL(host + extension)
            subLinks = scraper.href(first, last)
            allLinks.append(subLinks)

        # clear files before adding contents
        open('Questions.txt', 'w').close()
        open('Answers.txt', 'w').close()
        open('CorrectAnswers.txt', 'w').close()

        # Opening all links
        print("\nOpening all links " + typeHandle + ": ")
        for links in allLinks:
            for link in links:
                scraper.modifyURL(host + link)
                sectionQuestions = scraper.get_web_element("div", "question")
                sectionAnswers = scraper.get_web_element("li", "answer")
                correctAnswers = scraper.get_web_element("li", "answer correct")

                with open("data/" + typeHandle + 'Questions.txt', 'a') as fileHandle:
                    for question in sectionQuestions:
                        fileHandle.write('%s\n' % question.text.strip())

                with open("data/" + typeHandle + 'Answers.txt', 'a') as fileHandle:
                    for answers in sectionAnswers:
                        fileHandle.write('%s\n' % answers.div.text)

                with open("data/" + typeHandle + 'CorrectAnswers.txt', 'a') as fileHandle:
                    for answer in correctAnswers:
                        fileHandle.write('%s\n' % answer.label.text[0].strip())
import json
import Scraper
#import CourseCatalog as CC
from TermMasterSchedule import Quarters

scraper = Scraper.Scraper()
data: dict = scraper.scrapeTMS(terms=["Fall Quarter 20-21"],
                               colleges=["Arts and Sciences"],
                               subjects=["GEO"])

assert (len(data) != 0), "No data was found!"

json.dump(data, open('temp_TMS.json', 'w'))
from Scraper import *
from config import *
import requests
from bs4 import BeautifulSoup

DDSM_page = requests.get(DDSM_url)
DDSM_html = BeautifulSoup(DDSM_page.text)
DDSM_table = DDSM_html.find('table')
DDSM_links = DDSM_table.findAll('a')

Scraper(DDSM_links)