def main():
    parser = argparse.ArgumentParser(description='Web screenshots')
    parser.add_argument(
        "source",
        help="full URL, or path to a file with the list of URLs to visit",
        type=str)
    parser.add_argument("--depth",
                        help="depth of the walk through the site",
                        type=int,
                        default=0)
    parser.add_argument(
        "--fullpage",
        help="take the screenshot in full-page format, saving several images",
        action="store_true",
        default=False)
    args = parser.parse_args()

    crawler = WebCrawler.WebCrawler()
    crawler.depth_of_walk = args.depth
    crawler.is_fullpage = args.fullpage
    if args.source.startswith("http"):
        crawler.urls_need_to_go.append(args.source)
        crawler.walk_control()
    else:
        crawler.read_from_file(args.source)
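# For context: a minimal sketch of the WebCrawler interface that main() above relies on.
# The attribute and method names are taken from the calls above; the bodies are placeholder
# assumptions, not this project's actual implementation.
class WebCrawler:
    def __init__(self):
        self.depth_of_walk = 0      # how many link levels to follow
        self.is_fullpage = False    # whether to save full-page screenshots as several images
        self.urls_need_to_go = []   # queue of URLs still to visit

    def walk_control(self):
        # Placeholder: would visit every queued URL up to depth_of_walk and take screenshots.
        for url in list(self.urls_need_to_go):
            print("would screenshot", url)

    def read_from_file(self, path):
        # Placeholder: load one URL per line, then start the walk.
        with open(path) as f:
            self.urls_need_to_go.extend(line.strip() for line in f if line.strip())
        self.walk_control()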
def __init__(self, percept):
    # Keep the crawled links within investopedia.
    Agent.domain_name = 'investopedia.com'

    # Sensor; the last parameter makes sure all crawled links stay in the investopedia website.
    Agent.crawler = WebCrawler('Group 8 AI Project', percept, Agent.domain_name)

    # Check the info on the crawler
    # print(Agent.crawler.project_name)

    # States / Knowledge
    '''
    Stocks array: [0] = apple, [1] = amazon, [2] = netflix
    '''
    # Stocks to find; can be changed.
    model = {'AAPL', 'AMZN', 'NFLX'}

    # Pandas graph initialization
    graph = Graph(model)
def start(self):
    print("Start Crawling!")
    for country in range(COUNTRY_AMOUNT):
        print("######################### " + url[country] + " #########################")
        crawler = WebCrawler.WebCrawler(url[country])
        crawler.crawl()
        crawler.write_files(countries[country])
        result = crawler.get_brand_score()
        self.crawlers[country] = crawler

        total = 0  # renamed from `sum` to avoid shadowing the built-in
        if result:
            for bd in range(BRAND_AMOUNT):
                total = total + float("{0:.2f}".format(result[brand[bd]]))
            for bd in range(BRAND_AMOUNT):
                if result[brand[bd]]:
                    score = float("{0:.2f}".format(result[brand[bd]]))
                    self.setTableCell(country + 2, bd + 1,
                                      ('%.1f' % (score * 100.0 / total)) + '%(' + ('%.1f' % score) + ')')
                else:
                    self.setTableCell(country + 2, bd + 1, '0%(0)')
    tkMessageBox.showinfo("Info", "Crawling done!")
    self.flag = 1
def __init__(self, master, rows=6, columns=10):
    self.root = master
    self.crawlers = [WebCrawler.WebCrawler("ukurl") for i in range(6)]
    self.flag = 0

    master.minsize(width=860, height=320)
    master.maxsize(width=860, height=320)

    start = Button(master, text="Start", command=self.start)
    start.grid(row=9, column=0, pady=5, padx=5)
    export = Button(master, text="Export", command=self.export)
    export.grid(row=9, column=1, pady=5, padx=5)
    clear = Button(master, text="Clear", command=self.clear)
    clear.grid(row=9, column=2, pady=5, padx=5)
    exit = Button(master, text="Exit", command=master.quit)
    exit.grid(row=9, column=3, pady=5, padx=5)

    for column in range(columns):
        title = Label(master, text=titles[column], borderwidth=1, font=("Helvetica", 15))
        title.grid(row=0, column=column, sticky="nsew", padx=19, pady=5)
    for row in range(rows):
        content = Label(master, text=countries[row], borderwidth=1, font=("Helvetica", 15))
        content.grid(row=2 + row, column=0, sticky="nsew", padx=14, pady=10)
    self.clear()
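# start() above writes results into the grid through self.setTableCell, which is not shown in
# these snippets. A minimal sketch of what such a helper might look like, reusing the same
# Label/grid layout as __init__; the body is an assumption, only the name matches the call sites.
def setTableCell(self, row, column, text):
    # Hypothetical helper: place (or overwrite) a Label at the given grid position.
    cell = Label(self.root, text=text, borderwidth=1, font=("Helvetica", 12))
    cell.grid(row=row, column=column, sticky="nsew", padx=5, pady=5)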
def main(self):
    self.db = Recipesdb.Recipesdb()

    # Test the connection
    # test_name = "test category"
    # test_parent_category = None
    # self.db.insert_category(test_name, None, "", test_parent_category)
    # self.db.delete_category_byname(test_name)

    self.joCrawler = WebCrawler.WebCrawler()
    starting_url = "https://www.jamieoliver.com/recipes/"
    self.crawl_link(None, starting_url, None)
def PatternRecogWorker(self):
    wc = wcr.WebCrawler()
    ## Get the list of symbols from Wikipedia; we will implement this function later:
    ## symbollist_html = wc.url_open(symbollist_url)

    # Debug with a specific symbol:
    # indiceslist = ['S&P500_test']
    indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']

    ## Read the list of symbols from a local file:
    for eachindex in indiceslist:
        print('>>>>>>>>>>start processing ' + eachindex + '<<<<<<<<<<')
        f = open('C:\\MyProjects\\PatternRecog\\' + eachindex + '.md')
        symbolList = f.read().split(sep='|')
        for each in symbolList:
            print('processing ' + each)
            symboldata_url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + each +
                              '?region=US&lang=en-US&includePrePost=false&interval=1d'
                              '&range=2y&corsDomain=finance.yahoo.com')
            df = dfy.DataFactory()
            df.json_digest(wc.url_open(symboldata_url))
            if df.is_excepttion == 0:
                # Bug fix for when today's volume is None.
                if df.volume[-1] is not None:
                    # Skip thinly traded symbols; below the volume threshold it is not worth checking:
                    if df.open_price != 0 and df.volume[-1] > 150000:
                        re = rce.RecognitionEngine()
                        re.timespansChecker(eachindex, each, df.open_price, df.close_price,
                                            df.high_price, df.low_price, df.volume[-1])
            time.sleep(2)

    print('done with pattern recognition jobs, start merging data and writing it to Excel')
    merg_indiceslist = ['RussellMidCap', 'Russell2000']
    # merg_indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']
    listtype = ['_HighAlert']
    # listtype = ['_HighAlert', '_WatchList']
    for each in merg_indiceslist:
        for lt in listtype:
            mergexl = exl.merge2Excel()
            mergexl.merge2ExcelWorker(each, lt)
def todaypricefetcher(self):
    webc = wc.WebCrawler()
    # Go through each symbol in the kaeo
    for eachSymbol in self.today_exl.keys():
        print('KAEO: processing ' + eachSymbol)
        symboldata_url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + eachSymbol +
                          '?region=US&lang=en-US&includePrePost=false&interval=1d'
                          '&range=2d&corsDomain=finance.yahoo.com')
        df = dfy.DataFactory()
        df.json_digest(webc.url_open(symboldata_url))
        if df.is_excepttion == 0:
            if df.close_price != 0:
                # Compute the change %:
                change = ((df.close_price[-1] - df.close_price[0]) / df.close_price[0]) * 100
                todaycell = ('[' + '%.2f' % change + '%_' + '%.2f' % df.close_price[-1] + '<' +
                             str('%.1f' % (df.volume[-1] / 1000000)) + 'M>')
                self.today_exl[eachSymbol].append(todaycell)
        time.sleep(2)
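# Both fetchers above build the Yahoo Finance chart URL by hand-concatenating the query string.
# A small illustrative sketch of assembling the same query with urllib.parse.urlencode, which
# avoids escaping mistakes; the endpoint and parameters are copied from the snippets above and
# the helper name is an assumption, not part of these projects.
from urllib.parse import urlencode

def build_chart_url(symbol, data_range='2d'):
    # Assemble the Yahoo Finance v8 chart URL used by the workers above.
    params = {
        'region': 'US',
        'lang': 'en-US',
        'includePrePost': 'false',
        'interval': '1d',
        'range': data_range,
        'corsDomain': 'finance.yahoo.com',
    }
    return 'https://query1.finance.yahoo.com/v8/finance/chart/' + symbol + '?' + urlencode(params)

# Example: build_chart_url('AAPL', '2y')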
# -*- coding: utf-8 -*-
import WebCrawler, re

url = raw_input('Set the entry URL (e.g. http://www.baidu.com): \n')
thNumber = int(raw_input('Set the number of threads: '))  # cast to int; the missing conversion caused a bug before
Maxdepth = int(raw_input('Maximum search depth: '))

wc = WebCrawler.WebCrawler(thNumber, Maxdepth)
wc.Craw(url)

title_file = open(r'titles/title_and_link.txt', 'w')
for key, value in WebCrawler.title_link.items():
    if len(key) > 1 and len(value) > 1:
        # key = re.sub('\n+', ' ', key)
        # key = re.sub('\n', ' ', key)
        title_file.write(key)
        title_file.write(value + "\n")
        # print type(key), type(value), "\n"
# print title_file
title_file.close()

title_filename = open(r'titles/title_and_file.txt', 'w')
for key, value in WebCrawler.title_file.items():
    if len(key) > 1 and len(value) > 1:
        # key = re.sub('\n+', ' ', key)
        # key = re.sub('\n', ' ', key)
        title_filename.write(key)
        title_filename.write(value + "\n")
        # print type(key), type(value), "\n"
# print title_file
title_filename.close()
enlaceExterno = "\n\n[+]External link: "
buscarLogin = "******"
pdf = "\n\n[+]PDF files"
totales = "Totals: "
indice = "\n[+]Index: "
ayudaPdf = "\nType <int-int>: To download a range\nType <int,int,*>: To download the selected files\nEnter a negative number to exit\n"
descargarPdf = "\n\n[+]Download PDF...\n"

imprimirCabecera()

# Code execution
if paginaAtacar == None:
    imprimirAyuda()
else:
    print obtenerEnlaces
    paginaReal = getPaginaPrincipal(paginaAtacar)
    webCrawler = WebCrawler.WebCrawler(paginaReal, limiteHilos, limiteTiempo)
    webCrawler.agregarEnlace(paginaAtacar)
    paginasAnalizadas = 0

    # Work out the main domain
    enlaceDividido = paginaReal.split("/")[2]
    dominioDividido = enlaceDividido.split(".")
    objetivo = dominioDividido[len(dominioDividido) - 2]

    while paginasAnalizadas <= limitePaginas:
        try:
            if webCrawler.getNumeroHilos() < webCrawler.getLimiteHilos():
                pagina = webCrawler.getEnlaces()[paginasAnalizadas]
                hilo = HiloAnalizar.HiloAnalizar(pagina, paginaReal, webCrawler)
                hilo.start()
                tiempoEjecucion = webCrawler.getHoraActual() - webCrawler.getHoraInicio()
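# The block above extracts the main domain by slicing the URL by hand (paginaReal.split("/")[2]),
# which breaks on URLs without a scheme or with a port. A small sketch of the same idea using the
# standard library instead (Python 3 urllib.parse; in the Python 2 style of the snippet above the
# import would come from urlparse). The helper name is hypothetical and not part of this project.
from urllib.parse import urlparse

def main_domain(url):
    # Return the second-to-last host label, e.g. 'example' for 'https://www.example.com/page'.
    host = urlparse(url).hostname or ''
    parts = host.split('.')
    return parts[-2] if len(parts) >= 2 else host

# Example: main_domain("https://www.example.com/index.html") returns 'example'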
import WebCrawler

wc = WebCrawler.WebCrawler("baseURLs")
wc.crawl()
print(wc.get_brand_score())
# -*- coding: cp936 -*-
import WebCrawler

url = raw_input('Set the entry URL (e.g. http://www.baidu.com): \n')
thNumber = int(raw_input('Set the number of threads: '))  # cast to int; the missing conversion caused a bug before
wc = WebCrawler.WebCrawler(thNumber)
wc.Craw(url)
from WebCrawler import *

wc = WebCrawler()
# urls = wc.fetch_urls_from_source('http://wikipedia.org')
# url = wc.normalize_url('/', 'wiki.org')
# print(url)
wc.crawl()
# print(Helper.get_domain("http://us.rd.yahoo.com/finance/news/rss/story/*http://finance.yahoo.com/news/tokyo-gas-no-hurry-buy-113424587.html"))
# print(wc.get_disallowed_sites("http://stackoverflow.com/", "*"))
# if wc.is_allowed("ssl.reddit.com/res"):
#     print('true')
# else:
#     print('false')
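# The commented-out calls above (get_disallowed_sites, is_allowed) suggest this crawler handles
# robots.txt itself. For comparison, a minimal sketch of the same check with the standard
# library's urllib.robotparser; this is a generic alternative, not this WebCrawler's implementation.
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

def is_allowed(url, user_agent="*"):
    # Fetch and parse the site's robots.txt, then check whether `url` may be crawled.
    parts = urlparse(url)
    robot_parser = RobotFileParser()
    robot_parser.set_url(parts.scheme + "://" + parts.netloc + "/robots.txt")
    robot_parser.read()
    return robot_parser.can_fetch(user_agent, url)

# Example: is_allowed("https://stackoverflow.com/questions", "*")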
#######################################################
# Web-Crawler implementation to test.
#######################################################

# Initialize the web crawler params
project_name = 'ExtraCredit'
##################################################
# Can change this url to change testing methods.
base_url = 'https://www.investopedia.com'
domain_name = 'investopedia.com'

# Set up the state space to undergo the search
crawler = WebCrawler(project_name, base_url, domain_name)
crawler.crawl(project_name, base_url, 0)
# crawler.setup()

# Call the iterative deepening algorithm, searching for certain links
start_time = time.time()

# The problem to solve: finding the link for a stock's information, e.g. 'Domain_name.com/AAPL'
goal = {'tsla'}
search_result = iterative_deepening_search(goal, crawler, 2)
elapsed = time.time() - start_time
print("Time to search: " + str(elapsed))

goal = {'aapl'}
start_time = time.time()
search_result = iterative_deepening_search(goal, crawler, 2)
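# iterative_deepening_search itself is not shown in these snippets. A generic sketch of what an
# iterative deepening search over the crawler's link graph might look like, assuming a
# hypothetical crawler.get_links(url) accessor and treating a link as a goal when one of the
# goal strings appears in it; the names and the matching rule are assumptions, not this project's code.
def iterative_deepening_search(goal, crawler, max_depth, start_url='https://www.investopedia.com'):
    # Run depth-limited DFS with increasing depth limits until a goal link is found.
    def depth_limited(url, limit, visited):
        if any(g.lower() in url.lower() for g in goal):
            return url
        if limit == 0 or url in visited:
            return None
        visited.add(url)                        # avoid revisiting pages within one pass
        for link in crawler.get_links(url):     # hypothetical accessor for the crawled links
            found = depth_limited(link, limit - 1, visited)
            if found:
                return found
        return None

    for limit in range(max_depth + 1):
        result = depth_limited(start_url, limit, set())
        if result:
            return result
    return None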