def Search_Name(self):
    # Validate the search term, then crawl up to 100 images for it.
    query = self.crawlName.get()
    if query:
        if wc.Allow_Certain_Folder_Name(query):
            wc.Crawling_Image(query, 100)
        else:
            self.Set_Progress_Message('The search term cannot contain \\%/:*?"<>|. characters.')
    else:
        self.Set_Progress_Message("Please enter a search term.")
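# A minimal sketch of what wc.Allow_Certain_Folder_Name could look like,
# assuming it simply rejects the characters that are invalid in folder names
# (hypothetical; the real implementation lives in the wc module and may differ):
def Allow_Certain_Folder_Name(name):
    """Return True if `name` contains none of the forbidden characters."""
    forbidden = '\\%/:*?"<>|.'
    return not any(ch in forbidden for ch in name)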
def __init__(self, percept):
    # Restrict crawled links to the investopedia domain.
    Agent.domain_name = 'investopedia.com'
    # Sensor: the last parameter keeps all crawled links inside the investopedia website.
    Agent.crawler = WebCrawler('Group 8 AI Project', percept, Agent.domain_name)
    # Check the info on the crawler
    # print(Agent.crawler.project_name)
    # States / Knowledge: stock tickers to track, AAPL (Apple), AMZN (Amazon),
    # NFLX (Netflix); can be changed.
    self.model = {'AAPL', 'AMZN', 'NFLX'}
    # Pandas graph initialization
    self.graph = Graph(self.model)
def start(self):
    print "Start Crawling!"
    for country in range(COUNTRY_AMOUNT):
        print "######################### " + url[country] + " #########################"
        crawler = WebCrawler.WebCrawler(url[country])
        crawler.crawl()
        crawler.write_files(countries[country])
        result = crawler.get_brand_score()
        self.crawlers[country] = crawler
        total = 0  # running sum of brand scores (avoids shadowing the built-in sum)
        if result:
            for bd in range(BRAND_AMOUNT):
                total += float("{0:.2f}".format(result[brand[bd]]))
            for bd in range(BRAND_AMOUNT):
                if result[brand[bd]]:
                    score = float("{0:.2f}".format(result[brand[bd]]))
                    self.setTableCell(country + 2, bd + 1,
                                      ('%.1f' % (score * 100.0 / total)) + '%(' + ('%.1f' % score) + ')')
                else:
                    self.setTableCell(country + 2, bd + 1, '0%(0)')
    tkMessageBox.showinfo("Info", "Crawling done!")
    self.flag = 1
def __init__(self, master, rows=6, columns=10):
    self.root = master
    # One crawler per country row; "ukurl" is the placeholder starting URL.
    self.crawlers = [WebCrawler.WebCrawler("ukurl") for i in range(rows)]
    self.flag = 0
    master.minsize(width=860, height=320)
    master.maxsize(width=860, height=320)
    start = Button(master, text="Start", command=self.start)
    start.grid(row=9, column=0, pady=5, padx=5)
    export = Button(master, text="Export", command=self.export)
    export.grid(row=9, column=1, pady=5, padx=5)
    clear = Button(master, text="Clear", command=self.clear)
    clear.grid(row=9, column=2, pady=5, padx=5)
    exit_button = Button(master, text="Exit", command=master.quit)
    exit_button.grid(row=9, column=3, pady=5, padx=5)
    # Column headers and country row labels.
    for column in range(columns):
        title = Label(master, text=titles[column], borderwidth=1, font=("Helvetica", 15))
        title.grid(row=0, column=column, sticky="nsew", padx=19, pady=5)
    for row in range(rows):
        content = Label(master, text=countries[row], borderwidth=1, font=("Helvetica", 15))
        content.grid(row=2 + row, column=0, sticky="nsew", padx=14, pady=10)
    self.clear()
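# The start() method above relies on self.setTableCell, which is not shown. A
# minimal sketch of how it could work with this Label-based grid (hypothetical
# helper; the real widget handling may differ):
def setTableCell(self, row, column, text):
    """Place `text` at (row, column) of the table as a plain Label."""
    cell = Label(self.root, text=text, borderwidth=1, font=("Helvetica", 12))
    cell.grid(row=row, column=column, sticky="nsew", padx=5, pady=5)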
def display_appids():
    """Print every game with its appid to the console."""
    clear = lambda: os.system('cls')  # clear the console (Windows)
    clear()
    results = wc.get_appids()
    for k in range(len(results)):
        name = str(results[k][0]).replace("\t", "").replace("\n", "").replace("\r", "")
        appid = (str(results[k][1])
                 .replace("http://steamcommunity.com/market/search?appid=", "")
                 .replace("\t", "").replace("\n", "").replace("\r", ""))
        print name + " (" + appid + ")"
def display(appID):
    """Print the top ten most popular items, the amount available, and the current price."""
    crawl_data = wc.crawl_page(appID)
    if len(crawl_data) != 0:
        clear = lambda: os.system('cls')  # clear the console (Windows)
        clear()
        print("Amount\tPrice\tName\n")
        for k in range(len(crawl_data)):
            print(str(crawl_data[k][1]) + "\t" +
                  str(crawl_data[k][2]).replace(" USD", "") + "\t" +
                  str(crawl_data[k][0]))
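# A possible way to drive the two helpers above, assuming appids are passed as
# plain strings just as the crawl URL embeds them (hypothetical usage, not part
# of the original tool):
if __name__ == "__main__":
    display_appids()   # list every game with its appid
    display("730")     # show the top items for one appid, e.g. 730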
def PatternRecogWorker(self):
    wc = wcr.WebCrawler()
    ## get the list of symbols from wikipedia; we will implement this function later:
    ## symbollist_html = wc.url_open(symbollist_url)
    # debug with a specific symbol:
    # indiceslist = ['S&P500_test']
    indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']
    ## read the list of symbols from a local file:
    for eachindex in indiceslist:
        print('>>>>>>>>>>start processing ' + eachindex + '<<<<<<<<<<')
        with open('C:\\MyProjects\\PatternRecog\\' + eachindex + '.md') as f:
            symbolList = f.read().split(sep='|')
        for each in symbolList:
            print('processing ' + each)
            symboldata_url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + each +
                              '?region=US&lang=en-US&includePrePost=false'
                              '&interval=1d&range=2y&corsDomain=finance.yahoo.com')
            df = dfy.DataFactory()
            df.json_digest(wc.url_open(symboldata_url))
            if df.is_excepttion == 0:
                # bug fix for when today's volume is None.
                if df.volume[-1] is not None:
                    # skip thinly traded symbols (volume below 150,000):
                    if df.open_price != 0 and df.volume[-1] > 150000:
                        engine = rce.RecognitionEngine()
                        engine.timespansChecker(eachindex, each, df.open_price, df.close_price,
                                                df.high_price, df.low_price, df.volume[-1])
            time.sleep(2)
    print('done with pattern recognition jobs, start merging data and writing it to excel')
    merg_indiceslist = ['RussellMidCap', 'Russell2000']
    # merg_indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']
    listtype = ['_HighAlert']
    # listtype = ['_HighAlert', '_WatchList']
    for each in merg_indiceslist:
        for lt in listtype:
            mergexl = exl.merge2Excel()
            mergexl.merge2ExcelWorker(each, lt)
def todaypricefetcher(self):
    webc = wc.WebCrawler()
    # go through each symbol in the kaeo
    for eachSymbol in self.today_exl.keys():
        print('KAEO: processing ' + eachSymbol)
        symboldata_url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + eachSymbol +
                          '?region=US&lang=en-US&includePrePost=false'
                          '&interval=1d&range=2d&corsDomain=finance.yahoo.com')
        df = dfy.DataFactory()
        df.json_digest(webc.url_open(symboldata_url))
        if df.is_excepttion == 0:
            if df.close_price != 0:
                # compute the day-over-day change in percent:
                change = ((df.close_price[-1] - df.close_price[0]) / df.close_price[0]) * 100
                todaycell = ('[' + '%.2f' % change + '%_' + '%.2f' % df.close_price[-1] +
                             '<' + '%.1f' % (df.volume[-1] / 1000000) + 'M>')
                self.today_exl[eachSymbol].append(todaycell)
        time.sleep(2)
# Fragment of a message-handling loop: dispatch on the message text.
msg = message.text
print str(receiver) + "\t" + str(sender) + "\t" + str(message)
m = regexp.REMatcher(msg)
if str(msg).strip() == "None":
    print("It's a Sticker")
    break
elif m.match(r't-bot'):
    msg = "Dominus."
elif m.match(r'/Weather\s*(.*)'):
    msg = str(weather.get(m.group(1)))
elif re.search(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', msg, re.I):
    # Fetch each link in the message and reply with what the crawler returns.
    m = re.finditer(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', msg, re.I)
    print "Link detect"
    if m:
        for link in m:
            URL = link.group(0)
            print URL
            msg = str(WebCrawler.get(URL))
            if msg == 'NO':
                print "Yahoo internal URL"
                break
            else:
                receiver.sendMessage("%s" % msg)
                break
else:
    break
receiver.sendMessage(" %s" % (msg))
# receiver.sendMessage("[%s] %s" % (sender.name, msg))
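# The fragment above expects WebCrawler.get(URL) to return page text or the
# sentinel 'NO' for links it refuses (the log message suggests Yahoo-internal
# URLs). A minimal sketch of that contract, assuming a simple title scrape and
# reusing the re module the fragment already imports (hypothetical; the real
# module may extract different content):
import urllib2

def get(URL):
    """Return the page title for URL, or 'NO' when it cannot be fetched."""
    try:
        html = urllib2.urlopen(URL, timeout=5).read()
    except Exception:
        return 'NO'
    match = re.search(r'<title>(.*?)</title>', html, re.I | re.S)
    return match.group(1).strip() if match else 'NO'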
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
http://blog.csdn.net/cashey1991/article/details/6262704
'''
import WebCrawler

url = 'http://www.baidu.com/'
thNumber = 5   # number of crawler threads
Maxdepth = 2   # maximum crawl depth

wc = WebCrawler.WebCrawler(thNumber, Maxdepth)
wc.Craw(url)
print WebCrawler.getUrl()
print '**********************'
from WebCrawler import *

wc = WebCrawler()
# urls = wc.fetch_urls_from_source('http://wikipedia.org')
# url = wc.normalize_url('/', 'wiki.org')
# print(url)
wc.crawl()
# print(Helper.get_domain("http://us.rd.yahoo.com/finance/news/rss/story/*http://finance.yahoo.com/news/tokyo-gas-no-hurry-buy-113424587.html"))
# print(wc.get_disallowed_sites("http://stackoverflow.com/", "*"))
# if wc.is_allowed("ssl.reddit.com/res"):
#     print('true')
# else:
#     print('false')
# CATEGORIES_DEFINITION = {
#     "": 0,
#     "Economicos/Generalistas": 1,
#     "TMT/Turismo/Mkt": 2,
#     "Imobiliario/Transportes/Jogos/TI": 3,
#     "Saude e bem-estar/Ambiente/Gastronomia/Arte Cultura e Lazer": 4,
#     "Auto/Desporto/Moda e Decoracao": 5,
#     "Regionais": 6,
#     "Blogs": 7,
#     "Outros": 8,
#     "Internacionais": 9
# }
CATEGORIES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

from WebCrawler import *
from db import *

# Initialize the module that communicates with the DB
db = Db(DB_HOST, DB_DATABASE, DB_USER, DB_PASSWORD)

# Initialize the WebCrawler
crawler = WebCrawler(WC_HOST, WC_PORT, WC_LOGPATH, CATEGORIES, MAX_SOURCE_IN_MEM, MAX_TIME_URL_UPDATE)

# Start the searches coming from the DB
db.start(crawler)

# Start server mode (so the webcrawler can answer requests coming from outside)
crawler.start()
def main(url):
    string = Web2String.url2string(url)
    print(string)
    entity = company_identifier.entity(string)
    print(entity)
    # titles, content, and sources of related news, plus an array of article URLs
    newsTitles, newsContent, newsSources, other_articles_URL = WebCrawler.getInfo(entity)

    # main URL summary
    main_summary = summary.summary(url)
    print(main_summary)
    print(newsTitles)

    # analyze sentiment of the main article
    main_article_sentiment = round(evaluate_NN.evaluate_NN(string) * 100)
    print(main_article_sentiment)

    # analyze sentiment of each related article
    other_article_sentiment = list()
    for x in range(len(other_articles_URL)):
        curr_article = Web2String.url2string(other_articles_URL[x])
        other_article_sentiment.append(round(evaluate_NN.evaluate_NN(curr_article) * 100))

    # summarize each related article
    a_summary = list()
    for x in range(len(other_articles_URL)):
        a_summary.append(summary.summary(other_articles_URL[x]))

    stock_dates, stock_data = StockToPython.stock_to_JSON(entity)

    # pack the first five related articles under the keys the frontend expects
    keys = ['one', 'two', 'three', 'four', 'five']
    rv = {
        'other_articles_titles': dict(zip(keys, newsTitles)),
        'other_articles_sources': dict(zip(keys, newsSources)),
        'other_articles_sentiment': dict(zip(keys, other_article_sentiment)),
        'other_articles_links': dict(zip(keys, other_articles_URL)),
        'article_summary': dict(zip(keys, a_summary)),
        'main_article_sentiment': main_article_sentiment,
        'company_name': entity,
        'main_summary': main_summary,
        'stock_dates': stock_dates,
        'stock_data': stock_data,
    }
    rv_json = json.dumps(rv)
    print(rv_json)
    return rv_json

# main('https://www.cnet.com/news/apples-q3-earnings-are-all-about-the-iphone-11-hints/')
def test_crawler(self):
    try:
        WebCrawler.main()
    except Exception as e:
        self.fail("Crawler failed: %s" % e)
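# For context, the unittest boilerplate this method would live in, assuming
# WebCrawler exposes the main() entry point used above (a sketch, not the
# project's actual test module):
import unittest
import WebCrawler

class CrawlerTest(unittest.TestCase):
    def test_crawler(self):
        try:
            WebCrawler.main()
        except Exception as e:
            self.fail("Crawler failed: %s" % e)

if __name__ == '__main__':
    unittest.main()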