def Search_Name(self):
    # Validate the search term, then crawl up to 100 images for it.
    query = self.crawlName.get()
    if query:
        if wc.Allow_Certain_Folder_Name(query):
            wc.Crawling_Image(query, 100)
        else:
            self.Set_Progress_Message('The search term cannot contain \\%/:*?"<>|. characters.')
    else:
        self.Set_Progress_Message("Please enter a search term.")
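# A minimal sketch of what wc.Allow_Certain_Folder_Name could look like,
# assuming it simply rejects the characters that are invalid in folder names
# (hypothetical; the real implementation lives in the wc module and may differ):
def Allow_Certain_Folder_Name(name):
    """Return True if `name` contains none of the forbidden characters."""
    forbidden = '\\%/:*?"<>|.'
    return not any(ch in forbidden for ch in name)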
def __init__(self, percept):
    # Restrict crawled links to the investopedia domain.
    Agent.domain_name = 'investopedia.com'
    # Sensor: the last parameter keeps all crawled links inside the investopedia website.
    Agent.crawler = WebCrawler('Group 8 AI Project', percept, Agent.domain_name)
    # Check the info on the crawler
    # print(Agent.crawler.project_name)
    # States / Knowledge: stock tickers to track, AAPL (Apple), AMZN (Amazon),
    # NFLX (Netflix); can be changed.
    self.model = {'AAPL', 'AMZN', 'NFLX'}
    # Pandas graph initialization
    self.graph = Graph(self.model)
def start(self):
    print "Start Crawling!"
    for country in range(COUNTRY_AMOUNT):
        print "######################### " + url[country] + " #########################"
        crawler = WebCrawler.WebCrawler(url[country])
        crawler.crawl()
        crawler.write_files(countries[country])
        result = crawler.get_brand_score()
        self.crawlers[country] = crawler
        total = 0  # running sum of brand scores (avoids shadowing the built-in sum)
        if result:
            for bd in range(BRAND_AMOUNT):
                total += float("{0:.2f}".format(result[brand[bd]]))
            for bd in range(BRAND_AMOUNT):
                if result[brand[bd]]:
                    score = float("{0:.2f}".format(result[brand[bd]]))
                    self.setTableCell(country + 2, bd + 1,
                                      ('%.1f' % (score * 100.0 / total)) + '%(' + ('%.1f' % score) + ')')
                else:
                    self.setTableCell(country + 2, bd + 1, '0%(0)')
    tkMessageBox.showinfo("Info", "Crawling done!")
    self.flag = 1
def __init__(self, master, rows=6, columns=10):
    self.root = master
    # One crawler per country row; "ukurl" is the placeholder starting URL.
    self.crawlers = [WebCrawler.WebCrawler("ukurl") for i in range(rows)]
    self.flag = 0
    master.minsize(width=860, height=320)
    master.maxsize(width=860, height=320)
    start = Button(master, text="Start", command=self.start)
    start.grid(row=9, column=0, pady=5, padx=5)
    export = Button(master, text="Export", command=self.export)
    export.grid(row=9, column=1, pady=5, padx=5)
    clear = Button(master, text="Clear", command=self.clear)
    clear.grid(row=9, column=2, pady=5, padx=5)
    exit_button = Button(master, text="Exit", command=master.quit)
    exit_button.grid(row=9, column=3, pady=5, padx=5)
    # Column headers and country row labels.
    for column in range(columns):
        title = Label(master, text=titles[column], borderwidth=1, font=("Helvetica", 15))
        title.grid(row=0, column=column, sticky="nsew", padx=19, pady=5)
    for row in range(rows):
        content = Label(master, text=countries[row], borderwidth=1, font=("Helvetica", 15))
        content.grid(row=2 + row, column=0, sticky="nsew", padx=14, pady=10)
    self.clear()
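# The start() method above relies on self.setTableCell, which is not shown. A
# minimal sketch of how it could work with this Label-based grid (hypothetical
# helper; the real widget handling may differ):
def setTableCell(self, row, column, text):
    """Place `text` at (row, column) of the table as a plain Label."""
    cell = Label(self.root, text=text, borderwidth=1, font=("Helvetica", 12))
    cell.grid(row=row, column=column, sticky="nsew", padx=5, pady=5)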
def display_appids():
    """Print every game with its appid to the console."""
    clear = lambda: os.system('cls')  # clear the console (Windows)
    clear()
    results = wc.get_appids()
    for k in range(len(results)):
        name = str(results[k][0]).replace("\t", "").replace("\n", "").replace("\r", "")
        appid = (str(results[k][1])
                 .replace("http://steamcommunity.com/market/search?appid=", "")
                 .replace("\t", "").replace("\n", "").replace("\r", ""))
        print name + " (" + appid + ")"
def display(appID):
    """Print the top ten most popular items, the amount available, and the current price."""
    crawl_data = wc.crawl_page(appID)
    if len(crawl_data) != 0:
        clear = lambda: os.system('cls')  # clear the console (Windows)
        clear()
        print("Amount\tPrice\tName\n")
        for k in range(len(crawl_data)):
            print(str(crawl_data[k][1]) + "\t" +
                  str(crawl_data[k][2]).replace(" USD", "") + "\t" +
                  str(crawl_data[k][0]))
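# A possible way to drive the two helpers above, assuming appids are passed as
# plain strings just as the crawl URL embeds them (hypothetical usage, not part
# of the original tool):
if __name__ == "__main__":
    display_appids()   # list every game with its appid
    display("730")     # show the top items for one appid, e.g. 730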
def PatternRecogWorker(self):
    wc = wcr.WebCrawler()
    ## get the list of symbols from wikipedia; we will implement this function later:
    ## symbollist_html = wc.url_open(symbollist_url)
    # debug with a specific symbol:
    # indiceslist = ['S&P500_test']
    indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']
    ## read the list of symbols from a local file:
    for eachindex in indiceslist:
        print('>>>>>>>>>>start processing ' + eachindex + '<<<<<<<<<<')
        with open('C:\\MyProjects\\PatternRecog\\' + eachindex + '.md') as f:
            symbolList = f.read().split(sep='|')
        for each in symbolList:
            print('processing ' + each)
            symboldata_url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + each +
                              '?region=US&lang=en-US&includePrePost=false'
                              '&interval=1d&range=2y&corsDomain=finance.yahoo.com')
            df = dfy.DataFactory()
            df.json_digest(wc.url_open(symboldata_url))
            if df.is_excepttion == 0:
                # bug fix for when today's volume is None.
                if df.volume[-1] is not None:
                    # skip thinly traded symbols (volume below 150,000):
                    if df.open_price != 0 and df.volume[-1] > 150000:
                        engine = rce.RecognitionEngine()
                        engine.timespansChecker(eachindex, each, df.open_price, df.close_price,
                                                df.high_price, df.low_price, df.volume[-1])
            time.sleep(2)
    print('done with pattern recognition jobs, start merging data and writing it to excel')
    merg_indiceslist = ['RussellMidCap', 'Russell2000']
    # merg_indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']
    listtype = ['_HighAlert']
    # listtype = ['_HighAlert', '_WatchList']
    for each in merg_indiceslist:
        for lt in listtype:
            mergexl = exl.merge2Excel()
            mergexl.merge2ExcelWorker(each, lt)
def todaypricefetcher(self):
    webc = wc.WebCrawler()
    # go through each symbol in the kaeo
    for eachSymbol in self.today_exl.keys():
        print('KAEO: processing ' + eachSymbol)
        symboldata_url = ('https://query1.finance.yahoo.com/v8/finance/chart/' + eachSymbol +
                          '?region=US&lang=en-US&includePrePost=false'
                          '&interval=1d&range=2d&corsDomain=finance.yahoo.com')
        df = dfy.DataFactory()
        df.json_digest(webc.url_open(symboldata_url))
        if df.is_excepttion == 0:
            if df.close_price != 0:
                # compute the day-over-day change in percent:
                change = ((df.close_price[-1] - df.close_price[0]) / df.close_price[0]) * 100
                todaycell = ('[' + '%.2f' % change + '%_' + '%.2f' % df.close_price[-1] +
                             '<' + '%.1f' % (df.volume[-1] / 1000000) + 'M>')
                self.today_exl[eachSymbol].append(todaycell)
        time.sleep(2)
# Fragment of a message-handling loop: dispatch on the message text.
msg = message.text
print str(receiver) + "\t" + str(sender) + "\t" + str(message)
m = regexp.REMatcher(msg)
if str(msg).strip() == "None":
    print("It's a Sticker")
    break
elif m.match(r't-bot'):
    msg = "Dominus."
elif m.match(r'/Weather\s*(.*)'):
    msg = str(weather.get(m.group(1)))
elif re.search(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', msg, re.I):
    # Fetch each link in the message and reply with what the crawler returns.
    m = re.finditer(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', msg, re.I)
    print "Link detect"
    if m:
        for link in m:
            URL = link.group(0)
            print URL
            msg = str(WebCrawler.get(URL))
            if msg == 'NO':
                print "Yahoo internal URL"
                break
            else:
                receiver.sendMessage("%s" % msg)
                break
else:
    break
receiver.sendMessage(" %s" % (msg))
# receiver.sendMessage("[%s] %s" % (sender.name, msg))
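# The fragment above expects WebCrawler.get(URL) to return page text or the
# sentinel 'NO' for links it refuses (the log message suggests Yahoo-internal
# URLs). A minimal sketch of that contract, assuming a simple title scrape and
# reusing the re module the fragment already imports (hypothetical; the real
# module may extract different content):
import urllib2

def get(URL):
    """Return the page title for URL, or 'NO' when it cannot be fetched."""
    try:
        html = urllib2.urlopen(URL, timeout=5).read()
    except Exception:
        return 'NO'
    match = re.search(r'<title>(.*?)</title>', html, re.I | re.S)
    return match.group(1).strip() if match else 'NO'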
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
http://blog.csdn.net/cashey1991/article/details/6262704
'''
import WebCrawler

url = 'http://www.baidu.com/'
thNumber = 5   # number of crawler threads
Maxdepth = 2   # maximum crawl depth

wc = WebCrawler.WebCrawler(thNumber, Maxdepth)
wc.Craw(url)
print WebCrawler.getUrl()
print '**********************'
from WebCrawler import *

wc = WebCrawler()
# urls = wc.fetch_urls_from_source('http://wikipedia.org')
# url = wc.normalize_url('/', 'wiki.org')
# print(url)
wc.crawl()
# print(Helper.get_domain("http://us.rd.yahoo.com/finance/news/rss/story/*http://finance.yahoo.com/news/tokyo-gas-no-hurry-buy-113424587.html"))
# print(wc.get_disallowed_sites("http://stackoverflow.com/", "*"))
# if wc.is_allowed("ssl.reddit.com/res"):
#     print('true')
# else:
#     print('false')
# CATEGORIES_DEFINITION = {
#     "": 0,
#     "Economicos/Generalistas": 1,
#     "TMT/Turismo/Mkt": 2,
#     "Imobiliario/Transportes/Jogos/TI": 3,
#     "Saude e bem-estar/Ambiente/Gastronomia/Arte Cultura e Lazer": 4,
#     "Auto/Desporto/Moda e Decoracao": 5,
#     "Regionais": 6,
#     "Blogs": 7,
#     "Outros": 8,
#     "Internacionais": 9
# }
CATEGORIES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

from WebCrawler import *
from db import *

# Initialize the module that communicates with the DB
db = Db(DB_HOST, DB_DATABASE, DB_USER, DB_PASSWORD)

# Initialize the WebCrawler
crawler = WebCrawler(WC_HOST, WC_PORT, WC_LOGPATH, CATEGORIES, MAX_SOURCE_IN_MEM, MAX_TIME_URL_UPDATE)

# Start the searches coming from the DB
db.start(crawler)

# Start server mode (so the webcrawler can answer requests coming from outside)
crawler.start()
def main(url):
    string = Web2String.url2string(url)
    print(string)
    entity = company_identifier.entity(string)
    print(entity)
    # titles, content, and sources of related news, plus an array of article URLs
    newsTitles, newsContent, newsSources, other_articles_URL = WebCrawler.getInfo(entity)

    # main URL summary
    main_summary = summary.summary(url)
    print(main_summary)
    print(newsTitles)

    # analyze sentiment of the main article
    main_article_sentiment = round(evaluate_NN.evaluate_NN(string) * 100)
    print(main_article_sentiment)

    # analyze sentiment of each related article
    other_article_sentiment = list()
    for x in range(len(other_articles_URL)):
        curr_article = Web2String.url2string(other_articles_URL[x])
        other_article_sentiment.append(round(evaluate_NN.evaluate_NN(curr_article) * 100))

    # summarize each related article
    a_summary = list()
    for x in range(len(other_articles_URL)):
        a_summary.append(summary.summary(other_articles_URL[x]))

    stock_dates, stock_data = StockToPython.stock_to_JSON(entity)

    # pack the first five related articles under the keys the frontend expects
    keys = ['one', 'two', 'three', 'four', 'five']
    rv = {
        'other_articles_titles': dict(zip(keys, newsTitles)),
        'other_articles_sources': dict(zip(keys, newsSources)),
        'other_articles_sentiment': dict(zip(keys, other_article_sentiment)),
        'other_articles_links': dict(zip(keys, other_articles_URL)),
        'article_summary': dict(zip(keys, a_summary)),
        'main_article_sentiment': main_article_sentiment,
        'company_name': entity,
        'main_summary': main_summary,
        'stock_dates': stock_dates,
        'stock_data': stock_data,
    }
    rv_json = json.dumps(rv)
    print(rv_json)
    return rv_json

# main('https://www.cnet.com/news/apples-q3-earnings-are-all-about-the-iphone-11-hints/')
def test_crawler(self):
    try:
        WebCrawler.main()
    except Exception as e:
        self.fail("Crawler failed: %s" % e)
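# For context, the unittest boilerplate this method would live in, assuming
# WebCrawler exposes the main() entry point used above (a sketch, not the
# project's actual test module):
import unittest
import WebCrawler

class CrawlerTest(unittest.TestCase):
    def test_crawler(self):
        try:
            WebCrawler.main()
        except Exception as e:
            self.fail("Crawler failed: %s" % e)

if __name__ == '__main__':
    unittest.main()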