Example #1
import argparse

import WebCrawler


def main():

    parser = argparse.ArgumentParser(description='Web screenshots')
    parser.add_argument(
        "source",
        help="full URL, or path to a file with the list of URLs to visit",
        type=str)
    parser.add_argument("--depth",
                        help="depth of the walk through the site",
                        type=int,
                        default=0)
    parser.add_argument(
        "--fullpage",
        help="take the screenshot in full-page format, saving several images",
        action="store_true",
        default=False)
    args = parser.parse_args()

    crawler = WebCrawler.WebCrawler()
    crawler.depth_of_walk = args.depth
    crawler.is_fullpage = args.fullpage

    if args.source.startswith("http"):
        crawler.urls_need_to_go.append(args.source)
        crawler.walk_control()
    else:
        crawler.read_from_file(args.source)
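
A minimal, self-contained sketch of how the parser defined above behaves; the script name in the comment is hypothetical:

import argparse

parser = argparse.ArgumentParser(description='Web screenshots')
parser.add_argument("source", type=str)
parser.add_argument("--depth", type=int, default=0)
parser.add_argument("--fullpage", action="store_true", default=False)

# Equivalent to running: python screenshots.py https://example.com --depth 2 --fullpage
args = parser.parse_args(["https://example.com", "--depth", "2", "--fullpage"])
print(args.source, args.depth, args.fullpage)  # https://example.com 2 True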
Example #2
	def __init__(self, percept):

		# To make sure the crawled links stay within investopedia.
		Agent.domain_name = 'investopedia.com'

		# Sensor, Last parameter is used to make sure all the links stay in the investopedia website.
		Agent.crawler = WebCrawler('Group 8 AI Project', percept, Agent.domain_name)



		# Check the info on the crawler
		# print(Agent.crawler.project_name)

		# States / Knowledge
		
		'''
		Stocks array:[0] = apple
					 [1] = amazon
					 [2] = netflix
		'''
		

		# Stocks to find, Can change.
		model = {'AAPL', 'AMZN','NFLX'}

		# Pandas Graph initialization
		graph = Graph(model)
Example #3
    def start(self):
        print "Start Crawling!"
        for country in range(COUNTRY_AMOUNT):
            print("######################### " + url[country] + " #########################")
            crawler = WebCrawler.WebCrawler(url[country])
            crawler.crawl()
            crawler.write_files(countries[country])
            result = crawler.get_brand_score()

            self.crawlers[country] = crawler

            total = 0

            if result:
                # Sum the rounded brand scores so each cell can show a percentage share.
                for bd in range(BRAND_AMOUNT):
                    total += float("{0:.2f}".format(result[brand[bd]]))

                for bd in range(BRAND_AMOUNT):
                    if result[brand[bd]]:
                        score = float("{0:.2f}".format(result[brand[bd]]))
                        self.setTableCell(country + 2, bd + 1,
                                          ('%.1f' % (score * 100.0 / total)) + '%(' + ('%.1f' % score) + ')')
                    else:
                        self.setTableCell(country + 2, bd + 1, '0%(0)')

        tkMessageBox.showinfo("Info", "Crawling done!")
        self.flag = 1
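
A minimal sketch of the score-to-percentage formatting used above, with hypothetical brand scores in place of crawler.get_brand_score() output:

result = {"BrandA": 1.5, "BrandB": 2.5}  # hypothetical scores
total = sum(float("{0:.2f}".format(v)) for v in result.values())  # 4.0
for name in result:
    score = float("{0:.2f}".format(result[name]))
    print(('%.1f' % (score * 100.0 / total)) + '%(' + ('%.1f' % score) + ')')
# prints 37.5%(1.5) and 62.5%(2.5)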
Example #4
    def __init__(self, master, rows=6, columns=10):

        self.root = master

        # Placeholder crawlers, one per country row (replaced in start()).
        self.crawlers = [WebCrawler.WebCrawler("ukurl") for _ in range(rows)]
        self.flag = 0

        master.minsize(width=860, height=320)
        master.maxsize(width=860, height=320)

        start = Button(master, text="Start", command=self.start)
        start.grid(row=9, column=0, pady=5, padx=5)

        export = Button(master, text="Export", command=self.export)
        export.grid(row=9, column=1, pady=5, padx=5)

        clear = Button(master, text="Clear", command=self.clear)
        clear.grid(row=9, column=2, pady=5, padx=5)

        exit = Button(master, text="Exit", command=master.quit)
        exit.grid(row=9, column=3, pady=5, padx=5)

        for column in range(columns):
            title = Label(master, text=titles[column], borderwidth=1, font=("Helvetica", 15))
            title.grid(row=0, column=column, sticky="nsew", padx=19, pady=5)

        for row in range(rows):
            content = Label(master, text=countries[row], borderwidth=1, font=("Helvetica", 15))
            content.grid(row=2+row, column=0, sticky="nsew", padx=14, pady=10)

        self.clear()
Example #5
    def main(self):
        self.db = Recipesdb.Recipesdb()

        #test connection
        #test_name = "test category"
        #test_parent_category = None
        #self.db.insert_category(test_name, None, "", test_parent_category)
        #self.db.delete_category_byname(test_name)

        self.joCrawler = WebCrawler.WebCrawler()
        starting_url = "https://www.jamieoliver.com/recipes/"
        self.crawl_link(None, starting_url, None)
Example #6
    def PatternRecogWorker(self):
        wc = wcr.WebCrawler()

        ## TODO: get the list of symbols from Wikipedia; to be implemented later:
        ## symbollist_html = wc.url_open(symbollist_url)

        # debug by some specific symbol:
        # indiceslist = ['S&P500_test']

        indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']

        ## Read the list of symbols from a local file:
        for eachindex in indiceslist:
            print('>>>>>>>>>>start processing ' + eachindex + '<<<<<<<<<<')
            with open('C:\\MyProjects\\PatternRecog\\' + eachindex + '.md') as f:
                symbolList = f.read().split(sep='|')

            for each in symbolList:

                print('processing ' + each)
                symboldata_url = 'https://query1.finance.yahoo.com/v8/finance/chart/' + each + '?&region=US&lang=en-US&includePrePost=false&interval=1d&range=2y&corsDomain=finance.yahoo.com'
                df = dfy.DataFactory()
                df.json_digest(wc.url_open(symboldata_url))
                if df.is_excepttion == 0:
                    # Bug fix: skip symbols where today's volume is None.
                    if df.volume[-1] is not None:
                        # Skip thinly traded symbols (daily volume under 150,000):
                        if df.open_price != 0 and df.volume[-1] > 150000:
                            engine = rce.RecognitionEngine()
                            engine.timespansChecker(eachindex, each, df.open_price,
                                                    df.close_price, df.high_price,
                                                    df.low_price, df.volume[-1])

                time.sleep(2)

        print('done with pattern recognition jobs; start merging data and writing it to Excel')

        merg_indiceslist = ['RussellMidCap', 'Russell2000']
        # merg_indiceslist = ['S&P500', 'RussellMidCap', 'Russell2000']

        listtype = ['_HighAlert']
        # listtype = ['_HighAlert', '_WatchList']

        for each in merg_indiceslist:
            for lt in listtype:
                mergexl = exl.merge2Excel()
                mergexl.merge2ExcelWorker(each, lt)
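
A minimal sketch (independent of the project's WebCrawler and DataFactory classes) of the same Yahoo Finance chart URL built with urllib instead of string concatenation:

import urllib.parse

def chart_url(symbol, data_range='2y'):
    # Same endpoint and query parameters as the hard-coded URL above.
    params = {
        'region': 'US',
        'lang': 'en-US',
        'includePrePost': 'false',
        'interval': '1d',
        'range': data_range,
        'corsDomain': 'finance.yahoo.com',
    }
    return ('https://query1.finance.yahoo.com/v8/finance/chart/'
            + urllib.parse.quote(symbol) + '?' + urllib.parse.urlencode(params))

print(chart_url('AAPL'))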
Example #7
    def todaypricefetcher(self):

        webc = wc.WebCrawler()

        # go through each symbol in the kaeo list
        for eachSymbol in self.today_exl.keys():

            print('KAEO: processing ' + eachSymbol)
            symboldata_url = 'https://query1.finance.yahoo.com/v8/finance/chart/' + eachSymbol + '?&region=US&lang=en-US&includePrePost=false&interval=1d&range=2d&corsDomain=finance.yahoo.com'
            df = dfy.DataFactory()
            df.json_digest(webc.url_open(symboldata_url))
            if df.is_excepttion == 0:
                if df.close_price != 0:

                    # compute the change %:
                    change = ((df.close_price[-1] - df.close_price[0]) /
                              df.close_price[0]) * 100
                    todaycell = '[' + '%.2f' % change + '%_' + '%.2f' % df.close_price[-1] +'<' + \
                                str('%.1f' % (df.volume[-1] / 1000000)) + 'M>'

                    self.today_exl[eachSymbol].append(todaycell)

            time.sleep(2)
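
A minimal worked sketch of the cell formatting above, with hypothetical prices and volumes in place of DataFactory output:

close_price = [100.0, 102.5]  # hypothetical closes for the 2-day range
volume = [2900000, 3200000]   # hypothetical volumes
change = ((close_price[-1] - close_price[0]) / close_price[0]) * 100  # 2.5
todaycell = '[' + '%.2f' % change + '%_' + '%.2f' % close_price[-1] + '<' + \
            str('%.1f' % (volume[-1] / 1000000)) + 'M>'
print(todaycell)  # [2.50%_102.50<3.2M>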
Example #8
# -*- coding: utf-8 -*-
import WebCrawler, re
url = raw_input('Enter the entry URL (e.g. http://www.baidu.com): \n')
thNumber = int(raw_input('Number of threads: '))  # earlier bug: the input was not converted to int
Maxdepth = int(raw_input('Maximum search depth: '))

wc = WebCrawler.WebCrawler(thNumber, Maxdepth)
wc.Craw(url)

title_file = open(r'titles/title_and_link.txt', 'w')
for key, value in WebCrawler.title_link.items():
    if len(key) > 1 and len(value) > 1:
        # key = re.sub('\n+', ' ', key)
        # key = re.sub('\n', ' ', key)
        title_file.write(key)
        title_file.write(value + "\n")
    # print type(key),type(value),"\n"
# print title_file
title_file.close()

title_filename = open(r'titles/title_and_file.txt', 'w')
for key, value in WebCrawler.title_file.items():
    if len(key) > 1 and len(value) > 1:
        # key = re.sub('\n+', ' ', key)
        # key = re.sub('\n', ' ', key)
        title_filename.write(key)
        title_filename.write(value + "\n")
    # print type(key),type(value),"\n"
# print title_file
title_filename.close()
Example #9
File: Main.py  Project: 3L3N4/Tex
    enlaceExterno = "\n\n[+]External link: "
    buscarLogin = "******"
    pdf = "\n\n[+]PDF files"
    totales = "Totals: "
    indice = "\n[+]Index: "
    ayudaPdf = "\nType <int-int>: To download a range \nType <int,int,*>: To download the selected files\nEnter a negative number to exit\n"
    descargarPdf = "\n\n[+]Download PDF...\n"

imprimirCabecera()
# Code execution
if paginaAtacar is None:
    imprimirAyuda()
else:
    print obtenerEnlaces
    paginaReal = getPaginaPrincipal(paginaAtacar)
    webCrawler = WebCrawler.WebCrawler(paginaReal, limiteHilos, limiteTiempo)
    webCrawler.agregarEnlace(paginaAtacar)
    paginasAnalizadas = 0
    # Compute the main domain
    enlaceDividido = paginaReal.split("/")[2]
    dominioDividido = enlaceDividido.split(".")
    objetivo = dominioDividido[len(dominioDividido) - 2]
    while paginasAnalizadas <= limitePaginas:
        try:
            if webCrawler.getNumeroHilos() < webCrawler.getLimiteHilos():
                pagina = webCrawler.getEnlaces()[paginasAnalizadas]
                hilo = HiloAnalizar.HiloAnalizar(pagina, paginaReal,
                                                 webCrawler)
                hilo.start()
                tiempoEjecucion = webCrawler.getHoraActual() - webCrawler.getHoraInicio()
Example #10
File: driver.py  Project: DdMad/WebCrawler
import WebCrawler

wc = WebCrawler.WebCrawler("baseURLs")
wc.crawl()
print(wc.get_brand_score())
Example #11
File: test.py  Project: lsjsss/PythonClass
# -*- coding: cp936 -*-
import WebCrawler

url = raw_input('Enter the entry URL (e.g. http://www.baidu.com): \n')
thNumber = int(raw_input('Number of threads: '))    # earlier bug: the input was not converted to int

wc = WebCrawler.WebCrawler(thNumber)
wc.Craw(url)
Example #12
File: Main.py  Project: Roknahr/pyCrawler
from WebCrawler import *

wc = WebCrawler()

#urls = wc.fetch_urls_from_source('http://wikipedia.org')

#url = wc.normalize_url('/', 'wiki.org')
#print(url)
wc.crawl()
#print(Helper.get_domain("http://us.rd.yahoo.com/finance/news/rss/story/*http://finance.yahoo.com/news/tokyo-gas-no-hurry-buy-113424587.html"))
#print(wc.get_disallowed_sites("http://stackoverflow.com/", "*"))
#if wc.is_allowed("ssl.reddit.com/res"):
#    print('true')
#else:
#    print('false')
Example #13
	#######################################################
		Web-Crawler implementation to test.
	#######################################################

'''

# Initialize the web crawler params
project_name = 'ExtraCredit'

##################################################
# Change this URL to test against a different site.
base_url = 'https://www.investopedia.com'
domain_name = 'investopedia.com'

# Set up the state space to undergo the search
crawler = WebCrawler(project_name, base_url, domain_name)
crawler.crawl(project_name, base_url, 0)
# crawler.setup()

# Call to the iterative deepening algorithm, searching for certain links
start_time = time.time()

# The problem to solve: finding the link to a stock's information page, e.g. 'Domain_name.com/AAPL'.
goal = {'tsla'}
search_result = iterative_deepening_search(goal, crawler, 2)
elapsed = time.time() - start_time
print("Time to search: " + str(elapsed))

goal = {'aapl'}
start_time = time.time()
search_result = iterative_deepening_search(goal, crawler, 2)
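
The iterative_deepening_search used above is not shown in this excerpt; the following is a generic, self-contained sketch of the algorithm over a hypothetical dict of links, not the project's implementation:

def iterative_deepening_search_sketch(goal, graph, start, max_depth):
    """Depth-limited DFS with an increasing depth limit; returns the first goal node found."""
    def dls(node, depth):
        if node in goal:
            return node
        if depth == 0:
            return None
        for child in graph.get(node, []):
            found = dls(child, depth - 1)
            if found is not None:
                return found
        return None

    for limit in range(max_depth + 1):
        found = dls(start, limit)
        if found is not None:
            return found
    return None

# Hypothetical link graph: page -> outgoing links (acyclic; no visited-set handling here).
links = {'home': ['markets', 'aapl'], 'markets': ['tsla']}
print(iterative_deepening_search_sketch({'tsla'}, links, 'home', 2))  # tsla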