def initializeSpiders():
    homePageList = ['http://finans.mynet.com/borsa/hisseler/',
                    'http://finans.mynet.com/borsa/hisseler/c-e/',
                    'http://finans.mynet.com/borsa/hisseler/f-j/',
                    'http://finans.mynet.com/borsa/hisseler/k-q/',
                    'http://finans.mynet.com/borsa/hisseler/r-z/']
    for homePage in homePageList:
        Spider(PROJECT_NAME, homePage, DOMAIN_NAME)
def main(currentTime, startTime, web, weblevel, app, applevel, device, devicelevel, sys, syslevel, method):
    start_time = time.time()
    print("Program running....")
    # Create a new spider object
    spider = Spider.Spider()
    # Web application vulnerabilities
    page = web
    level = weblevel
    type = 1
    spider = SpiderMain(currentTime, startTime, spider, page, level=level, type=type, method=method)
    # Application vulnerabilities
    page = app
    level = applevel
    type = 2
    spider = SpiderMain(currentTime, startTime, spider, page, level=level, type=type, method=method)
    # Network device vulnerabilities
    page = device
    level = devicelevel
    type = 3
    spider = SpiderMain(currentTime, startTime, spider, page, level=level, type=type, method=method)
    # Operating system vulnerabilities
    page = sys
    level = syslevel
    type = 4
    spider = SpiderMain(currentTime, startTime, spider, page, level=level, type=type, method=method)
    spider.save_doc(currentTime, startTime)
    end_time = time.time()
    print("Total time spent: %s" % str((end_time - start_time) / 60) + " minutes!")
def __init__(self):  # ,self.handle.application
    self.data = []
    httpd = make_server('', 1234, self.handle)
    print('Serving HTTP on port 1234...')
    # Instantiate the Application class
    self.app = Application()
    # Instantiate the Spider class
    self.spider = Spider()
    httpd.serve_forever()
def __init__(self, dbUser, dbPassword, homeWebPageInfoFile):
    """Constructor."""
    self._init(homeWebPageInfoFile)
    self.spider = Spider.Spider()
    self.parser = Parser.Parser()
    # db instance
    self.conn = Connection(conf.dbHost, conf.dbName, user=dbUser, password=dbPassword)
def startSpider():
    print('WhiteList spider started!', file=sys.stderr)
    try:
        daemonize(PIDFILE, stdout='/tmp/spider-log.log', stderr='/tmp/spider-err.log')
    except RuntimeError as e:
        print(e, file=sys.stderr)
        raise SystemExit(1)
    io = IO.IO()
    spider = Spider.Spider(io)
    spider.start()
def __init__(self, word):
    '''Constructor to crawl the web for a word.'''
    self.word = word
    sp = Spider(word, spread=2, limit=0.01)
    self.web = sp.crawl('Graph.shelve')        # Crawled web
    self.graph = Shelveopen('Graph.shelve')
    self.paths = []                            # To store all paths
    self.scores = []                           # To store corresponding path scores
    self.clientfeatures = []                   # Feature vector for client
    self.standardfeatures = []                 # To compare against
def main():
    # parse arguments
    args = parse_args()
    if args is None:
        exit()
    resultsFilePath = args.results_path
    # resultsFile = open(resultsFilePath, 'w')
    regionUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html"
    regionUrlStarter = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/"
    spider = Spider(regionUrl, regionUrlStarter)
    spider.processData()
    print(spider.provinceList)
    province_dict = spider.provinceList[0]
    with open(resultsFilePath, "w") as f:
        json.dump(province_dict, f, ensure_ascii=False)
    print("Finished writing to file...")
def worker(config, verbose=False):
    """worker - start the spider

    Args:
        config - the Config object
        verbose - whether verbosity logs should be printed
    Returns:
        None
    """
    # All spiders consume the global common queue of tasks
    spider = Spider.Spider(config.max_depth, config.crawl_interval,
                           config.crawl_timeout, config.target_url,
                           config.output_directory, tasks, verbose)
    spider.start()
def buildDataSet():
    books = open("data/item/book.txt", "r")
    bookList = []
    for book in books:
        bookList.append(book.rstrip())
    movies = open("data/item/movie.txt", "r")
    movieList = []
    for movie in movies:
        movieList.append(movie.rstrip())
    songs = open("data/item/music.txt", "r")
    songList = []
    for song in songs:
        songList.append(song.rstrip())
    spider = Spider.Spider()
    for item in bookList:
        urls = search(item + " book", 'com', 'en', '0', 'off', 1, 0, 11,
                      random.uniform(15.0, 45.0), True, {}, '')
        for url in urls:
            if type(url) is str:
                print(spider.fetch(url, "book", item))
    for item in movieList:
        for url in search(item + " movie", 'com', 'en', '0', 'off', 1, 0, 11,
                          random.uniform(15.0, 45.0), True, {}, ''):
            if type(url) is str:
                print(spider.fetch(url, "movie", item))
    for item in songList:
        for url in search(item + " song", 'com', 'en', '0', 'off', 1, 0, 11,
                          random.uniform(15.0, 45.0), True, {}, ''):
            if type(url) is str:
                print(spider.fetch(url, "song", item))
    books.close()
    movies.close()
    songs.close()
# list for seed URLs that user inputs
seeds = []
usrInput = input("Enter seed URL or press ENTER to continue: ")
while usrInput != "" and usrInput != " ":
    check = urllib.parse.urlparse(usrInput)
    if check.netloc != '' and check.scheme != '':
        seed = [0, usrInput]
        seeds.append(seed)
    else:
        print('Invalid URL')
    usrInput = input("Enter seed URL or press ENTER to continue: ")

keyword = input("Enter keyword to search for: ").lower()
# Create spider object initialised with the user's seed URLs and keyword
spider = Spider.Spider(seeds, maxDepth, keyword)
spider.crawl()                    # Start crawling process
results = spider.results          # Get list of URL results from the Spider
adjList = spider.adjacencyList

c = 0.15  # This is the damping factor
# Stores the PageRank matrix
pArray = [[0 for col in range(len(adjList))] for row in range(len(adjList))]
# PageRank vector
vArray = [1 / len(adjList) for col in range(len(adjList))]
temp = [0 for col in range(len(adjList))]
# Store the result hyperlinks and corresponding PageRank centrality
rankRes = []

# Calculate PageRank Matrix
for i in range(len(adjList)):
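# The PageRank loop above is cut off. The following is a minimal, self-contained
# sketch of what such a computation typically looks like, assuming adjList[i] is the
# list of page indices that page i links to; page_rank and its parameters are
# illustrative names, not taken from the original project.
def page_rank(adjList, c=0.15, iterations=50):
    n = len(adjList)
    # Transition matrix with damping: column j spreads page j's rank over its
    # out-links, mixed with a uniform random jump of probability c.
    pArray = [[c / n for _ in range(n)] for _ in range(n)]
    for j, outlinks in enumerate(adjList):
        if not outlinks:
            # Dangling page: treat it as linking to every page uniformly.
            for i in range(n):
                pArray[i][j] += (1 - c) / n
        else:
            for i in outlinks:
                pArray[i][j] += (1 - c) / len(outlinks)
    # Power iteration: repeatedly multiply the rank vector by the matrix.
    vArray = [1 / n for _ in range(n)]
    for _ in range(iterations):
        vArray = [sum(pArray[i][j] * vArray[j] for j in range(n)) for i in range(n)]
    return vArray


if __name__ == "__main__":
    # Tiny example: page 0 links to 1 and 2, page 1 links to 2, page 2 links to 0.
    print(page_rank([[1, 2], [2], [0]]))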
# -*- coding: gb18030 -*-
import Spider
import sys
import time
import random
import threading

s = Spider.Spider()

# Date
# year = int(sys.argv[1])
# month = int(sys.argv[2])
# day = int(sys.argv[3])
year = 2012
month = 12
day = 3
delta = s.timeDelta(year, month)


# Crawl one month at a time
def get(year, month, day):
    # Date
    date = s.handleDate(year, month, day)
    # Number of pages
    try:
        allNum = s.getAllNum(date)
    except Exception as e:
        print(e)
    while allNum == 200:
        print("I suspect there are not as many as 200 pages in one day. Let's try again!")
        time.sleep(random.random())
        allNum = s.getAllNum(date)
import Spider

spi = Spider.Spider()
spi.getCont()
def __init__(self, url, page=1):
    self.Spider = Spider.Spider()
    self.url = url
    self.Filename = "./Page.htm"
    self.page = page
""" author: Yann Liu target: scrawl the weather data from http://www.tianqihoubao.com/ usage: host: the ip of mysql, default 127.0.0.1 username: the user of mysql, default root password:the password of mysql db: database that store weather data start_time: when to spider, default Jan,2015 end_time: when to end, default Dec,2019 """ from Spider import * if __name__ == '__main__': spider = Spider(host='127.0.0.1', username='******', password='******', db='weather_data', start_time='201501', end_time='201912') spider.run()
spiderCnf['path'] = site.path
spiderCnf['maxTimeCount'] = 30
spiderCnf['webScanTimeout'] = task.web_scan_timeout
spiderCnf['endTime'] = time.time() + 1800
spiderCnf['maxnum'] = task.spider_url_count
spiderCnf['title'] = site.title
spiderCnf['ip'] = site.ip
spiderCnf['cookie'] = cookie
spiderCnf['webSearchSiteState'] = task.web_search_site_state
spiderCnf['webSearchSiteTimeout'] = task.web_search_site_timeout
spiderCnf['includeUrl'] = site.include_url
spiderCnf['excludeUrl'] = site.exclude_url
spiderCnf['downloadDir'] = SCANER_SPIDER_DOWNLOAD_DIR
# import plugins.lib.common
# argv['rec'] = plugins.lib.common.request_exception_counter(200)
spiderCnf['rec'] = None

# import Spider2 as Spider
import Spider
'''
if task.spider_type == 2:
    import Spider2 as Spider
else:
    import Spider
'''
spider = Spider.Spider(spiderCnf)
spider.start()
# url = "http://192.168.5.135:8503/vulnerabilities/sqli/"
# spider.startTester(url)
def newSpider(self, env="", limit=""):
    return Spider.Spider(env, limit=limit)
import Spider

s = Spider.Spider(666)
s.crawl()
def main():
    start_url = argv[1]
    spider = Spider(levels=2)
    spider.build_web(start_url)
    spider.save()
import Spider
import csv
from bs4 import BeautifulSoup

spider = Spider.Spider()
rows = []
for line in open('url.txt', 'r'):
    rows.append(line.rstrip('\n'))

for row in rows:
    soup = spider.getSoup(row)
    items = spider.getContent(soup)
    spider.createPersonCSV(items)

# soup = spider.getSoup('https://baike.baidu.com/item/%EF%BB%BF%E8%94%A1%E6%98%89/1020848')
# items = spider.getContent(soup)
# spider.printContent(items)
class Engine:
    # Maximum keyword length
    MAXKEYWORDLEN = 16
    # Mapping table (inverted index)
    targetMap = {}
    # Target URL
    targetUrl = "http://www.csdn.net"
    # Target depth
    targetDepth = 2
    # Crawler
    spider = Spider.Spider(targetUrl, targetDepth)
    # File analysis
    htmlIndexer = FileAnalyzer.HtmlIndexer()
    # Inverted-index builder
    mapBuilder = MapBuilder.MapBuilder()
    # Regex matching the text before and after the abstract
    # briefPat = u"[\u4e00-\u9fa5]{"
    # maxBrief = 40
    briefPat = u"[\u4e00-\u9fa5]{0,40}"

    def __init__(self):
        # Crawl
        print "fetching......"
        for i in range(1, self.targetDepth + 1):
            self.spider.visitCurrent()
            print "depth: ", i, '/', self.spider.maxDepth, " done"
        # Build the index files
        print "indexing......"
        self.htmlIndexer.getHtml()
        self.htmlIndexer.startIndex()
        # Get the inverted index
        print "mapping"
        self.targetMap = self.mapBuilder.getMap()

    def __getUrlAndWeight(self, word):
        res = []
        if word in self.targetMap:
            res = self.targetMap[word]
        return res

    def __mergeUrlAndWeight(self, result):
        ans = []
        while 0 != len(result):
            temp = result[0]
            result.remove(temp)
            i = 0
            while i >= 0 and i < len(result):
                if result[i][0] == temp[0]:
                    temp[1] += result[i][1]
                    result.remove(result[i])
                    i = i - 1
                i = i + 1
            ans.append(temp)
        return ans

    def __getBrief(self, targetWord, targetResult):
        resList = []
        for res in targetResult:
            try:
                filename = self.spider.path + res[0].replace('/', '_') + self.spider.HTMLEXT
                file = codecs.open(filename, "r", "UTF-8")
                content = file.read()
                '''length = self.maxBrief
                brief = ""
                while(length > 0):
                    brief = re.search(self.briefPat + str(length) + u'}' + targetWord + self.briefPat + str(length) + ur'}', content)
                    if (brief):
                        break
                    length -= 1'''
                brief = re.search(self.briefPat + targetWord + self.briefPat, content)
                if brief:
                    string = brief.group()
                    res.append(string)
                    res.append(len(string.split(targetWord)[0]))
                    res.append(res[len(res) - 1] + len(targetWord) - 1)
                    resList.append(res)
                file.close()
            except:
                pass
        return resList

    def getResult(self, targetWord):
        # Truncate the keyword
        targetWord = targetWord.decode('utf-8')
        if len(targetWord) > self.MAXKEYWORDLEN:
            targetWord = targetWord[0:self.MAXKEYWORDLEN]
        result = []
        # Look up using the whole search term as the keyword
        # targetWord = targetWord.decode('utf-8')
        # tempResult = self.__getUrlAndWeight(targetWord)
        # tempResult = self.__getBrief(targetWord, tempResult)
        # result += tempResult
        # Use the word-segmentation results as keywords
        # targetSplit = Analyzer.getChiSegList(targetWord, self.htmlIndexer.chiStopWordsList)
        # chiTargetSplit =
        # engTargetSplit =
        targetSplit = Analyzer.getChiSegList(
            Analyzer.getAllChiInStr(targetWord),
            self.htmlIndexer.chiStopWordsList) + Analyzer.getEngSegList(
                Analyzer.getAllEngInStr(targetWord),
                self.htmlIndexer.engStopWordsList)
        for word in targetSplit:
            tempResult = self.__getUrlAndWeight(word)
            tempResult = self.__getBrief(word, tempResult)
            result += tempResult
        # Merge entries that have the same URL
        mergedRes = self.__mergeUrlAndWeight(result)
        # Sort the results by weight
        mergedRes.sort(key=lambda uaw: uaw[1], reverse=True)
        '''for res in mergedRes:
            if(len(res) >= 3):
                mergedRes.remove(res)
        result = []'''
        for i in mergedRes:
            i[0] = 'http://' + i[0]
        return mergedRes

    def startSearch(self):
        while 1:
            print "Please enter a keyword ############################################"
            key = raw_input()
            # key = key.decode('utf-8')
            result = self.getResult(key)
            writer = HtmlWriter.HtmlWriter()
            writer.write(result)
            for urlAndWeight in result:
                print urlAndWeight[0], urlAndWeight[1], urlAndWeight[2]
# Python Web Crawler Tutorial - 17 - Running the Final Program - https://www.youtube.com/watch?v=ciwWSedS1XY&t=331s
import threading
from queue import Queue
from domain import *
from general import *
from Spider import *

PROJECT_NAME = 'thenewboston'
HOMEPAGE = 'https://thenewboston.com/'
DOMAIN_NAME = get_domain_name(HOMEPAGE)
QUEUE_FILE = PROJECT_NAME + '/' + PROJECT_NAME + '_queue_url.txt'
CRAWLED_FILE = PROJECT_NAME + '/' + PROJECT_NAME + '_crawled_url.txt'
NUMBER_OF_THREADS = 8  # There are a lot of factors behind this choice
thread_queue = Queue()
Spider(PROJECT_NAME, HOMEPAGE, DOMAIN_NAME)


# Create worker threads
# Threads will die when main exits
def create_spiders():
    # Just loop 8 times, using _ to disregard the values
    for _ in range(NUMBER_OF_THREADS):
        thread = threading.Thread(target=crawl)
        thread.daemon = True
        thread.start()


# Execute next in the queue
def crawl():
    while True:
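# The crawl() worker above is cut off after "while True:". Below is a hedged sketch
# of how a queue-consuming worker in this style usually continues; crawl_worker is a
# hypothetical name, and Spider.crawl_page is assumed from the tutorial series this
# snippet follows rather than confirmed by the excerpt itself.
def crawl_worker():
    while True:
        url = thread_queue.get()                                  # block until a URL is queued
        Spider.crawl_page(threading.current_thread().name, url)   # crawl it and record new links
        thread_queue.task_done()                                  # mark the queue item as processed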
#!/usr/bin/env python
# coding: utf-8

# In[11]:
from Spider import *
from Domain import *
from Queue import *

# In[13]:
project_name = "Crawler"
home_page = "https://www.cognizant.com/"
domain_name = get_domain(home_page)

spider = Spider(project_name, home_page, domain_name)
spider.find_link(home_page)
spider.to_crawl()
import Spider
import config
import load_data
import workflow
import urlGenerator
import io
import locateCareerPage

spidy = Spider.Spider()
spidy.crawl(config.employer_links)
spidy.get_job_description_all()

db = workflow.connect_to_database('sqlite:///data.db')
table_clean = workflow.create_table_clean(db)
table_raw = workflow.create_table_raw(db)
# workflow.update_table_clean(table_clean)

load_data.save_list_to_file(spidy.url_list, "clean_job_urls.txt")

db_company = workflow.connect_to_database('sqlite:///data_company_name.db')
table_companies = db_company['companies']
company_names = workflow.get_column_from_table(table_companies, 'name', True)

data = []
f = open('site_blacklist.csv', 'r')
data = f.read()
f.close()

blacklist_company_names = []
blacklist_company_url = []
def testWalk():
    legs = configLegs(connexion=Connexion())
    spider = Spider(legs)
    spider.move()
# encoding:utf-8
import Spider
import time
import threading
from ProxyPool import app
from Config import parse
from Spider import Spider

# parse holds the parameters read from the config
host = parse['pool']['host']
port = parse['pool']['port']
database_name = parse['database']['database_name']
sleep_time = int(parse['spider']['sleep_time'])

s = Spider(database_name)


# Fetch IPs periodically
def get_ip():
    while True:
        s()
        time.sleep(sleep_time)


# Pass the callable itself (not the result of calling it) and start the worker thread
thread = threading.Thread(target=get_ip)
thread.start()
app.run(host=host, port=port)
from Spider import *
from Download import *

if __name__ == "__main__":
    spider = Spider()
    manager = DownloadManager(spider.getImagePaths())
def getOriginalPrice(self, url):
    spider = Spider.Spider(url)
    return Spider.searchOriginalPrice(spider.getHtmltree())
              motor_id[6], motor_id[7], motor_id[8]))
        print("\t\t |_____________|")
        print("\t\t\t %d" % motor_id[9])
        print("\t\t\t %d" % motor_id[10])
        print("\t\t\t %d" % motor_id[11])
    elif line == "quit" or line == "":
        pass
    else:
        print("Unknown command")


if __name__ == "__main__":
    spider = None
    ctrl = dyn.create_controller(verbose=False, timeout=0.5, motor_range=[0, 20])
    spider = Spider(configLegs(ctrl.motors, simulator=False))

    gamepadThread = GamepadHandler(spider)
    gamepadThread.daemon = True
    gamepadThread.start()

    terminalThread = TerminalThread(spider)
    terminalThread.daemon = True
    terminalThread.start()

    while True:
        time.sleep(1.0)
        spider.move(startNow=False)
def searchOrderGoods(self):
    # Path of parcels that are not split into sub-packages
    basepath = '//*[@id="normalorder"]//*[@class="merch_bord"]//table[@class="tabl_merch"]'
    # Path of each item in parcels that are split into sub-packages
    subpath = '//*[@id="normalorder"]//*[@class="merch_bord"]//*[@class="sort_package_list"]/table'
    packages = self._htmltree.xpath(subpath)
    if len(packages) > 0:
        basepath = subpath
    # todo:
    # Each order may contain several parcels, and each parcel may contain several
    # sub-packages; only one of these cases is handled at the moment.
    books = self._htmltree.xpath(
        basepath + '//*[@class="tab_w1"]/*[@name="productname"]')
    titles = self._htmltree.xpath(
        basepath + '//*[@class="tab_w1"]/*[@name="productname"]/@title')
    hrefs = self._htmltree.xpath(
        basepath + '//*[@class="tab_w1"]/*[@name="productname"]/@href')
    prices = self._htmltree.xpath(basepath + '//*[@class="tab_w3"]')
    bonuses = self._htmltree.xpath(basepath + '//*[@class="tab_w2"]')
    amounts = self._htmltree.xpath(basepath + '//*[@class="tab_w6"]')
    sums = self._htmltree.xpath(basepath + '//*[@class="tab_w4"]')
    # Add-on purchase items or volume (sub-book) information
    subbooks = self._htmltree.xpath(
        basepath + '//*[@class="tab_w1"]/*[@class="present"]')
    ordernr = self._htmltree.xpath(
        '//*[@id="normalorder"]//div[@id="divorderhead"][@class="order_news"]/p/text()')
    parcel = self._htmltree.xpath(
        '//*[@id="normalorder"]//div[@class="business_package"]')
    ordertime = self._htmltree.xpath(
        '//*[@id="normalorder"]//div[@id="divorderhead"][@class="order_news"]//span[@class="order_news_hint"]/span')
    others = self._htmltree.xpath(
        '//*[@id="normalorder"]//div[@class="ditail_frame_notop"]/table[@class="tabl_other"]')
    endprice = self._htmltree.xpath(
        '//*[@id="normalorder"]//div[@class="price_total"]/span[1]')
    payment = self._htmltree.xpath(
        '//*[@id="normalorder"]//*[@class="order_detail_frame"]/ul[position()=4]/li')

    # Domestic logistics information
    logispath = '//*[@id="normalorder"]//p[@class="p_space"]'
    logiscompany = self._htmltree.xpath(logispath + '/span[4]/span')
    logisnr = self._htmltree.xpath(logispath + '/span[7]/span')
    cncompany = ""
    if logiscompany:
        cncompany = logiscompany[0].text
    cnnr = ""
    if logisnr:
        cnnr = logisnr[0].text

    # International logistics information
    header = ""
    consignee = self._htmltree.xpath('//*[@id="label_name"]')[0].text
    for code, (en, cn, pattern) in get_transports_info().items():
        if re.match(pattern, consignee):
            header += u"【" + cn + u"】"
            break
    # Purchasing account
    for code, pattern in get_ddusers_info().items():
        if re.match(pattern, consignee):
            header += u"【" + code + u"】"
            break

    wb = openpyxl.Workbook()
    ws = wb.active
    j = 0
    for i, book in enumerate(books):
        # Pre-sale items
        res = book.xpath('../span[@class="c_red"]')
        if len(res) != 0:
            # It is a pre-sale item
            ws.cell(row=i + j + 1, column=1,
                    value='[YS] ' + titles[i]).hyperlink = hrefs[i]
        else:
            ws.cell(row=i + j + 1, column=1,
                    value=titles[i]).hyperlink = hrefs[i]
        if len(prices[i].xpath('./text()')) != 0:
            # No discount, e.g. order 35737447378
            ws.cell(row=i + j + 1, column=2, value=prices[i].text)
        else:
            res = prices[i].xpath('./span')
            ws.cell(row=i + j + 1, column=2, value=res[0].text)
        ws.cell(row=i + j + 1, column=3, value=bonuses[i].text)
        ws.cell(row=i + j + 1, column=4, value=amounts[i].text)
        # Store the subtotal as a number
        sum = re.findall('\d+.\d+', sums[i].text)[0]
        ws.cell(row=i + j + 1, column=5, value=sum)
        # Dangdang item number
        sn = Spider.split_ddsn(hrefs[i])
        # Book information needed for group buying
        # The group-buy sheet and the purchase sheet differ from column 7 onwards
        spider = Spider.Spider(hrefs[i])
        if self._tuan:
            titlesn = ws.cell(row=i + j + 1, column=1).value + ' [' + sn + ']'
            ws.cell(row=i + j + 1, column=1, value=titlesn)
            ws.cell(row=i + j + 1, column=7, value=self.getOriginalPrice(hrefs[i]))
            ws.cell(row=i + j + 1, column=8, value=sn)
            ws.cell(row=i + j + 1, column=9, value=spider.searchISBN())
            ws.cell(row=i + j + 1, column=10, value=spider.searchPress())
            adress = spider.searchSmallAndBigPicture()
            if adress:
                ws.cell(row=i + j + 1, column=11, value=adress[0])
                ws.cell(row=i + j + 1, column=12, value=adress[1])
        else:
            ws.cell(row=i + j + 1, column=7, value=sn)
            ws.cell(row=i + j + 1, column=8, value=spider.searchISBN())

        # Add-on purchase items or volume (sub-book) information
        res = books[i].xpath('../br')
        subbook = books[i].xpath('../span[@class="present"]')
        for s, elem in enumerate(subbook):
            j += 1
            hgtitle = elem.xpath('../a/@title')
            hghref = elem.xpath('../a/@href')
            hgprice = prices[i].xpath('./span/text()')
            hgamount = amounts[i].xpath('./text()')
            hgsum = sums[i].xpath('./text()')
            stext = elem.xpath('./text()')
            if re.match(u'.*换购', stext[0]):
                # Has an add-on purchase item
                ws.cell(row=i + j + 1, column=1,
                        value='[HG] ' + hgtitle[1 + s]).hyperlink = hghref[1 + s]
            else:
                # Has a sub-volume
                ws.cell(row=i + j + 1, column=1,
                        value='[FC] ' + hgtitle[1 + s]).hyperlink = hghref[1 + s]
            ws.cell(row=i + j + 1, column=2, value=hgprice[s])
            if amounts[i].text:
                ws.cell(row=i + j + 1, column=4, value=hgamount[1 + s])
            else:
                ws.cell(row=i + j + 1, column=4, value=hgamount[s])
            ws.cell(row=i + j + 1, column=5,
                    value=re.findall('\d+.\d+', hgsum[1 + s])[0])
            ws.cell(row=i + j + 1, column=7,
                    value=Spider.split_ddsn(hghref[1 + s]))

    lastrow = len(books) + len(subbooks)
    if len(ordernr) != 0:
        # Ordinary order that is not split into parcels
        # Order number, order time, payment method, tracking number, etc.
        nr = ''
        for n in ordernr:
            if n.strip() != '':
                nr = n.strip()
                break
        if len(ordertime) == 0:
            ws.cell(row=lastrow + 1, column=1,
                    value=header + nr + payment[0].text + cncompany + cnnr)
        elif len(ordertime) == 1:
            ws.cell(row=lastrow + 1, column=1,
                    value=header + nr + ordertime[0].text + payment[0].text + cncompany + cnnr)
        elif len(ordertime) == 2:
            ws.cell(row=lastrow + 1, column=1,
                    value=header + nr + ordertime[0].text + ordertime[1].text +
                    payment[0].text + cncompany + cnnr)
        # Final price
        if endprice[0].text.find(u'\xa5') >= 0:
            # Contains the ¥ symbol
            ws.cell(row=lastrow + 1, column=6,
                    value=endprice[0].text.replace(u'\xa5', u''))
        else:
            ws.cell(row=lastrow + 1, column=6, value=endprice[0].text)
        # Discounts
        bonus = others[0].xpath('.//span')
        for i, elem in enumerate(bonus):
            if i == 0:
                if bonus[0].text.find(u'\xa5') >= 0:
                    # Contains the ¥ symbol
                    ws.cell(row=lastrow + 1, column=5,
                            value=bonus[0].text.replace(u'\xa5', u''))
                else:
                    ws.cell(row=lastrow + 1, column=5, value=bonus[0].text)
            else:
                ws.cell(row=lastrow + 1 + i - 1, column=3, value=bonus[i].text)
    else:
        # Split into parcels
        for i, elem in enumerate(parcel):
            note = elem.xpath('.//span[@class="business_package_bg"]/b/text()')
            nr = elem.xpath('.//span[@class="business_package_bg"]/text()[1]')
            time = elem.xpath(
                './/span[@class="business_package_bg"]//span[@class="t_time_n"]')
            if len(logiscompany) >= i + 1:
                ws.cell(row=lastrow + 1 + i, column=1,
                        value=header + note[0] + nr[0] + time[0].text +
                        payment[0].text + logiscompany[i].text + logisnr[i].text)
            else:
                ws.cell(row=lastrow + 1 + i, column=1,
                        value=header + note[0] + nr[0] + time[0].text + payment[0].text)
            ws.cell(row=lastrow + 1 + i, column=6, value=endprice[i].text)
            bonus = others[i].xpath('.//span')
            ws.cell(row=lastrow + 1 + i, column=5, value=bonus[0].text)
    wb.save(get_excel_name())