def main():
    arg_parser = argparse.ArgumentParser(
        description='Mobile App Crawler Manual',
        formatter_class=RawTextHelpFormatter)
    arg_parser.add_argument(
        '-m', '--method',
        help=('crawl_new: Scrape Play Store top 300 app information '
              'for each category\n'
              'crawl_old: Update collected app information\n'
              'update_apk: Download APK file'))
    arg_parser.add_argument(
        '-d', '--desktop', type=str2bool, default=True,
        help=('True (default): Show web browser (use Selenium)\n'
              'False: Do not show web browser (use virtual screen)'))
    args = arg_parser.parse_args()

    # Initialize both values so they are bound even if an argument is missing.
    desktop = args.desktop if args.desktop is not None else True
    method = args.method

    playstore_crawler = Crawler(is_desktop=desktop)
    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()
    playstore_crawler.close()
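# `str2bool` is referenced above (and in the variant below) but is not defined
# in these snippets. The following is a minimal sketch of a typical
# argparse-compatible converter; the exact set of accepted spellings is an
# assumption, not taken from the original source.
import argparse

def str2bool(value):
    # Pass booleans through unchanged (argparse may hand us the default).
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')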
def get(self):
    cr = Crawler()
    url = "http://loaq.kr/"
    json_file = cr.start(url)
    return json_file
def main():
    arg_parser = argparse.ArgumentParser(description='How to run the APK crawler')
    arg_parser.add_argument('--method',
                            help='Feature to run (crawl_new, crawl_old, update_apk)')
    arg_parser.add_argument('--desktop', type=str2bool, default=True,
                            help='true to run on a desktop, false to run on a server')
    args = arg_parser.parse_args()

    desktop = args.desktop if args.desktop is not None else True
    method = args.method

    playstore_crawler = Crawler(is_desktop=desktop)
    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()
    playstore_crawler.close()
def getPageMorts(url, pageFilter):
    crawl = Crawler(url, pageFilter=pageFilter)
    urlsMort = []
    for page in crawl:
        print("HttpCode:%d Url: %s " % (page.codeHTTP, page.url))
        if page.codeHTTP not in range(200, 300):
            urlsMort.append((page.codeHTTP, page.url))

    # A new dictionary to store the results:
    # {page containing dead links: {HTTP code: [dead links]}}
    pageParents = {}
    for url in urlsMort:
        for pageParent in crawl.pagesToCrawl_dict[url[1]]:
            if pageParent in pageParents:
                if url[0] in pageParents[pageParent]:
                    pageParents[pageParent][url[0]].append(url[1])
                else:
                    pageParents[pageParent][url[0]] = [url[1]]
            else:
                pageParents[pageParent] = {url[0]: [url[1]]}

    print("\n Crawler complete!\n")
    with open('liensMort.txt', 'w') as dump_file:
        for pageParent in pageParents:
            dump_file.write('In the page:\n{}\n'.format(pageParent))
            dump_file.write('\n')
            for code in sorted(pageParents[pageParent].keys()):
                dump_file.write('HTTP return code {}\n'.format(code))
                for url in pageParents[pageParent][code]:
                    dump_file.write('    {}\n'.format(url))
            dump_file.write('*' * 80 + '\n\n')
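# The nested dictionary accumulation above can be written more compactly with
# collections.defaultdict. Below is a behavior-equivalent sketch; the sample
# data is purely hypothetical and stands in for `urlsMort` and
# `crawl.pagesToCrawl_dict`.
from collections import defaultdict

urlsMort = [(404, 'http://example.com/a'), (500, 'http://example.com/b')]
pagesToCrawl_dict = {'http://example.com/a': ['http://example.com/'],
                     'http://example.com/b': ['http://example.com/']}

pageParents = defaultdict(lambda: defaultdict(list))
for code, dead_url in urlsMort:
    for parent in pagesToCrawl_dict[dead_url]:
        pageParents[parent][code].append(dead_url)

print(dict(pageParents))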
def findSimilar(self, link, limit):
    # Call the getText function from the Crawler to read the new link;
    # we use the constructor with empty variables.
    crawler = Crawler.Crawler('', 0, 0, 0)
    self.limit = limit
    file = open("Data/page%d.txt" % self.limit, 'w')
    try:
        self.title, text = crawler.getText(link)
        # Combine the list of strings into a single string.
        text = ''.join(text)
        file.write(text)
        file.close()
    except Exception:
        print("Link is not accessible")
        file.close()
        sys.exit(0)
    indexer = Indexer.Indexer()
    indexer.start()
    cosineSimilarity = indexer.getCosineSimilarity()
    linksId = list(range(self.limit))
    linksIdSorted = [x for _, x in sorted(zip(cosineSimilarity, linksId), reverse=True)]
    return cosineSimilarity, linksIdSorted
def main():
    stocknum = str(600000)
    total = dict()
    for i in range(1, 10):
        page = str(i)
        crawler = Crawler(stocknum, page)
        datalist = crawler.getData()
        comments = File(stocknum + '_page_' + page, 'json', './data/')
        comments.inputData(datalist)
        data = open('./data/' + stocknum + '_page_' + page + '.json', 'r').read()
        jsonData = json.loads(data)
        for detail in jsonData:
            # '年' (years) only appears when the poster's age is given in years;
            # otherwise weight the entry as 1.
            num = '1' if '年' not in detail['age'] else detail['age'].replace('年', '')
            num = float(num)
            date = detail['time'][4:14]
            total.setdefault(date, {'num': 0, 'content': 0})
            total[date]['num'] += num
            total[date]['content'] += detail['content'] * num
    total = json.dumps(total)
    totalfile = File(stocknum, 'json', './data/')
    totalfile.inputData(total)
def storeUrl4Comp():
    db = pymysql.connect("localhost", "root", "root", "kg4company_2", charset="utf8")
    cur = db.cursor()
    sql1 = u"select name from city;"
    cur.execute(sql1)  # returns the number of affected rows
    cities1 = cur.fetchall()
    sql2 = u"select name from industry;"
    cur.execute(sql2)
    cities2 = cur.fetchall()
    cities = cities1 + cities2
    sql3 = u"insert into urlList(url4comp) values('{name}');"
    cities = cities[71:]  # skip the first 71 entries (presumably already processed)
    for city in cities:
        print(city[0])
        for idx in range(1, 31):
            print('page: ' + str(idx))
            cwler = cw.Crawler()
            cwler.getCompUrl(idx, city[0])
            list_ = cwler.listComp
            if len(list_) == 0:
                break
            for li in list_:
                print(li)
                sqlN = sql3.format(name=li)
                cur.execute(sqlN)
                db.commit()
            time.sleep(2)
            print('page: ' + str(idx) + ' is done')
            time.sleep(2)
        print(city[0] + ' is done')
def __init__(self, busca, distancia=5):
    self.ponto_inicial = 'http://www.rugbyfluminense.com.br/'
    self.distancia = distancia  # was hard-coded to 2, ignoring the parameter
    self.crawler = Crawler.Crawler()
    self.criar_no = Criar_No.Criar_No
    self.catalogo = []
    self.busca = busca  # search term
    self.grafo = Grafo.Grafo()
def __init__(self, __textUrlData, __pageRankData):
    # Instantiate data / service classes.
    self.__textData = __textUrlData
    self.__pageRankData = __pageRankData
    self.__webscraper = WebScrape(self.__textData, self.__pageRankData)
    self.__crawler = Crawler(self.__textData)
    # Load all data structures by calling lower-tier helper classes.
    self.__generate_data_structures()
def parse_home():
    try:
        for indice_link, url in enumerate(HOME_URL):
            instancia_crawler_main = Crawler.Crawler(url, NUEVO_LINK[indice_link])
            instancia_crawler_main.crawler_main(indice_link)
    except ValueError as ve:
        print(ve)
def to_url(request):
    s = request.GET.get('text')
    s = s.split(" ")
    r = []
    for i in s:
        if i != " " and i != "":
            r.append(i)
    a = Crawler.Crawler(0, 1, r, '/home/katrin/databasetemp/')
    try:
        a.downloadPages()
    except Exception:
        pass
def main():
    # starting url
    url = "https://en.wikipedia.org/wiki/London"
    # limit of pages
    limit = 10
    # whether to delete previously collected data
    delete = True
    # number of threads
    threadsNumber = 3
    t0 = time.time()
    # calling the Crawler starts the threads collecting data from the links
    crawler = Crawler.Crawler(url, limit, delete, threadsNumber, printBool=False)
    crawler.start()
    print("Read ", limit, " pages, using ", threadsNumber, " threads in ",
          time.time() - t0, " seconds.")

    queryProcessor = QueryProcessor()
    link = "https://en.wikipedia.org/wiki/Paris"
    # the k most similar links
    k = 4
    similarities, links = queryProcessor.findSimilar(link, limit)
    title = queryProcessor.getTitle()
    titles = crawler.getTitles()
    urls = crawler.getLinks()
    print()
    print(k, " pages most similar to ", title)
    print()
    for i in range(k):
        x = links[i]
        print("With ", similarities[x], " similarity is", titles[x], " link: ", urls[x])
    print()
    print("Completed in ", time.time() - t0, " seconds")
    # Delete the last page in the Data directory: it is the page we just
    # compared, so removing it lets us re-run the program with another page.
    path, dirs, files = next(os.walk("Data"))
    last = len(files) - 1
    os.remove("Data/page%i.txt" % last)
def subjectActivated(self, text):
    self.subject = text
    # Map the human-readable subject name to its arXiv archive code.
    # ("Mathemastics" in the original was a typo for "Mathematics".)
    subjects = {
        "Mathematics": "math",
        "Physics": "physics",
        "Computer Science": "cs",
        "Quantitative Biology": "q-bio",
        "Quantitative Finance": "q-fin",
        "Statistics": "stat",
    }
    if self.subject in subjects:
        self.crawler = Crawler.Crawler(subjects[self.subject])
def __init__(self, config):
    print(color + 'Env created')
    self.config = config
    self.default_action = config['Env_config']['default_action']
    (self.angle_vect_size, self.pressure_vect_size,
     self.reward_vect_size) = self.config['Sim_config']['obs_vector_space']
    print(color, 'obs vect space', self.angle_vect_size,
          self.pressure_vect_size, self.reward_vect_size)
    self.vect_size = (self.angle_vect_size + self.pressure_vect_size
                      + self.reward_vect_size)
    self.action_space_size = self.angle_vect_size
    self.data = ""
    self.crawler = Crawler.Crawler(config=self.config)
    self.cycle_id = 0
    self.cycle_reset_period = self.config['Env_config']['cycle_reset_period']
    self.temp_agent_reward_que = multiprocessing.Queue()
    self.reward_delay_length = 10
def CrawlBegin(paras, threadname, taskque, crawl_function):
    download_count = 0
    restart_count = 0
    # If an error is caught (status == 0), re-create a Crawler instance and
    # print the error; loop until the task-finished signal (status == 1)
    # is received, then stop.
    while True:
        try:
            print(threadname, "successfully started")
            c = Crawler.Crawler(threadname, paras)
            status, download_count_iter = c.Crawling(threadname, taskque, crawl_function)
            download_count += download_count_iter
            if status == 1:
                print(threadname, "successfully finished", time.ctime())
                break
            restart_count += 1
        except Exception as e:
            print("(Spider)", e)
            traceback.print_exc()
            continue
def spider_thread(city):
    iii = 0
    for idx in range(1, 30):
        cwler = cw.Crawler()
        cwler.getCompUrl(idx, city)
        list_ = cwler.listComp
        print(list_)
        for li in list_:
            iii = iii + 1
            print(city + ' company:' + str(iii))
            ci = connectWeb.compInfo()
            pri = connectWeb.projectInfo()
            pei = connectWeb.peopleInfo()
            ca = connectWeb.compAptitude()
            str4comp = ci.getInfo(li)
            pri.getInfo(li, str4comp)
            pei.getInfo(li, str4comp)
            ca.getInfo(li, str4comp)
            time.sleep(3)
            print(city + ' This company is done, take a rest')
        time.sleep(5)
        print(city + ' This page is done, take a rest')
def main(): print("Starting our Web Crawler") baseUrl = input("Website > ") numberOfThreads = input("No Threads > ") linksToCrawl = queue.Queue() urlLock = threading.Lock() linksToCrawl.put(baseUrl) haveVisited = [] crawlers = [] errorLinks = [] with open("links.txt", "w+") as f: for i in range(int(numberOfThreads)): crawler = Crawler(baseUrl, linksToCrawl, haveVisited, errorLinks, urlLock, f) crawler.run() crawlers.append(crawler) for crawler in crawlers: crawler.join() print("Total Number of Pages Visited {}".format(len(haveVisited))) print("Total Number of Pages with Errors {}".format(len(errorLinks)))
def check0x00(web):
    print(R + '\n    ====================================================')
    print(R + '     C R O S S   S I T E   R E Q U E S T   F O R G E R Y')
    print(R + '    ====================================================')
    time.sleep(0.7)
    print(O + ' [This module fully supports only start-page domains]')
    print(O + ' [Hence, it may not work satisfactorily for all domains]\n')
    if 'http' not in web:
        web = 'http://' + web

    # Sample forms, just to make sure BeautifulSoup is working properly :)
    form1 = """<form action="/drupal/?q=node&destination=node" accept-charset="UTF-8" method="post" id="user-login-form">
    <div><div class="form-item" id="edit-name-wrapper">
    <label for="edit-name">Username: <span class="form-required" title="This field is required.">*</span></label>
    <input type="text" maxlength="60" name="name" id="edit-name" size="15" value="test1" class="form-text required" />
    </div>
    <div class="form-item" id="edit-pass-wrapper">
    <label for="edit-pass">Password: <span class="form-required" title="This field is required.">*</span></label>
    <input type="password" value="a9z8e7" name="pass" id="edit-pass" maxlength="60" size="15" class="form-text required" />
    </div>
    <input type="submit" name="op" id="edit-submit" value="Log in" class="form-submit" />
    <div class="item-list"><ul><li class="first"><a href="/drupal/?q=user/register" title="Create a new user account.">Create new account</a></li>
    <li class="last"><a href="/drupal/?q=user/password" title="Request new password via e-mail.">Request new password</a></li>
    </ul></div><input type="hidden" name="form_build_id" id="form-6a060c0861888b7321fab4f5ac6cb908" value="form-6a060c0861888b7321fab4f5ac6cb908" />
    <input type="hidden" name="form_id" id="edit-user-login-block" value="user_login_block" />
    </div></form>
    """
    form2 = """<form action="/drupal/?q=node&destination=node" accept-charset="UTF-8" method="post" id="user-login-form">
    <div><div class="form-item" id="edit-name-wrapper">
    <label for="edit-name">Username: <span class="form-required" title="This field is required.">*</span></label>
    <input type="text" maxlength="60" name="name" id="edit-name" size="15" value="test2" class="form-text required" />
    </div>
    <div class="form-item" id="edit-pass-wrapper">
    <label for="edit-pass">Password: <span class="form-required" title="This field is required.">*</span></label>
    <input type="password" value="a9z8e7" name="pass" id="edit-pass" maxlength="60" size="15" class="form-text required" />
    </div>
    <input type="submit" name="op" id="edit-submit" value="Log in" class="form-submit" />
    <div class="item-list"><ul><li class="first"><a href="/drupal/?q=user/register" title="Create a new user account.">Create new account</a></li>
    <li class="last"><a href="/drupal/?q=user/password" title="Request new password via e-mail.">Request new password</a></li>
    </ul></div><input type="hidden" name="form_build_id" id="form-6a060c0861888b7321fab4f5ac6cb908" value="form-6a060c0861888b7321fab4f5ac6cb908" />
    <input type="hidden" name="form_id" id="edit-user-login-block" value="user_login_block" />
    </div></form>
    """
    Cookie0 = cookielib.CookieJar()
    Cookie1 = cookielib.CookieJar()
    resp1 = urllib2.build_opener(urllib2.HTTPCookieProcessor(Cookie0))
    resp2 = urllib2.build_opener(urllib2.HTTPCookieProcessor(Cookie1))
    actionDone = []
    csrf = ''
    init1 = web
    form = Form()
    # Hope it works properly (no lxml error ;=;)
    bs1 = BeautifulSoup(form1).findAll('form', action=True)[0]
    bs2 = BeautifulSoup(form2).findAll('form', action=True)[0]
    action = init1
    resp1.open(action)
    resp2.open(action)
    crawler = Crawler(init1, resp1)
    print(GR + ' [*] Initializing crawling...')
    global url
    try:
        while crawler.noinit():
            url = crawler.next()
            print(C + ' [+] Crawling :> ' + B + url)
            try:
                soup = crawler.process(web)
                if not soup:
                    continue
                i = 0
                print(O + ' [*] Retrieving all forms on ' + C + url + O + '...')
                for m in getAllForms(soup):
                    action = uri.buildAction(url, m['action'])
                    if action not in actionDone and action != '':
                        try:
                            print()
                            result = form.prepareFormInputs(m)
                            r1 = request(url, action, result, resp1)
                            result = form.prepareFormInputs(m)
                            r2 = request(url, action, result, resp2)
                            if len(csrf) > 0:
                                if not re.search(csrf, r2):
                                    print(G + ' [+] Looks like we got a CSRF vulnerability on ' + O + url + G + '!\n')
                                    try:
                                        if m['name']:
                                            print(R + '\n    =====')
                                            print(R + '     PoC')
                                            print(R + '    =====\n')
                                            print(B + ' [+] URL    : ' + P + url)
                                            print(C + ' [+] Name   : ' + O + m['name'])
                                            print(G + ' [+] Action : ' + O + m['action'])
                                    except KeyError:
                                        print(R + '\n    =====')
                                        print(R + '     PoC')
                                        print(R + '    =====\n')
                                        print(B + ' [+] URL    : ' + P + url)
                                        print(G + ' [+] Action : ' + O + m['action'])
                                    print(O + ' [+] Code   : ' + W + urllib.urlencode(result))
                                    print('')
                                continue
                            o2 = resp2.open(url).read()
                            try:
                                form2 = getAllForms(BeautifulSoup(o2))[i]
                            except IndexError:
                                print(R + ' [-] Form Error')
                                continue
                            contents2 = form.prepareFormInputs(form2)
                            r3 = request(url, action, contents2, resp2)
                            checkdiff = difflib.ndiff(r1.splitlines(True), r2.splitlines(True))
                            checkdiff0 = difflib.ndiff(r1.splitlines(True), r3.splitlines(True))
                            result12 = [n for n in checkdiff if re.match(r'\+|-', n)]
                            result13 = [n for n in checkdiff0 if re.match(r'\+|-', n)]
                            if len(result12) <= len(result13):
                                try:
                                    if m['name']:
                                        print(R + '\n    =====')
                                        print(R + '     PoC')
                                        print(R + '    =====\n')
                                        print(B + ' [+] URL    : ' + P + url)
                                        print(C + ' [+] Name   : ' + O + m['name'])
                                        print(G + ' [+] Action : ' + W + m['action'])
                                except KeyError:
                                    print(R + '\n    =====')
                                    print(R + '     PoC')
                                    print(R + '    =====\n')
                                    print(B + ' [+] URL    : ' + P + url)
                                    print(G + ' [+] Action : ' + W + m['action'])
                                print(O + ' [+] Code   : ' + W + urllib.urlencode(result))
                                print('')
                        except urllib2.HTTPError as msg:
                            print(msg.__str__())
                    actionDone.append(action)
                    i += 1
            except urllib2.URLError as e:
                print(R + ' [-] Exception at %s' % url)
                print(R + ' [-] Error : ' + str(e))
                continue
    except KeyboardInterrupt:
        print(R + '\n [-] Interrupted by user')
def processPaper(paperId):
    content = cr.crawlPaperMain(paperId)
    soup = BeautifulSoup(content)
    paper = pr.getPaperInfo(soup)
    paper.id = paperId
    outPut_sql(paper)


def processUser(userId):
    content = cr.crawlAuthorPub(userId)
    pr.parseAuthorPub(content)


cr = Crawler.Crawler()
pr = Parse.Parse()
sql = SQLConn.MysqlUti()
Init(pr, sql)
pr.testPaper('2488441')
paperId = sql.getPaper()
while (paperId is not None) or (sql.getUser() is not None):
    while paperId is not None:
        while not checkTime():
            print('sleeping... ')
            time.sleep(1.5 * 3600)
        print(paperId)
        try:
                    '--output-encoding', help='encoding for shell',
                    default=OUTPUT_ENCODING)
parser.set_defaults(map_save_to_dot=MAP_SAVE_TO_DOT)
args = parser.parse_args()

print("Running configuration:")
for attr, value in sorted(args.__dict__.items()):
    print("\t{0} = {1}".format(attr, value))

c = Crawler(args.language, args.currency, args.user_place,
            args.departure_point, args.ignored_points,
            args.departure_month, args.departure_year,
            args.price_limit, args.flights_limit,
            args.selenium_host, args.selenium_port,
            args.selenium_start_cmd, args.selenium_load_timeout,
            args.map_save_to_dot, args.map_dot_filename,
            args.output_encoding)
try:
    c.create_map()
    c.analyze_map()
except KeyboardInterrupt:
    print("Ctrl-C pressed...")
except SeleniumError as err:
    print(str(err))
except Exception as err:
    traceback.print_exc(file=sys.stdout)
finally:
    c.cleanup()
from Crawler import *

c = Crawler()
c.movie_info(8)
def print_signature():
    print('═══════════════════════════════════════════════════════════════')
    print('███████╗██╗   ██╗██╗████████╗███████╗   ██╗      █████╗ ██████╗')
    print('██╔════╝██║   ██║██║╚══██╔══╝██╔════╝   ██║     ██╔══██╗██╔══██╗')
    print('███████╗██║   ██║██║   ██║   █████╗     ██║     ███████║██████╔╝')
    print('╚════██║██║   ██║██║   ██║   ██╔══╝     ██║     ██╔══██║██╔══██╗')
    print('███████║╚██████╔╝██║   ██║   ███████╗██╗███████╗██║  ██║██████╔╝')
    print('╚══════╝ ╚═════╝ ╚═╝   ╚═╝   ╚══════╝╚═╝╚══════╝╚═╝  ╚═╝╚═════╝')
    print('═══════════════════════════════════════════════════════════════')
    print('                       Blog photo crawler')
    print('            developed by woosik yoon [suitelab.github.io]')
    print('═══════════════════════════════════════════════════════════════')


if __name__ == "__main__":
    print_signature()
    cr = Crawler()
    while True:
        url = input("Enter a blog address (type 'exit' to quit): ")
        if url.upper() == 'EXIT':
            cr.driver.close()
            sys.exit(1)
        if not url_validate(url):
            print('Invalid address.')
            continue
        cr.start(url)
        print('═══════════════════════════════════════════════════════════════')
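# `url_validate` is referenced above but is not defined in this snippet. Below
# is a minimal sketch using urllib.parse; the accepted schemes are an
# assumption, not taken from the original source.
from urllib.parse import urlparse

def url_validate(url):
    # A URL is considered valid if it has an http(s) scheme and a host part.
    parsed = urlparse(url)
    return parsed.scheme in ('http', 'https') and bool(parsed.netloc)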
def __init__(self, project_name, url):
    CrawlerUrl.project_name = project_name
    CrawlerUrl.queue_file = 'storage/' + CrawlerUrl.project_name + '/queue.txt'
    CrawlerUrl.crawled_file = 'storage/' + CrawlerUrl.project_name + '/crawler.txt'
    Crawler(CrawlerUrl.project_name, url, url)
def main(base_url, max_count, flag):
    craw = crawler.Crawler(base_url)
    craw.crawling(base_url, max_count, flag)
def __init__(self, master=None):
    Frame.__init__(self, master)
    self.pack()
    self.createWidgets()
    self.c = C.Crawler()  # crawler module
#!/usr/bin/python
from PyQt4 import QtCore, QtGui
import Crawler
import warnings
import time

# Global variables
warnings.filterwarnings("ignore")
cr = Crawler.Crawler("it")
message = ""

try:
    _fromUtf8 = QtCore.QString.fromUtf8
except AttributeError:
    def _fromUtf8(s):
        return s

try:
    _encoding = QtGui.QApplication.UnicodeUTF8

    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig, _encoding)
except AttributeError:
    def _translate(context, text, disambig):
        return QtGui.QApplication.translate(context, text, disambig)


class Ui_Form(QtGui.QWidget):
# -*- coding: utf-8 -*-
from tkinter import *
import tkinter.messagebox

from Crawler import *

AGS = AGSpiel()
C = Crawler(AGS)

app = Tk()
app.title("AG-Spiel Crawler")
app.geometry("1000x600+100+100")


def AuslesenStarten():
    inArb.set("In progress")
    app.update()
    Eingabe = Eingabefeld.get().split(",")
    txtAusgabe.delete("1.0", END)
    for i in range(0, len(Eingabe)):
        Eingabe[i] = Eingabe[i].replace(" ", "")
        print(Eingabe[i])
        C.AG_Hinzufuegen(AG(AGS, i, Eingabe[i]))
    for meineAG in C.get_AGListe():
import Crawler as c
import pandas as pd
import os
from datetime import datetime
import time

crawler = c.Crawler()
crawler.crawl()

df = pd.DataFrame()
count = 1
print("Start merging..")
for i in os.listdir("./raw/"):
    print(str(count) + ". file: " + str(i))
    count += 1
    df = df.append(pd.read_csv("./raw/" + str(i), sep=";",
                               encoding="utf-8", decimal=","),
                   sort=False)
df = df.drop_duplicates(subset="URL")
succ = df.to_csv('./data/' + str(datetime.now())[:19].replace(':', '').replace('.', '') + '.csv',
                 sep=';', decimal=',', encoding='utf-8', index_label='timestamp')
def crawl():
    for url, file_name in zip(urls, filesName):
        crawler = Crawler.Crawler(url, file_name)
        crawler.start()
"""Pull data from geth and parse it into mongo.""" import Crawler print("Booting processes.") c = Crawler.Crawler() print("Update complete.")