def main():
    arg_parser = argparse.ArgumentParser(description='How to run the APK crawler')
    arg_parser.add_argument('--method',
                            help='Feature to run (crawl_new, crawl_old, update_apk)')
    arg_parser.add_argument('--desktop', type=str2bool, default=True,
                            help='true to run on a desktop, false to run on a server')
    args = arg_parser.parse_args()
    if args.desktop is not None:
        desktop = args.desktop
    if args.method is not None:
        method = args.method
    playstore_crawler = Crawler(is_desktop=desktop)
    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()
    playstore_crawler.close()
def main():
    arg_parser = argparse.ArgumentParser(
        description='Mobile App Crawler Manual',
        formatter_class=RawTextHelpFormatter)
    arg_parser.add_argument('-m', '--method',
                            help=('crawl_new: Scrape PlayStore top 300 app information '
                                  'for each category\n'
                                  'crawl_old: Update collected app information\n'
                                  'update_apk: Download APK file'))
    arg_parser.add_argument('-d', '--desktop', type=str2bool, default=True,
                            help=('True (default): Show web browser (use selenium)\n'
                                  'False: Do not show web browser (use virtual screen)'))
    args = arg_parser.parse_args()
    if args.desktop is not None:
        desktop = args.desktop
    if args.method is not None:
        method = args.method
    playstore_crawler = Crawler(is_desktop=desktop)
    if method == "crawl_new":
        playstore_crawler.crawl_new()
    elif method == "crawl_old":
        playstore_crawler.crawl_old()
    elif method == "update_apk":
        playstore_crawler.update_apk()
    playstore_crawler.close()
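# Both main() variants above pass type=str2bool to argparse, but str2bool itself
# is not defined in this listing. A minimal sketch of a compatible converter
# follows; this is an assumption, not the original project's implementation.
import argparse

def str2bool(value):
    # Accept booleans directly and common textual spellings of true/false.
    if isinstance(value, bool):
        return value
    if value.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if value.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')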
def main():
    stocknum = str(600000)
    total = dict()
    for i in range(1, 10):
        page = str(i)
        crawler = Crawler(stocknum, page)
        datalist = crawler.getData()
        comments = File(stocknum + '_page_' + page, 'json', './data/')
        comments.inputData(datalist)
        data = open('./data/' + stocknum + '_page_' + page + '.json', 'r').read()
        jsonData = json.loads(data)
        for detail in jsonData:
            num = '1' if '年' not in detail['age'].encode('utf-8') \
                else detail['age'].encode('utf-8').replace('年', '')
            num = float(num)
            date = detail['time'][4:14].encode('utf-8')
            total[date] = total[date] if date in total.keys() else {'num': 0, 'content': 0}
            total[date]['num'] = total[date]['num'] + num if total[date]['num'] else num
            total[date]['content'] = total[date]['content'] + detail['content'] * num \
                if total[date]['content'] else detail['content'] * num
    total = json.dumps(total)
    totalfile = File(stocknum, 'json', './data/')
    totalfile.inputData(total)
def get(self):
    cr = Crawler()
    url = "http://loaq.kr/"
    json_file = cr.start(url)
    return json_file
def getContent(session, object, title):
    error_data = OrderedDict()
    return_data = OrderedDict()
    highkorea = object
    s = session
    topicurl = title['titleURL']
    topictitle = title['title']
    topicauthor = title['author']
    lastup = title['lastup']
    try:
        tup = highkorea.staticGet(s, highkorea.stem + topicurl.strip('.'))
        s, html, soup = tup[0], tup[1].text, tup[2]
        lastpg = soup.select('div.topic-actions div span a')
        lastpage = lastpg[-1].get('href') if lastpg != [] else soup.select(
            '#page-body h2 a')[0].get('href')
        m = re.compile('&start=(\d+)').search(lastpage)
        num = 1
        if m:
            num = int(m.group(1))
            url = re.sub('&start=(\d+)', '', lastpage)
        else:
            url = lastpage
        content_datas = list()
        html_datas = list()
        image_datas = list()
        for i in range(0, num, 10):
            tup = highkorea.staticGet(
                s, highkorea.stem + url.strip('.') + '&start={}'.format(i))
            s, html, soup = tup[0], tup[1].text, tup[2]
            authors = soup.select(".author strong a.username-coloured")
            contents = soup.find_all('div', {'class': 'content'})
            images = soup.select('dl.attachbox dd dl dt a img')
            for author, content in zip(authors, contents):
                content_data = OrderedDict()
                for br in content.find_all('br'):
                    br.replace_with('\n')
                content_data['author'] = author.text
                content_data['content'] = content.text
                content_datas.append(content_data)
            for image in images:
                image_data = OrderedDict()
                image_data['src'] = image.get('src')
                image_data['name'] = image.get('alt')
                image_datas.append(image_data)
            html_datas.append(html)
        return_data['image'] = image_datas
        return_data['html'] = html_datas
        return_data['content'] = content_datas
        return_data['url'] = topicurl
        return_data['title'] = topictitle
        return_data['author'] = topicauthor
        return_data['lastup'] = lastup
        # return_data[topicurl] = return_data
        return return_data
    except Exception as e:
        error_data[title['titleURL']] = e
        cr.mkjson(error_data, '/home/kyw/json_datas/highkorea',
                  'hkContent_error.json')
        pass
def main():
    # Instantiate the collector
    diretorio = sys.argv[1]
    projeto = sys.argv[2]
    createDir(projeto)
    # coletor = ColetorFT(projeto)
    coletor = Crawler(diretorio, projeto)
    coletor.start()
def main():
    first_url = raw_input(u"""Please enter the start URL: """)
    deepth = int(raw_input(u"""Please enter the crawl depth: """))
    examlpe_url = raw_input(u"""Please enter an example download link: """)
    crawler_model = Crawler(first_url, deepth, examlpe_url)
    crawler_model.start()
    for items in crawler_model.final_url_list:
        print items + '\n'
def getGamesInCertainRange(date_range, game_dict, data_collection, owner):
    start_date = time.strptime(date_range.split("-")[0], "%Y/%m/%d")
    end_date = time.strptime(date_range.split("-")[1], "%Y/%m/%d")
    res = {}
    rep = {}  # contains games that do not have enough owners
    print("start searching for the existing data...")
    raw_data_list = data_collection.find()
    for data in raw_data_list:
        if "release_date" in data and data["release_date"] != "Not released":
            release_date = time.strptime(
                str(data["release_date"][0]) + "/" +
                str(data["release_date"][1]) + "/" +
                str(data["release_date"][2]), "%Y/%m/%d")
            if time.mktime(release_date) >= time.mktime(start_date) and \
                    time.mktime(release_date) <= time.mktime(end_date):
                print("Found desired game!")
                app_data = Crawler.checkSteamSpy(data["appid"])
                if app_data is None or app_data["owners"] < int(owner):
                    print("Owners of game id " + str(data["appid"]) +
                          " is not enough! Deleting...")
                    rep[data["appid"]] = data["name"]
                    continue
                res[data["appid"]] = data["name"]
    print("start checking newly added games")
    for k in game_dict.keys():
        if k in res or k in rep:  # data already existed
            continue
        result = data_collection.find_one({"appid": k})
        if data_collection.count({"appid": k}) != 0 and \
                "release_date" in result and result["release_date"] != "Not released":
            release_date = time.strptime(
                str(result["release_date"][0]) + "/" +
                str(result["release_date"][1]) + "/" +
                str(result["release_date"][2]), "%Y/%m/%d")
        else:
            # Not in the raw database
            bsObj = Crawler.connectToSteamSpy(k)
            release_date = Crawler.getReleaseDateFromSteamSpy(bsObj)
            if release_date == "Not released":
                print("Appid: " + str(k) + " has not been recorded on SteamSpy")
                continue
        if time.mktime(release_date) >= time.mktime(start_date) and \
                time.mktime(release_date) <= time.mktime(end_date):
            print("Found desired game!")
            app_data = Crawler.checkSteamSpy(k)
            if app_data is None or app_data["owners"] < int(owner):
                print("Owners of game id " + str(k) + " is not enough! Deleting...")
                continue
            res[k] = game_dict[k]
    return res
def GetDumpsOfCrawler():
    pagData = Crawler.crawl('https://www.cnj.jus.br/sgt/versoes.php')
    soups = Crawler.getSoup(pagData)
    linkOfDumps = Crawler.GetUrlOfDumps(soups)
    dataDump = []
    for dump in linkOfDumps:
        data = Crawler.crawl(dump)
        dataSoup = Crawler.getSoup(data)
        if re.search("\d{1,2}\_dump_estrutura\.sql", dump):
            ExecuteQueryEstrutura(dataSoup.text)
        else:
            ExecuteQueryDados(dataSoup.text)
def task():
    while True:
        db_path = '/root/mysite/info.db'
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        cursor.execute('select openid,account,passwd from user where is_valid="True"')
        value = cursor.fetchall()
        for each in value:
            Crawler.invoke(each[0], each[1], each[2])
            time.sleep(10)
        with open('/root/mysite/task_log.txt', 'a') as f:
            f.write(str(time.strftime('%Y-%m-%d-%H-%M',
                                      time.localtime(time.time()))) + ':finish\r\n')
        time.sleep(2 * 60 * 60)
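# task() above assumes a sqlite "user" table with openid, account, passwd and
# is_valid columns. The helper below is only a sketch of a compatible schema,
# inferred from the SELECT statement; column types are guesses, not the
# original database definition.
import sqlite3

def init_user_table(db_path='/root/mysite/info.db'):
    # Create the table if it does not exist yet so task() has something to read.
    conn = sqlite3.connect(db_path)
    conn.execute('create table if not exists user '
                 '(openid text, account text, passwd text, is_valid text)')
    conn.commit()
    conn.close()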
def getTitles(session, object, forumurl):
    error_data = list()
    data = list()
    highkorea = object
    s = session
    m = re.compile('&start=(\d+)').search(forumurl)
    num = 1
    datPat = re.compile(
        '\(.\) (?P<month>\d{2}) (?P<day>\d{2}), (?P<year>\d{4}) '
        '(?P<hour>\d{1,2}):(?P<minute>\d{2}) (?P<noon>[a-z]{2})')
    if m:
        num = int(m.group(1))
        url = re.sub('&start=(\d+)', '', forumurl)
    else:
        url = forumurl
    for i in range(0, num, 25):
        tup = highkorea.staticGet(
            s, highkorea.stem + url.strip('.') + '&start={}'.format(i))
        s, soup = tup[0], tup[2]
        titles = soup.find_all("a", {"class": "topictitle"})
        dates = soup.find_all("dd", {"class": "lastpost"})
        authors = soup.select('li.row dl dt a.username-coloured')
        year, month, day, hour, minute, noon, unixtime = '', '', '', '', '', '', 0
        for date, tempTitle, tempAuthor in zip([date.text for date in dates],
                                               titles, authors):
            article = OrderedDict()
            m = datPat.search(date)
            try:
                titleURL = tempTitle.get('href')
                title = tempTitle.text
                author = tempAuthor.text
                if m:
                    month, day, year, hour, minute, noon = m.group("month"), m.group("day"), \
                        m.group("year"), m.group("hour"), m.group("minute"), m.group("noon")
                    hour = int(hour) + 12 if noon == 'pm' else int(hour)
                    d = datetime.datetime(int(year), int(month), int(day), hour, int(minute))
                    unixtime = int(time.mktime(d.timetuple()))
                else:
                    error_data.append(titleURL)
                article['titleURL'] = titleURL
                article['title'] = title
                article['author'] = author
                article['lastup'] = unixtime
                data.append(article)
            except:
                error_data.append(tempTitle.get('href'))
    cr.mkjson(error_data, '/home/kyw/json_datas/highkorea', 'hkTitle_error.json')
    return data
def __init__(self, bot, msg):
    self.bot = bot
    self.msg = msg
    # chat data
    self.chat_id = str(msg['chat']['id'])
    self.chat_type = msg['chat']['type']
    try:
        self.chat_title = str(msg['chat']['title'])
    except:
        self.chat_title = ''
    # message data
    self.msgt = msg.get('text')
    self.msg_id = msg['message_id']
    self.msg_s = msg['text'].split(' ')
    # user data
    self.uid = msg['from']['id']
    self.nick = msg['from']['username']
    self.group_id = str(-1001166468779)
    # instances
    self.scan = Scanners.Scans(msg)
    self.log = PyMyAdmin.LogManager(self.msg, self.uid, self.nick,
                                    self.chat_id, self.chat_title)
    self.crawler = Crawler.Crawlers(self.msg)
    self.generators = Generators.generators(self.msg)
    self.data = PyMyAdmin.Database('', '', '').get_statistic()
def getPageMorts(url, pageFilter):
    crawl = Crawler(url, pageFilter=pageFilter)
    urlsMort = []
    for page in crawl:
        print("HttpCode:%d Url: %s " % (page.codeHTTP, page.url))
        if page.codeHTTP not in range(200, 300):
            urlsMort.append((page.codeHTTP, page.url))
    # a new dictionary to store the results: {page containing dead links: [dead links]}
    pageParents = {}
    for url in urlsMort:
        for pageParent in crawl.pagesToCrawl_dict[url[1]]:
            if pageParent in pageParents:
                if url[0] in pageParents[pageParent]:
                    pageParents[pageParent][url[0]].append(url[1])
                else:
                    pageParents[pageParent][url[0]] = [url[1]]
            else:
                pageParents[pageParent] = {url[0]: [url[1]]}
    print "\nCrawl complete!\n"
    with open('liensMort.txt', 'w') as dump_file:
        for pageParent in pageParents:
            dump_file.write('In page:\n{}\n'.format(pageParent))
            dump_file.write('\n')
            codeHTTP = pageParents[pageParent].keys()
            codeHTTP.sort()
            for code in codeHTTP:
                dump_file.write('HTTP return code {}\n'.format(code))
                for url in pageParents[pageParent][code]:
                    dump_file.write(' {}\n'.format(url))
            dump_file.write('*' * 80 + '\n\n')
def level_chosen(self, event):
    level = self.level_choice.GetStringSelection()  # Get level chosen
    self.subject_choice.Clear()
    self.paper_checklist.Clear()
    self.year_choice.Clear()
    self.season_choice.Clear()
    self.num_choice.Clear()
    self.region_choice.Clear()
    self.pairs_info = {}
    self.files_info = {}
    if level == self.level_list[0]:  # Not choosing a level
        return
    # Cache
    global cache_folder
    cache_subject = os.path.join(cache_folder, "GCE Guide %s" % level)
    if not os.path.exists(cache_subject):
        self.subject_dict = Crawler.visit_level(
            Crawler.levels_dict[level])  # Return subject list
        if self.subject_dict == -1:  # Connection error
            wx.MessageBox(
                "Please check your Internet connection and retry.",
                "Connection Error")
            self.level_choice.SetSelection(0)
            return
        else:
            Cache.store(self.subject_dict, cache_subject)
    else:
        self.subject_dict = Cache.load(cache_subject)
    subject_list = ["----- Select subject -----"] + [each for each in self.subject_dict]
    self.subject_choice.Set(subject_list)  # Update subject list
    self.subject_choice.SetSelection(0)
def storeUrl4Comp():
    db = pymysql.connect("localhost", "root", "root", "kg4company_2", charset="utf8")
    cur = db.cursor()
    sql1 = u"select name from city;"
    reCount = cur.execute(sql1)  # returns the number of affected rows
    cities1 = cur.fetchall()
    sql2 = u"select name from industry;"
    cur.execute(sql2)
    cities2 = cur.fetchall()
    cities = cities1 + cities2
    sql3 = u"insert into urlList(url4comp) values('{name}');"
    cities = cities[71:]
    for city in cities:
        print city[0]
        for idx in range(1, 31):
            print('page: ' + str(idx))
            cwler = cw.Crawler()
            cwler.getCompUrl(idx, city[0])
            list_ = cwler.listComp
            if len(list_) == 0:
                break
            for li in list_:
                print li
                sqlN = sql3.format(name=li)
                cur.execute(sqlN)
            db.commit()
            time.sleep(2)
            print('page: ' + str(idx) + ' is done')
        time.sleep(2)
        print(city[0] + ' is done')
def level_chosen(self, event):
    self.init_choice()
    self.subject_choice.Clear()
    if self.level_choice.GetSelection() == 0:
        return
    else:
        level = self.level_choice.GetStringSelection()  # Get level chosen
        # Cache
        cache_folder = Cache.customized_directory()
        cache_subject = os.path.join(cache_folder, "GCE Guide %s" % level)
        if not os.path.exists(cache_subject):
            self.subject_dict = Crawler.visit_level(
                Crawler.levels_dict[level])  # Return subject list
            if self.subject_dict == -1:  # Connection error
                wx.MessageBox(
                    "Please check your Internet connection and retry.",
                    "Connection Error")
                self.level_choice.SetSelection(0)
                return
            else:
                Cache.store(self.subject_dict, cache_subject)
        else:
            self.subject_dict = Cache.load(cache_subject)
        subject_list = ["----- Select subject -----"] + [each for each in self.subject_dict]
        self.subject_choice.Set(subject_list)  # Update subject list
        self.subject_choice.SetSelection(0)
def findSimilar(self, link, limit):
    # we call the read text function from the Crawler to read the new link
    # we use the constructor with empty variables
    crawler = Crawler.Crawler('', 0, 0, 0)
    self.limit = limit
    file = open("Data/page%d.txt" % self.limit, 'w')
    try:
        self.title, text = crawler.getText(link)
        # we combine the list of strings into a single string
        text = ''.join(text)
        for t in text:
            file.write(t)
        file.close()
    except:
        print("Link is not accessible")
        file.close()
        sys.exit(0)
    indexer = Indexer.Indexer()
    indexer.start()
    cosineSimilarity = indexer.getCosineSimilarity()
    linksId = [i for i in range(self.limit)]
    linksIdSorted = [x for _, x in sorted(zip(cosineSimilarity, linksId), reverse=True)]
    return cosineSimilarity, linksIdSorted
def upload_file():
    if request.method == 'POST':
        # check if the post request has the file part
        if 'file' not in request.files:
            flash('No file part')
            return redirect(request.url)
        file = request.files['file']
        # if the user does not select a file, the browser may also
        # submit an empty part without a filename
        if file.filename == '':
            flash('No selected file')
            return redirect(request.url)
        if file and allowed_file(file.filename):
            filename = secure_filename(file.filename)
            file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
            redirect(url_for('upload_file', filename=filename))
            try:
                crawMobi = Crawler.crawler()
                crawMobi.menu()
                return render_template('download.html')
            except Exception as e:
                return render_template('error.html')
    return render_template('home.html')
def getTitles(session, object, forumurl):
    error_data = list()
    data = list()
    zion = object
    s = session
    m = re.compile('&start=(\d+)').search(forumurl)
    num = 1
    if m:
        num = int(m.group(1))
        url = re.sub('&start=(\d+)', '', forumurl)
    else:
        url = forumurl
    for i in range(0, num, 25):
        tup = zion.staticGet(
            s, zion.stem + url.strip('.') + '&start={}'.format(i))
        s, soup = tup[0], tup[2]
        if i == 0:
            forums = soup.find_all("a", {"class": "forumtitle"})
            if forums != []:
                forumURLs = [forum.get('href') for forum in forums]
                forumtitles = [forum.text for forum in forums]
                tup = getLastPage(s, zion, forumtitles, forumURLs)
                for x in tup[1].values():
                    internal_data = getTitles(s, zion, x)
                    data.extend(internal_data)
        titles = soup.find_all("a", {"class": "topictitle"})
        authors = soup.find_all("a", {"class": "username"}) if soup.find_all(
            "a", {"class": "username"}) != [] else soup.find_all(
                "a", {"class": "username-coloured"})
        for tempTitle, tempAuthor in zip(titles, authors):
            article = OrderedDict()
            try:
                titleURL = tempTitle.get('href')
                title = tempTitle.text
                author = tempAuthor.text
                article['titleURL'] = titleURL
                article['title'] = title
                article['author'] = author
                data.append(article)
            except:
                error_data.append(tempTitle.get('href'))
    if data == []:
        cr.mktxt(forumurl + '\n', '/home/kyw/json_datas/zion', 'zion_noTitle.txt')
    return data
def __init__(self, busca, distancia=5):
    self.ponto_inicial = 'http://www.rugbyfluminense.com.br/'
    self.distancia = 2
    self.crawler = Crawler.Crawler()
    self.criar_no = Criar_No.Criar_No
    self.catalogo = []
    self.busca = busca  # search term
    self.grafo = Grafo.Grafo()
def GetPage(url):
    while True:
        p = GetProxy()
        page = Crawler.CrawlSinglePage(url, proxy=p)
        if len(page) > 10:
            break
        InvalidProxy(p)
    return page
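# GetPage() above relies on GetProxy() and InvalidProxy(), which are not shown
# in this listing. The following in-memory sketch illustrates what they might
# do; the pool contents and function bodies are assumptions, not the original
# implementation.
import random

_proxy_pool = ['http://127.0.0.1:8080', 'http://127.0.0.1:8081']  # placeholder proxies

def GetProxy():
    # Return any proxy that has not been discarded yet.
    return random.choice(_proxy_pool) if _proxy_pool else None

def InvalidProxy(p):
    # Drop a proxy that produced an empty or broken page so GetPage retries another.
    if p in _proxy_pool:
        _proxy_pool.remove(p)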
def run(self): """ Start processing. """ # parse the command line arguments and set logging options try: self.args = self.parser.parse_args() self.configureLogging() self.logger.info("Started with {0}".format(' '.join(sys.argv[1:]))) except Exception as e: self.parser.print_help() sys.exit(e) # load the configuration file try: with open(self.args.config) as f: self.config.readfp(f) except Exception as e: self.logger.critical("Could not load the specified configuration file") sys.exit(e) # set options Cfg.LOG_EXC_INFO = self.args.trace # execute commands with Timer.Timer() as t: if self.args.crawl: import Crawler Crawler.crawl(self.config, self.args.update) if self.args.clean: import Cleaner Cleaner.clean(self.config, self.args.update) if self.args.infer: import Facter Facter.infer(self.config, self.args.update) if self.args.graph: import Grapher Grapher.graph(self.config, self.args.update) if self.args.transform: import Transformer Transformer.transform(self.config) if self.args.post: import Poster Poster.post(self.config) if self.args.analyze: import Analyzer Analyzer.analyze(self.config, self.args.update) self.logger.info("Indexer finished in {0}:{1}:{2}".format(t.hours, t.minutes, t.seconds))
def __init__(self, __textUrlData, __pageRankData):
    # instantiates data / service classes
    self.__textData = __textUrlData
    self.__pageRankData = __pageRankData
    self.__webscraper = WebScrape(self.__textData, self.__pageRankData)
    self.__crawler = Crawler(self.__textData)
    # loads all data structures by calling lower tier helper classes
    self.__generate_data_structures()
def init(username, password, sleepTime=600, configList=None):
    fileOld = 'old' + username + '.txt'
    fileNew = 'new' + username + '.txt'
    if configList:
        mailto_list = str(configList[3].strip('\n'))
        mail_host = str(configList[4].strip('\n'))
        mail_user = str(configList[5].strip('\n'))
        mail_pass = str(configList[6].strip('\n'))
        mail_postfix = str(configList[7].strip('\n'))
    else:
        mailto_list = raw_input("to email address, [email protected]:")
        mail_host = raw_input("smtp server, smtp.qq.com:")
        mail_user = raw_input("mailUser, [email protected]:")
        mail_pass = raw_input("mailPassword:")
        mail_postfix = raw_input("mailPostfix, qq.com:")
    while True:
        try:
            Crawler.init(username, password)
        except Exception as e:
            print(e)
            continue
        new = open(fileNew, 'U').readlines()
        try:
            old = open(fileOld, 'r').readlines()
        except:
            old = open(fileOld, 'w+').readlines()
        diff = difflib.ndiff(old, new)
        if not (filecmp.cmp(fileOld, fileNew)):
            print "not same"
            content = 'studentID= ' + username + ' \n'
            for i in diff:
                content += i
            print content
            mail = sendMail(content, mailto_list, mail_host, mail_user, mail_pass, mail_postfix)
            print "new grades"
            new = open(fileNew, 'r')
            old = open(fileOld, 'w')
            old.writelines(new.readlines())
        else:
            print "same"
        print "sleeping...@",
        print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        time.sleep(sleepTime)
def get_link():
    with open('/home/kyw/agoraHTMLnumber/agoraHTMLnumber.txt', 'a') as wf:
        agora = 'http://c2djzrn6qx6kupkn.onion/'
        with requests.Session() as s:
            agora = cr.Site(agora)
            tup = agora.staticGet(s, agora.stem)
            s, html = tup[0], tup[1]
            links = re.compile('href="(\d+\.html)').findall(html.text)
            wf.write(links[-1].strip('.html') + '\n')
            return links[-1]
def parse_home():
    try:
        indice_link = 0
        for url in HOME_URL:
            instancia_crawler_main = Crawler.Crawler(url, NUEVO_LINK[indice_link])
            instancia_crawler_main.crawler_main(indice_link)
            indice_link += 1
    except ValueError as ve:
        print(ve)
class SearchEngine:
    def __init__(self):
        indexer = Indexer()
        self.graph = Graph()
        self.crawler = Crawler({"http://mysql12.f4.htw-berlin.de/crawl/d01.html",
                                "http://mysql12.f4.htw-berlin.de/crawl/d06.html",
                                "http://mysql12.f4.htw-berlin.de/crawl/d08.html"},
                               self.graph, indexer)
        self.crawler.crawl()
        self.scorer = Scorer(indexer.index, indexer.documents)
        self.pageRank = PageRank(self.graph)
        self.pageRank.calc()

    def search(self, string, scoreOnly=False):
        query = string.split()
        scores = self.scorer.scoreQuery(query)
        if scoreOnly:
            results = scores
        else:
            results = {}
            for url, score in scores.items():
                results[url] = score * self.graph.get_document(url).rank
        sortedResults = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
        for res in sortedResults:
            print(res)

    def printPageRanks(self):
        print('Page ranks:')
        print(' d01 - d02 - d03 - d04 - d05 - d06 - d07 - d08')
        ranks = [round(self.graph.get_document(
            "http://mysql12.f4.htw-berlin.de/crawl/d0%d.html" % i).rank, 4)
            for i in range(1, 9)]
        print(*ranks, sep=' - ', end='\n\n')
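# A possible way to exercise the SearchEngine class above; the query string is
# a placeholder and the output depends entirely on the crawled HTW-Berlin test
# corpus, so this is illustrative rather than part of the original code.
if __name__ == "__main__":
    engine = SearchEngine()
    engine.printPageRanks()
    engine.search("example query")
    engine.search("example query", scoreOnly=True)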
def to_url(request):
    s = request.GET.get('text')
    s = s.split(" ")
    r = []
    for i in s:
        if i != " " and i != "":
            r.append(i)
    a = Crawler.Crawler(0, 1, r, '/home/katrin/databasetemp/')
    try:
        a.downloadPages()
    except Exception as e:
        pass
def __init__(self):
    indexer = Indexer()
    self.graph = Graph()
    self.crawler = Crawler({"http://mysql12.f4.htw-berlin.de/crawl/d01.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d06.html",
                            "http://mysql12.f4.htw-berlin.de/crawl/d08.html"},
                           self.graph, indexer)
    self.crawler.crawl()
    self.scorer = Scorer(indexer.index, indexer.documents)
    self.pageRank = PageRank(self.graph)
    self.pageRank.calc()
def fetch_url(url):
    # print "-> %r requesting" % (url)
    url = url.rstrip('\n')
    try:
        response = Crawler.data_request(url)
        # print "-> %r request complete" % (url)
        if (response is not None) and (response not in error_list):
            return url, response, None
        else:
            return url, None, response
    except Exception as e:
        return url, None, e
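# fetch_url() above returns (url, response, error) tuples, which suit a
# thread-pool fan-out. The wrapper below is an illustrative sketch, not part of
# the original code; fetch_all and its parameters are assumed names.
from concurrent.futures import ThreadPoolExecutor

def fetch_all(urls, workers=8):
    results = []
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for url, response, error in pool.map(fetch_url, urls):
            if response is not None:
                results.append((url, response))
            else:
                print("request failed for %s: %s" % (url, error))
    return results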
def test():
    c = Crawler("http://www.artsandscience.utoronto.ca/ofr/timetable/winter/csc.html")
    c.extract_course_code()
    c.course_init()
    file = FileSystem("course.txt", c.get_courses())
    file.writing()
    print "finished"
def findword():
    # Check whether the parameters were sent
    if (request.args.get('urls', default=[], type=str)
            and request.args.get('termo', default="error", type=str)):
        # Take the string with the URLs and turn it into a list of URLs
        array = request.args.get('urls', default=[], type=str)
        # Handling for basic errors
        array = array.replace("[", "")
        array = array.replace("]", "")
        array = array.replace("'", "")
        array = array.replace('"', "")
        array = array.split(",")
        # Check that each URL contains a dot and the http or https protocol
        for i in range(len(array)):
            # If there is no dot, fail
            if not (array[i].find('.') > -1):
                return root()
            # If the protocol is missing, prepend it to the string
            if not (array[i].find('http') > -1):
                array[i] = 'http://' + (array[i].strip())
        # Assign the search term
        word = request.args.get('termo', default="", type=str)
        # Check that it is not empty
        if (not word.strip()):
            return root()
        # Check and assign the optional ignorecache parameter
        ignorecache = request.args.get('ignorecache', default='False', type=str)
        if (ignorecache == 'False' or ignorecache == 'false'
                or ignorecache == '0' or ignorecache == ''):
            return jsonify(Crawler.spider(array, word, len(array), False))
        # Any value other than a missing parameter or "false" passes True
        return jsonify(Crawler.spider(array, word, len(array), True))
    # If the parameters are not passed, return the page with the parameter reference
    return root()
    # Unreachable debug output after the return above
    print("\n" * 5)
    print("TEMPO API ASYNC")
    print("--- %s seconds ---" % (time.time() - start_time))
def crawl_urls(random_url_list, web_df, keyword_df, i, base_dict, save_after_itr,
               url_jump_lag, num_retries, wait_range):
    status_flag = ''
    random_urls_sub = random_url_list[i:i + save_after_itr]
    html_dict = venom.download_randomly(random_urls_sub,
                                        url_jump_lag=url_jump_lag,
                                        num_retries=num_retries,
                                        wait_range=wait_range)
    list_of_dicts = []
    for domain_url, domain_data in html_dict.items():
        # link_tag, link_class, text_tag, text_class = '', '', '', ''
        html = list(domain_data.values())[0]
        domain_name = list(domain_data.keys())[0]
        href = ''
        visible_text = ''
        status_flag = ''
        if html is not None:
            href, visible_text = get_data_from_html(web_df, html, domain_name, base_dict)
        else:
            print('Web page not scraped:', domain_url)
            status_flag = 'Access Denied'
        list_of_dicts.append({
            "Domain Name": domain_name,
            "URL": domain_url,
            "Product Link": href,
            "Product Description": visible_text,
            'Status Flag': status_flag
        })
    crawled_df = pd.DataFrame(list_of_dicts)
    if crawled_df.empty or (crawled_df is None):
        # Create an empty dataframe
        crawled_df = pd.DataFrame(columns=[
            'Domain Name', 'Product Description', 'Product Link', 'URL',
            'MATERIAL_NO', 'MANUFACTURER_NAME_1', 'MANUFACTURER_PT_NO_1',
            'NOUN_ENGLISH', 'MODIFIER_ENGLISH', 'Keywords', 'Status Flag'
        ])
    else:
        crawled_df['Product Description'].fillna('', inplace=True)
        crawled_df['Product Link'].fillna('', inplace=True)
        crawled_df = crawled_df.merge(keyword_df, on='URL', how='left')
        crawled_df.drop('Search_URL', axis=1, inplace=True)
    return crawled_df
def ProcessTask(self, task):
    crawlerParams = {
        'board': task['board'],
        'blFromJson': True,
        'start': -(task['searchPages']),
        'end': -1
    }
    crawler = Crawler.PttWebCrawler(crawlerParams)
    articles = crawler.parse_articles()
    if self.LOW_PRICE_TASK == task['type']:
        self.HandleLowPriceTask(articles, task)
def doSongciCrawler():
    content = urllib2.urlopen(Crawler.songurl).read()
    # print content
    parser = Crawler.MyParser()
    parser.feed(content)
    print len(parser.linkList)
    print len(parser.linkDescList)
    linkDescList = parser.linkDescList
    linkList = parser.linkList
    # ind = 0
    savedCnt = 0
    failedCnt = 0
    # len(linkList)
    for ind in range(len(linkList)):
        item = linkList[ind]
        print Crawler.base_url + item
        guwenPage = urllib2.urlopen(Crawler.base_url + item).read()
        newParser = Crawler.MyParser()
        newParser.feed(guwenPage)
        guwenContNow = newParser.songciCont
        res = Crawler.songciContHandle(guwenContNow)
        for item in res:
            if len(item) > 0:
                try:
                    songci = Songci()
                    songci.set('allStr', ' '.join(item))
                    songci.set('category', linkDescList[ind].strip())
                    songci.set('name', item[0].strip())
                    songci.set('author', item[1].strip())
                    songci.set('content', item[2].strip())
                    songci.save()
                    savedCnt += 1
                except Exception as e:
                    failedCnt += 1
                    # if str(e).find('LeanCloudError: [137] A unique field was given a value that is already taken.') is -1:
                    print e
                    print ' '.join(item), 'saving failed'
def getText(url):
    # Visit the link and fetch the news content
    html = Crawler.askUrl(url)
    if html is None:
        return None
    soup = BeautifulSoup(html, "html.parser")
    contents = soup.find_all('span', {'class': 'bjh-p'})
    if len(contents) == 0:
        contents = soup.find_all('p')
    text = ''
    for i in range(0, len(contents)):
        text += contents[i].text
    return text
def main(): print("Starting our Web Crawler") baseUrl = input("Website > ") numberOfThreads = input("No Threads > ") linksToCrawl = queue.Queue() urlLock = threading.Lock() linksToCrawl.put(baseUrl) haveVisited = [] crawlers = [] errorLinks = [] with open("links.txt", "w+") as f: for i in range(int(numberOfThreads)): crawler = Crawler(baseUrl, linksToCrawl, haveVisited, errorLinks, urlLock, f) crawler.run() crawlers.append(crawler) for crawler in crawlers: crawler.join() print("Total Number of Pages Visited {}".format(len(haveVisited))) print("Total Number of Pages with Errors {}".format(len(errorLinks)))
def agoraMultiCrawler(new_html):
    with requests.Session() as s:
        return_data = OrderedDict()
        agora = 'http://c2djzrn6qx6kupkn.onion/'
        agora = cr.Site(agora)
        # if i == 0:
        #     tup = agora.staticGet(s, agora.stem)
        #     s, html, soup = tup[0], tup[1].text, tup[2]
        tup = agora.staticGet(s, agora.stem + "/{}.html".format(new_html))
        s, html, soup = tup[0], tup[1].text, tup[2]
        messages = soup.find_all("div", {"class": "message"})
        labels = soup.find_all("label")
        ids = soup.find_all("span", {"class": "reflink"})
        content_data = list()
        for id, label, message in zip(ids, labels, messages):
            temp_data = OrderedDict()
            posterman = label.find("span", {"class": "postername"}).get_text().encode(
                'iso-8859-1').decode('utf-8').strip('\n') if label.find(
                    "span", {"class": "postername"}) is not None else None
            filetitle = label.find("span", {"class": "filetitle"}).get_text().encode(
                'iso-8859-1').decode('utf-8').strip('\n') if label.find(
                    "span", {"class": "filetitle"}) is not None else None
            for lab in label("span"):
                lab.decompose()
            mid = id.find_all('a')[-1].get_text()
            date = label.get_text().encode('iso-8859-1').decode('utf-8').strip('\n').strip(' ')
            ms = message.get_text().encode('iso-8859-1').decode('utf-8').strip('\n')
            temp_data['author'] = posterman
            temp_data['title'] = filetitle
            temp_data['id'] = mid
            temp_data['date'] = date
            temp_data['message'] = ms
            content_data.append(temp_data)
        return_data['html'] = html
        return_data['content'] = content_data
        return_data['url'] = tup[1].url
        return return_data
parser.set_defaults(map_save_to_dot=MAP_SAVE_TO_DOT)
args = parser.parse_args()

print("Running configuration:")
for attr, value in sorted(args.__dict__.iteritems()):
    print("\t{0} = {1}".format(attr, value))

c = Crawler(args.language, args.currency, args.user_place, args.departure_point,
            args.ignored_points, args.departure_month, args.departure_year,
            args.price_limit, args.flights_limit, args.selenium_host,
            args.selenium_port, args.selenium_start_cmd, args.selenium_load_timeout,
            args.map_save_to_dot, args.map_dot_filename, args.output_encoding)
try:
    c.create_map()
    c.analyze_map()
except KeyboardInterrupt:
    print("Ctrl-C pressed...")
except SeleniumError as err:
    print(str(err))
def handleRule(rule):
    ic = ItemCrawler()
    db_avg_cost = DB.get_avg_price(rule.link)
    Log("{} for less than {} Chaos".format(rule.name, (PRICE_MULTIPLICATOR * db_avg_cost)),
        "CURRENT SCAN", "blue", 0)
    hits = ic.hits(rule.link)
    if not rule.enabled:
        return
    isFirst = True
    hitCnt = 0
    hitSum = 0
    for hit in hits:
        cost = Crawler.getCostFromEntry(hit)
        ign = Crawler.getIGNFromEntry(hit)
        (corrupted, item_name) = Crawler.getItemNameFromEntry(hit)
        item_print = item_name
        if not item_name:
            item_name = "<NOT FOUND>"
        else:
            if corrupted:
                item_print = Fore.RED + "Corrupted" + Fore.RESET + " " + item_name
        chaos_cost = Currency.getChaosValue(cost)
        # As long as no item has been found yet, db_avg_cost is 0 and so is
        # db_avg_cost * PRICE_IGNORE_MIN. The same applies to PRICE_IGNORE_MAX.
        if not db_avg_cost:
            db_avg_cost = chaos_cost
        if chaos_cost < db_avg_cost * PRICE_IGNORE_MIN or chaos_cost > db_avg_cost * PRICE_IGNORE_MAX:
            Log("Someone is selling {} ({}) for {} chaos!!".format(item_print, rule.link, chaos_cost),
                "TROLL ALERT", "red", 0)
            continue
        if hitCnt < MAX_HITS_FOR_AVG:
            hitSum += chaos_cost
            hitCnt += 1
        # if chaos_cost <= rule.price and isFirst:
        if chaos_cost <= (PRICE_MULTIPLICATOR * db_avg_cost):
            Log("Schnappaaah : ", "INFO", "yellow", 0)
            Log(rule.name, "RULE", None, 1)
            Log(item_print, "ITEM", None, 1)
            Log(rule.link, "LINK", None, 1)
            Log(cost, "COST", None, 1)
            Log(chaos_cost, "CHAOS COST", None, 1)
            Log("@{} Hi, I would like to buy your {} listed for {} in {}\n".format(
                ign, item_name, cost, LEAGUE), "WHISPER", None, 1)
            if rule.alert:
                os.system("start C:\\Users\\Ramon\\workspace\\git\\rockwurst\\testsound.mp3")
            isFirst = False
    if len(hits) > 0:
        hitAvg = hitSum / hitCnt
        Log("took the first {} items with an average of {} chaos".format(hitCnt, hitAvg),
            "SCAN RESULT", "white", 0)
        DB.insert_scan(rule.link, hitAvg)
from scipy.io import loadmat, savemat
from scipy.misc import imread, imsave
import cPickle
import subprocess
import re
import glob
from Crawler import *
from ModalDB import *
from Settings import Settings

# Crawl videos
if True:
    c = Crawler()
    q = 'how+to+hard+boil+an+egg'
    VIDS = c.searchYoutube(q.decode('utf-8'), 5)
    print VIDS
    print 'Download Them'
    c.downloadVideos()
    c.getSubtitles()
    with open('ids.bn', 'wb') as fp:
        cPickle.dump(VIDS, fp)
# VIDS = cPickle.load(open('ids.bn','rb'))

import os

# here, we specify the DB's location (on disk) and schema.
modaldb_client = ModalClient(root=Settings.data_dir, schema=Settings.my_schema)
# modaldb_client.clear_db()  # empty the database just in case.

# Pushing videos
for vid_id in VIDS: