def main(): """TODO... ADD LOGGING FOR: 1. Process start 2. Collection/write start 3. Collection/write end 4. Process end """ start_time = time.time() print(f'Starting {PROJECT_NAME}') # No need to hit API to collect data if we can just use the sample data if not LOCAL: # Authenticate Reddit w/ credentials reddit = Reddit() # Retrieve top posts from subreddit post_data = reddit.collect_data() # Dump post data to file as JSON write_data(post_data) else: print('LOCAL: True; skipping PRAW to use sample-reddit.json') # Download images using data recently saved in JSON file Downloader.download_images() time_elapsed = round(time.time() - start_time, 3) print(f'Finished {PROJECT_NAME} in {time_elapsed} seconds.')
def main(): #docopt parse the args arguments = docopt(__doc__, version='0.1.1-dev') #TODO get the version from __init__.py host = arguments.get('--host', 'mafreebox.free.fr') port = arguments.get('--port', '80') password = arguments.get('--password', None) if not password: password = getpass('Freebox password: ') client_mode = arguments.get('client', False) download_mode = arguments.get('download', False) #login fbx_client = freeboxClient(host, port, None, password)#params are host, port, username, password login_success = fbx_client.login() if client_mode: print "login success : %s" % login_success #if download, call the downloader if download_mode: path_or_url = arguments.get('PATH_OR_URL', None) if not path_or_url: print u"Give a torrent file path or url" #now we will find out if path_or_url is a path or an url torrent_file = None torrent_url = None if os.path.isfile(path_or_url): torrent_file = path_or_url else: torrent_url = path_or_url downloader = Downloader(fbx_client) downloader.add_file_to_download(torrent_file, torrent_url)
class DownloaderTestCase(unittest.TestCase): master_playlist_url = 'https://rtsvodww-vh.akamaihd.net/i/1998/vers/vers_19980407_standard_vers_1998-04-07_Arch00_094728-,100k,700k,1200k,.mp4.csmil/master.m3u8' index_playlist_url = 'https://rtsvodww-vh.akamaihd.net/i/1998/vers/vers_19980407_standard_vers_1998-04-07_Arch00_094728-,100k,700k,1200k,.mp4.csmil/index_2_av.m3u8' def setUp(self): self.downloader = Downloader() def test_get_master_playlist_url(self): page_url = 'https://www.rts.ch/archives/tv/culture/verso/4716197-gribouille-en-metro.html' self.assertEqual(self.downloader.get_master_playlist_url(page_url), self.master_playlist_url) def test_get_index_playlist_url(self): self.assertEqual( self.downloader.get_index_playlist_url(self.master_playlist_url), self.index_playlist_url ) def test_get_segment_urls(self): urls = self.downloader.get_segment_urls(self.index_playlist_url) num_segments = 33 self.assertEqual(len(urls), num_segments) for i in range(num_segments): self.assertEqual(urls[i], 'https://rtsvodww-vh.akamaihd.net/i/1998/vers/vers_19980407_standard_vers_1998-04-07_Arch00_094728-,100k,700k,1200k,.mp4.csmil/segment' + str(i+1) + '_2_av.ts')
def __init__(self, file_info): Downloader.__init__(self) self.start = None self.end = None self.file_info = file_info self.file = None self.cal_offset() self.downloaded_byte = 0
def download(dataset): log('Download dataset {}...'.format(dataset['id'])) if dataset['type'] == 'url': Downloader(CurlDownloader(), dataset['localFolder'], dataset['sourceUrls']).perform() elif dataset['type'] == 'kaggle': Downloader(KaggleDownloader(), dataset['localFolder'], dataset['sourceUrls']).perform() log('Done!')
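# A usage sketch for download() above: only the 'id', 'type', 'localFolder' and 'sourceUrls' keys
# are read, and 'url' vs 'kaggle' selects the downloader implementation. The concrete values below
# are made up for illustration.
sample_dataset = {
    'id': 'example-dataset',
    'type': 'url',                          # 'url' -> CurlDownloader, 'kaggle' -> KaggleDownloader
    'localFolder': 'data/example-dataset',
    'sourceUrls': ['https://example.com/data.csv'],
}
# download(sample_dataset)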
def main(): login() #songsList = getSongsFromInternet() songsList = getSongsFromLocalarea("songsList.txt") #partition(songsList) downloader = Downloader(songsList) downloader.download(3, threadNum=10) #print(songsList) print("done.")
def __init__(self, url_q, wb_q, info_q): self.url_count = 1 self.handle_urls = set() self.url_suf = 'http://weibo.com' self.url_q = url_q self.wb_q = wb_q self.info_q = info_q self.downloader = Downloader() self.validater = Validater()
def no_proxy(): if request.method == 'POST': title = request.form["ocw"] long_title = media[title].split('/')[-1] filepath = urljoin("downloads/", long_title) downloader = Downloader() downloader.get(media[title], filepath) flash('Successfully downloaded {0}'.format(long_title)) return render_template('proxy.html', page_title="No proxy", action="no_proxy")
def test_download_to_dir(self): dir = tempfile.mkdtemp() try: d = Downloader(dir) with util.CaptureStdout(): with d.download('file://' + __file__) as f: filename = f finally: shutil.rmtree(dir) self.assertEqual(dir, os.path.dirname(filename))
def test_file_removed_on_exception(self): try: d = Downloader() with util.CaptureStdout(): with d.download('file://' + __file__) as f: filename = f raise TestException() except TestException: pass self.assertFalse(os.path.exists(filename))
def main(): # tie_url = 'http://tieba.baidu.com/p/4774287212' tie_url = input('Enter the post url: ') tiebaimg = TiebaImage() img_url_list = tiebaimg.getImgUrls(tie_url) if not img_url_list: print('No image urls could be parsed!!!') return downloader = Downloader() downloader.start(img_url_list)
def __init__(self): if not os.path.isdir(PLAYLIST_DIR): print(f'No config found at {PLAYLIST_DIR}') exit(1) if not os.path.isdir(DOWNLOADS_DIR): os.mkdir(DOWNLOADS_DIR) self.downloader = Downloader(DOWNLOADS_DIR) self.audio = None
def run(self): while self.current < self.end: self.printCurrent() # Load pageview data artDict = self.buildArtDict() print artDict # Process pageview data dl = Downloader(self.current,self.current+timedelta(days=1)) dl.run(artDict) self.updateCurrentTime()
def test_rule(url, regexp=''): download = Downloader() html1 = download.get(url) #print html1 text1 = process_selector(selector, html1.text) md51 = md5(text1.encode('utf-8')) html2 = download.get(url) text2 = process_selector(selector, html2.text) md52 = md5(text2.encode('utf-8')) if md51 == md52: print 'md5 is same' else: print md51, md52
def download(url='', title='', artist='', gender='', album=''): cleanMp3s() url = request.form['url'] title = request.form['title'] artist = request.form['artist'] gender = request.form['gender'] album = request.form['album'] downloader = Downloader(url, title, artist, gender, album) try: path = downloader.download() except IOError as e: return str(e) return send_from_directory(os.path.abspath('.'), path, as_attachment=True)
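# The Flask view above expects url/title/artist/gender/album as form fields and returns the file
# as an attachment. A client-side sketch using the requests library; the '/download' route and the
# localhost:5000 host are assumptions, since the route decorator is not part of the snippet.
import requests

resp = requests.post('http://localhost:5000/download', data={
    'url': 'https://example.com/some-track',
    'title': 'Example title',
    'artist': 'Example artist',
    'gender': 'Pop',               # the form field is named 'gender' (likely meaning genre)
    'album': 'Example album',
})
with open('example.mp3', 'wb') as f:
    f.write(resp.content)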
def get_data(url): print(url) down = Downloader(headers=headers_home) path = 'cache/hz.meituan.com/index.html' if os.path.exists(path): os.remove(path) uuid = get_uuid('http://hz.meituan.com/', down) if not uuid: return data = {} type_ = 'c' + url.split('/c')[-1][:-1] print(type_) cateId = type_[1:] areaId = '-1' # print(cateId, areaId) data['FIRST_LEVEL_DIRECTORY'] = '生活服务' data['SECOND_LEVEL_DIRECTORY'] = class_[type_] down.headers = headers_get index = 0 while True: index = index + 1 down.headers['Referer'] = url + '/' + 'pn' + str(index) + '/' url_get = 'http://apimobile.meituan.com/group/v4/poi/pcsearch/50?uuid='+uuid+'&userid=-1&limit=32&offset='+str((index-1)*32)+'&cateId='+cateId+'&areaId='+areaId html = down(url_get) try: search_result = json.loads(html)['data']['searchResult'] except Exception as e: print('in get_data error ',e) break # search_result would be undefined below, so stop paging if search_result == []: print('search_result is None') break # print(search_result) for one_item in search_result: data['SHOP_ID'] = one_item['id'] data['SHOP_PHOTOS'] = one_item['imageUrl'] data['SHOP_NAME'] = one_item['title'] data['ADDRESS'] = one_item['address'] data['RANK_STARS'] = one_item['avgscore'] data['AVG_PRICE_TITLE'] = one_item['avgprice'] tuangou = one_item['deals'] if not tuangou: data['GROUP_BUYING_NUMBER'] = 0 data['GROUP_BUYING'] = None else: data['GROUP_BUYING_NUMBER'] = len(tuangou) taocan = '' for one in tuangou: taocan = taocan + '价格' + str(one['price']) + ' 门市价' + str(one['value']) + ' 出售' + str(one['sales']) data['GROUP_BUYING'] = taocan db.insert_into(data)
def downloadlink(url='', title='', artist='', gender='', album=''): cleanMp3s() url = request.form['url'] title = request.form['title'] artist = request.form['artist'] gender = request.form['gender'] album = request.form['album'] downloader = Downloader(url, title, artist, gender, album) path = downloader.download() dir = 'files/' if not os.path.exists(dir): os.makedirs(dir) newpath = dir + path os.rename(path, newpath) return '<a href="/' + newpath + '">' + newpath + '</a>'
def threaded_crawler(max_threads=10): # urls that still need to be crawled crawl_queue = [seed_url] # urls that have been seen seen = set([seed_url]) D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout) def process_queue(): while True: try: url = crawl_queue.pop() except IndexError: # empty queue break else: html = D(url) threads = [] while threads or crawl_queue: for thread in threads: if not thread.is_alive(): threads.remove(thread) while len(threads) < max_threads and crawl_queue: # can start some more threads thread = threading.Thread(target=process_queue) thread.setDaemon(True) thread.start() threads.append(thread) time.sleep(SLEEP_TIME)
def __init__(self, master=True): threading.Thread.__init__(self) self.pagestore = PageStore() self.downloader = Downloader(); self.connection = Connection(MONGODB_HOST, MONGODB_PORT) db = self.connection.download if master: db.drop_collection('downurl') for f, tb in ((SAVE_URL_RE_BLACK, 'save_url_black'), (SAVE_URL_RE_WHITE, 'save_url_white'), (DOWN_URL_RE_BLACK, 'down_url_black'), (DOWN_URL_RE_WHITE, 'down_url_white')): if os.path.exists(f): db.drop_collection(tb) logger.info('load rule:%s...' % f) for s in set(open(f).readlines()): s = s.strip() if s: db[tb].insert({'pattern': s}) logger.info('load rule:%s...OK' % f) self.downurl, self.allurl, self.watchurl, self.updateurl, self.secceedurl = db.downurl, db.allurl, db.watchurl, db.updateurl, db.secceedurl self.save_url_black = self.load_re(db.save_url_black) self.save_url_white = self.load_re(db.save_url_white) self.down_url_black = self.load_re(db.down_url_black) self.down_url_white = self.load_re(db.down_url_white) if master: self.load_watch_url() self.load_update_url() self.reload_allurl() logger.info('allurl:%d' % self.allurl.find().count()) logger.info('secceedurl:%d' % self.secceedurl.find().count()) logger.info('updateurl:%d' % self.updateurl.find().count()) logger.info('watchurl:%d' % self.watchurl.find().count()) logger.info('downurl:%d' % self.downurl.find().count())
def install_from_url(url, install_location): if not os.path.isdir(install_location): raise DictError("Specified path is not a valid directory") if not os.access(install_location, os.W_OK): raise DictError("User not allowed to write to specified directory") if os.path.isdir(UNZIPPED_TEMP): shutil.rmtree(UNZIPPED_TEMP) if os.path.isdir(TEMP_DICT): shutil.rmtree(TEMP_DICT) os.mkdir(TEMP_DICT) os.mkdir(UNZIPPED_TEMP) #file_name = TEMP_DICT + "/mod.zip" #file_name = download_file(url) #file_name = Downloader(url, "") downloader = Downloader(url, TEMP_DICT) downloader.show() downloader.exec_() file_name = str(downloader.file_name) unzip2(file_name, UNZIPPED_TEMP) up_one = False for path, dirs, files in os.walk(UNZIPPED_TEMP): for f in reversed(dirs): print("In folder: " + f) if f == "GameData": up_one = True tree = path + "/" + f print("GameData Detected, copying to GameData folder and deleting: " + tree) copy_and_delete_tree(tree, install_location) if up_one: print("Upped one\n") copytree(UNZIPPED_TEMP, install_location + "/..") else: copytree(UNZIPPED_TEMP, install_location) os.remove(file_name) print("Finished\n")
def graph(pageid): """ For a given page ID pageid, create a graph based on the JSON file of the HTML. """ D = Downloader() html = D("http://graph.facebook.com/" + pageid) return json.loads(html)
def unzip(self, downloader: Downloader): """Unzips the package(s) for this project. In the Project class there is only one package but derived classes can specify more than one package to unzip. Parameters ---------- downloader : Downloader The downloader that was used to download the package(s). """ split_name = self.name.split("/") if len(split_name) == 1: downloader.unzip(split_name[0]) else: downloader.unzip(split_name[1])
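# Illustration of the name handling in unzip() above: a plain name is passed through unchanged,
# while an "organization/name" style name uses its second path component. A self-contained sketch
# of just that rule.
def package_to_unzip(name):
    split_name = name.split("/")
    return split_name[0] if len(split_name) == 1 else split_name[1]

assert package_to_unzip("pugixml") == "pugixml"
assert package_to_unzip("Ishiko/Errors") == "Errors"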
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, user_agent='wswp', proxies=None, num_retries=1, scrape_callback=None, cache=None): """Crawl from the given seed URL following links matched by link_regex """ # the queue of URL's that still need to be crawled crawl_queue = [seed_url] # the URL's that have been seen and at what depth seen = {seed_url: 0} # track how many URL's have been downloaded num_urls = 0 rp = get_robots(seed_url) D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, cache=cache) while crawl_queue: url = crawl_queue.pop() depth = seen[url] # check url passes robots.txt restrictions if rp.can_fetch(user_agent, url): html = D(url) links = [] if scrape_callback: links.extend(scrape_callback(url, html) or []) if depth != max_depth: # can still crawl further if link_regex: # filter for links matching our regular expression links.extend(link for link in get_links(html) if re.match(link_regex, link)) for link in links: link = normalize(seed_url, link) # check whether already crawled this link if link not in seen: seen[link] = depth + 1 # check link is within same domain if same_domain(seed_url, link): # success! add this new link to queue crawl_queue.append(link) # check whether have reached downloaded maximum num_urls += 1 if num_urls == max_urls: break else: print 'Blocked by robots.txt:', url
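# A hedged invocation sketch for link_crawler() above, based only on its signature; the module
# path in the import and the seed URL / regex values are placeholders, not something the snippet
# specifies.
if __name__ == '__main__':
    from link_crawler import link_crawler   # hypothetical module path
    link_crawler('http://example.webscraping.com', link_regex='/(index|view)',
                 delay=3, max_depth=2, user_agent='wswp')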
def crawl(self): for conf in config_lists: for url in conf['urls']: resp = Downloader().download(url, conf) if resp: proxy_list = PageParser().parse(resp, conf) print(proxy_list) print('Validating proxy availability') valid_many(proxy_list, 'spider')
def main(reparse=False): """Main entry point for this ETL process. Downloads, updates db, stores the nightly data. This is the binary to run from a cron job. """ os.chdir(os.path.dirname(__file__)) logger = log.logger() logger.info('Starting ETL of FBO Nightly data.') # Figure out where we put data datadir = get_datadir() dbdir = get_dbdir() if not os.path.exists(os.path.join(dbdir, "sqlite3")): os.makedirs(os.path.join(dbdir, "sqlite3")) # Get a database connection, create db if needed db = model.FBO( "development", db_conf_file=os.path.join( dbdir, "dbconf.yml")) # Make sure the db schema is up to date, create tables, etc. db.migrate() assert os.path.exists(datadir) # Download raw data files dloader = Downloader(datadir, db, 'nightly') dloader.download(fname_urls, True) # Do our ETL nights = Nightlies(db) nights.etl_from_dir(reparse=reparse) # Close the db connection db.close() logger.info('Finished ETL of FBO data.')
def install_ampl(filename, **kwargs): if installed('ampl'): return dir = filename.replace('.tgz', '') url = 'http://ampl.com/demo/' + filename install_dir = kwargs.get('install_dir', opt_dir) with Downloader(kwargs.get('download_dir', '.')).download(url) as f: with closing(tarfile.open(f, 'r:gz')) as archive: archive.extractall(install_dir) add_to_path(os.path.join(install_dir, dir, 'ampl')) add_to_path(os.path.join(install_dir, dir, 'ampl.lic'))
def search(keyword): D = Downloader() url = 'https://www.google.com/search?q=' + urllib.quote_plus(keyword) html = D(url) tree = lxml.html.fromstring(html) links = [] for result in tree.cssselect('h3.r a'): link = result.get('href') qs = urlparse.urlparse(link).query links.extend(urlparse.parse_qs(qs).get('q', [])) return links
def threaded_crawler(seed_url, delay=5, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60, scrape_callback=None, cache=None): """Crawl this website in multiple threads """ #crawl_queue = Queue.deque([seed_url]) crawl_queue = [seed_url] seen = set([seed_url]) D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout) def process_queue(): while True: try: url = crawl_queue.pop() except IndexError: break else: html = D(url) if scrape_callback: try: links = scrape_callback(url, html) or [] except Exception as e: print 'Error in callback for: {}: {}'.format(url, e) else: for link in links: link = normalize(seed_url, link) print link if link not in seen: seen.add(link) crawl_queue.append(link) threads = [] while threads or crawl_queue: for thread in threads: if not thread.is_alive(): threads.remove(thread) while len(threads) < max_threads and crawl_queue: thread = threading.Thread(target=process_queue) thread.setDaemon(True) thread.start() threads.append(thread) time.sleep(SLEEP_TIME)
def install_maven(**kwargs): if installed('mvn'): return # 3.2.5 is the most recent version of Maven compatible with Java 6. dir = 'apache-maven-3.2.5' url = 'http://mirrors.sonic.net/apache/maven/maven-3/3.2.5/binaries/{0}-bin.tar.gz'.format( dir) install_dir = kwargs.get('install_dir', opt_dir) with Downloader(kwargs.get('download_dir', '.')).download(url) as f: with closing(tarfile.open(f, 'r:gz')) as archive: archive.extractall(install_dir) add_to_path(os.path.join(install_dir, dir, 'bin', 'mvn'))
def __init__(self): self.today = time.strftime("%Y-%m-%d",time.localtime(time.time())) self.urllogpath = "../data/url" os.system("mkdir -p %s" % self.urllogpath) self.urllog = "../data/url/" + "downloadedurl_" + self.today + ".txt" self.subpagepath = "../data/subpagepath" os.system("mkdir -p %s" % self.subpagepath) self.baseurl = BASEURLS self.suburl = {} self.downloader = Downloader() self.html2db = Html2db()
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxies=None, num_retries=1, scrape_callback=None, cache=None): crawl_queue = Queue.deque([seed_url]) seen = {seed_url: 0} num_urls = 0 # rp = get_robots(seed_url) D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, cache=cache) thrtl = throttle.Throttle(delay) headers = headers or {} if user_agent: headers['User-agent'] = user_agent while crawl_queue: url = crawl_queue.pop() if True: # rp.can_fetch(user_agent, url): print url thrtl.wait(url) html = D(url) links = [] if scrape_callback: links.extend(scrape_callback(url, html) or []) depth = seen[url] if depth != max_depth: if link_regex: links.extend(link for link in get_links(html) if re.match(link_regex, link)) for link in links: link = normalize(seed_url, link) if link not in seen: seen[link] = depth + 1 # if same_domain(seed_url, link): crawl_queue.append(link) num_urls += 1 if num_urls == max_urls: break else: print 'Blocked by robots.txt:', url
def threaded_crawler(seed_url, delay=5, cache=None, scrape_callback=None, user_agent='wswp', proxies=None, num_retries=1, max_threads=10, timeout=60): """Crawl this website in multiple threads """ # the queue of URL's that still need to be crawled #crawl_queue = Queue.deque([seed_url]) crawl_queue = [seed_url] # the URL's that have been seen seen = set([seed_url]) D = Downloader(cache=cache, delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries, timeout=timeout) def process_queue(): while True: try: url = crawl_queue.pop() except IndexError: # crawl queue is empty break else: html = D(url) if scrape_callback: try: links = scrape_callback(url, html) or [] except Exception as e: print 'Error in callback for: {}: {}'.format(url, e) else: for link in links: link = normalize(seed_url, link) # check whether already crawled this link if link not in seen: seen.add(link) # add this new link to queue crawl_queue.append(link) # wait for all download threads to finish threads = [] while threads or crawl_queue: # the crawl is still active for thread in threads: if not thread.is_alive(): # remove the stopped threads threads.remove(thread) while len(threads) < max_threads and crawl_queue: # can start some more threads thread = threading.Thread(target=process_queue) thread.setDaemon(True) # set daemon so main thread can exit when receives ctrl-c thread.start() threads.append(thread) # all threads have been processed # sleep temporarily so CPU can focus execution on other threads time.sleep(SLEEP_TIME)
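# Sketch of a scrape_callback compatible with threaded_crawler() above: the crawler calls
# scrape_callback(url, html) and enqueues whatever list of links it returns. This regex-based
# callback is an illustration, not the callback the original project used.
import re

def scrape_callback(url, html):
    if not html:
        return []
    return re.findall(r'href=["\'](.*?)["\']', html)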
def process(rules): for rule in rules: download = Downloader() html = download.get(rule.url) if html is None: logger.error('%s is unreachable'%rule.corp) continue elif rule.selector: text = process_selector(rule,html.text) elif rule.types == 'github': rule.selector = "div.commit-group-title" text = process_selector(rule,html.text) else: text = html.text if text is None: continue hash_list = dataConfig.hash_list() html_md5 = md5(text.encode('utf-8')) #text is unicode-encoded if debug: print 'html:',text[:20] print 'hash_list:',hash_list print 'html_md5',html_md5 if len(hash_list) > 0: if rule.corp in hash_list.keys(): if html_md5 == hash_list[rule.corp]: logger.info('%s no change'%rule.corp) else: #if the hash changed there is an update, send a mail notification logger.warning('%s has update'%rule.corp) dataConfig.update_hash(rule.corp,html_md5) context = '<a href={0}>{0}</a>'.format(rule.url) Notification(rule.message).notification(context) else: #if this corp has no hash yet, add it logger.info('adding new monitored app: %s'%rule.corp) dataConfig.add_hash(rule.corp,html_md5) else: #if the hash list is empty, initialize it first logger.info('wam init ....') dataConfig.add_hash(rule.corp,html_md5)
def search(keyword): """ Google search for a keyword. """ D = Downloader() url = "https://www.google.com/search?q=" + ul.quote_plus(keyword) html = D(url) tree = lxml.html.fromstring(html) links = [] for result in tree.cssselect("h3.r a"): link = result.get("href") qs = ulp.urlparse(link).query links.extend(ulp.parse_qs(qs).get("q", [])) return links
def main(self): n = 0 #initialization: first run of the function dw = Downloader(self.url) dw.GetHtml() parser = Parser(dw.GetHtml()) self.list = parser.ReturnUrl() urlMa = UrlManager(parser.ReturnUrl()) while n < 100: #the crawl loop starts here crawingurl = urlMa.print_2()[0] print(crawingurl) dw = Downloader(crawingurl) dw.GetHtml() parser = Parser(dw.GetHtml()) print(parser.ReturnTitle()) urlMa = UrlManager(parser.ReturnUrl()) n = n + 1
def init_connection(self): try: self.vk_session = vk_api.VkApi(login=os.getenv("LOGIN"), password=os.getenv("PASSW")) try: self.vk_session.auth(token_only=True) except vk_api.AuthError as e: print(e) sys.exit(0) except vk_api.exceptions.Captcha as e: print("CAPTCHA") print(e.get_url()) code = input() e.try_again(key=code) print("ID:", os.getpid()) print("Got VK API Session") self.group_session = vk_api.VkApi(token=os.getenv("KEY")) print("Got Group Session") self.longpoll = VkBotLongPoll(self.group_session, os.getenv("GROUP_ID")) print("Got Longpoll Object") self.api = self.vk_session.get_api() print("Got API Object") self.group_api = self.group_session.get_api() print("Got Group API Object") self.upload = vk_api.VkUpload(self.vk_session) print("Got Upload Object") self.loader = Downloader() print("Got Downloader Object") except (requests.exceptions.ConnectionError) as e: print("Reinitializing session data") print(e) print("Timeout:", self.timeout) time.sleep(self.timeout) self.timeout += 1 self.init_connection()
def down_info_by_id(one_id=None): if not one_id: return None data = {} down = Downloader(headers=headers_home) id = one_id['SHOP_ID'] sql = 'update crawler.mt_meishi set LABEL_IS_CCRAWLED = 2 where SHOP_ID = ' + id db.update_data(sql) url = HOMEURL + id + '/' uuid, data['TELEPHONE'], data['BUSINESS_TIME'] = get_uuid_phone_openTime_wifi(url, down) if uuid: data['REVIEW_COUNT'], data['NETIZEN_EVALUTION'] = get_review(uuid, id, url, down) if data['NETIZEN_EVALUTION'] == None: return limit = ''' ''' for key, value in data.items(): if data[key] != None: if type(data[key]) == int: limit = limit + str(key) + "=" + str(data[key]) + "," else: limit = limit + str(key) + "=" + "'" + data[key] + "'" + "," limit = limit[:-1] sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id db.update_data(sql) else: print('uuid is None') return limit = '' sql = '' now_time = datetime.now() now_time = str(now_time) now_time = now_time.split('.')[0] data['UPDATE_TIME'] = now_time data['LABEL_IS_CCRAWLED'] = 1 try: for key, value in data.items(): if data[key] != None: if type(data[key]) == int: limit = limit + str(key) + "=" + str(data[key]) + "," else: limit = limit + str(key) + "=" + "'" + data[key] + "'" + "," limit = limit[:-1] sql = 'update crawler.mt_meishi set ' + limit + ' where SHOP_ID = ' + id db.update_data(sql) except Exception as e: print(e) pass
def install_cmake(package, **kwargs): if kwargs.get('check_installed', True) and installed('cmake'): return dir, version, minor = re.match(r'(cmake-(\d+\.\d+)\.(\d+).*-[^\.]+)\..*', package).groups() # extractall overwrites existing files, so no need to prepare the # destination. url = 'https://cmake.org/files/v{0}/{1}'.format(version, package) install_dir = kwargs.get('install_dir', opt_dir) with Downloader(kwargs.get('download_dir', '.')).download(url) as f: iszip = package.endswith('zip') with zipfile.ZipFile(f) if iszip \ else closing(tarfile.open(f, 'r:gz')) as archive: archive.extractall(install_dir) dir = os.path.join(install_dir, dir) if platform.system() == 'Darwin': dir = glob.glob(os.path.join(dir, 'CMake*.app', 'Contents'))[0] cmake_path = os.path.join(dir, 'bin', 'cmake') if install_dir != '.': add_to_path(cmake_path) return cmake_path
def main(): args = [i.lower() for i in sys.argv] if 'help' in args or len(args) == 1: print_help() if 'download' in args: down = Downloader() down.download() down.preprocess() down.write_out(train="train.dat",test="test.dat") if 'tag' in args: t = Tagger() t.tag("test.dat") t.write_out("test_tagged.dat") if 'train' in args: m = Model() m.train("train.dat") m.write_out() if 'test' in args: m = Model("model.mdl") m.test("test_tagged.dat")
class Spider(): def __init__(self): self.today = time.strftime("%Y-%m-%d",time.localtime(time.time())) self.urllogpath = "../data/url" os.system("mkdir -p %s" % self.urllogpath) self.urllog = "../data/url/" + "downloadedurl_" + self.today + ".txt" self.subpagepath = "../data/subpagepath" os.system("mkdir -p %s" % self.subpagepath) self.baseurl = BASEURLS self.suburl = {} self.downloader = Downloader() self.html2db = Html2db() def get_safe_utf8(self,s): if isinstance(s,str): return s else: return s.encode('utf-8','ignore') def detect_html(self,html): if not html:return None try: return html.decode('utf-8') except: return html.decode('gbk','ignore') def normal_url(self,url): u = urlparse(url) if u.fragment: return url[:-(len(u.fragment) + 1)] return url def link_parse(self,html,base): if not html or not base: return False soup = BeautifulSoup(html) for li in soup.findAll('li'): try: li.contents[0].contents[0] except: continue title = li.contents[0].contents[0] #title = self.get_safe_utf8(title) href = li.contents[0]["href"] time = li.contents[1].strip() time = time.replace(u')',"") time = time.replace(u'(',"") #title = self.cleanHtmlTag(self.get_safe_utf8(title)) if not href:continue if href in self.suburl.keys():continue href = self.normal_url(self.get_safe_utf8(urljoin(base, self.get_safe_utf8(href)))) #self.suburl[href] = (title,time) if time == self.today: self.suburl[href] = (title,time) #print title #print href #print time return True def cleanHtmlAgain(self,value): regex1 = "<[\s\S]*?>" value = re.subn(regex1,"",value,re.M) return value[0] def cleanHtmlTag(self,html): html = html.strip() html = html.strip("\n") result = [] parser = HTMLParser() parser.handle_data = result.append parser.feed(html) parser.close() res = ''.join(result) res = self.cleanHtmlAgain(res) return res def getSubUrl(self,baseurl): tmp = "" maxturnpage = 5 regex = "\/[a-zA-Z0-9]+_[a-zA-Z0-9]+\.htm$" for i in range(1,maxturnpage): if(re.search(regex,baseurl)): regextmp = "\.htm$" tmp = re.sub(regextmp,"_" + str(i) + ".htm",baseurl) else: regexdel = "_\d?\.htm$" urltmp = re.sub(regexdel,"_" + str(i) + ".htm",baseurl) baseurl = urltmp html, redirect, code = self.downloader.fetch(self.get_safe_utf8(baseurl)) if code == 200: html = self.detect_html(html) self.link_parse(html,redirect) print 'baseurl down succeed : %s' % baseurl baseurl = tmp return True def deleteDownloadedUrl(self): print "There are %s urls to download!" % len(self.suburl.keys()) if os.path.isfile(self.urllog): logfile = open(self.urllog) if logfile: for line in logfile.readlines(): line = line.strip() if line in self.suburl.keys(): del self.suburl[line] else: print("Could not open the logfile : %s" % self.urllog) else: print("the logfile : " + self.urllog + " does not exist this time!") print "There are %s urls that REALLY need to be downloaded!" % len(self.suburl.keys()) def downloadPages(self,enChannel,chChannel): enChannelpath = self.subpagepath + "/" + enChannel os.system("mkdir -p %s" % enChannelpath) num = 0 for suburl in self.suburl.keys(): title = self.suburl[suburl][0] pubtime = self.suburl[suburl][1] html, redirect, code = self.downloader.fetch(self.get_safe_utf8(suburl)) if code == 200: print "suburl download succeed : %s" % suburl html = self.detect_html(html) subpagefile = enChannelpath + "/content_" + self.today +"_" + str(num) + ".html" num = num + 1 try: fileout = open(subpagefile,"w") fileout.write(self.get_safe_utf8(html) + "\n") fileout.close() except IOError, e: sys.stderr.write("could not open the subpagefile : %s" % subpagefile) soup = BeautifulSoup(html) for div in soup.findAll("div",id="Zoom"): content = self.cleanHtmlTag(str(div)) inserttime = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())) try: title = self.get_safe_utf8(title) except: title = title content = self.get_safe_utf8(content) html = self.get_safe_utf8(html) chChannel = self.get_safe_utf8(chChannel) suburl = self.get_safe_utf8(suburl) self.html2db.datainsert(title,content,html,chChannel,suburl,pubtime,inserttime) print title print suburl print pubtime #print content #print html print chChannel print inserttime print "################################################################################"
import os from download import Downloader from uncompress import Uncompresser from makergb import MakeRGB from makepreview import MakePreview if __name__ == '__main__': with open("creds.txt","r") as f: lines = f.readlines() username = lines[0].strip() password = lines[1].strip() # create tool instances dler = Downloader(username=username,password=password,DEBUG=True) uncomp = Uncompresser(DEBUG=True) rgb = MakeRGB(DEBUG=True) prev = MakePreview(DEBUG=True) # create list of known images #LC80130312013273LGN00 prefix = 'LC8013031' #postfix = 'LGN01' images = [ '2013273LGN00', # Sept 29th, 2013 ] print "Processing {0} image archives ...".format(len(images)) for image in images:
def test_download_to_temp_dir(self): d = Downloader() with util.CaptureStdout(): with d.download('file://' + __file__) as f: filename = f self.assertEqual(tempfile.gettempdir(), os.path.dirname(filename))
def __init__(self, target): self.downloader = Downloader() self.projects = [] self.projects.append(Project( "pugixml", "PUGIXML", None, False)) self.projects.append(libgit2Project(target)) self.projects.append(Project( "Ishiko/Errors", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoErrors.sln", False)) self.projects.append(Project( "Ishiko/Collections", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoCollections.sln", False)) self.projects.append(Project( "Ishiko/Process", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoProcess.sln", False)) self.projects.append(Project( "DiplodocusDB/Core", "DIPLODOCUSDB", "Makefiles/$(compiler_short_name)/DiplodocusDBCore.sln", False)) self.projects.append(Project( "DiplodocusDB/TreeDB/Core", "DIPLODOCUSDB", "Makefiles/$(compiler_short_name)/DiplodocusTreeDBCore.sln", False)) self.projects.append(Project( "DiplodocusDB/TreeDB/XMLTreeDB", "DIPLODOCUSDB", "Makefiles/$(compiler_short_name)/DiplodocusXMLTreeDB.sln", False)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Core", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyCore.sln", False)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Make", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyMake.sln", False)) self.projects.append(Project( "Ishiko/TestFramework/Core", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoTestFrameworkCore.sln", True)) self.projects.append(Project( "Ishiko/WindowsRegistry", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoWindowsRegistry.sln", True)) self.projects.append(Project( "Ishiko/FileTypes", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoFileTypes.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UICore", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUICore.sln", True)) self.projects.append(wxWidgetsProject()) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UIElements", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUIElements.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UIImplementation", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUIImplementation.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UI", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithy.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Tests/Core", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyCoreTests.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Tests/Make", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyMakeTests.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Tests/UICore", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUICoreTests.sln", True)) self.tests = [] self.tests.append(Test("CodeSmithyIDE/CodeSmithy/Tests/Core", "CodeSmithyCoreTests.exe")) self._init_downloader()
class Projects: def __init__(self, target): self.downloader = Downloader() self.projects = [] self.projects.append(Project( "pugixml", "PUGIXML", None, False)) self.projects.append(libgit2Project(target)) self.projects.append(Project( "Ishiko/Errors", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoErrors.sln", False)) self.projects.append(Project( "Ishiko/Collections", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoCollections.sln", False)) self.projects.append(Project( "Ishiko/Process", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoProcess.sln", False)) self.projects.append(Project( "DiplodocusDB/Core", "DIPLODOCUSDB", "Makefiles/$(compiler_short_name)/DiplodocusDBCore.sln", False)) self.projects.append(Project( "DiplodocusDB/TreeDB/Core", "DIPLODOCUSDB", "Makefiles/$(compiler_short_name)/DiplodocusTreeDBCore.sln", False)) self.projects.append(Project( "DiplodocusDB/TreeDB/XMLTreeDB", "DIPLODOCUSDB", "Makefiles/$(compiler_short_name)/DiplodocusXMLTreeDB.sln", False)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Core", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyCore.sln", False)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Make", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyMake.sln", False)) self.projects.append(Project( "Ishiko/TestFramework/Core", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoTestFrameworkCore.sln", True)) self.projects.append(Project( "Ishiko/WindowsRegistry", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoWindowsRegistry.sln", True)) self.projects.append(Project( "Ishiko/FileTypes", "ISHIKO", "Makefiles/$(compiler_short_name)/IshikoFileTypes.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UICore", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUICore.sln", True)) self.projects.append(wxWidgetsProject()) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UIElements", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUIElements.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UIImplementation", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUIImplementation.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/UI", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithy.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Tests/Core", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyCoreTests.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Tests/Make", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyMakeTests.sln", True)) self.projects.append(Project( "CodeSmithyIDE/CodeSmithy/Tests/UICore", "CODESMITHY", "Makefiles/$(compiler_short_name)/CodeSmithyUICoreTests.sln", True)) self.tests = [] self.tests.append(Test("CodeSmithyIDE/CodeSmithy/Tests/Core", "CodeSmithyCoreTests.exe")) self._init_downloader() def get(self, name): for project in self.projects: if project.name == name: return project return None def set_environment_variables(self, output): print("") output.print_step_title("Setting environment variables") env = {} for project in self.projects: value = os.getcwd() + "/Build/" + project.name.split("/")[0] if project.env_var in env: old_value = env[project.env_var] if (old_value != value): exception_text = "Conflicting values for " + \ "environment variable " + project.env_var + " (" + \ value + " vs " + old_value + ")" raise RuntimeError(exception_text) else: env[project.env_var] = value for var_name in env: print(" " + var_name + ": " + env[var_name]) 
os.environ[var_name] = env[var_name] output.next_step() def download(self): self.downloader.download() def build(self, build_tools, build_configuration, input, state, output): # For now only bypass pugixml, libgit2 and wxWidgets because they # are independent from the rest. More complex logic is required to # handle the other projects. # Unless we have built all projects successfully. for project in self.projects: if state.build_complete: project.built = True elif project.name in ["libgit2", "pugixml", "wxWidgets"]: if project.name in state.built_projects: project.built = True for project in self.projects: print("") output.print_step_title("Building " + project.name) if project.built: print(" Using previous execution") else: project.unzip(self.downloader) project.build(build_tools, build_configuration, input, output) state.set_built_project(project.name) output.next_step() state.set_build_complete() def test(self, compiler, architecture_dir_name, input): for test in self.tests: # TODO executable_path = "Build/" + test.project_name + \ "/Makefiles/VC15/x64/Debug/" + test.executable try: subprocess.check_call([executable_path]) except subprocess.CalledProcessError: launchIDE = input.query(" Tests failed. Do you want to" " launch the IDE?", ["y", "n"], "n") if launchIDE == "y": self.get(test.project_name).launch(compiler, architecture_dir_name) raise RuntimeError(test.project_name + " tests failed.") def _init_downloader(self): for project in self.projects: project_downloader = project.create_downloader() self.downloader.merge(project_downloader)
def test_download(self): d = Downloader() with util.CaptureStdout(): with d.download('file://' + __file__) as f: self.assertEqual(readfile(__file__), readfile(f))
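# Taken together, the Downloader tests above imply this usage: Downloader() saves into the system
# temp directory, Downloader(dir) saves into dir, download(url) is a context manager that yields
# the local filename, and the file is removed if an exception escapes the with-block. A usage
# sketch based only on that inferred behaviour (the Downloader import path is not shown in the
# snippets).
d = Downloader('/tmp/downloads')            # or Downloader() for a temp-directory download
with d.download('file:///etc/hostname') as filename:
    print(filename)                         # local path of the downloaded copy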
class Spider(threading.Thread): def __init__(self, master=True): threading.Thread.__init__(self) self.pagestore = PageStore() self.downloader = Downloader() self.connection = Connection(MONGODB_HOST, MONGODB_PORT) db = self.connection.download if master: db.drop_collection('downurl') for f, tb in ((SAVE_URL_RE_BLACK, 'save_url_black'), (SAVE_URL_RE_WHITE, 'save_url_white'), (DOWN_URL_RE_BLACK, 'down_url_black'), (DOWN_URL_RE_WHITE, 'down_url_white')): if os.path.exists(f): db.drop_collection(tb) logger.info('load rule:%s...' % f) for s in set(open(f).readlines()): s = s.strip() if s: db[tb].insert({'pattern': s}) logger.info('load rule:%s...OK' % f) self.downurl, self.allurl, self.watchurl, self.updateurl, self.secceedurl = db.downurl, db.allurl, db.watchurl, db.updateurl, db.secceedurl self.save_url_black = self.load_re(db.save_url_black) self.save_url_white = self.load_re(db.save_url_white) self.down_url_black = self.load_re(db.down_url_black) self.down_url_white = self.load_re(db.down_url_white) if master: self.load_watch_url() self.load_update_url() self.reload_allurl() logger.info('allurl:%d' % self.allurl.find().count()) logger.info('secceedurl:%d' % self.secceedurl.find().count()) logger.info('updateurl:%d' % self.updateurl.find().count()) logger.info('watchurl:%d' % self.watchurl.find().count()) logger.info('downurl:%d' % self.downurl.find().count()) def load_re(self, tb): s = set([r['pattern'] for r in tb.find()]) return [re.compile(r) for r in s] def get_safe_utf8(self, s): if isinstance(s, str): return s else: return s.encode('utf-8', 'ignore') def getmd5(self, s): m = md5.new() m.update(self.get_safe_utf8(s)) return m.hexdigest() def get_one_task(self, tb): row = tb.find_and_modify(remove=True) if not row:return None row = self.allurl.find_one(row) return row['url'] if row else None def add_one_task(self, url, tb): s = url.lower() if s.startswith('http://') or s.startswith('https://'): k = self.getmd5(s) self.allurl.insert({'url': url, '_id':k}) tb.insert({'_id': k}) def load_watch_url(self): if not os.path.exists(WATCH_URL_FILE): return logger.info('load watch urls...') with open(WATCH_URL_FILE) as f: while True: url = f.readline() if not url:break self.add_one_task(url.strip(), self.watchurl) logger.info('load watch urls...%d' % self.watchurl.count()) def normal_url(self, url): u = urlparse(url) if u.fragment: return url[:-(len(u.fragment) + 1)] return url def load_update_url(self): if not os.path.exists(UPDATE_URL_FILE): return logger.info('load update urls...') with open(UPDATE_URL_FILE) as f: while True: url = f.readline() if not url:break self.add_one_task(url.strip(), self.updateurl) logger.info('load update urls...%d' % self.updateurl.count()) def check_url(self, url, black, white): for p in black: if p.search(url): return False if not white: return True for p in white: if p.search(url): return True return False def check_add_new_task(self, url): s = url.lower() #error url if not s.startswith('http://') and not s.startswith('https://'): return False #don't save url if not self.check_url(url, self.save_url_black, self.save_url_white): return False k = self.getmd5(s) #already saved if self.allurl.find({'_id':k}).count(): return False self.allurl.insert({'url': url, '_id':k}) #don't download if not self.check_url(url, self.down_url_black, self.down_url_white): return False #already downloaded successfully if self.secceedurl.find({'_id':k}).count(): return False self.downurl.insert({'_id': k}) return True def reload_allurl(self): logger.info('reload all url...') for row in self.allurl.find(): k, url = row['_id'], row['url'] if not self.check_url(url, self.down_url_black, self.down_url_white): continue if self.secceedurl.find({'_id':k}).count(): continue self.downurl.insert({'_id':k}) logger.info('reload all url...%d ' % self.downurl.find().count()) def detect_html(self, html): if not html:return None try: return html.decode('utf-8') except: return html.decode('gbk', 'ignore') def process_url(self, url): html, redirect, code = self.downloader.fetch(self.get_safe_utf8(url)) if code == 200: html = self.detect_html(html) for href in self.link_parse(html, redirect): try: self.check_add_new_task(href) except Exception as e: logger.exception('%s,%s:%s' % (type(href), href, e.message)) for k in set([self.getmd5(url.lower()), self.getmd5(redirect.lower())]): self.secceedurl.insert({'_id': k}) if html: self.pagestore.succeed(url, html) return True return False def link_parse(self, html, base): urls = set() if not html or not base:return urls soup = BeautifulSoup(html) for a in soup.findAll('a'): href = a.get('href') if not href:continue if href in urls:continue href = self.normal_url(self.get_safe_utf8(urljoin(base, self.get_safe_utf8(href)))) urls.add(href) return urls def get_url_block(self): while True: for tb in (self.watchurl, self.downurl, self.updateurl): url = self.get_one_task(tb) if url:return url logger.info('no any task') time.sleep(1) def proce_one_url(self): url = self.get_url_block() logger.info('down:%s' % url) ret = False try: ret = self.process_url(url) except Exception as e: logger.exception('url:%s %s' % (url, e.message)) if not ret: self.pagestore.failed(url) def run(self): while True: try: while True: self.proce_one_url() except Exception,e: logger.exception(e.message) time.sleep(1)