def download_data():
    url = 'http://ufldl.stanford.edu/housenumbers/'
    # Download datasets
    train_filename = download(url, 'train.tar.gz')
    test_filename = download(url, 'test.tar.gz')
    extra_filename = download(url, 'extra.tar.gz')
    return train_filename, test_filename, extra_filename
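# A minimal sketch (not part of the original source) of the download(base_url, filename)
# helper the snippet above assumes: fetch base_url + filename with urllib and skip the
# transfer when the archive already exists locally. Names and behaviour are assumptions.
import os
from urllib.request import urlretrieve

def download(base_url, filename, dest_dir='.'):
    """Hypothetical helper: download base_url + filename and return the local path."""
    dest_path = os.path.join(dest_dir, filename)
    if not os.path.exists(dest_path):
        urlretrieve(base_url + filename, dest_path)
    return dest_path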
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
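# Unlike the archive-fetching variant above, crawl_sitemap assumes a download(url)
# that returns the page content itself. A minimal sketch of such a helper (an
# assumption, not the original implementation):
from urllib.request import urlopen
from urllib.error import URLError

def download(url):
    """Hypothetical helper: return the body of url as text, or None on failure."""
    try:
        return urlopen(url).read().decode('utf-8')
    except URLError as e:
        print('Download error:', e.reason)
        return None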
def __call__(self, tpl, raw_data, json_stuff):
    """Process whole post into directory"""
    keys = []
    funcs = []
    self.urls = []
    self.prefix = tpl[0]
    self.number = tpl[1]
    ignore = [
        'id', 'to_id', 'from_id', 'date', 'likes', 'reposts',
        'signer_id', 'copy_owner_id', 'copy_post_id', 'copy_post_date',
        'copy_post_type', 'reply_count', 'post_type', 'post_source',
        'online', 'attachment', 'copy_text', 'media', 'can_edit',
        # comments fix
        'uid', 'cid', 'reply_to_cid', 'reply_to_uid',
        'reply_owner_id', 'reply_post_id',
    ]
    for k in raw_data.keys():
        if k in ignore:
            continue
        try:
            f = getattr(self, k)
            keys.append(k)
            funcs.append(f)
        except AttributeError:
            logging.warning("Not implemented: {}".format(k))
    logging.info("Saving: {} for {}".format(', '.join(keys), raw_data['id']))
    self.post_directory = make_dir(self.directory, str(raw_data['id']))
    self.save_raw(json_stuff)
    for (f, k) in zip(funcs, keys):
        f(k, raw_data)
    if self.urls and not self.args.no_download:
        download(
            self.urls,
            self.post_directory,
        )
def test_setlogin(self):
    self.login = login()
    self.login.login()
    self.a = self.login.login1()
    sleep(2)
    if self.a == "第一次登陆":  # "first login"
        self.project = project()
        self.project.project()
        self.dow = download()
        self.dow.download()
        self.DropDown = DropDown()
    print(self)

    # def test_affiche(self):
    self.rpage = Returnpage()
    self.taffiche = Affiche()
    try:
        self.taffiche.affiche()
        self.title = self.taffiche.Unread()
    except BaseException as e:
        # Announcement module: "unread" test failed
        self.assertEqual(0, 1, "公告模块,未读测试未通过")
    if self.title != '无未读公告':  # "no unread announcements"
        try:
            self.taffiche.Read(self.title)
        except BaseException:
            # Announcement module: "read" test failed
            self.assertEqual(0, 1, "公告模块,已读测试未通过")
    else:
        print('无未读公告')  # "no unread announcements"
    sleep(1)
    self.rpage.returnpage()
def main():
    urls_queue = queue.Queue()  # holds the ts file urls
    # q_lock = threading.Lock()
    buffer_dict = {}
    thread_num = 0  # sequence number of the thread
    thread_max = 50
    wq = ''  # search keyword
    epi_dict = {}
    answer_titles = []
    answer_urls = []
    signal = 1
    wq = input('请输入关键词')  # prompt: enter a search keyword
    # print the seq / ani_title list
    answer_titles, answer_urls = find_animation(wq, answer_titles, answer_urls)
    # print(answer_urls)
    wantedani_seq = int(input('请输入想看的动漫对应的序号:'))  # prompt: index of the series to watch
    ani_url = 'https://www.yhdm123.com' + answer_urls[wantedani_seq]
    print(ani_url)
    epi_max, epititles = find_episode(ani_url, epi_dict)
    print(answer_titles[wantedani_seq], '共有', epi_max, '集')  # "... has N episodes in total"
    print(epititles)
    start_episode = int(input('从哪一集开始下载:'))  # prompt: first episode to download
    end_episode = int(input('哪一集结束:'))  # prompt: last episode to download
    epi_seq = start_episode
    while epi_seq <= end_episode:
        episode_url = 'https://www.yhdm123.com' + epi_dict[epi_seq]
        print(episode_url)
        fseq_max = ts_request(urls_queue, episode_url)  # fill urls_queue with ts file urls
        print('文件数:', fseq_max)  # number of ts files
        create_thread(thread_num, thread_max, threads, urls_queue, buffer_dict, signal)
        activate_thread(threads)
        # download to local
        download(buffer_dict, signal, fseq_max, answer_titles[wantedani_seq], epi_seq)
        epi_seq += 1
    # print(threading.alive_)
    return
with open('testarrayData.csv', 'a') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(fieldnames)

nrows = len(test_array)
# CONVERT DATA FROM A LIST TO A CSV AND PERFORM WEB SCRAPING
for i in range(nrows):
    print(i)
    next = False
    if next:
        continue
    link = test_array[i]
    if str(link).startswith("http"):
        print("Getting Link")
        html = download(link)
        soup = BeautifulSoup(html, 'html.parser')
        print("Got link")
        Scrap_Preprocess.extraction(soup)
        print("extracted, goodnight")
        time.sleep(5)
    else:
        next = True

data = pd.read_csv("testarrayData.csv")
y_pred1 = Pred_module.prediction(data)
Solic_links = []
for i in range(len(y_pred1)):
    if y_pred1[i] == 0:
from Asset import AssetMenu
from WorkOrder import WorkOrderMenu
from Energy import EnergyMenu
from Maintain import MaintainMenu
from Visitor import VisitorMenu
from inspection import InspectionMenu
from Knowledge import KnowledgeMenu
from payment import paymentMenu

login = login()
login.login()
a = login.login1()
sleep(2)
if a == "第一次登陆":  # "first login"
    project = project()
    project.project()
    dow = download()
    dow.download()
    DropDown = DropDown()

# Announcements
affiche = AfficheMenu()
# # Inspection
# inspection = PatrolMenu()
# # Service desk
# requirment = RequirmentMenu()
# # Work orders (not exercised)
# wordorder = WorkOrderMenu()
# # Planned maintenance
# maintain = MaintainMenu()
# # Assets
# asset = AssetMenu()
# # Energy management
# SSLDOASSTATE
driver.get('https://ssl.doas.state.ga.us/PRSapp/PR_index.jsp')
time.sleep(5)
js = 'document.querySelectorAll(`input`)[2].click()'
driver.execute_script(js)
time.sleep(10)
page_link = driver.find_elements(By.TAG_NAME, 'a')
all_hrefs = []
for page in page_link:
    all_hrefs.append(page.get_attribute('href'))
driver.quit()

for i in range(5, len(all_hrefs)):
    href = all_hrefs[i]
    time.sleep(2)
    html = download(href)
    soup = BeautifulSoup(html, 'html.parser')
    print("\n\n****************************************\n\n")
    combined, phone_number_list, email_list, date_list = extraction_functions.extraction(soup)
    print("Date:", date_list)
    print("List_Phone_Number:", phone_number_list)
    print("Email:", email_list)
    combined_number = [com for com in combined if extraction_functions.only_alpha(com)]
    combined_number = [com for com in combined_number if len(com) < 20]
    predict_matrix_number = extraction_functions.fill_matrix(combined_number)
    class_ids = ["0", "1"]
    Bid_Number_predictions = classifierLoad.predict(predict_matrix_number)
    probabilities = []
import os

from Download import download
from DataCollecting import serialize_all_case_objs
from pca import serialize_training_result

'''
Initialize the files that are needed.
'''
if not os.path.exists('data'):
    download()

# If DataCollecting is run as __main__, deserializing elsewhere raises AttributeError;
# you must "from DataCollecting import Case" first.
# Why the serialization cannot live inside DataCollecting.py:
# - If you pickle in the module where the class is defined, other modules that unpickle
#   it will not find the definition unless they import that class.
# - If you pickle in this module, the pickled data starts with "c__main__"; other modules
#   that unpickle it then look for the class in their own __main__ and fail with a
#   missing-attribute error unless they import it.
# - If you pickle in another module, the pickled data starts with "c" + the name of the
#   module where the class is defined, so unpickling knows where to look.
# - But for remote transfer, when the remote side does not have that module, you should
#   pickle here so the module name is __main__, and copy the class definition into the
#   remote module that unpickles it (method names must match; method bodies may differ,
#   because method bodies are not pickled).
serialize_all_case_objs()
serialize_training_result()
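# A small illustration (not part of the original project) of the pickling behaviour
# described in the comments above: pickle records the defining module of a class,
# so the unpickling side must be able to import, or redefine, a compatible class.
# The Case class here is a hypothetical stand-in for the one in DataCollecting.
import pickle

class Case:
    def __init__(self, name):
        self.name = name

blob = pickle.dumps(Case('demo'))
# When this script runs as __main__, the pickle references "__main__.Case"; a different
# process can only load it if a compatible Case exists in its own __main__ (or is
# imported), because the pickle stores attributes, not method bodies.
restored = pickle.loads(blob)  # works here because Case is defined in this module
print(restored.name)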
from Download import download
import numpy as np

years = [2017]
months = [10]
days = list(range(4, 10))
channels = list(range(1, 17))

for year in years:
    for month in months:
        for day in days:
            for channel in channels:
                channel = str(channel).zfill(2)
                path = "data_goesr/%s/%s/%s/ch_%s" % (year, month, day, channel)
                print("Download data to ", path)
                download(year=year, month=month, day=day, ch=channel)