def download_data():
    url = 'http://ufldl.stanford.edu/housenumbers/'
    # Download datasets
    train_filename = download(url, 'train.tar.gz')
    test_filename = download(url, 'test.tar.gz')
    extra_filename = download(url, 'extra.tar.gz')
    return train_filename, test_filename, extra_filename
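The examples on this page all call a download helper that is defined elsewhere. For the SVHN snippet above it is invoked as download(url, filename); the following is only a minimal sketch of what such a helper might look like, assuming it simply fetches url + filename into the working directory with urllib (the signature is inferred from the call sites, not taken from the original repository):

# Hypothetical download(url, filename) helper matching the calls above; it skips
# files that already exist and returns the local filename.
import os
from urllib.request import urlretrieve

def download(url, filename, force=False):
    if force or not os.path.exists(filename):
        urlretrieve(url + filename, filename)
    return filename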
Example #2
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
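Here download(url) is assumed to return the page body as text so that the <loc> entries can be extracted with a regular expression. A rough standard-library sketch of such a helper (any retry or user-agent handling in the original is not shown, so this is only an assumption):

# Hypothetical download(url) that returns the decoded response body, as assumed by
# crawl_sitemap above and by the BeautifulSoup-based examples further down.
from urllib.request import urlopen
from urllib.error import URLError

def download(url):
    try:
        with urlopen(url) as response:
            return response.read().decode('utf-8', errors='replace')
    except URLError as e:
        print('Download error:', e)
        return None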
Example #3
    def __call__(self, tpl, raw_data, json_stuff):
        """Process whole post into directory"""
        keys = []
        funcs = []
        self.urls = []
        self.prefix = tpl[0]
        self.number = tpl[1]
        ignore = [
            'id',
            'to_id',
            'from_id',
            'date',
            'likes',
            'reposts',
            'signer_id',
            'copy_owner_id',
            'copy_post_id',
            'copy_post_date',
            'copy_post_type',
            'reply_count',
            'post_type',
            'post_source',
            'online',
            'attachment',
            'copy_text',
            'media',
            'can_edit',
            # comments fix
            'uid',
            'cid',
            'reply_to_cid',
            'reply_to_uid',
            'reply_owner_id',
            'reply_post_id',
        ]
        for k in raw_data.keys():
            if k in ignore:
                continue
            try:
                f = getattr(self, k)
                keys.append(k)
                funcs.append(f)
            except AttributeError:
                logging.warning("Not implemented: {}".format(k))
        logging.info("Saving: {} for {}".format(', '.join(keys),
                                                raw_data['id']))
        self.post_directory = make_dir(self.directory, str(raw_data['id']))

        self.save_raw(json_stuff)
        for (f, k) in zip(funcs, keys):
            f(k, raw_data)

        if self.urls and not self.args.no_download:
            download(
                self.urls,
                self.post_directory,
            )
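The interesting part of this __call__ is the getattr dispatch: every field of raw_data that is not in ignore is routed to a method with the same name, and fields without a handler are merely logged. A stripped-down illustration of that pattern (the handler names here are made up):

import logging

class PostSaver:
    # one method per field we know how to save (hypothetical handlers)
    def text(self, key, raw_data):
        print('saving text:', raw_data[key])

    def views(self, key, raw_data):
        print('saving views:', raw_data[key])

    def process(self, raw_data):
        for key in raw_data:
            try:
                handler = getattr(self, key)  # look up a method named after the field
            except AttributeError:
                logging.warning('Not implemented: %s', key)
                continue
            handler(key, raw_data)

PostSaver().process({'text': 'hello', 'views': 3, 'geo': None})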
Example #4
    def test_setlogin(self):
        self.login = login()
        self.login.login()
        self.a = self.login.login1()
        sleep(2)
        if self.a == "第一次登陆":  # "first login"
            self.project = project()
            self.project.project()
        self.dow = download()
        self.dow.download()
        self.DropDown = DropDown()
        print(self)
        # def test_affiche(self):
        self.rpage = Returnpage()
        self.taffiche = Affiche()
        try:
            self.taffiche.affiche()
            self.title = self.taffiche.Unread()
        except BaseException as e:
            self.assertEqual(0, 1, "Announcement module: unread test failed")
        if self.title != '无未读公告':  # "no unread announcements"
            try:
                self.taffiche.Read(self.title)
            except BaseException:
                self.assertEqual(0, 1, "Announcement module: read test failed")
        else:
            print('No unread announcements')
        sleep(1)
        self.rpage.returnpage()
Example #5
def main():
    urls_queue = queue.Queue()  # queue of ts file urls
    # q_lock = threading.Lock()
    buffer_dict = {}
    thread_num = 0  # sequence number of the thread
    thread_max = 50
    wq = ''  # search keyword
    epi_dict = {}
    answer_titles = []
    answer_urls = []
    threads = []  # worker threads for the current episode
    signal = 1
    wq = input('Enter a search keyword: ')
    answer_titles, answer_urls = find_animation(wq, answer_titles, answer_urls)  # print the numbered title list
    # print(answer_urls)
    wantedani_seq = int(input('Enter the number of the anime you want to watch: '))
    ani_url = 'https://www.yhdm123.com' + answer_urls[wantedani_seq]
    print(ani_url)

    epi_max, epititles = find_episode(ani_url, epi_dict)
    print(answer_titles[wantedani_seq], 'has', epi_max, 'episodes in total')
    print(epititles)
    start_episode = int(input('Start downloading from which episode: '))
    end_episode = int(input('End at which episode: '))

    epi_seq = start_episode

    while epi_seq <= end_episode:
        episode_url = 'https://www.yhdm123.com' + epi_dict[epi_seq]
        print(episode_url)
        fseq_max = ts_request(urls_queue, episode_url)  # fill urls_queue with ts file urls
        print('Number of ts files:', fseq_max)

        create_thread(thread_num, thread_max, threads, urls_queue, buffer_dict, signal)
        activate_thread(threads)

        # download to local disk
        download(buffer_dict, signal, fseq_max, answer_titles[wantedani_seq], epi_seq)
        epi_seq += 1
        # print(threading.alive_)
    return
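The main() above relies on a queue.Queue shared between worker threads: create_thread/activate_thread (not shown) are expected to drain urls_queue into buffer_dict before download writes the buffered segments to disk. The following is only a minimal sketch of that producer/consumer pattern, not the original helpers:

import queue
import threading

def worker(urls_queue, buffer_dict):
    # drain the queue; each item stands in for one .ts segment url
    while True:
        try:
            seq, url = urls_queue.get_nowait()
        except queue.Empty:
            return
        buffer_dict[seq] = 'data-from-' + url  # placeholder for the fetched bytes
        urls_queue.task_done()

urls_queue = queue.Queue()
for seq in range(10):
    urls_queue.put((seq, 'http://example.com/%d.ts' % seq))

buffer_dict = {}
threads = [threading.Thread(target=worker, args=(urls_queue, buffer_dict)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(sorted(buffer_dict))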
Example #6
    def __call__(self, tpl, raw_data, json_stuff):
        """Process whole post into directory"""
        keys = []
        funcs = []
        self.urls = []
        self.prefix = tpl[0]
        self.number = tpl[1]
        ignore = ['id', 'to_id', 'from_id', 'date',
                  'likes', 'reposts', 'signer_id',
                  'copy_owner_id', 'copy_post_id', 'copy_post_date',
                  'copy_post_type', 'reply_count', 'post_type',
                  'post_source', 'online', 'attachment', 'copy_text',
                  'media', 'can_edit',
                  # comments fix
                  'uid', 'cid', 'reply_to_cid', 'reply_to_uid',
                  'reply_owner_id', 'reply_post_id',
                  ]
        for k in raw_data.keys():
            if k in ignore:
                continue
            try:
                f = getattr(self, k)
                keys.append(k)
                funcs.append(f)
            except AttributeError:
                logging.warning("Not implemented: {}".format(k))
        logging.info("Saving: {} for {}".format(', '.join(keys), raw_data['id']))
        self.post_directory = make_dir(self.directory, str(raw_data['id']))

        self.save_raw(json_stuff)
        for (f, k) in zip(funcs, keys):
            f(k, raw_data)

        if self.urls and not self.args.no_download:
            download(self.urls,
                     self.post_directory)
Example #7
with open('testarrayData.csv', 'a') as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(fieldnames)

nrows = len(test_array)

# Convert the data from a list to a CSV and perform the web scrape
for i in range(nrows):
    print(i)
    link = test_array[i]
    if str(link).startswith("http"):
        print("Getting Link")
        html = download(link)
        soup = BeautifulSoup(html, 'html.parser')
        print("Got link")
        Scrap_Preprocess.extraction(soup)
        print("extracted, goodnight")
        time.sleep(5)

data = pd.read_csv("testarrayData.csv")
y_pred1 = Pred_module.prediction(data)

Solic_links = []

for i in range(len(y_pred1)):
    if y_pred1[i] == 0:
Example #8
from Asset import AssetMenu
from WorkOrder import WorkOrderMenu
from Energy import EnergyMenu
from Maintain import MaintainMenu
from Visitor import VisitorMenu
from inspection import InspectionMenu
from Knowledge import KnowledgeMenu
from payment import paymentMenu
login = login()
login.login()
a = login.login1()
sleep(2)
if a == "第一次登陆":  # "first login"
    project = project()
    project.project()
dow = download()
dow.download()
DropDown = DropDown()
# Announcements
affiche = AfficheMenu()
# # Inspection
# inspection=PatrolMenu()
# # Service desk
# requirment=RequirmentMenu()
# # Work orders (not run)
# wordorder=WorkOrderMenu()
# # Planned maintenance
# maintain=MaintainMenu()
# # Assets
# asset=AssetMenu()
# # Energy management
Example #9
#SSLDOASSTATE
driver.get('https://ssl.doas.state.ga.us/PRSapp/PR_index.jsp')
time.sleep(5)
js = 'document.querySelectorAll(`input`)[2].click()'
driver.execute_script(js)
time.sleep(10)
page_link = driver.find_elements(By.TAG_NAME, 'a')
all_hrefs = []
for page in page_link:
    all_hrefs.append(page.get_attribute('href'))
driver.quit()

for i in range(5, len(all_hrefs)):
    href = all_hrefs[i]
    time.sleep(2)
    html = download(href)
    soup = BeautifulSoup(html, 'html.parser')
    print("\n\n****************************************\n\n")
    combined, phone_number_list, email_list, date_list = extraction_functions.extraction(
        soup)
    print("Date:", date_list)
    print("List_Phone_Number:", phone_number_list)
    print("Email:", email_list)
    combined_number = [
        com for com in combined if extraction_functions.only_alpha(com)
    ]
    combined_number = [com for com in combined_number if len(com) < 20]
    predict_matrix_number = extraction_functions.fill_matrix(combined_number)
    class_ids = ["0", "1"]
    Bid_Number_predictions = classifierLoad.predict(predict_matrix_number)
    probabilities = []
Example #10
import os
from Download import download
from DataCollecting import serialize_all_case_objs
from pca import serialize_training_result
'''
Initialise the files that are needed
'''

if not os.path.exists('data'):
    download()

# If DataCollecting is run as __main__, deserialising elsewhere raises AttributeError;
# you have to `from DataCollecting import Case` for it to work.
# Why the serialisation cannot live in DataCollecting.py:
# If you pickle in the module that defines the class, other modules cannot find the
# definition when unpickling and have to import the class.
# If you pickle in this module, the stream starts with "c__main__"; another module that
# unpickles it looks in its own __main__, reports the attribute as missing, and an import is needed.
# If you pickle in some other module, the stream starts with "c" plus the name of the module
# where the class lives, so the unpickler knows where to look.
# For remote transfer, where the remote side does not have this module, pickle in this module
# so the module name is __main__, and copy the class definition into the module that unpickles
# remotely (method names must match; method bodies may differ, since bodies are not serialised).
serialize_all_case_objs()
serialize_training_result()
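The comment block above is about where a class gets pickled: the pickle stream records the defining module's name, so objects pickled under __main__ can only be unpickled where a matching class is importable (or re-defined) as __main__.Case. A small self-contained demonstration of that behaviour (the Case class here is just a stand-in):

import pickle
import pickletools

class Case:  # stand-in for DataCollecting.Case
    pass

data = pickle.dumps(Case())
pickletools.dis(data)  # run as a script, the stream references '__main__' and 'Case';
                       # unpickling elsewhere fails unless that name can be resolved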
Example #11
from Download import download
import numpy as np

years = [2017]
months = [10]
days = list(range(4, 10))
channels = list(range(1, 17))

for year in years:
    for month in months:
        for day in days:
            for channel in channels:
                channel = '%s' % str(channel).zfill(2)
                path = "data_goesr/%s/%s/%s/ch_%s" % (year, month, day,
                                                      channel)
                print("Download data to ", path)
                download(year=year, month=month, day=day, ch=channel)