def main(url, data, obj):
    """Main entry point. It performs three steps:
    1. Fetch every link on the page at the given url.
    2. Append the given query parameters to each link.
    3. Write the rewritten page content to a file.
    """
    print('====BEGIN======')
    try:
        fetcher = Fetch(url=url, from_encoding='GBK')  # build a Fetch instance
        # get_content() returns a BeautifulSoup tree; renderContents()
        # serializes it back to markup so string replacement can run on it
        content = fetcher.get_content().renderContents()
        for link in fetcher.get_all_link():
            # keep only the parameters the link does not already carry;
            # filtering into a new list avoids mutating it while iterating
            params = [p for p in data.split(',') if p not in link]
            newlink = '%s?%s' % (link, '&'.join(params))  # rewrite the link
            content = content.replace('href="%s"' % link, 'href="%s"' % newlink)
        with open(obj, 'w') as obj_file:
            obj_file.write(content)
        print('====OVER=======')
    except Exception as e:
        print('an exception occurred: %s' % str(e))
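# A minimal usage sketch for main(), assuming the Fetch class above is
# importable; the url, parameter string, and output path are hypothetical
# example values, not taken from the original code.
if __name__ == '__main__':
    main(url='http://example.com/index.html',
         data='utm_source=feed,utm_medium=rss',
         obj='rewritten.html')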
def fidelityParse(url="", queue=None, result_queue=None, uniq_urls=None):
    """
    :param url: page url to parse
    :param queue: queue of pending Requests
    :param result_queue: queue for the parsed records
    :param uniq_urls: dict used to deduplicate already-queued urls
    """
    htmldoc = Fetch(url)
    soup = BeautifulSoup(htmldoc, "html.parser")
    # collect the links for the other dates, plus the following week
    date_list_links = soup.find(
        "div", class_="date-list-links").find("ul").findAll("li")
    for links in date_list_links:
        if links.attrs.get("class", [0])[0] in ["firstitem", "selected"]:
            continue
        link = links.find("a").attrs.get("href")
        complete_link = parse.urljoin(settings.Fidelity_Domain, link)
        # _puase_date (defined elsewhere) presumably filters links by date
        if _puase_date(complete_link):
            if uniq_urls is not None and not uniq_urls.get(complete_link):
                uniq_urls[complete_link] = 1
                print(len(uniq_urls))
                queue.put(Request(url=complete_link, ParserFunc=fidelityParse))
    trs = soup.find("table", class_="datatable-component").find("tbody").find_all("tr")
    for tr in trs:
        try:
            tds = tr.findAll("td")
            th = tr.find("th")
            company = ""
            website = ""
            if th:
                company = th.find("strong").text
                website_elem = th.find("a", href="#")  # may be absent
                if website_elem:
                    website = website_elem.attrs.get(
                        "onclick").split(",'")[1].strip("')")
            symbol = tds[0].text.replace("\n", "").replace("\t", "")
            if len(tds) < 6:
                continue
            dividend = tds[1].text
            announcement_date = cov_date_format(tds[2].text, source_format, target_format)
            record_date = cov_date_format(tds[3].text, source_format, target_format)
            ex_date = cov_date_format(tds[4].text, source_format, target_format)
            pay_date = cov_date_format(tds[5].text, source_format, target_format)
            result_queue.put(
                FidelityModel(company, website, symbol, dividend,
                              announcement_date, record_date, ex_date, pay_date))
        except Exception:
            traceback.print_exc()
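# fidelityParse() calls cov_date_format(text, source_format, target_format),
# which is not defined in this snippet. A minimal sketch of what such a
# helper might look like, assuming strptime/strftime-style format strings
# (e.g. source_format = '%m/%d/%Y', target_format = '%Y-%m-%d'); this is an
# assumption, not the original implementation.
from datetime import datetime

def cov_date_format(text, source_format, target_format):
    # parse the scraped date string and re-emit it in the target layout
    return datetime.strptime(text.strip(), source_format).strftime(target_format)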
def do_work(argv):
    url = argv.get('url')
    workmanager = argv.get('workmanager')
    max_size = argv.get('max_size')
    fetcher = Fetch(url)
    # record every new resource on the page, up to max_size entries
    for resource in fetcher.get_all_resource():
        if len(resourcelist) > max_size:
            break
        if resource not in resourcelist:
            resourcelist.append(resource)
            logger.get_logger.info(resource)
    # queue every unseen link as a new crawl job
    for href in fetcher.get_all_link():
        if len(resourcelist) > max_size:
            break
        if href not in urllist:
            urllist.append(href)
            workmanager.add_job(do_work, workmanager=workmanager,
                                url=href, max_size=max_size)
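# do_work() reads resourcelist and urllist without defining them, so they
# are presumably module-level globals shared by all workers. A minimal
# sketch of how the crawl might be kicked off under that assumption; the
# WorkManager name and the seed values are hypothetical, apart from
# add_job(), which the function above already uses.
resourcelist = []
urllist = []

workmanager = WorkManager()
workmanager.add_job(do_work, workmanager=workmanager,
                    url='http://example.com', max_size=100)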
def select_file(self):
    file_type = [('All Excel Files', '*.xlsx'), ('All files', '*')]
    dialog = tkFileDialog.Open(self, filetypes=file_type)
    file1 = dialog.show()
    occ_list = []
    if file1 != '':
        f = Fetch(file1, 'Sheet1')
        f.get_data()
        f.create_jobs()
        self.occupation_dict = f.convert_data_to_dict()
        for k in self.occupation_dict.keys():
            occ_list.append(k)
        self.my_cb['cb']['values'] = occ_list
        self.my_cb['cb'].current(0)
        self.my_cb['cb'].config(state='readonly')
def test(self):
    f = Fetch()
    self.assertEqual(f.value(), 1)
import matplotlib.pyplot as plt
from fetcher import Fetch
import sys

f = Fetch('bls_salary_test.xlsx', 'Sheet1')
f.get_data()
f.create_jobs()
f.format_title_salary()
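# The script imports matplotlib.pyplot but stops after format_title_salary();
# a plausible continuation under the assumption that the formatted job titles
# and salaries can be read back from the Fetch instance. The title_salary
# attribute below is hypothetical, not part of the snippet above.
titles, salaries = zip(*f.title_salary)
plt.barh(titles, salaries)
plt.xlabel('Median salary (USD)')
plt.tight_layout()
plt.show()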