Example #1
0
def main(url, data, obj):
    '''
    Rewrite the links of a fetched page so they carry extra parameters.

    Steps performed:
        1. Fetch the page at ``url`` and collect all of its links.
        2. For every link, work out which of the parameters in ``data`` the
           link does not contain yet and append them.
        3. Write the page content with the rewritten links to ``obj``.

    :param url: address of the page to fetch.
    :param data: comma-separated parameter fragments (presumably
        ``name=value`` pairs -- they are joined with ``&``).
    :param obj: path of the output file; it is opened in ``'w'`` mode.

    Every exception is caught and reported on stdout; nothing is re-raised.
    '''
    print('====BEGIN======')
    try:
        fetcher = Fetch(url=url, from_encoding='GBK')  # build the Fetch instance

        # NOTE(review): assumes renderContents() returns the page markup as a
        # string that supports str.replace with the patterns below -- confirm.
        content = fetcher.get_content().renderContents()

        # The split does not depend on the link, so do it once.
        all_params = data.split(',')
        for link in fetcher.get_all_link():
            # Keep only the parameters the link does not already contain.
            # A new list is built instead of removing items from the list
            # being iterated, which used to skip the element after each hit.
            params = [param for param in all_params if param not in link]
            if not params:
                # Nothing to add: leave the link alone rather than emit "link?".
                continue
            # Extend an existing query string with '&' instead of a second '?'.
            separator = '&' if '?' in link else '?'
            newlink = '%s%s%s' % (link, separator, '&'.join(params))
            # Substitute the link inside its href attribute.
            content = content.replace('href="%s"' % link,
                                      'href="%s"' % newlink)
        # The context manager closes the file even when write() fails.
        with open(obj, 'w') as obj_file:
            obj_file.write(content)
        print('====OVER=======')
    except Exception as e:
        print('an exception occur:%s' % str(e))
Example #2
0
def main(url, data, obj):
    '''
    Fetch a page, add the given parameters to its links, and save the result.

    Steps performed:
        1. Collect every link found on the page at ``url``.
        2. Append to each link the parameters from ``data`` that it does not
           already contain.
        3. Write the page content with the rewritten links to the file ``obj``.

    :param url: address of the page to fetch.
    :param data: comma-separated parameter fragments; the ones that get
        appended are joined with ``&``.
    :param obj: path of the output file; it is opened in ``'w'`` mode.

    Every exception is caught and reported on stdout; nothing is re-raised.
    '''
    print('====BEGIN======')
    try:
        fetcher = Fetch(url=url, from_encoding='GBK')
        # NOTE(review): assumes renderContents() yields a string compatible
        # with the str.replace calls below -- confirm.
        content = fetcher.get_content().renderContents()
        candidates = data.split(',')  # loop-invariant, split once
        for link in fetcher.get_all_link():
            # Filter into a fresh list: calling remove() on the list being
            # iterated skipped the element following every removed one.
            missing = [param for param in candidates if param not in link]
            if not missing:
                # Every parameter is already present; do not emit "link?".
                continue
            # Links that already have a query string are extended with '&'.
            joiner = '&' if '?' in link else '?'
            newlink = '%s%s%s' % (link, joiner, '&'.join(missing))
            # Substitute the link inside its href attribute.
            content = content.replace('href="%s"' % link, 'href="%s"' % newlink)
        # Close the file even if the write raises.
        with open(obj, 'w') as obj_file:
            obj_file.write(content)
        print('====OVER=======')
    except Exception as e:
        print('an exception occur:%s' % str(e))
Example #3
0
def fidelityParse(url="", queue=None, result_queue=None, uniq_urls=None):
    """
    Parse one page: schedule the other date pages and emit the table rows.

    :param url: address of the page to fetch and parse.
    :param queue: request queue; a ``Request`` (with this function as its
        ``ParserFunc``) is put on it for each date link not seen before.
    :param result_queue: queue that receives one ``FidelityModel`` per
        table row that has at least six ``td`` cells.
    :param uniq_urls: mapping used as a registry of already scheduled links;
        when it is ``None`` no follow-up request is scheduled.

    Errors while handling a single table row are printed with
    ``traceback.print_exc()`` and the row is skipped. Errors while locating
    the date list or the table propagate to the caller.
    """
    htmldoc = Fetch(url)
    soup = BeautifulSoup(htmldoc, "html.parser")

    # Links to the other dates, including the next week.
    date_list_links = soup.find(
        "div", class_="date-list-links").find("ul").findAll("li")
    for links in date_list_links:
        # Skip the entries whose first CSS class marks them as the first or
        # the currently selected item.
        if links.attrs.get("class", [0])[0] in ["firstitem", "selected"]:
            continue
        link = links.find("a").attrs.get("href")
        complete_link = parse.urljoin(settings.Fidelity_Domain, link)
        # NOTE(review): the original carried commented-out mutex
        # acquire/release calls around this check-and-insert; confirm whether
        # uniq_urls is shared between threads.
        if _puase_date(complete_link):
            if uniq_urls is not None and not uniq_urls.get(complete_link):
                uniq_urls[complete_link] = 1
                print(len(uniq_urls))
                queue.put(Request(url=complete_link, ParserFunc=fidelityParse))

    trs = soup.find("table",
                    class_="datatable-component").find("tbody").find_all("tr")
    for tr in trs:
        try:
            tds = tr.findAll("td")
            # Check the cell count before indexing: short rows are skipped
            # quietly instead of raising IndexError on tds[0].
            if len(tds) < 6:
                continue
            th = tr.find("th")
            company = ""
            website = ""
            if th:
                company = th.find("strong").text
                website_elem = th.find("a", href="#")
                # The anchor is not always present.
                if website_elem:
                    # Reuse the element found above instead of searching
                    # again; the value is cut out of the onclick attribute.
                    website = website_elem.attrs.get(
                        "onclick").split(",'")[1].strip("')")
            symbol = tds[0].text.replace("\n", "").replace("\t", "")
            dividend = tds[1].text
            announcement_date = cov_date_format(tds[2].text, source_format,
                                                target_format)
            record_date = cov_date_format(tds[3].text, source_format,
                                          target_format)
            ex_date = cov_date_format(tds[4].text, source_format,
                                      target_format)
            pay_date = cov_date_format(tds[5].text, source_format,
                                       target_format)
            result_queue.put(
                FidelityModel(company, website, symbol, dividend,
                              announcement_date, record_date, ex_date,
                              pay_date))
        except Exception:
            # Best effort per row: report the problem and go on.
            traceback.print_exc()
Example #4
0
def do_work(argv):
    """
    Collect the resources and links of one page and schedule its links.

    ``argv`` is a mapping read with ``get`` for the keys ``'url'``,
    ``'workmanager'`` and ``'max_size'``.

    New resources are appended to the module-level ``resourcelist`` and
    logged; new links are appended to the module-level ``urllist`` and
    submitted to the work manager as further ``do_work`` jobs. Both loops
    stop once ``resourcelist`` holds more than ``max_size`` entries.
    """
    start_url = argv.get('url')
    manager = argv.get('workmanager')
    limit = argv.get('max_size')
    page = Fetch(start_url)

    for item in page.get_all_resource():
        if len(resourcelist) > limit:
            break
        if item in resourcelist:
            continue
        resourcelist.append(item)
        logger.get_logger.info(item)

    for target in page.get_all_link():
        # The stop condition looks at resourcelist here as well.
        if len(resourcelist) > limit:
            break
        if target in urllist:
            continue
        urllist.append(target)
        manager.add_job(do_work,
                        workmanager=manager,
                        url=target,
                        max_size=limit)
Example #5
0
	def select_file(self):
		"""
		Let the user pick a file and load its keys into the 'cb' widget.

		Shows an open-file dialog, runs the chosen file through ``Fetch``
		(sheet argument ``'Sheet1'``), stores the resulting mapping in
		``self.occupation_dict`` and offers its keys as the values of
		``self.my_cb['cb']`` (presumably a ttk Combobox -- confirm), which is
		then switched to read-only.

		Nothing is changed when the dialog is dismissed without a selection.
		"""
		file_type = [('All Excel Files', '*.xlsx'),('All files', '*')]
		dialog = tkFileDialog.Open(self, filetypes=file_type)
		file1 = dialog.show()

		# A cancelled dialog yields an empty result ('' or an empty tuple,
		# depending on the platform). Return before touching the widget: the
		# old code cleared its values and then called current(0) on an empty
		# list.
		if not file1:
			return

		f = Fetch(file1, 'Sheet1')
		f.get_data()
		f.create_jobs()
		self.occupation_dict = f.convert_data_to_dict()
		occ_list = list(self.occupation_dict.keys())

		combo = self.my_cb['cb']
		combo['values'] = occ_list
		# Selecting index 0 is only valid when there is at least one value.
		if occ_list:
			combo.current(0)
		combo.config(state='readonly')
Example #6
0
    def test(self):
        """A Fetch built without arguments reports 1 from value()."""
        self.assertEqual(Fetch().value(), 1)
Example #7
0
import matplotlib.pyplot as plt
from fetcher import Fetch
import sys

# Build a Fetch for the file 'bls_salary_test.xlsx'.
# NOTE(review): the second argument is presumably the worksheet name --
# confirm against the Fetch constructor.
f = Fetch('bls_salary_test.xlsx', 'Sheet1')

# Run the Fetch steps in this order: get_data, create_jobs, then
# format_title_salary.
# NOTE(review): presumably each step relies on state prepared by the previous
# one -- confirm before reordering.
f.get_data()
f.create_jobs()
f.format_title_salary()