Example #1
    def parse_category_type(self, response):
        """
        解析业态url,获取页码url
        :param response:
        :return:
        """
        city_name = response.meta.get('city_name')
        category_type = response.meta.get('category_type')
        business_district = response.meta.get('business_district')
        adname = response.meta.get('item')
        dic = {
            'item': adname,
            'business_district': business_district,
            'category_type': category_type,
            'city_name': city_name
        }
        try:
            all_page = PyQuery(
                response.body).find('.page').find('a').eq(-2).attr('href')
        except Exception:
            all_page = None
        if all_page:
            num = all_page.split('/')[-1].split('p')[-1]
            try:
                c_num, aid = num.split('?')
            except Exception:
                c_num = num
                aid = ""
            head, head1, mid, end = all_page.split('p')

            for c_page in range(1, int(c_num) + 1):
                if aid == '':
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page)
                else:
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page) + '?' + aid
                md5_url = self.md5(n_page)
                ret = spider_service.select_url(md5_url, self.md5_table)
                if not ret:
                    create_sleep()
                    spider_service.update_into(md5_url, self.md5_table)
                    yield scrapy.Request(url=n_page,
                                         meta=dic,
                                         callback=self.parse_page,
                                         headers=header2)

        else:
            time.sleep(3)
            md5_url = self.md5(response.url)
            spider_service.update_into(md5_url, self.md5_table)
            yield scrapy.Request(url=response.url,
                                 meta=dic,
                                 callback=self.parse_page,
                                 headers=header3)
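Splitting the whole href on 'p' (above) breaks as soon as the host or path contains another 'p'. A standalone sketch of the same last-page-to-all-pages expansion, driven by a regex on the trailing page token (the URL shape here is an assumption, not taken from the crawled site):

import re

last_page = 'http://example.com/search/c10/g110p12?aid=42'  # hypothetical last-page href
match = re.search(r'p(\d+)(\?.*)?$', last_page)
if match:
    total, query = int(match.group(1)), match.group(2) or ''
    for page in range(1, total + 1):
        # Swap only the trailing page number; the rest of the URL is untouched.
        print(re.sub(r'p\d+(\?.*)?$', 'p{}{}'.format(page, query), last_page))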
Example #2
def GetUSDTmarketPrice():
    '''
    url = "http://webforex.hermes.hexun.com/forex/quotelist?code=FOREXUSDCNY&column=Code,Price"
    req = requests.get(url)
    html = req.text
    #print(html)
    s = re.findall("{.*}",str(html))[0]
    sjson = json.loads(s)


    USDTmarketPrice = float(sjson["Data"][0][0][1]/10000.00)
    return USDTmarketPrice
    '''
    list1=['1','2','3','4','5','6','7']
    pricelist=[]

    #marketRes=requests.get("http://www.feixiaohao.com/currencies/bitcoin/")
    marketRes=PyQuery(url="http://www.feixiaohao.com/currencies/tether/")
    #xxx=marketRes("#markets").html()
    for data in marketRes("tr"):
        priceinfo=PyQuery(data).text().encode("utf-8")
        if priceinfo[0] in list1:
            prices=priceinfo.split(" ")[3].replace("¥","")
            prices=prices.replace(",","")
            pricelist.append(float(prices))

    return sum(pricelist)/len(pricelist)
Example #3
def calculate_concat(question_text, answer):
    query_url = define_url(question_text, answer.get_text())
    google_results, full_page = search(query_url, full_page=True)
    # Extract the number of total google results
    answer.total_results = get_google_total_results(full_page)

    for result in google_results:
        result_text = PyQuery(result).text()
        # If Google doesn't find enough results, it includes some that aren't really relevant,
        # adding "Missing words: <keywords>", where keywords are words
        # included in the search query (answer, here).
        # In this context, the results described are not useful and are excluded

        # If the answer is in the result text
        # If "Mancanti:" is not in the result text (so, it's a relevant result)
        # If the answer is not in the "Must include" section
        if answer.get_text() in result_text.lower() and \
                "Mancanti:" not in result_text and \
                answer.get_text() not in result_text.split("\n")[-1].lower():
            # Yay! This is a relevant result!
            answer.results += 1

    # Calculate the score of the answer
    answer.score = answer.total_results * (answer.results
                                           if answer.results > 0 else 1)

    return f"{answer.get_text()[:40]:^40}{answer.score:<10}{answer.results:^10}{answer.total_results:<10}"
Example #4
 def test_user_with_permisions_follows_everyone(self):
     response = self.get_response()
     query = PyQuery(response.content)
     query = query("table#queryTable td.name").text()
     names = [u.username for u in list(self.watcher.watches.all()) + [self.no_perms_user.user]]
     names.append("watcher")
     self.assertEqual(set(query.split()), set(names))
Example #5
 def test_followed_users_shows_correctly(self):
     response = self.get_response()
     query = PyQuery(response.content)
     query = query("table#queryTable td.name").text()
     names = [u.username for u in self.watcher.watches.all()]
     names.append("watcher")
     self.assertEqual(set(query.split()), set(names))
Example #7
    def get_info(self, area):
        try:

            # print pqhtml.outerHtml()

            form = area('#buyform')

            pvid = form('#arypvid').attr('value')

            stock = form('#arystock').attr('value')

            text = form('#arysubtext').attr('value')

            price = PyQuery(form('#aryprice').attr('value')
                            or 'None').text().replace(' ', '')

            origprice = PyQuery(form('#aryorigprice').attr('value')
                                or 'None').text().replace(' ', '')

            pvids = filter(lambda x: x, pvid.split('|'))
            texts = filter(lambda x: x,
                           text.split('|')) or [self.cfg.DEFAULT_ONE_SIZE]
            prices = map(
                lambda x: re.search(r'(\d[\d.]*)', x.replace(',', '')).groups(
                )[0], filter(lambda x: x, price.split('|')))
            stocks = map(lambda x: x if int(x) else self.DEFAULT_STOCK_NUMBER,
                         filter(lambda x: x, stock.split('|')))
            origprices = map(
                lambda x: re.search(r'(\d[\d.]*)', x.replace(',', '')).groups(
                )[0], filter(lambda x: x, origprice.split('|')))

            sizes = map(
                lambda x: zip(
                    ['sku', 'inventory', 'name', 'price', 'listPrice'], x),
                zip(pvids, stocks, texts, prices, origprices))
            sizes = map(lambda x: dict(x), sizes)

            price = max(prices)
            listPrice = max(origprices)

            if not sizes:
                raise ValueError('Get size, price, info Fail')

            return price, listPrice, sizes

        except Exception:
            raise
Example #9
def get_content_text(content):
    # content = [s.extract() for s in content('style')]
    content_text = PyQuery(str(content)).text()
    content_text = content_text.replace('\r\n', '\n').replace('\r', '\n')
    final_content_text = ''
    for each_text in content_text.split('\n'):
        each_final_text = remove_special_char(each_text).strip()
        if each_final_text != '':
            final_content_text += each_final_text + '\n'
    return final_content_text.strip()
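The loop above is a line-level cleanup: normalize every newline flavor to \n, strip each line, and drop empties (remove_special_char is project-specific and not shown). The same idea on a plain string:

raw = ' first line\r\nsecond line\r  \nthird line '
text = raw.replace('\r\n', '\n').replace('\r', '\n')
# Keep non-empty lines, each stripped of surrounding whitespace.
print('\n'.join(line.strip() for line in text.split('\n') if line.strip()))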
Example #10
 def parse_page(self, page, link):
     doc = PyQuery(page)
     hrefs = doc.find('a[href]')
     for href in hrefs:
         href_attr = PyQuery(href).attr("href")
         href_no_qs = href_attr.split("?")[0]
         if href_no_qs not in self.queue:
             if not self.is_absolute(href_attr):
                 self.queue.append(urljoin(link, href_no_qs))
                 self.add_to_backlinks(link, urljoin(link, href_no_qs))
             else:
                 self.queue.append(href_no_qs)
                 self.add_to_backlinks(link, href_no_qs)
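is_absolute is not defined in this snippet; a common implementation (an assumption here, not the class's actual code) checks for a network location in the parsed URL:

from urllib.parse import urljoin, urlparse

def is_absolute(url):
    # An absolute URL carries a host; a relative one does not.
    return bool(urlparse(url).netloc)

assert is_absolute('http://example.com/page')
assert not is_absolute('/docs/index.html')
# urljoin resolves a relative href against the page it was found on:
assert urljoin('http://example.com/a/b.html', '/docs/x') == 'http://example.com/docs/x'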
Example #11
    def parse_page(self, response):
        """
        解析页码
        :param response:
        :return:
        """
        city_name = response.meta.get('city_name')
        adname = response.meta.get('adname')
        end_page = PyQuery(
            response.body).find('.page').find('a').eq(-2).attr('href')
        dic = {"city_name": city_name, "adname": adname}
        if end_page:
            num = end_page.split('/')[-1].split('p')[-1]
            try:
                c_num, aid = num.split('?')
            except Exception:
                c_num = num
                aid = ""
            head, head1, mid, end = end_page.split('p')

            for c_page in range(1, int(c_num) + 1):
                if aid == '':
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page)
                else:
                    n_page = head + 'p' + head1 + 'p' + mid + 'p{}'.format(
                        c_page) + '?' + aid
                create_sleep()
                yield scrapy.Request(url=n_page,
                                     meta=dic,
                                     callback=self.parse_area,
                                     headers=header1)
        else:
            create_sleep()
            yield scrapy.Request(url=response.url,
                                 meta=dic,
                                 callback=self.parse_area,
                                 headers=header1)
Example #12
def get_questions():
    with session() as s:
        login(s, register=True)

        r = browse(s, BASE_URL)
        pq = PyQuery(r)

        questions = []
        for question_div in pq('div.grid-question'):
            link = PyQuery(question_div).parent().attr('href')
            if '/play/' in link:
                questions.append(link.split('/play/')[-1])

        return sorted(questions, key=int)
Example #13
def GetBtcMarketPrice():
    list1=['1','2','3','4','5','6','7']
    pricelist=[]

    #marketRes=requests.get("http://www.feixiaohao.com/currencies/bitcoin/")
    marketRes=PyQuery(url="http://www.feixiaohao.com/currencies/bitcoin/")
    #xxx=marketRes("#markets").html()
    for data in marketRes("tr"):
        priceinfo=PyQuery(data).text().encode("utf-8")
        if priceinfo[0] in list1:
            prices=priceinfo.split(" ")[3].replace("¥","")
            prices=prices.replace(",","")
            pricelist.append(float(prices))  # int() would fail on decimal strings like '6543.21'

    return sum(pricelist)/len(pricelist)
Example #14
def get_cloud_rate(scene_name):
    """Read the MTL file and return the cloud_rate of the scene."""
    sat = 'L%s' % scene_name[2]
    mtl_path = join(settings.MEDIA_ROOT, sat, scene_name, scene_name + '_MTL.txt')

    if isfile(mtl_path):
        with open(mtl_path, 'r') as f:
            lines = f.readlines()
            cloud_rate = [float(line.split(' = ')[-1]) for line in lines if 'CLOUD_COVER' in line][0]
            return cloud_rate
    else:
        url_code = get_metadata_code(scene_name)
        metadata = PyQuery(
            'http://earthexplorer.usgs.gov/metadata/%s/%s/' % (url_code, scene_name)
        )
        metadata = metadata.text()[metadata.text().find('Cloud Cover '):]
        return float(metadata.split(' ')[2])
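The MTL branch keys on lines shaped like 'CLOUD_COVER = <value>'; a one-line check of that parse (the sample line is invented):

line = '    CLOUD_COVER = 12.34'
assert float(line.split(' = ')[-1]) == 12.34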
Example #15
 def verify_token(self):
     for x in range(5):
         try:
             WebDriverWait(self.driver, wait).until(
                 EC.element_to_be_clickable((By.CSS_SELECTOR, "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-setting > div > div.card-body.d-flex.flex-column > div.overflow-container.flex-grow-1 > ul > li")))
             HTML = self.driver.find_elements_by_css_selector(".card-control")[
                         0].get_attribute("innerHTML")
             Doc = PQ(HTML)
             Doc = Doc('.list-group-item-action').text()
             Doc = Doc.replace(" ", "\n")
             Doc = Doc.split("\n")
             # print(Doc)
             path = Doc.index("表名")  # "表名" = the "table name" label in the UI
             pathh = '//*[@id="dp_ads.' + Doc[path+1] +'"]'
             self.driver.find_element_by_xpath(pathh).click()
             break
         except Exception:
             time.sleep(3)
     return "dp_ads." + Doc[path+1]
Example #16
def get_info_5(file_html):
    """
    获取页面关键信息
    """
    text_pq = Pq(file_html)
    tr_list = text_pq('.normal-fir').find('.name')

    for tr_item in tr_list:
        td_list = Pq(tr_item).find('a')
        item_dict = {
            # Keys stay in Chinese: they are matched against labels scraped from the page.
            u'公司编号': td_list.attr('company_id'),  # company ID
            u'公司名称': '',  # company name
            u'公司链接': '',  # company link
            u'职位名称': td_list.text(),  # job title
            u'职位链接': td_list.attr('href'),  # job link
            u'薪资待遇': '',  # salary
            u'工作地点': '',  # work location
            u'工作经验': '',  # work experience
            u'最低学历': '',  # minimum education
            u'招聘人数': '',  # number of openings
            u'公司规模': '',  # company size
        }

        company = Pq(tr_item).find('.s-tit14.fl')
        item_dict[u'公司名称'] = company.text()

        introduce_list = Pq(tr_item).find('.s-butt.s-bb1 ul li')
        for introduce_item in introduce_list:
            item_text = Pq(introduce_item).text()
            item_list = item_text.split(': ')
            if len(item_list) < 2:
                continue
            key = item_list[0]
            value = item_list[1]
            item_dict[key] = value
        # fetch the company's contact details
        contact_dict = get_contact(item_dict[u'公司编号'])
        item_dict = dict(item_dict, **contact_dict)
        yield item_dict
    print('%s records on this page' % len(tr_list))
Example #18
def parseDetail(url, place):
    logger.info(url)
    qDetail = PyQuery(url)
    place["pNmEng"] = qDetail(".dest_toptitle > div > div > p").remove('span').text().strip()
    try:
        place["pDesc"] = qDetail(".toggle_l:first > .text_style").html().strip()
    except Exception as ex:
        logger.error(ex)
    # mapSrc = qDetail(".s_sight_map > a > img").attr('src').split('%7C')[1].split('&')[0]
    # place["lng"] = mapSrc.split(',')[1]
    # place["lat"] = mapSrc.split(',')[0]
    place["lng"] = qDetail("#Lon").val()
    place["lat"] = qDetail("#Lat").val()
    ctypeAList = qDetail(".s_sight_con:first > a")
    place["viewTypes"] = []
    for element in ctypeAList[:3]:
        viewType = {}
        viewHref = PyQuery(element).attr("href")
        viewType["codeId"] = viewHref.split("/")[-1].split(".")[0].replace("s", "")
        viewType["codeName"] = PyQuery(element).text()
        place["viewTypes"].append(viewType)
    try:
        place["contactTel"] = PyQuery(qDetail(".s_sight_con")[2]).text().strip()
        place["website"] = PyQuery(qDetail(".s_sight_con")[3])("a").text()
    except Exception as ex:
        logger.error(ex)
    place["openHours"] = ""
    for element in qDetail("dt:contains('开放时间')").nextAll("dd"):
        place["openHours"] += PyQuery(element).outerHtml()
    place["expense"] = ""
    for element in qDetail("dt:contains('门票信息')").nextAll("dd"):
        place["expense"] += PyQuery(element).outerHtml()
    place["districtid"] = qDetail("#ctmdistrict").val() # 取得圖片用ID, #JS_DistrictId的值一樣
    place["resourceid"] = qDetail("#wentClickID").attr("dataresource-cat") # 取得圖片用ID
    place["totalImgCount"] = qDetail(".r_text").text().replace("全部", "").replace("张照片", "") # 取得圖片用,數量
    place["countryEngName"] = inputCountryJson["eName"]
    # place["countryChnName"] = PyQuery(qDetail("i.arrow")[1]).parent('a').text() # 國家中文
    place["countryChnName"] = inputCountryJson["cName"] # 國家中文
    place["cityEngName"] = qDetail("#EName").val() # city英文
Example #19
    def click_dataset(self,lan):
        #---PyQuery→Xpath---
        for x in range(5):
            try:
                HTML = self.driver.find_elements_by_css_selector(".card-control")[
                            0].get_attribute("innerHTML")
                Doc = PQ(HTML)
                Doc = Doc('.list-group-item-action').text()
                Doc = Doc.replace(" ", "\n")
                Doc = Doc.split("\n")
                # print(Doc)
                path = Doc.index(lan)
                pathh = '//*[@id="dp_ads.' + Doc[path+1] +'"]'
                self.driver.find_element_by_xpath(pathh).click()
                break
            except Exception:
                time.sleep(3)

        # Compare against the dimension criteria shown on the page
        WebDriverWait(self.driver, wait).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)")))
        check = self.driver.find_element_by_css_selector("body > app-root > app-golden-main > app-golden-content > main > div > div > aside.block-fitler > div > div.card-body > h3:nth-child(3)").get_attribute("innerText")
        return check 
Example #20
 def load_file_list(self):
     try:
         sl = self.db_id.lower()
         file_url = BASE_FILE_URL + sl + "/"
         txt = load_with_retry(file_url)
         pq = PyQuery(txt)
         pq = pq('a')
         files = []
         for a in pq[1:]:
             name = a.attrib['href'].split('/')[-1]
             if not name.lower().startswith(sl + self.file_prefix_delimiter):
                 continue
             prvtxt = PyQuery(a).prev()[0].tail
             prvtxt = prvtxt.split()
             f = {
                 "name": name[len(sl) + len(self.file_prefix_delimiter):],
                 "modified": datetime.datetime.strptime(prvtxt[0] + " " + prvtxt[1] + " " + prvtxt[2],
                                                        "%m/%d/%Y %H:%M %p"),
                 "size": int(prvtxt[3])
             }
             files.append(f)
         return files
     except Exception:
         return []
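The .prev()[0].tail trick reads the free text an IIS-style directory index places before each link: a date, a time with AM/PM marker, and a byte size. A toy parse of one such fragment (the exact listing format is an assumption):

import datetime

prvtxt = '1/15/2020 3:04 PM 12345'.split()
modified = datetime.datetime.strptime(' '.join(prvtxt[:3]), '%m/%d/%Y %I:%M %p')
size = int(prvtxt[3])
print(modified, size)  # 2020-01-15 15:04:00 12345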
Example #21
def parse_transaction_tr(tr: PyQuery, domain: str) -> dict:
    """
    解析一页一号/二号平台站点的四类交易的tr
    :param tr: 
    :param domain: 
    :return: 
    """
    init_dict = dict()
    tds = tr.find("td")
    """平台1和平台2区别对待"""
    if domain == domain:
        """第一个td,取订单号和客户帐号"""
        first = PyQuery(tds[0])
        texts_1 = first.text().split("\n")
        ticket = int(re.search(r'\d{4,}', texts_1[0]).group())  # order number
        login = int(re.search(r'\d{6,}', texts_1[-1]).group())  # client account
        init_dict['ticket'] = ticket
        init_dict['login'] = login
        """第二个td,取英文名和真实姓名"""
        second = PyQuery(tds[1])
        texts_2 = second.text().split("\n")
        nick_name = texts_2[0][4:].strip("")
        real_name = texts_2[-1][5:].strip("")
        init_dict['nick_name'] = nick_name
        init_dict['real_name'] = real_name
        """第三个td,取交易指令和品种"""
        third = PyQuery(tds[2])
        texts_3 = third.text().split("\n")
        command = texts_3[0].lower()
        init_dict['command'] = command
        sys_val = domain
        print("domain = {}, command = {}, tds'length = {}".format(
            sys_val, command, len(tds)))
        init_dict['system'] = sys_val
        # print(ticket, command, texts_3)
        if command == "balance" or command == "credit":
            """出入金和赠金,少了几个td"""
            """第四个,交易时间"""
            eighth = PyQuery(tds[4]).text()
            the_time = get_datetime_from_str(eighth)  # 交易时间
            init_dict['time'] = the_time
            # print("出入金时间:{}".format(the_time))
            """
            第五个,盈亏
            """
            ninth = PyQuery(tds[5]).text()
            profit = re.search(r'[+, -]?\d+.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
                init_dict['profit'] = profit
            """第六个,点差"""
            tenth = PyQuery(tds[6]).text()
            spread_profit = None
            try:
                spread_profit = float(tenth)
            except ValueError as e:
                print(e)
            finally:
                if spread_profit is None:
                    pass
                else:
                    init_dict['spread_profit'] = spread_profit
            """第七个,注释"""
            comment = None
            try:
                eleventh = PyQuery(tds[7]).text()
                comment = eleventh

            except IndexError as e:
                print(e)
            finally:
                if comment is not None:
                    init_dict['comment'] = comment
                else:
                    pass

            init_dict = {k: v for k, v in init_dict.items() if v is not None}
        else:
            """buy和sell的情况"""
            symbol = ''
            if len(texts_3) > 1:
                symbol = texts_3[-1].lower()
                init_dict['symbol'] = symbol
            """第四个td,取交易手数"""
            fourth = PyQuery(tds[3])
            lot_find = re.search(r'\d+.?\d*', fourth.text())
            lot = lot_find if lot_find is None else float(lot_find.group()) if symbol != "hk50mini" else \
                float(lot_find.group()) / 10
            init_dict['lot'] = lot
            """
            第五个,取价格,
            """
            fifth = PyQuery(tds[4])
            prices = fifth.text().split("\n")
            enter_price = float(re.search(r'\d+.?\d*',
                                          prices[0]).group())  # open price
            exit_price = float(re.search(r'\d+.?\d*',
                                         prices[-1]).group())  # close price
            init_dict['enter_price'] = enter_price
            init_dict['exit_price'] = exit_price
            """
            第六个,止盈/止损,
            """
            sixth = PyQuery(tds[5])
            stop = sixth.text().split("\n")
            stop_losses = float(re.search(r'\d+.?\d*', stop[0]).group())  # 止损
            take_profit = float(re.search(r'\d+.?\d*', stop[-1]).group())  # 止盈
            init_dict['stop_losses'] = stop_losses
            init_dict['take_profit'] = take_profit
            """
            第七个,利息/佣金,
            """
            seventh = PyQuery(tds[6])
            seventh = seventh.text().split("\n")
            swap_match = re.search(r'[+, -]?\d+.?\d*', seventh[0])
            if swap_match is not None:
                swap = float(swap_match.group())  # swap (interest)
            else:
                swap = None
            commission_match = re.search(r'[+, -]?\d+.?\d*', seventh[-1])
            if commission_match is not None:
                commission = float(commission_match.group())  # commission
            else:
                commission = None
            init_dict['swap'] = swap
            init_dict['commission'] = commission
            """第八个,交易时间"""
            eighth = PyQuery(tds[7]).text()
            eighth = eighth.split("\n")
            if command not in ["balance", "credit"]:
                open_time = get_datetime_from_str(
                    eighth[0].split(":")[1])  # open time
                init_dict['open_time'] = open_time
                if eighth[-1].find("持仓中") != -1:  # "持仓中" = position still open (page text)
                    pass
                else:
                    close_time_list = eighth[-1].split(":")
                    if len(close_time_list) > 1:
                        close_time = get_datetime_from_str(
                            close_time_list[1])  # close time
                        init_dict['close_time'] = close_time
                    else:
                        pass
            else:
                pass
            """
            第九个,盈亏
            """
            ninth = PyQuery(tds[8]).text()
            profit = re.search(r'[+, -]?\d+.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
                init_dict['profit'] = profit
            """注意,平台1和平台2的列数不一样,平台1有点差,11列,平台2没有点差,10列"""
            if len(tds) == 11:
                """第十个,点差"""
                tenth = PyQuery(tds[-2]).text()
                spread_profit = float(tenth)
                init_dict['spread_profit'] = spread_profit
            else:
                pass
            """最后一个,注释"""
            eleventh = PyQuery(tds[-1]).text()
            comment = eleventh
            init_dict['comment'] = comment

    else:
        """平台2的解析"""
        """第一个td,取订单号和客户帐号"""
        first = PyQuery(tds[0])
        texts_1 = first.text().split("\n")
        ticket = int(re.search(r'\d{4,}', texts_1[0]).group())  # order number
        login = int(re.search(r'\d{6,}', texts_1[-1]).group())  # client account
        init_dict['ticket'] = ticket
        init_dict['login'] = login
        """第二个td,取英文名和MT名称"""
        second = PyQuery(tds[1])
        texts_2 = second.text().split("\n")
        nick_name = texts_2[0][4:].strip("")
        real_name = texts_2[-1][5:].strip("")
        init_dict['nick_name'] = nick_name
        init_dict['real_name'] = real_name
        """第三个td,取交易指令和品种"""
        third = PyQuery(tds[2])
        texts_3 = third.text().split("\n")
        command = texts_3[0].lower()
        init_dict['command'] = command
        sys_val = domain
        print("domain = {}, command = {}, tds'length = {}".format(
            sys_val, command, len(tds)))
        init_dict['system'] = sys_val
        # print(ticket, command, texts_3)
        if command == "balance" or command == "credit":
            """出入金和赠金,少了几个td"""
            """第四个,交易时间"""
            eighth = PyQuery(tds[4]).text()
            the_time = get_datetime_from_str(eighth)  # 交易时间
            init_dict['time'] = the_time
            # print("出入金时间:{}".format(the_time))
            """
            第五个,盈亏
            """
            ninth = PyQuery(tds[5]).text()
            profit = re.search(r'[+, -]?\d+.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
                init_dict['profit'] = profit
            """第六个,注释"""
            comment = None
            try:
                eleventh = PyQuery(tds[-1]).text()
                comment = eleventh

            except IndexError as e:
                print(e)
            finally:
                if comment is not None:
                    init_dict['comment'] = comment
                else:
                    pass

            init_dict = {k: v for k, v in init_dict.items() if v is not None}
        else:
            """buy和sell的情况"""
            symbol = ''
            if len(texts_3) > 1:
                symbol = texts_3[-1].lower()
                init_dict['symbol'] = symbol
            """第四个td,取交易手数"""
            fourth = PyQuery(tds[3])
            lot_find = re.search(r'\d+.?\d*', fourth.text())
            lot = lot_find if lot_find is None else float(lot_find.group()) if symbol != "hk50mini" else \
                float(lot_find.group()) / 10
            init_dict['lot'] = lot
            """
            第五个,取价格,
            """
            fifth = PyQuery(tds[4])
            prices = fifth.text().split("\n")
            enter_price = float(re.search(r'\d+.?\d*',
                                          prices[0]).group())  # open price
            exit_price = float(re.search(r'\d+.?\d*',
                                         prices[-1]).group())  # close price
            init_dict['enter_price'] = enter_price
            init_dict['exit_price'] = exit_price
            """
            第六个,止盈/止损,
            """
            sixth = PyQuery(tds[5])
            stop = sixth.text().split("\n")
            stop_losses = float(re.search(r'\d+.?\d*', stop[0]).group())  # 止损
            take_profit = float(re.search(r'\d+.?\d*', stop[-1]).group())  # 止盈
            init_dict['stop_losses'] = stop_losses
            init_dict['take_profit'] = take_profit
            """
            第七个td是空的
            """
            """第八个,交易时间"""
            eighth = PyQuery(tds[7]).text()
            eighth = eighth.split("\n")
            if command not in ["balance", "credit"]:
                open_time = get_datetime_from_str(
                    eighth[0].split(":")[1])  # open time
                init_dict['open_time'] = open_time
                if eighth[-1].find("持仓中") != -1:  # "持仓中" = position still open (page text)
                    pass
                else:
                    close_time_list = eighth[-1].split(":")
                    if len(close_time_list) > 1:
                        close_time = get_datetime_from_str(
                            close_time_list[1])  # close time
                        init_dict['close_time'] = close_time
                    else:
                        pass
            else:
                pass
            """
            第九个,盈亏
            """
            ninth = PyQuery(tds[8]).text()
            profit = re.search(r'[+, -]?\d+.?\d*', ninth)
            if profit is not None:
                profit = float(profit.group())
                init_dict['profit'] = profit
            """注意,平台1和平台2的列数不一样,平台1有点差,11列,平台2没有点差,10列"""
            if len(tds) == 11:
                """第十个,点差"""
                tenth = PyQuery(tds[-2]).text()
                spread_profit = float(tenth)
                init_dict['spread_profit'] = spread_profit
            else:
                pass
            """最后一个,注释"""
            eleventh = PyQuery(tds[-1]).text()
            comment = eleventh
            init_dict['comment'] = comment
    """先整理初始化字典"""
    init_dict = {k: v for k, v in init_dict.items() if v is not None}  # 去None
    """只记录指定类型的单子"""
    if init_dict['command'] in ['balance', 'credit', 'buy', 'sell']:
        return init_dict
    else:
        return None
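get_datetime_from_str is not included with this example; a plausible minimal stand-in, assuming timestamps rendered like '2018.01.02 15:04:05' (the real pages may use another format, and the split(":") above may hand it only a partial string):

from datetime import datetime

def get_datetime_from_str(s):
    # The format string is a guess; adjust to what the platform pages actually show.
    return datetime.strptime(s.strip(), '%Y.%m.%d %H:%M:%S')

print(get_datetime_from_str(' 2018.01.02 15:04:05 '))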
Example #22
import os
import sys
from subprocess import call

import requests
from pyquery import PyQuery

url = 'http://www.vagrantup.com/downloads.html'

request = requests.get(url)
links = PyQuery(request.text)

foi = False
for link in links('a'):
	if PyQuery(link).attr('href')[-3:] == 'dmg':
		foi = True
		break

if foi:
	remotefilename = PyQuery(link).attr('href')
	filename = remotefilename.split('/')[-1]
	locfile = os.environ['HOME'] + '/Downloads/' + filename
	if os.path.isfile(locfile):
		print(locfile + ' is already downloaded')
	else:
		stream = requests.get(remotefilename, stream = True)
		with open(locfile, 'wb') as fd:
			print "getting " + filename
			count = 0
			for chunk in stream.iter_content(4096):
				count += 1
				if count%100 == 0:
					sys.stdout.write('.')
				fd.write(chunk)
			print('done')
		print('Vagrant dmg file downloaded to ' + os.environ['HOME'] + '/Downloads')
Example #23
 def test_not_billiable_projects_not_shown(self):
     response = self.test_client.get(reverse('client_projects'))
     query = PyQuery(response.content)
     query = query('table#queryTable td.name').text()
     self.assertNotIn('FakeProject3', set(query.split()))
Example #24
 def test_billing_types_for_each_project_shown(self):
     response = self.test_client.get(reverse('client_projects'))
     query = PyQuery(response.content)
     query = query('table#queryTable td.type').text()
     self.assertEqual(set(('HOUR', 'FIXED')), set(query.split()))
Example #25
 def test_all_billiable_projects_shown(self):
     response = self.test_client.get(reverse('client_projects'))
     query = PyQuery(response.content)
     query = query('table#queryTable td.name').text()
     self.assertEqual(set(('FakeProject1', 'FakeProject2')),
                      set(query.split()))
Example #26
 root = etree.HTML(html)
 dl = root.xpath('//*[@class="overview"]//dl')
 item_dict = {}
 for i in dl:
     print(PyQuery(i).text())
     item = PyQuery(i).text()
     if ":" in item:
         itemlist = item.split(':', 1)
     elif ":" in item:
         itemlist = item.split(':', 1)
     item_dict[itemlist[0]] = itemlist[1].strip()
     pass
 select = root.xpath(
     '//*[@id="shareholderInfo_wrapper"]//table//tbody/tr')
 title = ['序号', '股东名称']  # column titles: serial number, shareholder name
 gd = {}
 gudong = {}
 for i in select:
     nom = 0
     for j in title:
         gd[title[nom]] = i.xpath(".//text()")[nom]
         nom += 1
     gudong[gd['序号']] = gd
Example #27
def scan_proxy_qiaodm():
    """
    扫描代理资源
    :return:
    """
    import requests
    from pyquery import PyQuery as Pq

    source_site = 'http://ip.qiaodm.com/'

    header = {
        'Host':
        'ip.qiaodm.com',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36'
    }

    s = requests.session()
    # fetch the page
    file_html = s.get(source_site).content

    # save to a file
    # with open('test.html', 'a') as f:
    #     f.write(file_html.encode('utf-8'))
    #
    # # read the fetched page back
    # with open('test.html', 'r') as f:
    #     file_html = f.read()

    text_pq = Pq(file_html)
    tr_list = text_pq('tbody').find('tr[style="text-align: center;"]')
    print('%s records on this page' % len(tr_list))
    for tr_item in tr_list:
        # print Pq(tr_item).html()
        # print('---------------------')
        td_list = Pq(tr_item).find('td')
        # print('%s columns in this record' % len(td_list))
        field_list = []
        for td_item in Pq(td_list):
            field = Pq(td_item).text()
            field_list.append(field)
            # print field
            # print('++++++++++++++++++')

        # special handling for the IP address
        ip = Pq(td_list).eq(0).html()
        # strip the decoy markup
        ip = html.replace_html(ip, r'<p style="display:none;"/>')
        ip = html.replace_html(ip, r'<p style="display: none;"/>')
        ip = html.replace_html(ip, r'<p style=.*?display:.*?none;.*?>.*?</p>')
        # strip remaining tags
        ip = html.strip_html(ip)
        # print ip
        # skip malformed IP addresses
        if len(ip.split('.')) != 4:
            continue

        # special handling for the port (mapped from a CSS class)
        port_key = Pq(td_list).eq(1).attr('class').split()[1]
        if port_key not in PortDict:
            print('found new port class: %s' % port_key)
            continue
        port = PortDict.get(port_key, '')

        ProsyItem['Ip'] = ip.replace(' ', '')
        ProsyItem['Port'] = port
        ProsyItem['Type'] = field_list[2].strip()
        ProsyItem['AnonymousDegree'] = field_list[3].strip()
        ProsyItem['Area'] = field_list[4].strip()
        ProsyItem['Speed'] = field_list[5].strip()
        ProsyItem['ScanTime'] = field_list[6].strip()
        # print ProsyItem
        proxy_item = json.dumps(ProsyItem, ensure_ascii=False)
        html.save_file('proxy.json', proxy_item + '\n', 'a')
Example #29
from pprint import pprint

'''
how to use
python3 getLineAllCities.py http://you.ctrip.com/sitelist/asia120001.html ./output/cities/asia
python3 getLineAllCities.py http://you.ctrip.com/sitelist/europe120002.html ./output/cities/europe
python3 getLineAllCities.py http://you.ctrip.com/sitelist/northamerica120004.html ./output/cities/northamerica
python3 getLineAllCities.py http://you.ctrip.com/sitelist/southamerica120005.html ./output/cities/southamerica
python3 getLineAllCities.py http://you.ctrip.com/sitelist/oceania120003.html ./output/cities/oceania
python3 getLineAllCities.py http://you.ctrip.com/sitelist/africa120006.html ./output/cities/africa
python3 getLineAllCities.py http://you.ctrip.com/sitelist/nanji120481.html ./output/cities/nanji
todo: Antarctica's places must be fetched separately, since Antarctica has no countries
todo: some countries are outside these 7 continents; confirm whether they should all be fetched
'''

'''
Main
'''
targetUrl = sys.argv[1]
outputDirectory = sys.argv[2]

qList = PyQuery(targetUrl)
for element in qList('.normalbox')('li > a'):
    countryUrl = PyQuery(element).attr('href').replace('/place', '/countrysightlist')
    targetJson = outputDirectory + "/" + countryUrl.split('/')[2].replace('.html', '') + ".json"
    countryUrl = "http://you.ctrip.com" + countryUrl
    # if (countryUrl == "http://you.ctrip.com/countrysightlist/southkorea100042.html"):
    pprint(countryUrl)
    pprint(targetJson)
    getCtripAllCity.main(countryUrl, targetJson)
Example #30
            rawdata = myutils.ungzip(response)
    #         print rawdata
            pquery = PyQuery(rawdata.decode('utf-8'))
            for li in pquery(".TreeList li"):
    
                self.pfolder = myutils.filenameCheck(PyQuery(li)("a").text())
                
                while os.path.exists(os.path.join(self.root,self.pfolder)):
                    self.pfolder = self.pfolder + "_2"
                try:
                    os.mkdir(os.path.join(self.root,self.pfolder))
                except Exception:
                    print("failed to create %s" % os.path.join(self.root, self.pfolder))
                else:
                    strParam = PyQuery(li)("a").attr('onclick')
                    aParam = strParam.split('(')[1].strip(')').split(',')
                    param = {}
                    param["id"] = aParam[0].strip().strip("'")
                    param["code"] = aParam[1].strip().strip("'")+ "?"
                    param["type"] = aParam[2].strip().strip("'")
                    param["fileid"] = aParam[3].strip().strip("'")
                    self.get_child_catalog(param)
                    time.sleep(1)
            self.deal_error()
            
    def get_child_catalog(self,param):

        url = "http://tongji.cnki.net/kns55/Navi/GetChildCatalog.aspx"
        
        req = urllib2.Request(url,urllib.urlencode(param),self.req_header)
#         print req.get_full_url()
Example #31
# `r` is assumed to be a requests response fetched earlier in the script
from pyquery import PyQuery as PQ

raw_html = r.text
pq = PQ(raw_html)

# Select just the set of elements that you want to extract
# Use CSS selectors!
ul = pq("ul")[1]
elements = PQ(ul).children()

# Once you have the elements identified, extract the text
texts = []
for el in elements:
    # extract text
    text = PQ(el).text()

    # Append to "texts" list
    texts.append(text)

# Split each text into a year and a full name
data = []
for text in texts:
    # Split the text into its (year, name) data points
    split_vars = text.split()
    year = int(split_vars[0][:4])
    name = " ".join(split_vars[1:])
    tup = (year, name)

    # Append the tuple to the "data" list
    data.append(tup)

print(data)
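A worked instance of the split above (sample string invented): the first four characters of the first token give the year, and the remaining tokens join into the name.

text = '1990-91 Jane Q. Public'
split_vars = text.split()
assert int(split_vars[0][:4]) == 1990
assert ' '.join(split_vars[1:]) == 'Jane Q. Public'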