Example #1
import requests
from pyquery import PyQuery


def get_url(url):
    response = requests.get(url)
    doc = PyQuery(response.text)
    for article in doc('article'):
        h = PyQuery(article)
        print h.find('h1.entry-title').text().encode('utf-8')
        print h.find('div.entry-content p').text().encode('utf-8')
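A minimal call sketch (Python 2, matching the print statements; assumes the target blog marks posts up with h1.entry-title and div.entry-content):

get_url('http://example.com/blog/')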
Example #2
    def test_calendar_tag_rendering(self, timezone_mock):
        timezone_mock.now.return_value = tz_datetime(2015, 1, 10, 12)
        page_with_apphook = self.create_base_pages()
        other_config = EventsConfig.objects.create(namespace='other')
        self.create_event(
            title='ev1',
            start_date=tz_datetime(2015, 1, 13),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            title='ev2',
            start_date=tz_datetime(2015, 1, 15),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            de=dict(
                title='ev3',
                start_date=tz_datetime(2015, 1, 16),
                publish_at=tz_datetime(2015, 1, 10)
            )
        )
        self.create_event(
            title='ev4',
            start_date=tz_datetime(2015, 1, 18),
            publish_at=tz_datetime(2015, 1, 10),
            app_config=other_config
        )
        self.create_event(
            title='ev5',
            start_date=tz_datetime(2015, 1, 22),
            end_date=tz_datetime(2015, 1, 27),
            publish_at=tz_datetime(2015, 1, 10)
        )
        self.create_event(
            title='ev6',
            start_date=tz_datetime(2015, 1, 25),
        )
        # use the test's default self.app_config namespace instead of
        # hard-coding it
        template_str = """
        {%% load aldryn_events %%}
        {%% calendar 2015 1 'en' '%s' %%}
        """ % self.app_config.namespace
        t = Template(template_str)
        with override('en'):
            html = t.render(SekizaiContext({}))
            table = PyQuery(html)('table.table-calendar')
            page_url_en = page_with_apphook.get_absolute_url()
        links = table.find('td.events, td.multiday-events').find('a')

        # test if tag rendered important elements
        self.assertEqual('1', table.attr('data-month-numeric'))
        self.assertEqual('2015', table.attr('data-year'))
        self.assertEqual('10', table.find('td.today').text())
        self.assertEqual(8, links.length)  # 13, 15, 22, 23, 24, 25, 26, 27
        expected_days = (13, 15, 22, 23, 24, 25, 26, 27)
        for position, day in enumerate(expected_days):
            event_url = '{0}2015/1/{1}/'.format(page_url_en, day)
            rendered_url = links[position].attrib['href']
            self.assertEqual(event_url, rendered_url)
Example #3
def get_phonetic_symbol(text):
    data = { "keyfrom" : "deskdict.mini", "q" : text, "doctype" : "xml", "xmlVersion" : 8.2,
             "client" : "deskdict", "id" : "cee84504d9984f1b2", "vendor": "unknown", 
             "in" : "YoudaoDict", "appVer" : "5.4.46.5554", "appZengqiang" : 0, "le" : "eng", "LTH" : 40}
    ret = requests.get("http://dict.youdao.com/search", params=data).text
    if isinstance(ret, unicode):
        ret = ret.encode('utf-8')
    pq = PyQuery(ret, parser="xml")
    
    phonetic_symbol = pq.find('usphone').text()
    phonetic_type = _("US")
    
    try:    
        if phonetic_symbol == '':
            phonetic_symbol = pq.find('ukphone').text()
            phonetic_type = _("UK")
    except: 
        pass
    
    if not phonetic_symbol or phonetic_symbol.isspace():
        return ""
    else:
        if isinstance(phonetic_type, unicode):
            phonetic_type = phonetic_type.encode('utf-8')

        if isinstance(phonetic_symbol, unicode):
            phonetic_symbol = phonetic_symbol.encode('utf-8')
        
        return "[%s] %s" % (phonetic_type, phonetic_symbol)
Example #4
def get_old_fashion_comments(answer_url):
  aid = comment_list_id(answer_url)
  comment_box_link = 'http://www.zhihu.com/node/AnswerCommentBoxV2?params=%7B%22answer_id%22%3A%22{}%22%2C%22load_all%22%3Atrue%7D'.format(aid)  # | log
  # log('comments: ' + comment_box_link)
  r = old_client._session.get(comment_box_link)
  # print(str(r.content))
  doc = PyQuery(str(r.content, encoding='utf-8'))
  comments = []
  for div in doc.find('div.zm-item-comment'):
    div = PyQuery(div)
    cid = div.attr('data-id')
    vote_count = int(div.find('span.like-num').find('em').text())
    content = div.find('div.zm-comment-content').html()
    author_text = div.find('div.zm-comment-hd').text().replace('\n', ' ')
    if ' 回复 ' in author_text:
      author, reply_to = author_text.split(' 回复 ')
    else:
      author, reply_to = author_text, None

    comment = OldFashionComment(cid=cid,
                                vote_count=vote_count,
                                content=content,
                                author=OldFashionAuthor(author),
                                reply_to=OldFashionAuthor(reply_to) if reply_to else None)
    comments.append(comment)
  return comments
Example #5
    def update_forums(client, group, session):
        logging.info("Updating forums list for {}".format(group))
        query = Forum.get_forum_page(client, group.gid)
        reg = regex.compile(r"^forum\.php\?mod=forumdisplay&fid=(\d+)$")

        for row in query.find("table.fl_tb>tr"):
            sub_query = PyQuery(row)
            href = sub_query.find("td").eq(1).find("a").attr("href")
            if not href:
                continue

            fid = int(reg.findall(href)[0])

            name = sub_query.find("td").eq(1).find("h2>a").clone().children().remove().end().text()
            last_update = sub_query.find("td").eq(3).find("div>cite").clone().children().remove().end().text()
            last_update = dateparse(last_update)

            existence = session.query(Forum).filter(Forum.fid == fid)
            if existence.count() == 0:
                logging.info("<Forum(fid={})> not found, creating one".format(fid))
                forum = Forum(fid=fid, name=name, updated_at=last_update, group=group, fresh=False)
                session.add(forum)
            else:
                forum = existence.one()
                if forum.updated_at != last_update:
                    logging.info("{} found, stale: against {} ".format(forum, last_update))
                    forum.updated_at = last_update
                    forum.fresh = False
                    session.add(forum)
                else:
                    logging.info("{} found, fresh".format(forum))
Example #6
    def _parse_table(self, table):

        # Initialize table
        parsed_rows = []

        # Parse table
        qtable = PyQuery(table)

        # Get headers
        headers = self._get_headers(qtable)
        if not headers:
            return

        # Get rows
        rows = qtable.find("tr")

        # Loop over rows
        for row in rows:

            # Get columns
            qrow = PyQuery(row)
            cols = qrow.find("td").map(self._get_text)[:]

            # Parse column values
            for colidx in range(len(cols)):
                col = reduce(lambda x, y: re.sub(y[0], y[1], x), self._trans, cols[colidx])
                cols[colidx] = col

            # Append parsed columns
            if cols:
                parsed_rows.append(cols)

        return {"headers": headers, "data": parsed_rows}
Example #7
def process_chapter(chapter_path, index_path, enable_stem):

	# removed characters
	remove_chars = ['.',',',';','?','!','-',u'–',u'―',u'—',u'~',':','"',')','(','[',']','/','\\',"'s",u'’s',"'",u'‘',u'’',u'“',u'”', u'¿', '*', '<','>','&','{','}']

	restricted_words = ['a', 'and', 'about', 'above', 'across', 'after', 'against', 'along', 'among', 'around', 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 'but', 'by', 'despite', 'down', 'during', 'except', 'for', 'from', 'in', 'inside', 'into', 'like', 'near', 'of', 'off', 'on', 'onto', 'out', 'outside', 'over', 'past', 'since', 'the', 'through', 'throughout', 'till', 'to', 'toward', 'under', 'underneath', 'until', 'up', 'upon', 'with', 'within', 'without']
		
	# create jQuery object
	html = open(chapter_path, 'r').read()
	jquery = PyQuery(html)
	
	print jquery.find('.chapter').attr('data-osis')
	
	# find all verses, remove all notes and verse numbers
	verses = jquery('span.verse')
	verses.find('span.note').remove()
	verses.find('span.cf').remove()
	verses.find('.v-num').remove()
	
	for verse in verses:
		v = PyQuery(verse)
		osis = v.attr('data-osis')
		text = v.text()
		
		# remove punctuation
		for s in remove_chars:
			text = text.replace(s, '')
		
		words = text.split(' ')
		
		for word in words:
			word = word.strip().lower()
			
			# a membership test is the clean way to do this in Python
			is_restricted = word in restricted_words
								
			if word != '' and not is_restricted and not word.isnumeric():
			
				# stemmer?
				if enable_stem:
					word = stem(word)
			
				word_path = index_path + word + '.json'
				
				# check for file
				if os.path.exists(word_path):
					f = open(word_path,'a')
					f.write(',"' + osis + '"')
					f.close()
				else:
					f = open(word_path,'a')
					f.write('["' + osis + '"')
					f.close()				
Example #8
    def test_device_elements(self):
        response = self.client.get(self.url)
        self.assertEqual(response.status_code, 200)

        tree = PyQuery(response.content)
        radios = tree.find('input[type="radio"]')
        self.assertEqual(len(radios), 2)

        tree = PyQuery(response.content)
        checkboxes = tree.find('input[type="checkbox"]')
        self.assertEqual(len(checkboxes), 1)
Example #9
    def getFullInfo(self, interval=0.5):
        self.getBasicInfo()

        # collCount
        if self.INFO['shopType'] in ['7', '1', '3', '4']:
            regxrs = re.findall(
                r'J_SCollCount\"\s+data\-info\=\"param\=(.+?)\&',
                self.content)
            if regxrs:
                params = {'keys': regxrs[0], 't': '%.0f' % (time.time()
                          * 1000), 'callback': 'TShop.setShopStat'}
                domain = 'http://count.tbcdn.cn/counter3'
                tUrl = domain + '?' + '&'.join([k + '=' + v for (k, v) in
                                                params.items()])
                r = request(tUrl)
                self.INFO['collCount'] = str(re.findall(r'\"\S+\"\:(\d+)',
                                                        r.text, re.S)[0])
            else:
                if PyQuery:
                    pyjq_obj = PyQuery(self.content.decode(self.res.encoding))
                    data_info = (pyjq_obj.find('dl.collect-num dt') or pyjq_obj.find('.collect-num span')).attr('data-info')
                    count_url = re.sub(
                        r'param=(?P<param>.+?)\&countUrl=(?P<count>.+?)\&.+',
                        r'\g<count>?callback=jsonp357&t=%d&keys=\g<param>' % (time.time()*1000),
                        data_info,
                        flags=re.S)
                    collCount = re.sub(
                        r'.+\:\s*(?P<coll>\d+).+',
                        r'\g<coll>',
                        request(count_url).content)
                    if collCount:
                        self.INFO['collCount'] = int(collCount)

        else:
            coll_url = 'http://favorite.taobao.com/collect_item_relation.htm?itemtype=0&itemNumid=%s' % self.INFO['shopId']
            try:
                res = request(coll_url)
                pyjq_obj = PyQuery(res.content.decode(res.encoding))
                self.INFO['collCount'] = pyjq_obj.find("div.add-fav-msg strong").html().strip()
            except:
                pass
        time.sleep(interval)

        # itemAmount
        if self.INFO['shopType'] == '2':
            tUrl = self.INFO['shopLink'] + '?search=y&orderType=_hotsell'
            r = request(tUrl)
            try:
                self.INFO['itemAmount'] = \
                    str(re.findall(r'\<div\s+class\=\"search\-result\"\s*\>.+?(\d+)', r.text, re.S)[0])
            except:
                pass
        time.sleep(interval)
        return self.INFO
Example #10
    def test_initial_post(self):
        mock_request = self._get_post_request(type='generic')
        response, MockDevice = self._post_device_addition(mock_request)

        self.assertEqual(response.status_code, 200)
        self.assertFalse(MockDevice.objects.create.called)

        tree = PyQuery(response.content)
        self.assertEqual(len(tree.find('.error')), 0)
        device_name = tree.find('input[type=text][name=name]')
        self.assertEqual(len(device_name), 1)
        self.assertEqual(device_name.val(), 'Authentication device')
Example #11
    def parse(self, response):
        html = Pq(response.body)
        job = items.OnetJob()
        job['url'] = response.url
        job['alt_title'] = html.find('[class="titleb"]').text()
        job['job_sample'] = html.find(
            'p:contains("Sample of reported job titles:")').text()

        job['summary'] = html.find(
            '#realcontent').find('p:eq(0)').text()

        job['job_sample'] = job['job_sample'].replace(
            'Sample of reported job titles:', '').split(', ')

        job['tasks'] = self._list(html, '.section_Tasks .moreinfo')
        job['tools'] = self._list(
            html, '.section_ToolsTechnology .moreinfo:first')
        job['technology'] = self._list(
            html, '.section_ToolsTechnology .moreinfo:last')
        job['knowledge'] = self._list(html, '.section_Knowledge .moreinfo')
        job['skills'] = self._list(html, '.section_Skills .moreinfo')
        job['abilities'] = self._list(html, '.section_Abilities .moreinfo')
        job['work_activities'] = {
            'basic': self._list(html, '.section_WorkActivities .moreinfo'),
            'detailed': self._list(
                html, '.section_DetailedWorkActivities .moreinfo'),
        }
        job['work_context'] = self._list(
            html, '.section_WorkContext .moreinfo')

        job['job_zone'] = self._table(html, '#content table:first')
        job['education'] = self._table(html, '#content table:eq(1)')

        job['interests'] = self._list(html, None, custom=html.find(
            '[name="Interests"]').siblings('.moreinfo:first'))

        job['work_styles'] = self._list(
            html, '.section_WorkStyles .moreinfo')

        job['interests'] = self._list(html, None, custom=html.find(
            '[name="WorkValues"]').siblings('.moreinfo:eq(1)'))

        job['related_occupations'] = self._table(
            html, '.section_RelatedOccupations table')

        job['wages_employment'] = self._table(
            html, '[summary="Wages & Employment Trends information'
                  ' for this occupation"]')

        job['job_openings'] = ''
        job['additional_info'] = ''
        return job
Example #12
 def parse(self, response):
     category = items.ONetCategory()
     html = Pq(response.body)
     category['url'] = response.url
     category['name'] = html.find('.reportdesc:eq(0)').text().replace(
         'Save Table ( XLS / CSV )', '')
     category['id'] = response.url.replace('{}?i'.format(
         self.root_url), '').replace(
             '&g=Go', '').replace('=', '').replace('.', '')
     category['bls_url'] = html.find(
         'div.reportdesc a:first').attr('href')
     category['occupation_data'] = self._extract_occupations(html)
     return category
Example #13
def _fetch_mdn_page(url):
    data = bleach.clean(_get_page(url), attributes=ALLOWED_ATTRIBUTES,
                        tags=ALLOWED_TAGS, strip_comments=False)

    root = PyQuery(data)
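    # pull the article TOC list and the page body out of the cleaned markup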
    toc = root.find('#article-nav div.page-toc ol')[0]
    content = root.find('#pageText')[0]

    toc.set('id', 'mdn-toc')
    content.set('id', 'mdn-content')

    return (etree.tostring(toc, pretty_print=True),
        etree.tostring(content, pretty_print=True))
Example #14
 def feed(self, data, sentence):
     d = PyQuery(data)
     sets = d(".sentences_set")
     for s in sets:
         s = PyQuery(s)
         if s.find(".mainSentence .sentenceContent a").text().strip() == sentence:
             structure = s.find(".mainSentence .sentenceContent .romanization.furigana").text()
             translations = s.find(".translations:first") \
                             .find(".sentence > img[title='English']") \
                             .parent().find(".sentenceContent > a") \
                             .map(lambda i, o: o.text)
             return (structure, translations)
     return (None, None)
Example #15
    def _get_invoices(self):
        randomnum = str(int(math.floor((random.random() * 99999) + 1)))
        response = self.browser.get('https://www.endesaclientes.com/ss/Satellite?c=Page&pagename=SiteEntry_IB_ES%2FBill_Search%2FSearch_List&rand={}'.format(randomnum))
        pq = PyQuery(response.content)
        invoices = []

        def getParam(name, rowid):
            return pq.find('input[id={}_{}]'.format(name, rowid))[0].value

        for row in pq.find('.invoices_body_row'):
            invoice = {}
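            # row ids look like "trBill<n>"; strip the prefix to recover the numeric id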
            row_id = row.attrib['id'].replace('trBill', '')
            invoice['billNumber'] = getParam('numBill', row_id)
            invoice['secBill'] = getParam('secBill', row_id)
            invoice['contractNumber'] = getParam('contractNumber', row_id)
            invoice['holderCompanyCode'] = getParam('holderCompanyCode', row_id)
            invoice['businessLine'] = getParam('businessLine', row_id)
            invoice['numscct'] = ''
            invoice['refBill'] = getParam('refBill', row_id)

            date = pq(row).find('td')[3].text.strip()
            invoice['date'] = datetime.datetime.strptime(date, '%a %b %d %H:%M:%S %Z %Y').strftime('%Y-%m-%d')

            if not self._invoice_exists(invoice):
                invoice['method'] = 'get'
                invoice['url'] = self._pdf_download_url(invoice)
                invoice['name'] = self._invoice_name(invoice)
                invoices.append(invoice)
        return invoices
Example #16
def station_parse(content):
    '''Parsing bus station and check station.
    '''
    OFF = '0-0'
    stations = []
    bus_status = []
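    # the response carries a 3-character prefix before its JSON body, hence content[3:]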
    content = json.loads(content[3:].decode('utf-8'))
    status = content['status']
    info = content['info']
    if status == 1 and info != '':
        pqContent = PyQuery(info)('#upInfo li')
        for station in pqContent:
            pqStation = PyQuery(station)
            station_name = pqStation('.station').text()
            stations.append(station_name)
            buses = pqStation.find('.bus')
            if buses.size() > 0:
                left_count = 0
                on_station_count = 0
                for bus in buses:
                    if PyQuery(bus).attr('style'):
                        left_count+=1
                    else:
                        on_station_count+=1
                bus_status.append('{0}-{1}'.format(on_station_count, left_count))
            else:
                bus_status.append(OFF)
    if not stations:
        return None

    return (tuple(bus_status), tuple(stations))
Example #17
 def fetch_urls(self, queue, quantity):
     while not queue.empty():
         url = queue.get()
         html = self.s.get(url, headers=self.headers).text
         pq = PyQuery(html)
         size = pq.find('tbody tr').size()
         for index in range(size):
             item = pq.find('tbody tr').eq(index)
             ip = item.find('td').eq(0).text()
             port = item.find('td').eq(1).text()
             _type = item.find('td').eq(3).text()
             self.result_arr.append({
                 str(_type).lower(): '{0}://{1}:{2}'.format(str(_type).lower(), ip, port)
             })
             if len(self.result_arr) >= quantity:
                 break
Example #18
def get_meme_url(meme):
    gen = GENERATORS.get(meme)
    if gen:
        pq = PyQuery(url="http://memegenerator.net/%s" % gen[2])
        return pq.find('a img.large').attr('src')
    else:
        return None
Example #19
def getPageLinkIfValid(element, currentPageNumber):
    pyElement = PyQuery(element)
    pageNumberText = pyElement.find('span').text()

    if pageNumberText.isdigit() and int(pageNumberText) > currentPageNumber:
        return 'https://www.youtube.com' + pyElement.attr('href')
    return None
Example #20
def scrape_growler_guys(location):
    taps = {}

    if LOCAL_TEST:
        page = PQ(filename='test.html')
    else:
        page = PQ(location['url'], headers=HEADERS)
    
    beer_list = page('.tap-list li')
    for item in beer_list:
        beer_obj = PQ(item)
        tap_number = beer_obj.find('.tap_number').text().strip()

        beer = build_beer_record(
            location = location['name'],
            name = beer_obj('.beerName .title').text().strip().encode('utf-8'),
            style = beer_obj('.beerName .style').text().strip().strip('- ').lower().encode('utf-8'),
            brewery = beer_obj('.brewery').text().strip().encode('utf-8'),
            city = beer_obj('.breweryInfo .txt').text().strip().strip('- ').replace(' ,',',').encode('utf-8'),
        )

        # make a hash value for the key
        h = hashlib.md5(b'{0} {1}'.format(beer['location'], tap_number))
        beer_key = h.hexdigest()
        taps[beer_key] = beer

    return taps
Example #21
def html_to_records(html):
    pq = PyQuery(html)
    rows = pq.find('table tr')
    # build a list (not a lazy map) so headers can be reused for every row
    get_row = lambda r: [cell.text for cell in r]
    headers = get_row(rows[0])
    for row in rows[1:]:
        yield dict(zip(headers, get_row(row)))
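A minimal driver for the generator above (markup is illustrative):

sample = '''
<div><table>
  <tr><th>name</th><th>age</th></tr>
  <tr><td>ada</td><td>36</td></tr>
</table></div>
'''
for record in html_to_records(sample):
    print(record)  # {'name': 'ada', 'age': '36'}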
Example #22
def get_forums():
    logging.info('Fetching forums')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[progressbar.SimpleProgress('/'), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()])

    d = PyQuery(url=config.rooturl + '/a-f1/', opener=fa_opener)
    
    save.forums = []
    levels = {}
    n = 1

    for i in progress([i for i in d.find("select option") if i.get("value", "-1") != "-1"]):
        id = i.get("value", "-1")
        logging.debug('Fetching: forum %s', id)
        title = re.search('(((\||\xa0)(\xa0\xa0\xa0))*)\|--([^<]+)', i.text).group(5)
        level = len(re.findall('(\||\xa0)\xa0\xa0\xa0', i.text))
        
        if level <= 0:
            parent = 0
        else:
            parent = levels[level-1]
        
        levels[level] = n
        
        d = PyQuery(url=config.rooturl+'/admin/index.forum?part=general&sub=general&mode=edit&fid=' + id + '&extended_admin=1&' + tid, opener=fa_opener)
        try:
            description = d("textarea").text()
        except:
            description = ""
        
        save.forums.append({'id': int(id[1:]), 'newid': n, 'type': id[0], 'parent': parent, 'title': title, 'description': description, 'parsed': False})
        n += 1
Example #23
class Shows:
    def __init__(self, crawler):
        self.crawler = crawler
        self.categories = PyQuery(self.crawler.url)
        self.categories_iter = self.categories.find("li.svtoa-anchor-list-item a")
        self.i = 0

    def __iter__(self):
        return self

    def next(self):
        if self.i == self.categories_iter.length:
            raise StopIteration

        link = self.categories_iter[self.i]

        py_link = PyQuery(link)
        href = py_link.attr('href')
        html_class = href.split('/')[-1:][0]
        title = py_link.text()
        # thumbnail_url = self.crawler.baseurl + PyQuery(link).find('img').attr('src')
        url = href

        show = Show(title, url, html_class)
        show.clips = Episodes(self.crawler, url)

        self.i += 1
        return show
Example #24
 def test_robots_are_inactive(self):
     page_extension = MetaTagPageExtension(extended_object=self.page, robots_indexing=False, robots_following=False)
     page_extension.save()
     self.page.publish('en')
     response = self.client.get('/')
     content = PyQuery(response.content)
     self.assertEqual(content.find('meta[name="robots"]').attr('content'), 'noindex, nofollow')
Example #25
def get_saml_response(response):
    tree = PyQuery(response.content)
    inputtag = tree.find('input[name="SAMLResponse"]')
    assert len(inputtag) == 1
    encoded_response = inputtag[0].get('value')
    samlresponse = base64.b64decode(encoded_response)
    return samlresponse
Example #26
def get_img_urls(content):
    if not content:
        return []
    url_list = []
    doc = PyQuery(content)
    nodeList = doc('li.tab-trigger > div.vertical-img > a.box-img > img')
    for node in nodeList:
        url = PyQuery(node).attr('src')
        if not url:
            continue
        if url.find('60x60') > 0:
            url=url.replace('60x60','400x400')
            url_list.append(url)
    needDescImg = True
    if needDescImg:
        link_url = doc('div#desc-lazyload-container').attr('data-tfs-url')
        if not link_url:
           return url_list
        desc_content = fetchPageWithUrl(link_url)
        # lazy (non-greedy) matching mode
        imgNodes = re.findall('<img[^<>]*>.*?', desc_content)
        #desc_content = re.sub('var[\s]*offer_details[\s]*=[\s]*', '', desc_content)
        for node in imgNodes:
            nodeQ = PyQuery(node)
            desc_url = nodeQ('img').attr('src')
            if desc_url:
                desc_url = desc_url.replace('\\"', '')
            if not desc_url:
                continue
            if 'gif' in desc_url:  # skip gif images
                continue
            #if '//gd' in desc_url or '/2015/' in desc_url:
            url_list.append(desc_url)
    return url_list
Example #27
    def _export_(self):
        self.logger.debug("Récupération des messages du sujet %d (page %d)", self.topic.topic_id, self.page)

        response = self.session.get("/t{}p{}-a".format(self.topic.topic_id, self.page))
        document = PyQuery(response.text)

        pattern = re.compile(r"/u(\d+)")

        for element in document.find("tr.post"):
            e = PyQuery(element)

            post_id = int(e("td span.name a").attr("name"))

            self.logger.info("Récupération du message %d (sujet %d)", post_id, self.topic.topic_id)

            match = pattern.fullmatch(clean_url(e("td span.name strong a").eq(0).attr("href") or ""))
            if match:
                poster = self.users[int(match.group(1))]
            else:
                poster = AnonymousUser()

            post = e("td div.postbody div").eq(0).html()
            if not post:
                self.logger.warning("Le message  %d (sujet %d) semble être vide", post_id, self.topic.topic_id)
                post = ""

            # Get title
            title = e("table td span.postdetails").contents()[1]
            # Remove "Sujet :" before the title and spaces at the end
            title = title[7:].rstrip()

            # Get the date and time of the post
            timestamp = parse_date(e("table td span.postdetails").contents()[3])

            self.add_child(Post(post_id, post, title, timestamp, poster))
Example #28
 def parseProductsAndCategoriesByCategory(self, category_page_content, category_info):
     doc = PyQuery(category_page_content)
     productList, categoryList = [], []
     if category_info.parent_categories and len(category_info.parent_categories) == 2:
         productList = self.parseProductsByCategory(category_page_content, category_info)
         return productList, categoryList
      if category_info.name == 'New Arrivals':  # needs special-case handling
         for level2Node in doc.find('div#js_catelist_sec > div.item'):
             level2NodeQ = PyQuery(level2Node)
             level2CateName = level2NodeQ.children('p > a').text()
             for level3Node in level2NodeQ.children('ul > li > a'):
                 categoryInfo = self.createCategory(PyQuery(level3Node))
                 categoryInfo.parent_categories = [category_info.name, level2CateName]
                 categoryList.append(categoryInfo.formalize())
          return productList, categoryList
     if category_info.name == 'Clearance':
         level2NodeList = doc('div.catelist > ul.cataUl_list > li > a')
         for level2Node in level2NodeList:
             categoryInfo = self.createCategory(PyQuery(level2Node))
             categoryInfo.parent_categories = ['Clearance']
             categoryList.append(categoryInfo.formalize())
         return productList, categoryList
     if doc.find('div#js_catelist_sec > div.cur > ul > li'):
         nodeList = doc.find('div#js_catelist_sec > div.cur > ul > li > a')
         for node in nodeList:
             nodeQ = PyQuery(node)
             categoryInfo = self.newCategory()
             categoryInfo.name = nodeQ.text()
             categoryInfo.url = nodeQ.attr('href')
             categoryInfo.set_categories(category_info)
             categoryList.append(categoryInfo.formalize())
     elif doc.find('div.catelist > ul > li.cur > div.menuList > p'):
         nodeList = doc.find('div.catelist > ul > li.cur > div.menuList > p > a')
         for node in nodeList:
             nodeQ = PyQuery(node)
             categoryInfo = self.newCategory()
             categoryInfo.name = nodeQ.text()
             categoryInfo.url = nodeQ.attr('href')
              if category_info.parent_categories:
                 result = category_info.parent_categories + [category_info.name]
             else:
                 result = [category_info.name]
             categoryInfo.parent_categories = result
             categoryList.append(categoryInfo.formalize())
     else:
         productList = self.parseProductsByCategory(category_page_content, category_info)
     return productList, categoryList
Example #29
def create_meme(title, lines):
    url = "%s/%s" % (GENURL, title)
    pq = PyQuery(url=url)
    form = pq.find('div.instance_form_create_small form')
    if len(form) == 0:
        return "Error: something changed or something weird happened."
    else:
        url = "%s%s" % (GENURL, form[0].attrib['action'])
        data = {
            'languageCode': 'en',
            'generatorID': form.find('#generatorID').val(),
            'imageID': form.find('#imageID').val(),
            'text0': lines[0],
            'text1': len(lines) > 1 and lines[1] or '',
        }
        postq = PyQuery(url=url, data=data, method='post')
        return GENURL + postq.find('div.instance_large img')[0].attrib['src']
Example #30
def test_hreflang_basic(base_url):
    """Ensure that we're specifying the correct value for lang and hreflang."""
    url = base_url + '/en-US/docs/Web/HTTP'
    resp = requests.get(url)
    assert resp.status_code == 200
    html = PyQuery(resp.text)
    assert html.attr('lang') == 'en'
    assert html.find('head > link[hreflang="en"][href="{}"]'.format(url))
Example #31
    def getFullInfo(self, interval=0.5):
        self.getBasicInfo()

        # collCount
        if self.INFO['shopType'] in ['7', '1', '3', '4']:
            regxrs = re.findall(
                r'J_SCollCount\"\s+data\-info\=\"param\=(.+?)\&', self.content)
            if regxrs:
                params = {
                    'keys': regxrs[0],
                    't': '%.0f' % (time.time() * 1000),
                    'callback': 'TShop.setShopStat'
                }
                domain = 'http://count.tbcdn.cn/counter3'
                tUrl = domain + '?' + '&'.join(
                    [k + '=' + v for (k, v) in params.items()])
                r = request(tUrl)
                self.INFO['collCount'] = str(
                    re.findall(r'\"\S+\"\:(\d+)', r.text, re.S)[0])
            else:
                if PyQuery:
                    pyjq_obj = PyQuery(self.content.decode(self.res.encoding))
                    data_info = (pyjq_obj.find('dl.collect-num dt')
                                 or pyjq_obj.find('.collect-num span')
                                 ).attr('data-info')
                    if data_info:
                        count_url = re.sub(
                            r'param=(?P<param>.+?)\&countUrl=(?P<count>.+?)\&.+',
                            r'\g<count>?callback=jsonp357&t=%d&keys=\g<param>'
                            % (time.time() * 1000), data_info, flags=re.S)
                        collCount = re.sub(r'.+\:\s*(?P<coll>\d+).+',
                                           r'\g<coll>',
                                           request(count_url).content)
                        if collCount:
                            self.INFO['collCount'] = int(collCount)

        else:
            coll_url = 'http://favorite.taobao.com/collect_item_relation.htm?itemtype=0&itemNumid=%s' % self.INFO[
                'shopId']
            try:
                res = request(coll_url)
                pyjq_obj = PyQuery(res.content.decode(res.encoding))
                self.INFO['collCount'] = pyjq_obj.find(
                    "div.add-fav-msg strong").html().strip()
            except:
                pass
        time.sleep(interval)

        # itemAmount
        if self.INFO['shopType'] == '2':
            tUrl = self.INFO['shopLink'] + '?search=y&orderType=_hotsell'
            r = request(tUrl)
            try:
                self.INFO['itemAmount'] = \
                    str(re.findall(
                        r'\<div\s+class\=\"search\-result\"\s*\>.+?(\d+)', r.text, re.S)[0])
            except:
                pass
        time.sleep(interval)
        self.getRateInfo()
        return self.INFO
Example #32
import pymysql
import requests
from pyquery import PyQuery as pq

db = pymysql.connect('localhost', 'root', '123456', 'zxshop')
cursor = db.cursor()

url = 'http://www.tcmap.com.cn/list/daima_list.html'


def downpage(url) -> 'fetch the page data':
    r = requests.get(
        url,
        headers={
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
        })

    r.encoding = 'gbk'

    return r.text


html_doc = downpage(url)

doc = pq(html_doc)

html = doc.find("#list360")

data = html.text()

# .text() collapses the listing into whitespace-separated "code+name" tokens
for line in data.split():
    print(line[:6], '=', line[6:])
Example #33
from pyquery import PyQuery

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
            sdf
            <a>a</a>
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
query = PyQuery(html)
print(query.find('a'))

print('_______')
print(query.children('a'))


Example #34
 def get_company_name(text):
     if text == '' or text is None:
         return None
     jq = PyQuery(text, parser='html')
     company = jq.find('.info-dl').eq(1).find('dd').text()
     return company.strip()
Example #35
def _query_selector(pq, args):
    selector = args.get('selector')
    if not selector:
        return pq
    return pq.find(selector)
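Usage sketch (the args dict mirrors a query-string mapping; values here are illustrative):

from pyquery import PyQuery
doc = PyQuery('<ul><li class="active">a</li><li>b</li></ul>')
filtered = _query_selector(doc, {'selector': 'li.active'})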
Example #36
 def _slice(res: str, index: int = 1) -> GoogleResponse:
     utf8_parser = HTMLParser(encoding="utf-8")
     d = PyQuery(fromstring(res, parser=utf8_parser))
     data = d.find(".g")
     pages = list(d.find("td").items())[1:-1]
     return GoogleResponse(data, pages, index)
Example #37
def get_posts():
    global month
    logging.info('Fetching posts')
    if config.debug:
        progress = progressbar.NoProgressBar()
    else:
        progress = progressbar.ProgressBar(widgets=[progressbar.SimpleProgress('/'), ' ', progressbar.Bar("#","[","]"), progressbar.Percentage()], maxval=save.nbposts)
    progress.start()

    n = len(save.posts)

    ids = [i["id"] for i in save.posts]
    
    for topic in [i for i in save.topics if i["parsed"] == False]:
        logging.debug('Fetching: posts from topic %d', topic["id"])
        subposts = []
        subids = []
        d = PyQuery(url=config.rooturl + '/t' + str(topic['id']) + '-a', opener=fa_opener)
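        # parse the do_pagination_start() script to recover the page count and posts per page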
        result = re.search('function do_pagination_start\(\)[^\}]*start = \(start > \d+\) \? (\d+) : start;[^\}]*start = \(start - 1\) \* (\d+);[^\}]*\}', d.text())

        try:
            pages = int(result.group(1))
            topicsperpages = int(result.group(2))
        except:
            pages = 1
            topicsperpages = 0
        
        for page in range(0,pages):
            if page >= 1:
                d = PyQuery(url=config.rooturl + '/a-t' + str(topic['id']) + '-' + str(page*topicsperpages) + '.htm', opener=fa_opener)
            
            for i in d.find('tr.post'):
                e = PyQuery(i)
                
                id = int(e("td span.name a").attr("name"))
                if id not in ids and id not in subids:
                    logging.debug('Fetching: post %d (topic %d)', id, topic["id"])
                    author = e("td span.name").text()
                    post = htmltobbcode.htmltobbcode(e("td div.postbody div").eq(0).html(), save.smileys)
                    result = e("table td span.postdetails").text().split(" ")
                    if result[-3] == "Aujourd'hui":
                        title = " ".join(e("table td span.postdetails").text().split(" ")[1:-3])
                        date = e("table td span.postdetails").text().split(" ")[-3:]
                        timestamp = time.mktime(datetime.datetime.combine(datetime.date.today(), datetime.time(int(date[2].split(":")[0]),int(date[2].split(":")[1]))).timetuple())
                    elif result[-3] == "Hier":
                        title = " ".join(e("table td span.postdetails").text().split(" ")[1:-3])
                        date = e("table td span.postdetails").text().split(" ")[-3:]
                        timestamp = time.mktime(datetime.datetime.combine(datetime.date.today()-datetime.timedelta(1), datetime.time(int(date[2].split(":")[0]),int(date[2].split(":")[1]))).timetuple())
                    else:
                        title = " ".join(e("table td span.postdetails").text().split(" ")[1:-6])
                        date = e("table td span.postdetails").text().split(" ")[-6:]
                        timestamp = time.mktime(datetime.datetime(int(date[3]),month[date[2]],int(date[1]),int(date[5].split(":")[0]),int(date[5].split(":")[1])).timetuple())
                    
                    subposts.append({'id': id, 'post': post, 'title': title, 'topic': topic["id"], 'timestamp': int(timestamp), 'author': author})
                    subids.append(id)
                    n += 1
                    progress.update(n)
                else:
                    logging.warning('Post %d has already been fetched.', id)
        save.posts.extend(subposts)
        ids.extend(subids)
        [i for i in save.topics if i == topic][0]["parsed"] = True
    
    progress.end()
Example #38
    def wrap(self, invoked_from_wrapper=False):
        # Handle called from another wrapper.
        md_section_list = None
        if isinstance(self.section.contents, list):
            md_section_list = self.section.contents

        elif invoked_from_wrapper and \
                isinstance(self.section.contents.contents, str):
            md_section_list = [self.section.contents]

        if not isinstance(md_section_list, list):
            raise ValueError('Markdown section does not have valid contents ' +
                             '(must be a list)')

        for section in md_section_list:
            # === Start wrappers ===
            if section.type == MD_TYPE_DIV:
                temp_section = MarkdownSection('markdown', section.contents,
                                               {}, {})
                invoke(self.cell_object, temp_section)
                continue

            if section.type == MD_TYPE_CODE:
                md_code.invoke(self.cell_object, section)
                self.cell_object.update_paragraph()
                continue

            if section.type == MD_TYPE_QUOTE:
                md_blockquote.invoke(self.cell_object, section)
                self.cell_object.update_paragraph()
                continue

            if section.type == MD_TYPE_UNORDERED_LIST:
                md_ul.invoke(self.cell_object, section)
                self.cell_object.update_paragraph()
                continue

            if section.type == MD_TYPE_ORDERED_LIST:
                md_ol.invoke(self.cell_object, section)
                self.cell_object.update_paragraph()
                continue

            if section.type == MD_TYPE_LIST_ITEM:
                md_li.invoke(self.cell_object, section)
                continue

            if section.type == MD_TYPE_TABLE:
                table_html = section.extra['original_html']
                t = PyQuery(table_html)
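                # the first <tr> carries the headers; the remaining rows carry the data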
                headers = [i.find('th') for i in t.find('tr').items()][0]
                headers = [c.text() for c in headers.items()]

                rows = [
                    i.find('td') for i in t.find('tr').items() if i.find('td')
                ]
                data = []
                for row in rows:
                    r = {
                        headers[i]: c.text()
                        for i, c in enumerate(row.items())
                    }
                    data.append(r)
                s = Section("table", data, {"tableColumns": headers}, {})
                table.invoke(self.cell_object, s)
                continue

            # Fix wrapped:
            #   (Sometimes there are elements which contain other elements,
            #    but are not considered one of the declared wrappers)
            # They are in MD_ETC_WRAPPERS.
            if isinstance(section.contents,
                          list) and section.type in MD_ETC_WRAPPERS:
                is_inside_wrapper = False

                if 'inline' in section.extra:
                    is_inside_wrapper = True

                if section.type == 'span':
                    section.propagate_extra('check_newline',
                                            True,
                                            only_multiple_children=False)

                # TODO: Fix problem with H1 no newline even if in span.
                temp_section = MarkdownSection('markdown', section.contents,
                                               {}, section.extra,
                                               section.attrs)
                invoke(self.cell_object,
                       temp_section,
                       invoked_from_wrapper=is_inside_wrapper)
                continue

            # === Elements ===
            if section.type in SHOULD_NEW_LINE and section.get_extra(
                    'check_newline'):
                self.cell_object.add_paragraph()

            if section.type == MD_TYPE_HORIZONTAL_LINE:
                md_hr.invoke(self.cell_object, section)
                continue

            # Add a block (newline) if not called from a wrapper
            #  (Should come after hr)
            if not invoked_from_wrapper:
                self.cell_object.add_paragraph()

            if section.type in MD_TYPES_HEADERS:
                # We want to keep the h{1...6} for styling
                insert_header(self.cell_object,
                              section.contents,
                              header=section.type,
                              style=section.get_style())

                continue

            if section.type in [MD_TYPE_TEXT, MD_TYPE_INLINE_TEXT]:
                if invoked_from_wrapper:
                    self.cell_object.add_run()

                if not section.contents:
                    continue

                if '{date}' in section.contents:
                    try:
                        formatted_date = get_formatted_date('', section.layout)
                    except ParserError as e:
                        formatted_date = 'n/a'
                    section.contents = section.contents.replace(
                        '{date}', formatted_date)

                insert_text(self.cell_object, section)
                continue

            if section.type == MD_TYPE_LINK:
                md_link.invoke(self.cell_object, section)
                continue

            if section.type == MD_TYPE_IMAGE:
                md_image.invoke(self.cell_object, section)
                continue

            if DEBUG:
                raise ValueError(
                    f'Section type is not defined: {section.type}')
Example #39
 def __get_company_name(text):
     jq = PyQuery(text, parser='html')
     return jq.find('#zhizhao').find('.xinxi').find('tr').eq(0).find(
         'td').eq(1).find('span').text().strip()
Example #40
from HTMLParser import HTMLParser
import urllib


class MyHTMLParser(HTMLParser):
    # (reconstructed opening: the original listing starts mid-method)
    def __init__(self):
        HTMLParser.__init__(self)
        self.key = {'time': None, 'event-location': None, 'event-title': None}

    def handle_starttag(self, tag, attrs):
        if tag == 'time':
            self.key['time'] = True
        elif tag == 'span' and attrs.__contains__(('class', 'event-location')):
            self.key['event-location'] = True
        elif tag == 'h3' and attrs.__contains__(('class', 'event-title')):
            self.key['event-title'] = True

    def handle_data(self, data):
        if self.key['time']:
            print 'Time:%s\t|' % data,
            self.key['time'] = None
        elif self.key['event-title']:
            print 'Title:%s\t|' % data,
            self.key['event-title'] = None
        elif self.key['event-location']:
            print 'Location:%s\t|' % data
            self.key['event-location'] = None


parser = MyHTMLParser()
html = urllib.urlopen('http://www.python.org/events/python-events/').read()
parser.feed(html)

from pyquery import PyQuery
doc = PyQuery(url='https://www.python.org/events/python-events/')
for event in doc('.list-recent-events li'):
    event = PyQuery(event)
    loc = event.find('.event-location').text()
    time = event.find('time').text()
    name = event.find('.event-title').text()
    print 'event:%s' % name
    print '\ttime:%s' % time
    print '\tlocation:%s' % loc
Example #41
    def next(self):
        if self.i == self.episodes_iter.length:
            raise StopIteration

        # Index all episodes
        link = self.episodes_iter[self.i]

        # Parse the current episode from the long list of episodes
        article = PyQuery(link)
        episode = article.find('a.playLink')
        full_url = self.crawler.baseurl + article.find('a.playLink').attr('href')
        broadcasted = article.find('time').attr('datetime')
        episode_date = parse(broadcasted).replace(tzinfo=None)
        published = article.attr('data-published')

        if self.crawler.skip_urls:
            if full_url in self.crawler.skip_urls:
                self.i += 1
                return self.next()

        if published.find('idag') != -1:
            published = '%s' % datetime.today()

        if published.find(u'ikväll') != -1:
            self.i += 1
            return self.next()

        if published.find(u'igår') != -1:
            published = '%s' % (datetime.today() - timedelta(days=1))

        try:
            published_date = parse(published, parserinfo=sverje()).replace(tzinfo=None)
        except ValueError as err:
            print err
            print published

        if self.crawler.min is not None:
            if published_date < self.crawler.min:
                self.i += 1
                return self.next()

        if self.crawler.max is not None:
            if published_date > self.crawler.max:
                self.i += 1
                return self.next()

        if len(broadcasted) < 1:
            broadcasted = '1970-01-01 00:00:00'

        #Check if the url contains an extra /Random-Title, if so, remove it
        if len(full_url.split('/')) == 6:
            url = full_url.rpartition('/')[0]
        else:
            url = full_url

        if (url.find('video') != -1 or url.find('klipp') != -1) and len(broadcasted) > 1:

            available = parse_date(article.attr('data-available'), '+')
            length = article.attr('data-length')

            if not episode.attr('href').startswith('http'):
                try:
                    # Get the episode from url
                    article_full = PyQuery(url)
                    thumbnail = article_full.find('img.svtHide-No-Js').eq(0).attr('data-imagename')
                    meta = article_full.find('.playBoxConnectedToVideoMain div')

                    episode = Episode()

                    desc = article_full.find('.playBoxConnectedToVideoMain p').text()
                    if desc is not None:
                        if len(desc) == 0:
                            desc = article_full.find('.playBoxConnectedToVideoMain span')
                    desc = sanitize_description(unicode(desc))

                    if str(meta).find('Kan endast ses i Sverige') == -1:
                        rights = 1
                    else:
                        rights = 2

                    if str(meta).find('Kan ses i mobilen') > -1:
                        on_device = 1
                    else:
                        on_device = 2

                    try:
                        episodeTitle = article_full.find('title').eq(0).text().replace('| SVT Play', '')
                        episode.url = url
                        episode.title = episodeTitle
                        episode.published = published
                        episode.published_date = published_date
                        episode.title_slug = shellquote(episodeTitle)
                        episode.http_status = 200
                        episode.http_status_checked_date = datetime.utcnow().replace(tzinfo=utc)
                        episode.date_available_until = available
                        episode.date_broadcasted = broadcasted
                        episode.length = length
                        episode.description = desc
                        episode.viewable_on_device = on_device
                        episode.viewable_in = rights
                        episode.kind_of = self.kind_of
                        episode.thumbnail_url = thumbnail
                    except AttributeError:
                        self.i += 1
                        return self.next()

                    self.i += 1
                    return episode

                except HTTPError as err:
                    self.i += 1
                    return self.next()
Example #42
def get_brand(code):
    # use the requested code instead of the hard-coded 7203
    q = PyQuery("https://kabutan.jp/stock/?code={}".format(code))
    sector = q.find('#stockinfo_i2 > div > a')[0].text
    print(sector)
    print(code)
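A call sketch (7203 is Toyota's code on kabutan.jp):

get_brand('7203')  # prints the sector link text, then the code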
Example #43
import requests
from pyquery import PyQuery
import pickle
from helper.move import *

moves = []

for x in range(1, 8):
    data = requests.get("https://pokemondb.net/move/generation/" + str(x))
    src = PyQuery(data.text)

    trs = src.find('.ent-name')
    length = len(moves)
    i = length
    for tr in trs:
        moves.append([])
        moves[i].append(tr.text)
        i += 1

    trs = src.find('.type-icon')
    i = length
    for tr in trs:
        moves[i].append(tr.text)
        i += 1

    trs = src.find('td:nth-child(3)')
    i = length
    for tr in trs:
        if tr.attrib["data-sort-value"] == "special":
            moves[i].append(1)
        elif tr.attrib["data-sort-value"] == "physical":
Example #44
 def getBasicInfo(self):
     self.INFO['shopId'] = re.findall(REGX['shopId'], self.content)
     self.INFO['userId'] = re.findall(REGX['userId'], self.content)
     self.INFO['shopName'] = re.findall(REGX['shopname'], self.content)
     self.INFO['shopLink'] = re.findall(REGX['shopLink'], self.content)
     self.INFO['wangwangNick'] = re.findall(REGX['wangwang'], self.content)
     self.INFO['shopRank'] = re.findall(REGX['shopRank'], self.content)
     self.INFO['shopGrade'] = re.findall(REGX['shopGrade'], self.content,
                                         re.S)
     self.INFO['shopRate'] = re.findall(REGX['shopRate'], self.content)
     self.INFO['shopKeeper'] = re.findall(REGX['shopKeeper'], self.content,
                                          re.S)
     self.INFO['company'] = re.findall(
         REGX['company'], self.content.decode(self.res.encoding, 'ignore'),
         re.S)
     self.INFO['location'] = re.findall(REGX['location'], self.content,
                                        re.S)
     self.INFO['goodsRate'] = re.findall(
         REGX['goodsRate'], self.content.decode(self.res.encoding,
                                                'ignore'), re.S)
     self.INFO['itemAmount'] = re.findall(
         REGX['itemAmount'], self.content.decode(self.res.encoding,
                                                 'ignore'), re.S)
     self.INFO['setupTime'] = re.findall(
         REGX['setupTime'], self.content.decode(self.res.encoding,
                                                'ignore'), re.S)
     self.INFO['shopType'] = re.findall(
         REGX['shopType'], self.content) or re.findall(
             r'"*siteId"*:\s*[\'\"](\d+)[\'\""]', self.content)
     if self.INFO['shopType'][0] in ['4']:
         self.INFO['shopId'] = re.findall(REGX_4[r'shopId'], self.content)
         self.INFO['userId'] = re.findall(REGX_4['userId'], self.content)
     for (k, v) in self.INFO.items():
         if v:
             if isinstance(v, list):
                 if k == 'shopRate':
                     self.INFO[k] = ','.join(self.INFO[k])
                 elif k == 'shopLink':
                     if len(
                             re.findall(r'http\:\/\/store\.taobao\.com',
                                        self.INFO[k][0])) == 1:
                         try:
                             self.INFO[k] = \
                                 (re.findall(
                                     r'\<a\s+class\=\"hCard\sfn\"\s+href\=\"(.+?)\"', self.content)[0])[:-1]
                         except:
                             self.INFO[k] = self.INFO[k][0]
                     else:
                         self.INFO[k] = self.INFO[k][0]
                 else:
                     self.INFO[k] = _trim_html(v[0])
                     if k in [
                             'company', 'goodsRate', 'itemAmount',
                             'setupTime'
                     ]:
                         self.INFO[k] = self.INFO[k].encode(
                             'utf-8', 'ignore')
                     else:
                         self.INFO[k] = \
                             self.INFO[k].\
                             decode(self.res.encoding, 'ignore').\
                             encode('utf-8', 'ignore')
             else:
                 self.INFO[k] = None
         else:
             self.INFO[k] = None
     if not self.INFO['userId']:
         for regx in [r'userId\s*\=\s*(\d+)']:
             regxrs = re.findall(regx, self.content, re.S)
             if len(regxrs) > 0:
                 self.INFO['userId'] = regxrs[0]
                 break
     if not self.INFO['shopLink']:
         for regx in \
             [r'\<h3\s+class\=\"shop\-title\"\s*\>\s*\<a.+?href\=\"(.+?)\"',
              r'\<a\sclass\=\"hCard\sfn\s*\"\shref\s*\=\s*\"(.+?)\"',
              r'\<a\s+class\=\"shop-name\s*\"\s+href\s*\=\s*\"(.+?)\"']:
             regxrs = re.findall(regx, self.content, re.S)
             if len(regxrs) > 0:
                 self.INFO['shopLink'] = regxrs[0]
                 break
     # print re.findall(r'<title>(.+?)</title>',
     # self.content.decode(self.res.encoding), re.S)[0].encode('utf-8')
     if not self.INFO['shopName']:
         for regx in [
                 r'<div\s+class\=\"name\"\s*\>\s*\<span\>\s*(.+?)\<',
                 r'\<h3\s+class\=\"shop\-title\"\s*\>\s*\<a.+?.+?\>\s*(.+?)\<',
                 r'\<a\s+class\=\"shop-name\s*\"\s*href\=.+?\>(.+?)\<\/a\>',
                 r'<title>(.+?)</title>'
         ]:
             shopName = re.findall(regx,
                                   self.content.decode(self.res.encoding),
                                   re.S)
             if len(shopName) > 0:
                 self.INFO['shopName'] = shopName[0]
                 break
         self.INFO['shopName'] = re.sub(r'(\<span.*?\>).+?(\<\/span\>)',
                                        r'', self.INFO['shopName'] or '',
                                         flags=re.S)
         self.INFO['shopName'] = self.INFO['shopName'].encode(
             'utf-8').replace('首页-', '').replace(
                 '-淘宝网', '') if self.INFO['shopName'] else None
     if not self.INFO['wangwangNick']:
         for regx in [r'data-nick\s*=\s*"(.+?)"']:
             wangwangNick = re.findall(regx, self.content, re.S)
             if len(wangwangNick) > 0:
                 self.INFO['wangwangNick'] = unquote(wangwangNick[0])
                 break
     if not self.INFO['wangwangNick']:
         self.INFO['wangwangNick'] = self.INFO['shopName']
     if not self.INFO['shopKeeper']:
         self.INFO['shopKeeper'] = self.INFO['wangwangNick']
     if not self.INFO['shopName']:
         self.INFO['shopName'] = self.INFO['wangwangNick']
      if not self.INFO['shopRank']:
          self.INFO['shopRank'] = re.findall(r'/newrank/(.+?)\.gif',
                                             self.content)
          self.INFO['shopRank'] = (self.INFO['shopRank'][0]
                                   if self.INFO['shopRank'] else None)
     pyjq_obj = PyQuery(self.content.decode(self.res.encoding))
      if not self.INFO['itemAmount'] and self.INFO['shopType'] != '2':
          shop_intro = pyjq_obj.find("div.shop-intro")
          if shop_intro:
              # the item count sits in the third <dl> of the shop intro block
              self.INFO['itemAmount'] = \
                  int(list(shop_intro.items("dl"))[2].find("dd span").html())
      if self.INFO['shopType'] == '2':  # shopType '2': Tmall pages
          if not self.INFO['shopLink']:
              shopLink = re.search(r'href="(http://\S+?\.tmall\.com)"',
                                   self.content, re.S)
              if shopLink:
                  self.INFO['shopLink'] = shopLink.group(1)
          if not self.INFO['company']:
              company = pyjq_obj.find('div.extend ul li')
              if len(company) >= 3:
                  company = company[2].find('div').text.encode(
                      'utf-8').strip('\n\r\t ')
                  self.INFO['company'] = (company
                                          if len(company) < 200 else None)
      if not self.INFO['location']:
          # slice the visible text between the '所在地区' (location) label
          # and the following '宝贝' (items) label
          text = pyjq_obj.text()
          i = text.find('所在地区'.decode('utf-8'))
          j = text.find('宝贝'.decode('utf-8'), i)
          self.INFO['location'] = text[i + 5:j].encode('utf-8')
     self.INFO['location'] = self.INFO['location'].strip('-').strip() \
         if self.INFO['location'] and len(self.INFO['location']) < 20 \
         else None
      if not self.INFO['location']:
          loca_html = pyjq_obj.find('.locus')
          if loca_html:
              # lstrip drops the characters of the leading '所 在 地:' label
              # (covers both the ASCII and full-width colon variants)
              loca = loca_html.text().lstrip(
                  '所 在 地:'.decode('utf-8')).lstrip('所 在 地:'.decode('utf-8'))
              self.INFO['location'] = loca.encode('utf-8')
     if self.INFO['location']:
         self.INFO['location'] = self.INFO['location'].strip('-')
     self.INFO['location2'] = _parse_loca(self.INFO['location'])
     return self.INFO
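The parser above layers regex fallbacks over PyQuery selectors and converts between the page encoding and UTF-8 by hand, which is Python 2 territory. A minimal Python 3 sketch of the same selector-first, regex-fallback pattern over an inline fixture (the markup and field names are illustrative, not Taobao's real page):

import re

from pyquery import PyQuery

# illustrative fixture; a real shop page would be fetched and decoded first
HTML = '''
<h3 class="shop-title"><a href="http://example.tmall.com/">Demo Shop</a></h3>
<script>var userId = 12345;</script>
'''

def parse_shop(html):
    doc = PyQuery(html)
    info = {}
    # selector first ...
    info['shopName'] = doc('h3.shop-title a').text() or None
    info['shopLink'] = doc('h3.shop-title a').attr('href')
    # ... regex fallback for values that only exist in inline scripts
    match = re.search(r'userId\s*=\s*(\d+)', html)
    info['userId'] = match.group(1) if match else None
    return info

print(parse_shop(HTML))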
Exemple #45
0
 def __get_data(self):
     resp = self.session.get(reportURL)
     doc = PyQuery(resp.text)
     html = doc.html()
      # report a plausible temperature in the 36.5-36.8 range
      tiwen = round(36.5 + random.uniform(0, 0.3), 1)
      # the server-side widget state lives in inline "fN_state" JSON blobs
      zxMatch = re.findall(r'f8_state={.*?"SelectedValue":"(.+?)"', html)[0]
      gnMatch = re.findall(r'f14_state={.*?"SelectedValue":"(.+?)"', html)[0]
      shengMatch = re.findall(
          r'f16_state={.+?"SelectedValueArray":\["(.+?)"]', html)[0]
      shiMatch = re.findall(
          r'f17_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]',
          html)[0]
      xianMatch = re.findall(
          r'f18_state={.*?"F_Items":(.+?),"SelectedValueArray":\["(.+?)"]',
          html)[0]
      xxMatch = re.findall(r'f20_state={.*?"Text":"(.+?)"', html)[0]
      F_State = template % (
          self.date, zxMatch, gnMatch, shengMatch, shiMatch[0], shiMatch[1],
          xianMatch[0], xianMatch[1], xxMatch, "否")
     return {
         'F_State': base64.b64encode(F_State.encode()),
         '__VIEWSTATE': doc.find('#__VIEWSTATE').attr('value'),
         '__EVENTTARGET': 'p1$ctl00$btnSubmit',
         '__EVENTARGUMENT': '',
         '__VIEWSTATEGENERATOR': doc.find('#__VIEWSTATEGENERATOR').attr('value'),
         'p1$ChengNuo': 'p1_ChengNuo',
         'p1$BaoSRQ': self.date,
         'p1$DangQSTZK': '良好',
         'p1$TiWen': str(tiwen),
         'F_TARGET': 'p1_ctl00_btnSubmit',
         'p1_Collapsed': 'false',
         'p1$CengFWH_RiQi': '',
         'p1$CengFWH_BeiZhu': '',
         'p1$JieChu_RiQi': '',
         'p1$JieChu_BeiZhu': '',
         'p1$TuJWH_RiQi': '',
         'p1$TuJWH_BeiZhu': '',
         'p1$JiaRen_BeiZhu': '',
         'p1$ZaiXiao': zxMatch,
         "p1$MingTDX": "不到校",
         "p1$MingTJC": "否",
         "p1$BanChe_1$Value": '0',
         "p1$BanChe_1": '不需要乘班车',
         "p1$BanChe_2$Value": '0',
         "p1$BanChe_2": '不需要乘班车',
         'p1$GuoNei': '国内',
         "p1$ddlGuoJia$Value": "-1",
         "p1$ddlGuoJia": "选择国家",
         'p1$ddlSheng$Value': shengMatch,
         'p1$ddlSheng': shengMatch,
         'p1$ddlShi$Value': shiMatch[1],
         'p1$ddlShi': shiMatch[1],
         'p1$ddlXian$Value': xianMatch[1],
         'p1$ddlXian': xianMatch[1],
         'p1$XiangXDZ': xxMatch,
         "p1$FanXRQ": "",
         "p1$WeiFHYY": "",
         "p1$ShangHJZD": "",
         'p1$QueZHZJC$Value': '否',
         'p1$QueZHZJC': '否',
          'p1$DangRGL': '否',  # whether quarantined
          'p1$DaoXQLYGJ': '',  # countries traveled to
          'p1$DaoXQLYCS': '',  # cities traveled to
          'p1$Address2': '中国',
          'p1$SuiSM': '绿色',  # Suishen health-code color
          'p1$LvMa14Days': '是',  # health code green for 14 straight days
          'p1$GeLDZ': '',
          'p1_SuiSMSM_Collapsed': 'false',
          'p1_GeLSM_Collapsed': 'false'
     }
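The helper above round-trips ASP.NET WebForms state: it scrapes __VIEWSTATE and __VIEWSTATEGENERATOR with PyQuery and posts them back. That move generalizes; a self-contained sketch against an inline form (field values and the action URL are made up):

from pyquery import PyQuery

# illustrative WebForms-style page; a real client would GET it first
PAGE = '''
<form id="form1" action="/report.aspx" method="post">
  <input type="hidden" id="__VIEWSTATE" value="dDwtMTIzNDU2Nzg5Ow==" />
  <input type="hidden" id="__VIEWSTATEGENERATOR" value="CA0B0334" />
</form>
'''

def hidden_fields(html):
    doc = PyQuery(html)
    # WebForms expects these opaque values echoed back on every POST
    return {
        '__VIEWSTATE': doc.find('#__VIEWSTATE').attr('value'),
        '__VIEWSTATEGENERATOR': doc.find('#__VIEWSTATEGENERATOR').attr('value'),
    }

print(hidden_fields(PAGE))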
Exemple #46
0
    def test_ongoing_events_in_event_list(self, managers_timezone_mock,
                                          tag_timezone_mock):
        managers_timezone_mock.now.return_value = tz_datetime(
            2014, 4, 7, 9, 30)
        tag_timezone_mock.now.return_value = tz_datetime(2014, 4, 7, 9, 30)

        root_page = self.create_root_page(
            publication_date=tz_datetime(2014, 4, 1))
        root_page.publish('en')
        page = api.create_page(title='Events en',
                               template=self.template,
                               language='en',
                               published=True,
                               parent=root_page,
                               apphook='EventListAppHook',
                               apphook_namespace=self.app_config.namespace,
                               publication_date=tz_datetime(2014, 4, 1))
        page.publish('en')

        # happens on Apr 5
        ev1 = self.create_event(title='ev1',
                                start_date=tz_datetime(2014, 4, 5),
                                publish_at=tz_datetime(2014, 4, 1))
        # Apr 6 12:00 to Apr 7 9:00
        ev2 = self.create_event(title='ev2',
                                start_date=tz_datetime(2014, 4, 6),
                                end_date=tz_datetime(2014, 4, 7),
                                start_time='12:00',
                                end_time='09:00',
                                publish_at=tz_datetime(2014, 4, 2))
        # happens on Apr 7
        ev3 = self.create_event(title='ev3',
                                start_date=tz_datetime(2014, 4, 7),
                                publish_at=tz_datetime(2014, 4, 3))
        # happens on Apr 8
        ev4 = self.create_event(title='ev4',
                                start_date=tz_datetime(2014, 4, 8),
                                publish_at=tz_datetime(2014, 4, 4))

        # setUp app config
        original_app_data = self.app_config.app_data.copy()
        self.app_config.app_data = {'config': {'show_ongoing_first': True}}
        self.app_config.save()

        with force_language('en'):
            response = self.client.get(page.get_absolute_url('en'))
            context = response.context_data

        # tearDown app config
        self.app_config.app_data = original_app_data
        self.app_config.save()

        actual_ongoing = [event.pk for event in context['ongoing_objects']]
        expected_ongoing = [event.pk for event in [ev2, ev3]]
        self.assertEqual(actual_ongoing, expected_ongoing)

        actual_object_list = [event.pk for event in context['object_list']]
        expected_object_list = [event.pk for event in [ev4, ev1]]
        self.assertEqual(actual_object_list, expected_object_list)

        upcoming_list = PyQuery(response.content)('.events-upcoming')
        links = upcoming_list.find('h2 a')
        self.assertEqual(2, links.length)
        self.assertEqual(ev4.get_absolute_url(), links[0].attrib['href'])
        self.assertEqual(ev1.get_absolute_url(), links[1].attrib['href'])
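Driving assertions through PyQuery, as this test does with response.content, keeps them immune to whitespace shifts in the templates. The same idea reduced to a standalone check (HTML inline, names illustrative):

from pyquery import PyQuery

rendered = '<div class="events-upcoming"><h2><a href="/e/1">ev1</a></h2></div>'
links = PyQuery(rendered)('.events-upcoming').find('h2 a')
assert links.length == 1
assert links[0].attrib['href'] == '/e/1'  # bare indexing yields the lxml node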
Exemple #47
0
# -*- coding: utf-8 -*-
from pyquery import PyQuery

q = PyQuery('https://kabutan.jp/stock/?code=7203')
sector = q.find('#stockinfo_i2 > div > a')[0].text
print(sector)
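Handing PyQuery a URL string makes it fetch and parse in one step, which is what the one-liner above relies on. Splitting the fetch out makes the parse testable and lets you control headers and timeouts; a sketch of the two-step form (the User-Agent header and timeout are illustrative additions):

import requests
from pyquery import PyQuery

resp = requests.get('https://kabutan.jp/stock/?code=7203',
                    headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
doc = PyQuery(resp.text)
sector = doc('#stockinfo_i2 > div > a').eq(0).text()
print(sector)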
Exemple #48
0
 def isMetaRefresh(self, this):
      # filter callback: True when the node carries http-equiv="refresh"
      httpEquiv = PyQuery(this).attr('http-equiv')
      return bool(httpEquiv and httpEquiv.lower().find('refresh') > -1)
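isMetaRefresh takes the raw DOM node as `this`, which matches PyQuery's filter-callback convention. A standalone sketch of the same check wired into .filter() (fixture markup is made up):

from pyquery import PyQuery

doc = PyQuery('<head>'
              '<meta http-equiv="Refresh" content="5; url=/next">'
              '<meta charset="utf-8">'
              '</head>')

def is_meta_refresh(i, this):
    # `this` is the raw lxml node; wrap it to get the PyQuery API back
    http_equiv = PyQuery(this).attr('http-equiv')
    return bool(http_equiv and 'refresh' in http_equiv.lower())

print(doc('meta').filter(is_meta_refresh).length)  # 1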
Exemple #49
0
    def get_search_list_html(self, keyword, session):
        param_list = []
        try:
            session.headers = {
                "Host": "gsxt.zjaic.gov.cn",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; "
                              "rv:50.0) Gecko/20100101 Firefox/50.0",
                "Accept": "text/html,application/xhtml+xml,"
                          "application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Referer": "http://zj.gsxt.gov.cn/client/entsearch/list?isOpanomaly=&pubType=1&searchKeyWord=0B46FE9E9DBAF27F&currentPage=2",
            }

            # first obtain the encrypted search keyword
            script = "strEnc('{keyword}','a','b','c')".format(keyword=keyword)
            search_key_word = self.get_encry_pripid_detail(
                encry_zj_conf['url'], script)
            if search_key_word is None:
                return param_list, self.SEARCH_ERROR

            search_url = 'http://{host}/client/entsearch/list?isOpanomaly=&pubType=1&searchKeyWord={searchkey}'.format(
                host=self.host, searchkey=search_key_word)

            r = self.task_request(session, session.get, url=search_url)
            if r is None:
                return param_list, self.SEARCH_ERROR

            content = r.text
            if content is None:
                return param_list, self.SEARCH_ERROR

            # this IP has been banned by the site
            if util.judge_feature(content):
                self.report_session_proxy(session)
                return param_list, self.SEARCH_ERROR

            jq = PyQuery(content, parser='html')

            # first check how many results came back
            if jq.find('h3.title').find('span.light').text() == '0':
                return param_list, self.SEARCH_NOTHING_FIND

            item_list = jq.find('div.mod.enterprise-info').find(
                '.enterprise-info-list').find('li').items()
            for item in item_list:
                a_info = item.find('a')
                if a_info is None or len(a_info) <= 0:
                    continue

                href = a_info.attr('href')
                if href is None or href == '':
                    continue

                a_info.find('span[class=tip]').remove()
                a_info.find('i').remove()
                company = a_info.text()
                search_name = company.replace(' ', '')
                if search_name == '':
                    # a blank name means a malformed row; skip it, since the
                    # caller expects a (param_list, status) tuple
                    continue

                param = {
                    'Referer': search_url,
                    'href': href,
                    'search_name': search_name,
                }

                seed_code = None
                code_text = item.find('.item-text').find('.code').text()
                if code_text is not None and code_text.strip() != '':
                    part = code_text.split(':')
                    if len(part) >= 2:
                        seed_code = part[1]

                if seed_code is not None and seed_code.strip() != '':
                    param['unified_social_credit_code'] = seed_code

                param_list.append(param)
        except Exception as e:
            self.log.exception(e)
            return param_list, self.SEARCH_ERROR

        return param_list, (self.SEARCH_SUCCESS if len(param_list) > 0
                            else self.SEARCH_ERROR)
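The extraction core of this example, iterating matches with .items() and pruning decoration nodes with .remove() before reading .text(), stands on its own. A minimal sketch over made-up markup (class names mirror the snippet, not the real GSXT page):

from pyquery import PyQuery

HTML = '''
<ul class="enterprise-info-list">
  <li><a href="/ent/1">Acme Trading <span class="tip">NEW</span><i>»</i></a></li>
  <li><a href="/ent/2">Bolt Manufacturing <i>»</i></a></li>
</ul>
'''

doc = PyQuery(HTML)
for item in doc('li').items():   # .items() yields PyQuery-wrapped nodes
    a = item.find('a')
    a.find('span.tip').remove()  # drop badge/icon children so that
    a.find('i').remove()         # .text() returns only the company name
    print(a.attr('href'), a.text())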