Example #1
    def test_new_key(self):
        data = {"description": "This is meant for a test app"}
        url = reverse('authkeys.new', locale='en-US')

        # Check out the creation page, look for the form.
        resp = self.client.get(url)
        eq_(200, resp.status_code)
        page = pq(resp.content)
        eq_(1, page.find('form.key').length)

        # We don't have this key yet, right?
        keys = Key.objects.filter(description=data['description'])
        eq_(0, keys.count())

        # Okay, create it.
        resp = self.client.post(url, data, follow=False)
        eq_(200, resp.status_code)

        # We have the key now, right?
        keys = Key.objects.filter(description=data['description'])
        eq_(1, keys.count())

        # Okay, and it should belong to the logged-in user
        key = keys[0]
        eq_(key.user, self.user)

        # Take a look at the description and key shown on the result page.
        page = pq(resp.content)
        ok_(data['description'], page.find('.key .description').text())
        ok_(key.key, page.find('.key .key').text())

        # Ensure the secret on the page checks out.
        secret = page.find('.key .secret').text()
        ok_(key.check_secret(secret))
Example #2
 def getUserAnswers(self, all):
     # Get the qID and aID from the most recent answer file
     latestFile = self.getLatestAnswerFileName()
     latestQID = 0
     latestAID = 0
     if latestFile is None:  # no file matches the expected name format, so fetch everything
         all = True
     else:  # work out the latest questionID and answerID from the file name
         pattern = re.compile(r'^\[\d{4}-\d{2}-\d{2}\].*-q(\d{1,50})-a(\d{1,50}).html$')
         match = pattern.findall(latestFile)
         for pp in match:
             latestQID = pp[0]
             latestAID = pp[1]
     # Always fetch the first page and, while doing so, work out the total number of answer pages
     pageContent = urllib2.urlopen("{}?page={}".
                                       format(self.answerURL, self.startPage)).read()
     d = pq(pageContent)
     pageMax = self.getMaxPageNumber(d)
     currentPage = self.startPage
     ret = False
     while True:
         self.logging("parsing page {} of {}".format(currentPage, pageMax), True)
         # If we are not fetching everything, check whether we have fetched enough already
         # After hitting an old answer, look back through 10 more old answers and update them
         ret = self.parseAnswerAndSave(d, latestQID, latestAID, all)
         if not all and ret:  # not a full fetch, and a duplicate was found
             return
         if currentPage >= pageMax:  # already on the last page
             break
         # build the pq document for the next page
         currentPage += 1
         pageContent = urllib2.urlopen("{}?page={}".
                                       format(self.answerURL, currentPage)).read()
         d = pq(pageContent)
Example #3
def parse_current_docket(docket_record):
    # grab the file with the URL mangled slightly to grab 100k records
    docket_file = urllib2.urlopen(docket_record['url'] + "&ctl00_ctl00_cphContentMain_MainContent_gvCommentListChangePage=1_100000").read()
    page = pq(etree.fromstring(docket_file, parser))

    docket = dict(docket_record)

    docket['title'] = page('.dyn_wrap h1').text().strip()
    assert docket['title'], 'no title found'

    headers = [item.text().strip() for item in page('.rgMasterTable thead th').items()]

    docket['comments'] = []

    # check if there's a no-records message
    if len(page('.rgMasterTable .rgNoRecords')):
        return docket
    
    for row in page('.rgMasterTable tbody tr').items():
        tds = row.find('td')
        cell_text = [item.text().strip() for item in tds.items()]
        cdata = dict(zip(headers, cell_text))
        
        link = pq(tds[-1]).find('a')

        doc = {
            'url': urlparse.urljoin(docket['url'], link.attr('href')),
            'details': {},
            'release': [fix_spaces(cdata['Release'])],
            'date': cdata['Date Received'],
            'doctype': 'public_submission',
        }

        vc_matches = re.findall(r"ViewComment\.aspx\?id=(\d+)", doc['url'])
        if vc_matches:
            doc['id'] = vc_matches[0]
            doc['subtype'] = 'comment'
            detail_columns = ['Organization', 'First Name', 'Last Name']
        else:
            ep_matches = re.findall(r"ViewExParte\.aspx\?id=(\d+)", doc['url'])
            if ep_matches:
                doc['id'] = "EP-%s" % ep_matches[0]
                doc['subtype'] = 'exparte'
                detail_columns = ['Organization']
            else:
                assert False, "expected either comment or exparte link: %s" % doc['url']

        for rdg_label, cftc_label in (('Organization Name', 'Organization'), ('First Name', 'First Name'), ('Last Name', 'Last Name')):
            if cftc_label in detail_columns and cdata[cftc_label]:
                doc['details'][rdg_label] = cdata[cftc_label]

        docket['comments'].append(doc)

    assert len(docket['comments']) < 100000, "we probably exceeded one page"

    # then strip out all the ones that aren't about this document
    release = fix_spaces(page('a[id*=rptReleases_hlReleaseLink]').text().strip())
    docket['comments'] = [comment for comment in docket['comments'] if comment['release'][0] == release]

    return docket
Example #4
 def test__render(self):
     w = MultiEmailWidget()
     output = w.render('test', ['*****@*****.**', '*****@*****.**'])
     self.assertEqual(1, len(pq('textarea', output)))
     self.assertEqual(
         pq('textarea', output).text(),
         '[email protected],[email protected]')
Example #5
def scrape_press_releases():
    releases_page = pq(scraperwiki.scrape(BASE_URL + 'news-releases'))
    for row in releases_page.find('.recordListTitle'):
        sleep(1)

        title = ''
        date = None
        content = ''
        attachments = []

        links = pq(row).find('a')
        page = pq(scraperwiki.scrape(links.eq(0).attr('href')))
        title = _extract_title_from(page)
        content = _readable(page.find('.content').html())
        date = _extract_date_from(page)
        for attachment in page.find('.file_link a'):
            att = pq(attachment)
            attachments.append({att.text(): att.attr('href')})
    
        args = [title, date, content]
        kwargs = {}
        if len(attachments):
            kwargs.update(attachments=attachments)
        
        gasp.add_press_release(*args, **kwargs)
Example #6
def getAnimeURL(searchText):
    try:
        searchText = sanitiseSearchText(searchText)
        
        html = requests.get(BASE_URL + "/anime/all?name=" + searchText.replace(" ", "%20"))
        ap = pq(html.text)

        animeList = []

        #If it's taken us to the search page
        if ap.find('.cardDeck.pure-g.cd-narrow[data-type="anime"]'):
            for entry in ap.find('.card.pure-1-6'):
                entryTitle = pq(entry).find('a').text()
                entryURL = pq(entry).find('a').attr('href')
                
                anime = {}
                anime['title'] = entryTitle
                anime['url'] = BASE_URL + entryURL
                animeList.append(anime)

            closestName = difflib.get_close_matches(searchText.lower(), [x['title'].lower() for x in animeList], 1, 0.85)[0]
            closestURL = ''
            
            for anime in animeList:
                if anime['title'].lower() == closestName:
                    return anime['url']
            
        #Else if it's taken us right to the series page, get the url from the meta tag
        else:
            return ap.find("meta[property='og:url']").attr('content')
        return None
            
    except:
        #traceback.print_exc()
        return None
Example #7
    def test_delete(self):
        """Can delete badge"""
        user = self._get_user()
        badge = Badge(creator=user, title="Test III",
                      description="Another test")
        badge.save()
        slug = badge.slug

        badge.award_to(user)

        self.client.login(username="******", password="******")

        r = self.client.get(reverse('badger.views.detail',
            args=(badge.slug,)), follow=True)
        doc = pq(r.content)

        eq_('badge_detail', doc.find('body').attr('id'))
        delete_url = doc.find('a.delete_badge').attr('href')
        ok_(delete_url is not None)

        r = self.client.get(delete_url)
        doc = pq(r.content)
        eq_('badge_delete', doc.find('body').attr('id'))
        eq_("1", doc.find('.awards_count').text())

        r = self.client.post(delete_url, {}, follow=True)
        doc = pq(r.content)

        try:
            badge = Badge.objects.get(slug=slug)
            ok_(False)
        except Badge.DoesNotExist:
            ok_(True)
Example #8
    def get_content(self,total_pq):
        '''Get the text content of the user's Weibo posts'''
        data = total_pq("div[node-type=feed_list_content]")
        i = 0
        for d in data :
            d = pq(d)
            if i == 0 and str(d("span")) != "":  # skip pinned/hot posts (span.W_icon_feedpin/feedhot)
                self.containsFirstTagWeibo = True
            else:
                if '//' in d.text():   # the post contains a repost ("//" separator)
                    p1 = re.compile(r'(.*?)\s?//\s?<a', re.S)  # capture only the user's own text, excluding the reposted part after //
                    match = p1.search(d.outerHtml())
                    if match:
                        if match.group(1).strip() == '':  # the user's own text is empty
                            self.content_list.append('')
                        else:
                            data_pq = pq(match.group(1))
                            #print '~~~~~~~~~~~~',data_pq.outerHtml()
                            content = self.get_content_src(data_pq)
                            #print '1111111111', content
                            self.content_list.append(content)
                    else:
                        # the post itself simply contains "//"
                        self.content_list.append(d.text())

                else:  # a regular post with no repost
                    content = self.get_content_src(d)                
                    self.content_list.append(content)
            i = i+1
        return self.content_list
Example #9
 def run(self):
     opener = build_opener(HTTPCookieProcessor())
     d = pq(opener.open(self.url).read())
     rates = d(".count")
     self.descrip = pq(rates[0]).html()
     self.service = pq(rates[1]).html()
     self.speed = pq(rates[2]).html()
Example #10
 def test_map(self):
     def ids_minus_one(i, elem):
         return int(self.klass(elem).attr('id')[-1]) - 1
     assert self.klass('div', self.html).map(ids_minus_one) == [0, 1]
     
     d = pq('<p>Hello <b>warming</b> world</p>')
     self.assertEqual(d('strong').map(lambda i,el: pq(this).text()), [])
Example #11
    def test_bug869301_revisions_feed_locale(self):
        """Links to documents in revisions feed with ?all_locales should
        reflect proper document locale, regardless of requestor's locale"""
        d = document(title='HTML9', locale="fr")
        d.save()
        now = datetime.datetime.now()
        for i in xrange(1, 6):
            created = now + datetime.timedelta(seconds=5 * i)
            revision(save=True,
                     document=d,
                     title='HTML9',
                     comment='Revision %s' % i,
                     content="Some Content %s" % i,
                     is_approved=True,
                     created=created)

        resp = self.client.get('%s?all_locales' %
                               reverse('wiki.feeds.recent_revisions',
                                       args=(),
                                       kwargs={'format': 'rss'},
                                       locale='en-US'))
        self.assertEqual(200, resp.status_code)
        feed = pq(resp.content)
        self.assertEqual(5, len(feed.find('item')))
        for i, item in enumerate(feed.find('item')):
            href = pq(item).find('link').text()
            self.assertTrue('/fr/' in href)
Example #12
def get_autolab_grades():
    #Autolab has their SSL certificates misconfigured, so we won't verify them
    s = authenticate('https://autolab.cs.cmu.edu/auth/users/auth/shibboleth',{"verify":False})

    main = s.get('https://autolab.cs.cmu.edu').content
    d = pq(main)
    current_courses = d('#content > .rolodex > .course > h1 > a')
    grades = {}

    for course in current_courses:
        page_1 = s.get('https://autolab.cs.cmu.edu%s/assessments' % d(course).attr('href')).content
        gradebook = pq(pq(page_1)('.action-links > li > a')[1]).attr('href')

        course_page = s.get('https://autolab.cs.cmu.edu%s' % gradebook).content
        course_name = d(course).text()
        cd = pq(course_page)

        grades[course_name] = {}

        assignments = cd('.grades tr')
        for assgn in assignments:
            if d(assgn).attr('class') == 'header': continue

            name = cd(assgn).find("td > span > a").text()
            score = cd(assgn).find("td > a").text()
            total = cd(assgn).find("span.max_score").text()

            if name is not None and score is not None and total is not None:
                grades[course_name][name] = [float(score), float(total)]


    return grades
Example #13
def ods2csv(content,admins=''):

  file_like_object = StringIO(content)

  xml = zipfile.ZipFile(file_like_object).read('content.xml')  
    
  def rep_repl(match):  
    return '<table:table-cell>%s' %match.group(2) * int(match.group(1))  
  def repl_empt(match):  
    n = int(match.group(1))  
    pat = '<table:table-cell/>'  
    return pat*n if (n<100) else pat  
      
  p_repl = re.compile(r'<table:table-cell [^>]*?repeated="(\d+)[^/>]*>(.+?table-cell>)')  
  p_empt = re.compile(r'<table:table-cell [^>]*?repeated="(\d+)[^>]*>')  
  xml = re.sub(p_repl, rep_repl, xml)  
  xml = re.sub(p_empt, repl_empt, xml)  
      
  d = pq(xml, parser='xml')  
  ns={'table': 'urn:oasis:names:tc:opendocument:xmlns:table:1.0'}  
  selr = CSSSelector('table|table-row', namespaces=ns)  
  selc = CSSSelector('table|table-cell', namespaces=ns)  
  rowxs = pq(selr(d[0]))  
  data = []  
  for ir,rowx in enumerate(rowxs):  
    cells = pq(selc(rowx))  
    if cells.text():  
      data.append([cells.eq(ic).text().encode('utf-8') for ic in range(len(cells))])  
  if data:
    return data
  else:
    logger_script=logging.getLogger("Script Error")
    logger_script.warning("Google returned empty file for table <b>%s</b>" % IP_table_name)
    sending_log("DHCP_ERROR: script got an error","<b>Google returned empty file</b>, script ended without any changes made!",error_log_file_name=error_log_file_name,admins=admins)
    sys.exit(exit_code_dict['google_returned_empty_file'])
Example #14
    def test_answer_creator_can_edit(self):
        """The creator of an answer can edit his/her answer."""
        self.client.login(username='******', password='******')

        # Initially there should be no edit links
        response = get(self.client, 'questions.answers',
                       args=[self.question.id])
        doc = pq(response.content)
        eq_(0, len(doc('ol.answers a.edit')))

        # Add an answer and verify the edit link shows up
        content = 'lorem ipsum dolor sit amet'
        response = post(self.client, 'questions.reply',
                        {'content': content},
                        args=[self.question.id])
        doc = pq(response.content)
        eq_(1, len(doc('ol.answers a.edit')))
        new_answer = self.question.answers.order_by('-created')[0]
        eq_(1, len(doc('#answer-%s a.edit' % new_answer.id)))

        # Make sure it can be edited
        content = 'New content for answer'
        response = post(self.client, 'questions.edit_answer',
                        {'content': content},
                        args=[self.question.id, new_answer.id])
        eq_(200, response.status_code)

        # Now lock it and make sure it can't be edited
        self.question.is_locked = True
        self.question.save()
        response = post(self.client, 'questions.edit_answer',
                        {'content': content},
                        args=[self.question.id, new_answer.id])
        eq_(403, response.status_code)
Example #15
    def test_top_contributors(self):
        # There should be no top contributors since there are no solutions.
        cache_top_contributors()
        response = get(self.client, 'questions.questions')
        doc = pq(response.content)
        eq_(0, len(doc('#top-contributors ol li')))

        # Solve a question and verify we now have a top conributor.
        answer = Answer.objects.all()[0]
        answer.created = datetime.now()
        answer.save()
        answer.question.solution = answer
        answer.question.save()
        cache_top_contributors()
        response = get(self.client, 'questions.questions')
        doc = pq(response.content)
        lis = doc('#top-contributors ol li')
        eq_(1, len(lis))
        eq_('pcraciunoiu', lis[0].text)

        # Make answer 8 days old. There should no be top contributors.
        answer.created = datetime.now() - timedelta(days=8)
        answer.save()
        cache_top_contributors()
        response = get(self.client, 'questions.questions')
        doc = pq(response.content)
        eq_(0, len(doc('#top-contributors ol li')))
Example #16
    def common_vote(self):
        """Helper method for question vote tests."""
        # Check that there are no votes and vote form renders
        response = get(self.client, 'questions.answers',
                       args=[self.question.id])
        doc = pq(response.content)
        eq_('0 people', doc('div.have-problem mark')[0].text)
        eq_(1, len(doc('div.me-too form')))

        # Vote
        post(self.client, 'questions.vote', args=[self.question.id])

        # Check that there is 1 vote and vote form doesn't render
        response = get(self.client, 'questions.answers',
                       args=[self.question.id])
        doc = pq(response.content)
        eq_('1 person', doc('div.have-problem mark')[0].text)
        eq_(0, len(doc('div.me-too form')))

        # Voting again (same user) should not increment vote count
        post(self.client, 'questions.vote', args=[self.question.id])
        response = get(self.client, 'questions.answers',
                       args=[self.question.id])
        doc = pq(response.content)
        eq_('1 person', doc('div.have-problem mark')[0].text)
Example #17
    def common_answer_vote(self):
        """Helper method for answer vote tests."""
        # Check that there are no votes and vote form renders
        response = get(self.client, 'questions.answers',
                       args=[self.question.id])
        doc = pq(response.content)
        eq_(1, len(doc('form.helpful input[name="helpful"]')))

        # Vote
        post(self.client, 'questions.answer_vote', {'helpful': 'y'},
             args=[self.question.id, self.answer.id])

        # Check that there is 1 vote and vote form doesn't render
        response = get(self.client, 'questions.answers',
                       args=[self.question.id])
        doc = pq(response.content)

        eq_('1 out of 1 person', doc('#answer-1 div.helpful mark')[0].text)
        eq_(0, len(doc('form.helpful input[name="helpful"]')))

        # Voting again (same user) should not increment vote count
        post(self.client, 'questions.answer_vote', {'helpful': 'y'},
             args=[self.question.id, self.answer.id])
        doc = pq(response.content)
        eq_('1 out of 1 person', doc('#answer-1 div.helpful mark')[0].text)
Example #18
    def _delete_flow(self, user):
        """Private method used to walk through account deletion flow."""
        self.client.login(email=user.email)
        user_id = User.objects.get(email=user.email).id

        r = self.client.get(reverse('profile.edit'))
        doc = pq(r.content)

        # Make sure there's a link to a confirm deletion page, and nothing
        # pointing directly to the delete URL.
        eq_(reverse('profile.delete_confirm'),
            doc('a.btn-danger').attr('href'),
            'We see a link to a confirmation page.')
        self.assertFalse(any((reverse('profile.delete') in el.action)
                              for el in doc('#main form')),
            "We don't see a form posting to the account delete URL.")

        # Follow the link to the deletion confirmation page.
        r = self.client.get(doc('a.btn-danger').attr('href'))

        # Test that we can go back (i.e. cancel account deletion).
        doc = pq(r.content)
        eq_(reverse('profile.edit'),
            doc('#cancel-action').attr('href'))

        # Test that account deletion works.
        delete_url = doc('#delete-action').closest('form').attr('action')
        r = self.client.post(delete_url, follow=True)
        eq_(200, r.status_code)
        self.assertFalse(_logged_in_html(r))

        # Make sure the user data isn't there anymore
        assert not User.objects.get(id=user_id).first_name
        assert not User.objects.get(id=user_id).email
        assert not User.objects.get(id=user_id).is_active
Example #19
 def movie_search(self, **kw):
     """ Fallback for movie.search """
     keyword = kw['q']
     url = 'http://movie.douban.com/subject_search?search_text=%s&cat=1002' % \
           keyword.encode('utf-8')
     html = pq(url=url, parser='html', opener=lambda url, **kw:
                 urllib2.urlopen(urllib2.Request(url, headers={
                     'User-Agent': self.recbysns.UA,
                     'Cookie': self.cookie}), timeout=10).read())
     movies = []
     for movie in html('#content table .item'):
         movie = pq(movie)
         id = int(re.match('http://movie.douban.com/subject/(\d+)/',
                  movie('.nbg').attr('href')).group(1))
         image = movie('.nbg img').attr('src')
         pub = movie('.pl2>.pl').text()
         rating = pq(movie('.pl2 .star'))
         if rating and rating('.rating_nums').text():
             numRaters = int(re.match(u'\((\d+)',
                             rating('.pl').text()).group(1))
             average = rating('.rating_nums').text()
             rating = {"numRaters": numRaters, "average": average}
         else:
             rating = {"numRaters": 0, "average": 0}
         titles = [title.strip()
                   for title in movie('.pl2>a').text().split('/')]
         movies.append({'id': id, 'titles': titles,
                        'image': image, 'pub': pub, 'rating': rating})
     return {'movies': movies, 'total': len(movies)}
Example #20
def scrape_inspection(inspection_url, facility):
    try:
        inspection = {}
        inspection['facility'] = facility['_id']
        if 'id' in facility:
            inspection['facility_id'] = facility['id']
        inspection['_id'] = inspection_url
        inspection['url'] = inspection_url
        inspection_resp = requests.get(inspection['url'])
        doc = pq(inspection_resp.content)

        info = doc.find('div#inspectionInfo tr td')
        for (counter, pair) in enumerate(grouper(info, 2)):
            value = pq(pair[1]).text()
            if counter == 0:
                date = dateutil.parser.parse(value)
                inspection['date'] = date.date()
            elif counter == 2:
                inspection['priority'] = value
            elif counter == 3:
                inspection['purpose'] = value
            elif counter == 4:
                inspection['result'] = value
            elif counter == 5:
                inspection['actions'] = value

        print "inspection: %s" % inspection
        save_inspection(inspection)
        return inspection, inspection_resp
    except:
        logger.exception("Could not scrape inspection %s" %
                         inspection.get('url', ''))
Example #21
    def test_mozillian_can_vouch(self):
        """
        Tests the vouching system's happy path.

        Kind of a big test because we want to:
        a. Test registration's happy path
        b. Test vouching
        c. Test account deletion
        """
        moz_client = self.mozillian_client
        r = moz_client.get(reverse('profile', args=[self.pending.username]))
        eq_(200, r.status_code)
        doc = pq(r.content)
        self.assertTrue(doc('form#vouch-form'))

        vouch_url = reverse('vouch')
        data = dict(vouchee=self.pending.get_profile().id)
        vouched_profile = moz_client.post(vouch_url, data, follow=True)
        self.pending = User.objects.get(pk=self.pending.pk)
        eq_(200, vouched_profile.status_code)

        r = moz_client.get(reverse('profile', args=[self.pending.username]))
        eq_(200, r.status_code)
        doc = pq(r.content)
        self.assertTrue(not doc('form#vouch-form'))

        eq_(self.pending.get_profile().vouched_by.user, self.mozillian,
            'Credit given')
Example #22
 def get_results(self, response, sort=True):
     """Return pks of add-ons shown on search results page."""
     addons = pq(response.content)('#pjax-results div[data-addon]')
     pks = [int(pq(a).attr('data-addon')) for a in addons]
     if sort:
         return sorted(pks)
     return pks
Example #23
    def test_support_link(self):
        # Test no link if no support url or contribution.
        self.enable_waffle()
        r = self.client.get(self.add)
        eq_(pq(r.content)('.support-link').length, 0)

        # Test support email if no support url.
        self.webapp.support_email = {'en-US': '*****@*****.**'}
        self.webapp.save()
        r = self.client.get(self.add)
        doc = pq(r.content)('.support-link')
        eq_(doc.length, 1)

        # Test link to support url if support url.
        self.webapp.support_url = {'en-US': 'test'}
        self.webapp.save()
        r = self.client.get(self.add)
        doc = pq(r.content)('.support-link a')
        eq_(doc.length, 1)
        eq_(doc.attr('href'), 'test')

        # Test link to support flow if contribution.
        c = Contribution.objects.create(addon=self.webapp, user=self.user,
                                        type=amo.CONTRIB_PURCHASE)
        r = self.client.get(self.add)
        doc = pq(r.content)('.support-link a')
        eq_(doc.length, 1)
        eq_(doc.attr('href'), reverse('support', args=[c.id]))
Example #24
    def test_known_authors_filter(self):
        # There are a total of 11 revisions
        url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                        authors=RevisionDashboardForm.ALL_AUTHORS)
        response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
        eq_(response.status_code, 200)

        page = pq(response.content)
        revisions = page.find('.dashboard-row')

        eq_(11, revisions.length)

        # Only testuser01 is in the Known Authors group, and has 2 revisions
        url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                        authors=RevisionDashboardForm.KNOWN_AUTHORS)
        response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
        eq_(response.status_code, 200)

        page = pq(response.content)
        revisions = page.find('.dashboard-row')

        eq_(2, revisions.length)

        # Of the 11 revisions, 9 are by users not in the Known Authors group
        url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                        authors=RevisionDashboardForm.UNKNOWN_AUTHORS)
        response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
        eq_(response.status_code, 200)

        page = pq(response.content)
        revisions = page.find('.dashboard-row')

        eq_(9, revisions.length)
Example #25
    def test_bug_709938_interests(self):
        testuser = self.user_model.objects.get(username='******')
        self.client.login(username=testuser.username,
                          password=TESTUSER_PASSWORD)

        url = reverse('users.user_edit', args=(testuser.username,))
        response = self.client.get(url, follow=True)
        doc = pq(response.content)

        test_tags = [u'science,Technology,paradox,knowledge,modeling,big data,'
                     u'vector,meme,heuristics,harmony,mathesis universalis,'
                     u'symmetry,mathematics,computer graphics,field,chemistry,'
                     u'religion,astronomy,physics,biology,literature,'
                     u'spirituality,Art,Philosophy,Psychology,Business,Music,'
                     u'Computer Science']

        form = self._get_current_form_field_values(doc)

        form['user-interests'] = test_tags

        response = self.client.post(url, form, follow=True)
        eq_(200, response.status_code)
        doc = pq(response.content)
        eq_(1, doc.find('ul.errorlist li').length)
        assert ('Ensure this value has at most 255 characters'
                in doc.find('ul.errorlist li').text())
Example #26
def get_perm(perm):
    jq = pq(base_url.format(perm, name))
    over = len(jq(".ws-ds-text"))
    if not (over):
        return [
            {
                "Name": pq(x)("td:eq(0)").text().encode("utf-8"),
                "Email": "{}@illinois.edu".format(
                    pq(x)("script")
                    .text()
                    .replace('displayIllinois("', "")
                    .replace("'", "")
                    .replace(")", "")
                    .replace('"', "")
                    .encode("utf-8")
                ),
            }
            for x in jq(".ws-ds-dept-details table tr:gt(0):odd")
        ]
    else:
        return concr(
            get_perm,
            map(lambda x: "{}{}".format(perm, x), get_character_permutations(num_characters=1)),
            max_workers=10,
        )
Example #27
 def get_perm(perm):
     jq = pq(
         "http://www1.appstate.edu/cgi-bin/cgiwrap/jmm/newcso4.pl",
         data={"last": name, "first": "{}*".format(perm), "type": "student"},
         method="post",
     )
     over = len(jq("p:contains('too many results')"))
     if not (over):
         return [
             {
                 "Name": re.findall(r"name\: .*", pq(x).text())[0]
                 .replace("name:", "")
                 .strip()
                 .encode("utf-8"),
                 "Email": pq(x)("a[href^=mailto]").text(),
             }
             for x in jq("#maintext table tr:gt(0) td pre")
             if re.findall(r"name\: .*", pq(x).text())
         ]
     else:
         with concurrent.futures.ThreadPoolExecutor(max_workers=10) as thread:
             return list(
                 itertools.chain(
                     *list(
                         thread.map(
                             get_perm,
                             ["{}{}".format(perm, x) for x in get_character_permutations(num_characters=1)],
                         )
                     )
                 )
             )
Example #28
    def test_post_bad_site(self):
        self.client.login(username='******', password='******')
        url = '/datasheet/bulk_import/'
        with open(self.fpath_bad_site) as f:
            response = self.client.post(url, {
                'datasheet_id': self.ds.pk,
                'organization': 'Coast Savers', 
                'project_id': 1, 
                'csvfile': f
                }
            )
        d = pq(response.content)
        el = d("ul.errorlist li")
        self.assertEqual(response.status_code, 400, response.content)
        self.assertEqual(len(el), 1)
        self.assertTrue("TestSite3" in el[0].text_content(), el[0].text_content())
        self.assertTrue("is not in the database" in el[0].text_content(), el[0].text_content())

        # Now add the site
        ca = State.objects.get(name="California")
        testsite3 = Site(sitename="TestSite3", state=ca, county="Santa Cruz")
        testsite3.save()
        with open(self.fpath_bad_site) as f:
            response = self.client.post(url, {
                'datasheet_id': self.ds.pk,
                'organization': 'Coast Savers', 
                'project_id': 1, 
                'csvfile': f
                }
            )
        d = pq(response.content)
        el = d("ul.errorlist li")
        self.assertEqual(response.status_code, 200, response.content)
        self.assertEqual(len(el), 0)
Example #29
def parse_count_by_use(html, city):
    #dir_name = get_result_path()
    file_name1 = os.path.join(dir_name, 'shenzhen_gov_ershoufang_count_by_use_day_region.txt')
    file_name2 = os.path.join(dir_name, 'shenzhen_gov_ershoufang_count_by_use_month_region.txt')
    table1 = pq(html)("#ctl00_ContentPlaceHolder1_clientList1 tr")
    table2 = pq(html)("#ctl00_ContentPlaceHolder1_clientList2 tr")
    date1 = pq(html)('#ctl00_ContentPlaceHolder1_lblCurTime1').text()
    date2 = pq(html)('#ctl00_ContentPlaceHolder1_lblCurTime2').text()
    infos1 = infos2 = ''
    if os.path.exists(file_name1):
        fr = open(file_name1, 'r')
        infos1 = fr.read().decode('utf8')
        fr.close()
    if os.path.exists(file_name2):
        fr = open(file_name2, 'r')
        infos2 = fr.read().decode('utf8')
        fr.close()
    #print len(table1)
    #print len(table2)
    info1 = get_info(table1, date1, city)
    #print 'table2'
    info2 = get_info(table2, date2, city)
    #print 'end table2'
    if date1 + ',' + city not in infos1:
        fw1 = open(file_name1, 'a')
        fw1.write(','.join(info1).encode('utf8') + '\n')
        fw1.close()
    #print 'end date1'
    if date2 + ',' + city not in infos2:
        fw2 = open(file_name2, 'a')
        fw2.write(','.join(info2).encode('utf8') + '\n')
        fw2.close()
Example #30
 def test_perf_warning(self):
     eq_(self.addon.ts_slowness, None)
     doc = pq(self.client.get(self.detail_url).content)
     eq_(doc('.performance-note').length, 0)
     self.addon.update(ts_slowness=100)
     doc = pq(self.client.get(self.detail_url).content)
     eq_(doc('.performance-note').length, 1)
Example #31
def cam_get_name(course_page):
    doc = pq(course_page)
    name = doc('title')[0].text[:doc('title')[0].text.find('|') - 1]
    return name
Example #32
def getVideo_title(url):
    html = bilibili().getHtml(url)
    doc = pq(html)
    video_title = doc('#viewbox_report > h1 > span').text()
    return video_title
Example #33
import requests
from pyquery import PyQuery as pq

# Zhihu hot topics (the "Explore" page)
url = 'https://www.zhihu.com/explore'

# Spoof the User-Agent to avoid anti-scraping blocks
headers = {
    'User-Agent':
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
# Fetch the HTML with requests
html = requests.get(url, headers=headers).text
# Parse the HTML with pyquery
doc = pq(html)

items = doc('.explore-tab .feed-item').items()
for item in items:
    # question title
    question = item.find('h2').text()
    # author
    author = item.find('.author-link-line').text()
    # answer content
    answer = pq(item.find('.content').html()).text()

    # 'a' opens the file in append mode, with utf-8 encoding
    file = open('explore.txt', 'a', encoding='utf-8')
    file.write('\n'.join([question, author, answer]))
    # separate each hot topic with a line of 50 equals signs
    file.write('\n' + '=' * 50 + '\n')
    file.close()
Example #34
def parse_base(item):
    if item is None:
        return None

    logger.info("*** base ***")
    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)

    company_short_name = ""
    product_name = d('div.line-title> span> h1').clone().children().remove().end().text().strip()
    if product_name is None or product_name.strip() == "":
        product_name = d('div.line-title> span> b').clone().children().remove().end().text().strip()
    temps = product_name.split("/",1)
    if len(temps) == 2:
        product_name = temps[0].strip()
        company_short_name = temps[1].strip()
    if company_short_name == "":
        company_short_name = product_name
    logger.info("product name: %s" % product_name)
    logger.info("company short name: %s" % company_short_name)

    company_name = d('div.des-more> div').eq(0).text().strip().replace("公司全称:","")
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    if company_name is None or company_name.strip() == "":
        try:
            company_name = d('div.des-more> h2').text().strip()
        except:
            pass
    if company_name == "暂无" or company_name == "暂未收录":
        company_name = ""

    company_name = name_helper.company_name_normalize(company_name)
    logger.info("company name: %s" % company_name)

    if company_short_name == "" and company_name == "":
        return

    establish_date = None
    str = d('div.des-more> div').eq(1).text().strip().replace("成立时间:","")
    result = util.re_get_result('(\d*)\.(\d*)',str)
    if result != None:
        (year, month) = result
        try:
            if int(month) > 12:
                month = "1"
        except:
            month = "1"
        establish_date = datetime.datetime.strptime("%s-%s-1" % (year,month), '%Y-%m-%d')
    logger.info("establish date: %s" % establish_date)

    locationId=0
    str = d('span.loca').text().strip()
    #logger.info(str)
    result = util.re_get_result(u'(.*?)·(.*?)$',str)
    if result != None:
        (province, city) = result
        province = province.strip()
        city = city.strip()
        logger.info("location: %s-%s" % (province, city))

        locationId = 0
        result = parser_db_util.get_location(city)
        if result != None:
            locationId = result["locationId"]
        else:
            result = parser_db_util.get_location(province)
            if result != None:
                locationId = result["locationId"]

    if locationId == 0:
        loc1,loc2 = name_helper.get_location_from_company_name(company_name)
        if loc1 is not None:
            result = parser_db_util.get_location(loc1)
            if result != None:
                locationId = result["locationId"]
    logger.info("locationId: %d" % locationId)

    company_status = 2010
    str = d('div.des-more> div').eq(2).text().strip()
    if str == "已关闭":
        company_status = 2020
    logger.info("company_status: %d" % company_status)

    funding_type = 0
    str = d("span.tag.bg-c").text().strip()
    logger.info("融资需求: %s" % str)
    if str == "融资需求 · 需要融资":
        funding_type = 8020
    elif str == "融资需求 · 寻求收购":
        funding_type = 8020
    logger.info("funding_type=%d" % funding_type)
    try:
        brief = d("h2.seo-slogan").text().strip()
    except:
        brief = ""
    logger.info("brief: %s" % brief)

    if "暂未收录" in brief:  # placeholder meaning "not yet included"
        brief = ""
    field = d("span.scope.c-gray-aset> a").eq(0).text().strip()
    logger.info("field: %s" % field)

    sub_field = d("span.scope.c-gray-aset> a").eq(1).text().strip()
    logger.info("sub field: %s" % sub_field)

    tags = d("div.tagset.dbi.c-gray-aset> a >span").text().strip().replace(" ",",")
    logger.info("tags: %s" % tags)

    desc = d("div.des,div.desc,div.introduction,div.abstract,div.summary").text().\
        replace("购买数据请联系","").replace('*****@*****.**',"").replace("itjuzi是一家数据服务公司","").strip()
    logger.info("********desc: %s" % desc)

    #logo
    logo = d("div.pic >img").attr("src")
    #if logo:
    #    logo = logo.replace("http://", "https://")
    logger.info("logo: %s", logo)


    # website = d('div.link-line> a').text().strip()
    # if website is None or website == "":
    #     website = d('div.link-line> a.webTink').text().strip()
    # if website is None or website == "":
    #     try:
    #         logger.info("here")
    #         website = d('div.link-line> span.weblink> a').eq(1).text().strip()
    #         logger.info(website)
    #     except:
    #         pass
    artifacts = []
    for ty in [1,2,3]:
        if ty == 1:
            was = d('div.link-line> a')
        else:
            was = d('div.link-line> span.weblink,span.webTink> a')

        for wa in was:
            webs =[]

            try:
                website = pq(wa).attr("href").strip()
                if website=="http://%e6%9a%82%e6%97%a0" or website == "http://tt":
                    website = ""
                website = url_helper.url_normalize(website)
                logger.info("website: %s" % website)
                webs.append(website)
            # else:

            #     website = pq(wa).text().strip()
            except:
                pass
            try:
                website = pq(wa).text().strip()
                if website == "http://%e6%9a%82%e6%97%a0" or website == "http://tt":
                    website = ""
                website = url_helper.url_normalize(website)
                logger.info("website: %s" % website)
                webs.append(website)
            # else:
            #     website = pq(wa).text().strip()
            except:
                pass

            #
            # if website=="http://%e6%9a%82%e6%97%a0":
            #     website = ""
            # website = url_helper.url_normalize(website)
            # logger.info("website: %s" % website)

            # artifacts = []
            for website in webs:
                type, app_market, app_id = url_helper.get_market(website)
                if type == 4010:
                    flag, domain = url_helper.get_domain(website)
                    if flag is not None:
                        if flag is False:
                            domain = None
                        artifacts.append({
                            "type":4010,
                            "name":product_name,
                            "desc":desc,
                            "link":website,
                            "domain": domain
                        })

                elif type == 4020:
                    domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4020,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": website
                        })

                elif type == 4030:
                    domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type": 4030,
                            "name": product_name,
                            "desc": None,
                            "link": website,
                            "domain": None
                        })

                elif type == 4040:
                    domain = app_id
                    if domain is not None:
                        artifacts.append({
                                "type":4040,
                                "name":product_name,
                                "desc":desc,
                                "link":website,
                                "domain": domain
                        })
                elif type == 4050:
                    domain = None
                    if app_market == 16010 or app_market == 16020:
                        android_app = parser_db_util.find_android_market(app_market, app_id)
                        if android_app:
                            domain = android_app["apkname"]
                    else:
                        domain = app_id
                    if domain is not None:
                        artifacts.append({
                            "type":4050,
                            "name":product_name,
                            "desc":desc,
                            "link":website,
                            "domain": domain
                        })


    # funding status ("获投状态")
    roundStr = d('span.t-small.c-green').text().replace("(","").replace(")","").replace("获投状态:","").strip()
    fundingRound,roundStr = itjuzi_helper.getFundingRound(roundStr)
    logger.info("获投状态: %d, %s", fundingRound, roundStr)

    logger.info("")


    return {
        "shortName": company_short_name,
        "fullName": company_name if company_name is not None and company_name.strip() != "" else None,
        "productName": product_name,
        "description": desc,
        "brief": brief,
        "round": fundingRound,
        "roundDesc": roundStr,
        "companyStatus": company_status,
        "fundingType": funding_type,
        "locationId": locationId,
        "establishDate": establish_date,
        "logo": logo,
        "sourceId": company_key,
        "field": field,
        "subField": sub_field,
        "tags": tags,
        "type":41010,
        "artifacts":artifacts
    }
Example #35
def parse_artifact(item):
    if item is None:
        return None

    artifacts = []
    company_key = item["key"]
    html = item["content"]
    #logger.info(html)
    d = pq(html)

    #artifact
    logger.info("*** artifact ***")
    lis = d('ul.list-prod> li> div.on-edit-hide')
    for li in lis:
        l = pq(li)
        strtype = l('h4> span.tag').text().strip()
        #logger.info(strtype)
        if strtype != u"网站" and strtype != "app":
            continue

        link = l('h4> b> a').attr("href").strip()
        if link == "":
            continue

        domain = None
        type = None
        if strtype == u"网站":
            type, app_market, app_id = url_helper.get_market(link)
            if type == 4010:
                link = url_helper.url_normalize(link)
                flag, domain = url_helper.get_domain(link)
                if flag is None:
                    continue
                if flag is False:
                    domain = None

        if type != 4010:
            type, app_market, app_id = url_helper.get_market(link)
            if type == 4040:
                domain = app_id
            elif type == 4050:
                if app_market == 16010 or app_market == 16020:
                    android_app = parser_db_util.find_android_market(app_market, app_id)
                    if android_app:
                        domain = android_app["apkname"]
                else:
                    domain = app_id
            if domain is None and type !=4030 and type != 4020:
                continue

        name = l('h4> b').text().strip()
        desc = l('p').text().strip()
        logger.info("type: %s, name: %s, link: %s, desc: %s" % (type, name,link,desc))
        artifact = {
            "type":type,
            "name":name,
            "desc":desc,
            "link":link,
            "domain": domain
        }
        artifacts.append(artifact)

    logger.info("")
    return artifacts
Example #36
def cam_get_requirements(course_page):
    doc = pq(course_page)
    temptext = doc('fieldset.collapsible.collapsed.group-entry-requirements').find('p').eq(0).text()
    alevel = temptext[:temptext.find('\n')]
    ib = temptext[(temptext.find('\n') + 2):]
    return (alevel + '; ' + ib)
Example #37
def cam_get_description(course_page):
    doc = pq(course_page)
    return "placeholder description"
Example #38
from pyquery import PyQuery as pq

## Initialization
# Initialize from an HTML string
# doc = pq(html)
# print(doc('a'))
# Initialize from a URL
# doc = pq('https://github.com')
# print(doc('title'))
# Initialize from a file
# doc = pq(filename='test.html')
# print(doc('p'))

## Basic CSS selectors
with open('test.html', encoding='utf-8') as f:
    html = f.read()
doc = pq(html)
# print(doc('#container .list li'))
# print(type(doc('#container .list li')))

## Finding nodes

# Child nodes
# find
# item = doc('.list')
# print(item)
# # print(type(item))
# lis = item.find('li')
# # print(type(lis))
# print(lis)
# children
# lst = item.children()
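# A minimal sketch of find() vs children(), assuming test.html has a #container
# element wrapping a ul.list whose direct children are the li nodes (consistent
# with the selectors above): find() matches all descendants, children() only
# matches direct children.
# item = doc('#container')
# print(item.find('li'))         # every <li> descendant of #container
# print(item.children('.list'))  # only the direct child ul.list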
Example #39
from pyquery import PyQuery as pq

doc = pq(url='https://www.toutiao.com/')
print(doc('title').text())

# Equivalent to the above
# import requests
#
# doc = pq(requests.get('https://www.toutiao.com/').text)
# print(doc('title'))
Example #40
def ox_get_requirements(course_page):
    doc = pq(course_page)
    alvl = doc('div#content-tab--2').children().filter('ul').children().filter('li').eq(0).text()
    hghr = doc('div#content-tab--2').children().filter('ul').children().filter('li').eq(1).text()
    ib = doc('div#content-tab--2').children().filter('ul').children().filter('li').eq(2).text()
    return (alvl + "; " + hghr + "; " + ib)
Example #41
 def get_page(self, url):
     return pq(self.s.get(url).text)
Example #42
import requests
from pyquery import PyQuery as pq
from pathlib import Path
import ff14_tool_fish.fishinfo as fishinfo


url='https://cn.ff14angler.com/index.php'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'}
f = requests.get(url,headers=headers)
html = f.text
html.encode('utf-8')
ff=pq(html)
sel=ff('select[name=fish]').find('option').items()
path = "F:/htmls/fish/"
for i in sel:
    id=i.attr('value')
    if id=='0':
        continue
    elif int(id) <= 3157:
        id_file = Path(path + id + ".html")
        if id_file.exists():
            continue
        print(id)
        url1='https://cn.ff14angler.com/fish/'+id
        print(url1)
        html1 = requests.get(url1).text
        html1.encode('utf-8')
        my_file = Path(path)
        if my_file.exists():
            fishinfo.write_to_file(path + id + ".html", html1)
Example #43
    def _get_total_cases_by_region(self, href, html):
        lga_norm_map = {
            'Locally Acquired—close contact with confirmed case':
            DataTypes.SOURCE_CONFIRMED,
            'Locally acquired—no known contact': DataTypes.SOURCE_COMMUNITY,
            'Locally acquired—contact known': DataTypes.SOURCE_CONFIRMED,
            'Interstate acquired': DataTypes.SOURCE_INTERSTATE,
            'Overseas acquired': DataTypes.SOURCE_OVERSEAS,
            'Under investigation': DataTypes.SOURCE_UNDER_INVESTIGATION,
            'Total': DataTypes.TOTAL,
        }
        hhs_norm_map = {
            'Total cases': DataTypes.TOTAL,
            'Active cases': DataTypes.STATUS_ACTIVE,
            'Total recovered': DataTypes.STATUS_RECOVERED,
            'Total deaths': DataTypes.STATUS_DEATHS
        }
        du = self._get_date(href, html)

        if href == self.STATS_BY_REGION_URL_2:
            regions = []

            # Add by HHS table
            # Total cases | Active cases | Total recovered | Total deaths
            table = pq(html)('#QLD_Cases_By_HHS')[0]
            headers = [
                hhs_norm_map[pq(i).text().strip().strip('[12345]').strip()]
                for i in table[0][0][1:]
            ]
            for tr in table[3]:
                hhs = pq(tr[0]).text().strip().strip('*')

                for xx, td in enumerate(tr[1:]):
                    value = int(pq(td).text().strip().replace(',', ''))

                    regions.append(
                        DataPoint(region_schema=Schemas.HHS,
                                  region_parent='AU-QLD',
                                  region_child=hhs.title(),
                                  datatype=headers[xx],
                                  value=value,
                                  date_updated=du,
                                  source_url=href))

            # Add by LGA table
            # Overseas acquired | Locally acquired—contact known |
            # Locally acquired—no known contact | Interstate acquired |
            # Under investigation | Total
            table = pq(html)('table#LGA')[0]

            headers = [
                lga_norm_map[pq(i).text().strip()] for i in table[0][0][1:]
            ]
            for tr in table[1][1:]:
                lga = pq(tr[0]).text().split('(')[0].strip()

                for xx, td in enumerate(tr[1:]):
                    value = int(pq(td).text().strip().replace(',', ''))

                    regions.append(
                        DataPoint(region_schema=Schemas.LGA,
                                  region_parent='AU-QLD',
                                  region_child=lga.title(),
                                  datatype=headers[xx],
                                  value=value,
                                  date_updated=du,
                                  source_url=href))
            return regions

        else:
            table = pq(pq(html)('table.table.table-bordered.header-basic'))
            if not table:
                return None

            if not 'Total confirmed' in pq(table[0]).text().replace(
                    '\n', ' ').replace('  ', ' '):
                #print("NOT TOTAL:", table.text())
                return None

            regions = []
            for tr in table('tr'):
                if 'total' in pq(tr).text().lower():
                    continue

                tds = pq(tr)('td')
                for x, td in enumerate(tds):
                    if x == 0:
                        # HACK: one day had "271" prefixed to "North West"
                        hhs_region = pq(td).text().strip().lstrip(
                            '271*').strip().strip('*')
                    elif x >= 1:
                        if len(tds) > 2:
                            # New format:
                            # HHS*
                            # Active cases
                            # Recovered cases
                            # Deaths
                            # Total confirmed cases to date
                            datatype = [
                                DataTypes.STATUS_ACTIVE,
                                DataTypes.STATUS_RECOVERED,
                                DataTypes.STATUS_DEATHS, DataTypes.TOTAL
                            ][x - 1]
                        else:
                            datatype = DataTypes.TOTAL

                        try:
                            value = int(pq(td).text().strip())
                            regions.append(
                                DataPoint(region_schema=Schemas.HHS,
                                          region_parent='AU-QLD',
                                          region_child=hhs_region.title(),
                                          datatype=datatype,
                                          value=value,
                                          date_updated=du,
                                          source_url=href))
                        except ValueError:
                            # WARNING!!!
                            pass

            return regions
Example #44
def fetchMiaopaiData():
  uname = '/app/yxtk/script/useragent.txt'
  f1 = open("/app/yxtk/script/data/onlylady.sql",'w',buffering=-1)
  with open(uname) as f:
        useragents = f.readlines()  
  userAgent = random.choice(useragents) 
  headers = {
             'Accept':'application/json, text/javascript, */*; q=0.01',
             'Accept-Encoding':'gzip, deflate, sdch',
             'Accept-Language':'zh-CN,zh;q=0.8',
             'Cache-Control':'max-age=0',
             'Connection':'keep-alive',
             'Host':'http://streetstyle.onlylady.com/',
             'Referer':'http://streetstyle.onlylady.com/',
             'Upgrade-Insecure-Requests':'1',
             'X-Requested-With':'XMLHttpRequest',
             'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'
            }
  while True:
      for j in range(1,12):
          time.sleep(1)
          if j == 1:
            url = 'http://streetstyle.onlylady.com/'
          if j == 2:
            url = 'http://fashion.onlylady.com/'
          if j == 3:
            url = 'http://show.onlylady.com/'
          if j == 4:
            url = 'http://luxury.onlylady.com/'
          if j == 5:
            url = 'http://accessories.onlylady.com/'
          if j == 6:
            url = 'http://jewelry.onlylady.com/'
          if j == 7:
            url = 'http://watch.onlylady.com/'
          if j == 8:
            url = 'http://hufu.onlylady.com/'
          if j == 9:
            url = 'http://zhuangban.onlylady.com/'
          if j == 10:
            url = 'http://hair.onlylady.com/'
          if j == 11:
            url = 'http://body.onlylady.com/'
          print url;
          try:
              encoding_support = ContentEncodingProcessor
              req = urllib2.Request(url)
              res = urllib2.urlopen(req)
              html = res.read()
              res.close()
              doc = pq(html)
              divs = doc('div.c3_r_item')
              for div in divs.items():
                  clo_url = div('div.bt').children('a').attr('href')
                  m = re.findall(r'(\w*[0-9]+)\w*',str(clo_url))
                  clo_id = str(m[2])
                  clo_pic = div('div.img').children('a').children('img').attr('src')
                  print clo_pic
                  clo_title = div('div.bt').children('a').text()
                  clo_title = "\" "+clo_title.replace('\"','\'')+" \""
                  clo_title = clo_title.replace("\n",'')
                  clo_title = clo_title.replace(",",',')
                  print clo_title
                  clo_date = div('div.date').text()
                  imageUrl=qiniuUpdate(clo_pic.strip())

                  req = urllib2.Request(clo_url)
                  res = urllib2.urlopen(req)
                  html1 = unicode(res.read(),'GBK')
                  html1 = re.sub(r'<script>(.*?)</script>','',html1)
                  res.close()
                  doc1 = pq(html1)
                  con = doc1('div.detail_content')
                  con('img').removeAttr("style")
                  con('img').removeAttr("width")
                  con('img').removeAttr("height")
                  con('img').attr("style","width:100%")
                  p = con('div.detail_content').html()
                  if p is None or p =='':
                    continue
                  p = re.sub(r'&#13;','',p)
                  p = re.sub(r'<style.*>([\S\s\t]*?)</style>','',p)
                  p = re.sub(r'<script.*>([\S\s\t]*?)</script>','',p)
                  p = re.sub(r'<p[^>]*>','<p>',p)
                  p = re.sub(r'<(?!img|br|p|/p).*?>','',p)
                  p = re.sub(r'\r','',p)
                  p = re.sub(r'\n','',p)
                  p = re.sub(r'\s','',p)
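                  # stripping all whitespace above also removed the space in
                  # "<img src=", so put a space back before every src attribute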
                  p = re.sub(r'src=',' src=',p)

                  #newqiniu = pq(p)
                  #imgs = newqiniu('img')
                  #for image in imgs.items():
                    #imgurl = image('img').attr('src')
                    #newimgurl = qiniuUpdate(imgurl.strip())
                    #p = p.replace(str(imgurl),str(newimgurl))
                  sql = "INSERT INTO 3rd_clothes(id,creator,modifier,create_time,modify_time,is_deleted,clothes_id,title,clothes_date,img_url,sort,user_id,thumbnail_url,source,tag,content,push_flag,recommend_flag,view_status)VALUES(NULL,'sys','sys',now(),now(),'n','"+clo_id+"'," +clo_title.strip() + ",'" + clo_date.strip() +"','"+clo_pic.strip()+"','0','','"+imageUrl+"','onlylady','','"+p.strip()+"',0,NULL,0);"+'\n'
                  print sql
                  f1.writelines(sql)
                  file_name = urllib2.unquote(clo_pic.strip()).decode('utf8').split('/')[-1]
                  os.remove('/app/yxtk/script/'+file_name)
          except Exception as e:
            print e
      break
  f1.close()
Example #45
0
a = '''
<body>
    <h><a href='www.biaoti.com'>tomtao626</a></h>
    <p class='vota'>段落1</p>
    <p>段落2</p>
</body>
'''

# Extracting content
"""
    To get the text inside a tag, use .text()
    To get the value of a tag attribute, use .attr()
    .text() extracts the content of all descendant nodes; to get just the child
    nodes use .html(), which may return the child node's entire tag markup
"""

doc = pq(a)
# Extract tag content
print(doc('h').text())  # 'tomtao626'
print(doc('h').html())  # '<a href="www.biaoti.com">tomtao626</a>'
print(doc('body').html())  # '\n    <h><a href="www.biaoti.com">tomtao626</a></h>\n    <p class="vota">段落1</p>\n    <p>段落2</p>\n'
print(doc('p').text()) # '段落1 段落2'
print(doc('p').text().split(' '))  # ['段落1', '段落2']
print(doc('p:nth-of-type(1)').text())  # '段落1'
print(doc('body').text())  # 'tomtao626 段落1 段落2'

# Extract tag attributes
print(doc('h a').attr('href'))  # 'www.biaoti.com'
print(doc('p').attr('class'))  # 'vota' (attr() reads the first matched element)
# Selecting tags

# Select purely by tag name
print(doc('p'))  # both <p> elements, with their markup
Example #46
0
    def _get_total_dhr(self, href, html):
        # Technically it's a bad idea to mix these other, unrelated values into
        # DHR here, but separating them out everywhere throughout this class
        # would be far worse!
        sbr2_map = {
            'Number of confirmed cases': DataTypes.TOTAL,
            'Last 24 hours': DataTypes.NEW,
            'Active cases': DataTypes.STATUS_ACTIVE,
            'Recovered': DataTypes.STATUS_RECOVERED,
            'Current hospitalisations': DataTypes.STATUS_HOSPITALIZED,
            'Deaths': DataTypes.STATUS_DEATHS
        }
        du = self._get_date(href, html)

        if href in (self.STATS_BY_REGION_URL, self.STATS_BY_REGION_URL_2):
            # New format as of 22 April
            r = []

            table = pq(html)('#QLD_Cases')
            if table:
                for tr in table[1:]:
                    datatype = sbr2_map.get(
                        pq(tr[0]).text().strip().strip('[12345]').strip())
                    if datatype is None:
                        continue

                    r.append(
                        DataPoint(region_schema=Schemas.ADMIN_1,
                                  region_parent='AU',
                                  region_child='AU-QLD',
                                  datatype=datatype,
                                  value=int(pq(tr[1]).text().strip()),
                                  date_updated=du,
                                  source_url=href))

            deaths = pq(html)('.qh-fact-wrapper .lost span')
            if deaths:
                r.append(
                    DataPoint(region_schema=Schemas.ADMIN_1,
                              region_parent='AU',
                              region_child='AU-QLD',
                              datatype=DataTypes.STATUS_DEATHS,
                              value=int(pq(deaths[0]).text().strip()),
                              date_updated=du,
                              source_url=href))
            return r
        else:
            # As of 9th April, the format has added recovered/deaths etc info
            totals_dict = self.__get_totals_from_table(html)
            if not totals_dict:
                return []

            r = []
            r.append(
                DataPoint(region_schema=Schemas.ADMIN_1,
                          region_parent='AU',
                          region_child='AU-QLD',
                          datatype=DataTypes.STATUS_RECOVERED,
                          value=totals_dict['recovered'],
                          date_updated=du,
                          source_url=href))
            r.append(
                DataPoint(region_schema=Schemas.ADMIN_1,
                          region_parent='AU',
                          region_child='AU-QLD',
                          datatype=DataTypes.STATUS_DEATHS,
                          value=totals_dict['deaths'],
                          date_updated=du,
                          source_url=href))
            r.append(
                DataPoint(region_schema=Schemas.ADMIN_1,
                          region_parent='AU',
                          region_child='AU-QLD',
                          datatype=DataTypes.STATUS_ACTIVE,
                          value=totals_dict['active'],
                          date_updated=du,
                          source_url=href))
            return r
Example #47
0
import requests, json, re
from pyquery import PyQuery as pq

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
}

r = requests.get(
    'https://zh.wikipedia.org/wiki/%E8%87%BA%E7%81%A3%E8%A1%8C%E6%94%BF%E5%8D%80%E4%BA%BA%E5%8F%A3%E5%88%97%E8%A1%A8',
    headers=headers)

doc = pq(r.text)

cities = []

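# Walk the first wikitable row by row; rows whose second cell is empty
# (header rows) are skipped inside the loop.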
for i in doc('.wikitable').eq(0).find('tbody tr'):
    tds = doc(doc(i).find('td'))

    if len(tds.eq(1).text()):
        cities.append({
            'COUNTYNAME':
            tds.eq(1).text(),
            'COUNTYENG':
            tds.eq(2).text(),
            'COUNTYTYPE':
            tds.eq(3).text(),
            'COUNTYFLAG':
            'https:%s' % (tds.eq(4).find('img').eq(0).attr('src')),
            'COUNTYAREA':
            tds.eq(5).text(),
Example #48
0
    def _get_total_cases_tested(self, href, html):
        #if href in (self.STATS_BY_REGION_URL, self.STATS_BY_REGION_URL_2):

        # New format as of 22 April
        tested = pq(html)('.qh-fact-wrapper .tested span')
        if tested:
            return self._extract_number_using_regex(
                compile('([0-9,]+)'),
                pq(tested[0]).text().strip(),
                region_schema=Schemas.ADMIN_1,
                region_parent='AU',
                region_child='AU-QLD',
                date_updated=self._get_date(href, html),
                datatype=DataTypes.TESTS_TOTAL,
                source_url=href)

        # NOTE: This is actually a different page to the press releases!
        #  I needed to get some of these from web.archive.org.
        #  Some of the stats may be a day or more old,
        #  so will need to add the date of the stat as well(!)

        value = self._extract_number_using_regex(
            compile('Total samples tested: <strong>([0-9,]+)'),
            html,
            region_schema=Schemas.ADMIN_1,
            region_parent='AU',
            region_child='AU-QLD',
            date_updated=self._get_date(href, html),
            datatype=DataTypes.TESTS_TOTAL,
            source_url=href)
        if value:
            return value

        # Find the start of the # samples tested table
        th_regex = compile(
            '<th id="table[^"]+">[^<]*?As at ([^<]+)[^<]*?</th>[^<]*'
            '<th id="table[^"]+">[^<]*?(?:Samples|Patients) tested[^<]*?</th>',
            DOTALL | MULTILINE)
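        # For orientation (an illustrative shape only, inferred from the regex
        # above), the header pair being matched looks roughly like:
        #   <th id="table59454r0c1">As at 30 March 2020</th>
        #   <th id="table59454r0c2">Samples tested</th>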
        match = th_regex.search(html)
        if not match:
            #print("NOT INITIAL MATCH!")
            return None  # WARNING: samples-tested header row not found

        # Get the date - it's in format "30 March 2020"
        date_updated = self._extract_date_using_format(match.group(1).strip())
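        # (Illustrative aside, not the original helper: a string like
        # "30 March 2020" could be parsed with
        # datetime.datetime.strptime("30 March 2020", "%d %B %Y").date())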
        slice_from = match.end(1)  # CHECK ME!
        html = html[slice_from:]

        # Get the # samples total
        value = self._extract_number_using_regex(
            compile(
                # Total number changed from being enclosed in a <strong>
                # tag to a <b> tag, so changed to be as broad as NSW
                # <strong>Total</strong></td>
                # <td headers="table59454r1c2"><b>37,334‬</b></td>
                r'<td[^>]*?>(?:<[^</>]+>)?Total(?:</[^<>]+>)?</td>'
                r'[^<]*?<td[^>]*?>.*?([0-9,]+).*?</td>',
                MULTILINE | DOTALL),
            html,
            region_schema=Schemas.ADMIN_1,
            region_parent='AU',
            region_child='AU-QLD',
            date_updated=date_updated,
            datatype=DataTypes.TESTS_TOTAL,
            source_url=href)
        if not value:
            #print("NOT SECOND MATCH!")
            return None  # WARNING: couldn't match the "Total" row in the table
        return value
Example #49
0
 def test_send_message_to_prefilled(self):
     url = urlparams(reverse('messages.new'), to=self.user2.username)
     response = self.client.get(url, follow=True)
     eq_(200, response.status_code)
     eq_(self.user2.username,
         pq(response.content)('#id_to')[0].attrib['value'])
Example #50
0
 def test_public_stats_stats_notes(self):
     addon = amo.tests.addon_factory(public_stats=True)
     response = self.client.get(self.get_public_url(addon))
     assert pq(response.content)('#stats-note h2').length == 1
Example #51
0
# Using PyQuery

from pyquery import PyQuery as pq

html = '''
<ul>
    <li id = "nya">item1</li>
    <li>item2</li>
    <li>item3</li>
    <li>item4</li>
</ul>
'''

# Create an instance

dom = pq(html)

# Using a CSS selector

print("①", dom("#nya"))

# Manipulating the DOM

dom("li").each(lambda i, node: pq(node).attr(class_="fuga"))

print("②", dom)

# Scraping (pq can fetch a URL directly)

dom = pq("http://www.python.org/about/")
Example #52
0
 def test_no_markup_in_message_list(self):
     response = self._test_send_message_to(self.user2.username)
     eq_(
         pq(response.content)('read').text(),
         pq(response.content)('read').html())
Example #53
0
 def test_page(self):
     self._step()
     r = self.client.get(self.url)
     eq_(r.status_code, 200)
     eq_(pq(r.content)('#submit-details').length, 1)
Example #54
0
 def test_send_message_page(self):
     # Make sure page loads.
     response = self.client.get(reverse('messages.new'), follow=True)
     eq_(200, response.status_code)
     assert len(pq(response.content)('#id_message'))
Example #55
0
 def test_hotness_ignore(self):
     # Defaults to ignore compat mode for Fx v10, both are compatible.
     res = self.client.get(self._url(version='10.0'))
     assert res.status_code == 200
     assert pq(res.content)('.featured-addons').length == 2
Example #56
0
 def _get_review_detail_page_source(self, url):
   r = requests.get(url, headers = self.headers, proxies=PROXY)
   r.encoding = 'utf-8'
   page_source = r.text
   return pq(fromstring(page_source))
Example #57
0
 def test_hotness_strict(self):
     # Defaults to strict compat mode, both are within range.
     res = self.client.get(self._url())
     assert res.status_code == 200
     assert pq(res.content)('.featured-addons').length == 2
Example #58
0
 def test_page(self):
     self._step()
     r = self.client.get(self.url)
     eq_(r.status_code, 200)
     eq_(pq(r.content)('#upload-file').length, 1)
Example #59
0
 def test_no_version(self):
     """Don't display a version number for themes."""
     r = self.client.get(self.url)
     assert pq(r.content)('h1 .version') == []
Example #60
0
 def test_hotness_strict_filtered(self):
     # Defaults to strict compat mode, one is within range.
     res = self.client.get(self._url(version='6.0'))
     assert res.status_code == 200
     assert pq(res.content)('.featured-addons').length == 1
     self.assertContains(res, self.addon2.name)