Beispiel #1
0
    def handle_starttag(self, tag, attrs):
        """Track which required start tags have been seen.

        self.tags is a list of entries [cnt, nodes, flags] where
        (structure inferred from this method's own reads/writes —
        confirm against the class initializer):
          - cnt: number of target nodes that must all be matched,
          - nodes: list of [tag_name, seen_flag, required_attrs] items,
          - flags: one-element list; flags[0] is set True once every
            target start tag of the entry has been seen.
        """
        HTMLParser.handle_starttag(self, tag, attrs)
        for el in self.tags:
            cnt = el[0]
            cur_cnt = 0
            for n in el[1]:
                if not n[1]:  # this node's tag has not been seen yet
                    if tag == n[0]:  # current tag is the target node
                        attrdict = dict(attrs)
                        # The node matches only if every required attribute
                        # is present with the expected value.
                        # BUG FIX: dict.has_key() was removed in Python 3;
                        # the `in` operator works in both Python 2 and 3.
                        bhit = all(
                            at in attrdict and attrdict[at] == n[2][at]
                            for at in n[2]
                        )
                        n[1] = bhit
                        if bhit:
                            cur_cnt += 1  # all required attrs matched
                    # Only the first unseen node is considered per tag,
                    # matching or not (preserves original short-circuit).
                    break
                else:
                    cur_cnt += 1

            el[2][0] = (cur_cnt == cnt)  # True once all target start tags seen
Beispiel #2
0
 def handle_starttag(self, tag, attrs):
     """Track the best alternate-feed <link> seen before <body>.

     Keeps the highest-scoring alternate link (scored via
     LinkParser.type_goodness) in self.goodness / self.href, and flips
     self.inbody once the <body> tag is reached.
     """
     if not self.inbody and tag == 'link':
         attr_map = dict(attrs)
         if attr_map.get('rel') == 'alternate' and 'href' in attr_map:
             score = LinkParser.type_goodness.get(attr_map.get('type'), 0)
             if score > self.goodness:
                 self.goodness = score
                 self.href = attr_map['href']
     elif tag == 'body':
         self.inbody = True

     HTMLParser.handle_starttag(self, tag, attrs)
Beispiel #3
0
 def handle_starttag(self, tag, attrs):
     """Spot the course-list <ul> and collect course links from <a> tags.

     When a <ul class="zy_course_listNN"> is seen, self.is_class_list is
     switched on; subsequent <a> tags then contribute
     (ROOT_URL + href-without-last-4-chars + '/', title) pairs to
     self.class_list.
     """
     HTMLParser.handle_starttag(self, tag, attrs)
     if tag == "ul":
         class_values = [value for key, value in attrs if key == 'class']
         if class_values and class_values[0] == 'zy_course_listNN':
             self.is_class_list = True
     elif tag == 'a' and self.is_class_list:
         href, title = attrs[0][1], attrs[1][1]
         # Drop the trailing 4 characters of the href before appending '/'.
         self.class_list.append((ROOT_URL + href[0:-4] + '/', title))
 def handle_starttag(self, tag, attrs):
     """Track article sections and collect section links.

     Sets self.is_article on <article>; inside an article, a <div> with
     class 'lead-img' flips self.is_state and class 'artc-bt' flips
     self.is_class; <a> tags seen while is_class is set are appended to
     self.section_list as (ROOT_URL + href, title).
     """
     HTMLParser.handle_starttag(self, tag, attrs)
     if tag == 'article':
         self.is_article = True
     elif tag == 'div':
         if self.is_article:
             __class = [v for k, v in attrs if k == 'class']
             # BUG FIX: guard against a <div> with no class attribute;
             # the original indexed __class[0] unconditionally and raised
             # IndexError on class-less divs inside an article.
             if __class:
                 if __class[0] == 'lead-img':
                     self.is_state = True
                 elif __class[0] == 'artc-bt':
                     self.is_class = True
     elif tag == 'a':
         if self.is_state:
             pass  # not implemented yet
         elif self.is_class:
             self.section_list.append((ROOT_URL + attrs[0][1], attrs[1][1]))
Beispiel #5
0
 def handle_starttag(self, tag, attrs):
     attrs = dict(attrs)
     if tag == "li" and self._RE_GROUP_ID.search(attrs.get("id", "")):
         self.handle_start_group(tag, attrs)
     elif tag == "span" and self._RE_GROUP_WEIGHT_ID.search(attrs.get("id", "")):
         self.handle_start_group_weight(tag, attrs)
     elif tag == "div" and self._RE_GROUP_NAME_ID.search(attrs.get("id", "")):
         self.handle_start_group_name(tag, attrs)
     elif tag == "li" and self._RE_NETWORK_ID.search(attrs.get("id", "")):
         self.handle_start_network(tag, attrs)
     elif tag == "span" and self._RE_NETWORK_WEIGHT_ID.search(attrs.get("id", "")):
         self.handle_start_network_weight(tag, attrs)
     elif tag == "div" and self._RE_NETWORK_NAME_ID.search(attrs.get("id", "")):
         self.handle_start_network_name(tag, attrs)
     elif tag == "div" and self._RE_NETWORK_DESCRIPTION_ID.search(attrs.get("id", "")):
         self.handle_start_network_description(tag, attrs)
     else:
         HTMLParser.handle_starttag(self, tag, attrs)
Beispiel #6
0
 def handle_starttag(self, tag, attrs):
     """Dispatch a start tag to the specialised handler whose id pattern
     matches (first match in declaration order wins); unmatched tags go
     to the base HTMLParser hook. Handlers receive the attrs as a dict.
     """
     attrs = dict(attrs)
     element_id = attrs.get("id", "")
     if tag == "li" and self._RE_GROUP_ID.search(element_id):
         self.handle_start_group(tag, attrs)
     elif tag == "span" and self._RE_GROUP_WEIGHT_ID.search(element_id):
         self.handle_start_group_weight(tag, attrs)
     elif tag == "div" and self._RE_GROUP_NAME_ID.search(element_id):
         self.handle_start_group_name(tag, attrs)
     elif tag == "li" and self._RE_NETWORK_ID.search(element_id):
         self.handle_start_network(tag, attrs)
     elif tag == "span" and self._RE_NETWORK_WEIGHT_ID.search(element_id):
         self.handle_start_network_weight(tag, attrs)
     elif tag == "div" and self._RE_NETWORK_NAME_ID.search(element_id):
         self.handle_start_network_name(tag, attrs)
     elif tag == "div" and self._RE_NETWORK_DESCRIPTION_ID.search(element_id):
         self.handle_start_network_description(tag, attrs)
     else:
         HTMLParser.handle_starttag(self, tag, attrs)
Beispiel #7
0
def main():
    global download_link
    p = optparse.OptionParser(usage="%prog url...")
    (options, args) = p.parse_args()
    if len(args) != 1:
        p.print_help()
        sys.exit(1)

    url = args[0]
    random.seed()
    ip = '158.250.33.%d' % random.randint(16, 250)
    user_agent = 'Mozilla/5.0 (X11; U; OpenVMS AlphaServer_ES40; en-US; rv:1.4) Gecko/20030826 SWB/V1.4 (HP)'

    req = Request(url)
    req.add_header('X-Forwarded-For', ip)
    req.add_header('User-Agent', user_agent)
    fd = urlopen(req)
    page = fd.read()
    fd.close()

    p = HTMLParser()
    p.handle_starttag = find_download_link
    p.feed(page)
    if not download_link:
        print "Download link not found on %s." % url
        sys.exit(100)

    full_link = urljoin(url, download_link)
    print "============================================================"
    print "* Download link: %s" % full_link
    print "============================================================"
    print

    wget = Popen(['wget',
                  '--header', 'X-Forwarded-For: %s' % ip,
                  '--referer', url,
                  '-U', user_agent,
                  '--content-disposition',
                  full_link])
    os.waitpid(wget.pid, 0)
    sys.exit(0)
Beispiel #8
0
	def Run(self):
		"""Page through an artist's song listing and scan each page.

		While self.Next is truthy, fetches GetSongUrl pages (advancing
		self.Index by PageSize), decodes the JSON payload, and extracts
		the embedded HTML fragment.

		NOTE(review): the visible loop body ends right after decoding
		`content`; the code that consumes it (and that clears self.Next)
		is not shown — this snippet looks truncated.
		"""
		# NOTE: the 'Artsit' typo in this runtime message is preserved as-is.
		print 'Start process Artsit %s ...' % self.Id
		parser = HTMLParser()
		# FindSong is installed as the start-tag callback for parser.feed().
		parser.handle_starttag = self.FindSong
		while(self.Next):
			url = GetSongUrl % (self.Index, self.Id)
			self.Index += PageSize
			raw_content = http_read(url)
			raw_object = None
			if raw_content is None or len(raw_content) == 0:
				continue
			try:
				raw_object = json.loads(raw_content)
			except Exception, e:
				# Best-effort parse: malformed JSON is skipped below.
				pass
			if raw_object is None:
				continue
			content = None
			try:
				content = raw_object['data']['html']
				# The HTML arrives as an escaped string inside the JSON.
				content = content.decode('unicode_escape')
			except Exception, e:
				pass
Beispiel #9
0
        title_ = ""
        for k, v in attrs:
            if k and k == "href" and v.find("/artist/") != -1:
                href_ = v
                artid_ = v[v.find("/artist/") + len("/artist/") :]
            if k and k == "title":
                title_ = v
        if artid_ != "":
            if artid_.isdigit():
                Artist_List_[artid_] = title_
            elif Category_List_Switch_:
                Category_List_.add(PRE_URL_ + href_)


parser = HTMLParser()
# Install the artist-link extractor as the parser's start-tag callback.
parser.handle_starttag = Find_Artist_Link
handle = urllib.urlopen(URL_)
raw_content = handle.read()
handle.close()
parser.feed(raw_content)

print '"' + URL_ + '" has been processed.'

# The first feed() may also have collected category pages (the callback
# adds to Category_List_ while this switch is on); turn the switch off so
# the passes below only collect artists, then scan each category page
# with the same parser/callback.
Category_List_Switch_ = False

for l_ in Category_List_:
    handle = urllib.urlopen(l_)
    raw_content = handle.read()
    handle.close()
    parser.feed(raw_content)
    print '"' + l_ + '" has been processed.'
Beispiel #10
0
 def handle_starttag(self, tag, attrs):
     """Within <section> tags, print the name of every attribute whose
     value is "free"."""
     HTMLParser.handle_starttag(self, tag, attrs)
     if tag != "section":
         return
     for name, value in attrs:
         if value == "free":
             print("k ==============" + name)
Beispiel #11
0
                                        for i in range(0, 3):
                                            if i > 0:
                                                common.log("try download lrc %s again, time: %d" % (songId, i))
                                            if dwn_lrc[0].transfer(lrclink, songId, "text/plain"):
                                                break
                                            elif i == 2:
                                                db_.add_failed(lrclink, songId, "text/plain", 2)
                                    Order_[0] = Order_[0] + 1
                            # Order_[0] = Order_[0] + 1
                            print "song %d has been saved." % songId
                        Find_Song_Switch_[0] = True
        except Exception, e:
            common.log("Find_Song_Link: " + str(e))

    parser = HTMLParser()
    parser.handle_starttag = Find_Song_Link

    for k_ in artist_list:
        print "start process artist %s ..." % k_
        Order_[0] = 0
        SongNameMap = {}
        s_ = 0
        Find_Song_Switch_[0] = True
        while Find_Song_Switch_[0]:
            Find_Song_Switch_[0] = False
            raw_content = common.http_read(GetSongs_URL_Template_ % (s_, k_))
            s_ = s_ + 25
            if raw_content is None:
                continue
            try:
                raw_object = json.loads(raw_content)
Beispiel #12
0
 def handle_starttag(self, tag, attrs):
     """Echo every start tag as "<tag>", then delegate to the base class."""
     HTMLParser.handle_starttag(self, tag, attrs)
     print('<%s>' % tag)
Beispiel #13
0
 def handle_starttag(self, tag, attrs):
     """Print each non-self-closing start tag as "< tag >".

     Relies on HTMLParser.get_starttag_text(), which returns the text of
     the most recently opened tag — so this only works while feed() is
     driving the parser.
     """
     HTMLParser.handle_starttag(self, tag, attrs)
     if not HTMLParser.get_starttag_text(self).endswith("/>"):
         print "<", tag, ">"
Beispiel #14
0
 def handle_starttag(self, tag, attrs):
     """Print each non-self-closing start tag as "< tag >".

     Uses HTMLParser.get_starttag_text(), so it only works while feed()
     is driving the parser.
     """
     HTMLParser.handle_starttag(self, tag, attrs)
     raw_tag_text = HTMLParser.get_starttag_text(self)
     if not raw_tag_text.endswith('/>'):
         print('<', tag, '>')
Beispiel #15
0
 def handle_starttag(self, tag, attrs):
     """Insert text breaks: one newline for <br>, two for <p>."""
     break_map = {u'br': [u'\n'], u'p': [u'\n', u'\n']}
     if tag in break_map:
         self.content.extend(break_map[tag])
     HTMLParser.handle_starttag(self, tag, attrs)
Beispiel #16
0
 def handle_starttag(self, tag, attrs):
     """Pure pass-through to the base HTMLParser implementation."""
     result = HTMLParser.handle_starttag(self, tag, attrs)
     return result
Beispiel #17
0
	def handle_starttag(self, tag, attrs):
		"""On <a> tags, resolve the href against self.url into self.link.

		If several href attributes are present, the last one wins
		(matching the original loop's overwrite behaviour).
		"""
		HTMLParser.handle_starttag(self, tag, attrs)
		if tag != 'a':
			return
		for attr_name, attr_value in attrs:
			if attr_name == 'href':
				self.link = urlparse.urljoin(self.url, attr_value)
Beispiel #18
0
#!/usr/bin/python
from urllib import urlopen
from HTMLParser import HTMLParser
text = urlopen('http://python.org/community/jobs').read()
parser = HTMLParser()
parser.handle_starttag(tag='li')
parser.feed(text)
print attrs
parser.close()
Beispiel #19
0
def handle_starttag_(tag, attrs):
    # Standalone start-tag callback (installed on an HTMLParser instance
    # below): collects per-song download-page URLs from anchors whose
    # href looks like "/song/<id>" (exactly two slashes).
    if tag == 'a':
        for (key, value) in attrs:
            if (key and key == 'href' and value and value.find('/song/') != -1
                    and value.count('/') == 2):
                try:
                    downloadPage_list.append('http://music.baidu.com' + value +
                                             '/download?__o=%2Fartist%2F' +
                                             artist_id)
                except Exception, e:
                    # Drop into the debugger on any failure (dev aid).
                    pdb.set_trace()


parser = HTMLParser()
# Route every start tag seen by feed() through the collector above.
parser.handle_starttag = handle_starttag_
parser.feed(raw_content)

# Accumulator for the second pass (direct download URLs).
download_list = []


def handle_starttag_1(tag, attrs):
    # Second-pass callback: collects direct download URLs from anchors
    # whose href embeds "/data/music/file?link=http://", stripping the
    # leading slash from the stored value.
    if tag == 'a':
        for (key, value) in attrs:
            if (key and key == 'href' and value
                    and value.find('/data/music/file?link=http://') != -1):
                try:
                    download_list.append(
                        value[value.find('/data/music/file?link=') + 1:])
                except Exception, e:
                    # Drop into the debugger on any failure (dev aid).
                    pdb.set_trace()
Beispiel #20
0
                                                    lrclink, songId,
                                                    'text/plain'):
                                                break
                                            elif i == 2:
                                                db_.add_failed(
                                                    lrclink, songId,
                                                    'text/plain', 2)
                                    Order_[0] = Order_[0] + 1
                            #Order_[0] = Order_[0] + 1
                            print 'song %d has been saved.' % songId
                        Find_Song_Switch_[0] = True
        except Exception, e:
            common.log('Find_Song_Link: ' + str(e))

    parser = HTMLParser()
    parser.handle_starttag = Find_Song_Link

    for k_ in artist_list:
        print 'start process artist %s ...' % k_
        Order_[0] = 0
        SongNameMap = {}
        s_ = 0
        Find_Song_Switch_[0] = True
        while (Find_Song_Switch_[0]):
            Find_Song_Switch_[0] = False
            raw_content = common.http_read(GetSongs_URL_Template_ % (s_, k_))
            s_ = s_ + 25
            if raw_content is None:
                continue
            try:
                raw_object = json.loads(raw_content)
Beispiel #21
0
        title_ = ''
        for k, v in attrs:
            if (k and k == 'href' and v.find('/artist/') != -1):
                href_ = v
                artid_ = v[v.find('/artist/') + len('/artist/'):]
            if (k and k == 'title'):
                title_ = v
        if (artid_ != ''):
            if artid_.isdigit():
                Artist_List_[artid_] = title_
            elif Category_List_Switch_:
                Category_List_.add(PRE_URL_ + href_)


parser = HTMLParser()
# Install the artist-link extractor as the parser's start-tag callback.
parser.handle_starttag = Find_Artist_Link
handle = urllib.urlopen(URL_)
raw_content = handle.read()
handle.close()
parser.feed(raw_content)

print '"' + URL_ + '" has been processed.'

# The first feed() may also have collected category pages (the callback
# adds to Category_List_ while this switch is on); turn the switch off so
# the passes below only collect artists, then scan each category page
# with the same parser/callback.
Category_List_Switch_ = False

for l_ in Category_List_:
    handle = urllib.urlopen(l_)
    raw_content = handle.read()
    handle.close()
    parser.feed(raw_content)
    print '"' + l_ + '" has been processed.'
Beispiel #22
0
handle.close()
#print raw_content

downloadPage_list = []

def handle_starttag_(tag, attrs):
    # Standalone start-tag callback (installed on an HTMLParser instance
    # below): collects per-song download-page URLs from anchors whose
    # href looks like "/song/<id>" (exactly two slashes).
    if tag == 'a':
        for (key, value) in attrs:
            if(key and key == 'href' and value and value.find('/song/') != -1 and value.count('/') == 2):
                try:
                    downloadPage_list.append('http://music.baidu.com' + value + '/download?__o=%2Fartist%2F' + artist_id)
                except Exception, e:
                    # Drop into the debugger on any failure (dev aid).
                    pdb.set_trace()

parser = HTMLParser()
# Route every start tag seen by feed() through the collector above.
parser.handle_starttag = handle_starttag_
parser.feed(raw_content)

# Accumulator for the second pass (direct download URLs).
download_list = []

def handle_starttag_1(tag, attrs):
    # Second-pass callback: collects direct download URLs from anchors
    # whose href embeds "/data/music/file?link=http://", stripping the
    # leading slash from the stored value.
    if tag == 'a':
        for (key, value) in attrs:
            if(key and key == 'href' and value and value.find('/data/music/file?link=http://') != -1):
                try:
                    download_list.append(value[value.find('/data/music/file?link=') + 1:])
                except Exception, e:
                    # Drop into the debugger on any failure (dev aid).
                    pdb.set_trace()

# Swap in the second-pass collector before re-feeding the parser.
parser.handle_starttag = handle_starttag_1
 def handle_endtag(self, tag):
     """Echo every end tag, then delegate to the correct base-class hook.

     BUG FIX: the original called HTMLParser.handle_starttag(self, tag),
     which is the wrong hook AND omits the required `attrs` argument
     (TypeError at runtime). Delegate to handle_endtag instead.

     NOTE(review): the printed text is '<tag>' rather than '</tag>' —
     possibly intentional, so it is preserved; confirm with the author.
     """
     HTMLParser.handle_endtag(self, tag)
     print('<' + tag + '>')
Beispiel #24
0
 def handle_starttag(self, tag, attrs):
     """Print each non-self-closing start tag as "< tag >".

     Relies on HTMLParser.get_starttag_text(), which returns the text of
     the most recently opened tag — so this only works while feed() is
     driving the parser.
     """
     HTMLParser.handle_starttag(self, tag, attrs)
     if not HTMLParser.get_starttag_text(self).endswith("/>"):
         print "<",tag,">"
Beispiel #25
0
 def handle_starttag(self, tag, attrs):
     """Print every attribute name/value pair of each <meta> tag."""
     HTMLParser.handle_starttag(self, tag, attrs)
     if tag == 'meta':
         for (K, V) in attrs:
             print K, ':', V