def insert_to(project_url, destination, find_what, indent=0):
    """Fetch a snippet from the project's ``magic/`` endpoint and splice it
    into ``destination`` after every line containing ``find_what``.

    The inserted lines are indented to match the matched line, plus ``indent``
    extra spaces.  If the snippet (ignoring spaces) already appears in the
    file, the file is left untouched.
    """
    snippet_url = ('%smagic/%s' % (project_url, destination)).replace('\\', '/')
    response = urllib2.urlopen(snippet_url)
    if response.getcode() != 200:
        return
    with open(destination, 'r') as handle:
        existing_lines = handle.readlines()
        existing_text = ''.join(existing_lines)
    snippet = HTMLParser().unescape(response.read())
    # Compare with spaces stripped so a re-indented copy still counts as present.
    if snippet.replace(' ', '') in existing_text.replace(' ', ''):
        print_out('IGNORED', destination)
        return
    rewritten = []
    needle = find_what.lower()
    for original_line in existing_lines:
        rewritten.append(original_line)
        if needle in original_line.lower():
            pad = len(original_line) - len(original_line.lstrip())
            rewritten.extend('%s%s\n' % (' ' * (pad + indent), chunk)
                             for chunk in snippet.split('\n') if chunk)
    with open(destination, 'w') as handle:
        handle.writelines(rewritten)
    print_out('INSERT', destination)
def get_pagecount(self): u"""Получить количество сниппетов результатов выдачи """ pagecount = 0 patterns = ( ur'<div[^>]+resultStats(?:[^>]+)?>Результатов: примерно (.*?)<', ur'<div[^>]+resultStats(?:[^>]+)?>(.*?)<nobr>', ur'<div[^>]+resultStats(?:[^>]+)?>Результатов:(.*?)</div>', ur'<div>Результатов:\s*(.*?)</div>', ur'Результатов:\s*(.*?),.*?</div>', ur'из примерно <b>(.*?)</b>', ur'<div>Результаты:.*?из\s*<b>\s*(\d+)\s*</b>') response = self.content for pattern in patterns: res = re.findall( pattern, response, re.DOTALL | re.IGNORECASE | re.UNICODE | re.MULTILINE) if res: # html-символы могут встречаться в виде своих числовых кодов (например, =  ) # избавимся от закодированных последовательностей в результирующей строке res_str = HTMLParser().unescape(res[0]) results_ints = re.findall(r'\d+', res_str.split(',')[0]) if results_ints: return int(''.join(results_ints)) return pagecount
def parse_detail(self, response):
    """Parse a book-detail JSON response and yield a populated MaterialSourceItem.

    Skips responses that are not valid JSON (logging the raw body), that carry
    an error ``code`` field, or that describe a locked book.
    """
    try:
        detail_json = json.loads(response.body)
    except ValueError:
        self.logger.error('[NO JSON]' + response.url)
        # BUG FIX: open in append mode -- mode 'w' truncated the log on every
        # failure, so only the most recent undecodable body was ever kept.
        with open('no_json_decoded.log', 'a') as f:
            f.write(response.body)
        return
    if 'code' in detail_json:
        # The API signals an error response with a 'code' field.
        return
    if int(detail_json['islock']):
        self.logger.debug('[THIS BOOK IS CLOCK] ' + response.url)
        return
    item = MaterialSourceItem()
    item.update(response.meta['item'])
    item['relate_id'] = '%s_%s' % (item['source_id'], item['book_id'])
    item['url'] = 'http://www.jjwxc.net/onebook.php?novelid={}'.format(
        item['book_id'])
    item['folder_url'] = detail_json['novelCover']
    item['title'] = detail_json['novelName']
    item['author'] = detail_json['authorName']
    item['gender'] = u'女性向小说'
    introduction = HTMLParser().unescape(detail_json['novelIntro'])
    item['introduction'] = '\n'.join(
        p.strip() for p in introduction.split('<br/>') if p != '')
    item['created_at'] = now_date()
    item['updated_at'] = today_date()
    yield item
def parse(self): # remove punctuation from artist/title clean_artist = Util.remove_punctuation(self.artist) clean_title = Util.remove_punctuation(self.title) # create lyrics Url url = "http://www.metrolyrics.com/" + clean_title.replace(" ", "-") + "-lyrics-" + clean_artist.replace(" ", "-") + ".html" try: resp = urllib2.urlopen(url, None, 3).read() except: return "" # verify title title = resp start = title.find("<title>") if start == -1: return "" title = title[(start+7):] end = title.find(" LYRICS</title>") if end == -1: return "" title = title[:end] title = HTMLParser().unescape(title) songdata = title.split(" - ") try: if self.artist != songdata[0].lower() or self.title != songdata[1].lower(): print "wrong artist/title! " + songdata[0].lower() + " - " + songdata[1].lower() return "" except: return "" self.lyrics = self.get_lyrics(resp) self.lyrics = string.capwords(self.lyrics, "\n").strip() return self.lyrics
def on_new_comment(self):
    """React to a bors failure comment by posting the failing-test section
    of each build's stdio log back as an indented comment.
    """
    if self.api.sender != self.config["bors_name"]:
        return
    failure_patterns = self.config.get("failure_comment_patterns", [])
    if not any(re.search(p, self.api.comment) for p in failure_patterns):
        return
    urls = re.findall(r'.*\((.*)\)', self.api.comment)
    if not urls:
        return
    # Substitute and get the new url
    # (e.g. http://build.servo.org/json/builders/linux2/builds/2627)
    build_json_url = re.sub(r'(.*)(builders/.*)', r'\1json/\2', urls[0])
    raw_json = self.api.get_page_content(build_json_url)
    if not raw_json:
        return
    build_stats = json.loads(raw_json)
    failure_regex = r'Tests with unexpected results:\n(.*)\n</span><span'
    collected = []
    for step in build_stats['steps']:
        for log_name, log_url in step['logs']:
            if log_name != 'stdio':
                continue
            stdio = self.api.get_page_content(log_url)
            matches = re.findall(failure_regex, stdio, re.DOTALL)
            if not matches:
                continue
            try:
                failure_text = HTMLParser().unescape(matches[0])
            except UnicodeDecodeError:
                failure_text = HTMLParser().unescape(matches[0].decode('utf-8'))
            if 'css' in failure_text:
                # NOTE(review): passes the full findall list, not urls[0] --
                # looks intentional-ish but worth confirming.
                self._check_css_failures(urls)
            collected.extend(' ' * 4 + line for line in failure_text.split('\n'))
    if collected:
        self.api.post_comment('\n'.join(collected))
def parse(self): # remove punctuation from artist/title clean_artist = self.artist clean_title = self.title for c in string.punctuation: clean_artist = clean_artist.replace(c, "") clean_title = clean_title.replace(c, "") # create lyrics Url url = "http://www.metrolyrics.com/" + clean_title.replace(" ", "-") + "-lyrics-" + clean_artist.replace(" ", "-") + ".html" print "metrolyrics Url " + url try: resp = urllib2.urlopen(url, None, 3).read() except: print "could not connect to metrolyrics.com" return "" # verify title title = resp start = title.find("<title>") if start == -1: print "no title found" return "" title = title[(start+7):] end = title.find(" LYRICS</title>") if end == -1: print "no title end found" return "" title = title[:end] title = HTMLParser().unescape(title) songdata = title.split(" - ") try: if self.artist != songdata[0].lower() or self.title != songdata[1].lower(): print "wrong artist/title! " + songdata[0].lower() + " - " + songdata[1].lower() return "" except: print "incomplete artist/title" return "" self.lyrics = self.get_lyrics(resp) self.lyrics = string.capwords(self.lyrics, "\n").strip() return self.lyrics
def check_failure_log(api, bors_comment):
    """Post the 'Tests with unexpected results' section of a failed build's
    stdio log back as an indented comment.

    bors_comment would be something like,
    ":broken_heart: Test failed - [linux2](http://build.servo.org/builders/linux2/builds/2627)"  # noqa
    ... from which we get the relevant build result url.
    """
    # BUG FIX: iter(...).next() raised StopIteration when the comment had no
    # "(url)" part, so the `if not url` guard was unreachable for that case;
    # next(iterable, default) degrades gracefully instead.
    url = next(iter(re.findall(r'.*\((.*)\)', str(bors_comment))), None)
    if not url:
        return
    # Substitute and get the new url
    # (e.g. http://build.servo.org/json/builders/linux2/builds/2627)
    json_url = re.sub(r'(.*)(builders/.*)', r'\1json/\2', url)
    json_stuff = api.get_page_content(json_url)
    if not json_stuff:
        return
    build_stats = json.loads(json_stuff)
    # Locate the logs of the first failed step.
    build_log = []
    for step in build_stats['steps']:
        if 'failed' in step['text']:
            build_log = step['logs']
            break
    failed_url = None
    for (name, log_url) in build_log:
        if name == 'stdio':
            failed_url = log_url
            break
    if not failed_url:
        return
    stdio = api.get_page_content(failed_url)
    failure_regex = r'.*Tests with unexpected results:\n(.*)\n</span><span'
    # BUG FIX: an empty findall here also used to raise StopIteration, and a
    # missing match would have crashed unescape(); guard before decoding.
    failures = next(iter(re.findall(failure_regex, stdio, re.DOTALL)), None)
    if not failures:
        return
    failures = HTMLParser().unescape(failures)
    if failures:
        comments = [' ' * 4 + line for line in failures.split('\n')]
        api.post_comment('\n'.join(comments))
def parse_detail(self, response):
    """Parse a book-detail JSON response and yield a populated BookListItem.

    Skips responses that are not valid JSON (logging the raw body), API error
    responses (with a 'code' field), locked books, and unsigned books.
    """
    try:
        detail_json = json.loads(response.body)
    except ValueError:
        self.logger.error('[NO JSON]' + response.url)
        # BUG FIX: open in append mode -- mode 'w' truncated the log on every
        # failure, so only the most recent undecodable body was ever kept.
        with open('no_json_decoded.log', 'a') as f:
            f.write(response.body)
        return
    if 'code' in detail_json:
        # The API signals an error response with a 'code' field.
        return
    if int(detail_json['islock']):
        self.logger.debug('[THIS BOOK IS CLOCK] ' + response.url)
        return
    if int(detail_json['isSign']) == 0:
        self.logger.debug('[NOT SIGN] ' + response.url)
        return
    item = BookListItem()
    item['published_at'] = response.meta['published_at']
    item['book_id'] = response.meta['book_id']
    item['source_id'] = 7
    item['url'] = 'http://www.jjwxc.net/onebook.php?novelid={}'.format(item['book_id'])
    item['folder_url'] = detail_json['novelCover']
    item['title'] = detail_json['novelName']
    item['author'] = detail_json['authorName']
    item['author_id'] = detail_json['authorId']
    if detail_json['novelClass'] == '':
        item['category'] = ''
        item['sub_category'] = ''
    else:
        item['category'] = detail_json['novelClass']
        item['sub_category'] = ''  # detail_json['novelClass'].split('-')[-1]
    introduction = HTMLParser().unescape(detail_json['novelIntro'])
    item['introduction'] = '\n'.join(
        p.strip() for p in introduction.split('<br/>') if p != '')
    item['status'] = 1
    item['created_at'] = self.today
    item['updated_at'] = self.today
    yield item
def get_pagecount(self):
    u"""Get the number of result snippets reported on the results page.

    Tries each known results-count markup pattern in turn against
    ``self.content`` and returns the first count found; returns 0 when no
    pattern matches or no digits can be extracted.
    """
    pagecount = 0
    patterns = (ur'<div[^>]+resultStats(?:[^>]+)?>Результатов: примерно (.*?)<',
                ur'<div[^>]+resultStats(?:[^>]+)?>(.*?)<nobr>',
                ur'<div[^>]+resultStats(?:[^>]+)?>Результатов:(.*?)</div>',
                ur'<div>Результатов:\s*(.*?)</div>',
                ur'Результатов:\s*(.*?),.*?</div>',
                ur'из примерно <b>(.*?)</b>',
                ur'<div>Результаты:.*?из\s*<b>\s*(\d+)\s*</b>')
    response = self.content
    for pattern in patterns:
        res = re.findall(pattern, response, re.DOTALL | re.IGNORECASE | re.UNICODE | re.MULTILINE)
        if res:
            # HTML characters may appear as their numeric entity codes
            # (e.g. &#160;); decode the encoded sequences in the result string.
            res_str = HTMLParser().unescape(res[0])
            # Keep only the part before the first comma, then join its digits
            # (the count may be rendered with thousands separators).
            results_ints = re.findall(r'\d+', res_str.split(',')[0])
            if results_ints:
                return int(''.join(results_ints))
    return pagecount
def welcomemsg(html):
    """Extract the welcome message parts from the ``msg`` query parameter of
    the response's URL.

    Returns the HTML-unescaped message, stripped of '<b>' and the
    'WELCOME ' prefix, split on '</b>' into a list.
    """
    # BUG FIX: parse_qs was fed the whole URL, so the first query parameter's
    # key came out as "scheme://host/path?name" -- 'msg' was only found when
    # it was not the first parameter.  Parse the query component explicitly.
    query = urlparse.parse_qs(urlparse.urlparse(html.geturl()).query)
    messg = HTMLParser().unescape(query['msg'][0])
    messg = messg.replace('<b>', '').replace('WELCOME ', '')
    return messg.split('</b>')
[i.extract() for i in c] s = soup.findAll('style') [i.extract() for i in s] try: texts = ''.join(soup.findAll(text=True)) texts = string.replace(texts, '\r', '\n') from HTMLParser import HTMLParser texts = HTMLParser().unescape(texts) except Exception, err: _logger.error( 'BeautifulSoup created but it failed to process doc, url: ' + doc['url'] + '\n' + traceback.format_exc()) return [] else: return [ re.sub(r'\s+', ' ', text.strip()) for text in texts.split('\n\n') ] def interpret(inpath, outpath): with open(inpath, "rb") as crawled_docs: docs = cPickle.load(crawled_docs) _logger.info('found ' + str(len(docs)) + ' docs from crawler\'s output') output_str = u'' for doc in docs: _logger.info('processing doc from url: ' + doc['url']) contents = parse_html(doc) output_str += unicode(doc['url'] + '\n\n' + '+' * 100 + '\n\n') for paragraph in contents: if is_valid_text(paragraph):
class Artist(object):
    """
    This class represents an Artist. It is created knowing only its ID.
    To reduce API accesses, load information using Artist.update_data()
    only as needed.
    """

    def __init__(self, id, parent_api):
        # id: numeric artist id (may be non-positive when only a name is known).
        # parent_api: gateway object providing request() and the shared caches.
        self.id = id
        self.parent_api = parent_api
        self.name = None
        self.notifications_enabled = None
        self.has_bookmarked = None
        self.image = None
        self.body = None
        self.vanity_house = None
        self.tags = []
        self.similar_artists_and_score = {}
        self.statistics = None
        self.torrent_groups = []
        self.requests = []
        self.parent_api.cached_artists[self.id] = self  # add self to cache of known Artist objects

    def update_data(self):
        # Prefer lookup by positive id; otherwise fall back to a name lookup,
        # retrying with only the first artist of an "A & B" name on failure.
        if self.id > 0:
            response = self.parent_api.request(action='artist', id=self.id)
        elif self.name:
            self.name = HTMLParser().unescape(self.name)
            try:
                response = self.parent_api.request(action='artist', artistname=self.name)
            except Exception:
                self.name = self.name.split(" & ")[0]
                response = self.parent_api.request(action='artist', artistname=self.name)
        else:
            raise InvalidArtistException("Neither ID or Artist Name is valid, can't update data.")
        self.set_data(response)

    def set_data(self, artist_json_response):
        # Populate every attribute from an 'artist' API response dict, and
        # register/refresh related objects via the parent_api caches.
        if self.id > 0 and self.id != artist_json_response['id']:
            raise InvalidArtistException("Tried to update an artists's information from an 'artist' API call with a different id."
                                         + " Should be %s, got %s" % (self.id, artist_json_response['id']) )
        elif self.name:
            # Looked up by name: adopt the id from the response and cache self.
            self.id = artist_json_response['id']
            self.parent_api.cached_artists[self.id] = self
        self.name = HTMLParser().unescape(artist_json_response['name'])
        self.notifications_enabled = artist_json_response['notificationsEnabled']
        self.has_bookmarked = artist_json_response['hasBookmarked']
        self.image = artist_json_response['image']
        self.body = artist_json_response['body']
        self.vanity_house = artist_json_response['vanityHouse']
        self.tags = []
        for tag_dict in artist_json_response['tags']:
            tag = self.parent_api.get_tag(tag_dict['name'])
            tag.set_artist_count(self, tag_dict['count'])
            self.tags.append(tag)
        self.similar_artists_and_score = {}
        for similar_artist_dict in artist_json_response['similarArtists']:
            similar_artist = self.parent_api.get_artist(similar_artist_dict['artistId'])
            similar_artist.name = similar_artist_dict['name']
            self.similar_artists_and_score[similar_artist] = similar_artist_dict['score']
        self.statistics = artist_json_response['statistics']
        self.torrent_groups = []
        for torrent_group_item in artist_json_response['torrentgroup']:
            torrent_group = self.parent_api.get_torrent_group(torrent_group_item['groupId'])
            torrent_group.set_artist_group_data(torrent_group_item)
            self.torrent_groups.append(torrent_group)
        self.requests = []
        for request_json_item in artist_json_response['requests']:
            request = self.parent_api.get_request(request_json_item['requestId'])
            request.set_data(request_json_item)
            self.requests.append(request)

    def __repr__(self):
        return "Artist: %s - ID: %s" % (self.name, self.id)
class Artist(object):
    """Represents a single Artist, created knowing only its ID.

    Attribute data is loaded lazily: call ``update_data()`` only when the
    full record is needed, to keep API traffic down.
    """

    def __init__(self, id, parent_api):
        self.id = id
        self.parent_api = parent_api
        self.name = None
        self.notifications_enabled = None
        self.has_bookmarked = None
        self.image = None
        self.body = None
        self.vanity_house = None
        self.tags = []
        self.similar_artists_and_score = {}
        self.statistics = None
        self.torrent_groups = []
        self.requests = []
        # Register this instance in the shared cache of known Artist objects.
        self.parent_api.cached_artists[self.id] = self

    def update_data(self):
        """Fetch this artist's record, by id when available, else by name."""
        if self.id > 0:
            response = self.parent_api.request(action='artist', id=self.id)
        elif self.name:
            self.name = HTMLParser().unescape(self.name)
            try:
                response = self.parent_api.request(
                    action='artist', artistname=self.name)
            except Exception:
                # A combined "A & B" name can fail; retry with the first name.
                self.name = self.name.split(" & ")[0]
                response = self.parent_api.request(
                    action='artist', artistname=self.name)
        else:
            raise InvalidArtistException(
                "Neither ID or Artist Name is valid, can't update data.")
        self.set_data(response)

    def set_data(self, artist_json_response):
        """Populate all attributes from an 'artist' API response dict."""
        if self.id > 0 and self.id != artist_json_response['id']:
            raise InvalidArtistException(
                "Tried to update an artists's information from an 'artist' API call with a different id." + " Should be %s, got %s" % (self.id, artist_json_response['id']))
        elif self.name:
            # Looked up by name: adopt the response's id and cache ourselves.
            self.id = artist_json_response['id']
            self.parent_api.cached_artists[self.id] = self
        self.name = HTMLParser().unescape(artist_json_response['name'])
        self.notifications_enabled = artist_json_response['notificationsEnabled']
        self.has_bookmarked = artist_json_response['hasBookmarked']
        self.image = artist_json_response['image']
        self.body = artist_json_response['body']
        self.vanity_house = artist_json_response['vanityHouse']
        self.tags = []
        for tag_info in artist_json_response['tags']:
            tag = self.parent_api.get_tag(tag_info['name'])
            tag.set_artist_count(self, tag_info['count'])
            self.tags.append(tag)
        self.similar_artists_and_score = {}
        for similar_info in artist_json_response['similarArtists']:
            similar = self.parent_api.get_artist(similar_info['artistId'])
            similar.name = similar_info['name']
            self.similar_artists_and_score[similar] = similar_info['score']
        self.statistics = artist_json_response['statistics']
        self.torrent_groups = []
        for group_info in artist_json_response['torrentgroup']:
            group = self.parent_api.get_torrent_group(group_info['groupId'])
            group.set_artist_group_data(group_info)
            self.torrent_groups.append(group)
        self.requests = []
        for request_info in artist_json_response['requests']:
            req = self.parent_api.get_request(request_info['requestId'])
            req.set_data(request_info)
            self.requests.append(req)

    def __repr__(self):
        return "Artist: %s - ID: %s" % (self.name, self.id)