def _split(inputfile, outputdir):
    """Split a reveal.js-style HTML presentation into per-slide dumps.

    Top-level ``<section>`` elements carrying the ``stack`` class are
    vertical stacks: each stack gets its own zero-padded, numbered
    sub-directory of *outputdir* and every nested ``<section>`` inside
    it is dumped there.  Plain top-level sections are dumped directly
    into *outputdir*.  Slide numbering is global across the whole file.

    Params:
        inputfile (str): path of the HTML file to read.
        outputdir (str): directory receiving the per-slide dumps;
            created (with parents) if it does not exist.
    """
    # Context manager guarantees the handle is closed even on error
    # (the original open/read/close leaked the handle on exception).
    with open(inputfile, 'r') as source:
        html = source.read()
    # exist_ok avoids the racy isdir()-then-mkdir check.
    os.makedirs(outputdir, exist_ok=True)
    idx_slide = 0
    idx_section = 0
    parsed = PyQuery(html)
    for section in parsed('section'):
        slide = PyQuery(section)
        if slide.has_class('stack'):
            # A vertical stack: numbered sub-directory, one file per
            # nested slide.
            idx_section += 1
            stack_path = os.path.join(outputdir, '%02d' % idx_section)
            os.mkdir(stack_path)
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide += 1
                _dump_slide(sub_slide, idx_slide, stack_path)
        else:
            # Sections whose parent is a stack were already handled
            # through the stack branch above; skip them here.
            if not slide.parent().has_class('stack'):
                idx_slide += 1
                _dump_slide(slide, idx_slide, outputdir)
def is_div(partial, cls_name=None, id_name=None):
    """Helper function to detect if we have a well formatted div partial.

    Params:
        partial (str): an HTML content (partial HTML code) page to test.
        cls_name (str|None): if not `None` the name of the class that
            the div in `partial` must have.
        id_name (str|None): if not `None` the name of the id that the
            div in `partial` must have.

    Returns:
        bool: True if `partial` is a well formatted div page with the
        provided class (if provided) and id (if provided), False if not.

    Examples:
    >>> is_div("<div>Plop</div>")
    True
    >>> is_div("<span>Plop</span>")
    False
    >>> is_div("<!DOCTYPE html><html>Hello</html>")
    False
    >>> is_div('<div class="useful">Plop</div>', "useful")
    True
    >>> is_div('<div class="useless">Plop</div>', "useful")
    False
    >>> is_div('<div class="useful" id="cat">Plop</div>', "useful", "cat")
    True
    >>> is_div('<div class="useful" id="dog">Plop</div>', "useful", "cat")
    False
    >>> is_div('<div class="useful">Plop</div>', "useful", "cat")
    False
    """
    d = PyQuery(partial)
    div_ok = d.is_("div")
    # An omitted class/id constraint is vacuously satisfied.
    cls_ok = d.has_class(cls_name) if cls_name else True
    id_ok = d.is_("#%s" % id_name) if id_name else True
    return div_ok and cls_ok and id_ok
def parse_status(status_div):
    """Parse one Douban status ``<div>`` into a detail dict.

    Returns a ``(detail, reshared_detail)`` tuple; both are ``None``
    when the original status has been deleted.

    ``object_kind`` code reference (translated from the original note):
        1001: book            1002: movie           1003: music
        1005: followed friend 1011: event           1012: comment
        1013: group topic     1014: (movie) discussion
        1015: diary           1018: text/image broadcast
        1019: group           1020: doulist         1021: 9dian article
        1022: web page        1025: album photo     1026: album
        1043: filmmaker       1044: artist          1062: board(???)
        2001: online event    2004: site video      3043: Douban FM track
        3049: reading note    3065: subject entry   3072: Douban FM channel
        3090: "Dongxi" item   3114: game            5021: Douban Read image
        5022: Douban Read work

    NOTE(review): this function references ``self`` and ``now`` that are
    not parameters -- presumably it was lifted from a method body and
    relies on enclosing/module scope; confirm before reusing standalone.
    """
    if not isinstance(status_div, PyQuery):
        status_div = PyQuery(status_div)
    # Defaults for fields that may be absent from the markup.
    reshared_count = 0
    like_count = 0
    comments_count = 0
    created_at = None
    is_noreply = False
    status_url = None
    target_type = None
    object_kind = None
    object_id = None
    reshared_detail = None
    blockquote = None
    douban_user_id = status_div.attr('data-uid')
    douban_id = status_div.attr('data-sid')
    is_saying = status_div.has_class('saying')
    is_reshared = status_div.has_class('status-reshared-wrapper')
    try:
        created_span = status_div.find('.actions>.created_at')[0]
    except:
        # No .created_at node: treated as a no-reply status.
        is_noreply = True
    try:
        """ 获取广播链接 """
        # Extract the permalink of the status (first action link).
        exactly_link = PyQuery(status_div.find('.actions a').eq(0))
        status_url = exactly_link.attr('href')
    except:
        pass
    try:
        """ 获取关于广播类型的属性 """
        # Extract the type-related data-* attributes of the status.
        status_item_div = PyQuery(
            status_div.find('.status-item').eq(0))
        target_type = status_item_div.attr('data-target-type')
        object_kind = status_item_div.attr('data-object-kind')
        object_id = status_item_div.attr('data-object-id')
        if not douban_user_id:
            douban_user_id = status_item_div.attr('data-uid')
        if not douban_id:
            douban_id = status_div.attr('data-sid')
        blockquote = PyQuery(status_item_div.find('blockquote')).html()
    except:
        pass
    if not is_noreply:
        """ 获取创建时间、回复、点赞、转播数 """
        # Extract creation time plus comment / like / reshare counts.
        try:
            created_at = PyQuery(created_span).attr('title')
            reply_link = PyQuery(
                status_item_div.find('.actions>.new-reply'))
            comments_count = reply_link.attr('data-count')
            like_span = PyQuery(
                status_item_div.find('.actions>.like-count'))
            like_count = like_span.attr('data-count')
            if like_count is None:
                # Fall back to parsing the visible "赞(N)" label text.
                try:
                    like_count = int(
                        re.match(r'赞\((.*)\)',
                                 like_span.text().strip())[1])
                except:
                    like_count = 0
            reshared_span = PyQuery(
                status_item_div.find('.actions>.reshared-count'))
            reshared_count = reshared_span.attr('data-count')
            if reshared_count is None:
                reshared_count = 0
        except:
            pass
    if not douban_id or douban_id == 'None':
        """ 原广播已被删除 """
        # The original status has been deleted.
        return None, None
    detail = {
        'douban_id': douban_id,
        'douban_user_id': douban_user_id,
        'content': status_div.outer_html(),
        'created': created_at,
        'is_reshared': is_reshared,
        'is_saying': is_saying,
        'is_noreply': is_noreply,
        'updated_at': now,
        'reshared_count': reshared_count,
        'like_count': like_count,
        'comments_count': comments_count,
        'status_url': status_url,
        'target_type': target_type,
        'object_kind': object_kind,
        'object_id': object_id,
        'user': self.fetch_user_by_id(douban_user_id),
        'blockquote': blockquote,
    }
    if is_reshared:
        # Recursively parse the wrapped original status.
        reshared_status_div = PyQuery(
            status_div.find('.status-real-wrapper').eq(0))
        reshared_detail, _ = parse_status(reshared_status_div)
        if reshared_detail:
            detail['reshared_id'] = reshared_detail['douban_id']
    if target_type == 'sns':
        # Plain saying: collect attached images (two possible layouts).
        attachments = []
        images = status_div.find(
            '.attachments-saying.group-pics a.view-large')
        for img_lnk in images:
            attachments.append({
                'type': 'image',
                'url': PyQuery(img_lnk).attr('href'),
            })
        images = status_div.find(
            '.attachments-saying.attachments-pic img')
        for img in images:
            img_lnk = PyQuery(img).attr('data-raw-src')
            if img_lnk:
                attachments.append({
                    'type': 'image',
                    'url': img_lnk,
                })
        if attachments:
            self.save_attachments(attachments)
            detail['attachments'] = attachments
    elif target_type == 'movie' and object_kind == '1002':
        self.fetch_movie(object_id)
    elif target_type == 'book' and object_kind == '1001':
        self.fetch_book(object_id)
    elif target_type == 'music' and object_kind == '1003':
        self.fetch_music(object_id)
    return detail, reshared_detail
def rastreapartido(url):
    """Scrape one match page and build a ``Partido`` with its events.

    Downloads the page, reads teams / score / matchday from the header,
    then walks both player lists extracting goals and cards.  When a
    scraped player name is not found in the database, the function is
    INTERACTIVE: it prompts on stdin to map or create the player.

    Params:
        url (str): match page URL (comuniazo-style markup assumed).

    Returns:
        Partido: the populated match object.
    """
    request = urllib.request.Request(
        url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'})
    html = PyQuery(urllib.request.urlopen(request).read().decode('utf-8'))
    local = html('.home').text()
    visitante = html('.away').text()
    goles_local = html('.score>span:first').text()
    goles_visitante = html('.score>span:last').text()
    jornada = html('.mid>.info').text()
    # Keep only the first line of the info block (the matchday label).
    njornada = jornada[:jornada.find("\n")]
    partido = Partido(local, visitante, goles_local, goles_visitante,
                      njornada)
    equipos = {"local": local, "visitante": visitante}
    equipo_actual = "local"
    # the first iteration parses the left-hand side, the home team
    print("Rastreando {local} - {visitante}".format(local=local,
                                                    visitante=visitante))
    for equipo in html('.player-list').children('ul'):
        for item in PyQuery(equipo).children('li'):
            jugador = PyQuery(item)
            # NOTE(review): PyQuery selections are never None, so this
            # guard is always True -- presumably meant as a truthiness
            # check; confirm against the markup.
            if jugador('.events') is not None:
                for eventoHtml in jugador('.events').children('i'):
                    evento = PyQuery(eventoHtml)
                    if evento.has_class('icon-circle') or evento.has_class('icon-up-circled') \
                            or evento.has_class('icon-stop') or evento.has_class('icon-half-square'):
                        """ solo interesan los goles y las tarjetas,
                        para los cambios y demas no se hace nada """
                        # Only goals and cards matter; substitutions and
                        # other events are ignored.
                        id_jugador = Jugadores.get_jugador(
                            jugador('strong').text(),
                            equipos[equipo_actual])
                        if id_jugador is None:
                            """ si no se encuentra en nombre que se saca
                            de comuniazo en la bbdd, se busca el
                            equivalente y se reeemplaza para que siempre
                            concuerde con el que viene de comuniazo """
                            # Name scraped from comuniazo is missing in
                            # the DB: ask the operator to map it so it
                            # always matches from now on.
                            print("Jugador no relacionado: {}".format(
                                jugador('strong').text()))
                            print("jugadores del {}:".format(
                                equipos[equipo_actual]))
                            for posible in Jugadores.get_jugadores(
                                    equipos[equipo_actual]):
                                print("{}: {}".format(
                                    posible['id'], posible['nombre']))
                            id_jugador = input(
                                "cual corresponde ({})?: ".format(
                                    jugador('strong').text()))
                            if int(id_jugador) > 0:
                                """ si se le pone el id_jugador -1, es
                                que el jugador del evento ya no está en
                                la liga, se pasará a la API sin id, solo
                                contara el evento para las estadisticas
                                no quien es el protagonista """
                                # id -1 means the player left the
                                # league: the event still counts for
                                # stats but carries no player id.
                                Jugadores.reemplazar(
                                    id_jugador, jugador('strong').text())
                            elif int(id_jugador) == 0:
                                """ se introduce un nuevo jugador si no
                                se encuentra en la lista """
                                # id 0: create a brand-new player from
                                # interactive input.
                                print("Nuevo jugador:")
                                print("\tnombre: {}".format(
                                    jugador('strong').text()))
                                posicion = input("\tposicion: ")
                                dorsal = int(input("\tdorsal: "))
                                fecha_nacimiento = input(
                                    "\tfecha nacimiento: ")
                                nacionalidad = input("\tnacionalidad: ")
                                pais = input("\tpais de nacimiento: ")
                                id_jugador = Jugadores.nuevo({
                                    'equipo': equipos[equipo_actual],
                                    'nombre': jugador('strong').text(),
                                    'posicion': posicion,
                                    'dorsal': dorsal,
                                    'nacionalidad': nacionalidad,
                                    'pais_nacimiento': pais,
                                    'fecha_nacimiento': fecha_nacimiento
                                })
                        if evento.has_class('icon-circle'):
                            """ gol """
                            # Goal; "en pp" in the title marks an own
                            # goal, minute follows the word 'minuto'.
                            texto = evento.attr('title')
                            propia_meta = "en pp" in texto
                            minuto = texto[texto.find('minuto') + 7:]
                            partido.addgol(
                                Gol(id_jugador, minuto, False,
                                    propia_meta))
                        elif evento.has_class('icon-up-circled'):
                            """ penalti """
                            # Penalty goal; minute is the 5th token of
                            # the title.
                            texto = evento.attr('title').split(" ")
                            partido.addgol(
                                Gol(id_jugador, texto[4], True, False))
                        elif evento.has_class('icon-stop'):
                            """ tarjeta amarilla o roja """
                            # Yellow or red card; colour is token 2,
                            # minute is token 4 of the title.
                            texto = evento.attr('title').split(" ")
                            partido.addtarjeta(
                                Tarjeta(id_jugador, texto[3],
                                        texto[1].replace(',', '')))
                        elif evento.has_class('icon-half-square'):
                            """ segunda amarilla que implica una roja
                            también """
                            # Second yellow: record both a yellow and
                            # the implied red card.
                            texto = evento.attr('title').split(" ")
                            partido.addtarjeta(
                                Tarjeta(id_jugador, texto[3],
                                        'amarilla'))
                            partido.addtarjeta(
                                Tarjeta(id_jugador, texto[3], 'roja'))
        """ la segunda vuelta del for pertenece a la parte de la dcha,
        al equipo visitante """
        # The second pass of the outer loop handles the right-hand
        # side, i.e. the away team.
        equipo_actual = "visitante"
    return partido
def parse_items(self, urls):
    """Fetch Forvo item pages concurrently and collect pronunciation
    entries into ``self.results``.

    For every URL a thread runs the module-level ``get`` helper, which
    appends the parsed document to ``docs``.  Each pronunciation list
    item yields a dict with ``word``, ``url`` (decoded mp3 address),
    ``word_id`` and ``user``.

    Params:
        urls (iterable[str]): Forvo word-page URLs to scrape.

    NOTE(review): relies on ``get``, ``base64_decode``, ``self.lang``
    and ``self.results`` defined elsewhere in this module/class.
    """
    docs = []
    # One thread per URL; `get` is assumed to append into `docs`.
    threads = [
        threading.Thread(target=get, args=(url, docs)) for url in urls
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    for item_doc in docs:
        word_id = None
        # Word id is embedded in a notSatisfied(...) JS call.
        match = re.search("notSatisfied(Lang)?\( ?'(\d+)' ?[,\)]",
                          item_doc.html())
        if match:
            word_id = match.group(2)
        for locale in item_doc("article.pronunciations"):
            locale = PyQuery(locale)
            # Header id is the bare language code (part before '_').
            lang_header = locale('header[id=%s]' % self.lang.split('_')[0])
            if lang_header:
                # Header text looks like "<word> の発音" (Japanese UI).
                word = re.compile(r"(.*) の発音").search(
                    lang_header.text()).group(1)
                if self.lang == 'en_usa':
                    # en_usa entries follow their own header sibling-wise.
                    els = locale('header[id=%s]' % self.lang).next_all()
                else:
                    els = locale('.show-all-pronunciations li')
                lis = []
                for el in els:
                    el = PyQuery(el)
                    if el.has_class('li-ad'):
                        # Skip advertisement items.
                        continue
                    if el.is_('header'):
                        # Next header means next locale section: stop.
                        break
                    lis.append(el)
                for li in lis:
                    i = PyQuery(li('span.play'))
                    text = i.parents('li').eq(0).text()
                    user = None
                    # "発音したユーザ: <name> (" gives the uploader.
                    match = re.search("発音したユーザ: (.*) \(", text)
                    if match:
                        user = match.group(1)
                    onclick = i.attr('onclick')
                    # Single-word entries use Play(...); the base64 code
                    # is its second argument.
                    match = re.compile(r"Play\(.*,'(.*)',.*,.*,.*,.*,.*\)"
                                       ).search(onclick)
                    if match:
                        code = match.group(1)
                        url = 'https://audio00.forvo.com/mp3/' + \
                            base64_decode(code)
                        self.results.append({
                            'word': word,
                            'url': url,
                            'word_id': word_id,
                            'user': user
                        })
                    else:
                        # Phrase entries use PlayPhrase(...) instead.
                        match = re.compile(
                            r"PlayPhrase\(.*,'(.*)',.*\)").search(onclick)
                        if match:
                            code = match.group(1)
                            url = 'https://audio00.forvo.com/phrases/mp3/' + \
                                base64_decode(code)
                            self.results.append({
                                'word': word,
                                'url': url,
                                'word_id': word_id,
                                'user': user
                            })
def _render_span(self, p: Paragraph, pq: PyQuery, bold=False, italic=False,
                 strike=False, underline=False, font_size=None, sub=False,
                 sup=False):
    """Render an HTML ``<span>`` into docx paragraph *p*.

    Formula spans (``data-latex`` attribute, ``math-tex`` or
    ``afanti-latex`` class) are converted to OMML and appended to the
    paragraph's XML element; plain spans become styled text runs.

    Change 19.5.3: if the formula conversion fails
    (``EquationConvertError``), fall back to rendering the embedded
    image directly.

    :param p: target ``docx`` Paragraph.
    :param pq: the span element wrapped in PyQuery.
    :param bold/italic/strike/underline: inherited styles, OR-ed with
        the span's own inline CSS.
    :param font_size: inherited font size (overridden by inline CSS).
    :param sub/sup: render runs as subscript / superscript.
    """
    try:
        if pq.attr('data-latex'):
            # LaTeX formula carried in the data-latex attribute.
            omml_str = converter.to_omml(
                self.mini_trim(pq.attr('data-latex')))
            # Inject the math namespace so the fragment is valid OOXML.
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
        if pq.has_class("math-tex"):
            # Formula: prefer data-latex, else the (unescaped) inner
            # HTML with the \( \) delimiters stripped.
            if pq.attr('data-latex'):
                omml_str = pq.attr('data-latex')
            else:
                omml_str = html.unescape(
                    pq.html()) if pq.html() is not None else ''
            omml_str = omml_str.replace(r'\(', '').replace(r'\)', '')
            omml_str = converter.to_omml(self.mini_trim(omml_str))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
        # Afanti (阿凡题) formula element.
        if pq.has_class('afanti-latex'):
            metadata = AftQuestion(pq).parse_element()
            if metadata.startswith('^') or metadata.startswith('_'):
                # A leading ^/_ needs a base: steal the last character
                # of the previous sibling run as the base token.
                last_ele = pq(p._element).children()[-1]
                metadata = last_ele.text[-1] + metadata
                last_ele.text = last_ele.text[:-1]
            omml_str = converter.to_omml(self.mini_trim(metadata))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)
            return
    except EquationConvertError:
        # Conversion failed: render the embedded image instead.
        img = PyQuery('img', pq)
        self._render_img(p, img)
        return
    # Merge inherited styles with this span's inline CSS.
    bold = any([
        bold,
        self._get_pq_style(pq, 'font-weight') == 'bold',
        self._get_pq_style(pq, 'font-weight') == 'bolder'
    ])
    italic = any(
        [italic, self._get_pq_style(pq, 'font-style') == 'italic'])
    strike = any([
        strike,
        self._get_pq_style(pq, 'text-decoration') == 'line-through',
        self._get_pq_style(pq, 'text-decoration-line') == 'line-through'
    ])
    underline = any([
        underline,
        self._get_pq_style(pq, 'text-decoration') == 'underline',
        self._get_pq_style(pq, 'text-decoration-line') == 'underline'
    ])
    if self._get_pq_style(pq, 'font-size'):
        size = self._get_pq_style(pq, 'font-size')
        if size.endswith('px'):
            # Pixel sizes go through the project's px->pt conversion.
            size = size[:-2]
            size = int(float(size))
            font_size = self.get_pt(size)
        elif size.endswith('pt'):
            size = size[:-2]
            size = float(size)
            font_size = Pt(size)
    # self.__render_inline_element(p, pq, bold=bold, italic=italic, underline=underline, font_size=font_size,
    #                              strike=strike)
    contents = pq.contents()
    for item in contents:
        if isinstance(item, (HtmlElement, _Element)):
            # Nested element: recurse with the accumulated styles.
            self._render_element(p,
                                 item,
                                 is_root=True,
                                 bold=bold,
                                 italic=italic,
                                 strike=strike,
                                 underline=underline,
                                 font_size=font_size)
            continue
        # Bare text node: emit a styled run.
        run = p.add_run(self._clear_text(item))
        self.__force_simsun(run)
        if self._get_pq_style(pq, 'font-name'):
            run.font.name = self._get_pq_style(pq, 'font-name')
        if font_size:
            run.font.size = font_size
        run.underline = underline
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.superscript = sup
        run.font.subscript = sub