Beispiel #1
0
def _split(inputfile, outputdir):
    """Split an HTML slide deck into individual slides under ``outputdir``.

    Every ``<section>`` element of the document is visited in document
    order. A section carrying the ``stack`` class gets its own numbered
    sub-directory (``01``, ``02``, ...) and every ``<section>`` found in its
    inner HTML is dumped there. Any other section is dumped directly into
    ``outputdir`` unless its parent is a stack, because those were already
    handled together with the stack. The slide counter is shared, so slide
    numbers keep increasing across stacks.

    Args:
        inputfile: Path of the HTML file to read.
        outputdir: Directory receiving the output; created when missing.
    """
    # The context manager closes the file even if reading fails.
    with open(inputfile, 'r') as source:
        html = source.read()

    # exist_ok keeps re-runs working and also creates missing parents.
    os.makedirs(outputdir, exist_ok=True)

    idx_slide = 0
    idx_section = 0

    parsed = PyQuery(html)

    for section in parsed('section'):
        slide = PyQuery(section)
        if slide.has_class('stack'):
            idx_section += 1
            stack_path = os.path.join(outputdir, '%02d' % idx_section)
            os.makedirs(stack_path, exist_ok=True)
            for sub_slide in PyQuery(slide.html())('section'):
                idx_slide += 1
                _dump_slide(sub_slide, idx_slide, stack_path)
        elif not slide.parent().has_class('stack'):
            # Sections nested in a stack were dumped with their stack above.
            idx_slide += 1
            _dump_slide(slide, idx_slide, outputdir)
Beispiel #2
0
def is_div(partial, cls_name=None, id_name=None):
    """Tell whether an HTML fragment is a div, optionally with a class and id.

    Params:
        partial (str): the HTML fragment (partial HTML code) to inspect.
        cls_name (str|None): when truthy, the class the div in `partial`
            must carry; otherwise the class is not checked.
        id_name (str|None): when truthy, the id the div in `partial` must
            carry; otherwise the id is not checked.

    Returns:
        bool: True if `partial` is a div matching the requested class and id
            (each only when provided), False otherwise.

    Examples:
        >>> is_div("<div>Plop</div>")
        True

        >>> is_div("<span>Plop</span>")
        False

        >>> is_div("<!DOCTYPE html><html>Hello</html>")
        False

        >>> is_div('<div class="useful">Plop</div>', "useful")
        True

        >>> is_div('<div class="useless">Plop</div>', "useful")
        False

        >>> is_div('<div class="useful" id="cat">Plop</div>', "useful", "cat")
        True

        >>> is_div('<div class="useful" id="dog">Plop</div>', "useful", "cat")
        False

        >>> is_div('<div class="useful">Plop</div>', "useful", "cat")
        False
    """
    fragment = PyQuery(partial)

    # Every check is evaluated up front (tag, then class, then id) and the
    # outcomes are combined afterwards.
    outcomes = [fragment.is_("div")]
    outcomes.append(not cls_name or fragment.has_class(cls_name))
    outcomes.append(not id_name or fragment.is_("#%s" % id_name))

    return all(outcomes)
Beispiel #3
0
        def parse_status(status_div):
            """
            Parse one status element into a detail dict.

            Reads the ids, counters, creation time, permalink and attached
            object attributes from the markup. As side effects it looks up
            the author through ``self.fetch_user_by_id`` and, depending on
            the target type, saves image attachments or fetches the related
            movie / book / music entry. ``self`` and ``now`` are taken from
            the enclosing scope.

            :param status_div: a PyQuery object, or anything ``PyQuery()``
                accepts, holding the status wrapper element.
            :return: ``(detail, reshared_detail)``. ``reshared_detail`` is the
                parsed detail of the wrapped status when this status is a
                reshare, otherwise None. ``(None, None)`` is returned when
                no status id can be found (the original status was deleted).

            Meaning of the ``object_kind`` values:
            1001: book
            1002: movie
            1003: music
            1005: followed a friend
            1011: event
            1012: review
            1013: group topic
            1014: (movie) discussion
            1015: diary
            1018: status with pictures and text
            1019: group
            1020: doulist
            1021: "nine o'clock" article
            1022: web page
            1025: album photo
            1026: album
            1043: film celebrity
            1044: artist
            1062: board(???)
            2001: online event
            2004: site video
            3043: Douban FM single track
            3049: reading note
            3065: subject entry
            3072: Douban FM channel
            3090: thing
            3114: game
            5021: Douban Read picture
            5022: Douban Read work

            """
            if not isinstance(status_div, PyQuery):
                status_div = PyQuery(status_div)
            reshared_count = 0
            like_count = 0
            comments_count = 0
            created_at = None
            is_noreply = False
            status_url = None
            target_type = None
            object_kind = None
            object_id = None
            reshared_detail = None
            blockquote = None
            douban_user_id = status_div.attr('data-uid')
            douban_id = status_div.attr('data-sid')
            is_saying = status_div.has_class('saying')
            is_reshared = status_div.has_class('status-reshared-wrapper')

            try:
                created_span = status_div.find('.actions>.created_at')[0]
            except Exception:
                # No creation-time element: the counters block below is
                # skipped for this status.
                is_noreply = True

            try:
                # Permalink of the status.
                exactly_link = PyQuery(status_div.find('.actions a').eq(0))
                status_url = exactly_link.attr('href')
            except Exception:
                pass

            try:
                # Attributes describing what kind of status this is.
                status_item_div = PyQuery(
                    status_div.find('.status-item').eq(0))
                target_type = status_item_div.attr('data-target-type')
                object_kind = status_item_div.attr('data-object-kind')
                object_id = status_item_div.attr('data-object-id')
                # Fall back to the inner status item when the wrapper does
                # not carry the ids itself.
                if not douban_user_id:
                    douban_user_id = status_item_div.attr('data-uid')
                if not douban_id:
                    douban_id = status_item_div.attr('data-sid')
                blockquote = PyQuery(status_item_div.find('blockquote')).html()
            except Exception:
                pass

            if not is_noreply:
                # Creation time plus reply, like and reshare counters.
                try:
                    created_at = PyQuery(created_span).attr('title')
                    reply_link = PyQuery(
                        status_item_div.find('.actions>.new-reply'))
                    comments_count = reply_link.attr('data-count')
                    if comments_count is None:
                        comments_count = 0
                    like_span = PyQuery(
                        status_item_div.find('.actions>.like-count'))
                    like_count = like_span.attr('data-count')
                    if like_count is None:
                        # Without the attribute, parse the number out of
                        # the visible label text.
                        try:
                            like_count = int(
                                re.match(r'赞\((.*)\)',
                                         like_span.text().strip())[1])
                        except Exception:
                            like_count = 0
                    reshared_span = PyQuery(
                        status_item_div.find('.actions>.reshared-count'))
                    reshared_count = reshared_span.attr('data-count')
                    if reshared_count is None:
                        reshared_count = 0
                except Exception:
                    pass

            if not douban_id or douban_id == 'None':
                # The original status has been deleted.
                return None, None

            detail = {
                'douban_id': douban_id,
                'douban_user_id': douban_user_id,
                'content': status_div.outer_html(),
                'created': created_at,
                'is_reshared': is_reshared,
                'is_saying': is_saying,
                'is_noreply': is_noreply,
                'updated_at': now,
                'reshared_count': reshared_count,
                'like_count': like_count,
                'comments_count': comments_count,
                'status_url': status_url,
                'target_type': target_type,
                'object_kind': object_kind,
                'object_id': object_id,
                'user': self.fetch_user_by_id(douban_user_id),
                'blockquote': blockquote,
            }

            if is_reshared:
                # A reshare wraps the original status; parse it recursively.
                reshared_status_div = PyQuery(
                    status_div.find('.status-real-wrapper').eq(0))
                reshared_detail, _ = parse_status(reshared_status_div)
                if reshared_detail:
                    detail['reshared_id'] = reshared_detail['douban_id']

            if target_type == 'sns':
                attachments = []
                images = status_div.find(
                    '.attachments-saying.group-pics a.view-large')
                for img_lnk in images:
                    attachments.append({
                        'type': 'image',
                        'url': PyQuery(img_lnk).attr('href'),
                    })
                images = status_div.find(
                    '.attachments-saying.attachments-pic img')
                for img in images:
                    img_lnk = PyQuery(img).attr('data-raw-src')
                    if img_lnk:
                        attachments.append({
                            'type': 'image',
                            'url': img_lnk,
                        })
                if attachments:
                    self.save_attachments(attachments)
                    detail['attachments'] = attachments
            elif target_type == 'movie' and object_kind == '1002':
                self.fetch_movie(object_id)
            elif target_type == 'book' and object_kind == '1001':
                self.fetch_book(object_id)
            elif target_type == 'music' and object_kind == '1003':
                self.fetch_music(object_id)

            return detail, reshared_detail
Beispiel #4
0
    def rastreapartido(url):
        """Scrape one match page and build a ``Partido`` with goals and cards.

        Downloads ``url``, reads both team names, the score and the match-day
        label, then walks the player lists. For every goal, penalty or card
        icon the player is resolved through ``Jugadores``; when the lookup
        fails the operator is asked on stdin to pick an existing player,
        register a new one (answer ``0``) or leave the event without a known
        player (negative answer).

        Args:
            url: Address of the match page.

        Returns:
            The populated ``Partido`` object.

        Raises:
            ValueError: if the operator types a non-numeric id or dorsal.
        """
        request = urllib.request.Request(
            url, headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'})

        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(request) as respuesta:
            html = PyQuery(respuesta.read().decode('utf-8'))

        local = html('.home').text()
        visitante = html('.away').text()
        goles_local = html('.score>span:first').text()
        goles_visitante = html('.score>span:last').text()

        jornada = html('.mid>.info').text()
        # Keep only the first line. Unlike slicing at find("\n"), this does
        # not drop the last character when the text has no newline.
        njornada = jornada.split("\n", 1)[0]

        partido = Partido(local, visitante, goles_local, goles_visitante,
                          njornada)

        equipos = {"local": local, "visitante": visitante}
        # The first list parsed is the left-hand side, i.e. the home team.
        equipo_actual = "local"
        print("Rastreando {local} - {visitante}".format(local=local,
                                                        visitante=visitante))

        for equipo in html('.player-list').children('ul'):
            for item in PyQuery(equipo).children('li'):
                jugador = PyQuery(item)

                for eventoHtml in jugador('.events').children('i'):
                    evento = PyQuery(eventoHtml)

                    es_gol = evento.has_class('icon-circle')
                    es_penalti = evento.has_class('icon-up-circled')
                    es_tarjeta = evento.has_class('icon-stop')
                    es_doble_amarilla = evento.has_class('icon-half-square')
                    if not (es_gol or es_penalti or es_tarjeta
                            or es_doble_amarilla):
                        # Only goals and cards matter; substitutions and the
                        # rest are ignored.
                        continue

                    nombre = jugador('strong').text()
                    id_jugador = Jugadores.get_jugador(
                        nombre, equipos[equipo_actual])

                    if id_jugador is None:
                        # The scraped name is not in the database: ask which
                        # stored player it corresponds to and replace the
                        # stored name so it matches the scraped one next time.
                        print("Jugador no relacionado: {}".format(nombre))
                        print("jugadores del {}:".format(
                            equipos[equipo_actual]))
                        for posible in Jugadores.get_jugadores(
                                equipos[equipo_actual]):
                            print("{}: {}".format(
                                posible['id'], posible['nombre']))

                        id_jugador = input(
                            "cual corresponde ({})?: ".format(nombre))
                        opcion = int(id_jugador)
                        if opcion > 0:
                            # An answer of -1 means the player is no longer
                            # in the league: the typed value is kept as is
                            # and the event only counts for the statistics.
                            Jugadores.reemplazar(id_jugador, nombre)
                        elif opcion == 0:
                            # Register a new player that is not in the list.
                            print("Nuevo jugador:")
                            print("\tnombre: {}".format(nombre))
                            posicion = input("\tposicion: ")
                            dorsal = int(input("\tdorsal: "))
                            fecha_nacimiento = input(
                                "\tfecha nacimiento: ")
                            nacionalidad = input("\tnacionalidad: ")
                            pais = input("\tpais de nacimiento: ")

                            id_jugador = Jugadores.nuevo({
                                'equipo': equipos[equipo_actual],
                                'nombre': nombre,
                                'posicion': posicion,
                                'dorsal': dorsal,
                                'nacionalidad': nacionalidad,
                                'pais_nacimiento': pais,
                                'fecha_nacimiento': fecha_nacimiento
                            })

                    if es_gol:
                        # Goal; the title says whether it was an own goal.
                        texto = evento.attr('title')
                        propia_meta = "en pp" in texto
                        minuto = texto[texto.find('minuto') + 7:]
                        partido.addgol(
                            Gol(id_jugador, minuto, False, propia_meta))

                    elif es_penalti:
                        # Penalty goal.
                        texto = evento.attr('title').split(" ")
                        partido.addgol(
                            Gol(id_jugador, texto[4], True, False))

                    elif es_tarjeta:
                        # Yellow or red card.
                        texto = evento.attr('title').split(" ")
                        partido.addtarjeta(
                            Tarjeta(id_jugador, texto[3],
                                    texto[1].replace(',', '')))

                    elif es_doble_amarilla:
                        # Second yellow card, which also implies a red one.
                        texto = evento.attr('title').split(" ")
                        partido.addtarjeta(
                            Tarjeta(id_jugador, texto[3], 'amarilla'))
                        partido.addtarjeta(
                            Tarjeta(id_jugador, texto[3], 'roja'))
            # Every list after the first belongs to the right-hand side,
            # i.e. the away team.
            equipo_actual = "visitante"

        return partido
Beispiel #5
0
 def parse_items(self, urls):
     """Download the given pages concurrently and collect audio links.

     One thread per URL runs ``get(url, docs)``; the loop below expects
     ``docs`` to end up holding one PyQuery document per page (presumably
     appended by ``get`` -- confirm against its definition). For each
     document, every pronunciation entry of the language ``self.lang`` is
     turned into a dict with the keys ``word``, ``url``, ``word_id`` and
     ``user`` and appended to ``self.results``.

     Entries whose header or ``onclick`` handler does not have the expected
     shape are skipped.

     Args:
         urls: Iterable of page URLs to fetch.
     """
     # Compiled once; raw strings avoid invalid-escape warnings.
     word_id_re = re.compile(r"notSatisfied(Lang)?\( ?'(\d+)' ?[,\)]")
     word_re = re.compile(r"(.*) の発音")
     user_re = re.compile(r"発音したユーザ: (.*) \(")
     # Tried in order; the first handler pattern that matches decides the
     # URL prefix of the audio file.
     audio_sources = (
         (re.compile(r"Play\(.*,'(.*)',.*,.*,.*,.*,.*\)"),
          'https://audio00.forvo.com/mp3/'),
         (re.compile(r"PlayPhrase\(.*,'(.*)',.*\)"),
          'https://audio00.forvo.com/phrases/mp3/'),
     )

     docs = []
     threads = [
         threading.Thread(target=get, args=(url, docs)) for url in urls
     ]
     for thread in threads:
         thread.start()
     for thread in threads:
         thread.join()

     for item_doc in docs:
         word_id = None
         # html() may return None for an empty document.
         match = word_id_re.search(item_doc.html() or '')
         if match:
             word_id = match.group(2)
         for locale in item_doc("article.pronunciations"):
             locale = PyQuery(locale)
             lang_header = locale('header[id=%s]' % self.lang.split('_')[0])
             if not lang_header:
                 continue
             word_match = word_re.search(lang_header.text())
             if not word_match:
                 # Unexpected header text: skip rather than crash.
                 continue
             word = word_match.group(1)
             if self.lang == 'en_usa':
                 els = locale('header[id=%s]' % self.lang).next_all()
             else:
                 els = locale('.show-all-pronunciations li')
             for el in els:
                 el = PyQuery(el)
                 if el.has_class('li-ad'):
                     continue
                 if el.is_('header'):
                     # The next header starts another accent section.
                     break
                 play = PyQuery(el('span.play'))
                 text = play.parents('li').eq(0).text()
                 user = None
                 match = user_re.search(text)
                 if match:
                     user = match.group(1)
                 onclick = play.attr('onclick')
                 if not onclick:
                     # No play handler, hence no audio to extract.
                     continue
                 for pattern, prefix in audio_sources:
                     match = pattern.search(onclick)
                     if match:
                         self.results.append({
                             'word': word,
                             'url': prefix + base64_decode(match.group(1)),
                             'word_id': word_id,
                             'user': user
                         })
                         break
    def _render_span(self,
                     p: Paragraph,
                     pq: PyQuery,
                     bold=False,
                     italic=False,
                     strike=False,
                     underline=False,
                     font_size=None,
                     sub=False,
                     sup=False):
        """
        Render a span into paragraph ``p``.

        Formula spans (a ``data-latex`` attribute, the ``math-tex`` class or
        the ``afanti-latex`` class) are converted to OMML and appended to the
        paragraph element. Changed in 19.5.3: when the formula conversion
        fails, the image inside the span is rendered instead.

        Any other span is rendered as runs: the inherited formatting flags
        are combined with the span's own inline style, child elements are
        delegated to ``_render_element`` and text nodes become runs.

        :param p: paragraph receiving the output
        :param pq: the span to render
        :param bold: inherited bold flag
        :param italic: inherited italic flag
        :param strike: inherited strike-through flag
        :param underline: inherited underline flag
        :param font_size: inherited font size, or None
        :param sub: render text runs as subscript
        :param sup: render text runs as superscript
        :return: None
        """

        def append_formula(latex):
            # Convert LaTeX to OMML, declare the math namespace on the
            # <m:oMath> tag and attach the result to the paragraph element.
            omml_str = converter.to_omml(self.mini_trim(latex))
            omml_str = omml_str.replace(
                '<m:oMath',
                '<m:oMath xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math"'
            )
            pq(p._element).append(omml_str)

        try:
            latex = pq.attr('data-latex')
            if latex:  # formula carried in an attribute
                append_formula(latex)
                return
            if pq.has_class("math-tex"):  # formula carried in the span body
                inner = pq.html()
                latex = html.unescape(inner) if inner is not None else ''
                latex = latex.replace(r'\(', '').replace(r'\)', '')
                append_formula(latex)
                return

            # Afanti formula
            if pq.has_class('afanti-latex'):
                metadata = AftQuestion(pq).parse_element()
                if metadata.startswith(('^', '_')):
                    # A leading script needs a base: borrow the last
                    # character already written to the paragraph, when
                    # there is one.
                    siblings = pq(p._element).children()
                    last_ele = siblings[-1] if len(siblings) else None
                    if last_ele is not None and last_ele.text:
                        metadata = last_ele.text[-1] + metadata
                        last_ele.text = last_ele.text[:-1]

                append_formula(metadata)
                return
        except EquationConvertError:
            img = PyQuery('img', pq)
            self._render_img(p, img)
            return

        font_weight = self._get_pq_style(pq, 'font-weight')
        decorations = (self._get_pq_style(pq, 'text-decoration'),
                       self._get_pq_style(pq, 'text-decoration-line'))

        bold = bool(bold) or font_weight == 'bold' or font_weight == 'bolder'
        italic = bool(italic) or \
            self._get_pq_style(pq, 'font-style') == 'italic'
        strike = bool(strike) or any(
            value == 'line-through' for value in decorations)
        underline = bool(underline) or any(
            value == 'underline' for value in decorations)

        size = self._get_pq_style(pq, 'font-size')
        if size:
            # Only px and pt sizes are understood; any other unit keeps the
            # inherited font size.
            if size.endswith('px'):
                font_size = self.get_pt(int(float(size[:-2])))
            elif size.endswith('pt'):
                font_size = Pt(float(size[:-2]))

        font_name = self._get_pq_style(pq, 'font-name')

        contents = pq.contents()
        for item in contents:
            if isinstance(item, (HtmlElement, _Element)):
                self._render_element(p,
                                     item,
                                     is_root=True,
                                     bold=bold,
                                     italic=italic,
                                     strike=strike,
                                     underline=underline,
                                     font_size=font_size)
                continue
            run = p.add_run(self._clear_text(item))
            self.__force_simsun(run)
            if font_name:
                run.font.name = font_name
            if font_size:
                run.font.size = font_size

            run.underline = underline

            run.bold = bold
            run.italic = italic
            run.font.strike = strike
            run.font.superscript = sup
            run.font.subscript = sub