Example No. 1
    def test_insert_tag(self):
        builder = self.default_builder
        soup = self.soup(
            "<a><b>Find</b><c>lady!</c><d></d></a>", builder=builder)
        magic_tag = Tag(soup, builder, 'magictag')
        magic_tag.insert(0, "the")
        soup.a.insert(1, magic_tag)

        self.assertEqual(
            soup.decode(), self.document_for(
                "<a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>"))

        # Make sure all the relationships are hooked up correctly.
        b_tag = soup.b
        self.assertEqual(b_tag.next_sibling, magic_tag)
        self.assertEqual(magic_tag.previous_sibling, b_tag)

        find = b_tag.find(text="Find")
        self.assertEqual(find.next_element, magic_tag)
        self.assertEqual(magic_tag.previous_element, find)

        c_tag = soup.c
        self.assertEqual(magic_tag.next_sibling, c_tag)
        self.assertEqual(c_tag.previous_sibling, magic_tag)

        the = magic_tag.find(text="the")
        self.assertEqual(the.parent, magic_tag)
        self.assertEqual(the.next_element, c_tag)
        self.assertEqual(c_tag.previous_element, the)
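
The test above creates the new tag through BeautifulSoup internals (Tag(soup, builder, 'magictag')). In application code the public API gives the same result; a minimal sketch, assuming html.parser:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>", "html.parser")
    magic = soup.new_tag("magictag")   # public counterpart of Tag(soup, builder, ...)
    magic.append("the")
    soup.a.insert(1, magic)
    # str(soup) -> <a><b>Find</b><magictag>the</magictag><c>lady!</c><d></d></a>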
Example No. 2
 def insert_tag(self, tag_dict):
     """docstring for insert_tag"""
     tag = Tag(name=tag_dict.pop('name'))
     tag.attrs = tag_dict
     if not self.findAll('TAGS'):
         self.root.append(Tag(name='TAGS'))
     self.TAGS.append(tag)
     self.TAGS.append('\n')
Example No. 3
 def reset(self):
     Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
     self.hidden = 1
     self.builder.reset()
     self.current_data = []
     self.currentTag = None
     self.tagStack = []
     self.preserve_whitespace_tag_stack = []
     self.pushTag(self)
Example No. 4
def parse_monster_bbb_tag(tag: Tag, *args):
    cells = tag.find_all('td')
    equipment = cells[7].find_all('div')
    stats = cells[11].find_all('div')
    magic = cells[12].find_all('div')
    special = cells[13].find_all('div')
    statuses = cells[14].find_all('div')
    monster = dict(hiddenstreet_alias=tag.get('id'),
                   name=cells[0].strong.string.strip(),
                   level=cells[0].contents[-1].strip(),
                   health_points=convert(cells[1], int, True),
                   mana_points=convert(cells[2], int, True),
                   experience=convert(cells[3], int, True),
                   mesos=convert(cells[4], int, True),
                   knockback=convert(cells[5], int),
                   etc_drop=convert(cells[6]),
                   common_equipment=convert(equipment[0]),
                   warrior_equipment=convert(equipment[1]),
                   magician_equipment=convert(equipment[2]),
                   bowman_equipment=convert(equipment[3]),
                   thief_equipment=convert(equipment[4]),
                   pirate_equipment=convert(equipment[5]),
                   ore_drop=convert(cells[8]),
                   maker_item=convert(cells[9]),
                   useable_drop=convert(cells[10]),
                   weapon_attack=convert(stats[0], int),
                   magic_attack=convert(stats[1], int),
                   weapon_defence=convert(stats[2], int),
                   magic_defence=convert(stats[3], int),
                   phisical_dmg_reduction=convert(stats[4], int),
                   magical_dmg_reduction=convert(stats[5], int),
                   speed=convert(stats[6], int),
                   accuracy=convert(stats[7], int),
                   avoidability=convert(stats[8], int),
                   weakness_to_magic=convert(magic[0]),
                   normal_to_magic=convert(magic[1]),
                   resistance_to_magic=convert(magic[2]),
                   immune_to_magic=convert(magic[3]),
                   unique_attack=convert(special[0]),
                   health_points_recovery=convert(special[1], int),
                   mana_points_recovery=convert(special[2], int),
                   immune_against_status=convert(statuses[0]),
                   inflict_status=convert(statuses[1]),
                   common_location=None)

    monster['image_url'] = cells[0].find('img')
    if monster['image_url']:
        monster['image_url'] = monster['image_url'].get('src')
    try:
        monster['level'] = int(monster['level'])
    except ValueError:
        monster['level'] = None
    return monster
Example No. 5
def knowledge_panel_title_parser(body: element.Tag) -> List[Dict]:
    """The title of knowledge panels that are clickable"""
    data = []
    for elm in body.find_all(attrs={"data-ru_q": True}):
        row = element_to_dict(elm, category='link-knowledge_panel_title')
        data.append(row)
    return data
Example No. 6
def substitute_special_paragraphs(soup):

    for prefix, klass in prefix2class.items():
        substitute_special_paragraph(soup, prefix, klass)

    make_details = ['comment', 'question', 'doubt']
    for c in make_details:
        for e in list(soup.select('.%s' % c)):
            details = Tag(name='details')
            add_class(details, c)
            summary = Tag(name='summary')
            summary.append(c)
            details.append(summary)
            rest = e.__copy__()
            details.append(rest)
            e.replace_with(details)
Example No. 7
def conversion_parser(body: element.Tag) -> List[Dict]:
    """See "how many ounces in a cup"."""
    data = []
    for elm in body.find_all('h2', text='Unit Converter'):
        row = element_to_dict(elm.parent, category='answer-unit_converter')
        data.append(row)
    return data
Example No. 8
    def parse_news_item(self, row: element.Tag, base_url: str) -> NewsItem:
        date_cell, info_cell = row.find_all('td')

        date_string = date_cell.get_text(strip=True)
        date = parse_datetime(date_string)

        # Some info cells just have the title as a link, while others have
        # the title as text followed by links for each language.
        english_link = info_cell.find('a', string='English')
        if english_link:
            url = english_link['href']
            # These titles are sometimes followed by a colon. If so, drop
            # it. (Whitespace will already have been stripped.)
            title = first_text_in_element(info_cell)
            if title:
                title = title.strip(':')
        else:
            title_link = info_cell.find('a')
            url = title_link['href']
            title = title_link.get_text(strip=True)

        if url:
            url = urljoin(base_url, url)
        else:
            raise FormatError('No URL found')

        if not title:
            raise FormatError('No title content found')

        return NewsItem(id=url, url=url, title=title, date_published=date)
Example No. 9
 def _get_votes_number(self, post: Tag) -> Optional[int]:
     votes_number_div = post.find('div', class_=self.VOTES_NUMBER_CLASS)
     if votes_number_div is None:  # not a post
         return None
     if 'k' in votes_number_div.text:  # votes specified in thousands
         return int(round(float(votes_number_div.text[:-1]) * 1000))  # handles '12k' and '1.2k'
     return int(votes_number_div.text)
Example No. 10
def dict_def_parser(body: element.Tag) -> List[Dict]:
    """Dictionary definitions, gets the whole card."""
    data = []
    for elm in body.find_all('div', attrs={'id': 'dictionary-modules'}):
        row = element_to_dict(elm, category='answer-dictionary')
        data.append(row)
    return data
Example No. 11
def ads_aria_parser(body: element.Tag) -> List[Dict]:
    """Catches ADs with a accessibility features"""
    data = []
    for elm in body.find_all(attrs={'aria-label': 'Ad'}):
        row = element_to_dict(elm, category='ads-aria')
        data.append(row)
    return data
Example No. 12
    def parse_truyenfull_chapters(self, soup: Tag):
        truyen_id = self.__select_value(soup, 'input#truyen-id', 'value')
        total_page = self.__select_value(soup, 'input#total-page', 'value')
        truyen_ascii = self.__select_value(soup, 'input#truyen-ascii', 'value')
        assert truyen_id, 'No truyen novel id found'
        total_page = int(str(total_page))
        logger.info('Total page count: %d', total_page)

        futures: List[Future] = []
        for page in range(total_page):
            params = urlencode({
                'type': 'list_chapter',
                'tid': int(truyen_id),
                'tascii': truyen_ascii,
                'tname': self.novel_title,
                'page': page + 1,
                'totalp': total_page,
            })
            url = 'https://truyenfull.vn/ajax.php?' + params
            logger.info('Getting chapters: %s', url)
            f = self.executor.submit(self.get_json, url)
            futures.append(f)
        # end for

        for f in futures:
            data = f.result()
            soup = self.make_soup(data['chap_list'])
            self.parse_all_links(soup.select('.list-chapter a'))
Example No. 13
def replace_emoji_imgs(element: Tag) -> None:
    for img in element.find_all('img'):
        match = EMOJI_IMG_SRC_RX.match(img.get('src', ''))
        if match:
            emoji = EMOJI_MAP.get(match.group(1))
            if emoji:
                img.replace_with(emoji)
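
EMOJI_IMG_SRC_RX and EMOJI_MAP are not shown in this example. A hedged sketch of what they might look like; the URL pattern and the mapping below are purely hypothetical:

    import re

    # hypothetical: emoji images served as .../emoji/<name>.png
    EMOJI_IMG_SRC_RX = re.compile(r'.*/emoji/(\w+)\.png$')
    EMOJI_MAP = {
        'smile': '\N{SLIGHTLY SMILING FACE}',
        'heart': '\N{HEAVY BLACK HEART}',
    }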
Example No. 14
        def __init__(self, table_row: Tag) -> None:
            """コンテスト一覧ページ内のテーブルのある行タグから,コンテストインスタンスを初期化する.

            Args:
                table_row (Tag): 行タグ
            """
            table_data_list: List[Tag] = table_row.select('td')

            time_str: str = table_data_list[0].get_text()
            self.time = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S+0900')
            self.time_unix = int(self.time.timestamp())

            contest_tag: Tag = table_data_list[1].find('a')
            contest_href_match: Optional[
                Match[str]] = self.contest_href_pattern.search(
                    contest_tag['href'])
            assert contest_href_match is not None
            self.contest_slug = contest_href_match.group(1)
            self.contest_name = contest_tag.get_text()

            duration_str: str = table_data_list[2].get_text()
            duration_match: Optional[
                Match[str]] = self.duration_pattern.search(duration_str)
            assert duration_match is not None
            hours: int = int(duration_match.group(1))
            minutes: int = int(duration_match.group(2))
            self.duration_minutes = hours * 60 + minutes
Example No. 15
    def set_number_available(self, raw_line: element.Tag):
        """Sets number_available class attribute from raw DOM element.

        Args:
            raw_line (element.Tag): Raw DOM element.

        Raises:
            _CUSTOM_ERRORS.CouldNotParseInfo: If the info cannot be found.
            The missing info is mentioned in the error message.
        """

        value = raw_line.find("td")

        if value is None:
            self.logger.write(
                log_level="error",
                message="Could not parse number of books available on book page.",
            )
            raise _CUSTOM_ERRORS.CouldNotParseInfo(
                title=self.title, info="Number available", url=self.url
            )

        number = re.findall("([0-9]+) available", value.get_text())

        if len(number) == 0:
            self.logger.write(
                log_level="error",
                message="Could not parse number of books available on book page.",
            )
            raise _CUSTOM_ERRORS.CouldNotParseInfo(
                title=self.title, info="Number available", url=self.url
            )

        self.number_available = int(number[0])
Example No. 16
def stats_per_10_mins(career: Tag, ID: str) -> dict:
    all_stats  = {}
    stats_10   = {}
    per_10     = ' - Avg per 10 Min'
    stats_soup = career.find('div', attrs = {'data-category-id': ID})

    for table in stats_soup.children:
        for tr in table.tbody.children:
            stat = tr.contents[0].text.replace(' Done', '').strip()
            info = tr.contents[1].text

            for word in ('Blow', 'Kill'):
                if word in stat and f'{word}s' not in stat:
                    stat = stat.replace(word, f'{word}s').strip()

            all_stats[stat] = info

            if per_10 in stat or ('Accuracy' in stat and 'Best' not in stat):
                stats_10[stat.replace(per_10, '').strip()] = None

            if stat == 'Time Played':
                total_time = clock_to_mins(info) / 10

    for stat in stats_10:
        if '%' in all_stats[stat]:
            stats_10[stat] = all_stats[stat]
        elif ':' in all_stats[stat]:
            stats_10[stat] = mins_to_clock(clock_to_mins(all_stats[stat]) / total_time)
        else:
            stats_10[stat] = round(float(all_stats[stat]) / total_time, 2)

    return stats_10
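
clock_to_mins and mins_to_clock are helpers that do not appear in this example. A minimal sketch of what they presumably do, assuming an HH:MM:SS (or MM:SS) clock format:

    def clock_to_mins(clock: str) -> float:
        # '12:34:56' -> 754.93 minutes; '34:56' -> 34.93 minutes (assumed formats)
        parts = [int(p) for p in clock.split(':')]
        while len(parts) < 3:
            parts.insert(0, 0)
        hours, minutes, seconds = parts
        return hours * 60 + minutes + seconds / 60

    def mins_to_clock(mins: float) -> str:
        total_seconds = int(round(mins * 60))
        hours, rest = divmod(total_seconds, 3600)
        minutes, seconds = divmod(rest, 60)
        return f'{hours:02d}:{minutes:02d}:{seconds:02d}'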
Example No. 17
def get_title(row: element.Tag) -> Optional[str]:
    """Get the title of a row."""
    try:
        return row.find("a", {"class": "elco-anchor"}).text.strip()
    except Exception as e:
        logger.debug("Function get_title for row %s : %s", row, e)
        return None
Example No. 18
def _extract_metadata_xml(root_el: bs4e.Tag) -> SessionMetadata:
    head_el = root_el.vorspann.kopfdaten
    sv_el = root_el.sitzungsverlauf

    sn_el = head_el.find("sitzungsnr")
    lp_el = head_el.find("wahlperiode")

    date_str = root_el.get("sitzung-datum")
    session_start = root_el.get("sitzung-start-uhrzeit")
    session_end = root_el.get("sitzung-ende-uhrzeit")

    return SessionMetadata(
        session_no=get_session_id_safe(lp_el.getText(), sn_el.getText()),
        legislative_period=int(lp_el.getText()),
        start=build_datetime(date_str, session_start),
        end=build_datetime(date_str, session_end))
Example No. 19
def tab_parser(body: element.Tag) -> List[Dict]:
    """For tabs, sometimes on knowledge panels. # check this"""
    data = []
    for elm in body.find_all(role='tab'):  # what if we get rid of 'a'
        row = element_to_dict(elm, category='link-knowledge_panel_tab')
        data.append(row)
    return data
Example No. 20
def fullpage_popup_parser(body: element.Tag) -> List[Dict]:
    """A clickthru of a fullpage. See events like "New Years Eve Party"."""
    data = []
    for elm in body.find_all('li', attrs={'data-encoded-docid': True}):
        row = element_to_dict(elm, category='link-fullpage')
        data.append(row)
    return data
Example No. 21
def get_item_info(item: Tag) -> dict:
    response = dict()
    response['vacancy_name'] = item.find('div', {
        'class': 'search-item-name'
    }).find('a').text
    response['employer'] = item.find('div', {
        'class': 'vacancy-serp-item__meta-info'
    }).find('a').text
    response['location'] = item.find('span', {
        'class': 'vacancy-serp-item__meta-info'
    }).text
    salary = item.find('div', {
        'class': 'vacancy-serp-item__sidebar'
    }).find('div')
    response['salary'] = 'з/п не указана' if salary is None else salary.text
    return response
Example No. 22
def check_if_removed_from_bugblog(bbt: Match, b: Tag, issue: Issue) -> None:
    if bbt is not None:
        text = strings.remove_smartquotes(bbt.group(1).strip())
        for row in b.find_all('tr'):
            data = row.find_all('td')
            rowtext = strings.remove_smartquotes(data[1].text.strip())
            if rowtext == text:
                break
            if strip_squarebrackets(rowtext) == strip_squarebrackets(text):
                # Fix this
                print(
                    "Issue #{id}'s bug blog text has differing autocard notation."
                    .format(id=issue.number))
                old_bbt = strings.get_body_field(issue.body, 'Bug Blog Text')
                body = re.sub(BBT_REGEX,
                              'Bug Blog Text: {0}'.format(rowtext),
                              issue.body,
                              flags=re.MULTILINE)
                new_bbt = strings.get_body_field(body, 'Bug Blog Text')
                issue.edit(body=body)
                print('Updated to `{0}`'.format(rowtext))
                issue.create_comment(
                    f'Changed bug blog text from `{old_bbt}` to `{new_bbt}`')
                break
        else:
            print('{id} is fixed!'.format(id=issue.number))
            repo.create_comment(
                issue, 'This bug has been removed from the bug blog!')
            issue.edit(state='closed')
Example No. 23
def ads_local_parser(body: element.Tag) -> List[Dict]:
    """Localized ADs"""
    data = []
    for elm in body.find_all('li', attrs={'class': re.compile("^ads-")}):
        row = element_to_dict(elm, category='ads-text')
        data.append(row)
    return data
Example No. 24
 def __init__(self, name: str, title: str, td: Tag):
     super().__init__(title)
     img = td.find('img')
     if 'NoPhoto' in img.attrs.get('src', ''):
         self.link: OptionalStr = None
     else:
         self.link = img.attrs.get('zoomimg')
Example No. 25
def ebook_parser(body: element.Tag) -> List[Dict]:
    data = []
    for elm in body.find_all('g-expandable-content',
                             attrs={
                                 'jscontroller': True,
                                 'jsaction': True,
                                 'jsshadow': True,
                                 'aria-hidden': True,
                                 'data-eb': True,
                                 'data-mt': True,
                                 'data-quie': True,
                                 'data-ved': True
                             }):
        for div in elm.find_all('div',
                                recursive=True,
                                attrs={
                                    'class': True,
                                    'jsname': True,
                                    'role': 'button',
                                    'aria-haspopup': True,
                                    'tabindex': True,
                                    'jsaction': True
                                }):
            for e in div.find_all('div', text=True):
                category = 'organic'
                if e.text == 'Google Play Books':
                    category = 'link-google_play_books'
                row = element_to_dict(div, category=category)
                data.append(row)
    return data
Example No. 26
def _parse_row(row: element.Tag, header: list) -> Dict:
    """
    Parses an individual row of the table and returns a dictionary keyed by the header
    """
    info_list = [item.text for item in row.find_all('td')]
    info = dict(zip(header, info_list))
    return info
Example No. 27
def parsePcThreadHeader(liTag: Tag):
    import json
    metadata = json.loads(liTag.attrs.get("data-field"))
    thread = ThreadHeader()
    thread.kz = metadata.get("id")
    thread.author_name = metadata.get("author_name")
    thread.author_nickname = metadata.get("author_nickname")
    thread.author_portrait = metadata.get("author_portrait")
    thread.first_post_id = metadata.get("first_post_id")
    thread.reply = metadata.get("reply_num")
    thread.bakan = metadata.get("is_bakan")
    thread.vid = metadata.get("vid")
    thread.good = metadata.get("is_good")
    thread.top = metadata.get("is_top")
    thread.protal = metadata.get("is_protal")
    thread.membertop = metadata.get("is_membertop")
    thread.multi_forum = metadata.get("is_multi_forum")
    thread.frs_tpoint = metadata.get("frs_tpoint")
    titleATag = liTag.select_one(".j_th_tit a")
    if titleATag:
        thread.title = titleATag.text
    thread.mod_date = datetime.now()
    if not thread.kz:
        thread = None
    return thread
Example No. 28
 def elementClass(self, name, namespace):
     if namespace is not None:
         warnings.warn(
             "BeautifulSoup cannot represent elements in any namespace",
             DataLossWarning)
     return Element(Tag(self.soup, self.soup.builder, name), self.soup,
                    namespace)
Example No. 29
def check_for_missing_bugs(b: Tag) -> None:
    for row in b.find_all('tr'):
        data = row.find_all('td')
        row_text = data[1].text.strip()
        if row_text == 'Description':
            # BS4 is bad.
            continue
        issue = find_issue_by_code(row_text)
        if issue:
            labels = [c.name for c in issue.labels]
            categories = [c for c in labels if c in strings.METACATS]
            if categories:
                continue
            bbcat = re.match(strings.REGEX_BBCAT, data[2].text.strip())
            if bbcat is None:
                continue
            g1 = bbcat.group(1).strip()
            if g1 in strings.METACATS:
                issue.add_to_labels(g1)
                continue
            if bbcat.group(2) is not None:
                g2 = bbcat.group(2).strip()
                if g2 in strings.METACATS:
                    issue.add_to_labels(g2)
                    continue
            print(f'Unknown BBCat: {bbcat.group(0)}')
            continue
        print('Could not find issue for `{row}`'.format(row=row_text))
        text = 'From Bug Blog.\nBug Blog Text: {0}'.format(row_text)
        repo.get_repo().create_issue(strings.remove_smartquotes(row_text),
                                     body=strings.remove_smartquotes(text),
                                     labels=['From Bug Blog'])
Example No. 30
    def get_rss_item_tags(self, item: Tag) -> list[str]:
        tags = item.find(self.rss_tags_name)

        if tags is not None:
            return [tag.strip() for tag in tags.text.split(",")]

        return []
Example No. 31
File: doc.py Project: denene12/bot2
    def _match_end_tag(tag: Tag) -> bool:
        """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
        for attr in SEARCH_END_TAG_ATTRS:
            if attr in tag.get("class", ()):
                return True

        return tag.name == "table"
Example No. 32
def getRecommendationGrade(recommendation: Tag) -> str:
    """Extract the grade for a given recommendation.
    
    Arguments:
        recommendation {Tag} -- Recommendation tag to be parsed.
    
    Raises:
        Exception -- Raised when grade is not found.
    
    Returns:
        str -- Grade of the recommendation.
    """

    # Change if additional grades are added
    possible_grades = ['A', 'B', 'C', 'E']

    # Extract 'strong' type objects
    strong_objects = recommendation.find_all('strong')

    # Only extract if grade
    for strong in strong_objects:
        if strong.text in possible_grades:
            return strong.text

    # No recommendation grade found
    raise Exception
Example No. 33
def filter_parser(body: element.Tag) -> List[Dict]:
    """Checks for filters"""
    data = []
    for elm in body.find_all(attrs={"role": "button", "aria-pressed": True}):
        row = element_to_dict(elm, category='link-filter')
        data.append(row)
    return data
Example No. 34
def extractinfo(info: Tag) -> None:
    if info is not None:
        workid = str(info['href']).split('=')[1]
        ps = info.find_all('p')
        worktitle = str(ps[0]).strip('<p>/')
        workauthor = str(ps[1]).strip('<p>/')
        wrks.append(Workinfo(workid, worktitle, workauthor))
Example No. 35
def embed_css_files(soup):
    """ Look for <link> elements of CSS and embed them if they are local files"""
    # <link href="..." rel="stylesheet" type="text/css"/>
    for link in list(
            soup.findAll('link', attrs={
                'rel': 'stylesheet',
                'href': True
            })):
        href = link.attrs['href']
        if href.startswith('/'):  # not on windows?
            logger.info('Embedding %r' % href)
            data = open(href).read()
            style = Tag(name='style')
            style.attrs['type'] = 'text/css'
            style.string = data
            link.replace_with(style)
Example No. 36
    def _get_report_type_and_subject_variants(form: Tag):
        """
        В форме на сервере содержаться два выпадающих списка
        Для формирования отчета необходимо отправить на сервер
        все варианты пар значений из первого поля и из второго,
        то есть таким образом данный метод формирует декартово
        произведение множеств из элементов этих списков.

        :param form: форма, в которой будем искать <select>`ы
        :type form: Tag
        :rtype: list
        """

        from itertools import product

        select_subject_attr_name_value = setting['base-param-for-form']['name-field-select-subject']
        select_report_type_attr_name_value = setting['base-param-for-form']['name-field-select-report']

        # now pull the data out of the <select> elements:
        # one <select> holds the report types
        select_report_type = form.find(attrs={'name': select_report_type_attr_name_value})
        # the other holds the Lukoil subdivisions
        select_subject = form.find(attrs={'name': select_subject_attr_name_value})
        # check both before touching them, otherwise find_all below would fail first
        assert select_report_type and select_subject, 'Could not find the <select> tags'

        options_select_report_type = select_report_type.find_all('option')
        select_subject_report_type = select_subject.find_all('option')

        selects = []
        # selects holds a list of dicts with two keys, ctl00$ContentPlaceHolder1$ddlTip and
        # ctl00$ContentPlaceHolder1$Subjects, and the matching values - i.e. the combinations
        # of the two selects, the Cartesian product of the two sets.

        for type_option, subject in product(options_select_report_type, select_subject_report_type):
            selects.append({
                select_report_type['name']: type_option['value'],
                select_subject['name']: subject['value']
            })

        return selects
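
The Cartesian-product step is easier to see in isolation. A minimal sketch with hypothetical option values (the field names come from the comment above, the values are made up):

    from itertools import product

    report_types = ['daily', 'monthly']
    subjects = ['unit-1', 'unit-2', 'unit-3']
    payloads = [
        {'ctl00$ContentPlaceHolder1$ddlTip': t, 'ctl00$ContentPlaceHolder1$Subjects': s}
        for t, s in product(report_types, subjects)
    ]
    # 2 x 3 = 6 payload dicts, one request per combination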
Example No. 37
 def create_question(self, element: Tag) -> Question:
     id = element.attrs["id"]
     title_element = element.select(Scrapper.QUESTION_TITLE)[0]
     title = title_element.text
     link = Scrapper.WEBSITE + title_element.attrs["href"]
     user_name = element.select(Scrapper.USER_NAME)[0].text
     reputation_string = element.select(Scrapper.REPUTATION)[0].text.replace(",", "").replace("k", "000")
     reputation = float(reputation_string)
     tags = [i.text for i in element.select(Scrapper.TAG)]
     votes = int(element.select(Scrapper.VOTES)[0].text)
     answers = int(element.select(Scrapper.ANSWERS)[0].text)
     views = int(element.select(Scrapper.VIEWS)[0].text.replace("views", ""))
     time = element.select(Scrapper.TIME)[0].text
     user = User(user_name, reputation)
     return Question(id, title, user, link, votes, answers, views, tags, time)
Example No. 38
 def get_xml(self):
     xml = u'<?xml version="1.0" encoding="UTF-8" ?>\n'
     root = Tag(name=self.task)
     text = Tag(name='TEXT')
     text.append(CData(self.text()))
     tags = self.TAGS
     tokens = (BS(
         self.tokenizer.get_tokenized_as_xml().encode('utf-8'),
         'xml'
     )).TOKENS
     elements = [u'\n', text, u'\n', tags, u'\n', tokens, u'\n']
     for element in elements:
         if element: # if missing tags, system will crash
             root.append(element)
     xml += unicode(root)
     return xml
Example No. 39
def convert(tag: Tag, field_type: Type[Union[str, int]] = str, strip_comma=False):
    result = tag.find('div', class_='field-item')
    if result:
        result = result.string
    elif tag.div and 'field-label-inline' in tag.div.get('class'):
        result = tag.div.contents[-1].strip()
    else:
        try:
            result = tag.contents[-1].strip()
        except AttributeError:
            result = None
    if field_type == str:
        if result == '-' or result == '?':
            result = None
    else:
        if strip_comma:
            result = result.replace(',', '')
        try:
            result = field_type(result)
        except ValueError:
            result = None
    return result
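
A self-contained usage sketch of convert, feeding it <td> cells like the ones the parsers above pass in:

    from bs4 import BeautifulSoup

    td = lambda html: BeautifulSoup(html, 'html.parser').td
    convert(td('<td>1,200</td>'), int, strip_comma=True)   # -> 1200
    convert(td('<td>-</td>'))                              # -> None ('-' means no data)
    convert(td('<td>?</td>'), int)                         # -> None (ValueError is swallowed)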
Example No. 40
def parse_weapon_tag(tag: Tag, weapon_type):
    cells = tag.find_all('td')
    weapon = dict(name=cells[1].strong.a.string,
                  weapon_type=weapon_type.value,
                  required_level=convert(cells[2], int),
                  required_stats=convert(cells[3]),
                  weapon_attack=convert(cells[4], int),
                  attack_speed=convert(cells[5]),
                  job=convert(cells[6]),
                  effects=convert(cells[7]),
                  available_upgrades=convert(cells[8], int),
                  sold_for=convert(cells[9]),
                  dropped_by=convert(cells[10]),
                  available_from=None,
                  remarks=None)

    try:
        tmp = weapon['sold_for'].index(' ')
        weapon['sold_for'] = int(weapon['sold_for'][:tmp].replace(',', ''))
    except ValueError:
        weapon['sold_for'] = 0
    return weapon
Example No. 41
 def setProperty(self, name, value):
     if not isinstance(name, str) or not isinstance(value, str):
         print_error('key and value must be str')
         return False
     if not verify(name, value):
         return False
     ele = self.soup.find(name=NAME, text=name)
     if ele is None:
         # the property does not exist yet: create it
         p = Tag(name='property')
         n = Tag(name=NAME)
         n.string = name
         v = Tag(name=VALUE)
         v.string = value
         p.append(n)
         p.append(v)
         configuration_tag = self.soup.find('configuration')
         configuration_tag.append(p)
     else:
         # the property exists: just update its value
         ele.parent.find(VALUE).string = value
     return True
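
NAME and VALUE are module-level constants that are not shown here, presumably the literal tag names. Under that assumption a call appends a <property> block to the <configuration> element, roughly:

    NAME = 'name'    # assumed
    VALUE = 'value'  # assumed
    # conf.setProperty('fs.defaultFS', 'hdfs://localhost:9000') would append
    # <property><name>fs.defaultFS</name><value>hdfs://localhost:9000</value></property>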
Example No. 42
 def extract_by_class(cls, review: Tag, class_name: str) -> Tag:
     return review.find(class_=class_name)