Exemple #1
0
    def parse_flat(self, html: Element) -> None:  # noqa: CCR001
        """Parse a single flat listing and store it when it is new.

        Reads the listing link, the location labels, the total price and
        the price per square metre from the element, derives the area as
        ``price / ppm`` rounded to two decimals, and creates a ``Flat``
        record followed by ``commit()`` unless ``Flat.exists`` already
        reports that id. Any exception raised along the way is printed
        and answered with ``rollback()``; nothing is re-raised.

        :param html: Element containing one flat listing
        """

        def to_int(text, unit):
            # Remove the unit marker and every space before converting.
            return int(text.replace(unit, "").strip().replace(" ", ""))

        try:
            href = html.find("a", first=True).attrs.get("href")
            listing_id = int(re.search(r"flat/(\d+)", href).group(1))

            labels = html.xpath(".//a[@data-name='GeoLabel']/text()")
            # On the "ekb" domain the first label is dropped before the
            # remaining ones are split into city / district / rest.
            if self.domain == "ekb":
                labels = labels[1:]
            city, district, *street_parts = labels
            address = " ".join(street_parts)

            price_text = html.xpath(
                ".//span[@data-mark='MainPrice']/text()", first=True
            )
            total_price = to_int(price_text, "₽")
            ppm_text = html.xpath(
                ".//p[@data-mark='PriceInfo']/text()", first=True
            )
            price_per_metre = to_int(ppm_text, "₽/м²")
            area = round(total_price / price_per_metre, 2)

            # Already stored: nothing to create or commit.
            if Flat.exists(id=listing_id):
                return
            Flat(
                id=listing_id,
                city=city,
                district=district,
                location=address,
                price=total_price,
                ppm=price_per_metre,
                square=area,
            )
            commit()
        except Exception as exc:
            print(exc)
            rollback()
Exemple #2
0
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    """Yield the processed pronunciations found under an XPath selector.

    For every element matched by ``ipa_xpath_selector`` the text is
    searched with ``config.ipa_regex``; the first capture group, with
    parentheses stripped, is the candidate pronunciation. Candidates
    rejected by ``_skip_pron`` are dropped, and so are candidates for
    which processing raises ``IndexError`` (logged at INFO level) or
    returns a falsy result.

    :param request_html: Element whose descendants are searched
    :param ipa_xpath_selector: XPath selecting the pronunciation elements
    :param config: Supplies ``ipa_regex``, ``skip_spaces_pron``,
        ``process_pron`` and ``language``
    """
    for element in request_html.xpath(ipa_xpath_selector):
        match = re.search(config.ipa_regex, element.text)
        if match is None:
            continue
        # Strip the parentheses themselves but keep what they enclose.
        text = match.group(1).replace("(", "").replace(")", "")
        if _skip_pron(text, config.skip_spaces_pron):
            continue
        try:
            # Processing operates on the NFD-normalized form.
            text = unicodedata.normalize("NFD", text)
            text = config.process_pron(text)
        except IndexError:
            logging.info(
                "IndexError encountered processing %s during scrape of %s",
                text,
                config.language,
            )
            continue
        if not text:
            continue
        if not config.skip_spaces_pron:
            # The segments package inserts a # in-between spaces.
            text = text.replace(" #", "")
        yield text
def movements(process: requests_html.Element) -> List[Dict]:
    """Collect the movement entries from the table rows of an element.

    Each ``<tr>`` found contributes one dict whose ``'data'`` value is the
    text of the row's first ``<td>`` and whose ``'movimento'`` value is the
    concatenated text of the remaining ``<td>`` cells.

    Rows that contain no ``<td>`` cell at all (for instance a header row
    made of ``<th>`` cells) are skipped instead of raising ``IndexError``.

    :param process: Element containing the movements table
    :return: One ``{'data': ..., 'movimento': ...}`` dict per usable row
    """
    result = []
    for row in process.xpath('//tr'):
        # NOTE(review): '//td' is an absolute XPath; this presumably relies
        # on each row element evaluating it against its own markup only --
        # confirm against the requests_html version in use.
        cells = [col.text for col in row.xpath('//td')]
        if not cells:
            # Nothing to index into; a cell-less row carries no movement.
            continue
        first, *rest = cells
        result.append({'data': first, 'movimento': ''.join(rest)})
    return result
def parts(process_parts: requests_html.Element) -> List[List[Dict]]:
    """Extract ``label: value`` pairs from every table row of an element.

    The text of each ``<tr>`` has its non-breaking spaces removed and every
    label that is followed by a line break is joined with the next line.
    Each resulting line is then split at its first colon into a one-entry
    dict ``{label: value}`` with the value stripped of surrounding
    whitespace.

    Lines without a colon (including the single empty line of an empty
    row) are skipped instead of raising ``IndexError``, and a value that
    itself contains a colon is kept whole rather than cut at that colon.

    :param process_parts: Element containing the parts table
    :return: One list per row, each holding the row's one-entry dicts
    """
    result = []
    for row in process_parts.xpath('//tr'):
        text = row.text.replace('\xa0', '').replace(':\n', ':')
        data = []
        for line in text.split('\n'):
            label, separator, value = line.partition(':')
            if not separator:
                # No colon on this line, so there is no label/value pair.
                continue
            data.append({label: value.strip()})
        # One entry per row, even when the row produced no pairs.
        result.append(data)
    return result
def general_data(process_general_data: requests_html.Element) -> Dict:
    """Extract the known general-data fields from an element's table rows.

    For each field name in a fixed list, the first ``<tr>`` whose string
    value contains that name is looked up. Its text is cut at the first
    ``': '`` or ``':\\n'`` separator; the part before becomes the key and
    the part after becomes the value.

    Rows whose text holds no such separator are skipped instead of raising
    ``IndexError``, and a value that itself contains a separator is kept
    whole rather than cut at it.

    :param process_general_data: Element containing the general-data table
    :return: Mapping of the label found in each row to its value
    """
    names = [
        'Classe', 'Área', 'Assunto', 'Distribuição', 'Juiz', 'Relator',
        'Valor da ação'
    ]
    result = {}
    for name in names:
        field = process_general_data.xpath(
            f"//tr[contains(string(), '{name}')]", first=True)
        if not field:
            continue
        text = field.text
        # The label ends at whichever separator form appears first.
        cuts = [
            pos for pos in (text.find(': '), text.find(':\n')) if pos != -1
        ]
        if not cuts:
            # Row matched the name but carries no "label: value" shape.
            continue
        cut = min(cuts)
        # Both separator forms are two characters long.
        result[text[:cut]] = text[cut + 2:]
    return result
Exemple #6
0
def yield_pron(
    request_html: requests_html.Element,
    ipa_xpath_selector: str,
    config: "Config",
) -> "Iterator[Pron]":
    """Yield the processed pronunciations found under an XPath selector.

    For every element matched by ``ipa_xpath_selector`` the text is
    searched with ``config.ipa_regex``; the first capture group, with
    parentheses stripped, is the candidate pronunciation. Candidates
    rejected by ``_skip_pron`` are dropped, as are those for which
    ``config.process_pron`` returns a falsy result.

    :param request_html: Element whose descendants are searched
    :param ipa_xpath_selector: XPath selecting the pronunciation elements
    :param config: Supplies ``ipa_regex`` and ``process_pron``
    """
    for element in request_html.xpath(ipa_xpath_selector):
        match = re.search(config.ipa_regex, element.text)
        if match is None:
            continue
        # Strip the parentheses themselves but keep what they enclose.
        candidate = match.group(1).replace("(", "").replace(")", "")
        if _skip_pron(candidate):
            continue
        processed = config.process_pron(candidate)
        if not processed:
            continue
        yield processed