Python SearchItem Beispiele, search_engine_parser.core.base.SearchItem Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: myanimelist.py Projekt: buddhhu/search-engine-parser

    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Parse one result row into a ``SearchItem``.

        :param single_result: single result row; must contain an ``<a class="fw-b">``
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``; for
            ``ReturnType.FULL`` also ``episode_count``, ``animetypes`` and ``ratings``
        :rtype: SearchItem
        """
        rdict = SearchItem()
        link_tag = single_result.find("a", class_="fw-b")

        # Always compare against members of the ReturnType class itself; looking
        # the members up on the passed-in value breaks for non-enum arguments.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = link_tag.find("strong").text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            rdict["links"] = link_tag.get("href")

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            rdict["descriptions"] = single_result.find("div", class_="pt4").text.strip()

        if return_type == ReturnType.FULL:
            # The first three ``td.ac`` cells are read, in order, as the anime
            # type, the episode count and the score.
            cells = single_result.find_all("td", class_="ac")
            animetype = cells[0].text.strip()
            episodes = cells[1].text.strip()
            score = cells[2].text.strip()

            rdict.update(
                {"episode_count": episodes, "animetypes": animetype, "ratings": score}
            )
        return rdict

Beispiel #2

0

Datei anzeigen

Datei: myanimelist.py Projekt: devajithvs/search-engine-parser

 def parse_result(self, results):
     """
     Feed the rows belonging to the current page through parse_single_result
     and gather their fields.

     :param results: rows produced by the main search, header row first
     :type results: list[`bs4.element.ResultSet`]
     :returns: item mapping every key produced by parse_single_result to the
         list of values collected for that key
     :rtype: SearchItem
     """
     collected = SearchItem()
     for position, row in enumerate(results):
         # Position 0 is the table header and is never a result.
         if position <= 0:
             continue
         # Only the ten rows that make up the current page are of interest.
         page_offset = self.page * 10
         if position < page_offset + 1 or position > page_offset + 10:
             continue
         try:
             parsed = self.parse_single_result(row)
             # Start a new list the first time a key shows up, otherwise
             # extend the list that is already there.
             for field in parsed:
                 if field in collected.keys():
                     collected[field].append(parsed[field])
                 else:
                     collected[field] = [parsed[field]]
         except Exception:  # pylint: disable=invalid-name, broad-except
             # Rows that fail to parse are skipped on purpose.
             continue
     return collected

Beispiel #3

0

Datei anzeigen

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Pull the requested fields out of one result container.

        :param single_result: single result found in <div class="g">
        :type single_result: `bs4.element.Tag`
        :param return_type: selects which of title, link and description are parsed
        :return: the parsed fields, or ``None`` when the container is skipped
        :rtype: SearchItem
        """
        # Containers carrying these marker elements (e.g. suggestions) are not
        # regular results and are ignored.
        suggestion_like = (single_result.find("h2", class_="wITvVb")
                           and single_result.find("div", class_="LKSyXe"))
        if suggestion_like or single_result.find("div", class_="X7NTVe"):
            return None

        item = SearchItem()
        boxes = single_result.find_all('div', class_='kCrYT')
        if len(boxes) < 2:
            return None

        def record_link(anchor):
            """Store the absolute URL of *anchor* and its cleaned form."""
            href = anchor.get('href')
            absolute = urljoin(self.base_url, href)
            item['raw_urls'] = absolute
            item['links'] = self.clean_url(absolute)

        # The first box normally holds both the title and the URL.
        container = boxes[0]

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            anchor = container.find('a')
            if not anchor:
                # No anchor in the first box: the title lives in the second
                # one, which also becomes the place to look for the link.
                container = boxes[1]
                heading = container.find('div', class_='BNeawe').text
            else:
                heading = anchor.find('h3').text
            item['titles'] = heading

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            anchor = container.find('a')
            if anchor:
                record_link(anchor)

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            # The second box normally holds the description.
            description_box = boxes[1]
            link_missing = (return_type in (ReturnType.FULL, ReturnType.LINK)
                            and not item.get('links'))
            if link_missing:
                anchor = description_box.find('a')
                if anchor:
                    # The boxes are swapped: the link sits in the second box,
                    # so the description is taken from the first.
                    description_box = boxes[0]
                    record_link(anchor)
            item['descriptions'] = description_box.text
        return item

Beispiel #4

0

Datei anzeigen

Datei: googlescholar.py Projekt: sashka3076/search-engine-parser

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div class="gs_r gs_or gs_scl">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``links``, ``descriptions`` and ``titles``; for
            ``ReturnType.FULL`` also ``result_types`` and ``files_links``.
            Optional pieces that are absent are stored as ``''``.
        :rtype: SearchItem
        """
        rdict = SearchItem()
        r_elem = single_result.find('h3', class_='gs_rt')

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = r_elem.find('a')
            rdict["links"] = link_tag.get('href') if link_tag else ''

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc = single_result.find('div', class_='gs_rs')
            rdict["descriptions"] = desc.text if desc else ''

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            # Drop leading bracketed markers such as "[PDF] " or "[BOOK][B] ".
            # The pattern must be a group of bracketed words: a character class
            # would also swallow the first ordinary word of an unmarked title.
            rdict["titles"] = re.sub(r'^(?:\[\w+\])+ ', '', r_elem.text)

        if return_type == ReturnType.FULL:
            t_elem = single_result.find('span', class_='gs_ct1')
            result_type = t_elem.text if t_elem else ''

            # The file link is the anchor inside the ``gs_or_ggsm`` element,
            # not the title anchor (which is already reported under "links").
            file_link = ''
            f_elem = single_result.find('div', class_='gs_or_ggsm')
            if f_elem:
                flink_tag = f_elem.find('a')
                if flink_tag:
                    file_link = flink_tag.get('href')

            rdict.update({
                "result_types": result_type,
                "files_links": file_link
            })

        return rdict

Beispiel #5

0

Datei anzeigen

Datei: aol.py Projekt: devajithvs/search-engine-parser

    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div class="algo-sr">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        """
        rdict = SearchItem()
        # Title and link both come from the anchor inside the first <h3>.
        link_tag = single_result.find('h3').find('a')

        # Compare against members of the ReturnType class itself rather than
        # attributes looked up on the passed-in value.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = link_tag.text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            rdict["links"] = link_tag.get("href")

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            caption = single_result.find('div', class_='compText aAbs')
            desc = caption.find('p', class_='lh-16')
            rdict["descriptions"] = desc.text

        return rdict

Beispiel #6

0

Datei anzeigen

    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div id="r1-{id}">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        :raises IndexError: if the result link carries no ``uddg`` parameter
        """
        # Imported here so the method does not depend on the module's import block.
        from urllib.parse import unquote

        rdict = SearchItem()

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            h2 = single_result.find(
                'h2', class_="result__title")  # pylint: disable=invalid-name
            rdict["titles"] = h2.text.strip()

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = single_result.find('a', class_="result__url")
            # The target URL is carried percent-encoded in the ``uddg`` query
            # parameter of the href; the value ends at the next "&".
            raw_link = self.base_url + link_tag.get('href')
            encoded_target = re.findall(r"uddg=([^&]+)", raw_link)[0]
            # Decode every percent-escape, not just a hand-picked few.
            rdict["links"] = unquote(encoded_target)

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc = single_result.find(class_='result__snippet')
            rdict["descriptions"] = desc.text

        return rdict

Beispiel #7

0

Datei anzeigen

Datei: googlenews.py Projekt: devajithvs/search-engine-parser

    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
        """
        Parse one news result container into a ``SearchItem``.

        :param single_result: single result found in <div class="g">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :return: the requested ``titles``, ``links`` and ``descriptions``; for
            ``ReturnType.FULL`` also ``image_url``, ``news_source`` and ``date``
        :rtype: SearchItem
        """
        rdict = SearchItem()

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = single_result.find('h3').text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = single_result.find('a')
            # Stored under "links", the key the other fields' plural naming
            # ("titles", "descriptions") calls for.
            rdict["links"] = link_tag.get('href')

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc_tag = single_result.find('div', class_='st')
            rdict["descriptions"] = desc_tag.text

        if return_type == ReturnType.FULL:
            img_tag = single_result.find('img', class_='th')
            news_source_tag = single_result.find('span', class_='e8fRJf')
            date_tag = single_result.find('span', class_='f')

            rdict["image_url"] = img_tag.get('src')
            rdict["news_source"] = news_source_tag.text
            rdict["date"] = date_tag.text
        return rdict

Beispiel #8

0

Datei anzeigen

Datei: ask.py Projekt: sashka3076/search-engine-parser

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div class="PartialSearchResults-item">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        """

        rdict = SearchItem()
        # Each field is guarded by its own return type, so a TITLE-only request
        # yields just the title and LINK / DESCRIPTION requests are honoured.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = single_result.find('a').text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            rdict["links"] = single_result.a["href"]

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            rdict["descriptions"] = single_result.find(
                'p', class_="PartialSearchResults-item-abstract").text

        return rdict

Beispiel #9

0

Datei anzeigen

Datei: google.py Projekt: ra2003/search-engine-parser

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div class="g">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        """
        results = SearchItem()
        # Title and link are both read from the ``div.r`` element.
        r_elem = single_result.find('div', class_='r')

        # Compare against members of the ReturnType class itself rather than
        # attributes looked up on the passed-in value.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            h3_tag = r_elem.find('h3')
            title = h3_tag.text
            if not title:
                # Empty heading text: fall back to the nested ``div.ellip``.
                title = h3_tag.find('div', class_='ellip').text
            results['titles'] = title

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = r_elem.find('a')
            results['links'] = link_tag.get('href')

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc = single_result.find('span', class_='st')
            results['descriptions'] = desc.text

        return results

Beispiel #10

0

Datei anzeigen

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <li class="b_algo">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        """
        rdict = SearchItem()
        # Title and link both come from the anchor inside the first <h2>.
        link_tag = single_result.find('h2').find('a')

        # Compare against members of the ReturnType class itself rather than
        # attributes looked up on the passed-in value.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = link_tag.text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            rdict["links"] = link_tag.get('href')

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            caption = single_result.find('div', class_='b_caption')
            desc = caption.find('p')
            rdict["descriptions"] = desc.text

        return rdict

Beispiel #11

0

Datei anzeigen

    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div class="Sr">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        :raises IndexError: if the result link has no ``/RU=…/RK`` segment
        """
        # Imported here so the method does not depend on the module's import block.
        from urllib.parse import unquote

        rdict = SearchItem()
        h3_tag = single_result.find('h3', class_='title')

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = h3_tag.text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = h3_tag.find('a')
            raw_link = link_tag.get('href')
            # The target URL sits percent-encoded between "/RU=" and "/RK".
            encoded_target = re.findall(r"/RU=(.+)/RK", raw_link)[0]
            # Decode every percent-escape regardless of hex-digit case, not
            # only lowercase "%3a" and "%2f".
            rdict["links"] = unquote(encoded_target)

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc = single_result.find('p', class_='lh-16')
            rdict["descriptions"] = desc.text

        return rdict

Beispiel #12

0

Datei anzeigen

Datei: yandex.py Projekt: devajithvs/search-engine-parser

    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <li class="serp-item">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        """
        rdict = SearchItem()
        h3_tag = single_result.find('div', class_="organic__url-text")

        # Compare against members of the ReturnType class itself rather than
        # attributes looked up on the passed-in value.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            # Keep only the text before a "Read more" marker; without the
            # marker the whole text is kept.
            rdict["titles"] = h3_tag.text.partition("Read more")[0]

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = single_result.find('a')
            rdict["links"] = link_tag.get('href')

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc = single_result.find('div', class_="organic__content-wrapper")
            rdict["descriptions"] = desc.text
        return rdict

Beispiel #13

0

Datei anzeigen

    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div class="summary">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        """
        rdict = SearchItem()
        h3 = single_result.find('h3')  # pylint: disable=invalid-name
        link_tag = h3.find('a')

        # Compare against members of the ReturnType class itself rather than
        # attributes looked up on the passed-in value.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = link_tag.text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            # The href is appended to the engine's base URL.
            rdict["links"] = self.base_url + link_tag.get('href')

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            caption = single_result.find('div', class_='excerpt')
            rdict["descriptions"] = caption.text
        return rdict

Beispiel #14

0

Datei anzeigen

Datei: baidu.py Projekt: sashka3076/search-engine-parser

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in div with a numeric id
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``; the
            description is ``''`` when the result has no abstract element
        :rtype: SearchItem
        """
        rdict = SearchItem()
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            h3_tag = single_result.find('h3')
            # Plural key, matching "links" and "descriptions" below.
            rdict["titles"] = h3_tag.text

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = single_result.find('a')
            rdict["links"] = link_tag.get('href')

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc = single_result.find('div', class_='c-abstract')
            # Store the element's text, not the element object itself.
            rdict["descriptions"] = desc.text if desc else ''

        # Returned for every return type, not only when a description was requested.
        return rdict

Beispiel #15

0

Datei anzeigen

    def parse_single_result(self, single_result, return_type=ReturnType.FULL, **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        :param single_result: single result found in <div id="r1-{id}">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions``
        :rtype: SearchItem
        """

        rdict = SearchItem()

        # Compare against members of the ReturnType class itself rather than
        # attributes looked up on the passed-in value.
        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            h2 = single_result.find(
                'h2', class_="result__title")  # pylint: disable=invalid-name
            rdict["titles"] = h2.text.strip()

        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link_tag = single_result.find('a', class_="result__a")
            rdict["links"] = link_tag.get('href')

        if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
            desc = single_result.find(class_='result__snippet')
            rdict["descriptions"] = desc.text

        return rdict

Beispiel #16

0

Datei anzeigen

Datei: youtube.py Projekt: devajithvs/search-engine-parser

    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
        """
        Parse one result container into a ``SearchItem``.

        The single-video layout is tried first; if any lookup in it fails, a
        fallback based on the ``/playlist`` and ``/user`` anchors is used.

        :param single_result: single result found in <ytd-video-renderer class="style-scope">
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :return: the requested ``titles``, ``links`` and ``descriptions``; for
            ``ReturnType.FULL`` also ``channels`` and, in the single-video
            layout, ``durations``, ``views`` and ``upload_dates``
        :rtype: SearchItem
        """
        rdict = SearchItem()
        # pylint: disable=too-many-locals
        title_tag = single_result.find('a', class_='yt-uix-tile-link')

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            rdict["titles"] = title_tag.text

        # try for single videos
        try:
            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.get('href')
                link = self.base_url + ref_link
                rdict["links"] = link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc = single_result.find('div',
                                          class_="yt-lockup-description").text
                rdict["descriptions"] = desc

            if return_type == ReturnType.FULL:
                duration = single_result.find(
                    'span', class_='accessible-description').text
                ul_tag = single_result.find('ul', class_='yt-lockup-meta-info')

                channel_name = single_result.find(
                    'a', class_='yt-uix-sessionlink spf-link').text
                views_and_upload_date = ul_tag.find_all('li')
                upload_date = views_and_upload_date[0].text
                views = views_and_upload_date[1].text
                rdict.update({
                    "channels": channel_name,
                    "durations": duration,
                    "views": views,
                    "upload_dates": upload_date,
                })
        # Exception rather than BaseException, so KeyboardInterrupt and
        # SystemExit are not swallowed by the fallback.
        except Exception:  # pylint: disable=broad-except
            link_tags = single_result.find_all(
                'a', class_='yt-uix-sessionlink spf-link')
            # TODO Optimize calls here so that we don't assign ref_link and channel_name
            # when we don't need them
            # NOTE: ref_link / channel_name keep whatever the try block already
            # assigned unless a matching anchor overrides them here; if neither
            # happened, using them below raises UnboundLocalError.
            for tag in link_tags:
                # Anchors without an href are skipped instead of crashing.
                href = tag.get("href") or ""
                if href.startswith("/playlist"):
                    ref_link = href
                elif href.startswith("/user"):
                    channel_name = tag.text
            if return_type in (ReturnType.FULL, ReturnType.LINK):
                link = self.base_url + ref_link
                rdict["links"] = link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc = single_result.find('span',
                                          class_='accessible-description').text
                rdict["descriptions"] = desc
            if return_type == ReturnType.FULL:
                rdict.update({
                    "channels": channel_name,
                })
        return rdict

Beispiel #17

0

Datei anzeigen

Datei: github.py Projekt: sashka3076/search-engine-parser

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Parse one result container into a ``SearchItem``.

        The layout that is parsed depends on ``self.type``: ``None`` or
        ``"Repositories"``, ``"Users"``, ``"Wikis"``, ``"Topics"``,
        ``"Marketplace"``, ``"RegistryPackages"``, ``"Issues"`` or
        ``"Commits"``. Any other value yields an empty item.

        :param single_result: single result found in container element
        :type single_result: `bs4.element.Tag`
        :param return_type: which fields to extract, defaults to ``ReturnType.FULL``
        :param kwargs: accepted for interface compatibility, not used here
        :return: the requested ``titles``, ``links`` and ``descriptions`` plus,
            for ``ReturnType.FULL``, the extra fields of the selected layout
        :rtype: SearchItem
        """
        rdict = SearchItem()
        if self.type in (None, "Repositories"):
            h3 = single_result.find('div', class_='f4')  # pylint: disable=invalid-name
            link_tag = h3.find('a')
            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                rdict["titles"] = link_tag.text

            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = link_tag.get('href')
                rdict["links"] = self.base_url + ref_link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc = single_result.find('p', class_="mb-1")
                rdict["descriptions"] = desc.text

            if return_type == ReturnType.FULL:
                stars_and_lang_div = single_result.find('div', class_='d-flex')
                lang = stars_and_lang_div.find('span',
                                               itemprop="programmingLanguage")
                stars = single_result.find('div', class_='mr-3').find('a')
                updated_on = single_result.find("relative-time").get("title")
                rdict.update({
                    "stars": "" if not stars else stars.text.strip(),
                    "languages": lang.text if lang else "",
                    "updated_on": updated_on,
                })

        if self.type == "Users":
            title_tag = single_result.find('div', class_='f4')
            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                rdict["titles"] = title_tag.text

            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.find('a').get('href')
                rdict["links"] = self.base_url + ref_link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc_tag = single_result.find('p', class_='mb-1')
                desc = None
                if desc_tag:
                    desc = desc_tag.text.strip(' \n')
                rdict["descriptions"] = desc

            if return_type == ReturnType.FULL:
                location_div = single_result.find('div', class_='d-flex')
                location_and_email = location_div.find_all('div',
                                                           class_='mr-3')
                location = email = None
                for single in location_and_email:
                    # An element without an href is taken as the location,
                    # one with an href as the e-mail.
                    if single.get('href') is None:
                        location = single.text.strip(' \n')
                    else:
                        email = single.text

                rdict.update({
                    "locations": location,
                    "emails": email,
                })

        if self.type == "Wikis":
            title_tag = single_result.find('a', class_=None)

            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                # Plural key, as used by every other layout in this method.
                rdict["titles"] = title_tag.get('title')

            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.get('href')
                rdict["links"] = self.base_url + ref_link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                # NOTE(review): the other layouts use class "mb-1"; confirm
                # that "mb1" is really the class used on wiki results.
                desc = single_result.find('p', class_="mb1").text
                rdict["descriptions"] = desc

            if return_type == ReturnType.FULL:
                last_updated = single_result.find('relative-time').get('title')
                repository = single_result.find('a', class_='muted-link').text
                rdict.update({
                    "repositories": repository,
                    "last_updated": last_updated,
                })

        if self.type == "Topics":
            title_div = single_result.find('div', class_='f4')
            title_tag = title_div.find('a', class_=None)
            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                rdict["titles"] = title_tag.text
            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.get('href')
                rdict["links"] = self.base_url + ref_link
            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc = None
                desc_tag = single_result.find('p', class_=None)
                if desc_tag:
                    desc = desc_tag.text
                rdict["descriptions"] = desc

        if self.type == "Marketplace":
            title_tag = single_result.find('a', class_='no-underline')
            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                rdict["titles"] = title_tag.text
            if return_type in (ReturnType.FULL, ReturnType.LINK):
                # Unlike the other layouts, the href is stored without the base URL.
                rdict["links"] = title_tag.get('href')

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc = None
                # NOTE(review): this looks for an element *named* "text-gray";
                # a class lookup may have been intended - confirm.
                desc_tag = single_result.find('text-gray')
                if desc_tag:
                    desc = desc_tag.text
                rdict["descriptions"] = desc

            if return_type == ReturnType.FULL:
                # NOTE(review): str(tag) is the tag's markup, not only its
                # text - confirm that this is what consumers expect.
                categories = [
                    str(tag).strip('\n ')
                    for tag in single_result.find_all('a', class_='Label')
                ]
                # Assigned inside the FULL guard: "categories" only exists here.
                rdict["categories"] = categories

        if self.type == "RegistryPackages":
            title_tag = single_result.find('a', class_='h4')
            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                rdict["titles"] = title_tag.text

            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.get('href')
                rdict["links"] = self.base_url + ref_link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc = single_result.find('p', class_='mb-1').text.strip('\n ')
                rdict["descriptions"] = desc

        if self.type == "Issues":
            title_tag = single_result.find('a', class_=None)
            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                rdict["titles"] = title_tag.text

            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.get('href')
                rdict["links"] = self.base_url + ref_link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                desc = single_result.find('p', class_='mb-0').text
                rdict["descriptions"] = desc

            if return_type == ReturnType.FULL:
                repository = single_result.find('div', class_='ml-1').find(
                    'a', 'text-bold').text
                opened_by = self.base_url + \
                    single_result.find(
                        'div', class_='mr-3').find('a').get('href')
                opened_on = single_result.find('relative-time').get("title")
                rdict.update({
                    "opened_by": opened_by,
                    "opened_on": opened_on,
                    # Same spelling as the "repositories" key of the Wikis layout.
                    "repositories": repository,
                })

        if self.type == "Commits":
            title_p = single_result.find('div', class_="f4")
            title_tag = title_p.find('a')

            if return_type in (ReturnType.FULL, ReturnType.TITLE):
                rdict["titles"] = title_tag.text

            if return_type in (ReturnType.FULL, ReturnType.LINK):
                ref_link = title_tag.get('href')
                # Absolute hrefs are kept as they are; relative ones get the base URL.
                if ref_link.startswith("http"):
                    link = ref_link
                else:
                    link = self.base_url + ref_link
                rdict["links"] = link

            if return_type in (ReturnType.FULL, ReturnType.DESCRIPTION):
                opened_on = None
                author = None
                time_tag = single_result.find('relative-time')
                if time_tag:
                    opened_on = time_tag.get("title")
                desc = None
                author_tag = single_result.find('a', class_='commit-author')
                if author_tag:
                    author = author_tag.text
                    div = single_result.find('div', class_='d-flex')
                    repo = div.find('a').text
                    desc = "Committed to {}".format(repo)
                rdict["descriptions"] = desc
                if return_type == ReturnType.FULL:
                    rdict.update({
                        "authors": author,
                        "opened_on": opened_on,
                    })
        return rdict

Beispiel #18

0

Datei anzeigen

Datei: coursera.py Projekt: sashka3076/search-engine-parser

    def parse_single_result(self,
                            single_result,
                            return_type=ReturnType.FULL,
                            **kwargs):
        """
        Parse one Coursera search result into a ``SearchItem``.

        :param single_result: element holding a single search result; it is
            queried with ``find(name, class_=...)`` (presumably a
            `bs4.element.Tag` -- confirm against the caller)
        :param return_type: selects the fields to extract. ``LINK`` and
            ``TITLE`` fill ``links`` / ``titles``; ``FULL`` fills both and adds
            ``partners``, ``ratings_avg``, ``ratings_count``,
            ``enrolments_numbers`` and ``difficulties``
        :param kwargs: accepted but not used by this method
        :return: the extracted fields
        :rtype: SearchItem
        """
        rdict = SearchItem()

        # Members are looked up on the ReturnType class rather than through the
        # ``return_type`` argument: member-through-member access is discouraged
        # by the enum documentation and fails outright if the caller passes
        # something that is not a ReturnType member.
        if return_type in (ReturnType.FULL, ReturnType.LINK):
            link = single_result.find(
                'a', class_='rc-DesktopSearchCard anchor-wrapper').get('href')

            rdict["links"] = urljoin('https://www.coursera.org', link)

        if return_type in (ReturnType.FULL, ReturnType.TITLE):
            title = single_result.find('h2', class_="card-title").text
            rdict["titles"] = title

        if return_type == ReturnType.FULL:
            partner_elem = single_result.find('span', class_='partner-name')
            partner = partner_elem.text if partner_elem else ''

            rating_avg = None
            rating_avg_elem = single_result.find('span', class_='ratings-text')
            if rating_avg_elem:
                try:
                    rating_avg = float(rating_avg_elem.text)
                except ValueError:
                    # Non-numeric rating text: leave the field unset, like the
                    # other optional fields, instead of failing the result.
                    rating_avg = None

            enrolment_number = None
            enrollment_elem = single_result.find('span',
                                                 class_='enrollment-number')
            if enrollment_elem:
                enr_cl_txt = enrollment_elem.text.strip().lower().replace(
                    ',', '')
                multiplier = {'k': 10 ** 3, 'm': 10 ** 6}.get(enr_cl_txt[-1:])
                if multiplier is None:
                    # Plain count: separators carry no value, drop them.
                    digits = enr_cl_txt.replace('.', '')
                    if digits.isdigit():
                        enrolment_number = int(digits)
                else:
                    # Abbreviated count such as "1.5m": the dot is a decimal
                    # point here, so scale the number instead of deleting the
                    # dot and appending zeros (which gave 15000000).
                    try:
                        enrolment_number = int(
                            round(float(enr_cl_txt[:-1]) * multiplier))
                    except (ValueError, OverflowError):
                        enrolment_number = None

            difficulty_elem = single_result.find('span', class_='difficulty')
            difficulty = difficulty_elem.text if difficulty_elem else ''

            rating_count = None
            rating_count_elem = single_result.find('span',
                                                   class_='ratings-count')
            if rating_count_elem:
                # The count is read from a nested <span>; skip it when absent
                # rather than dereferencing None.
                count_span = rating_count_elem.find('span')
                if count_span:
                    rating_count_cl = count_span.text.replace(',', '')
                    if rating_count_cl.isdigit():
                        rating_count = int(rating_count_cl)

            rdict.update({
                "partners": partner,
                "ratings_avg": rating_avg,
                "ratings_count": rating_count,
                "enrolments_numbers": enrolment_number,
                "difficulties": difficulty,
            })
        return rdict

Beispiel #19

0

Datei anzeigen

    def parse_single_result(self, single_result, return_type=ReturnType.FULL):
        """
        Parse one GitHub search result into a ``SearchItem``.

        The markup that is read depends on ``self.type`` (``None`` /
        "Repositories", "Users", "Wikis", "Topics", "Marketplace",
        "Packages", "Issues" or "Commits"); any other value yields an empty
        item.

        :param single_result: element holding a single search result; it is
            queried with ``find`` / ``find_all`` (presumably a
            `bs4.element.Tag` -- confirm against the caller)
        :param return_type: selects the fields to extract. ``TITLE``, ``LINK``
            and ``DESCRIPTION`` fill ``titles``, ``links`` and
            ``descriptions``; ``FULL`` fills all three plus the type-specific
            extras (e.g. ``stars``, ``languages``, ``locations``,
            ``repositories``, ``categories``, ``opened_by``, ``authors``)
        :return: the extracted fields
        :rtype: SearchItem
        """
        rdict = SearchItem()
        want_title = return_type in (ReturnType.FULL, ReturnType.TITLE)
        want_link = return_type in (ReturnType.FULL, ReturnType.LINK)
        want_desc = return_type in (ReturnType.FULL, ReturnType.DESCRIPTION)
        want_full = return_type == ReturnType.FULL

        if self.type in (None, "Repositories"):
            h3 = single_result.find('h3')  # pylint: disable=invalid-name
            link_tag = h3.find('a')
            if want_title:
                rdict["titles"] = link_tag.text

            if want_link:
                rdict["links"] = self.base_url + link_tag.get('href')

            if want_desc:
                # Guarded like the Users/Topics branches: a missing
                # description paragraph yields None instead of an error.
                desc_tag = single_result.find('p', class_="col-12")
                rdict["descriptions"] = desc_tag.text if desc_tag else None

            if want_full:
                stars_and_lang_div = single_result.find(
                    'div', class_='flex-shrink-0')
                lang_tag = None
                stars_tag = None
                if stars_and_lang_div:
                    lang_tag = stars_and_lang_div.find(
                        'span', itemprop="programmingLanguage")
                    stars_tag = stars_and_lang_div.find(
                        'a', class_='muted-link')
                rdict.update({
                    "stars": stars_tag.text.strip() if stars_tag else None,
                    "languages": lang_tag.text if lang_tag else None,
                })

        if self.type == "Users":
            title_tag = single_result.find('a', class_=None)
            if want_title:
                rdict["titles"] = title_tag.text

            if want_link:
                rdict["links"] = self.base_url + title_tag.get('href')

            if want_desc:
                desc_tag = single_result.find('p', class_='f5 mt-2')
                desc = None
                if desc_tag:
                    desc = desc_tag.text.strip(' \n')
                rdict["descriptions"] = desc

            if want_full:
                location_tag = single_result.find('li', class_='mt-1')
                location = None
                if location_tag:
                    location = location_tag.text.strip(' \n')
                rdict.update({
                    "locations": location,
                })

        if self.type == "Wikis":
            title_tag = single_result.find('a', class_=None)

            if want_title:
                # Stored under "titles" like every other branch (the previous
                # "title" key was a one-off spelling).
                rdict["titles"] = title_tag.get('title')

            if want_link:
                rdict["links"] = self.base_url + title_tag.get('href')

            if want_desc:
                rdict["descriptions"] = single_result.find(
                    'p', class_=None).text

            if want_full:
                last_updated = single_result.find(
                    'div', class_='updated-at').find('relative-time').text
                repository = single_result.find('a', class_='h5').text
                rdict.update({
                    "repositories": repository,
                    "last_updated": last_updated,
                })

        if self.type == "Topics":
            title_div = single_result.find('h3')
            title_tag = title_div.find('a', class_=None)
            if want_title:
                rdict["titles"] = title_tag.text
            if want_link:
                rdict["links"] = self.base_url + title_tag.get('href')
            if want_desc:
                desc = None
                desc_tag = single_result.find('p', class_=None)
                if desc_tag:
                    desc = desc_tag.text
                rdict["descriptions"] = desc

        if self.type == "Marketplace":
            title_tag = single_result.find('a', class_=None)
            if want_title:
                rdict["titles"] = title_tag.text
            if want_link:
                # NOTE(review): unlike the other branches the href is stored
                # without self.base_url -- confirm these links are absolute.
                rdict["links"] = title_tag.get('href')

            if want_desc:
                desc = None
                # 'text-gray-light' is a CSS class, not a tag name, so it has
                # to be matched through class_.
                desc_tag = single_result.find(class_='text-gray-light')
                if desc_tag:
                    desc = desc_tag.text
                rdict["descriptions"] = desc

            if want_full:
                # find_all collects every topic link; the assignment stays
                # inside this guard so non-FULL requests no longer hit an
                # unbound ``categories`` name.
                rdict["categories"] = [
                    tag.text.strip('\n ')
                    for tag in single_result.find_all('a', class_='topic-tag')
                ]

        if self.type == "Packages":
            title_tag = single_result.find('a', class_='v-align-middle')
            if want_title:
                rdict["titles"] = title_tag.text

            if want_link:
                rdict["links"] = self.base_url + title_tag.get('href')

            if want_desc:
                desc = single_result.find(
                    'p', class_='col-12').text.strip('\n ')
                rdict["descriptions"] = desc

        if self.type == "Issues":
            title_tag = single_result.find('a', class_=None)
            if want_title:
                rdict["titles"] = title_tag.text

            if want_link:
                rdict["links"] = self.base_url + title_tag.get('href')

            if want_desc:
                rdict["descriptions"] = single_result.find(
                    'p', class_=None).text

            if want_full:
                span = single_result.find('span', class_='flex-auto')
                opened_by = self.base_url + span.find('a').get('href')
                opened_on = span.find('relative-time').text
                rdict.update({
                    "opened_by": opened_by,
                    "opened_on": opened_on,
                })

        if self.type == "Commits":
            title_p = single_result.find('p', class_="commit-title")
            title_tag = title_p.find('a')

            if want_title:
                # The title has always been the link text; the former unused
                # 'aria-label' lookup is gone because it could fail on a
                # missing attribute without contributing to the result.
                rdict["titles"] = title_tag.text

            if want_link:
                ref_link = title_tag.get('href')
                if ref_link.startswith("http"):
                    link = ref_link
                else:
                    link = self.base_url + ref_link
                rdict["links"] = link

            if want_desc:
                opened_on = None
                author = None
                desc = None
                time_tag = single_result.find('relative-time')
                if time_tag:
                    opened_on = time_tag.text
                author_tag = single_result.find('a', class_='commit-author')
                if author_tag:
                    author = author_tag.text
                    div = single_result.find('div', class_='min-width-0')
                    repo = div.find('a', class_=None).text
                    desc = "Committed to {}".format(repo)
                rdict["descriptions"] = desc
                if want_full:
                    rdict.update({
                        "authors": author,
                        "opened_on": opened_on,
                    })
        return rdict