Esempio n. 1
0
    def run(self, data):
        """Scrape the GoogleScholar page of every faculty member in *data*.

        For each member: look the member up in the Faculty index by name,
        delete any previously stored GoogleScholar documents, then scrape
        the member's GoogleScholar URL and store one Document per title.

        :param data: iterable of faculty objects (each with a ``name``)
        :return: the last faculty member handled, or None if *data* is empty
        :raises WorkflowException: if the name search is ambiguous or empty
        """
        no_text_count = 0
        faculty = None  # guard against NameError at return when data is empty
        for faculty in data:
            faculty_name = faculty.name

            search_results = Faculty.search().query(
                'match', name=faculty_name).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result")
            if not search_results:
                # Without this guard, search_results[0] below raises IndexError.
                raise WorkflowException(
                    "No faculty found during search for " + faculty_name)

            # Drop stale GoogleScholar documents before re-scraping.
            search_dup = Document.search().query(
                'match', faculty_id=faculty.faculty_id).query(
                    "match", source="GoogleScholar")
            search_dup.delete()

            faculty = search_results[0]
            if faculty.google_scholar is not None and "http" in faculty.google_scholar:
                scraper = ScraperFactory.create_scraper(
                    faculty.google_scholar, ScraperType.GOOGLESCHOLAR)
                for scrapp in scraper.get_scrapps():
                    doc = Document()
                    doc.source = "GoogleScholar"
                    doc.faculty_id = faculty.faculty_id
                    doc.text = scrapp.title
                    doc.save()
            else:
                # No usable GoogleScholar URL for this member.
                no_text_count += 1
        print("NO TEXT COUNT = ", no_text_count)
        return faculty
Esempio n. 2
0
    def run(self, data):
        """Scrape the ResearchId page of every faculty member in *data*.

        For each member: look the member up in the Faculty index by name,
        delete any previously stored ResearchId documents, then scrape the
        member's ResearchId page and store the description/keywords plus
        one Document per publication title.

        :param data: iterable of faculty objects (each with a ``name``)
        :return: the last faculty member handled, or None if *data* is empty
        :raises WorkflowException: if the name search is ambiguous or empty
        """
        no_text_count = 0
        faculty = None  # guard against NameError at return when data is empty
        for faculty in data:
            faculty_name = faculty.name

            search_results = Faculty.search().query(
                'match', name=faculty_name).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )
            if not search_results:
                # Without this guard, search_results[0] below raises IndexError.
                raise WorkflowException(
                    "No faculty found during search for " + faculty_name)

            # Drop stale ResearchId documents before re-scraping.
            search_dup = Document.search().query(
                'match',
                faculty_id=faculty.faculty_id).query("match",
                                                     source="ResearchId")
            search_dup.delete()
            faculty = search_results[0]
            if faculty.research_id is not None:

                scraper = ScraperFactory.create_scraper(
                    faculty.research_id, ScraperType.RESEARCHID)
                scrapps = scraper.get_scrapps()

                # First scrapp carries the profile metadata; the rest are titles.
                keywords_and_description = scrapps[0]
                titles = scrapps[1:]

                doc = Document()
                doc.faculty_id = faculty.faculty_id
                doc.source = "ResearchId"
                # Narrow except clauses: only a missing meta_data key is
                # expected here; a bare except would hide real bugs.
                try:
                    doc.text = keywords_and_description.meta_data[
                        "description"]
                except KeyError:
                    print("No description")
                    doc.text = ""
                try:
                    doc.user_keywords = keywords_and_description.meta_data[
                        "keywords"]
                except KeyError:
                    print("No keywords")
                doc.save()

                for scrapp in titles:
                    doc = Document()
                    doc.source = "ResearchId"
                    doc.faculty_id = faculty.faculty_id
                    doc.text = scrapp.title
                    doc.save()

            else:
                # No ResearchId on record for this member.
                no_text_count += 1
        print("NO TEXT COUNT = ", no_text_count)
        return faculty
Esempio n. 3
0
    def run(self, data):
        """Scrape a single faculty member's GoogleScholar page.

        If *data* is a name string, resolve it to a Faculty record first;
        otherwise *data* is used as the Faculty record directly. Stale
        GoogleScholar documents are deleted before one Document is saved
        per scraped title.

        :param data: faculty name (str) or Faculty instance
        :return: the faculty member handled
        :raises WorkflowException: if the name search is ambiguous or empty
        """
        faculty = data
        if isinstance(faculty, str):
            # BUG FIX: the original queried `name=faculty_name`, but
            # faculty_name was only assigned AFTER this branch, so any
            # str input raised NameError. Query on the str itself.
            search_results = Faculty.search().query(
                'match', name=faculty).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )
            if not search_results:
                # Without this guard, search_results[0] raises IndexError.
                raise WorkflowException(
                    "No faculty found during search for " + faculty)
            faculty = search_results[0]
        faculty_name = faculty.name

        # Drop stale GoogleScholar documents before re-scraping.
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="GoogleScholar") \
            .delete()

        if faculty.google_scholar is not None and "http" in faculty.google_scholar:
            scraper = ScraperFactory.create_scraper(faculty.google_scholar,
                                                    ScraperType.GOOGLESCHOLAR)
            scrapps = scraper.get_scrapps()
            for scrapp in scrapps:
                doc = Document()
                doc.source = "GoogleScholar"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()

        return faculty
Esempio n. 4
0
    def run(self, data):
        """Scrape a single faculty member's ResearchId page.

        If *data* is a name string, resolve it to a Faculty record first;
        otherwise *data* is used as the Faculty record directly. Stale
        ResearchId documents are deleted, then the description/keywords
        and one Document per publication title are stored.

        :param data: faculty name (str) or Faculty instance
        :return: the faculty member handled
        :raises WorkflowException: if the name search is ambiguous or empty
        """
        faculty = data
        if isinstance(faculty, str):
            search_results = Faculty.search().query('match',
                                                    name=faculty).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )
            if not search_results:
                # Without this guard, search_results[0] raises IndexError.
                raise WorkflowException(
                    "No faculty found during search for " + faculty)
            faculty = search_results[0]

        faculty_name = faculty.name

        # Drop stale ResearchId documents before re-scraping.
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="ResearchId") \
            .delete()

        print("Running researchid scrape on {}. Research id {}.".format(
            faculty_name, faculty.research_id))

        if faculty.research_id is not None:

            scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                    ScraperType.RESEARCHID)
            scrapps = scraper.get_scrapps()

            # First scrapp carries the profile metadata; the rest are titles.
            keywords_and_description = scrapps[0]
            titles = scrapps[1:]

            doc = Document()
            doc.faculty_id = faculty.faculty_id
            doc.source = "ResearchId"
            # Narrow except clauses: only a missing meta_data key is
            # expected here; a bare except would hide real bugs.
            try:
                doc.text = keywords_and_description.meta_data["description"]
            except KeyError:
                print("No description")
                doc.text = ""
            try:
                doc.user_keywords = keywords_and_description.meta_data[
                    "keywords"]
            except KeyError:
                print("No keywords")
            doc.save()

            for scrapp in titles:
                doc = Document()
                doc.source = "ResearchId"
                doc.faculty_id = faculty.faculty_id
                doc.text = scrapp.title
                doc.save()

        return faculty
Esempio n. 5
0
    def run(self, data):
        """Scrape one faculty member's directory page.

        :param data: str or Faculty instance.
        :return: tuple of the faculty name and Scrapp produced by scraping
            the faculty directory page.
        """
        print("Running {} on {}".format(self.task_name, data))

        # Accept either a raw name string or a Faculty-like object.
        name = data if isinstance(data, str) else data.name

        profile_url = URLs.build_faculty_url(name)
        profile_scraper = ScraperFactory.create_scraper(profile_url,
                                                        ScraperType.PROFILE)
        first_scrapp = profile_scraper.get_scrapps()[0]

        return (data, first_scrapp)
Esempio n. 6
0
    def run(self, data):
        """Scrape the directory page of every faculty member in *data*.

        :param data: iterable of str or Faculty instances.
        :return: list of (faculty_name, Scrapp) tuples, one per member
            (the original docstring incorrectly claimed a single tuple).
        """
        results = []
        for faculty in data:
            # Accept either a raw name string or a Faculty-like object.
            if isinstance(faculty, str):
                faculty_name = faculty
            else:
                faculty_name = faculty.name

            faculty_directory_url = URLs.build_faculty_url(faculty_name)

            scraper = ScraperFactory.create_scraper(faculty_directory_url,
                                                    ScraperType.PROFILE)
            scrapp = scraper.get_scrapps()[0]

            # Renamed from `tuple`, which shadowed the builtin.
            pair = (faculty_name, scrapp)
            results.append(pair)
            print(pair)

        return results