Ejemplo n.º 1
0
    def setup_class(cls):
        """Wipe the search indices and seed two faculty fixtures with keywords."""
        # Start from a clean slate so earlier runs cannot leak documents in.
        Keywords.search().delete()
        Faculty.search().delete()
        sleep(3)  # let elasticsearch apply the deletions

        # (faculty_id, name, full_name, keywords) for each fixture member.
        fixtures = [
            (379, "William.Allison", "Allison, William.",
             ["zebrafish", "evolutionary processes", "marine"]),
            (356, "Vincent.Bouchard", "Bouchard, Vincent",
             ["string theory", "number theory", "mathematics"]),
        ]
        for fid, name, full_name, words in fixtures:
            member = Faculty(meta={"id": fid},
                             name=name,
                             full_name=full_name,
                             faculty_id=fid,
                             email="*****@*****.**")
            member.save()
            Keywords(faculty_id=fid,
                     datasource="test",
                     approach_id=0,
                     keywords=words).save()

        sleep(3)  # let elasticsearch index the new documents
Ejemplo n.º 2
0
    def get(self):
        """HTTP Get that enables boolean query processing and search.

        Reads the ``query`` URL parameter, runs it against the keywords
        index, and returns the matching faculty members.

        :returns: ``{"data": [...]}`` with serialized faculty whose keywords
            match the boolean query.
        :raises: aborts with 400 when the query is missing or malformed.
        """
        query = request.args.get('query')

        if query is None:
            abort(400)

        q_parser = parser.QueryParser()
        q_builder = builder.QueryBuilder()

        # Turn a malformed boolean query into a 400 instead of letting the
        # parser error surface as a 500 (consistent with the other search GET
        # handlers in this file).
        try:
            pf_query = q_parser.parse_query(query)
        except parser.QueryException:
            abort(400)

        elastic_query = q_builder.build(pf_query)

        response = Keywords.search().query(elastic_query).execute()
        # Deduplicate: one faculty member may own several keyword documents.
        faculty_with_keywords = {keywords.faculty_id for keywords in response}
        schema = FacultySchema()
        results = [
            schema.dump(Faculty.safe_get(faculty_id))
            for faculty_id in faculty_with_keywords
        ]

        return {"data": results}
Ejemplo n.º 3
0
    def test_simple_search(self):
        """Test the results of queries on actual data."""
        # (query string, expected number of keyword-document hits)
        cases = [
            ("zebrafish", 1),
            ("zebrafish OR mathematics", 2),
            ("zebrafish AND mathematics", 0),
        ]
        for search_text, expected_hits in cases:
            elastic_query = self.build_query(search_text)
            hits = Keywords.search().query(elastic_query).execute()
            assert len(hits) == expected_hits
Ejemplo n.º 4
0
    def add_name_search_results(faculty_with_keywords, pf_query):
        """Insert results of pf_query on the faculty index into faculty_with_keywords.

        Runs the same postfix query against faculty full names; any faculty
        member matched by name who is not already present is added to the
        dictionary together with their entire keyword set.

        :param faculty_with_keywords: Dictionary of faculty id's to keywords.
        :param pf_query: Postfix query created by the Query Builder.
        :returns: faculty_with_keywords also containing faculty whose names
            match the query.
        """
        # Re-run the query, this time against the "full_name" field.
        name_query = builder.QueryBuilder().build(pf_query,
                                                  search_field="full_name")
        name_matches = Faculty.search().query(name_query).execute()

        for member in name_matches:
            # Skip faculty already found through the keyword search.
            if member.faculty_id in faculty_with_keywords:
                continue

            member_keywords = Keywords.search() \
                .query('match', faculty_id=member.faculty_id).execute()
            faculty_with_keywords[member.faculty_id] = member_keywords
Ejemplo n.º 5
0
    def get(self):
        """HTTP Get that enables boolean query processing and search.

        Reads the ``query`` URL parameter and an optional comma-separated
        ``department`` parameter used to filter the results.

        :returns: ``{"data": [...]}`` of matching faculty.
        :raises: aborts with 400 when the query is missing or malformed.
        """
        query = request.args.get('query')
        dept = request.args.get('department')

        if query is None:
            abort(400)

        # Normalize the department list into a set for cheap membership tests.
        # Narrowed from a bare "except:" so system exceptions propagate;
        # any failure parsing the user-supplied value is still a 400.
        try:
            if dept is not None:
                dept_filter = {part.strip() for part in dept.split(',')}
            else:
                dept_filter = set()
        except Exception:
            abort(400)

        q_parser = parser.QueryParser()
        q_builder = builder.QueryBuilder()

        try:
            pf_query = q_parser.parse_query(query)
        except parser.QueryException:
            abort(400)

        keywords_elastic_query = q_builder.build(pf_query)
        response = Keywords.search().query(keywords_elastic_query).execute()
        faculty_with_keywords = SearchAPI.get_faculty_with_keywords(response)

        # Also pull in faculty whose names (rather than keywords) match.
        SearchAPI.add_name_search_results(faculty_with_keywords, pf_query)

        return {
            "data": SearchAPI.create_results(faculty_with_keywords, dept_filter)
        }
Ejemplo n.º 6
0
    def get(self):
        """HTTP Get that enables boolean query processing and batch."""
        hits = Keywords.search().query().execute()
        serializer = KeywordSchema()
        return {"data": [serializer.dump(hit) for hit in hits]}
Ejemplo n.º 7
0
    def get(self):
        """HTTP Get for the keyword list resource.

        Returns a list of faculty members from elasticsearch.
        :param page: URL Parameter for the page to fetch. Default - 0.
        :param results: URL Parameter for the number of results to return per page. Default - 20.
        :param id: URL Parameter to filter the results based on a faculty id.
        :param source: URL Parameter to filter the results based on the keyword source.
        :param approach: URL Parameter to filter results based on the approach_id.
        :return:
        """
        # "id" stays the public URL parameter name; the local is renamed so
        # it no longer shadows the id() builtin.
        faculty_id = request.args.get("id", type=int)
        source = request.args.get("source", type=str)
        approach = request.args.get("approach", type=int)

        search = Keywords.search()
        search = apply_filters(search,
                               faculty_id=faculty_id,
                               datasource=source,
                               approach_id=approach)

        query, pagination_info = paginate_query(request, search)
        response = query.execute()

        schema = KeywordSchema()
        results = [schema.dump(keyword) for keyword in response]

        return {"pagination": pagination_info, "data": results}
Ejemplo n.º 8
0
    def run(self, data):
        """Updates a Keyword object information in Elasticsearch, based on the generator results.

        For each incoming keyword object, finds the existing document with
        the same (faculty_id, datasource, approach_id) triple — or creates a
        fresh one — and overwrites its keyword list.

        :param data: list of keyword objects
        :return: returns True.
        """
        for key_object in data:
            # PEP 8 cleanup of the original chain: no space before commas,
            # no spaces around "=" in keyword arguments.
            key_search = Keywords.search() \
                .query('match', faculty_id=key_object.faculty_id) \
                .query('match', datasource=key_object.datasource) \
                .query('match', approach_id=key_object.approach_id) \
                .execute()

            # EAFP: an empty result means no document exists yet for this
            # triple, so build a new one carrying the identifying fields.
            try:
                keywords = key_search[0]
            except IndexError:
                keywords = Keywords()
                keywords.faculty_id = key_object.faculty_id
                keywords.datasource = key_object.datasource
                keywords.approach_id = key_object.approach_id

            keywords.keywords = key_object.keywords
            keywords.save()
        return True
Ejemplo n.º 9
0
    def get(self):
        """HTTP Get returning search results filtered by approach as a file download.

        Reads the ``query`` and ``approach`` URL parameters, keeps only the
        keyword documents matching the requested approach, and streams the
        serialized results as ``batch_results.txt``.

        :raises: aborts with 400 when either parameter is missing or
            ``approach`` is not an integer.
        """
        query = request.args.get('query')
        approach = request.args.get('approach')

        # BUG FIX: the original called int(approach) before the None check,
        # so a missing parameter raised TypeError (HTTP 500) instead of the
        # intended 400; a non-numeric value raised an uncaught ValueError.
        if query is None or approach is None:
            abort(400)
        try:
            approach = int(approach)
        except ValueError:
            abort(400)

        q_parser = parser.QueryParser()
        q_builder = builder.QueryBuilder()

        try:
            pf_query = q_parser.parse_query(query)
        except parser.QueryException:
            abort(400)

        keywords_elastic_query = q_builder.build(pf_query)
        response = Keywords.search().query(keywords_elastic_query).execute()
        faculty_with_keywords = SearchAPI.get_faculty_with_keywords(response)

        # Keep only keyword documents for the requested approach; drop any
        # faculty member left with no matching keywords at all.
        filtered = {}
        for faculty_id, keyword_docs in faculty_with_keywords.items():
            matching = [k for k in keyword_docs if k.approach_id == approach]
            if matching:
                filtered[faculty_id] = matching
        faculty_with_keywords = filtered

        results = SearchAPI.create_results(faculty_with_keywords,
                                           dept_filter=[])
        str_io = BytesIO()
        str_io.write(json.dumps(results, indent=4).encode())
        str_io.seek(0)

        return send_file(str_io,
                         as_attachment=True,
                         attachment_filename="batch_results.txt")
Ejemplo n.º 10
0
        """Updates a Keyword object information in Elasticsearch, based on the generator results.

        :param data: list of keyword objects
        :return:  returns True.
        """

        for key_object in data:
            key_search = Keywords.search().query('match', faculty_id=key_object.faculty_id) \
                .query('match' , datasource = key_object.datasource) \
                .query('match', approach_id = key_object.approach_id) \
                .execute()

            try:
                keywords = key_search[0]
            except IndexError:
                keywords = Keywords()
                keywords.faculty_id = key_object.faculty_id
                keywords.datasource = key_object.datasource
                keywords.approach_id = key_object.approach_id

            keywords.keywords = key_object.keywords
            keywords.save()
        return True


if __name__ == "__main__":
    # Manual bootstrap: open the default elasticsearch connection and create
    # the Faculty/Keywords index mappings if they do not already exist.
    from elasticsearch_dsl import connections
    connections.create_connection()
    Faculty.init()
    Keywords.init()
Ejemplo n.º 11
0
    def run(self, data):
        """Performs a scraping of a faculty member's ResearchId page.

        :param data: a Faculty object, or a faculty name string to look up.
        :return: last faculty member handled
        """
        faculty = data
        if isinstance(faculty, str):
            search_results = Faculty.search().query('match',
                                                    name=faculty).execute()
            if len(search_results) > 1:
                # Shouldn't happen, but could.
                raise WorkflowException(
                    "Professor id is ambiguous during search ... More than 1 result"
                )
            # BUG FIX: an empty result previously fell through to
            # search_results[0] and raised a raw IndexError.
            if len(search_results) == 0:
                raise WorkflowException(
                    "No faculty member found during search ... 0 results")
            faculty = search_results[0]

        faculty_name = faculty.name

        # Remove stale documents from any previous ResearchId scrape.
        Document.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", source="ResearchId") \
            .delete()

        Keywords.search().query('match', faculty_id=faculty.faculty_id) \
            .query("match", approach_id="4") \
            .delete()

        print("Running researchid scrape on {}. Research id {}.".format(
            faculty_name, faculty.research_id))

        if faculty.research_id is not None:

            scraper = ScraperFactory.create_scraper(faculty.research_id,
                                                    ScraperType.RESEARCHID)
            try:
                scrapps = scraper.get_scrapps()
            except ScraperException:
                # Best effort: a failed scrape leaves the old data deleted
                # but still returns the faculty member.
                return faculty

            # First scrapp carries the profile keywords/description; the
            # rest are publication titles/abstracts.
            keywords_and_description = scrapps[0]
            titles = scrapps[1:]

            doc = Document()
            doc.faculty_id = faculty.faculty_id
            doc.source = "ResearchId"

            keywords = Keywords()
            keywords.faculty_id = faculty.faculty_id
            keywords.datasource = "user_keywords"
            keywords.approach_id = "4"

            # Narrowed from bare "except:" — only a missing meta_data key is
            # an expected, recoverable condition here.
            try:
                doc.text = keywords_and_description.meta_data["description"]
            except KeyError:
                print("No description")
                doc.text = ""
            try:
                doc.user_keywords = keywords_and_description.meta_data[
                    "keywords"]
                keywords.keywords = keywords_and_description.meta_data[
                    "keywords"]
            except KeyError:
                print("No keywords")
            doc.date = datetime.now()
            doc.save()
            keywords.save()

            for scrapp in titles:
                doc = Document()
                if scrapp.data_source == ScraperType.RESEARCHID:
                    doc.source = "ResearchId"
                else:
                    doc.source = "ResearchIdAbstract"
                doc.faculty_id = faculty.faculty_id
                if scrapp.data_source == ScraperType.RESEARCHID:
                    doc.text = scrapp.title
                else:
                    doc.text = scrapp.meta_data["text"]

                doc.date = datetime.now()
                doc.save()

        return faculty
Ejemplo n.º 12
0
    def teardown_class(cls):
        """Remove the fixture faculty members and their keyword documents."""
        fixture_ids = (379, 356)

        for fixture_id in fixture_ids:
            Faculty.get(id=fixture_id).delete()

        for fixture_id in fixture_ids:
            Keywords.search().query('match', faculty_id=fixture_id).delete()