Example #1
    @staticmethod
    def extract_json_values(json_pages):
        # Deserialize a list of parsed JSON page dicts into a set of
        # WikipediaSearchPageResult objects.
        pages = set()
        for json_page in json_pages:
            description = json_page.get("description", None)
            pageid = int(json_page.get("pageid", 0))
            orig_phrase = json_page.get("orig_phrase", None)
            orig_phrase_norm = json_page.get("orig_phrase_norm", None)
            wiki_title = json_page.get("wiki_title", None)
            wiki_title_norm = json_page.get("wiki_title_norm", None)

            # Guard with an empty dict: a page with no "relations" entry would
            # otherwise make every .get() below raise AttributeError on None.
            relations_json = json_page.get("relations", {}) or {}
            rel_is_part_name = relations_json.get("isPartName", None)
            rel_is_disambiguation = relations_json.get("isDisambiguation", None)
            rel_disambiguation = relations_json.get("disambiguationLinks", None)
            rel_disambiguation_norm = relations_json.get("disambiguationLinksNorm", None)
            rel_parenthesis = relations_json.get("titleParenthesis", None)
            rel_parenthesis_norm = relations_json.get("titleParenthesisNorm", None)
            rel_categories = relations_json.get("categories", None)
            rel_categories_norm = relations_json.get("categoriesNorm", None)
            rel_be_comp = relations_json.get("beCompRelations", None)
            rel_be_comp_norm = relations_json.get("beCompRelationsNorm", None)
            rel_aliases = relations_json.get("aliases", None)
            rel_aliases_norm = relations_json.get("aliasesNorm", None)

            # Note the argument order: it follows the
            # WikipediaPageExtractedRelations constructor, not the order in
            # which the fields were read above.
            relations = WikipediaPageExtractedRelations(
                rel_is_part_name,
                rel_is_disambiguation,
                rel_parenthesis,
                rel_disambiguation,
                rel_categories,
                rel_aliases,
                rel_be_comp,
                rel_disambiguation_norm,
                rel_categories_norm,
                rel_aliases_norm,
                rel_parenthesis_norm,
                rel_be_comp_norm,
            )

            # The bare 0 fills the positional field between the titles and the
            # page id (likely a default score in this constructor).
            page = WikipediaPage(
                orig_phrase,
                orig_phrase_norm,
                wiki_title,
                wiki_title_norm,
                0,
                pageid,
                description,
                relations,
            )
            pages.add(WikipediaSearchPageResult(orig_phrase, page))

        return pages
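
A minimal sketch of the input this deserializer expects. The field names below are exactly those read by the .get() calls above; all values are invented for illustration:

    sample_pages = [{
        "pageid": 12345,
        "description": "hypothetical one-line description",
        "orig_phrase": "spider-man",
        "orig_phrase_norm": "spider man",
        "wiki_title": "Spider-Man",
        "wiki_title_norm": "spider man",
        "relations": {
            "isPartName": False,
            "isDisambiguation": False,
            "disambiguationLinks": [],
            "disambiguationLinksNorm": [],
            "titleParenthesis": [],
            "titleParenthesisNorm": [],
            "categories": ["Fictional characters"],
            "categoriesNorm": ["fictional characters"],
            "beCompRelations": [],
            "beCompRelationsNorm": [],
            "aliases": ["Spidey"],
            "aliasesNorm": ["spidey"],
        },
    }]

    results = extract_json_values(sample_pages)  # set of WikipediaSearchPageResult
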
Example #2
    def get_pages(self, phrase):
        if phrase in self.cache:
            return self.cache[phrase]
        try:
            phrase_strip = ' '.join(phrase.replace('-', ' ').split())
            pages = set()
            best_results = self.get_best_elastic_results(phrase_strip)
            for result in best_results:
                _id = result['_id']
                if _id != 0:
                    result_source = result['_source']
                    # Follow the redirect chain until a page that is not
                    # itself a redirect is reached (or the chain dead-ends).
                    if 'redirectTitle' in result_source:
                        redirect_title = result_source['redirectTitle']
                        red_result = None
                        while redirect_title and result_source['title'] != redirect_title:
                            red_result = self.get_redirect_result(
                                redirect_title)
                            if red_result is None or len(red_result) == 0:
                                print('could not find redirect title=' +
                                      redirect_title +
                                      ', does not exist in data')
                                redirect_title = None
                            elif 'redirectTitle' in red_result[0]['_source']:
                                redirect_title = red_result[0]['_source'][
                                    'redirectTitle']
                            else:
                                redirect_title = None

                        if red_result is not None and len(red_result) > 0:
                            result = red_result[0]
                            _id = result['_id']

                    elastic_page_result = self.get_page_from_result_v1(
                        phrase_strip, result, _id)
                    pages.add(
                        WikipediaSearchPageResult(phrase, elastic_page_result))

            self.cache[phrase] = pages
            return pages
        except Exception:
            # Requires "import traceback" at module level. Log the stack
            # trace and return an empty set so callers always receive a set.
            traceback.print_exc()
            return set()
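
get_best_elastic_results and get_redirect_result are helpers this page does not show. A minimal sketch of what they could look like against a title-indexed Wikipedia dump, written for the elasticsearch-py 7.x client; the index name, field names, and result sizes are all assumptions:

    from elasticsearch import Elasticsearch

    class ElasticHelpersSketch:
        def __init__(self, host='http://localhost:9200', index='wikipedia'):
            self.es = Elasticsearch([host])
            self.index = index

        def get_best_elastic_results(self, phrase):
            # Full-text match on the title field; each hit carries the
            # '_id' and '_source' keys the loop above reads.
            body = {'query': {'match': {'title': phrase}}, 'size': 20}
            return self.es.search(index=self.index, body=body)['hits']['hits']

        def get_redirect_result(self, title):
            # Exact-match lookup used while following a redirect chain.
            body = {'query': {'term': {'title.keyword': title}}, 'size': 1}
            return self.es.search(index=self.index, body=body)['hits']['hits']
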
Example #3
    def get_pages(self, phrase):
        if phrase in self.cache:
            return self.cache[phrase]

        ret_pages = set()
        # Try several casing variants of the phrase, since stored page titles
        # may be capitalized: "spider-man" expands to {"spider-man",
        # "spider man", "SPIDER MAN", "Spider Man"}.
        word_clean = phrase.replace('-', ' ')
        word_lower = word_clean.lower()
        word_upper = word_clean.upper()
        word_title = word_clean.title()
        words_set = {phrase, word_clean, word_lower, word_upper, word_title}
        for appr in words_set:
            try:
                page_result = self.get_page_redirect(appr)
                if page_result.pageid != 0:
                    full_page = self.get_wiki_page_with_items(phrase, page_result)
                    ret_pages.add(WikipediaSearchPageResult(appr, full_page))
            except Exception as e:
                # A failed variant lookup should not abort the remaining ones.
                print(e)

        self.cache[phrase] = ret_pages
        return ret_pages
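
get_page_redirect and get_wiki_page_with_items are likewise not shown. Assuming get_page_redirect resolves a title (following redirects) against the live MediaWiki API, a minimal sketch with requests; the endpoint and parameters follow the public Wikipedia query API, and PageStub is a hypothetical stand-in for the page object, of which the loop above only reads .pageid:

    from collections import namedtuple

    import requests

    # Hypothetical stand-in for the returned page object; the caller above
    # only needs .pageid.
    PageStub = namedtuple('PageStub', ['pageid', 'title'])

    WIKI_API = 'https://en.wikipedia.org/w/api.php'

    def get_page_redirect_sketch(title):
        # 'redirects' asks MediaWiki to resolve redirect pages to their target.
        params = {'action': 'query', 'titles': title,
                  'redirects': '', 'format': 'json'}
        data = requests.get(WIKI_API, params=params, timeout=10).json()
        page = next(iter(data['query']['pages'].values()))
        if 'missing' in page:
            return PageStub(0, title)  # caller treats pageid == 0 as not found
        return PageStub(int(page['pageid']), page.get('title', title))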