Example #1
This example defines a WikiOnline class that uses pywikibot to fetch Wikipedia pages and their Wikidata items for a phrase, extracting aliases, English descriptions, and copular ("be") relations from the page text.
import logging
import re
import sys

# Note: SpacyInstance, WikipediaPage, WikipediaPageExtractedRelations,
# WikipediaSearchPageResult, DISAMBIGUATE_PAGE and NAME_DESCRIPTIONS are
# assumed to come from the surrounding nlp_architect package; they are not
# defined in this snippet.

logger = logging.getLogger(__name__)


class WikiOnline(object):
    def __init__(self):
        try:
            import pywikibot
        except (AttributeError, ImportError):
            logger.error(
                "pywikibot is not installed, please install nlp_architect with [all] package. "
                + "for example: pip install nlp_architect[all]")
            sys.exit()
        self.spacy = SpacyInstance()
        self.pywikibot = pywikibot
        self.cache = dict()
        self.site = pywikibot.Site(
            "en", "wikipedia")  # The site we want to run our bot on

    def get_pages(self, phrase):
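        # Try several casings of the phrase (original, cleaned, lower,
        # upper, title) and collect every matching Wikipedia page.
        # Results are cached per phrase.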
        if phrase in self.cache:
            return self.cache[phrase]

        ret_pages = set()
        word_clean = phrase.replace("-", " ")
        word_lower = word_clean.lower()
        word_upper = word_clean.upper()
        word_title = word_clean.title()
        words_set = {phrase, word_clean, word_lower, word_upper, word_title}
        for appr in words_set:
            try:
                page_result = self.get_page_redirect(appr)
                if page_result.pageid != 0:
                    full_page = self.get_wiki_page_with_items(
                        phrase, page_result)
                    ret_pages.add(WikipediaSearchPageResult(appr, full_page))
            except Exception as e:
                logger.error(e)

        self.cache[phrase] = ret_pages
        return ret_pages

    # pylint: disable=protected-access
    def get_wiki_page_with_items(self, phrase, page):
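        # Build a WikipediaPage record from a raw pywikibot page: resolve
        # its Wikidata item, then extract aliases, description and relations.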
        item = self.get_wiki_page_item(page)
        pageid = page.pageid
        aliases = self.get_aliases(item)
        description = self.get_description(item)
        text = page.text
        page_title = page._link._title

        relations = WikipediaPageExtractedRelations()
        relations.is_disambiguation = self.is_disambiguation_page(item)
        relations.is_part_name = self.is_name_description(
            text, item, relations.is_disambiguation)
        relations.aliases = aliases
        relations.be_comp, relations.be_comp_norm = self.extract_be_comp(text)
        relations.extract_relations_from_text_v0(text)

        ret_page = WikipediaPage(phrase, None, page_title, None, 0, pageid,
                                 description, relations)

        logger.debug("Page: {}. Extracted successfully".format(ret_page))

        return ret_page

    def get_wiki_page_item(self, page):
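        # Resolve the Wikidata item behind a Wikipedia page; returns None
        # when the page has no item or pywikibot cannot fetch it.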
        if page is not None:
            try:
                item = self.pywikibot.ItemPage.fromPage(
                    page)  # works for any page object
                item.get()  # must be called before item data can be accessed
                return item
            except (self.pywikibot.NoPage, AttributeError, TypeError,
                    NameError):
                pass
        return None

    def get_page_redirect(self, word):
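        # Fetch the page for a word, following a redirect to its target
        # when one exists.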
        page = self.pywikibot.Page(self.site, word)
        if page.pageid != 0 and page.isRedirectPage():
            return page.getRedirectTarget()
        return page

    @staticmethod
    def get_aliases(item):
        if item is not None and item.aliases is not None:
            if "en" in item.aliases:
                aliases = item.aliases["en"]
                return aliases

        return None

    @staticmethod
    def get_description(item):
        description = {}
        if item is not None:
            item_desc = item.get()
            if "desctiptions" in item_desc and "en" in item_desc[
                    "descriptions"]:
                dict([("age", 25)])
                description["descriptions"] = dict([
                    ("en", item_desc["descriptions"]["en"])
                ])

        return description

    @staticmethod
    def is_disambiguation_page(item):
        if item is not None:
            dic = item.get()
            if dic is not None and "descriptions" in dic:
                desc = dic["descriptions"]
                if desc is not None and "en" in desc:
                    return desc["en"].lower() in DISAMBIGUATE_PAGE

        return False

    @staticmethod
    def is_name_description(text, item, is_disambiguation):
        if item is not None:
            if is_disambiguation:
                if WikipediaPageExtractedRelations.is_name_part(text):
                    return True
            else:
                dic = item.get()
                if dic is not None and "descriptions" in dic:
                    desc = dic["descriptions"]
                    if desc is not None and "en" in desc:
                        if any(s in desc["en"].lower()
                               for s in NAME_DESCRIPTIONS):
                            return True
        return False

    # pylint: disable=no-else-return
    def extract_be_comp(self, text):
        # Locate the first bolded ('''...''') span; str.find returns -1
        # when no bold marker exists (str.index would raise ValueError),
        # in which case there is no first sentence to parse.
        first_sentence_start_index = text.find("'''")
        if first_sentence_start_index == -1:
            return None, None
        last_temp_index = text.find("\n", first_sentence_start_index)
        if last_temp_index == -1:
            last_temp_index = len(text)

        first_paragraph = text[first_sentence_start_index:last_temp_index]
        # extract_be_a_index returns None when no copular phrase is found,
        # so test against None before recursing into the rest of the text.
        if WikiOnline.extract_be_a_index(
                first_paragraph) is None and last_temp_index != len(text):
            return self.extract_be_comp(text[last_temp_index:])
        elif last_temp_index == len(text):
            return None, None

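        # Strip wiki markup: parentheticals, HTML tags, templates,
        # [[links]], quote marks and non-breaking spaces.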
        first_paragraph_clean = re.sub(r"\([^)]*\)", "", first_paragraph)
        first_paragraph_clean = re.sub(r"<[^>]*>", "", first_paragraph_clean)
        first_paragraph_clean = re.sub(r"{[^}]*}", "", first_paragraph_clean)
        first_paragraph_clean = re.sub(r"\[\[[^]]*\]\]", "",
                                       first_paragraph_clean)
        first_paragraph_clean = re.sub(r"[\']", "", first_paragraph_clean)
        first_paragraph_clean = re.sub(r"&nbsp;", " ", first_paragraph_clean)

        return self.extract_be_comp_relations(first_paragraph_clean)

    # pylint: disable=not-callable
    def extract_be_comp_relations(self, first_paragraph):
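        # Walk the dependency parse of the paragraph and collect copular
        # ("be") complements and their subjects, in surface and lemma form,
        # stopping at a clausal modifier or at the sentence-final period.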
        be_comp = set()
        be_comp_norm = set()
        if first_paragraph:
            doc = self.spacy.parser(first_paragraph)
            for token in doc:
                target = token.text
                target_lemma = token.lemma_
                relation = token.dep_
                governor = token.head.text
                governor_lemma = token.head.lemma_
                if relation == "acl":
                    break
                if relation == "punct" and target == ".":
                    break
                elif relation == "cop":
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == "nsubj":
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)
                elif relation == "dep":
                    be_comp.add(governor)
                    be_comp_norm.add(governor_lemma)
                elif relation == "compound":
                    be_comp.add(target + " " + governor)
                    be_comp_norm.add(target_lemma + " " + governor_lemma)
                elif relation == "amod":
                    be_comp.add(target + " " + governor)
                    be_comp_norm.add(target_lemma + " " + governor_lemma)
                elif relation in ["conj", "appos"]:
                    be_comp.add(target)
                    be_comp_norm.add(target_lemma)

        return be_comp, be_comp_norm

    @staticmethod
    def extract_be_a_index(sentence):
        # Return the index of the first copular phrase present in the
        # sentence, checked in priority order, or None if none matches.
        be_phrases = ["is a", "are a", "was a", "were a", "be a",
                      "is the", "are the", "was the", "were the", "be the"]
        for phrase in be_phrases:
            if phrase in sentence:
                return sentence.index(phrase)
        return None
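
A minimal usage sketch (assuming pywikibot is installed and configured, and that the nlp_architect dependencies named above are importable; the phrase is an arbitrary example):

wiki = WikiOnline()
pages = wiki.get_pages("Natural language processing")
for search_result in pages:
    # Each result is a WikipediaSearchPageResult pairing the searched
    # alias with its extracted WikipediaPage.
    print(search_result)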