Example No. 1
def main(path_topN_pkl):
    wiki_html = Wikipedia('fr', extract_format=ExtractFormat.HTML)
    wiki_page = Wikipedia('fr', extract_format=ExtractFormat.WIKI)

    sources = pkl.load(Path(path_topN_pkl).open('rb'))

    sources = [s[1].strip() for s in sources]

    for s in sources:
        dump_page(s, target_folder='data/10khtml', wiki_obj=wiki_html)
        dump_page(s, target_folder='data/10kpages', wiki_obj=wiki_page)
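The excerpt above leans on a few imports that are not shown; a minimal header it might sit under (assumptions: pkl is the standard pickle module and dump_page is the helper from Example No. 9):

# Assumed imports for the snippet above (not part of the original excerpt)
import pickle as pkl
from pathlib import Path

from wikipediaapi import Wikipedia, ExtractFormat

# dump_page as defined in Example No. 9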
Example No. 2
    def scrape_article_texts(self,
                             titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                             see_also=True, max_articles=10000, max_depth=1,
                             heading_text=True, title_text=True):
        """ Download text for an article and parse into sections and sentences

        >>> nlp('hello')  # to eager-load spacy model
        hello
        >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
        >>> texts = list(texts)
        >>> len(texts)
        1
        >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
        >>> len(texts)
        10
        """
        if isinstance(titles, str):
            log.error(f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"')
            titles = find_titles(titles)
        exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
        # depth starts at zero here, but as additional titles are appended the depth will increase
        title_depths = list(zip(titles, [0] * len(titles)))
        text_lens = []
        # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
        # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
        titles_scraped = set([''])
        d, num_articles = 0, 0
        wiki = Wikipedia()
        # TODO: should be able to use depth rather than d:
        for depth in range(max_depth):
            while num_articles < max_articles and d <= depth and len(title_depths) > 0:
                title = ''

                # skip titles already scraped
                while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                    # log.warning(f"Skipping {title} (already scraped)")
                    try:
                        title, d = title_depths.pop()
                    except IndexError:
                        log.info(f'Out of titles: {title_depths}')
                        break
                    title = title.strip()
                if d > max_depth or not title:
                    log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                    continue
                titles_scraped.add(title)
                log.info(f'len(title_depths): {len(title_depths)}')
                text = self.cache.get(title, None)
                if text:
                    yield text
                page = wiki.article(title)
                if not (len(getattr(page, 'text', '')) + len(getattr(page, 'summary', ''))):
                    log.warning(f"Unable to retrieve _{title}_ because article text and summary len are 0.")
Example No. 3
    def get_article(
        self,
        title: str,
        exclude_headings=EXCLUDE_HEADINGS,
        see_also=True,
        prepend_section_headings=True,
        prepend_title_text=True,
    ):
        """ same as scrape_article_texts but for single article, and checks cache first """
        page_dict = self.cache.get(title)
        if page_dict and page_dict.get('text') and page_dict.get('summary'):
            return copy.copy(page_dict)
        self.wiki = Wikipedia()
        page = self.wiki.article(title)

        text, summary, see_also_links = '', '', []
        if page.exists():
            text = getattr(page, 'text', '')
            summary = getattr(page, 'summary', '')
        else:
            time.sleep(self.sleep_nonexistent_page)
            self.cache[title] = {}
            return {}

        # FIXME: this postprocessing of Article objects to compose a text string should be in a separate function
        # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
        if see_also:
            # .full_text() includes the section heading ("See also"). .text does not
            section = page.section_by_title('See also')
            if section:
                for t in section.text.split('\n'):
                    log.info(f"  Checking _SEE ALSO_ link: {t}")
                    if t in page.links:
                        see_also_links.append(t)

        text = f'{page.title}\n\n' if prepend_title_text else ''
        # page.text
        for section in page.sections:
            if section.title.lower().strip() in exclude_headings:
                continue
            # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
            text += f'\n{section.title}\n' if prepend_section_headings else '\n'
            # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            text += section.text.replace('’', "'") + '\n'
            self.section_titles[str(section.title).strip()] = str(
                section.title).lower().strip().replace('’', "'")
        page_dict = dict(title=page.title,
                         text=text,
                         summary=summary,
                         see_also_links=see_also_links)
        self.cache[title] = page_dict
        return page_dict
Example No. 4
    def main(self):
        from aion_core.utils import remove_brackets, remove_space, remove_string_sequence
        from wikipediaapi import Wikipedia

        splitted_acph = self.activate_phrase.split("__and__")
        searched_article = remove_string_sequence(self.speech_input, splitted_acph[0], splitted_acph[-1])
        wiki = Wikipedia(aconf.Aion().get_language().split("_")[0])
        article = wiki.page(searched_article)

        if article.exists():
            article_text = remove_brackets(article.summary)
            article_text = remove_space(article_text)
            self.speech_output(alang.start("skills", "wikipedia", {"article_text": article_text}))
        else:
            self.speech_output(alang.start("skills", "wikipedia_article_not_found", {"article_name": searched_article}))
Example No. 5
class Wikipedia:
    def __init__(self, title):
        self.wiki = Wiki('ru')
        self.title = title

    def page(self):
        page = self.wiki.page(self.title)
        if not page.exists():
            page = self
            setattr(page, 'sections', [])
        return page

    def summary(self):
        page = self.page()
        if page.sections != []:
            return {'Общая информация': page.summary}

    def parse_sections(self, sections, summary=None):
        info = {}

        if summary is not None:
            info.update(summary)

        for section in sections:
        if section.text == '':
                value = self.parse_sections(section.sections)
            else:
                value = section.text
            info[section.title] = value
        return info

    def sections(self):
        return self.parse_sections(self.page().sections, self.summary())
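A brief usage sketch for this wrapper, assuming Wiki is an alias for wikipediaapi.Wikipedia and using an arbitrary Russian article title:

# Hypothetical usage; 'Москва' is just an example title.
from wikipediaapi import Wikipedia as Wiki  # assumed alias used by the class above

info = Wikipedia('Москва').sections()  # the wrapper class defined above
for heading, body in info.items():
    # body is either a section's text or a nested dict of sub-sections
    print(heading, type(body).__name__)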
Example No. 6
    def __init__(self, team_names='', team_colors='',
                 team1_players='', team2_players='',
                 options='', header='', question='',
                 twTags=''):
        if team_names == '' or team_colors == '' or team1_players == '' or team2_players == '':
            return

        if options == '' or header == '' or question == '':
            return

        self.listitems = list(options)
        self.header = str(header)
        self.question = str(question)
        self.wiki = Wikipedia('en')
        self.team_names = list(team_names)
        self.team_colors = list(team_colors)
        self.team1_players = list(team1_players)
        self.team2_players = list(team2_players)
        self.twTags = list(twTags)
Example No. 7
def main(args):
    files = glob.glob(args.filepattern)
    id_wiki = Wikipedia(language='id')
    en_wiki = Wikipedia(language='en', extract_format=args.format)
    for corpus in files:
        print(corpus)
        if os.path.isfile(corpus):
            _, fname = os.path.split(corpus)
            if args.output_dir and os.path.isdir(args.output_dir):
                output_file = os.path.join(args.output_dir, fname)
                mode = 'w+'
                print(output_file)
                if os.path.exists(output_file) and args.duplicate_append:
                    print('file exists')
                    mode = 'a'
                fileout = codecs.open(output_file, mode=mode, encoding=args.encoding)
            else:
                fileout = sys.stdout
            data = codecs.open(corpus, mode='r', encoding=args.encoding)
            for title in get_jsonlpage(data):
                page = id_wiki.page(title)
                print(title)
                try:
                    # print(page.langlinks)
                    if 'en' in page.langlinks:
                        en_title = page.langlinks['en'].title
                        en_page = en_wiki.page(en_title)
                        print(en_title)
                        # print(en_page.text)
                        en_text = print_page(en_page, args.format)
                        print(en_text, file=fileout)
                except Exception:
                    continue
Example No. 8
def birthday_of_rich(id=1):
    '''
    Parsing Billionaire Birthdays
    '''
    wiki = Wikipedia()
    for id in Rich.select().where(Rich.id >= id):
        # Billionaires with no birthday data on the Wikipedia site:
        # print(id, id.name)
        no_bday = [
            'Qin Yinglin', 'Colin Zheng Huang', 'Zhong Huijuan',
            'Walter P.J. Droege', 'Li Xiting', 'Yang Huiyan', 'Joseph Safra',
            'Lukas Walton', 'Theo Albrecht, Jr.', 'Zhang Yiming',
            'Lee Man Tat', 'Wang Wei', 'Radhakishan Damani', 'Liu Yonghao',
            'Wu Yajun', 'Sun Piaoyang', 'Pang Kang', 'Udo Tschira', 'Xu Hang',
            'Pallonji Mistry', 'Zhang Yong', 'Robert Ng', 'Iris Fontbona',
            'Donald Newhouse', 'Graeme Hart', 'Goh Cheng Liang', 'Hank Meijer',
            'Robin Zeng', 'Andreas Struengmann', 'Thomas Struengmann',
            'Hui Wing Mau', 'Quek Leng Chan', 'Sun Hongbin', 'Zhang Bangxin',
            'Lu Zhongfang', 'Cyrus Poonawalla', 'Scott Farquhar',
            'Gong Hongjia', 'Eric Wittouck', 'Xu Shihui', 'Wang Wenyin',
            'Zhang Fan', 'Chen Bang', 'Jiang Rensheng', 'Blair Parry-Okeden',
            'David Duffield', 'Eyal Ofer', 'John Grayken'
        ]
        if id.name in no_bday:
            id.bday = datetime.datetime(1, 1, 1)
            id.save()
            continue
        page_py = wiki.page(id.name)
        link = page_py.fullurl
        response = get_response(link)
        html_doc = BeautifulSoup(response.text, features='html.parser')
        date = html_doc.find('span', {'class': 'bday'})
        if date is None:
            bday = fix_for_data(id.name)
        else:
            bday = datetime.datetime.strptime(date.text, '%Y-%m-%d')
        zodiac = find_zodiac(bday)
        id.bday = bday.date()
        id.zodiac = zodiac
        id.save()
Example No. 9
def dump_page(source: str,
              target_folder: Union[Path, str] = "pages",
              wiki_obj: Wikipedia = None,
              lang: str = 'fr'):
    if not wiki_obj:
        wiki_obj = Wikipedia(lang)

    target_folder = Path(target_folder)
    if not target_folder.exists():
        makedirs(target_folder)

    wikipage = wiki_obj.page(source)
    if not wikipage.exists():
        print(f"page {source} does not exist")

    else:
        page_info = wiki_obj.info(wikipage)
        if page_info.title != wikipage.title:
            wikipage = wiki_obj.page(page_info.title)
        wiki_title = wikipage.title.replace(' ', '_')
        target_file = target_folder / (wiki_title.replace("/", "__SLASH__") +
                                       ".pkl")
        pkl.dump(wikipage, target_file.open('wb'))
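A quick call sketch for dump_page; the title and folder are example values (Example No. 1 drives the same helper from a pickled list of titles):

# Hypothetical invocation; 'Paris' and 'data/10khtml' are example values.
wiki_html = Wikipedia('fr', extract_format=ExtractFormat.HTML)
dump_page('Paris', target_folder='data/10khtml', wiki_obj=wiki_html)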
Example No. 10
def get_company_details(company):
    """
        Params: 
            - company (str)
        Returns:
            - company_description (str)
    """
    wiki_wiki = Wikipedia('en')

    try:
        # try different methods for searching  for the company until something good is returned
        page = wiki_wiki.page(company + " (company)")

        if not page.exists():
            page = wiki_wiki.page(company)
    except Exception as err:
        printColoured(err, colour="red")
        raise InvalidUserInput(
            description="Connection timed out. Please try again later")

    company_data = page.text
    company_description = company_data.split("\n")[0]
    return company_description
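A usage sketch with an arbitrary company name:

description = get_company_details("Tesla")  # example argument
print(description)  # first paragraph of the article text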
Example No. 11
def process_page(
    nlp: spacy.Language,
    category: Text,
    wiki: wikipediaapi.Wikipedia,
    page: Text,
) -> bool:
    """Fetches a single page and creates index files."""
    filename = os.path.join("pages", f"{page}.sentences.json")
    output_filename = filename.replace(".sentences.", ".index.")
    if not os.path.exists(filename):
        article = wiki.page(page)
        summary = wikipediaapi.WikipediaPageSection(wiki=wiki,
                                                    title='Summary',
                                                    text=article.summary)
        sections = [summary] + article.sections
        sentences = [
            dict(id=id, **sentence)
            for id, sentence in enumerate(extract_sections(nlp, sections))
        ]
        if any(refer in sentences[0]["line"].lower() for refer in kREFER):
            return False
        with open(filename, 'w') as outfile:
            json.dump(
                {
                    "category": category,
                    "title": page,
                    "sentences": sentences
                },
                outfile,
                indent=2)
    if not os.path.exists(output_filename):
        command = f'node util/single_index.js "{filename}" "{output_filename}"'
        subprocess.call(command, shell=True)

    bucket = fibs_firebase_config.get_bucket()
    blob = bucket.blob(f"pages/{page}.json")
    if not blob.exists():
        blob.upload_from_filename(filename, content_type='application/json')
        bucket.blob(f"indices/{page}.json").upload_from_filename(
            output_filename, content_type='application/json')
        return True
    return False
Example No. 12
class Wiki(commands.Cog):
    """Wikipedia lookup commands.
    """
    def __init__(self, bot):
        self.bot = bot
        self.wiki = Wikipedia('en')

    @commands.command(name='wiki', aliases=['page'])
    async def page(self, ctx, *, search):
        result = self.wiki.page(search)
        if not result.exists():
            return await ctx.send(embed=Embed(
                title="Page not found ⛔",
                description=f"No page was found under the name `{search}`",
                color=Color.blurple()))
        wiki = Embed(color=Color.dark_gold())
        wiki.title = result.title
        wiki.url = result.fullurl
        wiki.description = f'{result.text[0:500]}...'
        wiki.set_footer(
            text="Powered by Wikipedia",
            icon_url="https://i.ibb.co/jyX08CD/wikipedia-PNG39.png")
        wiki.timestamp = ctx.message.created_at
        return await ctx.send(embed=wiki)
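Registering the cog would look roughly like this, assuming bot is a commands.Bot instance (in discord.py 2.x, add_cog is a coroutine and must be awaited):

bot.add_cog(Wiki(bot))          # discord.py 1.x
# await bot.add_cog(Wiki(bot))  # discord.py 2.x, inside an async setup function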
Example No. 13
class Post:
    def __init__(self, team_names='', team_colors='',
                 team1_players='', team2_players='',
                 options='', header='', question='',
                 twTags=''):
        if team_names == '' or team_colors == '' or team1_players == '' or team2_players == '':
            return

        if options == '' or header == '' or question == '':
            return

        self.listitems = list(options)
        self.header = str(header)
        self.question = str(question)
        self.wiki = Wikipedia('en')
        self.team_names = list(team_names)
        self.team_colors = list(team_colors)
        self.team1_players = list(team1_players)
        self.team2_players = list(team2_players)
        self.twTags = list(twTags)

    def getWikiUrl(self, player_name=''):
        if player_name == '':
            return None

        page_py = self.wiki.page(player_name)
        if page_py.exists() is False:
            return None

        return page_py.fullurl

    @staticmethod
    def tag(name, *content, style=None, href=None, **attrs):
        if style is not None:
            attrs['style'] = style

        if href is not None:
            attrs['href'] = href

        if attrs:
            attr_str = ''.join(' %s="%s"' % (attr, value)
                               for attr, value
                               in sorted(attrs.items()))
        else:
            attr_str = ""

        if content:
            return '\n'.join('<%s%s>%s</%s>' %
                             (name, attr_str, c, name) for c in content)
        else:
            return '<%s%s />' % (name, attr_str)

    def formatApi(self):
        http_part = "http --auth : --form POST http://www.tactification.com/api_rt/v1.0/new_post "
        question_tag = self.tag('div', self.question, style='color:black')
        br1 = self.tag('br')

        li_items = str()
        for item in self.listitems:
            li_items += ''.join(self.tag('li', item))

        ul = self.tag('ul', li_items)
        div1 = self.tag('div', ul, style='color:black')

        starring_tag = self.tag('div', "Starring:", style='color:black')

        team1_url = self.getWikiUrl(self.team_names[0])
        if team1_url is None:
            print(self.team_names[0])
            return

        a_team1 = self.tag('a', self.team_names[0], href=team1_url) + ': '
        a_items = str()
        for item in self.team1_players:
            print(item)
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return

            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' + str(item[1]) + '),')

        a_items = a_items.rstrip(',')  # str.rstrip returns a new string

        i_team1 = self.tag('i', a_team1+a_items, style="color:" + str(self.team_colors[0]))
        team2_url = self.getWikiUrl(self.team_names[1])
        if team2_url is None:
            print(self.team_names[1])
            return

        a_team2 = self.tag('a', self.team_names[1], href=team2_url) + ': '
        a_items = str()
        for item in self.team2_players:
            player_url = self.getWikiUrl(item[0])
            if player_url is None:
                print(item)
                return

            a_items += ''.join(self.tag('a', item[0], href=player_url) + '(' + str(item[1]) + '),')

        a_items = a_items.rstrip(',')

        i_team2 = self.tag('i', a_team2+a_items, style="color:" + str(self.team_colors[1]))
        header = " header={!r} ".format(self.header)
        twTag = (" twTags='#{}, #{}, #{}' ".format(*self.twTags))
        end_part = "tactical_gif@home_img.jpg tactical_pic_1750@with_help_msg.jpg tactical_pic_1575@with_help_msg_75.jpg tactical_pic_875@with_help_msg_50.jpg"
        final_command = http_part + "body='" + question_tag + br1 + div1 + starring_tag + i_team1 + br1 + i_team2 + "'" + header + twTag + end_part
        print(final_command)
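The static tag() helper just builds HTML strings; a short illustration of its output (the URL is an arbitrary example):

print(Post.tag('br'))                                 # <br />
print(Post.tag('div', 'Hello', style='color:black'))  # <div style="color:black">Hello</div>
print(Post.tag('a', 'ELIZA', href='https://en.wikipedia.org/wiki/ELIZA'))
# <a href="https://en.wikipedia.org/wiki/ELIZA">ELIZA</a>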
Example No. 14
    def __init__(self, bot):
        self.bot = bot
        self.wiki = Wikipedia('en')
Example No. 15
def metadata(table, min_majority=.8):
    ''' Returns a datamart schema, assigning types to each variable, if at
    least min_majority of the values are of that type. '''
    lang = table.url.split('.', 1)[0].split('/')[-1]
    pg = Wikipedia(lang).page(table.url.rsplit('/', 1)[-1])
    try:
        date_updated = pg.touched
    except:
        date_updated = dt.now().strftime('%Y-%m-%dT%H:%M:%SZ')
    try:
        categories = [kw.lower().split(':')[-1] for kw in pg.categories]
        kws = categories
        # kws = [kw for kw in kws if not any(c in kw for c in WIKIPEDIA_IGNORE_CATEGORIES)]
        # kws = set(word for kw in kws for word in findall(r'\w+', kw) if not len(FIND_STOPWORDS(kw)))
    except:
        categories = []
        kws = []
    try:
        description = pg.summary.split('\n', 1)[0]
    except:
        description = ''
    try:
        langlinks = list({v.title for v in pg.langlinks.values()})
    except:
        langlinks = []
    res = {
        "title": table.context['r0'] if 'r0' in table.context else 'Table in %s' % pg.title,
        "description": description,
        "url": table.url,
        "keywords": list(kws),
        "date_updated": date_updated,
        "provenance": {
            "source": "wikipedia.org"
        },
        "materialization": {
            "python_path": "wikitables_materializer",
            "arguments": {
                "url": table.url,
                "xpath": table.xpath
            }
        },
        "additional_info": {
            "categories": categories,
            "sections": [s.title for s in pg.sections],
            "translations": langlinks
        }
    }
    res['variables'] = []
    for name in table.record[0].keys():
        var = {'name': name, 'semantic_type': []}
        values = [r[name] for r in table.record]
        min_sample = min_majority * len(values)
        dates = [d for d in map(find_dates, values) if d is not None]
        if len(dates) >= min_sample:
            var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/Time')
            var['temporal_coverage'] = {'start': min(dates), 'end': max(dates)}
        entities = {v: t for v in values for v, t in find_entities(v).items()}
        locations = [v for v, t in entities.items() if t == 'GPE']
        if len(locations) >= min_sample:
            var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/Location')
        people = [v for v, t in entities.items() if t == 'PERSON']
        if len(people) >= min_sample:
            var['semantic_type'].append('https://schema.org/Person')
        if len(entities) >= min_sample:
            var['named_entity'] = list(entities.keys())
        numbers = [float(n) for n in values if n.strip().replace('.', '', 1).isdigit()]
        ranges = [n for n in values if BOOLEAN_SYNTAX_PROPERTIES['match-range'](n) is not None]
        if len(numbers) >= min_sample:
            var['semantic_type'].append('http://schema.org/Float')
        elif len(ranges) >= min_sample:
            var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/Interval')
        if not len(var['semantic_type']):
            if any(len(c) for c in values):
                var['semantic_type'].append('http://schema.org/Text')
            else:
                var['semantic_type'].append('https://metadata.datadrivendiscovery.org/types/MissingData')
        res['variables'].append(var)
    return res
Example No. 16
        "Л": 0,
        "М": 0,
        "Н": 0,
        "О": 0,
        "П": 0,
        "Р": 0,
        "С": 0,
        "Т": 0,
        "У": 0,
        "Ф": 0,
        "Х": 0,
        "Ц": 0,
        "Ч": 0,
        "Ш": 0,
        "Щ": 0,
        "Э": 0,
        "Ю": 0,
        "Я": 0
    }
    for name in names_list:
        alphabet_dict[name[:1].upper()] += 1
    return alphabet_dict


wiki_wiki = Wikipedia('ru')
members = wiki_wiki.page('Категория:Животные по алфавиту').categorymembers
animal_names = get_russian_category_members(members)
counted_names = count_names(animal_names)
for key, value in counted_names.items():
    print(f'{key}: {value}')
Example No. 17
from wikipediaapi import Wikipedia, ExtractFormat

wiki = Wikipedia('en', extract_format=ExtractFormat.HTML)

not_found = '<h1> Page not found </h1>'.encode('utf-8')


def search(term):

    page = wiki.page(term)

    if page.exists():
        # Disambiguation pages ("may refer to") are treated as not found.
        if 'may refer to' in page.summary:
            return not_found

        return page.summary.encode('utf-8')

    else:
        return not_found
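A small usage sketch; the term is an arbitrary example and the return value is UTF-8 encoded bytes:

result = search('Python (programming language)')  # example term
print(result.decode('utf-8'))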
Example No. 18
import pandas as pd
from wikipediaapi import Wikipedia

W_API = Wikipedia('en')
TOP_25_URL = "https://en.wikipedia.org/wiki/Wikipedia:Top_25_Report"


def wiki_page_list(url, n_pages=None, article_column='Article'):
    tables = pd.read_html(url)
    selected_table = None
    for table in tables:
        if article_column in table:
            selected_table = table
            break
    if selected_table is None:
        raise KeyError(f'No column "{article_column}" in any tables at {url}.')

    if n_pages is None:
        n_pages = selected_table.shape[0]
    return list(selected_table[article_column][:n_pages])


def top25():
    return wiki_page_list(TOP_25_URL, 25)
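A hedged sketch combining these helpers with the module-level W_API client to print a short summary line for each of the top-25 pages:

for title in top25():
    page = W_API.page(title)
    if page.exists():
        # first line of the summary, truncated for display
        print(title, '-', page.summary.split('\n')[0][:120])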
Example No. 19
def scrape_articles(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                    see_also=True, max_articles=10000, max_depth=1):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> df = scrape_articles(['ELIZA'], see_also=False)
    >>> df.shape[0] > 80
    True
    >>> df.columns
    Index(['depth', 'title', 'section', 'sentence'], dtype='object')
    """
    titles = list([titles] if isinstance(titles, str) else titles)
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    title_depths = list(zip(titles, depths))
    sentences = []
    num_articles = 0
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    title, d = '', 0
    wiki = Wikipedia()
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None
            # skip None titles and titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.warning(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.error(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            num_articles += 1
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                for t in section.text.split('\n')[1:]:
                    log.info(f'  Checking see also link: {t}')
                    if t in page.links:
                        log.info(f'    yep, found it in page.links')
                        title_depths.append((t, d + 1))
                log.info(f'  extended title_depths at depth {d}: {title_depths}')
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text = section.text.replace('’', "'")  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
                # FIXME: need to rejoin short names before colons, like 'ELIZA:' 'Tell me...', and 'Human:' 'What...'
                # FIXME: need to split on question marks without white space but where next word is capitalized: ...to be unhappy?Though designed strictly...
                sentences.extend([
                    (d, title, section.title, s.text) for s in nlp(text).sents if (
                        len(s.text.strip().strip('"').strip("'").strip()) > 1)
                ])
            log.debug(f'Parsed {len(sentences)} sentences.')

            # retval = parse_sentences(
            #     title=title, sentences=sentences, title_depths=title_depths, see_also=see_also,
            #     exclude_headings=exclude_headings, d=d, depth=depth, max_depth=max_depth)
            # if retval is None:
            #     continue
            # else:
            #     sentences, title_depths = retval
            log.info(str([depth, d, num_articles, title]))
            if d > depth:
                log.warning(f"{d} > {depth}")
                break

    return pd.DataFrame(sentences, columns='depth title section sentence'.split())
Example No. 20
from django.forms.models import model_to_dict
from django.http import Http404, JsonResponse
from django.shortcuts import render
from django.utils.functional import lazy
from .models import WikiArticle

from wikipediaapi import Wikipedia

WIKI_WIKI = Wikipedia('en')


class Message:
    send = True

    def __init__(self, text):
        if Message.send:
            self.action = 'send'
        else:
            self.action = 'receive'
        Message.send = not self.send
        self.text = text


def home(request):
    return render(request, 'home.html', {'name_json': 'Grieves'})


def chat_page(request, bot_name):
    return render(request, 'home.html', {'name_json': bot_name})

Example No. 21
from typing import List
from wikipediaapi import Wikipedia, WikipediaPage

from src.exceptions.downloader_exceptions import PageNotFoundError
from src.downloader.models.DownloadedData import DownloadedData

wiki_extractor = Wikipedia(language='ja')


def get_wikipedia_data_for_output(
        search_page_names: List[str]) -> List[DownloadedData]:
    pages = download_wikipedia_pages(search_page_names=search_page_names)
    return [DownloadedData.from_wikipedia_page(page) for page in pages]


def download_wikipedia_pages(
        search_page_names: List[str]) -> List[WikipediaPage]:
    pages = list()
    for page_name in search_page_names:
        page = wiki_extractor.page(page_name)
        if not page.exists():
            raise PageNotFoundError(page_name=page_name,
                                    page_source="Wikipedia")
        pages.append(page)
    return pages
Example No. 22
def scrape_article_texts(titles=TITLES, exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True, max_articles=10000, max_depth=1,
                         heading_text=True, title_text=True):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> len(texts)
    1
    >>> texts = scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3)
    >>> len(texts) == 10
    True
    """
    titles = [titles] if isinstance(titles, str) else titles
    exclude_headings = set([eh.lower().strip() for eh in (exclude_headings or [])])
    depths = list([0] * len(titles))
    # depth is always zero here, but this would be useful further down
    title_depths = list(zip(titles, depths))
    texts = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    title, d, num_articles = '', 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(title_depths):
            title = None

            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            page = wiki.article(title)
            if not (len(page.text) + len(page.summary)):
                log.warn(f"Unable to retrieve {title}")
                time.sleep(2.17)
                continue
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                for t in section.text.split('\n')[1:]:
                    log.info(f'  Checking see also link: {t}')
                    if t in page.links:
                        log.info(f'    yep, found it in page.links')
                        title_depths.append((t, d + 1))
                log.info(f'  extended title_depths at depth {d}: {title_depths}')
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                text += section.text.replace('’', "'") + '\n'  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            texts.append(text)
            log.warn(f'Added article "{page.title}" with {len(text)} characters. Total chars = {sum((len(t) for t in texts))}')
            log.warn(str([depth, d, num_articles, title]))
            if len(texts) >= max_articles:
                log.warn(f"num_articles={num_articles} ==> len(texts)={len(texts)} >= max_articles={max_articles}")
                break
            if d > depth:
                log.warn(f"{d} > {depth}")
                break
    return texts
Example No. 23
def get_filtered_complete_dic(pkl_with_stats_fn,
                              min_paragraphs=5,
                              min_len_paragraphs=500,
                              max_len_paragraphs=1000,
                              draft=False,
                              homonym=False,
                              years=False,
                              wiki_path=None,
                              clean_duplicates=False):
    with open(pkl_with_stats_fn, 'rb') as f:
        stats_uncleaned = pkl.load(f)

    # We filter out the sections errors
    stats = {
        key: stats_uncleaned[key]
        for key in stats_uncleaned if stats_uncleaned[key] != 'SectionError'
    }

    filtered_stats = filter_dic(stats,
                                min_len_paragraphs=min_len_paragraphs,
                                draft=draft,
                                homonym=homonym,
                                max_len_paragraphs=max_len_paragraphs)
    filtered_stats = filter_min_paras(filtered_stats, min_paragraphs)

    if clean_duplicates:
        if wiki_path is None:
            print("Error : give a wikipath for duplicates cleaning")
            return
        new_ft_stats = {}
        wiki_obj = Wikipedia('fr')
        for filename, stats in filtered_stats.items():
            try:
                with open(wiki_path + '/' + filename, 'rb') as f:
                    page = pkl.load(f)
            except FileNotFoundError:
                print("Not found :" + filename)
                continue
            page_info = wiki_obj.info(page)
            new_title = title = page_info.title
            new_title = new_title.replace(' ', '_')
            new_title += '.pkl'
            new_ft_stats[new_title] = stats
        filtered_stats = new_ft_stats

    # Filter out year articles unless `years` is True
    if not years:
        print("Length before year filtering:", len(filtered_stats))
        if wiki_path is None:
            filtered_stats = {
                filename: filtered_stats[filename]
                for filename in filtered_stats
                if filter_years_articles(filename)
            }
        else:
            filtered_stats = {
                filename: filtered_stats[filename]
                for filename in filtered_stats
                if filter_years_articles(wiki_path + filename)
            }
    print("Final length : ", len(filtered_stats))
    return filtered_stats
Example No. 24
def scrape_article_texts(titles=TITLES,
                         exclude_headings=EXCLUDE_HEADINGS,
                         see_also=True,
                         max_articles=10000,
                         max_depth=1,
                         heading_text=True,
                         title_text=True):
    """ Download text for an article and parse into sections and sentences

    >>> nlp('hello')  # to eager-load spacy model
    hello
    >>> texts = scrape_article_texts(['ELIZA'], see_also=False)
    >>> texts = list(texts)
    >>> len(texts)
    1
    >>> texts = list(scrape_article_texts(['Chatbot', 'ELIZA'], max_articles=10, max_depth=3))
    >>> len(texts)
    10
    """
    if isinstance(titles, str):
        log.error(
            f'DEPRECATED `titles` should be a list of strs, not titles="{titles}"'
        )
        titles = find_titles(titles)
    exclude_headings = set(
        [eh.lower().strip() for eh in (exclude_headings or [])])
    # depth starts at zero here, but as additional titles are appended the depth will increase
    title_depths = list(zip(titles, [0] * len(titles)))
    text_lens = []
    # FIXME: breadth-first search so you can do a tqdm progress bar for each depth
    # FIXME: record title tree (see also) so that .2*title1+.3*title2+.5*title3 can be semantically appended to sentences
    titles_scraped = set([''])
    d, num_articles = 0, 0
    wiki = Wikipedia()
    # TODO: should be able to use depth rather than d:
    for depth in range(max_depth):
        while num_articles < max_articles and d <= depth and len(
                title_depths) > 0:
            title = ''

            # skip titles already scraped
            while len(title_depths) and len(titles_scraped) and (
                    not title or title in titles_scraped):
                # log.warning(f"Skipping {title} (already scraped)")
                try:
                    title, d = title_depths.pop()
                except IndexError:
                    log.info(f'Out of titles: {title_depths}')
                    break
                title = title.strip()
            if d > max_depth or not title:
                log.info(f"{d} > {max_depth} or title ('{title}') is empty")
                continue
            titles_scraped.add(title)
            log.info(f'len(title_depths): {len(title_depths)}')
            page = wiki.article(title)
            if not (len(getattr(page, 'text', '')) +
                    len(getattr(page, 'summary', ''))):
                log.warning(
                    f"Unable to retrieve _{title}_ because article text and summary len are 0."
                )
                time.sleep(2.17)
                continue
            # FIXME: this postprocessing of Article objects to compose a text string should be in a separate function
            # TODO: see_also is unnecessary until we add another way to walk deeper, e.g. links within the article
            if see_also and d + 1 < max_depth:
                # .full_text() includes the section heading ("See also"). .text does not
                section = page.section_by_title('See also')
                if not section:
                    continue
                for t in section.text.split('\n')[1:]:
                    log.info(f"  Checking _SEE ALSO_ link: {t}")
                    if t in page.links:
                        log.info(
                            f'     Found title "{t}" in page.links at depth {d}, so adding it to titles to scrape...'
                        )
                        title_depths.append((t, d + 1))
                log.info(
                    f'  extended title_depths at depth {d}: {title_depths}')
            text = f'{page.title}\n\n' if title_text else ''
            # page.text
            for section in page.sections:
                if section.title.lower().strip() in exclude_headings:
                    continue
                # TODO: use pugnlp.to_ascii() or nlpia.to_ascii()
                text += f'\n{section.title}\n' if heading_text else '\n'
                text += section.text.replace(
                    '’', "'"
                ) + '\n'  # spacy doesn't handle "latin" (extended ascii) apostrophes well.
            yield text
            text_lens.append(len(text))
            log.warning(
                f'Added article "{page.title}" with {len(text)} chars.')
            log.info(f'  Total scraped {sum(text_lens)} chars')
            log.warning(str([depth, d, num_articles, title]))
            if len(text_lens) >= max_articles:
                log.warning(
                    f"num_articles={num_articles} ==> len(text_lens)={len(text_lens)} >= max_articles={max_articles}"
                )
                break
            if d > depth:
                log.warning(f"{d} > {depth}")
                break
Example No. 25
    def _wiki_api(lang):
        return Wikipedia(language=lang, extract_format=ExtractFormat.HTML)
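A usage sketch, assuming _wiki_api is reachable as a plain function or staticmethod; with ExtractFormat.HTML the extracts come back as HTML markup:

en_wiki = _wiki_api('en')
page = en_wiki.page('Python (programming language)')  # example title
print(page.summary[:200])  # HTML-formatted extract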
Example No. 26
    def __init__(self, title):
        self.wiki = Wiki('ru')
        self.title = title
Example No. 27
from wikipediaapi import Wikipedia

player_name = input("Enter player name: ")
wiki = Wikipedia(language='en')
page = wiki.page(player_name)

for s in page.sections:
    if 'Club career' in s.title:
        print(s.title)
        for p in s.sections:
            print("\t"+p.title)
            if p.sections:
                for q in p.sections:
                    print("\t\t"+q.title)
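The nested loops above only descend three levels; a hedged recursive variant that prints the whole section tree of the matched section at any depth:

def print_section_tree(sections, level=0):
    # wikipediaapi sections nest arbitrarily deep via their .sections attribute
    for section in sections:
        print('\t' * level + section.title)
        print_section_tree(section.sections, level + 1)

for s in page.sections:
    if 'Club career' in s.title:
        print_section_tree([s])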