Example #1
def parse_epub(filename: str, abbr: bool, code: bool):
    """
    Parse an epub file and return its cleaned plain text.
    """
    book = epub.read_epub(filename)
    title = book.get_metadata('DC', 'title')[0][0]
    remove_hashtags = title in TITLES_REMOVE_HASHTAGS  # whether hashtags should be stripped for this title
    print('\nParsing book "{0}"'.format(title))
    list_plaintexts = []
    counter_abbrs = Counter()
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        name = item.get_name()
        if not re.match(REGEX_CHAPITRE, name):
            print('...Ignoring {0}'.format(name))
            continue
        print('...Parsing {0}'.format(name))
        # parse and clean chapter
        plaintext, abbrs = clean_epub_item(item, abbr, code, remove_hashtags)
        list_plaintexts.append(plaintext)
        counter_abbrs += Counter(abbrs)
    book_plaintext = '\n\n\n'.join(list_plaintexts)
    # replace numbers
    book_plaintext = filter_numbers(book_plaintext)
    # normalize
    book_plaintext = maybe_normalize(book_plaintext)
    if abbr:
        print('Abbreviation counts:\n{0}'.format(counter_abbrs.items()))
    return book_plaintext
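
A minimal usage sketch for the parser above, assuming the helpers it references (clean_epub_item, maybe_normalize, filter_numbers, REGEX_CHAPITRE, TITLES_REMOVE_HASHTAGS) are defined in the same module; the imports listed are the ones the snippet itself relies on, and the file names are placeholders.

import re
from collections import Counter

import ebooklib
from ebooklib import epub

# "livre.epub" and "livre.txt" are placeholder paths; the abbr/code flags are
# simply passed through to clean_epub_item.
plaintext = parse_epub('livre.epub', abbr=True, code=False)
with open('livre.txt', 'w', encoding='utf-8') as out:
    out.write(plaintext)
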
Example #2
def fetch_play_text(url):
    text = []

    if url:
        if 'libretheatre.fr' in url:
            text = fetch_play_text_libretheatre(url)
        elif 'wikisource.org' in url:
            text = fetch_play_text_wikisource(url)

    finaltext = []
    for line in text:
        line = maybe_normalize(line)
        line = maybe_normalize(line, mapping=mapping_specific)
        line = filter_numbers(line)
        line = line.strip()

        maybe_matches = re.finditer(PUNCT_NBSP, line)
        for maybe_match in maybe_matches:
            line = line.replace(
                maybe_match.group(0),
                "%s\u00a0%s" % (maybe_match.group(1), maybe_match.group(2)))

        finaltext += [line]

    return finaltext
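
PUNCT_NBSP is not defined in this snippet; the loop above captures two groups and rejoins them with U+00A0, matching the French rule of placing a no-break space before tall punctuation. A plausible definition, given here only as an assumption:

import re

# Hypothetical pattern: group 1 is the character preceding the punctuation sign,
# group 2 is the sign itself; the loop above rebinds them with a no-break space.
PUNCT_NBSP = re.compile(r"(\S) ([;:!?»])")
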
Example #3
def fetch_play_text(url):
    text = []

    if url:
        if 'libretheatre.fr' in url:
            text = fetch_play_text_libretheatre(url)
        elif 'wikisource.org' in url:
            text = fetch_play_text_wikisource(url)

    finaltext = []
    for line in text:
        line = maybe_normalize(line)
        line = maybe_normalize(line, mapping=mapping_specific)
        line = filter_numbers(line)
        line = line.strip()
        line = line.replace("\n", " ")

        finaltext += [line]

    return finaltext
Example #4
def format_address(address, template):
    # NB: zipcode is sometimes pronounced in 3 parts
    # ex: 75001 => soixante quinze zero zero un
    # and sometimes pronounced in 2 parts
    # ex: 01090 => zero un quatre vingt dix
    # see unit tests for more info

    zipcode = address['zipcode']

    zipcode_alt = '{}{}, {}{}{}'.format(*zipcode)
    address.update(
        zipcode=zipcode_alt if zipcode.startswith('0') else zipcode,
        zipcode_alt=zipcode_alt,
    )

    result = template.format(
        street_lower='{}{}'.format(address['street'][0].lower(),
                                   address['street'][1:]),
        **address)

    result = maybe_normalize(result, mapping=normalizers)
    result = filter_numbers(result)
    return result.strip()
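
A small call sketch for the zip-code handling described in the comment above; the address fields and the template string are made up for illustration, and the exact wording of the result depends on the normalizers and filter_numbers helpers.

# Hypothetical data, only to show the call shape and the zipcode_alt switch.
address = {'street': 'Rue de Rivoli', 'zipcode': '01090'}
template = '{street_lower}, {zipcode}'

spoken = format_address(address, template)
# '01090' starts with '0', so the split form built as zipcode_alt ("01, 090")
# is substituted before the digits are spelled out by filter_numbers.
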
Example #5
                    if args.one:
                        break

                doc.expandNode(node)
                date_seance = node.firstChild.nodeValue

                if len(date_seance) != 17:
                    print("Bogus DateSeance?", date_seance)
                    continue

                seance_context = {'DateSeance': date_seance}

    if event == END_ELEMENT:
        indent_level -= 2
        if type(node) == Element and len(visited) > 0:
            old = visited.pop()
            del old

    if node.nodeName == 'texte' and seance_context is not None and 'DateSeance' in seance_context:
        doc.expandNode(node)

        parent = visited[-2]
        if (parent.attributes and 'code_style' in parent.attributes
                and parent.attributes['code_style'].value == 'NORMAL'):
            fullText = filter_numbers(recursive_text(node))
            fullText = re.compile(r'\s+').sub(' ', fullText)
            try:
                seance_context[node.nodeName].append(fullText)
            except KeyError:
                seance_context[node.nodeName] = [fullText]
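
The fragment above comes from inside a pull-parser event loop that is not shown. A minimal sketch of the kind of xml.dom.pulldom loop it plugs into, given as an assumption about the surrounding structure (the file name is a placeholder):

from xml.dom.pulldom import parse, START_ELEMENT

doc = parse('compte_rendu.xml')  # hypothetical input file
seance_context = None

for event, node in doc:
    if event == START_ELEMENT and node.nodeName == 'DateSeance':
        # expandNode() pulls the element's subtree into memory, after which
        # node.firstChild.nodeValue is readable, as in the fragment above.
        doc.expandNode(node)
        seance_context = {'DateSeance': node.firstChild.nodeValue}
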
Example #6
def get_added_content(url, revid, lang):
    """
    Retrieves all content created by the contributor, except minor edits and derivative works like translations, 
    content mixed with other contributors, or reverts
    The "url" parameter specifies the API's base url.
    The "revid" parameter specifies the ID of the revision to check and retrieve.
    The "lang" parameter specifies the code of the processed language (e.g. "en", "fr", etc.)
    """
    #We want to compare the revision to the previous one, to see the content the contributor added (or not)
    compare_query = {
        "action": "compare",
        "fromrev": revid,
        "torelative": "prev",
        "prop": "rel|diffsize|size|diff|title",
        "format": "json"
    }
    #    print(compare_query)
    response = requests.post(url, params=compare_query).json()
    if "compare" not in response.keys():
        return None
    revid_size = response["compare"]["tosize"]

    if "prev" in response["compare"].keys(
    ):  #If there are previous revisions, we need to check if the current revision isn't a derivative work (i.e. a revert)
        #Check if it's a revert
        rvcontinue = None
        revid_size = 0
        current_size = response["compare"]["tosize"]
        previous_size = response["compare"]["fromsize"]
        if previous_size > current_size:
            return None
        while True:
            #Let's compare the current and previous revisions of the page
            pr_query = {
                "action": "query",
                "prop": "revisions",
                "rvprop": "ids|tags|size",
                "format": "json",
                #                    "revids":previous_revision_id
                "rvendid": revid,
                "titles": response["compare"]["totitle"]
            }  #for retrieving a list of previous revisions until the current one
            if rvcontinue is not None:
                pr_query["rvcontinue"] = rvcontinue
            pr_response = requests.post(url, data=pr_query).json()
            for page in pr_response["query"]["pages"]:
                #Check if the current revision is a revert.
                for revision in pr_response["query"]["pages"][page]["revisions"]:
                    #                    print(revision.keys())
                    if revision["revid"] == revid:
                        revid_tags = revision["tags"]
                        if "mw-rollback" in revid_tags:  #Here, we're sure it's a revert
                            return None
                        continue
                    # If a previous revision has the same size as the current revision,
                    # the revision we want to retrieve may be a revert. Be conservative
                    # and treat it as one.
                    if revision["size"] == revid_size:
                        return None
            if "continue" in pr_response.keys():
                rvcontinue = pr_response["continue"]["rvcontinue"]
            else:
                break
    #Now, let's retrieve the revision content!
    raw_html = response["compare"]["*"]
    document = html.document_fromstring(raw_html)
    added_lines = document.xpath("//td[@class='diff-addedline']")
    #        deleted_lines = document.xpath("//td[@class='diff-deletedline']")
    text_list = []
    for td in added_lines:
        for div in td:
            if len(div.getchildren()) > 0:
                # If there are child tags, this is an inline modification rather than
                # an addition -> skip it.
                continue
            else:
                text = div.text_content()
                if "#REDIRECT" in text:
                    return None

                try:
                    #TODO: convert scales (1/25000, etc.)
                    text = pypandoc.convert_text(
                        text, to="plain", format="html").replace("\r\n", " ")
                    #to avoid removing relevant content in the {{lien}} template (French wikipedia)
                    text = re.sub(r"{{lien\|([^}]+)}}", r"\1", text)
                    text = pypandoc.convert_text(
                        text, to="html", format="mediawiki").replace("\r\n", " ")
                    #and we retrieve the real plain text
                    #TODO: add cleaning up of (), [], etc.
                    text = html.document_fromstring(text)
                    text = text.text_content()
                    text = text.replace("\xa0", " ")
                    #replacing by a space rather than by nothing, to ease the further string cleanup
                    text = re.sub(r' \([^)]+\)', '', text)
                    text = re.sub(r'\([^)]+\)', '', text)
                    text = maybe_normalize(text)
                    text = maybe_normalize(text, mapping=mapping_specific)
                    # In French, thousands are separated by a space; num2words does not
                    # handle that, so remove those spaces.
                    text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
                    #TODO: need to internationalize this part below
                    #converting latlon coordinates
                    #                    text = re.sub(r'([0-9]+) ?°([0-9]+) ?\'([0-9]+) ?\"', r"\1 degrés \2 minutes \3 secondes", text)
                    #            text = re.sub(r'-(\d*\.\d+|\d+)', "moins \1", text)
                    #                    for measure in measure_units:
                    #                        text = re.sub(r'(\[0-1]\[,.]\d+|\[0-1]) ?{measure}'.format(measure=measure), r"\1 {full_name}".format(full_name=measure_units[measure]), text)
                    #                        text = re.sub(r'(\d*\[,.]\d+|\d+) ?{measure}'.format(measure=measure), r"\1 {full_name}s".format(full_name=measure_units[measure]), text)
                    #                    text = text.replace(" ?%", r" pour cent")
                    #remove references between brackets
                    text = re.sub(r'\[[0-9]+\]', '', text)  #r'\[[0-9]+*\]'
                    detected_lang = langid.classify(text)[0]

                    if detected_lang != lang:
                        continue
                    #Transforming numbers in letters
                    try:
                        text = filter_numbers(text, lang=lang)
                    except:
                        pass
                    text = text.strip()

                    if is_garbage(text, lang):
                        #                        print("garbage:", text)
                        continue


#                    text = correct_sentence(text, lang) #TODO: uncomment
                except:
                    continue  #if pandoc cannot convert wikicode, there's a problem, and we don't want to retrieve malformed text
                if len(text.split()) > 3:  # Don't retrieve texts that are too short
                    text_list.append(text)
    return " ".join(text_list)
Example #7
def parse_one_book(bookid):
    this_line = 0
    has_title = False
    mainpage_marker = '    '
    has_mainpage = False
    has_start_mainpage = False
    has_end_mainpage = False

    ebook = load_etext(bookid, refresh_cache=True,
                       mirror=GUTENBERG_MIRROR).replace('\r\n', '\n')
    raw_text = remove_markup(strip_headers(ebook).strip()).split('\n')
    search_for_mainpage_marker = any(
        x.startswith(mainpage_marker) for x in raw_text)
    #print('search_for_mainpage_marker', search_for_mainpage_marker)

    finaltext = []
    for line in raw_text:
        #print('LINE=="{}"'.format(line))

        this_line += 1

        if len(line) == 0:
            continue

        if not has_title:
            if (search_for_mainpage_marker
                    and line.startswith(mainpage_marker)) or True:
                if line.isupper():
                    has_title = True
                    #print('FOUND TITLE @', this_line, "'{}'".format(line))
            continue

        if not has_mainpage:
            if not has_start_mainpage:
                if (search_for_mainpage_marker
                        and line.startswith(mainpage_marker)) or True:
                    has_start_mainpage = True
                    #print('FOUND MAIN PAGE START @', this_line, "'{}'".format(line))
                continue
            else:
                if (search_for_mainpage_marker
                        and line.startswith(mainpage_marker)) or True:
                    has_end_mainpage = True
                    #print('FOUND MAIN PAGE END @', this_line, "'{}'".format(line))
                else:
                    continue

            has_mainpage = has_start_mainpage and has_end_mainpage

        if line.startswith('  '):
            #print('FOUND SOME EXTRA @', this_line, "'{}'".format(line))
            continue

        if line.isupper():
            #print('FOUND ONE CHAPTER @', this_line, "'{}'".format(line))
            continue

        line = maybe_normalize(line)
        line = maybe_normalize(line, mapping=mapping_specific)
        line = filter_numbers(line).lstrip()

        maybe_matches = re.finditer(PUNCT_NBSP, line)
        for maybe_match in maybe_matches:
            line = line.replace(
                maybe_match.group(0),
                "%s\u00a0%s" % (maybe_match.group(1), maybe_match.group(2)))

        finaltext += [line]

    return finaltext
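
A call sketch for parse_one_book, assuming GUTENBERG_MIRROR and the remove_markup helper referenced above are defined elsewhere in the module; load_etext and strip_headers come from the gutenberg package, and the book id is a placeholder.

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

lines = parse_one_book(12345)  # hypothetical Project Gutenberg e-text id
print('\n'.join(lines[:10]))
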
def get_article_texts(lang, revid_list):
    """Retrieves revisions specified in the "revid_lisst".
    The "lang" parameter specifies the Wikipedia version, e.g. "fr"
    To be used only with the first revision of articles originally created by the contributor.
    """
    url = "https://{lang}.wikipedia.org/w/api.php".format(lang=lang)
    query = {"action":"parse",
             "format":"json"
             }
    text_list = []
    for revid in revid_list:
        time.sleep(1)
        query["oldid"] = revid
        try:
            response = requests.post(url, data=query)
        except:
            time.sleep(30)
            response = requests.post(url, data=query)
        response = response.json()
        if "parse" not in response.keys(): #it's possible that the revision was since deleted, in this case there's nothing to parse
            continue
        raw_html = response["parse"]["text"]["*"]
        document = html.document_fromstring(raw_html)
        all_p = document.xpath("//p")
        for p in all_p:
            text = p.text_content()
            text = text.replace("\xa0", " ")
            #replacing by a space rather than by nothing, to ease the further string cleanup
            text = re.sub(r' \([^)]+\)', '', text) 
            text = re.sub(r'\([^)]+\)', '', text) 
            text = maybe_normalize(text)
            text = maybe_normalize(text, mapping=mapping_specific)
            # In French, thousands are separated by a space; num2words does not handle
            # that, so remove those spaces.
            text = re.sub(r'(\d)\s+(\d)', r'\1\2', text)
            #TODO: need to internationalize this part below
            #converting latlon coordinates
#            text = re.sub(r'([0-9]+) ?°([0-9]+) ?\'([0-9]+) ?\"', r"\1 degrés \2 minutes \3 secondes", text)
##            text = re.sub(r'-(\d*\.\d+|\d+)', "moins \1", text)
#            for measure in measure_units:
#                text = re.sub(r'(\[0-1]\[,.]\d+|\[0-1]) ?{measure}'.format(measure=measure), r"\1 {full_name}".format(full_name=measure_units[measure]), text)
#                text = re.sub(r'(\d*\[,.]\d+|\d+) ?{measure}'.format(measure=measure), r"\1 {full_name}s".format(full_name=measure_units[measure]), text)
#                
#            text = re.sub(r'(\[0-1]\[,.]\d+|\[0-1]) ?°', r"\1 degré", text)
#            text = re.sub(r'(\d*\[,.]\d+|\d+) ?°', r"\1 degrés", text)
#            text = re.sub(r'(\d*\[,.]\d+|\d+) ?mm', r"\1 millimètres", text)
#            text = re.sub(r'(\d*\[,.]\d+|\d+) ?cm', r"\1 centimètres", text)
#            text = re.sub(r'(\d*\[,.]\d+|\d+) ?m[^a-z]', r"\1 mètres ", text)
#            text = re.sub(r'(\d*\[,.]\d+|\d+) ?km', r"\1 kilomètres", text)
#            text = text.replace(" ?%", r" pour cent") 
            #remove references between brackets
            text = re.sub(r'\[[0-9]+\]', '', text) #r'\[[0-9]+*\]'
            #Transforming numbers in letters
            text = filter_numbers(text, lang=lang)
            text = text.strip()
#        text= " ".join([p.text_content().replace("\xa0", " ") for p in all_p])
            if "\n" in text or is_garbage(text, lang) == True:                
                text = ""
            if langid.classify(text)[0] != lang:                
                text = ""
#            text = correct_sentence(text, lang) #TODO: uncomment
#            text = text.replace("%", "pour cent") 
            if len(text.split()) > 3:
                #TODO: check content spelling
#                try:
#                    matches = tool.check(text)
#                    text = language_check.correct(text, matches)
#                except Exception as e:
#                    print(text)
#                    print("erreur correction : ", str(e))
#                    print(revid)
#                    print("*"*20)
                text_list.append(text)
        
    return text_list
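
A call sketch for get_article_texts; the revision IDs below are placeholders and would normally come from a query over the contributor's history.

texts = get_article_texts("fr", [123456789, 987654321])  # hypothetical revids
for t in texts:
    print(t[:120])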