Example #1
def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string."""
    # Get Project Gutenberg document ID from url string
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search(r"(?:files|ebooks|epub)/(\d+)", urlsplit(url).path)
    if not match:
        raise Exception('Not a valid url')
    document_id = int(match.group(1))
    return super_cleaner(strip_headers(load_etext(document_id).strip()),
                         mark_deletions=False)
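A minimal usage sketch for this example, assuming load_etext and strip_headers come from the gutenberg package, super_cleaner from gutenberg_cleaner, and that validate_url is a small project helper (stubbed here); the mark_deletions keyword above may belong to a project-local variant of super_cleaner.

# Sketch only: imports and a stub for the helper this example assumes.
import re
from urllib.parse import urlsplit

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg_cleaner import super_cleaner


def validate_url(url, expected_netloc):
    # Hypothetical stand-in for the project's own validator.
    if expected_netloc not in urlsplit(url).netloc:
        raise Exception('Not a valid url')


# Any Project Gutenberg ebook URL works; the numeric ID is read from the path.
text = get_gutenberg_document('https://www.gutenberg.org/ebooks/11')
print(text[:200])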
Example #2
def remove_header_footer(pgid, data):
    # Try utf-8, then cp1252, then latin-1; keep the first decode that succeeds
    # (latin-1 cannot fail, so it acts as the last-resort fallback).
    for encoding in ('utf-8', 'cp1252', 'latin-1'):
        try:
            string = data.decode(encoding)
            break
        except UnicodeDecodeError:
            continue

    #cleaned_string = simple_cleaner(string)
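    # Lines that super_cleaner strips are expected to come back as the
    # placeholder "[deleted]"; drop those, then trim blank lines from the edges.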
    cleaned_string_lines = super_cleaner(string).strip().split('\n')
    cleaned_string_lines = [
        l for l in cleaned_string_lines if l.strip() != "[deleted]"
    ]
    cleaned_string = ('\n'.join(cleaned_string_lines)).strip()
    cleaned_string_lines = cleaned_string.split('\n')

    remove_first_line = False
    remove_last_line = False
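    # skip_lines is assumed to be a module-level list of lowercase header/footer
    # markers (e.g. 'produced by') defined elsewhere in this project.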
    for l in skip_lines:
        if l in cleaned_string_lines[0].lower():
            remove_first_line = True

    for l in skip_lines:
        if l in cleaned_string_lines[-1].lower():
            remove_last_line = True

    if remove_first_line:
        cleaned_string_lines = cleaned_string_lines[1:]

    if remove_last_line:
        cleaned_string_lines = cleaned_string_lines[:-1]

    cleaned_string = '\n'.join(cleaned_string_lines)

    return cleaned_string.strip().encode('utf-8')
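A short usage sketch for this example; the skip_lines values and the file name are illustrative assumptions, and super_cleaner is assumed to come from gutenberg_cleaner.

# Illustrative only: skip_lines content and the file name are made up.
from gutenberg_cleaner import super_cleaner

skip_lines = ['produced by', 'end of the project gutenberg ebook']

with open('pg11.txt', 'rb') as fh:
    raw = fh.read()

cleaned = remove_header_footer(11, raw)
print(cleaned.decode('utf-8')[:200])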
Example #3
def random_gutenberg_document(language_filter='en') -> str:
    """Downloads a random document (book, etc.) from Project Gutenberg and returns it as a stirng.

    Keyword arguments:
        language_filter (str) -- restrict the random document to a particular language (default: English)
    """
    doc_language = None
    document = ''
    while (language_filter and doc_language != language_filter) or not document:
        # Keep grabbing random documents until one matches the language filter,
        # if specified, and actually contains text
        document_id = random.randint(
            1, 60134)  # Pick book at random (max id is currently 60134)
        lang_metadata = get_metadata('language', document_id)
        doc_language = next(
            iter(lang_metadata)) if len(lang_metadata) else False
        document = super_cleaner(strip_headers(
            load_etext(document_id).strip()),
                                 mark_deletions=False)
    return document
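A sketch of the imports this example relies on plus a call; it assumes the gutenberg package's metadata cache has already been populated, which get_metadata requires.

# Sketch only: assumes the gutenberg metadata cache has already been built.
import random

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from gutenberg.query import get_metadata
from gutenberg_cleaner import super_cleaner

book = random_gutenberg_document(language_filter='en')
print(book[:200])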
Example #4
    # Get list of books and their links
    df = pd.read_csv(config.PATH_BOOK_LIST)

    # Set up vectors to store paragraphs (data) and labels (authors)
    X = []
    y = []
    stop_words = set(stopwords.words("english"))
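    # Assumed context: stopwords is nltk.corpus.stopwords; pd, urllib, re,
    # BeautifulSoup, config and gclean (presumably gutenberg_cleaner) are
    # imported earlier in the file, outside this snippet.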

    for _, row in df.iterrows():
        label = row.get("author")
        target_url = row.get("link")

        # Get text
        soup_html = urllib.request.urlopen(target_url).read()
        text = str(BeautifulSoup(soup_html, features="lxml"))
        text_clean = gclean.super_cleaner(text)

        # Separate into paragraphs
        paragraphs = text_clean.split("\n\n")
        paragraphs = [
            item for item in paragraphs if item not in config.REMOVE_WORDS
        ]
        paragraphs = [
            par for par in paragraphs
            if re.search(config.CHAPTER_REGEX, par) is None
        ]

        # Clean paragraphs
        paragraphs = [clean_paragraphs(par) for par in paragraphs]
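        # Note: clean_paragraphs() is defined elsewhere in this project and is
        # not shown here. As a purely hypothetical illustration of the kind of
        # per-paragraph cleanup the stop_words set above suggests, it might
        # lowercase the text, keep only word tokens, and drop stop words, e.g.:
        #   tokens = re.findall(r"[a-z']+", paragraph.lower())
        #   return " ".join(t for t in tokens if t not in stop_words)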

        # Append to data