Example #1
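    # Fragment of a larger entry point; assumes the following are in scope:
    #   import sys
    #   from os.path import isfile
    #   from datetime import datetime
    #   plus a Markdown parser class, the parsed argparse namespace `args`,
    #   and a class `c` of ANSI color codes (sketched after the example)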
    # Make sure file exists
    if (not isfile(args.input_file)):
        print(f"{c.FAIL}Error:{c.ENDC} Input file does not exist.")
        sys.exit(1)

    # Record start time
    t1 = datetime.now()

    # Otherwise, process the input file
    print(f"Processing {c.UNDERLINE}{args.input_file}{c.ENDC} ... ")

    # Set document variables
    word_count, sentence_count, paragraph_count = 0, 0, 0
    overused_phrase_count, repeated_word_count, avoid_word_count = 0, 0, 0
    complex_words, syllable_count = 0, 0
    fog_index, reading_ease, grade_level = 0, 0, 0

    # Instantiate Markdown parser
    md = Markdown()

    # Read the template
    with open("./assets/template.html", "r") as fd:
        template = fd.read().split("<!-- DIVIDER -->")

    # Open the output file, defaulting to ./index.html
    if (args.output_file is None):
        args.output_file = "./index.html"
    # Truncate any existing output, then reopen it in append mode
    open(args.output_file, "w").close()
    outfile = open(args.output_file, "a")

    # Process file
    infile = open(args.input_file, "r")
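
The fragment never defines the color helper `c` it prints with. A minimal
sketch, assuming the conventional ANSI escape-code holder (only the attribute
names FAIL, UNDERLINE, and ENDC come from the excerpt itself):

class c:
    FAIL = "\033[91m"       # bright red, used for the error message
    UNDERLINE = "\033[4m"   # underline, used for the file name
    ENDC = "\033[0m"        # reset all terminal attributes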
    
Example #2
import re
import datetime

# Assumes Markdown(), SyllableCount(), and the word lists `overlap`,
# `exclude`, and `be_verbs` are defined elsewhere in the source module.
def GenFile(iname):
    # Instantiate document statistics
    #   fk_wc is a special word count for the Flesch-Kincaid readability test
    #   word_count is a by-paragraph word count
    #   total_sentences is a count of all sentences in document
    #   total_word_count is the word count for the entire document
    #   total_overused_words is the count of overused words
    #   total_repeated_words is the count of unique repeated words in the document,
    #       not including the individual repeats
    #   total_avoid_words is the count of words to avoid in the document
    #   complex_words is a running count of words with over three syllables
    #   syllable_count is a running count of syllables in the document
    fk_wc = 0
    word_count = []
    total_sentences = 0
    total_word_count = 0
    total_overused_words = 0
    total_repeated_words = 0
    total_avoid_words = 0
    complex_words = 0
    syllable_count = 0

    # Open the template file, read its contents, and split them for easy access later
    with open("template.html", "r") as template_fd:
        template = template_fd.read().split("<!--Divider-->")

    # Clear the output file, then reopen it for appending and write the
    # opening HTML tags
    open("index.html", "w").close()
    o_fd = open("index.html", "a")
    o_fd.write(template[0])

    # Open the source file
    fd = open(iname, "r")

    # Read the title from the source file
    title = fd.readline().strip()
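    # If the title is a Markdown link, rewrite it as a "linkpost" anchor.
    # The [:-3] slice assumes three trailing characters after the URL,
    # e.g. a closing ") #" in a line like "# [Title](https://example.com) #".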
    if (title.startswith("#")):
        title = title.split("](")
        title = "<h2 class='linkpost'><a href=\"" + title[
            1][:-3] + "\">" + title[0][3:] + "</a></h2>"
    else:
        title = "<h2 class='original'>" + title + "</h2>\n"

    # Skip the setext-style title underline (====); the blank line after it
    # is skipped by the empty-line check in the main loop below
    fd.readline()

    # Write the opening <article> tag and article title
    o_fd.write("<article>\n")
    o_fd.write(title)

    block = False
    # Iterate over each line in the file
    for line in iter(fd.readline, ""):
        # If we're looking at an empty line, just skip over it; else continue
        if (len(line.strip()) == 0):
            continue

        # Do not collect stats on code snippets. Track whether we are inside
        # a <pre> block, write the line to the file, and move on.
        if (line[0:4] == "<pre" or block):
            block = (line.find("</pre>") == -1)
            o_fd.write(Markdown(line, "https://zacs.site/") + "\n")
            continue

        # Do not collect stats on images. Write them to the file and move on.
        if (line[0:2] == "!["):
            o_fd.write(Markdown(line, "https://zacs.site/") + "\n")
            continue

        # Words are separated by spaces, so spaces + 1 approximates the
        # line's word count for the Flesch-Kincaid tests
        fk_wc += line.count(" ") + 1

        # Save a "backup" of the line, for searching a sanitized version of it
        backup = line

        # Instantiate paragraph-specific statistics
        wc = 0  # Word count for current paragraph
        overused_words = 0  # Number of overused words
        repeated_words = 0  # Number of repeated words
        avoid_words = 0  # Number of words to avoid
        dict_count = {}  # A dictionary that will count occurrences of each word

        # For each word in the list of overused words to avoid, search the
        # paragraph case insensitively. If a match is found, increment the count
        # of overused words in the paragraph and document, then highlight it.
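        # e.g. for word = "very", the pattern "[^\w]very[^\w]" matches
        # " very " in "a very long day" but not the "very" inside "every",
        # since both neighbors must be non-word characters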
        for word in overlap:
            m = re.search(r"[^\w]" + word + r"[^\w]", line, re.IGNORECASE)
            if (m):
                hits = line.lower().count(word)
                overused_words += hits
                total_overused_words += hits

                # The first replace will capture matches with uppercase letters
                # that start a sentence, or regular lowercase words; if the first
                # replace targeted matches with uppercase letters that start a
                # sentence, the second replace will capture all other occurrences of
                # that word that may exist throughout the document.
                line = line.replace(
                    m.group(0),
                    " <span class='replace'>" + m.group(0) + "</span> ")
                if (m.group(0) != m.group(0).lower()):
                    line = line.replace(
                        m.group(0).lower(), " <span class='replace'>" +
                        m.group(0).lower().strip() + "</span> ")

        # For each word in the sentence, count repetitions. If there are three or more
        # of the same word in a sentence, highlight all occurrences. Also check for be
        # verbs, and highlight them accordingly.
        for word in re.split(r"(\s|--)", backup):

            if ("](" in word):
                word = word.split("](")[0]

            # This strips any special characters from the word, such as punctuation.
            stripped = re.sub(r"^[\W]+", "", word.strip())
            stripped = re.sub(r"[\W]+$", "", stripped)

            if (len(stripped) == 0):
                continue

            wc += 1

            # First check if we have decided to exclude the word, as in the case of "the",
            # "of", "a", "for", or similar words. If true, skip the word; else, proceed.
            if (stripped.lower() not in exclude):
                # If the word already exists in the dictionary, increment its count; else
                # instantiate it to 1
                if (stripped.lower() in dict_count):
                    dict_count[stripped.lower()] += 1
                else:
                    dict_count[stripped.lower()] = 1

                # Once there are at least three occurrences of a word in the paragraph,
                # highlight it as a repeat word and increment the number of unique words
                # repeated in the document.
                if (dict_count[stripped.lower()] == 3):
                    line = re.sub(
                        r"([^\w])" + stripped + r"([^\w])",
                        r"\1<span class='repeat " + stripped + "'>" +
                        stripped + r"</span>\2", line)
                    repeated_words += 1
                    total_repeated_words += 1

            # Check for be verbs and "-ly" adverbs. If found, highlight
            # them and increment the avoid-word count.
            if (stripped.lower() in be_verbs) or (stripped.lower()[-2:]
                                                  == "ly"):
                line = re.sub(
                    r"([^\w])" + stripped + r"([^\w])",
                    r"\1<span class='avoid'>" + stripped + r"</span>\2", line)
                avoid_words += 1
                total_avoid_words += 1

            # To count complex words, first exclude proper nouns (capitalized
            # words), then exclude compound (hyphenated) words. Finally, if
            # SyllableCount reports three or more syllables, the word is complex.
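            # e.g. "deliberate" (three-plus syllables) would be flagged as
            # complex, while "Boston" (capitalized) and "well-known"
            # (hyphenated) are skipped outright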
            if (not (re.search("^[A-Z]", stripped))):
                if ("-" not in stripped):
                    if (SyllableCount(stripped.lower()) >= 3):
                        start = line.find(word)
                        length = len(word)
                        end = start + length

                        # print("Searched: '%s'" % line[start:end])
                        # print("With ends: '%s'" % line[start-1:end+1])
                        # print("Preceeding character: '%s'" % line[start-1])
                        # print("After character: '%s'" % line[end])
                        # print(re.match("[\>\w]", line[start-1]))
                        # print(re.match("[\<\w]", line[end]))
                        # print(line)
                        # print
                        if not ("http" in stripped
                                or re.match("[\>\w]", line[start - 1])
                                or re.match("[\<\w]", line[end])):
                            line = line.replace(
                                stripped, "<span class='complex_word'>" +
                                stripped + "</span>")
                        # line = re.sub((r"[^\>\w]")+stripped+(r"[^\<\w]"), "\1<span class='complex_word'>"+stripped+"</span>\2", line)
                        complex_words += 1
                        # sleep(1)

            syllable_count += SyllableCount(stripped.lower())

        word_count.append(wc)

        # Count sentences in paragraph, and add that number to the running sentence total
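        # e.g. "It works. Really? Yes!" yields one "." followed by a
        # non-word character, plus two [?!] marks, for 3 sentences; the
        # trailing "or 1" keeps a fragment-only paragraph from counting as 0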
        sentences = (len(re.findall(r"\.[^\w]", line)) +
                     len(re.findall(r"[?!]", line))) or 1
        total_sentences += sentences

        # List items and headings do not get a stats div; everything else
        # gets a per-paragraph summary before the rendered line
        if (line[0:2] != "* " and line[0] != "#"):
            # Write the paragraph stats div to the output file, then the parsed line.
            o_fd.write(
                "<div class='floating_stats'><div>Words: %d. Sentences: %d</div><div>Overused phrase: %d</div><div>Repeated: %d; Avoid: %d</div></div>\n"
                % (word_count[-1], sentences, overused_words, repeated_words,
                   avoid_words))
        o_fd.write(Markdown(line, "https://zacs.site/") + "\n")

    # Close the source file
    fd.close()

    # Write closing <article> tag
    o_fd.write("</article>")

    # Sum the paragraph word counts into a single count, for document stats
    total_word_count = sum(word_count)

    # Get a zero-padded timestamp for the document stats
    d = datetime.datetime.now()
    utime = d.strftime("%Y-%m-%d %H:%M:%S")

    # Calculate Gunning Fog Index
    # estimates the years of formal education needed to understand the text on a first reading.
    gfi = 0.4 * (float(total_word_count) / float(total_sentences) +
                 100.0 * float(complex_words) / float(total_word_count))
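    # e.g. a 300-word document with 15 sentences and 12 complex words scores
    # 0.4 * (300/15 + 100 * 12/300) = 0.4 * (20 + 4) = 9.6, roughly a
    # ninth-grade reading level on a first pass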

    # Calculate Flesch-Kincaid Readability Test
    # higher scores indicate material that is easier to read; lower numbers indicate difficulty.
    fkr = 206.835 - 1.015 * (float(fk_wc) / float(total_sentences)) - 84.6 * (
        float(syllable_count) / float(fk_wc))
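    # e.g. with fk_wc = 300, 15 sentences, and 450 syllables:
    # 206.835 - 1.015 * (300/15) - 84.6 * (450/300) = 206.835 - 20.3 - 126.9
    # = 59.64, which lands in the "tough" band below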

    if (fkr <= 30.0):
        fkr = "<span class='extreme'>%3.2f</span>" % (fkr)
    elif (fkr <= 50.0):
        fkr = "<span class='hard'>%3.2f</span>" % (fkr)
    elif (fkr <= 60.0):
        fkr = "<span class='tough'>%3.2f</span>" % (fkr)
    elif (fkr <= 70.0):
        fkr = "<span class='plain'>%3.2f</span>" % (fkr)
    elif (fkr <= 80.0):
        fkr = "<span class='fair'>%3.2f</span>" % (fkr)
    elif (fkr <= 90.0):
        fkr = "<span class='easy'>%3.2f</span>" % (fkr)
    elif (fkr <= 100.00):
        fkr = "<span class='simple'>%3.2f</span>" % (fkr)

    # Calculate the Flesch-Kincaid Grade level:
    # the number of years of education generally required to understand this text.
    fgl = 0.39 * float(total_word_count) / float(
        total_sentences) + 11.8 * float(syllable_count) / float(
            total_word_count) - 15.59
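    # e.g. with the same sample numbers: 0.39 * (300/15) + 11.8 * (450/300)
    # - 15.59 = 7.8 + 17.7 - 15.59 = 9.91, consistent with the fog index above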

    # Write the closing HTML to the output file, with document stats. Close it.
    o_fd.write(template[1] %
               (utime, utime, total_word_count,
                "%.1f mins" % (total_word_count / 200.0), total_sentences,
                len(word_count), total_word_count // len(word_count),
                total_overused_words, total_repeated_words, total_avoid_words,
                gfi, fkr, fgl))
    o_fd.close()
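
Assuming the module-level helpers sketched above are in scope, regenerating
index.html is a single call (the file name here is only illustrative):

GenFile("post.md")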