Esempio n. 1
0
def analyze_book(path, bookn):
    """Parse the text at ``path`` and bundle its statistics into a ``Book``.

    Args:
        path: Location handed to ``files.parse`` to obtain the text lines.
        bookn: Identifier forwarded unchanged as the first ``Book`` argument.

    Returns:
        A ``Book`` built from ``bookn``, the result of
        ``vocab_distribution`` over the words, the first value returned by
        ``avg_sentence_len``, the result of ``count_func_words`` over the
        sentences, and the total number of words.
    """
    raw_lines = files.parse(path)
    sentence_list = create_sentences(raw_lines)
    word_list = split_words(raw_lines)
    function_word_stats = count_func_words(sentence_list)

    # avg_sentence_len yields a pair; the second item is not used here.
    mean_sentence_len, _comma_ct = avg_sentence_len(sentence_list)
    vocabulary = vocab_distribution(word_list)
    total_words = len(word_list)
    return Book(
        bookn,
        vocabulary,
        mean_sentence_len,
        function_word_stats,
        total_words,
    )
Esempio n. 2
0
#Common words: https://github.com/first20hours/google-10000-english

import re, os, pickle, math, sys, file_utils as files
import numpy as np

# Word list read via file_utils.parse when this module's top level runs.
# NOTE(review): presumably the google-10000-english list referenced in the
# header comment; how the relative path is resolved depends on
# file_utils.parse -- confirm.
common_words = files.parse("10k_words.txt")
# Lower-cased English function words (articles, pronouns, auxiliaries,
# numerals, ...), one token per list entry.
#
# The literal spans several lines, so it must be tokenised with the
# no-argument form of str.split(), which splits on any whitespace run.
# Splitting on " " alone leaves the newlines in place and fuses the last
# word of each row with the first word of the next (e.g. "at\nback"), so
# those words would never match anything.
#
# NOTE(review): a few entries (e.g. THAT, NEAR) are listed twice; they are
# kept as-is here because consumers of this list are not visible -- confirm
# whether duplicates should be dropped.
func_words = """A ABOUT ABOVE AFTER AGAIN AGO ALL ALMOST ALONG ALREADY ALSO ALTHOUGH ALWAYS AM AMONG AN AND ANOTHER ANY ANYBODY ANYTHING ANYWHERE ARE AREN'T AROUND AS AT
BACK ELSE BE BEEN BEFORE BEING BELOW BENEATH BESIDE BETWEEN BEYOND BILLION BILLIONTH BOTH EACH BUT BY
CAN CAN'T COULD COULDN'T
DID DIDN'T DO DOES DOESN'T DOING DONE DON'T DOWN DURING
EIGHT EIGHTEEN EIGHTEENTH EIGHTH EIGHTIETH EIGHTY EITHER ELEVEN ELEVENTH ENOUGH EVEN EVER EVERY EVERYBODY EVERYONE EVERYTHING EVERYWHERE EXCEPT
FAR FEW FEWER FIFTEEN FIFTEENTH FIFTH FIFTIETH FIFTY FIRST FIVE FOR FORTIETH FORTY FOUR FOURTEEN FOURTEENTH FOURTH HUNDRED FROM
GET GETS GETTING GOT
HAD HADN'T HAS HASN'T HAVE HAVEN'T HAVING HE HE'D HE'LL HENCE HER HERE HERS HERSELF HE'S HIM HIMSELF HIS HITHER HOW HOWEVER NEAR HUNDREDTH
I I'D IF I'LL I'M IN INTO IS I'VE ISN'T IT ITS IT'S ITSELF
JUST
LAST LESS
MANY ME MAY MIGHT MILLION MILLIONTH MINE MORE MOST MUCH MUST MUSTN'T MY MYSELF
NEAR NEARBY NEARLY NEITHER NEVER NEXT NINE NINETEEN NINETEENTH NINETIETH NINETY NINTH NO NOBODY NONE NOONE NOTHING NOR NOT NOW NOWHERE
OF OFF OFTEN ON OR ONCE ONE ONLY OTHER OTHERS OUGHT OUGHTN'T OUR OURS OURSELVES OUT OVER
QUITE
RATHER ROUND
SECOND SEVEN SEVENTEEN SEVENTEENTH SEVENTH SEVENTIETH SEVENTY SHALL SHAN'T SHE'D SHE SHE'LL SHE'S SHOULD SHOULDN'T SINCE SIX SIXTEEN SIXTEENTH SIXTH SIXTIETH SIXTY SO SOME SOMEBODY SOMEONE SOMETHING SOMETIMES SOMEWHERE SOON STILL SUCH
TEN TENTH THAN THAT THAT THAT'S THE THEIR THEIRS THEM THEMSELVES THESE THEN THENCE THERE THEREFORE THEY THEY'D THEY'LL THEY'RE THIRD THIRTEEN THIRTEENTH THIRTIETH THIRTY THIS THITHER THOSE THOUGH THOUSAND THOUSANDTH THREE THRICE THROUGH THUS TILL TO TOWARDS TODAY TOMORROW TOO TWELFTH TWELVE TWENTIETH TWENTY TWICE TWO
UNDER UNDERNEATH UNLESS UNTIL UP US
VERY
WHEN WAS WASN'T WE WE'D WE'LL WERE WE'RE WEREN'T WE'VE WHAT WHENCE WHERE WHEREAS WHICH WHILE WHITHER WHO WHOM WHOSE WHY WILL WITH WITHIN WITHOUT WON'T WOULD WOULDN'T
YES YESTERDAY YET YOU YOUR YOU'D YOU'LL YOU'RE YOURS YOURSELF YOURSELVES YOU'VE""".lower().split()

# NOTE(review): the __WT suffix suggests these are per-feature weights; both
# are currently 0 and their consumers are not visible here -- confirm how
# they are used before changing them.
UNIQUE_PER_WORD__WT = 0
AVG_SENTENCE_LEN__WT = 0