def syllables_count(word):
    """
    Textstat is a Python package for calculating statistics from a text to determine
    its readability, complexity, and grade level.
    Package can be found at https://pypi.python.org/pypi/textstat
    """
    return textstatistics().syllable_count(word)
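For reference, a minimal self-contained sketch of how this wrapper is typically used; the import path assumes the older textstat API that exposes a textstatistics class (newer releases expose module-level functions such as textstat.syllable_count instead):

# Minimal usage sketch (assumes the older textstat API with a textstatistics class).
from textstat.textstat import textstatistics

def syllables_count(word):
    return textstatistics().syllable_count(word)

print(syllables_count("readability"))  # prints the estimated syllable count, e.g. 5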
Example n. 2
def calcFeatures(params):
    index, rev = params  # unpack the (index, revision) pair passed in by the multiprocessing pool
    global rev_xl
    filename = "insert data path of the 2015 data from https://figshare.com/articles/English_Wikipedia_Quality_Asssessment_Dataset/1375406" + str(
        rev['revid'])
    if (os.path.exists(filename)):
        print(rev['revid'])
        text = util.read_file(filename)
        text = util.cleanhtml(text)
        text = text.replace('\'\'\'', '')
        assert rev['pageid'] == rev_xl.iloc[index, 0]
        print("matched ", rev['revid'])

        calc = readcalc.ReadCalc(text)
        textual_score = list(calc.get_all_metrics())

        text_stat = textstatistics()
        linsear_write_formula = round(text_stat.linsear_write_formula(text), 2)
        textual_score.append(linsear_write_formula)

        grammar_score = len(tool.check(text))  # number of issues flagged by the grammar checker
        textual_score.append(grammar_score)

        rev_xl.iloc[index, 14:36] = textual_score  # write the textual features back into the dataframe row

        print(rev_xl.iloc[index, :])

        # checkpoint the dataframe to disk every 10 rows
        if index % 10 == 0:
            rev_xl.to_csv(path)
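A hedged sketch of how calcFeatures might be driven, given the multiprocessing hint in its first line; rev_list is a hypothetical iterable of revision dicts and the pool size is illustrative:

# Illustrative driver only: map calcFeatures over (index, revision) pairs.
from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(processes=4) as pool:
        pool.map(calcFeatures, list(enumerate(rev_list)))  # rev_list is hypothetical

Note that each worker process receives its own copy of the global rev_xl dataframe, so in-memory updates do not flow back to the parent; the periodic rev_xl.to_csv(path) call inside calcFeatures is the only place results reach disk.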
Example n. 3
def syllables(text):
    """

    :param text:
    :return:
    """
    num_of_syllables = 0
    stripped = text.replace("[", "").replace("]", "").replace("'", "")
    splits = stripped.split(',')
    for word in splits:
        num_of_syllables += textstatistics().syllable_count(word)
    return num_of_syllables
Example n. 4
def flesch_reading_ease(text): 
  #Reading Ease score = 206.835 - (1.015 × average sentence length) - (84.6 × average syllables per word)

  words_count, sentences_count, _, _ = get_param(text)
  #calculate average sentence length
  avg_sentence_length = float(words_count/sentences_count)
  syllable_count = textstatistics().syllable_count(text)
  #calculate average syllables per word
  avg_syllables_per_word = float(syllable_count) / float(words_count)

  FRE = 206.835 - float(1.015 * avg_sentence_length) - float(84.6 * avg_syllables_per_word) 

  return legacy_round(FRE, 2) 
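A quick arithmetic check of the formula above with assumed sample counts (100 words, 5 sentences, 150 syllables; these numbers are illustrative, not from the original data):

# Worked example of the Flesch Reading Ease formula with assumed counts.
words_count, sentences_count, syllable_count = 100, 5, 150
avg_sentence_length = words_count / sentences_count        # 20.0
avg_syllables_per_word = syllable_count / words_count      # 1.5
FRE = 206.835 - 1.015 * avg_sentence_length - 84.6 * avg_syllables_per_word
print(round(FRE, 2))  # roughly 59.6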
Example n. 5
def complex_words(text):
    words = []
    sentences = break_sentences(text)
    for sentence in sentences:
        words += [token for token in sentence]

    diff_words_set = set()

    for word in words:
        if word not in easy_word_set and textstatistics().syllable_count(str(word)) >= 2:
            diff_words_set.add(word)

    return len(diff_words_set)
Example n. 6
def difficulty(text):
    """

    :param text:
    :return:
    """
    difficulties = []
    stripped = text.replace("[", "").replace("]", "").replace("'", "")
    splits = stripped.split(',')
    for word in splits:
        if word not in easy_word_set and textstatistics().syllable_count(word) > 2:
            difficulties.append(word)
    return difficulties
Example n. 7
def poly_syllable_count(text):
    """

    :param text:
    :return:
    """
    count = 0
    stripped = text.replace("[", "").replace("]", "").replace("'", "")
    splits = stripped.split(',')
    for word in splits:
        syllable_count = textstatistics().syllable_count(word)
        if syllable_count >= 3:
            count += 1
    return count
Example n. 8
 def c_score(self, text):
     nlp = spacy.load('en')
     doc = nlp(text)
     sentences = [sent for sent in doc.sents]
     words = 0
     for sentence in sentences:
         words += len([token for token in sentence])
     num_sent = len(sentences)
     sent_len = float(words / num_sent)
     sylls = textstatistics().syllable_count(text)
     ASPW = float(sylls) / float(words)
     syls_p_wd = legacy_round(ASPW, 1)
     FRE = 206.835 - float(1.015 * sent_len) - float(84.6 * syls_p_wd)
     score = legacy_round(FRE, 2)
     return words, score
Example n. 9
def text_analysis(text): 

  #Use spacy lib for tokenization
  nlp = spacy.load('en_core_web_sm') 
  doc = nlp(text)
  sentences = doc.sents

  #create 3 counters
  wordsNo = 0
  sentencesNo = 0
  chars = 0

  words = []
  for sentence in sentences: 
    #Count all sentences
    sentencesNo += 1
    for token in sentence:
      if token.dep_ != 'punct':
        #Count all words without punctuation
        wordsNo += 1 
        #create a list of words
        words.append(str(token))

  if sentencesNo>0:
    #calculate average sentence length
    average_sentence_length = float(wordsNo / sentencesNo)
  else:
    average_sentence_length = 0

  #get number of syllables in the text
  syllable_count = textstatistics().syllable_count(text)

  #calculate the number of characters
  for word in words:
    chars += len(word)

  if wordsNo>0:
    #calculate average word length
    average_word_length = float(chars / wordsNo)
  else:
    average_word_length = 0

  return wordsNo, sentencesNo, average_sentence_length, syllable_count, average_word_length
Example n. 10
def get_param(text):

    #Use spacy lib for tokenization
    nlp = spacy.load('en')
    doc = nlp(text)
    sentences = doc.sents

    #create 3 counters
    wordsNo = 0
    sentencesNo = 0
    poly_syllable_count = 0

    #Create an empty list for words
    words = []
    for sentence in sentences:
        #Count all words
        wordsNo += len([token for token in sentence])
        #Count all sentences
        sentencesNo += 1
        #create a list of words
        words += [str(token) for token in sentence]

    #Create a difficult words set to contain difficult words
    diff_words = set()
    #Load easy word set (decode the bytes from the package resource so they compare
    #equal to the str tokens below)
    easy_word = set([
        ln.decode('utf-8').strip() for ln in pkg_resources.resource_stream(
            'textstat', '/resources/en/easy_words.txt')
    ])
    #Loop on all words
    for word in words:
        #Get syllable count
        syllable_count = textstatistics().syllable_count(word)
        #a word counts as polysyllabic when it has three or more syllables
        if syllable_count >= 3:
            poly_syllable_count += 1
        #a word is difficult when it is not in the easy word set and has two or more syllables
        if word not in easy_word and syllable_count >= 2:
            diff_words.add(word)

    #Analyse the text and return the parameters used by the readability formulas
    return wordsNo, sentencesNo, len(diff_words), poly_syllable_count
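A hedged usage sketch showing how get_param feeds the readability formulas above (for instance flesch_reading_ease in Example n. 4); the sample sentence is purely illustrative:

# Illustrative usage of get_param; the sample text is an assumption.
sample = "The quick brown fox jumps over the lazy dog. It was a sunny day."
words_no, sentences_no, difficult_words, poly_syllables = get_param(sample)
print(words_no, sentences_no, difficult_words, poly_syllables)
print(flesch_reading_ease(sample))  # uses the first two counts internally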
Example n. 11
def flesch_kincaid_grade(word):
    return textstatistics().flesch_kincaid_grade(word)
Example n. 12
def linsear_write_score(text):
    return textstatistics().linsear_write_formula(text)
Example n. 13
def flesch_grade_score(text):
    return textstatistics().flesch_kincaid_grade(text)
Example n. 14
 def __init__(self):
     """
     A local object of the textstat package is created before calling the analyse function
     """
     super().__init__()
     self.text_statistics = TS.textstatistics()
Example n. 15
 def __init__(self):
     """
     Create a textstat object for use by the analyse function
     """
     self.text_statistics = TS.textstatistics()
Example n. 16
def num_syllables(word):
    return textstatistics().syllable_count(word)
Example n. 17
 def __init__(self):
     super().__init__()
     self.ts = TS.textstatistics()
Example n. 18
def dale_chall_readability_score(word):
    return textstatistics().dale_chall_readability_score(word)
Example n. 19
def lexicon_count(word):
    return textstatistics().lexicon_count(word)
Example n. 20
def flesch_ease_score(text):
    return textstatistics().flesch_reading_ease(text)
Example n. 21
def coleman_liau_index(word):
    return textstatistics().coleman_liau_index(word)
Example n. 22
def syllables_count(word):
    return textstatistics().syllable_count(word, lang='en_US')
Example n. 23
    # Calculate the total number of sentences
    docReader = nltk.corpus.PlaintextCorpusReader('./', artist + '.txt')
    sentences = len(docReader.sents())

    # Calculate the total number of difficult words
    diff_words_count = textstat.difficult_words(raw_text)

    # Calculate readability-- Gunning Fog
    dif_words = (diff_words_count / ttl_words * 100)
    gf_read = 0.4 * (float(ttl_words / sentences) + dif_words)

    # Calculate readability-- SMOG
    poly_syl = 0
    for word in words:
        syl_count = textstatistics().syllable_count(word)
        if syl_count >= 3:
            poly_syl += 1
    SMOG = (1.043 * (30 * (poly_syl / sentences))**0.5) + 3.1291
    smog_read = legacy_round(SMOG, 1)

    # Calculate readability-- Coleman-Liau index
    cl_read = textstat.coleman_liau_index(raw_text)

    df.loc[i] = (artist, 0, ttl_words, sentences, 0, len(set(words)),
                 round(100 - (len(lyrics_no_sw) * 100.0 / ttl_words),
                       2), diff_words_count, gf_read, smog_read, cl_read)
    i += 1

df['songs'] = [304, 224]
df['words_per_song'] = df['words'] / df['songs']
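A quick arithmetic check of the SMOG expression used above, with assumed counts (30 polysyllabic words over 10 sentences; both numbers are illustrative):

# Worked example of the SMOG formula from the snippet, with assumed counts.
poly_syl, sentences = 30, 10
SMOG = (1.043 * (30 * (poly_syl / sentences)) ** 0.5) + 3.1291
print(round(SMOG, 1))  # roughly 13.0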
Example n. 24
def linsear_write_formula(word):
    return textstatistics().linsear_write_formula(word)
Example n. 25
def text_param(text):

  #Use spacy lib for tokenization 
  nlp = spacy.load('en_core_web_sm') 
  doc = nlp(text)
  sentences = doc.sents

  #create 5 counters
  wordsNo = 0
  sentencesNo = 0
  poly_syllable_count = 0
  long_word = 0
  chars = 0

  #Create an empty list for words
  words = []
  for sentence in sentences: 
    #Count all sentences
    sentencesNo += 1
    for token in sentence:
      #Count all words
      wordsNo += 1
      words.append(str(token))

  #Create a difficult words set to contain difficult words
  diff_words = set() 
  #Load easy word set (decode the bytes from the package resource so they compare
  #equal to the str tokens below)
  easy_word = set([ln.decode('utf-8').strip() for ln in pkg_resources.resource_stream('textstat', '/resources/en/easy_words.txt')])
  #Loop over all words
  for word in words: 
    #Get syllable count of word
    syllable_count = textstatistics().syllable_count(word) 
    #a word counts as polysyllabic when it has three or more syllables
    if syllable_count >= 3:
      poly_syllable_count += 1
    #Long word is when its length is greater than 7
    if len(word) > 7:
      long_word += 1
    #Count no of characters
    chars += len(word)

  #Get syllable count of whole text
  syllable_count = textstatistics().syllable_count(text)
  #Get lexical count of whole text
  lexical_counts = textstat.lexicon_count(text, removepunct=True)
  #calculate average sentence length
  average_sentence_length = float(wordsNo / sentencesNo)
  #calculate average syllables per word
  average_syllables_per_words = float(syllable_count / wordsNo)
  #calculate average polysyllables per word
  average_poly_syllable = float(poly_syllable_count / wordsNo)
  #calculate average long words per word
  average_long_word = float(long_word / wordsNo)
  #calculate average word length
  average_word_length = float(chars / wordsNo)
  
  #return a list with text parameters used in readability equations as features
  return [wordsNo, sentencesNo, average_sentence_length, syllable_count,
          average_syllables_per_words, poly_syllable_count, lexical_counts,
          average_poly_syllable, long_word, average_long_word, average_word_length]
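A hedged sketch of how the text_param feature vector might be collected for several documents; the documents list and feature names here are assumptions for illustration only:

# Illustrative feature-matrix construction; `documents` is a hypothetical list of strings.
feature_names = ["words", "sentences", "avg_sentence_len", "syllables",
                 "avg_syllables_per_word", "poly_syllables", "lexicon_count",
                 "avg_poly_syllables", "long_words", "avg_long_words", "avg_word_len"]
documents = ["First sample document.", "Second, slightly longer sample document."]
features = [text_param(doc) for doc in documents]  # one 11-value row per document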
Example n. 26
def syllables_count(dummy):

    return textstatistics().syllable_count(dummy)
Example n. 27
 def load():
     TextStats.ts = textstatistics()
     TextStats.es = easy_word_set
Example n. 28
def smog_index(word):
    return textstatistics().smog_index(word)
Example n. 29
def syllables_count(word):
    return textstatistics().syllable_count(word)
Example n. 30
def automated_readability_index(word):
    return textstatistics().automated_readability_index(word)