Beispiel #1
0
    def readability_analysis(self, text):
        """Compute readability metrics for *text*.

        Returns a dict containing the Flesch reading ease (clamped to the
        nominal 0-100 range), syllable statistics, a grade/score pair from
        ``readability_grade``, and the list of words with five or more
        syllables (punctuation stripped).
        """
        # Syllable count per punctuation-stripped word.
        syllables = {}
        for token in text.split():
            cleaned = "".join(ch for ch in token if ch not in punctuation)
            syllables[cleaned] = textstat.syllable_count(cleaned)
        hard_words = [word for word, count in syllables.items() if count >= 5]

        # Clamp the ease score into its nominal 0-100 scale.
        ease = textstat.flesch_reading_ease(text)
        ease = min(max(ease, 0), 100)

        readability = {
            "flesch_reading_ease": ease,
            "avg_syllables_per_word": textstat.avg_syllables_per_word(text),
            "syllable_count": textstat.syllable_count(text),
            "avg_letter_per_word": textstat.avg_letter_per_word(text),
        }

        grade, score = self.readability_grade(readability)
        readability['grade'] = grade
        readability['score'] = score
        readability['difficult_words'] = hard_words
        return readability
Beispiel #2
0
    def _calculate_scores(self, docs):
        """Return one dict of textstat readability metrics per document.

        Several additional metrics (syllables, smog, coleman_liau, linsear,
        difficult_words, gunning_fog) are intentionally disabled.
        """
        metric_fns = (
            ('chars', ts.char_count),
            ('words', ts.lexicon_count),
            ('sents', ts.sentence_count),
            ('avg_sent_length', ts.avg_sentence_length),
            ('avg_syllables_per_word', ts.avg_syllables_per_word),
            ('avg_letters_per_word', ts.avg_letter_per_word),
            ('flesch', ts.flesch_reading_ease),
            ('automated_readability', ts.automated_readability_index),
            ('dale_chall', ts.dale_chall_readability_score),
            ('lix', ts.lix),
        )
        return [{name: fn(doc) for name, fn in metric_fns}
                for doc in docs]
Beispiel #3
0
def scores_cal_ori(text):
    """Compute a bundle of textstat readability metrics for *text*.

    Returns a 14-tuple, in order: char_count (spaces ignored),
    lexicon_count (punctuation removed), syllable_count, sentence_count,
    avg_sentence_length, avg_syllables_per_word, avg_letter_per_word,
    avg_sentence_per_word, flesch_kincaid_grade, smog_index, gunning_fog,
    difficult_words, dale_chall_readability_score, polysyllabcount.
    """
    # NOTE: the original version had a second, unreachable
    # ``return smog_index_value`` after the tuple return; removed as dead
    # code.  The non-standard 14-space indentation is normalized too.
    return (
        textstat.char_count(text, ignore_spaces=True),
        textstat.lexicon_count(text, removepunct=True),
        textstat.syllable_count(text),
        textstat.sentence_count(text),
        textstat.avg_sentence_length(text),
        textstat.avg_syllables_per_word(text),
        textstat.avg_letter_per_word(text),
        textstat.avg_sentence_per_word(text),
        textstat.flesch_kincaid_grade(text),
        textstat.smog_index(text),
        textstat.gunning_fog(text),
        textstat.difficult_words(text),
        textstat.dale_chall_readability_score(text),
        textstat.polysyllabcount(text),
    )
Beispiel #4
0
def analyse_json(json_text):
    """Build a DataFrame of readability statistics per witness.

    Parses the transcript JSON, gathers every 'spoken_text' section whose
    speaker is a witness, and computes textstat metrics over each witness's
    concatenated speech.  Witnesses with no spoken text get a row with only
    the provenance columns filled in.
    """
    # consider moving this to be a feature of Transcript in the other module

    df_witnesses = pd.DataFrame(columns=['html_file_location', 'witness_name',
                                         'syllable_count', 'lexicon_count',
                                         'sentence_count',
                                         'syllables_per_word',
                                         'gunning_fog', 'smog_index',
                                         'text_standard'],
                                index=[])

    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']

        # Collect each witness's spoken sections under their entry.
        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type'] == 'witness':
                witness = witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            row = 'witness_%i' % i
            # Missing 'all_text' joins to '' and falls through to the
            # provenance-only branch, matching the original's two
            # (previously duplicated) fallback paths.
            witness_text = '\n\n'.join(witnesses[p].get('all_text', []))
            if witness_text:
                df_witnesses.loc[row] = {
                    'html_file_location': trscrpt['html_file_location'],
                    'witness_name': p,
                    'syllable_count': textstat.syllable_count(witness_text),
                    'lexicon_count': textstat.lexicon_count(witness_text),
                    'sentence_count': textstat.sentence_count(witness_text),
                    'syllables_per_word':
                        textstat.avg_syllables_per_word(witness_text),
                    'gunning_fog': textstat.gunning_fog(witness_text),
                    'smog_index': textstat.smog_index(witness_text),
                    'text_standard': textstat.text_standard(witness_text)}
            else:
                df_witnesses.loc[row, 'html_file_location'] = \
                    trscrpt['html_file_location']
                df_witnesses.loc[row, 'witness_name'] = p

    return df_witnesses
def analyzeTweet(tweetId):
    """Fetch one tweet by id, analyze it, and upsert the analysis.

    Runs TextBlob sentiment/POS analysis and textstat readability metrics
    over the tweet text, then writes the result into the processed-tweets
    collection keyed by the tweet's id.
    """
    tweet = tweets_coll.find_one({"_id": tweetId})
    print("tweet: ", tweet)
    tweetAnalysis = {"_id": tweet["id"]}
    print("\tanalyzed tweet:", tweetAnalysis)
    body = tweet["text"]
    blob = TextBlob(body)
    tweetAnalysis["tags"] = blob.tags
    tweetAnalysis["noun_phrases"] = blob.noun_phrases
    polarity, subjectivity = blob.sentiment[0], blob.sentiment[1]
    tweetAnalysis["sentiment.polarity"] = polarity
    tweetAnalysis["sentiment.subjectivity"] = subjectivity
    tweetAnalysis["flesch_kincaid"] = textstat.flesch_kincaid_grade(body)
    tweetAnalysis["average_sentence_length"] = \
        textstat.avg_sentence_length(body)
    tweetAnalysis["average_syllables_per_word"] = \
        textstat.avg_syllables_per_word(body)
    print("\ttweetAnalyis: " + json.dumps(tweetAnalysis))
    processed_tweets_coll.update_one({"_id": tweet["id"]},
                                     {'$set': tweetAnalysis},
                                     upsert=True)
Beispiel #6
0
 def stats(self, text):
     """Return a dict mapping metric names to textstat scores for *text*."""
     metric_fns = (
         ('flesch_reading_ease', textstat.flesch_reading_ease),
         ('smog', textstat.smog_index),
         ('flesch kincaid', textstat.flesch_kincaid_grade),
         ('coleman Liau', textstat.coleman_liau_index),
         ('automated', textstat.automated_readability_index),
         ('dale chall', textstat.dale_chall_readability_score),
         ('difficult', textstat.difficult_words),
         ('linsear', textstat.linsear_write_formula),
         ('gunning_fog', textstat.gunning_fog),
         ('standard', textstat.text_standard),
         ('charcount', textstat.char_count),
         ('lexicon count', textstat.lexicon_count),
         ('syllable count', textstat.syllable_count),
         ('sentence count', textstat.sentence_count),
         ('avg sentence length', textstat.avg_sentence_length),
         ('avg_syllables_per_word', textstat.avg_syllables_per_word),
         ('avg_letter_per_word', textstat.avg_letter_per_word),
         ('avg_sentence_per_word', textstat.avg_sentence_per_word),
     )
     # Evaluation order matches the original assignment order.
     return {name: fn(text) for name, fn in metric_fns}
Beispiel #7
0
    def updateData(self):
        """Recompute all sentiment and readability statistics.

        Appends eleven rows to ``self.data`` — sentiment, sentence info,
        then one row per readability index — and returns the list.  Also
        stores each raw/clamped score on ``self`` as before.
        """

        def push_scaled(stat):
            # Shared row shape for the indices graded on the 0-18 scale:
            # [INDEX 0] rounded score      [INDEX 1] approximate grade
            # [INDEX 2] normalized (ratio) score
            # (This triple was previously copy-pasted five times.)
            self.data.append([
                round(stat, 3),
                self.grade(stat),
                round(stat / 0.18, 2)
            ])

        # Full list of polarity scores
        self.polscore = self.sid.polarity_scores(self.text)

        ##### INDEX 0 IN DATA: Text Sentiment #####
        # [INDEX 0] Compounded score (0.0 - 1.0)            [INDEX 1] Negative connotation rating (0.0 - 1.0),
        # [INDEX 2] Positive connotation rating (0.0 - 1.0) [INDEX 3] Neutral connotation rating (0.0 - 1.0)
        self.data.append([
            self.polscore['compound'], self.polscore['neg'],
            self.polscore['pos'], self.polscore['neu']
        ])

        ##### INDEX 1 IN DATA: Sentence Info #####
        # [INDEX 0] Sentence count          [INDEX 1] Average sentence length
        # [INDEX 2] Syllable count          [INDEX 3] Overall word count
        # [INDEX 4] Character count         [INDEX 5] Character count without spaces
        # [INDEX 6] Avg letters per word    [INDEX 7] Avg syllables per word
        self.data.append([
            textstat.sentence_count(self.text),
            textstat.avg_sentence_length(self.text),
            textstat.syllable_count(self.text),
            len(self.splList),
            textstat.char_count(self.text, False),
            textstat.char_count(self.text, True),
            textstat.avg_letter_per_word(self.text),
            textstat.avg_syllables_per_word(self.text)
        ])

        ##### INDEX 2 IN DATA: Flesch Reading Ease (scale 0 - 100) #####
        # Uses its own grade function and normalization, so it is not
        # routed through push_scaled.
        self.freRaw = textstat.flesch_reading_ease(self.text)
        self.freStat = min(max(self.freRaw, 0), 100)
        self.data.append([
            round(self.freStat, 3),
            self.freGrade(self.freStat),
            round(abs(self.freStat - 100), 2)
        ])

        ##### INDEX 3 IN DATA: Flesch-Kincaid Grade (scale 0 - 18) #####
        self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
        self.fkgStat = self.adjustScore(self.fkgRaw)
        push_scaled(self.fkgStat)

        ##### INDEX 4 IN DATA: Gunning FOG Index (scale 0 - 18) #####
        self.fogRaw = textstat.gunning_fog(self.text)
        self.fogStat = self.adjustScore(self.fogRaw)
        push_scaled(self.fogStat)

        ##### INDEX 5 IN DATA: SMOG Index (scale 0 - 18) #####
        self.smogRaw = textstat.smog_index(self.text)
        self.smogStat = self.adjustScore(self.smogRaw)
        push_scaled(self.smogStat)

        ##### INDEX 6 IN DATA: Automated Readability Index (scale 0 - 14) #####
        # Distinct clamp range, grade function and normalization divisor.
        self.ariRaw = textstat.automated_readability_index(self.text)
        self.ariStat = min(max(self.ariRaw, 0), 14)
        self.data.append([
            round(self.ariStat, 3),
            self.ariGrade(ceil(self.ariStat)),
            round(self.ariStat / 0.14, 2)
        ])  #13

        ##### INDEX 7 IN DATA: Coleman-Liau Index (scale 0 - 18) #####
        self.cliRaw = textstat.coleman_liau_index(self.text)
        self.cliStat = self.adjustScore(self.cliRaw)
        push_scaled(self.cliStat)

        ##### INDEX 8 IN DATA: Linsear Write Index (scale 0 - 18) #####
        self.lwiRaw = textstat.linsear_write_formula(self.text)
        self.lwiStat = self.adjustScore(self.lwiRaw)
        push_scaled(self.lwiStat)

        ##### INDEX 9 IN DATA: Dale-Chall Readability Score (scale 0 - 10) #####
        self.dcrRaw = textstat.dale_chall_readability_score(self.text)
        self.dcrStat = min(max(self.dcrRaw, 0), 10)
        self.data.append([
            round(self.dcrStat, 3),
            self.daleChallGrade(self.dcrStat),
            round(self.dcrStat / 0.1, 2)
        ])

        ##### INDEX 10 IN DATA: Overall Score (scale 0 - 20) #####
        # text_standard(..., True) yields the numeric score; the plain
        # call yields the descriptive grade string.
        self.txtRaw = textstat.text_standard(self.text, True)
        self.txtStd = min(max(self.txtRaw, 0), 20)
        self.txtInfo = textstat.text_standard(self.text)
        self.data.append([
            round(self.txtStd, 3),
            self.txtGrade(self.txtStd, self.txtInfo),
            round(self.txtStd / 0.2, 2)
        ])

        return self.data
 def dutch_flesch_reading_ease(self, text):
     """Flesch reading ease with Dutch-tuned coefficients, truncated to int."""
     sentence_len = textstat.avg_sentence_length(text)
     syllables = textstat.avg_syllables_per_word(text)
     return int(206.84 - 0.33 * sentence_len - 77 * syllables)
 def french_flesch_reading_ease(self, text):
     """Flesch reading ease with French-tuned coefficients, truncated to int."""
     sentence_len = textstat.avg_sentence_length(text)
     syllables = textstat.avg_syllables_per_word(text)
     return int(209 - 1.15 * sentence_len - 68 * syllables)
 def italian_flesch_reading_ease(self, text):
     """Flesch reading ease with Italian-tuned coefficients, truncated to int."""
     sentence_len = textstat.avg_sentence_length(text)
     syllables = textstat.avg_syllables_per_word(text)
     return int(217 - 1.3 * sentence_len - 60 * syllables)
 def portuguese_flesch_reading_ease(self, text):
     """Flesch reading ease with Portuguese-tuned coefficients, truncated to int."""
     sentence_len = textstat.avg_sentence_length(text)
     syllables = textstat.avg_syllables_per_word(text)
     return int(206.84 - 1.02 * sentence_len - 60 * syllables)
Beispiel #12
0
    def __init__(self, path):
        """
        Create document instance for analysis.

        Opens and reads document to string raw_text.
        Textract interprets the document format and
        opens to plain text string (docx, pdf, odt, txt)

        Args:
            path (str): path to file to open, analyze, close

        Public attributes:
        -user: (str) optional string to set username.
        -path: (str) relative path to document.
        -abs_path: (str) the absolute path to the document.
        -file_name:  (str) the file name with extension of document (base
        name).
        -mime:  tbd
        -guessed_type:  makes best guess of mimetype of document.
        -file_type:  returns index[0] from guessed_type.
        -raw_text:  (str) plain text extracted from .txt, .odt, .pdf, .docx,
        and .doc.
        -ptext:  (str) raw text after a series of regex expressions to
        eliminate special characters.
        -text_no_feed:  (str) ptext with most new line characters eliminated
        /n/n stays intact.
        -sentence_tokens:  list of all sentences in a comma separated list
        derived by nltk.
        -sentence_count:  (int) count of sentences found in list.
        -passive_sentences:  list of passive sentences identified by the
        passive module.
        -passive_sentence_count:  count of the passive_sentences list.
        -percent_passive:  (float) ratio of passive sentences to all sentences
        in percent form.
        -be_verb_analysis:  (int) sum number of occurrences of each to be verb
        (am, is, are, was, were, be, being been).
        -be_verb_count: tbd
        -weak_sentences_all:  (int) sum of be verb analysis.
        -weak_sentences_set:  (set) set of all sentences identified as
        having to be verbs.
        -weak_sentences_count:  (int) count of items in weak_sentences_set.
        -weak_verbs_to_sentences:  (float) proportion of sentences with to
        be to all sentences in percent (this might not be sound).
        -word_tokens:  list of discreet words in text that breaks
        contractions up (default nltk tokenizer).
        -word_tokens_no_punct:  list of all words in text including
        contractions but otherwise no punctuation.
        -no_punct:  (str) full text string without sentence punctuation.
        -word_tokens_no_punct:  uses white-space tokenizer to create a list
        of all words.
        -readability_flesch_re:  (int) Flesch Reading Ease Score (numeric
        score) made by textstat module.
        -readability_smog_index:  (int) grade level as determined by the
        SMOG algorithm made by textstat module.
        -readability_flesch_kincaid_grade:  (int)  Flesch-Kincaid grade level
        of reader made by textstat module.
        -readability_coleman_liau_index:  (int) grade level of reader as made by
        textstat module.
        -readability_ari:  (int) grade level of reader determined by
        automated readability index algorithm implemented by textstat.
        -readability_linser_write:  grade level as determined
        by Linsear Write algorithm implemented by textstat.
        -readability_dale_chall:  (int) grade level based on Dale-Chall
        readability as determined by textstat.
        -readability_standard:  composite grade level based on readability
        algorithms.
        -flesch_re_key:  list for interpreting Flesch RE Score.
        -word_count:  word count of document based on white space tokenizer,
        this word count should be used.
        -page_length:  (float) page length in decimal format given 250
        words per page.
        -paper_count:  (int) number of printed pages given 250 words per
        page.
        -parts_of_speech:  words with parts of speech tags.
        -pos_counts:  values in word, tag couple grouped in a list (Counter).
        -pos_total:  (int) sum of pos_counts values
        -pos_freq:  (dict) word, ratio of whole
        -doc_pages:  (float) page length based on 250 words per page
        (warning, this is the second time this attribute is defined).
        -freq_words:  word frequency count not standardized based on the
        correct word tokenizer (not ratio, just count).
        modal_dist:  count of auxiliary verbs based on word_tokens_no_punct.
        sentence_count (int): Count the sentence tokens
        passive_sentences (list): List of all sentences identified as passive
        passive_sentence_count (int): count of items in passive_sentences
        be_verb_count (int): count "to be" verbs in text
        word_tokens_no_punct (list): words separated, stripped of punctuation,
         made lower case
        flesch_re_key (str): reading ease score to description
        freq_words (list or dict): frequency distribution of all words
        modal_dist (list): frequency distribution of aux verbs
        """
        self.user = ""
        self.path = path
        self.abs_path = os.path.abspath(self.path)
        if os.path.isfile(self.path):
            self.time_stamp = self.timestamp()
            self.file_name = os.path.basename(path)
            self.mime = MimeTypes()
            self.guessed_type = self.mime.guess_type(self.path)
            self.file_type = self.guessed_type[0]
            # NOTE(review): textract.process returns bytes on Python 3; the
            # str-pattern re.sub calls below assume str -- confirm intended
            # runtime or decode here.
            self.raw_text = textract.process(self.path, encoding="ascii")
            # Normalize typographic punctuation to ASCII equivalents.
            self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
            self.ptext = re.sub(u"\u2014", "--", self.ptext)
            # NOTE(review): this substitution replaces "," with "," (a
            # no-op); it was presumably meant to map a fullwidth comma
            # (U+FF0C) to ",".  Left unchanged pending confirmation.
            self.ptext = re.sub(",", ",", self.ptext)
            self.ptext = re.sub("—", "--", self.ptext)
            self.ptext = re.sub("…", "...", self.ptext)
            self.text_no_feed = self.clean_new_lines(self.ptext)
            self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
            self.sentence_count = len(self.sentence_tokens)
            self.passive_sentences = passive(self.text_no_feed)
            self.passive_sentence_count = len(self.passive_sentences)
            # NOTE(review): a document with zero sentences raises
            # ZeroDivisionError here and below.
            self.percent_passive = (100 * (float(self.passive_sentence_count) /
                                           float(self.sentence_count)))
            self.percent_passive_round = round(self.percent_passive, 2)

            self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
            self.be_verb_count = self.be_verb_analysis[0]
            self.weak_sentences_all = self.be_verb_analysis[1]
            self.weak_sentences_set = set(self.weak_sentences_all)
            self.weak_sentences_count = len(self.weak_sentences_set)
            self.weak_verbs_to_sentences = 100 * float(
                self.weak_sentences_count) / float(self.sentence_count)
            self.weak_verbs_to_sentences_round = round(
                self.weak_verbs_to_sentences, 2)
            self.word_tokens = self.word_tokenize(self.text_no_feed)
            self.word_tokens_no_punct = \
                self.word_tokenize_no_punct(self.text_no_feed)
            self.no_punct = self.strip_punctuation(self.text_no_feed)
            # use this! It makes lower and strips symbols
            self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)

            # Readability metrics (textstat) over the cleaned text.
            self.readability_flesch_re = \
                textstat.flesch_reading_ease(self.text_no_feed)
            self.readability_smog_index = \
                textstat.smog_index(self.text_no_feed)
            self.readability_flesch_kincaid_grade = \
                textstat.flesch_kincaid_grade(self.text_no_feed)
            self.readability_coleman_liau_index = \
                textstat.coleman_liau_index(self.text_no_feed)
            self.readability_ari = \
                textstat.automated_readability_index(self.text_no_feed)
            self.readability_linser_write = \
                textstat.linsear_write_formula(self.text_no_feed)
            self.readability_dale_chall = \
                textstat.dale_chall_readability_score(self.text_no_feed)
            self.readability_standard = \
                textstat.text_standard(self.text_no_feed)

            self.flesch_re_desc_str = self.flesch_re_desc(
                int(textstat.flesch_reading_ease(self.text_no_feed)))
            self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
            self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
            self.avg_syllables_per_word = textstat.avg_syllables_per_word(
                self.text_no_feed)
            self.avg_sentence_per_word = textstat.avg_sentence_per_word(
                self.text_no_feed)
            self.avg_sentence_length = textstat.avg_sentence_length(
                self.text_no_feed)
            self.avg_letter_per_word = textstat.avg_letter_per_word(
                self.text_no_feed)
            self.difficult_words = textstat.difficult_words(self.text_no_feed)
            self.rand_passive = self.select_random(self.passive_sentence_count,
                                                   self.passive_sentences)
            # BUG FIX: the original referenced self.weak_sentences, which is
            # never assigned (AttributeError at runtime); the weak-sentence
            # list lives in self.weak_sentences_all.
            self.rand_weak_sentence = self.select_random(
                len(self.weak_sentences_all), self.weak_sentences_all)
            # BUG FIX: pos_count_dict was only assigned inside the branch
            # below yet read unconditionally afterwards; default to an empty
            # mapping's items so the pos_isolate calls always receive an
            # iterable.
            self.pos_count_dict = {}.items()
            if self.word_tokens_no_punct:
                self.word_count = len(self.word_tokens_no_punct)
                self.page_length = float(self.word_count) / float(250)
                self.paper_count = int(math.ceil(self.page_length))
                self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
                self.pos_counts = Counter(
                    tag for word, tag in self.parts_of_speech)
                self.pos_total = sum(self.pos_counts.values())
                self.pos_freq = dict(
                    (word, float(count) / self.pos_total)
                    for word, count in self.pos_counts.items())
                self.doc_pages = float(float(self.word_count) / float(250))
                self.freq_words = \
                    self.word_frequency(self.word_tokens_no_punct)
                self.modal_dist = self.modal_count(self.word_tokens_no_punct)
                # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
                self.pos_count_dict = self.pos_counts.items()

            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate('IN',
                                                      self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy('RB',
                                                       self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy('NNP',
                                                       self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(len(self.comma_sentences),
                                                    self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
Beispiel #13
0
    def test_avg_syllables_per_word(self):
        # The long fixture text is expected to average 1.4 syllables/word.
        self.assertEqual(
            1.4, textstat.avg_syllables_per_word(self.long_test))
Beispiel #14
0
def preprocess(x):
    """Build the feature vector for one article record *x*.

    Expects a dict with 'id', 'targetTitle', 'targetParagraphs',
    'postText' and 'targetKeywords' keys and returns a flat list of
    numeric features (readability metrics over the body text, n-gram and
    POS-pattern features over the title and post text).  On any error the
    exception is printed and the process exits.
    """
    print('PROCESSING ID: ' + str(x['id']))
    try:
        fvec = []
        fvec.append(int(x['id']))  # Append Article ID
        fvec.append(nnp_num(x['targetTitle']))
        if len(x['targetParagraphs']) > 0:
            # Join the paragraphs once; the original rebuilt this string
            # for every one of the 18 textstat calls below.
            body = ' '.join(x['targetParagraphs'])
            fvec.append(ts.automated_readability_index(body))
            fvec.append(ts.avg_letter_per_word(body))
            fvec.append(ts.avg_sentence_length(body))
            fvec.append(ts.avg_sentence_per_word(body))
            fvec.append(ts.avg_syllables_per_word(body))
            fvec.append(ts.char_count(body))
            fvec.append(ts.coleman_liau_index(body))
            fvec.append(ts.dale_chall_readability_score(body))
            fvec.append(ts.difficult_words(body))
            fvec.append(ts.flesch_kincaid_grade(body))
            fvec.append(ts.flesch_reading_ease(body))
            fvec.append(ts.gunning_fog(body))
            fvec.append(ts.lexicon_count(body))
            fvec.append(ts.linsear_write_formula(body))
            fvec.append(ts.polysyllabcount(body))
            fvec.append(ts.sentence_count(body))
            fvec.append(ts.smog_index(body))
            fvec.append(ts.syllable_count(body))
            fvec.append(mean_wordlen(x['targetParagraphs']))
            fvec += ratio(x['targetParagraphs'])  #36
            fvec += ngram_feat(x['targetParagraphs'])  # 6
        else:
            fvec += [0] * 61
        if len(word_tokenize(' '.join(x['postText']))) > 0:
            fvec.append(max_wordlen(x['postText']))
            fvec.append(sw_ratio(' '.join(x['postText'])))
            fvec += ngram_feat(x['postText'])  #6
        else:
            fvec += [0] * 8
        # Title-derived POS-pattern features.
        title = x['targetTitle']
        fvec.append(len(word_tokenize(title)))
        fvec.append(wlen_title(title))
        fvec.append(pos_2gram(title, 'NNP', 'NNP'))
        fvec.append(int(num_start(title)))
        fvec.append(in_num(title))
        fvec.append(pos_2gram(title, 'NNP', 'VBZ'))
        fvec.append(pos_2gram(title, 'IN', 'NNP'))
        fvec.append(wrb_num(title))
        fvec.append(nnp_num(title))
        fvec.append(int(wh_start(title)))
        fvec.append(int(qm_exist(title)))
        fvec.append(pos_thnn(title))
        fvec.append(prp_count(title))
        fvec.append(vbz_count(title))
        fvec.append(pos_3gram(title, 'NNP', 'NNP', 'VBZ'))
        fvec.append(pos_2gram(title, 'NN', 'IN'))
        fvec.append(pos_3gram(title, 'NN', 'IN', 'NNP'))
        fvec.append(pos_2gram(title, 'NNP', '.'))
        fvec.append(pos_2gram(title, 'PRP', 'VBP'))
        fvec.append(wp_count(title))
        fvec.append(dt_count(title))
        fvec.append(pos_2gram(title, 'NNP', 'IN'))
        fvec.append(pos_3gram(title, 'IN', 'NNP', 'NNP'))
        fvec.append(pos_count(title))
        fvec.append(pos_2gram(title, 'IN', 'NN'))
        if len(x['targetKeywords']) > 0 and len(x['postText']) > 0:
            fvec.append(kw_post_match(x['targetKeywords'], x['postText']))
        else:
            fvec += [0] * 1
        fvec.append(comma_count(title))
        fvec.append(pos_2gram(title, 'NNP', 'NNS'))
        fvec.append(pos_2gram(title, 'IN', 'JJ'))
        fvec.append(pos_2gram(title, 'NNP', 'POS'))
        fvec.append(wdt_count(title))
        fvec.append(pos_2gram(title, 'NN', 'NN'))
        fvec.append(pos_2gram(title, 'NN', 'NNP'))
        fvec.append(pos_2gram(title, 'NNP', 'VBD'))
        fvec.append(rb_count(title))
        fvec.append(pos_3gram(title, 'NNP', 'NNP', 'NNP'))
        fvec.append(pos_3gram(title, 'NNP', 'NNP', 'NN'))
        fvec.append(rbs_count(title))
        fvec.append(vbn_count(title))
        fvec.append(pos_2gram(title, 'VBN', 'IN'))
        fvec.append(pos_2gram(title, 'JJ', 'NNP'))
        fvec.append(pos_3gram(title, 'NNP', 'NN', 'NN'))
        fvec.append(pos_2gram(title, 'DT', 'NN'))
        fvec.append(ex_exist(title))
        fvec += ngram_feat(title)  #6
    except Exception as e:
        # NOTE(review): exiting the whole process on a single bad record is
        # drastic, but preserved to keep behavior unchanged.
        print('EXCEPTION AT ID ' + str(x['id']))
        print(e)
        sys.exit()

    return fvec