def readability_analysis(self, text):
    words = text.split()
    # Map each punctuation-stripped word to its syllable count.
    wrd_dic = {}
    for wrd in words:
        wrd = "".join(a for a in wrd if a not in punctuation)
        wrd_dic[wrd] = textstat.syllable_count(wrd)
    # Keep only words with five or more syllables as "difficult" words.
    wrd_dic = [b for b in wrd_dic if wrd_dic[b] >= 5]

    # Clamp the Flesch Reading Ease score to its nominal 0-100 scale.
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    if flesch_reading_ease > 100:
        flesch_reading_ease = 100
    elif flesch_reading_ease < 0:
        flesch_reading_ease = 0

    syllable_count = textstat.syllable_count(text)
    avg_syllables_per_word = textstat.avg_syllables_per_word(text)
    avg_letter_per_word = textstat.avg_letter_per_word(text)

    readability = {
        "flesch_reading_ease": flesch_reading_ease,
        "avg_syllables_per_word": avg_syllables_per_word,
        "syllable_count": syllable_count,
        "avg_letter_per_word": avg_letter_per_word,
    }
    grade, score = self.readability_grade(readability)
    readability['grade'] = grade
    readability['score'] = score
    readability['difficult_words'] = wrd_dic
    return readability
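# readability_grade is called above but not shown. A minimal hypothetical
# sketch, assuming it maps the clamped Flesch Reading Ease score onto the
# standard Flesch grade bands and returns a (grade, score) pair:
def readability_grade(self, readability):
    # Standard Flesch Reading Ease bands (0 = hardest, 100 = easiest).
    score = readability["flesch_reading_ease"]
    if score >= 90:
        grade = "5th grade"
    elif score >= 80:
        grade = "6th grade"
    elif score >= 70:
        grade = "7th grade"
    elif score >= 60:
        grade = "8th-9th grade"
    elif score >= 50:
        grade = "10th-12th grade"
    elif score >= 30:
        grade = "college"
    else:
        grade = "college graduate"
    return grade, score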
def _calculate_scores(self, docs):
    docs_scores = []
    for doc in docs:
        scores = {}
        scores['chars'] = ts.char_count(doc)
        scores['words'] = ts.lexicon_count(doc)
        scores['sents'] = ts.sentence_count(doc)
        # scores['syllables'] = ts.syllable_count(doc)
        scores['avg_sent_length'] = ts.avg_sentence_length(doc)
        scores['avg_syllables_per_word'] = ts.avg_syllables_per_word(doc)
        scores['avg_letters_per_word'] = ts.avg_letter_per_word(doc)
        scores['flesch'] = ts.flesch_reading_ease(doc)
        # scores['smog'] = ts.smog_index(doc)
        # scores['coleman_liau'] = ts.coleman_liau_index(doc)
        scores['automated_readability'] = ts.automated_readability_index(doc)
        # scores['linsear'] = ts.linsear_write_formula(doc)
        # scores['difficult_words'] = ts.difficult_words(doc)
        scores['dale_chall'] = ts.dale_chall_readability_score(doc)
        # scores['gunning_fog'] = ts.gunning_fog(doc)
        scores['lix'] = ts.lix(doc)
        docs_scores.append(scores)
    return docs_scores
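# Example usage. The method assumes textstat is imported as `ts`;
# `Scorer` is a hypothetical stand-in for whatever class hosts it.
import textstat as ts

scorer = Scorer()
print(scorer._calculate_scores([
    "The cat sat on the mat.",
    "Notwithstanding the inclement weather, the expedition persevered.",
]))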
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)
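# A call site has to unpack all fourteen values in the exact order above:
sample = (
    "Readability formulas estimate how hard a passage is to read. "
    "Most combine sentence length with word length or syllable counts."
)
(chars, words, syllables, sentences,
 avg_sent_len, avg_syll_word, avg_letter_word, avg_sent_word,
 fk_grade, smog, fog, difficult, dale_chall, polysyllables) = scores_cal_ori(sample)
print(f"FK grade {fk_grade}, SMOG {smog}, fog {fog}, difficult words {difficult}")
# Returning a dict or a collections.namedtuple instead would make the
# fourteen-value tuple far less error-prone at call sites.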
def analyse_json(json_text):
    # Consider moving this to be a feature of Transcript in the other module.
    df_witnesses = pd.DataFrame(
        columns=['html_file_location', 'witness_name', 'syllable_count',
                 'lexicon_count', 'sentence_count', 'syllables_per_word',
                 'gunning_fog', 'smog_index', 'text_standard'],
        index=[])
    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']

        # Collect every passage spoken by each witness.
        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type'] == 'witness':
                witness = witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            if 'all_text' in witnesses[p]:
                witness_text = '\n\n'.join(witnesses[p]['all_text'])
                if len(witness_text) > 0:
                    stats_data = {
                        'html_file_location': trscrpt['html_file_location'],
                        'witness_name': p,
                        'syllable_count': textstat.syllable_count(witness_text),
                        'lexicon_count': textstat.lexicon_count(witness_text),
                        'sentence_count': textstat.sentence_count(witness_text),
                        'syllables_per_word':
                            textstat.avg_syllables_per_word(witness_text),
                        'gunning_fog': textstat.gunning_fog(witness_text),
                        'smog_index': textstat.smog_index(witness_text),
                        'text_standard': textstat.text_standard(witness_text),
                    }
                    df_witnesses.loc['witness_%i' % i] = stats_data
                else:
                    df_witnesses.loc['witness_%i' % i, 'html_file_location'] = \
                        trscrpt['html_file_location']
                    df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = \
                    trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
    return df_witnesses
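# Example call with an illustrative transcript. The data here is made up;
# the field names simply mirror the keys the function reads.
import json

sample = json.dumps({
    "html_file_location": "hearings/2019-03-01.html",
    "witnesses": {"Dr. Jane Doe": {}},
    "all_sections": [{
        "speaker": {"person": {"name": "Dr. Jane Doe",
                               "speaker_type": "witness"}},
        "spoken_text": "Thank you, Chairman. I will summarize my testimony.",
    }],
})
print(analyse_json(sample))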
def analyzeTweet(tweetId):
    tweet = tweets_coll.find_one({"_id": tweetId})
    print("tweet: ", tweet)

    tweetAnalysis = {"_id": tweet["id"]}
    print("\tanalyzed tweet:", tweetAnalysis)

    # TextBlob supplies POS tags, noun phrases, and sentiment.
    blob = TextBlob(tweet["text"])
    tweetAnalysis["tags"] = blob.tags
    tweetAnalysis["noun_phrases"] = blob.noun_phrases
    tweetAnalysis["sentiment.polarity"] = blob.sentiment[0]
    tweetAnalysis["sentiment.subjectivity"] = blob.sentiment[1]

    # textstat supplies the readability measures.
    tweetAnalysis["flesch_kincaid"] = textstat.flesch_kincaid_grade(
        tweet["text"])
    tweetAnalysis["average_sentence_length"] = textstat.avg_sentence_length(
        tweet["text"])
    tweetAnalysis["average_syllables_per_word"] = \
        textstat.avg_syllables_per_word(tweet["text"])

    print("\ttweetAnalysis: " + json.dumps(tweetAnalysis))
    processed_tweets_coll.update_one({"_id": tweet["id"]},
                                     {'$set': tweetAnalysis},
                                     upsert=True)
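# tweets_coll and processed_tweets_coll are module-level globals not shown
# in this snippet. One plausible wiring with pymongo; the connection string,
# database, and collection names below are all placeholders:
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017")
db = client["twitter"]
tweets_coll = db["tweets"]
processed_tweets_coll = db["processed_tweets"]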
def stats(self, text):
    test_data = text
    stats = {}
    # Grade-level and readability indices.
    stats['flesch_reading_ease'] = textstat.flesch_reading_ease(test_data)
    stats['smog'] = textstat.smog_index(test_data)
    stats['flesch kincaid'] = textstat.flesch_kincaid_grade(test_data)
    stats['coleman Liau'] = textstat.coleman_liau_index(test_data)
    stats['automated'] = textstat.automated_readability_index(test_data)
    stats['dale chall'] = textstat.dale_chall_readability_score(test_data)
    stats['difficult'] = textstat.difficult_words(test_data)
    stats['linsear'] = textstat.linsear_write_formula(test_data)
    stats['gunning_fog'] = textstat.gunning_fog(test_data)
    stats['standard'] = textstat.text_standard(test_data)
    # Raw counts and averages.
    stats['charcount'] = textstat.char_count(test_data)
    stats['lexicon count'] = textstat.lexicon_count(test_data)
    stats['syllable count'] = textstat.syllable_count(test_data)
    stats['sentence count'] = textstat.sentence_count(test_data)
    stats['avg sentence length'] = textstat.avg_sentence_length(test_data)
    stats['avg_syllables_per_word'] = textstat.avg_syllables_per_word(
        test_data)
    stats['avg_letter_per_word'] = textstat.avg_letter_per_word(test_data)
    stats['avg_sentence_per_word'] = textstat.avg_sentence_per_word(
        test_data)
    return stats
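# Example usage; `TextStats` is a hypothetical stand-in for the class
# that hosts the stats method above.
reporter = TextStats()
report = reporter.stats("Some sample paragraph. It has two short sentences.")
for name, value in sorted(report.items()):
    print(f"{name:25s} {value}")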
def updateData(self):
    # Full list of polarity scores.
    self.polscore = self.sid.polarity_scores(self.text)

    ##### INDEX 0 IN DATA: Text Sentiment #####
    # [INDEX 0] Compound score (-1.0 - 1.0)   [INDEX 1] Negative connotation rating (0.0 - 1.0)
    # [INDEX 2] Positive connotation rating (0.0 - 1.0)   [INDEX 3] Neutral connotation rating (0.0 - 1.0)
    self.data.append([self.polscore['compound'], self.polscore['neg'],
                      self.polscore['pos'], self.polscore['neu']])

    ##### INDEX 1 IN DATA: Sentence Info #####
    # [INDEX 0] Sentence count         [INDEX 1] Average sentence length
    # [INDEX 2] Syllable count         [INDEX 3] Overall word count
    # [INDEX 4] Character count        [INDEX 5] Character count without spaces
    # [INDEX 6] Avg letters per word   [INDEX 7] Avg syllables per word
    self.data.append([
        textstat.sentence_count(self.text),
        textstat.avg_sentence_length(self.text),
        textstat.syllable_count(self.text),
        len(self.splList),
        textstat.char_count(self.text, False),
        textstat.char_count(self.text, True),
        textstat.avg_letter_per_word(self.text),
        textstat.avg_syllables_per_word(self.text)
    ])

    # Each readability entry below is:
    # [INDEX 0] Pure score  [INDEX 1] Approximate grade  [INDEX 2] Normalized (ratio) score

    ##### INDEX 2 IN DATA: Flesch Reading Ease #####
    # SCORE SCALE: 0 - 100
    self.freRaw = textstat.flesch_reading_ease(self.text)
    self.freStat = min(max(self.freRaw, 0), 100)
    self.data.append([round(self.freStat, 3), self.freGrade(self.freStat),
                      round(abs(self.freStat - 100), 2)])

    ##### INDEX 3 IN DATA: Flesch-Kincaid Grade #####
    # SCORE SCALE: 0 - 18
    self.fkgRaw = textstat.flesch_kincaid_grade(self.text)
    self.fkgStat = self.adjustScore(self.fkgRaw)
    self.data.append([round(self.fkgStat, 3), self.grade(self.fkgStat),
                      round(self.fkgStat / 0.18, 2)])

    ##### INDEX 4 IN DATA: Gunning FOG Index #####
    # SCORE SCALE: 0 - 18
    self.fogRaw = textstat.gunning_fog(self.text)
    self.fogStat = self.adjustScore(self.fogRaw)
    self.data.append([round(self.fogStat, 3), self.grade(self.fogStat),
                      round(self.fogStat / 0.18, 2)])

    ##### INDEX 5 IN DATA: SMOG Index #####
    # SCORE SCALE: 0 - 18
    self.smogRaw = textstat.smog_index(self.text)
    self.smogStat = self.adjustScore(self.smogRaw)
    self.data.append([round(self.smogStat, 3), self.grade(self.smogStat),
                      round(self.smogStat / 0.18, 2)])

    ##### INDEX 6 IN DATA: Automated Readability Index #####
    # SCORE SCALE: 0 - 14
    self.ariRaw = textstat.automated_readability_index(self.text)
    self.ariStat = min(max(self.ariRaw, 0), 14)
    self.data.append([round(self.ariStat, 3),
                      self.ariGrade(ceil(self.ariStat)),
                      round(self.ariStat / 0.14, 2)])

    ##### INDEX 7 IN DATA: Coleman-Liau Index #####
    # SCORE SCALE: 0 - 18
    self.cliRaw = textstat.coleman_liau_index(self.text)
    self.cliStat = self.adjustScore(self.cliRaw)
    self.data.append([round(self.cliStat, 3), self.grade(self.cliStat),
                      round(self.cliStat / 0.18, 2)])

    ##### INDEX 8 IN DATA: Linsear Write Index #####
    # SCORE SCALE: 0 - 18
    self.lwiRaw = textstat.linsear_write_formula(self.text)
    self.lwiStat = self.adjustScore(self.lwiRaw)
    self.data.append([round(self.lwiStat, 3), self.grade(self.lwiStat),
                      round(self.lwiStat / 0.18, 2)])

    ##### INDEX 9 IN DATA: Dale-Chall Readability Score #####
    # SCORE SCALE: 0 - 10
    self.dcrRaw = textstat.dale_chall_readability_score(self.text)
    self.dcrStat = min(max(self.dcrRaw, 0), 10)
    self.data.append([round(self.dcrStat, 3),
                      self.daleChallGrade(self.dcrStat),
                      round(self.dcrStat / 0.1, 2)])

    ##### INDEX 10 IN DATA: Overall Score #####
    # SCORE SCALE: 0 - 20
    self.txtRaw = textstat.text_standard(self.text, True)
    self.txtStd = min(max(self.txtRaw, 0), 20)
    self.txtInfo = textstat.text_standard(self.text)
    self.data.append([round(self.txtStd, 3),
                      self.txtGrade(self.txtStd, self.txtInfo),
                      round(self.txtStd / 0.2, 2)])

    return self.data
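# adjustScore is called throughout updateData but not shown. A hypothetical
# sketch only: the "SCORE SCALE: 0 - 18" comments suggest it clamps a raw
# grade-level score into that range, mirroring the inline min/max used for
# the Flesch Reading Ease and ARI scores.
def adjustScore(self, score):
    return min(max(score, 0), 18)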
def dutch_flesch_reading_ease(self, text):
    # Flesch Reading Ease adapted for Dutch.
    ASL = textstat.avg_sentence_length(text)
    ASW = textstat.avg_syllables_per_word(text)
    FRE = 206.84 - (0.33 * ASL) - (77 * ASW)
    return int(FRE)
def french_flesch_reading_ease(self, text):
    # Flesch Reading Ease adapted for French.
    ASL = textstat.avg_sentence_length(text)
    ASW = textstat.avg_syllables_per_word(text)
    FRE = 209 - (1.15 * ASL) - (68 * ASW)
    return int(FRE)
def italian_flesch_reading_ease(self, text):
    # Flesch Reading Ease adapted for Italian.
    ASL = textstat.avg_sentence_length(text)
    ASW = textstat.avg_syllables_per_word(text)
    FRE = 217 - (1.3 * ASL) - (60 * ASW)
    return int(FRE)
def portuguese_flesch_reading_ease(self, text):
    # Flesch Reading Ease adapted for Portuguese.
    ASL = textstat.avg_sentence_length(text)
    ASW = textstat.avg_syllables_per_word(text)
    FRE = 206.84 - (1.02 * ASL) - (60 * ASW)
    return int(FRE)
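# Note that all four adaptations reuse textstat's sentence and syllable
# counters, which are tuned for English, so the scores are approximations
# unless the library is switched to the target language (recent textstat
# releases expose textstat.set_lang for this). A quick illustrative call;
# `ReadabilityAnalyzer` is a hypothetical host class for the methods above:
analyzer = ReadabilityAnalyzer()
texte = "Le chat dort sur le tapis. Il fait très beau aujourd'hui."
print("French FRE:", analyzer.french_flesch_reading_ease(texte))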
def __init__(self, path):
    """Create a document instance for analysis.

    Opens and reads the document into the string raw_text. Textract
    interprets the document format and extracts a plain text string
    (docx, pdf, odt, txt).

    Args:
        path (str): path to the file to open, analyze, and close.

    Public attributes:
    -user: (str) optional string to set username.
    -path: (str) relative path to document.
    -abs_path: (str) the absolute path to the document.
    -file_name: (str) the file name with extension of document (base name).
    -mime: tbd
    -guessed_type: makes best guess of mimetype of document.
    -file_type: returns index[0] from guessed_type.
    -raw_text: (str) plain text extracted from .txt, .odt, .pdf, .docx, and .doc.
    -ptext: (str) raw text after a series of regex expressions to eliminate
        special characters.
    -text_no_feed: (str) ptext with most new line characters eliminated;
        \\n\\n stays intact.
    -sentence_tokens: list of all sentences, derived by nltk.
    -sentence_count: (int) count of sentences found in list.
    -passive_sentences: list of passive sentences identified by the passive
        module.
    -passive_sentence_count: count of the passive_sentences list.
    -percent_passive: (float) ratio of passive sentences to all sentences,
        in percent form.
    -be_verb_analysis: (int) sum of occurrences of each "to be" verb
        (am, is, are, was, were, be, being, been).
    -be_verb_count: tbd
    -weak_sentences_all: (int) sum of be verb analysis.
    -weak_sentences_set: (set) set of all sentences identified as having
        "to be" verbs.
    -weak_sentences_count: (int) count of items in weak_sentences_set.
    -weak_verbs_to_sentences: (float) proportion of sentences with "to be"
        verbs to all sentences, in percent (this might not be sound).
    -word_tokens: list of discrete words in text; breaks contractions up
        (default nltk tokenizer).
    -word_tokens_no_punct: list of all words in text including contractions
        but otherwise no punctuation.
    -no_punct: (str) full text string without sentence punctuation.
    -word_tokens_no_punct: uses white-space tokenizer to create a list of
        all words.
    -readability_flesch_re: (int) Flesch Reading Ease score from the
        textstat module.
    -readability_smog_index: (int) grade level as determined by the SMOG
        algorithm (textstat).
    -readability_flesch_kincaid_grade: (int) Flesch-Kincaid grade level of
        reader (textstat).
    -readability_coleman_liau_index: (int) grade level of reader (textstat).
    -readability_ari: (int) grade level of reader determined by the
        Automated Readability Index algorithm (textstat).
    -readability_linser_write: grade level as determined by the Linsear
        Write algorithm (textstat); note the attribute name misspells
        "linsear".
    -readability_dale_chall: (int) grade level based on Dale-Chall
        readability (textstat).
    -readability_standard: composite grade level based on the readability
        algorithms.
    -flesch_re_key: list for interpreting the Flesch RE score.
    -word_count: word count of document based on the white-space tokenizer;
        this word count should be used.
    -page_length: (float) page length in decimal format given 250 words
        per page.
    -paper_count: (int) number of printed pages given 250 words per page.
    -parts_of_speech: words with parts-of-speech tags.
    -pos_counts: (word, tag) pairs grouped in a Counter.
    -pos_total: (int) sum of pos_counts values.
    -pos_freq: (dict) word, ratio of whole.
    -doc_pages: (float) page length based on 250 words per page (warning:
        this is the second time this attribute is defined).
    -freq_words: word frequency count (not a ratio, just a count) based on
        the correct word tokenizer.
    -modal_dist: count of auxiliary verbs based on word_tokens_no_punct.

    sentence_count (int): count of the sentence tokens.
    passive_sentences (list): list of all sentences identified as passive.
    passive_sentence_count (int): count of items in passive_sentences.
    be_verb_count (int): count of "to be" verbs in text.
    word_tokens_no_punct (list): words separated, stripped of punctuation,
        made lower case.
    flesch_re_key (str): reading ease score to description.
    freq_words (list or dict): frequency distribution of all words.
    modal_dist (list): frequency distribution of aux verbs.
    """
    self.user = ""
    self.path = path
    self.abs_path = os.path.abspath(self.path)
    if os.path.isfile(self.path):
        self.time_stamp = self.timestamp()
        self.file_name = os.path.basename(path)
        self.mime = MimeTypes()
        self.guessed_type = self.mime.guess_type(self.path)
        self.file_type = self.guessed_type[0]
        self.raw_text = textract.process(self.path, encoding="ascii")
        # Normalize curly quotes, dashes, and ellipses to ASCII.
        self.ptext = re.sub(u'[\u201c\u201d]', '"', self.raw_text)
        self.ptext = re.sub(u"\u2014", "--", self.ptext)
        # No-op as written; presumably meant to normalize a full-width
        # comma that was lost in transcription.
        self.ptext = re.sub(",", ",", self.ptext)
        self.ptext = re.sub("—", "--", self.ptext)
        self.ptext = re.sub("…", "...", self.ptext)
        self.text_no_feed = self.clean_new_lines(self.ptext)
        self.sentence_tokens = self.sentence_tokenize(self.text_no_feed)
        self.sentence_count = len(self.sentence_tokens)
        self.passive_sentences = passive(self.text_no_feed)
        self.passive_sentence_count = len(self.passive_sentences)
        self.percent_passive = (100 *
                                (float(self.passive_sentence_count) /
                                 float(self.sentence_count)))
        self.percent_passive_round = round(self.percent_passive, 2)
        self.be_verb_analysis = self.count_be_verbs(self.sentence_tokens)
        self.be_verb_count = self.be_verb_analysis[0]
        self.weak_sentences_all = self.be_verb_analysis[1]
        self.weak_sentences_set = set(self.weak_sentences_all)
        self.weak_sentences_count = len(self.weak_sentences_set)
        self.weak_verbs_to_sentences = 100 * float(
            self.weak_sentences_count) / float(self.sentence_count)
        self.weak_verbs_to_sentences_round = round(
            self.weak_verbs_to_sentences, 2)
        self.word_tokens = self.word_tokenize(self.text_no_feed)
        self.word_tokens_no_punct = \
            self.word_tokenize_no_punct(self.text_no_feed)
        self.no_punct = self.strip_punctuation(self.text_no_feed)
        # Use this one: it lowercases and strips symbols.
        self.word_tokens_no_punct = self.ws_tokenize(self.no_punct)

        self.readability_flesch_re = \
            textstat.flesch_reading_ease(self.text_no_feed)
        self.readability_smog_index = \
            textstat.smog_index(self.text_no_feed)
        self.readability_flesch_kincaid_grade = \
            textstat.flesch_kincaid_grade(self.text_no_feed)
        self.readability_coleman_liau_index = \
            textstat.coleman_liau_index(self.text_no_feed)
        self.readability_ari = \
            textstat.automated_readability_index(self.text_no_feed)
        self.readability_linser_write = \
            textstat.linsear_write_formula(self.text_no_feed)
        self.readability_dale_chall = \
            textstat.dale_chall_readability_score(self.text_no_feed)
        self.readability_standard = \
            textstat.text_standard(self.text_no_feed)
        self.flesch_re_desc_str = self.flesch_re_desc(
            int(textstat.flesch_reading_ease(self.text_no_feed)))
        self.polysyllabcount = textstat.polysyllabcount(self.text_no_feed)
        self.lexicon_count = textstat.lexicon_count(self.text_no_feed)
        self.avg_syllables_per_word = textstat.avg_syllables_per_word(
            self.text_no_feed)
        self.avg_sentence_per_word = textstat.avg_sentence_per_word(
            self.text_no_feed)
        self.avg_sentence_length = textstat.avg_sentence_length(
            self.text_no_feed)
        self.avg_letter_per_word = textstat.avg_letter_per_word(
            self.text_no_feed)
        self.difficult_words = textstat.difficult_words(self.text_no_feed)
        self.rand_passive = self.select_random(self.passive_sentence_count,
                                               self.passive_sentences)
        # The original referenced self.weak_sentences, which is never
        # assigned; self.weak_sentences_all is assumed to be the intended
        # list here.
        self.rand_weak_sentence = self.select_random(
            len(self.weak_sentences_all), self.weak_sentences_all)
        if self.word_tokens_no_punct:
            self.word_count = len(self.word_tokens_no_punct)
            self.page_length = float(self.word_count) / float(250)
            self.paper_count = int(math.ceil(self.page_length))
            self.parts_of_speech = pos_tag(self.word_tokens_no_punct)
            self.pos_counts = Counter(
                tag for word, tag in self.parts_of_speech)
            self.pos_total = sum(self.pos_counts.values())
            self.pos_freq = dict(
                (word, float(count) / self.pos_total)
                for word, count in self.pos_counts.items())
            self.doc_pages = float(float(self.word_count) / float(250))
            self.freq_words = \
                self.word_frequency(self.word_tokens_no_punct)
            self.modal_dist = self.modal_count(self.word_tokens_no_punct)
            # self.ws_tokens = self.ws_tokenize(self.text_no_cr)
            self.pos_count_dict = self.pos_counts.items()

            # Model - use for any pos
            self.modals = self.pos_isolate('MD', self.pos_count_dict)
            self.preposition_count = self.pos_isolate(
                'IN', self.pos_count_dict)
            self.adjective_count = self.pos_isolate_fuzzy(
                'JJ', self.pos_count_dict)
            self.adverb_count = self.pos_isolate_fuzzy(
                'RB', self.pos_count_dict)
            self.proper_nouns = self.pos_isolate_fuzzy(
                'NNP', self.pos_count_dict)
            self.cc_count = self.pos_isolate('CC', self.pos_count_dict)
            self.commas = self.char_count(",")
            self.comma_sentences = self.list_sentences(",")
            self.comma_example = self.select_random(
                len(self.comma_sentences), self.comma_sentences)
            self.semicolons = self.char_count(";")
            self.semicolon_sentences = self.list_sentences(";")
            self.semicolon_example = self.select_random(
                len(self.semicolon_sentences), self.semicolon_sentences)
            self.lint_suggestions = lint(self.raw_text)
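# Example usage. The class name and path are placeholders; textract must
# be able to read the file for the constructor to succeed.
doc = Document("essays/draft.docx")
print(doc.readability_standard)   # composite grade level
print(doc.word_count, "words,", doc.paper_count, "pages")
print(doc.percent_passive_round, "% passive sentences")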
def test_avg_syllables_per_word(self):
    avg = textstat.avg_syllables_per_word(self.long_test)
    self.assertEqual(1.4, avg)
def preprocess(x):
    print('PROCESSING ID: ' + str(x['id']))
    try:
        fvec = []
        fvec.append(int(x['id']))  # Append Article ID
        fvec.append(nnp_num(x['targetTitle']))

        # Readability and surface statistics of the article body.
        if len(x['targetParagraphs']) > 0:
            body = ' '.join(x['targetParagraphs'])
            fvec.append(ts.automated_readability_index(body))
            fvec.append(ts.avg_letter_per_word(body))
            fvec.append(ts.avg_sentence_length(body))
            fvec.append(ts.avg_sentence_per_word(body))
            fvec.append(ts.avg_syllables_per_word(body))
            fvec.append(ts.char_count(body))
            fvec.append(ts.coleman_liau_index(body))
            fvec.append(ts.dale_chall_readability_score(body))
            fvec.append(ts.difficult_words(body))
            fvec.append(ts.flesch_kincaid_grade(body))
            fvec.append(ts.flesch_reading_ease(body))
            fvec.append(ts.gunning_fog(body))
            fvec.append(ts.lexicon_count(body))
            fvec.append(ts.linsear_write_formula(body))
            fvec.append(ts.polysyllabcount(body))
            fvec.append(ts.sentence_count(body))
            fvec.append(ts.smog_index(body))
            fvec.append(ts.syllable_count(body))
            fvec.append(mean_wordlen(x['targetParagraphs']))
            fvec += ratio(x['targetParagraphs'])  # 36
            fvec += ngram_feat(x['targetParagraphs'])  # 6
        else:
            fvec += [0] * 61

        # Features of the post text.
        if len(word_tokenize(' '.join(x['postText']))) > 0:
            fvec.append(max_wordlen(x['postText']))
            fvec.append(sw_ratio(' '.join(x['postText'])))
            fvec += ngram_feat(x['postText'])  # 6
        else:
            fvec += [0] * 8

        # POS-pattern features of the title.
        fvec.append(len(word_tokenize(x['targetTitle'])))
        fvec.append(wlen_title(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'NNP'))
        fvec.append(int(num_start(x['targetTitle'])))
        fvec.append(in_num(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'VBZ'))
        fvec.append(pos_2gram(x['targetTitle'], 'IN', 'NNP'))
        fvec.append(wrb_num(x['targetTitle']))
        fvec.append(nnp_num(x['targetTitle']))
        fvec.append(int(wh_start(x['targetTitle'])))
        fvec.append(int(qm_exist(x['targetTitle'])))
        fvec.append(pos_thnn(x['targetTitle']))
        fvec.append(prp_count(x['targetTitle']))
        fvec.append(vbz_count(x['targetTitle']))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NNP', 'VBZ'))
        fvec.append(pos_2gram(x['targetTitle'], 'NN', 'IN'))
        fvec.append(pos_3gram(x['targetTitle'], 'NN', 'IN', 'NNP'))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', '.'))
        fvec.append(pos_2gram(x['targetTitle'], 'PRP', 'VBP'))
        fvec.append(wp_count(x['targetTitle']))
        fvec.append(dt_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'IN'))
        fvec.append(pos_3gram(x['targetTitle'], 'IN', 'NNP', 'NNP'))
        fvec.append(pos_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'IN', 'NN'))

        if len(x['targetKeywords']) > 0 and len(x['postText']) > 0:
            fvec.append(kw_post_match(x['targetKeywords'], x['postText']))
        else:
            fvec += [0] * 1

        fvec.append(comma_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'NNS'))
        fvec.append(pos_2gram(x['targetTitle'], 'IN', 'JJ'))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'POS'))
        fvec.append(wdt_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'NN', 'NN'))
        fvec.append(pos_2gram(x['targetTitle'], 'NN', 'NNP'))
        fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'VBD'))
        fvec.append(rb_count(x['targetTitle']))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NNP', 'NNP'))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NNP', 'NN'))
        fvec.append(rbs_count(x['targetTitle']))
        fvec.append(vbn_count(x['targetTitle']))
        fvec.append(pos_2gram(x['targetTitle'], 'VBN', 'IN'))
        fvec.append(pos_2gram(x['targetTitle'], 'JJ', 'NNP'))
        fvec.append(pos_3gram(x['targetTitle'], 'NNP', 'NN', 'NN'))
        fvec.append(pos_2gram(x['targetTitle'], 'DT', 'NN'))
        fvec.append(ex_exist(x['targetTitle']))
        fvec += ngram_feat(x['targetTitle'])  # 6
    except Exception as e:
        print('EXCEPTION AT ID ' + str(x['id']))
        print(e)
        sys.exit()
    return fvec
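# Example invocation. The record below is illustrative only; its keys
# mirror the fields preprocess reads (id, targetTitle, targetParagraphs,
# postText, targetKeywords).
record = {
    "id": 42,
    "targetTitle": "10 Things You Won't Believe About Readability",
    "targetParagraphs": ["First paragraph of the article.",
                         "Second paragraph with more detail."],
    "postText": ["You won't believe number 7"],
    "targetKeywords": "readability,listicle",
}
features = preprocess(record)
print(len(features), "features extracted")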