def get_XY_vectors():
    # Load the metadata for the chosen data set and collect all answer ids.
    meta, id_to_idx, idx_to_id = utils.load_meta(chosen_meta)
    all_answers = get_answers_list(meta)

    # Label: an answer counts as "good" if its score is positive.
    Y = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])

    # One feature vector per answer body.
    x = [extract_features_from_body(text)
         for post_id, text in utils.fetch_posts(chosen)
         if post_id in all_answers]
    X = np.asarray(x)

    return X, Y
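# A minimal usage sketch (not part of the original code): the X, Y returned by
# get_XY_vectors() can be fed straight into a scikit-learn estimator. This assumes
# scikit-learn is installed; sketch_evaluate_XY is a hypothetical helper, and
# KNeighborsClassifier with 5-fold cross-validation is an illustrative choice,
# not necessarily the classifier used elsewhere in this project.
def sketch_evaluate_XY():
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier

    X, Y = get_XY_vectors()

    # Any estimator with fit/predict would work here.
    clf = KNeighborsClassifier(n_neighbors=5)

    # Mean accuracy over a 5-fold cross-validation of the answer features.
    scores = cross_val_score(clf, X, Y, cv=5)
    print("Mean CV accuracy: %.3f (+/- %.3f)" % (scores.mean(), scores.std()))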
def prepare_sent_features():
    for pid, text in fetch_posts(chosen, with_index=True):
        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            # On Python 2 the post body arrives as bytes; decode before tokenizing.
            from platform import python_version
            if python_version().startswith('2'):
                text = text.decode('utf-8')

            # Average number of tokens per sentence.
            sent_lens = [len(nltk.word_tokenize(sent))
                         for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)

            # Average word length over the whole post.
            meta[pid]['AvgWordLen'] = np.mean(
                [len(w) for w in nltk.word_tokenize(text)])

        # Number of all-uppercase words and of exclamation marks.
        meta[pid]['NumAllCaps'] = np.sum(
            [word.isupper() for word in nltk.word_tokenize(text)])
        meta[pid]['NumExclams'] = text.count('!')
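# Usage sketch (an assumption, not from the original source): prepare_sent_features()
# mutates meta in place, so it has to run after meta has been loaded and before the
# feature matrix is rebuilt. The NLTK tokenizers need the 'punkt' model (newer NLTK
# releases may additionally require 'punkt_tab'); sketch_run_sent_features is a
# hypothetical helper name.
def sketch_run_sent_features():
    nltk.download('punkt', quiet=True)

    prepare_sent_features()

    # The new per-post statistics now sit alongside the existing metadata,
    # assuming meta is a dict keyed by post id.
    some_pid = next(iter(meta))
    print(meta[some_pid]['AvgSentLen'],
          meta[some_pid]['AvgWordLen'],
          meta[some_pid]['NumAllCaps'],
          meta[some_pid]['NumExclams'])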