Esempio n. 1
0
 def get_local_features(self, text):
     """Collect the local features of ``text`` into one dictionary.

     The text is wrapped in a ``TextObj`` and an initially empty dict is
     handed, in turn, to each ``feature_extractor`` routine below. The
     extractors' return values are ignored, so they are presumed to add
     their entries to the dict in place.

     Returns the dict after all extractors have run.
     """
     parsed = TextObj(text)
     features = {}

     # Token n-grams: first with the extractor's default ``n``, then n=2.
     feature_extractor.get_ngrams(features, parsed.tokens)
     feature_extractor.get_ngrams(features, parsed.tokens, n=2)
     feature_extractor.get_initialisms(features, parsed.tokens)

     # Extractors that work on the text (plus sentences/tokens for lengths).
     feature_extractor.get_basic_lengths(
         features, parsed.text, parsed.sentences, parsed.tokens
     )
     feature_extractor.get_repeated_punct(features, parsed.text)
     feature_extractor.get_LIWC(features, parsed.text)

     return features
Esempio n. 2
0
def build_features(body, subject, stemmer=None, wn=None, features=()):
    """Build the combined feature mapping for a message body and subject.

    Bag-of-words features (``bow_<token>``) over the body and subject tokens
    are always produced. Further groups are enabled by name via ``features``:

    * ``'bow_desc'`` -- separate ``body_<token>`` / ``subject_<token>`` keys.
    * ``'liwc'``     -- ``get_LIWC`` output for subject and body, passed
      through ``prepend_key`` with ``'SUBJLIWC'`` / ``'BODYLIWC'``.
    * ``'polarity'`` -- counts of tokens that ``msol.lookup`` labels
      ``'positive'`` or ``'negative'``.
    * ``'wn'``       -- a ``WN_<element>`` key for every element returned by
      ``wn.lookup(token)``.

    Args:
        body: Message body text.
        subject: Message subject text.
        stemmer: Object with a ``stem(text)`` method. When ``None`` the text
            is used unstemmed (previously this raised ``AttributeError``).
        wn: Object with a ``lookup(token)`` method; must be supplied when
            ``'wn'`` is requested.
        features: Container of enabled feature-group names. ``None`` is
            treated as empty.

    Returns:
        Whatever ``combine_dicts`` produces from the individual feature
        mappings (presumably a single merged dict -- confirm against
        ``combine_dicts``).
    """
    if features is None:
        features = ()

    # NOTE(review): ``stem`` is applied to the whole text rather than to each
    # token -- confirm that the stemmer in use supports that.
    if stemmer is None:
        stemmed_body, stemmed_subj = body, subject
    else:
        stemmed_body = stemmer.stem(body)
        stemmed_subj = stemmer.stem(subject)

    tokens_body = wordpunct_tokenize(stemmed_body)
    tokens_subj = wordpunct_tokenize(stemmed_subj)

    all_bow = {'bow_{}'.format(token): True for token in tokens_body + tokens_subj}

    # NOTE(review): an old commented-out default spelled this 'bow_disc';
    # only 'bow_desc' is recognised here -- confirm which spelling callers use.
    body_bow = dict()
    subj_bow = dict()
    if 'bow_desc' in features:
        body_bow = {'body_{}'.format(token): True for token in tokens_body}
        subj_bow = {'subject_{}'.format(token): True for token in tokens_subj}

    # get_LIWC's return value is not used; it is expected to fill the dict
    # it is given.
    subj_liwc = dict()
    body_liwc = dict()
    if 'liwc' in features:
        get_LIWC(subj_liwc, stemmed_subj)
        get_LIWC(body_liwc, stemmed_body)

    polarity = defaultdict(int)
    if 'polarity' in features:
        for token in tokens_subj + tokens_body:
            result = msol.lookup(token)
            if result in ('negative', 'positive'):
                polarity[result] += 1

    all_wn = dict()
    if 'wn' in features:
        for token in tokens_body + tokens_subj:
            # An empty lookup result simply contributes no keys.
            for element in wn.lookup(token):
                all_wn['WN_{}'.format(element)] = True

    return combine_dicts(all_bow,
                         subj_bow,
                         body_bow,
                         all_wn,
                         prepend_key(subj_liwc, 'SUBJLIWC'),
                         prepend_key(body_liwc, 'BODYLIWC'),
                         polarity,
                        )