def generate_feature_csv(self, feature_csv, pos_lexicon, neg_lexicon, postag_instances=None):
    """
    Generates a csv file with features extracted from instances according to data-driven DD model.

    Row layout: ID, text, pos, neg, one column per corpus POS tag, label.
    Each POS column holds the instance's count of that tag divided by
    inst.get_length(), or 0 when the tag does not occur in the instance.

    :param feature_csv: path of the csv file to write
    :param pos_lexicon: positive lexicon, handed to self.get_lexicon_features
    :param neg_lexicon: negative lexicon, handed to self.get_lexicon_features
    :param postag_instances: optional instances that define the POS tag columns;
        when falsy, self.instances is used instead
    :return: tuple (feature_csv, corpus_postag_set)
    """
    # all tags in corpus, as a list (it is concatenated into the header row below)
    tag_source = postag_instances if postag_instances else self.instances
    corpus_postag_set = Corpus.get_postag_set(tag_source)

    # Binary mode is what the Python 2 csv module expects; this method is
    # Python 2 code (it relies on the `unicode` builtin below).
    with open(feature_csv, 'wb') as f:
        wr = csv.writer(f)
        wr.writerow(["ID", "text", "pos", "neg"] + corpus_postag_set + ["label"])
        # Row IDs start at 1.
        for row_id, inst in enumerate(self.instances, 1):
            tokens_list = list(inst.get_tokens())  # tokens as objects
            inst_postag_counter = Counter(token.get_tag() for token in tokens_list)
            inst_length = inst.get_length()
            # float() forces true division: under Python 2, int / int would
            # truncate every share below 1 to 0. The membership test is kept
            # so that no division happens for tags absent from the instance.
            postag_percent = [
                float(inst_postag_counter[tag]) / inst_length
                if tag in inst_postag_counter else 0
                for tag in corpus_postag_set
            ]
            pos_neg_list = self.get_lexicon_features(tokens_list, pos_lexicon, neg_lexicon)
            wr.writerow(
                [unicode(row_id).encode("utf-8"),
                 unicode(inst.get_text()).encode("utf-8"),
                 unicode(pos_neg_list[0]).encode("utf-8"),
                 unicode(pos_neg_list[1]).encode("utf-8")]
                + postag_percent
                + [unicode(inst.get_label_gold()).encode("utf-8")])
    return feature_csv, corpus_postag_set
def generate_combined_features(self, feature_csv):
    """
    Build the feature matrix and label vector from a feature csv.

    The csv is loaded with pandas; its "text" column is turned into a
    bag-of-words matrix by a CountVectorizer (binary=True, 1- and 2-grams)
    and stacked column-wise with the "pos", "neg" and POS tag columns.
    The POS tag column names come from Corpus.get_postag_set(self.instances),
    so the csv has to contain a column for each of those tags.

    :param feature_csv: path of the csv file to read
    :return: tuple (X, y, vectorizer) where X is the stacked matrix in CSR
        format, y is the "label" column cast to float32 and vectorizer is
        the CountVectorizer fitted on the text column
    """
    frame = pd.read_csv(feature_csv)
    # Create vectorizer for function to use
    vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
    y = frame["label"].values.astype(np.float32)

    hstack = sp.sparse.hstack
    bow_matrix = vectorizer.fit_transform(frame.text)
    extra_columns = ['pos', 'neg'] + Corpus.get_postag_set(self.instances)
    extra_matrix = frame[extra_columns].values
    X = hstack((bow_matrix, extra_matrix), format='csr')
    return X, y, vectorizer
def generate_feature_csv(self, feature_csv, pos_lexicon, neg_lexicon, postag_instances=None):
    """
    Write a csv file with one feature row per instance in self.instances.

    Row layout: ID, text, pos, neg, one column per corpus POS tag, label.
    Each POS column holds the instance's count of that tag divided by
    inst.get_length(), or 0 when the tag does not occur in the instance.

    :param feature_csv: path of the csv file to write
    :param pos_lexicon: positive lexicon, handed to self.get_lexicon_features
    :param neg_lexicon: negative lexicon, handed to self.get_lexicon_features
    :param postag_instances: optional instances that define the POS tag columns;
        when falsy, self.instances is used instead
    :return: tuple (feature_csv, corpus_postag_set)
    """
    # all tags in corpus, as a list (it is concatenated into the header row below)
    if postag_instances:
        corpus_postag_set = Corpus.get_postag_set(postag_instances)
    else:
        corpus_postag_set = Corpus.get_postag_set(self.instances)

    # ID, text, pos_feature, neg_feature, percentages for all corpus tags, label
    with open(feature_csv, 'wb') as f:
        wr = csv.writer(f)
        wr.writerow(["ID", "text", "pos", "neg"] + corpus_postag_set + ["label"])
        # Row IDs start at 1.
        for row_id, inst in enumerate(self.instances, 1):
            inst_postag_counter = Counter(token.get_tag() for token in inst.get_tokens())
            inst_length = inst.get_length()
            # float() forces true division: with Python 2 integer operands,
            # count / length would truncate every share below 1 to 0. The
            # membership test is kept so that no division happens for tags
            # absent from the instance.
            postag_percent = [
                float(inst_postag_counter[tag]) / inst_length
                if tag in inst_postag_counter else 0
                for tag in corpus_postag_set
            ]
            text = inst.get_text()
            pos_neg_list = self.get_lexicon_features(text, pos_lexicon, neg_lexicon)
            wr.writerow(
                [row_id, text, pos_neg_list[0], pos_neg_list[1]]
                + postag_percent
                + [inst.get_label_gold()])
    return feature_csv, corpus_postag_set
def generate_combined_features(self, feature_csv):
    """
    Combine bag-of-words features with the self-extracted csv features.

    Reads feature_csv into a pandas DataFrame, fits a CountVectorizer
    (binary=True, ngram_range=(1, 2)) on its "text" column and horizontally
    stacks the result with the "pos", "neg" and POS tag columns. The POS tag
    column names are taken from Corpus.get_postag_set(self.instances), so
    each of them must be present in the csv.

    :param feature_csv: path of the csv file to read
    :return: tuple (X, y, vectorizer): X is the combined matrix in CSR format,
        y the "label" column as float32, vectorizer the fitted CountVectorizer
    """
    table = pd.read_csv(feature_csv)  # pandas Data Frame object

    # CountVectorizer constructs the BOW model from the text column
    vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
    labels = table["label"].values
    y = labels.astype(np.float32)

    # combine BOW model from Count Vectorizer with self-extracted features
    stack_columns = sp.sparse.hstack
    text_features = vectorizer.fit_transform(table.text)
    handcrafted_names = ['pos', 'neg'] + Corpus.get_postag_set(self.instances)
    handcrafted_features = table[handcrafted_names].values
    X = stack_columns((text_features, handcrafted_features), format='csr')
    return X, y, vectorizer