예제 #1
0
    def generate_feature_csv(self, feature_csv, pos_lexicon, neg_lexicon, postag_instances=None):
        """
        Generates a csv file with features extracted from instances according to the data-driven (DD) model.

        Row layout: ID, text, pos, neg, one column per corpus POS tag, label.
        Each POS-tag column holds the tag's count in the instance divided by
        inst.get_length(), or 0 when the tag does not occur in the instance.

        :param feature_csv: path of the csv file to write
        :param pos_lexicon: forwarded to get_lexicon_features as the positive lexicon
        :param neg_lexicon: forwarded to get_lexicon_features as the negative lexicon
        :param postag_instances: optional instances from which the POS-tag columns are
            collected; when None or empty, self.instances is used instead
        :return: tuple (feature_csv, corpus_postag_set)
        """
        # All tags in the corpus, as a list; these become the POS-tag columns.
        tag_source = postag_instances if postag_instances else self.instances
        corpus_postag_set = Corpus.get_postag_set(tag_source)

        # Binary mode is what the Python 2 csv module expects for its output file.
        with open(feature_csv, 'wb') as f:
            wr = csv.writer(f)
            wr.writerow(["ID", "text", "pos", "neg"] + corpus_postag_set + ["label"])
            for row_id, inst in enumerate(self.instances, 1):
                tokens_list = list(inst.get_tokens())  # tokens as objects
                inst_postag_counter = Counter(token.get_tag() for token in tokens_list)
                # float() forces true division: under Python 2, int / int truncates,
                # which would collapse every share to 0 or 1.
                postag_percent = [
                    float(inst_postag_counter[tag]) / inst.get_length()
                    if tag in inst_postag_counter else 0
                    for tag in corpus_postag_set]
                pos_neg_list = self.get_lexicon_features(tokens_list, pos_lexicon, neg_lexicon)
                # Text cells are encoded to UTF-8 bytes before being handed to the writer.
                wr.writerow(
                    [unicode(row_id).encode("utf-8"),
                     unicode(inst.get_text()).encode("utf-8"),
                     unicode(pos_neg_list[0]).encode("utf-8"),
                     unicode(pos_neg_list[1]).encode("utf-8")]
                    + postag_percent
                    + [unicode(inst.get_label_gold()).encode("utf-8")])
        return feature_csv, corpus_postag_set
    def generate_combined_features(self, feature_csv):
        """
        Build the feature matrix and label vector from a feature csv file.

        A binary unigram+bigram bag-of-words matrix fitted on the "text" column
        is stacked column-wise with the 'pos', 'neg' and POS-tag columns of the csv.

        :param feature_csv: csv file to load with pandas.read_csv
        :return: tuple (X, y, vectorizer) where X is the stacked CSR matrix, y the
            "label" column as float32 and vectorizer the fitted CountVectorizer
        """
        frame = pd.read_csv(feature_csv)
        vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
        y = frame["label"].values.astype(np.float32)

        # Bag-of-words part first, then the hand-crafted columns, in that order.
        bag_of_words = vectorizer.fit_transform(frame.text)
        extra_columns = ['pos', 'neg'] + Corpus.get_postag_set(self.instances)
        handcrafted = frame[extra_columns].values

        X = sp.sparse.hstack((bag_of_words, handcrafted), format='csr')
        return X, y, vectorizer
    def generate_feature_csv(self,
                             feature_csv,
                             pos_lexicon,
                             neg_lexicon,
                             postag_instances=None):
        """
        Write one csv row of features per instance in self.instances.

        Row layout: ID, text, pos, neg, one column per corpus POS tag, label.
        Each POS-tag column holds the tag's count in the instance divided by
        inst.get_length(), or 0 when the tag does not occur in the instance.

        :param feature_csv: path of the csv file to write
        :param pos_lexicon: forwarded to get_lexicon_features as the positive lexicon
        :param neg_lexicon: forwarded to get_lexicon_features as the negative lexicon
        :param postag_instances: optional instances from which the POS-tag columns are
            collected; when None or empty, self.instances is used instead
        :return: tuple (feature_csv, corpus_postag_set)
        """
        # All tags in the corpus, as a list; these become the POS-tag columns.
        tag_source = postag_instances if postag_instances else self.instances
        corpus_postag_set = Corpus.get_postag_set(tag_source)

        # Binary mode is what the Python 2 csv module expects for its output file.
        with open(feature_csv, 'wb') as f:
            wr = csv.writer(f)
            wr.writerow(["ID", "text", "pos", "neg"] + corpus_postag_set +
                        ["label"])
            for row_id, inst in enumerate(self.instances, 1):
                inst_postag_counter = Counter(
                    token.get_tag() for token in inst.get_tokens())
                # float() forces true division: under Python 2, int / int truncates,
                # which would collapse every share to 0 or 1.
                postag_percent = [
                    float(inst_postag_counter[tag]) / inst.get_length()
                    if tag in inst_postag_counter else 0
                    for tag in corpus_postag_set]
                text = inst.get_text()
                pos_neg_list = self.get_lexicon_features(
                    text, pos_lexicon, neg_lexicon)
                wr.writerow(
                    [row_id, text, pos_neg_list[0], pos_neg_list[1]] +
                    postag_percent + [inst.get_label_gold()])
        return feature_csv, corpus_postag_set
예제 #4
0
 def generate_combined_features(self, feature_csv):
     """
     Combine bag-of-words features with the self-extracted csv features.

     :param feature_csv: csv file to load with pandas.read_csv
     :return: tuple (X, y, vectorizer): X is the CSR matrix of bag-of-words
         columns followed by the 'pos', 'neg' and POS-tag columns, y is the
         "label" column as float32, vectorizer is the fitted CountVectorizer
     """
     table = pd.read_csv(feature_csv)  # pandas DataFrame
     # Binary bag-of-words model over unigrams and bigrams.
     vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
     y = table["label"].values.astype(np.float32)

     text_matrix = vectorizer.fit_transform(table.text)
     # Self-extracted features: lexicon columns plus one column per corpus POS tag.
     feature_names = ['pos', 'neg'] + Corpus.get_postag_set(self.instances)
     blocks = (text_matrix, table[feature_names].values)
     X = sp.sparse.hstack(blocks, format='csr')
     return X, y, vectorizer