Ejemplo n.º 1
0
def feature_extractor(X):
    """Apply the attribute templates to X, then add punct/class features.

    Each item of X is indexed with 'c' (read) and 'F' (a list that feature
    strings are appended to). For every item, "punct=" + is_punct(item['c'])
    and then "class=" + char_class(item['c']) are appended to item['F'].
    X is modified in place and nothing is returned.
    """
    # Apply attribute templates to obtain features.
    crfutils.apply_templates(X, templates)
    # Explicit loops instead of map(): map() is lazy in Python 3, so a bare
    # map() call used only for its side effects would never run the appends.
    # Add the is_punct() feature.
    for x in X:
        x['F'].append("punct=" + is_punct(x['c']))
    # Add the char_class() feature.
    for x in X:
        x['F'].append("class=" + char_class(x['c']))
def feature_extractor(X):
    """Apply the attribute templates to X and mark the sequence boundaries.

    After the templates are applied, '__BOS__' is appended to the 'F' list
    of the first item and '__EOS__' to the 'F' list of the last item. An
    empty X receives no boundary markers. Nothing is returned.
    """
    crfutils.apply_templates(X, templates)
    if not X:
        return
    # Boundary features are added by hand once the templates have run.
    for item, marker in ((X[0], '__BOS__'), (X[-1], '__EOS__')):
        item['F'].append(marker)
def feature_extractor(X):
    """Run observation() on every item, apply the templates, mark the ends.

    The first item's 'F' list gets '__BOS__' and the last item's 'F' list
    gets '__EOS__' when X is non-empty. Nothing is returned.
    """
    for item in X:
        observation(item)
    crfutils.apply_templates(X, templates)
    if not X:
        return
    head, tail = X[0], X[-1]
    head['F'].append('__BOS__')
    tail['F'].append('__EOS__')
Ejemplo n.º 4
0
def feature_extractor(X):
    """Apply the attribute templates to X, then add punct/class features.

    For every item of X, "punct=" + is_punct(item['c']) and then
    "class=" + char_class(item['c']) are appended to the item's 'F' list.
    X is modified in place and nothing is returned.
    """
    # Apply attribute templates to obtain features.
    crfutils.apply_templates(X, templates)
    # A plain for loop is used because map() returns a lazy iterator in
    # Python 3; calling it only for side effects would append nothing.
    # Add the is_punct() feature.
    for x in X:
        x['F'].append("punct=" + is_punct(x['c']))
    # Add the char_class() feature.
    for x in X:
        x['F'].append("class=" + char_class(x['c']))
Ejemplo n.º 5
0
def feature_extractor(X):
    """Apply the attribute templates to X and add BOS/EOS boundary features.

    When X is non-empty, '__BOS__' is appended to X[0]['F'] and '__EOS__'
    to X[-1]['F']. Nothing is returned.
    """
    crfutils.apply_templates(X, templates)
    if X:
        # (marker, position) pairs: BOS on the first item, EOS on the last.
        boundary_markers = (('__BOS__', 0), ('__EOS__', -1))
        for marker, position in boundary_markers:
            X[position]['F'].append(marker)
Ejemplo n.º 6
0
def FeatureExtractor(X):
    """Apply the attribute templates and Featurizer to X, then mark its ends.

    '__BOS__' goes onto the first item's 'F' list and '__EOS__' onto the
    last item's 'F' list; an empty X is left without markers.
    """
    crfutils.apply_templates(X, templates)
    Featurizer(X)

    if not X:
        return
    first_item, last_item = X[0], X[-1]
    first_item['F'].append('__BOS__')  # beginning-of-sequence feature
    last_item['F'].append('__EOS__')  # end-of-sequence feature
Ejemplo n.º 7
0
def FeatureExtractor(X):
    """Apply templates, run Featurizer, and add BOS/EOS features to X.

    Calls crfutils.apply_templates(X, templates) followed by Featurizer(X).
    If X is non-empty, appends '__BOS__' to X[0]['F'] and '__EOS__' to
    X[-1]['F']. Nothing is returned.
    """
    crfutils.apply_templates(X, templates)
    Featurizer(X)

    if not X:
        return
    for position, marker in ((0, '__BOS__'), (-1, '__EOS__')):
        X[position]['F'].append(marker)
Ejemplo n.º 8
0
def FeatureExtractor(X):
    """Template X, pass it through Featurizer, then tag the boundary items.

    The boundary tags are '__BOS__' on the first item's 'F' list and
    '__EOS__' on the last item's 'F' list; they are only added when X is
    non-empty. Nothing is returned.
    """
    crfutils.apply_templates(X, templates)
    Featurizer(X)

    if X:
        bos_features = X[0]['F']
        bos_features.append('__BOS__')
        eos_features = X[-1]['F']
        eos_features.append('__EOS__')
Ejemplo n.º 9
0
def feature_extractor(X):
    """Add observations, apply the feature templates, and mark BOS/EOS.

    observation() is called on each item of X before the templates run.
    Afterwards '__BOS__' is appended to the first item's 'F' list and
    '__EOS__' to the last item's, provided X is non-empty.
    """
    # Observations first: they run before the templates are applied.
    for item in X:
        observation(item)

    crfutils.apply_templates(X, templates)

    if not X:
        return
    # Boundary features are appended manually.
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
Ejemplo n.º 10
0
def feature_extractor(X):
    """Build features for X: observations, templates, disjunctive, BOS/EOS.

    Steps, in order: observation() on every item; the feature templates;
    disjunctive() on field 'w' for the ranges (-4, -1) and (1, 4) at every
    position; and finally the '__BOS__' / '__EOS__' markers on the first
    and last items' 'F' lists when X is non-empty.
    """
    for item in X:
        observation(item)

    crfutils.apply_templates(X, templates)

    # Disjunctive features over the left (-4..-1) and right (1..4) ranges.
    for position in range(len(X)):
        for start, end in ((-4, -1), (1, 4)):
            disjunctive(X, position, 'w', start, end)

    if not X:
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
Ejemplo n.º 11
0
def feature_extractor(X):
    """Annotate X with observations, template, disjunctive and edge features.

    observation() runs on each item, then crfutils.apply_templates(), then
    disjunctive() is called twice per position on field 'w' (ranges -4..-1
    and 1..4). A non-empty X finally gets '__BOS__' on its first item's
    'F' list and '__EOS__' on its last item's.
    """
    for token in X:
        observation(token)

    crfutils.apply_templates(X, templates)

    n_items = len(X)
    for idx in range(n_items):
        disjunctive(X, idx, 'w', -4, -1)  # preceding range
        disjunctive(X, idx, 'w', 1, 4)  # following range

    if X:
        edges = ((X[0], '__BOS__'), (X[-1], '__EOS__'))
        for item, marker in edges:
            item['F'].append(marker)
Ejemplo n.º 12
0
    def feature_extractor(self, X):
        """Populate the features of X in place.

        Order of operations: self.observation() per item, the regex and
        gazetteer observation generators, the feature templates, per-position
        disjunctive features on field 'w' (plus regexp_features() when
        self.use_fregex is set), and the '__BOS__' / '__EOS__' markers on the
        first and last items' 'F' lists when X is non-empty.
        """
        for item in X:
            self.observation(item)

        gen_regex_observation(X)
        gen_gazetteer_observation(X, gazetteer=self.gazetteer)

        crfutils.apply_templates(X, self.templates)

        for position in range(len(X)):
            # Disjunctive features over the ranges -4..-1 and 1..4.
            for start, end in ((-4, -1), (1, 4)):
                disjunctive(X, position, 'w', start, end)

            if self.use_fregex:
                regexp_features(X, position)

        if not X:
            return
        X[0]['F'].append('__BOS__')
        X[-1]['F'].append('__EOS__')
Ejemplo n.º 13
0
def feature_extractor(X):
    """Apply the templates to X, then add the punct feature to every item.

    For each item, "punct=" + is_punct(item['c']) is appended to the item's
    'F' list. X is modified in place and nothing is returned.
    """
    # Apply templates to obtain features.
    crfutils.apply_templates(X, templates)
    # Add the is_punct() feature. A for loop is required here: map() is
    # lazy in Python 3, so a bare map() call would never perform the appends.
    for x in X:
        x['F'].append("punct=" + is_punct(x['c']))
Ejemplo n.º 14
0
def feature_extractor(sentence):
    """Apply attribute_templates to sentence via crfutils.apply_templates.

    Nothing is returned. NOTE(review): apply_templates presumably updates
    sentence in place -- confirm against crfutils.
    """
    crfutils.apply_templates(sentence, attribute_templates)
Ejemplo n.º 15
0
def feature_extractor(X):
    """Apply the templates to X, then add the punct feature to every item.

    Each item's 'F' list receives "punct=" + is_punct(item['c']). X is
    modified in place and nothing is returned.
    """
    # Apply templates to obtain features.
    crfutils.apply_templates(X, templates)
    # Add the is_punct() feature with an explicit loop; map() used only for
    # side effects is never consumed in Python 3 and would append nothing.
    for x in X:
        x['F'].append("punct=" + is_punct(x['c']))
Ejemplo n.º 16
0
    def extract_features(self, seq, content_type):
        """Build one tab-separated feature line per inner token of seq.

        Args:
            seq: indexable sequence of tokens. Each token is read as
                token[0] (word), token[1] (POS tag, only when the postag
                flag is on) and token[2] (label placed at the start of the
                output line). The first two and last two positions are
                skipped and only read as context, so seq is presumably
                padded with two tokens on each side -- TODO confirm against
                the caller.
            content_type: 'Blog', 'SocialMediaPosting', 'NewsArticle' or
                'Post'. Selects the parameter prefix ('blog', 'tweet',
                'news', 'dw') used to look up the feature flags, and the
                entry of self.word_to_embedding used for embeddings.

        Returns:
            A list of strings, one per inner token: the label followed by
            the token's features, all separated by tabs.

        Raises:
            ValueError: if content_type is not one of the supported values
                (previously this surfaced as an UnboundLocalError on the
                first flag that was read).
        """
        param_prefixes = {
            'Blog': 'blog',
            'SocialMediaPosting': 'tweet',
            'NewsArticle': 'news',
            'Post': 'dw',
        }
        if content_type not in param_prefixes:
            raise ValueError("unsupported content_type: %r" % (content_type,))
        prefix = param_prefixes[content_type]

        def flag(name):
            # Read the boolean parameter '<prefix>.<name>'.
            return self.params.get_boolean('%s.%s' % (prefix, name))

        use_brown_clusters = flag('use_brown_clusters')
        use_traditional_features = flag('use_traditional_features')
        use_idf_word_features = flag('use_idf_word_features')
        use_embeddings = flag('use_embeddings')
        use_lowercase_embeddings = flag('use_lowercase_embeddings')
        use_postag = flag('use_postag')

        # Five-token window around the current position and the feature-name
        # prefix used for each window slot.
        offsets = (-2, -1, 0, 1, 2)
        slot_names = ("U00", "U01", "U02", "U03", "U04")

        def embedding_features(i, tag, lowercase):
            # Features "<slot><tag><j>-<d>=1:<value>" for every embedding
            # table j, window slot and dimension d around position i.
            # TODO : try changing the %g to %f
            feats = []
            for j, embedding in enumerate(self.word_to_embedding[content_type]):
                for name, offset in zip(slot_names, offsets):
                    w = seq[i + offset][0]
                    if lowercase:
                        w = w.lower()
                    if w not in embedding:
                        # default all OOV words to the UNKNOWN embeddings
                        w = "*UNKNOWN*"
                    for d, value in enumerate(embedding[w]):
                        feats.append("%s%s%d-%d=1:%g" % (name, tag, j, d, value))
            return feats

        if use_traditional_features:
            ner_seq = [{'w': x[0], 'F': []} for x in seq]
            for x in ner_seq:
                ner.observation(x)
            crfutils.apply_templates(ner_seq, self.templates)
            # ner_seq is the same len as seq; ner_seq[i]['F'] is a list of
            # features.

        ret = []
        for i in range(2, len(seq) - 2):
            fs = []  # list of features for this word

            if use_traditional_features:
                fs.extend(ner_seq[i]['F'])

            if use_idf_word_features:
                # subtract two to get "real" index
                fs.extend(self.get_idf_word_features(seq[i][0], i - 2))

            # This will only be non-empty if word lists are specified
            fs.extend(self.get_word_list_features(seq[i][0]))

            if use_brown_clusters:
                # Size is the number of bits in the brown prefix returned
                for size in (8, 12, 16, 20):
                    # Get results for five-word window
                    for index in offsets:
                        bc = self.get_brown_prefix(seq[i + index][0], size)
                        if bc:
                            fs.append("brown.%d.%d=%s" % (size, index, bc))

            if use_embeddings:
                fs.extend(embedding_features(i, "e", False))

            if use_lowercase_embeddings:
                fs.extend(embedding_features(i, "lce", True))

            if use_postag:
                # POS tags of the five-token window: unigrams, bigrams and
                # trigrams.
                l2, l1, cur, r1, r2 = [seq[i + k][1] for k in offsets]
                fs.append('U10=%s' % l2)
                fs.append('U11=%s' % l1)
                fs.append('U12=%s' % cur)
                fs.append('U13=%s' % r1)
                fs.append('U14=%s' % r2)
                fs.append('U15=%s/%s' % (l2, l1))
                fs.append('U16=%s/%s' % (l1, cur))
                fs.append('U17=%s/%s' % (cur, r1))
                fs.append('U18=%s/%s' % (r1, r2))
                fs.append('U20=%s/%s/%s' % (l2, l1, cur))
                fs.append('U21=%s/%s/%s' % (l1, cur, r1))
                fs.append('U22=%s/%s/%s' % (cur, r1, r2))

            # example-label , followed by feature vector
            ret.append("%s\t%s" % (seq[i][2], '\t'.join(fs)))

        return ret
Ejemplo n.º 17
0
 def feature_extractor(X):
     """Apply the templates to X and append the BOS/EOS boundary features.

     '__BOS__' is appended to X[0]['F'] and '__EOS__' to X[-1]['F'] when X
     is non-empty. Nothing is returned.
     """
     crfutils.apply_templates(X, templates)
     if not X:
         return
     first_features = X[0]['F']
     first_features.append('__BOS__')
     last_features = X[-1]['F']
     last_features.append('__EOS__')
Ejemplo n.º 18
0
    # Map each feature name (first element of a feature_set entry) to its
    # templates (second element), preserving the order of feature_set.
    features = OrderedDict()
    for i in feature_set:
        features[i[0]] = i[1]

    # Features listed in trigrams get templates at offsets -1, 0 and +1.
    # Iterate over a snapshot of the keys since values are reassigned.
    for w in list(features):
        if w in trigrams:
            features[w] = [[[w, -1]], [[w, 0]], [[w, 1]]]

    feature_keys = features.keys()
    feature_items = features.values()
    # Column layout handed to crfutils.main: the feature names followed by
    # 'chunk' and 'y'.
    input_columns = ' '.join(feature_keys) + ' chunk y'

    # Flatten the per-feature template lists into one list.
    attribute_templates = []
    for i in feature_items:
        attribute_templates += i

    print("Using features: {}.".format(str(attribute_templates)))

    feature_extractor = lambda x: crfutils.apply_templates(x, attribute_templates)

    for fi, txt in [(train_csv, "train"), (devel_csv, "devel"), (test_csv, "test")]:
        write_to = path.join(trigram_path, txt + "_trigrams_" + str(mode) + ".crfsuite")
        print("Writing to {}...".format(write_to))
        # The with-block closes the output file even if crfutils.main raises.
        with open(write_to, "w+") as fo:
            crfutils.main(feature_extractor, fi, fo, fields=input_columns, sep='\t')

# Close the three input files (train, devel, test) that were passed to
# crfutils.main as inputs above.
train_csv.close()
devel_csv.close()
test_csv.close()
Ejemplo n.º 19
0
def feature_extractor(X):
    """Apply templates to X via crfutils.apply_templates; returns nothing.

    NOTE(review): apply_templates presumably updates X in place -- confirm
    against crfutils.
    """
    # Apply attribute templates to obtain features (in fact, attributes)
    crfutils.apply_templates(X, templates)