def featurize_file(filename):
    """Extract features from *filename* and collect the distinct 'y' values.

    The file is opened in text mode and handed to ``crfutils.get_features``
    (space-separated fields, using the module-level ``feature_extractor``
    and ``fields``).

    Args:
        filename: Path of the file to read.

    Returns:
        A ``(X, labels)`` tuple where ``X`` is whatever
        ``crfutils.get_features`` returned and ``labels`` is a list of the
        distinct ``entry['y']`` values found in it (order is unspecified,
        since they are gathered through a set).
        NOTE(review): presumably 'y' holds the token label -- confirm
        against crfutils.
    """
    # Keep both the extraction and the traversal inside the ``with`` block:
    # if get_features reads lazily, the handle must stay open until every
    # entry has been consumed. The handle is closed on exit in all cases.
    with open(filename, 'r') as fi_handler:
        X = crfutils.get_features(feature_extractor, fields=fields,
                                  sep=' ', fi=fi_handler)
        Y = set()
        for x in X:
            Y.update(entry['y'] for entry in x)
    return X, list(Y)
if __name__ == '__main__':
    # Script entry point: extract features, hash them, and print the result.
    # get_features is called without ``fi`` here, so the input source is
    # whatever crfutils.get_features uses by default.
    X = crfutils.get_features(feature_extractor, fields=fields, sep=separator)

    # Apply the hashing trick.
    # NOTE(review): ``non_negative`` is not accepted by newer scikit-learn
    # releases -- confirm the installed version before upgrading.
    hf = FeatureHasher(input_type='string', non_negative=True)

    # Flatten every entry's 'F' feature list into one list.
    # NOTE(review): this yields a flat list of strings, so each string is
    # passed to the hasher as its own sample; confirm this is intended
    # rather than one sample per entry (doc.append(entry['F'])).
    doc = []
    for x in X:
        for entry in x:
            doc.extend(entry['F'])

    # Parenthesised form prints identically on Python 2 and is also valid
    # syntax on Python 3.
    print(hf.transform(doc))