class HashSarca(Sarcalingua): def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")): self.featureExtractor = FeatureHasher(pow(2,nbits), input_type="pair") self.classifier = model self.outEncoder = LabelEncoder() self.drop_outs = set(( u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony", u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique", u"uncyclopedia", u"wikipedia")) def extractFeatures(self, clean_text): return self.featureExtractor.transform( (token_pattern.finditer(clean_text),) ) def corpusToDataset(self, chunkIterator, column_label, HTML=False, **args): def prepare(raw_text): tokens = token_pattern.findall(self.sanitize(raw_text, HTML)) if random.random() < 0.5: # we delete the drop-outs half the time tokens = [tok for tok in tokens if tok not in self.drop_outs] try: alpha = 1./len(tokens) #1./(1+log(len(tokens))) return ((tok.lower(), alpha) for tok in tokens) except ZeroDivisionError: return tuple() for chunk in chunkIterator: X = self.featureExtractor.transform(imap(prepare, chunk.text)) y = np.array(self.outEncoder.fit_transform(chunk[column_label])) yield X,y gc.collect()
class QClassifierImpl: """ A wrapper for question classifier """ def __init__(self, train_data_path, pred_qs = None): """ Constructor """ logging.basicConfig(level = logging.DEBUG, format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s', datefmt='%a, %d %b %Y %H:%M:%S', filename='qclassifier.log', filemode='w') reload(sys) sys.setdefaultencoding('utf8') self.clf = None self.path = train_data_path self.pred_qs = pred_qs self.extractor = FeatureExtractor() self.features = None self.labels = None self.vectorizer = None self.cate = ['Person', 'Number', 'Location', 'Other'] def train(self): """ Train use all of the given data """ self.extractor.load(path = self.path) self.features = self.extractor.extract_features() self.labels = self.extractor.get_labels() self.clf = QClassifier(questions = self.extractor.questions) assert(len(self.labels) == len(self.features)) X = self.features Y = self.labels self.vectorizer = FeatureHasher(input_type = 'string', non_negative = True) X = self.vectorizer.transform(X) Y = asarray(Y) logging.info('start training') self.clf.train(X, Y) logging.info('done') def get_type(self, question): """ Get type for a given question """ if not self.features or not self.labels: logging.error('You need to train model first!') return None if not question: logging.error('Question should not be None') return None f = [self.extractor.extract_features_aux(question)] f = self.vectorizer.transform(f) # print self.clf.predict(f) return self.cate[self.clf.predict(f)[0]]
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and hash them.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens (see below)
        and an index into this list.
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra
        columns containing information like part of speech tags.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []
    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
def io(): hv = FeatureHasher() target = [] train_int = [] train_label = [] for iline in dio.io(): iline = iline.strip().split(',') t = int(iline[0]) int_fs = map(lambda i: numpy.NaN if not i else int(i), iline[1:14]) label_fs = [k for k in iline[14:]] #label_fs = ",".join(iline[14:]) # print int_fs, label_fs target.append(t) train_int.append(int_fs) train_label.append({k:1 for k in label_fs if k}) # print train_int imp = Imputer(missing_values='NaN', strategy='mean', axis=0) train_int = imp.fit_transform(train_int) # print train_int scaler = preprocessing.StandardScaler().fit(train_int) train_int = scaler.transform(train_int) # print train_int train_int = csr_matrix(train_int) # print train_label train_label = hv.transform(train_label) train = hstack((train_int, train_label)) # print train_label # print train return target, train
def to_ffm(df, outfile, ycol, num_columns = []): df = df.copy() one_based = True hasher = FeatureHasher(input_type='string', non_negative=True) bs = 2**10 value_pattern = u'%d:%d:%.16g' line_pattern = u'%d %s\n' with open(outfile, 'w') as out: pb = progressbar.ProgressBar(maxval=(df.shape[0]+bs+1) // bs).start() for i in xrange((df.shape[0]+bs+1) // bs): pb.update(i) s = slice(i*bs, (i+1)*bs) if ycol in df.columns: Xh = np.asarray(df.iloc[s].drop([ycol], axis=1).drop(num_columns,axis=1).astype('str')) Xv = np.asarray(df.iloc[s][num_columns].astype('float')) y = df.iloc[s][ycol].values.astype('int') else: Xh = np.asarray(df.iloc[s].drop(num_columns,axis=1).astype('str')) Xv = np.asarray(df.iloc[s][num_columns].astype('float')) y = np.zeros((bs,)) Xt = scipy.sparse.hstack([Xv,hasher.transform(Xh)]).tocsr() for j in xrange(Xt.shape[0]): span = slice(Xt.indptr[j], Xt.indptr[j+1]) row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span]) st = " ".join(value_pattern % (j + one_based, fe + one_based, x) for j, fe, x in row if np.isnan(x) == False) feat = (y[j], st) out.write((line_pattern % feat).encode('ascii')) pb.finish()
def process_records(records, fields, target, textmodel=None): tokenize = CountVectorizer().build_analyzer() input = None X = None y_labels = [] for i, record in enumerate(records): nums = [] strs = [] y_labels.append(record.get(target)) for field in fields: if is_number(record.get(field)): nums.append(record[field]) else: strs.append(str(record.get(field) or "").lower()) if strs: if input is None: input = StringIO.StringIO() print >> input, " ".join(tokenize(" ".join(strs))) if nums: if X is None: X = sp.lil_matrix((len(records),len(nums))) X[i] = np.array(nums, dtype=np.float64) if input is not None: if X is not None: X_2 = X.tocsr() else: X_2 = None if isinstance(textmodel,basestring): if textmodel == 'lsi': corpus = TextCorpus(input) textmodel = LsiModel(corpus, chunksize=1000) elif textmodel == 'tfidf': corpus = TextCorpus(input) textmodel = TfidfModel(corpus) elif textmodel == 'hashing': textmodel = None hasher = FeatureHasher(n_features=2 ** 18, input_type="string") input.seek(0) X = hasher.transform(tokenize(line.strip()) for line in input) if textmodel: num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[])) X = corpus2csc(textmodel[corpus], num_terms).transpose() if X_2 is not None: # print >> sys.stderr, "X SHAPE:", X.shape # print >> sys.stderr, "X_2 SHAPE:", X_2.shape X = sp.hstack([X, X_2], format='csr') elif X is not None: textmodel = None X = X.tocsr() print >> sys.stderr, "X SHAPE:", X.shape return X, y_labels, textmodel
def hash(mat, num_features):
    """ hashing trick """
    hasher = FeatureHasher(n_features=num_features, non_negative=True)
    X = hasher.transform(mat)
    X = X.toarray()
    return X
def test_feature_hasher_pairs():
    raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2},
                                     {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
def ner(tokens):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset."""
    global _model
    X = [_features(tokens, i) for i in range(len(tokens))]
    hasher = FeatureHasher(2 ** 16, input_type="string")
    return zip(tokens, _model.predict(hasher.transform(X)))
class Model: def __init__(self,numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True): #Init scikit models self.FH = FeatureHasher(n_features=numFeatures, input_type='string') self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle) def train(self, gen, v=False): i = 0 for x, y in gen: #For each batch xHash = self.FH.transform(x) #hash trick y = np.array(y) ## for epoch in range(numEpochs): self.Classifier.partial_fit(xHash, y, [0,1]) i += len(x) if v : print(str(datetime.now())[:-7] , "example:", i) def test(self, gen, v=False): #init target and prediction arrays ytot = np.array([]) ptot = np.array([]) #Get prediction for each batch i = 0 for x,y in gen: xHash = self.FH.transform(x) #hash trick p = self.Classifier.predict_proba(xHash) p = p.T[1].T #Keep column corresponding to probability of class 1 #Stack target and prediction for later analysis ytot = np.hstack((ytot, y)) ptot = np.hstack((ptot, p)) i += y.shape[0] if v : print(str(datetime.now())[:-7] , "example:", i) if v: print("Score:", self.score(ytot, ptot)) return (ytot, ptot) def predictBatch(self, batch): hashedBatch = self.FH.transform(batch) prediction = self.Classifier.predict_proba(hashedBatch) return prediction def generatePrediction(self, generator): for xBatch, idBatch in generator: prediction = self.predictBatch(xBatch) yield prediction, idBatch def score(self, target, prediction): return llfun(target, prediction)
def test_feature_hasher_pairs_with_string_values():
    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
                                       {"baz": u"abc", "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 1], x1_nz)
    assert_equal([1, 1, 4], x2_nz)

    raw_X = (iter(d.items()) for d in [{"bax": "abc"}, {"bax": "abc"}])
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = np.abs(x1[x1 != 0])
    x2_nz = np.abs(x2[x2 != 0])
    assert_equal([1], x1_nz)
    assert_equal([1], x2_nz)
    assert_equal(x1, x2)
def load_seq2seq(f, features, n_features=(2 ** 16)):
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []
    with _open(f) as f:
        raw_X = _sequences(f, features, labels, lengths)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
class ColumnHasherTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, col):
        self.col = col
        self.fh = FeatureHasher(n_features=1024, input_type='dict')

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        return self.fh.transform(
            df.loc[:, self.col].apply(lambda x: {x: 1}).values)
class GraphemeBasedModel(DiacriticsRestorationModel): def __init__(self, window=5, input_classes=None): self.window = window self.input_classes = input_classes def train(self, corpus, classes=None, chunk_size=100000): self.vectorizer = FeatureHasher(non_negative=True, n_features=len(classes)*2*self.window, input_type='pair') self.clf = MultinomialNB() i = 0 j = 0 X = [] Y = [] for x, y in corpus: if x[self.window][1] in self.input_classes: X.append(x) Y.append(y) i += 1 if i < chunk_size: continue j += 1 click.echo("Running iteration {}".format(j)) X = self.vectorizer.transform(X) self.clf.partial_fit(X, Y, classes) X = [] Y = [] i = 0 def restore(self, string): corpus = [] out = '' for x, y in string_to_grapheme_corpus(string, self.window): if x[self.window][1] in self.input_classes: x = self.vectorizer.transform([x]) out += self.clf.predict(x)[0] else: out += y return out
def hash_features(features, arm_ids, use_id=True):
    n_features = np.shape(features)[1]
    feature_names = [str(x) for x in np.arange(n_features)]
    all_features = []
    for arm_id, feature_set in zip(arm_ids, features):
        temp_features = zip(feature_names, feature_set)
        if use_id:
            temp_features.append(("id_" + str(arm_id), 1))
        all_features.append(temp_features)
    f = FeatureHasher(input_type='pair')
    return f.transform(all_features)
def encode_titles(titles, num_features=2**14):
    '''
    Encode the titles, formatted as strings, as numerical values using the
    'hashing trick'. The size of the feature vector can be specified using
    the num_features parameter.
    '''
    myHasher = FeatureHasher(input_type='string',
                             n_features=num_features,
                             non_negative=True)
    featureMatrix = myHasher.transform(titles)
    return featureMatrix, myHasher
def dump_libffm_format(X, y, f): one_based = True hasher = FeatureHasher(input_type='string', non_negative=True) Xt = hasher.transform(X) value_pattern = u'%d:%d:%.16g' line_pattern = u'%d %s\n' for i in xrange(Xt.shape[0]): span = slice(Xt.indptr[i], Xt.indptr[i+1]) row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span]) s = " ".join(value_pattern % (j + one_based, fe, x) for j, fe, x in row) feat = (y[i], s) f.write((line_pattern % feat).encode('ascii'))
class SequenceHasher(TransformerMixin):
    """Encodes a sequence xyz as xy, yz, z."""

    def __init__(self, base_feature_name="index"):
        self.base_feature_name = base_feature_name
        self.hasher = FeatureHasher(input_type="pair")

    def fit(self, X, y=None):
        # scikit-learn transformers are expected to return self from fit
        return self

    def transform(self, X, y=None):
        f_name = self.base_feature_name
        seq = (((f_name + str(i), v) for i, v in enumerate(x)) for x in X)
        return self.hasher.transform(seq)
def predictUserScore(self, body, tags, fgen, users): featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair') # document features featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)] # additional features featureVector.append(("Length", 1)) featureVector.append(("Score", 1)) featureVector.append(("Accepted", 1)) featureVector.append(("OwnerRep", 1)) X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]]) scores = [score for index, score in enumerate(self.cf.decision_function(X)[0]) if int(self.cf.classes_[index]) in users] return scores
def main(): # Uncomment the following line to use a larger set (11k+ documents) # categories = None print(__doc__) print("Usage: %s [n_features_for_hashing]" % sys.argv[0]) print("The default number of features is 2**18.") print() try: n_features = int(sys.argv[1]) except IndexError: n_features = 2 ** 18 except ValueError: print("not a valid number of features: %r" % sys.argv[1]) sys.exit(1) print("Loading 20 newsgroups training data") categories = loadCategories() raw_data, data_size_mb = loadData(categories) print("DictVectorizer") t0 = time() vectorizer = DictVectorizer() vectorizer.fit_transform(token_freqs(d) for d in raw_data) report(data_size_mb, len(vectorizer.get_feature_names()), t0) print("FeatureHasher on frequency dicts") t0 = time() hasher = FeatureHasher(n_features=n_features) X = hasher.transform(token_freqs(d) for d in raw_data) report(data_size_mb, n_nonzero_columns(X), t0) print("FeatureHasher on raw tokens") t0 = time() hasher = FeatureHasher(n_features=n_features, input_type="string") X = hasher.transform(tokens(d) for d in raw_data) report(data_size_mb, n_nonzero_columns(X), t0)
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and vectorize them.

    The CoNLL file format is a line-oriented text format that describes
    sequences in a space-separated format, separating the sequences with
    blank lines. Typically, the last space-separated part is a label.

    Since the space-separated parts are usually tokens (and maybe things
    like part-of-speech tags) rather than feature vectors, a function must
    be supplied that does the actual feature extraction. This function has
    access to the entire sequence, so that it can extract context features.

    A ``sklearn.feature_extraction.FeatureHasher`` (the "hashing trick") is
    used to map symbolic input feature names to columns, so this function
    does not remember the actual input feature names.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens l that
        represent a single sequence and an index i into this list, and must
        return an iterator over strings that represent the features of l[i].
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra
        columns containing information like part of speech tags.

    Returns
    -------
    X : scipy.sparse matrix, shape (n_samples, n_features)
        Samples (feature vectors), as a single sparse matrix.
    y : np.ndarray, dtype np.string, shape n_samples
        Per-sample labels.
    lengths : np.ndarray, dtype np.int32, shape n_sequences
        Lengths of sequences within (X, y). The sum of these is equal to
        n_samples.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []
    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
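# A minimal sketch of the `features` callable described in the docstring above.
# It is an illustrative assumption, not part of the original module: it yields
# string features for tokens[i], including a one-token context window, so that
# the FeatureHasher can map each feature string to a column.
def word_features(tokens, i):
    yield "word=" + tokens[i].lower()
    yield "prev=" + (tokens[i - 1].lower() if i > 0 else "<s>")
    yield "next=" + (tokens[i + 1].lower() if i + 1 < len(tokens) else "</s>")

# Hypothetical usage (file name is illustrative):
# X, y, lengths = load_conll("train.conll", word_features)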
class Model: def __init__(self,numFeatures, learningRate, mustShuffle=True): #Init scikit models self.FH = FeatureHasher(n_features=numFeatures, input_type='pair') self.Classifier = SGDClassifier(loss='log', alpha=learningRate, shuffle=mustShuffle) def train(self, gen, numEpochs, v=False): i = 0 for x, y in gen: #For each batch xHash = self.FH.transform(x) #hash trick y = np.array(y) for epoch in range(numEpochs): self.Classifier.partial_fit(xHash, y, [0,1]) if v and (i % (numBatches/60)) == 0: print(datetime.now(), "example:", i*sizeBatch) i+=1 def test(self, gen, v=False): #init target and prediction arrays ytot = np.array([]) ptot = np.array([]) #Get prediction for each batch for batch in gen: data = list(batch) #store batch in memory for prediction x, y = data[0], np.array(data[1]) x = self.FH.transform(x) p = self.Classifier.predict_proba(x) p = p.T[1].T #Keep column corresponding to probability of class 1 #Stack target and prediction for later analysis ytot = np.hstack((ytot, y)) ptot = np.hstack((ptot, p)) if v: print("Score:", self.score(ytot, ptot)) return (ytot, ptot) def score(self, target, prediction): return llfun(target, prediction)
def predictUsers(self, body, tags, fgen, n = 3): featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair') # document features featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)] # additional features featureVector.append(("Length", 1)) featureVector.append(("Score", 1)) featureVector.append(("Accepted", 1)) featureVector.append(("OwnerRep", 1)) X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]]) userIds = [int(self.cf.classes_[index]) for index, score in sorted(enumerate(self.cf.decision_function(X)[0]), key=lambda x:x[1], reverse=True)][:n] # print(userIds) # print(self.cf.predict(X)) return [Users.get(Users.id == userId) for userId in userIds]
def process(self): header = self.inputFile.readline() ids = [] self.features = [] count = 0 for line in self.inputFile: count += 1 fields = line.split(',') id = fields[0] names = {} name = Kaggle_Grupo.Utils.StringNormalize(fields[1]) for i in name.split(' '): names[i] = 1 ids.append(id) self.features.append(names) featureHasher = FeatureHasher(n_features=2**12, dtype=np.uint16) self.features = featureHasher.transform(self.features) self.features = self.features.toarray() self.features = self.encode(width=24) headerFields = ["Cliente_ID"] for i in range(self.features.shape[1]): headerFields.append('ClientName_{}'.format(i)) headerFields = "\t".join(headerFields) self.outputFile.write(headerFields+'\n') for i in range(self.features.shape[0]): self.outputFile.write('{}\t{}\n'.format(ids[i], ('\t'.join(self.features[i].astype('str')).replace('False', '0').replace('True', '1'))))
class FeatureHasherModel:
    def fit(self, max_features):
        self.model = FeatureHasher(input_type="string", n_features=max_features)

    def transform(self, dataframe, col_name):
        hashed = self.model.transform(dataframe)
        df = pd.DataFrame(hashed.toarray())
        df.columns = ["%s_%d" % (col_name, author_num)
                      for author_num in range(0, self.model.n_features)]
        df.index = dataframe.index
        return df

    def get_model(self):
        return self.model

    def set_model(self, model):
        self.model = model
def gen_cinput(origindata, pooldata = [],threshold = 5): origin_feas = gen_feature_data(origindata) pool_feas = gen_feature_data(pooldata) feas_X = [] label_Y = [] s = set() for seq in origin_feas: feas_X.extend([item["F"] for item in seq]) for item in seq: s.update(item["F"]) label_Y.extend([item["L"] for item in seq]) assert len(feas_X) == len(label_Y) print "original data data num : "+str(len(feas_X)) feas_X_2 = [] label_Y_2 = [] for seq_id, seq in enumerate(pool_feas): for token_id, token in enumerate(seq): if pooldata[seq_id][2][token_id] == 1: feas_X_2.append(token["F"]) s.update(token["F"]) label_Y_2.append(token["L"]) print "pool data data num : "+str(len(feas_X_2)) print "original feature num ................ "+str(len(s)) X = feas_X + feas_X_2 X = featurefilter(X, threshold) print X[:2] Y = label_Y + label_Y_2 h = FeatureHasher(input_type = "string", non_negative = True) X = h.transform(X) return X ,Y, h
def test_feature_hasher_strings():
    raw_X = [[u"foo", "bar", "baz", "foo"],  # note: duplicate
             [u"bar", "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, sum(len(set(x)) for x in raw_X))
def learn(self, fgen, postLimit=None): Parent = Posts.alias() query = Posts.select().join(Parent, on=(Posts.parentid == Parent.id)).where(Posts.posttypeid == 2 & Parent.forevaluation == 0) if postLimit is not None: query = query.limit(postLimit) count = query.count() print("Learning {0} questions".format(count)) allClasses = numpy.array([user.id for user in Users.select()]) maxUserRep = float(Users.select(peewee.fn.Max(Users.reputation)).scalar()) featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair') featureMatrix = [] classList = [] for i, answer in enumerate(query): if answer.owneruserid is None: continue print("Generating feature vector for id {0}".format(answer.id)) # docment features # featureVector = fgen.getDocumentFeatures(answer.parentid.title + answer.parentid.body + answer.body, tagIds) featureVector = fgen.getAnswerFeatures(answer) featureVector = [(str(dim), value) for dim, value in featureVector] # additional features maxScore = Posts.select(peewee.fn.Max(Posts.score)).where(Posts.parentid == answer.parentid).scalar() maxLength = max(len(post.body) for post in Posts.select().where(Posts.parentid == answer.parentid)) featureVector.append(("Length", (len(answer.body)/float(maxLength)))) featureVector.append(("Score", 1 if maxScore == 0 else (answer.score/float(maxScore)))) featureVector.append(("Accepted", 1 if answer.id == answer.parentid.acceptedanswerid else 0)) featureVector.append(("OwnerRep", answer.owneruserid.reputation/maxUserRep)) featureMatrix.append(featureVector) classList.append(answer.owneruserid.id) if len(featureMatrix) == self.batchSize or i == count-1: print("Partial fitting classifier".format(answer.id)) X = featureHasher.transform(featureMatrix) Y = numpy.array(classList) self.cf.partial_fit(X, Y, classes=allClasses) allClasses = None featureMatrix = [] classList = []
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")],
             ["bar".encode("ascii"), "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
def main(): # start timer time.clock() # open training data infile = open("../data/gold/simple_gold_revised.txt", "r") train_sents = infile.readlines() infile.close() train_sents = train_sents[100:] # open CMU training data infile = open("../data/cmu_all_gold.txt") cmu_train_sents = infile.readlines() infile.close() window = 1 num_corpora = 0 sent_tokens = preprocess(train_sents, window) cmu_sent_tokens = preprocess(cmu_train_sents, window) all_tokens = sent_tokens all_tokens.extend(cmu_sent_tokens[:len(cmu_sent_tokens) / 2]) del train_sents X, y = get_features(all_tokens, window) print('Got Features') del all_tokens features, results = get_feature_dict(X, y) print('Got Feature Dict') X_tweets = X[0:len(sent_tokens)] X_cmu = X[len(sent_tokens):] print('Split training Data') print('Training on Tweets...') from sklearn.feature_extraction import FeatureHasher hasher = FeatureHasher(input_type='string') X_new = [] for row in X_tweets: new_row = [] new_row.extend(row) for element in row: new_row.append(element + '_*tweet*') X_new.append(new_row) for row in X_cmu: new_row = [] new_row.extend(row) for element in row: new_row.append(element + '_*cmu*') X_new.append(new_row) x_vec = hasher.transform(X_new) y_vec = [] for y_i in y: new_y = 0 if y_i is not None and y_i in results: new_y = results[y_i] y_vec.append(new_y) clf = svm.LinearSVC(C=0.15) clf.fit(x_vec, y_vec) print('Done') print('Training on CMU...') print('Done') del X del y del sent_tokens ## this writes the classifier to a binary #from sklearn.externals import joblib #joblib.dump(clf, 'classifiers/cmu+gang_nn_hot.pkl') ## This reads the classifier from a binary #from sklearn.externals import joblib #clf = joblib.load('classifiers/cmu+gang_nn_daume.pkl') print('Trained Classifier') # open Corpus development data # infile = open("../data/content/content_revised_tokenized.txt", "r") infile = '../../data/gakirah/gakirah_aggress_loss.csv' print('Reading Dev') f = open(infile, 'rU') reader = csv.DictReader(f) train_Dev = [] for row in reader: tweet = row['CONTENT'].decode('utf-8') train_Dev.append(tweet) f.close() train_dev_words = [] for sentence in train_Dev: train_dev_words.append(sentence.rstrip().split()) dev_tokens = [None] * len(train_Dev) for i in range(len(dev_tokens)): tokens = train_Dev[i].split() for j in range(window): tokens.insert(0, '*\\*') tokens.append('STOP\\STOP') for j in range(len(tokens)): tokens[j] = list(tokens[j].split('\\')) dev_tokens[i] = tokens print('Testing Dev') tagged_sents = tag_sents(clf, dev_tokens, features, results, window, num_corpora, hasher=hasher) print('Writing Results') # output_tagged(tagged_sents, '../results/svm_trained_on_alone+cmu.txt') output_tagged(tagged_sents, '../../results/pos_tagged_gakirah_aggress_loss.txt') print("Time: " + str(time.clock()) + ' sec')
# Custom Implementation
def apply_hashing_trick(feature_dict, vector_size=2000):
    # Create an array of zeros of length 'vector_size'
    new_features = [0 for x in range(vector_size)]

    # iterate over every feature in the feature dictionary
    for key in feature_dict:
        # get the index into the new feature array
        array_index = hash(key) % vector_size

        # add the value of the feature to the new feature array
        # at the index we got using the hashing trick
        new_features[array_index] += feature_dict[key]

    return new_features

# Implementing FeatureHasher
from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=20)
features = [{'how': 1, 'now': 2, 'brown': 4}, {'cow': 2, '.': 5}]
hashed_features = hasher.transform(features)
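# Illustrative follow-up, not part of the original snippet: the custom function
# above always adds feature values into their bucket, whereas sklearn's
# FeatureHasher also applies a signed hash, so colliding features can partially
# cancel instead of accumulating.
print(apply_hashing_trick({'how': 1, 'now': 2, 'brown': 4}, vector_size=20))
print(hashed_features.toarray())  # sparse (2, 20) matrix densified for inspection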
] cat_vars = [ 'categorical8', 'categorical14', 'categorical5', 'categorical12', 'categorical11', 'categorical10', 'categorical7', 'categorical1', 'categorical15', 'categorical18', 'categorical13', 'categorical16' ] #merge cols X = chunk[integer_cols + cat_vars] y = chunk['label'] # replace nulls X.update(X[integer_cols].fillna(0)) X.update(X[cat_vars].fillna('NULL')) X_cat = feature_hasher.transform(X[cat_vars].to_dict('records')).toarray() if i == 0: scaler.fit(X[integer_cols]) X_integer = scaler.transform(X[integer_cols]) X = np.hstack((X_cat, X_integer)) dimensionality_reduction = svd.fit(X) principal_components = dimensionality_reduction.transform(X) logistic.fit(principal_components, y) dump(logistic, 'first_pca_logit.pkl') else: X_integer = scaler.transform(X[integer_cols]) X = np.hstack((X_cat, X_integer))
chunk['ToSecond_scaled']=(chunk.ToSecond-0)/(84239-0) chunk['Cyclic_scaled']=(chunk.Cyclic-0)/(42480-0) selected_columns = ["ip_scaled",'app','device','os','channel','AMPM','Cyclic_scaled','click_day_scaled','click_hour_scaled','is_attributed'] chunk=chunk[selected_columns] #training will be done batch by batch even within each chunk for ii in range(0, chunksize//batchsize): X = chunk.iloc[ii*batchsize:(ii+1)*batchsize,:-1] y = chunk.iloc[ii*batchsize:(ii+1)*batchsize,-1] i+=1 # now lets create the hashed variable using transformation on the hasher object created earlier X_train = fh.transform(np.asarray(X.astype(str))) #clf.fit(X_train,y,xgb_model=None) #for xgboost model we need sparse matrix: dtrain=xgb.DMatrix(X_train, label=y) param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'multi:softprob'} modelXG=xgb.train(param,dtrain,xgb_model='xgbmodel') #each time a batch is used to train the model partially, we will need to save the model and then give it to the next step to continue from there. that is how xgboost is trained modelXG.save_model("xgbmodel") #clf.n_estimators += 1 #every 10 chunks we would like to evaluate and see how we are doing on unseen validation data: if(i%10==0): print(i)
if True and FIT: est = LogisticRegression(multi_class='auto', solver='liblinear') t1 = time.time() est.fit(X_train, y_train) print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}") print("\nFeatureHasher") print("FeatureHasher on frequency dicts") n_features=1048576 #n_features=int(1048576 / 2) hasher = FeatureHasher(n_features=n_features) t1 = time.time() X_train = hasher.fit_transform(token_freqs(d) for d in X_train_text) X_test = hasher.transform(token_freqs(d) for d in X_test_text) print(f"FeatureHasher XX shape {X_train.shape} with {X_train.data.nbytes:,} bytes and nnz {X_train.nnz:,} in {time.time()-t1}") if FIT: est = LogisticRegression(multi_class='auto', solver='liblinear') t1 = time.time() est.fit(X_train, y_train) print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}") #NGRAM_MAX 1 #CountVectorizer #CountVectorizer shape (8485, 112359) with 10,723,592 bytes and nnz 1,340,449 #Vocab length 112359 #Score CountVectorizer 0.8882997525627431
num_train = train.ix[:, train.applymap( np.isreal).all(axis=0)] # Get numerical features cat_train = train.ix[:, np.invert(train.applymap( np.isreal).all(axis=0))] # Get categorical features categorical_type = 'dictvectorizer' cat_dict = (dict(cat_train.ix[x]) for x in range(cat_train.shape[0]) ) # Categorical data generator faster then pandas # cat_dict = cat_train.to_dict(orient='records') # Categorical data dict if categorical_type == 'dictvectorizer': vec = DictVectorizer() cat_data = vec.fit_transform(cat_dict).toarray() elif categorical_type == 'featurehasher': feat_hash = FeatureHasher() hasher = FeatureHasher(input_type='string', n_features=2**8) cat_data = hasher.transform(cat_dict) elif categorical_type == 'onehotencoder': le_data = np.empty(cat_train.shape) for col in range(cat_train.shape[1]): le = LabelEncoder() le_data[:, col] = le.fit_transform(cat_train.ix[:, col]) enc = OneHotEncoder() cat_data = enc.fit_transform(le_data).toarray() else: raise Exception('categorical_type not supported!') np_data = np.array(num_train) ids = np.array(np_data[:, -2], dtype=np.int) x_tr = np_data[:, :-2] x_tr_cat = cat_data y_tr = np_data[:, -1]
distinct_users_in_ratings = review_df['user_id'].unique() distinct_items_in_ratings = review_df['business_id'].unique() user_df = user_df[user_df['user_id'].isin(distinct_users_in_ratings)] item_df = item_df[item_df['business_id'].isin(distinct_items_in_ratings)] # deal with high cardinarity feature : category encoding # TODO: PCA, linear transformation, mean encoder if is_feature_hasher: print('feature hasing ...') mlb = MultiLabelBinarizer() encodings = mlb.fit_transform( [ cat_str.split(',') for cat_str in item_df['categories'].values ] ) fea_hasher = FeatureHasher(n_features=hash_dim) # wrap 'encodings' into dict all_categories = list(mlb.classes_) encode_dict_list = [ dict(zip(all_categories, list(instance_encoding))) for instance_encoding in encodings] hash_encodings = fea_hasher.transform(encode_dict_list).toarray() else: mlb = MultiLabelBinarizer() hash_encodings = mlb.fit_transform( [ cat_str.split(',') for cat_str in item_df['categories'].values ] ) # Build graph print('building graph ...') graph_builder = PandasGraphBuilder() graph_builder.add_entities(user_df, 'user_id', 'user') graph_builder.add_entities(item_df, 'business_id', 'item') graph_builder.add_binary_relations(review_df, 'user_id', 'business_id', 'reviewed') graph_builder.add_binary_relations(review_df, 'business_id', 'user_id', 'reviewed-by') g = graph_builder.build() print('Assigning feature ...')
# else: # chunk.to_csv('data/test.csv', header=False, index=False, mode='a') # Train classifier # clf = RandomForestRegressor() # clf = LogisticRegression() clf = LinearRegression() # clf = KNeighborsRegressor() # clf = MLPRegressor() # clf = SVR(kernel='rbf', C=1e3, gamma=0.1) all_classes = np.array([0, 1]) y_train = file["price"] train = file[cols[:-1]] # train.drop(["normalised-losses"], axis=1, inplace=True) Xcat = fh.transform(np.asarray(train.astype(str))) print 'Training' clf.fit(Xcat, y_train) test_file = pd.read_csv('data/test.csv', names=cols) # test_file = test_file.apply(lambda x: x.fillna(method='pad'), axis=0) test = test_file[cols[:-1]] y_test = test_file["price"] # test.drop(["normalised-losses"], axis=1, inplace=True) X_test = fh.transform(np.asarray(test.astype(str))) y_pred = clf.predict(X_test) for i, value in enumerate(y_test): print value, y_pred[i]
# print indices[indptr[0]:indptr[1]]
# test MultinomialNB in sklearn
# Make n_features as large as possible to avoid hash collisions.
# Even a very large value is fine: the output is a sparse matrix,
# so it still does not consume much memory.
train_datas = [{
    'monkey': 1,
    'dog': 1,
    'cat': 2,
    'elephant': 4
}, {
    'dog': 2,
    'run': 5
}]
feature_hasher = FeatureHasher(n_features=2**20, non_negative=True)
train_datas = feature_hasher.transform(train_datas)
"""X = np.array([[1, 2, 4, 1, 1, 1],
              [3, 2, 4, 2, 2, 3],
              [2, 2, 3, 4, 4, 1],
              [2, 0, 3, 2, 3, 1],
              [2, 0, 0, 3, 3, 3],
              [2, 3, 1, 0, 3, 4]])"""
class_label = np.array([1, 2])
# Tune the smoothing factor
clf = MultinomialNB(alpha=0.01)
train = clf.fit(train_datas, class_label)
test_datas = [{'monkey': 3, 'mouse': 1}]
test_datas = feature_hasher.transform(test_datas)
test = clf.predict(test_datas)
print train_datas
print test_datas
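# Illustrative check of the comment above (not part of the original snippet): with a
# tiny n_features, distinct tokens are forced to share columns, while a very large
# n_features keeps the hashed matrix sparse, storing only the non-zero entries.
tiny_hasher = FeatureHasher(n_features=2, non_negative=True)
tiny = tiny_hasher.transform([{'monkey': 1, 'dog': 1, 'cat': 2, 'elephant': 4}])
print tiny.toarray()      # 4 features crammed into 2 columns -> guaranteed collisions
big = feature_hasher.transform([{'monkey': 1, 'dog': 1, 'cat': 2, 'elephant': 4}])
print big.nnz, big.shape  # only a handful of stored values out of 2**20 columns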
from sklearn.feature_extraction import FeatureHasher import numpy as np # h = FeatureHasher(n_features=100000) h = FeatureHasher(n_features=4) D = [{'dog': 1, 'cat': 2, 'elephant': 4}, {'dog': 2, 'run': 5}] f = h.transform(D) print(f.toarray()) print('===') # default input format : (feature_name, value) D = [ { 'dog': 1, 'cat': 2, 'elephant': 4 }, { 'dog': 2, 'run': 5 }, { 'dog': 1 }, { 'run': 5 }, { 'cat': 2 },
return [float(d.weekday()), float(d.hour)] fh = FeatureHasher(n_features=2**20, input_type="string") # Train classifier clf = RidgeClassifier() train = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True) all_classes = np.array([0, 1]) for chunk in train: y_train = chunk["click"] chunk = chunk[cols] chunk = chunk.join( pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"])) chunk.drop(["hour"], axis=1, inplace=True) Xcat = fh.transform(np.asarray(chunk.astype(str))) clf.fit(Xcat, y_train) # Create a submission file usecols = cols + ["id"] X_test = pd.read_csv("test/mtest.csv", usecols=usecols) X_test = X_test.join( pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"])) X_test.drop(["hour"], axis=1, inplace=True) X_enc_test = fh.transform(np.asarray(X_test.astype(str))) y_act = pd.read_csv("test/mtest.csv", usecols=['click']) y_pred = clf.predict(X_enc_test) with open('logloss.txt', 'a') as f:
def feature_hash(X, n_features=1000):
    h = FeatureHasher(n_features=n_features)
    return h.transform(X)
def train_and_score(_tr, _vv, _vp, model_sizes, colors=None): all_venues, train_pairs, valid_pairs = generate_interaction(_tr, _vv, _vp) print "Creating models" plt.figure(figsize=(10, 10)) lw = 2 roc_aucs = [] for size, color in zip(model_sizes, colors): extractor = FeatureHasher(n_features=2**size) model = SGDClassifier(loss="log", penalty="l2", alpha=0.001, n_jobs=-1) # model = BernoulliNB() print "Training" for i, (user, yay_venues) in enumerate(train_pairs.iteritems()): print "Training on user", i, user labels, yay_pairs, nay_pairs = generate_features( all_venues, yay_venues) yay_features, nay_features = extractor.transform( yay_pairs), extractor.transform(nay_pairs) features = sp.vstack([yay_features, nay_features]) model.partial_fit(features, labels, classes=[0, 1]) print "Testing" all_labels, all_preds, all_probas = [], [], [] for i, (user, yay_venues) in enumerate(valid_pairs.iteritems()): print "Testing on user", i, user labels, yay_pairs, nay_pairs = generate_features( all_venues, yay_venues) all_labels.extend(labels) yay_features, nay_features = extractor.transform( yay_pairs), extractor.transform(nay_pairs) features = sp.vstack([yay_features, nay_features]) preds, probas = model.predict(features), model.predict_proba( features) all_preds.extend(preds), all_probas.extend(probas[:, 1]) print "Scoring" roc_auc = roc_auc_score(all_labels, all_probas) cm = confusion_matrix(all_labels, all_preds) print "Model size", size, "AUC", roc_auc print cm roc_aucs.append(roc_auc) fpr, tpr, _ = roc_curve(all_labels, all_probas) plt.plot(fpr, tpr, color=color, lw=lw, label='Model %d (area = %0.2f)' % (size, roc_auc)) joblib.dump(model, 'model_logit_size%d.pkl' % size) np.save("labels_logit_size%d.npy" % size, all_labels) np.save("probas_logit_size%d.npy" % size, all_probas) plt.plot([0, 1], [0, 1], color='navy', lw=lw, ls='--', label='Luck') plt.xlim([-.05, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic for different model sizes') plt.legend(loc="lower right") # plt.savefig('../plots/model_nb.png') plt.tight_layout() plt.show() '''
'TEXT :' + token_dict[file] ]) X_feature.append(feature) if 'positive' in file: Y_act_tag.append('positive') elif 'negative' in file: Y_act_tag.append('negative') elif 'neutral' in file: Y_act_tag.append('neutral') ''' convert the feature list into featureHasher which will be fed to the classifier''' hasher = FeatureHasher(input_type='string') X = hasher.transform(X_feature) print(type(X)) Y_act_tag_test=[] for dirName, subDir, files in os.walk(sys.argv[2]): for file in files: fopen=open(os.path.join(dirName, file), 'r') review=fopen.read() text = review.lower() final_text = text.translate(None , string.punctuation) token_dict_test[file] = final_text X_feature_test=[] for file in token_dict_test: feature = []
raw_list.append(filtered2) raw_res.append(1 if raw["vandal"] else 0) raw_list_opp.append({ x: y * (-1) for x, y in filtered.items() if y < 0 and not check_rgb(x) and ' ' not in x }) # counter.tick() print(len(raw_list)) from sklearn.naive_bayes import BernoulliNB fh = FeatureHasher(2000000) matrix = fh.transform(raw_list) matrix_opp = fh.transform(raw_list_opp) lr = LogisticRegression(solver='sag', verbose=1, class_weight="balanced", max_iter=300, C=0.01) #lr = BernoulliNB() lr.fit(matrix, raw_res) lr2 = LogisticRegression(solver='sag', verbose=1, class_weight="balanced", max_iter=300, C=0.01)
def main(): # Read the data train_data = [] train_labels = [] test_data = [] test_labels = [] actualPos = 0 actualNeg = 0 actualNeu = 0 distinct_words = set([]) train_dir = {} train = [] test = [] test_dir = {} distinct_words_list = [] train_pos = 0 train_neg = 0 train_neu = 0 for root, directories, filenames in os.walk(sys.argv[1]): for each_filename in filenames: if each_filename.endswith(".txt"): path = root + '/' + each_filename with open(os.path.join(root, each_filename), 'r') as f: tokens = f.read() token_split = tokens.split() # ---- if the folder is training set ----# if "Train" in path: feature = feature_extraction(token_split, train_dir, distinct_words_list) train.append(feature) train_data.append(tokens) if "positive" in path: train_pos = train_pos + 1 train_labels.append("positive") elif "negative" in path: train_neg = train_neg + 1 train_labels.append("negative") elif "neutral" in path: train_neu = train_neu + 1 train_labels.append("neutral") # ---- if the folder is development set ----# elif "Dev" in path: feature = feature_extraction(tokens, test_dir, distinct_words_list) test.append(feature) test_data.append(tokens) if "positive" in path: actualPos = actualPos + 1 test_labels.append("positive") elif "negative" in path: actualNeg = actualNeg + 1 test_labels.append("negative") elif "neutral" in path: actualNeu = actualNeu + 1 test_labels.append("neutral") # print("actual pos",actualPos) # print("actual neg", actualNeg) # print("actual neutral", actualNeu) # #--applying featurehasher to input ---# hasher = FeatureHasher(input_type='string') X = hasher.transform(train) Y = hasher.transform(test) #---perform classification on linear svc()---# classifier = svm.LinearSVC() clf = classifier.fit(X, train_labels) results = clf.predict(Y) pos = 0 neg = 0 neu = 0 print("Results for LinearSVC()") for each in results: if "positive" in each: pos = pos + 1 elif "negative" in each: neg = neg + 1 elif "neutral" in each: neu = neu + 1 # print("pred pos is ",pos) # print("pred neg is ", neg) # print("pred neu is ", neu) # # print("train pos is ", train_pos) # print("train neg is ", train_neg) # print("train neu is ", train_neu) print(classification_report(test_labels, results))
import pandas as pd
import json

# Load the first 10,000 reviews
with open('data/yelp/yelp_academic_dataset_review.json') as f:
    js = []
    for i in range(10000):
        js.append(json.loads(f.readline()))
review_df = pd.DataFrame(js)

# Assign the number of unique business_id values to m
m = len(review_df['business_id'].unique())
m

from sklearn.feature_extraction import FeatureHasher

# Hashing
h = FeatureHasher(n_features=m, input_type='string')
f = h.transform(review_df['business_id'])

# Confirm that the transformed features are hard to interpret
review_df['business_id'].unique().tolist()[0:5]
f.toarray()

# Confirm that the storage size of the transformed features has shrunk considerably
from sys import getsizeof
print('Our pandas Series, in bytes: ', getsizeof(review_df['business_id']))
print('Our hashed numpy array, in bytes: ', getsizeof(f))
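# A small caveat on the size comparison above (an added observation, not part of the
# original example): getsizeof() on a scipy sparse matrix only measures the Python
# wrapper object, not the underlying index and data buffers. Summing those buffers
# gives a fairer estimate of what the hashed representation actually stores.
hashed_bytes = f.data.nbytes + f.indices.nbytes + f.indptr.nbytes
print('Our hashed CSR matrix buffers, in bytes: ', hashed_bytes)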
def hash(mat, n_features=1000):
    hasher = FeatureHasher(n_features=n_features)
    X = hasher.transform(mat)
    X = X.toarray()
    return X
def func1():
    h = FeatureHasher(n_features=3)
    D = [{'dog': 1, 'cat': 2, 'elephant': 4}, {'dog': 2, 'run': 5}]
    f = h.transform(D)
    print(f.toarray())
temp_list = [] #rc contains the list of release code. ignore the last code as it refer to "enter" value rc = list(map(int, temp_X.release_codes.split())) pp = list(map(int, temp_X.pp.split())) pr = list(map(int, temp_X.pr.split())) rp = list(map(int, temp_X.rp.split())) rr = list(map(int, temp_X.rr.split())) for j in range(0, len(rc) - 1): temp_list.append({ 'rc': rc[j], 'pp': pp[j], 'pr': pr[j], 'rp': rp[j], 'rr': rr[j] }) print(hasher.transform(temp_list).todense()) X_transformed.append(hasher.transform(temp_list).todense()) X_transformed = pd.DataFrame(X_transformed) # X_transformed = X_transformed.fillna(method='pad', axis=1) with open(r'output.csv', 'w') as file: file.write(X_transformed.to_csv()) print("==== After transformation =====") print("X_transformed shape: {}".format(X_transformed.shape)) X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.4, random_state=0) print("X_train type: {}".format(type(X_train)))
n_features = 100 print("DictVectorizer") t0 = time() vectorizer = DictVectorizer() vectorizer.fit_transform(token_freqs(d) for d in train) duration = time() - t0 print("Found %d unique terms" % len(vectorizer.get_feature_names())) print(train) print() print("FeatureHasher on frequency dicts") t0 = time() hasher = FeatureHasher(n_features=n_features) X = hasher.transform(token_freqs(d) for d in train) duration = time() - t0 print("Found %d unique terms" % n_nonzero_columns(X)) print() print("FeatureHasher on raw tokens") t0 = time() hasher = FeatureHasher(n_features=n_features, input_type="string") X = hasher.transform(tokens(d) for d in train) duration = time() - t0 print("Found %d unique terms" % n_nonzero_columns(X)) ''' for row in train: #if row not in black_list: print(train[row].describe()) print("\n")
def predict(self, files): ''' return a vector of predicted values for the set of files specified. Assume convention, 0=Benign, 1=Malware. ''' assert self.model is not None # now extract features from file, hash them and use self.model to return predictions start_time = time() completed_files = 0 feature_vector_list = [] feature_dictionary_list = [] print('Starting feature extraction') prev = None for _file in files: try: vector, dictionary = get_feature_vector(_file) prev = vector except: vector = prev dictionary = {} feature_dictionary_list.append(dictionary) feature_vector_list.append(vector) completed_files += 1 print('Completed extracting features from ' + str(completed_files) + ' files', end='\r') print('') end_time = time() print('Feature extraction completed in ' + str(end_time - start_time) + ' seconds') print('Starting testing') start_time = time() features = 7000 hasher = FeatureHasher(n_features=features) feature_x = hasher.transform(feature_dictionary_list).toarray() feature_x = np.concatenate((feature_x, np.array(feature_vector_list)), axis=1) feature_y = self.model.predict(feature_x) end_time = time() print('Testing completed in ' + str(end_time - start_time) + ' seconds') lump = lambda value: 1 if value > 0 else 0 def transform(array): return np.fromiter((lump(element) for element in array), array.dtype) return transform(feature_y)
one.fit(X) train=one.transform(X) print('train data set has got {} rows and {} columns'.format(train.shape[0],train.shape[1])) logistic(train,y) from sklearn.feature_extraction import FeatureHasher ### %time X_train_hash=X.copy() for c in X.columns: X_train_hash[c]=X[c].astype('str') hashing=FeatureHasher(input_type='string') train=hashing.transform(X_train_hash.values) print('train data set has got {} rows and {} columns'.format(train.shape[0],train.shape[1])) logistic(train,y) ### %time X_train_stat=X.copy() for c in X_train_stat.columns: if(X_train_stat[c].dtype=='object'): X_train_stat[c]=X_train_stat[c].astype('category') counts=X_train_stat[c].value_counts() counts=counts.sort_index() counts=counts.fillna(0) counts += np.random.rand(len(counts))/1000
# In[28]: train = train.drop('target', axis=1) # In[29]: dev = dev.drop('target', axis=1) # In[30]: hashed_train = hasher.transform(get_features(train, features)) # In[31]: hashed_dev = hasher.transform(get_features(dev, features)) # In[32]: hashed_test = hasher.transform(get_features(test, features)) # In[33]:
raw_data = fetch_20newsgroups(subset='train', categories=categories).data data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6 print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb)) print() print("DictVectorizer") t0 = time() vectorizer = DictVectorizer() vectorizer.fit_transform(token_freqs(d) for d in raw_data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration)) print("Found %d unique terms" % len(vectorizer.get_feature_names())) print() print("FeatureHasher on frequency dicts") t0 = time() hasher = FeatureHasher(n_features=n_features) X = hasher.transform(token_freqs(d) for d in raw_data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration)) print("Found %d unique terms" % n_nonzero_columns(X)) print() print("FeatureHasher on raw tokens") t0 = time() hasher = FeatureHasher(n_features=n_features, input_type="string") X = hasher.transform(tokens(d) for d in raw_data) duration = time() - t0 print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration)) print("Found %d unique terms" % n_nonzero_columns(X))
def do_job_unit(self, event, corpus, unit, **kwargs): if unit != 0: raise Exception("Job unit {} out of range".format(unit)) res = ArticlesResource() thresh = kwargs.get("dedupe-sim-threshold", .8) extractor = kwargs.get("extractor", "goose") hasher = FeatureHasher(input_type="pair", non_negative=True) si_iter = res.streamitem_iter( event, corpus, extractor) def to_df(all_ids, all_times, all_matches): d = [] for ids, times, match in izip(all_ids, all_times, all_matches): times.sort() d.append({ "stream ids": ids, "hits": len(ids), "match": match, "earliest": times[0], "latest": times[-1], "second": times[1] if len(times) >= 2 else None, "third": times[2] if len(times) >= 3 else None, }) return pd.DataFrame(d, columns=["stream ids", "match", "hits", "earliest", "latest", "second", "third"]) def query_in_top20(event, df): text = u"\n".join(df["sent text"].tolist()[:20]) for query in event.query: if not re.search(query, text, flags=re.I|re.UNICODE): return False return True def make_time(df): return df["timestamp"].tolist()[0] def make_counts(df, slimit=20): counts = defaultdict(int) for words in df["words"].tolist()[:slimit]: for word in words: counts[word.lower()] += 1 return counts def next_chunk_file(chunk_file_num): deduped_path_fmt = self.get_deduped_path_fmt( event, corpus, extractor, threshold=thresh) deduped_path = deduped_path_fmt.format( chunk_file_num) deduped_dir = os.path.dirname(deduped_path) if not os.path.exists(deduped_dir): os.makedirs(deduped_dir) if os.path.exists(deduped_path): os.remove(deduped_path) return sc.Chunk(path=deduped_path, mode="wb", message=corpus.sc_msg()) X = None chunk_file_num = 1 chunk = next_chunk_file(chunk_file_num) for hour, path, si in si_iter: df = si2df(si, extractor=extractor) counts = make_counts(df) x = hasher.transform([counts.items()]) x.shape = (1, hasher.n_features) if X is None: X = x times = [[make_time(df)]] ids = [[si.stream_id]] matches = [query_in_top20(event, df)] chunk.add(si) else: K = cosine_similarity(X, x) k_argmax = K.argmax() if K[k_argmax] < thresh: X = vstack([X, x]) times.append([make_time(df)]) ids.append([si.stream_id]) matches.append(query_in_top20(event, df)) if X.shape[0] % 1000 == 0: chunk.close() chunk_file_num += 1 chunk = next_chunk_file(chunk_file_num) chunk.add(si) else: times[k_argmax].append(make_time(df)) ids[k_argmax].append(si.stream_id) chunk.close() df = to_df(ids, times, matches) print df stats_path = self.get_stats_path( event, corpus, extractor, thresh) with open(stats_path, "w") as f: df.to_csv(f, index=False, sep="\t")
class FeatureExtractor: def __init__(self, lexicon_helper, fh=None, fs=None): if fh: self.feature_hasher = fh if fs: self.features_set = fs self.train_mode = False else: self.features_set = set() self.lexicon_helper = lexicon_helper features_set = None feature_hasher = None train_mode = True unk = "unk" def build_x_vectors(self, ent_couple_objects): ''' :param tuple(sen_id, ent1 name, ent2 name, x) :return: tuple(sen_id, ent1 name, ent2 name, x) ''' if not self.feature_hasher: self.feature_hasher = FeatureHasher(n_features=len( self.features_set), input_type='string') x_data = self.feature_hasher.transform( [t[3] for t in ent_couple_objects]) converted_ent_objects = [(t[0], t[1], t[2], x_data[i]) for i, t in enumerate(ent_couple_objects)] return converted_ent_objects, x_data def extract_features(self, ent_tuple, sentence): ''' :param ent_tuple: :param sentence: :return: tuple(sen_id, ent1 name, ent2 name, x) ''' features = [] sen_id = ent_tuple[0] ent1_text = self.extract_text(ent_tuple[1]) ent2_text = self.extract_text(ent_tuple[2]) #Entity features ent1_type = self.extract_type(ent_tuple[1]) ent2_type = self.extract_type(ent_tuple[2]) ent1_head = self.extract_head(ent_tuple[1]) ent2_head = self.extract_head(ent_tuple[2]) concatenated_types = ent1_type + ent2_type features.append(self.get_feature("e1_type", ent1_type)) features.append(self.get_feature("e2_type", ent2_type)) features.append(self.get_feature("e1_head", ent1_head)) features.append(self.get_feature("e2_head", ent2_head)) features.append( self.get_feature("e1_root", ent_tuple[1][ENT_OBJ_ROOT].lemma_)) features.append( self.get_feature("e2_root", ent_tuple[2][ENT_OBJ_ROOT].lemma_)) features.append( self.get_feature("concanated_types", concatenated_types)) #Lexicon Features features.append( self.get_feature( "e1_lex_fname", self.lexicon_helper.does_include_first_name(ent1_text))) features.append( self.get_feature( "e1_lex_lname", self.lexicon_helper.does_include_last_name(ent1_text))) features.append( self.get_feature("e2_lex_loc", self.lexicon_helper.is_location(ent_tuple[2]))) #word based features words_between_ents = parser.get_words_between(ent_tuple[1], ent_tuple[2]) for word in words_between_ents: features.append(self.get_feature("bow", word.text)) features.append( self.get_feature("ent1_bword", ent_tuple[1][ENT_OBJ_ROOT].left_edge.text)) features.append( self.get_feature("ent2_aword", ent_tuple[2][ENT_OBJ_ROOT].right_edge.text)) #syntactic features features.append( self.get_feature("ent_dist", parser.get_dist(ent_tuple[1], ent_tuple[2]))) dependency_path_str = parser.get_dependecy_path_str( ent_tuple[1], ent_tuple[2]) features.append(self.get_feature("dep_path", dependency_path_str)) dependency_path_pos_str = parser.get_dependecy_path_pos_str( ent_tuple[1], ent_tuple[2]) features.append( self.get_feature("dep_pos_path", dependency_path_pos_str)) features.append( self.get_feature( "is_descriptive_path", parser.is_direct_ent2_to_ent1_path(ent_tuple[1], ent_tuple[2]))) # # e1_clean = self.clean_name(ent_tuple[1]) e2_clean = self.clean_name(ent_tuple[2]) return (sen_id, e1_clean, e2_clean, features) def extract_text(self, ent_obj): return parser.clean_entity_text(ent_obj[ENT_OBJ_TEXT], ent_obj[ENT_OBJ_ROOT]) def extract_type(self, ent_obj): return ent_obj[ENT_OBJ_LABEL] def extract_head(self, ent_obj): return ent_obj[ENT_OBJ_ROOT].head.lemma_ def get_feature(self, feature_prefix, feature_val): feature = feature_prefix + str(feature_val) if self.train_mode: self.features_set.add(feature) self.features_set.add(feature_prefix + 
self.unk) return feature else: if feature in self.features_set: return feature else: return feature_prefix + self.unk def clean_name(self, ent_obj): return parser.modify_entity_text(ent_obj[ENT_OBJ_TEXT], ent_obj[ENT_OBJ_SPACY_ENT])
class DQNLearner(WhenLearner): def __init__( self, gamma=0.7, lr=3e-5, batch_size=64, mem_capacity=10000, # state_size=394, action_size=257, state_hidden_size=197, state_size=50, action_size=50, state_hidden_size=30, action_hidden_size=122): self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") # self.device = "cpu" #TODO: make cuda not break elsewhere self.gamma = gamma self.lr = lr self.batch_size = batch_size self.state_size = state_size self.action_size = action_size self.state_hidden_size = state_hidden_size self.action_hidden_size = action_hidden_size self.state_hasher = FeatureHasher(n_features=self.state_size, alternate_sign=False) self.action_hasher = FeatureHasher(n_features=self.action_size, alternate_sign=False) # special case to make things run faster and drop values # self.state_hasher = FractionsStateHasher() # self.action_hasher = FractionsActionHasher() self.value_net = ValueNet(self.state_size, self.state_hidden_size).to(self.device) self.action_net = ActionNet(self.action_size, self.state_hidden_size, self.action_hidden_size).to(self.device) # create separate target net for computing future value self.target_value_net = ValueNet(self.state_size, self.state_hidden_size) self.target_value_net.load_state_dict(self.value_net.state_dict()) self.target_value_net.eval() self.target_action_net = ActionNet(self.action_size, self.state_hidden_size, self.action_hidden_size) self.target_action_net.load_state_dict(self.action_net.state_dict()) self.target_action_net.eval() self.replay_memory = ReplayMemory(mem_capacity) params = (list(self.value_net.parameters()) + list(self.action_net.parameters())) self.optimizer = torch.optim.Adam(params, lr=self.lr) def update_target_net(self): self.target_value_net.load_state_dict(self.value_net.state_dict()) self.target_action_net.load_state_dict(self.action_net.state_dict()) def gen_state_vector(self, state: dict) -> np.ndarray: state = {str(a): state[a] for a in state} return self.state_hasher.transform([state]).toarray() def gen_action_vectors(self, actions: Collection[Activation]) -> np.ndarray: action_dicts = [] for action in actions: act_d = {} name = action.get_rule_name() act_d['rulename'] = name bindings = action.get_rule_bindings() for a, v in bindings.items(): if isinstance(v, bool): act_d[str(a)] = str(v) else: act_d[str(a)] = v action_dicts.append(act_d) return self.action_hasher.transform(action_dicts).toarray() def eval(self, state: dict, action: Activation) -> float: if state is None: return 0 state_x = torch.from_numpy(self.gen_state_vector(state)).float().to( self.device) action_x = torch.from_numpy(self.gen_action_vectors( [action])).float().to(self.device) with torch.no_grad(): state_val, state_hidden = self.value_net(state_x) action_val = self.action_net(action_x, state_hidden) return state_val[0].cpu().item() + action_val[0].cpu().item() def eval_multiple(self, state: dict, actions: Collection[Activation]) -> Collection[float]: if state is None: return 0 state_x = torch.from_numpy(self.gen_state_vector(state)).float().to( self.device) action_x = torch.from_numpy( self.gen_action_vectors(actions)).float().to(self.device) with torch.no_grad(): state_val, state_hidden = self.value_net(state_x) action_val = self.action_net(action_x, state_hidden.expand(len(actions), -1)) return (state_val.expand(len(actions), -1) + action_val).squeeze(1).cpu().tolist() def update( self, state: dict, action: Activation, reward: float, next_state: dict, next_actions: Collection[Activation], ) -> None: state_v = 
self.gen_state_vector(state) action_v = self.gen_action_vectors([action]) if next_state is None or len(next_actions) == 0: next_state_v = None next_action_vs = None else: next_state_v = self.gen_state_vector(next_state) next_action_vs = self.gen_action_vectors(next_actions) self.replay_memory.push( torch.from_numpy(state_v).float().to(self.device), torch.from_numpy(action_v).float().to(self.device), torch.tensor([reward]).float().to(self.device), None if next_state_v is None else torch.from_numpy(next_state_v).float().to(self.device), None if next_action_vs is None else torch.from_numpy(next_action_vs).float().to(self.device)) self.train() def train(self): # epochs = (len(replay_memory) // target_update // 2) + 1 batch_size = self.batch_size if len(self.replay_memory) < batch_size: batch_size = len(self.replay_memory) updates = len(self.replay_memory) // batch_size if updates < 20: updates = 20 updates *= 3 if updates > 200: updates = 200 log.debug('len replay mem =' + str(len(self.replay_memory))) loss = [] for i in range(updates): if i % 5: self.update_target_net() loss.append(self.optimize_model()) def optimize_model(self): batch_size = self.batch_size if len(self.replay_memory) < self.batch_size: batch_size = len(self.replay_memory) transitions = self.replay_memory.sample(batch_size) batch = Transition(*zip(*transitions)) # transpose batch # Get states, actions, and rewards state = torch.cat(batch.state).view(batch_size, self.state_size) action = torch.cat(batch.action).view(batch_size, self.action_size) reward = torch.stack(batch.reward).view(1, batch_size) state_value, state_hidden = self.value_net(state) action_value = self.action_net(action, state_hidden) state_action_values = state_value + action_value # compute mask of non-final states and concatenate the batch elements non_final_mask = torch.tensor(tuple( map(lambda sa: sa is not None, batch.next_state)), device=self.device, dtype=torch.bool) any_non_final = non_final_mask.sum() > 0 if any_non_final: non_final_next_state = torch.cat([ s for s in batch.next_state if s is not None ]).view(-1, self.state_size) non_final_next_actions = torch.cat([ s for s in batch.next_actions if s is not None ]).view(-1, self.action_size) # how many actions are available for each state next_action_lens = [ nas.shape[0] for nas in batch.next_actions if nas is not None ] next_action_start = [ sum(next_action_lens[0:i]) for i in range(len(next_action_lens)) ] # Compute next state action indices from policy net if any_non_final: with torch.no_grad(): next_value = self.target_value_net(non_final_next_state) non_final_next_state_value, non_final_next_hidden = next_value next_state_value_expanded = torch.cat([ non_final_next_state_value[i].expand( next_action_lens[i], -1) for i in range(len(next_action_start)) ], 0) next_state_hidden_expanded = torch.cat([ non_final_next_hidden[i].expand(next_action_lens[i], -1) for i in range(len(next_action_start)) ], 0) non_final_next_action_value = ( next_state_value_expanded + self.target_action_net( non_final_next_actions, next_state_hidden_expanded)) # Compute value of next state actions from target net # Detach, so we don't track gradients, target net not getting updated. 
next_state_values = torch.zeros(batch_size, device=self.device) if any_non_final: next_state_values[non_final_mask] = torch.tensor( [ non_final_next_action_value.narrow( 0, next_action_start[i], next_action_lens[i]).max(0)[0] for i in range(len(next_action_start)) ], device=self.device) # next_state_values[non_final_mask] = self.net( # non_final_next_sas).gather( # 1, non_final_next_sa_idx).detach().squeeze() # Calculate the expected state-action value with torch.no_grad(): expected_state_action_values = ( reward + self.gamma * next_state_values).view(batch_size, 1) # print(torch.cat([state_action_values, expected_state_action_values], 1)) # print(expected_state_action_values) self.optimizer.zero_grad() loss = F.smooth_l1_loss(state_action_values, expected_state_action_values) # perform backprop loss.backward() # for param in self.value_net.parameters(): # param.grad.data.clamp_(-1, 1) # for param in self.action_net.parameters(): # param.grad.data.clamp_(-1, 1) self.optimizer.step() return loss.detach().item()
# AI-TECHGYM-2-6-A-4
# Feature engineering

# Imports
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

n_features = 5
h = FeatureHasher(n_features=n_features)

# Input data
columns = ['Python', 'Ruby', 'PHP', 'Java', 'JavaScript']
D = [{"Label": "Python"},
     {"Label": "Ruby"},
     {"Label": "PHP"},
     {"Label": "Java"},
     {"Label": "JavaScript"}]
df_D = pd.DataFrame(D)
#display(df_D)

f_array = h.transform(D).toarray()
df_a = pd.DataFrame(f_array, dtype=int, index=columns)
display(df_a)
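# An equivalent formulation, added as a sketch (not in the original exercise): with
# input_type='string', each category can be passed as a one-element list containing a
# "column=value" token, which FeatureHasher maps to the same kind of hashed indicator
# row as the dict form above.
h_str = FeatureHasher(n_features=n_features, input_type='string')
f_str = h_str.transform([['Label=' + label] for label in columns])
display(pd.DataFrame(f_str.toarray(), dtype=int, index=columns))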
def test_hasher_invalid_input():
    raw_X = [[], (), iter(range(0))]

    feature_hasher = FeatureHasher(input_type="gobbledygook")
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=-1)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=0)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features="ham")
    with pytest.raises(TypeError):
        feature_hasher.transform(raw_X)

    feature_hasher = FeatureHasher(n_features=np.uint16(2 ** 6))
    with pytest.raises(ValueError):
        feature_hasher.transform([])
    with pytest.raises(Exception):
        feature_hasher.transform([[5.5]])
    with pytest.raises(Exception):
        feature_hasher.transform([[None]])