Code Example #1
File: sarcalingua.py Project: Previsou/Sarcasticus
class HashSarca(Sarcalingua):
    
    def __init__(self, nbits=20, model=SGDClassifier(alpha=1e-5, penalty="l1", loss="modified_huber")):
        
        self.featureExtractor = FeatureHasher(pow(2,nbits), input_type="pair")
        self.classifier = model
        self.outEncoder = LabelEncoder()
        self.drop_outs = set((   u"#sarcasm", u"#sarcastic", u"#ironic", u"#irony",
                    u"#sarcasme", u"#sarcastique", u"#ironie", u"#ironique",
                    u"uncyclopedia", u"wikipedia"))
        
    def extractFeatures(self, clean_text):
        return self.featureExtractor.transform( (token_pattern.finditer(clean_text),) )
        
    def corpusToDataset(self, chunkIterator, column_label, HTML=False, **args):
        
        def prepare(raw_text):
            tokens = token_pattern.findall(self.sanitize(raw_text, HTML))
            if random.random() < 0.5:   # we delete the drop-outs half the time
                tokens = [tok for tok in tokens if tok not in self.drop_outs]
            try:
                alpha = 1./len(tokens)  #1./(1+log(len(tokens)))
                return ((tok.lower(), alpha) for tok in tokens)
            except ZeroDivisionError:
                return tuple()
        
        for chunk in chunkIterator:
            X = self.featureExtractor.transform(imap(prepare, chunk.text))
            y = np.array(self.outEncoder.fit_transform(chunk[column_label]))
            
            yield X,y
            gc.collect()
Code Example #2
File: QClassifier.py Project: StevenLOL/Factoid-QA
class QClassifierImpl:
    """
    A wrapper for question classifier
    """

    def __init__(self, train_data_path, pred_qs = None):
        """
        Constructor
        """
        logging.basicConfig(level = logging.DEBUG,
                format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
                datefmt='%a, %d %b %Y %H:%M:%S',
                filename='qclassifier.log',
                filemode='w')
        reload(sys)
        sys.setdefaultencoding('utf8')

        self.clf = None
        self.path = train_data_path
        self.pred_qs = pred_qs
        self.extractor = FeatureExtractor()
        self.features = None
        self.labels = None
        self.vectorizer = None
        self.cate = ['Person', 'Number', 'Location', 'Other']

    def train(self):
        """
        Train use all of the given data
        """
        self.extractor.load(path = self.path)
        self.features = self.extractor.extract_features()
        self.labels = self.extractor.get_labels()
        self.clf = QClassifier(questions = self.extractor.questions)
        assert(len(self.labels) == len(self.features))

        X = self.features
        Y = self.labels
        self.vectorizer = FeatureHasher(input_type = 'string', non_negative = True)
        X = self.vectorizer.transform(X)
        Y = asarray(Y)

        logging.info('start training')
        self.clf.train(X, Y)
        logging.info('done')

    def get_type(self, question):
        """
        Get type for a given question
        """
        if not self.features or not self.labels:
            logging.error('You need to train model first!')
            return None
        if not question:
            logging.error('Question should not be None')
            return None
        f = [self.extractor.extract_features_aux(question)]
        f = self.vectorizer.transform(f)
        # print self.clf.predict(f)
        return self.cate[self.clf.predict(f)[0]]
Code Example #3
File: conll.py Project: fmailhot/seqlearn
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and hash them.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens (see below)
        and an index into this list.
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
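As a hedged illustration (not part of seqlearn) of the features callable described above, a minimal feature function might yield a few strings for the token at index i:

def simple_features(tokens, i):
    """Hypothetical feature function: yields string features for tokens[i]."""
    word = tokens[i]
    yield "word=" + word.lower()
    yield "suffix3=" + word[-3:].lower()
    if i == 0:
        yield "BOS"  # beginning-of-sequence marker
    if i == len(tokens) - 1:
        yield "EOS"  # end-of-sequence marker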
Code Example #4
File: kaggle_train_lr.py Project: Tagtoo/model
def io():
    hv = FeatureHasher()

    target = []
    train_int = []
    train_label = []

    for iline in dio.io():
        iline = iline.strip().split(',')
        t = int(iline[0])
        int_fs = map(lambda i: numpy.NaN if not i else int(i), iline[1:14])
        label_fs = [k for k in iline[14:]]
        #label_fs = ",".join(iline[14:])
#        print int_fs, label_fs

        target.append(t)
        train_int.append(int_fs)
        train_label.append({k:1 for k in label_fs if k})

#    print train_int
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    train_int = imp.fit_transform(train_int)
#    print train_int
    scaler = preprocessing.StandardScaler().fit(train_int)
    train_int = scaler.transform(train_int)
#    print train_int
    train_int = csr_matrix(train_int)
#    print train_label
    train_label = hv.transform(train_label)
    train = hstack((train_int, train_label))
#    print train_label
#    print train
    return target, train
Code Example #5
File: ffm.py Project: PKostya/kaggle
def to_ffm(df, outfile, ycol, num_columns = []):
    df = df.copy()
    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    bs = 2**10
    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'
    with open(outfile, 'w') as out:
        pb = progressbar.ProgressBar(maxval=(df.shape[0]+bs+1) // bs).start()
        for i in xrange((df.shape[0]+bs+1) // bs):
            pb.update(i)
            s = slice(i*bs, (i+1)*bs)
            if ycol in df.columns:
                Xh = np.asarray(df.iloc[s].drop([ycol], axis=1).drop(num_columns,axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = df.iloc[s][ycol].values.astype('int')
            else:
                Xh = np.asarray(df.iloc[s].drop(num_columns,axis=1).astype('str'))
                Xv = np.asarray(df.iloc[s][num_columns].astype('float'))
                y = np.zeros((bs,))
            Xt = scipy.sparse.hstack([Xv,hasher.transform(Xh)]).tocsr()
            for j in xrange(Xt.shape[0]):
                span = slice(Xt.indptr[j], Xt.indptr[j+1])
                row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
                st = " ".join(value_pattern % (j + one_based, fe + one_based, x) for j, fe, x in row if np.isnan(x) == False)
                feat = (y[j], st)
                out.write((line_pattern % feat).encode('ascii'))
        pb.finish()
Code Example #6
File: multiclassify.py Project: nlproc/splunkml
def process_records(records, fields, target, textmodel=None):
	tokenize = CountVectorizer().build_analyzer()

	input = None
	X = None
	y_labels = []

	for i, record in enumerate(records):
		nums = []
		strs = []
		y_labels.append(record.get(target))

		for field in fields:
			if is_number(record.get(field)):
				nums.append(record[field])
			else:
				strs.append(str(record.get(field) or "").lower())
		if strs:
			if input is None:
				input = StringIO.StringIO()
			print >> input, " ".join(tokenize(" ".join(strs)))
		if nums:
			if X is None:
				X = sp.lil_matrix((len(records),len(nums)))
			X[i] = np.array(nums, dtype=np.float64)

	if input is not None:
		if X is not None:
			X_2 = X.tocsr()
		else:
			X_2 = None

		if isinstance(textmodel,basestring):
			if textmodel == 'lsi':
				corpus = TextCorpus(input)
				textmodel = LsiModel(corpus, chunksize=1000)
			elif textmodel == 'tfidf':
				corpus = TextCorpus(input)
				textmodel = TfidfModel(corpus)
			elif textmodel == 'hashing':
				textmodel = None
				hasher = FeatureHasher(n_features=2 ** 18, input_type="string")
				input.seek(0)
				X = hasher.transform(tokenize(line.strip()) for line in input)
		if textmodel:
			num_terms = len(textmodel.id2word or getattr(textmodel, 'dfs',[]))
			X = corpus2csc(textmodel[corpus], num_terms).transpose()

		if X_2 is not None:
			# print >> sys.stderr, "X SHAPE:", X.shape
			# print >> sys.stderr, "X_2 SHAPE:", X_2.shape
			X = sp.hstack([X, X_2], format='csr')

	elif X is not None:
		textmodel = None
		X = X.tocsr()

	print >> sys.stderr, "X SHAPE:", X.shape

	return X, y_labels, textmodel
Code Example #7
File: textMining_hash.py Project: ai-se/SMOTE
def hash(mat, num_features):
    """
    hashing trick
    """
    hasher = FeatureHasher(n_features=num_features, non_negative=True)
    X = hasher.transform(mat)
    X = X.toarray()
    return X
Code Example #8
def test_feature_hasher_pairs():
    raw_X = (d.iteritems() for d in [{"foo": 1, "bar": 2}, {"baz": 3, "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 2], x1_nz)
    assert_equal([1, 3, 4], x2_nz)
Code Example #9
def test_hash_empty_input():
    n_features = 16
    raw_X = [[], (), iter(range(0))]

    h = FeatureHasher(n_features=n_features, input_type="string")
    X = h.transform(raw_X)

    assert_array_equal(X.A, np.zeros((len(raw_X), n_features)))
Code Example #10
File: _nl_conll_ner.py Project: NLeSC/xtas
def ner(tokens):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset."""

    global _model

    X = [_features(tokens, i) for i in range(len(tokens))]
    hasher = FeatureHasher(2**16, input_type="string")
    return zip(tokens, _model.predict(hasher.transform(X)))
Code Example #11
class Model:
    def __init__(self,numFeatures, learningRate, numEpochs, ppenalty="l1", mustShuffle=True):
        #Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='string')
        self.Classifier = SGDClassifier(penalty=ppenalty, loss='log', alpha=learningRate, n_iter = numEpochs, shuffle=mustShuffle)
    def train(self, gen,  v=False):

        i = 0
        for x, y in gen: #For each batch
            xHash = self.FH.transform(x) #hash trick
            y = np.array(y)            
##            for epoch in range(numEpochs):
            self.Classifier.partial_fit(xHash, y, [0,1])
            i += len(x)
            if v : print(str(datetime.now())[:-7] , "example:", i)
            
    def test(self, gen,  v=False):

        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        i = 0
        for x,y in gen:
            xHash = self.FH.transform(x) #hash trick
            p = self.Classifier.predict_proba(xHash)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))
            i += y.shape[0]
            if v : print(str(datetime.now())[:-7] , "example:", i)
        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def predictBatch(self, batch):
        hashedBatch = self.FH.transform(batch)
        prediction = self.Classifier.predict_proba(hashedBatch)
        return prediction
    def generatePrediction(self, generator):
        for xBatch, idBatch in generator:
            prediction = self.predictBatch(xBatch)
            yield prediction, idBatch
    def score(self, target, prediction):
        return llfun(target, prediction)
Code Example #12
def test_feature_hasher_pairs_with_string_values():
    raw_X = (iter(d.items()) for d in [{"foo": 1, "bar": "a"},
                                       {"baz": u"abc", "quux": 4, "foo": -1}])
    h = FeatureHasher(n_features=16, input_type="pair")
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = sorted(np.abs(x1[x1 != 0]))
    x2_nz = sorted(np.abs(x2[x2 != 0]))
    assert_equal([1, 1], x1_nz)
    assert_equal([1, 1, 4], x2_nz)

    raw_X = (iter(d.items()) for d in [{"bax": "abc"},
                                       {"bax": "abc"}])
    x1, x2 = h.transform(raw_X).toarray()
    x1_nz = np.abs(x1[x1 != 0])
    x2_nz = np.abs(x2[x2 != 0])
    assert_equal([1], x1_nz)
    assert_equal([1], x2_nz)
    assert_equal(x1, x2)
Code Example #13
def load_seq2seq(f, features, n_features=(2 ** 16)):
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _sequences(f, features, labels, lengths)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
Code Example #14
class ColumnHasherTransformer(BaseEstimator, TransformerMixin):
  def __init__(self, col):
    self.col = col
    self.fh = FeatureHasher(n_features=1024, input_type='dict')

  def fit(self, X, y=None):
    return self

  def transform(self, df):
    return self.fh.transform(df.loc[:,self.col]\
                             .apply(lambda x: {x: 1}).values)
Code Example #15
File: models.py Project: mrshu/diaqres
class GraphemeBasedModel(DiacriticsRestorationModel):
    def __init__(self, window=5, input_classes=None):
        self.window = window
        self.input_classes = input_classes

    def train(self, corpus, classes=None, chunk_size=100000):
        self.vectorizer = FeatureHasher(non_negative=True,
                                        n_features=len(classes)*2*self.window,
                                        input_type='pair')
        self.clf = MultinomialNB()
        i = 0
        j = 0
        X = []
        Y = []
        for x, y in corpus:
            if x[self.window][1] in self.input_classes:
                X.append(x)
                Y.append(y)
                i += 1
            if i < chunk_size:
                continue

            j += 1
            click.echo("Running iteration {}".format(j))

            X = self.vectorizer.transform(X)
            self.clf.partial_fit(X, Y, classes)
            X = []
            Y = []
            i = 0

    def restore(self, string):
        corpus = []
        out = ''
        for x, y in string_to_grapheme_corpus(string, self.window):
            if x[self.window][1] in self.input_classes:
                x = self.vectorizer.transform([x])
                out += self.clf.predict(x)[0]
            else:
                out += y
        return out
Code Example #16
File: bandits.py Project: stoddardg/mab_simulation
def hash_features(features, arm_ids, use_id=True):
    n_features = np.shape(features)[1]
    feature_names = [str(x) for x in np.arange(n_features)]
    all_features = []
    for arm_id, feature_set in zip(arm_ids, features):
        temp_features = zip(feature_names, feature_set)
        if use_id == True:
            temp_features.append(("id_"+str(arm_id), 1))
        all_features.append(temp_features)

    f = FeatureHasher(input_type='pair')
    return f.transform(all_features)
Code Example #17
File: transform.py Project: grodrigues3/IssueLabeler
def encode_titles(titles, num_features=2**14):
  '''
  Encode the titles formatted as a string as numerical values using
  the 'hashing trick'.
  The size of the feature vector can be specified using the
  num_features parameter'
  '''
  myHasher = FeatureHasher(input_type='string',
                           n_features= num_features,
                           non_negative=True)
  featureMatrix = myHasher.transform(titles)
  return featureMatrix, myHasher
Code Example #18
File: file.py Project: PKostya/kaggle
def dump_libffm_format(X, y, f):
    one_based = True
    hasher = FeatureHasher(input_type='string', non_negative=True)
    Xt = hasher.transform(X)
    value_pattern = u'%d:%d:%.16g'
    line_pattern = u'%d %s\n'
    for i in xrange(Xt.shape[0]):
        span = slice(Xt.indptr[i], Xt.indptr[i+1])
        row = zip(range(len(Xt.indices[span])), Xt.indices[span], Xt.data[span])
        s = " ".join(value_pattern % (j + one_based, fe, x) for j, fe, x in row)
        feat = (y[i], s)
        f.write((line_pattern % feat).encode('ascii'))
Code Example #19
File: sequences.py Project: Sandy4321/featkit
class SequenceHasher(TransformerMixin):
    """ encodes a sequeces xyz as xy,yz,z"""
    def __init__(self, base_feature_name="index"):
        self.base_feature_name = base_feature_name
        self.hasher = FeatureHasher(input_type="pair")

    def fit(self, X, y=None):
        return self  # sklearn transformers return self from fit so fit_transform works

    def transform(self, X, y=None):
        f_name = self.base_feature_name
        seq = (((f_name + str(i), v) for i, v in enumerate(x)) for x in X)
        return self.hasher.transform(seq)
Code Example #20
    def predictUserScore(self, body, tags, fgen, users):
        featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair')
        # document features
        featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)]
        # additional features
        featureVector.append(("Length", 1))
        featureVector.append(("Score", 1))
        featureVector.append(("Accepted", 1))
        featureVector.append(("OwnerRep", 1))

        X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]])
        scores = [score for index, score in enumerate(self.cf.decision_function(X)[0]) if int(self.cf.classes_[index]) in users]
        return scores
Code Example #21
def main():

    # Uncomment the following line to use a larger set (11k+ documents)
    # categories = None

    print(__doc__)
    print("Usage: %s [n_features_for_hashing]" % sys.argv[0])
    print("The default number of features is 2**18.")
    print()

    try:
        n_features = int(sys.argv[1])
    except IndexError:
        n_features = 2 ** 18
    except ValueError:
        print("not a valid number of features: %r" % sys.argv[1])
        sys.exit(1)

    print("Loading 20 newsgroups training data")
    categories = loadCategories()
    raw_data, data_size_mb = loadData(categories)

    print("DictVectorizer")
    t0 = time()
    vectorizer = DictVectorizer()
    vectorizer.fit_transform(token_freqs(d) for d in raw_data)
    report(data_size_mb, len(vectorizer.get_feature_names()), t0)

    print("FeatureHasher on frequency dicts")
    t0 = time()
    hasher = FeatureHasher(n_features=n_features)
    X = hasher.transform(token_freqs(d) for d in raw_data)
    report(data_size_mb, n_nonzero_columns(X), t0)

    print("FeatureHasher on raw tokens")
    t0 = time()
    hasher = FeatureHasher(n_features=n_features, input_type="string")
    X = hasher.transform(tokens(d) for d in raw_data)
    report(data_size_mb, n_nonzero_columns(X), t0)
Code Example #22
File: datasets.py Project: 1oscar/seqlearn
def load_conll(f, features, n_features=(2 ** 16), split=False):
    """Load CoNLL file, extract features on the tokens and vectorize them.

    The CoNLL file format is a line-oriented text format that describes
    sequences in a space-separated format, separating the sequences with
    blank lines. Typically, the last space-separated part is a label.

    Since the tab-separated parts are usually tokens (and maybe things like
    part-of-speech tags) rather than feature vectors, a function must be
    supplied that does the actual feature extraction. This function has access
    to the entire sequence, so that it can extract context features.

    A ``sklearn.feature_extraction.FeatureHasher`` (the "hashing trick")
    is used to map symbolic input feature names to columns, so this function
    does not remember the actual input feature names.

    Parameters
    ----------
    f : {string, file-like}
        Input file.
    features : callable
        Feature extraction function. Must take a list of tokens l that
        represent a single sequence and an index i into this list, and must
        return an iterator over strings that represent the features of l[i].
    n_features : integer, optional
        Number of columns in the output.
    split : boolean, default=False
        Whether to split lines on whitespace beyond what is needed to parse
        out the labels. This is useful for CoNLL files that have extra columns
        containing information like part of speech tags.

    Returns
    -------
    X : scipy.sparse matrix, shape (n_samples, n_features)
        Samples (feature vectors), as a single sparse matrix.
    y : np.ndarray, dtype np.string, shape n_samples
        Per-sample labels.
    lengths : np.ndarray, dtype np.int32, shape n_sequences
        Lengths of sequences within (X, y). The sum of these is equal to
        n_samples.
    """
    fh = FeatureHasher(n_features=n_features, input_type="string")
    labels = []
    lengths = []

    with _open(f) as f:
        raw_X = _conll_sequences(f, features, labels, lengths, split)
        X = fh.transform(raw_X)

    return X, np.asarray(labels), np.asarray(lengths, dtype=np.int32)
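A hedged usage sketch, assuming a whitespace-tokenized CoNLL file; the feature function and the file name below are hypothetical:

def word_features(tokens, i):
    # hypothetical extractor: emit the raw token and a lowercased copy as string features
    yield "w=" + tokens[i]
    yield "lower=" + tokens[i].lower()

# X, y, lengths = load_conll("train.conll", word_features)  # file name is made up
# X.shape[0] == len(y) == lengths.sum(), i.e. one row per token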
Code Example #23
class Model:
    def __init__(self,numFeatures, learningRate, mustShuffle=True):
        #Init scikit models
        self.FH = FeatureHasher(n_features=numFeatures, input_type='pair')
        self.Classifier = SGDClassifier(loss='log', alpha=learningRate, shuffle=mustShuffle)
    def train(self, gen, numEpochs,  v=False):
        i = 0
        for x, y in gen: #For each batch
            xHash = self.FH.transform(x) #hash trick
            y = np.array(y)            
            for epoch in range(numEpochs):
                self.Classifier.partial_fit(xHash, y, [0,1])
                
            if v and (i % (numBatches/60)) == 0: print(datetime.now(), "example:", i*sizeBatch)
            i+=1
    def test(self, gen,  v=False):
        
        #init target and prediction arrays
        ytot = np.array([])
        ptot = np.array([])
        #Get prediction for each batch
        for batch in gen:
            data = list(batch) #store batch in memory for prediction
            x, y = data[0], np.array(data[1])
            x = self.FH.transform(x)
            p = self.Classifier.predict_proba(x)
            p = p.T[1].T #Keep column corresponding to probability of class 1
            #Stack target and prediction for later analysis
            ytot = np.hstack((ytot, y)) 
            ptot = np.hstack((ptot, p))

        if v: print("Score:", self.score(ytot, ptot))
        
        return (ytot, ptot)
    def score(self, target, prediction):
        return llfun(target, prediction)
Code Example #24
    def predictUsers(self, body, tags, fgen, n = 3):
        featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair')
        # document features
        featureVector = [(str(dim), value) for dim, value in fgen.getDocumentFeatures(body, tags)]
        # additional features
        featureVector.append(("Length", 1))
        featureVector.append(("Score", 1))
        featureVector.append(("Accepted", 1))
        featureVector.append(("OwnerRep", 1))

        X = featureHasher.transform([[(str(dim), value) for dim, value in featureVector]])
        userIds = [int(self.cf.classes_[index]) for index, score in sorted(enumerate(self.cf.decision_function(X)[0]), key=lambda x:x[1], reverse=True)][:n]
        # print(userIds)
        # print(self.cf.predict(X))

        return [Users.get(Users.id == userId) for userId in userIds]
Code Example #25
File: ClientFeaturizer.py Project: kumaran-5555/ML
    def process(self):

        header = self.inputFile.readline()

        ids = []
        self.features = []

        count = 0
        for line in self.inputFile:
            count += 1
            fields = line.split(',')

            id = fields[0]
            names = {}
            name = Kaggle_Grupo.Utils.StringNormalize(fields[1])

            for i in name.split(' '):
                names[i] = 1

            ids.append(id)
            self.features.append(names)




        featureHasher = FeatureHasher(n_features=2**12, dtype=np.uint16)

        self.features = featureHasher.transform(self.features)
        self.features = self.features.toarray()

        self.features = self.encode(width=24)


        headerFields  = ["Cliente_ID"]

        for i in range(self.features.shape[1]):
            headerFields.append('ClientName_{}'.format(i))

        headerFields = "\t".join(headerFields)


        self.outputFile.write(headerFields+'\n')

        for i in range(self.features.shape[0]):
            self.outputFile.write('{}\t{}\n'.format(ids[i], ('\t'.join(self.features[i].astype('str')).replace('False', '0').replace('True', '1'))))
Code Example #26
class FeatureHasherModel:
	def fit(self, max_features):
		self.model = FeatureHasher(input_type = "string", n_features = max_features)

	def transform(self, dataframe, col_name):
		hashed = self.model.transform(dataframe)

		df = pd.DataFrame(hashed.toarray())
		df.columns = ["%s_%d" % (col_name, author_num) for author_num in range(0, self.model.n_features)]
		df.index = dataframe.index

		return df

	def get_model(self):
		return self.model

	def set_model(self, model):
		self.model = model
Code Example #27
def gen_cinput(origindata, pooldata = [],threshold = 5):
	origin_feas = gen_feature_data(origindata)
	pool_feas = gen_feature_data(pooldata)

	feas_X = []
	label_Y = []
	s  = set()
	for seq in origin_feas:
		feas_X.extend([item["F"] for item in seq])
		for item in seq:
			s.update(item["F"])
		label_Y.extend([item["L"] for item in seq])

	assert len(feas_X) == len(label_Y)
	print "original  data  data  num   :   "+str(len(feas_X))

	
	feas_X_2 = []
	label_Y_2 = []
	for seq_id, seq in enumerate(pool_feas):
		for token_id, token in enumerate(seq):
			if pooldata[seq_id][2][token_id] == 1:
				feas_X_2.append(token["F"])
				s.update(token["F"])
				label_Y_2.append(token["L"])



	print "pool data  data  num   :   "+str(len(feas_X_2))


	print "original feature num   ................ "+str(len(s))

	X = feas_X + feas_X_2
	X = featurefilter(X, threshold)
	print X[:2]


	Y = label_Y + label_Y_2
	h = FeatureHasher(input_type = "string", non_negative = True)

	X = h.transform(X)

	return X ,Y, h 
Code Example #28
def test_feature_hasher_strings():
    raw_X = [[u"foo", "bar", "baz", "foo"], [u"bar", "baz", "quux"]]  # note: duplicate

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, sum(len(set(x)) for x in raw_X))
Code Example #29
    def learn(self, fgen, postLimit=None):
        Parent = Posts.alias()
        query = Posts.select().join(Parent, on=(Posts.parentid == Parent.id)).where(Posts.posttypeid == 2 & Parent.forevaluation == 0)
        if postLimit is not None:
            query = query.limit(postLimit)
        count = query.count()
        print("Learning {0} questions".format(count))

        allClasses = numpy.array([user.id for user in Users.select()])

        maxUserRep = float(Users.select(peewee.fn.Max(Users.reputation)).scalar())

        featureHasher = FeatureHasher(n_features = fgen.getMaxDimSize()+4, input_type = 'pair')
        featureMatrix = []
        classList = []
        for i, answer in enumerate(query):
            if answer.owneruserid is None:
                continue
            print("Generating feature vector for id {0}".format(answer.id))
            # document features
            # featureVector = fgen.getDocumentFeatures(answer.parentid.title + answer.parentid.body + answer.body, tagIds)
            featureVector = fgen.getAnswerFeatures(answer)

            featureVector = [(str(dim), value) for dim, value in featureVector]
            # additional features
            maxScore = Posts.select(peewee.fn.Max(Posts.score)).where(Posts.parentid == answer.parentid).scalar()
            maxLength = max(len(post.body) for post in Posts.select().where(Posts.parentid == answer.parentid))
            featureVector.append(("Length", (len(answer.body)/float(maxLength))))
            featureVector.append(("Score", 1 if maxScore == 0 else (answer.score/float(maxScore))))
            featureVector.append(("Accepted", 1 if answer.id == answer.parentid.acceptedanswerid else 0))
            featureVector.append(("OwnerRep", answer.owneruserid.reputation/maxUserRep))

            featureMatrix.append(featureVector)
            classList.append(answer.owneruserid.id)
            if len(featureMatrix) == self.batchSize or i == count-1:
                print("Partial fitting classifier".format(answer.id))
                X = featureHasher.transform(featureMatrix)
                Y = numpy.array(classList)
                self.cf.partial_fit(X, Y, classes=allClasses)
                allClasses = None
                featureMatrix = []
                classList = []
Code Example #30
def test_feature_hasher_strings():
    # mix byte and Unicode strings; note that "foo" is a duplicate in row 0
    raw_X = [["foo", "bar", "baz", "foo".encode("ascii")], ["bar".encode("ascii"), "baz", "quux"]]

    for lg_n_features in (7, 9, 11, 16, 22):
        n_features = 2 ** lg_n_features

        it = (x for x in raw_X)  # iterable

        h = FeatureHasher(n_features, non_negative=True, input_type="string")
        X = h.transform(it)

        assert_equal(X.shape[0], len(raw_X))
        assert_equal(X.shape[1], n_features)

        assert_true(np.all(X.data > 0))
        assert_equal(X[0].sum(), 4)
        assert_equal(X[1].sum(), 3)

        assert_equal(X.nnz, 6)
Code Example #31
def main():
    # start timer
    time.clock()

    # open training data
    infile = open("../data/gold/simple_gold_revised.txt", "r")
    train_sents = infile.readlines()
    infile.close()
    train_sents = train_sents[100:]
    # open CMU training data
    infile = open("../data/cmu_all_gold.txt")
    cmu_train_sents = infile.readlines()
    infile.close()

    window = 1

    num_corpora = 0

    sent_tokens = preprocess(train_sents, window)
    cmu_sent_tokens = preprocess(cmu_train_sents, window)
    all_tokens = sent_tokens
    all_tokens.extend(cmu_sent_tokens[:len(cmu_sent_tokens) / 2])

    del train_sents
    X, y = get_features(all_tokens, window)
    print('Got Features')
    del all_tokens
    features, results = get_feature_dict(X, y)
    print('Got Feature Dict')
    X_tweets = X[0:len(sent_tokens)]
    X_cmu = X[len(sent_tokens):]
    print('Split training Data')
    print('Training on Tweets...')

    from sklearn.feature_extraction import FeatureHasher
    hasher = FeatureHasher(input_type='string')

    X_new = []
    for row in X_tweets:
        new_row = []
        new_row.extend(row)
        for element in row:
            new_row.append(element + '_*tweet*')
        X_new.append(new_row)
    for row in X_cmu:
        new_row = []
        new_row.extend(row)
        for element in row:
            new_row.append(element + '_*cmu*')
        X_new.append(new_row)

    x_vec = hasher.transform(X_new)
    y_vec = []
    for y_i in y:
        new_y = 0
        if y_i is not None and y_i in results:
            new_y = results[y_i]
        y_vec.append(new_y)

    clf = svm.LinearSVC(C=0.15)
    clf.fit(x_vec, y_vec)

    print('Done')
    print('Training on CMU...')
    print('Done')
    del X
    del y
    del sent_tokens

    ## this writes the classifier to a binary
    #from sklearn.externals import joblib
    #joblib.dump(clf, 'classifiers/cmu+gang_nn_hot.pkl')

    ## This reads the classifier from a binary
    #from sklearn.externals import joblib
    #clf = joblib.load('classifiers/cmu+gang_nn_daume.pkl')

    print('Trained Classifier')

    # open Corpus development data
    #    infile = open("../data/content/content_revised_tokenized.txt", "r")
    infile = '../../data/gakirah/gakirah_aggress_loss.csv'
    print('Reading Dev')
    f = open(infile, 'rU')
    reader = csv.DictReader(f)
    train_Dev = []
    for row in reader:
        tweet = row['CONTENT'].decode('utf-8')
        train_Dev.append(tweet)

    f.close()

    train_dev_words = []
    for sentence in train_Dev:
        train_dev_words.append(sentence.rstrip().split())
    dev_tokens = [None] * len(train_Dev)
    for i in range(len(dev_tokens)):
        tokens = train_Dev[i].split()
        for j in range(window):
            tokens.insert(0, '*\\*')
            tokens.append('STOP\\STOP')
        for j in range(len(tokens)):
            tokens[j] = list(tokens[j].split('\\'))
        dev_tokens[i] = tokens

    print('Testing Dev')
    tagged_sents = tag_sents(clf,
                             dev_tokens,
                             features,
                             results,
                             window,
                             num_corpora,
                             hasher=hasher)
    print('Writing Results')
    #    output_tagged(tagged_sents, '../results/svm_trained_on_alone+cmu.txt')
    output_tagged(tagged_sents,
                  '../../results/pos_tagged_gakirah_aggress_loss.txt')
    print("Time: " + str(time.clock()) + ' sec')
Code Example #32
# Custom Implementation
def apply_hashing_trick(feature_dict, vector_size=2000):
    # Create an array of zeros of length 'vector_size'
    new_features = [0 for x in range(vector_size)]
    # iterate over every feature in the feature dictionary
    for key in feature_dict:
        # get the index into the new feature array
        array_index = hash(key) % vector_size
        # add the value of the feature to the new feature array
        # at the index we got using the hashing trick
        new_features[array_index] += feature_dict[key]
    return new_features


# Implementing FeatureHasher
from sklearn.feature_extraction import FeatureHasher
hasher = FeatureHasher(n_features=20)
features = [{'how': 1, 'now': 2, 'brown': 4}, {'cow': 2, '.': 5}]
hashed_features = hasher.transform(features)
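For comparison, a hedged follow-up using the objects defined above: the custom function fills a dense 20-bucket list per dictionary, while FeatureHasher returns a (2, 20) sparse matrix. Note that in Python 3 the built-in hash() is randomized per process unless PYTHONHASHSEED is set, so the custom buckets are not stable across runs, whereas FeatureHasher uses a fixed-seed hash.

custom_features = [apply_hashing_trick(d, vector_size=20) for d in features]
print(len(custom_features[0]))   # 20: one dense bucket list per sample
print(hashed_features.shape)     # (2, 20): sparse matrix from FeatureHasher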
Code Example #33
    ]
    cat_vars = [
        'categorical8', 'categorical14', 'categorical5', 'categorical12',
        'categorical11', 'categorical10', 'categorical7', 'categorical1',
        'categorical15', 'categorical18', 'categorical13', 'categorical16'
    ]

    #merge cols
    X = chunk[integer_cols + cat_vars]
    y = chunk['label']

    # replace nulls
    X.update(X[integer_cols].fillna(0))
    X.update(X[cat_vars].fillna('NULL'))

    X_cat = feature_hasher.transform(X[cat_vars].to_dict('records')).toarray()

    if i == 0:
        scaler.fit(X[integer_cols])
        X_integer = scaler.transform(X[integer_cols])
        X = np.hstack((X_cat, X_integer))

        dimensionality_reduction = svd.fit(X)
        principal_components = dimensionality_reduction.transform(X)
        logistic.fit(principal_components, y)

        dump(logistic, 'first_pca_logit.pkl')
    else:
        X_integer = scaler.transform(X[integer_cols])
        X = np.hstack((X_cat, X_integer))
Code Example #34
        chunk['ToSecond_scaled']=(chunk.ToSecond-0)/(84239-0)
        chunk['Cyclic_scaled']=(chunk.Cyclic-0)/(42480-0)

        selected_columns = ["ip_scaled",'app','device','os','channel','AMPM','Cyclic_scaled','click_day_scaled','click_hour_scaled','is_attributed']
        chunk=chunk[selected_columns]
    

        #training will be done batch by batch even within each chunk
        for ii in range(0, chunksize//batchsize):

            X = chunk.iloc[ii*batchsize:(ii+1)*batchsize,:-1]
            y = chunk.iloc[ii*batchsize:(ii+1)*batchsize,-1]
            i+=1

            # now let's create the hashed features using the hasher object created earlier
            X_train = fh.transform(np.asarray(X.astype(str)))
            #clf.fit(X_train,y,xgb_model=None)
            
            #for xgboost model we need sparse matrix:
            dtrain=xgb.DMatrix(X_train, label=y)
            param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'multi:softprob'}
            modelXG=xgb.train(param,dtrain,xgb_model='xgbmodel')
            # each time a batch partially trains the model, we save it and hand it to the next step to continue from there; that is how xgboost is trained incrementally
            modelXG.save_model("xgbmodel")

            #clf.n_estimators += 1

            #every 10 chunks we would like to evaluate and see how we are doing on unseen validation data:
            if(i%10==0):

                print(i)
Code Example #35
if True and FIT:
    est = LogisticRegression(multi_class='auto', solver='liblinear')
    t1 = time.time()
    est.fit(X_train, y_train)

    print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}")

print("\nFeatureHasher")
print("FeatureHasher on frequency dicts")
n_features=1048576
#n_features=int(1048576 / 2)
hasher = FeatureHasher(n_features=n_features)
t1 = time.time()
X_train = hasher.fit_transform(token_freqs(d) for d in X_train_text)
X_test = hasher.transform(token_freqs(d) for d in X_test_text)
print(f"FeatureHasher XX shape {X_train.shape} with {X_train.data.nbytes:,} bytes and nnz {X_train.nnz:,} in {time.time()-t1}")

if FIT:
    est = LogisticRegression(multi_class='auto', solver='liblinear')
    t1 = time.time()
    est.fit(X_train, y_train)
    print(f"Score {est.score(X_test, y_test)} in {time.time()-t1}")


#NGRAM_MAX 1

#CountVectorizer
#CountVectorizer shape (8485, 112359) with 10,723,592 bytes and nnz 1,340,449
#Vocab length 112359
#Score CountVectorizer 0.8882997525627431
Code Example #36
num_train = train.ix[:, train.applymap(
    np.isreal).all(axis=0)]  # Get numerical features
cat_train = train.ix[:, np.invert(train.applymap(
    np.isreal).all(axis=0))]  # Get categorical features

categorical_type = 'dictvectorizer'
cat_dict = (dict(cat_train.ix[x]) for x in range(cat_train.shape[0])
            )  # Categorical data generator faster than pandas
# cat_dict = cat_train.to_dict(orient='records')  # Categorical data dict
if categorical_type == 'dictvectorizer':
    vec = DictVectorizer()
    cat_data = vec.fit_transform(cat_dict).toarray()
elif categorical_type == 'featurehasher':
    feat_hash = FeatureHasher()
    hasher = FeatureHasher(input_type='string', n_features=2**8)
    cat_data = hasher.transform(cat_dict)
elif categorical_type == 'onehotencoder':
    le_data = np.empty(cat_train.shape)
    for col in range(cat_train.shape[1]):
        le = LabelEncoder()
        le_data[:, col] = le.fit_transform(cat_train.ix[:, col])
    enc = OneHotEncoder()
    cat_data = enc.fit_transform(le_data).toarray()
else:
    raise Exception('categorical_type not supported!')

np_data = np.array(num_train)
ids = np.array(np_data[:, -2], dtype=np.int)
x_tr = np_data[:, :-2]
x_tr_cat = cat_data
y_tr = np_data[:, -1]
Code Example #37
    distinct_users_in_ratings = review_df['user_id'].unique()
    distinct_items_in_ratings = review_df['business_id'].unique()
    user_df = user_df[user_df['user_id'].isin(distinct_users_in_ratings)]
    item_df = item_df[item_df['business_id'].isin(distinct_items_in_ratings)]

    # deal with high-cardinality features: category encoding
    # TODO: PCA, linear transformation, mean encoder
    if is_feature_hasher:
        print('feature hashing ...')
        mlb = MultiLabelBinarizer()
        encodings = mlb.fit_transform( [ cat_str.split(',') for cat_str in item_df['categories'].values ] )
        fea_hasher = FeatureHasher(n_features=hash_dim)
        # wrap 'encodings' into dict
        all_categories = list(mlb.classes_)
        encode_dict_list = [ dict(zip(all_categories, list(instance_encoding)))  for instance_encoding in encodings] 
        hash_encodings = fea_hasher.transform(encode_dict_list).toarray()
    else:
        mlb = MultiLabelBinarizer()
        hash_encodings = mlb.fit_transform( [ cat_str.split(',') for cat_str in item_df['categories'].values ] )

    # Build graph
    print('building graph ...')
    graph_builder = PandasGraphBuilder()
    graph_builder.add_entities(user_df, 'user_id', 'user')
    graph_builder.add_entities(item_df, 'business_id', 'item')
    graph_builder.add_binary_relations(review_df, 'user_id', 'business_id', 'reviewed')
    graph_builder.add_binary_relations(review_df, 'business_id', 'user_id', 'reviewed-by')

    g = graph_builder.build()

    print('Assigning feature ...')
Code Example #38
File: dataset.py Project: MLDaily/automobile
#     else:
#         chunk.to_csv('data/test.csv', header=False, index=False, mode='a')

# Train classifier
# clf = RandomForestRegressor()
# clf = LogisticRegression()
clf = LinearRegression()
# clf = KNeighborsRegressor()
# clf = MLPRegressor()
# clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
all_classes = np.array([0, 1])

y_train = file["price"]
train = file[cols[:-1]]
# train.drop(["normalised-losses"], axis=1, inplace=True)
Xcat = fh.transform(np.asarray(train.astype(str)))
print 'Training'
clf.fit(Xcat, y_train)

test_file = pd.read_csv('data/test.csv', names=cols)
# test_file = test_file.apply(lambda x: x.fillna(method='pad'), axis=0)

test = test_file[cols[:-1]]
y_test = test_file["price"]
# test.drop(["normalised-losses"], axis=1, inplace=True)
X_test = fh.transform(np.asarray(test.astype(str)))

y_pred = clf.predict(X_test)

for i, value in enumerate(y_test):
    print value, y_pred[i]
Code Example #39
File: bayes_test.py Project: laurelhhq/Image_Emotion
# print indices[indptr[0]:indptr[1]]

# test MultinomialNB in sklearn
# use a large n_features to avoid hash collisions
# even a very large value is fine: the matrix is sparse, so it uses little memory
train_datas = [{
    'monkey': 1,
    'dog': 1,
    'cat': 2,
    'elephant': 4
}, {
    'dog': 2,
    'run': 5
}]
feature_hasher = FeatureHasher(n_features=2**20, non_negative=True)
train_datas = feature_hasher.transform(train_datas)
"""X = np.array([[1, 2, 4, 1, 1, 1],
 [3, 2, 4, 2, 2, 3],
 [2, 2, 3, 4, 4, 1],
 [2, 0, 3, 2, 3, 1],
 [2, 0, 0, 3, 3, 3],
 [2, 3, 1, 0, 3, 4]])"""
class_label = np.array([1, 2])
# tune the smoothing factor
clf = MultinomialNB(alpha=0.01)
train = clf.fit(train_datas, class_label)
test_datas = [{'monkey': 3, 'mouse': 1}]
test_datas = feature_hasher.transform(test_datas)
test = clf.predict(test_datas)
print train_datas
print test_datas
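To illustrate the collision point made in the comments above, a hedged sketch (not from the original file): with very few buckets, distinct tokens can land in the same column, while a large sparse space makes collisions unlikely.

from sklearn.feature_extraction import FeatureHasher

docs = [{'monkey': 1, 'dog': 1, 'cat': 2, 'elephant': 4}]
tiny = FeatureHasher(n_features=4)         # deliberately tiny: collisions likely
roomy = FeatureHasher(n_features=2 ** 20)  # plenty of buckets: collisions unlikely
print(tiny.transform(docs).nnz)    # may be < 4 when distinct tokens share a column
print(roomy.transform(docs).nnz)   # almost certainly 4: one column per feature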
Code Example #40
from sklearn.feature_extraction import FeatureHasher
import numpy as np
# h = FeatureHasher(n_features=100000)
h = FeatureHasher(n_features=4)
D = [{'dog': 1, 'cat': 2, 'elephant': 4}, {'dog': 2, 'run': 5}]
f = h.transform(D)
print(f.toarray())

print('===')

# default input format : (feature_name, value)

D = [
    {
        'dog': 1,
        'cat': 2,
        'elephant': 4
    },
    {
        'dog': 2,
        'run': 5
    },
    {
        'dog': 1
    },
    {
        'run': 5
    },
    {
        'cat': 2
    },
Code Example #41
    return [float(d.weekday()), float(d.hour)]


fh = FeatureHasher(n_features=2**20, input_type="string")

# Train classifier
clf = RidgeClassifier()
train = pd.read_csv("train/subtrain.csv", chunksize=100000, iterator=True)
all_classes = np.array([0, 1])
for chunk in train:
    y_train = chunk["click"]
    chunk = chunk[cols]
    chunk = chunk.join(
        pd.DataFrame([dayhour(x) for x in chunk.hour], columns=["wd", "hr"]))
    chunk.drop(["hour"], axis=1, inplace=True)
    Xcat = fh.transform(np.asarray(chunk.astype(str)))
    clf.fit(Xcat, y_train)

# Create a submission file
usecols = cols + ["id"]
X_test = pd.read_csv("test/mtest.csv", usecols=usecols)
X_test = X_test.join(
    pd.DataFrame([dayhour(x) for x in X_test.hour], columns=["wd", "hr"]))
X_test.drop(["hour"], axis=1, inplace=True)

X_enc_test = fh.transform(np.asarray(X_test.astype(str)))

y_act = pd.read_csv("test/mtest.csv", usecols=['click'])
y_pred = clf.predict(X_enc_test)

with open('logloss.txt', 'a') as f:
Code Example #42
def feature_hash(X, n_features=1000):
    h = FeatureHasher(n_features=n_features)
    return h.transform(X)
Code Example #43
def train_and_score(_tr, _vv, _vp, model_sizes, colors=None):
    all_venues, train_pairs, valid_pairs = generate_interaction(_tr, _vv, _vp)

    print "Creating models"

    plt.figure(figsize=(10, 10))
    lw = 2
    roc_aucs = []
    for size, color in zip(model_sizes, colors):
        extractor = FeatureHasher(n_features=2**size)
        model = SGDClassifier(loss="log", penalty="l2", alpha=0.001, n_jobs=-1)
        # model = BernoulliNB()

        print "Training"
        for i, (user, yay_venues) in enumerate(train_pairs.iteritems()):
            print "Training on user", i, user
            labels, yay_pairs, nay_pairs = generate_features(
                all_venues, yay_venues)
            yay_features, nay_features = extractor.transform(
                yay_pairs), extractor.transform(nay_pairs)
            features = sp.vstack([yay_features, nay_features])
            model.partial_fit(features, labels, classes=[0, 1])

        print "Testing"
        all_labels, all_preds, all_probas = [], [], []
        for i, (user, yay_venues) in enumerate(valid_pairs.iteritems()):
            print "Testing on user", i, user
            labels, yay_pairs, nay_pairs = generate_features(
                all_venues, yay_venues)
            all_labels.extend(labels)
            yay_features, nay_features = extractor.transform(
                yay_pairs), extractor.transform(nay_pairs)
            features = sp.vstack([yay_features, nay_features])
            preds, probas = model.predict(features), model.predict_proba(
                features)
            all_preds.extend(preds), all_probas.extend(probas[:, 1])

        print "Scoring"
        roc_auc = roc_auc_score(all_labels, all_probas)
        cm = confusion_matrix(all_labels, all_preds)
        print "Model size", size, "AUC", roc_auc
        print cm
        roc_aucs.append(roc_auc)
        fpr, tpr, _ = roc_curve(all_labels, all_probas)
        plt.plot(fpr,
                 tpr,
                 color=color,
                 lw=lw,
                 label='Model %d (area = %0.2f)' % (size, roc_auc))

    joblib.dump(model, 'model_logit_size%d.pkl' % size)
    np.save("labels_logit_size%d.npy" % size, all_labels)
    np.save("probas_logit_size%d.npy" % size, all_probas)

    plt.plot([0, 1], [0, 1], color='navy', lw=lw, ls='--', label='Luck')
    plt.xlim([-.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic for different model sizes')
    plt.legend(loc="lower right")
    # plt.savefig('../plots/model_nb.png')
    plt.tight_layout()
    plt.show()
    '''
Code Example #44
          'TEXT :' + token_dict[file]
      ])

      X_feature.append(feature)

      if 'positive' in file:
          Y_act_tag.append('positive')
      elif 'negative' in file:
          Y_act_tag.append('negative')
      elif 'neutral' in file:
          Y_act_tag.append('neutral')


''' convert the feature list into hashed features with FeatureHasher, which will be fed to the classifier'''
hasher = FeatureHasher(input_type='string')
X = hasher.transform(X_feature)
print(type(X))
Y_act_tag_test=[]

for dirName, subDir, files in os.walk(sys.argv[2]):
    for file in files:
        fopen=open(os.path.join(dirName, file), 'r')
        review=fopen.read()
        text = review.lower()
        final_text = text.translate(None , string.punctuation)
        token_dict_test[file] = final_text


X_feature_test=[]
for file in token_dict_test:
    feature = []
Code Example #45
    raw_list.append(filtered2)
    raw_res.append(1 if raw["vandal"] else 0)

    raw_list_opp.append({
        x: y * (-1)
        for x, y in filtered.items()
        if y < 0 and not check_rgb(x) and ' ' not in x
    })  #

    counter.tick()

print(len(raw_list))
from sklearn.naive_bayes import BernoulliNB
fh = FeatureHasher(2000000)
matrix = fh.transform(raw_list)
matrix_opp = fh.transform(raw_list_opp)

lr = LogisticRegression(solver='sag',
                        verbose=1,
                        class_weight="balanced",
                        max_iter=300,
                        C=0.01)
#lr = BernoulliNB()
lr.fit(matrix, raw_res)

lr2 = LogisticRegression(solver='sag',
                         verbose=1,
                         class_weight="balanced",
                         max_iter=300,
                         C=0.01)
Code Example #46
def main():

    # Read the data
    train_data = []
    train_labels = []
    test_data = []
    test_labels = []
    actualPos = 0
    actualNeg = 0
    actualNeu = 0

    distinct_words = set([])
    train_dir = {}
    train = []
    test = []
    test_dir = {}

    distinct_words_list = []

    train_pos = 0
    train_neg = 0
    train_neu = 0
    for root, directories, filenames in os.walk(sys.argv[1]):

        for each_filename in filenames:
            if each_filename.endswith(".txt"):

                path = root + '/' + each_filename

                with open(os.path.join(root, each_filename), 'r') as f:
                    tokens = f.read()

                    token_split = tokens.split()

                    # ---- if the folder is training set ----#
                    if "Train" in path:
                        feature = feature_extraction(token_split, train_dir,
                                                     distinct_words_list)

                        train.append(feature)
                        train_data.append(tokens)
                        if "positive" in path:
                            train_pos = train_pos + 1
                            train_labels.append("positive")
                        elif "negative" in path:
                            train_neg = train_neg + 1
                            train_labels.append("negative")
                        elif "neutral" in path:
                            train_neu = train_neu + 1
                            train_labels.append("neutral")
                    # ---- if the folder is development set ----#
                    elif "Dev" in path:
                        feature = feature_extraction(tokens, test_dir,
                                                     distinct_words_list)
                        test.append(feature)
                        test_data.append(tokens)

                        if "positive" in path:
                            actualPos = actualPos + 1
                            test_labels.append("positive")
                        elif "negative" in path:
                            actualNeg = actualNeg + 1
                            test_labels.append("negative")
                        elif "neutral" in path:
                            actualNeu = actualNeu + 1
                            test_labels.append("neutral")

    # print("actual pos",actualPos)
    # print("actual neg", actualNeg)
    # print("actual neutral", actualNeu)
    #
    #--applying featurehasher to input ---#
    hasher = FeatureHasher(input_type='string')
    X = hasher.transform(train)
    Y = hasher.transform(test)

    #---perform classification on linear svc()---#
    classifier = svm.LinearSVC()
    clf = classifier.fit(X, train_labels)
    results = clf.predict(Y)

    pos = 0
    neg = 0
    neu = 0
    print("Results for LinearSVC()")
    for each in results:
        if "positive" in each:
            pos = pos + 1
        elif "negative" in each:
            neg = neg + 1
        elif "neutral" in each:
            neu = neu + 1

    # print("pred pos is ",pos)
    # print("pred neg is ", neg)
    # print("pred neu is ", neu)
    #
    # print("train pos is ", train_pos)
    # print("train neg is ", train_neg)
    # print("train neu is ", train_neu)

    print(classification_report(test_labels, results))
Code Example #47
import pandas as pd
import json

# Load the first 10,000 reviews
with open('data/yelp/yelp_academic_dataset_review.json') as f:
    js = []
    for i in range(10000):
        js.append(json.loads(f.readline()))

review_df = pd.DataFrame(js)
# Assign the number of unique business_id values to m
m = len(review_df['business_id'].unique())

m

from sklearn.feature_extraction import FeatureHasher

# Hashing
h = FeatureHasher(n_features=m, input_type='string')
f = h.transform(review_df['business_id'])

# Confirm that the transformed features are hard to interpret
review_df['business_id'].unique().tolist()[0:5]

f.toarray()

# Confirm that the storage size of the transformed features has dropped sharply
from sys import getsizeof

print('Our pandas Series, in bytes: ', getsizeof(review_df['business_id']))
print('Our hashed numpy array, in bytes: ', getsizeof(f))
Code Example #48
File: svmtopics.py Project: ai-se/LDAClassification
def hash(mat, n_features=1000):
    hasher = FeatureHasher(n_features=n_features)
    X = hasher.transform(mat)
    X = X.toarray()
    return X
Code Example #49
def func1():
    h = FeatureHasher(n_features=3)
    D = [{'dog': 1, 'cat': 2, 'elephant': 4}, {'dog': 2, 'run': 5}]
    f = h.transform(D)
    print(f.toarray())
Code Example #50
        temp_list = []
        # rc contains the list of release codes; ignore the last one, as it refers to the "enter" key
        rc = list(map(int, temp_X.release_codes.split()))
        pp = list(map(int, temp_X.pp.split()))
        pr = list(map(int, temp_X.pr.split()))
        rp = list(map(int, temp_X.rp.split()))
        rr = list(map(int, temp_X.rr.split()))
        for j in range(0, len(rc) - 1):
            temp_list.append({
                'rc': rc[j],
                'pp': pp[j],
                'pr': pr[j],
                'rp': rp[j],
                'rr': rr[j]
            })
        print(hasher.transform(temp_list).todense())
        X_transformed.append(hasher.transform(temp_list).todense())

    X_transformed = pd.DataFrame(X_transformed)
    # X_transformed = X_transformed.fillna(method='pad', axis=1)
    with open(r'output.csv', 'w') as file:
        file.write(X_transformed.to_csv())

    print("==== After transformation =====")
    print("X_transformed shape: {}".format(X_transformed.shape))

    X_train, X_test, y_train, y_test = train_test_split(X_transformed,
                                                        y,
                                                        test_size=0.4,
                                                        random_state=0)
    print("X_train type: {}".format(type(X_train)))
Code Example #51
n_features = 100

print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in train)
duration = time() - t0
print("Found %d unique terms" % len(vectorizer.get_feature_names()))
print(train)
print()

print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in train)
duration = time() - t0
print("Found %d unique terms" % n_nonzero_columns(X))
print()

print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in train)
duration = time() - t0
print("Found %d unique terms" % n_nonzero_columns(X))
'''
for row in train:
    #if row not in black_list:
    print(train[row].describe())
    print("\n")
Code Example #52
File: model.py Project: NavneelSinghal/HCLHackIITK
    def predict(self, files):
        '''
        Return a vector of predicted values for the specified set of files.
        Assumes the convention 0=Benign, 1=Malware.
        '''
        assert self.model is not None

        # now extract features from file, hash them and use self.model to return predictions

        start_time = time()

        completed_files = 0
        feature_vector_list = []
        feature_dictionary_list = []
        print('Starting feature extraction')
        prev = None

        for _file in files:
            try:
                vector, dictionary = get_feature_vector(_file)
                prev = vector
            except:
                vector = prev
                dictionary = {}
            feature_dictionary_list.append(dictionary)
            feature_vector_list.append(vector)
            completed_files += 1
            print('Completed extracting features from ' +
                  str(completed_files) + ' files',
                  end='\r')

        print('')

        end_time = time()

        print('Feature extraction completed in ' + str(end_time - start_time) +
              ' seconds')

        print('Starting testing')

        start_time = time()

        features = 7000
        hasher = FeatureHasher(n_features=features)
        feature_x = hasher.transform(feature_dictionary_list).toarray()
        feature_x = np.concatenate((feature_x, np.array(feature_vector_list)),
                                   axis=1)
        feature_y = self.model.predict(feature_x)

        end_time = time()

        print('Testing completed in ' + str(end_time - start_time) +
              ' seconds')

        lump = lambda value: 1 if value > 0 else 0

        def transform(array):
            return np.fromiter((lump(element) for element in array),
                               array.dtype)

        return transform(feature_y)
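The predict method above hashes per-file dictionaries of features and concatenates the result with a fixed-length numeric vector before scoring. A minimal, self-contained sketch of just that combination step, with made-up feature names and a small hash width standing in for the project's get_feature_vector output and n_features=7000:

import numpy as np
from sklearn.feature_extraction import FeatureHasher

# toy stand-ins for the per-file feature dictionaries and raw vectors
feature_dicts = [{"CreateFileA": 3, "RegOpenKeyW": 1}, {"connect": 2}]
raw_vectors = [[12.0, 0.5], [3.0, 0.9]]

hasher = FeatureHasher(n_features=16)          # the snippet above uses 7000
hashed = hasher.transform(feature_dicts).toarray()
combined = np.concatenate((hashed, np.array(raw_vectors)), axis=1)
print(combined.shape)                          # (2, 18)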
コード例 #53
0
one.fit(X)
train=one.transform(X)

print('train data set has got {} rows and {} columns'.format(train.shape[0],train.shape[1]))


logistic(train,y)
from sklearn.feature_extraction import FeatureHasher
### %time

X_train_hash=X.copy()
for c in X.columns:
    X_train_hash[c]=X[c].astype('str')      
hashing=FeatureHasher(input_type='string')
train=hashing.transform(X_train_hash.values)

print('train data set has got {} rows and {} columns'.format(train.shape[0],train.shape[1]))


logistic(train,y)
### %time

X_train_stat=X.copy()
for c in X_train_stat.columns:
    if(X_train_stat[c].dtype=='object'):
        X_train_stat[c]=X_train_stat[c].astype('category')
        counts=X_train_stat[c].value_counts()
        counts=counts.sort_index()
        counts=counts.fillna(0)
        counts += np.random.rand(len(counts))/1000
コード例 #54
0
# In[28]:


train = train.drop('target', axis=1)


# In[29]:


dev = dev.drop('target', axis=1)


# In[30]:


hashed_train = hasher.transform(get_features(train, features))


# In[31]:


hashed_dev = hasher.transform(get_features(dev, features))


# In[32]:


hashed_test = hasher.transform(get_features(test, features))


# In[33]:
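These cells reuse one hasher across train, dev and test and rely on a get_features helper that is not included in the snippet. One plausible shape for it, assuming the selected columns are hashed as categorical strings (a guess, not the notebook's actual code):

# Hypothetical helper: yields one {column: string value} dict per row for the
# chosen feature columns, which matches FeatureHasher's default dict input.
def get_features(df, feature_columns):
    for _, row in df.iterrows():
        yield {col: str(row[col]) for col in feature_columns}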
コード例 #55
0
raw_data = fetch_20newsgroups(subset='train', categories=categories).data
data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))
print()

print("DictVectorizer")
t0 = time()
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % len(vectorizer.get_feature_names()))
print()

print("FeatureHasher on frequency dicts")
t0 = time()
hasher = FeatureHasher(n_features=n_features)
X = hasher.transform(token_freqs(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
print()

print("FeatureHasher on raw tokens")
t0 = time()
hasher = FeatureHasher(n_features=n_features, input_type="string")
X = hasher.transform(tokens(d) for d in raw_data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
print("Found %d unique terms" % n_nonzero_columns(X))
コード例 #56
0
ファイル: _dedupe.py プロジェクト: abiraja2004/cuttsum
    def do_job_unit(self, event, corpus, unit, **kwargs):
        if unit != 0:
            raise Exception("Job unit {} out of range".format(unit))
        
        res = ArticlesResource()
        thresh = kwargs.get("dedupe-sim-threshold", .8)
        extractor = kwargs.get("extractor", "goose")
        hasher = FeatureHasher(input_type="pair", non_negative=True)
        si_iter = res.streamitem_iter(
            event, corpus, extractor) 

        def to_df(all_ids, all_times, all_matches):
            d = []
            for ids, times, match in izip(all_ids, all_times, all_matches):

                times.sort()
                d.append({
                    "stream ids": ids, "hits": len(ids), "match": match,
                    "earliest": times[0], "latest": times[-1], 
                    "second": times[1] if len(times) >= 2 else None,
                    "third": times[2] if len(times) >= 3 else None,
                })
            return pd.DataFrame(d, columns=["stream ids", "match", "hits", 
                                            "earliest", "latest", 
                                            "second", "third"])    

        def query_in_top20(event, df):
            text = u"\n".join(df["sent text"].tolist()[:20]) 
            for query in event.query:
                if not re.search(query, text, flags=re.I|re.UNICODE):
                    return False
            return True

        def make_time(df):
            return df["timestamp"].tolist()[0]

        def make_counts(df, slimit=20):
            counts = defaultdict(int)
            for words in df["words"].tolist()[:slimit]:
                for word in words:
                    counts[word.lower()] += 1   
            return counts

        def next_chunk_file(chunk_file_num):
            deduped_path_fmt = self.get_deduped_path_fmt(
                event, corpus, extractor, threshold=thresh)
            deduped_path = deduped_path_fmt.format(
                chunk_file_num)
            deduped_dir = os.path.dirname(deduped_path)
            if not os.path.exists(deduped_dir):
                os.makedirs(deduped_dir)
            
            if os.path.exists(deduped_path):
                os.remove(deduped_path)

            return sc.Chunk(path=deduped_path, mode="wb", 
                message=corpus.sc_msg())



        X = None

        chunk_file_num = 1
        chunk = next_chunk_file(chunk_file_num)

        for hour, path, si in si_iter:
            df = si2df(si, extractor=extractor)
            counts = make_counts(df)
            x = hasher.transform([counts.items()])
            x.shape = (1, hasher.n_features)
            
            if X is None:
                X = x
                times = [[make_time(df)]]
                ids = [[si.stream_id]]
                matches = [query_in_top20(event, df)]

                chunk.add(si)
                        
            else:
                K = cosine_similarity(X, x)
                k_argmax = K.argmax()
                
                if K[k_argmax] < thresh:
                    
                    X = vstack([X, x])
                    times.append([make_time(df)])
                    ids.append([si.stream_id])
                    matches.append(query_in_top20(event, df))

                    if X.shape[0] % 1000 == 0:
                        chunk.close()
                        chunk_file_num += 1
                        chunk = next_chunk_file(chunk_file_num)

                    chunk.add(si)
                    
                else:
                    times[k_argmax].append(make_time(df))
                    ids[k_argmax].append(si.stream_id)
               
        chunk.close() 
     
        df = to_df(ids, times, matches)            
        print df

        stats_path = self.get_stats_path(
            event, corpus, extractor, thresh)
        with open(stats_path, "w") as f:
            df.to_csv(f, index=False, sep="\t")
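Stripped of the chunk writing and bookkeeping, the core of the deduplication above is: hash each document's word counts as (token, count) pairs, compare against everything kept so far, and keep the document only when its best cosine match stays under the threshold. A toy sketch of that loop (made-up documents, default hasher width, and no non_negative flag since newer scikit-learn versions removed it):

from scipy.sparse import vstack
from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics.pairwise import cosine_similarity

hasher = FeatureHasher(input_type="pair")
thresh = 0.8
docs = ["a storm hit the coast", "a storm hit the coast today", "markets rose"]

X = None
kept = []
for doc in docs:
    counts = {}
    for word in doc.split():
        counts[word] = counts.get(word, 0) + 1
    x = hasher.transform([list(counts.items())])
    # keep the document only if it is not too similar to anything kept so far
    if X is None or cosine_similarity(X, x).max() < thresh:
        X = x if X is None else vstack([X, x])
        kept.append(doc)
print(kept)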
コード例 #57
0
class FeatureExtractor:
    def __init__(self, lexicon_helper, fh=None, fs=None):
        if fh:
            self.feature_hasher = fh
        if fs:
            self.features_set = fs
            self.train_mode = False
        else:
            self.features_set = set()

        self.lexicon_helper = lexicon_helper

    features_set = None
    feature_hasher = None
    train_mode = True
    unk = "unk"

    def build_x_vectors(self, ent_couple_objects):
        '''
        Hash the feature lists of the given entity-pair tuples.

        :param ent_couple_objects: list of tuples (sen_id, ent1 name, ent2 name, feature list)
        :return: (list of tuples (sen_id, ent1 name, ent2 name, hashed x), hashed feature matrix)
        '''
        if not self.feature_hasher:
            self.feature_hasher = FeatureHasher(n_features=len(
                self.features_set),
                                                input_type='string')

        x_data = self.feature_hasher.transform(
            [t[3] for t in ent_couple_objects])
        converted_ent_objects = [(t[0], t[1], t[2], x_data[i])
                                 for i, t in enumerate(ent_couple_objects)]
        return converted_ent_objects, x_data

    def extract_features(self, ent_tuple, sentence):
        '''
        Build the feature list for an entity pair.

        :param ent_tuple: tuple (sen_id, ent1 object, ent2 object)
        :param sentence: the sentence the entity pair appears in
        :return: tuple (sen_id, ent1 name, ent2 name, feature list)
        '''
        features = []
        sen_id = ent_tuple[0]
        ent1_text = self.extract_text(ent_tuple[1])
        ent2_text = self.extract_text(ent_tuple[2])

        #Entity features
        ent1_type = self.extract_type(ent_tuple[1])
        ent2_type = self.extract_type(ent_tuple[2])
        ent1_head = self.extract_head(ent_tuple[1])
        ent2_head = self.extract_head(ent_tuple[2])
        concatenated_types = ent1_type + ent2_type
        features.append(self.get_feature("e1_type", ent1_type))
        features.append(self.get_feature("e2_type", ent2_type))
        features.append(self.get_feature("e1_head", ent1_head))
        features.append(self.get_feature("e2_head", ent2_head))
        features.append(
            self.get_feature("e1_root", ent_tuple[1][ENT_OBJ_ROOT].lemma_))
        features.append(
            self.get_feature("e2_root", ent_tuple[2][ENT_OBJ_ROOT].lemma_))
        features.append(
            self.get_feature("concatenated_types", concatenated_types))

        #Lexicon Features
        features.append(
            self.get_feature(
                "e1_lex_fname",
                self.lexicon_helper.does_include_first_name(ent1_text)))
        features.append(
            self.get_feature(
                "e1_lex_lname",
                self.lexicon_helper.does_include_last_name(ent1_text)))
        features.append(
            self.get_feature("e2_lex_loc",
                             self.lexicon_helper.is_location(ent_tuple[2])))

        #word based features
        words_between_ents = parser.get_words_between(ent_tuple[1],
                                                      ent_tuple[2])
        for word in words_between_ents:
            features.append(self.get_feature("bow", word.text))

        features.append(
            self.get_feature("ent1_bword",
                             ent_tuple[1][ENT_OBJ_ROOT].left_edge.text))
        features.append(
            self.get_feature("ent2_aword",
                             ent_tuple[2][ENT_OBJ_ROOT].right_edge.text))

        #syntactic features
        features.append(
            self.get_feature("ent_dist",
                             parser.get_dist(ent_tuple[1], ent_tuple[2])))
        dependency_path_str = parser.get_dependecy_path_str(
            ent_tuple[1], ent_tuple[2])
        features.append(self.get_feature("dep_path", dependency_path_str))
        dependency_path_pos_str = parser.get_dependecy_path_pos_str(
            ent_tuple[1], ent_tuple[2])
        features.append(
            self.get_feature("dep_pos_path", dependency_path_pos_str))

        features.append(
            self.get_feature(
                "is_descriptive_path",
                parser.is_direct_ent2_to_ent1_path(ent_tuple[1],
                                                   ent_tuple[2])))

        e1_clean = self.clean_name(ent_tuple[1])
        e2_clean = self.clean_name(ent_tuple[2])
        return (sen_id, e1_clean, e2_clean, features)

    def extract_text(self, ent_obj):
        return parser.clean_entity_text(ent_obj[ENT_OBJ_TEXT],
                                        ent_obj[ENT_OBJ_ROOT])

    def extract_type(self, ent_obj):
        return ent_obj[ENT_OBJ_LABEL]

    def extract_head(self, ent_obj):
        return ent_obj[ENT_OBJ_ROOT].head.lemma_

    def get_feature(self, feature_prefix, feature_val):
        feature = feature_prefix + str(feature_val)
        if self.train_mode:
            self.features_set.add(feature)
            self.features_set.add(feature_prefix + self.unk)
            return feature
        else:
            if feature in self.features_set:
                return feature
            else:
                return feature_prefix + self.unk

    def clean_name(self, ent_obj):
        return parser.modify_entity_text(ent_obj[ENT_OBJ_TEXT],
                                         ent_obj[ENT_OBJ_SPACY_ENT])
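Since the extractor emits plain strings of the form prefix + value, the hasher built in build_x_vectors treats every string as one feature with weight 1 (repeated strings add up). A small illustration with invented feature strings in the same style; the names and hash width here are not from the project:

from sklearn.feature_extraction import FeatureHasher

sample_features = [
    ["e1_typePERSON", "e2_typeGPE", "bowworks", "bowfor", "ent_dist3"],
    ["e1_typeORG", "e2_typePERSON", "bowhired", "ent_dist2"],
]
hasher = FeatureHasher(n_features=32, input_type="string")
X = hasher.transform(sample_features)
print(X.shape)        # (2, 32)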
コード例 #58
0
ファイル: dqn_learner.py プロジェクト: pearlfranz20/AL_Core
class DQNLearner(WhenLearner):
    def __init__(
            self,
            gamma=0.7,
            lr=3e-5,
            batch_size=64,
            mem_capacity=10000,
            # state_size=394, action_size=257, state_hidden_size=197,
            state_size=50,
            action_size=50,
            state_hidden_size=30,
            action_hidden_size=122):

        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        # self.device = "cpu" #TODO: make cuda not break elsewhere
        self.gamma = gamma
        self.lr = lr
        self.batch_size = batch_size

        self.state_size = state_size
        self.action_size = action_size
        self.state_hidden_size = state_hidden_size
        self.action_hidden_size = action_hidden_size

        self.state_hasher = FeatureHasher(n_features=self.state_size,
                                          alternate_sign=False)
        self.action_hasher = FeatureHasher(n_features=self.action_size,
                                           alternate_sign=False)

        # special case to make things run faster and drop values
        # self.state_hasher = FractionsStateHasher()
        # self.action_hasher = FractionsActionHasher()

        self.value_net = ValueNet(self.state_size,
                                  self.state_hidden_size).to(self.device)
        self.action_net = ActionNet(self.action_size, self.state_hidden_size,
                                    self.action_hidden_size).to(self.device)

        # create separate target net for computing future value
        self.target_value_net = ValueNet(self.state_size,
                                         self.state_hidden_size)
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_value_net.eval()
        self.target_action_net = ActionNet(self.action_size,
                                           self.state_hidden_size,
                                           self.action_hidden_size)
        self.target_action_net.load_state_dict(self.action_net.state_dict())
        self.target_action_net.eval()

        self.replay_memory = ReplayMemory(mem_capacity)

        params = (list(self.value_net.parameters()) +
                  list(self.action_net.parameters()))
        self.optimizer = torch.optim.Adam(params, lr=self.lr)

    def update_target_net(self):
        self.target_value_net.load_state_dict(self.value_net.state_dict())
        self.target_action_net.load_state_dict(self.action_net.state_dict())

    def gen_state_vector(self, state: dict) -> np.ndarray:
        state = {str(a): state[a] for a in state}

        return self.state_hasher.transform([state]).toarray()

    def gen_action_vectors(self,
                           actions: Collection[Activation]) -> np.ndarray:

        action_dicts = []
        for action in actions:
            act_d = {}
            name = action.get_rule_name()
            act_d['rulename'] = name
            bindings = action.get_rule_bindings()
            for a, v in bindings.items():
                if isinstance(v, bool):
                    act_d[str(a)] = str(v)
                else:
                    act_d[str(a)] = v
            action_dicts.append(act_d)

        return self.action_hasher.transform(action_dicts).toarray()

    def eval(self, state: dict, action: Activation) -> float:
        if state is None:
            return 0

        state_x = torch.from_numpy(self.gen_state_vector(state)).float().to(
            self.device)
        action_x = torch.from_numpy(self.gen_action_vectors(
            [action])).float().to(self.device)

        with torch.no_grad():
            state_val, state_hidden = self.value_net(state_x)
            action_val = self.action_net(action_x, state_hidden)
            return state_val[0].cpu().item() + action_val[0].cpu().item()

    def eval_multiple(self, state: dict,
                      actions: Collection[Activation]) -> Collection[float]:
        if state is None:
            return 0

        state_x = torch.from_numpy(self.gen_state_vector(state)).float().to(
            self.device)
        action_x = torch.from_numpy(
            self.gen_action_vectors(actions)).float().to(self.device)

        with torch.no_grad():
            state_val, state_hidden = self.value_net(state_x)
            action_val = self.action_net(action_x,
                                         state_hidden.expand(len(actions), -1))
            return (state_val.expand(len(actions), -1) +
                    action_val).squeeze(1).cpu().tolist()

    def update(
        self,
        state: dict,
        action: Activation,
        reward: float,
        next_state: dict,
        next_actions: Collection[Activation],
    ) -> None:

        state_v = self.gen_state_vector(state)
        action_v = self.gen_action_vectors([action])

        if next_state is None or len(next_actions) == 0:
            next_state_v = None
            next_action_vs = None
        else:
            next_state_v = self.gen_state_vector(next_state)
            next_action_vs = self.gen_action_vectors(next_actions)

        self.replay_memory.push(
            torch.from_numpy(state_v).float().to(self.device),
            torch.from_numpy(action_v).float().to(self.device),
            torch.tensor([reward]).float().to(self.device),
            None if next_state_v is None else
            torch.from_numpy(next_state_v).float().to(self.device),
            None if next_action_vs is None else
            torch.from_numpy(next_action_vs).float().to(self.device))

        self.train()

    def train(self):
        # epochs = (len(replay_memory) // target_update // 2) + 1
        batch_size = self.batch_size
        if len(self.replay_memory) < batch_size:
            batch_size = len(self.replay_memory)
        updates = len(self.replay_memory) // batch_size
        if updates < 20:
            updates = 20
        updates *= 3
        if updates > 200:
            updates = 200

        log.debug('len replay mem =' + str(len(self.replay_memory)))
        loss = []
        for i in range(updates):
            if i % 5:
                self.update_target_net()
            loss.append(self.optimize_model())

    def optimize_model(self):
        batch_size = self.batch_size

        if len(self.replay_memory) < self.batch_size:
            batch_size = len(self.replay_memory)

        transitions = self.replay_memory.sample(batch_size)
        batch = Transition(*zip(*transitions))  # transpose batch

        # Get states, actions, and rewards
        state = torch.cat(batch.state).view(batch_size, self.state_size)
        action = torch.cat(batch.action).view(batch_size, self.action_size)
        reward = torch.stack(batch.reward).view(1, batch_size)

        state_value, state_hidden = self.value_net(state)
        action_value = self.action_net(action, state_hidden)
        state_action_values = state_value + action_value

        # compute mask of non-final states and concatenate the batch elements
        non_final_mask = torch.tensor(tuple(
            map(lambda sa: sa is not None, batch.next_state)),
                                      device=self.device,
                                      dtype=torch.bool)
        any_non_final = non_final_mask.sum() > 0

        if any_non_final:
            non_final_next_state = torch.cat([
                s for s in batch.next_state if s is not None
            ]).view(-1, self.state_size)
            non_final_next_actions = torch.cat([
                s for s in batch.next_actions if s is not None
            ]).view(-1, self.action_size)

        # how many actions are available for each state
        next_action_lens = [
            nas.shape[0] for nas in batch.next_actions if nas is not None
        ]
        next_action_start = [
            sum(next_action_lens[0:i]) for i in range(len(next_action_lens))
        ]

        # Compute next state action indices from policy net
        if any_non_final:

            with torch.no_grad():
                next_value = self.target_value_net(non_final_next_state)
                non_final_next_state_value, non_final_next_hidden = next_value

                next_state_value_expanded = torch.cat([
                    non_final_next_state_value[i].expand(
                        next_action_lens[i], -1)
                    for i in range(len(next_action_start))
                ], 0)

                next_state_hidden_expanded = torch.cat([
                    non_final_next_hidden[i].expand(next_action_lens[i], -1)
                    for i in range(len(next_action_start))
                ], 0)

                non_final_next_action_value = (
                    next_state_value_expanded + self.target_action_net(
                        non_final_next_actions, next_state_hidden_expanded))

        # Compute value of next state actions from target net
        # Detach, so we don't track gradients, target net not getting updated.
        next_state_values = torch.zeros(batch_size, device=self.device)
        if any_non_final:
            next_state_values[non_final_mask] = torch.tensor(
                [
                    non_final_next_action_value.narrow(
                        0, next_action_start[i], next_action_lens[i]).max(0)[0]
                    for i in range(len(next_action_start))
                ],
                device=self.device)

        # next_state_values[non_final_mask] = self.net(
        # non_final_next_sas).gather(
        #         1, non_final_next_sa_idx).detach().squeeze()

        # Calculate the expected state-action value
        with torch.no_grad():
            expected_state_action_values = (
                reward + self.gamma * next_state_values).view(batch_size, 1)

        # print(torch.cat([state_action_values, expected_state_action_values], 1))
        # print(expected_state_action_values)

        self.optimizer.zero_grad()

        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

        # perform backprop
        loss.backward()

        # for param in self.value_net.parameters():
        #     param.grad.data.clamp_(-1, 1)
        # for param in self.action_net.parameters():
        #     param.grad.data.clamp_(-1, 1)

        self.optimizer.step()

        return loss.detach().item()
コード例 #59
0
#AI-TECHGYM-2-6-A-4
# Feature engineering

# Imports
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

n_features = 5
h = FeatureHasher(n_features=n_features)

# Input data
columns = ['Python', 'Ruby', 'PHP', 'Java', 'JavaScript']
D = [{
    "Label": "Python"
}, {
    "Label": "Ruby"
}, {
    "Label": "PHP"
}, {
    "Label": "Java"
}, {
    "Label": "JavaScript"
}]
df_D = pd.DataFrame(D)
#display(df_D)

f_array = h.transform(D).toarray()
df_a = pd.DataFrame(f_array, dtype=int, index=columns)
display(df_a)
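Continuing the snippet above: with only 5 hash columns for 5 distinct Label values, it is quite likely that two labels land in the same column (and, with the default alternate_sign, may even cancel each other out). Widening the hasher is the usual remedy; this follow-up is illustrative and not part of the original example:

h_wide = FeatureHasher(n_features=16)
df_wide = pd.DataFrame(h_wide.transform(D).toarray(), dtype=int, index=columns)
display(df_wide)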
コード例 #60
0
import numpy as np
import pytest

from sklearn.feature_extraction import FeatureHasher


def test_hasher_invalid_input():
    raw_X = [[], (), iter(range(0))]

    feature_hasher = FeatureHasher(input_type="gobbledygook")
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=-1)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features=0)
    with pytest.raises(ValueError):
        feature_hasher.transform(raw_X)
    feature_hasher = FeatureHasher(n_features="ham")
    with pytest.raises(TypeError):
        feature_hasher.transform(raw_X)

    feature_hasher = FeatureHasher(n_features=np.uint16(2**6))
    with pytest.raises(ValueError):
        feature_hasher.transform([])
    with pytest.raises(Exception):
        feature_hasher.transform([[5.5]])
    with pytest.raises(Exception):
        feature_hasher.transform([[None]])
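For contrast with the invalid inputs exercised above, here is a minimal sketch of two calls FeatureHasher does accept: the default dict input and (feature, value) pair input.

from sklearn.feature_extraction import FeatureHasher

dict_hasher = FeatureHasher(n_features=8)
print(dict_hasher.transform([{"dog": 1, "cat": 2}]).toarray())

pair_hasher = FeatureHasher(n_features=8, input_type="pair")
print(pair_hasher.transform([[("dog", 1), ("cat", 2)]]).toarray())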