Example #1
class Feat(object):
    def __init__(self):
        self.dvec = DictVectorizer(dtype=np.float32, sparse=False)

    def fit(self, trn):
        # def fit(self, trn, dev, tst):
        self.dvec.fit(self.feat_basic(ci, sent) for sent in trn for ci, c in enumerate(sent["cseq"]))
        # self.tseqenc.fit([t for sent in trn for t in sent['tseq']])
        # self.tsenc.fit([t for sent in chain(trn,dev,tst) for t in sent['ts']])
        self.feature_names = self.dvec.get_feature_names()
        # self.ctag_classes = self.tseqenc.classes_
        # self.wtag_classes = self.tsenc.classes_
        logging.info(self.feature_names)
        logging.debug(" ".join([fn for fn in self.feature_names]))
        # logging.info(self.ctag_classes)
        # logging.info(self.wtag_classes)
        self.NF = len(self.feature_names)
        # logging.info('NF: {} NC: {}'.format(self.NF, self.NC))
        logging.info("NF: {}".format(self.NF))

    def transform(self, sent):
        Xsent = self.dvec.transform([self.feat_basic(ci, sent) for ci, c in enumerate(sent["cseq"])])  # nchar x nf
        slen = Xsent.shape[0]
        ysent = np.zeros((slen, 2), dtype=bool)
        ysent[range(slen), sent["lseq"]] = True
        # ysent = np.array(sent['lseq'])
        return Xsent, ysent

    def feat_basic(self, ci, sent):
        return {"c": sent["cseq"][ci]}
class BinTransformer(object):
    """
    bins: int, the number of bins; percentiles: if true, use quantile bins (pd.qcut) instead of equal-width bins (pd.cut)
    """
    def __init__(self, bins, percentiles):
        self._dv = DictVectorizer()
        self._bins = bins
        self._bin_boundaries = {}
        self._percentiles = percentiles
        self._feature_names = []

    def fit(self, data):
        binned_data = data.copy()
        for col in data.columns:
            cut_func = pd.qcut if self._percentiles else pd.cut
            binned_data[col], self._bin_boundaries[col] = cut_func(data[col], self._bins, retbins=True)
        self._dv.fit(binned_data.T.to_dict().values())

    def transform(self, data):
        binned_data = data.copy()
        for col in data.columns:
            binned_data[col] = pd.cut(data[col], self._bin_boundaries[col])
        binnedX = self._dv.transform(binned_data.T.to_dict().values())
        self._feature_names += self._dv.get_feature_names()
        return binnedX

    def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)

    def get_feature_names(self):
        return self._feature_names
Example #3
def vectorize(train_features, test_features):
    """
    convert set of features to vector representation
    :param train_features: A dictionary with the following structure
             { instance_id: {f1:count, f2:count,...}
            ...
            }
    :param test_features: A dictionary with the following structure
             { instance_id: {f1:count, f2:count,...}
            ...
            }
    :return: X_train: A dictionary with the following structure
             { instance_id: [f1_count, f2_count, ...],
               ...
             }
             X_test: A dictionary with the same structure
             { instance_id: [f1_count, f2_count, ...],
               ...
             }
    """
    X_train = {}
    X_test = {}

    vec = DictVectorizer()
    vec.fit(train_features.values())
    for instance_id in train_features:
        X_train[instance_id] = vec.transform(train_features[instance_id]).toarray()[0]

    for instance_id in test_features:
        X_test[instance_id] = vec.transform(test_features[instance_id]).toarray()[0]

    return X_train, X_test
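
A minimal usage sketch for vectorize() above, with invented instance ids and feature counts; features that never occur in the training dictionaries (such as "w=boat" below) are silently dropped by the fitted DictVectorizer:

train_features = {
    "sent1": {"w=bank": 2, "w=river": 1},
    "sent2": {"w=bank": 1, "w=money": 3},
}
test_features = {
    "sent3": {"w=money": 1, "w=boat": 2},   # "w=boat" is unseen and will be ignored
}
X_train, X_test = vectorize(train_features, test_features)
print(X_train["sent1"])   # dense count vector aligned to the training vocabulary
print(X_test["sent3"])
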
def dict_vectorize(dict_list):
    assert isinstance(dict_list, list)
    from sklearn.feature_extraction import DictVectorizer

    vec = DictVectorizer()
    vec.fit(dict_list)
    return vec
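
dict_vectorize() only returns the fitted vectorizer, so transforming is left to the caller; a quick sanity check with made-up records:

records = [{"colour": "red", "weight": 1.5}, {"colour": "blue", "weight": 0.4}]
vec = dict_vectorize(records)
print(vec.vocabulary_)                 # e.g. {'colour=blue': 0, 'colour=red': 1, 'weight': 2}
print(vec.transform(records).toarray())
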
Example #5
class DictVectWrapper(object):
    
    def __init__(self):
        self.feature_extractors = [extractor[1] for extractor in inspect.getmembers(self, predicate=inspect.ismethod) if extractor[0].find("feature") > 0]
        self.dv = DictVectorizer()
        
    def fit(self, data):
        data_dics = []
        for datum in data:
            features = {}
            for feature in self.feature_extractors:
                features.update(feature(datum))
            data_dics.append(features)
        self.dv.fit(data_dics)
    
    def fit_transform(self, data):
        data_dics = []
        for datum in data:
            features = {}
            for feature in self.feature_extractors:
                features.update(feature(datum))
            data_dics.append(features)
        res = self.dv.fit_transform(data_dics)
        return res
    
    def transform(self, datum):
        features = {}
        for feature in self.feature_extractors:
            features.update(feature(datum))
        return self.dv.transform(features)
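
A hypothetical subclass to show how DictVectWrapper discovers its extractors: any method whose name contains "feature" after the first character is picked up, while fit, transform and fit_transform are skipped. This assumes the class above and its inspect/DictVectorizer imports are in scope; TextFeatures and its methods are invented for illustration.

class TextFeatures(DictVectWrapper):
    def length_feature(self, datum):
        return {"length": len(datum)}

    def first_char_feature(self, datum):
        return {"first=" + datum[0]: 1}

fe = TextFeatures()
X = fe.fit_transform(["spam", "ham", "eggs"])
print(X.toarray())
print(fe.dv.vocabulary_)
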
Example #6
def main():
    # load data
    path = 'exracted_content/extractedrawtext'
    data = map(json.loads, file(path))

    # count word for every tag
    tags = TAGS + ['boilerplate', 'boilerpipe']
    counts_per_tag = {}

    for eachtag in tags:
        counts = map(counter, getItems(eachtag, data))
        counts_per_tag[eachtag] = counts

    total = sumUp(counts_per_tag, len(data))

    # vectorize
    vect = DictVectorizer()
    vect.fit([total])

    features = {}
    for eachtag in tags:
        features[eachtag] = vect.transform(counts_per_tag[eachtag])

    save('textfeature', features)
    save('textvector', vect)
class DictVectorizerModel:
	def fit(self, train):
		temp_list = []

		for item in train:
		    dic = {"dict" : item}
		    temp_list.append(dic)


		self.model = DictVectorizer()
		self.model.fit(temp_list)

	def transform(self, dataframe, col_name):
		temp_list = []
		for item in dataframe:
			dic = {"dict" : item}
			temp_list.append(dic)

		df = self.model.transform(temp_list).toarray()
		df = pd.DataFrame(df)
		df.index = dataframe.index

		df.columns = ["%s_%d" % (col_name, data) for data in df.columns]

		return df

	def get_model(self):
		return self.model

	def set_model(self, model):
		self.model = model
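
A small sketch of DictVectorizerModel on a toy categorical column (the Series below is invented); it assumes pandas is imported as pd and that DictVectorizer is available at module level:

colours = pd.Series(["red", "green", "red", "blue"], name="colour")

dvm = DictVectorizerModel()
dvm.fit(colours)                         # learns one "dict=<value>" column per category
encoded = dvm.transform(colours, "colour")
print(encoded)                           # columns colour_0, colour_1, colour_2
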
Example #8
    def _map_goose2streamitem(self, goose_sents, si_sents):

        goose_wc = []
        for sent in goose_sents:
            c = {}
            for token in sent:
                token = token.lower()
                if isinstance(token, unicode):
                    token = token.encode("utf-8") 
                c[token] = c.get(token, 0) + 1
            goose_wc.append(c)
        si_wc = []
        for sent in si_sents:
            c = {}
            for token in sent:
                token = token.lower()
                if isinstance(token, unicode):
                    token = token.encode("utf-8") 
                c[token] = c.get(token, 0) + 1
            si_wc.append(c)

        vec = DictVectorizer()
        vec.fit(goose_wc + si_wc)
        X_goose = vec.transform(goose_wc)
        X_si = vec.transform(si_wc)
        K = cosine_similarity(X_goose, X_si)
        I = np.argmax(K, axis=1)
        return I                                    
Example #9
    def _train(self, train_data, resources):
        sample_length = len(train_data)
        dict_status_path = os.path.join(root_dic,
                                        'dict_vectorizer_{}.status'.
                                        format(sample_length))
        if os.path.isfile(dict_status_path):
            dictVectorizer = joblib.load(dict_status_path)
        else:
            dictVectorizer = DictVectorizer()
            dictVectorizer.fit(train_data[self.features].
                               fillna(0).
                               to_dict('records'))
            joblib.dump(dictVectorizer, dict_status_path)

        tfidf_status_path = os.path.join(root_dic,
                                         'tfidf_vectorizer_{}.status'.
                                         format(sample_length))
        if os.path.isfile(tfidf_status_path):
            tfidf = joblib.load(tfidf_status_path)
        else:
            tfidf = TfidfVectorizer(min_df=40, max_features=300)
            tfidf.fit(train_data.essay)
            joblib.dump(tfidf, tfidf_status_path)

        resources['dictVectorizer'] = dictVectorizer
        resources['tfidf'] = tfidf
        print('Head Processing Completed')
        return train_data, resources
Example #10
def dictionary(rows):
	dl = dictlist(rows2objects(rows))

	dv = DictVectorizer()
	dv.fit(dl)
	
	return dv
def vectorize_data(data_list,vec_data_fd):
    '''Takes in the data as a list of attribute-name:value tuples
    and converts it into vectorized form for processing by scikit
    prints the feature mapping in filename.'''
    vec=DictVectorizer();
    vec.fit(data_list)
    print len(vec.get_feature_names())
    vector_data = vec.transform(data_list).toarray()

    one_hot_names = vec.get_feature_names();
    #print the feature mappings
    feature_indices = range(0,len(one_hot_names));
    one_hot_mapping = zip(one_hot_names,feature_indices);
    with open('one_hot_encoding.txt','w') as file:
        for (one_hot_name, idx) in one_hot_mapping:
            print >> file, "%s-->%d\n" % (one_hot_name, idx);
		
    # print the one-hot encoding  for each tuple.
    with open('vector_mappings.txt','w') as file:
        for row in vector_data:
            print >> file, vec.inverse_transform(row),"\n";
    sys.exit(1)
    for row in vector_data:
        row = [str(x) for x in row]
        row = ",".join(row)
        vec_data_fd.write(row)
        vec_data_fd.write("\n\n")
    return vector_data;
Example #12
def onehot_encoder(df):
    d = df.T.to_dict().values()

    dv = DictVectorizer()
    dv.fit(d)

    return dv
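
onehot_encoder() above returns only the fitted vectorizer, so the caller re-applies the same row-to-dict conversion when transforming; a toy DataFrame for illustration, assuming pandas is imported as pd:

df = pd.DataFrame({"colour": ["red", "blue"], "size": ["S", "M"]})
dv = onehot_encoder(df)
print(dv.transform(df.T.to_dict().values()).toarray())
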
def main():
    # load data
    #path = 'generated/extracted_text'
    os.system("mkdir generated")
    path = 'extracted_text'
    data = map(json.loads, file(path))

    # count word for every tag
    tags = TAGS + ['boilerplate', 'boilerpipe']
    counts_per_tag = {}

    for tag in tags:
        counts = map(count, get(tag, data))
        counts_per_tag[tag] = counts

    total = sum_up(counts_per_tag, len(data))

    # vectorize
    v = DictVectorizer()
    v.fit([total])

    features = {}
    for tag in tags:
        features[tag] = v.transform(counts_per_tag[tag])

    save('text_features', features)
    save('text_vectorizer', v)
    os.system("mv generated/text_features . ")
    os.system("mv generated/text_vectorizer . ")
Example #14
def encode_categorical_features(features, sparse=True):
    from sklearn.feature_extraction import DictVectorizer

    enc = DictVectorizer(sparse=sparse)
    enc.fit(features)  
    svm_features = enc.transform(features)
    return svm_features, enc
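
A quick illustration of the helper above with a couple of invented records; string values become one-hot columns while numeric values pass through unchanged:

records = [
    {"city": "London", "temp": 21.0},
    {"city": "Paris", "temp": 25.5},
    {"city": "London", "temp": 18.0},
]
X, enc = encode_categorical_features(records, sparse=False)
print(X)                  # columns: city=London, city=Paris, temp
print(enc.vocabulary_)
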
Example #15
def generate_matrix():
    D = []
    y = []
    fex = features.IpadicFeature()
    progress = 0
    print('create feature dictionary')
    for q, a in load_corpus():
        D.append(list(fex.transform(q)))
        a = normalize.normalize_askfm(a, h2z=False)
        y.append(isnot_shitsumon(a))
        progress += 1
        if progress % 100 == 0:
            print(progress)

    dv = DictVectorizer()
    dv.fit(itertools.chain(*D))

    progress = 0
    print('create feature vector')
    X = []
    for ds in D:
        count = None
        for d in ds:
            v = dv.transform(d)
            if count is None:
                count = v
            else:
                count += v
        X.append(count)
        progress += 1
        if progress % 100 == 0:
            print(progress)
    X = scipy.sparse.vstack(X)
    y = numpy.array(y)
    return X, y, dv
def plotFeatures(featureListTrain,trainingsetLabels):
    cv = DictVectorizer()
    cv.fit(featureListTrain)
    print (len(cv.vocabulary_))
    print (cv.get_feature_names())
    X_train = cv.transform(featureListTrain)
    svm = LinearSVC()
    svm.fit(X_train, trainingsetLabels)
    plot_coefficients(svm, cv.get_feature_names())
    def vectorize(self, categorical_features, continuous_features):
        vec = DictVectorizer(sparse=False)

        vec.fit(self.training_categorical_features)
        enc_categorical_features = vec.transform(categorical_features)
        merged_features = []
        for cont, cat in zip(continuous_features, enc_categorical_features):
            all_features_for_item = list(cont) + list(cat)
            # TODO why do I need this ||0 ? -- why isn't the imputer handling this?
            merged_features.append([0.0 if math.isnan(y) else y for y in all_features_for_item])

        return merged_features
Example #18
class make_dummies(sklearn.base.BaseEstimator,
                       sklearn.base.TransformerMixin):
    '''uses pandas to transform categorical variables into one-hot encoding'''
    def __init__(self, dummy_cols):
        self.dummy_cols = dummy_cols
        self.dv = DictVectorizer()

    def fit(self, X, y=None):
        self.dv.fit(X[self.dummy_cols].to_dict(orient='records'))
        return self

    def transform(self, X):
        return self.dv.transform(X[self.dummy_cols].to_dict(orient='records'))
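
A short sketch of make_dummies on an invented DataFrame, assuming pandas (as pd), sklearn.base and DictVectorizer are imported as in the class above; fit_transform comes from TransformerMixin, and the price column is simply ignored because it is not listed in dummy_cols:

df = pd.DataFrame({
    "colour": ["red", "blue", "red"],
    "size": ["S", "M", "L"],
    "price": [10.0, 12.5, 9.9],
})
dummies = make_dummies(dummy_cols=["colour", "size"])
X_cat = dummies.fit_transform(df)
print(X_cat.toarray())
print(dummies.dv.vocabulary_)
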
 def __get_bag_of_words(self, set, special_chars_bow=None):
     list_of_bows = list()
     if special_chars_bow is None:
         for user_posts in set:
             list_of_bows.append(self.vectorizer.transform(np.array(user_posts)).toarray())
     else:
         special_vectoriser = DictVectorizer()
         special_vectoriser.fit(Counter(s.split()) for s in special_chars_bow)
         print(special_vectoriser.vocabulary_)
         for user_posts in set:
             list_of_bows.append(
                 special_vectoriser.transform(Counter(s.split()) for s in np.array(user_posts)).toarray())
     return list_of_bows
Example #20
def vectorize(data, s):
    '''
    :param data: list of instances for a given lexelt with the following structure:
        {
			[(instance_id, left_context, head, right_context, sense_id), ...]
        }
    :param s: list of words (features) for a given lexelt: [w1,w2,w3, ...]
    :return: vectors: A dictionary with the following structure
            { instance_id: [w_1 count, w_2 count, ...],
            ...
            }
            labels: A dictionary with the following structure
            { instance_id : sense_id }

    '''
    vectors = {}
    labels = {}

    # implement your code here
    vec = DictVectorizer()
    s_set = set(s)
    def vectorize_one(t):
        tokens_left = list(nltk.word_tokenize(t[1]))
        tokens_right = list(nltk.word_tokenize(t[3]))
        tokens = tokens_left + [t[2]] + tokens_right
        context_words = tokens_left[-window_size:] + tokens_right[0:window_size]
        context_window = dict(map(lambda x: ('BOW_'+x, 0), s))
        def inc_one(word):
            if word in s_set:
                key = 'BOW_'+word
                context_window.setdefault(key, 0)
                context_window[key] += 1

        try:
            for word in context_words:
                inc_one(word)
        except Exception as e:
            # print 'word', 'not in s ', e
            pass
        try:
            vectors[t[0]] = context_window
        except:
            pass
        labels[t[0]] = t[-1]
    for t in data:
        vectorize_one(t)
    vec.fit(vectors.values())
    for instance_id in vectors:
        vectors[instance_id] = vec.transform(vectors[instance_id]).toarray()[0]
    return vectors, labels
Example #21
def crossval(paths, annDir, eval_type, use_reach):
    ''' Puts all together '''

    print "Parsing data"
    paths = set(paths)
    labels, vectors, hashes, data = parse_data(paths, annDir, use_reach)

    # Group indexes by paper id
    groups = {p:[] for p in paths}

    for i, d in enumerate(data):
        groups[d.namespace].append(i)

    # Hack!!
    groups2 = {}
    for k, v in groups.iteritems():
        if len(v) != 0:
            groups2[k] = v

    groups = groups2

    print "Using %i papers" % len(groups2)

    # Make it a numpy array to index it more easily
    data = np.asarray(data)

    dv = DictVectorizer()
    dv.fit(vectors)

    X = dv.transform(vectors)
    y = np.asarray(labels)

    f1_diffs = []
    model_f1s = {}

    indices = set(range(len(data)))
    # Do the "Cross-validation" only on those papers that have more than N papers
    for path in groups.keys():

        others = paths - {path}
        test_ix = set(groups[path])
        train_ix = list(indices - test_ix)
        test_ix = list(test_ix)

        policy_f1, model_f1 = machine_learning(X, y, data, train_ix, test_ix)

        f1_diffs.append(model_f1 - policy_f1)
        model_f1s[path] = model_f1

    return pd.Series(f1_diffs), model_f1s
Example #22
class Phi():
    def __init__(self):
        self.vectorizer = DictVectorizer()

    def fit(self, training_sentences):
        counts = []
        for sentence in training_sentences:
            words = [pair[0] for pair in sentence]
            tags = [pair[1] for pair in sentence]
            count = self.extract_features(words, tags)
            counts.append(count)

        #Fit the DictVectorizer to the data
        #fit expects a list of dictionaries; we pass it the list of counts built above
        self.vectorizer.fit(counts)

    def transform(self, word_sequence, tag_sequence):
        #Extract the feature count
        count = self.extract_features(word_sequence, tag_sequence)

        #Convert the count to a sparse vector using dictVectorizer
        vector = self.vectorizer.transform(count)

        return vector

    def extract_features(self, word_sequence, tag_sequence):
        #Force them to be the same length
        length = len(word_sequence)

        #Append end to tags
        tag_sequence.append('END')

        #Create a list of word-tag and tag-tag features
        features = []
        for i in range(0,length):
            features.append((word_sequence[i],tag_sequence[i]))
            features.append((tag_sequence[i],tag_sequence[i+1]))

        #Create a count out of the list of features
        count = Counter(features)

        return count

    def inverse_transform(self, vector):
        #Convert the vector to a count using dictVectorizer
        counts = self.vectorizer.inverse_transform(vector)

        #inverse_transform returns a list of counts but we only gave it one vector
        return counts[0]
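
Two invented tagged sentences to exercise the Phi feature map above (Counter and DictVectorizer are assumed to be imported at module level); the features are (word, tag) and (tag, next_tag) pairs, so the vectorizer's vocabulary keys are tuples:

train = [
    [("the", "DET"), ("dog", "NOUN"), ("barks", "VERB")],
    [("a", "DET"), ("cat", "NOUN"), ("sleeps", "VERB")],
]
phi = Phi()
phi.fit(train)

vec = phi.transform(["the", "cat", "sleeps"], ["DET", "NOUN", "VERB"])
print(vec.shape)                      # 1 x number of word-tag / tag-tag features
print(phi.inverse_transform(vec))     # non-zero feature counts as a dict
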
Example #23
def encode_categorical_features(train, test):
    dict_vectorizer = DictVectorizer()
    categorical_features = []

    for column in train:
        if train[column].dtype == 'object':
            categorical_features.append(column)

    train_categorical_features = train[categorical_features].to_dict(orient='records')
    test_categorical_features = test[categorical_features].to_dict(orient='records')

    dict_vectorizer.fit(train_categorical_features)

    train_categorical_encoded = pandas.DataFrame(dict_vectorizer.transform(train_categorical_features).toarray())
    test_categorical_encoded = pandas.DataFrame(dict_vectorizer.transform(test_categorical_features).toarray())

    return train_categorical_encoded, test_categorical_encoded
Example #24
class CategoricalConverter(BaseEstimator, TransformerMixin):
	"""
	A subclass of sklearn's BaseEstimator and TransformerMixin, usable in an sklearn Pipeline.
	It converts categorical columns to numeric ones.

	Parameters:

	method: string, one of "dummy", "groupmean", "valuecount". Default is "dummy".
	cate_cols: None or a list of column names (strings).

	Return:
	A pandas dataframe with the categorical columns dropped. The original one is not modified.
	"""
	def __init__(self, method="dummy", cate_cols=None):
		self.method = method
		self.cate_cols = cate_cols
		return
		
	def fit(self, X, y):
		if self.cate_cols is None:
			self.cate_cols = get_cate_col(X)
		self.values = {}
		if self.method == "dummy":
			self.dvec = DictVectorizer(sparse=False)
			self.dvec.fit((X[self.cate_cols]).to_dict('record'))
		elif self.method == "groupmean":
			for col in self.cate_cols:
				tempdict = {}
				tempvals = [val for val in X[col].unique() if str(val) != "nan"]
				for val in tempvals:
					tempdict[val] = y[(X[col] == val)].mean()
				self.values[col] = tempdict
		elif self.method == "valuecount":
			for col in self.cate_cols:
				self.values[col] = X[col].value_counts()
		return self
		
	def transform(self, X, y=None):
		XX = X.copy()
		if self.method == "dummy":
			temp_dummy = pd.DataFrame( data = self.dvec.transform((XX[self.cate_cols]).to_dict('record')), columns = self.dvec.get_feature_names(), index=XX.index)
			XX = pd.concat([temp_dummy,XX],axis=1)
		elif self.method in ["groupmean", "valuecount"]:
			for col in self.cate_cols:
				XX.loc[:,col+"_gpmean"] = XX[col].map(self.values[col])
		
		XX.drop(self.cate_cols, axis=1, inplace=True)
		return XX
class FullPickledRSTFeatureExtractor:
    def __init__(self,instancenums):
        self.vectorizer = DictVectorizer(dtype=float, sparse=True)
        self.atInstance = 0 
        self.instanceNums = instancenums

    def fit(self, X, y=None):
        self.vectorizer = self.vectorizer.fit([self.getFeatures(x) for x in X])
        return self
    
    def getFeatures(self,text):
        features = {}#{'textlen':len(text)}
        rstFile = './output_trees/review%d.pickle.gz'%self.instanceNums[self.atInstance]
        tree = getPickledTree(rstFile).tree
        features['size'] = tree_size(tree)
        if features['size']>0:
            features['normdepth'] = tree_depth(tree)/tree_size(tree)
        features['balance'] =abs(tree_balance(tree))
        features.update(relation_proportion(tree)) 
        features.update(parent_relation_proportion(tree)) 
        self.atInstance = self.atInstance + 1
        if self.atInstance>=len(self.instanceNums):
            self.atInstance = 0
        return  features

    def setInstanceNums(self, nums):
        self.instanceNums = nums

    def setInstance(self, num):
        self.atInstance = num

    def transform(self, X, y=None):
        return self.vectorizer.transform([self.getFeatures(x) for x in X])
class FullTextRSTFeatureExtractor:
    def __init__(self,instancenums):
        self.vectorizer = DictVectorizer(dtype=float, sparse=True)
        self.atInstance = 0 
        self.instanceNums = instancenums

    def fit(self, X, y=None):
        self.vectorizer = self.vectorizer.fit([self.getTextFeatures(text) for text in X])
        return self

    def getTextFeatures(self,text):
        features = {}#{'textlen':len(text)}
        rstFile = open('./rstParsed/review%d.brackets'%self.instanceNums[self.atInstance],'r')
        counter = Counter()
        for line in rstFile:
            eduRange, satOrNuc, rangeType  = eval(line)
            counter[satOrNuc] += 1 
            counter[rangeType] += 1 
            counter['lines'] += 1
            counter['maxEDU'] = max(eduRange[1],counter['maxEDU'])
            counter['maxDif'] = max(eduRange[1]-eduRange[0],counter['maxDif'])
        features.update(counter)
        self.atInstance = self.atInstance + 1
        if self.atInstance>=len(self.instanceNums):
            self.atInstance = 0
        return  features

    def setInstanceNums(self, nums):
        self.instanceNums = nums

    def setInstance(self, num):
        self.atInstance = num

    def transform(self, X, y=None):
        return self.vectorizer.transform([self.getTextFeatures(text) for text in X])
def KFoldPredictionScore (X,y,k,header):

    from sklearn.svm import SVC
    from sklearn.feature_extraction import DictVectorizer
    vec = DictVectorizer()

    try:
        accuracy = 0.0
        for X_train, y_train, X_test, y_test in k_fold_generator(X, y, k):

            vec = DictVectorizer()
            fit = vec.fit(X_train)

            X_train_counts = fit.transform(X_train)
            X_test_counts = fit.transform(X_test)
            clf = SVC(kernel="linear", C=0.025)
            try:
                clf.fit(X_train_counts.toarray(), y_train)
                #predict = clf.predict(X_test_counts.toarray())
                accuracy += clf.score(X_test_counts.toarray(),y_test)
                # coef = clf._get_coef()
               # print(np.argsort(coef)[-20:])
                #for i in range(0,len(X_test)):
                    #print (X_test[i]['ID']+"\t"+y_test[i]+"\t"+predict[i])
            except BaseException as b:
                    print (b)
        print (header+"\t"+str(accuracy))
    except BaseException as b:
        print (b)
Example #28
 def _trainPOSDictVectorizer(self, goldTree, to_classify=None):
     sentences = list(goldTree)
     if to_classify:
         sentences.extend(to_classify)
     pos_tagged = self.get_pos_tags_for_sentences(sentences)
     items = []
     assert len(pos_tagged) == len(sentences)
     for sentence, pos in itertools.izip(sentences, pos_tagged):
         # feels silly, but there is the occasional encoding error
         # when using str(sentence)
         self.posCache[sentence.pprint().encode('utf-8')] = pos
         items.extend(self.extract_POS(sentence, pos))
     dv = DictVectorizer(sparse=False)
     dv.fit(items)
     #logger.debug("DictVectorizer vocab: %s", dv.vocabulary_)
     return dv
Example #29
def numberize_features(dataset, unrolled_dataset, dv=None):
  ''' turn non-numeric features into sparse binary features; also return the feature map '''
  # http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
  # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
  if dv is None:
    dv = DictVectorizer(sparse=False) # can we make it true?
    dv = dv.fit(unrolled_dataset.flatten())
  return np.array(list(map(dv.transform, dataset))), dv
Example #30
def numberize_features(dataset, sparse=True, dv=None):
  ''' turn non-numeric features into sparse binary features; also return the feature map '''
  # http://fastml.com/converting-categorical-data-into-numbers-with-pandas-and-scikit-learn/
  # http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html
  if dv is None:
    dv = DictVectorizer(sparse=sparse)
    dv = dv.fit(dataset)
  return dv.transform(dataset), dv
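
The helper above in action on a handful of invented records; passing the fitted dv back in keeps the column layout stable for new data:

records = [
    {"shape": "round", "weight": 1.2},
    {"shape": "square", "weight": 0.7},
]
X, dv = numberize_features(records, sparse=False)
print(X)
X_new, _ = numberize_features([{"shape": "square", "weight": 1.0}], dv=dv)
print(X_new)
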
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

df = pd.read_csv(
    'https://raw.githubusercontent.com/dlsun/data-science-book/master/data/titanic.csv'
)

# define the training data
cols = ["age", "sex", "pclass"]
X_train = df[cols]
y_train = df["survived"]

# convert categorical variables to dummy variables
vec = DictVectorizer(sparse=False)
X_train = X_train.to_dict(orient='records')
vec.fit(X_train)
X_train = pd.DataFrame(vec.transform(X_train)).fillna(0)
X_train.columns = vec.get_feature_names()

# fit the 5-nearest neighbors model
model = KNeighborsClassifier(n_neighbors=5)
scaler = StandardScaler()

pipeline = Pipeline([("scaler", scaler), ("model", model)])

y_train = df["survived"]
pipeline.fit(X_train, y_train)

y_train_pred = pipeline.predict(X_train)

print("Accuracy", accuracy_score(y_train, y_train_pred))
Example #32
            else:
                temp[name] = entry[name]
        for name in num_var:
            if entry[name] == 'NA':
                continue
            else:
                temp[name] = float(entry[name])
        #temp["self_reported_fishing_vessel"] = entry["self_reported_fishing_vessel"] == "TRUE"
        examples.append(temp)


#%% vectorize:
from sklearn.feature_extraction import DictVectorizer

feature_numbering = DictVectorizer(sort=True, sparse=False)
feature_numbering.fit(examples)
X = feature_numbering.transform(examples)
print("Features as {} matrix.".format(X.shape))

del examples

#%% Split data
from sklearn.model_selection import train_test_split
import numpy as np

RANDOM_SEED = 12345678

y = np.array(ys)
# hold out 10% of the data for testing; keep 90% as the train/validate (tv) pieces.
X_tv, rX_test, y_tv, y_test = train_test_split(
    X, y, train_size=0.9, shuffle=True, random_state=RANDOM_SEED
)
# split off train, validate from (tv) pieces.
ex_train, ex_vali, y_train, y_vali = train_test_split(
    ex_tv, y_tv, train_size=0.9, shuffle=True, random_state=RANDOM_SEED
)

#%% vectorize:

from sklearn.preprocessing import StandardScaler, MinMaxScaler

feature_numbering = DictVectorizer(sparse=False)
# Learn columns from training data (again)
feature_numbering.fit(ex_train)
# Translate our list of texts -> matrices of counts
rX_train = feature_numbering.transform(ex_train)
rX_vali = feature_numbering.transform(ex_vali)
rX_test = feature_numbering.transform(ex_test)

scaling = StandardScaler()
X_train = scaling.fit_transform(rX_train)
X_vali = scaling.transform(rX_vali)
X_test = scaling.transform(rX_test)

print(X_train.shape, X_vali.shape)
#%% train a model:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor

def get_data_to_list(X):
    # gather each DataFrame row as a dict-like record (a pandas Series) for DictVectorizer
    list_X = []
    npoints = X.shape[0]
    for n in range(npoints):
        x = X.iloc[n, :]
        list_X.append(x)
    return list_X

time_start = time.time()

df = pd.read_csv('datasets/training_data.csv')

X = df.iloc[:, :-1]
y = df.iloc[:,-1]
# attribute = ['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat']
# 43 features
# to drop an unwanted feature: df_train.drop(df_train.columns[[0]], axis=1, inplace=True)
X = get_data_to_list(X)

dv = DictVectorizer()
dv.fit(X)

X_vec = dv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size = 0.3)

clf_inforGain = DecisionTreeClassifier(criterion='entropy', random_state=100, max_depth=50, min_samples_leaf=2)
clf_inforGain.fit(X_train, y_train)

pred = clf_inforGain.predict(X_test)
print('rate:', metrics.accuracy_score(y_test, pred))

time_end = time.time()
print('time run:', time_end - time_start)
Example #35
class SupervisedLearner:
    def __init__(self,
                 abstract_reader,
                 target="n",
                 hold_out_a_test_set=False,
                 test_set_p=None):
        '''
        abstract_reader: a LabeledAbstractReader instance.
        target: the tag of interest (i.e., to be predicted)
        '''
        self.abstract_reader = abstract_reader
        self.target = target
        # this is a special target because we
        # enforce the additional constraint that
        # there be only one 'yes' vote per citation
        self.predicting_sample_size = target == "n"

        # reserve some data for testing?
        self.holding_out_a_test_set = hold_out_a_test_set
        if self.holding_out_a_test_set:
            assert test_set_p is not None

        self.test_set_p = test_set_p
        self.n_citations = len(self.abstract_reader)

    def plot_preds(self, preds, y):
        # (preds, y) = sl.cv()
        # sklearn wraps up the predicted results

        pos_indices = [i for i in xrange(len(y)) if y[i] > 0]
        all_preds = [preds[i][1] for i in xrange(len(y))]
        pos_preds = [preds[i][1] for i in pos_indices]

    def generate_features(self):
        print "generating feature vectors"

        # I don't think we ever want to flatten abstracts.
        self.features, self.y = self.features_from_citations(
            flatten_abstracts=False)
        self.vectorizer = DictVectorizer(sparse=True)

        # note that we keep structure around that keeps features
        # in citations together. specifically, features will be a
        # list of feature vectors representing words
        # in abstracts comprising distinct citations
        all_features = []
        for citation_fvs in self.features:
            all_features.extend(citation_fvs)

        self.vectorizer.fit(all_features)
        self.X_fv = []
        no_abstracts = 0
        for X_citation in self.features:
            if len(X_citation) > 0:
                self.X_fv.append(self.vectorizer.transform(X_citation))
            else:
                self.X_fv.append(None)
                no_abstracts += 1
        print "({0} had no abstracts!)".format(no_abstracts)
        #self.X_fv = [self.vectorizer.transform(X_citation) for X_citation in self.features if len(X_citation) > 0]

        if self.holding_out_a_test_set:
            self.set_held_out_indices()

    def set_held_out_indices(self):
        test_set_size = int(self.test_set_p * self.n_citations)
        print "setting aside a test set of size {0}".format(test_set_size)
        #import pdb; pdb.set_trace()
        self.test_indices = random.sample(range(self.n_citations),
                                          test_set_size)

    def select_train_citation_indices(self, train_p):
        '''
        this is somewhat confusing, but the idea here is to allow one to
        have a single, consistent test set and to increase the training
        set to see how this affects performance on said set. 
        '''
        # first remove the held out indices.
        self.train_indices = [
            i for i in range(self.n_citations) if not i in self.test_indices
        ]
        # now draw a sample from the remaining (train) abstracts.
        train_set_size = int(train_p * len(self.train_indices))
        print "going to train on {0} citations".format(train_set_size)
        self.train_indices = random.sample(self.train_indices, train_set_size)

    '''
    @TODO this method is meant to supplant the following routine.
    The idea is that it is more general, i.e., it allows us to
    assess performance on <tx>, etc; not just <n>
    '''

    def train_and_test(self, test_size=.2, train_p=None):
        test_citation_indices = None
        train_citation_indices = None
        if self.holding_out_a_test_set:
            print "using the held-out test set!"
            test_size = len(self.test_indices)
            test_citation_indices = self.test_indices
            train_citation_indices = self.train_indices
        else:
            test_size = int(test_size * self.n_citations)
            test_citation_indices = random.sample(range(self.n_citations),
                                                  test_size)

        print "test set of size {0} out of {1} total citations".format(
            test_size, self.n_citations)

    @staticmethod
    def max_index(a):
        return max((v, i) for i, v in enumerate(a))[1]

    def train_and_test_sample_size(self, test_size=.2, train_p=None):
        '''
        @TODO need to amend for predicting things other than sample size
        in retrospect, should probably never flatten abstracts; at test
        time we'll want to enforce certain constraints

        @TODO refactor -- this method is too long.
        '''
        test_citation_indices = None
        train_citation_indices = None
        if self.holding_out_a_test_set:
            print "using the held-out test set!"
            test_size = len(self.test_indices)
            test_citation_indices = self.test_indices
            train_citation_indices = self.train_indices
        else:
            test_size = int(test_size * self.n_citations)
            test_citation_indices = random.sample(range(self.n_citations),
                                                  test_size)

        print "test set of size {0} out of {1} total citations".format(
            test_size, self.n_citations)

        X_train, y_train = [], []
        X_test, y_test = [], []
        test_citation_indices.sort()  # not necessary; tmp
        for i in xrange(self.n_citations):
            if self.X_fv[i] is not None:
                is_a_training_instance = (train_citation_indices is None
                                          or i in train_citation_indices)
                if not i in test_citation_indices and is_a_training_instance:
                    # we flatten these for training.
                    X_train.extend(self.X_fv[i])
                    y_train.extend(self.y[i])

                elif i in test_citation_indices:
                    # these we keep structured, though.
                    X_test.append(self.X_fv[i])
                    y_test.append(self.y[i])

        clf = SupervisedLearner._get_SVM()
        X_train = scipy.sparse.vstack(X_train)
        clf.fit(X_train, y_train)
        print "ok -- testing!"
        max_index = lambda a: max((v, i) for i, v in enumerate(a))[1]
        '''
        @TODO refactor. note that this will have to change for other
        targets (TX's, etc.)
        '''
        TPs, FPs, N_pos = 0, 0, 0
        for test_citation_i, citation_fvs in enumerate(X_test):
            true_lbls_i = y_test[test_citation_i]
            preds_i = clf.best_estimator_.decision_function(citation_fvs)
            # we set the index corresponding to the max
            # val (most likely entry) to 1; all else are 0
            preds_i_max = max_index(preds_i)
            preds_i = [-1] * len(preds_i)
            preds_i[preds_i_max] = 1

            # *abstract level* predictions.
            if not 1 in true_lbls_i:
                cit_n = test_citation_indices[test_citation_i]
                print "-- no sample size for abstract (biview_id) {0}!".format(
                    self.abstract_reader[cit_n]["biview_id"])
                # since we force a prediction for every abstract right now,
                # i'll penalize us here. this is an upperbound on precision.
                FPs += 1
            else:
                N_pos += 1
                if preds_i.index(1) == true_lbls_i.index(1):
                    TPs += 1
                else:
                    FPs += 1

        N = len(X_test)
        return TPs, FPs, N_pos, N

    def cv(self, predict_probs=False):
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
            self.X_fv, self.y, test_size=0.1)
        clf = SupervisedLearner._get_SVM()
        clf.fit(X_train, y_train)
        preds = None
        if predict_probs:
            # well, *log* probs, anyway
            preds = [p[1] for p in clf.predict_log_proba(X_test)]
        else:
            preds = clf.predict(X_test)
        return preds, y_test

    @staticmethod
    def _get_SVM():
        tune_params = [{"C": [1, 5, 10, 100, 1000]}]
        return GridSearchCV(LinearSVC(), tune_params, scoring="f1")

    def train(self):
        features, y = self.features_from_citations()
        self.vectorizer = DictVectorizer(sparse=True)
        X_fv = self.vectorizer.fit_transform(features)

        self.clf = SupervisedLearner._get_SVM()

        ##
        # @TODO grid search over c?
        self.clf.fit(X_fv, y)

    def features_from_citations(self, flatten_abstracts=False):
        X, y = [], []

        pb = progressbar.ProgressBar(len(self.abstract_reader), timer=True)
        for cit_id in range(len(self.abstract_reader)):
            # first we perform feature extraction over the
            # abstract text (X)

            merged_tags = self.abstract_reader.get(cit_id)
            #pdb.set_trace()
            p = TaggedTextPipeline(merged_tags, window_size=4)
            p.generate_features()

            # @TODO will eventually want to exploit sentence
            # structure, I think

            ####
            # IM: 'punct' = token has all punctuation
            # filter here is a lambda function used on the
            # individual word's hidden features
            ###
            # X_i = p.get_features(flatten=True, filter=lambda w: w['punct']==False)
            # y_i = p.get_answers(flatten=True, answer_key=lambda w: "n" in w["tags"], filter=lambda w: w['punct']==False)

            ####
            # IM: xml annotations are now all available in w["tags"] for each word in the features list
            ####

            if self.predicting_sample_size:
                ###
                # restrict to integers only
                ###

                #X_i = p.get_features(flatten=True, filter=lambda w: w['num']==True)
                X_i = p.get_features(flatten=True, filter=integer_filter)
                y_i = p.get_answers(flatten=True,
                                    answer_key=is_sample_size,
                                    filter=integer_filter)
            else:
                X_i = p.get_features(flatten=False)
                y_i = p.get_answers(flatten=False, answer_key=is_target)

            if flatten_abstracts:
                X.extend(X_i)
                y.extend(y_i)
            else:
                X.append(X_i)
                y.append(y_i)

            pb.tap()

        return X, y

    def train_on_all_data(self):
        X_train, y_train = [], []
        for i in xrange(self.n_citations):
            if self.X_fv[i] is not None:
                # we flatten these for training.
                X_train.extend(self.X_fv[i])
                y_train.extend(self.y[i])

        clf = SupervisedLearner._get_SVM()
        X_train = scipy.sparse.vstack(X_train)
        print "fitting...."
        clf.fit(X_train, y_train)
        print "success!"

        return clf, self.vectorizer
Example #36
time_start = time.time()

df = pd.read_csv('datasets/training_data.csv')
#df_test = pd.read_csv('datasets/test.csv')
X = df.iloc[:, :-1]
y = df.iloc[:,-1]
# attribute = ['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat']
# 43 features
# to drop an unwanted feature: df_train.drop(df_train.columns[[0]], axis=1, inplace=True)
X = get_data_to_list(X)
#print(X.head())
#print(y.head())

#for i in test: print(i)
dv = DictVectorizer()
dv.fit(X)

X_vec = dv.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size = 0.3)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
print('rate:', metrics.accuracy_score(y_test, pred))

time_end = time.time()
print('time run:', time_end - time_start)


Example #37
df_no_priceLink = df_no_price.drop(['link'], 1)
df_no_priceLinkTitle = df.drop(['ID', 'price', 'link', 'title'], 1)

# --------------------------------------------------------------------------------------------------------

# Do the same for our testing file
testing_no_priceLinkTitle = testing.drop(['price', 'link', 'title'], 1)

# --------------------------------------------------------------------------------------------------------

#Now use the sklearn DictVectorizer library to map each column of the data frame into a numpy array
# Transforms lists of feature-value mappings to vectors.
#
#
dv = DictVectorizer()
dv.fit(df_no_priceLinkTitle.T.to_dict().values())

# --------------------------------------------------------------------------------------------------------

# Create linear regression object
LR = LinearRegression()

# Train the model using the training set (the DataFrame without title, link or price, with price as the target)
LR.fit(dv.transform(df_no_priceLinkTitle.T.to_dict().values()), df.price)

# Explained variance score: 1 is perfect prediction
print(
    'Variance score: %.2f' % LR.score(
        dv.transform(df_no_priceLinkTitle.T.to_dict().values()), df.price))

# --------------------------------------------------------------------------------------------------------
Example #38
class BaseLabeler(object):
    def __init__(self,
                 path_train="",
                 fileid_train_labeled="",
                 fileid_train_unlabeled="",
                 train_instances_labeled=None,
                 train_instances_unlabeled=None,
                 fileid_train_labeled_dep="",
                 fileid_train_unlabeled_dep="",
                 filter_func=None,
                 special_label="",
                 featselec_featorder=[],
                 feature_list=[],
                 path_test="",
                 fileid_test="",
                 model=None,
                 target_verbs=[],
                 discard_labels=[],
                 train_method="supervised",
                 verbose_fileid=None):

        self.filter_func = filter_func
        self.feature_list = feature_list
        self.path_test = path_test
        self.fileid_test = fileid_test
        self.model = model
        self.target_verbs = target_verbs
        self.discard_labels = discard_labels
        self.prop_num_per_verb = None
        self.verbose_fileid = verbose_fileid

        self.train_feats, self.train_labels, self.train_props = self._load_instances(
            path_train, fileid_train_labeled, train_instances_labeled,
            filter_func, special_label, False, fileid_train_labeled_dep,
            self.prop_num_per_verb)

        if train_method == "self-training":
            unlabeled_instances = self._load_instances(
                path_train, fileid_train_unlabeled, train_instances_unlabeled,
                filter_func, special_label, True, fileid_train_unlabeled_dep,
                self.prop_num_per_verb)
            # Extract features
            self.train_unlabeled_props = []
            self.train_unlabeled_feats = []
            self.train_unlabeled_labels = []
            for argcand in unlabeled_instances:
                argcand_feats, argcand_prop = self.extract_features(argcand)
                self.train_unlabeled_feats.append(argcand_feats)
                self.train_unlabeled_props.append(argcand_prop)

                argcand_label = argcand["info"]["label"]
                if argcand_label == "NULL":
                    self.train_unlabeled_labels.append("NULL")
                elif special_label != "":
                    self.train_unlabeled_labels.append(special_label)
                else:
                    self.train_unlabeled_labels.append(argcand_label)

        self.featselec_featorder = featselec_featorder
        self.train_method = train_method

    def set_params(self,
                   path_test="",
                   fileid_test="",
                   fileid_test_dep="",
                   prop_num_per_verb=None):
        self.path_test = path_test
        self.fileid_test = fileid_test
        self.fileid_test_dep = fileid_test_dep

        try:
            if self.prop_num_per_verb is None or prop_num_per_verb is not None:
                self.prop_num_per_verb = prop_num_per_verb

        except AttributeError:
            self.prop_num_per_verb = prop_num_per_verb

        return

    def _load_instances(self,
                        path,
                        fileid,
                        instances=None,
                        filter_func=None,
                        special_label="",
                        test=False,
                        fileid_dep="",
                        prop_num_per_verb=None):

        if instances is None:
            column_types = [
                "id", "words", "lemma", "pos", "feat", "clause", "fclause",
                "tree", "srl"
            ]
            reader = PropbankBrConllCorpusReader(path, fileid, column_types,
                                                 None, "S", False, True,
                                                 "utf-8")

            column_types_dep = [
                "id", "words", "lemma", "pos", "feat", "head", "deprel",
                "fillpred", "srl"
            ]
            reader_dep = PropbankBrConllCorpusReader(path, fileid_dep,
                                                     column_types_dep, None,
                                                     "FCL", False, False,
                                                     "utf-8")
            # Get the argument candidates
            argcands, self.prop_num_per_verb = self._read_instances(
                reader, filter_func, reader_dep, prop_num_per_verb)
        else:
            argcands = instances

        if test:
            return argcands

        # Extract the necessary features from the argument candidates
        train_argcands_props = []
        train_argcands_feats = []
        train_argcands_target = []

        for argcand in argcands:
            argcand_label = argcand["info"]["label"]
            if (argcand_label in self.discard_labels) or ("C-"
                                                          in argcand_label):
                continue

            arg_feats, arg_prop = self.extract_features(argcand)
            train_argcands_feats.append(arg_feats)
            train_argcands_props.append(arg_prop)

            if argcand_label == "NULL":
                train_argcands_target.append("NULL")
            elif special_label != "":
                train_argcands_target.append(special_label)
            else:
                train_argcands_target.append(argcand_label)

        # Create an encoder for the features
        self.feature_encoder = DictVectorizer()
        self.feature_encoder.fit(train_argcands_feats)

        # Create an encoder for the target labels
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(train_argcands_target)

        return train_argcands_feats, train_argcands_target, train_argcands_props

    def _read_instances(self,
                        reader,
                        filter_func=None,
                        reader_dep=None,
                        prop_num_per_verb=None):
        arg_cands = []
        if reader_dep is None:
            info_sent = [(lexinfo, ins, None)
                         for lexinfo, ins in zip(reader.lexicalinfo_sents(),
                                                 reader.srl_instances(None, None, False))]
        else:
            info_sent = zip(reader.lexicalinfo_sents(),
                            reader.srl_instances(None, None, False),
                            reader_dep.dep_parsed_sents())

        if prop_num_per_verb is None:
            prop_num_per_verb = dict()

        for lexinfo_sent, sent_ins, sent_ins_depgraph in info_sent:
            # Get the parse tree of the sentence
            tree = sent_ins.tree
            for ins in sent_ins:
                # Check if the instance belongs to one of the target verbs
                if (ins.verb_stem in self.target_verbs) or (self.target_verbs
                                                            == []):
                    if ins.verb_stem in prop_num_per_verb:
                        prop_num_per_verb[ins.verb_stem] += 1
                    else:
                        prop_num_per_verb[ins.verb_stem] = 1
                    verb_prop_num = prop_num_per_verb[ins.verb_stem]
                    if filter_func is None:
                        # Get the gold arguments
                        for arg in ins.arguments:
                            arg_cands.append({
                                "ins":
                                ins,
                                "verb_prop_num":
                                verb_prop_num,
                                "info_sent":
                                lexinfo_sent,
                                "info":
                                self._format_argcand(arg, lexinfo_sent, tree),
                                "depgraph":
                                sent_ins_depgraph
                            })
                    else:
                        # Prune the constituents of the sentence to get the argument candidates
                        pruned_argcands = filter_func(
                            tree, tree.leaf_treeposition(ins.verb_head))
                        # Format each argument candidate
                        for argcand_treepos in pruned_argcands:
                            argcand_span = util.treepos_to_tuple(
                                tree, argcand_treepos)
                            # Get the label of the argument candidate
                            for arg in ins.arguments:
                                if argcand_span == arg[0]:
                                    argcand_label = arg[-1]
                                    break
                            else:
                                argcand_label = "NULL"

                            arg_cands.append({
                                "ins":
                                ins,
                                "verb_prop_num":
                                verb_prop_num,
                                "info_sent":
                                lexinfo_sent,
                                "depgraph":
                                sent_ins_depgraph,
                                "info":
                                self._format_argcand(
                                    (argcand_span, argcand_label),
                                    lexinfo_sent, tree, argcand_treepos)
                            })

        return arg_cands, prop_num_per_verb

    def extract_features(self, argcand):
        feats_dep = feature_extractor_dep(argcand, self.feature_list)
        feats_const = feature_extractor_const(argcand, self.feature_list,
                                              argcand["depgraph"])
        feats_const.update(feats_dep)
        return feats_const, argcand["verb_prop_num"]

    def fit_mix(self, model_name="LogisticRegression"):
        if self.model == None:
            if model_name == "LinearSVC":
                model = LinearSVC(C=1, loss="l2")
            elif model_name == "SVC":
                model = SVC(kernel="poly")
            elif model_name == "LogisticRegression":
                model = LogisticRegression(C=8, penalty="l1")
            else:
                raise ValueError("Invalid model name.")

        if self.train_method == "supervised":
            self.model = model
            self.model.fit(self.feature_encoder.transform(self.train_feats),
                           self.label_encoder.transform(self.train_labels))

        return self.model

    def _join_test_discarded(self, test, discarded, test_order,
                             discarded_order):
        joined = test
        for arg, order in zip(discarded, discarded_order):
            joined.insert(order, arg)
        return joined

    def predict_mix(self, test_instances=[], filter_first=False):
        if test_instances == []:
            if self.fileid_test is not None:
                # Get the instances from the test set
                test_instances = self._load_instances(
                    path=self.path_test,
                    fileid=self.fileid_test,
                    filter_func=self.filter_func,
                    test=True,
                    fileid_dep=self.fileid_test_dep,
                    prop_num_per_verb=self.prop_num_per_verb)
            else:
                return []

        # We got the instances right from the output of the identification system
        # Therefore, we need to filter out first those that are not argument candidates
        if filter_first:
            test_argcands = []
            test_argcands_order = []
            discarded_argcands = []
            discarded_argcands_order = []
            discarded_argcands_labels = []
            order = 0
            for argcand, label in test_instances:
                if label != "NULL":
                    test_argcands.append(argcand)
                    test_argcands_order.append(order)
                else:
                    discarded_argcands.append(argcand)
                    discarded_argcands_order.append(order)
                    discarded_argcands_labels.append("NULL")
                order += 1
        else:
            test_argcands = test_instances

        # Extract features
        test_argcands_feats = []
        for argcand in test_argcands:
            argcands_feats, _ = self.extract_features(argcand)
            test_argcands_feats.append(argcands_feats)

        # Transform the features to the format required by the classifier
        test_argcands_feats = self.feature_encoder.transform(
            test_argcands_feats)

        # Classify the candidate arguments
        test_argcands_targets = self.model.predict(test_argcands_feats)

        # Get the correct label names
        test_argcands_labels = self.label_encoder.inverse_transform(
            test_argcands_targets)

        if filter_first:
            test_argcands = self._join_test_discarded(
                test_argcands, discarded_argcands, test_argcands_order,
                discarded_argcands_order)
            test_argcands_labels = self._join_test_discarded(
                test_argcands_labels.tolist(), discarded_argcands_labels,
                test_argcands_order, discarded_argcands_order)

        return zip(test_argcands, test_argcands_labels)

    def set_model_parameters(self, model_name, verbose=3, file_path=""):
        if self.model is not None:
            model_name = self.model.__class__.__name__

        if model_name == "LinearSVC":
            model_to_set = LinearSVC()
            parameters = {"C": [1, 2, 4, 8], "loss": ["l1", "l2"]}
        elif model_name == "SVC":
            model_to_set = OneVsRestClassifier(SVC(kernel="poly"))
            parameters = {
                "estimator__C": [1, 2, 4, 8],
                "estimator__kernel": ["poly", "rbf"],
                "estimator__degree": [1, 2, 3, 4]
            }
        elif model_name == "LogisticRegression":
            model_to_set = LogisticRegression()
            parameters = {"penalty": ["l1", "l2"], "C": [1, 2, 4, 8]}
        else:
            raise ValueError("Invalid model name.")

        # Perform Grid Search with 10-fold cross-validation to estimate the parameters
        # cv_generator = StratifiedKFold(self.label_encoder.transform(self.train_labels), n_folds=7)
        cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
        model_tunning = GridSearchCV(model_to_set,
                                     param_grid=parameters,
                                     scoring=f1_score,
                                     n_jobs=1,
                                     cv=cv_generator,
                                     verbose=verbose)

        # Perform parameter setting
        model_tunning.fit(self.train_feats,
                          self.label_encoder.transform(self.train_labels))

        if verbose > 0:
            print "Best model:"
            print model_tunning.best_estimator_
            print "Best parameters:"
            print model_tunning.best_params_
            print "Best score {}:".format(
                model_tunning.get_params()["score_func"])
            print model_tunning.best_score_

        if file_path != "":
            file_name = file_path + model_name + "AI_Semi.bin"
            if verbose > 0:
                print "Saving best model {}...".format(file_name)
            tunned_model_file = open(file_name, "wb")
            cPickle.dump(model_tunning.best_estimator_, tunned_model_file)
            tunned_model_file.close()

        self.model = model_tunning.best_estimator_

        return self.model

    def analyse_feature_salience(self,
                                 model_name="LogisticRegression",
                                 forward=True,
                                 verbose=0):
        if self.model is None:
            if model_name == "LinearSVC":
                model = LinearSVC(C=1, loss="l2")
            elif model_name == "SVC":
                model = SVC(kernel="poly")
            elif model_name == "LogisticRegression":
                model = LogisticRegression(C=8, penalty="l1")
            else:
                raise ValueError("Invalid model name.")
        else:
            model = self.model

        # cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
        cv_generator = StratifiedKFold(self.label_encoder.transform(
            self.train_labels),
                                       n_folds=7)
        fscv = FeatureSalienceCV(
            model,
            cv=cv_generator,
            forward=forward,
            score_func=[precision_score, recall_score, f1_score],
            sort_by="f1_score",
            verbose=verbose)

        fscv.fit_mix(self.train_feats,
                     self.label_encoder.transform(self.train_labels))

        return fscv

    def analyse_feature_selection(self,
                                  model_name="LogisticRegression",
                                  forward=True,
                                  verbose=0):
        if self.model is None:
            if model_name == "LinearSVC":
                model = LinearSVC(C=1, loss="l2")
            elif model_name == "SVC":
                model = SVC(kernel="poly")
            elif model_name == "LogisticRegression":
                model = LogisticRegression(C=8, penalty="l1")
            else:
                raise ValueError("Invalid model name.")
        else:
            model = self.model

        # cv_generator = KFold(len(self.train_labels), n_folds=10, shuffle=True)
        cv_generator = StratifiedKFold(self.label_encoder.transform(
            self.train_labels),
                                       n_folds=7)
        fscv = FeatureSelectionCV(model,
                                  cv=cv_generator,
                                  feature_order=self.featselec_featorder,
                                  score_func=f1_score,
                                  verbose=verbose)

        fscv.fit_mix(self.train_feats,
                     self.label_encoder.transform(self.train_labels))

        return fscv

    def _format_argcand(self,
                        argcand_tuple,
                        lexinfo_sent,
                        tree,
                        argcand_treepos=None):

        start_arg, end_arg = argcand_tuple[0]

        if argcand_treepos is None:
            argcand_treepos = tree.treeposition_spanning_leaves(
                start_arg, end_arg)

        argcand = dict()
        argcand["treepos"] = argcand_treepos
        argcand["span"] = argcand_tuple[0]
        argcand["label"] = argcand_tuple[-1]
        argcand["cat"] = util.get_postag(tree[argcand_treepos])
        argcand["lexinfo"] = dict()

        for i in range(start_arg, end_arg):
            id_token, word, lemma, pos, feat = lexinfo_sent[i]
            argcand["lexinfo"][id_token] = {
                "word": word,
                "lemma": lemma,
                "pos": pos,
                "feat": feat
            }

        return argcand
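
# A minimal, self-contained sketch (added for illustration; not part of the original
# class) of the filter-and-rejoin step in predict_mix above: candidates tagged "NULL"
# by the identification stage are set aside, the classifier labels only the rest, and
# the discarded items are spliced back in at their original positions, mirroring
# _join_test_discarded.
def _demo_filter_and_rejoin():
    kept = ["arg_b", "arg_d"]           # candidates actually sent to the classifier
    discarded = ["arg_a", "arg_c"]      # candidates the identifier marked as "NULL"
    discarded_order = [0, 2]            # their original positions in the sentence
    joined = list(kept)
    for arg, order in zip(discarded, discarded_order):
        joined.insert(order, arg)
    assert joined == ["arg_a", "arg_b", "arg_c", "arg_d"]
    return joined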
Exemple #39
0
def make_conversion_data(num_feat_files,
                         from_suffix,
                         to_suffix,
                         with_labels=True):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = [] if with_labels else None
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        # if we are not using labels, we do not want zero-valued features
        # because it may be the case that some subset of features end up
        # being all 0 and if this subset ends up being written out to a file
        # below, then for some formats (e.g., megam) nothing will get written
        # out which can cause issues when reading this file
        lowest_feature_value = 0 if with_labels else 1
        x = {
            "f{:03d}".format(feat_num):
            np.random.randint(lowest_feature_value, 4 + lowest_feature_value)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        if with_labels:
            labels.append(y)
        features.append(x)

    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    if with_labels:
        label_map = {
            label: num
            for num, label in enumerate(
                sorted({
                    label
                    for label in labels if not isinstance(label, (int, float))
                }))
        }
        # Add fake item to vectorizer for None
        label_map[None] = '00000'
    else:
        label_map = None

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # use '_unlabeled' as part of any file names when not using labels
    with_labels_part = '' if with_labels else '_unlabeled'

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(
            convert_dir, '{}_{}{}{}'.format(feature_name_prefix, i,
                                            with_labels_part, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {
                "f{:03d}".format(feat_num + j):
                features[example_num]["f{:03d}".format(feat_num + j)]
                for j in range(num_feats_per_file)
            }
            sub_features.append(x)
        train_fs = FeatureSet('sub_train',
                              ids,
                              labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        elif from_suffix in ['.arff', '.csv', '.tsv']:
            label_col = 'y' if with_labels else None
            Writer.for_path(train_path, train_fs, label_col=label_col).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(
        convert_dir, '{}{}_all{}'.format(feature_name_prefix, with_labels_part,
                                         to_suffix))
    train_fs = FeatureSet('train',
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=feat_vectorizer)

    # we need to do this to get around the FeatureSet using NaNs
    # instead of None when there are no labels which causes problems
    # later when comparing featuresets
    if not with_labels:
        train_fs.labels = [None] * len(train_fs.labels)

    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    elif to_suffix in ['.arff', '.csv', '.tsv']:
        label_col = 'y' if with_labels else None
        Writer.for_path(train_path, train_fs, label_col=label_col).write()
    else:
        Writer.for_path(train_path, train_fs).write()
Exemple #40
0
def Feature_Names(cluster_dict):
    vectorizer = DictVectorizer(sparse=False)
    vectorizer.fit([OrderedDict.fromkeys(cluster_dict.keys(), 1)])
    cluster_id_features = vectorizer.get_feature_names()
    return cluster_id_features
def encode_categorical_features(features, sparse=True):
    encoder = DictVectorizer(sparse=sparse)
    encoder.fit(features)
    encoded_features = encoder.transform(features)
    return encoded_features, encoder
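
# Hedged usage sketch for the helpers above (made-up records; assumes DictVectorizer is
# imported as in the original module): encode_categorical_features one-hot encodes
# string-valued keys and passes numeric values through unchanged.
_demo_records = [{'color': 'red', 'size': 2}, {'color': 'blue', 'size': 3}]
_demo_encoded, _demo_encoder = encode_categorical_features(_demo_records, sparse=False)
# _demo_encoded has columns ['color=blue', 'color=red', 'size'].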
Exemple #42
0
train_set_size = int(train_set_ration * len(titan))
test_set_size = len(titan) - train_set_size

train_set = titan.loc[:train_set_size]
test_set = titan.loc[train_set_size:]

# Create features and target for train
train_features = feature_cleaning(train_set)
train_target = train_set['Survived']

test_features = feature_cleaning(test_set)
test_target = test_set['Survived']

# Pre-processing
vectorizer = DictVectorizer(sparse=False)
vectorizer.fit(train_features.to_dict(orient='records'))

cleaned_train_features = ghosl(train_features, vectorizer)
cleaned_test_features = ghosl(test_features, vectorizer)

# Make model (Logistic regression)
lr = lrm.LogisticRegression()
lr.fit(cleaned_train_features, train_target)

print("Train set accuracy: " + str(show_result(cleaned_train_features, train_target, lr)))
print("Test set accuracy: " + str(show_result(cleaned_test_features, test_target, lr)))

eval_data = ps.read_csv('data/test.csv')
cleaned_eval_data = feature_cleaning(eval_data,False)
result = solve(cleaned_eval_data,lr,vectorizer)
Exemple #43
0
class NMF_Label:
    '''
    NMF-based label predictor: embeds features and labels with DictVectorizer,
    builds their co-occurrence matrix, and factorizes it with NMF so that new
    feature sets can be matched to labels by nearest neighbour in the encoded space.
    '''
    def __init__(self, n_components=20,
                       distance='cos',
                       sparse=True,
                       tfidf=False):
        self.n_components = n_components
        self.distance = distance
        self.tfidf = tfidf
        
        # DictVectorizer for embedding
        self.dv_x = DictVectorizer(sparse=sparse)
        self.dv_y = DictVectorizer(sparse=sparse)
        
        # Model for Tfidf
        self.TFIDF = None
        
        
    def build(self, universe_x, universe_y):
        # Build DictVectorizer for feature (x)
        self.dv_x.fit([ {x: 1} for x in universe_x ])
        self.map_v2i_x = self.dv_x.vocabulary_
        self.map_i2v_x = dict(zip(self.map_v2i_x.values(), self.map_v2i_x.keys()))        
        
        # Build DictVectorizer for target (y)
        self.dv_y.fit([ {x: 1} for x in universe_y ])
        self.map_v2i_y = self.dv_y.vocabulary_
        self.map_i2v_y = dict(zip(self.map_v2i_y.values(), self.map_v2i_y.keys()))
        
        
    def compile(self):
        # Deterministic, do nothing
        pass
        
        
    def fit(self, x=None,
                  y=None,
                  verbose=False):
        
        # Embed feature (x)
        embed_matrix_x = self.dv_x.transform([ {v: 1 for v in arr} for arr in x ])
        if self.tfidf:
            self.TFIDF = TfidfModel(list( [ (j, row[0,j]) for j in row.nonzero()[1] ] for row in embed_matrix_x ),
                                    normalize=False)
            embed_matrix_x = self.dv_x.transform([ { self.map_i2v_x[i]: w
                                                     for i,w in self.TFIDF[list( (self.map_v2i_x[v], 1.0) 
                                                                                 for v in arr if v in self.map_v2i_x )] }
                                                   for arr in x ])
        
        # Embed target (y)
        embed_matrix_y = self.dv_y.transform([ {v: 1 for v in arr} for arr in y ])
        
        # Co-occurrence matrix
        #   Raw
        co_matrix = embed_matrix_y.T.dot(embed_matrix_x)
        #   Normalized (row-wise)
        co_matrix_norm = co_matrix / np.linalg.norm(co_matrix.A, ord=2, axis=1, keepdims=True)
        
        # Factorize using NMF
        nmf = NMF(n_components=self.n_components, random_state=RANDOM_STATE)#, beta_loss='kullback-leibler', solver='mu')
        self.U = nmf.fit_transform(co_matrix)
        self.V = nmf.components_.T
        if verbose:
            print('Recon error: {0}, Raw matrix norm: {1}'.format(nmf.reconstruction_err_, np.linalg.norm(co_matrix.A, ord=2)))
        
            
    def predict(self, x, n_best=1):
            
        # Embed feature (x)
        if self.tfidf:
            embed_matrix_x = self.dv_x.transform([ { self.map_i2v_x[i]: w
                                                     for i,w in self.TFIDF[list( (self.map_v2i_x[v], 1.0) 
                                                                                 for v in arr if v in self.map_v2i_x )] }
                                                   for arr in x ])
        else:
            embed_matrix_x = self.dv_x.transform([ {v: 1 for v in arr} for arr in x ])
        
        # Transform embedded description into encoded space 
        enc_x = embed_matrix_x.dot(self.V)
        
        # Match by finding NN in encoded space wrt rows of U
        if self.distance == 'cos':
            # Cosine distance
            #   Normalize encoded vector 
            U_norm = self.U / (np.linalg.norm(self.U, ord=2, axis=1, keepdims=True) + EPS)
            enc_x_norm = enc_x / (np.linalg.norm(enc_x, ord=2, axis=1, keepdims=True) + EPS)
            
            dist_matrix = U_norm.dot(enc_x_norm.T)

            # y_idx = np.argmax(dist_matrix, axis=0)
            y_idx = np.argsort(dist_matrix, axis=0, )[-n_best:, :].T
        
        # Recover target (y) from embed idx
        y = utils.asarray_of_list([ [ self.map_i2v_y[i] for i in arr ] for arr in y_idx ])
        
        return y
        
    
    def save_model(self, filename):
        with open(filename, 'wb') as file:
            pickle.dump((self.U, self.V), file)
        
        
    def load_model(self, filename):
        with open(filename, 'rb') as file:
            self.U, self.V = pickle.load(file)
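
# A self-contained sketch (toy data, separate from the class above) of the core idea:
# embed features and labels with two DictVectorizers, form the label-by-feature
# co-occurrence matrix, and factorize it with NMF, much as NMF_Label.fit does.
from sklearn.decomposition import NMF
from sklearn.feature_extraction import DictVectorizer

_x_sets = [["red", "round"], ["yellow", "long"], ["red", "long"]]
_y_sets = [["apple"], ["banana"], ["pepper"]]

_dv_x, _dv_y = DictVectorizer(sparse=True), DictVectorizer(sparse=True)
_X = _dv_x.fit_transform([{v: 1 for v in arr} for arr in _x_sets])
_Y = _dv_y.fit_transform([{v: 1 for v in arr} for arr in _y_sets])

_co = _Y.T.dot(_X)                      # labels x features co-occurrence counts
_nmf = NMF(n_components=2, random_state=0)
_U = _nmf.fit_transform(_co)            # label embeddings
_V = _nmf.components_.T                 # feature embeddings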
Exemple #44
0
    {
        'packed': 1,
        'contains_encrypted': 0
    },
    {
        'packed': 0,
        'contains_encrypted': 0
    },
    {
        'packed': 0,
        'contains_encrypted': 0
    },
]
ground_truth = [1, 1, 1, 1, 0, 0, 0, 0]
# initialize the vectorizer with the training data
vectorizer.fit(training_examples)

# transform the training examples to vector form
X = vectorizer.transform(training_examples)
y = ground_truth  # call ground truth 'y', by convention
# train the classifier (a.k.a. 'fit' the classifier)
classifier.fit(X, y)
test_example = {'packed': 1, 'contains_encrypted': 0}
test_vector = vectorizer.transform(test_example)
print(classifier.predict(test_vector))  # prints [1]
#visualize the decision tree
with open("classifier.dot", "w") as output_file:
    tree.export_graphviz(classifier,
                         feature_names=vectorizer.get_feature_names(),
                         out_file=output_file)
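
# A brief hedged aside (illustrative only, not part of the original script): for the two
# binary keys above, DictVectorizer assigns one column per key in sorted name order, so
# {'packed': 1, 'contains_encrypted': 0} becomes the row [0., 1.] with columns
# ['contains_encrypted', 'packed'].
from sklearn.feature_extraction import DictVectorizer as _DemoDictVectorizer

_demo_vec = _DemoDictVectorizer(sparse=False)
_demo_rows = _demo_vec.fit_transform([
    {'packed': 1, 'contains_encrypted': 0},
    {'packed': 0, 'contains_encrypted': 1},
])
# _demo_rows == [[0., 1.], [1., 0.]]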
class NGramFrequencyExtractor(BaseEstimator, TransformerMixin):
    """
    Transformer object turning messages into frequency feature vectors, counting n-grams up to a specified maximum size.
    scikit-learn documentation on creating estimators: http://scikit-learn.org/dev/developers/contributing.html#rolling-your-own-estimator
    """
    def __init__(self,
                 lexicon,
                 form=None,
                 default_form=lambda word, lexicon: word,
                 ngram_size=1,
                 adjust_for_message_len=True):
        self.lexicon = lexicon
        self.form = form
        self.default_form = default_form
        self.ngram_size = ngram_size
        self.vectorizer = DictVectorizer()
        self.adjust_for_message_len = adjust_for_message_len

    def extract_frequency_dicts(self, X):
        frequency_dicts = []
        for message in X:
            tuple_ngrams = nltk.ngrams(self.retrieve_lexical_form(message),
                                       self.ngram_size)
            string_ngrams = []
            for ngram in tuple_ngrams:
                string_ngrams.append(",".join(ngram))

            frequency_dict = Counter(string_ngrams)
            if self.adjust_for_message_len:
                for ngram in frequency_dict:
                    frequency_dict[ngram] = frequency_dict[ngram] / len(
                        string_ngrams)

            frequency_dicts.append(frequency_dict)

        return frequency_dicts

    def fit(self, X, y=None):
        """
        Determines the list of tokens and ngrams to be used
        :param X: List of tokenised messages
        :type X: list(list(str))
        """
        frequency_dicts = self.extract_frequency_dicts(X)
        self.vectorizer.fit(frequency_dicts)
        return self

    def transform(self, X, y=None):
        """
        Transforms tokenised messages into frequency vectors
        :return: frequency vectors
        :rtype: numpy array of shape [n_samples, n_features]
        """
        frequency_dicts = self.extract_frequency_dicts(X)
        return self.vectorizer.transform(frequency_dicts)

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data then transform it
        :return: frequency vectors
        :rtype: numpy array of shape [n_samples, n_features]
        """
        frequency_dicts = self.extract_frequency_dicts(X)
        return self.vectorizer.fit_transform(frequency_dicts)

    def get_feature_names(self):
        try:
            return self.vectorizer.get_feature_names()
        except AttributeError:
            raise AttributeError(
                "No feature names, object has not been fitted")

    def retrieve_lexical_form(self, message):
        if self.form is None:
            return message

        assert self.lexicon.has_feature(self.form)

        transformed_message = []
        for word in message:
            if word in self.lexicon and self.lexicon.get_feature_value_by_word(
                    word, self.form):
                transformed_message.append(
                    self.lexicon.get_feature_value_by_word(word, self.form))
            else:
                transformed_message.append(
                    self.default_form(word, self.lexicon))

        return transformed_message
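
# A hedged usage sketch for the extractor above, assuming already-tokenised messages and
# the default form=None (the lexicon is never consulted on that path, so None suffices
# here), with nltk and Counter imported as in the original module:
_demo_messages = [["the", "cat", "sat"], ["the", "dog", "sat"]]
_demo_extractor = NGramFrequencyExtractor(lexicon=None, ngram_size=2)
_demo_vectors = _demo_extractor.fit_transform(_demo_messages)
# Each row holds relative bigram frequencies for keys such as "the,cat" and "cat,sat".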
            "Bedroom AbvGr", 
            "Year Built", "Yr Sold",
            "Neighborhood"]

X_train_dict = housing[features].to_dict(orient="records")
y_train = housing["SalePrice"]
# -

# Now we will use Scikit-Learn to preprocess the features...

# +
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import StandardScaler

vec = DictVectorizer(sparse=False)
vec.fit(X_train_dict)
X_train = vec.transform(X_train_dict)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
# -

# ...and to fit the $k$-nearest neighbors model to the data.

# +
from sklearn.neighbors import KNeighborsRegressor

# Fit a 10-nearest neighbors model.
model = KNeighborsRegressor(n_neighbors=10)
model.fit(X_train_sc, y_train)
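# -

# As a short follow-up sketch (not part of the original notebook), a new record can be
# scored by reusing the fitted vectorizer and scaler; here the first training record
# simply stands in for a "new" house.

# +
x_new_dict = X_train_dict[:1]
x_new_sc = scaler.transform(vec.transform(x_new_dict))
model.predict(x_new_sc)
# -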
Exemple #47
0
class Normalizer(object):
    '''Data converter and normalizer for the classifiers.'''
    def __init__(self, norm="l2"):
        super(Normalizer, self).__init__()
        self.featureNames = [
            "Provincia", "Canton", "Totalpobla", "Superficie", "Densidadpobln",
            "Urbano/Rural", "Genero", "Edad", "Dependencia", "Alfabeta",
            "Escolaridadpromedio", "Escolaridadregular", "Trabaja",
            "Asegurado", "Cantcasas", "Ocupantespromedio", "Condicion",
            "Hacinada", "Nacido", "Discapacitado", "Jefaturafemenina",
            "Jefaturacompartida", "Votoronda1", "Votoronda2"
        ]
        self.converter = DictVectorizer(sparse=False)
        self.norm = norm
        self.normalData = {}
        self.convertedData = {}
        self.tensorColumns = []

    def prepare_data_tensor(self, samples, pct_test):
        data = self.separate_data(samples, pct_test)

        for key in data:
            if "Classes" not in key:
                data[key] = self.convert_to_dict_list(data[key])
        return data

    '''
    Returns the data from the samples passed as a parameter, as a dictionary
    of the form:
        {
            "trainingFeatures": <training data>,
            "testingFeatures": <testing data>,
            "trainingFeaturesFirstInclude": <training data
                                                including the first-round vote>,
            "testingFeaturesFirstInclude": <testing data
                                                including the first-round vote>,
            "trainingClassesFirst": <first-round results for the
                                        training data>,
            "trainingClassesSecond": <second-round results for the
                                        training data>,
            "testingClassesFirst": <first-round results for the
                                        test data>,
            "testingClassesSecond": <second-round results for the
                                        test data>
        }
    Input: data generated by the sample generator, and the percentage to be
    used for testing.
    Output: the data as a dictionary
    '''

    def prepare_data(self, samples, pct_test):
        data = self.separate_data(samples, pct_test)
        # The data is transformed into numbers only
        self.convert_data(data)
        return data

    '''
    Converts all the data to values between 0 and 1
    Input: data already transformed into numbers
    Output: the new data is stored in the same dictionary
    '''

    def normalize_data(self, data):
        data["trainingFeatures"] = normalize(data["trainingFeatures"],
                                             norm=self.norm,
                                             copy=False)
        data["testingFeatures"] = normalize(data["testingFeatures"],
                                            norm=self.norm,
                                            copy=False)
        data["trainingFeaturesFirstInclude"] = normalize(
            data["trainingFeaturesFirstInclude"], norm=self.norm, copy=False)
        data["testingFeaturesFirstInclude"] = normalize(
            data["testingFeaturesFirstInclude"], norm=self.norm, copy=False)

    '''
    Converts the data into dictionaries keyed by the indicators (the property
    names)
    Input: data to transform, of the form:
        [["Genero", "Canton",...],...]
    Output: the data as a list of dictionaries
    '''

    def convert_to_dict_list(self, samples):
        features = []

        for featureList in samples:

            dictFeatures = {}
            featureNum = 0
            for feature in featureList:
                try:
                    feature = float(feature)
                except ValueError:
                    # The property is a string
                    pass
                dictFeatures[self.featureNames[featureNum]] = feature
                featureNum += 1

            features.append(dictFeatures)

        return features

    def convert_to_list(self, dict):
        list = []

        for key in dict:
            list.append(dict[key])

        return list

    '''
    Converts the data into numeric values
    Input: list of data in dictionary form
    Output: stores the data in the same input dictionary
    '''

    def convert_data(self, data):

        for key in data:
            if "Classes" not in key:
                data[key] = self.convert_to_dict_list(data[key])

        self.converter.fit(
            np.append(data["trainingFeatures"],
                      data["testingFeatures"],
                      axis=0))

        data["trainingFeatures"] = self.converter.transform(
            data["trainingFeatures"])
        data["testingFeatures"] = self.converter.transform(
            data["testingFeatures"])

        self.converter.fit(
            np.append(data["trainingFeaturesFirstInclude"],
                      data["testingFeaturesFirstInclude"],
                      axis=0))

        data["trainingFeaturesFirstInclude"] = self.converter.transform(
            data["trainingFeaturesFirstInclude"])
        data["testingFeaturesFirstInclude"] = self.converter.transform(
            data["testingFeaturesFirstInclude"])

        self.convertedData = copy.deepcopy(data)

    '''
    Splits the data into training and testing data
    Input: the data generated by the generator, and the percentage to use for
    testing.
    Output: a dictionary with the separated data
    '''

    def separate_data(self, samples, pct_test):

        samplesArray = np.array(samples)
        X = samplesArray[:, :22]
        y = samplesArray[:, 22:]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=pct_test,
                                                            random_state=42)

        y_train_first_round = y_train[:, 0]
        y_train_second_round = y_train[:, 1]

        y_test_first_round = y_test[:, 0]
        y_test_second_round = y_test[:, 1]

        X_train_2 = np.append(X_train, y_train[:, :1], axis=1)
        X_test_2 = np.append(X_test, y_test[:, :1], axis=1)

        self.normalData = {
            "trainingFeatures": X_train,
            "testingFeatures": X_test,
            "trainingFeaturesFirstInclude": X_train_2,
            "testingFeaturesFirstInclude": X_test_2,
            "trainingClassesFirst": y_train_first_round,
            "trainingClassesSecond": y_train_second_round,
            "testingClassesFirst": y_test_first_round,
            "testingClassesSecond": y_test_second_round
        }

        return {
            "trainingFeatures": X_train,
            "testingFeatures": X_test,
            "trainingFeaturesFirstInclude": X_train_2,
            "testingFeaturesFirstInclude": X_test_2,
            "trainingClassesFirst": y_train_first_round,
            "trainingClassesSecond": y_train_second_round,
            "testingClassesFirst": y_test_first_round,
            "testingClassesSecond": y_test_second_round
        }

    def separate_data_2(self, samples, pct_test):

        samplesArray = np.array(samples)
        X = samplesArray
        y = samplesArray[:, 22:]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=pct_test,
                                                            random_state=42)

        X_train_2 = np.delete(X_train, [22], axis=1)
        X_test_2 = np.delete(X_test, [22], axis=1)

        X_train_3 = X_train
        X_test_3 = X_test

        X_train = np.delete(X_train, [23], axis=1)
        X_test = np.delete(X_test, [23], axis=1)

        y_train_first_round = y_train[:, 0]
        y_train_second_round = y_train[:, 1]

        y_test_first_round = y_test[:, 0]
        y_test_second_round = y_test[:, 1]

        self.normalData = {
            "trainingFeatures": X_train,
            "testingFeatures": X_test,
            "trainingFeaturesSecond": X_train_2,
            "testingFeaturesSecond": X_test_2,
            "trainingFeaturesFirstInclude": X_train_3,
            "testingFeaturesFirstInclude": X_test_3,
            "trainingClassesFirst": y_train_first_round,
            "trainingClassesSecond": y_train_second_round,
            "testingClassesFirst": y_test_first_round,
            "testingClassesSecond": y_test_second_round
        }

        return {
            "trainingFeaturesFirst": X_train,
            "testingFeaturesFirst": X_test,
            "trainingFeaturesSecond": X_train_2,
            "testingFeaturesSecond": X_test_2,
            "trainingFeaturesFirstInclude": X_train_3,
            "testingFeaturesFirstInclude": X_test_3,
            "trainingClassesFirst": y_train_first_round,
            "trainingClassesSecond": y_train_second_round,
            "testingClassesFirst": y_test_first_round,
            "testingClassesSecond": y_test_second_round
        }

    def get_normal_data(self):
        return self.normalData

    def get_converted_data(self):
        return self.convertedData

    '''
    Compares the sizes of two lists and returns the size of the larger list
    Input: two lists
    Output: size of the larger list
    '''

    def bigger_size(self, list1, list2):
        if len(list1) >= len(list2):
            return len(list1)
        else:
            return len(list2)

    '''
    Creates a list of zeros of the desired size
    Input: size of the list to create
    Output: list of zeros of the given size
    '''

    def extra_list(self, num):
        temp = []
        for i in range(num):
            temp += [0]
        return temp
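
# Illustrative sketch (single made-up row; real data comes from the sample generator,
# and the module's own imports such as DictVectorizer are assumed) of the conversion
# step documented above: each raw row is zipped with featureNames into a dict, and the
# DictVectorizer converter one-hot encodes string-valued properties while keeping
# numeric ones as-is.
_demo_norm = Normalizer()
_demo_rows = [["San Jose", "Central", 1000, 44.6, 22.4, "Urbano", "M", 30.0,
               0.5, 1, 8.0, 0.9, 1, 1, 300, 3.5, "Buena", 0, 1, 0, 0, 0,
               "PAC", "PAC"]]
_demo_dicts = _demo_norm.convert_to_dict_list(_demo_rows)
_demo_norm.converter.fit(_demo_dicts)
_demo_matrix = _demo_norm.converter.transform(_demo_dicts)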
def model(load_model, model_type):
    # Create training and testing set
    X_dict_train, y_train = process_data('train')
    X_dict_validation = process_data('validation')

    # Creating test set and turn into one-hot encoded vectors
    dict_one_hot_encoder = DictVectorizer(sparse=False)
    dict_one_hot_encoder.fit(X_dict_train)
    X_validation = dict_one_hot_encoder.transform(X_dict_validation)

    # Load Model
    if load_model:
        print('Loading model from previous training...')
        if model_type == 'decision_tree':
            d_tree_file = open('./models/decision_tree_model.sav', 'rb')
        elif model_type == 'random_forest':
            d_tree_file = open('./models/random_forest_model.sav', 'rb')
        else:
            print("Cannot load model without model_type")
            return 0
        train_model = pickle.load(d_tree_file)
        d_tree_file.close()

    if not load_model:
        # Transform training dictionary into one-hot encoded vectors
        X_train = dict_one_hot_encoder.transform(X_dict_train)
        print('Completed processing data')

        # Train decision tree classifier
        if model_type == 'decision_tree':
            train_model = DecisionTreeClassifier(criterion='gini',
                                                 min_samples_split=30)
        elif model_type == 'random_forest':
            train_model = RandomForestClassifier(n_estimators=100,
                                                 criterion='gini',
                                                 min_samples_split=30,
                                                 n_jobs=-1)
        else:
            print("Cannot set up model without model_type")
            return 0
        print("Started training...")
        train_model.fit(X_train, y_train)
        print('Completed training')

        # Save Model
        if model_type == 'decision_tree':
            model_file = open('./models/decision_tree_model.sav', "wb")
        elif model_type == 'random_forest':
            model_file = open('./models/random_forest_model.sav', "wb")
        else:
            print("Cannot save model without model_type")
            return 0
        pickle.dump(train_model, model_file)
        model_file.close()
        print('Saved model')

    # Evaluate and run model on validation data
    print('Tuning base bid for the model...')
    pCTRs = train_model.predict_proba(X_validation)[:, 1]
    if model_type == 'decision_tree':
        f = open('tune_base_bid_decision_tree.csv', 'w')
    elif model_type == 'random_forest':
        f = open('tune_base_bid_random_forest.csv', 'w')
    else:
        print("Cannot save model without model_type")
        return 0
    f.write('basebid,clicks,CTR,spend,avgCPM,avgCPC\n')
    for base_bid in range(1, 201, 1):
        bidding_results = bidding(pCTRs, base_bid)
        for bidding_result in bidding_results:
            f.write(str(bidding_result) + ',')
        f.write('\n')
    f.close()
    return 0
Exemple #49
0
        for index, (term, class_) in enumerate(pos_tags):
            # Add basic NLP features for each sentence term
            X.append(add_basic_features(untag(pos_tags), index))
            y.append(class_)
    return X, y


# get the right set
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(testing_sentences)
X_val, y_val = transform_to_dataset(validation_sentences)

# Fit our DictVectorizer with our set of features
from sklearn.feature_extraction import DictVectorizer
dict_vectorizer = DictVectorizer(sparse=True)
dict_vectorizer.fit(X_train + X_test + X_val)

# Convert dict features to vectors
X_train = dict_vectorizer.transform(X_train)
X_test = dict_vectorizer.transform(X_test)
X_val = dict_vectorizer.transform(X_val)

from sklearn.preprocessing import LabelEncoder
# Fit LabelEncoder with our list of classes
label_encoder = LabelEncoder()
label_encoder.fit(y_train + y_test + y_val)
# Encode class values as integers
y_train = label_encoder.transform(y_train)
y_test = label_encoder.transform(y_test)
y_val = label_encoder.transform(y_val)
def main():
    import gp
    from sklearn.feature_extraction import DictVectorizer

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-g',
        '--gpu-split',
        type=float,
        default=1,
        help="Num ways we'll split the GPU (how many tabs you running?)")
    parser.add_argument('-n',
                        '--net-type',
                        type=str,
                        default='conv2d',
                        help="(lstm|conv2d) Which network arch to use")
    parser.add_argument(
        '--guess',
        type=int,
        default=-1,
        help="Run the hard-coded 'guess' values first before exploring")
    parser.add_argument(
        '--boost',
        action="store_true",
        default=False,
        help=
        "Use custom gradient-boosting optimization, or bayesian optimization?")
    args = parser.parse_args()

    # Encode features
    hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
    hypers_, hardcoded = hsearch.hypers, hsearch.hardcoded
    hypers_ = {k: v for k, v in hypers_.items() if k not in hardcoded}
    hsearch.close()

    # Build a matrix of features,  length = max feature size
    max_num_vals = 0
    for v in hypers_.values():
        l = len(v['vals'])
        if l > max_num_vals: max_num_vals = l
    empty_obj = {k: None for k in hypers_}
    mat = pd.DataFrame([empty_obj.copy() for _ in range(max_num_vals)])
    for k, hyper in hypers_.items():
        for i, v in enumerate(hyper['vals']):
            mat.loc[i, k] = v
    mat.ffill(inplace=True)

    # Above is Pandas-friendly stuff, now convert to sklearn-friendly & pipe through OneHotEncoder
    vectorizer = DictVectorizer()
    vectorizer.fit(mat.T.to_dict().values())
    feat_names = vectorizer.get_feature_names()

    # Map TensorForce actions to GP-compatible `domain`
    # instantiate just to get actions (get them from hypers above?)
    bounds = []
    for k in feat_names:
        hyper = hypers_.get(k, False)
        # One-hot "key=value" columns have no hyper entry; give them [0, 1] bounds
        # instead of reusing stale values from a previous loop iteration.
        bounded = bool(hyper) and hyper['type'] == 'bounded'
        min_, max_ = (min(hyper['vals']), max(hyper['vals'])) if bounded else (0, 1)
        b = [min_, max_] if bounded else [0, 1]
        bounds.append(b)

    def hypers2vec(obj):
        h = dict()
        for k, v in obj.items():
            if k in hardcoded: continue
            if type(v) == bool: h[k] = float(v)
            else: h[k] = v or 0.
        return vectorizer.transform(h).toarray()[0]

    def vec2hypers(vec):
        # Reverse the encoding
        # https://stackoverflow.com/questions/22548731/how-to-reverse-sklearn-onehotencoder-transform-to-recover-original-data
        # https://github.com/scikit-learn/scikit-learn/issues/4414
        reversed = vectorizer.inverse_transform([vec])[0]
        obj = {}
        for k, v in reversed.items():
            if '=' not in k:
                obj[k] = v
                continue
            if k in obj: continue  # we already handled this x=y logic (below)
            # Find the winner (max) option for this key
            score, attr, val = v, k.split('=')[0], k.split('=')[1]
            for k2, score2 in reversed.items():
                if k2.startswith(attr + '=') and score2 > score:
                    score, val = score2, k2.split('=')[1]
            obj[attr] = val

        # Bools come in as floats. Also, if the result is False they don't come in at all! So we iterate over the
        # hypers here instead of nesting this logic in the reversed-iteration above.
        for k, v in hypers_.items():
            if v['type'] == 'bool':
                obj[k] = bool(round(obj.get(k, 0.)))
        return obj

    # Specify the "loss" function (which we'll maximize) as a single rl_hsearch instantiate-and-run
    def loss_fn(params):
        hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
        reward = hsearch.execute(vec2hypers(params))
        hsearch.close()
        return [reward]

    guess_i = 0
    while True:
        # Every iteration, re-fetch from the database & pre-train new model. Acts same as saving/loading a model to disk,
        # but this allows to distribute across servers easily
        conn_runs = data.engine_runs.connect()
        sql = "select hypers, advantages, advantage_avg from runs where flag=:f"
        runs = conn_runs.execute(text(sql), f=args.net_type).fetchall()
        conn_runs.close()
        X, Y = [], []
        for run in runs:
            X.append(hypers2vec(run.hypers))
            Y.append([utils.calculate_score(run)])
        boost_model = print_feature_importances(X, Y, feat_names)

        if args.guess != -1:
            guess = {k: v['guess'] for k, v in hypers_.items()}
            guess.update(utils.guess_overrides[args.guess][guess_i])
            loss_fn(hypers2vec(guess))

            guess_i += 1
            if guess_i > len(utils.guess_overrides[args.guess]) - 1:
                args.guess = -1  # start on GP

            continue

        if args.boost:
            print('Using gradient-boosting')
            boost_optimization(model=boost_model,
                               loss_fn=loss_fn,
                               bounds=np.array(bounds),
                               x_list=X,
                               y_list=Y)
        else:
            # Evidently duplicate values break GP. Many of these are ints, so they're definite duplicates. Either way,
            # tack on some small epsilon to make them different (1e-6 is below gp.py's min threshold; make sure that
            # number is not a problem). This is a concern since many hypers can go below that epsilon (e.g. learning rate).
            for x in X:
                for i, v in enumerate(x):
                    x[i] += np.random.random() * 1e-6
            gp.bayesian_optimisation2(loss_fn=loss_fn,
                                      bounds=np.array(bounds),
                                      x_list=X,
                                      y_list=Y)
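
# A stand-alone sketch (illustrative hyper names and values only) of the encode/decode
# trick used above: categorical hypers expand into "key=value" one-hot columns, numeric
# hypers stay as single columns, and vec2hypers-style decoding picks the highest-scoring
# "key=value" column per key to recover the original choice.
from sklearn.feature_extraction import DictVectorizer

_demo_vec = DictVectorizer()
_demo_vec.fit([{'net.type': 'lstm', 'lr': 1e-3}, {'net.type': 'conv2d', 'lr': 1e-2}])
_demo_row = _demo_vec.transform([{'net.type': 'conv2d', 'lr': 5e-4}]).toarray()[0]
_demo_decoded = _demo_vec.inverse_transform([_demo_row])[0]
# _demo_decoded == {'lr': 0.0005, 'net.type=conv2d': 1.0}; splitting keys on '=' and
# keeping the max-scoring option per attribute reconstructs the original dict.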
Exemple #51
0
imp = Imputer(missing_values='NaN', strategy='mean', axis=1)
for i in num_in:
    train.iloc[:, i] = np.transpose(imp.fit_transform(train.iloc[:, i]))
    test.iloc[:, i] = np.transpose(imp.fit_transform(test.iloc[:, i]))

# Replace nan in Dataframe as it is confused to NaN, here nan is actually a string value except in LotFrontage and MasVnrArea
train.iloc[:, :] = train.iloc[:, :].replace(np.nan, 'nnn')
test.iloc[:, :] = test.iloc[:, :].replace(np.nan, 'nnn')

# One Hot Encoder for string data
enc = DictVectorizer(sparse=False)

# convert Dataframe with selected columns to dictionary as the OneHotEncoder for string values DictVectorizer needs dictionary data
train_dic = train.iloc[:, cat_ind].to_dict(orient='records')
test_dic = test.iloc[:, cat_ind].to_dict(orient='records')
enc.fit(train_dic)
x_train_categorical = enc.transform(train_dic)
x_test_categorical = enc.transform(test_dic)

# Numerical features
x_train_numeric = train.iloc[:, num_in]
x_train_numeric = normalize(x_train_numeric)
x_train = np.concatenate((x_train_numeric, x_train_categorical), axis=1)
x_test_numeric = test.iloc[:, num_in]
x_test_numeric = normalize(x_test_numeric)
x_test = np.concatenate((x_test_numeric, x_test_categorical), axis=1)

# the last is the target/class variable
y_train = train.iloc[:, 80]

m = x_train.shape[0]
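
# A small self-contained illustration (toy frame, not the housing data) of the pattern
# above: DataFrame rows become dicts via to_dict(orient='records'), and DictVectorizer
# one-hot encodes the string columns while passing numeric columns through.
import pandas as pd
from sklearn.feature_extraction import DictVectorizer

_toy = pd.DataFrame({'LotShape': ['Reg', 'IR1', 'Reg'], 'LotArea': [8450, 9600, 11250]})
_toy_enc = DictVectorizer(sparse=False)
_toy_X = _toy_enc.fit_transform(_toy.to_dict(orient='records'))
# Columns (sorted): ['LotArea', 'LotShape=IR1', 'LotShape=Reg'].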
Exemple #52
0
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {
            "f{:03d}".format(feat_num): np.random.randint(0, 4)
            for feat_num in range(num_feat_files * num_feats_per_file)
        }
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {
        label: num
        for num, label in enumerate(
            sorted({
                label
                for label in labels if not isinstance(label, (int, float))
            }))
    }
    # Add fake item to vectorizer for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(
            convert_dir, '{}_{}{}'.format(feature_name_prefix, i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {
                "f{:03d}".format(feat_num + j):
                features[example_num]["f{:03d}".format(feat_num + j)]
                for j in range(num_feats_per_file)
            }
            sub_features.append(x)
        train_fs = FeatureSet('sub_train',
                              ids,
                              labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs, label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train',
                          ids,
                          labels=labels,
                          features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs, label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
Exemple #53
0
def main_gp():
    import gp, GPyOpt
    from sklearn.feature_extraction import DictVectorizer

    parser = argparse.ArgumentParser()
    parser.add_argument('-a',
                        '--agent',
                        type=str,
                        default='ppo_agent',
                        help="Agent to use (ppo_agent|dqn_agent|etc)")
    parser.add_argument(
        '-g',
        '--gpu_split',
        type=float,
        default=1,
        help="Num ways we'll split the GPU (how many tabs you running?)")
    parser.add_argument('-n',
                        '--net_type',
                        type=str,
                        default='lstm',
                        help="(lstm|conv2d) Which network arch to use")
    parser.add_argument(
        '--guess',
        action="store_true",
        default=False,
        help="Run the hard-coded 'guess' values first before exploring")
    parser.add_argument(
        '--gpyopt',
        action="store_true",
        default=False,
        help=
        "Use GPyOpt library, or use basic sklearn GP implementation? GpyOpt shows more promise, but has bugs."
    )
    args = parser.parse_args()

    # Encode features
    hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
    hypers_, hardcoded = hsearch.hypers, hsearch.hardcoded
    hypers_ = {k: v for k, v in hypers_.items() if k not in hardcoded}
    hsearch.close()

    # Build a matrix of features,  length = max feature size
    max_num_vals = 0
    for v in hypers_.values():
        l = len(v['vals'])
        if l > max_num_vals: max_num_vals = l
    empty_obj = {k: None for k in hypers_}
    mat = pd.DataFrame([empty_obj.copy() for _ in range(max_num_vals)])
    for k, hyper in hypers_.items():
        for i, v in enumerate(hyper['vals']):
            mat.loc[i, k] = v
    mat.ffill(inplace=True)

    # Above is Pandas-friendly stuff, now convert to sklearn-friendly & pipe through OneHotEncoder
    vectorizer = DictVectorizer()
    vectorizer.fit(mat.T.to_dict().values())
    feat_names = vectorizer.get_feature_names()

    # Map TensorForce actions to GPyOpt-compatible `domain`
    # instantiate just to get actions (get them from hypers above?)
    bounds = []
    for k in feat_names:
        hyper = hypers_.get(k, False)
        # One-hot "key=value" columns have no hyper entry; treat them as binary
        # indicators instead of reusing stale values from a previous loop iteration.
        bounded = bool(hyper) and hyper['type'] == 'bounded'
        min_, max_ = (min(hyper['vals']), max(hyper['vals'])) if bounded else (0, 1)
        if args.gpyopt:
            b = {'name': k, 'type': 'discrete', 'domain': (0, 1)}
            if bounded: b.update(type='continuous', domain=(min_, max_))
        else:
            b = [min_, max_] if bounded else [0, 1]
        bounds.append(b)

    def hypers2vec(obj):
        h = dict()
        for k, v in obj.items():
            if k in hardcoded: continue
            if type(v) == bool:
                h[k] = float(v)
            else:
                h[k] = v or 0.
        return vectorizer.transform(h).toarray()[0]

    def vec2hypers(vec):
        # Reverse the encoding
        # https://stackoverflow.com/questions/22548731/how-to-reverse-sklearn-onehotencoder-transform-to-recover-original-data
        # https://github.com/scikit-learn/scikit-learn/issues/4414
        if not args.gpyopt:
            vec = [vec]  # gp.py passes as flat, GPyOpt as wrapped
        reversed = vectorizer.inverse_transform(vec)[0]
        obj = {}
        for k, v in reversed.items():
            if '=' not in k:
                obj[k] = v
                continue
            if k in obj: continue  # we already handled this x=y logic (below)
            # Find the winner (max) option for this key
            score, attr, val = v, k.split('=')[0], k.split('=')[1]
            for k2, score2 in reversed.items():
                if k2.startswith(attr + '=') and score2 > score:
                    score, val = score2, k2.split('=')[1]
            obj[attr] = val

        # Bools come in as floats. Also, if the result is False they don't come in at all! So we iterate over the
        # hypers here instead of nesting this logic in the reversed-iteration above.
        for k, v in hypers_.items():
            if v['type'] == 'bool':
                obj[k] = bool(round(obj.get(k, 0.)))
        return obj

    # Specify the "loss" function (which we'll maximize) as a single rl_hsearch instantiate-and-run
    def loss_fn(params):
        hsearch = HSearchEnv(gpu_split=args.gpu_split, net_type=args.net_type)
        reward = hsearch.execute(vec2hypers(params))
        hsearch.close()
        return [reward]

    while True:
        conn = data.engine.connect()
        sql = "SELECT hypers, reward_avg FROM runs WHERE flag=:f"
        runs = conn.execute(text(sql), f=args.net_type).fetchall()
        conn.close()
        X, Y = [], []
        for run in runs:
            X.append(hypers2vec(run.hypers))
            Y.append([run.reward_avg])
        print_feature_importances(X, Y, feat_names)

        if args.guess:
            guesses = {k: v['guess'] for k, v in hypers_.items()}
            X.append(hypers2vec(guesses))
            Y.append([None])
            args.guess = False

        if args.gpyopt:
            pretrain = {'X': np.array(X), 'Y': np.array(Y)} if X else {}
            opt = GPyOpt.methods.BayesianOptimization(f=loss_fn,
                                                      domain=bounds,
                                                      maximize=True,
                                                      **pretrain)
            # using max_iter=1 because of database setup. Normally you'd go until convergence, but since we're using
            # a database for the runs we can parallelize runs across machines (connected to the same database). Then
            # between each run we can grab the result from the other machines and merge with our own; so only run
            # once, reset the model-fitting w/ the full database (which may have grown), and repeat
            opt.run_optimization(max_iter=1)
        else:
            gp.bayesian_optimisation2(n_iters=1,
                                      loss_fn=loss_fn,
                                      bounds=np.array(bounds),
                                      x_list=X,
                                      y_list=Y)
Exemple #54
0
class GlobalFeatures(object):
    def __init__(self,
                 word2vec_model=None,
                 cluster_vocabs=None,
                 dict_features=None,
                 cat_names=None,
                 WORD_IDX=0):
        self.word2vec_model = word2vec_model
        self.cluster_vocabs = cluster_vocabs
        self.dict_features = dict_features
        self.WORD_IDX = WORD_IDX
        self.cat_names = cat_names

    def get_global_sequence_features(self, sent, predictions=None):
        features = dict()
        sent_length = len(sent) * 1.
        for word in sent:
            word = word[self.WORD_IDX]
            lookup_key = preprocess_token(word, to_lower=True)
            if self.word2vec_model and lookup_key in self.word2vec_model:
                for i, v in enumerate(self.word2vec_model[lookup_key]):
                    features["_GLOBAL_WORDVEC_%s" % i] = dict.get(
                        features, "_GLOBAL_WORDVEC_%s" % i, 0) + v
            if self.cluster_vocabs and lookup_key in self.cluster_vocabs:
                v = dict.get(self.cluster_vocabs, lookup_key)
                features["_GLOBAL_CLUSTER_=%s" % v] = dict.get(
                    features, "_GLOBAL_CLUSTER_=%s" % v, 0) + 1
        features = {k: v / sent_length for k, v in six.iteritems(features)}
        if predictions:
            for k, prob in six.iteritems(predictions):
                features["_MODEL_=%s" % k] = prob
        return [features for word in sent]

    def tweet_features(self, sent):
        features = {}
        sent_length = len(sent) * 1.
        for widx, word in enumerate(sent):
            word = word[self.WORD_IDX]
            lookup_key = preprocess_token(word, to_lower=True)
            if self.word2vec_model and lookup_key in self.word2vec_model:
                for i, v in enumerate(self.word2vec_model[lookup_key]):
                    features["_GLOBAL_WORDVEC_%s" % i] = dict.get(
                        features, "_GLOBAL_WORDVEC_%s" % i, 0) + v
            if self.cluster_vocabs and lookup_key in self.cluster_vocabs:
                v = dict.get(self.cluster_vocabs, lookup_key)
                features["_GLOBAL_CLUSTER_=%s" % v] = dict.get(
                    features, "_GLOBAL_CLUSTER_=%s" % v, 0) + 1
            if self.dict_features:
                d_features = self.dict_features.GetDictFeatures(
                    [k[WORD_IDX] for k in sent], widx)
                for k in d_features:
                    features[k] = dict.get(features, k, 0) + 1
                d_hashtag_features = self.dict_features.GetHashtagDictFeatures(
                    word)
                for k in d_hashtag_features:
                    features[k] = dict.get(features, k, 0) + 1
        #features = {k: v / sent_length for k,v in six.iteritems(features)}
        return features

    def get_sequence_features(self, sequences):
        features = [self.tweet_features(sent) for sent in sequences]
        return features

    def is_tweet_type(self, sent, cat_type):
        for t in sent:
            if t.tag != "O":
                if t.tag[2:] == cat_type:
                    return 1
        return 0

    def fit_feature_dict(self, sequences):
        train_data = self.get_sequence_features(sequences)
        self.feature2matrix = DictVectorizer()
        self.feature2matrix.fit(train_data)

    def tranform_sequence2feature(self, sequences):
        train_data = self.get_sequence_features(sequences)
        return self.feature2matrix.transform(train_data)

    def fit_model(self, train_sequences, test_sequences=None):
        if test_sequences is None:
            test_sequences = train_sequences
        self.fit_feature_dict(train_sequences)
        tweet_X_train = self.tranform_sequence2feature(train_sequences)
        tweet_X_test = self.tranform_sequence2feature(test_sequences)
        self.models = dict()
        for cat_type in self.cat_names:
            print("Processing: %s" % cat_type)
            y_train = np.array([
                self.is_tweet_type(sent, cat_type) for sent in train_sequences
            ])
            y_test = np.array([
                self.is_tweet_type(sent, cat_type) for sent in test_sequences
            ])
            model = LogisticRegression(solver="lbfgs",
                                       multi_class="multinomial")
            model.fit(tweet_X_train, y_train)
            y_pred = model.predict(tweet_X_test)
            print(classification_report(y_test, y_pred))
            self.models[cat_type] = model

    def get_global_predictions(self, sequences):
        predictions = {}
        X_train = self.tranform_sequence2feature(sequences)
        for k, model in six.iteritems(self.models):
            y_pred = model.predict_proba(X_train)[:, 1]
            predictions[k] = y_pred
        keys = predictions.keys()
        predictions = [dict(zip(keys, v)) for v in zip(*predictions.values())]
        return predictions
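# The last two lines above reshape a dict of per-category probability arrays
# into one dict per tweet, which is what the "_MODEL_=<category>" features at
# the top of this snippet expect. A small standalone sketch of that reshaping
# (category names and probabilities are made up for illustration):
example_predictions = {"person": [0.9, 0.1, 0.4],
                       "location": [0.2, 0.8, 0.3]}
example_keys = example_predictions.keys()
per_tweet = [dict(zip(example_keys, v))
             for v in zip(*example_predictions.values())]
print(per_tweet)
# -> [{'person': 0.9, 'location': 0.2},
#     {'person': 0.1, 'location': 0.8},
#     {'person': 0.4, 'location': 0.3}]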
X, y = list(zip(*data))

# split and randomize
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=.25,
                                                    shuffle=True)
print(X_train)
print(y_train)

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer

dict_vectorizer = DictVectorizer()
name_classifier = DecisionTreeClassifier()

# Scikit-learn models work with arrays, not dicts.
# We fit the vectorizer on the training dicts first
# so that it learns the feature space (the dict keys).
dict_vectorizer.fit(X_train)

# Vectorize the training data
X_train_vectorized = dict_vectorizer.transform(X_train)

# Train the classifier on vectorized data
name_classifier.fit(X_train_vectorized, y_train)

# Test the model
X_test_vectorized = dict_vectorizer.transform(X_test)
# Compute Accuracy (0.75)
print(name_classifier.score(X_test_vectorized, y_test))
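# To classify a single new example, the same fitted vectorizer must be applied
# before calling predict. A short sketch reusing the objects trained above:
sample = X_test[:1]                                   # one held-out feature dict
sample_vectorized = dict_vectorizer.transform(sample)
print(name_classifier.predict(sample_vectorized))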
class ModelTrainer(BaseProcessor):
    """
    ModelTrainer

    :param
        _dic_vec: DictVectorizer model for entity features
        _tfidf_vec: TfidfVectorizer model for intent features
        _intent_clf: RandomForestClassifier for intent classification
        _entity_clf: BernoulliNB for entity classification
        _ent_feature: Entity features
        _ent_label: Entity labels
        _int_feature: Intent features
        _int_label: Intent labels
        path: Path to save models

    :Function:

    _check_file_format:
        :description:
        Checks for the YML/YAML file format.

        :param file: Filename
        :return status: Whether the file format is valid

    load:
        :description:
        Loads the YAML training file and generates the training data.
        It uses _intent_entity_extractor, _entity_label_extract and
        _get_features from BaseProcessor to create features and labels.

        :param file: Training file

    _intent_entity_extractor:
        :description:
        Uses TF-IDF to generate intent features and labels, and
        DictVectorizer together with _get_features to generate entity
        features and labels. _entity_label_extract handles entity label
        extraction.

        :param data: Training file data

    _entity_label_extract:
        :description:
        Extracts entity labels following the training data schema.
        The default entity label is 'O'.

        :param question_dict: Training data dictionary
               token_pos: Token position defined in the training data,
                          i.e. the entity value position

    _persist_helper:
        :description:
        Saves a model object as a pickle file.

        :param filename: File name to save
               object_: Model to save

    _persist_models:
        :description:
        Saves all four vectorizers and classifiers.

    train:
        :description:
        Trains the intent and entity models.
    """
    def __init__(self, path):
        super().__init__()
        self._dic_vec = DictVectorizer()
        self._tfidf_vec = TfidfVectorizer()
        self._intent_clf = RandomForestClassifier(n_estimators=200)
        self._entity_clf = BernoulliNB(alpha=0.1, binarize=0.1)
        self._ent_feature = []
        self._ent_label = []
        self._int_feature = []
        self._int_label = []
        self.path = path

    def _check_file_format(self, file):
        if file.rsplit('.', 1)[-1] in FILE_FORMAT:
            return True
        return False

    def load(self, file):
        file_format = self._check_file_format(file)
        if not file_format:
            raise FileFormatError("Only YML/YAML files are allowed")

        with open(file, 'r') as f:
            data = yaml.safe_load(f)

        ent_train_list, int_train_dict = self._intent_entity_extractor(data)

        int_feature_arr = np.array(list(int_train_dict.keys()))
        int_labels_arr = np.array(list(int_train_dict.values()))
        self._tfidf_vec.fit(int_feature_arr)
        self._int_feature = self._tfidf_vec.transform(
            int_feature_arr).toarray()
        self._int_label = int_labels_arr

        self._dic_vec.fit(ent_train_list)
        self._ent_feature = self._dic_vec.transform(ent_train_list).toarray()

    def _intent_entity_extractor(self, data):
        ent_train_list = []
        int_train_dict = {}

        for intent, question_list in tqdm(data.items()):
            for question_dict in question_list:

                token = question_dict['text'].split(' ')
                int_train_dict[question_dict['text']] = intent

                for i, word in enumerate(token):
                    self._entity_label_extract(question_dict, i)
                    ent_train_list.append(self._get_features(i, word, token))

        return ent_train_list, int_train_dict

    def _entity_label_extract(self, question_dict, token_pos):
        try:
            for ent in question_dict['entity']:
                # The second key/value pair of each entity dict holds the label.
                k, v = list(ent.items())[1]
                if ent['pos'] == token_pos:
                    self._ent_label.append(k)
                    break
            else:
                self._ent_label.append('O')
        except (KeyError, IndexError):
            # No entity annotation for this token; fall back to the default label.
            self._ent_label.append('O')

    def _persist_helper(self, filename, object_):
        with open(os.path.join(self.path, filename), 'wb+') as f:
            pickle.dump(object_, f)

    def _persist_models(self):
        self._persist_helper(self.dic_vec_name, self._dic_vec)
        self._persist_helper(self.tfidf_name, self._tfidf_vec)
        self._persist_helper(self.entity_name, self._entity_clf)
        self._persist_helper(self.intent_name, self._intent_clf)

    def train(self):
        self._entity_clf.fit(self._ent_feature, self._ent_label)
        self._intent_clf.fit(self._int_feature, self._int_label)
        self._persist_models()
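# A minimal usage sketch for ModelTrainer (the path and file name below are
# placeholders; BaseProcessor is assumed to provide _get_features and the
# *_name attributes used by _persist_models):
trainer = ModelTrainer(path="models/")
trainer.load("training_data.yml")   # builds intent (TF-IDF) and entity (DictVectorizer) features
trainer.train()                     # fits both classifiers and pickles them under models/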
Exemple #57
0
import joblib  # sklearn.externals.joblib was removed in newer scikit-learn
from sklearn.feature_extraction import DictVectorizer

def convierte_a_listas(oracion):
    # Split the sentence into tokens and pad with two '-' markers on each side,
    # so every real token has two left and two right context positions.
    words = ['-', '-'] + oracion.split(" ") + ['-', '-']
    return words

def prepara_frase(words):
    # Build one feature dict per real token: the token itself ('2'), the two
    # tokens to its left ('0', '1') and the two to its right ('3', '4').
    features = []
    for i in range(2, len(words) - 2):
        feature = {
            '0': str(words[i - 2]).lower(),
            '1': str(words[i - 1]).lower(),
            '2': str(words[i]).lower(),
            '3': str(words[i + 1]).lower(),
            '4': str(words[i + 2]).lower(),
        }
        features.append(feature)
    return features
lista=convierte_a_listas("Que tal esta es una lista para el mercado algunas de las cosas que quiero comprar es jamon queso pechuga de pavo y cereal")
print(lista)
features=prepara_frase(lista)
print(features)
v = DictVectorizer(sparse=False)
#X = v.fit_transform(features)
v.fit(features)
joblib.dump(v, 'vectorizer.pkl')
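# At prediction time the persisted vectorizer can be loaded back and applied to
# a freshly prepared sentence; a small sketch (the sentence is made up, and any
# words not seen during fit simply produce all-zero columns):
v2 = joblib.load('vectorizer.pkl')
nueva_frase = prepara_frase(convierte_a_listas("quiero comprar pan y leche"))
X_nueva = v2.transform(nueva_frase)
print(X_nueva.shape)  # (number of tokens, number of features seen during fit)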

    random.shuffle(dataset)


    x_train=[row[0] for row in dataset[:7000]]
    y_train=[row[1] for row in dataset[:7000]]
    
    x_test=[row[0] for row in dataset[7000:]]
    y_test=[row[1] for row in dataset[7000:]]

    # Use DictVectorizer to turn each feature dictionary into a numerical (one-hot / count) vector.

    from sklearn.feature_extraction import DictVectorizer
    d = DictVectorizer()
    d.fit([row[0] for row in dataset])
    X_train_count =  d.transform(x_train)

    X_test_count =  d.transform(x_test)



    models=get_models()

    for name, model in models.items():
        scores = evaluate_model(model)
        # results.append(scores)
        print(name, scores)

def features(name):
    # Reconstructed opening of the truncated feature-extraction function; any
    # earlier feature keys from the original snippet are not shown here.
    return {
        'firstThree-letters': name[:8],
        'last-letter': name[-5:],
        'lastTwo-letters': name[-4:],
        'lastThree-letters': name[-3:],
    }


features = np.vectorize(features)
Name = features(names[:, 0])
Gender = names[:, 2]

Name_Train, Name_Test, Gender_Train, Gender_Test = train_test_split(
    Name, Gender, test_size=0.3)

vectorizer = DictVectorizer()
vectorizer.fit(Name_Train)

clf = DecisionTreeClassifier()
clf.fit(vectorizer.transform(Name_Train), Gender_Train)

Gender_pred = clf.predict(vectorizer.transform(Name_Test))

print(
    clf.predict(
        vectorizer.transform(
            features([
                "Nguyễn Ánh Dương", "Vũ Tiến Đạt", "Ngô Văn Vĩ",
                "Phạm Ngọc Hà", "Hoàng Mai Hương"
            ]))))
from sklearn.metrics import accuracy_score
print('Accuracy = ', accuracy_score(Gender_Test, Gender_pred))
def format_data(df0, df_ts):
    # df = shuffle(df0, random_state=0)
    df = df0
    train_size = df.shape[0]
    print(df.head())
    y = df['Criminal']

    df = df.drop('Criminal', axis=1)
    assert isinstance(df, DataFrame)

    df_combined = df.append(df_ts)
    df_combined.fillna('NA', inplace=True)

    if isinstance(df_combined, dict):
        df_to_dict = df_combined
    else:
        df_to_dict = df_combined.to_dict(orient="records")

    vec = DictVectorizer(sparse=False)
    vec.fit(df_to_dict)

    X = vec.transform(df_to_dict)
    print('inside make model after one hot encoding= ', X.shape)
    columns_names = vec.feature_names_
    input_dataframe = DataFrame(data=X, columns=columns_names)

    # This part is removing un important columns
    rf_clf = RandomForestClassifier(n_estimators=100, max_depth=10)
    rf_clf.fit(X[0:train_size], y)
    imp = rf_clf.feature_importances_
    threshold_for_features = 0.001
    for index, value in enumerate(imp):
        if value <= threshold_for_features:
            key = columns_names[index]
            input_dataframe = input_dataframe.drop(key, axis=1)

    temp3 = list(input_dataframe)
    for feat in temp3:
        if feat.endswith("=NA") or feat.endswith("=nan") or feat.endswith(
                "=99"):
            # print("dropping feature with no value = ", feat)
            input_dataframe = input_dataframe.drop(feat, axis=1)

    # This part was about removing un important columns

    df_to_dict = input_dataframe.to_dict(orient="records")
    vec = DictVectorizer(sparse=False)
    vec.fit(df_to_dict)

    print(" modified data frame ", input_dataframe.shape)

    input_train_df = input_dataframe[0:train_size]
    input_test_df = input_dataframe[train_size:]

    with open('train_encoded_2.csv', 'w') as outfile:
        input_train_df['Criminal'] = y
        print("input df shape to csv ", input_train_df.shape)
        input_train_df.to_csv(outfile, index=False)

    with open('test_encoded_2.csv', 'w') as outfile:
        print("input df shape to csv ", input_test_df.shape)
        input_test_df.to_csv(outfile, index=False)
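# A tiny, hypothetical invocation just to show the expected call shape (column
# names are made up; the function assumes the script's module-level imports --
# DataFrame, DictVectorizer, RandomForestClassifier -- and a pandas version
# where DataFrame.append is still available). Real data would come from the
# CSV files this script was written for.
import pandas as pd

train_df = pd.DataFrame({
    "Criminal": [0, 1, 0, 1],
    "region":   ["N", "S", "N", "S"],
    "income":   ["low", "high", "low", "high"],
})
test_df = pd.DataFrame({
    "region": ["S", "N"],
    "income": ["high", "low"],
})

# Writes train_encoded_2.csv and test_encoded_2.csv next to the script.
format_data(train_df, test_df)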