Example #1
import sys

import numpy as np
from sklearn.feature_extraction import DictVectorizer


def get_data(f='net/nettalk.data.txt', size=2):
    f = open(f)

    words = [process_line(line)[2:] for line in f]

    X, y = [], []
    vect_syl = DictVectorizer()
    vect_stress = DictVectorizer()
    vect_syl.feature_names_ = set()
    vect_stress.feature_names_ = set()
    for word, stress in words:
        if word is None:
            continue
        if len(word.strip().replace('-', '')) != len(stress):
            print >> sys.stderr, "Skipped %s" % word
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(
            word.strip(), stress, size=size)
        for x in x_dict_syl:
            for f, v in x.items():
                if isinstance(v, str):
                    f = "%s%s%s" % (f, vect_syl.separator, v)
                vect_syl.feature_names_.add(f)
        for x in x_dict_stress:
            for f, v in x.items():
                if isinstance(v, str):
                    f = "%s%s%s" % (f, vect_stress.separator, v)
                vect_stress.feature_names_.add(f)
    vect_syl.feature_names_ = sorted(vect_syl.feature_names_)
    vect_syl.vocabulary_ = dict((f, i) for i, f in
                                enumerate(vect_syl.feature_names_))
    vect_stress.feature_names_ = sorted(vect_stress.feature_names_)
    vect_stress.vocabulary_ = dict((f, i) for i, f in
                                   enumerate(vect_stress.feature_names_))

    for word, stress in words:
        if word is None:
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(
            word, stress, size=size)
        if not len(x_dict_syl):
            print >> sys.stderr, "Empty features for {}".format(word)
            continue
        X.append((vect_syl.transform(x_dict_syl),
                  vect_stress.transform(x_dict_stress)))
        where_stress = y_stress.argmax()
        if y_stress[where_stress] == 1:
            y_stress[where_stress + 1:] = 2
        y_syl += 1
        y.append(np.r_[y_syl, y_stress])

    return X, y
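The two-pass pattern above rebuilds by hand what DictVectorizer.fit computes: string values are joined to their key with the vectorizer's separator, numeric values contribute the key alone. A minimal sketch of that equivalence on made-up feature dicts (not the nettalk data):

from sklearn.feature_extraction import DictVectorizer

dicts = [{"ch": "a", "len": 3}, {"ch": "b", "len": 5}]

# manual "fit", mirroring the loops in get_data above
manual = DictVectorizer()
names = set()
for d in dicts:
    for k, v in d.items():
        names.add("%s%s%s" % (k, manual.separator, v) if isinstance(v, str) else k)
manual.feature_names_ = sorted(names)
manual.vocabulary_ = {f: i for i, f in enumerate(manual.feature_names_)}

# the same vectorizer produced by an actual fit
fitted = DictVectorizer().fit(dicts)
assert fitted.feature_names_ == manual.feature_names_
assert (fitted.transform(dicts).toarray() == manual.transform(dicts).toarray()).all()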
Example #2
    @staticmethod
    def load(conn, table, feature_column='feature', weight_column='weight', bias_feature=None):
        df = conn.fetch_table(table)

        intercept = np.array([0.])  # (1,)
        coef = np.array([[]])  # (1, n_feature)

        vocabulary = {}
        feature_names = []

        j = 0
        for i, row in df.iterrows():
            feature, weight = row[feature_column], row[weight_column]

            if feature == bias_feature:
                intercept[0] = float(weight)
                continue

            coef = np.append(coef, [[weight]], axis=1)

            vocabulary[feature] = j
            j += 1
            feature_names.append(feature)

        vectorizer = DictVectorizer(separator='#')
        vectorizer.vocabulary_ = vocabulary
        vectorizer.feature_names_ = feature_names

        return coef, intercept, vectorizer
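The returned triple can be attached to a scikit-learn linear model without calling fit(), in the same spirit as the hand-built vectorizer. A hedged sketch (the table name and feature dict are made up; setting the trailing-underscore attributes is what marks the estimator as fitted in recent scikit-learn versions):

import numpy as np
from sklearn.linear_model import LogisticRegression

# coef, intercept, vectorizer = load(conn, 'model_weights')  # hypothetical table
clf = LogisticRegression()
clf.coef_ = coef
clf.intercept_ = intercept
clf.classes_ = np.array([0, 1])

# proba = clf.predict_proba(vectorizer.transform([{'feature_a': 1.0}]))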
Example #3
def feature_dict_to_dict_vectorizer(feature_dict):
    # Convert to DictVectorizer
    from sklearn.feature_extraction import DictVectorizer

    DV = DictVectorizer(sparse=True)
    T2I = {}


    feature_names = []
    vocab = {}

    for k, v in feature_dict.items():
        if k not in vocab:
            if '=' not in k:
                T2I[k] = v
            else:
                vocab[k] = v
                feature_names.append(k)

    if DV.sort:
        feature_names.sort()

    DV.feature_names_ = feature_names
    DV.vocabulary_ = vocab
    return T2I, DV
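A short demo of the split this function performs (the feature dict is made up): keys containing '=' become vectorizer features, everything else lands in the returned T2I map.

T2I, DV = feature_dict_to_dict_vectorizer({'pos=NN': 0, 'word=dog': 1, 'bias': 2})
print(sorted(T2I))          # ['bias']
print(DV.feature_names_)    # ['pos=NN', 'word=dog']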
Example #4
def create_dict_vectorizer(vocab):
    """

    """
    ngram_to_idx = dict((n, i) for i, n in enumerate(sorted(vocab)))
    _count2vec = DictVectorizer(separator=":")
    _count2vec.vocabulary_ = ngram_to_idx.copy()
    rev_dict = dict((y, x) for x, y in ngram_to_idx.items())
    _count2vec.feature_names_ = [rev_dict[i] for i in range(len(rev_dict))]
    return _count2vec
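A quick usage check (vocabulary and counts made up): only the preset vocabulary is indexed, so unseen keys are silently dropped at transform time.

vect = create_dict_vectorizer({"the", "cat", "sat"})
X = vect.transform([{"cat": 2, "dog": 1}])   # "dog" is not in the vocabulary

print(vect.feature_names_)   # ['cat', 'sat', 'the']
print(X.toarray())           # [[2. 0. 0.]]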
Example #5
def deserialize_dict_vectorizer(model_dict):
    model = DictVectorizer()

    model.dtype = np.dtype(model_dict['dtype']).type
    model.separator = model_dict['separator']
    model.sparse = model_dict['sparse']
    model.sort = model_dict['sort']
    model.feature_names_ = model_dict['feature_names']
    model.vocabulary_ = model_dict['vocabulary']

    return model
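The matching serializer is not shown on this page; a sketch of what it would have to emit, derived from the fields the loader reads (the function name is an assumption):

import numpy as np

def serialize_dict_vectorizer(model):
    # emits exactly the keys deserialize_dict_vectorizer consumes
    return {
        'dtype': np.dtype(model.dtype).name,  # e.g. 'float64'; round-trips via np.dtype(...).type
        'separator': model.separator,
        'sparse': model.sparse,
        'sort': model.sort,
        'feature_names': model.feature_names_,
        'vocabulary': model.vocabulary_,
    }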
Example #6
def initialize_vectorizer(vocabulary):
    """
    Initialize a vectorizer that transforms a counter dictionary
    into a sparse vector of counts (with a uniform feature index)
    """
    ## Map each term to an index (assumes the vocabulary is already sorted)
    ngram_to_idx = dict((t, i) for i, t in enumerate(vocabulary))
    ## Create Dict Vectorizer
    _count2vec = DictVectorizer(separator=":", dtype=int)
    _count2vec.vocabulary_ = ngram_to_idx.copy()
    rev_dict = dict((y, x) for x, y in ngram_to_idx.items())
    _count2vec.feature_names_ = [rev_dict[i] for i in range(len(rev_dict))]
    return _count2vec
Example #7
 def __call__(self):
     aux_data = pickle.load(open(self.feature_map_file, "rb"))
     model: SGDClassifier = pickle.load(open(self.model_file_name, "rb"))
     frequent_words = aux_data[TrainModel.FREQUENT_WORDS]
     vectorizer = DictVectorizer()
     vectorizer.vocabulary_ = aux_data[TrainModel.FEATURE_IDXS]
     vectorizer.feature_names_ = aux_data[TrainModel.FEATURE_NAMES]
     tagged_sentences = []
     with open(self.input_file_name, 'r') as in_f:
         lines = [line.rstrip() for line in in_f.readlines()]
     already_tagged = all(
         map(lambda l: all(map(lambda w: '/' in w, l.split(' '))), lines))
     print('input already tagged:', already_tagged)
     sentences = [
         ExtractFeatures.split_by_whitespace_and_seperate_tags(l)
         for l in lines
     ]
     sentences = list(map(lambda s: list(map(lambda t: t[0], s)),
                          sentences))
     sentences_with_idxs = [(s, i) for (i, s) in enumerate(sentences)]
     sentences = sorted(sentences_with_idxs, key=lambda t: len(t[0]))
     idxs_processed = []
     for l, g in itertools.groupby(sentences, key=lambda t: len(t[0])):
         g = list(g)
         sents_of_len_l = np.asarray(list(map(operator.itemgetter(0), g)))
         idxs_of_len_l = list(map(operator.itemgetter(1), g))
         idxs_processed.extend(idxs_of_len_l)
         tags_of_len_l = np.empty(sents_of_len_l.shape, dtype="U8")
         for i in range(l):
             feats_for_ith_word = []
             for sent_i, word in enumerate(sents_of_len_l[:, i]):
                 feats = ExtractFeatures.extract(
                     sents_of_len_l[sent_i, :], tags_of_len_l[sent_i, :], i,
                     (word not in frequent_words))
                 feats_for_ith_word.append(feats)
             X = vectorizer.transform(feats_for_ith_word)
             tags_pred = model.predict(X)
             tags_of_len_l[:, i] = tags_pred
         tagged_sents_of_len_l = np.char.add(
             np.char.add(sents_of_len_l, '/'), tags_of_len_l)
         tagged_sentences.extend(
             [' '.join(row) for row in tagged_sents_of_len_l])
     tagged_sentences = map(
         operator.itemgetter(0),
         sorted(zip(tagged_sentences, idxs_processed),
                key=operator.itemgetter(1)))
     tagged_sentences = [
         s.replace('$EQ$', '=') for s in tagged_sentences
     ]
     with open(self.output_file, 'w+') as out_f:
         out_f.write('\n'.join(tagged_sentences) + '\n')
Example #8
    def prepareVectors(self, featureslist, classlist=None, vectortransformation=None, featureselection=None, features_pct=50, returnasmatrix=False, filter_features=None, returnindexed=False):
        """
        Takes a list of dictionaries (representing units), in which keys are words
        and values their occurrence counts within the unit. Vectors can be
        transformed to tfidf or binomial form. Feature selection keeps a percentage
        (features_pct) of the feature list, ranked by a relevance score (e.g.,
        chi2); for such methods a list of class labels matching the units in
        featureslist must be provided. If filter_features is a list of feature
        names, only these features are used.

        Returns a tuple of the features (as a dictionary or sparse matrix) and the
        list of selected feature names. (These names can be passed as
        'filter_features' to align new vectors with those on which a classifier
        was trained; see the sketch after this method.)
        """
        dv = DictVectorizer()
        fmatrix = dv.fit_transform(featureslist)
        fnames = dv.feature_names_
        if vectortransformation:
            print('- Transforming vectors')
            fmatrix = self.transformVectors(fmatrix, vectortransformation)
        if featureselection and not filter_features:
            print('- Selecting features')
            fmatrix, fnames = self.selectFeatures(fmatrix, fnames, featureselection, classlist, features_pct)
            dv.feature_names_ = fnames # store new index of featurenames for dv.inverse_transform
        if filter_features:
            print('- Filtering features')
            fmatrix, fnames = self.filterFeatures(fmatrix, fnames, filter_features)
            dv.feature_names_ = fnames
        if not returnasmatrix:
            if returnindexed:
                dv.feature_names_ = list(range(len(fnames)))
            return (dv.inverse_transform(fmatrix), fnames)
        else:
            return (fmatrix, fnames)
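As the docstring notes, the selected feature names can be fed back in as filter_features to align a new batch with the training matrix. A hedged usage sketch (prep, the document lists, and the labels are all made up):

X_train, selected = prep.prepareVectors(
    train_docs, classlist=train_labels,
    featureselection='chi2', features_pct=25, returnasmatrix=True)

# re-use the selected names so test columns line up with the training columns
X_test, _ = prep.prepareVectors(
    test_docs, filter_features=selected, returnasmatrix=True)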
Example #9
    def _create_dict_vectorizer(self, vocab):
        """
        Create a DictVectorizer object given a list of vocabulary

        Args:
            vocab (iterable): Sorted list of feature names for the Vectorizer
        
        Returns:
            _count2vec (DictVectorizer): Count -> csr_matrix Class
        """
        feature_to_idx = dict((n, i) for i, n in enumerate(vocab))
        _count2vec = DictVectorizer(separator=":")
        _count2vec.vocabulary_ = feature_to_idx.copy()
        rev_dict = dict((y, x) for x, y in feature_to_idx.items())
        _count2vec.feature_names_ = [rev_dict[i] for i in range(len(rev_dict))]
        return _count2vec
Example #10
    def learn_feature_matrix(self, triples, person_abs, vectorizer):
        """
        Learns a sparse DictVectorizer object representing a feature matrix
        that can be used in building machine learning models.

        Runs through all the triples, looking up their abstracts in the provided
        abstract cache (person_abs), and creates a feature dictionary for each
        triple. The feature dictionaries from all triples are then used to create
        a DictVectorizer object, which represents the abstract-based feature
        matrix.
        """
        def _get_feature_dict(triple):
            sub, obj = triple
            obj_idx = vectorizer.target_idx_cache.get(obj)
            if obj_idx is None or sub not in person_abs:
                return dict(
                )  # no tokens/abstract available for this subject. return empty dictionary
            abs_tokens = person_abs[sub]
            d = {}
            for token in abs_tokens:
                if token in vectorizer.top_feature_idx and token not in d:
                    d[token] = vectorizer.td_mat[
                        obj_idx, vectorizer.top_feature_idx[token]]
            return d

        print('Creating feature dictionaries..', end=' ')
        sys.stdout.flush()
        # create list of feature dictionaries, one for each triple
        D = []
        t1 = time()
        person_no_features = 0
        for i, triple in enumerate(triples):
            d = _get_feature_dict(triple)
            if len(d) == 0:
                person_no_features += 1
            D.append(d)
        print('#People w/o features: {}. Time: {:.2f}s'.format(
            person_no_features,
            time() - t1))

        # create a sparse DictVectorizer object, representing the feature matrix
        dvec = DictVectorizer(sparse=True)
        dvec.feature_names_ = list(vectorizer.top_feature_idx.keys())
        # note: fit_transform re-fits and overwrites feature_names_ (see below)
        dvec.dvec_mat = dvec.fit_transform(D)
        return dvec
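Because fit_transform re-fits on D, the feature_names_ assignment above is discarded. To pin the feature space instead, set both attributes and call transform, as the other examples on this page do; a minimal sketch:

dvec = DictVectorizer(sparse=True)
dvec.feature_names_ = sorted(vectorizer.top_feature_idx.keys())
dvec.vocabulary_ = {f: i for i, f in enumerate(dvec.feature_names_)}
X = dvec.transform(D)   # columns follow the preset vocabulary; no re-fit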
Example #11
def load_topic_scores(corpus, num_topics):
    topic_vec = DictVectorizer(sparse=True, dtype=float)
    with open("data/{}/output/T{}/init/model.docs".format(corpus,
                                                          num_topics)) as f:
        f.readline()
        score = parse_topic_scores(f.readline())
        topic_vec.vocabulary_ = {
            k: i
            for i, k in enumerate(sorted(score[1].keys()))
        }
        topic_vec.feature_names_ = sorted(score[1].keys())
    with open("data/{}/output/T{}/init/model.docs".format(corpus,
                                                          num_topics)) as f:
        f.readline()
        X_topics = topic_vec.transform(
            (d[1]
             for d in sorted((parse_topic_scores(l) for l in f.readlines()),
                             key=lambda x: x[0])))
    return topic_vec, X_topics
Example #12
def _initialize_dict_vectorizer(vocabulary):
    """
    Initialize a vectorizer that transforms a counter dictionary
    into a sparse vector of counts (with a uniform feature index)

    Args:
        vocabulary (iterable): Input vocabulary
    
    Returns:
        _count2vec (DictVectorizer): Transformer
    """
    ## Sort
    vocabulary = sorted(vocabulary)
    ## Initialize Vectorizer
    _count2vec = DictVectorizer(separator=":", dtype=int, sort=False)
    ## Update Attributes
    _count2vec.vocabulary_ = dict((x, i) for i, x in enumerate(vocabulary))
    _count2vec.feature_names_ = vocabulary
    return _count2vec
Example #13
    def load_from_json(self, fname):
        # load the model
        import json_tricks
        import_data = json_tricks.load(open(fname))
        import_clf = ModifiedNB()
        import_clf.class_count_ = import_data['class_count_']
        import_clf.class_log_prior_ = import_data['class_log_prior_']
        import_clf.classes_ = import_data['classes_']
        import_clf.feature_count_ = import_data['feature_count_']
        import_clf.feature_log_prob_ = import_data['feature_log_prob_']
        self.clf = import_clf

        # load the fps dict vectoriser
        v_fps = DictVectorizer()
        dv = import_data['fps_vectoriser']
        v_fps.vocabulary_ = {int(k): v for k, v in dv['vocabulary_'].items()}
        v_fps.feature_names_ = dv['feature_names_']
        self.v_fps = v_fps

        # load the continuous variables binariser
        try:
            binariser = import_data['binariser']
            kbd = KBinsDiscretizer(n_bins=10,
                                   encode='onehot',
                                   strategy='quantile')
            kbd.n_bins = binariser['n_bins']
            kbd.n_bins_ = binariser['n_bins_']
            kbd.bin_edges_ = np.asarray(
                [np.asarray(x) for x in binariser['bin_edges_']])
            encoder = OneHotEncoder()
            encoder.categories = binariser['categories']
            encoder._legacy_mode = False
            kbd._encoder = encoder
            self.kbd = kbd
        except Exception:
            # an older export may not contain a binariser; skip it if absent
            pass

        # extra parameters
        self.trained = True
        self.con_desc_list = import_data['con_desc_list']
        self.fp_type = import_data['fp_type']
        self.fp_radius = import_data['fp_radius']
        self.informative_cvb = import_data['informative_cvb']
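The matching export routine is not part of this snippet; a sketch of what it would need to write, reconstructed from the keys load_from_json reads (the method name is an assumption):

    def save_to_json(self, fname):
        import json_tricks
        # write exactly the structure load_from_json expects
        export_data = {
            'class_count_': self.clf.class_count_,
            'class_log_prior_': self.clf.class_log_prior_,
            'classes_': self.clf.classes_,
            'feature_count_': self.clf.feature_count_,
            'feature_log_prob_': self.clf.feature_log_prob_,
            'fps_vectoriser': {
                'vocabulary_': self.v_fps.vocabulary_,
                'feature_names_': self.v_fps.feature_names_,
            },
            'binariser': {
                'n_bins': self.kbd.n_bins,
                'n_bins_': self.kbd.n_bins_,
                'bin_edges_': [x.tolist() for x in self.kbd.bin_edges_],
                'categories': self.kbd._encoder.categories,
            },
            'con_desc_list': self.con_desc_list,
            'fp_type': self.fp_type,
            'fp_radius': self.fp_radius,
            'informative_cvb': self.informative_cvb,
        }
        json_tricks.dump(export_data, open(fname, 'w'))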
Example #14
        x.append(features)
    word_stripped = word.replace("-", "")
    return (
        x,
        [_build_feature_dict(word_stripped, k, size, size) for k in range(len(word_stripped))],
        # (np.array(y) == 0).astype(int),
        np.array(y, dtype=int) + 2,
        np.array(stress, dtype=int),
    )


if __name__ == "__main__":
    X_train, y_train = [], []
    vect_syl = DictVectorizer(sparse=True)
    vect_stress = DictVectorizer(sparse=True)
    vect_syl.feature_names_ = set()
    vect_stress.feature_names_ = set()
    # fit vectorizers
    for _, word, stress in syllabifications("../silabe.train.xml", 10):
        if len(word.strip().replace("-", "")) != len(stress):
            print("Skipped %s" % word, file=sys.stderr)
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(word.strip(), stress, size=4)
        for x in x_dict_syl:
            for f, v in x.items():
                if isinstance(v, str):
                    f = "%s%s%s" % (f, vect_syl.separator, v)
                vect_syl.feature_names_.add(f)
        for x in x_dict_stress:
            for f, v in x.items():
                if isinstance(v, str):
Example #15
                    if len(left_feature) == k + 1:
                        features['%s-%s' % (-i - 1, -i - k - 1)] = left_feature
        x.append(features)
    word_stripped = word.replace('-', '')
    return (x, [_build_feature_dict(word_stripped, k, size, size)
               for k in range(len(word_stripped))],
        #(np.array(y) == 0).astype(int),
        np.array(y, dtype=int) + 2,
        np.array(stress, dtype=int))


if __name__ == '__main__':
    X_train, y_train = [], []
    vect_syl = DictVectorizer(sparse=True)
    vect_stress = DictVectorizer(sparse=True)
    vect_syl.feature_names_ = set()
    vect_stress.feature_names_ = set()
    # fit vectorizers
    for _, word, stress in syllabifications('../silabe.train.xml', 10):
        if len(word.strip().replace('-', '')) != len(stress):
            print("Skipped %s" % word, file=sys.stderr)
            continue
        x_dict_syl, x_dict_stress, y_syl, y_stress = word_to_feature_dict(
                word.strip(), stress, size=4)
        for x in x_dict_syl:
            for f, v in x.items():
                if isinstance(v, str):
                    f = "%s%s%s" % (f, vect_syl.separator, v)
                vect_syl.feature_names_.add(f)
        for x in x_dict_stress:
            for f, v in x.items():
Example #16
def vectorizeExamples(examples,
                      featureGroups=None,
                      sparseLabels=False,
                      idPath=None):
    print "Vectorizing examples"
    mlb = MultiLabelBinarizer(sparse_output=sparseLabels)
    if "predictions" in examples and examples["predictions"] != None:
        print "Vectorizing predictions"
        assert idPath == None
        #examples["labels"] = mlb.fit_transform(examples["labels"])
        #examples["predictions"] = examples["labels"]
        numLabels = len(examples["labels"])
        vector = mlb.fit_transform(examples["labels"] +
                                   examples["predictions"])
        examples["labels"] = vector[:numLabels, :]
        examples["predictions"] = vector[numLabels:, :]
        print "Vectorized predictions", (examples[x].shape[1]
                                         for x in ("labels", "predictions"))
    else:
        if idPath is not None:
            labelIdPath = os.path.join(idPath, "labels.tsv")
            print("Vectorizing labels with existing ids from", labelIdPath)
            labelNames = loadIdNames(labelIdPath)
            mlb = MultiLabelBinarizer(
                [labelNames[x] for x in sorted(labelNames.keys())],
                sparse_output=sparseLabels)
            mlb.fit(set(labelNames.values()))
            examples["labels"] = mlb.transform(examples["labels"])
        else:
            print "Vectorizing labels with new ids"
            examples["labels"] = mlb.fit_transform(examples["labels"])
    examples["label_names"] = mlb.classes_
    if "features" in examples:
        print "Vectorizing features"
        dv = DictVectorizer(sparse=True)
        if idPath is not None:
            featureIdPath = os.path.join(idPath, "features.tsv.gz")
            print("Vectorizing features with existing ids from", featureIdPath)
            featureNames = loadIdNames(featureIdPath)
            #dv.fit([featureNames])
            dv.feature_names_ = [
                featureNames[x] for x in sorted(featureNames.keys())
            ]
            dv.vocabulary_ = dict(
                (f, i) for i, f in enumerate(dv.feature_names_))
            examples["features"] = dv.transform(examples["features"])
        else:
            print "Vectorizing features with new ids"
            examples["features"] = dv.fit_transform(examples["features"])
        examples["feature_names"] = dv.feature_names_
    else:
        examples["feature_names"] = []
    if featureGroups is not None and "select" in featureGroups:
        threshold = .1
        print "Selecting features", examples["features"].shape[1]
        examples["features"] = VarianceThreshold(
            threshold * (1 - threshold)).fit_transform(examples["features"])
        print "Selected features", examples["features"].shape[1]
        #examples["features"] = SelectKBest(chi2, k=1000).fit_transform(examples["features"], examples["labels"])
    print "Vectorized", examples["labels"].shape[0], "examples with", len(
        examples["feature_names"]), "unique features and", len(
            examples["label_names"]), "unique labels", ("(sparse)" if
                                                        sparseLabels else "")
Example #17
    -0.02934711, 0.05490663, 0.02008552, 0.05069223, 0., 0.2016651,
    -0.28770706, -0.88722735, -0.26507582, 0.52628048, -1.28404466,
    -1.96447254, 0.07607324, 0.70359565, 0.35094977, 0.01376572
]])
_clf.classes_ = array([False, True])
_clf.intercept_ = [-2.75918545]

_v = DictVectorizer()
_v.feature_names_ = [
    'first_chars= ', 'first_chars="a', "first_chars=' ", "first_chars='A",
    'first_chars=(0', 'first_chars=(A', 'first_chars=(a', 'first_chars=)]',
    'first_chars=, ', 'first_chars=. ', 'first_chars=0', 'first_chars=0 ',
    'first_chars=0,', 'first_chars=0.', 'first_chars=00', 'first_chars=0:',
    'first_chars=0\\', 'first_chars=@', 'first_chars=A', 'first_chars=A ',
    'first_chars=A,', 'first_chars=A-', 'first_chars=A.', 'first_chars=A0',
    'first_chars=A=', 'first_chars=AA', 'first_chars=Aa', 'first_chars=[0',
    'first_chars=[A', 'first_chars=[a', 'first_chars=\\A', 'first_chars=a ',
    'first_chars=a(', 'first_chars=a-', 'first_chars=a.', 'first_chars=a0',
    'first_chars=aA', 'first_chars=a[', 'first_chars=aa', 'isalpha', 'isdigit',
    'islower', 'mean_len', 'prev_len', 'punct= ', 'punct="', 'punct=%',
    'punct=(', 'punct=)', 'punct=*', 'punct=,', 'punct=-', 'punct=.',
    'punct=:', 'punct=;', 'punct=@', 'punct=]', 'this_len'
]
_v.vocabulary_ = {
    'first_chars= ': 0,
    'first_chars="a': 1,
    "first_chars=' ": 2,
    "first_chars='A": 3,
    'first_chars=(0': 4,
    'first_chars=(A': 5,
    'first_chars=(a': 6,