Example #1
    def __init__(self, lemmatization=False):
        BugModel.__init__(self, lemmatization)

        self.calculate_importance = False

        self.sampler = InstanceHardnessThreshold(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.keywords(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.product(),
            bug_features.component(),
            bug_features.is_mozillian(),
            bug_features.bug_reporter(),
            bug_features.blocked_bugs_number(),
            bug_features.priority(),
            bug_features.has_cve_in_alias(),
            bug_features.comment_count(),
            bug_features.comment_length(),
            bug_features.reporter_experience(),
            bug_features.number_of_bug_dependencies(),
        ]

        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        self.extraction_pipeline = Pipeline([
            (
                "bug_extractor",
                bug_features.BugExtractor(
                    feature_extractors,
                    cleanup_functions,
                    rollback=True,
                    rollback_when=self.rollback,
                ),
            ),
            (
                "union",
                ColumnTransformer([
                    ("data", DictVectorizer(), "data"),
                    ("title", self.text_vectorizer(min_df=0.0001), "title"),
                    (
                        "comments",
                        self.text_vectorizer(min_df=0.0001),
                        "comments",
                    ),
                ]),
            ),
        ])

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
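A hedged training sketch (not part of the original class): roughly how the pieces above fit together, where `model`, `bug_dicts` and `labels` are hypothetical stand-ins for however the surrounding project supplies an instance and its data.

# Hedged sketch; `model`, `bug_dicts` and `labels` are hypothetical.
X = model.extraction_pipeline.fit_transform(bug_dicts)
X_res, y_res = model.sampler.fit_resample(X, labels)  # imblearn under-sampling
model.clf.fit(X_res, y_res)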
Example #2
for row in allLikesLS:
    aDictLikes2[row[0]].append(row[1])

# Build one {like_id: 1} dict per user so DictVectorizer can one-hot encode the likes.
combDICT = {}
for uid in unqLikesUIDs:
    tmpDICT = {}
    tmpLS = aDictLikes2[uid]
    for row in tmpLS:
        tmpDICT[str(row)] = 1
    combDICT[uid] = tmpDICT

tryTHIS = []
for uid in unqLikesUIDs:
    tryTHIS.append(combDICT[uid])

v = DictVectorizer()
likesMAT = v.fit_transform(tryTHIS)

del globals()['likes']
del globals()['likesUIDs']
del globals()['likesLIDs']
del globals()['lsLikesUIDs']
del globals()['lsLikesLIDs']
del globals()['setLikesUIDs']
del globals()['setLikesLIDs']
del globals()['allLikesLS']
del globals()['aDictLikes2']
del globals()['aUID']
del globals()['row']
del globals()['combDICT']
del globals()['uid']
Example #3
from sklearn.metrics import precision_score
from sklearn import linear_model
from sklearn import tree
from sklearn.cluster import KMeans
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE

allegations = list()
officers = dict()
complainants = dict()
features = DictVectorizer()
results = []

#################################################### REFERENCE VARS
# column numbers for allegation sheet
id_colnum = 0
complainantid_colnum = 1
officerid_colnum = 2
allcat_colnum = 4
result_colnum = 11
investigator_colnum = 22

# column numbers for officer sheet
id_colnum = 0
gender_colnum = 4
race_colnum = 6
Example #4
def get_similarity_model():
    model = Pipeline([("vect", DictVectorizer()), ("neigh", NearestNeighbors(n_neighbors=6))])
    return model
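A hedged usage sketch (not part of the original): fit the pipeline on a few hypothetical dict records, then look up neighbours of a new record through the fitted steps.

# Hedged sketch; the records and field names are hypothetical.
model = get_similarity_model()
items = [{"color": "red", "size": 2.0}, {"color": "blue", "size": 1.5},
         {"color": "red", "size": 1.8}]
model.fit(items)
query = model.named_steps["vect"].transform([{"color": "red", "size": 1.9}])
distances, indices = model.named_steps["neigh"].kneighbors(query, n_neighbors=2)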
Example #5
 def __init__(self):
     self.vectorizer = DictVectorizer()
Example #6
    def __init__(self, lemmatization=False, historical=False):
        BugModel.__init__(self, lemmatization)

        self.sampler = BorderlineSMOTE(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            # Ignore keywords that would make the ML completely skewed
            # (we are going to use them as 100% rules in the evaluation phase).
            bug_features.keywords({"regression", "talos-regression", "feature"}),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        if historical:
            feature_extractors += [
                bug_features.had_severity_enhancement(),
                bug_features.patches(),
                bug_features.landings(),
            ]

        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "bug_extractor",
                    bug_features.BugExtractor(feature_extractors, cleanup_functions),
                ),
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("data", DictVectorizer(), "data"),
                            ("title", self.text_vectorizer(min_df=0.001), "title"),
                            (
                                "first_comment",
                                self.text_vectorizer(min_df=0.001),
                                "first_comment",
                            ),
                            (
                                "comments",
                                self.text_vectorizer(min_df=0.001),
                                "comments",
                            ),
                        ]
                    ),
                ),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=utils.get_physical_cpu_count())
        self.clf.set_params(predictor="cpu_predictor")
Example #7
CSHeadline_TFIDF = json.load(
    open('./txt_CosineSimilarity/TFIDF_forCS_headline.txt',
         'r',
         encoding="UTF-8"))
CSBody_TFIDF = json.load(
    open('./txt_CosineSimilarity/TFIDF_forCS_body.txt', 'r', encoding="UTF-8"))
CosineSimilarity = json.load(
    open('./txt_CosineSimilarity/TFIDF_CosineSimilarity.txt',
         'r',
         encoding="utf-8"))
data = pd.read_csv("merged_traintest.csv")

# y = data['Stance_2cat']
y = data['Stance_4cat']  #(75385,1)
# print ("y",y)

dict_vec = DictVectorizer(sparse=True)
X1 = dict_vec.fit_transform(CSHeadline_TFIDF.values())
X1 = pd.DataFrame(X1.toarray(), columns=dict_vec.get_feature_names())
# print (X1) # (75385 rows x 3293 columns)

X2 = dict_vec.fit_transform(CSBody_TFIDF.values())
X2 = pd.DataFrame(X2.toarray(), columns=dict_vec.get_feature_names())
# print (X2) # (75385 rows x 4207 columns)

X = pd.concat([X1, X2], axis=1)
del X1
del X2

X3 = pd.DataFrame.from_dict(CosineSimilarity,
                            orient='index',
                            columns=['CosineSimilarity'])  # (75385 rows, 1 column)
Example #8
 def fit_feature_dict(self, sequences):
     train_data = self.get_sequence_features(sequences)
     self.feature2matrix = DictVectorizer()
     self.feature2matrix.fit(train_data)
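A hedged companion sketch: the transform counterpart such a fragment usually pairs with; the method name is hypothetical.

 def transform_sequences(self, sequences):
     # Hypothetical counterpart: reuse the fitted DictVectorizer on new sequences.
     data = self.get_sequence_features(sequences)
     return self.feature2matrix.transform(data)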
Example #9
from sklearn.metrics import accuracy_score

### Step 1 ###
testData = pd.read_csv("testing.csv")
trainingData = pd.read_csv("training.csv")

Xtest = testData.drop("target", axis=1)
Ytest = testData["target"]
Xtrain = trainingData.drop("target", axis=1)
Ytrain = trainingData["target"]

### step 2 ###
Xtrain_dict = Xtrain.to_dict("records")
Xtest_dict = Xtest.to_dict("records")

dv = DictVectorizer()

Xtrain_encoded = dv.fit_transform(Xtrain_dict)
Xtest_encoded = dv.transform(Xtest_dict)

clf = DecisionTreeClassifier()
score = np.mean(cross_val_score(clf, Xtrain_encoded, Ytrain))
print(score)

### step 3 ###
pipe_dv_dtc = make_pipeline(DictVectorizer(), DecisionTreeClassifier())

pipe_dv_dtc.fit(Xtrain_dict, Ytrain)
pred = pipe_dv_dtc.predict(Xtest_dict)
print(accuracy_score(Ytest, pred))
Example #10
parser = argparse.ArgumentParser()
parser.add_argument(
    '-p',
    '--persist',
    action='store_true',
    help='Specify whether to make the model persistent in models/*')
parser.add_argument(
    '--noval',
    action='store_true',
    help='specify whether to evaluate the model\'s performance')
args = parser.parse_args()

forest_clf = RandomForestRegressor(min_samples_leaf=5, random_state=42)
multi_clf = MultiOutputRegressor(forest_clf)
full_pipeline = Pipeline([('filterer', DictFilterer(exclude_u_sub)),
                          ('vectorizer', DictVectorizer(sparse=True)),
                          ('selectKBest', SelectKBest(multi_f_classif,
                                                      k=1000)),
                          ('scaler', StandardScaler(with_mean=False)),
                          ('framer', ToSparseDF()), ('clf', multi_clf)])

if __name__ == '__main__':
    from tables import Comment, User, db
    from collections import defaultdict

    comment_groups = Comment.query.with_entities(Comment.author, Comment.subreddit, db.func.count(Comment.subreddit))\
                            .group_by(Comment.author, Comment.subreddit)\
                            .all()

    subreddit_counts = defaultdict(dict)
    for author, subreddit, count in comment_groups:
Example #11
        return self

    def transform(self, X):
        return [{word: True
                 for word in word_tokenize(document)} for document in X]


# Load the messages. We only care about the message content, so we extract and store just their 'text' values:

tweets = []
with open(input_filename) as inf:
    for line in inf:
        if len(line.strip()) == 0:
            continue
        tweets.append(json.loads(line)['text'])
# Load the categories (labels) of the messages.

with open(labels_filename) as inf:
    labels = json.load(inf)

# Create the pipeline that ties all the parts together. It has three steps:
# the NLTKBOW transformer we wrote, a DictVectorizer transformer, and a BernoulliNB classifier.

from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('bag-of-words', NLTKBOW()),
                     ('vectorizer', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())])
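A hedged usage sketch (not in the original excerpt): fitting the pipeline above on the loaded tweets and labels and scoring it with cross-validation, assuming the labels are binary.

# Hedged sketch; assumes `tweets` and `labels` were loaded as above and are binary.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
print("Mean F1: {:.3f}".format(scores.mean()))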
Example #12
def make_conversion_data(num_feat_files, from_suffix, to_suffix):
    num_examples = 500
    num_feats_per_file = 7

    np.random.seed(1234567890)

    convert_dir = join(_my_dir, 'train', 'test_conversion')
    if not exists(convert_dir):
        os.makedirs(convert_dir)

    # Create lists we will write files from
    ids = []
    features = []
    labels = []
    for j in range(num_examples):
        y = "dog" if j % 2 == 0 else "cat"
        ex_id = "{}{}".format(y, j)
        x = {"f{:03d}".format(feat_num): np.random.randint(0, 4) for feat_num
             in range(num_feat_files * num_feats_per_file)}
        x = OrderedDict(sorted(x.items(), key=lambda t: t[0]))
        ids.append(ex_id)
        labels.append(y)
        features.append(x)
    # Create vectorizers/maps for libsvm subset writing
    feat_vectorizer = DictVectorizer()
    feat_vectorizer.fit(features)
    label_map = {label: num for num, label in
                 enumerate(sorted({label for label in labels if
                                   not isinstance(label, (int, float))}))}
    # Add fake item to vectorizer for None
    label_map[None] = '00000'

    # get the feature name prefix
    feature_name_prefix = '{}_to_{}'.format(from_suffix.lstrip('.'),
                                            to_suffix.lstrip('.'))

    # Write out unmerged features in the `from_suffix` file format
    for i in range(num_feat_files):
        train_path = join(convert_dir, '{}_{}{}'.format(feature_name_prefix,
                                                        i, from_suffix))
        sub_features = []
        for example_num in range(num_examples):
            feat_num = i * num_feats_per_file
            x = {"f{:03d}".format(feat_num + j):
                 features[example_num]["f{:03d}".format(feat_num + j)] for j in
                 range(num_feats_per_file)}
            sub_features.append(x)
        train_fs = FeatureSet('sub_train', ids, labels=labels,
                              features=sub_features,
                              vectorizer=feat_vectorizer)
        if from_suffix == '.libsvm':
            Writer.for_path(train_path, train_fs,
                            label_map=label_map).write()
        else:
            Writer.for_path(train_path, train_fs).write()

    # Write out the merged features in the `to_suffix` file format
    train_path = join(convert_dir, '{}_all{}'.format(feature_name_prefix,
                                                     to_suffix))
    train_fs = FeatureSet('train', ids, labels=labels, features=features,
                          vectorizer=feat_vectorizer)
    if to_suffix == '.libsvm':
        Writer.for_path(train_path, train_fs,
                        label_map=label_map).write()
    else:
        Writer.for_path(train_path, train_fs).write()
Example #13
    for index, row in sub_train.iterrows():
        wifi_dict = {}
        for wifi in row.wifi_infos.split(';'):
            bssid, signal, flag = wifi.split('|')
            wifi_dict[bssid] = int(signal)
        train_set.append(wifi_dict)

    test_set = []
    for index, row in sub_test.iterrows():
        wifi_dict = {}
        for wifi in row.wifi_infos.split(';'):
            bssid, signal, flag = wifi.split('|')
            wifi_dict[bssid] = int(signal)
        test_set.append(wifi_dict)

    v = DictVectorizer(sparse=False, sort=False)
    train_set = v.fit_transform(train_set)
    test_set = v.transform(test_set)
    train_set[train_set == 0] = np.NaN
    test_set[test_set == 0] = np.NaN
    sub_train = pd.concat([sub_train.reset_index(),
                           pd.DataFrame(train_set)],
                          axis=1)
    sub_test = pd.concat([sub_test.reset_index(),
                          pd.DataFrame(test_set)],
                         axis=1)

    lbl = LabelEncoder()
    lbl.fit(list(sub_train['shop_id'].values))
    sub_train['label'] = lbl.transform(list(sub_train['shop_id'].values))
    num_class = sub_train['label'].max() + 1
Example #14
def add_sample_metadata_as_features(exp: Experiment,
                                    fields,
                                    sparse=None,
                                    inplace=False):
    '''Add covariates from sample metadata to the data table as features for machine learning.

    This will convert the columns of categorical strings using one-hot
    encoding scheme and add them into the data table as new features.

    .. note:: This is only for numeric and/or nominal covariates in
       sample metadata. If you want to add a ordinal column as a
       feature, use `pandas.Series.map` to convert ordinal column to
       numeric column first.

    Examples
    --------
    >>> exp = Experiment(np.array([[1,2], [3, 4]]), sparse=False,
    ...                  sample_metadata=pd.DataFrame({'category': ['A', 'B'],
    ...                                                'ph': [6.6, 7.7]},
    ...                                               index=['s1', 's2']),
    ...                  feature_metadata=pd.DataFrame({'motile': ['y', 'n']}, index=['otu1', 'otu2']))
    >>> exp
    Experiment with 2 samples, 2 features

    Let's add the columns of `category` and `ph` as features into data table:

    >>> new = exp.add_sample_metadata_as_features(['category', 'ph'])
    >>> new
    Experiment with 2 samples, 5 features
    >>> new.feature_metadata
               motile
    category=A    NaN
    category=B    NaN
    ph            NaN
    otu1            y
    otu2            n
    >>> new.data  # doctest: +SKIP
    array([[1. , 0. , 6.6, 1. , 2. ],
           [0. , 1. , 7.7, 3. , 4. ]])

    Parameters
    ----------
    fields : list of str
        the column names in the sample metadata. These columns will be
        converted to one-hot numeric code and then concatenated to the
        data table
    sparse : bool or None, optional
        use sparse or dense data matrix. When it is ``None``, it will follow
        the same sparsity of the current data table in the :class:`.Experiment` object
    inplace : bool
        change the :class:`.Experiment` object in place or return a copy of changed.

    Returns
    -------
    Experiment

    See Also
    --------
    sklearn.preprocessing.OneHotEncoder

    '''
    logger.debug('Add the sample metadata {} as features'.format(fields))
    if inplace:
        new = exp
    else:
        new = exp.copy()
    md = new.sample_metadata[fields]
    if sparse is None:
        sparse = new.sparse

    vec = DictVectorizer(sparse=sparse)
    encoded = vec.fit_transform(md.to_dict(orient='records'))

    if sparse:
        new.data = hstack((encoded, new.data), format='csr')
    else:
        new.data = np.concatenate([encoded, new.data], axis=1)
    # the order in the concatenation should be consistent with the data table
    new.feature_metadata = pd.concat(
        [pd.DataFrame(index=vec.get_feature_names()), new.feature_metadata])
    return new
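A hedged sketch of the ordinal-column workaround mentioned in the note above; the column name 'severity' and its level mapping are hypothetical.

# Hypothetical ordinal column: map it to numbers first, then add it as a feature.
exp.sample_metadata['severity_num'] = exp.sample_metadata['severity'].map(
    {'low': 1, 'medium': 2, 'high': 3})
new = exp.add_sample_metadata_as_features(['severity_num', 'ph'])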
Example #15
def crossval(paths, annDir, eval_type, use_reach, relabeling,
             conservative_eval, limit_training, balance_dataset):
    ''' Puts all together '''
    def in_neighborhood(datum, intervals):
        ''' Used to filter a datum if it's not within the neighborhood of (min, max) '''

        for interval in intervals:
            minIx, maxIx = interval
            if datum.ctxIx >= minIx and datum.ctxIx <= maxIx:
                return True

        return False

    print "Parsing data"
    paths = set(paths)
    labels, features, data = parse_data(paths, annDir, use_reach, relabeling)

    # Group indexes by paper id
    groups = {p: [] for p in paths}

    for i, d in enumerate(data):
        groups[d.namespace].append(i)

    # Hack!!
    groups2 = {}
    for k, v in groups.iteritems():
        if len(v) != 0:
            groups2[k] = v

    groups = groups2

    print
    print "Cross-validation"
    print "Using %i papers" % len(groups2)
    if relabeling: print "Doing relabeling"
    if conservative_eval: print "Doing conservative evaluation"
    if limit_training: print "Limiting training data range"
    if balance_dataset: print "Balancing data set during training"
    if one_hit_all: print "One-hit-all"
    print "Total golden data: %i\tTotal expanded data: %i" % (len(
        [d for d in data if d.golden]), len([d for d in data if not d.golden]))
    print

    # Only use the training data
    if limit_training:
        # Compute the range of the annotations
        intervals = defaultdict(list)
        k = 2
        for datum in data:
            if datum.golden:
                intervals[datum.namespace].append(
                    (datum.ctxIx - k, datum.ctxIx + k))

    # Make it a numpy array to index it more easily
    data = np.asarray(data)

    dv = DictVectorizer()
    dv.fit(features.values())

    # Build a feature vector and attach it to each datum
    vectors = {k: dv.transform(v) for k, v in features.iteritems()}

    c_results, p_results = [], []

    # Do the "Cross-validation" only on those papers that have more than N papers
    for ix, path in enumerate(groups.keys()):

        print "Fold: %i" % (ix + 1)

        training_paths = paths - {path}

        X_train, X_test = [], []
        y_train, y_test = [], []
        data_train, data_test = [], []

        for datum in data:
            if datum.namespace in training_paths:

                if limit_training:
                    if not in_neighborhood(datum, intervals[datum.namespace]):
                        continue

                X_train.append(vectors[datum])
                y_train.append(labels[datum])
                data_train.append(datum)

        for datum in data:
            if datum.namespace not in training_paths:
                if conservative_eval:
                    if not datum.golden:
                        continue

                X_test.append(vectors[datum])
                y_test.append(labels[datum])
                data_test.append(datum)

        # Balance the dataset if necessary
        if balance_dataset:
            train_positive, train_negative = [], []
            for datum in data_train:
                if datum.label == 1:
                    train_positive.append(datum)
                else:
                    train_negative.append(datum)

            k = 4  # Ratio of negatives per positives for balancing
            size = len(train_positive) * k
            if size < len(train_negative):
                balanced_negatives = np.random.choice(train_negative,
                                                      size,
                                                      replace=False).tolist()
            else:
                balanced_negatives = train_negative

            data_train = train_positive + balanced_negatives

            X_train = [vectors[datum] for datum in data_train]
            y_train = [labels[datum] for datum in data_train]

        p = len([d for d in data_train if d.label == 1])
        n = len([d for d in data_train if d.label == 0])
        r = n / float(p)
        print path
        print "Training data: %i positives\t%i negatives\t%f N:P ratio" % (
            p, n, r)
        p = len([d for d in data_test if d.label == 1])
        n = len([d for d in data_test if d.label == 0])
        r = n / float(p)
        print "Testing data: %i positives\t%i negatives\t%f N:P ratio" % (p, n,
                                                                          r)

        model_pred = machine_learning(vstack(X_train), y_train, vstack(X_test),
                                      y_test)
        policy_pred = policy(np.asarray(data_test))

        # One-hit-all approach
        if one_hit_all:
            ctx_types = list({d.ctxGrounded for d in data_test})
            ctx_types.sort()
            local_events = list({d.evt for d in data_test})
            local_events.sort()

            predicted_bag = set()
            for datum, prediction in it.izip(data_test, model_pred):
                if prediction == 1:
                    predicted_bag.add((datum.evt, datum.ctxGrounded))

            policy_bag = set()
            for datum, prediction in it.izip(data_test, policy_pred):
                if prediction == 1:
                    policy_bag.add((datum.evt, datum.ctxGrounded))

            truth_bag = set()
            for datum, prediction in it.izip(data_test, y_test):
                if prediction == 1:
                    truth_bag.add((datum.evt, datum.ctxGrounded))

            new_model_pred, new_policy_pred, new_truth = [], [], []
            for evt in local_events:
                for ctx in ctx_types:
                    if (evt, ctx) in predicted_bag:
                        new_model_pred.append(1)
                    else:
                        new_model_pred.append(0)

                    if (evt, ctx) in policy_bag:
                        new_policy_pred.append(1)
                    else:
                        new_policy_pred.append(0)

                    if (evt, ctx) in truth_bag:
                        new_truth.append(1)
                    else:
                        new_truth.append(0)

            y_test = new_truth
            model_pred = new_model_pred
            policy_pred = new_policy_pred
        ######################

        model_results = ClassificationResults("Model %s" % path, y_test,
                                              model_pred)
        policy_result = ClassificationResults("Policy %s" % path, y_test,
                                              policy_pred)
        print "Model scores: %s" % model_results
        print "Policy scores %s" % policy_result
        print

        c_results.append(model_results)
        p_results.append(policy_result)

    #return pd.Series(f1_diffs), model_f1s
    return c_results, p_results
Example #16
def evaluate_combinations(X_train, Y_train, X_test, Y_test, model, w2v_vector):
    combination_results = dict()
    accuracy = []
    n_params = []
    caps = []
    pos = []
    NER = []
    context = []
    w2v = []
    F1 = []
    comb_list = list(product([True, False], repeat=5))
    for i, each_comb in enumerate(comb_list):
        caps.append(each_comb[0])
        pos.append(each_comb[1])
        NER.append(each_comb[2])
        context.append(each_comb[3])
        w2v.append(each_comb[4])

        print("\ncaps : ", each_comb[0], " POS : ", each_comb[1], " NER : ",
              each_comb[2], " context : ", each_comb[3], " w2v : ",
              each_comb[4])

        train_dicts = make_feature_dicts(X_train,
                                         w2v_model_wv,
                                         token=True,
                                         caps=each_comb[0],
                                         pos=each_comb[1],
                                         NER=each_comb[2],
                                         context=each_comb[3],
                                         w2v=each_comb[4])

        vec = DictVectorizer()
        X_train_v = vec.fit_transform(train_dicts)

        clf = 0
        if model == "SVM":
            clf = LinearSVC(C=0.1,
                            random_state=123,
                            class_weight="balanced",
                            max_iter=100,
                            fit_intercept=True)
        elif model == "NN":
            clf = MLPClassifier(hidden_layer_sizes=(5, 10),
                                solver='sgd',
                                learning_rate='adaptive',
                                activation='logistic',
                                max_iter=50,
                                random_state=42)
        elif model == "MNB":
            clf = GaussianNB()

        #clf = LogisticRegression()
        clf.fit(X_train_v.toarray(), Y_train)
        """
    #checking feature weights
    for i, cls in enumerate(clf.classes_):
      print("\nFeature weights for class : ",cls,"\n")
      df = pd.DataFrame(data= {"Features" : vec.feature_names_, "weights" : clf.coef_[i]})
      df = df.sort_values(axis=0,by='weights',ascending=False)
      print(df)
    """

        test_dicts = make_feature_dicts(X_test,
                                        w2v_model_wv,
                                        token=True,
                                        caps=each_comb[0],
                                        pos=each_comb[1],
                                        NER=each_comb[2],
                                        context=each_comb[3],
                                        w2v=each_comb[4])
        X_test_v = vec.transform(test_dicts)
        Y_preds = clf.predict(X_test_v)

        #confusion_matrix = confusion(test_labels, preds)
        #class_labels = ["Relevant","Not Relevant","Deceptive"]
        class_labels = ["Relevant", "Not Relevant", "Deceptive"]
        c_m = confusion_matrix(Y_test, Y_preds, labels=class_labels)
        cm_df = pd.DataFrame(c_m, index=class_labels, columns=class_labels)
        print('confusion matrix:\n%s\n' % str(cm_df))

        #evaluation_matrix = evaluate(confusion_matrix)
        #f1_score = average_f1s(evaluation_matrix)
        acc = accuracy_score(Y_test, Y_preds)
        print("Accuracy : ", acc)
        n_params.append(clf.coef_.size)
        accuracy.append(acc)

        f1 = calculate_f1(cm_df)
        print("f1: ", f1)
        F1.append(f1)

    df = pd.DataFrame(
        data={
            'F1': F1,
            'Accuracy': accuracy,
            'n_params': n_params,
            'caps': caps,
            'pos': pos,
            'NER': NER,
            'context': context,
            'w2v': w2v
        })
    #df = pd.DataFrame(data={'Accuracy' : accuracy, 'caps' : caps, 'pos' : pos, 'NER' : NER, 'context' : context, 'w2v' : w2v})
    df = df[[
        'F1', 'Accuracy', 'n_params', 'caps', 'pos', 'NER', 'context', 'w2v'
    ]]
    #df = df[['Accuracy','caps','pos','NER','context','w2v']]
    df = df.sort_values(axis=0, by='F1', ascending=False)
    return df
Example #17
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--clusters', type=int, default=5, required=False)
    parser.add_argument('--confidence', type=float, default=0.0, required=False)
    parser.add_argument('directory', nargs=1)
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--kmeans', action='store_true')
    group.add_argument('--dbscan', action='store_true')
    args = parser.parse_args()

    confidence = args.confidence
    n_clusters = args.clusters
    directory  = args.directory[0]
    documents = {}

    for filename in os.listdir(directory):

        if not filename.endswith('.labels'):
            continue

        with open(directory + '/' + filename, 'r') as f:
            documents[filename] = []

            for line in f.readlines():

                l_components = line.split('\t')
                conf = float(l_components[0])
                label = l_components[1][:-1]

                if conf > confidence:
                    documents[filename].append(label)
 
    v = DictVectorizer()
    dox = [ { l : 1 for l in documents[d] } for d in documents ]
    doc_names = [ d.rstrip('.labels') for d in documents ]

    X = v.fit_transform(dox)
    features = v.get_feature_names()

    if args.kmeans:
        km = KMeans(n_clusters=n_clusters)
        km.fit(X)

        # Sort cluster centers by proximity to centroid
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]
        closest_labels_dict = { i : "" for i in range(n_clusters) }

        for i in range(n_clusters):
        
            for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
                closest_labels_dict[i] += features[ind] + ", "
            closest_labels_dict[i] = closest_labels_dict[i].rstrip(', ')

        clusters = km.labels_.tolist()
        clusters_dict = { i : [] for i in range(n_clusters) } 

        for c in range(len(clusters)):
            clusters_dict[clusters[c]].append(doc_names[c])

        print('<html>')
        print('<body>')

        print('<style>')
        print('img { height: 75px; }')
        print('h2 { font-family: sans-serif; } ')
        print('.box { max-width: 700px; }')
        print('</style>')

        print('<div class="box">')
        for k in clusters_dict:
            print('<h2>' + str(k) + ": " +  closest_labels_dict[k] + '</h2>')
            for img in clusters_dict[k]:
                print('<img src="file://' + directory + '/' + img + '">')
        print('</div>')
        print('</body>')
        print('</html>')

    elif args.dbscan:
        raise NotImplementedError("DBSCAN clustering is not implemented")
Example #18
def BasicNight():
    #BasicFuntion Night
    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    #BasicFuntion Night

    df_training = data
    df_train = df_training[df_training.columns[2:9]]

    Y_train = df_train['L11P'].values
    del df_train['L11P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'Estbasic.pkl')

    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    df_training = data
    df_train = df_training[list(data.columns[2:8]) + list(data.columns[10:16])]
    Y_train = df_train['L21P'].values
    del df_train['L21P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL21.pkl')

    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    df_training = data
    df_train = df_training[list(data.columns[2:8]) + list(data.columns[17:22])]
    Y_train = df_train['L22P'].values
    del df_train['L22P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL22.pkl')

    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    df_training = data
    df_train = df_training[list(data.columns[2:8]) + list(data.columns[23:28])]
    Y_train = df_train['L23P'].values
    del df_train['L23P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL23.pkl')

    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    df_training = data
    df_train = df_training[list(data.columns[2:8]) + list(data.columns[29:34])]
    Y_train = df_train['L24P'].values
    del df_train['L24P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL24.pkl')

    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)

    df_training = data
    df_train = df_training[list(data.columns[45:49])]
    Y_train = df_train['L33P'].values
    del df_train['L33P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL33.pkl')

    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    df_training = data
    df_train = df_training[list(data.columns[35:39])]
    Y_train = df_train['L31P'].values
    del df_train['L31P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL31.pkl')

    db = MySQLdb.connect("localhost", "root", "", "ge_hackathon")
    cursor = db.cursor()
    sql = "SELECT * FROM answers WHERE finished='1'"
    data = frame_query(sql, db)
    df_training = data
    df_train = df_training[list(data.columns[40:44])]
    Y_train = df_train['L32P'].values
    del df_train['L32P']
    #ENCODING
    X_train = df_train.to_dict('records')
    X_tr = []
    X_tr.extend(X_train)
    #One Hot Encoding
    enc = DictVectorizer(sparse=True)
    X_encoded_train = enc.fit_transform(X_tr)
    estimator = GradientBoostingClassifier()
    estimator.fit(X_encoded_train.toarray(), Y_train)
    joblib.dump(estimator, 'EstL32.pkl')

    return 'success'
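The function above repeats the same encode/train/dump pattern for every label column; a hedged refactoring sketch (not in the original) of that repeated block:

def train_and_dump(df_train, target_col, out_path):
    # Hedged helper sketch: one-hot encode with DictVectorizer, fit a
    # GradientBoostingClassifier and persist it, as each block above does.
    y = df_train[target_col].values
    X_dicts = df_train.drop(columns=[target_col]).to_dict('records')
    enc = DictVectorizer(sparse=True)
    X = enc.fit_transform(X_dicts)
    est = GradientBoostingClassifier()
    est.fit(X.toarray(), y)
    joblib.dump(est, out_path)
    return enc, est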
Example #19
from sklearn.decomposition import LatentDirichletAllocation
import string
from nltk.stem.porter import PorterStemmer
import theano
from theano import tensor as T
from theano import function
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
from sklearn.pipeline import Pipeline
import  matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
def clear_title(title,remove_stopwords):
    raw_text=BeautifulSoup(title,'html').get_text()
    letters=re.sub('[^a-zA-Z]',' ',raw_text)
    words=letters.lower().split()
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w not in stop_words]
    return ' '.join(words)
dict_vec=DictVectorizer(sparse=False)
PATH_TO_ORIGINAL_DATA = '../datasets/'
data = pd.read_csv(PATH_TO_ORIGINAL_DATA + 'cleared_bugs',sep='\t')
selected_columns=['Product','Component','Assignee','Summary','Changed']
data=data[selected_columns]
text=[]
for title in data['Summary']:
    text.append(clear_title(title,True).split())
print text
from gensim.models import word2vec
model=word2vec.Word2Vec(text,workers=4,size=50,min_count=1,window=2)
model.wv.save_word2vec_format('summary.txt',binary=False)
Example #20
PREFIX = 'data/'
train = pd.read_csv(PREFIX + 'train_items.csv')
train = train[train['price'] > 0].reset_index(drop=True)

test = pd.read_csv(PREFIX + 'test_items.csv')
sid = test.sample_id.values
del test['sample_id']

# In[16]:

# noinspection PyTypeChecker
vectorizer = make_union(
    on_field('name', tfidf_fabric()), on_field('text', tfidf_fabric()),
    on_field(['shipping', 'item_condition_id', 'category_name'],
             FunctionTransformer(to_records, validate=False),
             DictVectorizer()))

# In[17]:

cv = KFold(n_splits=10, shuffle=True, random_state=42)
train_ids, valid_ids = next(cv.split(train))
train, valid = train.iloc[train_ids], train.iloc[valid_ids]

# In[18]:

y_train = np.log1p(train['price'].values.reshape(-1, 1))
y_valid = valid['price'].values

X_train = vectorizer.fit_transform(preprocess(train)).astype(np.float32)
X_valid = vectorizer.transform(preprocess(valid)).astype(np.float32)
X_final = vectorizer.transform(preprocess(test)).astype(np.float32)
Example #21
featureList = []  # list of dicts, one dict per instance
lableList = []  # list of class labels

with open(r'E:\pycharm\ML\computer.csv', 'r') as f:
    r = csv.reader(f)
    header = next(r)  # header row
    for line in r:
        lableList.append(line[-1])
        dic = {}
        for i in range(1, len(line) - 1):
            dic[header[i]] = line[i]
        featureList.append(dic)

# pprint(featureList)

vec = DictVectorizer()  # transform the feature dicts into vectors

dummyX = vec.fit_transform(featureList).toarray()

# print(dummyX.shape)
# print(dummyX)

print(vec.get_feature_names())
# print(dummyX)

dummyY = preprocessing.LabelBinarizer().fit_transform(lableList)

classifier = tree.DecisionTreeClassifier(
    criterion='entropy')  # use information entropy; the default criterion is 'gini'
classifier.fit(dummyX, dummyY)
'''
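A hedged prediction sketch (the original snippet is cut off above); the feature values are hypothetical and would have to match the CSV's columns.

new_row = dict(zip(header[1:-1], ['youth', 'high', 'no', 'fair']))  # hypothetical values
print(classifier.predict(vec.transform([new_row]).toarray()))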
Example #22
def PredictionScoreLeaveOneOutSpecifyClassifier(X, y, limit, columnName,
                                                classifierNames, classifiers):
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.svm import SVC, LinearSVC
    import matplotlib.pyplot as plt

    names = classifierNames

    outFile = open('output.txt', 'a')

    vec = DictVectorizer()

    for name, clf in zip(names, classifiers):
        try:
            accuracy = 0.0
            count = 0.0
            total_accuracy = 0.0
            total_f1 = 0.0
            total_precision = 0.0
            total_recall = 0.0

            count = 1.0

            from sklearn.model_selection import LeaveOneOut
            loo = LeaveOneOut()
            loo.get_n_splits(X)

            # print(loo)
            y_test_all = []
            y_pred_all = []
            accuracy_total = 0
            count = 0
            for train_index, test_index in loo.split(X):
                # print("TRAIN:", train_index, "TEST:", test_index)
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                from sklearn.feature_extraction.text import CountVectorizer
                count_vect = CountVectorizer()
                X_train_fit = count_vect.fit(X_train)
                X_train_counts = X_train_fit.transform(X_train)
                X_test_counts = X_train_fit.transform(X_test)
                #
                from sklearn.feature_extraction.text import TfidfTransformer
                tfidf_transformer = TfidfTransformer()
                fit = tfidf_transformer.fit(X_train_counts)
                X_train_tfidf = fit.transform(X_train_counts)
                X_test_tfidf = fit.transform(X_test_counts)

                X_train_counts = X_train_tfidf
                X_test_counts = X_test_tfidf
                try:
                    clf.fit(X_train_counts.toarray(), y_train)
                    accuracy_total += clf.score(X_test_counts.toarray(),
                                                y_test)
                    count += 1
                    y_pred = clf.predict(X_test_counts.toarray())
                    #
                    # binary_predictions = [x if x == 'good' else 0 for x in y_pred]
                    # binary_predictions = [x if x == 0 else 1 for x in binary_predictions]
                    #
                    # binary_labels = [x if x == 'good' else 0 for x in y_test]
                    # binary_labels = [x if x == 0 else 1 for x in binary_labels]
                    y_pred_all.append(y_pred[0])
                    y_test_all.append(y_test[0])

                except BaseException as b:
                    print(b)

            f1 = f1_score(y_test_all, y_pred_all, average='weighted')
            precision = precision_score(y_test_all,
                                        y_pred_all,
                                        average='weighted')
            recall = recall_score(y_test_all, y_pred_all, average='weighted')

            print(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall))
            outFile.write(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall) + "\n")
            # acc, f1,prc,rec = classify(clf,X_train,X_test,y_train,y_test)
            #
            # total_accuracy +=acc
            # total_f1 += f1
            # total_precision += prc
            # total_recall += rec

        except BaseException as b:
            print(b)
    outFile.close()
Example #23
class MateSegmenter(object):
    """Class for perfoming discourse segmentation on constituency trees.

    """

    #: classifier object: default classification method
    DEFAULT_CLASSIFIER = LinearSVC(multi_class='ovr', class_weight='auto')

    #:str: path  to default model to use in classification
    DEFAULT_MODEL = os.path.join(os.path.dirname(__file__), "data",
                                 "mate.model")

    #:pipeline object: default pipeline object used for classification
    DEFAULT_PIPELINE = Pipeline([('vectorizer', DictVectorizer()),
                                 ('var_filter', VarianceThreshold()),
                                 ('classifier', DEFAULT_CLASSIFIER)])

    def __init__(self, featgen=gen_features_for_segment, model=DEFAULT_MODEL):
        """Class constructor.
        """
        self.featgen = featgen
        self.pipeline = None
        self._update_model(model)

    def extract_features_from_corpus(self, dep_corpus, seg_corpus=None):
        all_features = []
        all_labels = []
        for text in sorted(dep_corpus.keys()):
            seg_forest = seg_corpus.get(text, None)
            features, labels = self.extract_features_from_text(
                dep_corpus[text], seg_forest=seg_forest)
            all_features.extend(features)
            all_labels.extend(labels)
        return all_features, all_labels

    def extract_features_from_text(self, dep_forest, seg_forest=None):
        features = []
        labels = []
        observations = get_observations(seg_forest, dep_forest)
        for sentence_index, address, dep_tree, class_ in sorted(observations):
            features.append(self.featgen(dep_tree, address))
            labels.append(class_)
        return features, labels

    def segment(self, dep_corpus, out_folder):
        for text, trees in dep_corpus.iteritems():
            print text
            discourse_tree = self.segment_text(trees)
            with open(out_folder + '/' + text + '.tree', 'w') as fout:
                fout.write(str(discourse_tree))

    def segment_text(self, dep_forest):
        features, _ = self.extract_features_from_text(dep_forest)
        predictions = self._predict(features)
        return self._segment_text(predictions, dep_forest)

    def _segment_text(self, predictions, parses):
        all_segments = []
        for sentence, dep_graph in enumerate(parses):
            # slice prediction vector
            sentence_length = dep_graph.length()
            sentence_predictions = predictions[:sentence_length]
            predictions = predictions[sentence_length:]
            # segment
            segments = self._segment_sentence(sentence_predictions, dep_graph)
            segment = segments[0][1]
            all_segments.append((sentence, segment))
        return DiscourseSegment(a_name='TEXT', a_leaves=all_segments)

    def _segment_sentence(self, sentence_predictions, dep_graph):
        if dep_graph.is_valid_parse_tree():
            # remove prediction annotations (just to be sure)
            dep_graph.deannotate(PREDICTION)
            # annotate dep_graph with sentence predictions
            dep_graph.annotate(sentence_predictions, PREDICTION)
            # call tree_segmenter
            segmenter = TreeSegmenter(a_type=DEPENDENCY)
            segments = segmenter.segment(dep_graph,
                                         a_predict=decision_function,
                                         a_word_access=word_access,
                                         a_strategy=GREEDY,
                                         a_root_idx=dep_graph.root[ADDRESS])
        else:
            # make a simple sentence segment for invalid parse trees
            leaves = [(i, word)
                      for i, (_, word) in enumerate(dep_graph.words(), 1)]
            dseg = DiscourseSegment(a_name=DEFAULT_SEGMENT, a_leaves=leaves)
            segments = [(0, dseg)]
        return segments

    def train(self, seg_corpus, dep_corpus, path=None):
        assert seg_corpus.keys() == dep_corpus.keys()
        features, labels = self.extract_features_from_corpus(
            dep_corpus, seg_corpus=seg_corpus)
        self._train(features, labels)
        if path is not None:
            joblib.dump(self.pipeline, path, compress=1, cache_size=1e9)

    def _train(self, features, labels):
        self.pipeline = MateSegmenter.DEFAULT_PIPELINE
        self.pipeline.fit(features, labels)

    def test(self, seg_corpus, dep_corpus):
        assert seg_corpus.keys() == dep_corpus.keys()
        features, labels = self.extract_features_from_corpus(
            dep_corpus, seg_corpus=seg_corpus)
        predicted_labels = self._predict(features)
        return self._score(labels, predicted_labels)

    def _predict(self, features):
        return self.pipeline.predict(features)

    def _score(self, labels, predicted_labels):
        _, _, macro_f1, _ = precision_recall_fscore_support(labels,
                                                            predicted_labels,
                                                            average='macro',
                                                            warn_for=())
        _, _, micro_f1, _ = precision_recall_fscore_support(labels,
                                                            predicted_labels,
                                                            average='micro',
                                                            warn_for=())
        return macro_f1, micro_f1

    def cross_validate(self, seg_corpus, dep_corpus, out_folder=None):
        assert seg_corpus.keys() == dep_corpus.keys()
        texts = np.array(sorted(seg_corpus.keys()))
        folds = KFold(len(texts), number_of_folds)

        # extract features for all texts
        all_features = {}
        all_labels = {}
        for text in texts:
            features, labels = self.extract_features_from_text(
                dep_corpus[text], seg_forest=seg_corpus[text])
            all_features[text] = features
            all_labels[text] = labels

        # do the cross-validation
        macro_F1s = []
        micro_F1s = []
        tp = fp = fn = tp_i = fp_i = fn_i = 0
        for i, (train, test) in enumerate(folds):
            print "# FOLD", i
            # train
            train_texts = texts[train]
            train_features = chained(
                [all_features[text] for text in train_texts])
            train_labels = chained([all_labels[text] for text in train_texts])
            print "  training on %d items..." % len(train_labels)
            self._train(train_features, train_labels)
            print "  extracted %d features using the dict vectorizer." % \
                len(self.pipeline.named_steps['vectorizer'].get_feature_names())
            # test (predicting textwise)
            test_labels = []
            pred_labels = []
            for text in texts[test]:
                features = all_features[text]
                labels = all_labels[text]
                predictions = self._predict(features)
                test_labels.extend(labels)
                pred_labels.extend(predictions)
                if out_folder is not None:
                    discourse_tree = self._segment_text(
                        predictions, dep_corpus[text])
                    with open(out_folder + '/' + text + '.tree', 'w') as fout:
                        fout.write(str(discourse_tree))
            macro_f1, micro_f1 = self._score(test_labels, pred_labels)
            macro_F1s.append(macro_f1)
            micro_F1s.append(micro_f1)
            tp_i, fp_i, fn_i = _cnt_stat(test_labels, pred_labels)
            tp += tp_i
            fp += fp_i
            fn += fn_i

        print "# Average Macro F1 = %3.1f +- %3.2f" % \
            (100 * np.mean(macro_F1s), 100 * np.std(macro_F1s))
        print "# Average Micro F1 = %3.1f +- %3.2f" % \
            (100 * np.mean(micro_F1s), 100 * np.std(micro_F1s))
        if tp or fp or fn:
            print "# F1_{tp,fp} %.2f" % (2. * tp / (2. * tp + fp + fn) * 100)
        else:
            print "# F1_{tp,fp} 0. %"

    def _update_model(self, model):
        if model is None:
            self.pipeline = MateSegmenter.DEFAULT_PIPELINE
        elif isinstance(model, str):
            if not os.path.isfile(model) or not os.access(model, os.R_OK):
                raise RuntimeError(
                    "Can't load model from file {:s}".format(model))
            self.pipeline = joblib.load(model)
        else:
            self.pipeline = model
Example #24
def PredictionScoreLeaveOneOut(X, y, limit, columnName):
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import recall_score
    from sklearn.svm import SVC, LinearSVC
    import matplotlib.pyplot as plt

    names = [
        "Linear SVM", "Nearest Neighbors", "RBF SVM", "Decision Tree",
        "Random Forest", "AdaBoost", "Naive Bayes"
    ]
    # names = ["Linear SVM","Linear SVM","Linear SVM","Linear SVM"]

    classifiers = [
        SVC(kernel="linear", C=0.025, probability=True),
        KNeighborsClassifier(3),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB()
    ]

    outFile = open('output.txt', 'a')

    vec = DictVectorizer()

    for name, clf in zip(names, classifiers):
        try:
            accuracy = 0.0
            count = 0.0
            total_accuracy = 0.0
            total_f1 = 0.0
            total_precision = 0.0
            total_recall = 0.0

            count = 1.0

            from sklearn.model_selection import LeaveOneOut
            loo = LeaveOneOut()
            loo.get_n_splits(X)

            # print(loo)
            y_test_all = []
            y_pred_all = []
            accuracy_total = 0
            count = 0
            for train_index, test_index in loo.split(X):
                # print("TRAIN:", train_index, "TEST:", test_index)
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                from sklearn.feature_extraction.text import CountVectorizer
                count_vect = CountVectorizer()
                X_train_fit = count_vect.fit(X_train)
                X_train_counts = X_train_fit.transform(X_train)
                X_test_counts = X_train_fit.transform(X_test)

                from sklearn.feature_extraction.text import TfidfTransformer
                tfidf_transformer = TfidfTransformer()
                fit = tfidf_transformer.fit(X_train_counts)
                X_train_tfidf = fit.transform(X_train_counts)
                X_test_tfidf = fit.transform(X_test_counts)

                X_train_counts = X_train_tfidf
                X_test_counts = X_test_tfidf
                try:
                    clf.fit(X_train_counts.toarray(), y_train)
                    accuracy_total += clf.score(X_test_counts.toarray(),
                                                y_test)
                    count += 1
                    y_pred = clf.predict(X_test_counts.toarray())
                    #
                    # binary_predictions = [x if x == 'good' else 0 for x in y_pred]
                    # binary_predictions = [x if x == 0 else 1 for x in binary_predictions]
                    #
                    # binary_labels = [x if x == 'good' else 0 for x in y_test]
                    # binary_labels = [x if x == 0 else 1 for x in binary_labels]
                    y_pred_all.append(y_pred[0])
                    y_test_all.append(y_test[0])

                except BaseException as b:
                    print(b)

            f1 = f1_score(y_test_all, y_pred_all, average='weighted')
            precision = precision_score(y_test_all,
                                        y_pred_all,
                                        average='weighted')
            recall = recall_score(y_test_all, y_pred_all, average='weighted')

            print(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall))
            outFile.write(
                str(columnName) + "\t" + str(limit) + "\t" + str(name) + "\t" +
                str(accuracy_total / count) + "\t" + str(f1) + "\t" +
                str(precision) + "\t" + str(recall) + "\n")
            # acc, f1,prc,rec = classify(clf,X_train,X_test,y_train,y_test)
            #
            # total_accuracy +=acc
            # total_f1 += f1
            # total_precision += prc
            # total_recall += rec

        except BaseException as b:
            print(b)
    outFile.close()
Example #25
 def __init__(self, mu):
     self.mu = mu
     self.vectorizer = DictVectorizer()
Example #26
def convCatData(data_set):
    dict_vect = DictVectorizer(sparse=False)
    data_frm = pd.DataFrame(data_set).convert_objects(convert_numeric=True)
    converted_data_set = dict_vect.fit_transform(
        data_frm.to_dict(orient='records'))
    return converted_data_set
Example #27
__author__ = 'Harsh'

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_extraction import DictVectorizer
from datetime import datetime
#Load training data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Find maximum data and subtract days from it to check how old hotel is

vec = DictVectorizer()

#Encode label
#labelencoder = preprocessing.LabelEncoder()
#train['City'] = labelencoder.fit_transform(train['City'])
#train['City Group'] = labelencoder.fit_transform(train['City Group'])
#train['Type'] = labelencoder.fit_transform(train['Type'])
#train['Open Date'] = labelencoder.fit_transform(train['Open Date'])

def diff_dates_2015(date_x):
  date_format = "%m/%d/%Y"
  x = datetime.strptime(date_x, date_format)
  y = datetime.strptime('01/01/2015', date_format)
  delta = y - x
  return delta.days

train['Open Date'] = train['Open Date'].apply(lambda x: diff_dates_2015(x))
test['Open Date'] = test['Open Date'].apply(lambda x: diff_dates_2015(x))
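A hedged sketch (not in the original) of how the DictVectorizer created above might encode the categorical columns named in the commented-out lines before a regressor is fit.

# Hedged sketch; the column names come from the commented-out LabelEncoder lines above.
X_train_cat = vec.fit_transform(train[['City', 'City Group', 'Type']].to_dict(orient='records'))
X_test_cat = vec.transform(test[['City', 'City Group', 'Type']].to_dict(orient='records'))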
Example #28
        for song in playlist['tracks']:
            artist = song['artist_name']
            artist_name.append(artist)
            unique_artists.add(artist)
        playlist_vs_artists.append(collections.Counter(artist_name))

    print("Slice : " + str(i) + " - Parsed")
    start = end + 1

challenge_artists = set()
slice = json.load(open('challenge_set.json'))
for playlist in slice['playlists']:
    for song in playlist['tracks']:
        challenge_artists.add(song['artist_name'])

v = DictVectorizer(sparse=True)
X = csr_matrix.transpose(v.fit_transform(playlist_vs_artists))
print 'Vectorized!'
svd = TruncatedSVD(n_components=100)
X = svd.fit_transform(X)
print 'Reduced!'
neighbor = NearestNeighbors(n_neighbors=21, metric='cosine')
neighbor.fit(X)
print X

unique_artists = list(unique_artists)
unique_artists.sort()
print(len(unique_artists))
print 'Training Done!'

nearest_artists = {}
Example #29
from sklearn.tree import DecisionTreeClassifier, export_graphviz

print('Reading dummy data ... ')
df = read_csv('flight_data.csv')
print('[Done]')

# 'Current time at origin',
# 'Current time at destination',
columns = ['Age', 'Nationality', 'Sleep quality (1-5)']

le = LabelEncoder()
y = le.fit_transform(df['Light Color'].values)

print('Training on past data ... ')
df = df[columns].to_dict(orient='records')
vectorizer = DictVectorizer(sparse=False)
X = vectorizer.fit_transform(df)

X_train = X[:-1]
y_train = y[:-1]

grd = GradientBoostingClassifier(n_estimators=10)
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
print('[Done]\n')

# dot_data = export_graphviz(clf, out_file=None,
#                            feature_names=vectorizer.feature_names_,
#                            class_names=le.classes_.tolist(),
#                            filled=True, rounded=True,
#                            special_characters=True,
Example #30
import pandas as pd, numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from random import randint

if __name__ == "__main__":
    br = '\n'
    tips = pd.read_csv('data/tips.csv')
    data = tips.drop(['tip'], axis=1)
    target = tips['tip']
    v = ['sex', 'smoker', 'day', 'time']
    ls = data[v].to_dict(orient='records')
    vector = DictVectorizer(sparse=False, dtype=int)
    d = vector.fit_transform(ls)
    print('one hot encoding:')
    print(d[0:3], br)
    print('encoding order:')
    encode_order = vector.get_feature_names()
    print(encode_order, br)
    data = data.drop(['sex', 'smoker', 'day', 'time'], axis=1)
    X = data.values
    print('feature shape after removing categorical columns:')
    print(X.shape, br)
    Xls, dls = X.tolist(), d.tolist()
    X = [np.array(row + dls[i]) for i, row in enumerate(Xls)]
    X = np.array(X)
    y = target.values
    print('feature shape after adding encoded data back:')
    print(X.shape, br)
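    # The snippet ends here; a hedged continuation sketch (not in the original),
    # built only from the imports already declared above
    # (train_test_split, LinearRegression, mean_squared_error).
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    lr = LinearRegression().fit(X_train, y_train)
    rmse = np.sqrt(mean_squared_error(y_test, lr.predict(X_test)))
    print('test RMSE:', rmse)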