def classify_lyrics_pos(genre_lyrics_map):

    vectorizer = DictVectorizer()

    all_lyrics_pos_tags = []
    all_lyrics_genres   = []

    for genre, genre_lyrics in genre_lyrics_map.items():

        for song_lyrics in genre_lyrics:

            pos_tags_map = song_lyrics["features"]["pos_tags_map"]

            all_lyrics_pos_tags.append(pos_tags_map)
            all_lyrics_genres.append(genre)

    pos_train, pos_test, genres_train, genres_test = train_test_split(all_lyrics_pos_tags, all_lyrics_genres, test_size=0.33)

    vectorizer.fit(all_lyrics_pos_tags)
    vect = vectorizer.transform(all_lyrics_pos_tags)
    print("vect = " + str(vect))

    classifiers_to_use      = get_classifiers()
    partial_fit_classifiers = classifiers_to_use["partial"]
    full_fit_classifiers    = classifiers_to_use["full"]

    teach_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer, pos_train, genres_train, app_data.LYRICS_GENRES_METAL)

    test_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer, pos_test, genres_test)

    print_top_features(partial_fit_classifiers + full_fit_classifiers, vectorizer, app_data.LYRICS_GENRES_METAL)
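# Hedged illustration (not part of the project code) of what the DictVectorizer above
# does with a pos_tags_map: each POS tag becomes a column, and songs that lack a tag
# simply get a zero in that column. The tag counts below are made up.
from sklearn.feature_extraction import DictVectorizer

pos_maps = [{"NN": 12, "VB": 5, "JJ": 3}, {"NN": 7, "RB": 2}]
vec = DictVectorizer(sparse=False)
X = vec.fit_transform(pos_maps)
print(vec.feature_names_)   # ['JJ', 'NN', 'RB', 'VB']
print(X)                    # [[ 3. 12.  0.  5.]
                            #  [ 0.  7.  2.  0.]]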
def _get_lyrics_vectorizers():

    union = FeatureUnion(
        transformer_list=[

            ('verse_count', Pipeline([
                                ('extractor', ct.LyricsVerseCountVectorizer()),
                            ])
            ),

            ('stanza_count', Pipeline([
                                ('extractor', ct.LyricsStanzaCountVectorizer()),
                             ])
            ),

            ('avg_verse_words', Pipeline([
                                    ('extractor', ct.LyricsAvgVerseWordCountVectorizer())
                                ])
            ),

            ('word_count', Pipeline([
                                ('extractor', ct.LyricsWordCountVectorizer())
                           ])
            ),

            ('pos_tags_map', Pipeline([
                                ('extractor', ct.LyricsPartOfSpeechVectorizer()),
                                ('vectorizer', DictVectorizer())
                             ])
            ),

            ('word_endings', Pipeline([
                                ('extractor', ct.LyricsWordEndingsVectorizer()),
                                ('vectorizer', DictVectorizer()),
                                ('transformer', TfidfTransformer())
                             ])
            ),

            ('lyrics_bow', Pipeline([
                                ('vectorizer', _get_tfidf_vectorizer())
                           ])
            )

        ],

        transformer_weights={
            'verse_count'     : 2,
            'stanza_count'    : 2,
            'avg_verse_words' : 3,
            'word_count'      : 3,
            'pos_tags_map'    : 10,
            'word_endings'    : 7,
            'lyrics_bow'      : 10
        }
    )

    return union
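# Hedged usage sketch for the union above: plug it into a Pipeline in front of a
# classifier. LinearSVC is an arbitrary choice, and train_lyrics / train_genres are
# placeholder names for the raw lyrics objects the ct.* extractors are assumed to accept.
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

lyrics_pipeline = Pipeline([
    ('features', _get_lyrics_vectorizers()),
    ('clf', LinearSVC())
])
# lyrics_pipeline.fit(train_lyrics, train_genres)
# predicted_genres = lyrics_pipeline.predict(test_lyrics)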
Example #4
def mutual_information_similarity(file_name):
    """
    Calculates MI between all pairs of short_genre based on their word's MI.

    Prints to file the similarity

    :return:
    """
    from sklearn.metrics.pairwise import cosine_similarity as cos_sim
    import math

    SimilarityScore = collections.namedtuple("SimilarityScore", ("g1", "g2", "score"))  # a type

    # fetch all short genres
    mi_coll = MutualInformation()
    # all possible pairs of genre with no repeat
    genres = []

    # calculate cosine similarity b/w pairs
    dv = DictVectorizer()

    def extract_bow_add_to_genres(genre, bow):
        if genre not in genres:
            genres.append(genre)

        new_bow = {}

        for k in bow.keys():

            curr = bow[k]
            if math.isnan(curr) or math.isinf(curr):
                new_bow[k] = 0
                print("Eliminated element")
            else:
                new_bow[k] = curr

        return new_bow

    bow_matrix = dv.fit_transform(
        extract_bow_add_to_genres(mi_obj.short_genre, mi_obj.bow) for mi_obj in mi_coll.iterable()
    )

    print("Done with making vector")
    # sort the pairs by the cosine similarity score
    similarity_matrix = cos_sim(bow_matrix)

    print("Done with similarity calculation")
    sorted_list = []
    # sort the similarity scores
    for x, y in itertools.combinations(range(0, len(genres)), 2):
        sorted_list.append(SimilarityScore(genres[x], genres[y], similarity_matrix[x][y]))
    # sort!
    sorted_list = sorted(sorted_list, key=operator.itemgetter(2), reverse=True)

    print("printing file")
    with open(file_name, mode="a", errors="ignore", encoding="latin-1") as file:
        for l in sorted_list:
            file.write("{}, {} value: {}\n".format(l[0], l[1], l[2]))
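# A self-contained sketch (made-up numbers) of the two key steps above: DictVectorizer
# aligns the per-genre bag-of-words dicts into one matrix, and cosine_similarity compares
# every row of that matrix against every other row.
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity

bows = [{"doom": 3.1, "gloom": 1.2}, {"doom": 2.9, "party": 4.0}]
matrix = DictVectorizer().fit_transform(bows)
print(cosine_similarity(matrix))   # 2x2 symmetric matrix with ones on the diagonal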
Example #5
 def __init__(self, tweet_file_path, no_of_clusters):
     """
     The constructor reads csv file and builds the data matrix.
     """
     self.np_extractor = ConllExtractor()
     self.pos_tagger = NLTKTagger()
     self.tweet_file_path = tweet_file_path
     self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
     self.vectorizer = DictVectorizer(sparse=True)
     self.k_means_estimator = KMeans(init="random",
                                     n_clusters=no_of_clusters)
def transform_into_indicators(origin_dest_arr, params):
    print "Fitting the label encoder"
    keys = origin_dest_arr.keys()
    origins = map(lambda x: {'origin': x}, origin_dest_arr['origin'].as_matrix().astype(str))
    dests = map(lambda x: {'dest': x}, origin_dest_arr['dest'].as_matrix().astype(str))
    d = DictVectorizer(sort=False)
    reduce_fn = get_reduce_fn(keys)
    print "mapping"
    #cat = origin_dest_arr.to_dict(orient = 'records')
    cat = map(lambda x: x.__self__, origin_dest_arr.apply(reduce_fn, axis = 1, raw = True, reduce = True))
    print "Transforming"
    return d.fit_transform(cat)
Example #7
def load_vocab_vectorizer(train_set,pickle=True,extra_label="default"):
    train_dv=DictVectorizer()

    words=[dict(itertools.chain(*(train_set_obj.attr_map.items() for train_set_obj in train_set.objects())))]
    #fit the dv first
    train_dv.fit(words)
    print("vocab length is {}".format(len(train_dv.feature_names_)))

    del words

    if pickle:
        pickle_dv(train_dv, extra_label)

    return train_dv
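# Hedged usage sketch (train_set, attr_map and pickle_dv are project-specific assumptions):
# once the vocabulary has been fitted, the same vectorizer maps each object's attribute
# dict onto that fixed feature space.
# dv = load_vocab_vectorizer(train_set, pickle=False)
# X = dv.transform(obj.attr_map for obj in train_set.objects())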
def classify_lyrics_mixed_features(genre_lyrics_map):

    vectorizer = DictVectorizer()

    all_lyrics_features = []
    all_lyrics_genres   = []

    for genre, genre_lyrics in genre_lyrics_map.items():

        for song_lyrics in genre_lyrics:

            features = song_lyrics["features"]

            song_features_map = {}

            for feature_name, feature_value in features.items():

                # pos_tags_map is a dictionary - merge it with song_features_map dictionary
                if feature_name == "pos_tags_map":
                    song_features_map.update(feature_value)

                # All other features are numeric, add their name and value as a new key-value pair to the song_features_map
                else:
                    song_features_map[feature_name] = feature_value

            print("Features: " + str(song_features_map))
            all_lyrics_features.append(song_features_map)
            all_lyrics_genres.append(genre)


    features_train, features_test, genres_train, genres_test = train_test_split(all_lyrics_features, all_lyrics_genres, test_size=0.33)

    vectorizer.fit(all_lyrics_features)

    classifiers_to_use = get_classifiers()
    partial_fit_classifiers = classifiers_to_use["partial"]
    full_fit_classifiers = classifiers_to_use["full"]

    teach_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer, features_train, genres_train, app_data.LYRICS_GENRES_METAL)

    test_classifiers(partial_fit_classifiers, full_fit_classifiers, vectorizer, features_test, genres_test)

    print_top_features(partial_fit_classifiers + full_fit_classifiers, vectorizer, app_data.LYRICS_GENRES_METAL)

    print_classification_report(full_fit_classifiers[0], vectorizer, features_test, genres_test)
Example #9
def cat_vectorize(train_data, test_data, num_cols):
    # categorical attributes
    cat_train_data = train_data.drop(num_cols, axis=1)
    cat_test_data = test_data.drop(num_cols, axis=1)

    cat_train_data.fillna('NA', inplace=True)
    cat_test_data.fillna('NA', inplace=True)

    cat_train_data_values = cat_train_data.T.to_dict().values()
    cat_test_data_values = cat_test_data.T.to_dict().values()

    # vectorize (encode as one hot)
    vectorizer = DictVectorizer(sparse=False)
    vec_train_data = vectorizer.fit_transform(cat_train_data_values)
    vec_test_data = vectorizer.transform(cat_test_data_values)

    return vec_train_data, vec_test_data
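# Hedged call sketch with a made-up two-column frame: 'age' is listed as numeric,
# so only 'color' is one-hot encoded by cat_vectorize.
import pandas as pd

toy_train = pd.DataFrame({"age": [21, 35], "color": ["red", "blue"]})
toy_test = pd.DataFrame({"age": [40], "color": ["red"]})
toy_vec_train, toy_vec_test = cat_vectorize(toy_train, toy_test, num_cols=["age"])
print(toy_vec_train)   # [[0. 1.] [1. 0.]]  -> columns are color=blue, color=red
print(toy_vec_test)    # [[0. 1.]]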
def _get_basic_features_transformator_union():

    union = FeatureUnion(
        transformer_list=[

            ('features_dict', Pipeline([
                                ('selector', LyricsFeatureSelector(key='features')),
                                ('vectorizer', DictVectorizer())
                              ])
            ),

            ('pos_tags_map', Pipeline([
                                ('selector', LyricsFeatureSelector(key='pos_tags_map')),
                                ('vectorizer', DictVectorizer())
                            ])
            ),

            ('word_endings', Pipeline([
                                ('selector', LyricsFeatureSelector(key='word_endings')),
                                ('vectorizer', DictVectorizer()),
                                ('transformer', TfidfTransformer())
                            ])

            ),

            ('lyrics_bow', Pipeline([
                                ('selector', LyricsFeatureSelector(key='lyrics')),
                                ('vectorizer', TfidfVectorizer(stop_words='english', max_df=0.6, analyzer='word'))
                           ])
            )

        ],
        transformer_weights={
            'features_dict': 0.2,
            'pos_tags_map' : 1.0,
            'word_endings' : 0.7,
            'lyrics_bow'   : 0.4
        }
    )

    return union
Example #12
def getCounts(examples):
    lengths = [len(examples[x]) for x in ("sets", "ids")
               ] + [examples[x].shape[0] for x in ("features", "labels")]
    assert len(set(lengths)) == 1, lengths
    data = {}
    print "Counting labels"
    counts = defaultdict(list)
    for sets, labels in zip(examples["sets"], examples["labels"]):
        numLabels = numpy.count_nonzero(labels)
        #category = ",".join(sets)
        categories = getCategories(sets)
        for category in categories:
            counts[category].append(numLabels)
    data["labels"] = counts
    print "Counting features"
    dv = DictVectorizer(sparse=True)
    dv.feature_names_ = examples["feature_names"]
    decoded = dv.inverse_transform(examples["features"])
    for sets, features in zip(examples["sets"], decoded):
        categories = getCategories(sets)
        tags = [x.split(":")[0] for x in features.keys()]
        tagCounts = {x: tags.count(x) for x in set(tags)}
        for tag in tagCounts.keys():
            if tag not in data:
                data[tag] = defaultdict(list)
            counts = data[tag]
            for category in categories:
                counts[category].append(tagCounts[tag])


#         feature_indices = numpy.nonzero(features)
#         for i in feature_indices:
#             name = feature_names[i]
#             if ":" in name:
#                 tag = "coverage_" + name.split(":")[0]
#                 if tag not in data:
#                     data[tag] = defaultdict(list)
#                 counts = data[tag]
#                 for category in sets:
#                     counts[category] += 1
    return dict(data)
Example #13
class EntityDetectionPU(ClassifierMixin):
    def __init__(self, prior=.5, sigma=.1, lam=1, basis='gauss', n_basis=200):
        self.clf = SparsePU_SL(prior=prior,
                               sigma=sigma,
                               lam=lam,
                               basis=basis,
                               n_basis=n_basis)
        # self.clf = PU_SL(prior=prior, sigma=sigma, lam=lam, basis=basis, n_basis=n_basis)
        # self.clf = SVC()
        self.featureizer = DictVectorizer(sparse=True)

    def fit(self, X, y):
        x_feat = self.featureizer.fit_transform(X)
        self.clf.fit(x_feat, y)
        return self

    def predict(self, X):
        x_feat = self.featureizer.transform(X)
        return self.clf.predict(x_feat)

    def predict_sent(self, X_sent):
        for xi in X_sent:
            yield (self.predict([xi]))
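# Hedged usage sketch: the classifier consumes per-token feature dicts. The dicts and
# labels below are invented, and the label convention follows SparsePU_SL
# (positive/unlabeled learning), which is part of the surrounding project.
# clf = EntityDetectionPU(prior=0.3)
# X = [{"word": "Paris", "capitalized": 1}, {"word": "runs", "capitalized": 0}]
# y = [1, 0]
# clf.fit(X, y)
# print(clf.predict([{"word": "London", "capitalized": 1}]))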
Example #14
 def train_backoff_model(self, train):
     """\
     Train a DummyClassifier back-off on the given training data.
     """
     select_attr = [train.attribs[0].name]
     if train.attribs[0].name == self.class_attr:
         select_attr = [train.attribs[1].name]
     config = {
         'class_attr': self.class_attr,
         'select_attr': select_attr,
         'vectorizer': DictVectorizer()
     }
     model = Model(config)
     model.train_on_data(train)
     return model
def _get_advanced_features_transformator_union():

    union = FeatureUnion(transformer_list=[

            # ('verse_count', Pipeline([
            #                     ('selector', LyricsFeatureSelector(key='verse_count')),
            #                     ('transformer', TfidfVectorizer()),
            #                 ])
            # ),
            #
            # ('stanza_count', Pipeline([
            #                     ('selector', LyricsFeatureSelector(key='stanza_count')),
            #                     ('transformer', TfidfVectorizer())
            #                 ])
            # ),
            #
            # ('avg_verse_length', Pipeline([
            #                         ('selector', LyricsFeatureSelector(key='avg_verse_length')),
            #                         ('transformer', TfidfVectorizer())
            #                     ])
            # ),

            ('pos_tags_map', Pipeline([
                ('selector', LyricsFeatureSelector(key='pos_tags_map')),
                ('vectorizer', DictVectorizer())
                # ('transformer', TfidfTransformer())
            ])
             ),

            ('lyrics_bow', Pipeline([
                ('selector', LyricsFeatureSelector(key='lyrics')),
                ('vectorizer', CountVectorizer(max_features=10000))
                # ('transformer', TfidfTransformer())
            ])
             )

        ], transformer_weights={
            # 'verse_count'      : 0.5,
            # 'stanza_count'     : 0.5,
            # 'avg_verse_length' : 0.8,
            'pos_tags_map': 0.6,
            'lyrics_bow'  : 0.9
        }

    )

    return union
Example #16
class KMeansEstimator:
    """
    This class reads the tweets of users from a file and builds cluster centers on that data. It also provides
    method for finding the closest cluster center of unseen data.
    """

    ADJECTIVE = 'JJ'
    """
    Feature keys used in clustering...
    """
    POLARITY_FEATURE_KEY = 'polarity'
    SUBJECTIVITY_FEATURE_KEY = 'subjectivity'
    TWEET_COUNT_FEATURE_KEY = 'tweetCount'
    """
    Features not considered for clustering...
    """
    USER_ID_FEATURE_KEY = 'userId'
    LONGITUDE_FEATURE_KEY = 'longitude'
    LATITUDE_FEATURE_KEY = 'latitude'
    """
    Predicted label feature name.
    """
    LABEL_FEATURE_KEY = 'label'

    RELEVENT_FEATURE_LIST = [
        USER_ID_FEATURE_KEY, LATITUDE_FEATURE_KEY, LONGITUDE_FEATURE_KEY,
        LABEL_FEATURE_KEY
    ]

    def __init__(self, tweet_file_path, no_of_clusters):
        """
        The constructor reads csv file and builds the data matrix.
        """
        self.np_extractor = ConllExtractor()
        self.pos_tagger = NLTKTagger()
        self.tweet_file_path = tweet_file_path
        self.data_matrix = self.__get_data_matrix_from_file(tweet_file_path)
        self.vectorizer = DictVectorizer(sparse=True)
        self.k_means_estimator = KMeans(init="random",
                                        n_clusters=no_of_clusters)

    @time_it
    def __get_data_matrix_from_file(self, tweet_file_path):
        """
        Reads tweets from csv file at path "tweet_file_path", extracts features from the tweets and returns list
        of all feature vectors.
        """
        file_reader = csv.reader(open(tweet_file_path, "rb"), delimiter=',')
        next(file_reader)
        data_matrix = []
        for row in file_reader:
            logging.info("Extracting features for user_id:%s", row[0])
            feature_vector = {}
            feature_vector[self.USER_ID_FEATURE_KEY] = int(row[0])
            feature_vector[self.LATITUDE_FEATURE_KEY] = float(row[2])
            feature_vector[self.LONGITUDE_FEATURE_KEY] = float(row[3])
            feature_vector[self.TWEET_COUNT_FEATURE_KEY] = int(row[4])
            feature_vector.update(
                self.__get_features_from_tweet_text(row[1].decode('utf-8')))
            data_matrix.append(feature_vector)
            logging.info("Successfully extracted features for user_id:%s",
                         row[0])
        return data_matrix

    @time_it
    def __get_features_from_tweet_text(self, tweet_text):
        """This function returns the following features from the tweet text:
        - Adjectives and their corresponding frequencies found in the tweet. Each adjective is a separate feature.
        - Subjectivity and polarity as determined by TextBlob.
        :returns:  (key,value) map of all features found. 
        """
        text_blob = TextBlob(tweet_text,
                             np_extractor=self.np_extractor,
                             pos_tagger=self.pos_tagger)
        adjective_map = dict(
            Counter((ele[0] for ele in set(text_blob.pos_tags)
                     if ele[1] == self.ADJECTIVE)))
        polarity = text_blob.sentiment[0]
        subjectivity = text_blob.sentiment[1]
        return dict(
            adjective_map.items() + {
                self.POLARITY_FEATURE_KEY: polarity,
                self.SUBJECTIVITY_FEATURE_KEY: subjectivity
            }.items())

    @time_it
    def __get_clustering_data_matrix(self, data_matrix):
        """
        This method removes unnecessary features(features like user_id which are not relevant for building cluster centers) from
        the data matrix and returns a copy of the data matrix.
        """
        data_matrix_copy = copy.deepcopy(data_matrix)
        for feature_vector in data_matrix_copy:
            feature_vector.pop(self.USER_ID_FEATURE_KEY)
            feature_vector.pop(self.LATITUDE_FEATURE_KEY)
            feature_vector.pop(self.LONGITUDE_FEATURE_KEY)
        return data_matrix_copy

    @time_it
    def perform_clustering(self, features_to_include=None):
        """
        This function performs k-means clustering with "no_of_clusters" clusters of the data present in file at
        "tweet_file_path".
        It returns list of feature vector, where each feature vector contains only "features_to_include" or all features
        if "features_to_include" is None.
        """
        clustering_data_matrix = self.__get_clustering_data_matrix(
            self.data_matrix)
        transformed_data_matrix = self.vectorizer.fit_transform(
            clustering_data_matrix)

        self.k_means_estimator.fit(transformed_data_matrix, y=None)
        return self.__get_predicted_labels(self.data_matrix,
                                           features_to_include)

    @time_it
    def __get_predicted_labels(self, data_matrix, features_to_include):
        """
        Finds the nearest cluster for all data points and adds a new feature label in all feature vectors of data matrix. The
        data matrix is modified in place.
        It returns a new copy of data_matrix with "features_to_include" features.
        """
        feature_names = self.vectorizer.get_feature_names()
        for feature_vector in data_matrix:
            row = [0] * len(feature_names)
            column = range(len(feature_names))
            data = map(
                lambda feature_name: feature_vector[feature_name]
                if feature_name in feature_vector else 0, feature_names)
            feature_csr_matrix = csr_matrix(coo_matrix((data, (row, column))))
            predicted_label = self.k_means_estimator.predict(
                feature_csr_matrix)
            feature_vector[self.LABEL_FEATURE_KEY] = predicted_label[0]

        expanded_data_matrix = self.__get_expanded_data_matrix(data_matrix)
        if features_to_include:
            return self.__get_filtered_data_matrix(expanded_data_matrix,
                                                   features_to_include)
        else:
            return expanded_data_matrix

    @time_it
    def __get_filtered_data_matrix(self, data_matrix, features_to_include):
        """
        Removes all features except features_to_include
        """
        filtered_data_matrix = []
        for feature_vector in data_matrix:
            filtered_feature_vector = {}
            for feature_name in features_to_include:
                filtered_feature_vector[feature_name] = feature_vector[
                    feature_name]
            filtered_data_matrix.append(filtered_feature_vector)
        return filtered_data_matrix

    @time_it
    def __get_expanded_data_matrix(self, data_matrix):
        """
        Adds new keys for missing features to all feature vectors of data_matrix. The data matrix is not modified, but a new 
        modified copy is returned.
        """
        feature_names = self.vectorizer.get_feature_names()
        expanded_data_matrix = copy.deepcopy(data_matrix)
        for feature_vector in expanded_data_matrix:
            for feature_name in feature_names:
                if feature_name not in feature_vector:
                    feature_vector[feature_name] = 0
        return expanded_data_matrix

    @time_it
    def predict_labels_for_data(self, file_path, features_to_include=None):
        """
        This function reads the tweets of different users from the file at file_path and assigns the closest 
        cluster center to each user.
        It returns list of tuples of (user_id,predicted_label,latitude, longitude).
        """
        data_matrix = self.__get_data_matrix_from_file(file_path)
        return self.__get_predicted_labels(data_matrix, features_to_include)
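# Hedged usage sketch (the CSV path and cluster count are placeholders; the file must
# contain the user id, tweet text, latitude, longitude and tweet count columns that
# __get_data_matrix_from_file expects):
# estimator = KMeansEstimator("tweets.csv", no_of_clusters=5)
# labeled_users = estimator.perform_clustering(
#     features_to_include=KMeansEstimator.RELEVENT_FEATURE_LIST)
# for user in labeled_users[:3]:
#     print(user["userId"], user["label"])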
Example #17
            combination_features.update(combs_i_dict)
        id_features[id].update(combination_features)
    if add_location:
        id_features[id][id_location[id]] = '1'
    id_features[id]['feature_count'] = float(id_feature_count[id])
    id_features[id]['event_count'] = id_event_count[id]
    id_features[id]['resource_count'] = id_resource_count[id]

train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0': 0, '1': 1, '2': 2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)
vectorizer = DictVectorizer()

X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt',
               X_train.toarray(),
               delimiter=',',
               header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
Example #18
def vectorize(data):
    transformer = DictVectorizer()
    values = flatten([make_features(x) for x in data])
    X = transformer.fit_transform([x["x"] for x in values]).toarray()
    Y = array([x["y"] for x in values])
    return (X, Y, values)
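# Hedged sketch of the same pattern with plain dicts (make_features and flatten are
# project-specific, so invented feature dicts stand in for their output here):
# fit_transform followed by .toarray() densifies the matrix, which is fine for small
# feature sets.
from numpy import array
from sklearn.feature_extraction import DictVectorizer

rows = [{"x": {"len": 4, "caps": 1}, "y": 1}, {"x": {"len": 2, "caps": 0}, "y": 0}]
dv = DictVectorizer()
X = dv.fit_transform([r["x"] for r in rows]).toarray()
Y = array([r["y"] for r in rows])
print(X.shape, Y)   # (2, 2) [1 0]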
Example #20
    # This controls which features of the input ARFF file will be actually
    # used in training (do not select the target class feature!)
    'select_attr': [
        'Lemma', 'LemmaSuff_1', 'LemmaSuff_2', 'LemmaSuff_3', 'LemmaSuff_4',
        'LemmaSuff_5', 'LemmaSuff_6', 'LemmaSuff_7', 'LemmaSuff_8', 'Tag_POS',
        'Tag_SubPOS', 'Tag_Gen', 'Tag_Num', 'Tag_Cas', 'Tag_PGe', 'Tag_PNu',
        'Tag_Per', 'Tag_Ten', 'Tag_Gra', 'Tag_Neg', 'Tag_Voi', 'Tag_Var'
    ],

    # This filters out some feature values (here 'Tag_*' values equal to '.' or '-').
    # You can use an arbitrary lambda function here (or None if you don't want it).
    'filter_attr':
    lambda key, val: False
    if key.startswith('Tag') and val in ['.', '-'] else True,
    'vectorizer':
    DictVectorizer(),

    # Feature filtering using ANOVA (recommended)
    'feature_filter':
    SelectPercentile(percentile=10),

    # You can use any Scikit-Learn classifier here
    'classifier_class':
    LogisticRegression,

    # Classifier parameter settings (see Scikit-Learn documentation for the list of parameters).
    # If you use lists instead of single values and specify the unfold_pattern, all the values
    # in the lists will be tried in parallel on a cluster using qsub).
    # Do not use lists of values and the unfold_pattern setting if you don't have access to
    # cluster/qsub.
    'classifier_params': {
y_test = y_test.astype(int)

# categorical attributes

cat_train = X_train.drop(numeric_cols, axis=1)
cat_test = X_test.drop(numeric_cols, axis=1)

cat_train.fillna('NA', inplace=True)
cat_test.fillna('NA', inplace=True)

x_cat_train = cat_train.T.to_dict().values()
x_cat_test = cat_test.T.to_dict().values()

# vectorize (encode as one hot)

vectorizer = DictVectorizer(sparse=False)
vec_x_cat_train = vectorizer.fit_transform(x_cat_train)
vec_x_cat_test = vectorizer.transform(x_cat_test)

# build the feature vector

x_train = np.hstack((x_num_train, vec_x_cat_train))
x_test = np.hstack((x_num_test, vec_x_cat_test))


#clfLR = LogisticRegression().fit(x_train, y_train.values)
#pred = clfLR.predict(x_test)
#print classification_report(y_test.values, pred, digits=4)
#print accuracy_score(y_test.values, pred)

clfTree = tree.DecisionTreeClassifier().fit(x_train, y_train)
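# Hedged follow-up sketch, mirroring the commented-out LogisticRegression evaluation above:
from sklearn.metrics import classification_report, accuracy_score

pred = clfTree.predict(x_test)
print(classification_report(y_test, pred, digits=4))
print(accuracy_score(y_test, pred))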
Example #23
	def get_data_social(self, ids) :
		'''
		Read the social features from the database.
		'''
#		data = self.get_social_features(ids)
		
		# First get some aggregated values
		boards_info = self.get_boards_info()
		repinned_info = self.get_repinned_info()
		
		uncateg, categ_entropy =  self.get_users_categories_features()

		query = """SELECT p.id as pin_id, 
							 u.id as user_id, 
							 p.nComments as comments, 
							 p.category as category, 
							 p.description as description, 
							 p.isRepin as is_repin,
							 p.date as date,
							 u.gender as gender, 
							 u.nFollowers as followers, 
							 u.nFollowing as following, 
							 u.nPins as pins,
							 u.nBoards as boards,
							 (u.website != "null") as has_website, 
							 p.board_id as board_id
							 FROM pins p JOIN users u ON p.user_id = u.id"""

		# Make query, get results and represent as map {pin_id: data} for quick access
		c = self.db.cursor()
		c.execute(query)
		rows_map = {row[0]: row[1:] for row in c.fetchall()}
		c.close()

		# Store concepts as a dict per row (pin) 
		data = [] 
		for pin_id in ids:

			(user_id, ncomments, categ, desc, is_repin, date, gender, nfollowers, nfollowing, npins, nboards, has_web, board_id) = rows_map[pin_id]

			f = {}

			# Convert to string to emphasize that this feature is categorical
#			f["ncomments"] = ncomments
			f["category"] = categ
			f["description_len"] = len(desc)
			f["is_repin"] = is_repin
			f["gender"] = gender
#			f["user_followers"] = nfollowers
			f["user_following"] = nfollowing
			f["users_pins"] = npins
			f["users_boards"] = nboards
			f["has_website"] = has_web

			f["is_product"] = (1 if '$' in desc else 0)
			f["day_of_the_week"] = (date.strftime("%a") if (date) else "")

			if nfollowers == 0 : 
				nfollowers = 1

#			f["follow_ratio"] = float(nfollowing)/nfollowers

			board_pins, board_followers = boards_info[board_id]
			f["board_pins"] = board_pins            # Total pins of the board
#			f["board_followers"] = board_followers  # Total followers of the board

			f["category_entropy"] = categ_entropy[user_id]
			f["uncategorized"] = uncateg[user_id]
			f["repinned"] = repinned_info[user_id]

			data.append(f)
			
	# 	data = data[0:4,:]
	
		# Convert categorical features to numerical representation 
		vec = DictVectorizer()
		data = vec.fit_transform(data).toarray()
		return vec.get_feature_names(), data
Example #24
class Model(object):
    """
    Class for abstracting the different classification models.
    """
    
    def __init__(self, train_tweets, train_targets, vect_options, tfidf_options, extra_params):
        self.grid_params = {
#                            'vect__ngram_range': [(1,1),(1,2),(2,2)],
#                      'tfidf__use_idf': (True,False),
#                      'tfidf__smooth_idf': (True, False),
#                      'tfidf__sublinear_tf': (True, False),
                      }
        
        self.grid_params = dict(self.grid_params.items()+extra_params.items())
        self.vect_options = vect_options
        self.tfidf_options = tfidf_options
        self.feature_set = {}
        self.train_tweets = train_tweets
        self.train_targets = train_targets
        self.only_text_features = False
        
    def train_on_feature_set(self, cross_validate=True, use_tfidf=True):
        """
        Performs training with the given model using the given feature set
        """
        #Establish document text feature vectors
        print "Vectorizing"
#        self.tokenizer = CountVectorizer().build_tokenizer()
        
        
        self.vect = CountVectorizer(**self.vect_options)
        self.tfidf_transformer = TfidfTransformer(**self.tfidf_options)
        self.dict_transformer = TfidfTransformer(**self.tfidf_options)
#        train_counts_tf = tfidf_transformer.fit_transform(train_counts)
        
        count_vector = self.vect.fit_transform([t.text for t in self.train_tweets])
        tfidf_count = self.tfidf_transformer.fit_transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            self.dict_vectorizer = DictVectorizer()
            dict_vector = self.dict_vectorizer.fit_transform(self.feature_set)
            
            f=codecs.open("feature_set.txt", "w", "utf8")
            for d in dict_vector:
                f.write(d.__str__())
            f.close()
            tfidf_dict = self.dict_transformer.fit_transform(dict_vector)
            f=codecs.open("feature_set_tdidf.txt", "w", "utf8")
            for d in tfidf_dict:
                f.write(d.__str__())
            f.close()
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
#        combined_features = FeatureUnion()
        #Crossvalidation
        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        #Build a Pipeline with TFidfVectorizer and classifier
        pipeline_classifier = Pipeline([
#                                        ('vect', self.vect),
#                                    ('tfidf', self.tfidf_transformer),
                                    ('clf', self.classifier)
                                    ])
        
        #Perform grid search
        print "Performing grid search with classifier of instance ",str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1,verbose=1)

        self.grid.fit(combined_vector, self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score)
        return self.grid
        
    def grid_search_on_text_features(self, cross_validate=True, file_postfix=""):
        """
        Performs a grid search using text features on the given dataset. Stores the parameters for the optimal classifier.
        """
        
        self.grid_params = {
                    'vect__ngram_range': [(1,1),(1,2),(2,2),(1,3),(2,3),(3,3),(1,4)],
              'vect__use_idf': (True,False),
              'vect__smooth_idf': (True, False),
              'vect__sublinear_tf': (True, False),
              'vect__max_df': (0.5,),
              }
        self.vect = TfidfVectorizer()

        cross_validation = StratifiedKFold(self.train_targets, n_folds=10)
        
        #Build a Pipeline with TFidfVectorizer and classifier
        pipeline_classifier = Pipeline([
                                        ('vect', self.vect),
                                    ('clf', self.classifier)]
                                       )
        
        #Perform grid search
        print "Performing grid search with classifier of instance ",str(self.classifier.__class__.__name__)
        self.grid = GridSearchCV(pipeline_classifier, self.grid_params, cv=cross_validation, refit=True, n_jobs=-1,verbose=1)

        self.grid.fit([t.text for t in self.train_tweets], self.train_targets)
        
        self.best_estimator = self.grid.best_estimator_
        self.best_parameters = self.grid.best_params_
        self.best_score = self.grid.best_score_
        
        
        print "Results for ",self.classifier.__class__.__name__
        print "Best params: ", self.best_parameters
        print "Best score: ", self.best_score
        
        print "Storing estimator... "        
        utils.store_model(self.classifier.__class__.__name__, self.best_parameters, self.best_score, file_postfix=file_postfix)
        return self.grid

    def classify(self, tweets, sentimentvalues=None):
        """
        Performs the classification process on list of tweets.
        """
        if sentimentvalues!=None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
                
        predictions = self.best_estimator.predict(combined_vector)

        return predictions

    def classify_text(self, texts):
        """
        Performs classification with only text features.
        """
        
        count_vector = self.vect.transform([t for t in texts])
        text_vector = self.tfidf_transformer.transform(count_vector)
        predictions = self.best_estimator.predict(text_vector)

        return predictions
        
    def test_and_return_results(self, test_tweets, test_targets, sentimentvalues):
        """
        Tests the classifier on a given test set, and returns the accuracy, precision, recall, and f1 score.
        """
        self.test_words_and_values = sentimentvalues
        predictions = self.classify(test_tweets)
        binary_predictions = utils.reduce_targets(predictions)
        binary_test_targets = utils.reduce_targets(test_targets)
        
        accuracy = metrics.accuracy_score(binary_test_targets, binary_predictions)
        precision = metrics.precision_score(binary_test_targets, binary_predictions)
        recall = metrics.recall_score(binary_test_targets, binary_predictions)
        f1_score = metrics.f1_score(binary_test_targets, binary_predictions)
        print "Scores:  ", accuracy, precision, recall, f1_score
        
        return accuracy, precision, recall, f1_score
    
    def get_correctly_classified_tweets(self, tweets_and_sentiment):
        """
        Classifies the given set of tweets and returns the ones that were correctly classified.
        """
        tweets, sentimentvalues = zip(*tweets_and_sentiment)
        if sentimentvalues!=None:
            self.test_words_and_values = sentimentvalues
        count_vector = self.vect.transform([t.text for t in tweets])
        tfidf_count = self.tfidf_transformer.transform(count_vector)
        if self.only_text_features:
            combined_vector = tfidf_count
        else:
            dict_vector = self.dict_vectorizer.transform([features.get_feature_set(t, self.featureset, v) for t,v in zip(tweets, self.test_words_and_values)])
            tfidf_dict = self.dict_transformer.transform(dict_vector)
            combined_vector = sp.hstack([tfidf_count, tfidf_dict])
                
        predictions = self.best_estimator.predict(combined_vector)
        tweets, targets = utils.make_subjectivity_targets(tweets)
        #return the tweets where the target match prediction
        correct_tweets = []
        correct_sentimentvalues = []
        for i in xrange(len(tweets)):
            if predictions[i]==targets[i]:
                correct_tweets.append(tweets[i])
                correct_sentimentvalues.append(sentimentvalues[i])
        return correct_tweets, correct_sentimentvalues
    
    def set_feature_set(self, featureset, sentimentvalues):
        """
        Extracts and stores the given feature set for classification.
        """
        self.featureset = featureset
        if featureset=='SA' or featureset=='PA':
            self.only_text_features=True
            self.feature_set = {}
        else:
            words_and_values = sentimentvalues
            self.feature_set = [features.get_feature_set(t, self.featureset, v) for t,v in zip(self.train_tweets,words_and_values)]
        
                
Example #26
        id_features[id].update(combination_features)    
    if add_location:                  
        id_features[id][id_location[id]] = '1' 
    id_features[id]['feature_count'] = float(id_feature_count[id])
    id_features[id]['event_count'] = id_event_count[id]
    id_features[id]['resource_count'] = id_resource_count[id]
    
    
train_ids = sorted(id_location_train.keys())
test_ids = sorted(id_location_test.keys())
train_features = [id_features[id] for id in train_ids]
test_features = [id_features[id] for id in test_ids]
labels = {'0':0, '1':1, '2':2}
train_labels = [labels[id_severity_train[id]] for id in train_ids]
test_fake_labels = [train_labels[0]] * len(test_ids)
vectorizer = DictVectorizer()

X_train = vectorizer.fit_transform(train_features)
features = vectorizer.get_feature_names()
save_train_features = False
if save_train_features:
    np.savetxt('x_train.txt', X_train.toarray(), delimiter=',', header=','.join(features))

X_test = vectorizer.transform(test_features)

#scaler = prep.MinMaxScaler(feature_range=(0, 1), copy=True)
scaler = prep.StandardScaler(copy=True, with_mean=True, with_std=True)
X_train = scaler.fit_transform(X_train.toarray())
X_test = scaler.transform(X_test.toarray())

do_feature_elimination = False