Example #1
0
    def transform(self, records):
        """
        Builds the feature matrix and the label vector for a list of records.

        As a side effect this stores the number of features (taken from the
        metrics of the first record) in ``self.num_features`` and the
        per-column minimum and maximum of the raw metrics in
        ``self.min_values`` and ``self.max_values``. Those bounds are then
        handed to ``review_metrics_extractor.normalize_matrix_by_columns``.

        :type records: list[dict]
        :param records: a non-empty list of dictionaries with the reviews;
        every dictionary is read through its 'specific' key
        :return: a tuple with the metrics matrix (X) and a boolean vector (y)
        which is True where the 'specific' value of the record equals 'yes'
        """
        extract = review_metrics_extractor.get_review_metrics

        # The width of the matrix is dictated by the first record's metrics
        self.num_features = len(extract(records[0]))
        metrics = numpy.zeros((len(records), self.num_features))

        for row, record in enumerate(records):
            metrics[row] = extract(record)

        # Column bounds are taken before normalizing so they describe the raw
        # values
        self.min_values = metrics.min(axis=0)
        self.max_values = metrics.max(axis=0)
        # The return value is discarded, so the helper presumably rescales
        # ``metrics`` in place -- confirm against the helper
        review_metrics_extractor.normalize_matrix_by_columns(
            metrics, self.min_values, self.max_values)

        is_specific = [record['specific'] == 'yes' for record in records]
        labels = numpy.array(is_specific)

        return metrics, labels
Example #2
0
def transform(records):
    """
    Converts a list of review records into a feature matrix and a label
    vector that can be fed to the functions available in scikit-learn.

    The number of columns is the length of the metrics extracted from the
    first record. The per-column minimum and maximum of the raw matrix are
    passed to ``review_metrics_extractor.normalize_matrix_by_columns``.

    :type records: list[dict]
    :param records: a non-empty list of dictionaries with the reviews; every
    dictionary is read through its 'specific' key
    :return: a tuple ``(x_matrix, y_vector)`` where ``x_matrix`` holds one
    row of metrics per record and ``y_vector`` is a boolean array that is
    True where the 'specific' value of the record equals 'yes'
    """
    extract_metrics = review_metrics_extractor.get_review_metrics

    feature_count = len(extract_metrics(records[0]))
    x_matrix = numpy.zeros((len(records), feature_count))
    for row, record in enumerate(records):
        x_matrix[row] = extract_metrics(record)

    column_min = x_matrix.min(axis=0)
    column_max = x_matrix.max(axis=0)
    # The return value is discarded, so the helper presumably rescales
    # ``x_matrix`` in place -- confirm against the helper
    review_metrics_extractor.normalize_matrix_by_columns(
        x_matrix, column_min, column_max)

    y_vector = numpy.array(
        [record['specific'] == 'yes' for record in records])

    return x_matrix, y_vector
def plot(records):
    """
    Draws a scatter plot of the first two review metrics together with the
    decision boundary of a logistic regression fitted on all the metrics.

    The function only adds artists to the current matplotlib figure; it does
    not show or save it.

    :type records: list[dict]
    :param records: a non-empty list of dictionaries with the reviews; every
    dictionary is read through its 'specific' key
    """
    extract = review_metrics_extractor.get_review_metrics

    num_features = len(extract(records[0]))
    metrics = numpy.zeros((len(records), num_features))
    for index, record in enumerate(records):
        metrics[index] = extract(record)

    review_metrics_extractor.normalize_matrix_by_columns(metrics)
    # True marks a specific review, False a generic one
    labels = numpy.array([record['specific'] == 'yes' for record in records])

    clf = LogisticRegression(C=100)
    clf.fit(metrics, labels)

    coef = clf.coef_[0]
    intercept = clf.intercept_

    print('coef', coef)

    # Boundary: coef[0] * x + coef[1] * y + intercept = 0, solved for y.
    # Only the first two coefficients are used, so the line is exact only
    # when the metrics have exactly two columns.
    xvals = numpy.linspace(0, 1.0, 2)
    yvals = -(coef[0] * xvals + intercept[0]) / coef[1]
    plt.plot(xvals, yvals, color='g', label='Decision boundary')

    plt.xlabel("log number of words (normalized)")
    plt.ylabel("log number of verbs in past tense (normalized)")
    # Outcome 0 (False) selects the generic reviews and outcome 1 (True) the
    # specific ones, so the legend texts must follow that order.
    my_legends = ['Generic reviews', 'Specific reviews']
    for outcome, marker, colour, legend in zip([0, 1], "ox", "br", my_legends):
        selected = labels == outcome
        plt.scatter(
            metrics[:, 0][selected],
            metrics[:, 1][selected], c=colour, marker=marker,
            label=legend)
    plt.legend(loc='lower left', numpoints=1, ncol=3, fontsize=8,
               bbox_to_anchor=(0, 0))
    def predict(self, reviews):
        """
        Predicts the class of every review with the stored classifier.

        The metrics of each review are extracted into a matrix with
        ``self.num_features`` columns, normalized with ``self.min_values``
        and ``self.max_values`` and passed to ``self.classifier.predict``.

        :param reviews: a sequence of reviews accepted by
        ``review_metrics_extractor.get_review_metrics``
        :return: whatever ``self.classifier.predict`` returns for the
        normalized metrics matrix
        """
        feature_matrix = numpy.zeros((len(reviews), self.num_features))
        extract = review_metrics_extractor.get_review_metrics
        for row, review in enumerate(reviews):
            feature_matrix[row] = extract(review)

        # Reuse the bounds stored on the instance instead of recomputing them
        # from the reviews that are being predicted
        review_metrics_extractor.normalize_matrix_by_columns(
            feature_matrix, self.min_values, self.max_values)
        return self.classifier.predict(feature_matrix)
Example #5
0
def cluster_reviews(reviews):
    """
    Splits a list of reviews into two groups (specific and generic) with
    k-means and returns one label per review.

    The cluster whose metrics add up to the larger total is the one that
    ends up with label 0 (treated as the specific reviews); the other one
    gets label 1 (generic). When k-means already assigned the labels that
    way its ``labels_`` attribute is returned untouched, otherwise a new
    Python list with the labels swapped is returned.

    :param reviews: a list of reviews. Each review must contain the text of
    the review and the part-of-speech tags for every word
    :type reviews: list[Review]
    :return one label per review, in the same order as the reviews: 0 if the
    review is specific or 1 if the review is generic
    """

    def cluster_total(cluster):
        # Adds up every metric of every row that belongs to the cluster
        return reduce(lambda total, row: total + sum(row), cluster, 0)

    metrics = np.zeros((len(reviews), NUM_FEATURES))
    for row, review in enumerate(reviews):
        metrics[row] = review_metrics_extractor.get_review_metrics(review)
    review_metrics_extractor.normalize_matrix_by_columns(metrics)

    k_means = KMeans(n_clusters=2)
    k_means.fit(metrics)
    labels = k_means.labels_

    record_clusters = split_list_by_labels(metrics, labels)
    total_0 = cluster_total(record_clusters[0])
    total_1 = cluster_total(record_clusters[1])

    if total_0 < total_1:
        # Cluster 0 holds the generic reviews, so the tags are swapped
        return [int(element == 0) for element in labels]

    return labels
Example #6
0
    def train(self, records, reviews=None):
        """
        Fits ``self.classifier`` with the metrics of the given reviews.

        The per-column minimum and maximum of the raw metrics are stored in
        ``self.min_values`` and ``self.max_values`` and used to normalize
        the matrix before fitting.

        :type records: list[dict]
        :param records: a list of dictionaries with the reviews; the
        'specific' key of each one provides the label and, when ``reviews``
        is not given, the 'text' key is used to build the review
        :param reviews: optional list of reviews, one per record. When it is
        None a ``Review`` is created from the text of every record
        :raise ValueError: if ``records`` and ``reviews`` differ in length
        """
        if reviews is None:
            reviews = [Review(record['text']) for record in records]

        if len(records) != len(reviews):
            raise ValueError(
                'The size of the records and reviews arrays must be the same')

        metrics = numpy.zeros((len(reviews), self.num_features))
        extract = review_metrics_extractor.get_review_metrics
        for row, review in enumerate(reviews):
            metrics[row] = extract(review)

        # Bounds are taken from the raw values and kept on the instance
        self.min_values = metrics.min(axis=0)
        self.max_values = metrics.max(axis=0)
        review_metrics_extractor.normalize_matrix_by_columns(
            metrics, self.min_values, self.max_values)

        is_specific = [record['specific'] == 'yes' for record in records]
        self.classifier.fit(metrics, numpy.array(is_specific))
Example #7
0
    def score(self, records):
        """
        Scores the stored classifier against a list of labelled records.

        The metrics of every record are extracted into a matrix with
        ``self.num_features`` columns and normalized with
        ``self.min_values`` and ``self.max_values``. The expected label of a
        record is True when its 'specific' value equals 'yes'.

        :type records: list[dict]
        :param records: a list of dictionaries with the reviews
        :return: the value returned by ``self.classifier.score``
        """
        feature_matrix = numpy.zeros((len(records), self.num_features))
        extract = review_metrics_extractor.get_review_metrics
        for row, record in enumerate(records):
            feature_matrix[row] = extract(record)

        review_metrics_extractor.normalize_matrix_by_columns(
            feature_matrix, self.min_values, self.max_values)

        expected = numpy.array(
            [record['specific'] == 'yes' for record in records])

        return self.classifier.score(feature_matrix, expected)
Example #8
0
    def score(self, records):
        """
        Evaluates ``self.classifier`` on the given records.

        One row of metrics is built per record (``self.num_features``
        columns), the matrix is normalized with the bounds stored in
        ``self.min_values`` and ``self.max_values``, and the result is
        compared with the labels derived from the 'specific' key of each
        record ('yes' maps to True).

        :type records: list[dict]
        :param records: a list of dictionaries with the reviews
        :return: the result of ``self.classifier.score`` for the normalized
        metrics and the expected labels
        """
        num_records = len(records)
        x_matrix = numpy.zeros((num_records, self.num_features))
        for position, record in enumerate(records):
            x_matrix[position] = \
                review_metrics_extractor.get_review_metrics(record)

        review_metrics_extractor.normalize_matrix_by_columns(
            x_matrix, self.min_values, self.max_values)

        y_flags = [record['specific'] == 'yes' for record in records]
        y_vector = numpy.array(y_flags)

        return self.classifier.score(x_matrix, y_vector)
Example #9
0
def plot(records):
    """
    Draws a scatter plot of the first two review metrics together with the
    decision boundary of a logistic regression fitted on all the metrics.

    The function only adds artists to the current matplotlib figure; it does
    not show or save it.

    :type records: list[dict]
    :param records: a non-empty list of dictionaries with the reviews; every
    dictionary is read through its 'specific' key
    """
    extract = review_metrics_extractor.get_review_metrics

    num_features = len(extract(records[0]))
    metrics = numpy.zeros((len(records), num_features))
    for index, record in enumerate(records):
        metrics[index] = extract(record)

    review_metrics_extractor.normalize_matrix_by_columns(metrics)
    # True marks a specific review, False a generic one
    labels = numpy.array([record['specific'] == 'yes' for record in records])

    clf = LogisticRegression(C=100)
    clf.fit(metrics, labels)

    coef = clf.coef_[0]
    intercept = clf.intercept_

    print('coef', coef)

    # Boundary: coef[0] * x + coef[1] * y + intercept = 0, solved for y.
    # Only the first two coefficients are used, so the line is exact only
    # when the metrics have exactly two columns.
    xvals = numpy.linspace(0, 1.0, 2)
    yvals = -(coef[0] * xvals + intercept[0]) / coef[1]
    plt.plot(xvals, yvals, color='g', label='Decision boundary')

    plt.xlabel("log number of words (normalized)")
    plt.ylabel("log number of verbs in past tense (normalized)")
    # Outcome 0 (False) selects the generic reviews and outcome 1 (True) the
    # specific ones, so the legend texts must follow that order.
    my_legends = ['Generic reviews', 'Specific reviews']
    for outcome, marker, colour, legend in zip([0, 1], "ox", "br", my_legends):
        selected = labels == outcome
        plt.scatter(metrics[:, 0][selected],
                    metrics[:, 1][selected],
                    c=colour,
                    marker=marker,
                    label=legend)
    plt.legend(loc='lower left',
               numpoints=1,
               ncol=3,
               fontsize=8,
               bbox_to_anchor=(0, 0))
Example #10
0
def main():
    """
    Compares several scikit-learn classifiers on the classified reviews.

    Loads the records (JSON) and the pickled reviews of ``item_type``,
    builds and normalizes the metrics matrix, prints how many records are
    specific and generic (count and percentage), evaluates every classifier
    with 5-fold cross validation, prints the mean and standard deviation of
    the fold scores of each classifier and finally calls ``plot``.
    """
    item_type = 'hotel'
    # item_type = 'restaurant'
    my_folder = '/Users/fpena/UCC/Thesis/datasets/context/'
    my_file = my_folder + 'classified_' + item_type + '_reviews.json'
    binary_reviews_file = my_folder + 'classified_' + item_type + '_reviews.pkl'
    my_records = ETLUtils.load_json_file(my_file)

    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only use it on files that come from a trusted source.
    with open(binary_reviews_file, 'rb') as read_file:
        my_reviews = pickle.load(read_file)

    num_features = 2

    my_metrics = numpy.zeros((len(my_reviews), num_features))
    for index, review in enumerate(my_reviews):
        my_metrics[index] =\
            review_metrics_extractor.get_review_metrics(review)

    review_metrics_extractor.normalize_matrix_by_columns(my_metrics)

    # Records whose 'specific' value is neither 'yes' nor 'no' are counted
    # in neither group
    count_specific = 0
    count_generic = 0
    for record in my_records:
        if record['specific'] == 'yes':
            count_specific += 1
        if record['specific'] == 'no':
            count_generic += 1

    num_records = len(my_records)
    print('count_specific: %d' % count_specific)
    print('count_generic: %d' % count_generic)
    # The values are printed followed by a '%' sign, so the fractions have to
    # be scaled to the 0-100 range
    print('specific percentage: %f%%' % (100.0 * count_specific / num_records))
    print('generic percentage: %f%%' % (100.0 * count_generic / num_records))

    my_labels = numpy.array([record['specific'] == 'yes' for record in my_records])

    classifiers = [
        DummyClassifier(strategy='most_frequent', random_state=0),
        DummyClassifier(strategy='stratified', random_state=0),
        DummyClassifier(strategy='uniform', random_state=0),
        # DummyClassifier(strategy='constant', random_state=0, constant=True),
        LogisticRegression(C=100),
        SVC(C=1.0, kernel='rbf'),
        SVC(C=1.0, kernel='linear'),
        KNeighborsClassifier(n_neighbors=10),
        tree.DecisionTreeClassifier(),
        NuSVC(),
        LinearSVC()
    ]
    scores = [[] for _ in classifiers]

    Xtrans = my_metrics

    # NOTE(review): KFold(n=..., n_folds=...) iterated directly looks like the
    # legacy sklearn.cross_validation API -- confirm the installed
    # scikit-learn still provides it.
    cv = KFold(n=len(my_metrics), n_folds=5)

    for clf, clf_scores in zip(classifiers, scores):
        for train, test in cv:
            x_train, y_train = Xtrans[train], my_labels[train]
            x_test, y_test = Xtrans[test], my_labels[test]

            clf.fit(x_train, y_train)
            clf_scores.append(clf.score(x_test, y_test))

    # One line per classifier, in the same order as the ``classifiers`` list
    for clf_scores in scores:
        print("Mean(scores)=%.5f\tStddev(scores)=%.5f" % (numpy.mean(clf_scores), numpy.std(clf_scores)))

    # NOTE(review): this passes (metrics, labels); confirm that the ``plot``
    # in scope accepts these two arguments.
    plot(my_metrics, my_labels)