Example #1
 def count_sentiment_for_list_test(self):
     sentiment = Sentiment(output_results='')
     sentiment_after = sentiment.count_sentiment_for_list(
         document_tokens=['this', 'is', 'my', 'string'],
         lexicon={'this': 1, 'is': 1, 'my': 1, 'string': 1, 'this is': -2,
                  'is my': -5, 'nice evening': -99, 'is m': -999})
     self.assertEqual(4, sentiment_after)
Example #2
 def count_sentiment_for_list_bigrams(self):
     sentiment = Sentiment(output_results='')
     document_tokens = ['not so good', 'bad', 'very good', 'very', 'really nice']
     lexicon = {'good': 3,
                'awesome': 2,
                'neutral': 0,
                'very good': 4,
                'not good': -2,
                'bad': -2}
     sentiment_before = 2
     sentiment_after = sentiment.count_sentiment_for_list(document_tokens, lexicon)
     self.assertEqual(sentiment_before, sentiment_after)
Example #3
 def count_sentiment_for_list_unigrams(self):
     sentiment = Sentiment(output_results='')
     document_tokens = ['good', 'good', 'bad', 'tri gram test']
     lexicon = {'good': 3,
                'bad': -3,
                'awesome': 2,
                'neutral': 0,
                'very good': 4,
                'tri gram test': 7
                }
     sentiment_before = 10
     sentiment_after = sentiment.count_sentiment_for_list(document_tokens, lexicon)
     self.assertEqual(sentiment_before, sentiment_after)
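
Taken together, these three tests suggest that count_sentiment_for_list simply sums the lexicon score of every token, where the tokens themselves may already be n-gram strings and tokens missing from the lexicon contribute nothing. A minimal sketch of that behaviour (an assumption drawn from the tests, not the library's actual implementation):

 def count_sentiment_for_list(self, document_tokens, lexicon):
     # Assumed behaviour: sum the lexicon score of each token;
     # tokens absent from the lexicon contribute 0.
     return sum(lexicon.get(token, 0) for token in document_tokens)

This reproduces the expected values above: 1+1+1+1 = 4, -2+4 = 2 and 3+3-3+7 = 10.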
Example #4
def sentiment_ngrams_selection(dataset,
                               max_features=None,
                               classifiers=None,
                               features_ngrams=None,
                               bins=10):
    logging.info('Starting with %s' % dataset)

    ml_predictions = {}
    sentiment = Sentiment()

    # logging.basicConfig(filename='processing.log', level=logging.DEBUG,
    #                     format='%(asctime)s - sentiment_ngrams.py - '
    #                            '%(levelname)s - %(message)s')

    # dataset='Movies_&_TV1200.csv' -> feature_space_size 1200 x 116495
    logging.info('# max features = %s' % max_features)
    if max_features is None:
        max_features = [None]
    else:
        thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
        max_features = [math.floor(x * max_features) for x in thresholds]

    for mf in max_features:
        logging.info('Starting with %s and %s features' % (dataset, mf))
        for n_gram_name, n_grams_range in features_ngrams.iteritems():
            logging.info('Starting with %s' % n_gram_name)
            # print 'CountVectorizer'
            f_name = n_gram_name + '_CountVectorizer'
            classes, ml_prediction, results_ml = sentiment.supervised_sentiment(
                dataset=dataset,
                # worksheet_name='Arkusz1',
                n_gram_range=n_grams_range,
                n_folds=10,
                classifiers={'LinearSVC': LinearSVC()},
                # classifiers=None,  # all classifier available in sentiment class
                # classifiers=classifiers,
                amazon=True,
                lowercase=True,
                stop_words='english',
                max_df=1.0,
                min_df=0.0,
                max_features=mf,
                f_name_results=f_name,
                vectorizer='CountVectorizer',
                # tokenizer=document_preprocessor.tokenizer_with_stemming
            )
            ml_predictions.update(ml_prediction)
            results_to_pickle(dataset,
                              '%s-%s-%s' % (n_gram_name, f_name, str(mf)),
                              results_ml)
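
A hypothetical invocation of the function above; the dataset file name is taken from the comment inside the function, the n-gram dictionary follows the format used in the other examples, and 50000 is an assumed feature budget:

sentiment_ngrams_selection(
    dataset='Movies_&_TV1200.csv',            # file name from the comment above
    max_features=50000,                       # scalar budget; 10%-100% thresholds are derived from it
    features_ngrams={'n_grams_1_2': (1, 2)})  # n-gram range as in the other examples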
Example #5
# -*- coding: utf-8 -*-
__author__ = 'Łukasz Augustyniak'

import pandas as pd
from textlytics.sentiment.sentiment import Sentiment

from textlytics.sentiment.document_preprocessing import DocumentPreprocessor

df = pd.read_csv(r'C:\Users\Dell\Documents\GitHub\word2vec\d2v-vs-bow\Automotive9600.csv')

dp = DocumentPreprocessor()
df, _ = dp.star_score_to_sentiment(df, score_column='Stars', star_mean_score=3)

s = Sentiment()

df_lex, lexicon_prediction, lexicon_result, classes = \
    s.lex_sent_batch(
        df=df,
        lexs_files=['amazon_automotive_25_w2v_all.txt', 'amazon_automotive_25.txt'],
        words_stem=False,
        dataset_name='word_vectorization')

print lexicon_result
Example #6
def get_dataset_with_kfolds_indexes(base_path, output_folder, dataset_filter,
                                    n_reviews=2000, n_cv=10,
                                    vectorizer_type='CountVectorizer',
                                    stars=None, model=None):
    """
    Main function for getting data and all necessary setting to start up
    supervised learning approach for sentiment analysis based on Amazon data
    with predefined cross-validation folds.

    Parameters
    ----------
    base_path : string
        Path to all folders and files needed in analysis, e.g., csv files with
        amazon data.

    output_folder : string
        Path to the directory where all outcomes of the experiment will
        be stored.

    dataset_filter : string
        Filter for dataset file names that will be used in the experiment.

    n_reviews : int, 2000 by default
        Number of reviews from each dataset to use in analysis.

    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'CountVectorizer' (scikit-learn) by default
        Type of vectorizer that will be used to build the feature vectors.

    stars : list
        List of stars that will be mapped into sentiment.

    model : gensim.Doc2Vec
        Model that will convert the list of documents into a list of
        document vectors.
    """

    datasets = glob.glob(
        path.join(base_path, '*%s*.csv' % dataset_filter))
    log.info('Datasets will be used in experiment: {}'.format(datasets))

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    for dataset in datasets:
        dataset_name = path.basename(dataset).split('.')[0]
        log.info('Dataset name: %s' % dataset_name)
        dp = DocumentPreprocessor()
        df = pd.DataFrame.from_csv(dataset, sep=';', index_col=False)

        # filter stars in reviews
        if stars is not None:
            df = df[df['review/score'].isin(stars)]
        df, _ = dp.star_score_to_sentiment(df, score_column='review/score')

        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        df = df[['Sentiment', 'Document']]

        indexes_all = set(df.index)
        log.info('All indexes: {}'.format(len(indexes_all)))

        predictions = []
        results = []
        log.info('Vectorizer type processed: {}'.format(vectorizer_type))
        f_name = 'Supervised-learning-{}-{}-{}'.format(
            dataset_name, vectorizer_type, list_to_str(stars))
        s = Sentiment(dataset_name=dataset_name)
        classes, ml_prediction, results_ml = s.supervised_sentiment(
            docs=df['Document'],
            y=np.array(df['Sentiment']),
            classifiers=ALL_CLASSIFIERS,
            f_name_results=f_name,
            vectorizer=vectorizer_type,
            n_folds=n_cv,
            model=model,
        )
        results.append(results_ml)
        predictions.append(ml_prediction)
        to_pickle(p=output_folder, dataset=dataset_name, f_name=f_name,
                  obj=results)
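
A hypothetical call, assuming the Amazon csv files live under the path used later in these examples and that ALL_CLASSIFIERS is importable in the calling module; the output directory is made up for illustration:

get_dataset_with_kfolds_indexes(
    base_path='/datasets/amazon-data/csv/',    # assumed location of the csv files
    output_folder='/tmp/supervised-results/',  # hypothetical output directory
    dataset_filter='Automotive',
    n_cv=10,
    vectorizer_type='CountVectorizer',
    stars=[1, 5])                              # e.g. keep only extreme reviews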
Example #7
def imdb_sentiment(n_cv=10, vectorizer_type='CountVectorizer'):
    """
    Main function for getting data and all necessary setting to start up
    supervised learning approach for sentiment analysis based on IMDB data.

    Parameters
    ----------
    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'CountVectorizer' (scikit-learn) by default
        Type of vectorizer that will be used to build the feature vectors.
    """
    dataset_name = 'IMDB'
    dataset = Dataset()
    df = dataset.load_several_files()

    features_ngrams = {
        # 'unigrams': (1, 1),
        'n_grams_1_2': (1, 2),
        # 'n_grams_1_3': (1, 3),
    }

    max_features = 3000

    clfs = {
        'BernoulliNB': BernoulliNB(),
        'LogisticRegression': LogisticRegression(),
        'LinearSVC': LinearSVC(),
    }

    predictions = []
    results = []

    for n_gram_name, n_grams_range in features_ngrams.iteritems():
        log.info('Ngram type processed: {}'.format(n_gram_name))
        log.info('Vectorizer type processed: {}'.format(vectorizer_type))

        f_name = 'Supervised-learning-{}-{}-{}-{}'.format(
            n_gram_name, max_features, vectorizer_type, dataset_name)
        s = Sentiment(dataset_name=dataset_name)
        classes, ml_prediction, results_ml = s.supervised_sentiment(
            docs=df['Document'],
            y=df['Sentiment'],
            n_gram_range=n_grams_range,
            classifiers=clfs,
            lowercase=True,
            stop_words='english',
            max_df=1.0,
            min_df=0.0,
            max_features=max_features,
            f_name_results=f_name,
            vectorizer=vectorizer_type,
            n_folds=n_cv,
        )
        results.append(results_ml)
        predictions.append(ml_prediction)
        results_to_pickle(dataset=dataset_name,
                          f_name='predictions-%s' % f_name,
                          obj=ml_prediction)

    results_to_pickle(dataset=dataset_name, f_name=f_name, obj=results)
    results_to_pickle(dataset=dataset_name,
                      f_name='predictions-%s' % f_name,
                      obj=predictions)
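
Since every setting has a default, a hypothetical run needs no arguments; n_cv can be lowered for a quicker experiment:

imdb_sentiment()
imdb_sentiment(n_cv=5)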
Example #8
def sentiment_lexicons_imdb(lexs_names=None,
                            lex_path=None,
                            output_folder=None,
                            evaluate=True):
    """
    Counting sentiment analysis tasks with lexicon for IMDB Dataset with
    predefined Cross-Validation split.

    Parameters
    ----------
    lexs_names : list
        List of path/file names for loading lexicons.

    lex_path : str
        Path to the directory with lexicon files.

    output_folder : str
        Path where we want to save our results.

    evaluate : bool, True by default
        If True, the evaluation metrics will be computed; otherwise only the
        predictions will be saved.

    Returns
    ----------
        Nothing, all necessary files will be saved automatically.
    """
    dataset_name = 'IMDB'
    results = []
    predictions = []
    predictions_directory = join(output_folder, 'predictions')

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    if not exists(predictions_directory):
        makedirs(predictions_directory)
        log.info('Directory for predictions has been created: {}'.format(
            predictions_directory))

    dataset = Dataset()
    df = dataset.load_several_files()

    log.info('Pre-processing phase starts!')
    dp = DocumentPreprocessor()
    df.Document = [dp.remove_numbers(doc) for doc in df.Document]
    sent_lex = SentimentLexicons(stemmed=False, lexicons_path=lex_path)
    lexicons = sent_lex.load_lexicons(lexicons_file_names=lexs_names)

    s = Sentiment(n_jobs=len(lexs_names), output_results=output_folder)
    df_lex, lexicon_prediction, lexicon_result, classes = \
        s.lex_sent_batch(
            df=df,
            dataset_name=dataset_name,
            lexicons=lexicons)
    results.append(lexicon_result)
    predictions.append(lexicon_prediction)

    to_pickle(f_path=join(output_folder,
                          '{}-{}.pkl'.format(dataset_name, 'results')),
              obj=results)
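
A hypothetical call, reusing one of the lexicon file names from Example #5; the directory paths are assumptions for illustration:

sentiment_lexicons_imdb(
    lexs_names=['amazon_automotive_25.txt'],  # lexicon file name as in Example #5
    lex_path='/datasets/lexicons/',           # assumed lexicon directory
    output_folder='/tmp/imdb-lexicons/',      # hypothetical output directory
    evaluate=True)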
Example #9
def sentiment_lexicons_amazon_cv(datasets_path='',
                                 dataset_filter=None,
                                 lexs_names=None,
                                 n_reviews=2000,
                                 train=False,
                                 norm_freq=False,
                                 lex_path=None,
                                 f_name='',
                                 n_cv=10,
                                 stars=None,
                                 frequentiment_lexicons_path='',
                                 output_folder=None,
                                 evaluate=True):
    """
    Counting sentiment analysis tasks with lexicon for Amazon Dataset with
    predefined Cross-Validation split.

    Parameters
    ----------
    frequentiment_lexicons_path : str
        Path to frequentiment lexicons (csv files with comma as separator).

    datasets_path : str
        Path to the datasets directory; it must contain a train_test_subsets
        folder with cross-validation information - data frame indexes for
        each fold. Datasets are csv files converted from the Web Amazon
        dataset structure.

    dataset_filter : list
        List of substrings used for choosing datasets.

    lexs_names : list
        List of path/file names for loading lexicons.

    n_reviews : int, 2000 by default
        Number of reviews from each star score.

    train : bool, False by default
        If True, sentiment is counted for the training subsets; otherwise it
        is counted for the testing subsets.

    norm_freq : tuple of floats
        Thresholds for cutting, e.g., (-1, 1): scores lower than the first
        value will be negative, scores between the two values will be
        neutral, and scores greater than the second value will be positive.

    lex_path : str
        Path to the directory with lexicon files.

    f_name : str
        Additional part of output files name (results, predictions).

    n_cv : int, 10 by default
        Number of Cross-Validation folds to be performed.

    stars : list
        Star scores that will be used in the experiment; all by default.

    output_folder : str
        Path where we want to save our results.

    evaluate : bool, True by default
        If True, the evaluation metrics will be computed; otherwise only the
        predictions will be saved.

    Returns
    ----------
        Nothing, all necessary files will be saved automatically.
    """
    results = {}
    predictions = {}
    datasets = []
    predictions_directory = join(output_folder, 'predictions')

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    if not exists(predictions_directory):
        makedirs(predictions_directory)
        log.info('Directory for predictions has been created: {}'.format(
            predictions_directory))

    # lexs_names = [x.split('.')[0] for x in lexs_files]
    train_test_path = join(datasets_path, 'train_test_subsets')

    # get all datasets and cv
    if dataset_filter is not None:
        for df in dataset_filter:
            datasets.extend(glob(join(datasets_path, '{}.csv'.format(df))))
    else:
        datasets = glob(join(datasets_path, '*.txt.gz.csv'))
    log.debug('Datasets to process: {}'.format(datasets))

    if frequentiment_lexicons_path:
        log.debug('Freq lexs path: {}'.format(
            join(frequentiment_lexicons_path, '*.csv')))
        freq_lexs = glob(join(frequentiment_lexicons_path, '*.csv'))
    else:
        freq_lexs = []

    for dataset in datasets:

        # load Amazon data
        dataset_file_name = basename(dataset)
        dataset_name = dataset_file_name.split('.')[0]
        log.info('Dataset name: %s' % dataset_name)
        dp = DocumentPreprocessor()
        df = pd.DataFrame.from_csv(dataset, sep=';', index_col=False)
        if stars is not None:
            df = df[df['review/score'].isin(stars)]
        df, _ = dp.star_score_to_sentiment(df, score_column='review/score')
        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        # df['Document'] = df['review/summary']
        df = df[['Document', 'Sentiment']]

        log.info('Pre-processing phase starts!')
        df.Document = [dp.remove_numbers(doc) for doc in df.Document]

        try:
            # load train/test sets folds
            f_path = join(train_test_path,
                          'train-test-%s-%s.pkl' % (n_reviews, dataset_name))
            with open(f_path, 'rb') as f:
                train_test_indexes = pickle.load(f)
                log.info('Pickle has been loaded, %s' % f_path)

            results[dataset_name] = []
            predictions[dataset_name] = []

            # iterate over all cross-validation subsets
            for cv_idx, cv in enumerate(train_test_indexes[:n_cv]):
                log.info('Start for {}: CV: {}/{} '.format(
                    dataset_name, cv_idx + 1, n_cv))
                freq_lexs_ = [
                    basename(x) for x in freq_lexs
                    if '{}-{}'.format(dataset_name, cv_idx) in x
                ]
                log.info(
                    'Dataset: {}, CV: {} => frequentiment lexicons: {}'.format(
                        dataset_name, cv_idx, freq_lexs_))
                lexs_names.extend(freq_lexs_)
                if stars is not None:
                    cv = (set(cv[0]).intersection(df.index.values),
                          set(cv[1]).intersection(df.index.values))

                sent_lex = SentimentLexicons(stemmed=False,
                                             lexicons_file_names=lexs_names,
                                             lexicons_path=lex_path)
                lexicons = sent_lex.load_lexicons(
                    lexicons_file_names=lexs_names, lexicons_path=lex_path)

                if train:
                    ind = cv[0]
                    f_name = 'train-%s-fold-%s' % (dataset_name, cv_idx)
                else:
                    ind = cv[1]
                    f_name = 'test-%s-fold-%s' % (dataset_name, cv_idx)

                s = Sentiment()
                df_lex, lexicon_prediction, lexicon_result, classes = \
                    s.lex_sent_batch(
                        df=df.ix[ind],
                        dataset_name=dataset_name,
                        lexicons=lexicons,
                        evaluate=evaluate)
                results[dataset_name].append(lexicon_result)
                predictions[dataset_name].append(lexicon_prediction)

                to_pickle(p=output_folder,
                          dataset='',
                          f_name=f_name,
                          obj=lexicon_prediction)
                # df_lex.to_excel(join(output_folder, 'predictions', 'predictions-%s.xls' % f_name))
                df_lex.to_pickle(
                    join(output_folder, 'predictions',
                         'predictions-%s.pkl' % f_name))

                # save predictions
                # results_to_pickle(dataset=dataset_name,
                #                   f_name='Part-predictions-%s' % f_name,
                #                   obj=lexicon_prediction)
        except IOError as err:
            log.error('%s not loaded' % dataset_name)
            raise IOError(str(err))

    to_pickle(p=output_folder, dataset='', f_name='Results', obj=results)
    to_pickle(p=output_folder,
              dataset='',
              f_name='Predictions',
              obj=predictions)
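
A hypothetical call, assuming the csv directory used elsewhere in these examples (it must contain the train_test_subsets folder) and a lexicon list like the one in Example #5:

sentiment_lexicons_amazon_cv(
    datasets_path='/datasets/amazon-data/csv/',  # assumed path with train_test_subsets/
    dataset_filter=['Automotive'],
    lexs_names=['amazon_automotive_25.txt'],     # as in Example #5
    lex_path='/datasets/lexicons/',              # assumed lexicon directory
    n_cv=10,
    output_folder='/tmp/lexicon-results/')       # hypothetical output directory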
Example #10
def ensemble_lex_clf(stars,
                     lex_test_path=None,
                     lex_train_path=None,
                     datasets=None,
                     new_lex_path='',
                     unigrams_path=None,
                     clfs=[],
                     lexs=[],
                     supervised=False,
                     res_name='all',
                     n_folds=10,
                     csv_path='/datasets/amazon-data/csv/',
                     binary=False,
                     freq=None):
    """
    Counting sentiment orientation for lexicon and/or classifier's predictions
    with ensemble classification.
    """
    if datasets is None:
        datasets = [
            'Automotive', 'Books', 'Clothing_&_Accessories', 'Electronics',
            'Health', 'Movies_&_TV', 'Music', 'Sports_&_Outdoors',
            'Toys_&_Games', 'Video_Games'
        ]

    log.info('Experiments will be conducted for these datasets: {}'.format(
        datasets))

    f_unigrams = glob(path.join(unigrams_path, '*.pkl'))
    log.debug('f_unigrams: %s' % f_unigrams[0])

    for dataset in datasets:
        log.info('%s dataset is starting' % dataset)
        results = []
        predictions = []
        feature_list = []
        f_unigrams_ = [x for x in f_unigrams if dataset in x]
        unigrams_pred = pd.read_pickle(f_unigrams_[0])

        # CV folds
        for i in xrange(n_folds):
            start = datetime.now()
            log.info('#%s CV is started' % i)
            f_lex_test = 'predictions-test-%s-fold-%s.pkl' % (dataset, i)
            f_lex_train = 'predictions-train-%s-fold-%s.pkl' % (dataset, i)

            # get indexes with proper star values
            g = glr.GenerateLexicons(csv_path=csv_path)
            g.get_reviews(filter_str=dataset, file_type='csv')
            review = g.reviews[dataset]
            log.debug('Columns in review df: {}'.format(review.columns))
            # log.debug('Review Score: {}'.format(review['review/score']))
            idx_scores = review[review['review/score'].isin(stars)].index

            u = review[review['review/score'].isin(stars)]['review/score']
            log.debug('Unique stars in DF: {}'.format(set(u)))
            log.debug('#scores: {}'.format(len(u)))

            # ######################## lexicons-based approach ################
            df_test = pd.read_pickle(path.join(lex_test_path, f_lex_test))
            df_train = pd.read_pickle(path.join(lex_train_path, f_lex_train))
            # df_frequentiment_lex = pd.read_csv(path.join(new_lex_path,
            #                                              '%s-%s.csv'
            #                                              % (dataset, i)),
            #                                    index_col=0,
            #                                    names=['unigrams', 'bigrams',
            #                                           'trigrams'],
            #                                    skiprows=1)

            if new_lex_path:
                df_frequentiment_lex = merge_frequentiment_predictions(
                    cv=i,
                    dataset_name=dataset,
                    lex_generated_path=new_lex_path,
                    freq=freq)
                df_test = pd.merge(df_test,
                                   df_frequentiment_lex,
                                   left_index=True,
                                   right_index=True,
                                   how='left')
            log.debug('Frequentiment lex: {}'.format(df_test.columns))

            # ######################## supervised learning ###################
            if supervised:
                # get sentiment labels/classes from dataset (evaluation and fitting)
                df_uni_test = pd.DataFrame.from_dict({
                    k: v
                    for k, v in unigrams_pred[i].iteritems()
                    if 'train' not in k
                })
                df_uni_train = pd.DataFrame.from_dict({
                    k: v
                    for k, v in unigrams_pred[i].iteritems() if 'train' in k
                })
                df_train = pd.merge(df_train,
                                    df_uni_train,
                                    left_index=True,
                                    right_index=True,
                                    how='left')
                df_test = pd.merge(df_test,
                                   df_uni_test,
                                   left_index=True,
                                   right_index=True,
                                   how='left')

            # only lexicons or supervised too?
            if stars is not None:
                idx_test = list(set(df_test.index).intersection(idx_scores))
                log.debug('idx_scores test: {}, \nlen: {}'.format(
                    sorted(idx_test[:10]), len(idx_test)))
                # log.debug('!!! {}'.format(len(idx_test)))
                # log.debug('DF: {}'.format(len(df_test.index)))

                classes_test = np.asarray(df_test.ix[idx_test].Sentiment)
                df_test = df_test.ix[idx_test]

                # idx_train = df_train.Sentiment.isin(stars)
                idx_train = list(
                    set(df_train.index).intersection(set(idx_scores)))
                log.debug('idx_scores train: {}'.format(len(idx_train)))
                classes_train = np.asarray(df_train.ix[idx_train].Sentiment)
                df_train = df_train.ix[idx_train]
            else:
                classes_test = np.asarray(df_test.Sentiment)
                classes_train = np.asarray(df_train.Sentiment)

            l = list(df_train.columns)
            for j, x in enumerate(l):
                if 'cv' in x:
                    l[j] = x.split('-')[-1]
                if '-train' in x:
                    l[j] = x.split('-')[0]
            log.info('Lexicons: %s' % l)
            df_train.columns = l

            # TODO: refactor, this is written very inelegantly
            lexs = [
                x for x in df_train.columns
                if x not in ['Document', 'Sentiment']
            ]
            log.info('As feature set this columns have been chosen: {}'.format(
                lexs))

            df_test = df_test[clfs + lexs]
            df_train = df_train[clfs + lexs]

            df_train = df_train.sort(axis=1)
            df_test = df_test.sort(axis=1)

            log.debug('Unique classes Test: {}'.format(set(classes_test)))
            log.debug('Unique classes Train: {}'.format(set(classes_train)))
            log.debug('DF train shape: {}'.format(df_train.shape))
            log.debug('DF test shape: {}'.format(df_test.shape))

            s = Sentiment()
            pred = s.sentiment_classification(
                X=df_train.as_matrix().astype(np.int),
                y=classes_train,
                X_test=df_test.as_matrix().astype(np.int),
                y_test=classes_test,
                n_folds=None,
                classifiers=ALL_CLASSIFIERS,
                kfolds_indexes=[(df_train.index, df_test.index)],
                save_clf=False,
                cv_normal=False)
            results_ml = s.results
            # add information about chosen lexicons and classifiers
            results_ml['lexs-clfs'] = lexs + clfs
            log.info('Results: %s' % results_ml)
            results.append(results_ml)

            predictions.append(pred)
            log.info('Flow for dataset %s, #%s CV end' % (dataset, i))
            stop = datetime.now()
            log.info('It took %s seconds ' % (stop - start).seconds)
        results_to_pickle(dataset, 'ensemble-%s' % res_name, results)
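
A hypothetical invocation, assuming the lexicon and unigram prediction pickles produced by the earlier examples already exist in the given (made-up) directories and that 'LinearSVC' is one of the classifier prediction columns:

ensemble_lex_clf(
    stars=[1, 2, 3, 4, 5],
    lex_test_path='/results/lexicons/test/',   # assumed folders with prediction pickles
    lex_train_path='/results/lexicons/train/',
    unigrams_path='/results/unigrams/',
    datasets=['Automotive'],
    clfs=['LinearSVC'],                        # assumed classifier prediction column
    res_name='automotive-ensemble')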
Example #11
def amazon_supervised(base_path,
                      output_folder,
                      dataset_filter,
                      n_cv=10,
                      vectorizer_type='CountVectorizer',
                      stars=None,
                      stars_dist=None):
    """
    Main function for getting data and all necessary setting to start up
    supervised learning approach for sentiment analysis based on Amazon data
    with predefined cross-validation folds.

    Parameters
    ----------
    base_path : string
        Path to all folders and files needed in analysis, e.g., csv files with
        amazon data.

    output_folder : string
        Path to the directory where all outcomes of the experiment will
        be stored.

    dataset_filter : string
        Filter for dataset file names that will be used in the experiment.

    stars_dist : dict, None by default
        Number of reviews to keep for each star score; used to subsample the
        reviews before mapping star scores to sentiment.

    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'CountVectorizer' (scikit-learn) by default
        Type of vectorizer that will be used to build the feature vectors.

    stars : list
        List of stars that will be mapped into sentiment.

    """

    datasets = glob.glob(path.join(base_path, '*%s*.csv' % dataset_filter))
    log.info('Datasets will be used in experiment: {}'.format(datasets))

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    for dataset in datasets:
        dataset_name = path.basename(dataset).split('.')[0]
        log.info('Dataset name: %s' % dataset_name)
        dp = DocumentPreprocessor()
        df = pd.DataFrame.from_csv(dataset, sep=';', index_col=False)

        # filter stars in reviews
        if stars is not None:
            dp = DocumentPreprocessor()
            if stars_dist is not None:
                df = df.ix[dp.get_reviews(df, 'review/score', stars_dist)]
            df = df[df['review/score'].isin(stars)]
        df, _ = dp.star_score_to_sentiment(df, score_column='review/score')

        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        df = df[['Sentiment', 'Document']]

        log.info('All indexes: {}'.format(len(set(df.index))))

        try:
            features_ngrams = {
                'unigrams': (1, 1),
                'n_grams_1_2': (1, 2),
                # 'n_grams_1_3': (1, 3),
            }
            log.info('Feature ngrams: {}'.format(features_ngrams))
            results = []

            for n_gram_name, n_grams_range in features_ngrams.iteritems():
                log.info('Ngram type processed: {}'.format(n_gram_name))
                log.info(
                    'Vectorizer type processed: {}'.format(vectorizer_type))

                f_name = 'Supervised-learning-{}-{}-{}-n_reviews-{}'.format(
                    vectorizer_type,
                    n_gram_name, '-'.join([str(s) for s in stars]),
                    min(stars_dist.values()))
                s = Sentiment(dataset_name=dataset_name,
                              output_results=output_folder)

                log.info('Chosen dataframe subset is %s x %s' % df.shape)
                classes, ml_prediction, results_ml = s.supervised_sentiment(
                    docs=df['Document'],
                    y=np.array(df['Sentiment']),
                    n_gram_range=n_grams_range,
                    classifiers=ALL_CLASSIFIERS,
                    lowercase=True,
                    stop_words='english',
                    # max_df=1.0,
                    # min_df=0.0,
                    max_features=50000,
                    f_name_results=f_name,
                    vectorizer=vectorizer_type,
                    n_folds=n_cv,
                )
                results.append(results_ml)
        except IOError as err:
            log.error('%s not loaded' % dataset_name)
            raise IOError(str(err))

        to_pickle(f_path=join(output_folder, 'results-super-example.pkl'),
                  obj=results)
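
A hypothetical call; the paths are made up, and stars_dist is assumed to give the number of reviews to keep per star score:

amazon_supervised(
    base_path='/datasets/amazon-data/csv/',    # assumed csv location
    output_folder='/tmp/amazon-supervised/',   # hypothetical output directory
    dataset_filter='Automotive',
    stars=[1, 3, 5],
    stars_dist={1: 2000, 3: 2000, 5: 2000})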
Example #12
def sentiment_doc2vec_amazon_cv(base_path,
                                output_folder,
                                dataset_filter,
                                n_reviews=2000,
                                n_cv=10,
                                vectorizer_type='doc2vec',
                                stars=None,
                                model=None,
                                n_max_unsupervised=None,
                                d2v_size=100,
                                save_model=None):
    """
    Main function for getting data and all necessary setting to start up
    supervised learning approach for sentiment analysis based on Amazon data
    with predefined cross-validation folds.

    Parameters
    ----------
    base_path : string
        Path to all folders and files needed in analysis, e.g, csv files with
        amazon data.

    output_folder : string
        Path to the directory where results will be saved.

    dataset_filter : string
        Filter for dataset file names that will be used in the experiment.

    n_reviews : int, 2000 by default
        Number of reviews from each dataset to use in analysis.

    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'doc2vec' by default
        Type of vectorizer that will be used to build the feature vectors.

    stars : list
        List of stars that will be mapped into sentiment.

    model : gensim.Doc2Vec
        Model that will convert the list of documents into a list of
        document vectors.

    n_max_unsupervised : int
        How many documents will be used as unsupervised examples when
        training the doc-2-vec model.

    save_model : string, None by default (no saving)
        Path where the doc-2-vec models should be saved.

    d2v_size : int
        Length of the doc-2-vec vectors.
    """
    train_test_path = path.join(base_path, 'train_test_subsets')
    datasets = glob.glob(
        path.join(base_path, '*{}*.csv'.format(dataset_filter)))
    log.info('Datasets will be used in experiment: {}'.format(datasets))

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    for dataset in datasets:
        dataset_name = path.basename(dataset).split('.')[0]
        log.info('Dataset name: {}'.format(dataset_name))
        dp = DocumentPreprocessor()
        df = pd.DataFrame.from_csv(dataset, sep=';', index_col=False)

        # filter stars in reviews
        if stars is not None:
            df = df[df['review/score'].isin(stars)]
        # TODO remove comment below
        # df, _ = dp.star_score_to_sentiment(df, score_column='review/score')

        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        df['Sentiment'] = df['review/score']
        df = df[['Sentiment', 'Document']]

        indexes_all = set(df.index)
        log.info('All indexes: {}'.format(len(indexes_all)))

        # try:
        # load train/test sets folds
        f_path = path.join(
            train_test_path,
            'train-test-{}-{}.pkl'.format(n_reviews, dataset_name))
        with open(f_path, 'rb') as f:
            train_test_indexes = pickle.load(f)
            log.info('Pickle has been loaded: {}'.format(f_path))
        predictions = []
        results = []

        for i, cv in enumerate(train_test_indexes[:n_cv]):
            log.info('%s fold from division has been started!' % i)
            if stars is not None:
                cv = (list(set(cv[0]).intersection(df.index)),
                      list(set(cv[1]).intersection(df.index)))
            log.info('Vectorizer type processed: {}'.format(vectorizer_type))
            f_name = 'Supervised-learning-{}-folds-unsup-{}-stars-{}-d2v-size-{}'.format(
                vectorizer_type, n_max_unsupervised, list_to_str(stars),
                d2v_size)
            # s = Sentiment(dataset_name='{}-cv-{}'.format(dataset_name, i),
            s = Sentiment(dataset_name=dataset_name, save_model=save_model)
            log.info('Length train: {}\n Length test: {}'.format(
                len(cv[0]), len(cv[1])))
            # df_ = df.ix[cv[0] + cv[1]]
            # W2V specific
            # unsup_docs = df.loc[~df.index.isin(df_.index)]['Document'][
            #              :n_max_unsupervised]
            # log.debug('Unsup_docs {}'.format(len(unsup_docs)))
            #
            # log.info(
            # 	'Chosen dataframe subset is {} x {}'.format(df_.shape[0],
            # 	                                            df_.shape[1]))

            model_path = path.join(
                save_model,
                '{}-doc-2-vec.model-{}.pkl'.format(f_name, dataset_name))
            if exists(model_path):
                log.info('Doc-2-Vec will be loaded: {}'.format(model_path))
                model = pd.read_pickle(model_path)
                docs = s.labelize_tokenize_docs(docs=df['Document'],
                                                label_type=s.w2v_label)
                X = s.get_doc_2_vec_vectors(model=model, corpus=docs)
            else:
                log.info('Doc-2-Vec will be trained!')
                X, model = s.build_doc2vec(df['Document'], [], model)
                to_pickle(save_model,
                          dataset_name,
                          '{}-doc-2-vec.model'.format(f_name),
                          model,
                          set_time=False)

            df_csv = pd.DataFrame()
            df_csv['class'] = df['Sentiment']
            df_csv = pd.merge(df_csv,
                              pd.DataFrame(X),
                              left_index=True,
                              right_index=True)
            log.debug(
                'Data Frame with labels and features, shape: {}x{}'.format(
                    df_csv.shape[0], df_csv.shape[1]))
            df_csv.to_csv(path.join(save_model,
                                    '{}-{}.csv'.format(dataset_name,
                                                       d2v_size)),
                          header=False,
                          index=False)
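
A hypothetical call, assuming the Amazon csv layout used in the previous examples (base_path must contain train_test_subsets/) and a made-up directory for storing the trained doc-2-vec models:

sentiment_doc2vec_amazon_cv(
    base_path='/datasets/amazon-data/csv/',    # assumed csv location
    output_folder='/tmp/doc2vec-results/',     # hypothetical output directory
    dataset_filter='Automotive',
    n_reviews=2000,
    n_cv=10,
    stars=[1, 5],
    d2v_size=100,
    save_model='/tmp/doc2vec-models/')         # assumed model directory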