def run_all_domains(model,
                    base_path,
                    size,
                    save_model='/models/doc2vec/domains'):
    log.info('Build vocabulary!')
    model.build_vocab(get_amazon_docs(base_path))
    model.train(get_amazon_docs(base_path))
    to_pickle(save_model,
              'all-domains',
              '{}-doc-2-vec.model'.format(size),
              model,
              set_time=False)
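
# A minimal usage sketch, assuming run_all_domains and get_amazon_docs are
# importable from this project and that the pre-1.0 gensim API is in use
# (the function above calls model.train() without epochs/total_examples).
# All paths and sizes below are placeholders.
from gensim.models import Doc2Vec

d2v_size = 100
d2v_model = Doc2Vec(size=d2v_size, min_count=5, workers=4)
run_all_domains(model=d2v_model,
                base_path='/data/amazon',
                size=d2v_size)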
Example #2
def get_dataset_with_kfolds_indexes(base_path, output_folder, dataset_filter,
                                    n_reviews=2000, n_cv=10,
                                    vectorizer_type='CountVectorizer',
                                    stars=None, model=None):
    """
    Main function for getting the data and all necessary settings to start up
    the supervised learning approach for sentiment analysis based on Amazon
    data with predefined cross-validation folds.

    Parameters
    ----------
    base_path : string
        Path to all folders and files needed in the analysis, e.g., csv files
        with the Amazon data.

    output_folder : string
        Path to the directory where all outcomes of the experiment will
        be stored.

    dataset_filter : string
        Filter (substring) for choosing the dataset file names that will be
        used in the experiment.

    n_reviews : int, 2000 by default
        Number of reviews from each dataset to use in analysis.

    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'CountVectorizer' (scikit-learn) by default
        Type of vectorizer that will be used to build the feature vectors.

    stars : list
        List of stars that will be mapped into sentiment.

    model : gensim.Doc2Vec
        Model that will convert a list of documents into a list of document
        vectors.
    """

    datasets = glob.glob(
        path.join(base_path, '*%s*.csv' % dataset_filter))
    log.info('Datasets will be used in experiment: {}'.format(datasets))

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    for dataset in datasets:
        dataset_name = path.basename(dataset).split('.')[0]
        log.info('Dataset name: %s' % dataset_name)
        dp = DocumentPreprocessor()
        df = pd.read_csv(dataset, sep=';', index_col=False)

        # filter stars in reviews
        if stars is not None:
            df = df[df['review/score'].isin(stars)]
        df, _ = dp.star_score_to_sentiment(df, score_column='review/score')

        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        df = df[['Sentiment', 'Document']]

        indexes_all = set(df.index)
        log.info('All indexes: {}'.format(len(indexes_all)))

        predictions = []
        results = []
        log.info('Vectorizer type processed: {}'.format(vectorizer_type))
        f_name = 'Supervised-learning-{}-{}-{}'.format(
            dataset_name, vectorizer_type, list_to_str(stars))
        s = Sentiment(dataset_name=dataset_name)
        classes, ml_prediction, results_ml = s.supervised_sentiment(
            docs=df['Document'],
            y=np.array(df['Sentiment']),
            classifiers=ALL_CLASSIFIERS,
            f_name_results=f_name,
            vectorizer=vectorizer_type,
            n_folds=n_cv,
            model=model,
        )
        results.append(results_ml)
        predictions.append(ml_prediction)
        to_pickle(p=output_folder, dataset=dataset_name, f_name=f_name,
                  obj=results)
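
# An illustrative call, assuming the project-level dependencies used above
# (Sentiment, DocumentPreprocessor, ALL_CLASSIFIERS, to_pickle, ...) are
# importable; the paths, dataset filter, and star list are placeholders.
get_dataset_with_kfolds_indexes(
    base_path='/data/amazon/csv',
    output_folder='/results/amazon-supervised',
    dataset_filter='Automotive',
    n_cv=10,
    vectorizer_type='CountVectorizer',
    stars=[1, 2, 4, 5],
)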
Example #3
def sentiment_lexicons_imdb(lexs_names=None,
                            lex_path=None,
                            output_folder=None,
                            evaluate=True):
    """
    Lexicon-based sentiment analysis for the IMDB dataset with a predefined
    cross-validation split.

    Parameters
    ----------
    lexs_names : list
        List of file names of the lexicons to load.

    lex_path : str
        Path to the directory with the lexicon files.

    output_folder : str
        Path where we want to save our results.

    evaluate : bool, True by default
        If True, the evaluation metrics will be computed; otherwise only the
        predictions will be saved.

    Returns
    ----------
        Nothing, all necessary files will be saved automatically.
    """
    dataset_name = 'IMDB'
    results = []
    predictions = []
    predictions_directory = join(output_folder, 'predictions')

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    if not exists(predictions_directory):
        makedirs(predictions_directory)
        log.info('Directory for predictions has been created: {}'.format(
            predictions_directory))

    dataset = Dataset()
    df = dataset.load_several_files()

    log.info('Pre-processing phase starts!')
    dp = DocumentPreprocessor()
    df.Document = [dp.remove_numbers(doc) for doc in df.Document]
    sent_lex = SentimentLexicons(stemmed=False, lexicons_path=lex_path)
    lexicons = sent_lex.load_lexicons(lexicons_file_names=lexs_names)

    s = Sentiment(n_jobs=len(lexs_names), output_results=output_folder)
    df_lex, lexicon_prediction, lexicon_result, classes = \
        s.lex_sent_batch(
            df=df,
            dataset_name=dataset_name,
            lexicons=lexicons)
    results.append(lexicon_result)
    predictions.append(lexicon_prediction)

    to_pickle(f_path=join(output_folder,
                          '{}-{}.pkl'.format(dataset_name, 'results')),
              obj=results)
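
# A hedged usage sketch; the lexicon file name and the directories below are
# placeholders and must match files available under lex_path.
sentiment_lexicons_imdb(
    lexs_names=['example_lexicon.csv'],
    lex_path='/data/lexicons',
    output_folder='/results/imdb-lexicons',
    evaluate=True,
)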
Example #4
def sentiment_lexicons_amazon_cv(datasets_path='',
                                 dataset_filter=None,
                                 lexs_names=None,
                                 n_reviews=2000,
                                 train=False,
                                 norm_freq=False,
                                 lex_path=None,
                                 f_name='',
                                 n_cv=10,
                                 stars=None,
                                 frequentiment_lexicons_path='',
                                 output_folder=None,
                                 evaluate=True):
    """
    Lexicon-based sentiment analysis for the Amazon dataset with a predefined
    cross-validation split.

    Parameters
    ----------
    frequentiment_lexicons_path : str
        Path to frequentiment lexicons (csv files with comma as separator).

    datasets_path : str
        Path to the datasets directory; it must contain a train_test_subsets
        folder with the cross-validation information (data frame indexes for
        each fold). The datasets are csv files converted from the Web Amazon
        dataset structure.

    dataset_filter : list
        List of substrings for choosing datasets.

    lexs_names : list
        List of file names of the lexicons to load.

    n_reviews : int, 2000 by default
        Number of reviews from each star score.

    train : bool, False by default
        If True, sentiment is counted for the training subsets; otherwise it
        is counted for the test subsets.

    norm_freq : tuple of floats
        Thresholds for cutting, e.g., (-1, 1): scores lower than the first
        value are mapped to negative, scores between the two values to
        neutral, and scores greater than the second value to positive.

    lex_path : str
        Path to the directory with the lexicon files.

    f_name : str
        Additional part of output files name (results, predictions).

    n_cv : int, 10 by default
        Number of cross-validation folds to be performed.

    stars : list
        Star scores that will be used in the experiment; all by default.

    output_folder : str
        Path where we want to save our results.

    evaluate : bool, True by default
        If True, the evaluation metrics will be computed; otherwise only the
        predictions will be saved.

    Returns
    ----------
        Nothing, all necessary files will be saved automatically.
    """
    results = {}
    predictions = {}
    datasets = []
    predictions_directory = join(output_folder, 'predictions')

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    if not exists(predictions_directory):
        makedirs(predictions_directory)
        log.info('Directory for predictions has been created: {}'.format(
            predictions_directory))

    # lexs_names = [x.split('.')[0] for x in lexs_files]
    train_test_path = join(datasets_path, 'train_test_subsets')

    # get all datasets and cv
    if dataset_filter is not None:
        for df in dataset_filter:
            datasets.extend(glob(join(datasets_path, '{}.csv'.format(df))))
    else:
        datasets = glob(join(datasets_path, '*.txt.gz.csv'))
    log.debug('Datasets to process: {}'.format(datasets))

    if frequentiment_lexicons_path:
        log.debug('Freq lexs path: {}'.format(
            join(frequentiment_lexicons_path, '*.csv')))
        freq_lexs = glob(join(frequentiment_lexicons_path, '*.csv'))
    else:
        freq_lexs = []

    for dataset in datasets:

        # load Amazon data
        dataset_file_name = basename(dataset)
        dataset_name = dataset_file_name.split('.')[0]
        log.info('Dataset name: %s' % dataset_name)
        dp = DocumentPreprocessor()
        df = pd.read_csv(dataset, sep=';', index_col=False)
        if stars is not None:
            df = df[df['review/score'].isin(stars)]
        df, _ = dp.star_score_to_sentiment(df, score_column='review/score')
        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        # df['Document'] = df['review/summary']
        df = df[['Document', 'Sentiment']]

        log.info('Pre-processing phase starts!')
        df.Document = [dp.remove_numbers(doc) for doc in df.Document]

        try:
            # load train/test sets folds
            f_path = join(train_test_path,
                          'train-test-%s-%s.pkl' % (n_reviews, dataset_name))
            with open(f_path, 'rb') as f:
                train_test_indexes = pickle.load(f)
                log.info('Pickle has been loaded, %s' % f_path)

            results[dataset_name] = []
            predictions[dataset_name] = []

            # iterate over all cross-validation subsets
            for cv_idx, cv in enumerate(train_test_indexes[:n_cv]):
                log.info('Start for {}: CV: {}/{} '.format(
                    dataset_name, cv_idx + 1, n_cv))
                freq_lexs_ = [
                    basename(x) for x in freq_lexs
                    if '{}-{}'.format(dataset_name, cv_idx) in x
                ]
                log.info(
                    'Dataset: {}, CV: {} => frequentiment lexicons: {}'.format(
                        dataset_name, cv_idx, freq_lexs_))
                # keep a per-fold lexicon list so frequentiment lexicons from
                # earlier folds do not accumulate across iterations
                lexs_names_cv = list(lexs_names) + freq_lexs_
                if stars is not None:
                    cv = (list(set(cv[0]).intersection(df.index.values)),
                          list(set(cv[1]).intersection(df.index.values)))

                sent_lex = SentimentLexicons(stemmed=False,
                                             lexicons_file_names=lexs_names_cv,
                                             lexicons_path=lex_path)
                lexicons = sent_lex.load_lexicons(
                    lexicons_file_names=lexs_names_cv, lexicons_path=lex_path)

                if train:
                    ind = cv[0]
                    f_name = 'train-%s-fold-%s' % (dataset_name, cv_idx)
                else:
                    ind = cv[1]
                    f_name = 'test-%s-fold-%s' % (dataset_name, cv_idx)

                s = Sentiment()
                df_lex, lexicon_prediction, lexicon_result, classes = \
                    s.lex_sent_batch(
                        df=df.loc[ind],
                        dataset_name=dataset_name,
                        lexicons=lexicons,
                        evaluate=evaluate)
                results[dataset_name].append(lexicon_result)
                predictions[dataset_name].append(lexicon_prediction)

                to_pickle(p=output_folder,
                          dataset='',
                          f_name=f_name,
                          obj=lexicon_prediction)
                # df_lex.to_excel(join(output_folder, 'predictions', 'predictions-%s.xls' % f_name))
                df_lex.to_pickle(
                    join(output_folder, 'predictions',
                         'predictions-%s.pkl' % f_name))

                # save predictions
                # results_to_pickle(dataset=dataset_name,
                #                   f_name='Part-predictions-%s' % f_name,
                #                   obj=lexicon_prediction)
        except IOError as err:
            log.error('%s not loaded' % dataset_name)
            raise IOError(str(err))

    to_pickle(p=output_folder, dataset='', f_name='Results', obj=results)
    to_pickle(p=output_folder,
              dataset='',
              f_name='Predictions',
              obj=predictions)
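
# An illustrative invocation, assuming the train_test_subsets pickles
# described in the docstring exist under datasets_path; every path and file
# name below is a placeholder.
sentiment_lexicons_amazon_cv(
    datasets_path='/data/amazon',
    dataset_filter=['Automotive'],
    lexs_names=['example_lexicon.csv'],
    n_reviews=2000,
    lex_path='/data/lexicons',
    n_cv=10,
    output_folder='/results/amazon-lexicons',
    evaluate=True,
)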
Example #5
def sentiment_doc2vec_amazon_cv(base_path,
                                output_folder,
                                dataset_filter,
                                n_cv=10,
                                vectorizer_type='gensim',
                                stars=None,
                                model=None,
                                n_max_unsupervised=None,
                                d2v_size=100,
                                save_model=None):
    """
    Main function for getting the data and all necessary settings to start up
    the supervised learning approach for sentiment analysis based on Amazon
    data with predefined cross-validation folds.

    Parameters
    ----------
    base_path : string
        Path to all folders and files needed in the analysis, e.g., csv files
        with the Amazon data.

    output_folder : string
        Path where to save results.

    dataset_filter : string
        Filter (substring) for choosing the dataset file names that will be
        used in the experiment.

    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'gensim' by default
        Type of vectorizer that will be used to build the feature vectors.

    stars : list
        List of stars that will be mapped into sentiment.

    model : gensim.Doc2Vec
        Model that will convert a list of documents into a list of document
        vectors.

    n_max_unsupervised : int
        How many documents will be used as unsupervised examples when training
        the doc-2-vec model.

    save_model : string, None by default (no saving)
        Path where the doc-2-vec models should be saved.

    d2v_size : int
        Length of the doc-2-vec vectors.
    """
    datasets = glob.glob(
        path.join(base_path, '*{}*.csv'.format(dataset_filter)))
    log.info('Datasets will be used in experiment: {}'.format(datasets))

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    for dataset in datasets:
        dataset_name = path.basename(dataset).split('.')[0]
        log.info('Dataset name: {}'.format(dataset_name))
        df = pd.read_csv(dataset, sep=';', index_col=False)

        # filter stars in reviews
        if stars is not None:
            df = df[df['review/score'].isin(stars)]
        # TODO remove comment below
        # df, _ = dp.star_score_to_sentiment(df, score_column='review/score')

        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        df['Sentiment'] = df['review/score']
        df = df[['Sentiment', 'Document']]

        indexes_all = set(df.index)
        log.info('All indexes: {}'.format(len(indexes_all)))

        predictions = []
        results = []

        log.info('Vectorizer type processed: {}'.format(vectorizer_type))
        f_name = 'Supervised-learning-{}-folds-unsup-{}-stars-{}-d2v-size-{}'.format(
            vectorizer_type, n_max_unsupervised, list_to_str(stars), d2v_size)
        s = Sentiment(dataset_name=dataset_name, save_model=save_model)

        model_path = path.join(
            save_model, '{}-gensim.model-{}.pkl'.format(f_name, dataset_name))
        if exists(model_path):
            log.info('Gensim will be loaded: {}'.format(model_path))
            model = pd.read_pickle(model_path)
            docs = s.labelize_tokenize_docs(docs=df['Document'],
                                            label_type=s.w2v_label)
            X = s.get_doc_2_vec_vectors(model=model, corpus=docs)
        else:
            log.info('Gensim will be trained!')
            docs = df['Document']
            log.info('#docs: {}'.format(len(docs)))
            X, model = s.build_gensim(docs, model=model)
            to_pickle(save_model,
                      dataset_name,
                      '{}-gensim.model'.format(f_name),
                      model,
                      set_time=False)

        df_csv = pd.DataFrame()
        df_csv['class'] = df['Sentiment']
        df_csv = pd.merge(df_csv,
                          pd.DataFrame(X),
                          left_index=True,
                          right_index=True)
        log.debug('Data Frame with labels and features, shape: {}x{}'.format(
            df_csv.shape[0], df_csv.shape[1]))
        df_csv.to_csv(path.join(save_model,
                                '{}-{}.csv'.format(dataset_name, d2v_size)),
                      header=False,
                      index=False)
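
# A minimal call sketch for the gensim variant above; the paths, dataset
# filter, and star list are placeholders, and save_model must point to a
# writable directory because the model and the CSV with vectors are stored
# there.
sentiment_doc2vec_amazon_cv(
    base_path='/data/amazon',
    output_folder='/results/doc2vec',
    dataset_filter='Automotive',
    stars=[1, 5],
    d2v_size=100,
    save_model='/models/doc2vec',
)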
Example #6
def amazon_supervised(base_path,
                      output_folder,
                      dataset_filter,
                      n_cv=10,
                      vectorizer_type='CountVectorizer',
                      stars=None,
                      stars_dist=None):
    """
    Main function for getting the data and all necessary settings to start up
    the supervised learning approach for sentiment analysis based on Amazon
    data with predefined cross-validation folds.

    Parameters
    ----------
    base_path : string
        Path to all folders and files needed in the analysis, e.g., csv files
        with the Amazon data.

    output_folder : string
        Path to the directory where all outcomes of the experiment will
        be stored.

    dataset_filter : string
        Filter (substring) for choosing the dataset file names that will be
        used in the experiment.

    stars_dist : dict, None by default
        Distribution of reviews per star score used to subsample the data
        (star score -> number of reviews).

    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'CountVectorizer' (scikit-learn) by default
        Type of vectorizer that will be used to build the feature vectors.

    stars : list
        List of stars that will be mapped into sentiment.

    """

    datasets = glob.glob(path.join(base_path, '*%s*.csv' % dataset_filter))
    log.info('Datasets will be used in experiment: {}'.format(datasets))

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    for dataset in datasets:
        dataset_name = path.basename(dataset).split('.')[0]
        log.info('Dataset name: %s' % dataset_name)
        dp = DocumentPreprocessor()
        df = pd.read_csv(dataset, sep=';', index_col=False)

        # filter stars in reviews
        if stars is not None:
            dp = DocumentPreprocessor()
            if stars_dist is not None:
                df = df.loc[dp.get_reviews(df, 'review/score', stars_dist)]
            df = df[df['review/score'].isin(stars)]
        df, _ = dp.star_score_to_sentiment(df, score_column='review/score')

        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        df = df[['Sentiment', 'Document']]

        log.info('All indexes: {}'.format(len(set(df.index))))

        try:
            features_ngrams = {
                'unigrams': (1, 1),
                'n_grams_1_2': (1, 2),
                # 'n_grams_1_3': (1, 3),
            }
            log.info('Feature ngrams: {}'.format(features_ngrams))
            results = []

            for n_gram_name, n_grams_range in features_ngrams.items():
                log.info('Ngram type processed: {}'.format(n_gram_name))
                log.info(
                    'Vectorizer type processed: {}'.format(vectorizer_type))

                f_name = 'Supervised-learning-{}-{}-{}-n_reviews-{}'.format(
                    vectorizer_type,
                    n_gram_name, '-'.join([str(s) for s in stars]),
                    min(stars_dist.values()))
                s = Sentiment(dataset_name=dataset_name,
                              output_results=output_folder)

                log.info('Chosen dataframe subset is %s x %s' % df.shape)
                classes, ml_prediction, results_ml = s.supervised_sentiment(
                    docs=df['Document'],
                    y=np.array(df['Sentiment']),
                    n_gram_range=n_grams_range,
                    classifiers=ALL_CLASSIFIERS,
                    lowercase=True,
                    stop_words='english',
                    # max_df=1.0,
                    # min_df=0.0,
                    max_features=50000,
                    f_name_results=f_name,
                    vectorizer=vectorizer_type,
                    n_folds=n_cv,
                )
                results.append(results_ml)
        except IOError as err:
            log.error('%s not loaded' % dataset_name)
            raise IOError(str(err))

        to_pickle(f_path=join(output_folder, 'results-super-example.pkl'),
                  obj=results)
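
# An illustrative call, assuming a balanced subsample per star score;
# the stars_dist values are placeholders (star score -> number of reviews).
amazon_supervised(
    base_path='/data/amazon',
    output_folder='/results/amazon-ngrams',
    dataset_filter='Automotive',
    n_cv=10,
    vectorizer_type='CountVectorizer',
    stars=[1, 5],
    stars_dist={1: 2000, 5: 2000},
)
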
def sentiment_doc2vec_amazon_cv(base_path,
                                output_folder,
                                dataset_filter,
                                n_reviews=2000,
                                n_cv=10,
                                vectorizer_type='doc2vec',
                                stars=None,
                                model=None,
                                n_max_unsupervised=None,
                                d2v_size=100,
                                save_model=None):
    """
    Main function for getting the data and all necessary settings to start up
    the supervised learning approach for sentiment analysis based on Amazon
    data with predefined cross-validation folds.

    Parameters
    ----------
    base_path : string
        Path to all folders and files needed in the analysis, e.g., csv files
        with the Amazon data.

    output_folder : string
        Path where to save results.

    dataset_filter : string
        Filter (substring) for choosing the dataset file names that will be
        used in the experiment.

    n_reviews : int, 2000 by default
        Number of reviews from each dataset to use in analysis.

    n_cv : int, 10 by default
        Number of Cross-Validation folds that will be used in experiment.

    vectorizer_type : string, 'doc2vec' by default
        Type of vectorizer that will be used to build the feature vectors.

    stars : list
        List of stars that will be mapped into sentiment.

    model : gensim.Doc2Vec
        Model that will convert a list of documents into a list of document
        vectors.

    n_max_unsupervised : int
        How many documents will be used as unsupervised examples when training
        the doc-2-vec model.

    save_model : string, None by default (no saving)
        Path where the doc-2-vec models should be saved.

    d2v_size : int
        Length of the doc-2-vec vectors.
    """
    train_test_path = path.join(base_path, 'train_test_subsets')
    datasets = glob.glob(
        path.join(base_path, '*{}*.csv'.format(dataset_filter)))
    log.info('Datasets will be used in experiment: {}'.format(datasets))

    if not exists(output_folder):
        makedirs(output_folder)
        log.info('New directory has been created in: {}'.format(output_folder))

    for dataset in datasets:
        dataset_name = path.basename(dataset).split('.')[0]
        log.info('Dataset name: {}'.format(dataset_name))
        dp = DocumentPreprocessor()
        df = pd.read_csv(dataset, sep=';', index_col=False)

        # filter stars in reviews
        if stars is not None:
            df = df[df['review/score'].isin(stars)]
        # TODO remove comment below
        # df, _ = dp.star_score_to_sentiment(df, score_column='review/score')

        # extract only Document and Sentiment columns
        df['Document'] = df['review/text']
        df['Sentiment'] = df['review/score']
        df = df[['Sentiment', 'Document']]

        indexes_all = set(df.index)
        log.info('All indexes: {}'.format(len(indexes_all)))

        # try:
        # load train/test sets folds
        f_path = path.join(
            train_test_path,
            'train-test-{}-{}.pkl'.format(n_reviews, dataset_name))
        with open(f_path, 'rb') as f:
            train_test_indexes = pickle.load(f)
            log.info('Pickle has been loaded: {}'.format(f_path))
        predictions = []
        results = []

        for i, cv in enumerate(train_test_indexes[:n_cv]):
            log.info('%s fold from division has been started!' % i)
            if stars is not None:
                cv = (list(set(cv[0]).intersection(df.index)),
                      list(set(cv[1]).intersection(df.index)))
            log.info('Vectorizer type processed: {}'.format(vectorizer_type))
            f_name = 'Supervised-learning-{}-folds-unsup-{}-stars-{}-d2v-size-{}'.format(
                vectorizer_type, n_max_unsupervised, list_to_str(stars),
                d2v_size)
            # s = Sentiment(dataset_name='{}-cv-{}'.format(dataset_name, i),
            s = Sentiment(dataset_name=dataset_name, save_model=save_model)
            log.info('Length train: {}\n Length test: {}'.format(
                len(cv[0]), len(cv[1])))
            # df_ = df.ix[cv[0] + cv[1]]
            # W2V specific
            # unsup_docs = df.loc[~df.index.isin(df_.index)]['Document'][
            #              :n_max_unsupervised]
            # log.debug('Unsup_docs {}'.format(len(unsup_docs)))
            #
            # log.info(
            # 	'Chosen dataframe subset is {} x {}'.format(df_.shape[0],
            # 	                                            df_.shape[1]))

            model_path = path.join(
                save_model,
                '{}-doc-2-vec.model-{}.pkl'.format(f_name, dataset_name))
            if exists(model_path):
                log.info('Doc-2-Vec will be loaded: {}'.format(model_path))
                model = pd.read_pickle(model_path)
                docs = s.labelize_tokenize_docs(docs=df['Document'],
                                                label_type=s.w2v_label)
                X = s.get_doc_2_vec_vectors(model=model, corpus=docs)
            else:
                log.info('Doc-2-Vec will be trained!')
                X, model = s.build_doc2vec(df['Document'], [], model)
                to_pickle(save_model,
                          dataset_name,
                          '{}-doc-2-vec.model'.format(f_name),
                          model,
                          set_time=False)

            df_csv = pd.DataFrame()
            df_csv['class'] = df['Sentiment']
            df_csv = pd.merge(df_csv,
                              pd.DataFrame(X),
                              left_index=True,
                              right_index=True)
            log.debug(
                'Data Frame with labels and features, shape: {}x{}'.format(
                    df_csv.shape[0], df_csv.shape[1]))
            df_csv.to_csv(path.join(save_model,
                                    '{}-{}.csv'.format(dataset_name,
                                                       d2v_size)),
                          header=False,
                          index=False)
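
# A hedged usage sketch for the cross-validation variant above, assuming the
# pickled folds exist under
# <base_path>/train_test_subsets/train-test-<n_reviews>-<dataset_name>.pkl;
# every path and parameter value below is a placeholder.
sentiment_doc2vec_amazon_cv(
    base_path='/data/amazon',
    output_folder='/results/doc2vec-cv',
    dataset_filter='Automotive',
    n_reviews=2000,
    n_cv=10,
    stars=[1, 5],
    d2v_size=100,
    save_model='/models/doc2vec',
)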