Example #1
def train_models_for_resources(data_type,
                               resources,
                               resource_lang_csv=None,
                               csv_data_file_path=None):
    resources_names_list = []

    if data_type == 'db':
        if resources is None:
            resources_all = Resources.select(Resources.resource).iterator()
            resources_names_list = [
                i.__data__['resource'] for i in resources_all
            ]
        else:
            resources_names_list = [resources]

    elif data_type == 'csv':
        resources_names_list = [resources]

    if len(resources_names_list) == 0:
        raise Exception(
            "Resources not defined. Pass a resource with -r <resource> or --resource <resource>."
        )

    for resource_name in resources_names_list:
        LDAMWHandler().train(data_type=data_type,
                             resource=resource_name,
                             res_lang=resource_lang_csv,
                             csv_data_file_path=csv_data_file_path)

    my_print("{}Train finished.\n".format(SUCCESS_FLAG))
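
A minimal usage sketch, assuming this function and its dependencies (Resources, LDAMWHandler) are importable from the project; the resource name, language code, and CSV path below are hypothetical placeholders:

# Hedged example: train a model for a single resource described by a CSV export.
# All argument values are hypothetical.
train_models_for_resources(data_type='csv',
                           resources='example_resource',
                           resource_lang_csv='en',
                           csv_data_file_path='data/articles.csv')

# With data_type='db' and resources=None, a model is trained for every
# resource found in the Resources table instead.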
Example #2
def save_topics_to_csv(save_to_file_path, df):
    try:
        df.to_csv(save_to_file_path)
        my_print("{} Topics saved to [ {} ]".format(SUCCESS_FLAG,
                                                    save_to_file_path))
    except Exception as e:
        my_print("{} Can't save topics to [ {} ]: {}".format(
            ERROR_FLAG, save_to_file_path, e))
Example #3
def get_subjectivity_analyzer(lang):
    # Note: lang is currently unused; the pickled analyzer path below is fixed.
    try:
        sa_subj_data_file_path = 'nltk_data/sa_subjectivity.pickle'

        sentim_analyzer = load(DEFAULT_PROJECT_PATH + sa_subj_data_file_path)

    except LookupError:
        my_print(
            '{}Cannot find the sentiment analyzer you want to load.'.format(
                WARNING_FLAG))
        my_print(
            '{}Training & saving a new one using NaiveBayesClassifier.'.format(
                WARNING_FLAG))

        sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True)

    return sentim_analyzer
Example #4
def get_correlation_metric(resource, csv_data_file_path, data_type):
    resources_iterator = []

    if data_type == 'db':
        if resource is None:
            rdata = Resources.select().iterator()
            resources_iterator = [elem.__data__['resource'] for elem in rdata]
        else:
            if is_resource_exists(resource):
                resources_iterator = [resource]
            else:
                my_print("{}Resource [ {} ] not found. Exiting ...".format(
                    ERROR_FLAG, resource))

    elif data_type == 'csv':
        resources_iterator = [resource]

    gstart_time = datetime.datetime.now()

    for _resource in resources_iterator:
        lstart_time = datetime.datetime.now()

        _get_correlation_metric_from_resource(
            _resource,
            csv_data_file_path=csv_data_file_path,
            data_type=data_type)

        my_print("{}Correlation for [ {} ] calculated in {}".format(
            INFO_FLAG, _resource,
            datetime.datetime.now() - lstart_time))

    my_print("{}Correlation for [ {} ] calculated in {}".format(
        INFO_FLAG, "All resources",
        datetime.datetime.now() - gstart_time))
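
A hedged call sketch; the resource identifier and CSV path are hypothetical placeholders:

# Hedged example: correlation metrics for one resource loaded from a CSV file.
get_correlation_metric('example_resource',
                       csv_data_file_path='data/articles.csv',
                       data_type='csv')

# Passing resource=None with data_type='db' iterates over every resource in the database.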
Example #5
def merge_topics_with_in_csv(input_file_path,
                             save_to_file_path,
                             df,
                             on='id',
                             how='outer'):
    try:
        df_in = pd.read_csv(input_file_path)

        if df_in.index.name is None or df_in.index.name != on:
            df_in.set_index([on], inplace=True)

        dfinal = df_in.merge(df, on=on, how=how)
        no_unnamed_columns = [i for i in dfinal.columns if "Unnamed" not in i]

        dfinal = dfinal[no_unnamed_columns]

        dfinal.to_csv(save_to_file_path)
        my_print("{} Topics saved to [ {} ]".format(SUCCESS_FLAG,
                                                    save_to_file_path))
    except Exception as e:
        my_print("{} {}".format(EXCEPTION_FLAG, e))
        my_print("{} Cant save topics to [ {} ]".format(
            ERROR_FLAG, save_to_file_path))
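
For illustration, a hedged sketch of the merge behaviour, assuming an input CSV that carries an id column and a topics DataFrame keyed the same way (file names and column names are hypothetical):

import pandas as pd

# Hypothetical topics per article, keyed by the same 'id' as the input CSV.
topics_df = pd.DataFrame({'id': [1, 2], 'topic_0': [0.71, 0.08]}).set_index('id')

# Outer-joins input.csv with topics_df on 'id', drops stray "Unnamed" columns,
# and writes the result to merged.csv.
merge_topics_with_in_csv('data/input.csv', 'data/merged.csv', topics_df,
                         on='id', how='outer')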
Example #6
def run(corr_calc,
        base_calc,
        add_calc,
        resource,
        last_added_only,
        data_type,
        csv_data_input_file_path=None,
        csv_data_output_file_path=None):
    if data_type == 'db':
        articles = get_articles_from_db(resource_id=resource,
                                        last_added_only=last_added_only)

    elif data_type == 'csv':
        articles = get_articles_from_csv(
            resource=resource, csv_file_path=csv_data_input_file_path)
    else:
        raise Exception("Cant read data <articles>. Exiting ...")

    articles_simple, articles_additional, articles_for_data_params_simple, articles_for_data_params_additional = tee(
        articles, 4)

    if base_calc:

        my_print(
            "{}Going to calculate simple parameters for [ {} ] ...".format(
                INFO_FLAG,
                resource if resource is not None else "All resources"))

        time_start = datetime.now()

        # extracted_simple_parameters = (process_for_simple(a_data) for a_data in articles_for_data_params_simple)

        if data_type == 'db':
            n_cores = multiprocessing.cpu_count()
            pool = multiprocessing.Pool(n_cores)

            extracted_simple_parameters = pool.imap(
                process_for_simple, articles_for_data_params_simple)

            # imap is lazy: keep the pool alive until its results are consumed
            for art, ltc_params in zip(articles_simple,
                                       extracted_simple_parameters):
                for param in ltc_params:
                    if param is not None:
                        save_parameters(art, param)

            del pool

        elif data_type == 'csv':
            n_cores = multiprocessing.cpu_count()
            pool = multiprocessing.Pool(n_cores)

            extracted_simple_parameters = pool.map(
                process_for_simple, articles_for_data_params_simple)
            # extracted_simple_parameters = (process_for_simple(i) for i in articles_for_data_params_simple)

            if csv_data_output_file_path is not None:
                output_file = csv_data_output_file_path
            else:
                output_file = "{}_simple_parameters.csv".format(resource)

            csv_resource_simple_parameters_df = pd.DataFrame()

            # reuse the worker pool created above instead of opening a second one
            a_tmps = pool.map(
                _process_csv_pool,
                zip(articles_simple, extracted_simple_parameters))

            for tmp in a_tmps:
                csv_resource_simple_parameters_df = csv_resource_simple_parameters_df.append(
                    [tmp])

            del pool

            # for art, ltc_params in zip(articles_simple, extracted_simple_parameters):
            # 	a_id = art[-1]
            #
            # 	p00 = ltc_params[0]
            # 	p11 = ltc_params[1]
            # 	p22 = ltc_params[2]
            #
            # 	p00_d = {'element_type_{}_'.format(p00['element_type']) + k: v for k, v in p00.items()}
            # 	p11_d = {'element_type_{}_'.format(p11['element_type']) + k: v for k, v in p11.items()}
            # 	p22_d = {'element_type_{}_'.format(p22['element_type']) + k: v for k, v in p22.items()}
            #
            # 	tmp = {'id': a_id, **p00_d, **p11_d, **p22_d}
            #
            # 	csv_resource_simple_parameters_df = csv_resource_simple_parameters_df.append([tmp])

            csv_resource_simple_parameters_df.set_index(['id'], inplace=True)

            merge_parameters_with_in_csv(csv_data_input_file_path, output_file,
                                         csv_resource_simple_parameters_df)

        my_print(
            "{}Resources: [ {} ]; Simple parameters calculated in {}".format(
                SUCCESS_FLAG,
                resource if resource is not None else "All resources",
                datetime.now() - time_start))

    if corr_calc:
        my_print(
            "{}Going to calculate articles parameters correlation for [ {} ] ..."
            .format(INFO_FLAG,
                    resource if resource is not None else "All resources"))
        time_start = datetime.now()

        get_correlation_metric(resource,
                               csv_data_file_path=csv_data_input_file_path,
                               data_type=data_type)

        my_print("{}Resources: [ {} ]; Correlation calculated in {}".format(
            SUCCESS_FLAG,
            resource if resource is not None else "All resources",
            datetime.now() - time_start))

    if add_calc:
        my_print(
            "{}Going to calculate additional parameters for [ {} ] ...".format(
                INFO_FLAG,
                resource if resource is not None else "All resources"))

        time_start = datetime.now()

        if data_type == 'db':
            # a fresh pool is needed here: none is guaranteed to exist at this point
            pool = multiprocessing.Pool(multiprocessing.cpu_count())
            extracted_additional_parameters = pool.imap(
                process_for_additional, articles_for_data_params_additional)

            for art, params in zip(articles_additional,
                                   extracted_additional_parameters):

                if params is not None:
                    save_additional_parameters(params, art)

            del pool

        elif data_type == 'csv':
            extracted_additional_parameters = (
                process_for_additional(i)
                for i in articles_for_data_params_additional)

            if csv_data_output_file_path is not None:
                output_file = csv_data_output_file_path
            else:
                output_file = "{}_additional_parameters.csv".format(resource)

            csv_resource_additional_parameters_df = pd.DataFrame(
            )  #columns=['id', "n_title_symbols", "n_title_numbers", "n_title_letters", "n_title_words", "n_title_mean_letters_in_words", "title_words_diff_emotions", "title_angry", "title_anticipation", "title_disgust", "title_fear", "title_joy", "title_sadness", "title_surprise", "title_trust", "title_neg", "title_pos", "most_frequent_title_word_len", "most_frequent_title_word_count", "title_max_term_length", "n_lead_symbols", "n_lead_numbers", "n_lead_letters", "n_lead_words", "n_lead_mean_letters_in_words", "lead_words_diff_emotions", "lead_angry", "lead_anticipation", "lead_disgust", "lead_fear", "lead_joy", "lead_sadness", "lead_surprise", "lead_trust", "lead_neg", "lead_pos", "most_frequent_lead_word_len", "most_frequent_lead_word_count", "lead_max_term_length", "content_dots_count", "content_commas_count", "content_exclamation_marks_count", "content_question_marks_count", "n_content_symbols", "n_content_numbers", "n_content_letters", "n_content_words", "n_content_mean_letters_in_words", "content_mean_words_count", "content_sentences_count", "max_count_words_in_sent_content", "min_count_words_in_sent_content", "content_total_words_count", "max_freq_of_term_in_content", "min_freq_of_term_in_content", "max_term_length_content", "content_sum_emotionality", "content_mean_emotionality", "content_max_emotionality_sentences", "content_min_emotionality_sentences", "content_mean_emo_of_sentences", "content_angry", "content_anticipation", "content_disgust", "content_fear", "content_joy", "content_sadness", "content_surprise", "content_trust", "content_neg", "content_pos", "title_uniq_wd", "title_complx", "title_snt_len", "title_syll_ct", "title_flesch", "lead_uniq_wd", "lead_complx", "lead_snt_len", "lead_syll_ct", "lead_flesch", "content_ari", "title_ari", "lead_ari", "content_coleman", "content_db1", "content_db2", "content_db_grade", "content_ds", "content_herdan", "content_cttr", "content_hdd", "content_yueles_k", "content_maas_1", "content_mtld", "content_rld", "content_sld", "content_ttr", "title_ttr", "lead_ttr", "content_count_of_types", "content_count_of_tokens", "title_count_of_types", "title_count_of_tokens", "lead_count_of_types", "lead_count_of_tokens", "content_uber", "content_growth_vocabl"])

            for art, params in zip(articles_additional,
                                   extracted_additional_parameters):

                a_id = art[-1]

                tmp = {'id': a_id, **params}

                csv_resource_additional_parameters_df = csv_resource_additional_parameters_df.append(
                    [tmp])

            csv_resource_additional_parameters_df.set_index('id', inplace=True)

            merge_parameters_with_in_csv(
                csv_data_input_file_path, output_file,
                csv_resource_additional_parameters_df)

        my_print("{}Additional parameters for {} calculated in {}".format(
            SUCCESS_FLAG,
            resource if resource is not None else "All resources",
            datetime.now() - time_start))
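
A hedged sketch of driving this entry point for a CSV export; base_calc, corr_calc, and add_calc select the three passes above, and the resource name and paths are hypothetical:

run(corr_calc=False,
    base_calc=True,            # simple per-article parameters only
    add_calc=False,
    resource='example_resource',
    last_added_only=False,
    data_type='csv',
    csv_data_input_file_path='data/articles.csv',
    csv_data_output_file_path='data/articles_simple_parameters.csv')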
Example #7
    def __call__(self, text):
        try:
            words = nltk.tokenize.word_tokenize(text)

            pos_valuences = list(
                filter(lambda x: x > 0, [
                    self.analyzer.lexicon.get(word.lower(), 0)
                    for word in words
                ]))
            neg_valuences = list(
                filter(lambda x: x < 0, [
                    self.analyzer.lexicon.get(word.lower(), 0)
                    for word in words
                ]))

            nwords, npos, nneg = len(words), len(pos_valuences), len(
                neg_valuences)

            polarity = self.analyzer.polarity_scores(text)

            return (
                ('global_negative_polarity', polarity['neg']),
                ('global_positive_polarity', polarity['pos']),
                ('global_neutral_polarity', polarity['neu']),
                ('global_sentiment_polarity', polarity['compound']),
                ('global_rate_positive_words',
                 self._sigmoid(npos /
                               nwords) if nwords else self._sigmoid(npos)),
                ('global_rate_negative_words',
                 self._sigmoid(nneg /
                               nwords) if nwords else self._sigmoid(nneg)),
                ('rate_positive_words', self._sigmoid(npos / (npos + nneg))
                 if npos or nneg else self._sigmoid(npos)),
                ('rate_negative_words', self._sigmoid(nneg / (npos + nneg))
                 if npos or nneg else self._sigmoid(nneg)),
                ('avg_positive_polarity',
                 self._sigmoid(sum(pos_valuences) / npos)
                 if npos else self._sigmoid(sum(pos_valuences))),
                ('min_positive_polarity',
                 self._sigmoid(min(pos_valuences)) if pos_valuences else 0.0),
                ('max_positive_polarity',
                 self._sigmoid(max(pos_valuences)) if pos_valuences else 0.0),
                ('avg_negative_polarity',
                 self._sigmoid(sum(neg_valuences) / nneg)
                 if nneg else self._sigmoid(sum(neg_valuences))),
                ('min_negative_polarity',
                 self._sigmoid(min(neg_valuences)) if neg_valuences else 0.0),
                ('max_negative_polarity',
                 self._sigmoid(max(neg_valuences)) if neg_valuences else 0.0),
            )
        except Exception as e:
            my_print("{}{}".format(EXCEPTION_FLAG, e))
            return (
                ('global_negative_polarity', 0),
                ('global_positive_polarity', 0),
                ('global_neutral_polarity', 0),
                ('global_sentiment_polarity', 0),
                ('global_rate_positive_words', 0),
                ('global_rate_negative_words', 0),
                ('rate_positive_words', 0),
                ('rate_negative_words', 0),
                ('avg_positive_polarity', 0),
                ('min_positive_polarity', 0),
                ('max_positive_polarity', 0),
                ('avg_negative_polarity', 0),
                ('min_negative_polarity', 0),
                ('max_negative_polarity', 0),
            )
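
The enclosing class is not shown; from the method above it only needs self.analyzer (something exposing a lexicon mapping and polarity_scores, which matches NLTK's VADER SentimentIntensityAnalyzer) and a _sigmoid helper. A minimal host class under those assumptions:

import math

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# One-time NLTK data downloads are assumed to have been done:
# nltk.download('punkt'); nltk.download('vader_lexicon')


class SentimentFeatures:
    """Hypothetical host for the __call__ method above; not part of the original listing."""

    def __init__(self):
        self.analyzer = SentimentIntensityAnalyzer()

    @staticmethod
    def _sigmoid(x):
        # squash word ratios and valences into (0, 1)
        return 1.0 / (1.0 + math.exp(-x))

    # __call__ from Example #7 would be defined here.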
Example #8
def run(resource=None,
        period=None,
        last_added_only=False,
        data_type=None,
        csv_data_input_file_path=None,
        csv_data_output_file_path=None):
    gtime_start = datetime.datetime.now()

    if data_type == 'db':
        if resource is None:
            rdata = Resources.select().iterator()
            resources_iterator = [elem.__data__['resource'] for elem in rdata]
        else:
            if is_resource_exists(resource):
                resources_iterator = [resource]
            else:
                my_print("{}Resource [ {} ] not found. Exiting ...".format(
                    ERROR_FLAG, resource))
                return

        ps_resources = (get_articles_from_db(resource_id=r_id,
                                             period=period,
                                             last_added_only=last_added_only)
                        for r_id in resources_iterator)

        for ps, res in zip(ps_resources, resources_iterator):
            ltime_start = datetime.datetime.now()

            ps, articles_s, data = tee(ps, 3)

            # check content; process if it's not too short or empty
            data = ((p.lead, p.title, p.content, res) for p in data
                    if (is_text(p, None)[0]))

            pool = Pool()

            params = pool.map(topic_ltc_by_resource, data)

            for art, prms in zip(articles_s, params):
                for par in prms:
                    if par is not None:
                        save_parameters(par, art)

            my_print("{} [ {} ] :: LDA topics calculated in {}".format(
                SUCCESS_FLAG, res,
                datetime.datetime.now() - ltime_start))

            del pool

        if len(resources_iterator) != 1:
            my_print("{}{} :: calculated in {}".format(
                SUCCESS_FLAG, " ".join(resources_iterator),
                datetime.datetime.now() - gtime_start))

    elif data_type == 'csv':
        if resource is None:
            raise Exception("Resource cant be undefined for csv data_type.")

        ps_csv_resource = get_articles_from_csv(resource,
                                                csv_data_input_file_path)

        ltime_start = datetime.datetime.now()

        ps, articles_s, data = tee(ps_csv_resource, 3)

        if csv_data_output_file_path is not None:
            output_file = csv_data_output_file_path
        else:
            output_file = "{}_topics.csv".format(resource)

        csv_resource_topics_df = pd.DataFrame()

        pool = Pool()
        a_tmps = pool.map(_process_csv_pool, data)

        for tmp in a_tmps:
            csv_resource_topics_df = csv_resource_topics_df.append([tmp])

        del pool

        csv_resource_topics_df.set_index('id', inplace=True)

        my_print("{} [ {} ] :: LDA topics calculated in {}".format(
            SUCCESS_FLAG, resource,
            datetime.datetime.now() - ltime_start))

        # save_topics_to_csv(output_file, csv_resource_topics_df)
        merge_topics_with_in_csv(csv_data_input_file_path, output_file,
                                 csv_resource_topics_df)
    else:
        # unsupported data_type: nothing to do
        pass
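
A hedged sketch of the CSV path through this entry point; in 'csv' mode the resource must be given, and the merged topics end up in csv_data_output_file_path (or <resource>_topics.csv when it is omitted). The values below are hypothetical:

run(resource='example_resource',
    data_type='csv',
    csv_data_input_file_path='data/articles.csv',
    csv_data_output_file_path='data/articles_topics.csv')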