Example #1
def calc_similarity_pairs(text_ids, en_nlp, el_nlp, lang_det, cutoff):
    """
    This function splits the list of texts into greek and english,
    then calculates the similarity pairs for each language, if possible.
    """

    # Not enough texts to compare; return early.
    if len(text_ids) < 2:
        return []

    # Detect the language of each text and attach it to its tuple.
    texts = [(text_id, text, detect_language(lang_det, text))
             for (text_id, text) in text_ids]

    # Split the texts between English and Greek.
    en_texts = [text for text in texts if text[2] == 'english']
    el_texts = [text for text in texts if text[2] == 'greek']

    # Calculate all textual similarity pairs.
    sim_pairs_en = (textual_similarity(en_nlp, 'english', en_texts, en_texts)
                    if len(en_texts) >= 2 else [])

    sim_pairs_el = (textual_similarity(el_nlp, 'greek', el_texts, el_texts)
                    if len(el_texts) >= 2 else [])

    return [
        sim_pair for sim_pair in sim_pairs_en + sim_pairs_el
        if sim_pair[1] >= cutoff
    ]
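
The two helpers used above, detect_language and textual_similarity, are not shown in these examples. A minimal sketch of compatible implementations, assuming lang_det is a fastText-style model whose predict() returns labels such as '__label__en', and that a similarity pair is an ((id_a, id_b), score) tuple scored via spaCy document similarity. Both signatures are inferred from the call sites, not confirmed by the source.

def detect_language(lang_det, text):
    # Hypothetical: maps fastText-style labels onto the language names
    # used throughout these examples.
    label = lang_det.predict(text.replace('\n', ' '))[0][0]
    return {'__label__en': 'english', '__label__el': 'greek'}.get(label, 'other')


def textual_similarity(nlp, language, texts_a, texts_b):
    # Hypothetical: scores each pair of distinct texts by spaCy document
    # similarity; language is unused in this sketch. The score is the
    # sim_pair[1] element filtered against the cutoff above.
    docs_a = [(text_id, nlp(text)) for (text_id, text, _) in texts_a]
    docs_b = [(text_id, nlp(text)) for (text_id, text, _) in texts_b]
    return [((id_a, id_b), doc_a.similarity(doc_b))
            for (id_a, doc_a) in docs_a
            for (id_b, doc_b) in docs_b if id_a < id_b]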
Example #2
    def train_classifiers(discussions, lang_det):
        english_classifier = ArgumentClassifier()
        greek_classifier = ArgumentClassifier()

        english_texts, english_labels = [], []
        greek_texts, greek_labels = [], []
        for discussion in discussions:
            if discussion['Position'] in ['Issue', 'Solution']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                english_texts.append(text)
                english_labels.append(discussion['Position'])
            elif language == 'greek':
                greek_texts.append(text)
                greek_labels.append(discussion['Position'])

        english_classifier.train(english_texts, english_labels)
        greek_classifier.train(greek_texts, greek_labels)
        ArgumentClassifier.english_classifier = english_classifier
        ArgumentClassifier.greek_classifier = greek_classifier

        if config.debug:
            # Training-set scores; score() is assumed to return mean
            # accuracy in [0, 1], hence the scaling to a percentage.
            en_score = ArgumentClassifier.english_classifier.score(
                english_texts, english_labels)
            el_score = ArgumentClassifier.greek_classifier.score(
                greek_texts, greek_labels)
            print(f'English texts accuracy score: {100 * en_score:.2f}%')
            print(f'Greek texts accuracy score: {100 * el_score:.2f}%')
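
ArgumentClassifier itself is not part of these examples. A minimal sketch of a class compatible with the calls above (train, predict, score, and the two class-level slots), assuming a scikit-learn TF-IDF plus logistic-regression pipeline; the actual model choice is not given by the source.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


class ArgumentClassifier:
    # Class-level slots populated by train_classifiers().
    english_classifier = None
    greek_classifier = None

    def __init__(self):
        # Hypothetical model; the source does not specify one.
        self._model = make_pipeline(
            TfidfVectorizer(), LogisticRegression(max_iter=1000))

    def train(self, texts, labels):
        self._model.fit(texts, labels)

    def predict(self, texts):
        return self._model.predict(texts)

    def score(self, texts, labels):
        # Mean accuracy in [0, 1], following scikit-learn conventions.
        return self._model.score(texts, labels)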
Example #3
    def fit_clusterers(discussions, lang_det, en_nlp, el_nlp):
        english_clusterer = None
        greek_clusterer = None

        english_texts, greek_texts = [], []
        for discussion in discussions:
            if discussion['Position'] in ['Issue']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                english_texts.append(text)
            elif language == 'greek':
                greek_texts.append(text)

        if len(english_texts) > 2:
            # Initialize the English clusterer.
            english_clusterer = ArgumentClusterer()

            # Calculate an embedding for each English text.
            english_embeddings = [
                en_nlp.tokenizer(text).vector for text in english_texts
            ]

            # Fit the clusterer on the English embeddings.
            english_clusterer.fit(english_embeddings, 'english.pdf')

            # Map each cluster label to the text of its medoid.
            english_clusterer.__medoid_texts = {
                str(english_clusterer.__clusterer.labels_[i]): english_texts[i]
                for i in english_clusterer.__clusterer.medoid_indices_
            }

        if len(greek_texts) > 2:
            # Initialize the Greek clusterer.
            greek_clusterer = ArgumentClusterer()

            # Calculate an embedding for each Greek text.
            greek_embeddings = [
                el_nlp.tokenizer(text).vector for text in greek_texts
            ]

            # Fit the clusterer on the Greek embeddings.
            greek_clusterer.fit(greek_embeddings, 'greek.pdf')

            # Map each cluster label to the text of its medoid.
            greek_clusterer.__medoid_texts = {
                str(greek_clusterer.__clusterer.labels_[i]): greek_texts[i]
                for i in greek_clusterer.__clusterer.medoid_indices_
            }

        ArgumentClusterer.english_clusterer = english_clusterer
        ArgumentClusterer.greek_clusterer = greek_clusterer
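
The private attributes accessed above (__clusterer.labels_ and __clusterer.medoid_indices_) match the scikit-learn-extra KMedoids API. A minimal sketch of a compatible ArgumentClusterer, assuming KMedoids with a fixed cluster count; the filename passed to fit() suggests the original also writes a diagnostic plot, which is omitted here. All of this is inferred from the call sites, not confirmed.

import numpy as np
from sklearn_extra.cluster import KMedoids


class ArgumentClusterer:
    # Class-level slots populated by fit_clusterers().
    english_clusterer = None
    greek_clusterer = None

    def __init__(self, n_clusters=3):
        # Hypothetical cluster count; the source does not state it.
        self.__clusterer = KMedoids(n_clusters=n_clusters)
        self.__medoid_texts = {}

    def fit(self, embeddings, plot_filename=None):
        # plot_filename hints at a diagnostic figure in the original;
        # this sketch ignores it.
        self.__clusterer.fit(np.asarray(embeddings))

    def predict(self, embeddings):
        return self.__clusterer.predict(np.asarray(embeddings))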
Example #4
    def suggest_argument_types(discussions, lang_det):
        res = []
        for discussion in discussions:
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                predicted = ArgumentClassifier.english_classifier.predict([text])[0]
            elif language == 'greek':
                predicted = ArgumentClassifier.greek_classifier.predict([text])[0]
            else:
                continue

            if predicted != discussion['Position']:
                res.append({
                    'id': discussion['id'],
                    'suggested_argument_type': predicted,
                    'text': discussion['DiscussionText'],
                })
        return {
            'suggested_argument_types': res
        }
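
remove_punctuation_and_whitespace is called throughout these examples but never defined. A plausible implementation, assuming it strips punctuation and collapses whitespace runs; note that string.punctuation is ASCII-only, so Greek punctuation such as the ano teleia would need to be added explicitly. Purely an assumption from the name.

import re
import string


def remove_punctuation_and_whitespace(text):
    # Drop ASCII punctuation, then collapse consecutive whitespace
    # into single spaces.
    text = text.translate(str.maketrans('', '', string.punctuation))
    return re.sub(r'\s+', ' ', text).strip()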
Example #5
def aggregate_summaries_keyphrases(workspace, lang_det, en_nlp, el_nlp, top_n,
                                   top_sent):
    """
    Function that aggregates summaries from each workspace,
    and produces keyphrases from the aggregated summary.
    """
    # Initialize the results.
    results = {
        'Aggregated': {
            'Summary': '',
            'Keyphrases': []
        },
    }

    # Aggregate all previously produced summaries into a single string.
    aggregated_summary = ' '.join(
        summary for item in workspace.values() for summary in item['Summaries']
    ).replace('\n', ' ')

    # If the aggregated summary is not empty, run TextRank.
    if aggregated_summary:

        # Detect the language of the aggregated summary.
        language = detect_language(lang_det, aggregated_summary)

        # Select the nlp object depending on language.
        nlp = (en_nlp if language == 'english' else el_nlp)

        # Run textrank on the aggregated summary.
        doc = run_textrank(aggregated_summary, nlp)

        results['Aggregated'] = {
            'Summary': text_summarization(doc, nlp, top_n, top_sent),
            'Keyphrases': keyword_extraction(doc, nlp, language, 2 * top_n)
        }
    return results
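
run_textrank, text_summarization, and keyword_extraction are likewise assumed rather than shown. A sketch built on the pytextrank spaCy component, whose documented doc._.textrank.summary() and doc._.phrases attributes fit the calls above; the signatures are inferred from the call sites, and the language argument is unused here.

import pytextrank  # noqa: F401  (registers the 'textrank' spaCy component)


def run_textrank(text, nlp):
    # Assumes nlp.add_pipe('textrank') was called once at startup.
    return nlp(text)


def text_summarization(doc, nlp, top_n, top_sent):
    # Join the top-ranked sentences into a single summary string.
    sentences = doc._.textrank.summary(
        limit_phrases=top_n, limit_sentences=top_sent)
    return ' '.join(sent.text for sent in sentences)


def keyword_extraction(doc, nlp, language, top_n):
    # Return the highest-ranked phrases as keyphrase strings.
    return [phrase.text for phrase in doc._.phrases[:top_n]]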
Example #6
def summarize_communities(database, en_nlp, el_nlp, lang_det, top_n, top_sent):
    """
    Function that performs text summarization on all communities,
    and returns their summaries.
    """
    communities = extract_id_texts_from_communities(database)

    if not communities:  # if no communities exist, exit early.
        return {}

    results = {community: None for community in communities.keys()}

    # Iterate each community id and its contents.
    for community, (position, ids, text) in communities.items():

        # Skip communities that contain no text
        # or fewer than 2 documents.
        if text == '' or len(ids) < 2:
            continue

        # Detect the language of the text.
        language = detect_language(lang_det, text)

        # Select the nlp object depending on language.
        nlp = (en_nlp if language == 'english' else el_nlp)

        # Run textrank and obtain the processed document.
        doc = run_textrank(text, nlp)

        # Insert the results of both methods into the dict.
        results[community] = [
            position, ids,
            text_summarization(doc, nlp, top_n, top_sent)
        ]
    return results
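
extract_id_texts_from_communities is not shown; from the unpacking above it must return a mapping of community id to a (position, ids, text) triple. An illustrative value (names and contents are made up for shape only):

communities = {
    'community-1': (
        'Solution',                      # position shared by the community
        ['node-17', 'node-23'],          # ids of the community's documents
        'First argument. Second one.',   # concatenated text to summarize
    ),
}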
Example #7
    def suggest_clusters(discussions, lang_det, en_nlp, el_nlp):

        # The workspace doesn't have enough discussions, early exit.
        if len(discussions) < 3:
            return {'greek_clusters': {}, 'english_clusters': {}}

        # Fit all clusterers for all discussions of a single workspace.
        ArgumentClusterer.fit_clusterers(discussions, lang_det, en_nlp, el_nlp)
        # Prepare one empty cluster entry per distinct label, per language.
        english_clusters, greek_clusters = {}, {}
        if ArgumentClusterer.english_clusterer is not None:
            english_clusters = {
                label: {'nodes': [], 'texts': [], 'summary': '', 'medoid_text': ''}
                for label in map(
                    str, ArgumentClusterer.english_clusterer.__clusterer.labels_)
            }
        if ArgumentClusterer.greek_clusterer is not None:
            greek_clusters = {
                label: {'nodes': [], 'texts': [], 'summary': '', 'medoid_text': ''}
                for label in map(
                    str, ArgumentClusterer.greek_clusterer.__clusterer.labels_)
            }

        for discussion in discussions:
            if discussion['Position'] in ['Issue', 'Solution']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                if ArgumentClusterer.english_clusterer is None:
                    continue
                predicted = str(ArgumentClusterer.english_clusterer.predict(
                    [en_nlp.tokenizer(text).vector])[0])
                cluster = english_clusters[predicted]
                cluster['nodes'].append(discussion['id'])
                cluster['texts'].append(text)
                cluster['medoid_text'] = (
                    ArgumentClusterer.english_clusterer.__medoid_texts[predicted])
            elif language == 'greek':
                if ArgumentClusterer.greek_clusterer is None:
                    continue
                predicted = str(ArgumentClusterer.greek_clusterer.predict(
                    [el_nlp.tokenizer(text).vector])[0])
                cluster = greek_clusters[predicted]
                cluster['nodes'].append(discussion['id'])
                cluster['texts'].append(text)
                cluster['medoid_text'] = (
                    ArgumentClusterer.greek_clusterer.__medoid_texts[predicted])

        # Run TextRank on the non-empty aggregated text of each cluster,
        # for each language. This runs once, after every discussion has
        # been assigned, rather than on every iteration of the loop above.
        for en_cluster in english_clusters.values():
            en_text = '. '.join(en_cluster['texts'])
            if en_text != '':
                en_doc = run_textrank(en_text, en_nlp)
                en_cluster['summary'] = text_summarization(
                    en_doc, en_nlp, config.top_n, config.top_sent)

        for el_cluster in greek_clusters.values():
            el_text = '. '.join(el_cluster['texts'])
            if el_text != '':
                el_doc = run_textrank(el_text, el_nlp)
                el_cluster['summary'] = text_summarization(
                    el_doc, el_nlp, config.top_n, config.top_sent)

        return {
            'greek_clusters': greek_clusters,
            'english_clusters': english_clusters
        }
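
For reference, an illustrative value of what suggest_clusters returns, assuming a single English cluster labelled '0' (all values are made up for shape only):

example_result = {
    'greek_clusters': {},
    'english_clusters': {
        '0': {
            'nodes': [4, 9],                  # discussion ids in the cluster
            'texts': ['first text', 'second text'],
            'summary': 'first text',          # TextRank summary of the cluster
            'medoid_text': 'first text',      # text of the cluster medoid
        },
    },
}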