def calc_similarity_pairs(text_ids, en_nlp, el_nlp, lang_det, cutoff):
    """
    Split the list of texts into Greek and English, then calculate
    the similarity pairs for each language, if possible.
    """
    # Not enough texts to compare; return early.
    if len(text_ids) < 2:
        return []

    # Detect the language for each text and assign it.
    texts = [(id, text, detect_language(lang_det, text))
             for (id, text) in text_ids]

    # Split the texts between English and Greek.
    en_texts = [text for text in texts if text[2] == 'english']
    el_texts = [text for text in texts if text[2] == 'greek']

    # Calculate all textual similarity pairs, per language.
    sim_pairs_en = (textual_similarity(en_nlp, 'english', en_texts, en_texts)
                    if len(en_texts) >= 2 else [])
    sim_pairs_el = (textual_similarity(el_nlp, 'greek', el_texts, el_texts)
                    if len(el_texts) >= 2 else [])

    # Keep only the pairs whose similarity score meets the cutoff.
    return [
        sim_pair for sim_pair in sim_pairs_en + sim_pairs_el
        if sim_pair[1] >= cutoff
    ]
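
# Usage sketch for calc_similarity_pairs (hypothetical data; assumes the two
# spaCy pipelines en_nlp/el_nlp, the lang_det detector object, and this
# module's detect_language/textual_similarity helpers are already loaded):
#
#   text_ids = [
#       (1, 'Public transport should be free.'),
#       (2, 'Free public transport would reduce traffic.'),
#       (3, 'Οι δημόσιες συγκοινωνίες πρέπει να είναι δωρεάν.'),
#   ]
#   pairs = calc_similarity_pairs(text_ids, en_nlp, el_nlp, lang_det, cutoff=0.5)
#   # Each returned pair carries its similarity score at index 1,
#   # already filtered against the cutoff.
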
def train_classifiers(discussions, lang_det):
    english_classifier = ArgumentClassifier()
    greek_classifier = ArgumentClassifier()
    english_texts, english_labels = [], []
    greek_texts, greek_labels = [], []

    for discussion in discussions:
        # Skip issues and solutions; only argument positions are used as labels.
        if discussion['Position'] in ['Issue', 'Solution']:
            continue
        text = discussion['DiscussionText']
        language = detect_language(lang_det, text)
        text = remove_punctuation_and_whitespace(text)
        if language == 'english':
            english_texts.append(text)
            english_labels.append(discussion['Position'])
        elif language == 'greek':
            greek_texts.append(text)
            greek_labels.append(discussion['Position'])

    english_classifier.train(english_texts, english_labels)
    greek_classifier.train(greek_texts, greek_labels)

    # Store the fitted models as class attributes for later use.
    ArgumentClassifier.english_classifier = english_classifier
    ArgumentClassifier.greek_classifier = greek_classifier

    if config.debug:
        print(f'English texts accuracy score '
              f'{ArgumentClassifier.english_classifier.score(english_texts, english_labels)} %')
        print(f'Greek texts accuracy score '
              f'{ArgumentClassifier.greek_classifier.score(greek_texts, greek_labels)} %')
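
# Training-data sketch for train_classifiers (hypothetical records and
# position labels; real discussions come from the application's database):
#
#   discussions = [
#       {'Position': 'Issue', 'DiscussionText': 'Should parking be free?'},
#       {'Position': 'In favor', 'DiscussionText': 'It would help local shops.'},
#       {'Position': 'Against', 'DiscussionText': 'It would increase traffic.'},
#   ]
#   train_classifiers(discussions, lang_det)
#   # The fitted models then become available as
#   # ArgumentClassifier.english_classifier and ArgumentClassifier.greek_classifier.
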
def fit_clusterers(discussions, lang_det, en_nlp, el_nlp):
    english_clusterer = None
    greek_clusterer = None
    english_texts, greek_texts = [], []

    for discussion in discussions:
        # Skip issues; everything else is clustered.
        if discussion['Position'] in ['Issue']:
            continue
        text = discussion['DiscussionText']
        language = detect_language(lang_det, text)
        text = remove_punctuation_and_whitespace(text)
        if language == 'english':
            english_texts.append(text)
        elif language == 'greek':
            greek_texts.append(text)

    if len(english_texts) > 2:
        # Initialize the English clusterer.
        english_clusterer = ArgumentClusterer()

        # Calculate the embeddings for each English text.
        english_embeddings = [
            en_nlp.tokenizer(text).vector for text in english_texts
        ]

        # Fit the clusterer on the textual embeddings.
        english_clusterer.fit(english_embeddings, 'english.pdf')

        # Map each cluster label to the text of its medoid.
        english_clusterer.__medoid_texts = {
            str(english_clusterer.__clusterer.labels_[i]): english_texts[i]
            for i in english_clusterer.__clusterer.medoid_indices_
        }

    if len(greek_texts) > 2:
        # Initialize the Greek clusterer.
        greek_clusterer = ArgumentClusterer()

        # Calculate the embeddings for each Greek text.
        greek_embeddings = [
            el_nlp.tokenizer(text).vector for text in greek_texts
        ]

        # Fit the clusterer on the textual embeddings.
        greek_clusterer.fit(greek_embeddings, 'greek.pdf')

        # Map each cluster label to the text of its medoid.
        greek_clusterer.__medoid_texts = {
            str(greek_clusterer.__clusterer.labels_[i]): greek_texts[i]
            for i in greek_clusterer.__clusterer.medoid_indices_
        }

    # Store the fitted clusterers as class attributes for later use.
    ArgumentClusterer.english_clusterer = english_clusterer
    ArgumentClusterer.greek_clusterer = greek_clusterer
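
# Embedding sketch: en_nlp.tokenizer(text) runs only spaCy's tokenizer (no
# tagger or parser), and Doc.vector then averages the static word vectors of
# the tokens, so pipelines shipping pretrained vectors (e.g. en_core_web_md)
# are assumed here:
#
#   doc = en_nlp.tokenizer('Free transport reduces traffic.')
#   embedding = doc.vector  # numpy array, e.g. shape (300,)
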
def suggest_argument_types(discussions, lang_det):
    res = []
    for discussion in discussions:
        text = discussion['DiscussionText']
        language = detect_language(lang_det, text)
        text = remove_punctuation_and_whitespace(text)

        # Predict with the classifier matching the detected language.
        if language == 'english':
            predicted = ArgumentClassifier.english_classifier.predict([text])[0]
        elif language == 'greek':
            predicted = ArgumentClassifier.greek_classifier.predict([text])[0]
        else:
            continue

        # Only suggest an argument type when it differs from the current one.
        if predicted != discussion['Position']:
            res.append({
                'id': discussion['id'],
                'suggested_argument_type': predicted,
                'text': discussion['DiscussionText'],
            })
    return {'suggested_argument_types': res}
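
# Usage sketch for suggest_argument_types (hypothetical record and labels;
# assumes train_classifiers has already populated the class attributes):
#
#   discussions = [{'id': 42, 'Position': 'In favor',
#                   'DiscussionText': 'This plan is far too expensive.'}]
#   suggest_argument_types(discussions, lang_det)
#   # -> {'suggested_argument_types': [{'id': 42,
#   #      'suggested_argument_type': 'Against',
#   #      'text': 'This plan is far too expensive.'}]}
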
def aggregate_summaries_keyphrases(workspace, lang_det, en_nlp, el_nlp, top_n, top_sent):
    """
    Aggregate the previously produced summaries of a workspace,
    then produce keyphrases from the aggregated summary.
    """
    # Initialize the results.
    results = {
        'Aggregated': {'Summary': '', 'Keyphrases': []},
    }

    # Aggregate all previously produced summaries.
    aggregated_summary = ' '.join(
        summary for item in workspace.values()
        for summary in item['Summaries']
    ).replace('\n', ' ')

    # If the aggregated summary is not empty, run TextRank.
    if aggregated_summary:
        # Detect the language of the aggregated summary.
        language = detect_language(lang_det, aggregated_summary)

        # Select the nlp object depending on the language.
        nlp = en_nlp if language == 'english' else el_nlp

        # Run TextRank on the aggregated summary.
        doc = run_textrank(aggregated_summary, nlp)

        results['Aggregated'] = {
            'Summary': text_summarization(doc, nlp, top_n, top_sent),
            'Keyphrases': keyword_extraction(doc, nlp, language, 2 * top_n)
        }

    return results
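
# Input-shape sketch for aggregate_summaries_keyphrases (hypothetical
# contents; each workspace item is expected to carry a 'Summaries' list):
#
#   workspace = {
#       'discussion-1': {'Summaries': ['First summary.', 'Second summary.']},
#       'discussion-2': {'Summaries': ['Third summary.']},
#   }
#   aggregate_summaries_keyphrases(workspace, lang_det, en_nlp, el_nlp,
#                                  top_n=10, top_sent=3)
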
def summarize_communities(database, en_nlp, el_nlp, lang_det, top_n, top_sent):
    """
    Perform text summarization on all communities
    and return their summaries.
    """
    communities = extract_id_texts_from_communities(database)

    # If no communities exist, exit early.
    if not communities:
        return {}

    results = {community: None for community in communities.keys()}

    # Iterate over each community id and its contents.
    for community, (position, ids, text) in communities.items():
        # If the community contains no text,
        # or contains fewer than 2 documents,
        # then don't summarize it.
        if text == '' or len(ids) < 2:
            continue

        # Detect the language of the text.
        language = detect_language(lang_det, text)

        # Select the nlp object depending on the language.
        nlp = en_nlp if language == 'english' else el_nlp

        # Run TextRank and obtain the processed document.
        doc = run_textrank(text, nlp)

        # Insert the summarization results into the dict.
        results[community] = [
            position, ids,
            text_summarization(doc, nlp, top_n, top_sent)
        ]

    return results
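
# Input-shape sketch for summarize_communities (hypothetical values):
# extract_id_texts_from_communities is expected to map each community id
# to a (position, ids, text) triple, e.g.
#
#   communities = {
#       0: ('In favor', [3, 7], 'First argument. Second argument.'),
#   }
#   # Communities with empty text or fewer than 2 documents keep None
#   # as their result.
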
def suggest_clusters(discussions, lang_det, en_nlp, el_nlp):
    # The workspace doesn't have enough discussions, early exit.
    if len(discussions) < 3:
        return {'greek_clusters': {}, 'english_clusters': {}}

    # Fit all clusterers for all discussions of a single workspace.
    ArgumentClusterer.fit_clusterers(discussions, lang_det, en_nlp, el_nlp)

    english_clusters = {
        label: {'nodes': [], 'texts': [], 'summary': '', 'medoid_text': ''}
        for label in map(
            str, ArgumentClusterer.english_clusterer.__clusterer.labels_)
    } if ArgumentClusterer.english_clusterer is not None else {}

    greek_clusters = {
        label: {'nodes': [], 'texts': [], 'summary': '', 'medoid_text': ''}
        for label in map(
            str, ArgumentClusterer.greek_clusterer.__clusterer.labels_)
    } if ArgumentClusterer.greek_clusterer is not None else {}

    for discussion in discussions:
        if discussion['Position'] in ['Issue', 'Solution']:
            continue
        text = discussion['DiscussionText']
        language = detect_language(lang_det, text)
        text = remove_punctuation_and_whitespace(text)

        if language == 'english':
            if ArgumentClusterer.english_clusterer is None:
                continue
            predicted = str(ArgumentClusterer.english_clusterer.predict(
                [en_nlp.tokenizer(text).vector])[0])
            english_clusters[predicted]['nodes'].append(discussion['id'])
            english_clusters[predicted]['texts'].append(text)
            english_clusters[predicted]['medoid_text'] = \
                ArgumentClusterer.english_clusterer.__medoid_texts[predicted]
        elif language == 'greek':
            if ArgumentClusterer.greek_clusterer is None:
                continue
            predicted = str(ArgumentClusterer.greek_clusterer.predict(
                [el_nlp.tokenizer(text).vector])[0])
            greek_clusters[predicted]['nodes'].append(discussion['id'])
            greek_clusters[predicted]['texts'].append(text)
            greek_clusters[predicted]['medoid_text'] = \
                ArgumentClusterer.greek_clusterer.__medoid_texts[predicted]

    # Run textrank on non-empty aggregated text from each cluster for each language.
    for en_cluster in english_clusters.keys():
        en_text = '. '.join(english_clusters[en_cluster]['texts'])
        if en_text != '':
            en_doc = run_textrank(en_text, en_nlp)
            english_clusters[en_cluster]['summary'] = text_summarization(
                en_doc, en_nlp, config.top_n, config.top_sent)

    for el_cluster in greek_clusters.keys():
        el_text = '. '.join(greek_clusters[el_cluster]['texts'])
        if el_text != '':
            el_doc = run_textrank(el_text, el_nlp)
            greek_clusters[el_cluster]['summary'] = text_summarization(
                el_doc, el_nlp, config.top_n, config.top_sent)

    return {
        'greek_clusters': greek_clusters,
        'english_clusters': english_clusters
    }
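
# Usage sketch for suggest_clusters (hypothetical output; the shape follows
# the per-language dictionaries built above):
#
#   clusters = suggest_clusters(discussions, lang_det, en_nlp, el_nlp)
#   # clusters['english_clusters'] maps each cluster label to
#   # {'nodes': [...], 'texts': [...], 'summary': '...', 'medoid_text': '...'}
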