def ordered(merged, f_score):
    """Orders the n-grams of a dict using a given score function

    Parameters
    ----------
    merged : dict
        The dict containing the n-grams and their score for each class
    f_score : function
        The score function used to order the n-grams

    Returns
    -------
    list
        the ordered list of n-grams
    """
    print("Ordering grams")

    dida = CONFIG['DIDA_DOCS']
    notdida = CONFIG['NOTDIDA_DOCS']
    # Sort the n-grams by descending score
    merged = sorted(merged.items(),
                    key=lambda kv: f_score(kv[1][dida], kv[1][notdida]),
                    reverse=True)

    display.display_ok("Ordering done")
    return merged

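# A minimal sketch of a score function that could be passed to ordered()
# as f_score. The project's actual `score` function is defined elsewhere;
# this hypothetical version assumes the intent is to rank n-grams by how
# strongly they separate the two classes, i.e. by the absolute difference
# of their normalized occurrence scores.
def score_sketch(dida_score, notdida_score):
    """Example f_score: absolute difference between the class scores."""
    return abs(dida_score - notdida_score)

# Hypothetical usage, mirroring the call in run():
#     merged = ordered(merged, score_sketch)
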
def initialization():
    """Initializes the clusters and their agglomerative information"""
    print("Starting clusters initialization")
    initialize_clusters()
    display.display_ok("Clusters initialization done")

    print("Processing agglomerative information")
    agglomerative_information()
    display.display_ok("Processing agglomerative information done")

def process_ngrams(n, data, data_class):
    """Searches the top n-grams of a publications set and computes the set cover

    Parameters
    ----------
    n : int
        The length of the n-grams
    data : list
        The list of publications to handle
    data_class : str
        The class to handle

    Returns
    -------
    list
        n-grams above the threshold fixed in the config file
    list
        set cover with the n-grams above the threshold
    """
    print("Process for {0}".format(data_class))

    # Number of documents in data
    n_data = len(data)

    # Count the occurrences of each n-gram
    print("Counting occurrences")
    occurrences = ngh.count_occurrences(n, data)

    # Normalize the occurrences
    print("Normalizing occurrences")
    normalized = ngh.normalize_occurrences(occurrences, n_data)

    # Find the n-grams above a given threshold (see config file)
    print("Filtering occurrences")
    subsets = filter(normalized)

    # Find the top n-grams covering all documents
    print("Searching full set cover")
    find_set_cover(normalized, data)

    # Save all the normalized n-grams
    save_to_file(normalized, n, data_class)

    # Plot the n-grams
    plot(normalized, data_class, n)

    # Find the set cover based on the best n-grams
    print("Searching partial set cover")
    set_cover = get_set_cover(subsets)
    exh.write_json(set_cover, SET_COVER_FILENAME.format(data_class, n))

    print("Computing score of partial set cover")
    scores = check_score(set_cover, subsets, normalized, data)
    exh.write_text(scores, SCORE_FILENAME.format(data_class, n))

    display.display_ok("Process for {0} done".format(data_class))

    return subsets, set_cover

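# A hedged sketch of what get_set_cover() could compute; the real
# implementation lives elsewhere in the project. This standalone version
# assumes the classic greedy set-cover approximation, where `subsets` is
# a hypothetical dict mapping each n-gram to the set of IDs of the
# documents it appears in.
def greedy_set_cover_sketch(subsets):
    """Greedily picks n-grams until every document is covered."""
    uncovered = set().union(*subsets.values()) if subsets else set()
    cover = []
    while uncovered:
        # Pick the n-gram covering the most still-uncovered documents
        best = max(subsets, key=lambda g: len(subsets[g] & uncovered))
        if not subsets[best] & uncovered:
            break  # the remaining documents cannot be covered
        cover.append(best)
        uncovered -= subsets[best]
    return cover
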
def cross_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies the cross top words (1-grams) analysis

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords
    """
    print("Starting cross top 1-grams analysis")

    max_top = CONFIG['NTOPWORDS']
    iteration = 0
    CTW = [1]  # common top words (non-empty sentinel to enter the loop)
    topwords_dict = dict()
    stopwords_dict = dict()

    while CTW:  # Loop until there are no more common top words
        CTW.clear()

        # Insert PubTator annotations in the abstracts
        dida_docs = pbmdh.extract_features(deepcopy(dida_data),
                                           initial_stopwords)
        notdida_docs = pbmdh.extract_features(deepcopy(notdida_data),
                                              initial_stopwords)

        # Search the top words of each publications set
        top_dida = top_words(dida_docs)
        top_notdida = top_words(notdida_docs)

        # Search the common top words
        find_common_words(top_dida, top_notdida, CTW)

        # Save the top words
        topwords_dict['iteration'] = iteration
        topwords_dict['CTW'] = CTW
        topwords_dict['dida'] = top_dida
        topwords_dict['notdida'] = top_notdida
        exh.write_json(topwords_dict,
                       TOPGRAMS_FILENAME.format(1, max_top, iteration))

        if CTW:  # If there are common top words
            # Add them to the stopwords
            initial_stopwords.extend(CTW)
            # Save the new stopwords list
            stopwords_dict['stopwords'] = initial_stopwords
            iteration += 1
            exh.write_json(stopwords_dict,
                           STOPGRAMS_FILENAME.format(1, max_top, iteration))

    display.display_ok("Cross top 1-grams analysis done")

def loop(M):
    """Runs the main loop of the agglomerative IB method, merging the M
    initial clusters pairwise until a single cluster remains

    Parameters
    ----------
    M : int
        The initial number of clusters
    """
    backup_clusters(M)
    n_categories = len(Pcw)

    print("Starting IB method loop")
    for m in range(M - 1, 0, -1):
        s = "Running iteration {0} on {1}".format(M - m, M - 1)
        print(s, end="\r")

        # Find the pair of clusters with the minimum merging cost
        cluster_i, cluster_j = np.argwhere(agg_info == agg_info.min())[0]

        # Merge clusters i and j
        p_w = Pcluster[cluster_i] + Pcluster[cluster_j]
        pi_i = Pcluster[cluster_i] / p_w
        pi_j = Pcluster[cluster_j] / p_w
        pc_w = []
        for c in range(n_categories):
            temp = (pi_i * Pc_cluster[c, cluster_i]
                    + pi_j * Pc_cluster[c, cluster_j])
            pc_w.append(temp)
        clusters[cluster_i].extend(clusters[cluster_j])
        Pcluster[cluster_i] = p_w
        for c in range(n_categories):
            Pc_cluster[c, cluster_i] = pc_w[c]

        # Remove cluster j
        clusters[cluster_j].clear()
        for j in range(cluster_j + 1, M):
            agg_info[cluster_j, j] = np.inf
        for i in range(cluster_j):
            agg_info[i, cluster_j] = np.inf

        # Update the merging costs involving cluster i
        for j in range(cluster_i + 1, M):
            if agg_info[cluster_i, j] != np.inf:
                js_d = js_divergence(cluster_i, j, n_categories)
                agg_info[cluster_i, j] = (Pcluster[cluster_i]
                                          + Pcluster[j]) * js_d
        for i in range(cluster_i):
            if agg_info[i, cluster_i] != np.inf:
                js_d = js_divergence(i, cluster_i, n_categories)
                agg_info[i, cluster_i] = (Pcluster[i]
                                          + Pcluster[cluster_i]) * js_d

        backup_clusters(m)
        print(s)

    display.display_ok("IB method loop done")

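# A hedged sketch of the js_divergence() used in loop(); the project's
# real implementation is defined elsewhere and works on the module-level
# Pc_cluster and Pcluster arrays. This standalone version assumes the
# standard prior-weighted Jensen-Shannon divergence used as the merge
# cost in the agglomerative IB method, with p_i = P(c | cluster_i) and
# p_j = P(c | cluster_j) as numpy arrays, and pi_i, pi_j the normalized
# cluster priors (pi_i + pi_j == 1).
import numpy as np  # already imported at module level in this project

def js_divergence_sketch(p_i, p_j, pi_i, pi_j):
    """Prior-weighted Jensen-Shannon divergence of two distributions."""
    p_bar = pi_i * p_i + pi_j * p_j  # mixture distribution

    def kl(p, q):
        # Kullback-Leibler divergence, ignoring zero-probability terms
        mask = p > 0
        return np.sum(p[mask] * np.log2(p[mask] / q[mask]))

    return pi_i * kl(p_i, p_bar) + pi_j * kl(p_j, p_bar)
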
def strict_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies the strict top words (1-grams) analysis

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords
    """
    print("Starting strict top 1-grams analysis")

    strict_top = dict()
    max_top = CONFIG['NTOPWORDS']

    # Insert PubTator annotations in the abstracts
    dida_docs = pbmdh.extract_features(deepcopy(dida_data), initial_stopwords)
    notdida_docs = pbmdh.extract_features(deepcopy(notdida_data),
                                          initial_stopwords)

    # Order the words by number of occurrences
    top_dida = top_words(dida_docs, split=False)
    top_notdida = top_words(notdida_docs, split=False)

    top_dida_l = []
    top_notdida_l = []

    # Find the words that are in DIDA but not in Not-DIDA
    find_unique(top_dida, top_notdida, top_dida_l)
    # Find the words that are in Not-DIDA but not in DIDA
    find_unique(top_notdida, top_dida, top_notdida_l)

    # Select the best top words of DIDA
    if len(top_dida_l) > max_top:
        top_dida_l = top_dida_l[len(top_dida_l) - max_top:]
    # Select the best top words of Not-DIDA
    if len(top_notdida_l) > max_top:
        top_notdida_l = top_notdida_l[len(top_notdida_l) - max_top:]

    # Save the results of the strict top words analysis
    strict_top['didatop'] = top_dida_l
    strict_top['notdidatop'] = top_notdida_l
    exh.write_json(strict_top, STRICT_TOPGRAMS_FILENAME.format(max_top, 1))

    display.display_ok("Strict top 1-grams analysis done")

def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    subsets_dida = []
    subsets_notdida = []
    covers = []

    for i in range(1, n + 1):
        print("Starting analysis for {0}-grams".format(i))
        # Process the DIDA class
        subset, set_cover = process_ngrams(i, dida_data, "dida")
        subsets_dida.extend(subset)
        covers.extend(set_cover)
        # Process the Not-DIDA class
        subset, set_cover = process_ngrams(i, notdida_data, "notdida")
        subsets_notdida.extend(subset)
        covers.extend(set_cover)
        display.display_ok("Analysis for {0}-grams done".format(i))

    print("Searching set cover with all grams for DIDA")
    set_cover = get_set_cover(subsets_dida)
    scores = check_score(set_cover, subsets_dida, subsets_dida, dida_data)
    exh.write_text(scores, SCORE_FILENAME.format("dida", "all"))
    display.display_ok("Done")

    print("Searching set cover with all grams for NotDIDA")
    set_cover = get_set_cover(subsets_notdida)
    scores = check_score(set_cover, subsets_notdida, subsets_notdida,
                         notdida_data)
    exh.write_text(scores, SCORE_FILENAME.format("notdida", "all"))
    display.display_ok("Done")

    save_topwords(covers)
    display.display_info("All results were saved in {0} directory".format(
        DIRECTORY))

def filter(pmids, known_pmids):
    """Filters out of the new PMIDs the ones that are already in DIDA

    Parameters
    ----------
    pmids : list
        The list of new PMIDs
    known_pmids : list
        The list of PMIDs already in DIDA

    Returns
    -------
    list
        the list of PMIDs that are not in DIDA
    """
    print("Filtering PMIDs.")
    # Keep only the PMIDs that are not already known
    notdida = [pmid for pmid in pmids if pmid not in known_pmids]
    display.display_ok("Filtering PMIDs done.")
    return notdida

def get_pmids_by_dates():
    """Gets the PMIDs of the publications between the dates specified in the
    configuration file

    Returns
    -------
    list
        the list of the PMIDs of the found publications
    """
    start_year = CONFIG['START_YEAR']
    end_year = CONFIG['SPLIT_YEAR']
    print("Retrieving new PMIDs between {0} and {1}".format(
        start_year, end_year))

    ids = []
    query = "digenic+AND+{0}[pdat]"
    # range() stops before end_year, so the split year itself is not queried
    for year in range(start_year, end_year):
        ids.extend(pbmdh.get_pmids(query.format(year)))

    # Remove duplicate PMIDs (np.unique also sorts them)
    unique_ids = list(np.unique(np.array(ids)))

    display.display_ok("{0} new PMIDs found".format(len(unique_ids)))
    return unique_ids

def merge_ngrams(grams1, grams2):
    """Merges two lists of n-grams, keeping the score of each n-gram in each
    class

    Parameters
    ----------
    grams1 : list
        The first list of n-grams (scored under the DIDA key)
    grams2 : list
        The second list of n-grams (scored under the Not-DIDA key)

    Returns
    -------
    dict
        a dict containing the score of each n-gram in each class
    """
    print("Merging n-grams")

    merged = dict()
    dida = CONFIG['DIDA_DOCS']
    notdida = CONFIG['NOTDIDA_DOCS']

    for gram in grams1:
        if gram[0] not in merged:
            # Create the gram entry
            merged[gram[0]] = dict()
            merged[gram[0]][dida] = gram[1]
            # Prepare the value for Not-DIDA
            merged[gram[0]][notdida] = 0

    for gram in grams2:
        if gram[0] not in merged:
            # Create the gram entry
            merged[gram[0]] = dict()
            # The value is 0 for DIDA because the gram was not in grams1
            merged[gram[0]][dida] = 0
        # Set the Not-DIDA score whether or not the gram was in grams1
        merged[gram[0]][notdida] = gram[1]

    display.display_ok("Merging done")
    return merged

def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    for i in range(1, n + 1):
        print("Starting analysis for {0}-grams".format(i))

        print("Counting occurrences for DIDA")
        dida_occurrences = ngh.count_occurrences(i, dida_data)
        dida_normalized = ngh.normalize_occurrences(dida_occurrences,
                                                    len(dida_data))
        display.display_ok("Counting occurrences for DIDA done")

        print("Counting occurrences for NotDIDA")
        notdida_occurrences = ngh.count_occurrences(i, notdida_data)
        notdida_normalized = ngh.normalize_occurrences(notdida_occurrences,
                                                       len(notdida_data))
        display.display_ok("Counting occurrences for NotDIDA done")

        # Merge the n-grams of both classes into the same dict
        merged = merge_ngrams(dida_normalized, notdida_normalized)
        # Order the n-grams by score difference
        merged = ordered(merged, score)

        # Save the results
        save_to_file(merged, i)

        display.display_ok("Analysis for {0}-grams done".format(i))

def get_dida_pmids(dida_pmids):
    """Gets the PMIDs of the publications in DIDA from a text file

    Parameters
    ----------
    dida_pmids : str
        The name of the file containing the PMIDs in DIDA

    Returns
    -------
    list
        the list of PMIDs of the publications in DIDA
    """
    print("Retrieving PMIDs from {0}".format(dida_pmids))

    # Read one PMID per line
    with open(dida_pmids) as f:
        pmids = [line.rstrip('\n') for line in f]

    display.display_ok("Retrieving PMIDs done. {0} PMIDs found".format(
        len(pmids)))
    return pmids

def strict_ngrams(n, dida_grams, notdida_grams):
    """Applies the strict top n-grams analysis

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications
    notdida_grams : list
        The n-grams of Not-DIDA publications
    """
    print("Starting strict top {0}-grams analysis".format(n))

    didatop = []
    notdidatop = []
    strict_top = dict()
    max_top = CONFIG['NTOPWORDS']

    # Find the n-grams that are in DIDA but not in Not-DIDA
    find_unique(dida_grams, notdida_grams, didatop)
    # Find the n-grams that are in Not-DIDA but not in DIDA
    find_unique(notdida_grams, dida_grams, notdidatop)

    # Select the best top grams
    if len(didatop) > max_top:
        didatop = didatop[len(didatop) - max_top:]
    if len(notdidatop) > max_top:
        notdidatop = notdidatop[len(notdidatop) - max_top:]

    # Save the results of the strict top grams analysis
    strict_top['didatop'] = didatop
    strict_top['notdidatop'] = notdidatop
    exh.write_json(strict_top, STRICT_TOPGRAMS_FILENAME.format(max_top, n))

    display.display_ok("Strict top {0}-grams analysis done".format(n))

def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    for i in range(1, n + 1):
        extract_ngrams(i, deepcopy(dida_data), deepcopy(notdida_data))

def extract_ngrams(n, dida_data, notdida_data):
    """Extracts the n-grams from the publications

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    """
    print("Extracting {0}-grams".format(n))

    initial_stopwords = pbmdh.STOPWORDS
    if n == 1:
        find_top_words(deepcopy(dida_data), deepcopy(notdida_data),
                       deepcopy(initial_stopwords))
    else:
        find_top_ngrams(n, deepcopy(dida_data), deepcopy(notdida_data),
                        deepcopy(initial_stopwords))

    display.display_ok("Extracting {0}-grams done".format(n))

def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    # The order matters: Not-DIDA documents first, then DIDA
    # (it defines the category indices used below)
    docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    display.display_ok("Loading publications done")

    print("Starting extraction of words information")
    extract_words_information(docs)
    display.display_ok("Extraction of words information done")

    print("Computing joint probability distribution")
    joint_probability_distribution()
    display.display_ok("Computing joint probability distribution done")

    print("Starting IB method")
    all_clusters = ib.cluster(deepcopy(Pcw), deepcopy(Pw))
    display.display_ok("IB method finished")

    save_clusters(all_clusters)

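# A hedged sketch of the joint-distribution step above. The project's
# joint_probability_distribution() fills the module-level Pcw and Pw;
# this standalone version assumes the standard construction for the IB
# method: P(c, w) is the count of word w in class c over the total count,
# P(w) its marginal over classes, and P(c | w) follows by Bayes' rule.
# `counts` is a hypothetical (n_classes, n_words) numpy array in which
# every word is assumed to occur at least once.
import numpy as np  # already imported at module level in this project

def joint_distribution_sketch(counts):
    """Returns P(c, w), P(w) and P(c | w) from a class-word count matrix."""
    Pcw = counts / counts.sum()   # joint distribution P(c, w)
    Pw = Pcw.sum(axis=0)          # marginal P(w)
    Pc_given_w = Pcw / Pw         # conditional P(c | w), by Bayes' rule
    return Pcw, Pw, Pc_given_w
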
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Extension of the input file
    extension = args.FILE.split('.')[-1]

    if extension in LEGAL_EXTENSIONS:
        exh.create_directory(DIRECTORY)

        # Get the publications
        print("Getting publications")
        documents_l = read_file(args.FILE, extension)
        display.display_ok("Getting publications done")

        # Save the publications
        filename = BACK_FILENAME.format(args.OUTPUT)
        exh.write_json(documents_l, filename)
        display.display_info("Publications saved in {0}".format(filename))

        # Insert PubTator annotations in the abstracts
        print("Inserting PubTator annotations in abstracts")
        docs = pbmdh.extract_features(documents_l)
        display.display_ok("Inserting PubTator annotations in abstracts done")

        # Extract the n-grams
        print("Extracting n-grams")
        ngh.extract_ngrams(docs, CONFIG['NGRAMS'])
        display.display_ok("Extracting n-grams done")

        # Save the publications and their n-grams
        filename = NGRAMS_FILENAME.format(args.OUTPUT)
        exh.write_json(docs, filename)
        display.display_info("Publications and n-grams saved in {0}".format(
            filename))
    else:
        # The input file does not have a valid extension
        display.display_fail(
            "Extension of input file not supported. Required: txt or json. "
            "Received: {0}".format(extension))
        sys.exit(1)  # non-zero status to signal the error

def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    csv_files = csv_filenames(n)

    # Real labels of each publication (1 = DIDA, 0 = Not-DIDA)
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))
    data = deepcopy(dida_data)
    data.extend(deepcopy(notdida_data))

    scores = []
    classifiers_names = []

    print("Strict Classifier training")
    results = train(StrictClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'strict_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "strict", "threshold",
                    LOG_FILENAME.format("strict"))
    classifiers_names.append("Strict Classifier")
    display.display_ok("Strict Classifier training done")

    print("Split Weighted Classifier training")
    results = train(SplitWeightedClassifier, deepcopy(data), csv_files,
                    y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'splitweighted_', "threshold", "Threshold",
                              DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "splitweighted", "threshold",
                    LOG_FILENAME.format("splitweighted"))
    classifiers_names.append("Split Weighted Classifier")
    display.display_ok("Split Weighted Classifier training done")

    print("Weighted Classifier training")
    results = train(WeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'weighted_', "threshold", "Threshold",
                              DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "weighted", "threshold",
                    LOG_FILENAME.format("weighted"))
    classifiers_names.append("Weighted Classifier")
    display.display_ok("Weighted Classifier training done")

    plt.plot_lines(results['threshold'], scores, classifiers_names,
                   FSCORE_FILENAME, "Threshold", "F1-score")
    display.display_info("Results saved in {0}".format(DIRECTORY))

def classification(docs, Ndw, W, directory, true_predictions):
    """Evaluates the strict and doublon cluster-based classifiers for an
    increasing number of clusters

    Returns
    -------
    dict
        the results of the strict converter classifier
    dict
        the results of the doublon converter classifier
    """
    strict_result = {
        "n_clusters": [], "tn": [], "fp": [], "fn": [], "tp": [], "score": []
    }
    doublon_result = {
        "n_clusters": [], "tn": [], "fp": [], "fn": [], "tp": [], "score": []
    }

    print("Documents replacement")
    converted_docs = converter.init(deepcopy(docs), deepcopy(W))
    display.display_ok("Documents replacement done")

    clusters_directory = directory + "/clusters"
    max_clusters = len(W)

    print("Evaluating classifier")
    # Numbers of clusters to evaluate: 1 to 10, then every 100 up to 8400,
    # plus 8417
    n_clusters_list = list(range(1, 11))
    n_clusters_list.extend(range(100, 8500, 100))
    n_clusters_list.append(8417)

    for n_clusters in n_clusters_list:
        print("Processing for {0} clusters (Total : {1})".format(
            n_clusters, max_clusters))

        # Load the clusters
        clusters = exh.load_json(clusters_directory
                                 + "/{0}.json".format(n_clusters))

        # Prepare the classifier
        classifier = NaiveBayesCluster(deepcopy(clusters), deepcopy(Ndw),
                                       deepcopy(W))
        print("Classifier ready")

        print("Converting documents")
        strict_converted_docs = converter.convert_all(
            deepcopy(converted_docs), deepcopy(clusters))
        doublon_converted_docs = converter.convert_all(
            deepcopy(converted_docs), deepcopy(clusters), method='d')
        print("Converting documents done")

        print("Evaluating strict predictions")
        strict_predictions = classifier.evaluate(strict_converted_docs)
        print("Evaluating doublon predictions")
        doublon_predictions = classifier.evaluate(doublon_converted_docs)
        print("Predictions done")

        print("Computing scores")
        strict_score = classifier.score(true_predictions, strict_predictions)
        doublon_score = classifier.score(true_predictions,
                                         doublon_predictions)
        print("Scores computed : ({0}, {1})".format(strict_score,
                                                    doublon_score))

        add_result(n_clusters, strict_score, strict_result)
        add_result(n_clusters, doublon_score, doublon_result)

    display.display_ok("Evaluating classifier done")

    return strict_result, doublon_result

def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : argparse.Namespace
        The parsed arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    # The order matters: DIDA documents first, then Not-DIDA,
    # matching y_true below
    docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    display.display_ok("Loading publications done")

    data_directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    Ndw = exh.load_json(data_directory + "/ndw.json")
    W = exh.load_json(data_directory + "/W.json")

    # Real labels of each publication (1 = DIDA, 0 = Not-DIDA)
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))

    strict_result, doublon_result = classification(docs, Ndw, W,
                                                   data_directory, y_true)

    plt.plot_confusion_matrix(strict_result, len(dida_data),
                              len(notdida_data), "strict_", "n_clusters",
                              "Number of clusters", DIRECTORY, step=1000)
    exh.save_to_log(strict_result, "strict", "n_clusters",
                    LOG_FILENAME.format("strict"))

    plt.plot_confusion_matrix(doublon_result, len(dida_data),
                              len(notdida_data), "doublon_", "n_clusters",
                              "Number of clusters", DIRECTORY, step=1000)
    exh.save_to_log(doublon_result, "doublon", "n_clusters",
                    LOG_FILENAME.format("doublon"))

    scores = [strict_result['score'], doublon_result['score']]
    classifiers_names = ["Strict converter", "Doublon converter"]
    plt.plot_lines(strict_result['n_clusters'], scores, classifiers_names,
                   FSCORE_FILENAME, "Number of clusters", "F1-score",
                   step=1000)

def cross_ngrams(n, dida_grams, notdida_grams):
    """Applies the cross n-grams analysis

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications
    notdida_grams : list
        The n-grams of Not-DIDA publications
    """
    print("Starting cross top {0}-grams analysis".format(n))

    iteration = 0
    CTG = [1]  # common top grams (non-empty sentinel to enter the loop)
    blacklist = []
    topgrams_dict = dict()
    blacklist_dict = dict()
    max_top = CONFIG['NTOPWORDS']

    while CTG:  # Loop until there are no more common top grams
        CTG.clear()

        grams1 = deepcopy(dida_grams)
        grams2 = deepcopy(notdida_grams)

        # Select the best top grams
        if len(grams1) > max_top:
            grams1 = grams1[len(grams1) - max_top:]
        if len(grams2) > max_top:
            grams2 = grams2[len(grams2) - max_top:]

        # Search the common grams
        for gram1 in grams1:
            for gram2 in grams2:
                if gram1[0] == gram2[0] and gram1[0] not in blacklist:
                    CTG.append(gram1[0])
                    break

        # Save the top grams
        topgrams_dict['iteration'] = iteration
        topgrams_dict['CTG'] = CTG
        topgrams_dict['dida'] = grams1
        topgrams_dict['notdida'] = grams2
        exh.write_json(topgrams_dict,
                       TOPGRAMS_FILENAME.format(n, max_top, iteration))

        if CTG:  # If there are common top grams
            # Add them to the blacklist
            blacklist.extend(CTG)
            # Remove them from each set of grams
            for word in CTG:
                for gram in dida_grams:
                    if gram[0] == word:
                        dida_grams.remove(gram)
                        break
                for gram in notdida_grams:
                    if gram[0] == word:
                        notdida_grams.remove(gram)
                        break
            # Save the blacklist
            blacklist_dict['stopgrams'] = blacklist
            iteration += 1
            exh.write_json(blacklist_dict,
                           STOPGRAMS_FILENAME.format(n, max_top, iteration))

    display.display_ok("Cross top {0}-grams analysis done".format(n))