def process_ngrams(n, data, data_class):
    """Searches the top n-grams of a publications set and computes the set cover

    Runs the full n-gram pipeline: count occurrences, normalize them,
    filter by the configured threshold, compute set covers, and persist
    the intermediate and final results as JSON/text files.

    Parameters
    ----------
    n : int
        The length of the n-grams
    data : list
        The publications list to handle
    data_class : str
        The class to handle

    Returns
    -------
    list
        n-grams above the threshold fixed in config file
    list
        set cover with n-grams above the threshold
    """
    print("Process for {0}".format(data_class))

    # Number of documents in data
    n_data = len(data)

    # Count occurrences for each n-grams
    print("Counting occurrences")
    occurrences = ngh.count_occurrences(n, data)

    # Normalize the occurrences
    print("Normalizing occurrences")
    normalized = ngh.normalize_occurrences(occurrences, n_data)

    # Find n-grams above a given threshold (see Config file)
    print("Filtering occurrences")
    # NOTE(review): this shadows the builtin `filter`, which would raise a
    # TypeError with a single argument — presumably a module-level `filter`
    # function is defined elsewhere in this file; confirm and consider renaming.
    subsets = filter(normalized)

    # Find top n-grams covering all documents
    print("Searching full set cover")
    find_set_cover(normalized, data)

    # Save all the normalized n-grams
    save_to_file(normalized, n, data_class)

    # Plot the n-grams
    plot(normalized, data_class, n)

    # Find the Set Cover based on best n-grams
    print("Searching partial set cover")
    set_cover = get_set_cover(subsets)
    exh.write_json(set_cover, SET_COVER_FILENAME.format(data_class, n))

    # Score the partial set cover and persist the score as text
    print("Computing score of partial set cover")
    scores = check_score(set_cover, subsets, normalized, data)
    exh.write_text(scores, SCORE_FILENAME.format(data_class, n))

    display.display_ok("Process for {0} done".format(data_class))

    return subsets, set_cover
def cross_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies cross top words (1-grams) analysis

    Iterates to a fixed point: whenever the top words of DIDA and
    Not-DIDA overlap, the common words are appended to the stopwords
    list and the extraction is re-run, until the two top-word lists
    are disjoint. Results of every iteration are saved as JSON.

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords; NOTE: mutated in place (common top words
        are appended on each iteration)
    """
    print("Starting cross top 1-grams analysis")
    max_top = CONFIG['NTOPWORDS']
    iteration = 0
    CTW = [1]  # common top words; seeded non-empty so the loop runs once
    topwords_dict = dict()
    stopwords_dict = dict()

    while CTW:  # Loop until there is no common top words
        CTW.clear()

        # Insert PubTator annotations in the abstracts
        # (deepcopy: extract_features must not mutate the caller's data)
        dida_docs = pbmdh.extract_features(deepcopy(dida_data), initial_stopwords)
        notdida_docs = pbmdh.extract_features(deepcopy(notdida_data), initial_stopwords)

        # Search top words of each publication
        top_dida = top_words(dida_docs)
        top_notdida = top_words(notdida_docs)

        # Search common top words (filled into CTW in place)
        find_common_words(top_dida, top_notdida, CTW)

        # Save top words of this iteration
        topwords_dict['iteration'] = iteration
        topwords_dict['CTW'] = CTW
        topwords_dict['dida'] = top_dida
        topwords_dict['notdida'] = top_notdida
        exh.write_json(topwords_dict, TOPGRAMS_FILENAME.format(1, max_top, iteration))

        if CTW:  # If there is common top words
            # Add them to stopwords
            initial_stopwords.extend(CTW)

            # Save new stopwords list (iteration bumped first so the
            # stopwords file is numbered with the next iteration)
            stopwords_dict['stopwords'] = initial_stopwords
            iteration += 1
            exh.write_json(stopwords_dict, STOPGRAMS_FILENAME.format(1, max_top, iteration))

    display.display_ok("Cross top 1-grams analysis done")
def save_topwords(topwords):
    """Saves all the n-grams selected in set covers in a JSON file

    Duplicates are dropped while preserving first-seen order.

    Parameters
    ----------
    topwords : list
        All the n-grams selected in set covers; each element's first item
        (``topword[0]``) is the n-gram itself
    """
    seen = set()
    top = []
    for topword in topwords:
        gram = tuple(topword[0])
        # set membership is O(1); the original scanned the `top` list on
        # every element, making the dedup O(n^2)
        if gram not in seen:
            seen.add(gram)
            top.append(gram)
    exh.write_json(top, TOPWORDS_FILENAME)
def download_doc(pmids_list):
    """Downloads publications based on a PMIDs list and saves them into a JSON file

    Parameters
    ----------
    pmids_list : list
        The list containing the PMIDs of the publications to download
    """
    print("Downloading PMIDs for Not-DIDA")

    # Fetch every publication matching the given PMIDs
    publications = pbmdh.download_publications(pmids_list)

    # Persist them under the configured Not-DIDA documents name
    output_path = CONFIG['NOTDIDA_DOCS'] + ".json"
    exh.write_json(publications, output_path)

    display.display_info("Not-DIDA publications saved in {0}".format(output_path))
def strict_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies strict top words (1-grams) analysis

    Finds the words that occur in only one of the two corpora and keeps
    the best ``NTOPWORDS`` of each side, then saves both lists as JSON.

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords
    """
    print("Starting strict top 1-grams analysis")
    max_top = CONFIG['NTOPWORDS']

    # Insert PubTator annotations in the abstracts (work on copies so the
    # caller's data is left untouched)
    dida_docs = pbmdh.extract_features(deepcopy(dida_data), initial_stopwords)
    notdida_docs = pbmdh.extract_features(deepcopy(notdida_data), initial_stopwords)

    # Words ordered by number of occurrences
    top_dida = top_words(dida_docs, split=False)
    top_notdida = top_words(notdida_docs, split=False)

    dida_unique = []
    notdida_unique = []
    # Words that appear in one corpus but not in the other
    find_unique(top_dida, top_notdida, dida_unique)
    find_unique(top_notdida, top_dida, notdida_unique)

    # Keep only the best (last) max_top entries of each list
    if len(dida_unique) > max_top:
        dida_unique = dida_unique[-max_top:]
    if len(notdida_unique) > max_top:
        notdida_unique = notdida_unique[-max_top:]

    # Save the results of the strict top words analysis
    strict_top = {'didatop': dida_unique, 'notdidatop': notdida_unique}
    exh.write_json(strict_top, STRICT_TOPGRAMS_FILENAME.format(max_top, 1))

    display.display_ok("Strict top 1-grams analysis done")
def run(args): """Executes the main process of the script Parameters ---------- args : ArgumentParser The arguments of the command typed by the user """ global CONFIG CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG)) # Extension of the input file extension = args.FILE.split('.')[-1] if extension in LEGAL_EXTENSIONS: exh.create_directory(DIRECTORY) # Get publications print("Getting publications") documents_l = read_file(args.FILE, extension) display.display_ok("Getting publications done") # Save publications filename = BACK_FILENAME.format(args.OUTPUT) exh.write_json(documents_l, filename) display.display_info("Publications saved in {0}".format(filename)) # Insert PubTator annotations in the abstracts print("Inserting PubTator annotations in abstracts") docs = pbmdh.extract_features(documents_l) display.display_ok("Inserting PubTator annotations in abstracts done") # Extract n-grams print("Extracting n-grams") ngh.extract_ngrams(docs, CONFIG['NGRAMS']) display.display_ok("Extracting n-grams done") # Save publications and their n-grams filename = NGRAMS_FILENAME.format(args.OUTPUT) exh.write_json(docs, filename) display.display_info("Publications and n-grams saved in {0}".format(filename)) else: # The input file has not a valid extension display.display_fail("Extension of input file not supported. Required : txt or json. Received : {0}".format(extension)) sys.exit(0)
def save_to_log(results, model, key):
    """Saves the evolution of the confusion matrix and the f1-score in JSON file

    Parameters
    ----------
    results : dict
        The results of the classifier for different value of the threshold
    model : str
        The prefix string of the classifier
    key : object
        Unused by this function; kept for signature compatibility with callers
    """
    log = dict()
    for i, n_clusters in enumerate(results['n_clusters']):
        # int()/float() make the values plain Python numbers so they are
        # JSON-serializable regardless of the original numeric type
        log[n_clusters] = {
            'tn': int(results['tn'][i]),
            'tp': int(results['tp'][i]),
            'fn': int(results['fn'][i]),
            'fp': int(results['fp'][i]),
            'score': float(results['score'][i]),
        }
    exh.write_json(log, LOG_FILENAME.format(model))
def strict_ngrams(n, dida_grams, notdida_grams):
    """Applies strict top n-grams analysis

    Finds the n-grams that occur in only one of the two sets and keeps
    the best ``NTOPWORDS`` of each side, then saves both lists as JSON.

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications
    notdida_grams : list
        The n-grams of Not-DIDA publications
    """
    print("Starting strict top {0}-grams analysis".format(n))
    max_top = CONFIG['NTOPWORDS']

    dida_only = []
    notdida_only = []
    # n-grams present in one set but absent from the other
    find_unique(dida_grams, notdida_grams, dida_only)
    find_unique(notdida_grams, dida_grams, notdida_only)

    # Keep only the best (last) max_top entries of each list
    if len(dida_only) > max_top:
        dida_only = dida_only[-max_top:]
    if len(notdida_only) > max_top:
        notdida_only = notdida_only[-max_top:]

    # Save the results of the strict top grams analysis
    strict_top = {'didatop': dida_only, 'notdidatop': notdida_only}
    exh.write_json(strict_top, STRICT_TOPGRAMS_FILENAME.format(max_top, n))

    display.display_ok("Strict top {0}-grams analysis done".format(n))
def save_clusters(clusters):
    """Saves the clusters and their shared data (Ndw, W) to JSON files

    Writes the module-level ``Ndw`` and ``W`` objects to the configured
    clusters directory, then writes each cluster to its own file in a
    ``clusters`` sub-directory.

    Parameters
    ----------
    clusters : dict
        The clusters to save, keyed by cluster identifier
    """
    directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    exh.create_directory(directory)

    # Data shared by all clusters
    exh.write_json(Ndw, directory + "/ndw.json")
    exh.write_json(W, directory + "/W.json")

    # One JSON file per cluster
    cluster_directory = directory + "/clusters"
    exh.create_directory(cluster_directory)
    for i, c in clusters.items():
        exh.write_json(c, cluster_directory + "/{0}.json".format(i))

    display.display_info("Data clusters saved into " + directory)
def cross_ngrams(n, dida_grams, notdida_grams):
    """Applies cross n-grams analysis

    Iterates to a fixed point: the best ``NTOPWORDS`` grams of each set
    are compared, grams common to both are blacklisted and removed from
    the source lists, and the comparison is repeated until the two top
    lists share no gram. Every iteration is saved as JSON.

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications; NOTE: mutated in place
        (common grams are removed on each iteration)
    notdida_grams : list
        The n-grams of Not-DIDA publications; NOTE: mutated in place
    """
    print("Starting cross top {0}-grams analysis".format(n))
    iteration = 0
    CTG = [1]  # common top grams; seeded non-empty so the loop runs once
    blacklist = []
    topgrams_dict = dict()
    blacklist_dict = dict()
    max_top = CONFIG['NTOPWORDS']

    while CTG:  # Loop until there is no common top grams
        CTG.clear()

        # Work on copies so truncation does not affect the source lists
        grams1 = deepcopy(dida_grams)
        grams2 = deepcopy(notdida_grams)

        # Select the best top grams (the last max_top entries of each list)
        if len(grams1) > max_top:
            grams1 = grams1[len(grams1) - max_top:]
        if len(grams2) > max_top:
            grams2 = grams2[len(grams2) - max_top:]

        # Search the common grams (gram[0] is the gram itself)
        for gram1 in grams1:
            for gram2 in grams2:
                if gram1[0] == gram2[0] and not gram1[0] in blacklist:
                    CTG.append(gram1[0])
                    break

        # Save the top grams of this iteration
        topgrams_dict['iteration'] = iteration
        topgrams_dict['CTG'] = CTG
        topgrams_dict['dida'] = grams1
        topgrams_dict['notdida'] = grams2
        exh.write_json(topgrams_dict, TOPGRAMS_FILENAME.format(n, max_top, iteration))

        if CTG:  # If there is common top grams
            # Add them to the blacklist
            blacklist.extend(CTG)

            # Remove them from each set of grams
            # (remove + break: at most one entry per word, so this does not
            # skip elements while iterating)
            for word in CTG:
                for gram in dida_grams:
                    if gram[0] == word:
                        dida_grams.remove(gram)
                        break
                for gram in notdida_grams:
                    if gram[0] == word:
                        notdida_grams.remove(gram)
                        break

            # Save the blacklist (iteration bumped first so the blacklist
            # file is numbered with the next iteration)
            blacklist_dict['stopgrams'] = blacklist
            iteration += 1
            exh.write_json(blacklist_dict, STOPGRAMS_FILENAME.format(n, max_top, iteration))

    display.display_ok("Cross top {0}-grams analysis done".format(n))