# Example 1 (extraction artifact)
def process_ngrams(n, data, data_class):
    """Searches the top n-grams of a publications set and computes the set cover

    Parameters
    ----------
    n : int
        The length of the n-grams
    data : list
        The publications list to handle
    data_class : str
        The class to handle

    Returns
    -------
    list
        n-grams above the threshold fixed in config file
    list
        set cover with n-grams above the threshold
    """
    print("Process for {0}".format(data_class))

    # Total number of publications to process
    document_count = len(data)

    # Count how often each n-gram appears in the corpus
    print("Counting occurrences")
    raw_counts = ngh.count_occurrences(n, data)

    # Turn the raw counts into normalized frequencies
    print("Normalizing occurrences")
    frequencies = ngh.normalize_occurrences(raw_counts, document_count)

    # Keep only the n-grams above the configured threshold.
    # NOTE(review): `filter` shadows the builtin here — presumably a
    # module-level helper defined elsewhere in this file; confirm.
    print("Filtering occurrences")
    top_ngrams = filter(frequencies)

    # Compute the set cover over all documents (result handled by the callee)
    print("Searching full set cover")
    find_set_cover(frequencies, data)

    # Persist every normalized n-gram
    save_to_file(frequencies, n, data_class)

    # Produce the n-grams plot
    plot(frequencies, data_class, n)

    # Compute the set cover restricted to the best n-grams and store it
    print("Searching partial set cover")
    cover = get_set_cover(top_ngrams)
    exh.write_json(cover, SET_COVER_FILENAME.format(data_class, n))

    print("Computing score of partial set cover")
    cover_scores = check_score(cover, top_ngrams, frequencies, data)
    exh.write_text(cover_scores, SCORE_FILENAME.format(data_class, n))

    display.display_ok("Process for {0} done".format(data_class))

    return top_ngrams, cover
def cross_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies cross top words (1-grams) analysis

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords
    """
    print("Starting cross top 1-grams analysis")

    max_top = CONFIG['NTOPWORDS']
    iteration = 0
    common = [1]  # seeded non-empty so the loop body runs at least once
    top_record = dict()
    stop_record = dict()

    # Repeat until the two classes share no top words
    while common:
        common.clear()

        # Re-annotate the abstracts with the current stopword list
        docs_dida = pbmdh.extract_features(deepcopy(dida_data),
                                           initial_stopwords)
        docs_notdida = pbmdh.extract_features(deepcopy(notdida_data),
                                              initial_stopwords)

        # Top words of each class
        best_dida = top_words(docs_dida)
        best_notdida = top_words(docs_notdida)

        # Collect the words present in both top lists
        find_common_words(best_dida, best_notdida, common)

        # Persist this iteration's top words
        top_record['iteration'] = iteration
        top_record['CTW'] = common
        top_record['dida'] = best_dida
        top_record['notdida'] = best_notdida
        exh.write_json(top_record,
                       TOPGRAMS_FILENAME.format(1, max_top, iteration))

        if common:
            # Common top words become stopwords for the next iteration
            initial_stopwords.extend(common)

            # Persist the updated stopword list
            stop_record['stopwords'] = initial_stopwords
            iteration += 1
            exh.write_json(stop_record,
                           STOPGRAMS_FILENAME.format(1, max_top, iteration))

    display.display_ok("Cross top 1-grams analysis done")
# Example 3 (extraction artifact)
def save_topwords(topwords):
    """Saves all the n-grams selected in set covers in a JSON file

    Duplicates are removed while preserving first-seen order.

    Parameters
    ----------
    topwords : list
        All the n-grams selected in set covers
    """
    # A set gives O(1) membership tests; the original scanned the output
    # list on every iteration, making deduplication quadratic, and built
    # each tuple twice.
    seen = set()
    top = []
    for topword in topwords:
        gram = tuple(topword[0])
        if gram not in seen:
            seen.add(gram)
            top.append(gram)

    exh.write_json(top, TOPWORDS_FILENAME)
def download_doc(pmids_list):
    """Downloads publications based on a PMIDs list and saves them into a
    JSON file

    Parameters
    ----------
    pmids_list : list
        The list containing the PMIDs of the publications to download
    """
    print("Downloading PMIDs for Not-DIDA")
    # Fetch every publication, then dump the whole batch in one JSON file
    publications = pbmdh.download_publications(pmids_list)
    target = CONFIG['NOTDIDA_DOCS'] + ".json"
    exh.write_json(publications, target)
    display.display_info("Not-DIDA publications saved in {0}".format(target))
def strict_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies strict top words (1-grams) analysis

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords
    """
    print("Starting strict top 1-grams analysis")

    results = dict()
    max_top = CONFIG['NTOPWORDS']

    # Insert PubTator annotations in the abstracts
    docs_dida = pbmdh.extract_features(deepcopy(dida_data), initial_stopwords)
    docs_notdida = pbmdh.extract_features(deepcopy(notdida_data),
                                          initial_stopwords)

    # Words ordered by number of occurrences (best entries at the tail)
    ranked_dida = top_words(docs_dida, split=False)
    ranked_notdida = top_words(docs_notdida, split=False)

    unique_dida = []
    unique_notdida = []

    # Words that appear in one class but not in the other
    find_unique(ranked_dida, ranked_notdida, unique_dida)
    find_unique(ranked_notdida, ranked_dida, unique_notdida)

    # Keep at most max_top entries from the tail of each list
    if len(unique_dida) > max_top:
        unique_dida = unique_dida[len(unique_dida) - max_top:]
    if len(unique_notdida) > max_top:
        unique_notdida = unique_notdida[len(unique_notdida) - max_top:]

    # Persist the results of the strict top words analysis
    results['didatop'] = unique_dida
    results['notdidatop'] = unique_notdida
    exh.write_json(results, STRICT_TOPGRAMS_FILENAME.format(max_top, 1))

    display.display_ok("Strict top 1-grams analysis done")
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Extension of the input file
    file_ext = args.FILE.split('.')[-1]

    if file_ext not in LEGAL_EXTENSIONS:
        # The input file has not a valid extension — abort early.
        # NOTE(review): exits with status 0 on failure; confirm callers
        # rely on this before changing it.
        display.display_fail("Extension of input file not supported. Required : txt or json. Received : {0}".format(file_ext))
        sys.exit(0)

    exh.create_directory(DIRECTORY)

    # Get publications
    print("Getting publications")
    publications = read_file(args.FILE, file_ext)
    display.display_ok("Getting publications done")

    # Save publications
    backup_file = BACK_FILENAME.format(args.OUTPUT)
    exh.write_json(publications, backup_file)
    display.display_info("Publications saved in {0}".format(backup_file))

    # Insert PubTator annotations in the abstracts
    print("Inserting PubTator annotations in abstracts")
    docs = pbmdh.extract_features(publications)
    display.display_ok("Inserting PubTator annotations in abstracts done")

    # Extract n-grams
    print("Extracting n-grams")
    ngh.extract_ngrams(docs, CONFIG['NGRAMS'])
    display.display_ok("Extracting n-grams done")

    # Save publications and their n-grams
    ngrams_file = NGRAMS_FILENAME.format(args.OUTPUT)
    exh.write_json(docs, ngrams_file)
    display.display_info("Publications and n-grams saved in {0}".format(ngrams_file))
def save_to_log(results, model, key):
    """Saves the evolution of the confusion matrix and the f1-score in JSON file

    Parameters
    ----------
    results : dict
        The results of the classifier for different value of the threshold
    model : str
        The prefix string of the classifier
    key : str
        Unused; kept for backward compatibility with existing callers.
    """
    # One entry per cluster count. int()/float() coercion keeps the values
    # JSON-serializable (the inputs are presumably numpy scalars — confirm).
    data = {
        n_clusters: {
            'tn': int(results['tn'][index]),
            'tp': int(results['tp'][index]),
            'fn': int(results['fn'][index]),
            'fp': int(results['fp'][index]),
            'score': float(results['score'][index]),
        }
        for index, n_clusters in enumerate(results['n_clusters'])
    }

    exh.write_json(data, LOG_FILENAME.format(model))
def strict_ngrams(n, dida_grams, notdida_grams):
    """Applies strict top n-grams analysis

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications
    notdida_grams : list
        The n-grams of Not-DIDA publications
    """
    print("Starting strict top {0}-grams analysis".format(n))

    max_top = CONFIG['NTOPWORDS']
    unique_dida = []
    unique_notdida = []

    # n-grams exclusive to each class
    find_unique(dida_grams, notdida_grams, unique_dida)
    find_unique(notdida_grams, dida_grams, unique_notdida)

    # Keep at most max_top entries from the tail (the best-ranked ones)
    if len(unique_dida) > max_top:
        unique_dida = unique_dida[len(unique_dida) - max_top:]
    if len(unique_notdida) > max_top:
        unique_notdida = unique_notdida[len(unique_notdida) - max_top:]

    # Persist the results of the strict top grams analysis
    summary = dict()
    summary['didatop'] = unique_dida
    summary['notdidatop'] = unique_notdida
    exh.write_json(summary, STRICT_TOPGRAMS_FILENAME.format(max_top, n))

    display.display_ok("Strict top {0}-grams analysis done".format(n))
def save_clusters(clusters):
    """Saves the clusters and the module-level word data into JSON files

    Writes ``ndw.json`` and ``W.json`` into the configured clusters
    directory, plus one JSON file per cluster in a ``clusters``
    subdirectory.

    Parameters
    ----------
    clusters : dict
        Mapping from cluster identifier to cluster content
    """
    # NOTE(review): Ndw and W are module-level globals defined elsewhere —
    # confirm they are initialized before this function is called.
    # The original also built a combined `data` dict ({'clusters', 'Ndw',
    # 'W'}) that was never written anywhere (the write was commented out);
    # that dead code has been removed.
    directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    exh.create_directory(directory)

    filename = directory + "/ndw.json"
    exh.write_json(Ndw, filename)

    filename = directory + "/W.json"
    exh.write_json(W, filename)

    # One file per cluster, named after the cluster identifier
    cluster_directory = directory + "/clusters"
    exh.create_directory(cluster_directory)

    for i, c in clusters.items():
        filename = cluster_directory + "/{0}.json".format(i)
        exh.write_json(c, filename)

    display.display_info("Data clusters saved into " + directory)
def cross_ngrams(n, dida_grams, notdida_grams):
    """Applies cross n-grams analysis

    Iteratively removes n-grams that rank in the top of both classes
    (blacklisting them) until the two top lists no longer overlap.
    Note: this mutates `dida_grams` and `notdida_grams` in place.

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications
    notdida_grams : list
        The n-grams of Not-DIDA publications
    """
    print("Starting cross top {0}-grams analysis".format(n))

    iteration = 0
    CTG = [1]  # common top grams; seeded non-empty so the loop runs once
    blacklist = []
    topgrams_dict = dict()
    blacklist_dict = dict()
    max_top = CONFIG['NTOPWORDS']

    while CTG:
        # Loop until there is no common top grams
        CTG.clear()

        # Work on copies so truncation does not affect the input lists
        grams1 = deepcopy(dida_grams)
        grams2 = deepcopy(notdida_grams)

        # Select the best top grams (kept at the tail of each list)
        if len(grams1) > max_top:
            grams1 = grams1[len(grams1) - max_top:]
        if len(grams2) > max_top:
            grams2 = grams2[len(grams2) - max_top:]

        # Search the common grams (each entry is (gram, ...); compare on
        # the gram itself, skipping anything already blacklisted)
        for gram1 in grams1:
            for gram2 in grams2:
                if gram1[0] == gram2[0] and not gram1[0] in blacklist:
                    CTG.append(gram1[0])
                    break

        # Save the top grams for this iteration
        topgrams_dict['iteration'] = iteration
        topgrams_dict['CTG'] = CTG
        topgrams_dict['dida'] = grams1
        topgrams_dict['notdida'] = grams2
        exh.write_json(topgrams_dict,
                       TOPGRAMS_FILENAME.format(n, max_top, iteration))

        if CTG:
            # If there is common top grams
            # Add them to the blacklist
            blacklist.extend(CTG)

            # Remove them from each set of grams.
            # Removing during iteration is safe here because each inner
            # loop breaks immediately after the single removal.
            for word in CTG:
                for gram in dida_grams:
                    if gram[0] == word:
                        dida_grams.remove(gram)
                        break
                for gram in notdida_grams:
                    if gram[0] == word:
                        notdida_grams.remove(gram)
                        break

            # Save the blacklist
            blacklist_dict['stopgrams'] = blacklist
            iteration += 1
            exh.write_json(blacklist_dict,
                           STOPGRAMS_FILENAME.format(n, max_top, iteration))

    display.display_ok("Cross top {0}-grams analysis done".format(n))