Code example #1
def ordered(merged, f_score):
    """Order n-grams of a dict using a particular score function

    Parameters
    ----------
    merged : dict
        The dict containing n-grams and their score for each class
    f_score : function
        The score function to use to order n-grams

    Returns
    -------
    list
        the ordered list of n-grams
    """
    print("Ordering grams")

    dida = CONFIG['DIDA_DOCS']
    notdida = CONFIG['NOTDIDA_DOCS']

    merged = sorted(merged.items(),
                    key=lambda kv: f_score(kv[1][dida], kv[1][notdida]),
                    reverse=True)

    display.display_ok("Ordering done")

    return merged
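A minimal usage sketch (hypothetical values; it assumes CONFIG['DIDA_DOCS'] and CONFIG['NOTDIDA_DOCS'] resolve to the keys used in merged, here "dida" and "notdida"):

merged = {
    "gene": {"dida": 0.8, "notdida": 0.1},
    "protein": {"dida": 0.4, "notdida": 0.3},
}
# Sort by absolute score difference; the largest difference comes first
top = ordered(merged, lambda d, nd: abs(d - nd))
# top == [('gene', {...}), ('protein', {...})]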
Code example #2
def initialization():
    """Initializes the clusters and computes the agglomerative information"""
    print("Starting clusters initialization")
    initialize_clusters()
    display.display_ok("Clusters initialization done")
    print("Processing agglomerative information")
    agglomerative_information()
    display.display_ok("Processing agglomerative information done")
Code example #3
def process_ngrams(n, data, data_class):
    """Searches the top n-grams of a publications set and computes the set cover

    Parameters
    ----------
    n : int
        The length of the n-grams
    data : list
        The publications list to handle
    data_class : str
        The class to handle

    Returns
    -------
    list
        n-grams above the threshold fixed in the config file
    list
        set cover with n-grams above the threshold
    """
    print("Process for {0}".format(data_class))

    # Number of documents in data
    n_data = len(data)

    # Count occurrences for each n-gram
    print("Counting occurrences")
    occurrences = ngh.count_occurrences(n, data)

    # Normalize the occurrences
    print("Normalizing occurrences")
    normalized = ngh.normalize_occurrences(occurrences, n_data)

    # Find n-grams above a given threshold (see Config file)
    print("Filtering occurrences")
    subsets = filter(normalized)

    # Find top n-grams covering all documents
    print("Searching full set cover")
    find_set_cover(normalized, data)

    # Save all the normalized n-grams
    save_to_file(normalized, n, data_class)

    # Plot the n-grams
    plot(normalized, data_class, n)

    # Find the Set Cover based on best n-grams
    print("Searching partial set cover")
    set_cover = get_set_cover(subsets)
    exh.write_json(set_cover, SET_COVER_FILENAME.format(data_class, n))

    print("Computing score of partial set cover")
    scores = check_score(set_cover, subsets, normalized, data)
    exh.write_text(scores, SCORE_FILENAME.format(data_class, n))

    display.display_ok("Process for {0} done".format(data_class))

    return subsets, set_cover
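get_set_cover is defined elsewhere in the project; a standard greedy approximation consistent with this call site is sketched below. Treating subsets as a dict mapping each n-gram to the set of documents it covers is an assumption, not the project's actual data structure.

def greedy_set_cover(subsets):
    # Hypothetical sketch: repeatedly pick the n-gram that covers
    # the largest number of not-yet-covered documents
    universe = set().union(*subsets.values())
    covered = set()
    cover = []
    while covered != universe:
        best = max(subsets, key=lambda g: len(subsets[g] - covered))
        if not subsets[best] - covered:
            break  # the remaining documents cannot be covered
        cover.append(best)
        covered |= subsets[best]
    return cover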
Code example #4
def cross_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies cross top words (1-grams) analysis

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords
    """
    print("Starting cross top 1-grams analysis")

    max_top = CONFIG['NTOPWORDS']
    iteration = 0
    CTW = [1]  # common top words
    topwords_dict = dict()
    stopwords_dict = dict()

    while CTW:
        # Loop until there are no common top words
        CTW.clear()

        # Insert PubTator annotations in the abstracts
        dida_docs = pbmdh.extract_features(deepcopy(dida_data),
                                           initial_stopwords)
        notdida_docs = pbmdh.extract_features(deepcopy(notdida_data),
                                              initial_stopwords)

        # Search top words of each publication
        top_dida = top_words(dida_docs)
        top_notdida = top_words(notdida_docs)

        # Search common top words
        find_common_words(top_dida, top_notdida, CTW)

        # Save top words
        topwords_dict['iteration'] = iteration
        topwords_dict['CTW'] = CTW
        topwords_dict['dida'] = top_dida
        topwords_dict['notdida'] = top_notdida
        exh.write_json(topwords_dict,
                       TOPGRAMS_FILENAME.format(1, max_top, iteration))

        if CTW:
            # If there are common top words
            # Add them to stopwords
            initial_stopwords.extend(CTW)

            # Save new stopwords list
            stopwords_dict['stopwords'] = initial_stopwords
            iteration += 1
            exh.write_json(stopwords_dict,
                           STOPGRAMS_FILENAME.format(1, max_top, iteration))

    display.display_ok("Cross top 1-grams analysis done")
Code example #5
def loop(M):
    """Runs the agglomerative merging loop of the IB method

    Parameters
    ----------
    M : int
        The initial number of clusters
    """
    backup_clusters(M)
    n_categories = len(Pcw)
    print("Starting IB method loop")
    for m in range(M - 1, 0, -1):
        s = "Running iteration {0} on {1}".format(M - m, M - 1)
        print(s, end="\r")
        # print("Iteration {0} / {1}".format(M-m, M-1))
        # Find minimum cost
        cluster_i, cluster_j = np.argwhere(agg_info == agg_info.min())[0]

        # Merge clusters
        p_w = Pcluster[cluster_i] + Pcluster[cluster_j]
        pi_i = Pcluster[cluster_i] / p_w
        pi_j = Pcluster[cluster_j] / p_w
        pc_w = []
        for c in range(n_categories):
            temp = pi_i * Pc_cluster[c, cluster_i] + pi_j * Pc_cluster[
                c, cluster_j]
            pc_w.append(temp)

        clusters[cluster_i].extend(clusters[cluster_j])
        Pcluster[cluster_i] = p_w
        for c in range(n_categories):
            Pc_cluster[c, cluster_i] = pc_w[c]

        # Remove cluster j
        clusters[cluster_j].clear()

        for j in range(cluster_j + 1, M):
            agg_info[cluster_j, j] = np.inf
        for i in range(cluster_j):
            agg_info[i, cluster_j] = np.inf

        # Update cost
        for j in range(cluster_i + 1, M):
            if agg_info[cluster_i, j] != np.inf:
                # update agg_info[cluster_i, j]
                js_d = js_divergence(cluster_i, j, n_categories)
                agg_info[cluster_i,
                         j] = (Pcluster[cluster_i] + Pcluster[j]) * js_d
        for i in range(cluster_i):
            if agg_info[i, cluster_i] != np.inf:
                # update agg_info[i, cluster_i]
                js_d = js_divergence(i, cluster_i, n_categories)
                agg_info[i, cluster_i] = (Pcluster[i] +
                                          Pcluster[cluster_i]) * js_d

        backup_clusters(m)
    print(s)
    display.display_ok("IB method loop done")
Code example #6
def strict_top_words(dida_data, notdida_data, initial_stopwords):
    """Applies strict top words (1-grams) analysis

    Parameters
    ----------
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    initial_stopwords : list
        The initial stopwords
    """
    print("Starting strict top 1-grams analysis")

    strict_top = dict()
    max_top = CONFIG['NTOPWORDS']

    # Insert PubTator annotations in the abstracts
    dida_docs = pbmdh.extract_features(deepcopy(dida_data), initial_stopwords)
    notdida_docs = pbmdh.extract_features(deepcopy(notdida_data),
                                          initial_stopwords)

    # Words ordered by number of occurrences
    top_dida = top_words(dida_docs, split=False)
    top_notdida = top_words(notdida_docs, split=False)

    top_dida_l = []
    top_notdida_l = []

    # Find words that are in DIDA but not in Not-DIDA
    find_unique(top_dida, top_notdida, top_dida_l)

    # Find words that are in Not-DIDA but not in DIDA
    find_unique(top_notdida, top_dida, top_notdida_l)

    # Select best top words of DIDA
    if len(top_dida_l) > max_top:
        top_dida_l = top_dida_l[-max_top:]

    # Select best top words in Not-DIDA
    if len(top_notdida_l) > max_top:
        top_notdida_l = top_notdida_l[-max_top:]

    # Save the results of the strict top words analysis
    strict_top['didatop'] = top_dida_l
    strict_top['notdidatop'] = top_notdida_l
    exh.write_json(strict_top, STRICT_TOPGRAMS_FILENAME.format(max_top, 1))

    display.display_ok("Strict top 1-grams analysis done")
Code example #7
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    subsets_dida = []
    subsets_notdida = []

    covers = []
    for i in range(1, n+1):
        print("Starting analysis for {0}-grams".format(i))

        # Process on DIDA class
        subset, set_cover = process_ngrams(i, dida_data, "dida")
        subsets_dida.extend(subset)
        covers.extend(set_cover)

        # Process on Not-DIDA class
        subset, set_cover = process_ngrams(i, notdida_data, "notdida")
        subsets_notdida.extend(subset)
        covers.extend(set_cover)

        display.display_ok("Analysis for {0}-grams done".format(i))

    print("Searching set cover with all grams for DIDA")
    set_cover = get_set_cover(subsets_dida)
    scores = check_score(set_cover, subsets_dida, subsets_dida, dida_data)
    exh.write_text(scores, SCORE_FILENAME.format("dida", "all"))
    display.display_ok("Done")

    print("Searching set cover with all grams for NotDIDA")
    set_cover = get_set_cover(subsets_notdida)
    scores = check_score(set_cover, subsets_notdida, subsets_notdida, notdida_data)
    exh.write_text(scores, SCORE_FILENAME.format("notdida", "all"))
    display.display_ok("Done")

    save_topwords(covers)
    display.display_info("All results were saved in {0} directory".format(DIRECTORY))
Code example #8
def filter(pmids, known_pmids):
    """Filters from new PMIDs the ones that are already in DIDA

    Parameters
    ----------
    pmids : list
        The list of new PMIDs
    known_pmids : list
        The list of PMIDs already in DIDA

    Returns
    -------
    list
        the list of PMIDs that are not in DIDA
    """
    print("Filtering PMIDs.")
    notdida = []
    for pmid in pmids:
        if pmid not in known_pmids:
            notdida.append(pmid)
    display.display_ok("Filtering PMIDs done.")
    return notdida
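For example, with hypothetical PMIDs:

filter(["31111111", "32222222", "33333333"], ["32222222"])
# ['31111111', '33333333']

Note that this helper shadows Python's built-in filter; a more specific name, such as filter_known_pmids, would avoid the collision.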
Code example #9
def get_pmids_by_dates():
    """Gets the PMIDs of publications between the dates specified in the configuration file

    Returns
    -------
    list
        the list of the PMIDs of the found publications
    """
    start_year = CONFIG['START_YEAR']
    end_year = CONFIG['SPLIT_YEAR']
    print("Retrieving new PMIDs between {0} and {1}".format(
        start_year, end_year))

    ids = []
    query = "digenic+AND+{0}[pdat]"
    for year in range(start_year, end_year):
        ids.extend(pbmdh.get_pmids(query.format(year)))

    x = np.array(ids)
    x = list(np.unique(x))
    display.display_ok("{0} new PMIDs found".format(len(x)))
    return x
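The np.array / np.unique round trip both deduplicates and sorts; assuming get_pmids returns PMIDs as strings, a pure-Python equivalent is:

x = sorted(set(ids))  # unique PMIDs in lexicographic order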
Code example #10
def merge_ngrams(grams1, grams2):
    """Merge to list of n-grams by keeping their score in each class

    Parameters
    ----------
    grams1 : list
        The first list of n-grams
    grams2 : list
        The second list of n-grams

    Returns
    -------
    dict
        a dict object containing the score of each n-gram in each class
    """
    print("Merging n-grams")
    merged = dict()
    dida = CONFIG['DIDA_DOCS']
    notdida = CONFIG['NOTDIDA_DOCS']

    for gram in grams1:
        if gram[0] not in merged:
            # Create the gram
            merged[gram[0]] = dict()
            merged[gram[0]][dida] = gram[1]
            # Prepare value for notDIDA
            merged[gram[0]][notdida] = 0

    for gram in grams2:
        if gram[0] not in merged:
            # Create the gram
            merged[gram[0]] = dict()
            # Value is 0 for DIDA because gram was not in grams1
            merged[gram[0]][dida] = 0
        merged[gram[0]][notdida] = gram[1]

    display.display_ok("Merging done")

    return merged
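A minimal usage sketch (hypothetical scores; it assumes CONFIG['DIDA_DOCS'] == "dida" and CONFIG['NOTDIDA_DOCS'] == "notdida"):

grams1 = [("gene therapy", 0.6)]
grams2 = [("gene therapy", 0.2), ("variant", 0.4)]
merge_ngrams(grams1, grams2)
# {'gene therapy': {'dida': 0.6, 'notdida': 0.2},
#  'variant': {'dida': 0, 'notdida': 0.4}}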
Code example #11
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    for i in range(1, n + 1):
        print("Starting analysis for {0}-grams".format(i))

        print("Couting occurrences for DIDA")
        dida_occurrences = ngh.count_occurrences(i, dida_data)
        dida_normalized = ngh.normalize_occurrences(dida_occurrences,
                                                    len(dida_data))
        display.display_ok("Counting occurrences for DIDA done")

        print("Couting occurrences for NotDIDA")
        notdida_occurrences = ngh.count_occurrences(i, notdida_data)
        notdida_normalized = ngh.normalize_occurrences(notdida_occurrences,
                                                       len(notdida_data))
        display.display_ok("Counting occurrences for NotDIDA done")

        # Merge n-grams in the same list
        merged = merge_ngrams(dida_normalized, notdida_normalized)

        # Order n-grams by difference
        merged = ordered(merged, score)

        # Save results
        save_to_file(merged, i)

        display.display_ok("Analysis for {0}-grams done".format(i))
Code example #12
def get_dida_pmids(dida_pmids):
    """Gets the PMIDs of publications in DIDA from a text file

    Parameters
    ----------
    dida_pmids : str
        The file name of the file containing the PMIDs in DIDA

    Returns
    -------
    list
        the list of PMIDs of publications in DIDA
    """
    print("Retrieving PMIDs from {0}".format(dida_pmids))
    pmids = []
    with open(dida_pmids) as f:
        for line in f:
            pmids.append(line.rstrip('\n'))
    display.display_ok("Retrieving PMIDs done. {0} PMIDs found".format(
        len(pmids)))
    return pmids
Code example #13
def strict_ngrams(n, dida_grams, notdida_grams):
    """Applies strict top n-grams analysis

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications
    notdida_grams : list
        The n-grams of Not-DIDA publications
    """
    print("Starting strict top {0}-grams analysis".format(n))

    didatop = []
    notdidatop = []
    strict_top = dict()
    max_top = CONFIG['NTOPWORDS']

    # Find n-grams that are in DIDA but not in Not-DIDA
    find_unique(dida_grams, notdida_grams, didatop)

    # Find n-grams that are in Not-DIDA but not in DIDA
    find_unique(notdida_grams, dida_grams, notdidatop)

    # Select the best top grams
    if len(didatop) > max_top:
        didatop = didatop[-max_top:]
    if len(notdidatop) > max_top:
        notdidatop = notdidatop[-max_top:]

    # Save the results of the strict top grams analysis
    strict_top['didatop'] = didatop
    strict_top['notdidatop'] = notdidatop
    exh.write_json(strict_top, STRICT_TOPGRAMS_FILENAME.format(max_top, n))

    display.display_ok("Strict top {0}-grams analysis done".format(n))
Code example #14
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    for i in range(1, n + 1):
        extract_ngrams(i, deepcopy(dida_data), deepcopy(notdida_data))
Code example #15
def extract_ngrams(n, dida_data, notdida_data):
    """Extracts n-grams from publications

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_data : list
        The publications of DIDA
    notdida_data : list
        The publications of Not-DIDA
    """
    print("Extracting {0}-grams".format(n))

    initial_stopwords = pbmdh.STOPWORDS

    if n == 1:
        find_top_words(deepcopy(dida_data), deepcopy(notdida_data),
                       deepcopy(initial_stopwords))
    else:
        find_top_ngrams(n, deepcopy(dida_data), deepcopy(notdida_data),
                        deepcopy(initial_stopwords))

    display.display_ok("Extracting {0}-grams done".format(n))
Code example #16
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))

    # docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    display.display_ok("Loading publications done")

    print("Starting extraction of words information")
    extract_words_information(docs)
    display.display_ok("Extraction of words information done")

    print("Computing joint probability distribution")
    joint_probability_distribution()
    display.display_ok("Computing joint probability distribution done")

    print("Starting IB method")
    all_clusters = ib.cluster(deepcopy(Pcw), deepcopy(Pw))
    display.display_ok("IB method finished")

    save_clusters(all_clusters)
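joint_probability_distribution is not shown; a common construction for the IB method on labeled documents (an assumption about this project, not its actual code) normalizes per-class word counts into the joint p(c, w), with p(w) as its marginal:

import numpy as np

def joint_probability(counts):
    # counts[c][w]: occurrences of word w in documents of class c
    # (hypothetical sketch of the Pcw and Pw passed to ib.cluster)
    N = np.asarray(counts, dtype=float)
    Pcw = N / N.sum()      # joint distribution p(c, w)
    Pw = Pcw.sum(axis=0)   # word marginal p(w)
    return Pcw, Pw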
Code example #17
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Extension of the input file
    extension = args.FILE.split('.')[-1]

    if extension in LEGAL_EXTENSIONS:
        exh.create_directory(DIRECTORY)

        # Get publications
        print("Getting publications")
        documents_l = read_file(args.FILE, extension)
        display.display_ok("Getting publications done")

        # Save publications
        filename = BACK_FILENAME.format(args.OUTPUT)
        exh.write_json(documents_l, filename)
        display.display_info("Publications saved in {0}".format(filename))

        # Insert PubTator annotations in the abstracts
        print("Inserting PubTator annotations in abstracts")
        docs = pbmdh.extract_features(documents_l)
        display.display_ok("Inserting PubTator annotations in abstracts done")

        # Extract n-grams
        print("Extracting n-grams")
        ngh.extract_ngrams(docs, CONFIG['NGRAMS'])
        display.display_ok("Extracting n-grams done")

        # Save publications and their n-grams
        filename = NGRAMS_FILENAME.format(args.OUTPUT)
        exh.write_json(docs, filename)
        display.display_info("Publications and n-grams saved in {0}".format(filename))
    else:
        # The input file does not have a valid extension
        display.display_fail("Extension of input file not supported. Required: txt or json. Received: {0}".format(extension))
        sys.exit(1)
Code example #18
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    csv_files = csv_filenames(n)

    # Real labels of each publication
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))

    data = deepcopy(dida_data)
    data.extend(deepcopy(notdida_data))

    scores = []
    classifiers_names = []

    print("Strict Classifier training")
    results = train(StrictClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'strict_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "strict", "threshold",
                    LOG_FILENAME.format("strict"))
    classifiers_names.append("Strict Classifier")
    display.display_ok("Strict Classifier training done")

    print("Split Weighted Classifier training")
    results = train(SplitWeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'splitweighted_', "threshold", "Threshold",
                              DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "splitweighted", "threshold",
                    LOG_FILENAME.format("splitweighted"))
    classifiers_names.append("Split Weighted Classifier")
    display.display_ok("Split Weighted Classifier training done")

    print("Weighted Classifier training")
    results = train(WeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'weighted_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "weighted", "threshold",
                    LOG_FILENAME.format("weighted"))
    classifiers_names.append("Weighted Classifier")
    display.display_ok("Weighted Classifier training done")

    plt.plot_lines(results['threshold'], scores, classifiers_names,
                   FSCORE_FILENAME, "Threshold", "F1-score")
    display.display_info("Results saved in {0}".format(DIRECTORY))
Code example #19
def classification(docs, Ndw, W, directory, true_predictions):
    """Evaluates the cluster-based classifiers for several numbers of clusters

    Returns
    -------
    dict, dict
        the results of the strict converter and of the doublon converter
    """
    strict_result = {
        "n_clusters": [],
        "tn": [],
        "fp": [],
        "fn": [],
        "tp": [],
        "score": []
    }
    doublon_result = {
        "n_clusters": [],
        "tn": [],
        "fp": [],
        "fn": [],
        "tp": [],
        "score": []
    }

    print("Documents replacement")
    converted_docs = converter.init(deepcopy(docs), deepcopy(W))
    display.display_ok("Documents replacement done")

    clusters_directory = directory + "/clusters"
    max_clusters = len(W)

    print("Evaluating classifier")
    # Numbers of clusters to evaluate: 1 to 10, then every 100 up to 8400, then 8417
    a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    a.extend(range(100, 8500, 100))
    a.append(8417)
    for n_clusters in a:
        print("Processing for {0} clusters (Total : {1})".format(
            n_clusters, max_clusters))

        # Load clusters
        clusters = exh.load_json(clusters_directory +
                                 "/{0}.json".format(n_clusters))

        # Prepare classifier
        classifier = NaiveBayesCluster(deepcopy(clusters), deepcopy(Ndw),
                                       deepcopy(W))
        print("Classifier ready")

        print("Converting documents")
        strict_converted_docs = converter.convert_all(deepcopy(converted_docs),
                                                      deepcopy(clusters))
        doublon_converted_docs = converter.convert_all(
            deepcopy(converted_docs), deepcopy(clusters), method='d')
        print("Converting documents done")

        print("Evaluate Strict Predictions")
        strict_predictions = classifier.evaluate(strict_converted_docs)
        print("Evaluate Doublon Predictions")
        doublon_predictions = classifier.evaluate(doublon_converted_docs)
        print("Predictions done")
        print("Perform scores")
        strict_score = classifier.score(true_predictions, strict_predictions)
        doublon_score = classifier.score(true_predictions, doublon_predictions)
        print("Scores performed : ({0}, {1})".format(strict_score,
                                                     doublon_score))
        add_result(n_clusters, strict_score, strict_result)
        add_result(n_clusters, doublon_score, doublon_result)

    display.display_ok("Evaluating classifier done")
    return strict_result, doublon_result
Code example #20
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))

    # docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    display.display_ok("Loading publications done")

    data_directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    Ndw = exh.load_json(data_directory + "/ndw.json")
    W = exh.load_json(data_directory + "/W.json")

    # Real labels of each publication
    # y_true = np.append(np.zeros(len(notdida_data)), np.ones(len(dida_data)))
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))
    strict_result, doublon_result = classification(docs, Ndw, W,
                                                   data_directory, y_true)

    plt.plot_confusion_matrix(strict_result,
                              len(dida_data),
                              len(notdida_data),
                              "strict_",
                              "n_clusters",
                              "Number of clusters",
                              DIRECTORY,
                              step=1000)
    exh.save_to_log(strict_result, "strict", "n_clusters",
                    LOG_FILENAME.format("strict"))
    plt.plot_confusion_matrix(doublon_result,
                              len(dida_data),
                              len(notdida_data),
                              "doublon_",
                              "n_clusters",
                              "Number of clusters",
                              DIRECTORY,
                              step=1000)
    exh.save_to_log(doublon_result, "doublon", "n_clusters",
                    LOG_FILENAME.format("doublon"))

    scores = [strict_result['score'], doublon_result['score']]
    classifiers_names = ["Strict converter", "Doublon converter"]

    plt.plot_lines(strict_result['n_clusters'],
                   scores,
                   classifiers_names,
                   FSCORE_FILENAME,
                   "Number of clusters",
                   "F1-score",
                   step=1000)
Code example #21
def cross_ngrams(n, dida_grams, notdida_grams):
    """Applies cross n-grams analysis

    Parameters
    ----------
    n : int
        The length of the n-grams
    dida_grams : list
        The n-grams of DIDA publications
    notdida_grams : list
        The n-grams of Not-DIDA publications
    """
    print("Starting cross top {0}-grams analysis".format(n))

    iteration = 0
    CTG = [1]  # common top grams
    blacklist = []
    topgrams_dict = dict()
    blacklist_dict = dict()
    max_top = CONFIG['NTOPWORDS']

    while CTG:
        # Loop until there are no common top grams
        CTG.clear()

        grams1 = deepcopy(dida_grams)
        grams2 = deepcopy(notdida_grams)

        # Select the best top grams
        if len(grams1) > max_top:
            grams1 = grams1[-max_top:]
        if len(grams2) > max_top:
            grams2 = grams2[-max_top:]

        # Search the common grams
        for gram1 in grams1:
            for gram2 in grams2:
                if gram1[0] == gram2[0] and gram1[0] not in blacklist:
                    CTG.append(gram1[0])
                    break

        # Save the top grams
        topgrams_dict['iteration'] = iteration
        topgrams_dict['CTG'] = CTG
        topgrams_dict['dida'] = grams1
        topgrams_dict['notdida'] = grams2
        exh.write_json(topgrams_dict,
                       TOPGRAMS_FILENAME.format(n, max_top, iteration))

        if CTG:
            # If there are common top grams
            # Add them to the blacklist
            blacklist.extend(CTG)

            # Remove them from each set of grams
            for word in CTG:
                for gram in dida_grams:
                    if gram[0] == word:
                        dida_grams.remove(gram)
                        break
                for gram in notdida_grams:
                    if gram[0] == word:
                        notdida_grams.remove(gram)
                        break

            # Save the blacklist
            blacklist_dict['stopgrams'] = blacklist
            iteration += 1
            exh.write_json(blacklist_dict,
                           STOPGRAMS_FILENAME.format(n, max_top, iteration))

    display.display_ok("Cross top {0}-grams analysis done".format(n))