Example #1
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    subsets_dida = []
    subsets_notdida = []

    covers = []
    for i in range(1, n+1):
        print("Starting analysis for {0}-grams".format(i))

        # Process on DIDA class
        subset, set_cover = process_ngrams(i, dida_data, "dida")
        subsets_dida.extend(subset)
        covers.extend(set_cover)

        # Process on Not-DIDA class
        subset, set_cover = process_ngrams(i, notdida_data, "notdida")
        subsets_notdida.extend(subset)
        covers.extend(set_cover)

        display.display_ok("Analysis for {0}-grams done".format(i))

    print("Searching set cover with all grams for DIDA")
    set_cover = get_set_cover(subsets_dida)
    scores = check_score(set_cover, subsets_dida, subsets_dida, dida_data)
    exh.write_text(scores, SCORE_FILENAME.format("dida", "all"))
    display.display_ok("Done")

    print("Searching set cover with all grams for NotDIDA")
    set_cover = get_set_cover(subsets_notdida)
    scores = check_score(set_cover, subsets_notdida, subsets_notdida, notdida_data)
    exh.write_text(scores, SCORE_FILENAME.format("notdida", "all"))
    display.display_ok("Done")

    save_topwords(covers)
    display.display_info("All results were saved in {0} directory".format(DIRECTORY))
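run receives the result of parsing the command line, a pattern shared by most examples below; note that parse_args actually returns an argparse.Namespace, even though the docstrings describe it as ArgumentParser. A minimal driver sketch, assuming a positional CONFIG argument to match the args.CONFIG accesses above (the original argument definitions are not shown):

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="n-gram set cover analysis")
    # args.CONFIG selects the file config/<CONFIG>.json
    parser.add_argument("CONFIG", help="name of the configuration file")
    run(parser.parse_args())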
Example #2
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    for i in range(1, n + 1):
        print("Starting analysis for {0}-grams".format(i))

        print("Couting occurrences for DIDA")
        dida_occurrences = ngh.count_occurrences(i, dida_data)
        dida_normalized = ngh.normalize_occurrences(dida_occurrences,
                                                    len(dida_data))
        display.display_ok("Counting occurrences for DIDA done")

        print("Couting occurrences for NotDIDA")
        notdida_occurrences = ngh.count_occurrences(i, notdida_data)
        notdida_normalized = ngh.normalize_occurrences(notdida_occurrences,
                                                       len(notdida_data))
        display.display_ok("Counting occurrences for NotDIDA done")

        # Merge n-grams in the same list
        merged = merge_ngrams(dida_normalized, notdida_normalized)

        # Order n-grams by difference
        merged = ordered(merged, score)

        # Save results
        save_to_file(merged, i)

        display.display_ok("Analysis for {0}-grams done".format(i))
Example #3
def read_file(filename, extension):
    """Returns publications based on a text file containing PMIDs or a JSON file
    containing publications

    Parameters
    ----------
    filename : str
        The name of the file to read
    extension : str
        The extension of the file

    Returns
    -------
    list
        A list of publications in JSON format
    """
    if extension == "txt":
        print("Received a text file - Reading PMIDs list")
        # Read each PMID in the file (one per line)
        with open(filename) as f:
            pmids = [line.strip() for line in f]

        # Download and return the publications
        print("Downloading publications")
        return pbmdh.download_publications(pmids)
    elif extension == "json":
        print("Received a JSON file - Getting publications")
        return exh.load_json(filename)
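Callers derive the extension from the file name itself, as Example #7 does with args.FILE; a hypothetical invocation:

filename = "dida_pmids.txt"          # hypothetical input file
extension = filename.split('.')[-1]  # "txt" -> treat the file as a PMID list
documents = read_file(filename, extension)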
Example #4
def run(args, dida_pmids):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    dida_pmids : str
        The name of the file containing the PMIDs in DIDA
    """
    global CONFIG
    # Load configuration
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Get DIDA PMIDs
    known_pmids = get_dida_pmids(dida_pmids)
    # Get Not-DIDA PMIDs
    pmids = get_pmids_by_dates()
    notdida_pmids = filter(pmids, known_pmids)

    display.display_info("Total PMIDs between {0} and {1} : {2}".format(
        CONFIG['START_YEAR'], CONFIG['SPLIT_YEAR'], len(pmids)))
    display.display_info("Total PMIDs in DIDA : {0}".format(len(known_pmids)))
    display.display_info("Total PMIDs in Not-DIDA : {0}".format(
        len(notdida_pmids)))

    # Download Not-DIDA publications
    download_doc(notdida_pmids)
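The filter call above shadows Python's builtin of the same name; it keeps only the PMIDs that are not already in DIDA. A one-line sketch of such a helper (the original is not shown):

def filter(pmids, known_pmids):
    # Keep the Not-DIDA candidates: PMIDs absent from DIDA
    known = set(known_pmids)
    return [pmid for pmid in pmids if pmid not in known]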
Example #5
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']
    for i in range(1, n + 1):
        extract_ngrams(i, deepcopy(dida_data), deepcopy(notdida_data))
Example #6
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))

    # docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    display.display_ok("Loading publications done")

    print("Starting extraction of words information")
    extract_words_information(docs)
    display.display_ok("Extraction of words information done")

    print("Computing joint probability distribution")
    joint_probability_distribution()
    display.display_ok("Computing joint probability distribution done")

    print("Starting IB method")
    all_clusters = ib.cluster(deepcopy(Pcw), deepcopy(Pw))
    display.display_ok("IB method finished")

    save_clusters(all_clusters)
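The IB method clusters words from the joint distribution of classes and words. A minimal sketch of how Pcw and Pw could be derived from class-word counts, assuming a counts array N with one row per class (extract_words_information and joint_probability_distribution are not shown):

import numpy as np

# Hypothetical counts: N[c, w] = occurrences of word w in class c
N = np.array([[3.0, 0.0, 2.0],
              [1.0, 4.0, 0.0]])

Pcw = N / N.sum()     # joint distribution p(c, w)
Pw = Pcw.sum(axis=0)  # marginal p(w), as passed to ib.cluster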
Example #7
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    # Extension of the input file
    extension = args.FILE.split('.')[-1]

    if extension in LEGAL_EXTENSIONS:
        exh.create_directory(DIRECTORY)

        # Get publications
        print("Getting publications")
        documents_l = read_file(args.FILE, extension)
        display.display_ok("Getting publications done")

        # Save publications
        filename = BACK_FILENAME.format(args.OUTPUT)
        exh.write_json(documents_l, filename)
        display.display_info("Publications saved in {0}".format(filename))

        # Insert PubTator annotations in the abstracts
        print("Inserting PubTator annotations in abstracts")
        docs = pbmdh.extract_features(documents_l)
        display.display_ok("Inserting PubTator annotations in abstracts done")

        # Extract n-grams
        print("Extracting n-grams")
        ngh.extract_ngrams(docs, CONFIG['NGRAMS'])
        display.display_ok("Extracting n-grams done")

        # Save publications and their n-grams
        filename = NGRAMS_FILENAME.format(args.OUTPUT)
        exh.write_json(docs, filename)
        display.display_info("Publications and n-grams saved in {0}".format(filename))
    else:
        # The input file does not have a valid extension
        display.display_fail(
            "Extension of input file not supported. Required : txt or json."
            " Received : {0}".format(extension))
        sys.exit(1)
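The failure branch pins down the module-level constant used in the check: only text and JSON inputs are accepted. LEGAL_EXTENSIONS is presumably defined as:

LEGAL_EXTENSIONS = ["txt", "json"]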
Example #8
    * download_publications - downloads publications based on a PMIDs list
    * extract_features - inserts PubTator annotations into the publications' abstracts
    * get_pmids - gets a PMIDs list based on a particular query
"""

import json
import urllib.request as req
import xml.etree.ElementTree as ET

from copy import deepcopy
from nltk.stem import PorterStemmer
from string import punctuation

import explorer_helper as exh

STOPWORDS = exh.load_json("config/stopwords.json")['stopwords']
URL_DOWNLOAD = "https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/RESTful/tmTool.cgi/BioConcept/{0}/JSON/"
URL_PMIDS = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term={0}&retmax={1}"
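
# Sketch of how get_pmids might use URL_PMIDS (hypothetical reconstruction;
# the function's body is not part of this truncated example). NCBI esearch
# returns XML whose IdList/Id elements hold the matching PMIDs.
def get_pmids(query, retmax=100000):
    url = URL_PMIDS.format(req.quote(query), retmax)
    with req.urlopen(url) as response:
        tree = ET.fromstring(response.read())
    return [node.text for node in tree.findall(".//Id")]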

def clean_text(text, stopwords=STOPWORDS):
    """Lowercases and stems publications abstracts

    Parameters
    ----------
    text : str
        The abstract to lowercase and stem
    stopwords : list
        A list of stopwords

    Returns
    -------
    str
        The lowercased, stemmed abstract without punctuation or stopwords
    """
    # Body reconstructed from the docstring and the imports above
    # (PorterStemmer, punctuation, STOPWORDS); the original example is
    # truncated at this point.
    stemmer = PorterStemmer()
    text = text.lower().translate(str.maketrans('', '', punctuation))
    return ' '.join(stemmer.stem(word) for word in text.split()
                    if word not in stopwords)
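A quick check of the reconstructed cleaning step (exact output depends on the stopword list):

clean_text("The proteins were analyzed for digenic interactions.")
# e.g. 'protein analyz digen interact'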
Example #9
def classification(docs, Ndw, W, directory, true_predictions):
    """Evaluates the NaiveBayesCluster classifier for several numbers
    of clusters

    Parameters
    ----------
    docs : list
        The publications to classify
    Ndw : dict
        The occurrences of each word in each document
    W : list
        The vocabulary of the words
    directory : str
        The directory containing the precomputed clusters
    true_predictions : list
        The real label of each publication
    """
    strict_result = {
        "n_clusters": [],
        "tn": [],
        "fp": [],
        "fn": [],
        "tp": [],
        "score": []
    }
    doublon_result = {
        "n_clusters": [],
        "tn": [],
        "fp": [],
        "fn": [],
        "tp": [],
        "score": []
    }

    print("Documents replacement")
    converted_docs = converter.init(deepcopy(docs), deepcopy(W))
    display.display_ok("Documents replacement done")

    clusters_directory = directory + "/clusters"
    max_clusters = len(W)

    print("Evaluating classifier")
    # Numbers of clusters to evaluate: 1-10, every 100 up to 8400, then 8417
    cluster_counts = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    cluster_counts.extend(range(100, 8500, 100))
    cluster_counts.append(8417)
    for n_clusters in cluster_counts:
        print("Processing for {0} clusters (Total : {1})".format(
            n_clusters, max_clusters))

        # Load clusters
        clusters = exh.load_json(clusters_directory +
                                 "/{0}.json".format(n_clusters))

        # Prepare classifier
        classifier = NaiveBayesCluster(deepcopy(clusters), deepcopy(Ndw),
                                       deepcopy(W))
        print("Classifier ready")

        print("Converting documents")
        strict_converted_docs = converter.convert_all(deepcopy(converted_docs),
                                                      deepcopy(clusters))
        doublon_converted_docs = converter.convert_all(
            deepcopy(converted_docs), deepcopy(clusters), method='d')
        print("Converting documents done")

        print("Evaluate Strict Predictions")
        strict_predictions = classifier.evaluate(strict_converted_docs)
        print("Evaluate Doublon Predictions")
        doublon_predictions = classifier.evaluate(doublon_converted_docs)
        print("Predictions done")
        print("Perform scores")
        strict_score = classifier.score(true_predictions, strict_predictions)
        doublon_score = classifier.score(true_predictions, doublon_predictions)
        print("Scores performed : ({0}, {1})".format(strict_score,
                                                     doublon_score))
        add_result(n_clusters, strict_score, strict_result)
        add_result(n_clusters, doublon_score, doublon_result)

    display.display_ok("Evaluating classifier done")
    return strict_result, doublon_result
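The result dictionaries collect one row per cluster count. A minimal sketch of the add_result helper consistent with those keys, assuming classifier.score returns the confusion-matrix counts together with the F1-score (the original helper is not shown):

def add_result(n_clusters, score, result):
    # Hypothetical: assumes score = (tn, fp, fn, tp, f1)
    tn, fp, fn, tp, f1 = score
    result["n_clusters"].append(n_clusters)
    result["tn"].append(tn)
    result["fp"].append(fp)
    result["fn"].append(fn)
    result["tp"].append(tp)
    result["score"].append(f1)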
Example #10
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))

    # docs = [deepcopy(notdida_data), deepcopy(dida_data)]
    docs = [deepcopy(dida_data), deepcopy(notdida_data)]
    display.display_ok("Loading publications done")

    data_directory = DIRECTORY + '/' + CONFIG['ALL_CLUSTERS_DIRECTORY']
    Ndw = exh.load_json(data_directory + "/ndw.json")
    W = exh.load_json(data_directory + "/W.json")

    # Real labels of each publication
    # y_true = np.append(np.zeros(len(notdida_data)), np.ones(len(dida_data)))
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))
    strict_result, doublon_result = classification(docs, Ndw, W,
                                                   data_directory, y_true)

    plt.plot_confusion_matrix(strict_result,
                              len(dida_data),
                              len(notdida_data),
                              "strict_",
                              "n_clusters",
                              "Number of clusters",
                              DIRECTORY,
                              step=1000)
    exh.save_to_log(strict_result, "strict", "n_clusters",
                    LOG_FILENAME.format("strict"))
    plt.plot_confusion_matrix(doublon_result,
                              len(dida_data),
                              len(notdida_data),
                              "doublon_",
                              "n_clusters",
                              "Number of clusters",
                              DIRECTORY,
                              step=1000)
    exh.save_to_log(doublon_result, "doublon", "n_clusters",
                    LOG_FILENAME.format("doublon"))

    scores = [strict_result['score'], doublon_result['score']]
    classifiers_names = ["Strict converter", "Doublon converter"]

    plt.plot_lines(strict_result['n_clusters'],
                   scores,
                   classifiers_names,
                   FSCORE_FILENAME,
                   "Number of clusters",
                   "F1-score",
                   step=1000)
Example #11
def run(args):
    """Executes the main process of the script

    Parameters
    ----------
    args : ArgumentParser
        The arguments of the command typed by the user
    """
    global CONFIG
    CONFIG = exh.load_json("config/{0}.json".format(args.CONFIG))

    exh.create_directory(DIRECTORY)

    print("Loading publications")
    # Load DIDA publications
    dida_data = exh.load_json(FILENAME_TEMPLATE.format(CONFIG['DIDA_DOCS']))
    # Load Not-DIDA publications
    notdida_data = exh.load_json(
        FILENAME_TEMPLATE.format(CONFIG['NOTDIDA_DOCS']))
    display.display_ok("Loading publications done")

    n = CONFIG['NGRAMS']

    csv_files = csv_filenames(n)

    # Real labels of each publication
    y_true = np.append(np.ones(len(dida_data)), np.zeros(len(notdida_data)))

    data = deepcopy(dida_data)
    data.extend(deepcopy(notdida_data))

    scores = []
    classifiers_names = []

    print("Strict Classifier training")
    results = train(StrictClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'strict_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "strict", "threshold",
                    LOG_FILENAME.format("strict"))
    classifiers_names.append("Strict Classifier")
    display.display_ok("Strict Classifier training done")

    print("Split Weighted Classifier training")
    results = train(SplitWeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'splitweighted_', "threshold", "Threshold",
                              DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "splitweighted", "threshold",
                    LOG_FILENAME.format("splitweighted"))
    classifiers_names.append("Split Weighted Classifier")
    display.display_ok("Split Weighted Classifier training done")

    print("Weighted Classifier training")
    results = train(WeightedClassifier, deepcopy(data), csv_files, y_true)
    plt.plot_confusion_matrix(results, len(dida_data), len(notdida_data),
                              'weighted_', "threshold", "Threshold", DIRECTORY)
    scores.append(results['score'])
    exh.save_to_log(results, "weighted", "threshold",
                    LOG_FILENAME.format("weighted"))
    classifiers_names.append("Weighted Classifier")
    display.display_ok("Weighted Classifier training done")

    plt.plot_lines(results['threshold'], scores, classifiers_names,
                   FSCORE_FILENAME, "Threshold", "F1-score")
    display.display_info("Results saved in {0}".format(DIRECTORY))