Code Example #1
def test_prediction_on_classifiers(X: pd.DataFrame,
                                   y: pd.Series,
                                   cv_sets=None,
                                   test_prefix=None):
    if cv_sets is None:
        cv = 10
        print("Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"Using {len(cv_sets)}x preselected train/test sets...")

    results = pd.DataFrame()
    for clf_name, clf in ML_CLASSIFIERS.items():
        timer = Timer()
        with timer:
            scores = cross_validate(clf,
                                    X,
                                    y,
                                    scoring=scoring,
                                    cv=cv,
                                    n_jobs=-1)
        print("Accuracy: %0.2f (+/- %0.2f) <-- %s" % (
            scores["test_accuracy"].mean(),
            scores["test_accuracy"].std() * 2,
            clf_name,
        ))
        data = {
            score_type: scores[score_field]
            for score_type, score_field in zip(scoring, score_fields)
        }
        data["method"] = (clf_name if test_prefix is None
                          else test_prefix + clf_name)
        data["time"] = timer.interval
        results = results.append(pd.DataFrame(data), ignore_index=True)
    return results
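
The examples in this listing refer to several module-level names defined elsewhere in their source projects (ML_CLASSIFIERS, GRAKEL_KERNELS, scoring, score_fields, SVM_C_PARAMS, TIMEOUT, Timer, and the load/save helpers). For the scoring-related pair, the following is a minimal sketch, assuming that scoring is a list of scikit-learn scorer names and that score_fields holds the matching "test_"-prefixed keys returned by cross_validate(); the actual metric list in the source project may differ.

# Hypothetical sketch of the scoring globals assumed by the examples above and
# below; the metric names here are placeholders, not the project's actual list.
scoring = ["accuracy", "precision_macro", "recall_macro", "f1_macro"]
# cross_validate() reports each requested metric under a "test_"-prefixed key
score_fields = ["test_" + score_type for score_type in scoring]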
Code Example #2
def train_provenance_kernel_pipeline(
    graphs: pd.DataFrame,
    output_path: Path,
    kernel: str,
    level: int,
    y_column: str,
    including_edge_type_counts: bool = False,
) -> Pipeline:
    X, y = load_kernel_ml_data(
        graphs, output_path, kernel, level, y_column, including_edge_type_counts
    )
    clf = Pipeline(
        [
            ("scale", StandardScaler(with_mean=False)),
            ("svm", SVC(kernel="rbf", gamma="scale", class_weight="balanced")),
        ]
    )
    gs = GridSearchCV(
        estimator=clf,
        param_grid={
            "svm__C": SVM_C_PARAMS,
        },
        refit=True,
        n_jobs=-1,
    )
    with Timer():
        gs.fit(X, y)

    clf = gs.best_estimator_
    print(" - Best params:", gs.best_params_)
    print(" - Best score:", gs.best_score_)
    print(" - Accuracy:", clf.score(X, y))

    return clf
Code Example #3
def test_prediction_on_Grakel_kernels(
    graphs: pd.DataFrame, y_column: str, cv_sets=None, ignore_kernels=None
):
    if cv_sets is None:
        cv = 10
        print("Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"Using {len(cv_sets)}x preselected train/test sets...")
    if ignore_kernels is None:
        ignore_kernels = set()
    results = pd.DataFrame()
    for method_id, gk_class in GRAKEL_KERNELS.items():
        if method_id in ignore_kernels:
            logger.info("Skipping testing kernel: %s", method_id)
            continue

        logger.info("Testing graph kernel: %s", method_id)
        print("Testing GraKeL kernel:", method_id)
        gk = gk_class()
        has_timed_out = False
        try:
            timer = Timer(timeout=TIMEOUT)
            with timer:
                # TODO: break if timed out
                # only time the kerneling cost
                X = gk.fit_transform(graphs.grakel_graphs)
        except TimeoutException:
            has_timed_out = True
            print("*** TIMED OUT - %s ***" % method_id)

        if not has_timed_out:
            clf = SVC(kernel="precomputed", gamma="scale", class_weight="balanced")
            scores = cross_validate(
                clf, X, graphs[y_column], scoring=scoring, cv=cv, n_jobs=-1
            )
            print(
                "Accuracy: %0.2f (+/- %0.2f) <-- %s"
                % (
                    scores["test_accuracy"].mean(),
                    scores["test_accuracy"].std() * 2,
                    method_id,
                )
            )
            data = {
                score_type: scores[score_field]
                for score_type, score_field in zip(scoring, score_fields)
            }
            data["method"] = method_id
            data["time"] = timer.interval
            results = results.append(pd.DataFrame(data), ignore_index=True)
    return results
Code Example #4
def test_prediction_on_classifiers(X: pd.DataFrame,
                                   output_path: Path,
                                   y: pd.Series,
                                   cv_sets=None,
                                   test_prefix=None):
    if cv_sets is None:
        cv = 10
        print("> Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"> Using {len(cv_sets)}x preselected train/test sets...")

    results = pd.DataFrame()
    for clf_name, clf in ML_CLASSIFIERS.items():
        method_id = clf_name if test_prefix is None else test_prefix + clf_name
        # load existing scorings
        scorings = load_experiment_scorings(output_path, method_id)

        if scorings is None:
            print("> Testing ML method:", method_id)
            timer = Timer()
            with timer:
                scores = cross_validate(clf,
                                        X,
                                        y,
                                        scoring=scoring,
                                        cv=cv,
                                        n_jobs=-1)
            print("  - Accuracy: %0.2f (+/- %0.2f) <-- %s" % (
                scores["test_accuracy"].mean(),
                scores["test_accuracy"].std() * 2,
                clf_name,
            ))
            data = {
                score_type: scores[score_field]
                for score_type, score_field in zip(scoring, score_fields)
            }
            data["method"] = method_id
            data["time"] = timer.interval
            scorings = pd.DataFrame(data)
            save_experiment_scorings(output_path, method_id, scorings)

        results = results.append(scorings, ignore_index=True)

    return results
Code Example #5
def calculate_provenance_features_for_file(filepath: Path) -> list:
    # Calculate Provenance Network Metrics (22) and number of edge types
    try:
        # load the file
        prov_doc = ProvDocument.deserialize(filepath)
    except Exception as e:
        logger.error("Cannot deserialize %s", filepath)
        raise e
    try:
        timer = Timer(verbose=False)
        with timer:
            # counting the record types
            rec_type_counts = count_record_types(prov_doc)
            prov_rel_cols = [
                rec_type_counts[rec_type] if rec_type in rec_type_counts else 0
                for rec_type in PROV_RELATION_NAMES
            ]
            mv5 = version5(prov_doc, flat=True)  # calculate

        return mv5[:-4] + prov_rel_cols + [timer.interval]
    except Exception as e:
        logger.error("Cannot calculate metrics for %s", filepath)
        raise e
Code Example #6
def test_prediction_on_Grakel_kernels(
    graphs: pd.DataFrame,
    output_path: Path,
    y_column: str,
    cv_sets=None,
    ignore_kernels=None,
):
    if cv_sets is None:
        cv = 10
        print("> Using 10-fold cross validation...")
    else:
        cv = cv_sets
        print(f"> Using {len(cv_sets)}x preselected train/test sets...")
    if ignore_kernels is None:
        ignore_kernels = set()
    results = pd.DataFrame()
    for method_id, gk_class in GRAKEL_KERNELS.items():
        if method_id in ignore_kernels:
            logger.info("Skipping testing kernel: %s", method_id)
            continue

        # load existing scorings
        scorings = load_experiment_scorings(output_path, method_id)

        if scorings is None:
            # run the experiment
            logger.info("Testing graph kernel: %s", method_id)
            print("> Testing GraKeL kernel:", method_id)
            gk = gk_class()
            failed = False
            try:
                timer = Timer(timeout=TIMEOUT)
                with timer:
                    # TODO: break if timed out
                    # only time the kerneling cost
                    X = gk.fit_transform(graphs.grakel_graphs)
            except TimeoutException:
                failed = True
                print("*** TIMED OUT - %s ***" % method_id)
            except Exception as e:
                failed = True
                print(f"*** EXCEPTION - {method_id} ***\n{e}")

            if failed:
                # skip this, go to the next experiment
                continue

            clf = SVC(kernel="precomputed",
                      gamma="scale",
                      class_weight="balanced")
            gs = GridSearchCV(
                estimator=clf,
                param_grid={
                    "C": SVM_C_PARAMS,
                },
            )
            scores = cross_validate(gs,
                                    X,
                                    graphs[y_column],
                                    scoring=scoring,
                                    cv=cv,
                                    n_jobs=-1)
            print("  - Accuracy: %0.2f (+/- %0.2f) <-- %s" % (
                scores["test_accuracy"].mean(),
                scores["test_accuracy"].std() * 2,
                method_id,
            ))
            data = {
                score_type: scores[score_field]
                for score_type, score_field in zip(scoring, score_fields)
            }
            data["method"] = method_id
            data["time"] = timer.interval
            scorings = pd.DataFrame(data)
            save_experiment_scorings(output_path, method_id, scorings)

        results = results.append(scorings, ignore_index=True)
    return results
Code Example #7
import re
from scripts.utils import Timer
from urllib.parse import urlsplit
from ural.patterns import DOMAIN_TEMPLATE

N = 1_000_000
URL = 'http://www.lemonde.fr:8000/article/1234/index.html?query=mobile#2'

with Timer('urlsplit'):
    for _ in range(N):
        parsed = urlsplit(URL)
        parsed.hostname

pattern = re.compile(DOMAIN_TEMPLATE % r'lemonde\.fr')
with Timer('regex'):
    for _ in range(N):
        parsed = pattern.match(URL)
Code Example #8
import csv

from scripts.utils import Timer  # same helper as in the previous example

# CSVLine is assumed to be defined elsewhere in the source script; it appears
# to wrap a raw csv row so that its fields can be looked up by column name.


class SwiftCSVReader(object):
    def __init__(self, reader):
        self.reader = reader

    def __iter__(self):
        return self

    def __next__(self):
        # StopIteration from the underlying reader propagates unchanged
        return CSVLine(next(self.reader))


with Timer('reader'):
    with open('./scripts/data/youtube-urls.csv') as f:
        for line in csv.reader(f):
            line[1]

with Timer('DictReader'):
    with open('./scripts/data/youtube-urls.csv') as f:
        for line in csv.DictReader(f):
            line['youtube_url']

with Timer('SwiftCSVReader'):
    with open('./scripts/data/youtube-urls.csv') as f:
        for line in SwiftCSVReader(csv.reader(f)):
            line['youtube_url']
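
Every example above times a block of work with a Timer context manager imported from its own project, so the real implementations differ between sources. The sketch below is only an assumption reconstructed from how the examples use it: an optional label, a verbose flag, an optional timeout in seconds that raises TimeoutException, and an .interval attribute holding the elapsed wall-clock time.

import signal
import time


class TimeoutException(Exception):
    """Raised when a timed block exceeds its allotted time."""


class Timer:
    """Minimal sketch of a timing context manager consistent with the usage
    in the examples above; the actual Timer classes may differ."""

    def __init__(self, label=None, timeout=None, verbose=True):
        self.label = label
        self.timeout = timeout
        self.verbose = verbose
        self.interval = None

    def _raise_timeout(self, signum, frame):
        raise TimeoutException()

    def __enter__(self):
        if self.timeout is not None:
            # SIGALRM-based timeouts only work on Unix, in the main thread
            signal.signal(signal.SIGALRM, self._raise_timeout)
            signal.alarm(int(self.timeout))
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.interval = time.perf_counter() - self.start
        if self.timeout is not None:
            signal.alarm(0)  # cancel any pending alarm
        if self.verbose:
            print(f"{self.label or 'Timer'}: {self.interval:.3f}s")
        return False  # never suppress exceptions raised inside the block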