Example #1
0
def cluster_acquire(total_data, ratio):
    hotpoints, total_data = hotpoints_acquire(total_data)
    total_texts, cluster = [], []
    key_text = hotpoints[2]
    for line in total_data:
        total_texts.append(line[2])

    sim = calculate_similarity(key_text, total_texts)
    result = []
    for index, value in enumerate(sim):
        result.append({"id": index, "value": value})
    # Sort by similarity, highest first
    result.sort(key=lambda x: x["value"], reverse=True)
    # Select the rows whose similarity to the hotpoint text exceeds the threshold (ratio)
    for line in result:
        if line["value"] > ratio:
            cluster.append(total_data[line["id"]])
            total_data[line["id"]][7] = -2  # mark this row as already assigned to a cluster
        else:
            break
    # Update the dataset: keep only the rows not yet assigned to a cluster
    new_data = []
    for line in total_data:
        if line[7] != -2:
            new_data.append(line)
    return hotpoints, cluster, new_data
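
For reference, a minimal sketch of the one-to-many calculate_similarity(key_text, texts) helper that Examples #1 and #3 assume but do not show: TF-IDF cosine similarity returning one score per candidate text. The scikit-learn implementation and the default tokenization are assumptions, not the original code (Chinese text would additionally need a tokenizer such as jieba).

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(key_text, texts):
    # Assumed behaviour: score each candidate text against key_text
    # in a shared TF-IDF space and return one cosine score per text.
    matrix = TfidfVectorizer().fit_transform([key_text] + list(texts))
    return cosine_similarity(matrix[0:1], matrix[1:])[0]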
Example #2
0
def calculate_keyword_similarity(sentences):
    script_keywords = extract_keywords(sentences[0])
    forum_keywords = extract_keywords(sentences[1])
    try:
        sk = ' '.join([x[0] for x in script_keywords])
        fk = ' '.join([x[0] for x in forum_keywords])
        documents = [sk, fk]
        similarity = calculate_similarity(documents)[0][1]  # using cosine
        # similarity = calculate_word2vec_similarity(documents)  # using word2vec
    except TypeError:
        similarity = 0
    return similarity
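
Example #2's commented-out alternative (and the first fragment of Example #6) call a calculate_word2vec_similarity helper that is also not shown. A minimal sketch, assuming a pre-trained gensim KeyedVectors model and whitespace tokenization; the model path and the helper's shape are hypothetical:

from gensim.models import KeyedVectors

def calculate_word2vec_similarity(documents, model_path="word2vec.kv"):
    # model_path is a placeholder; load whichever pre-trained vectors are available.
    model = KeyedVectors.load(model_path)
    tokens = [[w for w in doc.split() if w in model] for doc in documents]
    if not tokens[0] or not tokens[1]:
        return 0.0  # no in-vocabulary words to compare
    # n_similarity: cosine similarity between the two mean word vectors
    return float(model.n_similarity(tokens[0], tokens[1]))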
Example #3
0
def cluster_filter(key_text, total_texts, ratio, flag):
    result, total_texts = hotpoints_merge(total_texts)
    tmp = []
    for line in total_texts:
        tmp.append(line[2])
    if flag == 1:
        sim = calculate_similarity(key_text[2], tmp)
    else:
        sim = calculate_similarity2(key_text[2], tmp)

    for index in range(0, len(total_texts)):
        # print('similarity between key_text and text%d: %.2f' % (index + 1, sim[index]), tmp[index])
        if sim[index] > ratio:
            result.append(total_texts[index])
    return result
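
A hypothetical call sketch for cluster_filter, assuming the same row layout as Example #1 (the text field in column 2) and that flag == 1 selects calculate_similarity while any other value selects calculate_similarity2. The rows below are made up for illustration, and hotpoints_merge is assumed to seed the result list:

key_row = [101, "2020-01-01", "server crashes after the update", 0, 0, 0, 0, -1]
rows = [
    [102, "2020-01-02", "crash right after the latest update", 0, 0, 0, 0, -1],
    [103, "2020-01-02", "how to bake sourdough bread", 0, 0, 0, 0, -1],
]
related = cluster_filter(key_row, rows, ratio=0.1, flag=1)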
Example #4
0
def get_pairwise_similarity(snippet, forum_texts):
    # running sum (later mean), 1 - mean, max, and min of the pairwise similarities
    sum_sim, sub_sim, max_sim, min_sim = 0, 1, 0, float("inf")
    num_posts = len(forum_texts)

    for post in forum_texts:
        temp = calculate_similarity([snippet, post])[0][1]

        # sum the pairwise similarity for each post (normalize in the end)
        sum_sim += temp

        # take the max of pairwise similarity for the posts
        if temp > max_sim:
            max_sim = temp

        # take the min of pairwise similarity for the posts
        if temp < min_sim:
            min_sim = temp
    sum_sim /= num_posts  # normalize the sum into a mean
    sub_sim -= sum_sim    # 1 - mean similarity

    return [sum_sim, sub_sim, max_sim, min_sim]
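
Examples #2, #4, and #6 use the two-document form calculate_similarity(documents), which returns a full pairwise similarity matrix, so [0][1] is the score between the two inputs. A hypothetical usage sketch for get_pairwise_similarity; the snippet and forum posts below are made up for illustration:

snippet = "df.groupby('user_id').size()"
forum_texts = [
    "How do I count rows per group with pandas groupby?",
    "Use groupby followed by size to count occurrences per key.",
]
# Returned feature vector: [mean, 1 - mean, max, min] of the pairwise similarities
mean_sim, one_minus_mean, max_sim, min_sim = get_pairwise_similarity(snippet, forum_texts)
print(mean_sim, one_minus_mean, max_sim, min_sim)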
Example #5
0
def main(log, args):
    """Run visualmetrics.py in parallel.

    Args:
        log: The structlog logger instance.
        args: The parsed arguments from the argument parser.

    Returns:
        The return code that the program will exit with.
    """
    fetch_dir = os.getenv("MOZ_FETCHES_DIR")
    if not fetch_dir:
        log.error("Expected MOZ_FETCHES_DIR environment variable.")
        return 1

    fetch_dir = Path(fetch_dir)

    visualmetrics_path = fetch_dir / "visualmetrics.py"
    if not visualmetrics_path.exists():
        log.error("Could not locate visualmetrics.py",
                  expected_path=str(visualmetrics_path))
        return 1

    browsertime_results_path = fetch_dir / "browsertime-results.tgz"

    try:
        with tarfile.open(str(browsertime_results_path)) as tar:
            tar.extractall(path=str(fetch_dir))
    except Exception:
        log.error("Could not read/extract browsertime results archive",
                  path=browsertime_results_path,
                  exc_info=True)
        return 1
    log.info("Extracted browsertime results", path=browsertime_results_path)

    try:
        jobs_json_path = fetch_dir / "browsertime-results" / "jobs.json"
        jobs_json = read_json(jobs_json_path, JOB_SCHEMA)
    except Exception:
        log.error("Could not open the jobs.json file",
                  path=jobs_json_path,
                  exc_info=True)
        return 1

    jobs = []

    for job in jobs_json["jobs"]:
        browsertime_json_path = fetch_dir / job["browsertime_json_path"]

        try:
            browsertime_json = read_json(browsertime_json_path,
                                         BROWSERTIME_SCHEMA)
        except Exception:
            log.error("Could not open a browsertime.json file",
                      path=browsertime_json_path,
                      exc_info=True)
            return 1

        for site in browsertime_json:
            for video in site["files"]["video"]:
                jobs.append(
                    Job(
                        test_name=job["test_name"],
                        extra_options=job["extra_options"]
                        if len(job["extra_options"]) > 0
                        else jobs_json["extra_options"],
                        json_path=browsertime_json_path,
                        video_path=browsertime_json_path.parent / video,
                    ))

    failed_runs = 0
    suites = {}

    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for job, result in zip(
                jobs,
                executor.map(
                    partial(
                        run_visual_metrics,
                        visualmetrics_path=visualmetrics_path,
                        options=args.visual_metrics_options,
                    ),
                    jobs,
                ),
        ):
            returncode, res = result
            if returncode != 0:
                log.error(
                    "Failed to run visualmetrics.py",
                    video_path=job.video_path,
                    error=res,
                )
                failed_runs += 1
            else:
                # Python 3.5 requires a str object for json.loads (3.6+ also accepts bytes)
                res = json.loads(res.decode("utf8"))
                for name, value in res.items():
                    append_result(log, suites, job.test_name, name, value,
                                  job.extra_options)

    suites = [get_suite(suite) for suite in suites.values()]

    perf_data = {
        "framework": {
            "name": "browsertime"
        },
        "application": jobs_json["application"],
        "type": "pageload",
        "suites": suites,
    }

    # Try to get the similarity for all possible tests; this also gives us a
    # comparison of recorded vs. live sites so we can check the ongoing
    # quality of our recordings.
    try:
        from similarity import calculate_similarity
        for name, value in calculate_similarity(jobs_json, fetch_dir,
                                                OUTPUT_DIR).items():
            if value is None:
                continue
            suites[0]["subtests"].append({
                "name": name,
                "value": value,
                "replicates": [value],
                "lowerIsBetter": False,
                "unit": "a.u.",
            })
    except Exception:
        log.info("Failed to calculate similarity score", exc_info=True)

    # Validate that the perf data complies with the Perfherder schema.
    # The Perfherder schema uses jsonschema, so we can't use voluptuous here.
    validate(perf_data, PERFHERDER_SCHEMA)

    raw_perf_data = json.dumps(perf_data)
    with Path(OUTPUT_DIR, "perfherder-data.json").open("w") as f:
        f.write(raw_perf_data)
    # Prints the data in logs for Perfherder to pick it up.
    log.info("PERFHERDER_DATA: %s" % raw_perf_data)

    # Lists the number of processed jobs, failures, and successes.
    with Path(OUTPUT_DIR, "summary.json").open("w") as f:
        json.dump(
            {
                "total_jobs": len(jobs),
                "successful_runs": len(jobs) - failed_runs,
                "failed_runs": failed_runs,
            },
            f,
        )

    # If there was at least one failure along the way, we want to return > 0
    # to trigger a red job in Taskcluster (TC).
    return failed_runs
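
A hedged sketch of the entry point this main() implies. The --visual-metrics-options flag name and the structlog setup are assumptions inferred from how args and log are used above, not the original script:

if __name__ == "__main__":
    import argparse
    import sys
    import structlog

    parser = argparse.ArgumentParser(description="Run visualmetrics.py in parallel.")
    # Flag name assumed from args.visual_metrics_options in main().
    parser.add_argument("--visual-metrics-options", nargs="*", default=[])
    log = structlog.get_logger()
    sys.exit(main(log, parser.parse_args()))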
Example #6
0
            lsa_sim.append(0)

        # keyword matching
        script_keywords = extract_keywords(sentences[0])
        if not script_keywords:
            script_keywords = sentences[0]
        forum_keywords = extract_keywords(sentences[1])
        if not forum_keywords:
            forum_keywords = sentences[1]
        sk = ' '.join([x[0] for x in script_keywords])
        fk = ' '.join([x[0] for x in forum_keywords])
        documents = [sk, fk]
        # using word2vec
        keyword_word2vec = calculate_word2vec_similarity(documents)
        try:
            # using cosine
            keyword_cosine = calculate_similarity(documents)[0][1]
        except RuntimeError:
            keyword_cosine = 0
        key_word2vec_sim.append(keyword_word2vec)
        key_cosine_sim.append(keyword_cosine)

        # word2vec similarity
        word2vec_similarity = calculate_word2vec_similarity(sentences)
        word2vec_sim.append(word2vec_similarity)

    x = np.arange(l)

    print('=' * 50)
    print('{:30}{}'.format('Sum cosine similarities:', sum_sim))
    print('{:30}{}'.format('Max cosine similarities:', max_sim))
    print('{:30}{}'.format('LSA similarities:', lsa_sim))
        # Prompt (Polish): "Choose mode: 1. Nucleotides / 2. Codon triplets"
        while mode != 1 and mode != 2:
            mode = int(
                input("-------------\n"
                      "Wybierz tryb:\n"
                      "1. Nukleotydy\n"
                      "2. Trójki kodonowe\n"))

        # Prompt (Polish): "Choose task: 1. Edit distance / 2. Similarity / 3. Best local alignment"
        task = 0
        while task != 1 and task != 2 and task != 3:
            task = int(
                input("Wybierz zadanie:\n"
                      "1. Odleglosc edycyjna\n"
                      "2. Podobienstwo\n"
                      "3. Najlepsze zestawienie lokalne\n"))

        # Prompt (Polish): "Enter the file name (leave empty for the default file)"
        file_name = (input("Podaj nazwe pliku: (zostaw puste - plik domyslny)")
                     or DEFAULT_FILENAMES[mode * 10 + task])
        file = open("data/" + file_name, 'r')
        u_sym = get_vector(file)
        v_sym = get_vector(file)

        weights = get_matrix(file, mode)
        print("Wczytano plik: " + file_name)  # Polish: "Loaded file: ..."

        if task == 1:
            calculate_editional_distance(u_sym, v_sym, weights)
        if task == 2:
            calculate_similarity(u_sym, v_sym, weights)
        if task == 3:
            calculate_local_match(u_sym, v_sym, weights)
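
The last fragment calls a third, unrelated calculate_similarity(u_sym, v_sym, weights) for sequence comparison. A minimal sketch of a global-alignment similarity score, under the assumption that weights is a dict-of-dicts of substitution scores keyed by symbol with '-' as the gap symbol; the original file format produced by get_matrix is not shown, so this is an illustration rather than the source's implementation:

def calculate_similarity(u, v, weights):
    # Classic Needleman-Wunsch style dynamic programming, maximizing the score.
    n, m = len(u), len(v)
    score = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        score[i][0] = score[i - 1][0] + weights[u[i - 1]]["-"]
    for j in range(1, m + 1):
        score[0][j] = score[0][j - 1] + weights["-"][v[j - 1]]
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            score[i][j] = max(
                score[i - 1][j - 1] + weights[u[i - 1]][v[j - 1]],  # match / substitution
                score[i - 1][j] + weights[u[i - 1]]["-"],           # gap in v
                score[i][j - 1] + weights["-"][v[j - 1]],           # gap in u
            )
    print("Similarity:", score[n][m])
    return score[n][m]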