def cluster_acquire(total_data, ratio):
    hotpoints, total_data = hotpoints_acquire(total_data)
    total_texts, cluster = [], []
    key_text = hotpoints[2]
    for line in total_data:
        total_texts.append(line[2])
    sim = calculate_similarity(key_text, total_texts)
    # Sort records by similarity to the hotpoint text, highest first
    result = []
    for id in range(0, len(sim)):
        arr = {"id": id, "value": sim[id]}
        result.append(arr)
    result.sort(key=lambda x: x["value"])
    result.reverse()
    # Select records whose similarity exceeds the ratio threshold (e.g. > 0.1) into the cluster
    for line in result:
        if line["value"] > ratio:
            cluster.append(total_data[line["id"]])
            total_data[line["id"]][7] = -2  # mark the record as assigned to this cluster
        else:
            break
    # Update the data set: keep only records that were not assigned
    new_data = []
    for line in total_data:
        if line[7] != -2:
            new_data.append(line)
    return hotpoints, cluster, new_data
def calculate_keyword_similarity(sentences):
    script_keywords = extract_keywords(sentences[0])
    forum_keywords = extract_keywords(sentences[1])
    try:
        sk = ' '.join([x[0] for x in script_keywords])
        fk = ' '.join([x[0] for x in forum_keywords])
        documents = [sk, fk]
        similarity = calculate_similarity(documents)[0][1]  # using cosine
        # similarity = calculate_word2vec_similarity(documents)  # using word2vec
    except TypeError:
        similarity = 0
    return similarity
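# calculate_similarity() is not defined in these snippets. The
# `calculate_similarity(documents)[0][1]` indexing above suggests it returns a
# pairwise similarity matrix for the input documents. The following is a minimal
# sketch of such a helper, assuming a TF-IDF bag-of-words representation with
# scikit-learn; this is an illustration, not the original implementation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def calculate_similarity_sketch(documents):
    # Vectorize the documents with TF-IDF and return the full pairwise
    # cosine-similarity matrix; entry [0][1] is the score between the
    # first and second document.
    tfidf = TfidfVectorizer().fit_transform(documents)
    return cosine_similarity(tfidf)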
def cluster_filter(key_text, total_texts, ratio, flag):
    result, total_texts = hotpoints_merge(total_texts)
    tmp = []
    for line in total_texts:
        tmp.append(line[2])
    if flag == 1:
        sim = calculate_similarity(key_text[2], tmp)
    else:
        sim = calculate_similarity2(key_text[2], tmp)
    for index in range(0, len(total_texts)):
        # print('Similarity between key_text and text%d: %.2f' % (index + 1, sim[index]), tmp[index])
        if sim[index] > ratio:
            result.append(total_texts[index])
    return result
def get_pairwise_similarity(snippet, forum_texts):
    sum_sim, sub_sim, max_sim, min_sim = 0, 1, 0, 999
    l = len(forum_texts)
    for post in forum_texts:
        temp = calculate_similarity([snippet, post])[0][1]
        # sum the pairwise similarity for each post (normalize in the end)
        sum_sim += temp
        # take the max of pairwise similarity for the posts
        if temp > max_sim:
            max_sim = temp
        # take the min of pairwise similarity for the posts
        if temp < min_sim:
            min_sim = temp
    sum_sim /= l
    sub_sim -= sum_sim
    return [sum_sim, sub_sim, max_sim, min_sim]
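# Hypothetical usage of get_pairwise_similarity(); the texts below are made up
# for illustration, and calculate_similarity is wired to the TF-IDF sketch
# above as an assumption.
calculate_similarity = calculate_similarity_sketch  # assumption for the example

example_snippet = "open a file and read it line by line"
example_posts = [
    "how do I read a text file line by line in python",
    "best thin-crust pizza dough recipe",
]
avg_sim, one_minus_avg, max_sim, min_sim = get_pairwise_similarity(
    example_snippet, example_posts)
print(avg_sim, one_minus_avg, max_sim, min_sim)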
def main(log, args):
    """Run visualmetrics.py in parallel.

    Args:
        log: The structlog logger instance.
        args: The parsed arguments from the argument parser.

    Returns:
        The return code that the program will exit with.
    """
    fetch_dir = os.getenv("MOZ_FETCHES_DIR")
    if not fetch_dir:
        log.error("Expected MOZ_FETCHES_DIR environment variable.")
        return 1

    fetch_dir = Path(fetch_dir)
    visualmetrics_path = fetch_dir / "visualmetrics.py"
    if not visualmetrics_path.exists():
        log.error(
            "Could not locate visualmetrics.py",
            expected_path=str(visualmetrics_path),
        )
        return 1

    browsertime_results_path = fetch_dir / "browsertime-results.tgz"
    try:
        with tarfile.open(str(browsertime_results_path)) as tar:
            tar.extractall(path=str(fetch_dir))
    except Exception:
        log.error(
            "Could not read/extract browsertime results archive",
            path=browsertime_results_path,
            exc_info=True,
        )
        return 1
    log.info("Extracted browsertime results", path=browsertime_results_path)

    try:
        jobs_json_path = fetch_dir / "browsertime-results" / "jobs.json"
        jobs_json = read_json(jobs_json_path, JOB_SCHEMA)
    except Exception:
        log.error(
            "Could not open the jobs.json file",
            path=jobs_json_path,
            exc_info=True,
        )
        return 1

    jobs = []
    for job in jobs_json["jobs"]:
        browsertime_json_path = fetch_dir / job["browsertime_json_path"]
        try:
            browsertime_json = read_json(browsertime_json_path, BROWSERTIME_SCHEMA)
        except Exception:
            log.error(
                "Could not open a browsertime.json file",
                path=browsertime_json_path,
                exc_info=True,
            )
            return 1

        for site in browsertime_json:
            for video in site["files"]["video"]:
                jobs.append(
                    Job(
                        test_name=job["test_name"],
                        extra_options=len(job["extra_options"]) > 0
                        and job["extra_options"]
                        or jobs_json["extra_options"],
                        json_path=browsertime_json_path,
                        video_path=browsertime_json_path.parent / video,
                    )
                )

    failed_runs = 0
    suites = {}

    with ProcessPoolExecutor(max_workers=cpu_count()) as executor:
        for job, result in zip(
            jobs,
            executor.map(
                partial(
                    run_visual_metrics,
                    visualmetrics_path=visualmetrics_path,
                    options=args.visual_metrics_options,
                ),
                jobs,
            ),
        ):
            returncode, res = result
            if returncode != 0:
                log.error(
                    "Failed to run visualmetrics.py",
                    video_path=job.video_path,
                    error=res,
                )
                failed_runs += 1
            else:
                # Python 3.5 requires a str object (not 3.6+)
                res = json.loads(res.decode("utf8"))
                for name, value in res.items():
                    append_result(
                        log, suites, job.test_name, name, value, job.extra_options
                    )

    suites = [get_suite(suite) for suite in suites.values()]

    perf_data = {
        "framework": {"name": "browsertime"},
        "application": jobs_json["application"],
        "type": "pageload",
        "suites": suites,
    }

    # Try to get the similarity for all possible tests; this means that we
    # will also get a comparison of recorded vs. live sites to check
    # the ongoing quality of our recordings.
    try:
        from similarity import calculate_similarity

        for name, value in calculate_similarity(jobs_json, fetch_dir, OUTPUT_DIR).items():
            if value is None:
                continue
            suites[0]["subtests"].append(
                {
                    "name": name,
                    "value": value,
                    "replicates": [value],
                    "lowerIsBetter": False,
                    "unit": "a.u.",
                }
            )
    except Exception:
        log.info("Failed to calculate similarity score", exc_info=True)

    # Validate that the perf data complies with the Perfherder schema.
    # The Perfherder schema uses jsonschema, so we can't use voluptuous here.
    validate(perf_data, PERFHERDER_SCHEMA)

    raw_perf_data = json.dumps(perf_data)
    with Path(OUTPUT_DIR, "perfherder-data.json").open("w") as f:
        f.write(raw_perf_data)

    # Print the data in the logs so that Perfherder can pick it up.
    log.info("PERFHERDER_DATA: %s" % raw_perf_data)

    # Record the number of processed jobs, failures, and successes.
    with Path(OUTPUT_DIR, "summary.json").open("w") as f:
        json.dump(
            {
                "total_jobs": len(jobs),
                "successful_runs": len(jobs) - failed_runs,
                "failed_runs": failed_runs,
            },
            f,
        )

    # If there was at least one failure along the way, we want to return > 0
    # to trigger a red job in TC.
    return failed_runs
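# Minimal sketch of the read_json() helper used in main(), assuming the schemas
# (JOB_SCHEMA, BROWSERTIME_SCHEMA) are voluptuous Schema objects; the comment in
# main() notes that only the Perfherder schema uses jsonschema. The real helper
# may differ.
import json
from pathlib import Path

from voluptuous import Schema


def read_json_sketch(path: Path, schema: Schema):
    # Load the JSON file and validate it against the given voluptuous schema;
    # Schema objects are callable and raise MultipleInvalid on bad data.
    with path.open("r", encoding="utf-8") as f:
        return schema(json.load(f))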
lsa_sim.append(0)

# keyword matching
script_keywords = extract_keywords(sentences[0])
if not script_keywords:
    script_keywords = sentences[0]
forum_keywords = extract_keywords(sentences[1])
if not forum_keywords:
    forum_keywords = sentences[1]
sk = ' '.join([x[0] for x in script_keywords])
fk = ' '.join([x[0] for x in forum_keywords])
documents = [sk, fk]
keyword_word2vec = calculate_word2vec_similarity(documents)  # using word2vec
try:
    keyword_cosine = calculate_similarity(documents)[0][1]  # using cosine
except RuntimeError:
    keyword_cosine = 0
key_word2vec_sim.append(keyword_word2vec)
key_cosine_sim.append(keyword_cosine)

# word2vec similarity
word2vec_similarity = calculate_word2vec_similarity(sentences)
word2vec_sim.append(word2vec_similarity)

x = np.arange(l)
print('=' * 50)
print('{:30}{}'.format('Sum cosine similarities:', sum_sim))
print('{:30}{}'.format('Max cosine similarities:', max_sim))
print('{:30}{}'.format('LSA similarities:', lsa_sim))
mode = 0
while mode != 1 and mode != 2:
    mode = int(
        input("-------------\n"
              "Choose a mode:\n"
              "1. Nucleotides\n"
              "2. Codon triplets\n"))

task = 0
while task != 1 and task != 2 and task != 3:
    task = int(
        input("Choose a task:\n"
              "1. Edit distance\n"
              "2. Similarity\n"
              "3. Best local alignment\n"))

file_name = (input("Enter the file name (leave empty for the default file): ")
             or DEFAULT_FILENAMES[mode * 10 + task])
file = open("data/" + file_name, 'r')
u_sym = get_vector(file)
v_sym = get_vector(file)
weights = get_matrix(file, mode)
print("Loaded file: " + file_name)

if task == 1:
    calculate_editional_distance(u_sym, v_sym, weights)
if task == 2:
    calculate_similarity(u_sym, v_sym, weights)
if task == 3:
    calculate_local_match(u_sym, v_sym, weights)
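# Hypothetical sketch of the global-alignment similarity computed when task == 2
# (a Needleman-Wunsch-style dynamic program). It assumes `weights` is a dict
# mapping symbol pairs, including the gap symbol '-', to scores; the actual
# format produced by get_matrix() is not shown in this snippet.
def calculate_similarity_dp_sketch(u_sym, v_sym, weights):
    n, m = len(u_sym), len(v_sym)
    # dp[i][j] = best similarity score of u_sym[:i] aligned against v_sym[:j]
    dp = [[0] * (m + 1) for _ in range(n + 1)]
    for i in range(1, n + 1):
        dp[i][0] = dp[i - 1][0] + weights[(u_sym[i - 1], '-')]
    for j in range(1, m + 1):
        dp[0][j] = dp[0][j - 1] + weights[('-', v_sym[j - 1])]
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            dp[i][j] = max(
                dp[i - 1][j - 1] + weights[(u_sym[i - 1], v_sym[j - 1])],  # match/substitution
                dp[i - 1][j] + weights[(u_sym[i - 1], '-')],               # gap in v
                dp[i][j - 1] + weights[('-', v_sym[j - 1])],               # gap in u
            )
    print("Similarity:", dp[n][m])
    return dp[n][m]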