def test_analyze(self):
    """Train on the base files, analyze a single change, and expect at least one comment."""
    # One change: the first base file versus the first head file.
    single_change = Change(base=self.base_files[0], head=self.head_files[0])
    datastub = FakeDataStub(files=self.base_files, changes=[single_change])
    trained_model = IdTyposAnalyzer.train(self.ptr, {}, datastub)
    typos_analyzer = IdTyposAnalyzer(trained_model, self.ptr.url, {})
    produced = typos_analyzer.analyze(self.ptr, self.ptr, datastub)
    self.assertGreater(len(produced), 0)
def test_reconstruct_identifier_fail(self):
    """Check that reconstruct_identifier raises AssertionError for the failing cases below."""
    # Each case pairs an identifier with a predicted token list that must be rejected
    # (here the list carries one token more than the identifier name suggests).
    failing_cases = [
        ("UpperCamelCase", ["upper", "camel", "case", "fail"]),
    ]
    token_parser = IdTyposAnalyzer.create_token_parser()
    for name, predicted in failing_cases:
        with self.assertRaises(AssertionError):
            IdTyposAnalyzer.reconstruct_identifier(
                token_parser, pred_tokens=predicted, identifier=name)
def test_reconstruct_identifier(self):
    """
    Check that reconstruct_identifier rebuilds the expected identifier.

    Every case is `(expected identifier, corrupted identifier, predicted tokens)`: the \
    predicted tokens are applied to the corrupted identifier and the result must equal the \
    expected identifier.
    """
    cases = [
        ("UpperCamelCase", "UpperComelCase", ["upper", "camel", "case"]),
        ("camelCase", "comelCase", ["camel", "case"]),
        ("FRAPScase", "FRAPScase", ["frap", "scase"]),
        ("SQLThing", "SQLThing", ["sql", "thing"]),
        ("_Astra", "_Ostra", ["astra"]),
        ("CAPS_CONST", "COPS_CONST", ["caps", "const"]),
        ("_something_SILLY_", "_something_SIILLY_", ["something", "silly"]),
        ("blink182", "blunk182", ["blink"]),
        ("FooBar100500Bingo", "FuBar100500Bingo", ["foo", "bar", "bingo"]),
        ("Man45var", "Men45var", ["man", "var"]),
        ("method_name", "metod_name", ["method", "name"]),
        ("Method_Name", "Metod_Name", ["method", "name"]),
        ("101dalms", "101dolms", ["dalms"]),
        ("101_dalms", "101_dolms", ["dalms"]),
        ("101_DalmsBug", "101_DolmsBug", ["dalms", "bug"]),
        ("101_Dalms45Bug7", "101_Dolms45Bug7", ["dalms", "bug"]),
        ("wdSize", "pwdSize", ["wd", "size"]),
        ("Glint", "Glunt", ["glint"]),
        ("foo_BAR", "fu_BAR", ["foo", "bar"]),
        ("sourced.ml.algorithms.uast_ids_to_bag", "source.ml.algorithmos.uast_ids_to_bags",
         ["sourced", "ml", "algorithms", "uast", "ids", "to", "bag"]),
        ("WORSTnameYOUcanIMAGINE", "WORSTnomeYOUcanIMGINE",
         ["wors", "tname", "yo", "ucan", "imagine"]),
        ("SmallIdsToFoOo", "SmallestIdsToFoOo", ["small", "ids", "to", "fo", "oo"]),
        ("SmallIdFooo", "SmallestIdFooo", ["small", "id", "fooo"]),
        ("ONE_M0re_.__badId.example", "ONE_M0ree_.__badId.exomple",
         ["one", "m", "re", "bad", "id", "example"]),
        ("never_use_Such__varsableNames", "never_use_Such__varsablezzNameszz",
         ["never", "use", "such", "varsable", "names"]),
        ("a.b.c.d", "a.b.ce.de", ["a", "b", "c", "d"]),
        ("A.b.Cd.E", "A.be.Cde.Ee", ["a", "b", "cd", "e"]),
        ("looong_sh_loooong_sh", "looongzz_shzz_loooongzz_shzz",
         ["looong", "sh", "loooong", "sh"]),
        ("sh_sh_sh_sh", "ch_ch_ch_ch", ["sh", "sh", "sh", "sh"]),
        ("loooong_loooong_loooong", "laoong_loaong_looang",
         ["loooong", "loooong", "loooong"]),
    ]
    token_parser = IdTyposAnalyzer.create_token_parser()
    for expected, corrupted, predicted_tokens in cases:
        rebuilt = IdTyposAnalyzer.reconstruct_identifier(
            token_parser, pred_tokens=predicted_tokens, identifier=corrupted)
        self.assertEqual(expected, rebuilt)
def evaluate_typos_on_identifiers(
        dataset: str = TYPOS_DATASET,
        config: Optional[Mapping[str, Any]] = None,
        mistakes_output: Optional[str] = None) -> str:
    """
    Evaluate IdTyposAnalyzer on the misspelled identifiers of the evaluation dataset.

    :param dataset: Path to the CSV dataset; its first two columns are read as the wrong and \
                    the correct identifier.
    :param config: Configuration for the IdTyposAnalyzer; an empty one is used when omitted.
    :param mistakes_output: Path to the CSV file which receives the rows whose first \
                            suggestion differs from the correct identifier. Nothing is \
                            written when it is None.
    :return: Rendered quality report.
    """
    identifiers = pandas.read_csv(
        dataset, header=0, usecols=[0, 1], names=["wrong", "correct"], keep_default_na=False)
    analyzer_config = config if config is not None else {}
    analyzer = IdTyposAnalyzer(IdTyposModel(), "", analyzer_config)
    wrong_identifiers = identifiers["wrong"]
    suggestions = analyzer.check_identifiers(wrong_identifiers.tolist())

    corrections = []
    for index, wrong in enumerate(wrong_identifiers):
        candidates = list(analyzer.generate_identifier_suggestions(suggestions[index], wrong))
        if not candidates:
            # No suggestion was generated: fall back to the identifier itself.
            candidates = [Candidate(wrong, 1.0)]
        corrections.append(candidates)

    # One "sugg <rank>" column per candidate rank; missing ranks are filled with "".
    n_candidates = analyzer.config["n_candidates"]
    for rank in range(n_candidates):
        column = []
        for ranked in corrections:
            column.append(ranked[rank][0] if rank < len(ranked) else "")
        identifiers["sugg " + str(rank)] = column

    if mistakes_output is not None:
        is_mistake = identifiers["sugg 0"] != identifiers["correct"]
        mistakes = identifiers[is_mistake][["wrong", "sugg 0", "correct"]]
        mistakes.to_csv(mistakes_output)

    template_path = os.path.join(TEMPLATE_DIR, "quality_on_identifiers.md.jinja2")
    template = load_jinja2_template(template_path)
    return template.render(
        identifiers=identifiers,
        suggestions=suggestions,
        vocabulary_tokens=analyzer.corrector.generator.tokens,
        n_candidates=n_candidates,
        IDENTIFIER_INDEX_COLUMN=IDENTIFIER_INDEX_COLUMN,
        Candidate=Candidate,
        Columns=Columns,
        tokenize=lambda x: list(analyzer.parser.split(x)),
        flatten_df_by_column=flatten_df_by_column,
        generate_report=generate_report)
def setUpClass(cls):
    """Create the shared analyzer and the identifier/suggestion fixtures used by the tests."""
    analyzer_config = {
        "model": MODEL_PATH,
        "confidence_threshold": 0.2,
        "n_candidates": 3,
    }
    cls.checker = IdTyposAnalyzer(model=IdTyposModel(), url="", config=analyzer_config)
    cls.identifiers = ["get", "gpt_tokeb"]

    # One row per token: (identifier index, split identifier, token).
    token_rows = [
        [0, "get", "get"],
        [1, "gpt tokeb", "gpt"],
        [1, "gpt tokeb", "tokeb"],
    ]
    cls.test_df = pandas.DataFrame(
        token_rows, columns=[IDENTIFIER_INDEX_COLUMN, Columns.Split, Columns.Token])

    cls.suggestions = {
        1: [Candidate("get", 0.9), Candidate("gpt", 0.3)],
        2: [Candidate("token", 0.98), Candidate("taken", 0.3), Candidate("tokem", 0.01)],
    }
    # Same as above, minus the 0.01 candidate.
    cls.filtered_suggestions = {
        1: [Candidate("get", 0.9), Candidate("gpt", 0.3)],
        2: [Candidate("token", 0.98), Candidate("taken", 0.3)],
    }
def test_train(self):
    """Train on the base files and compare the identifiers stored in the model."""
    service = FakeDataService(self.bblfsh_client, files=self.base_files, changes=[])
    trained_model = IdTyposAnalyzer.train(ptr=self.ptr, config={}, data_service=service)
    expected_identifiers = {
        "name",
        "print_type",
        "get_length",
        "customidentifiertostore",
    }
    self.assertSetEqual(trained_model.identifiers, expected_identifiers)
def test_analyze(self):
    """
    Analyze one change with `check_all_identifiers` switched off and then on.

    For every produced comment, the text before ", fixes:" must mention at least one of the \
    misspelled names and none of the correct ones. "customidentifiertostore" counts as \
    correct when `check_all_identifiers` is False and as misspelled when it is True.
    """
    dataservice = FakeDataService(
        self.bblfsh_client,
        files=self.base_files,
        changes=[Change(base=self.base_files[0], head=self.head_files[0])])
    model = IdTyposAnalyzer.train(ptr=self.ptr, config={}, data_service=dataservice)

    # (check_all_identifiers, names expected in comments, names that must not be reported)
    scenarios = [
        (False,
         ["nam", "print_tipe", "gett_lenght"],
         ["name", "print_type", "get_length", "customidentifiertostore"]),
        (True,
         ["nam", "print_tipe", "gett_lenght", "customidentifiertostore"],
         ["name", "print_type", "get_length"]),
    ]
    for check_all, bad_names, good_names in scenarios:
        analyzer = IdTyposAnalyzer(
            model=model, url=self.ptr.url,
            config=dict(model=MODEL_PATH, confidence_threshold=0.0, n_candidates=3,
                        check_all_identifiers=check_all))
        comments = analyzer.analyze(
            ptr_from=self.ptr, ptr_to=self.ptr, data_service=dataservice)
        self.assertGreater(len(comments), 0)
        for comment in comments:
            # Only the part before the list of fixes names the reported identifier.
            reported = comment.text.split(", fixes:")[0]
            self.assertFalse(any(name in reported for name in good_names))
            self.assertTrue(any(name in reported for name in bad_names))
def setUpClass(cls):
    """Create the shared analyzer and the identifier/suggestion fixtures used by the tests."""
    model = DummyAnalyzerModel()
    # The corrector file is looked up next to this test module.
    corrector_path = Path(__file__).parent / "sample_corrector.asdf"
    analyzer_config = {
        "model": str(corrector_path),
        "confidence_threshold": 0.2,
        "n_candidates": 3,
    }
    cls.checker = IdTyposAnalyzer(model, "", config=analyzer_config)
    cls.identifiers = ["get", "gpt_tokeb"]

    # One row per token: (identifier index, split identifier, token).
    token_rows = [
        [0, "get", "get"],
        [1, "gpt tokeb", "gpt"],
        [1, "gpt tokeb", "tokeb"],
    ]
    index_column = IdTyposAnalyzer.default_config["index_column"]
    cls.test_df = pandas.DataFrame(
        token_rows, columns=[index_column, Columns.Split, Columns.Token])

    cls.suggestions = {
        1: [("get", 0.9), ("gpt", 0.3)],
        2: [("token", 0.98), ("taken", 0.3), ("tokem", 0.01)],
    }
    cls.filtered_suggestions = {
        1: [("get", 0.9)],
        2: [("token", 0.98), ("taken", 0.3)],
    }
def pipeline(yaml_dir, n_jobs=10):
    """
    Build a filtered typos dataset from a directory of YAML files and print it as CSV rows.

    Every `*yaml` file in `yaml_dir` contributes one row per entry of its "TyposDataset" \
    list, extended with the "repository" and "hash" values of its "hercules" section. The \
    rows are deduplicated by the (wrong, correct) pair and then filtered by these checks:

    * check2 - the wrong and the correct identifiers split into the same, non-zero number \
      of subtokens;
    * check3 - no pair of aligned subtokens has Damerau-Levenshtein distance above 2;
    * check4 - the subtoken splits of the wrong and the correct identifiers differ;
    * check5 - the lemmatized splits differ;
    * check6 - the identifiers differ by more than letter case.

    Rows which pass every check are printed to stdout, one comma-joined line per row, with \
    "@" replaced by "/" in the repository name. Values are joined verbatim, without CSV \
    quoting.

    :param yaml_dir: Directory with the YAML files.
    :param n_jobs: Number of parallel jobs used to load the YAML files.
    :return: None.
    """
    distance = textdistance.DamerauLevenshtein()
    yaml_files = glob(os.path.join(yaml_dir, "*"))
    log("Number of YAML files", len(yaml_files))
    HERC_COLUMNS = ["repository", "hash"]
    TYPOS_COLUMNS = ["wrong", "correct", "commit", "file", "line"]

    def yaml_to_dict(yaml_loc):
        """Load one YAML file and return its typo entries as a list of flat dicts."""
        if not yaml_loc.endswith("yaml"):
            # Skip the non-YAML entries of the directory, e.g. commits.txt
            return []
        with open(yaml_loc, "r") as f:
            # NOTE(review): FullLoader is not meant for untrusted input; switch to
            # yaml.safe_load if these files can come from an untrusted source.
            a = yaml.load(f.read(), Loader=yaml.FullLoader)
        base = {col: a["hercules"][col] for col in HERC_COLUMNS}
        rows = []
        for typo in a["TyposDataset"]:
            res = base.copy()
            for col in TYPOS_COLUMNS:
                res[col] = typo[col]
            rows.append(res)
        return rows

    results = Parallel(n_jobs=n_jobs)(delayed(yaml_to_dict)(loc) for loc in yaml_files)
    pandas_dict = defaultdict(list)
    for rows in results:
        for row in rows:
            for c in (HERC_COLUMNS + TYPOS_COLUMNS):
                pandas_dict[c].append(row[c])
    df = pd.DataFrame.from_dict(pandas_dict)
    initial_n_samples = df.shape[0]
    log("Number of samples in initial dataset", initial_n_samples)

    # deduplication
    # An explicit copy makes the frame independent from `df`, so the check columns added
    # below are regular assignments and not writes into a derived slice.
    deduplicated_df = df.drop_duplicates(subset=["wrong", "correct"], keep="first").copy()
    log("Number of samples after deduplication", deduplicated_df.shape[0],
        ", before", initial_n_samples)

    # check that number of subtokens keeps the same
    splitter = IdTyposAnalyzer.create_token_parser()

    def check_2(line):
        """Return the reason why the row fails the subtoken count check, or ""."""
        wrong_tokens = list(splitter.split(line.wrong))
        corr_tokens = list(splitter.split(line.correct))
        if len(wrong_tokens) != len(corr_tokens):
            return "Number of subtokens is different"
        if not wrong_tokens:
            return "Identifier without alphabetic characters"
        return ""

    deduplicated_df["check2"] = deduplicated_df.apply(check_2, axis=1)
    log("Number of good samples after check2",
        deduplicated_df[deduplicated_df["check2"] == ""].shape[0],
        ", before", initial_n_samples)

    # Damerau-Levenshtein distance
    def check_3(line):
        """Return the reason why the row fails the edit distance check, or ""."""
        wrong_tokens = list(splitter.split(line.wrong))
        corr_tokens = list(splitter.split(line.correct))
        res = [(t, ct) for t, ct in zip(wrong_tokens, corr_tokens) if distance(t, ct) > 2]
        if res:
            return "big Demerau-Levenshtein distance %s" % res
        return ""

    deduplicated_df["check3"] = deduplicated_df.apply(check_3, axis=1)
    suspicious_tokens = deduplicated_df[deduplicated_df["check3"] != ""]
    log("Number of samples with big Demerau-Levenshtein distance", suspicious_tokens.shape[0])

    # examples, where token splits of the wrong and the correct identifiers are equal
    # (they differ in non-alpha chars or casing)
    deduplicated_df["wrong_split"] = deduplicated_df["wrong"].apply(
        lambda x: " ".join(splitter.split(x)))
    deduplicated_df["correct_split"] = deduplicated_df["correct"].apply(
        lambda x: " ".join(splitter.split(x)))
    deduplicated_df["check4"] = ""
    # Write through .loc: chained indexing (`df[col][mask] = value`) may assign into a
    # temporary copy and leave the frame unchanged.
    same_split = deduplicated_df["wrong_split"] == deduplicated_df["correct_split"]
    deduplicated_df.loc[same_split, "check4"] = "Bad split"
    log("Number of samples where tokens are the same",
        deduplicated_df[deduplicated_df["check4"] == "Bad split"].shape[0])

    # examples, where wrong and correct identifiers are equal on lemmas level.
    nlp = spacy.load("en", disable=["parser", "ner"])

    # Filter examples with equal lemmas
    def _lemmatize(token):
        """Return the lemma of the token, or the token itself in the excluded cases."""
        lemm = nlp(token)
        # Keep the token as is when it is parsed into several words, when the lemma is the
        # "-PRON-" placeholder, or when lemmatization merely strips the last "s" of "...ss".
        if len(lemm) > 1 or lemm[0].lemma_ == "-PRON-" or (
                token[-2:] == "ss" and lemm[0].lemma_ == token[:-1]):
            return token
        return lemm[0].lemma_

    deduplicated_df["wrong_lem"] = deduplicated_df["wrong_split"].apply(
        lambda x: " ".join(_lemmatize(token) for token in x.split()))
    deduplicated_df["correct_lem"] = deduplicated_df["correct_split"].apply(
        lambda x: " ".join(_lemmatize(token) for token in x.split()))
    deduplicated_df["check5"] = ""
    same_lemmas = deduplicated_df["wrong_lem"] == deduplicated_df["correct_lem"]
    deduplicated_df.loc[same_lemmas, "check5"] = "Equal lemmas"
    log("Number of good samples after check5",
        deduplicated_df[deduplicated_df["check5"] == ""].shape[0],
        ", before", initial_n_samples)

    deduplicated_df["check6"] = ""
    same_ignoring_case = \
        deduplicated_df["wrong"].str.lower() == deduplicated_df["correct"].str.lower()
    deduplicated_df.loc[same_ignoring_case, "check6"] = "Difference in case"

    passed_all_checks = ((deduplicated_df["check2"] == "") &
                         (deduplicated_df["check3"] == "") &
                         (deduplicated_df["check4"] == "") &
                         (deduplicated_df["check5"] == "") &
                         (deduplicated_df["check6"] == ""))
    # Copy before editing so that the "repository" rewrite targets an independent frame.
    good_df = deduplicated_df[passed_all_checks].copy()
    good_df["repository"] = good_df["repository"].str.replace("@", "/")
    log("Number of good samples", good_df.shape[0])
    for _, row in good_df[["repository"] + TYPOS_COLUMNS].iterrows():
        print(",".join(map(str, row.values)))
def test_train(self):
    """Train on the base files without any changes and check the type of the model."""
    stub = FakeDataStub(files=self.base_files, changes=None)
    trained_model = IdTyposAnalyzer.train(self.ptr, {}, stub)
    self.assertIsInstance(trained_model, DummyAnalyzerModel)