Example 1
def remove_non_typos(dataset: str, filtered_dataset: str) -> None:
    """
    Remove non-typo-ed identifiers from the dataset.

    1. Remove examples where the token splits of the wrong and the correct identifiers are equal \
       (they differ only in non-alpha characters or casing).
    2. Remove examples where the wrong and the correct identifiers are equal at the lemma level.

    :param dataset: Path to the dataset.
    :param filtered_dataset: Path to save the filtered dataset to.
    """
    data = pandas.read_csv(dataset,
                           header=0,
                           usecols=[0, 1],
                           names=["wrong", "correct"],
                           keep_default_na=False)

    # Filter examples with equal splits
    tp = TokenParser(min_split_length=1,
                     stem_threshold=400,
                     single_shot=True,
                     max_token_length=400,
                     attach_upper=True)
    data["wrong_split"] = data["wrong"].apply(lambda x: " ".join(tp.split(x)))
    data["correct_split"] = data["correct"].apply(
        lambda x: " ".join(tp.split(x)))
    data = data[data["wrong_split"] != data["correct_split"]]

    os.system("python3 -m spacy download en")
    nlp = spacy.load("en", disable=["parser", "ner"])

    # Filter examples with equal lemmas
    def _lemmatize(token):
        lemm = nlp(token)
        if len(lemm) > 1 or lemm[0].lemma_ == "-PRON-" or (
                token[-2:] == "ss" and lemm[0].lemma_ == token[:-1]):
            return token
        return lemm[0].lemma_

    data["wrong_lem"] = data["wrong_split"].apply(
        lambda x: " ".join(_lemmatize(token) for token in x.split()))
    data["correct_lem"] = data["correct_split"].apply(
        lambda x: " ".join(_lemmatize(token) for token in x.split()))
    data = data[(data["wrong_lem"] != data["correct_lem"])
                & (data["wrong_lem"] != data["correct_split"]) &
                (data["correct_lem"] != data["wrong_split"])]

    # Save new dataset
    whole_data = pandas.read_csv(dataset, header=0, keep_default_na=False)
    whole_data = whole_data.loc[data.index]
    whole_data.to_csv(filtered_dataset, compression="xz", index=False)
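For reference, a minimal sketch of the first filtering stage in isolation, assuming the TokenParser import path shown in Example 9; the wrong/correct pairs are made up for illustration.

import pandas

from sourced.ml.core.algorithms.token_parser import TokenParser

# Hypothetical wrong/correct pairs; only the first one is an actual typo.
data = pandas.DataFrame({
    "wrong": ["recieve_msg", "methodName", "GET_VALUE"],
    "correct": ["receive_msg", "method_name", "get_value"],
})

# Same configuration as in remove_non_typos above.
tp = TokenParser(min_split_length=1, stem_threshold=400, single_shot=True,
                 max_token_length=400, attach_upper=True)
data["wrong_split"] = data["wrong"].apply(lambda x: " ".join(tp.split(x)))
data["correct_split"] = data["correct"].apply(lambda x: " ".join(tp.split(x)))

# Pairs whose splits are equal differ only in casing or non-alpha characters
# and are dropped as non-typos; only the "recieve_msg" row should survive.
print(data[data["wrong_split"] != data["correct_split"]])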
Example 2
    def test_process_token_with_attach_upper(self):
        tp = TokenParser(stem_threshold=100,
                         single_shot=True,
                         max_token_length=100,
                         min_split_length=1)
        tokens = [
            ("ONLYCAPS", ["onlycaps"]),
            ("nocaps", ["nocaps"]),
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "scase"]),
            ("SQLThing", ["sql", "thing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["something", "silly"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foo", "bar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalms"]),
            ("101_dalms", ["dalms"]),
            ("101_DalmsBug", ["dalms", "bug"]),
            ("101_Dalms45Bug7", ["dalms", "bug"]),
            ("wdSize", ["wd", "size"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             ["sourced", "ml", "algorithms", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE",
             ["wors", "tname", "yo", "ucan", "imagine"]),
            # Another bad example: the parser fails to split it correctly
            ("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]),
            ("SmallIdFooo", ["small", "id", "fooo"]),
            ("ONE_M0re_.__badId.example",
             ["one", "m", "re", "bad", "id", "example"]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsable", "names"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]),
        ]

        for token, correct in tokens:
            res = list(tp.process_token(token))
            self.assertEqual(res, correct)
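The test above relies on attach_upper being enabled by default; below is a small sketch contrasting it with attach_upper=False, with the expected outputs taken from this test and from test_process_token_single_shot in Example 8.

from sourced.ml.core.algorithms.token_parser import TokenParser

# Same configuration as in the test above (attach_upper left at its default).
tp_attach = TokenParser(stem_threshold=100, single_shot=True,
                        max_token_length=100, min_split_length=1)
print(list(tp_attach.process_token("FRAPScase")))     # ['frap', 'scase']

# With attach_upper=False and a low stem_threshold, as in Example 8.
tp_no_attach = TokenParser(stem_threshold=4, single_shot=True,
                           max_token_length=100, min_split_length=1,
                           attach_upper=False)
print(list(tp_no_attach.process_token("FRAPScase")))  # ['frap', 'case']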
Example 3
    def __init__(self, token2index=None, token_parser=None):
        """
        :param token2index: The mapping from tokens to bag keys. If None, no mapping is performed.
        :param token_parser: Specify a token parser if you want to use a custom one. \
            :class:`TokenParser` is used if it is not specified.
        """
        token_parser = TokenParser() if token_parser is None else token_parser
        super().__init__(token2index, token_parser)
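The enclosing class is not shown in this snippet; the sketch below only illustrates the default-or-custom parser pattern from the docstring with a hypothetical IdentifierBag class (the import path follows Example 9).

from sourced.ml.core.algorithms.token_parser import TokenParser


class IdentifierBag:
    """Hypothetical stand-in for the class the __init__ above belongs to."""

    def __init__(self, token2index=None, token_parser=None):
        # Fall back to a default TokenParser when no custom parser is given.
        self.token2index = token2index
        self.token_parser = TokenParser() if token_parser is None else token_parser

    def bag(self, identifier):
        # Apply the token2index mapping only when it is provided.
        tokens = self.token_parser.split(identifier)
        if self.token2index is None:
            return list(tokens)
        return [self.token2index[t] for t in tokens if t in self.token2index]


bag = IdentifierBag(token_parser=TokenParser(stem_threshold=1000, single_shot=True))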
Example 4
    def create_token_parser() -> TokenParser:
        """
        Create an instance of TokenParser that should be used by IdTyposAnalyzer.

        :return: TokenParser.
        """
        return TokenParser(stem_threshold=1000,
                           single_shot=True,
                           min_split_length=1)
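A usage sketch for the parser configured above, constructed directly here since create_token_parser is a method of a class not shown in this snippet; the expected splits mirror test_process_token_with_attach_upper in Example 2, which exercises an equivalent configuration.

from sourced.ml.core.algorithms.token_parser import TokenParser

# Same configuration as create_token_parser above: the very high stem_threshold
# effectively disables stemming, and single_shot yields each subtoken once.
tp = TokenParser(stem_threshold=1000, single_shot=True, min_split_length=1)
print(list(tp.split("UpperCamelCase")))  # expected: ['upper', 'camel', 'case']
print(list(tp.split("method_name")))     # expected: ['method', 'name']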
Example 5
class NNTokenParserTests(unittest.TestCase):
    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def setUp(self):
        self.tp = TokenParser(stem_threshold=4,
                              max_token_length=20,
                              attach_upper=False,
                              use_nn=True)
        self.tp._single_shot = False

    @unittest.skipIf(not has_tensorflow(), "Tensorflow is not installed.")
    def test_process_token(self):
        self.tp.max_token_length = 100

        tokens = [
            ("ONLYCAPS", ["only", "caps"]),
            ("nocaps", ["no", "caps"]),
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "case"]),
            ("SQLThing", ["sql", "thing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["someth", "silli"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foobar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalm"]),
            ("101_dalms", ["dalm"]),
            ("101_DalmsBug", ["dalmsbug"]),
            ("101_Dalms45Bug7", ["dalm", "bug"]),
            ("wdSize", ["wd", "size"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             ["sourc", "d", "ml", "algorithm", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE",
             ["worst", "name", "you", "can", "imagin"]),
            # Another bad example: the parser fails to split it correctly
            ("SmallIdsToFoOo", ["small", "ids", "to", "fooo"]),
            ("SmallIdFooo", ["small", "id", "foo", "o"]),
            ("ONE_M0re_.__badId.example",
             ["one", "m", "re", "badid", "exampl"]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]),
        ]

        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(res, correct)
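For comparison, a sketch of the neural splitter against the heuristic one used in TokenParserTests below; use_nn=True requires TensorFlow, and the expected values are taken from the two test_process_token methods.

from sourced.ml.core.algorithms.token_parser import TokenParser

# Heuristic splitter, configured as in TokenParserTests below.
tp_heuristic = TokenParser(stem_threshold=4, max_token_length=100,
                           attach_upper=False, single_shot=False)
# Neural splitter; requires TensorFlow to be installed.
tp_nn = TokenParser(stem_threshold=4, max_token_length=100,
                    attach_upper=False, single_shot=False, use_nn=True)

# The NN model can split concatenated lower-case words that the
# heuristic parser leaves (and stems) as a single token.
print(list(tp_heuristic.process_token("nocaps")))  # ['nocap']
print(list(tp_nn.process_token("nocaps")))         # ['no', 'caps']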
Example 6
    def reconstruct_identifier(tokenizer: TokenParser, pred_tokens: List[str], identifier: str) \
            -> str:
        """
        Reconstruct identifier given predicted tokens and initial identifier.

        :param tokenizer: instance of TokenParser used to split the identifier.
        :param pred_tokens: list of predicted tokens.
        :param identifier: initial identifier.
        :return: reconstructed identifier based on predicted tokens.
        """
        identifier_l = identifier.lower()
        # check required parameters
        assert tokenizer._single_shot, "TokenParser should be initialized with " \
                                       "`single_shot=True` for IdTyposAnalyzer"
        # sanity checking
        initial_tokens = list(tokenizer.split(identifier))
        err = "Number of predicted tokens (%s) not equal to the number of tokens in the " \
              "identifier (%s) for identifier '%s', predicted_tokens '%s', tokens in identifier " \
              "'%s'"
        assert len(initial_tokens) == len(pred_tokens), \
            err % (len(initial_tokens), len(pred_tokens), identifier, pred_tokens, initial_tokens)
        # reconstruction
        res = []
        prev_end = 0
        for token, pred_token in zip(initial_tokens, pred_tokens):
            curr = identifier_l.find(token, prev_end)
            assert curr != -1, "TokenParser is broken, the subtoken `%s` was not found in the " \
                               "identifier `%s`" % (token, identifier)
            if curr != prev_end:
                # delimiter found
                res.append(identifier[prev_end:curr])
            if identifier[curr:curr + len(token)].isupper():
                # upper case
                res.append(pred_token.upper())
            elif identifier[curr:curr + len(token)][0].isupper():
                # capitalized
                res.append(pred_token[0].upper() + pred_token[1:])
            else:
                res.append(pred_token)
            prev_end = curr + len(token)
        if prev_end != len(identifier):
            # suffix
            res.append(identifier[prev_end:])
        return "".join(res)
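A usage sketch for the reconstruction above. The tokenizer configuration matches create_token_parser from Example 4 (the assertion requires single_shot=True), the typo pair is made up, and reconstruct_identifier is assumed to be reachable as a plain function even though it is defined as a method here.

from sourced.ml.core.algorithms.token_parser import TokenParser

# single_shot=True is required by the assertion in reconstruct_identifier;
# the configuration mirrors create_token_parser in Example 4.
tokenizer = TokenParser(stem_threshold=1000, single_shot=True, min_split_length=1)

identifier = "recieve_Msg"        # hypothetical typo-ed identifier
pred_tokens = ["receive", "msg"]  # hypothetical corrections, one per subtoken

# Delimiters and the per-subtoken casing of the original identifier are kept,
# so the expected result is "receive_Msg".
print(reconstruct_identifier(tokenizer, pred_tokens, identifier))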
Example 7
    def setUp(self):
        self.tp = TokenParser(stem_threshold=4,
                              max_token_length=20,
                              attach_upper=False)
        self.tp._single_shot = False
Example 8
class TokenParserTests(unittest.TestCase):
    def setUp(self):
        self.tp = TokenParser(stem_threshold=4,
                              max_token_length=20,
                              attach_upper=False)
        self.tp._single_shot = False

    def test_process_token(self):
        self.tp.max_token_length = 100

        tokens = [
            ("ONLYCAPS", ["onlycap"]),
            ("nocaps", ["nocap"]),
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "case"]),
            ("SQLThing", ["sqlt", "hing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["someth", "silli"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foo", "bar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalm"]),
            ("101_dalms", ["dalm"]),
            ("101_DalmsBug", ["dalm", "bug"]),
            ("101_Dalms45Bug7", ["dalm", "bug"]),
            ("wdSize", ["wd", "size", "wdsize"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag", [
                "sourc", "sourcedml", "algorithm", "mlalgorithm", "uast",
                "ids", "idsto", "bag", "tobag"
            ]),
            ("WORSTnameYOUcanIMAGINE",
             ["worst", "name", "you", "can", "imagin"]),
            # Another bad example: the parser fails to split it correctly
            ("SmallIdsToFoOo", ["small", "ids", "idsto", "fo", "oo"]),
            ("SmallIdFooo", ["small", "smallid", "fooo", "idfooo"]),
            ("ONE_M0re_.__badId.example", [
                "one", "onem", "re", "bad", "rebad", "badid", "exampl",
                "idexampl"
            ]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh",
             ["looong", "looongsh", "loooong", "shloooong", "loooongsh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]),
        ]

        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(res, correct)

    def test_process_token_with_attach_upper(self):
        tp = TokenParser(stem_threshold=100,
                         single_shot=True,
                         max_token_length=100,
                         min_split_length=1)
        tokens = [
            ("ONLYCAPS", ["onlycaps"]),
            ("nocaps", ["nocaps"]),
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "scase"]),
            ("SQLThing", ["sql", "thing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["something", "silly"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foo", "bar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalms"]),
            ("101_dalms", ["dalms"]),
            ("101_DalmsBug", ["dalms", "bug"]),
            ("101_Dalms45Bug7", ["dalms", "bug"]),
            ("wdSize", ["wd", "size"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             ["sourced", "ml", "algorithms", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE",
             ["wors", "tname", "yo", "ucan", "imagine"]),
            # Another bad example: the parser fails to split it correctly
            ("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]),
            ("SmallIdFooo", ["small", "id", "fooo"]),
            ("ONE_M0re_.__badId.example",
             ["one", "m", "re", "bad", "id", "example"]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsable", "names"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]),
        ]

        for token, correct in tokens:
            res = list(tp.process_token(token))
            self.assertEqual(res, correct)

    def test_process_token_single_shot(self):
        self.tp.max_token_length = 100
        self.tp._single_shot = True
        self.tp.min_split_length = 1
        tokens = [
            ("ONLYCAPS", ["onlycap"]),
            ("nocaps", ["nocap"]),
            ("UpperCamelCase", ["upper", "camel", "case"]),
            ("camelCase", ["camel", "case"]),
            ("FRAPScase", ["frap", "case"]),
            ("SQLThing", ["sqlt", "hing"]),
            ("_Astra", ["astra"]),
            ("CAPS_CONST", ["caps", "const"]),
            ("_something_SILLY_", ["someth", "silli"]),
            ("blink182", ["blink"]),
            ("FooBar100500Bingo", ["foo", "bar", "bingo"]),
            ("Man45var", ["man", "var"]),
            ("method_name", ["method", "name"]),
            ("Method_Name", ["method", "name"]),
            ("101dalms", ["dalm"]),
            ("101_dalms", ["dalm"]),
            ("101_DalmsBug", ["dalm", "bug"]),
            ("101_Dalms45Bug7", ["dalm", "bug"]),
            ("wdSize", ["wd", "size"]),
            ("Glint", ["glint"]),
            ("foo_BAR", ["foo", "bar"]),
            ("sourced.ml.algorithms.uast_ids_to_bag",
             ["sourc", "ml", "algorithm", "uast", "ids", "to", "bag"]),
            ("WORSTnameYOUcanIMAGINE",
             ["worst", "name", "you", "can", "imagin"]),
            # Another bad example: the parser fails to split it correctly
            ("SmallIdsToFoOo", ["small", "ids", "to", "fo", "oo"]),
            ("SmallIdFooo", ["small", "id", "fooo"]),
            ("ONE_M0re_.__badId.example",
             ["one", "m", "re", "bad", "id", "exampl"]),
            ("never_use_Such__varsableNames",
             ["never", "use", "such", "varsabl", "name"]),
            ("a.b.c.d", ["a", "b", "c", "d"]),
            ("A.b.Cd.E", ["a", "b", "cd", "e"]),
            ("looong_sh_loooong_sh", ["looong", "sh", "loooong", "sh"]),
            ("sh_sh_sh_sh", ["sh", "sh", "sh", "sh"]),
            ("loooong_loooong_loooong", ["loooong", "loooong", "loooong"]),
        ]

        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(res, correct)

        min_split_length = 3
        self.tp.min_split_length = min_split_length
        for token, correct in tokens:
            res = list(self.tp.process_token(token))
            self.assertEqual(
                res, [c for c in correct if len(c) >= min_split_length])

    def test_split(self):
        self.assertEqual(list(self.tp.split("set for")), ["set", "for"])
        self.assertEqual(list(self.tp.split("set /for.")), ["set", "for"])
        self.assertEqual(list(self.tp.split("NeverHav")), ["never", "hav"])
        self.assertEqual(list(self.tp.split("PrintAll")), ["print", "all"])
        self.assertEqual(list(self.tp.split("PrintAllExcept")),
                         ["print", "all", "except"])
        self.assertEqual(
            list(self.tp.split("print really long line")),
            # 'longli' is an expected artifact due to edge effects
            ["print", "really", "long", "longli"])
        self.assertEqual(list(self.tp.split("set /for. *&PrintAll")),
                         ["set", "for", "print", "all"])
        self.assertEqual(list(self.tp.split("JumpDown not Here")),
                         ["jump", "down", "not", "here"])

        self.assertEqual(list(self.tp.split("a b c d")), ["a", "b", "c", "d"])
        self.assertEqual(list(self.tp.split("a b long c d")),
                         ["a", "b", "long", "blong", "longc", "d"])
        self.assertEqual(list(self.tp.split("AbCd")), ["ab", "cd"])

    def test_reconstruct(self):
        self.tp._save_token_style = True
        self.tp._single_shot = True
        self.tp.min_split_length = 1

        tokens = [
            "ONLYCAPS",
            "nocaps",
            "UpperCamelCase",
            "camelCase",
            "FRAPScase",
            "SQLThing",
            "_Astra",
            "CAPS_CONST",
            "_something_SILLY_",
            "blink182",
            "FooBar100500Bingo",
            "Man45var",
            "method_name",
            "Method_Name",
            "101dalms",
            "101_dalms",
            "101_DalmsBug",
            "101_Dalms45Bug7",
            "wdSize",
            "Glint",
            "foo_BAR",
            "sourced.ml.algorithms.uast_ids_to_bag",
            "WORSTnameYOUcanIMAGINE",
            "SmallIdsToFoOo",
            "SmallIdFooo",
            "ONE_M0re_.__badId.example",
            "never_use_Such__varsableNames",
            "a.b.c.d",
            "A.b.Cd.E",
            "looong_sh_loooong_sh",
            "sh_sh_sh_sh",
            "loooong_loooong_loooong",
        ]
        self.tp.max_token_length = max(map(len, tokens))

        for token in tokens:
            splitted_tokens = list(self.tp.split(token))
            self.assertEqual(token, self.tp.reconstruct(splitted_tokens))

    def test_split_single_shot(self):
        self.tp._single_shot = True
        self.tp.min_split_length = 1
        self.assertEqual(
            list(self.tp.split("print really long line")),
            # 'li' is an expected artifact due to edge effects
            ["print", "really", "long", "li"])
        self.assertEqual(list(self.tp.split("a b c d")), ["a", "b", "c", "d"])
        self.assertEqual(list(self.tp.split("a b long c d")),
                         ["a", "b", "long", "c", "d"])
        self.assertEqual(list(self.tp.split("AbCd")), ["ab", "cd"])

    def test_stem(self):
        self.assertEqual(self.tp.stem("lol"), "lol")
        self.assertEqual(self.tp.stem("apple"), "appl")
        self.assertEqual(self.tp.stem("orange"), "orang")
        self.assertEqual(self.tp.stem("embedding"), "embed")
        self.assertEqual(self.tp.stem("Alfred"), "Alfred")
        self.assertEqual(self.tp.stem("Pluto"), "Pluto")

    def test_pickle(self):
        tp = pickle.loads(pickle.dumps(self.tp))
        self.assertEqual(tp.stem("embedding"), "embed")
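A short usage sketch for the stemming and pickling behaviour checked above; the expected values are taken from test_stem and test_pickle.

import pickle

from sourced.ml.core.algorithms.token_parser import TokenParser

tp = TokenParser(stem_threshold=4, max_token_length=20, attach_upper=False)

# Values as asserted in test_stem: ordinary words are stemmed, while short
# and capitalized tokens pass through unchanged.
print(tp.stem("embedding"))  # 'embed'
print(tp.stem("apple"))      # 'appl'
print(tp.stem("lol"))        # 'lol'
print(tp.stem("Alfred"))     # 'Alfred'

# The parser survives pickling, as test_pickle checks.
tp_copy = pickle.loads(pickle.dumps(tp))
print(tp_copy.stem("orange"))  # 'orang'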
Example 9
def preprocess(
    repo: str,
    dataset_name: str,
    exclude_refs: List[str],
    only_head: bool,
    only_by_date: bool,
    version_sep: str,
    langs: Optional[List[str]],
    exclude_langs: Optional[List[str]],
    keep_vendors: bool,
    features: List[str],
    force: bool,
    bblfsh_timeout: float,
    use_nn: bool,
    log_level: str,
) -> None:
    """Extract features from a repository and store them as a pickled dict."""
    def feature_extractor(uast_obj: Any) -> Iterator[Tuple[str, str]]:
        if type(uast_obj) == dict:
            if "@type" in uast_obj and uast_obj["@type"] in feature_mapping:
                key, feature = feature_mapping[uast_obj["@type"]]
                if uast_obj[key] is not None:
                    yield uast_obj[key], feature
            for key in uast_obj:
                if type(uast_obj[key]) in {dict, list}:
                    yield from feature_extractor(uast_obj[key])
        elif type(uast_obj) == list:
            for uast in uast_obj:
                yield from feature_extractor(uast)

    logger = create_logger(log_level, __name__)

    output_path = os.path.join(DATASET_DIR, dataset_name + ".pkl")
    check_remove(output_path, logger, force)
    create_directory(os.path.dirname(output_path), logger)

    bblfsh_host = check_env_exists("BBLFSH_HOSTNAME")
    bblfsh_port = int(check_env_exists("BBLFSH_PORT"))
    host = check_env_exists("GITBASE_HOSTNAME")
    port = int(check_env_exists("GITBASE_PORT"))
    user = check_env_exists("GITBASE_USERNAME")
    password = check_env_exists("GITBASE_PASSWORD")

    if use_nn:
        from sourced.ml.core.algorithms.token_parser import TokenParser

        token_parser = TokenParser(single_shot=True, use_nn=True)

    logger.info("Processing repository '%s'" % repo)
    logger.info("Retrieving tagged references ...")
    sql = get_tagged_refs_sql(repository_id=repo)
    version_mapping: DefaultDict[int, DefaultDict[int, RefList]] = defaultdict(
        lambda: defaultdict(RefList))
    if only_head:
        refs = RefList(["HEAD"])
        logger.info("Only extracting HEAD revision.")
    else:
        refs = RefList(row["ref_name"].decode()
                       for row in extract(host, port, user, password, sql))
        for keyword in exclude_refs:
            refs = RefList(ref for ref in refs if keyword not in ref)
        if not only_by_date:
            for ref in refs:
                major, minor = [
                    int(re.findall(r"[0-9]+", version)[0])
                    for version in ref.split(version_sep)[:2]
                ]
                version_mapping[major][minor].append(ref)
            refs = RefList(ref for major in sorted(version_mapping)
                           for minor in sorted(version_mapping[major])
                           for ref in version_mapping[major][minor])
        logger.info("Found %d tagged references." % len(refs))

    used_langs = create_language_list(langs, exclude_langs)
    exclude_vendors = not keep_vendors
    sql = get_file_info_sql(
        repository_id=repo,
        ref_names=refs,
        exclude_vendors=exclude_vendors,
        langs=used_langs,
    )
    files_info = FilesInfo(refs)
    lang_count: CounterType[str] = Counter()
    seen_files: Set[Tuple[str, str]] = set()
    raw_count = 0
    logger.info("Retrieving file information ...")
    for row in extract(host, port, user, password, sql):
        raw_count += 1
        ref = row["ref_name"].decode()
        file_path = row["file_path"].decode()
        blob_hash = row["blob_hash"].decode()
        lang = row["lang"].decode()
        if (file_path, blob_hash) not in seen_files:
            lang_count[lang] += 1
            seen_files.add((file_path, blob_hash))
        files_info[ref][file_path] = FileInfo(blob_hash=blob_hash,
                                              language=lang)
    if raw_count:
        logger.info("Found %d parsable blobs:" % raw_count)
    else:
        logger.info("Found no parsable blobs, stopping.")
        return
    for ref in refs:
        logger.info("   '%s' : %d blobs.", ref, len(files_info[ref]))
    logger.info("Found %d distinct parsable blobs:" % len(seen_files))
    for lang in sorted(lang_count):
        logger.info("   %s : %d files.", lang, lang_count[lang])

    files_content = FilesContent(files_info)
    sql = get_file_content_sql(
        repository_id=repo,
        ref_names=refs,
        exclude_vendors=exclude_vendors,
        langs=used_langs,
    )
    stop_words = frozenset(stopwords.words("english"))
    stemmer = PorterStemmer()
    stem_mapping: DefaultDict[str, WordCount] = defaultdict(WordCount)
    blacklisted_files: Set[str] = set()
    client = bblfsh.BblfshClient("%s:%d" % (bblfsh_host, bblfsh_port))
    parsed_count: CounterType = Counter()
    feature_mapping = {
        xpath: feature_tuple
        for xpath, feature_tuple in FEATURE_MAPPING.items()
        if feature_tuple[1] in features
    }
    logger.info("Retrieving file content ...")
    # TODO: Remove docker restart logic when this
    #       https://github.com/bblfsh/bblfshd/issues/297 is done
    for row in tqdm.tqdm(extract(host, port, user, password, sql),
                         total=len(seen_files)):
        file_path = row["file_path"].decode()
        if file_path in blacklisted_files:
            continue
        blob_hash = row["blob_hash"].decode()
        lang = row["lang"].decode()
        contents = row["blob_content"].decode()
        if contents == "":
            files_info.remove(file_path, blob_hash)
            continue
        for attempt in range(2):
            try:
                start = time.time()
                ctx = client.parse(
                    filename="",
                    language=lang,
                    contents=contents,
                    timeout=bblfsh_timeout,
                )
                uast = ctx.get_all()
            except Exception:
                if time.time() - start > bblfsh_timeout - 0.1 and attempt == 0:
                    logger.warning(
                        "Babelfish timed out, restarting the container ...")
                    subprocess.call(["docker", "restart", bblfsh_host],
                                    stdout=subprocess.DEVNULL)
                    time.sleep(10)
                    logger.warning("Restarted the container.")
                uast = None
            else:
                # Parsing succeeded, no need for a second attempt
                break
        if uast is None:
            logger.debug(
                "Failed to parse '%s' : %s (%s file), blacklisting it.",
                file_path,
                blob_hash,
                lang,
            )
            files_info.remove(file_path, blob_hash)
            blacklisted_files.add(file_path)
            continue

        parsed_count[lang] += 1
        feature_content = FeatureContent(features)
        num_nodes = 0
        for word, feature in feature_extractor(uast):
            if feature == COMMENTS:
                words = [
                    w for w in word.split() if w.lower() not in stop_words
                ]
            else:
                words = [word]
            if use_nn:
                words = [w for word in words for w in token_parser.split(word)]
            else:
                words = [w for word in words for w in word.split("_")]
                words = [
                    w for word in words for w in re.findall(
                        r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", word)
                ]
            words = [w.lower() for w in words]
            stems_words: List[Tuple[str, str]] = [(stemmer.stem(w), w)
                                                  for w in words]
            stems_words = [(s, w) for s, w in stems_words if good_token(s)]
            if stems_words:
                num_nodes += 1
                feature_content[feature].update(s for s, _ in stems_words)
                for stem, word in stems_words:
                    stem_mapping[stem][word] += 1
        if num_nodes == 0:
            files_info.remove(file_path, blob_hash)
            continue
        files_content[file_path][blob_hash] = feature_content
    files_content.purge(blacklisted_files)
    total_parsed = sum(parsed_count.values())
    logger.info("Extracted features from %d distinct blobs.", total_parsed)
    logger.debug("Successfully parsed %f%% of blobs.",
                 total_parsed * 100 / len(seen_files))
    for lang in sorted(parsed_count):
        logger.info("   %s : %d blobs.", lang, parsed_count[lang])
        logger.debug(
            "   Successfully parsed %f%% of blobs.",
            parsed_count[lang] * 100 / lang_count[lang],
        )
    logger.info("Creating reverse stem mapping ...")
    reverse_mapping: Dict[str, str] = {}
    for stem in stem_mapping:
        reverse_mapping[stem] = stem_mapping[stem].most_common(1)[0][0]
    logger.info("Reversing stemming ...")
    files_content.map_words(reverse_mapping)

    refs_dict = RefsDict()
    refs_dict[repo] = refs
    dataset = Dataset(
        files_info={repo: files_info},
        files_content={repo: files_content},
        refs_dict=refs_dict,
    )
    logger.info("Saving features ...")
    with open(output_path, "wb") as fout:
        pickle.dump(dataset, fout)
    logger.info("Saved features in '%s'." % output_path)
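For reference, a self-contained sketch of the regex-based fallback splitting used above when use_nn is False (the helper name split_identifier is made up); stop-word filtering and stemming are omitted here.

import re
from typing import List


def split_identifier(word: str) -> List[str]:
    # Split on underscores, then on camelCase / UPPERCase boundaries,
    # then lower-case, mirroring the non-NN branch above.
    words = word.split("_")
    words = [w for word in words
             for w in re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", word)]
    return [w.lower() for w in words]


print(split_identifier("FooBar100500Bingo"))  # ['foo', 'bar', 'bingo']
print(split_identifier("CAPS_CONST"))         # ['caps', 'const']
print(split_identifier("wdSize"))             # ['wd', 'size']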