def __init__(self, model_dir=None):
    """Programming language guesser.

    ``model_dir`` -- Guesslang machine learning model directory.
    """
    model_data = model_info(model_dir)
    #: `tensorflow` model directory
    self.model_dir = model_data[0]
    #: tells if current model is the default model
    self.is_default = model_data[1]
    #: supported languages with associated extensions
    self.languages = config_dict('languages.json')

    n_classes = len(self.languages)
    feature_columns = [
        tf.contrib.layers.real_valued_column('', dimension=CONTENT_SIZE)]

    self._classifier = tf.contrib.learn.DNNLinearCombinedClassifier(
        linear_feature_columns=feature_columns,
        dnn_feature_columns=feature_columns,
        dnn_hidden_units=_NEURAL_NETWORK_HIDDEN_LAYERS,
        n_classes=n_classes,
        linear_optimizer=tf.train.RMSPropOptimizer(_OPTIMIZER_STEP),
        dnn_optimizer=tf.train.RMSPropOptimizer(_OPTIMIZER_STEP),
        model_dir=self.model_dir)
def __init__(self, model_dir: Optional[str] = None) -> None:
    """Programming language guesser.

    ``model_dir`` -- Guesslang machine learning model directory.
    """
    model_data = model_info(model_dir)
    #: `tensorflow` model directory
    self.model_dir: str = model_data[0]
    #: Tells if the current model is the default model
    self.is_default: bool = model_data[1]
    #: Supported languages associated with their extensions
    self.languages: Dict[str, List[str]] = config_dict('languages.json')

    n_classes = len(self.languages)
    feature_columns = [
        tf.contrib.layers.real_valued_column('', dimension=CONTENT_SIZE)
    ]

    self._classifier = tf.contrib.learn.DNNLinearCombinedClassifier(
        linear_feature_columns=feature_columns,
        dnn_feature_columns=feature_columns,
        dnn_hidden_units=NEURAL_NETWORK_HIDDEN_LAYERS,
        n_classes=n_classes,
        linear_optimizer=tf.train.RMSPropOptimizer(OPTIMIZER_STEP),
        dnn_optimizer=tf.train.RMSPropOptimizer(OPTIMIZER_STEP),
        model_dir=self.model_dir)
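# Hedged usage sketch, not part of the original module: it assumes the
# constructor above belongs to a class named `Guess` exported by the
# `guesslang` package (the class name is not shown in this snippet). Only
# attributes set in `__init__` are accessed.
from guesslang import Guess

guess = Guess()                 # model_dir=None -> load the default model
print(guess.model_dir)          # `tensorflow` model directory in use
print(guess.is_default)         # True when the default model is loaded
print(sorted(guess.languages))  # names of the supported languages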
def _find_files(test_repos, learn_repos, nb_test, nb_learn, remove):
    """List the source files of the test and learning repositories
    and drop the extra files beyond ``nb_test`` and ``nb_learn``.
    """
    languages = config_dict('languages.json')

    LOGGER.info("Process %d test repositories", len(test_repos))
    full_test_files = _list_files(test_repos, languages, remove)
    test_files = _drop_extra_files(full_test_files, nb_test)

    LOGGER.info("Process %d learning repositories", len(learn_repos))
    full_learn_files = _list_files(learn_repos, languages, remove)
    learn_files = _drop_extra_files(full_learn_files, nb_learn)

    return (test_files, learn_files)
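# Hedged usage sketch, not part of the original module: the repository lists
# and limits are made up for illustration, and `remove` is assumed to be a
# flag forwarded to `_list_files` (its exact meaning is not shown here).
def _demo_find_files():
    test_files, learn_files = _find_files(
        test_repos=['repos/test/project-a', 'repos/test/project-b'],
        learn_repos=['repos/learn/project-c', 'repos/learn/project-d'],
        nb_test=1000,
        nb_learn=10000,
        remove=False)
    LOGGER.info("%d test files, %d learning files",
                len(test_files), len(learn_files))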
def main(): """Github repositories downloaded command line""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( 'githubtoken', help="Github OAuth token, see https://developer.github.com/v3/oauth/") parser.add_argument('destination', help="location of the downloaded repos") parser.add_argument('-n', '--nbrepo', help="number of repositories per language", type=int, default=1000) parser.add_argument('-d', '--debug', default=False, action='store_true', help="show debug messages") args = parser.parse_args() config_logging(args.debug) destination = Path(args.destination) nb_repos = args.nbrepo token = args.githubtoken languages = config_dict('languages.json') destination.mkdir(exist_ok=True) for pos, language in enumerate(sorted(languages), 1): LOGGER.info("Step %.2f%%, %s", 100 * pos / len(languages), language) LOGGER.info("Fetch %d repos infos for language %s", nb_repos, language) repos = _retrieve_repo_details(language, nb_repos, token) LOGGER.info("%d repos details kept. Downloading", len(repos)) _download_repos(language, repos, destination) LOGGER.info("Language %s repos downloaded", language) LOGGER.debug("Exit OK")
"""Extract features (floats vector) that represent a given text""" import logging import re import math from guesslang.config import config_dict LOGGER = logging.getLogger(__name__) CONTENT_SIZE = 2**10 SPECIAL_KEYWORDS = {'num': '<number>', 'var': '<variable>'} _KEYWORDS = config_dict('keywords.json') _SEPARATOR = re.compile(r'(\W)') _SHIFT = 17 _FACTOR = 23 def extract(text): """Transform the text into a vector of float values. The vector is a representation of the text. :param str text: the text to represent :return: representation :rtype: list """ return _normalize(_vectorize(split(text)))
def main(): """Keywords generator command line""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument('learn', help="learning source codes directory") parser.add_argument('keywords', help="output keywords file, JSON") parser.add_argument('-n', '--nbkeywords', type=int, default=10000, help="the number of keywords to keep") parser.add_argument('-d', '--debug', default=False, action='store_true', help="show debug messages") args = parser.parse_args() config_logging(args.debug) learn_path = Path(args.learn) keywords_path = Path(args.keywords) nb_keywords = args.nbkeywords languages = config_dict('languages.json') exts = {ext: lang for lang, exts in languages.items() for ext in exts} term_count = Counter() document_count = Counter() pos = 0 LOGGER.info("Reading files form %s", learn_path) for pos, path in enumerate(Path(learn_path).glob('**/*'), 1): print(pos) print(path) if pos % STEP == 0: LOGGER.info("Processed %d", pos) gc.collect() # Cleanup dirt if not path.is_file() or not exts.get(path.suffix.lstrip('.')): continue counter = _extract(path) term_count.update(counter) document_count.update(counter.keys()) nb_terms = sum(term_count.values()) nb_documents = pos - 1 if not nb_documents: LOGGER.error("No source files found in %s", learn_path) raise RuntimeError('No source files in {}'.format(learn_path)) LOGGER.info("%d unique terms found", len(term_count)) terms = _most_frequent((term_count, nb_terms), (document_count, nb_documents), nb_keywords) keywords = { token: int(hashlib.sha1(token.encode()).hexdigest(), 16) for token in terms } with keywords_path.open('w') as keywords_file: json.dump(keywords, keywords_file, indent=2, sort_keys=True) LOGGER.info("%d keywords written into %s", len(keywords), keywords_path) LOGGER.debug("Exit OK")