Example #1
    def __init__(self, model_dir=None):
        """Programming language guesser.

        ``model_dir`` -- Guesslang machine learning model directory.

        """
        model_data = model_info(model_dir)

        #: `tensorflow` model directory
        self.model_dir = model_data[0]

        #: tells if current model is the default model
        self.is_default = model_data[1]

        #: supported languages with associated extensions
        self.languages = config_dict('languages.json')

        n_classes = len(self.languages)
        feature_columns = [
            tf.contrib.layers.real_valued_column('', dimension=CONTENT_SIZE)]

        self._classifier = tf.contrib.learn.DNNLinearCombinedClassifier(
            linear_feature_columns=feature_columns,
            dnn_feature_columns=feature_columns,
            dnn_hidden_units=_NEURAL_NETWORK_HIDDEN_LAYERS,
            n_classes=n_classes,
            linear_optimizer=tf.train.RMSPropOptimizer(_OPTIMIZER_STEP),
            dnn_optimizer=tf.train.RMSPropOptimizer(_OPTIMIZER_STEP),
            model_dir=self.model_dir)
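
The estimator built above is only constructed here; training and prediction are not part of this example. The sketch below shows one way such a `tf.contrib.learn` estimator (TensorFlow 1.x) can be driven; the arrays, the helper name and the step count are hypothetical and not taken from guesslang.

import numpy as np

def _train_and_score(classifier, vectors, labels, sample):
    # Hypothetical driver: `vectors` is a float32 array of shape
    # (n_files, CONTENT_SIZE), `labels` an int array of class ids.
    classifier.fit(
        x=np.asarray(vectors, dtype=np.float32),
        y=np.asarray(labels, dtype=np.int32),
        steps=1000)  # hypothetical number of training steps
    # Per-language probabilities for a single feature vector
    return list(classifier.predict_proba(
        np.asarray([sample], dtype=np.float32)))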
Example #2
    def __init__(self, model_dir: Optional[str] = None) -> None:
        model_data = model_info(model_dir)

        #: `tensorflow` model directory
        self.model_dir: str = model_data[0]

        #: Tells if the current model is the default model
        self.is_default: bool = model_data[1]

        #: Supported languages associated with their extensions
        self.languages: Dict[str, List[str]] = config_dict('languages.json')

        n_classes = len(self.languages)
        feature_columns = [
            tf.contrib.layers.real_valued_column('', dimension=CONTENT_SIZE)
        ]

        self._classifier = tf.contrib.learn.DNNLinearCombinedClassifier(
            linear_feature_columns=feature_columns,
            dnn_feature_columns=feature_columns,
            dnn_hidden_units=NEURAL_NETWORK_HIDDEN_LAYERS,
            n_classes=n_classes,
            linear_optimizer=tf.train.RMSPropOptimizer(OPTIMIZER_STEP),
            dnn_optimizer=tf.train.RMSPropOptimizer(OPTIMIZER_STEP),
            model_dir=self.model_dir)
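
Example #2 is the type-annotated variant of the same constructor. The module-level constants it references are not shown on this page; CONTENT_SIZE is defined as 2**10 in the extractor module (Example #5), while the other two values below are purely illustrative placeholders.

# Assumed constants; only CONTENT_SIZE is confirmed elsewhere in this listing.
CONTENT_SIZE = 2**10                      # length of one feature vector
NEURAL_NETWORK_HIDDEN_LAYERS = [256, 64]  # hypothetical hidden layer sizes
OPTIMIZER_STEP = 0.05                     # hypothetical RMSProp learning rate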
Example #3
def _find_files(test_repos, learn_repos, nb_test, nb_learn, remove):
    languages = config_dict('languages.json')

    LOGGER.info("Process %d test repositories", len(test_repos))
    full_test_files = _list_files(test_repos, languages, remove)
    test_files = _drop_extra_files(full_test_files, nb_test)

    LOGGER.info("Process %d learning repositories", len(learn_repos))
    full_learn_files = _list_files(learn_repos, languages, remove)
    learn_files = _drop_extra_files(full_learn_files, nb_learn)

    return (test_files, learn_files)
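
The helpers `_list_files` and `_drop_extra_files` are not included in this listing. The function below is only a hypothetical reading of `_list_files`, inferred from its call site: walk the repository trees, keep files whose extension maps to a known language, and, under one possible meaning of `remove`, delete the files that cannot be used.

from pathlib import Path

def _list_files_sketch(repos, languages, remove):
    # Hypothetical sketch, not guesslang's real implementation.
    extensions = {ext for exts in languages.values() for ext in exts}
    kept = []
    for repo in repos:
        for path in Path(repo).glob('**/*'):
            if not path.is_file():
                continue
            if path.suffix.lstrip('.') in extensions:
                kept.append(path)
            elif remove:
                path.unlink()  # assumption: `remove` discards unusable files
    return kept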
Example #4
def main():
    """Github repositories downloader command line"""

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        'githubtoken',
        help="Github OAuth token, see https://developer.github.com/v3/oauth/")
    parser.add_argument('destination', help="location of the downloaded repos")
    parser.add_argument('-n',
                        '--nbrepo',
                        help="number of repositories per language",
                        type=int,
                        default=1000)
    parser.add_argument('-d',
                        '--debug',
                        default=False,
                        action='store_true',
                        help="show debug messages")

    args = parser.parse_args()
    config_logging(args.debug)

    destination = Path(args.destination)
    nb_repos = args.nbrepo
    token = args.githubtoken

    languages = config_dict('languages.json')
    destination.mkdir(exist_ok=True)

    for pos, language in enumerate(sorted(languages), 1):
        LOGGER.info("Step %.2f%%, %s", 100 * pos / len(languages), language)
        LOGGER.info("Fetch %d repos infos for language %s", nb_repos, language)
        repos = _retrieve_repo_details(language, nb_repos, token)
        LOGGER.info("%d repos details kept. Downloading", len(repos))
        _download_repos(language, repos, destination)
        LOGGER.info("Language %s repos downloaded", language)

    LOGGER.debug("Exit OK")
Example #5
"""Extract features (floats vector) that represent a given text"""

import logging
import re
import math

from guesslang.config import config_dict

LOGGER = logging.getLogger(__name__)

CONTENT_SIZE = 2**10

SPECIAL_KEYWORDS = {'num': '<number>', 'var': '<variable>'}
_KEYWORDS = config_dict('keywords.json')

_SEPARATOR = re.compile(r'(\W)')

_SHIFT = 17
_FACTOR = 23


def extract(text):
    """Transform the text into a vector of float values.
    The vector is a representation of the text.

    :param str text: the text to represent
    :return: representation
    :rtype: list
    """
    return _normalize(_vectorize(split(text)))
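
A short usage sketch, assuming the extractor module above is importable. The feature column in Examples #1 and #2 is declared with dimension=CONTENT_SIZE, so the returned vector is expected to hold CONTENT_SIZE float values.

# Usage sketch; the expected length follows from the classifier's feature column.
vector = extract("def add(a, b):\n    return a + b\n")
print(len(vector) == CONTENT_SIZE)                         # expected: True
print(all(isinstance(value, float) for value in vector))   # expected: True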
Example #6
def main():
    """Keywords generator command line"""

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('learn', help="learning source codes directory")
    parser.add_argument('keywords', help="output keywords file, JSON")
    parser.add_argument('-n',
                        '--nbkeywords',
                        type=int,
                        default=10000,
                        help="the number of keywords to keep")
    parser.add_argument('-d',
                        '--debug',
                        default=False,
                        action='store_true',
                        help="show debug messages")

    args = parser.parse_args()
    config_logging(args.debug)

    learn_path = Path(args.learn)
    keywords_path = Path(args.keywords)
    nb_keywords = args.nbkeywords

    languages = config_dict('languages.json')

    exts = {ext: lang for lang, exts in languages.items() for ext in exts}

    term_count = Counter()
    document_count = Counter()
    pos = 0
    LOGGER.info("Reading files form %s", learn_path)
    for pos, path in enumerate(Path(learn_path).glob('**/*'), 1):
        if pos % STEP == 0:
            LOGGER.info("Processed %d", pos)
            gc.collect()  # Cleanup dirt

        if not path.is_file() or not exts.get(path.suffix.lstrip('.')):
            continue

        counter = _extract(path)
        term_count.update(counter)
        document_count.update(counter.keys())

    nb_terms = sum(term_count.values())
    nb_documents = pos - 1
    if not nb_documents:
        LOGGER.error("No source files found in %s", learn_path)
        raise RuntimeError('No source files in {}'.format(learn_path))

    LOGGER.info("%d unique terms found", len(term_count))

    terms = _most_frequent((term_count, nb_terms),
                           (document_count, nb_documents), nb_keywords)

    keywords = {
        token: int(hashlib.sha1(token.encode()).hexdigest(), 16)
        for token in terms
    }

    with keywords_path.open('w') as keywords_file:
        json.dump(keywords, keywords_file, indent=2, sort_keys=True)
    LOGGER.info("%d keywords written into %s", len(keywords), keywords_path)
    LOGGER.debug("Exit OK")