Example #1
    def __init__(self):
        settings = Settings()

        self.db = PostgresDb()
        self.articles_filename = settings.get_csv_file('articles')
        self.authors_filename = settings.get_csv_file('authors')
        self.comments_filename = settings.get_csv_file('comments')
Example #2
    def __init__(self,
                 expert_language,
                 second_language,
                 expert_loc=10000,
                 second_loc=1000):
        """
        :param expert_language: helper.language.Language object of the expert language.
        :param second_language: helper.language.Language object of the language to examine. For now, just Python.
        :param expert_loc: Lines-of-code threshold above which a user counts as an expert.
        :param second_loc: Lines-of-code threshold ensuring the examined language has a sufficiently large code base.
        """
        self.expert_language = expert_language
        self.second_language = second_language
        self.expert_loc = expert_loc
        self.second_loc = second_loc

        settings = Settings()
        loc_user_file_ext_view = settings.get_database_view(
            'loc_user_file_ext')

        self.query = SQL('''
            SELECT
                u1.user_id,
                u1.login
            FROM {} AS u1
                JOIN {} AS u2 ON u1.user_id = u2.user_id
                JOIN users u ON u1.user_id = u.id
            WHERE u1.additions >= %s AND u1.file_ext = ANY(%s) AND u2.additions >= %s AND u2.file_ext = ANY(%s) AND u.type = 'USR';
        ''').format(Identifier(loc_user_file_ext_view),
                    Identifier(loc_user_file_ext_view))

        self.detection_path = settings.get_path([
            'detections',
            expert_language.language + '_' + second_language.language
        ])
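The composed query carries four %s placeholders: an additions threshold and a file-extension array for each of the two languages. A minimal sketch of how it might be executed, assuming a psycopg2 cursor and assuming the Language objects expose a list of file extensions (the `file_extensions` attribute name is made up here):

    # Hypothetical method; assumes a psycopg2 cursor and that the Language
    # objects expose a `file_extensions` list (name invented for this sketch).
    def find_candidates(self, cursor):
        cursor.execute(self.query, (self.expert_loc,
                                    self.expert_language.file_extensions,
                                    self.second_loc,
                                    self.second_language.file_extensions))
        return cursor.fetchall()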
Example #3
def train():
    settings = Settings()

    batch_size = settings.get_training_parameters('batch_size')
    epochs = settings.get_training_parameters('epochs')

    model_builder = Model4Builder()

    model = model_builder()

    preprocessor = Preprocessor(model)

    preprocessor.load_data(['category', 'is_top_submission'])

    training_input = [preprocessor.training_data['category']]
    validation_input = [preprocessor.validation_data['category']]
    training_output = [preprocessor.training_data['is_top_submission']]
    validation_output = [preprocessor.validation_data['is_top_submission']]

    class_weights = calculate_class_weights(preprocessor.training_data['is_top_submission'],
                                            [ol.name for ol in model.output_layers])

    callbacks = CallbackBuilder(model, [CsvLogger, CsvPlotter, ConfigLogger, ModelSaver])()

    model.fit(training_input, training_output, batch_size=batch_size, epochs=epochs,
              callbacks=callbacks, validation_data=(validation_input, validation_output), class_weight=class_weights)
Example #4
    def __call__(self):
        timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        settings = Settings()
        log_path = '{}/{}_{}'.format(settings.get_training_root_dir(),
                                     self.model.name, timestamp)
        os.makedirs(log_path, exist_ok=True)

        if CsvLogger in self.callback_classes:
            csv_logger = CsvLogger(self.model.name, log_path)
            self.active_callbacks.append(csv_logger)

        if CsvPlotter in self.callback_classes:
            assert CsvLogger in self.callback_classes

            plotter = CsvPlotter(self.model, log_path)
            self.active_callbacks.append(plotter)

        if ConfigLogger in self.callback_classes:
            config_logger = ConfigLogger(self.model, log_path)
            self.active_callbacks.append(config_logger)

        if ModelSaver in self.callback_classes:
            model_saver = ModelSaver(self.model, log_path)
            self.active_callbacks.append(model_saver)

        return self.active_callbacks
Example #5
    def __init__(self, sha):
        self.sha = sha

        settings = Settings()
        self._downloads_path = settings.get_path('downloads')
        self._download_archive_path = settings.get_path('download_archive')
        self._earliest_project_view = settings.get_database_view(
            'earliest_project')
Example #6
    def __init__(self):
        self.required_inputs = []
        self.required_parameters = []

        self.inputs = {}
        self.parameters = {}

        settings = Settings()
        self.parameters = settings.get_network_parameters()
Example #7
    def __init__(self, dictionary_size):
        self.dictionary_size = dictionary_size
        self.dimensions = self.DIMENSIONS

        settings = Settings()
        self.embedding_path = settings.get_glove_embedding()

        self.word_numbers = {}
        self.embedding_vectors = np.zeros(
            (dictionary_size + 1, self.DIMENSIONS))
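`load_embedding` is called on Glove instances elsewhere in these examples (see Example #10). A minimal sketch of what it might do, assuming the standard GloVe text format of one `token v1 ... vN` line per word; the real implementation may differ:

    def load_embedding(self):
        # Sketch only: keep the first `dictionary_size` tokens; row 0 of
        # embedding_vectors stays all-zero for padding/unknown words.
        with open(self.embedding_path, encoding='utf-8') as embedding_file:
            for line in embedding_file:
                if len(self.word_numbers) >= self.dictionary_size:
                    break
                token, *vector = line.rstrip().split(' ')
                word_number = len(self.word_numbers) + 1
                self.word_numbers[token] = word_number
                self.embedding_vectors[word_number] = np.asarray(vector,
                                                                 dtype=float)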
Example #8
    def __init__(self):
        settings = Settings()
        db_settings = settings.get_database_settings()

        connection_parameters = ' '.join([
            '{}={}'.format(key, value) for (key, value) in db_settings.items()
        ])
        self.connection = psycopg2.connect(connection_parameters)
        self.cursor = self.connection.cursor(
            cursor_factory=psycopg2.extras.DictCursor)
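The DictCursor makes result rows addressable by column name as well as by index. A small hypothetical usage, assuming the class above is the PostgresDb used in Example #1 (the `articles` table name is also an assumption):

# Hypothetical usage; assumes the class above is PostgresDb from Example #1
# and that an `articles` table exists.
db = PostgresDb()
db.cursor.execute('SELECT id, title FROM articles LIMIT 1;')
row = db.cursor.fetchone()
print(row['title'])  # DictCursor rows also support access by column name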
Example #9
    def __init__(self, model, log_path):
        super().__init__()
        settings = Settings()

        self.training_parameters = settings.get_training_parameters()
        self.network_parameters = settings.get_network_parameters()

        self.log_path = log_path
        self.filename = '{}/{}'.format(log_path, 'config.txt')

        model.summary(print_fn=self._handle_summary_print)
        self.model = model
Example #10
def train():
    settings = Settings()

    batch_size = settings.get_training_parameters('batch_size')
    epochs = settings.get_training_parameters('epochs')
    dictionary_size = settings.get_training_parameters('dictionary_size')
    max_headline_length = settings.get_training_parameters('max_headline_length')

    glove = Glove(dictionary_size)
    glove.load_embedding()

    model_builder = Model1Builder() \
        .set_input('glove', glove) \
        .set_parameter('max_headline_length', max_headline_length)

    model = model_builder()

    preprocessor = Preprocessor(model)
    preprocessor.set_encoder('glove', glove)
    preprocessor.set_parameter('max_headline_length', max_headline_length)

    preprocessor.load_data(['headline', 'is_top_submission'])

    training_input = [preprocessor.training_data['headline']]
    validation_input = [preprocessor.validation_data['headline']]
    training_output = [preprocessor.training_data['is_top_submission']]
    validation_output = [preprocessor.validation_data['is_top_submission']]

    class_weights = calculate_class_weights(preprocessor.training_data['is_top_submission'],
                                            [ol.name for ol in model.output_layers])

    callbacks = CallbackBuilder(model, [CsvLogger, CsvPlotter, ConfigLogger, ModelSaver])()

    model.fit(training_input, training_output, batch_size=batch_size, epochs=epochs,
              callbacks=callbacks, validation_data=(validation_input, validation_output), class_weight=class_weights)
Example #11
    def load_model(self, tag, dimensions):
        doc2vec_dir = Settings().get_doc2vec_dir()
        doc2vec_file = '{}/{}_{}.model'.format(doc2vec_dir, tag, dimensions)

        if os.path.isfile(doc2vec_file):
            print('loading doc2vec model ...')
            self.model = doc2vec.Doc2Vec.load(doc2vec_file)
        else:
            self.train_model(tag, dimensions)
            print('saving model ...')
            try:
                os.makedirs(doc2vec_dir)
            except FileExistsError:
                pass
            self.model.save(doc2vec_file)
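Once loaded, the gensim model can embed unseen documents via `infer_vector`. A hypothetical usage (the wrapper class name and the token list are invented for this sketch):

# Hypothetical usage; `Doc2VecModel` stands in for whatever class owns
# load_model above, and the token list is made up.
loader = Doc2VecModel()
loader.load_model('headline', 100)
vector = loader.model.infer_vector(['new', 'python', 'release', 'announced'])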
Example #12
def train():
    settings = Settings()

    batch_size = settings.get_training_parameters('batch_size')
    epochs = settings.get_training_parameters('epochs')
    max_headline_length = settings.get_training_parameters(
        'max_headline_length')
    max_article_length = settings.get_training_parameters('max_article_length')

    headline_numeric_log = NumericLog(max_headline_length)
    article_numeric_log = NumericLog(max_article_length)

    model_builder = Model6Builder() \
        .set_input('headline_numeric_log', headline_numeric_log) \
        .set_input('article_numeric_log', article_numeric_log)

    model = model_builder()

    preprocessor = Preprocessor(model)
    preprocessor.set_encoder('headline_numeric_log', headline_numeric_log)
    preprocessor.set_encoder('article_numeric_log', article_numeric_log)

    preprocessor.load_data([
        'headline_log_representation', 'article_log_representation',
        'is_top_submission'
    ])
    training_input = [
        preprocessor.training_data['headline_log_representation'],
        preprocessor.training_data['article_log_representation']
    ]
    validation_input = [
        preprocessor.validation_data['headline_log_representation'],
        preprocessor.validation_data['article_log_representation']
    ]
    training_output = [preprocessor.training_data['is_top_submission']]
    validation_output = [preprocessor.validation_data['is_top_submission']]

    class_weights = calculate_class_weights(
        preprocessor.training_data['is_top_submission'],
        [ol.name for ol in model.output_layers])

    callbacks = CallbackBuilder(
        model, [CsvLogger, CsvPlotter, ConfigLogger, ModelSaver])()

    model.fit(training_input,
              training_output,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(validation_input, validation_output),
              class_weight=class_weights)
Example #13
    def __init__(self,
                 user_id,
                 login,
                 expert_language,
                 second_language,
                 expert_loc=10000,
                 second_loc=1000):
        """
        :param user_id: User id
        :param login: User name
        :param expert_language: helper.language.Language object of the expert language.
        :param second_language: helper.language.Language object of the language to examine. For now, just Python.
        :param expert_loc: Lines-of-code threshold above which a user counts as an expert.
        :param second_loc: Lines-of-code threshold ensuring the examined language has a sufficiently large code base.
        """
        self.user_id = user_id
        self.login = login
        self.expert_language = expert_language
        self.second_language = second_language
        self.expert_loc = expert_loc
        self.second_loc = second_loc

        settings = Settings()
        if self.expert_language.language == 'java' and self.second_language.language == 'python':
            self.candidates_view = settings.get_database_view(
                'candidates_java_py')
            self.result_dir_path = settings.get_path(
                ['detections', 'java_python'])
        elif self.expert_language.language == 'cpp' and self.second_language.language == 'python':
            self.candidates_view = settings.get_database_view(
                'candidates_cpp_py')
            self.result_dir_path = settings.get_path(
                ['detections', 'cpp_python'])
        elif self.expert_language.language == 'fun' and self.second_language.language == 'python':
            self.candidates_view = settings.get_database_view(
                'candidates_fun_py')
            self.result_dir_path = settings.get_path(
                ['detections', 'fun_python'])
        else:
            raise ValueError(
                'No database view for this combination of languages.')
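As a design note, the if/elif chain above maps a language pair to a view name and a result directory; the same mapping could be table-driven. A sketch of that alternative (not the project's code), using only names that appear above:

        # Sketch: table-driven alternative to the if/elif chain above.
        CANDIDATE_VIEWS = {
            ('java', 'python'): ('candidates_java_py', ['detections', 'java_python']),
            ('cpp', 'python'): ('candidates_cpp_py', ['detections', 'cpp_python']),
            ('fun', 'python'): ('candidates_fun_py', ['detections', 'fun_python']),
        }
        pair = (self.expert_language.language, self.second_language.language)
        if pair not in CANDIDATE_VIEWS:
            raise ValueError('No database view for this combination of languages.')
        view_name, path_keys = CANDIDATE_VIEWS[pair]
        self.candidates_view = settings.get_database_view(view_name)
        self.result_dir_path = settings.get_path(path_keys)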
Example #14
def calculate_correlations():
    arg_parse = ArgumentParser()
    arg_parse.add_argument('--model_1', type=str)
    arg_parse.add_argument('--model_2', type=str)
    arg_parse.add_argument('--model_3', type=str)
    arg_parse.add_argument('--model_4', type=str)
    arg_parse.add_argument('--model_5', type=str)
    arg_parse.add_argument('--model_6', type=str)
    arg_parse.add_argument('--model_7', type=str)
    arguments = arg_parse.parse_args()

    settings = Settings()
    default_parameters = settings.get_training_parameters()

    glove = Glove(default_parameters['dictionary_size'])
    glove.load_embedding()

    headline_numeric_log = NumericLog(
        default_parameters['max_headline_length'])
    article_numeric_log = NumericLog(default_parameters['max_article_length'])

    print('load data...')
    preprocessor = Preprocessor(None)
    preprocessor.set_encoder('glove', glove)
    preprocessor.set_encoder('headline_numeric_log', headline_numeric_log)
    preprocessor.set_encoder('article_numeric_log', article_numeric_log)
    preprocessor.set_parameter('max_headline_length',
                               default_parameters['max_headline_length'])
    preprocessor.set_parameter('body_begin_length',
                               default_parameters['body_begin_length'])

    preprocessor.load_data([
        'headline', 'body_begin', 'category', 'minute', 'hour', 'day_of_week',
        'day_of_year', 'headline_log_representation',
        'article_log_representation', 'competitive_score'
    ])

    custom_objects = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

    print('load models...')
    model_inputs = {}
    model_inputs['model_1'] = [preprocessor.test_data['headline']]
    model_inputs['model_2'] = [preprocessor.test_data['headline']]
    model_inputs['model_3'] = [preprocessor.test_data['body_begin']]
    model_inputs['model_4'] = [preprocessor.test_data['category']]
    model_inputs['model_5'] = [
        preprocessor.test_data[key]
        for key in ['minute', 'hour', 'day_of_week', 'day_of_year']
    ]
    model_inputs['model_6'] = [
        preprocessor.test_data[key] for key in
        ['headline_log_representation', 'article_log_representation']
    ]
    model_inputs['model_7'] = [preprocessor.test_data['competitive_score']]

    print('predict...')
    predictions = {}
    for model_name in model_inputs.keys():
        if hasattr(arguments, model_name) and getattr(arguments, model_name):
            model = load_model(getattr(arguments, model_name),
                               custom_objects=custom_objects)
            predictions[model_name] = np.round(
                model.predict(model_inputs[model_name]))

    print('calculate correlation...')
    model_names = sorted(predictions.keys())
    for i, model_name_1 in enumerate(model_names):
        for model_name_2 in model_names[i + 1:]:
            # np.corrcoef returns a 2x2 correlation matrix; the off-diagonal
            # entry is the pairwise correlation of the two prediction vectors.
            correlation = np.corrcoef(predictions[model_name_1][:, -1],
                                      predictions[model_name_2][:, -1])[0, 1]
            print(model_name_1, model_name_2, correlation)
Example #15
    def __init__(self, database_name):
        settings = Settings()
        database_settings = settings.get_database_settings(database_name)

        self.r = redis.StrictRedis(**database_settings)
Example #16
    @classmethod
    def setUpClass(cls):
        # setUpClass is a class-level hook, so it needs @classmethod and cls.
        URL = Settings().get('url', index=0)
        cls.driver = Voince_page(browser_type='chrome').get(
            URL, maximize_window=False)
Example #17
    def __init__(self, figsize=(800, 600)):
        settings = Settings()
        self.figsize = figsize
        self.plot_dir = settings.get_path('plots')
Example #18
    def __init__(self):
        settings = Settings()
        self.archive_path = settings.get_path('download_archive')
        self.download_path = settings.get_path('downloads')
Example #19
    def __init__(self):
        settings = Settings()
        connection_dict = settings.get_database_settings()
        # Build a libpq-style 'key=value' DSN; renamed to avoid shadowing
        # the Settings instance above.
        connection_string = ' '.join(
            '%s=%s' % (key, value) for (key, value) in connection_dict.items())
        self.conn = psycopg2.connect(connection_string)
        self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
Example #20
    def __init__(self):
        settings = Settings()
        self.api_keys = settings.get_guardian_api_keys()
        self.fields = list(map(lambda e: e.value, GuardianApi))
Example #21
    def __init__(self):
        db_settings = Settings().get_database_settings()
        self._connection_parameters = ' '.join([
            '{}={}'.format(key, value) for (key, value) in db_settings.items()
        ])
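For reference, the join above yields a libpq-style `key=value` DSN, the string form `psycopg2.connect` accepts; with made-up settings:

# Hypothetical settings; shows the DSN string the join above produces.
db_settings = {'host': 'localhost', 'dbname': 'reddit', 'user': 'postgres'}
dsn = ' '.join('{}={}'.format(key, value) for key, value in db_settings.items())
print(dsn)  # host=localhost dbname=reddit user=postgres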
Example #22
def main():
    settings = Settings()
    java_aggregation_path = settings.get_path(['detections', 'java_python'])
    cpp_aggregation_path = settings.get_path(['detections', 'cpp_python'])
    fun_aggregation_path = settings.get_path(['detections', 'fun_python'])

    aggregation_paths = [
        java_aggregation_path, cpp_aggregation_path, fun_aggregation_path
    ]
    aggregation_paths = [p + '/detections.json' for p in aggregation_paths]

    aggregators = [Aggregator.load(p) for p in aggregation_paths]
    labels = ['Java', 'C++', 'Functional languages']

    plotter = Plotter()

    # if
    if_percentage = [a.percentage(IfDetection, 'user_id') for a in aggregators]
    if_elseif_count_aggregation = [
        a.aggregate(IfDetection, 'elseif_count', 'user_id')
        for a in aggregators
    ]
    if_has_else_aggregation = [
        a.aggregate(IfDetection, 'has_else', 'user_id') for a in aggregators
    ]

    plotter(if_percentage, labels, 'If percentage')
    plotter(if_elseif_count_aggregation, labels, 'Elseif count')
    plotter(if_has_else_aggregation, labels, 'If has else')

    # class
    def line_count(detection):
        return detection.end - detection.begin + 1

    def name_length(detection):
        return len(detection.name)

    class_definition_percentage = [
        a.percentage(ClassDefinitionDetection, 'user_id') for a in aggregators
    ]
    class_line_count_aggregation = [
        a.aggregate(ClassDefinitionDetection, line_count, 'user_id')
        for a in aggregators
    ]
    class_method_count_aggregation = [
        a.aggregate(ClassDefinitionDetection, 'method_count', 'user_id')
        for a in aggregators
    ]
    class_name_length_aggregation = [
        a.aggregate(ClassDefinitionDetection, name_length, 'user_id')
        for a in aggregators
    ]
    class_is_nested_aggregation = [
        a.aggregate(ClassDefinitionDetection, 'nested', 'user_id')
        for a in aggregators
    ]

    plotter(class_definition_percentage, labels, 'Class percentage')
    plotter(class_line_count_aggregation, labels, 'Class line count')
    plotter(class_method_count_aggregation, labels, 'Class method count')
    plotter(class_name_length_aggregation, labels, 'Class name length')
    plotter(class_is_nested_aggregation, labels, 'Class is nested')

    # built in functions
    class IsFunction:
        def __init__(self, name):
            self.name = name

        def __call__(self, detection):
            return 1 if detection.name == self.name else 0

    bif_percentage = [
        a.percentage(BuiltInFunctionDetection, 'user_id') for a in aggregators
    ]
    bif_map_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('map'), 'user_id')
        for a in aggregators
    ]
    bif_filter_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('filter'), 'user_id')
        for a in aggregators
    ]
    bif_list_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('list'), 'user_id')
        for a in aggregators
    ]
    bif_dict_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('dict'), 'user_id')
        for a in aggregators
    ]
    bif_set_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('set'), 'user_id')
        for a in aggregators
    ]

    plotter(bif_percentage, labels, 'Built in function percentage')
    plotter(bif_map_count_aggregation, labels, 'Map percentage')
    plotter(bif_filter_count_aggregation, labels, 'Filter percentage')
    plotter(bif_list_count_aggregation, labels, 'List percentage')
    plotter(bif_dict_count_aggregation, labels, 'Dict percentage')
    plotter(bif_set_count_aggregation, labels, 'Set percentage')

    # comprehensions
    list_comprehension_percentage = [
        a.percentage(ListComprehensionDetection, 'user_id')
        for a in aggregators
    ]
    dict_comprehension_percentage = [
        a.percentage(DictComprehensionDetection, 'user_id')
        for a in aggregators
    ]
    set_comprehension_percentage = [
        a.percentage(SetComprehensionDetection, 'user_id') for a in aggregators
    ]

    list_comprehension_generator_count_aggregation = [
        a.aggregate(ListComprehensionDetection, 'generator_count', 'user_id')
        for a in aggregators
    ]
    dict_comprehension_generator_count_aggregation = [
        a.aggregate(DictComprehensionDetection, 'generator_count', 'user_id')
        for a in aggregators
    ]
    set_comprehension_generator_count_aggregation = [
        a.aggregate(SetComprehensionDetection, 'generator_count', 'user_id')
        for a in aggregators
    ]

    plotter(list_comprehension_percentage, labels,
            'List comprehension percentage')
    plotter(dict_comprehension_percentage, labels,
            'Dict comprehension percentage')
    plotter(set_comprehension_percentage, labels,
            'Set comprehension percentage')
    plotter(list_comprehension_generator_count_aggregation, labels,
            'List comprehension generator count')
    plotter(dict_comprehension_generator_count_aggregation, labels,
            'Dict comprehension generator count')
    plotter(set_comprehension_generator_count_aggregation, labels,
            'Set comprehension generator count')
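All examples in this section share the same `Settings` accessors. As a closing reference, here is a minimal sketch of the interface they imply, assuming a single JSON config file; the file name, key layout, and defaults are assumptions, not the project's actual schema:

# Sketch of the Settings accessors used above; not the real implementation.
import json
import os


class Settings:
    CONFIG_FILE = 'config.json'  # hypothetical location

    def __init__(self):
        with open(self.CONFIG_FILE) as config_file:
            self._config = json.load(config_file)

    def get_training_parameters(self, key=None):
        # Called both with a key ('batch_size') and without (full dict).
        parameters = self._config['training_parameters']
        return parameters if key is None else parameters[key]

    def get_database_settings(self, database_name='postgres'):
        return self._config['databases'][database_name]

    def get_database_view(self, name):
        return self._config['database_views'][name]

    def get_path(self, key):
        # List form: a configured base path plus appended sub-directories,
        # as in get_path(['detections', 'java_python']) above.
        keys = key if isinstance(key, list) else [key]
        return os.path.join(self._config['paths'][keys[0]], *keys[1:])

    # The remaining accessors (get_csv_file, get_glove_embedding, ...) would
    # follow the same lookup pattern.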