def __init__(self):
    settings = Settings()
    self.db = PostgresDb()
    self.articles_filename = settings.get_csv_file('articles')
    self.authors_filename = settings.get_csv_file('authors')
    self.comments_filename = settings.get_csv_file('comments')

def __init__(self, expert_language, second_language, expert_loc=10000, second_loc=1000):
    """
    :param expert_language: helper.language.Language object of the expert language.
    :param second_language: helper.language.Language object of the language to examine. For now just Python.
    :param expert_loc: Lines-of-code threshold above which a user counts as an expert.
    :param second_loc: Lines-of-code threshold for the language to examine, ensuring a sufficiently large code base.
    """
    self.expert_language = expert_language
    self.second_language = second_language
    self.expert_loc = expert_loc
    self.second_loc = second_loc

    settings = Settings()
    loc_user_file_ext_view = settings.get_database_view('loc_user_file_ext')
    self.query = SQL('''
        SELECT u1.user_id, u1.login
        FROM {} AS u1
        JOIN {} AS u2 ON u1.user_id = u2.user_id
        JOIN users u ON u1.user_id = u.id
        WHERE u1.additions >= %s
          AND u1.file_ext = ANY(%s)
          AND u2.additions >= %s
          AND u2.file_ext = ANY(%s)
          AND u.type = 'USR';
        ''').format(Identifier(loc_user_file_ext_view),
                    Identifier(loc_user_file_ext_view))
    self.detection_path = settings.get_path([
        'detections',
        expert_language.language + '_' + second_language.language
    ])

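# Usage sketch (not part of the original class, assumptions labeled below): the
# composed query is a psycopg2.sql.SQL object, so it can be passed straight to
# cursor.execute() together with four parameters matching its %s placeholders,
# in order: expert LOC threshold, expert file extensions, second-language LOC
# threshold, second-language file extensions. `detector` stands for an instance
# of the class above; the connection string and extension lists are made up.
import psycopg2
import psycopg2.extras

connection = psycopg2.connect('dbname=ghtorrent user=postgres')  # example DSN
with connection.cursor(cursor_factory=psycopg2.extras.DictCursor) as cursor:
    cursor.execute(detector.query, (10000, ['java'], 1000, ['py']))
    candidates = cursor.fetchall()  # rows of (user_id, login)
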
def train():
    settings = Settings()
    batch_size = settings.get_training_parameters('batch_size')
    epochs = settings.get_training_parameters('epochs')

    model_builder = Model4Builder()
    model = model_builder()

    preprocessor = Preprocessor(model)
    preprocessor.load_data(['category', 'is_top_submission'])

    training_input = [preprocessor.training_data['category']]
    validation_input = [preprocessor.validation_data['category']]
    training_output = [preprocessor.training_data['is_top_submission']]
    validation_output = [preprocessor.validation_data['is_top_submission']]

    class_weights = calculate_class_weights(
        preprocessor.training_data['is_top_submission'],
        [ol.name for ol in model.output_layers])

    callbacks = CallbackBuilder(
        model, [CsvLogger, CsvPlotter, ConfigLogger, ModelSaver])()

    model.fit(training_input,
              training_output,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(validation_input, validation_output),
              class_weight=class_weights)

def __call__(self):
    timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    settings = Settings()
    log_path = '{}/{}_{}'.format(settings.get_training_root_dir(),
                                 self.model.name, timestamp)
    os.makedirs(log_path, exist_ok=True)

    if CsvLogger in self.callback_classes:
        csv_logger = CsvLogger(self.model.name, log_path)
        self.active_callbacks.append(csv_logger)
    if CsvPlotter in self.callback_classes:
        assert CsvLogger in self.callback_classes
        plotter = CsvPlotter(self.model, log_path)
        self.active_callbacks.append(plotter)
    if ConfigLogger in self.callback_classes:
        config_logger = ConfigLogger(self.model, log_path)
        self.active_callbacks.append(config_logger)
    if ModelSaver in self.callback_classes:
        model_saver = ModelSaver(self.model, log_path)
        self.active_callbacks.append(model_saver)

    return self.active_callbacks

def __init__(self, sha):
    self.sha = sha
    settings = Settings()
    self._downloads_path = settings.get_path('downloads')
    self._download_archive_path = settings.get_path('download_archive')
    self._earliest_project_view = settings.get_database_view('earliest_project')

def __init__(self):
    self.required_inputs = []
    self.required_parameters = []
    self.inputs = {}
    self.parameters = {}

    settings = Settings()
    self.parameters = settings.get_network_parameters()

def __init__(self, dictionary_size):
    self.dictionary_size = dictionary_size
    self.dimensions = self.DIMENSIONS
    settings = Settings()
    self.embedding_path = settings.get_glove_embedding()
    self.word_numbers = {}
    self.embedding_vectors = np.zeros((dictionary_size + 1, self.DIMENSIONS))

def __init__(self):
    settings = Settings()
    db_settings = settings.get_database_settings()
    connection_parameters = ' '.join([
        '{}={}'.format(key, value) for (key, value) in db_settings.items()
    ])
    self.connection = psycopg2.connect(connection_parameters)
    self.cursor = self.connection.cursor(
        cursor_factory=psycopg2.extras.DictCursor)

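# Illustration only (made-up values): the join above turns the settings dict
# into a libpq-style connection string, which psycopg2.connect() accepts.
db_settings = {'dbname': 'news', 'user': 'postgres', 'host': 'localhost'}
connection_parameters = ' '.join(
    '{}={}'.format(key, value) for key, value in db_settings.items())
print(connection_parameters)  # -> dbname=news user=postgres host=localhost
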
def __init__(self, model, log_path):
    super().__init__()
    settings = Settings()
    self.training_parameters = settings.get_training_parameters()
    self.network_parameters = settings.get_network_parameters()
    self.log_path = log_path
    self.filename = '{}/{}'.format(log_path, 'config.txt')
    model.summary(print_fn=self._handle_summary_print)
    self.model = model

def train():
    settings = Settings()
    batch_size = settings.get_training_parameters('batch_size')
    epochs = settings.get_training_parameters('epochs')
    dictionary_size = settings.get_training_parameters('dictionary_size')
    max_headline_length = settings.get_training_parameters('max_headline_length')

    glove = Glove(dictionary_size)
    glove.load_embedding()

    model_builder = Model1Builder() \
        .set_input('glove', glove) \
        .set_parameter('max_headline_length', max_headline_length)
    model = model_builder()

    preprocessor = Preprocessor(model)
    preprocessor.set_encoder('glove', glove)
    preprocessor.set_parameter('max_headline_length', max_headline_length)
    preprocessor.load_data(['headline', 'is_top_submission'])

    training_input = [preprocessor.training_data['headline']]
    validation_input = [preprocessor.validation_data['headline']]
    training_output = [preprocessor.training_data['is_top_submission']]
    validation_output = [preprocessor.validation_data['is_top_submission']]

    class_weights = calculate_class_weights(
        preprocessor.training_data['is_top_submission'],
        [ol.name for ol in model.output_layers])

    callbacks = CallbackBuilder(
        model, [CsvLogger, CsvPlotter, ConfigLogger, ModelSaver])()

    model.fit(training_input,
              training_output,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(validation_input, validation_output),
              class_weight=class_weights)

def load_model(self, tag, dimensions):
    doc2vec_dir = Settings().get_doc2vec_dir()
    doc2vec_file = '{}/{}_{}.model'.format(doc2vec_dir, tag, dimensions)
    if os.path.isfile(doc2vec_file):
        print('loading doc2vec model ...')
        self.model = doc2vec.Doc2Vec.load(doc2vec_file)
    else:
        self.train_model(tag, dimensions)
        print('saving model ...')
        try:
            os.makedirs(doc2vec_dir)
        except FileExistsError:
            pass
        self.model.save(doc2vec_file)

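# Hypothetical call site (not in the original file): load the cached doc2vec
# model, or train and save it on the first run, then embed a tokenized
# headline with gensim's infer_vector(). The wrapper class name, tag, and
# vector size below are made-up examples.
embedder = Doc2VecEmbedder()           # assumed name of the class defined above
embedder.load_model('headline', 300)   # example tag and dimension count
vector = embedder.model.infer_vector(['new', 'python', 'release', 'announced'])
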
def train():
    settings = Settings()
    batch_size = settings.get_training_parameters('batch_size')
    epochs = settings.get_training_parameters('epochs')
    max_headline_length = settings.get_training_parameters('max_headline_length')
    max_article_length = settings.get_training_parameters('max_article_length')

    headline_numeric_log = NumericLog(max_headline_length)
    article_numeric_log = NumericLog(max_article_length)

    model_builder = Model6Builder() \
        .set_input('headline_numeric_log', headline_numeric_log) \
        .set_input('article_numeric_log', article_numeric_log)
    model = model_builder()

    preprocessor = Preprocessor(model)
    preprocessor.set_encoder('headline_numeric_log', headline_numeric_log)
    preprocessor.set_encoder('article_numeric_log', article_numeric_log)
    preprocessor.load_data([
        'headline_log_representation', 'article_log_representation',
        'is_top_submission'
    ])

    training_input = [
        preprocessor.training_data['headline_log_representation'],
        preprocessor.training_data['article_log_representation']
    ]
    validation_input = [
        preprocessor.validation_data['headline_log_representation'],
        preprocessor.validation_data['article_log_representation']
    ]
    training_output = [preprocessor.training_data['is_top_submission']]
    validation_output = [preprocessor.validation_data['is_top_submission']]

    class_weights = calculate_class_weights(
        preprocessor.training_data['is_top_submission'],
        [ol.name for ol in model.output_layers])

    callbacks = CallbackBuilder(
        model, [CsvLogger, CsvPlotter, ConfigLogger, ModelSaver])()

    model.fit(training_input,
              training_output,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_data=(validation_input, validation_output),
              class_weight=class_weights)

def __init__(self, user_id, login, expert_language, second_language, expert_loc=10000, second_loc=1000):
    """
    :param user_id: User id
    :param login: User name
    :param expert_language: helper.language.Language object of the expert language.
    :param second_language: helper.language.Language object of the language to examine. For now just Python.
    :param expert_loc: Lines-of-code threshold above which a user counts as an expert.
    :param second_loc: Lines-of-code threshold for the language to examine, ensuring a sufficiently large code base.
    """
    self.user_id = user_id
    self.login = login
    self.expert_language = expert_language
    self.second_language = second_language
    self.expert_loc = expert_loc
    self.second_loc = second_loc

    settings = Settings()
    if self.expert_language.language == 'java' and self.second_language.language == 'python':
        self.candidates_view = settings.get_database_view('candidates_java_py')
        self.result_dir_path = settings.get_path(['detections', 'java_python'])
    elif self.expert_language.language == 'cpp' and self.second_language.language == 'python':
        self.candidates_view = settings.get_database_view('candidates_cpp_py')
        self.result_dir_path = settings.get_path(['detections', 'cpp_python'])
    elif self.expert_language.language == 'fun' and self.second_language.language == 'python':
        self.candidates_view = settings.get_database_view('candidates_fun_py')
        self.result_dir_path = settings.get_path(['detections', 'fun_python'])
    else:
        raise ValueError('No database view for this combination of languages.')

def calculate_correlations():
    arg_parse = ArgumentParser()
    arg_parse.add_argument('--model_1', type=str)
    arg_parse.add_argument('--model_2', type=str)
    arg_parse.add_argument('--model_3', type=str)
    arg_parse.add_argument('--model_4', type=str)
    arg_parse.add_argument('--model_5', type=str)
    arg_parse.add_argument('--model_6', type=str)
    arg_parse.add_argument('--model_7', type=str)
    arguments = arg_parse.parse_args()

    settings = Settings()
    default_parameters = settings.get_training_parameters()

    glove = Glove(default_parameters['dictionary_size'])
    glove.load_embedding()

    headline_numeric_log = NumericLog(default_parameters['max_headline_length'])
    article_numeric_log = NumericLog(default_parameters['max_article_length'])

    print('load data...')
    preprocessor = Preprocessor(None)
    preprocessor.set_encoder('glove', glove)
    preprocessor.set_encoder('headline_numeric_log', headline_numeric_log)
    preprocessor.set_encoder('article_numeric_log', article_numeric_log)
    preprocessor.set_parameter('max_headline_length',
                               default_parameters['max_headline_length'])
    preprocessor.set_parameter('body_begin_length',
                               default_parameters['body_begin_length'])
    preprocessor.load_data([
        'headline', 'body_begin', 'category', 'minute', 'hour', 'day_of_week',
        'day_of_year', 'headline_log_representation',
        'article_log_representation', 'competitive_score'
    ])

    custom_objects = {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

    print('load models...')
    model_inputs = {}
    model_inputs['model_1'] = [preprocessor.test_data['headline']]
    model_inputs['model_2'] = [preprocessor.test_data['headline']]
    model_inputs['model_3'] = [preprocessor.test_data['body_begin']]
    model_inputs['model_4'] = [preprocessor.test_data['category']]
    model_inputs['model_5'] = [
        preprocessor.test_data[key]
        for key in ['minute', 'hour', 'day_of_week', 'day_of_year']
    ]
    model_inputs['model_6'] = [
        preprocessor.test_data[key]
        for key in ['headline_log_representation', 'article_log_representation']
    ]
    model_inputs['model_7'] = [preprocessor.test_data['competitive_score']]

    print('predict...')
    predictions = {}
    for model_name in model_inputs.keys():
        if hasattr(arguments, model_name) and getattr(arguments, model_name):
            model = load_model(getattr(arguments, model_name),
                               custom_objects=custom_objects)
            predictions[model_name] = np.round(
                model.predict(model_inputs[model_name]))

    print('calculate correlation...')
    for model_name_1 in predictions.keys():
        for model_name_2 in predictions.keys():
            if model_name_1 != model_name_2:
                correlation = np.corrcoef(predictions[model_name_1][:, -1],
                                          predictions[model_name_2][:, -1])[0]
                print(model_name_1, model_name_2, correlation[1])

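# Small worked example of the indexing used above: for two 1-D arrays,
# np.corrcoef returns a 2x2 matrix whose off-diagonal entry [0][1] is the
# correlation between the two prediction vectors. The values are made up.
import numpy as np

a = np.array([1.0, 0.0, 1.0, 1.0])
b = np.array([1.0, 0.0, 0.0, 1.0])
print(np.corrcoef(a, b)[0][1])  # Pearson correlation of a and b (about 0.58)
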
def __init__(self, database_name):
    settings = Settings()
    database_settings = settings.get_database_settings(database_name)
    self.r = redis.StrictRedis(**database_settings)

def setUpClass(self):
    URL = Settings().get('url', index=0)
    self.driver = Voince_page(browser_type='chrome').get(
        URL, maximize_window=False)

def __init__(self, figsize=(800, 600)):
    settings = Settings()
    self.figsize = figsize
    self.plot_dir = settings.get_path('plots')

def __init__(self):
    settings = Settings()
    self.archive_path = settings.get_path('download_archive')
    self.download_path = settings.get_path('downloads')

def __init__(self):
    settings = Settings()
    connection_dict = settings.get_database_settings()
    # Build the connection string under its own name instead of rebinding
    # `settings`, which would shadow the Settings instance above.
    connection_parameters = ' '.join(
        ['%s=%s' % (key, value) for (key, value) in connection_dict.items()])
    self.conn = psycopg2.connect(connection_parameters)
    self.cur = self.conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

def __init__(self):
    settings = Settings()
    self.api_keys = settings.get_guardian_api_keys()
    self.fields = list(map(lambda e: e.value, GuardianApi))

def __init__(self):
    db_settings = Settings().get_database_settings()
    self._connection_parameters = ' '.join([
        '{}={}'.format(key, value) for (key, value) in db_settings.items()
    ])

def main():
    settings = Settings()
    java_aggregation_path = settings.get_path(['detections', 'java_python'])
    cpp_aggregation_path = settings.get_path(['detections', 'cpp_python'])
    fun_aggregation_path = settings.get_path(['detections', 'fun_python'])
    aggregation_paths = [
        java_aggregation_path, cpp_aggregation_path, fun_aggregation_path
    ]
    aggregation_paths = [p + '/detections.json' for p in aggregation_paths]
    aggregators = [Aggregator.load(p) for p in aggregation_paths]
    labels = ['Java', 'C++', 'func languages']

    plotter = Plotter()

    # if
    if_percentage = [a.percentage(IfDetection, 'user_id') for a in aggregators]
    if_elseif_count_aggregation = [
        a.aggregate(IfDetection, 'elseif_count', 'user_id') for a in aggregators
    ]
    if_has_else_aggregation = [
        a.aggregate(IfDetection, 'has_else', 'user_id') for a in aggregators
    ]
    plotter(if_percentage, labels, 'If percentage')
    plotter(if_elseif_count_aggregation, labels, 'Elseif count')
    plotter(if_has_else_aggregation, labels, 'If has else')

    # class
    def line_count(detection):
        return detection.end - detection.begin + 1

    def name_length(detection):
        return len(detection.name)

    class_definition_percentage = [
        a.percentage(ClassDefinitionDetection, 'user_id') for a in aggregators
    ]
    class_line_count_aggregation = [
        a.aggregate(ClassDefinitionDetection, line_count, 'user_id')
        for a in aggregators
    ]
    class_method_count_aggregation = [
        a.aggregate(ClassDefinitionDetection, 'method_count', 'user_id')
        for a in aggregators
    ]
    class_name_length_aggregation = [
        a.aggregate(ClassDefinitionDetection, name_length, 'user_id')
        for a in aggregators
    ]
    class_is_nested_aggregation = [
        a.aggregate(ClassDefinitionDetection, 'nested', 'user_id')
        for a in aggregators
    ]
    plotter(class_definition_percentage, labels, 'Class percentage')
    plotter(class_line_count_aggregation, labels, 'Class line count')
    plotter(class_method_count_aggregation, labels, 'Class method count')
    plotter(class_name_length_aggregation, labels, 'Class name length')
    plotter(class_is_nested_aggregation, labels, 'Class is nested')

    # built in functions
    class IsFunction:
        def __init__(self, name):
            self.name = name

        def __call__(self, detection):
            return 1 if detection.name == self.name else 0

    bif_percentage = [
        a.percentage(BuiltInFunctionDetection, 'user_id') for a in aggregators
    ]
    bif_map_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('map'), 'user_id')
        for a in aggregators
    ]
    bif_filter_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('filter'), 'user_id')
        for a in aggregators
    ]
    bif_list_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('list'), 'user_id')
        for a in aggregators
    ]
    bif_dict_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('dict'), 'user_id')
        for a in aggregators
    ]
    bif_set_count_aggregation = [
        a.aggregate(BuiltInFunctionDetection, IsFunction('set'), 'user_id')
        for a in aggregators
    ]
    plotter(bif_percentage, labels, 'Built in function percentage')
    plotter(bif_map_count_aggregation, labels, 'Map percentage')
    plotter(bif_filter_count_aggregation, labels, 'Filter percentage')
    plotter(bif_list_count_aggregation, labels, 'List percentage')
    plotter(bif_dict_count_aggregation, labels, 'Dict percentage')
    plotter(bif_set_count_aggregation, labels, 'Set percentage')

    # comprehensions
    list_comprehension_percentage = [
        a.percentage(ListComprehensionDetection, 'user_id') for a in aggregators
    ]
    dict_comprehension_percentage = [
        a.percentage(DictComprehensionDetection, 'user_id') for a in aggregators
    ]
    set_comprehension_percentage = [
        a.percentage(SetComprehensionDetection, 'user_id') for a in aggregators
    ]
    list_comprehension_generator_count_aggregation = [
        a.aggregate(ListComprehensionDetection, 'generator_count', 'user_id')
        for a in aggregators
    ]
    dict_comprehension_generator_count_aggregation = [
        a.aggregate(DictComprehensionDetection, 'generator_count', 'user_id')
        for a in aggregators
    ]
    set_comprehension_generator_count_aggregation = [
        a.aggregate(SetComprehensionDetection, 'generator_count', 'user_id')
        for a in aggregators
    ]
    plotter(list_comprehension_percentage, labels, 'List comprehension percentage')
    plotter(dict_comprehension_percentage, labels, 'Dict comprehension percentage')
    plotter(set_comprehension_percentage, labels, 'Set comprehension percentage')
    plotter(list_comprehension_generator_count_aggregation, labels,
            'List comprehension generator count')
    plotter(dict_comprehension_generator_count_aggregation, labels,
            'Dict comprehension generator count')
    plotter(set_comprehension_generator_count_aggregation, labels,
            'Set comprehension generator count')