def __init__(self):
    """
    Initiator class to load all necessary configurations,
    logger and HTTP client instances.
    """
    config_parser = ConfigParser()
    self.config = config_parser.return_json(
        path_to_config='TOREPLACE/config/example.json')
def load_from(config_meta):
    with open(config_meta['path']) as f:
        config = yaml.load(f)

    parser = ConfigParser(config)
    parser._create_directories()

    task = Task.load_from(parser.task)
    dataset = Dataset.load_from(parser.dataset)
    model_config = config['model']
    label_helper = Label.load_from(parser.label)
    user = config['user']

    # Set up logger
    log_level = config_meta['log_level']
    logger = logging.getLogger('label_app')
    logger.setLevel(getattr(logging, log_level))
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(
        logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logger.addHandler(ch)

    return LabelApp(task, dataset, label_helper, user, model_config, parser,
                    logger)
def load_from(config_path):
    with open(config_path) as f:
        config = yaml.load(f)

    parser = ConfigParser(config)
    parser._create_directories()

    task = Task.load_from(parser.task)
    dataset = Dataset.load_from(parser.dataset)
    model_config = config['model']
    label_helper = Label.load_from(parser.label)
    user = config['user']

    return LabelApp(task, dataset, label_helper, user, model_config, parser)
def main(config_path, epochs=3):
    # For pretraining, we do everything the same, except we replace the
    # dataset:judgements_file with model:pretrain_file.
    with open(config_path) as f:
        config = yaml.load(f)

    parser = ConfigParser(config)
    parser.dataset['judgements_file'] = parser.model['pretrain_file']

    task = Task.load_from(parser.task)
    dataset = PretrainJSONDataset(parser.dataset)
    model_config = config['model']
    label_helper = Label.load_from(parser.label)
    user = config['user']

    label_app = LabelApp(task, dataset, label_helper, user, model_config,
                         parser)
    label_app.trainer.load_existing()
    label_app.trainer.train_epochs(epochs=epochs)
def __init__(self, module: str, cli: str):
    self.current_env = os.environ.copy()
    self.json_data = ConfigParser(
        path="/home/vlad/infra/armature/armature/conf/modules.json"
    ).return_json()
    self.module = module
    self.cli = cli
    self.module_data = self.json_data['modules'][module]
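# Illustrative sketch only (not from the original source): an assumed minimal
# layout for the modules.json file read above. The 'modules' key, the 'packer'
# module name and the 'prepare_template' command name appear elsewhere in this
# collection; the command string itself is hypothetical.
import json

example_modules_json = json.loads("""
{
    "modules": {
        "packer": {
            "cli": {
                "prepare_template": "packer build template.json"
            }
        }
    }
}
""")

# Executor resolves its module entry via json_data['modules'][module].
assert example_modules_json["modules"]["packer"]["cli"]["prepare_template"]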
def main():
    """
    Main function.
    """
    # set log, parse args, and read configuration
    args = parse_argument()
    configparser = ConfigParser(args.config_file)
    set_logging(configparser.getstr('logfile'))

    # parse FoodOn
    parse_foodon = ParseFoodOn(configparser.getstr('foodon_parse_config'))

    classes_dict = parse_foodon.get_candidate_classes()
    classes_dict_skeleton, candidate_entities = parse_foodon.get_seeded_skeleton(
        classes_dict)

    # run
    scoring_manager = ScoringManager(classes_dict_skeleton,
                                     candidate_entities,
                                     configparser.getstr('scoring_config'))
    scoring_manager.run_iteration()
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # baseline / best classifiers
    baseline_classifier = main_config.get_str('baseline')
    best_classifier = main_config.get_str('classifier')

    # plot PR curve and print confusion matrix
    plot_pr_print_cm(baseline_classifier, best_classifier, main_config,
                     model_manager)
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # parse config
    classifier = main_config.get_str('classifier')
    pre_built_models_dir = os.path.join(
        main_config.get_str('pre_built_models_dir'), classifier)
    num_classifiers = main_config.get_int('num_classifiers')

    # we need to build the models first if they do not exist
    if not dir_exists(pre_built_models_dir):
        save_models(classifier, pre_built_models_dir, main_config,
                    model_manager, num_classifiers)

    make_recommendation(classifier, pre_built_models_dir, main_config,
                        model_manager, num_classifiers)
def save_models(classifier, pre_built_models_dir, main_config, model_manager,
                num_classifiers):
    log.info('Pre-built model directory specified for %s does not exist.',
             classifier)
    log.info('Building models again.')

    # create directory
    create_dir(pre_built_models_dir)

    # load config parsers
    preprocess_config = ConfigParser(main_config.get_str('preprocess_config'))
    classifier_config = ConfigParser(main_config.get_str('classifier_config'))
    classifier_config.overwrite('classifier', classifier)

    # perform preprocessing
    X, y = model_manager.preprocess(preprocess_config, section=classifier)

    # select subset of features if requested
    selected_features = main_config.get_str_list('selected_features')
    if selected_features:
        log.info('Selecting subset of features: %s', selected_features)
        X = X[selected_features]

    # train multiple classifiers
    for i in range(num_classifiers):
        log.debug('Processing classifier %d/%s', i + 1, num_classifiers)

        cmanager = ClassifierManager(classifier_config)
        clf = CalibratedClassifierCV(cmanager.get_classifier(),
                                     method='sigmoid',
                                     cv=5)
        clf.fit(X, y)

        save_pkl(
            clf,
            os.path.join(pre_built_models_dir, 'model_{}.pkl'.format(i)))
def main():
    """
    Main function.
    """
    # set log, parse args, and read configuration
    set_logging()
    args = parse_argument()
    configparser = ConfigParser(args.config_file)

    # load data to train with
    sentence_column = configparser.getstr('sentence_column')
    pd_data = pd.read_csv(configparser.getstr('input_filepath'), sep='\t')
    pd_data.fillna('', inplace=True)
    pd_data = pd_data[pd_data[sentence_column] != '']

    # use specified column as sentences
    sentences = pd_data[sentence_column].tolist()
    sentences = [sentence.split() for sentence in sentences]

    # init word2vec manager
    w2vm = Word2VecManager(args.config_file)

    # start training and load pre-training data if prompted
    if configparser.getbool('pre_train'):
        pretrained = configparser.getstr('pre_trained_vectors')
    else:
        pretrained = None

    w2vm.train(sentences, pretrained=pretrained)

    # save word embeddings and model
    w2vm.save_model(configparser.getstr('model_saveto'))
    w2vm.save_vectors(configparser.getstr('vectors_saveto'))
    w2vm.save_loss(configparser.getstr('loss_saveto'))
def __init__(self, candidate_classes_info, candidate_entities,
             scoring_config):
    """
    Class initializer.
    """
    # config parser
    if isinstance(scoring_config, str):
        scoring_config = ConfigParser(scoring_config)

    # save arguments
    self.candidate_classes_info = candidate_classes_info
    self.candidate_entities = candidate_entities

    # parse config file
    self.alpha = scoring_config.getfloat('alpha')
    self.num_mapping_per_iteration = scoring_config.getint(
        'num_mapping_per_iteration')
    self.initial_siblings_scores = scoring_config.getstr(
        'initial_siblings_scores')
    self.initial_parents_scores = scoring_config.getstr(
        'initial_parents_scores')
    self.pairs_filepath = scoring_config.getstr('pairs_filepath')
    self.populated_filepath = scoring_config.getstr('populated_filepath')
    self.preprocess_config_filepath = scoring_config.getstr(
        'preprocess_config')
    self.similarity_method = scoring_config.getstr('similarity_method')

    log.debug('alpha: %f', self.alpha)
    log.debug('num_mapping_per_iteration: %d', self.num_mapping_per_iteration)
    log.debug('initial_siblings_scores: %s', self.initial_siblings_scores)
    log.debug('initial_parents_scores: %s', self.initial_parents_scores)
    log.debug('pairs_filepath: %s', self.pairs_filepath)
    log.debug('populated_filepath: %s', self.populated_filepath)
    log.debug('similarity_method: %s', self.similarity_method)

    # preprocess manager
    self.fpm = FdcPreprocessManager(self.preprocess_config_filepath)

    # number of candidate classes & entities
    self.num_candidate_classes = len(self.candidate_classes_info)
    self.num_candidate_entities = len(self.candidate_entities)
    log.debug('Number of candidate classes: %d', self.num_candidate_classes)
    log.debug('Number of candidate entities: %d', self.num_candidate_entities)

    # extract the seeded entities to make complete list of entities
    seed_entities = self._unpack_sublist(
        [x[1] for _, x in self.candidate_classes_info.items()])
    self.all_entity_labels = list(
        set(self.candidate_entities + seed_entities))

    # all labels of candidate class
    self.candidate_classes_label = list(self.candidate_classes_info.keys())

    # complete list of class labels
    other_classes = self._unpack_sublist(
        [x[0] for _, x in self.candidate_classes_info.items()], depth=2)
    self.all_class_labels = list(
        set(self.candidate_classes_label + other_classes))

    # calculate embedding lookup table for class / entity labels
    if 'we_' in self.similarity_method:
        self.keyed_vectors = KeyedVectors.load_word2vec_format(
            scoring_config.getstr('word_embeddings'))
        # self.keyed_vectors.save('./output/glove_wiki_embeddings.bin')
        # self.keyed_vectors = KeyedVectors.load('./output/glove_wiki_embeddings.bin')

        self.pd_class_label_embeddings = self._calculate_label_embeddings(
            self.all_class_labels)
        self.pd_entity_label_embeddings = self._calculate_label_embeddings(
            self.all_entity_labels)

        # save_pkl(self.pd_class_label_embeddings, './output/pd_class_label_embeddings.pkl')
        # save_pkl(self.pd_entity_label_embeddings, './output/pd_entity_label_embeddings.pkl')
        # sys.exit()

        # self.pd_class_label_embeddings = load_pkl('./output/pd_class_label_embeddings.pkl')
        # self.pd_entity_label_embeddings = load_pkl('./output/pd_entity_label_embeddings.pkl')

    # do initial calculation of the scores
    self.pd_siblings_scores, self.pd_parents_scores = self._calculate_initial_scores()
    print("==> Stitching frames to create final output videos")
    stitch_videos(model_output_path, cfg.paths.frames, predictions_dict)

    # Delete frame directories
    for video in predictions_dict.keys():
        directory_path = f"{model_output_path}/{video}"
        shutil.rmtree(directory_path)

    print("==> Generating confusion matrix")
    metrics.compute_confusion_matrix(predictions_dict, classes,
                                     model_output_path)

    # Upload to AWS S3 only if bucket name is given in config
    if cfg.bucket:
        print("==> Creating zip file")
        zip_videos(model_output_path, cfg.name)

        print("==> Uploading to AWS S3")
        response = upload_videos(model_output_path, cfg.name, cfg.bucket)
        if response:
            print(f"Output download link: {response}")

    total_time = datetime.datetime.now() - start_time
    print(f"Total time: {total_time.total_seconds()}")


if __name__ == "__main__":
    cfg = ConfigParser().config
    main(cfg)
def make_recommendation(classifier, pre_built_models_dir, main_config,
                        model_manager, num_classifiers):
    if not dir_exists(pre_built_models_dir):
        raise RuntimeError('Pre-built model directory does not exist!')

    log.info('Using pre-built model directory: %s', pre_built_models_dir)

    # load config parsers
    preprocess_config = ConfigParser(
        main_config.get_str('preprocess_recommender_config'))
    classifier_config = ConfigParser(main_config.get_str('classifier_config'))
    classifier_config.overwrite('classifier', classifier)

    # perform preprocessing
    X, y = model_manager.preprocess(preprocess_config,
                                    section=classifier,
                                    final_model=True)

    # select subset of features if requested
    selected_features = main_config.get_str_list('selected_features')
    if selected_features:
        log.info('Selecting subset of features: %s', selected_features)
        X = X[selected_features]

    def _revert_column(pd_data):
        values = list(set(pd_data.tolist()))
        replace_dict = {}
        for value in values:
            replace_dict[value] = list(filter(lambda a: a != value, values))[0]
        return pd_data.replace(to_replace=replace_dict)

    # get test data and its inverse for TRT column
    X_inv = X.copy()
    X_inv['TRT'] = _revert_column(X_inv['TRT'])
    pos_trt_idx = (X['TRT'] == 1.0)

    y_probs = []
    y_probs_inv = []
    for i in range(num_classifiers):
        log.debug('Processing classifier %d/%s', i + 1, num_classifiers)

        classifier_filepath = os.path.join(pre_built_models_dir,
                                           'model_{}.pkl'.format(i))
        log.debug('Loading classifier: %s', classifier_filepath)
        clf = load_pkl(classifier_filepath)

        y_probs.append(clf.predict_proba(X)[:, 1])
        y_probs_inv.append(clf.predict_proba(X_inv)[:, 1])

    y_probs = pd.DataFrame(y_probs).T
    y_probs.index = X.index
    y_probs_inv = pd.DataFrame(y_probs_inv).T
    y_probs_inv.index = X.index

    # make recommendation
    y_probs_avg = y_probs.mean(axis=1)
    y_probs_inv_avg = y_probs_inv.mean(axis=1)

    y_probs_avg_diff = y_probs_avg - y_probs_inv_avg
    inv_minus_pos = y_probs_inv_avg - y_probs_avg
    y_probs_avg_diff[~pos_trt_idx] = inv_minus_pos[~pos_trt_idx]

    pval = pd.Series(index=X.index)
    for index, _ in pval.items():
        _, pval[index] = ttest_rel(y_probs.loc[index], y_probs_inv.loc[index])

    # calculate y_probs_trt / right now it's y_probs ################
    pd_concat = pd.concat(
        [pos_trt_idx, y_probs_avg, y_probs_inv_avg, y_probs_avg_diff, pval],
        axis=1)
    pd_concat.columns = [
        'pos_trt', 'y_probs_avg', 'y_probs_inv_avg', 'y_probs_avg_diff', 'pval'
    ]

    print(pd_concat)
# -*- coding: utf-8 -*-
from db.db_worker import DBWorker
from db.sql_requests import SQLRequests
from utils.config_parser import ConfigParser
from utils.settings_parser import SettingsParser
import logging

if __name__ == "__main__":
    config = ConfigParser().get_config_settings()
    settings = SettingsParser().get_test_settings()

    logging.basicConfig(filename=config['log_filename'],
                        level=logging.INFO,
                        format=config['log_format'])
    logger = logging.getLogger()

    db_worker = DBWorker(config['host'], config['user'], config['password'],
                         config['database'])
    sql_request = SQLRequests(db_worker, settings)

    try:
        db_worker.connect()

        logger.info('Step 1')
        sql_request.get_min_working_time()
        logger.info('-' * 200)

        logger.info('Step 2')
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # perform analysis on these classifiers
    classifiers = main_config.get_str_list('classifier')

    # do prediction
    classifiers_ys = {}
    for classifier in classifiers:
        log.info('Running model for classifier \'%s\'', classifier)

        # load config parsers
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config, section=classifier)

        # run classification model
        classifier_config.overwrite('classifier', classifier)
        X = model_manager.feature_selector(X, y, classifier_config)
        score_avg, score_std, ys = model_manager.run_model_cv(
            X, y, 'f1', classifier_config)

        classifiers_ys[classifier] = ys

    # plot PR curve
    fig = plt.figure()
    lines = []
    labels = []

    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys
        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)

        line, label = plot_pr(y_trues, y_probs_1, classifier)
        lines.append(line)
        labels.append(label)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('PR Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})

    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'),
                     'pr_curve.png'))

    # plot ROC curve
    fig = plt.figure()
    lines = []
    labels = []

    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys
        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)

        line, label = plot_roc(y_trues, y_probs_1, classifier)
        lines.append(line)
        labels.append(label)

    # plt.plot([0, 1], [0, 1], color='k', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(lines, labels, loc='lower right', prop={'size': 8})

    save_figure(
        fig,
        os.path.join(main_config.get_str('visualization_dir'),
                     'roc_curve.png'))
def main():
    """
    Main function.
    """
    # parse args
    args = parse_argument()

    # load main config file and set logging
    main_config = ConfigParser(args.config_file)
    set_logging(log_file=main_config.get_str('log_file'))

    # initialize model manager object
    model_manager = ModelManager()

    # run models for all possible combination of preprocessing
    scale_modes = main_config.get_str_list('scale_mode')
    mvi_modes = main_config.get_str_list('mvi_mode')
    outlier_modes = main_config.get_str_list('outlier_mode')
    classifiers = main_config.get_str_list('classifier')

    classifier_score_dict = {classifier: 0 for classifier in classifiers}
    classifier_best_combination_dict = {
        classifier: None
        for classifier in classifiers
    }

    all_combinations = [scale_modes, mvi_modes, outlier_modes, classifiers]
    all_combinations = list(itertools.product(*all_combinations))

    failed_combinations = []
    for idx, combination in enumerate(all_combinations):
        # unpack the tuple
        scale_mode = combination[0]
        mvi_mode = combination[1]
        outlier_mode = combination[2]
        classifier = combination[3]

        # log current combination
        combination_str_joined = ', '.join(list(combination))
        log.info('Running grid search %d/%d: (%s)', idx + 1,
                 len(all_combinations), combination_str_joined)

        # some classifiers must use minmax scaler
        if classifier in ['MultinomialNB', 'CategoricalNB'
                          ] and scale_mode != 'minmax':
            log.info('Skipping this combination...')
            continue

        # overwrite the config file using the current combination
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        preprocess_config.overwrite('scale_mode', scale_mode)
        preprocess_config.overwrite('mvi_mode', mvi_mode)
        preprocess_config.overwrite('outlier_mode', outlier_mode)
        classifier_config.overwrite('classifier', classifier)

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config)

        # run classification model
        try:
            score = model_manager.grid_search(
                X, y, main_config.get_str('optimize_scoring'),
                classifier_config,
                main_config.get_str('updated_classifier_config'))
        except (IndexError, ValueError) as e:
            failed_combinations.append(combination_str_joined)
            log.error(e)
            continue

        # update the best preprocessing combination
        if classifier_score_dict[classifier] < score:
            classifier_score_dict[classifier] = score
            classifier_best_combination_dict[
                classifier] = combination_str_joined

    log.info('Best %s score for each classifier: %s',
             main_config.get_str('optimize_scoring'), classifier_score_dict)
    log.info(
        'Preprocessing combination of the best %s score for each classifier: %s',
        main_config.get_str('optimize_scoring'),
        classifier_best_combination_dict)
    log.info('%d failed combinations: %s', len(failed_combinations),
             failed_combinations)
import click

from modules.executor import Executor
from utils.config_parser import ConfigParser

json_data = ConfigParser(
    path="/home/vlad/infra/armature/armature/conf/modules.json").return_json()
MODULE = "packer"


@click.group()
def cli():
    pass


@cli.command()
def prepare_template():
    """Prepare the Packer template"""
    click.echo('prepare_template')
    with Executor(module=MODULE, cli="prepare_template") as cli_executor:
        cli_executor.run(cli="prepare_template", use_docker_run_wrapper=True)


@cli.command()
def validate_template():
    """Validate configuration file"""
    click.echo('validate_template')
    with Executor(module=MODULE, cli="validate_template") as cli_executor:
        cli_executor.run(cli="validate_template", use_docker_run_wrapper=True)
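# Not part of the original snippet: a typical click entry point, assuming this
# module is meant to be run directly (e.g. `python packer.py validate_template`).
if __name__ == '__main__':
    cli()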
class FdcPreprocessManager:
    """
    Class for preprocessing the FDC data.
    """

    def __init__(self, config_filepath):
        """
        Class initializer.

        Inputs:
            config_filepath: (str) Configuration filepath.
        """
        self.configparser = ConfigParser(config_filepath)

    def _load_synonym_map(self, section='filter'):
        pd_map = pd.read_csv(self.configparser.getstr('synonym_map', section),
                             sep='\t',
                             index_col='from')
        return pd_map['to'].to_dict()

    def _map_synonyms(self, text, table):
        regex_str = '|'.join(r'\b%s\b' % re.escape(s) for s in table)
        return re.sub(regex_str, lambda x: table[x.group(0)], text)

    def _generate_custom_stopwords(self, section='filter'):
        """
        (Private) Generate custom stopwords by adding or removing user
        specified stopwords to the gensim's default stopwords.

        Inputs:
            section: (str, optional) Section name of the .ini file.

        Returns:
            (frozenset) New updated stopwords.
        """
        my_stopwords = list(gpp.STOPWORDS)

        # stopwords to add
        to_add_filename = self.configparser.getstr('stopwords_to_add', section)
        with open(to_add_filename, 'r') as file:
            to_add_list = file.read().splitlines()

        if len(to_add_list) > 0:
            log.info('Adding custom stopwords %s', to_add_list)
        else:
            log.info('Not adding any custom stopwords')

        # stopwords to remove
        to_remove_filename = self.configparser.getstr('stopwords_to_remove',
                                                      section)
        with open(to_remove_filename, 'r') as file:
            to_remove_list = file.read().splitlines()

        if len(to_remove_list) > 0:
            log.info('Removing stopwords %s', to_remove_list)
        else:
            log.info('Not removing any custom stopword')

        # add and remove stopwords
        my_stopwords.extend(to_add_list)
        my_stopwords = [x for x in my_stopwords if x not in to_remove_list]

        return frozenset(my_stopwords)

    def _custom_remove_stopwords(self, s, stopwords):
        """
        (Private) Custom remove stopwords function.

        Inputs:
            s: (str) String to process.
            stopwords: (frozenset) Custom stopwords.

        Returns:
            (str) Preprocessed string with stopwords removed.
        """
        s = gensim_utils.to_unicode(s)
        return " ".join(w for w in s.split() if w not in stopwords)

    def _custom_lemmatize(self, text):
        result = gensim_utils.lemmatize(text)
        result = b' '.join(result).decode('utf-8')
        result = re.sub(r'/[^\s]+', '', result)
        return result

    def _build_custom_filter_list(self, section='filter'):
        """
        (Private) Build list of filters based on the configuration file
        that will be applied by gpp.preprocess_string().

        Inputs:
            section: (str, optional) Section name of the .ini file.

        Returns:
            custom_filters: (list) List of functions.
        """
        custom_filters = []

        if self.configparser.getbool('lower', section):
            log.debug('Converting to lower cases')
            custom_filters.append(lambda x: x.lower())

        if self.configparser.getbool('map_synonym', section):
            log.debug('Mapping synonym')
            map_table = self._load_synonym_map(section)
            custom_filters.append(lambda x: self._map_synonyms(x, map_table))

        if self.configparser.getbool('strip_punctuation', section):
            log.debug('Stripping punctuation')
            custom_filters.append(gpp.strip_punctuation)

        if self.configparser.getbool('strip_multiple_whitespaces', section):
            log.debug('Stripping multiple whitespaces')
            custom_filters.append(gpp.strip_multiple_whitespaces)

        if self.configparser.getbool('strip_numeric', section):
            log.debug('Stripping numeric')
            custom_filters.append(gpp.strip_numeric)

        if self.configparser.getbool('remove_stopwords', section):
            log.debug('Removing stopwords')
            stopwords = self._generate_custom_stopwords(section)
            custom_filters.append(
                lambda x: self._custom_remove_stopwords(x, stopwords))

        if self.configparser.getbool('strip_short', section):
            minsize = self.configparser.getint('strip_short_minsize', section)
            log.debug('Stripping words shorter than %d', minsize)
            custom_filters.append(
                lambda x: gpp.strip_short(x, minsize=minsize))

        if self.configparser.getbool('lemmatize', section):
            log.debug('Lemmatizing text')
            custom_filters.append(self._custom_lemmatize)

        return custom_filters

    def _generate_phrase(self, pd_data, load_model=False, section='phrase'):
        """
        (Private) Generate phrase using the gensim Phrase detection module.

        Inputs:
            pd_data: (pd.Series) Data which will be used to generate phase.
            section: (str, optional) Section name of the .ini file.

        Returns:
            pd_data: (pd.Series) Input data but using phrases.
        """
        if not self.configparser.getbool('generate_phrase', section):
            log.info('Skipping phrase generation...')
            return pd_data

        if load_model:
            model_filepath = self.configparser.getstr('phrase_model', section)
            model = Phraser.load(model_filepath)

            # apply phrase model
            log.info('Applying loaded phrase model...')
            pd_data = pd_data.apply(lambda x: model[x], convert_dtype=False)
        else:
            log.info('Generating new phrases...')

            # this is our training data
            sentences = pd_data.tolist()

            # detect phrases using the configuration
            model = Phrases(
                sentences,
                min_count=self.configparser.getint('min_count', section),
                threshold=self.configparser.getfloat('threshold', section),
                max_vocab_size=self.configparser.getint(
                    'max_vocab_size', section),
                progress_per=self.configparser.getint('progress_per', section),
                scoring=self.configparser.getstr('scoring', section))

            # apply trained model to generate phrase
            log.info('Applying phrase model...')
            pd_data = pd_data.apply(lambda x: model[x], convert_dtype=False)

            # save phrase model
            model_filepath = self.configparser.getstr('phrase_model', section)
            log.info('Saving phrase model to \'%s\'...', model_filepath)
            model.save(model_filepath)

            # dump phrase and its score as text
            phrase_score_list = []
            for phrase, score in model.export_phrases(sentences):
                phrase_score_list.append([phrase.decode('utf-8'), score])

            pd_phrase_score = pd.DataFrame(phrase_score_list,
                                           columns=['phrase', 'score'])
            pd_phrase_score.drop_duplicates(subset='phrase', inplace=True)

            export_filepath = self.configparser.getstr('phrase_dump_filename',
                                                       section)
            log.info('Dumping phrases to \'%s\'...', export_filepath)
            pd_phrase_score.to_csv(export_filepath, sep='\t', index=False)

        return pd_data

    def preprocess_column(self, pd_data, load_model=False):
        """
        Preprocess specified column.

        Inputs:
            pd_data: (pd.Series) Input data to preprocess.

        Returns:
            pd_data: (pd.Series) Preprocessed data.
        """
        # preprocess using set of filters
        custom_filters = self._build_custom_filter_list()

        log.info('Applying preprocess filters to the %s...', pd_data.name)
        pd_data = pd_data.apply(
            lambda x: gpp.preprocess_string(x, custom_filters),
            convert_dtype=False)

        # generate phrase based on the configuration
        pd_data = self._generate_phrase(pd_data, load_model=load_model)

        # join the list of words into space delimited string
        pd_data = pd_data.apply(lambda x: ' '.join(x))

        return pd_data

    def get_vocabs(self, pd_data):
        log.info('Getting all vocabs...')

        vocabs = []
        for row in pd_data.tolist():
            vocabs.extend(row.split(' '))

        vocabs = list(set(vocabs))
        vocabs = sorted(vocabs, key=str.lower)

        log.info('Got %d unique vocabularies', len(vocabs))

        return vocabs
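# Illustrative usage sketch (not from the original source; the config filename
# is hypothetical). preprocess_column() takes a pd.Series of raw text and
# returns space-delimited preprocessed strings, as also done in the Wikipedia
# pipeline elsewhere in this collection.
import pandas as pd

fpm = FdcPreprocessManager('preprocess_config.ini')
descriptions = pd.Series(['Cheddar cheese, shredded', 'Whole wheat bread'],
                         name='description')
processed = fpm.preprocess_column(descriptions, load_model=False)
print(fpm.get_vocabs(processed))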
def __init__(self, config_filepath):
    configparser = ConfigParser(config_filepath)

    gt_ontology_filename = configparser.getstr('gt_entitymapping')
    self.gt_ontology = load_pkl(gt_ontology_filename)
class Word2VecManager():
    """
    Manager class for training and saving Word2Vec models.
    """

    def __init__(self, config_file):
        """
        Class initializer.

        Inputs:
            config_file: (str) Configuration filepath.
        """
        self.configparser = ConfigParser(config_file)
        self.epoch_callback = EpochCallback()
        self.model = None

    def train(self, sentences, pretrained=None):
        self.model = Word2Vec(
            size=self.configparser.getint('size'),
            window=self.configparser.getint('window'),
            min_count=self.configparser.getint('min_count'),
            workers=self.configparser.getint('workers'),
            callbacks=[self.epoch_callback])

        log.info('Building vocabularies...')
        self.model.build_vocab(sentences)
        total_examples = self.model.corpus_count

        if pretrained:
            original_vocabs = self.model.wv.vocab.keys()
            pretrained_vocabs = KeyedVectors.load_word2vec_format(
                pretrained).vocab.keys()
            common_vocabs = list(set(original_vocabs) & set(pretrained_vocabs))
            log.info('Intersecting %d common vocabularies for transfer learning',
                     len(common_vocabs))

            if self.configparser.getbool('pre_train_update_vocab'):
                log.info('Updating vocabularies using vocabs from pre-trained data')
                self.model.build_vocab([list(pretrained_vocabs)],
                                       update=True,
                                       min_count=1)

            self.model.intersect_word2vec_format(pretrained, lockf=1.0)

        self.model.train(
            sentences,
            total_examples=total_examples,
            epochs=self.configparser.getint('epochs'),
            compute_loss=True)

    def save_model(self, filepath):
        assert self.model is not None

        log.info('Saving model to %s...', filepath)
        self.model.save(filepath)

    def save_vectors(self, filepath):
        assert self.model is not None

        log.info('Saving word embeddings to %s...', filepath)
        self.model.wv.save_word2vec_format(filepath)

    def save_loss(self, filepath):
        assert self.model is not None

        # sorted by key, return a list of tuples
        lists = sorted(self.epoch_callback.loss.items())

        # unpack a list of pairs into two tuples
        x, y = zip(*lists)

        plt.plot(x, y)
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.savefig(filepath)
class SandDroid():

    def __init__(self, theConfigFilePath, theLogger=Logger()):
        # parse config file
        self.configParser = ConfigParser()
        # self.configParser.parseFile(theConfigFilePath)
        self.configParser.generateDirectories()

        self.log = theLogger

        # keytool path to parse apk's signature
        self.keytoolPath = None

        # sanddroid directories
        self.mainDir = os.path.dirname(__file__)

        self.appList = []  # list to store apk file - full path
        self.runningApps = []  # list to store apk file which in being analyzed

        self.runHeadless = False
        self.emulatorStartPort = 5554
        self.numThreads = 1
        self.maxThreadRuntime = 600

        # control running threads
        self.threadLogFileList = []  # list to store thread log file path
        self.numFinishedApps = 0  # number of analyzed apps
        self.numRunningThreads = 0  # number of running threads
        self.threadList = []  # list of threads, size=numThreads
        self.threadActiveMask = []  # bitmask to determine if thread is active, size=numThreads

        self.avdsheartbeat = (0, 0, 0, 0)  # list avds' times used in one cycle
        self.avdheartbeat = 0

        self.startTime = datetime.datetime.now()

    # ================================================================================
    # Helpers
    # ================================================================================
    def __isConfigValid(self):
        # check configure
        javaHome = os.environ.get('JAVA_HOME')
        if not javaHome:
            self.log.error('Java environment not detected')
            return False
        else:
            keytoolPath = os.path.join(javaHome, 'bin', 'keytool')
            if not os.path.exists(keytoolPath):
                self.log.error('Java keytool no exist')
                return False
            else:
                self.keytoolPath = keytoolPath

        for desc, directory in self.configParser.getDirectoies().items():
            if not directory or not os.path.exists(directory):
                self.log.error('%s doesn\'t exist!' % desc)
                return False

        if (not self.configParser.getDbUsr()) and (not self.configParser.getDbHost()) \
                and (not self.configParser.getDbPort()) \
                and (not self.configParser.getDbPswd()) and (not self.configParser.getDbName()):
            return False

        return True

    def _getLogDir(self):
        """
        Get log directory
        """
        logRootDir = self.configParser.getLogDir()
        logDir = '%s/%s-%s' % (logRootDir,
                               Utils.getDateAsString(self.startTime),
                               Utils.getTimeAsString(self.startTime))
        return logDir

    def _createLogDir(self, logDir):
        """
        Create log directory
        """
        if not os.path.exists(logDir):
            try:
                os.makedirs(logDir)
            except OSError, e:
                print e
class ParseFoodOn:
    """
    Class for parsing FoodOn.
    """

    def __init__(self, config_filepath):
        """
        Class initializer.

        Inputs:
            config_filepath: (str) Configuration filepath.
        """
        self.configparser = ConfigParser(config_filepath)

        # read configuration file
        self.filepath = self.configparser.getstr('filepath')
        self.full_ontology_pkl = self.configparser.getstr('full_ontology_pkl')
        self.candidate_ontology_pkl = self.configparser.getstr(
            'candidate_ontology_pkl')
        self.skeleton_and_entities_pkl = self.configparser.getstr(
            'skeleton_and_entities_pkl')
        self.overwrite_pkl = self.configparser.getbool('overwrite_pickle_flag')
        self.outputFoodOn = self.configparser.getstr('outputFoodOn')
        self.num_seeds = self.configparser.getint('num_seeds')
        self.num_min_extracted_entities = self.configparser.getint(
            'num_min_extracted_entities')

        # generate pairs from csv file
        self.pd_foodon_pairs = self.generate_pairs()
        self.all_classes, self.all_entities = self.get_classes_and_entities()
        self.foodon_graph, self.graph_dict, self.graph_dict_flip = self.generate_graph()

    def generate_graph(self):
        graph_dict = {k: v for v, k in enumerate(self.all_classes)}
        graph_dict_flip = {v: k for v, k in enumerate(self.all_classes)}

        G = nx.DiGraph()
        for _, row in self.pd_foodon_pairs.iterrows():
            if row['Parent'] in self.all_classes and row['Child'] in self.all_classes:
                node_from = graph_dict[row['Parent']]
                node_to = graph_dict[row['Child']]
                G.add_edge(node_from, node_to)

        return G, graph_dict, graph_dict_flip

    def get_classes_and_entities(self):
        all_classes = self.pd_foodon_pairs['Parent'].tolist()
        all_classes = list(set(all_classes))
        all_classes.sort()
        log.debug('Found %d classes.', len(all_classes))

        child = self.pd_foodon_pairs['Child'].tolist()
        child = list(set(child))
        child.sort()

        all_entities = [c for c in child if c not in all_classes]
        log.debug('Found %d entities.', len(all_entities))

        return all_classes, all_entities

    def generate_pairs(self):
        log.info('Generating pairs of FoodOn.')

        if file_exists(self.outputFoodOn) and not self.overwrite_pkl:
            log.info('Using pre-generated pairs file.')
            return pd.read_csv(self.outputFoodOn, sep='\t')

        # 1. Read specified columns from FoodON.csv file
        foodon = pd.read_csv(
            self.filepath,
            usecols=['Class ID', 'Parents', 'Preferred Label'])

        # 2. Create dictionary of URI and ClassLabel
        labels_tmp = foodon[["Class ID", "Preferred Label"]].copy()
        self.labels = labels_tmp.set_index(
            'Class ID')['Preferred Label'].to_dict()

        # 3. Create data frame with columns - child and all its parents
        foodonOrigDF = (foodon[["Class ID", "Parents"]].copy()).rename(
            columns={'Class ID': 'Child'})

        # 4. Split above DF into pairs of Child-Parent
        pairs = []
        for _, row in foodonOrigDF.iterrows():
            parents = str(row['Parents'])
            parentList = parents.split("|")
            for pClass in parentList:
                child = str(row['Child'])
                pairs.append([child, pClass])

        foodonDF = pd.DataFrame(pairs, columns=['Child', 'Parent'])
        foodonDF = self.filter_ontology(
            foodonDF, 'http://purl.obolibrary.org/obo/FOODON_00001872')
        foodonDF = self.get_subtree(
            foodonDF, 'http://purl.obolibrary.org/obo/FOODON_00001002')

        # In foodonDF, replace URI by label
        for idx, pair in foodonDF.iterrows():
            pair['Child'] = self.labels[pair['Child']]
            if pair['Parent'] in self.labels:
                pair['Parent'] = self.labels[pair['Parent']]

        foodonDF.drop_duplicates(inplace=True, ignore_index=True)
        foodonDF.to_csv(self.outputFoodOn, sep='\t', index=False)

        return foodonDF

    def filter_ontology(self, dfObj, classname):
        # Remove class and its children from the ontology.
        # Works only if the children are leaf nodes.
        indexNames = dfObj[dfObj['Parent'] == classname].index
        dfObj.drop(indexNames, inplace=True)

        indexNames = dfObj[dfObj['Child'] == classname].index
        dfObj.drop(indexNames, inplace=True)

        return dfObj

    def get_subtree(self, df, rootclass):
        subtreeDF, nextlevelclasses = self.traverse_next_level(
            df, ['http://purl.obolibrary.org/obo/FOODON_00001002'])

        while len(nextlevelclasses) > 0:
            pairsDF, nextlevelclasses = self.traverse_next_level(
                df, nextlevelclasses)
            subtreeDF = pd.concat([subtreeDF, pairsDF], ignore_index=True)

        return subtreeDF

    def traverse_next_level(self, df, classnames):
        nextlevel = []
        subtree_pairs = []
        for parent in classnames:
            selectedPairs = df[df['Parent'] == parent]
            for idex, pair in selectedPairs.iterrows():
                subtree_pairs.append([pair['Child'], pair['Parent']])
                ifparent = df[df['Parent'] == pair['Child']]
                # Check if it is a leaf node
                if ifparent.empty != True:
                    nextlevel.append(pair['Child'])

        subtreeDF = pd.DataFrame(subtree_pairs, columns=['Child', 'Parent'])
        return (subtreeDF, nextlevel)

    def get_all_classes_dict(self):
        """
        Get all candidate classes.
        """
        log.info('Generating dictionary of all classes.')

        if file_exists(self.full_ontology_pkl) and not self.overwrite_pkl:
            log.info('Using pre-generated full classes dictionary file.')
            return load_pkl(self.full_ontology_pkl)

        full_classes_dict = {}
        for class_label in self.all_classes:
            pd_match = self.pd_foodon_pairs[self.pd_foodon_pairs['Parent'] ==
                                            class_label]
            children = pd_match['Child'].tolist()
            children_entities = [c for c in children if c in self.all_entities]

            node_from = self.graph_dict['foodon product type']
            node_to = self.graph_dict[class_label]

            paths = []
            if class_label == 'foodon product type':
                paths.append(tuple(['foodon product type']))
            else:
                for path in nx.all_simple_paths(self.foodon_graph,
                                                source=node_from,
                                                target=node_to):
                    translated_path = [self.graph_dict_flip[p] for p in path]
                    paths.append(tuple(translated_path[::-1]))

            full_classes_dict[class_label] = (paths, children_entities)

        save_pkl(full_classes_dict, self.full_ontology_pkl)

        return full_classes_dict

    def get_candidate_classes(self):
        """
        Get all candidate classes.
        """
        log.info('Generating dictionary of candidate classes.')

        if file_exists(self.candidate_ontology_pkl) and not self.overwrite_pkl:
            log.info(
                'Using pre-generated candidate classes dictionary file: %s',
                self.candidate_ontology_pkl)
            return load_pkl(self.candidate_ontology_pkl)

        candidate_classes_dict = {}
        for class_label in self.all_classes:
            pd_match = self.pd_foodon_pairs[self.pd_foodon_pairs['Parent'] ==
                                            class_label]
            children = pd_match['Child'].tolist()
            children_entities = [c for c in children if c in self.all_entities]

            if len(children_entities) > 0:
                node_from = self.graph_dict['foodon product type']
                node_to = self.graph_dict[class_label]

                paths = []
                if class_label == 'foodon product type':
                    paths.append(tuple(['foodon product type']))
                else:
                    for path in nx.all_simple_paths(self.foodon_graph,
                                                    source=node_from,
                                                    target=node_to):
                        translated_path = [
                            self.graph_dict_flip[p] for p in path
                        ]
                        paths.append(tuple(translated_path[::-1]))

                candidate_classes_dict[class_label] = (paths,
                                                       children_entities)

        log.info('Found %d candidate classes out of %d all classes.',
                 len(candidate_classes_dict.keys()), len(self.all_classes))

        save_pkl(candidate_classes_dict, self.candidate_ontology_pkl)

        return candidate_classes_dict

    def get_seeded_skeleton(self, candidate_classes_dict):
        log.info('Generating dictionary of skeleton candidate classes.')

        if file_exists(self.skeleton_and_entities_pkl) and not self.overwrite_pkl:
            log.info('Using pickled skeleton file: %s',
                     self.skeleton_and_entities_pkl)
            return load_pkl(self.skeleton_and_entities_pkl)

        skeleton_candidate_classes_dict = {}
        candidate_entities = []
        for candidate_class in candidate_classes_dict.keys():
            entities = candidate_classes_dict[candidate_class][1]

            if len(entities) <= self.num_seeds:
                temp_num_seeds = len(entities) - self.num_min_extracted_entities
                if temp_num_seeds > 0:
                    seeds = random.sample(entities, temp_num_seeds)
                    candidate_entities.extend(list(set(entities) - set(seeds)))
                else:
                    seeds = entities.copy()
            else:
                seeds = random.sample(entities, self.num_seeds)
                candidate_entities.extend(list(set(entities) - set(seeds)))

            skeleton_candidate_classes_dict[candidate_class] = (
                candidate_classes_dict[candidate_class][0], seeds)

        candidate_entities = list(set(candidate_entities))
        candidate_entities.sort()

        log.info(
            'Found %d candidate entities to populate out of %d all entities.',
            len(candidate_entities), len(self.all_entities))

        return_value = (skeleton_candidate_classes_dict, candidate_entities)
        save_pkl(return_value, self.skeleton_and_entities_pkl)

        return return_value
def main():
    """
    Main function.
    """
    # set log, parse args, and read configuration
    set_logging(log_level=log.INFO)
    args = parse_argument()
    configparser = ConfigParser(args.config_file)

    # need to apply preprocessing
    fpm = FdcPreprocessManager(configparser.getstr('preprocess_config'))

    # read FoodOn vocabs
    labels = []
    pd_foodon_pairs = pd.read_csv('./data/FoodOn/foodonpairs.txt', sep='\t')
    labels.extend(pd_foodon_pairs['Parent'].tolist())
    labels.extend(pd_foodon_pairs['Child'].tolist())
    labels = list(set(labels))
    log.info('Number of unique labels: %d', len(labels))

    processed_labels = fpm.preprocess_column(pd.Series(labels),
                                             load_model=True).tolist()

    queries = processed_labels.copy()
    for processed_label in processed_labels:
        queries.extend(processed_label.split())
    queries = list(set(queries))

    # get summaries of the wikipedia entry
    wm = WikipediaManager()

    # check if we're gonna reuse the previous results
    if configparser.getbool('reuse_previous'):
        prev_summary = configparser.getstr('prev_summaries_filepath')
        prev_failed = configparser.getstr('prev_failed_filepath')
    else:
        prev_summary = None
        prev_failed = None

    pd_summary, pd_failed = wm.get_summary(queries,
                                           prev_summary=prev_summary,
                                           prev_failed=prev_failed)

    # save results
    log.info('Saving successfully pulled wiki summaries to %s',
             configparser.getstr('summaries_filepath'))
    pd_summary.to_csv(configparser.getstr('summaries_filepath'),
                      sep='\t',
                      index=False)

    log.info('Saving failed wiki queries to %s',
             configparser.getstr('failed_filepath'))
    pd_failed.to_csv(configparser.getstr('failed_filepath'),
                     sep='\t',
                     index=False)

    # preprocess columns
    pd_summary['summary_preprocessed'] = fpm.preprocess_column(
        pd_summary['summary'], load_model=True)

    output_filepath = configparser.getstr('preprocessed_output')
    log.info('Saving preprocessed wikipedia data to %s...', output_filepath)
    pd_summary.to_csv(output_filepath, sep='\t', index=False)
def plot_pr_print_cm(baseline_classifier, best_classifier, main_config,
                     model_manager):
    classifiers_ys = {}
    for classifier in [baseline_classifier, best_classifier]:
        log.info('Running model for classifier \'%s\'', classifier)

        # load config parsers
        preprocess_config = ConfigParser(
            main_config.get_str('preprocess_config'))
        classifier_config = ConfigParser(
            main_config.get_str('classifier_config'))

        # perform preprocessing
        X, y = model_manager.preprocess(preprocess_config, section=classifier)

        # select subset of features if requested
        selected_features = main_config.get_str_list('selected_features')
        if selected_features:
            log.info('Selecting subset of features: %s', selected_features)
            X = X[selected_features]

        # run classification model
        classifier_config.overwrite('classifier', classifier)
        score_avg, score_std, ys = model_manager.run_model_cv(
            X, y, 'f1', classifier_config)

        classifiers_ys[classifier] = ys

    # confusion matrix
    (y_trues, y_preds, y_probs) = classifiers_ys[best_classifier]

    tn = []
    fp = []
    fn = []
    tp = []
    pred_pos = []
    pred_neg = []
    known_pos = []
    known_neg = []
    f1 = []
    precision = []
    recall = []
    specificity = []
    npv = []
    fdr = []
    accuracy = []

    for fold in range(len(y_trues)):
        cm_result = confusion_matrix(y_trues[fold], y_preds[fold]).ravel()

        tn.append(cm_result[0])
        fp.append(cm_result[1])
        fn.append(cm_result[2])
        tp.append(cm_result[3])
        pred_pos.append(cm_result[3] + cm_result[1])
        pred_neg.append(cm_result[2] + cm_result[0])
        known_pos.append(cm_result[3] + cm_result[2])
        known_neg.append(cm_result[1] + cm_result[0])
        f1.append(f1_score(y_trues[fold], y_preds[fold]))
        precision.append(
            precision_score(y_trues[fold], y_preds[fold], average='binary'))
        recall.append(
            recall_score(y_trues[fold], y_preds[fold], average='binary'))
        specificity.append(cm_result[0] / (cm_result[0] + cm_result[1]))
        npv.append(cm_result[0] / (cm_result[0] + cm_result[2]))
        fdr.append(cm_result[1] / (cm_result[1] + cm_result[3]))
        accuracy.append(accuracy_score(y_trues[fold], y_preds[fold]))

    tn_mean = np.mean(tn)
    fp_mean = np.mean(fp)
    fn_mean = np.mean(fn)
    tp_mean = np.mean(tp)
    pred_pos_mean = np.mean(pred_pos)
    pred_neg_mean = np.mean(pred_neg)
    known_pos_mean = np.mean(known_pos)
    known_neg_mean = np.mean(known_neg)
    f1_mean = np.mean(f1)
    precision_mean = np.mean(precision)
    recall_mean = np.mean(recall)
    specificity_mean = np.mean(specificity)
    npv_mean = np.mean(npv)
    fdr_mean = np.mean(fdr)
    accuracy_mean = np.mean(accuracy)

    tn_std = np.std(tn)
    fp_std = np.std(fp)
    fn_std = np.std(fn)
    tp_std = np.std(tp)
    pred_pos_std = np.std(pred_pos)
    pred_neg_std = np.std(pred_neg)
    known_pos_std = np.std(known_pos)
    known_neg_std = np.std(known_neg)
    f1_std = np.std(f1)
    precision_std = np.std(precision)
    recall_std = np.std(recall)
    specificity_std = np.std(specificity)
    npv_std = np.std(npv)
    fdr_std = np.std(fdr)
    accuracy_std = np.std(accuracy)

    log.info(
        'Confusion matrix (tp, fp, fn, tn): (%.2f±%.2f, %.2f±%.2f, %.2f±%.2f, %.2f±%.2f)',
        tp_mean, tp_std, fp_mean, fp_std, fn_mean, fn_std, tn_mean, tn_std)
    log.info('pred pos: %.2f±%.2f', pred_pos_mean, pred_pos_std)
    log.info('pred neg: %.2f±%.2f', pred_neg_mean, pred_neg_std)
    log.info('known pos: %.2f±%.2f', known_pos_mean, known_pos_std)
    log.info('known neg: %.2f±%.2f', known_neg_mean, known_neg_std)
    log.info('F1: %.2f±%.2f', f1_mean, f1_std)
    log.info('Precision: %.2f±%.2f', precision_mean, precision_std)
    log.info('Recall: %.2f±%.2f', recall_mean, recall_std)
    log.info('Specificity: %.2f±%.2f', specificity_mean, specificity_std)
    log.info('Npv: %.2f±%.2f', npv_mean, npv_std)
    log.info('Fdr: %.2f±%.2f', fdr_mean, fdr_std)
    log.info('Accuracy: %.2f±%.2f', accuracy_mean, accuracy_std)

    # plot PR curve
    fig = plt.figure()
    lines = []
    labels = []

    for classifier, ys in classifiers_ys.items():
        y_trues, y_preds, y_probs = ys

        if classifier == best_classifier:
            num_folds = len(y_trues)
            precision = 0
            recall = 0
            for fold in range(num_folds):
                precision += precision_score(y_trues[fold],
                                             y_preds[fold],
                                             average='binary')
                recall += recall_score(y_trues[fold],
                                       y_preds[fold],
                                       average='binary')
            precision /= num_folds
            recall /= num_folds

            arrowprops = {'arrowstyle': '->'}
            plt.scatter(recall, precision, s=30, marker='x', c='k', zorder=3)
            plt.annotate('Operational point', (recall, precision),
                         (recall - 0.05, precision + 0.05),
                         arrowprops=arrowprops)

        y_probs_1 = tuple(y_prob[1].to_numpy() for y_prob in y_probs)

        line, label = plot_pr(y_trues, y_probs_1, classifier)
        lines.append(line)
        labels.append(label)

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Best model ({}) PR curve'.format(best_classifier))
    plt.legend(lines, labels, loc='upper right', prop={'size': 10})

    save_figure(fig, main_config.get_str('pr_curve'))
class RunnerThread(Thread):

    def __createXml(self):
        """
        Dump the static analysis results to an XML report.
        """
        doc = xml.dom.minidom.Document()
        root = doc.createElement('static_info')
        doc.appendChild(root)

        def textElement(tag, text):
            # create <tag>text</tag>
            elem = doc.createElement(tag)
            elem.appendChild(doc.createTextNode(str(text)))
            return elem

        def appendDictSection(parentTag, childTag, valueTag, items):
            # create <parentTag><childTag><name>k</name><valueTag>v</valueTag></childTag>...</parentTag>
            parent = doc.createElement(parentTag)
            if items:
                for key, value in items.items():
                    child = doc.createElement(childTag)
                    child.appendChild(textElement('name', key))
                    child.appendChild(textElement(valueTag, value))
                    parent.appendChild(child)
            root.appendChild(parent)

        def appendListSection(parentTag, childTag, items):
            # create <parentTag><childTag>item</childTag>...</parentTag>
            parent = doc.createElement(parentTag)
            if items:
                for value in items:
                    parent.appendChild(textElement(childTag, value))
            root.appendChild(parent)

        # analysis timestamp
        timeStr = time.strftime('%Y-%m-%d: %H:%M:%S',
                                time.localtime(time.time()))
        root.appendChild(textElement('datetime', timeStr))

        staticPojo = self.staticAnalyzer

        # basic information; missing values are written as 'None'
        basicInfo = doc.createElement('basicInfo')
        if staticPojo.basicInfo:
            for field in ('VersionCode', 'FileName', 'FileMD5', 'FileSize',
                          'Package', 'MinSDK', 'TargetSDK', 'Cert'):
                value = staticPojo.basicInfo.get(field)
                basicInfo.appendChild(
                    textElement(field, value if value is not None else 'None'))
        root.appendChild(basicInfo)

        # repackaging, malware and risk information
        root.appendChild(
            textElement('isRepackaged',
                        '0' if staticPojo.isRepackaged is False else '1'))
        root.appendChild(
            textElement(
                'malware',
                staticPojo.malware if staticPojo.malware is not None else '0'))
        root.appendChild(textElement('riskValue', staticPojo.riskValue))

        # sensitive APIs, strings, files and code
        appendDictSection('sensitiveAPIs', 'sensitiveAPI', 'desc',
                          staticPojo.sensitiveAPIs)
        appendDictSection('sensitiveStrs', 'sensitiveStr', 'desc',
                          staticPojo.sensitiveStrs)
        appendDictSection('sensitiveFiles', 'sensitiveFile', 'type',
                          staticPojo.sensitiveFiles)
        appendDictSection('sensitiveCodes', 'sensitiveCode', 'desc',
                          staticPojo.sensitiveCodes)

        # urls found in the code
        appendListSection('urls', 'url', staticPojo.urls)

        # permissions
        appendDictSection('permissions', 'permission', 'desc',
                          staticPojo.permissions)

        # components
        mainActivity = doc.createElement('mainActivity')
        if staticPojo.mainActivity is not None:
            mainActivity.appendChild(
                doc.createTextNode(str(staticPojo.mainActivity)))
        root.appendChild(mainActivity)

        appendListSection('activities', 'activity', staticPojo.activities)
        appendListSection('services', 'service', staticPojo.services)
        appendListSection('receivers', 'receiver', staticPojo.receivers)
        appendListSection('providers', 'provider', staticPojo.providers)
        appendListSection('exposedActivities', 'exposedActivity',
                          staticPojo.exposedActivities)
        appendListSection('exposedServices', 'exposedService',
                          staticPojo.exposedServices)
        appendListSection('exposedReceivers', 'exposedReceiver',
                          staticPojo.exposedReceivers)

        # classification information
        classifyInfo = doc.createElement('classifyInfo')
        if staticPojo.classifyInfo is not None:
            classifyInfo.appendChild(
                doc.createTextNode(str(staticPojo.classifyInfo)))
        root.appendChild(classifyInfo)

        # write the report, named after the APK's MD5 hash
        xmlFilename = ('/home/mindmac/workspace/SandDroidIIE/staticAnaReports/'
                       + staticPojo.basicInfo['FileMD5'] + '.xml')
        xmlFile = open(xmlFilename, 'w')
        doc.writexml(xmlFile,
                     indent='\t',
                     addindent='\t',
                     newl='\n',
                     encoding="utf-8")

    def __init__(self, theApkObj, theAvdName, decompressDir, runHeadless,
                 theLogger=Logger()):
        Thread.__init__(self)

        # configParser
        self.configParser = ConfigParser()

        self.apkObj = theApkObj
        self.log = theLogger
        self.curDir = os.path.dirname(__file__)

        self.staticAnalyzer = None
        self.dynamicAnalyzer = None
        self.logcatAnalyzer = None

        self.startTimeStr = None
        self.endTimeStr = None

        self.emulator = None
        self.emulatorPort = 5554
        self.avdName = theAvdName
        self.runHeadless = runHeadless

        self.decompressPath = decompressDir
        self.logcatFile = None
        self.session = None

        self.cancelFlag = False  # Flag for canceling run

    def checkForCancelation(self):
        """
        Checks for the cancelation flag sent from the main program.
        If the cancel flag is set, abort execution by raising KeyboardInterrupt.
        """
        if self.cancelFlag:
            self.log.info('Cancelation flag found, abort thread')
            traceback.print_stack(file=self.log.log)
            raise KeyboardInterrupt

    def __getLogcatFilePath(self):
        """
        Generates logcat file name
        """
        return os.path.join(self.decompressPath, 'logcat.log')

    def getLogger(self):
        return self.log

    def staticAnalyze(self):
        """
        Static Analysis
        """
        # Static Analyzer
        self.staticAnalyzer = StaticAnalyzer(self.apkObj, self.decompressPath,
                                             self.curDir, self.log)

        # Init
        self.log.info('Initialization...')
        self.staticAnalyzer.initEnv()

        # Parse smali files
        self.log.info('Parse smali files to get methods, urls...')
        self.staticAnalyzer.parseSmali()

        # APK basic information
        self.log.info('Get APK\'s basic information')
        self.staticAnalyzer.getBasicInfo()

        # APK permissions used
        self.log.info('Get APK\'s used permissions')
        self.staticAnalyzer.getPermissions()

        # APK components used
        self.log.info('Get APK\'s used components')
        self.staticAnalyzer.getComponents()

        # APK components exposed
        self.log.info('Get APK\'s exposed components')
        self.staticAnalyzer.getExposedComps()

        # APK classifier
        self.log.info('Get APK\'s classifier information')
        self.staticAnalyzer.classifyByPermission()

        # APK fuzzy risk value
        self.log.info('Get APK\'s fuzzy risk score')
        self.staticAnalyzer.getRisk()

        # APK gexf graph
        self.log.info('Get APK\'s gexf graph', setTime=True)
        # gexfOutFile = os.path.join(self.decompressPath, '%s.gexf' % self.apkObj.getMd5Hash().upper())
        # self.staticAnalyzer.getGexf(gexfOutFile)

        # APK Malware detection
        self.log.info('Get APK\'s malicious information', setTime=True)
        self.staticAnalyzer.getMal()

        # APK repackaged
        self.log.info('Check APK if repackaged', setTime=True)
        self.staticAnalyzer.checkRepackage(self.session)

    def dynamicAnalyze(self):
        """
        Dynamic Analysis
        """
        cur_dir = os.path.dirname(__file__)
        imageDir = os.path.join(cur_dir, 'resources', 'images')
        pcapFile = os.path.join(self.decompressPath,
                                '%s.pcap' % self.apkObj.getMd5Hash().upper())

        self.dynamicAnalyzer = DynamicAnalyzer(self.decompressPath,
                                               self.avdName, self.curDir,
                                               self.log)
        self.emulator = EmulatorClient(
            theSdkPath=self.configParser.getAndroidSdkDir(),
            thePort=self.emulatorPort,
            theImageDir=imageDir,
            thePcapFile=pcapFile,
            theRunHeadless=self.runHeadless,
            theAvdName=self.avdName,
            theLogger=self.log)

        self.checkForCancelation()

        # Start emulator
        self.log.info('Start emulator', setTime=True)
        self.emulator.start()

        # Run app
        isFinishedRunnig = self.dynamicAnalyzer.runApp(self.emulator,
                                                       self.apkObj)

        # Store logcat file
        if isFinishedRunnig:
            self.log.info('Store logcat file')
            self.emulator.stopLogcatRedirect()
            self.emulator.storeLogcatRedirectFile(
                self.dynamicAnalyzer.logcatRedirectFile, self.logcatFile)
        else:
            self.log.error('Run app failed!')

    def killEmulator(self):
        # only shut down if an emulator instance was actually started
        if self.emulator:
            self.emulator.shutDown()

    def logcatAnalyze(self, logcatFile):
        """
        Analyze logcat file
        """
        try:
            if not os.path.exists(logcatFile):
                self.log.info('Logcat file %s doesn\'t exist!' % logcatFile)
                return
            else:
                # Build self.logcatAnalyzer
                self.logcatAnalyzer = LogcatAnalyzer(theLogger=self.log)
                self.logcatAnalyzer.setLogFile(logcatFile)
                self.logcatAnalyzer.extractLogEntries()
        except EmulatorClientError, ecErr:
            self.runnerThread.result['errorList'].append(ecErr)