# NOTE(review): truncated fragment — the opening of the `input_files_ac` list
# literal lies before this excerpt, and the trailing `output_logs.append({...})`
# dict is cut off at the end. Code tokens unchanged; formatting reconstructed.
    'rota-dos-concursos-crawler-seguranca-da-informacao-resultados.json'
]
# Crawler result files for the "sistemas operacionais" (operating systems) theme.
input_files_so = [
    'questao-certa-crawler-sistemas-operacionais-resultados.json',
    'rota-dos-concursos-crawler-sistemas-operacionais-resultados.json'
]
# Path of the JSON log written at the end of this import run.
flog = "logs/logs_json_to_db_iter_002.json"
output_logs = []
count = 0  # running total of questions inserted
# Target database and per-theme collections for iteration 02.
database = "tcc"
collection_ac = "quest_ac_iter_02"
collection_si = "quest_si_iter_02"
collection_so = "quest_so_iter_02"
# Project DB wrapper — presumably opens a mongo connection; confirm in DatabaseManipulation.
db = DatabaseManipulation("mongo")
quest_img_count = 0  # counts questions that carry at least one image
# Load each crawler result file, tag every question with the file's theme,
# and bulk-insert the whole list into the "ac" collection.
for file in input_files_ac:
    json_data = {}
    with open(main_folder + file, "r") as f:
        json_data = json.load(f)
    for quest in json_data["ext_quest_list"]:
        quest["theme"] = json_data["theme"]
        # Track how many questions reference images (they may need special handling).
        if len(quest["question_imgs"]) > 0:
            quest_img_count += 1
    db.insert_many(database, collection_ac, json_data["ext_quest_list"])
    count += len(json_data["ext_quest_list"])
    # Per-file import log entry (dict literal continues past this excerpt).
    output_logs.append({
        "theme": json_data["theme"],
        "file": file.split(".json")[0],
# NOTE(review): truncated fragment — an early, non-parameterized version of the
# training pipeline (compare supervised_training below, where collections are
# dicts and the DB type/env are parameters). The balancing loop is cut off at
# the end of this excerpt. Code tokens unchanged; formatting reconstructed.
stoptoken_config = ['number', 'key_base_rules']
split_test = 0.2  # test split fraction for the train/test evaluation
# Setting base for matrix X with index dictionary and token arrays
word_index_map = {}          # token -> column index in the data matrix
theme_question_list_map = {} # collection name -> list of question texts
theme_token_list_map = {}    # collection name -> list of token lists
current_index = 0            # next free column index for a new token
N = 0                        # total number of samples (rows of X)
# Setting stopwords from list to reduce processing time and dimensionality
pt_stopwords = stopwords.words('portuguese')
# Extra project-specific stopwords; file handle is not closed here (leak).
pt_stopwords += [w.rstrip() for w in open('stopwords.txt')]
# Load themes questions from mongo db into dict arrays
db = DatabaseManipulation("mongo")
for collection in db_collection_list:
    theme_question_list_map[collection] = [
        MSCSP.bind_question_text_alternatives(q)
        for q in db.find_all(db_name, collection)
    ]
# Balance themes
# Random each theme questions reviews and get same quantity from the theme that has more questions
smaller_length = len(theme_question_list_map[db_collection_list[0]])
for collection in db_collection_list:
    actual_len = len(theme_question_list_map[collection])
    # (N x D+1 matrix - keeping themes together so shuffle more easily later
    N = N + actual_len
def supervised_training(ml_list, split_test, k_fold, balancing, iter_number,
                        db_type, db_name, db_collection_list,
                        collection_label_list, tokenizer_config,
                        stoptoken_config, output_file_path, env):
    """Start supervised training for k_fold and/or split methods.

    Builds a bag-of-words frequency matrix (N x D+1, last column = label)
    from the questions stored in each collection, then delegates training
    and report generation to SKLTSP.train_report.

    Options:
    - ml_list: algorithms options ['logistic_regression', 'decision_tree',
        'svm_svc_linear', 'svm_svc_rbf', 'svm_linear_svr', 'multinomial_nb',
        'random-forest', 'kneighbors', 'stochastic-gradient-descent-log',
        'stochastic-gradient-descent-svm']
    - split_test: decimal number percentages (recommended 0.1 to 0.3)
    - k_fold: integer number (recommended 5 or 10)
    - balancing: boolean state to balance sample quantities
    - iter_number: iteration number for training as integer number (e.g. 3)
    - db_type: database used to get the data (e.g. "mongo")
    - db_name: database name to access (e.g. "tcc")
    - db_collection_list: collection name and theme name used in iteration
        [{"collec_name": "quest_db_iter_01", "theme_name": "Database"},
         {"collec_name": "quest_rc_iter_01", "theme_name": "Computer Network"}]
    - collection_label_list: collections labels for each theme from
        db_collection_list, e.g. [0, 1]  # 0: database, 1: computer network
        (same quantity from db_collection_list)
    - tokenizer_config: types of tokenizer to be used
        ['downcase', 'short', 'porter_stem', 'stopwords']
    - stoptoken_config: types of stoptoken to be used
        ['number', 'key_base_rules']
    - output_file_path: RELATIVE filepath used to output the results
        (e.g. "reports/training_report_iter_008_")
    - env: database environment ('local', 'preprod', 'prod')

    :param ml_list:
    :param split_test:
    :param k_fold:
    :param balancing:
    :param iter_number:
    :param db_type:
    :param db_name:
    :param db_collection_list:
    :param collection_label_list:
    :param tokenizer_config:
    :param stoptoken_config:
    :param output_file_path:
    :param env:
    :return: None; results are written to output_file_path by train_report
    """
    # Temp variables
    today_date = datetime.datetime.now()
    print("Starting training...")

    # Output variables: human-readable report header accumulated as we go.
    header = "\n--------------------------------------\n"
    header += "REPORT FROM TRAINING - ITERATION " + str(iter_number) + "\n"
    header += "--------------------------------------\n"
    header += "Algorithms used: "
    for ml in ml_list:
        header = header + ml + " "
    header += "\n--------------------------------------\n\n"

    # Setting base for matrix X with index dictionary and token arrays
    word_index_map = {}           # token -> column index in the data matrix
    theme_question_list_map = {}  # collection name -> list of question texts
    theme_token_list_map = {}     # collection name -> list of token lists
    current_index = 0             # next free column index for a new token
    N = 0                         # total number of samples (rows of X)

    # Setting stopwords from list to reduce processing time and dimensionality.
    # Use a with-statement so the stopwords file handle is always closed
    # (the original leaked it via a bare open() in the comprehension).
    pt_stopwords = stopwords.words('portuguese')
    with open('stopwords.txt') as sw_file:
        pt_stopwords += [w.rstrip() for w in sw_file]

    # Load themes questions from mongo db into dict arrays
    db = DatabaseManipulation(db_type, env)
    for collection in db_collection_list:
        theme_question_list_map[collection["collec_name"]] = [
            MSCSP.bind_question_text_alternatives(q)
            for q in db.find_all(db_name, collection["collec_name"])
        ]
        header = header + collection["theme_name"] + " total questions: " + str(
            len(theme_question_list_map[collection["collec_name"]])) + "\n"

    # Check if quantity needs to be balanced or not
    if balancing:
        # Balance themes: shuffle each theme's questions and keep the same
        # quantity (the size of the smallest theme) from every theme.
        smaller_length = len(
            theme_question_list_map[db_collection_list[0]["collec_name"]])
        for collection in db_collection_list:
            actual_len = len(
                theme_question_list_map[collection["collec_name"]])
            smaller_length = actual_len if actual_len < smaller_length else smaller_length
        N = smaller_length * len(collection_label_list)
        header = header + "Total questions for each theme after balancing: " + str(
            smaller_length) + "\n\n"
        for collection in db_collection_list:
            np.random.shuffle(
                theme_question_list_map[collection["collec_name"]])
            theme_question_list_map[collection["collec_name"]] = \
                theme_question_list_map[collection["collec_name"]][:smaller_length]
        output_file_path = output_file_path + "balanced_"
    else:
        # Unbalanced themes: N is simply the sum of all theme sizes.
        # (N x D+1 matrix - keeping themes together so shuffle more easily later)
        for collection in db_collection_list:
            N = N + len(theme_question_list_map[collection["collec_name"]])
        output_file_path = output_file_path + "unbalanced_"

    # Iterate questions from each theme, remove extra stoptokens,
    # insert tokens into array and map word index in object
    for collection in db_collection_list:
        tokens = []
        theme_token_list_map[collection["collec_name"]] = []
        for question_text in theme_question_list_map[
                collection["collec_name"]]:
            tokens = NLPSP.tokenizer(question_text, tokenizer_config,
                                     pt_stopwords)
            # Remove extra stoptokens that weren't removed from stopwords
            tokens = [
                t for t in tokens
                if not NLPSP.is_stoptoken(t, stoptoken_config)
            ]
            theme_token_list_map[collection["collec_name"]].append(tokens)
            for token in tokens:
                if token not in word_index_map:
                    word_index_map[token] = current_index
                    current_index += 1

    # Dump the vocabulary for later inspection / reuse.
    # Timestamp uses '-' separators ("%H-%M-%S"): ':' is invalid in Windows
    # filenames (original used "%H:%M:%S").
    with open(
            "logs/word_index_map_" +
            today_date.strftime("%d-%m-%Y_%H-%M-%S") + ".json", 'w+') as f:
        json.dump(word_index_map, f)

    # Initialize data matrix with zero frequencies
    data = np.zeros((N, len(word_index_map) + 1))
    i = 0    # current row in data
    col = 0  # current theme index, paired with collection_label_list
    # Get themes words frequencies and add to data matrix
    for collection in db_collection_list:
        for tokens in theme_token_list_map[collection["collec_name"]]:
            row = MSCSP.tokens_to_vector(tokens, collection_label_list[col],
                                         word_index_map)
            data[i, :] = row
            i += 1
        col += 1

    # Training with report output
    SKLTSP.train_report(data, split_test, k_fold, ml_list, output_file_path,
                        header, collection_label_list)
# NOTE(review): truncated fragment — a near-duplicate of the earlier script
# version of the training pipeline; it extends one statement further (the
# running minimum) and is cut off at a dangling `for` header. Code tokens
# unchanged; formatting reconstructed.
stoptoken_config = ['number', 'key_base_rules']
split_test = 0.2  # test split fraction for the train/test evaluation
# Setting base for matrix X with index dictionary and token arrays
word_index_map = {}          # token -> column index in the data matrix
theme_question_list_map = {} # collection name -> list of question texts
theme_token_list_map = {}    # collection name -> list of token lists
current_index = 0            # next free column index for a new token
N = 0                        # total number of samples (rows of X)
# Setting stopwords from list to reduce processing time and dimensionality
pt_stopwords = stopwords.words('portuguese')
# Extra project-specific stopwords; file handle is not closed here (leak).
pt_stopwords += [w.rstrip() for w in open('stopwords.txt')]
# Load themes questions from mongo db into dict arrays
db = DatabaseManipulation("mongo")
for collection in db_collection_list:
    theme_question_list_map[collection] = [MSCSP.bind_question_text_alternatives(q)
                                           for q in db.find_all(db_name, collection)]
# Balance themes
# Random each theme questions reviews and get same quantity from the theme that has more questions
smaller_length = len(theme_question_list_map[db_collection_list[0]])
for collection in db_collection_list:
    actual_len = len(theme_question_list_map[collection])
    # (N x D+1 matrix - keeping themes together so shuffle more easily later
    N = N + actual_len
    # Keep the running minimum theme size for balancing.
    smaller_length = actual_len if actual_len < smaller_length else smaller_length
# Loop body lies beyond this excerpt (presumably shuffle + truncate per theme).
for collection in db_collection_list:
# NOTE(review): truncated fragment — pairs crawler result files from the
# "questao certa" (qc) folder with files of the same theme in the "rota dos
# concursos" (rc) folder. Cut off inside the inner rc matching loop. Code
# tokens unchanged; formatting reconstructed.
rc_files_added = []   # rc files already matched/imported, to avoid duplicates
flog = "logs/logs_json_to_db.json"  # path of the JSON run log
output_logs = []
quest_count = 0       # running total of questions processed
# CHANGE HERE TO ALLOW DIFFERENT DB_NAMES
db_name_local = "tccprod"
db_name_prod = "tccprod"
db_name_env = db_name_prod
db_env = "prod"
# JSON files to be read follow the regex pattern below:
# group(1) = crawler prefix, group(2) = theme slug, group(3) = suffix.
regex = '(.*-crawler-)(.*)(-resultados.json)'
# Project DB wrapper — presumably opens a mongo connection for db_env;
# confirm in DatabaseManipulation.
db = DatabaseManipulation("mongo", db_env)
# Start with questions from questao certa folder
for flname_qc in os.listdir(qc_folder):
    qc_rc_status = False  # whether a matching rc file was found for this qc file
    if flname_qc.endswith(".json"):
        # print(os.path.join(directory, filename))
        re_search_qc = re.search(regex, flname_qc)
        if re_search_qc is not None:
            # Theme slug becomes the mongo collection name (dashes -> underscores).
            collection_name = re_search_qc.group(2).replace('-', '_')
            flname_rc_temp = None
            # Look for an rc file of the same theme (loop continues past this excerpt).
            for flname_rc in os.listdir(rc_folder):
                if flname_rc.endswith(".json"):
                    re_search_rc = re.search(regex, flname_rc)
                    if re_search_rc is not None: