Example no. 1
input_files_si = [
    'rota-dos-concursos-crawler-seguranca-da-informacao-resultados.json'
]
input_files_so = [
    'questao-certa-crawler-sistemas-operacionais-resultados.json',
    'rota-dos-concursos-crawler-sistemas-operacionais-resultados.json'
]
flog = "logs/logs_json_to_db_iter_002.json"
output_logs = []

count = 0
database = "tcc"
collection_ac = "quest_ac_iter_02"
collection_si = "quest_si_iter_02"
collection_so = "quest_so_iter_02"

db = DatabaseManipulation("mongo")

quest_img_count = 0
for file in input_files_ac:
    json_data = {}
    with open(main_folder + file, "r") as f:
        json_data = json.load(f)
        for quest in json_data["ext_quest_list"]:
            quest["theme"] = json_data["theme"]
            if len(quest["question_imgs"]) > 0:
                quest_img_count += 1
        db.insert_many(database, collection_ac, json_data["ext_quest_list"])
        count += len(json_data["ext_quest_list"])
        output_logs.append({
            "theme": json_data["theme"],
            "file": file.split(".json")[0],
Example no. 2

stoptoken_config = ['number', 'key_base_rules']
split_test = 0.2

# Setting base for matrix X with index dictionary and token arrays
word_index_map = {}
theme_question_list_map = {}
theme_token_list_map = {}
current_index = 0
N = 0

# Setting stopwords from list to reduce processing time and dimensionality
pt_stopwords = stopwords.words('portuguese')
pt_stopwords += [w.rstrip() for w in open('stopwords.txt')]

# Load themes questions from mongo db into dict arrays
db = DatabaseManipulation("mongo")
for collection in db_collection_list:
    theme_question_list_map[collection] = [
        MSCSP.bind_question_text_alternatives(q)
        for q in db.find_all(db_name, collection)
    ]

# Balance themes
# Shuffle each theme's questions and trim every theme to the size of the smallest one
smaller_length = len(theme_question_list_map[db_collection_list[0]])

for collection in db_collection_list:
    actual_len = len(theme_question_list_map[collection])
    # (N x D+1) matrix - keeping themes together so we can shuffle them more easily later
    N = N + actual_len
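The snippet cuts off mid-balancing; the step it is building toward (shuffle each theme, then trim to the smallest theme's size) can be sketched compactly, consistent with the fuller version in Example no. 3 (assumes numpy is imported as np, as the surrounding code does):

smaller_length = min(len(qs) for qs in theme_question_list_map.values())
for collection in db_collection_list:
    np.random.shuffle(theme_question_list_map[collection])
    theme_question_list_map[collection] = \
        theme_question_list_map[collection][:smaller_length]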
Example no. 3
def supervised_training(ml_list, split_test, k_fold, balancing, iter_number,
                        db_type, db_name, db_collection_list,
                        collection_label_list, tokenizer_config,
                        stoptoken_config, output_file_path, env):
    """Start supervised training for k_fold and/or split methods.
    Options:
        - ml_list: algorithms options
            ['logistic_regression', 'decision_tree', 'svm_svc_linear',
             'svm_svc_rbf', 'svm_linear_svr', 'multinomial_nb',
             'random-forest', 'kneighbors', 'stochastic-gradient-descent-log',
             'stochastic-gradient-descent-svm']
        - split_test: test split as a decimal fraction (recommended 0.1 to 0.3)
        - k_fold: integer number (recommended 5 or 10)
        - balancing: boolean state to balance sample quantities
        - iter_number: iteration number for training, as an integer (e.g. 3)
        - db_type: database used to get the data (e.g. "mongo")
        - db_name: database name to access (e.g. "tcc")
        - db_collection_list: collection name and theme name used in iteration
            [
                { "collec_name": "quest_db_iter_01", "theme_name": "Database" },
                { "collec_name": "quest_rc_iter_01", "theme_name": "Computer Network" }
            ]
        - collection_label_list: collections labels for each theme from db_collection_list
            [0, 1]  # 0: database, 1: computer network (same length as db_collection_list)
        - tokenizer_config: types of tokenizer to be used
            ['downcase', 'short', 'porter_stem', 'stopwords']
        - stoptoken_config: types of stoptoken to be used
            ['number', 'key_base_rules']
        - output_file_path: RELATIVE filepath used to output the results (e.g. "reports/training_report_iter_008_")
        - env: database environment ('local', 'preprod', 'prod')

    :param ml_list:
    :param split_test:
    :param k_fold:
    :param balancing:
    :param iter_number:
    :param db_type:
    :param db_name:
    :param db_collection_list:
    :param collection_label_list:
    :param tokenizer_config:
    :param stoptoken_config:
    :param output_file_path:
    :param env:
    :return:
    """
    # Temp variables
    today_date = datetime.datetime.now()

    print "Starting training..."

    # Output variables
    header = "\n--------------------------------------\n"
    header += "REPORT FROM TRAINING - ITERATION " + str(iter_number) + "\n"
    header += "--------------------------------------\n"
    header += "Algorithms used: "
    for ml in ml_list:
        header = header + ml + " "
    header += "\n--------------------------------------\n\n"

    # Setting base for matrix X with index dictionary and token arrays
    word_index_map = {}
    theme_question_list_map = {}
    theme_token_list_map = {}
    current_index = 0
    N = 0

    # Setting stopwords from list to reduce processing time and dimensionality
    pt_stopwords = stopwords.words('portuguese')
    pt_stopwords += [w.rstrip() for w in open('stopwords.txt')]

    # Load themes questions from mongo db into dict arrays
    db = DatabaseManipulation(db_type, env)
    for collection in db_collection_list:
        theme_question_list_map[collection["collec_name"]] = [
            MSCSP.bind_question_text_alternatives(q)
            for q in db.find_all(db_name, collection["collec_name"])
        ]
        header = header + collection["theme_name"] + " total questions: " + str(
            len(theme_question_list_map[collection["collec_name"]])) + "\n"

    # Check if quantity needs to be balanced or not
    if balancing:
        # Balance themes
        # Shuffle each theme's questions and trim every theme to the size of the smallest one
        smaller_length = len(
            theme_question_list_map[db_collection_list[0]["collec_name"]])

        for collection in db_collection_list:
            actual_len = len(
                theme_question_list_map[collection["collec_name"]])
            smaller_length = min(smaller_length, actual_len)

        N = smaller_length * len(collection_label_list)
        header = header + "Total questions for each theme after balancing: " + str(
            smaller_length) + "\n\n"

        for collection in db_collection_list:
            np.random.shuffle(
                theme_question_list_map[collection["collec_name"]])
            theme_question_list_map[collection["collec_name"]] = \
                theme_question_list_map[collection["collec_name"]][:smaller_length]

        output_file_path = output_file_path + "balanced_"
    else:
        # Unbalanced themes
        # (N x D+1) matrix - keeping themes together so we can shuffle them more easily later
        for collection in db_collection_list:
            N = N + len(theme_question_list_map[collection["collec_name"]])
        output_file_path = output_file_path + "unbalanced_"

    # Iterate questions from each theme, remove extra stoptokens,
    # insert tokens into array and map word index in object
    for collection in db_collection_list:
        tokens = []
        theme_token_list_map[collection["collec_name"]] = []

        for question_text in theme_question_list_map[
                collection["collec_name"]]:
            tokens = NLPSP.tokenizer(question_text, tokenizer_config,
                                     pt_stopwords)

            # Remove extra stoptokens that weren't removed from stopwords
            tokens = [
                t for t in tokens
                if not NLPSP.is_stoptoken(t, stoptoken_config)
            ]

            theme_token_list_map[collection["collec_name"]].append(tokens)

            for token in tokens:
                if token not in word_index_map:
                    word_index_map[token] = current_index
                    current_index += 1

    with open(
            "logs/word_index_map_" + today_date.strftime("%d-%m-%Y_%H:%M:%S") +
            ".json", 'w+') as f:
        json.dump(word_index_map, f)

    # Initialize data matrix with zero frequencies
    data = np.zeros((N, len(word_index_map) + 1))

    i = 0
    col = 0

    # Get themes words frequencies and add to data matrix
    for collection in db_collection_list:
        for tokens in theme_token_list_map[collection["collec_name"]]:
            row = MSCSP.tokens_to_vector(tokens, collection_label_list[col],
                                         word_index_map)
            data[i, :] = row
            i += 1
        col += 1
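    # Note: MSCSP.tokens_to_vector presumably returns one row of length D+1,
    # i.e. the token frequencies indexed via word_index_map plus the class
    # label in the final column (inferred from the (N, D+1) matrix shape and
    # the collection_label_list argument; the helper itself is not shown here).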

    # Training with report output
    SKLTSP.train_report(data, split_test, k_fold, ml_list, output_file_path,
                        header, collection_label_list)
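A hypothetical invocation of supervised_training, assembled only from the option values shown in its docstring (the collection names follow the docstring's own examples, and the output path follows its example pattern; none of this is real data):

supervised_training(
    ml_list=['logistic_regression', 'svm_svc_linear', 'multinomial_nb'],
    split_test=0.2,
    k_fold=10,
    balancing=True,
    iter_number=3,
    db_type="mongo",
    db_name="tcc",
    db_collection_list=[
        {"collec_name": "quest_db_iter_01", "theme_name": "Database"},
        {"collec_name": "quest_rc_iter_01", "theme_name": "Computer Network"},
    ],
    collection_label_list=[0, 1],
    tokenizer_config=['downcase', 'short', 'porter_stem', 'stopwords'],
    stoptoken_config=['number', 'key_base_rules'],
    output_file_path="reports/training_report_iter_003_",
    env="local",
)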
Example no. 4

stoptoken_config = ['number', 'key_base_rules']
split_test = 0.2

# Setting base for matrix X with index dictionary and token arrays
word_index_map = {}
theme_question_list_map = {}
theme_token_list_map = {}
current_index = 0
N = 0

# Setting stopwords from list to reduce processing time and dimensionality
pt_stopwords = stopwords.words('portuguese')
pt_stopwords += [w.rstrip() for w in open('stopwords.txt')]

# Load themes questions from mongo db into dict arrays
db = DatabaseManipulation("mongo")
for collection in db_collection_list:
    theme_question_list_map[collection] = [
        MSCSP.bind_question_text_alternatives(q)
        for q in db.find_all(db_name, collection)
    ]

# Balance themes
# Shuffle each theme's questions and trim every theme to the size of the smallest one
smaller_length = len(theme_question_list_map[db_collection_list[0]])

for collection in db_collection_list:
    actual_len = len(theme_question_list_map[collection])
    # (N x D+1) matrix - keeping themes together so we can shuffle them more easily later
    N = N + actual_len

    smaller_length = min(smaller_length, actual_len)

for collection in db_collection_list:
Example no. 5
rc_files_added = []

flog = "logs/logs_json_to_db.json"
output_logs = []
quest_count = 0

# CHANGE HERE TO ALLOW DIFFERENT DB_NAMES
db_name_local = "tccprod"
db_name_prod = "tccprod"
db_name_env = db_name_prod
db_env = "prod"

# JSON files to be read follow the regex pattern below
regex = r'(.*-crawler-)(.*)(-resultados\.json)'

db = DatabaseManipulation("mongo", db_env)

# Start with questions from questao certa folder
for flname_qc in os.listdir(qc_folder):
    qc_rc_status = False
    if flname_qc.endswith(".json"):
        # print(os.path.join(directory, filename))
        re_search_qc = re.search(regex, flname_qc)
        if re_search_qc is not None:
            collection_name = re_search_qc.group(2).replace('-', '_')
            flname_rc_temp = None

            for flname_rc in os.listdir(rc_folder):
                if flname_rc.endswith(".json"):
                    re_search_rc = re.search(regex, flname_rc)
                    if re_search_rc is not None:
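To illustrate the filename matching used above: group(2) captures the theme slug between the crawler prefix and the results suffix. A minimal sketch using a filename from Example no. 1:

import re

regex = r'(.*-crawler-)(.*)(-resultados\.json)'
m = re.search(regex, 'questao-certa-crawler-sistemas-operacionais-resultados.json')
print(m.group(2))                    # sistemas-operacionais
print(m.group(2).replace('-', '_'))  # sistemas_operacionais -> Mongo collection name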