def main(start_delta=200000, stop_at=300000, batch_size=2500, threshold=.75):
    """Group candidates under every job category their skills match, then dump the map.

    Candidates are read from ``db["candidate"]`` in pages of ``batch_size``
    using skip/limit.  Each candidate's ``job_skill_names`` is compared
    against the ``"Skills"`` entry of every record returned by
    ``getCharacteristicMap()``; when ``matchSkills`` returns a truthy value the
    candidate's ``candidate_id`` is recorded (once) under that category key.
    Finally one ``ClassifierObject(key, ids).toJson()`` string per category is
    written to the module-level ``text_file`` as a comma-separated list wrapped
    in ``[`` ... ``]``, and ``text_file`` is closed.

    ``db``, ``text_file``, ``getCharacteristicMap``, ``matchSkills``,
    ``ClassifierObject`` and ``DebugException`` are defined outside this
    function and are used as-is.

    NOTE(review): this file defines ``main()`` a second time further down; the
    later definition replaces this one when the module is loaded.

    Args:
        start_delta: Offset of the first candidate to read.
        stop_at: Paging stops once the next batch would start at or beyond
            this offset.  Only batch starts are checked, so the final batch
            may read past it when the window is not a multiple of
            ``batch_size``.
        batch_size: Number of candidates fetched per query; must be positive.
        threshold: Passed through as the third argument of ``matchSkills``.

    Raises:
        ValueError: If ``batch_size`` is not positive (the paging loop would
            never advance).

    Any exception raised while processing is handed to ``DebugException``
    rather than propagated.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print("GO")
    start_time = time.time()
    document_map = {}  # category key -> candidate ids, in first-seen order
    seen_ids = {}      # category key -> set of the same ids (O(1) duplicate check)
    try:
        cand_table = db["candidate"]
        characteristic_list = getCharacteristicMap()
        print("%s" % len(characteristic_list))
        print("---query Time: %s seconds ---" % (time.time() - start_time))
        print("done")

        # Informational only: the scan window comes from start_delta/stop_at.
        # NOTE(review): if this is a PyMongo collection, Collection.count() is
        # deprecated since 3.7 and removed in 4.0 - confirm the driver version.
        print("%s" % cand_table.count())

        projection = {"candidate_id": 1, "job_skill_names": 1}
        cand_count = 0
        offset = start_delta
        # NOTE(review): the query has no sort, so skip/limit paging relies on
        # the store returning a stable order between batches - confirm.
        while offset < stop_at:
            candidate_list = list(
                cand_table.find({}, projection).skip(offset).limit(batch_size)
            )
            for candidate in candidate_list:
                cand_count += 1
                cand_cat_count = 0
                print("Running candidate - %s" % cand_count)
                skills = candidate["job_skill_names"]
                for key, record in characteristic_list.items():
                    if not matchSkills(skills, record["Skills"], threshold):
                        continue
                    cand_cat_count += 1
                    candidate_id = candidate["candidate_id"]
                    ids = document_map.setdefault(key, [])
                    # Assumes candidate ids are hashable (verify against the
                    # collection schema); the set avoids rescanning the list.
                    known = seen_ids.setdefault(key, set())
                    if candidate_id not in known:
                        known.add(candidate_id)
                        ids.append(candidate_id)
                print("Added to %s categories" % cand_cat_count)
            offset += batch_size

        try:
            serialized = [
                "%s" % ClassifierObject(key, val).toJson()
                for key, val in document_map.items()
            ]
            text_file.write("[")
            text_file.write(",".join(serialized))
            text_file.write("]")
        finally:
            # Close the output even when serialising or writing fails.
            text_file.close()
        print("---Parse Time: %s seconds ---" % (time.time() - start_time))
    except Exception as e:
        DebugException(e)
    print("---Run Time: %s seconds ---" % (time.time() - start_time))
def main(start_delta=0, batch_size=5000):
    """Group parsed-resume candidates under every job category they match, then dump the map.

    Candidates are read from ``db["candidate_skills_from_parsed_resumes"]`` in
    pages of ``batch_size`` using skip/limit; batches keep starting while the
    offset is below half of ``cand_table.count()``.  For each candidate the
    lower-cased ``"word"`` of every ``parsedWords`` entry forms the skill list
    that is compared, via ``matchSkills`` and the module-level ``Threshold``,
    against the ``"Skills"`` entry of every record returned by
    ``getCharacteristicMap()``.  On a truthy match the candidate's
    ``candidate_id`` is recorded (once) under that category key.  Finally one
    ``ClassifierObject(key, ids).toJson()`` string per category is written to
    the module-level ``text_file`` as a comma-separated list wrapped in
    ``[`` ... ``]``, and ``text_file`` is closed.

    ``db``, ``text_file``, ``Threshold``, ``getCharacteristicMap``,
    ``matchSkills``, ``ClassifierObject`` and ``DebugException`` are defined
    outside this function and are used as-is.

    NOTE(review): an earlier ``main()`` definition in this file is replaced by
    this one when the module is loaded.

    Args:
        start_delta: Offset of the first candidate to read.
        batch_size: Number of candidates fetched per query; must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive (the paging loop would
            never advance).

    Any exception raised while processing is handed to ``DebugException``
    rather than propagated.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    print("GO")
    start_time = time.time()
    document_map = {}  # category key -> candidate ids, in first-seen order
    seen_ids = {}      # category key -> set of the same ids (O(1) duplicate check)
    try:
        cand_table = db["candidate_skills_from_parsed_resumes"]
        characteristic_list = getCharacteristicMap()
        print("Total Number of Job Categories - %s" % len(characteristic_list))
        print("---Job Categories Retrieval Time: %s seconds ---" % (time.time() - start_time))

        # Only batch starts are compared against this bound, so the last batch
        # may read past the halfway point.
        # NOTE(review): if this is a PyMongo collection, Collection.count() is
        # deprecated since 3.7 and removed in 4.0 - confirm the driver version.
        total_cand = cand_table.count() / 2

        projection = {"candidate_id": 1, "parsedWords": 1}
        cand_count = 0
        offset = start_delta
        # NOTE(review): the query has no sort, so skip/limit paging relies on
        # the store returning a stable order between batches - confirm.
        while offset < total_cand:
            candidate_list = list(
                cand_table.find({}, projection).skip(offset).limit(batch_size)
            )
            for candidate in candidate_list:
                cand_count += 1
                candidate_skill_list = [
                    words["word"].lower() for words in candidate["parsedWords"]
                ]
                # Progress line once per thousand candidates.
                if cand_count % 1000 == 0:
                    print("Running candidate - %s - Time Taken So Far - %s" % (cand_count, (time.time() - start_time)))
                for key, record in characteristic_list.items():
                    if not matchSkills(candidate_skill_list, record["Skills"], Threshold):
                        continue
                    candidate_id = candidate["candidate_id"]
                    ids = document_map.setdefault(key, [])
                    # Assumes candidate ids are hashable (verify against the
                    # collection schema); the set avoids rescanning the list.
                    known = seen_ids.setdefault(key, set())
                    if candidate_id not in known:
                        known.add(candidate_id)
                        ids.append(candidate_id)
            offset += batch_size

        try:
            serialized = [
                "%s" % ClassifierObject(key, val).toJson()
                for key, val in document_map.items()
            ]
            text_file.write("[")
            text_file.write(",".join(serialized))
            text_file.write("]")
        finally:
            # Close the output even when serialising or writing fails.
            text_file.close()
        print("---Parse Time: %s seconds ---" % (time.time() - start_time))
    except Exception as e:
        DebugException(e)
    print("---Run Time: %s seconds ---" % (time.time() - start_time))