def teachers_comprehensive(self):
    """Return the teachers enrolled in the current course, keyed by user id.

    Each value is a dict with the keys ``name``, ``email``, ``first_name``,
    ``last_name`` and ``sections`` (the section ids of every teacher
    enrollment found for that user, in query order).

    The result is memoised in ``self.cache`` under
    ``"teachers_<course_id>_<groupset_id>"``. The lookup now uses that same
    key; previously it probed ``"teachers_<course_id>"``, which this method
    never writes, so every call re-ran the query.

    Returns:
        dict: user id -> teacher record as described above.
    """
    cache_key = ("teachers_" + str(self.course_id) + "_" +
                 str(self.groupset_id))
    if cache_key in self.cache:
        return self.cache[cache_key]

    enrollments = self.query(
        'query MyQuery {course(id: "' + str(self.course_id) +
        '") {enrollmentsConnection {nodes {type\nuser {_id\nname\nemail}\nsection {_id}}}}}'
    )["data"]["course"]["enrollmentsConnection"]["nodes"]

    teacher_series = {}
    for enrollment in enrollments:
        if enrollment["type"] != "TeacherEnrollment":
            continue
        user = enrollment["user"]
        user_id = user["_id"]
        if user_id not in teacher_series:
            full_name = user["name"]
            teacher_series[user_id] = {
                "name": func.clean_text(full_name),
                "email": user["email"],
                # partition/rpartition return the whole name when it holds
                # no space; slicing with find() == -1 used to chop off the
                # last character of single-word names.
                "first_name": func.clean_text(full_name.partition(" ")[0]),
                "last_name": func.clean_text(full_name.rpartition(" ")[2]),
                "sections": [],
            }
        # One enrollment row per section, so a user can appear repeatedly.
        teacher_series[user_id]["sections"].append(
            enrollment["section"]["_id"])

    self.cache[cache_key] = teacher_series
    return teacher_series
def groups(self):
    """Return the groups of the currently selected group set.

    Queries every group set of ``self.course_id`` and picks the one whose
    ``_id`` equals ``str(self.groupset_id)``.

    The result is memoised per *(course, group set)*. The previous key only
    contained the course id, so after ``self.groupset_id`` changed the
    groups of the earlier group set were still returned.

    Returns:
        dict: group id -> ``{"name": <cleaned name>, "users": [user ids]}``.
        An empty dict when the group set is not found or the response does
        not have the expected shape (that empty result is not cached).
    """
    import logging

    cache_key = ("groups_" + str(self.course_id) + "_" +
                 str(self.groupset_id))
    if cache_key in self.cache:
        return self.cache[cache_key]

    query_result = self.query(
        'query MyQuery {course(id: "' + str(self.course_id) +
        '") {groupSetsConnection {nodes {_id\ngroupsConnection {nodes {_id\nname\nmembersConnection {nodes {user {_id\nname}}}}}}}}}'
    )
    wanted_id = str(self.groupset_id)
    try:
        for groupset in query_result["data"]["course"][
                "groupSetsConnection"]["nodes"]:
            if groupset["_id"] != wanted_id:
                continue
            df = {
                group["_id"]: {
                    "name": func.clean_text(group["name"]),
                    "users": [
                        member["user"]["_id"]
                        for member in group["membersConnection"]["nodes"]
                    ],
                }
                for group in groupset["groupsConnection"]["nodes"]
            }
            self.cache[cache_key] = df
            return df
    except Exception:
        # Best-effort, as before: a malformed response yields "no groups".
        # Unlike the old bare ``except:`` this lets KeyboardInterrupt and
        # SystemExit through and leaves a trace of what went wrong.
        logging.getLogger(__name__).warning(
            "Could not read groups for course %s / group set %s",
            self.course_id, self.groupset_id, exc_info=True)
        return {}
    return {}
def students(self):
    """Return the students of the current course as three parallel lists.

    The value is a dict ``{"id": [...], "name": [...], "email": [...]}``
    where position *i* of each list describes the same student. Only
    ``StudentEnrollment`` rows are used and each user id is listed once.
    The dict is stored in ``self.cache`` under ``"students_<course_id>"``
    and reused on later calls.
    """
    cache_key = "students_" + str(self.course_id)
    if cache_key in self.cache:
        return self.cache[cache_key]

    response = self.query(
        'query MyQuery {course(id: "' + str(self.course_id) +
        '") {enrollmentsConnection {nodes {type\nuser {_id\nname\nemail}}}}}'
    )
    # Registered in the cache before it is filled, exactly as the lists are
    # then extended in place.
    roster = {"id": [], "name": [], "email": []}
    self.cache[cache_key] = roster

    for node in response["data"]["course"]["enrollmentsConnection"]["nodes"]:
        if node["type"] != "StudentEnrollment":
            continue
        user = node["user"]
        if user["_id"] in roster["id"]:
            # Same student enrolled in another section; already listed.
            continue
        roster["id"].append(user["_id"])
        roster["name"].append(func.clean_text(user["name"]))
        roster["email"].append(user["email"])

    return self.cache[cache_key]
def gen_sentence_for_classifier(dict_bm25ranker, query):
    """Build the classifier input frame from BM25 ranking results.

    The query is passed through ``clean_text`` and ``tokenize_cleaned_text``
    and re-joined with single spaces. One row is produced per entry of
    ``dict_bm25ranker['doc_scores_corpus_normalized']``:

    * ``sentence``   -- ``'[CLS] <clean query> [SEP] <value_clean[i]>'``
    * ``label``      -- always ``0``
    * ``BM25_score`` -- ``doc_scores_corpus_normalized[i]``
    * ``seed_url``   -- ``seed_url[i]``

    Returns:
        pandas.DataFrame with those four columns, in that order.
    """
    query_clean = " ".join(tokenize_cleaned_text(clean_text(query)))
    print('gen_sentence_for_classifier: query: {}'.format(query))
    print('gen_sentence_for_classifier: query_clean: {}\n'.format(query_clean))

    scores = dict_bm25ranker['doc_scores_corpus_normalized']
    sentence_prefix = '[CLS] ' + query_clean + ' [SEP] '

    dict_classifier = {
        'sentence': [],
        'label': [],
        'BM25_score': [],
        'seed_url': [],
    }
    # Index-based on purpose: the three inputs are read at the same position.
    for position in range(len(scores)):
        dict_classifier['sentence'].append(
            sentence_prefix + dict_bm25ranker['value_clean'][position])
        dict_classifier['label'].append(0)
        dict_classifier['BM25_score'].append(scores[position])
        dict_classifier['seed_url'].append(
            dict_bm25ranker['seed_url'][position])

    return pd.DataFrame(dict_classifier)
def build_corpus_dict(result_dict):
    """Turn a request result into the column-oriented corpus dictionary.

    For each position ``i`` in ``result_dict['id']`` a ``content`` string is
    built from the space-joined ``value_clean_tokens[i]``, a space and
    ``clean_text(label[i])``, then stripped. The remaining columns
    (``id``, ``label``, ``value_original``, ``value_clean_tokens``,
    ``value_clean``, ``seed_url``) are copied over position by position.

    Returns:
        dict: column name -> list, with ``content`` first.
    """
    copied_columns = ('id', 'label', 'value_original', 'value_clean_tokens',
                      'value_clean', 'seed_url')

    corpus_dict = {'content': []}
    for column in copied_columns:
        corpus_dict[column] = []

    for position in range(len(result_dict['id'])):
        joined_tokens = ' '.join(result_dict['value_clean_tokens'][position])
        cleaned_label = clean_text(result_dict['label'][position])
        corpus_dict['content'].append(
            (joined_tokens + ' ' + cleaned_label).strip())
        for column in copied_columns:
            corpus_dict[column].append(result_dict[column][position])

    return corpus_dict
def courses(self):
    """Return all courses, newest id first.

    Course ids are compared numerically (``int``) for the descending sort
    and then used as string keys of the result. Each value is
    ``{"name": <cleaned course name>}``. The result is memoised in
    ``self.cache["courses"]``.
    """
    if "courses" in self.cache:
        return self.cache["courses"]

    all_courses = self.query(
        'query MyQuery {allCourses {_id\nname}}')["data"]["allCourses"]

    by_numeric_id = {
        int(course["_id"]): {"name": func.clean_text(course["name"])}
        for course in all_courses
    }
    ordered = {
        str(course_id): by_numeric_id[course_id]
        for course_id in sorted(by_numeric_id, reverse=True)
    }

    self.cache["courses"] = ordered
    return ordered
def sections(self):
    """Return the sections of the current course, keyed by section id.

    Each value is ``{"name": <cleaned section name>}``. The mapping is kept
    in ``self.cache`` under ``"sections_<course_id>"`` and reused afterwards.
    """
    cache_key = "sections_" + str(self.course_id)
    if cache_key in self.cache:
        return self.cache[cache_key]

    nodes = self.query(
        'query MyQuery {course(id: "' + str(self.course_id) +
        '") {sectionsConnection {nodes {_id\nname}}}}'
    )["data"]["course"]["sectionsConnection"]["nodes"]

    # The cache entry is only created once the query has succeeded.
    by_id = self.cache[cache_key] = {}
    for node in nodes:
        entry = by_id[node["_id"]] = {}
        entry["name"] = func.clean_text(node["name"])

    return self.cache[cache_key]
def assignment_groups(self):
    """Return the assignment groups used by the course's assignments.

    Derived from ``self.assignments()``: every distinct
    ``assignmentGroup["_id"]`` becomes a key whose value is
    ``{"name": <cleaned group name>}`` (taken from the first assignment
    that references the group). Memoised in ``self.cache`` under
    ``"assn_group_<course_id>"``.
    """
    cache_key = "assn_group_" + str(self.course_id)
    if cache_key in self.cache:
        return self.cache[cache_key]

    # The (empty) cache entry exists before assignments() is consulted.
    known_groups = self.cache[cache_key] = {}
    for assignment in self.assignments().values():
        group = assignment["assignmentGroup"]
        group_id = group["_id"]
        if group_id in known_groups:
            continue
        known_groups[group_id] = {}
        known_groups[group_id]["name"] = func.clean_text(group["name"])

    return self.cache[cache_key]
def bm25_scoring(data_to_score, query):
    """Score the rows of a CSV file against a query with BM25.

    Steps, in order:

    1. Reduce ``query`` to a keyword string via ``build_keywords_list``
       (configured by ``arg_dict['stopwords_csv']`` and
       ``arg_dict['language']``).
    2. Load ``data_to_score`` with ``pd.read_csv``.
    3. For every row, lower-case and space-join the columns ``Titre``,
       ``Descriptif``, ``Description``, ``Prix``, ``Categorie`` and
       ``Etiquette`` and pass the result through ``clean_text``.
    4. Score the corpus with ``BM25Okapi`` using the comma-split keywords.
    5. Store the scores in a ``score`` column, sort descending and drop
       rows whose score is ``<= arg_dict['BM25_threshold']``.

    Returns:
        pandas.DataFrame: the surviving rows, highest score first.
    """
    print("data file to score: {}".format(data_to_score))
    print("query: {}".format(query))

    # Keep only the useful keywords of the query.
    query_keywords_list = build_keywords_list(
        arg_dict['stopwords_csv'], arg_dict['language'], str(query))

    df = pd.read_csv(data_to_score)

    text_columns = ('Titre', 'Descriptif', 'Description', 'Prix',
                    'Categorie', 'Etiquette')
    corpus = []
    for position in range(len(df)):
        row = df.iloc[position]
        line_for_corpus = ' '.join(
            str(row[column]).lower() for column in text_columns)
        corpus.append(clean_text(line_for_corpus))

    # NOTE(review): if BM25Okapi is rank_bm25's class it expects each
    # document to be a list of tokens; whether clean_text returns tokens or
    # a plain string is not visible here -- confirm.
    bm25 = BM25Okapi(corpus)
    tokenized_query = query_keywords_list.split(",")
    df['score'] = bm25.get_scores(tokenized_query)

    df = df.sort_values(by=['score'], ascending=False)

    # Drop everything at or below the configured threshold.
    below_threshold = df[df['score'] <= arg_dict['BM25_threshold']].index
    df.drop(below_threshold, inplace=True)
    return df
def assignments(self):
    """Return the assignments of the current course, keyed by assignment id.

    Each value holds ``name`` (cleaned), ``pointsPossible``,
    ``assignmentGroup`` and ``groupSet`` exactly as delivered by the query.

    The lookup is best-effort: if the query or the processing of its result
    fails, whatever was collected so far (possibly nothing) is returned
    instead of raising. The mapping is stored in ``self.cache`` under
    ``"assn_<course_id>"`` *before* the query runs, so a failed lookup is
    not retried on later calls.

    Changes compared with the previous implementation:

    * ``except Exception`` instead of a bare ``except`` so that
      ``KeyboardInterrupt`` / ``SystemExit`` are no longer swallowed, and
      the failure is logged rather than discarded silently.
    * Every entry is built completely before it is inserted, so a malformed
      node can no longer leave a half-filled ``{}`` entry behind.
    """
    import logging

    cache_key = "assn_" + str(self.course_id)
    if cache_key not in self.cache:
        entries = self.cache[cache_key] = {}
        try:
            nodes = self.query(
                'query MyQuery {course(id: "' + str(self.course_id) +
                '") {assignmentsConnection {nodes {_id name pointsPossible assignmentGroup {_id name} groupSet {_id}}}}}'
            )["data"]["course"]["assignmentsConnection"]["nodes"]
            for node in nodes:
                entries[node["_id"]] = {
                    "name": func.clean_text(node["name"]),
                    "pointsPossible": node["pointsPossible"],
                    "assignmentGroup": node["assignmentGroup"],
                    "groupSet": node["groupSet"],
                }
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not load assignments for course %s",
                self.course_id, exc_info=True)
    return self.cache[cache_key]
def request_from_db(user_config_json, keywords_filter_list, knowledges_csv):
    """Fetch the ``text/html`` seeds matching a keyword filter from the API.

    Workflow:

    1. Load the user profile with ``load_user_config`` and authenticate
       against ``<API_URI>/auth/token``.
    2. Read ``knowledges_csv`` (columns used: ``knowledge_id``, ``seed_id``,
       ``skill_id``) and request ``<API_URI>/seeds`` filtered by
       ``keywords_filter_list`` and by every knowledge id of the CSV.
    3. Collect the ``text/html`` results into parallel lists.

    The process is terminated with exit status 1 when either HTTP call does
    not answer with status 200.

    Args:
        user_config_json: passed to ``load_user_config``; the resulting
            mapping must provide ``API_URI``, ``API_USR`` and ``API_PWD``.
        keywords_filter_list: string inserted verbatim as the ``filter``
            query parameter.
        knowledges_csv: path or buffer accepted by ``pandas.read_csv``.

    Returns:
        dict: column name -> list. All lists have one entry per ``text/html``
        seed, at the same position. ``seed_url`` holds the URL built from
        the first CSV row whose ``seed_id`` equals the seed's id, or ``None``
        when the CSV has no such row. (Previously zero or several URLs were
        appended per seed, which shifted ``seed_url`` against the other
        columns.)
    """
    import sys

    # Load the user profile.
    user_config = load_user_config(user_config_json)
    api_uri = user_config['API_URI']
    app_url = api_uri.replace('api', 'app')

    # Authenticate the user.
    res = requests.post(api_uri + '/auth/token',
                        json={
                            'email': user_config['API_USR'],
                            'password': user_config['API_PWD']
                        })
    # Check that the authentication succeeded.
    if res.status_code != 200:
        print('request::request_from_db: Error, not connected during POST')
        sys.exit(1)

    # Keep the JWT token in memory.
    auth_header = {'Authorization': 'Bearer ' + res.json()['token']}

    df_knowledges = pd.read_csv(knowledges_csv,
                                encoding='UTF-8',
                                delimiter=',')
    knowledge_id_list = ",".join(df_knowledges['knowledge_id'])
    requesting_string = (api_uri + '/seeds?filter=' + keywords_filter_list +
                         '&page_size=1000&knowledge=' + knowledge_id_list)
    print('request::request_from_db: ligne de commande de la requete: {}\n'.
          format(requesting_string))

    # Call the API.
    res = requests.get(requesting_string, headers=auth_header)
    if res.status_code != 200:
        print('request::request_from_db: An error occurred during GET')
        sys.exit(1)
    res_json = res.json()

    # seed id -> position of its first row in the CSV, built once instead of
    # rescanning the whole CSV for every seed.
    seed_ids = df_knowledges['seed_id'].tolist()
    skill_ids = df_knowledges['skill_id'].tolist()
    knowledge_ids = df_knowledges['knowledge_id'].tolist()
    first_row_of_seed = {}
    for row, seed_id in enumerate(seed_ids):
        first_row_of_seed.setdefault(seed_id, row)

    # Build the result dictionary.
    dict_result = {
        column: []
        for column in ('id', 'owner', 'label', 'value_original',
                       'value_clean_tokens', 'value_clean', 'type',
                       'duration', 'WatsonDocumentID', 'createdAt',
                       'updatedAt', 'seed_url')
    }
    for seed in res_json['result']:
        if seed['type'] != 'text/html':
            continue

        dict_result['id'].append(seed['id'])
        dict_result['owner'].append(seed['owner'])
        dict_result['label'].append(seed['label'])
        dict_result['value_original'].append(seed['value'])

        value_clean_tokens = tokenize_cleaned_text(clean_text(seed['value']))
        dict_result['value_clean_tokens'].append(value_clean_tokens)
        dict_result['value_clean'].append(" ".join(value_clean_tokens))

        dict_result['type'].append(seed['type'])
        dict_result['duration'].append(seed['duration'])
        dict_result['WatsonDocumentID'].append(seed['WatsonDocumentID'])
        dict_result['createdAt'].append(seed['createdAt'])
        dict_result['updatedAt'].append(seed['updatedAt'])

        # Exactly one seed_url per seed keeps every column aligned.
        row = first_row_of_seed.get(seed['id'])
        if row is None:
            seed_url = None
        else:
            seed_url = (app_url + '/learn/' + skill_ids[row] +
                        '/knowledge/' + knowledge_ids[row] + '/seed/' +
                        seed_ids[row])
        dict_result['seed_url'].append(seed_url)

    return dict_result
def students_comprehensive(self):
    """Return detailed records for the students of the current course.

    The result maps user id -> dict with ``name``, ``email``, ``sisId``,
    ``first_name``, ``last_name`` and ``sections`` (section ids of every
    student enrollment of that user). When ``self.groupset_id`` is not ``0``
    and the student belongs to a group of ``self.groups()``, the record also
    gets ``group_id``, ``group_name`` and ``peers`` (the other user ids of
    that group).

    A blank e-mail (``None`` or ``""``) is rebuilt from
    ``globals.config["EmailFormat"]`` (if that is not empty) by replacing the
    placeholders ``[sisId]``, ``[first_name]`` and ``[last_name]``.

    The result is memoised in ``self.cache`` under
    ``"students_<course_id>_<groupset_id>"``.

    Fixes compared with the previous implementation:

    * A missing placeholder value (``None``) is now replaced by ``""``.
      ``str(None) or ""`` always produced the text ``"None"``.
    * Single-word names keep their last character in ``first_name``
      (``find(" ")`` returning ``-1`` used to slice it off).
    * Group membership is resolved through a member -> group map built
      once, instead of rescanning every group for every student.
    """
    print("Course ID is: " + str(self.course_id))
    cache_key = ("students_" + str(self.course_id) + "_" +
                 str(self.groupset_id))
    if cache_key in self.cache:
        return self.cache[cache_key]

    enrollments = self.query(
        'query MyQuery {course(id: "' + str(self.course_id) +
        '") {enrollmentsConnection {nodes {type\nuser {_id\nname\nemail\nsisId}\nsection {_id}}}}}'
    )["data"]["course"]["enrollmentsConnection"]["nodes"]

    def placeholder_text(value):
        # Text inserted for a placeholder; nothing at all for None.
        return "" if value is None else str(value)

    student_series = {}
    for enrollment in enrollments:
        if enrollment["type"] != "StudentEnrollment":
            continue
        user = enrollment["user"]
        user_id = user["_id"]
        if user_id not in student_series:
            full_name = user["name"]
            record = {
                "name": func.clean_text(full_name),
                "email": user["email"],
                "sisId": user["sisId"],
                "first_name": func.clean_text(full_name.partition(" ")[0]),
                "last_name": func.clean_text(full_name.rpartition(" ")[2]),
                "sections": [],
            }
            # Recreate the e-mail address from the template if it is blank.
            if (record["email"] is None or record["email"] == ""
                ) and globals.config["EmailFormat"] != "":
                email = globals.config["EmailFormat"]
                email = email.replace("[sisId]",
                                      placeholder_text(record["sisId"]))
                email = email.replace("[first_name]",
                                      placeholder_text(record["first_name"]))
                email = email.replace("[last_name]",
                                      placeholder_text(record["last_name"]))
                record["email"] = email
            student_series[user_id] = record
        # One enrollment row per section, so a user can appear repeatedly.
        student_series[user_id]["sections"].append(
            enrollment["section"]["_id"])

    # Only look for groups if a group set has been specified.
    if self.groupset_id != 0:
        groups = self.groups()
        # First group listing a user wins, matching the former linear scan.
        group_of_member = {}
        for group_id, group in groups.items():
            for member in group["users"]:
                group_of_member.setdefault(member, group_id)

        for student_id, record in student_series.items():
            group_id = group_of_member.get(student_id)
            if group_id is None:
                continue
            record["group_id"] = group_id
            record["group_name"] = func.clean_text(groups[group_id]["name"])
            record["peers"] = [
                member for member in groups[group_id]["users"]
                if member != student_id
            ]

    self.cache[cache_key] = student_series
    return student_series