# NOTE(review): this top-level `__init__` is byte-identical to
# goal_classifier.__init__ defined below and sits outside any class, so it is
# a module-level function named `__init__` that nothing can sensibly call.
# It looks like a stray duplicate left behind by an edit — confirm and remove.
def __init__(self, port, classifier_file='data/classifiers_new_1.gzip'):
    """Load the pickled classifier bundle and create the SCNLP web client.

    Unpacks four objects from the gzip-compressed pickle at `classifier_file`:
    classifiers, vectorizers, dict-vectorizers, and the column/class names.
    `port` is forwarded to SCNLPWebClient.
    """
    LOG.info("Loading classifier ...")
    # NOTE(review): pickle.load on a file path taken as a parameter — safe only
    # if the classifier file is trusted; pickle executes arbitrary code on load.
    f = gzip.open(classifier_file)
    self.clfs, self.vs, self.dvs, self.columns = pickle.load(f)
    f.close()
    # self.nested_function_count = 0
    LOG.info("Done Loading classifier ...")
    self.client = SCNLPWebClient(port)
class goal_classifier:
    """Classifies user goal text and extracts subject and date entities.

    Combines pickled scikit-learn classifiers (per-column probability
    predictions), a TF-IDF nearest-subject lookup, and a Stanford CoreNLP web
    client (NER / NormalizedNER) for date extraction.

    NOTE(review): this class reads and rebuilds module-level globals
    (subj_ids, subjects, subj_json_obj, subj_dict, sv, sX, uID, scnlp_lock,
    updateLists) that are defined elsewhere in the file — verify against the
    surrounding module.
    """

    def __init__(self, port, classifier_file='data/classifiers_new_1.gzip'):
        """Load the pickled classifier bundle and create the SCNLP web client.

        The gzip-compressed pickle yields (clfs, vs, dvs, columns):
        per-class classifiers, text vectorizers, feature dict-vectorizers,
        and the class-column names iterated by process_object.
        """
        LOG.info("Loading classifier ...")
        f = gzip.open(classifier_file)
        self.clfs, self.vs, self.dvs, self.columns = pickle.load(f)
        f.close()
        # self.nested_function_count = 0
        LOG.info("Done Loading classifier ...")
        self.client = SCNLPWebClient(port)
        # self.client.connect_to_server()

    # @app.route('/addNewSubject', methods=['POST'])
    def addNewSubject(self, params):
        """Add a subject (English + Chinese title) and rebuild the TF-IDF index.

        `params` is a dict with keys 'id', 'english_subject',
        'chinese_subject', 'password'. The password must equal the module
        global `uID`. On success the subject is appended to `subj_json_obj`,
        persisted to ./data/subjects-prod.json, and the module-level TF-IDF
        vectorizer/matrix (sv, sX) are refit over all subject names plus
        abbreviations loaded from the CSV.

        Returns the str() of a status dict (not JSON — single quotes).
        """
        sid = params['id']
        subj_eng = params['english_subject']
        subj_chin = params['chinese_subject']
        password = params['password']
        global subj_ids, subjects, subj_json_obj, uID
        if password != uID:
            return_obj = {'status': 'Password incorrect!'}
            return str(return_obj)
        temp = {}
        if sid not in subj_ids:
            temp['_id'] = sid
            temp['title'] = {}
            temp['title']['en'] = subj_eng
            temp['title']['zh'] = subj_chin
            # subj_ids.append(temp['_id'])
            # subj_ids.append(temp['_id'])
            # subjects.append(temp['title']['en'])
            # subjects.append(temp['title']['zh'])
            subj_json_obj.append(temp)
            filename = "./data/subjects-prod.json"
            with open(filename, 'w') as outfile:
                json.dump(subj_json_obj, outfile)
            # Rebuild the parallel id/name lists from scratch; each subject id
            # appears once per non-None title language.
            subj_ids = []
            subjects = []
            for obj in subj_json_obj:
                if obj['title']['en'] != None:
                    subj_ids.append(obj['_id'])
                    subjects.append(obj['title']['en'])
                if obj['title']['zh'] != None:
                    subj_ids.append(obj['_id'])
                    subjects.append(obj['title']['zh'])
            filename = 'Search Phrases - The Graduate - Subjects + Abbrev..csv'
            subjects.extend(updateLists(filename, 1, 1))  # To load abbreviations
            # Refit the subject matcher over the updated name list.
            temp_sv = TfidfVectorizer(ngram_range=(1,3))
            temp_Sx = temp_sv.fit_transform(subjects)
            global sv, sX
            sv = temp_sv
            sX = temp_Sx
            return_obj = {'status': "Subject added!"}
            return str(return_obj)
        return_obj = {'status': "Subject already exists!"}
        return str(return_obj)

    # @app.route('/removeSubject', methods=['POST'])
    def removeSubject(self, params):
        """Remove the subject with id params['id'] and rebuild the TF-IDF index.

        Mirrors addNewSubject: password-checked against `uID`, persists
        `subj_json_obj` to disk, then rebuilds subj_ids/subjects/sv/sX.
        Returns the str() of a status dict.
        """
        sid = params['id']
        # subj_eng = request.form['english_subject']
        # subj_chin = request.form['chinese_subject']
        password = params['password']
        global subj_ids, subjects, uID
        if password != uID:
            return_obj = {'status': 'Password incorrect!'}
            return str(return_obj)
        temp = {}
        if sid in subj_ids:
            # temp_esubject = ""
            # temp_csubject = ""
            # temp_id = ""
            # Drop the first matching entry in place (no global declaration is
            # needed for in-place mutation of subj_json_obj).
            for i in range(len(subj_json_obj)):
                if subj_json_obj[i]['_id'] == sid:
                    # temp_id = subj_json_obj[i]['_id']
                    # temp_esubject = subj_json_obj[i]['title']['en']
                    # temp_csubject = subj_json_obj[i]['title']['zh']
                    subj_json_obj.pop(i)
                    break
            # subjects = subjects.remove(temp_esubject)
            # subjects = subjects.remove(temp_csubject)
            # subj_ids = list(val for val in subj_ids if val != temp_id)
            filename = "./data/subjects-prod.json"
            with open(filename, 'w') as outfile:
                json.dump(subj_json_obj, outfile)
            subj_ids = []
            subjects = []
            for obj in subj_json_obj:
                if obj['title']['en'] != None:
                    subj_ids.append(obj['_id'])
                    subjects.append(obj['title']['en'])
                if obj['title']['zh'] != None:
                    subj_ids.append(obj['_id'])
                    subjects.append(obj['title']['zh'])
            filename = 'Search Phrases - The Graduate - Subjects + Abbrev..csv'
            subjects.extend(updateLists(filename, 1, 1))  # To load abbreviations
            temp_sv = TfidfVectorizer(ngram_range=(1,3))
            temp_Sx = temp_sv.fit_transform(subjects)
            global sv, sX
            sv = temp_sv
            sX = temp_Sx
            return_obj = {'status': "Subject removed!"}
            return str(return_obj)
        return_obj = {'status': "Subject doesn't exist!"}
        return str(return_obj)

    def getDistances(self, query, threshold, entity_list, vectorizer, transformed):
        """Return the entity in `entity_list` nearest to `query` by cosine distance.

        `vectorizer`/`transformed` are a fitted TfidfVectorizer and its
        transformed matrix over `entity_list`. Returns (entity, distance) when
        the best distance is within `threshold`, else the pair ("None", 1).

        NOTE(review): on a hit the second element is a 1-element distance
        array, on a miss it is the int 1 — callers only use the first element,
        but the inconsistent type is worth confirming.
        """
        q = vectorizer.transform([query])
        distances = pairwise_distances(transformed, q, metric='cosine')
        min_index, min_value = min(enumerate(distances), key=operator.itemgetter(1))
        if min_value[0] <= threshold:
            return entity_list[min_index], min_value
        else:
            return "None", 1

    def get_proper_date(self, normalized_ner, date_string):
        """Turn a CoreNLP NormalizedNER value into a concrete datetime.date.

        Tries, in order: full YYYY-MM-DD; YYYY-MM (day from a 'P1D-#N'
        duration marker if present); XXXX-MM-DD (year scraped from the raw
        `date_string`); and YYYY-Www ISO-week forms.

        Returns (date, failed) where `failed` is True when no pattern matched
        (the caller uses it to trigger a "... from now" re-query). Defaults
        are today's date with the month advanced by one — presumably a
        "next month" fallback; TODO confirm that intent.
        """
        year = datetime.date.today().year
        month = datetime.date.today().month
        month = month + 1
        failed = True
        if month > 12:
            month = 1
            year = year + 1
        day = datetime.date.today().day
        try:
            # A "start,end" range: keep only the end date.
            m = re.search("(\d\d\d\d-\d\d-\d\d),(\d\d\d\d-\d\d-\d\d)", normalized_ner)
            if m:
                normalized_ner = m.group(2)
                # print("Modified it to %s" % normalized_ner)
            m = re.search('(\d\d\d\d)-(\d\d)-(\d\d)', normalized_ner)
            if m:
                year = int(m.group(1))
                month = int(m.group(2))
                day = int(m.group(3))
                failed = False
            elif not re.search('(\d\d\d\d)-(\d\d)', normalized_ner) == None:
                m = re.search('(\d\d\d\d)-(\d\d)', normalized_ner)
                # day is still missing
                year = int(m.group(1))
                month = int(m.group(2))
                m = re.search('P1D-#(\d+)', normalized_ner)
                failed = False
                if m:
                    day = int(m.group(1))
            elif not re.search('XXXX-(\d\d)-(\d\d)', normalized_ner) == None:
                m = re.search('XXXX-(\d\d)-(\d\d)', normalized_ner)
                month = int(m.group(1))
                day = int(m.group(2))
                # year is still missing
                m = re.search('\d\d\d\d', date_string)
                failed = False
                if m:
                    year = int(m.group(0))
            elif re.search('(\d\d\d\d)-W(\d+)', normalized_ner):
                m = re.search('(\d\d\d\d)-W(\d+)', normalized_ner)
                week_no = max(0, int(m.group(2))-1)
                year = int(m.group(1))
                # %W/%w: Sunday (day 0) of the given week number.
                dt = datetime.datetime.strptime('%d-W%d' % (year, week_no) + '-0', "%Y-W%W-%w")
                month = dt.month
                day = dt.day
                failed = False
        except Exception as ex:
            # NOTE(review): swallows parse errors and returns the default date
            # with failed=True — deliberate best-effort, kept as-is.
            print(traceback.format_exc())
        return datetime.date(year,month, day), failed

    def pre_process_text(self, t):
        """Mask grades, percentages, numbers, and the word 'subject' with
        placeholder tokens before vectorization/classification."""
        t = re.sub(" [A-F] ", " __GRADE__ ", t)
        t = re.sub("\d+%", " __PERCENTAGE__ ", t)
        t = re.sub("\d+", " __NUMBER__ ", t)
        # t = t.replace("I want to", "")
        # t = t.replace("I want ", "")
        t = t.replace("subject", " __SUBJECT__ ")
        return t

    def process_object(self, input_object):
        """Classify each {'text': ...} item and attach date/subject extraction.

        For each item: preprocess the text, run it through the SCNLP client
        (under scnlp_lock), build per-class feature vectors (TF-IDF hstacked
        with dependency features), and predict a probability per class column.
        The ORIGINAL text (t1) is used for date and subject extraction.

        Returns a list of dicts with keys 'classes', 'date', 'subject'.
        """
        output_object = []
        for io in input_object:
            this_output = {}
            this_output_classes = []
            t = io['text']
            LOG.info("Processing text: %s" % t)
            t1 = '%s' % t  # keep the raw text for get_dates/get_subjects
            t = self.pre_process_text(t)
            print(t)
            with scnlp_lock:
                data = self.client.process_text(t)
            data = self.fix_data(data)
            for c in self.columns:
                X_test = self.vs[c].transform([t])
                X_test_feats = self.dvs[c].transform([self.get_features(t, data)])
                # print("tyring hstack")
                X_test = hstack([X_test, X_test_feats])
                # print("done hstack")
                pred = self.clfs[c].predict(X_test.toarray())
                this_output_classes.append({'class': c, 'probability': float(pred[0])})
            this_output["classes"] = this_output_classes
            this_output["date"] = self.get_dates(t1)
            this_output["subject"] = self.get_subjects(t1)
            output_object.append(this_output)
        return output_object

    def fix_data(self, data):
        """Normalize the SCNLP response in place so that 'sentence' and each
        sentence's 'token' entry are always lists (the XML-to-dict conversion
        yields a bare dict when there is only one element)."""
        if type(data['root']['document']['sentences']['sentence']).__name__ == "dict":
            data['root']['document']['sentences']['sentence'] = [data['root']['document']['sentences']['sentence']]
        for s in data['root']['document']['sentences']['sentence']:
            if type(s['tokens']['token']).__name__ == 'dict':
                s['tokens']['token'] = [s['tokens']['token']]
        return data

    def findV(self, data, verb, sentence_no):
        """Return the first word in sentence `sentence_no` whose POS tag equals
        `verb` (e.g. 'VB'), or None implicitly when no token matches."""
        for x in data['root']['document']['sentences']['sentence'][sentence_no]['tokens']['token']:
            if x['POS'] == verb:
                word = x['word']
                return word

    def findDep(self, data, word, sentence_no):
        """Return [{'type', 'word'}] for every dependency in sentence
        `sentence_no` whose governor text equals `word`.

        NOTE(review): only dependencies[0] (presumably the basic-dependencies
        parse) is inspected — confirm against the SCNLP output format.
        """
        deps = []
        for x in data['root']['document']['sentences']['sentence'][sentence_no]['dependencies'][0]['dep']:
            if x['governor']['#text'] == word:
                d = {"type": x['@type'], "word": x['dependent']['#text']}
                deps.append(d)
        return deps

    # def process_text(self, text, data):
    #     # data = self.client.process_text(text)
    #     return self.fix_data(data)

    def process_and_get_features(self, text, data):
        """For each sentence, find a main verb ('VB', falling back to 'VBP'
        when the 'VB' verb yields no dependencies) and collect its
        dependencies. Returns a list of {'verb', 'dep'} dicts."""
        # data = self.process_text(text, data)
        ls = []
        for x in range(len(data['root']['document']['sentences']['sentence'])):
            d = {}
            d["verb"] = self.findV(data, 'VB', x)
            d["dep"] = self.findDep(data, d["verb"], x)
            if len(d['dep']) < 1:
                d["verb"] = self.findV(data, 'VBP', x)
                d["dep"] = self.findDep(data, d["verb"], x)
            ls.append(d.copy())
        return ls

    def get_features(self, t, data):
        """Flatten the per-sentence verb/dependency structure into a binary
        feature dict: main_verb_<verb>=1 and <deptype>_<word>=1."""
        sentence_features = self.process_and_get_features(t, data)
        feats = {}
        for sf in sentence_features:
            feats['main_verb_%s' % sf['verb']] = 1
            for d in sf['dep']:
                feats['%s_%s' % (d['type'], d['word'])] = 1
        return feats

    def get_subjects(self, query):
        """Match `query` against the subject TF-IDF index.

        Returns {'subject': {'name', 'id'}} (with the string "None" for both
        when nothing matches within the distance threshold), the string '{}'
        for exit/empty queries, or None when an exception is swallowed.

        NOTE(review): the three return shapes (dict / '{}' string / implicit
        None) are inconsistent — confirm what callers expect before changing.
        """
        global subj_ids, subjects, sv, sX
        # print(subjects)
        check = 0
        # print(query)
        if query.strip() == 'Exit' or query.strip() == 'exit' or query.strip() == 'q' or query.strip() == 'Q' or query.strip() is None or len(query.strip()) == 0:
            check = 1
            # print("No or too less data received")
            return '{}'
        if check == 0:
            # Strip punctuation that would pollute the TF-IDF match.
            query = query.replace('.', '')
            query = query.replace(',', '')
            query = query.replace('!', '')
            query = query.replace('?', '')
            try:
                # Threshold 0.604324 — presumably tuned empirically; TODO confirm.
                sName, sValue = self.getDistances(query, 0.604324, subjects, sv, sX)
                # names_list.append(sName)
                json_obj = {}
                json_obj['subject'] = {}
                if sName in subj_dict:
                    # Means that abbreviation was matched instead of actual subject name
                    actual_subject = subj_dict[sName]
                    json_obj['subject']['name'] = actual_subject
                    json_obj['subject']['id'] = subj_ids[subjects.index(actual_subject)]
                elif sName in subjects:
                    # Means actual subject name was matched, no abbreviation
                    json_obj['subject']['name'] = sName
                    json_obj['subject']['id'] = subj_ids[subjects.index(sName)]
                else:
                    # Means no subject was matched at all
                    json_obj['subject']['name'] = "None"
                    json_obj['subject']['id'] = "None"
                # print(json_obj)
                return json_obj
            except Exception as e:
                print("An exception occurred: ", e)

    def get_dates(self, query, recurse = True):
        """Extract a normalized date from `query` via SCNLP NER.

        Collects DATE/DURATION tokens and their NormalizedNER values, resolves
        them with get_proper_date, and post-processes "start/end of
        week/month/year" phrases. When normalization failed and `recurse` is
        True, retries once with " from now" appended after the date phrase
        (dropping "next").

        Returns a dict {'normalized_date_string', 'normalized_date'} for a
        single sentence, a list of such dicts (plus 'key') for multiple
        sentences, '{}' for exit/empty queries, 'None' when no date is found,
        or None when the broad except fires.

        NOTE(review): in the multi-sentence (list) branch only `date` is
        assigned, but the recurse/start/end logic reads `date_string`, and
        output['normalized_date_string'] indexes a LIST — both raise and are
        swallowed by the outer except (returning None). Looks like the
        post-processing was only ever exercised on single-sentence input;
        confirm before fixing, since callers may rely on the None result.
        """
        check = 0
        if query.strip() == 'Exit' or query.strip() == 'exit' or query.strip() == 'q' or query.strip() == 'Q' or query.strip() is None or len(query.strip()) == 0:
            # print("No or too less data received")
            check = 1
            return '{}'
        if check == 0:
            query = query.replace('.','')
            query = query.replace(',','')
            query = query.replace('!','')
            query = query.replace('?','')
            try:
                with scnlp_lock:
                    processed_text = self.client.process_text(query)
                xml_obj = processed_text['root']['document']['sentences']['sentence']
                # print(xml_obj)
                output = {}
                normalized_ner = ""
                dates = []
                if type(xml_obj) is dict:
                    # Single sentence: token may itself be a list or a dict.
                    temp_obj = processed_text['root']['document']['sentences']['sentence']['tokens']['token']
                    if type(temp_obj) is list:
                        for i in range(len(temp_obj)):
                            if 'NormalizedNER' in temp_obj[i]:
                                if temp_obj[i]['NER'] == 'DATE' or temp_obj[i]['NER'] == 'DURATION':
                                    normalized_ner = str(temp_obj[i]['NormalizedNER'])
                            if temp_obj[i]['NER'] == 'DATE' or temp_obj[i]['NER'] == 'DURATION':
                                dates.append(str(temp_obj[i]['word']))
                    else:
                        if 'NormalizedNER' in temp_obj:
                            if temp_obj['NER'] == 'DATE' or temp_obj['NER'] == 'DURATION':
                                normalized_ner = temp_obj['NormalizedNER']
                        if temp_obj['NER'] == 'DATE' or temp_obj['NER'] == 'DURATION':
                            dates.append(temp_obj['word'])
                    date_string = ' '.join(dates)
                    timestamp, failed = self.get_proper_date(normalized_ner, date_string)
                    json_obj = {}
                    try:
                        # Prefer the raw NormalizedNER string when it is a
                        # clean YYYY-MM-DD; otherwise fall back to the
                        # resolved timestamp.
                        r = datetime.datetime.strptime(normalized_ner, "%Y-%m-%d")
                        json_obj['normalized_date_string'] = normalized_ner
                    except:
                        json_obj['normalized_date_string'] = str(timestamp)
                    json_obj['normalized_date'] = str(timestamp)
                    output = json_obj
                elif type(xml_obj) is list:
                    # Multiple sentences: one result dict per sentence that
                    # carried a NormalizedNER value.
                    output = []
                    for j in range(len(xml_obj)):
                        temp_obj = processed_text['root']['document']['sentences']['sentence'][j]['tokens']['token']
                        if type(temp_obj) is list:
                            json_obj = {}
                            normalized_ner = None
                            for i in range(len(temp_obj)):
                                if 'NormalizedNER' in temp_obj[i]:
                                    if temp_obj[i]['NER'] == 'DATE' or temp_obj[i]['NER'] == 'DURATION':
                                        normalized_ner = str(temp_obj[i]['NormalizedNER'])
                                if temp_obj[i]['NER'] == 'DATE' or temp_obj[i]['NER'] == 'DURATION':
                                    dates.append(str(temp_obj[i]['word']))
                        else:
                            if temp_obj['NormalizedNER']:
                                if temp_obj['NER'] == 'DATE' or temp_obj['NER'] == 'DURATION':
                                    normalized_ner = str(temp_obj['NormalizedNER'])
                            if temp_obj['NER'] == 'DATE' or temp_obj['NER'] == 'DURATION':
                                dates.append(temp_obj['word'])
                        if normalized_ner is not None:
                            date = ' '.join(dates)
                            timestamp, failed = self.get_proper_date(normalized_ner, date)
                            json_obj['key'] = j
                            try:
                                r = datetime.datetime.strptime(normalized_ner, "%Y-%m-%d")
                                json_obj['normalized_date_string'] = normalized_ner
                            except:
                                json_obj['normalized_date_string'] = str(timestamp)
                            json_obj['normalized_date'] = str(timestamp)
                            output.append(json_obj)
                if output:
                    # print(date_string)
                    # current_date = time.strftime("%Y-%m-%d")
                    # print(current_date)
                    # print(output)
                    # print(output['normalized_date_string'])
                    # print(normalized_ner)
                    if recurse:
                        if failed and not output['normalized_date_string'] == "":
                            # Normalization failed (e.g. bare duration like
                            # "P3M"): splice " from now" after the last date
                            # word and retry once with recursion disabled.
                            # print(date_string)
                            var = query.find(date_string.split(" ")[-1]) + len(date_string.split(" ")[-1])
                            query_modified = query[:var]
                            query_modified += " from now"
                            query_modified += query[var:]
                            if query_modified.find('next') >= 0:
                                query_modified = query_modified.replace("next", "")
                            # print("Hehehuhu")
                            # print(query_modified)
                            output = self.get_dates(query_modified, recurse = False)
                        if date_string.find("start") >= 0:
                            # "start of week/month/year": snap to the first day.
                            day = output['normalized_date']
                            dt = datetime.datetime.strptime(day, "%Y-%m-%d")
                            if date_string.find("week") >= 0:
                                start = dt - timedelta(days = dt.weekday())
                                output['normalized_date'] = str(start.date())
                                output['normalized_date_string'] = str(start.date())
                            elif date_string.find("month") >= 0:
                                output['normalized_date'] = str((dt + relativedelta(day = 1)).date())
                                output['normalized_date_string'] = str((dt + relativedelta(day = 1)).date())
                            elif date_string.find("year") >= 0:
                                year = dt.year
                                output['normalized_date'] = str(year) + '-01-01'
                                output['normalized_date_string'] = str(year) + '-01-01'
                        elif date_string.find("end") >= 0:
                            # "end of week/month/year": snap to the last day.
                            day = output['normalized_date']
                            dt = datetime.datetime.strptime(day, "%Y-%m-%d")
                            if date_string.find("week") >= 0:
                                start = dt - timedelta(days = dt.weekday())
                                end = start + timedelta(days=6)
                                output['normalized_date'] = str(end.date())
                                output['normalized_date_string'] = str(end.date())
                            elif date_string.find("month") >= 0:
                                # relativedelta(day=31) clamps to the month's last day.
                                output['normalized_date'] = str((dt + relativedelta(day = 31)).date())
                                output['normalized_date_string'] = str((dt + relativedelta(day = 31)).date())
                            elif date_string.find("year") >= 0:
                                year = dt.year
                                output['normalized_date'] = str(year) + '-12-31'
                                output['normalized_date_string'] = str(year) + '-12-31'
                        return output
                    # if re.search('^P(\d+)D$', output['normalized_date_string']): #and output['normalized_date'] == current_date:
                    #     query = query + "from now"
                    #     if self.nested_function_count == 0:
                    #         self.nested_function_count += 1
                    #         self.get_dates(query)
                    # elif re.search('^P(\d+)W$', output['normalized_date_string']): #and output['normalized_date'] == current_date:
                    #     query = query + "from now"
                    #     if self.nested_function_count == 0:
                    #         self.nested_function_count += 1
                    #         self.get_dates(query)
                    # elif re.search('^P(\d+)M$', str(output['normalized_date_string'])): #and output['normalized_date'] == current_date:
                    #     print("I am here")
                    #     query = query + " from now"
                    #     print(query)
                    #     if self.nested_function_count == 0:
                    #         output = None
                    #         json_obj = None
                    #         self.nested_function_count = 1
                    #         output = self.get_dates(query)
                    # elif re.search('^P(\d+)Y$', output['normalized_date_string']): #and output['normalized_date'] == current_date:
                    #     query = query + "from now"
                    #     if self.nested_function_count == 0:
                    #         self.nested_function_count += 1
                    #         self.get_dates(query)
                    # else:
                    #     pass
                    # self.nested_function_count = 0
                    return output
                else:
                    return 'None'
            except Exception as e:
                print("An exception occurred: ", e)