def extract_features(qfile="question_train.csv",
                     qcatfile="question_category_train.csv",
                     catfile="category.csv",
                     subcats=False,
                     outfile="features.npz"):
    # loading the categories
    cats = categorie_class.categories()
    # initializing the corpus
    corp = corpus.corpus(cats)
    # loading questions into the corpus
    corp.load(qfile, qcatfile)
    # running filters on the raw questions
    sentence_filters = [filters.punctuation_filter]
    word_filters = [filters.small_word_filter,
                    filters.stopword_filter,
                    filters.stemming_filter]
    corp.process(sentence_filters, word_filters)
    # saving corpus into pickle
    # pickle.dump(corp, "corpus.pkl")
    # selecting the term space
    term_space = ig_based_non_uniform(corp, M=2500, read_from_file=False)
    d = len(term_space)
    # create mappings from term names to new feature ids and the inverse
    term_to_feature = {}
    feature_to_term = {}
    for i, term in enumerate(term_space):
        term_to_feature[term] = i
        feature_to_term[i] = term
    # creating the feature and label arrays
    n = len(corp.tr_set)
    features = np.zeros((d, n))
    categoryids = np.zeros(n)
    # we define new ids for the parent categories, consistent with the
    # ones assigned in categoryids
    number_of_cats = len(corp.cats.all_names())
    new_category_ids = {c: i for i, c in enumerate(corp.cats.all_names())}
    for j, q in enumerate(corp.tr_set):
        fe = simple_features(term_space, q["words"])
        for term, value in fe.items():
            i = term_to_feature[term]
            features[i, j] = value
        categoryids[j] = new_category_ids[q["category"]]
    categories = {i: c for c, i in new_category_ids.items()}
    featurenames = [feature_to_term[i] for i in range(d)]
    np.savez(outfile,
             features=features,
             featurenames=featurenames,
             categoryids=categoryids,
             categories=categories)
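# A hedged usage sketch: reading back the archive written by extract_features.
# Assumes the default outfile above; allow_pickle=True is required because
# the categories entry is a Python dict stored as an object array.
import numpy as np

npz = np.load("features.npz", allow_pickle=True)
features = npz["features"]              # shape (d, n): one column per question
featurenames = npz["featurenames"]      # the term behind each feature row
categoryids = npz["categoryids"]        # numeric label for each question
categories = npz["categories"].item()   # dict: label id -> category name

print(features.shape)
print(categories[int(categoryids[0])])  # category of the first question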
def update_categories():
    cn_db = db_cate.categories(con_db)
    data = request.json
    cate1 = cate.categories(data['category_id'], data['category_name'],
                            data['description'], data['picture'])
    rs = cn_db.update(cate1)
    return jsonify({'message': rs}), 200
def train_apply_classifier(classifier='NaiveBayes',
                           qfile_train='question_train.csv',
                           qcatfile_train='question_category_train.csv',
                           catfile='category.csv',
                           qfile_test='question_test.csv',
                           subcats=False):
    """Performs parameter tuning via cross-validation for the specified
    classifier. After the hyper-parameter(s) are selected, it returns the
    predicted labels for the given test set.

    The following three classifiers are known to the method:
    - "NaiveBayes" (default)
    - "LogisticRegression"
    - "RandomForest"
    """
    # initializing the corpus
    corpus = corpus_class.corpus(categories.categories())
    corpus.load(qfile_train, qcatfile_train)
    filts = std_filters()
    corpus.process(corpus_size=-1, **filts)
    corpus.simple_split(0)
    # corpus = corpus_class.load_from_file()
    # corpus.simple_split(0)
    if classifier == 'NaiveBayes':
        clf_par = MultinomialNB_params(corpus)
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'LogisticRegression':
        clf_par = LogisticRegression_params()
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'RandomForest':
        clf_par = RandomForest_params(corpus)
        clf, feat_params = CV(corpus, *clf_par, n_folds=3,
                              skipping_rule=RF_skipping_rule)
    else:
        raise ValueError("The given classifier is not known to this method. "
                         "See the docstring for the supported classifiers.")
    # fitting on the entire training set
    corpus.simple_split(0)
    corpus.make_features(**feat_params)
    clf.fit(corpus.X_tr, corpus.y_tr)
    X_te = corpus.process_example(qfile_test)
    return clf.predict(X_te)
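# A usage sketch (not from the source): file names are the defaults of the
# function above, and the helpers it calls (corpus_class, CV, ...) are
# assumed to be importable.
predicted_labels = train_apply_classifier(classifier='RandomForest',
                                          qfile_train='question_train.csv',
                                          qcatfile_train='question_category_train.csv',
                                          qfile_test='question_test.csv')
print(predicted_labels[:10])  # first ten predicted category ids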
import os
import pickle as pick

# Load the users from their pickle serialization file, or create fresh objects.
if os.path.isfile("users.pickle"):
    with open('users.pickle', 'rb') as handle:
        obj_users = pick.load(handle)
else:
    obj_users = users()
dict_Users = obj_users.create_userDict()

# Load the categories from their pickle serialization file, or create fresh objects.
if os.path.isfile("categories.pickle"):
    with open('categories.pickle', 'rb') as handle:
        obj_categories = pick.load(handle)
else:
    obj_categories = categories()
dict_categories = obj_categories.create_catDict()

# Load the expenses from their pickle serialization file, or create fresh objects.
if os.path.isfile("expenses.pickle"):
    with open('expenses.pickle', 'rb') as handle:
        obj_expenses = pick.load(handle)
else:
    obj_expenses = expenses()
dict_expenses = obj_expenses.create_expenseDict()

# Load the payments from their pickle serialization file, or create fresh objects.
if os.path.isfile("payments.pickle"):
    with open('payments.pickle', 'rb') as handle:
        obj_payments = pick.load(handle)
else:
    obj_payments = payments()
dict_payments = obj_payments.create_paymentDict()
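# The block above only restores state; a matching write-back step is assumed
# to exist elsewhere. A minimal sketch, mirroring the file names used above:
def save_state():
    with open('users.pickle', 'wb') as handle:
        pick.dump(obj_users, handle)
    with open('categories.pickle', 'wb') as handle:
        pick.dump(obj_categories, handle)
    with open('expenses.pickle', 'wb') as handle:
        pick.dump(obj_expenses, handle)
    with open('payments.pickle', 'wb') as handle:
        pick.dump(obj_payments, handle)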
def do_app_categories(self):
    from categories import categories
    return categories()
def do_app_categories(self):
    from logging import TYPE_APP_CATEGORIES
    self.setFeedType(TYPE_APP_CATEGORIES)
    from categories import categories
    return categories()
def one_categories(cate_id):
    c = cate.categories(category_id=cate_id)
    rs = db_cate.categories(con_db).get_by_id(c)
    if rs[1] != 200:
        return jsonify({'message': rs[0]}), rs[1]
    return jsonify({'message': rs[0].to_json()}), 200
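# Neither handler above shows its routing. A minimal registration sketch,
# assuming a standard Flask app; the URL layout is hypothetical, not taken
# from the source.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/categories', view_func=update_categories, methods=['PUT'])
app.add_url_rule('/categories/<int:cate_id>', view_func=one_categories,
                 methods=['GET'])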
# Adding the output layer
ann_classifier.add(Dense(units=1, kernel_initializer='uniform',
                         activation='sigmoid'))

# Compiling the ANN
ann_classifier.compile(optimizer='adam', loss='binary_crossentropy',
                       metrics=['accuracy'])

# Fitting the ANN to the Training set
ann_classifier.fit(x_train, y_train, batch_size=50, epochs=20)

# Predicting the Test set results
y_pred_ann = ann_classifier.predict(x_test)
y_pred_ann = (y_pred_ann > 0.5)
y_pred_ann = np.transpose(y_pred_ann).astype('int32')

# Making the Confusion Matrix
cm_ann, ann_accuracy, ann_precision, ann_recall = confusion_matrix(
    y_test, y_pred_ann)
print("Confusion Matrix (ANN):\n", cm_ann)

# Printing the Accuracy, Precision and Recall
print("Accuracy of ANN:", ann_accuracy)
print("Precision of ANN:", ann_precision)
print("Recall of ANN:", ann_recall)
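# Note: confusion_matrix above is not sklearn's (which returns only the
# matrix) but evidently a project helper returning four values. A plausible
# sketch of such a helper, assuming binary labels:
import numpy as np
from sklearn import metrics

def confusion_matrix(y_true, y_pred):
    y_pred = np.ravel(y_pred)  # accept the (1, n) transposed predictions
    cm = metrics.confusion_matrix(y_true, y_pred)
    accuracy = metrics.accuracy_score(y_true, y_pred)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    return cm, accuracy, precision, recall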
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'biography'
    data['_template'] = 'biography.html'

    # name and shortname
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # authors
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)

    # last update
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    data['summary'] = htmlparser.parse(datasheet['SUMMARY'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)

    # dates are tricky; for now leave them as they are
    data['birthdate'] = datasheet['BIRTHDATE']
    data['deathdate'] = datasheet['DEATHDATE']

    # birth and death year - remove the ",?" if necessary
    date_pattern = re.compile(r'(\d+)(?:,\??)?')
    data['birthyear'] = re.sub(date_pattern, r'\1', datasheet['BIRTHYEAR'])
    data['deathyear'] = re.sub(date_pattern, r'\1', datasheet['DEATHYEAR'])

    # birthplace, deathplace
    data['birthplace'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE']))
    data['deathplace'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['DEATHPLACE']))

    # mapinfo - just take the name, ignore mapnum and lat/long
    mapinfo = re.compile(
        r'\d,(?P<name>.+?),(?:(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+))?')
    match = mapinfo.search(datasheet['MAPINFO'])
    data['maplocation'] = ''
    if match:
        data['maplocation'] = match.group('name')

    # country
    data['country'] = '--Unknown--'
    if datasheet['COUNTRY'].strip() != '':
        data['country'] = datasheet['COUNTRY']
    if data['country'] == 'Czech_Republic':
        data['country'] = 'Czech Republic'
    elif data['country'] == 'Sicily':
        data['country'] = 'Italy'
    elif data['country'].endswith(')'):
        data['country'] = data['country'][:-1]
    elif data['country'] == '':
        data['country'] = '--Unknown--'

    # also add countries to the global array
    if data['country'] not in countries:
        countries.append(data['country'])

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse translations (they use the same format as references)
    # don't add them to data, as we're combining them with the biography
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{'number': d['number'],
                         'translation': d['reference']}
                        for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse cross references
    # xrefs = referenceparser.parse_cross_references(datasheet['XREFS'],
    #                                                datasheet['FILENAME'])
    # data['xrefs'] = xrefs

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with the biography
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])

    # parse honours links (they use the same format as cross references)
    honours = referenceparser.parse_cross_references(datasheet['HONOURS'],
                                                     datasheet['FILENAME'],
                                                     url_context)
    data['honours'] = flow.to_flow_block('otherweb',
                                         json.loads(honours)['data'])

    # parse the biography, and add in extras and translations
    data['content'] = htmlparser.parse(
        datasheet['BIOGRAPHY'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)

    # discover categories for this mathematician
    path = '/Biographies/%s' % datasheet['FILENAME']
    tags = []
    # with open('../datasheets/Indexes/data.json') as f:
    #     category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical tags for this mathematician
    displays = alphaindexparser.get_displays_2(datasheet['FILENAME'])
    if not displays:
        assert False
    displays = '\n'.join(displays)
    data['alphabetical'] = displays

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'historytopic'
    data['_template'] = 'historytopic.html'

    # filename, short and full name, authors, update
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = htmlparser.parse(datasheet['FULLNAME'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # something about indexes; not sure how this is used yet
    data['indexref'] = datasheet['INDEXREF']
    data['indexreffile'] = datasheet['INDEXREFFILE']

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse translations (they use the same format as references)
    # don't add them to data, as we're combining them with the topic text
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{'number': d['number'],
                         'translation': d['reference']}
                        for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])

    # parse the history topic
    data['content'] = htmlparser.parse(
        datasheet['HISTTOPIC'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)

    # discover categories for this history topic
    path = '/HistTopics/%s' % datasheet['FILENAME']
    tags = []
    # with open('../datasheets/Indexes/data.json') as f:
    #     category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical index names for this history topic
    parsed_entries = []
    if 'INDEXNAMES' not in datasheet:
        if data['fullname'].strip() != '':
            parsed_entries.append(data['fullname'].strip())
        elif data['shortname'].strip() != '':
            parsed_entries.append(data['shortname'].strip())
        else:
            print('no names for this topic')
            assert False
    else:
        entries = datasheet['INDEXNAMES'].strip().split('\n')
        for entry in entries:
            entry = entry.strip()
            entry = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(entry))
            parsed_entries.append(entry)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
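# Both convert() functions iterate categories.categories() the same way;
# from the lookups above, each entry must carry at least a 'name' and an
# 'entries' list. An illustration of that expected shape (values invented):
category_data = [
    {'name': 'Algebra',
     'entries': ['/Biographies/Galois', '/HistTopics/Abstract_groups']},
    {'name': 'Geometry',
     'entries': ['/Biographies/Euclid']},
]

path = '/Biographies/Galois'
tags = [c['name'] for c in category_data if path in c['entries']]
assert tags == ['Algebra']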