Example #1
import numpy as np

# categorie_class, corpus, filters, simple_features and ig_based_non_uniform are
# assumed to be project-local modules/functions imported elsewhere.
def extract_features(qfile="question_train.csv",
                     qcatfile="question_category_train.csv",
                     catfile="category.csv",
                     subcats=False,
                     outfile="features.npz"):
    # loading the categories
    cats = categorie_class.categories()
    # initializing corpus
    corp = corpus.corpus(cats)
    # loading questions into corpus
    corp.load(qfile, qcatfile)
    # running filters on the raw questions
    sentence_filters = [filters.punctuation_filter]
    word_filters = [
        filters.small_word_filter, filters.stopword_filter,
        filters.stemming_filter
    ]
    corp.process(sentence_filters, word_filters)
    # saving corpus into pickle
    # pickle.dump(corp, "corpus.pkl")
    # selecting the term-space
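    # (ig_based_non_uniform presumably ranks candidate terms by information gain and keeps the top M)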
    term_space = ig_based_non_uniform(corp, M=2500, read_from_file=False)
    d = len(term_space)
    # create mappings from feature names to new ids and the inverse
    term_to_feature = {}
    feature_to_term = {}
    for i, term in enumerate(term_space):
        term_to_feature[term] = i
        feature_to_term[i] = term
    # creating feature and label arrays
    n = len(corp.tr_set)
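    # features is a d-by-n term/question matrix; categoryids stores one class id per question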
    features = np.zeros((d, n))
    categoryids = np.zeros(n)

    # we define new ids for the parent categories, consistent with the ones assigned in categoryids
    new_category_ids = {
        c: i for i, c in enumerate(corp.cats.all_names())
    }

    for j, q in enumerate(corp.tr_set):
        fe = simple_features(term_space, q["words"])
        for term, value in fe.items():
            i = term_to_feature[term]
            features[i, j] = value
        # the label depends only on the question, so assign it once per question
        categoryids[j] = new_category_ids[q["category"]]

    categories = {i: c for c, i in new_category_ids.items()}

    featurenames = [feature_to_term[i] for i in range(d)]

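    # note: np.savez stores the categories dict as a 0-d object array; after np.load it
    # needs allow_pickle=True and .item() to be read back as a dict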
    np.savez(outfile,
             features=features,
             featurenames=featurenames,
             categoryids=categoryids,
             categories=categories)
Example #2
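# (excerpt from a Flask app: the route decorator and the db_cate, cate, con_db,
#  request and jsonify objects are presumably defined or imported elsewhere)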
def update_categories():
    cn_db = db_cate.categories(con_db)
    data = request.json
    cate1 = cate.categories(data['category_id'], data['category_name'],
                            data['description'], data['picture'])
    rs = cn_db.update(cate1)
    return jsonify({'message': rs}), 200
Example #3
def train_apply_classifier(classifier='NaiveBayes',
                           qfile_train='question_train.csv',
                           qcatfile_train='question_category_train.csv',
                           catfile='category.csv',
                           qfile_test='question_test.csv',
                           subcats=False):
    """This method performs a parameter tuning using cross validation for the specified classfier.
    After the hyper-parameter(s) are selected it returns the predicted labes for the given test-set.
    Following 3 classifiers are known to the method:
        - "NaiveBayes" (default)
        - "LogisticRegression"
        - "RandomForest"
    """
    # initializing corpus
    corpus = corpus_class.corpus(categories.categories())
    corpus.load(qfile_train, qcatfile_train)
    filts = std_filters()
    corpus.process(corpus_size=-1, **filts)
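    # simple_split(0) presumably keeps every example in the training portion (no held-out split)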
    corpus.simple_split(0)

    #corpus = corpus_class.load_from_file()
    #corpus.simple_split(0)

    if classifier == 'NaiveBayes':
        clf_par = MultinomialNB_params(corpus)
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'LogisticRegression':
        clf_par = LogisticRegression_params()
        clf, feat_params = CV(corpus, *clf_par, n_folds=3)
    elif classifier == 'RandomForest':
        clf_par = RandomForest_params(corpus)
        clf, feat_params = CV(corpus,
                              *clf_par,
                              n_folds=3,
                              skipping_rule=RF_skipping_rule)
    else:
        raise ValueError(
            "The given classfier is not known to this method. Look up the doc to see which classfiers work."
        )

    # fitting on the entire training set
    corpus.simple_split(0)
    corpus.make_features(**feat_params)
    clf.fit(corpus.X_tr, corpus.y_tr)

    X_te = corpus.process_example(qfile_test)

    return clf.predict(X_te)
Example #4
import os
import pickle as pick

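# (the users, categories, expenses and payments classes used below are assumed to be
#  project-local and imported elsewhere in the original script)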
# Load the data into objects from the pickle serialization files for users.
if os.path.isfile("users.pickle"):
    with open('users.pickle', 'rb') as handle:
        obj_users = pick.load(handle)
else:
    obj_users = users()
dict_Users = obj_users.create_userDict()

# Load the data into objects from the pickle serialization files for categories.
if os.path.isfile("categories.pickle"):
    with open('categories.pickle', 'rb') as handle:
        obj_categories = pick.load(handle)
else:
    obj_categories = categories()
dict_categories = obj_categories.create_catDict()

# Load the data into objects from the pickle serialization files for expenses.
if os.path.isfile("expenses.pickle"):
    with open('expenses.pickle', 'rb') as handle:
        obj_expenses = pick.load(handle)
else:
    obj_expenses = expenses()
dict_expenses = obj_expenses.create_expenseDict()

# Load the data into objects from the pickle serialization files for payments.
if os.path.isfile("payments.pickle"):
    with open('payments.pickle', 'rb') as handle:
        obj_payments = pick.load(handle)
else:
    obj_payments = payments()
Example #5
    def do_app_categories(self):
        from categories import categories
        return categories()
Example #6
    def do_app_categories(self):
        # "logging" here is presumably a project-local module; the standard-library
        # logging module does not define TYPE_APP_CATEGORIES
        from logging import TYPE_APP_CATEGORIES
        self.setFeedType(TYPE_APP_CATEGORIES)
        from categories import categories
        return categories()
Example #7
def one_categories(cate_id):
    c = cate.categories(category_id=cate_id)
    rs = db_cate.categories(con_db).get_by_id(c)
    if rs[1] != 200:
        return jsonify({'message': rs[0]}), rs[1]
    return jsonify({'message': rs[0].to_json()}), 200
Example #8
    def do_app_categories(self):
        from categories import categories
        return categories()
Example #9
    def do_app_categories(self):
        from logging import TYPE_APP_CATEGORIES
        self.setFeedType(TYPE_APP_CATEGORIES)
        from categories import categories
        return categories()
Example #10
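# (excerpt: ann_classifier is assumed to be a Keras Sequential model whose earlier layers
#  were added above; output_dim, init and nb_epoch are Keras 1.x argument names --
#  Keras 2+ uses units, kernel_initializer and epochs)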
# Adding the output layer
ann_classifier.add(Dense(output_dim=1, init='uniform', activation='sigmoid'))

# Compiling the ANN
ann_classifier.compile(optimizer='adam',
                       loss='binary_crossentropy',
                       metrics=['accuracy'])

# Fitting the ANN to the Training set
ann_classifier.fit(x_train, y_train, batch_size=50, nb_epoch=20)

# Predicting the Test set results
y_pred_ann = ann_classifier.predict(x_test)
y_pred_ann = (y_pred_ann > 0.5)
y_pred_ann = np.transpose(y_pred_ann).astype('int32')

# Making the Confusion Matrix
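# note: this confusion_matrix is presumably a project-specific helper that also returns
# accuracy, precision and recall; sklearn's confusion_matrix returns only the matrix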
cm_ann, ann_accuracy, ann_precision, ann_recall = confusion_matrix(
    y_test, y_pred_ann)
print("Confusion Matrix (ANN):\n", cm_ann)

# Printing the Accuracy, Precision and Recall
print("Accuracy of ANN:", ann_accuracy)
print("Precision of ANN:", ann_precision)
print("Recall of ANN:", ann_recall)

from categories import categories

categories()
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'biography'
    data['_template'] = 'biography.html'

    # name and shortname
    data['shortname'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # authors
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)

    # last update
    data['update'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    data['summary'] = htmlparser.parse(datasheet['SUMMARY'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)

    # dates are tricky. for now leave them as they are
    data['birthdate'] = datasheet['BIRTHDATE']
    data['deathdate'] = datasheet['DEATHDATE']

    # birth and death year - remove the ,? if necessary
    date_pattern = re.compile(r'(\d+)(?:,\??)?')
    data['birthyear'] = re.sub(date_pattern, r'\1', datasheet['BIRTHYEAR'])
    data['deathyear'] = re.sub(date_pattern, r'\1', datasheet['DEATHYEAR'])

    # birthplace, deathplace
    data['birthplace'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE']))
    data['deathplace'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['DEATHPLACE']))

    # mapinfo - just take the name, ignore mapnum and lat/long
    mapinfo = re.compile(r'\d,(?P<name>.+?),(?:(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+))?')
    match = mapinfo.search(datasheet['MAPINFO'])
    # default to the empty string when no map location can be parsed
    data['maplocation'] = ''
    if match:
        data['maplocation'] = match.group('name')

    # country
    data['country'] = '--Unknown--'
    if datasheet['COUNTRY'].strip() != '':
        data['country'] = datasheet['COUNTRY']

        if data['country'] == 'Czech_Republic':
            data['country'] = 'Czech Republic'
        elif data['country'] == 'Sicily':
            data['country'] = 'Italy'
        elif data['country'].endswith(')'):
            data['country'] = data['country'][:-1]
        elif data['country'] == '':
            data['country'] = '--Unknown--'

        # also add countries to global array
        if data['country'] not in countries:
            countries.append(data['country'])

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'], datasheet['FILENAME'], url_context)
    data['references'] = flow.to_flow_block('reference', json.loads(references)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'], datasheet['FILENAME'], url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{'number':d['number'],'translation':d['reference']} for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse cross references
    #xrefs = referenceparser.parse_cross_references(datasheet['XREFS'], datasheet['FILENAME'])
    #data['xrefs'] = xrefs

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb', json.loads(additional)['data'])

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'], datasheet['FILENAME'], url_context)
    data['otherweb'] = flow.to_flow_block('otherweb', json.loads(otherweb)['data'])

    # parse honours links (they use the same format as cross references)
    honours = referenceparser.parse_cross_references(datasheet['HONOURS'], datasheet['FILENAME'], url_context)
    data['honours'] = flow.to_flow_block('otherweb', json.loads(honours)['data'])

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['BIOGRAPHY'],
                                datasheet['FILENAME'],
                                translations=json.loads(translations)['data'],
                                extras=json.loads(additional)['data'],
                                paragraphs=True,
                                url_context=url_context)

    # discover categories for this mathematician
    path = '/Biographies/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical tags for this mathematician
    displays = alphaindexparser.get_displays_2(datasheet['FILENAME'])
    if not displays:
        assert False
    displays = '\n'.join(displays)
    data['alphabetical'] = displays

    return data
Example #12
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'historytopic'
    data['_template'] = 'historytopic.html'

    # filename, short and full name, authors, update
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = htmlparser.parse(datasheet['FULLNAME'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # something about indexes, not sure how this is used yet
    data['indexref'] = datasheet['INDEXREF']
    data['indexreffile'] = datasheet['INDEXREFFILE']

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{
        'number': d['number'],
        'translation': d['reference']
    } for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])

    # parse history topic
    data['content'] = htmlparser.parse(
        datasheet['HISTTOPIC'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)

    # discover categories for this history topic
    path = '/HistTopics/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical index names for this history topic
    parsed_entries = []
    if 'INDEXNAMES' not in datasheet:
        if data['fullname'].strip() != '':
            parsed_entries.append(data['fullname'].strip())
        elif data['shortname'].strip() != '':
            parsed_entries.append(data['shortname'].strip())
        else:
            print('no names for this topic')
            assert False
    else:
        entries = datasheet['INDEXNAMES'].strip().split('\n')

        for entry in entries:
            entry = entry.strip()
            entry = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(entry))
            parsed_entries.append(entry)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data