def xgboost_model(data, amlclf=None):
    # Assumes module-level imports/helpers defined elsewhere in this file:
    # BM25Transformer, TfidfVectorizer, tqdm, np, sys, xgb, decomposition,
    # preprocessing, namefilter, isReporter, get_indexes, multiclass_logloss,
    # plus the globals `byW2v` and `config`.
    model = {'data': data}
    if byW2v:
        model['w2v'] = config['load_w2v']()
    else:
        token = [
            cont for _, cont in
            [data['corpus'][idx] for idx, *_ in data['TrainData']]
        ]
        # We could append the title to each document, but skip it here to
        # retain the sentence structure. Stopwords are not removed from
        # `token`; the commented lines below would remove them at this stage.
        #stopwordTxt = os.path.join('..', 'data', 'stopword', "stopword.txt")
        #data['stop'] = set(open(stopwordTxt, "r").read().split())
        #token = [' '.join([t for t in doc.split() if t not in data['stop']]) for doc in token]
        model['bm25'] = BM25Transformer()
        model['vectorizer'] = TfidfVectorizer()
        model['vectorizer'].fit(token)
        TrainTf = model['vectorizer'].transform(tqdm(token))
        print("fitting bm25...", end='')
        sys.stdout.flush()
        model['bm25'].fit(TrainTf)
        print("transforming...", end='')
        model['TrainBm25'] = model['bm25'].transform(TrainTf)
        print("ok")

    # When an auxiliary classifier is supplied, reuse its fitted vector space.
    if amlclf:
        vectorizer = amlclf['vectorizer']
        bm25_model = amlclf['bm25']
    else:
        vectorizer = model['vectorizer']
        bm25_model = model['bm25']

    def fetch_sentence_by_winsize(cont, idx, size):
        # Tokens in a window of `size` around position `idx`, excluding `idx` itself.
        beg = lambda index, offset: max(0, index - offset)
        end = lambda index, offset: min(index + offset, len(cont))
        return cont[beg(idx, size):idx] + cont[idx + 1:end(idx, size) + 1]

    def fetch_sentence_by_punctuation(cont, idx):
        # Tokens between the '。' delimiters surrounding position `idx`.
        beg, end = idx, idx
        while cont[beg] != '。' and beg > 0:
            beg -= 1
        while cont[end] != '。' and end < len(cont) - 1:
            end += 1
        #print(''.join(cont[beg + 1:end]))
        #beg = max(idx - 7, beg)
        #end = min(idx + 7, end)
        return cont[beg + 1:end]

    model['namefilter'] = namefilter()
    model['reporter'] = set()

    def names_to_bm25(names, cont):
        names = model['namefilter'](names, cont)
        # Consider removing stopwords here as well.
        cont = cont.split()
        #cont = [t for t in cont if t not in data['stop']]
        people, description = [], []
        # Reporters are collected separately and excluded from the candidates.
        model['reporter'] |= set(
            [name for name in names if isReporter(name, cont)])
        names = [name for name in names if not isReporter(name, cont)]
        descs_of_name = lambda name, cont: \
            [fetch_sentence_by_winsize(cont, idx, 5)
             for idx in get_indexes(cont, name)]
            #[fetch_sentence_by_punctuation(cont, idx) for idx in get_indexes(cont, name)]
        #descs = {name: descs_of_name(name, cont) for name in names}
        name_descs = [(name, descs_of_name(name, cont)) for name in names]
        descs = {nm: dscs for nm, dscs in name_descs if dscs}
        if len(descs) == 0:
            print(names)
            print(name_descs)
            print(cont)
            return [], []
        if amlclf:
            # Re-tokenize each description with the auxiliary model's tokenizer.
            descs = {
                name: [amlclf['data']['config']['tokenize'](''.join(desc)).split()
                       for desc in descs[name]]
                for name in descs}
        if byW2v:
            base = np.zeros((model['w2v'].vector_size, ))
        else:
            base = np.zeros((len(vectorizer.idf_), ))
        if byW2v:
            if descs:
                # Average the summed word vectors of each description.
                people, description = map(list, zip(*[
                    (name,
                     np.sum([np.ravel(np.sum(
                         model['w2v'][[t for t in desc if t in model['w2v']]],
                         axis=0).sum(axis=0))
                         for desc in descs[name]], axis=0) / len(descs[name]))
                    for name in descs]))
            else:
                people, description = [], []
        else:  # bm25
            if descs:
                # Average the BM25 vectors of each description; `base` keeps the
                # shape well-defined and max(1, ...) avoids division by zero.
                people, description = map(list, zip(*[
                    (name,
                     np.sum([np.ravel(bm25_model.transform(
                         vectorizer.transform([' '.join(desc)])).sum(axis=0))
                         for desc in descs[name]] + [base],
                         axis=0) / max(1, len(descs[name])))
                    for name in descs]))
            else:
                people, description = [], []
        #pprint(list(zip(people, [np.sum(d) for d in description])))
        return people, description

    xtrain_tfv, ytrain, model['ntrain'] = map(list, zip(*[
        (desc, name in acc, name)
        for idx, acc, pred in data['TrainData']
        for _, cont in [data['corpus'][idx]]
        for name, desc in zip(*names_to_bm25(pred, cont))]))

    # start
    if byW2v:
        svd = decomposition.TruncatedSVD()
    else:
        svd = decomposition.TruncatedSVD(n_components=200)
    svd.fit(xtrain_tfv)
    xtrain_svd = svd.transform(xtrain_tfv)
    scl = preprocessing.StandardScaler()
    scl.fit(xtrain_svd)
    # Note: the scaled features are computed but the classifier is fit on the
    # unscaled SVD output.
    xtrain_svd_scl = scl.transform(xtrain_svd)
    clf = xgb.XGBClassifier(max_depth=7,
                            n_estimators=200,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            nthread=10,
                            learning_rate=0.1)
    clf.fit(xtrain_svd, ytrain)

    #def documents_to_bm25(tokens):
    #    tf = model['vectorizer'].transform(tqdm(tokens))
    #    print("doing the valid set transformation...", end='')
    #    sys.stdout.flush()
    #    DocData = model['bm25'].transform(tf)
    #    print("ok")
    #    print('tf.shape:', tf.shape)
    #    return DocData

    def validate(xvalid_tfv, show_loss=False):
        if not xvalid_tfv:
            return []
        xvalid_svd = svd.transform(xvalid_tfv)
        xvalid_svd_scl = scl.transform(xvalid_svd)
        if show_loss:
            # `yvalid` must be defined in the enclosing scope for show_loss=True.
            predictions = clf.predict_proba(xvalid_svd)
            print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
        predictions = clf.predict(xvalid_svd)
        return predictions

    def predict(pred, cont, show_loss=False):
        name, desc = names_to_bm25(pred, cont)
        return name, validate(desc, show_loss)

    model['names_to_bm25'] = names_to_bm25
    model['validate'] = validate
    model['predict'] = predict
    return model
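# Usage sketch (an addition for illustration, not from the original call
# sites): wires a prepared `data` dict into xgboost_model and classifies the
# candidate names of one document. The helper name and the variables
# `candidate_names` / `doc_content` are assumptions; the dict fields mirror
# what xgboost_model actually reads.
def demo_name_classifier(corpus, train_data, candidate_names, doc_content):
    data = {
        'corpus': corpus,         # list of (title, content) pairs
        'TrainData': train_data,  # list of (doc_idx, accepted_names, predicted_names)
    }
    model = xgboost_model(data)
    names, labels = model['predict'](candidate_names, doc_content)
    return dict(zip(names, labels))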
title_weight = 2
# Append each title to its document `title_weight` times so title terms get
# extra weight in the tf-idf/BM25 space.
for i, key in enumerate(tqdm(tokey)):
    title = retain_chinese(titles.get(key, '')).strip()
    if title and title != "Non":
        title_token = ' {}'.format(' '.join([
            w for w in cut_method(title) if w not in stopwords
        ])) * title_weight
        token[i] += title_token
        #print('+= ' + title_token)

if len(token) != len(tokey):
    print('token length should equal tokey length')
    exit(0)

bm25 = BM25Transformer()
vectorizer = TfidfVectorizer()
print("""
building corpus vector space...
""")
doc_tf = vectorizer.fit_transform(tqdm(token))
bm25.fit(doc_tf)
doc_bm25 = bm25.transform(doc_tf)
print('\ncorpus vector space - ok\n')
docsTokens = [t.split() for t in token]
print("loading model")
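# Retrieval sketch (an addition for illustration): ranks a free-text query
# against the BM25 document space built above via cosine similarity, reusing
# `cut_method`, `stopwords`, `vectorizer`, `bm25`, and `doc_bm25` from this
# script. The helper name and top-k default are assumptions.
from sklearn.metrics.pairwise import cosine_similarity

def rank_documents(query, topk=10):
    # Tokenize and stopword-filter the query exactly like the titles above.
    q = ' '.join(w for w in cut_method(query) if w not in stopwords)
    q_bm25 = bm25.transform(vectorizer.transform([q]))
    scores = cosine_similarity(q_bm25, doc_bm25).ravel()
    # Indices of the top-k highest-scoring documents, best first.
    return scores.argsort()[::-1][:topk]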
def new_models(config):
    models = {}
    token = mapTrim(
        open(config['tokenFile'], encoding="UTF-8").read().split('\n'))
    title = mapTrim(
        open(config['titleFile'], encoding="UTF-8").read().split('\n'))
    if len(config['tokey']) != len(token) or len(token) != len(title):
        print('len(tokey) {} != len(token) {} != len(title) {}'.format(
            len(config['tokey']), len(token), len(title)))
        exit(0)

    # Append each title to its document (`title_weight` times, as above).
    print("\nappending title to document...\n")
    for i, key in enumerate(tqdm(config['tokey'])):
        # Check the per-document title, not the whole list.
        if title[i] and title[i] != "Non":
            token[i] += ' {}'.format(title[i]) * title_weight

    print("\nbuilding corpus vector space...\n")
    models['bm25'] = BM25Transformer()
    models['vectorizer'] = TfidfVectorizer()
    doc_tf = models['vectorizer'].fit_transform(tqdm(token))
    print("fitting bm25...", end='')
    sys.stdout.flush()
    models['bm25'].fit(doc_tf)
    print("transforming...", end='')
    models['doc_bm25'] = models['bm25'].transform(doc_tf)
    print("ok")

    print("saving bm25Cache...", end='')
    sys.stdout.flush()
    joblib.dump(models['bm25'], config['bm25Cache'])
    print("ok")
    print("saving docBM25Cache...", end='')
    sys.stdout.flush()
    joblib.dump(models['doc_bm25'], config['docBM25Cache'])
    print("ok")
    print("saving vectorizerCache...", end='')
    sys.stdout.flush()
    joblib.dump(models['vectorizer'], config['vectorizerCache'])
    print("ok")
    print('\ncorpus vector space - ok\n')

    docsTokens = [t.split() for t in token]

    print("loading w2v model...", end='')
    sys.stdout.flush()
    models['w2v'] = config['load_w2v']()
    print("ok")
    print("making document word vector")
    # Sum of in-vocabulary word vectors per document.
    models['docWv'] = np.array(
        [np.sum(models['w2v'][[t for t in docsTokens[i] if t in models['w2v']]],
                axis=0)
         for i in tqdm(range(len(docsTokens)))])
    print("saving docW2VCache...", end='')
    sys.stdout.flush()
    joblib.dump(models['docWv'], config['docW2VCache'])
    print("ok")
    return models
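# Cache-reload sketch (an addition for illustration): the joblib dumps written
# by new_models make it possible to skip the expensive rebuild on later runs.
# The helper name and the FileNotFoundError fallback are assumptions.
def load_or_build_models(config):
    try:
        models = {
            'bm25': joblib.load(config['bm25Cache']),
            'doc_bm25': joblib.load(config['docBM25Cache']),
            'vectorizer': joblib.load(config['vectorizerCache']),
            'docWv': joblib.load(config['docW2VCache']),
        }
        models['w2v'] = config['load_w2v']()
    except FileNotFoundError:
        # Any missing cache file triggers a full rebuild.
        models = new_models(config)
    return models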
def train(xtrain, ytrain, xval, yval, lang, tags_to_idx, weighting):
    # Assumes Keras/sklearn imports plus the helpers text_col, make_charvec,
    # and build_model defined elsewhere in this module.
    if weighting == 'tfidf':
        path = "./models/model_" + lang + "_weights.hdf5"
    elif weighting == 'bm25':
        path = "./models/model_" + lang + "_bm25_weights.hdf5"
    checkpointer = ModelCheckpoint(filepath=path,
                                   verbose=1,
                                   monitor="val_acc",
                                   save_best_only=True,
                                   mode="max")
    #print("Train and dev shape: ", xtrain.shape, xval.shape)

    # Per-class label counts (currently unused beyond inspection).
    counts = defaultdict(int)
    for c in ytrain.tolist():
        counts[c] += 1

    if lang != 'all':
        character_vectorizer = CountVectorizer(analyzer='char',
                                               ngram_range=(3, 6),
                                               lowercase=False,
                                               min_df=5,
                                               max_df=0.3)
    else:
        character_vectorizer = CountVectorizer(analyzer='char_wb',
                                               ngram_range=(3, 5),
                                               lowercase=False,
                                               min_df=5,
                                               max_df=0.3)
    if weighting == 'tfidf':
        transformer = TfidfTransformer(sublinear_tf=True)
    elif weighting == 'bm25':
        transformer = BM25Transformer()

    tfidf_matrix = pipeline.Pipeline([
        ('character', pipeline.Pipeline([
            ('s5', text_col(key='text_clean')),
            ('character_vectorizer', character_vectorizer),
            ('tfidf_character', transformer)])),
        ('scale', Normalizer())
    ])
    tfidf_matrix = tfidf_matrix.fit(xtrain)
    # Transformed training matrix (previously misnamed tfidf_matrix_test).
    tfidf_matrix_train = tfidf_matrix.transform(xtrain)
    print('tfidf matrix size: ', tfidf_matrix_train.shape)
    ngrams_matrix_shape = tfidf_matrix_train.shape[1]
    tfidf_matrix_val = tfidf_matrix.transform(xval)

    charvec, char_vocab, max_train_len_char = make_charvec(
        xtrain.text_clean.tolist())
    char_vocab_size = len(char_vocab) + 2
    charvec_shape = charvec.shape[1]
    charvec_val, _, _ = make_charvec(xval.text_clean.tolist(),
                                     train=False,
                                     char_vocab=char_vocab,
                                     max_text_len=max_train_len_char)

    # Assumes every class appears in the validation labels.
    num_classes = len(set(yval.tolist()))
    textmodel_data = (ngrams_matrix_shape, num_classes, charvec_shape,
                      char_vocab_size, tfidf_matrix, char_vocab,
                      max_train_len_char, tags_to_idx)
    if weighting == 'tfidf':
        data_path = 'models/model_' + lang + '_data.pk'
    elif weighting == 'bm25':
        data_path = 'models/model_' + lang + '_bm25_data.pk'
    with open(data_path, 'wb') as f:
        pickle.dump(textmodel_data, f, protocol=2)

    if lang != 'all':
        if lang not in ['sg', 'ar']:
            num_epoch = 20
        else:
            num_epoch = 80
    else:
        num_epoch = 10

    model = build_model(ngrams_matrix_shape, num_classes, charvec_shape,
                        char_vocab_size)
    model.fit([tfidf_matrix_train, charvec],
              ytrain,
              validation_data=([tfidf_matrix_val, charvec_val], yval),
              batch_size=16,
              epochs=num_epoch,
              verbose=0,
              callbacks=[checkpointer])
    K.clear_session()
    gc.collect()
    return model
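# Usage sketch (an addition for illustration): trains both weighting variants
# for one language. `xtrain`/`xval` must be DataFrames with a `text_clean`
# column, as the text_col step and make_charvec calls above require; the loop
# and helper name are assumptions about how train() is driven.
def train_all_weightings(xtrain, ytrain, xval, yval, lang, tags_to_idx):
    models = {}
    for weighting in ('tfidf', 'bm25'):
        models[weighting] = train(xtrain, ytrain, xval, yval,
                                  lang, tags_to_idx, weighting)
    return models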
def xgboost_model(data):
    model = {'data': data}
    print("\nbuilding corpus vector space...\n")
    model['bm25'] = BM25Transformer()
    model['vectorizer'] = TfidfVectorizer()
    model['vectorizer'].fit(data['TrainData'])
    #data['vectorizer'].fit(ValidToken)
    TrainTf = model['vectorizer'].transform(tqdm(data['TrainData']))
    print("fitting bm25...", end='')
    sys.stdout.flush()
    model['bm25'].fit(TrainTf)
    #data['bm25'].fit(ValidTf)
    print("ok")
    print("transforming...", end='')
    sys.stdout.flush()
    data['TrainData'] = model['bm25'].transform(TrainTf)
    print("ok")
    print('TrainTf.shape:', TrainTf.shape)

    ytrain = data['TrainLabel']
    xtrain_tfv = data['TrainData']
    svd = decomposition.TruncatedSVD(n_components=120)
    svd.fit(xtrain_tfv)
    xtrain_svd = svd.transform(xtrain_tfv)
    scl = preprocessing.StandardScaler()
    scl.fit(xtrain_svd)
    # Note: the scaled features are computed but the classifier is fit on the
    # unscaled SVD output.
    xtrain_svd_scl = scl.transform(xtrain_svd)
    clf = xgb.XGBClassifier(max_depth=7,
                            n_estimators=200,
                            colsample_bytree=0.8,
                            subsample=0.8,
                            nthread=10,
                            learning_rate=0.1)
    clf.fit(xtrain_svd, ytrain)

    def documents_to_bm25(tokens):
        tf = model['vectorizer'].transform(tqdm(tokens))
        print("doing the valid set transformation...", end='')
        sys.stdout.flush()
        DocData = model['bm25'].transform(tf)
        print("ok")
        print('ValidTf.shape:', tf.shape)
        return DocData

    def validate(documents, show_loss=False):
        xvalid_tfv = documents_to_bm25(documents)
        xvalid_svd = svd.transform(xvalid_tfv)
        xvalid_svd_scl = scl.transform(xvalid_svd)
        if show_loss:
            # `yvalid` must be defined in the enclosing scope for show_loss=True.
            predictions = clf.predict_proba(xvalid_svd)
            print("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))
        predictions = clf.predict(xvalid_svd)
        return predictions

    def predict(doc, show_loss=False):
        return validate([data['config']['tokenize'](doc)], show_loss)[0]

    model['validate'] = validate
    model['predict'] = predict
    return model
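# Usage sketch (an addition for illustration): the dict fields mirror what
# this xgboost_model reads from `data`; the helper name and variable names
# are assumptions.
def demo_document_classifier(train_tokens, train_labels, cfg, raw_doc):
    data = {
        'TrainData': train_tokens,   # list of whitespace-joined token strings
        'TrainLabel': train_labels,  # one class label per training document
        'config': cfg,               # must provide cfg['tokenize'] for predict()
    }
    model = xgboost_model(data)
    return model['predict'](raw_doc)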