y_train = dftrain['target'].values

# Vocabulary shared by train and test; id 0 is reserved for padding.
word2id = dict()
word2id[u''] = 0

segmenter = Segmenter()

for name, df in [('train', dftrain), ('test', dftest)]:
    X_parag = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_parag1 = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_quest = np.zeros((df.shape[0], max_len), dtype=np.int32)

    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="Processing " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_words(question)[0:max_len]
        parag_words = TextNormalizer.tokenize_words(paragraph)[0:max_len]

        for word_pos, word in enumerate(quest_words):
            if word not in word2id:
                word2id[word] = len(word2id)
            X_quest[index, word_pos] = word2id[word]

        for word_pos, word in enumerate(parag_words):
            if word not in word2id:
                word2id[word] = len(word2id)
            X_parag[index, word_pos] = word2id[word]  # restored by symmetry with the question loop above
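# Hedged sanity-check sketch (not in the original script): assuming the loop
# above has just filled X_quest for the current dataframe, invert the
# vocabulary and decode one row back to tokens.  id2word is a hypothetical
# helper name introduced here for illustration only.
id2word = dict((i, w) for (w, i) in word2id.items())
row0_tokens = [id2word[i] for i in X_quest[0] if i != 0]  # id 0 is the u'' padding entry
print(u' '.join(row0_tokens))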
        # Named-entity overlap between the question and the current sentence.
        ps_words3 = set(filter_NEs(tokenize3(parag_sent)))
        match = len(q_words3 & ps_words3)
        max_ne_match = max(max_ne_match, match)

        # IDF-weighted lexical overlap with the current sentence.
        paragraph2 = uniq_words2(parag_sent)
        idf_intersection2 = np.sum([idfs.get(x, 0.0) for x in paragraph2 & question2])
        max_idf_intersection2_sent = max(max_idf_intersection2_sent, idf_intersection2)

    return (max_sent_match, max_ne_match, max_idf_intersection2_sent)


# ========================== LSA ==========================

# Collect the deduplicated set of normalized question and paragraph texts.
tfidf_corpus = set()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="extracting texts for LSA from " + name):
        question = TextNormalizer.tokenize_crops(TextNormalizer.preprocess_question_str(row.question))
        paragraph = TextNormalizer.tokenize_crops(row.paragraph)
        tfidf_corpus.add(u' '.join(question))
        tfidf_corpus.add(u' '.join(paragraph))

vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 1), min_df=1,
                             analyzer='word', stop_words=stopwords)
svd_model = TruncatedSVD(n_components=LSA_DIMS, algorithm='randomized', n_iter=20, random_state=42)
svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])
svd_transformer.fit(tfidf_corpus)  # fit only needs an iterable of raw strings, so a set is fine

del tfidf_corpus
gc.collect()
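# Hedged usage sketch: one way the fitted svd_transformer could be turned into
# a question/paragraph similarity feature.  lsa_cosine is a hypothetical helper
# introduced here; it assumes texts go through the same tokenize_crops
# normalization as the corpus the pipeline was fitted on.
from sklearn.metrics.pairwise import cosine_similarity

def lsa_cosine(question_text, paragraph_text):
    q = u' '.join(TextNormalizer.tokenize_crops(TextNormalizer.preprocess_question_str(question_text)))
    p = u' '.join(TextNormalizer.tokenize_crops(paragraph_text))
    vecs = svd_transformer.transform([q, p])  # 2 x LSA_DIMS matrix of dense LSA coordinates
    return cosine_similarity(vecs[0:1], vecs[1:2])[0, 0]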
# ------------------------------------------------------------------------

dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------

segmenter = Segmenter()

# Class-conditional word frequencies over the training questions.
target0_words = Counter()
target1_words = Counter()
df = dftrain[['question', 'target']]
for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    quest_words = TextNormalizer.tokenize_raw(TextNormalizer.preprocess_question_str(row.question))
    if row['target'] == 0:
        target0_words.update(quest_words)
    else:
        target1_words.update(quest_words)

# float() guards against Python 2 integer division, which would zero out every frequency.
total0 = float(sum(target0_words.values()))
word_0_freq = dict([(w, f / total0) for (w, f) in target0_words.iteritems()])

total1 = float(sum(target1_words.values()))
word_1_freq = dict([(w, f / total1) for (w, f) in target1_words.iteritems()])

# ------------------------------------------------------------------------

segmenter = Segmenter()
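# Hedged sketch: the class-conditional frequencies above could be collapsed
# into a single per-question feature, e.g. the summed log-odds of its words.
# question_logodds is a hypothetical helper introduced for illustration; eps
# smooths words seen in only one class.
import math

def question_logodds(question, eps=1e-6):
    words = TextNormalizer.tokenize_raw(TextNormalizer.preprocess_question_str(question))
    return sum(math.log((word_1_freq.get(w, 0.0) + eps) / (word_0_freq.get(w, 0.0) + eps)) for w in words)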
                verbose_eval=False,
                )

print('Validating...')
y_pred = cl.predict(D_test, ntree_limit=nbrounds)
val_score = sklearn.metrics.roc_auc_score(y_test, y_pred)
print('roc_auc_score={}'.format(val_score))

y_2 = [(1 if z > 0.5 else 0) for z in y_pred]
acc = sklearn.metrics.accuracy_score(y_test, y_2)
print('accuracy_score={}'.format(acc))

segmenter = Segmenter.Segmenter()

if False:
    print('Printing mispredictions')
    y_test = y_test.values
    with codecs.open('../submit/mispredictions.txt', 'w', 'utf-8') as wrt:
        for y_i, df_i in enumerate(i_test):
            if y_2[y_i] != y_test[y_i]:
                wrt.write('\n\ny_pred={} y_test={}\n'.format(y_pred[y_i], y_test[y_i]))
                quest = dftrain.loc[df_i, ['question']].values[0]
                parag = dftrain.loc[df_i, ['paragraph']].values[0]
                quest_str = TextNormalizer.preprocess_question_str(quest)
                for j, parag_sent in enumerate(segmenter.split(parag)):
                    wrt.write(u'P[{}]:\t{}\n'.format(j, parag_sent))
                wrt.write(u'Q:\t{}\n'.format(quest_str))
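# Hedged sketch: the fixed 0.5 cut-off above is arbitrary; a threshold tuned on
# the validation fold usually improves accuracy_score.  best_threshold is a
# hypothetical helper introduced for illustration.
def best_threshold(y_true, y_prob, grid=np.linspace(0.05, 0.95, 19)):
    scored = [(sklearn.metrics.accuracy_score(y_true, [1 if z > t else 0 for z in y_prob]), t)
              for t in grid]
    return max(scored)[1]  # threshold with the highest validation accuracy

print('tuned threshold={}'.format(best_threshold(y_test, y_pred)))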