Example #1
y_train = dftrain['target'].values

word2id = dict()
word2id[u''] = 0  # id 0 is reserved for the empty/padding token

segmenter = Segmenter()

for name, df in [('train', dftrain), ('test', dftest)]:

    X_parag  = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_parag1 = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_quest  = np.zeros((df.shape[0], max_len), dtype=np.int32)

    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="Processing " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_words(question)[0:max_len]
        parag_words = TextNormalizer.tokenize_words(paragraph)[0:max_len]

        for word_pos, word in enumerate(quest_words):
            if word not in word2id:
                word2id[word] = len(word2id)

            X_quest[index, word_pos] = word2id[word]

        for word_pos, word in enumerate(parag_words):
            if word not in word2id:
                word2id[word] = len(word2id)

            X_parag[index, word_pos] = word2id[word]  # restores the assignment cut off by the excerpt, by analogy with X_quest
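
The zero-padded id matrices are shaped for an embedding lookup, with id 0 (the u'' entry) acting as padding. Below is a minimal sketch of that consumer, assuming a Keras model; the model, layer size, and variable names are assumptions and do not appear in the excerpt.

from tensorflow.keras.layers import Embedding, Input

# Hedged sketch of a downstream consumer (not part of the original code).
vocab_size = len(word2id)                 # grows while the loops above run
emb = Embedding(input_dim=vocab_size,
                output_dim=64,            # arbitrary embedding size
                mask_zero=True)           # id 0 is treated as padding
quest_in = Input(shape=(max_len,), dtype='int32')
quest_emb = emb(quest_in)                 # (batch, max_len, 64)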
Example #2
        ps_words3 = set(filter_NEs(tokenize3(parag_sent)))
        match = len(q_words3 & ps_words3)
        max_ne_match = max(max_ne_match, match)

        paragraph2 = uniq_words2(parag_sent)
        idf_intersection2 = np.sum([idfs.get(x, 0.0) for x in paragraph2 & question2])
        max_idf_intersection2_sent = max(max_idf_intersection2_sent, idf_intersection2)

    return (max_sent_match, max_ne_match, max_idf_intersection2_sent)
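
This excerpt begins mid-function. For orientation, the enclosing function presumably looks roughly like the sketch below; the name, signature, and initializations are assumptions (only the loop body above appears in the original), and segmenter.split mirrors its use in Example #4.

# Hedged reconstruction of the enclosing function (illustrative only).
def sentence_match_features(question, paragraph):
    question2 = uniq_words2(question)
    q_words3 = set(filter_NEs(tokenize3(question)))
    max_sent_match = 0
    max_ne_match = 0
    max_idf_intersection2_sent = 0.0
    for parag_sent in segmenter.split(paragraph):
        pass  # per-sentence matching exactly as in the excerpt above
    return (max_sent_match, max_ne_match, max_idf_intersection2_sent)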


# ========================== LSA ==========================
tfidf_corpus = set()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="extracting texts for LSA from " + name):
        question = TextNormalizer.tokenize_crops( TextNormalizer.preprocess_question_str(row.question) )
        paragraph = TextNormalizer.tokenize_crops( row.paragraph )
        tfidf_corpus.add(u' '.join(question))
        tfidf_corpus.add(u' '.join(paragraph))


vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 1), min_df=1, analyzer='word', stop_words=stopwords)

svd_model = TruncatedSVD(n_components=LSA_DIMS, algorithm='randomized', n_iter=20, random_state=42)

svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])
svd_transformer.fit(tfidf_corpus)

del tfidf_corpus
gc.collect()
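
Once fitted, svd_transformer maps any crop-tokenized text into an LSA_DIMS-dimensional space. A typical follow-up is a cosine-similarity feature between a question and a paragraph; the sketch below is an assumption and does not appear in the excerpt.

from sklearn.metrics.pairwise import cosine_similarity

# Project a question/paragraph pair into LSA space and measure similarity
# (illustrative only; 'question' and 'paragraph' are raw unicode strings).
q_vec = svd_transformer.transform([u' '.join(TextNormalizer.tokenize_crops(question))])
p_vec = svd_transformer.transform([u' '.join(TextNormalizer.tokenize_crops(paragraph))])
lsa_sim = cosine_similarity(q_vec, p_vec)[0, 0]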
Example #3
# ------------------------------------------------------------------------

dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------

segmenter = Segmenter()

target0_words = Counter()
target1_words = Counter()
df = dftrain[['question','target']]

for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    quest_words = TextNormalizer.tokenize_raw(TextNormalizer.preprocess_question_str(row.question))
    if row['target']==0:
        target0_words.update(quest_words)
    else:
        target1_words.update(quest_words)


total0 = sum(target0_words.values())
# .items() plus explicit float() keep this correct on both Python 2 and 3
# (iteritems() is Python-2-only, and / would truncate to 0 there).
word_0_freq = {w: f / float(total0) for (w, f) in target0_words.items()}

total1 = sum(target1_words.values())
word_1_freq = {w: f / float(total1) for (w, f) in target1_words.items()}
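
The two dictionaries give per-class relative word frequencies. One way to use them is a smoothed log-odds score per question, as sketched below; the helper name and smoothing constant are assumptions, not necessarily the feature used here.

import math

def class_logratio(words, eps=1e-6):
    # Smoothed log-odds that the words come from target=1 questions
    # (hypothetical helper; not part of the original code).
    return sum(math.log((word_1_freq.get(w, 0.0) + eps) /
                        (word_0_freq.get(w, 0.0) + eps))
               for w in words)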

# ------------------------------------------------------------------------

segmenter = Segmenter()
Example #4
    verbose_eval=False,
)
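
This example opens on the tail of a training call. It most likely comes from xgboost.train, roughly as sketched below; params, D_train, and the evals watchlist are assumptions, while nbrounds and verbose_eval=False appear in the excerpt.

import xgboost

# Hedged reconstruction of the truncated call above (illustrative only).
cl = xgboost.train(
    params,
    D_train,
    num_boost_round=nbrounds,
    evals=[(D_train, 'train'), (D_test, 'test')],
    verbose_eval=False,
)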

print('Validating...')
y_pred = cl.predict(D_test, ntree_limit=nbrounds)
val_score = sklearn.metrics.roc_auc_score(y_test, y_pred)
print('roc_auc_score={}'.format(val_score))

y_2 = [(1 if z > 0.5 else 0) for z in y_pred]
acc = sklearn.metrics.accuracy_score(y_test, y_2)
print('accuracy_score={}'.format(acc))

segmenter = Segmenter.Segmenter()

if False:  # flip to True to dump misclassified question/paragraph pairs
    print('Printing mispredictions')
    y_test = y_test.values
    with codecs.open('../submit/mispredictions.txt', 'w', 'utf-8') as wrt:
        for y_i, df_i in enumerate(i_test):
            if y_2[y_i] != y_test[y_i]:
                wrt.write('\n\ny_pred={} y_test={}\n'.format(
                    y_pred[y_i], y_test[y_i]))
                quest = dftrain.loc[df_i, ['question']].values[0]
                parag = dftrain.loc[df_i, ['paragraph']].values[0]
                quest_str = TextNormalizer.preprocess_question_str(quest)

                for j, parag_sent in enumerate(segmenter.split(parag)):
                    wrt.write(u'P[{}]:\t{}\n'.format(j, parag_sent))

                wrt.write(u'Q:\t{}\n'.format(quest_str))