Example #1
def store_sent(sent_str):
    # word2id, sent2id and sent_list are module-level containers:
    # word -> id, joined lemmas -> sentence index, and the list of id tuples.
    words = TextNormalizer.tokenize_lemmas(sent_str)
    for word in words:
        if word not in word2id:
            word2id[word] = len(word2id)

    key = u' '.join(words)
    if key not in sent2id:
        sent2id[key] = len(sent_list)
        sent_list.append(tuple(word2id[word] for word in words))
    return sent2id[key]
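
A minimal usage sketch, assuming the module-level containers that store_sent updates are initialized as below (the sample question string is purely illustrative):

word2id = {u'': 0}   # reserve id 0 for padding, mirroring the later examples (an assumption here)
sent2id = dict()
sent_list = []

sent_id = store_sent(u'Когда была основана Москва?')   # illustrative sample question
print(sent_id, sent_list[sent_id])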
Example #2
def parse_regex(self):
    # self.specs.patterns['triples'] is a dict with a regex pattern as key
    # and the order of arguments as value.
    for key in self.specs.patterns['triples']:
        matchObj = re.search(key, self.question, re.M | re.I | re.U)
        if matchObj:
            # Instead of a complicated regex, everything that is not a noun or adjective
            # is stripped from each group: only those tags are meaningful for Wikidata IDs.
            # An empty element is appended as a placeholder for the variable, which is
            # obviously not in the text, so the indexing matches the order in specs.
            triple = [
                TextNormalizer(matchObj.group(1)).allowedTagKeeper('noun_adjective'),
                TextNormalizer(matchObj.group(2)).allowedTagKeeper('noun_adjective'),
                ""
            ]
            T = Triple(triple, self.specs.patterns['triples'][key])
            # Set the question variables equal to the triple variables.
            # TODO: selection in multiple triples
            self.variable = T.variable
            self.targetVariable = T.targetVariable
            self.query_list.append(T.SQL)
Example #3
def calc_similarity( quest, parag ):
    quest_words = set(TextNormalizer.tokenize_words(quest))

    parag_words = set(TextNormalizer.tokenize_words(parag))
    parag_lemmas = set(TextNormalizer.tokenize_lemmas(parag))
    parag_stems = set(TextNormalizer.tokenize_stems(parag))
    parag_crops = set(TextNormalizer.tokenize_crops(parag))

    matched_parag_words = set()

    sim = 0.0  # per-word credit: exact word 1.0, lemma 1.0, stem 0.95, crop 0.80, synonym stem pair 0.70, w2v cosine * 0.5
    for qword in quest_words:
        if qword in parag_words:
            matched_parag_words.add(qword)
            sim += 1.0
        else:
            qlemma = TextNormalizer.lemmatize_word(qword)
            if qlemma in parag_lemmas:
                #matched_parag_lemmas.add(qlemma)
                sim += 1.0
            else:
                qstem = TextNormalizer.stem_word(qword)
                if qstem in parag_stems:
                    sim += 0.95
                else:
                    qcrop = TextNormalizer.crop_word(qword)
                    if qcrop in parag_crops:
                        sim += 0.80
                    else:
                        found_syn = False
                        for pstem in parag_stems:
                            if (qstem,pstem) in syn_stems:
                                sim += 0.70
                                found_syn = True
                                break

                        if not found_syn:
                            if qword in w2v:
                                qvec = w2v[qword]
                                max_cos = -1e38
                                for pword in parag_words:
                                    if pword in w2v:
                                        pvec = w2v[pword]
                                        c = v_cosine( qvec, pvec )
                                        max_cos = max( max_cos, c )

                                if max_cos > -1e37:  # at least one paragraph word had a w2v vector
                                    sim += max_cos*0.5

    return sim / len(quest_words) if quest_words else 0.0
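
A hedged usage sketch (the sample strings are illustrative); the returned score is the average per-word credit, so it stays roughly within [0, 1]:

question = u'Когда была основана Москва?'
paragraph = u'Москва была основана в 1147 году.'
print('similarity = {:.3f}'.format(calc_similarity(question, paragraph)))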
Example #4
def tokenize4(s):
    return TextNormalizer.tokenize_crops(s)
Example #5
        ps_words3 = set(filter_NEs(tokenize3(parag_sent)))
        match = len(q_words3&ps_words3)
        max_ne_match = max( max_ne_match, match )

        paragraph2 = uniq_words2(parag_sent)
        idf_intersection2 = np.sum([idfs.get(x, 0.0) for x in paragraph2 & question2])
        max_idf_intersection2_sent = max( max_idf_intersection2_sent, idf_intersection2)

    return (max_sent_match, max_ne_match, max_idf_intersection2_sent)


# ========================== LSA ==========================
tfidf_corpus = set()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="extracting texts for LSA from " + name):
        question = TextNormalizer.tokenize_crops( TextNormalizer.preprocess_question_str(row.question) )
        paragraph = TextNormalizer.tokenize_crops( row.paragraph )
        tfidf_corpus.add(u' '.join(question))
        tfidf_corpus.add(u' '.join(paragraph))


vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 1), min_df=1, analyzer='word', stop_words=stopwords)

svd_model = TruncatedSVD(n_components=LSA_DIMS, algorithm='randomized', n_iter=20, random_state=42)

svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])
svd_transformer.fit(tfidf_corpus)

del tfidf_corpus
gc.collect()
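
The fitted svd_transformer can then project texts into the LSA space. A minimal sketch of a cosine-similarity feature built on top of it; the lsa_cosine helper is illustrative and not part of the original code:

import numpy as np

def lsa_cosine(text_a, text_b):
    # Project both texts with the tfidf+SVD pipeline, then compare them by cosine similarity.
    va, vb = svd_transformer.transform([
        u' '.join(TextNormalizer.tokenize_crops(text_a)),
        u' '.join(TextNormalizer.tokenize_crops(text_b)),
    ])
    denom = np.linalg.norm(va) * np.linalg.norm(vb)
    return float(np.dot(va, vb) / denom) if denom > 0 else 0.0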
Example #6
def tokenize3(phrase):
    return TextNormalizer.tokenize_raw(phrase)
Example #7
def filter_NEs(tokens):
    return [ TextNormalizer.crop_word(word) for word in filter( is_NE, tokens ) ]
Example #8
def sentence_vector(words, stems):
    # Reconstructed header and initialization (the original snippet begins mid-function;
    # the function name is illustrative): IDF-weighted average of the w2v vectors of the
    # words that are present in the word2vec model.
    v = np.zeros(w2v.vector_size)
    denom = 0.0
    for (word, stem) in zip(words, stems):
        if word in w2v:
            denom += stem2weight[stem]
            v += np.asarray(w2v[word]) * stem2weight[stem]
    return v / denom if denom > 0 else v


# -------------------------------------------------------------------------

stem2freq = collections.Counter()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(),
                                total=df.shape[0],
                                desc="Counting word frequencies in " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph
        stem2freq.update(TextNormalizer.tokenize_stems(question))
        stem2freq.update(TextNormalizer.tokenize_stems(paragraph))

total_freq = float(sum(stem2freq.values()))
stem2weight = dict([(w, math.log(total_freq / freq))
                    for (w, freq) in stem2freq.items()])
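
A hedged sketch of one way the stem weights above could be used, e.g. as an IDF-weighted question/paragraph overlap; the weighted_overlap helper is illustrative, not from the original:

def weighted_overlap(quest_stems, parag_stems):
    # Share of the question's total stem weight covered by stems that also occur in the paragraph.
    qset = set(quest_stems)
    shared = sum(stem2weight.get(s, 0.0) for s in qset & set(parag_stems))
    total = sum(stem2weight.get(s, 0.0) for s in qset)
    return shared / total if total > 0 else 0.0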

# -----------------------------------------------------------------------------------

segmenter = Segmenter()

for name, df in [('train', dftrain), ('test', dftest)]:

    for index, row in tqdm.tqdm(df.iterrows(),
Example #9
# ------------------------------------------------------------------------

dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------

segmenter = Segmenter()

for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(),
                                total=df.shape[0],
                                desc="Computing the tail matching for " +
                                name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_crops(question)[4:]  # drop the first 4 tokens so only the tail of the question is matched

        max_tail_match = 0
        for parag_sent in segmenter.split(paragraph):
            parag_words = TextNormalizer.tokenize_crops(parag_sent)
            tail_match = len(set(quest_words) & set(parag_words))
            max_tail_match = max(max_tail_match, tail_match)

        df.loc[index, 'max_tail_match'] = max_tail_match

dftrain.to_csv("../data/dftrain.csv", index=True, encoding='utf-8')
dftest.to_csv("../data/dftest.csv", index=True, encoding='utf-8')
Example #10
def NEs_intersection(words1, words2):
    return len( set(filter_NEs(words1)) & set(filter_NEs(words2)) )

# ------------------------------------------------------------------------

dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------

segmenter = Segmenter()

for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="Processing named entries features for " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_raw(question)
        df.loc[index, 'quest_ne_count'] = len(filter_NEs(quest_words))

        parag_words = TextNormalizer.tokenize_raw(paragraph)
        df.loc[index, 'nes_intersection'] = NEs_intersection( quest_words, parag_words )

        max_ne_match = 0
        for parag_sent in segmenter.split(paragraph):
            parag_words = TextNormalizer.tokenize_raw(parag_sent)
            ne_match = NEs_intersection( quest_words, parag_words )
            max_ne_match = max( max_ne_match, ne_match )

        df.loc[index, 'best_ne_match'] = max_ne_match
Example #11
def uniq_words(text):
    return set( TextNormalizer.tokenize_crops(text) )
Example #12
# ------------------------------------------------------------------------

dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------

segmenter = Segmenter()

target0_words = Counter()
target1_words = Counter()
df = dftrain[['question','target']]

for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    quest_words = TextNormalizer.tokenize_raw(TextNormalizer.preprocess_question_str(row.question))
    if row['target']==0:
        target0_words.update(quest_words)
    else:
        target1_words.update(quest_words)


total0 = float(sum(target0_words.values()))
word_0_freq = dict([(w, f / total0) for (w, f) in target0_words.items()])

total1 = float(sum(target1_words.values()))
word_1_freq = dict([(w, f / total1) for (w, f) in target1_words.items()])

# ------------------------------------------------------------------------

segmenter = Segmenter()
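
A hedged sketch of how the per-target word frequencies above might be collapsed into a single question-level feature; the smoothing constant and the target_word_loglr helper are illustrative, not from the original:

import math

def target_word_loglr(question_str, eps=1e-6):
    # Sum of per-word log-odds between the target=1 and target=0 question vocabularies.
    words = TextNormalizer.tokenize_raw(TextNormalizer.preprocess_question_str(question_str))
    return sum(math.log((word_1_freq.get(w, 0.0) + eps) / (word_0_freq.get(w, 0.0) + eps))
               for w in words)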
Example #13
File: tests.py Project: yatrex/TT
import ModeloVectorial
import TextNormalizer
import UsualTools
import os
from ModeloVectorial import *
from TextNormalizer import *
from UsualTools import *
maxLen = 1000000  # Maximum length in characters supported per book
exp = TextNormalizer()
llemmas = []
#----------------------------------------------------- Gets book objects and saves them -----------------------------------
#books = UsualTools.getLibros("./Libros de Goodreads")
#UsualTools.saveObject(books,"./Recursos/BooksList.json")
#-----------------------------------------------------
books = UsualTools.loadObject("./Recursos/BooksList.json")
exp.setVocabulary(UsualTools.loadObject("./Recursos/vocabulary.json"))
#----------------------------------------------------- Lemmatizes the books and saves them -------------------------------------------

#for book in books:
#	if (not (str(book.num)+".json" in os.listdir("./Recursos/lemmas/")) ):
#		print("-----------------")
#		print("Titulo:",book.nombre)
#		book.texto = exp.delExtraInfoPG("./Libros de Goodreads/"+str(book.num)+".txt")
#		book.texto = exp.deleteSpecialChars(book.texto)
#		booktam = len(book.texto)
#		lemmas = []
#		if (len(book.texto) < maxLen):  # 1000000 is the maximum number of characters supported per processing run
#			exp.setText(book.texto)
#			lemmas= exp.lemmatize_delSW()
#		else:  # if the character limit is exceeded, the text is split into blocks and the lemmas are then joined
#			print("Libro grande")
Example #14
def is_FW(word):
    nword = TextNormalizer.normalize_word(word)
    return len(nword) > 0 and not (is_cyrword(nword) or is_digit(nword))
Example #15

# -------------------------------------------------------------------------

segmenter = Segmenter()

print('Segmentation of paragraph texts...')

for name, df in [('train', dftrain), ('test', dftest)]:

    for index, row in tqdm.tqdm(df.iterrows(),
                                total=df.shape[0],
                                desc="Calculating question similarities for " +
                                name):

        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_stems = TextNormalizer.tokenize_stems(u' '.join(
            TextNormalizer.preprocess_question(question)))
        quest_set = set(quest_stems)
        denom = float(len(quest_set))

        max_intersect3 = 0.0

        for parag_sent in segmenter.split(paragraph):
            parag_stems = TextNormalizer.tokenize_stems(
                Abbrev.normalize_abbrev(parag_sent))
            intersect3 = len(quest_set & set(parag_stems)) / denom
            max_intersect3 = max(max_intersect3, intersect3)
Example #16
#test_str = u'Сразу после возвращения Фрама Нансен стал главным специалистом по полярным исследованиям в мире, по выражению Р. Хантфорда — оракулом для всех исследователей полярных широт Севера и Юга [188]. Нансен консультировал бельгийского барона Адриена де Жерлаша, который планировал в 1898 году свою экспедицию в Антарктиду, одним из участников команды был Руаль Амундсен[189]. Известнейший исследователь Гренландии Кнуд Расмуссен сравнил посещение Нансена с посвящением в рыцари[190]. В то же время Нансен категорически отказался встречаться со своим соотечественником Карстеном Борхгревинком, сочтя его мошенником, хотя именно он совершил первую успешную зимовку на побережье Антарктиды[191]. В 1900 году в Норвегию приехал за консультациями Роберт Скотт со своим покровителем Клементом Маркхэмом — давним другом Нансена, готовившим британскую экспедицию в Антарктиду. Несмотря на то, что англичане практически проигнорировали все советы, Нансен и Скотт остались в хороших отношениях[192].'
#for l in segmenter.split(test_str):
#    print(l)

# -------------------------------------------------------------------------

wrt0 = codecs.open('../data/rows(y=0).txt', 'w', 'utf-8')
wrt1 = codecs.open('../data/rows(y=1).txt', 'w', 'utf-8')

for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    question = row.question
    paragraph = row.paragraph

    wrt = wrt0 if row['target'] == 0 else wrt1

    question1 = TextNormalizer.preprocess_question_str(question)
    quest_stems = set(TextNormalizer.tokenize_stems(question1))
    quest_crops = set(TextNormalizer.tokenize_crops(question1))

    denom_stems = float(len(quest_stems))
    denom_crops = float(len(quest_crops))

    max_crop_match = 0.0
    best_crop_match_sent = u''

    max_stem_match = 0.0
    best_stem_match_sent = u''

    wrt.write('\n\nid={}\n'.format(index))
    for i, parag_sent in enumerate(segmenter.split(paragraph)):
        wrt.write(u'P[{}]\t{}\n'.format(i, parag_sent))
Example #17
    for word in words:
        if word not in word2id:
            word2id[word] = len(word2id)

    key = u' '.join(words)
    if key not in sent2id:
        sent2id[key] = len(sent_list)
        sent_list.append(tuple(word2id[word] for word in words))
    return sent2id[key]



segmenter = Segmenter()

for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="Extracting sentences from " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        sent_id = store_sent(question)

        for parag_sent in segmenter.split(paragraph):
            sent_id = store_sent(parag_sent)

nb_sent = len(sent_list)
nb_words = len(word2id)

print('{} sentences'.format(nb_sent))
print('{} words'.format(nb_words))

max_len = min( MAX_LEN, max( [len(s) for s in sent_list] ) )
print('max_len={}'.format(max_len))
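
A minimal follow-up sketch: packing the collected sentences into a fixed-size id matrix of width max_len. Treating id 0 as padding mirrors the later examples and is an assumption here:

import numpy as np

X_sents = np.zeros((nb_sent, max_len), dtype=np.int32)
for i, sent in enumerate(sent_list):
    ids = sent[:max_len]            # truncate long sentences to max_len
    X_sents[i, :len(ids)] = ids     # remaining positions stay 0 (padding)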
Example #18
def uniq_words2(text):
    return set(nonstop(TextNormalizer.tokenize_crops(text)))
Example #19
import gensim
import scipy.spatial.distance
import codecs

# ------------------------------------------------------------------------

syn_stems = set()

with codecs.open('../data/word2similar.dat', 'r', 'utf-8') as rdr:
    for line in rdr:
        tx = line.strip().split(u'\t')
        if len(tx) == 2:
            word1 = tx[0].lower()
            word2 = tx[1].lower()

            stem1 = TextNormalizer.stem_word(word1)
            stem2 = TextNormalizer.stem_word(word2)
            if stem1!=stem2:
                syn_stems.add( (stem1, stem2) )
                syn_stems.add( (stem2, stem1) )

# ------------------------------------------------------------------------

dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------

print('Loading the w2v model...')
# w2v = gensim.models.KeyedVectors.load_word2vec_format(r'f:\Word2Vec\word_vectors_cbow=1_win=5_dim=32.txt', binary=False)
w2v = gensim.models.KeyedVectors.load_word2vec_format('/home/eek/polygon/w2v/w2v.CBOW=1_WIN=20_DIM=32.txt',
Example #20
    text = tcleaner.clean_code(text)
    text = tcleaner.clean_text(text)
    tcleaner.output(text)

    tokenizer = Tokenizer.JanomeTokenizer()
    words = tokenizer.wakati(text)
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))
    tokenizer.output(words)

    #MeCab
    #tokenizer = Tokenizer.MeCabTokenizer()
    #words = tokenizer.wakati(text)
    #words = tokenizer.filter_by_pos(text, pos=('名詞'))
    #tokenizer.output(words)

    tnormalizer = TextNormalizer.TextNormalizer()
    nwords = []
    for w in words:
        nw = tnormalizer.normalize(w)
        nw = tnormalizer.lemmatize_term(nw, pos='v')
        nwords.append(nw)
    tnormalizer.output(nwords)

    stw_remover = StopwordRemover.StopwordRemover()
    stw_remover.load_stopword_file("./slothlib/stopwords.txt")
    stw_remover.load_stopword_file("./slothlib/stopwords_extend.txt")
    stw_remover.find_stopwords(nwords)
    stwords = stw_remover.remove_stopwords(nwords)
    stwords = stw_remover.remove_noisewords(stwords)

    stw_remover.output(stwords)
Example #21

# -------------------------------------------------------------------------

segmenter = Segmenter()

print('Segmentation of paragraph texts...')

for name, df in [('train', dftrain), ('test', dftest)]:

    for index, row in tqdm.tqdm(df.iterrows(),
                                total=df.shape[0],
                                desc="Calculating charwise similarities for " +
                                name):

        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_stems = stemmize(question)

        max_fuzz_partial_ratio = 0.0

        for parag_sent in segmenter.split(paragraph):
            parag_stems = stemmize(parag_sent)
            fuzz_partial_ratio = 0.01 * fuzz.partial_ratio(
                quest_stems, parag_stems)
            max_fuzz_partial_ratio = max(max_fuzz_partial_ratio,
                                         fuzz_partial_ratio)

        df.loc[index, 'max_fuzz_partial_ratio_str'] = max_fuzz_partial_ratio
Example #22
def is_oov(word):
    nword = TextNormalizer.normalize_word(word)
    return nword not in known_words and len(word) > 0 and not word[0].isdigit()
Example #23
def stemmize(phrase):
    return u' '.join(TextNormalizer.tokenize_stems(phrase))
Example #24
def is_NE(word):
    nword = TextNormalizer.normalize_word(word)
    return len(word)>1 and nword not in funcwords\
           and nonstop( nword ) and word[0].isupper()
Example #25
    verbose_eval=False,
)

print('Validating...')
y_pred = cl.predict(D_test, ntree_limit=nbrounds)
val_score = sklearn.metrics.roc_auc_score(y_test, y_pred)
print('roc_auc_score={}'.format(val_score))

y_2 = [(1 if z > 0.5 else 0) for z in y_pred]
acc = sklearn.metrics.accuracy_score(y_test, y_2)
print('accuracy_score={}'.format(acc))

segmenter = Segmenter.Segmenter()

if False:
    print('Printing mispredictions')
    y_test = y_test.values
    with codecs.open('../submit/mispredictions.txt', 'w', 'utf-8') as wrt:
        for y_i, df_i in enumerate(i_test):
            if y_2[y_i] != y_test[y_i]:
                wrt.write('\n\ny_pred={} y_test={}\n'.format(
                    y_pred[y_i], y_test[y_i]))
                quest = dftrain.loc[df_i, ['question']].values[0]
                parag = dftrain.loc[df_i, ['paragraph']].values[0]
                quest_str = TextNormalizer.preprocess_question_str(quest)

                for j, parag_sent in enumerate(segmenter.split(parag)):
                    wrt.write(u'P[{}]:\t{}\n'.format(j, parag_sent))

                wrt.write(u'Q:\t{}\n'.format(quest_str))
Example #26
y_train = dftrain['target'].values

word2id = dict()
word2id[u''] = 0

segmenter = Segmenter()

for name, df in [('train', dftrain), ('test', dftest)]:

    X_parag  = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_parag1 = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_quest  = np.zeros((df.shape[0], max_len), dtype=np.int32)

    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="Processing " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_words(question)[0:max_len]
        parag_words = TextNormalizer.tokenize_words(paragraph)[0:max_len]

        for word_pos,word in enumerate(quest_words):
            if word not in word2id:
                word2id[word] = len(word2id)

            X_quest[index, word_pos ] = word2id[word]

        for word_pos, word in enumerate(parag_words):
            if word not in word2id:
                word2id[word] = len(word2id)
Example #27
y_train = dftrain['target'].values

word2id = dict()
word2id[u''] = 0

for name, df in [('train', dftrain), ('test', dftest)]:

    X_parag = np.zeros((df.shape[0], max_paragraph_len), dtype=np.int32)
    X_quest = np.zeros((df.shape[0], max_question_len), dtype=np.int32)

    for index, row in tqdm.tqdm(df.iterrows(),
                                total=df.shape[0],
                                desc="Estimating the sequence lengths for " +
                                name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_words(
            question)[0:max_question_len]
        parag_words = TextNormalizer.tokenize_words(
            paragraph)[0:max_paragraph_len]

        for word_pos, word in enumerate(quest_words):
            if word not in word2id:
                word2id[word] = len(word2id)

            X_quest[index, word_pos] = word2id[word]

        for word_pos, word in enumerate(parag_words):
            if word not in word2id: