def store_sent(sent_str):
    words = TextNormalizer.tokenize_lemmas(sent_str)
    for word in words:
        if word not in word2id:
            word2id[word] = len(word2id)
    ids = tuple(word2id[word] for word in words)
    key = u' '.join(words)
    # register each distinct lemmatized sentence once, keyed by its joined lemma form
    if key not in sent2id:
        sent2id[key] = len(sent_list)
        sent_list.append(ids)
    return sent2id[key]
def parse_regex(self):
    # specs.patterns['triples'] is a dict: regex pattern -> order of arguments
    for key in self.specs.patterns['triples']:
        #print(key)
        matchObj = re.search(key, self.question, re.M | re.I | re.U)
        if matchObj:
            #print("expression found")
            # Instead of a complicated regex, everything that is not a noun or adjective
            # is removed from each captured group: only those words are meaningful for
            # resolving Wikidata IDs. An empty element is appended as a placeholder for
            # the variable (which is obviously absent from the text), so the indexing
            # matches the argument order in specs.
            triple = [TextNormalizer(matchObj.group(1)).allowedTagKeeper('noun_adjective'),
                      TextNormalizer(matchObj.group(2)).allowedTagKeeper('noun_adjective'),
                      ""]
            #print(triple)
            T = Triple(triple, self.specs.patterns['triples'][key])
            # set the question variables from the triple variables
            # TODO: selection in multiple triples
            self.variable = T.variable
            self.targetVariable = T.targetVariable
            self.query_list.append(T.SQL)
def calc_similarity(quest, parag):
    quest_words = set(TextNormalizer.tokenize_words(quest))
    parag_words = set(TextNormalizer.tokenize_words(parag))
    parag_lemmas = set(TextNormalizer.tokenize_lemmas(parag))
    parag_stems = set(TextNormalizer.tokenize_stems(parag))
    parag_crops = set(TextNormalizer.tokenize_crops(parag))

    matched_parag_words = set()
    sim = 0.0
    for qword in quest_words:
        if qword in parag_words:
            matched_parag_words.add(qword)
            sim += 1.0
        else:
            qlemma = TextNormalizer.lemmatize_word(qword)
            if qlemma in parag_lemmas:
                #matched_parag_lemmas.add(qlemma)
                sim += 1.0
            else:
                qstem = TextNormalizer.stem_word(qword)
                if qstem in parag_stems:
                    sim += 0.95
                else:
                    qcrop = TextNormalizer.crop_word(qword)
                    if qcrop in parag_crops:
                        sim += 0.80
                    else:
                        found_syn = False
                        for pstem in parag_stems:
                            if (qstem, pstem) in syn_stems:
                                sim += 0.70
                                found_syn = True
                                break
                        if not found_syn:
                            if qword in w2v:
                                qvec = w2v[qword]
                                max_cos = -1e38
                                for pword in parag_words:
                                    if pword in w2v:
                                        pvec = w2v[pword]
                                        c = v_cosine(qvec, pvec)
                                        max_cos = max(max_cos, c)
                                # add the w2v bonus only if at least one cosine was
                                # actually computed, so the -1e38 sentinel never leaks in
                                if max_cos > -1e37:
                                    sim += max_cos * 0.5
    return sim / len(quest_words)
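# calc_similarity above relies on a v_cosine helper that is not shown in this
# extract. A minimal numpy-based sketch (an assumption, not the original code):
import numpy as np

def v_cosine(v1, v2):
    # cosine similarity between two dense vectors; returns 0.0 for zero-norm inputs
    v1 = np.asarray(v1, dtype=np.float32)
    v2 = np.asarray(v2, dtype=np.float32)
    denom = float(np.linalg.norm(v1) * np.linalg.norm(v2))
    return float(np.dot(v1, v2)) / denom if denom > 0.0 else 0.0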
def tokenize4(s): return TextNormalizer.tokenize_crops(s)
        ps_words3 = set(filter_NEs(tokenize3(parag_sent)))
        match = len(q_words3 & ps_words3)
        max_ne_match = max(max_ne_match, match)

        paragraph2 = uniq_words2(parag_sent)
        idf_intersection2 = np.sum([idfs.get(x, 0.0) for x in paragraph2 & question2])
        max_idf_intersection2_sent = max(max_idf_intersection2_sent, idf_intersection2)

    return (max_sent_match, max_ne_match, max_idf_intersection2_sent)

# ========================== LSA ==========================
tfidf_corpus = set()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="extracting texts for LSA from " + name):
        question = TextNormalizer.tokenize_crops(TextNormalizer.preprocess_question_str(row.question))
        paragraph = TextNormalizer.tokenize_crops(row.paragraph)
        tfidf_corpus.add(u' '.join(question))
        tfidf_corpus.add(u' '.join(paragraph))

vectorizer = TfidfVectorizer(max_features=None, ngram_range=(1, 1), min_df=1,
                             analyzer='word', stop_words=stopwords)
svd_model = TruncatedSVD(n_components=LSA_DIMS, algorithm='randomized',
                         n_iter=20, random_state=42)
svd_transformer = Pipeline([('tfidf', vectorizer), ('svd', svd_model)])
svd_transformer.fit(tfidf_corpus)

del tfidf_corpus
gc.collect()
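# Standalone sketch of the LSA similarity that the fitted svd_transformer above
# enables. The toy corpus, n_components=2 and the cosine step are illustrative
# assumptions, not taken from the original script.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cosine

corpus = [u'who discovered america',
          u'columbus discovered america in 1492',
          u'the capital of france is paris']
lsa = Pipeline([('tfidf', TfidfVectorizer()),
                ('svd', TruncatedSVD(n_components=2, random_state=42))])
lsa.fit(corpus)
q_vec, p_vec = lsa.transform([corpus[0], corpus[1]])
print(1.0 - cosine(q_vec, p_vec))  # LSA cosine similarity of question vs. paragraph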
def tokenize3(phrase): return TextNormalizer.tokenize_raw(phrase)
def filter_NEs(tokens): return [ TextNormalizer.crop_word(word) for word in filter( is_NE, tokens ) ]
    denom = 0
    for (word, stem) in zip(words, stems):
        if word in w2v:
            denom += stem2weight[stem]
            v += np.asarray(w2v[word]) * stem2weight[stem]
    return v / denom if denom > 0 else v

# -------------------------------------------------------------------------
stem2freq = collections.Counter()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Counting word frequencies in " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph
        stem2freq.update(TextNormalizer.tokenize_stems(question))
        stem2freq.update(TextNormalizer.tokenize_stems(paragraph))

total_freq = sum(stem2freq.values())
# float ratio so the IDF-like weight is not truncated by Python 2 integer division
stem2weight = dict([(w, math.log(float(total_freq) / freq)) for (w, freq) in stem2freq.iteritems()])

# -----------------------------------------------------------------------------------
segmenter = Segmenter()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(),
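# The fragment at the top of this snippet is the tail of a weighted-average
# embedding routine whose header falls outside the extract. A hedged
# reconstruction; the function name, the dim argument and the zero-vector
# initialisation are assumptions:
import numpy as np

def weighted_sentence_vector(words, stems, w2v, stem2weight, dim=32):
    v = np.zeros(dim, dtype=np.float32)
    denom = 0.0
    for word, stem in zip(words, stems):
        if word in w2v:
            w = stem2weight.get(stem, 0.0)
            denom += w
            v += np.asarray(w2v[word], dtype=np.float32) * w
    return v / denom if denom > 0 else v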
# ------------------------------------------------------------------------
dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------
segmenter = Segmenter()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Computing the tail matching for " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph
        # skip the first four crop tokens so only the question "tail" takes part in the matching
        quest_words = TextNormalizer.tokenize_crops(question)[4:]

        max_tail_match = 0
        for parag_sent in segmenter.split(paragraph):
            parag_words = TextNormalizer.tokenize_crops(parag_sent)
            tail_match = len(set(quest_words) & set(parag_words))
            max_tail_match = max(max_tail_match, tail_match)

        df.loc[index, 'max_tail_match'] = max_tail_match

dftrain.to_csv("../data/dftrain.csv", index=True, encoding='utf-8')
dftest.to_csv("../data/dftest.csv", index=True, encoding='utf-8')
def NEs_intersection(words1, words2):
    return len(set(filter_NEs(words1)) & set(filter_NEs(words2)))

# ------------------------------------------------------------------------
dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------
segmenter = Segmenter()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Processing named entity features for " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_raw(question)
        df.loc[index, 'quest_ne_count'] = len(filter_NEs(quest_words))

        parag_words = TextNormalizer.tokenize_raw(paragraph)
        df.loc[index, 'nes_intersection'] = NEs_intersection(quest_words, parag_words)

        max_ne_match = 0
        for parag_sent in segmenter.split(paragraph):
            parag_words = TextNormalizer.tokenize_raw(parag_sent)
            ne_match = NEs_intersection(quest_words, parag_words)
            max_ne_match = max(max_ne_match, ne_match)

        df.loc[index, 'best_ne_match'] = max_ne_match
def uniq_words(text): return set( TextNormalizer.tokenize_crops(text) )
# ------------------------------------------------------------------------
dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------
segmenter = Segmenter()

target0_words = Counter()
target1_words = Counter()
df = dftrain[['question', 'target']]
for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    quest_words = TextNormalizer.tokenize_raw(TextNormalizer.preprocess_question_str(row.question))
    if row['target'] == 0:
        target0_words.update(quest_words)
    else:
        target1_words.update(quest_words)

# float ratios so the relative frequencies are not zeroed out by Python 2 integer division
total0 = sum(target0_words.values())
word_0_freq = dict([(w, float(f) / total0) for (w, f) in target0_words.iteritems()])

total1 = sum(target1_words.values())
word_1_freq = dict([(w, float(f) / total1) for (w, f) in target1_words.iteritems()])

# ------------------------------------------------------------------------
segmenter = Segmenter()
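# Hedged sketch, not from the original code: one straightforward way to turn
# word_0_freq / word_1_freq into a single question-level feature is a summed
# log-ratio over the question words; the eps smoothing value is an assumption.
import math

def target1_log_ratio(quest_words, word_0_freq, word_1_freq, eps=1e-6):
    score = 0.0
    for w in quest_words:
        score += math.log(word_1_freq.get(w, eps) / word_0_freq.get(w, eps))
    return score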
import ModeloVectorial
import TextNormalizer
import UsualTools
import os
from ModeloVectorial import *
from TextNormalizer import *
from UsualTools import *

maxLen = 1000000  # maximum number of characters supported per book

exp = TextNormalizer()
llemmas = []

# ----------------------------------------------------- Build the book objects and save them -----------------------------------
#books = UsualTools.getLibros("./Libros de Goodreads")
#UsualTools.saveObject(books,"./Recursos/BooksList.json")
# -----------------------------------------------------
books = UsualTools.loadObject("./Recursos/BooksList.json")
exp.setVocabulary(UsualTools.loadObject("./Recursos/vocabulary.json"))

# ----------------------------------------------------- Lemmatize the books and save them -------------------------------------------
#for book in books:
#    if (not (str(book.num)+".json" in os.listdir("./Recursos/lemmas/"))):
#        print("-----------------")
#        print("Titulo:", book.nombre)
#        book.texto = exp.delExtraInfoPG("./Libros de Goodreads/"+str(book.num)+".txt")
#        book.texto = exp.deleteSpecialChars(book.texto)
#        booktam = len(book.texto)
#        lemmas = []
#        if (len(book.texto) < maxLen):  # 1000000 is the maximum number of characters handled per pass
#            exp.setText(book.texto)
#            lemmas = exp.lemmatize_delSW()
#        else:  # if the text exceeds the limit, split it into blocks and then join the lemmas
#            print("Libro grande")
def is_FW(word):
    nword = TextNormalizer.normalize_word(word)
    return len(nword) > 0 and not (is_cyrword(nword) or is_digit(nword))
# -------------------------------------------------------------------------
segmenter = Segmenter()

print('Segmentation of paragraph texts...')
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Calculating question similarities for " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_stems = TextNormalizer.tokenize_stems(u' '.join(TextNormalizer.preprocess_question(question)))
        quest_set = set(quest_stems)
        denom = float(len(quest_set))

        max_intersect3 = 0.0
        for parag_sent in segmenter.split(paragraph):
            parag_stems = TextNormalizer.tokenize_stems(Abbrev.normalize_abbrev(parag_sent))
            intersect3 = len(quest_set & set(parag_stems)) / denom
            max_intersect3 = max(max_intersect3, intersect3)
#test_str = u'Сразу после возвращения Фрама Нансен стал главным специалистом по полярным исследованиям в мире, по выражению Р. Хантфорда — оракулом для всех исследователей полярных широт Севера и Юга [188]. Нансен консультировал бельгийского барона Адриена де Жерлаша, который планировал в 1898 году свою экспедицию в Антарктиду, одним из участников команды был Руаль Амундсен[189]. Известнейший исследователь Гренландии Кнуд Расмуссен сравнил посещение Нансена с посвящением в рыцари[190]. В то же время Нансен категорически отказался встречаться со своим соотечественником Карстеном Борхгревинком, сочтя его мошенником, хотя именно он совершил первую успешную зимовку на побережье Антарктиды[191]. В 1900 году в Норвегию приехал за консультациями Роберт Скотт со своим покровителем Клементом Маркхэмом — давним другом Нансена, готовившим британскую экспедицию в Антарктиду. Несмотря на то, что англичане практически проигнорировали все советы, Нансен и Скотт остались в хороших отношениях[192].'
#for l in segmenter.split(test_str):
#    print(l)

# -------------------------------------------------------------------------
wrt0 = codecs.open('../data/rows(y=0).txt', 'w', 'utf-8')
wrt1 = codecs.open('../data/rows(y=1).txt', 'w', 'utf-8')

for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
    question = row.question
    paragraph = row.paragraph

    wrt = wrt0 if row['target'] == 0 else wrt1

    question1 = TextNormalizer.preprocess_question_str(question)
    quest_stems = set(TextNormalizer.tokenize_stems(question1))
    quest_crops = set(TextNormalizer.tokenize_crops(question1))
    denom_stems = float(len(quest_stems))
    denom_crops = float(len(quest_crops))

    max_crop_match = 0.0
    best_crop_match_sent = u''

    max_stem_match = 0.0
    best_stem_match_sent = u''

    wrt.write('\n\nid={}\n'.format(index))
    for i, parag_sent in enumerate(segmenter.split(paragraph)):
        wrt.write(u'P[{}]\t{}\n'.format(i, parag_sent))
    # (tail of store_sent, shown in full above)
    for word in words:
        if word not in word2id:
            word2id[word] = len(word2id)
    ids = tuple(word2id[word] for word in words)
    key = u' '.join(words)
    if key not in sent2id:
        sent2id[key] = len(sent_list)
        sent_list.append(ids)

segmenter = Segmenter()
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Extracting sentences from " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        sent_id = store_sent(question)
        for parag_sent in segmenter.split(paragraph):
            sent_id = store_sent(parag_sent)

nb_sent = len(sent_list)
nb_words = len(word2id)
print('{} sentences'.format(nb_sent))
print('{} words'.format(nb_words))

max_len = min(MAX_LEN, max([len(s) for s in sent_list]))
print('max_len={}'.format(max_len))
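# Hedged sketch of the padding step that typically follows (not shown in this
# extract): each stored id tuple is right-padded with zeros up to max_len; in the
# later snippets id 0 is reserved for the empty token u''. Toy values stand in
# for sent_list and max_len here.
import numpy as np

sent_list_demo = [(1, 2, 3), (4, 2)]
max_len_demo = 4
X = np.zeros((len(sent_list_demo), max_len_demo), dtype=np.int32)
for i, ids in enumerate(sent_list_demo):
    X[i, :min(len(ids), max_len_demo)] = ids[:max_len_demo]
print(X)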
def uniq_words2(text): return set(nonstop(TextNormalizer.tokenize_crops(text)))
import gensim
import scipy.spatial.distance
import codecs

# ------------------------------------------------------------------------
syn_stems = set()
with codecs.open('../data/word2similar.dat', 'r', 'utf-8') as rdr:
    for line in rdr:
        tx = line.strip().split(u'\t')
        if len(tx) == 2:
            word1 = tx[0].lower()
            word2 = tx[1].lower()
            stem1 = TextNormalizer.stem_word(word1)
            stem2 = TextNormalizer.stem_word(word2)
            if stem1 != stem2:
                syn_stems.add((stem1, stem2))
                syn_stems.add((stem2, stem1))

# ------------------------------------------------------------------------
dftrain = pd.read_csv("../data/dftrain.csv", encoding='utf-8')
dftest = pd.read_csv("../data/dftest.csv", encoding='utf-8')

# ------------------------------------------------------------------------
print('Loading the w2v model...')
# w2v = gensim.models.KeyedVectors.load_word2vec_format(r'f:\Word2Vec\word_vectors_cbow=1_win=5_dim=32.txt', binary=False)
w2v = gensim.models.KeyedVectors.load_word2vec_format('/home/eek/polygon/w2v/w2v.CBOW=1_WIN=20_DIM=32.txt',
text = tcleaner.clean_code(text)
text = tcleaner.clean_text(text)
tcleaner.output(text)

tokenizer = Tokenizer.JanomeTokenizer()
words = tokenizer.wakati(text)
#words = tokenizer.filter_by_pos(text, pos=('名詞'))
tokenizer.output(words)

#MeCab
#tokenizer = Tokenizer.MeCabTokenizer()
#words = tokenizer.wakati(text)
#words = tokenizer.filter_by_pos(text, pos=('名詞'))
#tokenizer.output(words)

tnormalizer = TextNormalizer.TextNormalizer()
nwords = []
for w in words:
    nw = tnormalizer.normalize(w)
    nw = tnormalizer.lemmatize_term(nw, pos='v')
    nwords.append(nw)
tnormalizer.output(nwords)

stw_remover = StopwordRemover.StopwordRemover()
stw_remover.load_stopword_file("./slothlib/stopwords.txt")
stw_remover.load_stopword_file("./slothlib/stopwords_extend.txt")
stw_remover.find_stopwords(nwords)
stwords = stw_remover.remove_stopwords(nwords)
stwords = stw_remover.remove_noisewords(stwords)
stw_remover.output(stwords)
# -------------------------------------------------------------------------
segmenter = Segmenter()

print('Segmentation of paragraph texts...')
for name, df in [('train', dftrain), ('test', dftest)]:
    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Calculating charwise similarities for " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_stems = stemmize(question)

        max_fuzz_partial_ratio = 0.0
        for parag_sent in segmenter.split(paragraph):
            parag_stems = stemmize(parag_sent)
            fuzz_partial_ratio = 0.01 * fuzz.partial_ratio(quest_stems, parag_stems)
            max_fuzz_partial_ratio = max(max_fuzz_partial_ratio, fuzz_partial_ratio)

        df.loc[index, 'max_fuzz_partial_ratio_str'] = max_fuzz_partial_ratio
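# fuzz here is assumed to be fuzzywuzzy's fuzz module (the import falls outside
# this extract); partial_ratio returns an integer in [0, 100], which is why the
# score above is scaled by 0.01. Minimal standalone check:
from fuzzywuzzy import fuzz
print(fuzz.partial_ratio(u'abc def', u'xyz abc def xyz'))  # 100: the first string occurs verbatim in the second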
def is_oov(word):
    nword = TextNormalizer.normalize_word(word)
    return nword not in known_words and len(word) > 0 and not word[0].isdigit()
def stemmize(phrase): return u' '.join(TextNormalizer.tokenize_stems(phrase))
def is_NE(word):
    nword = TextNormalizer.normalize_word(word)
    return len(word) > 1 and nword not in funcwords \
        and nonstop(nword) and word[0].isupper()
              verbose_eval=False,
              )

print('Validating...')
y_pred = cl.predict(D_test, ntree_limit=nbrounds)
val_score = sklearn.metrics.roc_auc_score(y_test, y_pred)
print('roc_auc_score={}'.format(val_score))

y_2 = [(1 if z > 0.5 else 0) for z in y_pred]
acc = sklearn.metrics.accuracy_score(y_test, y_2)
print('accuracy_score={}'.format(acc))

segmenter = Segmenter.Segmenter()

if False:
    print('Printing mispredictions')
    y_test = y_test.values
    with codecs.open('../submit/mispredictions.txt', 'w', 'utf-8') as wrt:
        for y_i, df_i in enumerate(i_test):
            if y_2[y_i] != y_test[y_i]:
                wrt.write('\n\ny_pred={} y_test={}\n'.format(y_pred[y_i], y_test[y_i]))
                quest = dftrain.loc[df_i, ['question']].values[0]
                parag = dftrain.loc[df_i, ['paragraph']].values[0]
                quest_str = TextNormalizer.preprocess_question_str(quest)
                for j, parag_sent in enumerate(segmenter.split(parag)):
                    wrt.write(u'P[{}]:\t{}\n'.format(j, parag_sent))
                wrt.write(u'Q:\t{}\n'.format(quest_str))
y_train = dftrain['target'].values

word2id = dict()
word2id[u''] = 0

segmenter = Segmenter()
for name, df in [('train', dftrain), ('test', dftest)]:
    X_parag = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_parag1 = np.zeros((df.shape[0], max_len), dtype=np.int32)
    X_quest = np.zeros((df.shape[0], max_len), dtype=np.int32)

    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0], desc="Processing " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_words(question)[0:max_len]
        parag_words = TextNormalizer.tokenize_words(paragraph)[0:max_len]

        for word_pos, word in enumerate(quest_words):
            if word not in word2id:
                word2id[word] = len(word2id)
            X_quest[index, word_pos] = word2id[word]

        for word_pos, word in enumerate(parag_words):
            if word not in word2id:
                word2id[word] = len(word2id)
y_train = dftrain['target'].values

word2id = dict()
word2id[u''] = 0

for name, df in [('train', dftrain), ('test', dftest)]:
    X_parag = np.zeros((df.shape[0], max_paragraph_len), dtype=np.int32)
    X_quest = np.zeros((df.shape[0], max_question_len), dtype=np.int32)

    for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0],
                                desc="Estimating the sequence lengths for " + name):
        question = TextNormalizer.preprocess_question_str(row.question)
        paragraph = row.paragraph

        quest_words = TextNormalizer.tokenize_words(question)[0:max_question_len]
        parag_words = TextNormalizer.tokenize_words(paragraph)[0:max_paragraph_len]

        for word_pos, word in enumerate(quest_words):
            if word not in word2id:
                word2id[word] = len(word2id)
            X_quest[index, word_pos] = word2id[word]

        for word_pos, word in enumerate(parag_words):
            if word not in word2id: