def get_auxiliary_feature(self, current_sentence, idx):
    avec = []
    _aux_type = None
    _tmp_sen = current_sentence.words_list + ['none', 'none', 'none']
    _current_word = _tmp_sen[idx]
    _next_words = _tmp_sen[idx + 1:idx + 4]
    _next_words = [lemma(_next_word) for _next_word in _next_words]
    try:
        avec.append(self.aux_word_2_idx.index(_current_word))
    except ValueError:
        avec.append(len(self.aux_word_2_idx))
        self.aux_word_2_idx.append(_current_word)
    try:
        avec.append(self.aux_lemma_2_idx.index(lemma(_current_word)))
    except ValueError:
        avec.append(len(self.aux_lemma_2_idx))
        self.aux_lemma_2_idx.append(lemma(_current_word))
    if (_next_words[0] == 'so' or _next_words[0] == 'likewise'
            or _next_words[1] == 'same' or _next_words[1] == 'opposite'
            or _next_words[2] == 'opposite'):
        _aux_type = 'so'
    else:
        _aux_type = 'modal' if lemma(_current_word) not in AUXILIARY_TYPE else lemma(_current_word)
    avec.append(AUXILIARY_TYPE.index(_aux_type))
    current_sentence.aux_type = _aux_type
    current_sentence.trigger = _current_word
    current_sentence.trigger_index = idx
    return avec
def term_frequencies(tokens, selected_entities, key_phrase_tokens):
    # lmtzr = WordNetLemmatizer()
    # stemmer = SnowballStemmer("english")
    punctuations = list(string.punctuation)
    entities = ""
    for entity in selected_entities:
        entities += str(entity[0]) + ' '
    entities = tokenize(entities.lower())
    tf = {}
    i = 0
    len_token = len(tokens)
    for token in tokens:
        # token = stemmer.stem(token)
        # token = lmtzr.lemmatize(token)
        if dictionary.check(token.lower()):
            token = en.lemma(token)
        if (token not in stopwords.words('english') and token not in punctuations
                and token not in hoax_stopwords and len(token) > 1
                and token != "''" and token != "``"):
            # print token, 1.0 * (len_token - i) / (len_token * 1.0)
            if i == 0:
                try:
                    tf[token] += 2.0 * (len_token - i) / (len_token * 1.0)
                except KeyError:
                    tf[token] = 2.0 * (len_token - i) / (len_token * 1.0)
            else:
                try:
                    tf[token] += 1.0 * (len_token - i) / (len_token * 1.0)
                except KeyError:
                    tf[token] = 1.0 * (len_token - i) / (len_token * 1.0)
        elif token == ".":
            i += 1
    for token in key_phrase_tokens:
        # token = stemmer.stem(token)
        # token = lmtzr.lemmatize(token)
        if dictionary.check(token.lower()):
            token = en.lemma(token)
        if (token not in stopwords.words('english') and token not in punctuations
                and token not in hoax_stopwords and len(token) > 1
                and token != "''" and token != "``"):
            # print token, 1.0 * (len_token - i) / (len_token * 1.0)
            if i == 0:
                try:
                    tf[token] += 2.0 * (len_token - i) / (len_token * 1.0)
                except KeyError:
                    tf[token] = 2.0 * (len_token - i) / (len_token * 1.0)
            else:
                try:
                    tf[token] += 1.0 * (len_token - i) / (len_token * 1.0)
                except KeyError:
                    tf[token] = 1.0 * (len_token - i) / (len_token * 1.0)
        elif token == ".":
            i += 1
    tf = sorted(tf.items(), key=operator.itemgetter(1), reverse=True)
    return tf
def VB_VB_correction(payload, raw_text, error_count):
    # correct errors of type is-walking OR has-cooked
    if payload.tag_[:2] != 'VB':
        return raw_text, error_count
    nounBeforeVerb = False
    nounAfterVerb = False
    verbFound = False
    if payload.text in ('is', 'was', 'are', 'were'):
        return raw_text, error_count
    for ch in payload.children:
        if ch.tag_[:2] == 'VB':
            verbFound = True
        if (not verbFound) and (ch.dep_ == 'nsubj'):
            print(ch.lower_)
            nounBeforeVerb = True
        if verbFound and (ch.dep_ == 'nsubj'):
            nounAfterVerb = True
        ifHave = False
        ifBeen = False
        if ch.tag_[:2] == 'VB':  # this might need to be removed
            dummy, error_count = VB_VB_VB_correction(ch, raw_text, error_count)
        try:
            if ch.lower_ in ('has', 'have', 'had'):
                ifHave = True
            if ch.lower_ == 'been' or payload.text == "been":
                ifBeen = True
            if ifHave and ifBeen:
                x = conjugate(verb=lemma(payload.text), tense=PAST + PARTICIPLE,
                              mood=INDICATIVE, person=1, number=PL)
            elif nounBeforeVerb and ch.lower_ in ('is', 'are', 'was', 'were'):
                x = conjugate(verb=lemma(payload.text), tense=PRESENT,
                              mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)
            else:
                x = payload.text
            if x != payload.text:
                error_count += 1
                raw_text = raw_text[:payload.idx] + raw_text[payload.idx:].replace(payload.text, x, 1)
            return raw_text, error_count
        except TypeError:
            return raw_text, error_count
    return raw_text, error_count
def createConceptPair(self, filename, target_corpus):
    corpus_conceptPair = []
    for sentence, target_sentence in zip(self.docs[filename], target_corpus):
        sentence_conceptPair = []
        if len(target_sentence) > 0:
            target_sentence += [target_sentence[0]]
        for i in range(len(sentence)):
            if sentence[i] in target_sentence:
                curr_word = lemma(sentence[i]).encode('utf-8')
                prev_word = lemma(target_sentence[target_sentence.index(sentence[i]) - 1]).encode('utf-8')
                next_word = lemma(target_sentence[target_sentence.index(sentence[i]) + 1]).encode('utf-8')
            else:
                curr_word = lemma(sentence[i]).encode('utf-8')
                prev_word = curr_word
                next_word = curr_word
                # next_word = lemma(sentence[i+1]).encode('utf-8')
                # curr_word = lemma(sentence[i]).encode('utf-8')
            sentence_conceptPair.append([[curr_word, next_word], [curr_word, prev_word]])
            # sentence_conceptPair.append([[next_word, None, curr_word], [curr_word, None, prev_word]])
        corpus_conceptPair.append(sentence_conceptPair)
    return corpus_conceptPair
def getSynonyms(word, part):
    synonyms = []
    wordToTry = lemma(word) if part[0] == 'V' else word
    synList = dictionary.synonym(wordToTry)
    if synList is None:
        return [word]
    for syn in synList:
        if " " not in syn:
            if part == "VB" or part == "VBP":
                synonyms.append(lemma(syn))
            elif part == "VBD" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[3])
            elif part == "VBG" and len(lexeme(syn)) > 0:
                synonyms.append(lexeme(syn)[0])
            elif part == "VBN" and len(lexeme(syn)) > 3:
                synonyms.append(lexeme(syn)[-1])
            elif part == "VBZ" and len(lexeme(syn)) > 1:
                synonyms.append(lexeme(syn)[1])
            elif part == "NN" and syn[-2:] != "ss":
                synonyms.append(singularize(syn))
            elif part == "NNS":
                synonyms.append(pluralize(syn))
            else:
                synonyms.append(syn)
    return list(set(synonyms))
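# Hedged usage sketch for getSynonyms() above. Assumptions: `dictionary` is a
# PyDictionary-style object whose synonym() returns a list of single-word
# synonyms (or None), and lemma/lexeme/singularize/pluralize come from
# pattern.en. The POS tag decides how each synonym is re-inflected; for "VBD",
# for instance, lexeme(syn)[3] is typically the simple-past form.
# Illustrative calls (actual output depends on the synonym source):
#   getSynonyms('ran', 'VBD')     # lemmatize 'ran' -> 'run', return past-tense synonyms
#   getSynonyms('houses', 'NNS')  # noun synonyms, pluralized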
def sent_to_bin_q(sentence_tree):
    assert (sentence_tree.label() == SENTENCE)
    subj = sentence_tree[0]
    # if subjectless
    if subj.label() != NP:
        return tree_to_string(sentence_tree)
    assert (subj.label() == NP)
    vp = sentence_tree[1]
    assert (vp.label() == VP)
    head_verb = vp[0]
    assert (is_verb(head_verb.label()) and head_verb.label() != VP)
    verb = head_verb[0]
    # has auxiliary
    if is_modal(head_verb, vp) or lemma(verb) == 'be':
        uncap(subj)  # uncapitalize original subj (unless proper noun)
        return ' '.join([verb, tree_to_string(subj)] +
                        [tree_to_string(node) for node in vp[1:]])
    # no auxiliary, use do-insertion
    else:
        head_verb[0] = lemma(verb)  # convert head verb to infinitive
        uncap(subj)  # uncapitalize original subj (unless proper noun)
        return ' '.join([
            conjugate('do', get_inflection(head_verb)),
            tree_to_string(subj),
            tree_to_string(vp)
        ])
def testBasic():
    from pattern.en import referenced
    print referenced('hour')
    from pattern.en import conjugate, lemma, lexeme
    print lexeme('purr')
    print lemma('purring')
    print conjugate('purred', '3sg')  # he / she / it
def has_auxiliary(head_verb, head_verb_tag, vp_tags):
    if head_verb_tag == 'MD' or lemma(head_verb) in ['be', 'do']:
        return True
    if lemma(head_verb) in ['have'] and len(vp_tags) > 2 and vp_tags[2].startswith('V'):
        return True
    return False
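# Hedged sanity check for has_auxiliary(); the tag sequences below are invented
# examples, not the output of any particular parser.
assert has_auxiliary('has', 'VBZ', ['VBZ', 'RB', 'VBN'])   # "has quietly eaten": 'have' followed by a verb tag
assert not has_auxiliary('eats', 'VBZ', ['VBZ', 'NP'])     # "eats dinner": would need do-insertion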
def get_conjugations(lem):
    vforms = []
    if lemma(lem) == 'be':
        vforms = [i for i in EXCEPTIONS]
    else:
        for ta in TENSE_ASPECTS:
            c = conjugate(lemma(lem), ta)
            vforms.append(c + '|||' + ta)
    return vforms
def is_partial_match(query, table_names):
    query = lemma(query)
    table_names = [[lemma(x) for x in names.split(' ')] for names in table_names]
    same_count = 0
    result = None
    for names in table_names:
        if query in names:
            same_count += 1
            result = names
    return result if same_count == 1 else False
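# Hedged usage sketch for is_partial_match(); the table names are made up.
# A result is returned only when the lemmatized query appears in exactly one
# lemmatized table name; False signals either no match or an ambiguous match.
#   is_partial_match('sings', ['sing well', 'dance floor'])  # -> ['sing', 'well']
#   is_partial_match('room', ['class room', 'hotel room'])   # -> False (ambiguous)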
def lemmatize(self, word):
    if word in self._vocab._word2index:
        return word
    elif word in self._inverse_map.keys():
        return self._inverse_map[word]
    elif lemma(word) in self._vocab._word2index:
        return lemma(word)
    elif lemma(word) in self._inverse_map.keys():
        return self._inverse_map[lemma(word)]
    else:
        return self._vocab.index2word(0)
def get_verb_reduction(verb, tag):
    """Given string of existing verb, returns its corresponding reduction.
    That's the verb itself if its lemma is in the top100, else its hash."""
    if lemma(verb.lower()) in literals.verbs:
        return verb.upper()
    if lemma(verb.lower()) in top100.verbs:
        return verb.upper()
    else:
        h = sha256(str(tenses(verb)).encode('utf_8')).hexdigest()
        result = tag + '_' + h
        return result
def decompose_verb(verb, verb_tag):
    logger.debug('verb \t %s verb_tag \t %s' % (verb, verb_tag))
    tense = verb_tense_dict[verb_tag]
    if conjugate('do', tense) == 'did':
        return_do = 'is'
        return return_do, verb
    else:
        return_do = conjugate('do', tense)
    # elif conjugate('do', tense) == does:
    print('verb', verb, verb_tag)
    print(conjugate('do', tense))
    print('lemma', lemma(verb))
    return return_do, lemma(verb)
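# Hedged example for decompose_verb(). Assumption: verb_tense_dict maps Penn
# tags to pattern.en tense aliases, e.g. {'VBZ': '3sg', 'VBP': 'inf', 'VBD': 'past'}.
# Under that assumed mapping:
#   decompose_verb('walks', 'VBZ')   # -> ('does', 'walk')  do-support plus the lemma
#   decompose_verb('walked', 'VBD')  # -> ('is', 'walked')  past tense is special-cased above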
def search_answer(self, cnstrd_word_syn, wd_in_sent, key_wd_idx):
    """
    This function searches for the constrained word of the question.

    Parameters:
        cnstrd_word_syn (list) - the list of the synonyms of the constrained word in the question
        wd_in_sent (list) - word-tokenized text
        key_wd_idx (int) - the position of the key word in the sentence

    Returns:
        the position of the constrained word in the sentence of the text
    """
    porter = nltk.PorterStemmer()
    lancaster = nltk.LancasterStemmer()
    # print cnstrd_word_syn
    for cw in cnstrd_word_syn:
        cw_seperate = []
        if '_' in cw:
            cw1 = cw.split('_')[0]
            cw2 = cw.split('_')[1]
            cw_seperate = [cw1, cw2]
            cw = ' '.join(cw.split('_'))
            cw_seperate.append(cw)
        # print(cw)
        for sent in wd_in_sent[key_wd_idx:]:
            # print(cw)
            # print(cw, sent)
            # print sent
            """
            if cw_seperate:
                for c_s in cw_seperate:
                    if porter.stem(c_s.lower()) == porter.stem(sent.lower()) or lemma(c_s) == lemma(sent):
                        # or sent.lower() in cw.lower() or lemma():
                        print("!!!!!!!!")
                        print(cw, sent)
                        print(wd_in_sent.index(sent))
                        return wd_in_sent.index(sent)
            """
            if (porter.stem(cw.lower()) == porter.stem(sent.lower())
                    or lemma(cw) == lemma(sent)):  # or sent.lower() in cw.lower() or lemma():
                # print("!!!!!!!!")
                # print(cw, sent)
                # print(wd_in_sent.index(sent))
                return wd_in_sent.index(sent)
        """
        elif cw_seperate:
            for cw_s in cw_seperate:
                if porter.stem(cw.lower()) == porter.stem(sent.lower()) or lemma(cw) == lemma(sent):
                    return wd_in_sent.index(sent)
        """
    return None
def createConceptPair(self, user):
    sentences_conceptPair = []
    for sentence in self.docs[user]:
        sentence_conceptPair = []
        sentence += [sentence[0]]
        for i in range(len(sentence) - 1):
            # if sentence[i+1] == "'s":
            #     next_word = lemma(sentence[i+2]).encode('utf-8')
            # else:
            next_word = lemma(sentence[i + 1]).encode('utf-8')
            curr_word = lemma(sentence[i]).encode('utf-8')
            sentence_conceptPair.append([next_word, None, curr_word])
        sentences_conceptPair.append(sentence_conceptPair)
    return sentences_conceptPair
def is_subkeys(term1, term2):
    def common_words(key1, key2):
        return len(set(key1) & set(key2))  # / len(key2)

    lt1 = lemma(term1)
    lt2 = lemma(term2)
    # llt1 = lt1.split(" ")
    # llt2 = lt2.split(" ")
    # if common_words(llt1, llt2) == len(llt1):
    #     return True
    # else:
    #     return False
    # return all(t in llt2 for t in llt1)
    return lt1 in lt2
def morphological_error(target, response):
    if response == pluralize(target):  # Checks for the plural form of the target.
        return True
    if response == comparative(target):  # Checks for the comparative form of the target.
        return True
    if response == superlative(target):  # Checks for the superlative form of the target.
        return True
    if lemma(target) == lemma(response):  # Check to see if the target and response share a lemma.
        return True
    return False
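# Hedged sanity checks for morphological_error(); these rely only on
# pattern.en's pluralize/comparative/superlative/lemma inflections.
assert morphological_error('cat', 'cats')       # plural of the target
assert morphological_error('big', 'biggest')    # superlative of the target
assert morphological_error('run', 'running')    # shared lemma ('run')
assert not morphological_error('cat', 'dog')    # unrelated word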
def createConceptPair(self, filename):
    sentences_conceptPair = []
    for sentence in self.docs[filename]:
        sentence_conceptPair = []
        sentence = list(sentence)
        sentence += [sentence[0]]
        for i in range(len(sentence) - 1):
            prev_word = lemma(sentence[i - 1]).encode('utf-8')
            next_word = lemma(sentence[i + 1]).encode('utf-8')
            curr_word = lemma(sentence[i]).encode('utf-8')
            sentence_conceptPair.append([[curr_word, next_word], [curr_word, prev_word]])
            # sentence_conceptPair.append([next_word, None, curr_word])
        sentences_conceptPair.append(sentence_conceptPair)
    return sentences_conceptPair
def createFeatVector(self, word_relations_list):
    features = []
    for sentence in word_relations_list:
        sentence_len = len(sentence)
        feature = np.zeros((sentence_len, 96))
        emissions = np.zeros((sentence_len, 1))
        if sentence_len == 0:
            continue
        for i in range(len(sentence)):
            word, rel_1, rel_2, emission = sentence[i]
            # relation_weights = cn.search(lemma(word).encode('utf-8'))
            word = lemma(word).encode('utf-8')
            if word in self.sparse_dict:
                cn_weights = [self.sparse_dict[word][w][0]["weight"] for w in self.sparse_dict[word]]
                cn_rels = [self.sparse_dict[word][w][0]["rel"] for w in self.sparse_dict[word]]
                # cn_weights, cn_rels = zip(*self.sparse_dict[word])[:2]
                for j in range(len(cn_rels)):
                    if cn_rels[j] in rel_1:
                        rel_index = self.Rels.index(cn_rels[j])
                        feature[i][rel_index] += cn_weights[j]
                    if cn_rels[j] in rel_2:
                        # offset by 48 so the second relation set fills the upper half of the 96-dim vector
                        rel_index = self.Rels.index(cn_rels[j]) + 48
                        feature[i][rel_index] += cn_weights[j]
            emissions[i] += emission
        # Normalization
        feature = normalize(feature, axis=1) * emissions
        features.append(feature)
    features = np.asarray(features)
    return features
def pretreat_a(data):
    global train_ans, valid_ans, anslist
    fw = open(path + '/data/' + data + '.ans', 'w')
    f = open(path + '/data/' + data + '.a')
    lines = f.readlines()
    for line in lines:
        line = line.replace('\n', '').split(' ')
        j = len(line)
        for i in range(j):
            # 'the', 'and', 'a', 'an', 'with'
            for div in ['the', 'and', 'a', 'an', 'with']:
                if line[i] == div:
                    line[i] = ''
            # if re.match(r'[0-9]+', line[i]):
            #     line[i] = 'num'
            # elif re.match(r'[a-z][0-9]+', line[i]):
            #     line[i] = 'pos'
            line[i] = lemma(line[i])
            if line[i] == 'have':
                line[i] = ''
            if i != j - 1:
                if line[i] != '' and line[i + 1] != '':
                    line[i] = line[i] + ' '
            else:
                line[i] = line[i] + '\n'
            fw.write(line[i])
    f.close()
    fw.close()
def with_best_data():
    format_text = ''
    word_labels = {}
    # Read the articles directly, strip tags, tokenize, lemmatize, label, and write the output
    for parent, dirnames, filenames in os.walk(train_source_dir):
        for filename in filenames:
            # Read the file
            source_fp = open(train_source_dir + filename)
            all_source_text = source_fp.read().decode("utf-8")  # mind the encoding
            source_fp.close()
            # Strip tags
            re_h = re.compile('</?\w+[^>]*>')  # HTML tags
            source_text = re_h.sub(" ", all_source_text)
            # Tokenize, lemmatize, attach a sentiment label
            words = nltk.word_tokenize(source_text)
            print words
            for word in words:
                lemmed = lemma(word)
                polarity = sentiment(lemmed)[0]  # FIXME: this raises an error
                if polarity >= 0.5:
                    label = 2
                elif polarity <= -0.5:
                    label = 1
                else:
                    label = 0
                word_labels[lemmed] = label
                # print word, lemmed, label
    for word in word_labels:
        format_text += (word + '_' + str(word_labels[word]) + ' ')  # not right -- should this be one sentence per line??
    # Write to file
    write_file(format_text, output_filename)
def _check_synonyms_existence(word, word_sets, wn_pos):
    for synsets in wn.synsets(word.lower(), pos=wn_pos):
        for name in synsets.lemma_names():
            if (name in word_sets or wn.morphy(name) in word_sets
                    or lemma(name) in word_sets):
                return True
    return False
def lookup(self, word):
    if word in self._vocab._word2index:
        return word
    elif word in self._inverse_map.keys():
        return word
    else:
        try:
            base_word = lemma(word)
            if base_word in self._vocab._word2index:
                return base_word
            elif base_word in self._inverse_map.keys():
                return base_word
            else:
                return self._vocab.index2word(0)
        except:
            return self._vocab.index2word(0)
def superlative(self, word):
    '''
    Given a base-form word (adjective), return its superlative form.

    Args:
        word (str): base-form adjective

    Raises:
        ValueError: the base-form of the word is not in the vocabulary
        ValueError: no base-form can be found for the word

    Returns:
        str: superlative form
    '''
    if word in self._word2index:
        return superlative(word)
    else:
        try:
            base_form_word = lemma(word)
            if base_form_word in self._word2index:
                return superlative(base_form_word)
            else:
                raise ValueError(
                    "Found the base-form for '{}': '{}'. But even the base-form is not in the vocabulary"
                    .format(word, base_form_word))
        except:
            raise ValueError("Cannot find the base-form for '{}'".format(word))
def pluralize(self, word):
    '''
    Given the base-form of a word, return its plural form (nouns only).

    Args:
        word (str): base-form of the word

    Raises:
        ValueError: the base-form of the word is not in the vocabulary
        ValueError: no base-form can be found for the word

    Returns:
        str: plural form of the word
    '''
    if word in self._word2index:
        return pluralize(word)
    else:
        try:
            base_form_word = lemma(word)
            if base_form_word in self._word2index:
                return pluralize(base_form_word)
            else:
                raise ValueError(
                    "Found the base-form for '{}': '{}'. But even the base-form is not in the vocabulary"
                    .format(word, base_form_word))
        except:
            raise ValueError("Cannot find the base-form for '{}'".format(word))
def clean_and_lemmatize_doc(doc, build_vocab):
    stripped_doc = []
    for char in doc:
        if (ord(char) >= 48) & (ord(char) <= 57):     # digits 0-9
            stripped_doc.append(char)
        elif (ord(char) >= 65) & (ord(char) <= 90):   # uppercase A-Z
            stripped_doc.append(char)
        elif (ord(char) >= 97) & (ord(char) <= 122):  # lowercase a-z
            stripped_doc.append(char)
        elif ord(char) == 39:  # apostrophe
            continue
        else:
            stripped_doc.append(" ")
    doc = "".join(stripped_doc).lower()
    doc = doc.split(" ")
    stripped_doc = []
    for word in doc:
        try:
            cleaned_word = lemma(word)
            if len(cleaned_word) > 0:
                stripped_doc.append(cleaned_word)
                if build_vocab == True:
                    vocab.add(cleaned_word)
        except:
            continue
    return stripped_doc
def singularize(self, word):
    '''
    Given the base-form of a noun, return its singular form (nouns only).

    Args:
        word (str): base-form of the noun

    Raises:
        ValueError: the base-form of the word is not in the vocabulary
        ValueError: no base-form can be found for the word

    Returns:
        str: singular form of the noun
    '''
    if word in self._word2index:
        return singularize(word)
    else:
        try:
            base_form_word = lemma(word)
            if base_form_word in self._word2index:
                return singularize(base_form_word)
            else:
                raise ValueError(
                    "Found the base-form for '{}': '{}'. But even the base-form is not in the vocabulary"
                    .format(word, base_form_word))
        except:
            raise ValueError("Cannot find the base-form for '{}'".format(word))
def updateTerms(self, line, w2vmodel):
    list_term = line.split('_')
    list_result = []
    whitelist = set(['win', 'won', 'most', 'biggest', 'largest', 'fastest'])
    blacklist = set(['give', 'also'])
    stoplist = set(stopwords.words('english'))
    for term in list_term:
        if term in blacklist:
            continue
        if term not in whitelist and term in stoplist:
            continue
        # find
        lem = lemma(term)
        sing = singularize(term)
        if term in w2vmodel.vocab:
            list_result.append(term)
        elif lem in w2vmodel.vocab:
            list_result.append(lem)
        elif sing in w2vmodel.vocab:
            list_result.append(sing)
    return list_result
def lda_comparison(corpus_savepath):
    '''string -> lda topics
    corpus_savepath is the path to save the prepared corpus for lda'''
    # basic preprocessing and lemmatization, almost like in the lda2vec implementation
    texts = fetch_20newsgroups(subset='train').data
    texts = [unicode(d.lower()) for d in texts]
    texts = ["".join((char if char.isalpha() else " ") for char in text).split()
             for text in texts]
    texts = [stopwords.clean([lemma(i) for i in text[:1000]], "en") for text in texts]
    # creating frequency dictionary for tokens in text
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    # removing very infrequent and very frequent tokens in corpus
    texts = [[token for token in text
              if (frequency[token] > 10 and len(token) > 2
                  and frequency[token] < len(texts) * 0.2)] for text in texts]
    # creating an LDA model
    dictionary = gensim.corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    gensim.corpora.MmCorpus.serialize(corpus_savepath, corpus)
    modelled_corpus = gensim.corpora.MmCorpus(corpus_savepath)
    lda = gensim.models.ldamodel.LdaModel(modelled_corpus, num_topics=20,
                                          update_every=100, passes=20,
                                          id2word=dictionary, alpha='auto',
                                          eval_every=5)
    # returning the resulting topics
    return lda.show_topics(num_topics=20, num_words=10, formatted=True)
def Content2wv(line):
    pat = re.compile(r'([A-z]+)')
    # Tokenize
    line = line.replace('\n', '')
    words = line.split(" ")
    vector = np.zeros((100, 100), dtype=np.float)
    # Accumulate the word vectors
    for i in range(0, len(words)):
        # word = lemma(words[i])
        match = pat.findall(words[i])
        if match:
            try:
                vector += np.array(model.wv[match[0].lower()])
            except:
                try:
                    vector += np.array(model.wv[lemma(match[0])])
                except:
                    # print(lemma(match[0]))
                    vector += np.zeros((100, 100), dtype=np.float)
        else:
            tmp = np.full((100, 100), -1, dtype=float)
            vector += tmp
    # Average and flatten
    tmpLi = []
    for i in range(len(vector)):
        for j in range(len(vector[i])):
            vector[i][j] = vector[i][j] / len(words)
        tmpLi = tmpLi + list(vector[i])
    return tmpLi
def procesar_ejercicio_hiponimos(self, texto):
    items_ejercicio = []
    filtro = lambda x: es_sustantivo(x) or es_adjetivo(x)
    lista_palabras = obtener_palabras(filtro, texto)
    lista_palabras = list({each['token']: each for each in lista_palabras}.values())
    for palabra in lista_palabras:
        palabra_token = lemma(palabra['token'])
        categorias = Categorias().listar_categorias()
        palabra_categoria = None
        for categoria in categorias:
            categoria_synset_id = categoria['synset_id']
            es_hiponimo = hip.es_hiponimo(palabra_token, categoria_synset_id)
            if es_hiponimo:
                palabra_categoria = categoria['nombre']
        if palabra_categoria:
            # Avoid duplicate words
            if not any(obj.palabra == palabra_token for obj in items_ejercicio):
                item = ItemEjercicioHiponimos(palabra_token, palabra_categoria)
                items_ejercicio.append(item)
    return items_ejercicio
def _get_nouns(self, sentence):
    """
    input : String
    output: list
    Returns features (nouns) for each sentence.
    """
    doc = self.nlp(sentence)
    nouns = [unicode(lemma(str(word).lower())) for word in doc if word.pos == NOUN]
    return nouns
def basic_sentence_to_question(basic_sentence):
    sbj = ' '.join(bits_to_words(basic_sentence['SBJ']))
    obj = ' '.join(bits_to_words(basic_sentence['OBJ']))
    verb = ' '.join(bits_to_words(basic_sentence['VP']))
    if verb == 'is':
        return "What is " + sbj.lower() + "? " + obj
    return "What does " + sbj.lower() + " " + lemma(verb.lower()) + "?" + " " + obj
def createFeatVector(self, word_relations_list):
    sentence_len = len(word_relations_list)
    feature = np.zeros((sentence_len, 46))
    for i in range(len(word_relations_list)):
        word, rels = word_relations_list[i]
        relation_weights = cn.search(lemma(word).encode('utf-8'))
        for j in range(len(self.Rels)):
            if self.Rels[j] in relation_weights and self.Rels[j] in rels:
                feature[i][j] = relation_weights[self.Rels[j]]
    return feature
def _get_nouns(self, review):
    """
    Returns features (nouns) from each sentence of a review.
    """
    review_features = []
    for sent in review:
        doc = self.nlp(sent)
        # noun_phrase = [np.text for np in doc.noun_chunks]
        nouns = [unicode(lemma(str(word).lower())) for word in doc if word.pos == NOUN]
        review_features.append(nouns)
    return review_features
def lemmatize_article(article):
    '''
    INPUT: article (str) - raw text from the article (text has already been
           lowercased and stripped of punctuation)
    OUTPUT: lemmatized_article - article text with all stopwords removed and
            the remaining text lemmatized
    '''
    # Load in stopwords from load_data
    stopwords = stop_words()
    # Load dictionary to fix commonly mislemmatized words
    correct_lemma = fix_lemmatized_words()
    # Lemmatize the article by running each word through the pattern.en lemmatizer,
    # keeping it only if it is not in the set of stopwords
    article = ' '.join([en.lemma(w) for w in article.split() if w not in stopwords])
    # Return the article text after fixing common mislemmatized words
    return ' '.join([correct_lemma[w] if w in correct_lemma else w for w in article.split()])
def extract_noun_phrases(body_part_name):
    stop = nltk.corpus.stopwords.words('english')
    filename = '/Users/rsteckel/tmp/Observable_body_parts-sentences-BODYPART1.tsv'
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')
    df['lemmas'] = df['themeword'].apply(lambda x: lemma(x))
    sentences = df[df['lemmas'] == body_part_name]['sentence'].tolist()
    phrases = []
    for sentence in sentences:
        ptree = parsetree(sentence)
        matches = search('NP', ptree)
        for match in matches:
            filtered_np = [word for word in match if word.string.lower() not in stop]
            if len(filtered_np) > 0:
                phrases.append((sentence, filtered_np))
    return pd.DataFrame(phrases, columns=['sentence', 'phrase'])
print("") # COMPARATIVE & SUPERLATIVE ADJECTIVES # ------------------------------------ # The comparative() and superlative() functions give the comparative/superlative form of an adjective. # Words with three or more syllables are simply preceded by "more" or "most". for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]: print("%s => %s => %s" % (word, comparative(word), superlative(word))) print("") # VERB CONJUGATION # ---------------- # The lexeme() function returns a list of all possible verb inflections. # The lemma() function returns the base form (infinitive) of a verb. print("lexeme: %s" % lexeme("be")) print("lemma: %s" % lemma("was")) print("") # The conjugate() function inflects a verb to another tense. # You can supply: # - tense : INFINITIVE, PRESENT, PAST, # - person: 1, 2, 3 or None, # - number: SINGULAR, PLURAL, # - mood : INDICATIVE, IMPERATIVE, # - aspect: IMPERFECTIVE, PROGRESSIVE. # The tense can also be given as an abbreviated alias, e.g., # inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. from pattern.en import PRESENT, SINGULAR print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False)) print(conjugate("being", tense="1sg", negated=False)) print("")
# It matches anything from food to cat food, tasty cat food, the tasty cat food, etc.
t = parsetree('tasty cat food')
matches = search('DT? RB? JJ? NN+', t)
for match in matches:
    print match
print '\n'

filename = '/Users/rsteckel/tmp/Observable_body_parts-sentences-BODYPART1.tsv'
df = pd.read_csv(filename, sep='\t', encoding='utf-8')
df['lemmas'] = df['themeword'].apply(lambda x: lemma(x))

grby = df.groupby(['lemmas']).count()
sorted_df = grby.sort(['lemmas'], ascending=0)
bpdf = sorted_df[:3]
top_bps = set(bpdf['lemmas'].index.values)

df['topbp'] = df['lemmas'].apply(lambda x: x if x in top_bps else 'other')

records = []
for i, row in df.iterrows():
    try:
        if i % 100 == 0:
            print '%d of %d' % (i, len(df))
# COMPARATIVE & SUPERLATIVE ADJECTIVES # ------------------------------------ # The comparative() and superlative() commands give the comparative/superlative form of an adjective. # Words with three or more syllables are simply preceded by "more" or "most". for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]: print word, "=>", comparative(word), "=>", superlative(word) print print # VERB CONJUGATION # ---------------- # The lexeme() command returns a list of all possible verb inflections. # The lemma() command returns the base form (infinitive) of a verb. print "lexeme:", lexeme("be") print "lemma:", lemma("was") # The conjugate() command inflects a verb to another tense. # The tense can be given as a constant, e.g. # INFINITIVE, PRESENT_1ST_PERSON_SINGULAR PRESENT_PLURAL, PAST_PARTICIPLE, ... # or as an abbreviated alias: inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. print conjugate("being", tense="1sg", negated=False) # Prefer the full constants for code that will be reused/shared. # The tenses() command returns a list of all tenses for the given verb form. # For example: tenses("are") => ['present 2nd person singular', 'present plural'] # You can then check if a tense constant is in the list. # This will also work with aliases, even though they are not explicitly in the list. from pattern.en import PRESENT_PLURAL print tenses("are")
def words_same_sounds_score(w1, w2, pronunciations):
    """
    Given 2 words and the pronunciation dictionary
    Returns a score for how alike they sound
    Works by looking at intersection of phonemes
    TODO: Currently only uses default pronunciation from CMU, ignores any variations
          Also should deal with words not in the CMU pronouncing dict
    """
    import re

    # If words are double, make sure pronunciations for words are found separately and joined
    w1_split, w2_split = re.split('-|_| ', w1), re.split('-|_| ', w2)
    p1_list, p2_list = [], []

    # Firstly, if w1 contains a word in w2 or vice versa, return score of 0
    from pattern.en import lemma
    w1_temp = [lemma(x) for x in w1_split]
    w2_temp = [lemma(x) for x in w2_split]
    overlap = [val for val in w1_temp if val in w2_temp]
    if overlap:
        return 0

    try:
        for w in w1_split:
            p1_list.extend(pronunciations[w][0])
        for w in w2_split:
            p2_list.extend(pronunciations[w][0])
    except:
        # Can't find words in CMU dict
        return 0

    # If we have pronunciations for each word we continue
    if p1_list and p2_list:
        # We will populate these scores
        exact_sounds_score, approx_sounds_score = 0, 0

        # Remove stresses, not interested in these
        def remove_number(x):
            if x[-1] in ('0', '1', '2'):
                return x[:-1]
            else:
                return x

        p1_no_stress = [remove_number(x) for x in p1_list]
        p2_no_stress = [remove_number(x) for x in p2_list]

        # Find the overlap in sounds
        overlap = [x for x in p1_no_stress if x in p2_no_stress]
        # Find proportion of original sounds covered by overlap
        prop_1 = len([x for x in p1_no_stress if x in overlap]) * 1.0 / len(p1_no_stress)
        prop_2 = len([x for x in p2_no_stress if x in overlap]) * 1.0 / len(p2_no_stress)
        # print prop_1, prop_2
        if prop_1 + prop_2 > 0:
            exact_sounds_score = prop_1 * prop_2 * 2.0 / (prop_1 + prop_2)

        # Now check with approximate sounds, same process (weight score lower though)
        p1_no_stress = [approx_sounds.get(x, x) for x in p1_no_stress]
        p2_no_stress = [approx_sounds.get(x, x) for x in p2_no_stress]
        # Find the overlap in approx sounds
        overlap = [x for x in p1_no_stress if x in p2_no_stress]
        prop_1 = len([x for x in p1_no_stress if x in overlap]) * 1.0 / len(p1_no_stress)
        prop_2 = len([x for x in p2_no_stress if x in overlap]) * 1.0 / len(p2_no_stress)
        if prop_1 + prop_2 > 0:
            approx_sounds_score = prop_1 * prop_2 * 2.0 / (prop_1 + prop_2) * 0.66  # Weighted down

        # Return the higher score of the two
        return max(exact_sounds_score, approx_sounds_score)
def buildRules(self, fname=".tmp_pos", debug=0):
    # fname = ".tmp_pos"
    wlist = ["0"]
    taglist = []
    ind = 1
    changed = 0
    tverbs = []
    with open(fname, "r") as fid:
        for line in fid:
            if line == "\n":
                # print " ".join(wlist)
                new_str = " ".join(taglist)
                # print new_str
                m = com1.search(new_str)
                if m != None:
                    wid = m.group(3).split("_")
                    # print wlist[int(wid[0])]
                    tverbs.append("BPA" + lemma(wlist[int(wid[0])]))
                m = com1_h.search(new_str)
                if m != None:
                    wid = m.group(3).split("_")
                    # print wlist[int(wid[0])]
                    tverbs.append("BPE" + lemma(wlist[int(wid[0])]))
                m = com2.search(new_str)
                if m != None:
                    wid = m.group(2).split("_")
                    # print wlist[int(wid[0])]
                    tverbs.append("BVP" + lemma(wlist[int(wid[0])]))
                m = com4.search(new_str)
                if m != None:
                    wid = m.group(2).split("_")
                    # print wlist[int(wid[0])]
                    tverbs.append("BVP" + lemma(wlist[int(wid[0])]))
                m = com7.search(new_str)
                if m != None:
                    wid = m.group(1).split("_")
                    # print wlist[int(wid[0])]
                    tverbs.append("AVP" + lemma(wlist[int(wid[0])]))
                m = com8.search(new_str)
                if m != None:
                    wid = m.group(1).split("_")
                    # print wlist[int(wid[0])]
                    tverbs.append("AVP" + lemma(wlist[int(wid[0])]))
                m = com13.search(new_str)
                if m != None:
                    wid = m.group(1).split("_")
                    inid = m.group(2).split("_")
                    # print wlist[int(wid[0])]
                    tverbs.append("AVO" + wlist[int(wid[0])] + " " + wlist[int(wid[0])])
                m = com14.search(new_str)
                if m != None:
                    wid = m.group(1).split("_")
                    inid = m.group(2).split("_")
                    # print m.groups()
                    tverbs.append("AVN" + wlist[int(wid[0])] + " " + wlist[int(inid[0])])
                m = com15.search(new_str)
                if m != None:
                    wid = m.group(1).split("_")
                    inid = m.group(2).split("_")
                    # print m.groups()
                    tverbs.append("ANN" + wlist[int(wid[0])] + " " + wlist[int(inid[0])])
                # print "\n"
                wlist = ["0"]
                taglist = []
                ind = 1
            else:
                line = line.rstrip()
                l = line.split("\t")
                if l[0] == "-LRB-":
                    l[0] = "("
                    changed = 1
                elif l[0] == "-RRB-":
                    l[0] = ")"
                    changed = 1
                # elif l[0] in ["IBM", "ibm"]:
                elif l[0] in self.seed:
                    l[1] = "BOUGHT"
                    changed = 1
                elif l[0] in ["is", "are", "was", "were", "been", "have", "has", "had"]:
                    l[1] = l[0]
                    changed = 1
                wlist.append(l[0])
                if changed == 0:
                    l[1] = str(ind) + "_" + l[1]
                taglist.append(l[1])
                changed = 0
                ind += 1
    if debug == 1:
        for s in set(tverbs):
            if tverbs.count(s) > 1:
                print s
    else:
        fout = open(self.outFile, "w")
        for s in set(tverbs):
            if tverbs.count(s) > 1:
                fout.write("%s\n" % s)
        fout.close()
def find_similarity(query):
    ### Finding patterns
    a = query.split(",")
    files = '/home/sandy/SEM-6/IRE/Project/Mine/htmloutput.html'
    ff = open(files, "r")
    data = ff.read()
    data = data.split('************** New PAGE *********************')
    while '' in data:
        data.remove('')
    patternout = []
    for i in data:
        patternout = patternout + expandUsingPatterns(i, a)
    patternout = Counter(patternout)

    ### Word2vec model
    model1 = word2vec.Word2Vec.load_word2vec_format('./word2vec/wiki.model.bin', binary=True)
    print "Model loaded"
    print "\n"
    # a = query.split(",")
    scores = {}
    f = open('./indexfile', 'r')
    for i in a:
        for line in f:
            line = line.strip("\n")
            try:
                sc = model1.similarity(i, line)
            except:
                sc = 0
            # print sc
            try:
                scores[line] = scores[line] + sc
            except:
                scores[line] = sc
    sorted_x = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
    k = 0
    print "Printing Results"
    print "\n"
    flag = 0
    output = []
    for key, value in sorted_x:
        if key in a:
            continue
        for j in a:
            if en.lemma(key) == j or en.lemma(j) == key:
                flag = 1
        if flag == 1:
            flag = 0
            continue
        for ll in output:
            if en.lemma(key) == ll or en.lemma(ll) == key:
                flag = 1
        if flag == 1:
            flag = 0
            continue
        k = k + 1
        output.append(key)
        if k == 40:
            break
    cnt = 0
    final = []
    for res in output:
        if res in patternout.keys():
            cnt = cnt + 1
            final.append(res)
            print cnt, ":", res
        if cnt == 10:
            break
    if cnt < 10:
        for res in output:
            if res not in final:
                cnt = cnt + 1
                print cnt, ":", res
            if cnt == 10:
                break
    f.close()
    ff.close()
def test_lemma(self):
    # Assert the infinitive of "weren't".
    v = en.lemma("weren't")
    self.assertEqual(v, "be")
    print "pattern.en.inflect.lemma()"
# COMPARATIVE & SUPERLATIVE ADJECTIVES # ------------------------------------ # The comparative() and superlative() functions give the comparative/superlative form of an adjective. # Words with three or more syllables are simply preceded by "more" or "most". for word in ["gentle", "big", "pretty", "hurt", "important", "bad"]: print(word, "=>", comparative(word), "=>", superlative(word)) print() print() # VERB CONJUGATION # ---------------- # The lexeme() function returns a list of all possible verb inflections. # The lemma() function returns the base form (infinitive) of a verb. print("lexeme:", lexeme("be")) print("lemma:", lemma("was")) print() # The conjugate() function inflects a verb to another tense. # You can supply: # - tense : INFINITIVE, PRESENT, PAST, # - person: 1, 2, 3 or None, # - number: SINGULAR, PLURAL, # - mood : INDICATIVE, IMPERATIVE, # - aspect: IMPERFECTIVE, PROGRESSIVE. # The tense can also be given as an abbreviated alias, e.g., # inf, 1sg, 2sg, 3sg, pl, part, 1sgp, 2sgp, 3sgp, ppl, ppart. from pattern.en import PRESENT, SINGULAR print(conjugate("being", tense=PRESENT, person=1, number=SINGULAR, negated=False)) print(conjugate("being", tense="1sg", negated=False)) print()
def preprocess_debateIII(debate):
    # lemmatises text
    lemmatised_words = [lemma(w) for w in debate]
    return lemmatised_words
# Indefinite article
print article('university')
print article('hour')
print referenced('university')
print referenced('hour')

# Pluralization / singularization
print pluralize('child')
print singularize('wolves')

# Verb lemmatization and conjugation
print lexeme('run')
print lemma('running')
print conjugate('purred', '3sg')
print PAST in tenses('purred')  # 'p' in tenses() also works.
print (PAST, 1, PL) in tenses('purred')

print 'Quantification'
print quantify(['goose', 'goose', 'duck', 'chicken', 'chicken', 'chicken'])
print quantify('carrot', amount=90)
print quantify({'carrot': 100, 'parrot': 20})

print 'ngrams'
print ngrams("I am eating a pizza.", n=2)

# parse
def build_analyzer(self):
    analyzer = super(CountVectorizer, self).build_analyzer()
    return lambda doc: (en.lemma(word) for word in analyzer(doc)
                        if str.isdigit(unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')) == False)
def transform(text):
    transformations = []

    # text
    text = text.lower() + " "

    # change "going to do something" -> "do something", while preserving "going to a thing"
    tags = tag(text)
    if tags[0][0] == "going" and tags[1][0] == "to" and tags[2][1] == "VB":
        text = " ".join(text.split()[2:]) + " "

    # transform the verb
    orig_verb = text.split()[0]
    if "f**k" in orig_verb:
        transformations += [(orig_verb, "")]
        try:
            orig_verb = text.split()[1]
        except IndexError:
            return ''
    elif "something" in orig_verb or "anything" in orig_verb:
        return ''
    new_verb = conjugate(lemma(orig_verb), person=3)
    # weird "lies" bug?
    if new_verb == 'layers':
        new_verb = 'lies'
    transformations += [(orig_verb, new_verb)]

    # transform first person to third
    transformations += [(" me ", " u ")]
    transformations += [(" my ", " ur ")]
    transformations += [(" i'm ", " ur ")]
    transformations += [(" im ", " ur ")]
    transformations += [(" i am ", " ur ")]
    transformations += [(" i ", " u ")]
    transformations += [(" i ", " u ")]
    transformations += [(" i've ", " u've ")]
    transformations += [(" ive ", " u've ")]
    transformations += [(" i'd ", " u'd ")]
    transformations += [(" id ", " u'd ")]
    transformations += [(" we ", " u ")]
    transformations += [(" ours ", " urs ")]
    transformations += [(" our ", " ur ")]
    transformations += [(" us ", " ur ")]

    # transform third person to gender-neutral
    transformations += [(" his ", " her ")]
    transformations += [(" him ", " her ")]
    transformations += [(" her ", " her ")]
    transformations += [(" he ", " she ")]
    transformations += [(" she ", " she ")]
    transformations += [(" he's ", " she's ")]
    transformations += [(" she's ", " she's ")]
    transformations += [(" hes ", " she's ")]
    transformations += [(" shes ", " she's ")]
    transformations += [(" n't ", " not ")]

    for orig, repl in transformations:
        text = text.replace(orig, repl)
    return text.strip()
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('humidity')

from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
st.stem('luminous')

lemma('humidity')

frames = fn.frames_by_lemma(r'skin')
for f in frames:
    print '%s - %s\n' % (f.name, f.definition)

fn.lexical_units(r'')
fn.frames_by_lemma(r'(?i)a little')

for f in ('reflect', 'bank'):
    taxonomy.append(f, type='angle')