def extract_verb_and_entity_name_from_text(self, text):
    """
    Collect the lemmatized verbs and the cleaned entity names found in a text.

    The text is parsed with ``parsetree``. Every chunk of type "NP" is kept as
    an entity-name candidate. From every chunk of type "VP", each entry of
    ``chunk.tagged`` whose second element starts with "V" contributes its
    first element as a verb candidate.

    Verb candidates are de-duplicated, lemmatized with ``self.lemmatizer`` and
    dropped when the lemma is in ``self.stopwords``. NP chunks are cleaned with
    ``self.get_clean_entity_name_for_string`` and dropped when the cleaned
    name is falsy.

    :param text: the text
    :return: list of unique verb lemmas and entity names (order not defined)
    """
    noun_phrases = []
    verb_words = []
    for sentence in parsetree(text):
        for chunk in sentence.chunks:
            if chunk.type == "NP":
                noun_phrases.append(chunk)
            elif chunk.type == "VP":
                verb_words.extend(
                    tagged_word[0]
                    for tagged_word in chunk.tagged
                    if tagged_word[1][0] == "V"
                )

    collected = []

    # Verbs first: lemmatize each distinct surface form, skip stopwords.
    for verb in set(verb_words):
        lemma = self.lemmatizer.lemmatize(verb)
        if lemma not in self.stopwords:
            collected.append(lemma)

    # Then the noun phrases, reduced to their cleaned entity names.
    for phrase in noun_phrases:
        cleaned_name = self.get_clean_entity_name_for_string(phrase.string)
        if cleaned_name:
            collected.append(cleaned_name)

    return list(set(collected))
def extract_chunk(self, text):
    """
    Return the noun-phrase chunks extracted from the text.

    Only chunks whose ``type`` is "NP" are returned; all other chunk types
    produced by ``parsetree`` are ignored.

    :param text: the text
    :return: list of NP chunks in parse order (objects yielded by
             ``sentence.chunks``; presumably pattern.text.Chunk)
    """
    return [
        chunk
        for sentence in parsetree(text)
        for chunk in sentence.chunks
        if chunk.type == "NP"
    ]
def extract_single_verb(self, text):
    """
    Return the lemmatized first word of every verb phrase in the text.

    For each chunk of type "VP" found by ``parsetree``, the first
    space-separated token of ``chunk.string`` is taken. The distinct tokens
    are lemmatized with ``self.lemmatizer`` and lemmas that appear in
    ``self.stopwords`` are discarded.

    :param text: the text
    :return: list of unique lemma strings (order not defined)
    """
    first_words = set(
        chunk.string.split(" ")[0]
        for sentence in parsetree(text)
        for chunk in sentence.chunks
        if chunk.type == "VP"
    )
    # Lazily lemmatize so that each lemma is checked right after it is made.
    lemmas = (self.lemmatizer.lemmatize(word) for word in first_words)
    kept = [lemma for lemma in lemmas if lemma not in self.stopwords]
    # Different surface forms can share a lemma, so de-duplicate again.
    return list(set(kept))
def test_get_np_for_all(self):
    """
    Print the parse tree and the NP chunks of every text in ``self.text_list``.

    For each text the full ``parsetree`` result is printed, followed by one
    line per chunk of type "NP" showing the chunk type and the list of
    ``(word.string, word.type)`` pairs of its words.

    The method makes no assertions; it only produces output for inspection.
    """
    text_list = self.text_list
    from textblob.taggers import NLTKTagger
    from textblob.tokenizers import SentenceTokenizer

    # NOTE(review): the Blobber is constructed but never used below (the code
    # that consumed it was commented out). It is kept so this method still
    # exercises the textblob setup exactly as before.
    chunker = ConllExtractor()
    tb = Blobber(pos_tagger=NLTKTagger(), tokenizer=SentenceTokenizer(), np_extractor=chunker)

    for text in text_list:
        pst = parsetree(text)
        print(pst)
        for sentence in pst:
            for chunk in sentence.chunks:
                if chunk.type == "NP":
                    tagged_words = [(w.string, w.type) for w in chunk.words]
                    # Single pre-formatted argument: prints the same text on
                    # Python 2 and Python 3 (the old print statement was a
                    # SyntaxError on Python 3).
                    print("%s %s" % (chunk.type, tagged_words))
# Parse the same short phrase in French, Italian and Dutch,
# asking for universal tags instead of the default tagset.
for _sample, _code in (
    ("les chats noirs", "fr"),
    ("i gatti neri", "it"),
    ("de zwarte katten", "nl"),
):
    print(parse(_sample, chunks=False, language=_code, tagset=UNIVERSAL))
print("")

# This comes at the expense of (in this example) losing information about plural nouns (NNS => NN).
# But it may be more comfortable for you to build multilingual apps
# using the universal constants (e.g., PRON, PREP, CONJ),
# instead of learning the Penn Treebank tagset by heart,
# or wondering why the Italian "che" is tagged "PRP", "IN" or "CC"
# (in the universal tagset it is a PRON or a CONJ).

from pattern.text import parsetree

# Print every word of the Italian sentence that carries the universal PRON tag.
for sentence in parsetree("i gatti neri che sono la mia", language="it", tagset=UNIVERSAL):
    for word in sentence.words:
        if word.tag == PRON:
            print(word)

# The language() function in pattern.text can be used to guess the language of a text.
# It returns a (language code, confidence)-tuple.
# It can guess en, es, de, fr, it, nl.

from pattern.text import language

print("")
for _text in (
    u"the cat sat on the mat",              # ("en", 1.00)
    u"de kat zat op de mat",                # ("nl", 0.80)
    u"le chat s'était assis sur le tapis",  # ("fr", 0.86)
):
    print(language(_text))
# Parse the same short phrase in five languages,
# asking for universal tags instead of the default tagset.
for _sample, _code in (
    ("the black cats", "en"),
    ("los gatos negros", "es"),
    ("les chats noirs", "fr"),
    ("i gatti neri", "it"),
    ("de zwarte katten", "nl"),
):
    print(parse(_sample, chunks=False, language=_code, tagset=UNIVERSAL))
print()

# This comes at the expense of (in this example) losing information about plural nouns (NNS => NN).
# But it may be more comfortable for you to build multilingual apps
# using the universal constants (e.g., PRON, PREP, CONJ),
# instead of learning the Penn Treebank tagset by heart,
# or wondering why the Italian "che" is tagged "PRP", "IN" or "CC"
# (in the universal tagset it is a PRON or a CONJ).

from pattern.text import parsetree

# Print every word of the Italian sentence that carries the universal PRON tag.
for sentence in parsetree("i gatti neri che sono la mia", language="it", tagset=UNIVERSAL):
    for word in sentence.words:
        if word.tag == PRON:
            print(word)

# The language() function in pattern.text can be used to guess the language of a text.
# It returns a (language code, confidence)-tuple.
# It can guess en, es, de, fr, it, nl.

from pattern.text import language

print()
for _text in (
    u"the cat sat on the mat",              # ("en", 1.00)
    u"de kat zat op de mat",                # ("nl", 0.80)
    u"le chat s'était assis sur le tapis",  # ("fr", 0.86)
):
    print(language(_text))
def getSortedSentenceList(query, raw_sentence_list, english_postagger,
                          min_words=14, max_sentences=2000, max_paths=2000):
    """
    Generate candidate sentences from a word graph and score them against a query.

    Steps, in order:

    1. The raw sentences are enhanced, POS-tagged and rewritten, then turned
       into a graph by ``generateMultiplePaths``.
    2. Paths from the '-start-' vertex to the '-end-' vertex are enumerated,
       shuffled and capped at ``max_paths``.
    3. Each path with at least ``min_words`` elements is joined into a
       sentence. It is kept when its '(' / ')' tokens balance out, it has an
       even number of quotation-mark tokens, and it was not seen before.
    4. The kept sentences are shuffled, capped at ``max_sentences`` and
       filtered to those whose parse contains a 'VP' chunk and an 'SBJ'
       relation entry.
    5. TF-IDF vectors are built for the query plus the sentences. Each
       sentence gets the score ``pagerank * relevance``, where ``pagerank``
       comes from the graph of non-zero sentence/sentence similarities and
       ``relevance`` is the linear kernel between the query vector and the
       sentence vector.

    :param query: query text; used as the first TF-IDF document
    :param raw_sentence_list: input passed to ``generateEnhancedSentences``
    :param english_postagger: tagger object providing ``tag_sents``
    :param min_words: minimum number of path elements for a path to be used
    :param max_sentences: maximum number of generated sentences to keep
    :param max_paths: maximum number of graph paths to examine (was a
                      hard-coded 2000)
    :return: tuple ``(scoredList, cosine_similarity_matrix)`` where
             ``scoredList`` holds ``(score, sentence, index)`` triples and the
             matrix is the dense sentence/sentence similarity array
    """
    import time  # local import: only needed for the debug timing below

    enhancedSentences = generateEnhancedSentences(raw_sentence_list, english_postagger)
    print('Enhanced sentences==> %s' % len(enhancedSentences))
    taggedSentences = english_postagger.tag_sents(
        nltk.word_tokenize(sent) for sent in enhancedSentences)
    taggedSentences = generateTempRewrittenSentences(taggedSentences)

    # igraph object
    iobject = generateMultiplePaths(taggedSentences)
    startvertex = getVertex(iobject, '-start-/-/-start-')
    endvertex = getVertex(iobject, '-end-/-/-end-')
    vertexList = iobject.vs()
    allpaths = paths_from_to(iobject, startvertex, endvertex)
    shuffle(allpaths)
    allpaths = allpaths[0:max_paths]

    generatedSentences = []
    # NOTE(review): the original timestamp expressions were lost in the
    # source ("a = " / "b = " had no right-hand side); time.time() is assumed.
    started_at = time.time()
    print('starting paths...')
    seen_sentences = set()
    for path in allpaths:
        if len(path) < min_words:
            continue
        sentence = ' '.join(
            getWordFromVertexName(vertexList[element]['name']) for element in path)

        # Net parenthesis balance and number of quotation-mark tokens.
        paren_balance = 0
        quote_count = 0
        for word in sentence.split():
            if word == '(':
                paren_balance -= 1
            elif word == ')':
                paren_balance += 1
            elif word in ('"', "''", '``'):
                quote_count += 1

        candidate = sentence.strip()
        if paren_balance == 0 and quote_count % 2 == 0 and candidate not in seen_sentences:
            generatedSentences.append(candidate)
            seen_sentences.add(candidate)
    print('done with paths %s' % (time.time() - started_at,))

    shuffle(generatedSentences)
    generatedSentences = generatedSentences[0:max_sentences]

    # Keep only sentences with a verb phrase and a subject relation. A new
    # list is built on purpose: removing from the list while iterating over
    # it (as the previous code did) skips the element after each removal.
    # NOTE(review): this checks for the presence of the 'SBJ' key in
    # row.relations, not for a non-empty subject -- confirm that is intended.
    filteredSentences = []
    for gensent in generatedSentences:
        s = parsetree(gensent, tokenize=True, relations=True, lemmata=True)
        chunkList = [chunk.type for row in s for chunk in row.chunks]
        relationList = [rel for row in s for rel in row.relations]
        if 'VP' in chunkList and 'SBJ' in relationList:  # subject verb
            filteredSentences.append(gensent)
    generatedSentences = filteredSentences

    # Document 0 is the query; documents 1..n are the generated sentences.
    docs = [query]
    docs.extend(generatedSentences)
    bow_matrix = TfidfVectorizer(stop_words=stopwordList).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # Dense sentence/sentence similarity; .toarray() replaces the sparse
    # ".A" shorthand, which newer SciPy releases no longer provide.
    cosine_similarity_matrix = (normalized[1:] * normalized[1:].T).toarray()
    sources, targets = cosine_similarity_matrix.nonzero()

    # Pass the vertex count explicitly so that sentences without any
    # non-zero similarity still get a vertex (and therefore a score).
    similarity_igraph = igraph.Graph(
        n=len(generatedSentences),
        edges=list(zip(sources.tolist(), targets.tolist())),
        directed=True)
    scores = similarity_igraph.pagerank()

    docqueryRelevance = linear_kernel(normalized[0:1], normalized[1:]).flatten()
    scoredList = [(scores[i] * docqueryRelevance[i], s, i)
                  for i, s in enumerate(generatedSentences)]
    return scoredList, cosine_similarity_matrix
(r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ... ] rt = RegexpTagger(patterns) rt.evaluate(test_data) ut = UnigramTagger(train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) ut.evaluate(test_data) def combined_tagger(train_data, taggers, backoff=None): for tagger in taggers: backoff = tagger(train_data, backoff=backoff) return backoff ct = combined_tagger(train_data=train_data, taggers=[UnigramTagger, BigramTagger, TrigramTagger], backoff=rt) tree = parsetree(sentence) for sentence_tree in tree: print(sentence_tree.chunks)
'''
Created on Apr 9, 2015

@author: sub253

Parse one sentence with pattern and print the words of its subject (SBJ)
and object (OBJ) relation chunks, followed by the subject text.
'''
from pattern.text import parsetree, Chunk, pprint, parse

sent = 'A number of methods have been recommended to help ease symptoms, including adequate liquid intake and rest.'
# Alternative input: 'There are two more scripts of interest.'
s = parsetree(sent, tokenize=True, relations=True, lemmata=True)

# Every print below passes a single pre-formatted argument so the output is
# the same on Python 2 and Python 3 (the old print statements were a
# SyntaxError on Python 3).
print(s)
relationList = s.sentences[0].relations
print('Relationlist=> %s' % (relationList,))

sbjstring = ''
objstring = ''

if 'SBJ' in relationList:
    for chunk in relationList['SBJ'].values():
        print(chunk.words)
        sbjstring = sbjstring + ' ' + ' '.join(word.string for word in chunk.words)

if 'OBJ' in relationList:
    for chunk in relationList['OBJ'].values():
        print(chunk.words)
        objstring = objstring + ' ' + ' '.join(word.string for word in chunk.words)

# NOTE(review): objstring is built above but never printed or used;
# only the subject text is reported -- confirm whether that is intended.
print(sbjstring.strip())