Example 1
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker    = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'  # no tuple-unpacking lambdas in Python 3
            ) if key
        ]

        # Yield candidates that are not filtered by stopwords and punctuation.
        for chunk in normalizer.normalize(chunks):
            yield chunk
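The GRAMMAR constant and the Normalizer helper are defined elsewhere in the original module. Below is a minimal, self-contained sketch of the same chunk-then-group pipeline, assuming the KT grammar that appears in later examples and that the NLTK tagger data is installed:

import nltk
from itertools import groupby
from nltk import RegexpParser
from nltk.chunk import tree2conlltags

# Assumed grammar: adjective/noun sequences, optionally joined by a preposition.
GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

sent = nltk.pos_tag(nltk.wordpunct_tokenize("Natural language processing of raw text."))
chunks = tree2conlltags(RegexpParser(GRAMMAR).parse(sent))
candidates = [
    " ".join(word for word, pos, chunk in group).lower()
    for key, group in groupby(chunks, lambda term: term[-1] != "O")
    if key
]
print(candidates)  # e.g. ['natural language processing of raw text']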
Example 2
def extract_candidate_phrases(sents, grammar=GRAMMAR, tagged=False):

    # Create the chunker that uses our grammar
    chunker = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.word_tokenize(sent))

        # Parse the sentence, converting the parse tree into a tagged sequence
        sent = normalize(sent)
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract phrases and rejoin them with space
        phrases = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]

        for phrase in phrases:
            yield phrase
Example 3
    def build_vocabulary(self):
        """
        Generate a list of candidate phrases from the documents, using POS tagging and chunking
        functionality of nltk.
        """
        stop_words = set(stopwords.words('english'))

        vocabulary = []
        for doc in self.documents:
            words = []
            candidates = []
            clean_doc = text_cleaner(doc)
            sentences = sent_tokenize(clean_doc)
            words.extend([word_tokenize(sentence) for sentence in sentences])
            tagged_words = pos_tag_sents(words)

            grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
            chunker = RegexpParser(grammar)
            # split into a private function
            all_tag = chain.from_iterable(
                [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
            for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
                candidate = ' '.join([word for (word, pos, chunk) in group])
                if key is True and candidate not in stop_words:
                    candidates.append(candidate)
            vocabulary.append(candidates)

        vocabulary = list(chain(*vocabulary))
        vocabulary = list(np.unique(vocabulary))

        self.vocabulary = vocabulary
Example 4
def generate_candidate(texts, method='phrase', remove_punctuation=True):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence)  # remove punctuation
            # sentence = re.sub(r'[^\w]', ' ', sentence)
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
        tagged_words = pos_tag_sents(words_)  # POS tagging
        words_.clear()

        if method == 'word':
            tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
            tagged_words = chain.from_iterable(tagged_words)
            for word, tag in tagged_words:
                if tag in tags and word.lower() not in stop_words:
                    candidates.append(word)
        elif method == 'phrase':
            # grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
            grammar = r'KT: {(<JJ><NN.*>)' \
                      r' | (<NN.*><NN.*>) ' \
                      r' | (<NN.*><NN.*><NN.*>) ' \
                      r'| (<JJ><JJ><NN.*>+)' \
                      r' | (<JJ><NN.*><NN.*>)' \
                      r' | (<NN.*><JJ><NN.*>) ' \
                      r'| (<NN.*><IN><NN.*>) ' \
                      r'| (<JJ><NN.*><IN><NN.*>) ' \
                      r'| (<NN.*><IN><JJ><NN.*>) ' \
                      r'| (<JJ><NN.*><IN><JJ><NN.*>) }'
            chunker = RegexpParser(grammar)
            all_tag = chain.from_iterable(
                [tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
            for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
                candidate = ' '.join([word for (word, pos, chunk) in group])
                if key is True and candidate not in stop_words:
                    candidates.append(candidate)
        else:
            print("Use either 'word' or 'phrase' in method")

    return candidates
Example 5
def buildchunkerlist(grammerlst, tagged):
    gtree = []
    for g in grammerlst:
        chunker = RegexpParser(g)
        OP = chunker.parse(tagged)
        if (OP.height() >= 3 ):
            gtree.append(OP.subtrees(lambda t: t.height() == 2))
            
    return gtree
Example 6
def parseRelatedFeature(sent, tagged):
    
    chunker = RegexpParser(''' OP5: {<.*>+<NN>?<CD><.*>+<NN>?} ''')
    OP = chunker.parse(tagged)
    if (OP.height() >= 3 ):
        for m in OP.subtrees(lambda t: t.height() == 2):
            for (word,tag) in m:
                if ( tag == "NN" and r3.match(word)):
                    return True
Example 7
class KeyPhraseGenerator():
    """
    Extracts keyphrases from input list of strings.
    """
    def __init__(self, grammar=GRAMMAR, stopwords=STOPWORDS):

        self.chunker = RegexpParser(grammar)
        self.stopwords = stopwords

    def clean_text(self, txt):
        """
        Removes emoji and urls from text.
        """
        cleaned = cleaner.remove_emojis(txt)
        cleaned = cleaner.remove_urls(cleaned)
        return cleaned

    def clean_tagged_text(self, tagged_text):
        """
        Remove punctuation from tagged text.
        """
        punct_tagged = lambda word: all(
            unicat(char).startswith("P") and char != "," for char in word)
        cleaned = filter(lambda t: not punct_tagged(t[0]), tagged_text)
        return list(cleaned)

    def extract_keyphrases_single(self, txt):
        """
        Yields keyphrases for one piece of text.
        """
        for sent in txt:
            sent = self.clean_tagged_text(sent)
            if not sent:
                continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != "O")
                if key
            ]
            for phrase in phrases:
                if phrase.lower() not in self.stopwords and len(phrase) > 2:
                    yield phrase

    def extract_keyphrases(self, txt_list):
        """
        Returns keyphrases for input list of strings.
        """
        key_docs = []
        for txt in txt_list:
            tagged_doc = []
            txt = self.clean_text(txt)
            for sent in nltk.sent_tokenize(txt):
                tagged_doc.append(nltk.pos_tag(nltk.word_tokenize(sent)))
            key_docs.append(list(self.extract_keyphrases_single(tagged_doc)))
        return key_docs
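A hedged usage sketch for the class above, assuming the cleaner helpers referenced in clean_text are importable; GRAMMAR and STOPWORDS are placeholders mirroring constants defined elsewhere in the original module:

import nltk

GRAMMAR = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'   # assumed value
STOPWORDS = set(nltk.corpus.stopwords.words("english"))  # assumed value

kp = KeyPhraseGenerator(grammar=GRAMMAR, stopwords=STOPWORDS)
docs = kp.extract_keyphrases(["Keyphrase extraction finds the salient noun phrases in raw text."])
print(docs)  # one list of keyphrases per input string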
Example 8
def getConcepts(text):
    grammar = """
        CONCEPT:   {(<DT>)?(<JJ>)?<NN|NNS>+}
    """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if (type(i) == Tree and i.label() == "CONCEPT"):
            current_chunk.append(" ".join([token
                                           for token, pos in i.leaves()]))
    return current_chunk
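A quick check of the CONCEPT grammar on a sample sentence (the exact chunks depend on the POS tagger):

print(getConcepts("The annual report describes the new pricing strategy."))
# e.g. ['The annual report', 'the new pricing strategy']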
Example 9
def vocab_gen(texts, bool_key):
    list_word = []
    vocabs = []
    word_write = ""
    phrase_write = ""
    pos_write = ""
    sentences = sent_tokenize(texts)
    sentence_write = "\n".join(sentences)
    for sentence in sentences:
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        list_word.append(words)
    words_w_pos = pos_tag_sents(list_word)  # POS
    dumb = [j for sub in words_w_pos for j in sub]
    dumb = pos_tag_sents(dumb)
    dumb = [j for sub in dumb for j in sub]
    for i in dumb:
        pos_write += str(i)
        pos_write += "\n"
    # define grammar to pull out the phrases
    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)
    all_tag = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_w_pos])
    for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
        vocabs_temp = ' '.join([word for (word, pos, chunk) in group])
        if bool_key == 'Phrase':
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and ' ' in vocabs_temp:
                vocabs.append(vocabs_temp)
                phrase_write += vocabs_temp
                phrase_write += "\n"
        else:
            if key is True and vocabs_temp not in stop_words and len(
                    vocabs_temp) > 2 and ' ' not in vocabs_temp:
                vocabs.append(vocabs_temp)
                word_write += vocabs_temp
                word_write += "\n"
    update_file = open(vocabs_word_path, 'w')
    update_file.write(word_write)
    if bool_key == 'Phrase':
        update_file = open(vocabs_phrase_path, 'w')
        update_file.write(phrase_write)
    update_file = open(sentence_path, 'w')
    update_file.write(sentence_write)
    update_file = open(pos_path, 'w')
    update_file.write(pos_write)
    return vocabs
Example 10
def extract_from_sentences(sentences, add_verbs=True, language="english"):
    """
    Processes Sentence objects to calculate contained Noun Phrases based on a given grammar and maps them to the
    sentences they occur in.

    :param sentences: A list of Sentence objects.
    :param add_verbs: Optional. Default: True. Whether or not verbs are to be added to the mapping.
    :param language: Optional. Default: English. The language of the sentences.
    :return: A dictionary mapping tokens to the sentence IDs of the sentences they appear in.
    """
    # produce the mapping of sentences to their contained (words, pos) tuples
    pos_dictionary = {}
    NP_GRAMMAR_COMPOUND = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*((<CC>|,)<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*)*}"
    for sentence in sentences:
        pos_dictionary[sentence.sentence_id] = [
            (token, tag) for token, tag in sentence.tokens.items()
        ]
    parser_cmp = RegexpParser(NP_GRAMMAR_COMPOUND)
    term2sentence_id = {}
    lemmatizer = WordNetLemmatizer()
    for sentence_id, pos_tagged_tokens in pos_dictionary.items():
        if add_verbs:
            # updating the inverse occurrence index with verbs
            for subject, tag in pos_tagged_tokens:
                # check if subject is tagged as a verb
                if tag.startswith("VB"):
                    verb = lemmatizer.lemmatize(subject, "v").lower()
                    if verb not in stopwords.words(language):
                        if verb not in term2sentence_id:
                            term2sentence_id[verb] = set()
                        term2sentence_id[verb].add(sentence_id)
        # parse the sentence into a top-level chunk tree
        tree = parser_cmp.parse(pos_dictionary[sentence_id])
        # getting the top-level tree triples and decomposing the NPs
        cmp_triples, simple_trees = get_cooccurence([tree],
                                                    ignore_stopwords=False,
                                                    language=language)
        smp_triples, _ = get_cooccurence(simple_trees,
                                         ignore_stopwords=True,
                                         language=language)
        # updating the inverse occurrence index with NPs
        for subject, _, objecT in cmp_triples + smp_triples:
            if subject.lower() not in term2sentence_id:
                term2sentence_id[subject.lower()] = set()
            if objecT.lower() not in term2sentence_id:
                term2sentence_id[objecT.lower()] = set()
            term2sentence_id[subject.lower()].add(sentence_id)
            term2sentence_id[objecT.lower()].add(sentence_id)
    return term2sentence_id
Example 11
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent: continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    ) if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
Example 12
def get_tokens(text):
    word_list = []
    voc = []
    voc_write = ''
    sent = sent_tokenize(text)
    word_single = word_tokenize(text)
    # mode 'w' creates the file if it does not already exist
    k = open('token_log.txt', 'w', encoding='UTF8')
    k.write(str(word_single))
    for i in sent:
        word = word_tokenize(i)
        words = list(map(lambda s: s.lower(), word))
        word_list.append(words)
    words_pos = pos_tag_sents(word_list)

    f = open('pos_log.txt', 'w', encoding='UTF8')
    f.write(str(words_pos))

    grammar = r'KT: ' \
              r'{' \
              r'(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+' \
              r'}'
    grammar = RegexpParser(grammar)

    tags = chain.from_iterable(
        [tree2conlltags(grammar.parse(tag)) for tag in words_pos])

    for key, group in groupby(tags, lambda tag: tag[2] != 'O'):
        voc_temp = ' '.join([word for (word, pos, chunk) in group])
        if key is True and voc_temp not in stopwords.words(
                'english') and voc_temp != 'https':
            voc.append(voc_temp)
            voc_write += voc_temp
            voc_write += '\n'
    f = open('voc_log.txt', 'w', encoding='UTF8')
    f.write(voc_write)
    return voc
Example 13
def getInstances(text):
    grammar = """
        PRE:   {<NNS|NNP|NN|NP|JJ|UH>+}
        MID: {<DT|IN|POS|FW|-|NP|NPS|NN|NNS>+}
        INSTANCE:   {(<DT+>)?(<JJ+>)?<PRE>(<MID><PRE>)?}
    """
    chunker = RegexpParser(grammar)
    taggedText = pos_tag(word_tokenize(text))
    textChunks = chunker.parse(taggedText)
    current_chunk = []
    for i in textChunks:
        if (type(i) == Tree and i.label() == "INSTANCE"):
            # print (i.leaves())
            current_chunk.append(" ".join([token
                                           for token, pos in i.leaves()]))
    return current_chunk
Example 14
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Wraps a PickledCorpusReader consisting of pos-tagged documents.
    """
    def __init__(self, grammar=GRAMMAR):
        self.grammar = grammar
        self.chunker = RegexpParser(self.grammar)

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(char).startswith('P') for char in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_keyphrases(self, document):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Yields extracted phrases.
        """
        for sents in document:
            for sent in sents:
                sent = self.normalize(sent)
                if not sent: continue
                chunks = tree2conlltags(self.chunker.parse(sent))
                phrases = [
                    " ".join(word for word, pos, chunk in group).lower()
                    for key, group in groupby(
                        chunks, lambda term: term[-1] != 'O'
                    ) if key
                ]
                for phrase in phrases:
                    yield phrase

    def fit(self, documents, y=None):
        return self

    def transform(self, documents):
        for document in documents:
            yield list(self.extract_keyphrases(document))
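A hedged usage sketch: transform expects each document as a nested paragraphs -> sentences -> (token, tag) structure, as a pos-tagged corpus reader would provide, and assumes the module-level GRAMMAR constant and imports (RegexpParser, tree2conlltags, groupby, unicat) are in place:

import nltk

# One document containing a single paragraph with one tagged sentence.
doc = [[
    nltk.pos_tag(nltk.word_tokenize("Keyphrase extraction builds a compact document representation.")),
]]

extractor = KeyphraseExtractor()
print(list(extractor.transform([doc])))  # one keyphrase list per document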
Example 15
def generate_candidate(texts, method='word', remove_punctuation=False):
    """
    Generate word candidates from a given string

    Parameters
    ----------
    texts: str, input text string
    method: str, method to extract candidate words, either 'word' or 'phrase'

    Returns
    -------
    candidates: list, list of candidate words
    """
    words_ = list()
    candidates = list()

    # tokenize texts to list of sentences of words
    sentences = sent_tokenize(texts)
    for sentence in sentences:
        if remove_punctuation:
            sentence = punct_re.sub(' ', sentence) # remove punctuation
        words = word_tokenize(sentence)
        words = list(map(lambda s: s.lower(), words))
        words_.append(words)
    tagged_words = pos_tag_sents(words_) # POS tagging

    if method == 'word':
        tags = set(['JJ', 'JJR', 'JJS', 'NN', 'NNP', 'NNS', 'NNPS'])
        tagged_words = chain.from_iterable(tagged_words)
        for word, tag in tagged_words:
            if tag in tags and word.lower() not in stop_words:
                candidates.append(word)
    elif method == 'phrase':
        grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'
        chunker = RegexpParser(grammar)
        all_tag = chain.from_iterable([tree2conlltags(chunker.parse(tag)) for tag in tagged_words])
        for key, group in groupby(all_tag, lambda tag: tag[2] != 'O'):
            candidate = ' '.join([word for (word, pos, chunk) in group])
            if key is True and candidate not in stop_words:
                candidates.append(candidate)
    else:
        print("Use either 'word' or 'phrase' in method")
    return candidates
Example 16
def create_phrase_vocabulary(raw_data):
    '''
    Extract a vocabulary of noun phrases. TfidfVectorizer only builds plain n-gram
    vocabularies automatically, so a custom phrase vocabulary must be created explicitly
    if a different format is needed.
    '''

    #grammar to extract the noun phrase
    grammar = r'NP: {(<JJ.*>* <VBN>? <NN.*>+ <IN>)? <JJ.*>* <VBG>? <NN.*>+}'

    #set the punctuation and chunker
    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    def lambda_unpack(f):
        #function to unpack the tuple
        return lambda args: f(*args)

    #tokenize and create pos tags per sentence, then get its IOB tag
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)
    noun_phrases = list(
        chain.from_iterable(
            tree2conlltags(chunker.parse(tagged_sent))
            for tagged_sent in postag_sents))

    #join B-NP and I-NP tags as one noun phrase excluding O tags
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for key, group in itertools.groupby(
            noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O'))
        if key
    ]

    # filter out terms of two characters or fewer and terms made only of punctuation
    all_nounphrases = [
        cand for cand in merged_nounphrase
        if len(cand) > 2 and not all(char in punct for char in cand)
    ]

    #select distinct noun phrases
    vocabulary = (list(set(all_nounphrases)))
    return vocabulary
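The docstring above motivates this vocabulary as input for TfidfVectorizer, which otherwise only builds plain n-gram vocabularies. A minimal sketch of that hand-off, assuming scikit-learn is available; note that the entries are stemmed and lowercased above, so in practice the vectorizer's analyzer should apply the same normalization for the counts to line up:

from sklearn.feature_extraction.text import TfidfVectorizer

vocabulary = create_phrase_vocabulary(raw_data)
vectorizer = TfidfVectorizer(ngram_range=(1, 4), lowercase=True, vocabulary=vocabulary)
tfidf_matrix = vectorizer.fit_transform(raw_data)  # documents scored only against the extracted noun phrases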
Example 17
def get_cooccurence(chunk_trees, ignore_stopwords=True, language="english"):
    """
    Parses chunk trees and gets co-occurrence of terms.

    :param chunk_trees: Trees from the NLTK RegexpParser, generated over POS-tagged sentences using the provided grammar.
    :param ignore_stopwords: Optional. Default: True. Whether stopwords are to be ignored or not.
    :param language: Optional. Default: English. The language of the texts over which the chunk trees were generated.
    :return: A list of co-occurring tokens and simple parse trees generated over the leaves of the chunks of the
        provided trees.
    """
    triples = []
    simple_trees = []
    lemmatizer = WordNetLemmatizer()
    NP_GRAMMAR_SIMPLE = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+}"
    parser_simple = RegexpParser(NP_GRAMMAR_SIMPLE)
    for t in chunk_trees:
        entities = []
        for chunk in t:
            if isinstance(chunk, Tree) and chunk.label() == 'NP':
                # getting a tree for later processing of triples from the simple noun
                # phrases (if present)
                simple_trees.append(parser_simple.parse(chunk.leaves()))
                words = []
                for word, tag in chunk:
                    if (ignore_stopwords and word in stopwords.words(language)) or \
                            (not any(char.isalnum() for char in word)):
                        # do not process stopwords for simple trees, do not process purely
                        # non alphanumeric characters
                        continue
                    if tag.startswith('N'):
                        words.append(lemmatizer.lemmatize(word, 'n'))
                    elif tag.startswith('J'):
                        words.append(lemmatizer.lemmatize(word, 'a'))
                    else:
                        words.append(word)
                if len(words) > 0:
                    entities.append("_".join(words))
        for e1, e2 in combinations(entities, 2):
            triples.append((e1, "close to", e2))
            triples.append((e2, "close to", e1))
    return triples, simple_trees
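A hedged sketch of how this function is driven in extract_from_sentences above: chunk a tagged sentence with the compound NP grammar, then decompose the compound phrases into simple ones with a second pass:

from nltk import RegexpParser, pos_tag, word_tokenize

NP_GRAMMAR_COMPOUND = "NP: {<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*((<CC>|,)<JJ.*>*(<N.*>|<JJ.*>)+((<IN>|<TO>)?<JJ.*>*(<N.*>|<JJ.*>)+)*)*}"

tree = RegexpParser(NP_GRAMMAR_COMPOUND).parse(
    pos_tag(word_tokenize("Statistical methods and neural models for text analysis.")))
cmp_triples, simple_trees = get_cooccurence([tree], ignore_stopwords=False)
smp_triples, _ = get_cooccurence(simple_trees, ignore_stopwords=True)
print(cmp_triples + smp_triples)  # ('term', 'close to', 'term') style pairs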
Example 18
def chunk_location_sent(pos_text, temp_text):
	list_of_locs = list()

	chunk_grammar = r"""

	LOC:   {((<CD>?<NNP>+<CD>?)|(<CD>?<NN>+<CD>?))+}

	"""
	chunker = RegexpParser(chunk_grammar)


	chunked_article = chunker.parse(pos_text)
	for subtree in chunked_article.subtrees(): 
		if subtree.label()=='LOC':
			#print(' '.join((tuples[0] for tuples in list(subtree))))
			#print(subtree.pprint())
			NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
			#print("LOC: " + NNPs)
			list_of_locs.append(NNPs)
	#print("loc list:", list_of_locs)
	return list_of_locs
Example 19
	def getNounPhrases(self):
		
		featureSet = []
		
		# Handbook of NLP - Multiword Expressions, Timothy Baldwin and Su Nam Kim
		grammar = r"""
		    NBAR:
		    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
		    NP:
		    {<NBAR>}
		    {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
		"""
		chunker = RegexpParser(grammar)
	
		for sentence in self.sentences:
			tokens = word_tokenize(sentence)
			
			if len(tokens) == 0:
				continue
			else:
				pass
			
			tagged = pos_tag(tokens)
			tree = chunker.parse(tagged)
			terms = []
			leafCollection = []
			
			for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
				leafCollection.append(subtree.leaves())
			
			for leaf in leafCollection:
				term = [w for w,t in leaf if len(w) > 2]
				phrase = ' '.join(term)
				terms.append(phrase)
			
			featureSet += terms
		
		self.convertToFeatureDist(featureSet)
		self.helperObject.saveAllFeaturesExtracted(featureSet)
Example 20
    def getNounPhrases(self):

        featureSet = []

        # Handbook of NLP - Multiword Expressions, Timothy Baldwin and Su Nam Kim
        grammar = r"""
		    NBAR:
		    {<NN.*|JJ>*<NN.*>}  # Nouns and Adjectives, terminated with Nouns
		    NP:
		    {<NBAR>}
		    {<NBAR><IN><NBAR>}  # Above, connected with in/of/etc...
		"""
        chunker = RegexpParser(grammar)

        for sentence in self.sentences:
            tokens = word_tokenize(sentence)

            if len(tokens) == 0:
                continue
            else:
                pass

            tagged = pos_tag(tokens)
            tree = chunker.parse(tagged)
            terms = []
            leafCollection = []

            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
                leafCollection.append(subtree.leaves())

            for leaf in leafCollection:
                term = [w for w, t in leaf if len(w) > 2]
                phrase = ' '.join(term)
                terms.append(phrase)

            featureSet += terms

        self.convertToFeatureDist(featureSet)
        self.helperObject.saveAllFeaturesExtracted(featureSet)
Example 21
def chunk_name_sent(pos_text, temp_text):
	list_of_names = list()

	chunk_grammar = r"""

	NAME: 	{<NNP>+}

	"""
	chunker = RegexpParser(chunk_grammar)


	chunked_article = chunker.parse(pos_text)
	#print("chunk:", chunked_article)
	for subtree in chunked_article.subtrees(): 
		if subtree.label()=='NAME':
			#print(' '.join((tuples[0] for tuples in list(subtree))))
			#print(subtree.pprint())
			NNPs = ' '.join((tuples[0] for tuples in list(subtree)))
			#print("..: ", NNPs)
			#print("LOC: " + NNPs)
			list_of_names.append(NNPs)

	#print("namelist: ", list_of_names)
	return list_of_names
Example 22
def extract_words(nodetext, t2, doc, location):
	try:
	#	tokenizer = RegexT(r'\w*[a-zA-Z]\w*')
	#	return tokenizer.tokenize(nodetext)
	#except TypeError:
	#	return []
		grammar = "NP: {<JJ>*<NN>+}"
		phrases = []
		final_phrases = []
		for sent in sent_tokenize(nodetext):
			doc.add_sentence(Sentence(location, sent))
			tag_list = t2.tag(word_tokenize(sent))
			parser = RegexpParser(grammar)
			result = parser.parse(tag_list)
			for phrase in result:
				if isinstance(phrase, NLTREE.Tree) and phrase.label() == "NP":
					phrases.append("_".join([word for word,pos in phrase.leaves()]))
					#n_phrase = "_".join([word for word,pos in phrase.leaves()])
					#if any(c.isdigit() for c in n_phrase):
				#		continue
				#	elif '.' in n_phrase:
				#		continue
				#	else:
				#		doc.add_word(Word(location, n_phrase, sent))

	except TypeError:
		return []
	for phrase in phrases:
		if any(c.isdigit() for c in phrase):
			continue
		elif '.' in phrase:
			continue
		else:
			final_phrases.append(phrase)

	return final_phrases
Example 23
def extract_candidate_chunks(sents, grammar=GRAMMAR, tagged=False, **kwargs):
    """
    Extracts key chunks based on a grammar for a list of tokenized sentences.
    If the sentences are already tokenized and tagged, pass in: tagged=True
    """
    normalizer = Normalizer(**kwargs)
    chunker    = RegexpParser(grammar)

    for sent in sents:
        # Tokenize and tag sentences if necessary
        if not tagged:
            sent = nltk.pos_tag(nltk.wordpunct_tokenize(sent))

        # Parse with the chunker if we have a tagged sentence
        if not sent: continue
        chunks = tree2conlltags(chunker.parse(sent))

        # Extract candidate phrases from our parsed chunks
        chunks = [
            " ".join(word for word, pos, chunk in group).lower()
            for key, group in groupby(
                chunks, lambda term: term[-1] != 'O'
            ) if key
        ]
Example 24
grammar = """NP:{<DT>?<JJ>*(<NN.*>)+}    
               PR:{<PRP.*>}
            """

#grammar for tagging noun phrases and pronouns
#DT - determiners eg: The, a, an, my
#JJ - adjectives
#NN.* - any type of noun
#PRP - personal pronoun eg: He, she, I, We, they

rp = RegexpParser(grammar)
count = 0
for s in listOfTaggedSents:

    chunkedTree = ParentedTree.convert(
        rp.parse(s))  #tree of chunked parts of the sentence
    #ParentedTree is used to convert tagged words to tree structure
    neTree = ne_chunk(s)  #tree with named entity tags

    #print (chunkedTree)
    #chunkedTree.draw()
    #neTree.draw()

    for n in chunkedTree:
        if isinstance(n, nltk.tree.Tree):
            if n.label() == 'NP':
                mostSigNoun = [
                    w for w in n if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                ]
                for ne in neTree:  #ne contains nouns and pos
                    if isinstance(ne, nltk.tree.Tree):
Example 25
def apply_grammar(pos_words):
    grammar_parser = RegexpParser(GRAMMAR)
    return grammar_parser.parse(pos_words)
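A quick check, assuming the module-level GRAMMAR constant is already defined (for instance the KT pattern used in earlier examples):

from nltk import pos_tag, word_tokenize

tree = apply_grammar(pos_tag(word_tokenize("Regular expression chunking groups tagged tokens.")))
print(tree)  # an nltk Tree with GRAMMAR-labelled subtrees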
Example 26
 def tagChunk(self, taggedword, loops=2):
     ## Chunking
     cp = RegexpParser(self.grammar, loop=loops)
     print('tagged word')
     print(taggedword)
     return cp.parse(taggedword)
Example 27
 def regex_chunk(self, tagged, pattern):
     pr = RegexpParser(pattern)
     chunked = [pr.parse(sent) for sent in tagged]
     return chunked
Example 28
                                del words[i]
                                i = i
                                words_len = len(words)
                            else:
                                i = i + 1
                                words_len = len(words)

                        words_only = words[1:]

                        i = 1
                        while (i < words_len):
                            lmtzr.lemmatize(words[i])
                            i = i + 1

                        pos_words = pos_tagger.tag(words_only)
                        parsed_out_pcfg = reg_parser.parse(pos_words)

                        pre_parsed_out = dependency_parser.parse(words_only)
                        dep = pre_parsed_out.__next__()
                        parsed_out = list(dep.triples())

                        Script_Word_Ct += len(pos_words)

                        i = 0
                        while (i < words_len - 1):
                            tags = pos_words[i][1]

                            if (i < len(pos_words) - 1 and tags == 'NP'
                                    and pos_words[i + 1][1] == 'PRP'):
                                NP_PRP += 1
Example 29
def summarizer(tex, reduce_per):
    def norm(word, pos='x'):  #normalizes all words except proper nouns
        word = word.lower()
        if pos not in ['NNP', 'NNPS']:
            wnl = WordNetLemmatizer()
            word = wnl.lemmatize(word)
        return (word)

    sentList = sent_tokenize(tex)  #list of all tokenized sentences

    #print(sentList)

    sentNounDict = defaultdict(
        list
    )  # a dictionary key:sentence_number value:all nouns in the sentence... (nouns are normalised)

    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            if pos in ['NN', 'NNS', 'NNP', 'NNPS']:
                sentNounDict[sentList.index(s)].append(norm(w, pos))
    #print (sentNounDict)

    wordSentDict = defaultdict(
        list
    )  # a dictionary key:(word,pos) value:all sentences it appears in...(word is normalised)

    for s in sentList:
        for w, pos in pos_tag(word_tokenize(s)):
            wordSentDict[(norm(w, pos), pos)].append(sentList.index(s))
    #print (wordSentDict)


    # list of all nouns in the text
    listOfNouns = list(
        sorted(
            set([
                norm(w, pos) for s in sentList
                for w, pos in pos_tag(word_tokenize(s))
                if pos in ['NN', 'NNS', 'NNP', 'NNPS']
            ])))
    #print (listOfNouns)

    listOfTaggedSents = [
    ]  #list of sentences of tokenized words with postags- list[tuple(w,pos)]

    for s in sentList:
        l = [(n, pos) for n, pos in pos_tag(word_tokenize(s))]
        listOfTaggedSents.append(l)
    #print (listOfTaggedSents)

    mostSigNoun = []  #most recently encountered significant noun
    mostSigNounObject = [
    ]  #most recently encountered significant noun which is not a person
    mostSigNounPerson = [
    ]  #most recently encountered significant noun which has named entity as person

    pronounNounDict = defaultdict(
        list
    )  #key:touple(pronoun,sentence_num) val:list(list(touple(noun,pos)))(noun not normalized)

    #grammar for tagging noun phrases and pronouns
    grammar = """NP:{<DT>?<JJ>*(<NN.*>)+}    
                   PR:{<PRP.*>}
                """
    rp = RegexpParser(grammar)
    for s in listOfTaggedSents:
        begin = True
        chunkedTree = ParentedTree.convert(
            rp.parse(s))  #tree of chunked parts of the sentence
        neTree = ne_chunk(s)  #tree with named entity tags
        #print (chunkedTree)
        #chunkedTree.draw()
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    if begin == True:
                        mostSigNoun = [
                            w for w in n
                            if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                        ]
                        #print (mostSigNoun)
                        for ne in neTree:
                            if isinstance(ne, nltk.tree.Tree):
                                if ne[0] in mostSigNoun:
                                    if ne.label() == 'PERSON':
                                        mostSigNounPerson = []
                                        mostSigNounPerson.append(ne[0])
                                    else:
                                        mostSigNounObject = []
                                        mostSigNounObject.append(ne[0])
                        begin = False

                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    #print pron
                    if pron in ['it', 'its']:  #for objects
                        if len(mostSigNounObject) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNounObject)
                        else:  #if mostsignounobject does not exist
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNoun)
                    else:
                        if len(mostSigNounPerson) > 0:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNounPerson)
                        else:
                            pronounNounDict[(pron, listOfTaggedSents.index(s)
                                             )].append(mostSigNoun)
                    begin = False
                    #print pronounNounDict

                    #adding the nouns corresponding to the pronouns to sentworddict and wordsentdict
                    for v1 in pronounNounDict[(pron,
                                               listOfTaggedSents.index(s))]:
                        for v11 in v1:  #it is a list of lists
                            sentNounDict[listOfTaggedSents.index(s)].append(
                                norm(v11[0], v11[1]))
                            wordSentDict[(norm(v11[0],
                                               v11[1]), v11[1])].append(
                                                   listOfTaggedSents.index(s))

    #print (sentNounDict)
    #print (wordSentDict)
    #print (pronounNounDict)

    for key, val in sentNounDict.items():  #making sentnoundict a set
        val = list(set(val))
        sentNounDict[key] = val
    #print (sentNounDict)

    #following code calculates the distance between two phrases
    distance = defaultdict(
        int
    )  #a dict.. key:(noun or noun(pronoun),sentence_num) value:position in the sentence from the begining

    for s in listOfTaggedSents:
        dist = 0
        chunkedTree = ParentedTree.convert(rp.parse(s))
        for n in chunkedTree:
            if isinstance(n, nltk.tree.Tree):
                if n.label() == 'NP':
                    tempNoun = [
                        w[0] for w in n
                        if w[1] in ['NN', 'NNS', 'NNP', 'NNPS']
                    ]
                    for w in tempNoun:
                        distance[(norm(w), listOfTaggedSents.index(s))] = dist
                if n.label() == 'PR':
                    pron = n[0][0].lower()
                    tempNoun = pronounNounDict[(pron,
                                                listOfTaggedSents.index(s))]
                    for v1 in tempNoun:
                        for v11 in v1:
                            distance[(norm(v11[0], v11[1]),
                                      listOfTaggedSents.index(s))] = dist
            dist += 1
    #print (distance)

    #the following code assigns relation factor between two nouns
    nounGraph = np.zeros((len(listOfNouns), len(listOfNouns)))

    for key, value in sentNounDict.items():
        for v1 in value:
            for v2 in value:
                d = 0
                if v2 != v1:
                    d = distance[v1, key] - distance[v2, key]
                    nounGraph[listOfNouns.index(v1)][listOfNouns.index(
                        v2)] += float((100 / (abs(d) + 1)))
                    #if nounGraph[listOfNouns.index(v1)][listOfNouns.index(v2)]>=100:
                    #print(v1+' '+v2+" "+str(d))

    #print(nounGraph)

    nounPriority = defaultdict(
        int
    )  #dict to hold noun priorities... key:noun(normalized)  value:priority
    sentencePriority = defaultdict(
        int
    )  #dict to hold sentence priorities...key:sentence_num   value:priority

    def calcNounPriority(
    ):  #function calculates the noun priority(sum of weights of all the edges attached to this noun in the noungraph)
        total = 0
        i = 0
        for x in nounGraph:
            total = sum(x)
            nounPriority[listOfNouns[i]] = total
            i += 1

    #print (sorted(nounPriority.items(),key=lambda x:x[1], reverse=True))

    def calcSentPriority(
    ):  #function calculates sentence priority(sum of priorities of all nouns in the sent)
        for key, value in sentNounDict.items():
            total = 0
            for n in value:
                total += nounPriority[n]
                sentencePriority[key] = total

    calcNounPriority()
    calcSentPriority()

    #print (sorted(sentencePriority.items(),key=lambda x:x[1], reverse=True))
    #for i in range(len(sentList)):
    #print(str(i)+' '+sentList[i])

    reducingFactor = 0.9  #10%
    summary = []  #list to hold the summary
    reduce_per = reduce_per / 100
    #print(reduce_per)
    for i in range(int(len(sentencePriority) * reduce_per)):
        summary.append(max(sentencePriority.items(), key=lambda x: x[1]))
        #print (summary)
        j = summary[-1][0]

        for n in sentNounDict[j]:
            nounPriority[
                n] *= reducingFactor  #reduce the priority of all nouns in the picked sentence

        del sentNounDict[j]
        del sentencePriority[j]  #remove the picked sentence
        calcSentPriority()  #recalculate sentence priority

    #print ("\n\n")
    i = 1
    s_list = []
    for s in sorted(summary):
        #print (i,sentList[s[0]])
        s_list.append(sentList[s[0]])
        i += 1

    return (s_list)
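A hypothetical call; article.txt is a placeholder path for any plain-text document:

with open("article.txt", encoding="utf-8") as f:
    text = f.read()

for sentence in summarizer(text, 25):  # keep roughly 25% of the sentences
    print(sentence)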
Example 30
 def tagChunk(self, taggedword, loops=2):
     ## Chunking
     cp = RegexpParser(self.grammar, loop=loops)
     return cp.parse(taggedword)
Example 31
class KeyphraseExtractor(BaseEstimator, TransformerMixin):
    """
    Extract adverbial and adjective phrases, and transform
    documents into lists of these keyphrases, with a total
    keyphrase lexicon limited by the nfeatures parameter
    and a document length limited/padded to doclen
    """
    def __init__(self, nfeatures=100000, doclen=60):
        self.grammar = r'KT: {(<RB.> <JJ.*>|<VB.*>|<RB.*>)|(<JJ> <NN.*>)}'
        # self.grammar = r'KT: {(<RB.*> <VB.>|<RB.>|<JJ.> <NN.*>)}'
        # self.grammar = r'KT: {<RB.>|<JJ.>}'
        self.chunker = RegexpParser(self.grammar)
        self.nfeatures = nfeatures
        self.doclen = doclen

    def normalize(self, sent):
        """
        Removes punctuation from a tokenized/tagged sentence and
        lowercases words.
        """
        is_punct = lambda word: all(unicat(c).startswith('P') for c in word)
        sent = filter(lambda t: not is_punct(t[0]), sent)
        sent = map(lambda t: (t[0].lower(), t[1]), sent)
        return list(sent)

    def extract_candidate_phrases(self, sents):
        """
        For a document, parse sentences using our chunker created by
        our grammar, converting the parse tree into a tagged sequence.
        Extract phrases, rejoin with a space, and yield the document
        represented as a list of its keyphrases.
        """
        for sent in sents:
            sent = self.normalize(sent)
            if not sent: continue
            chunks = tree2conlltags(self.chunker.parse(sent))
            phrases = [
                " ".join(word for word, pos, chunk in group).lower()
                for key, group in groupby(chunks, lambda term: term[-1] != 'O')
                if key
            ]
            for phrase in phrases:
                yield phrase

    def fit(self, documents, y=None):
        return self

    def get_lexicon(self, keydocs):
        """
        Build a lexicon of size nfeatures
        """
        keyphrases = [keyphrase for doc in keydocs for keyphrase in doc]
        fdist = FreqDist(keyphrases)
        counts = fdist.most_common(self.nfeatures)
        lexicon = [phrase for phrase, count in counts]
        return {phrase: idx + 1 for idx, phrase in enumerate(lexicon)}

    def clip(self, keydoc, lexicon):
        """
        Remove keyphrases from documents that aren't in the lexicon
        """
        return [
            lexicon[keyphrase] for keyphrase in keydoc
            if keyphrase in lexicon.keys()
        ]

    def transform(self, documents):
        docs = [list(self.extract_candidate_phrases(doc)) for doc in documents]
        lexicon = self.get_lexicon(docs)
        clipped = [list(self.clip(doc, lexicon)) for doc in docs]
        return sequence.pad_sequences(clipped, maxlen=self.doclen)
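A hedged usage sketch for the transformer above, assuming the module's imports (nltk, FreqDist, the keras sequence utilities, unicat) are in place; each input document is a list of pos-tagged sentences:

import nltk

texts = ["The service was really great.", "A truly awful experience overall."]
docs = [
    [nltk.pos_tag(nltk.word_tokenize(s)) for s in nltk.sent_tokenize(text)]
    for text in texts
]

extractor = KeyphraseExtractor(nfeatures=1000, doclen=10)
X = extractor.transform(docs)  # integer-encoded, padded keyphrase sequences
print(X.shape)                 # (2, 10)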