def Extract_phrases_document(text, stop, split, all_pos_tags):
    """Extract candidate noun phrases from a whole document.

    text         -- the document text
    stop         -- stop-word resource passed through to word_checking_stop
    split        -- optional delimiter to pre-split text before sentence splitting
    all_pos_tags -- phrase -> POS-tags cache shared across calls (mutated here)

    Returns a dict mapping each accepted phrase to 1 (used as a set).
    """
    doc_phrases = {}
    # Sentence segmentation; when a delimiter is given, cut on it first and
    # run the sentence splitter on each piece.
    sentences = []
    if split is None:
        sentences = NLP_sent.sentence_splitting(text, 1)
    else:
        for can_sen in text.split(split):
            sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))
    # Chunk grammar: NBAR is a run of adjectives/nouns ending in a noun; an NP
    # is one NBAR or two NBARs joined by a preposition.  The grammar and the
    # parser are loop-invariant, so build the chunker ONCE here instead of
    # once per phrase as the original did.
    grammar = r"""
        NBAR:
            # Nouns and Adjectives, terminated with Nouns
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            # Above, connected with in/of/etc...
            {<NBAR><IN><NBAR>}
    """
    cp = nltk.RegexpParser(grammar, loop=2)
    for sentence in sentences:
        for phrase in NLP_sent.phrase_splitting(sentence):
            if len(phrase) <= 2:  # e.g. 'ii'
                continue
            if phrase in all_pos_tags:
                pos_tags = all_pos_tags[phrase]
            else:
                # POS tagging output, cached per phrase
                words = NLP_word.word_splitting(phrase.lower())
                pos_tags = NLP_word.word_pos_tagging(words)
                all_pos_tags[phrase] = pos_tags
            # Parse the tagged phrase and harvest the NP terms.
            cp_tree = cp.parse(pos_tags)
            for term in get_terms(cp_tree):
                term_text = ' '.join(term)  # renamed; original shadowed `phrase`
                if word_checking_stop(term_text, stop) == 0:  # filter stop words
                    if len(term_text) > 1:  # at least two characters
                        doc_phrases[term_text] = 1
    return doc_phrases
def keywords_syntax_nltk(sentence):
    """Extract noun-phrase keywords from one sentence.

    Per-phrase parse results are cached in the module-level ``text_terms``
    dict so repeated phrases are parsed only once across calls.

    Returns a list of keyword strings (space-joined NP terms).
    """
    global text_terms
    terms = []
    # The grammar and the RegexpParser are loop-invariant: build the chunker
    # once here instead of once per phrase as the original did.
    grammar = r"""
        NBAR:
            # Nouns and Adjectives, terminated with Nouns
            {<NN.*|JJ>*<NN.*>}
        NP:
            {<NBAR>}
            # Above, connected with in/of/etc...
            {<NBAR><IN><NBAR>}
    """
    cp = nltk.RegexpParser(grammar, loop=2)
    for phrase in NLP_sent.phrase_splitting(sentence):
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        if phrase in text_terms:
            phrase_terms = text_terms[phrase]
        else:
            # POS tagging output
            words = NLP_word.word_splitting(phrase.lower())
            pos_tags = NLP_word.word_pos_tagging(words)
            # parsed tree -> NP terms, cached for later sentences
            cp_tree = cp.parse(pos_tags)
            phrase_terms = get_terms(cp_tree)
            text_terms[phrase] = phrase_terms
        terms += phrase_terms
    keywords = []
    for term in terms:
        if len(term) > 0:  # drop empty term tuples
            keywords.append(' '.join(term))
    return keywords
def keywords_ngrams(sentence, max_num=4):
    """Generate word n-grams from one sentence.

    sentence -- the input sentence
    max_num  -- maximum number of words per n-gram.  Defaults to 4, which is
                the original hard-coded cap (the original comment claimed 5,
                but the code has always allowed at most i+4 words).

    An n-gram may neither start nor end on a word flagged by
    word_checking_stop.  Returns a list of n-gram strings, each at least
    three characters long.
    """
    ngrams = []
    for phrase in NLP_sent.phrase_splitting(sentence):
        if len(phrase) <= 2:  # e.g. 'ii'
            continue
        words = NLP_word.word_splitting(phrase.lower())
        # Record all positions of stop or non-preferred (POS) words in the
        # phrase once, to increase efficiency.  (`type` renamed: it shadowed
        # the builtin.)
        stop_flags = []
        for word in words:
            stop_flags.append(word_checking_stop(word))
        # Generate n-grams: i is the start index, j is one past the end.
        for i in range(len(words)):
            if 0 < stop_flags[i]:  # may not start on a stop word
                continue
            for j in reversed(range(i + 1, min(len(words), i + max_num) + 1)):
                if 0 < stop_flags[j - 1]:  # may not end on a stop word
                    continue
                ngram = ' '.join(words[i:j])
                if len(ngram) > 2:  # at least three characters
                    ngrams.append(ngram)
    return ngrams
def Generate_ngrams_document(text, max_num, stop, use_pos, ptag, use_stem, split, all_pos_tags):
    """Generate word n-grams (up to max_num words) for a whole document.

    text         -- the document text
    max_num      -- maximum number of words per n-gram
    stop         -- stop-word resource for word_checking_stop
    use_pos      -- if true, POS-tag each phrase and POS-filter unigrams
    ptag         -- preferred POS tags, passed to word_checking_pos
    use_stem     -- if true, stem every word with porter2 before joining
    split        -- optional delimiter to pre-split text before sentence splitting
    all_pos_tags -- phrase -> POS-tags cache shared across calls (mutated here)

    Returns a dict mapping each n-gram string to its occurrence count.
    """
    doc_ngrams = {}
    # Sentence segmentation, optionally pre-splitting on a delimiter.
    sentences = []
    if split is None:
        sentences = NLP_sent.sentence_splitting(text, 1)
    else:
        for can_sen in text.split(split):
            sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))
    for sentence in sentences:
        for phrase in NLP_sent.phrase_splitting(sentence):
            if len(phrase) <= 2:  # e.g. 'ii'
                continue
            words = NLP_word.word_splitting(phrase.lower())
            if use_pos:
                # POS tags are cached per phrase across calls.
                if phrase in all_pos_tags:
                    pos_tags = all_pos_tags[phrase]
                else:
                    pos_tags = NLP_word.word_pos_tagging(words)
                    all_pos_tags[phrase] = pos_tags
            # Record all positions of stop or non-preferred (POS) words in the
            # phrase once, to increase efficiency.  Codes 1-4 appear to mark
            # words that may not start/end an n-gram; 0 and 5 count as
            # "meaningful" below (semantics of word_checking_stop --
            # confirm against its definition).  `type` renamed: shadowed the
            # builtin.  Stemming (when enabled) must happen after the stop
            # check so the check sees the original surface form.
            stop_pos = []
            for i in xrange(len(words)):
                stop_pos.append(word_checking_stop(words[i], stop))
                if use_stem:  # enable or disable stemming
                    words[i] = porter2.stem(words[i])
            for i in xrange(len(words)):
                if 0 < stop_pos[i] < 5:  # may not start on these words
                    continue
                for j in xrange(i + 1, min(len(words), i + max_num) + 1):
                    if 0 < stop_pos[j - 1] < 5:  # may not end on these words
                        continue
                    meaningful_word = False
                    if j == i + 1:
                        # Unigram: must be a plain word (code 0) and, when POS
                        # filtering is on, carry a preferred tag.  (The
                        # original's redundant `use_pos and` inside the `or`
                        # is simplified; the logic is identical.)
                        if stop_pos[i] == 0 and (not use_pos or word_checking_pos(pos_tags[i], ptag) == 0):
                            meaningful_word = True
                    else:
                        # Multi-word n-gram: count meaningless members (codes
                        # other than 0/5) and reject the n-gram unless at
                        # least two members are meaningful.
                        mless_num = 0
                        for k in xrange(i, j):
                            if stop_pos[k] == 0 or stop_pos[k] == 5:
                                meaningful_word = True
                            else:
                                mless_num += 1
                        if mless_num >= (j - i - 1):
                            continue
                    if meaningful_word:
                        ngram = ' '.join(words[i:j])
                        if len(ngram) > 1:  # at least two characters
                            if ngram in doc_ngrams:
                                doc_ngrams[ngram] += 1
                            else:
                                doc_ngrams[ngram] = 1
    return doc_ngrams