import math

# Porter2 stemmer from the PyPI "stemming" package (an assumption: the
# original imports are not shown in this section).
from stemming import porter2
from stemming.porter2 import stem

# Project-local helpers; word_checking_stop and word_checking_pos are assumed
# to be defined or imported elsewhere in this module.
import kernel_mining
import NLP_sent
import NLP_word


def __english_stemming(words):
    # Stem every word in place with the Porter2 stemmer and return the list.
    for i in xrange(len(words)):
        words[i] = stem(words[i])
    return words
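
# Usage sketch for __english_stemming (hypothetical input; assumes the
# porter2 `stem` imported above):
#   __english_stemming(['cats', 'running', 'stemming'])  ->  ['cat', 'run', 'stem']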

def Generate_ngrams_document(text, max_num, stop, use_pos, ptag, use_stem, split, all_pos_tags):
    """Count every valid n-gram (up to max_num words) in a document."""
    doc_ngrams = {}
    sentences = []
    if split is None:
        sentences = NLP_sent.sentence_splitting(text, 1)
    else:
        can_sens = text.split(split)
        for can_sen in can_sens:
            sentences.extend(NLP_sent.sentence_splitting(can_sen, 1))
    for sentence in sentences:
        phrases = NLP_sent.phrase_splitting(sentence)
        for phrase in phrases:
            if len(phrase) <= 2:  # skip fragments such as 'ii'
                continue
            words = NLP_word.word_splitting(phrase.lower())
            if use_pos:
                if phrase in all_pos_tags:
                    pos_tags = all_pos_tags[phrase]
                else:
                    pos_tags = NLP_word.word_pos_tagging(words)
                    all_pos_tags[phrase] = pos_tags  # cache POS tags per phrase
            # Record the stop / non-preferred (POS) status of every word up
            # front, so the n-gram loops below do not re-check them repeatedly.
            stop_pos = []
            for i in xrange(len(words)):
                stop_pos.append(word_checking_stop(words[i], stop))
                if use_stem:  # enable or disable stemming
                    words[i] = porter2.stem(words[i])
            for i in xrange(len(words)):
                if 0 < stop_pos[i] < 5:  # an n-gram cannot start with a stop word
                    continue
                for j in xrange(i + 1, min(len(words), i + max_num) + 1):
                    if 0 < stop_pos[j - 1] < 5:  # ... nor end with one
                        continue
                    meaningful_word = False
                    if j == i + 1:
                        # unigram: the word itself must be meaningful and, when
                        # POS filtering is on, carry a preferred tag
                        if stop_pos[i] == 0 and (not use_pos or word_checking_pos(pos_tags[i], ptag) == 0):
                            meaningful_word = True
                    else:
                        # longer n-gram: require at least one meaningful word
                        # and at most j - i - 2 meaningless ones
                        mless_num = 0
                        for k in xrange(i, j):
                            if stop_pos[k] == 0 or stop_pos[k] == 5:
                                meaningful_word = True
                            else:
                                mless_num += 1
                        if mless_num >= (j - i - 1):
                            continue
                    if meaningful_word:
                        ngram = ' '.join(words[i:j])
                        if len(ngram) > 1:  # at least two characters
                            doc_ngrams[ngram] = doc_ngrams.get(ngram, 0) + 1
    return doc_ngrams
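
# The enumeration above is a sliding window over word positions. Below is a
# minimal, self-contained sketch of that core loop, simplified to a plain
# stop-word set instead of the 0-5 type codes (names here are illustrative,
# not part of the module):
def _ngram_window_demo(words, max_num, stopwords):
    # Count every window words[i:j] of at most max_num words that neither
    # starts nor ends with a stop word.
    counts = {}
    for i in range(len(words)):
        if words[i] in stopwords:
            continue
        for j in range(i + 1, min(len(words), i + max_num) + 1):
            if words[j - 1] in stopwords:
                continue
            ngram = ' '.join(words[i:j])
            counts[ngram] = counts.get(ngram, 0) + 1
    return counts
# e.g. _ngram_window_demo('the cat sat on the mat'.split(), 3, {'the', 'on'})
# ->   {'cat': 1, 'cat sat': 1, 'sat': 1, 'mat': 1}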

def standardizing(words, umls, stop):
    """Map the longest possible sub-phrases of `words` to UMLS preferred terms."""
    eng = False  # hard-coded switch for the English-only stemming branch
    # map to UMLS
    if umls is not None:
        pwords = []
        status = []  # True where a UMLS term was substituted
        i = 0
        while i < len(words):
            fnd = False
            if kernel_mining.word_checking_stop(words[i], stop) in [5, 0]:  # judge stop word
                # try the longest candidate span first
                for j in reversed(xrange(i + 1, len(words) + 1)):
                    if kernel_mining.word_checking_stop(words[j - 1], stop) in [5, 0]:  # judge stop word
                        s = ' '.join(words[i:j])
                        if s in umls.norm:  # another filter is "and (len(umls.norm[s]) <= 10)"
                            cl = 50  # upper bound on the preferred-term length
                            fs = None
                            for pt in umls.norm[s]:
                                dpt = pt.decode('utf-8')
                                # 1) retain an identical preferred term
                                if dpt == s.decode('utf-8'):
                                    fs = s.decode('utf-8')
                                    break
                                # 2) retain an acronym expansion: one token per
                                # character of s, every token initial occurring in s
                                if len(umls.norm[s]) > 1:
                                    tkn = dpt.split()
                                    if len(tkn) == len(s):
                                        init = set(s)
                                        acr = len(tkn)
                                        for t in tkn:
                                            if t[0] in init:
                                                acr -= 1
                                        if acr == 0:
                                            fs = dpt
                                            break
                                # 3) otherwise retain the shortest candidate
                                if len(dpt) < cl:
                                    fs = dpt
                                    cl = len(dpt)
                            s = fs
                            if s in umls.semantic:
                                if len(umls.stype) == 0 or len(umls.semantic[s] & umls.stype) > 0:
                                    pwords.append(s.encode('utf-8'))
                                    status.append(True)
                                    fnd = True
                                    i = j
                                    continue  # do not stop the iterations, to get subsets
            if fnd is False:
                pwords.append(words[i])
                status.append(False)
                i += 1
        # no UMLS term found at all: the n-gram is not valid
        if True not in status:
            return None
        # English stemmer for the words that were not substituted
        if eng is True:
            for i in xrange(len(pwords)):
                if status[i] is False:
                    pwords[i] = stem(pwords[i])
        # reject phrases where half or more of the words are repetitions
        uwords = set(pwords)
        if len(uwords) <= math.floor(len(pwords) / float(2)):
            return None
        return ' '.join(pwords)
    elif eng is True:
        # English only
        return ' '.join(__english_stemming(words))
    else:
        # nothing to do
        return ' '.join(words)
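
# A minimal, self-contained sketch of the preferred-term selection heuristic
# in the loop above (illustrative names, not part of the module): keep an
# identical candidate first, then an acronym expansion of s, otherwise the
# shortest candidate under the length cap.
def _pick_preferred_demo(s, candidates):
    best, best_len = None, 50
    for cand in candidates:
        if cand == s:  # 1) identical preferred term
            return cand
        tokens = cand.split()
        # 2) acronym: one token per character of s, every token initial in s
        if len(candidates) > 1 and len(tokens) == len(s):
            if all(t[0] in set(s) for t in tokens):
                return cand
        if len(cand) < best_len:  # 3) shortest candidate
            best, best_len = cand, len(cand)
    return best
# e.g. _pick_preferred_demo('aids', ['hiv disease',
#                                    'acquired immune deficiency syndrome'])
# returns the acronym expansion 'acquired immune deficiency syndrome'.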

def stem_phrase(phrase):
    # Stem each word of a whitespace-delimited phrase with Porter2.
    words = phrase.split()
    for i in range(len(words)):
        words[i] = porter2.stem(words[i])
    return ' '.join(words)
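
# Usage sketch for stem_phrase (hypothetical input):
#   stem_phrase('patients with heart attacks')  ->  'patient with heart attack'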