def categorize_phrases(sentphrases, body_part_name, use_stem=True):
    """Build one record per taxonomy-classified word of every phrase.

    Args:
        sentphrases: Object exposing ``iterrows()`` whose rows provide a
            ``'sentence'`` string and a ``'phrase'`` sequence of words; each
            word must expose ``.string`` and ``.tag`` attributes.
        body_part_name: Value copied verbatim into the ``body_part`` column
            of every record.
        use_stem: When true, each word is stemmed with NLTK's Porter stemmer
            before being looked up in the taxonomy.

    Returns:
        A ``pandas.DataFrame`` with the columns ``body_part``, ``sentence``,
        ``phrase``, ``category``, ``parent`` and ``score``.  Rows whose
        phrase contains no classified word (including empty phrases)
        contribute no records.
    """
    columns = ['body_part', 'sentence', 'phrase', 'category', 'parent', 'score']
    # The stemmer is only needed (and only built) when stemming is requested.
    stemmer = nltk.stem.PorterStemmer() if use_stem else None

    records = []
    for _, row in sentphrases.iterrows():
        sentence = row['sentence']
        phrase = row['phrase']

        # Collect (category, parent) pairs together so the two values can
        # never get out of step.
        matches = []
        for word in phrase:
            term = stemmer.stem(word.string) if use_stem else word.string
            category = taxonomy.classify(term)
            if not category:
                continue
            parent = taxonomy.parents(category)
            if parent and parent[0] != category:
                matches.append((category, parent[0]))
            else:
                # No distinct parent: record an empty parent.
                matches.append((category, ''))

        # Nothing classified means nothing to emit.  Skipping here also
        # avoids dividing by zero below when the phrase itself is empty.
        if not matches:
            continue

        # Commas are stripped from both text fields (presumably so the
        # resulting frame can be written as CSV safely -- confirm with caller).
        phrase_str = ' '.join([w.string + '/' + w.tag for w in phrase])
        phrase_str = phrase_str.replace(',', '')
        sentence_str = sentence.replace(',', '')

        # Number of classified words weighted by the fraction of the phrase
        # they cover.
        score = len(matches) * (len(matches) / float(len(phrase)))

        for category, parent in matches:
            records.append(
                (body_part_name, sentence_str, phrase_str, category, parent, score))

    return pd.DataFrame(records, columns=columns)
from pattern.search import search, taxonomy, Classifier
from pattern.en import parsetree

# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make search patterns somewhat unwieldy:
# search("rose|lily|daisy|daffodil|begonia", txt).

# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

# NOTE: single-argument print(...) behaves the same on Python 2 and 3;
# print("") emits the blank line the bare Python 2 `print` produced.
print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose"))  # Yields the most recently added parent.
print("")

# Taxonomy terms can be included in a pattern by using uppercase:
t = parsetree("A field of white daffodils.", lemmata=True)
m = search("FLOWER", t)
print(t)
print(m)
print("")

# Another example: a term may be appended under more than one type,
# and a type may itself be appended under another type.
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
print(taxonomy.parents("chicken"))
from pattern.search import Pattern, Constraint, Classifier, taxonomy
from pattern.en import Sentence, parse

# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make patterns somewhat unwieldy, e.g.:
# Pattern.fromstring("rose|lily|daisy|daffodil|begonia").

# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

# NOTE: single-argument print(...) behaves the same on Python 2 and 3;
# print("") emits the blank line the bare Python 2 `print` produced.
print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose"))  # Yields the most recently added parent.
print("")

# Taxonomy terms can be included in a pattern:
p = Pattern([Constraint(taxa=["flower"])])  # or
p = Pattern.fromstring("FLOWER")

s = Sentence(parse("A field of white daffodils.", lemmata=True))
m = p.search(s)
print(s)
print(m)
print("")

# `search` is imported here but not used in the visible part of this script.
from pattern.search import search

# The same term may be appended under more than one type.
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
# NOTE(review): `taxonomy`, `parsetree` and `search` are not imported in this
# snippet; it assumes they are already in scope (e.g. from pattern.search /
# pattern.en) -- confirm before running it standalone.

# Register 'bank' under two different types ('angle' and 'finance').
for f in ('reflect', 'bank'):
    taxonomy.append(f, type='angle')
for f in ('bank', 'financial-institution'):
    taxonomy.append(f, type='finance')

t = parsetree('A field of daffodils is white.', lemmata=True)
# Single-argument print(...) behaves the same on Python 2 and 3.
# NOTE(review): nothing in this snippet appends a 'plant' type to the
# taxonomy; presumably it is defined elsewhere -- verify.
print(search('PLANT', t))

# The results of the following calls are discarded (they only display a value
# when typed into an interactive session).
taxonomy.parents('daffodil', recursive=True)
taxonomy.children('plant', recursive=False)
taxonomy.classify('bank')

from pattern.en import wordnet

# Pick one synset for each word by index and compare them; the similarity
# value is likewise not stored or printed.
a = wordnet.synsets('tone')[4]
b = wordnet.synsets('color')[0]
wordnet.similarity(a,b)