class Bootstrapping:
    """
    Bootstrapping: Class performing the bootstrapping process for subjectivity
    and objectivity classification of sentences. The method learns
    linguistically rich extraction patterns for subjective (opinionated)
    expressions from unannotated data. The learned patterns are then used to
    identify subjective sentences that the simple high-precision classifiers
    cannot recall.

    Related paper:
    E. Riloff and J. Wiebe. Learning extraction patterns for subjective
    expressions. In Proceedings of the 2003 Conference on Empirical Methods
    in Natural Language Processing, EMNLP '03, pages 105--112, 2003. ACL.

    Learned patterns structure, e.g.
    {"<subj> was killed": {'type': 'subj', 'display': 'was killed',
                           'subj_freq': 10, 'freq': 20, 'prob': 0.5}}
    """

    def __init__(self, hp_obj, hp_subj, tagger, debug=False):
        # Syntactic forms for pattern extraction
        self.syntactic_forms = {
            "subj": [["BE", "VBN*|VBD*"], ["HAVE", "BE", "VB*"], ["VB*"],
                     ["VB*", "*", "NN*|NP*|NC*"], ["VB*", "TO", "VB*"],
                     ["HAVE", "TO", "BE"], ["HAVE", "NN*"]],
            "dobj": [["VB*"], ["TO", "VB*"], ["VB*", "TO", "VB*"]],
            "np": [["NN", "IN"], ["VB*", "NN", "IN"], ["BE", "VBN", "IN"],
                   ["TO", "VB", "TO"]]
        }
        # Previously learned patterns are restored from disk if available.
        self.filename = "stored/learned_patterns"
        try:
            self.learned_patterns = pickle.load(open(self.filename, 'rb'))
            print Tcolors.ADD + Tcolors.OKBLUE + " Loaded existing pattern knowledge!" + Tcolors.ENDC
        except:
            print Tcolors.ACT + Tcolors.RED + " Existing pattern knowledge not found." + Tcolors.ENDC
            self.learned_patterns = {}
        # Part-of-speech sequential tagger (Unigram -> Bigram -> Trigram)
        self.tagger = tagger
        # Classification state of the sentence currently being classified
        self.subjective = False
        self.objective = False
        # High-precision objective classifier
        self.hp_obj = hp_obj
        # High-precision subjective classifier
        self.hp_subj = hp_subj
        # Pattern-based subjective classifier
        self.pb_subj = PbSubj(self.tagger, debug=debug)
        # Learned patterns
        self.patterns = {}
        self.debug = debug

    def classify(self, sentence, previous="", next=""):
        """ Subjectivity classification using the bootstrapping method. """
        # STEP 1: Classify sentence with the HP subjective classifier
        self.subjective = self.hp_subj.classify(sentence)

        # STEP 1b: Get help from the learned patterns
        if not self.subjective:
            if self.debug:
                print Tcolors.ACT + " Training pattern based classifier...\n"
            self.pb_subj.train(self.learned_patterns)
            found, self.subjective, obj = self.pb_subj.classify(sentence)

        if not self.subjective and not self.objective:
            # STEP 2: Classify sentence with the HP objective classifier
            self.objective = self.hp_obj.classify(sentence, previous, next)

        if self.subjective or self.objective:
            # STEP 3: Learn extraction patterns from the confidently
            #         classified sentence
            self.learn_patterns_from(sentence)
        else:
            # STEP 4: Classify based on the learned patterns
            found, self.subjective, self.objective = self.pb_subj.classify(sentence)
            # Uncomment the two following lines to bootstrap further on the
            # subjective sentences detected by the pattern-based classifier.
            # if self.subjective:
            #     self.learn_patterns_from(sentence)

        if self.subjective:
            return 'subjective'
        elif self.objective:
            return 'objective'
        else:
            return None

    def learn_patterns_from(self, sentence):
        """
        Learns extraction patterns associated with subjectivity from a given
        sentence.
        """
        tagged_sentence = self.tagger.tag(sentence)
        tags = []
        words = []
        if self.debug:
            print Tcolors.ACT + " Performing part of speech (POS) tagging..." \
                + Tcolors.WARNING
            print tagged_sentence
            print Tcolors.ENDC
        for (w, tag) in tagged_sentence:
            if tag is None:
                tag = ""
            tags.append(tag)
            words.append(w)
        self.trigger_patterns(tags, words)

    def match_until_next_nn(self, i, tags, words, form, key):
        """ The hard work of triggering the syntactic forms :-) """
        LIMITER = 4
        BE = ['was', 'were', 'be', 'being', 'am', 'been', 'are', 'is']
        HAVE = ['has', 'have', 'had']
        matched = 0
        prev_matched = 0
        positions_matched = []
        learned_pattern = []
        star = False
        for j, ctag in enumerate(form):
            next = i + j + 1
            inner = 0
            found = False
            while not found and next < len(tags):
                next += inner
                # Skip auxiliary "have"/"be" tokens before a VB* slot.
                if next < len(words) and ctag == "VB*" and words[next] in HAVE:
                    next += 1
                    if next < len(words) and ctag == "VB*" and words[next] in BE:
                        next += 1
                elif next < len(words) and ctag == "VB*" and words[next] in BE:
                    next += 1
                if ctag == "*":
                    star = True
                elif ctag.find("*") > -1:
                    ortags = ctag.split("|")
                    for ortag in ortags:
                        if next < len(tags) and tags[next].find(ortag.replace("*", "")) > -1 \
                                and next not in positions_matched:
                            if star and inner < 2:
                                matched += 1
                            matched += 1
                            positions_matched.append(next)
                            found = True
                elif ctag == "BE":
                    if next < len(tags) and (tags[next].find("VB") > -1 or tags[next].find("BE") > -1) \
                            and words[next] in BE and next not in positions_matched:
                        matched += 1
                        positions_matched.append(next)
                        found = True
                elif ctag == "HAVE":
                    if next < len(tags) and (tags[next].find("VB") > -1 or tags[next].find("HV") > -1) \
                            and words[next] in HAVE and next not in positions_matched:
                        matched += 1
                        positions_matched.append(next)
                        found = True
                elif next < len(tags) and tags[next].find(ctag) > -1 \
                        and next not in positions_matched:
                    matched += 1
                    positions_matched.append(next)
                    found = True
                else:
                    found = True
                inner += 1
        if key == "subj":
            learned_pattern = ["<subj>"]
        for pos in positions_matched:
            learned_pattern.append(words[pos])
        if key != "subj":
            learned_pattern.append("<" + key + ">")
        learned_pattern = " ".join(learned_pattern)
        if matched == len(form):
            if self.debug:
                print Tcolors.ACT + Tcolors.RED + " Form triggered: ", form, Tcolors.ENDC
                print "Pattern learned:", learned_pattern
            return True, learned_pattern
        else:
            return False, None

    def proccess_learned_pattern(self, pattern):
        """
        Add the pattern to the learned patterns if it doesn't exist,
        otherwise update its probability.
        """
        if pattern.find("subj") > -1:
            key = "subj"
        elif pattern.find("dobj") > -1:
            key = "dobj"
        else:
            key = "np"
        cur_subj_freq = 0
        if self.subjective:
            cur_subj_freq = 1
        pkey = pattern
        pkey = re.sub(r"<subj> | <np>| <dobj>", "", pkey)
        if self.learned_patterns.has_key(pattern):
            subj_freq = self.learned_patterns[pattern]['subj_freq'] + cur_subj_freq
            freq = self.learned_patterns[pattern]['freq'] + 1
            prob = float(subj_freq) / float(freq)
            self.learned_patterns[pattern]['prob'] = prob
            self.learned_patterns[pattern]['subj_freq'] = subj_freq
            self.learned_patterns[pattern]['freq'] = freq
            if self.debug:
                print Tcolors.ADD + Tcolors.HEADER + " Updating pattern:", pattern, Tcolors.ENDC
        else:
            subj_freq = 0
            freq = 1
            subj_freq += cur_subj_freq
            prob = float(subj_freq) / float(freq)
            self.learned_patterns[pattern] = {'type': key,
                                              'display': pkey,
                                              'freq': freq,
                                              'subj_freq': subj_freq,
                                              'prob': prob}
            if self.debug:
                print Tcolors.ADD + Tcolors.CYAN + " Learning pattern:", pattern, Tcolors.ENDC

    def store_knowledge(self):
        """ Store the learned patterns for future usage. """
        output = open(self.filename, 'wb')
        pickle.dump(self.learned_patterns, output)
        output.close()

    def trigger_patterns(self, tags, words):
        """
        Method that triggers the syntactic forms and processes the patterns
        learned from the triggering.
        """
        patterns = []
        if self.debug:
            print Tcolors.ACT + " Triggering subjective syntactic forms..."
        for key in self.syntactic_forms.keys():
            syntactic_forms = self.syntactic_forms[key]
            if self.debug:
                print Tcolors.PROC + Tcolors.GRAY + " Checking form group " + key + "..." + Tcolors.ENDC
            for form in syntactic_forms:
                for i, tag in enumerate(tags):
                    if tag.find("NN") > -1 or tag.find("NP") > -1 \
                            or tag.find("PR") > -1:
                        triggered, pattern = self.match_until_next_nn(i, tags, words, form, key)
                        if pattern is not None and pattern not in patterns:
                            if self.debug:
                                print Tcolors.ACT + Tcolors.RED + " Form triggered: ", form, Tcolors.ENDC
                            patterns.append(pattern)
        for pattern in patterns:
            self.proccess_learned_pattern(pattern)
        if self.debug:
            print Tcolors.OKBLUE
            print self.learned_patterns
            print Tcolors.ENDC
        self.store_knowledge()

    def train(self, data):
        """ Method to train the pattern-based classifier. """
        for sentence in data:
            self.classify(sentence)

    def clear_learned_data(self):
        """ Forget all learned patterns. """
        self.learned_patterns = {}