def repairEntities(brokenText):
    """Repair common mojibake (UTF-8 bytes mis-decoded as Latin-1) and a stray
    double-escaped entity, then decode the result as UTF-8.

    brokenText -- a byte string containing the garbled sequences below.
    Returns a ``unicode`` object (Python 2).
    """
    fixedText = brokenText
    # Every search string is a plain literal with no regex metacharacters,
    # so str.replace() is sufficient (the original used re.subn and threw
    # away the substitution count; regex machinery buys nothing here).
    replacements = [
        ('î', 'î'),
        ('é', 'é'),
        ('’', '’'),
        # NOTE(review): this maps a LEFT DOUBLE quote to a LEFT SINGLE
        # quote (and the next line maps RIGHT DOUBLE to RIGHT SINGLE) --
        # confirm that the double->single downgrade is intentional.
        ('“', '‘'),
        ('”', '’'),
        ('¢', '¢'),
        ('Â’', '’'),
        ('&amp;amp;', '&amp;'),
    ]
    for subSearch, subReplace in replacements:
        fixedText = fixedText.replace(subSearch, subReplace)
    return unicode(fixedText, 'utf-8')
def subfile(inputfile, findreplace): outtext = inputfile if inputfile is None: return outtext for counter, pattern_replace in enumerate(findreplace): try: outtext, subs = re.subn(pattern_replace[0], pattern_replace[1], outtext) if _DEBUG and subs > 0: print pattern_replace[0].pattern print pattern_replace[1] print "=" * 30 except KeyboardInterrupt: print counter, ' substituted: ', subs print pattern_replace[0], "||||", pattern_replace[1], outtext return outtext
def test_re_subn(self): self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1)) self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0)) self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4)) self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
def classify(self):
    """Classify this event as dance / not-dance.

    Runs the base classifier, tokenizes style keywords into the processed
    text, gathers keyword/rule matches, computes an inverse keyword density,
    and walks a prioritized elif chain to set ``self.dance_event`` (a
    human-readable reason string, or False). Timing data is recorded in
    ``self.times``. Statement order matters: real_tokenize() calls must
    precede the get_tokens() matching below.
    """
    super(ClassifiedEvent, self).classify()
    # Running real_tokenize() on a rule replaces it with the name of the high-level rule.
    # Instead, let's grab the contents of the rule, which will assume is an Any(), and run on each of the Any()
    manual_dancer = rules.MANUAL_DANCER[grammar.STRONG]
    assert isinstance(manual_dancer, grammar.Name)
    # NOTE(review): duplicate of the assert above -- likely copy/paste.
    assert isinstance(manual_dancer, grammar.Name)
    assert len(manual_dancer.children()) == 1
    assert isinstance(manual_dancer.children()[0], rules.Any)
    manual_dancer_children = manual_dancer.children()[0].children()
    for rule in manual_dancer_children:
        self.processed_text.real_tokenize(rule)
    self.processed_text.real_tokenize(keywords.GOOD_INSTANCE_OF_BAD_CLUB)
    #TODO(lambert): These grab things that are good, and keep them good, so they can't be stolen by other things.
    # Removing them appears to drop us from 9132 true positives down to 9108 true positives.
    # Maybe we can investigate exactly what's going on, and reduce the number of real_tokenize calls needed?
    self.processed_text.real_tokenize(keywords.DANCE)
    self.processed_text.real_tokenize(keywords.STYLE_BREAK)
    self.processed_text.real_tokenize(keywords.STYLE_ROCK)
    self.processed_text.real_tokenize(keywords.STYLE_POP)
    self.processed_text.real_tokenize(keywords.STYLE_LOCK)
    self.processed_text.real_tokenize(keywords.STYLE_WAACK)
    self.processed_text.real_tokenize(keywords.STYLE_HIPHOP)
    self.processed_text.real_tokenize(keywords.STYLE_HOUSE)
    self.processed_text.real_tokenize(keywords.STYLE_DANCEHALL)
    self.processed_text.real_tokenize(keywords.STYLE_KRUMP)
    self.processed_text.real_tokenize(keywords.STYLE_TURF)
    self.processed_text.real_tokenize(keywords.STYLE_LITEFEET)
    self.processed_text.real_tokenize(keywords.STYLE_FLEX)
    self.processed_text.real_tokenize(keywords.STYLE_BEBOP)
    self.processed_text.real_tokenize(keywords.STYLE_ALLSTYLE)
    search_text = self.processed_text.get_tokenized_text()
    # Or if there are bad keywords, lets see if we can find good keywords on a short line
    short_lines = [
        line for line in search_text.split('\n') if len(line) < 500
    ]
    self.processed_short_lines = grammar_matcher.StringProcessor(
        '\n'.join(short_lines), self.boundaries)
    #if not self.processed_text.get_tokens(rules.ANY_GOOD):
    #    self.dance_event = False
    #    return
    # Timing bookkeeping: `a` covers all regex matching, `b` just the
    # manual-dance rule.
    a = time.time()
    b = time.time()
    self.manual_dance_keywords_matches = self.processed_text.get_tokens(
        rules.MANUAL_DANCE[grammar.STRONG])
    self.times['manual_regex'] = time.time() - b
    self.real_dance_matches = self.processed_text.get_tokens(
        rules.GOOD_DANCE)
    # Romance-flavored text gets a stricter event rule.
    if self.processed_text.get_tokens(dance_keywords.ROMANCE):
        event_matches = self.processed_text.get_tokens(
            rules.EVENT_WITH_ROMANCE_EVENT)
    else:
        event_matches = self.processed_text.get_tokens(rules.EVENT)
    club_and_event_matches = self.processed_text.get_tokens(
        dance_keywords.PRACTICE, dance_keywords.PERFORMANCE, dance_keywords.CONTEST)
    self.times['all_regexes'] = time.time() - a
    self.found_dance_matches = self.real_dance_matches + self.processed_text.get_tokens(
        dance_keywords.EASY_DANCE, keywords.AMBIGUOUS_DANCE_MUSIC, dance_keywords.EASY_CHOREO,
        keywords.HOUSE, keywords.TOO_EASY_VOGUE, keywords.EASY_VOGUE) + self.manual_dance_keywords_matches
    self.found_event_matches = event_matches + self.processed_text.get_tokens(
        keywords.EASY_EVENT, keywords.JAM) + club_and_event_matches
    self.found_wrong_matches = self.processed_text.get_tokens(
        all_styles.DANCE_WRONG_STYLE) + self.processed_text.get_tokens(
        keywords.CLUB_ONLY)
    title_wrong_style_matches = self.processed_title.get_tokens(
        all_styles.DANCE_WRONG_STYLE_TITLE)
    title_good_matches = self.processed_title.get_tokens(rules.ANY_GOOD)
    combined_matches_string = ' '.join(self.found_dance_matches + self.found_event_matches)
    # re.subn returns (new_string, count); only the counts are used here:
    # number of word tokens among the matches, and in the (URL-stripped) text.
    dummy, combined_matches = re.subn(r'\w+', '', combined_matches_string)
    dummy, words = re.subn(r'\w+', '', re.sub(r'\bhttp.*?\s', '', search_text))
    fraction_matched = 1.0 * (combined_matches + 1) / (words + 1)
    # NOTE(review): numerator/denominator are each >= 1, so
    # fraction_matched > 0 always and the `if not fraction_matched`
    # branch is unreachable -- the density always comes from the log.
    if not fraction_matched:
        self.calc_inverse_keyword_density = 100
    else:
        self.calc_inverse_keyword_density = -math.log(fraction_matched, 2)
    #print self.processed_text.count_tokens(dance_keywords.EASY_DANCE)
    #print len(club_and_event_matches)
    #print self.processed_text.count_tokens(all_styles.DANCE_WRONG_STYLE)
    #print self.processed_text.count_tokens(keywords.CLUB_ONLY)
    #strong = 0
    #for line in search_text.split('\n'):
    #    proc_line = f(line)
    #    matches = proc_line.get_tokens(rules.ANY_GOOD)
    #    good_parts = sum(len(x) for x in matches)
    #    if 1.0 * good_parts / len(line) > 0.1:
    #        # strong!
    #        strong += 1
    music_or_dance_keywords = self.processed_text.count_tokens(
        keywords.AMBIGUOUS_DANCE_MUSIC) + self.processed_text.count_tokens(
        keywords.HOUSE)
    # Decision chain, strongest evidence first; the reason string doubles
    # as an explanation of why the event was kept.
    if len(self.manual_dance_keywords_matches) >= 1:
        self.dance_event = 'obvious dancer or dance crew or battle'
    # one critical dance keyword
    elif len(self.real_dance_matches) >= 1:
        self.dance_event = 'obvious dance style'
    # If the title has a bad-style and no good-styles, mark it bad
    elif (
            title_wrong_style_matches and not (self.processed_title.get_tokens(
            keywords.AMBIGUOUS_DANCE_MUSIC) or self.manual_dance_keywords_matches or self.real_dance_matches)
    ):
        # these two are implied by the above, but do it here just in case future clause re-ordering occurs
        self.dance_event = False
    elif music_or_dance_keywords >= 1 and (
            len(event_matches) + self.processed_text.count_tokens(dance_keywords.EASY_CHOREO)
    ) >= 1 and self.calc_inverse_keyword_density < 5 and not (
            title_wrong_style_matches and not title_good_matches):
        self.dance_event = 'hiphop/funk and good event type'
    # one critical event and a basic dance keyword and not a wrong-dance-style and not a generic-club
    elif self.processed_text.count_tokens(
            dance_keywords.EASY_DANCE) >= 1 and (
            len(event_matches) + self.processed_text.count_tokens(
            dance_keywords.EASY_CHOREO)
    ) >= 1 and not self.processed_text.count_tokens(
            all_styles.DANCE_WRONG_STYLE
    ) and self.calc_inverse_keyword_density < 5:
        self.dance_event = 'dance event thats not a bad-style'
    elif self.processed_text.count_tokens(
            dance_keywords.EASY_DANCE) >= 1 and len(
            self.found_event_matches
    ) >= 1 and not self.processed_text.count_tokens(
            all_styles.DANCE_WRONG_STYLE
    ) and self.processed_text.count_tokens(
            keywords.CLUB_ONLY) == 0:
        self.dance_event = 'dance show thats not a club'
    elif music_or_dance_keywords >= 1 and self.processed_text.count_tokens(
            dance_keywords.EASY_DANCE) >= 1:
        self.dance_event = 'good music and dance keyword'
    else:
        self.dance_event = False
    self.times['all_match'] = time.time() - a
def classify(self):
    """Classify this event as dance / not-dance (standalone variant).

    Chooses word-boundary handling based on CJK character density, builds
    the processed text/title, tokenizes style keywords, gathers matches,
    computes an inverse keyword density, and sets ``self.dance_event`` via
    a prioritized elif chain (reason string, or False). Timings are stored
    in ``self.times``. Statement order matters: PREPROCESS_REMOVAL and the
    real_tokenize() calls must precede the get_tokens() matching below.
    """
    #self.language not in ['ja', 'ko', 'zh-CN', 'zh-TW', 'th']:
    # Heavily-CJK text (>5% CJK chars) gets no word boundaries, since
    # those scripts don't use spaces between words.
    if cjk_detect.cjk_regex.search(self.search_text):
        cjk_chars = len(cjk_detect.cjk_regex.findall(self.search_text))
        if 1.0 * cjk_chars / len(self.search_text) > 0.05:
            self.boundaries = regex_keywords.NO_WORD_BOUNDARIES
        else:
            self.boundaries = regex_keywords.WORD_BOUNDARIES
    else:
        self.boundaries = regex_keywords.WORD_BOUNDARIES
    self.processed_text = StringProcessor(self.search_text, self.boundaries)
    # This must be first, to remove the fake keywords
    self.processed_text.real_tokenize(keywords.PREPROCESS_REMOVAL)
    # Running real_tokenize() on a rule replaces it with the name of the high-level rule.
    # Instead, let's grab the contents of the rule, which will assume is an Any(), and run on each of the Any()
    manual_dancer = rules.MANUAL_DANCER[grammar.STRONG]
    assert isinstance(manual_dancer, grammar.Name)
    # NOTE(review): duplicate of the assert above -- likely copy/paste.
    assert isinstance(manual_dancer, grammar.Name)
    assert len(manual_dancer.children()) == 1
    assert isinstance(manual_dancer.children()[0], rules.Any)
    manual_dancer_children = manual_dancer.children()[0].children()
    for rule in manual_dancer_children:
        self.processed_text.real_tokenize(rule)
    self.processed_text.real_tokenize(keywords.GOOD_INSTANCE_OF_BAD_CLUB)
    #TODO(lambert): These grab things that are good, and keep them good, so they can't be stolen by other things.
    # Removing them appears to drop us from 9132 true positives down to 9108 true positives.
    # Maybe we can investigate exactly what's going on, and reduce the number of real_tokenize calls needed?
    self.processed_text.real_tokenize(keywords.DANCE)
    self.processed_text.real_tokenize(keywords.STYLE_BREAK)
    self.processed_text.real_tokenize(keywords.STYLE_ROCK)
    self.processed_text.real_tokenize(keywords.STYLE_POP)
    self.processed_text.real_tokenize(keywords.STYLE_LOCK)
    self.processed_text.real_tokenize(keywords.STYLE_WAACK)
    self.processed_text.real_tokenize(keywords.STYLE_HIPHOP)
    self.processed_text.real_tokenize(keywords.STYLE_HOUSE)
    self.processed_text.real_tokenize(keywords.STYLE_DANCEHALL)
    self.processed_text.real_tokenize(keywords.STYLE_KRUMP)
    self.processed_text.real_tokenize(keywords.STYLE_TURF)
    self.processed_text.real_tokenize(keywords.STYLE_LITEFEET)
    self.processed_text.real_tokenize(keywords.STYLE_FLEX)
    self.processed_text.real_tokenize(keywords.STYLE_BEBOP)
    self.processed_text.real_tokenize(keywords.STYLE_ALLSTYLE)
    self.final_search_text = self.processed_text.get_tokenized_text()
    search_text = self.final_search_text
    # Title gets the same preprocessing pass as the body text.
    self.processed_title = StringProcessor(self.title, self.boundaries)
    self.processed_title.real_tokenize(keywords.PREPROCESS_REMOVAL)
    self.final_title = self.processed_title.get_tokenized_text()
    #if not self.processed_text.get_tokens(rules.ANY_GOOD):
    #    self.dance_event = False
    #    return
    # Timing bookkeeping: `a` covers all regex matching, `b` just the
    # manual-dance rule.
    a = time.time()
    b = time.time()
    self.manual_dance_keywords_matches = self.processed_text.get_tokens(rules.MANUAL_DANCE[grammar.STRONG])
    self.times['manual_regex'] = time.time() - b
    self.real_dance_matches = self.processed_text.get_tokens(rules.GOOD_DANCE)
    # Romance-flavored text gets a stricter event rule.
    if self.processed_text.get_tokens(keywords.ROMANCE):
        event_matches = self.processed_text.get_tokens(rules.EVENT_WITH_ROMANCE_EVENT)
    else:
        event_matches = self.processed_text.get_tokens(rules.EVENT)
    club_and_event_matches = self.processed_text.get_tokens(keywords.PRACTICE, keywords.PERFORMANCE, keywords.CONTEST)
    self.times['all_regexes'] = time.time() - a
    self.found_dance_matches = self.real_dance_matches + self.processed_text.get_tokens(keywords.EASY_DANCE, keywords.AMBIGUOUS_DANCE_MUSIC, keywords.EASY_CHOREO,
        keywords.HOUSE, keywords.TOO_EASY_VOGUE, keywords.EASY_VOGUE) + self.manual_dance_keywords_matches
    self.found_event_matches = event_matches + self.processed_text.get_tokens(keywords.EASY_EVENT, keywords.JAM) + club_and_event_matches
    self.found_wrong_matches = self.processed_text.get_tokens(keywords.DANCE_WRONG_STYLE) + self.processed_text.get_tokens(keywords.CLUB_ONLY)
    title_wrong_style_matches = self.processed_title.get_tokens(rules.DANCE_WRONG_STYLE_TITLE)
    title_good_matches = self.processed_title.get_tokens(rules.ANY_GOOD)
    combined_matches_string = ' '.join(self.found_dance_matches + self.found_event_matches)
    # re.subn returns (new_string, count); only the counts are used here:
    # number of word tokens among the matches, and in the (URL-stripped) text.
    dummy, combined_matches = re.subn(r'\w+', '', combined_matches_string)
    dummy, words = re.subn(r'\w+', '', re.sub(r'\bhttp.*?\s', '', search_text))
    fraction_matched = 1.0 * (combined_matches + 1) / (words + 1)
    # NOTE(review): numerator/denominator are each >= 1, so
    # fraction_matched > 0 always and the `if not fraction_matched`
    # branch is unreachable -- the density always comes from the log.
    if not fraction_matched:
        self.calc_inverse_keyword_density = 100
    else:
        self.calc_inverse_keyword_density = -math.log(fraction_matched, 2)
    #print self.processed_text.count_tokens(keywords.EASY_DANCE)
    #print len(club_and_event_matches)
    #print self.processed_text.count_tokens(keywords.DANCE_WRONG_STYLE)
    #print self.processed_text.count_tokens(keywords.CLUB_ONLY)
    #strong = 0
    #for line in search_text.split('\n'):
    #    proc_line = f(line)
    #    matches = proc_line.get_tokens(rules.ANY_GOOD)
    #    good_parts = sum(len(x) for x in matches)
    #    if 1.0 * good_parts / len(line) > 0.1:
    #        # strong!
    #        strong += 1
    # Decision chain, strongest evidence first; the reason string doubles
    # as an explanation of why the event was kept.
    if len(self.manual_dance_keywords_matches) >= 1:
        self.dance_event = 'obvious dancer or dance crew or battle'
    # one critical dance keyword
    elif len(self.real_dance_matches) >= 1:
        self.dance_event = 'obvious dance style'
    # If the title has a bad-style and no good-styles, mark it bad
    elif (title_wrong_style_matches and not (
            self.processed_title.get_tokens(keywords.AMBIGUOUS_DANCE_MUSIC) or self.manual_dance_keywords_matches or self.real_dance_matches)):
        # these two are implied by the above, but do it here just in case future clause re-ordering occurs
        self.dance_event = False
    elif self.processed_text.count_tokens(keywords.AMBIGUOUS_DANCE_MUSIC) + self.processed_text.count_tokens(keywords.HOUSE) >= 1 and (len(event_matches) + self.processed_text.count_tokens(keywords.EASY_CHOREO)) >= 1 and self.calc_inverse_keyword_density < 5 and not (title_wrong_style_matches and not title_good_matches):
        self.dance_event = 'hiphop/funk and good event type'
    # one critical event and a basic dance keyword and not a wrong-dance-style and not a generic-club
    elif self.processed_text.count_tokens(keywords.EASY_DANCE) >= 1 and (len(event_matches) + self.processed_text.count_tokens(keywords.EASY_CHOREO)) >= 1 and not self.processed_text.count_tokens(keywords.DANCE_WRONG_STYLE) and self.calc_inverse_keyword_density < 5:
        self.dance_event = 'dance event thats not a bad-style'
    elif self.processed_text.count_tokens(keywords.EASY_DANCE) >= 1 and len(self.found_event_matches) >= 1 and not self.processed_text.count_tokens(keywords.DANCE_WRONG_STYLE) and self.processed_text.count_tokens(keywords.CLUB_ONLY) == 0:
        self.dance_event = 'dance show thats not a club'
    else:
        self.dance_event = False
    self.times['all_match'] = time.time() - a