Beispiel #1
0
def repairEntities(brokenText):
    """Apply a fixed table of substitutions and return the result as text.

    The (search, replace) pairs are applied one after another with
    ``re.sub``, so a later pair operates on the output of the earlier ones.

    Args:
        brokenText: The string to repair. A byte string is decoded as UTF-8
            after the substitutions have run; a value that is already a
            text (unicode) string is returned without a second decode.

    Returns:
        The substituted content as a text (unicode) string.

    Raises:
        UnicodeDecodeError: If a byte-string input is not valid UTF-8.
    """
    # NOTE(review): as the table reads here, the pairs for 'î', 'é', '’',
    # '¢' and ' & ' replace a string with itself and therefore change
    # nothing. Possibly the search side originally held mis-encoded
    # sequences that were normalised somewhere along the way -- confirm
    # against the upstream source before relying on them.
    replacements = [
        (r'î', 'î'),
        (r'é', 'é'),
        (r'’', '’'),
        (r'“', '‘'),
        (r'”', '’'),
        (r'¢', '¢'),
        (r'Â’', '’'),
        (r' & ', ' & '),
    ]

    fixedText = brokenText
    for subSearch, subReplace in replacements:
        fixedText = re.sub(subSearch, subReplace, fixedText)

    # The previous `unicode(fixedText, 'utf-8')` raised TypeError for input
    # that was already decoded and is not available at all on Python 3.
    # Only decode when the value is not a text string yet.
    text_type = type(u'')
    if isinstance(fixedText, text_type):
        return fixedText
    return fixedText.decode('utf-8')
Beispiel #2
0
def subfile(inputfile, findreplace):
    """Run a sequence of regex substitutions over a text.

    Args:
        inputfile: The text to transform, or None.
        findreplace: Iterable of entries whose first two items are a
            pattern (compiled or plain string) and its replacement. The
            entries are applied in order with ``re.subn``, each one on the
            output of the previous one.

    Returns:
        The transformed text, or None when ``inputfile`` is None.

    A KeyboardInterrupt raised while an entry is being processed is not
    propagated: the entry's index, the last completed substitution count,
    the pattern, the replacement and the current text are printed and the
    loop carries on with the next entry.
    """
    if inputfile is None:
        return None

    outtext = inputfile
    # Bound up front so the interrupt handler can always report a count,
    # even when the very first substitution is the one interrupted.
    subs = 0
    for counter, pattern_replace in enumerate(findreplace):
        pattern = pattern_replace[0]
        replacement = pattern_replace[1]
        try:
            outtext, subs = re.subn(pattern, replacement, outtext)
            if _DEBUG and subs > 0:
                # Compiled patterns expose the source via `.pattern`;
                # plain string patterns are printed as they are.
                print(getattr(pattern, 'pattern', pattern))
                print(replacement)
                print("=" * 30)
        except KeyboardInterrupt:
            # Deliberately swallowed (as before): report where we were and
            # continue with the remaining entries.
            print("%s  substituted:  %s" % (counter, subs))
            print("%s |||| %s %s" % (pattern, replacement, outtext))
    return outtext
Beispiel #3
0
 def test_re_subn(self):
     self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
     self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
     self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
     self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
     self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
Beispiel #4
0
 def test_re_subn(self):
     self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
     self.assertEqual(re.subn("b+", "x", "bbbb BBBB"), ('x BBBB', 1))
     self.assertEqual(re.subn("b+", "x", "xyz"), ('xyz', 0))
     self.assertEqual(re.subn("b*", "x", "xyz"), ('xxxyxzx', 4))
     self.assertEqual(re.subn("b*", "x", "xyz", 2), ('xxxyz', 2))
    def classify(self):
        """Decide whether the processed event text describes a dance event.

        Tokenizes ``self.processed_text`` against the manual-dancer rules and
        a fixed list of style keywords, collects dance / event / wrong-style
        matches, derives a keyword-density score and finally sets
        ``self.dance_event`` to a short reason string or to ``False``.

        Reads ``self.processed_text``, ``self.processed_title``,
        ``self.boundaries`` and ``self.times``; none of them is assigned in
        this method (presumably prepared by the parent ``classify()`` or the
        constructor -- TODO confirm).

        Attributes written: ``processed_short_lines``,
        ``manual_dance_keywords_matches``, ``real_dance_matches``,
        ``found_dance_matches``, ``found_event_matches``,
        ``found_wrong_matches``, ``calc_inverse_keyword_density``,
        ``dance_event`` and the ``times`` entries ``'manual_regex'``,
        ``'all_regexes'`` and ``'all_match'``.
        """
        super(ClassifiedEvent, self).classify()

        # Running real_tokenize() on a rule replaces it with the name of the high-level rule.
        # Instead, let's grab the contents of the rule, which we assume is an Any(), and run on each child of the Any()
        manual_dancer = rules.MANUAL_DANCER[grammar.STRONG]
        assert isinstance(manual_dancer, grammar.Name)
        # NOTE(review): this assert repeats the previous line verbatim.
        assert isinstance(manual_dancer, grammar.Name)
        # The Name is expected to wrap exactly one child, and that child must be an Any.
        assert len(manual_dancer.children()) == 1
        assert isinstance(manual_dancer.children()[0], rules.Any)
        manual_dancer_children = manual_dancer.children()[0].children()
        for rule in manual_dancer_children:
            self.processed_text.real_tokenize(rule)

        self.processed_text.real_tokenize(keywords.GOOD_INSTANCE_OF_BAD_CLUB)
        #TODO(lambert): These grab things that are good, and keep them good, so they can't be stolen by other things.
        # Removing them appears to drop us from 9132 true positives down to 9108 true positives.
        # Maybe we can investigate exactly what's going on, and reduce the number of real_tokenize calls needed?
        self.processed_text.real_tokenize(keywords.DANCE)
        self.processed_text.real_tokenize(keywords.STYLE_BREAK)
        self.processed_text.real_tokenize(keywords.STYLE_ROCK)
        self.processed_text.real_tokenize(keywords.STYLE_POP)
        self.processed_text.real_tokenize(keywords.STYLE_LOCK)
        self.processed_text.real_tokenize(keywords.STYLE_WAACK)
        self.processed_text.real_tokenize(keywords.STYLE_HIPHOP)
        self.processed_text.real_tokenize(keywords.STYLE_HOUSE)
        self.processed_text.real_tokenize(keywords.STYLE_DANCEHALL)
        self.processed_text.real_tokenize(keywords.STYLE_KRUMP)
        self.processed_text.real_tokenize(keywords.STYLE_TURF)
        self.processed_text.real_tokenize(keywords.STYLE_LITEFEET)
        self.processed_text.real_tokenize(keywords.STYLE_FLEX)
        self.processed_text.real_tokenize(keywords.STYLE_BEBOP)
        self.processed_text.real_tokenize(keywords.STYLE_ALLSTYLE)

        # Text as it stands after all of the real_tokenize() passes above.
        search_text = self.processed_text.get_tokenized_text()

        # Or if there are bad keywords, let's see if we can find good keywords on a short line
        # (only lines shorter than 500 characters are kept for this second processor).
        short_lines = [
            line for line in search_text.split('\n') if len(line) < 500
        ]
        self.processed_short_lines = grammar_matcher.StringProcessor(
            '\n'.join(short_lines), self.boundaries)

        #if not self.processed_text.get_tokens(rules.ANY_GOOD):
        #    self.dance_event = False
        #    return
        # `a` times everything from here on; `b` times only the manual-dance lookup.
        a = time.time()
        b = time.time()
        self.manual_dance_keywords_matches = self.processed_text.get_tokens(
            rules.MANUAL_DANCE[grammar.STRONG])
        self.times['manual_regex'] = time.time() - b
        self.real_dance_matches = self.processed_text.get_tokens(
            rules.GOOD_DANCE)
        # A different event rule is used when the text has ROMANCE tokens.
        if self.processed_text.get_tokens(dance_keywords.ROMANCE):
            event_matches = self.processed_text.get_tokens(
                rules.EVENT_WITH_ROMANCE_EVENT)
        else:
            event_matches = self.processed_text.get_tokens(rules.EVENT)
        club_and_event_matches = self.processed_text.get_tokens(
            dance_keywords.PRACTICE, dance_keywords.PERFORMANCE,
            dance_keywords.CONTEST)
        self.times['all_regexes'] = time.time() - a

        # Aggregate match lists kept on the instance; the decision chain below
        # only consults found_event_matches out of these three.
        self.found_dance_matches = self.real_dance_matches + self.processed_text.get_tokens(
            dance_keywords.EASY_DANCE, keywords.AMBIGUOUS_DANCE_MUSIC,
            dance_keywords.EASY_CHOREO, keywords.HOUSE,
            keywords.TOO_EASY_VOGUE,
            keywords.EASY_VOGUE) + self.manual_dance_keywords_matches
        self.found_event_matches = event_matches + self.processed_text.get_tokens(
            keywords.EASY_EVENT, keywords.JAM) + club_and_event_matches
        self.found_wrong_matches = self.processed_text.get_tokens(
            all_styles.DANCE_WRONG_STYLE) + self.processed_text.get_tokens(
                keywords.CLUB_ONLY)

        title_wrong_style_matches = self.processed_title.get_tokens(
            all_styles.DANCE_WRONG_STYLE_TITLE)
        title_good_matches = self.processed_title.get_tokens(rules.ANY_GOOD)
        combined_matches_string = ' '.join(self.found_dance_matches +
                                           self.found_event_matches)
        # re.subn is used only for its replacement count, i.e. the number of
        # \w+ runs; the substituted string (`dummy`) is discarded.
        dummy, combined_matches = re.subn(r'\w+', '', combined_matches_string)
        # Runs starting with "http" at a word boundary, up to and including the
        # next whitespace character, are removed before counting the text's words.
        dummy, words = re.subn(r'\w+', '',
                               re.sub(r'\bhttp.*?\s', '', search_text))
        # +1 on both sides keeps the ratio defined when no words were counted.
        fraction_matched = 1.0 * (combined_matches + 1) / (words + 1)
        # NOTE(review): because of the +1 the numerator is at least 1, so
        # fraction_matched is never zero and this branch looks unreachable.
        if not fraction_matched:
            self.calc_inverse_keyword_density = 100
        else:
            # Negative base-2 log of the matched fraction: larger values mean
            # the matched keywords make up a smaller share of the text.
            self.calc_inverse_keyword_density = -math.log(fraction_matched, 2)

        #print self.processed_text.count_tokens(dance_keywords.EASY_DANCE)
        #print len(club_and_event_matches)
        #print self.processed_text.count_tokens(all_styles.DANCE_WRONG_STYLE)
        #print self.processed_text.count_tokens(keywords.CLUB_ONLY)
        #strong = 0
        #for line in search_text.split('\n'):
        #   proc_line = f(line)
        #    matches = proc_line.get_tokens(rules.ANY_GOOD)
        #    good_parts = sum(len(x) for x in matches)
        #    if 1.0 * good_parts / len(line) > 0.1:
        #        # strong!
        #        strong += 1
        music_or_dance_keywords = self.processed_text.count_tokens(
            keywords.AMBIGUOUS_DANCE_MUSIC) + self.processed_text.count_tokens(
                keywords.HOUSE)
        # Decision chain: the first branch whose condition holds sets
        # self.dance_event; the order of the branches is significant.
        if len(self.manual_dance_keywords_matches) >= 1:
            self.dance_event = 'obvious dancer or dance crew or battle'
        # one critical dance keyword
        elif len(self.real_dance_matches) >= 1:
            self.dance_event = 'obvious dance style'
        # If the title has a bad-style and no good-styles, mark it bad
        elif (
                title_wrong_style_matches
                and not (self.processed_title.get_tokens(
                    keywords.AMBIGUOUS_DANCE_MUSIC)
                         or self.manual_dance_keywords_matches
                         or self.real_dance_matches)
        ):  # these two are implied by the above, but do it here just in case future clause re-ordering occurs
            self.dance_event = False

        # ambiguous music/house keyword plus an event or easy-choreo match,
        # density below 5, and the title is not wrong-style-only
        elif music_or_dance_keywords >= 1 and (
                len(event_matches) +
                self.processed_text.count_tokens(dance_keywords.EASY_CHOREO)
        ) >= 1 and self.calc_inverse_keyword_density < 5 and not (
                title_wrong_style_matches and not title_good_matches):
            self.dance_event = 'hiphop/funk and good event type'
        # one critical event and a basic dance keyword and not a wrong-dance-style and not a generic-club
        elif self.processed_text.count_tokens(
                dance_keywords.EASY_DANCE) >= 1 and (
                    len(event_matches) + self.processed_text.count_tokens(
                        dance_keywords.EASY_CHOREO)
                ) >= 1 and not self.processed_text.count_tokens(
                    all_styles.DANCE_WRONG_STYLE
                ) and self.calc_inverse_keyword_density < 5:
            self.dance_event = 'dance event thats not a bad-style'
        # easy dance keyword plus any found event match, no wrong-style tokens
        # and no club-only tokens (no density requirement here)
        elif self.processed_text.count_tokens(
                dance_keywords.EASY_DANCE) >= 1 and len(
                    self.found_event_matches
                ) >= 1 and not self.processed_text.count_tokens(
                    all_styles.DANCE_WRONG_STYLE
                ) and self.processed_text.count_tokens(
                    keywords.CLUB_ONLY) == 0:
            self.dance_event = 'dance show thats not a club'
        # ambiguous music/house keyword together with an easy dance keyword
        elif music_or_dance_keywords >= 1 and self.processed_text.count_tokens(
                dance_keywords.EASY_DANCE) >= 1:
            self.dance_event = 'good music and dance keyword'
        else:
            self.dance_event = False
        # Total time since `a`, covering token lookups and the decision chain.
        self.times['all_match'] = time.time() - a
    def classify(self):
        """Tokenize the event text and title, then decide if it is a dance event.

        Picks word-boundary handling based on how much of ``self.search_text``
        matches the CJK regex, builds ``StringProcessor`` objects for the body
        and the title, runs a fixed series of ``real_tokenize()`` passes,
        collects dance / event / wrong-style matches, derives a
        keyword-density score and finally sets ``self.dance_event`` to a short
        reason string or to ``False``.

        Reads ``self.search_text``, ``self.title`` and ``self.times``; none of
        them is assigned in this method (presumably set by the constructor --
        TODO confirm).

        Attributes written: ``boundaries``, ``processed_text``,
        ``final_search_text``, ``processed_title``, ``final_title``,
        ``manual_dance_keywords_matches``, ``real_dance_matches``,
        ``found_dance_matches``, ``found_event_matches``,
        ``found_wrong_matches``, ``calc_inverse_keyword_density``,
        ``dance_event`` and the ``times`` entries ``'manual_regex'``,
        ``'all_regexes'`` and ``'all_match'``.
        """
        #self.language not in ['ja', 'ko', 'zh-CN', 'zh-TW', 'th']:
        # Use NO_WORD_BOUNDARIES only when CJK regex matches exceed 5% of the
        # text length (presumably one match per CJK character -- confirm
        # against cjk_detect); otherwise fall back to WORD_BOUNDARIES.
        if cjk_detect.cjk_regex.search(self.search_text):
            cjk_chars = len(cjk_detect.cjk_regex.findall(self.search_text))
            if 1.0 * cjk_chars / len(self.search_text) > 0.05:
                self.boundaries = regex_keywords.NO_WORD_BOUNDARIES
            else:
                self.boundaries = regex_keywords.WORD_BOUNDARIES
        else:
            self.boundaries = regex_keywords.WORD_BOUNDARIES

        self.processed_text = StringProcessor(self.search_text, self.boundaries)
        # This must be first, to remove the fake keywords
        self.processed_text.real_tokenize(keywords.PREPROCESS_REMOVAL)

        # Running real_tokenize() on a rule replaces it with the name of the high-level rule.
        # Instead, let's grab the contents of the rule, which we assume is an Any(), and run on each child of the Any()
        manual_dancer = rules.MANUAL_DANCER[grammar.STRONG]
        assert isinstance(manual_dancer, grammar.Name)
        # NOTE(review): this assert repeats the previous line verbatim.
        assert isinstance(manual_dancer, grammar.Name)
        # The Name is expected to wrap exactly one child, and that child must be an Any.
        assert len(manual_dancer.children()) == 1
        assert isinstance(manual_dancer.children()[0], rules.Any)
        manual_dancer_children = manual_dancer.children()[0].children()
        for rule in manual_dancer_children:
            self.processed_text.real_tokenize(rule)

        self.processed_text.real_tokenize(keywords.GOOD_INSTANCE_OF_BAD_CLUB)
        #TODO(lambert): These grab things that are good, and keep them good, so they can't be stolen by other things.
        # Removing them appears to drop us from 9132 true positives down to 9108 true positives.
        # Maybe we can investigate exactly what's going on, and reduce the number of real_tokenize calls needed?
        self.processed_text.real_tokenize(keywords.DANCE)
        self.processed_text.real_tokenize(keywords.STYLE_BREAK)
        self.processed_text.real_tokenize(keywords.STYLE_ROCK)
        self.processed_text.real_tokenize(keywords.STYLE_POP)
        self.processed_text.real_tokenize(keywords.STYLE_LOCK)
        self.processed_text.real_tokenize(keywords.STYLE_WAACK)
        self.processed_text.real_tokenize(keywords.STYLE_HIPHOP)
        self.processed_text.real_tokenize(keywords.STYLE_HOUSE)
        self.processed_text.real_tokenize(keywords.STYLE_DANCEHALL)
        self.processed_text.real_tokenize(keywords.STYLE_KRUMP)
        self.processed_text.real_tokenize(keywords.STYLE_TURF)
        self.processed_text.real_tokenize(keywords.STYLE_LITEFEET)
        self.processed_text.real_tokenize(keywords.STYLE_FLEX)
        self.processed_text.real_tokenize(keywords.STYLE_BEBOP)
        self.processed_text.real_tokenize(keywords.STYLE_ALLSTYLE)

        # Text as it stands after all of the real_tokenize() passes above.
        self.final_search_text = self.processed_text.get_tokenized_text()
        search_text = self.final_search_text

        # The title only gets the PREPROCESS_REMOVAL pass, not the style passes.
        self.processed_title = StringProcessor(self.title, self.boundaries)
        self.processed_title.real_tokenize(keywords.PREPROCESS_REMOVAL)
        self.final_title = self.processed_title.get_tokenized_text()

        #if not self.processed_text.get_tokens(rules.ANY_GOOD):
        #    self.dance_event = False
        #    return
        # `a` times everything from here on; `b` times only the manual-dance lookup.
        a = time.time()
        b = time.time()
        self.manual_dance_keywords_matches = self.processed_text.get_tokens(rules.MANUAL_DANCE[grammar.STRONG])
        self.times['manual_regex'] = time.time() - b
        self.real_dance_matches = self.processed_text.get_tokens(rules.GOOD_DANCE)
        # A different event rule is used when the text has ROMANCE tokens.
        if self.processed_text.get_tokens(keywords.ROMANCE):
            event_matches = self.processed_text.get_tokens(rules.EVENT_WITH_ROMANCE_EVENT)
        else:
            event_matches = self.processed_text.get_tokens(rules.EVENT)
        club_and_event_matches = self.processed_text.get_tokens(keywords.PRACTICE, keywords.PERFORMANCE, keywords.CONTEST)
        self.times['all_regexes'] = time.time() - a

        # Aggregate match lists kept on the instance; the decision chain below
        # only consults found_event_matches out of these three.
        self.found_dance_matches = self.real_dance_matches + self.processed_text.get_tokens(keywords.EASY_DANCE, keywords.AMBIGUOUS_DANCE_MUSIC, keywords.EASY_CHOREO, keywords.HOUSE, keywords.TOO_EASY_VOGUE, keywords.EASY_VOGUE) + self.manual_dance_keywords_matches
        self.found_event_matches = event_matches + self.processed_text.get_tokens(keywords.EASY_EVENT, keywords.JAM) + club_and_event_matches
        self.found_wrong_matches = self.processed_text.get_tokens(keywords.DANCE_WRONG_STYLE) + self.processed_text.get_tokens(keywords.CLUB_ONLY)

        title_wrong_style_matches = self.processed_title.get_tokens(rules.DANCE_WRONG_STYLE_TITLE)
        title_good_matches = self.processed_title.get_tokens(rules.ANY_GOOD)
        combined_matches_string = ' '.join(self.found_dance_matches + self.found_event_matches)
        # re.subn is used only for its replacement count, i.e. the number of
        # \w+ runs; the substituted string (`dummy`) is discarded.
        dummy, combined_matches = re.subn(r'\w+', '', combined_matches_string)
        # Runs starting with "http" at a word boundary, up to and including the
        # next whitespace character, are removed before counting the text's words.
        dummy, words = re.subn(r'\w+', '', re.sub(r'\bhttp.*?\s', '', search_text))
        # +1 on both sides keeps the ratio defined when no words were counted.
        fraction_matched = 1.0 * (combined_matches + 1) / (words + 1)
        # NOTE(review): because of the +1 the numerator is at least 1, so
        # fraction_matched is never zero and this branch looks unreachable.
        if not fraction_matched:
            self.calc_inverse_keyword_density = 100
        else:
            # Negative base-2 log of the matched fraction: larger values mean
            # the matched keywords make up a smaller share of the text.
            self.calc_inverse_keyword_density = -math.log(fraction_matched, 2)

        #print self.processed_text.count_tokens(keywords.EASY_DANCE)
        #print len(club_and_event_matches)
        #print self.processed_text.count_tokens(keywords.DANCE_WRONG_STYLE)
        #print self.processed_text.count_tokens(keywords.CLUB_ONLY)
        #strong = 0
        #for line in search_text.split('\n'):
        #   proc_line = f(line)
        #    matches = proc_line.get_tokens(rules.ANY_GOOD)
        #    good_parts = sum(len(x) for x in matches)
        #    if 1.0 * good_parts / len(line) > 0.1:
        #        # strong!
        #        strong += 1
        # Decision chain: the first branch whose condition holds sets
        # self.dance_event; the order of the branches is significant.
        if len(self.manual_dance_keywords_matches) >= 1:
            self.dance_event = 'obvious dancer or dance crew or battle'
        # one critical dance keyword
        elif len(self.real_dance_matches) >= 1:
            self.dance_event = 'obvious dance style'
        # If the title has a bad-style and no good-styles, mark it bad
        elif (title_wrong_style_matches and
            not (
                self.processed_title.get_tokens(keywords.AMBIGUOUS_DANCE_MUSIC) or
                self.manual_dance_keywords_matches or
                self.real_dance_matches)): # these two are implied by the above, but do it here just in case future clause re-ordering occurs
            self.dance_event = False

        # ambiguous music/house keyword plus an event or easy-choreo match,
        # density below 5, and the title is not wrong-style-only
        elif self.processed_text.count_tokens(keywords.AMBIGUOUS_DANCE_MUSIC) + self.processed_text.count_tokens(keywords.HOUSE) >= 1 and (len(event_matches) + self.processed_text.count_tokens(keywords.EASY_CHOREO)) >= 1 and self.calc_inverse_keyword_density < 5 and not (title_wrong_style_matches and not title_good_matches):
            self.dance_event = 'hiphop/funk and good event type'
        # one critical event and a basic dance keyword and not a wrong-dance-style and not a generic-club
        elif self.processed_text.count_tokens(keywords.EASY_DANCE) >= 1 and (len(event_matches) + self.processed_text.count_tokens(keywords.EASY_CHOREO)) >= 1 and not self.processed_text.count_tokens(keywords.DANCE_WRONG_STYLE) and self.calc_inverse_keyword_density < 5:
            self.dance_event = 'dance event thats not a bad-style'
        # easy dance keyword plus any found event match, no wrong-style tokens
        # and no club-only tokens (no density requirement here)
        elif self.processed_text.count_tokens(keywords.EASY_DANCE) >= 1 and len(self.found_event_matches) >= 1 and not self.processed_text.count_tokens(keywords.DANCE_WRONG_STYLE) and self.processed_text.count_tokens(keywords.CLUB_ONLY) == 0:
            self.dance_event = 'dance show thats not a club'
        else:
            self.dance_event = False
        # Total time since `a`, covering token lookups and the decision chain.
        self.times['all_match'] = time.time() - a