def format_as_search_query(text, broad=True):
    """Rewrite free-form text, replacing recognized style/event keywords with search filters.

    Every keyword matched by an EVENT_TYPES rule (plus BROAD_STYLES when
    `broad` is true, otherwise STYLES) is replaced in the text by a
    ' categories:<name> ' filter token; the rewritten text is returned.
    """
    processed_text = grammar_matcher.StringProcessor(text)
    category_list = EVENT_TYPES.copy()
    category_list.update(BROAD_STYLES if broad else STYLES)
    # .items() instead of .iteritems() keeps this Python-3 compatible
    # (identical iteration behavior on Python 2).
    for category, rule in category_list.items():
        # Bind `category` as a default argument so the replacement callback
        # does not depend on late binding of the loop variable.
        # The return value (replaced_text, count) was previously unpacked
        # but never used, so it is simply discarded here.
        processed_text.replace_with(
            rule,
            lambda x, category=category: ' categories:%s ' % category.categories_name)
    return processed_text.text
def _parse_contents(self, response):
    """Inspect a scraped page's text for dance-style keywords.

    Wix-hosted pages are served as a JS shell, so those are re-requested
    with an _escaped_fragment_ query parameter to obtain parseable HTML.
    """
    # Wix pages aren't really parseable, so anytime we see them,
    # let's re-run it (depth-1) with an escaped-fragment to get the real html source
    if 'https://static.wixstatic.com/' in response.body and '_escaped_fragment_' not in response.url:
        parsed_url = urlparse(response.url)
        qs = parse_qs(parsed_url.query)
        qs['_escaped_fragment_'] = ''
        wix_scrapeable_url = urlunparse(
            (parsed_url.scheme, parsed_url.netloc, parsed_url.path,
             parsed_url.params, urlencode(qs), parsed_url.fragment)
        )
        # Decrement depth so the re-request is not penalized by the crawl depth limit.
        response.meta['depth'] -= 1
        return [scrapy.Request(wix_scrapeable_url, self.parse)]
    # NOTE(review): this bare return short-circuits the function, making all
    # of the keyword-analysis code below unreachable — presumably disabled
    # experimental code (note the commented-out print at the bottom). Confirm
    # whether it should be removed to re-enable the analysis.
    return
    if not hasattr(response, 'selector'):
        logging.info('Skipping unknown file from: %s', response.url)
        return
    # Get all text contents of tags (unless they are script or style tags)
    text_contents = ' '.join(
        response.selector.xpath('//*[not(self::script|self::style)]/text()').extract()).lower()
    processed_text = grammar_matcher.StringProcessor(
        text_contents, regex_keywords.WORD_BOUNDARIES)
    wrong = processed_text.get_tokens(all_styles.DANCE_WRONG_STYLE)
    good = processed_text.get_tokens(rules.STREET_STYLE)
    if (wrong or good):
        #print response.url, set(wrong), set(good)
        pass
def has_strong_organizer(self):
    """Check whether the event organizer's name contains a strong dance keyword.

    Returns a truthy reason string when the organizer name matches either
    GOOD_DANCE_FULL or SUPER_STRONG_KEYWORDS, and False otherwise (or when
    the title already looks like some other dance style).
    """
    title_is_other_dance = self._title_has_other()
    if title_is_other_dance:
        return False
    org_name = self._classified_event.fb_event['info'].get(
        'owner', {}).get('name', '').lower()
    sp = grammar_matcher.StringProcessor(org_name)
    has_dance_organizer = sp.has_token(self.GOOD_DANCE_FULL)
    self._log('Searching organizer (%s) for %s, has: %s', org_name,
              self.GOOD_DANCE_FULL.name(), has_dance_organizer)
    if has_dance_organizer:
        self._log('Has good dance in event organizer: %s' % has_dance_organizer)
        return 'Has good dance in event organizer'
    has_super_strong_dance_organizer = sp.has_token(
        self.SUPER_STRONG_KEYWORDS)
    # Bug fix: these log calls previously reported has_dance_organizer
    # (always falsy here) instead of the super-strong match result.
    self._log('Searching organizer (%s) for %s, has: %s', org_name,
              self.SUPER_STRONG_KEYWORDS.name(), has_super_strong_dance_organizer)
    if has_super_strong_dance_organizer:
        self._log('Has super-strong dance in event organizer: %s' %
                  has_super_strong_dance_organizer)
        return 'Has super-strong dance in event organizer'
    return False
def process_doc(fb_event):
    """Convert one Facebook event into a flat float feature vector.

    The vector starts with the title and description word counts, followed
    by (title_match_count, text_match_count) pairs for every rule in
    named_rules_list, in order.
    """
    # str("f") keeps the typecode a native str on both Python 2 and 3.
    values = array.array(str("f"))
    processed_title = grammar_matcher.StringProcessor(fb_event['info'].get(
        'name', '').lower())
    processed_text = grammar_matcher.StringProcessor(fb_event['info'].get(
        'description', '').lower())
    # re.subn's second return value is the number of \w+ substitutions,
    # i.e. the word count; the substituted string itself is ignored.
    dummy, title_word_count = re.subn(r'\w+', '', processed_title.text)
    dummy, text_word_count = re.subn(r'\w+', '', processed_text.text)
    values.append(title_word_count)
    values.append(text_word_count)
    # TODO: Ideally we want this to be the rules_list of the GrammarFeatureVector
    # (enumerate() and the rule name were unused, so they are dropped here)
    for _name, rule in named_rules_list:
        # 1.0 * forces float counts for the 'f' array.
        title_matches = 1.0 * processed_title.count_tokens(rule)
        text_matches = 1.0 * processed_text.count_tokens(rule)
        values.append(title_matches)
        values.append(text_matches)
    return values
def highlight_keywords(text):
    """Return `text` as HTML-safe markup with matched keywords wrapped in a span."""
    import jinja2
    escaped = jinja2.Markup.escape(text)
    processor = grammar_matcher.StringProcessor(escaped)

    def _wrap_match(match):
        # Markup's % operator escapes the interpolated match text.
        return jinja2.Markup('<span class="matched-text">%s</span>') % match.group(0)

    processor.replace_with(rules.ANY_GOOD, _wrap_match, flags=re.I)
    return jinja2.Markup(processor.get_tokenized_text())
def has_list_of_good_classes(self):
    """Look for a schedule-like list of lines dominated by good dance styles.

    Returns a truthy reason string when a schedule group has enough
    good-style lines, and False otherwise.
    """
    # A "list of times with dance/music things" can often be clubs as well as classes
    # So let's try to throw out club-things first
    start_time = self._classified_event.start_time
    end_time = self._classified_event.end_time
    # Ignore club events (ends in the morning and less than 12 hours long)
    if end_time and end_time.time() < datetime.time(
            12) and end_time - start_time < datetime.timedelta(hours=12):
        return False
    # Too many club-only keywords: likely a club night, not a class list.
    if len(set(self._get(keywords.CLUB_ONLY))) > 2:
        return False
    #if self._title_has_other():
    #    return False
    # if title is good strong keyword, and we have a list of classes:
    # why doesn't this get found by the is_workshop title classifier? where is our 'camp' keyword
    # https://www.dancedeets.com/events/admin_edit?event_id=317006008387038
    schedule_groups = event_structure.get_schedule_line_groups(
        self._classified_event)
    for schedule_lines in schedule_groups:
        good_lines = []
        bad_lines = []
        for line in schedule_lines:
            proc_line = grammar_matcher.StringProcessor(
                line, self._classified_event.boundaries)
            good_matches = proc_line.get_tokens(
                self.GOOD_OR_AMBIGUOUS_DANCE)
            bad_matches = set()
            for x in self.OTHER_DANCES:
                bad_matches.update(proc_line.get_tokens(x))
            # Sometimes we have a schedule with hiphop and ballet
            # Sometimes we have a schedule with hiphop and dj and beatbox/rap (more on music side)
            # Sometimes we have a schedule with hiphop, house, and beatbox (legit, crosses boundaries)
            # TODO: Should do a better job of classifying the ambiguous music/dance types, based on the presence of non-ambiguous dance types too
            # A line counts as "good" only if it has good matches and no bad
            # ones; mixed lines are ignored entirely.
            if good_matches and not bad_matches:
                self._log('Found %s in line', good_matches)
                good_lines.append(good_matches)
            if not good_matches and bad_matches:
                bad_lines.append(bad_matches)
        num_dance_lines = len(good_lines) + len(bad_lines)
        self._log('Found %s of %s lines with dance styles: %s',
                  num_dance_lines, len(schedule_lines),
                  good_lines + bad_lines)
        # If more than 10% are good, then we found a good class
        self._log('Found %s of %s lines with good styles: %s',
                  len(good_lines), len(schedule_lines), good_lines)
        if len(good_lines
               ) > len(schedule_lines) / 10 and num_dance_lines >= 2:
            return 'found schedule list with good styles'
    return False
def classify(self):
    """Pick word-boundary handling and pre-tokenize the title and text.

    Text that is heavily CJK (>5% CJK characters) gets NO_WORD_BOUNDARIES,
    since those scripts have no space-delimited words. Preprocess-removal
    rules (global and per-language) are then applied, and short lines are
    collected into processed_short_lines for line-level matching.
    """
    #self.language not in ['ja', 'ko', 'zh-CN', 'zh-TW', 'th']:
    if cjk_detect.cjk_regex.search(self.search_text):
        cjk_chars = len(cjk_detect.cjk_regex.findall(self.search_text))
        # 1.0 * forces float division on Python 2.
        if 1.0 * cjk_chars / len(self.search_text) > 0.05:
            self.boundaries = regex_keywords.NO_WORD_BOUNDARIES
        else:
            self.boundaries = regex_keywords.WORD_BOUNDARIES
    else:
        self.boundaries = regex_keywords.WORD_BOUNDARIES
    self.processed_title = grammar_matcher.StringProcessor(
        self.title, self.boundaries)
    self.processed_text = grammar_matcher.StringProcessor(
        self.search_text, self.boundaries)
    # This must be first, to remove the fake keywords
    self.processed_title.real_tokenize(keywords.PREPROCESS_REMOVAL)
    self.processed_text.real_tokenize(keywords.PREPROCESS_REMOVAL)
    # None keys the language-independent removal rule.
    global_rule = styles.PREPROCESS_REMOVAL.get(None)
    per_language_rule = styles.PREPROCESS_REMOVAL.get(self.language)
    if global_rule:
        self.processed_title.real_tokenize(global_rule)
        self.processed_text.real_tokenize(global_rule)
    if per_language_rule:
        self.processed_title.real_tokenize(per_language_rule)
        self.processed_text.real_tokenize(per_language_rule)
    # Or if there are bad keywords, lets see if we can find good keywords on a short line
    short_lines = [
        line
        for line in self.processed_text.get_tokenized_text().split('\n')
        if len(line) < 500
    ]
    self.processed_short_lines = grammar_matcher.StringProcessor(
        '\n'.join(short_lines), self.boundaries)
def find_rules_in_text(text, rule_dict):
    """Return the styles (keys of rule_dict) whose rules match `text`.

    Competitor lists are stripped first so their style names don't count,
    and only the first 400 lines of lowercased text are examined.
    """
    # Eliminate all competitors, before trying to determine the style
    no_competitors_text = event_structure.find_competitor_list(text)
    if no_competitors_text:
        text = text.replace(no_competitors_text, '')
    found_styles = {}
    # Only grab the first 400 lines
    trimmed_text = '\n'.join(text.lower().split('\n')[:400])
    processed_text = grammar_matcher.StringProcessor(trimmed_text)
    processed_text.real_tokenize(keywords.PREPROCESS_REMOVAL)
    # so we can match this with vogue, but not with house
    processed_text.real_tokenize(keywords.HOUSE_OF)
    # .items() instead of .iteritems() keeps this Python-3 compatible.
    for style, rule in rule_dict.items():
        tokens = processed_text.get_tokens(rule)
        if tokens:
            found_styles[style] = tokens
    # Wrap in list() so the return type stays a list on Python 3
    # (matching the Python 2 .keys() behavior callers expect).
    return list(found_styles.keys())
def _compute_features(self, raw_documents):
    """Build the (n_documents x n_features) float feature matrix.

    `raw_documents` is a sequence of (event_id, fb_event) pairs; each
    fb_event is turned into one feature row by process_doc(), computed in
    parallel via joblib.
    """
    values = array.array(str("f"))
    # Parenthesized print works identically on Python 2 and 3.
    print("Preloading regexes")
    # Run every rule once so each regex is compiled/cached up front —
    # presumably so the parallel workers don't each pay that cost; confirm.
    dummy_processor = grammar_matcher.StringProcessor('')
    for _name, rule in named_rules_list:
        dummy_processor.count_tokens(rule)
    print("Computing Features")
    result = Parallel(n_jobs=7 if process_all else 1, verbose=10)(
        delayed(process_doc)(fb_event)
        for event_id, fb_event in raw_documents)
    for row_values in result:
        values.extend(row_values)
    X = np.array(values)
    # Reshape the flat value stream into one row per document.
    X.shape = (len(raw_documents), len(self.features))
    return X
def parse_classes(self, response):
    """Yield StudioClass items parsed from the schedule table on `response`.

    Table rows carry the day in the first cell only when it changes, so
    `date` persists across rows until a new day cell appears.
    """
    table = response.css('table')
    date = None  # Keep track of this row-to-row
    for row in table.css('tr'):
        cells = row.css('td')
        if not cells:
            continue
        row_contents = self._extract_text(row)
        # '---' rows look like visual separators; skip them.
        if not row_contents or '---' in row_contents:
            continue
        potential_day = self._extract_text(cells[0])
        if potential_day:
            date = dateparser.parse(potential_day).date()
        times = self._extract_text(cells[1])
        classname = self._extract_text(cells[2])
        if not times:
            continue
        teacher = self._extract_text(cells[3])
        href_cell = cells[3].xpath('.//@href').extract()
        # Use our NLP event classification keywords to figure out which BDC classes to keep
        processor = grammar_matcher.StringProcessor(classname)
        if not processor.has_token(rules.DANCE_STYLE):
            continue
        item = items.StudioClass()
        item['style'] = classname
        item['teacher'] = teacher
        if href_cell:
            item['teacher_link'] = href_cell[0].strip()
        # do we care?? row[4]
        start_time, end_time = parse_times(self._cleanup(times))
        # NOTE(review): if the first data row has no day cell, `date` is
        # still None here and datetime.combine() will raise — presumably
        # the scraped tables always lead with a day; confirm.
        item['start_time'] = datetime.datetime.combine(date, start_time)
        item['end_time'] = datetime.datetime.combine(date, end_time)
        for new_item in self._repeated_items_iterator(item):
            yield new_item
def runTest(self):
    """STYLE_LOCK should match 'locking' but not 'blocking', in Latin and CJK contexts."""
    # (input text, whether STYLE_LOCK is expected to match)
    cases = [
        (u'the blocking dance', False),
        (u'the locking dance', True),
        (u'今日はblockingです', False),
        (u'今日はlockingです', True),
        (u'今日はロックイングです', True),
        # Ideally we'd like this to return false,
        # but word segmentation is near-impossible with cjk (and japanese katakana phrases)
        (u'今日はブロックイングです', True),
    ]
    for text, expect_match in cases:
        sp = grammar_matcher.StringProcessor(text)
        tokens = sp.get_tokens(keywords.STYLE_LOCK)
        if expect_match:
            self.assertTrue(tokens)
        else:
            self.assertFalse(tokens)
def relevant_keywords(event):
    """Return the sorted, de-duplicated good keywords found in the event's text."""
    processor = grammar_matcher.StringProcessor(get_relevant_text(event))
    matches = processor.get_tokens(rules.ANY_GOOD)
    return sorted(set(matches))
def classify(self):
    """Run the full keyword pipeline and decide whether this is a dance event.

    Tokenizes the strong manual/style keywords into the processed text,
    gathers dance/event/wrong-style matches, computes an inverse keyword
    density, then walks an ordered rule chain that sets self.dance_event
    to a reason string (truthy) or False. Timings are recorded in
    self.times.
    """
    super(ClassifiedEvent, self).classify()
    # Running real_tokenize() on a rule replaces it with the name of the high-level rule.
    # Instead, let's grab the contents of the rule, which will assume is an Any(), and run on each of the Any()
    manual_dancer = rules.MANUAL_DANCER[grammar.STRONG]
    assert isinstance(manual_dancer, grammar.Name)
    assert isinstance(manual_dancer, grammar.Name)
    assert len(manual_dancer.children()) == 1
    assert isinstance(manual_dancer.children()[0], rules.Any)
    manual_dancer_children = manual_dancer.children()[0].children()
    for rule in manual_dancer_children:
        self.processed_text.real_tokenize(rule)
    self.processed_text.real_tokenize(keywords.GOOD_INSTANCE_OF_BAD_CLUB)
    #TODO(lambert): These grab things that are good, and keep them good, so they can't be stolen by other things.
    # Removing them appears to drop us from 9132 true positives down to 9108 true positives.
    # Maybe we can investigate exactly what's going on, and reduce the number of real_tokenize calls needed?
    self.processed_text.real_tokenize(keywords.DANCE)
    self.processed_text.real_tokenize(keywords.STYLE_BREAK)
    self.processed_text.real_tokenize(keywords.STYLE_ROCK)
    self.processed_text.real_tokenize(keywords.STYLE_POP)
    self.processed_text.real_tokenize(keywords.STYLE_LOCK)
    self.processed_text.real_tokenize(keywords.STYLE_WAACK)
    self.processed_text.real_tokenize(keywords.STYLE_HIPHOP)
    self.processed_text.real_tokenize(keywords.STYLE_HOUSE)
    self.processed_text.real_tokenize(keywords.STYLE_DANCEHALL)
    self.processed_text.real_tokenize(keywords.STYLE_KRUMP)
    self.processed_text.real_tokenize(keywords.STYLE_TURF)
    self.processed_text.real_tokenize(keywords.STYLE_LITEFEET)
    self.processed_text.real_tokenize(keywords.STYLE_FLEX)
    self.processed_text.real_tokenize(keywords.STYLE_BEBOP)
    self.processed_text.real_tokenize(keywords.STYLE_ALLSTYLE)
    search_text = self.processed_text.get_tokenized_text()
    # Or if there are bad keywords, lets see if we can find good keywords on a short line
    short_lines = [
        line for line in search_text.split('\n') if len(line) < 500
    ]
    self.processed_short_lines = grammar_matcher.StringProcessor(
        '\n'.join(short_lines), self.boundaries)
    #if not self.processed_text.get_tokens(rules.ANY_GOOD):
    #    self.dance_event = False
    #    return
    a = time.time()
    b = time.time()
    self.manual_dance_keywords_matches = self.processed_text.get_tokens(
        rules.MANUAL_DANCE[grammar.STRONG])
    self.times['manual_regex'] = time.time() - b
    self.real_dance_matches = self.processed_text.get_tokens(
        rules.GOOD_DANCE)
    # Romance-language text uses a different (stricter) event rule.
    if self.processed_text.get_tokens(dance_keywords.ROMANCE):
        event_matches = self.processed_text.get_tokens(
            rules.EVENT_WITH_ROMANCE_EVENT)
    else:
        event_matches = self.processed_text.get_tokens(rules.EVENT)
    club_and_event_matches = self.processed_text.get_tokens(
        dance_keywords.PRACTICE, dance_keywords.PERFORMANCE,
        dance_keywords.CONTEST)
    self.times['all_regexes'] = time.time() - a
    self.found_dance_matches = self.real_dance_matches + self.processed_text.get_tokens(
        dance_keywords.EASY_DANCE, keywords.AMBIGUOUS_DANCE_MUSIC,
        dance_keywords.EASY_CHOREO, keywords.HOUSE,
        keywords.TOO_EASY_VOGUE,
        keywords.EASY_VOGUE) + self.manual_dance_keywords_matches
    self.found_event_matches = event_matches + self.processed_text.get_tokens(
        keywords.EASY_EVENT, keywords.JAM) + club_and_event_matches
    self.found_wrong_matches = self.processed_text.get_tokens(
        all_styles.DANCE_WRONG_STYLE) + self.processed_text.get_tokens(
            keywords.CLUB_ONLY)
    title_wrong_style_matches = self.processed_title.get_tokens(
        all_styles.DANCE_WRONG_STYLE_TITLE)
    title_good_matches = self.processed_title.get_tokens(rules.ANY_GOOD)
    # re.subn's count gives us word counts: matched keyword words vs total
    # words (with URLs stripped first so they don't inflate the total).
    combined_matches_string = ' '.join(self.found_dance_matches +
                                       self.found_event_matches)
    dummy, combined_matches = re.subn(r'\w+', '', combined_matches_string)
    dummy, words = re.subn(r'\w+', '',
                           re.sub(r'\bhttp.*?\s', '', search_text))
    # +1 on both sides avoids division by zero and log(0).
    fraction_matched = 1.0 * (combined_matches + 1) / (words + 1)
    if not fraction_matched:
        self.calc_inverse_keyword_density = 100
    else:
        self.calc_inverse_keyword_density = -math.log(fraction_matched, 2)
    #print self.processed_text.count_tokens(dance_keywords.EASY_DANCE)
    #print len(club_and_event_matches)
    #print self.processed_text.count_tokens(all_styles.DANCE_WRONG_STYLE)
    #print self.processed_text.count_tokens(keywords.CLUB_ONLY)
    #strong = 0
    #for line in search_text.split('\n'):
    #    proc_line = f(line)
    #    matches = proc_line.get_tokens(rules.ANY_GOOD)
    #    good_parts = sum(len(x) for x in matches)
    #    if 1.0 * good_parts / len(line) > 0.1:
    #        # strong!
    #        strong += 1
    music_or_dance_keywords = self.processed_text.count_tokens(
        keywords.AMBIGUOUS_DANCE_MUSIC) + self.processed_text.count_tokens(
            keywords.HOUSE)
    # Ordered decision chain: earlier clauses take priority.
    if len(self.manual_dance_keywords_matches) >= 1:
        self.dance_event = 'obvious dancer or dance crew or battle'
    # one critical dance keyword
    elif len(self.real_dance_matches) >= 1:
        self.dance_event = 'obvious dance style'
    # If the title has a bad-style and no good-styles, mark it bad
    elif (
        title_wrong_style_matches and
        not (self.processed_title.get_tokens(keywords.AMBIGUOUS_DANCE_MUSIC)
             or self.manual_dance_keywords_matches or
             self.real_dance_matches)
    ):
        # these two are implied by the above, but do it here just in case future clause re-ordering occurs
        self.dance_event = False
    elif music_or_dance_keywords >= 1 and (
            len(event_matches) +
            self.processed_text.count_tokens(dance_keywords.EASY_CHOREO)
    ) >= 1 and self.calc_inverse_keyword_density < 5 and not (
            title_wrong_style_matches and not title_good_matches):
        self.dance_event = 'hiphop/funk and good event type'
    # one critical event and a basic dance keyword and not a wrong-dance-style and not a generic-club
    elif self.processed_text.count_tokens(
            dance_keywords.EASY_DANCE) >= 1 and (
                len(event_matches) + self.processed_text.count_tokens(
                    dance_keywords.EASY_CHOREO)
            ) >= 1 and not self.processed_text.count_tokens(
                all_styles.DANCE_WRONG_STYLE
            ) and self.calc_inverse_keyword_density < 5:
        self.dance_event = 'dance event thats not a bad-style'
    elif self.processed_text.count_tokens(
            dance_keywords.EASY_DANCE) >= 1 and len(
                self.found_event_matches
            ) >= 1 and not self.processed_text.count_tokens(
                all_styles.DANCE_WRONG_STYLE
            ) and self.processed_text.count_tokens(
                keywords.CLUB_ONLY) == 0:
        self.dance_event = 'dance show thats not a club'
    elif music_or_dance_keywords >= 1 and self.processed_text.count_tokens(
            dance_keywords.EASY_DANCE) >= 1:
        self.dance_event = 'good music and dance keyword'
    else:
        self.dance_event = False
    self.times['all_match'] = time.time() - a
def _street_style(style):
    """Return whether `style` (a class name string) matches our street-dance rules."""
    # Use our NLP event classification keywords to figure out which BDC classes to keep
    sp = grammar_matcher.StringProcessor(style)
    # Get rid of "Ballet with Pop Music"
    sp.real_tokenize(keywords.PREPROCESS_REMOVAL)
    return sp.has_token(rules.DANCE_STYLE)
def notMatchRule(self, rule, s):
    """Assert that `rule` finds no tokens in the string `s`."""
    tokens = grammar_matcher.StringProcessor(s).get_tokens(rule)
    self.assertFalse(tokens)