def _markers(self, window=3):
    """Score every paragraph of the document and flag the dominant run.

    The decoded document text is split with ``paragraphs`` and loaded into
    a DataFrame with one row per paragraph.  Columns are added in this
    order:

    * ``bio_word_match`` -- result of ``matches(pars, self.BIO_WORDS)``.
    * ``text`` -- the paragraph itself.
    * one column per director last name -- share of the paragraph's
      characters taken up by case-insensitive occurrences of that name.
    * ``one_name`` -- row-wise maximum over the last-name columns.
    * ``name`` / ``bio`` -- centered rolling maximum of ``one_name`` and
      ``bio_word_match``.
    * ``mark`` -- centered rolling mean of ``name * bio``.
    * ``group`` -- label shared by consecutive flagged paragraphs (the row
      position where the run started), 0 for unflagged rows.
    * ``flag`` -- True for rows in the most frequent non-zero group.

    Args:
        window: Width, in paragraphs, of every centered rolling window.

    Returns:
        The pandas DataFrame described above.

    Raises:
        AssertionError: If a last name collides with an existing column.
        ValueError: Raised by ``idxmax`` when no row belongs to a
            non-zero group.
    """
    def rolling(series, stat):
        # The top-level pd.rolling_max / pd.rolling_mean functions were
        # removed from pandas; use Series.rolling when it exists and fall
        # back to the legacy functions on old installations.
        if hasattr(series, 'rolling'):
            return getattr(series.rolling(window=window, center=True), stat)()
        return getattr(pd, 'rolling_' + stat)(series, window=window, center=True)

    def name_density(name):
        # Lower-casing and measuring the name once keeps that work out of
        # the per-paragraph call.
        needle = name.lower()
        weight = float(len(name))

        def density(paragraph):
            # A zero-length paragraph cannot contain a name; returning 0.0
            # avoids dividing by zero.
            if not paragraph:
                return 0.0
            hits = paragraph.lower().count(needle)
            return weight * float(hits) / float(len(paragraph))

        return density

    text = self.text().decode('utf-8')
    pars = paragraphs(text)
    df = pd.DataFrame({
        'bio_word_match': matches(pars, self.BIO_WORDS),
        'text': pars,
    })

    last_names = self._director_last_names()
    for k in last_names:
        # Each name becomes a column, so it must not shadow an existing one.
        assert k not in df
        df[k] = df['text'].map(name_density(k))
    df['one_name'] = df[last_names].max(axis=1)

    # Spread each last-name hit over the surrounding `window` paragraphs.
    df['name'] = rolling(df['one_name'], 'max')
    # Spread each bio-word hit over the surrounding `window` paragraphs.
    df['bio'] = rolling(df['bio_word_match'], 'max')
    # A paragraph scores only where both signals are present nearby.
    df['mark'] = rolling(df['name'] * df['bio'], 'mean')

    # Label runs of consecutive flagged paragraphs with the row position
    # at which the run started; 0 means "not part of any run".
    # NOTE(review): NaN marks (produced at the window edges) are truthy and
    # are therefore treated as flagged here -- confirm this is intended.
    marks = df['mark']
    labels = np.zeros(len(marks))
    for i, flag in enumerate(marks):
        if not flag:
            continue
        # The first row has no predecessor; treat it as unlabelled rather
        # than reading the last element through a negative index.
        previous = labels[i - 1] if i else 0
        labels[i] = previous if previous else i
    df['group'] = labels

    sizes = df['group'].value_counts()
    # idxmax returns the group *label* on every pandas version, whereas
    # Series.argmax returns a position on current releases.
    largest = sizes[sizes.index != 0].idxmax()
    df['flag'] = df['group'] == largest
    return df
def matching_paragraphs(text, last_names):
    """Return the paragraphs of ``text`` that mention any of ``last_names``.

    Matching is case-insensitive and each name is treated as a literal
    substring.

    Args:
        text: Document text; it is split with ``paragraphs``.
        last_names: Iterable of last-name strings to look for.

    Returns:
        A list of the matching paragraphs, in document order.  The list is
        empty when ``last_names`` is empty.
    """
    # Escape each name so regex metacharacters in it (".", "(", "+", ...)
    # are matched literally instead of altering or breaking the pattern.
    escaped = [re.escape(name) for name in last_names]
    if not escaped:
        # An empty alternation "()" would match every paragraph.
        return []
    pattern = re.compile('(%s)' % '|'.join(escaped), re.IGNORECASE)
    return [p for p in paragraphs(text) if pattern.search(p)]