Example #1
0
File: models.py Project: iangow/hal
    def _markers(self, window=3):
        """Score every paragraph of the document and flag the likely bio run.

        Builds a DataFrame with one row per paragraph returned by
        ``paragraphs`` for the decoded ``self.text()`` and these columns:

        * ``text`` -- the paragraph itself.
        * ``bio_word_match`` -- result of ``matches(pars, self.BIO_WORDS)``.
        * one column per director last name -- the fraction of the
          paragraph's characters covered by case-insensitive occurrences of
          that name (0.0 for an empty paragraph).
        * ``one_name`` -- row-wise maximum over the last-name columns.
        * ``name`` / ``bio`` -- centered rolling maximum of ``one_name`` /
          ``bio_word_match`` over ``window`` paragraphs.
        * ``mark`` -- centered rolling mean of ``name * bio``.
        * ``group`` -- id of the run of consecutive marked paragraphs: the
          row index at which the run started, 0 for unmarked rows.
        * ``flag`` -- True for rows belonging to the selected group.

        :param window: width, in paragraphs, of every rolling window.
        :returns: the DataFrame described above.
        """
        text = self.text().decode('utf-8')
        pars = paragraphs(text)
        df = pd.DataFrame({
            'bio_word_match': matches(pars, self.BIO_WORDS),
            'text': pars,
        })

        def density(paragraph, name):
            # An empty paragraph cannot contain a name; returning 0.0 here
            # avoids a ZeroDivisionError on the length normalisation below.
            if not paragraph:
                return 0.0
            hits = paragraph.lower().count(name.lower())
            return float(len(name)) * float(hits) / float(len(paragraph))

        last_names = self._director_last_names()
        for k in last_names:
            # A last name must not collide with a column that already exists
            # ('text', 'bio_word_match', or an earlier identical name).
            assert k not in df
            # Bind k as a default so the callable does not depend on the
            # loop variable's later values.
            df[k] = df.text.map(lambda s, k=k: density(s, k))
        df['one_name'] = df[last_names].apply(max, 1)

        # Spread each last-name hit over a centered window of `window` paragraphs
        df['name'] = pd.rolling_max(df.one_name, window=window, center=True)

        # Spread each bio-word hit over a centered window of `window` paragraphs
        df['bio'] = pd.rolling_max(df.bio_word_match, window=window, center=True)

        # Multiply them together and smooth the product over the same window
        df['mark'] = pd.rolling_mean(df.name * df.bio, window=window, center=True)

        # Label runs of consecutive marked rows with the index where the run began.
        # NOTE(review): centered rolling windows presumably leave NaN near both
        # ends of the frame, and NaN is truthy, so those rows pass `if flag` --
        # confirm this is intended.
        # NOTE(review): at i == 0, s[i-1] reads the (still zero) last element,
        # so row 0 always receives group id 0, the same id as "unmarked".
        s = np.zeros(len(df.mark))
        for i, flag in enumerate(df.mark):
            if flag:
                if s[i-1]:
                    s[i] = s[i-1]
                else:
                    s[i] = i
        df['group'] = s

        # Pick the non-zero group with the most rows.
        # NOTE(review): this relies on Series.argmax returning the index label
        # (older pandas behaviour); newer pandas returns a position -- verify
        # against the pandas version in use.
        s = df.group.value_counts()
        g = s[s.index != 0].argmax()
        df['flag'] = df.group == g

        return df
Example #2
0
File: models.py Project: iangow/hal
def matching_paragraphs(text, last_names):
    """Return the paragraphs of *text* that mention any of *last_names*.

    Names are matched literally (regex metacharacters are escaped) and
    case-insensitively, anywhere inside a paragraph. Empty names are
    ignored; when no usable name remains, nothing can match and an empty
    list is returned.

    :param text: document text, split into paragraphs with ``paragraphs``.
    :param last_names: iterable of last-name strings.
    :returns: list of matching paragraphs, in the order ``paragraphs``
        yields them.
    """
    # Escape each name so characters such as '.', '(' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    names = [re.escape(name) for name in last_names if name]
    if not names:
        # An empty alternation would match every paragraph.
        return []
    pattern = re.compile('(%s)' % '|'.join(names), re.IGNORECASE)
    return [p for p in paragraphs(text) if pattern.search(p)]