Esempio n. 1
0
class Source(db.Model):
    """A named origin of content that owns feeds and articles."""
    id = db.Column(db.Integer, primary_key=True)
    # Source names are constrained to be unique.
    name = db.Column(db.String(255), unique=True)

    # Keep articles on the Source so if an
    # article's feed dies, we still know where the Article came from.
    # `lazy='dynamic'` exposes the articles as a query rather than a list.
    articles = db.relationship('Article', backref='source', lazy='dynamic')
    # Each related Feed gets a `source` back-reference to this object.
    feeds = db.relationship('Feed', backref='source')

    def __init__(self, name):
        """Create a source with the given `name`."""
        self.name = name
Esempio n. 2
0
File: story.py Progetto: frnsys/drip
class Story(Keywordable):
    """A group of related events.

    The primary key is shared with the `keywordable` table, so a story
    carries keywords like any other `Keywordable`.
    """
    story_id = db.Column('id',
                         db.Integer,
                         db.ForeignKey('keywordable.id'),
                         primary_key=True)
    # Dynamic relationship: `self.events` is a query-like collection.
    events = db.relationship('Event',
                             backref='story',
                             lazy='dynamic',
                             foreign_keys=[Event.story_id])
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    def __init__(self, event):
        """Start a story from a single seed `event`."""
        self.events = [event]
        self.update()

    def add(self, event):
        """Attach `event` to this story (does not recompute keywords)."""
        self.events.append(event)

    @property
    def vecs(self):
        """Vectors of all member events, stacked with `vstack`."""
        return vstack([e.vec for e in self.events])

    @property
    def age(self):
        """Time elapsed since `created_at`, as a `timedelta` (naive UTC)."""
        return datetime.utcnow() - self.created_at

    def update(self):
        """Recompute the story's keywords from its events.

        Also refreshes `updated_at`, which otherwise would keep its
        insert-time default forever.
        """
        # TODO: derive `summary` (multisummarize) and `title` for stories
        # once those are supported at the story level.
        self.keywords = [
            Keyword.find_or_create(name=kw)
            for kw, _score in keywords(self.events)
        ]
        self.updated_at = datetime.utcnow()

    @classmethod
    def candidates(cls, event):
        """Search stories to find candidates for the event.

        Returns a list of `(story, score)` pairs sorted by descending
        score, where the score is the sum of `global_term_idf` weights of
        the keywords the story shares with `event`.
        """
        # TODO this could be made more efficient
        candidates = defaultdict(float)
        for kw in event.keywords:
            weight = global_term_idf[kw.name]
            for s in kw.subjects:
                # `subjects` holds every Keywordable using the keyword;
                # only stories are relevant here.
                if not isinstance(s, cls):
                    continue
                candidates[s] += weight
        return sorted(candidates.items(), key=lambda t: t[1], reverse=True)

    def as_dict(self):
        """Return a plain-dict representation, including nested events
        and keywords."""
        whitelist = ['id', 'created_at', 'updated_at']
        data = {attr: getattr(self, attr) for attr in whitelist}
        data['events'] = [e.as_dict() for e in self.events]
        data['keywords'] = [k.as_dict() for k in self.keywords]
        return data
Esempio n. 3
0
class Keyword(db.Model):
    """A keyword identified by its `name`."""
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode)

    def __init__(self, name):
        self.name = name

    @classmethod
    def find_or_create(cls, **kwargs):
        """Return the first row matching `kwargs`, creating and committing
        a new one when no match exists."""
        existing = cls.query.filter_by(**kwargs).first()
        if existing is not None:
            return existing

        created = cls(**kwargs)
        db.session.add(created)
        db.session.commit()
        return created

    def as_dict(self):
        """Return the keyword's id and name as a dict."""
        return dict(id=self.id, name=self.name)
Esempio n. 4
0
class Keywordable(db.Model):
    """Polymorphic base for models that can be tagged with keywords."""
    id = db.Column(db.Integer, primary_key=True)
    # Discriminator column holding the polymorphic identity of each row.
    type = db.Column('type', db.String(50))

    @declared_attr
    def keywords(cls):
        # Many-to-many link to Keyword; each Keyword gets a dynamic
        # `subjects` back-reference to the objects using it.
        subjects = db.backref('subjects', lazy='dynamic')
        return db.relationship('Keyword',
                               secondary=keywordables_keywords,
                               backref=subjects)

    @declared_attr
    def __mapper_args__(cls):
        # Every class is identified by its own name; only the base class
        # additionally declares the discriminator column.
        mapper_args = {'polymorphic_identity': cls.__name__}
        if cls.__name__ == 'Keywordable':
            mapper_args['polymorphic_on'] = cls.type
        return mapper_args

    @declared_attr
    def __tablename__(cls):
        # Table name is the lower-cased class name.
        name = cls.__name__
        return name.lower()
Esempio n. 5
0
class Feed(db.Model):
    """A feed URL belonging to a Source, from which articles are pulled."""
    id = db.Column(db.Integer, primary_key=True)
    url = db.Column(db.Unicode)
    errors = db.Column(db.Integer, default=0)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    articles = db.relationship('Article', backref='feed', lazy='dynamic')
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))

    def __init__(self, url, source):
        """Create a feed for `url` owned by `source`, with no errors."""
        self.url = url
        self.source = source
        self.errors = 0

    def get_articles(self):
        """Yield a new `Article` for each unseen, non-trivial feed entry.

        Entries are skipped when they have no usable link, when an article
        with the same URL (or the same title for this feed's source)
        already exists, when `fetch` returns nothing, or when the fetched
        text is 150 tokens or fewer.

        Raises:
            The feed's `bozo_exception`, unless it is one of the tolerated
            kinds (character-encoding override, non-XML content type).
        """
        data = feedparser.parse(self.url)

        # If the `bozo` value is anything
        # but 0, there was an error parsing (or connecting) to the feed.
        if data.bozo:
            # Some errors are ok.
            tolerated = (feedparser.CharacterEncodingOverride,
                         feedparser.NonXMLContentType)
            if not isinstance(data.bozo_exception, tolerated):
                raise data.bozo_exception

        for entry in data.entries:
            # An entry without a link cannot be fetched; skip it instead of
            # letting a KeyError/IndexError abort the remaining entries.
            links = entry.get('links')
            if not links or 'href' not in links[0]:
                continue
            url = links[0]['href']

            # Check for an existing Article.
            # If one exists, skip.
            if Article.query.filter_by(
                    url=url).count() or Article.query.filter_by(
                        source=self.source, title=entry['title']).count():
                continue

            a_data = fetch(url)
            if a_data is None:
                continue

            # Skip empty or short articles (which may be 404 pages)
            if len(word_tokenize(a_data['text'])) <= 150:
                continue

            a_data['feed'] = self

            # Although `newspaper` can extract published datetimes using metadata,
            # generally the published datetime included with the RSS entry will
            # be more precise (and sometimes `newspaper` does not successfully
            # extract a published datetime).
            # (see https://github.com/codelucas/newspaper/blob/41b930b467979577710b86ecb93c2a952e5c9a0d/newspaper/extractors.py#L166)
            if 'published' in entry:
                a_data['published'] = parser.parse(entry['published'])

            yield Article(**a_data)
Esempio n. 6
0
class Article(Keywordable):
    """A single fetched article, linked to its feed, source and event.

    The primary key is shared with the `keywordable` table.
    """
    article_id = db.Column(
        'id', db.Integer, db.ForeignKey('keywordable.id'), primary_key=True)
    url = db.Column(db.Unicode)
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    image = db.Column(db.String)
    score = db.Column(db.Float, default=0.0)
    published = db.Column(db.DateTime)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'))
    event_id = db.Column(db.Integer, db.ForeignKey('event.id'))
    authors = db.relationship(
        'Author',
        secondary=articles_authors,
        backref=db.backref('articles', lazy='dynamic'))

    def __init__(self, url, title, text, html, image, published, authors,
                 keywords, feed):
        """Populate the article from extracted data.

        `authors` and `keywords` are iterables of names; each is resolved
        to a stored Author / Keyword via `find_or_create`. Duplicate
        keyword names are collapsed. The source is taken from `feed`.
        """
        # Plain scalar fields.
        self.url = url
        self.title = title
        self.text = text
        self.html = html
        self.image = image
        self.published = published

        # Resolve names to model instances (authors first, then keywords).
        author_objs = []
        for author_name in authors:
            author_objs.append(Author.find_or_create(name=author_name))
        self.authors = author_objs

        keyword_objs = []
        for keyword_name in set(keywords):
            keyword_objs.append(Keyword.find_or_create(name=keyword_name))
        self.keywords = keyword_objs

        self.feed = feed
        self.source = feed.source

    @property
    def vec(self):
        """Vector for the cleaned title and text, computed on each access."""
        # TODO for now, not storing vec - need to setup pytables or something similar
        combined = '\n'.join((self.title, self.text))
        vectors = vectorizer.vectorize([clean(combined)])
        return vectors[0]

    def as_dict(self):
        """Return a plain-dict representation with nested authors and
        keywords."""
        fields = (
            'id', 'title', 'url', 'text', 'score', 'published', 'feed_id',
            'source_id', 'event_id',
        )
        data = {}
        for field in fields:
            data[field] = getattr(self, field)
        data['authors'] = [author.as_dict() for author in self.authors]
        data['keywords'] = [keyword.as_dict() for keyword in self.keywords]
        return data
Esempio n. 7
0
class Event(Keywordable):
    """A group of articles covering the same happening.

    The primary key is shared with the `keywordable` table.
    """
    event_id = db.Column('id',
                         db.Integer,
                         db.ForeignKey('keywordable.id'),
                         primary_key=True)
    # Dynamic relationship: `self.articles` is a query-like collection.
    articles = db.relationship('Article',
                               backref='event',
                               lazy='dynamic',
                               foreign_keys=[Article.event_id])
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    story_id = db.Column(db.Integer, db.ForeignKey('story.id'))
    title = db.Column(db.Unicode)
    summary = db.Column(db.UnicodeText)

    def __init__(self, article):
        """Start an event from a single seed `article`."""
        self.articles = [article]
        self.created_at = article.published
        self.update()

    def add(self, article):
        """Attach `article` to this event (does not recompute metadata)."""
        self.articles.append(article)

    @property
    def vecs(self):
        """Vectors of all member articles, stacked with `vstack`."""
        return vstack([a.vec for a in self.articles])

    @property
    def age(self):
        """Time elapsed since `created_at`, as a `timedelta` (naive UTC)."""
        return datetime.utcnow() - self.created_at

    @property
    def summary_pts(self):
        """The summary split into its newline-separated points."""
        return self.summary.split('\n')

    @property
    def text(self):
        """Texts of all member articles joined by newlines."""
        return '\n'.join([a.text for a in self.articles])

    def update(self):
        """Recompute summary, title, keywords and dates from the articles."""
        self.summary = '\n'.join(multisummarize(self.articles))
        self.title = title(self.articles)
        self.keywords = [
            Keyword.find_or_create(name=kw)
            for kw, _score in keywords(self.articles)
        ]

        # Set oldest published date as this event's date. Articles may lack
        # a published date; ignore those so `min` does not fail comparing
        # None with datetimes, and keep the current value if none is known.
        published = [
            a.published for a in self.articles if a.published is not None
        ]
        if published:
            self.created_at = min(published)

        # Record when the event was last recomputed; without this the
        # column would keep its insert-time default forever.
        self.updated_at = datetime.utcnow()

    @classmethod
    def candidates(cls, dt):
        """Return "active" events - those that are not too old given a
        datetime `dt` (created less than 36 hours before it)."""
        # Equivalent to `dt - created_at < 36h`, but the arithmetic is done
        # in Python so the query is a plain column comparison that does not
        # rely on backend-specific datetime subtraction.
        cutoff = dt - timedelta(hours=36)
        return cls.query.filter(cls.created_at > cutoff).all()

    def as_dict(self):
        """Return a plain-dict representation with nested articles and
        keywords; the summary is given as a list of points."""
        whitelist = ['id', 'title', 'created_at', 'updated_at', 'story_id']
        data = {attr: getattr(self, attr) for attr in whitelist}
        data['summary'] = self.summary_pts
        data['articles'] = [a.as_dict() for a in self.articles]
        data['keywords'] = [k.as_dict() for k in self.keywords]
        return data