Esempio n. 1
0
 def mark_as_parsed(url):
     """Flag the first RawData row whose url equals ``url`` as parsed.

     Sets ``parsed_as_entry`` to True on that row and commits the session.
     Nothing is written when no row matches.

     :param url: url value to look up in RawData.
     :return: None
     """
     matches = session.query(RawData).filter(RawData.url == url)
     row = matches.first()
     if row is None:
         return
     row.parsed_as_entry = True
     session.add(row)
     session.commit()
Esempio n. 2
0
 def create_entry(**kwargs):
     """Create a RawData entry unless a row with the same url already exists.

     The ``url`` keyword (None when absent) is used for the existence check;
     all keywords are forwarded unchanged to ``RawData.create``.

     :param kwargs: field values for the new entry; ``url`` is the lookup key.
     :return: None in every case.
     """
     lookup = session.query(RawData).filter(RawData.url == kwargs.get('url'))
     if lookup.first() is None:
         RawData.create(**kwargs)
Esempio n. 3
0
 def get_by_depth(depth, offset=140000, limit=30000):
     """Return one window of RawData rows at the given crawl depth.

     Only the ``id``, ``depth`` and ``url`` columns are selected. The rows
     are NOT filtered on ``parsed_as_entry``; an earlier docstring mentioned
     such a filter but the query never applied it.

     NOTE(review): the query has no ORDER BY, so which rows fall inside the
     offset/limit window is decided by the database -- confirm that callers
     do not rely on a stable window.

     :param depth: value compared against ``RawData.depth``.
     :param offset: number of matching rows to skip (default keeps the
         previously hard-coded 140000).
     :param limit: maximum number of rows to return (default keeps the
         previously hard-coded 30000).
     :return: list of dicts with the keys ``id``, ``depth`` and ``url``.
     """
     records = session.query(RawData).\
         filter(RawData.depth == depth).\
         with_entities(RawData.id, RawData.depth, RawData.url).\
         offset(offset).limit(limit).all()
     return [{"id": x.id, "depth": x.depth, "url": x.url} for x in records]
Esempio n. 4
0
 def count(domain):
     """Count Article rows whose url contains ``domain``.

     Uses the count technique from https://gist.github.com/hest/8798884,
     which the original author adopted to speed up SQLAlchemy counting:
     the query's statement is reduced to a single ``count()`` column and
     its ordering is removed before execution.

     ``domain`` is concatenated into the LIKE pattern as-is, so any ``%``
     or ``_`` characters it contains act as wildcards.

     :param domain: substring to search for in ``Article.url``.
     :return: scalar result of the count statement.
     """
     pattern = "%" + domain + "%"
     matching = session.query(Article).filter(Article.url.like(pattern))
     counting_stmt = matching.statement.with_only_columns([func.count()])
     counting_stmt = counting_stmt.order_by(None)
     return matching.session.execute(counting_stmt).scalar()
Esempio n. 5
0
    def create(domain, spider_name, article_num=None):
        """Insert or update the Domain row for ``domain`` and return it.

        When no row with this domain exists, a new one is created with both
        timestamps set to the current UTC time. Otherwise the existing row's
        ``spider_name``, ``article_num`` and ``updated_ts`` are overwritten.

        :param domain: domain value used as the lookup key.
        :param spider_name: spider name stored on the row.
        :param article_num: article count stored on the row (default None).
        :return: the inserted or updated Domain instance.
        :raises: whatever the session raises on add/commit; the session is
            rolled back before the exception propagates.
        """
        now_date = datetime.datetime.utcnow()
        record = session.query(Domain).\
            filter_by(domain=domain).first()
        if record is None:
            record = Domain(domain=domain,
                            spider_name=spider_name,
                            article_num=article_num,
                            created_ts=now_date,
                            updated_ts=now_date)
        else:
            # The row was found by its domain, so only the other fields change.
            record.spider_name = spider_name
            # NOTE(review): an omitted article_num replaces the stored value
            # with None -- confirm this is intended.
            record.article_num = article_num
            record.updated_ts = now_date
        try:
            session.add(record)
            session.commit()
        except Exception:
            # Roll back so the session stays usable after a failed commit.
            # The session is not closed here because the record is returned.
            session.rollback()
            raise

        return record
Esempio n. 6
0
    def create(**kwargs):
        """Insert or update the RawData row identified by the ``url`` keyword.

        A missing row is created with ``created_ts`` and ``updated_ts`` set to
        the current UTC time; an existing row only gets a fresh ``updated_ts``.
        The keywords are then applied through ``record.extend`` and the change
        is committed. The session is rolled back on failure and closed in
        every case.

        :param kwargs: field values for the row; ``url`` is the lookup key.
        :return: ``'insert'`` when a new row was created, ``'update'`` otherwise.
        :raises: re-raises whatever add/commit raised, after the rollback.
        """
        timestamp = datetime.datetime.utcnow()
        lookup = session.query(RawData).filter(RawData.url == kwargs.get('url'))
        existing = lookup.first()
        if existing is not None:
            existing.updated_ts = timestamp
            record, action = existing, 'update'
        else:
            record = RawData(created_ts=timestamp, updated_ts=timestamp)
            action = 'insert'
        record.extend(kwargs)
        try:
            session.add(record)
            session.commit()
        except:
            # Undo the failed transaction, then let the caller see the error.
            session.rollback()
            raise
        finally:
            session.close()

        return action
Esempio n. 7
0
def raw_to_article(depth=6, offset=70000, limit=10000):
    """Parse stored raw pages into articles and print a success count.

    Selects RawData rows at ``depth`` with ``http_status == '200'`` and a
    non-NULL ``html``, restricted to the given offset/limit window. Every row
    whose url matches the medium.com pattern is parsed with ``parse_html``;
    truthy results are passed to ``article.insert``. Urls are printed as they
    are processed, followed by a final ``success <n>`` line.

    :param depth: value compared against ``RawData.depth`` (default 6, the
        previously hard-coded value).
    :param offset: number of matching rows to skip (default 70000, as before).
    :param limit: maximum number of rows to load (default 10000, as before).
    :return: None
    """
    # Compiled once instead of on every iteration; the raw string holds the
    # same pattern the non-raw literal produced.
    medium_url = re.compile(r'.*medium.com\/@?.*\/.*')
    # `RawData.html is not None` is evaluated by Python to True and never
    # reaches SQL; isnot(None) emits the intended IS NOT NULL condition.
    data = session.query(RawData).\
        filter(RawData.depth == depth).\
        filter(RawData.http_status == '200').\
        filter(RawData.html.isnot(None)).\
        offset(offset).limit(limit).all()
    num = 0
    for record in data:
        if medium_url.match(str(record.url)) is None:
            continue
        # TODO remove source= from url ? should confirm
        response = Selector(text=record.html)
        url = remove_params(record.url)
        if url != record.url:
            # Show the cleaned url whenever parameters were removed.
            print(url)
        item = parse_html(response, url=str(url))
        if item:
            article.insert(item, logger=logger)
            print(record.url)
            num += 1
    print('success ' + str(num))
Esempio n. 8
0
 def check_exists(link):
     """Tell whether an Article row with exactly this url is stored.

     :param link: url value compared against ``Article.url``.
     :return: True when at least one matching row exists, False otherwise.
     """
     found = session.query(Article).filter(Article.url == link).first()
     if found is None:
         return False
     return True