def mark_as_parsed(url):
    """Flag the RawData row matching *url* as already parsed into an entry.

    No-op (no commit) when no row matches the url.
    """
    row = session.query(RawData).filter(RawData.url == url).first()
    if row is None:
        return
    row.parsed_as_entry = True
    session.add(row)
    session.commit()
def create_entry(**kwargs):
    """Delegate to RawData.create, but only when a RawData row already
    exists for ``kwargs['url']``.

    Returns whatever RawData.create returns, or None when no row matches.
    """
    target_url = kwargs.get('url')
    existing = (session.query(RawData)
                .filter(RawData.url == target_url)
                .first())
    if existing is None:
        return None
    return RawData.create(**kwargs)
def get_by_depth(depth, offset=140000, limit=30000):
    """Fetch one paging window of RawData rows at the given crawl depth.

    The offset/limit were hard-coded in the original; they are now
    parameters whose defaults preserve the original window, so existing
    callers are unaffected.

    :param depth: value to match against ``RawData.depth``
    :param offset: number of matching rows to skip
    :param limit: maximum number of rows to return
    :return: list of dicts with ``id``, ``depth`` and ``url`` keys
    """
    records = session.query(RawData).\
        filter(RawData.depth == depth).\
        with_entities(RawData.id, RawData.depth, RawData.url).\
        offset(offset).limit(limit).all()
    return [{"id": x.id, "depth": x.depth, "url": x.url} for x in records]
def count(domain):
    """Count Article rows whose url contains *domain* as a substring.

    Executes a bare ``SELECT count(*)`` against the query's statement
    rather than ``Query.count()``, which would wrap the query in a
    subquery (speed-up trick: https://gist.github.com/hest/8798884).

    :param domain: substring to match inside Article.url
    :return: integer row count
    """
    base_query = session.query(Article).filter(
        Article.url.like("%" + domain + "%"))
    count_stmt = base_query.statement.with_only_columns(
        [func.count()]).order_by(None)
    return base_query.session.execute(count_stmt).scalar()
def create(domain, spider_name, article_num=None):
    """Insert or update the Domain row keyed by *domain*.

    On insert, both created_ts and updated_ts are set to now; on update,
    only updated_ts moves forward.

    :param domain: domain name used as the lookup key
    :param spider_name: spider responsible for this domain
    :param article_num: optional article count to store
    :return: the persisted Domain record
    """
    now_date = datetime.datetime.utcnow()
    record = session.query(Domain).\
        filter_by(domain=domain).first()
    if record is None:
        record = Domain(domain=domain,
                        spider_name=spider_name,
                        article_num=article_num,
                        created_ts=now_date,
                        updated_ts=now_date)
    else:
        # The original had a bare `record.domain` expression here -- a
        # no-op attribute access (likely a half-typed assignment). The
        # row was matched on domain, so there is nothing to set; the
        # dead statement is removed.
        record.spider_name = spider_name
        record.article_num = article_num
        record.updated_ts = now_date
    session.add(record)
    session.commit()
    return record
def create(**kwargs):
    """Upsert a RawData row keyed by ``kwargs['url']``.

    On failure the transaction is rolled back and the exception
    re-raised; the session is closed in every case.

    :return: 'insert' when a new row was created, 'update' otherwise
    """
    url = kwargs.get('url')
    ts = datetime.datetime.utcnow()
    row = session.query(RawData) \
        .filter(RawData.url == url) \
        .first()
    if row is None:
        action = 'insert'
        row = RawData(created_ts=ts, updated_ts=ts)
    else:
        action = 'update'
        row.updated_ts = ts
    row.extend(kwargs)
    try:
        session.add(row)
        session.commit()
    except:  # noqa: E722 -- intentionally broad; rolled back and re-raised
        session.rollback()
        raise
    finally:
        session.close()
    return action
def raw_to_article():
    """Re-parse a window of stored raw medium.com pages into Article rows.

    Fixes over the original:
    * ``filter(RawData.html is not None)`` evaluated the Python identity
      test on the Column object -- always True -- so rows with NULL html
      were never excluded. ``RawData.html.isnot(None)`` emits the
      intended ``html IS NOT NULL`` SQL.
    * The medium.com regex is compiled once, outside the loop, with the
      dots escaped so they match a literal '.' instead of any character.
    * print statements are parenthesized (single-argument form behaves
      identically on Python 2 and 3).
    """
    # Matches urls like medium.com/@user/slug or medium.com/publication/slug.
    medium_re = re.compile(r'.*medium\.com/@?.*/.*')
    data = session.query(RawData).\
        filter(RawData.depth == 6).\
        filter(RawData.http_status == '200').\
        filter(RawData.html.isnot(None)).\
        offset(70000).limit(10000).all()
    num = 0
    for record in data:
        if medium_re.match(str(record.url)) is None:
            continue
        # TODO remove source= from url ? should confirm
        response = Selector(text=record.html)
        url = remove_params(record.url)
        if url != record.url:
            print(url)
        item = parse_html(response, url=str(url))
        if item:
            article.insert(item, logger=logger)
        print(record.url)
        num += 1
    print('success ' + str(num))
def check_exists(link):
    """Return True when an Article row with url == *link* already exists."""
    match = session.query(Article).filter(Article.url == link).first()
    return match is not None