Example #1
0
tags = jsons['results']

# Paged query URL for the entries of one tag. Placeholders, in order:
# tag title (%s), page size / limit (%d), offset / skip (%d).
fmt = """https://api.leancloud.cn/1.1/classes/Entry?&where={"tagsTitleArray":{"$in":["%s"]}}&include=user&limit=%d&skip=%d&order=-rankIndex"""
page_size = 20

urls = []
for tag in tags:
    total = int(tag['entriesCount'])
    # One URL per page, stepping the offset until all `total` entries of the
    # tag are covered.
    for i in range(0, total, page_size):
        url = fmt % (tag['title'], page_size, i)
        print(url)
        urls.append(url)

#print(urls)
# Persist one SpiderTask per collected URL; each task is committed on its own
# so that already-stored tasks survive a failure on a later URL.
db = Session()
for url in urls:
    task = SpiderTask()
    task.project = 'juejin'
    # The task id is derived from the URL via md5str.
    task.task_id, task.url = md5str(url), url
    task.callback = 'on_index_page'
    task.priority = task.last_time = task.status = 0
    db.add(task)
    db.commit()
    print(url)
Example #2
0
CREATE TABLE `wec_topic` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) NOT NULL,
  `uuid` varchar(32) NOT NULL,
  `url` varchar(1000) NOT NULL,
  `view_count` int default 0,
  `content` MEDIUMTEXT,
  `created_at` datetime DEFAULT NULL,
  `updated_at` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `ix_task_id` (`uuid`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8;

"""

db = Session()

# Tasks with status 200 whose callback is 'on_entry_page'.
tasks = SpiderTask.query.filter_by(status=200, callback='on_entry_page').all()
for task in tasks:
    # first() returns None when no row matches. one() would raise
    # NoResultFound instead, which made the None check below unreachable.
    res = SpiderResult.query.filter_by(task_id=task.task_id).first()
    if res is None:
        print("Error, not found result %s" % task.task_id)
        # Without a result row there is nothing to save, and res.content
        # below would fail on None.
        continue
    if task.result:
        print("save res %s" % task.task_id)
        # task.result and res.content are both JSON-encoded strings.
        resjson = json.loads(task.result)
        cnt = resjson['collectionCount']
        title = resjson['title']
        content = json.loads(res.content)
        task_id = res.task_id
        post = PostTopic()
Example #3
0
        fp.write(spider)
else:
    print("path %s already exists." % (filename))

# Scaffold files for the project directory: the YAML config and an empty
# package marker. Files that already exist are left untouched.
config_filename = "projects/%s/project.yaml" % project_name
init_filename = "projects/%s/__init__.py" % project_name

for target, payload in ((config_filename, project_config), (init_filename, "")):
    if os.path.exists(target):
        continue
    with open(target, "w+") as fp:
        fp.write(payload)


from easyspider.db import SpiderProject, Session

db = Session()

# Register the project in the database unless a row with this name exists.
lookup = SpiderProject.query.filter_by(name=project_name)
project = lookup.first()
if project is None:
    project = SpiderProject()
    # The queue name is set to the project name.
    initial_fields = (
        ("name", project_name),
        ("status", 0),
        ("process", '{}'),
        ("queue_name", project_name),
    )
    for field, value in initial_fields:
        setattr(project, field, value)
    db.add(project)
    db.commit()


Example #4
0
from easyspider.db import Session , ScopedSession   
from sqlalchemy import Column, DateTime, String, Integer, ForeignKey, func
from sqlalchemy.orm import relationship, backref
from sqlalchemy.ext.declarative import declarative_base
 
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from sqlalchemy.orm.scoping import scoped_session
# Engine for the MySQL database "spider" on localhost with utf8 charset; the
# credentials in the URL appear to be masked.
engine = create_engine("mysql://*****:*****@localhost/spider?charset=utf8")
# Scoped-session registry whose session factory is bound to the engine above.
scopedsession = scoped_session(sessionmaker(bind=engine))

# Declarative base class; nothing in the visible lines derives from it.
Base = declarative_base()

# Alternative query form, left disabled:
#SpiderTask.query.filter(SpiderTask.project=='test').all()

# Session obtained from the Session factory imported from easyspider.db.
db = Session()

# Load every SpiderTask whose project is 'test'; the result is discarded.
# NOTE(review): SpiderTask is not imported in the visible lines -- confirm it
# is in scope before running this.
db.query(SpiderTask).filter(SpiderTask.project=='test').all()

# Rebinds ScopedSession, shadowing the name imported from easyspider.db, to a
# registry built from a sessionmaker() that has no engine bound.
ScopedSession = scoped_session(sessionmaker())



# Request two query properties from the engine-bound registry and print
# whether the two returned objects compare equal.
q = scopedsession.query_property()
q2 = scopedsession.query_property()
print("scoped session", q, q==q2)


class Bar(object):
    """docstring for Bar"""
    def __init__(self):