コード例 #1
0
def task_tr_click(url):
    """Return the 30-day crawl histogram and the task record for a URL.

    Counts the ``wanFang.paperinfo`` documents matching ``url`` per day,
    using the part of each ``spidertime`` string before the first space as
    the day key, and fetches the matching ``wanFang.spider`` document.

    Returns:
        dict with two keys:
        ``xy_value`` -- ``x_value`` holds the last 30 dates as strings,
        oldest first; ``y_value`` holds the document count for each date.
        ``content`` -- the first matching spider document (without
        ``_id``), or an empty list when no spider document matches.
    """
    papers = mongoConnection.mongoConnection(db='wanFang',
                                             collection='paperinfo')
    cursor = papers.collection.find({'url': url}, {'spidertime': 1})
    per_day = collections.Counter(
        str(doc['spidertime'].split(' ')[0]) for doc in cursor)

    # Build the x axis from 29 days ago up to today.
    today = datetime.date.today()
    x_value = [str(today - datetime.timedelta(days=offset))
               for offset in range(29, -1, -1)]
    # Days without any crawled document count as zero.
    y_value = [per_day.get(day, 0) for day in x_value]

    spiders = mongoConnection.mongoConnection(db='wanFang',
                                              collection='spider')
    content = list(spiders.collection.find({'url': url}, {'_id': 0}))
    if content:
        content = content[0]
    return {
        'xy_value': {'y_value': y_value, 'x_value': x_value},
        'content': content,
    }
コード例 #2
0
ファイル: models.py プロジェクト: mr2coder/spider
def already_exist_data():
    """Return every patent spider task together with its result count.

    Each document of ``patent.spider`` gets its ``_id`` replaced by its
    string form and a ``count`` key holding the number of
    ``patent.patentinfo`` documents whose ``spider_id`` equals that string.
    """
    spiders = mongoConnection.mongoConnection(db='patent',
                                              collection='spider')
    tasks = list(spiders.collection.find({}))

    patents = mongoConnection.mongoConnection(db='patent',
                                              collection='patentinfo')
    for task in tasks:
        task_id = str(task['_id'])
        task['_id'] = task_id
        matches = patents.collection.find({'spider_id': task_id}, {'_id': 0})
        task['count'] = matches.count()
    return tasks
コード例 #3
0
def api_delete_task(url):
    """Delete a wanFang task and the papers stored for it.

    Removes the documents matching ``url`` from ``wanFang.spider`` first
    and from ``wanFang.paperinfo`` second.

    Returns:
        0 when both removals ran without raising, -1 if anything raised.
    """
    try:
        for name in ('spider', 'paperinfo'):
            target = mongoConnection.mongoConnection(db='wanFang',
                                                     collection=name)
            target.collection.remove({'url': url})
    except Exception:
        return -1
    return 0
コード例 #4
0
ファイル: models.py プロジェクト: mr2coder/spider
def patent_delete_task(spider_id):
    """Delete a patent task and the patents stored for it.

    The task is removed from ``patent.spider`` by ``ObjectId(spider_id)``;
    the patents are removed from ``patent.patentinfo`` by the plain
    ``spider_id`` value.

    Returns:
        0 when both removals ran without raising, -1 if anything raised.
    """
    try:
        tasks = mongoConnection.mongoConnection(db='patent',
                                                collection='spider')
        tasks.collection.remove({'_id': ObjectId(spider_id)})

        patents = mongoConnection.mongoConnection(db='patent',
                                                  collection='patentinfo')
        patents.collection.remove({'spider_id': spider_id})
    except Exception:
        return -1
    else:
        return 0
コード例 #5
0
ファイル: iqiyi.py プロジェクト: mr2coder/spider
def iqiyi_url_spider(content, site='iqiyi', socketio=None):
    """Crawl the result pages of one video task and store every entry.

    Reads the task's ``time_limit`` and ``length`` settings from
    ``video.spider``, asks ``get_page_nums`` for the number of hits,
    converts that to pages of 20 (capped at 20 pages), fetches each page
    with ``get_page_info`` and inserts every returned entry into
    ``video.urlinfo``. After each stored entry the task is marked with
    ``inactive: 0``.

    Args:
        content: Search keyword identifying the task.
        site: Site identifier identifying the task.
        socketio: Optional Socket.IO server. When given, progress messages
            are emitted on the ``/video`` namespace; storing the entries
            does not depend on it.

    Returns:
        None.
    """
    spider = mongoConnection.mongoConnection(db='video', collection='spider')
    matches = list(
        spider.collection.find({
            'content': content,
            'site': site
        }, {
            'time_limit': 1,
            '_id': 0,
            'length': 1
        }))
    if not matches:
        # Unknown task: nothing to crawl (previously an IndexError).
        logger.debug('no spider task for %s:%s' % (content, site))
        return
    data = matches[0]
    logger.debug(data)

    total = get_page_nums(content,
                          site,
                          time_limt=data['time_limit'],
                          length=data['length'])
    if socketio:
        socketio.emit('my_response', {'data': '总数为:' + str(total)},
                      namespace='/video')
    # Ceil-divide the hit count into pages of 20, at most 20 pages.
    page_num = min((total + 19) // 20, 20)

    urlinfo = mongoConnection.mongoConnection(db='video',
                                              collection='urlinfo')
    for index in range(1, page_num + 1):
        result = get_page_info(content,
                               site,
                               time_limt=data['time_limit'],
                               length=data['length'],
                               pagenum=index)
        for line in result:
            if socketio:
                socketio.emit('my_response', {
                    'data':
                    'Currently crawling title is: ' + line['videoname']
                },
                              namespace='/video')
                socketio.sleep(1)
            # Store the entry whether or not a client is listening; the
            # insert used to run only when socketio was supplied.
            try:
                urlinfo.collection.insert(line)
                urlinfo.db['spider'].update(
                    {
                        'site': site,
                        'content': content
                    }, {'$set': {
                        'inactive': 0
                    }})
            except Exception as e:
                # Best effort: one bad entry must not stop the crawl.
                logger.debug(e)

    # Announce completion once, after the last page, rather than after
    # every page.
    if socketio:
        socketio.emit('my_response', {'data': '已完成'}, namespace='/video')
        socketio.emit('disconnect', {'data': 'disconnect'},
                      namespace='/video')
コード例 #6
0
ファイル: models.py プロジェクト: mr2coder/spider
def task_tr_click(spider_id):
    """Return the 30-day crawl histogram and the patents of one task.

    Counts the ``patent.patentinfo`` documents whose ``spider_id`` matches
    per day, using the part of each ``spidertime`` string before the first
    space as the day key, and returns the full patent documents as well.

    Returns:
        dict with two keys:
        ``xy_value`` -- ``x_value`` holds the last 30 dates as strings,
        oldest first; ``y_value`` holds the document count for each date.
        ``content`` -- list of the matching patent documents (without
        ``_id``); list-valued ``institution`` fields are de-duplicated.
    """
    patents = mongoConnection.mongoConnection(db='patent',
                                              collection='patentinfo')
    cursor = patents.collection.find({'spider_id': spider_id},
                                     {'spidertime': 1})
    per_day = collections.Counter(
        str(doc['spidertime'].split(' ')[0]) for doc in cursor)

    # Build the x axis from 29 days ago up to today.
    today = datetime.date.today()
    x_value = [str(today - datetime.timedelta(days=offset))
               for offset in range(29, -1, -1)]
    y_value = [per_day.get(day, 0) for day in x_value]

    content = list(
        patents.collection.find({'spider_id': spider_id}, {'_id': 0}))
    for record in content:
        institution = record['institution']
        if isinstance(institution, list):
            # Drop repeated institutions; the resulting order is arbitrary.
            record['institution'] = list(set(institution))
    return {
        'xy_value': {'y_value': y_value, 'x_value': x_value},
        'content': content,
    }
コード例 #7
0
def store_spider(**kwargs):
    """Insert one video spider task per requested site.

    Reads ``content``, ``site``, ``feq``, ``length``, ``time_limit`` and
    ``time`` from the keyword arguments. ``site`` may be a single value or
    a list; one document is written to ``video.spider`` for each site,
    with ``status`` 1, ``inactive`` 0 and ``last_time`` equal to ``time``.

    Returns:
        The result of ``insert_many``, or None if anything raised (the
        exception is printed).
    """
    try:
        mongoDB = mongoConnection.mongoConnection(db='video',
                                                  collection='spider')
        sites = kwargs['site']
        if not isinstance(sites, list):
            sites = [sites]

        documents = []
        for site in sites:
            documents.append({
                'content': kwargs['content'],
                'status': 1,
                'site': site,
                'feq': kwargs['feq'],
                'length': kwargs['length'],
                'time_limit': kwargs['time_limit'],
                'time': kwargs['time'],
                'last_time': kwargs['time'],
                'inactive': 0
            })
        return mongoDB.collection.insert_many(documents, ordered=False)
    except Exception as e:
        print(e)
        return None
コード例 #8
0
ファイル: video_url_spider.py プロジェクト: mr2coder/spider
def youtube_url_spider(content, socketio=None):
    """Fetch the YouTube search-result page for ``content``.

    Returns False when the response status is not 200. Otherwise the page
    is parsed and a connection to ``video.spider`` is opened; the visible
    code then ends without an explicit return value.

    NOTE(review): ``socketio``, the parsed document and the connection are
    not used further in the visible code.
    """
    search_url = 'https://www.youtube.com/results?search_query=' + content
    page = requests.get(search_url)
    if page.status_code != 200:
        return False
    doc = html.fromstring(page.text)
    mongoDB = mongoConnection.mongoConnection(db='video', collection='spider')
コード例 #9
0
def api_delete_task(content, site, time):
    """Delete a video task and the URLs stored for it.

    The task is removed from ``video.spider`` by ``content``, ``site`` and
    ``time``; the crawled entries are removed from ``video.urlinfo`` by
    ``content`` and ``site`` only.

    Returns:
        0 when both removals ran without raising, -1 if anything raised.
    """
    try:
        tasks = mongoConnection.mongoConnection(db='video',
                                                collection='spider')
        task_filter = {'content': content, 'site': site, 'time': time}
        tasks.collection.remove(task_filter)

        urls = mongoConnection.mongoConnection(db='video',
                                               collection='urlinfo')
        urls.collection.remove({'content': content, 'site': site})
    except Exception:
        return -1
    else:
        return 0
コード例 #10
0
def api_video_detail(content, site):
    """Return all stored video entries of one task.

    Returns:
        dict with a single key ``info``: the list of ``video.urlinfo``
        documents matching ``content`` and ``site``, without ``_id``.
    """
    urls = mongoConnection.mongoConnection(db='video', collection='urlinfo')
    task_filter = {'content': content, 'site': site}
    cursor = urls.collection.find(task_filter, {'_id': 0})
    return {'info': list(cursor)}
コード例 #11
0
ファイル: video_url_spider.py プロジェクト: mr2coder/spider
def _parse_play_count(text):
    """Convert a play-count label to an int.

    A trailing '万' multiplies the number by 10000 ('2.5万' -> 25000), a
    label ending in '-' or an empty label counts as 0, anything else is
    parsed as a plain integer.
    """
    if not text or text[-1] == '-':
        return 0
    if text[-1] == u'万':
        return int(float(text[:-1]) * 10000)
    return int(text)


def get_bilibili_info(args, socketio=None):
    """Fetch one bilibili result page and store its videos.

    Args:
        args: Sequence whose first item is the URL to request and whose
            second item is the search keyword stored with every entry.
        socketio: Unused in this function.

    Returns:
        None in every case. A non-200 response ends the call early; any
        exception is logged at debug level and swallowed.
    """
    try:
        url = args[0]
        logger.info(url)
        content = args[1]
        response = requests.get(url)
        if response.status_code != 200:
            return None
        # The endpoint answers with JSON whose 'html' field holds the markup.
        payload = response.json()
        doc = html.fromstring(payload['html'])

        # Information extraction
        urls = doc.xpath("//li/a/@href")
        names = doc.xpath("//li/a/@title")
        infos = doc.xpath("//li/div/div[@class='des hide']/text()")
        # Only every second text node (starting with the second) is kept.
        playtimes = [
            x.strip() for x in doc.xpath(
                "//li/div/div[@class='tags']/span[@class='so-icon watch-num']/text()"
            )[1::2]
        ]
        showtimes = [
            x.strip() for x in doc.xpath(
                "//li/div/div[@class='tags']/span[@class='so-icon time']/text()"
            )[1::2]
        ]
        authors_sites = re.findall(r'http://space.bilibili.com/[0-9]{0,}',
                                   response.text)
        # An empty label no longer aborts the whole page with IndexError.
        times = [_parse_play_count(x) for x in playtimes]

        # zip() stops at the shortest list, so entries are only built while
        # every extracted column still has a value.
        result = [{
            "videoname": name.strip(),
            "url": link,
            "showtime": showtime,
            "videoinfo": info.strip(),
            "playtimes": played,
            "spidertime": time.strftime('%Y-%m-%d %X', time.localtime()),
            # NOTE(review): 'sina' looks copied from the sina spider; confirm
            # whether bilibili entries should carry their own site value.
            "site": "sina",
            "content": content,
            "status": 1,
            "authors_site": author
        } for name, link, showtime, info, played, author in zip(
            names, urls, showtimes, infos, times, authors_sites)]

        if result:
            # insert_many rejects an empty list, so skip it for empty pages.
            mongoDB = mongoConnection.mongoConnection(db='video',
                                                      collection='urlinfo')
            mongoDB.collection.insert_many(result, ordered=False)
    except Exception as e:
        # Best effort: a failing page must not stop the caller.
        logger.debug(e)
コード例 #12
0
ファイル: patent_url_spider.py プロジェクト: mr2coder/spider
def auto_click(id, socketio=None, proxy=False):
    """Run ``click`` for the patent spider task with the given id.

    Loads the task from ``patent.spider`` by ``ObjectId(id)`` (an unknown
    id raises IndexError), optionally announces the target ``URL`` on the
    ``/patent`` namespace, then starts the crawl.
    """
    spiders = mongoConnection.mongoConnection(db='patent',
                                              collection='spider')
    task = list(spiders.collection.find({'_id': ObjectId(id)}))[0]
    if socketio:
        socketio.emit('my_response', {'data': URL}, namespace='/patent')
        socketio.sleep(1)
    click(URL, task, socketio=socketio, proxy=proxy)
コード例 #13
0
ファイル: paperTool.py プロジェクト: mr2coder/spider
def get_collections(**kwargs):
    """List the collections of a database with their document counts.

    Args:
        **kwargs: Forwarded unchanged to ``mongoConnection.mongoConnection``.

    Returns:
        A list of ``{'collection': name, 'count': n}`` dicts, one per
        collection, excluding ``system.indexes``.
    """
    wanFang = mongoConnection.mongoConnection(**kwargs)
    # Filter instead of list.remove(): 'system.indexes' is not present on
    # every server, and remove() raises ValueError when it is missing.
    names = [
        name for name in wanFang.db.collection_names()
        if name != 'system.indexes'
    ]
    return [{
        'collection': name,
        'count': wanFang.db[name].count()
    } for name in names]
コード例 #14
0
def already_exist_data():
    """Return every wanFang spider task together with its paper count.

    Each task carries ``last_time``, ``url``, ``time`` and ``feq`` from
    ``wanFang.spider`` plus a ``count`` key holding the number of
    ``wanFang.paperinfo`` documents with the same ``url``.
    """
    spiders = mongoConnection.mongoConnection(db='wanFang',
                                              collection='spider')
    wanted = {'last_time': 1, 'url': 1, 'time': 1, 'feq': 1, '_id': 0}
    tasks = list(spiders.collection.find({}, wanted))

    papers = mongoConnection.mongoConnection(db='wanFang',
                                             collection='paperinfo')
    for task in tasks:
        task['count'] = papers.collection.find({'url': task['url']}).count()
    return tasks
コード例 #15
0
ファイル: video_url_spider.py プロジェクト: mr2coder/spider
def auto_run():
    """Run ``click`` once for every task stored in ``video.spider``.

    All tasks are read up front; ``click`` is then called sequentially with
    each task's ``content`` and ``site``.
    """
    spiders = mongoConnection.mongoConnection(db='video',
                                              collection='spider')
    wanted = {'content': 1, 'site': 1, '_id': 0}
    documents = list(spiders.collection.find({}, wanted))
    pairs = [(doc['content'], doc['site']) for doc in documents]
    for keyword, site in pairs:
        click(keyword, site)
コード例 #16
0
ファイル: models.py プロジェクト: mr2coder/spider
def patent_add_item(**kwargs):
    """Add a patent spider task.

    The keyword arguments are inserted as one document into
    ``patent.spider``.

    Returns:
        The value returned by the insert, or None if anything raised (the
        exception is printed).
    """
    try:
        spiders = mongoConnection.mongoConnection(db='patent',
                                                  collection='spider')
        return spiders.collection.insert(kwargs)
    except Exception as e:
        print(e)
        return None
コード例 #17
0
def modify_task(**kwargs):
    """Modify a wanFang spider task.

    ``kwargs['url']`` identifies the existing task; ``url_constract`` builds
    the task URL from the keyword arguments.

    Returns:
        None when a task with the rebuilt URL already exists and the stored
        ``feq`` equals ``kwargs['feq']`` (nothing to change); otherwise 0
        after either inserting a new task (URL changed) or updating ``feq``
        on the existing one (URL unchanged).
    """
    old_url = kwargs['url']
    new_url = url_constract(**kwargs)
    spiders = mongoConnection.mongoConnection(db='wanFang',
                                              collection='spider')
    stored = list(
        spiders.collection.find({'url': old_url}, {'feq': 1, '_id': 0}))[0]
    existing = spiders.collection.find({'url': new_url}).count()
    if existing != 0 and stored['feq'] == kwargs['feq']:
        return None

    kwargs['url'] = new_url
    if new_url == old_url:
        spiders.collection.update({'url': new_url},
                                  {"$set": {"feq": kwargs['feq']}})
    else:
        # The old task is intentionally kept: deleting it would leave its
        # papers without a task, so removal is postponed for now.
        spiders.collection.insert(kwargs)
    return 0
コード例 #18
0
def add_item(**kwargs):
    """Add a wanFang spider task.

    The task URL is built with ``url_constract`` from the keyword
    arguments, stored under ``url``, and the resulting document is inserted
    into ``wanFang.spider``.

    Returns:
        The value returned by the insert, or None if anything raised (the
        exception is printed).
    """
    try:
        spiders = mongoConnection.mongoConnection(db='wanFang',
                                                  collection='spider')
        kwargs['url'] = url_constract(**kwargs)
        return spiders.collection.insert(kwargs)
    except Exception as e:
        print(e)
        return None
コード例 #19
0
ファイル: video_url_spider.py プロジェクト: mr2coder/spider
def click(content, site, socketio=None, proxy=False):
    """Mark a video task as running and start its crawl.

    Sets ``inactive`` to 1 and ``last_time`` to the current local time on
    the ``video.spider`` document matching ``site`` and ``content``, then
    calls ``url_spider`` with the task's ``content`` and ``site``.

    ``proxy`` is accepted but not used here.
    """
    logger.info(content + ':' + site)
    spiders = mongoConnection.mongoConnection(db='video',
                                              collection='spider')
    started = time.strftime('%Y-%m-%d %X', time.localtime())
    task_filter = {'site': site, 'content': content}
    changes = {'$set': {'inactive': 1, 'last_time': started}}
    spiders.collection.update(task_filter, changes)
    url_spider({'content': content, 'site': site}, socketio=socketio)
コード例 #20
0
ファイル: patent_url_spider.py プロジェクト: mr2coder/spider
def main():
    """Command-line entry point for the patent spider.

    Options:
        -t/--type: ``click`` crawls a single task, ``auto`` runs
            ``auto_run`` for all tasks.
        -c/--content: id of the spider task; required for ``click`` only.

    The task lookup is done only for ``click``. Previously it ran for every
    type, so ``-t auto`` without ``-c`` crashed with IndexError before
    ``auto_run`` was reached.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--type', dest='type')
    parser.add_argument('-c', '--content', dest='content')
    args = parser.parse_args()

    if args.type == 'click':
        spider_id = args.content
        if not spider_id:
            # ObjectId(None) would silently generate a fresh id that can
            # never match a stored task.
            parser.error('-c/--content is required with --type click')
        # Database connection
        mongo = mongoConnection.mongoConnection(db='patent',
                                                collection='spider')
        matches = list(mongo.collection.find({'_id': ObjectId(spider_id)}))
        if not matches:
            parser.error('no spider task found with id ' + spider_id)
        click(URL, matches[0])
    elif args.type == 'auto':
        auto_run()
    logger.info('Success: Task update finshed..')
コード例 #21
0
ファイル: video_url_spider.py プロジェクト: mr2coder/spider
 def get_info(url):
     """Fetch one JSON result page and store its videos.

     Reads ``socketio`` and ``content`` from the surrounding scope, which is
     not visible in this excerpt.

     Returns:
         None when the response status is not 200, otherwise 0 (also when
         storing failed; that exception is logged at debug level).
     """
     progress = 'Currently crawling web pages is: ' + url
     logger.info(progress)
     if socketio:
         socketio.emit('my_response', {'data': progress},
                       namespace='/video')
         socketio.sleep(1)

     response = requests.get(url)
     if response.status_code != 200:
         return None

     # Markup tags are stripped from the video name.
     tag_pattern = re.compile(r'(<.*?>)')
     result = []
     for item in response.json()["list"]:
         result.append({
             "videoname": tag_pattern.sub("", item.get("videoname")),
             "url": item.get("url"),
             "showtime": item.get("showtime"),
             "videoinfo": item.get("videoinfo"),
             "playtimes": item.get("playtimes"),
             "spidertime": time.strftime('%Y-%m-%d %X', time.localtime()),
             "site": "sina",
             "content": content,
             "status": 1
         })

     mongoDB = mongoConnection.mongoConnection(db='video',
                                               collection='urlinfo')
     try:
         mongoDB.collection.insert_many(result, ordered=False)
         # Flag the sina task for this keyword with inactive = 0.
         task_filter = {'site': 'sina', 'content': content}
         mongoDB.db['spider'].update(task_filter, {'$set': {'inactive': 0}})
     except Exception as e:
         logger.debug(e)
     return 0
コード例 #22
0
def paper_list(url):
    """Return all papers stored for the task identified by ``url``.

    Each returned dict carries ``spidertime``, ``title``, ``link``,
    ``abstract``, ``authors`` and ``date`` from ``wanFang.paperinfo``,
    flattened as follows: ``authors`` becomes the list of its keys,
    ``abstract`` becomes its ``Chinese`` entry, and a non-empty ``date``
    becomes its ``year`` entry.
    """
    papers_db = mongoConnection.mongoConnection(db='wanFang',
                                                collection='paperinfo')
    wanted = {
        '_id': 0,
        'spidertime': 1,
        'title': 1,
        'link': 1,
        'abstract': 1,
        'authors': 1,
        'date': 1
    }
    papers = list(papers_db.collection.find({'url': url}, wanted))
    for paper in papers:
        paper['authors'] = list(paper['authors'].keys())
        paper['abstract'] = paper['abstract']['Chinese']
        date = paper['date']
        if date:
            paper['date'] = date.get('year')
    return papers
コード例 #23
0
ファイル: patent_url_spider.py プロジェクト: mr2coder/spider
def store(patents, spider_id):
    """Store the crawled patents in ``patent.patentinfo``.

    Args:
        patents: Mapping of parallel lists (``titles``, ``t_ids``,
            ``o_ids``, ``o_dates``, ``authors``, ``institutions``,
            ``proxies``, ``proxy_insititutions``, ``icp_ids``,
            ``r_dates``); one document is built per position, up to the
            length of the shortest list.
        spider_id: Id of the owning spider task, stored with each document.

    Returns:
        The result of ``insert_many``.
    """
    columns = zip(patents["titles"], patents["t_ids"], patents["o_ids"],
                  patents["o_dates"], patents["authors"],
                  patents["institutions"], patents["proxies"],
                  patents["proxy_insititutions"], patents["icp_ids"],
                  patents["r_dates"])
    documents = []
    for (title, t_id, o_id, o_date, author, institution, proxie,
         proxy_institution, icp_id, r_date) in columns:
        documents.append({
            "title": title.strip(),
            "t_id": t_id,
            "o_id": o_id,
            "o_date": o_date.strip(),
            "author": author,
            "spidertime": time.strftime('%Y-%m-%d %X', time.localtime()),
            "institution": institution,
            "proxie": proxie,
            "proxy_insititution": proxy_institution,
            "icp_id": icp_id,
            "spider_id": spider_id,
            "r_date": r_date
        })
    patent_db = mongoConnection.mongoConnection(db='patent',
                                                collection='patentinfo')
    return patent_db.collection.insert_many(documents, ordered=False)
コード例 #24
0
def api_video_gragh(content, site):
    """Return the 120-day crawl histogram of one video task.

    Counts the ``video.urlinfo`` documents matching ``content`` and
    ``site`` per day, using the part of each ``spidertime`` string before
    the first space as the day key.

    Returns:
        dict with two keys:
        ``xy_value`` -- ``x_value`` holds the last 120 dates as strings,
        oldest first; ``y_value`` holds the document count for each date.
        ``content`` -- the ``content`` argument, passed through.
    """
    urls = mongoConnection.mongoConnection(db='video', collection='urlinfo')
    task_filter = {'content': content, 'site': site}
    cursor = urls.collection.find(task_filter, {'spidertime': 1})
    per_day = collections.Counter(
        str(doc['spidertime'].split(' ')[0]) for doc in cursor)

    # Display 120 days of data, from 119 days ago up to today.
    today = datetime.date.today()
    x_value = [str(today - datetime.timedelta(days=offset))
               for offset in range(119, -1, -1)]
    y_value = [per_day.get(day, 0) for day in x_value]
    return {
        'xy_value': {'y_value': y_value, 'x_value': x_value},
        'content': content,
    }
コード例 #25
0
ファイル: patent_url_spider.py プロジェクト: mr2coder/spider
def click(url, content, socketio=None, proxy=False):
    """Crawl all result pages of one patent task and store the patents.

    Args:
        url: Target passed to ``get_page_nums`` and ``get_patent``.
        content: Spider task document; it is passed to ``form_produce`` and
            its ``_id`` is stored with every patent.
        socketio: Optional Socket.IO server; when given, progress messages
            are emitted on the ``/patent`` namespace.
        proxy: When true, requests use a proxy picked at random from
            ``fproxy.fetch_all()``; otherwise no proxy is used.

    Returns:
        None. The crawl is abandoned when ``get_page_nums`` reports
        nothing, whether or not a Socket.IO server is attached.
    """
    proxies = [None]
    if proxy:
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response',
                          {'data': '免费代理获取中  \n这可能花费几分钟,请稍后...'},
                          namespace='/patent')
        proxies = [{'http': 'http://' + x} for x in fproxy.fetch_all()]
        # NOTE(review): the same "fetching" text is sent again after the
        # fetch; a "done" message may have been intended.
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response',
                          {'data': '免费代理获取中  \n这可能花费几分钟,请稍后...'},
                          namespace='/patent')

    num = get_page_nums(url, form_produce(content))
    logger.info(num)
    if not num:
        # Stop here even without socketio; previously only the socketio
        # case returned and the loop below compared against an empty value.
        if socketio:
            socketio.emit('my_response', {'data': '目标网站连接失败,请稍后重试!'},
                          namespace='/patent')
            socketio.emit('disconnect', {'data': 'disconnect'},
                          namespace='/patent')
        return

    # The second form_produce argument advances in steps of 50.
    i = 1
    while i <= num:
        form = form_produce(content, i)
        patents = get_patent(url, form, random.choice(proxies))

        # Up to three attempts per step, each with a freshly picked proxy.
        # NOTE(review): the original also meant to refetch the proxy list
        # after 10 failures, but its counter was reset on every step and
        # could never reach 10; that unreachable branch is dropped.
        failures = 0
        while patents is None:
            failures += 1
            logger.debug('失败次数为:' + str(failures))
            if failures == 3:
                break
            patents = get_patent(url, form, random.choice(proxies))

        if patents is None:
            # Give up on this step explicitly instead of failing inside
            # the storing code with a misleading database error.
            logger.debug('get_patent failed 3 times at index %s, skipping' % i)
        elif patents != -1:
            try:
                for title in patents['titles']:
                    logger.info('title:' + title)
                    if socketio:
                        socketio.sleep(1)
                        socketio.emit('my_response',
                                      {'data': 'title:' + title},
                                      namespace='/patent')
                store(patents, str(content['_id']))
            except Exception as e:
                # Best effort: a failing step must not stop the crawl.
                logger.debug(e)
                logger.debug('插入数据库失败...')
        i += 50
コード例 #26
0
ファイル: models.py プロジェクト: mr2coder/spider
def get_count(**kwargs):
    """Return the number of documents in a collection.

    The keyword arguments are forwarded unchanged to
    ``mongoConnection.mongoConnection``. The original note names
    ``patent_new`` as the default collection; that default is not visible
    here.
    """
    connection = mongoConnection.mongoConnection(**kwargs)
    return connection.collection.count()
コード例 #27
0
def initial_spider():
    """Return every document of ``video.spider`` as a list, without ``_id``."""
    spiders = mongoConnection.mongoConnection(db='video',
                                              collection='spider')
    return list(spiders.collection.find({}, {'_id': 0}))
コード例 #28
0
ファイル: patent_url_spider.py プロジェクト: mr2coder/spider
def auto_run(proccess_num=10):
    """Crawl every patent spider task concurrently.

    Args:
        proccess_num: Number of worker threads in the pool.

    Every document of ``patent.spider`` is passed to ``click`` together
    with the module-level ``URL``. The call returns once all tasks have
    finished.
    """
    mongo = mongoConnection.mongoConnection(db='patent', collection='spider')
    tasks = list(mongo.collection.find({}))
    pool = ThreadPool(proccess_num)
    try:
        # map() hands each item over as ONE argument, so the (URL, task)
        # pairs used before reached click() as a single tuple and the call
        # failed for lack of ``content``. Bind URL explicitly instead.
        pool.map(lambda task: click(URL, task), tasks)
    finally:
        # Always release the worker threads, even when a task raised.
        pool.close()
        pool.join()
コード例 #29
0
ファイル: models.py プロジェクト: mr2coder/spider
def get_count(**kwargs):
    """Return the number of documents in a collection.

    The keyword arguments are forwarded unchanged to
    ``mongoConnection.mongoConnection``. The original note names
    ``paper_new`` as the default collection; that default is not visible
    here.
    """
    connection = mongoConnection.mongoConnection(**kwargs)
    return connection.collection.count()
コード例 #30
0
ファイル: models.py プロジェクト: mr2coder/spider
def db_find(key):
    """Return a cursor over all documents, projected onto ``key``.

    Uses the default connection of ``mongoConnection.mongoConnection()``;
    which database and collection that selects is not visible here.
    """
    connection = mongoConnection.mongoConnection()
    projection = {key: 1}
    return connection.collection.find({}, projection)