Example #1
def save_metadata(dbcurr, binhash, address, start_time, data):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    try:
        info = parse_metadata(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z:z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])
    # category-specific handling was never implemented; every branch is a
    # no-op (labels: installer / archive / image / documents & books)
    if info['category'] == u'安装包':
        pass
    elif info['category'] == u'压缩文件':
        pass
    elif info['category'] == u'图像':
        pass
    elif info['category'] == u'文档书籍':
        pass

    if 'files' in info:
        try:
            dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)', (info['info_hash'], json.dumps(info['files'])))
        except:
            print name, 'insert error', sys.exc_info()[1]
        del info['files']

    try:
        try:
            print '\n', datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),'Saved', info['info_hash'], info['name'], (time.time()-start_time), 's', address[0], geoip.country_name_by_addr(address[0]),
        except:
            print '\n',datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'Saved', info['info_hash'], sys.exc_info()[1]
        ret = dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' + 
            'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
            info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests'],
            info.get('comment',''), info.get('creator','')))
        dbcurr.connection.commit()
    except:
        print name, 'save error', info
        traceback.print_exc()
        return
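Every example here calls into a metautils module that is never shown. Below is a minimal sketch of what its two helpers plausibly look like, inferred only from the call sites above (the mapping table, the fallback label and all values are assumptions, not the project's real code):

import os

def get_extension(filename):
    # 'movie.mkv' -> '.mkv'; os.path.splitext keeps the leading dot
    return os.path.splitext(filename)[1]

_CATEGORY_BY_EXT = {
    # illustrative entries only; the real table is presumably much larger
    '.exe': u'安装包',    # installer
    '.zip': u'压缩文件',  # archive
    '.jpg': u'图像',      # image
    '.pdf': u'文档书籍',  # documents/books
}

def get_category(extension):
    # unknown extensions fall into a catch-all bucket (label is a guess)
    return _CATEGORY_BY_EXT.get(extension.lower(), u'其他')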
Example #2
    def got_torrent(self):
        utcnow = datetime.datetime.utcnow()
        binhash, address, data, dtype, start_time = self.metadata_queue.get()
        if dtype == 'pt':
            self.n_downloading_pt -= 1
        elif dtype == 'lt':
            self.n_downloading_lt -= 1
        if not data:
            return
        self.n_valid += 1

        try:
            info = self.parse_torrent(data)
            if not info:
                return
        except:
            traceback.print_exc()
            return
        info_hash = binhash.encode('hex')
        info['info_hash'] = info_hash
        # need to build tags
        info['tagged'] = False
        info['classified'] = False
        info['requests'] = 1
        info['last_seen'] = utcnow
        info['source_ip'] = address[0]

        if info.get('files'):
            files = [z for z in info['files'] if not z['path'].startswith('_')]
            if not files:
                files = info['files']
        else:
            files = [{'path': info['name'], 'length': info['length']}]
        files.sort(key=lambda z:z['length'], reverse=True)
        bigfname = files[0]['path']
        info['extension'] = metautils.get_extension(bigfname).lower()
        info['category'] = metautils.get_category(info['extension'])

        if 'files' in info:
            try:
                self.dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)', (info['info_hash'], json.dumps(info['files'])))
            except:
                print self.name, 'insert error', sys.exc_info()[1]
            del info['files']

        try:
            print '\n', 'Saved', info['info_hash'], dtype, info['name'], (time.time()-start_time), 's', address[0],
            ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' + 
                'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
                info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests'],
                info.get('comment',''), info.get('creator','')))
            self.dbconn.commit()
        except:
            print self.name, 'save error', info
            traceback.print_exc()
            return
        self.n_new += 1
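got_torrent() only drains metadata_queue; the producer side never appears in the snippet. A minimal sketch of the assumed wiring (class and method names other than metadata_queue and got_torrent are invented for illustration):

from Queue import Queue   # stdlib 'queue' on Python 3

class Master(object):
    def __init__(self):
        self.metadata_queue = Queue()
        self.n_downloading_pt = self.n_downloading_lt = 0
        self.n_valid = self.n_new = 0

    def on_metadata(self, binhash, address, data, dtype, start_time):
        # called from a downloader thread; got_torrent() consumes the tuple
        self.metadata_queue.put((binhash, address, data, dtype, start_time))

    def run(self):
        while True:
            self.got_torrent()   # blocks inside metadata_queue.get()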
Example #3
def save_metadata(dbcurr, binhash, address, start_time, data, blacklist):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    try:
        info = parse_metadata(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    for item in blacklist:
        if str(item) in info['name']:
            return
    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z:z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)', (info['info_hash'], json.dumps(info['files'])))
        except:
            print name, 'insert error', sys.exc_info()[1]
        del info['files']

    try:
        try:
            print '\n', 'Saved', info['info_hash'], info['name'], (time.time()-start_time), 's', address[0], geoip.country_name_by_addr(address[0]),
        except:
            print '\n', 'Saved', info['info_hash'], sys.exc_info()[1]
        try:
            ret = dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' + 
            'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
            (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
            info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests'],
            info.get('comment',''), info.get('creator','')))
        except:
            print 'insert search_hash err:', info['info_hash']
        dbcurr.connection.commit()
    except:
        print name, 'save error', info
        traceback.print_exc()
        return
Example #4
    def got_torrent(self):
        if self.metadata_queue.qsize() == 0:
            return
        binhash, address, data, start_time = self.metadata_queue.get()
        if not data:
            return
        try:
            info = self.parse_metadata(data)
            if not info:
                return
        except:
            traceback.print_exc()
            return

        # local wall-clock time, despite the variable name
        utcnow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        info_hash = binhash.encode('hex')  # magnet link info hash
        info['info_hash'] = info_hash
        # need to build tags
        info['tagged'] = False
        info['classified'] = False
        info['requests'] = 1
        info['last_seen'] = utcnow
        info['create_time'] = utcnow
        info['source_ip'] = address[0]

        if info.get('files'):
            files = [z for z in info['files'] if not z['path'].startswith('_')]
            if not files:
                files = info['files']
        else:
            files = [{'path': info['name'], 'length': info['length']}]
        files.sort(key=lambda z: z['length'], reverse=True)
        bigfname = files[0]['path']
        info['extension'] = metautils.get_extension(bigfname).lower()
        info['category'] = metautils.get_category(info['extension'])

        try:
            try:
                print '\n', 'Saved', info['info_hash'], info['name'], (time.time()-start_time), 's', address[0]
            except:
                print '\n', 'Saved', info['info_hash']
            ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                'length,create_time,last_seen,requests) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
                info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests']))
            if self.count % 50 == 0:
                self.dbconn.commit()
                if self.count > 100000:
                    self.count = 0
        except:
            print self.name, 'save error', info
            traceback.print_exc()
            return
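Unlike Examples #1-#3, which commit after every insert, #4 and #5 flush the connection only on every 50th save, trading the durability of the last few rows for fewer round trips. self.count is never advanced inside the snippet; a minimal sketch of the assumed bookkeeping:

class Master(object):
    def __init__(self, dbconn, dbcurr):
        self.dbconn = dbconn
        self.dbcurr = dbcurr
        self.count = 0   # torrents processed so far (assumption)

    def got_torrent(self):
        # ... parse and INSERT as in Example #4 ...
        self.count += 1              # presumably bumped once per save
        if self.count % 50 == 0:     # batch commit, as in the example
            self.dbconn.commit()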
Example #5
    def got_torrent(self):
        if self.metadata_queue.qsize() == 0:
            return
        binhash, address, data, start_time = self.metadata_queue.get()
        if not data:
            return
        try:
            info = self.parse_metadata(data)
            if not info:
                return
        except:
            traceback.print_exc()
            return

        # local wall-clock time, despite the variable name
        utcnow = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

        info_hash = binhash.encode('hex')  # magnet link info hash
        info['info_hash'] = info_hash
        # need to build tags
        info['tagged'] = False
        info['classified'] = False
        info['requests'] = 1
        info['last_seen'] = utcnow
        info['create_time'] = utcnow
        info['source_ip'] = address[0]

        if info.get('files'):
            files = [z for z in info['files'] if not z['path'].startswith('_')]
            if not files:
                files = info['files']
        else:
            files = [{'path': info['name'], 'length': info['length']}]
        files.sort(key=lambda z:z['length'], reverse=True)
        bigfname = files[0]['path']
        info['extension'] = metautils.get_extension(bigfname).lower()
        info['category'] = metautils.get_category(info['extension'])

        try:
            try:
                print '\n', 'Saved', info['info_hash'], info['name'], (time.time()-start_time), 's', address[0]
            except:
                print '\n', 'Saved', info['info_hash']
            ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
                'length,create_time,last_seen,requests) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
                info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests']))
            if self.count % 50 == 0:
                self.dbconn.commit()
                if self.count > 100000:
                    self.count = 0
        except:
            print self.name, 'save error', info
            traceback.print_exc()
            return
Example #6
def save_metadata(dbcurr, binhash, address, start_time, data):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    try:
        info = parse_metadata(data)
        if not info:
            return
    except:
        traceback.print_exc()
        return
    info_hash = binhash.encode('hex')
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)',
                           (info['info_hash'], json.dumps(info['files'])))
        except Exception:
            pass  # ignore insert failures (e.g. an already-stored file list)
        del info['files']

    try:
        sql = (
            'INSERT INTO search_hash(info_hash,category,data_hash,name,extension,'
            'classified,source_ip,tagged,length,create_time,last_seen,requests,'
            'comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        )
        ret = dbcurr.execute(
            sql, (info['info_hash'], info['category'], info['data_hash'],
                  info['name'], info['extension'], info['classified'],
                  info['source_ip'], info['tagged'], info['length'],
                  info['create_time'], info['last_seen'], info['requests'],
                  info.get('comment', ''), info.get('creator', '')))
        dbcurr.connection.commit()
    except:
        print name, 'save error', info
        traceback.print_exc()
        return
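The INSERT statements fix the column lists of the two tables, so their rough shape can be reconstructed. An inferred sketch of the DDL (only the column names come from the code above; every type, length and index is a guess):

SEARCH_HASH_DDL = '''
CREATE TABLE IF NOT EXISTS search_hash (
    id          INT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    info_hash   CHAR(40) NOT NULL UNIQUE,   -- hex of the 20-byte binhash
    category    VARCHAR(32),
    data_hash   CHAR(32),
    name        VARCHAR(255),
    extension   VARCHAR(32),
    classified  TINYINT(1),
    source_ip   VARCHAR(15),
    tagged      TINYINT(1),
    length      BIGINT,
    create_time DATETIME,
    last_seen   DATETIME,
    requests    INT,
    comment     VARCHAR(255),
    creator     VARCHAR(255)
)'''

SEARCH_FILELIST_DDL = '''
CREATE TABLE IF NOT EXISTS search_filelist (
    info_hash CHAR(40) NOT NULL PRIMARY KEY,
    file_list MEDIUMTEXT   -- json.dumps(info['files'])
)'''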
Example #7
def save_metadata(dbcurr, infohash, address, start_time, info):
    utcnow = datetime.datetime.utcnow()
    name = threading.currentThread().getName()
    # try:
    #     info = parse_metadata(data)
    #     if not info:
    #         raise Exception("no info!")
    # except:
    #     traceback.print_exc()
    #     return
    info_hash = infohash
    info['info_hash'] = info_hash
    # need to build tags
    info['tagged'] = False
    info['classified'] = False
    info['requests'] = 1
    info['last_seen'] = utcnow
    info['source_ip'] = address[0]

    if info.get('files'):
        files = [z for z in info['files'] if not z['path'].startswith('_')]
        if not files:
            files = info['files']
    else:
        files = [{'path': info['name'], 'length': info['length']}]
    files.sort(key=lambda z: z['length'], reverse=True)
    bigfname = files[0]['path']
    info['extension'] = metautils.get_extension(bigfname).lower()
    info['category'] = metautils.get_category(info['extension'])

    if 'files' in info:
        try:
            dbcurr.execute('INSERT IGNORE `ssbc`.`search_filelist` VALUES(%s, %s) ',
                           (info['info_hash'], json.dumps(info['files'])))
        except:
            logging.error('%s insert error: %s', name, sys.exc_info()[1])
        del info['files']

    try:
        logging.debug('Saved: %s', info['info_hash'])
        ret = dbcurr.execute(
            'INSERT INTO `ssbc`.`search_hash`(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' +
            'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) ON DUPLICATE KEY UPDATE requests = requests + 1',
            (
            info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
            info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests'],
            info.get('comment', ''), info.get('creator', '')))
        dbcurr.connection.commit()
    except:
        logging.error('%s save error: %r', name, info)
        traceback.print_exc()
        return
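Example #7 is the only variant that resolves duplicates inside the INSERT itself: ON DUPLICATE KEY UPDATE bumps the request counter when the UNIQUE info_hash already exists, replacing the SELECT-then-UPDATE round trip of Example #10. The same idea reduced to its core (column subset chosen for illustration; assumes the UNIQUE index on info_hash):

dbcurr.execute(
    'INSERT INTO search_hash (info_hash, requests, last_seen) '
    'VALUES (%s, 1, %s) '
    'ON DUPLICATE KEY UPDATE requests = requests + 1, last_seen = VALUES(last_seen)',
    (info_hash, utcnow))
dbcurr.connection.commit()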
Example #8
    def run(self):
        self.name = threading.currentThread().getName()
        print self.name, 'started'
        n_reqs = n_valid = n_new = 0
        while True:
            x = q.get()
            print 'get ' + x + ' : ' + self.getName()
            n_reqs += 1

            utcnow = datetime.datetime.utcnow()
            date = (utcnow + datetime.timedelta(hours=8))
            date = datetime.datetime(date.year, date.month, date.day)
            while True:
                # Check if we have this info_hash
                self.dbcurr.execute('SELECT id FROM movie_hash WHERE hash=%s', (x,))
                y = self.dbcurr.fetchone()
                if not y:
                    try:
                        data = self.get_torrent(x)
                    except:
                        traceback.print_exc()
                        break
                    if not data:
                        sys.stdout.write('!')
                        break
                    try:
                        info = self.parse_torrent(data)
                    except:
                        traceback.print_exc()
                        break
                    info['info_hash'] = x
                    info['reqtimes'] = 1
                    info['updatetime'] = utcnow
                    info['source'] = '127.0.0.1'
                    info['create_time'] = utcnow

                    if info.get('files'):
                        files = [z for z in info['files'] if not z['path'].startswith('_')]
                        if not files:
                            files = info['files']
                    else:
                        files = [{'path': info['name'], 'length': info['length']}]
                    files.sort(key=lambda z:z['length'], reverse=True)
                    bigfname = files[0]['path']
                    info['extension'] = metautils.get_extension(bigfname).lower()
                    info['category'] = metautils.get_category(info['extension'])
                    
                    try:
                        if 'files' in info:
                            self.dbcurr.execute('INSERT INTO filelists VALUES(%s, %s)', (info['info_hash'], json.dumps(info['files'])))
                            del info['files']
                    except:
                        traceback.print_exc()
                    try:
                        ret = self.dbcurr.execute('INSERT INTO movie_hash(hash,category,name,source,filesize,createtime,updatetime,reqtimes) VALUES(%s,%s,%s,%s,%s,%s,%s,%s)',
                            (info['info_hash'], info['category'], info['name'], info['source'], 
                            info['length'], info['create_time'], info['updatetime'], info['reqtimes']))
                    except:
                        traceback.print_exc()
                        break
                    n_new += 1
                n_valid += 1

                sys.stdout.write('#')
                sys.stdout.flush()
                self.dbconn.commit()
                break

            if n_reqs >= MAX_READ:
                self.dbcurr.execute('INSERT INTO statusreport(date,new_hashs,total_requests, valid_requests)  VALUES(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE ' +
                    'total_requests=total_requests+%s, valid_requests=valid_requests+%s, new_hashs=new_hashs+%s',
                    (date, n_new, n_reqs, n_valid, n_reqs, n_valid, n_new))
                    
                n_reqs = n_valid = n_new = 0
Example #9
    def got_torrent(self):
        utcnow = datetime.datetime.utcnow()
        binhash, address, data, dtype, start_time = self.metadata_queue.get()
        if dtype == 'pt':
            self.n_downloading_pt -= 1
        elif dtype == 'lt':
            self.n_downloading_lt -= 1
        if not data:
            return
        self.n_valid += 1

        try:
            info = self.parse_torrent(data)
            if not info:
                return
        except:
            traceback.print_exc()
            return
        info_hash = binhash.encode('hex')
        info['info_hash'] = info_hash
        # need to build tags
        info['tagged'] = False
        info['classified'] = False
        info['requests'] = 1
        info['last_seen'] = utcnow
        info['source_ip'] = address[0]

        if info.get('files'):
            files = [z for z in info['files'] if not z['path'].startswith('_')]
            if not files:
                files = info['files']
        else:
            files = [{'path': info['name'], 'length': info['length']}]
        files.sort(key=lambda z:z['length'], reverse=True)
        bigfname = files[0]['path']
        info['extension'] = metautils.get_extension(bigfname).lower()
        info['category'] = metautils.get_category(info['extension'])

        if 'files' in info:
            try:
                self.dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)', (info['info_hash'], json.dumps(info['files'])))
            except:
                print self.name, 'insert error', sys.exc_info()[1]
            del info['files']

        try:
            try:
                print '\n', 'Saved', info['info_hash'], dtype, info['name'], (time.time()-start_time), 's', address[0],
            except:
                print '\n', 'Saved', info['info_hash'],
            ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' + 
                'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
                info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests'],
                info.get('comment',''), info.get('creator','')))
            self.dbconn.commit()
        except:
            print self.name, 'save error', info
            traceback.print_exc()
            return
        self.n_new += 1
Example #10
    def run(self):
        self.name = threading.currentThread().getName()
        print self.name, 'started'
        n_reqs = n_valid = n_new = 0
        while True:
            x = q.get()
            n_reqs += 1

            utcnow = datetime.datetime.utcnow()
            date = (utcnow + datetime.timedelta(hours=8))
            date = datetime.datetime(date.year, date.month, date.day)
            while True:
                # Check if we have this info_hash
                self.dbcurr.execute('SELECT id FROM search_hash WHERE info_hash=%s', (x['info_hash'],))
                y = self.dbcurr.fetchone()
                if not y:
                    try:
                        data = self.fetch_torrent(x['info_hash'])
                    except:
                        traceback.print_exc()
                        break
                    if not data:
                        sys.stdout.write('!')
                        #print self.name, 'Missing torrent file', x['info_hash'], 'from', x['ip']
                        break
                    try:
                        info = self.parse_torrent(data)
                    except:
                        traceback.print_exc()
                        break
                    info['info_hash'] = x['info_hash']
                    # need to build tags
                    info['tagged'] = False
                    info['classified'] = False
                    info['requests'] = 1
                    info['last_seen'] = utcnow
                    info['source_ip'] = x['ip']

                    if info.get('files'):
                        files = [z for z in info['files'] if not z['path'].startswith('_')]
                        if not files:
                            files = info['files']
                    else:
                        files = [{'path': info['name'], 'length': info['length']}]
                    files.sort(key=lambda z:z['length'], reverse=True)
                    bigfname = files[0]['path']
                    info['extension'] = metautils.get_extension(bigfname).lower()
                    info['category'] = metautils.get_category(info['extension'])

                    if 'files' in info:
                        self.dbcurr.execute('INSERT INTO search_filelist VALUES(%s, %s)', (info['info_hash'], json.dumps(info['files'])))
                        del info['files']

                    try:
                        #db.basic.save(info, w=1)
                        ret = self.dbcurr.execute('INSERT INTO search_hash(info_hash,category,data_hash,name,extension,classified,source_ip,tagged,' + 
                            'length,create_time,last_seen,requests,comment,creator) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                            (info['info_hash'], info['category'], info['data_hash'], info['name'], info['extension'], info['classified'],
                            info['source_ip'], info['tagged'], info['length'], info['create_time'], info['last_seen'], info['requests'],
                            info.get('comment',''), info.get('creator','')))
                    except:
                        print 'save error', self.name, info
                        traceback.print_exc()
                        break
                    n_new += 1
                n_valid += 1

                # Check whether we have already logged this request
                #_id_data = '%s-%s-%s' % (x['info_hash'], x['ip'], x['port'])
                #tmpid = binascii.crc32(_id_data) & 0xffffffff
                #if db_log.requests_user.find_one({'_id': tmpid}):
                #    sys.stdout.write('S')
                #    sys.stdout.flush()
                #    break
                #db_log.requests_user.save({'_id': tmpid, 
                #    'time': utcnow})
                
                # update last-seen time and request count
                if y:
                    self.dbcurr.execute('UPDATE search_hash SET last_seen=%s, requests=requests+1 WHERE info_hash=%s', (utcnow, x['info_hash']))

                # update the daily request-volume statistics
                #colname = 'requests_d_' + date.strftime('%y%m%d')
                #db_log[colname].update(
                #    {'_id': bid}, 
                #    {'$inc': {'requests': 1}}
                #    , upsert=True)
                sys.stdout.write('#')
                sys.stdout.flush()
                self.dbconn.commit()
                break

            if n_reqs >= MAX_READ:
                self.dbcurr.execute('INSERT INTO search_statusreport(date,new_hashes,total_requests, valid_requests)  VALUES(%s,%s,%s,%s) ON DUPLICATE KEY UPDATE ' +
                    'total_requests=total_requests+%s, valid_requests=valid_requests+%s, new_hashes=new_hashes+%s',
                    (date, n_new, n_reqs, n_valid, n_reqs, n_valid, n_new))
                    
                n_reqs = n_valid = n_new = 0
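run() reads several module-level names that never appear in the snippet. A sketch of the assumed environment (the queue item shape is taken from the x['info_hash'] / x['ip'] accesses above; MAX_READ's value is a guess):

from Queue import Queue   # stdlib 'queue' on Python 3

# producer threads enqueue dicts describing announce requests
q = Queue()                # e.g. q.put({'info_hash': ..., 'ip': ..., 'port': ...})
MAX_READ = 100             # flush the statusreport counters every N requests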
Example #12
    def startSpider(self):
        if self.spider_queue.empty():
            fetched_users = self.db.execute(
                'SELECT * from spider_list ORDER BY weight DESC limit 0,20')
            if fetched_users <= 0:
                print 'nothing to spider, spider_list is empty'
                return False
            self.start = 'start'
            self.errno = ERR_NO
            fetchall = self.db.fetchall()
            # enqueue the pending sharers fetched from the database
            for item in fetchall:
                self.spider_queue.put({
                    'sid': item[0],
                    'uk': item[1],
                    'file_fetched': item[2],
                    'follow_fetched': item[3],
                    'follow_done': item[4],
                    'file_done': item[5],
                    'weight': item[6],
                    'uid': item[7]
                })
            self.got_follow_count = 0
            self.got_files_count = 0
            self.while_count = 0

        while not self.spider_queue.empty():
            self.while_count += 1
            share_user = self.spider_queue.get()
            # crawl this sharer's file list
            if not share_user['file_done']:
                print '%d now spidering file ,%d  file fetched' % (
                    share_user['uk'], share_user['file_fetched'])
                rs = self.getShareLists(share_user['uk'],
                                        share_user['file_fetched'])
                if not rs:
                    print 'uk:%d failed to fetch files, try again later...' % share_user['uk']
                    return True
                total_count, fetched_count, file_list = rs
                total_fetched = share_user['file_fetched'] + fetched_count
                print 'fetched_file_count:%d' % fetched_count
                if total_fetched >= total_count or total_count == 0:
                    share_user['file_done'] = 1  # all of this sharer's files have been crawled
                if total_count == 0:
                    self.db.execute(
                        "UPDATE spider_list set file_done=%s WHERE sid=%s",
                        (1, share_user['sid']))
                    self.db.commit()
                else:
                    try:
                        files_count = 0
                        for file in file_list:
                            files_count += 1
                            ext = ''
                            file_type = ''
                            file_type_i = -1
                            if file['isdir'] == 0 and file[
                                    'feed_type'] == 'share':
                                ext = metautils.get_extension(
                                    file['title']).lower()
                                file_type = metautils.get_category(ext)
                                file_type_i = self.file_type_t[file_type]
                            time_stamp = int(time.time())
                            self.db.execute(
                                "INSERT INTO share_file (title,uk,shareid,shorturl,isdir,size,md5,ext,feed_time,create_time,file_type,uid,feed_type) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                (file['title'], file['uk'], file['shareid'],
                                 file['shorturl'], file['isdir'], file['size'],
                                 file['md5'], ext, file['feed_time'],
                                 time_stamp, file_type_i, share_user['uid'],
                                 file['feed_type']))
                    except:
                        share_user['file_done'] = 0
                        self.db.rollback()
                        traceback.print_exc()
                        return False
                    else:
                        self.db.execute(
                            "UPDATE spider_list set file_fetched=%s,file_done=%s WHERE sid=%s",
                            (total_fetched, share_user['file_done'],
                             share_user['sid']))
                        self.db.execute(
                            "UPDATE share_users set fetched=%s WHERE uid=%s",
                            (total_fetched, share_user['uid']))
                        share_user['file_fetched'] = total_fetched
                        self.got_files_count += files_count
                        self.db.commit()

            # once the files are done, crawl the follow list
            if share_user['follow_done'] == 0 and share_user['file_done'] == 1:
                print '%d now spidering follow ,%d  follow fetched' % (
                    share_user['uk'], share_user['follow_fetched'])
                rs = self.getFollows(share_user['uk'],
                                     share_user['follow_fetched'])
                if not rs:
                    print 'failed to fetch follows, try again later...'
                    return
                total_count, fetched_count, follow_list = rs
                total_fetched = share_user['follow_fetched'] + fetched_count
                print 'fetched_follow_count:%d' % fetched_count
                if total_fetched >= total_count or total_count == 0:
                    share_user['follow_done'] = 1
                if total_count == 0:
                    self.db.execute("DELETE FROM spider_list WHERE sid=%s",
                                    (share_user['sid'], ))
                    self.db.commit()
                else:
                    try:
                        follow_count = 0
                        for follow in follow_list:
                            follow_count += 1
                            # skip users that are already in the share_users table
                            if self.db.execute(
                                    'SELECT * FROM share_users WHERE uk=%s',
                                (follow['follow_uk'], )) > 0:
                                print 'uk:%d is already in the share_users table' % follow['follow_uk']
                                continue
                            time_stamp = int(time.time())
                            self.db.execute(
                                "INSERT INTO share_users (uk,user_name,avatar_url,intro,follow_count,album_count,"
                                "fens_count,pubshare_count,last_visited,create_time,weight) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
                                (follow['follow_uk'], follow['follow_uname'],
                                 follow['avatar_url'], follow['intro'],
                                 follow['follow_count'], follow['album_count'],
                                 follow['fans_count'],
                                 follow['pubshare_count'], time_stamp,
                                 time_stamp, 5))
                            # add the newly discovered sharer to the crawl list
                            self.db.execute(
                                "INSERT INTO spider_list (uk,uid) VALUES(%s,%s)",
                                (follow['follow_uk'], self.db.last_row_id()))
                    except:
                        share_user['follow_done'] = 0
                        self.db.rollback()
                        traceback.print_exc()
                        return False
                    else:
                        if share_user['follow_done'] == 1:
                            # follows done: this sharer is finished, drop it from the pending list
                            print 'delete follow fetched sid:%d from spider_list' % share_user['sid']
                            self.db.execute(
                                "DELETE FROM spider_list WHERE sid=%s",
                                (share_user['sid'], ))
                        else:
                            self.db.execute(
                                "UPDATE spider_list set follow_fetched=%s,follow_done=%s WHERE sid=%s",
                                (total_fetched, share_user['follow_done'],
                                 share_user['sid']))
                        share_user['follow_fetched'] = total_fetched
                        self.got_follow_count += follow_count
                        self.db.commit()
            # follows not finished yet: requeue the sharer and keep crawling
            if share_user['follow_done'] == 0:
                self.spider_queue.put(share_user)
            else:
                print '%d has done' % share_user['uk']
                del share_user
            time.sleep(SPIDER_INTERVAL)

        print '-----------------Done------------------'
        print 'while_count:%d' % self.while_count
        print 'got_follow_count:%d' % self.got_follow_count
        print 'got_files_count:%d' % self.got_files_count
        return True
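startSpider() likewise leans on state initialized elsewhere. A sketch of the assumed setup (the interval, the error constant and the category-to-integer table are all guesses; only the attribute names come from the code above):

from Queue import Queue   # stdlib 'queue' on Python 3

SPIDER_INTERVAL = 5   # seconds to sleep between sharers
ERR_NO = 0            # 'no error' status code

class Spider(object):
    def __init__(self, db):
        self.db = db
        self.spider_queue = Queue()
        # maps a metautils.get_category label to the integer stored in share_file
        self.file_type_t = {u'视频': 0, u'图像': 1, u'文档书籍': 2,
                            u'压缩文件': 3, u'安装包': 4, u'其他': 5}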