def collect_customer_bg(cusid='', page=0, src='', des='', out_dir=vgvars.dir_path['out_dir'], output_name='customer_erp'):
	###### initialize objects
	src = APIStorage(vgvars.erp) if not src else src
	# des = VarStorage({'data': defaultdict(list)}) if not des else des
	# des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name)})
	apicollector = DataCollector(src, des)	

	customer_stmt_temp = vgvars.erp['customer']

	if cusid: 
		customer_stmt_temp = customer_stmt_temp.format('id={}'.format(cusid))
		data = apicollector.fetch_data(customer_stmt_temp)['data']['currentItems']
		des = FileStorage({'fpath': '{}{}_cusid_{}'.format(out_dir, output_name, cusid)})
		apicollector.des = des
		apicollector.insert_data({
			'selected_format': 'json',
			'values': data
		})
	else:
		customer_stmt_temp = customer_stmt_temp.format('page={}')
		if not page:
			page_num = apicollector.fetch_data(customer_stmt_temp.format(1))['data']['totalPage']
		else:
			page_num = page  # assume the caller passed an explicit page count

		for i in range(1, page_num + 1):
			data = apicollector.fetch_data(customer_stmt_temp.format(i))['data']['currentItems']
			des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, i)})			
			apicollector.des = des
			apicollector.insert_data({
				'selected_format': 'json',
				'values': data
			})
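
The else branch above uses a common paging pattern: request page 1 once to learn the total page count, then walk every page. Below is a minimal, self-contained sketch of that pattern; fetch() is a stub standing in for apicollector.fetch_data and returns the same envelope shape the ERP endpoint uses above.

# Stub API: mimics the {'data': {'totalPage': ..., 'currentItems': [...]}} envelope.
def fetch(page):
    pages = {1: {'totalPage': 3, 'currentItems': ['a']},
             2: {'totalPage': 3, 'currentItems': ['b']},
             3: {'totalPage': 3, 'currentItems': ['c']}}
    return {'data': pages[page]}

def collect_all():
    # Page 1 tells us how many pages exist in total.
    page_num = fetch(1)['data']['totalPage']
    items = []
    for i in range(1, page_num + 1):
        items.extend(fetch(i)['data']['currentItems'])
    return items

print(collect_all())  # ['a', 'b', 'c']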
Example #2
    def open(self):
        self.synclock.acquire()
        try:
            self.storage = FileStorage(self.path, self.name, self.extension)
            self.storage.open()
        finally:
            self.synclock.release()
def create(config: StorageConfiguration) -> Storage:
    storage = Storage(
        JsonFormattedHostStorage(FileStorage(config.hosts_filename)),
        JsonFormattedUserAccountStorage(
            FileStorage(config.user_accounts_filename)),
        JsonFormattedUnixAccountStorage(
            FileStorage(config.unix_accounts_filename)),
        UserAccountActivationInMemoryStorage())
    return storage
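
The factory above builds its storage by wrapping plain file stores in JSON-formatting decorators. Here is a minimal, self-contained sketch of that layering idea; the class names are illustrative stand-ins, not the snippet's real API.

import json

class PlainFileStorage:
    """Reads and writes raw text; knows nothing about formats."""
    def __init__(self, filename):
        self.filename = filename

    def write(self, text):
        with open(self.filename, 'w') as f:
            f.write(text)

    def read(self):
        try:
            with open(self.filename) as f:
                return f.read()
        except FileNotFoundError:
            return ''

class JsonFormattedStorage:
    """Serializes records to JSON before delegating to the inner store."""
    def __init__(self, inner):
        self.inner = inner

    def save(self, records):
        self.inner.write(json.dumps(records))

    def load(self):
        text = self.inner.read()
        return json.loads(text) if text else []

store = JsonFormattedStorage(PlainFileStorage('hosts.json'))
store.save([{'name': 'db1'}])
print(store.load())  # [{'name': 'db1'}]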
Example #4
    def __init__(self):
        self.db = FileStorage('./pybl.db')
        self.current_hash = None

        if self.db.empty():
            genesis = Block.genesis_block()
            self.db.put_block(genesis)
            self.tip = genesis.hash
        else:
            self.tip = self.db.get_last_hash()
Example #5
class Blockchain:
    def __init__(self):
        self.db = FileStorage('./pybl.db')
        self.current_hash = None

        if self.db.empty():
            genesis = Block.genesis_block()
            self.db.put_block(genesis)
            self.tip = genesis.hash
        else:
            self.tip = self.db.get_last_hash()

    def __iter__(self):
        self.current_hash = self.tip
        return self

    def __next__(self):
        if self.current_hash == '':
            raise StopIteration

        block = self.db.get_block(self.current_hash)
        prev_hash = block.prev_block_hash
        self.current_hash = prev_hash
        return block

    def add_block(self, data):
        last = self.db.get_last_hash()
        block = Block(data, last)
        self.db.put_block(block)
        self.tip = block.hash
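
__iter__ and __next__ above walk the chain backwards from the tip until the genesis block's empty prev hash. The same traversal in a self-contained form, with a plain dict standing in for FileStorage:

class ChainWalker:
    """Iterates a hash-linked chain from tip back to genesis."""
    def __init__(self, blocks_by_hash, tip):
        self.blocks = blocks_by_hash
        self.current = tip

    def __iter__(self):
        return self

    def __next__(self):
        if self.current == '':          # genesis links to the empty hash
            raise StopIteration
        block = self.blocks[self.current]
        self.current = block['prev']
        return block

blocks = {'h2': {'data': 'second', 'prev': 'h1'},
          'h1': {'data': 'genesis', 'prev': ''}}
for block in ChainWalker(blocks, tip='h2'):
    print(block['data'])                # second, then genesis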
Example #6
class Repo:
    def __init__(self, repoDir):
        if not os.path.exists(repoDir):
            raise InvalidRepo("Invalid Repo path")
        #creating required structure
        self.path = repoDir
        self.storage = FileStorage(os.path.join(repoDir, ".svcs"))
        #create objects directory under .svcs directory
        #os.makedirs(os.path.join(wd,".svcs","objects"))
        #os.makedirs(os.path.join(wd,".svcs","tip"))
        
        
    def commit(self, commitMsg, userId, listOfFiles):
        """Store all the given files and create a commit object."""
        latestCommitId = None
        date = datetime.datetime.utcnow().replace(microsecond=0)
        #parent = None
        files = []
        for itm in listOfFiles:
            filePath = os.path.join(self.path, itm)
            if not os.path.exists(filePath):
                raise InvalidFile

            with open(filePath, "r") as fd:
                fileObj = File(fd.read())
            self.storage.store_object(fileObj)
            latestCommitId = fileObj.id
            files.append([itm, fileObj.id])

        parent = self.storage.get_tip()
        if parent is not None:
            parent = parent.id
        comObj = Commit(userId, commitMsg, date, parent, files)
        self.storage.store_object(comObj)
        self.storage.update_tip(comObj)

        return latestCommitId

    def getLogs(self):
        """Return all log entries in the current repo."""
        currentTip = self.storage.get_tip()
        c = currentTip.id
        logs = []
        while c is not None:
            c = self.storage.get_object(c)
            logs.append({"id": c.files[0][1],
                         "committer": c.committer,
                         "message": c.message})
            c = c.parent
        return logs
Example #7
class Repo:
    def __init__(self, repoDir):
        if not os.path.exists(repoDir):
            raise InvalidRepo("Invalid Repo path")
        #creating required structure
        self.path = repoDir
        self.storage = FileStorage(os.path.join(repoDir, ".svcs"))
        #create objects directory under .svcs directory
        #os.makedirs(os.path.join(wd,".svcs","objects"))
        #os.makedirs(os.path.join(wd,".svcs","tip"))

    def commit(self, commitMsg, userId, listOfFiles):
        """stre all the give files"""
        latestCommitId = None
        date = datetime.datetime.utcnow().replace(microsecond=0)
        #parent = None
        files = []
        for itm in listOfFiles:
            filePath = os.path.join(self.path, itm)
            if not os.path.exists(filePath):
                raise InvalidFile

            fd = open(filePath, "r")
            fileObj = File(fd.read())
            self.storage.store_object(fileObj)
            fd.close()
            latestCommitId = fileObj.id
            files.append([itm, fileObj.id])

        parent = self.storage.get_tip()
        if parent is not None:
            parent = parent.id
        comObj = Commit(userId, commitMsg, date, parent, files)
        self.storage.store_object(comObj)
        self.storage.update_tip(comObj)

        return latestCommitId

    def getLogs(self):
        """ Returns all log entry in the current repo"""
        currentTip = self.storage.get_tip()
        c = currentTip.id
        logs = []
        while c is not None:
            c = self.storage.get_object(c)
            logs.append({
                "id": c.files[0][1],
                "committer": c.committer,
                "message": c.message
            })
            c = c.parent
        return logs
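
store_object and get_object above suggest a content-addressed store: an object's id is derived from its content, so equal content maps to one stored object. A minimal sketch of that scheme follows; the hash-based id is an assumption, since the snippet does not show how File computes it.

import hashlib

class File:
    def __init__(self, content):
        self.content = content
        # Assumed scheme: the id is a digest of the content itself.
        self.id = hashlib.sha1(content.encode('utf-8')).hexdigest()

store = {}                       # stand-in for FileStorage's object store
obj = File('hello\n')
store[obj.id] = obj              # store_object
assert store[obj.id].content == 'hello\n'   # get_object round-trips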
Example #8
    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            url  = 'http://weibo.com/%s/follow?page=%s' %(uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return 0
        
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %(self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            
            return

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
        
        start_time = time.time()
        
        parser = ComFollowsParser(self.storage, uids_storage=self.uids_storage)
        
        num_pages = _crawl(parser, self.uid, page=1)
        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'Due to Sina policy, reduce page count from %s to %s' %(num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
        
                num_pages = settings.PAGE_LIMIT
        
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
                
            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #10
def collect_orders(start_date,
                   end_date,
                   eids=[],
                   src='',
                   des='',
                   out_dir=vgvars.dir_path['out_dir'],
                   output_name='order_data',
                   gen_stmt=get_collect_order_stmt):
    ###### prepare
    has_des = bool(des)

    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)

    ###### collecting
    # make sure that data is collected within a selected period of time, like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date,
                                              end_date)  # in months

    for i in date_list:
        start_date, end_date = i

        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath':
            '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des

        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)

    return db_collector.des.data
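
collect_orders relies on convert_datetime.divide_dates to split the requested range into month-sized chunks so that each query stays bounded. The following is only a guess at what such a helper looks like, for illustration:

from datetime import date, timedelta

def divide_dates(start, end):
    """Split [start, end] into month-aligned (start, end) pairs."""
    pairs, cur = [], start
    while cur <= end:
        # First day of the next month.
        nxt = (cur.replace(day=1) + timedelta(days=32)).replace(day=1)
        pairs.append((cur, min(end, nxt - timedelta(days=1))))
        cur = nxt
    return pairs

print(divide_dates(date(2020, 1, 15), date(2020, 3, 10)))
# three pairs: Jan 15-31, Feb 1-29, Mar 1-10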
def setup_function(function):
    global storage, flower_entry

    with open(STORAGE_FILENAME, 'w') as f:
        pass
    storage = FileStorage(STORAGE_FILENAME)
    flower_entry = FlowerEntry(1, "tree", 1, 1)
Example #12
def process_file(file_url: str) -> Tuple[str, Tuple[str, ...]]:
    """Process file with download, cache and upgrade."""

    _, file_ext = os.path.splitext(file_url)
    folder_hash = md5(file_url.encode('utf-8')).hexdigest()

    path = f"/notebooks/{folder_hash}"
    original = f"original{file_ext}"
    converted = f"converted{file_ext}"

    # TODO: delete the folder completely if `force`
    if not os.path.exists(path):
        file_content = _download_file(file_url)

        os.mkdir(path)
        with open(f"{path}/{original}", "w") as original_file:
            original_file.write(file_content)

        try:
            output = _convert_file(f"{path}/{original}", f"{path}/{converted}")
        except ConvertionException as error:
            shutil.rmtree(path)
            raise error

        with open(f"{path}/output", "w") as summary_output:
            summary_output.write('\n'.join(output))

        shutil.copy('report.txt', f"{path}/report")

        # persist `report.txt` to GCS
        storage = FileStorage()
        storage.save_file('report.txt', folder_hash)

        # found a python file, need to encode separately
        if original.endswith('.py'):
            result_filenames = []
            for py_file in [original, converted]:
                result_filenames.append(_save_ipynb_from_py(path, py_file))

            assert len(result_filenames) == 2
            return path, tuple(result_filenames[:2])

    if original.endswith('.py'):
        return path, (original.replace('.py', '.ipynb'),
                      converted.replace('.py', '.ipynb'))

    return path, (original, converted)
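
process_file keys its cache directory on the MD5 of the URL, so the same URL always lands in the same folder and a repeat request can skip the download. The key derivation in isolation (the URL here is illustrative):

from hashlib import md5

url = 'https://example.com/some/notebook.ipynb'
folder_hash = md5(url.encode('utf-8')).hexdigest()
print(f"/notebooks/{folder_hash}")   # stable cache path for this URL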
Example #13
def local(db='file', folder=None, uids=[]):
    global give_ups
    
    create = create_cookie_file()
    fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
    if create:
        fetcher.login(cookie_filename=cookie_file)
    while give_ups > 0:
        while len(tokens) == 0:
            if give_ups > 0:
                pass
            else:
                return
        
        token = tokens.pop()
        cb = callback(token)
        
        if len(uids) == 0:
            give_ups = 0
        else:
            uid = uids.pop()
            
            try:
                crawler = UserCrawler(uid, is_uid=True, fetcher=fetcher, 
                                      fetch_fans=False, callbacks=cb, span=False)
                uid = crawler.uid
                if db == 'file' and folder is not None:
                    storage = FileStorage(uid, folder)
                elif db == 'mongo':
                    storage = MongoStorage(uid)
                else:
                    raise ValueError('db must be "file" or "mongo"; '
                                     'when it is "file", you must provide the folder parameter.')
                
                if storage.crawled: 
                    storage.complete()
                    cb()
                    continue
                else:
                    crawler.set_storage(storage)
                    crawler.start()
            except Exception, e:
                cb()
                # raise e
                logger.exception(e)
Example #14
def collect_estore_bg(eids=[],
                      eloginnames=[],
                      get_eids=False,
                      get_eids_args=[],
                      get_eids_function=get_ordered_estore_id,
                      src='',
                      des='',
                      out_dir=vgvars.dir_path['out_dir'],
                      output_name='e_bg',
                      gen_stmt=get_collect_estore_bg_stmt,
                      max_query=vgvars.max_vgdb_query_num):

    ###### prepare
    has_des = bool(des)

    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)

    # by default, collect estore bg of estores that have orders in a given period of time
    if get_eids:
        get_eids_args.append(db_collector)
        eids = get_eids_function(*get_eids_args)

    if eloginnames or eids:
        if eloginnames:
            qnum = math.ceil(len(eloginnames) / max_query)
        else:
            qnum = math.ceil(len(eids) / max_query)

        for i in range(qnum):

            start_index = i * max_query
            end_index = start_index + max_query

            selectedloginnames = eloginnames[start_index:end_index]
            selectedeids = eids[start_index:end_index]

            des = des if has_des else FileStorage({
                'fpath':
                '{}{}_{}_{}'.format(out_dir, output_name, start_index,
                                    end_index - 1)
            })
            db_collector.des = des

            if selectedeids or selectedloginnames:
                logging.debug('collect estore bg from {} to {}'.format(
                    start_index, end_index - 1))
                stmt = get_collect_estore_bg_stmt(
                    estore_loginnames=selectedloginnames,
                    estore_ids=selectedeids,
                )
                db_collector.fetch_data(stmt)
                db_collector.insert_data()

    else:  # TODO: collect bg when no eids or eloginnames are provided
        pass
Example #15
    def __init__(self):
        self.parser = AdvertisementParser()
        self.storage = MongoStorage(
            'adv_data') if storage_type == 'mongo' else FileStorage('adv_data')
        if isinstance(self.storage, MongoStorage):
            self.links = self.storage.load('adv_links', {'flag': False})
        else:
            self.links = self.storage.load('lnk')
        self.queue = self.create_queue()
def test_storage_delete_entry_by_name_false():
    # Given
    another_storage = FileStorage(STORAGE_FILENAME)

    # When
    deleted = storage.delete_entry_by_name(flower_entry)
    
    # Then
    assert not deleted
    assert_is_entry_in_storage(storage, flower_entry, amount=0, exists=False)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=0, exists=False)
Example #17
    def crawl_weibos(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'Crawl user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            html = self._fetch_weibo(uid, page)
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return 0
            
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            
            return
        
        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
        
        start_time = time.time()
        
        parser = ComWeibosParser(self.uid, self.storage, weibos_storage=self.weibos_storage)
        
        num_pages = _crawl(parser, self.uid, page=1)

        pages = [i for i in xrange(2, num_pages+1)]
        """
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            
            worker_manager.wait_all_complete()
        """
        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
def test_storage_add_entry():
    # Given
    

    # When
    storage.add_entry(flower_entry)
    
    another_storage = FileStorage(STORAGE_FILENAME)

    # Then
    assert_is_entry_in_storage(storage, flower_entry, amount=1, exists=True)
    assert_is_entry_in_storage(another_storage, flower_entry, amount=1, exists=True)
Example #19
    def crawl_msg_reposts(self):
        def _crawl(parser, msg_id, page, num_pages=''):
            msg = 'Crawl message(%s)\'s reposts-page:%s:%s' %(self.msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_repost(msg_id, page)

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
            except:
                pass

            return num_pages

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:
            msg = 'Not exist: %s.' %self.msg_url
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)

        start_time = time.time()

        parser = ComRepostsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)

            worker_manager.wait_all_complete()

        cost_time = int(time.time() - start_time)

        msg = ('Crawl message(%s)\'s reposts: total page=%s,'
               ' cost time=%s sec, connections=%s'
               %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
Example #20
def collect_estore_contracts_erp(src='',
                                 des='',
                                 contract_ids=[],
                                 from_date='',
                                 serviceName='',
                                 page=0,
                                 out_dir=vgvars.dir_path['out_dir'],
                                 output_name='e_erp_contract'):

    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src
    des = VarStorage({'data': defaultdict(list)}) if not des else des
    apicollector = DataCollector(src, des)

    ####### perform
    contract_stmt_temp = vgvars.erp['contract']
    contract_dict = defaultdict(list)

    if not page:
        page_num = apicollector.fetch_data(contract_stmt_temp.format(
            1, '', ''))['data']['totalPage']
    else:
        page_num = page  # assume the caller passed an explicit page count

    for n in range(1, page_num + 1):
        data = apicollector.fetch_data(contract_stmt_temp.format(
            n, '', ''))['data']['currentItems']
        if from_date:
            df = DataFrame(data)
            origin_dates = df['createdDateTime']
            df['createdDateTime'] = pd.to_datetime(df['createdDateTime'])
            selectedDf = df[df['createdDateTime'] >= from_date]
            selectedDf['createdDateTime'] = selectedDf['createdDateTime'].map(
                lambda x: x.strftime('%Y-%m-%d'))
            selectedDf = selectedDf.T
            selected_data = selectedDf.to_dict()

            group_contract_by_start_date(contract_dict, selected_data)
            if len(selected_data) < len(data): break

        else:
            group_contract_by_start_date(contract_dict, data)

    for m in contract_dict:
        apicollector.des = FileStorage(
            {'fpath': '{}{}_{}'.format(out_dir, output_name, m)})
        apicollector.insert_data({
            'selected_format': 'json',
            'values': contract_dict[m]
        })
def collect_dept_bg(company_id=vgvars.erp_default['vghn'], src='', des='', out_dir=vgvars.dir_path['out_dir'], output_name='dept_erp'):

	###### initialize objects
	src = APIStorage(vgvars.erp) if not src else src
	# des = VarStorage({'data': defaultdict(list)}) if not des else des
	des = FileStorage({'fpath': '{}{}_{}'.format(out_dir, output_name, company_id)})
	apicollector = DataCollector(src, des)

	####### perform
	dept_stmt_temp = vgvars.erp['dept']

	data = apicollector.fetch_data(dept_stmt_temp.format(company_id))['data']

	apicollector.insert_data({
		'selected_format': 'json',
		'values': data
	})
Example #22
def main():
    args = docopt(__doc__)

    storage = FileStorage("entries.txt")
    bailer.init_storage(storage)

    if args.get('getall'):
        print(bailer.get_all_flowers())
    elif args.get('add'):
        print(
            bailer.add_flower(args.get('<flower-name>'),
                              args.get('<watering-interval>')))
    elif args.get('remove'):
        print(bailer.remove_flower(args.get('<flower-name>')))
    elif args.get('water'):
        if args.get('--force'):
            print(bailer.water_flower_force(args.get('<flower-name>')))
        else:
            print(bailer.water_flower(args.get('<flower-name>')))
Example #23
    def check_new_weibos(self):
        def _crawl(parser, uid, page, num_pages=''):
            msg = 'check new weibo in user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return 0

        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)

        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:
            return

        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)

            return

        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)

        start_time = time.time()

        parser = ComWeibosParser(self.uid, self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s'
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
def collect_estore_spending(start_date,
                            end_date,
                            eids=[],
                            get_eids=False,
                            src='',
                            des='',
                            out_dir=vgvars.dir_path['out_dir'],
                            output_name='e_spending_data',
                            gen_stmt=get_collect_estore_spending_stmt):
    ###### prepare
    has_des = bool(des)

    ###### initialize objects
    src = DBStorage(vgvars.vgdb) if not src else src
    db_collector = VGDBCollector(src)

    ###### collecting
    # make sure that data is collected within a selected period of time, like a year, a month, or a week
    date_list = convert_datetime.divide_dates(start_date,
                                              end_date)  # in months

    for d in date_list:
        start_date, end_date = d

        # update destination storage
        # des = GSStorage(des)
        des = des if has_des else FileStorage({
            'fpath':
            '{}{}_{}_{}'.format(out_dir, output_name, start_date, end_date)
        })
        db_collector.des = des

        # not good enough, since data is checked against all estores having orders in a month
        if get_eids:
            min_ord_id, max_ord_id = db_collector.get_max_min_id_by_time(
                start_date, end_date)
            eids = collect_estore_ids(min_ord_id, max_ord_id, db_collector)

        # collect data
        collect(start_date, end_date, db_collector, eids, gen_stmt=gen_stmt)

    return db_collector.des.data
Example #25
    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:
            return
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            return
        
        msg = 'Crawl user(%s)\'s profile' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        
        self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
        
        start_time = time.time()

        url    = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)
        
        html   = self._fetch(url, query=settings.QUERY_INFO)
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
        except:
            pass
    
        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s' 
               %(self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
def collect_staff_bg_erp(compids=['315', '319', '305', '320'],
                         page=0,
                         src='',
                         des='',
                         out_dir=vgvars.dir_path['out_dir'],
                         output_name='staff_bg_erp'):
    ###### initialize objects
    src = APIStorage(vgvars.erp) if not src else src

    apicollector = DataCollector(src, '')

    ####### perform
    dept_stmt_temp = vgvars.erp['staffbg']

    if compids:
        for compid in compids:

            if not page:
                page_num = apicollector.fetch_data(
                    dept_stmt_temp.format(compid, 1, '',
                                          ''))['data']['totalPage']
            else:
                page_num = page  # assume the caller passed an explicit page count
            datalist = []
            for n in range(1, page_num + 1):
                data = apicollector.fetch_data(
                    dept_stmt_temp.format(compid, n, '',
                                          ''))['data']['currentItems']
                datalist.extend(data)

            des = FileStorage(
                {'fpath': '{}{}_{}'.format(out_dir, output_name, compid)})
            apicollector.des = des
            apicollector.insert_data({
                'selected_format': 'json',
                'values': datalist
            })
    def crawl_msg_comments(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' %(msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_comment(msg_id, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)

                return num_pages
            except:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:      #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if msg_id is False:
            msg = 'Not exist: %s.' %self.msg_url
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)

        start_time = time.time()

        parser = ComCommentsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)

        if num_pages is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
            except:
                pass

            return None

        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:    #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl message(%s)\'s comments: total page=%s,'
               ' cost time=%s sec, connections=%s'
               %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True
    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            
            return False
        
        msg = 'Crawl user(%s)\'s profile' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        
        self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
        
        start_time = time.time()

        url    = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)
        
        html   = self._fetch(url, query=settings.QUERY_INFO)
        
        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s' 
               %(self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
                
        if html is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            
            return None
        
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
            
            return True
        except:
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            
            return None    #error occur
class ComWeiboCrawler(object):
    def __init__(self, fetcher, store_path, **kwargs):
        self.fetcher    = fetcher
        self.store_path = store_path
        
        self.uid     = kwargs.get('uid', None)
        self.msg_url = kwargs.get('msg_url', None)
        self.window  = kwargs.get('window', None)
                
    def _check_page_right(self, html):
        '''
        Check whether the page was fetched before login (wrong page) or after (the right one).
        '''
        
        if html is None:
            return False
        
        if len(html) == 0:
            msg = u'Weibo has been redesigned; the info tag layout has changed'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
        
        return not (u'<title>' in html)
    
    def _fetch_weibo(self, uid, page):
        html = self.fetcher.fetch_weibo(uid, page)
        
        page_right = self._check_page_right(html)
        
        if page_right is None:
            return None
        
        if page_right:
            return html
        
        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' %(tries, sec), self.window)
            time.sleep(sec)
            
            html = self.fetcher.fetch_weibo(uid, page)
            page_right = self._check_page_right(html)
            
            if page_right:
                return html
            
            tries += 1
        
        return None
    
    def _fetch(self, url, query):
        html = self.fetcher.fetch(url, query)
        
        page_right = self._check_page_right(html)
        
        if page_right is None:
            return None
        
        if page_right:
            return html
        
        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' %(tries, sec), self.window)
            time.sleep(sec)
            
            html = self.fetcher.fetch(url, query)
            page_right = self._check_page_right(html)
            
            if page_right:
                return html
            
            tries += 1
        
        return None
    
    def _fetch_msg_repost(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
        
        page_right = self._check_page_right(html)

        if page_right is None:
            return None, None   # callers unpack (html, num_pages)
        
        if page_right:
            return html, num_pages
        
        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' %(tries, sec), self.window)
            time.sleep(sec)
            
            html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
            page_right = self._check_page_right(html)
            
            if page_right:
                return html, num_pages
            
            tries += 1
        
        return None, None
 
    def _fetch_msg_comment(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
        
        page_right = self._check_page_right(html)

        if page_right is None:
            return None, None   # callers unpack (html, num_pages)
        
        if page_right:
            return html, num_pages
        
        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()
            
            sec = (tries + 1) * 10
            write_message('_fetch trying: %s, sleep: %s seconds' %(tries, sec), self.window)
            time.sleep(sec)
            
            html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
            page_right = self._check_page_right(html)
            
            if page_right:
                return html, num_pages
            
            tries += 1
        
        return None, None
                
    def crawl_weibos(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s weibos-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            html = self._fetch_weibo(uid, page)
            
            if html is None:
                return None
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None
            
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            
            return False
        
        self.storage = FileStorage(self.uid, settings.MASK_WEIBO, self.store_path)
        
        start_time = time.time()
        
        parser = ComWeibosParser(self.uid, self.storage)
        
        num_pages = _crawl(parser, self.uid, page=1)
        
        if num_pages is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.weibos_fp, self.storage.weibos_f_name)
            except:
                pass
            
            return None
                
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            
            if is_None:    #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
            
                try:
                    self.storage.delete(self.storage.weibos_fp, self.storage.weibos_f_name)
                except:
                    pass
            
                return None
        
        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        
        return True
            
    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
        
            url  = 'http://weibo.com/%s/follow?page=%s' %(uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)
            
            if html is None:
                return None
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None
        
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
        
        if not is_exist:
            msg = 'Not exist: %s.' %(self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            
            return False

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW, self.store_path)
        
        start_time = time.time()
        
        parser = ComFollowsParser(self.storage)
        
        num_pages = _crawl(parser, self.uid, page=1)
        
        if num_pages is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
            except:
                pass
            
            return None
        
        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'Due to Sina policy, reduce page count from %s to %s' %(num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
        
                num_pages = settings.PAGE_LIMIT
        
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
                
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            
            if is_None:    #error occur: _crawl return None
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
               
                try:
                    self.storage.delete(self.storage.follows_fp, self.storage.follows_f_name)
                except:
                    pass
               
                return None

        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        
        return True

    def crawl_fans(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' %(self.uid, num_pages, page)
            write_message(msg, self.window)
            
            url  = 'http://weibo.com/%s/fans?page=%s' %(uid, page)
            html = self._fetch(url, query=settings.QUERY_FANS)
            
            if html is None:
                return None
            
            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None
            
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
        
        if not is_exist:
            msg = 'Not exist: %s.' %(self.uid)
            logger.info(msg)
            write_message(msg, self.window)
            
            return False
        
        self.storage = FileStorage(self.uid, settings.MASK_FAN, self.store_path)
        
        start_time = time.time()
        
        parser = ComFansParser(self.storage)
        
        num_pages = _crawl(parser, self.uid, page=1)
        
        if num_pages is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
            except:
                pass
            
            return None
        
        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'Due to Sina policy, reduce page count from %s to %s' %(num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)
        
                num_pages = settings.PAGE_LIMIT
                
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)
            
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            
            if is_None:    #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                
                try:
                    self.storage.delete(self.storage.fans_fp, self.storage.fans_f_name)
                except:
                    pass
            
                return None
            
        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        
        return True
        
    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' %self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)
        
        if is_exist is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
        
        if not is_exist:
            msg = 'Not exist: %s.' %self.uid
            logger.info(msg)
            write_message(msg, self.window)
            
            return False
        
        msg = 'Crawl user(%s)\'s profile' %self.uid
        logger.info(msg)
        write_message(msg, self.window)
        
        self.storage = FileStorage(self.uid, settings.MASK_INFO, self.store_path)
        
        start_time = time.time()

        url    = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)
        
        html   = self._fetch(url, query=settings.QUERY_INFO)
        
        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s' 
               %(self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
                
        if html is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            
            return None
        
        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)
            
            return True
        except:
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.infos_fp, self.storage.infos_f_name)
            except:
                pass
            
            return None    #error occur

    def crawl_msg_reposts(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s reposts-page:%s:%s' %(self.msg_id, num_pages, page)
            write_message(msg, self.window)
        
            html, num_pages = self._fetch_msg_repost(msg_id, page)
            
            if html is None:
                return None
            
            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
                
                return num_pages
            except:
                return None
        
        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)
        
        if msg_id is None:      #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
                        
        if msg_id is False:
            msg = 'Not exist: %s.' %self.msg_url            
            logger.info(msg)
            write_message(msg, self.window)
            
            return False
          
        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_REPOST, self.store_path)
        
        start_time = time.time()
        
        parser = ComRepostsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)
        
        if num_pages is None:   #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.reposts_fp, self.storage.reposts_f_name)
            except:
                pass
            
            return None
        
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
            
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            
            if is_None:    #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
            
                try:
                    self.storage.delete(self.storage.reposts_fp, self.storage.reposts_f_name)
                except:
                    pass
            
                return None
            
        cost_time = int(time.time() - start_time)
        
        msg = ('Crawl message(%s)\'s reposts: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        
        return True 
    
    def crawl_msg_comments(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' %(msg_id, num_pages, page)
            write_message(msg, self.window)
        
            html, num_pages = self._fetch_msg_comment(msg_id, page)
            
            if html is None:
                return None
            
            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)
                
                return num_pages
            except:
                return None
        
        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)
        
        if msg_id is None:      #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            return None
            
        if msg_id is False:
            msg = 'Not exist: %s.' %self.msg_url            
            logger.info(msg)
            write_message(msg, self.window)
            
            return False 
        
        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT, self.store_path)
        
        start_time = time.time()
        
        parser = ComCommentsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)
        
        if num_pages is None:    #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)
            
            try:
                self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
            except:
                pass
            
            return None
        
        pages = [i for i in xrange(2, num_pages+1)]
        if len(pages) > 0:
            n_threads = 5
            
            worker_manager = WorkerManager(n_threads)
            
            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg, num_pages)
            
            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()
            
            if is_None:    #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)
                
                try:
                    self.storage.delete(self.storage.comments_fp, self.storage.comments_f_name)
                except:
                    pass
                        
                return None
        
        cost_time = int(time.time() - start_time)
            
        msg = ('Crawl message(%s)\'s comments: total page=%s,'
               ' cost time=%s sec, connections=%s' 
               %(self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)
        
        return True
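
All the _fetch* helpers in this class share one retry shape: fetch, validate the page, and on a bad but recoverable page re-check the cookie and wait longer each round. That shape, distilled into a self-contained sketch; fetch and check here are caller-supplied stand-ins, not the crawler's real methods.

import time

def fetch_with_retry(fetch, check, max_tries=10):
    """fetch() returns a page; check() returns True, False, or None (fatal)."""
    html = fetch()
    ok = check(html)
    if ok is None:                     # unrecoverable, e.g. page layout changed
        return None
    tries = 0
    while not ok and tries <= max_tries:
        time.sleep((tries + 1) * 10)   # linear backoff: 10s, 20s, 30s, ...
        html = fetch()
        ok = check(html)
        tries += 1
    return html if ok else None

print(fetch_with_retry(lambda: '<html>ok</html>', lambda h: 'ok' in h))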
Example #30
class Persistent(object):
    def __init__(self, name, **kw):
        super(Persistent, self).__init__()
        self.name = name
        self.storage = None
        self.synclock = RLock()
        self.path = kw.get('path', PDODIR)
        self.encode = kw.get('encode', repr)
        self.decode = kw.get('decode', eval)
        self.extension = kw.get('extension', 'dat')
        self.autopersist = kw.get('autopersist', True)
        if self.autopersist:
            self.load()

    def open(self):
        self.synclock.acquire()
        try:
            self.storage = FileStorage(self.path, self.name, self.extension)
            self.storage.open()
        finally:
            self.synclock.release()

    def close(self):
        self.synclock.acquire()
        try:
            self.storage.close()
            self.storage = None
        finally:
            self.synclock.release()

    def closed(self):
        storage = self.storage
        if storage is None:
            return True
        elif storage.closed():
            return True
        return False

    def update_storage(self):
        """
            Serialize the data associated with this object and
            update the storage record to match.
        """
        self.synclock.acquire()
        try:
            data = self.getencoded()
            self.storage.set(data)
        finally:
            self.synclock.release()

    def update_data(self):
        self.synclock.acquire()
        try:
            data = self.storage.getdata()
            self.setencoded(data)
        finally:
            self.synclock.release()

    def commit(self):
        """
            Update storage with most recent data, then 
            commit changes.
        """
        self.synclock.acquire()
        try:
            self.update_storage()
            self.storage.commit()
            self.notify_committed()
        finally:
            self.synclock.release()

    def load(self):
        """
            Load most recently stored data, then update 
            current data with loaded content.
        """
        self.synclock.acquire()
        try:
            if self.storage is None:
                self.open()
            self.storage.load()
            self.update_data()
            self.notify_loaded()
        finally:
            self.synclock.release()

    def serialize(self, data):
        if self.encode is not None:
            data = self.encode(data)
        return data

    def unserialize(self, data):
        if self.decode is not None:
            data = self.decode(data)
        return data

    def getencoded(self):
        """
            Return encoded representation of current data object.

            This method must be overridden in type-specific 
            subclasses.
        """
        raise TypeError("Method must be overridden")

    def setencoded(self, data):
        """
            Use encoded representation of persisted data object 
            to update current data object.

            This method must be overridden in type-specific 
            subclasses.
        """
        raise TypeError("Method must be overridden")

    def notify_committed(self):
        pass

    def notify_loaded(self):
        pass
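The getencoded/setencoded docstrings above require a type-specific subclass. A minimal sketch of one, assuming the Persistent class above is importable (the persistent module name is hypothetical):

from persistent import Persistent


class PersistentDict(Persistent):
    """Persist a plain dict through the default repr()/eval() codecs."""

    def __init__(self, name, **kw):
        self.data = {}  # must exist before load() runs in Persistent.__init__
        super(PersistentDict, self).__init__(name, **kw)

    def getencoded(self):
        return self.serialize(self.data)  # applies self.encode (repr)

    def setencoded(self, data):
        self.data = self.unserialize(data)  # applies self.decode (eval)

With autopersist left on, PersistentDict('settings') loads the last committed state on construction, and commit() writes self.data back through FileStorage.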
Beispiel #31
0
class MainWindow(QDialog):
    def __init__(self, parent=None):
        QDialog.__init__(self, parent)
        
        
        self.storage   = FileStorage()
        #self.languages = Languages()
        
        
        self.setWindowTitle("Find snippet")
        self.setWindowFlags(self.windowFlags() | Qt.WindowStaysOnTopHint)
        
        
        # -----------------------------------------------------------
        # Window layout
        self.input = QLineEdit(self)
        self.input.setMinimumWidth(300)
        QObject.connect(self.input, SIGNAL('returnPressed()'), self.on_return)
        
        self.outcome = QLabel("")
        
        layout = QVBoxLayout()
        layout.addWidget(self.input)
        layout.addWidget(self.outcome)
        layout.setSizeConstraint(QLayout.SetFixedSize)
        self.setLayout(layout)
        
        
        # -----------------------------------------------------------
        # In window shortcuts
        def create_shortcut(keys, slot, *args):
            shortcut = QShortcut(self)
            shortcut.setKey(keys)
            if slot:
                if args:
                    QObject.connect(shortcut, SIGNAL("activated()"), partial(slot, *args))
                else:
                    QObject.connect(shortcut, SIGNAL("activated()"), slot)
            
        for i in xrange(0, 10):
            create_shortcut("Ctrl+%d" % i, self.on_copy, i)
            create_shortcut("Shift+Ctrl+%d" % i, self.on_delete, i)
            
        create_shortcut("Esc", self.on_escape)
        
        create_shortcut("Ctrl+Up",   self.on_page, 'prev')
        create_shortcut("Ctrl+Down", self.on_page, 'next')
        create_shortcut("Up",   self.on_page, 'prev')
        create_shortcut("Down", self.on_page, 'next')
        
        
        # -----------------------------------------------------------
        # Systray and global shortcuts
        self.systray = KSystemTrayIcon(self)
        self.systray.setIcon(QIcon(icon_path()))
        self.systray.show()
        
        def add_action(systray, id, text, icon, shortcut, slot):
            action = systray.actionCollection().addAction(id)
            action.setText(text)
            action.setIcon(icon)
            if shortcut:
                ashortcut =  KShortcut(shortcut)
                action.setShortcut(ashortcut)
                action.setGlobalShortcut(ashortcut)
            self.connect(action, SIGNAL("triggered()"), slot)
            
            menu = systray.contextMenu()
            menu.addAction(action)
            
        add_action(self.systray, 'find-snippet', "Find snippet", QIcon(icon_path()), 'Ctrl+Alt+B', self.on_toggle)
        add_action(self.systray, 'add-snippet',  "Add snippet",  QIcon(icon_path()), 'Ctrl+Alt+N', self.on_add)
        
        
        self.add_dialog = AddDialog(self)
        self.set_results([])
    
    def closeEvent(self, event):
        self.setVisible(False)
        event.ignore()
    
    def on_systray(self, reason):
        # QSystemTrayIcon.DoubleClick
        if reason == QSystemTrayIcon.Trigger:
            self.on_toggle()
        if reason == QSystemTrayIcon.MiddleClick:
            self.on_add()
    
    def on_toggle(self, *a):
        if self.isVisible():
            self.hide()
        else:
            self.show()
    
    def on_add(self):
        #self.add_dialog.show()
        self.add_dialog.display()
        
    def on_copy(self, nr):
        nr = (nr - 1) % 10  # Ctrl+0 selects the tenth result on the page
        if nr < (len(self.search_results) - 10 * self.search_page):
            text = self.search_results[10 * self.search_page + nr].code
            QApplication.clipboard().setText(text)
            self.close()
    
    def on_delete(self, nr):
        nr = (nr - 1) % 10  # Ctrl+0 selects the tenth result on the page
        if nr < (len(self.search_results) - 10 * self.search_page):
            snippet = self.search_results[10 * self.search_page + nr]
            
            reply = QMessageBox.question(
                self,
                "Delete snippet",
                "Delete this snippet?" + format_code(snippet.code, snippet.lang),
                QMessageBox.Yes|QMessageBox.Default,
                QMessageBox.No|QMessageBox.Escape
            )
            if reply == QMessageBox.Yes:
                self.storage.delete(snippet)
                self.close()
    
    def on_return(self, *a):
        query_str = unicode(self.input.text())
        try:
            query_ast = parse(query_str)
            
            result = self.storage.search(query_ast)
            self.set_results(result)
        except ParseError:
            self.display_error()
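The on_copy and on_delete slots above fold the Ctrl+1..Ctrl+9/Ctrl+0 shortcuts and the current results page into a single list index. The same arithmetic as a standalone helper (a sketch; per_page is a parameter here, while the class hard-codes 10):

def slot_to_index(nr, page, per_page=10):
    """Map a Ctrl+<digit> slot (1-9, with 0 meaning the tenth) to an index."""
    return per_page * page + (nr - 1) % per_page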
Beispiel #33
0
class ComWeiboCrawler(object):
    def __init__(self, fetcher, store_path, **kwargs):
        self.fetcher = fetcher
        self.store_path = store_path

        self.uid = kwargs.get('uid', None)
        self.msg_url = kwargs.get('msg_url', None)
        self.window = kwargs.get('window', None)

    def _check_page_right(self, html):
        '''
        Check whether the page was fetched before or after login.

        Returns True when the page looks valid, False when it looks like
        a pre-login page, and None when the expected markup has changed.
        '''

        if html is None:
            return False

        if len(html) == 0:
            msg = u'Weibo has been redesigned; the expected info tags have changed'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        return u'<title>' not in html
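
    # Note: _fetch_weibo, _fetch, _fetch_msg_repost and _fetch_msg_comment
    # below share one retry scheme: when a page looks wrong, re-check the
    # login cookie, back off for (tries + 1) * 10 seconds, and give up
    # after 10 tries.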

    def _fetch_weibo(self, uid, page):
        html = self.fetcher.fetch_weibo(uid, page)

        page_right = self._check_page_right(html)

        if page_right is None:
            return None

        if page_right:
            return html

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message(
                '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                self.window)
            time.sleep(sec)

            html = self.fetcher.fetch_weibo(uid, page)
            page_right = self._check_page_right(html)

            if page_right:
                return html

            tries += 1

        return None

    def _fetch(self, url, query):
        html = self.fetcher.fetch(url, query)

        page_right = self._check_page_right(html)

        if page_right is None:
            return None

        if page_right:
            return html

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message(
                '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                self.window)
            time.sleep(sec)

            html = self.fetcher.fetch(url, query)
            page_right = self._check_page_right(html)

            if page_right:
                return html

            tries += 1

        return None

    def _fetch_msg_repost(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)

        page_right = self._check_page_right(html)

        if page_right is None:
            return None

        if page_right:
            return html, num_pages

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message(
                '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                self.window)
            time.sleep(sec)

            html, num_pages = self.fetcher.fetch_msg_reposts(msg_id, page)
            page_right = self._check_page_right(html)

            if page_right:
                return html, num_pages

            tries += 1

        return None, None

    def _fetch_msg_comment(self, msg_id, page=1):
        html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)

        page_right = self._check_page_right(html)

        if page_right is None:
            return None

        if page_right:
            return html, num_pages

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message(
                '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                self.window)
            time.sleep(sec)

            html, num_pages = self.fetcher.fetch_msg_comments(msg_id, page)
            page_right = self._check_page_right(html)

            if page_right:
                return html, num_pages

            tries += 1

        return None, None

    def crawl_weibos(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s weibos-page: %s:%s' % (self.uid,
                                                            num_pages, page)
            write_message(msg, self.window)

            html = self._fetch_weibo(uid, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)

        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % self.uid
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.storage = FileStorage(self.uid, settings.MASK_WEIBO,
                                   self.store_path)

        start_time = time.time()

        parser = ComWeibosParser(self.uid, self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.weibos_fp,
                                    self.storage.weibos_f_name)
            except:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.weibos_fp,
                                        self.storage.weibos_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)
        msg = ('Crawl user(%s)\'s weibos: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True

    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid,
                                                             num_pages, page)
            write_message(msg, self.window)

            url = 'http://weibo.com/%s/follow?page=%s' % (uid, page)
            html = self._fetch(url, query=settings.QUERY_FOLLOWS)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW,
                                   self.store_path)

        start_time = time.time()

        parser = ComFollowsParser(self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.follows_fp,
                                    self.storage.follows_f_name)
            except:
                pass

            return None

        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'Due to Sina policy, reducing page count from %s to %s' % (
                    num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)

                num_pages = settings.PAGE_LIMIT

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur: _crawl return None
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.follows_fp,
                                        self.storage.follows_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True

    def crawl_fans(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages,
                                                          page)
            write_message(msg, self.window)

            url = 'http://weibo.com/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url, query=settings.QUERY_FANS)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN,
                                   self.store_path)

        start_time = time.time()

        parser = ComFansParser(self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.fans_fp,
                                    self.storage.fans_f_name)
            except:
                pass

            return None

        if settings.PAGE_LIMIT != 0:
            if num_pages > settings.PAGE_LIMIT:
                msg = 'Due to Sina policy, reducing page count from %s to %s' % (
                    num_pages, settings.PAGE_LIMIT)
                write_message(msg, self.window)

                num_pages = settings.PAGE_LIMIT

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.fans_fp,
                                        self.storage.fans_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True

    def crawl_infos(self):
        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % self.uid
            logger.info(msg)
            write_message(msg, self.window)

            return False

        msg = 'Crawl user(%s)\'s profile' % self.uid
        logger.info(msg)
        write_message(msg, self.window)

        self.storage = FileStorage(self.uid, settings.MASK_INFO,
                                   self.store_path)

        start_time = time.time()

        url = 'http://weibo.com/%s/info' % self.uid
        parser = ComInfosParser(self.uid, self.storage)

        html = self._fetch(url, query=settings.QUERY_INFO)

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s infos: cost time=%s sec, connections=%s' %
               (self.uid, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        if html is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.infos_fp,
                                    self.storage.infos_f_name)
            except:
                pass

            return None

        try:
            pq_doc = pq(html)
            parser.parse(pq_doc)

            return True
        except:
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.infos_fp,
                                    self.storage.infos_f_name)
            except:
                pass

            return None  #error occur

    def crawl_msg_reposts(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s reposts-page:%s:%s' % (self.msg_id,
                                                               num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_repost(msg_id, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)

                return num_pages
            except:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_REPOST,
                                   self.store_path)

        start_time = time.time()

        parser = ComRepostsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.reposts_fp,
                                    self.storage.reposts_f_name)
            except:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg,
                                       num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.reposts_fp,
                                        self.storage.reposts_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl message(%s)\'s reposts: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True

    def crawl_msg_comments(self):
        def _crawl(parser, msg_id, page, num_pages='?'):
            msg = 'Crawl message(%s)\'s comments-page:%s:%s' % (
                msg_id, num_pages, page)
            write_message(msg, self.window)

            html, num_pages = self._fetch_msg_comment(msg_id, page)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                parser.parse(pq_doc)

                return num_pages
            except:
                return None

        msg = 'Checking: whether message exists or not...'
        write_message(msg, self.window)
        msg_id = self.fetcher.check_message(self.msg_url)

        if msg_id is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if msg_id is False:
            msg = 'Not exist: %s.' % self.msg_url
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.msg_id = msg_id
        self.storage = FileStorage(self.msg_id, settings.MASK_COMMENT,
                                   self.store_path)

        start_time = time.time()

        parser = ComCommentsParser(msg_id, self.storage)
        num_pages = _crawl(parser, self.msg_id, 1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.comments_fp,
                                    self.storage.comments_f_name)
            except:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.msg_id, pg,
                                       num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.comments_fp,
                                        self.storage.comments_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl message(%s)\'s comments: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.msg_id, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True
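Each crawl_* method above returns one of three values: None when a network or parse error occurred (any partial output file is deleted), False when the user or message does not exist, and True on success. A hedged sketch of a caller dispatching on that contract, using the module-level logger the examples already assume:

def run(crawler):
    result = crawler.crawl_weibos()
    if result is None:
        logger.info('crawl failed; partial output was removed')
    elif result is False:
        logger.info('target does not exist; nothing to do')
    else:
        logger.info('crawl finished successfully')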
Beispiel #35
0
 def __init__(self, cities=default_cities, link=base_link):
     self.cities = cities
     self.link = link
     self.storage = MongoStorage(
         'adv_links') if storage_type == 'mongo' else FileStorage(
             'adv_links')
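The snippet above picks its backend from a module-level storage_type flag. The same decision expressed as a small factory function (a sketch; the MongoStorage and FileStorage constructors are assumed from the snippet):

def make_storage(storage_type, collection='adv_links'):
    if storage_type == 'mongo':
        return MongoStorage(collection)
    return FileStorage(collection)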
Beispiel #36
0
class CnWeiboCrawler(object):
    def __init__(self, fetcher, store_path, uid, window=None):
        self.fetcher = fetcher
        self.store_path = store_path
        self.uid = uid
        self.window = window

    def _check_page_right(self, html):
        if html is None:
            return False

        try:
            pq_doc = pq(html)
            title = pq_doc.find('title').text().strip()
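            # u'微博广场' (Weibo Square) and u'新浪微博-新浪通行证' (the Sina
            # login page) are the titles served before login.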
            return title != u'微博广场' and title != u'新浪微博-新浪通行证'
        except AttributeError:
            return False

    def _fetch(self, url):
        html = self.fetcher.fetch(url)

        page_right = self._check_page_right(html)

        if page_right:
            return html

        tries = 0
        while not page_right and tries <= 10:
            time.sleep(10)
            self.fetcher.check_cookie()

            sec = (tries + 1) * 10
            write_message(
                '_fetch trying: %s, sleep: %s seconds' % (tries, sec),
                self.window)
            time.sleep(sec)

            html = self.fetcher.fetch(url)
            page_right = self._check_page_right(html)

            if page_right:
                return html

            tries += 1

        return None

    def crawl_follows(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s follows-page: %s:%s' % (self.uid,
                                                             num_pages, page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/follow?page=%s' % (uid, page)
            html = self._fetch(url)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.storage = FileStorage(self.uid, settings.MASK_FOLLOW,
                                   self.store_path)

        start_time = time.time()

        parser = CnFollowsParser(self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.follows_fp,
                                    self.storage.follows_f_name)
            except:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur: _crawl return None
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.follows_fp,
                                        self.storage.follows_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s follows: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True

    def crawl_fans(self):
        def _crawl(parser, uid, page, num_pages='?'):
            msg = 'Crawl user(%s)\'s fans-page: %s:%s' % (self.uid, num_pages,
                                                          page)
            write_message(msg, self.window)

            url = 'http://weibo.cn/%s/fans?page=%s' % (uid, page)
            html = self._fetch(url)

            if html is None:
                return None

            try:
                pq_doc = pq(html)
                return parser.parse(pq_doc)
            except:
                return None

        msg = 'Checking: whether user(%s) exists or not...' % self.uid
        write_message(msg, self.window)
        is_exist = self.fetcher.check_user(self.uid)

        if is_exist is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            return None

        if not is_exist:
            msg = 'Not exist: %s.' % (self.uid)
            logger.info(msg)
            write_message(msg, self.window)

            return False

        self.storage = FileStorage(self.uid, settings.MASK_FAN,
                                   self.store_path)

        start_time = time.time()

        parser = CnFansParser(self.storage)

        num_pages = _crawl(parser, self.uid, page=1)

        if num_pages is None:  #error occur
            msg = 'Error'
            logger.info(msg)
            write_message(msg, self.window)

            try:
                self.storage.delete(self.storage.fans_fp,
                                    self.storage.fans_f_name)
            except:
                pass

            return None

        pages = [i for i in xrange(2, num_pages + 1)]
        if len(pages) > 0:
            n_threads = 5

            worker_manager = WorkerManager(n_threads)

            for pg in pages:
                worker_manager.add_job(_crawl, parser, self.uid, pg, num_pages)

            worker_manager.wait_all_complete()
            is_None = worker_manager.get_result()
            worker_manager.stop()

            if is_None:  #error occur
                msg = 'Error'
                logger.info(msg)
                write_message(msg, self.window)

                try:
                    self.storage.delete(self.storage.fans_fp,
                                        self.storage.fans_f_name)
                except:
                    pass

                return None

        cost_time = int(time.time() - start_time)

        msg = ('Crawl user(%s)\'s fans: total page=%s,'
               ' cost time=%s sec, connections=%s' %
               (self.uid, num_pages, cost_time, self.fetcher.n_connections))
        logger.info(msg)
        write_message(msg, self.window)

        return True
Beispiel #42
0
from typing import Optional

from contextlib import closing
from hashlib import md5
from storage import FileStorage
from db import Blob, Session
from storage import ExistsError, NotFoundError
from exceptions import DbCorruptionError

storage_backend = FileStorage()

def withsession(fn):
    def inner(sess=None, *args, **kwargs):
        if sess is None:
            # Open a fresh session and make sure it is closed afterwards.
            with closing(Session()) as sess:
                return fn(sess=sess, *args, **kwargs)
        return fn(sess, *args, **kwargs)
    return inner

@withsession
def store(data: bytes, sess: Optional[Session] = None):
    if sess is not None:
        data_id = md5(data).hexdigest()
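        # The md5 digest doubles as a content-addressed id: identical blobs
        # collide on purpose and trigger the ExistsError check below.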

        existing = sess.query(Blob).get(data_id)
        if existing is not None:
            raise ExistsError

        blob = Blob(id=data_id)
        sess.add(blob)