Code example #1
def setup(config, pool, acc, user):
    total = 20
    rows = {}
    for _ in xrange(total):
        uid, r = db_txn(pool, partial(do_setup, acc, user))
        print uid, r
        rows[uid] = r

    fetch = fetcher(config, pool, Condition(Lock()), Condition(Lock()))
    clean = cleaner(config, pool, Condition(Lock()), Condition(Lock()))
    ev = Event()
    ev.clear()
    load = Thread(target=load_database, args=(config, pool, ev, [fetch, clean]))  # @IgnorePep8
    load.start()
    ev.wait()
    fetch.start()

    tasks = []
    while len(tasks) < total:
        print 'already fetched:', len(tasks), 'tasks'
        print 'to fetch tasks from db'
        fetch.request(acc)
        for r in fetch.replies(True):
            ts = r[1]
            print 'fetched', len(ts), 'tasks'
            for t in ts:
                if t.uuid in rows:
                    tasks.append((t, rows[t.uuid]))

    return (clean, tasks)
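
The snippet above blocks on ev.wait() so that the fetcher only starts after load_database has finished its first load. A minimal, self-contained sketch of that Event handshake, with a stand-in loader and observer (the names load_once and notify_loaded are illustrative, not the project's API):

from threading import Event, Thread

def load_once(ev, observers):
    data = {'tasks': []}            # stand-in for the real db load
    for obs in observers:
        obs.notify_loaded(data)     # hypothetical observer callback
    ev.set()                        # release everyone blocked on ev.wait()

class Obs(object):
    def notify_loaded(self, data):
        print('loaded: %r' % (data,))

ev = Event()
Thread(target=load_once, args=(ev, [Obs()])).start()
ev.wait()                           # proceed only after the first load completes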
Code example #2
def genLevelSecond(secondLevelurlsfile, outfile, thirdfile):
    f = fetcher(threads=threadsNum)
    with open(secondLevelurlsfile, 'r') as fi:
        urls = [line.strip("\r\n") for line in fi]
        ans = _genLevelSecond(f, urls, thirdfile)
    # write results inside a with-block so the file is flushed and closed
    with open(outfile, 'w') as fout:
        for url in ans:
            fout.write("%s\n" % url.encode('utf-8', 'ignore'))
Code example #3
def genLevelThree(threeLevelUrlsFile, outFile, failfiles, skipNum):
    f = fetcher(threads=threadsNum)
    with open(threeLevelUrlsFile, 'r') as fi:
        urls = [line.strip("\r\n") for line in fi]
        ans = _genLevelThree(f, urls, failfiles, skipNum)
    # write results inside a with-block so the file is flushed and closed
    with open(outFile, 'w') as fout:
        for url in ans:
            fout.write("%s\n" % url.encode('utf-8', 'ignore'))
Code example #4
File: main.py Project: rambo/httpchecker
 def __init__(self, configfile):
     self.config_file_path = configfile
     if not self.load_config():
         raise RuntimeError("Could not load config file %s" % configfile)
     self.regexes = {}
     self.precompile_regexes()
     self.hook_signals()
     self.logger = logger.logger_wrapper(configfile.replace('.yml', '') + ".db")
     self.fetcher = fetcher.fetcher()
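
precompile_regexes is not shown here; presumably it walks the checks in the loaded config and caches the compiled patterns in self.regexes. A sketch under that assumption (the 'checks' config layout is hypothetical):

import re

def precompile_regexes(config):
    # hypothetical config layout: {'checks': [{'name': ..., 'regex': ...}]}
    regexes = {}
    for check in config.get('checks', []):
        regexes[check['name']] = re.compile(check['regex'])
    return regexes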
Code example #5
def main():
    if len(sys.argv) != 2:
        print "Usage: python %s firstLevelurl  1> outfile 2>logFile" % sys.argv[0]
        exit(1)
    f = fetcher(threads=threadsNum)
    with open(sys.argv[1],'r') as fi:
        urls = fi.readlines()
        urls = map (lambda x: x.strip("\r\n") , urls)
        level1Urls = genLevelFirst(f,urls)
    for url in level1Urls:
        print url
Code example #6
File: dldmgr.py Project: wruibo/ispider
 def start(self):
     #init&start all downloaders
     for i in range(0, self._dlder_num):
         worker = dlder.downloader(self._db_path+"/downloader_"+str(i))
         worker.start()
         self._dlders.append(worker)
     
     #init&start a fetcher
     self._fetcher = fetcher.fetcher()
     self._fetcher.start()
     
     #start downloader manager thread
     self._stopflag = False
     threading.Thread.start(self)        
Code example #7
def main():
    if len(sys.argv) != 4:
        print "Usage: python %s urlFile outfile skipNum 2>logFile" % sys.argv[0]
        exit(1)
    count = 0
    pattern = re.compile("rootCatId=([0-9]+)", re.M)
    f = fetcher(threads=threadsNum)
    fileout = open(sys.argv[2],'w')
    fileLog = open('brand_cat.log','w')
    with open(sys.argv[1],'r') as fi:
        for line in fi:
            line = line.strip('\r\n')
            if len(line) == 0:
                continue
            count += 1
            if count < int(sys.argv[3]):
                continue
            line = line.decode('utf-8', 'ignore')
            arr = line.split('\t')
            if count % 10 == 0:
                sys.stderr.write("%s:Doing %d %s\n" % (datetime.datetime.now(), count, arr[0].encode('utf-8')))
                fileLog.write("%s:Doing %d %s\n" % (datetime.datetime.now(), count, arr[0].encode('utf-8')))
                fileLog.flush()
            try:
                mystr = arr[0]
                taskCount = 0
                for i in range(len(arr) - 1):
                    if str(arr[i+1]).find('tmall') == -1:
                        taskCount += 1
                        f.push(arr[i+1])
                while taskCount > 0:
                    taskCount -= 1
                    url, content = f.pop()
#                    f.taskleft() #watch ans
                    ans = pattern.findall(content)
                    for cat in ans:
                        mystr = mystr + "\t" + cat
                fileout.write("%s\n" % mystr.encode('utf-8', 'ignore'))
                fileout.flush()
                #print ("%s\n" % mystr.encode('utf-8','ignore'))
            except Exception as err:
                sys.stderr.write("%s:%s\n" % (datetime.datetime.now(),err))
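
The loop above assumes a fetcher with a push/pop work-queue interface: push(url) enqueues a download, and pop() later returns a (url, content) pair once some worker thread has fetched it. A minimal sketch of that interface under those assumptions (the real class may differ):

import threading
try:
    import queue                     # Python 3
except ImportError:
    import Queue as queue            # Python 2

class MiniFetcher(object):
    def __init__(self, threads=4):
        self._in = queue.Queue()
        self._out = queue.Queue()
        for _ in range(threads):
            t = threading.Thread(target=self._work)
            t.daemon = True
            t.start()

    def _work(self):
        while True:
            url = self._in.get()
            content = "<stub for %s>" % url   # a real worker would HTTP GET here
            self._out.put((url, content))

    def push(self, url):
        self._in.put(url)

    def pop(self):
        return self._out.get()       # blocks until some download finishes

Note that pop() hands back results in completion order, not push order, which is why the loop above only tracks an outstanding taskCount rather than matching requests to replies.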
Code example #8
File: crawler.py Project: mkalte666/ClawSearch
    def __init__(self,
                 StayOnDomain=False,
                 maxsites=None,
                 maxWorkThreads=10,
                 fetchToWorkRatio=0.25):
        self.fetcher = fetcher.fetcher(int(fetchToWorkRatio * maxWorkThreads))
        self.indexer = index.indexer()

        self.domains = dict()
        self.words = dict()

        self.domainOnly = StayOnDomain
        self.startdomain = ""
        self.maxsites = maxsites

        self.maxthreads = maxWorkThreads
        self.curthreads = 0
        self.input = Queue.Queue()
        self.visited = []

        self.avtime = 0

        self.fetcher.AddHandler(self.ParsePage)
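
AddHandler registers a callback that the fetcher invokes for every fetched page; here the crawler registers its own ParsePage method. A sketch of that dispatch pattern, inferred from the call above rather than taken from the project:

class CallbackFetcher(object):
    def __init__(self):
        self._handlers = []

    def AddHandler(self, fn):
        self._handlers.append(fn)

    def _dispatch(self, url, content):
        # called internally once a page has been downloaded
        for fn in self._handlers:
            fn(url, content)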
Code example #9
File: meseek.py Project: chugh22/meseek
def main():
	print(bcolors.BLUE + "Running Meseek v1.0.0" + bcolors.ENDC)
	query = ''

	# guard: with no arguments, sys.argv[1] below would raise IndexError
	if (len(sys.argv) < 2):
		print(bcolors.BLUE + "Usage: meseek [--custom|-c] <command>" + bcolors.ENDC)
		return
	if (sys.argv[1] == '--custom' or sys.argv[1] == '-c'):
		# join the remaining arguments into one space-separated query
		query = ' '.join(sys.argv[2:])
	else:
		command = sys.argv[1:]
		try:
			process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		except FileNotFoundError:
			query = str(FileNotFoundError)  # stringify so querycleaner gets text, not the exception class
		
		if (query!=''):
			query = querycleaner(query)
		elif (process.returncode==0):
			query = str(process.stdout)
			query = querycleaner(query)
			print (bcolors.BLUE + "Command executed with no issues." + bcolors.ENDC)
			return
		else:
			query = str(process.stderr)
			query = querycleaner(query)

	fixlist = fetcher(query)
	logging.info('user queried: %s',query)

	decision = 1
	fix_counter = 1
	
	while (decision!=0):
		if (len(fixlist)==0):
			print(bcolors.BLUE + "No Fix available" + bcolors.ENDC)
			return
		print("==="+query)
		print(bcolors.BLUE + "Please select an option:")
		print("1. Open Fix (" + str(fix_counter) + '/' + str(len(fixlist)) + ')')
		print("2. Share Fix with Meseek")
		print("0. Exit"  + bcolors.ENDC)
		decision = int(input())
		
		if (decision==1):
			url = fixlist[fix_counter-1]
			openurl(url)
			fix_counter += 1
			if (fix_counter==len(fixlist)+1):
				print(bcolors.BLUE + "Meseek ran out of solutions" + bcolors.ENDC)
				decision = 0
		
		elif (decision==2):
			#Write code here
			logging.warning('User tried to give the solution')
			decision=0

		else:
			print(bcolors.BLUE + "Not a valid option. Exiting." + bcolors.ENDC)
			logging.warning('User tried to use invalid option.')
			decision=0
Code example #10
File: scheduler.py Project: buptjinguodong/spider
 def __init__(self, start_items):
     super(scheduler, self).__init__(start_items)
     self.fetcher = fetcher()
     self.processor = processor()
     self.storer = storer()
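
The scheduler composes the classic fetch -> process -> store pipeline. One cycle plausibly looks like the sketch below; the fetch/process/store method names are assumptions about the three components, not their documented interfaces:

def run_one(sched, item):
    raw = sched.fetcher.fetch(item)          # download the raw page
    record = sched.processor.process(raw)    # extract / transform
    sched.storer.store(record)               # persist the result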
Code example #11
File: cook_soup.py Project: huangzx/archlinux_search
 def cook(self, url):
     respon = fetcher(url)
     soup = BeautifulSoup(respon)
     return soup
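
BeautifulSoup is called here without naming a parser; recent bs4 versions warn about that and pick a parser per-platform, which can change the parse tree. A sketch of the same method with an explicit parser (assuming fetcher returns an HTML string):

from bs4 import BeautifulSoup

def cook(url, fetch):
    html = fetch(url)
    return BeautifulSoup(html, 'html.parser')   # stdlib parser, same result everywhere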
Code example #12
def main():
    # use MW_HOME to find etc/ & var/ directories
    path = getenv('MW_HOME')
    if path is None:
        stderr.write("MW_HOME not set in environment, program cannot start.\n")
        _exit(1)
    logging.config.fileConfig('/'.join([path, 'etc', 'logging.conf']),
                              disable_existing_loggers=False)
    config = parse_config('/'.join([path, 'etc', 'vddb_async.conf']))
    start_dbpc(config, 'task_manager')

    # make db connection pool, assign 1 connection to each db accessing thread
    # kingship checker + db loader + task fetcher +
    # task cleanup : a total of 4 threads, + 1 manager thread pool
    db_pool = make_db_pool(
        config, DB_RESERVE_CONN + int(config['cleaner_threads_num']))
    hbase_pool = make_hbase_pool(config)

    # kingship granted event, all threads wait till this event is set
    kev = Event()
    kev.clear()
    # db load finished event, all threads wait till this event is set
    lev = Event()
    lev.clear()
    # conditions each thread waits on, named after the waiter
    fetch_cond = Condition(Lock())
    pick_cond = Condition(Lock())
    manage_cond = Condition(Lock())
    clean_cond = Condition(Lock())
    # kingship checker
    check = Thread(
        target=check_kingship,
        args=(
            config,
            db_pool,
            kev,
            config['tm_module_name'],
            # NOTE: if the task manager stops working for X minutes,
            #       tasks won't be scheduled or executed for X minutes.
            #       Since tasks have to be finished in a timely fashion,
            #       the task manager master timeout should not be too long.
            int(config['tm_master_timeout']),
            logging.getLogger('mwtm_tmcheck')))
    check.start()
    # all other threads wait here
    kev.wait()
    # this event is never cleared
    #assert kev.isSet()
    # worker threads, created in dependence order
    fetch = fetcher(config, db_pool, fetch_cond, pick_cond)
    clean = cleaner_cluster(config, db_pool, hbase_pool, clean_cond,
                            manage_cond)
    manage = manager(config, db_pool, manage_cond, pick_cond, clean)
    pick = picker(config, pick_cond, fetch, manage)
    # db config observers, loader will notify them when there's new data loaded
    observers = [fetch, pick, manage, clean]
    load = Thread(target=load_database, args=(config, db_pool, lev, observers))
    # start the loader, and make other threads wait for the first load
    load.start()
    lev.wait()
    assert lev.isSet()

    # start all other threads
    fetch.start()
    clean.start()
    manage.start()
    pick.start()
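
Each worker above waits on a Condition "named after the waiter", and producers notify that condition after handing over work. A minimal sketch of the pattern with illustrative names:

from threading import Condition, Lock

class Waiter(object):
    def __init__(self):
        self.cond = Condition(Lock())
        self.inbox = []

    def put(self, item):             # called by a producer thread
        with self.cond:
            self.inbox.append(item)
            self.cond.notify()

    def take(self):                  # called by the waiting worker thread
        with self.cond:
            while not self.inbox:
                self.cond.wait()
            return self.inbox.pop(0)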
Code example #13
import time
import os
from fetcher import fetcher
from preprocessing import preprocessing
from sendmessage import *

while True:
    start = time.time()
    df = fetcher()
    df = preprocessing(df)
    #os.chdir('C:\\Users\\Adithya\\Documents\\GitHub\\Athena')
    #os.chdir('C:\\Users\\Dexter\\Documents\\Projects\\SIH\\Athena')
    os.chdir('/home/pi/Desktop/Athena')
    for dest, msg in zip(df.To, df.Content):
        phno, lang = getInfo(dest)
        msg = changeLang(msg, lang)
        print(phno)
        sendmessage(phno, msg)
    exe_time = time.time() - start
    # guard against a cycle that took longer than the period:
    # time.sleep() raises ValueError on negative arguments
    time.sleep(max(0, 120 - exe_time))
Code example #14
 def cook(self, url):
     respon = fetcher(url)
     soup = BeautifulSoup(respon)
     print 'Done', url
     return soup