def setup(config, pool, acc, user):
    """Seed the database with test tasks and bring up the fetch pipeline.

    Inserts ``total`` rows via do_setup inside db transactions, starts the
    database loader thread and a fetcher, then polls the fetcher until every
    seeded task has been fetched back.

    Returns a tuple ``(clean, tasks)`` where ``clean`` is the (not yet
    started) cleaner and ``tasks`` is a list of ``(task, row)`` pairs.
    Python 2 code (print statements, xrange).
    """
    total = 20
    rows = {}
    # seed the db: each db_txn call runs do_setup(acc, user) and yields the
    # new row's uuid plus the row payload
    for _ in xrange(total):
        uid, r = db_txn(pool, partial(do_setup, acc, user))
        print uid, r
        rows[uid] = r
    # fetcher/cleaner each get two private condition variables
    fetch = fetcher(config, pool, Condition(Lock()), Condition(Lock()))
    clean = cleaner(config, pool, Condition(Lock()), Condition(Lock()))
    ev = Event()
    ev.clear()
    load = Thread(target=load_database, args=(config, pool, ev, [fetch, clean]))  # @IgnorePep8
    load.start()
    # block until the loader signals the first load is complete
    ev.wait()
    fetch.start()
    tasks = []
    # keep requesting until all seeded tasks have come back from the fetcher
    while len(tasks) < total:
        print 'already fetched: ', len(tasks), 'tasks'
        print 'to fetch tasks from db'
        fetch.request(acc)
        # replies(True) presumably blocks for the next batch -- TODO confirm
        for r in fetch.replies(True):
            ts = r[1]
            print 'fetched', len(ts), 'tasks'
            for t in ts:
                # only collect tasks we seeded ourselves
                if t.uuid in rows:
                    tasks.append((t, rows[t.uuid]))
    return (clean, tasks)
def genLevelSecond(secondLevelurlsfile, outfile, thirdfile):
    """Crawl the second-level URLs listed in *secondLevelurlsfile* and write
    the URLs produced by _genLevelSecond to *outfile*, UTF-8, one per line.

    *thirdfile* is passed straight through to _genLevelSecond -- presumably
    the sink for third-level URLs; TODO confirm against _genLevelSecond.
    Python 2 code (str.encode on the way out).
    """
    f = fetcher(threads=threadsNum)
    with open(secondLevelurlsfile, 'r') as fi:
        urls = fi.readlines()
    # strip trailing CR/LF characters from every input line
    urls = map(lambda x: x.strip("\r\n"), urls)
    ans = _genLevelSecond(f, urls, thirdfile)
    # bug fix: the output file was opened without ever being closed, so
    # buffered lines could be lost on interpreter teardown; 'with'
    # guarantees flush + close even if a write raises.
    with open(outfile, 'w') as fout:
        for url in ans:
            fout.write("%s\n" % url.encode('utf-8', 'ignore'))
def genLevelThree(threeLevelUrlsFile, outFile, failfiles, skipNum):
    """Crawl the third-level URLs listed in *threeLevelUrlsFile* and write
    the URLs produced by _genLevelThree to *outFile*, UTF-8, one per line.

    *failfiles* and *skipNum* are passed through to _genLevelThree --
    presumably the failure log target and the number of leading URLs to
    skip (resume support); TODO confirm against _genLevelThree.
    Python 2 code (str.encode on the way out).
    """
    f = fetcher(threads=threadsNum)
    with open(threeLevelUrlsFile, 'r') as fi:
        urls = fi.readlines()
    # strip trailing CR/LF characters from every input line
    urls = map(lambda x: x.strip("\r\n"), urls)
    ans = _genLevelThree(f, urls, failfiles, skipNum)
    # bug fix: the output file was opened without ever being closed, so
    # buffered lines could be lost on interpreter teardown; 'with'
    # guarantees flush + close even if a write raises.
    with open(outFile, 'w') as fout:
        for url in ans:
            fout.write("%s\n" % url.encode('utf-8', 'ignore'))
def __init__(self, configfile):
    """Load *configfile* and wire up regexes, signals, logger and fetcher.

    Raises RuntimeError if load_config() reports failure. The logger db
    path is derived from the config filename ('foo.yml' -> 'foo.db').
    """
    self.config_file_path = configfile
    if not self.load_config():
        raise RuntimeError("Could not load config file %s" % configfile)
    self.regexes = {}  # filled in by precompile_regexes()
    self.precompile_regexes()
    self.hook_signals()
    self.logger = logger.logger_wrapper(configfile.replace('.yml', '') + ".db")
    self.fetcher = fetcher.fetcher()
def __init__(self, configfile):
    """Load *configfile* and wire up regexes, signals, logger and fetcher.

    Raises RuntimeError if load_config() reports failure. The logger db
    path is derived from the config filename ('foo.yml' -> 'foo.db').
    """
    self.config_file_path = configfile
    if not self.load_config():
        raise RuntimeError("Could not load config file %s" % configfile)
    self.regexes = {}  # filled in by precompile_regexes()
    self.precompile_regexes()
    self.hook_signals()
    self.logger = logger.logger_wrapper(
        configfile.replace('.yml', '') + ".db")
    self.fetcher = fetcher.fetcher()
def main(): if len(sys.argv) != 2: print "Usage: python %s firstLevelurl 1> outfile 2>logFile" % sys.argv[0] exit(1) f = fetcher(threads=threadsNum) with open(sys.argv[1],'r') as fi: urls = fi.readlines() urls = map (lambda x: x.strip("\r\n") , urls) level1Urls = genLevelFirst(f,urls) for url in level1Urls: print url
def start(self):
    """Start the download subsystem: worker downloaders, the fetcher, and
    finally this manager thread itself.
    """
    # init & start all downloaders, each with its own db subdirectory
    for i in range(0, self._dlder_num):
        worker = dlder.downloader(self._db_path+"/downloader_"+str(i))
        worker.start()
        self._dlders.append(worker)
    # init & start a fetcher
    self._fetcher = fetcher.fetcher()
    self._fetcher.start()
    # start the downloader-manager thread (this object is a Thread subclass)
    self._stopflag = False
    threading.Thread.start(self)
def start(self):
    """Start the download subsystem: worker downloaders, the fetcher, and
    finally this manager thread itself.
    """
    # init & start all downloaders, each with its own db subdirectory
    for i in range(0, self._dlder_num):
        worker = dlder.downloader(self._db_path + "/downloader_" + str(i))
        worker.start()
        self._dlders.append(worker)
    # init & start a fetcher
    self._fetcher = fetcher.fetcher()
    self._fetcher.start()
    # start the downloader-manager thread (this object is a Thread subclass)
    self._stopflag = False
    threading.Thread.start(self)
def main():
    """Scrape rootCatId values for each input row and append them as extra
    tab-separated columns in the output file.

    Usage: python prog urlFile outfile skipNum 2>logFile
    Each input line is 'name<TAB>url<TAB>url...'; every non-tmall URL is
    fetched and scanned for 'rootCatId=<digits>'. Lines before skipNum are
    skipped (resume support). Python 2 code (print statement, str.decode).
    """
    if len(sys.argv) != 4:
        print "Usage: python %s urlFile outfile skipNum 2>logFile" % sys.argv[0]
        exit(1)
    count = 0
    # re.M is harmless here: the pattern has no anchors -- presumably a
    # leftover; TODO confirm it can be dropped
    pattern = re.compile("rootCatId=([0-9]+)", re.M)
    f = fetcher(threads=threadsNum)
    fileout = open(sys.argv[2],'w')
    fileLog = open('brand_cat.log','w')
    with open(sys.argv[1],'r') as fi:
        for line in fi:
            line = line.strip('\r\n')
            if(len(line) == 0):
                continue
            count += 1
            # resume: skip already-processed rows
            if(count < int(sys.argv[3])):
                continue
            line = line.decode('utf-8','ignore')
            arr = line.split('\t')
            # progress heartbeat every 10 rows, to stderr and the log file
            if(count % 10 == 0):
                sys.stderr.write("%s:Doing %d %s\n" % (datetime.datetime.now(),count,arr[0].encode('utf-8')))
                fileLog.write("%s:Doing %d %s\n" % (datetime.datetime.now(),count,arr[0].encode('utf-8')))
                fileLog.flush()
            try:
                mystr = arr[0]
                taskCount = 0
                # queue every URL column except tmall ones
                for i in range(len(arr) - 1):
                    if str(arr[i+1]).find('tmall') == -1:
                        taskCount += 1
                        f.push(arr[i+1])
                # drain the fetcher: one pop per pushed URL
                while taskCount > 0:
                    taskCount -= 1
                    url , content = f.pop()
                    # f.taskleft() #watch ans
                    ans = pattern.findall(content)
                    # append every category id found on this page
                    for cat in ans:
                        mystr = mystr + "\t" + cat
                fileout.write("%s\n" % mystr.encode('utf-8','ignore'))
                fileout.flush()
                #print ("%s\n" % mystr.encode('utf-8','ignore'))
            except Exception as err:
                # best-effort per-row: log the failure and move on
                sys.stderr.write("%s:%s\n" % (datetime.datetime.now(),err))
def __init__(self, StayOnDomain=False, maxsites=None, maxWorkThreads=10, fetchToWorkRatio=0.25):
    """Initialise the crawler.

    StayOnDomain -- restrict crawling to the start domain when True.
    maxsites -- presumably an upper bound on pages visited; None means
        unlimited -- TODO confirm where it is enforced.
    maxWorkThreads -- total worker-thread budget.
    fetchToWorkRatio -- fraction of the budget given to the fetcher.
    Python 2 code (Queue module).
    """
    # fetcher gets int(ratio * budget) threads, e.g. 2 of 10 by default
    self.fetcher = fetcher.fetcher(int(fetchToWorkRatio * maxWorkThreads))
    self.indexer = index.indexer()
    self.domains = dict()
    self.words = dict()
    self.domainOnly = StayOnDomain
    self.startdomain = ""      # set later, once crawling begins
    self.maxsites = maxsites
    self.maxthreads = maxWorkThreads
    self.curthreads = 0        # currently running worker threads
    self.input = Queue.Queue() # URLs waiting to be processed
    self.visited = []          # URLs already seen
    self.avtime = 0            # running average, units not visible here
    # every fetched page is handed to ParsePage
    self.fetcher.AddHandler(self.ParsePage)
def main():
    """Meseek CLI entry point: build a query from the command line (or from a
    failed command's output), fetch candidate fixes, and let the user step
    through them interactively. Python 3 code (subprocess.run, input()).
    """
    print(bcolors.BLUE + "Running Meseek v1.0.0" + bcolors.ENDC)
    query = ''
    if (sys.argv[1] == '--custom' or sys.argv[1] == '-c'):
        # --custom: remaining argv words form the query
        # NOTE(review): words are concatenated without spaces -- looks like a
        # bug ("no such" -> "nosuch"); confirm querycleaner compensates
        i=2
        while (i<len(sys.argv)):
            query = query + sys.argv[i]
            i+=1
    else:
        # otherwise run argv[1:] as a command and derive the query from it
        i=1
        command = []
        while (i<len(sys.argv)):
            command.append(sys.argv[i])
            i+=1
        try:
            process = subprocess.run(command,stdout=subprocess.PIPE,stderr=subprocess.PIPE)
        except FileNotFoundError:
            # deliberate trick: the exception CLASS becomes the query, so the
            # truthiness check below routes it to querycleaner
            query = FileNotFoundError
    if (query!=''):
        query = querycleaner(query)
    elif (process.returncode==0):
        # command succeeded: nothing to fix
        query = str(process.stdout)
        query = querycleaner(query)
        print (bcolors.BLUE + "Command executed with no issues." + bcolors.ENDC)
        return
    else:
        # command failed: search for fixes based on its stderr
        query = str(process.stderr)
        query = querycleaner(query)
    fixlist = fetcher (query)
    logging.info('user queried: %s',query)
    decision = 1
    fix_counter = 1
    # interactive menu loop; decision 0 exits
    while (decision!=0):
        if (len(fixlist)==0):
            print(bcolors.BLUE + "No Fix available" + bcolors.ENDC)
            return
        print("==="+query)
        print(bcolors.BLUE + "Please select an option:")
        print("1. Open Fix (" + str(fix_counter) + '/' + str(len(fixlist)) + ')')
        print("2. Share Fix with Meseek")
        print("0. Exit" + bcolors.ENDC)
        # NOTE(review): non-numeric input raises ValueError here and
        # crashes the menu -- confirm whether that is acceptable
        decision = int(input())
        if (decision==1):
            # open the current fix in the browser, then advance the cursor
            url = fixlist[fix_counter-1]
            openurl(url)
            fix_counter += 1
            if (fix_counter==len(fixlist)+1):
                print(bcolors.BLUE + "Meseek ran out of solutions" + bcolors.ENDC)
                decision = 0
        elif (decision==2):
            #Write code here
            logging.warning('User tried to give the solution')
            decision=0
        else:
            print(bcolors.BLUE + "Not a valid option. Exiting." + bcolors.ENDC)
            logging.warning('User tried to use invalid option.')
            decision=0
def __init__(self, start_items):
    """Initialise the scheduler with its three pipeline stages.

    start_items -- seed work items, forwarded to the base class.
    """
    super(scheduler, self).__init__(start_items)
    self.fetcher = fetcher()      # retrieves raw items
    self.processor = processor()  # transforms fetched items
    self.storer = storer()        # persists processed items
def cook(self, url):
    """Fetch *url* and return its content parsed as a BeautifulSoup tree."""
    markup = fetcher(url)
    return BeautifulSoup(markup)
def main():
    """Entry point for the vddb task manager.

    Reads config under $MW_HOME, acquires cluster kingship, then starts the
    loader and the worker threads (fetcher, cleaner, manager, picker) in
    strict dependence order.
    """
    # use MW_HOME to find etc/ & var/ directories
    path = getenv('MW_HOME')
    if path == None:
        stderr.write("MW_HOME not set in environment, program cannot start.")
        _exit(1)
    logging.config.fileConfig('/'.join([path, 'etc', 'logging.conf']),
                              disable_existing_loggers=False)
    config = parse_config('/'.join([path, 'etc', 'vddb_async.conf']))
    start_dbpc(config, 'task_manager')
    # make db connection pool, assign 1 connection to each db accessing thread
    # kingship checker + db loader + task fetcher +
    # task cleanup : a total of 4 threads, + 1 manager thread pool
    db_pool = make_db_pool(
        config, DB_RESERVE_CONN + int(config['cleaner_threads_num']))
    hbase_pool = make_hbase_pool(config)
    # kingship granted event, all threads wait till this event is set
    kev = Event()
    kev.clear()
    # db load finished event, all threads wait till this event is set
    lev = Event()
    lev.clear()
    # conditions each thread wait on, named after the waiter
    fetch_cond = Condition(Lock())
    pick_cond = Condition(Lock())
    manage_cond = Condition(Lock())
    clean_cond = Condition(Lock())
    # kingship checker
    check = Thread(
        target=check_kingship,
        args=(
            config, db_pool, kev, config['tm_module_name'],
            # NOTE: if task manager stops working for X minutes, the
            # tasks won't be scheduled and executed for X
            # minutes since tasks have to be finished in a
            # timely fashion, the task manager master timeout
            # should not be too long
            int(config['tm_master_timeout']),
            logging.getLogger('mwtm_tmcheck')))
    check.start()
    # all other threads wait here
    kev.wait()  # this event is never cleared
    #assert kev.isSet()
    # worker threads, created in dependence order
    fetch = fetcher(config, db_pool, fetch_cond, pick_cond)
    clean = cleaner_cluster(config, db_pool, hbase_pool, clean_cond,
                            manage_cond)
    manage = manager(config, db_pool, manage_cond, pick_cond, clean)
    pick = picker(config, pick_cond, fetch, manage)
    # db config observers, loader will notify them when there's new data loaded
    observers = [fetch, pick, manage, clean]
    load = Thread(target=load_database, args=(config, db_pool, lev, observers))
    # start the loader, and make other threads wait for the first load
    load.start()
    lev.wait()
    assert lev.isSet()
    # start all other threads
    fetch.start()
    clean.start()
    manage.start()
    pick.start()
import time
import os

from fetcher import fetcher
from preprocessing import preprocessing
from sendmessage import *

# Poll loop: roughly every 2 minutes fetch pending messages, preprocess
# them, and send each one to its recipient in the recipient's language.
while True:
    start = time.time()
    df = fetcher()
    df = preprocessing(df)
    # working directory holds the assets sendmessage needs; path is
    # machine-specific -- NOTE(review): consider making this configurable
    os.chdir('/home/pi/Desktop/Athena')
    for dest, msg in zip(df.To, df.Content):
        phno, lang = getInfo(dest)
        msg = changeLang(msg, lang)
        print(phno)
        sendmessage(phno, msg)
    exe_time = time.time() - start
    # bug fix: when a cycle took longer than 120s the original
    # time.sleep(120 - exe_time) received a negative value and raised
    # ValueError, killing the loop; clamp to zero instead.
    time.sleep(max(0, 120 - exe_time))
def main():
    """Entry point for the vddb task manager.

    Reads config under $MW_HOME, acquires cluster kingship, then starts the
    loader and the worker threads (fetcher, cleaner, manager, picker) in
    strict dependence order.
    """
    # use MW_HOME to find etc/ & var/ directories
    path = getenv('MW_HOME')
    if path == None:
        stderr.write("MW_HOME not set in environment, program cannot start.")
        _exit(1)
    logging.config.fileConfig('/'.join([path, 'etc', 'logging.conf']),
                              disable_existing_loggers=False)
    config = parse_config('/'.join([path, 'etc', 'vddb_async.conf']))
    start_dbpc(config, 'task_manager')
    # make db connection pool, assign 1 connection to each db accessing thread
    # kingship checker + db loader + task fetcher +
    # task cleanup : a total of 4 threads, + 1 manager thread pool
    db_pool = make_db_pool(config,
                           DB_RESERVE_CONN + int(config['cleaner_threads_num']))
    hbase_pool = make_hbase_pool(config)
    # kingship granted event, all threads wait till this event is set
    kev = Event()
    kev.clear()
    # db load finished event, all threads wait till this event is set
    lev = Event()
    lev.clear()
    # conditions each thread wait on, named after the waiter
    fetch_cond = Condition(Lock())
    pick_cond = Condition(Lock())
    manage_cond = Condition(Lock())
    clean_cond = Condition(Lock())
    # kingship checker
    check = Thread(target=check_kingship,
                   args=(config, db_pool, kev, config['tm_module_name'],
                         # NOTE: if task manager stops working for X minutes,
                         # the tasks won't be scheduled and executed for X
                         # minutes since tasks have to be finished in a
                         # timely fashion, the task manager master timeout
                         # should not be too long
                         int(config['tm_master_timeout']),
                         logging.getLogger('mwtm_tmcheck')))
    check.start()
    # all other threads wait here
    kev.wait()  # this event is never cleared
    #assert kev.isSet()
    # worker threads, created in dependence order
    fetch = fetcher(config, db_pool, fetch_cond, pick_cond)
    clean = cleaner_cluster(config, db_pool, hbase_pool, clean_cond,
                            manage_cond)
    manage = manager(config, db_pool, manage_cond, pick_cond, clean)
    pick = picker(config, pick_cond, fetch, manage)
    # db config observers, loader will notify them when there's new data loaded
    observers = [fetch, pick, manage, clean]
    load = Thread(target=load_database, args=(config, db_pool, lev, observers))
    # start the loader, and make other threads wait for the first load
    load.start()
    lev.wait()
    assert lev.isSet()
    # start all other threads
    fetch.start()
    clean.start()
    manage.start()
    pick.start()
def cook(self, url): respon = fetcher(url) soup = BeautifulSoup(respon) print 'Done', url return soup