def putRequest(queue, payload=None):
    while not queue.empty():
        response = {}  # fresh dict per node so appended results do not alias one object
        data = {}
        resourceURI = queue.get(timeout=DMON_TIMEOUT)
        response['Node'] = resourceURI
        try:
            if payload is None:
                r = requests.put(resourceURI, timeout=20)
            else:
                r = requests.put(resourceURI, data=payload, timeout=20)
            if r.headers['Content-Type'] == 'application/json':
                data = r.json()  # call json(); bare r.json is the bound method
            else:
                data = r.text
            response['StatusCode'] = r.status_code
            response['Data'] = data
        except requests.exceptions.Timeout:
            response['StatusCode'] = 408
            response['Data'] = data
        except requests.exceptions.ConnectionError:
            response['StatusCode'] = 404
            response['Data'] = 'n/a'

        GreenletRequests.NodeResponsesPost.append(response)
        # print 'Threaded PUT with ID ' + str(GreenletRequests.npo) + ' executed for ' + resourceURI
        app.logger.info(
            '[%s] : [INFO] Thread PUT with ID %s executed for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.npo), resourceURI)
        GreenletRequests.npo += 1
        gevent.sleep(0)
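A worker like putRequest is typically driven by pre-filling a gevent queue with node URIs and spawning one greenlet per concurrent request. A minimal driver sketch follows; the node URIs and worker count are illustrative placeholders, not values from this project.

import gevent
from gevent.queue import Queue

nodeQueue = Queue()
for uri in ['http://node1:5222/agent', 'http://node2:5222/agent']:  # placeholders
    nodeQueue.put(uri)

# One greenlet per slot; each drains the shared queue until it is empty.
workers = [gevent.spawn(putRequest, nodeQueue) for _ in range(2)]
gevent.joinall(workers)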
def randomT(queue, name):
    while not queue.empty():
        t = queue.get(timeout=1)
        gevent.sleep(5)
        print "I am + " + name + " executing " + str(GreenletRequests.ng)
        GreenletRequests.ng += 1
        gevent.sleep(0)
def putRequest(queue, payload=None):
    while not queue.empty():
        response = {}  # fresh dict per node so appended results do not alias one object
        data = {}
        resourceURI = queue.get(timeout=1)
        response["Node"] = resourceURI
        try:
            if payload is None:
                r = requests.put(resourceURI, timeout=20)
            else:
                r = requests.put(resourceURI, data=payload, timeout=20)
            if r.headers["Content-Type"] == "application/json":
                data = r.json()  # call json(); bare r.json is the bound method
            else:
                data = r.text
            response["StatusCode"] = r.status_code
            response["Data"] = data
        except requests.exceptions.Timeout:
            response["StatusCode"] = 408
            response["Data"] = data
        except requests.exceptions.ConnectionError:
            response["Node"] = resourceURI
            statusCode["StatusCode"] = 404
            response["Data"] = "n/a"

        GreenletRequests.NodeResponsesPost.append(response)
        print "Threaded PUT with ID " + str(GreenletRequests.npo) + " executed for " + resourceURI
        GreenletRequests.npo += 1
        gevent.sleep(0)
def deleteRequest(queue):
    while not queue.empty():
        response = {}  # fresh dict per node so appended results do not alias one object
        resURI = queue.get(timeout=DMON_TIMEOUT)
        try:
            r = requests.delete(resURI, timeout=DMON_TIMEOUT)
            data = r.json()
            response['Node'] = resURI
            response['StatusCode'] = r.status_code
            response['Data'] = data
        except requests.exceptions.Timeout:
            response['Node'] = resURI
            response['StatusCode'] = 408
            response['Data'] = 'n/a'
        except requests.exceptions.ConnectionError:
            response['Node'] = resURI
            response['StatusCode'] = 404
            response['Data'] = 'n/a'

        GreenletRequests.NodeResponsesGet.append(response)
        # print 'Threaded DELETE with ID ' + str(GreenletRequests.nd) + ' executed for ' + resURI
        app.logger.info(
            '[%s] : [INFO] Thread DELETE with ID %s executed for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.nd), resURI)
        GreenletRequests.nd += 1
        gevent.sleep(0)
Example #5
File: vtdl.py Project: jbremer/vtdl
def _download_helper():
    while not queue.empty():
        h = queue.get()
        if not h:
            break

        r = requests.get(VT_DOWNLOAD, params={"apikey": apikey, "hash": h})
        open(h, "wb").write(r.content)
Example #7
def response_generator():
    ii = 0
    while npending or not queue.empty():
        ii += 1
        result = queue.get()
        msg = '{} {}\n'.format(ii, result)
        print(msg, end='')
        yield msg
    t2 = datetime.datetime.now()
    print('====', t2 - t1)
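Because it yields one line per result, response_generator can serve directly as a streaming HTTP body. A hedged sketch of wiring it into Flask follows; the Flask app and the queue/npending/t1 globals are assumed to be set up elsewhere, since the snippet does not show them.

from flask import Flask, Response

app = Flask(__name__)

@app.route('/results')
def results():
    # Stream each queued result to the client as soon as it is available.
    return Response(response_generator(), mimetype='text/plain')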
Example #8
def RegexpMatchWait(queue):
    if queue.empty():
        gevent.sleep(1)
        return ''
    (tweet_dic, match_result) = queue.get()
    if tweet_dic is None or match_result is None:
        return "\n"
    result_dic = tweet_dic.copy()
    result_dic['match_result'] = match_result
    logging.info('waiting tweet text got: %s' % str(result_dic))
    return "%s\n" % json.dumps(result_dic)
def randomT(queue, name):
    while not queue.empty():
        t = queue.get(timeout=1)
        gevent.sleep(5)
        # print 'I am + ' + name + ' executing ' + str(GreenletRequests.ng)
        app.logger.info(
            '[%s] : [INFO] %s executing %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            name, str(GreenletRequests.ng))
        GreenletRequests.ng += 1
        gevent.sleep(0)
Example #11
def send_message(socket):
    global queue
    while True:
        try:
            if not queue.empty():
                #print("QUEUE NOT EMPTY")
                message = queue.get(block=False)
                if not socket.closed:
                    socket.send(json.dumps(message))
                    #print('Sent response')

            #We need a sleep call so that other greenlets can run
            gevent.sleep()
        except Exception as e:
            print("SEND: %s" % e)
            raise e
Example #12
def init():

    #queue init
    #main.queue.put("")
    #main.pool.spawn(getLink).join()

    #give worker pool
    print('start crawling')
    #while not pool.free_count() == 15:
    while not queue.empty():
        gevent.sleep(0.8)
        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(getData)

    #wait for everything complete
    pool.join()
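The loop in init is a common gevent idiom: keep spawning workers while there is queued work and free pool slots, then join the pool. A minimal self-contained version of the same pattern, with placeholder work items:

import gevent
from gevent.pool import Pool
from gevent.queue import Queue, Empty

queue = Queue()
pool = Pool(5)

def worker():
    try:
        item = queue.get(timeout=0)
    except Empty:
        return
    print('processing %s' % item)  # placeholder work

for i in range(20):
    queue.put(i)

# Spawn while there is queued work and free pool slots, then wait.
while not queue.empty():
    gevent.sleep(0.1)
    for _ in range(min(queue.qsize(), pool.free_count())):
        pool.spawn(worker)
pool.join()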
Example #14
    def task_thread(self, queue):
        """
        Executes tasks in queue
        """
        while not self.shutdown.is_set():
            if not queue.empty():
                (job, task) = queue.get_nowait()

                # Don't run the task if the job is done
                if job.status in [Status.ERROR, Status.ABORT]:
                    task.status = Status.ABORT
                else:
                    options = {}
                    gpu_id = -1
                    try:
                        if isinstance(task, model_tasks.TrainTask):
                            ### Select GPU
                            if len(self.gpu_list):
                                for gpu in self.gpu_list:
                                    if not gpu['active']:
                                        gpu_id = gpu['index']
                                        gpu['active'] = True
                                        break
                                assert gpu_id != -1, 'no available GPU'
                            else:
                                gpu_id = None
                            options['gpu_id'] = gpu_id

                        task.run(**options)

                    except Exception as e:
                        logger.error('%s: %s' % (type(e).__name__, e),
                                     job_id=job.id())
                        task.exception = e
                        task.traceback = traceback.format_exc()
                        task.status = Status.ERROR
                    finally:
                        ### Release GPU
                        if gpu_id != -1 and gpu_id is not None:
                            for gpu in self.gpu_list:
                                if gpu['index'] == gpu_id:
                                    gpu['active'] = False
            else:
                # Wait before checking again for a task
                time.sleep(utils.wait_time())
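The GPU bookkeeping above takes no lock, which is safe only if a single scheduler thread touches self.gpu_list. A hedged sketch of the same reserve/release logic factored into a context manager (a hypothetical refactor, not code from this project):

import contextlib

@contextlib.contextmanager
def reserve_gpu(gpu_list):
    # Pick the first idle GPU, mark it busy, and always release it afterwards.
    gpu_id = None
    for gpu in gpu_list:
        if not gpu['active']:
            gpu_id = gpu['index']
            gpu['active'] = True
            break
    assert gpu_id is not None, 'no available GPU'
    try:
        yield gpu_id
    finally:
        for gpu in gpu_list:
            if gpu['index'] == gpu_id:
                gpu['active'] = False

Usage would be roughly "with reserve_gpu(self.gpu_list) as gpu_id: task.run(gpu_id=gpu_id)", which keeps the release on the error path automatic.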
Example #15
def _download_helper():
    t = time.time()
    while not queue.empty():
        h = queue.get()
        if not h:
            break

        if h == "wait":
            time.sleep(max(0, 60 - time.time() + t))
            t = time.time()
            continue

        if os.path.exists(h):
            print "skipping..", h
            continue

        r = requests.get(VT_DOWNLOAD, params={"apikey": apikey, "hash": h})
        open(h, "wb").write(r.content)
def getrequestFile(queue, output):
    while not queue.empty():
        response = {}  # fresh dict per node so appended results do not alias one object
        resURI = queue.get(timeout=1)
        app.logger.info(
            '[%s] : [INFO] Thread File GET with ID %s starts execution for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.ng), resURI)
        hostURL = urlparse(resURI)
        hostID = hostURL.hostname
        logName = 'worker-%s.tar' % hostID
        logDump = os.path.join(output, logName)
        try:
            r = requests.get(resURI, timeout=DMON_TIMEOUT, stream=True)
            if r.status_code == 200:
                with open(logDump, 'wb') as out_file:  # TODO: investigate chunked writing
                    shutil.copyfileobj(r.raw, out_file)

            response['Node'] = resURI
            response['StatusCode'] = r.status_code
            response['LogName'] = logDump
            response['Headers'] = r.headers
            del r
        except requests.exceptions.Timeout:
            response['Node'] = resURI
            response['StatusCode'] = 408
            response['LogName'] = logDump
        except requests.exceptions.ConnectionError:
            response['Node'] = resURI
            response['StatusCode'] = 404
            response['LogName'] = logDump

        GreenletRequests.NodeResponsesGet.append(response)
        # print 'Threaded GET with ID ' + str(GreenletRequests.ng) + ' executed for ' + resURI
        app.logger.info(
            '[%s] : [INFO] Thread File GET with ID %s executed for %s',
            datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
            str(GreenletRequests.ng), resURI)
        GreenletRequests.ng += 1
        gevent.sleep(0)
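getrequestFile expects a queue of per-node log URLs plus an output directory. A hedged driver sketch follows; the URLs and output path are placeholders, not values from this project.

import gevent
from gevent.queue import Queue

logQueue = Queue()
for uri in ['http://worker1:5222/v1/logs', 'http://worker2:5222/v1/logs']:  # placeholders
    logQueue.put(uri)

fetchers = [gevent.spawn(getrequestFile, logQueue, '/tmp/dmon-logs') for _ in range(2)]
gevent.joinall(fetchers)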
Example #18
def scrape_base_url():
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)

    # Queue a (parser, kwargs) work item for every company link that has text
    for x in tree.xpath('//div[@class="st-text"]//td/a'):
        if x.xpath('./text()'):
            queue.put_nowait((parse_comp, {
                'url': domain + x.xpath('./@href')[0],
                'name': x.xpath('./text()')[0]
            }))

    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            pool.start(pool.spawn(t[0], t[1]))
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
def deleteRequest(queue):
    while not queue.empty():
        response = {}  # fresh dict per node so appended results do not alias one object
        resURI = queue.get(timeout=1)
        try:
            r = requests.delete(resURI, timeout=2)
            data = r.json()
            response["Node"] = resURI
            response["StatusCode"] = r.status_code
            response["Data"] = data
        except requests.exceptions.Timeout:
            response["Node"] = resURI
            response["StatusCode"] = 408
            response["Data"] = "n/a"
        except requests.exceptions.ConnectionError:
            response["Node"] = resURI
            response["StatusCode"] = 404
            response["Data"] = "n/a"

        GreenletRequests.NodeResponsesGet.append(response)
        print "Threaded DELETE with ID " + str(GreenletRequests.nd) + " executed for " + resURI
        GreenletRequests.nd += 1
        gevent.sleep(0)
Example #20
def getData():
    #loop only as many times as there are URLs on hand
    global error_count
    error_log = open('./err.txt', mode='a')
    while not queue.empty():
        #take a stored link from the queue
        #pool workers request each link; several times faster than synchronous calls
        link = queue.get(timeout=0)
        if link != "":
            gevent.sleep(0.3)
            getdata = requests.get(link)
            soup = BS(getdata.text, 'lxml-xml')
            #validation check
            okflag = soup.find('resultCode')
            try:
                if okflag.text != '00':
                    print("okflag: ", okflag.text)

                    raise ValueError('resultCode is not 00')
                else:
                    #on a successful lookup, parse the items for the spreadsheet
                    #pool map method vs pool map_async:
                    #need to decide which is more efficient
                    print(len(soup.find_all('item')))
                    pool_excel.map(makeCSV, soup.find_all('item'))

            except Exception:
                error_log.write(link + '\n')
                error_log.write('==================================\n')
                error_count += 1
                error_log.write(str(error_count) + '\n')
                queue.put(link)

    print('stop crawling')
    print(main_row)
    error_log.close()
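On any failure getData logs the link and puts it straight back on the queue, so a bad link is retried until it succeeds; there is no retry cap, meaning a permanently broken link loops forever. A bounded variant of the same requeue idea (hypothetical, not from this project):

MAX_RETRIES = 3
retries = {}

def requeue(link):
    # Retry a failed link a bounded number of times instead of forever.
    retries[link] = retries.get(link, 0) + 1
    if retries[link] <= MAX_RETRIES:
        queue.put(link)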
Example #21
        if result != '':
            print 'Found [%s][%s] in %s' % (result, link, tag)
            a += 1
            if tag in json_dict:
                json_dict[tag].append((result, link, imgs[i]))
            else:
                json_dict[tag] = list()
                json_dict[tag].append((result, link, imgs[i]))

r = session.get(url)
tree = html.fromstring(r.text)
a_tags = tree.xpath('//li[@class="menu-item"]//a')
tags = [(x.xpath('.//@href'), repr(x.xpath('.//text()'))) for x in a_tags]

for t in tags:
    url = t[0]
    result = regex.findall(t[1])
    # print url, result
    # scrape(url[0], result[0])
    queue.put((url[0], result[0]))


while not queue.empty() and not pool.full():
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(worker)
pool.join()
print a
print 'Time Taken : ', datetime.now() - start_time
with open('data.json', 'w') as fp:
    json.dump(json_dict, fp)
Example #22
                      (response.status_code, url))

        except gevent.queue.Empty:
            print('queue empty')
            break


if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('USAGE:\n\t%s <base_url> <entry_path>' % sys.argv[0])
        sys.exit(1)

    if not validators.url(sys.argv[1]):
        print('Invalid Url')
        sys.exit(1)

    queue.put(getUrl(sys.argv[2]))
    pool.spawn(crawler)

    while 1:
        if queue.empty() and pool.free_count() == WORKER_COUNT:
            print('No more links left and nothing running')
            break

        for x in range(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(crawler)
        gevent.sleep(0.1)

    # Wait for everything to complete
    pool.join()
Example #23
            break

        print "job done"
        handler.log("job done")
        print "so far crawled %s pages" % crawled
        handler.log("so far crawled %s pages" % crawled)


queue.put(start_url_1)
queue.put(start_url_2)
pool.spawn(crawler)
handler = Handler()

print 'starting Crawler...'
handler.log('starting Crawler...')
while not queue.empty() and not pool.free_count() == workers_count:
    gevent.sleep(0.8)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)


#wait for jobs to finish
pool.join()
print "Done"
handler.log("Done+\n")
print '\n'
print "collected %s imgs" % ITEMS_COUNT
handler.log("collected %s imgs" % ITEMS_COUNT)
print "see generated output and log files"

handler.close() #close the IO files
def crawler():
    global crawled

    while 1:
        try:
            u = queue.get(timeout=0)
            response = requests.get(u)
            print response.status_code, u

            for link in re.findall('<a href="(http.*?)"', response.content):

                if crawled < 10:
                    crawled += 1
                    queue.put(link)

        except gevent.queue.Empty:
            break

# Read the seed url from stdin
queue.put(sys.argv[1])
pool.spawn(crawler)

while not queue.empty() and not pool.free_count() == 5:
    gevent.sleep(0.1)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)

# Wait for everything to complete
pool.join()

print datetime.now() - startTime # Took 5.943 seconds, varying
def crawler():
    '''A very simple queued gevent web crawler'''
    global crawled

    while 1:
        try:
            u = queue.get(timeout=1)
            response = requests.get(u)
            print(response.status_code)

            # Extract some links to follow
            for link in re.findall('<a href="(http.*?)"', response.content):
                # Limit to 10 pages (ignores links when the pool is already full)
                if crawled < 10:
                    crawled += 1
                    queue.put(link)

        except gevent.queue.Empty:
            break


queue.put(sys.argv[1])
pool.spawn(crawler)  # seed one worker, otherwise the drive loop below never starts

while not queue.empty() and not pool.free_count() == 5:
    gevent.sleep(0.1)
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(crawler)

# Wait for everything to complete
pool.join()