Beispiel #1
0
def parse_mobile(params):
    """Fetch one phone page, record its specs in ``data``, re-queue on failure.

    Args:
        params: dict with the keys ``'url'`` (page to fetch) and ``'name'``
            (stored under the ``'Brand'`` key of the phone's entry).

    Side effects:
        * Increments the global counters ``req`` and ``mobiles`` on every
          attempt, and ``errors`` on every failed attempt.
        * Stores a dict of specs in ``data`` keyed by the page's phone title.
        * On any exception, puts ``(parse_mobile, params)`` back on ``queue``.
    """
    global mobiles, errors, req

    url = params['url']
    comp_name = params['name']

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Request sent for - ' + url)
    req += 1
    try:
        # NOTE(review): counted per attempt, so a re-queued URL is counted
        # again on retry -- confirm whether that is intended.
        mobiles += 1
        tree = html.fromstring(session.get(url).text)
        name = tree.xpath('//h1[@class="specs-phone-name-title"]//text()')[0]
        print('Total[%i], Mobile found : %s' % (len(data), name))

        specs = {'Url': url, 'Brand': comp_name}
        data[name] = specs
        for row in tree.xpath('//div[@id="specs-list"]//table//tr'):
            # A distinct name keeps the phone title in `name` intact.
            label = row.xpath('.//td[@class="ttl"]/a/text()')
            if label:
                specs[label[0]] = row.xpath('.//td[@class="nfo"]/text()')
        specs['Battery'] = tree.xpath(
            '//th[text()="Battery"]/ancestor::tr//td[@class="nfo"]/text()')

    except Exception as e:
        # Format the exception itself: `e.message` is deprecated, is empty for
        # multi-argument exceptions and does not exist on Python 3, where
        # reading it here would raise and skip the re-queue below.
        print('Error in %s Restarting.... \n Error message : %s' % (url, e))
        queue.put_nowait((parse_mobile, params))
        errors += 1
Beispiel #2
0
 def recv(self, sock, queue):
     """Forward everything read from ``sock`` to ``queue``, forever.

     The socket timeout is cleared with ``sock.setTimeout(None)``. Each value
     returned by ``sock.read()`` is put on ``queue`` without blocking, and a
     ``('data',)`` message is then sent to ``self.address``.

     The loop has no exit condition; it ends only when one of the calls
     raises.
     """
     # This is not really an actor, but it still sends messages, so it needs
     # access to the actor (``self``).
     sock.setTimeout(None)
     notification = ('data',)
     while True:
         queue.put_nowait(sock.read())
         self.send(self.address, notification)
Beispiel #3
0
 def recv(self, sock, queue):
     """Forward everything read from ``sock`` to ``queue``, forever.

     The socket timeout is cleared with ``sock.setTimeout(None)``. Each value
     returned by ``sock.read()`` is put on ``queue`` without blocking, and a
     ``('data',)`` message is then sent to ``self.address``.

     The loop has no exit condition; it ends only when one of the calls
     raises.
     """
     # This is not really an actor, but it still sends messages, so it needs
     # access to the actor (``self``).
     sock.setTimeout(None)
     notification = ('data',)
     while True:
         queue.put_nowait(sock.read())
         self.send(self.address, notification)
Beispiel #4
0
 def UpdateTweet(self, tweet_text, tweet_dic):
     """Hand a tweet to every watcher whose compiled pattern matches it.

     Each value of ``self.regexp_watcher_list`` is a mapping. Watchers whose
     ``'re_prog'`` or ``'queue'`` entry is missing or ``None`` are skipped.
     For the others, ``re_prog.findall(tweet_text)`` is evaluated and, when
     the result is non-empty, ``(tweet_dic, matches)`` is put on the watcher's
     queue without blocking.
     """
     for watcher in self.regexp_watcher_list.values():
         pattern = watcher.get('re_prog')
         out_queue = watcher.get('queue')
         if pattern is None or out_queue is None:
             continue
         hits = pattern.findall(tweet_text)
         if hits:
             out_queue.put_nowait((tweet_dic, hits))
 def UpdateTweet(self, tweet_text, tweet_dic):
     """Hand a tweet to every watcher whose compiled pattern matches it.

     Each value of ``self.regexp_watcher_list`` is a mapping. Watchers whose
     ``'re_prog'`` or ``'queue'`` entry is missing or ``None`` are skipped.
     For the others, ``re_prog.findall(tweet_text)`` is evaluated and, when
     the result is non-empty, ``(tweet_dic, matches)`` is put on the watcher's
     queue without blocking.
     """
     for watcher in self.regexp_watcher_list.values():
         pattern = watcher.get('re_prog')
         out_queue = watcher.get('queue')
         if pattern is None or out_queue is None:
             continue
         hits = pattern.findall(tweet_text)
         if hits:
             out_queue.put_nowait((tweet_dic, hits))
Beispiel #6
0
    def handle_post(post):
        if not post.get('approxLoc'):
            print 'Post %r does not have approxLoc field' % post['id']
            return

        print "New post", post['id']

        data = {
            'post': anonymize(post),
            'lat': post['approxLoc']['lat'],
            'lng': post['approxLoc']['lng'],
        }
        try:
            queue.put_nowait(data)
        except gevent.queue.Full:
            return
Beispiel #7
0
    def handle_post(post):
        if not post.get('approxLoc'):
            print 'Post %r does not have approxLoc field' % post['id']
            return

        print "New post", post['id']

        data = {
            'post': anonymize(post),
            'lat': post['approxLoc']['lat'],
            'lng': post['approxLoc']['lng'],
        }
        try:
            queue.put_nowait(data)
        except gevent.queue.Full:
            return
Beispiel #8
0
def parse_comp(params):
    """Fetch one brand listing page and queue its follow-up work.

    Args:
        params: dict with the keys ``'url'`` (listing page to fetch) and
            ``'name'`` (brand name passed on to every queued task).

    Side effects:
        * Appends ``url`` to ``dp`` and increments the global ``req`` counter.
        * Queues ``parse_comp`` for each pagination link whose absolute URL
          is not already in ``dp``.
        * Queues ``parse_mobile`` for each link inside the ``makers`` block.
        * On any exception, increments ``errors`` and puts
          ``(parse_comp, params)`` back on ``queue``.
    """
    global errors, req

    url = params['url']
    comp_name = params['name']

    dp.append(url)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('Request sent for [%s] - %s' % (comp_name, url))
    req += 1
    try:
        tree = html.fromstring(session.get(url).text)
        for href in tree.xpath('//div[@class="nav-pages"]//a/@href'):
            page_url = domain + href
            if page_url not in dp:
                queue.put_nowait((parse_comp, {
                    'url': page_url,
                    'name': comp_name,
                }))
        for href in tree.xpath('//div[@class="makers"]//a/@href'):
            queue.put_nowait((parse_mobile, {
                'url': domain + href,
                'name': comp_name,
            }))

    except Exception as e:
        # Format the exception itself: `e.message` is deprecated, is empty for
        # multi-argument exceptions and does not exist on Python 3, where
        # reading it here would raise and skip the re-queue below.
        print('Error in %s Restarting.... \n Error message : %s' % (url, e))
        queue.put_nowait((parse_comp, params))
        errors += 1
Beispiel #9
0
def scrape_base_url():
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)

    func = lambda x: queue.put_nowait((parse_comp, {
        'url': domain + x.xpath('./@href')[0],
        'name': x.xpath('./text()')[0]
    }))
    [
        func(x) for x in tree.xpath('//div[@class="st-text"]//td/a')
        if x.xpath('./text()') != []
    ]

    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            pool.start(pool.spawn(t[0], t[1]))
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
Beispiel #10
0
 def SendHeartbeat(self):
     """Put a ``(None, None)`` heartbeat on the queue of every watcher.

     Watchers in ``self.regexp_watcher_list`` that have no ``'queue'`` entry,
     or whose queue is ``None``, are skipped. One incomplete watcher therefore
     no longer raises and stops the remaining watchers from getting their
     heartbeat.
     """
     for regexp_watcher in self.regexp_watcher_list.values():
         queue = regexp_watcher.get('queue')
         if queue is None:
             continue
         queue.put_nowait((None, None))
Beispiel #11
0
            self.put_to_queue(link['href'])

    def run_parse(self):
        """Queue the start URL, fetch content once and parse it.

        Any exception is caught and reported on stdout together with the
        start URL and the content that was being processed. The content is
        shown as ``None`` when the failure happened before it was fetched.
        """
        content = None
        try:
            self._queue.put(self._start_url)
            content = self.get_content()
            self.parse_html(content)
        except Exception as e:
            # Report the content already held instead of calling
            # get_content() again: a second call could raise inside this
            # handler or return something other than what failed.
            print('%s %s %s' % (e, self._start_url, content))

    def pack_url_md5(self, url):
        """Return the middle 16 hex characters of the MD5 digest of ``url``.

        Args:
            url: the URL as a byte string or as text. Text is encoded as
                UTF-8 before hashing.

        Returns:
            Characters 8 to 24 of the 32-character hex digest.
        """
        if not isinstance(url, bytes):
            # hashlib only hashes bytes. Encoding explicitly avoids the
            # implicit ASCII conversion, which fails on non-ASCII URLs, and
            # the TypeError raised for text on Python 3.
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()[8:-8]


# Shared work queue, seeded with one empty-string entry, and three consumer
# greenlets that all start from the same URL.
queue = queue.Queue()
queue.put_nowait("")
threads = []
for _worker_name in ('thread1', 'thread2', 'thread3'):
    _consumer = Consumers(queue, _worker_name, "http://beijing.lashou.com/")
    threads.append(gevent.spawn(_consumer.run_parse))
# Block until every consumer greenlet has finished.
gevent.joinall(threads)
# if __name__ == "__main__":
#     pass
#     parse_html(get_content("http://beijing.lashou.com/"))
#     print pack_url_md5("http://beijing.lashou.com/")
Beispiel #12
0
        htmlobj = BeautifulSoup(response.read())
        links = htmlobj.findAll(href=re.compile(r'^(http|/page)'))
        print response.geturl()
        for link in links:
            print link['href'],link.string
            self.put_to_queue(link['href'])
            
    def run_parse(self):
        """Queue the start URL, fetch content once and parse it.

        Any exception is caught and reported on stdout together with the
        start URL and the content that was being processed. The content is
        shown as ``None`` when the failure happened before it was fetched.
        """
        content = None
        try:
            self._queue.put(self._start_url)
            content = self.get_content()
            self.parse_html(content)
        except Exception as e:
            # Report the content already held instead of calling
            # get_content() again: a second call could raise inside this
            # handler or return something other than what failed.
            print('%s %s %s' % (e, self._start_url, content))
            
    def pack_url_md5(self, url):
        """Return the middle 16 hex characters of the MD5 digest of ``url``.

        Args:
            url: the URL as a byte string or as text. Text is encoded as
                UTF-8 before hashing.

        Returns:
            Characters 8 to 24 of the 32-character hex digest.
        """
        if not isinstance(url, bytes):
            # hashlib only hashes bytes. Encoding explicitly avoids the
            # implicit ASCII conversion, which fails on non-ASCII URLs, and
            # the TypeError raised for text on Python 3.
            url = url.encode('utf-8')
        return hashlib.md5(url).hexdigest()[8:-8]

# Shared work queue, seeded with one empty-string entry, and three consumer
# greenlets that all start from the same URL.
queue = queue.Queue()
queue.put_nowait("")
threads = []
for _worker_name in ('thread1', 'thread2', 'thread3'):
    _consumer = Consumers(queue, _worker_name, "http://beijing.lashou.com/")
    threads.append(gevent.spawn(_consumer.run_parse))
# Block until every consumer greenlet has finished.
gevent.joinall(threads)
# if __name__ == "__main__":
#     pass
#     parse_html(get_content("http://beijing.lashou.com/"))
#     print pack_url_md5("http://beijing.lashou.com/")
        
Beispiel #13
0
 def SendHeartbeat(self):
     """Put a ``(None, None)`` heartbeat on the queue of every watcher.

     Watchers in ``self.regexp_watcher_list`` that have no ``'queue'`` entry,
     or whose queue is ``None``, are skipped. One incomplete watcher therefore
     no longer raises and stops the remaining watchers from getting their
     heartbeat.
     """
     for regexp_watcher in self.regexp_watcher_list.values():
         queue = regexp_watcher.get('queue')
         if queue is None:
             continue
         queue.put_nowait((None, None))
Beispiel #14
0
 def _notify(waiters, data):
     for queue in waiters:
         queue.put_nowait(data)