Exemple #1
0
    def parse(self):
        """Normalize every row of test.xiecheng_landmark and write the result back.

        For each landmark name: trim, apply the replacement table, split off a
        leading city, flag too-short names, then update the row in place.
        """
        sql = "select id,name from test.xiecheng_landmark "
        rows = self.db_conn.QueryDict(sql)
        logging.info(sql)
        logging.info("result_set len: %d" % len(rows))
        for record in rows:
            logging.info("process id: %d, name: %s" % (record['id'], record['name']))
            # trim trailing noise first, then run the replacement table
            cleaned = self._replace(self._rtrim(self.ToString(record['name'])))
            (city, keyword) = self._ltrimcity(cleaned)

            update_fields = {}
            if city:
                update_fields['city'] = city
            if len(keyword) < 5:
                # names with len < 5 are flagged invalid instead of searched
                update_fields['kxflag'] = 'invalid'
                update_fields['reason'] = 'too short'
            update_fields['format_name'] = keyword
            self._search_landmark(update_fields)

            self.db_conn.ExecuteUpdateDict('test.xiecheng_landmark', update_fields, {'id': record['id']})


if __name__ == '__main__':
    # Log to both console and encode.log, then run the full normalization pass.
    btlog_init('encode.log', console=True, logfile=True)
    e = XiechengProcessor()
    e.parse()
Exemple #2
0
        sql = ("select distinct i.hotelid from tmp_hotel_info i, tmp_hotel_price p "
                " where p.hotelid = i.hotelid order by hotelid asc")
        hotel_result_set = self.product_conn.Query(sql)
        logging.info("there are total hotel in product db: %d" % len(hotel_result_set))

        (only_exists_in_mapping, both_exists, only_exists_in_product) = self.DiffList(mapping_result_set, hotel_result_set)
        logging.info("only exists in mapping: %d" % len(only_exists_in_mapping))
        logging.info("both exists: %d" % len(both_exists))
        logging.info("only exists in product: %d" % len(only_exists_in_product))

        tmp_list = [str(i[0]) for i in only_exists_in_product]
        self.SaveList(filename, tmp_list)


    def Run(self, filename="new_product_hotelid.txt"):
        """Refresh the new-hotel id file, then generate info rows for each non-empty id."""
        self.NewHotel(filename)
        # drop blank lines left over from the file round-trip
        hotel_ids = [entry for entry in self.LoadList(filename) if len(entry) > 0]
        self.GenerateHotelInfoAdd(hotel_ids)


if __name__ == '__main__':
    # Console-only logging; run the default hotel-info-add pipeline.
    btlog_init('log/log_hotel_info_add.log', logfile=False, console=True)
    k = HotelInfoAdd()
    k.Run()
        for row in result_set:
            path_list = row['ext_path'].split(u'|')
            name_list = []
            for path in path_list:
                name_list.append(zz6_info_dict[int(path)])
            zz6_name_dict[row['id']] = name_list

        for sid, name_list in zz6_name_dict.iteritems():
            flag = False
            for city_id, city_name_list in path_name_dict.iteritems():
                b = self.name_math(name_list, city_name_list)
                if b:
                    flag = True
                    sql = "update zz6_info_new set ext_city_id = '%s' where id = %s" % (self.ToString(city_id), self.ToString(sid))
                    logging.info(sql)
                    self.db_conn.Execute(sql)
                    break
            if not flag:
                logging.info("invalid sid: %s" % self.ToString(sid))


    def run(self):
        # Entry point: currently only recomputes the path-level data.
        self.calculate_path_level()

if __name__ == '__main__':
    # File-only logging; only the city-id recalculation step is enabled here.
    btlog_init('log_format.log', logfile=True, console=False)
    d = Zz6Formater()
#    d.run()
#    d.stat()
    d.calculate_city_id()
        (self.opt, others) = parser.parse_args()

        self.db_conn = MySQLOperator()
        if not self.db_conn.Connect(**DB_CONF):
            logging.error("db error")
            sys.exit()

    def run(self):
        """Spawn one verifier process per (site, quality) pair and wait for all of them.

        Single-instance guarded via the pid file; a second copy exits immediately.
        """
        if self.IsRunning('== running =='):
            return

        self.WritePidFile()

        # three quality buckets per proxy source, each verified in its own process
        workers = [
            Process(target=do_verify, args=(site, quality))
            for site in ('hidemyass', 'freeproxylists', 'free_proxy_list')
            for quality in ('good', 'moderate', None)
        ]

        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()
            

if __name__ == '__main__':
    # Verbose logging to file and console, then run the verifier fan-out.
    btlog_init('log_manager.log', logfile=True, console=True, level='DEBUG')
    v = VerifierManager()
    v.run()
        except Exception, e:
            logging.warn("e: %s" % str(e))
            logging.warn("traceback: %s" % traceback.print_exc())
            return ua_list
        return ua_list

    def do_url(self, url):
        """Fetch one page and return its parsed result, or None when the fetch was empty."""
        logging.debug("url: %s" % self.ToString(url))
        page = self._crawl_url(url)
        if len(page) == 0:
            # nothing fetched: fall through with an implicit None
            return
        return self._parse_html(page)

    def run(self):
        """Collect user-agent strings from useragentstring.com and save them to ua_list.txt."""
        url_list = [
            "http://www.useragentstring.com/pages/Chrome/",
            "http://www.useragentstring.com/pages/Internet%20Explorer/",
            "http://www.useragentstring.com/pages/Firefox/",
        ]
        total_ua_list = []
        for url in url_list:
            a_list = self.do_url(url)
            # do_url returns None when the fetched page was empty; extending
            # with None would raise TypeError, so skip those pages.
            if a_list:
                total_ua_list.extend(a_list)

        self.SaveList('ua_list.txt', total_ua_list)

if __name__ == '__main__':
    # Console-only debug logging, then crawl and save the UA list.
    btlog_init("log_kuoci_processor.log", logfile=False, console=True, level='DEBUG')
    p = UAUtil()
    p.run()
                logging.info("AAAA: %s" % sql)
                if self.opt.commit:
                    self.sem_conn.Execute(sql)
            else:
                logging.warn("SKIP keywordid: %d, adgroupname: %s" % (info_dict['keywordid'], info_dict['adgroupname']))

        '''
        if self.opt.commit:
            for keywordid in only_exists_in_mapping:
                sql = "delete from %s where kwid = %d" % (self.KEYWORD_SERVLET, keywordid)
                self.sem_conn.Execute(sql)
        '''

    def Run(self):
        """Process the account given on the command line, or all known accounts with --full."""
        if self.opt.account:
            self.DoAccount(self.opt.account)
        if self.opt.full:
            # full sweep over every managed account id
            for account_id in ("1", "30", "32"):
                self.DoAccount(account_id)


    def test(self):
        # Manual sanity check of the private diff helper on two overlapping lists.
        l1 = [1,2,3,4,5,6,7,8,9]
        l2 = [4,6,8,9, 200, 201]
        print self._Diff(l1, l2)

if __name__ == '__main__':
    # Log to file and console, then run the keyword/hotel relation sync.
    btlog_init('log/log_keyword_hotel_relation.log', logfile=True, console=True)
    k = KeywordHotelRelation()
    k.Run()
    def do_hidemyass(self, page_count=10):
        """Crawl the first *page_count* hidemyass proxy-list pages and upsert every proxy.

        Each parsed proxy dict gets an empty kxflag (not yet verified) and a
        creation timestamp before being upserted into proxy_hidemyass keyed
        on (ip, port).  Defaults to the 10 pages the original code crawled.
        """
        url_list = ["http://proxylist.hidemyass.com/%d" % page
                    for page in range(1, page_count + 1)]
        for url in url_list:
            proxy_list = self.do_url(url, self._parse_hidemyass)
            logging.info("count: %d for url: %s" % (len(proxy_list), url))
            for proxy in proxy_list:
                logging.info("proxy: %s" % str(proxy))
                proxy['kxflag']         = ''
                proxy['create_time']    = datetime.now()
                self.db_conn.Upsert('proxy_hidemyass', proxy, ['ip', 'port'])

    def run(self):
        # Entry point: only the hidemyass source is crawled here.
        self.do_hidemyass()

if __name__ == '__main__':
    # Debug logging to file and console, then run the proxy download pass.
    btlog_init('log_download.log', logfile=True, console=True, level='DEBUG')
    d = ProxyDownloader()
    d.run()
Exemple #8
0
        items = line.split("\t")
        if len(items) != 2:
            print line, items
            raise Exception
        return items

class CsvProcessor(EncodeChinese):
    def line_items(self, line):
        items = line.split(",")
        if len(items) != 2:
            print line, items
            raise Exception
        return items

def test():
    # Smoke-test do_line on plain-ascii, Chinese and percent-encoded hotel URLs.
    c = CsvProcessor()
    for line in ('1,http://a.b.c/-北京-jiudian', '2,http://b.c.d/-a-jiudian', '3,http://a.b.c/-%E5%8C%97%E4%BA%AC-jiudian'):
        print c.do_line(line)
    
def usage():
    print 'useage: %s filename gbk|utf8' % sys.argv[0]
    sys.exit()

if __name__ == '__main__':
    # Require exactly: <script> <filename> gbk|utf8
    if len(sys.argv) != 3 or sys.argv[2] not in ['gbk', 'utf8']:
        usage()

    btlog_init('log/encode.log')
    e = CsvProcessor()
    e.run(sys.argv[1])
    # Launch the scrapy crawl for the oldest pending task.
    def do_run(self):
        """Pick the oldest task still flagged 'init' and run the baidu_keyword spider for it."""
        # oldest (min) task_date first
        sql = "select task_date from baidu_keyword_manager where flag = 'init' order by task_date asc limit 1"
        result_set = self.task_db_conn.Query(sql)
        if not result_set or len(result_set) == 0:
            logging.info("no task")
            return
        task_date = result_set[0][0]
        # task_date appears twice: once as the spider argument, once in the log name
        cmd = "/usr/local/bin/scrapy crawl baidu_keyword -a task_date=%s --logfile=log_scrapy.baidu_keyword.%s.log" % (
            task_date,
            task_date,
        )
        logging.info("cmd: %s" % cmd)
        # NOTE(review): shell=True with an interpolated task_date -- the value comes
        # from our own DB, but confirm it can never contain shell metacharacters.
        # communicate() is used only to block until the crawl ends; its stripped
        # stdout is discarded.
        subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE).communicate()[0].strip()

    def run(self):
        """Single-instance entry point: bail out if another copy holds the pid file."""
        if self.IsRunning():
            logging.info("== is running ==")
            return

        self.WritePidFile()
        self.do_run()


if __name__ == "__main__":
    btlog_init("log_baidu_keyword_scrapy_manager.log", logfile=True, console=True)
    t = SeoCoreKeywordManager()
    t.run()
            "http://hidemyass.com/proxy-list/5",
            "http://hidemyass.com/proxy-list/6",
            "http://hidemyass.com/proxy-list/7",
            "http://hidemyass.com/proxy-list/8",
            "http://hidemyass.com/proxy-list/9",
            "http://hidemyass.com/proxy-list/10",
        ]
        for url in url_list:
            proxy_list = self.do_url(url, self._parse_hidemyass)
            logging.info("count: %d for url: %s" % (len(proxy_list), url))
            for proxy in proxy_list:
                logging.info("proxy: %s" % str(proxy))
                proxy['kxflag']         = ''
                proxy['create_time']    = datetime.now()
                self.db_conn.Upsert('proxy_hidemyass', proxy, ['ip', 'port'])


    def run(self):
        # Entry point: only the hidemyass source is crawled here.
        self.do_hidemyass()

    def test(self):
        # Parse a cached hidemyass page offline to sanity-check the parser.
        c = self.LoadFile("cache/8024ef3ca080b74ab57cb5ef36562e5d.html")
        p = self._parse_hidemyass(c)
        print p
        

if __name__ == '__main__':
    # Log to file and console, then run the proxy download pass.
    btlog_init('log_download.log', logfile=True, console=True)
    d = ProxyDownloader()
    d.run()
            self.conn_57.Execute("update viator_destination_attraction set destination=%s,destinationid=%s where id=%s", [destination,destinationid,row['id']])


    def tmp2(self):
        """Insert a generated viator_ttd_group row for every (top-level destination, group) pair."""
        sql = "select distinct anchor_text,dgroup,dgroupid from viator_group "
        self.group_res = self.conn_57.QueryDict(sql)

        sql = "select destination,destinationid from viator_attraction_city where pid=0 "
        destinations = self.conn_57.QueryDict(sql)

        insert_sql = "insert into viator_ttd_group(destination,destinationid,dgroup,dgroupid,href,source) values(%s,%s,%s,%s,%s,'proc_gen')"
        for dest in destinations:
            for grp in self.group_res:
                # URL layout mirrors viator.com's tour landing pages
                href = "http://www.viator.com/%s-tours/%s/%s-%s" % (
                    dest['destination'], grp['dgroup'], dest['destinationid'], grp['dgroupid'])
                self.conn_57.Execute(insert_sql, [dest['destination'], dest['destinationid'],
                                                  grp['dgroup'], grp['dgroupid'], href])



'''
select distinct anchor_text,dgroup,href_key from viator_group
'''

if __name__ == '__main__':
    # Console-only logging; only the tmp2 generation step is enabled here.
    btlog_init('log_tool.log', logfile=False, console=True)
    k = ToolTest()
#    k.test()
#    k.do_format()
#    k.check2()
    k.tmp2()
Exemple #12
0
from datetime import datetime

from scrapy.selector import HtmlXPathSelector

sys.path.append('/home/yangrq/projects/pycore')
from utils.common_handler import CommonHandler
from utils.btlog import btlog_init
from db.mysqlv6 import MySQLOperator

from baidu_common import BaiduCommon
from config import *

class Tool(CommonHandler):
    """Ad-hoc reporter: logs row counts per proxy table and verification flag."""

    def __init__(self):
        # connect once up front; a failed connection aborts immediately
        self.db_conn = MySQLOperator()
        if not self.db_conn.Connect(**DB_CONF):
            raise Exception, "db error"

    def run(self):
        """Log the count query and its result for every (flag, table) combination."""
        tables = ('proxy_free_proxy_list', 'proxy_freeproxylists', 'proxy_hidemyass')
        for flag in ('good', 'moderate', ''):
            for table in tables:
                sql = "select count(*) from %s where kxflag = '%s' " % (table, flag)
                result_set = self.db_conn.Query(sql)
                logging.info(sql)
                logging.info("count: %d" % result_set[0][0])

if __name__ == '__main__':
    # Console-only debug logging, then print the proxy-table counts.
    btlog_init(logfile=False, console=True, level='DEBUG')
    v = Tool()
    v.run()
Exemple #13
0
                url = BaiduCommon.random_request()
                print url
                html = urllib2.urlopen(url, timeout=3).read()
                if len(html) > 100:
                    parse_dict = BaiduCommon.parse(html)
                    if parse_dict['valid_flag']:
                        succeed_count += 1
            except Exception, e:
                logging.warn("error: %s" % str(e))
        kxflag = ''
        if succeed_count == 0:
            kxflag = 'bad'
        elif succeed_count < 6:
            kxflag = 'pool'
        elif succeed_count < 9:
            kxflag = 'moderate'
        else:
            kxflag = 'good'
        logging.info("proxy: %s, succeed_count: %d" % (tmp_proxy, succeed_count))
        return kxflag

    def test(self):
        """Run the verifier directly against two known proxies (manual check)."""
        for proxy in ('http://140.120.94.26:8088', 'http://181.208.70.75:8080'):
            self._real_verify(proxy)

if __name__ == '__main__':
    # Debug logging to file and console, then run the manual verifier test.
    btlog_init('log_verifier.log', logfile=True, console=True, level='DEBUG')
    v = ProxyVerifier()
    v.test()
Exemple #14
0
# Make the shared pycore checkout importable regardless of the working directory.
FILE_PATH = os.path.dirname(__file__)
sys.path.append('/home/yangrq/github/pycore')

from utils.btlog import btlog_init
from utils.common_handler import CommonHandler
from utils.http_client import HttpClient

class Tester(CommonHandler, HttpClient):
    def __init__(self):
        HttpClient.__init__(self)
        self.key = urllib.quote_plus('1d1180f3c7c41fd1760c5819fad8b4ed')
        pass

    def fetch(self):
        ret = self.DoGet('storage.service.kuxun.cn', 80, '/storage/fetch-item?key=%s' % self.key)
        print ret

    def store(self):
        data = self.LoadFile('test.py')
        ret = self.DoPost('storage.service.kuxun.cn', 80, '/storage/store-item?key=%s' % self.key, data)
        print ret

    def Run(self):
        self.store()

if __name__ == '__main__':
    # Console-only debug logging; Run() stores test.py (fetch left disabled).
    btlog_init('a.log', console=True, logfile=False, level=logging.DEBUG)
    a = Tester()
    a.Run()
#    a.fetch()