Esempio n. 1
0
def productInfo():
    """Return a dict mapping each product href to its comment count.

    Reads up to 200000 rows of ``productHref,commentCount`` from the
    ``jdproductbaseinfo2database`` table of the ``jddata`` database.  When
    the same href appears more than once, the last row read wins.
    """
    rows = DBService(dbName='jddata', tableName='jdproductbaseinfo2database').getData(
        var='productHref,commentCount', limit=200000)
    return {row[0]: row[1] for row in rows}
def gen_url():
    """Build the de-duplicated list of store "contactinfo" page URLs.

    Every distinct ``store_href`` in the ``aliexpress_temp`` table is
    normalised to the ``//www`` host and rewritten from ``<base>/<id>`` to
    ``<base>/contactinfo/<id>.html``.  Hrefs that already contain ``.html``
    or that match none of the known host prefixes are dropped.

    :return: list of unique rewritten URLs (order is that of a ``set``).
    """
    # Host prefixes that get rewritten to '//www', checked in this order.
    localized_hosts = ('//pt', '//ru', '//es')

    def to_contact_url(href):
        # Hrefs already containing '.html' are skipped.
        if '.html' in href:
            return None
        head, tail = href.rsplit('/', 1)
        return head + '/contactinfo/' + tail + '.html'

    def normalize(href):
        if '//www' in href:
            return to_contact_url(href)
        for host in localized_hosts:
            if host in href:
                return to_contact_url(href.replace(host, '//www'))
        return None

    db_g = DBService(dbName=db_name, tableName='aliexpress_temp', **connect_dict)
    unique_urls = set()
    for row in db_g.getData(var='store_href', distinct=True):
        url = normalize(row[0])
        if url:
            unique_urls.add(url)
    return list(unique_urls)
Esempio n. 3
0
def craweldhref():
    """Return the stored hrefs with one trailing slash removed.

    Reads the ``href`` column through
    ``DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')``,
    strips a single trailing ``/`` from each value, prints how many hrefs
    were read and returns them.

    :return: list of href strings.
    """
    db = DBService('elec_platform', 'yms_tmall_shopinfo_com_withoutjudge')
    rows = db.getData(var='href')
    # A real list (not a lazy ``map``) keeps ``len()`` working on Python 3,
    # and ``endswith`` tolerates empty strings where ``x[-1]`` raised
    # IndexError.
    href = [
        url[:-1] if url.endswith('/') else url
        for url in (row[0] for row in rows)
    ]
    print(len(href))
    return href
Esempio n. 4
0
def gen_url():
    """Return the distinct ``credit_detail_href`` values as a list.

    Rows come from the ``alibaba_cow_powder_3`` table of the ``alibaba``
    database.  An empty row is replaced by a single space (which is truthy
    and therefore kept); rows whose first column is falsy are dropped.
    """
    rows = DBService(dbName="alibaba", tableName="alibaba_cow_powder_3").getData(
        var="credit_detail_href", distinct=True)
    candidates = (row[0] if row else " " for row in rows)
    return [href for href in candidates if href]
Esempio n. 5
0
def companyInfo():
    """Return company information as a dict.

    Reads up to 200000 rows from ``thirdPartShopInfo`` and skips every row
    whose column at index 2 equals ``'-'``.  Each remaining row is stored
    under its column at index 1, with the value being the row from index 1
    onwards.  Later rows overwrite earlier ones that share a key.
    """
    rows = DBService(dbName='jddata', tableName='thirdPartShopInfo').getData(limit=200000)
    return {row[1]: row[1:] for row in rows if row[2] != '-'}
Esempio n. 6
0
def commentHrefList():
    """Return the unique (name, href, judgepage_href) rows still to be resolved.

    A row is kept only when its ``judgepage_href`` (index 2) neither
    contains ``'http'`` nor is purely numeric.  The number of unique rows
    kept is printed before they are returned.

    :return: list of de-duplicated row tuples (set order).
    """
    rows = DBService('elec_platform', 'tmall_baseinfo_everyweek').getData(
        var='name,href,judgepage_href')
    unique = set()
    for row in rows:
        judge_href = row[2]
        if 'http' in judge_href or judge_href.isnumeric():
            continue
        unique.add(tuple(row))
    result = list(unique)
    print(len(result))
    return result
def proxy_collection():
    """Return the de-duplicated union of website and database proxies.

    Website proxies come from ``pc.get_proxies_from_website()``.  Database
    proxies are the ``proxy_port`` column of each listed table in the
    ``base`` database.
    """
    # Proxies fetched from the website.
    from_website = pc.get_proxies_from_website()
    # Proxies already stored in the local database tables.
    from_local = []
    for table in 'proxy_other_source,proxy_you_dai_li'.split(','):
        dbs = DBService(dbName='base', tableName=table, **connect_dict)
        from_local.extend(row[0] for row in dbs.getData(var='proxy_port'))
    return list(set(from_website + from_local))
Esempio n. 8
0
def begin():
    """Export the de-duplicated ``thirdPartShopInfo`` rows to a CSV file.

    The first column and the last two columns are cut from both the table
    title and every row (slice ``[1:-2]``).  Duplicate rows are collapsed
    through a set before ``CSV.writeCsv`` is called with file name
    ``jdData`` and save path ``D:/spider``.
    """
    db = DBService(dbName='jddata', tableName='thirdPartShopInfo')
    rows = db.getData()
    title = db.getTableTitle()[1:-2]
    # Tuples are hashable, so a set removes the duplicate rows.
    unique_rows = {tuple(row[1:-2]) for row in rows}
    data = [list(row) for row in unique_rows]
    writer = CSV()
    writer.writeCsv(savePath='D:/spider', fileTitle=title, data=data, fileName='jdData')
Esempio n. 9
0
def sumCommentCount():
    """Sum the comment counts per shop name and export the totals to CSV.

    Reads ``shopName,commnetCount`` from the
    ``thirdPartShopInfoAddCommnetCount`` table, adds up ``int(count)`` for
    rows sharing a shop name, and writes one ``[shopName, total]`` row per
    shop through ``CSV.writeCsv`` (file name ``jdDataSum``, save path
    ``D:/spider``).
    """
    db = DBService(dbName='jddata', tableName='thirdPartShopInfoAddCommnetCount')
    rows = db.getData(var='shopName,commnetCount')
    # ``totals.get`` replaces the ``in dict.keys()`` membership test, which
    # scans a list on Python 2, and the new name no longer shadows the
    # ``dict`` builtin.
    totals = {}
    for row in rows:
        totals[row[0]] = totals.get(row[0], 0) + int(row[1])
    data = [[name, total] for name, total in totals.items()]
    writer = CSV()
    writer.writeCsv(savePath='D:/spider', fileTitle=['shopName', 'commnetCount'], data=data, fileName='jdDataSum')
Esempio n. 10
0
 def startUrlList(self):
     """
     Method override (translated from the original note).

     Returns the product hrefs from ``jdproductbaseinfo2database`` whose
     ``sku`` has length >= 10 and that are not yet in ``thirdPartShopInfo``.

     :return: list of product hrefs still to be crawled
     """
     dbs = DBService(dbName='jddata', tableName='jdproductbaseinfo2database')
     candidates = [
         row[0]
         for row in dbs.getData(var='productHref,sku', distinct=True)
         if len(row[1]) >= 10
     ]
     crawled_rows = DBService(dbName='jddata', tableName='thirdPartShopInfo').getData(var='productHref')
     if not crawled_rows:
         # NOTE(review): this early return skips the 'http' prefix filter
         # applied on the path below - confirm that is intended.
         return candidates
     # A set gives constant-time membership tests.
     crawled = {row[0] for row in crawled_rows}
     return [
         href for href in candidates
         if href not in crawled and href[:4] == 'http'
     ]
Esempio n. 11
0
def savePicture():
    """Take a screenshot of every distinct (name, href) row.

    Rows come from the ``tmall_baseinfo_realtime`` table of the
    ``tmalldata`` database.  For each row the name is printed and
    ``saveScreenShot`` is called with the href and the name as title.  The
    value it returns is passed back in as ``driver`` on the next call.
    Between calls the function sleeps ``abs(random.gauss(3, 2))`` seconds.
    """
    from screenShot import saveScreenShot
    from ms_spider_fw.DBSerivce import DBService
    import time
    import random

    db = DBService(dbName='tmalldata', tableName='tmall_baseinfo_realtime')
    rows = list(db.getData(var='name,href', distinct=True))
    print(len(rows))
    dri = None
    # Iterate the rows directly.  The old ``nameD[data.index(url)]`` lookup
    # returned the first name for a repeated href and was quadratic; it also
    # relied on ``map`` returning a list.
    for row in rows:
        name, url = row[0], row[1]
        print(name)
        dri = saveScreenShot(url, driver=dri, title=name)
        time.sleep(abs(random.gauss(3, 2)))
Esempio n. 12
0
# Alternative connection settings (non-local host), kept for reference:
# connect_dict = {'host': '10.118.187.12', 'user': '******', 'passwd': 'admin', 'charset': 'utf8'}
connect_dict = {'host': 'localhost', 'user': '******', 'passwd': '', 'charset': 'utf8'}

# Matches four dot-separated groups of 1-3 digits that are not directly
# preceded or followed by another digit or dot (octet ranges are not checked).
patt_ip = re.compile(r'(?<![\.\d])(?:\d{1,3}\.){3}\d{1,3}(?![\.\d])')
proxy_list = []

# Collect the proxy_port column of every configured table that exists.
for table_name in table_name_s.split(','):
    # Call form of print: same output on Python 2, valid syntax on Python 3.
    print(table_name)
    db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
    if db_server.isTableExist():
        proxy_list.extend(row[0] for row in db_server.getData(var='proxy_port'))

# De-duplicate, then queue every proxy for testing.
proxy_list_t = list(set(proxy_list))
for p in proxy_list_t:
    qu_proxy_test.put(p)


def original_ip_address():
    """Return the ``origin`` value reported by http://httpbin.org/ip.

    The response body is parsed as JSON.  ``None`` is returned when the
    parsed object has no ``origin`` key.
    """
    response = requests.get('http://httpbin.org/ip')
    payload = json.loads(response.text)
    return payload.get('origin')


# Looked up once, when this module-level statement runs (performs an HTTP request).
original = original_ip_address()


def test():
Esempio n. 13
0
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from ms_spider_fw.DBSerivce import DBService
import json
import re
import requests
import sys
from datetime import datetime

# Python 2 only: make 'utf8' the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')

db_server = DBService(dbName='test', tableName='weibo_cellphone')  # , **connect_dict)
# Keep only the rows whose first column is non-empty and starts with '{'.
data = filter(
    lambda row: row[0] and row[0][0] == '{',
    db_server.getData(var='detail_json', limit=20)
)

# Shortest run of text enclosed in '<' and '>' (one markup tag).
re_sub_p = re.compile(r'<.+?>')
# A '+' followed by digits and one whitespace character, e.g. the '+0800 '
# in 'Tue May 31 17:46:55 +0800 2011' (presumably a timezone offset).
# Raw strings keep '\+', '\d' and '\s' from being invalid string escapes.
re_sub_t = re.compile(r'\+\d+?\s')


def time_format(ori):
    """Convert a '%a %b %d %H:%M:%S +ZZZZ %Y' timestamp to '%Y-%m-%d %H:%M:%S'.

    The ``+digits`` token is removed before parsing, so the result carries
    the original wall-clock time with no timezone conversion.

    :param ori: timestamp string; any falsy value yields ``''``.
    :return: the reformatted timestamp, or ``''`` for falsy input.
    :raises ValueError: if the remaining text does not match the format.
    """
    if not ori:
        return ''
    without_offset = re_sub_t.sub('', ori)
    parsed = datetime.strptime(without_offset, '%a %b %d %H:%M:%S %Y')
    return parsed.strftime('%Y-%m-%d %H:%M:%S')


# extract_info from json string
def extract_info(x):
Esempio n. 14
0
 def startUrlList(self):
     """Return one mall.jd.com search URL per non-empty ``appID``.

     The ids are read from the ``thirdPartShopAppID`` table of the
     ``jddata`` database; rows with a falsy ``appID`` are skipped.
     """
     db = DBService(dbName="jddata", tableName="thirdPartShopAppID")
     urls = []
     for row in db.getData(var="appID"):
         app_id = row[0]
         if app_id:
             urls.append("http://mall.jd.com/view_search-" + app_id + "-0-5-1-24-1.html")
     return urls
Esempio n. 15
0
import threading
import time
from Queue import Queue as qu

from ms_proxy import proxy_test
from ms_spider_fw.DBSerivce import DBService

# ---- configuration ----
db_name = 'b2c_base'
# Comma-separated names of the tables whose proxies are loaded for testing.
table_name_s = 'proxy_you_dai_li,proxy_xi_ci_dai_li'
# NOTE(review): host and credentials are hard-coded here - consider moving
# them to configuration outside the source file.
connect_dict = {
    'host': '10.118.187.12',
    'user': '******',
    'passwd': 'admin',
    'charset': 'utf8',
}
proxy_list = []
for table_name in table_name_s.split(','):
    db_server = DBService(dbName=db_name, tableName=table_name, **connect_dict)
    proxy_list.extend(
        row[0] for row in db_server.getData(var='proxy_port', distinct=True)
    )

# Alternative source: read the proxy list from a text file instead.
# with open("d:/proxy_2.txt", 'r')as f:
#     t = f.read()
# proxy_list = t.split('\n')

# ---- script ----
# One queue for proxies waiting to be tested, one for those that passed.
qu_proxy_test = qu(0)
qu_proxy_ok = qu(0)

# Queue each distinct proxy exactly once.
for t in set(proxy_list):
    qu_proxy_test.put(t)


def test():
    while qu_proxy_test.qsize():
Esempio n. 16
0
#coding:utf8
__author__ = '613108'
from ms_spider_fw.DBSerivce import DBService

# Count the rows of tmall_baseinfo_everyweek whose second-to-last column,
# converted to int, is at least 35.
dbs = DBService(dbName='elec_platform', tableName='tmall_baseinfo_everyweek')
data = [item for item in dbs.getData() if int(item[-2]) >= 35]
print(len(data))
Esempio n. 17
0
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from ms_spider_fw.DBSerivce import DBService
import json
import re
import requests
import sys

# Python 2 only: make 'utf8' the interpreter-wide default string encoding.
reload(sys)
sys.setdefaultencoding('utf8')

db_server = DBService(dbName='platform_data', tableName='jd_comment_woman_cloth')
# Keep only the rows whose first column is non-empty and starts with '{'.
data = filter(
    lambda row: row[0] and row[0][0] == '{',
    db_server.getData(var='comment_json', distinct=True, limit=10)
)

# Shortest run of text enclosed in '<' and '>' (one markup tag).
re_sub_p = re.compile('<.+?>')


# extract_info from json string
def extract_info(x):
    try:
        d_t = json.loads(x[0])
        d = d_t['comments']
        return [
            {
                "id": it.get("id"),
                "content": it.get("content").replace('\n', ''),
                "creationtime": it.get("creationTime"),
                "referencename": it.get("referenceName"),
                "referencetime": it.get("referenceTime"),
Esempio n. 18
0
 def startUrlList(self):
     """Return the first column of every ``jd_shop_gradeHref`` row as a list."""
     db = DBService(dbName="jddata", tableName="jd_shop_gradeHref")
     # A list comprehension instead of map(): on Python 3 map() returns a
     # one-shot iterator, whereas the other startUrlList variants return lists.
     return [row[0] for row in db.getData()]
Esempio n. 19
0
__author__ = '613108'

from ms_spider_fw.DBSerivce import DBService

# Print how many distinct values four columns of jddata.thirdPartShopInfo hold.
dbs = DBService(dbName='jddata', tableName='thirdPartShopInfo')
companyCount = dbs.getData(var='companyName', distinct=True)
shopCount1 = dbs.getData(var='shopHref', distinct=True)
shopCount2 = dbs.getData(var='shopName', distinct=True)
gradeHref = dbs.getData(var='gradeHref', distinct=True)
# Call form of print: same output on Python 2, valid syntax on Python 3.
print(len(companyCount))
print(len(shopCount1))
print(len(shopCount2))
print(len(gradeHref))
Esempio n. 20
0
 def startUrlList(self):
     """Return every distinct ``shopHref`` from ``thirdPartShopInfo`` as a list."""
     rows = DBService(dbName='jddata', tableName='thirdPartShopInfo').getData(
         var='shopHref', distinct=True)
     return [row[0] for row in rows]