Example 1
import sys

if sys.version_info[0] >= 3:
    import urllib.request       # Python 3
else:
    import urllib2              # Python 2


# Config is a project-local helper for reading configuration values.
def getHtmlContent(target_url, header_type):
    config = Config()
    try:
        if header_type == 'sougou':
            send_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive',
                'Cookie': 'com_sohu_websearch_ITEM_PER_PAGE='+str(config.getValue("pagesize", "sougou"))
            }
        else:
            send_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive'
            }

        if sys.version_info[0] >= 3:
            req = urllib.request.Request(target_url, headers=send_headers)
            response = urllib.request.urlopen(req, timeout=10)
        else:
            req = urllib2.Request(target_url, headers=send_headers)
            response = urllib2.urlopen(req, timeout=30)
            # Uncomment to inspect the response headers: print(response.info())

        return response.read().decode('utf-8')

    except Exception as e:
        # Exception objects have no .message attribute in Python 3; use str(e).
        print("Get html page content error: " + str(e))
Example 2
    def __init__(self, module, page, keyword):
        config = Config()
        pagesize = config.getValue("pagesize", module)

        print("\033[1;37;40m[*]Search Engine [%s] starting! The number of results per page is %s" % (module, pagesize))

        # Passing the Collect class as the process target means the child process
        # instantiates Collect, whose __init__ runs the whole crawl.
        myps = multiprocessing.Process(target=Collect, args=(module, page, pagesize, keyword,))
        myps.start()

        # Register the child process in the shared "process" list so it can be
        # joined or terminated elsewhere via the gol global store.
        processList = gol.get_value("process")
        processList.append(myps)
        gol.set_value("process", processList)
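
The gol module used above is project-local and not shown in these examples; the sketch below is only an assumption about the cross-module global store its get_value/set_value calls imply, not the project's actual implementation.

# gol.py -- hypothetical sketch of a shared global store (assumed, not from the project).
_global_dict = {"process": []}    # assumed to start with an empty process list

def set_value(key, value):
    # Store a value under the given key so other modules can read it.
    _global_dict[key] = value

def get_value(key, default=None):
    # Return the stored value, or the default when the key is unknown.
    return _global_dict.get(key, default)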
Example 3
import time

try:
    from urllib.parse import unquote    # Python 3
except ImportError:
    from urllib import unquote          # Python 2


# Config, OutFile, Baidu, So and Sougou are project-local modules.
class Collect(object):

    def __init__(self, module, page, pagesize, keyword):
        self.config = Config()
        self.module = module
        self.page = int(page)
        self.keyword = keyword
        self.pageSize = int(pagesize)

        # "savefile" is stored as a string in the config file, hence the 'True' comparison.
        self.saveFile = self.config.getValue("global", "savefile")

        if self.saveFile == 'True':
            self.outfile = OutFile(unquote(self.keyword))
        else:
            self.outfile = None

        self.collection()

    def collection(self):

        for i in range(self.page):
            print("\033[1;37;40m[*]Search Engine [%s], Page [%s]: start collecting." % (self.module, i+1))

            # Baidu paginates by result offset; So and Sogou take a page number instead.
            page_pn = (i * self.pageSize)

            if self.module == "baidu":
                my_baidu = Baidu(self.outfile)
                my_baidu.search(self.keyword, self.pageSize, page_pn)

            elif self.module == "so":
                my_so = So(self.outfile)
                my_so.search(self.keyword, i+1)

            elif self.module == "sougou":
                my_sougou = Sougou(self.outfile)
                my_sougou.search(self.keyword, i+1)

            # Optional delay between pages (from the config) to avoid hammering the engines.
            if self.config.sleeptime > 0:
                time.sleep(self.config.sleeptime)

        # Close the output file if one was opened.
        if self.outfile is not None:
            self.outfile.closeFile()
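
A brief usage sketch, assuming the Baidu/So/Sougou collector classes and OutFile are importable from the project and that keywords arrive URL-encoded, as the unquote() call in __init__ implies; the keyword itself is illustrative.

from urllib.parse import quote

# Hypothetical invocation: collect two pages of ten Baidu results for one keyword.
Collect("baidu", page=2, pagesize=10, keyword=quote("test keyword"))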
Example 4
class Collect(object):
    def __init__(self, module, page, pagesize, keyword):
        self.config = Config()
        self.module = module
        self.page = int(page)
        self.keyword = keyword
        self.pageSize = int(pagesize)

        self.saveFile = self.config.getValue("global", "savefile")

        if self.saveFile == 'True':
            self.outfile = OutFile(unquote(self.keyword))
        else:
            self.outfile = None

        self.collection()

    def collection(self):

        for i in range(self.page):
            print(
                "\033[1;37;40m[*]Search Engine [%s], Page [%s]: start collecting."
                % (self.module, i + 1))

            page_pn = (i * self.pageSize)

            if self.module == "baidu":
                my_baidu = Baidu(self.outfile)
                my_baidu.search(self.keyword, self.pageSize, page_pn)

            elif self.module == "so":
                my_so = So(self.outfile)
                my_so.search(self.keyword, i + 1)

            elif self.module == "sougou":
                my_sougou = Sougou(self.outfile)
                my_sougou.search(self.keyword, i + 1)

            if self.config.sleeptime > 0:
                time.sleep(self.config.sleeptime)

        # Close the output file if one was opened.
        if self.outfile is not None:
            self.outfile.closeFile()
Example 5
    def __init__(self, module, page, keyword):
        config = Config()
        pagesize = config.getValue("pagesize", module)

        print(
            "\033[1;37;40m[*]Search Engine [%s] starting! The number of results per page is %s"
            % (module, pagesize))

        myps = multiprocessing.Process(target=Collect,
                                       args=(
                                           module,
                                           page,
                                           pagesize,
                                           keyword,
                                       ))
        myps.start()

        processList = gol.get_value("process")
        processList.append(myps)
        gol.set_value("process", processList)
Example 6
def getHtmlContent(target_url, header_type):
    config = Config()
    try:
        if header_type == 'sougou':
            send_headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection':
                'keep-alive',
                'Cookie':
                'com_sohu_websearch_ITEM_PER_PAGE=' +
                str(config.getValue("pagesize", "sougou"))
            }
        else:
            send_headers = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
                'Accept':
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Connection': 'keep-alive'
            }

        if sys.version_info[0] >= 3:
            req = urllib.request.Request(target_url, headers=send_headers)
            response = urllib.request.urlopen(req, timeout=10)
        else:
            req = urllib2.Request(target_url, headers=send_headers)
            response = urllib2.urlopen(req, timeout=30)
            # Uncomment to inspect the response headers: print(response.info())

        return response.read().decode('utf-8')

    except Exception as e:
        # Exception objects have no .message attribute in Python 3; use str(e).
        print("Get html page content error: " + str(e))
Example 7
import io
import re

import tldextract


# Config is a project-local helper for reading configuration values.
class Filter(object):

    # filter_title_array = ['翻译', '词典']  # e.g. "translation", "dictionary"

    def __init__(self):
        self.config = Config()
        self.filterUrlParam = self.config.getValue("filter", "filter_urlparam")
        self.filterUrl = self.config.getValue("filter", "filter_url")
        self.filterTitle = self.config.getValue("filter", "filter_title")

        self.filterUrlList = self.get_filterurl()
        self.filterTitleList = self.get_filtertitle()


    # Apply the configured filters to a single search result (url, title).
    def filter_data(self, url, title):
        try:
            # domain = get_tld(url)
            urldata = tldextract.extract(url)
            domain = '.'.join(urldata[1:3])   # e.g. ('www', 'example', 'com') -> 'example.com'
        except Exception:
            print("Failed to parse URL: " + url)
            domain = url

        # Drop results whose domain is on the URL blacklist.
        if self.filterUrl == 'True':
            if domain in self.filterUrlList:
                return 'filter'

        # Drop results whose title contains any blacklisted keyword.
        if self.filterTitle == 'True':
            for filter_titlestr in self.filterTitleList:
                if filter_titlestr in title:
                    return 'filter'

        # When URL-parameter filtering is enabled, reduce the result to its host part.
        if self.filterUrlParam == 'True':
            m = re.match(r'^https?://([a-z0-9\-\.]+)[/?]?', url)
            if m:
                return m.group(1)

        return url


    def get_filterurl(self):
        # One domain per line in config/filter_url.txt.
        with io.open('config/filter_url.txt', encoding='utf-8') as file_object:
            file_context = file_object.read()

        return [line.strip() for line in file_context.splitlines() if line.strip()]

    def get_filtertitle(self):
        # One keyword per line in config/filter_title.txt.
        with io.open('config/filter_title.txt', encoding='utf-8') as file_object:
            file_context = file_object.read()

        return [line.strip() for line in file_context.splitlines() if line.strip()]
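
A short usage sketch, assuming the two blacklist files under config/ exist and the tldextract package is installed; the URL and title passed in are illustrative only.

# Hypothetical call: returns 'filter' for a blacklisted result, otherwise the URL
# (or only its host part when filter_urlparam is enabled in the config).
result_filter = Filter()
print(result_filter.filter_data('https://www.example.com/page?id=1', 'Example page title'))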