Example #1
    def doRequest(self):
        d = Download(self.Url)
        if d.doRequest():
            return 1

        self.recs = d.getSOURCE()
        return 0
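Every example below follows the same contract. As a point of reference, here is a minimal sketch of the Download class they appear to assume (an illustration only, not the original implementation): the constructor takes a URL plus the optional user agent, cookie, and proxy arguments seen in later examples; doRequest() returns 0 on success and 1 on failure; getSOURCE() returns the fetched body.

import urllib2

class Download(object):
    def __init__(self, url, userAgent=None, cookie=None, proxy=None):
        self.url = url
        self.userAgent = userAgent
        self.cookie = cookie
        self.proxy = proxy  # accepted but unused in this sketch
        self.source = None

    def doRequest(self):
        # 0 means success, 1 means failure -- the convention every
        # caller in these examples relies on.
        try:
            req = urllib2.Request(self.url)
            if self.userAgent:
                req.add_header('User-Agent', self.userAgent)
            if self.cookie:
                req.add_header('Cookie', self.cookie)
            self.source = urllib2.urlopen(req).read()
            return 0
        except Exception:
            return 1

    def getSOURCE(self):
        return self.source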
Example #2
  def doRequest(self):
    d = Download(self.Url)
    if d.doRequest():
      return 1

    self.recs = d.getSOURCE()
    return 0
Example #3
def isGoogleSearch(schema, ip):
  d = Download(schema + '://' + ip)
  if d.doRequest():
    return False

  if Utility.containsGoogle(d.getSOURCE()):
    return True
  
  return False
Example #4
def isGoogleSearch(schema, ip):
    d = Download(schema + '://' + ip)
    if d.doRequest():
        return False

    if Utility.containsGoogle(d.getSOURCE()):
        return True

    return False
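A hypothetical call (the scheme and IP below are illustrative only):

print isGoogleSearch('https', '203.0.113.7')  # True only if that host serves a Google search page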
Example #5
    def run(self):
        url = self.BASE_URL + self.SeasonId + self.BASE_URL_PART_3 + str(self.PageNumber) + self.BASE_URL_PART_5
        d = Download(url)
        if d.doRequest():
            # fail
            print 'ERROR: ' + self.SeasonId + '-' + str(self.PageNumber)
        else:
            utfstr2file(d.getSOURCE(), './data/' + self.SeasonId + '-' + str(self.PageNumber) + '.raw')

        return url
Example #6
    def requestHtml(self):
        url = self.BaseUrl + self.ISBN
        # print url, self.User_Agent
        d = Download(url, self.User_Agent)
        if d.doRequest():
            return 1

        self.HTML = d.getSOURCE()

        return 0
Example #7
    def request(self):
        baseUrl = 'http://shaishufang.com/index.php/site/detail/uid/'
        postFix = '/status//category/none/friend/false'
        url = baseUrl + self.UID + '/ubid/' + self.BID + postFix

        d = Download(url, self.Cookie, self.Proxy)
        if d.doRequest():
            return False

        self.HTML = d.getSOURCE()
        return True
Example #8
  def doRequest(self):
    playerId = str(self.PlayerId)
    seasonType = self.SeasonType.replace(" ", "+")
    url = self.Url + "PlayerId=" + playerId + "&SeasonType=" + seasonType + "&League=" + self.LeagueId
    d = Download(url)

    if d.doRequest() == 1:
      return 1

    self.recs = dumps(loads(d.getSOURCE()))
    return 0
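This snippet presumably relies on a module-level "from json import loads, dumps"; the dumps(loads(...)) round trip checks that the response body is valid JSON and re-serializes it to a normalized string before storing it in self.recs.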
Example #9
    def request(self):
        baseUrl = "http://shaishufang.com/index.php/site/main/uid/"
        postFix = "/friend/false/category//status//type//page/"
        url = baseUrl + self.UID + postFix + str(self.Page)

        d = Download(url, self.Cookie, self.Proxy)
        if d.doRequest():
            return False

        self.HTML = d.getSOURCE()
        return True
Example #10
    def run(self):
        url = self.BASE_URL + self.SeasonId + self.BASE_URL_PART_3 + str(
            self.PageNumber) + self.BASE_URL_PART_5
        d = Download(url)
        if d.doRequest():
            # fail
            print 'ERROR: ' + self.SeasonId + '-' + str(self.PageNumber)
        else:
            utfstr2file(
                d.getSOURCE(), './data/' + self.SeasonId + '-' +
                str(self.PageNumber) + '.raw')

        return url
Example #11
  def run(self):
    while True:
      print 'INFO: ........................................ START'
      stats = self.dbm.getStats()
      print 'INFO: deadLinks-', stats[0], ' unvisitedLinks-', stats[1], ' visitedLinks-', stats[2]
      # get an url from unvisitedLinks
      url = self.dbm.retrieveUnvisitedLink()
      if url is False:
        print 'DEBUG: DONE -- retrieveUnvisitedLink return False'
        break

      print 'DEBUG: Processing ', url

      if not self.urlFilter.isPlainText(url):
        print 'DEBUG: NotPlainTextURL ', url
        continue
      
      if not self.domainFilter.isInDomain(url):
        print 'DEBUG: NOT IN DOMAIN ', url
        continue

      # request the URL
      d = Download(url)
      if d.doRequest() == 1:
        if not self.dbm.createDeadLink(url):
          print 'DEBUG: deadLinks already contain ', url
        else:
          print 'DEBUG: Add To deadLinks ', url
      else:
        if self.dbm.createVisitedLink(url):
          print 'DEBUG: Add To visitedLinks ', url
        else:
          print 'DEBUG: Failed Add To visitedLinks ', url

        # extract URLs from the source
        u = URLExtractor(d.getSOURCE(), url)
        tmpUrls = u.getUrls()
        if tmpUrls:
          for url in tmpUrls:
            if self.dbm.isInDeadLink(url):
              continue
            elif self.dbm.isInVisitedLink(url):
              continue
            elif self.dbm.isInUnvisitedLink(url):
              continue
            else:
              print 'DEBUG: Add To unvisitedLink ', url
              self.dbm.createUnvisitedLink(url)
    
      print 'INFO: ........................................ END'
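Note the bookkeeping this loop assumes: retrieveUnvisitedLink presumably removes the link it returns, and each processed URL is then recorded in either deadLinks or visitedLinks, so the crawl terminates once the unvisited set is drained.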
Example #12
  def getStats(self):
    d = Download(self.API)
    if d.doRequest():
      return False

    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][1]['rowSet']:
      res.append(item[1:])

    if len(res) == 0:
      return False
    else:
      return res
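getStats here (and its longer variants in Examples #18 and #19) assumes a stats.nba.com-style payload in which resultSets is a list of result tables and each table's rowSet holds the data rows; item[1:] drops the leading identifier column. A fabricated payload of that shape, for illustration only:

{"resultSets": [{"rowSet": []}, {"rowSet": [["id1", "W", "L"], ["id2", "W", "L"]]}]}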
Example #13
def worker(appids, isbns, appidsCycle):
    # appidsCycle = cycle(appids)

    for isbn in isbns:
        url = 'http://' + appidsCycle.next() + '.appspot.com/url?url=' + 'http://book.douban.com/isbn/' + str(isbn)
        # print 'DEBUG: ', url

        d = Download(url)
        if d.doRequest():
            print isbn, 'network error'
            continue

        j = json.loads(d.getSOURCE())
        print isbn, j['status_code']

    return
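The commented-out first line shows how appidsCycle is expected to be built; a plausible invocation (the app IDs and ISBN are placeholders):

from itertools import cycle

appids = ['csrgxtu01', 'csrgxtu02']  # hypothetical App Engine app IDs
worker(appids, [9787508653594], cycle(appids))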
Example #14
    def run(self, processName='MainProcess'):
        for isbn in self.ISBNS:
            url = 'http://www.amazon.cn/s/ref=nb_sb_noss?field-keywords=' + isbn
            d = Download(url)
            if d.doRequest():
                print 'ERROR[' + processName + ']: ', isbn, 'NERR'
                appendstr2file(isbn, './NERR.txt')
                continue

            asin = ASINParser(d.getSOURCE())
            if asin.getAsin():
                print 'INFO[' + processName + ']: ', isbn, asin.getAsin()
                appendstr2file(isbn + ',' + asin.getAsin(), './OK.txt')
            else:
                print 'WARN[' + processName + ']: ', isbn, 'NOER'
                appendstr2file(isbn, './NOER.txt')
Example #15
def Google_Web_Search_Helper(q, hl='en', start=0):
  Google_Web_Search_URL = 'https://www.google.com/search?'

  if not q:
    return {}
  else:
    Google_Web_Search_URL = Google_Web_Search_URL + 'q=' + q

  Google_Web_Search_URL = Google_Web_Search_URL + '&hl=' + hl
  Google_Web_Search_URL = Google_Web_Search_URL + '&start=' + str(start)

  d = Download(Google_Web_Search_URL)
  if d.doRequest():
    return {}
  else:
    g = GoogleSearchResultParser(d.getSOURCE())
    return g.getJson()
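A sample call (the query string is illustrative):

results = Google_Web_Search_Helper('python urllib2', hl='en', start=10)
print results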
  """
    def run(self, processName='MainProcess'):
        for asin in self.ASINS:
            url = 'http://www.amazon.cn/dp/' + asin
            d = Download(url)
            if d.doRequest():
                print 'ERROR[' + processName + ']: ', asin, 'NERR'
                appendstr2file(asin, './NERRBasicInfo.txt')
                continue

            b = BasicInfoParser(d.getSOURCE())
            jsonRes = b.basicInfo()

            if json.loads(jsonRes):
                print 'info[' + processName + ']: ', asin
                appendstr2file(jsonRes, './OKBasicInfo.txt')
            else:
                print 'WARN[' + processName + ']: ', asin, 'NOER'
                appendstr2file(asin, './NOERBasicInfo.txt')
Example #17
  def walker(self):
    while True:
      urls = self.dbm.retrieveUnvisitedLinks(0, 100)
      urls = self.urlFilter.getFilteredUrls(urls)
      if len(urls) == 0:
        break

      for url in urls:
        print 'INFO: Processing ', url
        d = Download(url)
        if d.doRequest() == 1:
          self.dbm.createDeadLink(url)
        else:
          self.dbm.createVisitedLink(url)
          u = URLExtractor(d.getSOURCE(), url)
          tmpUrls = u.getUrls()
          if tmpUrls:
            self.dbm.createUnvisitedLinks(list(set(tmpUrls)))

    return True
Example #18
    def getStats(self):
        d = Download(self.API)
        if d.doRequest():
            return False

        res = []
        j = loads(d.getSOURCE())
        for item in j['resultSets'][0]['rowSet']:
            tmp = []
            name = item[3]
            pos = item[5]
            if item[6] == 'null':
                height = 'None'
            else:
                height = item[6]
            if item[7] == " ":
                weight = 'None'
            else:
                weight = item[7]
            age = item[9]
            if item[10] in ('R', 'None', None):
                exp = 0
            else:
                exp = item[10]

            tmp.append(name)
            tmp.append(pos)
            tmp.append(height)
            tmp.append(weight)
            tmp.append(age)
            tmp.append(exp)
            res.append(tmp)

        if len(res) == 0:
            return False
        else:
            return res
Example #19
  def getStats(self):
    d = Download(self.API)
    if d.doRequest():
      return False

    res = []
    j = loads(d.getSOURCE())
    for item in j['resultSets'][0]['rowSet']:
      tmp = []
      name = item[3]
      pos = item[5]
      if item[6] == 'null':
        height = 'None'
      else:
        height = item[6]
      if item[7] == " ":
        weight = 'None'
      else:
        weight = item[7]
      age = item[9]
      if item[10] in ('R', 'None', None):
        exp = 0
      else:
        exp = item[10]

      tmp.append(name)
      tmp.append(pos)
      tmp.append(height)
      tmp.append(weight)
      tmp.append(age)
      tmp.append(exp)
      res.append(tmp)

    if len(res) == 0:
      return False
    else:
      return res
Example #20
#!/usr/bin/env python
#coding=utf-8
#
# Author: Archer Reilly
# Date: 11/Aug/2014
# File: PlayerInfoParserTest.py
# Description: test the PlayerInfoParser class
# Website: http://csrgxtu.blog.com/
#
# Produced By CSRGXTU
from PlayerInfoParser import PlayerInfoParser
from Download import Download

URL = "http://sports.qq.com/d/f_players/3/2890/"
player = Download(URL)
if player.doRequest() != 0:
    print "Download Cant Do Requst"
else:
    print "Successfully Do Request"

playerParser = PlayerInfoParser(player.getSOURCE())
Example #21
  def doRequest(self, url):
    d = Download(url)
    if d.doRequest():
      return None
    else:
      return d.getSOURCE()
Example #22
from Download import Download
from TeamInfoParser import TeamInfoParser

"""
page = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
print page.text
parser = Parser(page.text)
#print parser.getBuyers()
"""
URL = "http://sports.qq.com/d/f_teams/1/42/"
soccer = Download(URL)
if soccer.doRequest() == 0:
  print "Successfully do request"
else:
  print "Failed do request"

html = soccer.getSOURCE()
parser = TeamInfoParser(html)
name = parser.getTeamName()
print "name:", unicode(name).encode('utf8')
name_cn = parser.getTeamNameCN()
print "name_cn:", unicode(name_cn).encode('utf8')
logo = parser.getTeamLogo()
print "logo:", logo
city = parser.getTeamCity()
print "city:", city
league = parser.getTeamLeague()
print "league:", league
found_time = parser.getTeamFoundTime()
print "found_time:", found_time
home_court_cn = parser.getTeamHomeCourtCN()
print "home_court_cn:", home_court_cn
Example #23
from Download import Download
from Parser import Parser
from TeamInfoParser import TeamInfoParser
"""
page = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
print page.text
parser = Parser(page.text)
#print parser.getBuyers()
"""
URL = "http://sports.qq.com/d/f_teams/1/42/"
soccer = Download(URL)
if soccer.doRequest() == 0:
    print "Successfully do request"
else:
    print "Failed do request"

html = soccer.getSOURCE()
parser = TeamInfoParser(html)
name = parser.getTeamName()
print "name:", unicode(name).encode('utf8')
name_cn = parser.getTeamNameCN()
print "name_cn:", unicode(name_cn).encode('utf8')
logo = parser.getTeamLogo()
print "logo:", logo
city = parser.getTeamCity()
print "city:", city
league = parser.getTeamLeague()
print "league:", league
found_time = parser.getTeamFoundTime()
print "found_time:", found_time
home_court_cn = parser.getTeamHomeCourtCN()
print "home_court_cn:", home_court_cn
Example #24
#!/usr/bin/env python
#coding=utf-8
#
# Author: Archer Reilly
# Date: 11/Aug/2014
# File: PlayerInfoParserTest.py
# Description: test the PlayerInfoParser class
# Website: http://csrgxtu.blog.com/
#
# Produced By CSRGXTU
from PlayerInfoParser import PlayerInfoParser
from Download import Download

URL = "http://sports.qq.com/d/f_players/3/2890/"
player = Download(URL)
if player.doRequest() != 0:
  print "Download Cant Do Requst"
else:
  print "Successfully Do Request"

playerParser = PlayerInfoParser(player.getSOURCE())

Example #25
#!/usr/bin/env python
#
# Usage: python crawlerapitester.py 10
#
from Download import Download
import json
import sys

url = 'http://csrgxtu01.appspot.com/url?url=http://book.douban.com/isbn/9787508653594'

for i in range(int(sys.argv[1])):
    d = Download(url)
    if d.doRequest():
        print i, "can't doRequest"
        continue

    j = json.loads(d.getSOURCE())
    print i, j['err'], j['status_code']