コード例 #1
0
ファイル: tlds.py プロジェクト: hoytnix/spidey
def dumps_to_db():
    """Merge the lines returned by get_dumps() into the Jsondb store.

    Each line is treated as a URL without a scheme: 'http://' is
    prepended before it is handed to get_domain_parts() and urlparse().
    Records are keyed by '<domain>.<tld>' and hold two de-duplicated
    lists, 'subdomains' and 'paths'.  A line for which parsing raises
    ValueError is skipped silently.

    The store size is printed before and after the merge, then the
    store is saved.
    """
    store = Jsondb()
    records = store.db

    print(len(records))

    for raw in get_dumps():
        address = 'http://' + raw
        try:
            parts = get_domain_parts(address)
            path = urlparse(address).path
            key = '%s.%s' % (parts.domain, parts.tld)

            # Create the record the first time a domain is seen.
            if key not in records:
                records[key] = {'subdomains': [], 'paths': []}
            entry = records[key]

            if path not in entry['paths']:
                entry['paths'].append(path)

            # A falsy `subdomains` contributes nothing.
            for sub in parts.subdomains or ():
                if sub not in entry['subdomains']:
                    entry['subdomains'].append(sub)
        except ValueError:
            pass

    store.update(records)
    print(len(store.db))
    store.save()
コード例 #2
0
ファイル: 03.py プロジェクト: hoytnix/spidey
def main():
    """Index the words listed for each site in the archive into ./json/03.json.

    The text returned by get_archive() is joined and split on a rule of
    100 dashes; the part after the first rule is split again on rules of
    80 dashes into per-site records (the text after the last rule is
    dropped).  In each record the first non-blank line names the site
    (with the 'file: ' prefix removed) and every following non-blank
    line is one word.

    Two tables in the database are updated:
      * data['domains'][domain] -- list of words seen for that site
      * data['words'][word]     -- number of times the word was seen

    Raises IndexError if the archive contains no 100-dash rule.
    """
    # Init DB
    db = DB(f='./json/03.json')
    data = db.db
    print('%s %s' % (db.f, len(db.db)))

    section_break = '-' * 100
    site_break = '-' * 80

    # Get Archive!
    archive = ''.join(get_archive())

    # Part 0 (everything before the first 100-dash rule) is not used here.
    body = archive.split(section_break)[1]
    sites = body.split(site_break)[:-1]

    for site in sites:
        # Build a filtered copy: removing items from a list while
        # iterating over it skips elements and left blank lines behind.
        lines = [line for line in site.split('\n') if line != '']

        if len(lines) <= 1:
            continue

        domain = lines[0].replace('file: ', '')
        words = lines[1:]

        if domain not in data['domains']:
            data['domains'][domain] = list(words)
        else:
            data['domains'][domain].extend(words)

        for word in words:
            data['words'][word] = data['words'].get(word, 0) + 1

    # Update DB
    db.update(data)
    db.save()
コード例 #3
0
ファイル: tlds.py プロジェクト: hoytnix/spidey
def update_db():
    """Store the TLD of every key of the db_15-05-26 dump on its record.

    Each key is prefixed with 'http://' and passed to get_domain_parts();
    the resulting `tld` is written to the record's 'tld' field.  Keys for
    which that raises ValueError are printed and left unchanged.  The
    store is then updated and saved.
    """
    store = Jsondb(f='/home/johnny/dev/backzupz/spider/a/_dumps/db_15-05-26.json')
    records = store.db

    for host in records:
        try:
            records[host]['tld'] = get_domain_parts('http://' + host).tld
        except ValueError:
            print(host)

    store.update(records)
    store.save()
コード例 #4
0
def dumps_to_db():
    """Merge the lines returned by get_dumps() into the Jsondb store.

    Each line is treated as a URL without a scheme: 'http://' is
    prepended before it is handed to get_domain_parts() and urlparse().
    Records are keyed by '<domain>.<tld>' and hold two de-duplicated
    lists, 'subdomains' and 'paths'.  A line for which parsing raises
    ValueError is skipped silently.

    The store size is printed before and after the merge, then the
    store is saved.
    """
    store = Jsondb()
    records = store.db

    print(len(records))

    for raw in get_dumps():
        address = 'http://' + raw
        try:
            parts = get_domain_parts(address)
            path = urlparse(address).path
            key = '%s.%s' % (parts.domain, parts.tld)

            # Create the record the first time a domain is seen.
            if key not in records:
                records[key] = {'subdomains': [], 'paths': []}
            entry = records[key]

            if path not in entry['paths']:
                entry['paths'].append(path)

            # A falsy `subdomains` contributes nothing.
            for sub in parts.subdomains or ():
                if sub not in entry['subdomains']:
                    entry['subdomains'].append(sub)
        except ValueError:
            pass

    store.update(records)
    print(len(store.db))
    store.save()
コード例 #5
0
def domain_stats():
    """Count the records of the db_15-05-26 dump per TLD.

    Returns a dict mapping each value found in a record's 'tld' field to
    the number of records carrying it.  Raises KeyError if a record has
    no 'tld' field.
    """
    store = Jsondb(f='/home/johnny/dev/backzupz/spider/a/_dumps/db_15-05-26.json')
    records = store.db

    counts = {}
    for host in records:
        suffix = records[host]['tld']
        counts[suffix] = counts.get(suffix, 0) + 1

    return counts
コード例 #6
0
def update_db():
    """Store the TLD of every key of the db_15-05-26 dump on its record.

    Each key is prefixed with 'http://' and passed to get_domain_parts();
    the resulting `tld` is written to the record's 'tld' field.  Keys for
    which that raises ValueError are printed and left unchanged.  The
    store is then updated and saved.
    """
    store = Jsondb(f='/home/johnny/dev/backzupz/spider/a/_dumps/db_15-05-26.json')
    records = store.db

    for host in records:
        try:
            records[host]['tld'] = get_domain_parts('http://' + host).tld
        except ValueError:
            print(host)

    store.update(records)
    store.save()
コード例 #7
0
ファイル: forms.py プロジェクト: hoytnix/spidey
    def doWork(self):
        """Worker loop: take site indexes off self.q and collect their forms.

        For each index the dump text is loaded with get_txt() and passed
        to search_forms(); a truthy result is stored in self.json under
        the site's name from self.sites.  A site that fails to load or
        parse is skipped (best effort).

        When self.json reaches `self.lines - 150000 - 2` entries it is
        written to the current DB, and a new DB file with the next
        number is opened with an empty buffer.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            self.domain = int(self.q.get())
            try:
                try:
                    # Loading is guarded as well: an unreadable dump used
                    # to kill the worker before task_done(), leaving the
                    # producer blocked on the queue forever.
                    self.txt = self.get_txt(self.domain)
                    self.s = self.search_forms(txt=self.txt)
                    if self.s:
                        self.json[self.sites[self.domain]] = self.s
                except Exception:
                    # Exception, not a bare except, so SystemExit and
                    # KeyboardInterrupt are not swallowed.
                    pass

                # Progress tick.
                # NOTE(review): '%H:%S' prints hours and seconds -- confirm
                # that '%H:%M' was not intended.
                if len(self.json) % 50 == 0:
                    print(strftime('%H:%S'))

                # Flush threshold.
                # NOTE(review): the 150000 and 2 offsets are unexplained
                # in this view -- confirm with the author.
                if len(self.json) == self.lines - 150000 - 2:
                    self.db.update(self.json)
                    self.db.save()
                    self.dbs += 1
                    self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
                    self.json = {}
            finally:
                # Always acknowledge the item so q.join() can complete.
                self.q.task_done()
コード例 #8
0
ファイル: forms.py プロジェクト: hoytnix/spidey
    def __init__(self):
        """Set up the output DB and the work queue, then process every site.

        Creates the JSON file for DB number self.dbs, loads the site list
        with get_files(), starts self.concurrent daemon threads running
        doWork(), and feeds them the site indexes from `self.dbs * 2000`
        up to the number of sites.  Blocks until the queue is drained;
        Ctrl-C exits the process with status 1.
        """
        # Initialize Database
        self.dbs = 70
        self.new_db()
        self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
        self.json = {}

        # Stuff
        self.txt = ''
        self.sites = self.get_files()
        self.lines = len(self.sites)
        self.concurrent = 1
        self.q = Queue(self.concurrent * 2)

        # Loop counters are plain locals: attributes on self are shared
        # with the worker threads, so a worker writing self.i between
        # the loop assignment and put() could enqueue the wrong index.
        for _ in range(self.concurrent):
            self.t = Thread(target=self.doWork)
            self.t.daemon = True
            self.t.start()
        try:
            for index in xrange(self.dbs * 2000, self.lines):
                self.q.put(index)
            self.q.join()
        except KeyboardInterrupt:
            sys.exit(1)
コード例 #9
0
ファイル: 01.py プロジェクト: hoytnix/spidey
def main():
    """Extract Facebook links and e-mail addresses per site into ./json/01.json.

    The text returned by get_archive() is joined and split on a rule of
    100 dashes; the part after the first rule is split again on rules of
    80 dashes into per-site records (the text after the last rule is
    dropped).  In each record the first non-blank line names the site
    (with the 'file: ' prefix removed); rows from the third non-blank
    line onward are scanned:

      * rows starting with 'f': the text from column 13 is kept as the
        site's 'facebook' value when it contains 'facebook.com', is not
        just the bare domain, and contains none of 'fbml', '.php',
        'oauth';
      * rows starting with 'm': the text from column 8 is kept as the
        site's 'email' value when it contains '@'.

    If several rows qualify for the same field the last one wins.  A
    site with at least one match gets its database entry replaced by
    the collected fields.

    Raises IndexError if the archive contains no 100-dash rule.
    """
    # Init DB
    db = DB(f='./json/01.json')
    data = db.db
    print('%s %s' % (db.f, len(db.db)))

    section_break = '-' * 100
    site_break = '-' * 80

    # Get Archive!
    archive = ''.join(get_archive())

    # Part 0 (everything before the first 100-dash rule) is not used here.
    body = archive.split(section_break)[1]
    sites = body.split(site_break)[:-1]

    for site in sites:
        # Build a filtered copy: removing items from a list while
        # iterating over it skips elements and left blank rows behind.
        rows = [row for row in site.split('\n') if row != '']

        if len(rows) <= 2:
            continue

        url = rows[0].replace('file: ', '')

        # Collect per site, so an e-mail row no longer wipes out a
        # Facebook row found earlier for the same site (and vice versa).
        found = {}
        for row in rows[2:]:
            if row.startswith('f'):
                link = row[13:]
                unwanted = 'fbml' in link or '.php' in link or 'oauth' in link
                if (not unwanted and 'facebook.com' in link
                        and link not in ('facebook.com', 'facebook.com/')):
                    found['facebook'] = link
            elif row.startswith('m'):
                address = row[8:]
                if '@' in address:
                    found['email'] = address

        if found:
            data[url] = found

    # Update DB
    db.update(data)
    db.save()
コード例 #10
0
ファイル: forms.py プロジェクト: hoytnix/spidey
class Work:
    """Scan stored page dumps for HTML forms and save the findings as JSON.

    Constructing an instance runs the whole job: the site list is read
    with get_files(), daemon worker threads running doWork() are
    started, and every site index from `self.dbs * 2000` onward is
    queued.  Results are buffered in self.json and written to
    './json/forms/forms-<n>.json' files through DB.

    Per-call scratch values are kept in local variables, not on self:
    the producer loop in __init__ and the worker thread run at the same
    time, and shared counters such as `self.i` could be overwritten
    between threads.
    """

    # Tag attributes copied by control() when present and truthy.
    _CONTROL_ATTRS = ('name', 'id', 'action', 'method', 'class', 'type',
                      'disabled')

    def __init__(self):
        """Set up the DB and the queue, then process every site (blocking).

        Ctrl-C while queueing or waiting exits the process with status 1.
        """
        # Initialize Database
        self.dbs = 70
        self.new_db()
        self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
        self.json = {}

        # Stuff
        self.txt = ''
        self.sites = self.get_files()
        self.lines = len(self.sites)
        self.concurrent = 1
        self.q = Queue(self.concurrent * 2)
        for _ in range(self.concurrent):
            self.t = Thread(target=self.doWork)
            self.t.daemon = True
            self.t.start()
        try:
            for index in xrange(self.dbs * 2000, self.lines):
                self.q.put(index)
            self.q.join()
        except KeyboardInterrupt:
            sys.exit(1)

    def new_db(self):
        """Create (or truncate) the JSON file for DB number self.dbs as '{}'."""
        with open('./json/forms/forms-%d.json' % self.dbs, 'w+') as f:
            f.write('{}')

    def doWork(self):
        """Worker loop: take site indexes off self.q and collect their forms.

        A site that fails to load or parse is skipped (best effort).
        When self.json reaches `self.lines - 150000 - 2` entries it is
        written to the current DB and a DB with the next number is
        opened with an empty buffer.  Never returns.
        """
        while True:
            domain = int(self.q.get())
            try:
                try:
                    # Loading is guarded as well: an unreadable dump used
                    # to kill the worker before task_done(), leaving the
                    # producer blocked on the queue forever.
                    self.txt = self.get_txt(domain)
                    found = self.search_forms(txt=self.txt)
                    if found:
                        self.json[self.sites[domain]] = found
                except Exception:
                    # Exception, not a bare except, so SystemExit and
                    # KeyboardInterrupt are not swallowed.
                    pass

                # Progress tick.
                # NOTE(review): '%H:%S' prints hours and seconds -- confirm
                # that '%H:%M' was not intended.
                if len(self.json) % 50 == 0:
                    print(strftime('%H:%S'))

                # Flush threshold.
                # NOTE(review): the 150000 and 2 offsets are unexplained
                # here -- confirm with the author.
                if len(self.json) == self.lines - 150000 - 2:
                    self.db.update(self.json)
                    self.db.save()
                    self.dbs += 1
                    self.db = DB(f='./json/forms/forms-%d.json' % self.dbs)
                    self.json = {}
            finally:
                # Always acknowledge the item so q.join() can complete.
                self.q.task_done()

    def control(self, c):
        """Return the truthy attributes of `c` named in _CONTROL_ATTRS.

        `c` is any object with a dict-style get(); attributes that are
        missing or falsy are left out of the returned dict.
        """
        control = {}
        for attr in self._CONTROL_ATTRS:
            value = c.get(attr)
            if value:
                control[attr] = value
        return control

    def search_forms(self, txt):
        """Describe every <form> found in the markup `txt`.

        Returns a dict keyed by the form's position as a string ('0',
        '1', ...).  Each value holds the form's own attributes (see
        control()) plus 'inputs', a list with one attribute dict per
        <input> inside that form.  A page without forms yields an empty
        dict.
        """
        soup = bs4(txt)

        # The previous `len(forms) < 0` guard could never fire, so it was
        # removed; an empty result is still falsy for the caller.
        site = {}
        for index, form in enumerate(soup.findAll('form')):
            entry = self.control(form)
            entry['inputs'] = [self.control(field)
                               for field in form.findAll('input')]
            site[str(index)] = entry

        return site

    def get_txt(self, domain):
        """Return the decompressed dump of the site at index `domain`."""
        # Read Data.  Binary mode: the file holds zlib-compressed bytes.
        with open('%s/%s' % (DUMPS_DIR, self.sites[domain]), 'rb') as f:
            cd = f.read()

        # Decompress
        return zlib.decompress(cd)

    def get_files(self):
        """Return the dump file names listed in '../dumps/04-forms'.

        The part of the file after the first rule of 100 dashes is split
        on runs of 5 dashes (the last piece is dropped); from each
        non-blank piece the last line, minus its 'file: ' prefix, is
        taken as a file name.

        Raises IndexError if the file contains no 100-dash rule.
        """
        with open('../dumps/04-forms', 'r') as f:
            txt = f.read()

        # Sanitize.
        body = txt.split('-' * 100)[1]
        pieces = [piece.strip() for piece in body.split('-' * 5)[:-1]]
        # Filter into a new list: removing items from a list while
        # iterating over it skips elements and left blank pieces behind.
        pieces = [piece for piece in pieces if piece != '']

        # Search.
        return [piece.split('\n')[-1].replace('file: ', '')
                for piece in pieces]
コード例 #11
0
ファイル: 01.py プロジェクト: hoytnix/spidey
def main():
    """Extract Facebook links and e-mail addresses per site into ./json/01.json.

    The text returned by get_archive() is joined and split on a rule of
    100 dashes; the part after the first rule is split again on rules of
    80 dashes into per-site records (the text after the last rule is
    dropped).  In each record the first non-blank line names the site
    (with the 'file: ' prefix removed); rows from the third non-blank
    line onward are scanned:

      * rows starting with 'f': the text from column 13 is kept as the
        site's 'facebook' value when it contains 'facebook.com', is not
        just the bare domain, and contains none of 'fbml', '.php',
        'oauth';
      * rows starting with 'm': the text from column 8 is kept as the
        site's 'email' value when it contains '@'.

    If several rows qualify for the same field the last one wins.  A
    site with at least one match gets its database entry replaced by
    the collected fields.

    Raises IndexError if the archive contains no 100-dash rule.
    """
    # Init DB
    db = DB(f='./json/01.json')
    data = db.db
    print('%s %s' % (db.f, len(db.db)))

    section_break = '-' * 100
    site_break = '-' * 80

    # Get Archive!
    archive = ''.join(get_archive())

    # Part 0 (everything before the first 100-dash rule) is not used here.
    body = archive.split(section_break)[1]
    sites = body.split(site_break)[:-1]

    for site in sites:
        # Build a filtered copy: removing items from a list while
        # iterating over it skips elements and left blank rows behind.
        rows = [row for row in site.split('\n') if row != '']

        if len(rows) <= 2:
            continue

        url = rows[0].replace('file: ', '')

        # Collect per site, so an e-mail row no longer wipes out a
        # Facebook row found earlier for the same site (and vice versa).
        found = {}
        for row in rows[2:]:
            if row.startswith('f'):
                link = row[13:]
                unwanted = 'fbml' in link or '.php' in link or 'oauth' in link
                if (not unwanted and 'facebook.com' in link
                        and link not in ('facebook.com', 'facebook.com/')):
                    found['facebook'] = link
            elif row.startswith('m'):
                address = row[8:]
                if '@' in address:
                    found['email'] = address

        if found:
            data[url] = found

    # Update DB
    db.update(data)
    db.save()