def handle_data(self, data):
    """Harvest link-like strings from text found inside a script element.

    Nothing happens unless ``self.inscript`` is set.  Otherwise the links
    returned by ``lamejs`` for *data* are added to ``self.liens``, followed
    by every quoted literal that looks like a path or URL.

    data -- the text passed by the parser
    """
    if not self.inscript:
        return

    # Links reported by lamejs for this script text come first.
    self.liens.extend(lamejs.lamejs(data).getLinks())

    # Double-quoted literals are scanned first, then single-quoted ones.
    literals = re.findall(r'"([A-Za-z0-9_=#&%\.\+\?/-]*)"', data)
    literals += re.findall(r"'([A-Za-z0-9_=#&%\.\+\?/-]*)'", data)

    for literal in literals:
        # A literal needs at least one of '/', '.' or '?' to be kept.
        if not ('/' in literal or '.' in literal or '?' in literal):
            continue
        if literal in self.common_js_strings:
            continue
        self.liens.append(literal)
def handle_data(self, data):
    """Harvest link-like strings from text found inside a script element.

    Nothing happens unless ``self.inscript`` is set.  Otherwise the links
    returned by ``lamejs`` for *data* are added to ``self.liens``, followed
    by every quoted literal that contains one of the allowed extensions and
    is not listed in ``self.common_js_strings``.

    data -- the text passed by the parser
    """
    if not self.inscript:
        return

    allowed_ext = (".php", ".asp", ".xml", ".js", ".json", ".jsp")

    self.liens.extend(lamejs.lamejs(data).getLinks())

    # Double-quoted literals are scanned first, then single-quoted ones.
    candidates = re.findall(r'"([A-Za-z0-9_=#&%\.\+\?/-]*)"', data)
    candidates += re.findall(r"'([A-Za-z0-9_=#&%\.\+\?/-]*)'", data)

    for jstr in candidates:
        if jstr in self.common_js_strings:
            continue
        # Record each literal once.  Looping over the extensions and
        # appending on every hit added duplicates, because ".js" is itself
        # a substring of ".json" and ".jsp".
        if any(ext in jstr for ext in allowed_ext):
            self.liens.append(jstr)
def handle_starttag(self, tag, attrs):
    """Record links and form fields found in an opening HTML tag.

    tag   -- tag name; it is compared case-insensitively
    attrs -- iterable of (name, value) pairs; pairs whose value is None
             are ignored and attribute names are lower-cased

    Links are accumulated in ``self.liens``.  Form state is tracked through
    ``self.inform``, ``self.form_values``, ``self.current_form_url`` and
    ``self.current_form_method``.  A script tag sets ``self.inscript``.
    """
    tmpdict = {}
    for k, v in attrs:
        if v is None:
            continue
        lk = k.lower()
        # Only the first occurrence of a repeated attribute is kept.
        if lk not in tmpdict:
            tmpdict[lk] = v
        # Event handler attributes hold JavaScript: hand them to lamejs.
        if lk in self.js_events:
            self.liens.extend(lamejs.lamejs(v).getLinks())

    # The tag groups below are disjoint, so a single chain is enough.
    tag_name = tag.lower()

    if tag_name in ('a', 'link'):
        if "href" in tmpdict:
            href = tmpdict['href']
            if href.lower().startswith("javascript:"):
                self.liens.extend(lamejs.lamejs(href.split(':', 1)[1]).getLinks())
            else:
                self.liens.append(href)

    elif tag_name == 'form':
        self.inform = 1
        self.form_values = []
        self.current_form_url = self.url
        if "action" in tmpdict:
            action = tmpdict['action']
            # Require the colon, as for href: a plain URL such as
            # "javascript.php" has no ':' and split(':', 1)[1] would raise
            # IndexError.
            if action.lower().startswith("javascript:"):
                self.liens.extend(lamejs.lamejs(action.split(':', 1)[1]).getLinks())
            # The raw action is always recorded, javascript: or not.
            self.liens.append(action)
            self.current_form_url = action

        # Forms use GET method by default
        self.current_form_method = "get"
        if "method" in tmpdict and tmpdict["method"].lower() == "post":
            self.current_form_method = "post"

    elif tag_name == 'input':
        if self.inform == 1:
            # An input without an explicit type is treated as a text field.
            if "type" not in tmpdict:
                tmpdict["type"] = "text"
            if "name" in tmpdict:
                input_type = tmpdict['type'].lower()
                if input_type in self.__defaults:
                    # use the value from the form or use our default value
                    if "value" in tmpdict and tmpdict["value"] != "":
                        val = tmpdict["value"]
                    else:
                        val = self.__defaults[input_type]
                    self.form_values.append([tmpdict['name'], val])

                # Image inputs contribute name.x / name.y fields.
                if input_type == "image":
                    self.form_values.append([tmpdict['name'] + ".x", "1"])
                    self.form_values.append([tmpdict['name'] + ".y", "1"])

        if "formaction" in tmpdict:
            self.liens.append(tmpdict['formaction'])

    elif tag_name in ("textarea", "select"):
        if self.inform == 1 and "name" in tmpdict:
            self.form_values.append([tmpdict['name'], u'on'])

    elif tag_name in ("frame", "iframe"):
        if "src" in tmpdict:
            self.liens.append(tmpdict['src'])

    elif tag_name in ("img", "embed", "track", "source"):
        if "src" in tmpdict:
            src = tmpdict['src']
            # Keep only sources with a query string or ending in .swf.
            if "?" in src or src.endswith(".swf"):
                self.liens.append(src)

    elif tag_name == "script":
        self.inscript = 1
        if "src" in tmpdict:
            self.liens.append(tmpdict['src'])

    elif tag_name == "meta":
        if "http-equiv" in tmpdict and "content" in tmpdict:
            if tmpdict["http-equiv"].lower() == "refresh":
                # Locate "url=" case-insensitively, then slice the original
                # content so the target keeps its case.
                content_str = tmpdict["content"].lower()
                url_eq_idx = content_str.find("url=")
                if url_eq_idx >= 0:
                    self.liens.append(tmpdict["content"][url_eq_idx + 4:])
# Mismatch ! Convert the response text to the encoding detected by BeautifulSoup resp.setEncoding(page_encoding) else: page_encoding = resp_encoding data = resp.getPage() else: # Can't find an encoding... beware of non-html content data = resp.getRawPage() if "application/x-shockwave-flash" in mime_type or web_resource.file_ext == "swf": try: flash_parser = swf_parser.swf_parser(data) swf_links = flash_parser.getLinks() except Exception, err_data: swf_links = err_data[1] elif "/x-javascript" in mime_type or "/x-js" in mime_type or "/javascript" in mime_type: js_links = lamejs.lamejs(data).getLinks() data = "" # Manage redirections if "location" in info: redir = self.correctlink(info["location"], current, current_full_url, currentdir, proto, None) if redir is not None: if self.__inzone(redir) == 0: self.link_encoding[redir] = self.link_encoding[url] redir = HTTP.HTTPResource(redir, link_depth=current_depth+1) # Is the document not visited yet and not forbidden ? if (redir not in self.browsed_links and redir not in self.tobrowse and not self.isExcluded(redir)): self.tobrowse.append(redir)