def __init__(self):
    """Set up form extraction, browser-like default headers, logging,
    and the parser/cookie state used for urllib requests."""
    # Fall back to the naive scoring algorithm if formasaurus is not installed
    if FORMASAURUS:
        self.form_extractor = FormExtractor.load()
    else:
        self.form_extractor = None
    # Set some friendly headers so sites treat us as a normal browser
    self.user_agent = (
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
        'Chrome/43.0.2357.130 Safari/537.36'
    )
    # NOTE(review): despite the attribute name, this is the Accept header
    # value (content types), not Accept-Encoding. The attribute name is
    # kept for backward compatibility with any external readers.
    # Fixed: quality factor syntax is 'q=0.9', not 'q-0.9'.
    self.accept_encoding = (
        'text/html,application/xhtml+xml,'
        'application/xml;q=0.9,*/*;q=0.8'
    )
    self.headers = {
        'User-Agent': self.user_agent,
        'Accept': self.accept_encoding,
        'Accept-Language': 'en',
    }
    # NOTE(review): basicConfig mutates process-wide logging state; doing
    # it in __init__ means the last-constructed instance wins. Consider
    # moving this to application startup.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(filename)s[line:%(lineno)d] '
               '%(levelname)s:%(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename=None,
        filemode='a+')
    # Cookie jar and html parser for making requests with urllib
    self.parser = HTMLParser(recover=True, encoding='utf-8')
    self.cookie_jar = CookieJar()
def __init__(self, html_in=None, url=None):
    """Build a registration-form model from raw HTML or from a URL.

    :param html_in: HTML document text; if falsy, the page is fetched
        from ``url`` instead.
    :param url: page to fetch when ``html_in`` is not supplied.
    """
    # Words/phrases to check against that indicate that a field is a
    # certain type of field; we'll add to this as we add to forms.
    # This should be checked against field names and placeholders.
    self.keywords_dic = {
        "email": ["user[email]", "email"],
        "email_confirmation": ["user[email_confirmation]"],
        "name": ["user[name]"],
        "password": ["user[password]"],
        "password_confirmation": ["user[password_confirmation]"],
    }
    self.html_in = html_in
    if not self.html_in:
        # No HTML supplied: fetch the page body from the given URL.
        r = requests.get(url)
        self.html_in = r.text
    # (Removed leftover debug print that dumped the entire page to stdout.)
    self.fe = FormExtractor.load()
    self.form = self._extract_forms_and_types(self.html_in)
    self.action = self.form.action
    self.inputs = self._get_inputs()
    self.filled_inputs = None
    self.filled_form = RegistrationForm()
def __init__(self):
    """Set up form extraction, browser-like default headers, and the
    parser/cookie state used for urllib requests."""
    # Fall back to the naive scoring algorithm if formasaurus is not installed
    if FORMASAURUS:
        self.form_extractor = FormExtractor.load()
    else:
        self.form_extractor = None
    # Set some friendly headers so sites treat us as a normal browser
    self.user_agent = ('Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
                       'Chrome/43.0.2357.130 Safari/537.36')
    # NOTE(review): despite the attribute name, this is the Accept header
    # value, not Accept-Encoding; name kept for backward compatibility.
    # Fixed: quality factor syntax is 'q=0.9', not 'q-0.9'.
    self.accept_encoding = ('text/html,application/xhtml+xml,'
                            'application/xml;q=0.9,*/*;q=0.8')
    self.headers = {
        'User-Agent': self.user_agent,
        'Accept': self.accept_encoding,
        'Accept-Language': 'en',
    }
    # Cookie jar and html parser for making requests with urllib
    self.parser = HTMLParser(recover=True, encoding='utf-8')
    self.cookie_jar = CookieJar()
def __init__(self):
    """Set up form extraction, browser-like default headers, and the
    parser/cookie state used for urllib requests."""
    # Fall back to the naive scoring algorithm if formasaurus is not installed
    if FORMASAURUS:
        self.form_extractor = FormExtractor.load()
    else:
        self.form_extractor = None
    # Set some friendly headers so sites treat us as a normal browser
    self.user_agent = (
        'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
        'Chrome/43.0.2357.130 Safari/537.36'
    )
    # NOTE(review): despite the attribute name, this is the Accept header
    # value, not Accept-Encoding; name kept for backward compatibility.
    # Fixed: quality factor syntax is 'q=0.9', not 'q-0.9'.
    self.accept_encoding = (
        'text/html,application/xhtml+xml,'
        'application/xml;q=0.9,*/*;q=0.8'
    )
    self.headers = {
        'User-Agent': self.user_agent,
        'Accept': self.accept_encoding,
        'Accept-Language': 'en',
    }
    # Cookie jar and html parser for making requests with urllib
    self.parser = HTMLParser(recover=True, encoding='utf-8')
    self.cookie_jar = CookieJar()
import json import os import subprocess import urllib from flask import Flask from formasaurus import FormExtractor import lxml app = Flask(__name__) ex = FormExtractor.load() LEADER = "/tmp" @app.route("/") def hello(): return "Hello!" @app.route("/extract/local/<path:file_path>") def extract_local_file_forms(file_path): full_path = os.path.join(LEADER, file_path) tree = lxml.html.parse(full_path) return _process_parsed_tree(tree) @app.route("/extract/url/<path:url>") def extract_site_forms(url): tree = lxml.html.parse(url) return _process_parsed_tree(tree) def _process_parsed_tree(tree): forms = ex.extract_forms(tree)