Ejemplo n.º 1
0
 def __init__(self):
     """Set up the form extractor, request headers, logging and HTTP helpers.

     ``self.form_extractor`` is a loaded ``FormExtractor`` when the
     ``FORMASAURUS`` flag is truthy and ``None`` otherwise.
     """
     # Use naive scoring algorithm if formasaurus is not installed
     if FORMASAURUS:
         self.form_extractor = FormExtractor.load()
     else:
         self.form_extractor = None
     # Set some friendly headers
     self.user_agent = (
         'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
         'Chrome/43.0.2357.130 Safari/537.36'
     )
     # NOTE: despite its name, this attribute holds the value sent as the
     # ``Accept`` header (see ``self.headers`` below). The quality parameter
     # must be written ``q=0.9``; the previous ``q-0.9`` was malformed.
     self.accept_encoding = (
         'text/html,application/xhtml+xml,'
         'application/xml;q=0.9,*/*;q=0.8'
     )
     self.headers = {
         'User-Agent': self.user_agent,
         'Accept': self.accept_encoding,
         'Accept-Language': 'en',
     }
     # Configure root logging at INFO level. No filename is given, so
     # records are not written to a log file by this call.
     logging.basicConfig(level=logging.INFO,
             format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s:%(message)s',
             datefmt='%a, %d %b %Y %H:%M:%S',
             filename=None,
             filemode='a+')
     # Cookie jar and html parser for making requests with urllib
     self.parser = HTMLParser(recover=True, encoding='utf-8')
     self.cookie_jar = CookieJar()
Ejemplo n.º 2
0
    def __init__(self, html_in=None, url=None):
        """Load a page's HTML, extract its form and prepare it for filling.

        Args:
            html_in: HTML text to analyse. When falsy, the page is
                downloaded from ``url`` instead.
            url: Address fetched with ``requests.get`` when ``html_in`` is
                not supplied.
        """
        # TODO: needs to take in html
        # Words/phrases to check against that indicate that a field
        # is a certain type of field; we'll add to this as we add to
        # forms. This should be checked against field names and placeholders.
        self.keywords_dic = {
            "email": ["user[email]", "email"],
            "email_confirmation": ["user[email_confirmation]"],
            "name": ["user[name]"],
            "password": ["user[password]"],
            "password_confirmation": ["user[password_confirmation]"],
        }

        self.html_in = html_in

        if not self.html_in:
            r = requests.get(url)
            self.html_in = r.text

        # Parenthesised single-argument form prints the same text under both
        # Python 2 and Python 3; the bare print statement only parses on 2.
        print(self.html_in)

        self.fe = FormExtractor.load()
        self.form = self._extract_forms_and_types(self.html_in)
        self.action = self.form.action
        self.inputs = self._get_inputs()
        self.filled_inputs = None
        self.filled_form = RegistrationForm()
Ejemplo n.º 3
0
 def __init__(self):
     """Set up the form extractor, request headers and HTTP helpers.

     ``self.form_extractor`` is a loaded ``FormExtractor`` when the
     ``FORMASAURUS`` flag is truthy and ``None`` otherwise.
     """
     # Use naive scoring algorithm if formasaurus is not installed
     if FORMASAURUS:
         self.form_extractor = FormExtractor.load()
     else:
         self.form_extractor = None
     # Set some friendly headers
     self.user_agent = ('Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
                        'Chrome/43.0.2357.130 Safari/537.36')
     # NOTE: despite its name, this attribute holds the value sent as the
     # ``Accept`` header (see ``self.headers`` below). The quality parameter
     # must be written ``q=0.9``; the previous ``q-0.9`` was malformed.
     self.accept_encoding = ('text/html,application/xhtml+xml,'
                             'application/xml;q=0.9,*/*;q=0.8')
     self.headers = {
         'User-Agent': self.user_agent,
         'Accept': self.accept_encoding,
         'Accept-Language': 'en',
     }
     # Cookie jar and html parser for making requests with urllib
     self.parser = HTMLParser(recover=True, encoding='utf-8')
     self.cookie_jar = CookieJar()
Ejemplo n.º 4
0
 def __init__(self):
     """Set up the form extractor, request headers and HTTP helpers.

     ``self.form_extractor`` is a loaded ``FormExtractor`` when the
     ``FORMASAURUS`` flag is truthy and ``None`` otherwise.
     """
     # Use naive scoring algorithm if formasaurus is not installed
     if FORMASAURUS:
         self.form_extractor = FormExtractor.load()
     else:
         self.form_extractor = None
     # Set some friendly headers
     self.user_agent = (
         'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 '
         '(KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 '
         'Chrome/43.0.2357.130 Safari/537.36'
     )
     # NOTE: despite its name, this attribute holds the value sent as the
     # ``Accept`` header (see ``self.headers`` below). The quality parameter
     # must be written ``q=0.9``; the previous ``q-0.9`` was malformed.
     self.accept_encoding = (
         'text/html,application/xhtml+xml,'
         'application/xml;q=0.9,*/*;q=0.8'
     )
     self.headers = {
         'User-Agent': self.user_agent,
         'Accept': self.accept_encoding,
         'Accept-Language': 'en',
     }
     # Cookie jar and html parser for making requests with urllib
     self.parser = HTMLParser(recover=True, encoding='utf-8')
     self.cookie_jar = CookieJar()
Ejemplo n.º 5
0
import json
import os
import subprocess
import urllib

from flask import Flask
from formasaurus import FormExtractor
import lxml

# Flask application object that the route decorators below register with.
app = Flask(__name__)
# Form extractor loaded once at import time and reused by the request
# handlers (via _process_parsed_tree) instead of being reloaded per request.
ex = FormExtractor.load()

# Base directory that /extract/local/<path> joins its path argument onto.
LEADER = "/tmp"

@app.route("/")
def hello():
    """Answer requests for the root URL with a fixed greeting."""
    greeting = "Hello!"
    return greeting

@app.route("/extract/local/<path:file_path>")
def extract_local_file_forms(file_path):
    """Parse a local HTML file under ``LEADER`` and process its forms.

    Args:
        file_path: Path taken from the request URL, interpreted relative
            to ``LEADER``.

    Returns:
        Whatever ``_process_parsed_tree`` returns for the parsed document.

    Raises:
        werkzeug.exceptions.NotFound: via ``abort(404)`` when the resolved
            path falls outside ``LEADER``.
    """
    from flask import abort
    # ``import lxml`` alone does not load the ``html`` submodule; import it
    # explicitly rather than relying on another package having done so.
    import lxml.html

    # ``file_path`` is caller-controlled: ".." segments, an absolute path or
    # a symlink could otherwise escape the base directory, so compare fully
    # resolved paths before touching the file system.
    base_dir = os.path.realpath(LEADER)
    full_path = os.path.realpath(os.path.join(base_dir, file_path))
    # join(base_dir, "") yields the base with exactly one trailing separator,
    # so "/tmpfoo" is not mistaken for something inside "/tmp".
    if not full_path.startswith(os.path.join(base_dir, "")):
        abort(404)

    tree = lxml.html.parse(full_path)
    return _process_parsed_tree(tree)

@app.route("/extract/url/<path:url>")
def extract_site_forms(url):
    """Fetch and parse a remote page, then process its forms.

    Args:
        url: Address taken from the request URL; must start with
            ``http://`` or ``https://``.

    Returns:
        Whatever ``_process_parsed_tree`` returns for the parsed document.

    Raises:
        werkzeug.exceptions.BadRequest: via ``abort(400)`` when ``url`` does
            not use an allowed scheme.
    """
    from flask import abort
    # ``import lxml`` alone does not load the ``html`` submodule; import it
    # explicitly rather than relying on another package having done so.
    import lxml.html

    # ``lxml.html.parse`` also accepts plain filenames and file:// URLs, so a
    # caller-controlled value must be limited to web schemes; otherwise this
    # endpoint could be used to read arbitrary local files.
    if not url.lower().startswith(("http://", "https://")):
        abort(400)

    tree = lxml.html.parse(url)
    return _process_parsed_tree(tree)

def _process_parsed_tree(tree):
    forms = ex.extract_forms(tree)