Beispiel #1
0
def extract_forms(url):
    """
    This method records every text/search input of every form found in a stored web page.

    The page is looked up with db_get_page, the value at index 4 of the returned row is parsed
    as HTML, and one record per matching input is written through db_insert_form.
    :param url: A string containing the URL of the web page to analyse.
    :return: None. Nothing is stored when db_get_page finds no page for the URL.
    """
    url = remove_scheme(url)
    page = db_get_page(url=url)
    if page is None:
        # No row for this URL: there is no HTML to parse, so indexing would fail.
        return
    webpage = BeautifulSoup(page[4], "lxml")
    for form_num, form in enumerate(webpage.find_all(name="form")):
        method = form.get("method")
        action = form.get("action")
        # Only inputs explicitly typed "search" or "text" are recorded.
        text_fields = [
            field for field in form.find_all(name="input")
            if field.get("type") in ("search", "text")
        ]
        # input_num counts the retained inputs only, not every <input> of the form.
        for input_num, field in enumerate(text_fields):
            db_insert_form(page_url=url,
                           form_num=form_num,
                           method=method,
                           action=action,
                           input_num=input_num,
                           input_name=field.get("name"),
                           input_text=field.get("placeholder"))
def db_insert_page_link(page_url, link_url, link_text, x_position, y_position,
                        in_list, in_nav):
    """
    This method stores a link found in a web page as a PageLink record.
    :param page_url: A string containing the URL of the page holding the link (stored without scheme).
    :param link_url: A string containing the URL the link points to (stored without scheme).
    :param link_text: The text of the link.
    :param x_position: Value stored in the x_position field of the record.
    :param y_position: Value stored in the y_position field of the record.
    :param in_list: Value stored in the in_list field of the record.
    :param in_nav: Value stored in the in_nav field of the record.
    :return: None
    :raises Exception: Whatever add/commit raised, re-raised after the rollback.
    """
    page_url = remove_scheme(page_url)
    link_url = remove_scheme(link_url)

    page_link = PageLink(page_url=page_url,
                         link_url=link_url,
                         link_text=link_text,
                         x_position=x_position,
                         y_position=y_position,
                         in_list=in_list,
                         in_nav=in_nav)
    session = db_session()
    try:
        session.add(page_link)
        session.commit()
    except Exception:
        # Leave the session clean before handing the error to the caller.
        session.rollback()
        raise
    finally:
        # Always release the session, even when the commit failed.
        session.close()
Beispiel #3
0
def db_delete_page(url):
    """
    This method deletes a page from the pages table of the database.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string containing the URL of the web page to delete.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM pages WHERE url LIKE :url"
    # The context manager releases the connection as soon as the statement has run.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
Beispiel #4
0
def db_add_clean_text_to_page(url, clean_text):
    """
    This method sets the clean_text column of a page in the pages table of the database.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string representing the URL of the web page to update.
    :param clean_text: A string representing the clear main text of the web page to insert.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET clean_text=:clean_text WHERE url LIKE :url"
    # The context manager releases the connection as soon as the statement has run.
    with engine.connect() as connection:
        connection.execute(sql, clean_text=clean_text, url=url)
def db_get_text_links(page_url):
    """
    This method returns all the links contained in the main text of a web page.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (position, link_text), one per link of the web page.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT position, link_text FROM text_links WHERE page_url LIKE :page_url"
    # All rows are fetched before the context manager releases the connection.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
Beispiel #6
0
def db_get_page(url):
    """
    This method returns the first row of the pages table matching a URL.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string containing the URL of the web page.
    :return: A tuple (url, topic, summary, language, simple_html, parsed_html, clear_text, last_visit) or None.
    """
    url = remove_scheme(url)
    sql = "SELECT * FROM pages WHERE url LIKE :url"
    # The row is fetched before the context manager releases the connection.
    with engine.connect() as connection:
        return connection.execute(sql, url=url).fetchone()
Beispiel #7
0
def read_links_article(url):
    """
    This method returns the links of a web page after filtering and de-duplicating them.

    Rows come from db_get_page_links; index 0 of a row is passed to extract_words and
    index 3 is compared with 0.
    :param url: A string containing the URL of the web page.
    :return: The filtered links, or the untouched empty result when the page has no links.
    """
    candidates = db_get_page_links(url=remove_scheme(url))
    if len(candidates) == 0:
        return candidates

    # Keep only links with 4 words or more in text.
    wordy = [row for row in candidates if len(extract_words(row[0])) > 3]
    # Keep only links not contained in lists.
    outside_lists = [row for row in wordy if row[3] == 0]
    # Remove duplicates.
    return remove_duplicate_links(outside_lists)
Beispiel #8
0
    def process_item(self, item, spider):
        """
        This method saves the link described by an item as a PageLink record.
        :param item: A mapping with the keys page_url, link_url, link_text, in_list and in_nav.
        :param spider: Not used by this method.
        :return: The item received, unchanged.
        """
        db = self.Session()

        try:
            record = PageLink()
            # Both URLs are stored without their scheme.
            for field in ("page_url", "link_url"):
                setattr(record, field, remove_scheme(item[field]))
            # The remaining fields are copied as they are.
            for field in ("link_text", "in_list", "in_nav"):
                setattr(record, field, item[field])
            db.add(record)
            db.commit()
        except Exception:
            # Undo the failed transaction, then let the error propagate.
            db.rollback()
            raise
        finally:
            db.close()

        return item
def db_get_text_link(page_url, link_num):
    """
    This method returns a link contained in the main text of a web page.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param page_url: A string containing the URL of the web page containing the link.
    :param link_num: A number representing the index of the link to get between all the other links of the text.
    :return: A tuple (link_url) containing the URL of the link requested or None.
    """
    page_url = remove_scheme(page_url)
    sql = "SELECT link_url FROM text_links WHERE page_url LIKE :page_url AND link_num = :link_num"
    # The row is fetched before the context manager releases the connection.
    with engine.connect() as connection:
        return connection.execute(sql,
                                  page_url=page_url,
                                  link_num=link_num).fetchone()
def db_insert_bookmark(url, name, user):
    """
    This method stores a bookmark as a Bookmark record.
    :param url: A string containing the URL to bookmark (stored without scheme).
    :param name: The name given to the bookmark.
    :param user: The user owning the bookmark.
    :return: None
    :raises BookmarkUrlTaken: When the first argument of the failure mentions "url".
    :raises BookmarkNameTaken: When the first argument of the failure mentions "name".
    :raises Exception: Any other failure of add/commit, re-raised unchanged.
    """
    url = remove_scheme(url)
    bookmark = Bookmark(url=url, name=name, user=user)
    session = db_session()
    try:
        session.add(bookmark)
        session.commit()
    except Exception as e:
        # Undo the failed transaction before the session is closed.
        session.rollback()
        # Only the first argument is inspected (not str(e)); empty args must not raise IndexError.
        message = str(e.args[0]) if e.args else ""
        if "url" in message:
            raise BookmarkUrlTaken from e
        if "name" in message:
            raise BookmarkNameTaken from e
        # An unrecognised failure must not look like a successful insert.
        raise
    finally:
        session.close()
Beispiel #11
0
def db_insert_action(action, url):
    """
    This method inserts an action performed by the user into the history table of the database.

    The record is stamped with the current local time formatted as "%Y-%m-%d %H:%M:%S".
    :param action: A string indicating the action performed by the user.
    :param url: The url of the web page related to the action performed by the user.
    :return: None
    :raises Exception: Whatever add/commit raised, re-raised after the rollback.
    """
    url = remove_scheme(url)
    session = db_session()
    try:
        history = History(user="******",
                          action=action,
                          url=url,
                          timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        session.add(history)
        session.commit()
    except Exception:
        # Leave the session clean before handing the error to the caller.
        session.rollback()
        raise
    finally:
        # Always release the session, even when the commit failed.
        session.close()
Beispiel #12
0
def db_insert_functionality_link(page_url, name, link_url, score):
    """
    This method stores a functionality of type "link" as a Functionality record.

    Any error raised while saving is discarded after a rollback.
    :param page_url: A string containing the URL of the web page (stored without scheme).
    :param name: The name of the functionality.
    :param link_url: The URL of the link, stored as received.
    :param score: The score of the functionality.
    :return: None
    """
    db = db_session()
    fields = {
        "page_url": remove_scheme(page_url),
        "type": "link",
        "name": name,
        "link_url": link_url,
        "score": score,
    }
    record = Functionality(**fields)
    try:
        db.add(record)
        db.commit()
    except Exception:
        # Exception if link already present.
        db.rollback()
    finally:
        db.close()
Beispiel #13
0
def db_insert_page(url, simple_html):
    """
    This method inserts a web page in the pages table of the database.

    The record gets the current local time as last_visit and "In progress." as parsed_html.
    An IntegrityError raised while saving is discarded after a rollback.
    :param url: A string containing the URL of the web page (stored without scheme).
    :param simple_html: The value stored in the simple_html field of the page.
    :return: None
    """
    stripped_url = remove_scheme(url)
    visited_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    record = Page(
        url=stripped_url,
        simple_html=simple_html,
        last_visit=visited_at,
        parsed_html="In progress.",
    )
    db = db_session()
    try:
        db.add(record)
        db.commit()
    except IntegrityError:
        db.rollback()
    finally:
        db.close()
def db_insert_text_link(page_url, link_num, link):
    """
    This method stores a link contained in the main text of a web page as a TextLink record.
    :param page_url: A string containing the URL of the web page containing the link.
    :param link_num: A number representing the index of the link to insert between all the other links of the text.
    :param link: A tuple (position, link_text, link_url) containing info about the link.
    :return: None.
    :raises Exception: Whatever add/commit raised, re-raised after the rollback.
    """
    page_url = remove_scheme(page_url)
    text_link = TextLink(page_url=page_url,
                         link_num=link_num,
                         position=link[0],
                         link_text=link[1],
                         link_url=link[2])
    session = db_session()
    try:
        session.add(text_link)
        session.commit()
    except Exception:
        # Leave the session clean before handing the error to the caller.
        session.rollback()
        raise
    finally:
        # Always release the session, even when the commit failed.
        session.close()
Beispiel #15
0
def db_insert_form(page_url, form_num, method, action, input_num, input_name,
                   input_text):
    """
    This method stores one input of a form of a web page as a Form record.

    Any error raised while saving is discarded after a rollback.
    :param page_url: A string containing the URL of the web page (stored without scheme).
    :param form_num: The index of the form.
    :param method: The value stored in the method field of the record.
    :param action: The value stored in the action field of the record.
    :param input_num: The index of the input.
    :param input_name: The value stored in the input_name field of the record.
    :param input_text: The value stored in the input_text field of the record.
    :return: None
    """
    fields = {
        "page_url": remove_scheme(page_url),
        "form_num": form_num,
        "method": method,
        "action": action,
        "input_num": input_num,
        "input_name": input_name,
        "input_text": input_text,
    }
    record = Form(**fields)
    db = db_session()
    try:
        db.add(record)
        db.commit()
    except Exception:
        db.rollback()
    finally:
        db.close()
def db_delete_all_page_links(url):
    """
    This method deletes from the page_links table every link of a web page.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string containing the URL of the web page whose links are deleted.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM page_links WHERE page_url LIKE :url;"
    # The context manager releases the connection as soon as the statement has run.
    with engine.connect() as connection:
        connection.execute(sql, url=url)
Beispiel #17
0
def db_add_parsed_html_to_page(url, parsed_html):
    """
    This method sets the parsed_html column of a page in the pages table of the database.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string containing the URL of the web page to update.
    :param parsed_html: The value to store in the parsed_html column.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET parsed_html=:parsed_html WHERE url LIKE :url"
    # The context manager releases the connection as soon as the statement has run.
    with engine.connect() as connection:
        connection.execute(sql, parsed_html=parsed_html, url=url)
Beispiel #18
0
def db_get_forms(page_url):
    """
    This method returns all the rows of the forms table belonging to a web page.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param page_url: A string containing the URL of the web page.
    :return: A list of rows (page_url, form_num, method, action, input_num, input_name, input_text).
    """
    page_url = remove_scheme(page_url)
    sql = """SELECT page_url, form_num, method, action, input_num, input_name, input_text
                FROM forms WHERE page_url LIKE :page_url"""
    # All rows are fetched before the context manager releases the connection.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()
def db_delete_bookmark(url, user):
    """
    This method deletes from the bookmarks table the bookmarks of a user for a URL.

    Both values are compared with SQL LIKE, so any % or _ they contain acts as a wildcard.
    :param url: A string containing the bookmarked URL.
    :param user: The user owning the bookmark.
    :return: None
    """
    url = remove_scheme(url)
    sql = "DELETE FROM bookmarks WHERE url LIKE :url AND user LIKE :user;"
    # The context manager releases the connection as soon as the statement has run.
    with engine.connect() as connection:
        connection.execute(sql, url=url, user=user)
Beispiel #20
0
def db_add_topic_to_page(url, topic):
    """
    This method sets the topic column of a page in the pages table of the database.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string containing the URL of the web page to update.
    :param topic: The value to store in the topic column.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET topic=:topic WHERE url LIKE :url"
    # The context manager releases the connection as soon as the statement has run.
    with engine.connect() as connection:
        connection.execute(sql, topic=topic, url=url)
Beispiel #21
0
def read_links(url):
    """
    This method returns the links of a web page without duplicates.

    Rows come from db_get_page_links and are passed through remove_duplicate_links.
    :param url: A string containing the URL of the web page.
    :return: Whatever remove_duplicate_links returns for the links of the page.
    """
    stored_links = db_get_page_links(url=remove_scheme(url))
    return remove_duplicate_links(stored_links)
Beispiel #22
0
def db_add_language_to_page(url, language):
    """
    This method sets the language column of a page in the pages table of the database.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string containing the URL of the web page to update.
    :param language: The value to store in the language column.
    :return: None
    """
    url = remove_scheme(url)
    sql = "UPDATE pages SET language=:language WHERE url LIKE :url"
    # The context manager releases the connection as soon as the statement has run.
    with engine.connect() as connection:
        connection.execute(sql, language=language, url=url)
def db_get_page_links(url):
    """
    This method returns all the rows of the page_links table belonging to a web page.

    The URL is compared with SQL LIKE, so any % or _ it contains acts as a wildcard.
    :param url: A string containing the URL of the web page.
    :return: A list of rows (link_text, link_url, y_position, in_list, in_nav).
    """
    page_url = remove_scheme(url)
    sql = "SELECT link_text, link_url, y_position, in_list, in_nav FROM page_links WHERE page_url LIKE :page_url"
    # All rows are fetched before the context manager releases the connection.
    with engine.connect() as connection:
        return connection.execute(sql, page_url=page_url).fetchall()