Example #1
    def run(self, query):

        self.browser.open("https://www.startpage.com/")

        form = self.browser.get_form(id="search_form")
        form['query'].value = "\"" + query + "\""
        form['abp'].value = "true"
        self.browser.submit_form(form)

        parser = self.browser.parsed

        # Extract the reported number of results from the search page
        count = parser.find('div', {'id': 'results_content'}).find('p', {'id': 'results_count_p'})
        sanitized_count = str(count)

        first = "About"
        last = "results ("

        self.results['result_count'] = self.extract_string(sanitized_count, first, last)

        # Scrape the first page of results
        page = parser.find('div', {'id': 'results'})
        section = []

        for element in page:
            if element.name == "ol":
                section = element.find_all('li')
                break

        for li in section:

            try:
                headline = li.h3.text
                link = li.a['href']
                description = li.find('p', {'class': 'desc'}).text
                result = [headline, link, description]
                self.results['top_results'].append(result)

            # Skip entries that lack a headline, link, or description
            except (AttributeError, KeyError):
                pass

        return self.results
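
The run method above calls a self.extract_string helper that is not shown. A minimal sketch of what such a helper could look like, assuming it returns the text found between two marker strings:

    # Assumed helper (not shown in the original example): return the text
    # between the first occurrence of `first` and the next occurrence of `last`
    def extract_string(self, text, first, last):
        try:
            start = text.index(first) + len(first)
            end = text.index(last, start)
            return text[start:end].strip()
        except ValueError:
            return None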
Example #2
    def run(self, ip):
        results = []

        url_param = ip.replace(".", "/")

        url = "https://www.robtex.com/en/advisory/ip/" + url_param + "/shared.html"

        self.browser.open(url)

        parser = self.browser.parsed
        search = parser.find("span", {"id": "shared_ma"})

        if search is not None:
            # count = self.extract_string(search.text, "(", " shown")
            # if int(count) <= 50:

            shared = search.parent.parent.find("ol", {"class": "xbul"})

            for result in shared.findChildren('li'):
                # Domain names are listed with their dots rendered as
                # spaces, so convert the spaces back to dots
                results.append(result.text.replace(' ', '.'))

            # else:
            #    results.append("%s domains identified" % str(count))

        return results
Example #3
    def run(self, indicator):

        results = []
        self.browser.open("http://www.threatexpert.com/reports.aspx")

        form = self.browser.get_form(action="reports.aspx")
        form['find'].value = "\"" + indicator + "\""
        self.browser.submit_form(form)

        parser = self.browser.parsed

        # Return number of results from search [0] + number of pages of results [1]
        section = parser.find('span', {'id': 'txtResults'}).find_all('table')

        if section:

            if len(section) > 1:
                page_count = len(section[1].find_all('td')) - 1  # Acquire page count
            else:
                page_count = 1

            # Scrape the current page
            data = section[0].find_all('tr')
            page = self.scrape_page(data, indicator)
            results.extend(page)

            # Gather records from subsequent pages
            for x in range(2, page_count + 1):
                url = "http://www.threatexpert.com/reports.aspx?page=%s&find=%s" % (
                    x, indicator)
                self.browser.open(url)
                parser = self.browser.parsed
                section = parser.find('span', {'id': 'txtResults'}).find('table')

                if section:
                    data = section.find_all('tr')
                    page = self.scrape_page(data, indicator)
                    results.extend(page)

        return results
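
The pagination loop above delegates row parsing to a self.scrape_page helper that is not shown. A hypothetical sketch, assuming the helper pulls the report link and cell text out of each table row:

    # Hypothetical helper (not part of the original example); `indicator`
    # is kept only to match the call signature used above
    def scrape_page(self, data, indicator):
        page_results = []
        for tr in data:
            link = tr.find('a', href=True)
            cells = [td.text.strip() for td in tr.find_all('td')]
            if link is not None and cells:
                page_results.append({'report': link['href'], 'details': cells})
        return page_results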
Example #4
    def run(self, ip):
        """
        Created by: LNguyen
        Date: 26January2017
        Updated scraping logic because of existing bug that was dependent finding an ID = shared_ma that no longer existed in the Robtex web pages.
        The new logic finds a list of shared domains located in the tag <ol class:xbul.>

        :param ip: The ip address to scrape the Robtex web page for
        :return: A list of domains found for the given ip address
        """
        results = []

        url_param = ip.replace(".", "/")

        url = "https://www.robtex.com/en/advisory/ip/" + url_param + "/shared.html"
        # print("url:",url)
        self.browser.open(url)

        parser = self.browser.parsed

        search = parser.find("ol", {"class": "xbul"})
        # print("search: ", search)

        total = 0
        if search is not None:
            for result in search.find_all("li"):
                total += 1

                # Cap the scrape at 100 domains
                if total > 100:
                    break

                # Domain names are listed with their dots rendered as
                # spaces, so convert the spaces back to dots
                results.append(result.text.replace(' ', '.'))

        return results
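
For comparison, the same Robtex lookup can be written without the browser wrapper, using the requests and BeautifulSoup stack that the later examples use. A standalone sketch (the URL pattern and the "xbul" list class are taken from the examples above):

import requests
from bs4 import BeautifulSoup

def shared_domains(ip, limit=100):
    # Build the per-IP advisory URL, e.g. 1.2.3.4 -> .../ip/1/2/3/4/shared.html
    url = "https://www.robtex.com/en/advisory/ip/%s/shared.html" % ip.replace(".", "/")
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    listing = soup.find("ol", {"class": "xbul"})
    if listing is None:
        return []
    # Restore the dots that the page renders as spaces
    return [li.text.replace(' ', '.') for li in listing.find_all("li")[:limit]]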
Example #5
    def scrape_data(self, indicator, query_type):

        passive_table = []

        # search period 7 is "complete history"
        search_period = '7'

        # 0 = Current Day
        # 1 = Past 72 Hours
        # 2 = Past Week
        # 3 = Past Month
        # 4 = Past 3 Months
        # 5 = Past 6 Months
        # 6 = Past Year

        output_format = '0'
        # 0 = Display results on screen
        # 1 = Output to CSV file (Comma separated w/o quotes)
        # 2 = Output to CSV file (Comma separated with quotes)
        # 3 = Output to CSV file (Tab separated w/o quotes)
        # 4 = Output to CSV file (Tab separated with quotes)
        # 5 = Output to CSV file (Pipe separated w/o quotes)
        # 6 = Output to CSV file (Pipe separated with quotes)

        # queryType
        # A = Query IP Address or CIDR,
        # H = Query Hostname
        # X = Query Domain Name for Hosts
        # D = Query Domain for Authoritative Nameservers
        # N = Query Nameserver for Authoritative Domains

        url = "https://research.iad.internetidentity.com/index.php?period=" + search_period + "&format=" + format + "&queryType=" + query_type + "&target=" + indicator + "&submit=Submit+Query"

        self.browser.open(url, timeout=20000)
        parser = self.browser.parsed

        search = parser.find("table", {
            "style":
            "text-align: left; margin-left: auto; margin-right: auto;"
        })

        # Iterate the <tbody> children; bare "\n" text nodes are skipped below
        for tr in search.find('tbody'):

            tds = []
            if tr != "\n":
                for td in tr.find_all('td'):
                    tds.append(td.text.strip())

                # check that table data exists
                if len(tds) == 4:
                    IID_seen = tds[0]
                    IID_host = tds[1]
                    IID_qType = tds[2]
                    IID_ip = tds[3]

                    passive_table.append({
                        'ip': IID_ip,
                        'domain': IID_host,
                        'date': IID_seen,
                        'firstseen': IID_seen,
                        'lastseen': {},
                        'ip_location': {}
                    })

        self.results.extend(passive_table)
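
The query string above is assembled by hand; a sketch of the same URL built with urllib.parse.urlencode instead, using the parameter names from the example:

from urllib.parse import urlencode

def build_query_url(search_period, output_format, query_type, indicator):
    params = {
        'period': search_period,
        'format': output_format,
        'queryType': query_type,
        'target': indicator,
        'submit': 'Submit Query',  # urlencode emits the space as '+'
    }
    return "https://research.iad.internetidentity.com/index.php?" + urlencode(params)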
Example #6
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup as bs
import sqlite3
import dateutil.parser  # dateutil.parser turns date strings into datetime objects

# HTML crawling part
url = "http://www.espnfc.com/player/149945/son-heung-min"
response = requests.get(url)
source = response.content

# HTML parsing part
parser = bs(source, "html.parser")
navigator = parser.find("div", {
    "id": "player-appearances-2018"
}).find_all("tr")[
    1]  # The latest game content // div(id:??) => tr's second one => td

# Data refining part
dic = {
    "Date": "",
    "League": "",
    "Vs": "",
    "Result": "",
    "Goal": "",
    "Assist": "",
    "Sh": "",
    "ShT": "",
    "Yel": "",
    "Red": "",
    "Appearance": "",
}
Example #7
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup as bs
import dateutil.parser
import re
import sqlite3
import dateutil.relativedelta

url = "https://www.skysports.com/tottenham-hotspur-results"
response = requests.get(url)
source = response.content
parser = bs(source, "html.parser")
objlink = parser.find("div", {
    "class": "fixres__item"
}).find("a", href=True)["href"]

# Update the rating data in epl.db
response2 = requests.get(objlink)
source2 = response2.content
parser2 = bs(source2, "html.parser")

UpdatedDate = parser2.find("div", {
    "class": "article__header-details"
}).find("p", {
    "class": "article__header-date-time"
}).text[14:]
UpdatedDate = UpdatedDate[3:6] + UpdatedDate[:2] + UpdatedDate[5:]
UpdatedDate = str(dateutil.parser.parse(UpdatedDate))[2:][:8].replace("-", ".")

conn = sqlite3.connect("/home/ubuntu/epl/epl.db")
cur = conn.cursor()
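
The example stops after opening the database. A hypothetical continuation (the table and column names are illustrative, not from the source) might persist the parsed date like this:

# Hypothetical continuation -- "ratings" and its columns are made-up names
cur.execute("UPDATE ratings SET updated_date = ? WHERE team = ?",
            (UpdatedDate, "tottenham-hotspur"))
conn.commit()
conn.close()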