Example #1
    def redistribute(self):
        db_path = self.db_folder + '/js_corpus_functionized_' + self.job_name + '.db'
        if not os.path.exists(db_path):
            print('Error: \'' + db_path +
                  '\' does not exist! Check and redo the last step.')
            return False

        db_op = DBOperation(db_path, "corpus")
        callables = db_op.query_all()
        target_path = self.data_folder + '/js_corpus_redistributed_' + self.job_name
        if self.remove_if_files_exist:
            if os.path.exists(target_path):
                shutil.rmtree(target_path)  # os.remove fails on a directory; needs `import shutil`
            os.mkdir(target_path)
        elif not os.path.exists(target_path):
            os.mkdir(target_path)

        counter = 0
        for row in callables:  # 'row' avoids shadowing the built-in callable()
            counter += 1
            progress = "\rProcessing: %d" % counter
            sys.stdout.write(progress)
            content = row[0].decode('utf-8')
            # drop the original function name so every sample becomes anonymous;
            # the original discarded re.sub's return value, which made it a no-op
            if re.search(r'function[\s\S]*?\(', content):
                content = re.sub(r'function[\s\S]*?\(', 'function(', content, 1)
            content = 'var a = ' + content
            file_name = str(uuid.uuid4()) + '.js'
            try:
                self.create_file(target_path + '/' + file_name, content)
            except Exception:
                pass
        print('\rExecute Redistribution Finished on ' + str(counter) +
              ' Files')
        return True
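
Every snippet on this page leans on a project-local db_operation module that is not reproduced here. As a reading aid, here is a minimal sketch of the 'corpus' interface the calls above imply (DBOperation(db_path, table), init_db, insert, query_all, query_count, finalize); the SQLite backing and the single 'content' column are assumptions, not the project's actual code.

import sqlite3

class DBOperation:
    """Hypothetical reconstruction of the wrapper used in these examples."""

    def __init__(self, db_path, table_name):
        self.conn = sqlite3.connect(db_path)
        self.table = table_name

    def init_db(self):
        # a single content column, as the row[0] indexing in the snippets suggests
        self.conn.execute('CREATE TABLE IF NOT EXISTS %s (content BLOB)' % self.table)
        self.conn.commit()

    def insert(self, content):
        self.conn.execute('INSERT INTO %s (content) VALUES (?)' % self.table, (content,))
        self.conn.commit()

    def query_all(self):
        # a list of 1-tuples, hence callables[i][0] in the snippets
        return self.conn.execute('SELECT content FROM %s' % self.table).fetchall()

    def query_count(self):
        # a 1-tuple, hence query_count()[0] in the snippets
        return self.conn.execute('SELECT COUNT(*) FROM %s' % self.table).fetchone()

    def finalize(self):
        self.conn.close()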
Example #2
 def __init__(self, db_path, save_path, target_count=100, temper_round=100):
     self.db_op = DBOperation(db_path, 'corpus')
     self.callables = self.db_op.query_all()
     self.callable_processor = CallableProcessor(self.callables)
     self.save_path = save_path
     self.target = target_count
     self.temper_round = temper_round
Example #3
    def functionize(self):
        source_db_path = self.db_folder + '/js_corpus_initial_filtrated_' + self.job_name + '.db'
        if not os.path.exists(source_db_path):
            print('Error: \'' + source_db_path +
                  '\' does not exist! Check and redo the last step.')
            return False

        target_db_path = self.db_folder + '/js_corpus_functionized_' + self.job_name + '.db'
        source_db_op = DBOperation(source_db_path, 'corpus')
        target_db_op = DBOperation(target_db_path, 'corpus')
        if self.remove_if_files_exist:
            if os.path.exists(target_db_path):
                os.remove(target_db_path)
            target_db_op.init_db()
        elif not os.path.exists(target_db_path):
            target_db_op.init_db()

        raws = source_db_op.query_all()

        counter = 0
        for raw in raws:
            counter += 1
            progress = "\rProcessing Raw No.%d" % counter
            sys.stdout.write(progress)
            raw = raw[0]
            if 'function' in raw:
                self.extract_function(raw, target_db_op)
        target_size = target_db_op.query_count()[0]
        print('\rExecute Functionizing Finished. Extracted ' +
              str(target_size) + ' Functions From ' + str(counter) + ' Raws.')
        return True
Example #4
    def final_filtration(self):
        source_path = self.data_folder + '/js_corpus_redistributed_' + self.job_name
        if not os.path.exists(source_path):
            print('Error: \'' + source_path +
                  '\' does not exist! Check and redo the last step.')
            return False

        db_path = self.db_folder + '/js_corpus_final_filtrated_' + self.job_name + '.db'
        db_op = DBOperation(db_path, "corpus")
        if self.remove_if_files_exist:
            if os.path.exists(db_path):
                os.remove(db_path)
            db_op.init_db()
        elif not os.path.exists(db_path):
            db_op.init_db()

        counter = 0
        if os.path.isdir(source_path):
            for root, dirs, files in os.walk(source_path):
                for file in files:
                    counter += 1
                    progress = "\rProcessing: %d --> %s" % (counter, file)
                    sys.stdout.write(progress)
                    if self.syntax_check(source_path + '/' + file):
                        with open(source_path + '/' + file, 'r') as f:
                            file_content = f.read().replace('var a = ', '', 1)
                            db_op.insert(file_content.encode('utf-8'))
            print('\rExecute Final Filtration Finished on ' + str(counter) +
                  ' Files')
            return True
        else:
            print('\'' + source_path + '\' Is Not A Directory.')
            return False
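
The syntax_check helper used above is not shown on this page. A plausible stand-in (an assumption, not the project's code) shells out to Node's built-in parser, which exits non-zero on a syntax error:

import subprocess

def syntax_check(file_path):
    # hypothetical reconstruction: `node --check` parses the file
    # without executing it; requires node on PATH
    result = subprocess.run(['node', '--check', file_path],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
    return result.returncode == 0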
Example #5
def redistribute():
    source_path = "../../BrowserFuzzingData/db/js_corpus_top_1000.db"
    source_op = DBOperation(source_path, "corpus")
    raw_callables = source_op.query_all()
    target_path = "../../BrowserFuzzingData/redistributed_top_1000"
    if not os.path.exists(target_path):
        os.mkdir(target_path)

    file_count = len(raw_callables)
    counter = 0
    for raw_callable in raw_callables:
        counter += 1
        progress = "\rProcessing: %d" % counter
        sys.stdout.write(progress)
        content = raw_callable[0].decode('utf-8')
        generate_js_fuzzer(raw_callables, content)
    print('\nExecute Redistributing Finished on ' + str(file_count) + ' ' + FLAGS.file_type + ' Files')
Example #6
class WordFreq:
    def __init__(self, methods, source_path, target_path):
        self.methods = methods
        self.columns = self.get_columns()
        self.db_op_source = DBOperation(source_path, 'corpus')
        self.db_op_target = DBOperation(target_path, "result")
        self.db_op_target.init_db()

    def frequence(self):
        # split the corpus and build one frequency row per entry
        file_name = 0  # starting value of the file-name counter
        db_file_number = self.db_op_source.query_count()[0]  # number of entries in the table (592304)
        callables = self.db_op_source.query_all()  # every content row
        count = 0  # counter, capped at the number of entries
        length = len(self.methods)  # number of method names to count

        while count < db_file_number:
            file = callables[count][0].decode('utf-8')
            i = 0
            progress = "\rProcessing: %d" % file_name
            sys.stdout.write(progress)
            frequencies = []
            while i < length:
                frequencies.append(file.count(self.methods[i]))
                i = i + 1
            self.db_op_target.insert_frequencies(self.columns, frequencies)  # insert the row
            count = count + 1
            file_name = file_name + 1
        self.db_op_source.finalize()
        self.db_op_target.finalize()

    def get_columns(self):
        columns = []
        for i in range(0, len(self.methods)):
            columns.append("'" + self.methods[i] + "'")
        return columns
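
A hypothetical invocation of WordFreq; the method list and both paths are illustrative only, each path must point at a DBOperation-compatible SQLite file, and the target wrapper is assumed to provide insert_frequencies:

# counts how often each method name occurs in every corpus entry
wf = WordFreq(['push', 'pop', 'slice'], 'js_corpus.db', 'word_freq_result.db')
wf.frequence()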
Example #7
    def write_files_to_db(self):
        """
        将语料写入数据库
        """

        # assemble the corpus path
        corpus_path = self.raw_folder
        # if the folder does not exist, report the error and stop early
        if not os.path.exists(corpus_path):
            print('Error: \'' + corpus_path +
                  '\' does not exist! Check and redo the last step.')
            return False

        # assemble the database file path
        db_path = self.db_folder + '/js_corpus_initial_filtrated_' + self.job_name + '.db'
        db_op = DBOperation(db_path, 'corpus')
        if self.remove_if_files_exist:
            if os.path.exists(db_path):
                os.remove(db_path)
            db_op.init_db()
        elif not os.path.exists(db_path):
            db_op.init_db()

        counter = 0
        if os.path.isdir(corpus_path):
            for root, dirs, files in os.walk(corpus_path):
                # this preprocessing pass runs over the js files
                for file in files:
                    try:
                        counter += 1
                        progress = "\rProcessing: %d --> %s" % (counter, file)
                        sys.stdout.write(progress)
                        with open(corpus_path + '/' + file, 'rb') as f:
                            db_op.insert(f.read().decode())
                    except Exception:
                        pass
            print('\rExecute Writing Content to DB Finished on ' +
                  str(counter) + ' Files.')
            return True
        else:
            print('\'' + corpus_path + '\' Is Not A Directory.')
            return False
Example #8
def write_corpus_to_db(db_name):
    """
    将语料写入数据库
    """
    # assemble the database file path
    db_path = FLAGS.db_folder + FLAGS.file_type + '_corpus_' + db_name + '.db'
    db_op = DBOperation(db_path, 'corpus')
    if not os.path.exists(db_path):
        db_op.init_db()

    # assemble the corpus path
    corpus_path = FLAGS.corpus_folder
    # if the folder does not exist, create it
    if not os.path.exists(corpus_path):
        os.mkdir(corpus_path)

    print('----------------------- Executing Write Corpus to DB -----------------------')

    file_count = 0
    counter = 0
    if os.path.isdir(corpus_path):
        for root, dirs, files in os.walk(corpus_path):
            # count the source files in the corpus (accumulate across sub-directories)
            file_count += len(files)
            # this preprocessing pass runs over the js files
            for file in files:
                try:
                    counter += 1
                    progress = "\rProcessing: %d -> %s\n" % (counter, file)
                    sys.stdout.write(progress)
                    with open(corpus_path + '/' + file, 'rb') as f:
                        db_op.insert(f.read().decode())
                except Exception:
                    pass

        print('Execute Write Corpus to DB on ' + str(file_count) + ' ' + FLAGS.file_type + ' Files.')
    else:
        print('\'' + corpus_path + '\' is not a directory.')
Example #9
import xlrd
from db_operation import DBOperation

excel = xlrd.open_workbook("F://毕业论文资料//PT_LCRDSpending.xlsx")
table = excel.sheet_by_index(0)
nrows = table.nrows
result = []
word_list = ["大数据", "人工智能", "ai", "云", "区块链", "移动商务", "sas", "saas"]
db_operation = DBOperation()
for index in range(3, nrows):
    company_code = table.cell(index, 0).value
    year = int(table.cell(index, 1).value[0:4])
    detail = table.cell(index, 12).value
    if detail:
        for i in word_list:
            if i in detail:
                result.append((company_code, year))
                break

db_operation.insert_year_info(result)
Example #10
class Selector:
    def __init__(self, db_path, save_path, target_count=100, temper_round=100):
        self.db_op = DBOperation(db_path, 'corpus')
        self.callables = self.db_op.query_all()
        self.callable_processor = CallableProcessor(self.callables)
        self.save_path = save_path
        self.target = target_count
        self.temper_round = temper_round

    def execute(self):
        index_of_callables = 0
        i = 0
        while i < self.target:
            try:
                function_body = self.callables[index_of_callables][0].decode('utf-8')
                test_case = ''
                self_calling = self.callable_processor.get_self_calling(
                    function_body)
                self.create_and_fill_file('./ISTANBUL_TEST_CASE.js',
                                          self_calling)
                st, br, fu, li = self.istanbul_cover('ISTANBUL_TEST_CASE.js')

                for j in range(0, self.temper_round):
                    self_calling = self.callable_processor.get_self_calling(
                        function_body)
                    self.create_and_fill_file('./ISTANBUL_TEST_CASE.js',
                                              self_calling)
                    st_tmp, br_tmp, fu_tmp, li_tmp = self.istanbul_cover(
                        'ISTANBUL_TEST_CASE.js')
                    if st_tmp - st + br_tmp - br + fu_tmp - fu + li_tmp - li > 0:
                        test_case.join('')  # no-op; the original comment says it keeps test_case in scope
                        test_case = self_calling
                        st, br, fu, li = self.set_coverage_values(
                            st_tmp, br_tmp, fu_tmp, li_tmp)
                    os.remove('./ISTANBUL_TEST_CASE.js')
                if len(test_case) > 0:
                    self.create_and_fill_file(
                        self.save_path + '/' + str(i) + '.js', test_case)
                    i += 1
                index_of_callables += 1
            except Exception:
                pass

    def create_and_fill_file(self, file_path, content):
        with open(file_path, 'a') as file:
            file.write(content)

    def set_coverage_values(self, st_tmp, br_tmp, fu_tmp, li_tmp):
        return st_tmp, br_tmp, fu_tmp, li_tmp

    def istanbul_cover(self, file_name):
        st, br, fu, li = self.set_coverage_values(0, 0, 0, 0)

        cmd = ['istanbul', 'cover', file_name]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        stdout = p.communicate()[0].decode('utf-8')  # wait for istanbul and collect its output
        coverage_of_single_sample = re.findall(r': [\s\S]*?%', stdout)
        if len(coverage_of_single_sample) == 4:
            st = re.sub('%', '', re.sub(': ', '',
                                        coverage_of_single_sample[0]))
            br = re.sub('%', '', re.sub(': ', '',
                                        coverage_of_single_sample[1]))
            fu = re.sub('%', '', re.sub(': ', '',
                                        coverage_of_single_sample[2]))
            li = re.sub('%', '', re.sub(': ', '',
                                        coverage_of_single_sample[3]))
        return float(st), float(br), float(fu), float(li)
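
A hedged usage sketch for Selector: the paths are illustrative, the corpus database and the project's CallableProcessor must be available, and the istanbul CLI (npm install -g istanbul) has to be on PATH:

# keeps, for each callable, the self-calling variant with the best coverage
selector = Selector('js_corpus_functionized_demo.db', './selected_cases',
                    target_count=10, temper_round=20)
selector.execute()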
Example #11
import time
from selenium import webdriver
from db_operation import DBOperation
from selenium.webdriver.chrome.options import Options
import random
from datetime import datetime

from selenium_text_crawler import SeleniumCompanyCrawler

chrome_options = Options()
db_operation = DBOperation()
se = SeleniumCompanyCrawler()
chrome_options.add_argument('lang=zh_CN.UTF-8')
chrome_options.add_argument("--headless")
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
]
chrome_options.add_argument("user_agent=" + random.choice(USER_AGENTS))
driver = webdriver.Chrome(options=chrome_options)
# crawl the paginated stock listing
for i in range(35, 50):
    driver.get("http://s.askci.com/stock/a/?reportTime=2018-09-30&pageNum=" +
               str(i) + "#QueryCondition")
    time.sleep(4)
    rtable = driver.find_element_by_xpath('//*[@id="myTable04"]/tbody')
    trlist = rtable.find_elements_by_tag_name("tr")
    result = []
    for row in trlist:
        tdList = row.find_elements_by_tag_name("td")
        company_code = tdList[1].text
Example #12
class GoogleSearchResultTextCrawler(object):
    db_operation = DBOperation()
    logger = logging.getLogger()  # no name configures the root logger
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh = logging.FileHandler('F://log.txt')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)

    def isElementExistByXpath(self, driver, element):
        try:
            driver.find_element_by_xpath(element)
            return True
        except Exception:
            return False

    def isElementExistByClassName(self, driver, element):
        try:
            driver.find_element_by_class_name(element)
            return True
        except Exception:
            return False

    # analyse the search results for a single company
    def operate_browser(self, company_web_site, company_id):
        random_ip = random.choice(crawler_constant.PROXIES)
        proxies = {'http': random_ip}
        random_ua = random.choice(crawler_constant.USER_AGENTS)
        headers = {'User-Agent': random_ua, 'Connection': 'close'}
        search_title_link = crawler_constant.SEARCH_TEXT_URL.format(
            company_web_site)
        statistics_result = []
        start = 0
        try:
            response = requests.get(search_title_link + str(start),
                                    headers=headers,
                                    timeout=10,
                                    proxies=proxies).text
            # response = requests.get(
            #     search_title_link + str(start), headers=headers, timeout=10).text
            html_result = self.statistics_html(response, company_id,
                                               company_web_site)
            if not html_result:
                statistics_result.append([(company_id, 0, 0, 0, 0, 0)])
            else:
                statistics_result.append(html_result)
                while True:
                    start += crawler_constant.PAGE_SIZE
                    if start / crawler_constant.PAGE_SIZE > 5:
                        sleep_time = 10 + random.random()
                    else:
                        sleep_time = random.randint(0, 2) + random.random()
                    time.sleep(sleep_time)
                    response_str = requests.get(search_title_link + str(start),
                                                headers=headers,
                                                timeout=10,
                                                proxies=proxies).text
                    # response_str = requests.get(
                    #     search_title_link + str(start), headers=headers, timeout=10).text
                    html_result_str = self.statistics_html(
                        response_str, company_id, company_web_site)
                    if not html_result_str:
                        break
                    else:
                        statistics_result.append(html_result_str)
            insert_records = []
            for i in statistics_result:
                for j in i:
                    insert_records.append(j)
            logging.info("start insert company id: " + str(company_id) +
                         "size: " + str(len(insert_records)))
            self.db_operation.batch_insert_records(insert_records)
        except Exception as e:
            logging.info(e)
            logging.info("company_id: " + str(company_id) +
                         " insert records:has exception")

    # fetch results in batches
    def get_statistics_results(self):
        start = 0
        page_size = 100
        return self.db_operation.get_company_info(start, page_size)
        # while True:
        #     result = self.db_operation.get_company_info(start, page_size)
        #     if not result:
        #         break
        #     else:
        #         for r in result:
        #             parsed_uri = urlparse(r.company_web_site)
        #             domain = '{uri.netloc}'.format(uri=parsed_uri)
        #             self.operate_browser(domain, r.company_id)
        #         start += page_size

    def statistics_html(self, html, company_id, company_web_site):
        statistics_result = []
        soup = BeautifulSoup(html, "lxml")
        search_result = soup.find_all(class_="g")
        if not search_result:
            return None
        for result in search_result:
            article_link = str(result.find('a')['href'])
            st_node = result.find(class_="st")
            try:
                position_count = article_link.split(company_web_site)[1].count(
                    "/")
                if st_node is not None:
                    for st_str in st_node.strings:
                        try:
                            article_year_time_text = st_str
                            logging.info(article_year_time_text)
                            article_year = int(article_year_time_text[0:4])
                        except Exception as e:
                            article_year = -1
                        break
                    em_list = st_node.find_all('em')
                    b_list = st_node.find_all('b')
                    data_list = []
                    if em_list is not None or b_list is not None:
                        if b_list is not None:
                            for b in b_list:
                                data_list.append(b.get_text())
                        if em_list is not None:
                            for em in em_list:
                                data_list.append(em.get_text())
                        big_data_word_count = data_list.count('大数据')
                        ai_word_count = data_list.count('人工智能')
                        cloud_compute_word_count = data_list.count('云计算')
                else:
                    big_data_word_count = 0
                    ai_word_count = 0
                    cloud_compute_word_count = 0
                if big_data_word_count != 0:
                    statistics_result.append(
                        (company_id, article_year, position_count, '大数据',
                         big_data_word_count, 0))
                if ai_word_count != 0:
                    statistics_result.append(
                        (company_id, article_year, position_count, '人工智能',
                         ai_word_count, 0))

                if cloud_compute_word_count != 0:
                    statistics_result.append(
                        (company_id, article_year, position_count, '云计算',
                         cloud_compute_word_count, 0))
            except Exception as e:
                logging.info(e)
                logging.info("use beautifulsoup get html info has exception")
        return statistics_result
Example #13
import xlrd
from db_operation import DBOperation

excel = xlrd.open_workbook("F://毕业论文资料//固定资产-IT.xlsx")
table = excel.sheet_by_index(0)
nrows = table.nrows

result = {}
db_operation = DBOperation()
for index in range(3, nrows):
    company_code = table.cell(index, 0).value
    year = int(table.cell(index, 1).value[0:4])
    increase = table.cell(index, 6).value
    if increase:
        increase = round(float(table.cell(index, 6).value), 2)
    else:
        increase = float(0)
    if (company_code, year) in result.keys():
        result[(company_code, year)] += increase
    else:
        result[(company_code, year)] = increase

insert_records = []
for key, value in result.items():
    company_code = key[0]
    year = key[1]
    increase = value
    insert_records.append((company_code, year, increase))

db_operation.batch_insert_investment(insert_records)
Example #14
class SeleniumCompanyCrawler(object):
    db_operation = DBOperation()
    logger = logging.getLogger()  # no name configures the root logger
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh = logging.FileHandler('F://log.txt')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)

    def isElementExistByXpath(self, driver, element):
        try:
            driver.find_element_by_xpath(element)
            return True
        except Exception:
            return False

    def isElementExistByClassName(self, driver, element):
        try:
            driver.find_element_by_class_name(element)
            return True
        except Exception:
            return False

    # analyse the search results for a single company
    def operate_browser(self, company_web_site, company_id):
        chrome_options = Options()
        chrome_options.add_argument('lang=zh_CN.UTF-8')
        chrome_options.add_experimental_option("debuggerAddress",
                                               "127.0.0.1:9222")
        # chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; zh-CN)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; zh-CN) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]
        chrome_options.add_argument("user_agent=" + random.choice(USER_AGENTS))
        driver = webdriver.Chrome(options=chrome_options)
        # open the Google home page
        driver.get("http://www.google.com")
        # wait 2 seconds
        time.sleep(2)
        # grab the search box by its name attribute; BeautifulSoup parses the html nodes later
        search_box = driver.find_element_by_name('q')
        # build the site-restricted query
        search_box.send_keys('site:' + company_web_site +
                             ' intext:"大数据"|intext:"人工智能"|intext:"云计算" ')
        search_box.submit()  # have Chrome press the submit button
        time.sleep(5)  # wait 5 seconds
        statistics_result = []
        while True:
            soup = BeautifulSoup(driver.page_source, "lxml")
            search_result = soup.find_all(class_="g")
            if not search_result:
                statistics_result.append((company_id, 0, 0, 0, 0, 0))
                break
            for result in search_result:
                article_link = str(result.find('a')['href'])
                st_node = result.find(class_="st")
                try:
                    em_result = st_node.find('em').contents
                    big_data_word_count = em_result.count('大数据')
                    ai_word_count = em_result.count('人工智能')
                    cloud_compute_word_count = em_result.count('云计算')
                    position_count = article_link.split(
                        company_web_site)[1].count("/")
                    # if a timestamp is present, include it in the statistics
                    if st_node.find(class_="f") is not None:
                        article_time_text = st_node.find(class_="f")
                        article_year = int(article_time_text.text[0:4])
                        if big_data_word_count != 0:
                            statistics_result.append(
                                (company_id, article_year, position_count,
                                 '大数据', big_data_word_count, 0))
                        if ai_word_count != 0:
                            statistics_result.append(
                                (company_id, article_year, position_count,
                                 '人工智能', ai_word_count, 0))

                        if cloud_compute_word_count != 0:
                            statistics_result.append(
                                (company_id, article_year, position_count,
                                 '云计算', cloud_compute_word_count, 0))
                    # no timestamp: record it separately
                    else:
                        if big_data_word_count != 0:
                            statistics_result.append(
                                (company_id, -1, position_count, '大数据',
                                 big_data_word_count, 1))
                        if ai_word_count != 0:
                            statistics_result.append(
                                (company_id, -1, position_count, '人工智能',
                                 ai_word_count, 0))

                        if cloud_compute_word_count != 0:
                            statistics_result.append(
                                (company_id, -1, position_count, '云计算',
                                 cloud_compute_word_count, 0))

                except Exception as e:
                    logging.info("crawler website: " + company_web_site +
                                 " id: " + str(company_id) + "has exception")
                    print(e)
                    continue
            # next-page button
            if self.isElementExistByXpath(driver, '//*[@id="pnnext"]/span[2]'):
                next_page = driver.find_element_by_xpath(
                    '//*[@id="pnnext"]/span[2]')
                next_page.click()
                time.sleep(5)
            else:
                time.sleep(60)
                break
        # driver.close()
        try:
            self.db_operation.batch_insert_records(statistics_result)
        except Exception as e:
            logging.info(statistics_result)
            logging.info("insert records:has exception")

    # fetch results in batches
    def get_statistics_results(self):
        start = 0
        page_size = 20
        while True:
            result = self.db_operation.get_company_info(start, page_size)
            if not result:
                break
            else:
                for r in result:
                    parsed_uri = urlparse(r.company_web_site)
                    domain = '{uri.netloc}'.format(uri=parsed_uri)
                    self.operate_browser(domain, r.company_id)
                start += page_size
Example #15
 def __init__(self, methods, source_path, target_path):
     self.methods = methods
     self.columns = self.get_columns()
     self.db_op_source = DBOperation(source_path, 'corpus')
     self.db_op_target = DBOperation(target_path, "result")
     self.db_op_target.init_db()
Example #16
def readdb():
    db_path = '../../BrowserFuzzingData/js_corpus_final_top_1000.db'  # database file
    op = DBOperation(db_path, 'corpus')
    result = op.query_all()  # result is the full data set, one row per corpus entry
    # print(result)
    return result
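
A hypothetical use of readdb; as in the other snippets, each row is assumed to be a 1-tuple whose first element is the stored corpus entry:

for row in readdb():
    print(row[0])  # the raw corpus entry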