def redistribute(self):
    db_path = self.db_folder + '/js_corpus_functionized_' + self.job_name + '.db'
    if not os.path.exists(db_path):
        print('Error: \'' + db_path + '\' does not exist! Check and redo the last step.')
        return False
    db_op = DBOperation(db_path, "corpus")
    callables = db_op.query_all()
    target_path = self.data_folder + '/js_corpus_redistributed_' + self.job_name
    if self.remove_if_files_exist:
        if os.path.exists(target_path):
            shutil.rmtree(target_path)  # target_path is a directory, so os.remove would fail; requires `import shutil`
        os.mkdir(target_path)
    elif not os.path.exists(target_path):
        os.mkdir(target_path)
    counter = 0
    for record in callables:
        counter += 1
        progress = "\rProcessing: %d" % counter
        sys.stdout.write(progress)
        content = record[0].decode('utf-8')
        # Strip the original function name so every sample becomes an anonymous
        # function bound to the same variable: 'function foo(' -> 'var a = function('
        if re.findall(r'function[\s\S]*?\(', content):
            content = re.sub(r'function[\s\S]*?\(', 'function(', content, 1)
            content = 'var a = ' + content
        file_name = str(uuid.uuid4()) + '.js'
        try:
            self.create_file(target_path + '/' + file_name, content)
        except Exception:
            pass
    print('\rExecute Redistribution Finished on ' + str(counter) + ' Files')
    return True
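# Illustrative example of the transformation applied above (not project code):
# a functionized sample such as
#     function foo(x) { return x + 1; }
# is rewritten to an anonymous function bound to a variable,
#     var a = function(x) { return x + 1; }
# so every redistributed file exposes its callable under the same name 'a',
# which the final filtration step later strips again ('var a = ' prefix).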
def functionize(self):
    source_db_path = self.db_folder + '/js_corpus_initial_filtrated_' + self.job_name + '.db'
    if not os.path.exists(source_db_path):
        print('Error: \'' + source_db_path + '\' does not exist! Check and redo the last step.')
        return False
    target_db_path = self.db_folder + '/js_corpus_functionized_' + self.job_name + '.db'
    source_db_op = DBOperation(source_db_path, 'corpus')
    target_db_op = DBOperation(target_db_path, 'corpus')
    if self.remove_if_files_exist:
        if os.path.exists(target_db_path):
            os.remove(target_db_path)
        target_db_op.init_db()
    elif not os.path.exists(target_db_path):
        target_db_op.init_db()
    raws = source_db_op.query_all()
    counter = 0
    for raw in raws:
        counter += 1
        progress = "\rProcessing Raw No.%d" % counter
        sys.stdout.write(progress)
        raw = raw[0]
        if 'function' in raw:
            self.extract_function(raw, target_db_op)
    target_size = target_db_op.query_count()[0]
    print('\rExecute Functionizing Finished. Extracted ' + str(target_size) +
          ' Functions From ' + str(counter) + ' Raws.')
    return True
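# Assumption-based sketch only: the project's real extract_function() is defined
# elsewhere and is not shown here. A minimal version might scan for each
# 'function' keyword, capture a balanced-brace body, and insert it into the
# target database; everything below (name and logic) is illustrative.
def _extract_function_sketch(raw, target_db_op):
    start = raw.find('function')
    while start != -1:
        open_brace = raw.find('{', start)
        if open_brace == -1:
            break
        depth = 0
        end = open_brace
        for end in range(open_brace, len(raw)):
            if raw[end] == '{':
                depth += 1
            elif raw[end] == '}':
                depth -= 1
                if depth == 0:
                    break
        if depth == 0:
            # store the extracted function body as UTF-8 bytes
            target_db_op.insert(raw[start:end + 1].encode('utf-8'))
        start = raw.find('function', end + 1)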
def final_filtration(self):
    source_path = self.data_folder + '/js_corpus_redistributed_' + self.job_name
    if not os.path.exists(source_path):
        print('Error: \'' + source_path + '\' does not exist! Check and redo the last step.')
        return False
    db_path = self.db_folder + '/js_corpus_final_filtrated_' + self.job_name + '.db'
    db_op = DBOperation(db_path, "corpus")
    if self.remove_if_files_exist:
        if os.path.exists(db_path):
            os.remove(db_path)
        db_op.init_db()
    elif not os.path.exists(db_path):
        db_op.init_db()
    counter = 0
    if os.path.isdir(source_path):
        for root, dirs, files in os.walk(source_path):
            for file in files:
                counter += 1
                progress = "\rProcessing: %d --> %s" % (counter, file)
                sys.stdout.write(progress)
                # Keep only samples that still parse; strip the 'var a = ' wrapper
                # added during redistribution before storing them.
                if self.syntax_check(source_path + '/' + file):
                    with open(source_path + '/' + file, 'r') as f:
                        file_content = f.read().replace('var a = ', '', 1)
                    db_op.insert(file_content.encode('utf-8'))
        print('\rExecute Final Filtration Finished on ' + str(counter) + ' Files')
        return True
    else:
        print('\'' + source_path + '\' Is Not A Directory.')
        return False
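# Assumption-based sketch of a syntax_check() helper (the project's own version
# is not shown here): it shells out to Node.js, whose '--check' flag parses a
# script without executing it, and treats a zero exit code as "syntactically valid".
import subprocess

def _syntax_check_sketch(file_path):
    result = subprocess.run(['node', '--check', file_path],
                            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    return result.returncode == 0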
def redistribute():
    source_path = "../../BrowserFuzzingData/db/js_corpus_top_1000.db"
    source_op = DBOperation(source_path, "corpus")
    raw_callables = source_op.query_all()
    target_path = "../../BrowserFuzzingData/redistributed_top_1000"
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    file_count = len(raw_callables)
    counter = 0
    for raw in raw_callables:
        counter += 1
        progress = "\rProcessing: %d" % counter
        sys.stdout.write(progress)
        content = raw[0].decode('utf-8')
        generate_js_fuzzer(raw_callables, content)
    print('\nExecute Redistributing Finished on ' + str(file_count) + ' ' +
          FLAGS.file_type + ' Files')
class WordFreq:
    def __init__(self, methods, source_path, target_path):
        self.methods = methods
        self.columns = self.get_columns()
        self.db_op_source = DBOperation(source_path, 'corpus')
        self.db_op_target = DBOperation(target_path, "result")
        self.db_op_target.init_db()

    def frequence(self):
        # Count how often each method name appears in every record of the source DB
        file_name = 0  # running index used for the progress display
        db_file_number = self.db_op_source.query_count()[0]  # number of records in the source DB (592304)
        callables = self.db_op_source.query_all()  # all content rows
        count = 0  # counter, capped at the number of records
        length = len(self.methods)  # number of method names to count
        while count < db_file_number:
            file = ""
            file += callables[count][0].decode('utf-8')
            i = 0
            progress = "\rProcessing: %d" % file_name
            sys.stdout.write(progress)
            frequencies = []
            while i < length:
                frequencies.append(file.count(self.methods[i]))
                i = i + 1
            self.db_op_target.insert_frequencies(self.columns, frequencies)  # persist this row
            count = count + 1
            file_name = file_name + 1
        self.db_op_source.finalize()
        self.db_op_target.finalize()

    def get_columns(self):
        columns = []
        for i in range(0, len(self.methods)):
            columns.append("'" + self.methods[i] + "'")
        return columns
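# Usage sketch (the method list and both paths are illustrative, not taken from
# the project): count selected DOM API names across the final corpus DB and
# write the per-record frequency vectors into a result DB.
if __name__ == '__main__':
    methods = ['appendChild', 'createElement', 'addEventListener']
    wf = WordFreq(methods,
                  source_path='../../BrowserFuzzingData/db/js_corpus_final_top_1000.db',
                  target_path='../../BrowserFuzzingData/db/js_word_freq.db')
    wf.frequence()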
def write_files_to_db(self):
    """Write the corpus files into the database."""
    # Assemble the corpus path
    corpus_path = self.raw_folder
    # If the folder does not exist, report the error and stop early
    if not os.path.exists(corpus_path):
        print('Error: \'' + corpus_path + '\' does not exist! Check and redo the last step.')
        return False
    # Assemble the database file path
    db_path = self.db_folder + '/js_corpus_initial_filtrated_' + self.job_name + '.db'
    db_op = DBOperation(db_path, 'corpus')
    if self.remove_if_files_exist:
        if os.path.exists(db_path):
            os.remove(db_path)
        db_op.init_db()
    elif not os.path.exists(db_path):
        db_op.init_db()
    counter = 0
    if os.path.isdir(corpus_path):
        for root, dirs, files in os.walk(corpus_path):
            # This preprocessing pass runs over js files
            for file in files:
                try:
                    counter += 1
                    progress = "\rProcessing: %d --> %s" % (counter, file)
                    sys.stdout.write(progress)
                    with open(corpus_path + '/' + file, 'rb') as f:
                        db_op.insert(f.read().decode())
                except Exception:
                    pass
        print('\rExecute Writing Content to DB Finished on ' + str(counter) + ' Files.')
        return True
    else:
        print('\'' + corpus_path + '\' Is Not A Directory.')
        return False
def write_corpus_to_db(db_name):
    """Write the corpus into the database."""
    # Assemble the database file path
    db_path = FLAGS.db_folder + FLAGS.file_type + '_corpus_' + db_name + '.db'
    db_op = DBOperation(db_path, 'corpus')
    if not os.path.exists(db_path):
        db_op.init_db()
    # Assemble the corpus path; create the folder if it does not exist
    corpus_path = FLAGS.corpus_folder
    if not os.path.exists(corpus_path):
        os.mkdir(corpus_path)
    print('----------------------- Executing Write Corpus to DB -----------------------')
    file_count = 0
    counter = 0
    if os.path.isdir(corpus_path):
        for root, dirs, files in os.walk(corpus_path):
            # Count the source files in the corpus
            file_count += len(files)
            # This preprocessing pass runs over js files
            for file in files:
                try:
                    counter += 1
                    progress = "\rProcessing: %d -> %s\n" % (counter, file)
                    sys.stdout.write(progress)
                    with open(corpus_path + '/' + file, 'rb') as f:
                        db_op.insert(f.read().decode())
                except Exception:
                    pass
        print('Execute Write Corpus to DB on ' + str(file_count) + ' ' +
              FLAGS.file_type + ' Files.')
    else:
        print('\'' + corpus_path + '\' is not a directory.')
import xlrd
from db_operation import DBOperation

excel = xlrd.open_workbook("F://毕业论文资料//PT_LCRDSpending.xlsx")
table = excel.sheet_by_index(0)
nrows = table.nrows
result = []
word_list = ["大数据", "人工智能", "ai", "云", "区块链", "移动商务", "sas", "saas"]
db_operation = DBOperation()
for index in range(3, nrows):
    company_code = table.cell(index, 0).value
    year = int(table.cell(index, 1).value[0:4])
    detail = table.cell(index, 12).value
    if detail:
        for i in word_list:
            if i in detail:
                result.append((company_code, year))
                break
db_operation.insert_year_info(result)
class Selector:
    def __init__(self, db_path, save_path, target_count=100, temper_round=100):
        self.db_op = DBOperation(db_path, 'corpus')
        self.callables = self.db_op.query_all()
        self.callable_processor = CallableProcessor(self.callables)
        self.save_path = save_path
        self.target = target_count
        self.temper_round = temper_round

    def execute(self):
        index_of_callables = 0
        i = 0
        while i < self.target:
            try:
                function_body = self.callables[index_of_callables][0].decode('utf-8')
                test_case = ''
                self_calling = self.callable_processor.get_self_calling(function_body)
                self.create_and_fill_file('./ISTANBUL_TEST_CASE.js', self_calling)
                st, br, fu, li = self.istanbul_cover('ISTANBUL_TEST_CASE.js')
                for j in range(0, self.temper_round):
                    self_calling = self.callable_processor.get_self_calling(function_body)
                    self.create_and_fill_file('./ISTANBUL_TEST_CASE.js', self_calling)
                    st_tmp, br_tmp, fu_tmp, li_tmp = self.istanbul_cover('ISTANBUL_TEST_CASE.js')
                    # Keep the variant only if it improves the combined coverage
                    if st_tmp - st + br_tmp - br + fu_tmp - fu + li_tmp - li > 0:
                        test_case = self_calling
                        st, br, fu, li = self.set_coverage_values(st_tmp, br_tmp, fu_tmp, li_tmp)
                os.remove('./ISTANBUL_TEST_CASE.js')
                if len(test_case) > 0:
                    self.create_and_fill_file(self.save_path + '/' + str(i) + '.js', test_case)
                    i += 1
                index_of_callables += 1
            except Exception:
                pass

    def create_and_fill_file(self, file_path, content):
        with open(file_path, 'a') as file:
            file.write(content)

    def set_coverage_values(self, st_tmp, br_tmp, fu_tmp, li_tmp):
        return st_tmp, br_tmp, fu_tmp, li_tmp

    def istanbul_cover(self, file_name):
        st, br, fu, li = self.set_coverage_values(0, 0, 0, 0)
        cmd = ['istanbul', 'cover', file_name]
        p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        stdout_bytes, _ = p.communicate()  # wait for istanbul to finish and collect all output
        stdout = stdout_bytes.decode('utf-8')
        # istanbul prints four percentages: statements, branches, functions, lines
        coverage_of_single_sample = re.findall(r': [\s\S]*?%', stdout)
        if len(coverage_of_single_sample) == 4:
            st = re.sub('%', '', re.sub(': ', '', coverage_of_single_sample[0]))
            br = re.sub('%', '', re.sub(': ', '', coverage_of_single_sample[1]))
            fu = re.sub('%', '', re.sub(': ', '', coverage_of_single_sample[2]))
            li = re.sub('%', '', re.sub(': ', '', coverage_of_single_sample[3]))
        return float(st), float(br), float(fu), float(li)
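# Usage sketch (paths are illustrative, the database name is hypothetical):
# select up to 100 coverage-improving self-calling samples from a functionized
# corpus DB and write them to ./selected_cases. Requires Node.js with the
# `istanbul` CLI on PATH.
import os

if __name__ == '__main__':
    os.makedirs('./selected_cases', exist_ok=True)
    selector = Selector('../../BrowserFuzzingData/db/js_corpus_functionized_demo.db',
                        save_path='./selected_cases', target_count=100, temper_round=100)
    selector.execute()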
import time
import random
from datetime import datetime

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from db_operation import DBOperation
from selenium_text_crawler import SeleniumCompanyCrawler

chrome_options = Options()
db_operation = DBOperation()
se = SeleniumCompanyCrawler()
chrome_options.add_argument('lang=zh_CN.UTF-8')
chrome_options.add_argument("--headless")
USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"
]
chrome_options.add_argument("user_agent=" + random.choice(USER_AGENTS))
driver = webdriver.Chrome(options=chrome_options)

# Page through the listed-company table and collect each row
for i in range(35, 50):
    driver.get("http://s.askci.com/stock/a/?reportTime=2018-09-30&pageNum=" + str(i) + "#QueryCondition")
    time.sleep(4)
    rtable = driver.find_element_by_xpath('//*[@id="myTable04"]/tbody')
    trlist = rtable.find_elements_by_tag_name("tr")
    result = []
    for row in trlist:
        tdList = row.find_elements_by_tag_name("td")
        company_code = tdList[1].text
class GoogleSearchResultTextCrawler(object):
    db_operation = DBOperation()
    logger = logging.getLogger()  # no name given: configures the root logger
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh = logging.FileHandler('F://log.txt')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)

    def isElementExistByXpath(self, driver, element):
        try:
            driver.find_element_by_xpath(element)
            return True
        except:
            return False

    def isElementExistByClassName(self, driver, element):
        try:
            driver.find_element_by_class_name(element)
            return True
        except:
            return False

    # Analyse the search results for a single company
    def operate_browser(self, company_web_site, company_id):
        random_ip = random.choice(crawler_constant.PROXIES)
        proxies = {'http': random_ip}
        random_ua = random.choice(crawler_constant.USER_AGENTS)
        headers = {'User-Agent': random_ua, 'Connection': 'close'}
        search_title_link = crawler_constant.SEARCH_TEXT_URL.format(company_web_site)
        statistics_result = []
        start = 0
        try:
            response = requests.get(search_title_link + str(start), headers=headers,
                                    timeout=10, proxies=proxies).text
            # response = requests.get(
            #     search_title_link + str(start), headers=headers, timeout=10).text
            html_result = self.statistics_html(response, company_id, company_web_site)
            if not html_result:
                statistics_result.append([(company_id, 0, 0, 0, 0, 0)])
            else:
                statistics_result.append(html_result)
            while True:
                start += crawler_constant.PAGE_SIZE
                # Slow down after the first few result pages to avoid being blocked
                if start / crawler_constant.PAGE_SIZE > 5:
                    sleep_time = 10 + random.random()
                else:
                    sleep_time = random.randint(0, 2) + random.random()
                time.sleep(sleep_time)
                response_str = requests.get(search_title_link + str(start), headers=headers,
                                            timeout=10, proxies=proxies).text
                # response_str = requests.get(
                #     search_title_link + str(start), headers=headers, timeout=10).text
                html_result_str = self.statistics_html(response_str, company_id, company_web_site)
                if not html_result_str:
                    break
                else:
                    statistics_result.append(html_result_str)
            insert_records = []
            for i in statistics_result:
                for j in i:
                    insert_records.append(j)
            logging.info("start insert company id: " + str(company_id) +
                         " size: " + str(len(insert_records)))
            self.db_operation.batch_insert_records(insert_records)
        except Exception as e:
            logging.info(e)
            logging.info("company_id: " + str(company_id) + " insert records has exception")

    # Fetch company records in batches
    def get_statistics_results(self):
        start = 0
        page_size = 100
        return self.db_operation.get_company_info(start, page_size)
        # while True:
        #     result = self.db_operation.get_company_info(start, page_size)
        #     if not result:
        #         break
        #     else:
        #         for r in result:
        #             parsed_uri = urlparse(r.company_web_site)
        #             domain = '{uri.netloc}'.format(uri=parsed_uri)
        #             self.operate_browser(domain, r.company_id)
        #         start += page_size

    def statistics_html(self, html, company_id, company_web_site):
        statistics_result = []
        soup = BeautifulSoup(html, "lxml")
        search_result = soup.find_all(class_="g")
        if not search_result:
            return None
        for result in search_result:
            article_link = str(result.find('a')['href'])
            st_node = result.find(class_="st")
            try:
                position_count = article_link.split(company_web_site)[1].count("/")
                if st_node is not None:
                    # Take the year from the first text fragment of the snippet, if any
                    for st_str in st_node.strings:
                        try:
                            article_year_time_text = st_str
                            logging.info(article_year_time_text)
                            article_year = int(article_year_time_text[0:4])
                        except Exception as e:
                            article_year = -1
                        break
                    em_list = st_node.find_all('em')
                    b_list = st_node.find_all('b')
                    data_list = []
                    if em_list is not None or b_list is not None:
                        if b_list is not None:
                            for b in b_list:
                                data_list.append(b.get_text())
                        if em_list is not None:
                            for em in em_list:
                                data_list.append(em.get_text())
                        big_data_word_count = data_list.count('大数据')
                        ai_word_count = data_list.count('人工智能')
                        cloud_compute_word_count = data_list.count('云计算')
                    else:
                        big_data_word_count = 0
                        ai_word_count = 0
                        cloud_compute_word_count = 0
                    if big_data_word_count != 0:
                        statistics_result.append(
                            (company_id, article_year, position_count, '大数据',
                             big_data_word_count, 0))
                    if ai_word_count != 0:
                        statistics_result.append(
                            (company_id, article_year, position_count, '人工智能',
                             ai_word_count, 0))
                    if cloud_compute_word_count != 0:
                        statistics_result.append(
                            (company_id, article_year, position_count, '云计算',
                             cloud_compute_word_count, 0))
            except Exception as e:
                logging.info(e)
                logging.info("parsing the search result HTML with BeautifulSoup raised an exception")
        return statistics_result
import xlrd
from db_operation import DBOperation

excel = xlrd.open_workbook("F://毕业论文资料//固定资产-IT.xlsx")
table = excel.sheet_by_index(0)
nrows = table.nrows
result = {}
db_operation = DBOperation()
for index in range(3, nrows):
    company_code = table.cell(index, 0).value
    year = int(table.cell(index, 1).value[0:4])
    increase = table.cell(index, 6).value
    if increase:
        increase = round(float(table.cell(index, 6).value), 2)
    else:
        increase = float(0)
    if (company_code, year) in result.keys():
        result[(company_code, year)] += increase
    else:
        result[(company_code, year)] = increase

insert_records = []
for key, value in result.items():
    company_code = key[0]
    year = key[1]
    increase = value
    insert_records.append((company_code, year, increase))
db_operation.batch_insert_investment(insert_records)
class SeleniumCompanyCrawler(object):
    db_operation = DBOperation()
    logger = logging.getLogger()  # no name given: configures the root logger
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh = logging.FileHandler('F://log.txt')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)

    def isElementExistByXpath(self, driver, element):
        try:
            driver.find_element_by_xpath(element)
            return True
        except:
            return False

    def isElementExistByClassName(self, driver, element):
        try:
            driver.find_element_by_class_name(element)
            return True
        except:
            return False

    # Analyse the search results for a single company
    def operate_browser(self, company_web_site, company_id):
        chrome_options = Options()
        chrome_options.add_argument('lang=zh_CN.UTF-8')
        chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        # chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        USER_AGENTS = [
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
            "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
            "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; zh-CN)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
            "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
            "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
            "Mozilla/5.0 (X11; U; Linux; zh-CN) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
            "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
            "Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
        ]
        chrome_options.add_argument("user_agent=" + random.choice(USER_AGENTS))
        driver = webdriver.Chrome(options=chrome_options)
        # Open the Google home page and give it 2 seconds to load
        driver.get("http://www.google.com")
        time.sleep(2)
        # Grab the search box via its DOM name, submit a site-restricted query,
        # then wait for the result page before parsing it with BeautifulSoup
        search_box = driver.find_element_by_name('q')
        search_box.send_keys('site:' + company_web_site +
                             ' intext:"大数据"|intext:"人工智能"|intext:"云计算" ')
        search_box.submit()
        time.sleep(5)
        statistics_result = []
        while True:
            soup = BeautifulSoup(driver.page_source, "lxml")
            search_result = soup.find_all(class_="g")
            if not search_result:
                statistics_result.append((company_id, 0, 0, 0, 0, 0))
                break
            for result in search_result:
                article_link = str(result.find('a')['href'])
                st_node = result.find(class_="st")
                try:
                    em_result = st_node.find('em').contents
                    big_data_word_count = em_result.count('大数据')
                    ai_word_count = em_result.count('人工智能')
                    cloud_compute_word_count = em_result.count('云计算')
                    position_count = article_link.split(company_web_site)[1].count("/")
                    # If the snippet carries a date, record the year with the counts
                    if st_node.find(class_="f") is not None:
                        article_time_text = st_node.find(class_="f")
                        article_year = int(article_time_text.text[0:4])
                        if big_data_word_count != 0:
                            statistics_result.append(
                                (company_id, article_year, position_count, '大数据',
                                 big_data_word_count, 0))
                        if ai_word_count != 0:
                            statistics_result.append(
                                (company_id, article_year, position_count, '人工智能',
                                 ai_word_count, 0))
                        if cloud_compute_word_count != 0:
                            statistics_result.append(
                                (company_id, article_year, position_count, '云计算',
                                 cloud_compute_word_count, 0))
                    # No date available: record the counts separately with year -1
                    else:
                        if big_data_word_count != 0:
                            statistics_result.append(
                                (company_id, -1, position_count, '大数据',
                                 big_data_word_count, 1))
                        if ai_word_count != 0:
                            statistics_result.append(
                                (company_id, -1, position_count, '人工智能',
                                 ai_word_count, 0))
                        if cloud_compute_word_count != 0:
                            statistics_result.append(
                                (company_id, -1, position_count, '云计算',
                                 cloud_compute_word_count, 0))
                except Exception as e:
                    logging.info("crawler website: " + company_web_site + " id: " +
                                 str(company_id) + " has exception")
                    print(e)
                    continue
            # "Next page" button
            if (SeleniumCompanyCrawler.isElementExistByXpath(
                    self, driver, '//*[@id="pnnext"]/span[2]')):
                next_page = driver.find_element_by_xpath('//*[@id="pnnext"]/span[2]')
                next_page.click()
                time.sleep(5)
            else:
                time.sleep(60)
                break
        # driver.close()
        try:
            self.db_operation.batch_insert_records(statistics_result)
        except Exception as e:
            logging.info(statistics_result)
            logging.info("insert records has exception")

    # Fetch company records in batches and crawl each one
    def get_statistics_results(self):
        start = 0
        page_size = 20
        while True:
            result = self.db_operation.get_company_info(start, page_size)
            if not result:
                break
            else:
                for r in result:
                    parsed_uri = urlparse(r.company_web_site)
                    domain = '{uri.netloc}'.format(uri=parsed_uri)
                    self.operate_browser(domain, r.company_id)
                start += page_size
def readdb():
    db_path = '../../BrowserFuzzingData/js_corpus_final_top_1000.db'  # database file
    op = DBOperation(db_path, 'corpus')
    result = op.query_all()  # result holds the whole data set, as a list of rows
    # print(result)
    return result
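# Usage sketch: iterate over the rows returned by readdb(). The first column of
# each row is assumed (as elsewhere in this project) to hold the stored source,
# either as UTF-8 bytes or as a plain string.
if __name__ == '__main__':
    for row in readdb():
        content = row[0]
        print(content.decode('utf-8') if isinstance(content, bytes) else content)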