def SetupProject(git_repo):
    error_log = ErrorLog()
    setup_bat_file_path = join(common_file_path, 'setup_git_stuff.bat')
    dst_file_path = join(main_dir_path, 'setup_git_stuff.bat')
    if not os.path.exists(setup_bat_file_path):
        error_log.LogError("Setup Git Stuff file template not found")
        return False
    try:
        with open(setup_bat_file_path) as setup_file:
            dst_file = open(dst_file_path, 'w+')
            for line in setup_file:
                temp_line = line.replace('%HOST%', git_repo.host)
                temp_line = temp_line.replace('%ORGANIZATION%', git_repo.organization)
                temp_line = temp_line.replace('%PROJECT%', git_repo.project)
                dst_file.write(temp_line)
            dst_file.close()
    except OSError:
        error_log.LogError("Unable to generate bat file for git")
        return False
    try:
        subprocess.call([dst_file_path])
    except OSError:
        error_log.LogError("Git setup bat file execution failed for " + git_repo.project)
        return False
    os.remove(dst_file_path)
    return True
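# NOTE: The GitRepo class used throughout these scripts is not part of this
# dump. A minimal sketch consistent with how it is used here (attributes
# host / organization / project consumed by SetupProject, plus a SetProject
# setter) might look like the following; the default host and organization
# values are placeholders, not the real configuration.
class GitRepo:
    def __init__(self, host='github.com', organization='ExampleOrg', project=''):
        self.host = host                  # substituted for %HOST% in the bat template
        self.organization = organization  # substituted for %ORGANIZATION%
        self.project = project            # substituted for %PROJECT%

    def SetProject(self, project):
        self.project = project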
def RemoveFile(path):
    error_log = ErrorLog()
    if os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            error_log.LogError("Unable to delete " + path)
def FixStupidSolutionFile(proj_name):
    error_log = ErrorLog()
    temp_sln_file_path = join(main_dir_path, proj_name, proj_name + "_temp.sln")
    sln_file_path = join(main_dir_path, proj_name, proj_name + ".sln")
    os.rename(sln_file_path, temp_sln_file_path)
    try:
        with open(temp_sln_file_path) as sln_file:
            output_file = open(sln_file_path, 'w+')
            for line in sln_file:
                if line.find('"src", "src",') != -1:
                    sln_file.readline()  # also skip the line that follows
                elif line.find('GlobalSection(NestedProjects) = preSolution') != -1:
                    sln_file.readline()  # skip the section body
                    sln_file.readline()  # and its closing line
                else:
                    output_file.write(line)
            output_file.close()
    except OSError:
        error_log.LogError("Unable to fix " + proj_name + " solution file")
        return
    os.remove(temp_sln_file_path)
def __init__(self, my_database):
    """
    Initialize the book collection object.
    :param my_database: database object
    """
    self._error_log = ErrorLog()  # create the error-log writer
    self._db = my_database        # database object
    self._books = self._db.book   # the `book` collection
def __init__(self, my_database):
    """
    URL collection object.
    :param my_database: database object
    """
    self._error_log = ErrorLog()  # create the error-log object
    self._db = my_database        # database object
    self._urls = self._db.urls    # the `urls` collection
def AddSharedSubmodule(mapping_dir_path):
    error_log = ErrorLog()
    bat_file_path = join(mapping_dir_path, 'add_shared.bat')
    try:
        subprocess.call([bat_file_path])
    except OSError:
        error_log.LogError("Unable to add shared submodule")
    os.remove(bat_file_path)
def RemoveNugetExe(path):
    error_log = ErrorLog()
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.lower() == "nuget.exe":
                os.remove(join(root, file))
                if not os.listdir(root):
                    os.removedirs(root)
                return
    error_log.LogError("Could not find nuget.exe in dir " + path)
def AddMapperProjToSolution(proj_name, root_dir):
    error_log = ErrorLog()
    bat_file_path = join(root_dir, 'add_project.bat')
    AddCustomFile(join(common_file_path, 'replaceable'), 'add_project.bat',
                  proj_name, root_dir)
    try:
        subprocess.call([bat_file_path])
    except OSError:
        error_log.LogError("Unable to fix sln for " + proj_name + " Project")
    FixStupidSolutionFile(proj_name, root_dir)
    os.remove(bat_file_path)
def MoveInMappingFiles(proj_name):
    error_log = ErrorLog()
    mapping_path = join(main_dir_path, proj_name + '.Mapper', 'cpp')
    dst_path = join(main_dir_path, proj_name, 'src', proj_name + '.Mapper')
    for file_name in os.listdir(mapping_path):
        if file_name.find('.map.cpp') == -1:
            try:
                copy(join(mapping_path, file_name), dst_path)
            except OSError:
                error_log.LogError("Failed to move mapper file: " +
                                   join(mapping_path, file_name))
def __init__(self, thread_count):
    """
    Initialize the crawler object.
    :param thread_count: thread-count tracking object
    """
    self._conn = MyDatabase()
    self._db = self._conn.database
    self._book_coll = BookColl(self._db)  # collection wrappers
    self._url_coll = UrlColl(self._db)
    self._thread_count = thread_count
    self._error_log = ErrorLog()          # error-log writer
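# NOTE: MyDatabase is not included in this dump. A minimal pymongo-based
# stand-in that matches how the crawler uses it (a `.database` attribute
# exposing the `book` and `urls` collections, plus `close_conn()`) could
# look like this; the URI and database name are assumptions.
from pymongo import MongoClient

class MyDatabase:
    def __init__(self, uri='mongodb://localhost:27017', db_name='book_db'):
        self._client = MongoClient(uri)
        self.database = self._client[db_name]  # .book and .urls hang off this

    def close_conn(self):
        self._client.close()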
def main():
    thread_count = ThreadCount()    # thread counter
    error_log = ErrorLog()          # error-log writer
    error_log.clear_error_log()     # create the file if missing and clear it
    crawler = Crawler(thread_count)
    url = get_one_url()
    while url is not None:
        if thread_count.total < 5:  # stay under the thread limit
            print("Loading: " + url)
            crawler.get_book(url)   # let the crawler fetch the book page
            url = get_one_url()     # fetch the next URL
        else:
            sleep(10)               # too many threads; wait 10s for some to finish
def AddMainDirCommonFile(dst_path):
    error_log = ErrorLog()
    src_dir_path = join(common_file_path, 'main_dir')
    dir_contents = os.listdir(src_dir_path)
    for f in dir_contents:
        src_file_path = join(src_dir_path, f)
        if isfile(src_file_path):
            try:
                copy(src_file_path, dst_path)
            except OSError:
                error_log.LogError("Unable to copy file " + f + " to " + dst_path)
        elif not isdir(src_file_path):
            error_log.LogError("Unable to copy file " + f + " to " + dst_path)
def __init__(self, inputdata, outputdata):
    dao = DAOPsql('furman')
    self.geo = GeoSearch(dao)
    self.error_log = ErrorLog(self.__class__.__name__)
    self.progress = Progress()
    self.input = inputdata
    self.output = outputdata
def AddCustomFile(src_dir, file_name, replace_value, dst_dir):
    error_log = ErrorLog()
    orig_file_path = join(src_dir, file_name)
    if not os.path.exists(orig_file_path):
        error_log.LogError("Common file template not found")
        return
    dst_file_path = join(dst_dir, file_name)
    if file_name.find('AppName') != -1:
        dst_file_path = join(dst_dir, file_name.replace('AppName', replace_value))
    try:
        with open(orig_file_path) as orig_file:
            dst_file = open(dst_file_path, 'w+')
            dst_file.write(orig_file.read().replace('%REPLACE%', replace_value))
            dst_file.close()
    except OSError:
        error_log.LogError("Unable to create custom file " + file_name +
                           " with replace value " + replace_value)
def __init__(self, database):
    self.connection = None
    self.db = database
    self.sgbd = None
    self.dbAtrib = None
    self.settings()
    self.error_log = ErrorLog(self.__class__.__name__)
    self.cursor = self.get_connection().cursor()
def GenerateMappingEnvironment(dir_path="c:/Starcounter/AppName", git_repo=GitRepo()):
    error_log = ErrorLog()
    app_name = dir_path.split('/')[-1]
    mapping_dir_path = join(dir_path, 'src', app_name + ".Mapper")
    replaceable_dir_path = join(common_file_path, 'replaceable')
    git_repo.SetProject(app_name)
    if not os.path.exists(mapping_dir_path):
        try:
            os.makedirs(mapping_dir_path)
        except OSError:
            error_log.LogError("Creation of Mapper directory failed. "
                               "Check permissions and try again")
            return
    AddMainDirCommonFile(mapping_dir_path)
    AddCustomFile(replaceable_dir_path, 'AppName.Mapper.csproj', app_name,
                  mapping_dir_path)
    AddSharedSubmodule(mapping_dir_path)
    AddMapperProjToSolution(app_name, dir_path)
class ConnectionFactory:
    def __init__(self, database):
        self.connection = None
        self.db = database
        self.sgbd = None
        self.dbAtrib = None
        self.settings()
        self.error_log = ErrorLog(self.__class__.__name__)
        self.cursor = self.get_connection().cursor()

    def get_connection(self):
        try:
            self.connection = self.sgbd.connect(user=self.db.user,
                                                passwd=self.db.pswrd,
                                                db=self.db.db_name)
            return self.connection
        except Exception as e:
            self.close()
            self.error_log.open()
            self.error_log.write(e.message)
            self.error_log.close()

    def close(self):
        self.connection.close()

    def settings(self):
        pass
class PSQL:
    def __init__(self, database):
        self.connection = None
        self.db = database
        self.sgbd = None
        self.dbAtrib = None
        self.settings()
        self.error_log = ErrorLog(self.__class__.__name__)
        self.cursor = self.get_connection().cursor()

    def settings(self):
        try:
            self.db.db_name = "nycgisdb"
            self.db.user = "******"
            self.db.host = "localhost"
            self.db.pswrd = "m2a3rcio"
            self.sgbd = psycopg2
            self.dbAtrib = ("dbname='" + self.db.db_name +
                            "' user='******' host='" + self.db.host +
                            "' password='******'")
        except TypeError as e:
            print e.message

    def select(self, arg):
        self.cursor.execute(arg)
        self.connection.commit()
        return self.cursor.fetchall()

    def get_connection(self):
        try:
            self.connection = self.sgbd.connect(self.dbAtrib)
            return self.connection
        except Exception as e:
            self.error_log.open()
            self.error_log.write(e.message)
            self.error_log.close()
class BookColl(object):
    """
    Book collection wrapper.
    """

    def __init__(self, my_database):
        """
        Initialize the book collection object.
        :param my_database: database object
        """
        self._error_log = ErrorLog()  # error-log writer
        self._db = my_database        # database object
        self._books = self._db.book   # the `book` collection

    def insert_to_db(self, data):
        """
        Insert a document into the database.
        :type data dict
        :param data: the dict to insert
        :return: None
        """
        try:
            self._books.insert_one(data)
        except Exception as e:
            self._error_log.write_error('BookColl insert error: ' + str(e))

    def get_book_name(self):
        """
        Print and dump all book names.
        :return: None
        """
        # for result in self._books.find({'book_name': {'$regex': '\w'}}):
        for result in self._books.find():
            print(result['book_name'])
            with open('book_name.txt', 'a', encoding='utf-8') as f:
                f.write(result['book_name'] + '\n')
def error_txt_to_csv(ms_name):
    with open('./error-logs/esb-' + ms_name + '-errors.txt', 'r',
              encoding='utf-8-sig') as csv_in_file:
        reader = csv.reader(csv_in_file, delimiter='|')
        print('Creating esb-' + ms_name + '-errors.csv file...')
        print('Writing into esb-' + ms_name + '-errors.csv file...')
        with open('./error-logs/esb-' + ms_name + '-errors.csv', 'w',
                  newline='') as csv_out_file:
            writer = csv.writer(csv_out_file, delimiter=',')
            temp = ErrorLog()
            writer.writerow(temp.__dict__.keys())
            for row in reader:
                writer.writerow(get_error_log_from_txt(row))
    print('Finished processing esb-' + ms_name + '-errors.csv file... \n')
def get_error_log_from_txt(data):
    error_log = ErrorLog()
    error_log.date = data[0]
    error_log.log_level = data[1]
    error_log.log_id = data[2]
    error_log.log_event = data[3]
    error_log.route = data[4]
    error_log.transaction_id = data[6]
    error_log.user_email = data[9]
    error_log.package = data[13]
    error_log.error_message = data[14]
    return error_log
def get_error_log_from_dict(data):
    error_log = ErrorLog()
    error_log.date = data['_date']
    error_log.log_level = data['_log_level']
    error_log.log_id = data['_log_id']
    error_log.log_event = data['_log_event']
    error_log.route = data['_route']
    error_log.transaction_id = data['_transaction_id']
    error_log.user_email = data['_user_email']
    error_log.package = data['_package']
    error_log.error_message = data['_error_message']
    return error_log
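# NOTE: get_error_log_from_txt/get_error_log_from_dict assign plain attribute
# names (error_log.date) while the dict variant reads underscore-prefixed keys
# ('_date') and error_txt_to_csv writes temp.__dict__.keys() as the CSV header.
# That is consistent with ErrorLog exposing properties over _-prefixed backing
# fields, as in this hypothetical sketch (only `date` is spelled out; the
# remaining fields would follow the same pattern):
class ErrorLog:
    def __init__(self):
        self._date = ''
        self._log_level = ''
        self._log_id = ''
        self._log_event = ''
        self._route = ''
        self._transaction_id = ''
        self._user_email = ''
        self._package = ''
        self._error_message = ''

    @property
    def date(self):
        return self._date

    @date.setter
    def date(self, value):
        self._date = value

    # ...analogous properties for log_level, log_id, log_event, route,
    # transaction_id, user_email, package, and error_message...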
class UrlColl(object):
    """
    URL collection wrapper.
    """

    def __init__(self, my_database):
        """
        URL collection object.
        :param my_database: database object
        """
        self._error_log = ErrorLog()  # error-log writer
        self._db = my_database        # database object
        self._urls = self._db.urls    # the `urls` collection

    def add_url(self, url):
        """
        Add a URL to the collection.
        :param url: the URL
        :return: None
        """
        try:
            if not self.is_exist_url(url):  # skip duplicates
                self._urls.insert_one({'url': url, 'isExist': 'false'})
        except Exception as e:
            self._error_log.write_error('UrlColl add error: ' + str(e))

    def is_exist_url(self, url):
        """
        Check whether a URL is already stored.
        :param url: the URL
        :return: boolean, True if it exists, False otherwise
        """
        try:
            result = self._urls.find_one({"url": url})
            return result is not None
        except Exception as e:
            self._error_log.write_error('UrlColl lookup error: ' + str(e))

    def get_url(self):
        """
        Fetch one not-yet-crawled URL at a random offset.
        :return: the URL string
        """
        num = randint(1, 100)  # random skip
        try:
            result = self._urls.find({'isExist': 'false'}).skip(num).limit(1)
            return result[0]['url']
        except Exception as e:
            self._error_log.write_error('UrlColl get_url error: ' + str(e))

    def update_url(self, url):
        """
        Mark a URL as crawled.
        :param url: the URL to update
        :return: None
        """
        try:
            self._urls.update_one({'url': url},
                                  {'$set': {'isExist': 'true'}})  # flag as crawled
        except Exception as e:
            self._error_log.write_error('UrlColl update error: ' + str(e))
class RealEstateSettings:
    def __init__(self, inputdata, outputdata):
        dao = DAOPsql('furman')
        self.geo = GeoSearch(dao)
        self.error_log = ErrorLog(self.__class__.__name__)
        self.progress = Progress()
        self.input = inputdata
        self.output = outputdata

    def fix_acris(self):
        tuples = self.preprocess()
        real_estates = []
        while tuples:
            try:
                t = tuples.pop(0)
                bbl = Normalizer.set_bbl(t[0], t[1], t[2])
                address = t[3] + " " + t[4]
                address = Normalizer.set_address(address, bbl)
                date = Normalizer.set_str_to_epoch(t[5])
                price = t[6]
                real_estates.append((bbl, address, date, price))
            except ValueError:
                self.error_log.open()
                self.error_log.write(t[1] + ", " + str(t[0]))
                self.error_log.close()
            except KeyboardInterrupt:
                print ""
                print "Stopped"
        CsvManager.append_geo_codes(real_estates, self.output)

    def preprocess(self):
        tuples = CsvManager.read(self.input)
        num = CsvManager.read_progress()
        print num
        if num == 0:
            CsvManager.write_geo_codes([], self.output)
            CsvManager.write_progress('0')
        self.progress.set_size(len(tuples))
        self.progress.update_progress(num)
        Normalizer.set_tuple(num, tuples)
        return tuples

    def build_geocodings(self):
        nominatim = NominatimGeocode(self.progress, self.error_log, self.geo)
        google = GoogleGeocode(self.progress, self.error_log, self.geo)
        opencage = OpenCageGeocode(self.progress, self.error_log, self.geo)
        bing = BingGeocode(self.progress, self.error_log, self.geo)
        tiger = TIGERGeocode(self.progress, self.error_log, self.geo)
        return nominatim, google, opencage, bing, tiger

    def search_lat_long(self):
        tuples = self.preprocess()
        count = 1
        nominatim, google, opencage, bing, tiger = self.build_geocodings()
        while tuples:
            t = tuples.pop(0)
            status, found = self.geocode_process(t, nominatim)
            if not found:
                if status == -1:
                    status, found = self.geocode_process(t, bing)
                    if not found and status == -1:
                        self.geocode_process(t, tiger)
                elif status == -2:
                    i = 1
                    while i < 3:
                        print "Waiting 45' for the " + Normalizer.set_order(str(i)) + " time"
                        time.sleep(2700)
                        status, found = self.geocode_process(t, nominatim)
                        if found:
                            continue
                        elif status == -2:
                            i += 1
                        elif status == -3:
                            return
            if count % 100 == 0:
                for i in range(3):
                    t = tuples.pop(0)
                    status, found = self.geocode_process(t, google)
                    time.sleep(3)
                    if not found:
                        self.geocode_process(t, opencage)
                        time.sleep(3)
            else:
                t = tuples.pop(0)
                self.geocode_process(t, opencage)
                time.sleep(3)
            count += 1

    def geocode_process(self, t, geocode):
        re, num = geocode.get_coordinates(t)
        if num:
            CsvManager.append_geo_codes([re], self.output)
            self.progress.update_progress(num)
        else:
            val = CsvManager.read_progress()
            self.progress.update_progress(val + 1)
        return re, num
def FixStupidSolutionFile(proj_name, root_dir):
    error_log = ErrorLog()
    temp_sln_file_path = join(root_dir, proj_name + "_temp.sln")
    sln_file_path = join(root_dir, proj_name + ".sln")
    os.rename(sln_file_path, temp_sln_file_path)
    try:
        with open(temp_sln_file_path) as sln_file:
            output_file = open(sln_file_path, 'w+')
            for line in sln_file:
                if line.find('"src", "src",') != -1:
                    sln_file.readline()
                elif line.find('GlobalSection(NestedProjects) = preSolution') != -1:
                    sln_file.readline()
                    sln_file.readline()
                else:
                    output_file.write(line)
            output_file.close()
    except OSError:
        error_log.LogError("Unable to fix " + proj_name + " solution file")
        return
    os.remove(temp_sln_file_path)


error_log = ErrorLog()
error_log.ClearErrorLog()
GenerateMappingEnvironment()
                )
            elif line.find('mapperOutput') != -1:
                output_file.write(
                    ' "mapperOutput": "..\\\$app$\\\\bin\\\$config$\\\$app$.map.cpp",\n'
                )
            else:
                output_file.write(line)
        output_file.close()
    except OSError:
        error_log.LogError("Unable to fix " + proj_name + " mgen file")
        return
    os.remove(temp_file_path)


# MAIN
error_log = ErrorLog()
subfolders = [f.path for f in os.scandir(main_dir_path) if f.is_dir()]
for dir in subfolders:
    if dir.find(".Mapper") != -1 and dir.find("Blending") == -1:
        proj_name = dir.split('\\')[-1].split('.')[0]
        git_repo = GitRepo()
        git_repo.SetProject(proj_name)
        if SetupProject(git_repo):
            # Remove unnecessary dirs
            RemoveFile(join(main_dir_path, proj_name, 'Rebracer.xml'))
            RemoveNugetExe(join(main_dir_path, proj_name))
            # Add new dirs not added by setup bat
            try:
                os.mkdir(join(main_dir_path, proj_name, '%STAR_NUGET%'))
            except OSError:
                error_log.LogError("%STAR_NUGET% folder already exists for " +
                                   proj_name)
class Crawler(object):
    def __init__(self, thread_count):
        """
        Initialize the crawler object.
        :param thread_count: thread-count tracking object
        """
        self._conn = MyDatabase()
        self._db = self._conn.database
        self._book_coll = BookColl(self._db)  # collection wrappers
        self._url_coll = UrlColl(self._db)
        self._thread_count = thread_count
        self._error_log = ErrorLog()          # error-log writer

    def get_book(self, url):
        """
        Fetch the data for one book.
        :param url: URL of the book page
        :return: None
        """
        book = {}  # dict that will hold the scraped fields
        # Start the browser driver and get a driver object.
        driver = webdriver.Firefox(
            executable_path=r'E:\DevelopTools\Python\geckodriver')
        # driver = webdriver.Ie(executable_path=r'E:\DevelopTools\Python\IEDriverServer')
        try:
            driver.set_page_load_timeout(12)  # page-load timeout
            driver.set_script_timeout(30)     # script-response timeout
            driver.get(url)
            # Scroll down 100000px so lazy-loaded content renders.
            js = "var q=document.documentElement.scrollTop=100000"
            driver.execute_script(js)
            time.sleep(1)  # wait for the browser to catch up
            js = "var q=document.documentElement.scrollTop=0"  # back to the top
            driver.execute_script(js)
            time.sleep(2)
            js = "var q=document.documentElement.scrollTop=100000"  # back to the bottom
            driver.execute_script(js)
            time.sleep(1)  # scrolling simulation done
            soup = BeautifulSoup(driver.page_source, "lxml")  # parse the page with bs4
        except Exception as e:
            print(e)
            self._error_log.write_error(e)  # record the error
            return
        finally:
            driver.close()  # close the browser
        # target = driver.find_element_by_id("footer")
        # driver.execute_script("arguments[0].scrollIntoView();", target)  # scroll element into view
        # Extract the individual fields below.
        null_wrap = soup.find("div", {"class": "null_wrap"})
        if null_wrap is not None:
            self._url_coll.update_url(url)
            return
        book['url'] = url
        book_name = soup.find("div", {"class": "name_info"})
        if book_name is None:
            self._url_coll.update_url(url)
            return
        book['book_name'] = book_name.h1.get_text(strip=True)
        book['image_url'] = soup.find("div", {"class": "big_pic"}).img['src']
        book['book_type'] = soup.find("div", {"class": "breadcrumb"}).get_text(strip=True)
        book['introduction'] = soup.find("span", {"class": "head_title_name"}).get_text(strip=True)
        author = soup.find("span", {"id": "author"})
        if author is None:
            book['author'] = ""
        else:
            book['author'] = author.text
        messbox = soup.find("div", {"class": "messbox_info"})
        for item in messbox:
            if "出版社" in str(item):      # publisher
                book['publishing'] = item.get_text(strip=True)
            elif "出版时间" in str(item):  # publication date
                book['publishing_time'] = item.get_text(strip=True)
        book['price'] = soup.find("p", {"id": "dd-price"}).get_text(strip=True).split("¥")[1]
        editors_choice = soup.find("div", {"id": "abstract"})
        if editors_choice is None:
            book['editors_choice'] = ""
        else:
            book['editors_choice'] = editors_choice.contents[1].get_text()
        content_validity = soup.find("div", {"id": "content"})
        if content_validity is None:
            book['content_validity'] = ""
        else:
            book['content_validity'] = content_validity.contents[1].get_text()
        about_author = soup.find("div", {"id": "authorIntroduction"})
        if about_author is None:
            book['about_author'] = ""
        else:
            book['about_author'] = about_author.contents[1].get_text()
        catalog = soup.find("textarea", {"id": "catalog-textarea"})
        if catalog is None:
            catalog2 = soup.find("div", {"id": "catalog"})
            if catalog2 is None:
                book['catalog'] = ""
            else:
                book['catalog'] = catalog2.contents[1].get_text()
        else:
            book['catalog'] = catalog.get_text(strip=True)
        media_reviews = soup.find("div", {"id": "mediaFeedback"})
        if media_reviews is None:
            book['media_reviews'] = ""
        else:
            book['media_reviews'] = media_reviews.get_text()
        # All fields collected; insert into the book collection.
        self._book_coll.insert_to_db(book)
        self._conn.close_conn()
        print(url + " done")
        try:
            self._thread_count.add_one()                 # bump the thread count
            thread = MyThread(soup, self._thread_count)  # create the worker thread
            thread.start()                               # start it
        except Exception as e:
            self._error_log.write_error(e)               # record the error
            print("Error: unable to start thread: " + str(e))