def __init__(self, input_shapes=None, params=None, logger_path=None, root_path=ROOT_PATH):
    """create instance of AbstractModel

    :type logger_path: str
    :param logger_path: path for log file
        if logger_path is None, log only to stdout
    """
    self.root_path = root_path
    if logger_path is None:
        self.log = Logger(self.__class__.__name__, LOG_PATH)
    else:
        self.log = Logger(self.__class__.__name__, logger_path)

    self.sess = None
    self.saver = None
    self.summary_writer = None
    self.is_built = False

    # gen instance id
    self.input_shapes = input_shapes
    self.params = params
    self.id = "_".join([self.__str__(), time_stamp()])

    self.instance_path = os.path.join(INSTANCE_PATH, self.id)
    self.instance_visual_result_folder_path = os.path.join(self.instance_path, VISUAL_RESULT_FOLDER)
    self.instance_source_folder_path = os.path.join(self.instance_path, 'src_code')
    self.instance_summary_folder_path = os.path.join(self.instance_path, 'summary')
    self.instance_class_name = self.__class__.__name__
    self.instance_source_path = os.path.join(self.instance_source_folder_path, self.id + '.py')
    self.metadata_path = os.path.join(self.instance_path, 'instance.meta')
    self.save_folder_path = os.path.join(self.instance_path, 'check_point')
    self.check_point_path = os.path.join(self.save_folder_path, 'instance.ckpt')

    self.metadata = {
        MODEL_METADATA_KEY_INSTANCE_ID: self.id,
        MODEL_METADATA_KEY_INSTANCE_PATH: self.instance_path,
        MODEL_METADATA_KEY_INSTANCE_VISUAL_RESULT_FOLDER_PATH: self.instance_visual_result_folder_path,
        MODEL_METADATA_KEY_INSTANCE_SOURCE_FOLDER_PATH: self.instance_source_folder_path,
        MODEL_METADATA_KEY_INSTANCE_SOURCE_PATH: self.instance_source_path,
        MODEL_METADATA_KEY_INSTANCE_SUMMARY_FOLDER_PATH: self.instance_summary_folder_path,
        MODEL_METADATA_KEY_INSTANCE_CLASS_NAME: self.instance_class_name,
        MODEL_METADATA_KEY_METADATA_PATH: self.metadata_path,
        MODEL_METADATA_KEY_CHECK_POINT_PATH: self.check_point_path,
        MODEL_METADATA_KEY_SAVE_FOLDER_PATH: self.save_folder_path,
        MODEL_METADATA_KEY_PARAMS: self.params,
        MODEL_METADATA_KEY_INPUT_SHAPES: self.input_shapes,
    }
def __init__(self, env, serviceName):
    self.log = Logger("debug")
    opera = OperationIni()
    chrome_driver = findPath.data_dir(fileName='chromedriver.exe', pathName='driver')
    base_url = opera.read_ini(section='CONFIG', key='base_url')
    url = base_url + opera.read_ini(section=env, key='url')
    self.userName = opera.read_ini(section='CONFIG', key='userName')
    self.passWord = opera.read_ini(section='CONFIG', key='passWord')
    self.ServiceName = opera.read_ini(section='CONFIG', key=serviceName)

    chrome_options = Options()
    # run the Chrome browser in headless mode
    chrome_options.add_argument('--headless')
    self.log.info("Starting webdriver in headless Chrome mode")
    self.d = webdriver.Chrome(executable_path=chrome_driver, chrome_options=chrome_options)
    self.d.maximize_window()
    self.log.info('Chrome browser opened successfully')
    self.d.get(url)
    self.d.implicitly_wait(30)
    print('Successfully opened URL: {0}'.format(url))
    self.log.info('Successfully opened URL: {0}'.format(url))
def __init__(self, func=None, n_parallel=4, initializer=None, initargs=(), child_timeout=30):
    self.logger = Logger(self.__class__.__name__)
    self.log = self.logger.get_log()

    self.func = func
    self.n_parallel = n_parallel
    if initializer is None:
        self.initializer = init_worker
    else:
        self.initializer = initializer
    self.initargs = initargs
    self.child_timeout = child_timeout

    # one single-worker pool per parallel slot, each started with the chosen initializer
    self.pools = [
        Pool(1, initializer=self.initializer, initargs=initargs)
        for _ in range(n_parallel)
    ]
    self.queues = [Queue() for _ in range(n_parallel)]
    self.pbar = None
    self.fail_list = []
def __init__(self, preprocess=None, batch_after_task=None, before_load_task=None):
    """init dataset attrs

    *** the attrs below must be initialized with other values ***
    self._SOURCE_URL: (str) url for downloading the dataset
    self._SOURCE_FILE: (str) file name of the zipped dataset
    self._data_files: (str) file names in the dataset
    self.batch_keys: (str) feature labels of the dataset,
        managing batch keys in dict_keys.dataset_batch_keys is recommended

    :param preprocess: injected function for preprocessing the dataset
    :param batch_after_task: injected function run after iterating a mini_batch
    :param before_load_task: hookable function for AbstractDataset.before_load
    """
    self._SOURCE_URL = None
    self._SOURCE_FILE = None
    self._data_files = None
    self.batch_keys = None

    self.logger = Logger(self.__class__.__name__, stdout_only=True)
    self.log = self.logger.get_log()
    self.preprocess = preprocess
    self.batch_after_task = batch_after_task
    self.data = {}
    self.cursor = {}
    self.data_size = 0
    self.before_load_task = before_load_task
def __init__(self, creator, server, name=None, description=None, openTime=None,
             closeTime=None, absoluteThreshold=None, percentThreshold=None,
             percentThresholdMinimum=None, thresholdTime=None, keepUpdated=True,
             pollid=None):
    self.log = Logger()
    self.base = CabbageBase()
    self.creator = creator
    self.server = server
    self.name = name
    self.description = description
    self.openTime = openTime
    self.closeTime = closeTime
    self.absoluteThreshold = absoluteThreshold
    self.percentThreshold = percentThreshold
    self.percentThresholdMinimum = percentThresholdMinimum
    self.thresholdTime = thresholdTime
    self.options = {'short': [], 'long': [], 'emoji': []}
    self.keepUpdated = keepUpdated
    if pollid:
        self.pollid = pollid
    else:
        self.genPollid()
        self.update()
def calculate_average_bow_size(res_folder):
    """
    Calculate the average bow size for the URLBow database

    :param res_folder: folder containing the classification result files
    :return:
    """
    total_bow_sizes = {"right": 0, "wrong": 0, "swing": 0}
    bow_count = {"right": 0, "wrong": 0, "swing": 0}

    Logger.info("Average bow size, on right bow size")
    for right_res in RightResultsIter(res_folder):
        total_bow_sizes["right"] += len(URLBow.objects.get(index=right_res.ref_id).bow)
        bow_count["right"] += 1

    Logger.info("Average bow size, on wrong bow size")
    for wrong_res in WrongResultsIter(res_folder):
        if wrong_res.is_swing_sample():
            label = "swing"
        else:
            label = "wrong"

        bow_count[label] += 1
        total_bow_sizes[label] += len(URLBow.objects.get(index=wrong_res.ref_id).bow)

    print([(label, total / bow_count[label] if bow_count[label] != 0 else 1, bow_count[label])
           for label, total in total_bow_sizes.items()])
def get_genre_refs(*genres):
    genre_objs = []
    for agenre_obj in genres:
        # find all matching genres
        if isinstance(agenre_obj, str):
            genre_models = Genre.objects(genre=agenre_obj)
        else:
            genre_models = Genre.objects(genre=agenre_obj['genre'])

        if len(genre_models) == 0:
            if not isinstance(agenre_obj, str):
                genre_model = Genre()
                for (k, v) in agenre_obj.items():
                    genre_model[k] = v
            else:
                genre_model = Genre(genre=agenre_obj)

            try:
                genre_model.save()
            except:
                Logger.error("Error saving: " + str(agenre_obj))

            genre_objs.append(genre_model)
        else:
            genre_objs.extend(genre_models.all())

    return genre_objs
def __init__(self):
    self.logger = Logger()
    self.service = {
        "ObjectTracking": {
            "Performance": {
                "DetectionSpeed": "ProcessingTime",
                "DetectionAccuracy": "DetectionRate"
            },
            "Reliability": {
                "VideoContinuity": "FPS"
            },
            "Security": {
                "VideoComposition": "NumberOfComposedVideos"
            }
        },
        "ObjectCounting": {
            "Performance": {
                "DetectionSpeed": "ProcessingTime",
                "DetectionAccuracy": "DetectionRate"
            },
            "Reliability": {
                "VideoContinuity": "FPS"
            },
            "Security": {
                "VideoComposition": "NumberOfComposedVideos"
            }
        }
    }
    self.logger.debug("Get Service Knowledge")
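# Illustrative usage (not from the original source): the nested service-knowledge dictionary
# above maps service -> quality attribute -> requirement -> metric, so reading a metric is a
# three-level lookup. The class name `ServiceKnowledge` and variable names are assumptions
# for this sketch only.
#
#   knowledge = ServiceKnowledge()    # assumed name of the class owning the __init__ above
#   metric = knowledge.service["ObjectTracking"]["Performance"]["DetectionSpeed"]
#   # metric == "ProcessingTime"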
def set_binary_model(self, model_file_path: str):
    timer: Timer = Timer()
    Logger().start_analyzing("Loading binary Word2VecModel")
    self.model = KeyedVectors.load_word2vec_format(model_file_path, binary=True)
    Logger().finish_analyzing(timer.get_duration(), "Loading binary Word2VecModel")
def __init__(self, net_type, model_out_dir, frequency, electrodes,
             learning_rate=0.002, batch_size=32, epochs=30):
    """initializes the basic class variables

    Args:
        net_type: name of the network type, also used as the output sub-directory
        model_out_dir: directory where models and logs are written
        frequency: frequency band to use; if None, the raw 101-sample input is used
        electrodes: electrodes whose channels form the input
        learning_rate: the chosen learning rate
        batch_size: the amount of items per batch
        epochs: the amount of epochs
    """
    if frequency is None:
        self.input_shape = (len(electrodes), 101)
    else:
        self.input_shape = (len(electrodes), 5)

    self.frequency = frequency
    self.net_type = net_type
    self.model_out_dir = model_out_dir
    self.batch_size = batch_size
    self.learning_rate = learning_rate
    self.epochs = epochs
    self.model = None
    self.history = None
    self.logger = Logger(model_out_dir, self.net_type)

    if not os.path.exists(os.path.join(model_out_dir, self.net_type)):
        os.makedirs(os.path.join(model_out_dir, self.net_type))
def __init__(self, env='QA'): self.log = Logger("debug") self.opera = OperationIni(fileName='config.ini', pathName='config') path = '/website/saas/account/api2/user/login' self.key = env.lower() + '_token' d = get_env_authorization(env=env) self.url = d[0] + path self.cookie = d[1] self.userName = d[2] self.passWord = d[3] # if env == 'QA': # self.url = self.opera.read_ini(section='Authorization', key='qa_url') + path # self.cookie = self.opera.read_ini(section='Authorization', key='qa_cookie') # self.userName = self.opera.read_ini(section='Authorization', key='qa_username') # self.passWord = self.opera.read_ini(section='Authorization', key='qa_password') # if env == 'DEV': # self.url = self.opera.read_ini(section='Authorization', key='dev_url') + path # self.cookie = self.opera.read_ini(section='Authorization', key='dev_cookie') # self.userName = self.opera.read_ini(section='Authorization', key='dev_username') # self.passWord = self.opera.read_ini(section='Authorization', key='dev_password') self.headers = { 'Cookie': self.cookie, 'Content-Type': 'application/x-www-form-urlencoded' }
def __init__(self, root_path):
    self.root_path = root_path
    self.logger = Logger(self.__class__.__name__, self.root_path)
    self.log = self.logger.get_log()

    self.model = None
    self.visualizers = []
    self.sub_process = {}
def __init__(self, path=None, execute_interval=None, name=None):
    """create Visualizer

    :type path: str
    :type execute_interval: int
    :type name: str
    :param path: path for saving visualized result
    :param execute_interval: interval for execute
    :param name: naming for visualizer
    """
    self.execute_interval = execute_interval
    self.name = name
    self.visualizer_path = os.path.join(path, self.__str__())

    if not os.path.exists(path):
        os.mkdir(path)
    if not os.path.exists(self.visualizer_path):
        os.mkdir(self.visualizer_path)

    files = glob(os.path.join(self.visualizer_path, '*'))
    self.output_count = len(files)

    self.logger = Logger(self.__class__.__name__, self.visualizer_path)
    self.log = self.logger.get_log()
def apiMoveUserVision(userId):
    if request.method == 'POST':
        if SessionManager.userLoggedIn():
            userInfo = SessionManager.getUser()
            if userInfo['id'] != userId:
                abort(406)
            parameters = request.json
            if not 'visionId' in parameters or \
               not 'srcIndex' in parameters or \
               not 'destIndex' in parameters:
                abort(406)
            visionId = parameters['visionId']
            srcIndex = parameters['srcIndex']
            destIndex = parameters['destIndex']

            Logger.debug("V:%s src: %s dest: %s" % (visionId, srcIndex, destIndex))

            user = User.getById(userInfo['id'])
            result = user.moveVision(visionId, srcIndex, destIndex)
            if True == result:
                data = {'result': "success"}
            else:
                data = {'result': "error"}
            return jsonify(data)
        abort(403)
    abort(405)
def __init__(self, root_path=ROOT_PATH): """create DatasetManager todo """ self.root_path = root_path self.logger = Logger(self.__class__.__name__, self.root_path) self.log = self.logger.get_log() self.datasets = {}
def create():
    '''
    Debugging Tip:

    If you see:
        Bad Request
        The browser (or proxy) sent a request that this server could not understand.
    (a 400 error), make sure all of the form fields are given correctly.

    http://stackoverflow.com/questions/8552675/form-sending-error-flask
    '''
    mediaUrl = request.form[Constant.BOOKMARKLET_POST_MEDIA_URL]
    text = request.form[Constant.BOOKMARKLET_POST_TEXT]
    pageUrl = request.form[Constant.BOOKMARKLET_POST_PAGE_URL]
    pageTitle = request.form[Constant.BOOKMARKLET_POST_PAGE_TITLE]

    # Vision privacy
    private = False
    if Constant.BOOKMARKLET_POST_IS_PRIVATE in request.form:
        private = True
    # Format for saving
    visionIsPublic = not private

    # Validate parameters
    if mediaUrl is None \
            or text is None \
            or pageUrl is None \
            or pageTitle is None:
        return "Invalid Vision Parameters"

    Logger.debug("URL: " + mediaUrl)

    # Question: Do we really need to check the login again here?
    # Check login
    if not SessionManager.userLoggedIn():
        return redirect(url_for('login'))

    # Get the user id
    userId = SessionManager.getUser()['id']

    # Add
    user = User.getById(userId)
    if user:
        # TODO: should we save pageUrl and pageTitle also?
        vision, message = user.addVision(mediaUrl, text, False, visionIsPublic)
        if vision:
            # Successful create!
            return render_template('successCreatingVision.html',
                                   visionId=vision.id(),
                                   userId=userId)
    # Error
    return render_template('errorCreatingVision.html', message=message)
def apiAddUserVision(userId):
    if request.method == 'POST':
        if SessionManager.userLoggedIn():
            userInfo = SessionManager.getUser()
            if userInfo['id'] != userId:
                abort(406)
            parameters = request.json
            if not 'useImage' in parameters or \
               not 'text' in parameters or \
               not 'privacy' in parameters:
                abort(406)
            useImage = parameters['useImage']
            text = parameters['text'].strip()
            isPublic = parameters['privacy']

            Logger.debug("IsPublic: " + str(isPublic))

            # Make sure input OK to create a new vision
            if useImage == False:
                # TODO: should we allow text w/o image?
                # if useImage == False and len(text) == 0:
                abort(406)

            # Make sure image link OK
            url = ""
            if useImage == True:
                url = SessionManager.getPreviewUrl()

            # Create a new vision with the photo
            user = User.getById(userId)

            # Make sure we have a valid user
            if not user:
                data = {'result': "error"}
            else:
                vision, errorMsg = user.addVision(url, text, True, isPublic)
                if vision:
                    objList = []
                    if None != vision:
                        objList = VisionList.createFromVision(vision)
                    if len(objList.visions()) == 1:
                        data = {
                            'result': "success",
                            'newVision': objList.toDictionary(
                                options=[Vision.Options.PICTURE,
                                         Vision.Options.USER,
                                         Vision.Options.PARENT_USER,
                                         Vision.Options.COMMENT_PICTURES,
                                         Vision.Options.COMMENTS])[0]
                        }
                    else:
                        data = {'result': "error"}
            return jsonify(data)
        abort(403)
    abort(405)
def main(argv):
    parser = argparse.ArgumentParser(description='Upload a hub to display on Apollo.')
    parser.add_argument('-j', '--data_json', help='JSON file containing the metadata of the inputs')
    parser.add_argument('-o', '--output', help='HTML output')
    #parser.add_argument('-e', '--extra_file_path', help='Extra file path for generated jbrowse hub')
    #parser.add_argument('-d', '--jbrowsehub', help='Name of the HTML summarizing the content of the JBrowse Hub Archive')

    # Get the args passed in parameter
    args = parser.parse_args()

    json_inputs_data = args.data_json
    outputFile = args.output
    #outputFile = args.jbrowsehub

    ## Parse the JSON file with Reader
    reader = Reader(json_inputs_data)

    # Begin init variables
    extra_files_path = reader.getExtFilesPath()
    #user_email = reader.getUserEmail()
    species_name = reader.getSpeciesName()
    #apollo_host = reader.getApolloHost()
    apollo_port = reader.getPortNum()
    apollo_host = "http://localhost:" + apollo_port + "/apollo"
    #apollo_host = "http://localhost:8080/apollo"
    #apollo_user = reader.getApolloUser()
    apollo_admin_user = reader.getAdminUser()
    toolDirectory = reader.getToolDir()
    #jbrowse_hub = reader.getJBrowseHubDir()
    debug_mode = reader.getDebugMode()

    #### Logging management ####
    # If we are in debug mode, also print the debug dump to stdout
    log = Logger(tool_directory=toolDirectory, debug=debug_mode, extra_files_path=extra_files_path)
    log.setup_logging()
    logging.info("#### JBrowseArchiveCreator: Start to upload JBrowse Hub to Apollo instance: %s ####", apollo_host)
    logging.debug('JSON parameters: %s\n\n', json.dumps(reader.args))

    # Set up apollo
    apollo = ApolloInstance(apollo_host, apollo_admin_user, toolDirectory)

    jbrowse_hub_dir = _getHubDir(extra_files_path)
    apollo.loadHubToApollo(apollo_admin_user, species_name, jbrowse_hub_dir, admin=True)
    outHtml(outputFile, apollo_host, species_name)

    logging.info('#### JBrowseArchiveCreator: Congratulations! The JBrowse Hub has been uploaded! ####\n')
def __init__(self, logLevel, action):
    self.actionToRun = action
    self.logger = Logger(name="a2d2 thread", logFile=conf.APPLICATION_LOG_FILE, level=logLevel)
    threading.Thread.__init__(self)
    self.__stopFlag = False
    self.__bypass = False  # if True, actions are skipped in the periodic check
    self.logger.info("Initialised.")
def __init__(self, config):
    threading.Thread.__init__(self)
    self.logger = Logger()
    self.serviceList = []
    self.config = config
    self.ip = config['MQTT']['ip']
    self.port = int(config['MQTT']['port'])
def query_sql_keep_connection(db, sql):
    try:
        Logger.logDebug("query_db: " + sql)
        rs = db.query(sql)
        return rs
    except:
        traceback.print_exc(file=sys.stdout)
        db.close()
        raise
class DBManager(object):
    def __init__(self, mongo, collection):
        self.logger = Logger()
        self.logger.debug("INTO DBManager!")
        client = MongoClient(mongo["ip"],
                             username=mongo["username"],
                             password=mongo["password"],
                             authSource=mongo["database"],
                             authMechanism='SCRAM-SHA-1')
        database = client.get_database(mongo["database"])
        self.collection = database.get_collection(collection)

    def getCollection(self):
        return self.collection
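# Illustrative usage sketch (not from the original source): constructing a DBManager and
# reading from the wrapped pymongo collection. The config values and collection name below
# are placeholders; only the keys ("ip", "username", "password", "database") come from the
# constructor above.
#
#   mongo_cfg = {"ip": "127.0.0.1", "username": "user",
#                "password": "secret", "database": "edge_services"}
#   manager = DBManager(mongo_cfg, "service_records")
#   for doc in manager.getCollection().find():
#       print(doc)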
def execute_sql_keep_connection(db, sql):
    try:
        Logger.logDebug("execute_db: " + sql)
        affect = db.execute(sql)
        db.commit()
        return affect
    except:
        traceback.print_exc(file=sys.stdout)
        db.close()
        raise
def save_url(**kwargs):
    kwargs['url'] = replace_dot_url(kwargs['url'])
    url_model = URLToGenre(**kwargs)

    # initialize so the function returns None instead of raising NameError when save fails
    save_obj = None
    try:
        save_obj = url_model.save()
    except:
        Logger.error("Error saving: " + str(kwargs['url']))

    return save_obj
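# Illustrative usage sketch (not from the original source; the URL is a placeholder):
# save_url stores a URLToGenre document keyed by the dot-escaped URL and returns the saved
# object, or None when the save raised (given the save_obj initialisation above).
#
#   doc = save_url(url="http://www.example.com/")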
def scrape_links_from_position(self, pos):
    MongoDB.connect(settings.HOST_NAME, settings.PORT)
    links = self.__get_next_urls(pos)
    Logger.info(links)

    for link in links:
        self.scrape_link_and_child(link)

    Logger.debug('Process job completed')
    return 0
def __init__(self, logger_path=None):
    """create instance of AbstractModel

    :type logger_path: str
    :param logger_path: path for log file
        if logger_path is None, log only to stdout
    """
    if logger_path is None:
        self.logger = Logger(self.__class__.__name__, with_file=True)
    else:
        self.logger = Logger(self.__class__.__name__, logger_path)
    self.log = self.logger.get_log()
def __init__(self, root_path=ROOT_PATH): """ create a 'InstanceManager' at env_path :type root_path: str :param root_path: env path for manager """ self.root_path = root_path self.logger = Logger(self.__class__.__name__, self.root_path) self.log = self.logger.get_log() self.instance = None self.visualizers = {} self.subprocess = {}
def post(self, command):
    xml = request.data
    Logger().debug(xml)
    result = json.dumps(xmltodict.parse(xml)['service'])
    Logger().debug(result)
    if command == "start":
        self.serviceManager.receiveService(ServiceInstance(result))
    elif command == "stop":
        self.serviceManager.stopService(ServiceInstance(result))
    else:
        pass
def execute_sql(sql, dbcfg, dbtype = "oracle"): try: db = connect_db(dbcfg, dbtype) Logger.logDebug("execute_sql: " + sql) affect = db.execute(sql) db.commit() return affect except: traceback.print_exc(file=sys.stdout) raise finally: db.close()
def startService(name):
    logger = Logger()
    logger.debug("Start Service!")
    client = docker.from_env()
    service = client.services.create(
        "face_detection",
        name=name,
        networks=["swarm_net"],
        mounts=["/home/pi/video/face_detection/container:/data:rw"],
        mode="replicated",
        constraints=["node.labels.name==node03"])
    #container = client.containers.run("face_detection:latest", detach=True)
    return service
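# Illustrative usage sketch (not from the original source): startService only needs the
# desired swarm service name; the image, network, mount and placement constraint are fixed
# inside the function. The name below is a placeholder.
#
#   service = startService("face_detection_service")
#   print(service.name)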
def __init__(self, pack_keys=None):
    super().__init__()
    self.log = Logger(self.__class__.__name__)

    if pack_keys is None:
        pack_keys = self.class_pack.keys()

    self.pack = {}
    for key in pack_keys:
        self.pack[key] = self.class_pack[key]()

    self.optimize_result = {}
    self.params_save_path = SKLEARN_PARAMS_SAVE_PATH
def push_to_queue(number, url_doc):
    try:
        if URLQueue.objects(number=number):
            return None

        URLQueue(number=number, document=url_doc).save()
    except:
        try:
            Logger.error('Failed to save with url: {}'.format(url_doc['url']))
        except:
            Logger.error('Complete error to save number: {}'.format(number))
def __init__(self, ip, port, duration, name):
    self.logger = Logger()
    self.logger.debug("INTO DeviceAbstractor!")
    self.capabilityList = []
    self.ip = ip
    self.port = port
    self.duration = duration
    self.name = name
    self.profiler = Profiler(self.duration)
    self.doProfiling()
def __init__(self, pid, env='QA'):
    self.log = Logger("debug")
    opera = OperationIni(fileName='config.ini', pathName='config')
    self.get_skuId = GetGoodsDetail(env=env, pid=pid)
    self.get_access_token = GetAccessToken(env=env, pid=pid)
    # convert the env string to lower case
    env = env.lower()
    key = env + '_url'
    self.base_url = opera.read_ini(section='goods', key=key)
    self.path = opera.read_ini(section='goods', key='wholeUpdateStock')
    self.access_token = self.get_access_token.get_ini_access_token()
def __init__(self, env='QA'): self.log = Logger("debug") opera = OperationIni(fileName='config.ini', pathName='config') self.env = env self.get_access_token = GetAccessToken(env=env) # env字符串转小写 env = env.lower() key = env + '_url' self.url = opera.read_ini(section='goods', key=key) self.path = opera.read_ini(section='goods', key='queryGoodsDetail') self.access_token = self.get_access_token.get_ini_access_token()
def parse(self):
    Logger().start_analyzing(self.relative_path)
    self.identifier_list_model = LanguageParser().parse_file(self.extension, self.content)
    self.identifier_dictionary_model = IdentifierDictionaryModel(self.identifier_list_model)
    self.word_dictionary_model = WordDictionaryModel(self.identifier_dictionary_model)
    if Word2VecModel.instance.exists():
        self.calculate_semantic_metrics()
    self.identifier_dictionary_model.set_word_metrics(self.word_dictionary_model.get_dictionary())
    Logger().finish_analyzing(self.timer.get_duration(), self.relative_path)
class ServiceManager(object):
    def __init__(self, config):
        threading.Thread.__init__(self)
        self.logger = Logger()
        self.serviceList = []
        self.config = config

    def receiveService(self, serviceInstance):
        # run publishService in its own thread so receiveService returns immediately
        # (passing the bound method and args instead of calling it inline)
        t = threading.Thread(target=self.publishService, args=(serviceInstance,))
        self.serviceList.append([t, serviceInstance])
        t.start()

    def stopService(self, serviceInstance):
        ClusterManager.stopService(serviceInstance)

    #
    # Service Management
    #
    '''
    Name: publishService
    parameter: ServiceInstance
    action: let requirementInterpreter interpret the service's requirements in terms of device capabilities
            --> let resourceSelector select suitable nodes which satisfy the service's requirements
            --> let clusterManager make the selected nodes start the service
    '''
    def publishService(self, serviceInstance):
        self.logger.debug("PublishService starts!")

        # INTERPRET
        interpretedRequirement = RequirementInterpreter.interpret(serviceInstance)

        # SELECT
        serviceInstance.setInterpretedRequirement(interpretedRequirement)
        serviceCapabilityManager = ServiceCapabilityManager(self.config, serviceInstance)
        serviceCapabilityManager.start()
        selectedNodes = ResourceSelector.selectNodes(serviceInstance, serviceCapabilityManager)
        print("selected nodes: " + ", ".join(selectedNodes))
        self.logger.debug("selected nodes: " + ", ".join(selectedNodes))

        # START
        serviceInstance.setSeledtedNodes(selectedNodes)
        ClusterManager.startService(serviceInstance)
def scrape_link_and_child(self, parent_url):
    parent_url = base_util.replace_dot_url(parent_url)
    webpage_body, parent_url = self.scrape(base_util.unreplace_dot_url(parent_url), None)

    # exit if we failed to scrape the website
    if webpage_body is None:
        return

    MongoDB.save_page(url=parent_url, page=webpage_body)
    Logger.info('Completed page: ' + parent_url)

    # Now, we grab the children of this webpage
    all_ahref = [base_util.combine_parent_rel_link(parent_url, a.attrs['href'])
                 for a in BeautifulSoup(webpage_body, 'html.parser', from_encoding="utf-8").find_all('a')
                 if 'href' in a.attrs]

    child_urls = random.sample(all_ahref, settings.GET_X_CHILD) if len(all_ahref) >= settings.GET_X_CHILD else all_ahref

    # get rid of bad normalization
    if not re.match('^www[.].*$', parent_url):
        Logger.info('Updating bad url for {}'.format(parent_url))
        MongoDB.update_url(base_util.normalize_url(parent_url), parent_url)

    if len(child_urls) > 0:
        parent_genres = MongoDB.get_genre(parent_url)

        # get the children
        for child_url in child_urls:
            child_page = self.scrape(child_url, parent_url)

            if child_page is None:
                exploredset = set()
                tries = 0
                for url in set(all_ahref) ^ exploredset:
                    if tries == settings.MAX_RETRIES:
                        Logger.info('Max retry number exceeded')
                        break
                    Logger.info("trying new url: " + url)
                    child_page = self.scrape(url, parent_url)
                    if child_page is not None:
                        break
                    exploredset.add(url)
                    tries += 1

            if child_page is not None:
                MongoDB.save_modify_url(url=base_util.replace_dot_url(child_url),
                                        parent=[MongoDB.get_url_object(parent_url)],
                                        genre=parent_genres,
                                        page=child_page)
                Logger.info('Completed page: ' + child_url)
def main():
    script_name: str = PathExtractor().get_file_name(sys.argv[0])
    if len(sys.argv) != 2:
        Logger().usage(f'python {script_name} <wiki.en.raw.txt>')
        return

    file_path = sys.argv[1]
    if PathValidator().is_valid_files([file_path]):
        Logger().info(f'Input file: "{file_path}"')
        Logger().info("Starting to remove stopwords")
        timer = Timer()
        remove_stopwords(file_path)
        Logger().finish_script(timer.get_duration(), script_name)
def calculate_genres_per_instance(res_folder,classifiers=""): current_classifier=classifiers right_genresize_counter=collections.Counter() wrong_genresize_counter=collections.Counter() swing_genresize_counter=collections.Counter() Logger.info("Current on rights") #iterate over the right samples first, we don't write to file because right files are the same for right_res_obj in {x.ref_id: x for x in RightResultsIter(res_folder,classifiers)}.values(): assert isinstance(right_res_obj,ClassificationResultInstance) if right_res_obj.classifier != current_classifier: current_classifier=right_res_obj.classifier #now find the size of its genre right_genresize_counter.update([len(URLBow.objects.get(index=right_res_obj.ref_id).short_genres)]) Logger.info("Current on wrongs") swing_file=res_folder+"/{}swing.txt".format(classifiers+"_" if classifiers.strip()!="" else classifiers) wrong_file=res_folder+"/{}wrong_true.txt".format(classifiers+"_" if classifiers.strip()!="" else classifiers) with open(swing_file,mode="w") as swing_handle,open(wrong_file,mode="w") as wrong_handle: #iterate over the wrong samples for wrong_res_obj in {x.ref_id: x for x in WrongResultsIter(res_folder,classifiers)}.values(): assert isinstance(wrong_res_obj,ClassificationResultInstance) if wrong_res_obj.classifier != current_classifier: current_classifier=wrong_res_obj.classifier if wrong_res_obj.is_swing_sample(): swing_handle.write(str(wrong_res_obj)+"\n") swing_genresize_counter.update([len(URLBow.objects.get(index=wrong_res_obj.ref_id).short_genres)]) else: wrong_handle.write(str(wrong_res_obj)+"\n") #now find the size of its genre wrong_genresize_counter.update([len(URLBow.objects.get(index=wrong_res_obj.ref_id).short_genres)]) print("Wrong predicted sample distrbution: {}".format(sorted(wrong_genresize_counter.items(),key=operator.itemgetter(0)))) print("Right predicted sample distrbution: {}".format(sorted(right_genresize_counter.items(),key=operator.itemgetter(0)))) print("Swing sample distrbution: {}".format(sorted(swing_genresize_counter.items(),key=operator.itemgetter(0))))
def scrape_urls_multiproc(cls):
    # current position
    pos = MongoDB.get(MetaData, 'position', type='queue')
    # current cap
    cap = pos

    process_queue = queue.Queue(maxsize=settings.NUM_PROCESSES)

    # create all the necessary processes
    for p_num in range(0, settings.NUM_PROCESSES):
        p = mp.Process(target=WebScraper().scrape_links_from_position, args=[cap])

        # get corresponding objects
        process_queue.put(p)
        cap += settings.NUM_URLS_PER_PROCESS

        # now start
        p.start()

    head = process_queue.get()

    # wait and create new processes as needed
    while pos < MongoDB.count(URLQueue):
        head.join()

        if not head.exitcode == 0:
            Logger.error('Error with Process, terminating')
            return

        # update counter
        MongoDB.increment_url_counter(settings.NUM_URLS_PER_PROCESS)

        p = mp.Process(target=WebScraper().scrape_links_from_position, args=[cap])
        process_queue.put(p)
        p.start()

        # increase both cap and current position
        cap += settings.NUM_URLS_PER_PROCESS
        pos += settings.NUM_URLS_PER_PROCESS

        head = process_queue.get()
        print(p.exitcode)

    return cls
def get(self, url):
    self._randomized_wait()

    response = None
    try:
        response = self.http.request('GET', url, timeout=settings.TIME_OUT)
        self.bad_count = 0
    except:
        self.bad_count += 1

        # wait and sleep until we get an answer
        if self.bad_count >= settings.REQUEST_EXCEPTION_UNTIL_TEST_CONNECTION:
            while not self.testInternet():
                Logger.info('Waiting for internet')
                time.sleep(2)

            response = self.http.request('GET', url, timeout=settings.TIME_OUT)
            self.bad_count = 0

    return response
def scrape(self):
    home = self.http.get(dmoz_home)
    home_page_links = self._scrapeHomeAndGetLinks(home.data)

    # visit each link in the homepage and dig down
    # for url in home_page_links:
    i = 0
    while i < settings.NUM_RANDOM_WEBPAGE:
        result = self._scrapPage(home_page_links[random.randint(0, len(home_page_links) - 1)])

        if result is not None and MongoDB.get_url_object(result['url']) is None:
            i += 1
            try:
                page = utf_8_safe_decode(self.http.get(result['url']).data)
                MongoDB.save_modify_url(page=page, **result)
                Logger.info("Completed: " + result['url'])
            except Exception as ex:
                Logger.error(ex)
def scrape(self, url, parent):
    Logger.debug('Starting url scrape for {}'.format(url))
    config.last_url_and_parent = url + ', {}'.format('' if parent is None else parent)

    new_url = base_util.unreplace_dot_url(url)
    response = self.http.get(new_url)
    Logger.debug('Got URL')

    if not hasattr(response, 'data') and new_url.startswith('www.'):
        new_url = new_url.replace('www.', 'http://')
        response = self.http.get(new_url)

        if not hasattr(response, 'data'):
            new_url = new_url.replace('http://', 'http://www.')
            response = self.http.get(new_url)

    if hasattr(response, 'data'):
        body = base_util.utf_8_safe_decode(response.data)
    else:
        Logger.error('No data associated with ' + new_url)
        raise AttributeError(new_url + ':::No data')

    return body, new_url
def calculate_similarity():
    q = DBQueue("similarity_queue")
    genre_meta_data = GenreMetaData.objects.order_by("url")[q.get():]

    # init the Analytics
    analytics_coll = col.Analytics()
    if analytics_coll.select(name=ANALYTICS_NAME).find_one() is None:
        analytics_coll.create(
            alexa_total=0,
            edit_distance_count=0,
            total_edit_distance=0,
            alexa_match=0,
            name=ANALYTICS_NAME,
            alexa_genre_length=0,
        )

    urls = set()

    # calculate the similarity on a document-to-document basis
    for genre_meta in genre_meta_data:
        if genre_meta["url"] not in urls:
            urls.add(genre_meta["url"])
            Logger.info("Doing genre for url: {}".format(genre_meta["url"]))

            similarity_res = _calculate_similarity_document(genre_meta)

            analytics_obj = analytics_coll.select(name=ANALYTICS_NAME).find_one()
            for k in similarity_res.keys():
                similarity_res[k] += analytics_obj[k]

            analytics_coll.select(name=ANALYTICS_NAME).update(**similarity_res)

        q.increment()

    print("URL has a unique percent of {}".format(len(urls) / len(genre_meta_data) * 100))
            # (fragment: the handler method above begins earlier; here the request headers
            # are copied before forwarding the request)
                headers[key] = self.headers[key]

            response_tuple = conn.connect(str(self.command), str(self.path), headers, body)
            self.send_response(response_tuple[1])
            try:
                for key in response_tuple[3]:
                    self.send_header(key, response_tuple[3][key])
            except:
                import traceback
                traceback.print_exc()
            self.end_headers()
            self.wfile.write(response_tuple[4])
        except:
            self.send_error(500)


class ThreadServer(ThreadingMixIn, HTTPServer):
    pass


if __name__ == '__main__':
    # server = ProxyServer()
    # server.setDaemon(True)
    # server.start()
    # while True:
    #     pass
    server = ThreadServer(('localhost', 7890), HttpHandler)
    Logger.log('Starting server at 7890')
    server.serve_forever()
    # h = HttpConnection.get_single_instance()
    # h.connect('GET', 'http://www.zhihu.com', None)
def lda(lda, train_set, n_top_words):
    """
    Conduct lda with the train_set
    """
    lda.fit(train_set.X)

    vocab = None  # vocabulary used to map topic indices back to words
    topic_word = lda.topic_word_
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))


if __name__ == "__main__":
    clustering_logger = Logger()

    """
    Unsupervised Clustering bootstrap
    """
    mapping = {"short_genres": "short_genre", "index": "ref_index", "bow": "attr_map"}
    #s=SourceMapper(URLBow.objects(),mapping)

    X_pickle_path = os.path.join(PICKLE_DIR, "X_summary_pickle")
    y_pickle_path = os.path.join(PICKLE_DIR, "y_summary_pickle")
    ref_index_pickle_path = os.path.join(PICKLE_DIR, "refIndex_summary_pickle")

    mapping = {"short_genres": "short_genre", "index": "ref_index", "bow": "attr_map"}

    # SETTING UP LABEL
    settings = LearningSettings(type="unsupervised", dim_reduction="chi",
                                feature_selection="summary", num_attributes=10000)
def __init__(self, type_queue, position=0):
    self.queue = MetaData(type=type_queue)

    if self.queue.find_one() is None:
        Logger.info('Queue of Type: {} does not exist in database, creating'.format(type_queue))
        self.queue.create(type=type_queue, position=position).save()
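# Illustrative usage sketch (not from the original source): this constructor backs the
# DBQueue("similarity_queue") call used in calculate_similarity above; get() returns the
# stored position and increment() advances it. The queue type below is a placeholder.
#
#   q = DBQueue("bow_queue")
#   start_pos = q.get()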
# Get Config
app.config.from_object(DEFAULT_CONFIG)
if os.getenv('PROJECT_AWESOME_FLASK_SETTINGS'):
    app.config.from_envvar('PROJECT_AWESOME_FLASK_SETTINGS')

# Read LOCAL_DB and PROD from environment variables
# (these are set on heroku for production)
if os.getenv('LOCAL_DB'):
    app.config['LOCAL_DB'] = (os.getenv('LOCAL_DB') == "true")
if os.getenv('PROD'):
    app.config['PROD'] = (os.getenv('PROD') == "true")

# If we are using the production database
if app.config['LOCAL_DB'] == False:
    Logger.info(" ******** Using the Production DB - be careful! ******** ")

# Print the current status of the config variables
Logger.info("PROD=" + str(app.config['PROD']) +
            " DEBUG=" + str(app.config['DEBUG']) +
            " LOCAL_DB=" + str(app.config['LOCAL_DB']))

SITE_DOMAIN = "http://www.goprojectawesome.com"
if app.config['PROD'] == False:
    SITE_DOMAIN = "http://127.0.0.1:5000"


#
# Add methods to the Jinja2 context for creating URLs
#
def full_url_for(*args, **kwargs):
    '''Wrapper for url_for that prepends the domain to the path'''
def update_url(url, new_url):
    url = replace_dot_url(url)
    new_url = replace_dot_url(new_url)

    Logger.info('Updating {} to {}'.format(url, new_url))

    return URLToGenre.objects(url=url).update(url=new_url)
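# Illustrative usage sketch (not from the original source; both URLs are placeholders):
# update_url rewrites the stored key of an existing URLToGenre document, applying the same
# dot replacement to both the old and the new value before the query.
#
#   update_url("www.example.com/old", "www.example.com/new")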
def collect_bad_url():
    """
    Make bows of websites in the bad url list

    :return:
    """
    queue = DBQueue_old("genre_bow")

    # don't trust anything
    summarizer = Summarizer()
    bow = BagOfWords()
    short_genre_to_genre = coll.ShortGenre()
    url_to_bow = coll.URLBow()

    start_pos = queue.get()
    for c, line in enumerate(open("bad_url_summarize_bow.txt")):
        if c < start_pos:
            continue

        url = line.split(" ")[1].split(":::")[0]

        try:
            print('New url {} num: {}'.format(url, c))

            url_obj = coll.URLToGenre().select(url=url).find_one()

            if not hasattr(url_obj, "original") or not url_obj["original"]:
                print("Not original")
                continue

            # request the page anyway; most of the bad entries are due to bad pages
            data = Request().get_data(base_util.unreplace_dot_url(base_util.unreplace_dot_url(url_obj["url"])))

            if data is None:
                raise Exception('url {} No has page'.format(url))
            else:
                if not hasattr(url_obj, "page") or len(data) > len(url_obj["page"]):
                    print("updating data")
                    data = base_util.utf_8_safe_decode(data)

                    # save the page if the new page is significantly bigger than the old one
                    if not hasattr(url_obj, "page"):
                        url_obj.save(page=data)
                    else:
                        url_obj.update(page=data)

                    url_obj.reload()

                    if len(data) > len(url_obj.page):
                        raise Exception("Inconsistency b/w data and page data")

            #url_obj=repair.genre_to_genre_data(url_obj.document)

            # get genre strings
            # register the genre with the short genres for faster retrieval
            genre_string_list = []
            for g in url_obj.genre:
                normalized_string = base_util.normalize_genre_string(g["genre"])
                genre_string_list.append(normalized_string)
                short_genre_to_genre.select(short_genre=normalized_string).update(upsert=True, add_to_set__genres=g)

            Logger.info("Getting bow rep")

            # get BOW representation
            bow_dict = bow.get_word_count(summarizer.summarize(
                url_obj.page if isinstance(url_obj.page, str) else base_util.utf_8_safe_decode(url_obj)))

            if len(bow_dict) < 20:
                raise Exception("Words less than 20")

            Logger.info("Update count:" + str(bow_dict))

            # store the url bow in the urlbow table
            if not url_to_bow.select(url=url_obj["url"]).find_one():
                url_to_bow.create(url=url_obj["url"], bow=bow_dict, short_genres=genre_string_list)
            else:
                print('Exists bow url number {}'.format(url))

            queue.increment()

        except Exception as ex:
            Logger.error(url_obj['url'] + ":::" + str(ex),
                         "C:/Users/Kevin/Desktop/GitHub/Research/Webscraper/bad_url_summarize_bow1.txt")
def Worker_print(string):
    Logger.debug(string)
from data.training_testing import MultiData
from data.util import unpickle_obj
from classification.classification import feature_selection
from functools import partial
from util.base_util import normalize_genre_string
from util.genre import filter_genres
from util.Logger import Logger
from data.X_y import match_sets_based_on_ref_id
from classification.classification import classify, load_training_testing
import operator as op
from classification.results import ResCrossValidation

__author__ = 'Kevin'

supervised_logger = Logger()

genre_dict = {'Sports': 8757,
              'Business': 8553,
              'Shopping': 6920,
              'Computers': 6245,
              'Arts': 6165,
              'Society': 5841,
              'Recreation': 5770,
              'Health': 5418,
              'Science': 3662,
              'Games': 2767,
              'Reference': 2219,
              'Kids': 2142,
              'News': 1954,
              'Regional': 1949,