def __init__(self, language="en"):
    """Load the stop-word set for ``language`` into ``self.STOP_WORDS``.

    Word lists are read from ``text/stopwords-<language>.txt`` through
    ``FileHelper.loadResourceFile`` and stored in
    ``self._cached_stop_words`` keyed by language code; a language already
    present in that cache is not loaded again.

    If the list for ``language`` cannot be loaded, the English list
    (``text/stopwords-en.txt``) is cached under that language code instead.
    """
    # TODO replace 'x' with class
    # to generate dynamic path for file to load
    if language not in self._cached_stop_words:
        path = "text/stopwords-%s.txt" % language
        try:
            words = FileHelper.loadResourceFile(path).splitlines()
        except Exception:
            # Best-effort fallback to English.  Unlike a bare ``except:``
            # this lets KeyboardInterrupt / SystemExit propagate.
            words = FileHelper.loadResourceFile("text/stopwords-en.txt").splitlines()
        self._cached_stop_words[language] = set(words)
    self.STOP_WORDS = self._cached_stop_words[language]
def contents(self):
    """Yield a single ``(url, html_bytes)`` pair for the current test case.

    ``self.cls.id()`` is split on ``.`` into exactly five parts; the second,
    third and fifth (suite, module, function) locate the fixture at
    ``<parent of CURRENT_PATH>/data/<suite>/<module>/<func>.html``.
    The URL comes from ``self.cls.data['url']`` and the loaded content is
    encoded as UTF-8 before being yielded.
    """
    _test, suite, module, _cls, func = self.cls.id().split('.')
    base_dir = os.path.dirname(CURRENT_PATH)
    fixture = os.path.abspath(
        os.path.join(base_dir, "data", suite, module, "%s.html" % func)
    )
    html = FileHelper.loadResourceFile(fixture)
    yield self.cls.data['url'], html.encode('utf-8')
def loadCustomSiteMapping(self):
    """Fill ``self.customSiteMapping`` from ``images/known-image-css.txt``.

    Every non-blank line of the resource is expected to look like
    ``domain^css``; the part before the first ``^`` becomes the key and the
    remainder becomes the value.

    Raises:
        ValueError: if a non-blank line contains no ``^`` separator.
    """
    # TODO
    dataFile = FileHelper.loadResourceFile("images/known-image-css.txt")
    for line in dataFile.splitlines():
        if not line.strip():
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        # Split on the first '^' only: a CSS selector may itself contain
        # '^' (e.g. ``[class^=img]``), which previously broke the unpack.
        domain, css = line.split('^', 1)
        self.customSiteMapping.update({domain: css})
def content(self, req):
    """Return the HTML fixture belonging to the currently running test.

    The fixture path is ``<CURRENT_PATH>/data/extractors/<name>.html`` where
    ``<name>`` is whatever ``self.cls._get_current_testname()`` returns.
    ``req`` is accepted but not used.
    """
    test_name = self.cls._get_current_testname()
    fixture = os.path.abspath(
        os.path.join(CURRENT_PATH, "data", "extractors", "%s.html" % test_name)
    )
    return FileHelper.loadResourceFile(fixture)
def html_content(self, req):
    """Return the HTML fixture for the currently running image test.

    The file is looked up at
    ``<parent of CURRENT_PATH>/data/extractors/images/<name>/<name>.html``
    with ``<name>`` taken from ``self.cls._get_current_testname()``.
    ``req`` is accepted but not used.
    """
    test_name = self.cls._get_current_testname()
    images_dir = os.path.join(
        os.path.dirname(CURRENT_PATH), "data", "extractors", "images"
    )
    fixture = os.path.join(images_dir, test_name, "%s.html" % test_name)
    return FileHelper.loadResourceFile(os.path.abspath(fixture))
def loadCustomSiteMapping(self):
    """Fill ``self.customSiteMapping`` from ``images/known-image-css.txt``.

    The resource is loaded with ``FileHelper.loadResourceFile(path, "xx")``.
    Every non-blank line is expected to look like ``domain^css``; the part
    before the first ``^`` becomes the key and the remainder the value.

    Raises:
        ValueError: if a non-blank line contains no ``^`` separator.
    """
    # TODO
    dataFile = FileHelper.loadResourceFile("images/known-image-css.txt", "xx")
    for line in dataFile.splitlines():
        if not line.strip():
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        # Split on the first '^' only: a CSS selector may itself contain
        # '^' (e.g. ``[class^=img]``), which previously broke the unpack.
        domain, css = line.split('^', 1)
        self.customSiteMapping.update({domain: css})
def html_content(self, req):
    """Load and return the HTML page for the image test that is running.

    ``self.cls._get_current_testname()`` supplies both the directory name
    and the file stem, giving
    ``<parent of CURRENT_PATH>/data/extractors/images/<name>/<name>.html``.
    The ``req`` argument is ignored.
    """
    name = self.cls._get_current_testname()
    segments = (
        os.path.dirname(CURRENT_PATH),
        "data",
        "extractors",
        "images",
        name,
        "%s.html" % name,
    )
    html_file = os.path.abspath(os.path.join(*segments))
    return FileHelper.loadResourceFile(html_file)
def __init__(self, language='zh'):
    """Load the ``zh`` stop-word set into ``self.STOP_WORDS``.

    The ``language`` argument is overridden with ``'zh'`` regardless of the
    value passed in.  The word list comes from ``text/stopwords-zh.txt``
    (joined with ``os.path.join``) and is kept in
    ``self._cached_stop_words`` so it is only loaded when not yet cached.
    """
    # force zh language code
    language = 'zh'
    cache = self._cached_stop_words
    if language not in cache:
        resource = os.path.join('text', 'stopwords-%s.txt' % language)
        lines = FileHelper.loadResourceFile(resource).splitlines()
        cache[language] = set(lines)
    self.STOP_WORDS = cache[language]
def __init__(self, language='en'):
    """Load the stop-word set for ``language`` into ``self.STOP_WORDS``.

    The list is read from ``text/stopwords-<language>.txt`` (joined with
    ``os.path.join``) via ``FileHelper.loadResourceFile`` unless
    ``self._cached_stop_words`` already holds an entry for ``language``.
    Loader errors are not handled here and propagate to the caller.
    """
    # TODO replace 'x' with class
    # to generate dynamic path for file to load
    cache = self._cached_stop_words
    if language not in cache:
        resource = os.path.join('text', 'stopwords-%s.txt' % language)
        lines = FileHelper.loadResourceFile(resource).splitlines()
        cache[language] = set(lines)
    self.STOP_WORDS = cache[language]
def __init__(self, language="zh"):
    """Load the ``zh`` stop-word set into ``self.STOP_WORDS``.

    Whatever ``language`` is passed, it is replaced by ``"zh"``.  The list
    is read from ``text/stopwords-zh.txt`` only when
    ``self._cached_stop_words`` has no ``"zh"`` entry yet.
    """
    # force zh language code
    language = "zh"
    known = self._cached_stop_words
    if language not in known:
        resource = "text/stopwords-%s.txt" % language
        raw = FileHelper.loadResourceFile(resource)
        known[language] = set(raw.splitlines())
    self.STOP_WORDS = known[language]
def content(self, req):
    """Return the HTML fixture for the currently running video test.

    The fixture path is ``<CURRENT_PATH>/data/videos/<name>.html`` where
    ``<name>`` is the value of ``self.cls._get_current_testname()``.
    ``req`` is accepted but not used.
    """
    test_name = self.cls._get_current_testname()
    relative = os.path.join(CURRENT_PATH, "data", "videos", "%s.html" % test_name)
    return FileHelper.loadResourceFile(os.path.abspath(relative))
def getRawHtml(self):
    """Return the HTML fixture that corresponds to this test's id.

    ``self.id()`` must split on ``.`` into exactly five parts; the suite,
    module and function parts select
    ``<parent of CURRENT_PATH>/data/<suite>/<module>/<func>.html``.
    """
    _test, suite, module, _cls, func = self.id().split('.')
    data_root = os.path.join(os.path.dirname(CURRENT_PATH), "data")
    html_file = os.path.abspath(
        os.path.join(data_root, suite, module, "%s.html" % func)
    )
    return FileHelper.loadResourceFile(html_file)
def load_test_file(self, suffix):
    """Return the data file for the current test, or ``None`` if missing.

    ``self.id()`` must split on ``.`` into exactly four parts; the module
    and function parts select ``<CURRENT_PATH>/data/<module>/<func><suffix>``.

    Args:
        suffix: text appended directly to the function name (for example a
            file extension including its dot).

    Returns:
        The loaded content, or ``None`` after printing ``No File`` when the
        loader raises ``IOError``.
    """
    suite, module, cls, func = self.id().split('.')
    path = os.path.abspath(
        os.path.join(CURRENT_PATH, "data", module, "%s%s" % (func, suffix))
    )
    try:
        return FileHelper.loadResourceFile(path)
    except IOError:
        # Call form prints the same text on Python 2 and also parses on
        # Python 3, where the bare print statement is a SyntaxError.
        print("No File")
        return None
def content(self, req):
    """Return the HTML fixture for the currently running extractor test.

    The fixture path is ``<CURRENT_PATH>/data/extractors/<name>.html`` where
    ``<name>`` is the value of ``self.cls._get_current_testname()``.
    ``req`` is accepted but not used.

    Raises:
        Exception: when the loader returns ``None``.
    """
    test_name = self.cls._get_current_testname()
    fixture = os.path.abspath(
        os.path.join(CURRENT_PATH, "data", "extractors", "%s.html" % test_name)
    )
    html = FileHelper.loadResourceFile(fixture)
    if html is not None:
        return html
    raise Exception("Test could not be found")
def load_customesite_mapping(self):
    """Fill ``self.custom_site_mapping`` from ``images/known-image-css.txt``.

    Every non-blank line of the resource is expected to look like
    ``domain^css``; the part before the first ``^`` becomes the key and the
    remainder becomes the value.

    Raises:
        ValueError: if a non-blank line contains no ``^`` separator.
    """
    # TODO
    path = os.path.join('images', 'known-image-css.txt')
    data_file = FileHelper.loadResourceFile(path)
    for line in data_file.splitlines():
        if not line.strip():
            # Tolerate blank lines instead of failing on the unpack below.
            continue
        # Split on the first '^' only: a CSS selector may itself contain
        # '^' (e.g. ``[class^=img]``), which previously broke the unpack.
        domain, css = line.split('^', 1)
        self.custom_site_mapping.update({domain: css})
def loadData(self):
    """Parse the JSON fixture for the current test into ``self.data``.

    ``self.id()`` must split on ``.`` into exactly four parts; the module
    and function parts select ``<CURRENT_PATH>/data/<module>/<func>.json``.
    """
    _suite, module, _cls, func = self.id().split('.')
    json_file = os.path.abspath(
        os.path.join(CURRENT_PATH, "data", module, "%s.json" % func)
    )
    raw = FileHelper.loadResourceFile(json_file)
    self.data = json.loads(raw)
def loadData(self):
    """Parse the JSON fixture for the current test into ``self.data``.

    ``self.id()`` must split on ``.`` into exactly five parts; the suite,
    module and function parts select
    ``<parent of CURRENT_PATH>/data/<suite>/<module>/<func>/<func>.json``.
    """
    _test, suite, module, _cls, func = self.id().split(".")
    fixture_dir = os.path.join(
        os.path.dirname(CURRENT_PATH), "data", suite, module, func
    )
    json_file = os.path.abspath(os.path.join(fixture_dir, "%s.json" % func))
    raw = FileHelper.loadResourceFile(json_file)
    self.data = json.loads(raw)
def getRawHtml(self):
    """Load the raw HTML fixture matching this test's dotted id.

    The id returned by ``self.id()`` is unpacked into five dot-separated
    parts, of which suite, module and function name build the path
    ``<parent of CURRENT_PATH>/data/<suite>/<module>/<func>.html``.
    """
    _test, suite, module, _cls, func = self.id().split('.')
    parent_dir = os.path.dirname(CURRENT_PATH)
    file_name = "%s.html" % func
    location = os.path.join(parent_dir, "data", suite, module, file_name)
    html = FileHelper.loadResourceFile(os.path.abspath(location))
    return html
def __init__(self, language='en'):
    """Load the stop-word set for ``language`` into ``self.STOP_WORDS``.

    The list is read from ``text/stopwords-<language>.txt`` unless
    ``self._cached_stop_words`` already has an entry for ``language``.
    When the loader raises ``IOError`` an empty set is cached for that
    language instead.
    """
    # TODO replace 'x' with class
    # to generate dynamic path for file to load
    if language not in self._cached_stop_words:
        resource = os.path.join('text', 'stopwords-%s.txt' % language)
        try:
            words = FileHelper.loadResourceFile(resource).splitlines()
        except IOError:
            words = []
        self._cached_stop_words[language] = set(words)
    self.STOP_WORDS = self._cached_stop_words[language]
def __init__(self, language='en'):
    """Build ``self.STOP_WORDS`` for one or several language codes.

    Args:
        language: a single language code (``str``) or an iterable of codes.

    Side effects:
        * ``self.char_split`` is ``True`` when ``zh``, ``ko`` or ``ja`` is
          among the requested languages, ``False`` otherwise.
        * Each language not yet in ``StopWords._cached_stop_words`` is
          loaded from ``text/stopwords-<code>.txt`` and cached; if loading
          fails, an empty set is cached for that code.
        * ``self.STOP_WORDS`` is the cached set itself when one language is
          requested, a new set holding the union when several are, and
          ``None`` when ``language`` is an empty iterable.
    """
    # TODO replace 'x' with class
    # to generate dynamic path for file to load
    if isinstance(language, str):
        language = [language]
    language = set(language)
    self.char_split = any(code in language for code in ('zh', 'ko', 'ja'))
    self.STOP_WORDS = None
    for code in language:
        if code not in StopWords._cached_stop_words:
            path = 'text/stopwords-%s.txt' % code
            try:
                _stop_list = FileHelper.loadResourceFile(path)
                if code in ['zh', 'ko', 'ja']:
                    _stop_list = _stop_list.decode('utf-8')
                StopWords._cached_stop_words[code] = set(_stop_list.splitlines())
            except Exception:
                # Best-effort: a list that cannot be loaded or decoded
                # becomes an empty set.  ``except Exception`` (rather than a
                # bare ``except:``) lets KeyboardInterrupt / SystemExit
                # propagate.
                StopWords._cached_stop_words[code] = set()
        cached = StopWords._cached_stop_words[code]
        if self.STOP_WORDS is None:
            self.STOP_WORDS = cached
        else:
            # Build a NEW set.  The previous in-place ``|=`` mutated the
            # cached set of the first language, leaking other languages'
            # words into the shared cache.
            self.STOP_WORDS = self.STOP_WORDS | cached
def getHtml(self, filename):
    """Return the content of resource ``filename``.

    Delegates directly to ``FileHelper.loadResourceFile``.
    """
    html = FileHelper.loadResourceFile(filename)
    return html
def get_html(self, filename):
    """Return the content of ``<CURRENT_PATH>/data/<filename>``.

    The joined path is handed to ``FileHelper.loadResourceFile`` as is,
    without being made absolute.
    """
    data_dir = os.path.join(CURRENT_PATH, 'data')
    return FileHelper.loadResourceFile(os.path.join(data_dir, filename))
def get_html(self, filename):
    """Return the content of ``<CURRENT_PATH>/data/<filename>``.

    The path is made absolute with ``os.path.abspath`` before it is passed
    to ``FileHelper.loadResourceFile``.
    """
    html_file = os.path.abspath(os.path.join(CURRENT_PATH, "data", filename))
    return FileHelper.loadResourceFile(html_file)
def get_html(self, filename):
    """Return the content of ``<CURRENT_PATH>/data/<filename>``.

    The absolute path is printed to stdout (prefixed with ``CURRENT_PATH``)
    before the file is loaded through ``FileHelper.loadResourceFile``.
    """
    path = os.path.abspath(os.path.join(CURRENT_PATH, 'data', filename))
    # Call form prints the same text on Python 2 and also parses on
    # Python 3, where the bare print statement is a SyntaxError.
    print("CURRENT_PATH %s" % path)
    return FileHelper.loadResourceFile(path)
def getHtml(self, filename):
    """Return the content of resource ``filename``.

    Delegates to ``FileHelper.loadResourceFile`` with ``'x'`` as the second
    argument.
    """
    html = FileHelper.loadResourceFile(filename, 'x')
    return html
def __init__(self, language='en'):
    """Set up the punctuation pattern and stop-word set for ``language``.

    ``self.PUNCTUATION`` is a compiled pattern matching one character that
    is neither a word character nor whitespace.  ``self.STOP_WORDS`` is the
    set of lines of ``text/stopwords-<language>.txt``, loaded on every call
    through ``FileHelper.loadResourceFile(path, 'x')``.
    """
    # The previous pattern used Java-style \p{Ll}, \p{Lu}, ... property
    # classes, which Python's ``re`` module does not implement: Python 3
    # rejects them as bad escapes and Python 2 silently treated them as the
    # literal letters.  ``\w`` under re.UNICODE (letters, digits,
    # underscore) is the closest standard-library equivalent.
    # NOTE(review): this approximates, not exactly reproduces, the Java
    # set -- confirm that is acceptable for callers of PUNCTUATION.
    self.PUNCTUATION = re.compile(r"[^\w\s]", re.UNICODE)
    # TODO replace 'x' with class
    # to generate dynamic path for file to load
    path = 'text/stopwords-%s.txt' % language
    self.STOP_WORDS = set(FileHelper.loadResourceFile(path, 'x').splitlines())