def single_picky(slug='test'):
    try:
        f = open(PICKY_DIR + slug + '.md')
    except IOError:
        abort(404)
    picky = f.read()
    f.close()
    meta_regex = re.compile(
        r"^\s*(?:-|=){3,}\s*\n((?:.|\n)+?)\n\s*(?:-|=){3,}\s*\n*",
        re.MULTILINE
    )
    match = re.match(meta_regex, picky)
    if not match:
        abort(404)
    metas = match.group(1)
    title = None
    date = None
    meta = metas.split("\n")
    try:
        title = meta[0].split("=>")[1]
    except IndexError:
        title = meta[0].split("=>")[0]
    try:
        date = meta[1].split("=>")[1]
    except IndexError:
        date = meta[1].split("=>")[0]
    cont = to_unicode(picky[match.end():])
    content = to_markdown(cont)
    return template('picky.html', content=content, title=to_unicode(title),
                    date=to_unicode(date), slug=slug)
def __init__(self, input, transposed=True):
    """
    Initialize the matrix reader.

    The `input` refers to a file on the local filesystem, which is expected to
    be in the sparse (coordinate) Matrix Market format. Documents are assumed
    to be rows of the matrix (and document features are columns).

    `input` is either a string (file path) or a file-like object that supports
    `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).
    """
    logger.info("initializing corpus reader from %s" % input)
    self.input, self.transposed = input, transposed
    with utils.file_or_filename(self.input) as lines:
        try:
            header = utils.to_unicode(next(lines)).strip()
            if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; "
                    "instead found: \n%s" % (self.input, header))
        except StopIteration:
            pass

        self.num_docs = self.num_terms = self.num_nnz = 0
        for lineno, line in enumerate(lines):
            line = utils.to_unicode(line)
            if not line.startswith('%'):
                self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                if not self.transposed:
                    self.num_docs, self.num_terms = self.num_terms, self.num_docs
                break

    logger.info("accepted corpus with %i documents, %i features, %i non-zero entries" %
                (self.num_docs, self.num_terms, self.num_nnz))
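# Hedged usage sketch (not from the source): assuming the constructor above
# belongs to a Matrix Market reader class, here called `MmReader`, and that
# 'corpus.mm' is a hypothetical file in coordinate real general format.
reader = MmReader('corpus.mm', transposed=True)
print(reader.num_docs, reader.num_terms, reader.num_nnz)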
def load_cat2vec_format(cls, cat_model=None, sent_model=None, word_model=None):
    """
    Load sentence vectors
    """
    model = Category2Vec(None)
    count = 0
    if cat_model:
        logger.info("loading %s object(cat) from %s" % (cls.__name__, cat_model))
        for line in open(cat_model, "r"):
            line = line.rstrip()
            if count == 0:
                info = line.split()
                model.cat_len = int(info[0])
                model.layer1_size = int(info[1])
                model.sg = int(info[2])
                model.hs = int(info[3])
                model.negative = int(info[4])
                model.cbow_mean = int(info[5])
                model.cats = empty((model.cat_len, model.layer1_size), dtype=REAL)
                model.cat_no_hash = {}
                model.cat_id_list = []
            else:
                idx = count - 1
                row = line.split("\t")
                cat_id = utils.to_unicode(row[0])
                model.cat_no_hash[cat_id] = idx
                model.cat_id_list.append(cat_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.cats[idx][j] = float(vals[j])
            count += 1
    count = 0
    if sent_model:
        logger.info("loading %s object(sentence) from %s" % (cls.__name__, sent_model))
        for line in open(sent_model, "r"):
            line = line.rstrip()
            if count == 0:
                info = line.split()
                model.sents_len = int(info[0])
                model.sents = empty((model.sents_len, model.layer1_size), dtype=REAL)
                model.sent_no_hash = {}
                model.sent_id_list = []
            else:
                idx = count - 1
                row = line.split("\t")
                sent_id = utils.to_unicode(row[0])
                model.sent_no_hash[sent_id] = idx
                model.sent_id_list.append(sent_id)
                vals = row[1].split()
                for j in xrange(model.layer1_size):
                    model.sents[idx][j] = float(vals[j])
            count += 1
    if word_model:
        logger.info("loading word2vec from %s" % word_model)
        model.w2v = Word2Vec.load(word_model)
        model.vocab = model.w2v.vocab
    return model
def __init__ (self, id, uri, name, type):
    if id is None:
        self.id = DBRepository.id_counter
        DBRepository.id_counter += 1
    else:
        self.id = id

    self.uri = to_unicode (uri)
    self.name = to_unicode (name)
    self.type = to_unicode (type)
def __init__ (self, id, commit):
    if id is None:
        self.id = DBLog.id_counter
        DBLog.id_counter += 1
    else:
        self.id = id

    self.rev = to_unicode (commit.revision)
    self.committer = None
    self.author = None
    self.date = commit.date
    self.message = to_unicode (commit.message)
    self.composed_rev = commit.composed_rev
def __iter__(self):
    """Iterate through the lines in the source."""
    try:
        # Assume it is a file-like object and try treating it as such
        # Things that don't have seek will trigger an exception
        self.source.seek(0)
        for line in self.source:
            yield utils.to_unicode(line).split()
    except AttributeError:
        # If it didn't work like a file, use it as a string filename
        with utils.smart_open(self.source) as fin:
            for line in fin:
                yield utils.to_unicode(line).split()
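# Hedged usage sketch (not from the source): assuming the iterator above is
# part of a LineSentence-style corpus class, here called `LineSentence`;
# 'corpus.txt' is a hypothetical whitespace-tokenized text file.
for tokens in LineSentence('corpus.txt'):
    print(tokens)  # one list of unicode tokens per input line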
def response(self, msg, **kwargs):
    ## msg is the parsed and already handled data; actually, it is a dict.
    ## You can specify a type explicitly, e.g. response(msg, type='music'). All legal types are listed below.
    '''
    ex: response(message, type='yourType')
    optional kwargs: type='legal_types', content='yourContent', handler=foo, count=1
    ps: when type is news, the count kwarg is necessary
    supported types: text, image, voice, video, music, news
    '''
    # swap receiver and sender
    msg['receiver'], msg['sender'] = msg['sender'], msg['receiver']
    legal_types = ['text', 'music', 'image', 'voice', 'video', 'news']
    ## get some kwargs ##
    if kwargs.get('type'):
        type = kwargs.get('type')
    else:
        type = msg['type']
    if type == 'music':
        if not msg['hq_musurl']:
            msg['hq_musurl'] = msg['musurl']
    # keyword content ---- forces the type to text and returns a static string
    if kwargs.get('content'):
        msg['type'] = type = 'text'
        msg['content'] = to_unicode(kwargs.get('content'))
    if not type in legal_types:
        raise Exception("Illegal type! You can only choose one type from legal_types!")
    # keyword handler ---- a function object that accepts a dict and returns a modified dict
    else:
        msg['type'] = type
        if kwargs.get('handler'):
            msg = kwargs.get('handler')(msg)
    ## more kwargs ##
    if not type == 'news':
        template = to_unicode(getattr(Template(), type))
    else:
        count = kwargs.get('count')
        if count:
            temp = Template()
            template = to_unicode(temp.news(count))
        else:
            raise Exception('When type is set to news, the count kwarg is necessary!')
    logging.info(template.format(**msg))
    try:
        retdata = template.format(**msg)
    except:
        raise Exception("You didn't pass enough args or passed wrong args; please check which args the template needs. Reading template.py may help.")
    return retdata
def __init__(self, unique_name, base_filepath, parameters):
    """
    Arguments
    ---------
        unique_name : feature unique name
        base_filepath : filepath of feature config
        parameters : lexicon parameters, presented by dictionary
    """
    self.unique_name = unique_name
    self.parameters = parameters
    filepath = os.path.join(
        base_filepath,
        parameters[BagOfClustersFeature.PARAM_CLUSTERED_WORDS_FILEPATH])

    if parameters[BagOfClustersFeature.PARAM_ENABLED] == 'false':
        return

    print "Loading file with clusters of words: {}".format(filepath)
    with io.open(filepath, 'r', encoding='utf-8') as f:
        self.clustered_words = json.load(f, encoding='utf-8')

    print "Create dictionary with all clusters, accessed by cluster_id ..."
    self.clusters = {}
    for word in self.clustered_words.iterkeys():
        cluster_id = self.clustered_words[word]
        if cluster_id not in self.clusters:
            self.clusters[cluster_id] = []
        self.clusters[cluster_id].append(utils.to_unicode(word))
def add_header(self):
    if self.file is not None:
        dis = ""
        dis += "Script file : %s\n" % sys.argv[0]
        dis += "Date : %s\n" % time.strftime("%d/%m/%Y %H:%M:%S", self.gtime.start_date)
        dis += "\n%s\n" % self.format("Time(s)", "Scope", "Info")
        self.file.write(utils.to_unicode(dis))
def get_search_string(search_string=None):
    """Ask the user for a search string"""
    keyboard = xbmc.Keyboard(search_string, localize(30134))
    keyboard.doModal()
    if keyboard.isConfirmed():
        search_string = to_unicode(keyboard.getText())
    return search_string
def get_property(key, default=None, window_id=10000):
    """Get a Window property"""
    from xbmcgui import Window
    value = to_unicode(Window(window_id).getProperty(key))
    if value == '' and default is not None:
        return default
    return value
def get_project(self):
    """ Get the project """
    ret = ""
    mid = self.fields['project']
    if mid is not None:
        ret = mid['key']
    return to_unicode(ret)
def get_reporter(self):
    """ Get the issue reporter """
    ret = ""
    mid = self.fields['reporter']
    if mid is not None:
        ret = mid['displayName']
    return to_unicode(ret)
def get_type(self):
    """ Get the issue type """
    ret = ""
    mid = self.fields['issuetype']
    if mid is not None:
        ret = mid['name']
    return to_unicode(ret)
def get_assignee(self):
    """ Get the assignee """
    ret = ""
    mid = self.fields['assignee']
    if mid is not None:
        ret = mid['displayName']
    return to_unicode(ret)
def get_cluster_id(self, word):
    """
    Returns
    -------
        id of the cluster that contains the 'word'
    """
    return self.clustered_words[utils.to_unicode(word)]
def __iter__(self):
    # the entire corpus is one gigantic line -- there are no sentence marks at all
    # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens
    sentence, rest, max_sentence_length = [], b'', 1000
    with utils.smart_open(self.fname) as fin:
        while True:
            text = rest + fin.read(8192)  # avoid loading the entire file (=1 line) into RAM
            if text == rest:  # EOF
                sentence.extend(rest.split())  # return the last chunk of words, too (may be shorter/longer)
                if sentence:
                    yield sentence
                break
            # the last token may have been split in two... keep it for the next iteration
            last_token = text.rfind(b' ')
            words, rest = (utils.to_unicode(text[:last_token]).split(),
                           text[last_token:].strip()) if last_token >= 0 else ([], text)
            sentence.extend(words)
            while len(sentence) >= max_sentence_length:
                yield sentence[:max_sentence_length]
                sentence = sentence[max_sentence_length:]
def _create_des_(msg):
    if msg is None:
        return {}
    elif not isinstance(msg, types.StringTypes):
        raise pexception.PytestembError("Msg must be a string")
    else:
        return dict({"msg": "%s" % utils.to_unicode(msg)})
def get_priority(self):
    """ Get the priority """
    ret = ""
    pri = self.fields['priority']
    if pri is not None:
        ret = pri['name']
    return to_unicode(ret)
def get_local_features(token, word_freq=None):
    assert len(token) >= 1
    features = []
    ntoken = normalize(token, lowercase=False)
    if token.isalpha():
        if 'UpperCase' in features_on:
            if first_upper_case(ntoken):
                features += ['IsUpperCase']
        if 'AllUpperCase' in features_on:
            if all_upper_case(ntoken):
                features += ['IsAllUpperCase']
        if 'AllLowerCase' in features_on:
            if all_lower_case(ntoken):
                features += ['IsAllLowerCase']
        if 'Freq' in features_on:
            features += ['Freq:%s' % str(word_freq[ntoken])]
        if 'Rare' in features_on:
            if word_freq[ntoken] <= rare_thr:
                features += ['IsRare']
        if 'IsWord' in features_on:
            features += ['IsWord']
    elif token.isdigit():
        if 'Number' in features_on:
            features += ['IsNumber']
    elif token.isalnum():
        if 'AlphaNum' in features_on:
            features += ['IsAlphaNum']
    elif len(to_unicode(token)) == 1:
        if is_punct(token):
            if 'Separator' in features_on:
                features += ['IsSeparator']
        else:
            if 'NonAlphanum' in features_on:
                features += ['IsNonAlphanum']
    if 'Word' in features_on:
        if not any(x in features for x in ['IsNumber', 'IsAlphaNum']):
            features += ['W=%s' % ntoken]
    if 'Length' in features_on:
        features += ['Length:%s' % str(len(ntoken))]
    return features
def unfollow(program_name, title, program_id=None):
    """The API interface to unfollow a program used by the context menu"""
    move_down = bool(plugin.args.get('move_down'))
    from favorites import Favorites
    Favorites().unfollow(program_name=program_name,
                         title=to_unicode(unquote_plus(from_unicode(title))),
                         program_id=program_id,
                         move_down=move_down)
def __iter__(self):
    try:
        self.source.seek(0)
        for line in self.source:
            k = utils.to_unicode(line.rstrip()).split("\t")
            categories = k[3].split(" ")
            for cat in categories:
                if "/" in cat:
                    continue
                yield k[4:], k[1], cat
    except AttributeError:
        with utils.smart_open(self.source) as fin:
            for line in fin:
                k = utils.to_unicode(line.rstrip()).split("\t")
                categories = k[3].split(" ")
                for cat in categories:
                    if "/" in cat:
                        continue
                    yield k[4:], k[1], cat
def __init__(self, id, name):
    if id is None:
        self.id = DBTag.id_counter
        DBTag.id_counter += 1
    else:
        self.id = id

    self.name = to_unicode(name)
def __init__ (self, id, name):
    if id is None:
        self.id = DBTag.id_counter
        DBTag.id_counter += 1
    else:
        self.id = id

    self.name = to_unicode (name)
def _check_same_origin(self, current_url):
    """
    Check whether `current_url` has the same origin as the crawl's start URL.
    """
    current_url = to_unicode(current_url)
    url_part = urlparse.urlparse(current_url)
    url_origin = (url_part.scheme, url_part.netloc)
    return url_origin == self.origin
def replace(infile, outfile, find_what, replace_with, match_case=False,
            output_paragraphs=False):
    u"""Replace find_what with replace_with in docx or docm.

    :param infile: file in which replacement will be performed
    :type infile: str | unicode
    :param outfile: file in which the new content will be saved
    :type outfile: str | unicode
    :param find_what: text to search for
    :type find_what: str | unicode
    :param replace_with: text to replace find_what with
    :type replace_with: str | unicode
    :param match_case: True to make search case-sensitive
    :type match_case: bool
    :param output_paragraphs: True to output the paragraphs replaced
    :type output_paragraphs: bool
    """
    if not os.path.isfile(infile):
        raise ValueError('infile not found.')
    if not outfile:
        raise ValueError('outfile must be specified.')
    if not find_what:
        raise ValueError('find_what must be specified')
    if replace_with is None:
        replace_with = u''

    infile = to_unicode(infile)
    outfile = to_unicode(outfile)
    find_what = to_unicode(find_what)
    replace_with = to_unicode(replace_with)

    global count
    count = 0

    parts = extract_parts(infile)
    for part in parts:
        part['content'] = __replace_part(etree.fromstring(part['content']),
                                         find_what, replace_with, match_case,
                                         output_paragraphs)
    save_parts(parts, infile, outfile)
    print('Paragraphs replaced: {0}'.format(count))
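# Hedged usage sketch (not from the source): a hypothetical call to the
# docx/docm replace() function above; file names and search terms are made up
# purely for illustration.
replace('report.docx', 'report_fixed.docx',
        find_what='ACME Corp', replace_with='Example Inc', match_case=True)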
def get_terms_info(self, term):
    """
    returns : dict
        number of documents that include 'term', per sentiment class and
        in total (DocVocabulary.ALL)
    """
    uterm = to_unicode(term)
    return self.terms_info[uterm]
def createI18NFile(data):
    columns = json.loads(data.get("columns"))
    strprefix = data.get("strprefix")
    cns, ens = [], []
    for column in columns:
        if column.get("creat_i18n") == 'on':
            i18nStr = strprefix + column.get("column_name")
            if column.get("data_type") in ("date", "datetime"):
                cns.append({
                    "key": i18nStr + "_search",
                    "value": utils.to_unicode("查询" + column.get("cn_name"))
                })
                ens.append({
                    "key": i18nStr + "_search",
                    "value": "Search" + column.get("en_name")
                })
            cns.append({
                "key": i18nStr,
                "value": utils.to_unicode(column.get("cn_name"))
            })
            ens.append({"key": i18nStr, "value": column.get("en_name")})
    cns.append({
        "key": "com.zhiyin.mes.app.to",
        "value": utils.to_unicode("到")
    })
    ens.append({"key": "com.zhiyin.mes.app.to", "value": "To"})
    if data.get("checkFactory") == 'on':
        cns.append({
            "key": "com.zhiyin.mes.app.factory_name",
            "value": utils.to_unicode("所属工厂")
        })
        ens.append({
            "key": "com.zhiyin.mes.app.factory_name",
            "value": "FactoryName"
        })
    utils.create_file(
        data.get("packageName") + "/i18n",
        render_template('/generate/i18n.txt', data=cns),
        data.get("packageName") + ".datagrid_zh_CN.properties")
    utils.create_file(
        data.get("packageName") + "/i18n",
        render_template('/generate/i18n.txt', data=ens),
        data.get("packageName") + ".datagrid_en_US.properties")
def __init__ (self, id, person):
    if id is None:
        self.id = DBPerson.id_counter
        DBPerson.id_counter += 1
    else:
        self.id = id

    self.name = to_unicode (person.name)
    self.email = person.email or None
def ReplyText(ToUserName, FromUserName, Content):
    Temp = """<xml>
    <ToUserName><![CDATA[%s]]></ToUserName>
    <FromUserName><![CDATA[%s]]></FromUserName>
    <CreateTime>%s</CreateTime>
    <MsgType><![CDATA[text]]></MsgType>
    <Content><![CDATA[%s]]></Content>
    </xml>"""
    return Temp % (ToUserName, FromUserName, str(int(time.time())), to_unicode(Content))
def __getitem__(self, name):
    '''Get a header value, from the message, decoded and as a unicode string.

    If the header does not exist, None is returned'''
    value = self._msg[name]
    if value is None:
        return None
    return u''.join(to_unicode(*tupl) for tupl in decode_header(value))
def __init__ (self, id, file_name):
    if id is None:
        self.id = DBFile.id_counter
        DBFile.id_counter += 1
    else:
        self.id = id

    self.file_name = to_unicode (file_name)
    self.repository_id = None
def get_setting(key, default=None):
    """Get an add-on setting as string"""
    try:
        value = to_unicode(ADDON.getSetting(key))
    except RuntimeError:  # Occurs when the add-on is disabled
        return default
    if value == '' and default is not None:
        return default
    return value
def __init__(self, id, person):
    if id is None:
        self.id = DBPerson.id_counter
        DBPerson.id_counter += 1
    else:
        self.id = id

    self.name = to_unicode(person.name)
    self.email = person.email or None
def __init__(self, id, file_name):
    if id is None:
        self.id = DBFile.id_counter
        DBFile.id_counter += 1
    else:
        self.id = id

    self.file_name = to_unicode(file_name)
    self.repository_id = None
def push_upnext(self):
    """Push episode info to Up Next service add-on"""
    if has_addon('service.upnext') and get_setting_bool('useupnext', default=True) and self.isPlaying():
        info_tag = self.getVideoInfoTag()
        next_info = self.apihelper.get_upnext(dict(
            program=to_unicode(info_tag.getTVShowTitle()),
            playcount=info_tag.getPlayCount(),
            rating=info_tag.getRating(),
            path=self.path,
            runtime=self.total,
        ))
        if next_info:
            from base64 import b64encode
            from json import dumps
            data = [to_unicode(b64encode(dumps(next_info).encode()))]
            sender = '{addon_id}.SIGNAL'.format(addon_id=addon_id())
            notify(sender=sender, message='upnext_data', data=data)
def get_term_in_voc_count(self, term, sentiment=None):
    """
    term : string
    sentiment : '-1', '0', '1'
        Describes the sentiment class
    """
    uterm = to_unicode(term)
    term_counters = self.term_in_voc_count[self.get_term_index(uterm)]
    return term_counters[sentiment] if sentiment is not None \
        else term_counters[self.ALL]
def get_release_note(self):
    if self.notes is None:
        field = self.parent.field_id_map['Release Note']
        if field in self.fields:
            self.notes = to_unicode(self.fields[field])
        elif self.get_incompatible_change() or self.get_important():
            self.notes = self.get_description()
        else:
            self.notes = ""
    return self.notes
def filter_wiki(raw):
    """
    Filter out wiki mark-up from `raw`, leaving only text. `raw` is either unicode
    or utf-8 encoded string.
    """
    # parsing of the wiki markup is not perfect, but sufficient for our purposes
    # contributions to improving this code are welcome :)
    text = utils.to_unicode(raw, 'utf8', errors='ignore')
    text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
    return remove_markup(text)
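# Hedged usage sketch (not from the source): a hypothetical call to
# filter_wiki() above; the markup string is made up for illustration.
raw = "'''Anarchism''' is a [[political philosophy]] &amp; movement."
print(filter_wiki(raw))  # plain text with wiki markup and HTML entities stripped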
def get_all_tones_from_table(self):
    logging.info("Loading lexicon '%s': %s ..." % (self.unique_name, self.table_filepath))
    df = pd.read_csv(self.table_filepath, sep=',')
    for row in df.index:
        name = df[self.term_column_name][row]
        value = df[self.value_column_name][row]
        self.cache[utils.to_unicode(name)] = float(value)
def load_model(self, model: str, model_path: str):
    try:
        encoding = 'utf-8'
        unicode_errors = 'strict'
        model_file = [f for f in os.listdir(model_path)
                      if os.path.isfile(os.path.join(model_path, f))]
        f = open(os.path.join(model_path, model_file[0]), 'rb')
        header = to_unicode(f.readline(), encoding=encoding)
        # throws for invalid file format
        vocab_size, vector_size = (int(x) for x in header.split())
        binary_len = dtype(real).itemsize * vector_size
        for _ in tqdm(range(vocab_size)):
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    break
                if ch == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                    word.append(ch)
            word = to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
            weights = fromstring(f.read(binary_len), dtype=real).astype(real)
            self.word_vectors[word] = weights
        self.model = model
        print("Model loaded Successfully !")
        return self
    except Exception as e:
        print('Error loading Model, ', str(e))
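# Hedged usage sketch (not from the source): assuming load_model() above is a
# method of an embeddings wrapper class, here called `EmbeddingModel`, and that
# './models/word2vec/' is a hypothetical directory containing a single file in
# the plain word2vec C binary format that the loop above parses.
embeddings = EmbeddingModel()
embeddings.load_model('word2vec', './models/word2vec/')
vector = embeddings.word_vectors.get(u'example')  # numpy array, or None if absent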
def get_terms_info_safe(self, term):
    """
    returns : dict or None
        number of documents that include 'term', per sentiment class and
        in total (DocVocabulary.ALL)
    """
    uterm = to_unicode(term)
    if uterm in self.terms_info:
        return self.get_terms_info(uterm)
    else:
        return None
def _download_url(self, url):
    """ Return a tuple (status, content), with content being the content of the
    url as a string, and status a boolean marking whether the download was
    successful or not."""
    try:
        web = urllib2.urlopen(url)
    except Exception as e:
        result = (False, to_unicode(str(e)))
    else:
        result = (True, web.read())
    return result
def main():
    args = parse_args()
    cfg = read_cfg(args.config)
    if args.reloads is not None:
        if 'all' in args.reloads:
            tags = cfg.keys()
        else:
            tags = args.reloads
        tags = tuple(to_unicode(tag) for tag in tags if to_unicode(tag) in cfg)
        reload_cfg(cfg, tags)
    elif args.failovers is not None:
        if 'all' in args.failovers:
            tags = cfg.keys()
        else:
            tags = args.failovers
        tags = tuple(to_unicode(tag) for tag in tags if to_unicode(tag) in cfg)
        do_failover(cfg, tags)
    else:
        start_scheduler(cfg, args.seconds)
def replace(infile, outfile, find_what, replace_with, match_case=False,
            output_paragraphs=False):
    u"""Replace find_what with replace_with in pptx or pptm.

    :param infile: file in which replacement will be performed
    :type infile: str | unicode
    :param outfile: file in which the new content will be saved
    :type outfile: str | unicode
    :param find_what: text to search for
    :type find_what: str | unicode
    :param replace_with: text to replace find_what with
    :type replace_with: str | unicode
    :param match_case: True to make search case-sensitive
    :type match_case: bool
    :param output_paragraphs: True to output the paragraphs replaced
    :type output_paragraphs: bool
    """
    if not os.path.isfile(infile):
        raise ValueError('infile not found.')
    if not outfile:
        raise ValueError('outfile must be specified.')
    if not find_what:
        raise ValueError('find_what must be specified')
    if replace_with is None:
        replace_with = u''

    infile = to_unicode(infile)
    outfile = to_unicode(outfile)
    find_what = to_unicode(find_what)
    replace_with = to_unicode(replace_with)

    global count
    count = 0

    parts = extract_parts(infile)
    for part in parts:
        part['content'] = __replace_part(etree.fromstring(part['content']),
                                         find_what, replace_with, match_case,
                                         output_paragraphs)
    save_parts(parts, infile, outfile)
    print('Paragraphs replaced: {0}'.format(count))
def parse_user_msg(signature, xml):
    """
    Parse xml from wechat server and return a Message

    :param xml: raw xml from wechat server.
    :return: a Message object
    """
    if not xml:
        return
    _msg = dict((child.tag, to_unicode(child.text))
                for child in ElementTree.fromstring(xml))
    return pkgRequest(signature, **_msg)
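# Hedged usage sketch (not from the source): feeding a minimal WeChat-style XML
# payload to parse_user_msg() above; the signature value and field contents are
# made up for illustration.
xml = (
    '<xml>'
    '<ToUserName>gh_123</ToUserName>'
    '<FromUserName>user_abc</FromUserName>'
    '<MsgType>text</MsgType>'
    '<Content>hello</Content>'
    '</xml>'
)
request = parse_user_msg('fake-signature', xml)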
def onNotification(self, sender, method, data):  # pylint: disable=invalid-name
    """Handler for notifications"""
    # log(2, '[Notification] sender={sender}, method={method}, data={data}', sender=sender, method=method, data=to_unicode(data))
    # Handle play_action events from upnextprovider
    if sender.startswith('upnextprovider') and method.endswith('plugin.video.vrt.nu_play_action'):
        from json import loads
        hexdata = loads(data)
        if not hexdata:
            return
        # NOTE: With Python 3.5 and older json.loads() does not support bytes or bytearray, so we convert to unicode
        from base64 import b64decode
        data = loads(to_unicode(b64decode(hexdata[0])))
        log(2, '[Up Next notification] sender={sender}, method={method}, data={data}', sender=sender, method=method, data=to_unicode(data))
        self._playerinfo.add_upnext(data.get('video_id'))
def feed_url(self, url):
    """
    Set the initial URL to crawl.
    """
    if isinstance(url, basestring):
        url = to_unicode(url)
        url = UrlData(url)

    if self.same_origin:
        url_part = urlparse.urlparse(unicode(url))
        self.origin = (url_part.scheme, url_part.netloc)

    self.fetcher_queue.put(url, block=True)
def feed_url(self, url):
    '''
    Set the initial URL to crawl.
    '''
    if isinstance(url, basestring):
        url = to_unicode(url)
        url = UrlData(url)

    if self.same_origin:
        url_part = urlparse.urlparse(unicode(url))
        self.origin = (url_part.scheme, url_part.netloc)

    self.fetcher_queue.put(url, block=True)
def get_term_in_docs_count(self, term, sentiment=None):
    """
    term : str
    sentiment : None or '-1', '0', '1'
        sentiment class name

    returns : int
        number of documents in which 'term' appears, either across all
        documents or only within documents of the given 'sentiment' class
    """
    uterm = to_unicode(term)
    term_info = self.terms_info[uterm]
    return term_info[self.ALL] if sentiment is None \
        else self.__get_term_in_sentiment_docs_count(term_info, sentiment)
def get_json_data(response, fail=None):
    """Return json object from HTTP response"""
    from json import load, loads
    try:
        if (3, 0, 0) <= version_info < (3, 6, 0):  # the JSON object must be str, not 'bytes'
            return loads(to_unicode(response.read()))
        return load(response)
    except TypeError as exc:  # 'NoneType' object is not callable
        log_error('JSON TypeError: {exc}', exc=exc)
        return fail
    except ValueError as exc:  # No JSON object could be decoded
        log_error('JSON ValueError: {exc}', exc=exc)
        return fail
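# Hedged usage sketch (not from the source): passing a urlopen() response to
# get_json_data() above; the URL is a placeholder, not taken from the add-on.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2
response = urlopen('https://example.com/api.json')
data = get_json_data(response, fail={})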