def get_images_and_capture_dates(self):
    img_file_names = []
    if self.recursive:
        for root, _, file_names in os.walk(self.path):
            img_file_names += [to_unicode(os.path.join(root, file_name))
                               for file_name in file_names
                               if self.pattern.match(file_name)]
    else:
        img_file_names = [to_unicode(os.path.join(self.path, file_name))
                          for file_name in os.listdir(self.path)
                          if os.path.isfile(os.path.join(self.path, file_name))
                          and self.pattern.match(file_name)]
    capture_times = []
    images_with_times = []
    for img_file_name in img_file_names:
        with open(img_file_name, 'rb') as img_file:
            try:
                tags = exifread.process_file(img_file, stop_tag="DateTimeOriginal",
                                             details=False)
            except Exception:
                continue  # if a file is corrupt in any way, skip it
        date = tags.get("EXIF DateTimeOriginal") or tags.get("EXIF DateTimeDigitized") \
            or tags.get("Image DateTime")
        if not date:
            continue  # skip images without info on when they were taken
        try:
            # EXIF date format is YYYY:MM:DD HH:MM:SS
            date = datetime.strptime(date.printable, "%Y:%m:%d %H:%M:%S").date()
        except ValueError:
            continue  # skip image if parsing the date fails
        capture_times.append(date)
        images_with_times.append(img_file_name)
    return images_with_times, capture_times
def to_xml(self):
    args = (
        self.touser,
        self.fromuser,
        self.create_time,
        self.msg_type,
        to_unicode(self.music_title),
        to_unicode(self.music_description),
        self.music_url,
        self.music_url,
        self.music_hq_url,
        self.func_flag,
    )
    return self.xml_template % args
def save(self):
    """Write the configuration options to the primary file."""
    if not self.filename:
        return
    # Only save options that differ from the defaults
    sections = []
    for section in self.sections():
        options = []
        for option in self[section]:
            default = None
            if self.parent:
                default = self.parent.get(section, option)
            current = self.parser.has_option(section, option) and \
                      to_unicode(self.parser.get(section, option))
            if current is not False and current != default:
                options.append((option, current))
        if options:
            sections.append((section, sorted(options)))
    fileobj = open(self.filename, 'w')
    try:
        fileobj.write('# -*- coding: utf-8 -*-\n\n')
        for section, options in sections:
            fileobj.write('[%s]\n' % section)
            for key, val in options:
                if key in self[section].overridden:
                    fileobj.write('# %s = <inherited>\n' % key)
                else:
                    val = val.replace(CRLF, '\n').replace('\n', '\n ')
                    fileobj.write('%s = %s\n' % (key, val.encode('utf-8')))
            fileobj.write('\n')
    finally:
        fileobj.close()
def __init__(self, trigger, func, args, kwargs, misfire_grace_time,
             coalesce, name=None, max_runs=None, max_instances=1):
    if not trigger:
        raise ValueError('The trigger must not be None')
    if not hasattr(func, '__call__'):
        raise TypeError('func must be callable')
    if not hasattr(args, '__getitem__'):
        raise TypeError('args must be a list-like object')
    if not hasattr(kwargs, '__getitem__'):
        raise TypeError('kwargs must be a dict-like object')
    if misfire_grace_time <= 0:
        raise ValueError('misfire_grace_time must be a positive value')
    if max_runs is not None and max_runs <= 0:
        raise ValueError('max_runs must be a positive value')
    if max_instances <= 0:
        raise ValueError('max_instances must be a positive value')

    self._lock = Lock()
    self.trigger = trigger
    self.func = func
    self.args = args
    self.kwargs = kwargs
    self.name = to_unicode(name or get_callable_name(func))
    self.misfire_grace_time = misfire_grace_time
    self.coalesce = coalesce
    self.max_runs = max_runs
    self.max_instances = max_instances
    self.runs = 0
    self.instances = 0
def prepro(self, contexts):
    num = len(contexts)
    context_tokens = []
    for text in contexts:
        text = to_unicode(text)
        # split on both ASCII and full-width Chinese punctuation
        lst = re.split(r",|\?|!|。|,|?|!", text)
        tokens = []
        for x in lst:
            para_tokens = word_tokenize(x)
            tokens.append(para_tokens)
        context_tokens.append(tokens)
    context_idxs = np.zeros([self.batch_size, self.para_max_num, self.para_max_length],
                            dtype=np.int32)

    def _get_word(each):
        if each in self.word2idx_dict:
            return self.word2idx_dict[each]
        return 1  # OOV id

    for b, context_token in enumerate(context_tokens):
        for i, tokens in enumerate(context_token):
            if i < self.para_max_num:
                for j, token in enumerate(tokens):
                    if j < self.para_max_length:
                        context_idxs[b, i, j] = _get_word(token)
    return context_idxs
def _check(self, value):
    valid = (value and value.startswith('1') and value.isdigit()
             and len(util.to_unicode(value)) == 11)
    if valid:
        data = value
    else:
        data = self._messages['default']
    return valid, data
def __init__(self, field_name=None, default_value=None):
    self._messages = {}
    self._messages.update(_default_messages)
    self._message_vars = {
        'name': util.to_unicode(field_name),
    }
    if default_value is not None:
        self._default_value = default_value
def _check(self, value):
    if not self._check_mm(len(util.to_unicode(value))):
        message = self._messages[self._message_key + '_len']
        return False, message
    if self._format:
        import re
        if not re.match(self._format, value):
            return False, self._messages['format']
    return True, value
def to_xml(self):
    args = (
        self.touser,
        self.fromuser,
        self.create_time,
        self.msg_type,
        0,
        to_unicode(self.content),
    )
    return self.xml_template % args
def process_body(self, body, url, obj_id):
    body = to_unicode(body)
    body = body.replace('<?xml version="1.0" encoding="utf-8"?>', "")
    body = self.cleaner.clean_html(body)
    with open("../data/mining_task/" + str(obj_id), "wb") as fout:
        g = gzip.GzipFile(mode="wb", fileobj=fout)
        try:
            g.write(body.encode("utf-8"))
        finally:
            g.close()
    print(url)
def _check(self, value):
    if not self._check_mm(len(util.to_unicode(value))):
        message = self._messages[self._message_key + '_len']
        return False, message
    valid = value.replace('-', '').replace(' ', '').isdigit()
    if not valid:
        data = self._messages['default']
    else:
        data = value
    return valid, data
def get(self):
    users = WXUser.all()
    p = Push()
    if not users.count():
        return
    opener = poster.streaminghttp.register_openers()
    weatherinfo = json.loads(opener.open(settings.weather1_url % settings.weather_city,
                                         timeout=5).read())['weatherinfo']
    logging.info(weatherinfo)
    city = weatherinfo['city']
    temp = weatherinfo['temp']
    wd = weatherinfo['WD']
    ws = weatherinfo['WS']
    sd = weatherinfo['SD']  # humidity (湿度)
    time = weatherinfo['time']
    args = (to_unicode(city), temp, to_unicode(wd), to_unicode(ws), sd, time)
    logging.info(str(args))
    for user in users:
        msg = '''
城市:%s
温度:%s 摄氏度
风向:%s
风力:%s
湿度:%s
发布时间:%s''' % (to_unicode(city), temp, to_unicode(wd), to_unicode(ws), sd, time)
        logging.info(msg)
        p.send_txt_msg(user.fake_id, msg)
def set(self, name, value):
    """Change a configuration value.

    These changes are not persistent unless saved with `save()`.
    """
    if not self.config.parser.has_section(self.name):
        self.config.parser.add_section(self.name)
    if value is None:
        self.overridden[name] = True
        value = ''
    else:
        value = to_unicode(value).encode('utf-8')
    return self.config.parser.set(self.name, name, value)
def parse_message(xml):
    '''Parse received weixin (WeChat) XML into a message object.'''
    if not xml:
        return
    logging.info(xml)
    root = et.fromstring(xml)
    _msg = dict(
        touser=root.find('ToUserName').text,
        fromuser=root.find('FromUserName').text,
        create_time=root.find('CreateTime').text,
    )
    msg_type = root.find('MsgType').text
    if msg_type == 'text':
        _msg['content'] = to_unicode(root.find('Content').text)
        _msg['msg_id'] = root.find('MsgId').text
        return TextMessage(**_msg)
    elif msg_type == 'image':
        _msg['pic_url'] = root.find('PicUrl').text
        _msg['msg_id'] = root.find('MsgId').text
        return ImageMessage(**_msg)
    elif msg_type == 'location':
        # WeChat sends the coordinates as Location_X / Location_Y
        _msg['x'] = root.find('Location_X').text
        _msg['y'] = root.find('Location_Y').text
        _msg['scale'] = root.find('Scale').text
        _msg['label'] = to_unicode(root.find('Label').text)
        _msg['msg_id'] = root.find('MsgId').text
        return LocationMessage(**_msg)
    elif msg_type == 'link':
        _msg['title'] = to_unicode(root.find('Title').text)
        _msg['description'] = to_unicode(root.find('Description').text)
        _msg['url'] = root.find('Url').text
        _msg['msg_id'] = root.find('MsgId').text
        return LinkMessage(**_msg)
    elif msg_type == 'event':
        _msg['event'] = root.find('Event').text
        _msg['event_key'] = root.find('EventKey').text
        _msg['msg_id'] = None
        return EventMessage(**_msg)
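# A hypothetical round-trip for parse_message above, assuming TextMessage simply
# stores the parsed keyword arguments; the sample XML follows the documented
# WeChat push format, but the field values here are made up for illustration.
sample_xml = '''<xml>
<ToUserName><![CDATA[gh_abc123]]></ToUserName>
<FromUserName><![CDATA[oUserOpenId]]></FromUserName>
<CreateTime>1357290913</CreateTime>
<MsgType><![CDATA[text]]></MsgType>
<Content><![CDATA[hello]]></Content>
<MsgId>1234567890123456</MsgId>
</xml>'''
msg = parse_message(sample_xml)
# msg should be a TextMessage with msg.content == u'hello'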
def _check(self, value):
    if self._max is None:
        self._max = self._max_urs_length
        self._message_vars['max_len'] = self._max
    if self._min is None:
        self._min = self._min_urs_length
        self._message_vars['min_len'] = self._min
    if not self._check_mm(len(util.to_unicode(value))):
        message = self._messages[self._message_key + '_len']
        return False, message
    value = value.lower()
    return self.is_email(value)
def post(self):
    global token
    _args = dict(
        token=token,
        timestamp=self.request.get('timestamp'),
        nonce=self.request.get('nonce'),
        signature=self.request.get('signature'),
    )
    if not checkSignure(**_args):
        return webapp2.abort(403)
    message = parse_message(self.request.body)
    reply = generate_reply(message)
    self.response.content_type = 'application/xml'
    self.response.write(to_unicode(reply.to_xml()))
def process_body(self, body, task):
    url = task.get('url')
    body_size = len(body)
    body = to_unicode(body)
    body = body.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    #body = self.cleaner.clean_html(body)
    self.logger.info("page body, url:%s, body:%s" % (url, body[:100]))
    self.db_helper.save_mining_result(body, body_size, task)
    if task.get('depth') <= self.maxdepth:
        tree = lxml.html.document_fromstring(body)
        a_elements = tree.xpath('//a')
        urls = valid_a_href(a_elements, url)
        # only queue links that have not been crawled yet
        not_exist = self.url_dedup.insert_not_exist(urls)
        #self.db_helper.insert_mining_task(task, urls)
        self.db_helper.insert_mining_task(task, not_exist)
def get_embedding(counter, data_type, limit=-1, emb_file=None, vec_size=None,
                  token2idx_dict=None):
    print("Generating {} embedding...{}".format(data_type, emb_file))
    embedding_dict = {}
    filtered_elements = [k for k, v in counter.items() if v > limit]
    if emb_file is not None:
        assert vec_size is not None
        with codecs.open(emb_file, "r", encoding="utf-8") as fh:
            for line in fh:
                array = to_unicode(line).strip().split()
                # multi-word tokens: everything before the vector is the word
                word = "".join(array[0:-vec_size])
                vector = list(map(float, array[-vec_size:]))
                if word in counter and counter[word] > limit:
                    embedding_dict[word] = vector
        print("{} / {} tokens have corresponding {} embedding vector".format(
            len(embedding_dict), len(filtered_elements), data_type))
    else:
        assert vec_size is not None
        for token in filtered_elements:
            embedding_dict[token] = [np.random.normal(scale=0.01)
                                     for _ in range(vec_size)]
        print("{} tokens have corresponding embedding vector".format(
            len(filtered_elements)))

    NULL = "--NULL--"
    OOV = "--OOV--"
    token2idx_dict = {
        token: idx for idx, token in enumerate(embedding_dict.keys(), 2)
    } if token2idx_dict is None else token2idx_dict
    token2idx_dict[NULL] = 0
    token2idx_dict[OOV] = 1
    embedding_dict[NULL] = [0. for _ in range(vec_size)]
    embedding_dict[OOV] = [0. for _ in range(vec_size)]
    idx2emb_dict = {idx: embedding_dict[token]
                    for token, idx in token2idx_dict.items()}
    emb_mat = [idx2emb_dict[idx] for idx in range(len(idx2emb_dict))]
    return emb_mat, token2idx_dict
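# A toy invocation of get_embedding above, exercising the random-initialization
# path (emb_file=None); the Counter contents and the 300-dim vec_size are
# illustrative only, and numpy is assumed to be imported as np as in the snippet.
from collections import Counter
word_counter = Counter({"the": 10, "cat": 3, "rare": 1})
emb_mat, word2idx = get_embedding(word_counter, "word", limit=-1,
                                  emb_file=None, vec_size=300)
# emb_mat[0] and emb_mat[1] are the all-zero --NULL-- and --OOV-- rows;
# every token with count > limit gets a small random vector at word2idx[token].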
def get(self, name, default=''):
    """Return the value of the specified option.

    Valid default input is a string. Returns a string.
    """
    if self.config.parser.has_option(self.name, name):
        value = self.config.parser.get(self.name, name)
    elif self.config.parent:
        value = self.config.parent[self.name].get(name, default)
    else:
        option = Option.registry.get((self.name, name))
        if option:
            value = option.default or default
        else:
            value = default
    if not value:
        return u''
    elif isinstance(value, basestring):
        return to_unicode(value)
    else:
        return value
def get_by_city_code(city_code='101110101'):
    """Default city_code is 北京 (Beijing)."""
    remote_url = URL.format(city_code)
    response = ul.urlopen(remote_url).read()
    response = util.to_unicode(response)
    try:
        data = json.loads(response)
        weather_info = data['weatherinfo']
        cur = get_current()
        temp = 'temp%d' % cur
        weather = 'weather%d' % cur
        result = {  # renamed from `re` to avoid shadowing the re module
            'date': weather_info['date_y'],
            'city': weather_info['city'],
            'temp': weather_info[temp],
            'weather': weather_info[weather],
            'tip': weather_info['index_d'],
        }
    except ValueError:
        result = None
    return result
def get_by_city_name(city=u'北京'):
    if isinstance(city, dict):
        city = city['where']
    city = util.to_unicode(city)
    return get_by_city_code(city_dict[city])
def _check(self, value):
    if not self._check_mm(len(util.to_unicode(value))):
        message = self._messages[self._message_key + '_len']
        return False, message
    return self.is_email(value)
def run(nogui=False):
    """Parses command-line arguments and either runs GUI, or a CLI action."""
    global is_cli, is_gui_possible, is_verbose

    if (getattr(sys, 'frozen', False)  # Binary application
    or sys.executable.lower().endswith("pythonw.exe")):
        sys.stdout = ConsoleWriter(sys.stdout)  # Hooks for attaching to
        sys.stderr = ConsoleWriter(sys.stderr)  # a text console
    if "main" not in sys.modules:  # E.g. setuptools install, calling main.run
        srcdir = os.path.abspath(os.path.dirname(__file__))
        if srcdir not in sys.path:
            sys.path.append(srcdir)
        sys.modules["main"] = __import__("main")

    argparser = argparse.ArgumentParser(description=ARGUMENTS["description"])
    for arg in ARGUMENTS["arguments"]:
        argparser.add_argument(*arg.pop("args"), **arg)
    subparsers = argparser.add_subparsers(dest="command")
    for cmd in ARGUMENTS["commands"]:
        kwargs = dict((k, cmd[k]) for k in cmd if k in ["help", "description"])
        subparser = subparsers.add_parser(cmd["name"], **kwargs)
        for arg in cmd["arguments"]:
            kwargs = dict((k, arg[k]) for k in arg if k != "args")
            subparser.add_argument(*arg["args"], **kwargs)

    if "nt" == os.name:  # Fix Unicode arguments, otherwise converted to ?
        sys.argv[:] = win32_unicode_argv()
    argv = sys.argv[1:]
    if not argv or (argv[0] not in subparsers.choices
    and argv[0].endswith(".db")):
        argv[:0] = ["gui"]  # argparse hack: force default argument
    if argv[0] in ("-h", "--help") and len(argv) > 1:
        argv[:2] = argv[:2][::-1]  # Swap "-h option" to "option -h"
    arguments = argparser.parse_args(argv)

    if hasattr(arguments, "FILE1") and hasattr(arguments, "FILE2"):
        arguments.FILE1 = [util.to_unicode(f) for f in arguments.FILE1]
        arguments.FILE2 = [util.to_unicode(f) for f in arguments.FILE2]
        arguments.FILE = arguments.FILE1 + arguments.FILE2
    if arguments.FILE:  # Expand wildcards to actual filenames
        arguments.FILE = sum([glob.glob(f) if "*" in f else [f]
                              for f in arguments.FILE], [])
        arguments.FILE = sorted(set(util.to_unicode(f) for f in arguments.FILE))

    if "gui" == arguments.command and (nogui or not is_gui_possible):
        argparser.print_help()
        status = None
        if not nogui:
            status = ("\n\nwxPython not found. %s graphical program "
                      "will not run." % conf.Title)
        sys.exit(status)
    elif "gui" != arguments.command:
        conf.load()
        is_cli = sys.modules["main"].is_cli = True
        is_verbose = sys.modules["main"].is_verbose = arguments.verbose
        # Avoid Unicode errors when printing to console.
        enc = sys.stdout.encoding or locale.getpreferredencoding() or "utf-8"
        sys.stdout = codecs.getwriter(enc)(sys.stdout, "xmlcharrefreplace")
        sys.stderr = codecs.getwriter(enc)(sys.stderr, "xmlcharrefreplace")

    if "diff" == arguments.command:
        run_diff(*arguments.FILE)
    elif "merge" == arguments.command:
        run_merge(arguments.FILE, arguments.output)
    elif "export" == arguments.command:
        run_export(arguments.FILE, arguments.type, arguments.chat,
                   arguments.author, arguments.ask_password)
    elif "search" == arguments.command:
        run_search(arguments.FILE, arguments.QUERY)
    elif "gui" == arguments.command:
        run_gui(arguments.FILE)
def __init__(self, article, abstract_sentences, all_abstract_sentences,
             doc_indices, raw_article_sents, vocab, hps):
    """Initializes the Example, performing tokenization and truncation to
    produce the encoder, decoder and target sequences, which are stored in self.

    Args:
        article: source text; a string. Each token is separated by a single space.
        abstract_sentences: list of strings, one per abstract sentence. In each
            sentence, each token is separated by a single space.
        vocab: Vocabulary object
        hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # Process the article
    article_words = article.split()
    if len(article_words) > hps.max_enc_steps:
        article_words = article_words[:hps.max_enc_steps]
    # list of word ids; OOVs are represented by the id for the UNK token
    self.enc_input = [vocab.word2id(w) for w in article_words]

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    # list of word ids; OOVs are represented by the id for the UNK token
    abs_ids = [vocab.word2id(w) for w in abstract_words]

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
        if raw_article_sents is not None and len(raw_article_sents) > 0:
            self.tokenized_sents = [process_sent(sent)
                                    for sent in raw_article_sents]
            self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                self.tokenized_sents, vocab)
            self.enc_input_extend_vocab = util.flatten_list_of_lists(
                self.word_ids_sents)
            # store the length after truncation but before padding
            self.enc_len = len(self.enc_input_extend_vocab)
        else:
            # Store a version of the enc_input where in-article OOVs are
            # represented by their temporary OOV id; also store the in-article
            # OOV words themselves
            article_str = util.to_unicode(article)
            raw_article_sents = nltk.tokenize.sent_tokenize(article_str)
            self.tokenized_sents = [process_sent(sent)
                                    for sent in raw_article_sents]
            self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                self.tokenized_sents, vocab)
            self.enc_input_extend_vocab = util.flatten_list_of_lists(
                self.word_ids_sents)
            # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
            # store the length after truncation but before padding
            self.enc_len = len(self.enc_input_extend_vocab)

        # Get a version of the reference summary where in-article OOVs are
        # represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                 self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(
            abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
            stop_decoding)

    # Store the original strings
    self.original_article = article
    self.raw_article_sents = raw_article_sents
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
    self.all_original_abstract_sents = all_abstract_sentences
    self.doc_indices = doc_indices  # doc id in multidoc, one per word
import sys

import util
from os.path import join, abspath, dirname

#--------------------------- Get Version ------------------------------#
try:
    f = open('version', 'r')
    lines = f.readlines()
    f.close()
    v = lines[0].strip()
    vd = lines[1].strip()
except Exception:
    v = '0.7'
    vd = '0101'
VERSION = v
VERSION_DATE = vd
#-----------------------------------------------------------------------#

ROOT_PATH = util.to_unicode(abspath(dirname(sys.argv[0])))
XRC_PATH = join(ROOT_PATH, 'resource', 'xrc')
LANG_PATH = join(ROOT_PATH, 'lang')
PUZZLE_PATH = join(ROOT_PATH, 'puzzle')

nCellSize = eval(util.config.get('APP', 'CellSize', '50'))
nAnswerCellSize = nCellSize * 0.6
nLINE = 9
nGRID = 3
rgLINE = range(nLINE)
rgGRID = range(nGRID)

clBgFocus = '#C1DEA3'
clBgOver = '#8FD6FF'
clBgNormal = '#EEEEEE'
clBgDefault = '#E3EDFF'
def __init__(self, article, abstract_sentences, all_abstract_sentences,
             doc_indices, raw_article_sents, ssi, article_lcs_paths_list,
             vocab, hps):
    """Initializes the Example, performing tokenization and truncation to
    produce the encoder, decoder and target sequences, which are stored in self.

    Args:
        article: source text; a string. Each token is separated by a single space.
        abstract_sentences: list of strings, one per abstract sentence. In each
            sentence, each token is separated by a single space.
        vocab: Vocabulary object
        hps: hyperparameters
    """
    self.hps = hps

    # Get ids of special tokens
    start_decoding = vocab.word2id(data.START_DECODING)
    stop_decoding = vocab.word2id(data.STOP_DECODING)

    # # Process the article
    # article_words = article.split()
    # if len(article_words) > hps.max_enc_steps:
    #     article_words = article_words[:hps.max_enc_steps]
    # self.enc_input = [vocab.word2id(w) for w in article_words]

    # Process the abstract
    abstract = ' '.join(abstract_sentences)  # string
    abstract_words = abstract.split()  # list of strings
    # list of word ids; OOVs are represented by the id for the UNK token
    abs_ids = [vocab.word2id(w) for w in abstract_words]

    # Get the decoder input sequence and target sequence
    self.dec_input, self.target = self.get_dec_inp_targ_seqs(
        abs_ids, hps.max_dec_steps, start_decoding, stop_decoding)
    self.dec_len = len(self.dec_input)

    # If using pointer-generator mode, we need to store some extra info
    if hps.pointer_gen:
        if raw_article_sents is not None and len(raw_article_sents) > 0:
            # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents]
            self.tokenized_sents = [
                util.process_sent(sent, whitespace=True)
                for sent in raw_article_sents]
            if self.hps.sep:
                for sent in self.tokenized_sents[:-1]:
                    sent.append(data.SEP_TOKEN)

            # Process the article
            article_words = util.flatten_list_of_lists(self.tokenized_sents)
            if len(article_words) > hps.max_enc_steps:
                article_words = article_words[:hps.max_enc_steps]
            # list of word ids; OOVs are represented by the id for the UNK token
            self.enc_input = [vocab.word2id(w) for w in article_words]
            if len(all_abstract_sentences) == 1:
                doc_indices = [0] * len(article_words)

            self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                self.tokenized_sents, vocab)
            self.enc_input_extend_vocab = util.flatten_list_of_lists(
                self.word_ids_sents)
            if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                self.enc_input_extend_vocab = \
                    self.enc_input_extend_vocab[:hps.max_enc_steps]
            # store the length after truncation but before padding
            self.enc_len = len(self.enc_input_extend_vocab)
        else:
            # Store a version of the enc_input where in-article OOVs are
            # represented by their temporary OOV id; also store the in-article
            # OOV words themselves
            article_str = util.to_unicode(article)
            raw_article_sents = nltk.tokenize.sent_tokenize(article_str)
            self.tokenized_sents = [util.process_sent(sent)
                                    for sent in raw_article_sents]

            # Process the article
            article_words = util.flatten_list_of_lists(self.tokenized_sents)
            if len(article_words) > hps.max_enc_steps:
                article_words = article_words[:hps.max_enc_steps]
            # list of word ids; OOVs are represented by the id for the UNK token
            self.enc_input = [vocab.word2id(w) for w in article_words]
            if len(all_abstract_sentences) == 1:
                doc_indices = [0] * len(article_words)

            self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(
                self.tokenized_sents, vocab)
            self.enc_input_extend_vocab = util.flatten_list_of_lists(
                self.word_ids_sents)
            # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab)
            if len(self.enc_input_extend_vocab) > hps.max_enc_steps:
                self.enc_input_extend_vocab = \
                    self.enc_input_extend_vocab[:hps.max_enc_steps]
            # store the length after truncation but before padding
            self.enc_len = len(self.enc_input_extend_vocab)

        if self.hps.word_imp_reg:
            self.enc_importances = self.get_enc_importances(
                self.tokenized_sents, abstract_words)

        # Get a version of the reference summary where in-article OOVs are
        # represented by their temporary article OOV id
        abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab,
                                                 self.article_oovs)

        # Overwrite decoder target sequence so it uses the temp article OOV ids
        _, self.target = self.get_dec_inp_targ_seqs(
            abs_ids_extend_vocab, hps.max_dec_steps, start_decoding,
            stop_decoding)

    if ssi is not None:
        # Translate the similar source indices into masks over the encoder input
        self.ssi_masks = []
        for source_indices in ssi:
            ssi_sent_mask = [0.] * len(raw_article_sents)
            for source_idx in source_indices:
                if source_idx >= len(ssi_sent_mask):
                    continue  # skip out-of-range sentence indices
                ssi_sent_mask[source_idx] = 1.
            ssi_mask = pg_mmr_functions.convert_to_word_level(
                ssi_sent_mask, self.tokenized_sents)
            self.ssi_masks.append(ssi_mask)

        summary_sent_tokens = [sent.strip().split()
                               for sent in abstract_sentences]
        if (self.hps.ssi_data_path is None
                and len(self.ssi_masks) != len(summary_sent_tokens)):
            raise Exception('len(self.ssi_masks) != len(summary_sent_tokens)')

        self.sent_indices = pg_mmr_functions.convert_to_word_level(
            list(range(len(summary_sent_tokens))), summary_sent_tokens).tolist()

    if article_lcs_paths_list is not None:
        if len(article_lcs_paths_list) > 1:
            raise Exception('Need to implement for non-sent_dataset')
        article_lcs_paths = article_lcs_paths_list[0]
        imp_mask = [0] * len(article_words)
        to_add = 0
        for source_idx, word_indices_list in enumerate(article_lcs_paths):
            if source_idx > 0:
                to_add += len(self.tokenized_sents[source_idx - 1])
            for word_idx in word_indices_list:
                if word_idx + to_add >= len(imp_mask):
                    if len(imp_mask) == hps.max_enc_steps:
                        continue  # word fell beyond the truncated input
                    else:
                        print(self.tokenized_sents, article_lcs_paths)
                        raise Exception(
                            'word_idx + to_add (%d) is larger than imp_mask size (%d)'
                            % (word_idx + to_add, len(imp_mask)))
                imp_mask[word_idx + to_add] = 1
        self.importance_mask = imp_mask

    # Store the original strings
    self.original_article = article
    self.raw_article_sents = raw_article_sents
    self.original_abstract = abstract
    self.original_abstract_sents = abstract_sentences
    self.all_original_abstract_sents = all_abstract_sentences
    self.doc_indices = doc_indices
    self.ssi = ssi
    self.article_lcs_paths_list = article_lcs_paths_list
def print_body(body):
    body = to_unicode(body)
    print(body[:500])
def process_body(body, url):
    body = to_unicode(body)
    doc = lxml.html.fromstring(body)
def __init__(self, value=False, reason=None):
    self.__bool = bool(value)
    self.__reason = to_unicode(reason) if reason else ""
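# A minimal sketch of how such a boolean-with-reason result class might look in
# full; the class name CheckResult and the truthiness/str hooks below are
# assumptions for illustration, not taken from the original source.
class CheckResult(object):
    def __init__(self, value=False, reason=None):
        self.__bool = bool(value)
        self.__reason = to_unicode(reason) if reason else ""

    def __nonzero__(self):       # Python 2 truth hook
        return self.__bool
    __bool__ = __nonzero__       # Python 3 spelling

    def __str__(self):
        return self.__reason

# result = CheckResult(False, u'value too long')
# if not result: report str(result) to the user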
def run(argv):
    """Parses command-line arguments and either runs GUI, or a CLI action."""
    global is_cli, is_gui_possible, is_verbose

    if (getattr(sys, 'frozen', False)  # Binary application
    or sys.executable.lower().endswith("pythonw.exe")):
        sys.stdout = ConsoleWriter(sys.stdout)  # Hooks for attaching to
        sys.stderr = ConsoleWriter(sys.stderr)  # a text console

    argparser = argparse.ArgumentParser(description=ARGUMENTS["description"])
    for arg in ARGUMENTS["arguments"]:
        names = arg.pop("args")
        argparser.add_argument(*names, **arg)
    subparsers = argparser.add_subparsers(dest="command")
    for cmd in ARGUMENTS["commands"]:
        kwargs = dict((k, cmd[k]) for k in cmd if k in ["help", "description"])
        subparser = subparsers.add_parser(cmd["name"], **kwargs)
        for arg in cmd["arguments"]:
            kwargs = dict((k, arg[k]) for k in arg if k != "args")
            subparser.add_argument(*arg["args"], **kwargs)

    if not argv or (argv[0] not in subparsers.choices
    and argv[0].endswith(".db")):
        argv[:0] = ["gui"]  # argparse hack: force default argument
    if argv[0] in ("-h", "--help") and len(argv) > 1:
        argv[:2] = argv[:2][::-1]  # Swap "-h option" to "option -h"
    arguments = argparser.parse_args(argv)

    if hasattr(arguments, "FILE1") and hasattr(arguments, "FILE2"):
        arguments.FILE1 = [util.to_unicode(f) for f in arguments.FILE1]
        arguments.FILE2 = [util.to_unicode(f) for f in arguments.FILE2]
        arguments.FILE = arguments.FILE1 + arguments.FILE2
    if arguments.FILE:  # Expand wildcards to actual filenames
        arguments.FILE = sum([(sorted(glob.glob(f)) if "*" in f else [f])
                              for f in arguments.FILE], [])
        arguments.FILE = [util.to_unicode(f) for f in arguments.FILE]

    if "gui" == arguments.command and not is_gui_possible:
        argparser.print_help()
        print("\n\nwxPython not found. %s graphical program will not run."
              % conf.Title)
        sys.exit()
    elif "gui" != arguments.command:
        is_cli = sys.modules["main"].is_cli = True
        is_verbose = sys.modules["main"].is_verbose = arguments.verbose
        enc = sys.stdout.encoding or locale.getpreferredencoding() or "utf-8"
        if "nt" == os.name:  # Avoid print encoding errors under Windows
            sys.stdout = codecs.getwriter(enc)(sys.stdout, "xmlcharrefreplace")
            sys.stderr = codecs.getwriter(enc)(sys.stderr, "xmlcharrefreplace")

    if "diff" == arguments.command:
        run_diff(*arguments.FILE)
    elif "merge" == arguments.command:
        run_merge(arguments.FILE)
    elif "export" == arguments.command:
        run_export(arguments.FILE, arguments.type)
    elif "search" == arguments.command:
        run_search(arguments.FILE, arguments.QUERY)
    elif "gui" == arguments.command:
        run_gui(arguments.FILE)
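# Every snippet above delegates decoding to a to_unicode() helper that the
# excerpts themselves never define. A minimal sketch of what such a helper
# typically looks like on Python 2 (where these snippets' use of basestring
# places them); the exact signature and the fallback behavior are assumptions.
def to_unicode(value, encoding="utf-8"):
    """Decode byte strings to unicode, passing other values through."""
    if isinstance(value, unicode):
        return value
    if isinstance(value, str):
        try:
            return value.decode(encoding)
        except UnicodeDecodeError:
            return value.decode(encoding, "replace")  # keep going on bad bytes
    return unicode(value)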