def run_step(self, prev, params):
    """Build the sector table from the CSV file at ``prev``.

    For each locale ('es', 'en') the frame is sorted by ``sector_id``, the
    long and short sector labels are forward-filled, and any short label
    that is still missing is copied from the long label.  The frame is then
    reduced to ``sector_id``, ``sector_es`` and ``sector_en``, the label
    columns are passed through ``format_text`` with the matching stop-word
    list, duplicate ``sector_id`` rows are dropped and ``sector_id`` is cast
    to ``str``.

    ``params`` is accepted but not read by this step.

    Returns the resulting DataFrame.
    """
    frame = pd.read_csv(prev)

    for locale in ('es', 'en'):
        for level in ('sector',):
            id_col = '{}_id'.format(level)
            long_col = '{}_{}'.format(level, locale)
            short_col = '{}_{}_short'.format(level, locale)

            frame.sort_values(by=[id_col], inplace=True)
            frame[short_col] = frame[short_col].ffill()
            frame[long_col] = frame[long_col].ffill()

            # Whatever ffill could not fill falls back to the long label.
            missing_short = frame[short_col].isna()
            frame.loc[missing_short, short_col] = frame.loc[missing_short, long_col]

    frame = frame[['sector_id', 'sector_es', 'sector_en']].copy()

    # Label columns, split by locale suffix.
    spanish_cols = [name for name in frame.columns if '_es' in name]
    english_cols = [name for name in frame.columns if '_en' in name]

    nltk.download('stopwords')
    spanish_stopwords = nltk.corpus.stopwords.words('spanish')

    frame = format_text(frame, spanish_cols, stopwords=spanish_stopwords)
    frame = format_text(frame, english_cols,
                        stopwords=stop_words.ENGLISH_STOP_WORDS)

    frame.drop_duplicates(subset=['sector_id'], inplace=True)
    frame['sector_id'] = frame['sector_id'].astype(str)

    return frame
def parse_info(self, nodes, object, map=None):
    """
    Copy the text of the children of the info node(s) into object.info.

    nodes:  either a sequence of info nodes, or a single node with a
            'children' attribute. In the latter case only its first child
            named 'info' is parsed (nothing is parsed if there is none).
    object: target whose 'info' mapping receives the values.
    map:    optional hash, the keys are the tag names in the fxd file, the
            content the variable names, e.g. {'content': 'description'}.

    Every tag that has text is stored in info under its own name. A tag
    listed in map is additionally stored under the mapped name. Tags
    without text are skipped. Returns None.
    """
    if not nodes:
        return

    # A fresh dict per call instead of a shared mutable default argument.
    if map is None:
        map = {}

    if hasattr(nodes, 'children'):
        # Narrow a parent node down to its first <info> child, if any.
        info_node = next(
            (candidate for candidate in nodes.children
             if candidate.name == 'info'),
            None)
        nodes = [] if info_node is None else [info_node]

    for node in nodes:
        for child in node.children:
            txt = child.textof()
            if not txt:
                continue
            formatted = util.format_text(txt)
            if child.name in map:
                object.info[map[child.name]] = formatted
            object.info[child.name] = formatted
def parse_info(self, nodes, object, map=None):
    """
    Copy the text of the children of the info node(s) into object.info.

    nodes:  either a sequence of info nodes, or a single node with a
            'children' attribute. In the latter case only its first child
            named 'info' is parsed (nothing is parsed if there is none).
    object: target whose 'info' mapping receives the values.
    map:    optional hash, the keys are the tag names in the fxd file, the
            content the variable names, e.g. {'content': 'description'}.

    Every tag that has text is stored in info under its own name. A tag
    listed in map is additionally stored under the mapped name. Tags
    without text are skipped. Returns None.
    """
    if not nodes:
        return

    # A fresh dict per call instead of a shared mutable default argument.
    if map is None:
        map = {}

    if hasattr(nodes, 'children'):
        # Narrow a parent node down to its first <info> child, if any.
        info_node = next(
            (candidate for candidate in nodes.children
             if candidate.name == 'info'),
            None)
        nodes = [] if info_node is None else [info_node]

    for node in nodes:
        for child in node.children:
            txt = child.textof()
            if not txt:
                continue
            formatted = util.format_text(txt)
            if child.name in map:
                object.info[map[child.name]] = formatted
            object.info[child.name] = formatted
def parse_result(self, result):
    """Split a plug-in result into display messages.

    ``result`` is a dict read through the keys "plugin" (with "id"),
    "instance" (with "id"; may be missing or not subscriptable), "duration",
    "records" (an iterable of dicts with a "message" key) and "error"
    (``None`` or a dict with a "message" key).

    Returns a dict with the keys:
      "plugin":   message dict describing the plug-in
      "instance": message dict describing the instance ("Context" is used as
                  the message when there is no instance id)
      "records":  list of the record dicts from ``result["records"]``
      "error":    the error dict from ``result["error"]``, or ``None``

    Note that the record dicts and the error dict are updated in place and
    returned as the same objects, not as copies.
    """
    plugin_id = result["plugin"]["id"]

    # A result without an instance either lacks the key or carries a
    # non-subscriptable value; only those two failures mean "no instance".
    try:
        instance_id = result["instance"]["id"]
    except (KeyError, TypeError):
        instance_id = None

    plugin_msg = {
        "type": "plugin",
        "message": plugin_id,
        "filter": plugin_id,
        "plugin": plugin_id,
        "instance": instance_id,
    }

    instance_msg = {
        "type": "instance",
        "message": instance_id or "Context",
        "filter": instance_id,
        "duration": result["duration"],
        "plugin": plugin_id,
        "instance": instance_id,
    }

    record_msgs = []
    for record in result["records"]:
        record["type"] = "record"
        # The filter keeps the raw message; only the shown text is formatted.
        record["filter"] = record["message"]
        record["message"] = util.format_text(str(record["message"]))
        record["plugin"] = plugin_id
        record["instance"] = instance_id
        record_msgs.append(record)

    error_msg = result["error"]
    if error_msg is not None:
        error_msg["type"] = "error"
        error_msg["message"] = util.format_text(error_msg["message"])
        # Unlike records, the error filter uses the formatted message.
        error_msg["filter"] = error_msg["message"]
        error_msg["plugin"] = plugin_id
        error_msg["instance"] = instance_id

    return {
        "plugin": plugin_msg,
        "instance": instance_msg,
        "records": record_msgs,
        "error": error_msg,
    }
def run_step(self, prev, params):
    """Build the industry table from the CSV file at ``prev``.

    For each locale ('es', 'en') and each level (sector, subsector,
    industry_group, naics_industry, national_industry) the frame is sorted
    by the level id, the long and short labels are forward-filled, and any
    short label that is still missing is copied from the long label.  The
    label columns are then passed through ``format_text`` with the matching
    stop-word list and the id columns are cast to ``str``.

    Only rows whose ``national_industry_id`` is returned by a query against
    ``inegi_economic_census`` (run through ``self.connector``) are kept.
    When ``params.get('is_dim')`` is truthy, rows with a duplicated
    ``national_industry_es`` are dropped as well.

    Returns the resulting DataFrame.
    """
    levels = ('sector', 'subsector', 'industry_group', 'naics_industry',
              'national_industry')

    frame = pd.read_csv(prev)

    for locale in ('es', 'en'):
        for level in levels:
            id_col = '{}_id'.format(level)
            long_col = '{}_{}'.format(level, locale)
            short_col = '{}_{}_short'.format(level, locale)

            frame.sort_values(by=[id_col], inplace=True)
            frame[short_col] = frame[short_col].ffill()
            frame[long_col] = frame[long_col].ffill()

            # Whatever ffill could not fill falls back to the long label.
            missing_short = frame[short_col].isna()
            frame.loc[missing_short, short_col] = frame.loc[missing_short, long_col]

    # Label columns, split by locale suffix.
    spanish_cols = frame.columns[frame.columns.str.contains('_es')].tolist()
    english_cols = frame.columns[frame.columns.str.contains('_en')].tolist()

    nltk.download('stopwords')
    spanish_stopwords = nltk.corpus.stopwords.words('spanish')

    frame = format_text(frame, spanish_cols, stopwords=spanish_stopwords)
    frame = format_text(frame, english_cols,
                        stopwords=stop_words.ENGLISH_STOP_WORDS)

    for level in levels:
        id_col = '{}_id'.format(level)
        frame[id_col] = frame[id_col].astype(str)

    # when creating the industry dimension, the cms asks for members, so
    # 'ghost' profiles are created; they also appear at the search bar, which
    # the canon-cms-warmup also gets. Keep only the ids present in the data.
    query = 'SELECT distinct(national_industry_id) FROM inegi_economic_census'
    known_ids = list(
        query_to_df(self.connector, raw_query=query)['national_industry_id'])
    in_data = frame['national_industry_id'].isin(known_ids)

    print('Total ids (dimension):', frame.shape[0])
    print('Ids in dimension but not in data:', frame.loc[~in_data].shape[0])
    print('Total ids (data):', frame.loc[in_data].shape[0])

    frame = frame.loc[in_data].copy()

    if params.get('is_dim'):
        frame.drop_duplicates(subset=['national_industry_es'], inplace=True)

    return frame
def childcontent(self, node, name):
    """
    Return the formatted text of the first child of node called name,
    or an empty string when node has no such child.
    """
    matches = (child for child in node.children if child.name == name)
    found = next(matches, None)
    if found is None:
        return ''
    return util.format_text(found.textof())
def parse_result(self, result):
    """Split a plug-in result into display messages.

    ``result`` is a dict read through the keys "plugin" (with "id"),
    "instance" (with "id"; may be missing or not subscriptable), "duration",
    "records" (an iterable of dicts with a "message" key) and "error"
    (``None`` or a dict with a "message" key).

    Returns a dict with the keys:
      "plugin":   message dict describing the plug-in
      "instance": message dict describing the instance ("Context" is used as
                  the message when there is no instance id)
      "records":  list of the record dicts from ``result["records"]``
      "error":    the error dict from ``result["error"]``, or ``None``

    Note that the record dicts and the error dict are updated in place and
    returned as the same objects, not as copies.
    """
    plugin_id = result["plugin"]["id"]

    # A result without an instance either lacks the key or carries a
    # non-subscriptable value; only those two failures mean "no instance".
    try:
        instance_id = result["instance"]["id"]
    except (KeyError, TypeError):
        instance_id = None

    plugin_msg = {
        "type": "plugin",
        "message": plugin_id,
        "filter": plugin_id,
        "plugin": plugin_id,
        "instance": instance_id
    }

    instance_msg = {
        "type": "instance",
        "message": instance_id or "Context",
        "filter": instance_id,
        "duration": result["duration"],
        "plugin": plugin_id,
        "instance": instance_id
    }

    record_msgs = []
    for record in result["records"]:
        record["type"] = "record"
        # The filter keeps the raw message; only the shown text is formatted.
        record["filter"] = record["message"]
        record["message"] = util.format_text(str(record["message"]))
        record["plugin"] = plugin_id
        record["instance"] = instance_id
        record_msgs.append(record)

    error_msg = result["error"]
    if error_msg is not None:
        error_msg["type"] = "error"
        error_msg["message"] = util.format_text(error_msg["message"])
        # Unlike records, the error filter uses the formatted message.
        error_msg["filter"] = error_msg["message"]
        error_msg["plugin"] = plugin_id
        error_msg["instance"] = instance_id

    return {
        "plugin": plugin_msg,
        "instance": instance_msg,
        "records": record_msgs,
        "error": error_msg,
    }
def gettext(self, node):
    """
    Return the text of the node, passed through util.format_text.
    """
    raw_text = node.textof()
    return util.format_text(raw_text)
def load_guide(XMLTV_FILE=None, popup_dialog=None): """ Load a guide from the raw XMLTV file using the xmltv.py support lib. Returns a TvGuide or None if an error occurred """ if not XMLTV_FILE: XMLTV_FILE = config.XMLTV_FILE # Create a new guide guide = TvGuide() # Is there a file to read from? if os.path.isfile(XMLTV_FILE): gotfile = 1 guide.timestamp = os.path.getmtime(XMLTV_FILE) else: logger.debug('XMLTV file (%s) missing!', XMLTV_FILE) gotfile = 0 if popup_dialog: popup_dialog.update_progress(_('Reading channels'), 0.0) # Add the channels that are in the config list, or all if the # list is empty if config.TV_CHANNELS: logger.debug('Only adding channels in TV_CHANNELS to TvGuide') for data in config.TV_CHANNELS: (id, displayname, tunerid) = data[:3] c = TvChannel(id, displayname, tunerid) # Handle the optional time-dependent station info c.times = [] if len(data) > 3 and len(data[3:4]) == 3: for (days, start_time, stop_time) in data[3:4]: c.times.append((days, int(start_time), int(stop_time))) guide.add_channel(c) else: # Add all channels in the XMLTV file logger.debug('Adding all channels to TvGuide') xmltv_channels = None if gotfile: # Don't read the channel info unless we have to, takes a long time! xmltv_channels = xmltv.read_channels(util.gzopen(XMLTV_FILE)) # Was the guide read successfully? 
if not xmltv_channels: return None # No for chan in xmltv_channels: id = chan['id'].encode(config.LOCALE, 'ignore') if ' ' in id: # Assume the format is "TUNERID CHANNELNAME" tunerid = id.split()[0] # XXX Educated guess displayname = id.split()[1] # XXX Educated guess else: display_name = chan['display-name'][0][0] if ' ' in display_name: tunerid = display_name.split()[0] displayname = display_name.split()[1] else: tunerid = _('REPLACE WITH TUNERID FOR %s') % display_name displayname = display_name c = TvChannel(id, displayname, tunerid) guide.add_channel(c) if popup_dialog: popup_dialog.update_progress(_('Reading programmes'), 0.25) xmltv_programs = None if gotfile: logger.debug('reading \"%s\" xmltv data', XMLTV_FILE) f = util.gzopen(XMLTV_FILE) xmltv_programs = xmltv.read_programmes(f) f.close() # Was the guide read successfully? if not xmltv_programs: return guide # Return the guide, it has the channels at least... needed_ids = [] for chan in guide.chan_dict: needed_ids.append(chan) logger.debug('creating guide for %s', needed_ids) if popup_dialog: popup_dialog.update_progress(_('Processing programmes'), 0.50) for p in xmltv_programs: if not p['channel'] in needed_ids: continue try: channel_id = p['channel'] date = 'date' in p and Unicode(p['date']) or '' start = '' pdc_start = '' stop = '' title = Unicode(p['title'][0][0]) desc = 'desc' in p and Unicode(util.format_text(p['desc'][0][0])) or '' sub_title = 'sub-title' in p and Unicode(p['sub-title'][0][0]) or '' categories = 'category' in p and [ cat[0] for cat in p['category'] ] or '' advisories = [] ratings = {} # Add credits to the description if 'credits' in p: desc += Unicode('\n\n') desc += _('Credits :\n') credits = p['credits'] if 'actor' in credits: desc += Unicode('\n') desc += _('Actors :\n') for actor in credits['actor']: desc += Unicode(actor + '\n') if 'director' in credits: desc += Unicode('\n') directors = credits['director'] if len(directors) == 1: desc += _('Director : %s') % directors[0] 
else: desc += _('Directors :\n') for d in directors: desc += Unicode(d + '\n') if 'rating' in p: for r in p['rating']: if r.get('system') == 'advisory': advisories.append(String(r.get('value'))) continue ratings[String(r.get('system'))] = String(r.get('value')) try: start = timestr2secs_utc(p['start']) pdc_start = 'pdc_start' in p and timestr2secs_utc(p['pdc_start']) or start try: stop = timestr2secs_utc(p['stop']) except: # Fudging end time stop = timestr2secs_utc(p['start'][0:8] + '235900' + p['start'][14:18]) except EpgException, why: logger.warning('EpgException: %s', why) continue # fix bad German titles to make favorites work if title.endswith('. Teil'): title = title[:-6] if title.rfind(' ') > 0: try: part = int(title[title.rfind(' ')+1:]) title = title[:title.rfind(' ')].rstrip() if sub_title: sub_title = u'Teil %s: %s' % (part, sub_title) else: sub_title = u'Teil %s' % part except Exception, e: print 'Teil:', e prog = TvProgram(channel_id, start, pdc_start, stop, title, sub_title, desc, categories, ratings) prog.advisories = advisories prog.date = date guide.add_program(prog)
tts = gTTS(text=text_sentences[i], lang='zh', slow=False) tts.save(audio_dir + '/' + str(i) + '.mp3') print('\n', text_sentences[i], '\n') print("created " + str(i) + " audio file") text_clip_list = [] audio_clip_list = [] silence = AudioFileClip('./audio/silence.mp3').subclip(0, 0.1) audio_clip_list.append(silence) for i in range(0, len(text_sentences)): sent_audio_clip = AudioFileClip(audio_dir + '/' + str(i) + '.mp3') print("length of audio: " + str(i) + " = ", sent_audio_clip.duration) audio_clip_list.append(sent_audio_clip) sent_txt_clip = TextClip( format_text(text_sentences[i]), font='ArialUnicode', fontsize=150, color='yellow', bg_color='black', stroke_width=30).set_pos('bottom').set_duration( sent_audio_clip.duration).resize(width=1000) text_clip_list.append(sent_txt_clip) audio_clip = concatenate_audioclips(audio_clip_list) file_names = [] for i in range(0, len(folder_names)): files = (fn for fn in os.listdir(picture_dir + '/' + folder_names[i]) if fn.endswith('.jpg') or fn.endswith('.png') or fn.endswith('.PNG') or fn.endswith('.JPG')
def load_guide(verbose=True, XMLTV_FILE=None): """ Load a guide from the raw XMLTV file using the xmltv.py support lib. Returns a TvGuide or None if an error occurred """ if not XMLTV_FILE: XMLTV_FILE = config.XMLTV_FILE # Create a new guide guide = epg_types.TvGuide() # Is there a file to read from? if os.path.isfile(XMLTV_FILE): gotfile = 1 guide.timestamp = os.path.getmtime(XMLTV_FILE) else: _debug_('XMLTV file (%s) missing!' % XMLTV_FILE) gotfile = 0 # Add the channels that are in the config list, or all if the # list is empty if config.TV_CHANNELS: if verbose: _debug_('epg_xmltv.py: Only adding channels in list') for data in config.TV_CHANNELS: (id, disp, tunerid) = data[:3] c = epg_types.TvChannel() c.id = id c.displayname = disp c.tunerid = tunerid # Handle the optional time-dependent station info c.times = [] if len(data) > 3 and len(data[3:4]) == 3: for (days, start_time, stop_time) in data[3:4]: c.times.append((days, int(start_time), int(stop_time))) guide.AddChannel(c) else: # Add all channels in the XMLTV file if verbose: _debug_('epg_xmltv.py: Adding all channels') xmltv_channels = None if gotfile: # Don't read the channel info unless we have to, takes a long time! xmltv_channels = xmltv.read_channels(util.gzopen(XMLTV_FILE)) # Was the guide read successfully? 
if not xmltv_channels: return None # No for chan in xmltv_channels: id = chan['id'].encode(config.LOCALE, 'ignore') c = epg_types.TvChannel() c.id = id if ' ' in id: # Assume the format is "TUNERID CHANNELNAME" c.displayname = id.split()[1] # XXX Educated guess c.tunerid = id.split()[0] # XXX Educated guess else: displayname = chan['display-name'][0][0] if ' ' in displayname: c.displayname = displayname.split()[1] c.tunerid = displayname.split()[0] else: c.displayname = displayname c.tunerid = _('REPLACE WITH TUNERID FOR %s') % displayname guide.AddChannel(c) xmltv_programs = None if gotfile: if verbose: _debug_('reading \"%s\" xmltv data' % XMLTV_FILE) f = util.gzopen(XMLTV_FILE) xmltv_programs = xmltv.read_programmes(f) f.close() # Was the guide read successfully? if not xmltv_programs: return guide # Return the guide, it has the channels at least... needed_ids = [] for chan in guide.chan_dict: needed_ids.append(chan) if verbose: _debug_('creating guide for %s' % needed_ids) for p in xmltv_programs: if not p['channel'] in needed_ids: continue try: prog = epg_types.TvProgram() prog.channel_id = p['channel'] prog.title = Unicode(p['title'][0][0]) if p.has_key('date'): prog.date = Unicode(p['date']) if p.has_key('category'): prog.categories = [cat[0] for cat in p['category']] if p.has_key('rating'): for r in p['rating']: if r.get('system') == 'advisory': prog.advisories.append(String(r.get('value'))) continue prog.ratings[String(r.get('system'))] = String( r.get('value')) if p.has_key('desc'): prog.desc = Unicode(util.format_text(p['desc'][0][0])) if p.has_key('sub-title'): prog.sub_title = p['sub-title'][0][0] try: prog.start = timestr2secs_utc(p['start']) if p.has_key('pdc_start'): prog.pdc_start = timestr2secs_utc(p['pdc_start']) else: prog.pdc_start = prog.start try: prog.stop = timestr2secs_utc(p['stop']) except: # Fudging end time prog.stop = timestr2secs_utc(p['start'][0:8] + '235900' + \ p['start'][14:18]) except EPG_TIME_EXC: continue # fix bad German 
titles to make favorites working if prog.title.endswith('. Teil'): prog.title = prog.title[:-6] if prog.title.rfind(' ') > 0: try: part = int(prog.title[prog.title.rfind(' ') + 1:]) prog.title = prog.title[:prog.title.rfind(' ')].rstrip( ) if prog.sub_title: prog.sub_title = u'Teil %s: %s' % (part, prog.sub_title) else: prog.sub_title = u'Teil %s' % part except Exception, e: print 'Teil:', e guide.AddProgram(prog) except: