Beispiel #1
0
    def run_step(self, prev, params):
        """Build the sector dimension table from the CSV file at ``prev``.

        The CSV is loaded, the sector name columns are forward-filled after
        sorting by ``sector_id``, and only the id plus the Spanish and English
        names are kept. The name columns are then passed through
        ``format_text`` with the matching stop-word list, duplicated
        ``sector_id`` rows are dropped and the id is cast to ``str``.

        ``params`` is accepted but not read by this step.
        """
        df = pd.read_csv(prev)

        for locale in ('es', 'en'):
            for level in ('sector',):
                id_col = '{}_id'.format(level)
                name_col = '{}_{}'.format(level, locale)
                short_col = '{}_{}_short'.format(level, locale)

                df.sort_values(by=[id_col], inplace=True)
                df[short_col] = df[short_col].ffill()
                df[name_col] = df[name_col].ffill()

                # Rows that still have no short name fall back to the full name.
                no_short = df[short_col].isna()
                df.loc[no_short, short_col] = df.loc[no_short, name_col]

        df = df[['sector_id', 'sector_es', 'sector_en']].copy()

        # codes ids
        es_mask = df.columns.str.contains('_es')
        en_mask = df.columns.str.contains('_en')
        cols_es = list(df.columns[es_mask])
        cols_en = list(df.columns[en_mask])

        nltk.download('stopwords')
        stopwords_es = nltk.corpus.stopwords.words('spanish')
        df = format_text(df, cols_es, stopwords=stopwords_es)
        df = format_text(df, cols_en, stopwords=stop_words.ENGLISH_STOP_WORDS)
        df.drop_duplicates(subset=['sector_id'], inplace=True)

        for id_col in ('sector_id',):
            df[id_col] = df[id_col].astype(str)

        return df
Beispiel #2
0
    def parse_info(self, nodes, object, map=None):
        """
        Copy the text of the children of an <info> node into object.info.

        nodes is either a node with a `children` attribute, in which case its
        first child named 'info' is used (nothing is parsed when there is
        none), or an iterable of info nodes.

        map is a hash, the keys are the names in the fxd file,
        the content the variable names, e.g. {'content': 'description'}.
        A tag found in map is stored under the mapped name; every tag,
        mapped or not, is also stored with its own name in info.
        Children without text are skipped.
        """
        if not nodes:
            return

        # None replaces the former shared mutable `{}` default.
        name_map = {} if map is None else map

        if hasattr(nodes, 'children'):
            info_node = next(
                (node for node in nodes.children if node.name == 'info'), None)
            nodes = [] if info_node is None else [info_node]

        for node in nodes:
            for child in node.children:
                txt = child.textof()
                if not txt:
                    continue
                # Format once and reuse for both keys.
                text = util.format_text(txt)
                if child.name in name_map:
                    object.info[name_map[child.name]] = text
                object.info[child.name] = text
Beispiel #3
0
    def parse_info(self, nodes, object, map=None):
        """
        Store the text of each child of an <info> node in object.info.

        nodes may be a node exposing `children`; its first child named
        'info' is then parsed (or nothing, if no such child exists).
        Otherwise nodes is iterated as a sequence of info nodes.

        map is a hash, the keys are the names in the fxd file,
        the content the variable names, e.g. {'content': 'description'}.
        Tags listed in map are stored under the mapped name, and all tags
        (listed or not) are stored with the same name in info.
        Children whose text is empty are ignored.
        """
        if not nodes:
            return

        # Use None as the default so no dict object is shared between calls.
        if map is None:
            map = {}

        if hasattr(nodes, 'children'):
            selected = []
            for node in nodes.children:
                if node.name == 'info':
                    selected.append(node)
                    break
            nodes = selected

        for node in nodes:
            for child in node.children:
                txt = child.textof()
                if not txt:
                    continue
                # A single format call serves both assignments.
                formatted = util.format_text(txt)
                if child.name in map:
                    object.info[map[child.name]] = formatted
                object.info[child.name] = formatted
Beispiel #4
0
    def parse_result(self, result):
        """Split a plug-in result into the messages shown for it.

        Args:
            result: Mapping with the keys "plugin" (a mapping holding "id"),
                "instance" (a mapping holding "id"; may be missing or None),
                "duration", "records" (iterable of dicts, each with a
                "message") and "error" (None or a dict with a "message").

        Returns:
            A dict with the keys "plugin", "instance", "records" and
            "error". "error" is None when `result` carries no error.

        Note:
            The record dicts and the error dict inside `result` are updated
            in place and reused in the returned structure.
        """
        plugin_id = result["plugin"]["id"]

        # The instance is optional: a missing key or a None entry both mean
        # the result belongs to the context rather than to an instance.
        try:
            instance_id = result["instance"]["id"]
        except (KeyError, TypeError):
            instance_id = None

        plugin_msg = {
            "type": "plugin",
            "message": plugin_id,
            "filter": plugin_id,
            "plugin": plugin_id,
            "instance": instance_id,
        }

        instance_msg = {
            "type": "instance",
            "message": instance_id or "Context",
            "filter": instance_id,
            "duration": result["duration"],
            "plugin": plugin_id,
            "instance": instance_id,
        }

        record_msgs = []
        for record in result["records"]:
            record["type"] = "record"
            # The filter keeps the raw message; only the shown text is formatted.
            record["filter"] = record["message"]
            record["message"] = util.format_text(str(record["message"]))
            record["plugin"] = plugin_id
            record["instance"] = instance_id
            record_msgs.append(record)

        error_msg = result["error"]
        if error_msg is not None:
            error_msg["type"] = "error"
            error_msg["message"] = util.format_text(error_msg["message"])
            # Unlike records, the error filter uses the formatted message.
            error_msg["filter"] = error_msg["message"]
            error_msg["plugin"] = plugin_id
            error_msg["instance"] = instance_id

        return {"plugin": plugin_msg, "instance": instance_msg, "records": record_msgs, "error": error_msg}
Beispiel #5
0
    def run_step(self, prev, params):
        """Build the industry dimension table from the CSV file at ``prev``.

        For every hierarchy level and locale the name columns are
        forward-filled after sorting by the level id, and a missing short name
        is replaced by the full name. The Spanish and English columns are then
        passed through ``format_text``, the id columns are cast to ``str`` and
        only rows whose ``national_industry_id`` is returned by the census
        query are kept.

        When ``params`` has a truthy ``is_dim`` entry, rows duplicated on
        ``national_industry_es`` are dropped before returning.
        """
        df = pd.read_csv(prev)

        levels = (
            'sector', 'subsector', 'industry_group', 'naics_industry',
            'national_industry',
        )

        for locale in ('es', 'en'):
            for level in levels:
                id_col = '{}_id'.format(level)
                name_col = '{}_{}'.format(level, locale)
                short_col = '{}_{}_short'.format(level, locale)

                df.sort_values(by=[id_col], inplace=True)
                df[short_col] = df[short_col].ffill()
                df[name_col] = df[name_col].ffill()

                # Rows that still have no short name fall back to the full name.
                no_short = df[short_col].isna()
                df.loc[no_short, short_col] = df.loc[no_short, name_col]

        # codes ids
        es_mask = df.columns.str.contains('_es')
        en_mask = df.columns.str.contains('_en')
        cols_es = list(df.columns[es_mask])
        cols_en = list(df.columns[en_mask])

        nltk.download('stopwords')
        stopwords_es = nltk.corpus.stopwords.words('spanish')
        df = format_text(df, cols_es, stopwords=stopwords_es)
        df = format_text(df, cols_en, stopwords=stop_words.ENGLISH_STOP_WORDS)

        for level in levels:
            id_col = '{}_id'.format(level)
            df[id_col] = df[id_col].astype(str)

        # The cms asks for members when the industry dimension is created, so
        # 'ghost' profiles appear (also in the search bar that
        # canon-cms-warmup reads); restrict the dimension to ids in the data.
        query = 'SELECT distinct(national_industry_id) FROM inegi_economic_census'
        census = query_to_df(self.connector, raw_query=query)
        ids_in_data = list(census['national_industry_id'])
        present = df['national_industry_id'].isin(ids_in_data)

        print('Total ids (dimension):', df.shape[0])
        print('Ids in dimension but not in data:', df.loc[~present].shape[0])
        print('Total ids (data):', df.loc[present].shape[0])
        df = df.loc[present].copy()

        if params.get('is_dim'):
            df.drop_duplicates(subset=['national_industry_es'], inplace=True)

        return df
Beispiel #6
0
 def childcontent(self, node, name):
     """
     Return the formatted text of the first child of node called name,
     or an empty string when node has no such child.
     """
     candidates = (child for child in node.children if child.name == name)
     match = next(candidates, None)
     if match is None:
         return ''
     return util.format_text(match.textof())
Beispiel #7
0
 def childcontent(self, node, name):
     """
     Look up the first child of node whose name equals name and return
     its formatted text; return '' if no child matches.
     """
     content = ''
     for candidate in node.children:
         if candidate.name != name:
             continue
         content = util.format_text(candidate.textof())
         break
     return content
Beispiel #8
0
    def parse_result(self, result):
        """Turn one plug-in result into its plugin/instance/record/error messages.

        Args:
            result: Mapping with the keys "plugin" (a mapping holding "id"),
                "instance" (a mapping holding "id"; may be missing or None),
                "duration", "records" (iterable of dicts, each with a
                "message") and "error" (None or a dict with a "message").

        Returns:
            A dict with the keys "plugin", "instance", "records" and
            "error". "error" is None when `result` carries no error.

        Note:
            The record dicts and the error dict inside `result` are updated
            in place and reused in the returned structure.
        """
        plugin_id = result["plugin"]["id"]

        # A missing "instance" key or a None entry means there is no
        # instance; any other failure should surface instead of being hidden.
        try:
            instance_id = result["instance"]["id"]
        except (KeyError, TypeError):
            instance_id = None

        plugin_msg = {
            "type": "plugin",
            "message": plugin_id,
            "filter": plugin_id,
            "plugin": plugin_id,
            "instance": instance_id
        }

        instance_msg = {
            "type": "instance",
            "message": instance_id or "Context",
            "filter": instance_id,
            "duration": result["duration"],
            "plugin": plugin_id,
            "instance": instance_id
        }

        record_msgs = []
        for record in result["records"]:
            record["type"] = "record"
            # The filter keeps the raw message; only the shown text is formatted.
            record["filter"] = record["message"]
            record["message"] = util.format_text(str(record["message"]))
            record["plugin"] = plugin_id
            record["instance"] = instance_id
            record_msgs.append(record)

        error_msg = result["error"]
        if error_msg is not None:
            error_msg["type"] = "error"
            error_msg["message"] = util.format_text(error_msg["message"])
            # Unlike records, the error filter uses the formatted message.
            error_msg["filter"] = error_msg["message"]
            error_msg["plugin"] = plugin_id
            error_msg["instance"] = instance_id

        return {
            "plugin": plugin_msg,
            "instance": instance_msg,
            "records": record_msgs,
            "error": error_msg,
        }
Beispiel #9
0
 def gettext(self, node):
     """
     Return the text of the node, passed through util.format_text.
     """
     format_text = util.format_text
     return format_text(node.textof())
Beispiel #10
0
 def gettext(self, node):
     """
     Fetch the node's text via textof() and return it formatted by
     util.format_text.
     """
     formatter = util.format_text
     raw_text = node.textof()
     return formatter(raw_text)
Beispiel #11
0
def load_guide(XMLTV_FILE=None, popup_dialog=None):
    """
    Load a guide from the raw XMLTV file using the xmltv.py support lib.

    XMLTV_FILE is the path of the file to read; config.XMLTV_FILE is used
    when it is not given. popup_dialog, when given, receives progress
    updates through its update_progress(text, fraction) method.

    Channels come from config.TV_CHANNELS when that list is set, otherwise
    from the XMLTV file itself.

    Returns a TvGuide, or None if the channels had to be read from the file
    and that failed. When no programmes can be read, the guide is returned
    with its channels only.
    """
    if not XMLTV_FILE:
        XMLTV_FILE = config.XMLTV_FILE

    # Create a new guide
    guide = TvGuide()

    # Is there a file to read from?
    if os.path.isfile(XMLTV_FILE):
        gotfile = 1
        guide.timestamp = os.path.getmtime(XMLTV_FILE)
    else:
        logger.debug('XMLTV file (%s) missing!', XMLTV_FILE)
        gotfile = 0
    if popup_dialog:
        popup_dialog.update_progress(_('Reading channels'), 0.0)
    # Add the channels that are in the config list, or all if the
    # list is empty
    if config.TV_CHANNELS:
        logger.debug('Only adding channels in TV_CHANNELS to TvGuide')

        for data in config.TV_CHANNELS:
            (id, displayname, tunerid) = data[:3]
            c = TvChannel(id, displayname, tunerid)

            # Handle the optional time-dependent station info
            c.times = []
            # NOTE(review): data[3:4] is a slice holding at most one element,
            # so its length can never be 3; as written this branch looks
            # unreachable and c.times stays empty -- confirm the intended check.
            if len(data) > 3 and len(data[3:4]) == 3:
                for (days, start_time, stop_time) in data[3:4]:
                    c.times.append((days, int(start_time), int(stop_time)))
            guide.add_channel(c)


    else: # Add all channels in the XMLTV file
        logger.debug('Adding all channels to TvGuide')

        xmltv_channels = None
        if gotfile:
            # Don't read the channel info unless we have to, takes a long time!
            # NOTE(review): the handle returned by util.gzopen is not closed
            # here, unlike the one used for the programmes further down.
            xmltv_channels = xmltv.read_channels(util.gzopen(XMLTV_FILE))

        # Was the guide read successfully?
        if not xmltv_channels:
            return None     # No

        for chan in xmltv_channels:
            id = chan['id'].encode(config.LOCALE, 'ignore')
            if ' ' in id:
                # Assume the format is "TUNERID CHANNELNAME"
                tunerid = id.split()[0]       # XXX Educated guess
                displayname = id.split()[1]   # XXX Educated guess
            else:
                # No space in the id: apply the same split to the first
                # display name instead.
                display_name = chan['display-name'][0][0]
                if ' ' in display_name:
                    tunerid = display_name.split()[0]
                    displayname = display_name.split()[1]
                else:
                    # Nothing to split: use a placeholder tuner id text.
                    tunerid = _('REPLACE WITH TUNERID FOR %s') % display_name
                    displayname = display_name

            c = TvChannel(id, displayname, tunerid)
            guide.add_channel(c)

    if popup_dialog:
        popup_dialog.update_progress(_('Reading programmes'), 0.25)

    xmltv_programs = None
    if gotfile:
        logger.debug('reading \"%s\" xmltv data', XMLTV_FILE)
        f = util.gzopen(XMLTV_FILE)
        xmltv_programs = xmltv.read_programmes(f)
        f.close()

    # Was the guide read successfully?
    if not xmltv_programs:
        return guide    # Return the guide, it has the channels at least...

    # Only programmes of channels already in the guide are processed.
    needed_ids = []
    for chan in guide.chan_dict:
        needed_ids.append(chan)

    logger.debug('creating guide for %s', needed_ids)
    if popup_dialog:
        popup_dialog.update_progress(_('Processing programmes'), 0.50)
    for p in xmltv_programs:
        if not p['channel'] in needed_ids:
            continue
        try:
            channel_id = p['channel']
            # The "key in p and value or ''" expressions below yield '' when
            # the key is missing and also when the value itself is falsy.
            date = 'date' in p and Unicode(p['date']) or ''
            start = ''
            pdc_start = ''
            stop = ''
            title = Unicode(p['title'][0][0])
            desc = 'desc' in p and Unicode(util.format_text(p['desc'][0][0])) or ''
            sub_title = 'sub-title' in p and Unicode(p['sub-title'][0][0]) or ''
            categories = 'category' in p and [ cat[0] for cat in p['category'] ] or ''
            advisories = []
            ratings = {}

            # Add credits to the description
            if 'credits' in p:
                desc += Unicode('\n\n')
                desc += _('Credits :\n')
                credits = p['credits']
                if 'actor' in credits:
                    desc += Unicode('\n')
                    desc += _('Actors :\n')
                    for actor in credits['actor']:
                        desc += Unicode(actor + '\n')
                if 'director' in credits:
                    desc += Unicode('\n')
                    directors = credits['director']
                    # A single director goes on one line, several are listed.
                    if len(directors) == 1:
                        desc += _('Director : %s') % directors[0]
                    else:
                        desc += _('Directors :\n')
                        for d in directors:
                            desc += Unicode(d + '\n')

            # Ratings of the 'advisory' system are collected separately from
            # the per-system ratings dict.
            if 'rating' in p:
                for r in p['rating']:
                    if r.get('system') == 'advisory':
                        advisories.append(String(r.get('value')))
                        continue
                    ratings[String(r.get('system'))] = String(r.get('value'))
            try:
                start = timestr2secs_utc(p['start'])
                # Without an explicit (or with a falsy) pdc_start the normal
                # start time is used.
                pdc_start = 'pdc_start' in p and timestr2secs_utc(p['pdc_start']) or start
                try:
                    stop = timestr2secs_utc(p['stop'])
                except:
                    # Fudging end time
                    # Built from the first 8 characters of the start string,
                    # '235900' and characters 14-17 -- presumably date,
                    # 23:59:00 and the timezone offset; confirm the format.
                    # NOTE(review): the bare except also hides errors other
                    # than a missing 'stop' key.
                    stop = timestr2secs_utc(p['start'][0:8] + '235900' + p['start'][14:18])
            except EpgException, why:
                # A programme whose times cannot be converted is skipped.
                logger.warning('EpgException: %s', why)
                continue

            # fix bad German titles to make favorites work
            # A title ending in '. Teil' loses that suffix; if the last word
            # left is an integer it is moved into the sub title as 'Teil N'.
            if title.endswith('. Teil'):
                title = title[:-6]
                if title.rfind(' ') > 0:
                    try:
                        part = int(title[title.rfind(' ')+1:])
                        title = title[:title.rfind(' ')].rstrip()
                        if sub_title:
                            sub_title = u'Teil %s: %s' % (part, sub_title)
                        else:
                            sub_title = u'Teil %s' % part
                    except Exception, e:
                        print 'Teil:', e

            prog = TvProgram(channel_id, start, pdc_start, stop, title, sub_title, desc, categories, ratings)
            prog.advisories = advisories
            prog.date = date
            guide.add_program(prog)
        tts = gTTS(text=text_sentences[i], lang='zh', slow=False)
        tts.save(audio_dir + '/' + str(i) + '.mp3')
        print('\n', text_sentences[i], '\n')
        print("created " + str(i) + " audio file")

    text_clip_list = []
    audio_clip_list = []
    silence = AudioFileClip('./audio/silence.mp3').subclip(0, 0.1)
    audio_clip_list.append(silence)

    for i in range(0, len(text_sentences)):
        sent_audio_clip = AudioFileClip(audio_dir + '/' + str(i) + '.mp3')
        print("length of audio: " + str(i) + " = ", sent_audio_clip.duration)
        audio_clip_list.append(sent_audio_clip)
        sent_txt_clip = TextClip(
            format_text(text_sentences[i]),
            font='ArialUnicode',
            fontsize=150,
            color='yellow',
            bg_color='black',
            stroke_width=30).set_pos('bottom').set_duration(
                sent_audio_clip.duration).resize(width=1000)
        text_clip_list.append(sent_txt_clip)

    audio_clip = concatenate_audioclips(audio_clip_list)

    file_names = []
    for i in range(0, len(folder_names)):
        files = (fn for fn in os.listdir(picture_dir + '/' + folder_names[i])
                 if fn.endswith('.jpg') or fn.endswith('.png')
                 or fn.endswith('.PNG') or fn.endswith('.JPG')
Beispiel #13
0
def load_guide(verbose=True, XMLTV_FILE=None):
    """
    Load a guide from the raw XMLTV file using the xmltv.py support lib.
    Returns a TvGuide or None if an error occurred
    """
    if not XMLTV_FILE:
        XMLTV_FILE = config.XMLTV_FILE

    # Create a new guide
    guide = epg_types.TvGuide()

    # Is there a file to read from?
    if os.path.isfile(XMLTV_FILE):
        gotfile = 1
        guide.timestamp = os.path.getmtime(XMLTV_FILE)
    else:
        _debug_('XMLTV file (%s) missing!' % XMLTV_FILE)
        gotfile = 0

    # Add the channels that are in the config list, or all if the
    # list is empty
    if config.TV_CHANNELS:
        if verbose:
            _debug_('epg_xmltv.py: Only adding channels in list')

        for data in config.TV_CHANNELS:
            (id, disp, tunerid) = data[:3]
            c = epg_types.TvChannel()
            c.id = id
            c.displayname = disp
            c.tunerid = tunerid

            # Handle the optional time-dependent station info
            c.times = []
            if len(data) > 3 and len(data[3:4]) == 3:
                for (days, start_time, stop_time) in data[3:4]:
                    c.times.append((days, int(start_time), int(stop_time)))
            guide.AddChannel(c)

    else:  # Add all channels in the XMLTV file
        if verbose:
            _debug_('epg_xmltv.py: Adding all channels')
        xmltv_channels = None
        if gotfile:
            # Don't read the channel info unless we have to, takes a long time!
            xmltv_channels = xmltv.read_channels(util.gzopen(XMLTV_FILE))

        # Was the guide read successfully?
        if not xmltv_channels:
            return None  # No

        for chan in xmltv_channels:
            id = chan['id'].encode(config.LOCALE, 'ignore')
            c = epg_types.TvChannel()
            c.id = id
            if ' ' in id:
                # Assume the format is "TUNERID CHANNELNAME"
                c.displayname = id.split()[1]  # XXX Educated guess
                c.tunerid = id.split()[0]  # XXX Educated guess
            else:
                displayname = chan['display-name'][0][0]
                if ' ' in displayname:
                    c.displayname = displayname.split()[1]
                    c.tunerid = displayname.split()[0]
                else:
                    c.displayname = displayname
                    c.tunerid = _('REPLACE WITH TUNERID FOR %s') % displayname

            guide.AddChannel(c)

    xmltv_programs = None
    if gotfile:
        if verbose:
            _debug_('reading \"%s\" xmltv data' % XMLTV_FILE)
        f = util.gzopen(XMLTV_FILE)
        xmltv_programs = xmltv.read_programmes(f)
        f.close()

    # Was the guide read successfully?
    if not xmltv_programs:
        return guide  # Return the guide, it has the channels at least...

    needed_ids = []
    for chan in guide.chan_dict:
        needed_ids.append(chan)

    if verbose:
        _debug_('creating guide for %s' % needed_ids)

    for p in xmltv_programs:
        if not p['channel'] in needed_ids:
            continue
        try:
            prog = epg_types.TvProgram()
            prog.channel_id = p['channel']
            prog.title = Unicode(p['title'][0][0])
            if p.has_key('date'):
                prog.date = Unicode(p['date'])
            if p.has_key('category'):
                prog.categories = [cat[0] for cat in p['category']]
            if p.has_key('rating'):
                for r in p['rating']:
                    if r.get('system') == 'advisory':
                        prog.advisories.append(String(r.get('value')))
                        continue
                    prog.ratings[String(r.get('system'))] = String(
                        r.get('value'))
            if p.has_key('desc'):
                prog.desc = Unicode(util.format_text(p['desc'][0][0]))
            if p.has_key('sub-title'):
                prog.sub_title = p['sub-title'][0][0]
            try:
                prog.start = timestr2secs_utc(p['start'])
                if p.has_key('pdc_start'):
                    prog.pdc_start = timestr2secs_utc(p['pdc_start'])
                else:
                    prog.pdc_start = prog.start
                try:
                    prog.stop = timestr2secs_utc(p['stop'])
                except:
                    # Fudging end time
                    prog.stop = timestr2secs_utc(p['start'][0:8] + '235900' + \
                                                 p['start'][14:18])
            except EPG_TIME_EXC:
                continue
            # fix bad German titles to make favorites working
            if prog.title.endswith('. Teil'):
                prog.title = prog.title[:-6]
                if prog.title.rfind(' ') > 0:
                    try:
                        part = int(prog.title[prog.title.rfind(' ') + 1:])
                        prog.title = prog.title[:prog.title.rfind(' ')].rstrip(
                        )
                        if prog.sub_title:
                            prog.sub_title = u'Teil %s: %s' % (part,
                                                               prog.sub_title)
                        else:
                            prog.sub_title = u'Teil %s' % part
                    except Exception, e:
                        print 'Teil:', e

            guide.AddProgram(prog)
        except: