def set_from_list(self, entrada, settings):
    """Initialize the object from a list of attributes.

    If the column order in the input spreadsheet changes, this is where
    the reading order must be updated. This algorithm assumes the data
    is fully contiguous: first the basic-data block (spanning 7
    columns), then the theory block (spanning 10 columns plus one
    column per week), and finally the practice block (same width as
    the theory block).
    """
    self.curriculum = normalize_str(entrada[0])
    self.jornada = normalize_str(entrada[1]).upper()
    self.nivel = normalize_num(entrada[2])
    self.siglas = normalize_str(entrada[3])
    self.asignatura = normalize_str(entrada[4])
    self.creditos = normalize_num(entrada[5])
    self.horas = normalize_num(entrada[6])
    # Basic data: 7 columns
    # Theory data: 10 columns + weeks
    # Practice data: 10 columns + weeks
    ti = settings.ini_columnas + 7
    tf = ti + 10 + settings.semanas
    pi = tf
    pf = pi + 10 + settings.semanas
    self.set_parte_teorica(entrada[ti:tf], settings.semanas)
    self.set_parte_practica(entrada[pi:pf], settings.semanas)
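A quick sanity check of the slice arithmetic above. The concrete values are invented for illustration; `ini_columnas = 0` and `semanas = 18` are assumptions, not taken from any real settings file:

# Minimal sketch: with ini_columnas = 0 and semanas = 18, the theory
# block spans columns 7..34 and the practice block columns 35..62.
ini_columnas, semanas = 0, 18  # assumed example values
ti = ini_columnas + 7          # 7
tf = ti + 10 + semanas         # 35 (exclusive end of the theory slice)
pi = tf                        # 35
pf = pi + 10 + semanas         # 63 (exclusive end of the practice slice)
assert (tf - ti) == (pf - pi) == 10 + semanas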
def get_phrases(cls, search='') -> List[str]:
    if not cls.phrases_cache:
        cls.refresh_cache()
    # Normalize the search term once instead of per phrase
    normalized_search = normalize_str(search)
    return [
        phrase for phrase in cls.phrases_cache
        if normalized_search in normalize_str(phrase)
    ]
def get_most_similar(cls, text: str) -> Tuple[str, int]:
    # Note: max() raises ValueError if the phrase cache is empty.
    phrases = cls.get_phrases()
    normalized_input_text = normalize_str(text)
    return max(
        ((phrase, fuzz.WRatio(normalized_input_text, normalize_str(phrase)))
         for phrase in phrases),
        key=lambda x: x[1],
    )
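A hedged usage sketch of the scoring above. It assumes a `rapidfuzz`/`fuzzywuzzy`-style `fuzz.WRatio(s1, s2)` returning an integer similarity in 0..100; the phrase list is invented:

# Illustrative only: exact scores depend on the fuzz implementation in use.
from rapidfuzz import fuzz  # or: from fuzzywuzzy import fuzz

phrases = ["buenos dias", "buenas noches"]
best = max(
    ((p, fuzz.WRatio("buenos dia", p)) for p in phrases),
    key=lambda x: x[1],
)
# best == ("buenos dias", <score close to 100>)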
def read_ATEPC(fname, cv=10, clean_string=True):
    if not os.path.isfile(fname):
        raise IOError("[!] Data %s not found" % fname)
    tree = ET.parse(fname)
    root = tree.getroot()
    vocab = defaultdict(float)
    revs = []
    conflict_sents = []
    # prepare vocabulary
    for sentence in root:
        text = sentence.find('text').text
        asp_terms_tag = sentence.findall('aspectTerms')
        contain_conflict = False
        asp_terms = []
        if len(asp_terms_tag) != 0:
            for asp_term_tag in asp_terms_tag[0].findall('aspectTerm'):
                from_idx = int(asp_term_tag.get('from'))
                to_idx = int(asp_term_tag.get('to'))
                # Wrap the term in "xxx" sentinels so its boundaries
                # survive normalization, then strip them afterwards.
                tmp = "xxx " + text[from_idx:to_idx] + " xxx"
                asp_term = normalize_str(tmp, clean_string=clean_string)
                asp_term = " ".join(asp_term.split()[1:-1])
                # Compare as str: encoding to bytes here would make the
                # "conflict" check always fail under Python 3.
                polarity = asp_term_tag.get('polarity')
                if polarity == "conflict":
                    contain_conflict = True
                else:
                    asp_terms.append({
                        "asp_term": asp_term,
                        "from_idx": from_idx,
                        "to_idx": to_idx,
                        "polarity": polarity
                    })
        if contain_conflict:
            sentid = sentence.get("id")
            conflict_sents.append(sentid)
            continue
        else:
            pre_text = normalize_str(text, clean_string=clean_string)
            X, Y, dept = gen_sequence_label(pre_text, aspect_terms=asp_terms)
            revs.append({
                "X": X,
                "Y": Y,
                "dept": dept,
                "no_aspterms": len(asp_terms),
                "no_words": len(X),
                "split": np.random.randint(0, cv)
            })
            uni_words = set(X)
            for word in uni_words:
                vocab[word] += 1
    return revs, vocab
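The "xxx" sentinel trick above deserves a small demo. This is a sketch with a stand-in normalizer; the real `normalize_str` is not shown in this file, so `fake_normalize` is an assumption:

# Hypothetical stand-in for normalize_str: lowercase and collapse spaces.
def fake_normalize(s, clean_string=True):
    return " ".join(s.lower().split())

text = "The Fajitas  were GREAT"
from_idx, to_idx = 4, 11  # character offsets of "Fajitas"
tmp = "xxx " + text[from_idx:to_idx] + " xxx"
term = " ".join(fake_normalize(tmp).split()[1:-1])
# term == "fajitas": the sentinels keep the term's token boundaries
# intact even when normalization trims or collapses edge whitespace.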
def leer_llaves(planilla, hojas_planilla, settings):
    """Spreadsheet pre-processing: collect keys.

    Reads every key in the spreadsheet and returns them in a sorted
    list. The list keeps repeated keys on purpose, so the total number
    of occurrences of each key can be counted later.
    """
    llaves = []
    for worksheet_name in hojas_planilla:
        worksheet = planilla.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows
        num_cols = worksheet.ncols
        # Read the sheet rows, skipping the single header row
        for current_row in range(1, num_rows):
            fila_actual = []
            # Walk the columns
            for current_col in range(num_cols):
                fila_actual.append(
                    normalize_str(worksheet.cell(current_row, current_col).value))
            cursable_actual = Cursable(fila_actual)
            # Filter out rows with otype = ST
            if cursable_actual.otype.upper() != 'ST':
                demanda = Demanda()
                demanda.set_from_cursable(cursable_actual, settings)
                llaves.append(demanda.llave)
    llaves.sort()
    return llaves
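Counting by keeping duplicates and later calling `llaves.count(...)` per key (as `main` does below) is quadratic; a `collections.Counter` gives the same totals in one pass. A sketch with invented keys, not a change to the original flow:

from collections import Counter

llaves = ['D-MAN-INF101', 'D-MAN-INF101', 'V-SEM-MAT200']  # invented keys
conteo = Counter(llaves)
assert conteo['D-MAN-INF101'] == 2 == llaves.count('D-MAN-INF101')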
def __init__(self, data_fname):
    self.schools = []
    self.schools_by_city = defaultdict(list)
    self.schools_by_area = defaultdict(list)
    # newline='' is what the csv module expects for text-mode files
    self.csv_reader = csv.reader(open(data_fname, newline=''), delimiter=",")
    next(self.csv_reader)  # Skip header
    for school_data in self.csv_reader:
        lat = school_data[10]
        lon = school_data[11]
        if not (lat and lon):
            log.debug("Invalid lat/lon data: %s (id %s)",
                      school_data[1], school_data[0])
            continue
        city = normalize_str(school_data[7])
        if city == "cordoba":
            city = "capital"  # DNE data says city='capital'
        school = School(
            min_edu_id=school_data[0],   # Min. Edu. ID
            name=school_data[1],         # nombre
            address=school_data[4],      # domicilio
            city=city,                   # localidad
            area=school_data[8],         # departamento
            postal_code=school_data[5],  # cp
            lat=school_data[10],         # lat
            lon=school_data[11],         # lon
        )
        self.schools.append(school)
        self.schools_by_city[school.city].append(school)
        self.schools_by_area[school.area].append(school)
def finish(self):
    """
    All fields have been set to this song. Write the song to disk.
    """
    if not self.check_complete():
        self.set_defaults()
        return False
    log.info('committing song {title} to store'.format(title=self.title))
    song = Song()
    song.title = self.title
    song.alternate_title = self.alternate_title
    # Values will be set when cleaning the song.
    song.search_title = ''
    song.search_lyrics = ''
    song.verse_order = ''
    song.song_number = self.song_number
    verses_changed_to_other = {}
    sxml = SongXML()
    other_count = 1
    for (verse_def, verse_text, lang) in self.verses:
        if verse_def[0].lower() in VerseType.tags:
            verse_tag = verse_def[0].lower()
        else:
            new_verse_def = '{tag}{count:d}'.format(
                tag=VerseType.tags[VerseType.Other], count=other_count)
            verses_changed_to_other[verse_def] = new_verse_def
            other_count += 1
            verse_tag = VerseType.tags[VerseType.Other]
            log.info('Versetype {old} changing to {new}'.format(
                old=verse_def, new=new_verse_def))
            verse_def = new_verse_def
        sxml.add_verse_to_lyrics(verse_tag, verse_def[1:],
                                 normalize_str(verse_text), lang)
    song.lyrics = str(sxml.extract_xml(), 'utf-8')
    if not self.verse_order_list and self.verse_order_list_generated_useful:
        self.verse_order_list = self.verse_order_list_generated
    self.verse_order_list = [
        verses_changed_to_other.get(v, v) for v in self.verse_order_list
    ]
    song.verse_order = ' '.join(self.verse_order_list)
    song.copyright = self.copyright
    song.comments = self.comments
    song.theme_name = self.theme_name
    song.ccli_number = self.ccli_number
    song.authors = self.authors
    if self.song_book_name:
        song.song_book_name = self.song_book_name
    song.topics = []
    for topic_text in self.topics:
        if not topic_text:
            continue
        song.topics.append(topic_text)
    # We need to save the song now, before adding the media files, so that
    # we know where to save the media files to.
    if isinstance(self.store, list):
        self.store.append(song)
    self.set_defaults()
    return True
def add_usage_by_result_id(cls, result_id: str) -> None:
    is_audio = result_id.startswith('audio-')
    result_id = normalize_str(
        result_id[len('audio-short-'):] if is_audio
        else result_id[len('short-'):])
    words = result_id.split(", ")
    phrases = cls.refresh_cache()
    for word in words:
        phrase: Optional['Phrase'] = next(
            iter(p for p in phrases if normalize_str(p.text) == word), None)
        if phrase:
            if is_audio:
                phrase.audio_daily_usages += 1
                phrase.audio_usages += 1
            else:
                phrase.daily_usages += 1
                phrase.usages += 1
            phrase.save()
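The id format this handler expects, as implied by the prefix stripping above: the "short" flow packs the matched words into the id, comma-separated. The concrete id below is invented:

# Illustrative parse of an invented id, mirroring the code above.
result_id = "audio-short-hola, buenos dias"
is_audio = result_id.startswith('audio-')  # True
payload = result_id[len('audio-short-'):]  # "hola, buenos dias"
words = payload.split(", ")                # ["hola", "buenos dias"]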
def set_curso(self, datos_basicos, datos_semanas, semanas):
    tipos_sala = normalize_str(datos_basicos[0])
    tipos_sala = tipos_sala.split('/')
    self.tipo_sala = []
    for tipo_sala in tipos_sala:
        tipo_sala = tipo_sala.strip().encode('utf-8').upper()
        self.tipo_sala.append(tipo_sala)
    self.capacidad = normalize_num(datos_basicos[1])
    self.sesiones = normalize_num(datos_basicos[2])
    self.franja_horaria = normalize_str(datos_basicos[3])
    self.semanas = semanas
    # Assign weekday restrictions (0: Monday - 5: Saturday)
    for i in range(6):
        self.restriccion_dias[i] = normalize_num(datos_basicos[i + 4])
    # Assign modules per week
    for semana in datos_semanas:
        self.modulos_semana.append(normalize_num(semana))
def long_result_to_audio_result(result: InlineQueryResultArticle) -> InlineQueryResultVoice:
    title = result.title
    audio_url = get_audio_url(title)
    if not audio_url:
        text = result.input_message_content.message_text
        speech = polly_client.synthesize_speech(VoiceId='Enrique',
                                                OutputFormat='ogg_vorbis',
                                                Text=text)
        audio_url = upload_audio(speech['AudioStream'].read(), title)
    result_id = normalize_str(f"audio-{result.id}")
    return InlineQueryResultVoice(
        result_id[:63],  # Telegram caps inline result ids at 64 bytes
        audio_url,
        title,
    )
def __init__(self, name='', address='', city='', area='', postal_code='',
             province='CORDOBA', dne_id=None, min_edu_id=None,
             lat=None, lon=None):
    self.name = normalize_str(name)
    self.address = normalize_str(address)
    self.city = normalize_str(city)
    self.area = normalize_str(area)  # Departamento
    self.province = normalize_str(province)  # Always CORDOBA in MinEdu.data
    self.postal_code = normalize_str(postal_code)
    self.dne_id = dne_id
    self.min_edu_id = min_edu_id
    self.lat = lat and float(lat)
    self.lon = lon and float(lon)
def check_surroundings(txt, span, original_annot, n_chars, n_words,
                       original_label, predictions, pos_matrix):
    '''
    DESCRIPTION: explore the surroundings of the match. Do not care about
    extra whitespaces or punctuation signs in the middle of the annotation.
    '''
    ## 1. Get normalized surroundings ##
    large_span = txt[max(0, span[0] - n_chars):min(span[1] + n_chars, len(txt))]

    # Remove half-caught words at both edges of the window
    first_space = re.search('( |\n)', large_span).span()[1]
    last_space = (len(large_span) -
                  re.search('( |\n)', large_span[::-1]).span()[0])
    large_span_reg = large_span[first_space:last_space]

    # Tokenize text span
    token_span2id, id2token_span_pos, token_spans = tokenize_span(
        large_span_reg, n_words)

    # Normalize
    original_annotation_processed = normalize_str(original_annot, min_upper)
    token_span_processed2token_span = normalize_tokens(token_spans, min_upper)

    ## 2. Match ##
    try:
        res = token_span_processed2token_span[original_annotation_processed]
        id_ = token_span2id[res]
        pos = id2token_span_pos[id_]
        # Map window-relative offsets back to document offsets
        off0 = (pos[0] + first_space + max(0, span[0] - n_chars))
        off1 = (pos[1] + first_space + max(0, span[0] - n_chars))
        # Check new annotation is not contained in a previously stored new annotation
        if not any([(item[0] <= off0) & (off1 <= item[1]) for item in pos_matrix]):
            # STORE PREDICTION and eliminate old predictions contained in the new one.
            predictions, pos_matrix = \
                store_prediction(pos_matrix, predictions, off0, off1,
                                 original_label, df_annot, original_annot, txt)
    except KeyError:
        # No token window normalizes to the annotation: nothing to store
        pass

    return predictions, pos_matrix
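The offset bookkeeping above maps a match found inside the trimmed window back to absolute document offsets. A worked toy case with invented values:

# The window starts n_chars before the match, then first_space trims a
# half-caught word, so a window-relative position p maps back to the
# document offset p + first_space + max(0, span[0] - n_chars).
span, n_chars, first_space = (100, 110), 20, 3
pos = (5, 12)  # match position relative to the trimmed window
off0 = pos[0] + first_space + max(0, span[0] - n_chars)  # 5 + 3 + 80 = 88
off1 = pos[1] + first_space + max(0, span[0] - n_chars)  # 12 + 3 + 80 = 95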
def add_usage_by_result_id(cls, result_id: str) -> None:
    if 'long-bad-search-' in result_id:
        return
    is_audio = result_id.startswith('audio-')
    result_id = (result_id[len('audio-long-'):] if is_audio
                 else result_id[len('long-'):])
    phrases = cls.refresh_cache()
    phrase: Optional['Phrase'] = next(
        iter(p for p in phrases if result_id in normalize_str(p.text)), None)
    if phrase:
        if is_audio:
            phrase.audio_daily_usages += 1
            phrase.audio_usages += 1
        else:
            phrase.daily_usages += 1
            phrase.usages += 1
        phrase.save()
def leer_bloques(planilla, settings, atributos):
    """Builds the list of every possible block in the spreadsheet."""
    # List holding the possible blocks ('D-MAN', 'V-14', ...)
    bloques = []
    # Read the sheet names of the spreadsheet
    hojas_planilla = planilla.sheet_names()
    for worksheet_name in hojas_planilla:
        worksheet = planilla.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows
        num_cols = worksheet.ncols
        # Read the sheet rows
        for current_row in range(settings.ini_filas, num_rows):
            fila_actual = []
            # Walk the columns
            for current_col in range(num_cols):
                fila_actual.append(
                    normalize_str(worksheet.cell(current_row, current_col).value))
            registro_actual = Registro()
            registro_actual.set_from_list(fila_actual, settings)
            bloque_teorico, bloque_practico = \
                registro_actual.procesar_bloques(atributos)
            if len(bloque_teorico) > 0:
                bloques.append(bloque_teorico)
            if len(bloque_practico) > 0:
                bloques.append(bloque_practico)
    # Deduplicate and sort the block list
    bloques = sorted(set(bloques))
    return bloques
def get_bloque(self, jornada):
    bloque = ''
    franjas = ['D-MAN', 'D-TAR', 'V-SEM', 'V-SAB']
    dias = ''
    for i in range(len(self.restriccion_dias)):
        if self.restriccion_dias[i] == 1:
            dias = dias + normalize_str(i + 1)
    # Case 1: non-empty, valid time band
    if len(self.franja_horaria) > 0 and self.franja_horaria in franjas:
        # If the band matches the shift (e.g. 'V-SEM' and 'V')
        # --> examine the days
        if jornada.upper() == self.franja_horaria[0].upper():
            if len(dias) == 0:
                bloque = self.franja_horaria
            elif len(dias) > 0:
                # Weekday-evening band with no Saturday (6) --> OK
                if self.franja_horaria == 'V-SEM' and '6' not in dias:
                    bloque = self.franja_horaria + '-' + dias
                # Saturday-evening band with Saturday (6) only --> OK
                if self.franja_horaria == 'V-SAB' and dias == '6':
                    bloque = self.franja_horaria + '-' + dias
                # Day band (morning or afternoon) with no Saturday (6) --> OK
                if (self.franja_horaria == 'D-MAN' or
                        self.franja_horaria == 'D-TAR') and '6' not in dias:
                    bloque = self.franja_horaria + '-' + dias
    # Case 2: empty time band
    else:
        if len(dias) > 0:
            if (jornada == 'V') or (jornada == 'D' and '6' not in dias):
                bloque = jornada + '-' + dias
        # elif len(dias) == 0:
        #     bloque = jornada
    return bloque.encode('utf-8')
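The block codes this produces look like '<band>-<days>', where the days are digits 1..6 (Monday..Saturday). A small sketch with invented restriction flags:

# Hypothetical day flags marking Monday through Friday.
restriccion_dias = [1, 1, 1, 1, 1, 0]
dias = ''.join(str(i + 1) for i, d in enumerate(restriccion_dias) if d == 1)
assert dias == '12345'
# With franja_horaria = 'V-SEM' and jornada = 'V' this yields 'V-SEM-12345';
# an empty band with jornada = 'D' and no Saturday yields 'D-12345'.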
def leer_referencias(nombre_archivo):
    """Reads the reference-course file."""
    ref_book = xlrd.open_workbook(nombre_archivo, encoding_override='LATIN1')
    asignaturas = []
    # Walk the sheets
    for s in ref_book.sheets():
        for row in range(1, s.nrows):  # skip the header row
            fila_actual = []
            # Walk the columns
            for current_col in range(s.ncols):
                fila_actual.append(normalize_str(s.cell(row, current_col).value))
            # Temporary course (with all its aliases)
            asignatura_actual = Asignatura(fila_actual)
            # Look the course up among the ones already seen
            asignatura_encontrada = False
            nasignaturas = len(asignaturas)
            for i in range(nasignaturas):
                # Course already seen --> merge in any new aliases
                if asignatura_actual.codigo == asignaturas[i].codigo:
                    asignaturas[i].add_alias(asignatura_actual.alias)
                    asignatura_encontrada = True
                    break
            # Course not seen yet --> add it (with all its aliases)
            if not asignatura_encontrada:
                asignaturas.append(asignatura_actual)
    return asignaturas
def clean_ner_output(df, remove_only_numbers=True):
    '''
    Parameters
    ----------
    df: DataFrame
        Columns: ['filename', 'label', 'offset1', 'offset2', 'span',
                  'NSCO', 'USCO']

    Returns
    -------
    df_dedup: DataFrame
        Pandas DataFrame with no empty spans and no duplicated entries.
        Columns: ['filename', 'label', 'offset1', 'offset2', 'span',
                  'span_normalized', 'span_lower', 'NSCO', 'USCO']
    '''
    # 2.1. Normalize span (lowercase, remove extra whitespaces, remove punctuation)
    df['span_normalized'] = df['span'].\
        apply(lambda x: normalize_str(x, 3, keep_accents=False))
    # Needed because doctors do not place accents properly
    df['span_lower'] = df['span'].\
        apply(lambda x: normalize_str(x, 3, keep_accents=True,
                                      keep_punctuation=True))
    # Needed to use Carlos' normalization tool and to show the results
    '''
    # 2. OPT: Remove MEDDOCAN predictions
    meddocan_labels = ['FECHAS', 'HOSPITAL', 'TERRITORIO',
                       'EDAD-SUJETO-ASISTENCIA', 'CALLE', 'PAIS',
                       'NOMBRE-SUJETO-ASISTENCIA', 'PROFESION',
                       'SEXO-SUJETO-ASISTENCIA', 'NOMBRE-PERSONAL-SANITARIO',
                       'ID-SUJETO-ASISTENCIA', 'CORREO-ELECTRONICO']
    df_not_demo = df.loc[df['label'].isin(meddocan_labels) == False, :].copy()
    df_demo = df.loc[df['label'].isin(meddocan_labels) == True, :].copy()

    # 2. OPT: NORMALIZABLES --> FARMACO
    df_not_demo['label'].replace('NORMALIZABLES', 'FARMACO', inplace=True)

    # 2. OPT: Remove UNCLEAR
    df_not_demo = df_not_demo.drop(
        df_not_demo.loc[(df_not_demo['label'] == "UNCLEAR")].index)
    '''
    # 2. OPT: Remove PROTEINAS and "covid19"
    df = df.drop(df.loc[((df['label'] == "PROTEINAS") &
                         (df['span_normalized'].isin(
                             ["covid19", "sarscov2", "cov2"])))].index)

    # 2.2. Remove empty or null spans
    df['span_normalized'].replace('', np.nan, inplace=True)
    df.dropna(inplace=True)

    # 2.3 OPT: Remove spans that are only numbers
    def str2Float(x):
        try:
            float(x)
            return np.nan  # parses as a number -> drop it
        except ValueError:
            return x
    if remove_only_numbers:
        span_norm_not_numbers = df['span_normalized'].apply(str2Float)
        df = df.drop(['span_normalized'], axis=1)
        df = df.assign(span_normalized=span_norm_not_numbers.values)
        df.dropna(inplace=True)

    # 2.4. Reset index
    df.reset_index(drop=True, inplace=True)

    # 2.5 OPT: If one span_normalized has several labels, keep the most frequent
    # Build a dict mapping each duplicated span_normalized to its most frequent label
    if df.label.value_counts().shape[0] > 1:
        aux = df.groupby(['span_normalized', 'label'], as_index=False)\
            .count()[['span_normalized', 'label', 'filename']].copy()
        aux.columns = ['span_normalized', 'label', 'count']
        aux = aux.sort_values(by=['count'])
        aux = aux.drop_duplicates(['span_normalized'], keep='last').copy()
        span_norm2label = dict(zip(aux.span_normalized, aux.label))
        # Replace
        for idx, row in df.iterrows():
            span_norm = df.loc[idx, 'span_normalized']
            if span_norm in span_norm2label.keys():
                df.loc[idx, 'label'] = span_norm2label[span_norm]

    # 2.6. Remove duplicated entries & reset index
    df_dedup = df.\
        drop_duplicates(subset=['filename', 'label', 'span',
                                'offset1', 'offset2']).copy()
    df_dedup.reset_index(drop=True, inplace=True)
    df_dedup = df_dedup[['filename', 'label', 'offset1', 'offset2', 'span',
                         'span_normalized', 'span_lower', 'NSCO', 'USCO']]
    return df_dedup
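Step 2.5 resolves label conflicts by majority vote. A minimal reproduction on an invented three-row frame:

import pandas as pd

df = pd.DataFrame({
    'filename': ['a.txt', 'b.txt', 'c.txt'],
    'label': ['FARMACO', 'FARMACO', 'PROTEINAS'],
    'span_normalized': ['heparina', 'heparina', 'heparina'],
})
aux = df.groupby(['span_normalized', 'label'], as_index=False)\
    .count()[['span_normalized', 'label', 'filename']]
aux.columns = ['span_normalized', 'label', 'count']
# Ascending sort + keep='last' leaves the highest-count label per span.
aux = aux.sort_values(by=['count']).drop_duplicates(['span_normalized'],
                                                    keep='last')
print(dict(zip(aux.span_normalized, aux.label)))
# {'heparina': 'FARMACO'}: every 'heparina' row would be relabeled FARMACO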
report = None
if args.report:
    report = csv.writer(open(args.report, 'w'), delimiter=',')
    report.writerow(['dne_id', 'dne name', 'dne address', 'dne city',
                     'dne area', 'min_edu_id', 'min edu name',
                     'min edu address', 'min edu city', 'min edu area',
                     'lat', 'lon', 'match_str', 'density_str', 'flag',
                     'max_distance', 'origin.lat', 'origin.lon',
                     'destiny.lat', 'destiny.lon'])

next(com_nac_electoral_data)  # Skip header
for data in com_nac_electoral_data:
    # data[13]: dne_seccion_id. It is the ID of the area (departamento)
    area = ''
    if data[13]:
        area = normalize_str(DEPARTAMENTOS[int(data[13])])
    voting_place = School(
        name=data[11],        # establecimiento
        address=data[6],      # direccion
        city=data[9],         # localidad
        area=area,
        province=data[10],    # distrito
        postal_code=data[4],  # codigo_postal
        dne_id=data[0],       # D.N.E. ID
    )
    most_probable = geo_processor.get_most_probable(voting_place)
    schools_set = list(most_probable['schools_set'])
    density = 'probable'
    if schools_set:
def build_discord_tr_embed(comment: dict, cmds_args: dict) -> DiscordEmbed:
    """Creates a Discord embed for a Utopian task request.

    :param comment: Steem root post with task request
    :type comment: dict
    :param cmds_args: Parsed bot commands and arguments
    :type cmds_args: dict
    """
    category = get_category(comment, TASKS_PROPERTIES)
    color = 0
    type_ = None
    thumbnail = None
    if category is not None:
        color = int(TASKS_PROPERTIES[category]["color"][1:], 16)
        type_ = TASKS_PROPERTIES[category]["category"]
        thumbnail = TASKS_PROPERTIES[category]["image_url"]
    title = f'{comment["title"]}'
    description_parts = []
    if cmds_args.get("description") is not None:
        description_parts.append(cmds_args["description"].strip())
    # description_parts.append(
    #     f'*You can read [here]({build_comment_link(comment)}) the whole task by **{comment["author"]}**.*'
    # )
    description = "\n\n".join(description_parts)
    embed = DiscordEmbed(title=title, description=description)
    author = Account(comment["author"])
    embed.set_author(
        name=author.name,
        url=f"{UI_BASE_URL}/@{author.name}",
        icon_url=author.profile.get("profile_image"),
    )
    embed.set_color(color)
    embed.set_footer(text="Verified by Utopian.io team")
    embed.set_thumbnail(url=thumbnail)
    embed.set_timestamp()
    if type_ is not None:
        embed.add_embed_field(name="Task Type", value=type_.upper(), inline=True)
    status = None
    if cmds_args.get("status") is not None:
        status = cmds_args["status"]
        embed.add_embed_field(name="Status", value=status.upper(), inline=True)
    if status and status.upper() == "CLOSED":
        return embed
    if cmds_args.get("skills"):
        skills = normalize_str(cmds_args["skills"])
        embed.add_embed_field(name="Required skills", value=skills, inline=True)
    if cmds_args.get("discord") is not None:
        embed.add_embed_field(
            name="Discord", value=f'{cmds_args["discord"]}', inline=True
        )
    if cmds_args.get("bounty"):
        bounty = normalize_str(cmds_args["bounty"]).upper()
    else:
        bounty = "See the task details"
    embed.add_embed_field(name="Bounty", value=bounty, inline=True)
    if cmds_args.get("deadline"):
        deadline = cmds_args["deadline"]
    else:
        deadline = "Not specified"
    embed.add_embed_field(name="Due date", value=deadline, inline=True)
    is_in_progress = status and status.upper() == "IN PROGRESS"
    if is_in_progress and cmds_args.get("assignees"):
        assignees = normalize_str(cmds_args["assignees"]).lower()
        assignees_links = accounts_str_to_md_links(assignees)
        embed.add_embed_field(name="Assignees", value=assignees_links,
                              inline=False)
    if cmds_args.get("note") is not None:
        embed.add_embed_field(name="Misc", value=f'{cmds_args["note"]}',
                              inline=False)
    return embed
def procesar_planilla(planilla, hoja_salida, bloques, atributos, output_row,
                      settings, optativo):
    """Processes a curriculum or electives spreadsheet."""
    # Records processed so far
    registros_procesados = []
    # Read the sheet names of the spreadsheet
    hojas_planilla = planilla.sheet_names()
    # Open the log file
    log = open(settings.log, 'a')
    if optativo == 0:
        log.write('\nCurriculum spreadsheet\n')
    else:
        log.write('\nElectives spreadsheet\n')
    # Process the curriculum spreadsheet
    for worksheet_name in hojas_planilla:
        worksheet = planilla.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows
        num_cols = worksheet.ncols
        # Total number of records
        num_registros = 0
        # Number of records processed
        num_procesados = 0
        # Number of (consecutive) empty records
        num_vacios = 0
        log.write('Sheet: ' + worksheet_name + '\n')
        log.write('--Rows: ' + str(num_rows) + ', Columns: ' + str(num_cols) + '\n')
        # Read the sheet rows (skipping the first 3 header rows)
        for current_row in range(settings.ini_filas, num_rows):
            fila_actual = []
            # Walk the columns
            for current_col in range(num_cols):
                fila_actual.append(
                    normalize_str(worksheet.cell(current_row, current_col).value))
            registro_actual = Registro()
            registro_actual.set_from_list(fila_actual, settings)
            num_registros = num_registros + 1
            # Skip the row if any mandatory field is missing:
            # curriculum, shift, level, code or course name
            if registro_actual.curriculum == '' or registro_actual.jornada == '' \
                    or registro_actual.nivel == 0 \
                    or registro_actual.siglas == '' or registro_actual.asignatura == '':
                log.write('Row ' + str(current_row + 1) +
                          ' skipped: mandatory fields missing\n')
                num_vacios = num_vacios + 1
                # Stop the sheet once the maximum number of consecutive
                # empty rows is reached
                if num_vacios == settings.max_filas_vacias:
                    log.write('Reached the maximum number of consecutive empty rows ('
                              + str(settings.max_filas_vacias) + ') in sheet '
                              + worksheet_name + '\n')
                    break
                continue
            else:
                # Skip the row if one with the same curriculum-shift-course
                # key was already processed
                num_vacios = 0
                clave = (registro_actual.curriculum + '-' +
                         registro_actual.jornada + '-' + registro_actual.siglas)
                if clave in registros_procesados:
                    log.write('Row ' + str(current_row + 1) +
                              ' skipped: duplicated Curriculum-Shift-Course ('
                              + clave + ')\n')
                    continue
                filas_salida = procesar_fila(registro_actual, settings,
                                             atributos, bloques, optativo)
                if len(filas_salida) == 0:
                    log.write('Row ' + str(current_row + 1) +
                              ' has no valid theory or practice part\n')
                else:
                    registros_procesados.append(clave)
                    num_procesados = num_procesados + 1
                    # There may be one or two courses (theory and/or practice)
                    for rw in filas_salida:
                        escribir_arreglo(rw.export_list(), hoja_salida, output_row)
                        output_row = output_row + 1
        log.write('Total rows: ' + str(num_registros) + '\n')
        # Guard against empty sheets to avoid a division by zero
        porcentaje_procesados = (round(100.0 * num_procesados / num_registros, 2)
                                 if num_registros else 0.0)
        log.write('Total rows processed: ' + str(num_procesados) +
                  ' (' + str(porcentaje_procesados) + '%)\n')
    log.close()
    return output_row
def main(argv=None):
    settings, args = procesar_argumentos(argv)

    # Create the output and log paths if they do not exist
    if not os.path.exists(os.path.dirname(settings.log)):
        os.makedirs(os.path.dirname(settings.log))
    if not os.path.exists(os.path.dirname(settings.salida)):
        os.makedirs(os.path.dirname(settings.salida))

    # Read the spreadsheet
    planilla = xlrd.open_workbook(settings.entrada, encoding_override='LATIN1')
    # Read the spreadsheet sheets
    hojas_planilla = planilla.sheet_names()
    # Create the output file
    salida = xlwt.Workbook(encoding='LATIN-1')
    # Create the output sheet
    hoja_salida = salida.add_sheet('Sheet1')
    # Row counter for the output file
    output_row = 0
    # List of keys
    llaves = leer_llaves(planilla, hojas_planilla, settings)
    # Set of (unique) keys, to speed up lookups
    set_llaves = set(llaves)
    # List of keys already written
    llaves_insertadas = []
    # List of possible courses and their aliases
    asignaturas = leer_referencias(settings.referencias)

    # Process the spreadsheet
    for worksheet_name in hojas_planilla:
        worksheet = planilla.sheet_by_name(worksheet_name)
        num_rows = worksheet.nrows
        num_cols = worksheet.ncols
        print('Sheet: {}'.format(worksheet_name))
        print('--Rows: {}, Columns: {}'.format(num_rows, num_cols))
        # Read the sheet rows, skipping the single header row
        for current_row in range(1, num_rows):
            fila_actual = []
            # Walk the columns
            for current_col in range(num_cols):
                fila_actual.append(
                    normalize_str(worksheet.cell(current_row, current_col).value))
            cursable_actual = Cursable(fila_actual)
            # Filter out rows with otype = ST
            if cursable_actual.otype.upper() != 'ST':
                demanda = Demanda()
                demanda.set_from_cursable(cursable_actual, settings)
                # First row -> write the header
                if output_row == 0:
                    escribir_encabezado(demanda.export_header_list(),
                                        hoja_salida, output_row)
                    output_row = output_row + 1
                # Write the record to the output file if no record with the
                # same key (PROGRAM-SHIFT-COURSE) has been written yet
                if demanda.llave in set_llaves:
                    if demanda.llave not in llaves_insertadas:
                        # Count the number of students
                        demanda.alumnos = llaves.count(demanda.llave)
                        # Compute the cross-listing key from the value of
                        # demanda.asignatura
                        for asignatura in asignaturas:
                            lista_cruzada = asignatura.get_lista_cruzada(
                                demanda.asignatura)
                            if lista_cruzada != '':
                                demanda.lista_cruzada = lista_cruzada
                                break
                        escribir_arreglo(demanda.export_list(), hoja_salida,
                                         output_row)
                        llaves_insertadas.append(demanda.llave)
                        output_row = output_row + 1

    # Save the output file
    salida.save(settings.salida)
    return 0