def generateFileContent(self):
    """Build the abbreviations dictionary text from the PDF at pdfUrl.

    Only the lines between the "ABREVIATURAS:" heading and the symbols
    heading are used. Each one is split at its first colon into a comment
    (left side) and an entry (right side). The entry is expanded with
    self.parseEntry and every resulting form is mapped to the comment.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    import re

    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    reader = PdfParser(localPath)

    pluralPattern = re.compile(u"\(pl. ([^)]+)\)")
    parenthesisPattern = re.compile(u" *\([^)]*\)")

    commentsByForm = {}
    insideList = False

    for rawLine in reader.lines():
        text = rawLine.strip()
        # Blank lines and bare numbers carry no entry.
        if not text or text.isdigit():
            continue
        if not insideList:
            insideList = text == u"ABREVIATURAS:"
            continue
        if text == u"SÍMBOLOS:":
            break

        description, _, abbreviations = text.partition(u":")
        description = description.strip()
        abbreviations = abbreviations.strip()

        forms = set()
        # Plural forms are announced inside a "(pl. ...)" group.
        for pluralMatch in pluralPattern.finditer(abbreviations):
            forms.update(self.parseEntry(pluralMatch.group(1)))

        # Remove anything in parentheses.
        abbreviations = parenthesisPattern.sub(u"", abbreviations).strip()
        for form in self.parseEntry(abbreviations):
            if form.endswith(u"o/a."):
                # Split a combined "o/a." ending into its two variants.
                stem = form[:-4]
                forms.update((stem + u"a.", stem + u"o."))
            else:
                forms.add(form)

        for form in forms:
            commentsByForm[form] = description

    header = (
        u"# Relación de abreviaturas máis frecuentes\n"
        + u"# {}\n".format(pdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsByForm, u"abreviatura")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the symbols dictionary text from the style guide PDF.

    Lines are read, unstripped, between the heading of section 7.3.2 and
    the heading of section 7.4. A line ending in a space or a hyphen is
    glued to the line that follows it. Each resulting line is split into
    entry and comment at its first colon, or at its first space when it
    has no colon. The entry is expanded with self.parseEntry.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
        styleGuidePdfUrl)
    reader = PdfParser(localPath)

    commentsBySymbol = {}
    insideList = False
    joinWithNext = False
    pending = None

    for line in reader.lines():
        if not insideList:
            insideList = line == u"7.3.2 Listaxe de símbolos de uso común"
            continue

        # The wrap check happens before page furniture is skipped, so the
        # flag survives skipped lines.
        if line[-1:] in (u" ", u"-"):
            joinWithNext = True

        if line == u"7.4 O acrónimo":
            break
        if line.startswith(u"Ortografía e estilo") or line.isdigit():
            continue

        if pending:
            line = pending + line
            pending = None
        if joinWithNext:
            joinWithNext = False
            pending = line
            continue

        separator = u":" if u":" in line else u" "
        symbol, _, description = line.partition(separator)
        for form in self.parseEntry(symbol.strip()):
            commentsBySymbol[form] = description

    header = (
        u"# Relación de símbolos máis frecuentes\n"
        + u"# {}\n".format(styleGuidePdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsBySymbol, u"símbolo")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the administrative-language abbreviations dictionary text.

    The annex title is matched twice. After the first match, lines are
    skipped until one starts with "5", then until the title appears
    again (presumably the first match is a table of contents; this
    cannot be confirmed from here). After the second match, lines are
    consumed in pairs: a comment line followed by its entry line. This
    stops at the title of the next annex.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    import string

    annexTitle = u"ANEXO I. ABREVIATURAS MÁIS EMPREGADAS NA LINGUAXE ADMINISTRATIVA"

    filePath = contentCache.downloadFileIfNeededAndGetLocalPath(
        languageUsageCriteria2012PdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    comment = None

    for line in pdfParser.lines():
        line = line.strip()

        if parsingStage == 0:
            if line == annexTitle:
                parsingStage += 1
            continue

        if parsingStage == 1:
            if line.startswith("5"):
                parsingStage += 1
            continue

        if parsingStage == 2:
            if line == annexTitle:
                parsingStage += 1
            continue

        if line == u"ANEXO II. RELACIÓN DOS TOPÓNIMOS MÁIS HABITUAIS DE FÓRA DO ESTADO ESPAÑOL":
            break
        # Substring test: skips single-letter section headings and also
        # empty lines, because the empty string is contained in any string.
        # ascii_uppercase is used instead of string.uppercase, which only
        # exists on Python 2 and varies with the locale.
        if line in string.ascii_uppercase:
            continue
        if line in [u"CRITERIOS PARA O USO DA LINGUA", annexTitle]:
            continue
        if line.isdigit():
            continue

        if not comment:
            comment = line
        else:
            entries[line] = comment
            comment = None

    dictionary = u"# Relación de abreviaturas máis frecuentes na linguaxe administrativa\n"
    dictionary += u"# {}\n".format(languageUsageCriteria2012PdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
    return dictionary
def generateFileContent(self):
    """Build the acronyms dictionary text from the PDF at doubtsPdfUrl.

    Scanning goes through four stages driven by marker lines: the list
    title, a line "49", a line "5", and finally the data itself, which
    ends at a line "55". Data lines have the shape "comment: entry".
    When nothing follows the colon, the entry is taken from the next
    line.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    filePath = uvigoContentCache.downloadFileIfNeededAndGetLocalPath(
        doubtsPdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    commentCache = None

    for line in pdfParser.lines():

        if parsingStage == 0:
            if line == u"Relación de siglas e acrónimos máis frecuentes":
                parsingStage += 1
            continue

        if parsingStage == 1:
            if line == u"49":
                parsingStage += 1
            continue

        if parsingStage == 2:
            if line == u"5":
                parsingStage += 1
            continue

        if line == u"55":
            break

        if commentCache:
            # The previous line ended at the colon; this line is its entry.
            entries[line.strip()] = commentCache
            commentCache = None
        elif u":" in line:
            # Split at the last colon only, so a comment that itself
            # contains a colon no longer raises ValueError on unpacking.
            comment, entry = line.rsplit(u":", 1)
            entry = entry.strip()
            if entry:
                entries[entry] = comment.strip()
            else:
                commentCache = comment.strip()

    dictionary = u"# Relación de acrónimos e siglas máis frecuentes\n"
    dictionary += u"# {}\n".format(doubtsPdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"sigla"))
    return dictionary
def generateFileContent(self):
    """Build the abbreviations dictionary text from the PDF at pdfUrl.

    Data starts after a line reading "abril" and ends at the line
    "Manuel Bermúdez". A comment line comes first. While a comment is
    pending, a raw line that does not start with a space is appended to
    that comment; otherwise it is the entry line, which is expanded with
    self.parseEntry.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    lineIsContinuation = False
    comment = None

    for line in pdfParser.lines():
        # An empty line has no first character; indexing it below would
        # raise IndexError. It carries no data, so skip it.
        if not line:
            continue

        # Leading whitespace must be inspected before it is stripped.
        if comment and line[0] != u" " and parsingStage == 1:
            lineIsContinuation = True
        line = line.strip()

        if parsingStage == 0:
            if line == u"abril":
                parsingStage += 1
            continue

        if line == u"Manuel Bermúdez":
            break
        if line.startswith(u"Abreviaturas, siglas, símbolos e léxico"):
            continue
        if line.isdigit():
            continue

        if lineIsContinuation:
            lineIsContinuation = False
            comment += u" " + line
            continue

        if comment:
            for subentry in self.parseEntry(line):
                entries[subentry] = comment
            comment = None
        else:
            comment = line

    dictionary = u"# Relación de abreviaturas máis frecuentes\n"
    dictionary += u"# {}\n".format(pdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
    return dictionary
def generateFileContent(self):
    """Build the acronyms dictionary text from the PDF at doubtsPdfUrl.

    Scanning goes through four stages driven by marker lines: the list
    title, a line "49", a line "5", and finally the data itself, which
    ends at a line "55". Data lines have the shape "comment: entry".
    When nothing follows the colon, the entry is taken from the next
    line.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    filePath = uvigoContentCache.downloadFileIfNeededAndGetLocalPath(
        doubtsPdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    commentCache = None

    for line in pdfParser.lines():

        if parsingStage == 0:
            if line == u"Relación de siglas e acrónimos máis frecuentes":
                parsingStage += 1
            continue

        if parsingStage == 1:
            if line == u"49":
                parsingStage += 1
            continue

        if parsingStage == 2:
            if line == u"5":
                parsingStage += 1
            continue

        if line == u"55":
            break

        if commentCache:
            # The previous line ended at the colon; this line is its entry.
            entries[line.strip()] = commentCache
            commentCache = None
        elif u":" in line:
            # Split at the last colon only, so a comment that itself
            # contains a colon no longer raises ValueError on unpacking.
            comment, entry = line.rsplit(u":", 1)
            entry = entry.strip()
            if entry:
                entries[entry] = comment.strip()
            else:
                commentCache = comment.strip()

    dictionary = u"# Relación de acrónimos e siglas máis frecuentes\n"
    dictionary += u"# {}\n".format(doubtsPdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"sigla"))
    return dictionary
def generateFileContent(self):
    """Build the abbreviations dictionary text from the PDF at pdfUrl.

    Data starts after a line reading "abril" and ends at the line
    "Manuel Bermúdez". A comment line comes first. While a comment is
    pending, a raw line that does not start with a space is appended to
    that comment; otherwise it is the entry line, which is expanded with
    self.parseEntry.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    lineIsContinuation = False
    comment = None

    for line in pdfParser.lines():
        # An empty line has no first character; indexing it below would
        # raise IndexError. It carries no data, so skip it.
        if not line:
            continue

        # Leading whitespace must be inspected before it is stripped.
        if comment and line[0] != u" " and parsingStage == 1:
            lineIsContinuation = True
        line = line.strip()

        if parsingStage == 0:
            if line == u"abril":
                parsingStage += 1
            continue

        if line == u"Manuel Bermúdez":
            break
        if line.startswith(u"Abreviaturas, siglas, símbolos e léxico"):
            continue
        if line.isdigit():
            continue

        if lineIsContinuation:
            lineIsContinuation = False
            comment += u" " + line
            continue

        if comment:
            for subentry in self.parseEntry(line):
                entries[subentry] = comment
            comment = None
        else:
            comment = line

    dictionary = u"# Relación de abreviaturas máis frecuentes\n"
    dictionary += u"# {}\n".format(pdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
    return dictionary
def generateFileContent(self):
    """Build the acronyms dictionary text from the PDF at pdfUrl.

    Data starts after the line "Asociación Española de Normalización e
    Certificación" and ends at the line "A sigla caracterízase por:".
    A comment line comes first. While a comment is pending, a raw line
    that does not start with a space is appended to that comment;
    otherwise the stripped line is stored as the entry for it.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    lineIsContinuation = False
    comment = None

    for line in pdfParser.lines():
        # An empty line has no first character; indexing it below would
        # raise IndexError. It carries no data, so skip it.
        if not line:
            continue

        # Leading whitespace must be inspected before it is stripped.
        if comment and line[0] != u" " and parsingStage == 1:
            lineIsContinuation = True
        line = line.strip()

        if parsingStage == 0:
            if line == u"Asociación Española de Normalización e Certificación":
                parsingStage += 1
            continue

        if line == u"A sigla caracterízase por:":
            break
        if line.isdigit():
            continue

        if lineIsContinuation:
            lineIsContinuation = False
            comment += u" " + line
            continue

        if comment:
            entries[line] = comment
            comment = None
        else:
            comment = line

    dictionary = u"# Relación de siglas e acrónimos máis frecuentes\n"
    dictionary += u"# {}\n".format(pdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"sigla"))
    return dictionary
def generateFileContent(self):
    """Build the UDC acronyms dictionary text from the 2012 criteria PDF.

    After the annex V title, lines are consumed in pairs: an acronym
    line followed by its expansion line. Acronyms listed in
    twoLineAcronyms take two expansion lines, joined with a space.
    Scanning stops at a line starting with "ÚLTIMAS PUBLICACIÓNS".

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    annexTitle = u"ANEXO V. RELACIÓN DE SIGLAS E ACRÓNIMOS MÁIS HABITUAIS DA UDC"
    pageFurniture = [u"CRITERIOS PARA O USO DA LINGUA", annexTitle]
    twoLineAcronyms = [u"ASISTA",]

    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
        languageUsageCriteria2012PdfUrl)
    reader = PdfParser(localPath)

    expansions = {}
    insideAnnex = False
    pendingAcronym = None

    for rawLine in reader.lines():
        text = rawLine.strip()

        if not insideAnnex:
            insideAnnex = text == annexTitle
            continue
        if text.startswith(u"ÚLTIMAS PUBLICACIÓNS"):
            break
        if text in pageFurniture or text.isdigit():
            continue

        if not pendingAcronym:
            pendingAcronym = text
            continue

        # Special case that has to be handled as best we can: the
        # expansion of these acronyms spans two lines.
        if pendingAcronym in twoLineAcronyms:
            if pendingAcronym not in expansions:
                # First half; keep the acronym pending for the second.
                expansions[pendingAcronym] = text
                continue
            expansions[pendingAcronym] += u" " + text
        else:
            expansions[pendingAcronym] = text
        pendingAcronym = None

    header = (
        u"# Relación de siglas e acrónimos máis frecuentes na UDC\n"
        + u"# {}\n".format(languageUsageCriteria2012PdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(expansions, u"sigla")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the symbols dictionary text from the PDF at pdfUrl.

    Lines between the symbols heading and the "CASOS ESPECIAIS:" heading
    are split at their first colon into a comment (left) and an entry
    (right). A comma-separated entry yields one dictionary entry per
    piece, all sharing the comment.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    reader = PdfParser(localPath)

    commentsBySymbol = {}
    insideList = False

    for rawLine in reader.lines():
        text = rawLine.strip()
        if not text:
            continue
        if not insideList:
            insideList = text == u"SÍMBOLOS:"
            continue
        if text == u"CASOS ESPECIAIS:":
            break
        if text.isdigit():
            continue

        description, _, symbol = text.partition(u":")
        description = description.strip()
        symbol = symbol.strip()

        # Reversed entries: the symbol is on the left of the colon.
        if description in [u"FM"]:
            description, symbol = symbol, description

        # Splitting a comma-free symbol gives back the symbol itself, so
        # one loop covers both the single and the multiple case.
        for piece in symbol.split(u","):
            commentsBySymbol[piece.strip()] = description

    header = (
        u"# Relación de símbolos máis frecuentes\n"
        + u"# {}\n".format(pdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsBySymbol, u"símbolo")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the symbols dictionary text from the PDF at pdfUrl.

    Lines between the symbols heading and the "CASOS ESPECIAIS:" heading
    are split at their first colon into a comment (left) and an entry
    (right). A comma-separated entry yields one dictionary entry per
    piece, all sharing the comment.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    reader = PdfParser(localPath)

    commentsBySymbol = {}
    insideList = False

    for rawLine in reader.lines():
        text = rawLine.strip()
        if not text:
            continue
        if not insideList:
            insideList = text == u"SÍMBOLOS:"
            continue
        if text == u"CASOS ESPECIAIS:":
            break
        if text.isdigit():
            continue

        description, _, symbol = text.partition(u":")
        description = description.strip()
        symbol = symbol.strip()

        # Reversed entries: the symbol is on the left of the colon.
        if description in [u"FM"]:
            description, symbol = symbol, description

        # Splitting a comma-free symbol gives back the symbol itself, so
        # one loop covers both the single and the multiple case.
        for piece in symbol.split(u","):
            commentsBySymbol[piece.strip()] = description

    header = (
        u"# Relación de símbolos máis frecuentes\n"
        + u"# {}\n".format(pdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsBySymbol, u"símbolo")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the abbreviations dictionary text from the style guide PDF.

    Between the "List of common abbreviations:" line and the
    "Addtional guidelines:" line, every line starting with "(+)" holds
    the entries for the line that came right before it. The text after
    "(+)" is expanded with self.parseSubEntries, and each stripped
    sub-entry is mapped to the stripped previous line.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
        styleGuidePdfUrl)
    reader = PdfParser(localPath)

    commentsByAbbreviation = {}
    insideList = False
    precedingLine = u""

    for rawLine in reader.lines():
        text = rawLine.strip()

        if not insideList:
            insideList = text == u"List of common abbreviations:"
            continue
        if text == u"Addtional guidelines:":
            break

        # Ugly decoding, admittedly: different parts of the PDF seem to
        # use different encodings, so these characters are remapped by
        # hand. Bear with it.
        text = text.replace(u"ñ", u"ó")
        text = text.replace(u"ð", u"ñ")
        text = text.replace(u"ö", u"ú")

        if text.startswith(u"(+)"):
            description = precedingLine.strip()
            for piece in self.parseSubEntries(text[3:].strip()):
                commentsByAbbreviation[piece.strip()] = description

        precedingLine = text

    header = (
        u"# Relación de abreviaturas máis frecuentes\n"
        + u"# {}\n".format(styleGuidePdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsByAbbreviation, u"abreviatura")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the abbreviations dictionary text from abbreviationsPdfUrl.

    After the caption of table 24, lines are consumed in pairs: an
    abbreviation (with any ".." collapsed to ".") followed by its
    expansion. The column headings are skipped and a line "54" ends the
    scan. The abbreviation "s.a." is special-cased: it stays pending
    until a line reading "especificar" arrives, which is appended to its
    expansion.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
        abbreviationsPdfUrl)
    reader = PdfParser(localPath)

    expansions = {}
    insideTable = False
    pendingAbbreviation = None

    for rawLine in reader.lines():
        text = rawLine.strip()

        if not insideTable:
            insideTable = text == u"Tabela 24: Abreviaturas e sua expansão em galego."
            continue
        if text in (u"Abreviatura", u"Conversão ortográfica"):
            continue
        if text == u"54":
            break

        if not pendingAbbreviation:
            pendingAbbreviation = text.replace(u"..", u".")
            continue

        # Special case that has to be handled as best we can.
        if pendingAbbreviation == u"s.a.":
            if text != u"especificar":
                # Keep "s.a." pending; its expansion is not complete yet.
                expansions[pendingAbbreviation] = text
                continue
            expansions[pendingAbbreviation] += u" " + text
        else:
            expansions[pendingAbbreviation] = text
        pendingAbbreviation = None

    header = (
        u"# Relación de abreviaturas máis frecuentes\n"
        + u"# {}\n".format(abbreviationsPdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        expansions, u"abreviatura")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the abbreviations dictionary text from the style guide PDF.

    Between the "List of common abbreviations:" line and the
    "Addtional guidelines:" line, every line starting with "(+)" holds
    the entries for the line that came right before it. The text after
    "(+)" is expanded with self.parseSubEntries, and each stripped
    sub-entry is mapped to the stripped previous line.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
        styleGuidePdfUrl)
    reader = PdfParser(localPath)

    commentsByAbbreviation = {}
    insideList = False
    precedingLine = u""

    for rawLine in reader.lines():
        text = rawLine.strip()

        if not insideList:
            insideList = text == u"List of common abbreviations:"
            continue
        if text == u"Addtional guidelines:":
            break

        # Ugly decoding, admittedly: different parts of the PDF seem to
        # use different encodings, so these characters are remapped by
        # hand. Bear with it.
        text = text.replace(u"ñ", u"ó")
        text = text.replace(u"ð", u"ñ")
        text = text.replace(u"ö", u"ú")

        if text.startswith(u"(+)"):
            description = precedingLine.strip()
            for piece in self.parseSubEntries(text[3:].strip()):
                commentsByAbbreviation[piece.strip()] = description

        precedingLine = text

    header = (
        u"# Relación de abreviaturas máis frecuentes\n"
        + u"# {}\n".format(styleGuidePdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsByAbbreviation, u"abreviatura")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the acronyms dictionary text from the PDF at pdfUrl.

    Lines between the "SIGLAS e ACRÓNIMOS" heading and the
    "ABREVIATURAS:" heading are split at colons. When the text before
    the first colon is entirely upper case, the line starts a new entry:
    that text is the entry and the rest is its comment. Otherwise the
    line continues the previous comment, and only the text before its
    first colon is appended.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    filePath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    entry = None
    comment = None

    for line in pdfParser.lines():
        line = line.strip()
        if not line:
            continue

        if parsingStage == 0:
            if line == u"SIGLAS e ACRÓNIMOS":
                parsingStage += 1
            continue

        if line == u"ABREVIATURAS:":
            break
        if line.isdigit():
            continue

        parts = line.split(u":")
        if parts[0].upper() != parts[0]:
            # Continuation of the previous comment. With no entry seen
            # yet there is nothing to extend; skip it instead of failing
            # on the unset comment and storing a None key.
            if entry is None:
                continue
            comment += u" " + parts[0].strip()
        else:
            entry = parts[0].strip()
            comment = u":".join(parts[1:]).strip()
        entries[entry] = comment

    dictionary = u"# Relación de siglas e acrónimos máis frecuentes\n"
    dictionary += u"# {}\n".format(pdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"sigla"))
    return dictionary
def generateFileContent(self):
    """Build the treatment abbreviations dictionary text (2007 criteria).

    Between the "ABREVIATURAS DE TRATAMENTO" heading and the next
    "CRITERIOS PARA O USO DA LINGUA" line, lines are consumed in pairs:
    a comment followed by its entry. When the comment line ends with a
    hyphen, the record is read as four lines instead: comment start,
    entry start, comment rest (replacing the hyphen), and entry rest
    (joined with a space).

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(
        languageUsageCriteria2007PdfUrl)
    reader = PdfParser(localPath)

    commentsByAbbreviation = {}
    insideList = False
    description = None
    abbreviation = None
    # 0 means a plain two-line record. 1 to 3 give the position inside a
    # four-line record whose description was hyphenated.
    wrapStep = 0

    for rawLine in reader.lines():
        text = rawLine.strip()
        if not text:
            continue

        if not insideList:
            insideList = text == u"ABREVIATURAS DE TRATAMENTO"
            continue
        if text == u"CRITERIOS PARA O USO DA LINGUA":
            break
        if text.isdigit():
            continue

        if wrapStep == 1:
            abbreviation = text
            wrapStep = 2
            continue
        if wrapStep == 2:
            # Drop the trailing hyphen and glue on the rest of the word.
            description = description[:-1] + text
            wrapStep = 3
            continue
        if wrapStep == 3:
            abbreviation += u" " + text
            wrapStep = 0
            # Fall through: the record is complete and is stored below.

        if not description:
            description = text
            if description.endswith(u"-"):
                wrapStep = 1
            continue

        if not abbreviation:
            abbreviation = text
        commentsByAbbreviation[abbreviation] = description
        description = None
        abbreviation = None

    header = (
        u"# Relación de abreviaturas de tratamento\n"
        + u"# {}\n".format(languageUsageCriteria2007PdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsByAbbreviation, u"abreviatura")
    return header + u"".join(formatted)
def generateFileContent(self):
    """Build the abbreviations dictionary text from the style guide PDF.

    Lines between the heading of section 7.1.2 and the heading of
    section 7.2 have the shape "comment: entry", split at the last
    colon. A raw line ending in a space continues on the following line.
    Forms given inside "(plural ...)" and "(fem. ...)" groups are added
    as entries too. All other parenthesised text is removed before the
    entry is expanded with self.parseEntry.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    import re
    import string

    filePath = contentCache.downloadFileIfNeededAndGetLocalPath(
        styleGuidePdfUrl)
    pdfParser = PdfParser(filePath)

    entries = {}
    parsingStage = 0
    continuesInTheNextLine = False
    previousLine = None

    # Backslashes are doubled so the literals hold the same patterns
    # without relying on unrecognised string escapes.
    plural = re.compile(u"\\(plural ([^)]+)\\)")
    fem = re.compile(u"\\(fem. ([^)]+)\\)")
    parenthesis = re.compile(u" *\\([^)]*\\)")

    for line in pdfParser.lines():
        # The trailing space must be inspected before it is stripped.
        if line[-1:] == u" " and parsingStage == 1:
            continuesInTheNextLine = True
        line = line.strip()

        if parsingStage == 0:
            if line == u"7.1.2 Listaxe de abreviaturas":
                parsingStage += 1
            continue

        if line == u"7.2 A sigla":
            break
        # Substring test: skips single-letter headings and also empty
        # lines, because the empty string is contained in any string.
        # ascii_uppercase is used instead of string.uppercase, which only
        # exists on Python 2 and varies with the locale.
        if line in string.ascii_uppercase:
            continue
        if line.startswith(u"Ortografía e estilo"):
            continue
        if line.isdigit():
            continue

        # Join with the pending text first, then decide whether the
        # result continues again. In the opposite order an entry wrapped
        # over three or more lines lost everything but its last two.
        if previousLine:
            line = previousLine + u" " + line
            previousLine = None
        if continuesInTheNextLine:
            continuesInTheNextLine = False
            previousLine = line
            continue

        # Everything before the last colon is the comment. A line with no
        # colon has an empty comment and is an entry in full.
        comment, _, entry = line.rpartition(u":")

        subentries = set()
        for match in plural.finditer(entry):
            subentries.update(self.parseEntry(match.group(1)))
        for match in fem.finditer(entry):
            subentries.update(self.parseEntry(match.group(1)))

        # Remove anything in parentheses.
        entry = parenthesis.sub(u"", entry).strip()
        subentries.update(self.parseEntry(entry))

        for subentry in subentries:
            entries[subentry] = comment

    dictionary = u"# Relación de abreviaturas máis frecuentes na linguaxe administrativa\n"
    dictionary += u"# {}\n".format(styleGuidePdfUrl)
    dictionary += u"\n"
    dictionary += u"".join(
        formatEntriesAndCommentsForDictionary(entries, u"abreviatura"))
    return dictionary
def generateFileContent(self):
    """Build the abbreviations dictionary text from the PDF at pdfUrl.

    Only the lines between the "ABREVIATURAS:" heading and the symbols
    heading are used. Each one is split at its first colon into a comment
    (left side) and an entry (right side). The entry is expanded with
    self.parseEntry and every resulting form is mapped to the comment.

    Returns the header lines followed by everything that
    formatEntriesAndCommentsForDictionary yields for the collected map.
    """
    import re

    localPath = contentCache.downloadFileIfNeededAndGetLocalPath(pdfUrl)
    reader = PdfParser(localPath)

    pluralPattern = re.compile(u"\(pl. ([^)]+)\)")
    parenthesisPattern = re.compile(u" *\([^)]*\)")

    commentsByForm = {}
    insideList = False

    for rawLine in reader.lines():
        text = rawLine.strip()
        # Blank lines and bare numbers carry no entry.
        if not text or text.isdigit():
            continue
        if not insideList:
            insideList = text == u"ABREVIATURAS:"
            continue
        if text == u"SÍMBOLOS:":
            break

        description, _, abbreviations = text.partition(u":")
        description = description.strip()
        abbreviations = abbreviations.strip()

        forms = set()
        # Plural forms are announced inside a "(pl. ...)" group.
        for pluralMatch in pluralPattern.finditer(abbreviations):
            forms.update(self.parseEntry(pluralMatch.group(1)))

        # Remove anything in parentheses.
        abbreviations = parenthesisPattern.sub(u"", abbreviations).strip()
        for form in self.parseEntry(abbreviations):
            if form.endswith(u"o/a."):
                # Split a combined "o/a." ending into its two variants.
                stem = form[:-4]
                forms.update((stem + u"a.", stem + u"o."))
            else:
                forms.add(form)

        for form in forms:
            commentsByForm[form] = description

    header = (
        u"# Relación de abreviaturas máis frecuentes\n"
        + u"# {}\n".format(pdfUrl)
        + u"\n"
    )
    formatted = formatEntriesAndCommentsForDictionary(
        commentsByForm, u"abreviatura")
    return header + u"".join(formatted)