def _get_type_and_status(self):
    """Fetch each label's connection status and hardware type/size from the
    core appliance REST service.

    For every record with a ``real_id`` two GET requests are issued:
    ``/service/labelinfo/<id>`` for the connection status and
    ``/service/labelinfo/type/<id>`` for the label type and display size.
    A failed status request marks the record UNKNOWN/UNREGISTERED.
    """
    for location in self:
        if location.real_id:
            # First request: current connection status of the label.
            response = requests.get('http://' + self.env['ir.config_parameter'].
                                    get_param('core_appliance_ip') +
                                    ':8001/service/labelinfo/' + location.real_id)
            if response:  # requests.Response is truthy for status codes < 400
                root = XML(response.text)
                for resp in root.iter('ConnectionStatus'):
                    location.status = resp.text
            else:
                location.type = "UNKNOWN"
                location.status = "UNREGISTERED"
            # Second request: hardware type name and display dimensions.
            response = requests.get('http://' + self.env['ir.config_parameter'].
                                    get_param('core_appliance_ip') +
                                    ':8001/service/labelinfo/type/' + location.real_id)
            if response:
                root = XML(response.text)
                for resp in root.iter('Name'):
                    location.type = resp.text
                for resp in root.iter('DisplayWidth'):
                    location.size = resp.text
                for resp in root.iter('DisplayHeight'):
                    # size ends up as "<width>*<height>"
                    location.size += "*" + resp.text
            else:
                location.type = "UNKNOWN"
def __load_doc(path):
    """Extract sentences from a .doc/.docx document.

    Word documents are ZIP archives; the text lives in the
    ``word/document.xml`` entry.  Detailed description of the method:
    https://github.com/nmolivo/tesu_scraper/blob/master/Python_Blogs/01_extract_from_MSWord.ipynb

    :param path: path to the document file
    :return: list of cleaned, non-empty sentence strings
    :raises FileNotFoundError: if the archive has no word/document.xml entry
    """
    document = zipfile.ZipFile(path)
    source_filename = 'word/document.xml'
    if source_filename in document.namelist():
        xml_content = document.read(source_filename)
    else:
        raise FileNotFoundError(
            'Cannot find {} inside selected file'.format(source_filename))
    document.close()
    # Warning: the xml.etree.ElementTree module is not secure against
    # maliciously constructed data.
    tree = XML(xml_content)
    word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    para = word_namespace + 'p'
    text = word_namespace + 't'
    paragraphs = []
    for paragraph in tree.iter(para):
        # BUG FIX: Element.getiterator() was removed in Python 3.9;
        # Element.iter() is the supported equivalent.
        texts = [node.text for node in paragraph.iter(text) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    lines = [
        TextLoader.remove_trash_symbols(l.strip())
        for parag in paragraphs
        for l in parag.split('.')
    ]
    lines = [l for l in lines if len(l) > 0]
    return lines
def parse_captions(soup):
    """Convert custom iView captions XML into SRT format, usable in most
    decent media players.

    :param soup: raw captions document as bytes
    :return: SRT-formatted subtitle text (str)
    """
    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # BUG FIX: the replacement must be the XML entity "&amp;"; the
        # original replaced "&" with "&", a no-op that left the document
        # unparseable whenever a literal ampersand was present.
        soup = re.sub(rb"&(?![#\w]+;)", b"&amp;", soup)
    xml = XML(soup)
    output = ''
    i = 1
    for title in xml.iter('title'):
        # Timestamps look like HH:MM:SS:FFF; split off the fractional part.
        start = title.get('start')
        (start, startfract) = start.rsplit(':', 1)
        end = title.get('end')
        (end, endfract) = end.rsplit(':', 1)
        output = output + '{}\n'.format(i)
        output = output + '{},{:0<3.3} --> {},{:0<3.3}\n'.format(
            start, startfract, end, endfract)
        # '|' is the iView line-break marker.
        output = output + title.text.replace('|', '\n') + '\n\n'
        i += 1
    return output
def parse_captions(soup):
    """Convert custom iView captions XML into SRT format, usable in most
    decent media players.

    :param soup: raw captions document as bytes
    :return: SRT-formatted subtitle text (str)
    """
    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # BUG FIX: the replacement must be the XML entity "&amp;"; the
        # original replaced "&" with "&", a no-op that left the document
        # unparseable whenever a literal ampersand was present.
        soup = re.sub(rb"&(?![#\w]+;)", b"&amp;", soup)
    xml = XML(soup)
    output = ''
    i = 1
    for title in xml.iter('title'):
        # Timestamps look like HH:MM:SS:FFF; split off the fractional part.
        start = title.get('start')
        (start, startfract) = start.rsplit(':', 1)
        end = title.get('end')
        (end, endfract) = end.rsplit(':', 1)
        output = output + '{}\n'.format(i)
        output = output + '{},{:0<3.3} --> {},{:0<3.3}\n'.format(
            start, startfract, end, endfract)
        # '|' is the iView line-break marker.
        output = output + title.text.replace('|', '\n') + '\n\n'
        i += 1
    return output
def get_text(path):
    """Extract the text of a .docx file as a list of non-empty strings.

    The document body is joined paragraph-by-paragraph, split into
    sentences on '. ' and then into lines on '\\n'; empty strings are
    dropped from the result.
    """
    ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    para_tag = ns + 'p'
    text_tag = ns + 't'
    archive = zipfile.ZipFile(path)
    xml_content = archive.read('word/document.xml')
    archive.close()
    tree = XML(xml_content)
    paragraphs = []
    for para_el in tree.iter(para_tag):
        chunks = [node.text for node in para_el.iter(text_tag) if node.text]
        if chunks:
            paragraphs.append(''.join(chunks))
    joined = '\n\n'.join(paragraphs)  # entire text in one string
    result = []
    for sentence in joined.split('. '):  # split into sentences
        # split each sentence on paragraph breaks, keep non-empty parts
        result.extend(part for part in sentence.split('\n') if part)
    return result
def parse_config(soup):
    """Parse the ABC iView config XML into a dict of parameters.

    There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we need.

    :param soup: raw config XML (str or bytes)
    :return: dict of param name -> value, plus friendly alias keys
    """
    xml = XML(soup)
    params = dict()
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))
    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in the auth result as well.
    # BUG FIX: use .get() so a missing 'server_streaming' does not raise
    # KeyError, matching the intent stated in the comment above.
    rtmp_url = params.get('server_streaming')
    categories_url = params['categories']
    params.update({
        'rtmp_url': rtmp_url,
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': categories_url,
        'captions_url': params['captions'],
    })
    return params
def get_docx_table(path):
    """Locate the table inside a .docx file and return it as a list of rows.

    Each row is a list of cell strings; a cell's paragraphs are joined
    with newlines.
    """
    archive = zipfile.ZipFile(path)
    content = archive.read('word/document.xml')
    archive.close()
    tree = XML(content)
    rows = []
    for xml_row in tree.iter(TR):
        cells = []
        for xml_cell in xml_row.iter(TC):
            # A cell consists of one or more paragraphs.
            pieces = []
            for paragraph in xml_cell.iter(PARA):
                run_text = ''.join(
                    node.text for node in paragraph.iter(TEXT) if node.text)
                if run_text:
                    pieces.append(run_text)
            cells.append('\n'.join(pieces))
        rows.append(cells)
    return rows
def replace_string2(filename):
    """Load a MarianMT en->de model and attempt to translate the paragraph
    text of a .docx file.

    NOTE(review): this function appears broken and is documented as-is:
    - ``paragraph.text.replace(texts, target)`` passes a *list* to
      str.replace (TypeError for non-empty paragraphs), and the return
      value is discarded, so nothing is ever modified.
    - ``document.save("new.docx")`` — zipfile.ZipFile has no ``save``
      method, and ``document`` was opened read-only.
    - the module-level globals are re-assigned on every call, reloading
      the model each time.
    Fixing requires knowledge of ``translat`` and the intended output
    format, so only comments are added here.
    """
    global model_name
    global tokenizer
    global model
    model_name = 'Helsinki-NLP/opus-mt-en-de'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    document = zipfile.ZipFile(filename)
    xml_content = document.read('word/document.xml')
    #document.close()
    tree = XML(xml_content)  # using lxml instead of xml preserved the comments
    paragraphs = []
    i = 0
    for paragraph in tree.iter(PARA):
        i = i + 1
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            #text = list(filter(None, text))
            #text = [s for s in text if p.match(s)]
            #text = [">>de<< " + s for s in text]
            #print("%s: %s" %(i,texts))
            target, duration = translat(texts)
            paragraph.text.replace(texts, target)
    document.save("new.docx")
def parse_config(soup):
    """Parse the ABC iView config XML into a dict of parameters.

    There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we need.

    :param soup: raw config XML (str or bytes)
    :return: dict of param name -> value, plus friendly alias keys
    """
    xml = XML(soup)
    params = dict()
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))
    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in the auth result as well.
    # BUG FIX: use .get() so a missing 'server_streaming' does not raise
    # KeyError, matching the intent stated in the comment above.
    rtmp_url = params.get('server_streaming')
    categories_url = params['categories']
    params.update({
        'rtmp_url' : rtmp_url,
        'auth_url' : params['auth'],
        'api_url' : params['api'],
        'categories_url' : categories_url,
        'captions_url' : params['captions'],
    })
    return params
def get_docx_tables(path):
    """Yield every table element found inside the given .docx file."""
    archive = zipfile.ZipFile(path)
    content = archive.read('word/document.xml')
    archive.close()
    tree = XML(content)
    # Delegate iteration directly to the element tree.
    yield from tree.iter(TBL)
def _update_esl(self):
    """Push an update task to the core appliance for every label that has
    products and is flagged as needing an update.

    POSTs the label's task body XML to ``/service/task``, stores the
    returned transaction id on the label for later status polling, then
    clears ``need_update``.
    """
    for label in self.search([]):
        if label.len_products != 0 and label.need_update == True:
            response = requests.post(
                'http://' + self.env['ir.config_parameter'].get_param(
                    'core_appliance_ip') + ':8001/service/task',
                data=label._build_task_body().encode('utf-8'),
                headers={'Content-Type': 'application/xml'})
            root = XML(response.text)
            for transaction in root.iter('Transaction'):
                # Remember the transaction so its status can be polled later.
                label.task_id = transaction.get('id')
            label.need_update = False
def get_para_list(path):
    """Return the list of non-empty paragraph texts from a .docx file.

    :param path: path to the .docx archive
    :return: list of paragraph strings
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    # BUG FIX: return the accumulated list, not the loop variable
    # 'paragraph' (which held only the last XML element).
    return paragraphs
def get_docx_text(filename):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(filename)
    xml_content = document.read("word/document.xml")
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    for paragraph in tree.iter(PARA):
        # BUG FIX: Element.getiterator() was removed in Python 3.9;
        # Element.iter() is the supported equivalent.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append("".join(texts))
    return "\n\n".join(paragraphs)
def _get_last_image_loaded(self):
    """Fetch, for every label, the image of its most recent update
    transaction from the core appliance.

    Polls ``/service/updatestatus/transaction/<task_id>`` and, when an
    UpdateStatus element is present, loads the matching image via
    ``_get_image_from_task``; otherwise ``image`` is set to False.
    """
    for label in self:
        task_id = ""
        labelupdatestatus = False
        if label.task_id:
            labelupdatestatus = requests.get(
                'http://' + self.env['ir.config_parameter'].get_param(
                    'core_appliance_ip') +
                ':8001/service/updatestatus/transaction/' + label.task_id)
        if labelupdatestatus:  # falsy when no task_id or HTTP status >= 400
            root = XML(labelupdatestatus.text)
            for update in root.iter('UpdateStatus'):
                # Keep the id of the last UpdateStatus element found.
                task_id = update.get('id')
            label.image = label._get_image_from_task(task_id)
        else:
            label.image = False
def docx_text_extractor(self, text):
    """Extract the plain text of a .docx file.

    :param text: path (or file-like object) of the .docx archive
    :return: paragraph texts joined by blank lines
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    document = zipfile.ZipFile(text)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    for paragraph in tree.iter(PARA):
        # BUG FIX: Element.getiterator() was removed in Python 3.9;
        # Element.iter() is the supported equivalent.
        texts = [
            node.text for node in paragraph.iter(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    return '\n\n'.join(paragraphs)
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    ns = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    para_tag = ns + 'p'
    text_tag = ns + 't'
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    paragraphs = []
    for para_el in tree.iter(para_tag):
        # Join all text runs of the paragraph; skip empty paragraphs.
        chunk = ''.join(node.text or '' for node in para_el.iter(text_tag))
        if chunk:
            paragraphs.append(chunk)
    return os.linesep.join(paragraphs)
def _get_status_task(self):
    """Poll the core appliance for the state of each record's pending
    transaction and store it on ``task_status``.

    GETs ``/service/transaction/<task_id>/status`` and maps the
    TransactionStatusInfo flags to FAILED / FINISHED / WAITING.
    """
    for location in self:
        if location.task_id:
            response = requests.get('http://' + self.env['ir.config_parameter'].
                                    get_param('core_appliance_ip') +
                                    ':8001/service/transaction/' +
                                    location.task_id + '/status')
            if response:  # truthy for HTTP status codes < 400
                root = XML(response.text)
                for resp in root.iter('TransactionStatusInfo'):
                    # 'failed' takes precedence over 'finished'.
                    if resp.get('failed') == "true":
                        location.task_status = "FAILED"
                    elif resp.get('finished') == "true":
                        location.task_status = "FINISHED"
                    else:
                        location.task_status = "WAITING"
def get_docx_text(path):
    #use white-space: pre CSS
    """ Take the path of a docx file as argument, return the formatted text. """
    # NOTE(review): relies on module-level tag constants (PARA, PARAPROPS,
    # TEXT, WORD_NAMESPACE, justification, indent, shade, paraBorders,
    # borderTypes, shdBreak, fontSize) defined elsewhere in this file.
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    #elements = list()
    styleAdditions = list()  # CSS declarations collected per paragraph
    elementString = ["<article>"]  # HTML fragments, joined at the end
    for paragraph in tree.iter(PARA):
        """for paraStyleElement in paragraph.find(PARAPROPS).iter():
        if paraStyleElement =="""
        #JUSTIFICATION -- map w:jc to CSS text-align ("both" => justify)
        justificationInfo = paragraph.find(PARAPROPS).find(
            justification).attrib[WORD_NAMESPACE + "val"]
        if justificationInfo != "left" and justificationInfo != "both":
            styleAdditions.append("text-align:" + justificationInfo + ";")
        elif justificationInfo == "both":
            styleAdditions.append("text-align:justify;")
        #INDENT -- attribute values are twentieths of a point; /20 => points.
        # NOTE(review): the emitted CSS values carry no unit suffix -- confirm.
        if paragraph.find(PARAPROPS).find(indent):
            indentInfo = dict()
            for attribKey in paragraph.find(PARAPROPS).find(indent).attrib:
                if attribKey == WORD_NAMESPACE + "hanging":
                    indentInfo["hanging"] = (float(
                        paragraph.find(PARAPROPS).find(
                            indent).attrib[attribKey]) / 20)
                if attribKey == WORD_NAMESPACE + "end" or attribKey == WORD_NAMESPACE + "right":
                    indentInfo["right"] = (float(
                        paragraph.find(PARAPROPS).find(
                            indent).attrib[attribKey]) / 20)
                if attribKey == WORD_NAMESPACE + "start" or attribKey == WORD_NAMESPACE + "left":
                    indentInfo["left"] = (float(
                        paragraph.find(PARAPROPS).find(
                            indent).attrib[attribKey]) / 20)
            for key in indentInfo:
                if key == "hanging":
                    styleAdditions.append("margin-top:" + str(indentInfo[key]) + ";")
                if key == "right" or key == "end":
                    styleAdditions.append("margin-right:" + str(indentInfo[key]) + ";")
                if key == "left" or key == "start":
                    styleAdditions.append("margin-left:" + str(indentInfo[key]) + ";")
            #left => start, right => end, hanging
        #BACKGROUND COLOURING
        if paragraph.find(PARAPROPS).find(shade):
            if paragraph.find(PARAPROPS).find(shade).attrib[WORD_NAMESPACE + "fill"] != "auto":
                # NOTE(review): this lookup uses attrib["fill"] WITHOUT the
                # namespace prefix, unlike the check above -- likely a
                # KeyError on real documents; confirm.
                styleAdditions.append(
                    "background-color: #" +
                    paragraph.find(PARAPROPS).find(shade).attrib["fill"] +
                    ";")
        #BORDERS
        if paragraph.find(PARAPROPS).find(paraBorders):
            # The empty loop leaves sideElement bound to the LAST child.
            for sideElement in paragraph.find(PARAPROPS).find(
                    paraBorders).iter():
                pass
            # NOTE(review): compares an Element to a string, so this test is
            # always true; probably meant to compare sideElement.tag.
            if sideElement != WORD_NAMESPACE + "between":
                styleAdditions.append(
                    "border-style:" + borderTypes[sideElement.attrib["val"]])
        #GET ALL THE TEXT, FIX TO PUT TEXT INTO ITS HTML ELEMENT
        paragraphText = [
            element.text for element in paragraph.iter(TEXT) if element.text
        ]
        #make a list comprehension of styles to match
        if not paragraphText:
            # Empty paragraph: derive spacing from the run's font size.
            if paragraph.iter(shdBreak):
                for shadow in paragraph.iter(
                        shdBreak
                ):  #bloody generator making me use a for loop for one element
                    if shadow.attrib[WORD_NAMESPACE + "fill"] == "auto":
                        for element in paragraph.iter(fontSize):
                            # fontSize 'val' is in half-points; /2 => points.
                            breakSize = str(
                                float(element.attrib[WORD_NAMESPACE + "val"]) / 2
                            )  #gets a dict of fontSize element attributes, then gets the value representing font size, in points
                            break
                        # NOTE(review): breakSize may be unbound if the
                        # paragraph has no fontSize element -- confirm.
                        styleAdditions.append(
                            "min-height:" + breakSize + "pt;"
                        )  #why not em? ATTENTION ATTENTION ATTENTION ATTENTION ATTENTION
                        styleAdditions.append("margin:0;")
                    break
        else:
            elementString.append("<p")
            elementString.append(' style="')
            #elementString.append("white-space:pre-wrap;word-wrap:break-word;")
            for style in styleAdditions:
                elementString.append(style)
            if ''.join(elementString)[-1:] != ";":
                elementString.append(";")
            elementString.append('"')
            elementString.append(">")
            # NOTE(review): the paragraph text itself is never appended;
            # only an empty <p></p> is emitted -- confirm intent.
            elementString.append("</p>\n")
        styleAdditions = list()
        """texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            elements.append(''.join(texts))"""
    elementString.append("</article>")
    return ''.join(elementString)
def get_docx_text(path):
    #use white-space: pre CSS
    """ Take the path of a docx file as argument, return the formatted text. """
    # NOTE(review): duplicate of the HTML-formatting variant above; relies on
    # module-level tag constants (PARA, PARAPROPS, TEXT, WORD_NAMESPACE,
    # justification, indent, shade, paraBorders, borderTypes, shdBreak,
    # fontSize) defined elsewhere in this file.
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)
    #elements = list()
    styleAdditions = list()  # CSS declarations collected per paragraph
    elementString = ["<article>"]  # HTML fragments, joined at the end
    for paragraph in tree.iter(PARA):
        """for paraStyleElement in paragraph.find(PARAPROPS).iter():
        if paraStyleElement =="""
        #JUSTIFICATION -- map w:jc to CSS text-align ("both" => justify)
        justificationInfo = paragraph.find(PARAPROPS).find(justification).attrib[WORD_NAMESPACE+"val"]
        if justificationInfo != "left" and justificationInfo != "both":
            styleAdditions.append("text-align:"+justificationInfo+";")
        elif justificationInfo == "both":
            styleAdditions.append("text-align:justify;")
        #INDENT -- attribute values are twentieths of a point; /20 => points.
        # NOTE(review): the emitted CSS values carry no unit suffix -- confirm.
        if paragraph.find(PARAPROPS).find(indent):
            indentInfo = dict()
            for attribKey in paragraph.find(PARAPROPS).find(indent).attrib:
                if attribKey == WORD_NAMESPACE +"hanging":
                    indentInfo["hanging"] = (float(paragraph.find(PARAPROPS).find(indent).attrib[attribKey])/20)
                if attribKey == WORD_NAMESPACE +"end" or attribKey == WORD_NAMESPACE +"right":
                    indentInfo["right"] = (float(paragraph.find(PARAPROPS).find(indent).attrib[attribKey])/20)
                if attribKey == WORD_NAMESPACE +"start" or attribKey == WORD_NAMESPACE +"left":
                    indentInfo["left"] = (float(paragraph.find(PARAPROPS).find(indent).attrib[attribKey])/20)
            for key in indentInfo:
                if key == "hanging":
                    styleAdditions.append("margin-top:"+str(indentInfo[key])+";")
                if key == "right" or key == "end":
                    styleAdditions.append("margin-right:"+str(indentInfo[key])+";")
                if key == "left" or key == "start":
                    styleAdditions.append("margin-left:"+str(indentInfo[key])+";")
            #left => start, right => end, hanging
        #BACKGROUND COLOURING
        if paragraph.find(PARAPROPS).find(shade):
            if paragraph.find(PARAPROPS).find(shade).attrib[WORD_NAMESPACE +"fill"] != "auto":
                # NOTE(review): this lookup uses attrib["fill"] WITHOUT the
                # namespace prefix, unlike the check above -- likely a
                # KeyError on real documents; confirm.
                styleAdditions.append("background-color: #"+paragraph.find(PARAPROPS).find(shade).attrib["fill"]+";")
        #BORDERS
        if paragraph.find(PARAPROPS).find(paraBorders):
            # The empty loop leaves sideElement bound to the LAST child.
            for sideElement in paragraph.find(PARAPROPS).find(paraBorders).iter():
                pass
            # NOTE(review): compares an Element to a string, so this test is
            # always true; probably meant to compare sideElement.tag.
            if sideElement != WORD_NAMESPACE + "between":
                styleAdditions.append("border-style:"+borderTypes[sideElement.attrib["val"]])
        #GET ALL THE TEXT, FIX TO PUT TEXT INTO ITS HTML ELEMENT
        paragraphText = [element.text for element in paragraph.iter(TEXT) if element.text]
        #make a list comprehension of styles to match
        if not paragraphText:
            # Empty paragraph: derive spacing from the run's font size.
            if paragraph.iter(shdBreak):
                for shadow in paragraph.iter(shdBreak): #bloody generator making me use a for loop for one element
                    if shadow.attrib[WORD_NAMESPACE+"fill"] == "auto":
                        for element in paragraph.iter(fontSize):
                            # fontSize 'val' is in half-points; /2 => points.
                            breakSize = str(float(element.attrib[WORD_NAMESPACE+"val"])/2) #gets a dict of fontSize element attributes, then gets the value representing font size, in points
                            break
                        # NOTE(review): breakSize may be unbound if the
                        # paragraph has no fontSize element -- confirm.
                        styleAdditions.append("min-height:"+breakSize+"pt;") #why not em? ATTENTION ATTENTION ATTENTION ATTENTION ATTENTION
                        styleAdditions.append("margin:0;")
                    break
        else:
            elementString.append("<p")
            elementString.append(' style="')
            #elementString.append("white-space:pre-wrap;word-wrap:break-word;")
            for style in styleAdditions:
                elementString.append(style)
            if ''.join(elementString)[-1:] != ";":
                elementString.append(";")
            elementString.append('"')
            elementString.append(">")
            # NOTE(review): the paragraph text itself is never appended;
            # only an empty <p></p> is emitted -- confirm intent.
            elementString.append("</p>\n")
        styleAdditions = list()
        """texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            elements.append(''.join(texts))"""
    elementString.append("</article>")
    return ''.join(elementString)
def _get_type_template(self):
    """Determine which ESL hardware models and display sizes a template
    supports by inspecting its <image> elements on the core appliance
    (``/service/template/<name>``).

    Known width*height pairs map to specific "G1 ..." label models; any
    other size marks the template as Dynamic.  ``multi`` is set when the
    template body references "articles" (multi-product template).
    """
    if self.name:
        self.type = ""
        self.size = ""
        if self.env['ir.config_parameter'].get_param('core_appliance_ip'):
            response = requests.get('http://' + self.env['ir.config_parameter'].
                                    get_param('core_appliance_ip') +
                                    ':8001/service/template/' + self.name)
            # Templates referencing "articles" support multiple products.
            if "articles" in response.text:
                self.multi = True
            else:
                self.multi = False
            root = XML(response.text.encode('utf-8'))
            for resp in root.iter('image'):
                # count tracks how many known sizes matched this image.
                count = 0
                if resp.get('width') == "152" and resp.get(
                        'height') == "152":
                    self.type += "G1 1.6 red "
                    self.size += "152*152 "
                    count += 1
                if resp.get('width') == "212" and resp.get(
                        'height') == "104":
                    self.type += "G1 2.2 red "
                    self.size += "212*104 "
                    count += 1
                if resp.get('width') == "296" and resp.get(
                        'height') == "152":
                    self.type += "G1 2.6 red "
                    self.size += "296*152 "
                    count += 1
                if resp.get('width') == "264" and resp.get(
                        'height') == "176":
                    self.type += "G1 2.7 red "
                    self.size += "264*176 "
                    count += 1
                if resp.get('width') == "400" and resp.get(
                        'height') == "300":
                    self.type += "G1 4.2 red G1 4.4 red "
                    self.size += "400*300 "
                    count += 1
                if resp.get('width') == "480" and resp.get(
                        'height') == "176":
                    self.type += "G1 4.5 red "
                    self.size += "480*176 "
                    count += 1
                if resp.get('width') == "600" and resp.get(
                        'height') == "448":
                    self.type += "G1 6.0 red "
                    self.size += "600*448 "
                    count += 1
                if resp.get('width') == "480" and resp.get(
                        'height') == "800":
                    self.type += "G1 7.4 red "
                    self.size += "480*800 "
                    count += 1
                if resp.get('width') == "768" and resp.get(
                        'height') == "960":
                    self.type += "G1 12.2 red "
                    self.size += "768*960 "
                    count += 1
                # NOTE(review): the size checks are mutually exclusive, so
                # count can only be 0 or 1; the count == 9 branch looks
                # unreachable -- confirm intent.
                if count == 0 or count == 9:
                    self.type = "Dynamic"
                    self.size = str(resp.get('width')) + "*" + str(
                        resp.get('height'))
                    self.dyn = True
                else:
                    self.dyn = True
def load(path):
    """Load a file, split it into sentences and return them as a list.

    :param path: path to the file to load
    :type path: str
    :return: list of sentences loaded from the given file
    :rtype: list
    :raises ValueError: for a non-string path or an unsupported extension
    :raises NotImplementedError: for PDF input (not implemented yet)
    :raises FileNotFoundError: for a .doc/.docx without word/document.xml
    """
    if not isinstance(path, str):
        raise ValueError('Path must be a string with path to file')
    extension = path.split('.')[-1]
    if extension == 'txt':
        file = open(path, 'r', encoding='utf-8').read()  # todo: auto-check encoding
        lines = [
            TextLoader.remove_trash_symbols(l.strip())
            for l in file.split('.')
        ]
        lines = [l for l in lines if len(l) > 0]
        return lines
    elif extension == 'pdf':
        raise NotImplementedError('Under construction')
    elif extension in ['doc', 'docx']:
        # .doc/.docx documents are ZIP archives; the text lives in the
        # word/document.xml entry.  Detailed description of the method:
        # https://github.com/nmolivo/tesu_scraper/blob/master/Python_Blogs/01_extract_from_MSWord.ipynb
        document = zipfile.ZipFile(path)
        source_filename = 'word/document.xml'
        if source_filename in document.namelist():
            xml_content = document.read(source_filename)
        else:
            raise FileNotFoundError(
                'Cannot find {} inside selected file'.format(
                    source_filename))
        document.close()
        # Warning: the xml.etree.ElementTree module is not secure against
        # maliciously constructed data.
        tree = XML(xml_content)
        word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        para = word_namespace + 'p'
        text = word_namespace + 't'
        paragraphs = []
        for paragraph in tree.iter(para):
            # BUG FIX: Element.getiterator() was removed in Python 3.9;
            # Element.iter() is the supported equivalent.
            texts = [node.text for node in paragraph.iter(text) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
        lines = [
            TextLoader.remove_trash_symbols(l.strip())
            for parag in paragraphs
            for l in parag.split('.')
        ]
        lines = [l for l in lines if len(l) > 0]
        return lines
    else:
        raise ValueError("Can't handle non pdf, txt, doc or docx file")