Beispiel #1
0
 def _get_type_and_status(self):
     """Refresh ``status``, ``type`` and ``size`` for records that have a ``real_id``.

     For each such record two GET requests are sent to port 8001 of the host
     stored in the ``core_appliance_ip`` config parameter:

     * ``/service/labelinfo/<real_id>`` -- every ``ConnectionStatus`` element
       of the XML reply is written to ``status``.
     * ``/service/labelinfo/type/<real_id>`` -- ``Name`` is written to
       ``type`` and ``DisplayWidth`` / ``DisplayHeight`` are combined into
       ``size`` as ``<width>*<height>``.

     When a response object is falsy the fallback values ``"UNKNOWN"`` /
     ``"UNREGISTERED"`` are stored instead.  Records without a ``real_id``
     are left untouched.
     """
     for location in self:
         if not location.real_id:
             continue

         # --- connection status ------------------------------------------
         host = self.env['ir.config_parameter'].get_param('core_appliance_ip')
         info_url = 'http://' + host + ':8001/service/labelinfo/' + location.real_id
         info_reply = requests.get(info_url)
         if not info_reply:
             location.type = "UNKNOWN"
             location.status = "UNREGISTERED"
         else:
             for node in XML(info_reply.text).iter('ConnectionStatus'):
                 location.status = node.text

         # --- label type and display size --------------------------------
         host = self.env['ir.config_parameter'].get_param('core_appliance_ip')
         type_url = ('http://' + host + ':8001/service/labelinfo/type/' +
                     location.real_id)
         type_reply = requests.get(type_url)
         if not type_reply:
             location.type = "UNKNOWN"
             continue

         type_root = XML(type_reply.text)
         for node in type_root.iter('Name'):
             location.type = node.text
         for node in type_root.iter('DisplayWidth'):
             location.size = node.text
         for node in type_root.iter('DisplayHeight'):
             # Appended to the width stored just above.
             location.size += "*" + node.text
    def __load_doc(path):
        """Extract cleaned sentences from a Word document.

        The document is opened as a zip archive and ``word/document.xml`` is
        read from it.  The text nodes of each paragraph are concatenated, each
        paragraph is split on ``'.'``, every piece is stripped and passed
        through ``TextLoader.remove_trash_symbols``, and empty results are
        dropped.

        Detailed description of the approach:
        https://github.com/nmolivo/tesu_scraper/blob/master/Python_Blogs/01_extract_from_MSWord.ipynb

        :param path: path to the document
        :return: list of non-empty cleaned sentences
        :raises FileNotFoundError: if the archive has no ``word/document.xml``
        """
        source_filename = 'word/document.xml'
        # The context manager closes the archive on every path, including the
        # one where the expected member is missing.
        with zipfile.ZipFile(path) as document:
            if source_filename not in document.namelist():
                raise FileNotFoundError(
                    'Cannot find {} inside selected file'.format(
                        source_filename))
            xml_content = document.read(source_filename)

        # Warning: the xml.etree.ElementTree module is not secure against
        # maliciously constructed data.
        tree = XML(xml_content)

        word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        para = word_namespace + 'p'
        text = word_namespace + 't'

        paragraphs = []
        for paragraph in tree.iter(para):
            # Element.getiterator() was removed in Python 3.9; iter() is the
            # drop-in replacement.
            texts = [node.text for node in paragraph.iter(text) if node.text]
            if texts:
                paragraphs.append(''.join(texts))

        cleaned = (
            TextLoader.remove_trash_symbols(piece.strip())
            for parag in paragraphs
            for piece in parag.split('.')
        )
        return [line for line in cleaned if len(line) > 0]
Beispiel #3
0
def parse_captions(soup):
    """Converts custom iView captions into SRT format, usable in most
    decent media players.

    Every ``title`` element becomes one numbered SRT entry.  Its ``start``
    and ``end`` attributes are split at the last ``':'``; the trailing part
    is used as the fraction, right-padded with ``0`` / truncated to three
    characters.  ``'|'`` in the caption text becomes a line break.

    :param soup: captions XML as bytes
    :return: the SRT document as a string (empty if there are no titles)
    """

    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # Raw bytes literal: "\w" is not a valid escape in a plain literal.
        soup = re.sub(rb"&(?![#\w]+;)", b"&amp;", soup)

    xml = XML(soup)

    entries = []
    for index, title in enumerate(xml.iter('title'), 1):
        start, startfract = title.get('start').rsplit(':', 1)
        end, endfract = title.get('end').rsplit(':', 1)
        # An empty <title/> has text None; emit a blank caption instead of
        # failing on None.replace().
        caption = (title.text or '').replace('|', '\n')
        entries.append('{}\n{},{:0<3.3} --> {},{:0<3.3}\n{}\n\n'.format(
            index, start, startfract, end, endfract, caption))

    return ''.join(entries)
Beispiel #4
0
def parse_captions(soup):
    """Converts custom iView captions into SRT format, usable in most
    decent media players.

    Every ``title`` element becomes one numbered SRT entry.  Its ``start``
    and ``end`` attributes are split at the last ``':'``; the trailing part
    is used as the fraction, right-padded with ``0`` / truncated to three
    characters.  ``'|'`` in the caption text becomes a line break.

    :param soup: captions XML as bytes
    :return: the SRT document as a string (empty if there are no titles)
    """

    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # Raw bytes literal: "\w" is not a valid escape in a plain literal.
        soup = re.sub(rb"&(?![#\w]+;)", b"&amp;", soup)

    xml = XML(soup)

    entries = []
    for index, title in enumerate(xml.iter('title'), 1):
        start, startfract = title.get('start').rsplit(':', 1)
        end, endfract = title.get('end').rsplit(':', 1)
        # An empty <title/> has text None; emit a blank caption instead of
        # failing on None.replace().
        caption = (title.text or '').replace('|', '\n')
        entries.append('{}\n{},{:0<3.3} --> {},{:0<3.3}\n{}\n\n'.format(
            index, start, startfract, end, endfract, caption))

    return ''.join(entries)
Beispiel #5
0
def get_text(path):
    """Return the text of a .docx file as a flat list of non-empty fragments.

    ``word/document.xml`` is read from the archive at ``path``.  The text
    nodes of each paragraph are concatenated, paragraphs are joined with a
    blank line, the result is split into sentences on ``'. '`` and each
    sentence is split again on line breaks.  Empty fragments are dropped.

    :param path: path to the .docx file
    :return: list of text fragments in document order
    """
    word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    para = word_namespace + 'p'
    text = word_namespace + 't'

    # The context manager closes the archive even if the member is missing.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    raw_text_list = []
    for paragraph in tree.iter(para):
        texts = [node.text for node in paragraph.iter(text) if node.text]
        if texts:
            raw_text_list.append(''.join(texts))

    all_text = '\n\n'.join(raw_text_list)  # the whole text as one string

    # Split by sentence first, then by paragraph break; keep non-empty parts.
    return [
        fragment
        for sentence in all_text.split('. ')
        for fragment in sentence.split('\n')
        if fragment
    ]
Beispiel #6
0
def parse_config(soup):
    """There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we
    need.

    Every ``param`` element contributes a ``name`` -> ``value`` entry (the
    first occurrence of a name wins).  The convenience keys ``rtmp_url``,
    ``auth_url``, ``api_url``, ``categories_url`` and ``captions_url`` are
    then added on top.

    :param soup: config XML
    :return: dict of all parameters plus the convenience keys;
        ``rtmp_url`` is ``None`` when ``server_streaming`` is absent
    :raises KeyError: if ``categories``, ``auth``, ``api`` or ``captions``
        is missing
    """

    xml = XML(soup)
    params = {}
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in the
    # auth result as well.  Hence .get(): a missing field must not raise.
    rtmp_url = params.get('server_streaming')
    categories_url = params['categories']

    params.update({
        'rtmp_url': rtmp_url,
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': categories_url,
        'captions_url': params['captions'],
    })
    return params
def get_docx_table(path):
    """
    Collect the cell texts of every table row in a .docx file.

    ``word/document.xml`` is read from the archive at ``path``.  Each ``TR``
    element found anywhere in the document yields one row; each ``TC``
    element inside it yields one cell.  A cell's text is the text of its
    non-empty paragraphs joined with a newline.

    :param path: path to the .docx file
    :return: list of rows, each a list of cell strings
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    rows = []
    for xml_row in tree.iter(TR):
        row = []
        for xml_cell in xml_row.iter(TC):
            # Each cell consists of one or more paragraphs; paragraphs
            # without text contribute nothing (not even a separator).
            paragraph_texts = (
                "".join(node.text for node in paragraph.iter(TEXT)
                        if node.text)
                for paragraph in xml_cell.iter(PARA)
            )
            row.append("\n".join(t for t in paragraph_texts if t))
        rows.append(row)
    return rows
def replace_string2(filename):
    """Load a Marian translation model and run ``translat`` over a .docx file's paragraphs.

    Rebinds the module globals ``model_name``, ``tokenizer`` and ``model``,
    reads ``word/document.xml`` from the archive at ``filename`` and, for
    every ``PARA`` element that has non-empty ``TEXT`` nodes, passes the list
    of node texts to ``translat``.

    NOTE(review): the write-back part looks unfinished -- see the notes at
    the end of the loop and on the final line.  The intended contract of
    ``translat`` (defined outside this block) is not visible here, so the
    code is left untouched.

    :param filename: path to the .docx file
    """
    global model_name
    global tokenizer
    global model

    # Presumably the English -> German model, judging by the name -- confirm.
    model_name = 'Helsinki-NLP/opus-mt-en-de'

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    document = zipfile.ZipFile(filename)
    xml_content = document.read('word/document.xml')
    # The archive is deliberately left open here (close() is commented out).
    #document.close()
    tree = XML(xml_content)
    # using lxml instead of xml preserved the comments

    paragraphs = []  # never filled or read below
    i = 0
    for paragraph in tree.iter(PARA):
        i = i + 1  # paragraph counter, only used by the commented-out print
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            #text = list(filter(None, text))
            #text = [s for s in text if p.match(s)]

            #text = [">>de<< " + s for s in text]
            #print("%s: %s" %(i,texts))
            target, duration = translat(texts)
            # NOTE(review): ``texts`` is a list, so str.replace() would raise
            # TypeError; ``paragraph.text`` may also be None, and the result
            # of replace() is discarded anyway -- the tree is never modified.
            paragraph.text.replace(texts, target)

    # NOTE(review): if ``zipfile`` is the standard-library module, ZipFile
    # has no ``save`` method, so this line presumably raises AttributeError.
    document.save("new.docx")
Beispiel #9
0
def parse_config(soup):
    """There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we
    need.

    Every ``param`` element contributes a ``name`` -> ``value`` entry (the
    first occurrence of a name wins).  The convenience keys ``rtmp_url``,
    ``auth_url``, ``api_url``, ``categories_url`` and ``captions_url`` are
    then added on top.

    :param soup: config XML
    :return: dict of all parameters plus the convenience keys;
        ``rtmp_url`` is ``None`` when ``server_streaming`` is absent
    :raises KeyError: if ``categories``, ``auth``, ``api`` or ``captions``
        is missing
    """

    xml = XML(soup)
    params = {}
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in the
    # auth result as well.  Hence .get(): a missing field must not raise.
    rtmp_url = params.get('server_streaming')
    categories_url = params['categories']

    params.update({
        'rtmp_url': rtmp_url,
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': categories_url,
        'captions_url': params['captions'],
    })
    return params
def get_docx_tables(path):
    """
    Yield every TBL element found in the document XML of a .docx file.

    The archive at ``path`` is opened, ``word/document.xml`` is read and the
    archive is closed again before the first element is produced.  Nothing
    happens until the returned generator is iterated.
    """
    archive = zipfile.ZipFile(path)
    document_xml = archive.read('word/document.xml')
    archive.close()
    yield from XML(document_xml).iter(TBL)
Beispiel #11
0
 def _update_esl(self):
     """Post an update task for every label that has products and is flagged ``need_update``.

     All records returned by ``self.search([])`` are examined.  For each
     qualifying label the body built by ``_build_task_body()`` is POSTed as
     XML to ``/service/task`` on port 8001 of the host stored in the
     ``core_appliance_ip`` config parameter.  The ``id`` attribute of each
     ``Transaction`` element in the reply is stored in ``task_id`` and the
     ``need_update`` flag is cleared.
     """
     for label in self.search([]):
         if not (label.len_products != 0 and label.need_update == True):
             continue

         host = self.env['ir.config_parameter'].get_param('core_appliance_ip')
         task_url = 'http://' + host + ':8001/service/task'
         payload = label._build_task_body().encode('utf-8')
         response = requests.post(
             task_url,
             data=payload,
             headers={'Content-Type': 'application/xml'})

         for transaction in XML(response.text).iter('Transaction'):
             label.task_id = transaction.get('id')
         label.need_update = False
def get_para_list(path):
    """Return the text of every non-empty paragraph of a .docx file.

    ``word/document.xml`` is read from the archive at ``path``; for each
    ``PARA`` element the texts of its ``TEXT`` nodes are concatenated.
    Paragraphs without any text are skipped.

    :param path: path to the .docx file
    :return: list of paragraph strings in document order
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))
    # Return the collected list, not the loop variable ``paragraph`` (which
    # is the last Element, or undefined when the document has no paragraphs).
    return paragraphs
Beispiel #13
0
def get_docx_text(filename):
    """
    Take the path of a docx file as argument, return the text in unicode.

    ``word/document.xml`` is read from the archive; for each ``PARA``
    element the texts of its ``TEXT`` nodes are concatenated.  Non-empty
    paragraphs are joined with a blank line.

    :param filename: path to the .docx file
    :return: the document text as a single string
    """
    # The context manager closes the archive even if reading fails.
    with zipfile.ZipFile(filename) as document:
        xml_content = document.read("word/document.xml")
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.iter(PARA):
        # Element.getiterator() was removed in Python 3.9; iter() is the
        # drop-in replacement.
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append("".join(texts))

    return "\n\n".join(paragraphs)
Beispiel #14
0
 def _get_last_image_loaded(self):
     """Set ``image`` on each label from the update status of its task.

     Labels without a ``task_id`` get ``image = False``.  For the others a
     GET request is sent to ``/service/updatestatus/transaction/<task_id>``
     on port 8001 of the host stored in the ``core_appliance_ip`` config
     parameter.  The ``id`` attribute of the last ``UpdateStatus`` element
     in the reply (or ``""`` when the response is falsy or has no such
     element) is passed to ``_get_image_from_task`` and the result stored in
     ``image``.
     """
     for label in self:
         if not label.task_id:
             label.image = False
             continue

         update_id = ""
         host = self.env['ir.config_parameter'].get_param('core_appliance_ip')
         status_url = ('http://' + host +
                       ':8001/service/updatestatus/transaction/' +
                       label.task_id)
         reply = requests.get(status_url)
         if reply:
             for node in XML(reply.text).iter('UpdateStatus'):
                 update_id = node.get('id')
         label.image = label._get_image_from_task(update_id)
Beispiel #15
0
 def docx_text_extractor(self, text):
     """Return the plain text of a .docx file.

     ``word/document.xml`` is read from the archive; for each paragraph
     element the texts of its text nodes are concatenated.  Non-empty
     paragraphs are joined with a blank line.

     :param text: path (or file-like object) of the .docx archive, passed
         straight to ``zipfile.ZipFile``
     :return: the document text as a single string
     """
     WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
     PARA = WORD_NAMESPACE + 'p'
     TEXT = WORD_NAMESPACE + 't'
     # The context manager closes the archive even if reading fails.
     with zipfile.ZipFile(text) as document:
         xml_content = document.read('word/document.xml')
     tree = XML(xml_content)
     paragraphs = []
     for paragraph in tree.iter(PARA):
         # Element.getiterator() was removed in Python 3.9; iter() is the
         # drop-in replacement.
         texts = [node.text for node in paragraph.iter(TEXT) if node.text]
         if texts:
             paragraphs.append(''.join(texts))
     return '\n\n'.join(paragraphs)
Beispiel #16
0
 def get_docx_text(path):
     """
     Take the path of a docx file as argument, return the text in unicode.

     ``word/document.xml`` is read from the archive; for each paragraph
     element the texts of its text nodes are concatenated.  Non-empty
     paragraphs are joined with ``os.linesep``.

     :param path: path to the .docx file
     :return: the document text as a single string
     """
     # The docstring must be the first statement to be picked up as __doc__,
     # so the namespace constants now follow it.
     schemas = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
     para = schemas + 'p'
     text = schemas + 't'
     # The context manager closes the archive even if reading fails.
     with zipfile.ZipFile(path) as document:
         xml_content = document.read('word/document.xml')
     tree = XML(xml_content)
     paragraphs = []
     for paragraph in tree.iter(para):
         texts = [node.text for node in paragraph.iter(text) if node.text]
         if texts:
             paragraphs.append(''.join(texts))
     return os.linesep.join(paragraphs)
Beispiel #17
0
 def _get_status_task(self):
     """Update ``task_status`` of every record that has a ``task_id``.

     A GET request is sent to ``/service/transaction/<task_id>/status`` on
     port 8001 of the host stored in the ``core_appliance_ip`` config
     parameter.  For each ``TransactionStatusInfo`` element of a truthy
     response the status becomes ``"FAILED"`` when ``failed == "true"``,
     otherwise ``"FINISHED"`` when ``finished == "true"``, otherwise
     ``"WAITING"``.  Records without a ``task_id`` and falsy responses leave
     the field untouched.
     """
     for location in self:
         if not location.task_id:
             continue

         host = self.env['ir.config_parameter'].get_param('core_appliance_ip')
         status_url = ('http://' + host + ':8001/service/transaction/' +
                       location.task_id + '/status')
         response = requests.get(status_url)
         if not response:
             continue

         for info in XML(response.text).iter('TransactionStatusInfo'):
             if info.get('failed') == "true":
                 state = "FAILED"
             elif info.get('finished') == "true":
                 state = "FINISHED"
             else:
                 state = "WAITING"
             location.task_status = state
Beispiel #18
0
def get_docx_text(path):
    #use white-space: pre CSS
    """
    Take the path of a docx file as argument, return the formatted text.

    Reads ``word/document.xml`` from the archive and walks every ``PARA``
    element, translating some paragraph properties (justification, indent,
    shading, borders) into CSS declarations.  The result is an
    ``<article>`` string containing one ``<p style="...">`` per paragraph
    that has text.

    The element/attribute names used below (``PARA``, ``PARAPROPS``,
    ``justification``, ``indent``, ``shade``, ``paraBorders``,
    ``borderTypes``, ``shdBreak``, ``fontSize``, ``TEXT``,
    ``WORD_NAMESPACE``) are defined outside this block.

    NOTE(review): the paragraph text is collected in ``paragraphText`` but
    never appended to the output, so every emitted ``<p>`` is empty.  Styles
    gathered for a paragraph without text are not reset and carry over into
    the next paragraph that has text.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    #elements = list()
    styleAdditions = list()  # CSS declarations pending for the next <p>
    elementString = ["<article>"]  # output fragments, joined at the end

    for paragraph in tree.iter(PARA):
        # The bare string below is a no-op expression (leftover draft code).
        """for paraStyleElement in paragraph.find(PARAPROPS).iter():
            if paraStyleElement =="""

        #JUSTIFICATION
        # NOTE(review): Element.find() returns None when nothing matches, so
        # this presumably raises AttributeError for paragraphs lacking the
        # properties or justification element -- confirm.
        justificationInfo = paragraph.find(PARAPROPS).find(
            justification).attrib[WORD_NAMESPACE + "val"]
        if justificationInfo != "left" and justificationInfo != "both":
            styleAdditions.append("text-align:" + justificationInfo + ";")
        elif justificationInfo == "both":
            styleAdditions.append("text-align:justify;")

        #INDENT
        # NOTE(review): truth-testing an Element checks whether it has child
        # elements, not whether it was found; a childless match is falsy.
        # ``is not None`` was presumably intended here and in the shading and
        # border checks below -- confirm.
        if paragraph.find(PARAPROPS).find(indent):
            indentInfo = dict()
            for attribKey in paragraph.find(PARAPROPS).find(indent).attrib:
                # Attribute values are divided by 20 before use.
                if attribKey == WORD_NAMESPACE + "hanging":
                    indentInfo["hanging"] = (float(
                        paragraph.find(PARAPROPS).find(
                            indent).attrib[attribKey]) / 20)
                if attribKey == WORD_NAMESPACE + "end" or attribKey == WORD_NAMESPACE + "right":
                    indentInfo["right"] = (float(
                        paragraph.find(PARAPROPS).find(
                            indent).attrib[attribKey]) / 20)
                if attribKey == WORD_NAMESPACE + "start" or attribKey == WORD_NAMESPACE + "left":
                    indentInfo["left"] = (float(
                        paragraph.find(PARAPROPS).find(
                            indent).attrib[attribKey]) / 20)
            for key in indentInfo:
                # The margin values are emitted without a CSS unit.
                if key == "hanging":
                    styleAdditions.append("margin-top:" +
                                          str(indentInfo[key]) + ";")
                if key == "right" or key == "end":
                    styleAdditions.append("margin-right:" +
                                          str(indentInfo[key]) + ";")
                if key == "left" or key == "start":
                    styleAdditions.append("margin-left:" +
                                          str(indentInfo[key]) + ";")
                #left => start, right => end, hanging

        #BACKGROUND COLOURING
        if paragraph.find(PARAPROPS).find(shade):
            if paragraph.find(PARAPROPS).find(shade).attrib[WORD_NAMESPACE +
                                                            "fill"] != "auto":
                # NOTE(review): the lookup below uses the bare key "fill"
                # while the check above uses the namespaced key; this looks
                # like it would raise KeyError -- confirm.
                styleAdditions.append(
                    "background-color: #" +
                    paragraph.find(PARAPROPS).find(shade).attrib["fill"] + ";")

        #BORDERS
        if paragraph.find(PARAPROPS).find(paraBorders):
            for sideElement in paragraph.find(PARAPROPS).find(
                    paraBorders).iter():
                pass
                # NOTE(review): this compares an Element with a string, which
                # is always unequal; ``sideElement.tag`` was presumably meant.
                # Element.iter() also yields the border container itself.
                if sideElement != WORD_NAMESPACE + "between":
                    styleAdditions.append(
                        "border-style:" +
                        borderTypes[sideElement.attrib["val"]])

        #GET ALL THE TEXT, FIX TO PUT TEXT INTO ITS HTML ELEMENT
        paragraphText = [
            element.text for element in paragraph.iter(TEXT) if element.text
        ]  #make a list comprehension of styles to match
        if not paragraphText:
            # Paragraph without text: only style declarations are recorded.
            # NOTE(review): iter() returns an iterator object, which is
            # always truthy, so this condition never filters anything.
            if paragraph.iter(shdBreak):
                for shadow in paragraph.iter(
                        shdBreak
                ):  #bloody generator making me use a for loop for one element
                    if shadow.attrib[WORD_NAMESPACE + "fill"] == "auto":
                        for element in paragraph.iter(fontSize):
                            breakSize = str(
                                float(element.attrib[WORD_NAMESPACE + "val"]) /
                                2
                            )  #gets a dict of fontSize element attributes, then gets the value representing font size, in points
                            break
                        # NOTE(review): ``breakSize`` is unbound (or stale from
                        # an earlier paragraph) when no fontSize element exists.
                        styleAdditions.append(
                            "min-height:" + breakSize + "pt;"
                        )  #why not em? ATTENTION ATTENTION ATTENTION ATTENTION ATTENTION
                        styleAdditions.append("margin:0;")
                    # Only the first shdBreak element is examined.
                    break
        else:
            # Paragraph with text: emit an (empty) <p> carrying the styles.
            elementString.append("<p")
            elementString.append(' style="')
            #elementString.append("white-space:pre-wrap;word-wrap:break-word;")
            for style in styleAdditions:
                elementString.append(style)
            # Terminate the style list with ";" unless it already ends so.
            if ''.join(elementString)[-1:] != ";":
                elementString.append(";")

            elementString.append('"')
            elementString.append(">")
            elementString.append("</p>\n")
            styleAdditions = list()
        # Another no-op string holding an earlier plain-text implementation.
        """texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            elements.append(''.join(texts))"""
    elementString.append("</article>")
    return ''.join(elementString)
def get_docx_text(path):
    #use white-space: pre CSS
    """
    Take the path of a docx file as argument, return the formatted text.

    Reads ``word/document.xml`` from the archive and walks every ``PARA``
    element, translating some paragraph properties (justification, indent,
    shading, borders) into CSS declarations.  The result is an
    ``<article>`` string containing one ``<p style="...">`` per paragraph
    that has text.

    The element/attribute names used below (``PARA``, ``PARAPROPS``,
    ``justification``, ``indent``, ``shade``, ``paraBorders``,
    ``borderTypes``, ``shdBreak``, ``fontSize``, ``TEXT``,
    ``WORD_NAMESPACE``) are defined outside this block.

    NOTE(review): the paragraph text is collected in ``paragraphText`` but
    never appended to the output, so every emitted ``<p>`` is empty.  Styles
    gathered for a paragraph without text are not reset and carry over into
    the next paragraph that has text.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    #elements = list()
    styleAdditions = list()  # CSS declarations pending for the next <p>
    elementString = ["<article>"]  # output fragments, joined at the end

    for paragraph in tree.iter(PARA):
        # The bare string below is a no-op expression (leftover draft code).
        """for paraStyleElement in paragraph.find(PARAPROPS).iter():
            if paraStyleElement =="""

        #JUSTIFICATION
        # NOTE(review): Element.find() returns None when nothing matches, so
        # this presumably raises AttributeError for paragraphs lacking the
        # properties or justification element -- confirm.
        justificationInfo = paragraph.find(PARAPROPS).find(justification).attrib[WORD_NAMESPACE+"val"]
        if justificationInfo != "left" and justificationInfo != "both":
            styleAdditions.append("text-align:"+justificationInfo+";")
        elif justificationInfo == "both":
            styleAdditions.append("text-align:justify;")

        #INDENT
        # NOTE(review): truth-testing an Element checks whether it has child
        # elements, not whether it was found; a childless match is falsy.
        # ``is not None`` was presumably intended here and in the shading and
        # border checks below -- confirm.
        if paragraph.find(PARAPROPS).find(indent):
            indentInfo = dict()
            for attribKey in paragraph.find(PARAPROPS).find(indent).attrib:
                # Attribute values are divided by 20 before use.
                if attribKey == WORD_NAMESPACE +"hanging":
                    indentInfo["hanging"] = (float(paragraph.find(PARAPROPS).find(indent).attrib[attribKey])/20)
                if attribKey == WORD_NAMESPACE +"end" or attribKey == WORD_NAMESPACE +"right":
                    indentInfo["right"] = (float(paragraph.find(PARAPROPS).find(indent).attrib[attribKey])/20)
                if attribKey == WORD_NAMESPACE +"start" or attribKey == WORD_NAMESPACE +"left":
                   indentInfo["left"] = (float(paragraph.find(PARAPROPS).find(indent).attrib[attribKey])/20)
            for key in indentInfo:
                # The margin values are emitted without a CSS unit.
                if key == "hanging":
                    styleAdditions.append("margin-top:"+str(indentInfo[key])+";")
                if key == "right" or key == "end":
                    styleAdditions.append("margin-right:"+str(indentInfo[key])+";")
                if key == "left" or key == "start":
                    styleAdditions.append("margin-left:"+str(indentInfo[key])+";")
                #left => start, right => end, hanging

        #BACKGROUND COLOURING
        if paragraph.find(PARAPROPS).find(shade):
                # NOTE(review): the append below uses the bare key "fill"
                # while this check uses the namespaced key; that lookup
                # looks like it would raise KeyError -- confirm.
                if paragraph.find(PARAPROPS).find(shade).attrib[WORD_NAMESPACE +"fill"] != "auto":
                    styleAdditions.append("background-color: #"+paragraph.find(PARAPROPS).find(shade).attrib["fill"]+";")

        #BORDERS
        if paragraph.find(PARAPROPS).find(paraBorders):
            for sideElement in paragraph.find(PARAPROPS).find(paraBorders).iter():
                pass
                # NOTE(review): this compares an Element with a string, which
                # is always unequal; ``sideElement.tag`` was presumably meant.
                # Element.iter() also yields the border container itself.
                if sideElement != WORD_NAMESPACE + "between":
                    styleAdditions.append("border-style:"+borderTypes[sideElement.attrib["val"]])

        #GET ALL THE TEXT, FIX TO PUT TEXT INTO ITS HTML ELEMENT
        paragraphText = [element.text for element in paragraph.iter(TEXT) if element.text] #make a list comprehension of styles to match
        if not paragraphText:
            # Paragraph without text: only style declarations are recorded.
            # NOTE(review): iter() returns an iterator object, which is
            # always truthy, so this condition never filters anything.
            if paragraph.iter(shdBreak):
                for shadow in paragraph.iter(shdBreak): #bloody generator making me use a for loop for one element
                    if shadow.attrib[WORD_NAMESPACE+"fill"] == "auto":
                        for element in paragraph.iter(fontSize):
                            breakSize = str(float(element.attrib[WORD_NAMESPACE+"val"])/2) #gets a dict of fontSize element attributes, then gets the value representing font size, in points
                            break
                        # NOTE(review): ``breakSize`` is unbound (or stale from
                        # an earlier paragraph) when no fontSize element exists.
                        styleAdditions.append("min-height:"+breakSize+"pt;") #why not em? ATTENTION ATTENTION ATTENTION ATTENTION ATTENTION
                        styleAdditions.append("margin:0;")
                    # Only the first shdBreak element is examined.
                    break
        else:
            # Paragraph with text: emit an (empty) <p> carrying the styles.
            elementString.append("<p")
            elementString.append(' style="')
            #elementString.append("white-space:pre-wrap;word-wrap:break-word;")
            for style in styleAdditions:
                elementString.append(style)
            # Terminate the style list with ";" unless it already ends so.
            if ''.join(elementString)[-1:] != ";":
                elementString.append(";")

            elementString.append('"')
            elementString.append(">")
            elementString.append("</p>\n")
            styleAdditions = list()
        # Another no-op string holding an earlier plain-text implementation.
        """texts = [node.text
                 for node in paragraph.iter(TEXT)
                 if node.text]
        if texts:
            elements.append(''.join(texts))"""
    elementString.append("</article>")
    return ''.join(elementString)
Beispiel #20
0
 def _get_type_template(self):
     """Derive ``type``, ``size``, ``multi`` and ``dyn`` from the template named ``self.name``.

     Does nothing when ``name`` is empty.  Otherwise ``type`` and ``size``
     are reset to ``""``.  If the ``core_appliance_ip`` config parameter is
     unset, only ``dyn`` is set to True.  Otherwise the template is fetched
     from ``/service/template/<name>`` on port 8001 of that host:

     * ``multi`` records whether the reply text contains ``"articles"``.
     * For each ``image`` element whose width/height match a known display
       format, the format's label and size are appended to ``type`` and
       ``size``.
     * For any other ``image`` element ``type`` becomes ``"Dynamic"``,
       ``size`` becomes ``<width>*<height>`` and ``dyn`` is set to True.
     """
     if not self.name:
         return

     self.type = ""
     self.size = ""

     config = self.env['ir.config_parameter']
     if not config.get_param('core_appliance_ip'):
         self.dyn = True
         return

     response = requests.get('http://' +
                             config.get_param('core_appliance_ip') +
                             ':8001/service/template/' + self.name)
     self.multi = "articles" in response.text

     # (width, height) -> (text appended to type, text appended to size)
     known_formats = {
         ("152", "152"): ("G1 1.6 red ", "152*152 "),
         ("212", "104"): ("G1 2.2 red ", "212*104 "),
         ("296", "152"): ("G1 2.6 red ", "296*152 "),
         ("264", "176"): ("G1 2.7 red ", "264*176 "),
         ("400", "300"): ("G1 4.2 red G1 4.4 red ", "400*300 "),
         ("480", "176"): ("G1 4.5 red ", "480*176 "),
         ("600", "448"): ("G1 6.0 red ", "600*448 "),
         ("480", "800"): ("G1 7.4 red ", "480*800 "),
         ("768", "960"): ("G1 12.2 red ", "768*960 "),
     }

     root = XML(response.text.encode('utf-8'))
     for resp in root.iter('image'):
         known = known_formats.get((resp.get('width'), resp.get('height')))
         if known is not None:
             type_suffix, size_suffix = known
             self.type += type_suffix
             self.size += size_suffix
         else:
             # Unknown dimensions: treat the template as dynamic.
             self.type = "Dynamic"
             self.size = str(resp.get('width')) + "*" + str(
                 resp.get('height'))
             self.dyn = True
Beispiel #21
0
    def load(path):
        """Load the file at the given path, split it into sentences and return them.

        ``.txt`` files are read as UTF-8.  ``.doc`` / ``.docx`` files are
        opened as a zip archive and the text is taken from the
        ``word/document.xml`` member inside it; see
        https://github.com/nmolivo/tesu_scraper/blob/master/Python_Blogs/01_extract_from_MSWord.ipynb
        The text is split on ``'.'``, each piece is stripped and passed
        through ``TextLoader.remove_trash_symbols``, and empty results are
        dropped.

        :param path: path to the file to load
        :type path: str
        :return: list of sentences loaded from the given file
        :rtype: [list]
        :raises ValueError: if ``path`` is not a string or the extension is
            not one of pdf, txt, doc, docx
        :raises NotImplementedError: for pdf files
        :raises FileNotFoundError: if a doc/docx archive has no
            ``word/document.xml``
        """
        if not isinstance(path, str):
            raise ValueError('Path must be a string with path to file')
        extension = path.split('.')[-1]
        if extension == 'txt':
            # todo: auto-check encoding
            # The context manager closes the file; it used to be left open.
            with open(path, 'r', encoding='utf-8') as source:
                chunks = [source.read()]
        elif extension == 'pdf':
            raise NotImplementedError('Under construction')
        elif extension in ['doc', 'docx']:
            source_filename = 'word/document.xml'
            # The context manager closes the archive on every path,
            # including the one where the expected member is missing.
            with zipfile.ZipFile(path) as document:
                if source_filename not in document.namelist():
                    raise FileNotFoundError(
                        'Cannot find {} inside selected file'.format(
                            source_filename))
                xml_content = document.read(source_filename)

            # Warning: the xml.etree.ElementTree module is not secure
            # against maliciously constructed data.
            tree = XML(xml_content)

            word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
            para = word_namespace + 'p'
            text = word_namespace + 't'

            chunks = []
            for paragraph in tree.iter(para):
                # Element.getiterator() was removed in Python 3.9; iter()
                # is the drop-in replacement.
                texts = [
                    node.text for node in paragraph.iter(text) if node.text
                ]
                if texts:
                    chunks.append(''.join(texts))
        else:
            raise ValueError("Can't handle non pdf, txt, doc or docx file")

        # Shared tail: the whole txt content or each docx paragraph is split
        # into sentences and cleaned the same way.
        lines = [
            TextLoader.remove_trash_symbols(piece.strip())
            for chunk in chunks for piece in chunk.split('.')
        ]
        return [line for line in lines if len(line) > 0]