Python XML.XMLの例、xml.etree.cElementTree.XML.XML Pythonの例

コード例 #1

0

ファイルを表示

 def _get_type_and_status(self):
     """Refresh ``status``, ``type`` and ``size`` of every location that has a ``real_id``.

     Two HTTP GET requests are issued per location against the host stored in
     the ``core_appliance_ip`` config parameter (port 8001):

     * ``/service/labelinfo/<real_id>`` -- every ``ConnectionStatus`` element
       found is written to ``location.status``.
     * ``/service/labelinfo/type/<real_id>`` -- ``Name`` goes to
       ``location.type``; ``DisplayWidth`` and ``DisplayHeight`` are combined
       into ``location.size`` as ``"<width>*<height>"``.

     A falsy response marks the location as ``UNKNOWN`` (and, for the first
     request, ``UNREGISTERED``). Locations without ``real_id`` are left untouched.
     """
     for location in self:
         if not location.real_id:
             continue

         # Build the service prefix once per record instead of repeating the
         # config lookup and the string assembly for each request.
         base_url = ('http://' +
                     self.env['ir.config_parameter'].
                     get_param('core_appliance_ip') +
                     ':8001/service/labelinfo/')

         # NOTE(review): no timeout is passed, so an unreachable appliance
         # can block this call -- confirm whether that is acceptable.
         response = requests.get(base_url + location.real_id)
         if response:
             root = XML(response.text)
             for resp in root.iter('ConnectionStatus'):
                 location.status = resp.text
         else:
             location.type = "UNKNOWN"
             location.status = "UNREGISTERED"

         response = requests.get(base_url + 'type/' + location.real_id)
         if response:
             root = XML(response.text)
             for resp in root.iter('Name'):
                 location.type = resp.text
             for resp in root.iter('DisplayWidth'):
                 location.size = resp.text
             # Appended to whatever the width loop stored above.
             for resp in root.iter('DisplayHeight'):
                 location.size += "*" + resp.text
         else:
             location.type = "UNKNOWN"

コード例 #2

0

ファイルを表示

ファイル: app.py プロジェクト: soitun/commcare-hq

def _clean_xml_for_partial_submission(xml, should_remove_case_actions):
    """
    Tidy up a partially completed form document so it can be submitted.

    The top-level case element gets a fresh ``date_modified`` attribute (and,
    on request, loses all of its children); every ``timeEnd`` child of the
    top-level meta element is set to the same timestamp.

    :param xml: partially completed xml
    :param should_remove_case_actions: if True, remove case actions (create, update, close) from xml
    :return: byte str of cleaned xml
    """
    document = XML(xml)

    # Each matcher accepts an optional "{namespace}" prefix so the lookup
    # works regardless of the namespace the form uses.
    is_case = re.compile(r"^(\{.*\}){0,1}case$").match
    is_meta = re.compile(r"^(\{.*\}){0,1}meta$").match
    is_time_end = re.compile(r"^(\{.*\}){0,1}timeEnd$").match

    now = json_format_datetime(utcnow())

    for element in document:
        if is_case(element.tag) is not None:
            element.set("date_modified", now)
            if should_remove_case_actions:
                # Iterate over a snapshot: removing children from the live
                # element while walking it would skip entries.
                for action in list(element):
                    element.remove(action)
            continue

        if is_meta(element.tag) is not None:
            for entry in element:
                if is_time_end(entry.tag):
                    entry.text = now

    return tostring(document)

コード例 #3

0

ファイルを表示

def fix_etree():
    try:
        from xml.etree.cElementTree import XML
        e = XML('<test><t a="1"/></test>')
        e.find('t[@a="1"]')
    except SyntaxError:
        import canari.xmltools.fixetree

コード例 #4

0

ファイルを表示

ファイル: Docx.py プロジェクト: ityas/pulling

def get_text(path):
    """Return the text of a .docx file as a flat list of non-empty fragments.

    Paragraph texts are joined with blank lines, the result is split on
    ``'. '`` (sentence boundaries) and each piece is split again on newlines
    (paragraph boundaries). Empty fragments are dropped.

    :param path: path of the .docx file
    :return: list of strings
    """
    word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    para = word_namespace + 'p'
    text = word_namespace + 't'

    # The context manager closes the archive even when the member is missing.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    raw_text_list = []
    for paragraph in tree.iter(para):
        texts = [node.text for node in paragraph.iter(text) if node.text]
        if texts:
            raw_text_list.append(''.join(texts))

    all_text = '\n\n'.join(raw_text_list)  # the whole text in one string

    # Split into sentences first, then into paragraphs; keep non-empty pieces.
    return [
        string
        for line in all_text.split('. ')
        for string in line.split('\n')
        if string
    ]

コード例 #5

0

ファイルを表示

def get_docx_tags(xml_content):
    """
    Collect, per paragraph, the tags that are not in the Word namespace.

    For every ``WORD_NAMESPACE + 'p'`` element, each descendant whose tag does
    not contain ``WORD_NAMESPACE`` is rendered as ``<tag>`` with
    ``OMML_NAMESPACE`` shortened to ``m:``. The rendered tags of one paragraph
    are joined with spaces; a paragraph without such tags contributes the
    placeholder ``'[No math tags in this cluster]'``.

    :param xml_content: XML text of the document part
    :return: list with one string per paragraph
    """
    tree = XML(xml_content)
    all_tags = []

    for paragraph in tree.iter(WORD_NAMESPACE + 'p'):
        ctags = [
            '<' + elem.tag.replace(OMML_NAMESPACE, 'm:') + '>'
            for elem in paragraph.iter()
            if WORD_NAMESPACE not in elem.tag
        ]
        if ctags:
            all_tags.append(' '.join(ctags))
        else:
            all_tags.append('[No math tags in this cluster]')
    return all_tags

コード例 #6

0

ファイルを表示

def xml_from_string(xml_string):
    """Parse *xml_string* and return the root element, registering the XNAT_NS prefixes."""
    element = XML(xml_string)
    # Registration happens only after a successful parse, so malformed input
    # raises before any prefix is registered.
    for ns_prefix in XNAT_NS:
        register_namespace(ns_prefix, XNAT_NS[ns_prefix])
    return element

コード例 #7

0

ファイルを表示

def docx_extractor(path, vectors=False):
    """Extract the creator and the numbered paragraphs of a .docx file.

    :param path: path of the .docx file
    :param vectors: when true, also compute ``vectorizer(text, lang=detect(text))``
        for every non-empty paragraph
    :return: ``(creator, doc)`` or, with *vectors*, ``(creator, doc, vector)``.
        ``doc`` and ``vector`` are dicts keyed by the paragraph number as a
        string (counting from "1", empty paragraphs skipped). ``creator`` is
        ``"Unknown"`` when the core properties cannot be read.
    """
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
        ## GET METADATA
        # use lxml to parse the xml file we are interested in
        try:
            core = lxml.etree.fromstring(document.read('docProps/core.xml'))
            # retrieve creator
            ns = {'dc': 'http://purl.org/dc/elements/1.1/'}
            creator = core.xpath('//dc:creator', namespaces=ns)[0].text
        except Exception:
            # Best effort: a missing or malformed core.xml, or one without a
            # creator entry, must not abort the extraction.
            creator = "Unknown"
    tree = XML(xml_content)

    doc = {}
    vector = {}
    paragraph_nb = 1
    # Element.getiterator() was removed in Python 3.9; iter() is the replacement.
    for paragraph in tree.iter(PARA):
        text = ''.join(node.text for node in paragraph.iter(TEXT) if node.text)
        if text:
            doc[str(paragraph_nb)] = fix_text(text)
            if vectors:
                vector[str(paragraph_nb)] = vectorizer(text, lang=detect(text))
            paragraph_nb += 1

    if vectors:
        return creator, doc, vector
    return creator, doc

コード例 #8

0

ファイルを表示

def get_raw_text(pthFile):
    """
    Return the non-empty paragraphs of a Word document as a list of strings.

    :param pthFile: path of the .docx file
    :return: list with the concatenated text of each paragraph that has text
    """
    # Constants used to iterate over the XML tree
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The archive is closed even if the document part is missing.
    with zipfile.ZipFile(pthFile) as docWordDoc:
        xmlContent = docWordDoc.read('word/document.xml')
    treeXML = XML(xmlContent)

    lstParagraphs = []
    # Walk every paragraph and join the text of its runs.
    # Element.getiterator() was removed in Python 3.9; iter() is the replacement.
    for parParagraph in treeXML.iter(PARA):
        lstTexts = [
            nodElement.text for nodElement in parParagraph.iter(TEXT)
            if nodElement.text
        ]
        if lstTexts:
            print(lstTexts)
            lstParagraphs.append(''.join(lstTexts))

    return lstParagraphs

コード例 #9

0

ファイルを表示

def createContainer(name,
                    panel="",
                    username="",
                    domain="",
                    password="",
                    hostname="",
                    protocol="",
                    port=""):
    """Build a ``<Node Type="Container" ...>`` element from the given settings.

    For each of ``username``, ``domain``, ``password``, ``protocol`` and
    ``port`` the literal value ``"[inherit]"`` is replaced by an empty string
    and the matching ``inherit*`` local is switched.

    NOTE(review): several literals in this copy read ``"******"`` and the
    attribute template contains the fragment ``"")}"``; this looks like
    automatic secret redaction damaged the source -- restore the template from
    the upstream project before relying on this function.
    NOTE(review): the values are interpolated without XML escaping, so
    characters such as ``"`` or ``&`` presumably break the markup -- confirm.

    :return: the element produced by ``XML()``
    """
    # NOTE(review): both branches assign the same "******" value here --
    # presumably "false"/"true" like the domain/protocol/port flags; confirm.
    inheritUsername = "******"
    if username == "[inherit]":
        username = ""
        inheritUsername = "******"
    inheritDomain = "false"
    if domain == "[inherit]":
        domain = ""
        inheritDomain = "true"
    # NOTE(review): same redaction pattern as inheritUsername above.
    inheritPassword = "******"
    if password == "[inherit]":
        password = ""
        inheritPassword = "******"
    inheritProtocol = "false"
    if protocol == "[inherit]":
        protocol = ""
        inheritProtocol = "true"
    inheritPort = "false"
    if port == "[inherit]":
        port = ""
        inheritPort = "true"

    # NOTE(review): the single-quoted f-string below spans two physical lines,
    # which is not valid Python as written.
    node = XML(
        f'<Node Name="{name}" VmId="" UseVmId="false" Type="Container" Expanded="true" Descr="" Icon="Domain Controller" Panel="{panel}" Id="{uuid.uuid4()}" Username="******" Domain="{domain}" Password="******"")}" Hostname="{hostname}" Protocol="{protocol}" RdpVersion="rdc6" PuttySession="Default Settings" Port="{port}" ConnectToConsole="false" UseCredSsp="true" RenderingEngine="IE" ICAEncryptionStrength="EncrBasic" RDPAuthenticationLevel="NoAuth" RDPMinutesToIdleTimeout="0" RDPAlertIdleTimeout="false" LoadBalanceInfo="" Colors="Colors16Bit" Resolution="FitToWindow" AutomaticResize="true" DisplayWallpaper="false" DisplayThemes="false" EnableFontSmoothing="true" EnableDesktopComposition="false" CacheBitmaps="true" RedirectDiskDrives="true" RedirectPorts="false" RedirectPrinters="false" RedirectClipboard="true" RedirectSmartCards="false" RedirectSound="BringToThisComputer" SoundQuality="Dynamic" RedirectAudioCapture="false" RedirectKeys="true" Connected="false" PreExtApp="" PostExtApp="" MacAddress="" UserField="" Favorite="false" ExtApp="" VNCCompression="CompNone" VNCEncoding="EncHextile" VNCAuthMode="AuthVNC" VNCProxyType="ProxyNone" VNCProxyIP="" VNCProxyPort="0" VNCProxyUsername="" VNCProxyPassword="" VNCColors="ColNormal" VNCSmartSizeMode="SmartSAspect" VNCViewOnly="false" RDGatewayUsageMethod="Never" RDGatewayHostname="" RDGatewayUseConnectionCredentials="Yes" RDGatewayUsername="" RDGatewayPassword="" RDGatewayDomain="" InheritCacheBitmaps="false" InheritColors="false" InheritDescription="false" InheritDisplayThemes="false" InheritDisplayWallpaper="false" InheritEnableFontSmoothing="false" InheritEnableDesktopComposition="false" InheritDomain="{inheritDomain}" InheritIcon="false" InheritPanel="false" InheritPassword="******" InheritPort="{inheritPort}" InheritProtocol="{inheritProtocol}" InheritRdpVersion="false" InheritPuttySession="false" InheritRedirectDiskDrives="false" InheritRedirectKeys="false" InheritRedirectPorts="false" 
InheritRedirectPrinters="false" InheritRedirectClipboard="false" InheritRedirectSmartCards="false" InheritRedirectSound="false" InheritSoundQuality="false" InheritRedirectAudioCapture="false" InheritResolution="false" InheritAutomaticResize="false" InheritUseConsoleSession="false" InheritUseCredSsp="false" InheritRenderingEngine="false" InheritUsername="******" InheritICAEncryptionStrength="false" InheritRDPAuthenticationLevel="false" InheritRDPMinutesToIdleTimeout="false" InheritRDPAlertIdleTimeout="false" InheritLoadBalanceInfo="false" InheritPreExtApp="false" InheritPostExtApp="false" InheritMacAddress="false" InheritUserField="false" InheritFavorite="false" InheritExtApp="false" InheritVNCCompression="false" InheritVNCEncoding="false" InheritVNCAuthMode="false" InheritVNCProxyType="false" InheritVNCProxyIP="false" InheritVNCProxyPort="false" InheritVNCProxyUsername="******" InheritVNCProxyPassword="******" InheritVNCColors="false" InheritVNCSmartSizeMode="false" InheritVNCViewOnly="false" InheritRDGatewayUsageMethod="false" InheritRDGatewayHostname="false" InheritRDGatewayUseConnectionCredentials="false" InheritRDGatewayUsername="******" InheritRDGatewayPassword="******" InheritRDGatewayDomain="false" InheritVmId="false" InheritUseVmId="false" />'
    )

    return node

コード例 #10

0

ファイルを表示

ファイル: becompare.py プロジェクト: jonstewart/bulk_extractor

def validate_report(report):
    """Check that *report* is a directory holding a well-formed ``report.xml``.

    :param report: path of the report directory
    :raises FileNotFoundError: if *report* is not a directory or has no
        ``report.xml`` file
    :raises xml.etree.ElementTree.ParseError: if ``report.xml`` is not
        well-formed XML
    """
    if not os.path.isdir(report):
        raise FileNotFoundError(f"{report} is not a directory")
    xmlfile = os.path.join(report, "report.xml")
    if not os.path.isfile(xmlfile):
        raise FileNotFoundError(xmlfile)
    # Read bytes so the parser honours the encoding declared in the document,
    # and close the handle deterministically.
    with open(xmlfile, "rb") as report_file:
        XML(report_file.read())

コード例 #11

0

ファイルを表示

def epilepsy_docx_xml_to_txt(
        path,
        n_xml,
        docx_xml_to_txt_save_path="L:\\word_docs\\epilepsy_docx_xml_to_txt\\"):
    """
    Extract the text of a docx file straight from its XML and save it as text.

    Run this if epilepsy_docx() isn't able to read the name.
    This should automatically read tables anyway.

    :param path: path of the docx file
    :param n_xml: running count of documents handled through this fallback
    :param docx_xml_to_txt_save_path: directory handed to ``save_as_txt``
    :return: tuple ``(text, n_xml + 1)``; paragraphs in ``text`` are separated
        by blank lines
    """

    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    # The archive is closed even if the document part is missing.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.getiterator() was removed in Python 3.9; iter() is the replacement.
    for paragraph in tree.iter(PARA):
        texts = [n.text for n in paragraph.iter(TEXT) if n.text]
        if texts:
            paragraphs.append(''.join(texts))

    pt_txt_xml = '\n\n'.join(paragraphs)
    save_as_txt(path, pt_txt_xml, docx_xml_to_txt_save_path)

    n_xml += 1
    return pt_txt_xml, n_xml

コード例 #12

0

ファイルを表示

ファイル: parser.py プロジェクト: tarhan/python-iview

def parse_config(soup):
    """There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we
    need.

    Every ``<param name=... value=...>`` element is collected into a dict
    (the first occurrence of a name wins), then the convenience keys
    ``rtmp_url``, ``auth_url``, ``api_url``, ``categories_url`` and
    ``captions_url`` are added.

    :param soup: the config XML
    :return: dict of parameters
    :raises KeyError: if ``categories``, ``auth``, ``api`` or ``captions``
        is missing from the config
    """

    xml = XML(soup)
    params = dict()
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # should look like "rtmp://cp53909.edgefcs.net/ondemand"
    # Looks like the ABC don't always include this field.
    # If not included, that's okay -- ABC usually gives us the server in the
    # auth result as well. Hence .get(): a missing field yields None instead
    # of raising KeyError.
    rtmp_url = params.get('server_streaming')
    categories_url = params['categories']

    params.update({
        'rtmp_url': rtmp_url,
        'auth_url': params['auth'],
        'api_url': params['api'],
        'categories_url': categories_url,
        'captions_url': params['captions'],
    })
    return params

コード例 #13

0

ファイルを表示

ファイル: parser.py プロジェクト: tarhan/python-iview

def parse_captions(soup):
    """Converts custom iView captions into SRT format, usable in most
    decent media players.

    Each ``<title start=... end=...>`` element becomes one numbered SRT cue.
    The last ``:``-separated field of the ``start``/``end`` values is used as
    the fractional part, padded or truncated to three characters. ``|`` in
    the caption text marks a line break.

    :param soup: captions XML as bytes
    :return: the SRT document as a string
    """

    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        # Raw bytes literal: "\w" in a non-raw literal is an invalid escape.
        soup = re.sub(br"&(?![#\w]+;)", b"&amp;", soup)

    xml = XML(soup)

    pieces = []
    for i, title in enumerate(xml.iter('title'), 1):
        (start, startfract) = title.get('start').rsplit(':', 1)
        (end, endfract) = title.get('end').rsplit(':', 1)
        pieces.append('{}\n'.format(i))
        pieces.append('{},{:0<3.3} --> {},{:0<3.3}\n'.format(
            start, startfract, end, endfract))
        # An empty <title/> has text None; emit an empty cue instead of crashing.
        pieces.append((title.text or '').replace('|', '\n') + '\n\n')

    return ''.join(pieces)

コード例 #14

0

ファイルを表示

ファイル: Directory_processor.py プロジェクト: sidhant1608/Academic-Papers-Classifier

def get_email_para(path):
    """Style the paragraph preceding the first e-mail address as 'Correspondence'.

    The document XML is scanned for the first e-mail-like token. The text in
    front of that token (taken from the last paragraph containing it) is then
    looked up in the python-docx paragraphs; each paragraph containing it gets
    the ``'Correspondence'`` style and the document is saved next to the
    original as ``<name>_PROCESSED.docx``.

    :param path: path of the .docx file
    :return: True if at least one paragraph was restyled (and the processed
        copy was saved), False otherwise -- including when the document
        contains no e-mail address
    """
    import os  # local import: only needed to derive the output file name

    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    with zipfile.ZipFile(path) as archive:
        xml_content = archive.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.getiterator() was removed in Python 3.9; iter() is the replacement.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    match = re.findall(r'[\w\.-]+@[\w\.-]+', ' '.join(paragraphs))
    if not match:
        # Previously match[0] raised IndexError for documents without e-mail.
        return False
    email = match[0]

    result = ''
    for paragraph in paragraphs:
        if email in paragraph:
            result = paragraph.split(email)[0]

    document = Document(path)
    reresult = False
    # NOTE(review): an empty `result` (address at the very start of its
    # paragraph) is contained in every paragraph, so all of them get restyled.
    for paragraph in document.paragraphs:
        if result in paragraph.text:
            paragraph.style = 'Correspondence'
            reresult = True

    if reresult:
        # The original referenced an undefined name `file`; the output name
        # is derived from `path`. Saving once after the loop is sufficient.
        document.save(os.path.splitext(path)[0] + "_PROCESSED.docx")
    return reresult

コード例 #15

0

ファイルを表示

ファイル: Trial_1_DocxTranslator.py プロジェクト: GJuniorG/testRepo

def replace_string2(filename):
    """Trial: translate the paragraph texts of a .docx file and write a new file.

    Loads the 'Helsinki-NLP/opus-mt-en-de' MarianMT tokenizer and model into
    the module globals ``model_name``, ``tokenizer`` and ``model``, then walks
    the paragraphs of ``word/document.xml`` and passes the text runs of each
    non-empty paragraph to ``translat``.

    NOTE(review): as written this cannot complete -- see the notes on the
    last two statements below.

    :param filename: path of the .docx file
    """
    global model_name
    global tokenizer
    global model

    model_name = 'Helsinki-NLP/opus-mt-en-de'

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    document = zipfile.ZipFile(filename)
    xml_content = document.read('word/document.xml')
    #document.close()
    tree = XML(xml_content)
    # using lxml instead of xml preserved the comments

    paragraphs = []
    i = 0
    for paragraph in tree.iter(PARA):
        i = i + 1
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            #text = list(filter(None, text))
            #text = [s for s in text if p.match(s)]

            #text = [">>de<< " + s for s in text]
            #print("%s: %s" %(i,texts))
            target, duration = translat(texts)
            # NOTE(review): `texts` is a list, but str.replace() needs a
            # string; the return value is also discarded, so nothing in the
            # tree would change even if the call succeeded. `paragraph.text`
            # may be None as well.
            paragraph.text.replace(texts, target)

    # NOTE(review): `document` is a zipfile.ZipFile, which has no save()
    # method; the archive is never closed either.
    document.save("new.docx")

コード例 #16

0

ファイルを表示

    def _gmetad_parse(self, xml_data):
        """Parse metrics from XML data"""

        self.log.info('Parsing %d bytes of gmetad XML', len(xml_data))

        metrics = {}
        try:
            for host in XML(xml_data).findall('GRID/CLUSTER/HOST'):
                host_name = host.get('NAME')
                metrics[host_name] = {
                    'reported': host.get('REPORTED'),
                    'metrics': {}
                }
                for metric in host.findall('METRIC'):
                    metric_name = metric.get('NAME')
                    metrics[host_name]['metrics'][metric_name] = {
                        'title': metric_name,
                        'units': metric.get('UNITS'),
                        'value': metric.get('VAL')
                    }
                    for extra_data in metric.findall(
                            'EXTRA_DATA/EXTRA_ELEMENT'):
                        if extra_data.get('NAME') == 'TITLE':
                            metrics[host_name]['metrics'][metric_name][
                                'title'] = extra_data.get('VAL')
                            break  # No need for further searching

        except Exception, e:
            raise self.GmetadXmlError('Error while parsing gmetad XML: %s' % e)

コード例 #17

0

ファイルを表示

def get_docx_text(xml_content):
    """
    Return the text of every paragraph in the given document XML.

    Each ``WORD_NAMESPACE + 'p'`` element contributes one entry: the
    concatenation of its non-empty text runs, or the placeholder
    ``'[No text in this cluster]'`` when it has none.

    :param xml_content: XML text of the document part (not a file path)
    :return: list with one string per paragraph
    """
    tree = XML(xml_content)
    paragraphs = []
    for paragraph in tree.iter(WORD_NAMESPACE + 'p'):
        texts = [node.text
                 for node in paragraph.iter(WORD_NAMESPACE + 't')
                 if node.text]
        # The former math-text collection was never returned or used, so it
        # was dropped together with the other unused locals.
        paragraphs.append(''.join(texts) if texts
                          else '[No text in this cluster]')
    return paragraphs

コード例 #18

0

ファイルを表示

ファイル: word_table_reader.py プロジェクト: simsong/word_acronym_checker

def get_docx_table(path):
    """
    Read the table rows of a .docx file into a list of lists of strings.

    Every ``TR`` element yields one row and every ``TC`` element inside it one
    cell. A cell's text is made of its non-empty paragraphs joined by
    newlines.
    """
    archive = zipfile.ZipFile(path)
    document_xml = archive.read('word/document.xml')
    archive.close()
    body = XML(document_xml)

    def cell_text(cell):
        # One line per paragraph that actually carries text.
        lines = []
        for para in cell.iter(PARA):
            joined = "".join(run.text for run in para.iter(TEXT) if run.text)
            if joined:
                lines.append(joined)
        return "\n".join(lines)

    return [
        [cell_text(cell) for cell in xml_row.iter(TC)]
        for xml_row in body.iter(TR)
    ]

コード例 #19

0

ファイルを表示

ファイル: text_loader.py プロジェクト: ant-sidr/NLP_exploration

    def __load_doc(path):
        """Load a Word document and return its text as a list of cleaned lines.

        Documents in .doc and .docx format are opened as a zip archive, and
        the .xml file holding the text is taken out of it.
        Detailed description of the method:
        https://github.com/nmolivo/tesu_scraper/blob/master/Python_Blogs/01_extract_from_MSWord.ipynb

        :param path: path of the document
        :return: list of non-empty strings; paragraphs are split on ``'.'``,
            stripped and passed through ``TextLoader.remove_trash_symbols``
        :raises FileNotFoundError: if the archive has no ``word/document.xml``
        """
        source_filename = 'word/document.xml'
        # The context manager also closes the archive on the error path,
        # which previously leaked the handle.
        with zipfile.ZipFile(path) as document:
            if source_filename not in document.namelist():
                raise FileNotFoundError(
                    'Cannot find {} inside selected file'.format(source_filename))
            xml_content = document.read(source_filename)

        # Warning The xml.etree.ElementTree module is not secure against maliciously constructed data.
        tree = XML(xml_content)

        word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        para = word_namespace + 'p'
        text = word_namespace + 't'

        paragraphs = []
        for paragraph in tree.iter(para):
            # Element.getiterator() was removed in Python 3.9; use iter().
            texts = [node.text for node in paragraph.iter(text) if node.text]
            if texts:
                paragraphs.append(''.join(texts))

        lines = [
            TextLoader.remove_trash_symbols(piece.strip())
            for parag in paragraphs
            for piece in parag.split('.')
        ]
        return [line for line in lines if len(line) > 0]

コード例 #20

0

ファイルを表示

def parseDocx(inDoc):
    """Return the non-empty paragraphs of a .docx file as a list of strings.

    parseDocx is a derivative of <https://github.com/mickmaccana/python-docx>

    Side effect: ``temp/temp.txt`` (relative to the current directory, which
    must already contain ``temp``) is rewritten; after each paragraph found,
    the ``repr`` of the list collected so far is appended to it.

    :param inDoc: path of the .docx file
    :return: list of paragraph texts
    """
    with open('temp/temp.txt', 'w+') as temp:
        import zipfile
        try:
            from xml.etree.cElementTree import XML
        except ImportError:
            from xml.etree.ElementTree import XML
            print("Running in compatibility mode")

        WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARA = WORD_NAMESPACE + 'p'
        TEXT = WORD_NAMESPACE + 't'

        with zipfile.ZipFile(inDoc) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        paragraphs = []
        # Element.getiterator() was removed in Python 3.9; iter() is the replacement.
        for paragraph in tree.iter(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
                temp.write(repr(paragraphs))
        return paragraphs

コード例 #21

0

ファイルを表示

def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.

    Non-empty paragraphs are joined with the separator ``'----\\n\\n'``.
    """
    # The archive is closed even if the document part is missing.
    with zipfile.ZipFile(path) as document:
        xml_content = document.read('word/document.xml')
    tree = XML(xml_content)

    # Leftover debugging code used to parse an unrelated 'data.xml' from the
    # working directory here and print its children; it made the function
    # fail whenever that file was absent and has been removed.

    paragraphs = []
    # Element.getiterator() was removed in Python 3.9; iter() is the replacement.
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '----\n\n'.join(paragraphs)

コード例 #22

0

ファイルを表示

ファイル: doc_file.py プロジェクト: minlogiciel/docutone

    def docxml_to_text(self, filename):
        """Return the text of a .docx file, one section per paragraph.

        Every ``self.PARA`` element contributes the concatenation of its
        ``self.TEXT`` runs -- an empty string when it has none -- and the
        sections are joined with blank lines.

        :param filename: path of the .docx file
        :return: the document text
        """
        # The archive is closed even if the document part is missing.
        with zipfile.ZipFile(filename) as document:
            xml_content = document.read('word/document.xml')
        tree = XML(xml_content)

        # Element.getiterator() was removed in Python 3.9; iter() is the
        # replacement. Empty paragraphs are kept on purpose.
        sections = [
            ''.join(node.text for node in section.iter(self.TEXT) if node.text)
            for section in tree.iter(self.PARA)
        ]
        return '\n\n'.join(sections)

コード例 #23

0

ファイルを表示

def get_docx_text(path=os.getcwd() + '\\word_samples'):
    """
    Take the path of a docx file as argument, return the text in unicode.

    The parts ``word/header2.xml``, ``word/document.xml`` and
    ``word/footer2.xml`` are read in that order. Header and footer paragraphs
    are prefixed with ``"Header : "`` / ``"Footer : "``; all non-empty
    paragraphs are joined with blank lines.

    :raises KeyError: if one of the three parts is missing from the archive
    """
    contentToRead = ["header2.xml", "document.xml", "footer2.xml"]
    # Prefix for the paragraphs of each part; the main document gets none.
    prefixes = {"footer2.xml": "Footer : ", "header2.xml": "Header : "}
    paragraphs = []

    # The context manager closes the archive even when a part is missing.
    with zipfile.ZipFile(path) as document:
        for xmlfile in contentToRead:
            xml_content = document.read('word/{}'.format(xmlfile))
            tree = XML(xml_content)
            # Element.getiterator() was removed in Python 3.9; use iter().
            for paragraph in tree.iter(PARA):
                texts = [
                    node.text for node in paragraph.iter(TEXT) if node.text
                ]
                if texts:
                    paragraphs.append(
                        prefixes.get(xmlfile, '') + ''.join(texts))
    return '\n\n'.join(paragraphs)

コード例 #24

0

ファイルを表示

ファイル: simple_extractor.py プロジェクト: Beehivr/lambda-text-extractor

def docx_to_text(document_path, event_handler):
    """Extract the text of a .docx file, preferring python-docx.

    First attempt: walk the body with python-docx, turning paragraphs into
    their text and tables into ``' | '``-separated rows; blocks are joined
    with blank lines and the result is stripped.

    If that raises, the exception is logged (using ``event_handler.key`` in
    the message) and the text is instead read straight from
    ``word/document.xml``, one entry per non-empty paragraph.

    :param document_path: path of the .docx file
    :param event_handler: object whose ``key`` attribute identifies the
        document in the log message
    :return: the extracted text
    """
    global logger

    from docx import Document
    from docx.table import Table
    from docx.text.paragraph import Paragraph
    from docx.oxml.table import CT_Tbl
    from docx.oxml.text.paragraph import CT_P

    try:
        doc = Document(document_path)
        doc_body = doc.element.body
        blocks = []
        for child in doc_body.iterchildren():
            if isinstance(child, CT_P):
                blocks.append(Paragraph(child, doc_body).text)
            elif isinstance(child, CT_Tbl):
                blocks.append('\n'.join(
                    ' | '.join(cell.text for cell in row.cells)
                    for row in Table(child, doc_body).rows))
        #end for

        return '\n\n'.join(blocks).strip()

    except Exception:
        # Deliberate best effort: fall through to the raw XML extraction.
        logger.exception('Exception while parsing <{}>.'.format(
            event_handler.key))
    #end try

    # Extract it from the XML
    with ZipFile(document_path) as document_zipfile:
        xml_content = document_zipfile.read('word/document.xml')

    try:
        from xml.etree.cElementTree import XML
    except ImportError:
        from xml.etree.ElementTree import XML

    tree = XML(xml_content)

    DOCX_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    DOCX_PARA = DOCX_NAMESPACE + 'p'
    DOCX_TEXT = DOCX_NAMESPACE + 't'

    paragraphs = []
    # Element.getiterator() was removed in Python 3.9; iter() is the replacement.
    for paragraph in tree.iter(DOCX_PARA):
        texts = [
            node.text for node in paragraph.iter(DOCX_TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    #end for

    return '\n\n'.join(paragraphs)

コード例 #25

0

ファイルを表示

ファイル: uniprot.py プロジェクト: cgseitz/ProDy

def queryUniprot(id, expand=[], regex=True):
    """Query Uniprot with *id* and return a `dict` containing the raw results. 
    Regular users should use :func:`searchUniprot` instead.
    
    :arg expand: entries through which you want to loop dictElements
        until there aren't any elements left
    :type expand: list

    :arg regex: if true, the items of *expand* are treated as regular
        expressions matched against the result keys; otherwise they are used
        as keys directly
    :type regex: bool

    :raises TypeError: if *id* is not a string
    :raises ValueError: if the record cannot be opened
    """

    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL(
            'http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit propagate.
        raise ValueError('No Uniprot record found with that id')

    data = record_file.read()
    record_file.close()
    data = XML(data)

    # Element.getchildren() was removed in Python 3.9; indexing the element
    # returns its first child.
    data = dictElement(data[0],
                       '{http://uniprot.org/uniprot}',
                       number_multiples=True)

    # Flatten every PDB cross-reference into a plain dict.
    for key in data:
        value = data[key]
        if not key.startswith('dbReference'):
            continue

        try:
            if value.get('type') != 'PDB':
                continue
        except AttributeError:
            continue

        pdbid = value.get('id')
        refdata = {'PDB': pdbid}
        for prop in value:
            prop_key = prop.get('type')
            prop_val = prop.get('value')
            refdata[prop_key] = prop_val
        data[key] = refdata

    if expand:
        if regex:
            keys = []
            for lt in expand:
                lt_re = re.compile(lt)
                keys.extend(key for key in data if lt_re.match(key))
        else:
            # Copy so the caller's list is never aliased by the helper.
            keys = list(expand)
        data = dictElementLoop(data, keys, '{http://uniprot.org/uniprot}')

    return data

コード例 #26

0

ファイルを表示

ファイル: DOCX_XMLExtractor.py プロジェクト: thomas0sae/ResumeParserAssignment

    def getText(self, path):
        """
        Take the path of a docx file as argument, return the text in unicode.

        The result consists of, in order and separated by blank lines:
        the non-empty body paragraphs, the marker
        ``TABLE_INFORMATION_ROW_WISE``, the text of each table row (cells
        separated by spaces), the marker ``INFORMATION_EXTRACTED_FROM_HEADER``
        and the paragraphs of ``word/header1.xml`` when that part can be read.
        """
        def collect(tree, container_tag, separator):
            # Text of every `container_tag` element that has any text runs.
            found = []
            # Element.getiterator() was removed in Python 3.9; use iter().
            for container in tree.iter(container_tag):
                texts = [
                    node.text for node in container.iter(TEXT) if node.text
                ]
                if texts:
                    found.append(separator.join(texts))
            return found

        # One context manager replaces the close() calls that were spread
        # over the success and the error path.
        with zipfile.ZipFile(path) as document:
            tree = XML(document.read('word/document.xml'))

            paragraphs = collect(tree, PARA, '')

            paragraphs.append('TABLE_INFORMATION_ROW_WISE')
            paragraphs.extend(collect(tree, TABLE_ROW, ' '))

            paragraphs.append('INFORMATION_EXTRACTED_FROM_HEADER')
            try:
                header = XML(document.read('word/header1.xml'))
            except Exception:
                # Best effort: the header part is optional.
                pass
            else:
                paragraphs.extend(collect(header, PARA, ''))

        return '\n\n'.join(paragraphs)

コード例 #27

0

ファイルを表示

ファイル: word_table_reader.py プロジェクト: simsong/word_acronym_checker

def get_docx_tables(path):
    """
    Yield every ``TBL`` element found in the main part of a .docx file.

    This is a generator: the archive is opened, read and closed when the
    first item is requested.
    """
    archive = zipfile.ZipFile(path)
    document_xml = archive.read('word/document.xml')
    archive.close()
    tables = XML(document_xml).iter(TBL)
    for table in tables:
        yield table

コード例 #28

0

ファイルを表示

ファイル: uninstall_package.py プロジェクト: josephrexme/canari

def uninstallmachines(package, prefix):
    """Remove the Maltego machines shipped by *package* from *prefix*.

    Every ``*.machine`` resource of ``<package>.resources.maltego`` is
    unlinked from ``<prefix>/config/Maltego/Machines`` and passed to
    ``uninstallnbattr`` together with the ``.nbattrs`` element, which is
    written back afterwards. Nothing happens if the machines directory does
    not exist. ``OSError`` for a single machine and ``ImportError`` for the
    whole operation are ignored.
    """
    try:
        prefix = path.join(prefix, 'config', 'Maltego', 'Machines')
        n = path.join(prefix, '.nbattrs')
        e = XML('<attributes version="1.0"/>')
        if path.exists(n):
            # open() instead of the Python 2-only file(); closed promptly.
            with open(n, 'rb') as attrs_file:
                e = XML(attrs_file.read())
        if not path.exists(prefix):
            return
        package = '%s.resources.maltego' % package
        for m in resource_listdir(package, ''):
            if not m.endswith('.machine'):
                continue
            print('Uninstalling machine %s...' % m)
            try:
                unlink(path.join(prefix, m))
                uninstallnbattr(m, e)
            except OSError:
                # Best effort: the machine may already be gone.
                pass
        with open(n, 'wb') as attrs_file:
            ElementTree(e).write(attrs_file)
    # Not binding the exception: the old "except ImportError, e" was Python 2
    # syntax and shadowed the element `e`.
    except ImportError:
        pass

コード例 #29

0

ファイルを表示

ファイル: exportgraph.py プロジェクト: sh1nu11bi/Pandora

def mtgx2json(graph):
    """Convert the first graph of a Maltego ``.mtgx`` archive into records.

    Each node becomes an ``OrderedDict`` with the keys ``NodeID``,
    ``EntityType``, ``Data`` (the entity properties rendered as
    ``"name: value"`` pairs joined by ``' - '``) and ``Links`` (the
    ``Incoming`` / ``Outgoing`` node ids; ``0`` for a node without any edge).

    :param graph: path or file object of the ``.mtgx`` archive
    :return: list of records for the first ``*.graphml`` member, or ``None``
        when the archive contains no such member
    """
    graphml_ns = '{http://graphml.graphdrawing.org/xmlns}'
    mtgx_ns = '{http://maltego.paterva.com/xml/mtgx}'

    # The archive used to stay open, and its local name shadowed the
    # `zipfile` module.
    with ZipFile(graph) as archive:
        graphs = [name for name in archive.namelist()
                  if name.endswith('.graphml')]
        for f in graphs:
            multikeys = []
            xml = XML(archive.read(f))

            # node id -> ids of the nodes it is linked from / to
            links = {}
            for edge in xml.findall(graphml_ns + 'graph/' + graphml_ns + 'edge'):
                src = edge.get('source')
                dst = edge.get('target')
                if src not in links:
                    links[src] = dict(in_=[], out=[])
                if dst not in links:
                    links[dst] = dict(in_=[], out=[])
                links[src]['out'].append(dst)
                links[dst]['in_'].append(src)

            for node in xml.findall(graphml_ns + 'graph/' + graphml_ns + 'node'):
                node_id = node.get('id')
                entity = node.find(
                    graphml_ns + 'data/' + mtgx_ns + 'MaltegoEntity')

                record = OrderedDict([
                    ('NodeID', node_id),
                    ('EntityType', entity.get('type').strip()),
                ])

                properties = {}
                for prop in entity.findall(
                        mtgx_ns + 'Properties/' + mtgx_ns + 'Property'):
                    value = prop.find(mtgx_ns + 'Value').text or ''
                    properties[prop.get('displayName')] = value.strip()
                record['Data'] = ' - '.join(
                    '%s: %s' % (key, value)
                    for (key, value) in properties.items())

                record['Links'] = {
                    'Incoming': links.get(node_id, {}).get('in_', 0),
                    'Outgoing': links.get(node_id, {}).get('out', 0),
                }
                multikeys.append(record)
            # Only the first graph is converted, as before.
            return multikeys

コード例 #30

0

ファイルを表示

 def validate(self, value, model_instance):
     """Run the inherited validation, then check XML well-formedness if enabled.

     When ``self.xml`` is set and *value* is not blank, the prepared value is
     wrapped in a ``<root>`` element and parsed; a parse failure is reported
     as a ``ValidationError`` carrying the field's ``'invalid'`` message.
     """
     super(HTMLField, self).validate(value, model_instance)

     # Nothing more to check for non-XML fields or blank values.
     if not (self.xml and value and value.strip()):
         return

     try:
         markup = self.get_prep_value(value)
         if isinstance(markup, unicode):
             markup = markup.encode('utf-8')
         XML('<root>%s</root>' % markup)
     except (ExpatError, SyntaxError):
         raise exceptions.ValidationError(
             self.error_messages['invalid'])