Example #1
def parse_config(soup):
    """There are lots of goodies in the config we get back from the ABC.
    In particular, it gives us the URLs of all the other XML data we
    need.
    """

    xml = XML(soup)
    params = dict()
    for param in xml.iter('param'):
        params.setdefault(param.get('name'), param.get('value'))

    # Should look like "rtmp://cp53909.edgefcs.net/ondemand".
    # The ABC don't always include this field; if it's missing, that's okay --
    # the auth result usually gives us the streaming server as well.
    rtmp_url = params.get('server_streaming')
    categories_url = params['categories']

    params.update({
        'rtmp_url'  : rtmp_url,
        'auth_url'  : params['auth'],
        'api_url' : params['api'],
        'categories_url' : categories_url,
        'captions_url' : params['captions'],
    })
    return params
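
A minimal, self-contained sketch of the same param-extraction pattern; the sample URLs below are placeholders, not the real ABC config values.

from xml.etree.ElementTree import XML

sample = b'''<config>
    <param name="api" value="https://example.invalid/api"/>
    <param name="categories" value="https://example.invalid/categories.xml"/>
</config>'''

params = {p.get('name'): p.get('value') for p in XML(sample).iter('param')}
print(params['categories'])  # https://example.invalid/categories.xml
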
Example #2
def fix_etree():
    try:
        from xml.etree.cElementTree import XML
        e = XML('<test><t a="1"/></test>')
        e.find('t[@a="1"]')
    except SyntaxError:
        import canari.xmltools.fixetree
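
The probe above relies on older cElementTree builds raising SyntaxError for attribute predicates in find(). A standalone version of just the detection (not canari's fix) could look like this:

from xml.etree.ElementTree import XML

def supports_attribute_predicates():
    # Older cElementTree releases raised SyntaxError for paths like 't[@a="1"]'.
    try:
        return XML('<test><t a="1"/></test>').find('t[@a="1"]') is not None
    except SyntaxError:
        return False

print(supports_attribute_predicates())  # True on any modern Python
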
Example #3
def parse_captions(soup):
	"""	Converts custom iView captions into SRT format, usable in most
		decent media players.
	"""
	
	# Horrible hack to escape literal ampersands, which have been seen in
	# some captions XML. Inspired by
	# http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
	if b"<![CDATA[" not in soup:  # Not seen, but be future proof
		soup = re.sub(b"&(?![#\w]+;)", b"&amp;", soup)
	
	xml = XML(soup)

	output = ''

	i = 1
	for title in xml.getiterator('title'):
		start = title.get('start')
		ids = start.rfind(':')
		end = title.get('end')
		ide = end.rfind(':')
		output = output + str(i) + '\n'
		output = output + start[:ids] + ',' + start[ids+1:] + ' --> ' + end[:ide] + ',' + end[ide+1:] + '\n'
		output = output + title.text.replace('|','\n') + '\n\n'
		i += 1

	return output
Example #4
def get_status_code(output):
    try:
        tree = XML(output)
        child = tree[0]  # first child element (getchildren() was removed in Python 3.9)
        return int(child.get('status_code'))
    except:
        return None
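
A quick hypothetical call, assuming a payload whose first child element carries a status_code attribute; malformed input falls through to None.

print(get_status_code(b'<response><result status_code="200"/></response>'))  # 200
print(get_status_code(b'not xml at all'))  # None
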
Example #5
 def parseOpenSearch(self,xmldata,format="atom"):
     """
     OpenSearchの出力XMLをパースします。
     xmldata=OpenSearch("keyword")[1]
     format=atomまたはrss
     """
     root = XML(xmldata)  # parse the OpenSearch XML data
     if format == "rss":
         entrys = [elm for elm in root[0] if "item" == elm.tag]
     elif format == "atom":
         entrys=[elm for elm in root if elm.tag.endswith("entry")]
     else:
         raise Exception("Unknown format : %s" % format)#xmlのフォーマットはatom/rssのみ対応
     hits=len(entrys)
     entrys = [entry.getiterator() for entry in entrys]
     ret=[]
     h="content"
     for entry in entrys:
         buf={}
         for val in entry:
             dual=1
             if val != None and val.text != None:
                 if not val.text.startswith("\n"):
                     if buf.get(val.tag[val.tag.find("}")+1:]) == None:
                         buf[val.tag[val.tag.find("}")+1:]] = val.text
                     else:
                         dual= dual+1
                         buf[val.tag[val.tag.find("}")+1:] + str(dual)] = val.text
         if h in buf.keys():buf[h]=self.stripHTMLTags(buf[h])
         ret.append(buf)
     return hits, ret  # data[0]["name"] gives the author
Example #6
def get_raw_text(pthFile):
	"""
	gets a path to a file as an argument and returns a list containing
	the paragraphs of the word document file
	"""
	
	"""
	Constants used to iterate over the XML tree
	"""
	WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
	PARA = WORD_NAMESPACE + 'p'
	TEXT = WORD_NAMESPACE + 't'

	docWordDoc = zipfile.ZipFile(pthFile) #gets the documents of the word
	xmlContent = docWordDoc.read('word/document.xml') #access the xml file
	docWordDoc.close()
	treeXML = XML(xmlContent) #parses the xml content into a tree that will be further used to access the text

	lstParagraphs = [] #output list with the paragraphs of the text
	#now we proceed to extract the text from the tree
	#the idea is to iterate over the tree and 
	#for each node that contains text, substract it and add it to
	#the output
	for parParagraph in treeXML.getiterator(PARA):
		lstTexts = [nodElement.text
			    for nodElement in parParagraph.getiterator(TEXT)
			    if nodElement.text]
		if lstTexts:
			print lstTexts
			lstParagraphs.append(''.join(lstTexts))
		
	return lstParagraphs
Example #7
def parse_captions(soup):
    """Converts custom iView captions into SRT format, usable in most
    decent media players.
    """
    
    # Horrible hack to escape literal ampersands, which have been seen in
    # some captions XML. Inspired by
    # http://stackoverflow.com/questions/6088760/fix-invalid-xml-with-ampersands-in-python
    if b"<![CDATA[" not in soup:  # Not seen, but be future proof
        soup = re.sub(b"&(?![#\w]+;)", b"&amp;", soup)
    
    xml = XML(soup)

    output = ''

    i = 1
    for title in xml.iter('title'):
        start = title.get('start')
        (start, startfract) = start.rsplit(':', 1)
        end = title.get('end')
        (end, endfract) = end.rsplit(':', 1)
        output = output + '{}\n'.format(i)
        output = output + '{},{:0<3.3} --> {},{:0<3.3}\n'.format(start, startfract, end, endfract)
        output = output + title.text.replace('|','\n') + '\n\n'
        i += 1

    return output
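
A hedged usage sketch: the real iView captions schema is not shown here, but the code above expects <title> elements whose start/end attributes end in a colon-separated fractional part, with '|' as a line break.

sample = (b'<captions>'
          b'<title start="0:00:01:000" end="0:00:03:500">Hello|world</title>'
          b'</captions>')
print(parse_captions(sample))
# 1
# 0:00:01,000 --> 0:00:03,500
# Hello
# world
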
Example #8
def get_property(output, key):
    try:
        tree = XML(output)
        child = tree[0]   # first child element (getchildren() was removed in Python 3.9)
        return child.get(key)
    except IndexError:
        return None
Example #9
 def get_docx_text(self, path):
     document = zipfile.ZipFile(path)
     xml_content = document.read('word/document.xml')
     tree = XML(xml_content)
     document.close()
     paragraphs = []
     for paragraph in tree.getiterator(para_tag):
         texts = [node.text for node in paragraph.getiterator(text_tag) if node.text]
         if texts:
             paragraphs.append(''.join(texts))
     return paragraphs
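
para_tag and text_tag are not defined in this snippet; judging by the other docx examples on this page they are presumably the usual WordprocessingML names:

WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
para_tag = WORD_NAMESPACE + 'p'
text_tag = WORD_NAMESPACE + 't'
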
Example #10
def queryUniprot(id, expand=[], regex=True):
    """Query Uniprot with *id* and return a `dictionary` containing the results
    
    :arg expand: entries through which you want to loop dictElements
        until there aren't any elements left
    :type expand: list
    """

    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL('http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except:
        raise ValueError('No Uniprot record found with that id')
    
    data = record_file.read()
    record_file.close()
    data = XML(data)

    data = dictElement(data[0], '{http://uniprot.org/uniprot}', number_multiples=True)

    for key in data:
        value = data[key]
        if not key.startswith('dbReference'):
            continue
        
        try:
            if value.get('type') != 'PDB':
                continue
        except AttributeError:
            continue

        pdbid = value.get('id')
        refdata = {'PDB': pdbid}
        for prop in value:
            prop_key = prop.get('type')
            prop_val = prop.get('value')
            refdata[prop_key] = prop_val
        data[key] = refdata
            
    if expand:
        keys = []
        if regex:
            for lt in expand:
                lt_re = re.compile(lt)
                for key in data.keys():
                    if lt_re.match(key):
                        keys.append(key)
        else:
            keys = expand
        data = dictElementLoop(data, keys, '{http://uniprot.org/uniprot}')
    
    return data
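
openURL, dictElement and dictElementLoop come from ProDy and are not shown here. As a rough idea only (an assumption, not ProDy's implementation), a dictElement-like helper maps namespace-stripped child tags to elements:

def dict_element_sketch(element, prefix):
    # Hypothetical stand-in: map each child's tag (prefix stripped) to the element itself.
    mapping = {}
    for child in element:
        tag = child.tag[len(prefix):] if child.tag.startswith(prefix) else child.tag
        mapping.setdefault(tag, child)
    return mapping
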
Example #11
 def listChildrenViaPropfind():
     data = yield self.simpleSend(
         "PROPFIND", "/", resultcode=responsecode.MULTI_STATUS,
         headers=[('Depth', '1')]
     )
     tree = XML(data)
     seq = [e.text for e in tree.findall("{DAV:}response/{DAV:}href")]
     shortest = min(seq, key=len)
     seq.remove(shortest)
     filtered = [elem[len(shortest):].rstrip("/") for elem in seq]
     returnValue(filtered)
Example #12
def parse_highlights(xml):

    soup = XML(xml)

    highlightList = []

    for series in soup.iterfind('series'):
        tempSeries = dict(series.items())
        tempSeries.update(xml_text_elements(series))
        highlightList.append(tempSeries)

    return highlightList
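
xml_text_elements is a project helper that is not shown here; a rough stand-in (an assumption, not the project's implementation) that maps each child element's tag to its text might look like:

def xml_text_elements(element):
    # Collect {tag: text} for direct children that carry non-empty text.
    return {child.tag: child.text.strip()
            for child in element
            if child.text and child.text.strip()}
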
Example #13
def get_docx_text(path):
    document = zipfile.ZipFile(path)
    xml_content = document.read("word/document.xml")
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(PARA):
        texts = [node.text for node in paragraph.getiterator(TEXT) if node.text]
        if texts:
            paragraphs.append("".join(texts))
    return paragraphs
Example #14
def get_docx_text(path):
	"""
	Take the path of a docx file as argument, return the text in unicode
	in the form of a list.
	"""
	document = zipfile.ZipFile(path)
	xml_content = document.read('word/document.xml')
	document.close()
	tree = XML(xml_content)
 
	paragraphs = []
	for paragraph in tree.getiterator(PARA):
		texts = [node.text.encode('utf-8')
				 for node in paragraph.getiterator(TEXT)
				 if node.text]
		if texts:
			paragraphs.append(''.join(texts))
 
	return paragraphs



 
# def get_docx_text(path):
# 	"""
# 	Take the path of a docx file as argument, return the text in unicode
# 	in the form of a list.
# 	"""
# 	document = zipfile.ZipFile(path)
# 	xml_content = document.read('word/document.xml')
# 	document.close()
# 	tree = XML(xml_content)
 
# 	sections = []
# 	for section in tree.getiterator(SECT):

# 		paragraphs = []
# 		for paragraph in section.getiterator(PARA):
# 			print 'para'
# 			texts = [node.text.encode('utf-8')
# 					 for node in paragraph.getiterator(TEXT)
# 					 if node.text]
# 			if texts:
# 				paragraphs.append(''.join(texts))

# 		print str(paragraphs)

# 		if paragraphs:
# 			sections.append(''.join(paragraphs))

 
# 	return sections    
Example #15
        def listChildrenViaPropfind():
            request = SimpleStoreRequest(self, "PROPFIND", "/calendars/__uids__/user01/", authid="user01")
            request.headers.setHeader("depth", "1")
            response = yield self.send(request)
            response = IResponse(response)
            data = yield allDataFromStream(response.stream)

            tree = XML(data)
            seq = [e.text for e in tree.findall("{DAV:}response/{DAV:}href")]
            shortest = min(seq, key=len)
            seq.remove(shortest)
            filtered = [elem[len(shortest):].rstrip("/") for elem in seq]
            returnValue(filtered)
Example #16
def mtgx2json(graph):
    zipfile = ZipFile(graph)
    graphs = filter(lambda x: x.endswith(".graphml"), zipfile.namelist())
    for f in graphs:
        multikeys = []
        xml = XML(zipfile.open(f).read())
        links = {}
        for edge in xml.findall(
            "{http://graphml.graphdrawing.org/xmlns}graph/" "{http://graphml.graphdrawing.org/xmlns}edge"
        ):
            src = edge.get("source")
            dst = edge.get("target")
            if src not in links:
                links[src] = dict(in_=[], out=[])
            if dst not in links:
                links[dst] = dict(in_=[], out=[])
            links[src]["out"].append(dst)
            links[dst]["in_"].append(src)

        for node in xml.findall(
            "{http://graphml.graphdrawing.org/xmlns}graph/" "{http://graphml.graphdrawing.org/xmlns}node"
        ):

            node_id = node.get("id")
            node = node.find(
                "{http://graphml.graphdrawing.org/xmlns}data/" "{http://maltego.paterva.com/xml/mtgx}MaltegoEntity"
            )

            record = OrderedDict({"NodeID": node_id, "EntityType": node.get("type").strip()})
            props = {"Data": {}}
            for prop in node.findall(
                "{http://maltego.paterva.com/xml/mtgx}Properties/" "{http://maltego.paterva.com/xml/mtgx}Property"
            ):
                value = prop.find("{http://maltego.paterva.com/xml/mtgx}Value").text or ""
                entity_prop = {prop.get("displayName"): value.strip()}
                props["Data"].update(entity_prop)
            record.update(props)
            s = " - ".join(["%s: %s" % (key, value) for (key, value) in record["Data"].items()])
            record.pop("Data")
            data = {"Data": s}
            record.update(data)
            link = {"Links": {}}
            i_link = {"Incoming": links.get(node_id, {}).get("in_", 0)}
            link["Links"].update(i_link)
            o_link = {"Outgoing": links.get(node_id, {}).get("out", 0)}
            link["Links"].update(o_link)
            record.update(link)
            multikeys.append(record)
        return multikeys
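
Hypothetical call ('investigation.mtgx' is a placeholder Maltego export). Note that the return sits inside the for loop, so only the first .graphml found in the archive is converted.

import json

records = mtgx2json('investigation.mtgx')
print(json.dumps(records, indent=2))
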
Example #17
def read_docx(file,document,path,trie):
	xml_content = document.read('word/document.xml')
	document.close()
	tree = XML(xml_content)
	paragraphs = ""
	for paragraph in tree.getiterator(PARA):
		texts=""
		for node in paragraph.getiterator(TEXT):
			if node.text:
				texts += node.text.replace('\u7460',"")
		if texts:
			paragraphs+=str(texts)
	#print(paragraphs)
	string_spilt(paragraphs,path,trie)
	trie.insert_doc_len(path,len(file)+len(paragraphs))
Example #18
def get_doc_text(path):
    """
    Take the path of a docx or a dot file as argument, return the text in unicode.
    """
    if "docx" == path[-4:]:
        document = zipfile.ZipFile(path)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)
    #print tree

        paragraphs = []
        for paragraph in tree.getiterator(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                paragraphs.append(''.join(texts))
                pass
            pass
    #print paragraphs
        return paragraphs
#        
    elif "odt" == path[-3:]:
        document = zipfile.ZipFile(path)
        xml_content = document.read('content.xml')
        document.close()
        doc = xml.dom.minidom.parseString(xml_content)
        print(" doc: ",doc)
        print("doc::end")
        #paras = doc.getElementsByTagName('text:span')
        #paras = doc.getElementsByTagName('text:p')
        #
        # we get here all elements Headers, text and table components: 
        #
        paras = doc.getElementsByTagName("*")
        print("I have ", len(paras), " paragraphs ")
        paragraphs = []
        for p in paras:
            for ch in p.childNodes:
                if ch.nodeType == ch.TEXT_NODE:
                    paragraphs.append(''.join(ch.wholeText))
                    pass
                pass
            pass
        print(paragraphs)
        return paragraphs
    else:
        print() 
        raise Warning("only docx and odt files are handled")    
Example #19
def validate_report(report):
    if not os.path.isdir(report):
        raise FileNotFoundError(f"{report} is not a directory")
    xmlfile = os.path.join(report,"report.xml")
    if not os.path.isfile(xmlfile):
        raise FileNotFoundError(xmlfile)
    tree = XML(open(xmlfile,"r").read())
Example #20
def docx_do_docx(azip, afile):
    word_namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = word_namespace + 'p'
    txt = word_namespace + 't'

    xml_content = azip.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(par):
        texts = [node.text for node in paragraph.getiterator(txt) if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)
Example #21
def get_docx_text(filename):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(filename)
    xml_content = document.read("word/document.xml")
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.iter(PARA):
        texts = [node.text for node in paragraph.iter(TEXT) if node.text]
        if texts:
            paragraphs.append("".join(texts))

    return "\n\n".join(paragraphs)
Example #22
def get_docx_text(path):
	"""	Take the path of a docx file as argument, return the text in unicode."""
	document = zipfile.ZipFile(path)
	xml_content = document.read('word/document.xml')
	document.close()
	tree = XML(xml_content)
 
	paragraphs = []
	for paragraph in tree.getiterator(PARA):
		texts = [node.text
				for node in paragraph.getiterator(TEXT)
				if node.text]
		if texts:
			paragraphs.append(''.join(texts))
 
	return '\n\n'.join(paragraphs) 
Example #23
 def _get_last_image_loaded(self):
     for label in self:
         task_id = ""
         labelupdatestatus = False
         if label.task_id:
             labelupdatestatus = requests.get(
                 'http://' + self.env['ir.config_parameter'].get_param(
                     'core_appliance_ip') +
                 ':8001/service/updatestatus/transaction/' + label.task_id)
             if labelupdatestatus:
                 root = XML(labelupdatestatus.text)
                 for update in root.iter('UpdateStatus'):
                     task_id = update.get('id')
             label.image = label._get_image_from_task(task_id)
         else:
             label.image = False
Example #24
def docx_do_docx(azip, afile):
    word_namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    par = word_namespace + "p"
    txt = word_namespace + "t"

    xml_content = azip.read("word/document.xml")
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(par):
        texts = [node.text for node in paragraph.getiterator(txt) if node.text]
        if texts:
            paragraphs.append("".join(texts))

    text = "\n\n".join(paragraphs)
    text_do_data(text, afile)
Example #25
def run(args):
    opts = parse_args(args)

    zipfile = ZipFile(opts.graph)
    graphs = filter(lambda x: x.endswith('.graphml'), zipfile.namelist())

    for f in graphs:
        with open(f.split('/')[1].split('.')[0] + '.csv', 'wb') as csvfile:
            csv = writer(csvfile)
            xml = XML(zipfile.open(f).read())
            links = {}
            for edge in xml.findall(
                    '{http://graphml.graphdrawing.org/xmlns}graph/'
                    '{http://graphml.graphdrawing.org/xmlns}edge'):
                src = edge.get('source')
                dst = edge.get('target')
                if src not in links:
                    links[src] = dict(in_=0, out=0)
                if dst not in links:
                    links[dst] = dict(in_=0, out=0)
                links[src]['out'] += 1
                links[dst]['in_'] += 1

            for node in xml.findall(
                    '{http://graphml.graphdrawing.org/xmlns}graph/'
                    '{http://graphml.graphdrawing.org/xmlns}node'):

                node_id = node.get('id')
                node = node.find(
                    '{http://graphml.graphdrawing.org/xmlns}data/'
                    '{http://maltego.paterva.com/xml/mtgx}MaltegoEntity')

                row = [to_utf8(('Entity Type=%s' % node.get('type')).strip())]
                for prop in node.findall(
                        '{http://maltego.paterva.com/xml/mtgx}Properties/'
                        '{http://maltego.paterva.com/xml/mtgx}Property'):
                    value = prop.find(
                        '{http://maltego.paterva.com/xml/mtgx}Value'
                    ).text or ''
                    row.append(
                        to_utf8(('%s=%s' %
                                 (prop.get('displayName'), value)).strip()))
                row.append('Incoming Links=%s' %
                           links.get(node_id, {}).get('in_', 0))
                row.append('Outgoing Links=%s' %
                           links.get(node_id, {}).get('out', 0))
                csv.writerow(row)
Example #26
 def _get_status_task(self):
     for location in self:
         if location.task_id:
             response = requests.get('http://' +
                                     self.env['ir.config_parameter'].
                                     get_param('core_appliance_ip') +
                                     ':8001/service/transaction/' +
                                     location.task_id + '/status')
             if response:
                 root = XML(response.text)
                 for resp in root.iter('TransactionStatusInfo'):
                     if resp.get('failed') == "true":
                         location.task_status = "FAILED"
                     elif resp.get('finished') == "true":
                         location.task_status = "FINISHED"
                     else:
                         location.task_status = "WAITING"
Example #27
def readWord(filename):
    try:
        document = zipfile.ZipFile(filename)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)

        paragraphs = ''
        for paragraph in tree.getiterator(PARA):
            texts = [
                node.text for node in paragraph.getiterator(TEXT) if node.text
            ]
            if texts:
                paragraphs += str(texts) + ','
        return paragraphs
    except Exception as e:
        print('ReadWord exception', e)
Example #28
 def get_docx_text(path):
     schemas = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
     para = schemas + 'p'
     text = schemas + 't'
     """
     Take the path of a docx file as argument, return the text in unicode.
     """
     document = zipfile.ZipFile(path)
     xml_content = document.read('word/document.xml')
     document.close()
     tree = XML(xml_content)
     paragraphs = []
     for paragraph in tree.iter(para):
         texts = [node.text for node in paragraph.iter(text) if node.text]
         if texts:
             paragraphs.append(''.join(texts))
     return os.linesep.join(paragraphs)
Example #29
 def __init__(self,filePath):
     
     document = zipfile.ZipFile(filePath)
     xml_content = document.read('word/document.xml')
     document.close()
     tree = XML(xml_content)
     
     for paragraph in tree.getiterator(self.PARA):
         texts = [node.text for node in paragraph.getiterator(self.TEXT) if node.text]
         if texts:
             self.paragraphs.append(''.join(texts))
             sentenceEnders = re.compile('[.!?][\s]{1,2}(?=[A-Z])')
             self.sentenceList = self.sentenceList + sentenceEnders.split(''.join(texts))
                   
     self.data = '\n\n'.join(self.paragraphs)
     self.words = re.findall(r"[\w']+", self.data)
     self.filteredData = ' '.join(self.words)
Example #30
def docx_do_docx(azip, afile):
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'

    xml_content = azip.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(par):
        texts = [node.text for node in paragraph.getiterator(txt)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)
Example #31
    def getContents(self):
        """
        Just read the paragraphs from an XML file.
        """

        xml_content = self.my_docx.read('word/document.xml')
        self.my_docx.close()
        tree = XML(xml_content)

        self.text_in_paragraphs = []
        for paragraph in tree.getiterator(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                self.text_in_paragraphs.append(''.join(texts))
                pass
            pass
    #print paragraphs
        return self.text_in_paragraphs
Example #32
def docxparse(inDocx, outDocx):
    with open(outDocx, "w+") as outDocx:

        #Take the path of a docx file as argument, return the text in unicode.

        document = zipfile.ZipFile(inDocx)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)
        i = 0
        paragraphs = []
        for paragraph in tree.getiterator(PARA):
            texts = [
                node.text for node in paragraph.getiterator(TEXT) if node.text
            ]
            if texts:
                paragraphs.append(''.join(texts))
        outDocx.write('\n'.join(paragraphs))
Example #33
def run(args):

    opts = parse_args(args)

    zip = ZipFile(opts.graph)
    graphs = filter(lambda x: x.endswith('.graphml'), zip.namelist())

    for f in graphs:
        csv = open(f.split('/')[1].split('.')[0] + '.csv', 'w')
        xml = XML(zip.open(f).read())
        for e in xml.findall('{http://graphml.graphdrawing.org/xmlns}graph/{http://graphml.graphdrawing.org/xmlns}node/{http://graphml.graphdrawing.org/xmlns}data/{http://maltego.paterva.com/xml/mtgx}MaltegoEntity'):
            csv.write(('"Entity Type=%s",' % e.get('type')).strip())
            for prop in e.findall('{http://maltego.paterva.com/xml/mtgx}Properties/{http://maltego.paterva.com/xml/mtgx}Property'):
                value = prop.find('{http://maltego.paterva.com/xml/mtgx}Value').text or ''
                if '"' in value:
                    value = value.replace('"', '""')
                csv.write(('"%s=%s",' % (prop.get('displayName'), value)).strip())
            csv.write('\n')
Example #34
        def listChildrenViaPropfind():
            authPrincipal = yield self.actualRoot.findPrincipalForAuthID(
                "user01")
            request = SimpleStoreRequest(self,
                                         "PROPFIND",
                                         "/calendars/__uids__/user01/",
                                         authPrincipal=authPrincipal)
            request.headers.setHeader("depth", "1")
            response = yield self.send(request)
            response = IResponse(response)
            data = yield allDataFromStream(response.stream)

            tree = XML(data)
            seq = [e.text for e in tree.findall("{DAV:}response/{DAV:}href")]
            shortest = min(seq, key=len)
            seq.remove(shortest)
            filtered = [elem[len(shortest):].rstrip("/") for elem in seq]
            returnValue(filtered)
Example #35
def parse_auth(soup, iview_config):
	"""	There are lots of goodies in the auth handshake we get back,
		but the only ones we are interested in are the RTMP URL, the auth
		token, and whether the connection is unmetered.
	"""

	xml = XML(soup)
	xmlns = "http://www.abc.net.au/iView/Services/iViewHandshaker"

	# should look like "rtmp://203.18.195.10/ondemand"
	rtmp_url = xml.find('{%s}server' % (xmlns,)).text

	# at time of writing, either 'Akamai' (usually metered) or 'Hostworks' (usually unmetered)
	stream_host = xml.find('{%s}host' % (xmlns,)).text

	if stream_host == 'Akamai':
		playpath_prefix = config.akamai_playpath_prefix
	else:
		playpath_prefix = ''

	if rtmp_url is not None:
		# Being directed to a custom streaming server (i.e. for unmetered services).
		# Currently this includes Hostworks for all unmetered ISPs except iiNet.

		rtmp_chunks = rtmp_url.split('/')
		rtmp_host = rtmp_chunks[2]
		rtmp_app = rtmp_chunks[3]
	else:
		# We are a bland generic ISP using Akamai, or we are iiNet.
		rtmp_url  = iview_config['rtmp_url']
		rtmp_host = iview_config['rtmp_host']
		rtmp_app  = iview_config['rtmp_app']

	token = xml.find("{%s}token" % (xmlns,)).text

	return {
		'rtmp_url'        : rtmp_url,
		'rtmp_host'       : rtmp_host,
		'rtmp_app'        : rtmp_app,
		'playpath_prefix' : playpath_prefix,
		'token'           : token,
		'free'            :
			(xml.find("{%s}free" % (xmlns,)).text == "yes")
	}
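
For reference, a tiny self-contained illustration of the '{namespace}tag' find pattern used above, with a made-up handshake document:

from xml.etree.ElementTree import XML

xmlns = 'http://www.abc.net.au/iView/Services/iViewHandshaker'
doc = XML('<iview xmlns="%s"><token>abc123</token><free>yes</free></iview>' % xmlns)
print(doc.find('{%s}token' % xmlns).text)          # abc123
print(doc.find('{%s}free' % xmlns).text == 'yes')  # True
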
Example #36
class LayoutBuilder():
    def __init__(self, im,  pl):
        self.im = im
        self.pl = pl
        self.plugins = {}
        self.trb = TextRendererBuilder()
        
    def build(self, xml, parent):
        self.tree = XML(self._surround(xml))
        self._readPlugins(self.tree.find("plugins"))
        return self._readBody(self.tree.find("body"),  parent)
            
    def _buildLine(self, element,  parent):
        line = QGraphicsLinearLayout(Qt.Horizontal, parent)
        for e in element:
            line.addItem(self._buildLabel(e))
        return line
            
    def _buildLabel(self, element):
        item = self.im.getItemByName(element.tag)
        plugin = self.plugins[element.get('parser')]
        return self.trb.build(item, plugin, element.text, element.get("align"))
        
    def _readPlugins(self,  xml):
        for element in xml:
            plugin = self.pl.getPluginByName(element.tag)
            self.plugins[element.get("name")] = plugin
            optionKeys = element.keys()
            optionKeys.remove("name")
            options = {}
            for key in optionKeys:
                options[key] = element.get(key)
            plugin.load(options)
            
    def _readBody(self,  xml,  parent):
        layout = QGraphicsLinearLayout(Qt.Vertical, parent)
        for element in xml.getiterator("line"):
            print "parsing line"
            layout.addItem(self._buildLine(element,  layout))
        return layout
        
    def _surround(self,  xml):
        print xml
        return "<all>" + xml + "</all>"
Example #37
    def getContents(self):
        """
        Just read the paragraphs from an XML file.
        """

        xml_content = self.my_docx.read('word/document.xml')
        self.my_docx.close()
        tree = XML(xml_content)

        self.text_in_paragraphs = []
        for paragraph in tree.getiterator(PARA):
            texts = [node.text for node in paragraph.iter(TEXT) if node.text]
            if texts:
                self.text_in_paragraphs.append(''.join(texts))
                pass
            pass

    #print paragraphs
        return self.text_in_paragraphs
Example #38
def get_docx_text(fileName):
    zipFile = zipfile.ZipFile(fileName)

    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'

    name = 'word/document.xml'
    document = zipFile.read(name, pwd=None)
    documentTree = XML(document)

    paragraphs = []
    for paragraph in documentTree.getiterator(PARA):
        texts = [
            node.text for node in paragraph.getiterator(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    return ('\n\n'.join(paragraphs))
Example #39
    def get_docx_text(path):
        WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
        PARA = WORD_NAMESPACE + 'p'
        TEXT = WORD_NAMESPACE + 't'
        """
        Take the path of a docx file as argument, return the text in unicode.
        """
        document = zipfile.ZipFile(path)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)

        paragraphs = []
        for paragraph in tree.getiterator(PARA):
            texts = [node.text
                    for node in paragraph.getiterator(TEXT)
                    if node.text]
            if texts: paragraphs.append(''.join(texts))
        return '\n\n'.join(paragraphs)
Example #40
def queryUniprot(id, expand=[], regex=True):
    """Query Uniprot with *id* and return a `dict` containing the raw results. 
    Regular users should use :func:`searchUniprot` instead.
    
    :arg expand: entries through which you want to loop dictElements
        until there aren't any elements left
    :type expand: list
    """

    if not isinstance(id, str):
        raise TypeError('id should be a string')

    try:
        record_file = openURL('http://www.uniprot.org/uniprot/{0}.xml'.format(id))
    except:
        raise ValueError('No Uniprot record found with that id')
    
    data = record_file.read()
    record_file.close()
    data = XML(data)

    data = dictElement(data[0], '{http://uniprot.org/uniprot}', number_multiples=True)

    for key in data:
        value = data[key]
        if not key.startswith('dbReference'):
            continue
        
        try:
            if value.get('type') != 'PDB':
                continue
        except AttributeError:
            continue

        pdbid = value.get('id')
        refdata = {'PDB': pdbid}
        for prop in value:
            prop_key = prop.get('type')
            prop_val = prop.get('value')
            refdata[prop_key] = prop_val
        data[key] = refdata
            
    if expand:
        keys = []
        if regex:
            for lt in expand:
                lt_re = re.compile(lt)
                for key in data:
                    if lt_re.match(key):
                        keys.append(key)
        else:
            keys = expand
        data = dictElementLoop(data, keys, '{http://uniprot.org/uniprot}')
    
    return data
Example #41
def mtgx2csv(opts):

    zipfile = ZipFile(opts.graph)
    graphs = filter(lambda x: x.endswith(".graphml"), zipfile.namelist())

    for f in graphs:
        filename = "%s_%s" % (opts.graph.replace(".", "_", 1), os.path.basename(f).replace(".graphml", ".csv", 1))
        print "Writing data from %s/%s to %s..." % (opts.graph, f, filename)
        with open(filename, "wb") as csvfile:
            csv = writer(csvfile)
            xml = XML(zipfile.open(f).read())
            links = {}
            for edge in xml.findall(
                "{http://graphml.graphdrawing.org/xmlns}graph/" "{http://graphml.graphdrawing.org/xmlns}edge"
            ):
                src = edge.get("source")
                dst = edge.get("target")
                if src not in links:
                    links[src] = dict(in_=0, out=0)
                if dst not in links:
                    links[dst] = dict(in_=0, out=0)
                links[src]["out"] += 1
                links[dst]["in_"] += 1

            for node in xml.findall(
                "{http://graphml.graphdrawing.org/xmlns}graph/" "{http://graphml.graphdrawing.org/xmlns}node"
            ):

                node_id = node.get("id")
                node = node.find(
                    "{http://graphml.graphdrawing.org/xmlns}data/" "{http://maltego.paterva.com/xml/mtgx}MaltegoEntity"
                )

                row = [to_utf8(("Entity Type=%s" % node.get("type")).strip())]
                for prop in node.findall(
                    "{http://maltego.paterva.com/xml/mtgx}Properties/" "{http://maltego.paterva.com/xml/mtgx}Property"
                ):
                    value = prop.find("{http://maltego.paterva.com/xml/mtgx}Value").text or ""
                    row.append(to_utf8(("%s=%s" % (prop.get("displayName"), value)).strip()))
                row.append("Incoming Links=%s" % links.get(node_id, {}).get("in_", 0))
                row.append("Outgoing Links=%s" % links.get(node_id, {}).get("out", 0))
                csv.writerow(row)
Example #42
 def validate(self, value, model_instance):
     super(HTMLField, self).validate(value, model_instance)
     if self.xml and value and value.strip():
         try:
             value = self.get_prep_value(value)
             if isinstance(value, unicode):
                 value = value.encode('utf-8')
             XML('<root>%s</root>' % value)
         except (ExpatError, SyntaxError):
             raise exceptions.ValidationError(
                 self.error_messages['invalid'])
Example #43
def parse_captions(soup):
	"""	Converts custom iView captions into SRT format, usable in most
		decent media players.
	"""
	xml = XML(soup)

	output = ''

	i = 1
	for title in xml.getiterator('title'):
		start = title.get('start')
		ids = start.rfind(':')
		end = title.get('end')
		ide = end.rfind(':')
		output = output + str(i) + '\n'
		output = output + start[:ids] + ',' + start[ids+1:] + ' --> ' + end[:ide] + ',' + end[ide+1:] + '\n'
		output = output + title.text.replace('|','\n') + '\n\n'
		i += 1

	return output
Example #44
def uninstallmachines(package, prefix):
    try:
        prefix = path.join(prefix, 'config', 'Maltego', 'Machines')
        n = path.join(prefix, '.nbattrs')
        e = XML('<attributes version="1.0"/>')
        if path.exists(n):
            e = XML(file(n).read())
        if not path.exists(prefix):
            return
        package = '%s.resources.maltego' % package
        for m in filter(lambda x: x.endswith('.machine'), resource_listdir(package, '')):
            print 'Uninstalling machine %s...' % m
            try:
                unlink(path.join(prefix, m))
                uninstallnbattr(m, e)
            except OSError:
                pass
        ElementTree(e).write(file(n, 'wb'))
    except ImportError, e:
        pass
Example #45
def get_docx_text(path, options):
    #Take the path of a docx file as argument, return the text in unicode.
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    para = []
    for paragraph in tree.getiterator(PARA):
        alltext = [node.text
                 for node in paragraph.getiterator(TEXT)
                 if node.text]
        for bullet in paragraph.getiterator(BULLETNUM):
            if len(para) > 0:
                paragraphs.append(''.join(para))
                para = []
        para.append(''.join(alltext))
    paragraphs.append(''.join(para)) #for the last para to be appended
    return '\n\n'.join(paragraphs), [paragraphs[i:i+options+1] for i in range(0, len(paragraphs), options+1)]
Example #46
def docxExtractor(path):
    # Function to extract content from docx files, takes path input if path endswith .docx
    # Start by reading MSoffice zipfile
    document = zipfile.ZipFile(path)
    # Search in xml structure the location of content (here it's word/document.xml)
    xml_content = document.read('word/document.xml')
    document.close()
    # Generate xml tree structure from content location
    tree = XML(xml_content)
    # Initialize dictionary to contain content and index per paragraph
    doc = {}
    # Initialize string to contain concatenated text from document for nlp purposes
    s = ''
    # vector = {}
    paragraph_nb = 1
    # Iterate through all elements of the xml tree
    for paragraph in tree.getiterator(PARA):
        # Append to list if node in tree contains non-null text
        texts = [
            node.text
            for node in paragraph.iter(TEXT)  #paragraph.getiterator(TEXT)
            if node.text
        ]
        if texts:
            # Concatenate non null text contained in previous list
            text = ''.join(texts)
            # Index concatenated string to paragtaph number
            doc[str(paragraph_nb)] = fix_text(text)
            # Append concatenated string to current string (for nlp)
            s += fix_text(text)
            #            if vectors:
            #                vector[str(paragraph_nb)] = vectorizer(text, lang=detect(text))
            #            else:
            #                pass
            paragraph_nb += 1


#    if vectors:
#        return creator, doc, vector
#    else:
    return doc, s
Example #47
 def get_existing_bookmarks(self):
     """
     Get the existing bookmarks from the Google Bookmarks API.
     We need to do this in Firefox so that the cookie which authorises us with the API is set.
     @return: -
     """
     self.client.navigate(
         "https://www.google.com/bookmarks/?output=xml&num=10000")
     # Initialise XML object
     root = XML(self.client.page_source.encode("utf-8"))
     # Return set of bookmarks
     return set([bookmark[1].text for bookmark in root[0]])
Example #48
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(PARA):
        texts = [
            node.text for node in paragraph.getiterator(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))
    content = '\n'.join(paragraphs)
    content = ''.join([
        s for s in content.strip().splitlines(True) if s.strip("\n").strip()
    ])  #skiping blank lines
    return content
Example #49
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    PARA = WORD_NAMESPACE + 'p'
    TEXT = WORD_NAMESPACE + 't'
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(PARA):
        texts = [node.text
                 for node in paragraph.getiterator(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
Example #50
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'   # formatting for docx
    PARA = WORD_NAMESPACE + 'p'                                                         # formatting for paragraphs
    TEXT = WORD_NAMESPACE + 't'                                                         # formatting for text
    document = zipfile.ZipFile(path)                                                    # the unzipped document path
    xml_content = document.read('word/document.xml')                                    # location of the primary xml document
    document.close()                                                                    # closes the document
    tree = XML(xml_content)                                                             # splits the xl into a tree

    paragraphs = []                                                                     # a list of the paragraphs
    for paragraph in tree.getiterator(PARA):                                            # for every new paragraph in the tree
        texts = [node.text                                                              # the text is the text node in the tree
                 for node in paragraph.getiterator(TEXT)                                # 
                 if node.text]                                                          # if the node is text, add it to the text list
        if texts:                                                                       # if a text is found,
            paragraphs.append(''.join(texts))                                           # add it to the paragraphs list
    #return('\n\n'.join(paragraphs))
    return(paragraphs)                                                                  # return the paragra
Example #51
def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(PARA):
        texts = [
            node.text for node in paragraph.getiterator(TEXT) if node.text
        ]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)


#print(get_docx_text('C:\\Users\\vanquangcz\\Desktop\\python\\project\\data\\input\\3.docx'))
Example #52
    def dt_docxml_to_text(self, filename, codec='utf-8'):
        """ 
        Argument :
        
        filename : input file name
        
        Return :
        
        return file text
        
        
        Note :
        
        docx file to text 

        """

        texts = ""

        document = zipfile.ZipFile(filename)
        xml_content = document.read('word/document.xml')
        document.close()
        tree = XML(xml_content)

        sections = []
        for section in tree.getiterator(self.PARA):
            texts = ''
            for node in section.getiterator(self.TEXT):
                if node.text:
                    texts += node.text
            sections.append(''.join(texts))
        '''
        for section in tree.getiterator(self.PARA):
            texts = [node.text for node in section.getiterator(self.TEXT) if node.text]
            if texts:
                sections.append(''.join(texts))
        '''
        texts = '\n\n'.join(sections)

        return texts
Example #53
def submit_unfinished_form(session):
    """
    Gets the raw instance of the session's form and submits it. This is used with
    sms and ivr surveys to save all questions answered so far in a session that
    needs to close.

    If session.include_case_updates_in_partial_submissions is False, no case
    create / update / close actions will be performed, but the form will still be submitted.

    The form is only submitted if the smsforms session has not yet completed.
    """
    # Get and clean the raw xml
    try:
        response = FormplayerInterface(session.session_id,
                                       session.domain).get_raw_instance()
        # Formplayer's ExceptionResponseBean includes the exception message,
        # status ("error"), url, and type ("text")
        if response.get('status') == 'error':
            raise TouchformsError(response.get('exception'))
        xml = response['output']
    except InvalidSessionIdException:
        return
    root = XML(xml)
    case_tag_regex = re.compile(
        r"^(\{.*\}){0,1}case$"
    )  # Use regex in order to search regardless of namespace
    meta_tag_regex = re.compile(r"^(\{.*\}){0,1}meta$")
    timeEnd_tag_regex = re.compile(r"^(\{.*\}){0,1}timeEnd$")
    current_timstamp = json_format_datetime(utcnow())
    for child in root:
        if case_tag_regex.match(child.tag) is not None:
            # Found the case tag
            case_element = child
            case_element.set("date_modified", current_timstamp)
            if not session.include_case_updates_in_partial_submissions:
                # Remove case actions (create, update, close)
                child_elements = [case_action for case_action in case_element]
                for case_action in child_elements:
                    case_element.remove(case_action)
        elif meta_tag_regex.match(child.tag) is not None:
            # Found the meta tag, now set the value for timeEnd
            for meta_child in child:
                if timeEnd_tag_regex.match(meta_child.tag):
                    meta_child.text = current_timstamp
    cleaned_xml = tostring(root)

    # Submit the xml
    result = submit_form_locally(cleaned_xml,
                                 session.domain,
                                 app_id=session.app_id,
                                 partial_submission=True)
    session.submission_id = result.xform.form_id
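
A small illustration of the namespace-agnostic tag matching used above; the namespace URI is made up.

import re

case_tag_regex = re.compile(r"^(\{.*\}){0,1}case$")
print(bool(case_tag_regex.match('{urn:example:case}case')))  # True
print(bool(case_tag_regex.match('case')))                    # True
print(bool(case_tag_regex.match('basecase')))                # False
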
Example #54
def installmtz(package, prefix):
    try:
        src = resource_filename('%s.resources.maltego' % package, 'entities.mtz')
        if not os.path.exists(src):
            return
        prefix = os.path.join(prefix, 'config', 'Maltego', 'Entities')
        z = ZipFile(src)
        entities = filter(lambda x: x.endswith('.entity'), z.namelist())

        for e in entities:
            data = z.open(e).read()
            xml = XML(data)
            category = xml.get('category')
            catdir = os.path.join(prefix, category)
            if not os.path.exists(catdir):
                os.mkdir(catdir)
            p = os.path.join(catdir, os.path.basename(e))
            print 'Installing entity %s to %s...' % (e, p)
            with open(p, 'wb') as f:
                f.write(data)
    except ImportError:
        pass
Example #55
def installmachines(package, prefix):
    try:
        prefix = os.path.join(prefix, 'config', 'Maltego', 'Machines')
        n = os.path.join(prefix, '.nbattrs')
        e = XML('<attributes version="1.0"/>')
        if os.path.exists(n):
            e = XML(file(n).read())
        if not os.path.exists(prefix):
            os.mkdir(prefix)
        package = '%s.resources.maltego' % package
        for m in filter(lambda x: x.endswith('.machine'),
                        resource_listdir(package, '')):
            src = resource_filename(package, m)
            dst = os.path.join(prefix, m)
            print 'Installing machine %s to %s...' % (src, dst)
            with open(dst, 'wb') as f:
                data = file(src).read()
                f.write(data)
                installnbattr(e, data, m)
        ElementTree(e).write(file(n, 'wb'))
    except ImportError, e:
        pass
Example #56
def write_tei_dict(from_lang, to_lang):
    print(from_lang, to_lang)
    pos_usage = ''.join('<item ana="{1}">{0}</item>'.format(*pos)
                        for pos in list(pos_mapping.values()))

    # get entries, this is where most work is done
    entries, headwords = get_tei_entries_as_xml(from_lang, to_lang)
    if headwords == 0:
        return

    if headwords >= 10000:
        status = 'big enough to be useful'
    elif headwords < 1000:
        status = 'too small'
    else:
        status = 'unknown'

    # prepare template
    register_namespace('', 'http://www.tei-c.org/ns/1.0')
    today = datetime.date.today().isoformat()
    version = today.replace('-', '.')
    tei_template_xml = XML(tei_template.format(
        from_name=language_names[from_lang],
        to_name=language_names[to_lang], headwords=headwords,
        from_lang=from_lang,
        today=today, version=version,
        pos_usage=pos_usage, status=status,
    ))
    indent(tei_template_xml)

    # render xml and add entries
    rendered_template = tostring(tei_template_xml, 'utf-8').decode('utf-8')
    complete_tei = rendered_template.format(
        entries=entries,
    )

    # write to file and add declarations
    out_dir = 'dictionaries/tei' + ('small/' if headwords < 5000 else '')
    os.makedirs(out_dir, exist_ok=True)
    out_filename = '{}/{}-{}.tei'.format(
        out_dir,
        language_codes3[from_lang],
        language_codes3[to_lang])
    with codecs.open(out_filename, 'w', 'utf-8') as out_file:
        out_file.write("""
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/css" href="freedict-dictionary.css"?>
<?oxygen RNGSchema="freedict-P5.rng" type="xml"?>
<!DOCTYPE TEI SYSTEM "freedict-P5.dtd">
        """.strip() + '\n')
        out_file.write(complete_tei)
Example #57
    def indexDocs(self, path, search_str):
        """
        Take the path of a docx file as argument, return the text in unicode.
        """
        document = zipfile.ZipFile(path)
        #contentToRead = ["header2.xml", "document.xml", "footer2.xml"]
        contentToRead = ["document.xml"]
        paragraphs = []

        for xmlfile in contentToRead:
            xml_content = document.read('word/{}'.format(xmlfile))
            tree = XML(xml_content)
            for paragraph in tree.getiterator(PARA):
                texts = [
                    node.text for node in paragraph.getiterator(TEXT)
                    if node.text
                ]
                if texts:
                    textData = ''.join(texts)
                    if xmlfile == "footer2.xml":
                        extractedTxt = "Footer : " + textData
                    elif xmlfile == "header2.xml":
                        extractedTxt = "Header : " + textData
                    else:
                        extractedTxt = textData

                    paragraphs.append(extractedTxt)
        document.close()
        #return '\n\n'.join(paragraphs)
        #print '\n\n'.join(paragraphs)
        line_no = 1
        for line in paragraphs:
            #print line
            if search_str in line:
                #print paragraphs[i]
                print("Found in : " + str(fname) + "[Paragraph no - " +
                      str(line_no) + "] ")
            line_no += 1
Example #58
    def test_serializedAttributeWithTagWithAttribute(self):
        """
        Similar to L{test_serializedAttributeWithTag}, but for the additional
        complexity where the tag which is the attribute value itself has an
        attribute value which contains bytes which require substitution.
        """
        flattened = self.assertFlattensImmediately(
            tags.img(src=tags.a(href='<>&"')), '<img src="&lt;a href='
            '&quot;&amp;lt;&amp;gt;&amp;amp;&amp;quot;&quot;&gt;'
            '&lt;/a&gt;" />')

        # As in checkTagAttributeSerialization, belt-and-suspenders:
        self.assertXMLEqual(
            XML(flattened).attrib['src'], '<a href="&lt;&gt;&amp;&quot;"></a>')
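
For readers unfamiliar with the assertion above: XML() unescapes entity references while parsing, so the attribute value round-trips back to the original markup string.

from xml.etree.ElementTree import XML

el = XML('<img src="&lt;a href=&quot;x&quot;&gt;&lt;/a&gt;" />')
print(el.attrib['src'])  # <a href="x"></a>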