def make_xml(filedig,filehw,fileout): # slurp txt file into list of lines with codecs.open(filein,encoding='utf-8',mode='r') as f: inlines = [line.rstrip('\r\n') for line in f] # parse xxxhw.txt hwrecs = init_hwrecs(filehw) # open output xml file fout = codecs.open(fileout,'w','utf-8') nout = 0 # count of lines written to fout # generate xml header lines lines = xml_header(xmlroot) for line in lines: fout.write(line + '\n') nout = nout + 1 # process hwrecs records one at a time and generate output nerr = 0 for ihwrec,hwrec in enumerate(hwrecs): if ihwrec > 1000000: # 12 print("debug stopping") break datalines = get_datalines(hwrec,inlines) # construct output xmlstring = construct_xmlstring(datalines,hwrec) # data is a string, which should be well-formed xml # try parsing this string to verify well-formed. try: root = ET.fromstring(xmlstring.encode('utf-8')) except:
def make_xml(filedig, filehw, fileout):
    """Write the XML file for a dictionary, one line per headword record.

    Args:
        filedig: path of the UTF-8 text file; it is read completely into a
            list of lines with the line endings stripped.
        filehw: path handed to ``init_hwrecs`` to obtain the headword records.
        fileout: path of the UTF-8 XML file to create.

    The output consists of the lines returned by ``xml_header(xmlroot)``,
    then the XML string of every record that ``ET.fromstring`` accepts, then
    the closing ``</xmlroot>`` tag.  Records that fail to parse are counted
    and left out of the output; a one-line summary is printed at the end.
    """
    # For debugging, set to True to dump every rejected record to stdout.
    # Disabled by default (01-09-2021) because some Python versions
    # (e.g. 2.7.5) report errors for records that are actually fine.
    show_errors = False

    # Slurp the text file into a list of lines.  The original opened the
    # name `filein`, which is not a parameter of this function; the path
    # that is passed in is `filedig`.
    with codecs.open(filedig, encoding='utf-8', mode='r') as f:
        inlines = [line.rstrip('\r\n') for line in f]
    # parse xxxhw.txt
    hwrecs = init_hwrecs(filehw)

    nerr = 0  # number of records rejected by the parser
    # `with` guarantees the output file is closed even if a record raises.
    with codecs.open(fileout, 'w', 'utf-8') as fout:
        for line in xml_header(xmlroot):
            fout.write(line + '\n')
        # process hwrecs records one at a time and generate output
        for ihwrec, hwrec in enumerate(hwrecs):
            if ihwrec > 1000000:
                # debugging cap; lower the limit to process only a few records
                print("debug stopping")
                break
            datalines = get_datalines(hwrec, inlines)
            xmlstring = construct_xmlstring(datalines, hwrec)
            # xmlstring should be well-formed xml; parse it to verify.
            try:
                ET.fromstring(xmlstring)
            except Exception:
                # Still best-effort (any parser failure skips the record),
                # but Ctrl-C / SystemExit are no longer swallowed.
                nerr += 1
                if show_errors:
                    report = [
                        "<!-- xml error #%s: L = %s, hw = %s-->"
                        % (nerr, hwrec.L, hwrec.k1),
                        "datalines = ",
                    ]
                    report.extend(datalines)
                    report.extend(["xmlstring=", xmlstring, ''])
                    for out in report:
                        print(out)
                continue
            fout.write(xmlstring + '\n')
        # write closing line for xml file.
        fout.write("</%s>\n" % xmlroot)

    if nerr == 0:
        print("All records parsed by ET")
    else:
        print("WARNING: make_xml.py:", nerr, "records not parsed by ET")
def make_xml(filedig, filehw, fileout):
    """Write the XML file for a dictionary, one line per headword record.

    Args:
        filedig: path of the UTF-8 text file; it is read completely into a
            list of lines with the line endings stripped.
        filehw: path handed to ``init_hwrecs`` to obtain the headword records.
        fileout: path of the UTF-8 XML file to create.

    The output consists of the lines returned by ``xml_header(xmlroot)``,
    then the XML string of every record whose UTF-8 encoding
    ``ET.fromstring`` accepts, then the closing ``</xmlroot>`` tag.  A record
    that fails to parse is reported on stdout (error number, ``L``, ``k1``,
    its data lines and the generated string) and left out of the output.
    """
    # Slurp the text file into a list of lines.  The original opened the
    # name `filein`, which is not a parameter of this function; the path
    # that is passed in is `filedig`.
    with codecs.open(filedig, encoding='utf-8', mode='r') as f:
        inlines = [line.rstrip('\r\n') for line in f]
    # parse xxxhw.txt
    hwrecs = init_hwrecs(filehw)

    nerr = 0  # number of records rejected by the parser
    # `with` guarantees the output file is closed even if a record raises.
    with codecs.open(fileout, 'w', 'utf-8') as fout:
        for line in xml_header(xmlroot):
            fout.write(line + '\n')
        # process hwrecs records one at a time and generate output
        for ihwrec, hwrec in enumerate(hwrecs):
            if ihwrec > 1000000:
                # debugging cap; lower the limit to process only a few records
                print("debug stopping")
                break
            datalines = get_datalines(hwrec, inlines)
            xmlstring = construct_xmlstring(datalines, hwrec)
            # xmlstring should be well-formed xml; parse it to verify.
            try:
                ET.fromstring(xmlstring.encode('utf-8'))
            except Exception:
                # Still best-effort (any failure skips the record), but
                # Ctrl-C / SystemExit are no longer swallowed.
                nerr += 1
                report = [
                    "<!-- xml error #%s: L = %s, hw = %s-->"
                    % (nerr, hwrec.L, hwrec.k1),
                    "datalines = ",
                ]
                report.extend(datalines)
                report.extend(["xmlstring=", xmlstring, ''])
                for out in report:
                    # NOTE(review): printing encoded bytes looks like a
                    # Python 2 habit; under Python 3 this shows b'...' --
                    # confirm the target interpreter before changing it.
                    print(out.encode('utf-8'))
                continue
            fout.write(xmlstring + '\n')
        # write closing line for xml file.
        fout.write("</%s>\n" % xmlroot)
def extract_keys(filein, fileout):
    """Write one ``key,category,L`` line per headword record to *fileout*.

    Args:
        filein: path handed to ``hwparse.init_hwrecs`` to obtain the records.
        fileout: path of the UTF-8 output file to create.

    For each record ``r`` the line holds ``r.k1``, ``'H' + r.e`` and ``r.L``
    separated by commas.  The record counts are printed when finished.
    """
    n = 0  # records read; every record is also written
    # The output file is opened before parsing, as before; `with` closes it
    # even when parsing or a write raises.
    with codecs.open(fileout, "w", 'utf-8') as fout:
        for r in hwparse.init_hwrecs(filein):
            n += 1
            cat = 'H' + r.e
            fout.write('%s,%s,%s\n' % (r.k1, cat, r.L))
    print(n, "records in,", n, "records written")
pc = rec.pc k1 = rec.k1 L = rec.L out1 = '%s:%s:%s,%s:%s' % (pc, k1, ln1, ln2, L) if rec.type == None: out2 = '' else: # For alternate headwords, one more field type,LP out2 = ':%s,%s' % (rec.type, rec.LP) out = out1 + out2 return out def extract_hw2(hwrecs): recs2 = [] # an array of strings for rec in hwrecs: # rec is HW object # construct HW2 object by excluding key2 # out1 = '%s:%s:%s:%s,%s:%s' %(pc,key1,key2,linenum1,linenum2,L) rec2 = extract_hw2_helper(rec) recs2.append(rec2) return recs2 if __name__ == "__main__": filein = sys.argv[1] # xxxhw.txt fileout = sys.argv[2] hwrecs = init_hwrecs(filein) hw2recs = extract_hw2(hwrecs) write(hw2recs, fileout)