# NOTE(review): the physical line below is a whitespace-collapsed fragment of a
# Python 2 script.  As written, Python executes only
#     outfile = open("railshead.txt", "w")
# because everything after the first "##" is parsed as a comment.
#
# The collapsed text contains, in order:
#   * REAL_1ST_LINE_NUM = 100004 (with an older, commented-out value of 92),
#   * "import rowheadgs" and "lineNO = 0",
#   * a "for line in infile:" loop that increments lineNO, compares the number
#     of lead2quote and trail2quote matches on the line, prints diagnostics and
#     calls sys.exit() when the counts differ, substitutes ",~~~" / "~~~,"
#     for those matches, removes leaddelim / traildelim matches, and splits the
#     line with delim_re.split(line, 15),
#   * "colNO = 0" followed by a "while colNO < HEADCOLS:" header.
# The body of that while loop is not part of this chunk, so the original line
# breaks and indentation are not reconstructed here.
# TODO: restore the original layout from the upstream copy of this script.
outfile = open("railshead.txt", "w") ## 4 cells of headers dup'd for each row of info cell REAL_1ST_LINE_NUM = 100004 ## EDIT as # delic users grows ("Overview/archit") ## XLS row 93 has "Lang Comparisons" # REAL_1ST_LINE_NUM=92 ## 2/28/07: adding 100k: IndxErr in flatfile4a.py # data struct to hold line#, 4 cumul.row headings import rowheadgs lineNO = 0 ## DON'T edit for start row in perlPYrb.XLS (0 or 68) # setup, clean up delimiters and split line for line in infile: lineNO += 1 # some cells have " appended outside |^ delimiters (?) so s/,"|^/,|^/ and s/|^",/|^,/ # test: even number of gsub! if len(lead2quote.findall(line)) != len(trail2quote.findall(line)): print "l. 36: CSV has unbalanced double quotes, line # %i" % lineNO ##DEBUG print "len(lead2quote.findall(line)): " + str(len(lead2quote.findall(line))) print "len(trail2quote.findall(line)): " + str(len(trail2quote.findall(line))) sys.exit() ##KILL PRogram if throw excep line = lead2quote.sub(r",~~~", line) line = trail2quote.sub(r"~~~,", line) # remove lead/trail |^, split, check # cells\ line = leaddelim.sub(r"", line) line = traildelim.sub(r"", line) line_exploded = delim_re.split(line, 15) # FIX: pyrb: trail'g "~";; rails: no trail'g # Line # in CSV file is 1-indexed, colNO = 0 while colNO < HEADCOLS:
# NOTE(review): this fragment was recovered from a whitespace-collapsed line;
# the indentation below is reconstructed from the statement order -- confirm
# against the upstream copy of the script.

# Tail of an earlier counting loop whose header lies before this chunk.
# NOTE(review): re-attach to that loop's indentation when merging.
i += 1

# The original read "inrownums.close" (no parentheses), which only looks up
# the method and never calls it, so the handle was never closed.
inrownums.close()

## print " ^^^^^ newcellrows: "; print newcellrows[0:20]   ##debug
# print "len of arrayofrowhead: %i\t len of line 31: %i"%(len(arrayofrowhead),
#       len(arrayofrowhead[6]) )

########### setup, clean up delimiters and split line
# Read every row of `incsv`, normalise quote characters matched by
# lead2quote / trail2quote, remove leaddelim / traildelim matches, split the
# row into cells with delim_re, and report rows whose cell count is not
# NUMCOLS.  `lineNO` is incremented per row and is expected to have been
# initialised before this point.
arrayofcells = []
pyURL_nosummary = {}  ## NOT USED yet
rbURL_nosummary = {}  ## NOT USED yet

for line in incsv:
    line = line.rstrip()
    lineNO += 1

    # A few cells have a '"' appended outside the "~~~" delimiters; these are
    # rewritten below via lead2quote / trail2quote.  Sanity check first: the
    # number of leading and trailing matches should be equal.
    if len(lead2quote.findall(line)) != len(trail2quote.findall(line)):
        # Single-argument parenthesised print: same output on Python 2,
        # and still parses on Python 3.
        print("CSV has unbalanced double quotes, line # %i" % lineNO)

    line = lead2quote.sub(r',~~~', line)
    line = trail2quote.sub(r'~~~,', line)

    # Remove leading/trailing delimiter, split, then check the cell count.
    line = leaddelim.sub(r'', line)
    line = traildelim.sub(r'', line)

    ## these 4 chars become $lt; etc: & < > "
    ## check/ test this method
    ## line = mult_rplc(multrplc.htmlDICT,line)   ## removed for debug, 9/19/06
    line_exploded = delim_re.split(line)

    ## 3 debug lines
    if len(line_exploded) != NUMCOLS:
        print("PROB LIne numb: ")
        print(lineNO)
        print("")  # blank separator line (was a bare Python 2 `print`)
        print("line_exploded: ")