Ejemplo n.º 1
0
    # Opens railshead.txt for writing ("w" truncates any existing file).
    # NOTE(review): no matching close() is visible in this excerpt - confirm
    # the handle is closed elsewhere.
    outfile = open("railshead.txt", "w")  ## 4 cells of headers dup'd for each row of info cell
    # Hard-coded line-number constant. Per the original notes it has to be
    # edited by hand as the data grows, and 100k was added on 2/28/07 after an
    # IndexError in flatfile4a.py (the earlier value, 92, is kept below).
    REAL_1ST_LINE_NUM = 100004  ## EDIT as # delic users grows ("Overview/archit")
    ## XLS row 93 has "Lang Comparisons"
    # REAL_1ST_LINE_NUM=92		## 2/28/07: adding 100k: IndxErr in flatfile4a.py
# data struct to hold line#, 4 cumul.row headings
import rowheadgs

# Running count of lines read from infile; incremented at the top of each
# iteration, so the first line is reported as line 1.
lineNO = 0  ## DON'T edit for start row in perlPYrb.XLS  (0 or 68)

# setup, clean up delimiters and split line
# NOTE(review): infile, lead2quote, trail2quote, leaddelim, traildelim,
# delim_re, HEADCOLS and sys are not defined in this excerpt - presumably they
# are set up earlier in the script.
for line in infile:
    lineNO += 1

    # some cells have " appended outside |^ delimiters (?) so s/,"|^/,|^/ and s/|^",/|^,/
    # test: even number of gsub!
    # The number of lead2quote matches must equal the number of trail2quote
    # matches on the line; otherwise the counts are printed and the run stops.
    if len(lead2quote.findall(line)) != len(trail2quote.findall(line)):
        print "l. 36: CSV has unbalanced double quotes, line # %i" % lineNO  ##DEBUG
        print "len(lead2quote.findall(line)): " + str(len(lead2quote.findall(line)))
        print "len(trail2quote.findall(line)): " + str(len(trail2quote.findall(line)))
        # NOTE(review): sys.exit() with no argument exits with status 0, so a
        # calling shell cannot tell this abort from a successful run.
        sys.exit()  ##KILL PRogram if throw excep
    # Replace each lead2quote match with ',~~~' and each trail2quote match
    # with '~~~,'.
    line = lead2quote.sub(r",~~~", line)
    line = trail2quote.sub(r"~~~,", line)

    # remove lead/trail |^, split, check # cells
    line = leaddelim.sub(r"", line)
    line = traildelim.sub(r"", line)
    # maxsplit=15 means at most 16 fields come back; anything after the 15th
    # delimiter stays in the last field.
    line_exploded = delim_re.split(line, 15)  # FIX: pyrb: trail'g "~";; rails: no trail'g

    # Line # in CSV file is 1-indexed,
    colNO = 0
    # NOTE(review): as shown, the loop body only does `i+=1`; colNO is never
    # advanced and `i` is not initialised in this excerpt, so the loop cannot
    # terminate normally when HEADCOLS > 0. The excerpt looks truncated here -
    # confirm against the full script before relying on it.
    while colNO < HEADCOLS:
	i+=1
# Call close(): the bare attribute reference `inrownums.close` only looked the
# method up without invoking it, so the file was never explicitly closed.
inrownums.close()
## print " ^^^^^ newcellrows:  ";		print newcellrows[0:20]		##debug

# print "len of arrayofrowhead: %i\t len of line 31: %i"%(len(arrayofrowhead),
	#len(arrayofrowhead[6]) )

########### setup, clean up delimiters and split line
# Accumulators initialised before the CSV pass. Nothing in the visible part of
# the loop writes to them.
# NOTE(review): incsv, lineNO, lead2quote, trail2quote, leaddelim, traildelim,
# delim_re and NUMCOLS are not defined in this excerpt - presumably they are
# set up earlier in the script.
arrayofcells=[];
pyURL_nosummary={}; rbURL_nosummary={}; 		## NOT USED yet
for line in incsv:
	# Strip trailing whitespace (including the newline) before any matching.
	line=line.rstrip()
	lineNO+=1
	# few cells:" appended outside "~~~"delimiters: srch/replc w/trail2quote & lead, SO
	# test: even number of gsub!
	# A mismatch between the two match counts is only reported here;
	# processing of the line continues afterwards.
	if (len(lead2quote.findall(line)) != len(trail2quote.findall(line)) ):
		print "CSV has unbalanced double quotes, line # %i" % lineNO
	# Replace each lead2quote match with ',~~~' and each trail2quote match
	# with '~~~,'.
	line=lead2quote.sub(r',~~~',line)
	line=trail2quote.sub(r'~~~,',line)
	# remove lead/trail |^, split, check # cells
	line = leaddelim.sub(r'',line)
	line = traildelim.sub(r'',line)
## 	these 4 chars become $lt; etc: & < > "
	## check/ test this method
##	line = mult_rplc(multrplc.htmlDICT,line)		## removed for debug, 9/19/06
	# Split on every delimiter match (no maxsplit).
	line_exploded=delim_re.split(line)		## 3 debug lines
	# Diagnostic output for any line whose cell count differs from NUMCOLS.
	if len(line_exploded)!=NUMCOLS:
		print "PROB LIne numb: " ; 
		print lineNO
		print
		print "line_exploded: "