Example #1
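All of the snippets in this listing write their CSV output through a UnicodeWriter helper, usually imported with "from UnicodeWriter import UnicodeWriter", but none of them show its definition. For reference, here is a minimal sketch based on the UnicodeWriter recipe from the Python 2 csv module documentation; the helper actually used by these projects may differ in detail.

import csv, codecs, cStringIO

class UnicodeWriter:
    '''
    A CSV writer which writes rows to the file object "f",
    encoded in the given encoding.
    '''

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # buffer each row in memory, then re-encode it onto the real stream
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        self.writer.writerow([s.encode("utf-8") for s in row])
        # fetch the UTF-8 output from the queue and re-encode it
        # into the target encoding before writing to the stream
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        data = self.encoder.encode(data)
        self.stream.write(data)
        # empty the queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)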
def save_csv(f, tvseries):
    '''
    Output a CSV file containing the highest-ranking TV series.
    '''
    # write the header row of the CSV file
    unicodeWriter = UnicodeWriter(f)
    unicodeWriter.writerow(['Title', 'Ranking', 'Genre', 'Actors', 'Runtime'])

    # write the scraped data to the CSV file
    for row in tvseries:
        unicodeWriter.writerow(row)

def createCSV(url):
	# fetch the JSON payload and write one CSV file per object,
	# splitting its x/y/z values into rows
	content = urllib2.urlopen(url).read()
	objects = json.loads(content)
	for obj in objects:
		if len(obj['value']) > 2:
			timestamp = datetime.datetime.fromtimestamp(obj['timestamp'] / 1e3)
			with open(str(timestamp) + '.csv', 'wb') as csvfile:
				csvWriter = UnicodeWriter(csvfile)
				x = obj['value']['x'].split(' ')
				y = obj['value']['y'].split(' ')
				z = obj['value']['z'].split(' ')

				for i in xrange(len(x)):
					row = [x[i], y[i], z[i]]
					csvWriter.writerow(row)

    def expCSV(self, save=False):
        if not save:
            # ask the user for a table name (the dialog labels are Russian:
            # "Table name" / "Enter the table name")
            nameInput = QInputDialog.getText(QWidget(), u"Название таблицы",
                                             u"Введите название таблицы")
            if nameInput[1]:
                name = nameInput[0]
            else:
                return
        else:
            name = self.dlg.tabComboBox.currentText().replace('.csv', '')
        with open(self.plugin_dir + '/tab/' + name + '.csv', 'wb') as fout:
            writer = UnicodeWriter(fout, delimiter=';')
            for r in range(self.model.rowCount()):
                itemCode = self.model.item(r, 0).text()
                itemName = self.model.item(r, 1).text()
                itemSymb = self.model.item(r, 2).text()
                writer.writerow([itemCode, itemName, itemSymb])
Example #4
	def __init__( self ) :

		# 1) generate lookup tables from the source workbooks
		with xlrd.open_workbook( 'OOP_dislokace.xlsx', encoding_override="cp1251" ) as wb:
			sh = wb.sheet_by_index( 0 )
			self.areaLookup = AreaLookup( sh )

		with xlrd.open_workbook( 'kraje_ciselnik-stary.xls' ) as wb:
			sh = wb.sheet_by_index( 0 )
			self.regionLookup = RegionLookup( sh )

		with xlrd.open_workbook( 'okresy_ciselnik.xls' ) as wb:
			sh = wb.sheet_by_index( 0 )
			self.countyLookup = CountyLookup( sh )


		# 2) write the municipality-to-OOP mapping to obce.csv

		with xlrd.open_workbook( 'SPH_OBEC.xls' ) as wb:
			
			with open('obce.csv', 'wb') as f:

				sh = wb.sheet_by_index( 0 )
				numRows = sh.nrows
				print wb.encoding
				
				writer = UnicodeWriter( f )
				for rowIndex in range(numRows):

					if rowIndex > 0:  # skip the header row

						placeId = sh.row( rowIndex )[0].value
						placeName = sh.row( rowIndex )[5].value
						nuts = sh.row( rowIndex )[1].value
						
						#regionName = self.regionLookup.getNameByCode( nuts )
						countyName = self.countyLookup.getNameByCode( nuts )
						keyName = unicode( placeName ) + "-" + unicode( countyName )
						#oopName = self.countyLookup.getNameByCode( nuts )
						oopName = self.areaLookup.getParentByName( keyName )

						if oopName == -1:
							#print sh.row( rowIndex )
							print unicode( placeName ) + "," + unicode( nuts ) + "," + unicode( keyName ) + "," + unicode( oopName )
						
						writer.writerow( [ str( int(placeId) ), unicode( oopName ) ] )
Example #5
	def removeZeroRowsFromArray( origFile, newFile ):
		'''
		Copy rows from origFile to newFile, skipping rows whose value
		columns (everything after the first four and before the last
		column) are all zero.
		'''
		origFileReader = csv.reader( origFile )
		writer = UnicodeWriter( newFile )

		for row in origFileReader:

			# check if there is any non-zero value
			hasValue = False
			numColumns = len( row )
			for i in range( numColumns ):
				if i > 3 and i < ( numColumns - 1 ):
					value = float( row[i] )
					if value != 0:
						hasValue = True
						break

			if hasValue:
				writer.writerow( row )
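
A minimal way to drive the helper above, assuming it is reachable as a plain function (in the snippet it is indented as if it lives inside a class) and using placeholder CSV file names:

# Hypothetical usage; both file names are placeholders.
with open( 'crimeData-with-zeros.csv', 'rb' ) as origFile:
	with open( 'crimeData.csv', 'wb' ) as newFile:
		removeZeroRowsFromArray( origFile, newFile )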
Example #6
		# initialise both names so they exist even when the crime lookup fails
		czechCrime = ""
		englishCrime = ""
		if crime != -1:
			englishCrime = crime[3]
			a = englishCrime.split(" ")
			a[0] = a[0].capitalize()
			englishCrime = " ".join(a)

			czechCrime = crime[1]
			a = czechCrime.split(" ")
			a[0] = a[0].capitalize()
			czechCrime = " ".join(a)

		else:
			print row[0], row[1]

		finalRow = [ row[0], czechCrime, englishCrime, row[3] ]
		finalRows.append( finalRow )

with open( "crimeLookup2.csv", "wb" ) as csvfile:

	writer = UnicodeWriter( csvfile )

	for row in finalRows:

		rowArray = []
		for column in row:
			rowArray.append( unicode( column.decode( "utf-8" ) ) )

		writer.writerow( rowArray )
Example #7
	global kmls
	for row in csvreader:
		kmls.append( row[6] )

#create areaLookup
with xlrd.open_workbook( 'areaLookup.xls', encoding_override="utf-8" ) as wb:
	sh = wb.sheet_by_index( 0 )
	global areaLookup
	areaLookup = AreaLookupXls( sh )

with open('pridani-id-do-shp/shape-without-geo.csv', 'rb') as csvfile:
	csvreader = csv.reader(csvfile)

	with open('pridani-id-do-shp/shape-with-geo.csv', 'wb') as f:
		
		writer = UnicodeWriter( f )

		index = 0
		global kmls
		global areaLookup
		
		#for areaName in areaLookup.areas:
		#	print areaName

		for row in csvreader:
			name = row[1].lower().strip()
			code = areaLookup.getCodeByName( name )
			
			if index > 0:
				writer.writerow( [ unicode(row[1].decode("utf-8") ), unicode( kmls[index]) ] )
			index = index + 1
					if float( found ) > 0:
						solvedPerc =  ( float( solved ) / float( found )    ) * 100
					else: 
						solvedPerc = 0	
					finalRow.append( str( solvedPerc ) )
				else:
					finalRow.append( str( diff ) )			

		finalFile.append( finalRow )

		rowIndex = rowIndex + 1

print "3) writing final file with zeros "

with open('../generated/crimeData-2013:08:with-zeros.csv', 'wb') as csvfile:
	writer = UnicodeWriter(csvfile)

	for row in finalFile:
		writer.writerow( row )

print "4) writing final file without zeros "

with open('../generated/crimeData-2013:08.csv', 'wb') as csvfile:
	writer = UnicodeWriter(csvfile)

	for row in finalFile:

		#check if there is any value
		hasValue = False
		numColumns = len( row )
from UnicodeWriter import UnicodeWriter

# fetch list of scientwists' screennames
page = open("scientwists.htm")
soup = BeautifulSoup(page)

scientwists = soup.findAll('div', {"class" : 'scientwist'})
screennames = [sn.a.contents[0].encode('utf-8')
    for sn in scientwists]

#print screennames

# fetch rest of info from twitter, and write to spreadsheet
api = twitter.Api(username='******', password='******')
outfile = open('scientwists.csv', 'w')
csvout = UnicodeWriter(outfile, 'excel')

# column labels on first row
row = [
    'Screen Name',
    'Name',
    'Location',
    'Description',
    'Followers']
csvout.writerow(row)

for user in screennames:
    print user
    try:
        f = api.GetUser(user)
        """print 'screenname: ' + `f.GetScreenName()`
Example #10
class Generator:

    COLUMN_NAMES = [
        "id",
        "time",
        "area",
        "crime",
        "found",
        "found-end",
        "found-total",
        "solved",
        "solved-perc",
        "solved-additionally",
        "commited-drugged",
        "commited-alcohol",
        "commited-recidivst",
        "commited-under-15",
        "comitted-15-17",
        "comitted-under-18",
        "charged-total",
        "charged-recidivist",
        "charged-under-15",
        "charged-15-17",
        "charged-women",
        "damage-total",
        "damage-found",
    ]

    areaLookup = ""
    crimeLookup = ""
    timeLookup = ""
    writer = ""
    areaWriter = ""
    crimeWriter = ""
    timeWriter = ""
    recordId = 1
    files = []
    districtCrimeDataSheetsByCode = {}
    districtCrimeDataSheets = []

    # store areasheets with keys, so we can find them retrospectively to add another areasheet
    areaSheets = {}

    generateAreaLookup = False
    generateCrimeLookup = False
    generateTimeLookup = False
    generateCrimeData = True

    def __init__(self, year, month, omitZeroValues, onCompleteCallback):

        # null everything
        self.areaLookup = ""
        self.crimeLookup = ""
        self.timeLookup = ""
        self.writer = ""
        self.areaWriter = ""
        self.crimeWriter = ""
        self.timeWriter = ""
        self.recordId = 1

        self.year = year
        self.month = month
        self.omitZeroValues = omitZeroValues
        self.onCompleteCallback = onCompleteCallback

        # 1) a) generate area lookup
        with xlrd.open_workbook("areaLookup.xls") as wb:
            sh = wb.sheet_by_index(0)
            self.areaLookup = AreaLookup(sh)

        # b) time lookup
        with xlrd.open_workbook("timeLookup.xls") as wb:
            sh = wb.sheet_by_index(0)
            self.timeLookup = TimeLookup(sh)

        # 2) generate lookups

        # area lookup
        if self.generateAreaLookup:

            with open("../generated/AreaLookup.csv", "wb") as f:

                rows = self.areaLookup.generate()
                numRows = len(rows)

                self.areaWriter = UnicodeWriter(f)
                for rowIndex in range(numRows):
                    row = rows[rowIndex]
                    self.areaWriter.writerow(row)

        # crime lookup - is generated manually
        if self.generateCrimeLookup:

            with xlrd.open_workbook("../files/a______.xls", "wb") as wb:
                sh = wb.sheet_by_index(0)
                self.crimeLookup = CrimeLookup(sh)

                with open("../generated/CrimeLookup.csv", "wb") as f:

                    rows = self.crimeLookup.generate()
                    numRows = len(rows)

                    # 			self.crimeWriter = UnicodeWriter( f )
                    # 			for rowIndex in range( numRows ):
                    # 				row = rows[ rowIndex ]
                    # replace dot zero
                    # row[0] = row[0].replace( ".0","" )

                    # 				self.crimeWriter.writerow( [ row[0],row[1] ] )

        # time lookup
        if self.generateTimeLookup:

            with open("../generated/TimeLookup.csv", "wb") as f:

                rows = self.timeLookup.generate()
                numRows = len(rows)

                self.timeWriter = UnicodeWriter(f)
                for rowIndex in range(numRows):
                    row = rows[rowIndex]
                    self.timeWriter.writerow(row)

        # 3) walk the year/month folders and process all crime data files

        if self.generateCrimeData:

            # -files
            # -2003
            # -1
            # -2
            # -...
            # -2004
            # -...

            directory = "../files"

            # iterate through all year folders
            yearFolders = listdir(directory)

            for yearFolder in yearFolders:

                # check if is year we're interested in
                if str(yearFolder) == str(self.year):

                    # iterate through all month folders
                    monthsFolders = listdir(directory + "/" + yearFolder)

                    for monthFolder in monthsFolders:

                        # check if is month we're interested in
                        if str(monthFolder) == str(self.month):
                            # iterate through files in month folder
                            files = listdir(directory + "/" + yearFolder + "/" + monthFolder)

                            # get time period id
                            periodId = self.timeLookup.getTimeIdByYearAndMonth(int(yearFolder), int(monthFolder))

                            for file in files:
                                # check only for excel files
                                if ".xls" in file or ".xlsx" in file:
                                    # omit files with underscore
                                    if not "__L" in file and not "__R" in file and not "__X" in file:

                                        # temporary constraint to generate just one file
                                        # if "a0011__" in file :
                                        url = directory + "/" + yearFolder + "/" + monthFolder + "/" + file

                                        # create district sheet
                                        districtSheet = self.processFile(url, periodId)
                                        # temp
                                        if districtSheet:
                                            self.districtCrimeDataSheetsByCode[districtSheet.code] = districtSheet

                                            # self.files.append( self.processFile( url, periodId ) )

            # 4) add Letiste (airport) and train station records to their respective districts
            transports = [
                {"from": "x004110", "to": "0011"},
                {"from": "x064160", "to": "0602"},
                {"from": "x074170", "to": "0704"},
                {"from": "x174150", "to": "1706"},
                {"from": "x194130", "to": "1903"},
                # add train stations
                {"from": "x060050", "to": "0602"},
                {"from": "x070050", "to": "0707"},
            ]

            lenTransports = len(transports)
            for transportIndex in range(lenTransports):
                transport = transports[transportIndex]
                baseDistrictSheet = self.districtCrimeDataSheetsByCode[transport["to"]]
                addingDistrictSheet = self.districtCrimeDataSheetsByCode[transport["from"]]
                baseDistrictSheet.addDistrictCrimeDataSheet(addingDistrictSheet)

            # 5) generate all output rows from the district crime data
            rows = self.generate()

            # don't need all the objects any more
            self.clear()

            # 6) write to csv file
            fileName = str(self.year) + ":" + str("01-") + str(self.month)
            if not self.omitZeroValues:
                fileName = fileName + ":with-zeros"

            with open("../generated/crimeData-" + fileName + ".csv", "wb") as f:
                print "start writing file " + unicode(fileName)
                self.writer = UnicodeWriter(f)

                # write header
                # self.writer.writerow( self.COLUMN_NAMES )

                # write rest of the content
                numRows = len(rows)
                for rowIndex in range(numRows):

                    row = rows[rowIndex]
                    # print rows
                    self.writer.writerow(row)

                # complete callback
                if self.onCompleteCallback:
                    self.onCompleteCallback()

    def addRecordArray(self, sourceArr, arrToAdd):
        sourceLen = len(sourceArr)
        if sourceLen > 0:
            for i in range(sourceLen):
                # the first three columns just identify the record
                if i > 2:
                    sourceArr[i] = str(float(sourceArr[i]) + float(arrToAdd[i]))
        else:
            sourceArr = copy(arrToAdd)
        return sourceArr

    def processFile(self, fileUrl, timeId):

        # print "======= processing file: " + fileUrl + " ========== "

        with xlrd.open_workbook(fileUrl) as wb:

            numSheets = wb.nsheets

            fileRecords = []
            sheetRecords = []

            districtName = wb.sheet_by_index(0).row(4)[2].value
            areaName = wb.sheet_by_index(0).row(5)[2].value
            districtCode = str(self.areaLookup.getAreaCodeByName(districtName))

            # for Letiste (airport) or train station sheets, append the area name to make the lookup key more specific
            firstSheetName = wb.sheet_by_index(0).name
            if districtCode == "-1" or firstSheetName == "a060050" or firstSheetName == "a070050":
                # print "getting district code"
                districtName = districtName + "-" + areaName
                districtCode = str(self.areaLookup.getAreaCodeByName(districtName))

                # data records for districts
            districtCrimeDataSheet = DistrictCrimeDataSheet(districtCode, districtName, timeId)
            self.districtCrimeDataSheets.append(districtCrimeDataSheet)
            # self.districtCrimeDataSheetsByCode[ districtCode ] = districtCrimeDataSheet

            print unicode(districtName) + " - " + unicode(districtCode)

            if districtCode == "-1":  # districtCode is a string, so compare against "-1"
                print districtCode
                print districtName

            for sheetIndex in range(numSheets):

                # print "========= processing sheet " + str( sheetIndex ) + " ========="
                sheet = wb.sheet_by_index(sheetIndex)
                areaSheet = AreaCrimeDataSheet(sheet, timeId, self.omitZeroValues)
                districtCrimeDataSheet.addAreaCrimeDataSheet(areaSheet)

                # testing correct naming
                areaName = areaSheet.name
                # taking code straight from name of the sheet
                areaCode = areaSheet.code

                if areaCode == -1:
                    Logger.throwError("Unknown Area: " + unicode(areaCode) + "," + unicode(areaName))

                    # print unicode( areaName )
                print unicode(areaCode) + "," + unicode(areaName)

                # generatedRecords = areaSheet.generate()
                # print generatedRecords
                # lenRecords = len( generatedRecords )

                # for recordIndex in range( lenRecords ):
                # get single record
                ##	record = generatedRecords[ recordIndex ]
                # print "========= processing single record " + str( recordIndex ) + " ========="
                # 	record.insert( 0, str( timeId ) )
                # increment variable value
                # 	self.recordId += 1

                # fileRecords.extend( generatedRecords )

                # fileRecords.extend( districtCrimeDataSheet.generate() )

                # return fileRecords
            return districtCrimeDataSheet

    def generate(self):

        # print "======= generating ======="
        files = []
        lenDistrictSheet = len(self.districtCrimeDataSheets)

        for sheetIndex in range(lenDistrictSheet):
            # for districtSheetIndex in self.districtCrimeDataSheetsByCode:
            # print "========= new district ========== " + unicode( sheetIndex )

            districtSheet = self.districtCrimeDataSheets[
                sheetIndex
            ]  # self.districtCrimeDataSheetsByCode[ districtSheetIndex ]
            files.extend(districtSheet.generate())

        return files

    def clear(self):
        self.districtCrimeDataSheets = []
        self.districtCrimeDataSheetsByCode.clear()
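
The constructor above does all of the work itself, so driving the class is just a matter of instantiating it. A hypothetical sketch, using placeholder year/month values and assuming the areaLookup.xls, timeLookup.xls, ../files and ../generated paths the class expects already exist:

# Hypothetical driver; the year, month and callback are placeholders.
def onComplete():
    print "crime data generation finished"

generator = Generator(2013, 8, True, onComplete)  # year, month, omitZeroValues, onCompleteCallback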
Example #11
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# process a file of newline-delimited JSON objects into a CSV
import json
import csv
from UnicodeWriter import UnicodeWriter

with open('4s_venues.json', 'r') as f, open('output_file.csv', 'wb') as fp:

    writer = UnicodeWriter(fp)

    for i, line in enumerate(f):
        response = json.loads(line)

        uid = response['venue']['id']
        name = response['venue']['name']
        address = response['venue']['location']['address']
        city = response['venue']['location']['city']

        # write one row per venue
        writer.writerow([uid, name, address, city])
Example #12
		communityName = row[1].value
		#is population a float value?
		try:
			communityPopulation = locale.atof( str( row[9].value ) )
			#find corresponding community in lookup and add population
			global communityLookup
			communityLookup.addPopulationToCommunityByName( communityName, communityPopulation )

		except ValueError:
			#the population value contained a dash (no data)
			pass

#3) go through the whole community lookup and write it out
print "3) go through the whole community lookup and write it out"
with open('communities-with-pop.csv', 'wb') as f:
	writer = UnicodeWriter( f )
	
	global communityLookup
	numCommunities = len( communityLookup.communities )

	for communityName in communityLookup.communities:
		communityRecord = communityLookup.getCommunityByName( communityName )
		writer.writerow( communityRecord.generate() )

#4) create oop lookup  				
#print "4) create oop lookup"
#with open('oop.csv', 'rb') as csvfile:
	#csvreader = csv.reader(csvfile)

	#global oopLookup
	#oopLookup = OopLookup( csvreader )
Example #13
		#code = int( row[0].value )
		#zsjRecord = zsjLookup.getZsjByCode( code )

		#if zsjRecord != -1:
			
			#shapeLookup.zsjRecord = zsjRecord
			#oopRecord = oopLookup.getOop( zsjRecord.town, zsjRecord.county )
			#if oopRecord:
				#shapeLookup.oopRecord = oopRecord

			#print oopRecord
			#if not oopRecord: 
			#	print zsjRecord.town + "," + zsjRecord.zsj + "," + zsjRecord.county + "," + zsjRecord.region


#5) write the shape records (with ZSJ and OOP columns) to shape-with-zsj.csv
with open('shape-with-zsj.csv', 'wb') as f:
	print "start writing file"
	writer = UnicodeWriter( f )

	global shapeLookup
	records = shapeLookup.generate()
	lenRecords = len( records )

	#write header
	writer.writerow( [ "_ID,N,24,5", "_ID,N,24,5", "_GEO,C,192", "_KOD_LAU2,N,10,0", "OBEC_kod", "Obec", "Castobce_dil_kod", "Castobce_dil", "ZSJ_kod", "ZSJ", "Obvykle_bydlici", "Trvale_bydlici", "Kraj_kod", "Kraj", "Okres_kod", "Okres", "ORP_kod", "ORP", "OOP" ] )

	for index in range( lenRecords ):
		writer.writerow( records[ index ] )

Example #14
	global areasByCode
	csvReader = csv.reader(contactfile, delimiter=',')
	for row in csvReader:
		area = areasByCode.get( row[0], -1 )
		if area != -1:
			contacts[ area[0] ] = row
		else:
			print "cound't find contact"
			print row[0]

#part for normalizing the capitalization of the names
normalize = False
if normalize:

	with open('areas3.csv', 'wb' ) as f:
		writer = UnicodeWriter( f )

		for rowIndex in areas:
			row = areas[ rowIndex ]
			id = row[0]
			
			name = row[1].decode( "utf-8" ).lower().title()
			name = name.replace( "Oop", "OOP" )
			name = name.replace( "Mop", "MOP" )
			name = name.replace( "Úo".decode("utf-8"), "ÚO".decode("utf-8") )
			name = name.replace( "Křp".decode("utf-8"), "KŘP".decode("utf-8") )

			pop = row[2]
			contact = row[3]
			writer.writerow( [ unicode( id ), unicode( name ), unicode( pop ), unicode( contact ) ] )
Example #15
# substrings to construct our Amazon affiliate links and image URLs
link0 = "http://www.amazon.com/gp/product/"
link1 = "/ref=as_li_ss_tl?ie=UTF8&camp=1789&creative=390957&creativeASIN="
link2 = "&linkCode=as2&tag=buyusabrand-20"
pict0 = "http://ws.assoc-amazon.com/widgets/q?_encoding=UTF8&ASIN="
pict1 = "&Format=_SL160_&ID=AsinImage&MarketPlace=US&ServiceVersion=20070822&WS=1&tag=buyusabrand-20"

soup = BeautifulSoup(urllib2.urlopen("http://www.21usdeal.com/zh/"))
#soup = BeautifulSoup(open("21usdeal_test.html"))

titles = soup.findAll(attrs={'class' : "art-postheader"})
descrs = soup.findAll(attrs={'class' : "art-postcontent"})
links = soup.findAll(attrs={'class' : "morelink"})

with open('wordpress.csv', 'wb') as f:
    writer = UnicodeWriter(f, delimiter=",")
    writer.writerow(["csv_post_title", "csv_post_post", "csv_post_type", "csv_post_excerpt", "csv_post_categories", "csv_post_tags", "csv_post_date", "desc", "link", "picture"])
    
    for (title, descr, link) in zip(titles, descrs, links):
        link = link.a.next_sibling.get('href')
        
        # only run if this is an Amazon link
        if link0 in link :
            title = title.a.get_text(strip=True)
            descr = descr.get_text(strip=True)
            
            # extract the Amazon product ID
            asin = link.split('/')[5]

            link = ""+link0+asin+link1+asin+link2
            pict = ""+pict0+asin+pict1
Example #16
import urllib2
from xml.dom.minidom import parseString
import re
import csv
from UnicodeWriter import UnicodeWriter
from htmlentitydefs import name2codepoint 
pattern = re.compile(r'&(?:(#)(\d+)|([^;]+));')

output = open("merchant_category_information.csv", "wb")
UnicodeWriterObj = UnicodeWriter(output)
#csv_writer = csv.writer(output)
UnicodeWriterObj.writerow(['Updated On', 
                     'Merchant id',
                     'Merchant Name',
                     'averageConversionRate',
                     'averageCommission',
                     'logo',
                     'has_logo',
                     'category_id',
                     'category_name',
                     'domain_id',
                     'domain_name',
                     'country'])
for i in range(1, 624):
    info_pull_url = 'http://api-merchants.skimlinks.com/merchants/xml/639099dd7e85abb8717d17662901ecae/category/'+str(i)+'/limit/100000000'
    
    file = urllib2.urlopen(info_pull_url)
    data = file.read()
    file.close()
    dom = parseString(data)
    xmlTag = dom.getElementsByTagName('merchant')#[0].toxml()
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)

# Follow refresh 0, but don't hang on refresh > 0
br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(),
                      max_time=1)

# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux \
i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
# The site we will navigate into, handling its session
today = time.strftime('%Y_%m_%d')
file = open('craigsList_albanyga' + today + '.csv', 'wb')
UnicodeWriterObj = UnicodeWriter(file)
UnicodeWriterObj.writerow(['URL','Name','Location', 'Email', 'Description','Date','Phone'])

def main():
    #ExtractDetails('http://raleigh.craigslist.org/apa/3275803941.html')
    
    
    CreatedURL = 'http://albanyga.craigslist.org/apa/'
    ExtractLandingURL(str(CreatedURL))
    for i in range(100, 500, 100):
        SummaryURL = 'http://albanyga.craigslist.org/apa/index'+str(i)+'.html'
        print "*"*45
        print SummaryURL
        print "*"*45
        ExtractLandingURL(str(SummaryURL))
        