def loadfeeds(self):
    # For now I keep this utility command very basic; much more refactoring
    # will come in the future.
    # (This is a method of a Django management Command; imports of feedparser,
    # urllib, urlparse, re, datetime, time.mktime, GeoIP, BeautifulSoup (as bs),
    # placemaker, slugify and the feeds.models classes are assumed.)
    # Now we start sorting the feeds.
    feeds = Feed.objects.filter(enabled=True)
    for feed in feeds:
        self.stdout.write('\n***Parsing feed %s' % feed.name.encode('utf-8'))
        feed_parsed = feedparser.parse(feed.url_xml)
        for item_parsed in feed_parsed.entries:
            biased_link = item_parsed.link
            # The link must be restored to the original (in some cases it is:
            # http://news.google.com/news...&url=http://www.example.com).
            self.stdout.write('Biased link: %s' % biased_link)
            link = get_original_url(biased_link)
            # Then we unquote the url, e.g.
            # http://voria.gr/index.php?module%3Dnews%26func%3Ddisplay%26sid%3D56406
            # becomes http://voria.gr/index.php?module=news&func=display&sid=56406
            link = urllib.unquote(link)
            if len(link) > 255:
                self.stdout.write('\nItem link is more than 255 chars!\n%s\n' % link)
                link = link[:255]
            # Import only items not already in the database (check the item URL).
            items = Item.objects.filter(link=link)
            if len(items) == 0:
                # New item, we must import it!
                try:
                    self.stdout.write('\nImporting item %s' % item_parsed.title.encode('utf-8'))
                    # Store the item.
                    item = Item()
                    item.title = item_parsed.title[:255].encode('utf-8')
                    item.summary = item_parsed.summary.encode('utf-8')
                    item.link = link
                    item.feed = feed
                    item.updated = datetime.datetime.fromtimestamp(mktime(item_parsed.updated_parsed))
                    # 0. domain
                    # First, let's check the domain country.
                    parsed = urlparse.urlsplit(link)
                    domain_name = parsed.netloc.encode('utf-8')
                    g = GeoIP()
                    country_code = g.country(domain_name)['country_code']
                    self.stdout.write('\nDomain: %s, country code: %s' % (domain_name, country_code))
                    countries = Country.objects.filter(iso2=country_code)
                    country = None
                    if countries.count() > 0:
                        country = countries[0]
                    # Check if we need to add the domain to the database.
                    domains = Domain.objects.filter(name=domain_name)
                    if not domains:
                        self.stdout.write('\nAdding a new domain to the system: %s for this item.' % domain_name)
                        domain = Domain()
                        domain.name = domain_name
                        domain.country = country
                        domain.save()
                    else:
                        domain = domains[0]
                    # Assign the domain to the item.
                    item.domain = domain
                    # Save the item.
                    item.save()
                    # Define the text to be parsed.
                    text2parse = item.title + item.summary
                    # 1. keywords
                    # Note: keyword, person and tag names are used as regular
                    # expressions here, so any regex metacharacters they contain
                    # will be interpreted.
                    keywords = Keyword.objects.all()
                    for keyword in keywords:
                        if re.search(keyword.name, text2parse, re.IGNORECASE):
                            self.stdout.write('\n***Keyword %s is in this item.' % keyword.name)
                            keyword.item.add(item)
                    # 2. people
                    people = Person.objects.all()
                    for person in people:
                        if re.search(person.name, text2parse, re.IGNORECASE):
                            self.stdout.write('\n***Person %s is in this item.' % person.name)
                            person.item.add(item)
                    # 3. images
                    soup = bs(item.summary)
                    # Parse the item link so relative image src urls can be
                    # resolved against it below.
                    parsed = list(urlparse.urlparse(link))
                    for img in soup.findAll('img'):
                        alt = ''
                        if img.has_key('src'):
                            if img.has_key('alt'):
                                alt = img['alt']
                            if img['src'].lower().startswith('http'):
                                src = img['src']
                            else:
                                # TODO: proper src extraction from a relative url
                                src = urlparse.urlunparse(parsed)
                            image = Image()
                            image.src = src
                            image.alt = alt
                            image.item = item
                            image.save()
                    # 4. tags
                    tags = Tag.objects.all()
                    for tag in tags:
                        if re.search(tag.name, text2parse, re.IGNORECASE):
                            self.stdout.write('\n***Tag %s is in this item.' % tag.name)
                            item.tags.add(tag)
                    # 5. places
                    # Let's check if there are places linked to this item.
                    pm = placemaker('4BRX3JfV34E7uaK02MDR.5nn7EAw7DptfhbRTdrMQQjHbXVedgXfsQLaFWwp7fIm')
                    pm_places = pm.find_places(text2parse)
                    for pm_place in pm_places:
                        placename = pm_place.name.decode('utf-8')
                        # First check if we need to add a new place.
                        places = Place.objects.filter(name=placename)
                        if not places:
                            self.stdout.write('\nAdding a new place to the system: %s for this item.' % pm_place.name)
                            place = Place()
                            place.name = placename
                            place.slug = slugify(place.name)
                            place.geometry = 'POINT(%s %s)' % (pm_place.centroid.longitude, pm_place.centroid.latitude)
                            place.save()
                        else:
                            place = places[0]
                        # Add the item to the place.
                        place.items.add(item)
                except Exception, e:
                    self.stdout.write('\nThere was an error (%s) importing this item. Skipping to the next one...' % e)
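The get_original_url helper is not shown here. A minimal sketch of what it could look like, assuming all it needs to do is unwrap aggregator redirect links such as the Google News ones mentioned in the comments by extracting the url query parameter (the behavior is an assumption, not the actual helper):

import urlparse

# Sketch of the get_original_url helper (assumed behavior: unwrap redirect
# links like http://news.google.com/news...&url=http://www.example.com and
# return any other link unchanged).
def get_original_url(biased_link):
    query = urlparse.urlsplit(biased_link).query
    params = urlparse.parse_qs(query)
    # If the aggregator embedded the target in a url parameter, use it.
    if 'url' in params and params['url']:
        return params['url'][0]
    return biased_link

For a plain link with no url parameter, get_original_url simply returns its input, so it is safe to call on every feed entry.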
import csv

from django.contrib.gis.geos import Point
from django.template.defaultfilters import slugify

from feeds.models import Place

# Standalone script to bulk load Place records from a GeoNames dump
# (allCountries.txt): column 1 is the name, 4 the latitude, 5 the longitude.
with open('/home/capooti/temp/allCountries.txt', 'rb') as csvfile:
    Place.objects.all().delete()
    # GeoNames fields are tab separated and unquoted, so disable csv quoting.
    csvreader = csv.reader(csvfile, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in csvreader:
        name = row[1].decode('utf-8')  # GeoNames files are utf-8 encoded
        lat = float(row[4])
        lon = float(row[5])
        print name.encode('utf-8'), lat, lon
        p = Place()
        p.name = name
        p.slug = slugify(name)
        p.geometry = Point(lon, lat)
        p.save()
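A quick sanity check after running the loader, for example from the Django shell (the place name queried here is just an illustrative example):

from feeds.models import Place

# How many places were imported?
print Place.objects.count()
# Inspect a few sample records; any GeoNames place name works here.
for place in Place.objects.filter(name='Athens')[:3]:
    print place.slug, place.geometry.x, place.geometry.y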