def populate(self, tree):
    """Create or update one DmozCategory for every Topic element in ``tree``.

    ``tree`` must expose ``getroot()``; every ``Topic`` child of the root
    (in the ``ROOT_SPEC`` namespace) is turned into an attribute dict and
    handed to ``create_or_update`` keyed on its ``topic_id``.

    For each ``altlang`` child whose language (the part of the ``resource``
    attribute before the first colon) is in ``DESIRED_LANGS``, the part
    after the colon, with every ``World/`` removed, is stored as
    ``es_alt``.  When several entries match, the last one wins.

    Raises:
        CommandError: if ``create_or_update`` fails for a topic.
        AttributeError: if a topic lacks one of the Title, catid,
            lastUpdate or Description children (``find`` returns None),
            or an ``altlang`` element has no ``resource`` attribute.
    """
    root = tree.getroot()

    for topic in root.findall('{%s}Topic' % ROOT_SPEC):
        attributes = {
            'topic_id': topic.get('{%s}id' % r),
            'title': topic.find('{%s}Title' % d).text,
            'dmoz_code': topic.find('{%s}catid' % ROOT_SPEC).text,
            'last_updated': topic.find('{%s}lastUpdate' % ROOT_SPEC).text,
            'description': topic.find('{%s}Description' % d).text,
            'es_alt': "",
        }

        es_alt = ""
        for res in topic.findall('{%s}altlang' % ROOT_SPEC):
            # Split on the FIRST colon only: a two-name unpack of
            # split(':') raised ValueError whenever the resource held
            # no colon or more than one.
            lang, _sep, url = res.get('{%s}resource' % r).partition(':')
            if lang.lower() in DESIRED_LANGS:
                es_alt = url.replace(u'World/', u'')
        if es_alt:
            attributes['es_alt'] = es_alt

        # Create or update the category; any failure aborts the command.
        try:
            create_or_update(
                attributes, {'topic_id': attributes['topic_id']}, DmozCategory
            )
        except Exception as e:
            # ``message`` only exists on Python 2 exceptions and is empty
            # for multi-argument ones; fall back to str(e) so the real
            # cause is always reported.
            raise CommandError(getattr(e, 'message', None) or str(e))
def handle(self, dir=os.path.join(settings.DATA_PATH, "Top"), *args, **options):
    """Load JSON document surrogates found under ``dir``, then index them.

    Every file below ``dir`` is JSON-decoded and stored through
    ``create_or_update`` as a ``DocumentSurrogate`` keyed on its
    ``origin`` URL.  The matching ``DmozCategory`` is attached when exactly
    one exists.  Afterwards ``manage.py index --rebuild`` is run and a
    summary mail is attempted; both of those steps are best-effort and
    only log on failure.

    Args:
        dir: root directory to walk; defaults to ``<DATA_PATH>/Top``.
        *args, **options: accepted for the command interface, unused here.

    Raises:
        KeyError: if a decoded file has no ``"category"`` key, or if it has
            content and the ``HOME`` environment variable is unset.
    """
    asctime_format = "%a %b %d %H:%M:%S %Y"
    db_format = "%Y-%m-%d %H:%M:%S"
    parsed = 0      # files that were decoded and processed
    with_text = 0   # files saved with a non-empty text body

    for dirpath, _dirnames, filenames in os.walk(dir):
        for filename in filenames:
            # ``with`` closes the file even when decoding fails.
            try:
                with open(os.path.join(dirpath, filename), "r") as source:
                    info = json.load(source)
            except Exception as e:
                logging.error(
                    'Exception "%s" while json-decoding file %s' % (e, filename),
                    exc_info=True,
                )
                # Skip this file: without this, ``info`` is either undefined
                # or still holds the previous file's data.
                continue

            # Get the category.  The last character of the category string
            # is dropped before the lookup (presumably a trailing slash --
            # TODO confirm against the data files).
            try:
                cat = DmozCategory.objects.get(
                    Q(topic_id="Top/%s" % info["category"][:-1])
                    | Q(es_alt="Top/%s" % info["category"][:-1])
                )
            except MultipleObjectsReturned:
                logging.error(
                    "There are multiple entries for category Top/%s !" % info["category"]
                )
                cat = None
            except DmozCategory.DoesNotExist:
                logging.error("There is no such category: Top/%s !" % info["category"])
                cat = None

            # Normalise the retrieval date; fall back to "now" when the
            # stored value is missing or cannot be parsed.
            try:
                date_added = datetime.strptime(
                    info.get("retrieved_on", time.asctime()), asctime_format
                ).strftime(db_format)
            except (TypeError, ValueError):
                logging.error("Error parsing date for file %s" % filename)
                # Take the current time directly instead of re-parsing
                # time.asctime(), which could itself fail unguarded.
                date_added = datetime.now().strftime(db_format)

            attrs = {
                "title": info.get("name", ""),
                "origin": info.get("url", ""),
                "summary": info.get("description", ""),
                "added": date_added,
                "type": info.get("type", "html"),
                "text": "",
                "lang": info.get("lang", "en"),
            }
            if cat:
                attrs["category_id"] = cat.pk

            # Get the contents from a file.
            # NOTE (kept from the original): this step uses a lot of memory.
            raw_content = info.get("content")
            if raw_content:
                content = cleanup(raw_content.replace("$HOME", os.environ["HOME"]))
                if content:
                    attrs["text"] = content
                else:
                    logging.info("No content could be parsed from file %s" % filename)
            else:
                logging.info("Document surrogate %s has no content!" % filename)

            try:
                create_or_update(
                    attrs, {"origin": attrs["origin"]}, DocumentSurrogate, False
                )
            except Exception:
                logging.error(
                    "Exception while saving file %s to db" % filename, exc_info=True
                )
            else:
                if attrs["text"]:
                    with_text += 1
            parsed += 1

    summary = "Parsed %s documents \n And added %s documents to the database" % (
        parsed,
        with_text,
    )
    logging.info(summary)

    logging.info("Now, trying to index them...")
    try:
        exit_code = subprocess.call(
            ["%s/manage.py" % os.environ["PWD"], "index", "--rebuild"]
        )
    except Exception:
        logging.error("Error indexing the files", exc_info=True)
    else:
        # Report success only when the indexer really succeeded; this used
        # to be logged from a ``finally`` clause, even after a failure.
        if exit_code == 0:
            logging.info("Files loaded and indexed!")
        else:
            logging.error("Error indexing the files (exit code %s)" % exit_code)

    # Best-effort summary mail.
    try:
        import smtplib

        mailer = smtplib.SMTP()
        try:
            mailer.connect()
            mailer.sendmail("root@localhost", "*****@*****.**", summary)
            mailer.quit()
        finally:
            # Release the socket even if connect/sendmail/quit failed;
            # close() is harmless when nothing is open.
            mailer.close()
    except Exception:
        logging.info("Could not send mail... :(")