Ejemplo n.º 1
0
 def populate(self, tree):
     """Create or update one ``DmozCategory`` per ``Topic`` element in *tree*.

     For every ``Topic`` child of the tree root, the id, title, category id,
     last-update stamp and description are collected into an attribute
     dict and handed to ``create_or_update``, keyed on ``topic_id``.

     ``es_alt`` is filled from the topic's ``altlang`` children: each
     ``resource`` attribute is read as ``"<language>:<path>"`` and, when the
     lower-cased language is in ``DESIRED_LANGS``, the path (with every
     ``World/`` occurrence removed) is stored. If several entries match,
     the last one wins; if none match, ``es_alt`` stays empty.

     NOTE(review): ``ROOT_SPEC``, ``r`` and ``d`` are module-level names
     that are not visible here -- presumably XML namespace URIs; confirm.
     NOTE(review): a topic lacking one of the looked-up child elements
     makes ``find()`` return ``None`` (assuming an ElementTree-style API)
     and therefore raises ``AttributeError`` on ``.text``.

     :param tree: parsed document exposing ``getroot()``.
     :raises CommandError: if ``create_or_update`` fails for any topic.
     """
     root = tree.getroot()
     for topic in root.findall('{%s}Topic' % ROOT_SPEC):
         attributes = {
             'topic_id': topic.get('{%s}id' % r),
             'title': topic.find('{%s}Title' % d).text,
             'dmoz_code': topic.find('{%s}catid' % ROOT_SPEC).text,
             'last_updated': topic.find('{%s}lastUpdate' % ROOT_SPEC).text,
             'description': topic.find('{%s}Description' % d).text,
             'es_alt': "",
         }
         for res in topic.findall('{%s}altlang' % ROOT_SPEC):
             resource = res.get('{%s}resource' % r)
             if not resource:
                 # No resource attribute: nothing to split.
                 continue
             # Split on the FIRST colon only, so a path that itself contains
             # a colon no longer breaks the two-value unpacking.
             lang, sep, url = resource.partition(':')
             if not sep:
                 # Not in "<language>:<path>" form; skip instead of crashing.
                 continue
             if lang.lower() in DESIRED_LANGS:
                 attributes['es_alt'] = url.replace(u'World/', u'')
         # Create or update the category; any failure aborts the command.
         try:
             create_or_update(attributes, {'topic_id': attributes['topic_id']}, DmozCategory)
         except Exception as e:
             # str(e) works on both Python 2 and 3; e.message is Python 2 only.
             raise CommandError(str(e))
Ejemplo n.º 2
0
    def handle(self, dir=os.path.join(settings.DATA_PATH, "Top"), *args, **options):
        """Load JSON document surrogates found under *dir* into the database.

        Walks *dir* recursively and, for every file:

        1. decodes it as JSON (files that cannot be read or decoded are
           logged and skipped);
        2. looks up the ``DmozCategory`` whose ``topic_id`` or ``es_alt``
           equals ``"Top/" + info["category"]`` minus its last character;
        3. builds the attribute dict (title, origin, summary, added, type,
           text, lang and, when a category was found, ``category_id``);
        4. saves it through ``create_or_update`` keyed on ``origin``.

        Afterwards it runs ``manage.py index --rebuild`` from ``$PWD`` and
        tries to mail the summary through a local SMTP server. Indexing and
        mailing failures are logged, never raised.

        :param dir: root directory to walk (name kept for caller
            compatibility even though it shadows the builtin).
        :raises KeyError: if a decoded file has no ``"category"`` key.
        """
        asctime_format = "%a %b %d %H:%M:%S %Y"
        timestamp_format = "%Y-%m-%d %H:%M:%S"
        parsed = 0  # files decoded and processed
        added = 0  # files saved with non-empty text

        for dirpath, dirnames, filenames in os.walk(dir):
            for filename in filenames:
                try:
                    # The context manager closes the file even if decoding fails.
                    with open(os.path.join(dirpath, filename), "r") as f:
                        info = json.load(f)
                except Exception as e:
                    logging.error(
                        'Exception "%s" while json-decoding file %s', e, filename, exc_info=True
                    )
                    # Without this, `info` is either undefined or still holds
                    # the previous file's data, which would be saved again.
                    continue

                # get the category:
                # NOTE(review): [:-1] presumably strips a trailing "/" -- confirm.
                topic_path = "Top/%s" % info["category"][:-1]
                try:
                    cat = DmozCategory.objects.get(Q(topic_id=topic_path) | Q(es_alt=topic_path))
                except MultipleObjectsReturned:
                    logging.error("There are multiple entries for category Top/%s !", info["category"])
                    cat = None
                except DmozCategory.DoesNotExist:
                    logging.error("There is no such category: Top/%s !", info["category"])
                    cat = None

                # Normalise the retrieval stamp; fall back to the current time.
                # The fallback formats datetime.now() directly rather than
                # re-parsing time.asctime(), so it cannot fail in its own right.
                if "retrieved_on" in info:
                    try:
                        date_added = datetime.strptime(
                            info["retrieved_on"], asctime_format
                        ).strftime(timestamp_format)
                    except (TypeError, ValueError):
                        logging.error("Error parsing date for file %s", filename)
                        date_added = datetime.now().strftime(timestamp_format)
                else:
                    date_added = datetime.now().strftime(timestamp_format)

                attrs = {
                    "title": info.get("name", ""),
                    "origin": info.get("url", ""),
                    "summary": info.get("description", ""),
                    "added": date_added,
                    "type": info.get("type", "html"),
                    "text": "",
                    "lang": info.get("lang", "en"),
                }
                if cat:
                    attrs["category_id"] = cat.pk

                # get the contents from a file:
                # NOTE: per the original author, this step uses a lot of memory.
                if info.get("content"):
                    content = cleanup(info["content"].replace("$HOME", os.environ["HOME"]))
                    if content:
                        attrs["text"] = content
                    else:
                        logging.info("No content could be parsed from file %s", filename)
                else:
                    logging.info("Document surrogate %s has no content!", filename)

                try:
                    create_or_update(attrs, {"origin": attrs["origin"]}, DocumentSurrogate, False)
                    if attrs["text"]:
                        added += 1
                except Exception:
                    # Best effort: one bad document must not abort the batch.
                    logging.error("Exception while saving file %s to db", filename, exc_info=True)
                parsed += 1

        summary = "Parsed %s documents \n And added %s documents to the database" % (parsed, added)
        logging.info(summary)
        logging.info("Now, trying to index them...")
        try:
            exit_code = subprocess.call(["%s/manage.py" % os.environ["PWD"], "index", "--rebuild"])
        except Exception:
            logging.error("Error indexing the files", exc_info=True)
        else:
            # subprocess.call reports failure via the exit status, not an
            # exception, so success is only logged for a zero status.
            if exit_code == 0:
                logging.info("Files loaded and indexed!")
            else:
                logging.error("Error indexing the files (exit status %s)", exit_code)

        try:
            import smtplib

            mailer = smtplib.SMTP()
            mailer.connect()
            try:
                mailer.sendmail("root@localhost", "*****@*****.**", summary)
            finally:
                # Always end the SMTP session so the connection is not leaked.
                mailer.quit()
        except Exception:
            logging.info("Could not send mail... :(", exc_info=True)