Example #1
0
    def save_task(self):
        """Read generated task lines from the output file, validate each
        URI, and persist the deduplicated set as CrawlerTask documents.

        Each line of the output file must be a JSON (or Python-literal)
        dict with a mandatory 'uri' key and an optional 'args' key.
        Invalid lines are logged via CrawlerGeneratorErrorLog and
        skipped.  The output file is removed after processing and a
        success entry is written to the generate log.
        """
        uris = []
        val = URLValidator(self.schemes)
        # 'with' guarantees the file is closed even if a line blows up.
        with open(self.out_path, "r") as out_f:
            for line in out_f:
                self.content_bytes += len(line)
                try:
                    js = json.loads(line)
                except ValueError:
                    js = ast.literal_eval(line)
                    if not isinstance(js, dict):
                        CrawlerGeneratorErrorLog(name="ERROR_URI",
                                                 content="The line %s is not dict or json" % (line),
                                                 hostname=socket.gethostname()).save()
                        continue
                uri_data = []
                try:
                    # BUGFIX: the original tested "'uri' not in js" and then
                    # read js['uri'], which always raised KeyError; the
                    # condition must be positive.
                    if 'uri' in js:
                        uri_de = js['uri'].encode("utf-8")
                        val(uri_de)  # raises ValidationError on bad URIs
                        uri_data.append(uri_de)
                        # BUGFIX: likewise inverted in the original.
                        if 'args' in js:
                            uri_data.append(js['args'].encode("utf-8"))
                        uris.append(str(uri_data))
                    else:
                        CrawlerGeneratorErrorLog(name="ERROR_JSON",
                                                 content="JSON ValidationError without key 'uri' : %s" % (js),
                                                 hostname=socket.gethostname()).save()
                except ValidationError:
                    CrawlerGeneratorErrorLog(name="ERROR_URI",
                                             content="URI ValidationError: %s " % (js['uri']),
                                             hostname=socket.gethostname()).save()
        os.remove(self.out_path)
        dereplicated_uris = dereplicate_uris(uris)

        for uri_str in dereplicated_uris:
            try:
                # literal_eval is safe (no code execution) and sufficient:
                # uri_str is str() of a plain list of strings built above.
                uri_parts = ast.literal_eval(uri_str)
                uri = uri_parts[0]
                args = uri_parts[1] if len(uri_parts) > 1 else "No more args"
                CrawlerTask(job=self.job,
                            task_generator=self.task_generator,
                            uri=uri,
                            args=args,
                            from_host=socket.gethostname()).save()
            except Exception:
                # Log the full traceback instead of letting one bad task
                # abort the whole batch (was a bare "except:").
                content = traceback.format_exc(10)
                CrawlerGeneratorErrorLog(name="ERROR_URI", content=content, hostname=socket.gethostname()).save()
        self.save_generate_log(CrawlerGeneratorLog.STATUS_SUCCESS, "After generating, save task succeed!")
Example #2
0
 def save(self, text=None, script=None, settings=None):
     """ save uri(schemes) or script(cron) to mongodb server

         Exactly one of *text* / *script* is expected.  *settings* may
         carry an optional 'schemes' list and, for scripts, mandatory
         'cron' and 'code_type' entries.  Failures are recorded as
         CrawlerGeneratorErrorLog documents; the method itself always
         returns None.
     """
     if text is not None:
         try:
             # BUGFIX: settings defaults to None; the original called
             # settings.pop() unconditionally and crashed with
             # AttributeError when no settings were supplied.
             schemes = settings.pop('schemes', None) if settings else None
             assert self.save_text(text, schemes)
         except AssertionError as e:
             content = "%s : Error occured when saving text" % (type(e))
             CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content, hostname=socket.gethostname()).save()
     elif script is not None:
         # Same None-guard: membership tests on None raise TypeError.
         if not settings or 'cron' not in settings:
             content = "cron is not found in settings"
             CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content, hostname=socket.gethostname()).save()
             return
         if 'code_type' not in settings:
             content = "code type is not found in settings"
             CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content, hostname=socket.gethostname()).save()
             return
         try:
             schemes = settings.pop('schemes', None)
             assert self.save_script(script, settings['cron'], settings['code_type'], schemes)
         except AssertionError:
             CrawlerGeneratorErrorLog(name="ERROR_SAVE",
                                      content="Error occured when saving script ",
                                      hostname=socket.gethostname()).save()
     else:
         CrawlerGeneratorErrorLog(name="ERROR_SAVE",
                                  content="No text or script found .",
                                  hostname=socket.gethostname()).save()
Example #3
0
 def __init__(self, job_id):
     """Fetch the Job document for *job_id* and initialise the dispatcher.

     Raises KeyError (after writing an error-log document) when the
     lookup raises.
     """
     super(GeneratorDispatch, self).__init__()
     try:
         job_doc = Job.objects.with_id(job_id)
     except Exception:
         message = "Can't find job_id: %s in mongodb!" % (job_id)
         CrawlerGeneratorErrorLog(name="ERROR_JOB",
                                  content=message,
                                  hostname=socket.gethostname()).save()
         raise KeyError("Can't find job_id: %s in mongodb!" % (str(job_id)))
     self.job_id = job_id
     self.job = job_doc
     self.priority = job_doc.priority
Example #4
0
 def save_script(self, script, cron, code_type=1, schemes=None):
     """ saving script with cron settings to mongodb
         if params are None or saving excepts return False
         else return True

     :param script: script source text; must not be None.
     :param cron: crontab schedule string, validated with CronSlices.
     :param code_type: numeric code-type flag stored on the generator.
     :param schemes: optional list of extra URI schemes (defaults to []).
     """
     # BUGFIX: default changed from a shared mutable [] to None to avoid
     # the mutable-default-argument pitfall; callers passing nothing see
     # identical behaviour.
     if schemes is None:
         schemes = []
     if script is None:
         content = "ScriptError : Error occured when saving script with job!"
         CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content, hostname=socket.gethostname()).save()
         return False
     if not CronSlices.is_valid(cron):
         content = "CronError : Error occured when saving cron with job!"
         CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content, hostname=socket.gethostname()).save()
         return False
     self.extend_schemes(schemes)
     try:
         CrawlerTaskGenerator(job=self.job, code=script, cron=cron, code_type=code_type, schemes=self.schemes).save()
     except Exception as e:
         content = "%s : Error occured when saving script with job!" % (e)
         CrawlerGeneratorErrorLog(name="ERROR_SAVE", content=content, hostname=socket.gethostname()).save()
         return False
     return True
Example #5
0
 def __init__(self, job_id):
     """Resolve *job_id* to its Job document and set up generator state.

     Raises KeyError (after logging) when the job lookup raises.
     """
     # Blank slate; uris/script are populated later.
     self.uris = None
     self.script = None
     self.schemes = ['http', 'https', 'ftp', 'ftps']
     try:
         job_doc = Job.objects.with_id(job_id)
     except Exception as err:
         msg = "%s : Can't find job_id: %s in mongodb!" % (type(err), job_id)
         logging.error(msg)
         CrawlerGeneratorErrorLog(name="ERROR_JOB", content=msg, hostname=socket.gethostname()).save()
         raise KeyError("Can't find job_id: %s in mongodb!" % (str(job_id)))
     self.job_id = job_id
     self.job = job_doc
     self.failed_uris = []
Example #6
0
 def __init__(self, task_generator):
     """Bind this run to *task_generator* and set up logging/bookkeeping.

     Raises ValueError (after writing an error-log document) when the
     generator document has no usable job reference.
     """
     self.task_generator = task_generator
     try:
         self.job = task_generator.job
     except Exception:
         CrawlerGeneratorErrorLog(name="ERROR_JOB",
                                  content="Can't find job id in task generator document!",
                                  hostname=socket.gethostname()).save()
         raise ValueError("Can't find job id in task generator document")
     self.schemes = task_generator.schemes
     # Scratch file that collects generated task lines for this run.
     self.out_path = "/tmp/task_generator_%s" % str(task_generator.id)
     # Hostname truncated to 16 chars — presumably a field-length limit;
     # TODO confirm against the CrawlerGeneratorLog schema.
     self.hostname = socket.gethostname()[:16]
     self.generate_log = CrawlerGeneratorLog(job=self.job,
                                             task_generator=self.task_generator,
                                             hostname=self.hostname)
     self.start_time = time.time()
     self.end_time = None
     self.content_bytes = 0
Example #7
0
 def save_text(self, text, schemes=None):
     """Parse URIs out of *text* and bulk-insert them as CrawlerTask
     documents.

     :param text: raw string containing one or more URIs.
     :param schemes: optional list of extra allowed URI schemes.
     :return: True on success, False if the bulk insert failed.
     """
     uris = self.read_from_strings(text, schemes)
     host = socket.gethostname()  # hoisted: identical for every task
     bulk = [CrawlerTask(job=self.job, uri=uri, from_host=host) for uri in uris]
     try:
         # Guard: an empty bulk insert is pointless and may raise.
         if bulk:
             CrawlerTask.objects.insert(bulk)
     except Exception as e:  # was py2-only "except Exception, e"
         CrawlerGeneratorErrorLog(name="ERROR_SAVE",
                                  content="%s : Error occured when saving uris." % (type(e)),
                                  hostname=socket.gethostname()).save()
         return False
     # BUGFIX: callers do "assert self.save_text(...)"; the original
     # implicitly returned None, so that assert always failed.
     return True
Example #8
0
 def __validate_uris(self, uri_list, schemes=None):
     """ validate uri from uri list,
         return valid uri list

         Each entry may itself hold several ';'-separated URIs (e.g. a
         csv cell); every candidate is stripped and checked with
         URLValidator.  Failures are logged as CrawlerGeneratorErrorLog
         documents and collected in self.failed_uris.
     """
     uris = []
     # Anything that is not a list is rejected wholesale.
     if not isinstance(uri_list, list):
         return uris
     # Merge caller-supplied schemes into self.schemes before validating.
     self.extend_schemes(schemes)
     val = URLValidator(self.schemes)
     for uri in uri_list:
         # NOTE(review): the inner loop deliberately reuses (shadows)
         # the outer variable name 'uri' — works, but fragile.
         for uri in uri.split(";"):
             if uri:
                 try:
                     # for csv file
                     uri = uri.strip()
                     val(uri)  # raises ValidationError on bad URIs
                     uris.append(uri)
                 except ValidationError, e:
                     content = "%s : URI ValidationError: %s" % (type(e), uri)
                     CrawlerGeneratorErrorLog(name="ERROR_URI",
                                              content=content,
                                              hostname=socket.gethostname()).save()
                     self.failed_uris.append(uri)