def do(self):
    """Poll the crowd-sourcing service for answers and process them.

    Phase 1: for every started ('S') task, fetch its payload from
    ``task_uri``; when an answer is present, store it, link (or create)
    the answering Worker, and mark the task done ('D').

    Phase 2: for every done ('D') task, decode the stored answer JSON
    and attach (creating where needed) its Keyword objects, then mark
    the task processed ('P').
    """
    logger.info('Getting open tasks from DB')
    opentasks = Task.objects.filter(status='S')  # get all started tasks
    for t in opentasks:
        payload = json.load(urllib2.urlopen(t.task_uri + '?format=json'))
        logger.info("Checking: " + payload['resource_uri'])
        if payload['answer'] != 'NULL':  # save answer when there is one
            t.answer = payload['answer']
            t.status = 'D'  # set task to done (enables processing below)
            try:  # see if the worker is already known
                w = Worker.objects.get(worker_uri=payload['worker'])
                t.worker_id = w.id  # set Task-Worker relation
                logger.info("worker is already known: id = " + str(w.id))
                logger.info("worker " + str(w.id) + " did task " + str(t.id))
            except Worker.DoesNotExist:  # create new worker
                newworker = Worker(worker_uri=payload['worker'])
                # save before assigning to t because newworker has no id yet
                newworker.save()
                t.worker_id = newworker.id
                logger.info("new worker created: id = " + str(newworker.id))
                logger.info("worker " + str(newworker.id) + " did task " + str(t.id))
            t.save()

    # Process Task Type 1
    opentasks = Task.objects.filter(status='D')  # get all done tasks
    logger.info('Getting done tasks from DB. ' + str(opentasks.count()) + ' elements found')
    for t in opentasks:
        logger.info("Processing Task " + str(t.id))
        try:
            # BUG FIX: t.answer holds a JSON *string*; it must be decoded
            # before subscripting. The previous `answer = t.answer` made
            # answer['keywords'] below raise TypeError and left the
            # ValueError handler dead code.
            answer = json.loads(t.answer)
        except ValueError:
            logger.error("the answer field of task " + str(t.id) + " does not contain a valid JSON Format. Skipping.")
            continue  # without valid JSON there is no point considering this answer
        # TODO: Implement quality control and post task again
        for kw in answer['keywords'].keys():
            try:
                keyword = Keyword.objects.get(text=kw)
                logger.info('Keyword "' + kw + '" already in DB')
                t.keywords.add(keyword)
                logger.info('Keyword "' + kw + '" assigned to Task' + str(t.id))
            except Keyword.DoesNotExist:
                newkeyword = Keyword(text=str(kw), category=answer['keywords'][kw])
                newkeyword.save()
                t.keywords.add(newkeyword)
                logger.info('new keyword "' + kw + '" created and assigned to Task' + str(t.id))
        t.status = 'P'  # set status to processed
        t.save()
def words_split():
    """Split words for jobs that exist in HBase but not yet in MySQL.

    Builds the set of job ids present in HBase but missing from the Job
    table, then for each such job submits its text to the worker pool,
    creating any unseen Keyword rows and caching them in the module-level
    ``keywords`` dict along the way.
    """
    global keywords
    # Job ids already stored in MySQL (flat=True yields the ids directly
    # instead of 1-tuples, replacing the manual accumulation loop).
    oldidset = set(Job.objects.values_list('jobId', flat=True))
    # Job ids currently stored in HBase.
    try:
        newidset = hbase_tool.getalljobid()
    except BrokenPipeError as e:
        print(e.strerror)
        return
    # TODO: revisit — only process ids that were not imported yet
    newset = newidset - oldidset
    print("start split words")
    # Pre-load all Keyword rows into the module-level cache.
    for kw in Keyword.objects.all():
        keywords[kw.keyword] = kw
    for job_id in newset:  # renamed from `id` to avoid shadowing the builtin
        keyword = hbase_tool.getkeyword_byjobid(job_id)
        s = str(hbase_tool.getjobinfo_byjobid(job_id)).strip()
        # Create the keyword if it is not in the cache yet.
        if keywords.get(keyword) is None:
            print("new keyword : ", keyword)
            newkeyword = Keyword()
            newkeyword.keyword = keyword
            newkeyword.save()
            # BUG FIX: cache the freshly saved row directly. The previous
            # Keyword.objects.get(keyword__contains=keyword) re-query could
            # match a different, longer keyword or raise
            # MultipleObjectsReturned.
            keywords[keyword] = newkeyword
        executor.submit(thread_deal, s, keyword)
def process_task_answers():
    """
    Go through all unprocessed ('D') tasks and add new keywords,
    sentiments, workers and their relations.

    Applies simple quality control: a worker is penalised (score -= 1)
    when a submitted keyword has a minority category, is absent from the
    feed text, or when a sentiment score deviates strongly from the
    median of existing scores.
    """
    logger.info("start processing... ")
    opentasks = Task.objects.filter(status='D')  # get all done tasks
    logger.info('Getting done tasks from DB. ' + str(opentasks.count()) + ' elements found')
    for t in opentasks:
        logger.info("Processing Task " + str(t.id))
        try:
            answer = json.loads(t.answer)
        except ValueError:
            logger.error("the answer field of task " + str(t.id) + " does not contain a valid JSON Format. Skipping.")
            continue  # without valid JSON there is no point considering this answer
        # TODO: Implement quality control and post task again

        # Process Task Type 1: keyword-extraction answers
        if t.question == "Question1":
            logger.info("processing question1... ")
            for kw in answer['keywords'].keys():
                try:
                    keyword = Keyword.objects.get(text=kw, category=answer['keywords'][kw])
                    logger.info('Keyword "' + kw + '" already in DB')
                    t.keywords.add(keyword)   # add Task --> Keyword relationship
                    keyword.feed.add(t.feed)  # add Keyword --> Feed relationship
                    keyword.save()
                    logger.info('Keyword "' + kw + '" assigned to Task' + str(t.id))
                    # Keywords with the same text but a different category.
                    keyword_inverse = Keyword.objects.filter(text=kw).exclude(category=answer['keywords'][kw])
                    if len(keyword_inverse) == 0:
                        logger.info("no inverse keyword found. skipping")
                        continue
                    elif keyword.task_set.count() / (keyword_inverse[0].task_set.count() + keyword.task_set.count()) <= 0.34:
                        # Minority category --> penalise the worker.
                        t.worker.score -= 1
                        t.worker.save()
                        logger.info('Keyword "' + kw + '" has wrong catgory set.' + str(t.worker.id) + ' was degraded')
                    else:
                        # BUG FIX: `task_set.coun()` was a typo that raised
                        # AttributeError whenever this branch was reached.
                        logger.info("no penalty with" + str(keyword.task_set.count() / (keyword_inverse[0].task_set.count() + keyword.task_set.count())))
                except Keyword.DoesNotExist:
                    if t.feed.content.find(kw) == -1:  # keyword not found in the feed text
                        t.worker.score -= 1
                        t.worker.save()
                        logger.info('Keyword "' + kw + '" was not found in feed. worker ' + str(t.worker.id) + ' was degraded')
                    else:
                        # TODO: Debug here
                        newkeyword = Keyword(text=str(kw), category=answer['keywords'][kw])
                        newkeyword.save()
                        t.keywords.add(newkeyword)
                        newkeyword.feed.add(t.feed)  # add Keyword --> Feed relationship
                        newkeyword.save()
                        logger.info('new keyword "' + kw + '" created and assigned to Task' + str(t.id))
            # uncomment the following line if no automatic post of task 2 is required
            # post_task2_to_crowd(t.feed)

        # Process Task Type 2: sentiment answers
        elif t.question == "Question2":
            # BUG FIX: this branch previously logged "processing question1".
            logger.info("processing question2... ")
            for sen in answer['keywords'].keys():
                logger.info("processing Sentiment " + sen + "(" + answer['keywords'][sen] + ")")
                keywords = Keyword.objects.filter(text=sen)
                if len(keywords) == 0:
                    logger.error("Received sentiment for non existing Keyword")
                    continue
                else:
                    # Quality control: compare the new score against the
                    # median of all existing sentiment scores.
                    new_score = answer['keywords'][sen]
                    scores = []
                    for k in keywords:
                        scores.extend(k.get_sentiment_scores())
                    print(scores)  # parenthesised: same output in Py2, valid in Py3
                    if len(scores) <= 3:
                        logger.info("to few sentiments: Keyword " + sen + "has only " + str(len(scores)) + " sentiments")
                    elif abs(median(scores) - int(new_score)) >= 3:
                        logger.info("bad sentiment")
                        t.worker.score -= 1
                        # BUG FIX: the penalty was never persisted here,
                        # unlike every other degradation branch.
                        t.worker.save()
                        logger.info('Worker ' + str(t.worker.id) + ' was degraded')
                    # save sentiment
                    new_sentiment = Sentiment(score=new_score)
                    new_sentiment.worker = t.worker  # these are foreign keys of Sentiment
                    new_sentiment.feed = t.feed
                    new_sentiment.keyword = keywords[0]  # TODO: choose better keyword instead of always the first
                    new_sentiment.save()
                    logger.info('new sentiment "' + sen + '" created with score "' + str(new_sentiment.score) + '" and relationships set: worker=' + t.worker.worker_uri + ' feed=' + str(t.feed.id) + ' keyword=' + str(keywords[0].text))
        else:
            logger.error("Keywords/Sentiments could not be processed.Something is wrong with task")
        t.status = 'P'  # set status to processed
        t.save()