def handle(self, *args, **options):
    # make sure we have our recommendations index
    create_index("recommendation")
    days = options['days']
    domains = options["domain"]
    if not domains:
        raise CommandError('Domain name is required')
    start, end = self.get_date_range(days)
    for domain in Domain.objects.filter(domain_id__in=domains):
        results = scan(
            es,
            index='article',
            query={
                "query": {
                    "bool": {
                        "must": [
                            {"term": {"domain": domain.domain_id}},
                            {"range": {"published_on": {"gte": start, "lt": end}}}
                        ]
                    }
                },
                "sort": [{"published_on": {"order": "desc"}}]
            },
            preserve_order=True)
        for current in results:
            article_id = current['_source']['id']
            title = current['_source']['title']
            # use a distinct name so we don't shadow the outer Domain loop variable
            article_domain = current['_source']['domain']
            document = {
                'id': article_id,
                'recommendation': self.get_recommendations(title, article_domain),
            }
            ingest_to_elastic([document], "recommendation", "recommendation", "id")
            if self.DEBUG:
                print(f"Generated Recommendation for: {title}")
                for item in document['recommendation']:
                    print("\t", item['title'])
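# The command above relies on a `get_date_range` helper that is not shown here.
# A minimal sketch of what it might look like, assuming it returns an
# (start, end) pair of date strings covering the last `days` days; the actual
# implementation in the repo may differ.
from datetime import datetime, timedelta
import pytz

def get_date_range(self, days):
    """Return (start, end) date strings spanning the last `days` days."""
    end = datetime.now(pytz.timezone("Asia/Kolkata"))
    start = end - timedelta(days=days)
    return start.strftime("%Y-%m-%d"), end.strftime("%Y-%m-%d")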
def handle(self, *args, **options): if options['source'] == None: raise CommandError("Option `--source=...` must be specified.") # start prometheus http server for metrics start_http_server(8686) source = options['source'] index = options['index'] create_index(index) domain = Domain.objects.get(domain_id="newscout") try: while True: file_path = self.get_data_from_redis(source) if file_path: date = datetime.now( pytz.timezone("Asia/Kolkata")).strftime("%Y-%m-%d") self.task_state.state("running") self.sleep_time = 0 if os.path.isfile(file_path): doc = cPickle.loads( zlib.decompress(open(file_path, "rb").read())) try: self.create_model_obj(doc, domain, index) if date == self.now: self.source_ingest.labels( source=doc.get("source", "source"), category=doc.get("category", "category")).inc() else: self.now = datetime.now( pytz.timezone("Asia/Kolkata")).strftime( "%Y-%m-%d") # self.reset_stats() self.source_ingest.labels( source=doc.get("source", "source"), category=doc.get("category", "category")).inc() except Exception as e: print("error in doc read") print(e) else: msg = "Data file not found: {0}".format(file_path) print(msg) else: self.task_state.state("waiting") print("Sleeping...!!!") time.sleep(10) self.sleep_time += 10 if self.sleep_time >= 60: if self.batch: ingest_to_elastic(self.batch, index, index, 'id') print("Ingesting Final Batch...!!!") self.batch = [] self.sleep_time = 0 except KeyboardInterrupt: sys.exit(0)
def ingest(self, *args, **options): print("Ingesting Data from Database\n") index = 'article' create_index(index) for article in Article.objects.all().iterator(): serializer = ArticleSerializer(article) json_data = serializer.data if json_data["hash_tags"]: tag_list = self.get_tags(json_data["hash_tags"]) json_data["hash_tags"] = tag_list self.batch.append(json_data) if len(self.batch) == 999: ingest_to_elastic(self.batch, index, index, 'id') self.batch = [] print("Ingesting Batch...!!!") ingest_to_elastic(self.batch, index, index, 'id') print("Ingesting Final Batch...!!!")
def handle(self, *args, **options): if options['source'] == None: raise CommandError("Option `--source=...` must be specified.") json_files = options['json'] index = options['index'] domain_name = options["domain_name"] domain_id = options["domain_id"] if not domain_name: raise CommandError("Option `--domain_name=...` must be specified.") if not domain_id: raise CommandError("Option `--domain_id=...` must be specified.") create_index(index) domain, _ = Domain.objects.get_or_create(domain_name=domain_name, domain_id=domain_id) try: for root, _, files in os.walk(json_files): if files: for f in files: if f.endswith(".dat"): file_path = "{0}/{1}".format(root, f) if os.path.isfile(file_path): doc = cPickle.loads( zlib.decompress(open(file_path).read())) try: self.create_model_obj(doc, index, domain) except Exception as e: print(e) else: msg = "Data file not found: {0}".format( file_path) print(msg) if self.batch: ingest_to_elastic(self.batch, index, index, 'id') print("Ingesting Final Batch...!!!") self.batch = [] except KeyboardInterrupt: sys.exit(0)
def handle(self, *args, **options):
    index = options['index']
    # categories.json maps a category name to its list of keywords
    with open("categories.json") as f:
        categories = json.load(f)
    uncategorised = Category.objects.get(name="uncategorised")
    for name, keywords in categories.items():
        cat_obj = Category.objects.get(name=name)
        for article in Article.objects.filter(category=uncategorised).iterator():
            title_match = any(
                re.search(r'\b' + word.lower() + r'\b', article.title.lower())
                for word in keywords)
            tag_match = any(
                word.lower() in article.hash_tags.all().values_list("name", flat=True)
                for word in keywords)
            # a keyword hit in either the title or the hashtags recategorises
            # the article and queues it for reindexing
            if title_match or tag_match:
                article.category = cat_obj
                article.save()
                serializer = ArticleSerializer(article)
                json_data = serializer.data
                if json_data["hash_tags"]:
                    json_data["hash_tags"] = self.get_tags(json_data["hash_tags"])
                self.batch.append(json_data)
                if len(self.batch) == 999:
                    ingest_to_elastic(self.batch, index, index, 'id')
                    self.batch = []
                    print("Ingesting Batch...!!!")
    print(len(self.batch))
    ingest_to_elastic(self.batch, index, index, 'id')
    print("Ingesting Final Batch...!!!")
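# The command above assumes categories.json maps each category name to a list
# of keywords to match against titles and hashtags. A hypothetical example of
# the expected shape (the real file's contents are not shown in the repo):
#
#   {
#       "Sports": ["cricket", "football", "olympics"],
#       "Business": ["market", "stocks", "economy"]
#   }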
def handle(self, *args, **options): index = 'article' for current in Article.objects.filter(cover_image=""): cover_image_url = CategoryDefaultImage.get_default_image( current.category) current.cover_image = cover_image_url current.save() print(current.id, current, current.category, cover_image_url) serializer = ArticleSerializer(current) json_data = serializer.data if json_data["hash_tags"]: tag_list = self.get_tags(json_data["hash_tags"]) json_data["hash_tags"] = tag_list self.batch.append(json_data) if len(self.batch) == 999: ingest_to_elastic(self.batch, index, index, 'id') self.batch = [] print("Ingesting Batch...!!!") ingest_to_elastic(self.batch, index, index, 'id')
def handle(self, *args, **options): print("Ingesting Data from Database\n") index = options['index'] create_index(index) categories = Category.objects.all() domain = Domain.objects.get(domain_id="newscout") for cat in categories: if Article.objects.filter(category=cat, domain=domain).exists(): article_objs = Article.objects.filter(category=cat, domain=domain)[:200] for article in article_objs: serializer = ArticleSerializer(article) json_data = serializer.data if json_data["hash_tags"]: tag_list = self.get_tags(json_data["hash_tags"]) json_data["hash_tags"] = tag_list self.batch.append(json_data) if len(self.batch) == 200: ingest_to_elastic(self.batch, index, index, 'id') self.batch = [] print("Ingesting Batch...!!!") ingest_to_elastic(self.batch, index, index, 'id') print("Ingesting Final Batch...!!!")
def save_model(self, request, obj, form, change):
    if change:
        obj.edited_by = request.user
        obj.manually_edit = True
    # save first so the serialized document reflects the persisted state
    super(ArticleAdmin, self).save_model(request, obj, form, change)
    serializer = ArticleSerializer(obj)
    json_data = serializer.data
    if json_data["hash_tags"]:
        json_data["hash_tags"] = self.get_tags(json_data["hash_tags"])
    if not obj.spam:
        ingest_to_elastic([json_data], "article", "article", "id")
    else:
        # articles flagged as spam are removed from the search index
        delete_from_elastic([json_data], "article", "article", "id")
def ingest(self, *args, **options): print("Ingesting Data from Database\n") index = 'auto_suggestions' create_index(index, auto_suggestion_mapping) for domain in Domain.objects.filter( domain_name__isnull=False).iterator(): if domain.domain_name: as_dict = {} as_dict["desc"] = domain.domain_name as_dict["name_suggest"] = domain.domain_name as_dict["id"] = md5(str( domain.domain_name).encode("utf-8")).hexdigest() self.batch.append(as_dict) if len(self.batch) == 999: ingest_to_elastic(self.batch, index, index, 'id') self.batch = [] print("Ingesting Batch...!!!") for source in Source.objects.filter(name__isnull=False).iterator(): if source.name: as_dict = {} as_dict["desc"] = source.name as_dict["name_suggest"] = source.name as_dict["id"] = md5(str( source.name).encode("utf-8")).hexdigest() self.batch.append(as_dict) if len(self.batch) == 999: ingest_to_elastic(self.batch, index, index, 'id') self.batch = [] print("Ingesting Batch...!!!") for cat in Category.objects.filter(name__isnull=False).iterator(): if cat.name: as_dict = {} as_dict["desc"] = cat.name as_dict["name_suggest"] = cat.name as_dict["id"] = md5(str(cat.name).encode("utf-8")).hexdigest() self.batch.append(as_dict) if len(self.batch) == 999: ingest_to_elastic(self.batch, index, index, 'id') self.batch = [] print("Ingesting Batch...!!!") ingest_to_elastic(self.batch, index, index, 'id') print("Ingesting Final Batch...!!!")
def create_model_obj(self, doc, index, domain):
    """Create a Django Article model object from a crawled document."""
    title = doc["title"]
    category = doc["category"]
    source = doc["source"]
    source_url = doc["source_url"]
    cover_image = doc["cover_image"]
    blurb = doc["blurb"]
    full_text = doc.get("short_description") or doc.get("full_text", "")
    published_on = self.parse_date(doc["published_on"])
    if not published_on:
        published_on = timezone.now()
    author = doc.get("author", "")
    author_twitter = doc.get("author_twitter", "")
    video_data = doc.get("video_data", "")
    images = doc["images"]
    tags = doc["tags"]
    if not cover_image and video_data:
        # fall back to the first video's thumbnail
        cover_image = video_data[0].get("video_image", "")
    if title and full_text:
        if not Article.objects.filter(title=title).exists():
            if category == "Uncategorised":
                # apply regex-based classification only if the article
                # is uncategorised; returns a category id
                category_id = self.classify.match(title)
                category = Category.objects.get(id=category_id)
            else:
                category, _ = Category.objects.get_or_create(name=category)
            source, _ = Source.objects.get_or_create(name=source)
            article_obj = Article.objects.create(
                domain=domain,
                title=title,
                source=source,
                category=category,
                source_url=source_url,
                cover_image=cover_image,
                blurb=blurb,
                full_text=full_text,
                published_on=published_on,
                active=True)
            if len(images) > 1:
                for img in images:
                    ArticleMedia.objects.create(
                        article=article_obj, category="image", url=img)
            for video_dic in video_data:
                ArticleMedia.objects.create(
                    article=article_obj,
                    category="video",
                    url=video_dic.get("video_image", ""),
                    video_url=video_dic.get("video_url", ""))
            if tags:
                tag_objs = []
                new_tags = self.remove_special_chars(tags)
                for tag in new_tags:
                    tag_obj, _ = HashTag.objects.get_or_create(name=tag)
                    tag_objs.append(tag_obj)
                article_obj.hash_tags.add(*tag_objs)
            serializer = ArticleSerializer(article_obj)
            json_data = serializer.data
            if json_data["hash_tags"]:
                json_data["hash_tags"] = self.get_tags(json_data["hash_tags"])
            self.batch.append(json_data)
            if len(self.batch) == 99:
                ingest_to_elastic(self.batch, index, index, 'id')
                self.batch = []
                print("Ingesting Batch To Elastic...!!!")
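# `parse_date` above is expected to return None for unparseable values so the
# caller can fall back to timezone.now(). A minimal sketch using dateutil; the
# actual implementation is not shown and may differ.
from dateutil import parser as date_parser

def parse_date(self, date_str):
    """Best-effort parse of a crawled date string; None if it cannot be parsed."""
    try:
        return date_parser.parse(date_str)
    except (ValueError, TypeError, OverflowError):
        return None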