def handle(self, *args, **options):
    """Generate article recommendations for the requested domains.

    Reads the ``days`` and ``domain`` options, scans the ``article`` index
    for each domain's articles published in that date range, and ingests one
    recommendation document per article into the ``recommendation`` index.

    Raises:
        CommandError: if no domain name was supplied.
    """
    # make sure we have our recommendations index
    create_index("recommendation")
    days = options['days']
    domains = options["domain"]
    if not domains:
        raise CommandError('Domain name is required')
    start, end = self.get_date_range(days)
    for domain in Domain.objects.filter(domain_id__in=domains):
        results = scan(
            es,
            index='article',
            query={
                "query": {
                    "bool": {
                        "must": [
                            {"term": {"domain": domain.domain_id}},
                            {"range": {"published_on": {"gte": start, "lt": end}}},
                        ]
                    }
                },
                "sort": [{"published_on": {"order": "desc"}}],
            },
            preserve_order=True)
        for current in results:
            source = current['_source']
            article_id = source['id']
            title = source['title']
            # FIX: use a distinct name so the raw domain string from the
            # document does not clobber the outer ``domain`` loop variable
            # (a Domain model instance) — the original unpacking shadowed it.
            article_domain = source['domain']
            document = {
                'id': article_id,
                'recommendation': self.get_recommendations(title, article_domain),
            }
            ingest_to_elastic([document], "recommendation", "recommendation", "id")
            if self.DEBUG:
                print(f"Generated Recommendation for: {title}")
                for item in document['recommendation']:
                    print("\t", item['title'])
def handle(self, *args, **options):
    """Continuously consume pickled article files queued in redis.

    Requires the ``source`` and ``index`` options. Starts a prometheus
    metrics endpoint on port 8686, then loops forever: each file path popped
    from redis is decompressed, unpickled, and turned into a model object;
    per-source/category ingest counters are bumped (reset daily by date
    rollover). When idle for 60s, any partially filled batch is flushed to
    elasticsearch. Exits cleanly on Ctrl-C.

    Raises:
        CommandError: if ``--source`` was not supplied.
    """
    # FIX: identity comparison with None (`is None`), not `== None`.
    if options['source'] is None:
        raise CommandError("Option `--source=...` must be specified.")
    # start prometheus http server for metrics
    start_http_server(8686)
    source = options['source']
    index = options['index']
    create_index(index)
    domain = Domain.objects.get(domain_id="newscout")
    try:
        while True:
            file_path = self.get_data_from_redis(source)
            if file_path:
                date = datetime.now(
                    pytz.timezone("Asia/Kolkata")).strftime("%Y-%m-%d")
                self.task_state.state("running")
                self.sleep_time = 0
                if os.path.isfile(file_path):
                    # FIX: context manager so the handle is closed even if
                    # decompression or unpickling raises (was a bare
                    # open(...).read() that leaked the file object).
                    with open(file_path, "rb") as fh:
                        doc = cPickle.loads(zlib.decompress(fh.read()))
                    try:
                        self.create_model_obj(doc, domain, index)
                        if date == self.now:
                            self.source_ingest.labels(
                                source=doc.get("source", "source"),
                                category=doc.get("category", "category")).inc()
                        else:
                            # day rolled over: refresh the cached date,
                            # then count this doc against the new day
                            self.now = datetime.now(
                                pytz.timezone("Asia/Kolkata")).strftime(
                                    "%Y-%m-%d")
                            self.source_ingest.labels(
                                source=doc.get("source", "source"),
                                category=doc.get("category", "category")).inc()
                    except Exception as e:
                        print("error in doc read")
                        print(e)
                else:
                    msg = "Data file not found: {0}".format(file_path)
                    print(msg)
            else:
                self.task_state.state("waiting")
                print("Sleeping...!!!")
                time.sleep(10)
                self.sleep_time += 10
                # after a full idle minute, flush whatever is buffered
                if self.sleep_time >= 60:
                    if self.batch:
                        ingest_to_elastic(self.batch, index, index, 'id')
                        print("Ingesting Final Batch...!!!")
                        self.batch = []
                    self.sleep_time = 0
    except KeyboardInterrupt:
        sys.exit(0)
def ingest(self, *args, **options):
    """Serialize every Article row and bulk-ingest it into the ``article`` index.

    Documents are buffered on ``self.batch`` and flushed to elasticsearch in
    chunks of 999; the trailing partial batch is flushed at the end.
    """
    print("Ingesting Data from Database\n")
    index = 'article'
    create_index(index)
    for article in Article.objects.all().iterator():
        json_data = ArticleSerializer(article).data
        hash_tags = json_data["hash_tags"]
        if hash_tags:
            # replace raw hash-tag payload with the normalized tag list
            json_data["hash_tags"] = self.get_tags(hash_tags)
        self.batch.append(json_data)
        if len(self.batch) == 999:
            ingest_to_elastic(self.batch, index, index, 'id')
            self.batch = []
            print("Ingesting Batch...!!!")
    ingest_to_elastic(self.batch, index, index, 'id')
    print("Ingesting Final Batch...!!!")
def ingest(self, *args, **options):
    """Build the ``auto_suggestions`` index from domain, source and category names.

    Each non-empty name becomes one suggestion document (id = md5 of the
    name). Documents are batched 999 at a time; the trailing partial batch
    is flushed at the end.
    """
    print("Ingesting Data from Database\n")
    index = 'auto_suggestions'
    create_index(index, auto_suggestion_mapping)
    # The three original loops were identical except for the queryset and
    # field; the shared per-name work lives in _append_suggestion.
    for domain in Domain.objects.filter(
            domain_name__isnull=False).iterator():
        self._append_suggestion(domain.domain_name, index)
    for source in Source.objects.filter(name__isnull=False).iterator():
        self._append_suggestion(source.name, index)
    for cat in Category.objects.filter(name__isnull=False).iterator():
        self._append_suggestion(cat.name, index)
    ingest_to_elastic(self.batch, index, index, 'id')
    print("Ingesting Final Batch...!!!")

def _append_suggestion(self, name, index):
    """Queue one suggestion document for *name*; flush when the batch fills.

    Empty/falsy names are skipped — the ``isnull=False`` filter does not
    exclude empty strings, mirroring the original truthiness guard.
    """
    if not name:
        return
    self.batch.append({
        "desc": name,
        "name_suggest": name,
        "id": md5(str(name).encode("utf-8")).hexdigest(),
    })
    if len(self.batch) == 999:
        ingest_to_elastic(self.batch, index, index, 'id')
        self.batch = []
        print("Ingesting Batch...!!!")
def handle(self, *args, **options):
    """Walk a directory tree of ``.dat`` files and ingest each pickled article.

    Requires ``source``, ``json`` (root directory), ``domain_name`` and
    ``domain_id`` options. Each ``.dat`` file is zlib-decompressed,
    unpickled, and converted to a model object; any buffered batch is
    flushed at the end. Exits cleanly on Ctrl-C.

    Raises:
        CommandError: if ``--source``, ``--domain_name`` or ``--domain_id``
            is missing.
    """
    # FIX: identity comparison with None (`is None`), not `== None`.
    if options['source'] is None:
        raise CommandError("Option `--source=...` must be specified.")
    json_files = options['json']
    index = options['index']
    domain_name = options["domain_name"]
    domain_id = options["domain_id"]
    if not domain_name:
        raise CommandError("Option `--domain_name=...` must be specified.")
    if not domain_id:
        raise CommandError("Option `--domain_id=...` must be specified.")
    create_index(index)
    domain, _ = Domain.objects.get_or_create(domain_name=domain_name,
                                             domain_id=domain_id)
    try:
        # (the original redundant `if files:` guard is dropped — an empty
        # list simply makes the inner loop a no-op)
        for root, _, files in os.walk(json_files):
            for f in files:
                if not f.endswith(".dat"):
                    continue
                file_path = "{0}/{1}".format(root, f)
                if os.path.isfile(file_path):
                    # BUG FIX: the file holds zlib-compressed pickle bytes,
                    # so it must be opened in binary mode ("rb"); the
                    # original text-mode open fails on Python 3 (decoding
                    # binary data as text). Context manager also closes
                    # the previously leaked handle.
                    with open(file_path, "rb") as fh:
                        doc = cPickle.loads(zlib.decompress(fh.read()))
                    try:
                        self.create_model_obj(doc, index, domain)
                    except Exception as e:
                        print(e)
                else:
                    msg = "Data file not found: {0}".format(file_path)
                    print(msg)
        if self.batch:
            ingest_to_elastic(self.batch, index, index, 'id')
            print("Ingesting Final Batch...!!!")
            self.batch = []
    except KeyboardInterrupt:
        sys.exit(0)
def handle(self, *args, **options):
    """Ingest up to 200 "newscout" articles per category into the given index.

    Articles are serialized, hash tags normalized, and buffered on
    ``self.batch``; batches flush at 200 documents, with a final flush of
    the remainder at the end.
    """
    print("Ingesting Data from Database\n")
    index = options['index']
    create_index(index)
    categories = Category.objects.all()
    domain = Domain.objects.get(domain_id="newscout")
    for cat in categories:
        # PERF FIX: the original ran `.exists()` and then re-ran the same
        # filter — one extra DB query per category. Iterating an empty
        # queryset is already a no-op, so the pre-check is dropped.
        article_objs = Article.objects.filter(category=cat,
                                              domain=domain)[:200]
        for article in article_objs:
            json_data = ArticleSerializer(article).data
            if json_data["hash_tags"]:
                json_data["hash_tags"] = self.get_tags(json_data["hash_tags"])
            self.batch.append(json_data)
            if len(self.batch) == 200:
                ingest_to_elastic(self.batch, index, index, 'id')
                self.batch = []
                print("Ingesting Batch...!!!")
    ingest_to_elastic(self.batch, index, index, 'id')
    print("Ingesting Final Batch...!!!")