Example #1
0
    def handle(self, *args, **options):
        """Generate and index recommendations for recent articles.

        For every requested domain, scans articles published inside the
        computed date range (newest first) and ingests one recommendation
        document per article into the "recommendation" index.
        """
        # Ensure the recommendations index exists before ingesting.
        create_index("recommendation")

        domains = options["domain"]
        if not domains:
            raise CommandError('Domain name is required')

        start, end = self.get_date_range(options['days'])

        for domain in Domain.objects.filter(domain_id__in=domains):
            query = {
                "query": {
                    "bool": {
                        "must": [
                            {"term": {"domain": domain.domain_id}},
                            {"range": {"published_on": {"gte": start, "lt": end}}},
                        ]
                    }
                },
                "sort": [{"published_on": {"order": "desc"}}],
            }
            hits = scan(es, index='article', query=query, preserve_order=True)

            for hit in hits:
                src = hit['_source']
                article_id = src['id']
                title = src['title']
                domain = src['domain']

                document = {
                    'id': article_id,
                    'recommendation': self.get_recommendations(title, domain),
                }
                ingest_to_elastic([document], "recommendation", "recommendation", "id")

                if self.DEBUG:
                    print(f"Generated Recommendation for: {title}")
                    for item in document['recommendation']:
                        print("\t", item['title'])
Example #2
0
    def handle(self, *args, **options):
        """Continuously ingest pickled article documents queued in Redis.

        Polls Redis (via ``get_data_from_redis``) for paths to
        zlib-compressed pickle files, creates model objects from each
        document and increments Prometheus ingest counters. When the queue
        is empty, sleeps in 10s steps and flushes any pending batch after
        ~60s of idle time. Runs until interrupted with Ctrl-C.

        Raises:
            CommandError: if ``--source`` was not supplied.
        """
        if options['source'] is None:
            raise CommandError("Option `--source=...` must be specified.")

        # start prometheus http server for metrics
        start_http_server(8686)

        source = options['source']
        index = options['index']
        create_index(index)
        domain = Domain.objects.get(domain_id="newscout")
        try:
            while True:
                file_path = self.get_data_from_redis(source)
                if file_path:
                    date = datetime.now(
                        pytz.timezone("Asia/Kolkata")).strftime("%Y-%m-%d")
                    self.task_state.state("running")
                    self.sleep_time = 0
                    if os.path.isfile(file_path):
                        # NOTE(review): pickle.loads is only safe because
                        # these files are produced by our own pipeline;
                        # never point this command at untrusted data.
                        # Use a context manager so the handle is closed
                        # (it was previously leaked).
                        with open(file_path, "rb") as fh:
                            doc = cPickle.loads(zlib.decompress(fh.read()))
                        try:
                            self.create_model_obj(doc, domain, index)
                            if date != self.now:
                                # Day rolled over: refresh the cached date.
                                self.now = datetime.now(
                                    pytz.timezone("Asia/Kolkata")).strftime(
                                        "%Y-%m-%d")
                            # Both branches previously incremented the same
                            # counter; do it once after the date check.
                            self.source_ingest.labels(
                                source=doc.get("source", "source"),
                                category=doc.get("category",
                                                 "category")).inc()
                        except Exception as e:
                            print("error in doc read")
                            print(e)
                    else:
                        msg = "Data file not found: {0}".format(file_path)
                        print(msg)
                else:
                    self.task_state.state("waiting")
                    print("Sleeping...!!!")
                    time.sleep(10)
                    self.sleep_time += 10
                    # Flush a lingering partial batch after ~60s idle.
                    if self.sleep_time >= 60 and self.batch:
                        ingest_to_elastic(self.batch, index, index, 'id')
                        print("Ingesting Final Batch...!!!")
                        self.batch = []
                        self.sleep_time = 0
        except KeyboardInterrupt:
            sys.exit(0)
 def ingest(self, *args, **options):
     """Re-index every Article from the database into Elasticsearch.

     Serialized articles are buffered and ingested in batches of 999,
     with a final flush for the remainder.
     """
     print("Ingesting Data from Database\n")
     index = 'article'
     create_index(index)
     for article in Article.objects.all().iterator():
         data = ArticleSerializer(article).data
         if data["hash_tags"]:
             data["hash_tags"] = self.get_tags(data["hash_tags"])
         self.batch.append(data)
         if len(self.batch) == 999:
             ingest_to_elastic(self.batch, index, index, 'id')
             self.batch = []
             print("Ingesting Batch...!!!")
     ingest_to_elastic(self.batch, index, index, 'id')
     print("Ingesting Final Batch...!!!")
Example #4
0
    def handle(self, *args, **options):
        """Walk a directory tree of ``.dat`` files (zlib-compressed
        pickles) and ingest each document under the given domain.

        Creates the domain on first use, builds model objects per file and
        flushes any remaining batch at the end. Ctrl-C exits cleanly.

        Raises:
            CommandError: if ``--source``, ``--domain_name`` or
                ``--domain_id`` is missing.
        """
        if options['source'] is None:
            raise CommandError("Option `--source=...` must be specified.")

        json_files = options['json']
        index = options['index']
        domain_name = options["domain_name"]
        domain_id = options["domain_id"]
        if not domain_name:
            raise CommandError("Option `--domain_name=...` must be specified.")

        if not domain_id:
            raise CommandError("Option `--domain_id=...` must be specified.")

        create_index(index)
        domain, _ = Domain.objects.get_or_create(domain_name=domain_name,
                                                 domain_id=domain_id)
        try:
            for root, _, files in os.walk(json_files):
                for f in files:
                    if not f.endswith(".dat"):
                        continue
                    file_path = "{0}/{1}".format(root, f)
                    if os.path.isfile(file_path):
                        # BUGFIX: open in binary mode -- zlib.decompress()
                        # requires bytes; the previous text-mode read raised
                        # TypeError. Context manager also closes the handle.
                        with open(file_path, "rb") as fh:
                            doc = cPickle.loads(zlib.decompress(fh.read()))
                        try:
                            self.create_model_obj(doc, index, domain)
                        except Exception as e:
                            print(e)
                    else:
                        msg = "Data file not found: {0}".format(
                            file_path)
                        print(msg)

            if self.batch:
                ingest_to_elastic(self.batch, index, index, 'id')
                print("Ingesting Final Batch...!!!")
                self.batch = []
        except KeyboardInterrupt:
            sys.exit(0)
    def handle(self, *args, **options):
        """Re-categorise 'uncategorised' articles using the keyword rules
        in categories.json and queue the updated docs for Elasticsearch.

        Matching articles get the rule's category, are saved, serialized
        and batched (flushed every 999 docs, final flush at the end).
        """
        index = options['index']
        # Fix: read the rules file via a context manager (the handle was
        # leaked before) and keep the mapping under its own name -- the old
        # code rebound ``json_data`` to serializer output inside the loop,
        # shadowing the rules dict.
        with open("categories.json") as fh:
            category_rules = json.load(fh)
        for k, v in category_rules.items():
            cat_obj = Category.objects.get(name=k)
            uncategorised = Article.objects.filter(
                category=Category.objects.get(name="uncategorised"))
            for article in uncategorised.iterator():
                # NOTE(review): ``for word in k`` iterates the *characters*
                # of the category name and ``v`` is never used -- this looks
                # like it was meant to iterate a keyword list from the JSON
                # value; confirm the schema of categories.json before
                # changing it. Behavior preserved as-is.
                title_match = any(
                    re.search(r'\b' + word.lower() + r'\b',
                              article.title.lower()) for word in k)
                if title_match:
                    self._recategorise(article, cat_obj)

                tag_names = list(
                    article.hash_tags.all().values_list("name", flat=True))
                if any(word.lower() in tag_names for word in k):
                    self._recategorise(article, cat_obj)

                if len(self.batch) == 999:
                    ingest_to_elastic(self.batch, index, index, 'id')
                    self.batch = []
                    print("Ingesting Batch...!!!")

        print(len(self.batch))
        ingest_to_elastic(self.batch, index, index, 'id')
        print("Ingesting Final Batch...!!!")

    def _recategorise(self, article, cat_obj):
        """Assign ``cat_obj`` to ``article``, save it, and queue its
        serialized form in ``self.batch``."""
        article.category = cat_obj
        article.save()
        json_data = ArticleSerializer(article).data
        if json_data["hash_tags"]:
            json_data["hash_tags"] = self.get_tags(json_data["hash_tags"])
        self.batch.append(json_data)
    def handle(self, *args, **options):
        """Backfill missing cover images from the category default and
        re-index the affected articles in batches."""
        index = 'article'

        missing_cover = Article.objects.filter(cover_image="")
        for article in missing_cover:
            default_url = CategoryDefaultImage.get_default_image(
                article.category)
            article.cover_image = default_url
            article.save()
            print(article.id, article, article.category, default_url)
            data = ArticleSerializer(article).data

            if data["hash_tags"]:
                data["hash_tags"] = self.get_tags(data["hash_tags"])
            self.batch.append(data)

            if len(self.batch) == 999:
                ingest_to_elastic(self.batch, index, index, 'id')
                self.batch = []
                print("Ingesting Batch...!!!")
        ingest_to_elastic(self.batch, index, index, 'id')
 def handle(self, *args, **options):
     """Index up to 200 articles per category for the 'newscout' domain,
     flushing to Elasticsearch in batches of 200."""
     print("Ingesting Data from Database\n")
     index = options['index']
     create_index(index)
     domain = Domain.objects.get(domain_id="newscout")
     for category in Category.objects.all():
         qs = Article.objects.filter(category=category, domain=domain)
         if not qs.exists():
             continue
         for article in qs[:200]:
             data = ArticleSerializer(article).data
             if data["hash_tags"]:
                 data["hash_tags"] = self.get_tags(data["hash_tags"])
             self.batch.append(data)
             if len(self.batch) == 200:
                 ingest_to_elastic(self.batch, index, index, 'id')
                 self.batch = []
                 print("Ingesting Batch...!!!")
         ingest_to_elastic(self.batch, index, index, 'id')
         print("Ingesting Final Batch...!!!")
Example #8
0
    def save_model(self, request, obj, form, change):
        """Persist the article from the admin and mirror it in Elasticsearch.

        Edits are stamped with the editing user; spam articles are removed
        from the index, everything else is (re-)ingested.

        BUGFIX: the previous version called ``super().save_model`` twice on
        edits (saving the row two times) and unconditionally re-ingested
        after the spam branch, which put spam documents straight back into
        the index after deleting them.
        """
        if change:
            obj.edited_by = request.user
            obj.manually_edit = True

        # Save exactly once, for both create and edit.
        super(ArticleAdmin, self).save_model(request, obj, form, change)

        serializer = ArticleSerializer(obj)
        json_data = serializer.data
        if json_data["hash_tags"]:
            tag_list = self.get_tags(json_data["hash_tags"])
            json_data["hash_tags"] = tag_list

        if obj.spam:
            # Deleting a doc that was never indexed is a harmless no-op.
            delete_from_elastic([json_data], "article", "article", "id")
        else:
            ingest_to_elastic([json_data], "article", "article", "id")
 def ingest(self, *args, **options):
     """Build the auto_suggestions index from domain, source and category
     names.

     Each non-empty name becomes one suggestion document keyed by the md5
     of the name; documents are ingested in batches of 999 with a final
     flush at the end. The three copy-pasted loops of the original are
     deduplicated through a single nested helper.
     """
     print("Ingesting Data from Database\n")
     index = 'auto_suggestions'
     create_index(index, auto_suggestion_mapping)

     def queue_suggestion(name):
         # md5(name) gives a stable id, so repeated runs overwrite
         # existing docs instead of duplicating them.
         self.batch.append({
             "desc": name,
             "name_suggest": name,
             "id": md5(str(name).encode("utf-8")).hexdigest(),
         })
         if len(self.batch) == 999:
             ingest_to_elastic(self.batch, index, index, 'id')
             self.batch = []
             print("Ingesting Batch...!!!")

     for domain in Domain.objects.filter(
             domain_name__isnull=False).iterator():
         if domain.domain_name:
             queue_suggestion(domain.domain_name)
     for source in Source.objects.filter(name__isnull=False).iterator():
         if source.name:
             queue_suggestion(source.name)
     for cat in Category.objects.filter(name__isnull=False).iterator():
         if cat.name:
             queue_suggestion(cat.name)
     ingest_to_elastic(self.batch, index, index, 'id')
     print("Ingesting Final Batch...!!!")
Example #10
0
    def create_model_obj(self, doc, index, domain):
        """
        this method is used to create django article model object

        Parses a crawled article dict ``doc`` and, when the title is new,
        creates the Article row plus its media and hashtag relations, then
        queues the serialized article in ``self.batch``, flushing to
        Elasticsearch every 99 documents.
        """
        title = doc["title"]
        category = doc["category"]
        source = doc["source"]
        source_url = doc["source_url"]
        cover_image = doc["cover_image"]
        blurb = doc["blurb"]
        # Prefer the short description; fall back to full_text, then "".
        full_text = doc.get("short_description") or doc.get("full_text", "")
        published_on = self.parse_date(doc["published_on"])
        if not published_on:
            # Unparseable/missing publish date: stamp with "now".
            published_on = timezone.now()
        author = doc.get("author", "")
        author_twitter = doc.get("author_twitter", "")
        video_data = doc.get("video_data", "")
        images = doc["images"]
        tags = doc["tags"]
        if not cover_image:
            # No explicit cover: borrow the first video's still image.
            if video_data:
                cover_image = video_data[0].get("video_image", "")
        if title and full_text:
            # Deduplicate by title only (NOTE(review): this is global, not
            # per-domain -- confirm that is intended).
            if not Article.objects.filter(title=title).exists():
                if category == "Uncategorised":
                    # apply regex based category only if article is uncategorised
                    # get category id from regex classfication
                    category_id = self.classify.match(title)
                    category = Category.objects.get(id=category_id)
                else:
                    category, _ = Category.objects.get_or_create(name=category)
                source, _ = Source.objects.get_or_create(name=source)
                article_obj = Article.objects.create(domain=domain,
                                                     title=title,
                                                     source=source,
                                                     category=category,
                                                     source_url=source_url,
                                                     cover_image=cover_image,
                                                     blurb=blurb,
                                                     full_text=full_text,
                                                     published_on=published_on,
                                                     active=True)

                # NOTE(review): `> 1` skips articles with exactly one image,
                # unlike the `> 0` checks below -- presumably because a lone
                # image duplicates the cover, but confirm this isn't an
                # off-by-one.
                if len(images) > 1:
                    for img in images:
                        _ = ArticleMedia.objects.create(article=article_obj,
                                                        category="image",
                                                        url=img)

                if len(video_data) > 0:
                    for video_dic in video_data:
                        _ = ArticleMedia.objects.create(
                            article=article_obj,
                            category="video",
                            url=video_dic.get("video_image", ""),
                            video_url=video_dic.get("video_url", ""))

                if len(tags) > 0:
                    tag_objs = []
                    new_tags = self.remove_special_chars(tags)

                    if new_tags:
                        # Reuse an existing HashTag row when one exists,
                        # otherwise create it, then attach all at once.
                        for tag in new_tags:
                            tag_obj = HashTag.objects.filter(name=tag)
                            if tag_obj:
                                tag_objs.append(tag_obj.first())
                            else:
                                tag_obj = HashTag.objects.create(name=tag)
                                tag_objs.append(tag_obj)
                        article_obj.hash_tags.add(*tag_objs)

                serializer = ArticleSerializer(article_obj)
                json_data = serializer.data
                if json_data["hash_tags"]:
                    tag_list = self.get_tags(json_data["hash_tags"])
                    json_data["hash_tags"] = tag_list
                self.batch.append(json_data)
                # Flush to Elasticsearch every 99 queued documents.
                if len(self.batch) == 99:
                    ingest_to_elastic(self.batch, index, index, 'id')
                    self.batch = []
                    print("Ingesting Batch To Elastic...!!!")