Example #1
0
    def preprocessing_raw_data(self):
        """Strip HTML from stored documents, then delete Kazakh-language ones.

        Two passes are made over every ``Document`` with ``id > 0``, ordered by
        id and processed in batches of 10000:

        1. Any document whose text or title contains ``<`` or ``>`` has both
           fields replaced with the text BeautifulSoup extracts from them
           (stripped, newlines removed). Each batch is saved with
           ``bulk_update``.
        2. Any document whose text plus title is judged Kazakh by
           ``is_kazakh`` is deleted.

        Progress is printed to stdout; nothing is returned.
        """
        batch_size = 10000
        kazakh_letters = frozenset("ӘәҒғҚқҢңӨөҰұҮүІі")
        kazakh_share_threshold = 0.07

        def is_kazakh(text):
            """Return True when Kazakh-specific letters exceed 7% of ``text``."""
            if not text:
                # Nothing to measure; also avoids dividing by zero below.
                return False
            # The threshold is a fraction, so it has to be compared with the
            # *share* of Kazakh-specific letters. Comparing the raw count with
            # 0.07 flagged any text containing a single such letter.
            hits = sum(c in kazakh_letters for c in text)
            return hits / len(text) > kazakh_share_threshold

        qs = Document.objects.filter(id__gt=0).order_by('id')
        number_of_documents = qs.count()

        # Pass 1: remove HTML markup.
        for i, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
            print(f"Processing {i*batch_size}/{number_of_documents}")
            for j, doc in enumerate(batch):
                if i == 0:
                    # Per-row progress is only shown for the first batch.
                    print(f"{j}/{batch_size}")
                if "<" in doc.text or ">" in doc.text or "<" in doc.title or ">" in doc.title:
                    doc.text = BeautifulSoup(
                        doc.text,
                        "html.parser").text.strip().replace('\n', '')
                    doc.title = BeautifulSoup(
                        doc.title,
                        "html.parser").text.strip().replace('\n', '')
            Document.objects.bulk_update(batch, fields=['text', 'title'])

        # Pass 2: drop Kazakh-language documents.
        # NOTE(review): rows are deleted while batch_qs is still paging over
        # qs; if batch_qs pages by offset this may skip documents - confirm.
        for i, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
            print(f"Deleting {i*batch_size}/{number_of_documents}")
            for doc in batch:
                if is_kazakh(doc.text + doc.title):
                    doc.delete()
Example #2
0
def document_generator(qs):
    """Yield one dict per document in ``qs``, tagged with a ``corpus`` name.

    Documents are pulled through ``batch_qs`` using the ``batch_size`` found
    in the surrounding scope. Each one is loaded into an ``ESDocument`` and
    converted with ``to_dict()``. The ``corpus`` key is set to
    ``hate_<class_label>``, or to ``hate_test`` when a random draw from
    1..100 is less than or equal to ``percent_test`` (also taken from the
    surrounding scope).
    """
    for chunk in batch_qs(qs, batch_size=batch_size):
        for model_instance in chunk:
            es_doc = ESDocument()
            es_doc.init_from_model(model_instance)
            payload = es_doc.to_dict()
            # Resolve the label-based corpus first, then let the random draw
            # decide whether this document goes to the test split instead.
            corpus_name = f"hate_{payload['class_label']}"
            if random.randint(1, 100) <= percent_test:
                corpus_name = "hate_test"
            payload['corpus'] = corpus_name
            yield payload
def init_document_datetime_activity_parsed(apps, schema_editor):
    """Copy ``datetime_created`` into ``datetime_activity_parsed`` on documents.

    The ``Document`` model is resolved through ``apps.get_model`` (presumably
    this is a Django data-migration callable; ``schema_editor`` is unused).
    Only documents with a non-null ``datetime`` and a non-null ``num_views``
    are touched. They are processed in id order in batches of 10000, each
    batch being saved with ``bulk_update``. Progress is printed to stdout.
    """
    chunk_size = 10000
    document_model = apps.get_model('mainapp', 'Document')
    candidates = (
        document_model.objects
        .exclude(datetime=None)
        .exclude(num_views=None)
        .only('datetime_activity_parsed', 'datetime_created')
        .order_by('id')
    )
    total = candidates.count()

    for chunk_index, chunk in enumerate(batch_qs(candidates, batch_size=chunk_size)):
        print(f"Processing {chunk_index * chunk_size}/{total}")
        # Row-level progress is only reported while the first batch runs.
        report_rows = chunk_index == 0
        for row_index, document in enumerate(chunk):
            if report_rows:
                print(f"{row_index}/{chunk_size}")
            document.datetime_activity_parsed = document.datetime_created
        document_model.objects.bulk_update(
            chunk,
            fields=['datetime_activity_parsed'],
        )
Example #4
0
def copy_date(apps, schema_editor):
    """Fill each document's ``date`` with the date part of its ``datetime``.

    The ``Document`` model is resolved through ``apps.get_model`` (presumably
    this is a Django data-migration callable; ``schema_editor`` is unused).
    Documents with a null ``datetime`` are excluded. The rest are processed in
    id order in batches of 10000, each batch being saved with
    ``bulk_update``. Progress is printed to stdout.
    """
    chunk_size = 10000
    document_model = apps.get_model('mainapp', 'Document')
    dated_documents = (
        document_model.objects
        .exclude(datetime=None)
        .order_by('id')
        .only('datetime', 'date')
    )
    total = dated_documents.count()

    for chunk_index, chunk in enumerate(batch_qs(dated_documents, batch_size=chunk_size)):
        print(f"Processing {chunk_index * chunk_size}/{total}")
        # Row-level progress is only reported while the first batch runs.
        report_rows = chunk_index == 0
        for row_index, document in enumerate(chunk):
            if report_rows:
                print(f"{row_index}/{chunk_size}")
            document.date = document.datetime.date()
        document_model.objects.bulk_update(chunk, fields=['date'])
Example #5
0
import datetime

from mainapp.models import *
from mainapp.services import batch_qs

# One-off data fix: documents whose stored datetime lies in the future are
# treated as having day and month swapped, and are swapped back whenever the
# (shifted) day could be a valid month number.
batch_size = 10000

qs = Document.objects.filter(id__gt=0).order_by('id')
number_of_documents = qs.count()
# The cutoff does not depend on the row, so compute it once rather than
# calling now() for every document.
today = datetime.datetime.now().date()
for i, batch in enumerate(batch_qs(qs, batch_size=batch_size)):
    print(f"Processing {i*batch_size}/{number_of_documents}")
    changed = []
    for j, doc in enumerate(batch):
        if i == 0:
            # Per-row progress is only shown for the first batch.
            print(f"{j}/{batch_size}")
        if not doc.datetime or doc.datetime.date() <= today:
            continue
        # NOTE(review): the 6-hour shift presumably converts stored UTC to
        # local time; the swap is then applied to the unshifted value, using
        # the shifted day/month - confirm this is the intended result when
        # the shift crosses midnight.
        actual_date = doc.datetime + datetime.timedelta(hours=6)
        if actual_date.day <= 12:
            doc.datetime = doc.datetime.replace(month=actual_date.day,
                                                day=actual_date.month)
            changed.append(doc)
    # Only write back rows that were actually modified instead of rewriting
    # the whole batch.
    if changed:
        Document.objects.bulk_update(changed, fields=['datetime'])
Example #6
0
 def document_generator(self, qs):
     """Yield the ``to_dict()`` form of an ``ESDocument`` for each row of ``qs``.

     Rows are fetched through ``batch_qs`` in batches of ``self.batch_size``.
     """
     # Flatten the batches lazily so the loop below sees one document at a time.
     documents = (
         row
         for chunk in batch_qs(qs, batch_size=self.batch_size)
         for row in chunk
     )
     for model_instance in documents:
         es_doc = ESDocument()
         es_doc.init_from_model(model_instance)
         yield es_doc.to_dict()